From 2d23d0736e3a4a0fdb92b8e46ea476639f16aae8 Mon Sep 17 00:00:00 2001
From: Roee Zamir <roee.zamir@intel.com>
Date: Sun, 6 Aug 2017 11:38:22 +0300
Subject: nl80211: add OCE scan and capability flags

Add Optimized Connectivity Experience (OCE) scan and capability flags.
Some of them are unique to OCE and some are standalone.
Also add scan flags to enable/disable them.

Signed-off-by: Roee Zamir <roee.zamir@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/uapi/linux/nl80211.h | 37 +++++++++++++++++++++++++++++++++----
 1 file changed, 33 insertions(+), 4 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 51626b4175c0..76404d8a8863 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -4914,6 +4914,15 @@ enum nl80211_feature_flags {
  *	handshake with 802.1X in station mode (will pass EAP frames to the host
  *	and accept the set_pmk/del_pmk commands), doing it in the host might not
  *	be supported.
+ * @NL80211_EXT_FEATURE_FILS_MAX_CHANNEL_TIME: Driver is capable of overriding
+ *	the max channel attribute in the FILS request params IE with the
+ *	actual dwell time.
+ * @NL80211_EXT_FEATURE_ACCEPT_BCAST_PROBE_RESP: Driver accepts broadcast probe
+ *	response
+ * @NL80211_EXT_FEATURE_OCE_PROBE_REQ_HIGH_TX_RATE: Driver supports sending
+ *	the first probe request in each channel at rate of at least 5.5Mbps.
+ * @NL80211_EXT_FEATURE_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION: Driver supports
+ *	probe request tx deferral and suppression
  *
  * @NUM_NL80211_EXT_FEATURES: number of extended features.
  * @MAX_NL80211_EXT_FEATURES: highest extended feature index.
@@ -4936,6 +4945,10 @@ enum nl80211_ext_feature_index {
 	NL80211_EXT_FEATURE_FILS_SK_OFFLOAD,
 	NL80211_EXT_FEATURE_4WAY_HANDSHAKE_STA_PSK,
 	NL80211_EXT_FEATURE_4WAY_HANDSHAKE_STA_1X,
+	NL80211_EXT_FEATURE_FILS_MAX_CHANNEL_TIME,
+	NL80211_EXT_FEATURE_ACCEPT_BCAST_PROBE_RESP,
+	NL80211_EXT_FEATURE_OCE_PROBE_REQ_HIGH_TX_RATE,
+	NL80211_EXT_FEATURE_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION,
 
 	/* add new features before the definition below */
 	NUM_NL80211_EXT_FEATURES,
@@ -5012,12 +5025,28 @@ enum nl80211_timeout_reason {
  *	locally administered 1, multicast 0) is assumed.
  *	This flag must not be requested when the feature isn't supported, check
  *	the nl80211 feature flags for the device.
+ * @NL80211_SCAN_FLAG_FILS_MAX_CHANNEL_TIME: fill the dwell time in the FILS
+ *	request parameters IE in the probe request
+ * @NL80211_SCAN_FLAG_ACCEPT_BCAST_PROBE_RESP: accept broadcast probe responses
+ * @NL80211_SCAN_FLAG_OCE_PROBE_REQ_HIGH_TX_RATE: send probe request frames at
+ *	rate of at least 5.5M. In case non OCE AP is discovered in the channel,
+ *	only the first probe req in the channel will be sent in high rate.
+ * @NL80211_SCAN_FLAG_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION: allow probe request
+ *	tx deferral (dot11FILSProbeDelay shall be set to 15ms)
+ *	and suppression (if it has received a broadcast Probe Response frame,
+ *	Beacon frame or FILS Discovery frame from an AP that the STA considers
+ *	a suitable candidate for (re-)association - suitable in terms of
+ *	SSID and/or RSSI)
  */
 enum nl80211_scan_flags {
-	NL80211_SCAN_FLAG_LOW_PRIORITY			= 1<<0,
-	NL80211_SCAN_FLAG_FLUSH				= 1<<1,
-	NL80211_SCAN_FLAG_AP				= 1<<2,
-	NL80211_SCAN_FLAG_RANDOM_ADDR			= 1<<3,
+	NL80211_SCAN_FLAG_LOW_PRIORITY				= 1<<0,
+	NL80211_SCAN_FLAG_FLUSH					= 1<<1,
+	NL80211_SCAN_FLAG_AP					= 1<<2,
+	NL80211_SCAN_FLAG_RANDOM_ADDR				= 1<<3,
+	NL80211_SCAN_FLAG_FILS_MAX_CHANNEL_TIME			= 1<<4,
+	NL80211_SCAN_FLAG_ACCEPT_BCAST_PROBE_RESP		= 1<<5,
+	NL80211_SCAN_FLAG_OCE_PROBE_REQ_HIGH_TX_RATE		= 1<<6,
+	NL80211_SCAN_FLAG_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION	= 1<<7,
 };
 
 /**
-- 
cgit v1.2.3


From 65026002d69de006e273749bb799d3b01b757eb0 Mon Sep 17 00:00:00 2001
From: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Date: Fri, 18 Aug 2017 15:31:41 +0300
Subject: nl80211: add an option to allow MFP without requiring it

The user space can now allow the kernel to associate to an AP that
requires MFP or that doesn't have MFP enabled in the same
NL80211_CMD_CONNECT command, by using a new NL80211_MFP_OPTIONAL flag.
The driver / firmware will decide whether to use it or not.

Include a feature bit to advertise support for NL80211_MFP_OPTIONAL.
This allows new user space to run on old kernels and know that it
cannot use the new attribute if it isn't supported.

Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/uapi/linux/nl80211.h | 13 +++++++++++--
 net/wireless/nl80211.c       |  8 +++++++-
 2 files changed, 18 insertions(+), 3 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 76404d8a8863..59ba6ca66a0d 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -1407,8 +1407,12 @@ enum nl80211_commands {
  *
  * @NL80211_ATTR_USE_MFP: Whether management frame protection (IEEE 802.11w) is
  *	used for the association (&enum nl80211_mfp, represented as a u32);
- *	this attribute can be used
- *	with %NL80211_CMD_ASSOCIATE and %NL80211_CMD_CONNECT requests
+ *	this attribute can be used with %NL80211_CMD_ASSOCIATE and
+ *	%NL80211_CMD_CONNECT requests. %NL80211_MFP_OPTIONAL is not allowed for
+ *	%NL80211_CMD_ASSOCIATE since user space SME is expected and hence, it
+ *	must have decided whether to use management frame protection or not.
+ *	Setting %NL80211_MFP_OPTIONAL with a %NL80211_CMD_CONNECT request will
+ *	let the driver (or the firmware) decide whether to use MFP or not.
  *
  * @NL80211_ATTR_STA_FLAGS2: Attribute containing a
  *	&struct nl80211_sta_flag_update.
@@ -3947,10 +3951,12 @@ enum nl80211_key_type {
  * enum nl80211_mfp - Management frame protection state
  * @NL80211_MFP_NO: Management frame protection not used
  * @NL80211_MFP_REQUIRED: Management frame protection required
+ * @NL80211_MFP_OPTIONAL: Management frame protection is optional
  */
 enum nl80211_mfp {
 	NL80211_MFP_NO,
 	NL80211_MFP_REQUIRED,
+	NL80211_MFP_OPTIONAL,
 };
 
 enum nl80211_wpa_versions {
@@ -4923,6 +4929,8 @@ enum nl80211_feature_flags {
  *	the first probe request in each channel at rate of at least 5.5Mbps.
  * @NL80211_EXT_FEATURE_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION: Driver supports
  *	probe request tx deferral and suppression
+ * @NL80211_EXT_FEATURE_MFP_OPTIONAL: Driver supports the %NL80211_MFP_OPTIONAL
+ *	value in %NL80211_ATTR_USE_MFP.
  *
  * @NUM_NL80211_EXT_FEATURES: number of extended features.
  * @MAX_NL80211_EXT_FEATURES: highest extended feature index.
@@ -4949,6 +4957,7 @@ enum nl80211_ext_feature_index {
 	NL80211_EXT_FEATURE_ACCEPT_BCAST_PROBE_RESP,
 	NL80211_EXT_FEATURE_OCE_PROBE_REQ_HIGH_TX_RATE,
 	NL80211_EXT_FEATURE_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION,
+	NL80211_EXT_FEATURE_MFP_OPTIONAL,
 
 	/* add new features before the definition below */
 	NUM_NL80211_EXT_FEATURES,
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 2e6f5f4065f9..1e39ba3cfd06 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -8952,8 +8952,14 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info)
 
 	if (info->attrs[NL80211_ATTR_USE_MFP]) {
 		connect.mfp = nla_get_u32(info->attrs[NL80211_ATTR_USE_MFP]);
+		if (connect.mfp == NL80211_MFP_OPTIONAL &&
+		    !wiphy_ext_feature_isset(&rdev->wiphy,
+					     NL80211_EXT_FEATURE_MFP_OPTIONAL))
+			return -EOPNOTSUPP;
+
 		if (connect.mfp != NL80211_MFP_REQUIRED &&
-		    connect.mfp != NL80211_MFP_NO)
+		    connect.mfp != NL80211_MFP_NO &&
+		    connect.mfp != NL80211_MFP_OPTIONAL)
 			return -EINVAL;
 	} else {
 		connect.mfp = NL80211_MFP_NO;
-- 
cgit v1.2.3


From 333ef6bd10c3ffdaf6da94e34dc6cae675ed27fc Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hans.verkuil@cisco.com>
Date: Tue, 15 Aug 2017 10:07:25 -0400
Subject: media: cec: add CEC_EVENT_PIN_HPD_LOW/HIGH events

Add support for two new low-level events: PIN_HPD_LOW and PIN_HPD_HIGH.

This is specifically meant for use with the upcoming cec-gpio driver
and makes it possible to trace when the HPD pin changes. Some HDMI
sinks do strange things with the HPD and this makes it easy to debug
this.

Note that this also moves the initialization of a devnode mutex and
list to the allocate_adapter function: if the HPD is high, then as
soon as the HPD interrupt is created an interrupt occurs and
cec_queue_pin_hpd_event() is called which requires that the devnode
mutex and list are initialized.

Signed-off-by: Hans Verkuil <hans.verkuil@cisco.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@s-opensource.com>
---
 drivers/media/cec/cec-adap.c | 18 +++++++++++++++++-
 drivers/media/cec/cec-api.c  | 18 ++++++++++++++----
 drivers/media/cec/cec-core.c |  8 ++++----
 include/media/cec-pin.h      |  4 ++++
 include/media/cec.h          | 12 +++++++++++-
 include/uapi/linux/cec.h     |  2 ++
 6 files changed, 52 insertions(+), 10 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/media/cec/cec-adap.c b/drivers/media/cec/cec-adap.c
index 84d1b67f850c..dd0c9cacd1d0 100644
--- a/drivers/media/cec/cec-adap.c
+++ b/drivers/media/cec/cec-adap.c
@@ -86,7 +86,7 @@ void cec_queue_event_fh(struct cec_fh *fh,
 			const struct cec_event *new_ev, u64 ts)
 {
 	static const u8 max_events[CEC_NUM_EVENTS] = {
-		1, 1, 64, 64,
+		1, 1, 64, 64, 8, 8,
 	};
 	struct cec_event_entry *entry;
 	unsigned int ev_idx = new_ev->event - 1;
@@ -170,6 +170,22 @@ void cec_queue_pin_cec_event(struct cec_adapter *adap, bool is_high, ktime_t ts)
 }
 EXPORT_SYMBOL_GPL(cec_queue_pin_cec_event);
 
+/* Notify userspace that the HPD pin changed state at the given time. */
+void cec_queue_pin_hpd_event(struct cec_adapter *adap, bool is_high, ktime_t ts)
+{
+	struct cec_event ev = {
+		.event = is_high ? CEC_EVENT_PIN_HPD_HIGH :
+				   CEC_EVENT_PIN_HPD_LOW,
+	};
+	struct cec_fh *fh;
+
+	mutex_lock(&adap->devnode.lock);
+	list_for_each_entry(fh, &adap->devnode.fhs, list)
+		cec_queue_event_fh(fh, &ev, ktime_to_ns(ts));
+	mutex_unlock(&adap->devnode.lock);
+}
+EXPORT_SYMBOL_GPL(cec_queue_pin_hpd_event);
+
 /*
  * Queue a new message for this filehandle.
  *
diff --git a/drivers/media/cec/cec-api.c b/drivers/media/cec/cec-api.c
index a079f7fe018c..465bb3ec21f6 100644
--- a/drivers/media/cec/cec-api.c
+++ b/drivers/media/cec/cec-api.c
@@ -529,7 +529,7 @@ static int cec_open(struct inode *inode, struct file *filp)
 	 * Initial events that are automatically sent when the cec device is
 	 * opened.
 	 */
-	struct cec_event ev_state = {
+	struct cec_event ev = {
 		.event = CEC_EVENT_STATE_CHANGE,
 		.flags = CEC_EVENT_FL_INITIAL_STATE,
 	};
@@ -569,9 +569,19 @@ static int cec_open(struct inode *inode, struct file *filp)
 	filp->private_data = fh;
 
 	/* Queue up initial state events */
-	ev_state.state_change.phys_addr = adap->phys_addr;
-	ev_state.state_change.log_addr_mask = adap->log_addrs.log_addr_mask;
-	cec_queue_event_fh(fh, &ev_state, 0);
+	ev.state_change.phys_addr = adap->phys_addr;
+	ev.state_change.log_addr_mask = adap->log_addrs.log_addr_mask;
+	cec_queue_event_fh(fh, &ev, 0);
+#ifdef CONFIG_CEC_PIN
+	if (adap->pin && adap->pin->ops->read_hpd) {
+		err = adap->pin->ops->read_hpd(adap);
+		if (err >= 0) {
+			ev.event = err ? CEC_EVENT_PIN_HPD_HIGH :
+					 CEC_EVENT_PIN_HPD_LOW;
+			cec_queue_event_fh(fh, &ev, 0);
+		}
+	}
+#endif
 
 	list_add(&fh->list, &devnode->fhs);
 	mutex_unlock(&devnode->lock);
diff --git a/drivers/media/cec/cec-core.c b/drivers/media/cec/cec-core.c
index 648136e552d5..e3a1fb6d6690 100644
--- a/drivers/media/cec/cec-core.c
+++ b/drivers/media/cec/cec-core.c
@@ -112,10 +112,6 @@ static int __must_check cec_devnode_register(struct cec_devnode *devnode,
 	int minor;
 	int ret;
 
-	/* Initialization */
-	INIT_LIST_HEAD(&devnode->fhs);
-	mutex_init(&devnode->lock);
-
 	/* Part 1: Find a free minor number */
 	mutex_lock(&cec_devnode_lock);
 	minor = find_next_zero_bit(cec_devnode_nums, CEC_NUM_DEVICES, 0);
@@ -242,6 +238,10 @@ struct cec_adapter *cec_allocate_adapter(const struct cec_adap_ops *ops,
 	INIT_LIST_HEAD(&adap->wait_queue);
 	init_waitqueue_head(&adap->kthread_waitq);
 
+	/* adap->devnode initialization */
+	INIT_LIST_HEAD(&adap->devnode.fhs);
+	mutex_init(&adap->devnode.lock);
+
 	adap->kthread = kthread_run(cec_thread_func, adap, "cec-%s", name);
 	if (IS_ERR(adap->kthread)) {
 		pr_err("cec-%s: kernel_thread() failed\n", name);
diff --git a/include/media/cec-pin.h b/include/media/cec-pin.h
index f09cc9579d53..ea84b9c9e0c3 100644
--- a/include/media/cec-pin.h
+++ b/include/media/cec-pin.h
@@ -97,6 +97,9 @@ enum cec_pin_state {
  * @free:	optional. Free any allocated resources. Called when the
  *		adapter is deleted.
  * @status:	optional, log status information.
+ * @read_hpd:	read the HPD pin. Return true if high, false if low or
+ *		an error if negative. If NULL or -ENOTTY is returned,
+ *		then this is not supported.
  *
  * These operations are used by the cec pin framework to manipulate
  * the CEC pin.
@@ -109,6 +112,7 @@ struct cec_pin_ops {
 	void (*disable_irq)(struct cec_adapter *adap);
 	void (*free)(struct cec_adapter *adap);
 	void (*status)(struct cec_adapter *adap, struct seq_file *file);
+	int  (*read_hpd)(struct cec_adapter *adap);
 };
 
 #define CEC_NUM_PIN_EVENTS 128
diff --git a/include/media/cec.h b/include/media/cec.h
index df6b3bd31284..9d0f983faea9 100644
--- a/include/media/cec.h
+++ b/include/media/cec.h
@@ -91,7 +91,7 @@ struct cec_event_entry {
 };
 
 #define CEC_NUM_CORE_EVENTS 2
-#define CEC_NUM_EVENTS CEC_EVENT_PIN_CEC_HIGH
+#define CEC_NUM_EVENTS CEC_EVENT_PIN_HPD_HIGH
 
 struct cec_fh {
 	struct list_head	list;
@@ -296,6 +296,16 @@ static inline void cec_received_msg(struct cec_adapter *adap,
 void cec_queue_pin_cec_event(struct cec_adapter *adap,
 			     bool is_high, ktime_t ts);
 
+/**
+ * cec_queue_pin_hpd_event() - queue a pin event with a given timestamp.
+ *
+ * @adap:	pointer to the cec adapter
+ * @is_high:	when true the HPD pin is high, otherwise it is low
+ * @ts:		the timestamp for this event
+ *
+ */
+void cec_queue_pin_hpd_event(struct cec_adapter *adap, bool is_high, ktime_t ts);
+
 /**
  * cec_get_edid_phys_addr() - find and return the physical address
  *
diff --git a/include/uapi/linux/cec.h b/include/uapi/linux/cec.h
index 4351c3481aea..b9f8df3a0477 100644
--- a/include/uapi/linux/cec.h
+++ b/include/uapi/linux/cec.h
@@ -410,6 +410,8 @@ struct cec_log_addrs {
 #define CEC_EVENT_LOST_MSGS		2
 #define CEC_EVENT_PIN_CEC_LOW		3
 #define CEC_EVENT_PIN_CEC_HIGH		4
+#define CEC_EVENT_PIN_HPD_LOW		5
+#define CEC_EVENT_PIN_HPD_HIGH		6
 
 #define CEC_EVENT_FL_INITIAL_STATE	(1 << 0)
 #define CEC_EVENT_FL_DROPPED_EVENTS	(1 << 1)
-- 
cgit v1.2.3


From 943170998b200190f99d3fe7e771437e2c51f319 Mon Sep 17 00:00:00 2001
From: Petar Penkov <peterpenkov96@gmail.com>
Date: Fri, 22 Sep 2017 13:49:14 -0700
Subject: tun: enable NAPI for TUN/TAP driver

Changes TUN driver to use napi_gro_receive() upon receiving packets
rather than netif_rx_ni(). Adds flag IFF_NAPI that enables these
changes and operation is not affected if the flag is disabled.  SKBs
are constructed upon packet arrival and are queued to be processed
later.

The new path was evaluated with a benchmark with the following setup:
Open two tap devices and a receiver thread that reads in a loop for
each device. Start one sender thread and pin all threads to different
CPUs. Send 1M minimum UDP packets to each device and measure sending
time for each of the sending methods:
	napi_gro_receive():	4.90s
	netif_rx_ni():		4.90s
	netif_receive_skb():	7.20s

Signed-off-by: Petar Penkov <peterpenkov96@gmail.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Mahesh Bandewar <maheshb@google.com>
Cc: Willem de Bruijn <willemb@google.com>
Cc: davem@davemloft.net
Cc: ppenkov@stanford.edu
Acked-by: Mahesh Bandewar <maheshb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/tun.c           | 133 +++++++++++++++++++++++++++++++++++++++-----
 include/uapi/linux/if_tun.h |   1 +
 2 files changed, 119 insertions(+), 15 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 3c9985f29950..f16407242b18 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -121,7 +121,7 @@ do {								\
 #define TUN_VNET_BE     0x40000000
 
 #define TUN_FEATURES (IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR | \
-		      IFF_MULTI_QUEUE)
+		      IFF_MULTI_QUEUE | IFF_NAPI)
 #define GOODCOPY_LEN 128
 
 #define FLT_EXACT_COUNT 8
@@ -172,6 +172,7 @@ struct tun_file {
 		u16 queue_index;
 		unsigned int ifindex;
 	};
+	struct napi_struct napi;
 	struct list_head next;
 	struct tun_struct *detached;
 	struct skb_array tx_array;
@@ -229,6 +230,68 @@ struct tun_struct {
 	struct bpf_prog __rcu *xdp_prog;
 };
 
+static int tun_napi_receive(struct napi_struct *napi, int budget)
+{
+	struct tun_file *tfile = container_of(napi, struct tun_file, napi);
+	struct sk_buff_head *queue = &tfile->sk.sk_write_queue;
+	struct sk_buff_head process_queue;
+	struct sk_buff *skb;
+	int received = 0;
+
+	__skb_queue_head_init(&process_queue);
+
+	spin_lock(&queue->lock);
+	skb_queue_splice_tail_init(queue, &process_queue);
+	spin_unlock(&queue->lock);
+
+	while (received < budget && (skb = __skb_dequeue(&process_queue))) {
+		napi_gro_receive(napi, skb);
+		++received;
+	}
+
+	if (!skb_queue_empty(&process_queue)) {
+		spin_lock(&queue->lock);
+		skb_queue_splice(&process_queue, queue);
+		spin_unlock(&queue->lock);
+	}
+
+	return received;
+}
+
+static int tun_napi_poll(struct napi_struct *napi, int budget)
+{
+	unsigned int received;
+
+	received = tun_napi_receive(napi, budget);
+
+	if (received < budget)
+		napi_complete_done(napi, received);
+
+	return received;
+}
+
+static void tun_napi_init(struct tun_struct *tun, struct tun_file *tfile,
+			  bool napi_en)
+{
+	if (napi_en) {
+		netif_napi_add(tun->dev, &tfile->napi, tun_napi_poll,
+			       NAPI_POLL_WEIGHT);
+		napi_enable(&tfile->napi);
+	}
+}
+
+static void tun_napi_disable(struct tun_struct *tun, struct tun_file *tfile)
+{
+	if (tun->flags & IFF_NAPI)
+		napi_disable(&tfile->napi);
+}
+
+static void tun_napi_del(struct tun_struct *tun, struct tun_file *tfile)
+{
+	if (tun->flags & IFF_NAPI)
+		netif_napi_del(&tfile->napi);
+}
+
 #ifdef CONFIG_TUN_VNET_CROSS_LE
 static inline bool tun_legacy_is_little_endian(struct tun_struct *tun)
 {
@@ -541,6 +604,11 @@ static void __tun_detach(struct tun_file *tfile, bool clean)
 
 	tun = rtnl_dereference(tfile->tun);
 
+	if (tun && clean) {
+		tun_napi_disable(tun, tfile);
+		tun_napi_del(tun, tfile);
+	}
+
 	if (tun && !tfile->detached) {
 		u16 index = tfile->queue_index;
 		BUG_ON(index >= tun->numqueues);
@@ -598,6 +666,7 @@ static void tun_detach_all(struct net_device *dev)
 	for (i = 0; i < n; i++) {
 		tfile = rtnl_dereference(tun->tfiles[i]);
 		BUG_ON(!tfile);
+		tun_napi_disable(tun, tfile);
 		tfile->socket.sk->sk_shutdown = RCV_SHUTDOWN;
 		tfile->socket.sk->sk_data_ready(tfile->socket.sk);
 		RCU_INIT_POINTER(tfile->tun, NULL);
@@ -613,6 +682,7 @@ static void tun_detach_all(struct net_device *dev)
 	synchronize_net();
 	for (i = 0; i < n; i++) {
 		tfile = rtnl_dereference(tun->tfiles[i]);
+		tun_napi_del(tun, tfile);
 		/* Drop read queue */
 		tun_queue_purge(tfile);
 		sock_put(&tfile->sk);
@@ -631,7 +701,8 @@ static void tun_detach_all(struct net_device *dev)
 		module_put(THIS_MODULE);
 }
 
-static int tun_attach(struct tun_struct *tun, struct file *file, bool skip_filter)
+static int tun_attach(struct tun_struct *tun, struct file *file,
+		      bool skip_filter, bool napi)
 {
 	struct tun_file *tfile = file->private_data;
 	struct net_device *dev = tun->dev;
@@ -677,10 +748,12 @@ static int tun_attach(struct tun_struct *tun, struct file *file, bool skip_filte
 	rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile);
 	tun->numqueues++;
 
-	if (tfile->detached)
+	if (tfile->detached) {
 		tun_enable_queue(tfile);
-	else
+	} else {
 		sock_hold(&tfile->sk);
+		tun_napi_init(tun, tfile, napi);
+	}
 
 	tun_set_real_num_queues(tun);
 
@@ -956,13 +1029,28 @@ static void tun_poll_controller(struct net_device *dev)
 	 * Tun only receives frames when:
 	 * 1) the char device endpoint gets data from user space
 	 * 2) the tun socket gets a sendmsg call from user space
-	 * Since both of those are synchronous operations, we are guaranteed
-	 * never to have pending data when we poll for it
-	 * so there is nothing to do here but return.
+	 * If NAPI is not enabled, since both of those are synchronous
+	 * operations, we are guaranteed never to have pending data when we poll
+	 * for it so there is nothing to do here but return.
 	 * We need this though so netpoll recognizes us as an interface that
 	 * supports polling, which enables bridge devices in virt setups to
 	 * still use netconsole
+	 * If NAPI is enabled, however, we need to schedule polling for all
+	 * queues.
 	 */
+	struct tun_struct *tun = netdev_priv(dev);
+
+	if (tun->flags & IFF_NAPI) {
+		struct tun_file *tfile;
+		int i;
+
+		rcu_read_lock();
+		for (i = 0; i < tun->numqueues; i++) {
+			tfile = rcu_dereference(tun->tfiles[i]);
+			napi_schedule(&tfile->napi);
+		}
+		rcu_read_unlock();
+	}
 	return;
 }
 #endif
@@ -1549,11 +1637,25 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 	}
 
 	rxhash = __skb_get_hash_symmetric(skb);
-#ifndef CONFIG_4KSTACKS
-	tun_rx_batched(tun, tfile, skb, more);
-#else
-	netif_rx_ni(skb);
-#endif
+
+	if (tun->flags & IFF_NAPI) {
+		struct sk_buff_head *queue = &tfile->sk.sk_write_queue;
+		int queue_len;
+
+		spin_lock_bh(&queue->lock);
+		__skb_queue_tail(queue, skb);
+		queue_len = skb_queue_len(queue);
+		spin_unlock(&queue->lock);
+
+		if (!more || queue_len > NAPI_POLL_WEIGHT)
+			napi_schedule(&tfile->napi);
+
+		local_bh_enable();
+	} else if (!IS_ENABLED(CONFIG_4KSTACKS)) {
+		tun_rx_batched(tun, tfile, skb, more);
+	} else {
+		netif_rx_ni(skb);
+	}
 
 	stats = get_cpu_ptr(tun->pcpu_stats);
 	u64_stats_update_begin(&stats->syncp);
@@ -1980,7 +2082,8 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
 		if (err < 0)
 			return err;
 
-		err = tun_attach(tun, file, ifr->ifr_flags & IFF_NOFILTER);
+		err = tun_attach(tun, file, ifr->ifr_flags & IFF_NOFILTER,
+				 ifr->ifr_flags & IFF_NAPI);
 		if (err < 0)
 			return err;
 
@@ -2066,7 +2169,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
 				       NETIF_F_HW_VLAN_STAG_TX);
 
 		INIT_LIST_HEAD(&tun->disabled);
-		err = tun_attach(tun, file, false);
+		err = tun_attach(tun, file, false, ifr->ifr_flags & IFF_NAPI);
 		if (err < 0)
 			goto err_free_flow;
 
@@ -2216,7 +2319,7 @@ static int tun_set_queue(struct file *file, struct ifreq *ifr)
 		ret = security_tun_dev_attach_queue(tun->security);
 		if (ret < 0)
 			goto unlock;
-		ret = tun_attach(tun, file, false);
+		ret = tun_attach(tun, file, false, tun->flags & IFF_NAPI);
 	} else if (ifr->ifr_flags & IFF_DETACH_QUEUE) {
 		tun = rtnl_dereference(tfile->tun);
 		if (!tun || !(tun->flags & IFF_MULTI_QUEUE) || tfile->detached)
diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h
index 3cb5e1d85ddd..30b6184884eb 100644
--- a/include/uapi/linux/if_tun.h
+++ b/include/uapi/linux/if_tun.h
@@ -60,6 +60,7 @@
 /* TUNSETIFF ifr flags */
 #define IFF_TUN		0x0001
 #define IFF_TAP		0x0002
+#define IFF_NAPI	0x0010
 #define IFF_NO_PI	0x1000
 /* This flag has no real effect */
 #define IFF_ONE_QUEUE	0x2000
-- 
cgit v1.2.3


From 90e33d45940793def6f773b2d528e9f3c84ffdc7 Mon Sep 17 00:00:00 2001
From: Petar Penkov <peterpenkov96@gmail.com>
Date: Fri, 22 Sep 2017 13:49:15 -0700
Subject: tun: enable napi_gro_frags() for TUN/TAP driver

Add a TUN/TAP receive mode that exercises the napi_gro_frags()
interface. This mode is available only in TAP mode, as the interface
expects packets with Ethernet headers.

Furthermore, packets follow the layout of the iov_iter that was
received. The first iovec is the linear data, and every one after the
first is a fragment. If there are more fragments than the max number,
drop the packet. Additionally, invoke eth_get_headlen() to exercise flow
dissector code and to verify that the header resides in the linear data.

The napi_gro_frags() mode requires setting the IFF_NAPI_FRAGS option.
This is imposed because this mode is intended for testing via tools like
syzkaller and packetdrill, and the increased flexibility it provides can
introduce security vulnerabilities. This flag is accepted only if the
device is in TAP mode and has the IFF_NAPI flag set as well. This is
done because both of these are explicit requirements for correct
operation in this mode.

Signed-off-by: Petar Penkov <peterpenkov96@gmail.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Mahesh Bandewar <maheshb@google.com>
Cc: Willem de Bruijn <willemb@google.com>
Cc: davem@davemloft.net
Cc: ppenkov@stanford.edu
Acked-by: Mahesh Bandewar <maheshb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/tun.c           | 134 ++++++++++++++++++++++++++++++++++++++++++--
 include/uapi/linux/if_tun.h |   1 +
 2 files changed, 129 insertions(+), 6 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index f16407242b18..9880b3bc8fa5 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -75,6 +75,7 @@
 #include <linux/skb_array.h>
 #include <linux/bpf.h>
 #include <linux/bpf_trace.h>
+#include <linux/mutex.h>
 
 #include <linux/uaccess.h>
 
@@ -121,7 +122,8 @@ do {								\
 #define TUN_VNET_BE     0x40000000
 
 #define TUN_FEATURES (IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR | \
-		      IFF_MULTI_QUEUE | IFF_NAPI)
+		      IFF_MULTI_QUEUE | IFF_NAPI | IFF_NAPI_FRAGS)
+
 #define GOODCOPY_LEN 128
 
 #define FLT_EXACT_COUNT 8
@@ -173,6 +175,7 @@ struct tun_file {
 		unsigned int ifindex;
 	};
 	struct napi_struct napi;
+	struct mutex napi_mutex;	/* Protects access to the above napi */
 	struct list_head next;
 	struct tun_struct *detached;
 	struct skb_array tx_array;
@@ -277,6 +280,7 @@ static void tun_napi_init(struct tun_struct *tun, struct tun_file *tfile,
 		netif_napi_add(tun->dev, &tfile->napi, tun_napi_poll,
 			       NAPI_POLL_WEIGHT);
 		napi_enable(&tfile->napi);
+		mutex_init(&tfile->napi_mutex);
 	}
 }
 
@@ -292,6 +296,11 @@ static void tun_napi_del(struct tun_struct *tun, struct tun_file *tfile)
 		netif_napi_del(&tfile->napi);
 }
 
+static bool tun_napi_frags_enabled(const struct tun_struct *tun)
+{
+	return READ_ONCE(tun->flags) & IFF_NAPI_FRAGS;
+}
+
 #ifdef CONFIG_TUN_VNET_CROSS_LE
 static inline bool tun_legacy_is_little_endian(struct tun_struct *tun)
 {
@@ -1036,7 +1045,8 @@ static void tun_poll_controller(struct net_device *dev)
 	 * supports polling, which enables bridge devices in virt setups to
 	 * still use netconsole
 	 * If NAPI is enabled, however, we need to schedule polling for all
-	 * queues.
+	 * queues unless we are using napi_gro_frags(), which we call in
+	 * process context and not in NAPI context.
 	 */
 	struct tun_struct *tun = netdev_priv(dev);
 
@@ -1044,6 +1054,9 @@ static void tun_poll_controller(struct net_device *dev)
 		struct tun_file *tfile;
 		int i;
 
+		if (tun_napi_frags_enabled(tun))
+			return;
+
 		rcu_read_lock();
 		for (i = 0; i < tun->numqueues; i++) {
 			tfile = rcu_dereference(tun->tfiles[i]);
@@ -1266,6 +1279,64 @@ static unsigned int tun_chr_poll(struct file *file, poll_table *wait)
 	return mask;
 }
 
+static struct sk_buff *tun_napi_alloc_frags(struct tun_file *tfile,
+					    size_t len,
+					    const struct iov_iter *it)
+{
+	struct sk_buff *skb;
+	size_t linear;
+	int err;
+	int i;
+
+	if (it->nr_segs > MAX_SKB_FRAGS + 1)
+		return ERR_PTR(-ENOMEM);
+
+	local_bh_disable();
+	skb = napi_get_frags(&tfile->napi);
+	local_bh_enable();
+	if (!skb)
+		return ERR_PTR(-ENOMEM);
+
+	linear = iov_iter_single_seg_count(it);
+	err = __skb_grow(skb, linear);
+	if (err)
+		goto free;
+
+	skb->len = len;
+	skb->data_len = len - linear;
+	skb->truesize += skb->data_len;
+
+	for (i = 1; i < it->nr_segs; i++) {
+		size_t fragsz = it->iov[i].iov_len;
+		unsigned long offset;
+		struct page *page;
+		void *data;
+
+		if (fragsz == 0 || fragsz > PAGE_SIZE) {
+			err = -EINVAL;
+			goto free;
+		}
+
+		local_bh_disable();
+		data = napi_alloc_frag(fragsz);
+		local_bh_enable();
+		if (!data) {
+			err = -ENOMEM;
+			goto free;
+		}
+
+		page = virt_to_head_page(data);
+		offset = data - page_address(page);
+		skb_fill_page_desc(skb, i - 1, page, offset, fragsz);
+	}
+
+	return skb;
+free:
+	/* frees skb and all frags allocated with napi_alloc_frag() */
+	napi_free_frags(&tfile->napi);
+	return ERR_PTR(err);
+}
+
 /* prepad is the amount to reserve at front.  len is length after that.
  * linear is a hint as to how much to copy (usually headers). */
 static struct sk_buff *tun_alloc_skb(struct tun_file *tfile,
@@ -1478,6 +1549,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 	int err;
 	u32 rxhash;
 	int skb_xdp = 1;
+	bool frags = tun_napi_frags_enabled(tun);
 
 	if (!(tun->dev->flags & IFF_UP))
 		return -EIO;
@@ -1535,7 +1607,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 			zerocopy = true;
 	}
 
-	if (tun_can_build_skb(tun, tfile, len, noblock, zerocopy)) {
+	if (!frags && tun_can_build_skb(tun, tfile, len, noblock, zerocopy)) {
 		/* For the packet that is not easy to be processed
 		 * (e.g gso or jumbo packet), we will do it at after
 		 * skb was created with generic XDP routine.
@@ -1556,10 +1628,24 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 				linear = tun16_to_cpu(tun, gso.hdr_len);
 		}
 
-		skb = tun_alloc_skb(tfile, align, copylen, linear, noblock);
+		if (frags) {
+			mutex_lock(&tfile->napi_mutex);
+			skb = tun_napi_alloc_frags(tfile, copylen, from);
+			/* tun_napi_alloc_frags() enforces a layout for the skb.
+			 * If zerocopy is enabled, then this layout will be
+			 * overwritten by zerocopy_sg_from_iter().
+			 */
+			zerocopy = false;
+		} else {
+			skb = tun_alloc_skb(tfile, align, copylen, linear,
+					    noblock);
+		}
+
 		if (IS_ERR(skb)) {
 			if (PTR_ERR(skb) != -EAGAIN)
 				this_cpu_inc(tun->pcpu_stats->rx_dropped);
+			if (frags)
+				mutex_unlock(&tfile->napi_mutex);
 			return PTR_ERR(skb);
 		}
 
@@ -1571,6 +1657,11 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 		if (err) {
 			this_cpu_inc(tun->pcpu_stats->rx_dropped);
 			kfree_skb(skb);
+			if (frags) {
+				tfile->napi.skb = NULL;
+				mutex_unlock(&tfile->napi_mutex);
+			}
+
 			return -EFAULT;
 		}
 	}
@@ -1578,6 +1669,11 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 	if (virtio_net_hdr_to_skb(skb, &gso, tun_is_little_endian(tun))) {
 		this_cpu_inc(tun->pcpu_stats->rx_frame_errors);
 		kfree_skb(skb);
+		if (frags) {
+			tfile->napi.skb = NULL;
+			mutex_unlock(&tfile->napi_mutex);
+		}
+
 		return -EINVAL;
 	}
 
@@ -1603,7 +1699,8 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 		skb->dev = tun->dev;
 		break;
 	case IFF_TAP:
-		skb->protocol = eth_type_trans(skb, tun->dev);
+		if (!frags)
+			skb->protocol = eth_type_trans(skb, tun->dev);
 		break;
 	}
 
@@ -1638,7 +1735,23 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 
 	rxhash = __skb_get_hash_symmetric(skb);
 
-	if (tun->flags & IFF_NAPI) {
+	if (frags) {
+		/* Exercise flow dissector code path. */
+		u32 headlen = eth_get_headlen(skb->data, skb_headlen(skb));
+
+		if (headlen > skb_headlen(skb) || headlen < ETH_HLEN) {
+			this_cpu_inc(tun->pcpu_stats->rx_dropped);
+			napi_free_frags(&tfile->napi);
+			mutex_unlock(&tfile->napi_mutex);
+			WARN_ON(1);
+			return -ENOMEM;
+		}
+
+		local_bh_disable();
+		napi_gro_frags(&tfile->napi);
+		local_bh_enable();
+		mutex_unlock(&tfile->napi_mutex);
+	} else if (tun->flags & IFF_NAPI) {
 		struct sk_buff_head *queue = &tfile->sk.sk_write_queue;
 		int queue_len;
 
@@ -2061,6 +2174,15 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
 	if (tfile->detached)
 		return -EINVAL;
 
+	if ((ifr->ifr_flags & IFF_NAPI_FRAGS)) {
+		if (!capable(CAP_NET_ADMIN))
+			return -EPERM;
+
+		if (!(ifr->ifr_flags & IFF_NAPI) ||
+		    (ifr->ifr_flags & TUN_TYPE_MASK) != IFF_TAP)
+			return -EINVAL;
+	}
+
 	dev = __dev_get_by_name(net, ifr->ifr_name);
 	if (dev) {
 		if (ifr->ifr_flags & IFF_TUN_EXCL)
diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h
index 30b6184884eb..365ade5685c9 100644
--- a/include/uapi/linux/if_tun.h
+++ b/include/uapi/linux/if_tun.h
@@ -61,6 +61,7 @@
 #define IFF_TUN		0x0001
 #define IFF_TAP		0x0002
 #define IFF_NAPI	0x0010
+#define IFF_NAPI_FRAGS	0x0020
 #define IFF_NO_PI	0x1000
 /* This flag has no real effect */
 #define IFF_ONE_QUEUE	0x2000
-- 
cgit v1.2.3


From de8f3a83b0a0fddb2cf56e7a718127e9619ea3da Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Mon, 25 Sep 2017 02:25:51 +0200
Subject: bpf: add meta pointer for direct access

This work enables generic transfer of metadata from XDP into skb. The
basic idea is that we can make use of the fact that the resulting skb
must be linear and already comes with a larger headroom for supporting
bpf_xdp_adjust_head(), which mangles xdp->data. Here, we base our work
on a similar principle and introduce a small helper bpf_xdp_adjust_meta()
for adjusting a new pointer called xdp->data_meta. Thus, the packet has
a flexible and programmable room for meta data, followed by the actual
packet data. struct xdp_buff is therefore laid out that we first point
to data_hard_start, then data_meta directly prepended to data followed
by data_end marking the end of packet. bpf_xdp_adjust_head() takes into
account whether we have meta data already prepended and if so, memmove()s
this along with the given offset provided there's enough room.

xdp->data_meta is optional and programs are not required to use it. The
rationale is that when we process the packet in XDP (e.g. as DoS filter),
we can push further meta data along with it for the XDP_PASS case, and
give the guarantee that a clsact ingress BPF program on the same device
can pick this up for further post-processing. Since we work with skb
there, we can also set skb->mark, skb->priority or other skb meta data
out of BPF, thus having this scratch space generic and programmable
allows for more flexibility than defining a direct 1:1 transfer of
potentially new XDP members into skb (it's also more efficient as we
don't need to initialize/handle each of such new members). The facility
also works together with GRO aggregation. The scratch space at the head
of the packet can be a multiple of 4 bytes, up to 32 bytes large. Drivers
not yet supporting xdp->data_meta can simply be set up with xdp->data_meta
as xdp->data + 1, as bpf_xdp_adjust_meta() will detect this and bail out,
such that the subsequent match against xdp->data for later access is
guaranteed to fail.

The verifier treats xdp->data_meta/xdp->data the same way as we treat
xdp->data/xdp->data_end pointer comparisons. The requirement for doing
the compare against xdp->data is that it hasn't been modified from its
original address we got from ctx access. It may have a range marking
already from prior successful xdp->data/xdp->data_end pointer comparisons
though.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c      |   1 +
 drivers/net/ethernet/cavium/thunder/nicvf_main.c   |   1 +
 drivers/net/ethernet/intel/i40e/i40e_txrx.c        |   1 +
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c      |   1 +
 drivers/net/ethernet/mellanox/mlx4/en_rx.c         |   1 +
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c    |   1 +
 .../net/ethernet/netronome/nfp/nfp_net_common.c    |   1 +
 drivers/net/ethernet/qlogic/qede/qede_fp.c         |   1 +
 drivers/net/tun.c                                  |   1 +
 drivers/net/virtio_net.c                           |   2 +
 include/linux/bpf.h                                |   1 +
 include/linux/filter.h                             |  21 +++-
 include/linux/skbuff.h                             |  68 +++++++++++-
 include/uapi/linux/bpf.h                           |  13 ++-
 kernel/bpf/verifier.c                              | 114 ++++++++++++++++-----
 net/bpf/test_run.c                                 |   1 +
 net/core/dev.c                                     |  31 +++++-
 net/core/filter.c                                  |  77 +++++++++++++-
 net/core/skbuff.c                                  |   2 +
 19 files changed, 297 insertions(+), 42 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c
index d8f0c837b72c..06ce63c00821 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c
@@ -94,6 +94,7 @@ bool bnxt_rx_xdp(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, u16 cons,
 
 	xdp.data_hard_start = *data_ptr - offset;
 	xdp.data = *data_ptr;
+	xdp_set_data_meta_invalid(&xdp);
 	xdp.data_end = *data_ptr + *len;
 	orig_data = xdp.data;
 	mapping = rx_buf->mapping - bp->rx_dma_offset;
diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
index 49b80da51ba7..d68478afccbf 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
@@ -523,6 +523,7 @@ static inline bool nicvf_xdp_rx(struct nicvf *nic, struct bpf_prog *prog,
 
 	xdp.data_hard_start = page_address(page);
 	xdp.data = (void *)cpu_addr;
+	xdp_set_data_meta_invalid(&xdp);
 	xdp.data_end = xdp.data + len;
 	orig_data = xdp.data;
 
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index 1519dfb851d0..f426762bd83a 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -2107,6 +2107,7 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget)
 		if (!skb) {
 			xdp.data = page_address(rx_buffer->page) +
 				   rx_buffer->page_offset;
+			xdp_set_data_meta_invalid(&xdp);
 			xdp.data_hard_start = xdp.data -
 					      i40e_rx_offset(rx_ring);
 			xdp.data_end = xdp.data + size;
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index d962368d08d0..04bb03bda1cd 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -2326,6 +2326,7 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
 		if (!skb) {
 			xdp.data = page_address(rx_buffer->page) +
 				   rx_buffer->page_offset;
+			xdp_set_data_meta_invalid(&xdp);
 			xdp.data_hard_start = xdp.data -
 					      ixgbe_rx_offset(rx_ring);
 			xdp.data_end = xdp.data + size;
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index b97a55c827eb..8f9cb8abc497 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -762,6 +762,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
 
 			xdp.data_hard_start = va - frags[0].page_offset;
 			xdp.data = va;
+			xdp_set_data_meta_invalid(&xdp);
 			xdp.data_end = xdp.data + length;
 			orig_data = xdp.data;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index f1dd638384d3..30b3f3fbd719 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -794,6 +794,7 @@ static inline int mlx5e_xdp_handle(struct mlx5e_rq *rq,
 		return false;
 
 	xdp.data = va + *rx_headroom;
+	xdp_set_data_meta_invalid(&xdp);
 	xdp.data_end = xdp.data + *len;
 	xdp.data_hard_start = va;
 
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index 1c0187f0af51..e3a38be3600a 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -1583,6 +1583,7 @@ static int nfp_net_run_xdp(struct bpf_prog *prog, void *data, void *hard_start,
 
 	xdp.data_hard_start = hard_start;
 	xdp.data = data + *off;
+	xdp_set_data_meta_invalid(&xdp);
 	xdp.data_end = data + *off + *len;
 
 	orig_data = xdp.data;
diff --git a/drivers/net/ethernet/qlogic/qede/qede_fp.c b/drivers/net/ethernet/qlogic/qede/qede_fp.c
index 6fc854b120b0..48ec4c56cddf 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_fp.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_fp.c
@@ -1004,6 +1004,7 @@ static bool qede_rx_xdp(struct qede_dev *edev,
 
 	xdp.data_hard_start = page_address(bd->data);
 	xdp.data = xdp.data_hard_start + *data_offset;
+	xdp_set_data_meta_invalid(&xdp);
 	xdp.data_end = xdp.data + *len;
 
 	/* Queues always have a full reset currently, so for the time
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 2c36f6ebad79..a6e0bffe3d29 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1468,6 +1468,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
 
 		xdp.data_hard_start = buf;
 		xdp.data = buf + pad;
+		xdp_set_data_meta_invalid(&xdp);
 		xdp.data_end = xdp.data + len;
 		orig_data = xdp.data;
 		act = bpf_prog_run_xdp(xdp_prog, &xdp);
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index dd14a4547932..fc059f193e7d 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -554,6 +554,7 @@ static struct sk_buff *receive_small(struct net_device *dev,
 
 		xdp.data_hard_start = buf + VIRTNET_RX_PAD + vi->hdr_len;
 		xdp.data = xdp.data_hard_start + xdp_headroom;
+		xdp_set_data_meta_invalid(&xdp);
 		xdp.data_end = xdp.data + len;
 		orig_data = xdp.data;
 		act = bpf_prog_run_xdp(xdp_prog, &xdp);
@@ -686,6 +687,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 		data = page_address(xdp_page) + offset;
 		xdp.data_hard_start = data - VIRTIO_XDP_HEADROOM + vi->hdr_len;
 		xdp.data = data + vi->hdr_len;
+		xdp_set_data_meta_invalid(&xdp);
 		xdp.data_end = xdp.data + (len - vi->hdr_len);
 		act = bpf_prog_run_xdp(xdp_prog, &xdp);
 
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 8390859e79e7..2b672c50f160 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -137,6 +137,7 @@ enum bpf_reg_type {
 	PTR_TO_MAP_VALUE,	 /* reg points to map element value */
 	PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */
 	PTR_TO_STACK,		 /* reg == frame_pointer + offset */
+	PTR_TO_PACKET_META,	 /* skb->data - meta_len */
 	PTR_TO_PACKET,		 /* reg points to skb->data */
 	PTR_TO_PACKET_END,	 /* skb->data + headlen */
 };
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 052bab3d62e7..911d454af107 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -487,12 +487,14 @@ struct sk_filter {
 
 struct bpf_skb_data_end {
 	struct qdisc_skb_cb qdisc_cb;
+	void *data_meta;
 	void *data_end;
 };
 
 struct xdp_buff {
 	void *data;
 	void *data_end;
+	void *data_meta;
 	void *data_hard_start;
 };
 
@@ -507,7 +509,8 @@ static inline void bpf_compute_data_pointers(struct sk_buff *skb)
 	struct bpf_skb_data_end *cb = (struct bpf_skb_data_end *)skb->cb;
 
 	BUILD_BUG_ON(sizeof(*cb) > FIELD_SIZEOF(struct sk_buff, cb));
-	cb->data_end = skb->data + skb_headlen(skb);
+	cb->data_meta = skb->data - skb_metadata_len(skb);
+	cb->data_end  = skb->data + skb_headlen(skb);
 }
 
 static inline u8 *bpf_skb_cb(struct sk_buff *skb)
@@ -728,8 +731,22 @@ int xdp_do_redirect(struct net_device *dev,
 		    struct bpf_prog *prog);
 void xdp_do_flush_map(void);
 
+/* Drivers not supporting XDP metadata can use this helper, which
+ * rejects any room expansion for metadata as a result.
+ */
+static __always_inline void
+xdp_set_data_meta_invalid(struct xdp_buff *xdp)
+{
+	xdp->data_meta = xdp->data + 1;
+}
+
+static __always_inline bool
+xdp_data_meta_unsupported(const struct xdp_buff *xdp)
+{
+	return unlikely(xdp->data_meta > xdp->data);
+}
+
 void bpf_warn_invalid_xdp_action(u32 act);
-void bpf_warn_invalid_xdp_redirect(u32 ifindex);
 
 struct sock *do_sk_redirect_map(void);
 
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index f9db5539a6fb..19e64bfb1a66 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -489,8 +489,9 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
  * the end of the header data, ie. at skb->end.
  */
 struct skb_shared_info {
-	unsigned short	_unused;
-	unsigned char	nr_frags;
+	__u8		__unused;
+	__u8		meta_len;
+	__u8		nr_frags;
 	__u8		tx_flags;
 	unsigned short	gso_size;
 	/* Warning: this field is not always filled in (UFO)! */
@@ -3400,6 +3401,69 @@ static inline ktime_t net_invalid_timestamp(void)
 	return 0;
 }
 
+static inline u8 skb_metadata_len(const struct sk_buff *skb)
+{
+	return skb_shinfo(skb)->meta_len;
+}
+
+static inline void *skb_metadata_end(const struct sk_buff *skb)
+{
+	return skb_mac_header(skb);
+}
+
+static inline bool __skb_metadata_differs(const struct sk_buff *skb_a,
+					  const struct sk_buff *skb_b,
+					  u8 meta_len)
+{
+	const void *a = skb_metadata_end(skb_a);
+	const void *b = skb_metadata_end(skb_b);
+	/* Using more efficient varaiant than plain call to memcmp(). */
+#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
+	u64 diffs = 0;
+
+	switch (meta_len) {
+#define __it(x, op) (x -= sizeof(u##op))
+#define __it_diff(a, b, op) (*(u##op *)__it(a, op)) ^ (*(u##op *)__it(b, op))
+	case 32: diffs |= __it_diff(a, b, 64);
+	case 24: diffs |= __it_diff(a, b, 64);
+	case 16: diffs |= __it_diff(a, b, 64);
+	case  8: diffs |= __it_diff(a, b, 64);
+		break;
+	case 28: diffs |= __it_diff(a, b, 64);
+	case 20: diffs |= __it_diff(a, b, 64);
+	case 12: diffs |= __it_diff(a, b, 64);
+	case  4: diffs |= __it_diff(a, b, 32);
+		break;
+	}
+	return diffs;
+#else
+	return memcmp(a - meta_len, b - meta_len, meta_len);
+#endif
+}
+
+static inline bool skb_metadata_differs(const struct sk_buff *skb_a,
+					const struct sk_buff *skb_b)
+{
+	u8 len_a = skb_metadata_len(skb_a);
+	u8 len_b = skb_metadata_len(skb_b);
+
+	if (!(len_a | len_b))
+		return false;
+
+	return len_a != len_b ?
+	       true : __skb_metadata_differs(skb_a, skb_b, len_a);
+}
+
+static inline void skb_metadata_set(struct sk_buff *skb, u8 meta_len)
+{
+	skb_shinfo(skb)->meta_len = meta_len;
+}
+
+static inline void skb_metadata_clear(struct sk_buff *skb)
+{
+	skb_metadata_set(skb, 0);
+}
+
 struct sk_buff *skb_clone_sk(struct sk_buff *skb);
 
 #ifdef CONFIG_NETWORK_PHY_TIMESTAMPING
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 43ab5c402f98..e43491ac4823 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -582,6 +582,12 @@ union bpf_attr {
  *	@map: pointer to sockmap to update
  *	@key: key to insert/update sock in map
  *	@flags: same flags as map update elem
+ *
+ * int bpf_xdp_adjust_meta(xdp_md, delta)
+ *     Adjust the xdp_md.data_meta by delta
+ *     @xdp_md: pointer to xdp_md
+ *     @delta: An positive/negative integer to be added to xdp_md.data_meta
+ *     Return: 0 on success or negative on error
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -638,6 +644,7 @@ union bpf_attr {
 	FN(redirect_map),		\
 	FN(sk_redirect_map),		\
 	FN(sock_map_update),		\
+	FN(xdp_adjust_meta),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -715,7 +722,7 @@ struct __sk_buff {
 	__u32 data_end;
 	__u32 napi_id;
 
-	/* accessed by BPF_PROG_TYPE_sk_skb types */
+	/* Accessed by BPF_PROG_TYPE_sk_skb types from here to ... */
 	__u32 family;
 	__u32 remote_ip4;	/* Stored in network byte order */
 	__u32 local_ip4;	/* Stored in network byte order */
@@ -723,6 +730,9 @@ struct __sk_buff {
 	__u32 local_ip6[4];	/* Stored in network byte order */
 	__u32 remote_port;	/* Stored in network byte order */
 	__u32 local_port;	/* stored in host byte order */
+	/* ... here. */
+
+	__u32 data_meta;
 };
 
 struct bpf_tunnel_key {
@@ -783,6 +793,7 @@ enum xdp_action {
 struct xdp_md {
 	__u32 data;
 	__u32 data_end;
+	__u32 data_meta;
 };
 
 enum sk_action {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index b914fbe1383e..f849eca36052 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -177,6 +177,12 @@ static __printf(1, 2) void verbose(const char *fmt, ...)
 	va_end(args);
 }
 
+static bool type_is_pkt_pointer(enum bpf_reg_type type)
+{
+	return type == PTR_TO_PACKET ||
+	       type == PTR_TO_PACKET_META;
+}
+
 /* string representation of 'enum bpf_reg_type' */
 static const char * const reg_type_str[] = {
 	[NOT_INIT]		= "?",
@@ -187,6 +193,7 @@ static const char * const reg_type_str[] = {
 	[PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null",
 	[PTR_TO_STACK]		= "fp",
 	[PTR_TO_PACKET]		= "pkt",
+	[PTR_TO_PACKET_META]	= "pkt_meta",
 	[PTR_TO_PACKET_END]	= "pkt_end",
 };
 
@@ -226,7 +233,7 @@ static void print_verifier_state(struct bpf_verifier_state *state)
 			verbose("(id=%d", reg->id);
 			if (t != SCALAR_VALUE)
 				verbose(",off=%d", reg->off);
-			if (t == PTR_TO_PACKET)
+			if (type_is_pkt_pointer(t))
 				verbose(",r=%d", reg->range);
 			else if (t == CONST_PTR_TO_MAP ||
 				 t == PTR_TO_MAP_VALUE ||
@@ -519,6 +526,31 @@ static void mark_reg_known_zero(struct bpf_reg_state *regs, u32 regno)
 	__mark_reg_known_zero(regs + regno);
 }
 
+static bool reg_is_pkt_pointer(const struct bpf_reg_state *reg)
+{
+	return type_is_pkt_pointer(reg->type);
+}
+
+static bool reg_is_pkt_pointer_any(const struct bpf_reg_state *reg)
+{
+	return reg_is_pkt_pointer(reg) ||
+	       reg->type == PTR_TO_PACKET_END;
+}
+
+/* Unmodified PTR_TO_PACKET[_META,_END] register from ctx access. */
+static bool reg_is_init_pkt_pointer(const struct bpf_reg_state *reg,
+				    enum bpf_reg_type which)
+{
+	/* The register can already have a range from prior markings.
+	 * This is fine as long as it hasn't been advanced from its
+	 * origin.
+	 */
+	return reg->type == which &&
+	       reg->id == 0 &&
+	       reg->off == 0 &&
+	       tnum_equals_const(reg->var_off, 0);
+}
+
 /* Attempts to improve min/max values based on var_off information */
 static void __update_reg_bounds(struct bpf_reg_state *reg)
 {
@@ -702,6 +734,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
 	case PTR_TO_STACK:
 	case PTR_TO_CTX:
 	case PTR_TO_PACKET:
+	case PTR_TO_PACKET_META:
 	case PTR_TO_PACKET_END:
 	case CONST_PTR_TO_MAP:
 		return true;
@@ -1047,7 +1080,10 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
 
 	switch (reg->type) {
 	case PTR_TO_PACKET:
-		/* special case, because of NET_IP_ALIGN */
+	case PTR_TO_PACKET_META:
+		/* Special case, because of NET_IP_ALIGN. Given metadata sits
+		 * right in front, treat it the very same way.
+		 */
 		return check_pkt_ptr_alignment(reg, off, size, strict);
 	case PTR_TO_MAP_VALUE:
 		pointer_desc = "value ";
@@ -1124,8 +1160,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 		err = check_ctx_access(env, insn_idx, off, size, t, &reg_type);
 		if (!err && t == BPF_READ && value_regno >= 0) {
 			/* ctx access returns either a scalar, or a
-			 * PTR_TO_PACKET[_END].  In the latter case, we know
-			 * the offset is zero.
+			 * PTR_TO_PACKET[_META,_END]. In the latter
+			 * case, we know the offset is zero.
 			 */
 			if (reg_type == SCALAR_VALUE)
 				mark_reg_unknown(state->regs, value_regno);
@@ -1170,7 +1206,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 		} else {
 			err = check_stack_read(state, off, size, value_regno);
 		}
-	} else if (reg->type == PTR_TO_PACKET) {
+	} else if (reg_is_pkt_pointer(reg)) {
 		if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) {
 			verbose("cannot write into packet\n");
 			return -EACCES;
@@ -1310,6 +1346,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
 
 	switch (reg->type) {
 	case PTR_TO_PACKET:
+	case PTR_TO_PACKET_META:
 		return check_packet_access(env, regno, reg->off, access_size);
 	case PTR_TO_MAP_VALUE:
 		return check_map_access(env, regno, reg->off, access_size);
@@ -1342,7 +1379,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 		return 0;
 	}
 
-	if (type == PTR_TO_PACKET &&
+	if (type_is_pkt_pointer(type) &&
 	    !may_access_direct_pkt_data(env, meta, BPF_READ)) {
 		verbose("helper access to the packet is not allowed\n");
 		return -EACCES;
@@ -1351,7 +1388,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 	if (arg_type == ARG_PTR_TO_MAP_KEY ||
 	    arg_type == ARG_PTR_TO_MAP_VALUE) {
 		expected_type = PTR_TO_STACK;
-		if (type != PTR_TO_PACKET && type != expected_type)
+		if (!type_is_pkt_pointer(type) &&
+		    type != expected_type)
 			goto err_type;
 	} else if (arg_type == ARG_CONST_SIZE ||
 		   arg_type == ARG_CONST_SIZE_OR_ZERO) {
@@ -1375,7 +1413,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 		 */
 		if (register_is_null(*reg))
 			/* final test in check_stack_boundary() */;
-		else if (type != PTR_TO_PACKET && type != PTR_TO_MAP_VALUE &&
+		else if (!type_is_pkt_pointer(type) &&
+			 type != PTR_TO_MAP_VALUE &&
 			 type != expected_type)
 			goto err_type;
 		meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM;
@@ -1401,7 +1440,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 			verbose("invalid map_ptr to access map->key\n");
 			return -EACCES;
 		}
-		if (type == PTR_TO_PACKET)
+		if (type_is_pkt_pointer(type))
 			err = check_packet_access(env, regno, reg->off,
 						  meta->map_ptr->key_size);
 		else
@@ -1417,7 +1456,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 			verbose("invalid map_ptr to access map->value\n");
 			return -EACCES;
 		}
-		if (type == PTR_TO_PACKET)
+		if (type_is_pkt_pointer(type))
 			err = check_packet_access(env, regno, reg->off,
 						  meta->map_ptr->value_size);
 		else
@@ -1590,8 +1629,8 @@ static int check_raw_mode(const struct bpf_func_proto *fn)
 	return count > 1 ? -EINVAL : 0;
 }
 
-/* Packet data might have moved, any old PTR_TO_PACKET[_END] are now invalid,
- * so turn them into unknown SCALAR_VALUE.
+/* Packet data might have moved, any old PTR_TO_PACKET[_META,_END]
+ * are now invalid, so turn them into unknown SCALAR_VALUE.
  */
 static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
 {
@@ -1600,18 +1639,15 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
 	int i;
 
 	for (i = 0; i < MAX_BPF_REG; i++)
-		if (regs[i].type == PTR_TO_PACKET ||
-		    regs[i].type == PTR_TO_PACKET_END)
+		if (reg_is_pkt_pointer_any(&regs[i]))
 			mark_reg_unknown(regs, i);
 
 	for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
 		if (state->stack_slot_type[i] != STACK_SPILL)
 			continue;
 		reg = &state->spilled_regs[i / BPF_REG_SIZE];
-		if (reg->type != PTR_TO_PACKET &&
-		    reg->type != PTR_TO_PACKET_END)
-			continue;
-		__mark_reg_unknown(reg);
+		if (reg_is_pkt_pointer_any(reg))
+			__mark_reg_unknown(reg);
 	}
 }
 
@@ -1871,7 +1907,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 		}
 		dst_reg->var_off = tnum_add(ptr_reg->var_off, off_reg->var_off);
 		dst_reg->off = ptr_reg->off;
-		if (ptr_reg->type == PTR_TO_PACKET) {
+		if (reg_is_pkt_pointer(ptr_reg)) {
 			dst_reg->id = ++env->id_gen;
 			/* something was added to pkt_ptr, set range to zero */
 			dst_reg->range = 0;
@@ -1931,7 +1967,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 		}
 		dst_reg->var_off = tnum_sub(ptr_reg->var_off, off_reg->var_off);
 		dst_reg->off = ptr_reg->off;
-		if (ptr_reg->type == PTR_TO_PACKET) {
+		if (reg_is_pkt_pointer(ptr_reg)) {
 			dst_reg->id = ++env->id_gen;
 			/* something was added to pkt_ptr, set range to zero */
 			if (smin_val < 0)
@@ -2421,7 +2457,8 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 }
 
 static void find_good_pkt_pointers(struct bpf_verifier_state *state,
-				   struct bpf_reg_state *dst_reg)
+				   struct bpf_reg_state *dst_reg,
+				   enum bpf_reg_type type)
 {
 	struct bpf_reg_state *regs = state->regs, *reg;
 	int i;
@@ -2483,7 +2520,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
 	 * dst_reg->off is known < MAX_PACKET_OFF, therefore it fits in a u16.
 	 */
 	for (i = 0; i < MAX_BPF_REG; i++)
-		if (regs[i].type == PTR_TO_PACKET && regs[i].id == dst_reg->id)
+		if (regs[i].type == type && regs[i].id == dst_reg->id)
 			/* keep the maximum range already checked */
 			regs[i].range = max_t(u16, regs[i].range, dst_reg->off);
 
@@ -2491,7 +2528,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
 		if (state->stack_slot_type[i] != STACK_SPILL)
 			continue;
 		reg = &state->spilled_regs[i / BPF_REG_SIZE];
-		if (reg->type == PTR_TO_PACKET && reg->id == dst_reg->id)
+		if (reg->type == type && reg->id == dst_reg->id)
 			reg->range = max_t(u16, reg->range, dst_reg->off);
 	}
 }
@@ -2856,19 +2893,39 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
 	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT &&
 		   dst_reg->type == PTR_TO_PACKET &&
 		   regs[insn->src_reg].type == PTR_TO_PACKET_END) {
-		find_good_pkt_pointers(this_branch, dst_reg);
+		find_good_pkt_pointers(this_branch, dst_reg, PTR_TO_PACKET);
 	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT &&
 		   dst_reg->type == PTR_TO_PACKET &&
 		   regs[insn->src_reg].type == PTR_TO_PACKET_END) {
-		find_good_pkt_pointers(other_branch, dst_reg);
+		find_good_pkt_pointers(other_branch, dst_reg, PTR_TO_PACKET);
 	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE &&
 		   dst_reg->type == PTR_TO_PACKET_END &&
 		   regs[insn->src_reg].type == PTR_TO_PACKET) {
-		find_good_pkt_pointers(other_branch, &regs[insn->src_reg]);
+		find_good_pkt_pointers(other_branch, &regs[insn->src_reg],
+				       PTR_TO_PACKET);
 	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE &&
 		   dst_reg->type == PTR_TO_PACKET_END &&
 		   regs[insn->src_reg].type == PTR_TO_PACKET) {
-		find_good_pkt_pointers(this_branch, &regs[insn->src_reg]);
+		find_good_pkt_pointers(this_branch, &regs[insn->src_reg],
+				       PTR_TO_PACKET);
+	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT &&
+		   dst_reg->type == PTR_TO_PACKET_META &&
+		   reg_is_init_pkt_pointer(&regs[insn->src_reg], PTR_TO_PACKET)) {
+		find_good_pkt_pointers(this_branch, dst_reg, PTR_TO_PACKET_META);
+	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT &&
+		   dst_reg->type == PTR_TO_PACKET_META &&
+		   reg_is_init_pkt_pointer(&regs[insn->src_reg], PTR_TO_PACKET)) {
+		find_good_pkt_pointers(other_branch, dst_reg, PTR_TO_PACKET_META);
+	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE &&
+		   reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
+		   regs[insn->src_reg].type == PTR_TO_PACKET_META) {
+		find_good_pkt_pointers(other_branch, &regs[insn->src_reg],
+				       PTR_TO_PACKET_META);
+	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE &&
+		   reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
+		   regs[insn->src_reg].type == PTR_TO_PACKET_META) {
+		find_good_pkt_pointers(this_branch, &regs[insn->src_reg],
+				       PTR_TO_PACKET_META);
 	} else if (is_pointer_value(env, insn->dst_reg)) {
 		verbose("R%d pointer comparison prohibited\n", insn->dst_reg);
 		return -EACCES;
@@ -3298,8 +3355,9 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
 			return false;
 		/* Check our ids match any regs they're supposed to */
 		return check_ids(rold->id, rcur->id, idmap);
+	case PTR_TO_PACKET_META:
 	case PTR_TO_PACKET:
-		if (rcur->type != PTR_TO_PACKET)
+		if (rcur->type != rold->type)
 			return false;
 		/* We must have at least as much range as the old ptr
 		 * did, so that any accesses which were safe before are
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index df672517b4fd..a86e6687026e 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -162,6 +162,7 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr,
 
 	xdp.data_hard_start = data;
 	xdp.data = data + XDP_PACKET_HEADROOM + NET_IP_ALIGN;
+	xdp.data_meta = xdp.data;
 	xdp.data_end = xdp.data + size;
 
 	retval = bpf_test_run(prog, &xdp, repeat, &duration);
diff --git a/net/core/dev.c b/net/core/dev.c
index 97abddd9039a..e350c768d4b5 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3864,8 +3864,8 @@ drop:
 static u32 netif_receive_generic_xdp(struct sk_buff *skb,
 				     struct bpf_prog *xdp_prog)
 {
+	u32 metalen, act = XDP_DROP;
 	struct xdp_buff xdp;
-	u32 act = XDP_DROP;
 	void *orig_data;
 	int hlen, off;
 	u32 mac_len;
@@ -3876,8 +3876,25 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
 	if (skb_cloned(skb))
 		return XDP_PASS;
 
-	if (skb_linearize(skb))
-		goto do_drop;
+	/* XDP packets must be linear and must have sufficient headroom
+	 * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also
+	 * native XDP provides, thus we need to do it here as well.
+	 */
+	if (skb_is_nonlinear(skb) ||
+	    skb_headroom(skb) < XDP_PACKET_HEADROOM) {
+		int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
+		int troom = skb->tail + skb->data_len - skb->end;
+
+		/* In case we have to go down the path and also linearize,
+		 * then lets do the pskb_expand_head() work just once here.
+		 */
+		if (pskb_expand_head(skb,
+				     hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
+				     troom > 0 ? troom + 128 : 0, GFP_ATOMIC))
+			goto do_drop;
+		if (troom > 0 && __skb_linearize(skb))
+			goto do_drop;
+	}
 
 	/* The XDP program wants to see the packet starting at the MAC
 	 * header.
@@ -3885,6 +3902,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
 	mac_len = skb->data - skb_mac_header(skb);
 	hlen = skb_headlen(skb) + mac_len;
 	xdp.data = skb->data - mac_len;
+	xdp.data_meta = xdp.data;
 	xdp.data_end = xdp.data + hlen;
 	xdp.data_hard_start = skb->data - skb_headroom(skb);
 	orig_data = xdp.data;
@@ -3902,10 +3920,12 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
 	case XDP_REDIRECT:
 	case XDP_TX:
 		__skb_push(skb, mac_len);
-		/* fall through */
+		break;
 	case XDP_PASS:
+		metalen = xdp.data - xdp.data_meta;
+		if (metalen)
+			skb_metadata_set(skb, metalen);
 		break;
-
 	default:
 		bpf_warn_invalid_xdp_action(act);
 		/* fall through */
@@ -4695,6 +4715,7 @@ static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
 		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
 		diffs |= p->vlan_tci ^ skb->vlan_tci;
 		diffs |= skb_metadata_dst_cmp(p, skb);
+		diffs |= skb_metadata_differs(p, skb);
 		if (maclen == ETH_HLEN)
 			diffs |= compare_ether_header(skb_mac_header(p),
 						      skb_mac_header(skb));
diff --git a/net/core/filter.c b/net/core/filter.c
index c468e7cfad19..9b6e7e84aafd 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2447,14 +2447,26 @@ static const struct bpf_func_proto bpf_skb_change_head_proto = {
 	.arg3_type	= ARG_ANYTHING,
 };
 
+static unsigned long xdp_get_metalen(const struct xdp_buff *xdp)
+{
+	return xdp_data_meta_unsupported(xdp) ? 0 :
+	       xdp->data - xdp->data_meta;
+}
+
 BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset)
 {
+	unsigned long metalen = xdp_get_metalen(xdp);
+	void *data_start = xdp->data_hard_start + metalen;
 	void *data = xdp->data + offset;
 
-	if (unlikely(data < xdp->data_hard_start ||
+	if (unlikely(data < data_start ||
 		     data > xdp->data_end - ETH_HLEN))
 		return -EINVAL;
 
+	if (metalen)
+		memmove(xdp->data_meta + offset,
+			xdp->data_meta, metalen);
+	xdp->data_meta += offset;
 	xdp->data = data;
 
 	return 0;
@@ -2468,6 +2480,33 @@ static const struct bpf_func_proto bpf_xdp_adjust_head_proto = {
 	.arg2_type	= ARG_ANYTHING,
 };
 
+BPF_CALL_2(bpf_xdp_adjust_meta, struct xdp_buff *, xdp, int, offset)
+{
+	void *meta = xdp->data_meta + offset;
+	unsigned long metalen = xdp->data - meta;
+
+	if (xdp_data_meta_unsupported(xdp))
+		return -ENOTSUPP;
+	if (unlikely(meta < xdp->data_hard_start ||
+		     meta > xdp->data))
+		return -EINVAL;
+	if (unlikely((metalen & (sizeof(__u32) - 1)) ||
+		     (metalen > 32)))
+		return -EACCES;
+
+	xdp->data_meta = meta;
+
+	return 0;
+}
+
+static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = {
+	.func		= bpf_xdp_adjust_meta,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+};
+
 static int __bpf_tx_xdp(struct net_device *dev,
 			struct bpf_map *map,
 			struct xdp_buff *xdp,
@@ -2692,7 +2731,8 @@ bool bpf_helper_changes_pkt_data(void *func)
 	    func == bpf_clone_redirect ||
 	    func == bpf_l3_csum_replace ||
 	    func == bpf_l4_csum_replace ||
-	    func == bpf_xdp_adjust_head)
+	    func == bpf_xdp_adjust_head ||
+	    func == bpf_xdp_adjust_meta)
 		return true;
 
 	return false;
@@ -3288,6 +3328,8 @@ xdp_func_proto(enum bpf_func_id func_id)
 		return &bpf_get_smp_processor_id_proto;
 	case BPF_FUNC_xdp_adjust_head:
 		return &bpf_xdp_adjust_head_proto;
+	case BPF_FUNC_xdp_adjust_meta:
+		return &bpf_xdp_adjust_meta_proto;
 	case BPF_FUNC_redirect:
 		return &bpf_xdp_redirect_proto;
 	case BPF_FUNC_redirect_map:
@@ -3418,6 +3460,7 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type
 	case bpf_ctx_range_till(struct __sk_buff, remote_ip4, remote_ip4):
 	case bpf_ctx_range_till(struct __sk_buff, local_ip4, local_ip4):
 	case bpf_ctx_range(struct __sk_buff, data):
+	case bpf_ctx_range(struct __sk_buff, data_meta):
 	case bpf_ctx_range(struct __sk_buff, data_end):
 		if (size != size_default)
 			return false;
@@ -3444,6 +3487,7 @@ static bool sk_filter_is_valid_access(int off, int size,
 	switch (off) {
 	case bpf_ctx_range(struct __sk_buff, tc_classid):
 	case bpf_ctx_range(struct __sk_buff, data):
+	case bpf_ctx_range(struct __sk_buff, data_meta):
 	case bpf_ctx_range(struct __sk_buff, data_end):
 	case bpf_ctx_range_till(struct __sk_buff, family, local_port):
 		return false;
@@ -3468,6 +3512,7 @@ static bool lwt_is_valid_access(int off, int size,
 	switch (off) {
 	case bpf_ctx_range(struct __sk_buff, tc_classid):
 	case bpf_ctx_range_till(struct __sk_buff, family, local_port):
+	case bpf_ctx_range(struct __sk_buff, data_meta):
 		return false;
 	}
 
@@ -3586,6 +3631,9 @@ static bool tc_cls_act_is_valid_access(int off, int size,
 	case bpf_ctx_range(struct __sk_buff, data):
 		info->reg_type = PTR_TO_PACKET;
 		break;
+	case bpf_ctx_range(struct __sk_buff, data_meta):
+		info->reg_type = PTR_TO_PACKET_META;
+		break;
 	case bpf_ctx_range(struct __sk_buff, data_end):
 		info->reg_type = PTR_TO_PACKET_END;
 		break;
@@ -3619,6 +3667,9 @@ static bool xdp_is_valid_access(int off, int size,
 	case offsetof(struct xdp_md, data):
 		info->reg_type = PTR_TO_PACKET;
 		break;
+	case offsetof(struct xdp_md, data_meta):
+		info->reg_type = PTR_TO_PACKET_META;
+		break;
 	case offsetof(struct xdp_md, data_end):
 		info->reg_type = PTR_TO_PACKET_END;
 		break;
@@ -3677,6 +3728,12 @@ static bool sk_skb_is_valid_access(int off, int size,
 				   enum bpf_access_type type,
 				   struct bpf_insn_access_aux *info)
 {
+	switch (off) {
+	case bpf_ctx_range(struct __sk_buff, tc_classid):
+	case bpf_ctx_range(struct __sk_buff, data_meta):
+		return false;
+	}
+
 	if (type == BPF_WRITE) {
 		switch (off) {
 		case bpf_ctx_range(struct __sk_buff, mark):
@@ -3689,8 +3746,6 @@ static bool sk_skb_is_valid_access(int off, int size,
 	}
 
 	switch (off) {
-	case bpf_ctx_range(struct __sk_buff, tc_classid):
-		return false;
 	case bpf_ctx_range(struct __sk_buff, data):
 		info->reg_type = PTR_TO_PACKET;
 		break;
@@ -3847,6 +3902,15 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
 				      offsetof(struct sk_buff, data));
 		break;
 
+	case offsetof(struct __sk_buff, data_meta):
+		off  = si->off;
+		off -= offsetof(struct __sk_buff, data_meta);
+		off += offsetof(struct sk_buff, cb);
+		off += offsetof(struct bpf_skb_data_end, data_meta);
+		*insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg,
+				      si->src_reg, off);
+		break;
+
 	case offsetof(struct __sk_buff, data_end):
 		off  = si->off;
 		off -= offsetof(struct __sk_buff, data_end);
@@ -4095,6 +4159,11 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type,
 				      si->dst_reg, si->src_reg,
 				      offsetof(struct xdp_buff, data));
 		break;
+	case offsetof(struct xdp_md, data_meta):
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_meta),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct xdp_buff, data_meta));
+		break;
 	case offsetof(struct xdp_md, data_end):
 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_end),
 				      si->dst_reg, si->src_reg,
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 000ce735fa8d..d98c2e3ce2bf 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1509,6 +1509,8 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 	skb->nohdr    = 0;
 	atomic_set(&skb_shinfo(skb)->dataref, 1);
 
+	skb_metadata_clear(skb);
+
 	/* It is not generally safe to change skb->truesize.
 	 * For the moment, we really care of rx path, or
 	 * when skb is orphaned (not attached to a socket).
-- 
cgit v1.2.3


From 262832bc5acda76fd8f901d39f4da1121d951222 Mon Sep 17 00:00:00 2001
From: Alice Frosi <alice@linux.vnet.ibm.com>
Date: Thu, 14 Sep 2017 12:36:03 +0200
Subject: s390/ptrace: add runtime instrumention register get/set

Add runtime instrumentation register get and set, which allow reading
and modifying the runtime instrumentation control block.

Signed-off-by: Alice Frosi <alice@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/kernel/ptrace.c | 109 ++++++++++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/elf.h  |   1 +
 2 files changed, 110 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c
index a5f8f0c8ccf0..ea711f141bb8 100644
--- a/arch/s390/kernel/ptrace.c
+++ b/arch/s390/kernel/ptrace.c
@@ -30,6 +30,9 @@
 #include <linux/uaccess.h>
 #include <asm/unistd.h>
 #include <asm/switch_to.h>
+#include <asm/runtime_instr.h>
+#include <asm/facility.h>
+
 #include "entry.h"
 
 #ifdef CONFIG_COMPAT
@@ -1239,6 +1242,96 @@ static int s390_gs_bc_set(struct task_struct *target,
 				  data, 0, sizeof(struct gs_cb));
 }
 
+static bool is_ri_cb_valid(struct runtime_instr_cb *cb)
+{
+	return (cb->rca & 0x1f) == 0 &&
+		(cb->roa & 0xfff) == 0 &&
+		(cb->rla & 0xfff) == 0xfff &&
+		cb->s == 1 &&
+		cb->k == 1 &&
+		cb->h == 0 &&
+		cb->reserved1 == 0 &&
+		cb->ps == 1 &&
+		cb->qs == 0 &&
+		cb->pc == 1 &&
+		cb->qc == 0 &&
+		cb->reserved2 == 0 &&
+		cb->key == PAGE_DEFAULT_KEY &&
+		cb->reserved3 == 0 &&
+		cb->reserved4 == 0 &&
+		cb->reserved5 == 0 &&
+		cb->reserved6 == 0 &&
+		cb->reserved7 == 0 &&
+		cb->reserved8 == 0 &&
+		cb->rla >= cb->roa &&
+		cb->rca >= cb->roa &&
+		cb->rca <= cb->rla+1 &&
+		cb->m < 3;
+}
+
+static int s390_runtime_instr_get(struct task_struct *target,
+				const struct user_regset *regset,
+				unsigned int pos, unsigned int count,
+				void *kbuf, void __user *ubuf)
+{
+	struct runtime_instr_cb *data = target->thread.ri_cb;
+
+	if (!test_facility(64))
+		return -ENODEV;
+	if (!data)
+		return -ENODATA;
+
+	return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+				   data, 0, sizeof(struct runtime_instr_cb));
+}
+
+static int s390_runtime_instr_set(struct task_struct *target,
+				  const struct user_regset *regset,
+				  unsigned int pos, unsigned int count,
+				  const void *kbuf, const void __user *ubuf)
+{
+	struct runtime_instr_cb ri_cb = { }, *data = NULL;
+	int rc;
+
+	if (!test_facility(64))
+		return -ENODEV;
+
+	if (!target->thread.ri_cb) {
+		data = kzalloc(sizeof(*data), GFP_KERNEL);
+		if (!data)
+			return -ENOMEM;
+	}
+
+	if (target->thread.ri_cb) {
+		if (target == current)
+			store_runtime_instr_cb(&ri_cb);
+		else
+			ri_cb = *target->thread.ri_cb;
+	}
+
+	rc = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+				&ri_cb, 0, sizeof(struct runtime_instr_cb));
+	if (rc) {
+		kfree(data);
+		return -EFAULT;
+	}
+
+	if (!is_ri_cb_valid(&ri_cb)) {
+		kfree(data);
+		return -EINVAL;
+	}
+
+	preempt_disable();
+	if (!target->thread.ri_cb)
+		target->thread.ri_cb = data;
+	*target->thread.ri_cb = ri_cb;
+	if (target == current)
+		load_runtime_instr_cb(target->thread.ri_cb);
+	preempt_enable();
+
+	return 0;
+}
+
 static const struct user_regset s390_regsets[] = {
 	{
 		.core_note_type = NT_PRSTATUS,
@@ -1312,6 +1405,14 @@ static const struct user_regset s390_regsets[] = {
 		.get = s390_gs_bc_get,
 		.set = s390_gs_bc_set,
 	},
+	{
+		.core_note_type = NT_S390_RI_CB,
+		.n = sizeof(struct runtime_instr_cb) / sizeof(__u64),
+		.size = sizeof(__u64),
+		.align = sizeof(__u64),
+		.get = s390_runtime_instr_get,
+		.set = s390_runtime_instr_set,
+	},
 };
 
 static const struct user_regset_view user_s390_view = {
@@ -1548,6 +1649,14 @@ static const struct user_regset s390_compat_regsets[] = {
 		.get = s390_gs_cb_get,
 		.set = s390_gs_cb_set,
 	},
+	{
+		.core_note_type = NT_S390_RI_CB,
+		.n = sizeof(struct runtime_instr_cb) / sizeof(__u64),
+		.size = sizeof(__u64),
+		.align = sizeof(__u64),
+		.get = s390_runtime_instr_get,
+		.set = s390_runtime_instr_set,
+	},
 };
 
 static const struct user_regset_view user_s390_compat_view = {
diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h
index b5280db9ef6a..e3739c330c15 100644
--- a/include/uapi/linux/elf.h
+++ b/include/uapi/linux/elf.h
@@ -411,6 +411,7 @@ typedef struct elf64_shdr {
 #define NT_S390_VXRS_HIGH	0x30a	/* s390 vector registers 16-31 */
 #define NT_S390_GS_CB	0x30b		/* s390 guarded storage registers */
 #define NT_S390_GS_BC	0x30c		/* s390 guarded storage broadcast control block */
+#define NT_S390_RI_CB	0x30d		/* s390 runtime instrumentation */
 #define NT_ARM_VFP	0x400		/* ARM VFP/NEON registers */
 #define NT_ARM_TLS	0x401		/* ARM TLS register */
 #define NT_ARM_HW_BREAK	0x402		/* ARM hardware breakpoint registers */
-- 
cgit v1.2.3


From 5af48b59f35cf712793badabe1a574a0d0ce3bd3 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Wed, 27 Sep 2017 16:12:44 +0300
Subject: net: bridge: add per-port group_fwd_mask with less restrictions

We need to be able to transparently forward most link-local frames via
tunnels (e.g. vxlan, qinq). Currently the bridge's group_fwd_mask has a
mask which restricts the forwarding of STP and LACP, but we need to be able
to forward these over tunnels and control that forwarding on a per-port
basis. Thus, add a new per-port group_fwd_mask option which only disallows
forwarding of MAC pause frames (they're always dropped anyway).
The patch does not change the current default situation - all of the others
are still restricted unless configured for forwarding.
We have successfully tested this patch with LACP and STP forwarding over
VxLAN and qinq tunnels.

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_link.h |  1 +
 net/bridge/br_input.c        |  1 +
 net/bridge/br_netlink.c      | 14 +++++++++++++-
 net/bridge/br_private.h      | 10 +++++++++-
 net/bridge/br_sysfs_if.c     | 18 ++++++++++++++++++
 5 files changed, 42 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 8d062c58d5cb..ea87bd708ee9 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -325,6 +325,7 @@ enum {
 	IFLA_BRPORT_MCAST_TO_UCAST,
 	IFLA_BRPORT_VLAN_TUNNEL,
 	IFLA_BRPORT_BCAST_FLOOD,
+	IFLA_BRPORT_GROUP_FWD_MASK,
 	__IFLA_BRPORT_MAX
 };
 #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1)
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 7637f58c1226..7cb613776b31 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -289,6 +289,7 @@ rx_handler_result_t br_handle_frame(struct sk_buff **pskb)
 		 *
 		 * Others reserved for future standardization
 		 */
+		fwd_mask |= p->group_fwd_mask;
 		switch (dest[5]) {
 		case 0x00:	/* Bridge Group Address */
 			/* If STP is turned off,
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index 3bc890716c89..dea88a255d26 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -152,6 +152,7 @@ static inline size_t br_port_info_size(void)
 #ifdef CONFIG_BRIDGE_IGMP_SNOOPING
 		+ nla_total_size(sizeof(u8))	/* IFLA_BRPORT_MULTICAST_ROUTER */
 #endif
+		+ nla_total_size(sizeof(u16))	/* IFLA_BRPORT_GROUP_FWD_MASK */
 		+ 0;
 }
 
@@ -208,7 +209,8 @@ static int br_port_fill_attrs(struct sk_buff *skb,
 		       p->topology_change_ack) ||
 	    nla_put_u8(skb, IFLA_BRPORT_CONFIG_PENDING, p->config_pending) ||
 	    nla_put_u8(skb, IFLA_BRPORT_VLAN_TUNNEL, !!(p->flags &
-							BR_VLAN_TUNNEL)))
+							BR_VLAN_TUNNEL)) ||
+	    nla_put_u16(skb, IFLA_BRPORT_GROUP_FWD_MASK, p->group_fwd_mask))
 		return -EMSGSIZE;
 
 	timerval = br_timer_value(&p->message_age_timer);
@@ -637,6 +639,7 @@ static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = {
 	[IFLA_BRPORT_MCAST_TO_UCAST] = { .type = NLA_U8 },
 	[IFLA_BRPORT_MCAST_FLOOD] = { .type = NLA_U8 },
 	[IFLA_BRPORT_BCAST_FLOOD] = { .type = NLA_U8 },
+	[IFLA_BRPORT_GROUP_FWD_MASK] = { .type = NLA_U16 },
 };
 
 /* Change the state of the port and notify spanning tree */
@@ -773,6 +776,15 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[])
 			return err;
 	}
 #endif
+
+	if (tb[IFLA_BRPORT_GROUP_FWD_MASK]) {
+		u16 fwd_mask = nla_get_u16(tb[IFLA_BRPORT_GROUP_FWD_MASK]);
+
+		if (fwd_mask & BR_GROUPFWD_MACPAUSE)
+			return -EINVAL;
+		p->group_fwd_mask = fwd_mask;
+	}
+
 	br_port_flags_change(p, old_flags ^ p->flags);
 	return 0;
 }
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index e870cfc85b14..020c709a017f 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -36,7 +36,14 @@
 /* Control of forwarding link local multicast */
 #define BR_GROUPFWD_DEFAULT	0
 /* Don't allow forwarding of control protocols like STP, MAC PAUSE and LACP */
-#define BR_GROUPFWD_RESTRICTED	0x0007u
+enum {
+	BR_GROUPFWD_STP		= BIT(0),
+	BR_GROUPFWD_MACPAUSE	= BIT(1),
+	BR_GROUPFWD_LACP	= BIT(2),
+};
+
+#define BR_GROUPFWD_RESTRICTED (BR_GROUPFWD_STP | BR_GROUPFWD_MACPAUSE | \
+				BR_GROUPFWD_LACP)
 /* The Nearest Customer Bridge Group Address, 01-80-C2-00-00-[00,0B,0C,0D,0F] */
 #define BR_GROUPFWD_8021AD	0xB801u
 
@@ -268,6 +275,7 @@ struct net_bridge_port {
 #ifdef CONFIG_NET_SWITCHDEV
 	int				offload_fwd_mark;
 #endif
+	u16				group_fwd_mask;
 };
 
 #define br_auto_port(p) ((p)->flags & BR_AUTO_MASK)
diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c
index 5d5d413a6cf8..9110d5e56085 100644
--- a/net/bridge/br_sysfs_if.c
+++ b/net/bridge/br_sysfs_if.c
@@ -165,6 +165,23 @@ static int store_flush(struct net_bridge_port *p, unsigned long v)
 }
 static BRPORT_ATTR(flush, S_IWUSR, NULL, store_flush);
 
+static ssize_t show_group_fwd_mask(struct net_bridge_port *p, char *buf)
+{
+	return sprintf(buf, "%#x\n", p->group_fwd_mask);
+}
+
+static int store_group_fwd_mask(struct net_bridge_port *p,
+				unsigned long v)
+{
+	if (v & BR_GROUPFWD_MACPAUSE)
+		return -EINVAL;
+	p->group_fwd_mask = v;
+
+	return 0;
+}
+static BRPORT_ATTR(group_fwd_mask, S_IRUGO | S_IWUSR, show_group_fwd_mask,
+		   store_group_fwd_mask);
+
 BRPORT_ATTR_FLAG(hairpin_mode, BR_HAIRPIN_MODE);
 BRPORT_ATTR_FLAG(bpdu_guard, BR_BPDU_GUARD);
 BRPORT_ATTR_FLAG(root_block, BR_ROOT_BLOCK);
@@ -223,6 +240,7 @@ static const struct brport_attribute *brport_attrs[] = {
 	&brport_attr_proxyarp_wifi,
 	&brport_attr_multicast_flood,
 	&brport_attr_broadcast_flood,
+	&brport_attr_group_fwd_mask,
 	NULL
 };
 
-- 
cgit v1.2.3


From cb4d2b3f03d8eed90be3a194e5b54b734ec4bbe9 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Wed, 27 Sep 2017 14:37:52 -0700
Subject: bpf: Add name, load_time, uid and map_ids to bpf_prog_info

The patch adds name and load_time to struct bpf_prog_aux.  They
are also exported to bpf_prog_info.

The bpf_prog's name is passed by userspace during BPF_PROG_LOAD.
The kernel only stores the first (BPF_OBJ_NAME_LEN - 1) bytes
and the name stored in the kernel is always \0 terminated.

The kernel will reject a name that contains characters other than
isalnum() and '_'.  It will also reject a name that is not null
terminated.

The existing 'user->uid' of the bpf_prog_aux is also exported to
the bpf_prog_info as created_by_uid.

The existing 'used_maps' of the bpf_prog_aux is exported to
the newly added members 'nr_map_ids' and 'map_ids' of
the bpf_prog_info.  On input, nr_map_ids tells how
big the userspace's map_ids buffer is.  On output,
nr_map_ids tells the exact used_map_cnt, and the kernel copies
only as many map ids as the userspace's map_ids buffer can hold.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h      |  2 ++
 include/uapi/linux/bpf.h |  8 ++++++++
 kernel/bpf/syscall.c     | 51 +++++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 60 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 2b672c50f160..33ccc474fb04 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -187,6 +187,8 @@ struct bpf_prog_aux {
 	struct bpf_map **used_maps;
 	struct bpf_prog *prog;
 	struct user_struct *user;
+	u64 load_time; /* ns since boottime */
+	u8 name[BPF_OBJ_NAME_LEN];
 	union {
 		struct work_struct work;
 		struct rcu_head	rcu;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index e43491ac4823..bd6348269bf5 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -175,6 +175,8 @@ enum bpf_attach_type {
 /* Specify numa node during map creation */
 #define BPF_F_NUMA_NODE		(1U << 2)
 
+#define BPF_OBJ_NAME_LEN 16U
+
 union bpf_attr {
 	struct { /* anonymous struct used by BPF_MAP_CREATE command */
 		__u32	map_type;	/* one of enum bpf_map_type */
@@ -210,6 +212,7 @@ union bpf_attr {
 		__aligned_u64	log_buf;	/* user supplied buffer */
 		__u32		kern_version;	/* checked when prog_type=kprobe */
 		__u32		prog_flags;
+		__u8		prog_name[BPF_OBJ_NAME_LEN];
 	};
 
 	struct { /* anonymous struct used by BPF_OBJ_* commands */
@@ -812,6 +815,11 @@ struct bpf_prog_info {
 	__u32 xlated_prog_len;
 	__aligned_u64 jited_prog_insns;
 	__aligned_u64 xlated_prog_insns;
+	__u64 load_time;	/* ns since boottime */
+	__u32 created_by_uid;
+	__u32 nr_map_ids;
+	__aligned_u64 map_ids;
+	__u8  name[BPF_OBJ_NAME_LEN];
 } __attribute__((aligned(8)));
 
 struct bpf_map_info {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 25d074920a00..45970df3f820 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -23,6 +23,9 @@
 #include <linux/version.h>
 #include <linux/kernel.h>
 #include <linux/idr.h>
+#include <linux/cred.h>
+#include <linux/timekeeping.h>
+#include <linux/ctype.h>
 
 #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \
 			   (map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
@@ -312,6 +315,30 @@ int bpf_map_new_fd(struct bpf_map *map)
 		   offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
 		   sizeof(attr->CMD##_LAST_FIELD)) != NULL
 
+/* dst and src must have at least BPF_OBJ_NAME_LEN number of bytes.
+ * Return 0 on success and < 0 on error.
+ */
+static int bpf_obj_name_cpy(char *dst, const char *src)
+{
+	const char *end = src + BPF_OBJ_NAME_LEN;
+
+	/* Copy all isalnum() and '_' char */
+	while (src < end && *src) {
+		if (!isalnum(*src) && *src != '_')
+			return -EINVAL;
+		*dst++ = *src++;
+	}
+
+	/* No '\0' found in BPF_OBJ_NAME_LEN number of bytes */
+	if (src == end)
+		return -EINVAL;
+
+	/* '\0' terminates dst */
+	*dst = 0;
+
+	return 0;
+}
+
 #define BPF_MAP_CREATE_LAST_FIELD numa_node
 /* called via syscall */
 static int map_create(union bpf_attr *attr)
@@ -973,7 +1000,7 @@ struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type)
 EXPORT_SYMBOL_GPL(bpf_prog_get_type);
 
 /* last field in 'union bpf_attr' used by this command */
-#define	BPF_PROG_LOAD_LAST_FIELD prog_flags
+#define	BPF_PROG_LOAD_LAST_FIELD prog_name
 
 static int bpf_prog_load(union bpf_attr *attr)
 {
@@ -1037,6 +1064,11 @@ static int bpf_prog_load(union bpf_attr *attr)
 	if (err < 0)
 		goto free_prog;
 
+	prog->aux->load_time = ktime_get_boot_ns();
+	err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name);
+	if (err)
+		goto free_prog;
+
 	/* run eBPF verifier */
 	err = bpf_check(&prog, attr);
 	if (err < 0)
@@ -1358,8 +1390,25 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 
 	info.type = prog->type;
 	info.id = prog->aux->id;
+	info.load_time = prog->aux->load_time;
+	info.created_by_uid = from_kuid_munged(current_user_ns(),
+					       prog->aux->user->uid);
 
 	memcpy(info.tag, prog->tag, sizeof(prog->tag));
+	memcpy(info.name, prog->aux->name, sizeof(prog->aux->name));
+
+	ulen = info.nr_map_ids;
+	info.nr_map_ids = prog->aux->used_map_cnt;
+	ulen = min_t(u32, info.nr_map_ids, ulen);
+	if (ulen) {
+		u32 *user_map_ids = (u32 *)info.map_ids;
+		u32 i;
+
+		for (i = 0; i < ulen; i++)
+			if (put_user(prog->aux->used_maps[i]->id,
+				     &user_map_ids[i]))
+				return -EFAULT;
+	}
 
 	if (!capable(CAP_SYS_ADMIN)) {
 		info.jited_prog_len = 0;
-- 
cgit v1.2.3


From ad5b177bd73f5107d97c36f56395c4281fb6f089 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Wed, 27 Sep 2017 14:37:53 -0700
Subject: bpf: Add map_name to bpf_map_info

This patch allows userspace to specify a name for a map
during BPF_MAP_CREATE.

The map's name can later be exported to user space
via BPF_OBJ_GET_INFO_BY_FD.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h      | 1 +
 include/uapi/linux/bpf.h | 2 ++
 kernel/bpf/syscall.c     | 7 ++++++-
 3 files changed, 9 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 33ccc474fb04..252f4bc9eb25 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -56,6 +56,7 @@ struct bpf_map {
 	struct work_struct work;
 	atomic_t usercnt;
 	struct bpf_map *inner_map_meta;
+	u8 name[BPF_OBJ_NAME_LEN];
 };
 
 /* function argument constraints */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index bd6348269bf5..6d2137b4cf38 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -190,6 +190,7 @@ union bpf_attr {
 		__u32	numa_node;	/* numa node (effective only if
 					 * BPF_F_NUMA_NODE is set).
 					 */
+		__u8	map_name[BPF_OBJ_NAME_LEN];
 	};
 
 	struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
@@ -829,6 +830,7 @@ struct bpf_map_info {
 	__u32 value_size;
 	__u32 max_entries;
 	__u32 map_flags;
+	__u8  name[BPF_OBJ_NAME_LEN];
 } __attribute__((aligned(8)));
 
 /* User bpf_sock_ops struct to access socket values and specify request ops
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 45970df3f820..11a7f82a55d1 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -339,7 +339,7 @@ static int bpf_obj_name_cpy(char *dst, const char *src)
 	return 0;
 }
 
-#define BPF_MAP_CREATE_LAST_FIELD numa_node
+#define BPF_MAP_CREATE_LAST_FIELD map_name
 /* called via syscall */
 static int map_create(union bpf_attr *attr)
 {
@@ -361,6 +361,10 @@ static int map_create(union bpf_attr *attr)
 	if (IS_ERR(map))
 		return PTR_ERR(map);
 
+	err = bpf_obj_name_cpy(map->name, attr->map_name);
+	if (err)
+		goto free_map_nouncharge;
+
 	atomic_set(&map->refcnt, 1);
 	atomic_set(&map->usercnt, 1);
 
@@ -1462,6 +1466,7 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map,
 	info.value_size = map->value_size;
 	info.max_entries = map->max_entries;
 	info.map_flags = map->map_flags;
+	memcpy(info.name, map->name, sizeof(map->name));
 
 	if (copy_to_user(uinfo, &info, info_len) ||
 	    put_user(info_len, &uattr->info.info_len))
-- 
cgit v1.2.3


From 84e14fe353de7624872e582887712079ba0b2d56 Mon Sep 17 00:00:00 2001
From: Maciej Żenczykowski <maze@google.com>
Date: Tue, 26 Sep 2017 21:32:42 -0700
Subject: net-ipv6: add support for sockopt(SOL_IPV6, IPV6_FREEBIND)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

So far we've been relying on sockopt(SOL_IP, IP_FREEBIND) being usable
even on IPv6 sockets.

However, it turns out it is perfectly reasonable to want to set freebind
on an AF_INET6 SOCK_RAW socket - but there is no way to set any SOL_IP
socket option on such a socket (they're all blindly errored out).

One use case for this is to allow spoofing src ip on a raw socket
via sendmsg cmsg.

Tested:
  built, and booted
  # python
  >>> import socket
  >>> SOL_IP = socket.SOL_IP
  >>> SOL_IPV6 = socket.IPPROTO_IPV6
  >>> IP_FREEBIND = 15
  >>> IPV6_FREEBIND = 78
  >>> s = socket.socket(socket.AF_INET6, socket.SOCK_DGRAM, 0)
  >>> s.getsockopt(SOL_IP, IP_FREEBIND)
  0
  >>> s.getsockopt(SOL_IPV6, IPV6_FREEBIND)
  0
  >>> s.setsockopt(SOL_IPV6, IPV6_FREEBIND, 1)
  >>> s.getsockopt(SOL_IP, IP_FREEBIND)
  1
  >>> s.getsockopt(SOL_IPV6, IPV6_FREEBIND)
  1

Signed-off-by: Maciej Żenczykowski <maze@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/in6.h |  1 +
 net/ipv6/ipv6_sockglue.c | 12 ++++++++++++
 2 files changed, 13 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/in6.h b/include/uapi/linux/in6.h
index 46444f8fbee4..4f8f3eb0699f 100644
--- a/include/uapi/linux/in6.h
+++ b/include/uapi/linux/in6.h
@@ -284,6 +284,7 @@ struct in6_flowlabel_req {
 #define IPV6_TRANSPARENT        75
 #define IPV6_UNICAST_IF         76
 #define IPV6_RECVFRAGSIZE	77
+#define IPV6_FREEBIND		78
 
 /*
  * Multicast Routing:
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index a5e466d4e093..b9404feabd78 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -377,6 +377,14 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
 		retv = 0;
 		break;
 
+	case IPV6_FREEBIND:
+		if (optlen < sizeof(int))
+			goto e_inval;
+		/* we also don't have a separate freebind bit for IPV6 */
+		inet_sk(sk)->freebind = valbool;
+		retv = 0;
+		break;
+
 	case IPV6_RECVORIGDSTADDR:
 		if (optlen < sizeof(int))
 			goto e_inval;
@@ -1214,6 +1222,10 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
 		val = inet_sk(sk)->transparent;
 		break;
 
+	case IPV6_FREEBIND:
+		val = inet_sk(sk)->freebind;
+		break;
+
 	case IPV6_RECVORIGDSTADDR:
 		val = np->rxopt.bits.rxorigdstaddr;
 		break;
-- 
cgit v1.2.3


From 503c1fb98ba3859c13863957c7c65c92371a9e50 Mon Sep 17 00:00:00 2001
From: Avraham Stern <avraham.stern@intel.com>
Date: Fri, 29 Sep 2017 14:21:49 +0200
Subject: cfg80211/nl80211: add a port authorized event

Add an event that indicates that a connection is authorized
(i.e. the 4 way handshake was performed by the driver). This event
should be sent by the driver after sending a connect/roamed event.

This is useful for networks that require 802.1X authentication.
In cases where the driver supports 4 way handshake offload, but the
802.1X authentication is managed by user space, the driver needs to
inform user space right after the 802.11 association was completed
so user space can initialize its 802.1X state machine etc.
However, it is also possible that the AP will choose to skip the
802.1X authentication (e.g. when PMKSA caching is used) and proceed
with the 4 way handshake immediately. In this case the driver needs
to inform user space that 802.1X authentication is no longer required
(e.g. to prevent user space from disconnecting since it did not get
any EAPOLs from the AP).

This is also useful for roaming, in which case it is possible that
the driver used the Fast Transition protocol so 802.1X is not
required.

Since there will now be a dedicated notification indicating that the
connection is authorized, the authorized flag can be removed from the
roamed event. Drivers can send the new port authorized event right
after sending the roamed event to indicate the new AP is already
authorized. This therefore reserves the old PORT_AUTHORIZED attribute.

Signed-off-by: Avraham Stern <avraham.stern@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       | 21 +++++++++++++++++----
 include/uapi/linux/nl80211.h | 28 ++++++++++++++++-----------
 net/wireless/core.h          |  5 +++++
 net/wireless/nl80211.c       | 34 ++++++++++++++++++++++++++++++---
 net/wireless/nl80211.h       |  2 ++
 net/wireless/sme.c           | 45 +++++++++++++++++++++++++++++++++++++++++++-
 net/wireless/util.c          |  3 +++
 7 files changed, 119 insertions(+), 19 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index cc1996081463..8b8118a7fadb 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -5428,9 +5428,6 @@ cfg80211_connect_timeout(struct net_device *dev, const u8 *bssid,
  * @req_ie_len: association request IEs length
  * @resp_ie: association response IEs (may be %NULL)
  * @resp_ie_len: assoc response IEs length
- * @authorized: true if the 802.1X authentication was done by the driver or is
- *	not needed (e.g., when Fast Transition protocol was used), false
- *	otherwise. Ignored for networks that don't use 802.1X authentication.
  */
 struct cfg80211_roam_info {
 	struct ieee80211_channel *channel;
@@ -5440,7 +5437,6 @@ struct cfg80211_roam_info {
 	size_t req_ie_len;
 	const u8 *resp_ie;
 	size_t resp_ie_len;
-	bool authorized;
 };
 
 /**
@@ -5464,6 +5460,23 @@ struct cfg80211_roam_info {
 void cfg80211_roamed(struct net_device *dev, struct cfg80211_roam_info *info,
 		     gfp_t gfp);
 
+/**
+ * cfg80211_port_authorized - notify cfg80211 of successful security association
+ *
+ * @dev: network device
+ * @bssid: the BSSID of the AP
+ * @gfp: allocation flags
+ *
+ * This function should be called by a driver that supports 4 way handshake
+ * offload after a security association was successfully established (i.e.,
+ * the 4 way handshake was completed successfully). The call to this function
+ * should be preceded with a call to cfg80211_connect_result(),
+ * cfg80211_connect_done(), cfg80211_connect_bss() or cfg80211_roamed() to
+ * indicate the 802.11 association.
+ */
+void cfg80211_port_authorized(struct net_device *dev, const u8 *bssid,
+			      gfp_t gfp);
+
 /**
  * cfg80211_disconnected - notify cfg80211 that connection was dropped
  *
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 59ba6ca66a0d..95832ce03a44 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -569,13 +569,14 @@
  *	authentication/association or not receiving a response from the AP.
  *	Non-zero %NL80211_ATTR_STATUS_CODE value is indicated in that case as
  *	well to remain backwards compatible.
- * @NL80211_CMD_ROAM: notifcation indicating the card/driver roamed by itself.
- *	When the driver roamed in a network that requires 802.1X authentication,
- *	%NL80211_ATTR_PORT_AUTHORIZED should be set if the 802.1X authentication
- *	was done by the driver or if roaming was done using Fast Transition
- *	protocol (in which case 802.1X authentication is not needed). If
- *	%NL80211_ATTR_PORT_AUTHORIZED is not set, user space is responsible for
- *	the 802.1X authentication.
+ *	When establishing a security association, drivers that support 4 way
+ *	handshake offload should send %NL80211_CMD_PORT_AUTHORIZED event when
+ *	the 4 way handshake is completed successfully.
+ * @NL80211_CMD_ROAM: Notification indicating the card/driver roamed by itself.
+ *	When a security association was established with the new AP (e.g. if
+ *	the FT protocol was used for roaming or the driver completed the 4 way
+ *	handshake), this event should be followed by an
+ *	%NL80211_CMD_PORT_AUTHORIZED event.
  * @NL80211_CMD_DISCONNECT: drop a given connection; also used to notify
  *	userspace that a connection was dropped by the AP or due to other
  *	reasons, for this the %NL80211_ATTR_DISCONNECTED_BY_AP and
@@ -982,6 +983,12 @@
  * @NL80211_CMD_DEL_PMK: For offloaded 4-Way handshake, delete the previously
  *	configured PMK for the authenticator address identified by
  *	&NL80211_ATTR_MAC.
+ * @NL80211_CMD_PORT_AUTHORIZED: An event that indicates that the 4 way
+ *	handshake was completed successfully by the driver. The BSSID is
+ *	specified with &NL80211_ATTR_MAC. Drivers that support 4 way handshake
+ *	offload should send this event after indicating 802.11 association with
+ *	&NL80211_CMD_CONNECT or &NL80211_CMD_ROAM. If the 4 way handshake failed
+ *	&NL80211_CMD_DISCONNECT should be indicated instead.
  *
  * @NL80211_CMD_MAX: highest used command number
  * @__NL80211_CMD_AFTER_LAST: internal use
@@ -1185,6 +1192,8 @@ enum nl80211_commands {
 	NL80211_CMD_SET_PMK,
 	NL80211_CMD_DEL_PMK,
 
+	NL80211_CMD_PORT_AUTHORIZED,
+
 	/* add new commands above here */
 
 	/* used to define NL80211_CMD_MAX below */
@@ -2138,10 +2147,7 @@ enum nl80211_commands {
  *	in %NL80211_CMD_CONNECT to indicate that for 802.1X authentication it
  *	wants to use the supported offload of the 4-way handshake.
  * @NL80211_ATTR_PMKR0_NAME: PMK-R0 Name for offloaded FT.
- * @NL80211_ATTR_PORT_AUTHORIZED: flag attribute used in %NL80211_CMD_ROAMED
- *	notification indicating that that 802.1X authentication was done by
- *	the driver or is not needed (because roaming used the Fast Transition
- *	protocol).
+ * @NL80211_ATTR_PORT_AUTHORIZED: (reserved)
  *
  * @NUM_NL80211_ATTR: total number of nl80211_attrs available
  * @NL80211_ATTR_MAX: highest attribute number currently defined
diff --git a/net/wireless/core.h b/net/wireless/core.h
index 6e809325af3b..35165f42c2a8 100644
--- a/net/wireless/core.h
+++ b/net/wireless/core.h
@@ -216,6 +216,7 @@ enum cfg80211_event_type {
 	EVENT_DISCONNECTED,
 	EVENT_IBSS_JOINED,
 	EVENT_STOPPED,
+	EVENT_PORT_AUTHORIZED,
 };
 
 struct cfg80211_event {
@@ -235,6 +236,9 @@ struct cfg80211_event {
 			u8 bssid[ETH_ALEN];
 			struct ieee80211_channel *channel;
 		} ij;
+		struct {
+			u8 bssid[ETH_ALEN];
+		} pa;
 	};
 };
 
@@ -385,6 +389,7 @@ int cfg80211_disconnect(struct cfg80211_registered_device *rdev,
 			bool wextev);
 void __cfg80211_roamed(struct wireless_dev *wdev,
 		       struct cfg80211_roam_info *info);
+void __cfg80211_port_authorized(struct wireless_dev *wdev, const u8 *bssid);
 int cfg80211_mgd_wext_connect(struct cfg80211_registered_device *rdev,
 			      struct wireless_dev *wdev);
 void cfg80211_autodisconnect_wk(struct work_struct *work);
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 1e39ba3cfd06..90e212db6889 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -13830,9 +13830,7 @@ void nl80211_send_roamed(struct cfg80211_registered_device *rdev,
 		     info->req_ie)) ||
 	    (info->resp_ie &&
 	     nla_put(msg, NL80211_ATTR_RESP_IE, info->resp_ie_len,
-		     info->resp_ie)) ||
-	    (info->authorized &&
-	     nla_put_flag(msg, NL80211_ATTR_PORT_AUTHORIZED)))
+		     info->resp_ie)))
 		goto nla_put_failure;
 
 	genlmsg_end(msg, hdr);
@@ -13846,6 +13844,36 @@ void nl80211_send_roamed(struct cfg80211_registered_device *rdev,
 	nlmsg_free(msg);
 }
 
+void nl80211_send_port_authorized(struct cfg80211_registered_device *rdev,
+				  struct net_device *netdev, const u8 *bssid)
+{
+	struct sk_buff *msg;
+	void *hdr;
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return;
+
+	hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_PORT_AUTHORIZED);
+	if (!hdr) {
+		nlmsg_free(msg);
+		return;
+	}
+
+	if (nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, bssid))
+		goto nla_put_failure;
+
+	genlmsg_end(msg, hdr);
+
+	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
+				NL80211_MCGRP_MLME, GFP_KERNEL);
+	return;
+
+ nla_put_failure:
+	genlmsg_cancel(msg, hdr);
+	nlmsg_free(msg);
+}
+
 void nl80211_send_disconnected(struct cfg80211_registered_device *rdev,
 			       struct net_device *netdev, u16 reason,
 			       const u8 *ie, size_t ie_len, bool from_ap)
diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h
index b96933322077..bf9e772a30b9 100644
--- a/net/wireless/nl80211.h
+++ b/net/wireless/nl80211.h
@@ -58,6 +58,8 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev,
 void nl80211_send_roamed(struct cfg80211_registered_device *rdev,
 			 struct net_device *netdev,
 			 struct cfg80211_roam_info *info, gfp_t gfp);
+void nl80211_send_port_authorized(struct cfg80211_registered_device *rdev,
+				  struct net_device *netdev, const u8 *bssid);
 void nl80211_send_disconnected(struct cfg80211_registered_device *rdev,
 			       struct net_device *netdev, u16 reason,
 			       const u8 *ie, size_t ie_len, bool from_ap);
diff --git a/net/wireless/sme.c b/net/wireless/sme.c
index 0a49b88070d0..f38ed490e42b 100644
--- a/net/wireless/sme.c
+++ b/net/wireless/sme.c
@@ -960,7 +960,6 @@ void cfg80211_roamed(struct net_device *dev, struct cfg80211_roam_info *info,
 	ev->rm.resp_ie_len = info->resp_ie_len;
 	memcpy((void *)ev->rm.resp_ie, info->resp_ie, info->resp_ie_len);
 	ev->rm.bss = info->bss;
-	ev->rm.authorized = info->authorized;
 
 	spin_lock_irqsave(&wdev->event_lock, flags);
 	list_add_tail(&ev->list, &wdev->event_list);
@@ -969,6 +968,50 @@ void cfg80211_roamed(struct net_device *dev, struct cfg80211_roam_info *info,
 }
 EXPORT_SYMBOL(cfg80211_roamed);
 
+void __cfg80211_port_authorized(struct wireless_dev *wdev, const u8 *bssid)
+{
+	ASSERT_WDEV_LOCK(wdev);
+
+	if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION))
+		return;
+
+	if (WARN_ON(!wdev->current_bss) ||
+	    WARN_ON(!ether_addr_equal(wdev->current_bss->pub.bssid, bssid)))
+		return;
+
+	nl80211_send_port_authorized(wiphy_to_rdev(wdev->wiphy), wdev->netdev,
+				     bssid);
+}
+
+void cfg80211_port_authorized(struct net_device *dev, const u8 *bssid,
+			      gfp_t gfp)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
+	struct cfg80211_event *ev;
+	unsigned long flags;
+
+	if (WARN_ON(!bssid))
+		return;
+
+	ev = kzalloc(sizeof(*ev), gfp);
+	if (!ev)
+		return;
+
+	ev->type = EVENT_PORT_AUTHORIZED;
+	memcpy(ev->pa.bssid, bssid, ETH_ALEN);
+
+	/*
+	 * Use the wdev event list so that if there are pending
+	 * connected/roamed events, they will be reported first.
+	 */
+	spin_lock_irqsave(&wdev->event_lock, flags);
+	list_add_tail(&ev->list, &wdev->event_list);
+	spin_unlock_irqrestore(&wdev->event_lock, flags);
+	queue_work(cfg80211_wq, &rdev->event_work);
+}
+EXPORT_SYMBOL(cfg80211_port_authorized);
+
 void __cfg80211_disconnected(struct net_device *dev, const u8 *ie,
 			     size_t ie_len, u16 reason, bool from_ap)
 {
diff --git a/net/wireless/util.c b/net/wireless/util.c
index 7a1fcc6ee060..ff21c314a609 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -846,6 +846,9 @@ void cfg80211_process_wdev_events(struct wireless_dev *wdev)
 		case EVENT_STOPPED:
 			__cfg80211_leave(wiphy_to_rdev(wdev->wiphy), wdev);
 			break;
+		case EVENT_PORT_AUTHORIZED:
+			__cfg80211_port_authorized(wdev, ev->pa.bssid);
+			break;
 		}
 		wdev_unlock(wdev);
 
-- 
cgit v1.2.3


From 5bbbbe32a43199c2b9ea5ea66fab6241c64beb51 Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Tue, 3 Oct 2017 19:20:13 -0300
Subject: sctp: introduce stream scheduler foundations

This patch introduces the hooks necessary to do stream scheduling, as
per RFC Draft ndata.  It also introduces the first scheduler, which is
what we do today but now factored out: first come first served (FCFS).

With stream scheduling we now have to track which chunk was enqueued on
which stream and be able to select a chunk other than the one at the
front of the main outqueue. So we introduce a list in the
sctp_stream_out_ext structure for this purpose.

We reuse sctp_chunk->transmitted_list space for the list above, as the
chunk cannot belong to the two lists at the same time. By using the
union in there, we can have distinct names for these moments.

sctp_sched_ops are the operations expected to be implemented by each
scheduler. The dequeueing is a bit particular to this implementation but
it is to match how we dequeue packets today. We first dequeue and then
check if it fits the packet and if not, we requeue it at head. Thus why
we don't have a peek operation but have dequeue_done instead, which is
called once the chunk can be safely considered as transmitted.

The check removed from sctp_outq_flush is now performed by
sctp_stream_outq_migrate, which is only called during assoc setup.
(sctp_sendmsg() also checks for it)

The only operation that is foreseen but not yet added here is a way to
signal that a new packet is starting or that the packet is done, for a
per-packet round-robin scheduler; it is intentionally left to the
patch that actually implements it.

Support for I-DATA chunks, also described in this RFC, with user message
interleaving is straightforward as it just requires the schedulers to
probe for the feature and ignore datamsg boundaries when dequeueing.

See-also: https://tools.ietf.org/html/draft-ietf-tsvwg-sctp-ndata-13
Tested-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/stream_sched.h |  72 +++++++++++
 include/net/sctp/structs.h      |  15 ++-
 include/uapi/linux/sctp.h       |   6 +
 net/sctp/Makefile               |   2 +-
 net/sctp/outqueue.c             |  59 +++++----
 net/sctp/sm_sideeffect.c        |   3 +
 net/sctp/stream.c               |  88 +++++++++++--
 net/sctp/stream_sched.c         | 270 ++++++++++++++++++++++++++++++++++++++++
 8 files changed, 477 insertions(+), 38 deletions(-)
 create mode 100644 include/net/sctp/stream_sched.h
 create mode 100644 net/sctp/stream_sched.c

(limited to 'include/uapi/linux')

diff --git a/include/net/sctp/stream_sched.h b/include/net/sctp/stream_sched.h
new file mode 100644
index 000000000000..c676550a4c7d
--- /dev/null
+++ b/include/net/sctp/stream_sched.h
@@ -0,0 +1,72 @@
+/* SCTP kernel implementation
+ * (C) Copyright Red Hat Inc. 2017
+ *
+ * These are definitions used by the stream schedulers, defined in RFC
+ * draft ndata (https://tools.ietf.org/html/draft-ietf-tsvwg-sctp-ndata-11)
+ *
+ * This SCTP implementation is free software;
+ * you can redistribute it and/or modify it under the terms of
+ * the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This SCTP implementation  is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ *                 ************************
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU CC; see the file COPYING.  If not, see
+ * <http://www.gnu.org/licenses/>.
+ *
+ * Please send any bug reports or fixes you make to the
+ * email addresses:
+ *    lksctp developers <linux-sctp@vger.kernel.org>
+ *
+ * Written or modified by:
+ *   Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
+ */
+
+#ifndef __sctp_stream_sched_h__
+#define __sctp_stream_sched_h__
+
+struct sctp_sched_ops {
+	/* Property handling for a given stream */
+	int (*set)(struct sctp_stream *stream, __u16 sid, __u16 value,
+		   gfp_t gfp);
+	int (*get)(struct sctp_stream *stream, __u16 sid, __u16 *value);
+
+	/* Init the specific scheduler */
+	int (*init)(struct sctp_stream *stream);
+	/* Init a stream */
+	int (*init_sid)(struct sctp_stream *stream, __u16 sid, gfp_t gfp);
+	/* Frees the entire thing */
+	void (*free)(struct sctp_stream *stream);
+
+	/* Enqueue a chunk */
+	void (*enqueue)(struct sctp_outq *q, struct sctp_datamsg *msg);
+	/* Dequeue a chunk */
+	struct sctp_chunk *(*dequeue)(struct sctp_outq *q);
+	/* Called only if the chunk fit the packet */
+	void (*dequeue_done)(struct sctp_outq *q, struct sctp_chunk *chunk);
+	/* Sched all chunks already enqueued */
+	void (*sched_all)(struct sctp_stream *steam);
+	/* Unsched all chunks already enqueued */
+	void (*unsched_all)(struct sctp_stream *steam);
+};
+
+int sctp_sched_set_sched(struct sctp_association *asoc,
+			 enum sctp_sched_type sched);
+int sctp_sched_get_sched(struct sctp_association *asoc);
+int sctp_sched_set_value(struct sctp_association *asoc, __u16 sid,
+			 __u16 value, gfp_t gfp);
+int sctp_sched_get_value(struct sctp_association *asoc, __u16 sid,
+			 __u16 *value);
+void sctp_sched_dequeue_done(struct sctp_outq *q, struct sctp_chunk *ch);
+
+void sctp_sched_dequeue_common(struct sctp_outq *q, struct sctp_chunk *ch);
+int sctp_sched_init_sid(struct sctp_stream *stream, __u16 sid, gfp_t gfp);
+struct sctp_sched_ops *sctp_sched_ops_from_stream(struct sctp_stream *stream);
+
+#endif /* __sctp_stream_sched_h__ */
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index c48f7999fe9b..3c22a30fd71b 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -84,7 +84,6 @@ struct sctp_ulpq;
 struct sctp_ep_common;
 struct crypto_shash;
 struct sctp_stream;
-struct sctp_stream_out;
 
 
 #include <net/sctp/tsnmap.h>
@@ -531,8 +530,12 @@ struct sctp_chunk {
 	/* How many times this chunk have been sent, for prsctp RTX policy */
 	int sent_count;
 
-	/* This is our link to the per-transport transmitted list.  */
-	struct list_head transmitted_list;
+	union {
+		/* This is our link to the per-transport transmitted list.  */
+		struct list_head transmitted_list;
+		/* List in specific stream outq */
+		struct list_head stream_list;
+	};
 
 	/* This field is used by chunks that hold fragmented data.
 	 * For the first fragment this is the list that holds the rest of
@@ -1019,6 +1022,9 @@ struct sctp_outq {
 	/* Data pending that has never been transmitted.  */
 	struct list_head out_chunk_list;
 
+	/* Stream scheduler being used */
+	struct sctp_sched_ops *sched;
+
 	unsigned int out_qlen;	/* Total length of queued data chunks. */
 
 	/* Error of send failed, may used in SCTP_SEND_FAILED event. */
@@ -1325,6 +1331,7 @@ struct sctp_inithdr_host {
 struct sctp_stream_out_ext {
 	__u64 abandoned_unsent[SCTP_PR_INDEX(MAX) + 1];
 	__u64 abandoned_sent[SCTP_PR_INDEX(MAX) + 1];
+	struct list_head outq; /* chunks enqueued by this stream */
 };
 
 struct sctp_stream_out {
@@ -1342,6 +1349,8 @@ struct sctp_stream {
 	struct sctp_stream_in *in;
 	__u16 outcnt;
 	__u16 incnt;
+	/* Current stream being sent, if any */
+	struct sctp_stream_out *out_curr;
 };
 
 #define SCTP_STREAM_CLOSED		0x00
diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h
index 6217ff8500a1..4487e7625ddb 100644
--- a/include/uapi/linux/sctp.h
+++ b/include/uapi/linux/sctp.h
@@ -1088,4 +1088,10 @@ struct sctp_add_streams {
 	uint16_t sas_outstrms;
 };
 
+/* SCTP Stream schedulers */
+enum sctp_sched_type {
+	SCTP_SS_FCFS,
+	SCTP_SS_MAX = SCTP_SS_FCFS
+};
+
 #endif /* _UAPI_SCTP_H */
diff --git a/net/sctp/Makefile b/net/sctp/Makefile
index 70f1b570bab9..0f6e6d1d69fd 100644
--- a/net/sctp/Makefile
+++ b/net/sctp/Makefile
@@ -12,7 +12,7 @@ sctp-y := sm_statetable.o sm_statefuns.o sm_sideeffect.o \
 	  inqueue.o outqueue.o ulpqueue.o \
 	  tsnmap.o bind_addr.o socket.o primitive.o \
 	  output.o input.o debug.o stream.o auth.o \
-	  offload.o
+	  offload.o stream_sched.o
 
 sctp_probe-y := probe.o
 
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index 746b07b7937d..4db012aa25f7 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -50,6 +50,7 @@
 
 #include <net/sctp/sctp.h>
 #include <net/sctp/sm.h>
+#include <net/sctp/stream_sched.h>
 
 /* Declare internal functions here.  */
 static int sctp_acked(struct sctp_sackhdr *sack, __u32 tsn);
@@ -72,32 +73,38 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp);
 
 /* Add data to the front of the queue. */
 static inline void sctp_outq_head_data(struct sctp_outq *q,
-					struct sctp_chunk *ch)
+				       struct sctp_chunk *ch)
 {
+	struct sctp_stream_out_ext *oute;
+	__u16 stream;
+
 	list_add(&ch->list, &q->out_chunk_list);
 	q->out_qlen += ch->skb->len;
+
+	stream = sctp_chunk_stream_no(ch);
+	oute = q->asoc->stream.out[stream].ext;
+	list_add(&ch->stream_list, &oute->outq);
 }
 
 /* Take data from the front of the queue. */
 static inline struct sctp_chunk *sctp_outq_dequeue_data(struct sctp_outq *q)
 {
-	struct sctp_chunk *ch = NULL;
-
-	if (!list_empty(&q->out_chunk_list)) {
-		struct list_head *entry = q->out_chunk_list.next;
-
-		ch = list_entry(entry, struct sctp_chunk, list);
-		list_del_init(entry);
-		q->out_qlen -= ch->skb->len;
-	}
-	return ch;
+	return q->sched->dequeue(q);
 }
+
 /* Add data chunk to the end of the queue. */
 static inline void sctp_outq_tail_data(struct sctp_outq *q,
 				       struct sctp_chunk *ch)
 {
+	struct sctp_stream_out_ext *oute;
+	__u16 stream;
+
 	list_add_tail(&ch->list, &q->out_chunk_list);
 	q->out_qlen += ch->skb->len;
+
+	stream = sctp_chunk_stream_no(ch);
+	oute = q->asoc->stream.out[stream].ext;
+	list_add_tail(&ch->stream_list, &oute->outq);
 }
 
 /*
@@ -207,6 +214,7 @@ void sctp_outq_init(struct sctp_association *asoc, struct sctp_outq *q)
 	INIT_LIST_HEAD(&q->retransmit);
 	INIT_LIST_HEAD(&q->sacked);
 	INIT_LIST_HEAD(&q->abandoned);
+	sctp_sched_set_sched(asoc, SCTP_SS_FCFS);
 }
 
 /* Free the outqueue structure and any related pending chunks.
@@ -258,6 +266,7 @@ static void __sctp_outq_teardown(struct sctp_outq *q)
 
 	/* Throw away any leftover data chunks. */
 	while ((chunk = sctp_outq_dequeue_data(q)) != NULL) {
+		sctp_sched_dequeue_done(q, chunk);
 
 		/* Mark as send failure. */
 		sctp_chunk_fail(chunk, q->error);
@@ -391,13 +400,14 @@ static int sctp_prsctp_prune_unsent(struct sctp_association *asoc,
 	struct sctp_outq *q = &asoc->outqueue;
 	struct sctp_chunk *chk, *temp;
 
+	q->sched->unsched_all(&asoc->stream);
+
 	list_for_each_entry_safe(chk, temp, &q->out_chunk_list, list) {
 		if (!SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags) ||
 		    chk->sinfo.sinfo_timetolive <= sinfo->sinfo_timetolive)
 			continue;
 
-		list_del_init(&chk->list);
-		q->out_qlen -= chk->skb->len;
+		sctp_sched_dequeue_common(q, chk);
 		asoc->sent_cnt_removable--;
 		asoc->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++;
 		if (chk->sinfo.sinfo_stream < asoc->stream.outcnt) {
@@ -415,6 +425,8 @@ static int sctp_prsctp_prune_unsent(struct sctp_association *asoc,
 			break;
 	}
 
+	q->sched->sched_all(&asoc->stream);
+
 	return msg_len;
 }
 
@@ -1033,22 +1045,9 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
 		while ((chunk = sctp_outq_dequeue_data(q)) != NULL) {
 			__u32 sid = ntohs(chunk->subh.data_hdr->stream);
 
-			/* RFC 2960 6.5 Every DATA chunk MUST carry a valid
-			 * stream identifier.
-			 */
-			if (chunk->sinfo.sinfo_stream >= asoc->stream.outcnt) {
-
-				/* Mark as failed send. */
-				sctp_chunk_fail(chunk, SCTP_ERROR_INV_STRM);
-				if (asoc->peer.prsctp_capable &&
-				    SCTP_PR_PRIO_ENABLED(chunk->sinfo.sinfo_flags))
-					asoc->sent_cnt_removable--;
-				sctp_chunk_free(chunk);
-				continue;
-			}
-
 			/* Has this chunk expired? */
 			if (sctp_chunk_abandoned(chunk)) {
+				sctp_sched_dequeue_done(q, chunk);
 				sctp_chunk_fail(chunk, 0);
 				sctp_chunk_free(chunk);
 				continue;
@@ -1070,6 +1069,7 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
 				new_transport = asoc->peer.active_path;
 			if (new_transport->state == SCTP_UNCONFIRMED) {
 				WARN_ONCE(1, "Attempt to send packet on unconfirmed path.");
+				sctp_sched_dequeue_done(q, chunk);
 				sctp_chunk_fail(chunk, 0);
 				sctp_chunk_free(chunk);
 				continue;
@@ -1133,6 +1133,11 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
 				else
 					asoc->stats.oodchunks++;
 
+				/* Only now it's safe to consider this
+				 * chunk as sent, sched-wise.
+				 */
+				sctp_sched_dequeue_done(q, chunk);
+
 				break;
 
 			default:
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index e6a2974e020e..402bfbb888cd 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -50,6 +50,7 @@
 #include <net/sock.h>
 #include <net/sctp/sctp.h>
 #include <net/sctp/sm.h>
+#include <net/sctp/stream_sched.h>
 
 static int sctp_cmd_interpreter(enum sctp_event event_type,
 				union sctp_subtype subtype,
@@ -1089,6 +1090,8 @@ static void sctp_cmd_send_msg(struct sctp_association *asoc,
 
 	list_for_each_entry(chunk, &msg->chunks, frag_list)
 		sctp_outq_tail(&asoc->outqueue, chunk, gfp);
+
+	asoc->outqueue.sched->enqueue(&asoc->outqueue, msg);
 }
 
 
diff --git a/net/sctp/stream.c b/net/sctp/stream.c
index 055ca25bbc91..5ea33a2c453b 100644
--- a/net/sctp/stream.c
+++ b/net/sctp/stream.c
@@ -32,8 +32,61 @@
  *    Xin Long <lucien.xin@gmail.com>
  */
 
+#include <linux/list.h>
 #include <net/sctp/sctp.h>
 #include <net/sctp/sm.h>
+#include <net/sctp/stream_sched.h>
+
+/* Migrates chunks from stream queues to new stream queues if needed,
+ * but not across associations. Also, removes those chunks to streams
+ * higher than the new max.
+ */
+static void sctp_stream_outq_migrate(struct sctp_stream *stream,
+				     struct sctp_stream *new, __u16 outcnt)
+{
+	struct sctp_association *asoc;
+	struct sctp_chunk *ch, *temp;
+	struct sctp_outq *outq;
+	int i;
+
+	asoc = container_of(stream, struct sctp_association, stream);
+	outq = &asoc->outqueue;
+
+	list_for_each_entry_safe(ch, temp, &outq->out_chunk_list, list) {
+		__u16 sid = sctp_chunk_stream_no(ch);
+
+		if (sid < outcnt)
+			continue;
+
+		sctp_sched_dequeue_common(outq, ch);
+		/* No need to call dequeue_done here because
+		 * the chunks are not scheduled by now.
+		 */
+
+		/* Mark as failed send. */
+		sctp_chunk_fail(ch, SCTP_ERROR_INV_STRM);
+		if (asoc->peer.prsctp_capable &&
+		    SCTP_PR_PRIO_ENABLED(ch->sinfo.sinfo_flags))
+			asoc->sent_cnt_removable--;
+
+		sctp_chunk_free(ch);
+	}
+
+	if (new) {
+		/* Here we actually move the old ext stuff into the new
+		 * buffer, because we want to keep it. Then
+		 * sctp_stream_update will swap ->out pointers.
+		 */
+		for (i = 0; i < outcnt; i++) {
+			kfree(new->out[i].ext);
+			new->out[i].ext = stream->out[i].ext;
+			stream->out[i].ext = NULL;
+		}
+	}
+
+	for (i = outcnt; i < stream->outcnt; i++)
+		kfree(stream->out[i].ext);
+}
 
 static int sctp_stream_alloc_out(struct sctp_stream *stream, __u16 outcnt,
 				 gfp_t gfp)
@@ -87,7 +140,8 @@ static int sctp_stream_alloc_in(struct sctp_stream *stream, __u16 incnt,
 int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt,
 		     gfp_t gfp)
 {
-	int i;
+	struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream);
+	int i, ret = 0;
 
 	gfp |= __GFP_NOWARN;
 
@@ -97,6 +151,11 @@ int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt,
 	if (outcnt == stream->outcnt)
 		goto in;
 
+	/* Filter out chunks queued on streams that won't exist anymore */
+	sched->unsched_all(stream);
+	sctp_stream_outq_migrate(stream, NULL, outcnt);
+	sched->sched_all(stream);
+
 	i = sctp_stream_alloc_out(stream, outcnt, gfp);
 	if (i)
 		return i;
@@ -105,20 +164,27 @@ int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt,
 	for (i = 0; i < stream->outcnt; i++)
 		stream->out[i].state = SCTP_STREAM_OPEN;
 
+	sched->init(stream);
+
 in:
 	if (!incnt)
-		return 0;
+		goto out;
 
 	i = sctp_stream_alloc_in(stream, incnt, gfp);
 	if (i) {
-		kfree(stream->out);
-		stream->out = NULL;
-		return -ENOMEM;
+		ret = -ENOMEM;
+		goto free;
 	}
 
 	stream->incnt = incnt;
+	goto out;
 
-	return 0;
+free:
+	sched->free(stream);
+	kfree(stream->out);
+	stream->out = NULL;
+out:
+	return ret;
 }
 
 int sctp_stream_init_ext(struct sctp_stream *stream, __u16 sid)
@@ -130,13 +196,15 @@ int sctp_stream_init_ext(struct sctp_stream *stream, __u16 sid)
 		return -ENOMEM;
 	stream->out[sid].ext = soute;
 
-	return 0;
+	return sctp_sched_init_sid(stream, sid, GFP_KERNEL);
 }
 
 void sctp_stream_free(struct sctp_stream *stream)
 {
+	struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream);
 	int i;
 
+	sched->free(stream);
 	for (i = 0; i < stream->outcnt; i++)
 		kfree(stream->out[i].ext);
 	kfree(stream->out);
@@ -156,6 +224,10 @@ void sctp_stream_clear(struct sctp_stream *stream)
 
 void sctp_stream_update(struct sctp_stream *stream, struct sctp_stream *new)
 {
+	struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream);
+
+	sched->unsched_all(stream);
+	sctp_stream_outq_migrate(stream, new, new->outcnt);
 	sctp_stream_free(stream);
 
 	stream->out = new->out;
@@ -163,6 +235,8 @@ void sctp_stream_update(struct sctp_stream *stream, struct sctp_stream *new)
 	stream->outcnt = new->outcnt;
 	stream->incnt  = new->incnt;
 
+	sched->sched_all(stream);
+
 	new->out = NULL;
 	new->in  = NULL;
 }
diff --git a/net/sctp/stream_sched.c b/net/sctp/stream_sched.c
new file mode 100644
index 000000000000..40a9a9de2b98
--- /dev/null
+++ b/net/sctp/stream_sched.c
@@ -0,0 +1,270 @@
+/* SCTP kernel implementation
+ * (C) Copyright Red Hat Inc. 2017
+ *
+ * This file is part of the SCTP kernel implementation
+ *
+ * These functions manipulate sctp stream queue/scheduling.
+ *
+ * This SCTP implementation is free software;
+ * you can redistribute it and/or modify it under the terms of
+ * the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This SCTP implementation is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ *                 ************************
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU CC; see the file COPYING.  If not, see
+ * <http://www.gnu.org/licenses/>.
+ *
+ * Please send any bug reports or fixes you make to the
+ * email address(es):
+ *    lksctp developers <linux-sctp@vger.kernel.org>
+ *
+ * Written or modified by:
+ *    Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
+ */
+
+#include <linux/list.h>
+#include <net/sctp/sctp.h>
+#include <net/sctp/sm.h>
+#include <net/sctp/stream_sched.h>
+
+/* First Come First Serve (a.k.a. FIFO)
+ * RFC DRAFT ndata Section 3.1
+ */
+static int sctp_sched_fcfs_set(struct sctp_stream *stream, __u16 sid,
+			       __u16 value, gfp_t gfp)
+{
+	return 0;
+}
+
+static int sctp_sched_fcfs_get(struct sctp_stream *stream, __u16 sid,
+			       __u16 *value)
+{
+	*value = 0;
+	return 0;
+}
+
+static int sctp_sched_fcfs_init(struct sctp_stream *stream)
+{
+	return 0;
+}
+
+static int sctp_sched_fcfs_init_sid(struct sctp_stream *stream, __u16 sid,
+				    gfp_t gfp)
+{
+	return 0;
+}
+
+static void sctp_sched_fcfs_free(struct sctp_stream *stream)
+{
+}
+
+static void sctp_sched_fcfs_enqueue(struct sctp_outq *q,
+				    struct sctp_datamsg *msg)
+{
+}
+
+static struct sctp_chunk *sctp_sched_fcfs_dequeue(struct sctp_outq *q)
+{
+	struct sctp_stream *stream = &q->asoc->stream;
+	struct sctp_chunk *ch = NULL;
+	struct list_head *entry;
+
+	if (list_empty(&q->out_chunk_list))
+		goto out;
+
+	if (stream->out_curr) {
+		ch = list_entry(stream->out_curr->ext->outq.next,
+				struct sctp_chunk, stream_list);
+	} else {
+		entry = q->out_chunk_list.next;
+		ch = list_entry(entry, struct sctp_chunk, list);
+	}
+
+	sctp_sched_dequeue_common(q, ch);
+
+out:
+	return ch;
+}
+
+static void sctp_sched_fcfs_dequeue_done(struct sctp_outq *q,
+					 struct sctp_chunk *chunk)
+{
+}
+
+static void sctp_sched_fcfs_sched_all(struct sctp_stream *stream)
+{
+}
+
+static void sctp_sched_fcfs_unsched_all(struct sctp_stream *stream)
+{
+}
+
+static struct sctp_sched_ops sctp_sched_fcfs = {
+	.set = sctp_sched_fcfs_set,
+	.get = sctp_sched_fcfs_get,
+	.init = sctp_sched_fcfs_init,
+	.init_sid = sctp_sched_fcfs_init_sid,
+	.free = sctp_sched_fcfs_free,
+	.enqueue = sctp_sched_fcfs_enqueue,
+	.dequeue = sctp_sched_fcfs_dequeue,
+	.dequeue_done = sctp_sched_fcfs_dequeue_done,
+	.sched_all = sctp_sched_fcfs_sched_all,
+	.unsched_all = sctp_sched_fcfs_unsched_all,
+};
+
+/* API to other parts of the stack */
+
+struct sctp_sched_ops *sctp_sched_ops[] = {
+	&sctp_sched_fcfs,
+};
+
+int sctp_sched_set_sched(struct sctp_association *asoc,
+			 enum sctp_sched_type sched)
+{
+	struct sctp_sched_ops *n = sctp_sched_ops[sched];
+	struct sctp_sched_ops *old = asoc->outqueue.sched;
+	struct sctp_datamsg *msg = NULL;
+	struct sctp_chunk *ch;
+	int i, ret = 0;
+
+	if (old == n)
+		return ret;
+
+	if (sched > SCTP_SS_MAX)
+		return -EINVAL;
+
+	if (old) {
+		old->free(&asoc->stream);
+
+		/* Give the next scheduler a clean slate. */
+		for (i = 0; i < asoc->stream.outcnt; i++) {
+			void *p = asoc->stream.out[i].ext;
+
+			if (!p)
+				continue;
+
+			p += offsetofend(struct sctp_stream_out_ext, outq);
+			memset(p, 0, sizeof(struct sctp_stream_out_ext) -
+				     offsetofend(struct sctp_stream_out_ext, outq));
+		}
+	}
+
+	asoc->outqueue.sched = n;
+	n->init(&asoc->stream);
+	for (i = 0; i < asoc->stream.outcnt; i++) {
+		if (!asoc->stream.out[i].ext)
+			continue;
+
+		ret = n->init_sid(&asoc->stream, i, GFP_KERNEL);
+		if (ret)
+			goto err;
+	}
+
+	/* We have to requeue all chunks already queued. */
+	list_for_each_entry(ch, &asoc->outqueue.out_chunk_list, list) {
+		if (ch->msg == msg)
+			continue;
+		msg = ch->msg;
+		n->enqueue(&asoc->outqueue, msg);
+	}
+
+	return ret;
+
+err:
+	n->free(&asoc->stream);
+	asoc->outqueue.sched = &sctp_sched_fcfs; /* Always safe */
+
+	return ret;
+}
+
+int sctp_sched_get_sched(struct sctp_association *asoc)
+{
+	int i;
+
+	for (i = 0; i <= SCTP_SS_MAX; i++)
+		if (asoc->outqueue.sched == sctp_sched_ops[i])
+			return i;
+
+	return 0;
+}
+
+int sctp_sched_set_value(struct sctp_association *asoc, __u16 sid,
+			 __u16 value, gfp_t gfp)
+{
+	if (sid >= asoc->stream.outcnt)
+		return -EINVAL;
+
+	if (!asoc->stream.out[sid].ext) {
+		int ret;
+
+		ret = sctp_stream_init_ext(&asoc->stream, sid);
+		if (ret)
+			return ret;
+	}
+
+	return asoc->outqueue.sched->set(&asoc->stream, sid, value, gfp);
+}
+
+int sctp_sched_get_value(struct sctp_association *asoc, __u16 sid,
+			 __u16 *value)
+{
+	if (sid >= asoc->stream.outcnt)
+		return -EINVAL;
+
+	if (!asoc->stream.out[sid].ext)
+		return 0;
+
+	return asoc->outqueue.sched->get(&asoc->stream, sid, value);
+}
+
+void sctp_sched_dequeue_done(struct sctp_outq *q, struct sctp_chunk *ch)
+{
+	if (!list_is_last(&ch->frag_list, &ch->msg->chunks)) {
+		struct sctp_stream_out *sout;
+		__u16 sid;
+
+		/* datamsg is not finished, so save it as the current one,
+		 * in case the application switches scheduler or a higher
+		 * priority stream comes in.
+		 */
+		sid = sctp_chunk_stream_no(ch);
+		sout = &q->asoc->stream.out[sid];
+		q->asoc->stream.out_curr = sout;
+		return;
+	}
+
+	q->asoc->stream.out_curr = NULL;
+	q->sched->dequeue_done(q, ch);
+}
+
+/* Auxiliary functions for the schedulers */
+void sctp_sched_dequeue_common(struct sctp_outq *q, struct sctp_chunk *ch)
+{
+	list_del_init(&ch->list);
+	list_del_init(&ch->stream_list);
+	q->out_qlen -= ch->skb->len;
+}
+
+int sctp_sched_init_sid(struct sctp_stream *stream, __u16 sid, gfp_t gfp)
+{
+	struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream);
+
+	INIT_LIST_HEAD(&stream->out[sid].ext->outq);
+	return sched->init_sid(stream, sid, gfp);
+}
+
+struct sctp_sched_ops *sctp_sched_ops_from_stream(struct sctp_stream *stream)
+{
+	struct sctp_association *asoc;
+
+	asoc = container_of(stream, struct sctp_association, stream);
+
+	return asoc->outqueue.sched;
+}
-- 
cgit v1.2.3


From 13aa8770fe42d246c6f3a8eb814b85bccb428011 Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Tue, 3 Oct 2017 19:20:14 -0300
Subject: sctp: add sockopt to get/set stream scheduler

As defined per RFC Draft ndata Section 4.3.2, named as
SCTP_STREAM_SCHEDULER.

See-also: https://tools.ietf.org/html/draft-ietf-tsvwg-sctp-ndata-13
Tested-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/sctp.h |  1 +
 net/sctp/socket.c         | 75 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 76 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h
index 4487e7625ddb..0050f10087d2 100644
--- a/include/uapi/linux/sctp.h
+++ b/include/uapi/linux/sctp.h
@@ -122,6 +122,7 @@ typedef __s32 sctp_assoc_t;
 #define SCTP_RESET_ASSOC	120
 #define SCTP_ADD_STREAMS	121
 #define SCTP_SOCKOPT_PEELOFF_FLAGS 122
+#define SCTP_STREAM_SCHEDULER	123
 
 /* PR-SCTP policies */
 #define SCTP_PR_SCTP_NONE	0x0000
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index d207734326b0..ae35dbf2810f 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -79,6 +79,7 @@
 #include <net/sock.h>
 #include <net/sctp/sctp.h>
 #include <net/sctp/sm.h>
+#include <net/sctp/stream_sched.h>
 
 /* Forward declarations for internal helper functions. */
 static int sctp_writeable(struct sock *sk);
@@ -3914,6 +3915,36 @@ out:
 	return retval;
 }
 
+static int sctp_setsockopt_scheduler(struct sock *sk,
+				     char __user *optval,
+				     unsigned int optlen)
+{
+	struct sctp_association *asoc;
+	struct sctp_assoc_value params;
+	int retval = -EINVAL;	/* result of every bare "goto out" below */
+
+	if (optlen < sizeof(params))
+		goto out;
+
+	optlen = sizeof(params);	/* bytes past the struct are ignored */
+	if (copy_from_user(&params, optval, optlen)) {
+		retval = -EFAULT;
+		goto out;
+	}
+
+	if (params.assoc_value > SCTP_SS_MAX)	/* not a scheduler type */
+		goto out;
+
+	asoc = sctp_id2assoc(sk, params.assoc_id);
+	if (!asoc)
+		goto out;
+
+	retval = sctp_sched_set_sched(asoc, params.assoc_value);
+
+out:
+	return retval;
+}
+
 /* API 6.2 setsockopt(), getsockopt()
  *
  * Applications use setsockopt() and getsockopt() to set or retrieve
@@ -4095,6 +4126,9 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname,
 	case SCTP_ADD_STREAMS:
 		retval = sctp_setsockopt_add_streams(sk, optval, optlen);
 		break;
+	case SCTP_STREAM_SCHEDULER:
+		retval = sctp_setsockopt_scheduler(sk, optval, optlen);
+		break;
 	default:
 		retval = -ENOPROTOOPT;
 		break;
@@ -6793,6 +6827,43 @@ out:
 	return retval;
 }
 
+static int sctp_getsockopt_scheduler(struct sock *sk, int len,
+				     char __user *optval,
+				     int __user *optlen)
+{
+	struct sctp_assoc_value params;
+	struct sctp_association *asoc;
+	int retval = -EFAULT;	/* result of every bare "goto out" below */
+
+	if (len < sizeof(params)) {
+		retval = -EINVAL;
+		goto out;
+	}
+
+	len = sizeof(params);	/* read and report exactly one struct */
+	if (copy_from_user(&params, optval, len))	/* for assoc_id */
+		goto out;
+
+	asoc = sctp_id2assoc(sk, params.assoc_id);
+	if (!asoc) {
+		retval = -EINVAL;
+		goto out;
+	}
+
+	params.assoc_value = sctp_sched_get_sched(asoc);
+
+	if (put_user(len, optlen))	/* tell caller how much was written */
+		goto out;
+
+	if (copy_to_user(optval, &params, len))
+		goto out;
+
+	retval = 0;
+
+out:
+	return retval;
+}
+
 static int sctp_getsockopt(struct sock *sk, int level, int optname,
 			   char __user *optval, int __user *optlen)
 {
@@ -6975,6 +7046,10 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname,
 		retval = sctp_getsockopt_enable_strreset(sk, len, optval,
 							 optlen);
 		break;
+	case SCTP_STREAM_SCHEDULER:
+		retval = sctp_getsockopt_scheduler(sk, len, optval,
+						   optlen);
+		break;
 	default:
 		retval = -ENOPROTOOPT;
 		break;
-- 
cgit v1.2.3


From 0ccdf3c7fdeda511b10def19505178a9d2d3fccd Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Tue, 3 Oct 2017 19:20:15 -0300
Subject: sctp: add sockopt to get/set stream scheduler parameters

As defined per RFC Draft ndata Section 4.3.3, named as
SCTP_STREAM_SCHEDULER_VALUE.

See-also: https://tools.ietf.org/html/draft-ietf-tsvwg-sctp-ndata-13
Tested-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/sctp.h |  7 +++++
 net/sctp/socket.c         | 77 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 84 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h
index 0050f10087d2..00ac417d2c4f 100644
--- a/include/uapi/linux/sctp.h
+++ b/include/uapi/linux/sctp.h
@@ -123,6 +123,7 @@ typedef __s32 sctp_assoc_t;
 #define SCTP_ADD_STREAMS	121
 #define SCTP_SOCKOPT_PEELOFF_FLAGS 122
 #define SCTP_STREAM_SCHEDULER	123
+#define SCTP_STREAM_SCHEDULER_VALUE	124
 
 /* PR-SCTP policies */
 #define SCTP_PR_SCTP_NONE	0x0000
@@ -815,6 +816,12 @@ struct sctp_assoc_value {
     uint32_t                assoc_value;
 };
 
+struct sctp_stream_value {
+	sctp_assoc_t assoc_id;	/* association looked up by the sockopt */
+	uint16_t stream_id;	/* stream id handed to the scheduler */
+	uint16_t stream_value;	/* scheduler-specific value for that stream */
+};
+
 /*
  * 7.2.2 Peer Address Information
  *
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index ae35dbf2810f..88c28421ec15 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -3945,6 +3945,34 @@ out:
 	return retval;
 }
 
+static int sctp_setsockopt_scheduler_value(struct sock *sk,
+					   char __user *optval,
+					   unsigned int optlen)
+{
+	struct sctp_association *asoc;
+	struct sctp_stream_value params;
+	int retval = -EINVAL;	/* result of every bare "goto out" below */
+
+	if (optlen < sizeof(params))
+		goto out;
+
+	optlen = sizeof(params);	/* bytes past the struct are ignored */
+	if (copy_from_user(&params, optval, optlen)) {
+		retval = -EFAULT;
+		goto out;
+	}
+
+	asoc = sctp_id2assoc(sk, params.assoc_id);
+	if (!asoc)
+		goto out;
+
+	retval = sctp_sched_set_value(asoc, params.stream_id,
+				      params.stream_value, GFP_KERNEL);
+
+out:
+	return retval;
+}
+
 /* API 6.2 setsockopt(), getsockopt()
  *
  * Applications use setsockopt() and getsockopt() to set or retrieve
@@ -4129,6 +4157,9 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname,
 	case SCTP_STREAM_SCHEDULER:
 		retval = sctp_setsockopt_scheduler(sk, optval, optlen);
 		break;
+	case SCTP_STREAM_SCHEDULER_VALUE:
+		retval = sctp_setsockopt_scheduler_value(sk, optval, optlen);
+		break;
 	default:
 		retval = -ENOPROTOOPT;
 		break;
@@ -6864,6 +6895,48 @@ out:
 	return retval;
 }
 
+static int sctp_getsockopt_scheduler_value(struct sock *sk, int len,
+					   char __user *optval,
+					   int __user *optlen)
+{
+	struct sctp_stream_value params;
+	struct sctp_association *asoc;
+	int retval = -EFAULT;	/* used if the first user copy fails */
+
+	if (len < sizeof(params)) {
+		retval = -EINVAL;
+		goto out;
+	}
+
+	len = sizeof(params);	/* read and report exactly one struct */
+	if (copy_from_user(&params, optval, len))
+		goto out;
+
+	asoc = sctp_id2assoc(sk, params.assoc_id);
+	if (!asoc) {
+		retval = -EINVAL;
+		goto out;
+	}
+
+	retval = sctp_sched_get_value(asoc, params.stream_id,
+				      &params.stream_value);
+	if (retval)	/* pass the scheduler's error straight through */
+		goto out;
+
+	if (put_user(len, optlen)) {
+		retval = -EFAULT;
+		goto out;
+	}
+
+	if (copy_to_user(optval, &params, len)) {
+		retval = -EFAULT;
+		goto out;
+	}
+
+out:	/* on full success retval is still the 0 from the getter */
+	return retval;
+}
+
 static int sctp_getsockopt(struct sock *sk, int level, int optname,
 			   char __user *optval, int __user *optlen)
 {
@@ -7050,6 +7123,10 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname,
 		retval = sctp_getsockopt_scheduler(sk, len, optval,
 						   optlen);
 		break;
+	case SCTP_STREAM_SCHEDULER_VALUE:
+		retval = sctp_getsockopt_scheduler_value(sk, len, optval,
+							 optlen);
+		break;
 	default:
 		retval = -ENOPROTOOPT;
 		break;
-- 
cgit v1.2.3


From 637784ade221a3c8a7ecd0f583eddd95d6276b9a Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Tue, 3 Oct 2017 19:20:16 -0300
Subject: sctp: introduce priority based stream scheduler

This patch introduces RFC Draft ndata section 3.4 Priority Based
Scheduler (SCTP_SS_PRIO).

It works by having a struct sctp_stream_priorities for each priority
configured. This struct is then enlisted on a queue ordered per priority
if, and only if, there is a stream with data queued, so that dequeueing
is very straightforward: either finish current datamsg or simply dequeue
from the highest priority queued, which is the next stream pointed, and
that's it.

If there are multiple streams assigned with the same priority and with
data queued, it will do round robin amongst them while respecting
datamsgs boundaries (when not using idata chunks), to be reasonably
fair.

We intentionally don't maintain a list of priorities nor a list of all
streams with the same priority to save memory. The first would mean at
least 2 other pointers per priority (which, for 1000 priorities, that
can mean 16kB) and the second would also mean 2 other pointers but per
stream. As SCTP supports up to 65535 streams on a given asoc, that's
1MB. This impacts when giving a priority to some stream, as we have to
find out if the new priority is already being used and if we can free
the old one, and also when tearing down.

The new fields in struct sctp_stream_out_ext and sctp_stream are added
under a union because that memory is to be shared with other schedulers.

See-also: https://tools.ietf.org/html/draft-ietf-tsvwg-sctp-ndata-13
Tested-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/structs.h   |  24 +++
 include/uapi/linux/sctp.h    |   3 +-
 net/sctp/Makefile            |   2 +-
 net/sctp/stream_sched.c      |   3 +
 net/sctp/stream_sched_prio.c | 347 +++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 377 insertions(+), 2 deletions(-)
 create mode 100644 net/sctp/stream_sched_prio.c

(limited to 'include/uapi/linux')

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 3c22a30fd71b..40eb8d66a37c 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -1328,10 +1328,27 @@ struct sctp_inithdr_host {
 	__u32 initial_tsn;
 };
 
+struct sctp_stream_priorities {
+	/* Node in the list of priorities scheduled */
+	struct list_head prio_sched;
+	/* List of streams scheduled at this priority */
+	struct list_head active;
+	/* The next stream in line */
+	struct sctp_stream_out_ext *next;
+	__u16 prio;	/* Priority value this entry stands for */
+};
+
 struct sctp_stream_out_ext {
 	__u64 abandoned_unsent[SCTP_PR_INDEX(MAX) + 1];
 	__u64 abandoned_sent[SCTP_PR_INDEX(MAX) + 1];
 	struct list_head outq; /* chunks enqueued by this stream */
+	union {
+		struct {
+			/* Scheduled streams list */
+			struct list_head prio_list;
+			struct sctp_stream_priorities *prio_head;
+		};
+	};
 };
 
 struct sctp_stream_out {
@@ -1351,6 +1368,13 @@ struct sctp_stream {
 	__u16 incnt;
 	/* Current stream being sent, if any */
 	struct sctp_stream_out *out_curr;
+	union {
+		/* Fields used by priority scheduler */
+		struct {
+			/* List of priorities scheduled */
+			struct list_head prio_list;
+		};
+	};
 };
 
 #define SCTP_STREAM_CLOSED		0x00
diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h
index 00ac417d2c4f..850fa8b29d7e 100644
--- a/include/uapi/linux/sctp.h
+++ b/include/uapi/linux/sctp.h
@@ -1099,7 +1099,8 @@ struct sctp_add_streams {
 /* SCTP Stream schedulers */
 enum sctp_sched_type {
 	SCTP_SS_FCFS,
-	SCTP_SS_MAX = SCTP_SS_FCFS
+	SCTP_SS_PRIO,
+	SCTP_SS_MAX = SCTP_SS_PRIO
 };
 
 #endif /* _UAPI_SCTP_H */
diff --git a/net/sctp/Makefile b/net/sctp/Makefile
index 0f6e6d1d69fd..647c9cfd4e95 100644
--- a/net/sctp/Makefile
+++ b/net/sctp/Makefile
@@ -12,7 +12,7 @@ sctp-y := sm_statetable.o sm_statefuns.o sm_sideeffect.o \
 	  inqueue.o outqueue.o ulpqueue.o \
 	  tsnmap.o bind_addr.o socket.o primitive.o \
 	  output.o input.o debug.o stream.o auth.o \
-	  offload.o stream_sched.o
+	  offload.o stream_sched.o stream_sched_prio.o
 
 sctp_probe-y := probe.o
 
diff --git a/net/sctp/stream_sched.c b/net/sctp/stream_sched.c
index 40a9a9de2b98..115ddb765169 100644
--- a/net/sctp/stream_sched.c
+++ b/net/sctp/stream_sched.c
@@ -121,8 +121,11 @@ static struct sctp_sched_ops sctp_sched_fcfs = {
 
 /* API to other parts of the stack */
 
+extern struct sctp_sched_ops sctp_sched_prio;
+
 struct sctp_sched_ops *sctp_sched_ops[] = {
 	&sctp_sched_fcfs,
+	&sctp_sched_prio,
 };
 
 int sctp_sched_set_sched(struct sctp_association *asoc,
diff --git a/net/sctp/stream_sched_prio.c b/net/sctp/stream_sched_prio.c
new file mode 100644
index 000000000000..384dbf3c8760
--- /dev/null
+++ b/net/sctp/stream_sched_prio.c
@@ -0,0 +1,347 @@
+/* SCTP kernel implementation
+ * (C) Copyright Red Hat Inc. 2017
+ *
+ * This file is part of the SCTP kernel implementation
+ *
+ * These functions manipulate sctp stream queue/scheduling.
+ *
+ * This SCTP implementation is free software;
+ * you can redistribute it and/or modify it under the terms of
+ * the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This SCTP implementation is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ *                 ************************
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU CC; see the file COPYING.  If not, see
+ * <http://www.gnu.org/licenses/>.
+ *
+ * Please send any bug reports or fixes you make to the
+ * email address(es):
+ *    lksctp developers <linux-sctp@vger.kernel.org>
+ *
+ * Written or modified by:
+ *    Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
+ */
+
+#include <linux/list.h>
+#include <net/sctp/sctp.h>
+#include <net/sctp/sm.h>
+#include <net/sctp/stream_sched.h>
+
+/* Priority handling
+ * RFC DRAFT ndata section 3.4
+ */
+
+static void sctp_sched_prio_unsched_all(struct sctp_stream *stream);
+
+static struct sctp_stream_priorities *sctp_sched_prio_new_head(
+			struct sctp_stream *stream, int prio, gfp_t gfp)
+{
+	struct sctp_stream_priorities *p;
+
+	p = kmalloc(sizeof(*p), gfp);
+	if (!p)
+		return NULL;
+
+	INIT_LIST_HEAD(&p->prio_sched);	/* not on any priority list */
+	INIT_LIST_HEAD(&p->active);	/* no streams scheduled yet */
+	p->next = NULL;
+	p->prio = prio;			/* int narrowed to the __u16 field */
+
+	return p;
+}
+
+static struct sctp_stream_priorities *sctp_sched_prio_get_head(
+			struct sctp_stream *stream, int prio, gfp_t gfp)
+{
+	struct sctp_stream_priorities *p;
+	int i;
+
+	/* Look into scheduled priorities first: the list is kept in
+	 * ascending prio order, so we can stop at the first larger value.
+	 */
+	list_for_each_entry(p, &stream->prio_list, prio_sched) {
+		if (p->prio == prio)
+			return p;
+		if (p->prio > prio)
+			break;
+	}
+
+	/* No luck. Check the head attached to each stream instead. */
+	for (i = 0; i < stream->outcnt; i++) {
+		if (!stream->out[i].ext)
+			continue;
+
+		p = stream->out[i].ext->prio_head;
+		if (!p)
+			/* Means all other streams won't be initialized
+			 * as well.
+			 */
+			break;
+		if (p->prio == prio)
+			return p;
+	}
+
+	/* Not found anywhere: allocate a new one (NULL if that fails). */
+	return sctp_sched_prio_new_head(stream, prio, gfp);
+}
+
+static void sctp_sched_prio_next_stream(struct sctp_stream_priorities *p)
+{
+	struct list_head *pos;
+
+	pos = p->next->prio_list.next;
+	if (pos == &p->active)	/* hit the list head: wrap past it */
+		pos = pos->next;
+	p->next = list_entry(pos, struct sctp_stream_out_ext, prio_list);
+}
+
+static bool sctp_sched_prio_unsched(struct sctp_stream_out_ext *soute)
+{
+	bool scheduled = false;
+
+	if (!list_empty(&soute->prio_list)) {
+		struct sctp_stream_priorities *prio_head = soute->prio_head;
+
+		/* Scheduled */
+		scheduled = true;
+
+		if (prio_head->next == soute)
+			/* Try to move to the next stream */
+			sctp_sched_prio_next_stream(prio_head);
+
+		list_del_init(&soute->prio_list);
+
+		/* Also unsched the priority if this was the last stream */
+		if (list_empty(&prio_head->active)) {
+			list_del_init(&prio_head->prio_sched);
+			/* If there is no stream left, clear next */
+			prio_head->next = NULL;
+		}
+	}
+
+	return scheduled;
+}
+
+static void sctp_sched_prio_sched(struct sctp_stream *stream,
+				  struct sctp_stream_out_ext *soute)
+{
+	struct sctp_stream_priorities *prio, *prio_head;
+
+	prio_head = soute->prio_head;
+
+	/* Nothing to do if already scheduled */
+	if (!list_empty(&soute->prio_list))
+		return;
+
+	/* Schedule the stream. If there is a next, we schedule the new
+	 * one before it, so it's the last in round robin order.
+	 * If there isn't, we also have to schedule the priority.
+	 */
+	if (prio_head->next) {
+		list_add(&soute->prio_list, prio_head->next->prio_list.prev);
+		return;
+	}
+
+	list_add(&soute->prio_list, &prio_head->active);
+	prio_head->next = soute;
+
+	list_for_each_entry(prio, &stream->prio_list, prio_sched) {
+		if (prio->prio > prio_head->prio) {	/* keep ascending */
+			list_add(&prio_head->prio_sched, prio->prio_sched.prev);
+			return;
+		}
+	}
+
+	list_add_tail(&prio_head->prio_sched, &stream->prio_list);
+}
+
+static int sctp_sched_prio_set(struct sctp_stream *stream, __u16 sid,
+			       __u16 prio, gfp_t gfp)
+{
+	struct sctp_stream_out *sout = &stream->out[sid];
+	struct sctp_stream_out_ext *soute = sout->ext;
+	struct sctp_stream_priorities *prio_head, *old;
+	bool reschedule = false;
+	int i;
+
+	prio_head = sctp_sched_prio_get_head(stream, prio, gfp);
+	if (!prio_head)
+		return -ENOMEM;	/* no existing head and allocation failed */
+
+	reschedule = sctp_sched_prio_unsched(soute);
+	old = soute->prio_head;
+	soute->prio_head = prio_head;
+	if (reschedule)	/* it was queued: queue it again under the new head */
+		sctp_sched_prio_sched(stream, soute);
+
+	if (!old)
+		/* Happens when we set the priority for the first time */
+		return 0;
+
+	for (i = 0; i < stream->outcnt; i++) {
+		soute = stream->out[i].ext;
+		if (soute && soute->prio_head == old)
+			/* It's still in use, nothing else to do here. */
+			return 0;
+	}
+
+	/* No hits, we are good to free it. */
+	kfree(old);
+
+	return 0;
+}
+
+static int sctp_sched_prio_get(struct sctp_stream *stream, __u16 sid,
+			       __u16 *value)
+{
+	*value = stream->out[sid].ext->prio_head->prio;
+	return 0;
+}
+
+static int sctp_sched_prio_init(struct sctp_stream *stream)
+{
+	INIT_LIST_HEAD(&stream->prio_list);
+
+	return 0;
+}
+
+static int sctp_sched_prio_init_sid(struct sctp_stream *stream, __u16 sid,
+				    gfp_t gfp)
+{
+	INIT_LIST_HEAD(&stream->out[sid].ext->prio_list);
+	return sctp_sched_prio_set(stream, sid, 0, gfp);
+}
+
+static void sctp_sched_prio_free(struct sctp_stream *stream)
+{
+	struct sctp_stream_priorities *prio, *n;
+	LIST_HEAD(list);
+	int i;
+
+	/* As we don't keep a list of priorities, to avoid multiple
+	 * frees we have to do it in 3 steps:
+	 *   1. unsched everyone, so the lists are free to use in 2.
+	 *   2. build the list of the priorities
+	 *   3. free the list
+	 */
+	sctp_sched_prio_unsched_all(stream);
+	for (i = 0; i < stream->outcnt; i++) {
+		if (!stream->out[i].ext)
+			continue;
+		prio = stream->out[i].ext->prio_head;
+		if (prio && list_empty(&prio->prio_sched))
+			list_add(&prio->prio_sched, &list);
+	}
+	list_for_each_entry_safe(prio, n, &list, prio_sched) {
+		list_del_init(&prio->prio_sched);
+		kfree(prio);
+	}
+}
+
+static void sctp_sched_prio_enqueue(struct sctp_outq *q,
+				    struct sctp_datamsg *msg)
+{
+	struct sctp_stream *stream;
+	struct sctp_chunk *ch;
+	__u16 sid;
+
+	ch = list_first_entry(&msg->chunks, struct sctp_chunk, frag_list);
+	sid = sctp_chunk_stream_no(ch);
+	stream = &q->asoc->stream;
+	sctp_sched_prio_sched(stream, stream->out[sid].ext);
+}
+
+static struct sctp_chunk *sctp_sched_prio_dequeue(struct sctp_outq *q)
+{
+	struct sctp_stream *stream = &q->asoc->stream;
+	struct sctp_stream_priorities *prio;
+	struct sctp_stream_out_ext *soute;
+	struct sctp_chunk *ch = NULL;
+
+	/* Bail out quickly if queue is empty */
+	if (list_empty(&q->out_chunk_list))
+		goto out;
+
+	/* Find which chunk is next. It's easy, it's either the current
+	 * one or the first chunk on the next active stream.
+	 */
+	if (stream->out_curr) {
+		soute = stream->out_curr->ext;
+	} else {
+		prio = list_entry(stream->prio_list.next,
+				  struct sctp_stream_priorities, prio_sched);
+		soute = prio->next;
+	}
+	ch = list_entry(soute->outq.next, struct sctp_chunk, stream_list);
+	sctp_sched_dequeue_common(q, ch);
+
+out:
+	return ch;
+}
+
+static void sctp_sched_prio_dequeue_done(struct sctp_outq *q,
+					 struct sctp_chunk *ch)
+{
+	struct sctp_stream_priorities *prio;
+	struct sctp_stream_out_ext *soute;
+	__u16 sid;
+
+	/* Last chunk on that msg, move to the next stream on
+	 * this priority.
+	 */
+	sid = sctp_chunk_stream_no(ch);
+	soute = q->asoc->stream.out[sid].ext;
+	prio = soute->prio_head;
+
+	sctp_sched_prio_next_stream(prio);
+
+	if (list_empty(&soute->outq))
+		sctp_sched_prio_unsched(soute);
+}
+
+static void sctp_sched_prio_sched_all(struct sctp_stream *stream)
+{
+	struct sctp_association *asoc;
+	struct sctp_stream_out *sout;
+	struct sctp_chunk *ch;
+
+	asoc = container_of(stream, struct sctp_association, stream);
+	list_for_each_entry(ch, &asoc->outqueue.out_chunk_list, list) {
+		__u16 sid;
+
+		sid = sctp_chunk_stream_no(ch);
+		sout = &stream->out[sid];
+		if (sout->ext)
+			sctp_sched_prio_sched(stream, sout->ext);
+	}
+}
+
+static void sctp_sched_prio_unsched_all(struct sctp_stream *stream)
+{
+	struct sctp_stream_priorities *p, *tmp;
+	struct sctp_stream_out_ext *soute, *souttmp;
+
+	list_for_each_entry_safe(p, tmp, &stream->prio_list, prio_sched)
+		list_for_each_entry_safe(soute, souttmp, &p->active, prio_list)
+			sctp_sched_prio_unsched(soute);
+}
+
+struct sctp_sched_ops sctp_sched_prio = {
+	.set = sctp_sched_prio_set,
+	.get = sctp_sched_prio_get,
+	.init = sctp_sched_prio_init,
+	.init_sid = sctp_sched_prio_init_sid,
+	.free = sctp_sched_prio_free,
+	.enqueue = sctp_sched_prio_enqueue,
+	.dequeue = sctp_sched_prio_dequeue,
+	.dequeue_done = sctp_sched_prio_dequeue_done,
+	.sched_all = sctp_sched_prio_sched_all,
+	.unsched_all = sctp_sched_prio_unsched_all,
+};
-- 
cgit v1.2.3


From ac1ed8b82cd60ba8e7d84103ac1414b8c577c485 Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Tue, 3 Oct 2017 19:20:17 -0300
Subject: sctp: introduce round robin stream scheduler

This patch introduces RFC Draft ndata section 3.2 Round Robin
Scheduler (SCTP_SS_RR).

Works by maintaining a list of enqueued streams and tracking the last
one used to send data. When the datamsg is done, it switches to the next
stream.

See-also: https://tools.ietf.org/html/draft-ietf-tsvwg-sctp-ndata-13
Tested-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/structs.h |  11 +++
 include/uapi/linux/sctp.h  |   3 +-
 net/sctp/Makefile          |   3 +-
 net/sctp/stream_sched.c    |   2 +
 net/sctp/stream_sched_rr.c | 201 +++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 218 insertions(+), 2 deletions(-)
 create mode 100644 net/sctp/stream_sched_rr.c

(limited to 'include/uapi/linux')

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 40eb8d66a37c..16f949eef52f 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -1348,6 +1348,10 @@ struct sctp_stream_out_ext {
 			struct list_head prio_list;
 			struct sctp_stream_priorities *prio_head;
 		};
+		/* Fields used by RR scheduler */
+		struct {
+			struct list_head rr_list;
+		};
 	};
 };
 
@@ -1374,6 +1378,13 @@ struct sctp_stream {
 			/* List of priorities scheduled */
 			struct list_head prio_list;
 		};
+		/* Fields used by RR scheduler */
+		struct {
+			/* List of streams scheduled */
+			struct list_head rr_list;
+			/* The next stream in line */
+			struct sctp_stream_out_ext *rr_next;
+		};
 	};
 };
 
diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h
index 850fa8b29d7e..6cd7d416ca40 100644
--- a/include/uapi/linux/sctp.h
+++ b/include/uapi/linux/sctp.h
@@ -1100,7 +1100,8 @@ struct sctp_add_streams {
 enum sctp_sched_type {
 	SCTP_SS_FCFS,
 	SCTP_SS_PRIO,
-	SCTP_SS_MAX = SCTP_SS_PRIO
+	SCTP_SS_RR,
+	SCTP_SS_MAX = SCTP_SS_RR
 };
 
 #endif /* _UAPI_SCTP_H */
diff --git a/net/sctp/Makefile b/net/sctp/Makefile
index 647c9cfd4e95..bf90c5397719 100644
--- a/net/sctp/Makefile
+++ b/net/sctp/Makefile
@@ -12,7 +12,8 @@ sctp-y := sm_statetable.o sm_statefuns.o sm_sideeffect.o \
 	  inqueue.o outqueue.o ulpqueue.o \
 	  tsnmap.o bind_addr.o socket.o primitive.o \
 	  output.o input.o debug.o stream.o auth.o \
-	  offload.o stream_sched.o stream_sched_prio.o
+	  offload.o stream_sched.o stream_sched_prio.o \
+	  stream_sched_rr.o
 
 sctp_probe-y := probe.o
 
diff --git a/net/sctp/stream_sched.c b/net/sctp/stream_sched.c
index 115ddb765169..03513a9fa110 100644
--- a/net/sctp/stream_sched.c
+++ b/net/sctp/stream_sched.c
@@ -122,10 +122,12 @@ static struct sctp_sched_ops sctp_sched_fcfs = {
 /* API to other parts of the stack */
 
 extern struct sctp_sched_ops sctp_sched_prio;
+extern struct sctp_sched_ops sctp_sched_rr;
 
 struct sctp_sched_ops *sctp_sched_ops[] = {
 	&sctp_sched_fcfs,
 	&sctp_sched_prio,
+	&sctp_sched_rr,
 };
 
 int sctp_sched_set_sched(struct sctp_association *asoc,
diff --git a/net/sctp/stream_sched_rr.c b/net/sctp/stream_sched_rr.c
new file mode 100644
index 000000000000..7612a438c5b9
--- /dev/null
+++ b/net/sctp/stream_sched_rr.c
@@ -0,0 +1,201 @@
+/* SCTP kernel implementation
+ * (C) Copyright Red Hat Inc. 2017
+ *
+ * This file is part of the SCTP kernel implementation
+ *
+ * These functions manipulate sctp stream queue/scheduling.
+ *
+ * This SCTP implementation is free software;
+ * you can redistribute it and/or modify it under the terms of
+ * the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This SCTP implementation is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ *                 ************************
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU CC; see the file COPYING.  If not, see
+ * <http://www.gnu.org/licenses/>.
+ *
+ * Please send any bug reports or fixes you make to the
+ * email address(es):
+ *    lksctp developers <linux-sctp@vger.kernel.org>
+ *
+ * Written or modified by:
+ *    Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
+ */
+
+#include <linux/list.h>
+#include <net/sctp/sctp.h>
+#include <net/sctp/sm.h>
+#include <net/sctp/stream_sched.h>
+
+/* Round-robin handling
+ * RFC DRAFT ndata section 3.2
+ */
+static void sctp_sched_rr_unsched_all(struct sctp_stream *stream);
+
+static void sctp_sched_rr_next_stream(struct sctp_stream *stream)
+{
+	struct list_head *pos;
+
+	pos = stream->rr_next->rr_list.next;
+	if (pos == &stream->rr_list)	/* hit the list head: wrap past it */
+		pos = pos->next;
+	stream->rr_next = list_entry(pos, struct sctp_stream_out_ext, rr_list);
+}
+
+static void sctp_sched_rr_unsched(struct sctp_stream *stream,
+				  struct sctp_stream_out_ext *soute)
+{
+	if (stream->rr_next == soute)
+		/* Try to move to the next stream */
+		sctp_sched_rr_next_stream(stream);
+
+	list_del_init(&soute->rr_list);
+
+	/* If we have no other stream queued, clear next */
+	if (list_empty(&stream->rr_list))
+		stream->rr_next = NULL;
+}
+
+static void sctp_sched_rr_sched(struct sctp_stream *stream,
+				struct sctp_stream_out_ext *soute)
+{
+	if (!list_empty(&soute->rr_list))
+		/* Already scheduled. */
+		return;
+
+	/* Schedule the stream */
+	list_add_tail(&soute->rr_list, &stream->rr_list);
+
+	if (!stream->rr_next)
+		stream->rr_next = soute;
+}
+
+static int sctp_sched_rr_set(struct sctp_stream *stream, __u16 sid,
+			     __u16 prio, gfp_t gfp)
+{
+	return 0;	/* all arguments are ignored; nothing is stored */
+}
+
+static int sctp_sched_rr_get(struct sctp_stream *stream, __u16 sid,
+			     __u16 *value)
+{
+	return 0;	/* *value is left untouched */
+}
+
+static int sctp_sched_rr_init(struct sctp_stream *stream)
+{
+	INIT_LIST_HEAD(&stream->rr_list);
+	stream->rr_next = NULL;
+
+	return 0;
+}
+
+static int sctp_sched_rr_init_sid(struct sctp_stream *stream, __u16 sid,
+				  gfp_t gfp)
+{
+	INIT_LIST_HEAD(&stream->out[sid].ext->rr_list);
+
+	return 0;
+}
+
+static void sctp_sched_rr_free(struct sctp_stream *stream)
+{
+	sctp_sched_rr_unsched_all(stream);
+}
+
+static void sctp_sched_rr_enqueue(struct sctp_outq *q,
+				  struct sctp_datamsg *msg)
+{
+	struct sctp_stream *stream;
+	struct sctp_chunk *ch;
+	__u16 sid;
+
+	ch = list_first_entry(&msg->chunks, struct sctp_chunk, frag_list);
+	sid = sctp_chunk_stream_no(ch);
+	stream = &q->asoc->stream;
+	sctp_sched_rr_sched(stream, stream->out[sid].ext);
+}
+
+static struct sctp_chunk *sctp_sched_rr_dequeue(struct sctp_outq *q)
+{
+	struct sctp_stream *stream = &q->asoc->stream;
+	struct sctp_stream_out_ext *soute;
+	struct sctp_chunk *ch = NULL;
+
+	/* Bail out quickly if queue is empty */
+	if (list_empty(&q->out_chunk_list))
+		goto out;
+
+	/* Find which chunk is next */
+	if (stream->out_curr)
+		soute = stream->out_curr->ext;
+	else
+		soute = stream->rr_next;
+	ch = list_entry(soute->outq.next, struct sctp_chunk, stream_list);
+
+	sctp_sched_dequeue_common(q, ch);
+
+out:
+	return ch;
+}
+
+static void sctp_sched_rr_dequeue_done(struct sctp_outq *q,
+				       struct sctp_chunk *ch)
+{
+	struct sctp_stream_out_ext *soute;
+	__u16 sid;
+
+	/* Last chunk on that msg, move to the next stream */
+	sid = sctp_chunk_stream_no(ch);
+	soute = q->asoc->stream.out[sid].ext;
+
+	sctp_sched_rr_next_stream(&q->asoc->stream);
+
+	if (list_empty(&soute->outq))
+		sctp_sched_rr_unsched(&q->asoc->stream, soute);
+}
+
+static void sctp_sched_rr_sched_all(struct sctp_stream *stream)
+{
+	struct sctp_association *asoc;
+	struct sctp_stream_out_ext *soute;
+	struct sctp_chunk *ch;
+
+	asoc = container_of(stream, struct sctp_association, stream);
+	list_for_each_entry(ch, &asoc->outqueue.out_chunk_list, list) {
+		__u16 sid;
+
+		sid = sctp_chunk_stream_no(ch);
+		soute = stream->out[sid].ext;
+		if (soute)
+			sctp_sched_rr_sched(stream, soute);
+	}
+}
+
+static void sctp_sched_rr_unsched_all(struct sctp_stream *stream)
+{
+	struct sctp_stream_out_ext *soute, *tmp;
+
+	list_for_each_entry_safe(soute, tmp, &stream->rr_list, rr_list)
+		sctp_sched_rr_unsched(stream, soute);
+}
+
+struct sctp_sched_ops sctp_sched_rr = {
+	.set = sctp_sched_rr_set,
+	.get = sctp_sched_rr_get,
+	.init = sctp_sched_rr_init,
+	.init_sid = sctp_sched_rr_init_sid,
+	.free = sctp_sched_rr_free,
+	.enqueue = sctp_sched_rr_enqueue,
+	.dequeue = sctp_sched_rr_dequeue,
+	.dequeue_done = sctp_sched_rr_dequeue_done,
+	.sched_all = sctp_sched_rr_sched_all,
+	.unsched_all = sctp_sched_rr_unsched_all,
+};
-- 
cgit v1.2.3


From 6263368c5b0b758d8639cad37a2a6493c9370425 Mon Sep 17 00:00:00 2001
From: Ed Blake <ed.blake@sondrel.com>
Date: Tue, 26 Sep 2017 11:40:02 +0100
Subject: serial: Add define for max baud rate divisor

Add a define for the maximum baud rate divisor, to improve code
readability.

Signed-off-by: Ed Blake <ed.blake@sondrel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/serial/8250/8250_mtk.c  | 2 +-
 drivers/tty/serial/8250/8250_omap.c | 4 ++--
 drivers/tty/serial/8250/8250_port.c | 2 +-
 include/uapi/linux/serial_reg.h     | 1 +
 4 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/tty/serial/8250/8250_mtk.c b/drivers/tty/serial/8250/8250_mtk.c
index fb45770d47aa..fef9823d7b4c 100644
--- a/drivers/tty/serial/8250/8250_mtk.c
+++ b/drivers/tty/serial/8250/8250_mtk.c
@@ -61,7 +61,7 @@ mtk8250_set_termios(struct uart_port *port, struct ktermios *termios,
 	 * registers to their default values.
 	 */
 	baud = uart_get_baud_rate(port, termios, old,
-				  port->uartclk / 16 / 0xffff,
+				  port->uartclk / 16 / UART_DIV_MAX,
 				  port->uartclk);
 
 	if (baud <= 115200) {
diff --git a/drivers/tty/serial/8250/8250_omap.c b/drivers/tty/serial/8250/8250_omap.c
index 833771bca0a5..4938d338e01f 100644
--- a/drivers/tty/serial/8250/8250_omap.c
+++ b/drivers/tty/serial/8250/8250_omap.c
@@ -199,7 +199,7 @@ static void omap_8250_get_divisor(struct uart_port *port, unsigned int baud,
 	 * Old custom speed handling.
 	 */
 	if (baud == 38400 && (port->flags & UPF_SPD_MASK) == UPF_SPD_CUST) {
-		priv->quot = port->custom_divisor & 0xffff;
+		priv->quot = port->custom_divisor & UART_DIV_MAX;
 		/*
 		 * I assume that nobody is using this. But hey, if somebody
 		 * would like to specify the divisor _and_ the mode then the
@@ -358,7 +358,7 @@ static void omap_8250_set_termios(struct uart_port *port,
 	 * Ask the core to calculate the divisor for us.
 	 */
 	baud = uart_get_baud_rate(port, termios, old,
-				  port->uartclk / 16 / 0xffff,
+				  port->uartclk / 16 / UART_DIV_MAX,
 				  port->uartclk / 13);
 	omap_8250_get_divisor(port, baud, priv);
 
diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c
index f0cc04f62b67..01ab2188a151 100644
--- a/drivers/tty/serial/8250/8250_port.c
+++ b/drivers/tty/serial/8250/8250_port.c
@@ -2601,7 +2601,7 @@ static unsigned int serial8250_get_baud_rate(struct uart_port *port,
 	 * causing transmission errors.
 	 */
 	return uart_get_baud_rate(port, termios, old,
-				  port->uartclk / 16 / 0xffff,
+				  port->uartclk / 16 / UART_DIV_MAX,
 				  port->uartclk);
 }
 
diff --git a/include/uapi/linux/serial_reg.h b/include/uapi/linux/serial_reg.h
index 5db76880b4ad..dea05724c760 100644
--- a/include/uapi/linux/serial_reg.h
+++ b/include/uapi/linux/serial_reg.h
@@ -157,6 +157,7 @@
  */
 #define UART_DLL	0	/* Out: Divisor Latch Low */
 #define UART_DLM	1	/* Out: Divisor Latch High */
+#define UART_DIV_MAX	0xFFFF	/* Max divisor value */
 
 /*
  * LCR=0xBF (or DLAB=1 for 16C660)
-- 
cgit v1.2.3


From 324bda9e6c5add86ba2e1066476481c48132aca0 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Mon, 2 Oct 2017 22:50:21 -0700
Subject: bpf: multi program support for cgroup+bpf

introduce BPF_F_ALLOW_MULTI flag that can be used to attach multiple
bpf programs to a cgroup.

The difference between three possible flags for BPF_PROG_ATTACH command:
- NONE(default): No further bpf programs allowed in the subtree.
- BPF_F_ALLOW_OVERRIDE: If a sub-cgroup installs some bpf program,
  the program in this cgroup yields to sub-cgroup program.
- BPF_F_ALLOW_MULTI: If a sub-cgroup installs some bpf program,
  that cgroup program gets run in addition to the program in this cgroup.

NONE and BPF_F_ALLOW_OVERRIDE existed before. This patch doesn't
change their behavior. It only clarifies the semantics in relation
to new flag.

Only one program is allowed to be attached to a cgroup with
NONE or BPF_F_ALLOW_OVERRIDE flag.
Multiple programs are allowed to be attached to a cgroup with
BPF_F_ALLOW_MULTI flag. They are executed in FIFO order
(those that were attached first, run first)
The programs of sub-cgroup are executed first, then programs of
this cgroup and then programs of parent cgroup.
All eligible programs are executed regardless of return code from
earlier programs.

To allow efficient execution of multiple programs attached to a cgroup
and to avoid penalizing cgroups without any programs attached
introduce 'struct bpf_prog_array' which is RCU protected array
of pointers to bpf programs.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Martin KaFai Lau <kafai@fb.com>
for cgroup bits
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf-cgroup.h |  46 +++--
 include/linux/bpf.h        |  32 ++++
 include/linux/filter.h     |   2 +-
 include/uapi/linux/bpf.h   |  42 +++-
 kernel/bpf/cgroup.c        | 467 ++++++++++++++++++++++++++++++++-------------
 kernel/bpf/core.c          |  31 +++
 kernel/bpf/syscall.c       |  37 ++--
 kernel/cgroup/cgroup.c     |  28 ++-
 8 files changed, 516 insertions(+), 169 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index d41d40ac3efd..102e56fbb6de 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -14,27 +14,42 @@ struct bpf_sock_ops_kern;
 extern struct static_key_false cgroup_bpf_enabled_key;
 #define cgroup_bpf_enabled static_branch_unlikely(&cgroup_bpf_enabled_key)
 
+struct bpf_prog_list {
+	struct list_head node;
+	struct bpf_prog *prog;
+};
+
+struct bpf_prog_array;
+
 struct cgroup_bpf {
-	/*
-	 * Store two sets of bpf_prog pointers, one for programs that are
-	 * pinned directly to this cgroup, and one for those that are effective
-	 * when this cgroup is accessed.
+	/* array of effective progs in this cgroup */
+	struct bpf_prog_array __rcu *effective[MAX_BPF_ATTACH_TYPE];
+
+	/* attached progs to this cgroup and attach flags
+	 * when flags == 0 or BPF_F_ALLOW_OVERRIDE the progs list will
+	 * have either zero or one element
+	 * when BPF_F_ALLOW_MULTI the list can have up to BPF_CGROUP_MAX_PROGS
 	 */
-	struct bpf_prog *prog[MAX_BPF_ATTACH_TYPE];
-	struct bpf_prog __rcu *effective[MAX_BPF_ATTACH_TYPE];
-	bool disallow_override[MAX_BPF_ATTACH_TYPE];
+	struct list_head progs[MAX_BPF_ATTACH_TYPE];
+	u32 flags[MAX_BPF_ATTACH_TYPE];
+
+	/* temp storage for effective prog array used by prog_attach/detach */
+	struct bpf_prog_array __rcu *inactive;
 };
 
 void cgroup_bpf_put(struct cgroup *cgrp);
-void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent);
+int cgroup_bpf_inherit(struct cgroup *cgrp);
 
-int __cgroup_bpf_update(struct cgroup *cgrp, struct cgroup *parent,
-			struct bpf_prog *prog, enum bpf_attach_type type,
-			bool overridable);
+int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
+			enum bpf_attach_type type, u32 flags);
+int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
+			enum bpf_attach_type type, u32 flags);
 
-/* Wrapper for __cgroup_bpf_update() protected by cgroup_mutex */
-int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog,
-		      enum bpf_attach_type type, bool overridable);
+/* Wrapper for __cgroup_bpf_*() protected by cgroup_mutex */
+int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
+		      enum bpf_attach_type type, u32 flags);
+int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
+		      enum bpf_attach_type type, u32 flags);
 
 int __cgroup_bpf_run_filter_skb(struct sock *sk,
 				struct sk_buff *skb,
@@ -96,8 +111,7 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
 
 struct cgroup_bpf {};
 static inline void cgroup_bpf_put(struct cgroup *cgrp) {}
-static inline void cgroup_bpf_inherit(struct cgroup *cgrp,
-				      struct cgroup *parent) {}
+static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; }
 
 #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; })
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 252f4bc9eb25..a6964b75f070 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -241,6 +241,38 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr,
 int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
 			  union bpf_attr __user *uattr);
 
+/* an array of programs to be executed under rcu_lock.
+ *
+ * Typical usage:
+ * ret = BPF_PROG_RUN_ARRAY(&bpf_prog_array, ctx, BPF_PROG_RUN);
+ *
+ * the structure returned by bpf_prog_array_alloc() should be populated
+ * with program pointers and the last pointer must be NULL.
+ * The user has to keep refcnt on the program and make sure the program
+ * is removed from the array before bpf_prog_put().
+ * The 'struct bpf_prog_array *' should only be replaced with xchg()
+ * since other cpus are walking the array of pointers in parallel.
+ */
+struct bpf_prog_array {
+	struct rcu_head rcu;
+	struct bpf_prog *progs[0];
+};
+
+struct bpf_prog_array __rcu *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags);
+void bpf_prog_array_free(struct bpf_prog_array __rcu *progs);
+
+#define BPF_PROG_RUN_ARRAY(array, ctx, func)		\
+	({						\
+		struct bpf_prog **_prog;		\
+		u32 _ret = 1;				\
+		rcu_read_lock();			\
+		_prog = rcu_dereference(array)->progs;	\
+		for (; *_prog; _prog++)			\
+			_ret &= func(*_prog, ctx);	\
+		rcu_read_unlock();			\
+		_ret;					\
+	 })
+
 #ifdef CONFIG_BPF_SYSCALL
 DECLARE_PER_CPU(int, bpf_prog_active);
 
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 911d454af107..2d2db394b0ca 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -481,7 +481,7 @@ struct sk_filter {
 	struct bpf_prog	*prog;
 };
 
-#define BPF_PROG_RUN(filter, ctx)  (*filter->bpf_func)(ctx, filter->insnsi)
+#define BPF_PROG_RUN(filter, ctx)  (*(filter)->bpf_func)(ctx, (filter)->insnsi)
 
 #define BPF_SKB_CB_LEN QDISC_CB_PRIV_LEN
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 6d2137b4cf38..762f74bc6c47 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -143,11 +143,47 @@ enum bpf_attach_type {
 
 #define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE
 
-/* If BPF_F_ALLOW_OVERRIDE flag is used in BPF_PROG_ATTACH command
- * to the given target_fd cgroup the descendent cgroup will be able to
- * override effective bpf program that was inherited from this cgroup
+/* cgroup-bpf attach flags used in BPF_PROG_ATTACH command
+ *
+ * NONE(default): No further bpf programs allowed in the subtree.
+ *
+ * BPF_F_ALLOW_OVERRIDE: If a sub-cgroup installs some bpf program,
+ * the program in this cgroup yields to sub-cgroup program.
+ *
+ * BPF_F_ALLOW_MULTI: If a sub-cgroup installs some bpf program,
+ * that cgroup program gets run in addition to the program in this cgroup.
+ *
+ * Only one program is allowed to be attached to a cgroup with
+ * NONE or BPF_F_ALLOW_OVERRIDE flag.
+ * Attaching another program on top of NONE or BPF_F_ALLOW_OVERRIDE will
+ * release old program and attach the new one. Attach flags have to match.
+ *
+ * Multiple programs are allowed to be attached to a cgroup with
+ * BPF_F_ALLOW_MULTI flag. They are executed in FIFO order
+ * (those that were attached first, run first)
+ * The programs of sub-cgroup are executed first, then programs of
+ * this cgroup and then programs of parent cgroup.
+ * When a child program makes a decision (like picking TCP CA or sock bind)
+ * the parent program has a chance to override it.
+ *
+ * A cgroup with MULTI or OVERRIDE flag allows any attach flags in sub-cgroups.
+ * A cgroup with NONE doesn't allow any programs in sub-cgroups.
+ * Ex1:
+ * cgrp1 (MULTI progs A, B) ->
+ *    cgrp2 (OVERRIDE prog C) ->
+ *      cgrp3 (MULTI prog D) ->
+ *        cgrp4 (OVERRIDE prog E) ->
+ *          cgrp5 (NONE prog F)
+ * the event in cgrp5 triggers execution of F,D,A,B in that order.
+ * if prog F is detached, the execution is E,D,A,B
+ * if prog F and D are detached, the execution is E,A,B
+ * if prog F, E and D are detached, the execution is C,A,B
+ *
+ * All eligible programs are executed regardless of return code from
+ * earlier programs.
  */
 #define BPF_F_ALLOW_OVERRIDE	(1U << 0)
+#define BPF_F_ALLOW_MULTI	(1U << 1)
 
 /* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the
  * verifier will perform strict alignment checking as if the kernel
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 546113430049..6b7500bbdb53 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -27,129 +27,361 @@ void cgroup_bpf_put(struct cgroup *cgrp)
 {
 	unsigned int type;
 
-	for (type = 0; type < ARRAY_SIZE(cgrp->bpf.prog); type++) {
-		struct bpf_prog *prog = cgrp->bpf.prog[type];
-
-		if (prog) {
-			bpf_prog_put(prog);
+	for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) {
+		struct list_head *progs = &cgrp->bpf.progs[type];
+		struct bpf_prog_list *pl, *tmp;
+
+		list_for_each_entry_safe(pl, tmp, progs, node) {
+			list_del(&pl->node);
+			bpf_prog_put(pl->prog);
+			kfree(pl);
 			static_branch_dec(&cgroup_bpf_enabled_key);
 		}
+		bpf_prog_array_free(cgrp->bpf.effective[type]);
+	}
+}
+
+/* count number of elements in the list.
+ * it's slow but the list cannot be long
+ */
+static u32 prog_list_length(struct list_head *head)
+{
+	struct bpf_prog_list *pl;
+	u32 cnt = 0;
+
+	list_for_each_entry(pl, head, node) {
+		if (!pl->prog)
+			continue;
+		cnt++;
 	}
+	return cnt;
+}
+
+/* if parent has non-overridable prog attached,
+ * disallow attaching new programs to the descendent cgroup.
+ * if parent has overridable or multi-prog, allow attaching
+ */
+static bool hierarchy_allows_attach(struct cgroup *cgrp,
+				    enum bpf_attach_type type,
+				    u32 new_flags)
+{
+	struct cgroup *p;
+
+	p = cgroup_parent(cgrp);
+	if (!p)
+		return true;
+	do {
+		u32 flags = p->bpf.flags[type];
+		u32 cnt;
+
+		if (flags & BPF_F_ALLOW_MULTI)
+			return true;
+		cnt = prog_list_length(&p->bpf.progs[type]);
+		WARN_ON_ONCE(cnt > 1);
+		if (cnt == 1)
+			return !!(flags & BPF_F_ALLOW_OVERRIDE);
+		p = cgroup_parent(p);
+	} while (p);
+	return true;
+}
+
+/* compute a chain of effective programs for a given cgroup:
+ * start from the list of programs in this cgroup and add
+ * all parent programs.
+ * Note that parent's F_ALLOW_OVERRIDE-type program is yielding
+ * to programs in this cgroup
+ */
+static int compute_effective_progs(struct cgroup *cgrp,
+				   enum bpf_attach_type type,
+				   struct bpf_prog_array __rcu **array)
+{
+	struct bpf_prog_array __rcu *progs;
+	struct bpf_prog_list *pl;
+	struct cgroup *p = cgrp;
+	int cnt = 0;
+
+	/* count number of effective programs by walking parents */
+	do {
+		if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI))
+			cnt += prog_list_length(&p->bpf.progs[type]);
+		p = cgroup_parent(p);
+	} while (p);
+
+	progs = bpf_prog_array_alloc(cnt, GFP_KERNEL);
+	if (!progs)
+		return -ENOMEM;
+
+	/* populate the array with effective progs */
+	cnt = 0;
+	p = cgrp;
+	do {
+		if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI))
+			list_for_each_entry(pl,
+					    &p->bpf.progs[type], node) {
+				if (!pl->prog)
+					continue;
+				rcu_dereference_protected(progs, 1)->
+					progs[cnt++] = pl->prog;
+			}
+		p = cgroup_parent(p);
+	} while (p);
+
+	*array = progs;
+	return 0;
+}
+
+static void activate_effective_progs(struct cgroup *cgrp,
+				     enum bpf_attach_type type,
+				     struct bpf_prog_array __rcu *array)
+{
+	struct bpf_prog_array __rcu *old_array;
+
+	old_array = xchg(&cgrp->bpf.effective[type], array);
+	/* free prog array after grace period, since __cgroup_bpf_run_*()
+	 * might be still walking the array
+	 */
+	bpf_prog_array_free(old_array);
 }
 
 /**
  * cgroup_bpf_inherit() - inherit effective programs from parent
  * @cgrp: the cgroup to modify
- * @parent: the parent to inherit from
  */
-void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent)
+int cgroup_bpf_inherit(struct cgroup *cgrp)
 {
-	unsigned int type;
+/* has to use macro instead of const int, since compiler thinks
+ * that array below is variable length
+ */
+#define	NR ARRAY_SIZE(cgrp->bpf.effective)
+	struct bpf_prog_array __rcu *arrays[NR] = {};
+	int i;
 
-	for (type = 0; type < ARRAY_SIZE(cgrp->bpf.effective); type++) {
-		struct bpf_prog *e;
+	for (i = 0; i < NR; i++)
+		INIT_LIST_HEAD(&cgrp->bpf.progs[i]);
 
-		e = rcu_dereference_protected(parent->bpf.effective[type],
-					      lockdep_is_held(&cgroup_mutex));
-		rcu_assign_pointer(cgrp->bpf.effective[type], e);
-		cgrp->bpf.disallow_override[type] = parent->bpf.disallow_override[type];
-	}
+	for (i = 0; i < NR; i++)
+		if (compute_effective_progs(cgrp, i, &arrays[i]))
+			goto cleanup;
+
+	for (i = 0; i < NR; i++)
+		activate_effective_progs(cgrp, i, arrays[i]);
+
+	return 0;
+cleanup:
+	for (i = 0; i < NR; i++)
+		bpf_prog_array_free(arrays[i]);
+	return -ENOMEM;
 }
 
+#define BPF_CGROUP_MAX_PROGS 64
+
 /**
- * __cgroup_bpf_update() - Update the pinned program of a cgroup, and
+ * __cgroup_bpf_attach() - Attach the program to a cgroup, and
  *                         propagate the change to descendants
  * @cgrp: The cgroup which descendants to traverse
- * @parent: The parent of @cgrp, or %NULL if @cgrp is the root
- * @prog: A new program to pin
- * @type: Type of pinning operation (ingress/egress)
- *
- * Each cgroup has a set of two pointers for bpf programs; one for eBPF
- * programs it owns, and which is effective for execution.
- *
- * If @prog is not %NULL, this function attaches a new program to the cgroup
- * and releases the one that is currently attached, if any. @prog is then made
- * the effective program of type @type in that cgroup.
- *
- * If @prog is %NULL, the currently attached program of type @type is released,
- * and the effective program of the parent cgroup (if any) is inherited to
- * @cgrp.
- *
- * Then, the descendants of @cgrp are walked and the effective program for
- * each of them is set to the effective program of @cgrp unless the
- * descendant has its own program attached, in which case the subbranch is
- * skipped. This ensures that delegated subcgroups with own programs are left
- * untouched.
+ * @prog: A program to attach
+ * @type: Type of attach operation
  *
  * Must be called with cgroup_mutex held.
  */
-int __cgroup_bpf_update(struct cgroup *cgrp, struct cgroup *parent,
-			struct bpf_prog *prog, enum bpf_attach_type type,
-			bool new_overridable)
+int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
+			enum bpf_attach_type type, u32 flags)
 {
-	struct bpf_prog *old_prog, *effective = NULL;
-	struct cgroup_subsys_state *pos;
-	bool overridable = true;
-
-	if (parent) {
-		overridable = !parent->bpf.disallow_override[type];
-		effective = rcu_dereference_protected(parent->bpf.effective[type],
-						      lockdep_is_held(&cgroup_mutex));
-	}
-
-	if (prog && effective && !overridable)
-		/* if parent has non-overridable prog attached, disallow
-		 * attaching new programs to descendent cgroup
-		 */
+	struct list_head *progs = &cgrp->bpf.progs[type];
+	struct bpf_prog *old_prog = NULL;
+	struct cgroup_subsys_state *css;
+	struct bpf_prog_list *pl;
+	bool pl_was_allocated;
+	u32 old_flags;
+	int err;
+
+	if ((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI))
+		/* invalid combination */
+		return -EINVAL;
+
+	if (!hierarchy_allows_attach(cgrp, type, flags))
 		return -EPERM;
 
-	if (prog && effective && overridable != new_overridable)
-		/* if parent has overridable prog attached, only
-		 * allow overridable programs in descendent cgroup
+	if (!list_empty(progs) && cgrp->bpf.flags[type] != flags)
+		/* Disallow attaching non-overridable on top
+		 * of existing overridable in this cgroup.
+		 * Disallow attaching multi-prog if overridable or none
 		 */
 		return -EPERM;
 
-	old_prog = cgrp->bpf.prog[type];
-
-	if (prog) {
-		overridable = new_overridable;
-		effective = prog;
-		if (old_prog &&
-		    cgrp->bpf.disallow_override[type] == new_overridable)
-			/* disallow attaching non-overridable on top
-			 * of existing overridable in this cgroup
-			 * and vice versa
-			 */
-			return -EPERM;
+	if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS)
+		return -E2BIG;
+
+	if (flags & BPF_F_ALLOW_MULTI) {
+		list_for_each_entry(pl, progs, node)
+			if (pl->prog == prog)
+				/* disallow attaching the same prog twice */
+				return -EINVAL;
+
+		pl = kmalloc(sizeof(*pl), GFP_KERNEL);
+		if (!pl)
+			return -ENOMEM;
+		pl_was_allocated = true;
+		pl->prog = prog;
+		list_add_tail(&pl->node, progs);
+	} else {
+		if (list_empty(progs)) {
+			pl = kmalloc(sizeof(*pl), GFP_KERNEL);
+			if (!pl)
+				return -ENOMEM;
+			pl_was_allocated = true;
+			list_add_tail(&pl->node, progs);
+		} else {
+			pl = list_first_entry(progs, typeof(*pl), node);
+			old_prog = pl->prog;
+			pl_was_allocated = false;
+		}
+		pl->prog = prog;
 	}
 
-	if (!prog && !old_prog)
-		/* report error when trying to detach and nothing is attached */
-		return -ENOENT;
+	old_flags = cgrp->bpf.flags[type];
+	cgrp->bpf.flags[type] = flags;
 
-	cgrp->bpf.prog[type] = prog;
+	/* allocate and recompute effective prog arrays */
+	css_for_each_descendant_pre(css, &cgrp->self) {
+		struct cgroup *desc = container_of(css, struct cgroup, self);
 
-	css_for_each_descendant_pre(pos, &cgrp->self) {
-		struct cgroup *desc = container_of(pos, struct cgroup, self);
-
-		/* skip the subtree if the descendant has its own program */
-		if (desc->bpf.prog[type] && desc != cgrp) {
-			pos = css_rightmost_descendant(pos);
-		} else {
-			rcu_assign_pointer(desc->bpf.effective[type],
-					   effective);
-			desc->bpf.disallow_override[type] = !overridable;
-		}
+		err = compute_effective_progs(desc, type, &desc->bpf.inactive);
+		if (err)
+			goto cleanup;
 	}
 
-	if (prog)
-		static_branch_inc(&cgroup_bpf_enabled_key);
+	/* all allocations were successful. Activate all prog arrays */
+	css_for_each_descendant_pre(css, &cgrp->self) {
+		struct cgroup *desc = container_of(css, struct cgroup, self);
 
+		activate_effective_progs(desc, type, desc->bpf.inactive);
+		desc->bpf.inactive = NULL;
+	}
+
+	static_branch_inc(&cgroup_bpf_enabled_key);
 	if (old_prog) {
 		bpf_prog_put(old_prog);
 		static_branch_dec(&cgroup_bpf_enabled_key);
 	}
 	return 0;
+
+cleanup:
+	/* oom while computing effective. Free all computed effective arrays
+	 * since they were not activated
+	 */
+	css_for_each_descendant_pre(css, &cgrp->self) {
+		struct cgroup *desc = container_of(css, struct cgroup, self);
+
+		bpf_prog_array_free(desc->bpf.inactive);
+		desc->bpf.inactive = NULL;
+	}
+
+	/* and cleanup the prog list */
+	pl->prog = old_prog;
+	if (pl_was_allocated) {
+		list_del(&pl->node);
+		kfree(pl);
+	}
+	return err;
+}
+
+/**
+ * __cgroup_bpf_detach() - Detach the program from a cgroup, and
+ *                         propagate the change to descendants
+ * @cgrp: The cgroup which descendants to traverse
+ * @prog: A program to detach or NULL
+ * @type: Type of detach operation
+ *
+ * Must be called with cgroup_mutex held.
+ */
+int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
+			enum bpf_attach_type type, u32 unused_flags)
+{
+	struct list_head *progs = &cgrp->bpf.progs[type];
+	u32 flags = cgrp->bpf.flags[type];
+	struct bpf_prog *old_prog = NULL;
+	struct cgroup_subsys_state *css;
+	struct bpf_prog_list *pl;
+	int err;
+
+	if (flags & BPF_F_ALLOW_MULTI) {
+		if (!prog)
+			/* to detach MULTI prog the user has to specify valid FD
+			 * of the program to be detached
+			 */
+			return -EINVAL;
+	} else {
+		if (list_empty(progs))
+			/* report error when trying to detach and nothing is attached */
+			return -ENOENT;
+	}
+
+	if (flags & BPF_F_ALLOW_MULTI) {
+		/* find the prog and detach it */
+		list_for_each_entry(pl, progs, node) {
+			if (pl->prog != prog)
+				continue;
+			old_prog = prog;
+			/* mark it deleted, so it's ignored while
+			 * recomputing effective
+			 */
+			pl->prog = NULL;
+			break;
+		}
+		if (!old_prog)
+			return -ENOENT;
+	} else {
+		/* to maintain backward compatibility NONE and OVERRIDE cgroups
+		 * allow detaching with invalid FD (prog==NULL)
+		 */
+		pl = list_first_entry(progs, typeof(*pl), node);
+		old_prog = pl->prog;
+		pl->prog = NULL;
+	}
+
+	/* allocate and recompute effective prog arrays */
+	css_for_each_descendant_pre(css, &cgrp->self) {
+		struct cgroup *desc = container_of(css, struct cgroup, self);
+
+		err = compute_effective_progs(desc, type, &desc->bpf.inactive);
+		if (err)
+			goto cleanup;
+	}
+
+	/* all allocations were successful. Activate all prog arrays */
+	css_for_each_descendant_pre(css, &cgrp->self) {
+		struct cgroup *desc = container_of(css, struct cgroup, self);
+
+		activate_effective_progs(desc, type, desc->bpf.inactive);
+		desc->bpf.inactive = NULL;
+	}
+
+	/* now can actually delete it from this cgroup list */
+	list_del(&pl->node);
+	kfree(pl);
+	if (list_empty(progs))
+		/* last program was detached, reset flags to zero */
+		cgrp->bpf.flags[type] = 0;
+
+	bpf_prog_put(old_prog);
+	static_branch_dec(&cgroup_bpf_enabled_key);
+	return 0;
+
+cleanup:
+	/* oom while computing effective. Free all computed effective arrays
+	 * since they were not activated
+	 */
+	css_for_each_descendant_pre(css, &cgrp->self) {
+		struct cgroup *desc = container_of(css, struct cgroup, self);
+
+		bpf_prog_array_free(desc->bpf.inactive);
+		desc->bpf.inactive = NULL;
+	}
+
+	/* and restore back old_prog */
+	pl->prog = old_prog;
+	return err;
 }
 
 /**
@@ -171,36 +403,26 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
 				struct sk_buff *skb,
 				enum bpf_attach_type type)
 {
-	struct bpf_prog *prog;
+	unsigned int offset = skb->data - skb_network_header(skb);
+	struct sock *save_sk;
 	struct cgroup *cgrp;
-	int ret = 0;
+	int ret;
 
 	if (!sk || !sk_fullsock(sk))
 		return 0;
 
-	if (sk->sk_family != AF_INET &&
-	    sk->sk_family != AF_INET6)
+	if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
 		return 0;
 
 	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
-
-	rcu_read_lock();
-
-	prog = rcu_dereference(cgrp->bpf.effective[type]);
-	if (prog) {
-		unsigned int offset = skb->data - skb_network_header(skb);
-		struct sock *save_sk = skb->sk;
-
-		skb->sk = sk;
-		__skb_push(skb, offset);
-		ret = bpf_prog_run_save_cb(prog, skb) == 1 ? 0 : -EPERM;
-		__skb_pull(skb, offset);
-		skb->sk = save_sk;
-	}
-
-	rcu_read_unlock();
-
-	return ret;
+	save_sk = skb->sk;
+	skb->sk = sk;
+	__skb_push(skb, offset);
+	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb,
+				 bpf_prog_run_save_cb);
+	__skb_pull(skb, offset);
+	skb->sk = save_sk;
+	return ret == 1 ? 0 : -EPERM;
 }
 EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);
 
@@ -221,19 +443,10 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk,
 			       enum bpf_attach_type type)
 {
 	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
-	struct bpf_prog *prog;
-	int ret = 0;
-
-
-	rcu_read_lock();
-
-	prog = rcu_dereference(cgrp->bpf.effective[type]);
-	if (prog)
-		ret = BPF_PROG_RUN(prog, sk) == 1 ? 0 : -EPERM;
+	int ret;
 
-	rcu_read_unlock();
-
-	return ret;
+	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sk, BPF_PROG_RUN);
+	return ret == 1 ? 0 : -EPERM;
 }
 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
 
@@ -258,18 +471,10 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
 				     enum bpf_attach_type type)
 {
 	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
-	struct bpf_prog *prog;
-	int ret = 0;
-
-
-	rcu_read_lock();
-
-	prog = rcu_dereference(cgrp->bpf.effective[type]);
-	if (prog)
-		ret = BPF_PROG_RUN(prog, sock_ops) == 1 ? 0 : -EPERM;
-
-	rcu_read_unlock();
+	int ret;
 
-	return ret;
+	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sock_ops,
+				 BPF_PROG_RUN);
+	return ret == 1 ? 0 : -EPERM;
 }
 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 917cc04a0a94..6b49e1991ae7 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1381,6 +1381,37 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
 }
 EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);
 
+/* to avoid allocating empty bpf_prog_array for cgroups that
+ * don't have bpf program attached use one global 'empty_prog_array'
+ * It will not be modified by the caller of bpf_prog_array_alloc()
+ * (since caller requested prog_cnt == 0)
+ * that pointer should be 'freed' by bpf_prog_array_free()
+ */
+static struct {
+	struct bpf_prog_array hdr;
+	struct bpf_prog *null_prog;
+} empty_prog_array = {
+	.null_prog = NULL,
+};
+
+struct bpf_prog_array __rcu *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags)
+{
+	if (prog_cnt)
+		return kzalloc(sizeof(struct bpf_prog_array) +
+			       sizeof(struct bpf_prog *) * (prog_cnt + 1),
+			       flags);
+
+	return &empty_prog_array.hdr;
+}
+
+void bpf_prog_array_free(struct bpf_prog_array __rcu *progs)
+{
+	if (!progs ||
+	    progs == (struct bpf_prog_array __rcu *)&empty_prog_array.hdr)
+		return;
+	kfree_rcu(progs, rcu);
+}
+
 static void bpf_prog_free_deferred(struct work_struct *work)
 {
 	struct bpf_prog_aux *aux;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index b927da66f653..51bee695d32c 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1168,6 +1168,9 @@ static int sockmap_get_from_fd(const union bpf_attr *attr, bool attach)
 	return 0;
 }
 
+#define BPF_F_ATTACH_MASK \
+	(BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI)
+
 static int bpf_prog_attach(const union bpf_attr *attr)
 {
 	enum bpf_prog_type ptype;
@@ -1181,7 +1184,7 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 	if (CHECK_ATTR(BPF_PROG_ATTACH))
 		return -EINVAL;
 
-	if (attr->attach_flags & ~BPF_F_ALLOW_OVERRIDE)
+	if (attr->attach_flags & ~BPF_F_ATTACH_MASK)
 		return -EINVAL;
 
 	switch (attr->attach_type) {
@@ -1212,8 +1215,8 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 		return PTR_ERR(cgrp);
 	}
 
-	ret = cgroup_bpf_update(cgrp, prog, attr->attach_type,
-				attr->attach_flags & BPF_F_ALLOW_OVERRIDE);
+	ret = cgroup_bpf_attach(cgrp, prog, attr->attach_type,
+				attr->attach_flags);
 	if (ret)
 		bpf_prog_put(prog);
 	cgroup_put(cgrp);
@@ -1225,6 +1228,8 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 
 static int bpf_prog_detach(const union bpf_attr *attr)
 {
+	enum bpf_prog_type ptype;
+	struct bpf_prog *prog;
 	struct cgroup *cgrp;
 	int ret;
 
@@ -1237,23 +1242,33 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 	switch (attr->attach_type) {
 	case BPF_CGROUP_INET_INGRESS:
 	case BPF_CGROUP_INET_EGRESS:
+		ptype = BPF_PROG_TYPE_CGROUP_SKB;
+		break;
 	case BPF_CGROUP_INET_SOCK_CREATE:
+		ptype = BPF_PROG_TYPE_CGROUP_SOCK;
+		break;
 	case BPF_CGROUP_SOCK_OPS:
-		cgrp = cgroup_get_from_fd(attr->target_fd);
-		if (IS_ERR(cgrp))
-			return PTR_ERR(cgrp);
-
-		ret = cgroup_bpf_update(cgrp, NULL, attr->attach_type, false);
-		cgroup_put(cgrp);
+		ptype = BPF_PROG_TYPE_SOCK_OPS;
 		break;
 	case BPF_SK_SKB_STREAM_PARSER:
 	case BPF_SK_SKB_STREAM_VERDICT:
-		ret = sockmap_get_from_fd(attr, false);
-		break;
+		return sockmap_get_from_fd(attr, false);
 	default:
 		return -EINVAL;
 	}
 
+	cgrp = cgroup_get_from_fd(attr->target_fd);
+	if (IS_ERR(cgrp))
+		return PTR_ERR(cgrp);
+
+	prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
+	if (IS_ERR(prog))
+		prog = NULL;
+
+	ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, 0);
+	if (prog)
+		bpf_prog_put(prog);
+	cgroup_put(cgrp);
 	return ret;
 }
 
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index d6551cd45238..57eb866ae78d 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1896,6 +1896,9 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags)
 	if (ret)
 		goto destroy_root;
 
+	ret = cgroup_bpf_inherit(root_cgrp);
+	WARN_ON_ONCE(ret);
+
 	trace_cgroup_setup_root(root);
 
 	/*
@@ -4713,6 +4716,9 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
 	cgrp->self.parent = &parent->self;
 	cgrp->root = root;
 	cgrp->level = level;
+	ret = cgroup_bpf_inherit(cgrp);
+	if (ret)
+		goto out_idr_free;
 
 	for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
 		cgrp->ancestor_ids[tcgrp->level] = tcgrp->id;
@@ -4747,13 +4753,12 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
 	if (!cgroup_on_dfl(cgrp))
 		cgrp->subtree_control = cgroup_control(cgrp);
 
-	if (parent)
-		cgroup_bpf_inherit(cgrp, parent);
-
 	cgroup_propagate_control(cgrp);
 
 	return cgrp;
 
+out_idr_free:
+	cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
 out_cancel_ref:
 	percpu_ref_exit(&cgrp->self.refcnt);
 out_free_cgrp:
@@ -5736,14 +5741,23 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd)
 #endif	/* CONFIG_SOCK_CGROUP_DATA */
 
 #ifdef CONFIG_CGROUP_BPF
-int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog,
-		      enum bpf_attach_type type, bool overridable)
+int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
+		      enum bpf_attach_type type, u32 flags)
+{
+	int ret;
+
+	mutex_lock(&cgroup_mutex);
+	ret = __cgroup_bpf_attach(cgrp, prog, type, flags);
+	mutex_unlock(&cgroup_mutex);
+	return ret;
+}
+int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
+		      enum bpf_attach_type type, u32 flags)
 {
-	struct cgroup *parent = cgroup_parent(cgrp);
 	int ret;
 
 	mutex_lock(&cgroup_mutex);
-	ret = __cgroup_bpf_update(cgrp, parent, prog, type, overridable);
+	ret = __cgroup_bpf_detach(cgrp, prog, type, flags);
 	mutex_unlock(&cgroup_mutex);
 	return ret;
 }
-- 
cgit v1.2.3


From 468e2f64d220fe2dc11caa2bcb9b3a1e50fc7321 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Mon, 2 Oct 2017 22:50:22 -0700
Subject: bpf: introduce BPF_PROG_QUERY command

introduce BPF_PROG_QUERY command to retrieve a set of either
attached programs to given cgroup or a set of effective programs
that will execute for events within a cgroup

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Martin KaFai Lau <kafai@fb.com>
for cgroup bits
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf-cgroup.h |  4 ++++
 include/linux/bpf.h        |  3 +++
 include/uapi/linux/bpf.h   | 13 +++++++++++++
 kernel/bpf/cgroup.c        | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/core.c          | 38 ++++++++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c       | 34 ++++++++++++++++++++++++++++++++++
 kernel/cgroup/cgroup.c     | 10 ++++++++++
 7 files changed, 148 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 102e56fbb6de..359b6f5d3d90 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -44,12 +44,16 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
 			enum bpf_attach_type type, u32 flags);
 int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
 			enum bpf_attach_type type, u32 flags);
+int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
+		       union bpf_attr __user *uattr);
 
 /* Wrapper for __cgroup_bpf_*() protected by cgroup_mutex */
 int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
 		      enum bpf_attach_type type, u32 flags);
 int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
 		      enum bpf_attach_type type, u32 flags);
+int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
+		     union bpf_attr __user *uattr);
 
 int __cgroup_bpf_run_filter_skb(struct sock *sk,
 				struct sk_buff *skb,
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index a6964b75f070..a67daea731ab 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -260,6 +260,9 @@ struct bpf_prog_array {
 
 struct bpf_prog_array __rcu *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags);
 void bpf_prog_array_free(struct bpf_prog_array __rcu *progs);
+int bpf_prog_array_length(struct bpf_prog_array __rcu *progs);
+int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs,
+				__u32 __user *prog_ids, u32 cnt);
 
 #define BPF_PROG_RUN_ARRAY(array, ctx, func)		\
 	({						\
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 762f74bc6c47..cb2b9f95160a 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -92,6 +92,7 @@ enum bpf_cmd {
 	BPF_PROG_GET_FD_BY_ID,
 	BPF_MAP_GET_FD_BY_ID,
 	BPF_OBJ_GET_INFO_BY_FD,
+	BPF_PROG_QUERY,
 };
 
 enum bpf_map_type {
@@ -211,6 +212,9 @@ enum bpf_attach_type {
 /* Specify numa node during map creation */
 #define BPF_F_NUMA_NODE		(1U << 2)
 
+/* flags for BPF_PROG_QUERY */
+#define BPF_F_QUERY_EFFECTIVE	(1U << 0)
+
 #define BPF_OBJ_NAME_LEN 16U
 
 union bpf_attr {
@@ -289,6 +293,15 @@ union bpf_attr {
 		__u32		info_len;
 		__aligned_u64	info;
 	} info;
+
+	struct { /* anonymous struct used by BPF_PROG_QUERY command */
+		__u32		target_fd;	/* container object to query */
+		__u32		attach_type;
+		__u32		query_flags;
+		__u32		attach_flags;
+		__aligned_u64	prog_ids;
+		__u32		prog_cnt;
+	} query;
 } __attribute__((aligned(8)));
 
 /* BPF helper function descriptions:
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 6b7500bbdb53..e88abc0865d5 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -384,6 +384,52 @@ cleanup:
 	return err;
 }
 
+/* Must be called with cgroup_mutex held to avoid races. */
+int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
+		       union bpf_attr __user *uattr)
+{
+	__u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
+	enum bpf_attach_type type = attr->query.attach_type;
+	struct list_head *progs = &cgrp->bpf.progs[type];
+	u32 flags = cgrp->bpf.flags[type];
+	int cnt, ret = 0, i;
+
+	if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE)
+		cnt = bpf_prog_array_length(cgrp->bpf.effective[type]);
+	else
+		cnt = prog_list_length(progs);
+
+	if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)))
+		return -EFAULT;
+	if (copy_to_user(&uattr->query.prog_cnt, &cnt, sizeof(cnt)))
+		return -EFAULT;
+	if (attr->query.prog_cnt == 0 || !prog_ids || !cnt)
+		/* return early if user requested only program count + flags */
+		return 0;
+	if (attr->query.prog_cnt < cnt) {
+		cnt = attr->query.prog_cnt;
+		ret = -ENOSPC;
+	}
+
+	if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) {
+		return bpf_prog_array_copy_to_user(cgrp->bpf.effective[type],
+						   prog_ids, cnt);
+	} else {
+		struct bpf_prog_list *pl;
+		u32 id;
+
+		i = 0;
+		list_for_each_entry(pl, progs, node) {
+			id = pl->prog->aux->id;
+			if (copy_to_user(prog_ids + i, &id, sizeof(id)))
+				return -EFAULT;
+			if (++i == cnt)
+				break;
+		}
+	}
+	return ret;
+}
+
 /**
  * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering
  * @sk: The socket sending or receiving traffic
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 6b49e1991ae7..eba966c09053 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1412,6 +1412,44 @@ void bpf_prog_array_free(struct bpf_prog_array __rcu *progs)
 	kfree_rcu(progs, rcu);
 }
 
+int bpf_prog_array_length(struct bpf_prog_array __rcu *progs)
+{
+	struct bpf_prog **prog;
+	u32 cnt = 0;
+
+	rcu_read_lock();
+	prog = rcu_dereference(progs)->progs;
+	for (; *prog; prog++)
+		cnt++;
+	rcu_read_unlock();
+	return cnt;
+}
+
+int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs,
+				__u32 __user *prog_ids, u32 cnt)
+{
+	struct bpf_prog **prog;
+	u32 i = 0, id;
+
+	rcu_read_lock();
+	prog = rcu_dereference(progs)->progs;
+	for (; *prog; prog++) {
+		id = (*prog)->aux->id;
+		if (copy_to_user(prog_ids + i, &id, sizeof(id))) {
+			rcu_read_unlock();
+			return -EFAULT;
+		}
+		if (++i == cnt) {
+			prog++;
+			break;
+		}
+	}
+	rcu_read_unlock();
+	if (*prog)
+		return -ENOSPC;
+	return 0;
+}
+
 static void bpf_prog_free_deferred(struct work_struct *work)
 {
 	struct bpf_prog_aux *aux;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 51bee695d32c..0048cb24ba7b 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1272,6 +1272,37 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 	return ret;
 }
 
+#define BPF_PROG_QUERY_LAST_FIELD query.prog_cnt
+
+static int bpf_prog_query(const union bpf_attr *attr,
+			  union bpf_attr __user *uattr)
+{
+	struct cgroup *cgrp;
+	int ret;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+	if (CHECK_ATTR(BPF_PROG_QUERY))
+		return -EINVAL;
+	if (attr->query.query_flags & ~BPF_F_QUERY_EFFECTIVE)
+		return -EINVAL;
+
+	switch (attr->query.attach_type) {
+	case BPF_CGROUP_INET_INGRESS:
+	case BPF_CGROUP_INET_EGRESS:
+	case BPF_CGROUP_INET_SOCK_CREATE:
+	case BPF_CGROUP_SOCK_OPS:
+		break;
+	default:
+		return -EINVAL;
+	}
+	cgrp = cgroup_get_from_fd(attr->query.target_fd);
+	if (IS_ERR(cgrp))
+		return PTR_ERR(cgrp);
+	ret = cgroup_bpf_query(cgrp, attr, uattr);
+	cgroup_put(cgrp);
+	return ret;
+}
 #endif /* CONFIG_CGROUP_BPF */
 
 #define BPF_PROG_TEST_RUN_LAST_FIELD test.duration
@@ -1568,6 +1599,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	case BPF_PROG_DETACH:
 		err = bpf_prog_detach(&attr);
 		break;
+	case BPF_PROG_QUERY:
+		err = bpf_prog_query(&attr, uattr);
+		break;
 #endif
 	case BPF_PROG_TEST_RUN:
 		err = bpf_prog_test_run(&attr, uattr);
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 57eb866ae78d..269512b94a94 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -5761,4 +5761,14 @@ int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
 	mutex_unlock(&cgroup_mutex);
 	return ret;
 }
+int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
+		     union bpf_attr __user *uattr)
+{
+	int ret;
+
+	mutex_lock(&cgroup_mutex);
+	ret = __cgroup_bpf_query(cgrp, attr, uattr);
+	mutex_unlock(&cgroup_mutex);
+	return ret;
+}
 #endif /* CONFIG_CGROUP_BPF */
-- 
cgit v1.2.3


From 6621dd29eb9b5e6774ec7a9a75161352fdea47fc Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Tue, 3 Oct 2017 13:53:23 +0200
Subject: dev: advertise the new nsid when the netns iface changes

x-netns interfaces are bound to two netns: the link netns and the upper
netns. Usually, this kind of interface is created in the link netns and
then moved to the upper netns. In the end, the interface is visible only
in the upper netns. The link nsid is advertised via netlink in the upper
netns, thus the user always knows where the link part is.

There is no such mechanism in the link netns. When the interface is moved
to another netns, the user cannot "follow" it.
This patch adds a new netlink attribute which helps to follow an interface
which moves to another netns. When the interface is unregistered, the new
nsid is advertised. If the interface is an x-netns interface (i.e.
rtnl_link_ops->get_link_net is defined), the nsid is allocated if needed.

CC: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/rtnetlink.h    |  4 +++-
 include/uapi/linux/if_link.h |  1 +
 net/core/dev.c               | 11 ++++++++---
 net/core/rtnetlink.c         | 31 ++++++++++++++++++++++---------
 4 files changed, 34 insertions(+), 13 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index dea59c8eec54..1251638e60d3 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -17,9 +17,11 @@ extern int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst,
 			      u32 id, long expires, u32 error);
 
 void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change, gfp_t flags);
+void rtmsg_ifinfo_newnet(int type, struct net_device *dev, unsigned int change,
+			 gfp_t flags, int *new_nsid);
 struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev,
 				       unsigned change, u32 event,
-				       gfp_t flags);
+				       gfp_t flags, int *new_nsid);
 void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev,
 		       gfp_t flags);
 
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index ea87bd708ee9..cd580fc0e58f 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -158,6 +158,7 @@ enum {
 	IFLA_PAD,
 	IFLA_XDP,
 	IFLA_EVENT,
+	IFLA_NEW_NETNSID,
 	__IFLA_MAX
 };
 
diff --git a/net/core/dev.c b/net/core/dev.c
index 454f05441546..bffc75429184 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -145,6 +145,7 @@
 #include <linux/crash_dump.h>
 #include <linux/sctp.h>
 #include <net/udp_tunnel.h>
+#include <linux/net_namespace.h>
 
 #include "net-sysfs.h"
 
@@ -7204,7 +7205,7 @@ static void rollback_registered_many(struct list_head *head)
 		if (!dev->rtnl_link_ops ||
 		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
 			skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,
-						     GFP_KERNEL);
+						     GFP_KERNEL, NULL);
 
 		/*
 		 *	Flush the unicast and multicast chains
@@ -8291,7 +8292,7 @@ EXPORT_SYMBOL(unregister_netdev);
 
 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
 {
-	int err;
+	int err, new_nsid;
 
 	ASSERT_RTNL();
 
@@ -8347,7 +8348,11 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
 	rcu_barrier();
 	call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
-	rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
+	if (dev->rtnl_link_ops && dev->rtnl_link_ops->get_link_net)
+		new_nsid = peernet2id_alloc(dev_net(dev), net);
+	else
+		new_nsid = peernet2id(dev_net(dev), net);
+	rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid);
 
 	/*
 	 *	Flush the unicast and multicast chains
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 3fb1ca33cba4..1ee98b1369d5 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -915,6 +915,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
 	       + nla_total_size(IFNAMSIZ) /* IFLA_PHYS_PORT_NAME */
 	       + rtnl_xdp_size() /* IFLA_XDP */
 	       + nla_total_size(4)  /* IFLA_EVENT */
+	       + nla_total_size(4)  /* IFLA_NEW_NETNSID */
 	       + nla_total_size(1); /* IFLA_PROTO_DOWN */
 
 }
@@ -1384,7 +1385,7 @@ static int rtnl_fill_link_netnsid(struct sk_buff *skb,
 static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
 			    int type, u32 pid, u32 seq, u32 change,
 			    unsigned int flags, u32 ext_filter_mask,
-			    u32 event)
+			    u32 event, int *new_nsid)
 {
 	struct ifinfomsg *ifm;
 	struct nlmsghdr *nlh;
@@ -1472,6 +1473,10 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
 	if (rtnl_fill_link_netnsid(skb, dev))
 		goto nla_put_failure;
 
+	if (new_nsid &&
+	    nla_put_s32(skb, IFLA_NEW_NETNSID, *new_nsid) < 0)
+		goto nla_put_failure;
+
 	if (!(af_spec = nla_nest_start(skb, IFLA_AF_SPEC)))
 		goto nla_put_failure;
 
@@ -1701,7 +1706,7 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
 					       NETLINK_CB(cb->skb).portid,
 					       cb->nlh->nlmsg_seq, 0,
 					       flags,
-					       ext_filter_mask, 0);
+					       ext_filter_mask, 0, NULL);
 
 			if (err < 0) {
 				if (likely(skb->len))
@@ -2808,7 +2813,7 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh,
 		return -ENOBUFS;
 
 	err = rtnl_fill_ifinfo(nskb, dev, RTM_NEWLINK, NETLINK_CB(skb).portid,
-			       nlh->nlmsg_seq, 0, 0, ext_filter_mask, 0);
+			       nlh->nlmsg_seq, 0, 0, ext_filter_mask, 0, NULL);
 	if (err < 0) {
 		/* -EMSGSIZE implies BUG in if_nlmsg_size */
 		WARN_ON(err == -EMSGSIZE);
@@ -2893,7 +2898,7 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
 
 struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev,
 				       unsigned int change,
-				       u32 event, gfp_t flags)
+				       u32 event, gfp_t flags, int *new_nsid)
 {
 	struct net *net = dev_net(dev);
 	struct sk_buff *skb;
@@ -2904,7 +2909,8 @@ struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev,
 	if (skb == NULL)
 		goto errout;
 
-	err = rtnl_fill_ifinfo(skb, dev, type, 0, 0, change, 0, 0, event);
+	err = rtnl_fill_ifinfo(skb, dev, type, 0, 0, change, 0, 0, event,
+			       new_nsid);
 	if (err < 0) {
 		/* -EMSGSIZE implies BUG in if_nlmsg_size() */
 		WARN_ON(err == -EMSGSIZE);
@@ -2927,14 +2933,14 @@ void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags)
 
 static void rtmsg_ifinfo_event(int type, struct net_device *dev,
 			       unsigned int change, u32 event,
-			       gfp_t flags)
+			       gfp_t flags, int *new_nsid)
 {
 	struct sk_buff *skb;
 
 	if (dev->reg_state != NETREG_REGISTERED)
 		return;
 
-	skb = rtmsg_ifinfo_build_skb(type, dev, change, event, flags);
+	skb = rtmsg_ifinfo_build_skb(type, dev, change, event, flags, new_nsid);
 	if (skb)
 		rtmsg_ifinfo_send(skb, dev, flags);
 }
@@ -2942,10 +2948,17 @@ static void rtmsg_ifinfo_event(int type, struct net_device *dev,
 void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change,
 		  gfp_t flags)
 {
-	rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags);
+	rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags, NULL);
 }
 EXPORT_SYMBOL(rtmsg_ifinfo);
 
+void rtmsg_ifinfo_newnet(int type, struct net_device *dev, unsigned int change,
+			 gfp_t flags, int *new_nsid)
+{
+	rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags,
+			   new_nsid);
+}
+
 static int nlmsg_populate_fdb_fill(struct sk_buff *skb,
 				   struct net_device *dev,
 				   u8 *addr, u16 vid, u32 pid, u32 seq,
@@ -4321,7 +4334,7 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi
 	case NETDEV_RESEND_IGMP:
 	case NETDEV_CHANGEINFODATA:
 		rtmsg_ifinfo_event(RTM_NEWLINK, dev, 0, rtnl_get_event(event),
-				   GFP_KERNEL);
+				   GFP_KERNEL, NULL);
 		break;
 	default:
 		break;
-- 
cgit v1.2.3


From 413a4317aca7d6367d57a5971b0c461f03851207 Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Thu, 5 Oct 2017 16:46:53 -0400
Subject: VSOCK: add sock_diag interface

This patch adds the sock_diag interface for querying sockets from
userspace.  Tools like ss(8) and netstat(8) can use this interface to
list open sockets.

The userspace ABI is defined in <linux/vm_sockets_diag.h> and includes
netlink request and response structs.  The request can query sockets
based on their sk_state (e.g. listening sockets only) and the response
contains socket information fields including the local/remote addresses,
inode number, etc.

This patch does not dump VMCI pending sockets because I have only tested
the virtio transport, which does not use pending sockets.  Support can
be added later by extending vsock_diag_dump() if needed by VMCI users.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 MAINTAINERS                          |   2 +
 include/uapi/linux/vm_sockets_diag.h |  33 +++++++
 net/vmw_vsock/Kconfig                |  10 ++
 net/vmw_vsock/Makefile               |   3 +
 net/vmw_vsock/diag.c                 | 186 +++++++++++++++++++++++++++++++++++
 5 files changed, 234 insertions(+)
 create mode 100644 include/uapi/linux/vm_sockets_diag.h
 create mode 100644 net/vmw_vsock/diag.c

(limited to 'include/uapi/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index d0cbb3d7a0ca..0fd9121953bb 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -14286,6 +14286,8 @@ S:	Maintained
 F:	include/linux/virtio_vsock.h
 F:	include/uapi/linux/virtio_vsock.h
 F:	include/uapi/linux/vsockmon.h
+F:	include/uapi/linux/vm_sockets_diag.h
+F:	net/vmw_vsock/diag.c
 F:	net/vmw_vsock/af_vsock_tap.c
 F:	net/vmw_vsock/virtio_transport_common.c
 F:	net/vmw_vsock/virtio_transport.c
diff --git a/include/uapi/linux/vm_sockets_diag.h b/include/uapi/linux/vm_sockets_diag.h
new file mode 100644
index 000000000000..14cd7dc5a187
--- /dev/null
+++ b/include/uapi/linux/vm_sockets_diag.h
@@ -0,0 +1,33 @@
+/* AF_VSOCK sock_diag(7) interface for querying open sockets */
+
+#ifndef _UAPI__VM_SOCKETS_DIAG_H__
+#define _UAPI__VM_SOCKETS_DIAG_H__
+
+#include <linux/types.h>
+
+/* Request */
+struct vsock_diag_req {
+	__u8	sdiag_family;	/* must be AF_VSOCK */
+	__u8	sdiag_protocol;	/* must be 0 */
+	__u16	pad;		/* must be 0 */
+	__u32	vdiag_states;	/* query bitmap (e.g. 1 << TCP_LISTEN) */
+	__u32	vdiag_ino;	/* must be 0 (reserved) */
+	__u32	vdiag_show;	/* must be 0 (reserved) */
+	__u32	vdiag_cookie[2];
+};
+
+/* Response */
+struct vsock_diag_msg {
+	__u8	vdiag_family;	/* AF_VSOCK */
+	__u8	vdiag_type;	/* SOCK_STREAM or SOCK_DGRAM */
+	__u8	vdiag_state;	/* sk_state (e.g. TCP_LISTEN) */
+	__u8	vdiag_shutdown; /* local RCV_SHUTDOWN | SEND_SHUTDOWN */
+	__u32   vdiag_src_cid;
+	__u32   vdiag_src_port;
+	__u32   vdiag_dst_cid;
+	__u32   vdiag_dst_port;
+	__u32	vdiag_ino;
+	__u32	vdiag_cookie[2];
+};
+
+#endif /* _UAPI__VM_SOCKETS_DIAG_H__ */
diff --git a/net/vmw_vsock/Kconfig b/net/vmw_vsock/Kconfig
index a24369d175fd..970f96489fe7 100644
--- a/net/vmw_vsock/Kconfig
+++ b/net/vmw_vsock/Kconfig
@@ -15,6 +15,16 @@ config VSOCKETS
 	  To compile this driver as a module, choose M here: the module
 	  will be called vsock. If unsure, say N.
 
+config VSOCKETS_DIAG
+	tristate "Virtual Sockets monitoring interface"
+	depends on VSOCKETS
+	default y
+	help
+	  Support for PF_VSOCK sockets monitoring interface used by the ss tool.
+	  If unsure, say Y.
+
+	  Enable this module so userspace applications can query open sockets.
+
 config VMWARE_VMCI_VSOCKETS
 	tristate "VMware VMCI transport for Virtual Sockets"
 	depends on VSOCKETS && VMWARE_VMCI
diff --git a/net/vmw_vsock/Makefile b/net/vmw_vsock/Makefile
index e63d574234a9..64afc06805da 100644
--- a/net/vmw_vsock/Makefile
+++ b/net/vmw_vsock/Makefile
@@ -1,4 +1,5 @@
 obj-$(CONFIG_VSOCKETS) += vsock.o
+obj-$(CONFIG_VSOCKETS_DIAG) += vsock_diag.o
 obj-$(CONFIG_VMWARE_VMCI_VSOCKETS) += vmw_vsock_vmci_transport.o
 obj-$(CONFIG_VIRTIO_VSOCKETS) += vmw_vsock_virtio_transport.o
 obj-$(CONFIG_VIRTIO_VSOCKETS_COMMON) += vmw_vsock_virtio_transport_common.o
@@ -6,6 +7,8 @@ obj-$(CONFIG_HYPERV_VSOCKETS) += hv_sock.o
 
 vsock-y += af_vsock.o af_vsock_tap.o vsock_addr.o
 
+vsock_diag-y += diag.o
+
 vmw_vsock_vmci_transport-y += vmci_transport.o vmci_transport_notify.o \
 	vmci_transport_notify_qstate.o
 
diff --git a/net/vmw_vsock/diag.c b/net/vmw_vsock/diag.c
new file mode 100644
index 000000000000..31b567652250
--- /dev/null
+++ b/net/vmw_vsock/diag.c
@@ -0,0 +1,186 @@
+/*
+ * vsock sock_diag(7) module
+ *
+ * Copyright (C) 2017 Red Hat, Inc.
+ * Author: Stefan Hajnoczi <stefanha@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation version 2 and no later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/module.h>
+#include <linux/sock_diag.h>
+#include <linux/vm_sockets_diag.h>
+#include <net/af_vsock.h>
+
+static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
+			u32 portid, u32 seq, u32 flags)
+{
+	struct vsock_sock *vsk = vsock_sk(sk);
+	struct vsock_diag_msg *rep;
+	struct nlmsghdr *nlh;
+
+	nlh = nlmsg_put(skb, portid, seq, SOCK_DIAG_BY_FAMILY, sizeof(*rep),
+			flags);
+	if (!nlh)
+		return -EMSGSIZE;
+
+	rep = nlmsg_data(nlh);
+	rep->vdiag_family = AF_VSOCK;
+
+	/* Lock order dictates that sk_lock is acquired before
+	 * vsock_table_lock, so we cannot lock here.  Simply don't take
+	 * sk_lock; sk is guaranteed to stay alive since vsock_table_lock is
+	 * held.
+	 */
+	rep->vdiag_type = sk->sk_type;
+	rep->vdiag_state = sk->sk_state;
+	rep->vdiag_shutdown = sk->sk_shutdown;
+	rep->vdiag_src_cid = vsk->local_addr.svm_cid;
+	rep->vdiag_src_port = vsk->local_addr.svm_port;
+	rep->vdiag_dst_cid = vsk->remote_addr.svm_cid;
+	rep->vdiag_dst_port = vsk->remote_addr.svm_port;
+	rep->vdiag_ino = sock_i_ino(sk);
+
+	sock_diag_save_cookie(sk, rep->vdiag_cookie);
+
+	return 0;
+}
+
+static int vsock_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct vsock_diag_req *req;
+	struct vsock_sock *vsk;
+	unsigned int bucket;
+	unsigned int last_i;
+	unsigned int table;
+	struct net *net;
+	unsigned int i;
+
+	req = nlmsg_data(cb->nlh);
+	net = sock_net(skb->sk);
+
+	/* State saved between calls: */
+	table = cb->args[0];
+	bucket = cb->args[1];
+	i = last_i = cb->args[2];
+
+	/* TODO VMCI pending sockets? */
+
+	spin_lock_bh(&vsock_table_lock);
+
+	/* Bind table (locally created sockets) */
+	if (table == 0) {
+		while (bucket < ARRAY_SIZE(vsock_bind_table)) {
+			struct list_head *head = &vsock_bind_table[bucket];
+
+			i = 0;
+			list_for_each_entry(vsk, head, bound_table) {
+				struct sock *sk = sk_vsock(vsk);
+
+				if (!net_eq(sock_net(sk), net))
+					continue;
+				if (i < last_i)
+					goto next_bind;
+				if (!(req->vdiag_states & (1 << sk->sk_state)))
+					goto next_bind;
+				if (sk_diag_fill(sk, skb,
+						 NETLINK_CB(cb->skb).portid,
+						 cb->nlh->nlmsg_seq,
+						 NLM_F_MULTI) < 0)
+					goto done;
+next_bind:
+				i++;
+			}
+			last_i = 0;
+			bucket++;
+		}
+
+		table++;
+		bucket = 0;
+	}
+
+	/* Connected table (accepted connections) */
+	while (bucket < ARRAY_SIZE(vsock_connected_table)) {
+		struct list_head *head = &vsock_connected_table[bucket];
+
+		i = 0;
+		list_for_each_entry(vsk, head, connected_table) {
+			struct sock *sk = sk_vsock(vsk);
+
+			/* Skip sockets we've already seen above */
+			if (__vsock_in_bound_table(vsk))
+				continue;
+
+			if (!net_eq(sock_net(sk), net))
+				continue;
+			if (i < last_i)
+				goto next_connected;
+			if (!(req->vdiag_states & (1 << sk->sk_state)))
+				goto next_connected;
+			if (sk_diag_fill(sk, skb,
+					 NETLINK_CB(cb->skb).portid,
+					 cb->nlh->nlmsg_seq,
+					 NLM_F_MULTI) < 0)
+				goto done;
+next_connected:
+			i++;
+		}
+		last_i = 0;
+		bucket++;
+	}
+
+done:
+	spin_unlock_bh(&vsock_table_lock);
+
+	cb->args[0] = table;
+	cb->args[1] = bucket;
+	cb->args[2] = i;
+
+	return skb->len;
+}
+
+static int vsock_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
+{
+	int hdrlen = sizeof(struct vsock_diag_req);
+	struct net *net = sock_net(skb->sk);
+
+	if (nlmsg_len(h) < hdrlen)
+		return -EINVAL;
+
+	if (h->nlmsg_flags & NLM_F_DUMP) {
+		struct netlink_dump_control c = {
+			.dump = vsock_diag_dump,
+		};
+		return netlink_dump_start(net->diag_nlsk, skb, h, &c);
+	}
+
+	return -EOPNOTSUPP;
+}
+
+static const struct sock_diag_handler vsock_diag_handler = {
+	.family = AF_VSOCK,
+	.dump = vsock_diag_handler_dump,
+};
+
+static int __init vsock_diag_init(void)
+{
+	return sock_diag_register(&vsock_diag_handler);
+}
+
+static void __exit vsock_diag_exit(void)
+{
+	sock_diag_unregister(&vsock_diag_handler);
+}
+
+module_init(vsock_diag_init);
+module_exit(vsock_diag_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG,
+			       40 /* AF_VSOCK */);
-- 
cgit v1.2.3


From bdc476413dcdb5c38a7dec90fb2bca327021273a Mon Sep 17 00:00:00 2001
From: Amine Kherbouche <amine.kherbouche@6wind.com>
Date: Wed, 4 Oct 2017 19:35:57 +0200
Subject: ip_tunnel: add mpls over gre support

This commit introduces MPLSoGRE support (RFC 4023), using the ip tunnel
API by simply adding ipgre_tunnel_encap_(add|del)_mpls_ops() and the new
tunnel type TUNNEL_ENCAP_MPLS.

Signed-off-by: Amine Kherbouche <amine.kherbouche@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_tunnel.h |  1 +
 net/mpls/af_mpls.c             | 36 ++++++++++++++++++++++++++++++++++++
 2 files changed, 37 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/if_tunnel.h b/include/uapi/linux/if_tunnel.h
index 2e520883c054..a2f48c01365e 100644
--- a/include/uapi/linux/if_tunnel.h
+++ b/include/uapi/linux/if_tunnel.h
@@ -84,6 +84,7 @@ enum tunnel_encap_types {
 	TUNNEL_ENCAP_NONE,
 	TUNNEL_ENCAP_FOU,
 	TUNNEL_ENCAP_GUE,
+	TUNNEL_ENCAP_MPLS,
 };
 
 #define TUNNEL_ENCAP_FLAG_CSUM		(1<<0)
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index c5b9ce41d66f..9745e8f69810 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -16,6 +16,7 @@
 #include <net/arp.h>
 #include <net/ip_fib.h>
 #include <net/netevent.h>
+#include <net/ip_tunnels.h>
 #include <net/netns/generic.h>
 #if IS_ENABLED(CONFIG_IPV6)
 #include <net/ipv6.h>
@@ -39,6 +40,36 @@ static int one = 1;
 static int label_limit = (1 << 20) - 1;
 static int ttl_max = 255;
 
+#if IS_ENABLED(CONFIG_NET_IP_TUNNEL)
+size_t ipgre_mpls_encap_hlen(struct ip_tunnel_encap *e)
+{
+	return sizeof(struct mpls_shim_hdr);
+}
+
+static const struct ip_tunnel_encap_ops mpls_iptun_ops = {
+	.encap_hlen	= ipgre_mpls_encap_hlen,
+};
+
+static int ipgre_tunnel_encap_add_mpls_ops(void)
+{
+	return ip_tunnel_encap_add_ops(&mpls_iptun_ops, TUNNEL_ENCAP_MPLS);
+}
+
+static void ipgre_tunnel_encap_del_mpls_ops(void)
+{
+	ip_tunnel_encap_del_ops(&mpls_iptun_ops, TUNNEL_ENCAP_MPLS);
+}
+#else
+static int ipgre_tunnel_encap_add_mpls_ops(void)
+{
+	return 0;
+}
+
+static void ipgre_tunnel_encap_del_mpls_ops(void)
+{
+}
+#endif
+
 static void rtmsg_lfib(int event, u32 label, struct mpls_route *rt,
 		       struct nlmsghdr *nlh, struct net *net, u32 portid,
 		       unsigned int nlm_flags);
@@ -2485,6 +2516,10 @@ static int __init mpls_init(void)
 		      0);
 	rtnl_register(PF_MPLS, RTM_GETNETCONF, mpls_netconf_get_devconf,
 		      mpls_netconf_dump_devconf, 0);
+	err = ipgre_tunnel_encap_add_mpls_ops();
+	if (err)
+		pr_err("Can't add mpls over gre tunnel ops\n");
+
 	err = 0;
 out:
 	return err;
@@ -2502,6 +2537,7 @@ static void __exit mpls_exit(void)
 	dev_remove_pack(&mpls_packet_type);
 	unregister_netdevice_notifier(&mpls_dev_notifier);
 	unregister_pernet_subsys(&mpls_net_ops);
+	ipgre_tunnel_encap_del_mpls_ops();
 }
 module_exit(mpls_exit);
 
-- 
cgit v1.2.3


From 908432ca84fc229e906ba164219e9ad0fe56f755 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Thu, 5 Oct 2017 09:19:20 -0700
Subject: bpf: add helper bpf_perf_event_read_value for perf event array map

Hardware pmu counters are limited resources. When there are more
pmu based perf events opened than available counters, the kernel will
multiplex these events so each event gets a certain percentage
(but not 100%) of the pmu time. When multiplexing happens,
the number of samples or the counter value is not directly comparable
with the value obtained without multiplexing. This makes comparison
between different runs difficult.

Typically, the number of samples or counter value should be
normalized before comparing to other experiments. The typical
normalization is done like:
  normalized_num_samples = num_samples * time_enabled / time_running
  normalized_counter_value = counter_value * time_enabled / time_running
where time_enabled is the time enabled for event and time_running is
the time running for event since last normalization.

This patch adds the helper bpf_perf_event_read_value for kprobe-based perf
event array maps, to read the perf counter and the enabled/running time.
The enabled/running time is accumulated since the perf event open.
To obtain the scaling factor between two bpf invocations, users
can use cpu_id as the key (which is typical for the perf array usage model)
to remember the previous value and do the calculation inside the
bpf program.

Signed-off-by: Yonghong Song <yhs@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h | 21 +++++++++++++++++++--
 kernel/bpf/verifier.c    |  4 +++-
 kernel/trace/bpf_trace.c | 45 +++++++++++++++++++++++++++++++++++++++++----
 3 files changed, 63 insertions(+), 7 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 6082faf5fd2a..7b57a212c7d7 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -641,6 +641,14 @@ union bpf_attr {
  *     @xdp_md: pointer to xdp_md
  *     @delta: An positive/negative integer to be added to xdp_md.data_meta
  *     Return: 0 on success or negative on error
+ *
+ * int bpf_perf_event_read_value(map, flags, buf, buf_size)
+ *     read perf event counter value and perf event enabled/running time
+ *     @map: pointer to perf_event_array map
+ *     @flags: index of event in the map or bitmask flags
+ *     @buf: buf to fill
+ *     @buf_size: size of the buf
+ *     Return: 0 on success or negative error code
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -697,7 +705,8 @@ union bpf_attr {
 	FN(redirect_map),		\
 	FN(sk_redirect_map),		\
 	FN(sock_map_update),		\
-	FN(xdp_adjust_meta),
+	FN(xdp_adjust_meta),		\
+	FN(perf_event_read_value),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -741,7 +750,9 @@ enum bpf_func_id {
 #define BPF_F_ZERO_CSUM_TX		(1ULL << 1)
 #define BPF_F_DONT_FRAGMENT		(1ULL << 2)
 
-/* BPF_FUNC_perf_event_output and BPF_FUNC_perf_event_read flags. */
+/* BPF_FUNC_perf_event_output, BPF_FUNC_perf_event_read and
+ * BPF_FUNC_perf_event_read_value flags.
+ */
 #define BPF_F_INDEX_MASK		0xffffffffULL
 #define BPF_F_CURRENT_CPU		BPF_F_INDEX_MASK
 /* BPF_FUNC_perf_event_output for sk_buff input context. */
@@ -934,4 +945,10 @@ enum {
 #define TCP_BPF_IW		1001	/* Set TCP initial congestion window */
 #define TCP_BPF_SNDCWND_CLAMP	1002	/* Set sndcwnd_clamp */
 
+struct bpf_perf_event_value {
+	__u64 counter;
+	__u64 enabled;
+	__u64 running;
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 52b022310f6a..590125e29161 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1552,7 +1552,8 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
 		break;
 	case BPF_MAP_TYPE_PERF_EVENT_ARRAY:
 		if (func_id != BPF_FUNC_perf_event_read &&
-		    func_id != BPF_FUNC_perf_event_output)
+		    func_id != BPF_FUNC_perf_event_output &&
+		    func_id != BPF_FUNC_perf_event_read_value)
 			goto error;
 		break;
 	case BPF_MAP_TYPE_STACK_TRACE:
@@ -1595,6 +1596,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
 		break;
 	case BPF_FUNC_perf_event_read:
 	case BPF_FUNC_perf_event_output:
+	case BPF_FUNC_perf_event_read_value:
 		if (map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY)
 			goto error;
 		break;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 95888ae6c263..0be86cc0130e 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -255,14 +255,14 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
 	return &bpf_trace_printk_proto;
 }
 
-BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)
+static __always_inline int
+get_map_perf_counter(struct bpf_map *map, u64 flags,
+		     u64 *value, u64 *enabled, u64 *running)
 {
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
 	unsigned int cpu = smp_processor_id();
 	u64 index = flags & BPF_F_INDEX_MASK;
 	struct bpf_event_entry *ee;
-	u64 value = 0;
-	int err;
 
 	if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
 		return -EINVAL;
@@ -275,7 +275,15 @@ BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)
 	if (!ee)
 		return -ENOENT;
 
-	err = perf_event_read_local(ee->event, &value, NULL, NULL);
+	return perf_event_read_local(ee->event, value, enabled, running);
+}
+
+BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)
+{
+	u64 value = 0;
+	int err;
+
+	err = get_map_perf_counter(map, flags, &value, NULL, NULL);
 	/*
 	 * this api is ugly since we miss [-22..-2] range of valid
 	 * counter values, but that's uapi
@@ -293,6 +301,33 @@ static const struct bpf_func_proto bpf_perf_event_read_proto = {
 	.arg2_type	= ARG_ANYTHING,
 };
 
+BPF_CALL_4(bpf_perf_event_read_value, struct bpf_map *, map, u64, flags,
+	   struct bpf_perf_event_value *, buf, u32, size)
+{
+	int err = -EINVAL;
+
+	if (unlikely(size != sizeof(struct bpf_perf_event_value)))
+		goto clear;
+	err = get_map_perf_counter(map, flags, &buf->counter, &buf->enabled,
+				   &buf->running);
+	if (unlikely(err))
+		goto clear;
+	return 0;
+clear:
+	memset(buf, 0, size);
+	return err;
+}
+
+static const struct bpf_func_proto bpf_perf_event_read_value_proto = {
+	.func		= bpf_perf_event_read_value,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_CONST_MAP_PTR,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg4_type	= ARG_CONST_SIZE,
+};
+
 static DEFINE_PER_CPU(struct perf_sample_data, bpf_sd);
 
 static __always_inline u64
@@ -499,6 +534,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
 		return &bpf_perf_event_output_proto;
 	case BPF_FUNC_get_stackid:
 		return &bpf_get_stackid_proto;
+	case BPF_FUNC_perf_event_read_value:
+		return &bpf_perf_event_read_value_proto;
 	default:
 		return tracing_func_proto(func_id);
 	}
-- 
cgit v1.2.3


From 4bebdc7a85aa400c0222b5329861e4ad9252f1e5 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Thu, 5 Oct 2017 09:19:22 -0700
Subject: bpf: add helper bpf_perf_prog_read_value

This patch adds helper bpf_perf_prog_read_value for perf event based bpf
programs, to read event counter and enabled/running time.
The enabled/running time is accumulated since the perf event was opened.

The typical use case for perf event based bpf program is to attach itself
to a single event. In such cases, if it is desirable to get scaling factor
between two bpf invocations, users can save the time values in a map,
and use the value from the map and the current value to calculate
the scaling factor.

Signed-off-by: Yonghong Song <yhs@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h | 10 +++++++++-
 kernel/trace/bpf_trace.c | 28 ++++++++++++++++++++++++++++
 2 files changed, 37 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 7b57a212c7d7..5bbbec17aa5a 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -649,6 +649,13 @@ union bpf_attr {
  *     @buf: buf to fill
  *     @buf_size: size of the buf
  *     Return: 0 on success or negative error code
+ *
+ * int bpf_perf_prog_read_value(ctx, buf, buf_size)
+ *     read perf prog attached perf event counter and enabled/running time
+ *     @ctx: pointer to ctx
+ *     @buf: buf to fill
+ *     @buf_size: size of the buf
+ *     Return : 0 on success or negative error code
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -706,7 +713,8 @@ union bpf_attr {
 	FN(sk_redirect_map),		\
 	FN(sock_map_update),		\
 	FN(xdp_adjust_meta),		\
-	FN(perf_event_read_value),
+	FN(perf_event_read_value),	\
+	FN(perf_prog_read_value),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 0be86cc0130e..04ea5314f2bc 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -613,6 +613,32 @@ static const struct bpf_func_proto bpf_get_stackid_proto_tp = {
 	.arg3_type	= ARG_ANYTHING,
 };
 
+BPF_CALL_3(bpf_perf_prog_read_value_tp, struct bpf_perf_event_data_kern *, ctx,
+	   struct bpf_perf_event_value *, buf, u32, size)
+{
+	int err = -EINVAL;
+
+	if (unlikely(size != sizeof(struct bpf_perf_event_value)))
+		goto clear;
+	err = perf_event_read_local(ctx->event, &buf->counter, &buf->enabled,
+				    &buf->running);
+	if (unlikely(err))
+		goto clear;
+	return 0;
+clear:
+	memset(buf, 0, size);
+	return err;
+}
+
+static const struct bpf_func_proto bpf_perf_prog_read_value_proto_tp = {
+         .func           = bpf_perf_prog_read_value_tp,
+         .gpl_only       = true,
+         .ret_type       = RET_INTEGER,
+         .arg1_type      = ARG_PTR_TO_CTX,
+         .arg2_type      = ARG_PTR_TO_UNINIT_MEM,
+         .arg3_type      = ARG_CONST_SIZE,
+};
+
 static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id)
 {
 	switch (func_id) {
@@ -620,6 +646,8 @@ static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id)
 		return &bpf_perf_event_output_proto_tp;
 	case BPF_FUNC_get_stackid:
 		return &bpf_get_stackid_proto_tp;
+	case BPF_FUNC_perf_prog_read_value:
+		return &bpf_perf_prog_read_value_proto_tp;
 	default:
 		return tracing_func_proto(func_id);
 	}
-- 
cgit v1.2.3


From 067cae47771c864604969fd902efe10916e0d79c Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Thu, 5 Oct 2017 21:52:12 -0700
Subject: bpf: Use char in prog and map name

Instead of u8, use char for prog and map name.  It can avoid the
userspace tool getting compiler's signedness warning.  The
bpf_prog_aux, bpf_map, bpf_attr, bpf_prog_info and
bpf_map_info are changed.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Cc: Jakub Kicinski <jakub.kicinski@netronome.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h            | 4 ++--
 include/uapi/linux/bpf.h       | 8 ++++----
 tools/include/uapi/linux/bpf.h | 8 ++++----
 3 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index a67daea731ab..bc7da2ddfcaf 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -56,7 +56,7 @@ struct bpf_map {
 	struct work_struct work;
 	atomic_t usercnt;
 	struct bpf_map *inner_map_meta;
-	u8 name[BPF_OBJ_NAME_LEN];
+	char name[BPF_OBJ_NAME_LEN];
 };
 
 /* function argument constraints */
@@ -189,7 +189,7 @@ struct bpf_prog_aux {
 	struct bpf_prog *prog;
 	struct user_struct *user;
 	u64 load_time; /* ns since boottime */
-	u8 name[BPF_OBJ_NAME_LEN];
+	char name[BPF_OBJ_NAME_LEN];
 	union {
 		struct work_struct work;
 		struct rcu_head	rcu;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 5bbbec17aa5a..6db9e1d679cd 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -230,7 +230,7 @@ union bpf_attr {
 		__u32	numa_node;	/* numa node (effective only if
 					 * BPF_F_NUMA_NODE is set).
 					 */
-		__u8	map_name[BPF_OBJ_NAME_LEN];
+		char	map_name[BPF_OBJ_NAME_LEN];
 	};
 
 	struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
@@ -253,7 +253,7 @@ union bpf_attr {
 		__aligned_u64	log_buf;	/* user supplied buffer */
 		__u32		kern_version;	/* checked when prog_type=kprobe */
 		__u32		prog_flags;
-		__u8		prog_name[BPF_OBJ_NAME_LEN];
+		char		prog_name[BPF_OBJ_NAME_LEN];
 	};
 
 	struct { /* anonymous struct used by BPF_OBJ_* commands */
@@ -888,7 +888,7 @@ struct bpf_prog_info {
 	__u32 created_by_uid;
 	__u32 nr_map_ids;
 	__aligned_u64 map_ids;
-	__u8  name[BPF_OBJ_NAME_LEN];
+	char name[BPF_OBJ_NAME_LEN];
 } __attribute__((aligned(8)));
 
 struct bpf_map_info {
@@ -898,7 +898,7 @@ struct bpf_map_info {
 	__u32 value_size;
 	__u32 max_entries;
 	__u32 map_flags;
-	__u8  name[BPF_OBJ_NAME_LEN];
+	char  name[BPF_OBJ_NAME_LEN];
 } __attribute__((aligned(8)));
 
 /* User bpf_sock_ops struct to access socket values and specify request ops
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 0894fd20b12b..fb4fb81ce5b0 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -230,7 +230,7 @@ union bpf_attr {
 		__u32	numa_node;	/* numa node (effective only if
 					 * BPF_F_NUMA_NODE is set).
 					 */
-		__u8	map_name[BPF_OBJ_NAME_LEN];
+		char	map_name[BPF_OBJ_NAME_LEN];
 	};
 
 	struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
@@ -253,7 +253,7 @@ union bpf_attr {
 		__aligned_u64	log_buf;	/* user supplied buffer */
 		__u32		kern_version;	/* checked when prog_type=kprobe */
 		__u32		prog_flags;
-		__u8		prog_name[BPF_OBJ_NAME_LEN];
+		char		prog_name[BPF_OBJ_NAME_LEN];
 	};
 
 	struct { /* anonymous struct used by BPF_OBJ_* commands */
@@ -871,7 +871,7 @@ struct bpf_prog_info {
 	__u32 created_by_uid;
 	__u32 nr_map_ids;
 	__aligned_u64 map_ids;
-	__u8  name[BPF_OBJ_NAME_LEN];
+	char  name[BPF_OBJ_NAME_LEN];
 } __attribute__((aligned(8)));
 
 struct bpf_map_info {
@@ -881,7 +881,7 @@ struct bpf_map_info {
 	__u32 value_size;
 	__u32 max_entries;
 	__u32 map_flags;
-	__u8  name[BPF_OBJ_NAME_LEN];
+	char  name[BPF_OBJ_NAME_LEN];
 } __attribute__((aligned(8)));
 
 /* User bpf_sock_ops struct to access socket values and specify request ops
-- 
cgit v1.2.3


From 821f1b21cabb46827ce39ddf82e2789680b5042a Mon Sep 17 00:00:00 2001
From: Roopa Prabhu <roopa@cumulusnetworks.com>
Date: Fri, 6 Oct 2017 22:12:37 -0700
Subject: bridge: add new BR_NEIGH_SUPPRESS port flag to suppress arp and nd
 flood

This patch adds a new bridge port flag BR_NEIGH_SUPPRESS to
suppress arp and nd flood on bridge ports. It implements
rfc7432, section 10.
https://tools.ietf.org/html/rfc7432#section-10
for ethernet VPN deployments. It is similar to the existing
BR_PROXYARP* flags but has a few semantic differences to conform
to EVPN standard. Unlike the existing flags, this new flag suppresses
flood of all neigh discovery packets (arp and nd) to tunnel ports.
Supports both vlan filtering and non-vlan filtering bridges.

In case of EVPN, it is mainly used to avoid flooding
of arp and nd packets to tunnel ports like vxlan.

This patch adds netlink and sysfs support to set this bridge port
flag.

Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_bridge.h    |  1 +
 include/uapi/linux/if_link.h |  1 +
 net/bridge/Makefile          |  2 +-
 net/bridge/br_arp_nd_proxy.c | 32 ++++++++++++++++++++++++++++++++
 net/bridge/br_forward.c      |  2 +-
 net/bridge/br_if.c           |  5 +++++
 net/bridge/br_netlink.c      | 10 +++++++++-
 net/bridge/br_private.h      |  2 ++
 net/bridge/br_sysfs_if.c     |  2 ++
 9 files changed, 54 insertions(+), 3 deletions(-)
 create mode 100644 net/bridge/br_arp_nd_proxy.c

(limited to 'include/uapi/linux')

diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h
index 3cd18ac0697f..316ee113a220 100644
--- a/include/linux/if_bridge.h
+++ b/include/linux/if_bridge.h
@@ -49,6 +49,7 @@ struct br_ip_list {
 #define BR_MULTICAST_TO_UNICAST	BIT(12)
 #define BR_VLAN_TUNNEL		BIT(13)
 #define BR_BCAST_FLOOD		BIT(14)
+#define BR_NEIGH_SUPPRESS	BIT(15)
 
 #define BR_DEFAULT_AGEING_TIME	(300 * HZ)
 
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index cd580fc0e58f..b037e0ab1975 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -327,6 +327,7 @@ enum {
 	IFLA_BRPORT_VLAN_TUNNEL,
 	IFLA_BRPORT_BCAST_FLOOD,
 	IFLA_BRPORT_GROUP_FWD_MASK,
+	IFLA_BRPORT_NEIGH_SUPPRESS,
 	__IFLA_BRPORT_MAX
 };
 #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1)
diff --git a/net/bridge/Makefile b/net/bridge/Makefile
index 40b1ede527ca..4aee55fdcc92 100644
--- a/net/bridge/Makefile
+++ b/net/bridge/Makefile
@@ -7,7 +7,7 @@ obj-$(CONFIG_BRIDGE) += bridge.o
 bridge-y	:= br.o br_device.o br_fdb.o br_forward.o br_if.o br_input.o \
 			br_ioctl.o br_stp.o br_stp_bpdu.o \
 			br_stp_if.o br_stp_timer.o br_netlink.o \
-			br_netlink_tunnel.o
+			br_netlink_tunnel.o br_arp_nd_proxy.o
 
 bridge-$(CONFIG_SYSFS) += br_sysfs_if.o br_sysfs_br.o
 
diff --git a/net/bridge/br_arp_nd_proxy.c b/net/bridge/br_arp_nd_proxy.c
new file mode 100644
index 000000000000..f889ad5f0048
--- /dev/null
+++ b/net/bridge/br_arp_nd_proxy.c
@@ -0,0 +1,32 @@
+/*
+ *  Handle bridge arp/nd proxy/suppress
+ *
+ *  Copyright (C) 2017 Cumulus Networks
+ *  Copyright (c) 2017 Roopa Prabhu <roopa@cumulusnetworks.com>
+ *
+ *  Authors:
+ *	Roopa Prabhu <roopa@cumulusnetworks.com>
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include "br_private.h"
+
+void br_recalculate_neigh_suppress_enabled(struct net_bridge *br)
+{
+	struct net_bridge_port *p;
+	bool neigh_suppress = false;
+
+	list_for_each_entry(p, &br->port_list, list) {
+		if (p->flags & BR_NEIGH_SUPPRESS) {
+			neigh_suppress = true;
+			break;
+		}
+	}
+
+	br->neigh_suppress_enabled = neigh_suppress;
+}
diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index 48fb17417fac..b4eed113d2ec 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -204,7 +204,7 @@ void br_flood(struct net_bridge *br, struct sk_buff *skb,
 		/* Do not flood to ports that enable proxy ARP */
 		if (p->flags & BR_PROXYARP)
 			continue;
-		if ((p->flags & BR_PROXYARP_WIFI) &&
+		if ((p->flags & (BR_PROXYARP_WIFI | BR_NEIGH_SUPPRESS)) &&
 		    BR_INPUT_SKB_CB(skb)->proxyarp_replied)
 			continue;
 
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index 59a74a414e20..ae38547bbf91 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -310,6 +310,8 @@ void br_dev_delete(struct net_device *dev, struct list_head *head)
 		del_nbp(p);
 	}
 
+	br_recalculate_neigh_suppress_enabled(br);
+
 	br_fdb_delete_by_port(br, NULL, 0, 1);
 
 	cancel_delayed_work_sync(&br->gc_work);
@@ -660,4 +662,7 @@ void br_port_flags_change(struct net_bridge_port *p, unsigned long mask)
 
 	if (mask & BR_AUTO_MASK)
 		nbp_update_port_count(br);
+
+	if (mask & BR_NEIGH_SUPPRESS)
+		br_recalculate_neigh_suppress_enabled(br);
 }
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index dea88a255d26..f0e82682e071 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -138,6 +138,7 @@ static inline size_t br_port_info_size(void)
 		+ nla_total_size(1)	/* IFLA_BRPORT_PROXYARP */
 		+ nla_total_size(1)	/* IFLA_BRPORT_PROXYARP_WIFI */
 		+ nla_total_size(1)	/* IFLA_BRPORT_VLAN_TUNNEL */
+		+ nla_total_size(1)	/* IFLA_BRPORT_NEIGH_SUPPRESS */
 		+ nla_total_size(sizeof(struct ifla_bridge_id))	/* IFLA_BRPORT_ROOT_ID */
 		+ nla_total_size(sizeof(struct ifla_bridge_id))	/* IFLA_BRPORT_BRIDGE_ID */
 		+ nla_total_size(sizeof(u16))	/* IFLA_BRPORT_DESIGNATED_PORT */
@@ -210,7 +211,9 @@ static int br_port_fill_attrs(struct sk_buff *skb,
 	    nla_put_u8(skb, IFLA_BRPORT_CONFIG_PENDING, p->config_pending) ||
 	    nla_put_u8(skb, IFLA_BRPORT_VLAN_TUNNEL, !!(p->flags &
 							BR_VLAN_TUNNEL)) ||
-	    nla_put_u16(skb, IFLA_BRPORT_GROUP_FWD_MASK, p->group_fwd_mask))
+	    nla_put_u16(skb, IFLA_BRPORT_GROUP_FWD_MASK, p->group_fwd_mask) ||
+	    nla_put_u8(skb, IFLA_BRPORT_NEIGH_SUPPRESS,
+		       !!(p->flags & BR_NEIGH_SUPPRESS)))
 		return -EMSGSIZE;
 
 	timerval = br_timer_value(&p->message_age_timer);
@@ -785,6 +788,11 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[])
 		p->group_fwd_mask = fwd_mask;
 	}
 
+	err = br_set_port_flag(p, tb, IFLA_BRPORT_NEIGH_SUPPRESS,
+			       BR_NEIGH_SUPPRESS);
+	if (err)
+		return err;
+
 	br_port_flags_change(p, old_flags ^ p->flags);
 	return 0;
 }
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index ab4df24f7bba..00fa371b1fb2 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -404,6 +404,7 @@ struct net_bridge {
 #ifdef CONFIG_NET_SWITCHDEV
 	int offload_fwd_mark;
 #endif
+	bool				neigh_suppress_enabled;
 };
 
 struct br_input_skb_cb {
@@ -1139,4 +1140,5 @@ static inline void br_switchdev_frame_unmark(struct sk_buff *skb)
 }
 #endif /* CONFIG_NET_SWITCHDEV */
 
+void br_recalculate_neigh_suppress_enabled(struct net_bridge *br);
 #endif
diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c
index 9110d5e56085..0a1fa9ccd8b7 100644
--- a/net/bridge/br_sysfs_if.c
+++ b/net/bridge/br_sysfs_if.c
@@ -191,6 +191,7 @@ BRPORT_ATTR_FLAG(proxyarp, BR_PROXYARP);
 BRPORT_ATTR_FLAG(proxyarp_wifi, BR_PROXYARP_WIFI);
 BRPORT_ATTR_FLAG(multicast_flood, BR_MCAST_FLOOD);
 BRPORT_ATTR_FLAG(broadcast_flood, BR_BCAST_FLOOD);
+BRPORT_ATTR_FLAG(neigh_suppress, BR_NEIGH_SUPPRESS);
 
 #ifdef CONFIG_BRIDGE_IGMP_SNOOPING
 static ssize_t show_multicast_router(struct net_bridge_port *p, char *buf)
@@ -241,6 +242,7 @@ static const struct brport_attribute *brport_attrs[] = {
 	&brport_attr_multicast_flood,
 	&brport_attr_broadcast_flood,
 	&brport_attr_group_fwd_mask,
+	&brport_attr_neigh_suppress,
 	NULL
 };
 
-- 
cgit v1.2.3


From ceaa001a170e43608854d5290a48064f57b565ed Mon Sep 17 00:00:00 2001
From: William Tu <u9012063@gmail.com>
Date: Wed, 4 Oct 2017 17:03:12 -0700
Subject: openvswitch: Add erspan tunnel support.

Add erspan netlink interface for OVS.

Signed-off-by: William Tu <u9012063@gmail.com>
Cc: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/openvswitch.h |  1 +
 net/openvswitch/flow_netlink.c   | 51 +++++++++++++++++++++++++++++++++++++++-
 2 files changed, 51 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index 156ee4cab82e..efdbfbfd3ee2 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -359,6 +359,7 @@ enum ovs_tunnel_key_attr {
 	OVS_TUNNEL_KEY_ATTR_IPV6_SRC,		/* struct in6_addr src IPv6 address. */
 	OVS_TUNNEL_KEY_ATTR_IPV6_DST,		/* struct in6_addr dst IPv6 address. */
 	OVS_TUNNEL_KEY_ATTR_PAD,
+	OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS,	/* be32 ERSPAN index. */
 	__OVS_TUNNEL_KEY_ATTR_MAX
 };
 
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index e8eb427ce6d1..fc0ca9a89b8e 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -48,6 +48,7 @@
 #include <net/ndisc.h>
 #include <net/mpls.h>
 #include <net/vxlan.h>
+#include <net/erspan.h>
 
 #include "flow_netlink.h"
 
@@ -319,7 +320,8 @@ size_t ovs_tun_key_attr_size(void)
 		 * OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS and covered by it.
 		 */
 		+ nla_total_size(2)    /* OVS_TUNNEL_KEY_ATTR_TP_SRC */
-		+ nla_total_size(2);   /* OVS_TUNNEL_KEY_ATTR_TP_DST */
+		+ nla_total_size(2)    /* OVS_TUNNEL_KEY_ATTR_TP_DST */
+		+ nla_total_size(4);   /* OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS */
 }
 
 size_t ovs_key_attr_size(void)
@@ -371,6 +373,7 @@ static const struct ovs_len_tbl ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1]
 						.next = ovs_vxlan_ext_key_lens },
 	[OVS_TUNNEL_KEY_ATTR_IPV6_SRC]      = { .len = sizeof(struct in6_addr) },
 	[OVS_TUNNEL_KEY_ATTR_IPV6_DST]      = { .len = sizeof(struct in6_addr) },
+	[OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS]   = { .len = sizeof(u32) },
 };
 
 /* The size of the argument for each %OVS_KEY_ATTR_* Netlink attribute.  */
@@ -593,6 +596,33 @@ static int vxlan_tun_opt_from_nlattr(const struct nlattr *attr,
 	return 0;
 }
 
+static int erspan_tun_opt_from_nlattr(const struct nlattr *attr,
+				      struct sw_flow_match *match, bool is_mask,
+				      bool log)
+{
+	unsigned long opt_key_offset;
+	struct erspan_metadata opts;
+
+	BUILD_BUG_ON(sizeof(opts) > sizeof(match->key->tun_opts));
+
+	memset(&opts, 0, sizeof(opts));
+	opts.index = nla_get_be32(attr);
+
+	/* Index has only 20-bit */
+	if (ntohl(opts.index) & ~INDEX_MASK) {
+		OVS_NLERR(log, "ERSPAN index number %x too large.",
+			  ntohl(opts.index));
+		return -EINVAL;
+	}
+
+	SW_FLOW_KEY_PUT(match, tun_opts_len, sizeof(opts), is_mask);
+	opt_key_offset = TUN_METADATA_OFFSET(sizeof(opts));
+	SW_FLOW_KEY_MEMCPY_OFFSET(match, opt_key_offset, &opts, sizeof(opts),
+				  is_mask);
+
+	return 0;
+}
+
 static int ip_tun_from_nlattr(const struct nlattr *attr,
 			      struct sw_flow_match *match, bool is_mask,
 			      bool log)
@@ -700,6 +730,19 @@ static int ip_tun_from_nlattr(const struct nlattr *attr,
 			break;
 		case OVS_TUNNEL_KEY_ATTR_PAD:
 			break;
+		case OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS:
+			if (opts_type) {
+				OVS_NLERR(log, "Multiple metadata blocks provided");
+				return -EINVAL;
+			}
+
+			err = erspan_tun_opt_from_nlattr(a, match, is_mask, log);
+			if (err)
+				return err;
+
+			tun_flags |= TUNNEL_ERSPAN_OPT;
+			opts_type = type;
+			break;
 		default:
 			OVS_NLERR(log, "Unknown IP tunnel attribute %d",
 				  type);
@@ -824,6 +867,10 @@ static int __ip_tun_to_nlattr(struct sk_buff *skb,
 		else if (output->tun_flags & TUNNEL_VXLAN_OPT &&
 			 vxlan_opt_to_nlattr(skb, tun_opts, swkey_tun_opts_len))
 			return -EMSGSIZE;
+		else if (output->tun_flags & TUNNEL_ERSPAN_OPT &&
+			 nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS,
+				      ((struct erspan_metadata *)tun_opts)->index))
+			return -EMSGSIZE;
 	}
 
 	return 0;
@@ -2195,6 +2242,8 @@ static int validate_and_copy_set_tun(const struct nlattr *attr,
 			break;
 		case OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS:
 			break;
+		case OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS:
+			break;
 		}
 	};
 
-- 
cgit v1.2.3


From de8cd83e91bc3ee212b3e6ec6e4283af9e4ab269 Mon Sep 17 00:00:00 2001
From: Steve Grubb <sgrubb@redhat.com>
Date: Mon, 2 Oct 2017 20:21:39 -0400
Subject: audit: Record fanotify access control decisions

The fanotify interface allows user space daemons to make access
control decisions. Under common criteria requirements, we need to
optionally record decisions based on policy. This patch adds a bit mask,
FAN_AUDIT, that a user space daemon can 'or' into the response decision
which will tell the kernel that it made a decision and record it.

It would be used something like this in user space code:

  response.response = FAN_DENY | FAN_AUDIT;
  write(fd, &response, sizeof(struct fanotify_response));

When the syscall ends, the audit system will record the decision as an
AUDIT_FANOTIFY auxiliary record to denote that the reason this event
occurred is the result of an access control decision from fanotify
rather than DAC or MAC policy.

A sample event looks like this:

type=PATH msg=audit(1504310584.332:290): item=0 name="./evil-ls"
inode=1319561 dev=fc:03 mode=0100755 ouid=1000 ogid=1000 rdev=00:00
obj=unconfined_u:object_r:user_home_t:s0 nametype=NORMAL
type=CWD msg=audit(1504310584.332:290): cwd="/home/sgrubb"
type=SYSCALL msg=audit(1504310584.332:290): arch=c000003e syscall=2
success=no exit=-1 a0=32cb3fca90 a1=0 a2=43 a3=8 items=1 ppid=901
pid=959 auid=1000 uid=1000 gid=1000 euid=1000 suid=1000
fsuid=1000 egid=1000 sgid=1000 fsgid=1000 tty=pts1 ses=3 comm="bash"
exe="/usr/bin/bash" subj=unconfined_u:unconfined_r:unconfined_t:
s0-s0:c0.c1023 key=(null)
type=FANOTIFY msg=audit(1504310584.332:290): resp=2

Prior to using the audit flag, the developer needs to call
fanotify_init or'ing in FAN_ENABLE_AUDIT to ensure that the kernel
supports auditing. The calling process must also have the CAP_AUDIT_WRITE
capability.

Signed-off-by: sgrubb <sgrubb@redhat.com>
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/notify/fanotify/fanotify.c      |  8 +++++++-
 fs/notify/fanotify/fanotify_user.c | 16 +++++++++++++++-
 fs/notify/fdinfo.c                 |  3 +++
 include/linux/audit.h              | 10 ++++++++++
 include/linux/fsnotify_backend.h   |  1 +
 include/uapi/linux/audit.h         |  1 +
 include/uapi/linux/fanotify.h      |  3 +++
 kernel/auditsc.c                   |  6 ++++++
 8 files changed, 46 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 2fa99aeaa095..1968d21a3f37 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -9,6 +9,7 @@
 #include <linux/sched/user.h>
 #include <linux/types.h>
 #include <linux/wait.h>
+#include <linux/audit.h>
 
 #include "fanotify.h"
 
@@ -78,7 +79,7 @@ static int fanotify_get_response(struct fsnotify_group *group,
 	fsnotify_finish_user_wait(iter_info);
 out:
 	/* userspace responded, convert to something usable */
-	switch (event->response) {
+	switch (event->response & ~FAN_AUDIT) {
 	case FAN_ALLOW:
 		ret = 0;
 		break;
@@ -86,6 +87,11 @@ out:
 	default:
 		ret = -EPERM;
 	}
+
+	/* Check if the response should be audited */
+	if (event->response & FAN_AUDIT)
+		audit_fanotify(event->response & ~FAN_AUDIT);
+
 	event->response = 0;
 
 	pr_debug("%s: group=%p event=%p about to return ret=%d\n", __func__,
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 907a481ac781..0455ea729384 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -179,7 +179,7 @@ static int process_access_response(struct fsnotify_group *group,
 	 * userspace can send a valid response or we will clean it up after the
 	 * timeout
 	 */
-	switch (response) {
+	switch (response & ~FAN_AUDIT) {
 	case FAN_ALLOW:
 	case FAN_DENY:
 		break;
@@ -190,6 +190,9 @@ static int process_access_response(struct fsnotify_group *group,
 	if (fd < 0)
 		return -EINVAL;
 
+	if ((response & FAN_AUDIT) && !group->fanotify_data.audit)
+		return -EINVAL;
+
 	event = dequeue_event(group, fd);
 	if (!event)
 		return -ENOENT;
@@ -721,7 +724,11 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
+#ifdef CONFIG_AUDITSYSCALL
+	if (flags & ~(FAN_ALL_INIT_FLAGS | FAN_ENABLE_AUDIT))
+#else
 	if (flags & ~FAN_ALL_INIT_FLAGS)
+#endif
 		return -EINVAL;
 
 	if (event_f_flags & ~FANOTIFY_INIT_ALL_EVENT_F_BITS)
@@ -805,6 +812,13 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
 		group->fanotify_data.max_marks = FANOTIFY_DEFAULT_MAX_MARKS;
 	}
 
+	if (flags & FAN_ENABLE_AUDIT) {
+		fd = -EPERM;
+		if (!capable(CAP_AUDIT_WRITE))
+			goto out_destroy_group;
+		group->fanotify_data.audit = true;
+	}
+
 	fd = anon_inode_getfd("[fanotify]", &fanotify_fops, group, f_flags);
 	if (fd < 0)
 		goto out_destroy_group;
diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c
index dd63aa9a6f9a..645ab561e790 100644
--- a/fs/notify/fdinfo.c
+++ b/fs/notify/fdinfo.c
@@ -156,6 +156,9 @@ void fanotify_show_fdinfo(struct seq_file *m, struct file *f)
 	if (group->fanotify_data.max_marks == UINT_MAX)
 		flags |= FAN_UNLIMITED_MARKS;
 
+	if (group->fanotify_data.audit)
+		flags |= FAN_ENABLE_AUDIT;
+
 	seq_printf(m, "fanotify flags:%x event-flags:%x\n",
 		   flags, group->fanotify_data.f_flags);
 
diff --git a/include/linux/audit.h b/include/linux/audit.h
index cb708eb8accc..d66220dac364 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -356,6 +356,7 @@ extern int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
 extern void __audit_log_capset(const struct cred *new, const struct cred *old);
 extern void __audit_mmap_fd(int fd, int flags);
 extern void __audit_log_kern_module(char *name);
+extern void __audit_fanotify(unsigned int response);
 
 static inline void audit_ipc_obj(struct kern_ipc_perm *ipcp)
 {
@@ -452,6 +453,12 @@ static inline void audit_log_kern_module(char *name)
 		__audit_log_kern_module(name);
 }
 
+static inline void audit_fanotify(unsigned int response)
+{
+	if (!audit_dummy_context())
+		__audit_fanotify(response);
+}
+
 extern int audit_n_rules;
 extern int audit_signals;
 #else /* CONFIG_AUDITSYSCALL */
@@ -568,6 +575,9 @@ static inline void audit_log_kern_module(char *name)
 {
 }
 
+static inline void audit_fanotify(unsigned int response)
+{ }
+
 static inline void audit_ptrace(struct task_struct *t)
 { }
 #define audit_n_rules 0
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index c6c69318752b..4a474f972910 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -190,6 +190,7 @@ struct fsnotify_group {
 			int f_flags;
 			unsigned int max_marks;
 			struct user_struct *user;
+			bool audit;
 		} fanotify_data;
 #endif /* CONFIG_FANOTIFY */
 	};
diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h
index 0714a66f0e0c..221f8b7f01b2 100644
--- a/include/uapi/linux/audit.h
+++ b/include/uapi/linux/audit.h
@@ -112,6 +112,7 @@
 #define AUDIT_FEATURE_CHANGE	1328	/* audit log listing feature changes */
 #define AUDIT_REPLACE		1329	/* Replace auditd if this packet unanswerd */
 #define AUDIT_KERN_MODULE	1330	/* Kernel Module events */
+#define AUDIT_FANOTIFY		1331	/* Fanotify access decision */
 
 #define AUDIT_AVC		1400	/* SE Linux avc denial or grant */
 #define AUDIT_SELINUX_ERR	1401	/* Internal SE Linux Errors */
diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h
index 030508d195d3..5dda19a9a947 100644
--- a/include/uapi/linux/fanotify.h
+++ b/include/uapi/linux/fanotify.h
@@ -35,6 +35,7 @@
 
 #define FAN_UNLIMITED_QUEUE	0x00000010
 #define FAN_UNLIMITED_MARKS	0x00000020
+#define FAN_ENABLE_AUDIT	0x00000040
 
 #define FAN_ALL_INIT_FLAGS	(FAN_CLOEXEC | FAN_NONBLOCK | \
 				 FAN_ALL_CLASS_BITS | FAN_UNLIMITED_QUEUE |\
@@ -99,6 +100,8 @@ struct fanotify_response {
 /* Legit userspace responses to a _PERM event */
 #define FAN_ALLOW	0x01
 #define FAN_DENY	0x02
+#define FAN_AUDIT	0x10	/* Bit mask to create audit record for result */
+
 /* No fd set in event */
 #define FAN_NOFD	-1
 
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index ecc23e25c9eb..9c723e978245 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -2390,6 +2390,12 @@ void __audit_log_kern_module(char *name)
 	context->type = AUDIT_KERN_MODULE;
 }
 
+void __audit_fanotify(unsigned int response)
+{
+	audit_log(current->audit_context, GFP_KERNEL,
+		AUDIT_FANOTIFY,	"resp=%u", response);
+}
+
 static void audit_log_task(struct audit_buffer *ab)
 {
 	kuid_t auid, uid;
-- 
cgit v1.2.3


From b8226962b1c49c784aeddb9d2fafbf53dfdc2190 Mon Sep 17 00:00:00 2001
From: Eric Garver <e@erig.me>
Date: Tue, 10 Oct 2017 16:54:44 -0400
Subject: openvswitch: add ct_clear action

This adds a ct_clear action for clearing conntrack state. ct_clear is
currently implemented in OVS userspace, but is not backed by an action
in the kernel datapath. This is useful for flows that may modify a
packet tuple after a ct lookup has already occurred.

Signed-off-by: Eric Garver <e@erig.me>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/openvswitch.h |  2 ++
 net/openvswitch/actions.c        |  4 ++++
 net/openvswitch/conntrack.c      | 11 +++++++++++
 net/openvswitch/conntrack.h      |  7 +++++++
 net/openvswitch/flow_netlink.c   |  5 +++++
 5 files changed, 29 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index efdbfbfd3ee2..0cd6f8833147 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -807,6 +807,7 @@ struct ovs_action_push_eth {
  * packet.
  * @OVS_ACTION_ATTR_POP_ETH: Pop the outermost Ethernet header off the
  * packet.
+ * @OVS_ACTION_ATTR_CT_CLEAR: Clear conntrack state from the packet.
  *
  * Only a single header can be set with a single %OVS_ACTION_ATTR_SET.  Not all
  * fields within a header are modifiable, e.g. the IPv4 protocol and fragment
@@ -836,6 +837,7 @@ enum ovs_action_attr {
 	OVS_ACTION_ATTR_TRUNC,        /* u32 struct ovs_action_trunc. */
 	OVS_ACTION_ATTR_PUSH_ETH,     /* struct ovs_action_push_eth. */
 	OVS_ACTION_ATTR_POP_ETH,      /* No argument. */
+	OVS_ACTION_ATTR_CT_CLEAR,     /* No argument. */
 
 	__OVS_ACTION_ATTR_MAX,	      /* Nothing past this will be accepted
 				       * from userspace. */
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index a54a556fcdb5..a551232daf61 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -1203,6 +1203,10 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
 				return err == -EINPROGRESS ? 0 : err;
 			break;
 
+		case OVS_ACTION_ATTR_CT_CLEAR:
+			err = ovs_ct_clear(skb, key);
+			break;
+
 		case OVS_ACTION_ATTR_PUSH_ETH:
 			err = push_eth(skb, key, nla_data(a));
 			break;
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index d558e882ca0c..fe861e2f0deb 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -1129,6 +1129,17 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb,
 	return err;
 }
 
+int ovs_ct_clear(struct sk_buff *skb, struct sw_flow_key *key)
+{
+	if (skb_nfct(skb)) {
+		nf_conntrack_put(skb_nfct(skb));
+		nf_ct_set(skb, NULL, IP_CT_UNTRACKED);
+		ovs_ct_fill_key(skb, key);
+	}
+
+	return 0;
+}
+
 static int ovs_ct_add_helper(struct ovs_conntrack_info *info, const char *name,
 			     const struct sw_flow_key *key, bool log)
 {
diff --git a/net/openvswitch/conntrack.h b/net/openvswitch/conntrack.h
index bc7efd1867ab..399dfdd2c4f9 100644
--- a/net/openvswitch/conntrack.h
+++ b/net/openvswitch/conntrack.h
@@ -30,6 +30,7 @@ int ovs_ct_action_to_attr(const struct ovs_conntrack_info *, struct sk_buff *);
 
 int ovs_ct_execute(struct net *, struct sk_buff *, struct sw_flow_key *,
 		   const struct ovs_conntrack_info *);
+int ovs_ct_clear(struct sk_buff *skb, struct sw_flow_key *key);
 
 void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key);
 int ovs_ct_put_key(const struct sw_flow_key *swkey,
@@ -73,6 +74,12 @@ static inline int ovs_ct_execute(struct net *net, struct sk_buff *skb,
 	return -ENOTSUPP;
 }
 
+static inline int ovs_ct_clear(struct sk_buff *skb,
+			       struct sw_flow_key *key)
+{
+	return -ENOTSUPP;
+}
+
 static inline void ovs_ct_fill_key(const struct sk_buff *skb,
 				   struct sw_flow_key *key)
 {
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index fc0ca9a89b8e..dc0d79092e74 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -76,6 +76,7 @@ static bool actions_may_change_flow(const struct nlattr *actions)
 			break;
 
 		case OVS_ACTION_ATTR_CT:
+		case OVS_ACTION_ATTR_CT_CLEAR:
 		case OVS_ACTION_ATTR_HASH:
 		case OVS_ACTION_ATTR_POP_ETH:
 		case OVS_ACTION_ATTR_POP_MPLS:
@@ -2528,6 +2529,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
 			[OVS_ACTION_ATTR_SAMPLE] = (u32)-1,
 			[OVS_ACTION_ATTR_HASH] = sizeof(struct ovs_action_hash),
 			[OVS_ACTION_ATTR_CT] = (u32)-1,
+			[OVS_ACTION_ATTR_CT_CLEAR] = 0,
 			[OVS_ACTION_ATTR_TRUNC] = sizeof(struct ovs_action_trunc),
 			[OVS_ACTION_ATTR_PUSH_ETH] = sizeof(struct ovs_action_push_eth),
 			[OVS_ACTION_ATTR_POP_ETH] = 0,
@@ -2669,6 +2671,9 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
 			skip_copy = true;
 			break;
 
+		case OVS_ACTION_ATTR_CT_CLEAR:
+			break;
+
 		case OVS_ACTION_ATTR_PUSH_ETH:
 			/* Disallow pushing an Ethernet header if one
 			 * is already present */
-- 
cgit v1.2.3


From 1ea4ff3e9f0b8d53e680a2bb9e8e644bf03aeb4d Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 13 Sep 2017 16:07:22 +0200
Subject: cfg80211: support reloading regulatory database

If the regulatory database is loaded, and then updated, it may
be necessary to reload it. Add an nl80211 command to do this.

Note that this just reloads the database, it doesn't re-apply
the rules from it immediately.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/uapi/linux/nl80211.h |  4 +++
 net/wireless/nl80211.c       | 11 ++++++
 net/wireless/reg.c           | 80 +++++++++++++++++++++++++++++++++-----------
 net/wireless/reg.h           |  6 ++++
 4 files changed, 81 insertions(+), 20 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 95832ce03a44..f882fe1f9709 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -990,6 +990,8 @@
  *	&NL80211_CMD_CONNECT or &NL80211_CMD_ROAM. If the 4 way handshake failed
  *	&NL80211_CMD_DISCONNECT should be indicated instead.
  *
+ * @NL80211_CMD_RELOAD_REGDB: Request that the regdb firmware file is reloaded.
+ *
  * @NL80211_CMD_MAX: highest used command number
  * @__NL80211_CMD_AFTER_LAST: internal use
  */
@@ -1194,6 +1196,8 @@ enum nl80211_commands {
 
 	NL80211_CMD_PORT_AUTHORIZED,
 
+	NL80211_CMD_RELOAD_REGDB,
+
 	/* add new commands above here */
 
 	/* used to define NL80211_CMD_MAX below */
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 5129342151e6..67a03f2885a4 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -5678,6 +5678,11 @@ static int nl80211_req_set_reg(struct sk_buff *skb, struct genl_info *info)
 	}
 }
 
+static int nl80211_reload_regdb(struct sk_buff *skb, struct genl_info *info)
+{
+	return reg_reload_regdb();
+}
+
 static int nl80211_get_mesh_config(struct sk_buff *skb,
 				   struct genl_info *info)
 {
@@ -12708,6 +12713,12 @@ static const struct genl_ops nl80211_ops[] = {
 		.policy = nl80211_policy,
 		.flags = GENL_ADMIN_PERM,
 	},
+	{
+		.cmd = NL80211_CMD_RELOAD_REGDB,
+		.doit = nl80211_reload_regdb,
+		.policy = nl80211_policy,
+		.flags = GENL_ADMIN_PERM,
+	},
 	{
 		.cmd = NL80211_CMD_GET_MESH_CONFIG,
 		.doit = nl80211_get_mesh_config,
diff --git a/net/wireless/reg.c b/net/wireless/reg.c
index e9aeb05aaf3e..180addda52af 100644
--- a/net/wireless/reg.c
+++ b/net/wireless/reg.c
@@ -781,6 +781,8 @@ static int query_regdb(const char *alpha2)
 	const struct fwdb_header *hdr = regdb;
 	const struct fwdb_country *country;
 
+	ASSERT_RTNL();
+
 	if (IS_ERR(regdb))
 		return PTR_ERR(regdb);
 
@@ -796,41 +798,47 @@ static int query_regdb(const char *alpha2)
 
 static void regdb_fw_cb(const struct firmware *fw, void *context)
 {
+	int set_error = 0;
+	bool restore = true;
 	void *db;
 
 	if (!fw) {
 		pr_info("failed to load regulatory.db\n");
-		regdb = ERR_PTR(-ENODATA);
-		goto restore;
-	}
-
-	if (!valid_regdb(fw->data, fw->size)) {
+		set_error = -ENODATA;
+	} else if (!valid_regdb(fw->data, fw->size)) {
 		pr_info("loaded regulatory.db is malformed\n");
-		release_firmware(fw);
-		regdb = ERR_PTR(-EINVAL);
-		goto restore;
+		set_error = -EINVAL;
 	}
 
-	db = kmemdup(fw->data, fw->size, GFP_KERNEL);
-	release_firmware(fw);
+	rtnl_lock();
+	if (WARN_ON(regdb && !IS_ERR(regdb))) {
+		/* just restore and free new db */
+	} else if (set_error) {
+		regdb = ERR_PTR(set_error);
+	} else if (fw) {
+		db = kmemdup(fw->data, fw->size, GFP_KERNEL);
+		if (db) {
+			regdb = db;
+			restore = context && query_regdb(context);
+		} else {
+			restore = true;
+		}
+	}
 
-	if (!db)
-		goto restore;
-	regdb = db;
+	if (restore)
+		restore_regulatory_settings(true);
 
-	if (query_regdb(context))
-		goto restore;
-	goto free;
- restore:
-	rtnl_lock();
-	restore_regulatory_settings(true);
 	rtnl_unlock();
- free:
+
 	kfree(context);
+
+	release_firmware(fw);
 }
 
 static int query_regdb_file(const char *alpha2)
 {
+	ASSERT_RTNL();
+
 	if (regdb)
 		return query_regdb(alpha2);
 
@@ -843,6 +851,38 @@ static int query_regdb_file(const char *alpha2)
 				       (void *)alpha2, regdb_fw_cb);
 }
 
+int reg_reload_regdb(void)
+{
+	const struct firmware *fw;
+	void *db;
+	int err;
+
+	err = request_firmware(&fw, "regulatory.db", &reg_pdev->dev);
+	if (err)
+		return err;
+
+	if (!valid_regdb(fw->data, fw->size)) {
+		err = -ENODATA;
+		goto out;
+	}
+
+	db = kmemdup(fw->data, fw->size, GFP_KERNEL);
+	if (!db) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	rtnl_lock();
+	if (!IS_ERR_OR_NULL(regdb))
+		kfree(regdb);
+	regdb = db;
+	rtnl_unlock();
+
+ out:
+	release_firmware(fw);
+	return err;
+}
+
 static bool reg_query_database(struct regulatory_request *request)
 {
 	/* query internal regulatory database (if it exists) */
diff --git a/net/wireless/reg.h b/net/wireless/reg.h
index ca7fedf2e7a1..9529c522611a 100644
--- a/net/wireless/reg.h
+++ b/net/wireless/reg.h
@@ -179,4 +179,10 @@ void regulatory_propagate_dfs_state(struct wiphy *wiphy,
  * @wiphy2 - wiphy it's dfs_region to be checked against that of wiphy1
  */
 bool reg_dfs_domain_same(struct wiphy *wiphy1, struct wiphy *wiphy2);
+
+/**
+ * reg_reload_regdb - reload the regulatory.db firmware file
+ */
+int reg_reload_regdb(void);
+
 #endif  /* __NET_WIRELESS_REG_H */
-- 
cgit v1.2.3


From 259a41d9ae8f3689742267f340ad2b159d00b302 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab@s-opensource.com>
Date: Mon, 18 Sep 2017 08:21:37 -0400
Subject: media: dvb_frontend: fix return values for FE_SET_PROPERTY

There are several problems with regards to the return of
FE_SET_PROPERTY. The original idea was to return per-property
return codes via tvp->result field, and to return an updated
set of values.

However, that never worked. What's actually implemented is:

- the FE_SET_PROPERTY implementation doesn't call .get_frontend
  callback in order to get the actual parameters after return;

- the tvp->result field is only filled if there's no error.
  So, it is always filled with zero;

- FE_SET_PROPERTY doesn't call memdup_user() nor any other
  copy_to_user() function. So, any changes to the properties
  will be lost;

- FE_SET_PROPERTY is declared as a write-only ioctl (IOW).

While we could fix the above, it could cause regressions.

So, let's just assume what the code really does, updating
the documentation accordingly and removing the logic that
would update the discarded tvp->result.

Reviewed-by: Shuah Khan <shuahkh@osg.samsung.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@s-opensource.com>
---
 Documentation/media/uapi/dvb/fe-get-property.rst | 7 +++++--
 drivers/media/dvb-core/dvb_frontend.c            | 2 --
 include/uapi/linux/dvb/frontend.h                | 2 +-
 3 files changed, 6 insertions(+), 5 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/media/uapi/dvb/fe-get-property.rst b/Documentation/media/uapi/dvb/fe-get-property.rst
index 948d2ba84f2c..b69741d9cedf 100644
--- a/Documentation/media/uapi/dvb/fe-get-property.rst
+++ b/Documentation/media/uapi/dvb/fe-get-property.rst
@@ -48,8 +48,11 @@ depends on the delivery system and on the device:
 
    -  This call requires read/write access to the device.
 
-   -  At return, the values are updated to reflect the actual parameters
-      used.
+.. note::
+
+   At return, the values aren't updated to reflect the actual
+   parameters used. If the actual parameters are needed, an explicit
+   call to ``FE_GET_PROPERTY`` is needed.
 
 -  ``FE_GET_PROPERTY:``
 
diff --git a/drivers/media/dvb-core/dvb_frontend.c b/drivers/media/dvb-core/dvb_frontend.c
index b19f40be0ab2..5e3bcae477d2 100644
--- a/drivers/media/dvb-core/dvb_frontend.c
+++ b/drivers/media/dvb-core/dvb_frontend.c
@@ -2125,7 +2125,6 @@ static int dvb_frontend_handle_ioctl(struct file *file,
 				kfree(tvp);
 				return err;
 			}
-			(tvp + i)->result = err;
 		}
 		kfree(tvp);
 		break;
@@ -2170,7 +2169,6 @@ static int dvb_frontend_handle_ioctl(struct file *file,
 				kfree(tvp);
 				return err;
 			}
-			(tvp + i)->result = err;
 		}
 
 		if (copy_to_user((void __user *)tvps->props, tvp,
diff --git a/include/uapi/linux/dvb/frontend.h b/include/uapi/linux/dvb/frontend.h
index 861cacd5711f..6bc26f35217b 100644
--- a/include/uapi/linux/dvb/frontend.h
+++ b/include/uapi/linux/dvb/frontend.h
@@ -830,7 +830,7 @@ struct dtv_fe_stats {
  * @cmd:	Digital TV command.
  * @reserved:	Not used.
  * @u:		Union with the values for the command.
- * @result:	Result of the command set (currently unused).
+ * @result:	Unused
  *
  * The @u union may have either one of the values below:
  *
-- 
cgit v1.2.3


From 28978713c51b0a70acf748f76f9d6d2d20dcf980 Mon Sep 17 00:00:00 2001
From: Bjorn Andersson <bjorn.andersson@linaro.org>
Date: Tue, 10 Oct 2017 23:45:18 -0700
Subject: net: qrtr: Move constants to header file

The constants are used by both the name server and clients, so clarify
their value and move them to the uapi header.

Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/qrtr.h | 3 +++
 net/qrtr/qrtr.c           | 2 --
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/qrtr.h b/include/uapi/linux/qrtr.h
index 9d76c566f66e..63e8803e4d90 100644
--- a/include/uapi/linux/qrtr.h
+++ b/include/uapi/linux/qrtr.h
@@ -4,6 +4,9 @@
 #include <linux/socket.h>
 #include <linux/types.h>
 
+#define QRTR_NODE_BCAST	0xffffffffu
+#define QRTR_PORT_CTRL	0xfffffffeu
+
 struct sockaddr_qrtr {
 	__kernel_sa_family_t sq_family;
 	__u32 sq_node;
diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c
index 7e4b49a8349e..15981abc042c 100644
--- a/net/qrtr/qrtr.c
+++ b/net/qrtr/qrtr.c
@@ -61,8 +61,6 @@ struct qrtr_hdr {
 } __packed;
 
 #define QRTR_HDR_SIZE sizeof(struct qrtr_hdr)
-#define QRTR_NODE_BCAST ((unsigned int)-1)
-#define QRTR_PORT_CTRL ((unsigned int)-2)
 
 struct qrtr_sock {
 	/* WARNING: sk must be the first member */
-- 
cgit v1.2.3


From da7653f0faabbe45eb2d3fd6e4b400fe003e81ae Mon Sep 17 00:00:00 2001
From: Bjorn Andersson <bjorn.andersson@linaro.org>
Date: Tue, 10 Oct 2017 23:45:19 -0700
Subject: net: qrtr: Add control packet definition to uapi

The QMUX protocol specification defines structure of the special control
packet messages being sent between handlers of the control port.

Add these to the uapi header, as this structure and the associated types
are shared between the kernel and all userspace handlers of control
messages.

Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/qrtr.h | 32 ++++++++++++++++++++++++++++++++
 net/qrtr/qrtr.c           | 12 ------------
 2 files changed, 32 insertions(+), 12 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/qrtr.h b/include/uapi/linux/qrtr.h
index 63e8803e4d90..179af64846e0 100644
--- a/include/uapi/linux/qrtr.h
+++ b/include/uapi/linux/qrtr.h
@@ -13,4 +13,36 @@ struct sockaddr_qrtr {
 	__u32 sq_port;
 };
 
+enum qrtr_pkt_type {
+	QRTR_TYPE_DATA		= 1,
+	QRTR_TYPE_HELLO		= 2,
+	QRTR_TYPE_BYE		= 3,
+	QRTR_TYPE_NEW_SERVER	= 4,
+	QRTR_TYPE_DEL_SERVER	= 5,
+	QRTR_TYPE_DEL_CLIENT	= 6,
+	QRTR_TYPE_RESUME_TX	= 7,
+	QRTR_TYPE_EXIT          = 8,
+	QRTR_TYPE_PING          = 9,
+	QRTR_TYPE_NEW_LOOKUP	= 10,
+	QRTR_TYPE_DEL_LOOKUP	= 11,
+};
+
+struct qrtr_ctrl_pkt {
+	__le32 cmd;
+
+	union {
+		struct {
+			__le32 service;
+			__le32 instance;
+			__le32 node;
+			__le32 port;
+		} server;
+
+		struct {
+			__le32 node;
+			__le32 port;
+		} client;
+	};
+} __packed;
+
 #endif /* _LINUX_QRTR_H */
diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c
index 15981abc042c..d85ca7170b8f 100644
--- a/net/qrtr/qrtr.c
+++ b/net/qrtr/qrtr.c
@@ -26,18 +26,6 @@
 #define QRTR_MIN_EPH_SOCKET 0x4000
 #define QRTR_MAX_EPH_SOCKET 0x7fff
 
-enum qrtr_pkt_type {
-	QRTR_TYPE_DATA		= 1,
-	QRTR_TYPE_HELLO		= 2,
-	QRTR_TYPE_BYE		= 3,
-	QRTR_TYPE_NEW_SERVER	= 4,
-	QRTR_TYPE_DEL_SERVER	= 5,
-	QRTR_TYPE_DEL_CLIENT	= 6,
-	QRTR_TYPE_RESUME_TX	= 7,
-	QRTR_TYPE_EXIT		= 8,
-	QRTR_TYPE_PING		= 9,
-};
-
 /**
  * struct qrtr_hdr - (I|R)PCrouter packet header
  * @version: protocol version
-- 
cgit v1.2.3


From ad2d116c5242875bba27522682ec5ba7f0df75f0 Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Wed, 11 Oct 2017 11:14:49 -0700
Subject: sched: tc_mirred: Remove whitespaces

This file contains lines made up solely of unnecessary whitespace; remove
it. Found by looking at what struct tc_mirred looks like.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/tc_act/tc_mirred.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/tc_act/tc_mirred.h b/include/uapi/linux/tc_act/tc_mirred.h
index 3d7a2b352a62..69038c29e8a9 100644
--- a/include/uapi/linux/tc_act/tc_mirred.h
+++ b/include/uapi/linux/tc_act/tc_mirred.h
@@ -9,13 +9,13 @@
 #define TCA_EGRESS_MIRROR 2 /* mirror packet to EGRESS */
 #define TCA_INGRESS_REDIR 3  /* packet redirect to INGRESS*/
 #define TCA_INGRESS_MIRROR 4 /* mirror packet to INGRESS */
-                                                                                
+
 struct tc_mirred {
 	tc_gen;
 	int                     eaction;   /* one of IN/EGRESS_MIRROR/REDIR */
 	__u32                   ifindex;  /* ifindex of egress port */
 };
-                                                                                
+
 enum {
 	TCA_MIRRED_UNSPEC,
 	TCA_MIRRED_TM,
@@ -24,5 +24,5 @@ enum {
 	__TCA_MIRRED_MAX
 };
 #define TCA_MIRRED_MAX (__TCA_MIRRED_MAX - 1)
-                                                                                
+
 #endif
-- 
cgit v1.2.3


From 75da2163dbb6af9f2dce1d80056d11d290dd19a5 Mon Sep 17 00:00:00 2001
From: Jon Maloy <jon.maloy@ericsson.com>
Date: Fri, 13 Oct 2017 11:04:23 +0200
Subject: tipc: introduce communication groups

As a preparation for introducing flow control for multicast and datagram
messaging we need a more strictly defined framework than we have now. A
socket must be able to keep track of exactly how many and which other
sockets it is allowed to communicate with at any moment, and keep the
necessary state for those.

We therefore introduce a new concept we have named Communication Group.
Sockets can join a group via a new setsockopt() call TIPC_GROUP_JOIN.
The call takes four parameters: 'type' serves as group identifier,
'instance' serves as a logical member identifier, and 'scope' indicates
the visibility of the group (node/cluster/zone). Finally, 'flags' makes
it possible to set certain properties for the member. For now, there is
only one flag, indicating if the creator of the socket wants to receive
a copy of broadcast or multicast messages it is sending via the socket,
and if it wants to be eligible as a destination for its own anycasts.

A group is closed, i.e., sockets which have not joined a group will
not be able to send messages to or receive messages from members of
the group, and vice versa.

Any member of a group can send multicast ('group broadcast') messages
to all group members, optionally including itself, using the primitive
send(). The messages are received via the recvmsg() primitive. A socket
can only be member of one group at a time.

Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/tipc.h |  14 ++
 net/tipc/Makefile         |   2 +-
 net/tipc/group.c          | 404 ++++++++++++++++++++++++++++++++++++++++++++++
 net/tipc/group.h          |  64 ++++++++
 net/tipc/link.c           |   3 +-
 net/tipc/msg.h            |  50 +++++-
 net/tipc/name_table.c     |  44 +++--
 net/tipc/name_table.h     |   3 +
 net/tipc/node.h           |   3 +-
 net/tipc/socket.c         | 209 ++++++++++++++++++++----
 10 files changed, 748 insertions(+), 48 deletions(-)
 create mode 100644 net/tipc/group.c
 create mode 100644 net/tipc/group.h

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/tipc.h b/include/uapi/linux/tipc.h
index 5351b08c897a..5f7b2c4a09ab 100644
--- a/include/uapi/linux/tipc.h
+++ b/include/uapi/linux/tipc.h
@@ -231,6 +231,20 @@ struct sockaddr_tipc {
 #define TIPC_SOCK_RECVQ_DEPTH	132	/* Default: none (read only) */
 #define TIPC_MCAST_BROADCAST    133     /* Default: TIPC selects. No arg */
 #define TIPC_MCAST_REPLICAST    134     /* Default: TIPC selects. No arg */
+#define TIPC_GROUP_JOIN         135     /* Takes struct tipc_group_req* */
+#define TIPC_GROUP_LEAVE        136     /* No argument */
+
+/*
+ * Flag values
+ */
+#define TIPC_GROUP_LOOPBACK     0x1  /* Receive copy of sent msg when match */
+
+struct tipc_group_req {
+	__u32 type;      /* group id */
+	__u32 instance;  /* member id */
+	__u32 scope;     /* zone/cluster/node */
+	__u32 flags;
+};
 
 /*
  * Maximum sizes of TIPC bearer-related names (including terminating NULL)
diff --git a/net/tipc/Makefile b/net/tipc/Makefile
index 31b9f9c52974..a3af73ec0b78 100644
--- a/net/tipc/Makefile
+++ b/net/tipc/Makefile
@@ -8,7 +8,7 @@ tipc-y	+= addr.o bcast.o bearer.o \
 	   core.o link.o discover.o msg.o  \
 	   name_distr.o  subscr.o monitor.o name_table.o net.o  \
 	   netlink.o netlink_compat.o node.o socket.o eth_media.o \
-	   server.o socket.o
+	   server.o socket.o group.o
 
 tipc-$(CONFIG_TIPC_MEDIA_UDP)	+= udp_media.o
 tipc-$(CONFIG_TIPC_MEDIA_IB)	+= ib_media.o
diff --git a/net/tipc/group.c b/net/tipc/group.c
new file mode 100644
index 000000000000..3f0e1ce1e3b9
--- /dev/null
+++ b/net/tipc/group.c
@@ -0,0 +1,404 @@
+/*
+ * net/tipc/group.c: TIPC group messaging code
+ *
+ * Copyright (c) 2017, Ericsson AB
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "core.h"
+#include "addr.h"
+#include "group.h"
+#include "bcast.h"
+#include "server.h"
+#include "msg.h"
+#include "socket.h"
+#include "node.h"
+#include "name_table.h"
+#include "subscr.h"
+
+#define ADV_UNIT (((MAX_MSG_SIZE + MAX_H_SIZE) / FLOWCTL_BLK_SZ) + 1)
+#define ADV_IDLE ADV_UNIT
+
+enum mbr_state {
+	MBR_QUARANTINED,
+	MBR_DISCOVERED,
+	MBR_JOINING,
+	MBR_PUBLISHED,
+	MBR_JOINED,
+	MBR_LEAVING
+};
+
+struct tipc_member {
+	struct rb_node tree_node;
+	struct list_head list;
+	u32 node;
+	u32 port;
+	enum mbr_state state;
+	u16 bc_rcv_nxt;
+};
+
+struct tipc_group {
+	struct rb_root members;
+	struct tipc_nlist dests;
+	struct net *net;
+	int subid;
+	u32 type;
+	u32 instance;
+	u32 domain;
+	u32 scope;
+	u32 portid;
+	u16 member_cnt;
+	u16 bc_snd_nxt;
+	bool loopback;
+};
+
+static void tipc_group_proto_xmit(struct tipc_group *grp, struct tipc_member *m,
+				  int mtyp, struct sk_buff_head *xmitq);
+
+u16 tipc_group_bc_snd_nxt(struct tipc_group *grp)
+{
+	return grp->bc_snd_nxt;
+}
+
+static bool tipc_group_is_receiver(struct tipc_member *m)
+{
+	return m && m->state >= MBR_JOINED;
+}
+
+int tipc_group_size(struct tipc_group *grp)
+{
+	return grp->member_cnt;
+}
+
+struct tipc_group *tipc_group_create(struct net *net, u32 portid,
+				     struct tipc_group_req *mreq)
+{
+	struct tipc_group *grp;
+	u32 type = mreq->type;
+
+	grp = kzalloc(sizeof(*grp), GFP_ATOMIC);
+	if (!grp)
+		return NULL;
+	tipc_nlist_init(&grp->dests, tipc_own_addr(net));
+	grp->members = RB_ROOT;
+	grp->net = net;
+	grp->portid = portid;
+	grp->domain = addr_domain(net, mreq->scope);
+	grp->type = type;
+	grp->instance = mreq->instance;
+	grp->scope = mreq->scope;
+	grp->loopback = mreq->flags & TIPC_GROUP_LOOPBACK;
+	if (tipc_topsrv_kern_subscr(net, portid, type, 0, ~0, &grp->subid))
+		return grp;
+	kfree(grp);
+	return NULL;
+}
+
+void tipc_group_delete(struct net *net, struct tipc_group *grp)
+{
+	struct rb_root *tree = &grp->members;
+	struct tipc_member *m, *tmp;
+	struct sk_buff_head xmitq;
+
+	__skb_queue_head_init(&xmitq);
+
+	rbtree_postorder_for_each_entry_safe(m, tmp, tree, tree_node) {
+		tipc_group_proto_xmit(grp, m, GRP_LEAVE_MSG, &xmitq);
+		list_del(&m->list);
+		kfree(m);
+	}
+	tipc_node_distr_xmit(net, &xmitq);
+	tipc_nlist_purge(&grp->dests);
+	tipc_topsrv_kern_unsubscr(net, grp->subid);
+	kfree(grp);
+}
+
+struct tipc_member *tipc_group_find_member(struct tipc_group *grp,
+					   u32 node, u32 port)
+{
+	struct rb_node *n = grp->members.rb_node;
+	u64 nkey, key = (u64)node << 32 | port;
+	struct tipc_member *m;
+
+	while (n) {
+		m = container_of(n, struct tipc_member, tree_node);
+		nkey = (u64)m->node << 32 | m->port;
+		if (key < nkey)
+			n = n->rb_left;
+		else if (key > nkey)
+			n = n->rb_right;
+		else
+			return m;
+	}
+	return NULL;
+}
+
+static struct tipc_member *tipc_group_find_node(struct tipc_group *grp,
+						u32 node)
+{
+	struct tipc_member *m;
+	struct rb_node *n;
+
+	for (n = rb_first(&grp->members); n; n = rb_next(n)) {
+		m = container_of(n, struct tipc_member, tree_node);
+		if (m->node == node)
+			return m;
+	}
+	return NULL;
+}
+
+static void tipc_group_add_to_tree(struct tipc_group *grp,
+				   struct tipc_member *m)
+{
+	u64 nkey, key = (u64)m->node << 32 | m->port;
+	struct rb_node **n, *parent = NULL;
+	struct tipc_member *tmp;
+
+	n = &grp->members.rb_node;
+	while (*n) {
+		tmp = container_of(*n, struct tipc_member, tree_node);
+		parent = *n;
+		tmp = container_of(parent, struct tipc_member, tree_node);
+		nkey = (u64)tmp->node << 32 | tmp->port;
+		if (key < nkey)
+			n = &(*n)->rb_left;
+		else if (key > nkey)
+			n = &(*n)->rb_right;
+		else
+			return;
+	}
+	rb_link_node(&m->tree_node, parent, n);
+	rb_insert_color(&m->tree_node, &grp->members);
+}
+
+static struct tipc_member *tipc_group_create_member(struct tipc_group *grp,
+						    u32 node, u32 port,
+						    int state)
+{
+	struct tipc_member *m;
+
+	m = kzalloc(sizeof(*m), GFP_ATOMIC);
+	if (!m)
+		return NULL;
+	INIT_LIST_HEAD(&m->list);
+	m->node = node;
+	m->port = port;
+	grp->member_cnt++;
+	tipc_group_add_to_tree(grp, m);
+	tipc_nlist_add(&grp->dests, m->node);
+	m->state = state;
+	return m;
+}
+
+void tipc_group_add_member(struct tipc_group *grp, u32 node, u32 port)
+{
+	tipc_group_create_member(grp, node, port, MBR_DISCOVERED);
+}
+
+static void tipc_group_delete_member(struct tipc_group *grp,
+				     struct tipc_member *m)
+{
+	rb_erase(&m->tree_node, &grp->members);
+	grp->member_cnt--;
+	list_del_init(&m->list);
+
+	/* If last member on a node, remove node from dest list */
+	if (!tipc_group_find_node(grp, m->node))
+		tipc_nlist_del(&grp->dests, m->node);
+
+	kfree(m);
+}
+
+struct tipc_nlist *tipc_group_dests(struct tipc_group *grp)
+{
+	return &grp->dests;
+}
+
+void tipc_group_self(struct tipc_group *grp, struct tipc_name_seq *seq,
+		     int *scope)
+{
+	seq->type = grp->type;
+	seq->lower = grp->instance;
+	seq->upper = grp->instance;
+	*scope = grp->scope;
+}
+
+void tipc_group_update_bc_members(struct tipc_group *grp)
+{
+	grp->bc_snd_nxt++;
+}
+
+/* tipc_group_filter_msg() - determine if we should accept arriving message
+ */
+void tipc_group_filter_msg(struct tipc_group *grp, struct sk_buff_head *inputq,
+			   struct sk_buff_head *xmitq)
+{
+	struct sk_buff *skb = __skb_dequeue(inputq);
+	struct tipc_member *m;
+	struct tipc_msg *hdr;
+	u32 node, port;
+	int mtyp;
+
+	if (!skb)
+		return;
+
+	hdr = buf_msg(skb);
+	mtyp = msg_type(hdr);
+	node =  msg_orignode(hdr);
+	port = msg_origport(hdr);
+
+	if (!msg_in_group(hdr))
+		goto drop;
+
+	m = tipc_group_find_member(grp, node, port);
+	if (!tipc_group_is_receiver(m))
+		goto drop;
+
+	__skb_queue_tail(inputq, skb);
+
+	m->bc_rcv_nxt = msg_grp_bc_seqno(hdr) + 1;
+	return;
+drop:
+	kfree_skb(skb);
+}
+
+static void tipc_group_proto_xmit(struct tipc_group *grp, struct tipc_member *m,
+				  int mtyp, struct sk_buff_head *xmitq)
+{
+	struct tipc_msg *hdr;
+	struct sk_buff *skb;
+
+	skb = tipc_msg_create(GROUP_PROTOCOL, mtyp, INT_H_SIZE, 0,
+			      m->node, tipc_own_addr(grp->net),
+			      m->port, grp->portid, 0);
+	if (!skb)
+		return;
+
+	hdr = buf_msg(skb);
+	if (mtyp == GRP_JOIN_MSG)
+		msg_set_grp_bc_syncpt(hdr, grp->bc_snd_nxt);
+	__skb_queue_tail(xmitq, skb);
+}
+
+void tipc_group_proto_rcv(struct tipc_group *grp, struct tipc_msg *hdr,
+			  struct sk_buff_head *xmitq)
+{
+	u32 node = msg_orignode(hdr);
+	u32 port = msg_origport(hdr);
+	struct tipc_member *m;
+
+	if (!grp)
+		return;
+
+	m = tipc_group_find_member(grp, node, port);
+
+	switch (msg_type(hdr)) {
+	case GRP_JOIN_MSG:
+		if (!m)
+			m = tipc_group_create_member(grp, node, port,
+						     MBR_QUARANTINED);
+		if (!m)
+			return;
+		m->bc_rcv_nxt = msg_grp_bc_syncpt(hdr);
+
+		/* Wait until PUBLISH event is received */
+		if (m->state == MBR_DISCOVERED)
+			m->state = MBR_JOINING;
+		else if (m->state == MBR_PUBLISHED)
+			m->state = MBR_JOINED;
+		return;
+	case GRP_LEAVE_MSG:
+		if (!m)
+			return;
+
+		/* Wait until WITHDRAW event is received */
+		if (m->state != MBR_LEAVING) {
+			m->state = MBR_LEAVING;
+			return;
+		}
+		/* Otherwise deliver already received WITHDRAW event */
+		tipc_group_delete_member(grp, m);
+		return;
+	default:
+		pr_warn("Received unknown GROUP_PROTO message\n");
+	}
+}
+
+/* tipc_group_member_evt() - receive and handle a member up/down event
+ */
+void tipc_group_member_evt(struct tipc_group *grp,
+			   struct sk_buff *skb,
+			   struct sk_buff_head *xmitq)
+{
+	struct tipc_msg *hdr = buf_msg(skb);
+	struct tipc_event *evt = (void *)msg_data(hdr);
+	u32 node = evt->port.node;
+	u32 port = evt->port.ref;
+	struct tipc_member *m;
+	struct net *net;
+	u32 self;
+
+	if (!grp)
+		goto drop;
+
+	net = grp->net;
+	self = tipc_own_addr(net);
+	if (!grp->loopback && node == self && port == grp->portid)
+		goto drop;
+
+	m = tipc_group_find_member(grp, node, port);
+
+	if (evt->event == TIPC_PUBLISHED) {
+		if (!m)
+			m = tipc_group_create_member(grp, node, port,
+						     MBR_DISCOVERED);
+		if (!m)
+			goto drop;
+
+		/* Wait if JOIN message not yet received */
+		if (m->state == MBR_DISCOVERED)
+			m->state = MBR_PUBLISHED;
+		else
+			m->state = MBR_JOINED;
+		tipc_group_proto_xmit(grp, m, GRP_JOIN_MSG, xmitq);
+	} else if (evt->event == TIPC_WITHDRAWN) {
+		if (!m)
+			goto drop;
+
+		/* Keep back event if more messages might be expected */
+		if (m->state != MBR_LEAVING && tipc_node_is_up(net, node))
+			m->state = MBR_LEAVING;
+		else
+			tipc_group_delete_member(grp, m);
+	}
+drop:
+	kfree_skb(skb);
+}
diff --git a/net/tipc/group.h b/net/tipc/group.h
new file mode 100644
index 000000000000..9bdf4479fc03
--- /dev/null
+++ b/net/tipc/group.h
@@ -0,0 +1,64 @@
+/*
+ * net/tipc/group.h: Include file for TIPC group unicast/multicast functions
+ *
+ * Copyright (c) 2017, Ericsson AB
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TIPC_GROUP_H
+#define _TIPC_GROUP_H
+
+#include "core.h"
+
+struct tipc_group;
+struct tipc_member;
+struct tipc_msg;
+
+struct tipc_group *tipc_group_create(struct net *net, u32 portid,
+				     struct tipc_group_req *mreq);
+void tipc_group_delete(struct net *net, struct tipc_group *grp);
+void tipc_group_add_member(struct tipc_group *grp, u32 node, u32 port);
+struct tipc_nlist *tipc_group_dests(struct tipc_group *grp);
+void tipc_group_self(struct tipc_group *grp, struct tipc_name_seq *seq,
+		     int *scope);
+void tipc_group_filter_msg(struct tipc_group *grp,
+			   struct sk_buff_head *inputq,
+			   struct sk_buff_head *xmitq);
+void tipc_group_member_evt(struct tipc_group *grp,
+			   struct sk_buff *skb,
+			   struct sk_buff_head *xmitq);
+void tipc_group_proto_rcv(struct tipc_group *grp,
+			  struct tipc_msg *hdr,
+			  struct sk_buff_head *xmitq);
+void tipc_group_update_bc_members(struct tipc_group *grp);
+u16 tipc_group_bc_snd_nxt(struct tipc_group *grp);
+int tipc_group_size(struct tipc_group *grp);
+#endif
diff --git a/net/tipc/link.c b/net/tipc/link.c
index ac0144f532aa..bd25bff63925 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -1046,11 +1046,12 @@ static bool tipc_data_input(struct tipc_link *l, struct sk_buff *skb,
 	case TIPC_MEDIUM_IMPORTANCE:
 	case TIPC_HIGH_IMPORTANCE:
 	case TIPC_CRITICAL_IMPORTANCE:
-		if (unlikely(msg_type(hdr) == TIPC_MCAST_MSG)) {
+		if (unlikely(msg_mcast(hdr))) {
 			skb_queue_tail(l->bc_rcvlink->inputq, skb);
 			return true;
 		}
 	case CONN_MANAGER:
+	case GROUP_PROTOCOL:
 		skb_queue_tail(inputq, skb);
 		return true;
 	case NAME_DISTRIBUTOR:
diff --git a/net/tipc/msg.h b/net/tipc/msg.h
index be3e38aa9dd2..dad400935405 100644
--- a/net/tipc/msg.h
+++ b/net/tipc/msg.h
@@ -1,7 +1,7 @@
 /*
  * net/tipc/msg.h: Include file for TIPC message header routines
  *
- * Copyright (c) 2000-2007, 2014-2015 Ericsson AB
+ * Copyright (c) 2000-2007, 2014-2017 Ericsson AB
  * Copyright (c) 2005-2008, 2010-2011, Wind River Systems
  * All rights reserved.
  *
@@ -61,10 +61,11 @@ struct plist;
 /*
  * Payload message types
  */
-#define TIPC_CONN_MSG		0
-#define TIPC_MCAST_MSG		1
-#define TIPC_NAMED_MSG		2
-#define TIPC_DIRECT_MSG		3
+#define TIPC_CONN_MSG           0
+#define TIPC_MCAST_MSG          1
+#define TIPC_NAMED_MSG          2
+#define TIPC_DIRECT_MSG         3
+#define TIPC_GRP_BCAST_MSG      4
 
 /*
  * Internal message users
@@ -73,6 +74,7 @@ struct plist;
 #define  MSG_BUNDLER          6
 #define  LINK_PROTOCOL        7
 #define  CONN_MANAGER         8
+#define  GROUP_PROTOCOL       9
 #define  TUNNEL_PROTOCOL      10
 #define  NAME_DISTRIBUTOR     11
 #define  MSG_FRAGMENTER       12
@@ -87,6 +89,7 @@ struct plist;
 #define BASIC_H_SIZE              32	/* Basic payload message */
 #define NAMED_H_SIZE              40	/* Named payload message */
 #define MCAST_H_SIZE              44	/* Multicast payload message */
+#define GROUP_H_SIZE              44	/* Group payload message */
 #define INT_H_SIZE                40	/* Internal messages */
 #define MIN_H_SIZE                24	/* Smallest legal TIPC header size */
 #define MAX_H_SIZE                60	/* Largest possible TIPC header size */
@@ -252,6 +255,11 @@ static inline void msg_set_type(struct tipc_msg *m, u32 n)
 	msg_set_bits(m, 1, 29, 0x7, n);
 }
 
+static inline int msg_in_group(struct tipc_msg *m)
+{
+	return (msg_type(m) == TIPC_GRP_BCAST_MSG);
+}
+
 static inline u32 msg_named(struct tipc_msg *m)
 {
 	return msg_type(m) == TIPC_NAMED_MSG;
@@ -259,7 +267,9 @@ static inline u32 msg_named(struct tipc_msg *m)
 
 static inline u32 msg_mcast(struct tipc_msg *m)
 {
-	return msg_type(m) == TIPC_MCAST_MSG;
+	int mtyp = msg_type(m);
+
+	return ((mtyp == TIPC_MCAST_MSG) || (mtyp == TIPC_GRP_BCAST_MSG));
 }
 
 static inline u32 msg_connected(struct tipc_msg *m)
@@ -514,6 +524,12 @@ static inline void msg_set_nameupper(struct tipc_msg *m, u32 n)
 #define DSC_REQ_MSG		0
 #define DSC_RESP_MSG		1
 
+/*
+ * Group protocol message types
+ */
+#define GRP_JOIN_MSG         0
+#define GRP_LEAVE_MSG        1
+
 /*
  * Word 1
  */
@@ -795,6 +811,28 @@ static inline void msg_set_link_tolerance(struct tipc_msg *m, u32 n)
 	msg_set_bits(m, 9, 0, 0xffff, n);
 }
 
+static inline u16 msg_grp_bc_syncpt(struct tipc_msg *m)
+{
+	return msg_bits(m, 9, 16, 0xffff);
+}
+
+static inline void msg_set_grp_bc_syncpt(struct tipc_msg *m, u16 n)
+{
+	msg_set_bits(m, 9, 16, 0xffff, n);
+}
+
+/* Word 10
+ */
+static inline u16 msg_grp_bc_seqno(struct tipc_msg *m)
+{
+	return msg_bits(m, 10, 16, 0xffff);
+}
+
+static inline void msg_set_grp_bc_seqno(struct tipc_msg *m, u32 n)
+{
+	msg_set_bits(m, 10, 16, 0xffff, n);
+}
+
 static inline bool msg_peer_link_is_up(struct tipc_msg *m)
 {
 	if (likely(msg_user(m) != LINK_PROTOCOL))
diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c
index 76bd2777baaf..114d72bab827 100644
--- a/net/tipc/name_table.c
+++ b/net/tipc/name_table.c
@@ -43,6 +43,7 @@
 #include "bcast.h"
 #include "addr.h"
 #include "node.h"
+#include "group.h"
 #include <net/genetlink.h>
 
 #define TIPC_NAMETBL_SIZE 1024		/* must be a power of 2 */
@@ -596,18 +597,6 @@ not_found:
 	return ref;
 }
 
-/**
- * tipc_nametbl_mc_translate - find multicast destinations
- *
- * Creates list of all local ports that overlap the given multicast address;
- * also determines if any off-node ports overlap.
- *
- * Note: Publications with a scope narrower than 'limit' are ignored.
- * (i.e. local node-scope publications mustn't receive messages arriving
- * from another node, even if the multcast link brought it here)
- *
- * Returns non-zero if any off-node ports overlap
- */
 int tipc_nametbl_mc_translate(struct net *net, u32 type, u32 lower, u32 upper,
 			      u32 limit, struct list_head *dports)
 {
@@ -679,6 +668,37 @@ exit:
 	rcu_read_unlock();
 }
 
+/* tipc_nametbl_build_group - build list of communication group members
+ */
+void tipc_nametbl_build_group(struct net *net, struct tipc_group *grp,
+			      u32 type, u32 domain)
+{
+	struct sub_seq *sseq, *stop;
+	struct name_info *info;
+	struct publication *p;
+	struct name_seq *seq;
+
+	rcu_read_lock();
+	seq = nametbl_find_seq(net, type);
+	if (!seq)
+		goto exit;
+
+	spin_lock_bh(&seq->lock);
+	sseq = seq->sseqs;
+	stop = seq->sseqs + seq->first_free;
+	for (; sseq != stop; sseq++) {
+		info = sseq->info;
+		list_for_each_entry(p, &info->zone_list, zone_list) {
+			if (!tipc_in_scope(domain, p->node))
+				continue;
+			tipc_group_add_member(grp, p->node, p->ref);
+		}
+	}
+	spin_unlock_bh(&seq->lock);
+exit:
+	rcu_read_unlock();
+}
+
 /*
  * tipc_nametbl_publish - add name publication to network name tables
  */
diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h
index d121175a92b5..97646b17a4a2 100644
--- a/net/tipc/name_table.h
+++ b/net/tipc/name_table.h
@@ -40,6 +40,7 @@
 struct tipc_subscription;
 struct tipc_plist;
 struct tipc_nlist;
+struct tipc_group;
 
 /*
  * TIPC name types reserved for internal TIPC use (both current and planned)
@@ -101,6 +102,8 @@ int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb);
 u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance, u32 *node);
 int tipc_nametbl_mc_translate(struct net *net, u32 type, u32 lower, u32 upper,
 			      u32 limit, struct list_head *dports);
+void tipc_nametbl_build_group(struct net *net, struct tipc_group *grp,
+			      u32 type, u32 domain);
 void tipc_nametbl_lookup_dst_nodes(struct net *net, u32 type, u32 lower,
 				   u32 upper, u32 domain,
 				   struct tipc_nlist *nodes);
diff --git a/net/tipc/node.h b/net/tipc/node.h
index df2f2197c4ad..acd58d23a70e 100644
--- a/net/tipc/node.h
+++ b/net/tipc/node.h
@@ -48,7 +48,8 @@ enum {
 	TIPC_BCAST_SYNCH      = (1 << 1),
 	TIPC_BCAST_STATE_NACK = (1 << 2),
 	TIPC_BLOCK_FLOWCTL    = (1 << 3),
-	TIPC_BCAST_RCAST      = (1 << 4)
+	TIPC_BCAST_RCAST      = (1 << 4),
+	TIPC_MCAST_GROUPS     = (1 << 5)
 };
 
 #define TIPC_NODE_CAPABILITIES (TIPC_BCAST_SYNCH | \
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index daf7c4df4531..64bbf9d03629 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -1,7 +1,7 @@
 /*
  * net/tipc/socket.c: TIPC socket API
  *
- * Copyright (c) 2001-2007, 2012-2016, Ericsson AB
+ * Copyright (c) 2001-2007, 2012-2017, Ericsson AB
  * Copyright (c) 2004-2008, 2010-2013, Wind River Systems
  * All rights reserved.
  *
@@ -45,6 +45,7 @@
 #include "socket.h"
 #include "bcast.h"
 #include "netlink.h"
+#include "group.h"
 
 #define CONN_TIMEOUT_DEFAULT	8000	/* default connect timeout = 8s */
 #define CONN_PROBING_INTERVAL	msecs_to_jiffies(3600000)  /* [ms] => 1 h */
@@ -78,7 +79,7 @@ enum {
  * @conn_timeout: the time we can wait for an unresponded setup request
  * @dupl_rcvcnt: number of bytes counted twice, in both backlog and rcv queue
  * @cong_link_cnt: number of congested links
- * @sent_unacked: # messages sent by socket, and not yet acked by peer
+ * @snt_unacked: # messages sent by socket, and not yet acked by peer
  * @rcv_unacked: # messages read by user, but not yet acked back to peer
  * @peer: 'connected' peer for dgram/rdm
  * @node: hash table node
@@ -109,6 +110,7 @@ struct tipc_sock {
 	struct rhash_head node;
 	struct tipc_mc_method mc_method;
 	struct rcu_head rcu;
+	struct tipc_group *group;
 };
 
 static int tipc_sk_backlog_rcv(struct sock *sk, struct sk_buff *skb);
@@ -123,6 +125,7 @@ static int tipc_sk_publish(struct tipc_sock *tsk, uint scope,
 			   struct tipc_name_seq const *seq);
 static int tipc_sk_withdraw(struct tipc_sock *tsk, uint scope,
 			    struct tipc_name_seq const *seq);
+static int tipc_sk_leave(struct tipc_sock *tsk);
 static struct tipc_sock *tipc_sk_lookup(struct net *net, u32 portid);
 static int tipc_sk_insert(struct tipc_sock *tsk);
 static void tipc_sk_remove(struct tipc_sock *tsk);
@@ -559,6 +562,7 @@ static int tipc_release(struct socket *sock)
 
 	__tipc_shutdown(sock, TIPC_ERR_NO_PORT);
 	sk->sk_shutdown = SHUTDOWN_MASK;
+	tipc_sk_leave(tsk);
 	tipc_sk_withdraw(tsk, 0, NULL);
 	sk_stop_timer(sk, &sk->sk_timer);
 	tipc_sk_remove(tsk);
@@ -601,7 +605,10 @@ static int tipc_bind(struct socket *sock, struct sockaddr *uaddr,
 		res = tipc_sk_withdraw(tsk, 0, NULL);
 		goto exit;
 	}
-
+	if (tsk->group) {
+		res = -EACCES;
+		goto exit;
+	}
 	if (uaddr_len < sizeof(struct sockaddr_tipc)) {
 		res = -EINVAL;
 		goto exit;
@@ -698,6 +705,7 @@ static unsigned int tipc_poll(struct file *file, struct socket *sock,
 {
 	struct sock *sk = sock->sk;
 	struct tipc_sock *tsk = tipc_sk(sk);
+	struct tipc_group *grp = tsk->group;
 	u32 mask = 0;
 
 	sock_poll_wait(file, sk_sleep(sk), wait);
@@ -718,8 +726,9 @@ static unsigned int tipc_poll(struct file *file, struct socket *sock,
 			mask |= (POLLIN | POLLRDNORM);
 		break;
 	case TIPC_OPEN:
-		if (!tsk->cong_link_cnt)
-			mask |= POLLOUT;
+		if (!grp || tipc_group_size(grp))
+			if (!tsk->cong_link_cnt)
+				mask |= POLLOUT;
 		if (tipc_sk_type_connectionless(sk) &&
 		    (!skb_queue_empty(&sk->sk_receive_queue)))
 			mask |= (POLLIN | POLLRDNORM);
@@ -757,6 +766,9 @@ static int tipc_sendmcast(struct  socket *sock, struct tipc_name_seq *seq,
 	struct tipc_nlist dsts;
 	int rc;
 
+	if (tsk->group)
+		return -EACCES;
+
 	/* Block or return if any destination link is congested */
 	rc = tipc_wait_for_cond(sock, &timeout, !tsk->cong_link_cnt);
 	if (unlikely(rc))
@@ -793,6 +805,64 @@ static int tipc_sendmcast(struct  socket *sock, struct tipc_name_seq *seq,
 	return rc ? rc : dlen;
 }
 
+/**
+ * tipc_send_group_bcast - send message to all members in communication group
+ * @sock: socket structure
+ * @m: message to send
+ * @dlen: total length of message data
+ * @timeout: timeout to wait for wakeup
+ *
+ * Called from __tipc_sendmsg(), which has done all sanity checks
+ * Returns @dlen on success, -EHOSTUNREACH if no destination exists, or errno
+ */
+static int tipc_send_group_bcast(struct socket *sock, struct msghdr *m,
+				 int dlen, long timeout)
+{
+	struct sock *sk = sock->sk;
+	struct net *net = sock_net(sk);
+	struct tipc_sock *tsk = tipc_sk(sk);
+	struct tipc_group *grp = tsk->group;
+	struct tipc_nlist *dsts = tipc_group_dests(grp);
+	struct tipc_mc_method *method = &tsk->mc_method;
+	struct tipc_msg *hdr = &tsk->phdr;
+	int mtu = tipc_bcast_get_mtu(net);
+	struct sk_buff_head pkts;
+	int rc;
+
+	if (!dsts->local && !dsts->remote)
+		return -EHOSTUNREACH;
+
+	/* Block or return if any destination link is congested */
+	rc = tipc_wait_for_cond(sock, &timeout, !tsk->cong_link_cnt);
+	if (unlikely(rc))
+		return rc;
+
+	/* Complete message header; group messages use the group header size */
+	msg_set_type(hdr, TIPC_GRP_BCAST_MSG);
+	msg_set_hdr_sz(hdr, GROUP_H_SIZE);
+	msg_set_destport(hdr, 0);
+	msg_set_destnode(hdr, 0);
+	msg_set_nameinst(hdr, 0);
+	msg_set_grp_bc_seqno(hdr, tipc_group_bc_snd_nxt(grp));
+
+	/* Build message as chain of buffers */
+	skb_queue_head_init(&pkts);
+	rc = tipc_msg_build(hdr, m, 0, dlen, mtu, &pkts);
+	if (unlikely(rc != dlen))
+		return rc;
+
+	/* Send message */
+	rc = tipc_mcast_xmit(net, &pkts, method, dsts,
+			     &tsk->cong_link_cnt);
+	if (unlikely(rc))
+		return rc;
+
+	/* Update broadcast sequence number */
+	tipc_group_update_bc_members(grp);
+
+	return dlen;
+}
+
 /**
  * tipc_sk_mcast_rcv - Deliver multicast messages to all destination sockets
  * @arrvq: queue with arriving messages, to be cloned after destination lookup
@@ -803,13 +873,15 @@ static int tipc_sendmcast(struct  socket *sock, struct tipc_name_seq *seq,
 void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq,
 		       struct sk_buff_head *inputq)
 {
-	struct tipc_msg *msg;
-	struct list_head dports;
-	u32 portid;
 	u32 scope = TIPC_CLUSTER_SCOPE;
-	struct sk_buff_head tmpq;
-	uint hsz;
+	u32 self = tipc_own_addr(net);
 	struct sk_buff *skb, *_skb;
+	u32 lower = 0, upper = ~0;
+	struct sk_buff_head tmpq;
+	u32 portid, oport, onode;
+	struct list_head dports;
+	struct tipc_msg *msg;
+	int hsz;
 
 	__skb_queue_head_init(&tmpq);
 	INIT_LIST_HEAD(&dports);
@@ -818,14 +890,18 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq,
 	for (; skb; skb = tipc_skb_peek(arrvq, &inputq->lock)) {
 		msg = buf_msg(skb);
 		hsz = skb_headroom(skb) + msg_hdr_sz(msg);
-
-		if (in_own_node(net, msg_orignode(msg)))
+		oport = msg_origport(msg);
+		onode = msg_orignode(msg);
+		if (onode == self)
 			scope = TIPC_NODE_SCOPE;
 
 		/* Create destination port list and message clones: */
-		tipc_nametbl_mc_translate(net,
-					  msg_nametype(msg), msg_namelower(msg),
-					  msg_nameupper(msg), scope, &dports);
+		if (!msg_in_group(msg)) {
+			lower = msg_namelower(msg);
+			upper = msg_nameupper(msg);
+		}
+		tipc_nametbl_mc_translate(net, msg_nametype(msg), lower, upper,
+					  scope, &dports);
 		while (tipc_dest_pop(&dports, NULL, &portid)) {
 			_skb = __pskb_copy(skb, hsz, GFP_ATOMIC);
 			if (_skb) {
@@ -895,10 +971,6 @@ exit:
 	kfree_skb(skb);
 }
 
-static void tipc_sk_top_evt(struct tipc_sock *tsk, struct tipc_event *evt)
-{
-}
-
 /**
  * tipc_sendmsg - send message in connectionless manner
  * @sock: socket structure
@@ -934,6 +1006,7 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen)
 	long timeout = sock_sndtimeo(sk, m->msg_flags & MSG_DONTWAIT);
 	struct list_head *clinks = &tsk->cong_links;
 	bool syn = !tipc_sk_type_connectionless(sk);
+	struct tipc_group *grp = tsk->group;
 	struct tipc_msg *hdr = &tsk->phdr;
 	struct tipc_name_seq *seq;
 	struct sk_buff_head pkts;
@@ -944,6 +1017,9 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen)
 	if (unlikely(dlen > TIPC_MAX_USER_MSG_SIZE))
 		return -EMSGSIZE;
 
+	if (unlikely(grp))
+		return tipc_send_group_bcast(sock, m, dlen, timeout);
+
 	if (unlikely(!dest)) {
 		dest = &tsk->peer;
 		if (!syn || dest->family != AF_TIPC)
@@ -1543,6 +1619,7 @@ static void tipc_sk_proto_rcv(struct sock *sk,
 	struct sk_buff *skb = __skb_dequeue(inputq);
 	struct tipc_sock *tsk = tipc_sk(sk);
 	struct tipc_msg *hdr = buf_msg(skb);
+	struct tipc_group *grp = tsk->group;
 
 	switch (msg_user(hdr)) {
 	case CONN_MANAGER:
@@ -1553,8 +1630,12 @@ static void tipc_sk_proto_rcv(struct sock *sk,
 		tsk->cong_link_cnt--;
 		sk->sk_write_space(sk);
 		break;
+	case GROUP_PROTOCOL:
+		tipc_group_proto_rcv(grp, hdr, xmitq);
+		break;
 	case TOP_SRV:
-		tipc_sk_top_evt(tsk, (void *)msg_data(hdr));
+		tipc_group_member_evt(tsk->group, skb, xmitq);
+		skb = NULL;
 		break;
 	default:
 		break;
@@ -1699,6 +1780,7 @@ static void tipc_sk_filter_rcv(struct sock *sk, struct sk_buff *skb,
 {
 	bool sk_conn = !tipc_sk_type_connectionless(sk);
 	struct tipc_sock *tsk = tipc_sk(sk);
+	struct tipc_group *grp = tsk->group;
 	struct tipc_msg *hdr = buf_msg(skb);
 	struct net *net = sock_net(sk);
 	struct sk_buff_head inputq;
@@ -1710,15 +1792,19 @@ static void tipc_sk_filter_rcv(struct sock *sk, struct sk_buff *skb,
 
 	if (unlikely(!msg_isdata(hdr)))
 		tipc_sk_proto_rcv(sk, &inputq, xmitq);
-	else if (unlikely(msg_type(hdr) > TIPC_DIRECT_MSG))
+	else if (unlikely(msg_type(hdr) > TIPC_GRP_BCAST_MSG))
 		return kfree_skb(skb);
 
+	if (unlikely(grp))
+		tipc_group_filter_msg(grp, &inputq, xmitq);
+
 	/* Validate and add to receive buffer if there is space */
 	while ((skb = __skb_dequeue(&inputq))) {
 		hdr = buf_msg(skb);
 		limit = rcvbuf_limit(sk, skb);
 		if ((sk_conn && !tipc_sk_filter_connect(tsk, skb)) ||
-		    (!sk_conn && msg_connected(hdr)))
+		    (!sk_conn && msg_connected(hdr)) ||
+		    (!grp && msg_in_group(hdr)))
 			err = TIPC_ERR_NO_PORT;
 		else if (sk_rmem_alloc_get(sk) + skb->truesize >= limit)
 			err = TIPC_ERR_OVERLOAD;
@@ -1837,7 +1923,6 @@ void tipc_sk_rcv(struct net *net, struct sk_buff_head *inputq)
 			sock_put(sk);
 			continue;
 		}
-
 		/* No destination socket => dequeue skb if still there */
 		skb = tipc_skb_dequeue(inputq, dport);
 		if (!skb)
@@ -1905,6 +1990,11 @@ static int tipc_connect(struct socket *sock, struct sockaddr *dest,
 
 	lock_sock(sk);
 
+	if (tsk->group) {
+		res = -EINVAL;
+		goto exit;
+	}
+
 	if (dst->family == AF_UNSPEC) {
 		memset(&tsk->peer, 0, sizeof(struct sockaddr_tipc));
 		if (!tipc_sk_type_connectionless(sk))
@@ -2341,6 +2431,65 @@ void tipc_sk_rht_destroy(struct net *net)
 	rhashtable_destroy(&tn->sk_rht);
 }
 
+/* tipc_sk_join - make socket a member of a communication group
+ * @tsk: socket joining the group
+ * @mreq: group type, member instance and scope requested by the user
+ *
+ * Returns 0 on success, -EACCES if the group type is reserved or the socket
+ * already is a group member, -ENOMEM if the group could not be created, or
+ * the error returned by tipc_sk_publish().
+ */
+static int tipc_sk_join(struct tipc_sock *tsk, struct tipc_group_req *mreq)
+{
+	struct net *net = sock_net(&tsk->sk);
+	u32 domain = addr_domain(net, mreq->scope);
+	struct tipc_group *grp = tsk->group;
+	struct tipc_msg *hdr = &tsk->phdr;
+	struct tipc_name_seq seq;
+	int rc;
+
+	if (mreq->type < TIPC_RESERVED_TYPES)
+		return -EACCES;
+	if (grp)
+		return -EACCES;
+	grp = tipc_group_create(net, tsk->portid, mreq);
+	if (!grp)
+		return -ENOMEM;
+	tsk->group = grp;
+	msg_set_lookup_scope(hdr, mreq->scope);
+	msg_set_nametype(hdr, mreq->type);
+	msg_set_dest_droppable(hdr, true);
+	seq.type = mreq->type;
+	seq.lower = mreq->instance;
+	seq.upper = seq.lower;
+	tipc_nametbl_build_group(net, grp, mreq->type, domain);
+	rc = tipc_sk_publish(tsk, mreq->scope, &seq);
+	if (rc) {
+		tipc_group_delete(net, grp);
+		/* Drop the stale reference; otherwise tipc_sk_leave() would
+		 * delete the same group again when the socket is released.
+		 */
+		tsk->group = NULL;
+	}
+	return rc;
+}
+
+static int tipc_sk_leave(struct tipc_sock *tsk)
+{
+	struct net *net = sock_net(&tsk->sk);
+	struct tipc_group *grp = tsk->group;
+	struct tipc_name_seq seq;
+	int scope;
+
+	if (!grp)
+		return -EINVAL;
+	tipc_group_self(grp, &seq, &scope);
+	tipc_group_delete(net, grp);
+	tsk->group = NULL;
+	tipc_sk_withdraw(tsk, scope, &seq);
+	return 0;
+}
+
 /**
  * tipc_setsockopt - set socket option
  * @sock: socket structure
@@ -2359,6 +2495,7 @@ static int tipc_setsockopt(struct socket *sock, int lvl, int opt,
 {
 	struct sock *sk = sock->sk;
 	struct tipc_sock *tsk = tipc_sk(sk);
+	struct tipc_group_req mreq;
 	u32 value = 0;
 	int res = 0;
 
@@ -2374,9 +2511,14 @@ static int tipc_setsockopt(struct socket *sock, int lvl, int opt,
 	case TIPC_CONN_TIMEOUT:
 		if (ol < sizeof(value))
 			return -EINVAL;
-		res = get_user(value, (u32 __user *)ov);
-		if (res)
-			return res;
+		if (get_user(value, (u32 __user *)ov))
+			return -EFAULT;
+		break;
+	case TIPC_GROUP_JOIN:
+		if (ol < sizeof(mreq))
+			return -EINVAL;
+		if (copy_from_user(&mreq, ov, sizeof(mreq)))
+			return -EFAULT;
 		break;
 	default:
 		if (ov || ol)
@@ -2409,6 +2551,12 @@ static int tipc_setsockopt(struct socket *sock, int lvl, int opt,
 		tsk->mc_method.rcast = true;
 		tsk->mc_method.mandatory = true;
 		break;
+	case TIPC_GROUP_JOIN:
+		res = tipc_sk_join(tsk, &mreq);
+		break;
+	case TIPC_GROUP_LEAVE:
+		res = tipc_sk_leave(tsk);
+		break;
 	default:
 		res = -EINVAL;
 	}
@@ -2436,7 +2584,8 @@ static int tipc_getsockopt(struct socket *sock, int lvl, int opt,
 {
 	struct sock *sk = sock->sk;
 	struct tipc_sock *tsk = tipc_sk(sk);
-	int len;
+	struct tipc_name_seq seq;
+	int len, scope;
 	u32 value;
 	int res;
 
@@ -2470,6 +2619,12 @@ static int tipc_getsockopt(struct socket *sock, int lvl, int opt,
 	case TIPC_SOCK_RECVQ_DEPTH:
 		value = skb_queue_len(&sk->sk_receive_queue);
 		break;
+	case TIPC_GROUP_JOIN:
+		seq.type = 0;
+		if (tsk->group)
+			tipc_group_self(tsk->group, &seq, &scope);
+		value = seq.type;
+		break;
 	default:
 		res = -EINVAL;
 	}
-- 
cgit v1.2.3


From ae236fb208a6fbbd2e7a6913385e8fb78ac807f8 Mon Sep 17 00:00:00 2001
From: Jon Maloy <jon.maloy@ericsson.com>
Date: Fri, 13 Oct 2017 11:04:25 +0200
Subject: tipc: receive group membership events via member socket

Like with any other service, group members' availability can be
subscribed for by connecting to the topology server. However, because
the events arrive via a different socket than the member socket, there
is a real risk that membership events may arrive out of synch with the
actual JOIN/LEAVE action. I.e., it is possible to receive the first
messages from a new member before the corresponding JOIN event arrives,
just as it is possible to receive the last messages from a leaving
member after the LEAVE event has already been received.

Since each member socket is internally also subscribing for membership
events, we now fix this problem by passing those events on to the user
via the member socket. We leverage the already present member synch-
ronization protocol to guarantee correct message/event order. An event
is delivered to the user as an empty message where the two source
addresses identify the new/lost member. Furthermore, we set the MSG_OOB
bit in the message flags to mark it as an event. If the event is an
indication about a member loss we also set the MSG_EOR bit, so it can
be distinguished from a member addition event.

Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/tipc.h |  1 +
 net/tipc/group.c          | 60 +++++++++++++++++++++++++++++++++++++----------
 net/tipc/group.h          |  2 ++
 net/tipc/msg.h            | 22 +++++++++++++++--
 net/tipc/socket.c         | 49 ++++++++++++++++++++++++--------------
 5 files changed, 101 insertions(+), 33 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/tipc.h b/include/uapi/linux/tipc.h
index 5f7b2c4a09ab..ef41c11a7f38 100644
--- a/include/uapi/linux/tipc.h
+++ b/include/uapi/linux/tipc.h
@@ -238,6 +238,7 @@ struct sockaddr_tipc {
  * Flag values
  */
 #define TIPC_GROUP_LOOPBACK     0x1  /* Receive copy of sent msg when match */
+#define TIPC_GROUP_MEMBER_EVTS  0x2  /* Receive membership events in socket */
 
 struct tipc_group_req {
 	__u32 type;      /* group id */
diff --git a/net/tipc/group.c b/net/tipc/group.c
index beb214a3420c..1bfa9348b26d 100644
--- a/net/tipc/group.c
+++ b/net/tipc/group.c
@@ -59,6 +59,7 @@ enum mbr_state {
 struct tipc_member {
 	struct rb_node tree_node;
 	struct list_head list;
+	struct sk_buff *event_msg;
 	u32 node;
 	u32 port;
 	u32 instance;
@@ -79,6 +80,7 @@ struct tipc_group {
 	u16 member_cnt;
 	u16 bc_snd_nxt;
 	bool loopback;
+	bool events;
 };
 
 static void tipc_group_proto_xmit(struct tipc_group *grp, struct tipc_member *m,
@@ -117,6 +119,7 @@ struct tipc_group *tipc_group_create(struct net *net, u32 portid,
 	grp->instance = mreq->instance;
 	grp->scope = mreq->scope;
 	grp->loopback = mreq->flags & TIPC_GROUP_LOOPBACK;
+	grp->events = mreq->flags & TIPC_GROUP_MEMBER_EVTS;
 	if (tipc_topsrv_kern_subscr(net, portid, type, 0, ~0, &grp->subid))
 		return grp;
 	kfree(grp);
@@ -279,6 +282,13 @@ void tipc_group_filter_msg(struct tipc_group *grp, struct sk_buff_head *inputq,
 	if (!msg_in_group(hdr))
 		goto drop;
 
+	if (mtyp == TIPC_GRP_MEMBER_EVT) {
+		if (!grp->events)
+			goto drop;
+		__skb_queue_tail(inputq, skb);
+		return;
+	}
+
 	m = tipc_group_find_member(grp, node, port);
 	if (!tipc_group_is_receiver(m))
 		goto drop;
@@ -311,6 +321,7 @@ static void tipc_group_proto_xmit(struct tipc_group *grp, struct tipc_member *m,
 }
 
 void tipc_group_proto_rcv(struct tipc_group *grp, struct tipc_msg *hdr,
+			  struct sk_buff_head *inputq,
 			  struct sk_buff_head *xmitq)
 {
 	u32 node = msg_orignode(hdr);
@@ -332,10 +343,12 @@ void tipc_group_proto_rcv(struct tipc_group *grp, struct tipc_msg *hdr,
 		m->bc_rcv_nxt = msg_grp_bc_syncpt(hdr);
 
 		/* Wait until PUBLISH event is received */
-		if (m->state == MBR_DISCOVERED)
+		if (m->state == MBR_DISCOVERED) {
 			m->state = MBR_JOINING;
-		else if (m->state == MBR_PUBLISHED)
+		} else if (m->state == MBR_PUBLISHED) {
 			m->state = MBR_JOINED;
+			__skb_queue_tail(inputq, m->event_msg);
+		}
 		return;
 	case GRP_LEAVE_MSG:
 		if (!m)
@@ -347,6 +360,7 @@ void tipc_group_proto_rcv(struct tipc_group *grp, struct tipc_msg *hdr,
 			return;
 		}
 		/* Otherwise deliver already received WITHDRAW event */
+		__skb_queue_tail(inputq, m->event_msg);
 		tipc_group_delete_member(grp, m);
 		return;
 	default:
@@ -354,16 +368,17 @@ void tipc_group_proto_rcv(struct tipc_group *grp, struct tipc_msg *hdr,
 	}
 }
 
-/* tipc_group_member_evt() - receive and handle a member up/down event
- */
 void tipc_group_member_evt(struct tipc_group *grp,
 			   struct sk_buff *skb,
+			   struct sk_buff_head *inputq,
 			   struct sk_buff_head *xmitq)
 {
 	struct tipc_msg *hdr = buf_msg(skb);
 	struct tipc_event *evt = (void *)msg_data(hdr);
+	u32 instance = evt->found_lower;
 	u32 node = evt->port.node;
 	u32 port = evt->port.ref;
+	int event = evt->event;
 	struct tipc_member *m;
 	struct net *net;
 	u32 self;
@@ -376,32 +391,51 @@ void tipc_group_member_evt(struct tipc_group *grp,
 	if (!grp->loopback && node == self && port == grp->portid)
 		goto drop;
 
+	/* Convert message before delivery to user */
+	msg_set_hdr_sz(hdr, GROUP_H_SIZE);
+	msg_set_user(hdr, TIPC_CRITICAL_IMPORTANCE);
+	msg_set_type(hdr, TIPC_GRP_MEMBER_EVT);
+	msg_set_origport(hdr, port);
+	msg_set_orignode(hdr, node);
+	msg_set_nametype(hdr, grp->type);
+	msg_set_grp_evt(hdr, event);
+
 	m = tipc_group_find_member(grp, node, port);
 
-	if (evt->event == TIPC_PUBLISHED) {
+	if (event == TIPC_PUBLISHED) {
 		if (!m)
 			m = tipc_group_create_member(grp, node, port,
 						     MBR_DISCOVERED);
 		if (!m)
 			goto drop;
 
-		/* Wait if JOIN message not yet received */
-		if (m->state == MBR_DISCOVERED)
+		/* Hold back event if JOIN message not yet received */
+		if (m->state == MBR_DISCOVERED) {
+			m->event_msg = skb;
 			m->state = MBR_PUBLISHED;
-		else
+		} else {
+			__skb_queue_tail(inputq, skb);
 			m->state = MBR_JOINED;
-		m->instance = evt->found_lower;
+		}
+		m->instance = instance;
+		TIPC_SKB_CB(skb)->orig_member = m->instance;
 		tipc_group_proto_xmit(grp, m, GRP_JOIN_MSG, xmitq);
-	} else if (evt->event == TIPC_WITHDRAWN) {
+	} else if (event == TIPC_WITHDRAWN) {
 		if (!m)
 			goto drop;
 
-		/* Keep back event if more messages might be expected */
-		if (m->state != MBR_LEAVING && tipc_node_is_up(net, node))
+		TIPC_SKB_CB(skb)->orig_member = m->instance;
+
+		/* Hold back event if more messages might be expected */
+		if (m->state != MBR_LEAVING && tipc_node_is_up(net, node)) {
+			m->event_msg = skb;
 			m->state = MBR_LEAVING;
-		else
+		} else {
+			__skb_queue_tail(inputq, skb);
 			tipc_group_delete_member(grp, m);
+		}
 	}
+	return;
 drop:
 	kfree_skb(skb);
 }
diff --git a/net/tipc/group.h b/net/tipc/group.h
index 9bdf4479fc03..5d3f10d28967 100644
--- a/net/tipc/group.h
+++ b/net/tipc/group.h
@@ -54,9 +54,11 @@ void tipc_group_filter_msg(struct tipc_group *grp,
 			   struct sk_buff_head *xmitq);
 void tipc_group_member_evt(struct tipc_group *grp,
 			   struct sk_buff *skb,
+			   struct sk_buff_head *inputq,
 			   struct sk_buff_head *xmitq);
 void tipc_group_proto_rcv(struct tipc_group *grp,
 			  struct tipc_msg *hdr,
+			  struct sk_buff_head *inputq,
 			  struct sk_buff_head *xmitq);
 void tipc_group_update_bc_members(struct tipc_group *grp);
 u16 tipc_group_bc_snd_nxt(struct tipc_group *grp);
diff --git a/net/tipc/msg.h b/net/tipc/msg.h
index e438716d2372..1b527b154e46 100644
--- a/net/tipc/msg.h
+++ b/net/tipc/msg.h
@@ -65,7 +65,8 @@ struct plist;
 #define TIPC_MCAST_MSG          1
 #define TIPC_NAMED_MSG          2
 #define TIPC_DIRECT_MSG         3
-#define TIPC_GRP_BCAST_MSG      4
+#define TIPC_GRP_MEMBER_EVT     4
+#define TIPC_GRP_BCAST_MSG      5
 
 /*
  * Internal message users
@@ -258,7 +259,14 @@ static inline void msg_set_type(struct tipc_msg *m, u32 n)
 
 static inline int msg_in_group(struct tipc_msg *m)
 {
-	return (msg_type(m) == TIPC_GRP_BCAST_MSG);
+	int mtyp = msg_type(m);
+
+	return (mtyp == TIPC_GRP_BCAST_MSG) || (mtyp == TIPC_GRP_MEMBER_EVT);
+}
+
+static inline bool msg_is_grp_evt(struct tipc_msg *m)
+{
+	return msg_type(m) == TIPC_GRP_MEMBER_EVT;
 }
 
 static inline u32 msg_named(struct tipc_msg *m)
@@ -824,6 +832,16 @@ static inline void msg_set_grp_bc_syncpt(struct tipc_msg *m, u16 n)
 
 /* Word 10
  */
+static inline u16 msg_grp_evt(struct tipc_msg *m)
+{
+	return msg_bits(m, 10, 0, 0x3);
+}
+
+static inline void msg_set_grp_evt(struct tipc_msg *m, int n)
+{
+	msg_set_bits(m, 10, 0, 0x3, n);
+}
+
 static inline u16 msg_grp_bc_seqno(struct tipc_msg *m)
 {
 	return msg_bits(m, 10, 16, 0xffff);
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 25ecf1201527..0a2eac309177 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -709,41 +709,43 @@ static unsigned int tipc_poll(struct file *file, struct socket *sock,
 			      poll_table *wait)
 {
 	struct sock *sk = sock->sk;
+	struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
 	struct tipc_sock *tsk = tipc_sk(sk);
 	struct tipc_group *grp = tsk->group;
-	u32 mask = 0;
+	u32 revents = 0;
 
 	sock_poll_wait(file, sk_sleep(sk), wait);
 
 	if (sk->sk_shutdown & RCV_SHUTDOWN)
-		mask |= POLLRDHUP | POLLIN | POLLRDNORM;
+		revents |= POLLRDHUP | POLLIN | POLLRDNORM;
 	if (sk->sk_shutdown == SHUTDOWN_MASK)
-		mask |= POLLHUP;
+		revents |= POLLHUP;
 
 	switch (sk->sk_state) {
 	case TIPC_ESTABLISHED:
 		if (!tsk->cong_link_cnt && !tsk_conn_cong(tsk))
-			mask |= POLLOUT;
+			revents |= POLLOUT;
 		/* fall thru' */
 	case TIPC_LISTEN:
 	case TIPC_CONNECTING:
-		if (!skb_queue_empty(&sk->sk_receive_queue))
-			mask |= (POLLIN | POLLRDNORM);
+		if (skb)
+			revents |= POLLIN | POLLRDNORM;
 		break;
 	case TIPC_OPEN:
 		if (!grp || tipc_group_size(grp))
 			if (!tsk->cong_link_cnt)
-				mask |= POLLOUT;
-		if (tipc_sk_type_connectionless(sk) &&
-		    (!skb_queue_empty(&sk->sk_receive_queue)))
-			mask |= (POLLIN | POLLRDNORM);
+				revents |= POLLOUT;
+		if (!tipc_sk_type_connectionless(sk))
+			break;
+		if (!skb)
+			break;
+		revents |= POLLIN | POLLRDNORM;
 		break;
 	case TIPC_DISCONNECTING:
-		mask = (POLLIN | POLLRDNORM | POLLHUP);
+		revents = POLLIN | POLLRDNORM | POLLHUP;
 		break;
 	}
-
-	return mask;
+	return revents;
 }
 
 /**
@@ -1415,11 +1417,12 @@ static int tipc_recvmsg(struct socket *sock, struct msghdr *m,
 			size_t buflen,	int flags)
 {
 	struct sock *sk = sock->sk;
-	struct tipc_sock *tsk = tipc_sk(sk);
-	struct sk_buff *skb;
-	struct tipc_msg *hdr;
 	bool connected = !tipc_sk_type_connectionless(sk);
+	struct tipc_sock *tsk = tipc_sk(sk);
 	int rc, err, hlen, dlen, copy;
+	struct tipc_msg *hdr;
+	struct sk_buff *skb;
+	bool grp_evt;
 	long timeout;
 
 	/* Catch invalid receive requests */
@@ -1443,6 +1446,7 @@ static int tipc_recvmsg(struct socket *sock, struct msghdr *m,
 		dlen = msg_data_sz(hdr);
 		hlen = msg_hdr_sz(hdr);
 		err = msg_errcode(hdr);
+		grp_evt = msg_is_grp_evt(hdr);
 		if (likely(dlen || err))
 			break;
 		tsk_advance_rx_queue(sk);
@@ -1469,11 +1473,20 @@ static int tipc_recvmsg(struct socket *sock, struct msghdr *m,
 	if (unlikely(rc))
 		goto exit;
 
+	/* Mark message as group event if applicable */
+	if (unlikely(grp_evt)) {
+		if (msg_grp_evt(hdr) == TIPC_WITHDRAWN)
+			m->msg_flags |= MSG_EOR;
+		m->msg_flags |= MSG_OOB;
+		copy = 0;
+	}
+
 	/* Caption of data or error code/rejected data was successful */
 	if (unlikely(flags & MSG_PEEK))
 		goto exit;
 
 	tsk_advance_rx_queue(sk);
+
 	if (likely(!connected))
 		goto exit;
 
@@ -1648,10 +1661,10 @@ static void tipc_sk_proto_rcv(struct sock *sk,
 		sk->sk_write_space(sk);
 		break;
 	case GROUP_PROTOCOL:
-		tipc_group_proto_rcv(grp, hdr, xmitq);
+		tipc_group_proto_rcv(grp, hdr, inputq, xmitq);
 		break;
 	case TOP_SRV:
-		tipc_group_member_evt(tsk->group, skb, xmitq);
+		tipc_group_member_evt(tsk->group, skb, inputq, xmitq);
 		skb = NULL;
 		break;
 	default:
-- 
cgit v1.2.3


From 4e8b86c062695454df0b76f3fee4fab8dc4bb716 Mon Sep 17 00:00:00 2001
From: Amritha Nambiar <amritha.nambiar@intel.com>
Date: Thu, 7 Sep 2017 04:00:06 -0700
Subject: mqprio: Introduce new hardware offload mode and shaper in mqprio

The offload types currently supported in mqprio are 0 (no offload) and
1 (offload only TCs) by setting these values for the 'hw' option. If
offloads are supported by setting the 'hw' option to 1, the default
offload mode is 'dcb' where only the TC values are offloaded to the
device. This patch introduces a new hardware offload mode called
'channel' with 'hw' set to 1 in mqprio which makes full use of the
mqprio options, the TCs, the queue configurations and the QoS parameters
for the TCs. This is achieved through a new netlink attribute for the
'mode' option which takes values such as 'dcb' (default) and 'channel'.
The 'channel' mode also supports QoS attributes for traffic class such as
minimum and maximum values for bandwidth rate limits.

This patch enables configuring additional HW shaper attributes associated
with a traffic class. Currently the shaper for bandwidth rate limiting is
supported; it takes options such as minimum and maximum bandwidth rates,
which are offloaded to the hardware in the 'channel' mode. The min and max
limits for bandwidth rates are provided by the user along with the TCs
and the queue configurations when creating the mqprio qdisc. The interface
can be extended to support new HW shapers in future through the 'shaper'
attribute.

This patch introduces a new data structure, 'tc_mqprio_qopt_offload', for
offloading mqprio queue options; it is shared between the kernel and the
device driver. It contains a copy of the existing data structure
for mqprio queue options. This new data structure can be extended when
adding new attributes for traffic class such as mode, shaper, shaper
parameters (bandwidth rate limits). The existing data structure for mqprio
queue options will be shared between the kernel and userspace.

Example:
  queues 4@0 4@4 hw 1 mode channel shaper bw_rlimit\
  min_rate 1Gbit 2Gbit max_rate 4Gbit 5Gbit

To dump the bandwidth rates:

qdisc mqprio 804a: root  tc 2 map 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0
             queues:(0:3) (4:7)
             mode:channel
             shaper:bw_rlimit   min_rate:1Gbit 2Gbit   max_rate:4Gbit 5Gbit

Signed-off-by: Amritha Nambiar <amritha.nambiar@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 include/net/pkt_cls.h          |   9 ++
 include/uapi/linux/pkt_sched.h |  32 +++++++
 net/sched/sch_mqprio.c         | 183 +++++++++++++++++++++++++++++++++++++++--
 3 files changed, 215 insertions(+), 9 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index f5263743076b..60d39789e4f0 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -546,6 +546,15 @@ struct tc_cls_bpf_offload {
 	u32 gen_flags;
 };
 
+struct tc_mqprio_qopt_offload {
+	/* struct tc_mqprio_qopt must always be the first element */
+	struct tc_mqprio_qopt qopt;
+	u16 mode;
+	u16 shaper;
+	u32 flags;
+	u64 min_rate[TC_QOPT_MAX_QUEUE];
+	u64 max_rate[TC_QOPT_MAX_QUEUE];
+};
 
 /* This structure holds cookie structure that is passed from user
  * to the kernel for actions and classifiers
diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 099bf5528fed..e95b5c9b9fad 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -625,6 +625,22 @@ enum {
 
 #define TC_MQPRIO_HW_OFFLOAD_MAX (__TC_MQPRIO_HW_OFFLOAD_MAX - 1)
 
+enum {
+	TC_MQPRIO_MODE_DCB,
+	TC_MQPRIO_MODE_CHANNEL,
+	__TC_MQPRIO_MODE_MAX
+};
+
+#define TC_MQPRIO_MODE_MAX (__TC_MQPRIO_MODE_MAX - 1)	/* highest valid mode */
+
+enum {
+	TC_MQPRIO_SHAPER_DCB,
+	TC_MQPRIO_SHAPER_BW_RATE,	/* Add new shapers below */
+	__TC_MQPRIO_SHAPER_MAX
+};
+
+#define TC_MQPRIO_SHAPER_MAX (__TC_MQPRIO_SHAPER_MAX - 1)	/* highest valid shaper */
+
 struct tc_mqprio_qopt {
 	__u8	num_tc;
 	__u8	prio_tc_map[TC_QOPT_BITMASK + 1];
@@ -633,6 +649,22 @@ struct tc_mqprio_qopt {
 	__u16	offset[TC_QOPT_MAX_QUEUE];
 };
 
+#define TC_MQPRIO_F_MODE		0x1
+#define TC_MQPRIO_F_SHAPER		0x2
+#define TC_MQPRIO_F_MIN_RATE		0x4
+#define TC_MQPRIO_F_MAX_RATE		0x8
+
+enum {
+	TCA_MQPRIO_UNSPEC,
+	TCA_MQPRIO_MODE,
+	TCA_MQPRIO_SHAPER,
+	TCA_MQPRIO_MIN_RATE64,
+	TCA_MQPRIO_MAX_RATE64,
+	__TCA_MQPRIO_MAX,
+};
+
+#define TCA_MQPRIO_MAX (__TCA_MQPRIO_MAX - 1)
+
 /* SFB */
 
 enum {
diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
index 6bcdfe6e7b63..f1ae9be83934 100644
--- a/net/sched/sch_mqprio.c
+++ b/net/sched/sch_mqprio.c
@@ -18,10 +18,16 @@
 #include <net/netlink.h>
 #include <net/pkt_sched.h>
 #include <net/sch_generic.h>
+#include <net/pkt_cls.h>
 
 struct mqprio_sched {
 	struct Qdisc		**qdiscs;
+	u16 mode;
+	u16 shaper;
 	int hw_offload;
+	u32 flags;
+	u64 min_rate[TC_QOPT_MAX_QUEUE];
+	u64 max_rate[TC_QOPT_MAX_QUEUE];
 };
 
 static void mqprio_destroy(struct Qdisc *sch)
@@ -39,9 +45,17 @@ static void mqprio_destroy(struct Qdisc *sch)
 	}
 
 	if (priv->hw_offload && dev->netdev_ops->ndo_setup_tc) {
-		struct tc_mqprio_qopt mqprio = {};
+		struct tc_mqprio_qopt_offload mqprio = { { 0 } };
 
-		dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_MQPRIO, &mqprio);
+		switch (priv->mode) {
+		case TC_MQPRIO_MODE_DCB:
+		case TC_MQPRIO_MODE_CHANNEL:
+			dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_MQPRIO,
+						      &mqprio);
+			break;
+		default:
+			return;
+		}
 	} else {
 		netdev_set_num_tc(dev, 0);
 	}
@@ -97,6 +111,26 @@ static int mqprio_parse_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt)
 	return 0;
 }
 
+static const struct nla_policy mqprio_policy[TCA_MQPRIO_MAX + 1] = {
+	[TCA_MQPRIO_MODE]	= { .len = sizeof(u16) },
+	[TCA_MQPRIO_SHAPER]	= { .len = sizeof(u16) },
+	[TCA_MQPRIO_MIN_RATE64]	= { .type = NLA_NESTED },
+	[TCA_MQPRIO_MAX_RATE64]	= { .type = NLA_NESTED },
+};
+
+static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
+		      const struct nla_policy *policy, int len)
+{
+	int nested_len = nla_len(nla) - NLA_ALIGN(len);
+
+	if (nested_len >= nla_attr_size(0))
+		return nla_parse(tb, maxtype, nla_data(nla) + NLA_ALIGN(len),
+				 nested_len, policy, NULL);
+
+	memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
+	return 0;
+}
+
 static int mqprio_init(struct Qdisc *sch, struct nlattr *opt)
 {
 	struct net_device *dev = qdisc_dev(sch);
@@ -105,6 +139,10 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt)
 	struct Qdisc *qdisc;
 	int i, err = -EOPNOTSUPP;
 	struct tc_mqprio_qopt *qopt = NULL;
+	struct nlattr *tb[TCA_MQPRIO_MAX + 1];
+	struct nlattr *attr;
+	int rem;
+	int len = nla_len(opt) - NLA_ALIGN(sizeof(*qopt));
 
 	BUILD_BUG_ON(TC_MAX_QUEUE != TC_QOPT_MAX_QUEUE);
 	BUILD_BUG_ON(TC_BITMASK != TC_QOPT_BITMASK);
@@ -122,6 +160,58 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt)
 	if (mqprio_parse_opt(dev, qopt))
 		return -EINVAL;
 
+	if (len > 0) {
+		err = parse_attr(tb, TCA_MQPRIO_MAX, opt, mqprio_policy,
+				 sizeof(*qopt));
+		if (err < 0)
+			return err;
+
+		if (!qopt->hw)
+			return -EINVAL;
+
+		if (tb[TCA_MQPRIO_MODE]) {
+			priv->flags |= TC_MQPRIO_F_MODE;
+			priv->mode = *(u16 *)nla_data(tb[TCA_MQPRIO_MODE]);
+		}
+
+		if (tb[TCA_MQPRIO_SHAPER]) {
+			priv->flags |= TC_MQPRIO_F_SHAPER;
+			priv->shaper = *(u16 *)nla_data(tb[TCA_MQPRIO_SHAPER]);
+		}
+
+		if (tb[TCA_MQPRIO_MIN_RATE64]) {
+			if (priv->shaper != TC_MQPRIO_SHAPER_BW_RATE)
+				return -EINVAL;
+			i = 0;
+			nla_for_each_nested(attr, tb[TCA_MQPRIO_MIN_RATE64],
+					    rem) {
+				if (nla_type(attr) != TCA_MQPRIO_MIN_RATE64)
+					return -EINVAL;
+				if (i >= qopt->num_tc)
+					break;
+				priv->min_rate[i] = *(u64 *)nla_data(attr);
+				i++;
+			}
+			priv->flags |= TC_MQPRIO_F_MIN_RATE;
+		}
+
+		if (tb[TCA_MQPRIO_MAX_RATE64]) {
+			if (priv->shaper != TC_MQPRIO_SHAPER_BW_RATE)
+				return -EINVAL;
+			i = 0;
+			nla_for_each_nested(attr, tb[TCA_MQPRIO_MAX_RATE64],
+					    rem) {
+				if (nla_type(attr) != TCA_MQPRIO_MAX_RATE64)
+					return -EINVAL;
+				if (i >= qopt->num_tc)
+					break;
+				priv->max_rate[i] = *(u64 *)nla_data(attr);
+				i++;
+			}
+			priv->flags |= TC_MQPRIO_F_MAX_RATE;
+		}
+	}
+
 	/* pre-allocate qdisc, attachment can't fail */
 	priv->qdiscs = kcalloc(dev->num_tx_queues, sizeof(priv->qdiscs[0]),
 			       GFP_KERNEL);
@@ -146,14 +236,36 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt)
 	 * supplied and verified mapping
 	 */
 	if (qopt->hw) {
-		struct tc_mqprio_qopt mqprio = *qopt;
+		struct tc_mqprio_qopt_offload mqprio = {.qopt = *qopt};
 
-		err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_MQPRIO,
+		switch (priv->mode) {
+		case TC_MQPRIO_MODE_DCB:
+			if (priv->shaper != TC_MQPRIO_SHAPER_DCB)
+				return -EINVAL;
+			break;
+		case TC_MQPRIO_MODE_CHANNEL:
+			mqprio.flags = priv->flags;
+			if (priv->flags & TC_MQPRIO_F_MODE)
+				mqprio.mode = priv->mode;
+			if (priv->flags & TC_MQPRIO_F_SHAPER)
+				mqprio.shaper = priv->shaper;
+			if (priv->flags & TC_MQPRIO_F_MIN_RATE)
+				for (i = 0; i < mqprio.qopt.num_tc; i++)
+					mqprio.min_rate[i] = priv->min_rate[i];
+			if (priv->flags & TC_MQPRIO_F_MAX_RATE)
+				for (i = 0; i < mqprio.qopt.num_tc; i++)
+					mqprio.max_rate[i] = priv->max_rate[i];
+			break;
+		default:
+			return -EINVAL;
+		}
+		err = dev->netdev_ops->ndo_setup_tc(dev,
+						    TC_SETUP_MQPRIO,
 						    &mqprio);
 		if (err)
 			return err;
 
-		priv->hw_offload = mqprio.hw;
+		priv->hw_offload = mqprio.qopt.hw;
 	} else {
 		netdev_set_num_tc(dev, qopt->num_tc);
 		for (i = 0; i < qopt->num_tc; i++)
@@ -223,11 +335,51 @@ static int mqprio_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new,
 	return 0;
 }
 
+static int dump_rates(struct mqprio_sched *priv,
+		      struct tc_mqprio_qopt *opt, struct sk_buff *skb)
+{
+	struct nlattr *nest;
+	int i;
+
+	if (priv->flags & TC_MQPRIO_F_MIN_RATE) {
+		nest = nla_nest_start(skb, TCA_MQPRIO_MIN_RATE64);
+		if (!nest)
+			goto nla_put_failure;
+
+		for (i = 0; i < opt->num_tc; i++) {
+			if (nla_put(skb, TCA_MQPRIO_MIN_RATE64,
+				    sizeof(priv->min_rate[i]),
+				    &priv->min_rate[i]))
+				goto nla_put_failure;
+		}
+		nla_nest_end(skb, nest);
+	}
+
+	if (priv->flags & TC_MQPRIO_F_MAX_RATE) {
+		nest = nla_nest_start(skb, TCA_MQPRIO_MAX_RATE64);
+		if (!nest)
+			goto nla_put_failure;
+
+		for (i = 0; i < opt->num_tc; i++) {
+			if (nla_put(skb, TCA_MQPRIO_MAX_RATE64,
+				    sizeof(priv->max_rate[i]),
+				    &priv->max_rate[i]))
+				goto nla_put_failure;
+		}
+		nla_nest_end(skb, nest);
+	}
+	return 0;
+
+nla_put_failure:
+	nla_nest_cancel(skb, nest);
+	return -1;
+}
+
 static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb)
 {
 	struct net_device *dev = qdisc_dev(sch);
 	struct mqprio_sched *priv = qdisc_priv(sch);
-	unsigned char *b = skb_tail_pointer(skb);
+	struct nlattr *nla = (struct nlattr *)skb_tail_pointer(skb);
 	struct tc_mqprio_qopt opt = { 0 };
 	struct Qdisc *qdisc;
 	unsigned int i;
@@ -258,12 +410,25 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb)
 		opt.offset[i] = dev->tc_to_txq[i].offset;
 	}
 
-	if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
+	if (nla_put(skb, TCA_OPTIONS, NLA_ALIGN(sizeof(opt)), &opt))
+		goto nla_put_failure;
+
+	if ((priv->flags & TC_MQPRIO_F_MODE) &&
+	    nla_put_u16(skb, TCA_MQPRIO_MODE, priv->mode))
+		goto nla_put_failure;
+
+	if ((priv->flags & TC_MQPRIO_F_SHAPER) &&
+	    nla_put_u16(skb, TCA_MQPRIO_SHAPER, priv->shaper))
+		goto nla_put_failure;
+
+	if ((priv->flags & TC_MQPRIO_F_MIN_RATE ||
+	     priv->flags & TC_MQPRIO_F_MAX_RATE) &&
+	    (dump_rates(priv, &opt, skb) != 0))
 		goto nla_put_failure;
 
-	return skb->len;
+	return nla_nest_end(skb, nla);
 nla_put_failure:
-	nlmsg_trim(skb, b);
+	nlmsg_trim(skb, nla);
 	return -1;
 }
 
-- 
cgit v1.2.3


From fd4f6f2a78aeaebb7094c1bb9b30623d18a86e4c Mon Sep 17 00:00:00 2001
From: Nicolas Pitre <nicolas.pitre@linaro.org>
Date: Thu, 12 Oct 2017 02:16:11 -0400
Subject: cramfs: implement uncompressed and arbitrary data block positioning

Two new capabilities are introduced here:

- The ability to store some blocks uncompressed.

- The ability to locate blocks anywhere.

Those capabilities can be used independently, but the combination
opens the possibility for execute-in-place (XIP) of program text segments
that must remain uncompressed, and in the MMU case, must have a specific
alignment.  It is even possible to still have the writable data segments
from the same file compressed as they have to be copied into RAM anyway.

This is achieved by giving special meanings to some unused block pointer
bits while remaining compatible with legacy cramfs images.

Signed-off-by: Nicolas Pitre <nico@linaro.org>
Tested-by: Chris Brandt <chris.brandt@renesas.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/cramfs/README               | 31 ++++++++++++++-
 fs/cramfs/inode.c              | 90 +++++++++++++++++++++++++++++++++---------
 include/uapi/linux/cramfs_fs.h | 26 +++++++++++-
 3 files changed, 126 insertions(+), 21 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/fs/cramfs/README b/fs/cramfs/README
index 9d4e7ea311f4..d71b27e0ff15 100644
--- a/fs/cramfs/README
+++ b/fs/cramfs/README
@@ -49,17 +49,46 @@ same as the start of the (i+1)'th <block> if there is one).  The first
 <block> immediately follows the last <block_pointer> for the file.
 <block_pointer>s are each 32 bits long.
 
+When the CRAMFS_FLAG_EXT_BLOCK_POINTERS capability bit is set, each
+<block_pointer>'s top bits may contain special flags as follows:
+
+CRAMFS_BLK_FLAG_UNCOMPRESSED (bit 31):
+	The block data is not compressed and should be copied verbatim.
+
+CRAMFS_BLK_FLAG_DIRECT_PTR (bit 30):
+	The <block_pointer> stores the actual block start offset and not
+	its end, shifted right by 2 bits. The block must therefore be
+	aligned to a 4-byte boundary. The block size is blksize if
+	CRAMFS_BLK_FLAG_UNCOMPRESSED is also specified; otherwise
+	the compressed data length is included in the first 2 bytes of
+	the block data. This is used to allow discontiguous data layout
+	and specific data block alignments e.g. for XIP applications.
+
+
 The order of <file_data>'s is a depth-first descent of the directory
 tree, i.e. the same order as `find -size +0 \( -type f -o -type l \)
 -print'.
 
 
 <block>: The i'th <block> is the output of zlib's compress function
-applied to the i'th blksize-sized chunk of the input data.
+applied to the i'th blksize-sized chunk of the input data if the
+corresponding CRAMFS_BLK_FLAG_UNCOMPRESSED <block_pointer> bit is not set,
+otherwise it is the input data directly.
 (For the last <block> of the file, the input may of course be smaller.)
 Each <block> may be a different size.  (See <block_pointer> above.)
+
 <block>s are merely byte-aligned, not generally u32-aligned.
 
+When CRAMFS_BLK_FLAG_DIRECT_PTR is specified then the corresponding
+<block> may be located anywhere and not necessarily contiguous with
+the previous/next blocks. In that case it is minimally u32-aligned.
+If CRAMFS_BLK_FLAG_UNCOMPRESSED is also specified then the size is always
+blksize except for the last block which is limited by the file length.
+If CRAMFS_BLK_FLAG_DIRECT_PTR is set and CRAMFS_BLK_FLAG_UNCOMPRESSED
+is not set then the first 2 bytes of the block contain the size of the
+remaining block data as this cannot be determined from the placement of
+logically adjacent blocks.
+
 
 Holes
 -----
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index bcdccb7a820b..19045453a8f3 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -618,34 +618,86 @@ static int cramfs_readpage(struct file *file, struct page *page)
 
 	if (page->index < maxblock) {
 		struct super_block *sb = inode->i_sb;
-		u32 blkptr_offset = OFFSET(inode) + page->index*4;
-		u32 start_offset, compr_len;
+		u32 blkptr_offset = OFFSET(inode) + page->index * 4;
+		u32 block_ptr, block_start, block_len;
+		bool uncompressed, direct;
 
-		start_offset = OFFSET(inode) + maxblock*4;
 		mutex_lock(&read_mutex);
-		if (page->index)
-			start_offset = *(u32 *) cramfs_read(sb, blkptr_offset-4,
-				4);
-		compr_len = (*(u32 *) cramfs_read(sb, blkptr_offset, 4) -
-			start_offset);
-		mutex_unlock(&read_mutex);
+		block_ptr = *(u32 *) cramfs_read(sb, blkptr_offset, 4);
+		uncompressed = (block_ptr & CRAMFS_BLK_FLAG_UNCOMPRESSED);
+		direct = (block_ptr & CRAMFS_BLK_FLAG_DIRECT_PTR);
+		block_ptr &= ~CRAMFS_BLK_FLAGS;
+
+		if (direct) {
+			/*
+			 * The block pointer is an absolute start pointer,
+			 * shifted by 2 bits. The size is included in the
+			 * first 2 bytes of the data block when compressed,
+			 * or PAGE_SIZE otherwise.
+			 */
+			block_start = block_ptr << CRAMFS_BLK_DIRECT_PTR_SHIFT;
+			if (uncompressed) {
+				block_len = PAGE_SIZE;
+				/* if last block: cap to file length */
+				if (page->index == maxblock - 1)
+					block_len =
+						offset_in_page(inode->i_size);
+			} else {
+				block_len = *(u16 *)
+					cramfs_read(sb, block_start, 2);
+				block_start += 2;
+			}
+		} else {
+			/*
+			 * The block pointer indicates one past the end of
+			 * the current block (start of next block). If this
+			 * is the first block then it starts where the block
+			 * pointer table ends, otherwise its start comes
+			 * from the previous block's pointer.
+			 */
+			block_start = OFFSET(inode) + maxblock * 4;
+			if (page->index)
+				block_start = *(u32 *)
+					cramfs_read(sb, blkptr_offset - 4, 4);
+			/* Beware... previous ptr might be a direct ptr */
+			if (unlikely(block_start & CRAMFS_BLK_FLAG_DIRECT_PTR)) {
+				/* See comments on earlier code. */
+				u32 prev_start = block_start;
+			       block_start = prev_start & ~CRAMFS_BLK_FLAGS;
+			       block_start <<= CRAMFS_BLK_DIRECT_PTR_SHIFT;
+				if (prev_start & CRAMFS_BLK_FLAG_UNCOMPRESSED) {
+					block_start += PAGE_SIZE;
+				} else {
+					block_len = *(u16 *)
+						cramfs_read(sb, block_start, 2);
+					block_start += 2 + block_len;
+				}
+			}
+			block_start &= ~CRAMFS_BLK_FLAGS;
+			block_len = block_ptr - block_start;
+		}
 
-		if (compr_len == 0)
+		if (block_len == 0)
 			; /* hole */
-		else if (unlikely(compr_len > (PAGE_SIZE << 1))) {
-			pr_err("bad compressed blocksize %u\n",
-				compr_len);
+		else if (unlikely(block_len > 2*PAGE_SIZE ||
+				  (uncompressed && block_len > PAGE_SIZE))) {
+			mutex_unlock(&read_mutex);
+			pr_err("bad data blocksize %u\n", block_len);
 			goto err;
+		} else if (uncompressed) {
+			memcpy(pgdata,
+			       cramfs_read(sb, block_start, block_len),
+			       block_len);
+			bytes_filled = block_len;
 		} else {
-			mutex_lock(&read_mutex);
 			bytes_filled = cramfs_uncompress_block(pgdata,
 				 PAGE_SIZE,
-				 cramfs_read(sb, start_offset, compr_len),
-				 compr_len);
-			mutex_unlock(&read_mutex);
-			if (unlikely(bytes_filled < 0))
-				goto err;
+				 cramfs_read(sb, block_start, block_len),
+				 block_len);
 		}
+		mutex_unlock(&read_mutex);
+		if (unlikely(bytes_filled < 0))
+			goto err;
 	}
 
 	memset(pgdata + bytes_filled, 0, PAGE_SIZE - bytes_filled);
diff --git a/include/uapi/linux/cramfs_fs.h b/include/uapi/linux/cramfs_fs.h
index e4611a9b9243..ce2c885133e2 100644
--- a/include/uapi/linux/cramfs_fs.h
+++ b/include/uapi/linux/cramfs_fs.h
@@ -73,6 +73,7 @@ struct cramfs_super {
 #define CRAMFS_FLAG_HOLES		0x00000100	/* support for holes */
 #define CRAMFS_FLAG_WRONG_SIGNATURE	0x00000200	/* reserved */
 #define CRAMFS_FLAG_SHIFTED_ROOT_OFFSET	0x00000400	/* shifted root fs */
+#define CRAMFS_FLAG_EXT_BLOCK_POINTERS	0x00000800	/* block pointer extensions */
 
 /*
  * Valid values in super.flags.  Currently we refuse to mount
@@ -82,7 +83,30 @@ struct cramfs_super {
 #define CRAMFS_SUPPORTED_FLAGS	( 0x000000ff \
 				| CRAMFS_FLAG_HOLES \
 				| CRAMFS_FLAG_WRONG_SIGNATURE \
-				| CRAMFS_FLAG_SHIFTED_ROOT_OFFSET )
+				| CRAMFS_FLAG_SHIFTED_ROOT_OFFSET \
+				| CRAMFS_FLAG_EXT_BLOCK_POINTERS )
 
+/*
+ * Block pointer flags
+ *
+ * The maximum block offset that needs to be represented is roughly:
+ *
+ *   (1 << CRAMFS_OFFSET_WIDTH) * 4 +
+ *   (1 << CRAMFS_SIZE_WIDTH) / PAGE_SIZE * (4 + PAGE_SIZE)
+ *   = 0x11004000
+ *
+ * That leaves room for 3 flag bits in the block pointer table.
+ */
+#define CRAMFS_BLK_FLAG_UNCOMPRESSED	(1U << 31)	/* data stored verbatim */
+#define CRAMFS_BLK_FLAG_DIRECT_PTR	(1U << 30)	/* ptr is block start >> 2 */
+
+#define CRAMFS_BLK_FLAGS	( CRAMFS_BLK_FLAG_UNCOMPRESSED \
+				| CRAMFS_BLK_FLAG_DIRECT_PTR )
+
+/*
+ * Direct blocks are at least 4-byte aligned.
+ * Pointers to direct blocks are shifted down by 2 bits.
+ */
+#define CRAMFS_BLK_DIRECT_PTR_SHIFT	2
 
 #endif /* _UAPI__CRAMFS_H */
-- 
cgit v1.2.3


From 32302902ff093891d8e64439cbb8ceae83e21ef8 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Thu, 12 Oct 2017 11:38:45 -0700
Subject: mqprio: Reserve last 32 classid values for HW traffic classes and
 misc IDs

This patch makes a slight tweak to mqprio in order to bring the
classid values used back in line with what is used for mq. The general idea
is to reserve values :ffe0 - :ffef to identify hardware traffic classes
normally reported via dev->num_tc. By doing this we can maintain a
consistent behavior with mq for classid where :1 - :ffdf will represent a
physical qdisc mapped onto a Tx queue represented by classid - 1, and the
traffic classes will be mapped onto a known subset of classid values
reserved for our virtual qdiscs.

Note I reserved the range from :fff0 - :ffff since this way we might be
able to reuse these classid values with clsact and ingress which would mean
that for mq, mqprio, ingress, and clsact we should be able to maintain a
similar classid layout.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/pkt_sched.h |  1 +
 net/sched/sch_mqprio.c         | 79 ++++++++++++++++++++++++------------------
 2 files changed, 47 insertions(+), 33 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index e95b5c9b9fad..e7cc3d3c7421 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -74,6 +74,7 @@ struct tc_estimator {
 #define TC_H_INGRESS    (0xFFFFFFF1U)
 #define TC_H_CLSACT	TC_H_INGRESS
 
+#define TC_H_MIN_PRIORITY	0xFFE0U
 #define TC_H_MIN_INGRESS	0xFFF2U
 #define TC_H_MIN_EGRESS		0xFFF3U
 
diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
index f1ae9be83934..cae91b4b08a6 100644
--- a/net/sched/sch_mqprio.c
+++ b/net/sched/sch_mqprio.c
@@ -153,6 +153,10 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt)
 	if (!netif_is_multiqueue(dev))
 		return -EOPNOTSUPP;
 
+	/* make certain we can allocate enough classids to handle queues */
+	if (dev->num_tx_queues >= TC_H_MIN_PRIORITY)
+		return -ENOMEM;
+
 	if (!opt || nla_len(opt) < sizeof(*qopt))
 		return -EINVAL;
 
@@ -305,7 +309,7 @@ static struct netdev_queue *mqprio_queue_get(struct Qdisc *sch,
 					     unsigned long cl)
 {
 	struct net_device *dev = qdisc_dev(sch);
-	unsigned long ntx = cl - 1 - netdev_get_num_tc(dev);
+	unsigned long ntx = cl - 1;
 
 	if (ntx >= dev->num_tx_queues)
 		return NULL;
@@ -447,38 +451,35 @@ static unsigned long mqprio_find(struct Qdisc *sch, u32 classid)
 	struct net_device *dev = qdisc_dev(sch);
 	unsigned int ntx = TC_H_MIN(classid);
 
-	if (ntx > dev->num_tx_queues + netdev_get_num_tc(dev))
-		return 0;
-	return ntx;
+	/* There are essentially two regions here that have valid classid
+	 * values. The first region will have a classid value of 1 through
+	 * num_tx_queues. All of these are backed by actual Qdiscs.
+	 */
+	if (ntx < TC_H_MIN_PRIORITY)
+		return (ntx <= dev->num_tx_queues) ? ntx : 0;
+
+	/* The second region represents the hardware traffic classes. These
+	 * are represented by classid values of TC_H_MIN_PRIORITY through
+	 * TC_H_MIN_PRIORITY + netdev_get_num_tc - 1
+	 */
+	return ((ntx - TC_H_MIN_PRIORITY) < netdev_get_num_tc(dev)) ? ntx : 0;
 }
 
 static int mqprio_dump_class(struct Qdisc *sch, unsigned long cl,
 			 struct sk_buff *skb, struct tcmsg *tcm)
 {
-	struct net_device *dev = qdisc_dev(sch);
+	if (cl < TC_H_MIN_PRIORITY) {
+		struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl);
+		struct net_device *dev = qdisc_dev(sch);
+		int tc = netdev_txq_to_tc(dev, cl - 1);
 
-	if (cl <= netdev_get_num_tc(dev)) {
+		tcm->tcm_parent = (tc < 0) ? 0 :
+			TC_H_MAKE(TC_H_MAJ(sch->handle),
+				  TC_H_MIN(tc + TC_H_MIN_PRIORITY));
+		tcm->tcm_info = dev_queue->qdisc_sleeping->handle;
+	} else {
 		tcm->tcm_parent = TC_H_ROOT;
 		tcm->tcm_info = 0;
-	} else {
-		int i;
-		struct netdev_queue *dev_queue;
-
-		dev_queue = mqprio_queue_get(sch, cl);
-		tcm->tcm_parent = 0;
-		for (i = 0; i < netdev_get_num_tc(dev); i++) {
-			struct netdev_tc_txq tc = dev->tc_to_txq[i];
-			int q_idx = cl - netdev_get_num_tc(dev);
-
-			if (q_idx > tc.offset &&
-			    q_idx <= tc.offset + tc.count) {
-				tcm->tcm_parent =
-					TC_H_MAKE(TC_H_MAJ(sch->handle),
-						  TC_H_MIN(i + 1));
-				break;
-			}
-		}
-		tcm->tcm_info = dev_queue->qdisc_sleeping->handle;
 	}
 	tcm->tcm_handle |= TC_H_MIN(cl);
 	return 0;
@@ -489,15 +490,14 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
 	__releases(d->lock)
 	__acquires(d->lock)
 {
-	struct net_device *dev = qdisc_dev(sch);
-
-	if (cl <= netdev_get_num_tc(dev)) {
+	if (cl >= TC_H_MIN_PRIORITY) {
 		int i;
 		__u32 qlen = 0;
 		struct Qdisc *qdisc;
 		struct gnet_stats_queue qstats = {0};
 		struct gnet_stats_basic_packed bstats = {0};
-		struct netdev_tc_txq tc = dev->tc_to_txq[cl - 1];
+		struct net_device *dev = qdisc_dev(sch);
+		struct netdev_tc_txq tc = dev->tc_to_txq[cl & TC_BITMASK];
 
 		/* Drop lock here it will be reclaimed before touching
 		 * statistics this is required because the d->lock we
@@ -550,12 +550,25 @@ static void mqprio_walk(struct Qdisc *sch, struct qdisc_walker *arg)
 
 	/* Walk hierarchy with a virtual class per tc */
 	arg->count = arg->skip;
-	for (ntx = arg->skip;
-	     ntx < dev->num_tx_queues + netdev_get_num_tc(dev);
-	     ntx++) {
+	for (ntx = arg->skip; ntx < netdev_get_num_tc(dev); ntx++) {
+		if (arg->fn(sch, ntx + TC_H_MIN_PRIORITY, arg) < 0) {
+			arg->stop = 1;
+			return;
+		}
+		arg->count++;
+	}
+
+	/* Pad the values and skip over unused traffic classes */
+	if (ntx < TC_MAX_QUEUE) {
+		arg->count = TC_MAX_QUEUE;
+		ntx = TC_MAX_QUEUE;
+	}
+
+	/* Reset offset, sort out remaining per-queue qdiscs */
+	for (ntx -= TC_MAX_QUEUE; ntx < dev->num_tx_queues; ntx++) {
 		if (arg->fn(sch, ntx + 1, arg) < 0) {
 			arg->stop = 1;
-			break;
+			return;
 		}
 		arg->count++;
 	}
-- 
cgit v1.2.3


From 6710e1126934d8b4372b4d2f9ae1646cd3f151bf Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Mon, 16 Oct 2017 12:19:28 +0200
Subject: bpf: introduce new bpf cpu map type BPF_MAP_TYPE_CPUMAP

The 'cpumap' is primarily used as a backend map for XDP BPF helper
call bpf_redirect_map() and XDP_REDIRECT action, like 'devmap'.

This patch implements the main part of the map.  It is not connected to
the XDP redirect system yet, and no SKB allocations are done yet.

The main concern in this patch is to ensure the datapath can run
without any locking.  This adds complexity to the setup and tear-down
procedure, whose assumptions are extra carefully documented in the
code comments.

V2:
 - make sure array isn't larger than NR_CPUS
 - make sure each CPU added is a valid possible CPU

V3: fix nitpicks from Jakub Kicinski <kubakici@wp.pl>

V5:
 - Restrict map allocation to root / CAP_SYS_ADMIN
 - WARN_ON_ONCE if queue is not empty on tear-down
 - Return -EPERM on memlock limit instead of -ENOMEM
 - Error code in __cpu_map_entry_alloc() also handle ptr_ring_cleanup()
 - Moved cpu_map_enqueue() to next patch

V6: all noticed by Daniel Borkmann
 - Fix err return code in cpu_map_alloc() introduced in V5
 - Move cpu_possible() check after max_entries boundary check
 - Forbid usage initially in check_map_func_compatibility()

V7:
 - Fix alloc error path spotted by Daniel Borkmann
 - Did stress test adding+removing CPUs from the map concurrently
 - Fixed refcnt issue on cpu_map_entry, kthread started too soon
 - Make sure packets are flushed during tear-down, involved use of
   rcu_barrier() and kthread_run only exit after queue is empty
 - Fix alloc error path in __cpu_map_entry_alloc() for ptr_ring

V8:
 - Nitpicking comments and grammar by Edward Cree
 - Fix missing semi-colon introduced in V7 due to rebasing
 - Move struct bpf_cpu_map_entry members cpu+map_id to tracepoint patch

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf_types.h      |   1 +
 include/uapi/linux/bpf.h       |   1 +
 kernel/bpf/Makefile            |   1 +
 kernel/bpf/cpumap.c            | 560 +++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c           |   8 +-
 kernel/bpf/verifier.c          |   5 +
 tools/include/uapi/linux/bpf.h |   1 +
 7 files changed, 576 insertions(+), 1 deletion(-)
 create mode 100644 kernel/bpf/cpumap.c

(limited to 'include/uapi/linux')

diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 6f1a567667b8..814c1081a4a9 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -41,4 +41,5 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops)
 #ifdef CONFIG_STREAM_PARSER
 BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops)
 #endif
+BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops)
 #endif
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 6db9e1d679cd..4303fb6c3817 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -112,6 +112,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_HASH_OF_MAPS,
 	BPF_MAP_TYPE_DEVMAP,
 	BPF_MAP_TYPE_SOCKMAP,
+	BPF_MAP_TYPE_CPUMAP,
 };
 
 enum bpf_prog_type {
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 53fb09f92e3f..e597daae6120 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -5,6 +5,7 @@ obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list
 obj-$(CONFIG_BPF_SYSCALL) += disasm.o
 ifeq ($(CONFIG_NET),y)
 obj-$(CONFIG_BPF_SYSCALL) += devmap.o
+obj-$(CONFIG_BPF_SYSCALL) += cpumap.o
 ifeq ($(CONFIG_STREAM_PARSER),y)
 obj-$(CONFIG_BPF_SYSCALL) += sockmap.o
 endif
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
new file mode 100644
index 000000000000..e1e25ddba038
--- /dev/null
+++ b/kernel/bpf/cpumap.c
@@ -0,0 +1,560 @@
+/* bpf/cpumap.c
+ *
+ * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc.
+ * Released under terms in GPL version 2.  See COPYING.
+ */
+
+/* The 'cpumap' is primarily used as a backend map for XDP BPF helper
+ * call bpf_redirect_map() and XDP_REDIRECT action, like 'devmap'.
+ *
+ * Unlike devmap which redirects XDP frames out another NIC device,
+ * this map type redirects raw XDP frames to another CPU.  The remote
+ * CPU will do SKB-allocation and call the normal network stack.
+ *
+ * This is a scalability and isolation mechanism that allows
+ * separating the early driver network XDP layer from the rest of the
+ * netstack, and assigning dedicated CPUs for this stage.  This
+ * basically allows for 10G wirespeed pre-filtering via bpf.
+ */
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <linux/ptr_ring.h>
+
+#include <linux/sched.h>
+#include <linux/workqueue.h>
+#include <linux/kthread.h>
+#include <linux/capability.h>
+
+/* General idea: XDP packets that get XDP-redirected to another CPU
+ * will at most be stored/queued for one driver ->poll() call.  It is
+ * guaranteed that setting the flush bit and the flush operation happen
+ * on the same CPU.  Thus, the cpu_map_flush operation can deduce via
+ * this_cpu_ptr() which queue in bpf_cpu_map_entry contains packets.
+ */
+
+#define CPU_MAP_BULK_SIZE 8  /* 8 == one cacheline on 64-bit archs */
+struct xdp_bulk_queue {
+	void *q[CPU_MAP_BULK_SIZE];
+	unsigned int count;
+};
+
+/* Struct for every remote "destination" CPU in map */
+struct bpf_cpu_map_entry {
+	u32 qsize;  /* Queue size placeholder for map lookup */
+
+	/* XDP can run multiple RX-ring queues, need __percpu enqueue store */
+	struct xdp_bulk_queue __percpu *bulkq;
+
+	/* Queue with potential multi-producers, and single-consumer kthread */
+	struct ptr_ring *queue;
+	struct task_struct *kthread;
+	struct work_struct kthread_stop_wq;
+
+	atomic_t refcnt; /* Control when this struct can be free'ed */
+	struct rcu_head rcu;
+};
+
+struct bpf_cpu_map {
+	struct bpf_map map;
+	/* Below members specific for map type */
+	struct bpf_cpu_map_entry **cpu_map;
+	unsigned long __percpu *flush_needed;
+};
+
+static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
+			     struct xdp_bulk_queue *bq);
+
+static u64 cpu_map_bitmap_size(const union bpf_attr *attr)
+{
+	return BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long);
+}
+
+static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
+{
+	struct bpf_cpu_map *cmap;
+	int err = -ENOMEM;
+	u64 cost;
+	int ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return ERR_PTR(-EPERM);
+
+	/* check sanity of attributes */
+	if (attr->max_entries == 0 || attr->key_size != 4 ||
+	    attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE)
+		return ERR_PTR(-EINVAL);
+
+	cmap = kzalloc(sizeof(*cmap), GFP_USER);
+	if (!cmap)
+		return ERR_PTR(-ENOMEM);
+
+	/* mandatory map attributes */
+	cmap->map.map_type = attr->map_type;
+	cmap->map.key_size = attr->key_size;
+	cmap->map.value_size = attr->value_size;
+	cmap->map.max_entries = attr->max_entries;
+	cmap->map.map_flags = attr->map_flags;
+	cmap->map.numa_node = bpf_map_attr_numa_node(attr);
+
+	/* Pre-limit array size based on NR_CPUS, not final CPU check */
+	if (cmap->map.max_entries > NR_CPUS) {
+		err = -E2BIG;
+		goto free_cmap;
+	}
+
+	/* make sure page count doesn't overflow */
+	cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *);
+	cost += cpu_map_bitmap_size(attr) * num_possible_cpus();
+	if (cost >= U32_MAX - PAGE_SIZE)
+		goto free_cmap;
+	cmap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+
+	/* Note: returns -EPERM if map size is larger than the memlock limit */
+	ret = bpf_map_precharge_memlock(cmap->map.pages);
+	if (ret) {
+		err = ret;
+		goto free_cmap;
+	}
+
+	/* A per cpu bitfield with a bit per possible CPU in map  */
+	cmap->flush_needed = __alloc_percpu(cpu_map_bitmap_size(attr),
+					    __alignof__(unsigned long));
+	if (!cmap->flush_needed)
+		goto free_cmap;
+
+	/* Alloc array for possible remote "destination" CPUs */
+	cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries *
+					   sizeof(struct bpf_cpu_map_entry *),
+					   cmap->map.numa_node);
+	if (!cmap->cpu_map)
+		goto free_percpu;
+
+	return &cmap->map;
+free_percpu:
+	free_percpu(cmap->flush_needed);
+free_cmap:
+	kfree(cmap);
+	return ERR_PTR(err);
+}
+
+void __cpu_map_queue_destructor(void *ptr)
+{
+	/* The tear-down procedure should have made sure that queue is
+	 * empty.  See __cpu_map_entry_replace() and work-queue
+	 * invoked cpu_map_kthread_stop(). Catch any broken behaviour
+	 * gracefully and warn once.
+	 */
+	if (WARN_ON_ONCE(ptr))
+		page_frag_free(ptr);
+}
+
+static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
+{
+	if (atomic_dec_and_test(&rcpu->refcnt)) {
+		/* The queue should be empty at this point */
+		ptr_ring_cleanup(rcpu->queue, __cpu_map_queue_destructor);
+		kfree(rcpu->queue);
+		kfree(rcpu);
+	}
+}
+
+static void get_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
+{
+	atomic_inc(&rcpu->refcnt);
+}
+
+/* called from workqueue, to workaround syscall using preempt_disable */
+static void cpu_map_kthread_stop(struct work_struct *work)
+{
+	struct bpf_cpu_map_entry *rcpu;
+
+	rcpu = container_of(work, struct bpf_cpu_map_entry, kthread_stop_wq);
+
+	/* Wait for flush in __cpu_map_entry_free(), via full RCU barrier,
+	 * as it waits until all in-flight call_rcu() callbacks complete.
+	 */
+	rcu_barrier();
+
+	/* kthread_stop will wake_up_process and wait for it to complete */
+	kthread_stop(rcpu->kthread);
+}
+
+static int cpu_map_kthread_run(void *data)
+{
+	struct bpf_cpu_map_entry *rcpu = data;
+
+	set_current_state(TASK_INTERRUPTIBLE);
+
+	/* When kthread gives stop order, then rcpu have been disconnected
+	 * from map, thus no new packets can enter. Remaining in-flight
+	 * per CPU stored packets are flushed to this queue.  Wait honoring
+	 * kthread_stop signal until queue is empty.
+	 */
+	while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) {
+		struct xdp_pkt *xdp_pkt;
+
+		schedule();
+		/* Do work */
+		while ((xdp_pkt = ptr_ring_consume(rcpu->queue))) {
+			/* For now just "refcnt-free" */
+			page_frag_free(xdp_pkt);
+		}
+		__set_current_state(TASK_INTERRUPTIBLE);
+	}
+	__set_current_state(TASK_RUNNING);
+
+	put_cpu_map_entry(rcpu);
+	return 0;
+}
+
+struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, int map_id)
+{
+	gfp_t gfp = GFP_ATOMIC|__GFP_NOWARN;
+	struct bpf_cpu_map_entry *rcpu;
+	int numa, err;
+
+	/* Have map->numa_node, but choose node of redirect target CPU */
+	numa = cpu_to_node(cpu);
+
+	rcpu = kzalloc_node(sizeof(*rcpu), gfp, numa);
+	if (!rcpu)
+		return NULL;
+
+	/* Alloc percpu bulkq */
+	rcpu->bulkq = __alloc_percpu_gfp(sizeof(*rcpu->bulkq),
+					 sizeof(void *), gfp);
+	if (!rcpu->bulkq)
+		goto free_rcu;
+
+	/* Alloc queue */
+	rcpu->queue = kzalloc_node(sizeof(*rcpu->queue), gfp, numa);
+	if (!rcpu->queue)
+		goto free_bulkq;
+
+	err = ptr_ring_init(rcpu->queue, qsize, gfp);
+	if (err)
+		goto free_queue;
+
+	rcpu->qsize = qsize;
+
+	/* Setup kthread */
+	rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa,
+					       "cpumap/%d/map:%d", cpu, map_id);
+	if (IS_ERR(rcpu->kthread))
+		goto free_ptr_ring;
+
+	get_cpu_map_entry(rcpu); /* 1-refcnt for being in cmap->cpu_map[] */
+	get_cpu_map_entry(rcpu); /* 1-refcnt for kthread */
+
+	/* Make sure kthread runs on a single CPU */
+	kthread_bind(rcpu->kthread, cpu);
+	wake_up_process(rcpu->kthread);
+
+	return rcpu;
+
+free_ptr_ring:
+	ptr_ring_cleanup(rcpu->queue, NULL);
+free_queue:
+	kfree(rcpu->queue);
+free_bulkq:
+	free_percpu(rcpu->bulkq);
+free_rcu:
+	kfree(rcpu);
+	return NULL;
+}
+
+void __cpu_map_entry_free(struct rcu_head *rcu)
+{
+	struct bpf_cpu_map_entry *rcpu;
+	int cpu;
+
+	/* This cpu_map_entry has been disconnected from the map and one
+	 * RCU grace period has elapsed.  Thus, XDP cannot queue any
+	 * new packets and cannot change/set flush_needed that can
+	 * find this entry.
+	 */
+	rcpu = container_of(rcu, struct bpf_cpu_map_entry, rcu);
+
+	/* Flush remaining packets in percpu bulkq */
+	for_each_online_cpu(cpu) {
+		struct xdp_bulk_queue *bq = per_cpu_ptr(rcpu->bulkq, cpu);
+
+		/* No concurrent bq_enqueue can run at this point */
+		bq_flush_to_queue(rcpu, bq);
+	}
+	free_percpu(rcpu->bulkq);
+	/* Cannot kthread_stop() here, last put free rcpu resources */
+	put_cpu_map_entry(rcpu);
+}
+
+/* After xchg pointer to bpf_cpu_map_entry, use the call_rcu() to
+ * ensure any driver rcu critical sections have completed, but this
+ * does not guarantee a flush has happened yet. Because driver side
+ * rcu_read_lock/unlock only protects the running XDP program.  The
+ * atomic xchg and NULL-ptr check in __cpu_map_flush() makes sure a
+ * pending flush op doesn't fail.
+ *
+ * The bpf_cpu_map_entry is still used by the kthread, and there can
+ * still be pending packets (in queue and percpu bulkq).  A refcnt
+ * makes sure the last user (kthread_stop vs. call_rcu) frees memory
+ * resources.
+ *
+ * The rcu callback __cpu_map_entry_free flushes remaining packets in
+ * percpu bulkq to queue.  Because the caller map_delete_elem() disables
+ * preemption, we cannot call kthread_stop() to make sure queue is empty.
+ * Instead a work_queue is started for stopping kthread,
+ * cpu_map_kthread_stop, which waits for an RCU grace period before
+ * stopping kthread, emptying the queue.
+ */
+void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,
+			     u32 key_cpu, struct bpf_cpu_map_entry *rcpu)
+{
+	struct bpf_cpu_map_entry *old_rcpu;
+
+	old_rcpu = xchg(&cmap->cpu_map[key_cpu], rcpu);
+	if (old_rcpu) {
+		call_rcu(&old_rcpu->rcu, __cpu_map_entry_free);
+		INIT_WORK(&old_rcpu->kthread_stop_wq, cpu_map_kthread_stop);
+		schedule_work(&old_rcpu->kthread_stop_wq);
+	}
+}
+
+int cpu_map_delete_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+	u32 key_cpu = *(u32 *)key;
+
+	if (key_cpu >= map->max_entries)
+		return -EINVAL;
+
+	/* notice caller map_delete_elem() use preempt_disable() */
+	__cpu_map_entry_replace(cmap, key_cpu, NULL);
+	return 0;
+}
+
+int cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
+				u64 map_flags)
+{
+	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+	struct bpf_cpu_map_entry *rcpu;
+
+	/* Array index key correspond to CPU number */
+	u32 key_cpu = *(u32 *)key;
+	/* Value is the queue size */
+	u32 qsize = *(u32 *)value;
+
+	if (unlikely(map_flags > BPF_EXIST))
+		return -EINVAL;
+	if (unlikely(key_cpu >= cmap->map.max_entries))
+		return -E2BIG;
+	if (unlikely(map_flags == BPF_NOEXIST))
+		return -EEXIST;
+	if (unlikely(qsize > 16384)) /* sanity limit on qsize */
+		return -EOVERFLOW;
+
+	/* Make sure CPU is a valid possible cpu */
+	if (!cpu_possible(key_cpu))
+		return -ENODEV;
+
+	if (qsize == 0) {
+		rcpu = NULL; /* Same as deleting */
+	} else {
+		/* Updating qsize cause re-allocation of bpf_cpu_map_entry */
+		rcpu = __cpu_map_entry_alloc(qsize, key_cpu, map->id);
+		if (!rcpu)
+			return -ENOMEM;
+	}
+	rcu_read_lock();
+	__cpu_map_entry_replace(cmap, key_cpu, rcpu);
+	rcu_read_unlock();
+	return 0;
+}
+
+void cpu_map_free(struct bpf_map *map)
+{
+	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+	int cpu;
+	u32 i;
+
+	/* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
+	 * so the bpf programs (can be more than one that used this map) were
+	 * disconnected from events. Wait for outstanding critical sections in
+	 * these programs to complete. The rcu critical section only guarantees
+	 * no further "XDP/bpf-side" reads against bpf_cpu_map->cpu_map.
+	 * It does __not__ ensure pending flush operations (if any) are
+	 * complete.
+	 */
+	synchronize_rcu();
+
+	/* To ensure all pending flush operations have completed wait for flush
+	 * bitmap to indicate all flush_needed bits to be zero on _all_ cpus.
+	 * Because the above synchronize_rcu() ensures the map is disconnected
+	 * from the program we can assume no new bits will be set.
+	 */
+	for_each_online_cpu(cpu) {
+		unsigned long *bitmap = per_cpu_ptr(cmap->flush_needed, cpu);
+
+		while (!bitmap_empty(bitmap, cmap->map.max_entries))
+			cond_resched();
+	}
+
+	/* For cpu_map the remote CPUs can still be using the entries
+	 * (struct bpf_cpu_map_entry).
+	 */
+	for (i = 0; i < cmap->map.max_entries; i++) {
+		struct bpf_cpu_map_entry *rcpu;
+
+		rcpu = READ_ONCE(cmap->cpu_map[i]);
+		if (!rcpu)
+			continue;
+
+		/* bq flush and cleanup happen after RCU grace period */
+		__cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */
+	}
+	free_percpu(cmap->flush_needed);
+	bpf_map_area_free(cmap->cpu_map);
+	kfree(cmap);
+}
+
+struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key)
+{
+	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+	struct bpf_cpu_map_entry *rcpu;
+
+	if (key >= map->max_entries)
+		return NULL;
+
+	rcpu = READ_ONCE(cmap->cpu_map[key]);
+	return rcpu;
+}
+
+static void *cpu_map_lookup_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_cpu_map_entry *rcpu =
+		__cpu_map_lookup_elem(map, *(u32 *)key);
+
+	return rcpu ? &rcpu->qsize : NULL;
+}
+
+static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+	u32 index = key ? *(u32 *)key : U32_MAX;
+	u32 *next = next_key;
+
+	if (index >= cmap->map.max_entries) {
+		*next = 0;
+		return 0;
+	}
+
+	if (index == cmap->map.max_entries - 1)
+		return -ENOENT;
+	*next = index + 1;
+	return 0;
+}
+
+const struct bpf_map_ops cpu_map_ops = {
+	.map_alloc		= cpu_map_alloc,
+	.map_free		= cpu_map_free,
+	.map_delete_elem	= cpu_map_delete_elem,
+	.map_update_elem	= cpu_map_update_elem,
+	.map_lookup_elem	= cpu_map_lookup_elem,
+	.map_get_next_key	= cpu_map_get_next_key,
+};
+
+static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
+			     struct xdp_bulk_queue *bq)
+{
+	struct ptr_ring *q;
+	int i;
+
+	if (unlikely(!bq->count))
+		return 0;
+
+	q = rcpu->queue;
+	spin_lock(&q->producer_lock);
+
+	for (i = 0; i < bq->count; i++) {
+		void *xdp_pkt = bq->q[i];
+		int err;
+
+		err = __ptr_ring_produce(q, xdp_pkt);
+		if (err) {
+			/* Free xdp_pkt */
+			page_frag_free(xdp_pkt);
+		}
+	}
+	bq->count = 0;
+	spin_unlock(&q->producer_lock);
+
+	return 0;
+}
+
+/* Notice: Will change in later patch */
+struct xdp_pkt {
+	void *data;
+	u16 len;
+	u16 headroom;
+};
+
+/* Runs under RCU-read-side, plus in softirq under NAPI protection.
+ * Thus, safe percpu variable access.
+ */
+int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt)
+{
+	struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq);
+
+	if (unlikely(bq->count == CPU_MAP_BULK_SIZE))
+		bq_flush_to_queue(rcpu, bq);
+
+	/* Notice, xdp_buff/page MUST be queued here, long enough for
+	 * the driver code invoking us to finish, due to driver
+	 * (e.g. ixgbe) recycle tricks based on page-refcnt.
+	 *
+	 * Thus, incoming xdp_pkt is always queued here (else we race
+	 * with another CPU on page-refcnt and remaining driver code).
+	 * Queue time is very short, as driver will invoke flush
+	 * operation, when completing napi->poll call.
+	 */
+	bq->q[bq->count++] = xdp_pkt;
+	return 0;
+}
+
+void __cpu_map_insert_ctx(struct bpf_map *map, u32 bit)
+{
+	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+	unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed);
+
+	__set_bit(bit, bitmap);
+}
+
+void __cpu_map_flush(struct bpf_map *map)
+{
+	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+	unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed);
+	u32 bit;
+
+	/* The napi->poll softirq makes sure __cpu_map_insert_ctx()
+	 * and __cpu_map_flush() happen on same CPU. Thus, the percpu
+	 * bitmap indicate which percpu bulkq have packets.
+	 */
+	for_each_set_bit(bit, bitmap, map->max_entries) {
+		struct bpf_cpu_map_entry *rcpu = READ_ONCE(cmap->cpu_map[bit]);
+		struct xdp_bulk_queue *bq;
+
+		/* This is possible if entry is removed by user space
+		 * between xdp redirect and flush op.
+		 */
+		if (unlikely(!rcpu))
+			continue;
+
+		__clear_bit(bit, bitmap);
+
+		/* Flush all frames in bulkq to real queue */
+		bq = this_cpu_ptr(rcpu->bulkq);
+		bq_flush_to_queue(rcpu, bq);
+
+		/* If already running, costs spin_lock_irqsave + smp_mb */
+		wake_up_process(rcpu->kthread);
+	}
+}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index d124e702e040..54fba06942f5 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -592,6 +592,12 @@ static int map_update_elem(union bpf_attr *attr)
 	if (copy_from_user(value, uvalue, value_size) != 0)
 		goto free_value;
 
+	/* Need to create a kthread, thus must support schedule */
+	if (map->map_type == BPF_MAP_TYPE_CPUMAP) {
+		err = map->ops->map_update_elem(map, key, value, attr->flags);
+		goto out;
+	}
+
 	/* must increment bpf_prog_active to avoid kprobe+bpf triggering from
 	 * inside bpf map update or delete otherwise deadlocks are possible
 	 */
@@ -622,7 +628,7 @@ static int map_update_elem(union bpf_attr *attr)
 	}
 	__this_cpu_dec(bpf_prog_active);
 	preempt_enable();
-
+out:
 	if (!err)
 		trace_bpf_map_update_elem(map, ufd, key, value);
 free_value:
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 9755279d94cb..cefa64be9a2f 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1444,6 +1444,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 		if (func_id != BPF_FUNC_redirect_map)
 			goto error;
 		break;
+	/* Restrict bpf side of cpumap, open when use-cases appear */
+	case BPF_MAP_TYPE_CPUMAP:
+		if (func_id != BPF_FUNC_redirect_map)
+			goto error;
+		break;
 	case BPF_MAP_TYPE_ARRAY_OF_MAPS:
 	case BPF_MAP_TYPE_HASH_OF_MAPS:
 		if (func_id != BPF_FUNC_map_lookup_elem)
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index fb4fb81ce5b0..fa93033dc521 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -112,6 +112,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_HASH_OF_MAPS,
 	BPF_MAP_TYPE_DEVMAP,
 	BPF_MAP_TYPE_SOCKMAP,
+	BPF_MAP_TYPE_CPUMAP,
 };
 
 enum bpf_prog_type {
-- 
cgit v1.2.3


From 085b30625e39df67d7320f22269796276c6b0c11 Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Fri, 23 Sep 2016 14:05:12 +0100
Subject: perf/core: Add PERF_AUX_FLAG_COLLISION to report colliding samples

The ARM SPE architecture permits an implementation to ignore a sample
if the sample is due to be taken whilst another sample is already being
produced. In this case, it is desirable to report the collision to
userspace, as they may want to lower the sample period.

This patch adds a PERF_AUX_FLAG_COLLISION flag, so that such events can
be relayed to userspace.

Acked-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 include/uapi/linux/perf_event.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 140ae638cfd6..7ca1b22ea417 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -941,6 +941,7 @@ enum perf_callchain_context {
 #define PERF_AUX_FLAG_TRUNCATED		0x01	/* record was truncated to fit */
 #define PERF_AUX_FLAG_OVERWRITE		0x02	/* snapshot from overwrite mode */
 #define PERF_AUX_FLAG_PARTIAL		0x04	/* record contains gaps */
+#define PERF_AUX_FLAG_COLLISION		0x08	/* sample collided with another */
 
 #define PERF_FLAG_FD_NO_GROUP		(1UL << 0)
 #define PERF_FLAG_FD_OUTPUT		(1UL << 1)
-- 
cgit v1.2.3


From 7c950b9e53732f574e3a46d37c62f1f33d0b218c Mon Sep 17 00:00:00 2001
From: Dongdong Liu <liudongdong3@huawei.com>
Date: Wed, 11 Oct 2017 18:52:58 +0800
Subject: PCI/portdrv: Add #defines for AER and DPC Interrupt Message Number
 masks

In the AER case, the mask isn't strictly necessary because there are no
higher-order bits above the Interrupt Message Number, but using a #define
will make it possible to grep for it.

Suggested-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Dongdong Liu <liudongdong3@huawei.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 drivers/pci/pcie/portdrv_core.c | 4 ++--
 include/uapi/linux/pci_regs.h   | 2 ++
 2 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c
index 313a21df1692..72fcbe5567dd 100644
--- a/drivers/pci/pcie/portdrv_core.c
+++ b/drivers/pci/pcie/portdrv_core.c
@@ -114,7 +114,7 @@ static int pcie_port_enable_irq_vec(struct pci_dev *dev, int *irqs, int mask)
 		 */
 		pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR);
 		pci_read_config_dword(dev, pos + PCI_ERR_ROOT_STATUS, &reg32);
-		entry = reg32 >> 27;
+		entry = (reg32 & PCI_ERR_ROOT_AER_IRQ) >> 27;
 		if (entry >= nr_entries)
 			goto out_free_irqs;
 
@@ -141,7 +141,7 @@ static int pcie_port_enable_irq_vec(struct pci_dev *dev, int *irqs, int mask)
 		 */
 		pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_DPC);
 		pci_read_config_word(dev, pos + PCI_EXP_DPC_CAP, &reg16);
-		entry = reg16 & 0x1f;
+		entry = reg16 & PCI_EXP_DPC_IRQ;
 		if (entry >= nr_entries)
 			goto out_free_irqs;
 
diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
index f8d58045926f..f7c09a4c494a 100644
--- a/include/uapi/linux/pci_regs.h
+++ b/include/uapi/linux/pci_regs.h
@@ -746,6 +746,7 @@
 #define PCI_ERR_ROOT_FIRST_FATAL	0x00000010 /* First UNC is Fatal */
 #define PCI_ERR_ROOT_NONFATAL_RCV	0x00000020 /* Non-Fatal Received */
 #define PCI_ERR_ROOT_FATAL_RCV		0x00000040 /* Fatal Received */
+#define PCI_ERR_ROOT_AER_IRQ		0xf8000000 /* Advanced Error Interrupt Message Number */
 #define PCI_ERR_ROOT_ERR_SRC	52	/* Error Source Identification */
 
 /* Virtual Channel */
@@ -960,6 +961,7 @@
 
 /* Downstream Port Containment */
 #define PCI_EXP_DPC_CAP			4	/* DPC Capability */
+#define PCI_EXP_DPC_IRQ			0x1f	/* DPC Interrupt Message Number */
 #define  PCI_EXP_DPC_CAP_RP_EXT		0x20	/* Root Port Extensions for DPC */
 #define  PCI_EXP_DPC_CAP_POISONED_TLP	0x40	/* Poisoned TLP Egress Blocking Supported */
 #define  PCI_EXP_DPC_CAP_SW_TRIGGER	0x80	/* Software Triggering Supported */
-- 
cgit v1.2.3


From 1fba70e5b6bed53496ba1f1f16127f5be01b5fb6 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Wed, 18 Oct 2017 11:22:51 -0700
Subject: tcp: socket option to set TCP fast open key

New socket option TCP_FASTOPEN_KEY to allow different keys per
listener.  The listener by default uses the global key until the
socket option is set.  The key is 16 bytes of binary data.  This
option has no effect on regular non-listener TCP sockets.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Christoph Paasch <cpaasch@apple.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/request_sock.h |  2 ++
 include/net/tcp.h          |  5 +++--
 include/uapi/linux/tcp.h   |  1 +
 net/ipv4/sysctl_net_ipv4.c |  3 ++-
 net/ipv4/tcp.c             | 33 +++++++++++++++++++++++++++
 net/ipv4/tcp_fastopen.c    | 56 +++++++++++++++++++++++++++++++++-------------
 net/ipv4/tcp_ipv4.c        |  1 +
 7 files changed, 82 insertions(+), 19 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index 23e22054aa60..347015515a7d 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -150,6 +150,8 @@ struct fastopen_queue {
 	spinlock_t	lock;
 	int		qlen;		/* # of pending (TCP_SYN_RECV) reqs */
 	int		max_qlen;	/* != 0 iff TFO is currently enabled */
+
+	struct tcp_fastopen_context __rcu *ctx; /* cipher context for cookie */
 };
 
 /** struct request_sock_queue - queue of request_socks
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 3b3b9b968e2d..1efe8365cb28 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1555,9 +1555,10 @@ struct tcp_fastopen_request {
 	int				copied;	/* queued in tcp_connect() */
 };
 void tcp_free_fastopen_req(struct tcp_sock *tp);
-
+void tcp_fastopen_destroy_cipher(struct sock *sk);
 void tcp_fastopen_ctx_destroy(struct net *net);
-int tcp_fastopen_reset_cipher(struct net *net, void *key, unsigned int len);
+int tcp_fastopen_reset_cipher(struct net *net, struct sock *sk,
+			      void *key, unsigned int len);
 void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb);
 struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
 			      struct request_sock *req,
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 15c25eccab2b..69c7493e42f8 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -119,6 +119,7 @@ enum {
 #define TCP_FASTOPEN_CONNECT	30	/* Attempt FastOpen with connect */
 #define TCP_ULP			31	/* Attach a ULP to a TCP connection */
 #define TCP_MD5SIG_EXT		32	/* TCP MD5 Signature with extensions */
+#define TCP_FASTOPEN_KEY	33	/* Set the key for Fast Open (cookie) */
 
 struct tcp_repair_opt {
 	__u32	opt_code;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index cac8dd309f39..81d218346cf7 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -284,7 +284,8 @@ static int proc_tcp_fastopen_key(struct ctl_table *table, int write,
 			ret = -EINVAL;
 			goto bad_key;
 		}
-		tcp_fastopen_reset_cipher(net, user_key, TCP_FASTOPEN_KEY_LENGTH);
+		tcp_fastopen_reset_cipher(net, NULL, user_key,
+					  TCP_FASTOPEN_KEY_LENGTH);
 	}
 
 bad_key:
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 3b34850d361f..8b1fa4dd4538 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2571,6 +2571,17 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 		release_sock(sk);
 		return err;
 	}
+	case TCP_FASTOPEN_KEY: {
+		__u8 key[TCP_FASTOPEN_KEY_LENGTH];
+
+		if (optlen != sizeof(key))
+			return -EINVAL;
+
+		if (copy_from_user(key, optval, optlen))
+			return -EFAULT;
+
+		return tcp_fastopen_reset_cipher(net, sk, key, sizeof(key));
+	}
 	default:
 		/* fallthru */
 		break;
@@ -3157,6 +3168,28 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
 			return -EFAULT;
 		return 0;
 
+	case TCP_FASTOPEN_KEY: {
+		__u8 key[TCP_FASTOPEN_KEY_LENGTH];
+		struct tcp_fastopen_context *ctx;
+
+		if (get_user(len, optlen))
+			return -EFAULT;
+
+		rcu_read_lock();
+		ctx = rcu_dereference(icsk->icsk_accept_queue.fastopenq.ctx);
+		if (ctx)
+			memcpy(key, ctx->key, sizeof(key));
+		else
+			len = 0;
+		rcu_read_unlock();
+
+		len = min_t(unsigned int, len, sizeof(key));
+		if (put_user(len, optlen))
+			return -EFAULT;
+		if (copy_to_user(optval, key, len))
+			return -EFAULT;
+		return 0;
+	}
 	case TCP_THIN_LINEAR_TIMEOUTS:
 		val = tp->thin_lto;
 		break;
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index 7ee4aadcdd71..21075ce19cb6 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -29,7 +29,7 @@ void tcp_fastopen_init_key_once(struct net *net)
 	 * for a valid cookie, so this is an acceptable risk.
 	 */
 	get_random_bytes(key, sizeof(key));
-	tcp_fastopen_reset_cipher(net, key, sizeof(key));
+	tcp_fastopen_reset_cipher(net, NULL, key, sizeof(key));
 }
 
 static void tcp_fastopen_ctx_free(struct rcu_head *head)
@@ -40,6 +40,16 @@ static void tcp_fastopen_ctx_free(struct rcu_head *head)
 	kfree(ctx);
 }
 
+void tcp_fastopen_destroy_cipher(struct sock *sk)
+{
+	struct tcp_fastopen_context *ctx;
+
+	ctx = rcu_dereference_protected(
+			inet_csk(sk)->icsk_accept_queue.fastopenq.ctx, 1);
+	if (ctx)
+		call_rcu(&ctx->rcu, tcp_fastopen_ctx_free);
+}
+
 void tcp_fastopen_ctx_destroy(struct net *net)
 {
 	struct tcp_fastopen_context *ctxt;
@@ -55,10 +65,12 @@ void tcp_fastopen_ctx_destroy(struct net *net)
 		call_rcu(&ctxt->rcu, tcp_fastopen_ctx_free);
 }
 
-int tcp_fastopen_reset_cipher(struct net *net, void *key, unsigned int len)
+int tcp_fastopen_reset_cipher(struct net *net, struct sock *sk,
+			      void *key, unsigned int len)
 {
-	int err;
 	struct tcp_fastopen_context *ctx, *octx;
+	struct fastopen_queue *q;
+	int err;
 
 	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
 	if (!ctx)
@@ -79,27 +91,39 @@ error:		kfree(ctx);
 	}
 	memcpy(ctx->key, key, len);
 
-	spin_lock(&net->ipv4.tcp_fastopen_ctx_lock);
 
-	octx = rcu_dereference_protected(net->ipv4.tcp_fastopen_ctx,
-				lockdep_is_held(&net->ipv4.tcp_fastopen_ctx_lock));
-	rcu_assign_pointer(net->ipv4.tcp_fastopen_ctx, ctx);
-	spin_unlock(&net->ipv4.tcp_fastopen_ctx_lock);
+	if (sk) {
+		q = &inet_csk(sk)->icsk_accept_queue.fastopenq;
+		spin_lock_bh(&q->lock);
+		octx = rcu_dereference_protected(q->ctx,
+						 lockdep_is_held(&q->lock));
+		rcu_assign_pointer(q->ctx, ctx);
+		spin_unlock_bh(&q->lock);
+	} else {
+		spin_lock(&net->ipv4.tcp_fastopen_ctx_lock);
+		octx = rcu_dereference_protected(net->ipv4.tcp_fastopen_ctx,
+			lockdep_is_held(&net->ipv4.tcp_fastopen_ctx_lock));
+		rcu_assign_pointer(net->ipv4.tcp_fastopen_ctx, ctx);
+		spin_unlock(&net->ipv4.tcp_fastopen_ctx_lock);
+	}
 
 	if (octx)
 		call_rcu(&octx->rcu, tcp_fastopen_ctx_free);
 	return err;
 }
 
-static bool __tcp_fastopen_cookie_gen(struct net *net,
-				      const void *path,
+static bool __tcp_fastopen_cookie_gen(struct sock *sk, const void *path,
 				      struct tcp_fastopen_cookie *foc)
 {
 	struct tcp_fastopen_context *ctx;
 	bool ok = false;
 
 	rcu_read_lock();
-	ctx = rcu_dereference(net->ipv4.tcp_fastopen_ctx);
+
+	ctx = rcu_dereference(inet_csk(sk)->icsk_accept_queue.fastopenq.ctx);
+	if (!ctx)
+		ctx = rcu_dereference(sock_net(sk)->ipv4.tcp_fastopen_ctx);
+
 	if (ctx) {
 		crypto_cipher_encrypt_one(ctx->tfm, foc->val, path);
 		foc->len = TCP_FASTOPEN_COOKIE_SIZE;
@@ -115,7 +139,7 @@ static bool __tcp_fastopen_cookie_gen(struct net *net,
  *
  * XXX (TFO) - refactor when TCP_FASTOPEN_COOKIE_SIZE != AES_BLOCK_SIZE.
  */
-static bool tcp_fastopen_cookie_gen(struct net *net,
+static bool tcp_fastopen_cookie_gen(struct sock *sk,
 				    struct request_sock *req,
 				    struct sk_buff *syn,
 				    struct tcp_fastopen_cookie *foc)
@@ -124,7 +148,7 @@ static bool tcp_fastopen_cookie_gen(struct net *net,
 		const struct iphdr *iph = ip_hdr(syn);
 
 		__be32 path[4] = { iph->saddr, iph->daddr, 0, 0 };
-		return __tcp_fastopen_cookie_gen(net, path, foc);
+		return __tcp_fastopen_cookie_gen(sk, path, foc);
 	}
 
 #if IS_ENABLED(CONFIG_IPV6)
@@ -132,13 +156,13 @@ static bool tcp_fastopen_cookie_gen(struct net *net,
 		const struct ipv6hdr *ip6h = ipv6_hdr(syn);
 		struct tcp_fastopen_cookie tmp;
 
-		if (__tcp_fastopen_cookie_gen(net, &ip6h->saddr, &tmp)) {
+		if (__tcp_fastopen_cookie_gen(sk, &ip6h->saddr, &tmp)) {
 			struct in6_addr *buf = &tmp.addr;
 			int i;
 
 			for (i = 0; i < 4; i++)
 				buf->s6_addr32[i] ^= ip6h->daddr.s6_addr32[i];
-			return __tcp_fastopen_cookie_gen(net, buf, foc);
+			return __tcp_fastopen_cookie_gen(sk, buf, foc);
 		}
 	}
 #endif
@@ -313,7 +337,7 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
 		goto fastopen;
 
 	if (foc->len >= 0 &&  /* Client presents or requests a cookie */
-	    tcp_fastopen_cookie_gen(sock_net(sk), req, skb, &valid_foc) &&
+	    tcp_fastopen_cookie_gen(sk, req, skb, &valid_foc) &&
 	    foc->len == TCP_FASTOPEN_COOKIE_SIZE &&
 	    foc->len == valid_foc.len &&
 	    !memcmp(foc->val, valid_foc.val, foc->len)) {
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index ecee4ddb24c5..28ca4e177047 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1893,6 +1893,7 @@ void tcp_v4_destroy_sock(struct sock *sk)
 
 	/* If socket is aborted during connect operation */
 	tcp_free_fastopen_req(tp);
+	tcp_fastopen_destroy_cipher(sk);
 	tcp_saved_syn_free(tp);
 
 	sk_sockets_allocated_dec(sk);
-- 
cgit v1.2.3


From 6e71b04a82248ccf13a94b85cbc674a9fefe53f5 Mon Sep 17 00:00:00 2001
From: Chenbo Feng <fengc@google.com>
Date: Wed, 18 Oct 2017 13:00:22 -0700
Subject: bpf: Add file mode configuration into bpf maps

Introduce map read/write flags to the eBPF syscalls that return a map
fd. The flags are used to set up the file mode when constructing a new
file descriptor for bpf maps. To not break backward compatibility,
f_flags is set to O_RDWR if the flag passed by the syscall is 0.
Otherwise it is O_RDONLY or O_WRONLY. When userspace wants to modify or
read the map content, the kernel checks the file mode to see if the
operation is allowed.

Signed-off-by: Chenbo Feng <fengc@google.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h      |  8 +++--
 include/uapi/linux/bpf.h |  6 ++++
 kernel/bpf/arraymap.c    |  6 +++-
 kernel/bpf/devmap.c      |  5 ++-
 kernel/bpf/hashtab.c     |  5 +--
 kernel/bpf/inode.c       | 15 ++++++---
 kernel/bpf/lpm_trie.c    |  3 +-
 kernel/bpf/sockmap.c     |  5 ++-
 kernel/bpf/stackmap.c    |  5 ++-
 kernel/bpf/syscall.c     | 88 ++++++++++++++++++++++++++++++++++++++++++------
 net/netfilter/xt_bpf.c   |  2 +-
 11 files changed, 122 insertions(+), 26 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index d67ccdc0099f..3e5508f2fa87 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -315,11 +315,11 @@ void bpf_map_area_free(void *base);
 
 extern int sysctl_unprivileged_bpf_disabled;
 
-int bpf_map_new_fd(struct bpf_map *map);
+int bpf_map_new_fd(struct bpf_map *map, int flags);
 int bpf_prog_new_fd(struct bpf_prog *prog);
 
 int bpf_obj_pin_user(u32 ufd, const char __user *pathname);
-int bpf_obj_get_user(const char __user *pathname);
+int bpf_obj_get_user(const char __user *pathname, int flags);
 
 int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value);
 int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value);
@@ -338,6 +338,8 @@ int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file,
 				void *key, void *value, u64 map_flags);
 int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value);
 
+int bpf_get_file_flag(int flags);
+
 /* memcpy that is used with 8-byte aligned pointers, power-of-8 size and
  * forced to use 'long' read/writes to try to atomically copy long counters.
  * Best-effort only.  No barriers here, since it _will_ race with concurrent
@@ -421,7 +423,7 @@ static inline void __bpf_prog_uncharge(struct user_struct *user, u32 pages)
 {
 }
 
-static inline int bpf_obj_get_user(const char __user *pathname)
+static inline int bpf_obj_get_user(const char __user *pathname, int flags)
 {
 	return -EOPNOTSUPP;
 }
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 4303fb6c3817..d83f95ea6a1b 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -218,6 +218,10 @@ enum bpf_attach_type {
 
 #define BPF_OBJ_NAME_LEN 16U
 
+/* Flags for accessing BPF object */
+#define BPF_F_RDONLY		(1U << 3)
+#define BPF_F_WRONLY		(1U << 4)
+
 union bpf_attr {
 	struct { /* anonymous struct used by BPF_MAP_CREATE command */
 		__u32	map_type;	/* one of enum bpf_map_type */
@@ -260,6 +264,7 @@ union bpf_attr {
 	struct { /* anonymous struct used by BPF_OBJ_* commands */
 		__aligned_u64	pathname;
 		__u32		bpf_fd;
+		__u32		file_flags;
 	};
 
 	struct { /* anonymous struct used by BPF_PROG_ATTACH/DETACH commands */
@@ -287,6 +292,7 @@ union bpf_attr {
 			__u32		map_id;
 		};
 		__u32		next_id;
+		__u32		open_flags;
 	};
 
 	struct { /* anonymous struct used by BPF_OBJ_GET_INFO_BY_FD */
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 68d866628be0..988c04c91e10 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -19,6 +19,9 @@
 
 #include "map_in_map.h"
 
+#define ARRAY_CREATE_FLAG_MASK \
+	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+
 static void bpf_array_free_percpu(struct bpf_array *array)
 {
 	int i;
@@ -56,7 +59,8 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 
 	/* check sanity of attributes */
 	if (attr->max_entries == 0 || attr->key_size != 4 ||
-	    attr->value_size == 0 || attr->map_flags & ~BPF_F_NUMA_NODE ||
+	    attr->value_size == 0 ||
+	    attr->map_flags & ~ARRAY_CREATE_FLAG_MASK ||
 	    (percpu && numa_node != NUMA_NO_NODE))
 		return ERR_PTR(-EINVAL);
 
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index e093d9a2c4dd..e5d3de7cff2e 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -50,6 +50,9 @@
 #include <linux/bpf.h>
 #include <linux/filter.h>
 
+#define DEV_CREATE_FLAG_MASK \
+	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+
 struct bpf_dtab_netdev {
 	struct net_device *dev;
 	struct bpf_dtab *dtab;
@@ -80,7 +83,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 
 	/* check sanity of attributes */
 	if (attr->max_entries == 0 || attr->key_size != 4 ||
-	    attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE)
+	    attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK)
 		return ERR_PTR(-EINVAL);
 
 	dtab = kzalloc(sizeof(*dtab), GFP_USER);
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 431126f31ea3..919955236e63 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -18,8 +18,9 @@
 #include "bpf_lru_list.h"
 #include "map_in_map.h"
 
-#define HTAB_CREATE_FLAG_MASK \
-	(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE)
+#define HTAB_CREATE_FLAG_MASK						\
+	(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE |	\
+	 BPF_F_RDONLY | BPF_F_WRONLY)
 
 struct bucket {
 	struct hlist_nulls_head head;
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index be1dde967208..01aaef1a77c5 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -295,7 +295,7 @@ out:
 }
 
 static void *bpf_obj_do_get(const struct filename *pathname,
-			    enum bpf_type *type)
+			    enum bpf_type *type, int flags)
 {
 	struct inode *inode;
 	struct path path;
@@ -307,7 +307,7 @@ static void *bpf_obj_do_get(const struct filename *pathname,
 		return ERR_PTR(ret);
 
 	inode = d_backing_inode(path.dentry);
-	ret = inode_permission(inode, MAY_WRITE);
+	ret = inode_permission(inode, ACC_MODE(flags));
 	if (ret)
 		goto out;
 
@@ -326,18 +326,23 @@ out:
 	return ERR_PTR(ret);
 }
 
-int bpf_obj_get_user(const char __user *pathname)
+int bpf_obj_get_user(const char __user *pathname, int flags)
 {
 	enum bpf_type type = BPF_TYPE_UNSPEC;
 	struct filename *pname;
 	int ret = -ENOENT;
+	int f_flags;
 	void *raw;
 
+	f_flags = bpf_get_file_flag(flags);
+	if (f_flags < 0)
+		return f_flags;
+
 	pname = getname(pathname);
 	if (IS_ERR(pname))
 		return PTR_ERR(pname);
 
-	raw = bpf_obj_do_get(pname, &type);
+	raw = bpf_obj_do_get(pname, &type, f_flags);
 	if (IS_ERR(raw)) {
 		ret = PTR_ERR(raw);
 		goto out;
@@ -346,7 +351,7 @@ int bpf_obj_get_user(const char __user *pathname)
 	if (type == BPF_TYPE_PROG)
 		ret = bpf_prog_new_fd(raw);
 	else if (type == BPF_TYPE_MAP)
-		ret = bpf_map_new_fd(raw);
+		ret = bpf_map_new_fd(raw, f_flags);
 	else
 		goto out;
 
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 34d8a690ea05..885e45479680 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -495,7 +495,8 @@ out:
 #define LPM_KEY_SIZE_MAX	LPM_KEY_SIZE(LPM_DATA_SIZE_MAX)
 #define LPM_KEY_SIZE_MIN	LPM_KEY_SIZE(LPM_DATA_SIZE_MIN)
 
-#define LPM_CREATE_FLAG_MASK	(BPF_F_NO_PREALLOC | BPF_F_NUMA_NODE)
+#define LPM_CREATE_FLAG_MASK	(BPF_F_NO_PREALLOC | BPF_F_NUMA_NODE |	\
+				 BPF_F_RDONLY | BPF_F_WRONLY)
 
 static struct bpf_map *trie_alloc(union bpf_attr *attr)
 {
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index a298d6666698..86ec846f2d5e 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -40,6 +40,9 @@
 #include <linux/list.h>
 #include <net/strparser.h>
 
+#define SOCK_CREATE_FLAG_MASK \
+	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+
 struct bpf_stab {
 	struct bpf_map map;
 	struct sock **sock_map;
@@ -489,7 +492,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
 
 	/* check sanity of attributes */
 	if (attr->max_entries == 0 || attr->key_size != 4 ||
-	    attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE)
+	    attr->value_size != 4 || attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
 		return ERR_PTR(-EINVAL);
 
 	if (attr->value_size > KMALLOC_MAX_SIZE)
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 135be433e9a0..a15bc636cc98 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -11,6 +11,9 @@
 #include <linux/perf_event.h>
 #include "percpu_freelist.h"
 
+#define STACK_CREATE_FLAG_MASK \
+	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+
 struct stack_map_bucket {
 	struct pcpu_freelist_node fnode;
 	u32 hash;
@@ -60,7 +63,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
 	if (!capable(CAP_SYS_ADMIN))
 		return ERR_PTR(-EPERM);
 
-	if (attr->map_flags & ~BPF_F_NUMA_NODE)
+	if (attr->map_flags & ~STACK_CREATE_FLAG_MASK)
 		return ERR_PTR(-EINVAL);
 
 	/* check sanity of attributes */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 0e893cac6795..676a06e6b322 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -34,6 +34,8 @@
 #define IS_FD_HASH(map) ((map)->map_type == BPF_MAP_TYPE_HASH_OF_MAPS)
 #define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_HASH(map))
 
+#define BPF_OBJ_FLAG_MASK   (BPF_F_RDONLY | BPF_F_WRONLY)
+
 DEFINE_PER_CPU(int, bpf_prog_active);
 static DEFINE_IDR(prog_idr);
 static DEFINE_SPINLOCK(prog_idr_lock);
@@ -294,17 +296,48 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
 }
 #endif
 
+static ssize_t bpf_dummy_read(struct file *filp, char __user *buf, size_t siz,
+			      loff_t *ppos)
+{
+	/* We need this handler such that alloc_file() enables
+	 * f_mode with FMODE_CAN_READ.
+	 */
+	return -EINVAL;
+}
+
+static ssize_t bpf_dummy_write(struct file *filp, const char __user *buf,
+			       size_t siz, loff_t *ppos)
+{
+	/* We need this handler such that alloc_file() enables
+	 * f_mode with FMODE_CAN_WRITE.
+	 */
+	return -EINVAL;
+}
+
 static const struct file_operations bpf_map_fops = {
 #ifdef CONFIG_PROC_FS
 	.show_fdinfo	= bpf_map_show_fdinfo,
 #endif
 	.release	= bpf_map_release,
+	.read		= bpf_dummy_read,
+	.write		= bpf_dummy_write,
 };
 
-int bpf_map_new_fd(struct bpf_map *map)
+int bpf_map_new_fd(struct bpf_map *map, int flags)
 {
 	return anon_inode_getfd("bpf-map", &bpf_map_fops, map,
-				O_RDWR | O_CLOEXEC);
+				flags | O_CLOEXEC);
+}
+
+int bpf_get_file_flag(int flags)
+{
+	if ((flags & BPF_F_RDONLY) && (flags & BPF_F_WRONLY))
+		return -EINVAL;
+	if (flags & BPF_F_RDONLY)
+		return O_RDONLY;
+	if (flags & BPF_F_WRONLY)
+		return O_WRONLY;
+	return O_RDWR;
 }
 
 /* helper macro to check that unused fields 'union bpf_attr' are zero */
@@ -344,12 +377,17 @@ static int map_create(union bpf_attr *attr)
 {
 	int numa_node = bpf_map_attr_numa_node(attr);
 	struct bpf_map *map;
+	int f_flags;
 	int err;
 
 	err = CHECK_ATTR(BPF_MAP_CREATE);
 	if (err)
 		return -EINVAL;
 
+	f_flags = bpf_get_file_flag(attr->map_flags);
+	if (f_flags < 0)
+		return f_flags;
+
 	if (numa_node != NUMA_NO_NODE &&
 	    ((unsigned int)numa_node >= nr_node_ids ||
 	     !node_online(numa_node)))
@@ -375,7 +413,7 @@ static int map_create(union bpf_attr *attr)
 	if (err)
 		goto free_map;
 
-	err = bpf_map_new_fd(map);
+	err = bpf_map_new_fd(map, f_flags);
 	if (err < 0) {
 		/* failed to allocate fd.
 		 * bpf_map_put() is needed because the above
@@ -490,6 +528,11 @@ static int map_lookup_elem(union bpf_attr *attr)
 	if (IS_ERR(map))
 		return PTR_ERR(map);
 
+	if (!(f.file->f_mode & FMODE_CAN_READ)) {
+		err = -EPERM;
+		goto err_put;
+	}
+
 	key = memdup_user(ukey, map->key_size);
 	if (IS_ERR(key)) {
 		err = PTR_ERR(key);
@@ -570,6 +613,11 @@ static int map_update_elem(union bpf_attr *attr)
 	if (IS_ERR(map))
 		return PTR_ERR(map);
 
+	if (!(f.file->f_mode & FMODE_CAN_WRITE)) {
+		err = -EPERM;
+		goto err_put;
+	}
+
 	key = memdup_user(ukey, map->key_size);
 	if (IS_ERR(key)) {
 		err = PTR_ERR(key);
@@ -659,6 +707,11 @@ static int map_delete_elem(union bpf_attr *attr)
 	if (IS_ERR(map))
 		return PTR_ERR(map);
 
+	if (!(f.file->f_mode & FMODE_CAN_WRITE)) {
+		err = -EPERM;
+		goto err_put;
+	}
+
 	key = memdup_user(ukey, map->key_size);
 	if (IS_ERR(key)) {
 		err = PTR_ERR(key);
@@ -702,6 +755,11 @@ static int map_get_next_key(union bpf_attr *attr)
 	if (IS_ERR(map))
 		return PTR_ERR(map);
 
+	if (!(f.file->f_mode & FMODE_CAN_READ)) {
+		err = -EPERM;
+		goto err_put;
+	}
+
 	if (ukey) {
 		key = memdup_user(ukey, map->key_size);
 		if (IS_ERR(key)) {
@@ -908,6 +966,8 @@ static const struct file_operations bpf_prog_fops = {
 	.show_fdinfo	= bpf_prog_show_fdinfo,
 #endif
 	.release	= bpf_prog_release,
+	.read		= bpf_dummy_read,
+	.write		= bpf_dummy_write,
 };
 
 int bpf_prog_new_fd(struct bpf_prog *prog)
@@ -1117,11 +1177,11 @@ free_prog_nouncharge:
 	return err;
 }
 
-#define BPF_OBJ_LAST_FIELD bpf_fd
+#define BPF_OBJ_LAST_FIELD file_flags
 
 static int bpf_obj_pin(const union bpf_attr *attr)
 {
-	if (CHECK_ATTR(BPF_OBJ))
+	if (CHECK_ATTR(BPF_OBJ) || attr->file_flags != 0)
 		return -EINVAL;
 
 	return bpf_obj_pin_user(attr->bpf_fd, u64_to_user_ptr(attr->pathname));
@@ -1129,10 +1189,12 @@ static int bpf_obj_pin(const union bpf_attr *attr)
 
 static int bpf_obj_get(const union bpf_attr *attr)
 {
-	if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0)
+	if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0 ||
+	    attr->file_flags & ~BPF_OBJ_FLAG_MASK)
 		return -EINVAL;
 
-	return bpf_obj_get_user(u64_to_user_ptr(attr->pathname));
+	return bpf_obj_get_user(u64_to_user_ptr(attr->pathname),
+				attr->file_flags);
 }
 
 #ifdef CONFIG_CGROUP_BPF
@@ -1392,20 +1454,26 @@ static int bpf_prog_get_fd_by_id(const union bpf_attr *attr)
 	return fd;
 }
 
-#define BPF_MAP_GET_FD_BY_ID_LAST_FIELD map_id
+#define BPF_MAP_GET_FD_BY_ID_LAST_FIELD open_flags
 
 static int bpf_map_get_fd_by_id(const union bpf_attr *attr)
 {
 	struct bpf_map *map;
 	u32 id = attr->map_id;
+	int f_flags;
 	int fd;
 
-	if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID))
+	if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID) ||
+	    attr->open_flags & ~BPF_OBJ_FLAG_MASK)
 		return -EINVAL;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
+	f_flags = bpf_get_file_flag(attr->open_flags);
+	if (f_flags < 0)
+		return f_flags;
+
 	spin_lock_bh(&map_idr_lock);
 	map = idr_find(&map_idr, id);
 	if (map)
@@ -1417,7 +1485,7 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr)
 	if (IS_ERR(map))
 		return PTR_ERR(map);
 
-	fd = bpf_map_new_fd(map);
+	fd = bpf_map_new_fd(map, f_flags);
 	if (fd < 0)
 		bpf_map_put(map);
 
diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c
index 29123934887b..041da0d9c06f 100644
--- a/net/netfilter/xt_bpf.c
+++ b/net/netfilter/xt_bpf.c
@@ -56,7 +56,7 @@ static int __bpf_mt_check_path(const char *path, struct bpf_prog **ret)
 	int retval, fd;
 
 	set_fs(KERNEL_DS);
-	fd = bpf_obj_get_user(path);
+	fd = bpf_obj_get_user(path, 0);
 	set_fs(oldfs);
 	if (fd < 0)
 		return fd;
-- 
cgit v1.2.3


From e6546ef6d86d0fc38e0e84ccae80e641f3fc0087 Mon Sep 17 00:00:00 2001
From: Lawrence Brakmo <brakmo@fb.com>
Date: Fri, 20 Oct 2017 11:05:39 -0700
Subject: bpf: add support for BPF_SOCK_OPS_BASE_RTT

A congestion control algorithm can make a call to the BPF socket_ops
program to request the base RTT. The base RTT can be congestion control
dependent and is meant to represent a congestion threshold such that
RTTs above it indicate congestion. This is especially useful for flows
within a DC where the base RTT is easy to obtain.

Being provided a base RTT solves a basic problem in RTT based congestion
avoidance algorithms (such as Vegas, NV and BBR). Although it is easy
to get the base RTT when the network is not congested, it is very
difficult to do when it is very congested. Newer connections get an
inflated value of the base RTT leading to unfairness (newer flows with a
larger base RTT get more bandwidth). As a result, RTT based congestion
avoidance algorithms tend to update their base RTTs to improve fairness.
In very congested networks this can lead to base RTT inflation, reducing
the ability of these RTT based congestion control algorithms to prevent
congestion.

Note that in my experiments with TCP-NV, the base RTT provided can be
much larger than the actual hardware RTT. For example, experimenting
with hosts within a rack where the hardware RTT is 16-20us, I've used
base RTTs up to 150us. The effect of using a larger base RTT is that the
congestion avoidance algorithm will allow more queueing. When there are
only a few flows the main effect is larger measured RTTs and RPC
latencies due to the increased queueing. When there are a lot of flows,
a larger base RTT can lead to more congestion and more packet drops.
For this case, where the hardware RTT is 20us, a base RTT of 80us
produces good results.

This patch only introduces BPF_SOCK_OPS_BASE_RTT, a later patch in this
set adds support for using it in TCP-NV. Further study and testing is
needed before support can be added to other delay based congestion
avoidance algorithms.

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index d83f95ea6a1b..1aca744c220f 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -955,6 +955,13 @@ enum {
 	BPF_SOCK_OPS_NEEDS_ECN,		/* If connection's congestion control
 					 * needs ECN
 					 */
+	BPF_SOCK_OPS_BASE_RTT,		/* Get base RTT. The correct value is
+					 * based on the path and may be
+					 * dependent on the congestion control
+					 * algorithm. In general it indicates
+					 * a congestion threshold. RTTs above
+					 * this indicate congestion
+					 */
 };
 
 #define TCP_BPF_IW		1001	/* Set TCP initial congestion window */
-- 
cgit v1.2.3


From cd86d1fd21025fdd6daf23d1288da405e7ad0ec6 Mon Sep 17 00:00:00 2001
From: Lawrence Brakmo <brakmo@fb.com>
Date: Fri, 20 Oct 2017 11:05:40 -0700
Subject: bpf: Adding helper function bpf_getsockopt

Adding support for helper function bpf_getsockopt to socket_ops BPF
programs. This patch only supports TCP_CONGESTION.

Signed-off-by: Vlad Vysotsky <vlad@cs.ucla.edu>
Acked-by: Lawrence Brakmo <brakmo@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h                  | 19 ++++++++++---
 net/core/filter.c                         | 46 ++++++++++++++++++++++++++++++-
 tools/testing/selftests/bpf/bpf_helpers.h |  3 ++
 3 files changed, 63 insertions(+), 5 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 1aca744c220f..f650346aaa1a 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -613,12 +613,22 @@ union bpf_attr {
  * int bpf_setsockopt(bpf_socket, level, optname, optval, optlen)
  *     Calls setsockopt. Not all opts are available, only those with
  *     integer optvals plus TCP_CONGESTION.
- *     Supported levels: SOL_SOCKET and IPROTO_TCP
+ *     Supported levels: SOL_SOCKET and IPPROTO_TCP
  *     @bpf_socket: pointer to bpf_socket
- *     @level: SOL_SOCKET or IPROTO_TCP
+ *     @level: SOL_SOCKET or IPPROTO_TCP
  *     @optname: option name
  *     @optval: pointer to option value
- *     @optlen: length of optval in byes
+ *     @optlen: length of optval in bytes
+ *     Return: 0 or negative error
+ *
+ * int bpf_getsockopt(bpf_socket, level, optname, optval, optlen)
+ *     Calls getsockopt. Not all opts are available.
+ *     Supported levels: IPPROTO_TCP
+ *     @bpf_socket: pointer to bpf_socket
+ *     @level: IPPROTO_TCP
+ *     @optname: option name
+ *     @optval: pointer to option value
+ *     @optlen: length of optval in bytes
  *     Return: 0 or negative error
  *
  * int bpf_skb_adjust_room(skb, len_diff, mode, flags)
@@ -721,7 +731,8 @@ union bpf_attr {
 	FN(sock_map_update),		\
 	FN(xdp_adjust_meta),		\
 	FN(perf_event_read_value),	\
-	FN(perf_prog_read_value),
+	FN(perf_prog_read_value),	\
+	FN(getsockopt),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/net/core/filter.c b/net/core/filter.c
index 09e011f20291..ccf62f44140a 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3273,7 +3273,7 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
 
 static const struct bpf_func_proto bpf_setsockopt_proto = {
 	.func		= bpf_setsockopt,
-	.gpl_only	= true,
+	.gpl_only	= false,
 	.ret_type	= RET_INTEGER,
 	.arg1_type	= ARG_PTR_TO_CTX,
 	.arg2_type	= ARG_ANYTHING,
@@ -3282,6 +3282,48 @@ static const struct bpf_func_proto bpf_setsockopt_proto = {
 	.arg5_type	= ARG_CONST_SIZE,
 };
 
+BPF_CALL_5(bpf_getsockopt, struct bpf_sock_ops_kern *, bpf_sock,
+	   int, level, int, optname, char *, optval, int, optlen)
+{
+	struct sock *sk = bpf_sock->sk;
+	int ret = 0;
+
+	if (!sk_fullsock(sk))
+		goto err_clear;
+
+#ifdef CONFIG_INET
+	if (level == SOL_TCP && sk->sk_prot->getsockopt == tcp_getsockopt) {
+		if (optname == TCP_CONGESTION) {
+			struct inet_connection_sock *icsk = inet_csk(sk);
+
+			if (!icsk->icsk_ca_ops || optlen <= 1)
+				goto err_clear;
+			strncpy(optval, icsk->icsk_ca_ops->name, optlen);
+			optval[optlen - 1] = 0;
+		} else {
+			goto err_clear;
+		}
+	} else {
+		goto err_clear;
+	}
+	return ret;
+#endif
+err_clear:
+	memset(optval, 0, optlen);
+	return -EINVAL;
+}
+
+static const struct bpf_func_proto bpf_getsockopt_proto = {
+	.func		= bpf_getsockopt,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_ANYTHING,
+	.arg4_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg5_type	= ARG_CONST_SIZE,
+};
+
 static const struct bpf_func_proto *
 bpf_base_func_proto(enum bpf_func_id func_id)
 {
@@ -3460,6 +3502,8 @@ static const struct bpf_func_proto *
 	switch (func_id) {
 	case BPF_FUNC_setsockopt:
 		return &bpf_setsockopt_proto;
+	case BPF_FUNC_getsockopt:
+		return &bpf_getsockopt_proto;
 	case BPF_FUNC_sock_map_update:
 		return &bpf_sock_map_update_proto;
 	default:
diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h
index e25dbf6038cf..609514f74482 100644
--- a/tools/testing/selftests/bpf/bpf_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_helpers.h
@@ -67,6 +67,9 @@ static int (*bpf_xdp_adjust_meta)(void *ctx, int offset) =
 static int (*bpf_setsockopt)(void *ctx, int level, int optname, void *optval,
 			     int optlen) =
 	(void *) BPF_FUNC_setsockopt;
+static int (*bpf_getsockopt)(void *ctx, int level, int optname, void *optval,
+			     int optlen) =
+	(void *) BPF_FUNC_getsockopt;
 static int (*bpf_sk_redirect_map)(void *map, int key, int flags) =
 	(void *) BPF_FUNC_sk_redirect_map;
 static int (*bpf_sock_map_update)(void *map, void *key, void *value,
-- 
cgit v1.2.3


From 40b16b9be5773a314948656c96adf7bf7cfdbd0b Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sat, 21 Oct 2017 11:45:46 +0200
Subject: batman-adv: use inline kernel-doc for uapi constants

The enums of constants for netlink tend to become rather large over time.
Documenting them is easier when the kernel-doc is actually next to the
constant and not in a different block above the enum.

Also, inline kernel-doc allows multi-paragraph descriptions. This could be
required to better document the netlink command types and the expected
return values.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 include/uapi/linux/batman_adv.h | 369 +++++++++++++++++++++++++++++++---------
 1 file changed, 290 insertions(+), 79 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h
index a83ddb7b63db..efd641c8a5d6 100644
--- a/include/uapi/linux/batman_adv.h
+++ b/include/uapi/linux/batman_adv.h
@@ -24,20 +24,6 @@
 
 /**
  * enum batadv_tt_client_flags - TT client specific flags
- * @BATADV_TT_CLIENT_DEL: the client has to be deleted from the table
- * @BATADV_TT_CLIENT_ROAM: the client roamed to/from another node and the new
- *  update telling its new real location has not been received/sent yet
- * @BATADV_TT_CLIENT_WIFI: this client is connected through a wifi interface.
- *  This information is used by the "AP Isolation" feature
- * @BATADV_TT_CLIENT_ISOLA: this client is considered "isolated". This
- *  information is used by the Extended Isolation feature
- * @BATADV_TT_CLIENT_NOPURGE: this client should never be removed from the table
- * @BATADV_TT_CLIENT_NEW: this client has been added to the local table but has
- *  not been announced yet
- * @BATADV_TT_CLIENT_PENDING: this client is marked for removal but it is kept
- *  in the table for one more originator interval for consistency purposes
- * @BATADV_TT_CLIENT_TEMP: this global client has been detected to be part of
- *  the network but no nnode has already announced it
  *
  * Bits from 0 to 7 are called _remote flags_ because they are sent on the wire.
  * Bits from 8 to 15 are called _local flags_ because they are used for local
@@ -48,160 +34,385 @@
  * in the TT CRC computation.
  */
 enum batadv_tt_client_flags {
+	/**
+	 * @BATADV_TT_CLIENT_DEL: the client has to be deleted from the table
+	 */
 	BATADV_TT_CLIENT_DEL     = (1 << 0),
+
+	/**
+	 * @BATADV_TT_CLIENT_ROAM: the client roamed to/from another node and
+	 * the new update telling its new real location has not been
+	 * received/sent yet
+	 */
 	BATADV_TT_CLIENT_ROAM    = (1 << 1),
+
+	/**
+	 * @BATADV_TT_CLIENT_WIFI: this client is connected through a wifi
+	 * interface. This information is used by the "AP Isolation" feature
+	 */
 	BATADV_TT_CLIENT_WIFI    = (1 << 4),
+
+	/**
+	 * @BATADV_TT_CLIENT_ISOLA: this client is considered "isolated". This
+	 * information is used by the Extended Isolation feature
+	 */
 	BATADV_TT_CLIENT_ISOLA	 = (1 << 5),
+
+	/**
+	 * @BATADV_TT_CLIENT_NOPURGE: this client should never be removed from
+	 * the table
+	 */
 	BATADV_TT_CLIENT_NOPURGE = (1 << 8),
+
+	/**
+	 * @BATADV_TT_CLIENT_NEW: this client has been added to the local table
+	 * but has not been announced yet
+	 */
 	BATADV_TT_CLIENT_NEW     = (1 << 9),
+
+	/**
+	 * @BATADV_TT_CLIENT_PENDING: this client is marked for removal but it
+	 * is kept in the table for one more originator interval for consistency
+	 * purposes
+	 */
 	BATADV_TT_CLIENT_PENDING = (1 << 10),
+
+	/**
+	 * @BATADV_TT_CLIENT_TEMP: this global client has been detected to be
+	 * part of the network but no node has announced it yet
+	 */
 	BATADV_TT_CLIENT_TEMP	 = (1 << 11),
 };
 
 /**
  * enum batadv_nl_attrs - batman-adv netlink attributes
- *
- * @BATADV_ATTR_UNSPEC: unspecified attribute to catch errors
- * @BATADV_ATTR_VERSION: batman-adv version string
- * @BATADV_ATTR_ALGO_NAME: name of routing algorithm
- * @BATADV_ATTR_MESH_IFINDEX: index of the batman-adv interface
- * @BATADV_ATTR_MESH_IFNAME: name of the batman-adv interface
- * @BATADV_ATTR_MESH_ADDRESS: mac address of the batman-adv interface
- * @BATADV_ATTR_HARD_IFINDEX: index of the non-batman-adv interface
- * @BATADV_ATTR_HARD_IFNAME: name of the non-batman-adv interface
- * @BATADV_ATTR_HARD_ADDRESS: mac address of the non-batman-adv interface
- * @BATADV_ATTR_ORIG_ADDRESS: originator mac address
- * @BATADV_ATTR_TPMETER_RESULT: result of run (see batadv_tp_meter_status)
- * @BATADV_ATTR_TPMETER_TEST_TIME: time (msec) the run took
- * @BATADV_ATTR_TPMETER_BYTES: amount of acked bytes during run
- * @BATADV_ATTR_TPMETER_COOKIE: session cookie to match tp_meter session
- * @BATADV_ATTR_PAD: attribute used for padding for 64-bit alignment
- * @BATADV_ATTR_ACTIVE: Flag indicating if the hard interface is active
- * @BATADV_ATTR_TT_ADDRESS: Client MAC address
- * @BATADV_ATTR_TT_TTVN: Translation table version
- * @BATADV_ATTR_TT_LAST_TTVN: Previous translation table version
- * @BATADV_ATTR_TT_CRC32: CRC32 over translation table
- * @BATADV_ATTR_TT_VID: VLAN ID
- * @BATADV_ATTR_TT_FLAGS: Translation table client flags
- * @BATADV_ATTR_FLAG_BEST: Flags indicating entry is the best
- * @BATADV_ATTR_LAST_SEEN_MSECS: Time in milliseconds since last seen
- * @BATADV_ATTR_NEIGH_ADDRESS: Neighbour MAC address
- * @BATADV_ATTR_TQ: TQ to neighbour
- * @BATADV_ATTR_THROUGHPUT: Estimated throughput to Neighbour
- * @BATADV_ATTR_BANDWIDTH_UP: Reported uplink bandwidth
- * @BATADV_ATTR_BANDWIDTH_DOWN: Reported downlink bandwidth
- * @BATADV_ATTR_ROUTER: Gateway router MAC address
- * @BATADV_ATTR_BLA_OWN: Flag indicating own originator
- * @BATADV_ATTR_BLA_ADDRESS: Bridge loop avoidance claim MAC address
- * @BATADV_ATTR_BLA_VID: BLA VLAN ID
- * @BATADV_ATTR_BLA_BACKBONE: BLA gateway originator MAC address
- * @BATADV_ATTR_BLA_CRC: BLA CRC
- * @__BATADV_ATTR_AFTER_LAST: internal use
- * @NUM_BATADV_ATTR: total number of batadv_nl_attrs available
- * @BATADV_ATTR_MAX: highest attribute number currently defined
  */
 enum batadv_nl_attrs {
+	/**
+	 * @BATADV_ATTR_UNSPEC: unspecified attribute to catch errors
+	 */
 	BATADV_ATTR_UNSPEC,
+
+	/**
+	 * @BATADV_ATTR_VERSION: batman-adv version string
+	 */
 	BATADV_ATTR_VERSION,
+
+	/**
+	 * @BATADV_ATTR_ALGO_NAME: name of routing algorithm
+	 */
 	BATADV_ATTR_ALGO_NAME,
+
+	/**
+	 * @BATADV_ATTR_MESH_IFINDEX: index of the batman-adv interface
+	 */
 	BATADV_ATTR_MESH_IFINDEX,
+
+	/**
+	 * @BATADV_ATTR_MESH_IFNAME: name of the batman-adv interface
+	 */
 	BATADV_ATTR_MESH_IFNAME,
+
+	/**
+	 * @BATADV_ATTR_MESH_ADDRESS: mac address of the batman-adv interface
+	 */
 	BATADV_ATTR_MESH_ADDRESS,
+
+	/**
+	 * @BATADV_ATTR_HARD_IFINDEX: index of the non-batman-adv interface
+	 */
 	BATADV_ATTR_HARD_IFINDEX,
+
+	/**
+	 * @BATADV_ATTR_HARD_IFNAME: name of the non-batman-adv interface
+	 */
 	BATADV_ATTR_HARD_IFNAME,
+
+	/**
+	 * @BATADV_ATTR_HARD_ADDRESS: mac address of the non-batman-adv
+	 * interface
+	 */
 	BATADV_ATTR_HARD_ADDRESS,
+
+	/**
+	 * @BATADV_ATTR_ORIG_ADDRESS: originator mac address
+	 */
 	BATADV_ATTR_ORIG_ADDRESS,
+
+	/**
+	 * @BATADV_ATTR_TPMETER_RESULT: result of run (see
+	 * batadv_tp_meter_status)
+	 */
 	BATADV_ATTR_TPMETER_RESULT,
+
+	/**
+	 * @BATADV_ATTR_TPMETER_TEST_TIME: time (msec) the run took
+	 */
 	BATADV_ATTR_TPMETER_TEST_TIME,
+
+	/**
+	 * @BATADV_ATTR_TPMETER_BYTES: amount of acked bytes during run
+	 */
 	BATADV_ATTR_TPMETER_BYTES,
+
+	/**
+	 * @BATADV_ATTR_TPMETER_COOKIE: session cookie to match tp_meter session
+	 */
 	BATADV_ATTR_TPMETER_COOKIE,
+
+	/**
+	 * @BATADV_ATTR_PAD: attribute used for padding for 64-bit alignment
+	 */
 	BATADV_ATTR_PAD,
+
+	/**
+	 * @BATADV_ATTR_ACTIVE: Flag indicating if the hard interface is active
+	 */
 	BATADV_ATTR_ACTIVE,
+
+	/**
+	 * @BATADV_ATTR_TT_ADDRESS: Client MAC address
+	 */
 	BATADV_ATTR_TT_ADDRESS,
+
+	/**
+	 * @BATADV_ATTR_TT_TTVN: Translation table version
+	 */
 	BATADV_ATTR_TT_TTVN,
+
+	/**
+	 * @BATADV_ATTR_TT_LAST_TTVN: Previous translation table version
+	 */
 	BATADV_ATTR_TT_LAST_TTVN,
+
+	/**
+	 * @BATADV_ATTR_TT_CRC32: CRC32 over translation table
+	 */
 	BATADV_ATTR_TT_CRC32,
+
+	/**
+	 * @BATADV_ATTR_TT_VID: VLAN ID
+	 */
 	BATADV_ATTR_TT_VID,
+
+	/**
+	 * @BATADV_ATTR_TT_FLAGS: Translation table client flags
+	 */
 	BATADV_ATTR_TT_FLAGS,
+
+	/**
+	 * @BATADV_ATTR_FLAG_BEST: Flags indicating entry is the best
+	 */
 	BATADV_ATTR_FLAG_BEST,
+
+	/**
+	 * @BATADV_ATTR_LAST_SEEN_MSECS: Time in milliseconds since last seen
+	 */
 	BATADV_ATTR_LAST_SEEN_MSECS,
+
+	/**
+	 * @BATADV_ATTR_NEIGH_ADDRESS: Neighbour MAC address
+	 */
 	BATADV_ATTR_NEIGH_ADDRESS,
+
+	/**
+	 * @BATADV_ATTR_TQ: TQ to neighbour
+	 */
 	BATADV_ATTR_TQ,
+
+	/**
+	 * @BATADV_ATTR_THROUGHPUT: Estimated throughput to Neighbour
+	 */
 	BATADV_ATTR_THROUGHPUT,
+
+	/**
+	 * @BATADV_ATTR_BANDWIDTH_UP: Reported uplink bandwidth
+	 */
 	BATADV_ATTR_BANDWIDTH_UP,
+
+	/**
+	 * @BATADV_ATTR_BANDWIDTH_DOWN: Reported downlink bandwidth
+	 */
 	BATADV_ATTR_BANDWIDTH_DOWN,
+
+	/**
+	 * @BATADV_ATTR_ROUTER: Gateway router MAC address
+	 */
 	BATADV_ATTR_ROUTER,
+
+	/**
+	 * @BATADV_ATTR_BLA_OWN: Flag indicating own originator
+	 */
 	BATADV_ATTR_BLA_OWN,
+
+	/**
+	 * @BATADV_ATTR_BLA_ADDRESS: Bridge loop avoidance claim MAC address
+	 */
 	BATADV_ATTR_BLA_ADDRESS,
+
+	/**
+	 * @BATADV_ATTR_BLA_VID: BLA VLAN ID
+	 */
 	BATADV_ATTR_BLA_VID,
+
+	/**
+	 * @BATADV_ATTR_BLA_BACKBONE: BLA gateway originator MAC address
+	 */
 	BATADV_ATTR_BLA_BACKBONE,
+
+	/**
+	 * @BATADV_ATTR_BLA_CRC: BLA CRC
+	 */
 	BATADV_ATTR_BLA_CRC,
+
 	/* add attributes above here, update the policy in netlink.c */
+
+	/**
+	 * @__BATADV_ATTR_AFTER_LAST: internal use
+	 */
 	__BATADV_ATTR_AFTER_LAST,
+
+	/**
+	 * @NUM_BATADV_ATTR: total number of batadv_nl_attrs available
+	 */
 	NUM_BATADV_ATTR = __BATADV_ATTR_AFTER_LAST,
+
+	/**
+	 * @BATADV_ATTR_MAX: highest attribute number currently defined
+	 */
 	BATADV_ATTR_MAX = __BATADV_ATTR_AFTER_LAST - 1
 };
 
 /**
  * enum batadv_nl_commands - supported batman-adv netlink commands
- *
- * @BATADV_CMD_UNSPEC: unspecified command to catch errors
- * @BATADV_CMD_GET_MESH_INFO: Query basic information about batman-adv device
- * @BATADV_CMD_TP_METER: Start a tp meter session
- * @BATADV_CMD_TP_METER_CANCEL: Cancel a tp meter session
- * @BATADV_CMD_GET_ROUTING_ALGOS: Query the list of routing algorithms.
- * @BATADV_CMD_GET_HARDIFS: Query list of hard interfaces
- * @BATADV_CMD_GET_TRANSTABLE_LOCAL: Query list of local translations
- * @BATADV_CMD_GET_TRANSTABLE_GLOBAL Query list of global translations
- * @BATADV_CMD_GET_ORIGINATORS: Query list of originators
- * @BATADV_CMD_GET_NEIGHBORS: Query list of neighbours
- * @BATADV_CMD_GET_GATEWAYS: Query list of gateways
- * @BATADV_CMD_GET_BLA_CLAIM: Query list of bridge loop avoidance claims
- * @BATADV_CMD_GET_BLA_BACKBONE: Query list of bridge loop avoidance backbones
- * @__BATADV_CMD_AFTER_LAST: internal use
- * @BATADV_CMD_MAX: highest used command number
  */
 enum batadv_nl_commands {
+	/**
+	 * @BATADV_CMD_UNSPEC: unspecified command to catch errors
+	 */
 	BATADV_CMD_UNSPEC,
+
+	/**
+	 * @BATADV_CMD_GET_MESH_INFO: Query basic information about batman-adv
+	 * device
+	 */
 	BATADV_CMD_GET_MESH_INFO,
+
+	/**
+	 * @BATADV_CMD_TP_METER: Start a tp meter session
+	 */
 	BATADV_CMD_TP_METER,
+
+	/**
+	 * @BATADV_CMD_TP_METER_CANCEL: Cancel a tp meter session
+	 */
 	BATADV_CMD_TP_METER_CANCEL,
+
+	/**
+	 * @BATADV_CMD_GET_ROUTING_ALGOS: Query the list of routing algorithms.
+	 */
 	BATADV_CMD_GET_ROUTING_ALGOS,
+
+	/**
+	 * @BATADV_CMD_GET_HARDIFS: Query list of hard interfaces
+	 */
 	BATADV_CMD_GET_HARDIFS,
+
+	/**
+	 * @BATADV_CMD_GET_TRANSTABLE_LOCAL: Query list of local translations
+	 */
 	BATADV_CMD_GET_TRANSTABLE_LOCAL,
+
+	/**
+	 * @BATADV_CMD_GET_TRANSTABLE_GLOBAL: Query list of global translations
+	 */
 	BATADV_CMD_GET_TRANSTABLE_GLOBAL,
+
+	/**
+	 * @BATADV_CMD_GET_ORIGINATORS: Query list of originators
+	 */
 	BATADV_CMD_GET_ORIGINATORS,
+
+	/**
+	 * @BATADV_CMD_GET_NEIGHBORS: Query list of neighbours
+	 */
 	BATADV_CMD_GET_NEIGHBORS,
+
+	/**
+	 * @BATADV_CMD_GET_GATEWAYS: Query list of gateways
+	 */
 	BATADV_CMD_GET_GATEWAYS,
+
+	/**
+	 * @BATADV_CMD_GET_BLA_CLAIM: Query list of bridge loop avoidance claims
+	 */
 	BATADV_CMD_GET_BLA_CLAIM,
+
+	/**
+	 * @BATADV_CMD_GET_BLA_BACKBONE: Query list of bridge loop avoidance
+	 * backbones
+	 */
 	BATADV_CMD_GET_BLA_BACKBONE,
+
 	/* add new commands above here */
+
+	/**
+	 * @__BATADV_CMD_AFTER_LAST: internal use
+	 */
 	__BATADV_CMD_AFTER_LAST,
+
+	/**
+	 * @BATADV_CMD_MAX: highest used command number
+	 */
 	BATADV_CMD_MAX = __BATADV_CMD_AFTER_LAST - 1
 };
 
 /**
  * enum batadv_tp_meter_reason - reason of a tp meter test run stop
- * @BATADV_TP_REASON_COMPLETE: sender finished tp run
- * @BATADV_TP_REASON_CANCEL: sender was stopped during run
- * @BATADV_TP_REASON_DST_UNREACHABLE: receiver could not be reached or didn't
- *  answer
- * @BATADV_TP_REASON_RESEND_LIMIT: (unused) sender retry reached limit
- * @BATADV_TP_REASON_ALREADY_ONGOING: test to or from the same node already
- *  ongoing
- * @BATADV_TP_REASON_MEMORY_ERROR: test was stopped due to low memory
- * @BATADV_TP_REASON_CANT_SEND: failed to send via outgoing interface
- * @BATADV_TP_REASON_TOO_MANY: too many ongoing sessions
  */
 enum batadv_tp_meter_reason {
+	/**
+	 * @BATADV_TP_REASON_COMPLETE: sender finished tp run
+	 */
 	BATADV_TP_REASON_COMPLETE		= 3,
+
+	/**
+	 * @BATADV_TP_REASON_CANCEL: sender was stopped during run
+	 */
 	BATADV_TP_REASON_CANCEL			= 4,
+
 	/* error status >= 128 */
+
+	/**
+	 * @BATADV_TP_REASON_DST_UNREACHABLE: receiver could not be reached or
+	 * didn't answer
+	 */
 	BATADV_TP_REASON_DST_UNREACHABLE	= 128,
+
+	/**
+	 * @BATADV_TP_REASON_RESEND_LIMIT: (unused) sender retry reached limit
+	 */
 	BATADV_TP_REASON_RESEND_LIMIT		= 129,
+
+	/**
+	 * @BATADV_TP_REASON_ALREADY_ONGOING: test to or from the same node
+	 * already ongoing
+	 */
 	BATADV_TP_REASON_ALREADY_ONGOING	= 130,
+
+	/**
+	 * @BATADV_TP_REASON_MEMORY_ERROR: test was stopped due to low memory
+	 */
 	BATADV_TP_REASON_MEMORY_ERROR		= 131,
+
+	/**
+	 * @BATADV_TP_REASON_CANT_SEND: failed to send via outgoing interface
+	 */
 	BATADV_TP_REASON_CANT_SEND		= 132,
+
+	/**
+	 * @BATADV_TP_REASON_TOO_MANY: too many ongoing sessions
+	 */
 	BATADV_TP_REASON_TOO_MANY		= 133,
 };
 
-- 
cgit v1.2.3


From 71c02379c762cb616c00fd5c4ed253fbf6bbe11b Mon Sep 17 00:00:00 2001
From: Christoph Paasch <cpaasch@apple.com>
Date: Mon, 23 Oct 2017 13:22:23 -0700
Subject: tcp: Configure TFO without cookie per socket and/or per route

We already allow enabling TFO without a cookie by using the
fastopen-sysctl and setting it to TFO_SERVER_COOKIE_NOT_REQD (or
TFO_CLIENT_NO_COOKIE).
This is safe to do in certain environments where we know that there
isn't a malicious host (e.g., data-centers) or when the
application-protocol already provides an authentication mechanism in the
first flight of data.

A server however might be providing multiple services or talking to both
sides (public Internet and data-center). So, this server would want to
enable cookie-less TFO for certain services and/or for connections that
go to the data-center.

This patch exposes a socket-option and a per-route attribute to enable such
fine-grained configurations.

Signed-off-by: Christoph Paasch <cpaasch@apple.com>
Reviewed-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h            |  3 ++-
 include/net/tcp.h              |  3 ++-
 include/uapi/linux/rtnetlink.h |  2 ++
 include/uapi/linux/tcp.h       |  1 +
 net/ipv4/tcp.c                 | 12 ++++++++++++
 net/ipv4/tcp_fastopen.c        | 20 +++++++++++++++++---
 net/ipv4/tcp_input.c           |  2 +-
 7 files changed, 37 insertions(+), 6 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 1d2c44e09e31..173a7c2f9636 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -215,7 +215,8 @@ struct tcp_sock {
 	u8	chrono_type:2,	/* current chronograph type */
 		rate_app_limited:1,  /* rate_{delivered,interval_us} limited? */
 		fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */
-		unused:4;
+		fastopen_no_cookie:1, /* Allow send/recv SYN+data without a cookie */
+		unused:3;
 	u8	nonagle     : 4,/* Disable Nagle algorithm?             */
 		thin_lto    : 1,/* Use linear timeouts for thin streams */
 		unused1	    : 1,
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 2c13484704cb..2392f74074e7 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1567,7 +1567,8 @@ int tcp_fastopen_reset_cipher(struct net *net, struct sock *sk,
 void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb);
 struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
 			      struct request_sock *req,
-			      struct tcp_fastopen_cookie *foc);
+			      struct tcp_fastopen_cookie *foc,
+			      const struct dst_entry *dst);
 void tcp_fastopen_init_key_once(struct net *net);
 bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss,
 			     struct tcp_fastopen_cookie *cookie);
diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index dab7dad9e01a..fe6679268901 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -430,6 +430,8 @@ enum {
 #define RTAX_QUICKACK RTAX_QUICKACK
 	RTAX_CC_ALGO,
 #define RTAX_CC_ALGO RTAX_CC_ALGO
+	RTAX_FASTOPEN_NO_COOKIE,
+#define RTAX_FASTOPEN_NO_COOKIE RTAX_FASTOPEN_NO_COOKIE
 	__RTAX_MAX
 };
 
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 69c7493e42f8..d67e1d40c6d6 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -120,6 +120,7 @@ enum {
 #define TCP_ULP			31	/* Attach a ULP to a TCP connection */
 #define TCP_MD5SIG_EXT		32	/* TCP MD5 Signature with extensions */
 #define TCP_FASTOPEN_KEY	33	/* Set the key for Fast Open (cookie) */
+#define TCP_FASTOPEN_NO_COOKIE	34	/* Enable TFO without a TFO cookie */
 
 struct tcp_repair_opt {
 	__u32	opt_code;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index be07e9b6dbdd..8f36277e82e9 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2836,6 +2836,14 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 			err = -EOPNOTSUPP;
 		}
 		break;
+	case TCP_FASTOPEN_NO_COOKIE:
+		if (val > 1 || val < 0)
+			err = -EINVAL;
+		else if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
+			err = -EINVAL;
+		else
+			tp->fastopen_no_cookie = val;
+		break;
 	case TCP_TIMESTAMP:
 		if (!tp->repair)
 			err = -EPERM;
@@ -3256,6 +3264,10 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
 		val = tp->fastopen_connect;
 		break;
 
+	case TCP_FASTOPEN_NO_COOKIE:
+		val = tp->fastopen_no_cookie;
+		break;
+
 	case TCP_TIMESTAMP:
 		val = tcp_time_stamp_raw() + tp->tsoffset;
 		break;
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index 21075ce19cb6..e0a4b56644aa 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -310,13 +310,23 @@ static bool tcp_fastopen_queue_check(struct sock *sk)
 	return true;
 }
 
+static bool tcp_fastopen_no_cookie(const struct sock *sk,
+				   const struct dst_entry *dst,
+				   int flag)
+{
+	return (sock_net(sk)->ipv4.sysctl_tcp_fastopen & flag) ||
+	       tcp_sk(sk)->fastopen_no_cookie ||
+	       (dst && dst_metric(dst, RTAX_FASTOPEN_NO_COOKIE));
+}
+
 /* Returns true if we should perform Fast Open on the SYN. The cookie (foc)
  * may be updated and return the client in the SYN-ACK later. E.g., Fast Open
  * cookie request (foc->len == 0).
  */
 struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
 			      struct request_sock *req,
-			      struct tcp_fastopen_cookie *foc)
+			      struct tcp_fastopen_cookie *foc,
+			      const struct dst_entry *dst)
 {
 	bool syn_data = TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1;
 	int tcp_fastopen = sock_net(sk)->ipv4.sysctl_tcp_fastopen;
@@ -333,7 +343,8 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
 		return NULL;
 	}
 
-	if (syn_data && (tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD))
+	if (syn_data &&
+	    tcp_fastopen_no_cookie(sk, dst, TFO_SERVER_COOKIE_NOT_REQD))
 		goto fastopen;
 
 	if (foc->len >= 0 &&  /* Client presents or requests a cookie */
@@ -370,6 +381,7 @@ bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss,
 			       struct tcp_fastopen_cookie *cookie)
 {
 	unsigned long last_syn_loss = 0;
+	const struct dst_entry *dst;
 	int syn_loss = 0;
 
 	tcp_fastopen_cache_get(sk, mss, cookie, &syn_loss, &last_syn_loss);
@@ -387,7 +399,9 @@ bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss,
 		return false;
 	}
 
-	if (sock_net(sk)->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE) {
+	dst = __sk_dst_get(sk);
+
+	if (tcp_fastopen_no_cookie(sk, dst, TFO_CLIENT_NO_COOKIE)) {
 		cookie->len = -1;
 		return true;
 	}
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index c5e64d4b5839..893286db4623 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6332,7 +6332,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 	tcp_openreq_init_rwin(req, sk, dst);
 	if (!want_cookie) {
 		tcp_reqsk_record_syn(sk, req, skb);
-		fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc);
+		fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst);
 	}
 	if (fastopen_sk) {
 		af_ops->send_synack(fastopen_sk, dst, &fl, req,
-- 
cgit v1.2.3


From d15155824c5014803d91b829736d249c500bdda6 Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Tue, 24 Oct 2017 11:22:46 +0100
Subject: linux/compiler.h: Split into compiler.h and compiler_types.h

linux/compiler.h is included indirectly by linux/types.h via
uapi/linux/types.h -> uapi/linux/posix_types.h -> linux/stddef.h
-> uapi/linux/stddef.h and is needed to provide a proper definition of
offsetof.

Unfortunately, compiler.h requires a definition of
smp_read_barrier_depends() for defining lockless_dereference() and soon
for defining READ_ONCE(), which means that all
users of READ_ONCE() will need to include asm/barrier.h to avoid splats
such as:

   In file included from include/uapi/linux/stddef.h:1:0,
                    from include/linux/stddef.h:4,
                    from arch/h8300/kernel/asm-offsets.c:11:
   include/linux/list.h: In function 'list_empty':
>> include/linux/compiler.h:343:2: error: implicit declaration of function 'smp_read_barrier_depends' [-Werror=implicit-function-declaration]
     smp_read_barrier_depends(); /* Enforce dependency ordering from x */ \
     ^

A better alternative is to include asm/barrier.h in linux/compiler.h,
but this requires a type definition for "bool" on some architectures
(e.g. x86), which is defined later by linux/types.h. Type "bool" is also
used directly in linux/compiler.h, so the whole thing is pretty fragile.

This patch splits compiler.h in two: compiler_types.h contains type
annotations, definitions and the compiler-specific parts, whereas
compiler.h #includes compiler_types.h and additionally defines macros
such as {READ,WRITE,ACCESS}_ONCE().

uapi/linux/stddef.h and linux/linkage.h are then moved over to include
linux/compiler_types.h, which fixes the build for h8 and blackfin.

Signed-off-by: Will Deacon <will.deacon@arm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1508840570-22169-2-git-send-email-will.deacon@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/arm/include/asm/ptrace.h   |   3 +-
 arch/sparc/include/asm/ptrace.h |   1 +
 arch/um/include/shared/init.h   |   2 +-
 include/linux/compiler-clang.h  |   2 +-
 include/linux/compiler-gcc.h    |   2 +-
 include/linux/compiler-intel.h  |   2 +-
 include/linux/compiler.h        | 265 +-------------------------------------
 include/linux/compiler_types.h  | 274 ++++++++++++++++++++++++++++++++++++++++
 include/linux/linkage.h         |   2 +-
 include/uapi/linux/stddef.h     |   2 +-
 scripts/headers_install.sh      |   2 +-
 11 files changed, 286 insertions(+), 271 deletions(-)
 create mode 100644 include/linux/compiler_types.h

(limited to 'include/uapi/linux')

diff --git a/arch/arm/include/asm/ptrace.h b/arch/arm/include/asm/ptrace.h
index e9c9a117bd25..c7cdbb43ae7c 100644
--- a/arch/arm/include/asm/ptrace.h
+++ b/arch/arm/include/asm/ptrace.h
@@ -126,8 +126,7 @@ extern unsigned long profile_pc(struct pt_regs *regs);
 /*
  * kprobe-based event tracer support
  */
-#include <linux/stddef.h>
-#include <linux/types.h>
+#include <linux/compiler.h>
 #define MAX_REG_OFFSET (offsetof(struct pt_regs, ARM_ORIG_r0))
 
 extern int regs_query_register_offset(const char *name);
diff --git a/arch/sparc/include/asm/ptrace.h b/arch/sparc/include/asm/ptrace.h
index d73428e4333c..b383484edcd3 100644
--- a/arch/sparc/include/asm/ptrace.h
+++ b/arch/sparc/include/asm/ptrace.h
@@ -6,6 +6,7 @@
 #if defined(__sparc__) && defined(__arch64__)
 #ifndef __ASSEMBLY__
 
+#include <linux/compiler.h>
 #include <linux/threads.h>
 #include <asm/switch_to.h>
 
diff --git a/arch/um/include/shared/init.h b/arch/um/include/shared/init.h
index 233e2593eee0..094e96ce653b 100644
--- a/arch/um/include/shared/init.h
+++ b/arch/um/include/shared/init.h
@@ -40,7 +40,7 @@
 typedef int (*initcall_t)(void);
 typedef void (*exitcall_t)(void);
 
-#include <linux/compiler.h>
+#include <linux/compiler_types.h>
 
 /* These are for everybody (although not all archs will actually
    discard it in modules) */
diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h
index de179993e039..5947a3e6c0e6 100644
--- a/include/linux/compiler-clang.h
+++ b/include/linux/compiler-clang.h
@@ -1,4 +1,4 @@
-#ifndef __LINUX_COMPILER_H
+#ifndef __LINUX_COMPILER_TYPES_H
 #error "Please don't include <linux/compiler-clang.h> directly, include <linux/compiler.h> instead."
 #endif
 
diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index 16d41de92ee3..ce8e965646ef 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -1,4 +1,4 @@
-#ifndef __LINUX_COMPILER_H
+#ifndef __LINUX_COMPILER_TYPES_H
 #error "Please don't include <linux/compiler-gcc.h> directly, include <linux/compiler.h> instead."
 #endif
 
diff --git a/include/linux/compiler-intel.h b/include/linux/compiler-intel.h
index d4c71132d07f..e438ac89c692 100644
--- a/include/linux/compiler-intel.h
+++ b/include/linux/compiler-intel.h
@@ -1,4 +1,4 @@
-#ifndef __LINUX_COMPILER_H
+#ifndef __LINUX_COMPILER_TYPES_H
 #error "Please don't include <linux/compiler-intel.h> directly, include <linux/compiler.h> instead."
 #endif
 
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index e95a2631e545..08083186e54f 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -1,111 +1,12 @@
 #ifndef __LINUX_COMPILER_H
 #define __LINUX_COMPILER_H
 
-#ifndef __ASSEMBLY__
+#include <linux/compiler_types.h>
 
-#ifdef __CHECKER__
-# define __user		__attribute__((noderef, address_space(1)))
-# define __kernel	__attribute__((address_space(0)))
-# define __safe		__attribute__((safe))
-# define __force	__attribute__((force))
-# define __nocast	__attribute__((nocast))
-# define __iomem	__attribute__((noderef, address_space(2)))
-# define __must_hold(x)	__attribute__((context(x,1,1)))
-# define __acquires(x)	__attribute__((context(x,0,1)))
-# define __releases(x)	__attribute__((context(x,1,0)))
-# define __acquire(x)	__context__(x,1)
-# define __release(x)	__context__(x,-1)
-# define __cond_lock(x,c)	((c) ? ({ __acquire(x); 1; }) : 0)
-# define __percpu	__attribute__((noderef, address_space(3)))
-# define __rcu		__attribute__((noderef, address_space(4)))
-# define __private	__attribute__((noderef))
-extern void __chk_user_ptr(const volatile void __user *);
-extern void __chk_io_ptr(const volatile void __iomem *);
-# define ACCESS_PRIVATE(p, member) (*((typeof((p)->member) __force *) &(p)->member))
-#else /* __CHECKER__ */
-# ifdef STRUCTLEAK_PLUGIN
-#  define __user __attribute__((user))
-# else
-#  define __user
-# endif
-# define __kernel
-# define __safe
-# define __force
-# define __nocast
-# define __iomem
-# define __chk_user_ptr(x) (void)0
-# define __chk_io_ptr(x) (void)0
-# define __builtin_warning(x, y...) (1)
-# define __must_hold(x)
-# define __acquires(x)
-# define __releases(x)
-# define __acquire(x) (void)0
-# define __release(x) (void)0
-# define __cond_lock(x,c) (c)
-# define __percpu
-# define __rcu
-# define __private
-# define ACCESS_PRIVATE(p, member) ((p)->member)
-#endif /* __CHECKER__ */
-
-/* Indirect macros required for expanded argument pasting, eg. __LINE__. */
-#define ___PASTE(a,b) a##b
-#define __PASTE(a,b) ___PASTE(a,b)
+#ifndef __ASSEMBLY__
 
 #ifdef __KERNEL__
 
-#ifdef __GNUC__
-#include <linux/compiler-gcc.h>
-#endif
-
-#if defined(CC_USING_HOTPATCH) && !defined(__CHECKER__)
-#define notrace __attribute__((hotpatch(0,0)))
-#else
-#define notrace __attribute__((no_instrument_function))
-#endif
-
-/* Intel compiler defines __GNUC__. So we will overwrite implementations
- * coming from above header files here
- */
-#ifdef __INTEL_COMPILER
-# include <linux/compiler-intel.h>
-#endif
-
-/* Clang compiler defines __GNUC__. So we will overwrite implementations
- * coming from above header files here
- */
-#ifdef __clang__
-#include <linux/compiler-clang.h>
-#endif
-
-/*
- * Generic compiler-dependent macros required for kernel
- * build go below this comment. Actual compiler/compiler version
- * specific implementations come from the above header files
- */
-
-struct ftrace_branch_data {
-	const char *func;
-	const char *file;
-	unsigned line;
-	union {
-		struct {
-			unsigned long correct;
-			unsigned long incorrect;
-		};
-		struct {
-			unsigned long miss;
-			unsigned long hit;
-		};
-		unsigned long miss_hit[2];
-	};
-};
-
-struct ftrace_likely_data {
-	struct ftrace_branch_data	data;
-	unsigned long			constant;
-};
-
 /*
  * Note: DISABLE_BRANCH_PROFILING can be used by special lowlevel code
  * to disable branch tracing on a per file basis.
@@ -332,6 +233,7 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
  * with an explicit memory barrier or atomic instruction that provides the
  * required ordering.
  */
+#include <asm/barrier.h>
 
 #define __READ_ONCE(x, check)						\
 ({									\
@@ -362,167 +264,6 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
 
 #endif /* __ASSEMBLY__ */
 
-#ifdef __KERNEL__
-/*
- * Allow us to mark functions as 'deprecated' and have gcc emit a nice
- * warning for each use, in hopes of speeding the functions removal.
- * Usage is:
- * 		int __deprecated foo(void)
- */
-#ifndef __deprecated
-# define __deprecated		/* unimplemented */
-#endif
-
-#ifdef MODULE
-#define __deprecated_for_modules __deprecated
-#else
-#define __deprecated_for_modules
-#endif
-
-#ifndef __must_check
-#define __must_check
-#endif
-
-#ifndef CONFIG_ENABLE_MUST_CHECK
-#undef __must_check
-#define __must_check
-#endif
-#ifndef CONFIG_ENABLE_WARN_DEPRECATED
-#undef __deprecated
-#undef __deprecated_for_modules
-#define __deprecated
-#define __deprecated_for_modules
-#endif
-
-#ifndef __malloc
-#define __malloc
-#endif
-
-/*
- * Allow us to avoid 'defined but not used' warnings on functions and data,
- * as well as force them to be emitted to the assembly file.
- *
- * As of gcc 3.4, static functions that are not marked with attribute((used))
- * may be elided from the assembly file.  As of gcc 3.4, static data not so
- * marked will not be elided, but this may change in a future gcc version.
- *
- * NOTE: Because distributions shipped with a backported unit-at-a-time
- * compiler in gcc 3.3, we must define __used to be __attribute__((used))
- * for gcc >=3.3 instead of 3.4.
- *
- * In prior versions of gcc, such functions and data would be emitted, but
- * would be warned about except with attribute((unused)).
- *
- * Mark functions that are referenced only in inline assembly as __used so
- * the code is emitted even though it appears to be unreferenced.
- */
-#ifndef __used
-# define __used			/* unimplemented */
-#endif
-
-#ifndef __maybe_unused
-# define __maybe_unused		/* unimplemented */
-#endif
-
-#ifndef __always_unused
-# define __always_unused	/* unimplemented */
-#endif
-
-#ifndef noinline
-#define noinline
-#endif
-
-/*
- * Rather then using noinline to prevent stack consumption, use
- * noinline_for_stack instead.  For documentation reasons.
- */
-#define noinline_for_stack noinline
-
-#ifndef __always_inline
-#define __always_inline inline
-#endif
-
-#endif /* __KERNEL__ */
-
-/*
- * From the GCC manual:
- *
- * Many functions do not examine any values except their arguments,
- * and have no effects except the return value.  Basically this is
- * just slightly more strict class than the `pure' attribute above,
- * since function is not allowed to read global memory.
- *
- * Note that a function that has pointer arguments and examines the
- * data pointed to must _not_ be declared `const'.  Likewise, a
- * function that calls a non-`const' function usually must not be
- * `const'.  It does not make sense for a `const' function to return
- * `void'.
- */
-#ifndef __attribute_const__
-# define __attribute_const__	/* unimplemented */
-#endif
-
-#ifndef __designated_init
-# define __designated_init
-#endif
-
-#ifndef __latent_entropy
-# define __latent_entropy
-#endif
-
-#ifndef __randomize_layout
-# define __randomize_layout __designated_init
-#endif
-
-#ifndef __no_randomize_layout
-# define __no_randomize_layout
-#endif
-
-#ifndef randomized_struct_fields_start
-# define randomized_struct_fields_start
-# define randomized_struct_fields_end
-#endif
-
-/*
- * Tell gcc if a function is cold. The compiler will assume any path
- * directly leading to the call is unlikely.
- */
-
-#ifndef __cold
-#define __cold
-#endif
-
-/* Simple shorthand for a section definition */
-#ifndef __section
-# define __section(S) __attribute__ ((__section__(#S)))
-#endif
-
-#ifndef __visible
-#define __visible
-#endif
-
-#ifndef __nostackprotector
-# define __nostackprotector
-#endif
-
-/*
- * Assume alignment of return value.
- */
-#ifndef __assume_aligned
-#define __assume_aligned(a, ...)
-#endif
-
-
-/* Are two types/vars the same type (ignoring qualifiers)? */
-#ifndef __same_type
-# define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b))
-#endif
-
-/* Is this type a native word size -- useful for atomic operations */
-#ifndef __native_word
-# define __native_word(t) (sizeof(t) == sizeof(char) || sizeof(t) == sizeof(short) || sizeof(t) == sizeof(int) || sizeof(t) == sizeof(long))
-#endif
-
 /* Compile time object size, -1 for unknown */
 #ifndef __compiletime_object_size
 # define __compiletime_object_size(obj) -1
diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
new file mode 100644
index 000000000000..6b79a9bba9a7
--- /dev/null
+++ b/include/linux/compiler_types.h
@@ -0,0 +1,274 @@
+#ifndef __LINUX_COMPILER_TYPES_H
+#define __LINUX_COMPILER_TYPES_H
+
+#ifndef __ASSEMBLY__
+
+#ifdef __CHECKER__
+# define __user		__attribute__((noderef, address_space(1)))
+# define __kernel	__attribute__((address_space(0)))
+# define __safe		__attribute__((safe))
+# define __force	__attribute__((force))
+# define __nocast	__attribute__((nocast))
+# define __iomem	__attribute__((noderef, address_space(2)))
+# define __must_hold(x)	__attribute__((context(x,1,1)))
+# define __acquires(x)	__attribute__((context(x,0,1)))
+# define __releases(x)	__attribute__((context(x,1,0)))
+# define __acquire(x)	__context__(x,1)
+# define __release(x)	__context__(x,-1)
+# define __cond_lock(x,c)	((c) ? ({ __acquire(x); 1; }) : 0)
+# define __percpu	__attribute__((noderef, address_space(3)))
+# define __rcu		__attribute__((noderef, address_space(4)))
+# define __private	__attribute__((noderef))
+extern void __chk_user_ptr(const volatile void __user *);
+extern void __chk_io_ptr(const volatile void __iomem *);
+# define ACCESS_PRIVATE(p, member) (*((typeof((p)->member) __force *) &(p)->member))
+#else /* __CHECKER__ */
+# ifdef STRUCTLEAK_PLUGIN
+#  define __user __attribute__((user))
+# else
+#  define __user
+# endif
+# define __kernel
+# define __safe
+# define __force
+# define __nocast
+# define __iomem
+# define __chk_user_ptr(x) (void)0
+# define __chk_io_ptr(x) (void)0
+# define __builtin_warning(x, y...) (1)
+# define __must_hold(x)
+# define __acquires(x)
+# define __releases(x)
+# define __acquire(x) (void)0
+# define __release(x) (void)0
+# define __cond_lock(x,c) (c)
+# define __percpu
+# define __rcu
+# define __private
+# define ACCESS_PRIVATE(p, member) ((p)->member)
+#endif /* __CHECKER__ */
+
+/* Indirect macros required for expanded argument pasting, eg. __LINE__. */
+#define ___PASTE(a,b) a##b
+#define __PASTE(a,b) ___PASTE(a,b)
+
+#ifdef __KERNEL__
+
+#ifdef __GNUC__
+#include <linux/compiler-gcc.h>
+#endif
+
+#if defined(CC_USING_HOTPATCH) && !defined(__CHECKER__)
+#define notrace __attribute__((hotpatch(0,0)))
+#else
+#define notrace __attribute__((no_instrument_function))
+#endif
+
+/* Intel compiler defines __GNUC__. So we will overwrite implementations
+ * coming from above header files here
+ */
+#ifdef __INTEL_COMPILER
+# include <linux/compiler-intel.h>
+#endif
+
+/* Clang compiler defines __GNUC__. So we will overwrite implementations
+ * coming from above header files here
+ */
+#ifdef __clang__
+#include <linux/compiler-clang.h>
+#endif
+
+/*
+ * Generic compiler-dependent macros required for kernel
+ * build go below this comment. Actual compiler/compiler version
+ * specific implementations come from the above header files
+ */
+
+struct ftrace_branch_data {
+	const char *func;
+	const char *file;
+	unsigned line;
+	union {
+		struct {
+			unsigned long correct;
+			unsigned long incorrect;
+		};
+		struct {
+			unsigned long miss;
+			unsigned long hit;
+		};
+		unsigned long miss_hit[2];
+	};
+};
+
+struct ftrace_likely_data {
+	struct ftrace_branch_data	data;
+	unsigned long			constant;
+};
+
+#endif /* __KERNEL__ */
+
+#endif /* __ASSEMBLY__ */
+
+#ifdef __KERNEL__
+/*
+ * Allow us to mark functions as 'deprecated' and have gcc emit a nice
+ * warning for each use, in hopes of speeding the functions removal.
+ * Usage is:
+ * 		int __deprecated foo(void)
+ */
+#ifndef __deprecated
+# define __deprecated		/* unimplemented */
+#endif
+
+#ifdef MODULE
+#define __deprecated_for_modules __deprecated
+#else
+#define __deprecated_for_modules
+#endif
+
+#ifndef __must_check
+#define __must_check
+#endif
+
+#ifndef CONFIG_ENABLE_MUST_CHECK
+#undef __must_check
+#define __must_check
+#endif
+#ifndef CONFIG_ENABLE_WARN_DEPRECATED
+#undef __deprecated
+#undef __deprecated_for_modules
+#define __deprecated
+#define __deprecated_for_modules
+#endif
+
+#ifndef __malloc
+#define __malloc
+#endif
+
+/*
+ * Allow us to avoid 'defined but not used' warnings on functions and data,
+ * as well as force them to be emitted to the assembly file.
+ *
+ * As of gcc 3.4, static functions that are not marked with attribute((used))
+ * may be elided from the assembly file.  As of gcc 3.4, static data not so
+ * marked will not be elided, but this may change in a future gcc version.
+ *
+ * NOTE: Because distributions shipped with a backported unit-at-a-time
+ * compiler in gcc 3.3, we must define __used to be __attribute__((used))
+ * for gcc >=3.3 instead of 3.4.
+ *
+ * In prior versions of gcc, such functions and data would be emitted, but
+ * would be warned about except with attribute((unused)).
+ *
+ * Mark functions that are referenced only in inline assembly as __used so
+ * the code is emitted even though it appears to be unreferenced.
+ */
+#ifndef __used
+# define __used			/* unimplemented */
+#endif
+
+#ifndef __maybe_unused
+# define __maybe_unused		/* unimplemented */
+#endif
+
+#ifndef __always_unused
+# define __always_unused	/* unimplemented */
+#endif
+
+#ifndef noinline
+#define noinline
+#endif
+
+/*
+ * Rather than using noinline to prevent stack consumption, use
+ * noinline_for_stack instead.  For documentation reasons.
+ */
+#define noinline_for_stack noinline
+
+#ifndef __always_inline
+#define __always_inline inline
+#endif
+
+#endif /* __KERNEL__ */
+
+/*
+ * From the GCC manual:
+ *
+ * Many functions do not examine any values except their arguments,
+ * and have no effects except the return value.  Basically this is
+ * just slightly more strict class than the `pure' attribute above,
+ * since function is not allowed to read global memory.
+ *
+ * Note that a function that has pointer arguments and examines the
+ * data pointed to must _not_ be declared `const'.  Likewise, a
+ * function that calls a non-`const' function usually must not be
+ * `const'.  It does not make sense for a `const' function to return
+ * `void'.
+ */
+#ifndef __attribute_const__
+# define __attribute_const__	/* unimplemented */
+#endif
+
+#ifndef __designated_init
+# define __designated_init
+#endif
+
+#ifndef __latent_entropy
+# define __latent_entropy
+#endif
+
+#ifndef __randomize_layout
+# define __randomize_layout __designated_init
+#endif
+
+#ifndef __no_randomize_layout
+# define __no_randomize_layout
+#endif
+
+#ifndef randomized_struct_fields_start
+# define randomized_struct_fields_start
+# define randomized_struct_fields_end
+#endif
+
+/*
+ * Tell gcc if a function is cold. The compiler will assume any path
+ * directly leading to the call is unlikely.
+ */
+
+#ifndef __cold
+#define __cold
+#endif
+
+/* Simple shorthand for a section definition */
+#ifndef __section
+# define __section(S) __attribute__ ((__section__(#S)))
+#endif
+
+#ifndef __visible
+#define __visible
+#endif
+
+#ifndef __nostackprotector
+# define __nostackprotector
+#endif
+
+/*
+ * Assume alignment of return value.
+ */
+#ifndef __assume_aligned
+#define __assume_aligned(a, ...)
+#endif
+
+
+/* Are two types/vars the same type (ignoring qualifiers)? */
+#ifndef __same_type
+# define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b))
+#endif
+
+/* Is this type a native word size -- useful for atomic operations */
+#ifndef __native_word
+# define __native_word(t) (sizeof(t) == sizeof(char) || sizeof(t) == sizeof(short) || sizeof(t) == sizeof(int) || sizeof(t) == sizeof(long))
+#endif
+
+#endif /* __LINUX_COMPILER_TYPES_H */
diff --git a/include/linux/linkage.h b/include/linux/linkage.h
index a6a42dd02466..ebd61b80fed4 100644
--- a/include/linux/linkage.h
+++ b/include/linux/linkage.h
@@ -1,7 +1,7 @@
 #ifndef _LINUX_LINKAGE_H
 #define _LINUX_LINKAGE_H
 
-#include <linux/compiler.h>
+#include <linux/compiler_types.h>
 #include <linux/stringify.h>
 #include <linux/export.h>
 #include <asm/linkage.h>
diff --git a/include/uapi/linux/stddef.h b/include/uapi/linux/stddef.h
index 621fa8ac4425..d1f7cb732dfc 100644
--- a/include/uapi/linux/stddef.h
+++ b/include/uapi/linux/stddef.h
@@ -1,4 +1,4 @@
-#include <linux/compiler.h>
+#include <linux/compiler_types.h>
 
 #ifndef __always_inline
 #define __always_inline inline
diff --git a/scripts/headers_install.sh b/scripts/headers_install.sh
index fdebd66f8fc1..63b8cc26456a 100755
--- a/scripts/headers_install.sh
+++ b/scripts/headers_install.sh
@@ -33,7 +33,7 @@ do
 	sed -r \
 		-e 's/([ \t(])(__user|__force|__iomem)[ \t]/\1/g' \
 		-e 's/__attribute_const__([ \t]|$)/\1/g' \
-		-e 's@^#include <linux/compiler.h>@@' \
+		-e 's@^#include <linux/compiler(|_types).h>@@' \
 		-e 's/(^|[^a-zA-Z0-9])__packed([^a-zA-Z0-9_]|$)/\1__attribute__((packed))\2/g' \
 		-e 's/(^|[ \t(])(inline|asm|volatile)([ \t(]|$)/\1__\2__\3/g' \
 		-e 's@#(ifndef|define|endif[ \t]*/[*])[ \t]*_UAPI@#\1 @' \
-- 
cgit v1.2.3


From 276b738deb5bf856b9f6049fcd92a967f52643d7 Mon Sep 17 00:00:00 2001
From: Christian König <christian.koenig@amd.com>
Date: Tue, 24 Oct 2017 14:40:20 -0500
Subject: PCI: Add resizable BAR infrastructure
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add resizable BAR infrastructure, including defines and helper functions to
read the possible sizes of a BAR and update its size.  See PCIe r3.1, sec
7.22.

Link: https://pcisig.com/sites/default/files/specification_documents/ECN_Resizable-BAR_24Apr2008.pdf
Signed-off-by: Christian König <christian.koenig@amd.com>
[bhelgaas: rename to functions with "rebar" (to match #defines), drop shift
#defines, drop "_MASK" suffixes, fix typos, fix kerneldoc]
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Andy Shevchenko <andy.shevchenko@gmail.com>
---
 drivers/pci/pci.c             | 101 ++++++++++++++++++++++++++++++++++++++++++
 drivers/pci/pci.h             |   8 ++++
 include/uapi/linux/pci_regs.h |   8 +++-
 3 files changed, 115 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 6078dfc11b11..832b96756e83 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -2965,6 +2965,107 @@ bool pci_acs_path_enabled(struct pci_dev *start,
 	return true;
 }
 
+/**
+ * pci_rebar_find_pos - find position of resize ctrl reg for BAR
+ * @pdev: PCI device
+ * @bar: BAR to find
+ *
+ * Helper to find the position of the ctrl register for a BAR.
+ * Returns -ENOTSUPP if resizable BARs are not supported at all.
+ * Returns -ENOENT if no ctrl register for the BAR could be found.
+ */
+static int pci_rebar_find_pos(struct pci_dev *pdev, int bar)
+{
+	unsigned int pos, nbars, i;
+	u32 ctrl;
+
+	pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_REBAR);
+	if (!pos)
+		return -ENOTSUPP;
+
+	pci_read_config_dword(pdev, pos + PCI_REBAR_CTRL, &ctrl);
+	nbars = (ctrl & PCI_REBAR_CTRL_NBAR_MASK) >>
+		    PCI_REBAR_CTRL_NBAR_SHIFT;
+
+	for (i = 0; i < nbars; i++, pos += 8) {
+		int bar_idx;
+
+		pci_read_config_dword(pdev, pos + PCI_REBAR_CTRL, &ctrl);
+		bar_idx = ctrl & PCI_REBAR_CTRL_BAR_IDX;
+		if (bar_idx == bar)
+			return pos;
+	}
+
+	return -ENOENT;
+}
+
+/**
+ * pci_rebar_get_possible_sizes - get possible sizes for BAR
+ * @pdev: PCI device
+ * @bar: BAR to query
+ *
+ * Get the possible sizes of a resizable BAR as bitmask defined in the spec
+ * (bit 0=1MB, bit 19=512GB). Returns 0 if BAR isn't resizable.
+ */
+u32 pci_rebar_get_possible_sizes(struct pci_dev *pdev, int bar)
+{
+	int pos;
+	u32 cap;
+
+	pos = pci_rebar_find_pos(pdev, bar);
+	if (pos < 0)
+		return 0;
+
+	pci_read_config_dword(pdev, pos + PCI_REBAR_CAP, &cap);
+	return (cap & PCI_REBAR_CAP_SIZES) >> 4;
+}
+
+/**
+ * pci_rebar_get_current_size - get the current size of a BAR
+ * @pdev: PCI device
+ * @bar: BAR to query
+ *
+ * Read the size of a BAR from the resizable BAR config.
+ * Returns size if found or negative error code.
+ */
+int pci_rebar_get_current_size(struct pci_dev *pdev, int bar)
+{
+	int pos;
+	u32 ctrl;
+
+	pos = pci_rebar_find_pos(pdev, bar);
+	if (pos < 0)
+		return pos;
+
+	pci_read_config_dword(pdev, pos + PCI_REBAR_CTRL, &ctrl);
+	return (ctrl & PCI_REBAR_CTRL_BAR_SIZE) >> 8;
+}
+
+/**
+ * pci_rebar_set_size - set a new size for a BAR
+ * @pdev: PCI device
+ * @bar: BAR to set size to
+ * @size: new size as defined in the spec (0=1MB, 19=512GB)
+ *
+ * Set the new size of a BAR as defined in the spec.
+ * Returns zero if resizing was successful, error code otherwise.
+ */
+int pci_rebar_set_size(struct pci_dev *pdev, int bar, int size)
+{
+	int pos;
+	u32 ctrl;
+
+	pos = pci_rebar_find_pos(pdev, bar);
+	if (pos < 0)
+		return pos;
+
+	pci_read_config_dword(pdev, pos + PCI_REBAR_CTRL, &ctrl);
+	ctrl &= ~PCI_REBAR_CTRL_BAR_SIZE;
+	ctrl |= size << 8;
+	pci_write_config_dword(pdev, pos + PCI_REBAR_CTRL, ctrl);
+	return 0;
+}
+
 /**
  * pci_swizzle_interrupt_pin - swizzle INTx for device behind bridge
  * @dev: the PCI device
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index a6560c9baa52..33469a33738d 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -366,4 +366,12 @@ int acpi_get_rc_resources(struct device *dev, const char *hid, u16 segment,
 			  struct resource *res);
 #endif
 
+u32 pci_rebar_get_possible_sizes(struct pci_dev *pdev, int bar);
+int pci_rebar_get_current_size(struct pci_dev *pdev, int bar);
+int pci_rebar_set_size(struct pci_dev *pdev, int bar, int size);
+static inline u64 pci_rebar_size_to_bytes(int size)
+{
+	return 1ULL << (size + 20);
+}
+
 #endif /* DRIVERS_PCI_H */
diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
index f8d58045926f..d34000a59f24 100644
--- a/include/uapi/linux/pci_regs.h
+++ b/include/uapi/linux/pci_regs.h
@@ -939,9 +939,13 @@
 #define PCI_SATA_SIZEOF_LONG	16
 
 /* Resizable BARs */
+#define PCI_REBAR_CAP		4	/* capability register */
+#define  PCI_REBAR_CAP_SIZES		0x00FFFFF0  /* supported BAR sizes */
 #define PCI_REBAR_CTRL		8	/* control register */
-#define  PCI_REBAR_CTRL_NBAR_MASK	(7 << 5)	/* mask for # bars */
-#define  PCI_REBAR_CTRL_NBAR_SHIFT	5	/* shift for # bars */
+#define  PCI_REBAR_CTRL_BAR_IDX		0x00000007  /* BAR index */
+#define  PCI_REBAR_CTRL_NBAR_MASK	0x000000E0  /* # of resizable BARs */
+#define  PCI_REBAR_CTRL_NBAR_SHIFT	5  	    /* shift for # of BARs */
+#define  PCI_REBAR_CTRL_BAR_SIZE	0x00001F00  /* BAR size */
 
 /* Dynamic Power Allocation */
 #define PCI_DPA_CAP		4	/* capability register */
-- 
cgit v1.2.3


From 908d140a87a794bf89717ceae54aba5ce86c52e4 Mon Sep 17 00:00:00 2001
From: Shmulik Ladkani <shmulik.ladkani@gmail.com>
Date: Sat, 21 Oct 2017 00:25:15 +0300
Subject: ip6_tunnel: Allow rcv/xmit even if remote address is a local address

Currently, ip6_tnl_xmit_ctl drops tunneled packets if the remote
address (outer v6 destination) is one of host's locally configured
addresses.
Same applies to ip6_tnl_rcv_ctl: it drops packets if the remote address
(outer v6 source) is a local address.

This prevents using ipxip6 (and ip6_gre) tunnels whose local/remote
endpoints are on same host; OTOH v4 tunnels (ipip or gre) allow such
configurations.

An example where this proves useful is a system where entities are
identified by their unique v6 addresses, and use tunnels to encapsulate
traffic between them. The limitation prevents placing several entities
on same host.

Introduce IP6_TNL_F_ALLOW_LOCAL_REMOTE, which allows bypassing this
restriction.

Signed-off-by: Shmulik Ladkani <shmulik.ladkani@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/ip6_tunnel.h | 2 ++
 net/ipv6/ip6_tunnel.c           | 6 ++++--
 2 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/ip6_tunnel.h b/include/uapi/linux/ip6_tunnel.h
index 425926c467d7..ffebbe365478 100644
--- a/include/uapi/linux/ip6_tunnel.h
+++ b/include/uapi/linux/ip6_tunnel.h
@@ -20,6 +20,8 @@
 #define IP6_TNL_F_RCV_DSCP_COPY 0x10
 /* copy fwmark from inner packet */
 #define IP6_TNL_F_USE_ORIG_FWMARK 0x20
+/* allow remote endpoint on the local node */
+#define IP6_TNL_F_ALLOW_LOCAL_REMOTE 0x40
 
 struct ip6_tnl_parm {
 	char name[IFNAMSIZ];	/* name of tunnel device */
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 4212879ff35e..439d65f7e094 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -770,7 +770,8 @@ int ip6_tnl_rcv_ctl(struct ip6_tnl *t,
 
 		if ((ipv6_addr_is_multicast(laddr) ||
 		     likely(ipv6_chk_addr(net, laddr, ldev, 0))) &&
-		    likely(!ipv6_chk_addr(net, raddr, NULL, 0)))
+		    ((p->flags & IP6_TNL_F_ALLOW_LOCAL_REMOTE) ||
+		     likely(!ipv6_chk_addr(net, raddr, NULL, 0))))
 			ret = 1;
 	}
 	return ret;
@@ -1000,7 +1001,8 @@ int ip6_tnl_xmit_ctl(struct ip6_tnl *t,
 		if (unlikely(!ipv6_chk_addr(net, laddr, ldev, 0)))
 			pr_warn("%s xmit: Local address not yet configured!\n",
 				p->name);
-		else if (!ipv6_addr_is_multicast(raddr) &&
+		else if (!(p->flags & IP6_TNL_F_ALLOW_LOCAL_REMOTE) &&
+			 !ipv6_addr_is_multicast(raddr) &&
 			 unlikely(ipv6_chk_addr(net, raddr, NULL, 0)))
 			pr_warn("%s xmit: Routing loop! Remote address found on this node!\n",
 				p->name);
-- 
cgit v1.2.3


From 585d763af09cc21daf48ecc873604ccdb70f6014 Mon Sep 17 00:00:00 2001
From: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Date: Mon, 16 Oct 2017 18:01:26 -0700
Subject: net/sched: Introduce Credit Based Shaper (CBS) qdisc

This queueing discipline implements the shaper algorithm defined by
the 802.1Q-2014 Section 8.6.8.2 and detailed in Annex L.

Its primary usage is to apply some bandwidth reservation to user
defined traffic classes, which are mapped to different queues via the
mqprio qdisc.

Only a simple software implementation is added for now.

Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Signed-off-by: Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com>
Tested-by: Henrik Austad <henrik@austad.us>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 include/uapi/linux/pkt_sched.h |  19 +++
 net/sched/Kconfig              |  11 ++
 net/sched/Makefile             |   1 +
 net/sched/sch_cbs.c            | 293 +++++++++++++++++++++++++++++++++++++++++
 4 files changed, 324 insertions(+)
 create mode 100644 net/sched/sch_cbs.c

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index e7cc3d3c7421..0e88cc262ca0 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -904,4 +904,23 @@ struct tc_pie_xstats {
 	__u32 maxq;             /* maximum queue size */
 	__u32 ecn_mark;         /* packets marked with ecn*/
 };
+
+/* CBS */
+struct tc_cbs_qopt {
+	__u8 offload;
+	__u8 _pad[3];
+	__s32 hicredit;
+	__s32 locredit;
+	__s32 idleslope;
+	__s32 sendslope;
+};
+
+enum {
+	TCA_CBS_UNSPEC,
+	TCA_CBS_PARMS,
+	__TCA_CBS_MAX,
+};
+
+#define TCA_CBS_MAX (__TCA_CBS_MAX - 1)
+
 #endif
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index e70ed26485a2..c03d86a7775e 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -172,6 +172,17 @@ config NET_SCH_TBF
 	  To compile this code as a module, choose M here: the
 	  module will be called sch_tbf.
 
+config NET_SCH_CBS
+	tristate "Credit Based Shaper (CBS)"
+	---help---
+	  Say Y here if you want to use the Credit Based Shaper (CBS) packet
+	  scheduling algorithm.
+
+	  See the top of <file:net/sched/sch_cbs.c> for more details.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called sch_cbs.
+
 config NET_SCH_GRED
 	tristate "Generic Random Early Detection (GRED)"
 	---help---
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 7b915d226de7..80c8f92d162d 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -52,6 +52,7 @@ obj-$(CONFIG_NET_SCH_FQ_CODEL)	+= sch_fq_codel.o
 obj-$(CONFIG_NET_SCH_FQ)	+= sch_fq.o
 obj-$(CONFIG_NET_SCH_HHF)	+= sch_hhf.o
 obj-$(CONFIG_NET_SCH_PIE)	+= sch_pie.o
+obj-$(CONFIG_NET_SCH_CBS)	+= sch_cbs.o
 
 obj-$(CONFIG_NET_CLS_U32)	+= cls_u32.o
 obj-$(CONFIG_NET_CLS_ROUTE4)	+= cls_route.o
diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c
new file mode 100644
index 000000000000..0e85133c5653
--- /dev/null
+++ b/net/sched/sch_cbs.c
@@ -0,0 +1,293 @@
+/*
+ * net/sched/sch_cbs.c	Credit Based Shaper
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Vinicius Costa Gomes <vinicius.gomes@intel.com>
+ *
+ */
+
+/* Credit Based Shaper (CBS)
+ * =========================
+ *
+ * This is a simple rate-limiting shaper aimed at TSN applications on
+ * systems with known traffic workloads.
+ *
+ * Its algorithm is defined by the IEEE 802.1Q-2014 Specification,
+ * Section 8.6.8.2, and explained in more detail in the Annex L of the
+ * same specification.
+ *
+ * There are four tunables to be considered:
+ *
+ *	'idleslope': Idleslope is the rate of credits that is
+ *	accumulated (in kilobits per second) when there is at least
+ *	one packet waiting for transmission. Packets are transmitted
+ *	when the current value of credits is equal or greater than
+ *	zero. When there is no packet to be transmitted the amount of
+ *	credits is set to zero. This is the main tunable of the CBS
+ *	algorithm.
+ *
+ *	'sendslope':
+ *	Sendslope is the rate of credits that is depleted (it should be a
+ *	negative number of kilobits per second) when a transmission is
+ *	occurring. It can be calculated as follows, (IEEE 802.1Q-2014 Section
+ *	8.6.8.2 item g):
+ *
+ *	sendslope = idleslope - port_transmit_rate
+ *
+ *	'hicredit': Hicredit defines the maximum amount of credits (in
+ *	bytes) that can be accumulated. Hicredit depends on the
+ *	characteristics of interfering traffic,
+ *	'max_interference_size' is the maximum size of any burst of
+ *	traffic that can delay the transmission of a frame that is
+ *	available for transmission for this traffic class, (IEEE
+ *	802.1Q-2014 Annex L, Equation L-3):
+ *
+ *	hicredit = max_interference_size * (idleslope / port_transmit_rate)
+ *
+ *	'locredit': Locredit is the minimum amount of credits that can
+ *	be reached. It is a function of the traffic flowing through
+ *	this qdisc (IEEE 802.1Q-2014 Annex L, Equation L-2):
+ *
+ *	locredit = max_frame_size * (sendslope / port_transmit_rate)
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <net/netlink.h>
+#include <net/sch_generic.h>
+#include <net/pkt_sched.h>
+
+#define BYTES_PER_KBIT (1000LL / 8)
+
+struct cbs_sched_data {
+	s64 port_rate; /* in bytes/s */
+	s64 last; /* timestamp in ns */
+	s64 credits; /* in bytes */
+	s32 locredit; /* in bytes */
+	s32 hicredit; /* in bytes */
+	s64 sendslope; /* in bytes/s */
+	s64 idleslope; /* in bytes/s */
+	struct qdisc_watchdog watchdog;
+	int (*enqueue)(struct sk_buff *skb, struct Qdisc *sch);
+	struct sk_buff *(*dequeue)(struct Qdisc *sch);
+};
+
+static int cbs_enqueue_soft(struct sk_buff *skb, struct Qdisc *sch)
+{
+	struct cbs_sched_data *q = qdisc_priv(sch);
+
+	if (sch->q.qlen == 0 && q->credits > 0) {
+		/* We need to stop accumulating credits when there's
+		 * no enqueued packets and q->credits is positive.
+		 */
+		q->credits = 0;
+		q->last = ktime_get_ns();
+	}
+
+	return qdisc_enqueue_tail(skb, sch);
+}
+
+static int cbs_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+		       struct sk_buff **to_free)
+{
+	struct cbs_sched_data *q = qdisc_priv(sch);
+
+	return q->enqueue(skb, sch);
+}
+
+/* timediff is in ns, slope is in bytes/s */
+static s64 timediff_to_credits(s64 timediff, s64 slope)
+{
+	return div64_s64(timediff * slope, NSEC_PER_SEC);
+}
+
+static s64 delay_from_credits(s64 credits, s64 slope)
+{
+	if (unlikely(slope == 0))
+		return S64_MAX;
+
+	return div64_s64(-credits * NSEC_PER_SEC, slope);
+}
+
+static s64 credits_from_len(unsigned int len, s64 slope, s64 port_rate)
+{
+	if (unlikely(port_rate == 0))
+		return S64_MAX;
+
+	return div64_s64(len * slope, port_rate);
+}
+
+static struct sk_buff *cbs_dequeue_soft(struct Qdisc *sch)
+{
+	struct cbs_sched_data *q = qdisc_priv(sch);
+	s64 now = ktime_get_ns();
+	struct sk_buff *skb;
+	s64 credits;
+	int len;
+
+	if (q->credits < 0) {
+		credits = timediff_to_credits(now - q->last, q->idleslope);
+
+		credits = q->credits + credits;
+		q->credits = min_t(s64, credits, q->hicredit);
+
+		if (q->credits < 0) {
+			s64 delay;
+
+			delay = delay_from_credits(q->credits, q->idleslope);
+			qdisc_watchdog_schedule_ns(&q->watchdog, now + delay);
+
+			q->last = now;
+
+			return NULL;
+		}
+	}
+
+	skb = qdisc_dequeue_head(sch);
+	if (!skb)
+		return NULL;
+
+	len = qdisc_pkt_len(skb);
+
+	/* As sendslope is a negative number, this will decrease the
+	 * amount of q->credits.
+	 */
+	credits = credits_from_len(len, q->sendslope, q->port_rate);
+	credits += q->credits;
+
+	q->credits = max_t(s64, credits, q->locredit);
+	q->last = now;
+
+	return skb;
+}
+
+static struct sk_buff *cbs_dequeue(struct Qdisc *sch)
+{
+	struct cbs_sched_data *q = qdisc_priv(sch);
+
+	return q->dequeue(sch);
+}
+
+static const struct nla_policy cbs_policy[TCA_CBS_MAX + 1] = {
+	[TCA_CBS_PARMS]	= { .len = sizeof(struct tc_cbs_qopt) },
+};
+
+static int cbs_change(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct cbs_sched_data *q = qdisc_priv(sch);
+	struct net_device *dev = qdisc_dev(sch);
+	struct nlattr *tb[TCA_CBS_MAX + 1];
+	struct ethtool_link_ksettings ecmd;
+	struct tc_cbs_qopt *qopt;
+	s64 link_speed;
+	int err;
+
+	err = nla_parse_nested(tb, TCA_CBS_MAX, opt, cbs_policy, NULL);
+	if (err < 0)
+		return err;
+
+	if (!tb[TCA_CBS_PARMS])
+		return -EINVAL;
+
+	qopt = nla_data(tb[TCA_CBS_PARMS]);
+
+	if (qopt->offload)
+		return -EOPNOTSUPP;
+
+	if (!__ethtool_get_link_ksettings(dev, &ecmd))
+		link_speed = ecmd.base.speed;
+	else
+		link_speed = SPEED_1000;
+
+	q->port_rate = link_speed * 1000 * BYTES_PER_KBIT;
+
+	q->enqueue = cbs_enqueue_soft;
+	q->dequeue = cbs_dequeue_soft;
+
+	q->hicredit = qopt->hicredit;
+	q->locredit = qopt->locredit;
+	q->idleslope = qopt->idleslope * BYTES_PER_KBIT;
+	q->sendslope = qopt->sendslope * BYTES_PER_KBIT;
+
+	return 0;
+}
+
+static int cbs_init(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct cbs_sched_data *q = qdisc_priv(sch);
+
+	if (!opt)
+		return -EINVAL;
+
+	qdisc_watchdog_init(&q->watchdog, sch);
+
+	return cbs_change(sch, opt);
+}
+
+static void cbs_destroy(struct Qdisc *sch)
+{
+	struct cbs_sched_data *q = qdisc_priv(sch);
+
+	qdisc_watchdog_cancel(&q->watchdog);
+}
+
+static int cbs_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct cbs_sched_data *q = qdisc_priv(sch);
+	struct tc_cbs_qopt opt = { };
+	struct nlattr *nest;
+
+	nest = nla_nest_start(skb, TCA_OPTIONS);
+	if (!nest)
+		goto nla_put_failure;
+
+	opt.hicredit = q->hicredit;
+	opt.locredit = q->locredit;
+	opt.sendslope = div64_s64(q->sendslope, BYTES_PER_KBIT);
+	opt.idleslope = div64_s64(q->idleslope, BYTES_PER_KBIT);
+	opt.offload = 0;
+
+	if (nla_put(skb, TCA_CBS_PARMS, sizeof(opt), &opt))
+		goto nla_put_failure;
+
+	return nla_nest_end(skb, nest);
+
+nla_put_failure:
+	nla_nest_cancel(skb, nest);
+	return -1;
+}
+
+static struct Qdisc_ops cbs_qdisc_ops __read_mostly = {
+	.id		=	"cbs",
+	.priv_size	=	sizeof(struct cbs_sched_data),
+	.enqueue	=	cbs_enqueue,
+	.dequeue	=	cbs_dequeue,
+	.peek		=	qdisc_peek_dequeued,
+	.init		=	cbs_init,
+	.reset		=	qdisc_reset_queue,
+	.destroy	=	cbs_destroy,
+	.change		=	cbs_change,
+	.dump		=	cbs_dump,
+	.owner		=	THIS_MODULE,
+};
+
+static int __init cbs_module_init(void)
+{
+	return register_qdisc(&cbs_qdisc_ops);
+}
+
+static void __exit cbs_module_exit(void)
+{
+	unregister_qdisc(&cbs_qdisc_ops);
+}
+module_init(cbs_module_init)
+module_exit(cbs_module_exit)
+MODULE_LICENSE("GPL");
-- 
cgit v1.2.3


From 7e86a365a8319970e002f83c73701a86d95a69e6 Mon Sep 17 00:00:00 2001
From: Oded Gabbay <oded.gabbay@amd.com>
Date: Fri, 27 Oct 2017 19:35:30 -0400
Subject: drm/amdkfd: increase limit of signal events to 4096 per process

Signed-off-by: Oded Gabbay <oded.gabbay@amd.com>
Reviewed-by: Ben Goz <ben.goz@amd.com>
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
---
 include/uapi/linux/kfd_ioctl.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index 26283fefdf5f..731d0df722e3 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -169,7 +169,7 @@ struct kfd_ioctl_dbg_wave_control_args {
 #define KFD_IOC_WAIT_RESULT_TIMEOUT		1
 #define KFD_IOC_WAIT_RESULT_FAIL		2
 
-#define KFD_SIGNAL_EVENT_LIMIT			256
+#define KFD_SIGNAL_EVENT_LIMIT			4096
 
 struct kfd_ioctl_create_event_args {
 	__u64 event_page_offset;	/* from KFD */
-- 
cgit v1.2.3


From 2ea2352ede9d97585164a7e19224955f4e4ca8db Mon Sep 17 00:00:00 2001
From: Wei Wang <weiwan@google.com>
Date: Fri, 27 Oct 2017 17:30:12 -0700
Subject: ipv6: prevent user from adding cached routes

Cached routes should only be created by the system when receiving pmtu
discovery or ip redirect msg. Users should not be allowed to create
cached routes.

Furthermore, after the patch series to move cached routes into exception
table, user added cached routes will trigger the following warning in
fib6_add():

WARNING: CPU: 0 PID: 2985 at net/ipv6/ip6_fib.c:1137
fib6_add+0x20d9/0x2c10 net/ipv6/ip6_fib.c:1137
Kernel panic - not syncing: panic_on_warn set ...

CPU: 0 PID: 2985 Comm: syzkaller320388 Not tainted 4.14.0-rc3+ #74
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
 __dump_stack lib/dump_stack.c:16 [inline]
 dump_stack+0x194/0x257 lib/dump_stack.c:52
 panic+0x1e4/0x417 kernel/panic.c:181
 __warn+0x1c4/0x1d9 kernel/panic.c:542
 report_bug+0x211/0x2d0 lib/bug.c:183
 fixup_bug+0x40/0x90 arch/x86/kernel/traps.c:178
 do_trap_no_signal arch/x86/kernel/traps.c:212 [inline]
 do_trap+0x260/0x390 arch/x86/kernel/traps.c:261
 do_error_trap+0x120/0x390 arch/x86/kernel/traps.c:298
 do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:311
 invalid_op+0x18/0x20 arch/x86/entry/entry_64.S:905
RIP: 0010:fib6_add+0x20d9/0x2c10 net/ipv6/ip6_fib.c:1137
RSP: 0018:ffff8801cf09f6a0 EFLAGS: 00010297
RAX: ffff8801ce45e340 RBX: 1ffff10039e13eec RCX: ffff8801d749c814
RDX: 0000000000000000 RSI: ffff8801d749c700 RDI: ffff8801d749c780
RBP: ffff8801cf09fa08 R08: 0000000000000000 R09: ffff8801cf09f360
R10: ffff8801cf09f2d8 R11: 1ffff10039c8befb R12: 0000000000000001
R13: dffffc0000000000 R14: ffff8801d749c700 R15: ffffffff860655c0
 __ip6_ins_rt+0x6c/0x90 net/ipv6/route.c:1011
 ip6_route_add+0x148/0x1a0 net/ipv6/route.c:2782
 ipv6_route_ioctl+0x4d5/0x690 net/ipv6/route.c:3291
 inet6_ioctl+0xef/0x1e0 net/ipv6/af_inet6.c:521
 sock_do_ioctl+0x65/0xb0 net/socket.c:961
 sock_ioctl+0x2c2/0x440 net/socket.c:1058
 vfs_ioctl fs/ioctl.c:45 [inline]
 do_vfs_ioctl+0x1b1/0x1530 fs/ioctl.c:685
 SYSC_ioctl fs/ioctl.c:700 [inline]
 SyS_ioctl+0x8f/0xc0 fs/ioctl.c:691
 entry_SYSCALL_64_fastpath+0x1f/0xbe

So we fix this by failing the attempt to add cached routes from userspace,
returning an EINVAL error.

Fixes: 2b760fcf5cfb ("ipv6: hook up exception table to store dst cache")
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/ipv6_route.h | 2 +-
 net/ipv6/route.c                | 6 ++++++
 2 files changed, 7 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/ipv6_route.h b/include/uapi/linux/ipv6_route.h
index d496c02e14bc..c15d8054905c 100644
--- a/include/uapi/linux/ipv6_route.h
+++ b/include/uapi/linux/ipv6_route.h
@@ -28,7 +28,7 @@
 
 #define RTF_ROUTEINFO	0x00800000	/* route information - RA	*/
 
-#define RTF_CACHE	0x01000000	/* cache entry			*/
+#define RTF_CACHE	0x01000000	/* read-only: can not be set by user */
 #define RTF_FLOW	0x02000000	/* flow significant route	*/
 #define RTF_POLICY	0x04000000	/* policy route			*/
 
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 605e5dc1c010..70d9659fc1e9 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2478,6 +2478,12 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
 		goto out;
 	}
 
+	/* RTF_CACHE is an internal flag; can not be set by userspace */
+	if (cfg->fc_flags & RTF_CACHE) {
+		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
+		goto out;
+	}
+
 	if (cfg->fc_dst_len > 128) {
 		NL_SET_ERR_MSG(extack, "Invalid prefix length");
 		goto out;
-- 
cgit v1.2.3


From a190d04db93710ae166749055b6985397c6d13f5 Mon Sep 17 00:00:00 2001
From: Mahesh Bandewar <maheshb@google.com>
Date: Thu, 26 Oct 2017 15:09:21 -0700
Subject: ipvlan: introduce 'private' attribute for all existing modes.

IPvlan has always operated in bridge mode. However there are scenarios
where each slave should be able to talk through the master device but
not necessarily across each other. Think of an environment where each
namespace is a private and independent customer. In this scenario
the machine which is hosting these namespaces neither wants to reveal who
their neighbors are, nor do the individual namespaces care to talk to a
neighbor over a short-circuited network path.

This patch implements the mode that is very similar to the 'private' mode
in macvlan where individual slaves can send and receive traffic through
the master device, just that they can not talk among slave devices.

Signed-off-by: Mahesh Bandewar <maheshb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ipvlan.txt | 30 ++++++++++++++++++++++++++---
 drivers/net/ipvlan/ipvlan.h         | 16 ++++++++++++++++
 drivers/net/ipvlan/ipvlan_core.c    | 15 ++++++++++++---
 drivers/net/ipvlan/ipvlan_main.c    | 38 +++++++++++++++++++++++++++++++++++--
 include/uapi/linux/if_link.h        |  3 +++
 5 files changed, 94 insertions(+), 8 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/networking/ipvlan.txt b/Documentation/networking/ipvlan.txt
index 1fe42a874aae..bfa91c77a4c9 100644
--- a/Documentation/networking/ipvlan.txt
+++ b/Documentation/networking/ipvlan.txt
@@ -22,9 +22,19 @@ The driver can be built into the kernel (CONFIG_IPVLAN=y) or as a module
 	There are no module parameters for this driver and it can be configured
 using IProute2/ip utility.
 
-	ip link add link <master-dev> name <slave-dev> type ipvlan mode { l2 | l3 | l3s }
+    ip link add link <master> name <slave> type ipvlan [ mode MODE ] [ FLAGS ]
+       where
+         MODE: l3 (default) | l3s | l2
+         FLAGS: bridge (default) | private
 
-	e.g. ip link add link eth0 name ipvl0 type ipvlan mode l2
+    e.g.
+    (a) Following will create IPvlan link with eth0 as master in
+        L3 bridge mode
+          bash# ip link add link eth0 name ipvl0 type ipvlan
+    (b) This command will create IPvlan link in L2 bridge mode.
+          bash# ip link add link eth0 name ipvl0 type ipvlan mode l2 bridge
+    (c) This command will create an IPvlan device in L2 private mode.
+          bash# ip link add link eth0 name ipvlan type ipvlan mode l2 private
 
 
 4. Operating modes:
@@ -54,7 +64,21 @@ works in this mode and hence it is L3-symmetric (L3s). This will have slightly l
 performance but that shouldn't matter since you are choosing this mode over plain-L3
 mode to make conn-tracking work.
 
-5. What to choose (macvlan vs. ipvlan)?
+5. Mode flags:
+	At this time following mode flags are available
+
+5.1 bridge:
+	This is the default option. To configure the IPvlan port in this mode,
+user can choose to either add this option on the command-line or don't specify
+anything. This is the traditional mode where slaves can cross-talk among
+themseleves apart from talking through the master device.
+
+5.2 private:
+	If this option is added to the command-line, the port is set in private
+mode. i.e. port wont allow cross communication between slaves.
+
+
+6. What to choose (macvlan vs. ipvlan)?
 	These two devices are very similar in many regards and the specific use
 case could very well define which device to choose. if one of the following
 situations defines your use case then you can choose to use ipvlan -
diff --git a/drivers/net/ipvlan/ipvlan.h b/drivers/net/ipvlan/ipvlan.h
index ba8173a0b62e..9941851bcc13 100644
--- a/drivers/net/ipvlan/ipvlan.h
+++ b/drivers/net/ipvlan/ipvlan.h
@@ -96,6 +96,7 @@ struct ipvl_port {
 	struct hlist_head	hlhead[IPVLAN_HASH_SIZE];
 	struct list_head	ipvlans;
 	u16			mode;
+	u16			flags;
 	u16			dev_id_start;
 	struct work_struct	wq;
 	struct sk_buff_head	backlog;
@@ -123,6 +124,21 @@ static inline struct ipvl_port *ipvlan_port_get_rtnl(const struct net_device *d)
 	return rtnl_dereference(d->rx_handler_data);
 }
 
+static inline bool ipvlan_is_private(const struct ipvl_port *port)
+{
+	return !!(port->flags & IPVLAN_F_PRIVATE);
+}
+
+static inline void ipvlan_mark_private(struct ipvl_port *port)
+{
+	port->flags |= IPVLAN_F_PRIVATE;
+}
+
+static inline void ipvlan_clear_private(struct ipvl_port *port)
+{
+	port->flags &= ~IPVLAN_F_PRIVATE;
+}
+
 void ipvlan_init_secret(void);
 unsigned int ipvlan_mac_hash(const unsigned char *addr);
 rx_handler_result_t ipvlan_handle_frame(struct sk_buff **pskb);
diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c
index 1f3295e274d0..72fd56de9c00 100644
--- a/drivers/net/ipvlan/ipvlan_core.c
+++ b/drivers/net/ipvlan/ipvlan_core.c
@@ -515,9 +515,13 @@ static int ipvlan_xmit_mode_l3(struct sk_buff *skb, struct net_device *dev)
 		goto out;
 
 	addr = ipvlan_addr_lookup(ipvlan->port, lyr3h, addr_type, true);
-	if (addr)
+	if (addr) {
+		if (ipvlan_is_private(ipvlan->port)) {
+			consume_skb(skb);
+			return NET_XMIT_DROP;
+		}
 		return ipvlan_rcv_frame(addr, &skb, true);
-
+	}
 out:
 	ipvlan_skb_crossing_ns(skb, ipvlan->phy_dev);
 	return ipvlan_process_outbound(skb);
@@ -535,8 +539,13 @@ static int ipvlan_xmit_mode_l2(struct sk_buff *skb, struct net_device *dev)
 		lyr3h = ipvlan_get_L3_hdr(skb, &addr_type);
 		if (lyr3h) {
 			addr = ipvlan_addr_lookup(ipvlan->port, lyr3h, addr_type, true);
-			if (addr)
+			if (addr) {
+				if (ipvlan_is_private(ipvlan->port)) {
+					consume_skb(skb);
+					return NET_XMIT_DROP;
+				}
 				return ipvlan_rcv_frame(addr, &skb, true);
+			}
 		}
 		skb = skb_share_check(skb, GFP_ATOMIC);
 		if (!skb)
diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c
index f0ab55df57f1..4368afb1934c 100644
--- a/drivers/net/ipvlan/ipvlan_main.c
+++ b/drivers/net/ipvlan/ipvlan_main.c
@@ -462,11 +462,24 @@ static int ipvlan_nl_changelink(struct net_device *dev,
 	struct ipvl_port *port = ipvlan_port_get_rtnl(ipvlan->phy_dev);
 	int err = 0;
 
-	if (data && data[IFLA_IPVLAN_MODE]) {
+	if (!data)
+		return 0;
+
+	if (data[IFLA_IPVLAN_MODE]) {
 		u16 nmode = nla_get_u16(data[IFLA_IPVLAN_MODE]);
 
 		err = ipvlan_set_port_mode(port, nmode);
 	}
+
+	if (!err && data[IFLA_IPVLAN_FLAGS]) {
+		u16 flags = nla_get_u16(data[IFLA_IPVLAN_FLAGS]);
+
+		if (flags & IPVLAN_F_PRIVATE)
+			ipvlan_mark_private(port);
+		else
+			ipvlan_clear_private(port);
+	}
+
 	return err;
 }
 
@@ -474,18 +487,30 @@ static size_t ipvlan_nl_getsize(const struct net_device *dev)
 {
 	return (0
 		+ nla_total_size(2) /* IFLA_IPVLAN_MODE */
+		+ nla_total_size(2) /* IFLA_IPVLAN_FLAGS */
 		);
 }
 
 static int ipvlan_nl_validate(struct nlattr *tb[], struct nlattr *data[],
 			      struct netlink_ext_ack *extack)
 {
-	if (data && data[IFLA_IPVLAN_MODE]) {
+	if (!data)
+		return 0;
+
+	if (data[IFLA_IPVLAN_MODE]) {
 		u16 mode = nla_get_u16(data[IFLA_IPVLAN_MODE]);
 
 		if (mode < IPVLAN_MODE_L2 || mode >= IPVLAN_MODE_MAX)
 			return -EINVAL;
 	}
+	if (data[IFLA_IPVLAN_FLAGS]) {
+		u16 flags = nla_get_u16(data[IFLA_IPVLAN_FLAGS]);
+
+		/* Only one bit is used at this moment. */
+		if (flags & ~IPVLAN_F_PRIVATE)
+			return -EINVAL;
+	}
+
 	return 0;
 }
 
@@ -502,6 +527,8 @@ static int ipvlan_nl_fillinfo(struct sk_buff *skb,
 	ret = -EMSGSIZE;
 	if (nla_put_u16(skb, IFLA_IPVLAN_MODE, port->mode))
 		goto err;
+	if (nla_put_u16(skb, IFLA_IPVLAN_FLAGS, port->flags))
+		goto err;
 
 	return 0;
 
@@ -549,6 +576,12 @@ int ipvlan_link_new(struct net *src_net, struct net_device *dev,
 	ipvlan_adjust_mtu(ipvlan, phy_dev);
 	INIT_LIST_HEAD(&ipvlan->addrs);
 
+	/* Flags are per port and latest update overrides. User has
+	 * to be consistent in setting it just like the mode attribute.
+	 */
+	if (data && data[IFLA_IPVLAN_FLAGS])
+		ipvlan->port->flags = nla_get_u16(data[IFLA_IPVLAN_FLAGS]);
+
 	/* If the port-id base is at the MAX value, then wrap it around and
 	 * begin from 0x1 again. This may be due to a busy system where lots
 	 * of slaves are getting created and deleted.
@@ -644,6 +677,7 @@ EXPORT_SYMBOL_GPL(ipvlan_link_setup);
 static const struct nla_policy ipvlan_nl_policy[IFLA_IPVLAN_MAX + 1] =
 {
 	[IFLA_IPVLAN_MODE] = { .type = NLA_U16 },
+	[IFLA_IPVLAN_FLAGS] = { .type = NLA_U16 },
 };
 
 static struct rtnl_link_ops ipvlan_link_ops = {
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index b037e0ab1975..052e32cd584c 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -465,6 +465,7 @@ enum macsec_validation_type {
 enum {
 	IFLA_IPVLAN_UNSPEC,
 	IFLA_IPVLAN_MODE,
+	IFLA_IPVLAN_FLAGS,
 	__IFLA_IPVLAN_MAX
 };
 
@@ -477,6 +478,8 @@ enum ipvlan_mode {
 	IPVLAN_MODE_MAX
 };
 
+#define IPVLAN_F_PRIVATE	0x01
+
 /* VXLAN section */
 enum {
 	IFLA_VXLAN_UNSPEC,
-- 
cgit v1.2.3


From fe89aa6b250c1011ccf425fbb7998e96bd54263f Mon Sep 17 00:00:00 2001
From: Mahesh Bandewar <maheshb@google.com>
Date: Thu, 26 Oct 2017 15:09:25 -0700
Subject: ipvlan: implement VEPA mode

This is very similar to the Macvlan VEPA mode, however, there are some
differences. IPvlan uses the mac-address of the lower device, so the VEPA
mode has implications of ICMP-redirects for packets destined for its
immediate neighbors sharing same master since the packets will have same
source and dest mac. The external switch/router will send redirect msg.

Having said that, this will be a useful tool in terms of debugging
since IPvlan will not switch packets within its slaves and rely completely
on the external entity as intended in 802.1Qbg.

Signed-off-by: Mahesh Bandewar <maheshb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ipvlan.txt | 12 +++++++++++-
 drivers/net/ipvlan/ipvlan.h         | 15 +++++++++++++++
 drivers/net/ipvlan/ipvlan_core.c    | 17 ++++++++++-------
 drivers/net/ipvlan/ipvlan_main.c    | 13 +++++++++++--
 include/uapi/linux/if_link.h        |  1 +
 5 files changed, 48 insertions(+), 10 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/networking/ipvlan.txt b/Documentation/networking/ipvlan.txt
index bfa91c77a4c9..812ef003e0a8 100644
--- a/Documentation/networking/ipvlan.txt
+++ b/Documentation/networking/ipvlan.txt
@@ -25,7 +25,7 @@ using IProute2/ip utility.
     ip link add link <master> name <slave> type ipvlan [ mode MODE ] [ FLAGS ]
        where
          MODE: l3 (default) | l3s | l2
-         FLAGS: bridge (default) | private
+         FLAGS: bridge (default) | private | vepa
 
     e.g.
     (a) Following will create IPvlan link with eth0 as master in
@@ -35,6 +35,8 @@ using IProute2/ip utility.
           bash# ip link add link eth0 name ipvl0 type ipvlan mode l2 bridge
     (c) This command will create an IPvlan device in L2 private mode.
           bash# ip link add link eth0 name ipvlan type ipvlan mode l2 private
+    (d) This command will create an IPvlan device in L2 vepa mode.
+          bash# ip link add link eth0 name ipvlan type ipvlan mode l2 vepa
 
 
 4. Operating modes:
@@ -77,6 +79,14 @@ themseleves apart from talking through the master device.
 	If this option is added to the command-line, the port is set in private
 mode. i.e. port wont allow cross communication between slaves.
 
+5.3 vepa:
+	If this is added to the command-line, the port is set in VEPA mode.
+i.e. port will offload switching functionality to the external entity as
+described in 802.1Qbg
+Note: VEPA mode in IPvlan has limitations. IPvlan uses the mac-address of the
+master-device, so the packets which are emitted in this mode for the adjacent
+neighbor will have source and destination mac same. This will make the switch /
+router send the redirect message.
 
 6. What to choose (macvlan vs. ipvlan)?
 	These two devices are very similar in many regards and the specific use
diff --git a/drivers/net/ipvlan/ipvlan.h b/drivers/net/ipvlan/ipvlan.h
index 9941851bcc13..5166575a164d 100644
--- a/drivers/net/ipvlan/ipvlan.h
+++ b/drivers/net/ipvlan/ipvlan.h
@@ -139,6 +139,21 @@ static inline void ipvlan_clear_private(struct ipvl_port *port)
 	port->flags &= ~IPVLAN_F_PRIVATE;
 }
 
+static inline bool ipvlan_is_vepa(const struct ipvl_port *port)
+{
+	return !!(port->flags & IPVLAN_F_VEPA);
+}
+
+static inline void ipvlan_mark_vepa(struct ipvl_port *port)
+{
+	port->flags |= IPVLAN_F_VEPA;
+}
+
+static inline void ipvlan_clear_vepa(struct ipvl_port *port)
+{
+	port->flags &= ~IPVLAN_F_VEPA;
+}
+
 void ipvlan_init_secret(void);
 unsigned int ipvlan_mac_hash(const unsigned char *addr);
 rx_handler_result_t ipvlan_handle_frame(struct sk_buff **pskb);
diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c
index 72fd56de9c00..034ae4c57196 100644
--- a/drivers/net/ipvlan/ipvlan_core.c
+++ b/drivers/net/ipvlan/ipvlan_core.c
@@ -514,13 +514,15 @@ static int ipvlan_xmit_mode_l3(struct sk_buff *skb, struct net_device *dev)
 	if (!lyr3h)
 		goto out;
 
-	addr = ipvlan_addr_lookup(ipvlan->port, lyr3h, addr_type, true);
-	if (addr) {
-		if (ipvlan_is_private(ipvlan->port)) {
-			consume_skb(skb);
-			return NET_XMIT_DROP;
+	if (!ipvlan_is_vepa(ipvlan->port)) {
+		addr = ipvlan_addr_lookup(ipvlan->port, lyr3h, addr_type, true);
+		if (addr) {
+			if (ipvlan_is_private(ipvlan->port)) {
+				consume_skb(skb);
+				return NET_XMIT_DROP;
+			}
+			return ipvlan_rcv_frame(addr, &skb, true);
 		}
-		return ipvlan_rcv_frame(addr, &skb, true);
 	}
 out:
 	ipvlan_skb_crossing_ns(skb, ipvlan->phy_dev);
@@ -535,7 +537,8 @@ static int ipvlan_xmit_mode_l2(struct sk_buff *skb, struct net_device *dev)
 	void *lyr3h;
 	int addr_type;
 
-	if (ether_addr_equal(eth->h_dest, eth->h_source)) {
+	if (!ipvlan_is_vepa(ipvlan->port) &&
+	    ether_addr_equal(eth->h_dest, eth->h_source)) {
 		lyr3h = ipvlan_get_L3_hdr(skb, &addr_type);
 		if (lyr3h) {
 			addr = ipvlan_addr_lookup(ipvlan->port, lyr3h, addr_type, true);
diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c
index 4368afb1934c..a266aa435d4d 100644
--- a/drivers/net/ipvlan/ipvlan_main.c
+++ b/drivers/net/ipvlan/ipvlan_main.c
@@ -478,6 +478,11 @@ static int ipvlan_nl_changelink(struct net_device *dev,
 			ipvlan_mark_private(port);
 		else
 			ipvlan_clear_private(port);
+
+		if (flags & IPVLAN_F_VEPA)
+			ipvlan_mark_vepa(port);
+		else
+			ipvlan_clear_vepa(port);
 	}
 
 	return err;
@@ -506,8 +511,12 @@ static int ipvlan_nl_validate(struct nlattr *tb[], struct nlattr *data[],
 	if (data[IFLA_IPVLAN_FLAGS]) {
 		u16 flags = nla_get_u16(data[IFLA_IPVLAN_FLAGS]);
 
-		/* Only one bit is used at this moment. */
-		if (flags & ~IPVLAN_F_PRIVATE)
+		/* Only two bits are used at this moment. */
+		if (flags & ~(IPVLAN_F_PRIVATE | IPVLAN_F_VEPA))
+			return -EINVAL;
+		/* Also both flags can't be active at the same time. */
+		if ((flags & (IPVLAN_F_PRIVATE | IPVLAN_F_VEPA)) ==
+		    (IPVLAN_F_PRIVATE | IPVLAN_F_VEPA))
 			return -EINVAL;
 	}
 
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 052e32cd584c..81f26473d728 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -479,6 +479,7 @@ enum ipvlan_mode {
 };
 
 #define IPVLAN_F_PRIVATE	0x01
+#define IPVLAN_F_VEPA		0x02
 
 /* VXLAN section */
 enum {
-- 
cgit v1.2.3


From 40c3c40947324d9f40bf47830c92c59a9bbadf4a Mon Sep 17 00:00:00 2001
From: Qu Wenruo <quwenruo.btrfs@gmx.com>
Date: Wed, 23 Aug 2017 16:57:58 +0900
Subject: btrfs: Add sanity check for EXTENT_DATA when reading out leaf

Add extra checks for items with EXTENT_DATA type.  This checks the
following things:

0) Key offset
   All key offsets must be aligned to sectorsize.
   Inline extent must have 0 for key offset.

1) Item size
   Uncompressed inline file extent size must match item size.
   (Compressed inline file extent has no information about its on-disk size.)
   Regular/preallocated file extent size must be a fixed value.

2) Every member of regular file extent item
   Including alignment for bytenr and offset, possible value for
   compression/encryption/type.

3) Type/compression/encode must be one of the valid values.

This should be the most comprehensive and strict check in the context
of btrfs_item for EXTENT_DATA.

Signed-off-by: Qu Wenruo <quwenruo.btrfs@gmx.com>
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
[ switch to BTRFS_FILE_EXTENT_TYPES, similar to what
  BTRFS_COMPRESS_TYPES does ]
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/disk-io.c              | 103 ++++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/btrfs_tree.h |   1 +
 2 files changed, 104 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index d1770b3e0385..b863d41f7d0a 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -549,6 +549,100 @@ static int check_tree_block_fsid(struct btrfs_fs_info *fs_info,
 		   btrfs_header_level(eb) == 0 ? "leaf" : "node",	\
 		   reason, btrfs_header_bytenr(eb), root->objectid, slot)
 
+static int check_extent_data_item(struct btrfs_root *root,
+				  struct extent_buffer *leaf,
+				  struct btrfs_key *key, int slot)
+{
+	struct btrfs_file_extent_item *fi;
+	u32 sectorsize = root->fs_info->sectorsize;
+	u32 item_size = btrfs_item_size_nr(leaf, slot);
+
+	if (!IS_ALIGNED(key->offset, sectorsize)) {
+		CORRUPT("unaligned key offset for file extent",
+			leaf, root, slot);
+		return -EUCLEAN;
+	}
+
+	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+
+	if (btrfs_file_extent_type(leaf, fi) > BTRFS_FILE_EXTENT_TYPES) {
+		CORRUPT("invalid file extent type", leaf, root, slot);
+		return -EUCLEAN;
+	}
+
+	/*
+	 * Support for new compression/encrption must introduce incompat flag,
+	 * and must be caught in open_ctree().
+	 */
+	if (btrfs_file_extent_compression(leaf, fi) > BTRFS_COMPRESS_TYPES) {
+		CORRUPT("invalid file extent compression", leaf, root, slot);
+		return -EUCLEAN;
+	}
+	if (btrfs_file_extent_encryption(leaf, fi)) {
+		CORRUPT("invalid file extent encryption", leaf, root, slot);
+		return -EUCLEAN;
+	}
+	if (btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_INLINE) {
+		/* Inline extent must have 0 as key offset */
+		if (key->offset) {
+			CORRUPT("inline extent has non-zero key offset",
+				leaf, root, slot);
+			return -EUCLEAN;
+		}
+
+		/* Compressed inline extent has no on-disk size, skip it */
+		if (btrfs_file_extent_compression(leaf, fi) !=
+		    BTRFS_COMPRESS_NONE)
+			return 0;
+
+		/* Uncompressed inline extent size must match item size */
+		if (item_size != BTRFS_FILE_EXTENT_INLINE_DATA_START +
+		    btrfs_file_extent_ram_bytes(leaf, fi)) {
+			CORRUPT("plaintext inline extent has invalid size",
+				leaf, root, slot);
+			return -EUCLEAN;
+		}
+		return 0;
+	}
+
+	/* Regular or preallocated extent has fixed item size */
+	if (item_size != sizeof(*fi)) {
+		CORRUPT(
+		"regluar or preallocated extent data item size is invalid",
+			leaf, root, slot);
+		return -EUCLEAN;
+	}
+	if (!IS_ALIGNED(btrfs_file_extent_ram_bytes(leaf, fi), sectorsize) ||
+	    !IS_ALIGNED(btrfs_file_extent_disk_bytenr(leaf, fi), sectorsize) ||
+	    !IS_ALIGNED(btrfs_file_extent_disk_num_bytes(leaf, fi), sectorsize) ||
+	    !IS_ALIGNED(btrfs_file_extent_offset(leaf, fi), sectorsize) ||
+	    !IS_ALIGNED(btrfs_file_extent_num_bytes(leaf, fi), sectorsize)) {
+		CORRUPT(
+		"regular or preallocated extent data item has unaligned value",
+			leaf, root, slot);
+		return -EUCLEAN;
+	}
+
+	return 0;
+}
+
+/*
+ * Common point to switch the item-specific validation.
+ */
+static int check_leaf_item(struct btrfs_root *root,
+			   struct extent_buffer *leaf,
+			   struct btrfs_key *key, int slot)
+{
+	int ret = 0;
+
+	switch (key->type) {
+	case BTRFS_EXTENT_DATA_KEY:
+		ret = check_extent_data_item(root, leaf, key, slot);
+		break;
+	}
+	return ret;
+}
+
 static noinline int check_leaf(struct btrfs_root *root,
 			       struct extent_buffer *leaf)
 {
@@ -605,9 +699,13 @@ static noinline int check_leaf(struct btrfs_root *root,
 	 * 1) key order
 	 * 2) item offset and size
 	 *    No overlap, no hole, all inside the leaf.
+	 * 3) item content
+	 *    If possible, do comprehensive sanity check.
+	 *    NOTE: All checks must only rely on the item data itself.
 	 */
 	for (slot = 0; slot < nritems; slot++) {
 		u32 item_end_expected;
+		int ret;
 
 		btrfs_item_key_to_cpu(leaf, &key, slot);
 
@@ -650,6 +748,11 @@ static noinline int check_leaf(struct btrfs_root *root,
 			return -EUCLEAN;
 		}
 
+		/* Check if the item size and content meet other criteria */
+		ret = check_leaf_item(root, leaf, &key, slot);
+		if (ret < 0)
+			return ret;
+
 		prev_key.objectid = key.objectid;
 		prev_key.type = key.type;
 		prev_key.offset = key.offset;
diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h
index 10689e1fdf11..3142645a27f5 100644
--- a/include/uapi/linux/btrfs_tree.h
+++ b/include/uapi/linux/btrfs_tree.h
@@ -732,6 +732,7 @@ struct btrfs_balance_item {
 #define BTRFS_FILE_EXTENT_INLINE 0
 #define BTRFS_FILE_EXTENT_REG 1
 #define BTRFS_FILE_EXTENT_PREALLOC 2
+#define BTRFS_FILE_EXTENT_TYPES	2
 
 struct btrfs_file_extent_item {
 	/*
-- 
cgit v1.2.3


From cb91775711b2f3f7adea8d33aa83104baf75ee07 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 19 Oct 2017 16:47:49 +0200
Subject: isofs: use unsigned char types consistently

Based on the discussion about the signed character field for the year,
I went through all fields in the iso9660 and rockridge standards to see
whether they should use signed or unsigned characters. Only a single
8-bit value is defined as signed per 'section 7.1.2': the timezone
offset in a timestamp, this has always been handled correctly through
explicit sign-extension.

All others are either '7.1.1 8-bit unsigned numerical values' or
composite fields. I also read the linux source code and came to the
same conclusion, also I could not find any other part of the
implementation that actually behaves differently for signed or
unsigned values.

Since it is still ambiguous to use plain 'char' in interface definitions,
I'm changing all fields representing numbers and reserved bytes to
the unambiguous '__u8'. Fields that hold actual strings are left as
'char' arrays. I built the code with '-Wpointer-sign -Wsign-compare'
to see if anything got left out, but couldn't find anything wrong
with the remaining warnings.

This patch should not change runtime behavior and does not need to
be backported.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/isofs/isofs.h            |  20 +++---
 fs/isofs/rock.h             |  62 ++++++++---------
 include/uapi/linux/iso_fs.h | 162 ++++++++++++++++++++++----------------------
 3 files changed, 122 insertions(+), 122 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/fs/isofs/isofs.h b/fs/isofs/isofs.h
index bd4047585431..c882f207dd5c 100644
--- a/fs/isofs/isofs.h
+++ b/fs/isofs/isofs.h
@@ -72,36 +72,36 @@ static inline struct iso_inode_info *ISOFS_I(struct inode *inode)
 	return container_of(inode, struct iso_inode_info, vfs_inode);
 }
 
-static inline int isonum_711(char *p)
+static inline int isonum_711(u8 *p)
 {
-	return *(u8 *)p;
+	return *p;
 }
-static inline int isonum_712(char *p)
+static inline int isonum_712(s8 *p)
 {
-	return *(s8 *)p;
+	return *p;
 }
-static inline unsigned int isonum_721(char *p)
+static inline unsigned int isonum_721(u8 *p)
 {
 	return get_unaligned_le16(p);
 }
-static inline unsigned int isonum_722(char *p)
+static inline unsigned int isonum_722(u8 *p)
 {
 	return get_unaligned_be16(p);
 }
-static inline unsigned int isonum_723(char *p)
+static inline unsigned int isonum_723(u8 *p)
 {
 	/* Ignore bigendian datum due to broken mastering programs */
 	return get_unaligned_le16(p);
 }
-static inline unsigned int isonum_731(char *p)
+static inline unsigned int isonum_731(u8 *p)
 {
 	return get_unaligned_le32(p);
 }
-static inline unsigned int isonum_732(char *p)
+static inline unsigned int isonum_732(u8 *p)
 {
 	return get_unaligned_be32(p);
 }
-static inline unsigned int isonum_733(char *p)
+static inline unsigned int isonum_733(u8 *p)
 {
 	/* Ignore bigendian datum due to broken mastering programs */
 	return get_unaligned_le32(p);
diff --git a/fs/isofs/rock.h b/fs/isofs/rock.h
index f835976ce033..8780d67c6ca5 100644
--- a/fs/isofs/rock.h
+++ b/fs/isofs/rock.h
@@ -6,62 +6,62 @@
  */
 
 struct SU_SP_s {
-	unsigned char magic[2];
-	unsigned char skip;
+	__u8 magic[2];
+	__u8 skip;
 } __attribute__ ((packed));
 
 struct SU_CE_s {
-	char extent[8];
-	char offset[8];
-	char size[8];
+	__u8 extent[8];
+	__u8 offset[8];
+	__u8 size[8];
 };
 
 struct SU_ER_s {
-	unsigned char len_id;
-	unsigned char len_des;
-	unsigned char len_src;
-	unsigned char ext_ver;
-	char data[0];
+	__u8 len_id;
+	__u8 len_des;
+	__u8 len_src;
+	__u8 ext_ver;
+	__u8 data[0];
 } __attribute__ ((packed));
 
 struct RR_RR_s {
-	char flags[1];
+	__u8 flags[1];
 } __attribute__ ((packed));
 
 struct RR_PX_s {
-	char mode[8];
-	char n_links[8];
-	char uid[8];
-	char gid[8];
+	__u8 mode[8];
+	__u8 n_links[8];
+	__u8 uid[8];
+	__u8 gid[8];
 };
 
 struct RR_PN_s {
-	char dev_high[8];
-	char dev_low[8];
+	__u8 dev_high[8];
+	__u8 dev_low[8];
 };
 
 struct SL_component {
-	unsigned char flags;
-	unsigned char len;
-	char text[0];
+	__u8 flags;
+	__u8 len;
+	__u8 text[0];
 } __attribute__ ((packed));
 
 struct RR_SL_s {
-	unsigned char flags;
+	__u8 flags;
 	struct SL_component link;
 } __attribute__ ((packed));
 
 struct RR_NM_s {
-	unsigned char flags;
+	__u8 flags;
 	char name[0];
 } __attribute__ ((packed));
 
 struct RR_CL_s {
-	char location[8];
+	__u8 location[8];
 };
 
 struct RR_PL_s {
-	char location[8];
+	__u8 location[8];
 };
 
 struct stamp {
@@ -69,15 +69,15 @@ struct stamp {
 } __attribute__ ((packed));
 
 struct RR_TF_s {
-	char flags;
+	__u8 flags;
 	struct stamp times[0];	/* Variable number of these beasts */
 } __attribute__ ((packed));
 
 /* Linux-specific extension for transparent decompression */
 struct RR_ZF_s {
-	char algorithm[2];
-	char parms[2];
-	char real_size[8];
+	__u8 algorithm[2];
+	__u8 parms[2];
+	__u8 real_size[8];
 };
 
 /*
@@ -93,9 +93,9 @@ struct RR_ZF_s {
 #define TF_LONG_FORM 128
 
 struct rock_ridge {
-	char signature[2];
-	unsigned char len;
-	unsigned char version;
+	__u8 signature[2];
+	__u8 len;
+	__u8 version;
 	union {
 		struct SU_SP_s SP;
 		struct SU_CE_s CE;
diff --git a/include/uapi/linux/iso_fs.h b/include/uapi/linux/iso_fs.h
index 4688ac4284e2..07c4c6405b3c 100644
--- a/include/uapi/linux/iso_fs.h
+++ b/include/uapi/linux/iso_fs.h
@@ -12,10 +12,10 @@
 #define ISODCL(from, to) (to - from + 1)
 
 struct iso_volume_descriptor {
-	char type[ISODCL(1,1)]; /* 711 */
+	__u8 type[ISODCL(1,1)]; /* 711 */
 	char id[ISODCL(2,6)];
-	char version[ISODCL(7,7)];
-	char data[ISODCL(8,2048)];
+	__u8 version[ISODCL(7,7)];
+	__u8 data[ISODCL(8,2048)];
 };
 
 /* volume descriptor types */
@@ -26,24 +26,24 @@ struct iso_volume_descriptor {
 #define ISO_STANDARD_ID "CD001"
 
 struct iso_primary_descriptor {
-	char type			[ISODCL (  1,   1)]; /* 711 */
+	__u8 type			[ISODCL (  1,   1)]; /* 711 */
 	char id				[ISODCL (  2,   6)];
-	char version			[ISODCL (  7,   7)]; /* 711 */
-	char unused1			[ISODCL (  8,   8)];
+	__u8 version			[ISODCL (  7,   7)]; /* 711 */
+	__u8 unused1			[ISODCL (  8,   8)];
 	char system_id			[ISODCL (  9,  40)]; /* achars */
 	char volume_id			[ISODCL ( 41,  72)]; /* dchars */
-	char unused2			[ISODCL ( 73,  80)];
-	char volume_space_size		[ISODCL ( 81,  88)]; /* 733 */
-	char unused3			[ISODCL ( 89, 120)];
-	char volume_set_size		[ISODCL (121, 124)]; /* 723 */
-	char volume_sequence_number	[ISODCL (125, 128)]; /* 723 */
-	char logical_block_size		[ISODCL (129, 132)]; /* 723 */
-	char path_table_size		[ISODCL (133, 140)]; /* 733 */
-	char type_l_path_table		[ISODCL (141, 144)]; /* 731 */
-	char opt_type_l_path_table	[ISODCL (145, 148)]; /* 731 */
-	char type_m_path_table		[ISODCL (149, 152)]; /* 732 */
-	char opt_type_m_path_table	[ISODCL (153, 156)]; /* 732 */
-	char root_directory_record	[ISODCL (157, 190)]; /* 9.1 */
+	__u8 unused2			[ISODCL ( 73,  80)];
+	__u8 volume_space_size		[ISODCL ( 81,  88)]; /* 733 */
+	__u8 unused3			[ISODCL ( 89, 120)];
+	__u8 volume_set_size		[ISODCL (121, 124)]; /* 723 */
+	__u8 volume_sequence_number	[ISODCL (125, 128)]; /* 723 */
+	__u8 logical_block_size		[ISODCL (129, 132)]; /* 723 */
+	__u8 path_table_size		[ISODCL (133, 140)]; /* 733 */
+	__u8 type_l_path_table		[ISODCL (141, 144)]; /* 731 */
+	__u8 opt_type_l_path_table	[ISODCL (145, 148)]; /* 731 */
+	__u8 type_m_path_table		[ISODCL (149, 152)]; /* 732 */
+	__u8 opt_type_m_path_table	[ISODCL (153, 156)]; /* 732 */
+	__u8 root_directory_record	[ISODCL (157, 190)]; /* 9.1 */
 	char volume_set_id		[ISODCL (191, 318)]; /* dchars */
 	char publisher_id		[ISODCL (319, 446)]; /* achars */
 	char preparer_id		[ISODCL (447, 574)]; /* achars */
@@ -51,36 +51,36 @@ struct iso_primary_descriptor {
 	char copyright_file_id		[ISODCL (703, 739)]; /* 7.5 dchars */
 	char abstract_file_id		[ISODCL (740, 776)]; /* 7.5 dchars */
 	char bibliographic_file_id	[ISODCL (777, 813)]; /* 7.5 dchars */
-	char creation_date		[ISODCL (814, 830)]; /* 8.4.26.1 */
-	char modification_date		[ISODCL (831, 847)]; /* 8.4.26.1 */
-	char expiration_date		[ISODCL (848, 864)]; /* 8.4.26.1 */
-	char effective_date		[ISODCL (865, 881)]; /* 8.4.26.1 */
-	char file_structure_version	[ISODCL (882, 882)]; /* 711 */
-	char unused4			[ISODCL (883, 883)];
-	char application_data		[ISODCL (884, 1395)];
-	char unused5			[ISODCL (1396, 2048)];
+	__u8 creation_date		[ISODCL (814, 830)]; /* 8.4.26.1 */
+	__u8 modification_date		[ISODCL (831, 847)]; /* 8.4.26.1 */
+	__u8 expiration_date		[ISODCL (848, 864)]; /* 8.4.26.1 */
+	__u8 effective_date		[ISODCL (865, 881)]; /* 8.4.26.1 */
+	__u8 file_structure_version	[ISODCL (882, 882)]; /* 711 */
+	__u8 unused4			[ISODCL (883, 883)];
+	__u8 application_data		[ISODCL (884, 1395)];
+	__u8 unused5			[ISODCL (1396, 2048)];
 };
 
 /* Almost the same as the primary descriptor but two fields are specified */
 struct iso_supplementary_descriptor {
-	char type			[ISODCL (  1,   1)]; /* 711 */
+	__u8 type			[ISODCL (  1,   1)]; /* 711 */
 	char id				[ISODCL (  2,   6)];
-	char version			[ISODCL (  7,   7)]; /* 711 */
-	char flags			[ISODCL (  8,   8)]; /* 853 */
+	__u8 version			[ISODCL (  7,   7)]; /* 711 */
+	__u8 flags			[ISODCL (  8,   8)]; /* 853 */
 	char system_id			[ISODCL (  9,  40)]; /* achars */
 	char volume_id			[ISODCL ( 41,  72)]; /* dchars */
-	char unused2			[ISODCL ( 73,  80)];
-	char volume_space_size		[ISODCL ( 81,  88)]; /* 733 */
-	char escape			[ISODCL ( 89, 120)]; /* 856 */
-	char volume_set_size		[ISODCL (121, 124)]; /* 723 */
-	char volume_sequence_number	[ISODCL (125, 128)]; /* 723 */
-	char logical_block_size		[ISODCL (129, 132)]; /* 723 */
-	char path_table_size		[ISODCL (133, 140)]; /* 733 */
-	char type_l_path_table		[ISODCL (141, 144)]; /* 731 */
-	char opt_type_l_path_table	[ISODCL (145, 148)]; /* 731 */
-	char type_m_path_table		[ISODCL (149, 152)]; /* 732 */
-	char opt_type_m_path_table	[ISODCL (153, 156)]; /* 732 */
-	char root_directory_record	[ISODCL (157, 190)]; /* 9.1 */
+	__u8 unused2			[ISODCL ( 73,  80)];
+	__u8 volume_space_size		[ISODCL ( 81,  88)]; /* 733 */
+	__u8 escape			[ISODCL ( 89, 120)]; /* 856 */
+	__u8 volume_set_size		[ISODCL (121, 124)]; /* 723 */
+	__u8 volume_sequence_number	[ISODCL (125, 128)]; /* 723 */
+	__u8 logical_block_size		[ISODCL (129, 132)]; /* 723 */
+	__u8 path_table_size		[ISODCL (133, 140)]; /* 733 */
+	__u8 type_l_path_table		[ISODCL (141, 144)]; /* 731 */
+	__u8 opt_type_l_path_table	[ISODCL (145, 148)]; /* 731 */
+	__u8 type_m_path_table		[ISODCL (149, 152)]; /* 732 */
+	__u8 opt_type_m_path_table	[ISODCL (153, 156)]; /* 732 */
+	__u8 root_directory_record	[ISODCL (157, 190)]; /* 9.1 */
 	char volume_set_id		[ISODCL (191, 318)]; /* dchars */
 	char publisher_id		[ISODCL (319, 446)]; /* achars */
 	char preparer_id		[ISODCL (447, 574)]; /* achars */
@@ -88,54 +88,54 @@ struct iso_supplementary_descriptor {
 	char copyright_file_id		[ISODCL (703, 739)]; /* 7.5 dchars */
 	char abstract_file_id		[ISODCL (740, 776)]; /* 7.5 dchars */
 	char bibliographic_file_id	[ISODCL (777, 813)]; /* 7.5 dchars */
-	char creation_date		[ISODCL (814, 830)]; /* 8.4.26.1 */
-	char modification_date		[ISODCL (831, 847)]; /* 8.4.26.1 */
-	char expiration_date		[ISODCL (848, 864)]; /* 8.4.26.1 */
-	char effective_date		[ISODCL (865, 881)]; /* 8.4.26.1 */
-	char file_structure_version	[ISODCL (882, 882)]; /* 711 */
-	char unused4			[ISODCL (883, 883)];
-	char application_data		[ISODCL (884, 1395)];
-	char unused5			[ISODCL (1396, 2048)];
+	__u8 creation_date		[ISODCL (814, 830)]; /* 8.4.26.1 */
+	__u8 modification_date		[ISODCL (831, 847)]; /* 8.4.26.1 */
+	__u8 expiration_date		[ISODCL (848, 864)]; /* 8.4.26.1 */
+	__u8 effective_date		[ISODCL (865, 881)]; /* 8.4.26.1 */
+	__u8 file_structure_version	[ISODCL (882, 882)]; /* 711 */
+	__u8 unused4			[ISODCL (883, 883)];
+	__u8 application_data		[ISODCL (884, 1395)];
+	__u8 unused5			[ISODCL (1396, 2048)];
 };
 
 
 #define HS_STANDARD_ID "CDROM"
 
 struct  hs_volume_descriptor {
-	char foo			[ISODCL (  1,   8)]; /* 733 */
-	char type			[ISODCL (  9,   9)]; /* 711 */
+	__u8 foo			[ISODCL (  1,   8)]; /* 733 */
+	__u8 type			[ISODCL (  9,   9)]; /* 711 */
 	char id				[ISODCL ( 10,  14)];
-	char version			[ISODCL ( 15,  15)]; /* 711 */
-	char data[ISODCL(16,2048)];
+	__u8 version			[ISODCL ( 15,  15)]; /* 711 */
+	__u8 data[ISODCL(16,2048)];
 };
 
 
 struct hs_primary_descriptor {
-	char foo			[ISODCL (  1,   8)]; /* 733 */
-	char type			[ISODCL (  9,   9)]; /* 711 */
-	char id				[ISODCL ( 10,  14)];
-	char version			[ISODCL ( 15,  15)]; /* 711 */
-	char unused1			[ISODCL ( 16,  16)]; /* 711 */
+	__u8 foo			[ISODCL (  1,   8)]; /* 733 */
+	__u8 type			[ISODCL (  9,   9)]; /* 711 */
+	__u8 id				[ISODCL ( 10,  14)];
+	__u8 version			[ISODCL ( 15,  15)]; /* 711 */
+	__u8 unused1			[ISODCL ( 16,  16)]; /* 711 */
 	char system_id			[ISODCL ( 17,  48)]; /* achars */
 	char volume_id			[ISODCL ( 49,  80)]; /* dchars */
-	char unused2			[ISODCL ( 81,  88)]; /* 733 */
-	char volume_space_size		[ISODCL ( 89,  96)]; /* 733 */
-	char unused3			[ISODCL ( 97, 128)]; /* 733 */
-	char volume_set_size		[ISODCL (129, 132)]; /* 723 */
-	char volume_sequence_number	[ISODCL (133, 136)]; /* 723 */
-	char logical_block_size		[ISODCL (137, 140)]; /* 723 */
-	char path_table_size		[ISODCL (141, 148)]; /* 733 */
-	char type_l_path_table		[ISODCL (149, 152)]; /* 731 */
-	char unused4			[ISODCL (153, 180)]; /* 733 */
-	char root_directory_record	[ISODCL (181, 214)]; /* 9.1 */
+	__u8 unused2			[ISODCL ( 81,  88)]; /* 733 */
+	__u8 volume_space_size		[ISODCL ( 89,  96)]; /* 733 */
+	__u8 unused3			[ISODCL ( 97, 128)]; /* 733 */
+	__u8 volume_set_size		[ISODCL (129, 132)]; /* 723 */
+	__u8 volume_sequence_number	[ISODCL (133, 136)]; /* 723 */
+	__u8 logical_block_size		[ISODCL (137, 140)]; /* 723 */
+	__u8 path_table_size		[ISODCL (141, 148)]; /* 733 */
+	__u8 type_l_path_table		[ISODCL (149, 152)]; /* 731 */
+	__u8 unused4			[ISODCL (153, 180)]; /* 733 */
+	__u8 root_directory_record	[ISODCL (181, 214)]; /* 9.1 */
 };
 
 /* We use this to help us look up the parent inode numbers. */
 
 struct iso_path_table{
-	unsigned char  name_len[2];	/* 721 */
-	char extent[4];		/* 731 */
-	char  parent[2];	/* 721 */
+	__u8  name_len[2];	/* 721 */
+	__u8  extent[4];	/* 731 */
+	__u8  parent[2];	/* 721 */
 	char name[0];
 } __attribute__((packed));
 
@@ -143,16 +143,16 @@ struct iso_path_table{
    there is an extra reserved byte after the flags */
 
 struct iso_directory_record {
-	char length			[ISODCL (1, 1)]; /* 711 */
-	char ext_attr_length		[ISODCL (2, 2)]; /* 711 */
-	char extent			[ISODCL (3, 10)]; /* 733 */
-	char size			[ISODCL (11, 18)]; /* 733 */
-	char date			[ISODCL (19, 25)]; /* 7 by 711 */
-	char flags			[ISODCL (26, 26)];
-	char file_unit_size		[ISODCL (27, 27)]; /* 711 */
-	char interleave			[ISODCL (28, 28)]; /* 711 */
-	char volume_sequence_number	[ISODCL (29, 32)]; /* 723 */
-	unsigned char name_len		[ISODCL (33, 33)]; /* 711 */
+	__u8 length			[ISODCL (1, 1)]; /* 711 */
+	__u8 ext_attr_length		[ISODCL (2, 2)]; /* 711 */
+	__u8 extent			[ISODCL (3, 10)]; /* 733 */
+	__u8 size			[ISODCL (11, 18)]; /* 733 */
+	__u8 date			[ISODCL (19, 25)]; /* 7 by 711 */
+	__u8 flags			[ISODCL (26, 26)];
+	__u8 file_unit_size		[ISODCL (27, 27)]; /* 711 */
+	__u8 interleave			[ISODCL (28, 28)]; /* 711 */
+	__u8 volume_sequence_number	[ISODCL (29, 32)]; /* 723 */
+	__u8 name_len			[ISODCL (33, 33)]; /* 711 */
 	char name			[0];
 } __attribute__((packed));
 
-- 
cgit v1.2.3


From d24a67b2d997c860a42516076f3315c2ad2d2884 Mon Sep 17 00:00:00 2001
From: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
Date: Fri, 22 Sep 2017 13:58:46 -0400
Subject: btrfs: add a flags argument to LOGICAL_INO and call it LOGICAL_INO_V2

Now that check_extent_in_eb()'s extent offset filter can be turned off,
we need a way to do it from userspace.

Add a 'flags' field to the btrfs_logical_ino_args structure to disable
extent offset filtering, taking the place of one of the existing
reserved[] fields.

Previous versions of LOGICAL_INO neglected to check whether any of the
reserved fields have non-zero values.  Assigning meaning to those fields
now may change the behavior of existing programs that left these fields
uninitialized.  The lack of a zero check also means that new programs
have no way to know whether the kernel is honoring the flags field.

To avoid these problems, define a new ioctl LOGICAL_INO_V2.  We can
use the same argument layout as LOGICAL_INO, but shorten the reserved[]
array by one element and turn it into the 'flags' field.  The V2 ioctl
explicitly checks that reserved fields and unsupported flag bits are zero
so that userspace can negotiate future feature bits as they are defined.

Since the memory layouts of the two ioctls' arguments are compatible,
there is no need for a separate function for logical_to_ino_v2 (contrast
with tree_search_v2 vs tree_search where the layout and code are quite
different).  A version parameter and an 'if' statement will suffice.

Now that we have a flags field in logical_ino_args, add a flag
BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET to get the behavior we want,
and pass it down the stack to iterate_inodes_from_logical.

Motivation and background, copied from the patchset cover letter:

Suppose we have a file with one extent:

    root@tester:~# zcat /usr/share/doc/cpio/changelog.gz > /test/a
    root@tester:~# sync

Split the extent by overwriting it in the middle:

    root@tester:~# cat /dev/urandom | dd bs=4k seek=2 skip=2 count=1 conv=notrunc of=/test/a

We should now have 3 extent refs to 2 extents, with one block unreachable.
The extent tree looks like:

    root@tester:~# btrfs-debug-tree /dev/vdc -t 2
    [...]
            item 9 key (1103101952 EXTENT_ITEM 73728) itemoff 15942 itemsize 53
                    extent refs 2 gen 29 flags DATA
                    extent data backref root 5 objectid 261 offset 0 count 2
    [...]
            item 11 key (1103175680 EXTENT_ITEM 4096) itemoff 15865 itemsize 53
                    extent refs 1 gen 30 flags DATA
                    extent data backref root 5 objectid 261 offset 8192 count 1
    [...]

and the ref tree looks like:

    root@tester:~# btrfs-debug-tree /dev/vdc -t 5
    [...]
            item 6 key (261 EXTENT_DATA 0) itemoff 15825 itemsize 53
                    extent data disk byte 1103101952 nr 73728
                    extent data offset 0 nr 8192 ram 73728
                    extent compression(none)
            item 7 key (261 EXTENT_DATA 8192) itemoff 15772 itemsize 53
                    extent data disk byte 1103175680 nr 4096
                    extent data offset 0 nr 4096 ram 4096
                    extent compression(none)
            item 8 key (261 EXTENT_DATA 12288) itemoff 15719 itemsize 53
                    extent data disk byte 1103101952 nr 73728
                    extent data offset 12288 nr 61440 ram 73728
                    extent compression(none)
    [...]

There are two references to the same extent with different, non-overlapping
byte offsets:

    [------------------72K extent at 1103101952----------------------]
    [--8K----------------|--4K unreachable----|--60K-----------------]
    ^                                         ^
    |                                         |
    [--8K ref offset 0--][--4K ref offset 0--][--60K ref offset 12K--]
                         |
                         v
                         [-----4K extent-----] at 1103175680

We want to find all of the references to extent bytenr 1103101952.

Without the patch (and without running btrfs-debug-tree), we have to
do it with 18 LOGICAL_INO calls:

    root@tester:~# btrfs ins log 1103101952 -P /test/
    Using LOGICAL_INO
    inode 261 offset 0 root 5

    root@tester:~# for x in $(seq 0 17); do btrfs ins log $((1103101952 + x * 4096)) -P /test/; done 2>&1 | grep inode
    inode 261 offset 0 root 5
    inode 261 offset 4096 root 5   <- same extent ref as offset 0
                                   (offset 8192 returns empty set, not reachable)
    inode 261 offset 12288 root 5
    inode 261 offset 16384 root 5  \
    inode 261 offset 20480 root 5  |
    inode 261 offset 24576 root 5  |
    inode 261 offset 28672 root 5  |
    inode 261 offset 32768 root 5  |
    inode 261 offset 36864 root 5  \
    inode 261 offset 40960 root 5   > all the same extent ref as offset 12288.
    inode 261 offset 45056 root 5  /  More processing required in userspace
    inode 261 offset 49152 root 5  |  to figure out these are all duplicates.
    inode 261 offset 53248 root 5  |
    inode 261 offset 57344 root 5  |
    inode 261 offset 61440 root 5  |
    inode 261 offset 65536 root 5  |
    inode 261 offset 69632 root 5  /

In the worst case the extents are 128MB long, and we have to do 32768
iterations of the loop to find one 4K extent ref.

With the patch, we just use one call to map all refs to the extent at once:
    root@tester:~# btrfs ins log 1103101952 -P /test/
    Using LOGICAL_INO_V2
    inode 261 offset 0 root 5
    inode 261 offset 12288 root 5

The TREE_SEARCH ioctl allows userspace to retrieve the offset and
extent bytenr fields easily once the root, inode and offset are known.
This is sufficient information to build a complete map of the extent
and all of its references.  Userspace can use this information to make
better choices to dedup or defrag.

Signed-off-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
Reviewed-by: Hans van Kranenburg <hans.van.kranenburg@mendix.com>
Tested-by: Hans van Kranenburg <hans.van.kranenburg@mendix.com>
[ copy background and motivation from cover letter ]
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ioctl.c           | 26 +++++++++++++++++++++++---
 include/uapi/linux/btrfs.h |  8 +++++++-
 2 files changed, 30 insertions(+), 4 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 2497a5d45d9c..fa9996ab3da6 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -4530,13 +4530,14 @@ static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx)
 }
 
 static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info,
-					void __user *arg)
+					void __user *arg, int version)
 {
 	int ret = 0;
 	int size;
 	struct btrfs_ioctl_logical_ino_args *loi;
 	struct btrfs_data_container *inodes = NULL;
 	struct btrfs_path *path = NULL;
+	bool ignore_offset;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
@@ -4545,6 +4546,22 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info,
 	if (IS_ERR(loi))
 		return PTR_ERR(loi);
 
+	if (version == 1) {
+		ignore_offset = false;
+	} else {
+		/* All reserved bits must be 0 for now */
+		if (memchr_inv(loi->reserved, 0, sizeof(loi->reserved))) {
+			ret = -EINVAL;
+			goto out_loi;
+		}
+		/* Only accept flags we have defined so far */
+		if (loi->flags & ~(BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET)) {
+			ret = -EINVAL;
+			goto out_loi;
+		}
+		ignore_offset = loi->flags & BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET;
+	}
+
 	path = btrfs_alloc_path();
 	if (!path) {
 		ret = -ENOMEM;
@@ -4560,7 +4577,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info,
 	}
 
 	ret = iterate_inodes_from_logical(loi->logical, fs_info, path,
-					  build_ino_list, inodes, false);
+					  build_ino_list, inodes, ignore_offset);
 	if (ret == -EINVAL)
 		ret = -ENOENT;
 	if (ret < 0)
@@ -4574,6 +4591,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info,
 out:
 	btrfs_free_path(path);
 	kvfree(inodes);
+out_loi:
 	kfree(loi);
 
 	return ret;
@@ -5575,7 +5593,9 @@ long btrfs_ioctl(struct file *file, unsigned int
 	case BTRFS_IOC_INO_PATHS:
 		return btrfs_ioctl_ino_to_path(root, argp);
 	case BTRFS_IOC_LOGICAL_INO:
-		return btrfs_ioctl_logical_to_ino(fs_info, argp);
+		return btrfs_ioctl_logical_to_ino(fs_info, argp, 1);
+	case BTRFS_IOC_LOGICAL_INO_V2:
+		return btrfs_ioctl_logical_to_ino(fs_info, argp, 2);
 	case BTRFS_IOC_SPACE_INFO:
 		return btrfs_ioctl_space_info(fs_info, argp);
 	case BTRFS_IOC_SYNC: {
diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
index 378230c163d5..99bb7988e6fe 100644
--- a/include/uapi/linux/btrfs.h
+++ b/include/uapi/linux/btrfs.h
@@ -608,10 +608,14 @@ struct btrfs_ioctl_ino_path_args {
 struct btrfs_ioctl_logical_ino_args {
 	__u64				logical;	/* in */
 	__u64				size;		/* in */
-	__u64				reserved[4];
+	__u64				reserved[3];	/* must be 0 for now */
+	__u64				flags;		/* in, v2 only */
 	/* struct btrfs_data_container	*inodes;	out   */
 	__u64				inodes;
 };
+/* Return every ref to the extent, not just those containing logical block.
+ * Requires logical == extent bytenr. */
+#define BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET	(1ULL << 0)
 
 enum btrfs_dev_stat_values {
 	/* disk I/O failure stats */
@@ -835,5 +839,7 @@ enum btrfs_err_code {
 				   struct btrfs_ioctl_feature_flags[3])
 #define BTRFS_IOC_RM_DEV_V2 _IOW(BTRFS_IOCTL_MAGIC, 58, \
 				   struct btrfs_ioctl_vol_args_v2)
+#define BTRFS_IOC_LOGICAL_INO_V2 _IOWR(BTRFS_IOCTL_MAGIC, 59, \
+					struct btrfs_ioctl_logical_ino_args)
 
 #endif /* _UAPI_LINUX_BTRFS_H */
-- 
cgit v1.2.3


From e20eaa2382e7888a4e06ccb015c476a6fb1fda0c Mon Sep 17 00:00:00 2001
From: Tina Zhang <tina.zhang@intel.com>
Date: Thu, 23 Nov 2017 16:26:35 +0800
Subject: vfio: ABI for mdev display dma-buf operation

Add VFIO_DEVICE_QUERY_GFX_PLANE ioctl command to let user query and get
a plane and its information. So far, two types of buffers are supported:
buffers based on dma-buf and buffers based on region.

This ioctl can be invoked with:
1) Either DMABUF or REGION flag. Vendor driver returns a plane_info
successfully only when the specific kind of buffer is supported.
2) Flag PROBE. And at the same time either DMABUF or REGION must be set,
so that vendor driver returns success only when the specific kind of
buffer is supported.

Add VFIO_DEVICE_GET_GFX_DMABUF ioctl command to let user get a specific
dma-buf fd of an exposed MDEV buffer provided by dmabuf_id which was
returned in VFIO_DEVICE_QUERY_GFX_PLANE ioctl command.

The life cycle of an exposed MDEV buffer is handled by userspace and
tracked by kernel space. The returned dmabuf_id in struct
vfio_device_gfx_plane_info can be a new id of a new exposed buffer or an
old id of a re-exported buffer. Host user can check the value of
dmabuf_id to see if it needs to create new resources according to the
new exposed buffer or just re-use the existing resource related to the
old buffer.

v18:
- update comments for VFIO_DEVICE_GET_GFX_DMABUF. (Alex)

v17:
- modify VFIO_DEVICE_GET_GFX_DMABUF interface. (Alex)

v16:
- add x_hot and y_hot fields. (Gerd)
- add comments for VFIO_DEVICE_GET_GFX_DMABUF. (Alex)
- rebase to 4.14.0-rc6.

v15:
- add an ioctl to get a dmabuf for a given dmabuf id. (Gerd)

v14:
- add PROBE, DMABUF and REGION flags. (Alex)

v12:
- add drm_format_mod back. (Gerd and Zhenyu)
- add region_index. (Gerd)

v11:
- rename plane_type to drm_plane_type. (Gerd)
- move fields of vfio_device_query_gfx_plane to vfio_device_gfx_plane_info.
  (Gerd)
- remove drm_format_mod, start fields. (Daniel)
- remove plane_id.

v10:
- refine the ABI API VFIO_DEVICE_QUERY_GFX_PLANE. (Alex) (Gerd)

v3:
- add a field gvt_plane_info in the drm_i915_gem_obj structure to save
  the decoded plane information, avoiding a lookup whenever the plane
  info is needed. (Gerd)

Signed-off-by: Tina Zhang <tina.zhang@intel.com>
Reviewed-by: Gerd Hoffmann <kraxel@redhat.com>
Reviewed-by: Kirti Wankhede <kwankhede@nvidia.com>
Acked-by: Alex Williamson <alex.williamson@redhat.com>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Signed-off-by: Zhenyu Wang <zhenyuw@linux.intel.com>
---
 include/uapi/linux/vfio.h | 62 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 62 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index ae461050661a..5c1cca2ba04d 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -502,6 +502,68 @@ struct vfio_pci_hot_reset {
 
 #define VFIO_DEVICE_PCI_HOT_RESET	_IO(VFIO_TYPE, VFIO_BASE + 13)
 
+/**
+ * VFIO_DEVICE_QUERY_GFX_PLANE - _IOW(VFIO_TYPE, VFIO_BASE + 14,
+ *                                    struct vfio_device_gfx_plane_info)
+ *
+ * Set the drm_plane_type and flags, then retrieve the gfx plane info.
+ *
+ * flags supported:
+ * - VFIO_GFX_PLANE_TYPE_PROBE and VFIO_GFX_PLANE_TYPE_DMABUF are set
+ *   to ask if the mdev supports dma-buf. 0 on support, -EINVAL on no
+ *   support for dma-buf.
+ * - VFIO_GFX_PLANE_TYPE_PROBE and VFIO_GFX_PLANE_TYPE_REGION are set
+ *   to ask if the mdev supports region. 0 on support, -EINVAL on no
+ *   support for region.
+ * - VFIO_GFX_PLANE_TYPE_DMABUF or VFIO_GFX_PLANE_TYPE_REGION is set
+ *   with each call to query the plane info.
+ * - Others are invalid and return -EINVAL.
+ *
+ * Note:
+ * 1. Plane could be disabled by guest. In that case, success will be
+ *    returned with zero-initialized drm_format, size, width and height
+ *    fields.
+ * 2. x_hot/y_hot is set to 0xFFFFFFFF if no hotspot information available
+ *
+ * Return: 0 on success, -errno on other failure.
+ */
+struct vfio_device_gfx_plane_info {
+	__u32 argsz;
+	__u32 flags;
+#define VFIO_GFX_PLANE_TYPE_PROBE (1 << 0)
+#define VFIO_GFX_PLANE_TYPE_DMABUF (1 << 1)
+#define VFIO_GFX_PLANE_TYPE_REGION (1 << 2)
+	/* in */
+	__u32 drm_plane_type;	/* type of plane: DRM_PLANE_TYPE_* */
+	/* out */
+	__u32 drm_format;	/* drm format of plane */
+	__u64 drm_format_mod;   /* tiled mode */
+	__u32 width;	/* width of plane */
+	__u32 height;	/* height of plane */
+	__u32 stride;	/* stride of plane */
+	__u32 size;	/* size of plane in bytes, align on page */
+	__u32 x_pos;	/* horizontal position of cursor plane */
+	__u32 y_pos;	/* vertical position of cursor plane */
+	__u32 x_hot;    /* horizontal position of cursor hotspot */
+	__u32 y_hot;    /* vertical position of cursor hotspot */
+	union {
+		__u32 region_index;	/* region index */
+		__u32 dmabuf_id;	/* dma-buf id */
+	};
+};
+
+#define VFIO_DEVICE_QUERY_GFX_PLANE _IO(VFIO_TYPE, VFIO_BASE + 14)
+
+/**
+ * VFIO_DEVICE_GET_GFX_DMABUF - _IOW(VFIO_TYPE, VFIO_BASE + 15, __u32)
+ *
+ * Return a new dma-buf file descriptor for an exposed guest framebuffer
+ * described by the provided dmabuf_id. The dmabuf_id is returned from
+ * VFIO_DEVICE_QUERY_GFX_PLANE as a token of the exposed guest framebuffer.
+ */
+
+#define VFIO_DEVICE_GET_GFX_DMABUF _IO(VFIO_TYPE, VFIO_BASE + 15)
+
 /* -------- API for Type1 VFIO IOMMU -------- */
 
 /**
-- 
cgit v1.2.3


From 7582e22038a266444eb87bc07c372592ad647439 Mon Sep 17 00:00:00 2001
From: Dave Martin <Dave.Martin@arm.com>
Date: Tue, 31 Oct 2017 15:51:08 +0000
Subject: arm64/sve: Backend logic for setting the vector length
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch implements the core logic for changing a task's vector
length on request from userspace.  This will be used by the ptrace
and prctl frontends that are implemented in later patches.

The SVE architecture permits, but does not require, implementations
to support vector lengths that are not a power of two.  To handle
this, logic is added to check a requested vector length against a
possibly sparse bitmap of available vector lengths at runtime, so
that the best supported value can be chosen.

Signed-off-by: Dave Martin <Dave.Martin@arm.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 arch/arm64/include/asm/fpsimd.h |   8 +++
 arch/arm64/kernel/fpsimd.c      | 137 +++++++++++++++++++++++++++++++++++++++-
 include/uapi/linux/prctl.h      |   5 ++
 3 files changed, 149 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h
index 9bbd74c0ea6b..86f550ce7b4d 100644
--- a/arch/arm64/include/asm/fpsimd.h
+++ b/arch/arm64/include/asm/fpsimd.h
@@ -20,6 +20,7 @@
 
 #ifndef __ASSEMBLY__
 
+#include <linux/cache.h>
 #include <linux/stddef.h>
 
 /*
@@ -70,17 +71,24 @@ extern void fpsimd_update_current_state(struct fpsimd_state *state);
 
 extern void fpsimd_flush_task_state(struct task_struct *target);
 
+/* Maximum VL that SVE VL-agnostic software can transparently support */
+#define SVE_VL_ARCH_MAX 0x100
+
 extern void sve_save_state(void *state, u32 *pfpsr);
 extern void sve_load_state(void const *state, u32 const *pfpsr,
 			   unsigned long vq_minus_1);
 extern unsigned int sve_get_vl(void);
 
+extern int __ro_after_init sve_max_vl;
+
 #ifdef CONFIG_ARM64_SVE
 
 extern size_t sve_state_size(struct task_struct const *task);
 
 extern void sve_alloc(struct task_struct *task);
 extern void fpsimd_release_task(struct task_struct *task);
+extern int sve_set_vector_length(struct task_struct *task,
+				 unsigned long vl, unsigned long flags);
 
 #else /* ! CONFIG_ARM64_SVE */
 
diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index e7733fb19388..667be3472114 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -17,8 +17,10 @@
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
+#include <linux/bitmap.h>
 #include <linux/bottom_half.h>
 #include <linux/bug.h>
+#include <linux/cache.h>
 #include <linux/compat.h>
 #include <linux/cpu.h>
 #include <linux/cpu_pm.h>
@@ -28,6 +30,7 @@
 #include <linux/init.h>
 #include <linux/percpu.h>
 #include <linux/preempt.h>
+#include <linux/prctl.h>
 #include <linux/ptrace.h>
 #include <linux/sched/signal.h>
 #include <linux/sched/task_stack.h>
@@ -114,6 +117,20 @@ static DEFINE_PER_CPU(struct fpsimd_state *, fpsimd_last_state);
 /* Default VL for tasks that don't set it explicitly: */
 static int sve_default_vl = SVE_VL_MIN;
 
+#ifdef CONFIG_ARM64_SVE
+
+/* Maximum supported vector length across all CPUs (initially poisoned) */
+int __ro_after_init sve_max_vl = -1;
+/* Set of available vector lengths, as vq_to_bit(vq): */
+static DECLARE_BITMAP(sve_vq_map, SVE_VQ_MAX);
+
+#else /* ! CONFIG_ARM64_SVE */
+
+/* Dummy declaration for code that will be optimised out: */
+extern DECLARE_BITMAP(sve_vq_map, SVE_VQ_MAX);
+
+#endif /* ! CONFIG_ARM64_SVE */
+
 /*
  * Call __sve_free() directly only if you know task can't be scheduled
  * or preempted.
@@ -271,6 +288,50 @@ static void task_fpsimd_save(void)
 	}
 }
 
+/*
+ * Helpers to translate bit indices in sve_vq_map to VQ values (and
+ * vice versa).  This allows find_next_bit() to be used to find the
+ * _maximum_ VQ not exceeding a certain value.
+ */
+
+static unsigned int vq_to_bit(unsigned int vq)
+{
+	return SVE_VQ_MAX - vq;
+}
+
+static unsigned int bit_to_vq(unsigned int bit)
+{
+	if (WARN_ON(bit >= SVE_VQ_MAX))
+		bit = SVE_VQ_MAX - 1;
+
+	return SVE_VQ_MAX - bit;
+}
+
+/*
+ * All vector length selection from userspace comes through here.
+ * We're on a slow path, so some sanity-checks are included.
+ * If things go wrong there's a bug somewhere, but try to fall back to a
+ * safe choice.
+ */
+static unsigned int find_supported_vector_length(unsigned int vl)
+{
+	int bit;
+	int max_vl = sve_max_vl;
+
+	if (WARN_ON(!sve_vl_valid(vl)))
+		vl = SVE_VL_MIN;
+
+	if (WARN_ON(!sve_vl_valid(max_vl)))
+		max_vl = SVE_VL_MIN;
+
+	if (vl > max_vl)
+		vl = max_vl;
+
+	bit = find_next_bit(sve_vq_map, SVE_VQ_MAX,
+			    vq_to_bit(sve_vq_from_vl(vl)));
+	return sve_vl_from_vq(bit_to_vq(bit));
+}
+
 #define ZREG(sve_state, vq, n) ((char *)(sve_state) +		\
 	(SVE_SIG_ZREG_OFFSET(vq, n) - SVE_SIG_REGS_OFFSET))
 
@@ -365,6 +426,76 @@ void sve_alloc(struct task_struct *task)
 	BUG_ON(!task->thread.sve_state);
 }
 
+int sve_set_vector_length(struct task_struct *task,
+			  unsigned long vl, unsigned long flags)
+{
+	if (flags & ~(unsigned long)(PR_SVE_VL_INHERIT |
+				     PR_SVE_SET_VL_ONEXEC))
+		return -EINVAL;
+
+	if (!sve_vl_valid(vl))
+		return -EINVAL;
+
+	/*
+	 * Clamp to the maximum vector length that VL-agnostic SVE code can
+	 * work with.  A flag may be assigned in the future to allow setting
+	 * of larger vector lengths without confusing older software.
+	 */
+	if (vl > SVE_VL_ARCH_MAX)
+		vl = SVE_VL_ARCH_MAX;
+
+	vl = find_supported_vector_length(vl);
+
+	if (flags & (PR_SVE_VL_INHERIT |
+		     PR_SVE_SET_VL_ONEXEC))
+		task->thread.sve_vl_onexec = vl;
+	else
+		/* Reset VL to system default on next exec: */
+		task->thread.sve_vl_onexec = 0;
+
+	/* Only actually set the VL if not deferred: */
+	if (flags & PR_SVE_SET_VL_ONEXEC)
+		goto out;
+
+	if (vl == task->thread.sve_vl)
+		goto out;
+
+	/*
+	 * To ensure the FPSIMD bits of the SVE vector registers are preserved,
+	 * write any live register state back to task_struct, and convert to a
+	 * non-SVE thread.
+	 */
+	if (task == current) {
+		local_bh_disable();
+
+		task_fpsimd_save();
+		set_thread_flag(TIF_FOREIGN_FPSTATE);
+	}
+
+	fpsimd_flush_task_state(task);
+	if (test_and_clear_tsk_thread_flag(task, TIF_SVE))
+		sve_to_fpsimd(task);
+
+	if (task == current)
+		local_bh_enable();
+
+	/*
+	 * Force reallocation of task SVE state to the correct size
+	 * on next use:
+	 */
+	sve_free(task);
+
+	task->thread.sve_vl = vl;
+
+out:
+	if (flags & PR_SVE_VL_INHERIT)
+		set_tsk_thread_flag(task, TIF_SVE_VL_INHERIT);
+	else
+		clear_tsk_thread_flag(task, TIF_SVE_VL_INHERIT);
+
+	return 0;
+}
+
 /*
  * Called from the put_task_struct() path, which cannot get here
  * unless dead_task is really dead and not schedulable.
@@ -481,7 +612,7 @@ void fpsimd_thread_switch(struct task_struct *next)
 
 void fpsimd_flush_thread(void)
 {
-	int vl;
+	int vl, supported_vl;
 
 	if (!system_supports_fpsimd())
 		return;
@@ -509,6 +640,10 @@ void fpsimd_flush_thread(void)
 		if (WARN_ON(!sve_vl_valid(vl)))
 			vl = SVE_VL_MIN;
 
+		supported_vl = find_supported_vector_length(vl);
+		if (WARN_ON(supported_vl != vl))
+			vl = supported_vl;
+
 		current->thread.sve_vl = vl;
 
 		/*
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index a8d0759a9e40..1b64901ca6b3 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -197,4 +197,9 @@ struct prctl_mm_map {
 # define PR_CAP_AMBIENT_LOWER		3
 # define PR_CAP_AMBIENT_CLEAR_ALL	4
 
+/* arm64 Scalable Vector Extension controls */
+# define PR_SVE_SET_VL_ONEXEC		(1 << 18) /* defer effect until exec */
+# define PR_SVE_VL_LEN_MASK		0xffff
+# define PR_SVE_VL_INHERIT		(1 << 17) /* inherit across exec */
+
 #endif /* _LINUX_PRCTL_H */
-- 
cgit v1.2.3


From 43d4da2c45b2f5d62f8a79ff7c6f95089bb24656 Mon Sep 17 00:00:00 2001
From: Dave Martin <Dave.Martin@arm.com>
Date: Tue, 31 Oct 2017 15:51:13 +0000
Subject: arm64/sve: ptrace and ELF coredump support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch defines and implements a new regset NT_ARM_SVE, which
describes a thread's SVE register state.  This allows a debugger to
manipulate the SVE state, as well as being included in ELF
coredumps for post-mortem debugging.

Because the regset size and layout are dependent on the thread's
current vector length, it is not possible to define a C struct to
describe the regset contents as is done for existing regsets.
Instead, and for the same reasons, NT_ARM_SVE is based on the
freeform variable-layout approach used for the SVE signal frame.

Additionally, to reduce debug overhead when debugging threads that
might or might not have live SVE register state, NT_ARM_SVE may be
presented in one of two different formats: the old struct
user_fpsimd_state format is embedded for describing the state of a
thread with no live SVE state, whereas a new variable-layout
structure is embedded for describing live SVE state.  This avoids a
debugger needing to poll NT_PRFPREG in addition to NT_ARM_SVE, and
allows existing userspace code to handle the non-SVE case without
too much modification.

For this to work, NT_ARM_SVE is defined with a fixed-format header
of type struct user_sve_header, which the recipient can use to
figure out the content, size and layout of the rest of the regset.
Accessor macros are defined to allow the vector-length-dependent
parts of the regset to be manipulated.

Signed-off-by: Alan Hayward <alan.hayward@arm.com>
Signed-off-by: Dave Martin <Dave.Martin@arm.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: Alex Bennée <alex.bennee@linaro.org>
Cc: Okamoto Takayuki <tokamoto@jp.fujitsu.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 arch/arm64/include/asm/fpsimd.h      |  12 +-
 arch/arm64/include/uapi/asm/ptrace.h | 138 +++++++++++++++++
 arch/arm64/kernel/fpsimd.c           |  60 ++++++++
 arch/arm64/kernel/ptrace.c           | 280 ++++++++++++++++++++++++++++++++++-
 include/uapi/linux/elf.h             |   1 +
 5 files changed, 482 insertions(+), 9 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h
index d8e0dc9f65a1..d754e5a6949c 100644
--- a/arch/arm64/include/asm/fpsimd.h
+++ b/arch/arm64/include/asm/fpsimd.h
@@ -38,13 +38,16 @@ struct fpsimd_state {
 			__uint128_t vregs[32];
 			u32 fpsr;
 			u32 fpcr;
+			/*
+			 * For ptrace compatibility, pad to next 128-bit
+			 * boundary here if extending this struct.
+			 */
 		};
 	};
 	/* the id of the last cpu to have restored this state */
 	unsigned int cpu;
 };
 
-
 #if defined(__KERNEL__) && defined(CONFIG_COMPAT)
 /* Masks for extracting the FPSR and FPCR from the FPSCR */
 #define VFP_FPSCR_STAT_MASK	0xf800009f
@@ -88,6 +91,10 @@ extern size_t sve_state_size(struct task_struct const *task);
 
 extern void sve_alloc(struct task_struct *task);
 extern void fpsimd_release_task(struct task_struct *task);
+extern void fpsimd_sync_to_sve(struct task_struct *task);
+extern void sve_sync_to_fpsimd(struct task_struct *task);
+extern void sve_sync_from_fpsimd_zeropad(struct task_struct *task);
+
 extern int sve_set_vector_length(struct task_struct *task,
 				 unsigned long vl, unsigned long flags);
 
@@ -104,6 +111,9 @@ extern void __init sve_setup(void);
 
 static inline void sve_alloc(struct task_struct *task) { }
 static inline void fpsimd_release_task(struct task_struct *task) { }
+static inline void sve_sync_to_fpsimd(struct task_struct *task) { }
+static inline void sve_sync_from_fpsimd_zeropad(struct task_struct *task) { }
+
 static inline void sve_init_vq_map(void) { }
 static inline void sve_update_vq_map(void) { }
 static inline int sve_verify_vq_map(void) { return 0; }
diff --git a/arch/arm64/include/uapi/asm/ptrace.h b/arch/arm64/include/uapi/asm/ptrace.h
index 3697d95ba0a1..e7085589f81c 100644
--- a/arch/arm64/include/uapi/asm/ptrace.h
+++ b/arch/arm64/include/uapi/asm/ptrace.h
@@ -22,6 +22,7 @@
 #include <linux/types.h>
 
 #include <asm/hwcap.h>
+#include <asm/sigcontext.h>
 
 
 /*
@@ -62,6 +63,8 @@
 
 #ifndef __ASSEMBLY__
 
+#include <linux/prctl.h>
+
 /*
  * User structures for general purpose, floating point and debug registers.
  */
@@ -89,6 +92,141 @@ struct user_hwdebug_state {
 	}		dbg_regs[16];
 };
 
+/* SVE/FP/SIMD state (NT_ARM_SVE) */
+
+struct user_sve_header {
+	__u32 size; /* total meaningful regset content in bytes */
+	__u32 max_size; /* maximum possible size for this thread */
+	__u16 vl; /* current vector length */
+	__u16 max_vl; /* maximum possible vector length */
+	__u16 flags;
+	__u16 __reserved;
+};
+
+/* Definitions for user_sve_header.flags: */
+#define SVE_PT_REGS_MASK		(1 << 0)
+
+#define SVE_PT_REGS_FPSIMD		0
+#define SVE_PT_REGS_SVE			SVE_PT_REGS_MASK
+
+/*
+ * Common SVE_PT_* flags:
+ * These must be kept in sync with prctl interface in <linux/prctl.h>
+ */
+#define SVE_PT_VL_INHERIT		(PR_SVE_VL_INHERIT >> 16)
+#define SVE_PT_VL_ONEXEC		(PR_SVE_SET_VL_ONEXEC >> 16)
+
+
+/*
+ * The remainder of the SVE state follows struct user_sve_header.  The
+ * total size of the SVE state (including header) depends on the
+ * metadata in the header:  SVE_PT_SIZE(vq, flags) gives the total size
+ * of the state in bytes, including the header.
+ *
+ * Refer to <asm/sigcontext.h> for details of how to pass the correct
+ * "vq" argument to these macros.
+ */
+
+/* Offset from the start of struct user_sve_header to the register data */
+#define SVE_PT_REGS_OFFSET					\
+	((sizeof(struct sve_context) + (SVE_VQ_BYTES - 1))	\
+		/ SVE_VQ_BYTES * SVE_VQ_BYTES)
+
+/*
+ * The register data content and layout depends on the value of the
+ * flags field.
+ */
+
+/*
+ * (flags & SVE_PT_REGS_MASK) == SVE_PT_REGS_FPSIMD case:
+ *
+ * The payload starts at offset SVE_PT_FPSIMD_OFFSET, and is of type
+ * struct user_fpsimd_state.  Additional data might be appended in the
+ * future: use SVE_PT_FPSIMD_SIZE(vq, flags) to compute the total size.
+ * SVE_PT_FPSIMD_SIZE(vq, flags) will never be less than
+ * sizeof(struct user_fpsimd_state).
+ */
+
+#define SVE_PT_FPSIMD_OFFSET		SVE_PT_REGS_OFFSET
+
+#define SVE_PT_FPSIMD_SIZE(vq, flags)	(sizeof(struct user_fpsimd_state))
+
+/*
+ * (flags & SVE_PT_REGS_MASK) == SVE_PT_REGS_SVE case:
+ *
+ * The payload starts at offset SVE_PT_SVE_OFFSET, and is of size
+ * SVE_PT_SVE_SIZE(vq, flags).
+ *
+ * Additional macros describe the contents and layout of the payload.
+ * For each, SVE_PT_SVE_x_OFFSET(args) is the start offset relative to
+ * the start of struct user_sve_header, and SVE_PT_SVE_x_SIZE(args) is
+ * the size in bytes:
+ *
+ *	x	type				description
+ *	-	----				-----------
+ *	ZREGS		\
+ *	ZREG		|
+ *	PREGS		| refer to <asm/sigcontext.h>
+ *	PREG		|
+ *	FFR		/
+ *
+ *	FPSR	uint32_t			FPSR
+ *	FPCR	uint32_t			FPCR
+ *
+ * Additional data might be appended in the future.
+ */
+
+#define SVE_PT_SVE_ZREG_SIZE(vq)	SVE_SIG_ZREG_SIZE(vq)
+#define SVE_PT_SVE_PREG_SIZE(vq)	SVE_SIG_PREG_SIZE(vq)
+#define SVE_PT_SVE_FFR_SIZE(vq)		SVE_SIG_FFR_SIZE(vq)
+#define SVE_PT_SVE_FPSR_SIZE		sizeof(__u32)
+#define SVE_PT_SVE_FPCR_SIZE		sizeof(__u32)
+
+#define __SVE_SIG_TO_PT(offset) \
+	((offset) - SVE_SIG_REGS_OFFSET + SVE_PT_REGS_OFFSET)
+
+#define SVE_PT_SVE_OFFSET		SVE_PT_REGS_OFFSET
+
+#define SVE_PT_SVE_ZREGS_OFFSET \
+	__SVE_SIG_TO_PT(SVE_SIG_ZREGS_OFFSET)
+#define SVE_PT_SVE_ZREG_OFFSET(vq, n) \
+	__SVE_SIG_TO_PT(SVE_SIG_ZREG_OFFSET(vq, n))
+#define SVE_PT_SVE_ZREGS_SIZE(vq) \
+	(SVE_PT_SVE_ZREG_OFFSET(vq, SVE_NUM_ZREGS) - SVE_PT_SVE_ZREGS_OFFSET)
+
+#define SVE_PT_SVE_PREGS_OFFSET(vq) \
+	__SVE_SIG_TO_PT(SVE_SIG_PREGS_OFFSET(vq))
+#define SVE_PT_SVE_PREG_OFFSET(vq, n) \
+	__SVE_SIG_TO_PT(SVE_SIG_PREG_OFFSET(vq, n))
+#define SVE_PT_SVE_PREGS_SIZE(vq) \
+	(SVE_PT_SVE_PREG_OFFSET(vq, SVE_NUM_PREGS) - \
+		SVE_PT_SVE_PREGS_OFFSET(vq))
+
+#define SVE_PT_SVE_FFR_OFFSET(vq) \
+	__SVE_SIG_TO_PT(SVE_SIG_FFR_OFFSET(vq))
+
+#define SVE_PT_SVE_FPSR_OFFSET(vq)				\
+	((SVE_PT_SVE_FFR_OFFSET(vq) + SVE_PT_SVE_FFR_SIZE(vq) +	\
+			(SVE_VQ_BYTES - 1))			\
+		/ SVE_VQ_BYTES * SVE_VQ_BYTES)
+#define SVE_PT_SVE_FPCR_OFFSET(vq) \
+	(SVE_PT_SVE_FPSR_OFFSET(vq) + SVE_PT_SVE_FPSR_SIZE)
+
+/*
+ * Any future extension appended after FPCR must be aligned to the next
+ * 128-bit boundary.
+ */
+
+#define SVE_PT_SVE_SIZE(vq, flags)					\
+	((SVE_PT_SVE_FPCR_OFFSET(vq) + SVE_PT_SVE_FPCR_SIZE		\
+			- SVE_PT_SVE_OFFSET + (SVE_VQ_BYTES - 1))	\
+		/ SVE_VQ_BYTES * SVE_VQ_BYTES)
+
+#define SVE_PT_SIZE(vq, flags)						\
+	 (((flags) & SVE_PT_REGS_MASK) == SVE_PT_REGS_SVE ?		\
+		  SVE_PT_SVE_OFFSET + SVE_PT_SVE_SIZE(vq, flags)	\
+		: SVE_PT_FPSIMD_OFFSET + SVE_PT_FPSIMD_SIZE(vq, flags))
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* _UAPI__ASM_PTRACE_H */
diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index 1e531156f1d7..b82d44693b9d 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -428,6 +428,66 @@ void sve_alloc(struct task_struct *task)
 	BUG_ON(!task->thread.sve_state);
 }
 
+
+/*
+ * Ensure that task->thread.sve_state is up to date with respect to
+ * the user task, irrespective of whether SVE is in use or not.
+ *
+ * This should only be called by ptrace.  task must be non-runnable.
+ * task->thread.sve_state must point to at least sve_state_size(task)
+ * bytes of allocated kernel memory.
+ */
+void fpsimd_sync_to_sve(struct task_struct *task)
+{
+	if (!test_tsk_thread_flag(task, TIF_SVE))
+		fpsimd_to_sve(task);
+}
+
+/*
+ * Ensure that task->thread.fpsimd_state is up to date with respect to
+ * the user task, irrespective of whether SVE is in use or not.
+ *
+ * This should only be called by ptrace.  task must be non-runnable.
+ * task->thread.sve_state must point to at least sve_state_size(task)
+ * bytes of allocated kernel memory.
+ */
+void sve_sync_to_fpsimd(struct task_struct *task)
+{
+	if (test_tsk_thread_flag(task, TIF_SVE))
+		sve_to_fpsimd(task);
+}
+
+/*
+ * Ensure that task->thread.sve_state is up to date with respect to
+ * the task->thread.fpsimd_state.
+ *
+ * This should only be called by ptrace to merge new FPSIMD register
+ * values into a task for which SVE is currently active.
+ * task must be non-runnable.
+ * task->thread.sve_state must point to at least sve_state_size(task)
+ * bytes of allocated kernel memory.
+ * task->thread.fpsimd_state must already have been initialised with
+ * the new FPSIMD register values to be merged in.
+ */
+void sve_sync_from_fpsimd_zeropad(struct task_struct *task)
+{
+	unsigned int vq;
+	void *sst = task->thread.sve_state;
+	struct fpsimd_state const *fst = &task->thread.fpsimd_state;
+	unsigned int i;
+
+	if (!test_tsk_thread_flag(task, TIF_SVE))
+		return;
+
+	vq = sve_vq_from_vl(task->thread.sve_vl);
+
+	memset(sst, 0, SVE_SIG_REGS_SIZE(vq));
+
+	for (i = 0; i < 32; ++i)
+		memcpy(ZREG(sst, vq, i), &fst->vregs[i],
+		       sizeof(fst->vregs[i]));
+}
+
 int sve_set_vector_length(struct task_struct *task,
 			  unsigned long vl, unsigned long flags)
 {
diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c
index 9cbb6123208f..7c44658b316d 100644
--- a/arch/arm64/kernel/ptrace.c
+++ b/arch/arm64/kernel/ptrace.c
@@ -32,6 +32,7 @@
 #include <linux/security.h>
 #include <linux/init.h>
 #include <linux/signal.h>
+#include <linux/string.h>
 #include <linux/uaccess.h>
 #include <linux/perf_event.h>
 #include <linux/hw_breakpoint.h>
@@ -40,6 +41,7 @@
 #include <linux/elf.h>
 
 #include <asm/compat.h>
+#include <asm/cpufeature.h>
 #include <asm/debug-monitors.h>
 #include <asm/pgtable.h>
 #include <asm/stacktrace.h>
@@ -618,17 +620,56 @@ static int gpr_set(struct task_struct *target, const struct user_regset *regset,
 /*
  * TODO: update fp accessors for lazy context switching (sync/flush hwstate)
  */
-static int fpr_get(struct task_struct *target, const struct user_regset *regset,
-		   unsigned int pos, unsigned int count,
-		   void *kbuf, void __user *ubuf)
+static int __fpr_get(struct task_struct *target,
+		     const struct user_regset *regset,
+		     unsigned int pos, unsigned int count,
+		     void *kbuf, void __user *ubuf, unsigned int start_pos)
 {
 	struct user_fpsimd_state *uregs;
+
+	sve_sync_to_fpsimd(target);
+
 	uregs = &target->thread.fpsimd_state.user_fpsimd;
 
+	return user_regset_copyout(&pos, &count, &kbuf, &ubuf, uregs,
+				   start_pos, start_pos + sizeof(*uregs));
+}
+
+static int fpr_get(struct task_struct *target, const struct user_regset *regset,
+		   unsigned int pos, unsigned int count,
+		   void *kbuf, void __user *ubuf)
+{
 	if (target == current)
 		fpsimd_preserve_current_state();
 
-	return user_regset_copyout(&pos, &count, &kbuf, &ubuf, uregs, 0, -1);
+	return __fpr_get(target, regset, pos, count, kbuf, ubuf, 0);
+}
+
+static int __fpr_set(struct task_struct *target,
+		     const struct user_regset *regset,
+		     unsigned int pos, unsigned int count,
+		     const void *kbuf, const void __user *ubuf,
+		     unsigned int start_pos)
+{
+	int ret;
+	struct user_fpsimd_state newstate;
+
+	/*
+	 * Ensure target->thread.fpsimd_state is up to date, so that a
+	 * short copyin can't resurrect stale data.
+	 */
+	sve_sync_to_fpsimd(target);
+
+	newstate = target->thread.fpsimd_state.user_fpsimd;
+
+	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &newstate,
+				 start_pos, start_pos + sizeof(newstate));
+	if (ret)
+		return ret;
+
+	target->thread.fpsimd_state.user_fpsimd = newstate;
+
+	return ret;
 }
 
 static int fpr_set(struct task_struct *target, const struct user_regset *regset,
@@ -636,15 +677,14 @@ static int fpr_set(struct task_struct *target, const struct user_regset *regset,
 		   const void *kbuf, const void __user *ubuf)
 {
 	int ret;
-	struct user_fpsimd_state newstate =
-		target->thread.fpsimd_state.user_fpsimd;
 
-	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &newstate, 0, -1);
+	ret = __fpr_set(target, regset, pos, count, kbuf, ubuf, 0);
 	if (ret)
 		return ret;
 
-	target->thread.fpsimd_state.user_fpsimd = newstate;
+	sve_sync_from_fpsimd_zeropad(target);
 	fpsimd_flush_task_state(target);
+
 	return ret;
 }
 
@@ -702,6 +742,215 @@ static int system_call_set(struct task_struct *target,
 	return ret;
 }
 
+#ifdef CONFIG_ARM64_SVE
+
+static void sve_init_header_from_task(struct user_sve_header *header,
+				      struct task_struct *target)
+{
+	unsigned int vq;
+
+	memset(header, 0, sizeof(*header));
+
+	header->flags = test_tsk_thread_flag(target, TIF_SVE) ?
+		SVE_PT_REGS_SVE : SVE_PT_REGS_FPSIMD;
+	if (test_tsk_thread_flag(target, TIF_SVE_VL_INHERIT))
+		header->flags |= SVE_PT_VL_INHERIT;
+
+	header->vl = target->thread.sve_vl;
+	vq = sve_vq_from_vl(header->vl);
+
+	header->max_vl = sve_max_vl;
+	if (WARN_ON(!sve_vl_valid(sve_max_vl)))
+		header->max_vl = header->vl;
+
+	header->size = SVE_PT_SIZE(vq, header->flags);
+	header->max_size = SVE_PT_SIZE(sve_vq_from_vl(header->max_vl),
+				      SVE_PT_REGS_SVE);
+}
+
+static unsigned int sve_size_from_header(struct user_sve_header const *header)
+{
+	return ALIGN(header->size, SVE_VQ_BYTES);
+}
+
+static unsigned int sve_get_size(struct task_struct *target,
+				 const struct user_regset *regset)
+{
+	struct user_sve_header header;
+
+	if (!system_supports_sve())
+		return 0;
+
+	sve_init_header_from_task(&header, target);
+	return sve_size_from_header(&header);
+}
+
+static int sve_get(struct task_struct *target,
+		   const struct user_regset *regset,
+		   unsigned int pos, unsigned int count,
+		   void *kbuf, void __user *ubuf)
+{
+	int ret;
+	struct user_sve_header header;
+	unsigned int vq;
+	unsigned long start, end;
+
+	if (!system_supports_sve())
+		return -EINVAL;
+
+	/* Header */
+	sve_init_header_from_task(&header, target);
+	vq = sve_vq_from_vl(header.vl);
+
+	ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, &header,
+				  0, sizeof(header));
+	if (ret)
+		return ret;
+
+	if (target == current)
+		fpsimd_preserve_current_state();
+
+	/* Registers: FPSIMD-only case */
+
+	BUILD_BUG_ON(SVE_PT_FPSIMD_OFFSET != sizeof(header));
+	if ((header.flags & SVE_PT_REGS_MASK) == SVE_PT_REGS_FPSIMD)
+		return __fpr_get(target, regset, pos, count, kbuf, ubuf,
+				 SVE_PT_FPSIMD_OFFSET);
+
+	/* Otherwise: full SVE case */
+
+	BUILD_BUG_ON(SVE_PT_SVE_OFFSET != sizeof(header));
+	start = SVE_PT_SVE_OFFSET;
+	end = SVE_PT_SVE_FFR_OFFSET(vq) + SVE_PT_SVE_FFR_SIZE(vq);
+	ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+				  target->thread.sve_state,
+				  start, end);
+	if (ret)
+		return ret;
+
+	start = end;
+	end = SVE_PT_SVE_FPSR_OFFSET(vq);
+	ret = user_regset_copyout_zero(&pos, &count, &kbuf, &ubuf,
+				       start, end);
+	if (ret)
+		return ret;
+
+	/*
+	 * Copy fpsr, and fpcr which must follow contiguously in
+	 * struct fpsimd_state:
+	 */
+	start = end;
+	end = SVE_PT_SVE_FPCR_OFFSET(vq) + SVE_PT_SVE_FPCR_SIZE;
+	ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+				  &target->thread.fpsimd_state.fpsr,
+				  start, end);
+	if (ret)
+		return ret;
+
+	start = end;
+	end = sve_size_from_header(&header);
+	return user_regset_copyout_zero(&pos, &count, &kbuf, &ubuf,
+					start, end);
+}
+
+static int sve_set(struct task_struct *target,
+		   const struct user_regset *regset,
+		   unsigned int pos, unsigned int count,
+		   const void *kbuf, const void __user *ubuf)
+{
+	int ret;
+	struct user_sve_header header;
+	unsigned int vq;
+	unsigned long start, end;
+
+	if (!system_supports_sve())
+		return -EINVAL;
+
+	/* Header */
+	if (count < sizeof(header))
+		return -EINVAL;
+	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &header,
+				 0, sizeof(header));
+	if (ret)
+		goto out;
+
+	/*
+	 * Apart from SVE_PT_REGS_MASK, all SVE_PT_* flags are consumed by
+	 * sve_set_vector_length(), which will also validate them for us:
+	 */
+	ret = sve_set_vector_length(target, header.vl,
+		((unsigned long)header.flags & ~SVE_PT_REGS_MASK) << 16);
+	if (ret)
+		goto out;
+
+	/* Actual VL set may be less than the user asked for: */
+	vq = sve_vq_from_vl(target->thread.sve_vl);
+
+	/* Registers: FPSIMD-only case */
+
+	BUILD_BUG_ON(SVE_PT_FPSIMD_OFFSET != sizeof(header));
+	if ((header.flags & SVE_PT_REGS_MASK) == SVE_PT_REGS_FPSIMD) {
+		ret = __fpr_set(target, regset, pos, count, kbuf, ubuf,
+				SVE_PT_FPSIMD_OFFSET);
+		clear_tsk_thread_flag(target, TIF_SVE);
+		goto out;
+	}
+
+	/* Otherwise: full SVE case */
+
+	/*
+	 * If setting a different VL from the requested VL and there is
+	 * register data, the data layout will be wrong: don't even
+	 * try to set the registers in this case.
+	 */
+	if (count && vq != sve_vq_from_vl(header.vl)) {
+		ret = -EIO;
+		goto out;
+	}
+
+	sve_alloc(target);
+
+	/*
+	 * Ensure target->thread.sve_state is up to date with target's
+	 * FPSIMD regs, so that a short copyin leaves trailing registers
+	 * unmodified.
+	 */
+	fpsimd_sync_to_sve(target);
+	set_tsk_thread_flag(target, TIF_SVE);
+
+	BUILD_BUG_ON(SVE_PT_SVE_OFFSET != sizeof(header));
+	start = SVE_PT_SVE_OFFSET;
+	end = SVE_PT_SVE_FFR_OFFSET(vq) + SVE_PT_SVE_FFR_SIZE(vq);
+	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+				 target->thread.sve_state,
+				 start, end);
+	if (ret)
+		goto out;
+
+	start = end;
+	end = SVE_PT_SVE_FPSR_OFFSET(vq);
+	ret = user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
+					start, end);
+	if (ret)
+		goto out;
+
+	/*
+	 * Copy fpsr, and fpcr which must follow contiguously in
+	 * struct fpsimd_state:
+	 */
+	start = end;
+	end = SVE_PT_SVE_FPCR_OFFSET(vq) + SVE_PT_SVE_FPCR_SIZE;
+	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+				 &target->thread.fpsimd_state.fpsr,
+				 start, end);
+
+out:
+	fpsimd_flush_task_state(target);
+	return ret;
+}
+
+#endif /* CONFIG_ARM64_SVE */
+
 enum aarch64_regset {
 	REGSET_GPR,
 	REGSET_FPR,
@@ -711,6 +960,9 @@ enum aarch64_regset {
 	REGSET_HW_WATCH,
 #endif
 	REGSET_SYSTEM_CALL,
+#ifdef CONFIG_ARM64_SVE
+	REGSET_SVE,
+#endif
 };
 
 static const struct user_regset aarch64_regsets[] = {
@@ -768,6 +1020,18 @@ static const struct user_regset aarch64_regsets[] = {
 		.get = system_call_get,
 		.set = system_call_set,
 	},
+#ifdef CONFIG_ARM64_SVE
+	[REGSET_SVE] = { /* Scalable Vector Extension */
+		.core_note_type = NT_ARM_SVE,
+		.n = DIV_ROUND_UP(SVE_PT_SIZE(SVE_VQ_MAX, SVE_PT_REGS_SVE),
+				  SVE_VQ_BYTES),
+		.size = SVE_VQ_BYTES,
+		.align = SVE_VQ_BYTES,
+		.get = sve_get,
+		.set = sve_set,
+		.get_size = sve_get_size,
+	},
+#endif
 };
 
 static const struct user_regset_view user_aarch64_view = {
diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h
index b5280db9ef6a..735b8f4d12fc 100644
--- a/include/uapi/linux/elf.h
+++ b/include/uapi/linux/elf.h
@@ -416,6 +416,7 @@ typedef struct elf64_shdr {
 #define NT_ARM_HW_BREAK	0x402		/* ARM hardware breakpoint registers */
 #define NT_ARM_HW_WATCH	0x403		/* ARM hardware watchpoint registers */
 #define NT_ARM_SYSTEM_CALL	0x404	/* ARM system call number */
+#define NT_ARM_SVE	0x405		/* ARM Scalable Vector Extension registers */
 #define NT_METAG_CBUF	0x500		/* Metag catch buffer registers */
 #define NT_METAG_RPIPE	0x501		/* Metag read pipeline state */
 #define NT_METAG_TLS	0x502		/* Metag TLS pointer */
-- 
cgit v1.2.3


From 2d2123bc7c7f843aa9db87720de159a049839862 Mon Sep 17 00:00:00 2001
From: Dave Martin <Dave.Martin@arm.com>
Date: Tue, 31 Oct 2017 15:51:14 +0000
Subject: arm64/sve: Add prctl controls for userspace vector length management
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch adds two arm64-specific prctls, to permit userspace to
control its vector length:

 * PR_SVE_SET_VL: set the thread's SVE vector length and vector
   length inheritance mode.

 * PR_SVE_GET_VL: get the same information.

Although these prctls resemble instruction set features in the SVE
architecture, they provide additional control: the vector length
inheritance mode is Linux-specific and nothing to do with the
architecture, and the architecture does not permit EL0 to set its
own vector length directly.  Both can be used in portable tools
without requiring the use of SVE instructions.

Signed-off-by: Dave Martin <Dave.Martin@arm.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: Alex Bennée <alex.bennee@linaro.org>
[will: Fixed up prctl constants to avoid clash with PDEATHSIG]
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 arch/arm64/include/asm/fpsimd.h    | 14 +++++++++++
 arch/arm64/include/asm/processor.h |  4 +++
 arch/arm64/kernel/fpsimd.c         | 50 ++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/prctl.h         |  4 +++
 kernel/sys.c                       | 12 +++++++++
 5 files changed, 84 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h
index d754e5a6949c..b868412c815c 100644
--- a/arch/arm64/include/asm/fpsimd.h
+++ b/arch/arm64/include/asm/fpsimd.h
@@ -17,6 +17,7 @@
 #define __ASM_FP_H
 
 #include <asm/ptrace.h>
+#include <asm/errno.h>
 
 #ifndef __ASSEMBLY__
 
@@ -98,6 +99,9 @@ extern void sve_sync_from_fpsimd_zeropad(struct task_struct *task);
 extern int sve_set_vector_length(struct task_struct *task,
 				 unsigned long vl, unsigned long flags);
 
+extern int sve_set_current_vl(unsigned long arg);
+extern int sve_get_current_vl(void);
+
 /*
  * Probing and setup functions.
  * Calls to these functions must be serialised with one another.
@@ -114,6 +118,16 @@ static inline void fpsimd_release_task(struct task_struct *task) { }
 static inline void sve_sync_to_fpsimd(struct task_struct *task) { }
 static inline void sve_sync_from_fpsimd_zeropad(struct task_struct *task) { }
 
+static inline int sve_set_current_vl(unsigned long arg)
+{
+	return -EINVAL;
+}
+
+static inline int sve_get_current_vl(void)
+{
+	return -EINVAL;
+}
+
 static inline void sve_init_vq_map(void) { }
 static inline void sve_update_vq_map(void) { }
 static inline int sve_verify_vq_map(void) { return 0; }
diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h
index c6fddb005dc2..023cacb946c3 100644
--- a/arch/arm64/include/asm/processor.h
+++ b/arch/arm64/include/asm/processor.h
@@ -217,5 +217,9 @@ static inline void spin_lock_prefetch(const void *ptr)
 int cpu_enable_pan(void *__unused);
 int cpu_enable_cache_maint_trap(void *__unused);
 
+/* Userspace interface for PR_SVE_{SET,GET}_VL prctl()s: */
+#define SVE_SET_VL(arg)	sve_set_current_vl(arg)
+#define SVE_GET_VL()	sve_get_current_vl()
+
 #endif /* __ASSEMBLY__ */
 #endif /* __ASM_PROCESSOR_H */
diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index b82d44693b9d..fd3cfdd7f9be 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -29,6 +29,7 @@
 #include <linux/irqflags.h>
 #include <linux/init.h>
 #include <linux/percpu.h>
+#include <linux/prctl.h>
 #include <linux/preempt.h>
 #include <linux/prctl.h>
 #include <linux/ptrace.h>
@@ -558,6 +559,55 @@ out:
 	return 0;
 }
 
+/*
+ * Encode the current vector length and flags for return.
+ * This is only required for prctl(): ptrace has separate fields
+ *
+ * flags are as for sve_set_vector_length().
+ */
+static int sve_prctl_status(unsigned long flags)
+{
+	int ret;
+
+	if (flags & PR_SVE_SET_VL_ONEXEC)
+		ret = current->thread.sve_vl_onexec;
+	else
+		ret = current->thread.sve_vl;
+
+	if (test_thread_flag(TIF_SVE_VL_INHERIT))
+		ret |= PR_SVE_VL_INHERIT;
+
+	return ret;
+}
+
+/* PR_SVE_SET_VL */
+int sve_set_current_vl(unsigned long arg)
+{
+	unsigned long vl, flags;
+	int ret;
+
+	vl = arg & PR_SVE_VL_LEN_MASK;
+	flags = arg & ~vl;
+
+	if (!system_supports_sve())
+		return -EINVAL;
+
+	ret = sve_set_vector_length(current, vl, flags);
+	if (ret)
+		return ret;
+
+	return sve_prctl_status(flags);
+}
+
+/* PR_SVE_GET_VL */
+int sve_get_current_vl(void)
+{
+	if (!system_supports_sve())
+		return -EINVAL;
+
+	return sve_prctl_status(0);
+}
+
 /*
  * Bitmap for temporary storage of the per-CPU set of supported vector lengths
  * during secondary boot.
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 1b64901ca6b3..f60db5db6e8e 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -198,7 +198,11 @@ struct prctl_mm_map {
 # define PR_CAP_AMBIENT_CLEAR_ALL	4
 
 /* arm64 Scalable Vector Extension controls */
+/* Flag values must be kept in sync with ptrace NT_ARM_SVE interface */
+#define PR_SVE_SET_VL			50	/* set task vector length */
 # define PR_SVE_SET_VL_ONEXEC		(1 << 18) /* defer effect until exec */
+#define PR_SVE_GET_VL			51	/* get task vector length */
+/* Bits common to PR_SVE_SET_VL and PR_SVE_GET_VL */
 # define PR_SVE_VL_LEN_MASK		0xffff
 # define PR_SVE_VL_INHERIT		(1 << 17) /* inherit across exec */
 
diff --git a/kernel/sys.c b/kernel/sys.c
index 9aebc2935013..c541916b38c6 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -110,6 +110,12 @@
 #ifndef SET_FP_MODE
 # define SET_FP_MODE(a,b)	(-EINVAL)
 #endif
+#ifndef SVE_SET_VL
+# define SVE_SET_VL(a)		(-EINVAL)
+#endif
+#ifndef SVE_GET_VL
+# define SVE_GET_VL()		(-EINVAL)
+#endif
 
 /*
  * this is where the system-wide overflow UID and GID are defined, for
@@ -2385,6 +2391,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 	case PR_GET_FP_MODE:
 		error = GET_FP_MODE(me);
 		break;
+	case PR_SVE_SET_VL:
+		error = SVE_SET_VL(arg2);
+		break;
+	case PR_SVE_GET_VL:
+		error = SVE_GET_VL();
+		break;
 	default:
 		error = -EINVAL;
 		break;
-- 
cgit v1.2.3


From 44b6b7661132b1b0e5fd3147ded66f1e4a817ca9 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@dell.com>
Date: Wed, 1 Nov 2017 14:25:35 -0500
Subject: platform/x86: wmi: create userspace interface for drivers

For WMI operations that are only Set or Query, readable and writable sysfs
attributes created by WMI vendor drivers or the bus driver make sense.

For other WMI operations that are run on Method, there needs to be a
way to guarantee to userspace that the results from the method call
belong to the data request to the method call.  Sysfs attributes don't
work well in this scenario because two userspace processes may be
competing at reading/writing an attribute and step on each other's
data.

When a WMI vendor driver declares a callback method in the wmi_driver
the WMI bus driver will create a character device that maps to that
function.  This callback method will be responsible for filtering
invalid requests and performing the actual call.

That character device will correspond to this path:
/dev/wmi/$driver

Performing read() on this character device will provide the size
of the buffer that the character device needs to perform calls.
This buffer size can be set by vendor drivers through a new symbol
or when MOF parsing is available by the MOF.

Performing ioctl() on this character device will be interpreted
by the WMI bus driver. It will perform sanity tests for size of
data, test them for a valid instance, copy the data from userspace
and pass it on to the vendor driver to further process and run.

This creates an implicit policy that each driver will only be allowed
a single character device.  If a module matches multiple GUID's,
the wmi_devices will need to be all handled by the same wmi_driver.

The WMI vendor drivers will be responsible for managing inappropriate
access to this character device and proper locking on data used by
it.

When a WMI vendor driver is unloaded the WMI bus driver will clean
up the character device and any memory allocated for the call.

Signed-off-by: Mario Limonciello <mario.limonciello@dell.com>
Reviewed-by: Edward O'Callaghan <quasisec@google.com>
Signed-off-by: Darren Hart (VMware) <dvhart@infradead.org>
---
 MAINTAINERS                |   1 +
 drivers/platform/x86/wmi.c | 189 ++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/wmi.h        |   5 ++
 include/uapi/linux/wmi.h   |  26 +++++++
 4 files changed, 219 insertions(+), 2 deletions(-)
 create mode 100644 include/uapi/linux/wmi.h

(limited to 'include/uapi/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index aede236d10f1..3af07502220a 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -384,6 +384,7 @@ ACPI WMI DRIVER
 L:	platform-driver-x86@vger.kernel.org
 S:	Orphan
 F:	drivers/platform/x86/wmi.c
+F:	include/uapi/linux/wmi.h
 
 AD1889 ALSA SOUND DRIVER
 M:	Thibaut Varene <T-Bone@parisc-linux.org>
diff --git a/drivers/platform/x86/wmi.c b/drivers/platform/x86/wmi.c
index bcb41c1c7f52..8c31ed4f0e1b 100644
--- a/drivers/platform/x86/wmi.c
+++ b/drivers/platform/x86/wmi.c
@@ -38,12 +38,15 @@
 #include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/list.h>
+#include <linux/miscdevice.h>
 #include <linux/module.h>
 #include <linux/platform_device.h>
 #include <linux/slab.h>
 #include <linux/types.h>
+#include <linux/uaccess.h>
 #include <linux/uuid.h>
 #include <linux/wmi.h>
+#include <uapi/linux/wmi.h>
 
 ACPI_MODULE_NAME("wmi");
 MODULE_AUTHOR("Carlos Corbacho");
@@ -69,9 +72,12 @@ struct wmi_block {
 	struct wmi_device dev;
 	struct list_head list;
 	struct guid_block gblock;
+	struct miscdevice char_dev;
+	struct mutex char_mutex;
 	struct acpi_device *acpi_device;
 	wmi_notify_handler handler;
 	void *handler_data;
+	u64 req_buf_size;
 
 	bool read_takes_no_args;
 };
@@ -188,6 +194,25 @@ static acpi_status wmi_method_enable(struct wmi_block *wblock, int enable)
 /*
  * Exported WMI functions
  */
+
+/**
+ * set_required_buffer_size - Sets the buffer size needed for performing IOCTL
+ * @wdev: A wmi bus device from a driver
+ * @length: Required buffer size in bytes
+ *
+ * Records the buffer size in the WMI block; no memory is allocated here
+ */
+int set_required_buffer_size(struct wmi_device *wdev, u64 length)
+{
+	struct wmi_block *wblock;
+
+	wblock = container_of(wdev, struct wmi_block, dev);
+	wblock->req_buf_size = length;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(set_required_buffer_size);
+
 /**
  * wmi_evaluate_method - Evaluate a WMI method
  * @guid_string: 36 char string of the form fa50ff2b-f2e8-45de-83fa-65417f2f49ba
@@ -764,6 +789,133 @@ static int wmi_dev_match(struct device *dev, struct device_driver *driver)
 
 	return 0;
 }
+/*
+ * Open handler for /dev/wmi/$driver: find the WMI block whose bound driver
+ * name matches the device node name and stash it in filp->private_data.
+ * Returns -ENODEV when no bound block matches.
+ */
+static int wmi_char_open(struct inode *inode, struct file *filp)
+{
+	const char *driver_name = filp->f_path.dentry->d_iname;
+	struct wmi_block *wblock = NULL;
+	struct wmi_block *next = NULL;
+	struct wmi_block *match = NULL;
+
+	list_for_each_entry_safe(wblock, next, &wmi_block_list, list) {
+		if (!wblock->dev.dev.driver)
+			continue;
+		if (strcmp(driver_name, wblock->dev.dev.driver->name) == 0) {
+			match = wblock;
+			break;
+		}
+	}
+
+	/*
+	 * Track the match in a local: filp->private_data may already be
+	 * non-NULL on entry (the misc core stores the miscdevice there),
+	 * so it cannot be used to tell whether the lookup succeeded.
+	 */
+	if (!match)
+		return -ENODEV;
+	filp->private_data = match;
+
+	return nonseekable_open(inode, filp);
+}
+
+/* read() returns the raw u64 required buffer size for ioctl() calls. */
+static ssize_t wmi_char_read(struct file *filp, char __user *buffer,
+	size_t length, loff_t *offset)
+{
+	struct wmi_block *wblock = filp->private_data;
+
+	return simple_read_from_buffer(buffer, length, offset,
+				       &wblock->req_buf_size,
+				       sizeof(wblock->req_buf_size));
+}
+
+/*
+ * ioctl handler: validate the request, copy req_buf_size bytes of the
+ * caller's buffer into the per-block bounce buffer, hand it to the
+ * driver's filter_callback and copy the result back to userspace.
+ * All buffer handling is serialised by wblock->char_mutex.
+ */
+static long wmi_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+	struct wmi_ioctl_buffer __user *input =
+		(struct wmi_ioctl_buffer __user *) arg;
+	struct wmi_block *wblock = filp->private_data;
+	struct wmi_ioctl_buffer *buf = NULL;
+	struct wmi_driver *wdriver = NULL;
+	int ret;
+
+	if (_IOC_TYPE(cmd) != WMI_IOC)
+		return -ENOTTY;
+
+	/* make sure we're not calling a higher instance than exists */
+	if (_IOC_NR(cmd) >= wblock->gblock.instance_count)
+		return -EINVAL;
+
+	mutex_lock(&wblock->char_mutex);
+	buf = wblock->handler_data;
+	if (get_user(buf->length, &input->length)) {
+		dev_dbg(&wblock->dev.dev, "Read length from user failed\n");
+		ret = -EFAULT;
+		goto out_ioctl;
+	}
+	/* if it's too small, abort */
+	if (buf->length < wblock->req_buf_size) {
+		dev_err(&wblock->dev.dev,
+			"Buffer %lld too small, need at least %lld\n",
+			buf->length, wblock->req_buf_size);
+		ret = -EINVAL;
+		goto out_ioctl;
+	}
+	/* if it's too big, warn, driver will only use what is needed */
+	if (buf->length > wblock->req_buf_size)
+		dev_warn(&wblock->dev.dev,
+			"Buffer %lld is bigger than required %lld\n",
+			buf->length, wblock->req_buf_size);
+
+	/* copy the structure from userspace */
+	if (copy_from_user(buf, input, wblock->req_buf_size)) {
+		dev_dbg(&wblock->dev.dev, "Copy %llu from user failed\n",
+			wblock->req_buf_size);
+		ret = -EFAULT;
+		goto out_ioctl;
+	}
+
+	/* let the driver do any filtering and do the call */
+	wdriver = container_of(wblock->dev.dev.driver,
+			       struct wmi_driver, driver);
+	if (!try_module_get(wdriver->driver.owner)) {
+		/* must leave via out_ioctl so char_mutex is released */
+		ret = -EBUSY;
+		goto out_ioctl;
+	}
+	ret = wdriver->filter_callback(&wblock->dev, cmd, buf);
+	module_put(wdriver->driver.owner);
+	if (ret)
+		goto out_ioctl;
+
+	/* return the result (only up to our internal buffer size) */
+	if (copy_to_user(input, buf, wblock->req_buf_size)) {
+		dev_dbg(&wblock->dev.dev, "Copy %llu to user failed\n",
+			wblock->req_buf_size);
+		ret = -EFAULT;
+	}
+
+out_ioctl:
+	mutex_unlock(&wblock->char_mutex);
+	return ret;
+}
+
+static const struct file_operations wmi_fops = {
+	.owner		= THIS_MODULE,
+	.read		= wmi_char_read,
+	.open		= wmi_char_open,
+	.unlocked_ioctl	= wmi_ioctl,
+	.compat_ioctl	= wmi_ioctl,
+};
 
 static int wmi_dev_probe(struct device *dev)
 {
@@ -771,16 +901,63 @@ static int wmi_dev_probe(struct device *dev)
 	struct wmi_driver *wdriver =
 		container_of(dev->driver, struct wmi_driver, driver);
 	int ret = 0;
+	int count;
+	char *buf;
 
 	if (ACPI_FAILURE(wmi_method_enable(wblock, 1)))
 		dev_warn(dev, "failed to enable device -- probing anyway\n");
 
 	if (wdriver->probe) {
 		ret = wdriver->probe(dev_to_wdev(dev));
-		if (ret != 0 && ACPI_FAILURE(wmi_method_enable(wblock, 0)))
-			dev_warn(dev, "failed to disable device\n");
+		if (ret != 0)
+			goto probe_failure;
 	}
 
+	/* driver wants a character device made */
+	if (wdriver->filter_callback) {
+		/* check that required buffer size declared by driver or MOF */
+		if (!wblock->req_buf_size) {
+			dev_err(&wblock->dev.dev,
+				"Required buffer size not set\n");
+			ret = -EINVAL;
+			goto probe_failure;
+		}
+
+		count = get_order(wblock->req_buf_size);
+		wblock->handler_data = (void *)__get_free_pages(GFP_KERNEL,
+								count);
+		if (!wblock->handler_data) {
+			ret = -ENOMEM;
+			goto probe_failure;
+		}
+
+		buf = kmalloc(strlen(wdriver->driver.name) + 4, GFP_KERNEL);
+		if (!buf) {
+			ret = -ENOMEM;
+			goto probe_string_failure;
+		}
+		sprintf(buf, "wmi/%s", wdriver->driver.name);
+		wblock->char_dev.minor = MISC_DYNAMIC_MINOR;
+		wblock->char_dev.name = buf;
+		wblock->char_dev.fops = &wmi_fops;
+		wblock->char_dev.mode = 0444;
+		ret = misc_register(&wblock->char_dev);
+		if (ret) {
+			dev_warn(dev, "failed to register char dev: %d", ret);
+			ret = -ENOMEM;
+			goto probe_misc_failure;
+		}
+	}
+
+	return 0;
+
+probe_misc_failure:
+	kfree(buf);
+probe_string_failure:
+	kfree(wblock->handler_data);
+probe_failure:
+	if (ACPI_FAILURE(wmi_method_enable(wblock, 0)))
+		dev_warn(dev, "failed to disable device\n");
 	return ret;
 }
 
@@ -791,6 +968,13 @@ static int wmi_dev_remove(struct device *dev)
 		container_of(dev->driver, struct wmi_driver, driver);
 	int ret = 0;
 
+	if (wdriver->filter_callback) {
+		misc_deregister(&wblock->char_dev);
+		kfree(wblock->char_dev.name);
+		free_pages((unsigned long)wblock->handler_data,
+			   get_order(wblock->req_buf_size));
+	}
+
 	if (wdriver->remove)
 		ret = wdriver->remove(dev_to_wdev(dev));
 
@@ -847,6 +1031,7 @@ static int wmi_create_device(struct device *wmi_bus_dev,
 
 	if (gblock->flags & ACPI_WMI_METHOD) {
 		wblock->dev.dev.type = &wmi_type_method;
+		mutex_init(&wblock->char_mutex);
 		goto out_init;
 	}
 
diff --git a/include/linux/wmi.h b/include/linux/wmi.h
index ddee427e0721..4757cb5077e5 100644
--- a/include/linux/wmi.h
+++ b/include/linux/wmi.h
@@ -18,6 +18,7 @@
 
 #include <linux/device.h>
 #include <linux/acpi.h>
+#include <uapi/linux/wmi.h>
 
 struct wmi_device {
 	struct device dev;
@@ -36,6 +37,8 @@ extern acpi_status wmidev_evaluate_method(struct wmi_device *wdev,
 extern union acpi_object *wmidev_block_query(struct wmi_device *wdev,
 					     u8 instance);
 
+extern int set_required_buffer_size(struct wmi_device *wdev, u64 length);
+
 struct wmi_device_id {
 	const char *guid_string;
 };
@@ -47,6 +50,8 @@ struct wmi_driver {
 	int (*probe)(struct wmi_device *wdev);
 	int (*remove)(struct wmi_device *wdev);
 	void (*notify)(struct wmi_device *device, union acpi_object *data);
+	long (*filter_callback)(struct wmi_device *wdev, unsigned int cmd,
+				struct wmi_ioctl_buffer *arg);
 };
 
 extern int __must_check __wmi_driver_register(struct wmi_driver *driver,
diff --git a/include/uapi/linux/wmi.h b/include/uapi/linux/wmi.h
new file mode 100644
index 000000000000..7e52350ac9b3
--- /dev/null
+++ b/include/uapi/linux/wmi.h
@@ -0,0 +1,26 @@
+/*
+ *  User API methods for ACPI-WMI mapping driver
+ *
+ *  Copyright (C) 2017 Dell, Inc.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2 as
+ *  published by the Free Software Foundation.
+ */
+#ifndef _UAPI_LINUX_WMI_H
+#define _UAPI_LINUX_WMI_H
+
+#include <linux/types.h>
+
+/* WMI bus will filter all WMI vendor driver requests through this IOC */
+#define WMI_IOC 'W'
+
+/* All ioctl requests through WMI should declare their size followed by
+ * relevant data objects
+ */
+struct wmi_ioctl_buffer {
+	__u64	length;
+	__u8	data[];
+};
+
+#endif
-- 
cgit v1.2.3


From f2645fa317b8905b8934f06a0601d5b7fa66aba0 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@dell.com>
Date: Wed, 1 Nov 2017 14:25:36 -0500
Subject: platform/x86: dell-smbios-wmi: introduce userspace interface

It's important for the driver to provide a R/W ioctl to ensure that
two competing userspace processes don't race to provide or read each
other's data.

This userspace character device will be used to perform SMBIOS calls
from any applications.

It provides an ioctl that will allow passing the WMI calling
interface buffer between userspace and kernel space.

This character device is intended to deprecate the dcdbas kernel module
and the interface that it provides to userspace.

To perform an SMBIOS IOCTL call using the character device userspace will
perform a read() on the character device.  The WMI bus will provide
a u64 variable containing the necessary size of the IOCTL buffer.

The API for interacting with this interface is defined in documentation,
and the WMI uapi header provides the format of the structures.

Not all userspace requests will be accepted.  The dell-smbios filtering
functionality will be used to prevent access to certain tokens and calls.

All whitelisted commands and tokens are now shared out to userspace so
applications don't need to define them in their own headers.

Signed-off-by: Mario Limonciello <mario.limonciello@dell.com>
Reviewed-by: Edward O'Callaghan <quasisec@google.com>
Signed-off-by: Darren Hart (VMware) <dvhart@infradead.org>
---
 Documentation/ABI/testing/dell-smbios-wmi | 41 +++++++++++++++++++++++
 drivers/platform/x86/dell-smbios-wmi.c    | 54 ++++++++++++++++++++++++-------
 drivers/platform/x86/dell-smbios.h        | 32 ++----------------
 include/uapi/linux/wmi.h                  | 47 +++++++++++++++++++++++++++
 4 files changed, 133 insertions(+), 41 deletions(-)
 create mode 100644 Documentation/ABI/testing/dell-smbios-wmi

(limited to 'include/uapi/linux')

diff --git a/Documentation/ABI/testing/dell-smbios-wmi b/Documentation/ABI/testing/dell-smbios-wmi
new file mode 100644
index 000000000000..fc919ce16008
--- /dev/null
+++ b/Documentation/ABI/testing/dell-smbios-wmi
@@ -0,0 +1,41 @@
+What:		/dev/wmi/dell-smbios
+Date:		November 2017
+KernelVersion:	4.15
+Contact:	"Mario Limonciello" <mario.limonciello@dell.com>
+Description:
+		Perform SMBIOS calls on supported Dell machines
+		through the Dell ACPI-WMI interface.
+
+		IOCTLs and buffer formats are defined in:
+		<uapi/linux/wmi.h>
+
+		1) To perform an SMBIOS call from userspace, you'll need to
+		first determine the minimum size of the calling interface
+		buffer for your machine.
+		Platforms that contain larger buffers can return larger
+		objects from the system firmware.
+		Commonly this size is either 4k or 32k.
+
+		To determine the size of the buffer, read() a u64 value from
+		the WMI character device /dev/wmi/dell-smbios.
+
+		2) After you've determined the minimum size of the calling
+		interface buffer, you can allocate a structure that represents
+		the structure documented above.
+
+		3) In the 'length' object store the size of the buffer you
+		determined above and allocated.
+
+		4) In this buffer object, prepare as necessary for the SMBIOS
+		call you're interested in.  Typically SMBIOS buffers have
+		"class", "select", and "input" defined to values that coincide
+		with the data you are interested in.
+		Documenting class/select/input values is outside of the scope
+		of this documentation. Check with the libsmbios project for
+		further documentation on these values.
+
+		5) Run the call by using ioctl() as described in the header.
+
+		6) The output will be returned in the buffer object.
+
+		7) Be sure to free up your allocated object.
diff --git a/drivers/platform/x86/dell-smbios-wmi.c b/drivers/platform/x86/dell-smbios-wmi.c
index b31f457e58c3..35c13815b24c 100644
--- a/drivers/platform/x86/dell-smbios-wmi.c
+++ b/drivers/platform/x86/dell-smbios-wmi.c
@@ -30,17 +30,6 @@ struct misc_bios_flags_structure {
 
 #define DELL_WMI_SMBIOS_GUID "A80593CE-A997-11DA-B012-B622A1EF5492"
 
-struct dell_wmi_extensions {
-	__u32 argattrib;
-	__u32 blength;
-	__u8 data[];
-} __packed;
-
-struct dell_wmi_smbios_buffer {
-	struct calling_interface_buffer std;
-	struct dell_wmi_extensions ext;
-} __packed;
-
 struct wmi_smbios_priv {
 	struct dell_wmi_smbios_buffer *buf;
 	struct list_head list;
@@ -117,6 +106,42 @@ int dell_smbios_wmi_call(struct calling_interface_buffer *buffer)
 	return ret;
 }
 
+static long dell_smbios_wmi_filter(struct wmi_device *wdev, unsigned int cmd,
+				   struct wmi_ioctl_buffer *arg)
+{
+	struct wmi_smbios_priv *priv;
+	int ret = 0;
+
+	switch (cmd) {
+	case DELL_WMI_SMBIOS_CMD:
+		mutex_lock(&call_mutex);
+		priv = dev_get_drvdata(&wdev->dev);
+		if (!priv) {
+			ret = -ENODEV;
+			goto fail_smbios_cmd;
+		}
+		memcpy(priv->buf, arg, priv->req_buf_size);
+		if (dell_smbios_call_filter(&wdev->dev, &priv->buf->std)) {
+			dev_err(&wdev->dev, "Invalid call %d/%d:%8x\n",
+				priv->buf->std.cmd_class,
+				priv->buf->std.cmd_select,
+				priv->buf->std.input[0]);
+			ret = -EFAULT;
+			goto fail_smbios_cmd;
+		}
+		ret = run_smbios_call(priv->wdev);
+		if (ret)
+			goto fail_smbios_cmd;
+		memcpy(arg, priv->buf, priv->req_buf_size);
+fail_smbios_cmd:
+		mutex_unlock(&call_mutex);
+		break;
+	default:
+		ret = -ENOIOCTLCMD;
+	}
+	return ret;
+}
+
 static int dell_smbios_wmi_probe(struct wmi_device *wdev)
 {
 	struct wmi_smbios_priv *priv;
@@ -135,6 +160,12 @@ static int dell_smbios_wmi_probe(struct wmi_device *wdev)
 	if (!dell_wmi_get_size(&priv->req_buf_size))
 		return -EPROBE_DEFER;
 
+	/* add in the length object we will use internally with ioctl */
+	priv->req_buf_size += sizeof(u64);
+	ret = set_required_buffer_size(wdev, priv->req_buf_size);
+	if (ret)
+		return ret;
+
 	count = get_order(priv->req_buf_size);
 	priv->buf = (void *)__get_free_pages(GFP_KERNEL, count);
 	if (!priv->buf)
@@ -210,6 +241,7 @@ static struct wmi_driver dell_smbios_wmi_driver = {
 	.probe = dell_smbios_wmi_probe,
 	.remove = dell_smbios_wmi_remove,
 	.id_table = dell_smbios_wmi_id_table,
+	.filter_callback = dell_smbios_wmi_filter,
 };
 
 static int __init init_dell_smbios_wmi(void)
diff --git a/drivers/platform/x86/dell-smbios.h b/drivers/platform/x86/dell-smbios.h
index 91e8004d48ba..138d478d9adc 100644
--- a/drivers/platform/x86/dell-smbios.h
+++ b/drivers/platform/x86/dell-smbios.h
@@ -17,23 +17,11 @@
 #define _DELL_SMBIOS_H_
 
 #include <linux/device.h>
+#include <uapi/linux/wmi.h>
 
-/* Classes and selects used in kernel drivers */
-#define CLASS_TOKEN_READ 0
-#define CLASS_TOKEN_WRITE 1
-#define SELECT_TOKEN_STD 0
-#define SELECT_TOKEN_BAT 1
-#define SELECT_TOKEN_AC 2
+/* Classes and selects used only in kernel drivers */
 #define CLASS_KBD_BACKLIGHT 4
 #define SELECT_KBD_BACKLIGHT 11
-#define CLASS_FLASH_INTERFACE 7
-#define SELECT_FLASH_INTERFACE 3
-#define CLASS_ADMIN_PROP 10
-#define SELECT_ADMIN_PROP 3
-#define CLASS_INFO 17
-#define SELECT_RFKILL 11
-#define SELECT_APP_REGISTRATION	3
-#define SELECT_DOCK 22
 
 /* Tokens used in kernel drivers, any of these
  * should be filtered from userspace access
@@ -50,24 +38,8 @@
 #define GLOBAL_MIC_MUTE_ENABLE	0x0364
 #define GLOBAL_MIC_MUTE_DISABLE	0x0365
 
-/* tokens whitelisted to userspace use */
-#define CAPSULE_EN_TOKEN	0x0461
-#define CAPSULE_DIS_TOKEN	0x0462
-#define WSMT_EN_TOKEN		0x04EC
-#define WSMT_DIS_TOKEN		0x04ED
-
 struct notifier_block;
 
-/* This structure will be modified by the firmware when we enter
- * system management mode, hence the volatiles */
-
-struct calling_interface_buffer {
-	u16 cmd_class;
-	u16 cmd_select;
-	volatile u32 input[4];
-	volatile u32 output[4];
-} __packed;
-
 struct calling_interface_token {
 	u16 tokenID;
 	u16 location;
diff --git a/include/uapi/linux/wmi.h b/include/uapi/linux/wmi.h
index 7e52350ac9b3..7a92e9e3d1c0 100644
--- a/include/uapi/linux/wmi.h
+++ b/include/uapi/linux/wmi.h
@@ -10,6 +10,7 @@
 #ifndef _UAPI_LINUX_WMI_H
 #define _UAPI_LINUX_WMI_H
 
+#include <linux/ioctl.h>
 #include <linux/types.h>
 
 /* WMI bus will filter all WMI vendor driver requests through this IOC */
@@ -23,4 +24,50 @@ struct wmi_ioctl_buffer {
 	__u8	data[];
 };
 
+/* This structure may be modified by the firmware when we enter
+ * system management mode through SMM, hence the volatiles
+ */
+struct calling_interface_buffer {
+	__u16 cmd_class;
+	__u16 cmd_select;
+	volatile __u32 input[4];
+	volatile __u32 output[4];
+} __packed;
+
+struct dell_wmi_extensions {
+	__u32 argattrib;
+	__u32 blength;
+	__u8 data[];
+} __packed;
+
+struct dell_wmi_smbios_buffer {
+	__u64 length;
+	struct calling_interface_buffer std;
+	struct dell_wmi_extensions	ext;
+} __packed;
+
+/* Whitelisted smbios class/select commands */
+#define CLASS_TOKEN_READ	0
+#define CLASS_TOKEN_WRITE	1
+#define SELECT_TOKEN_STD	0
+#define SELECT_TOKEN_BAT	1
+#define SELECT_TOKEN_AC		2
+#define CLASS_FLASH_INTERFACE	7
+#define SELECT_FLASH_INTERFACE	3
+#define CLASS_ADMIN_PROP	10
+#define SELECT_ADMIN_PROP	3
+#define CLASS_INFO		17
+#define SELECT_RFKILL		11
+#define SELECT_APP_REGISTRATION	3
+#define SELECT_DOCK		22
+
+/* whitelisted tokens */
+#define CAPSULE_EN_TOKEN	0x0461
+#define CAPSULE_DIS_TOKEN	0x0462
+#define WSMT_EN_TOKEN		0x04EC
+#define WSMT_DIS_TOKEN		0x04ED
+
+/* Dell SMBIOS calling IOCTL command used by dell-smbios-wmi */
+#define DELL_WMI_SMBIOS_CMD	_IOWR(WMI_IOC, 0, struct dell_wmi_smbios_buffer)
+
 #endif
-- 
cgit v1.2.3


From ee20598194500e82c477cf13e52b58e569446ed0 Mon Sep 17 00:00:00 2001
From: Huy Nguyen <huyn@mellanox.com>
Date: Tue, 18 Jul 2017 15:42:15 -0500
Subject: net/dcb: Add dscp to priority selector type

IEEE specification P802.1Qcd/D2.1 defines priority selector 5.
This APP TLV selector defines DSCP to priority map.
This patch defines such DSCP selector.

Signed-off-by: Huy Nguyen <huyn@mellanox.com>
Reviewed-by: Parav Pandit <parav@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/uapi/linux/dcbnl.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/dcbnl.h b/include/uapi/linux/dcbnl.h
index b6170a6af7c2..2c0c6453c3f4 100644
--- a/include/uapi/linux/dcbnl.h
+++ b/include/uapi/linux/dcbnl.h
@@ -206,6 +206,7 @@ struct cee_pfc {
 #define IEEE_8021QAZ_APP_SEL_STREAM	2
 #define IEEE_8021QAZ_APP_SEL_DGRAM	3
 #define IEEE_8021QAZ_APP_SEL_ANY	4
+#define IEEE_8021QAZ_APP_SEL_DSCP       5
 
 /* This structure contains the IEEE 802.1Qaz APP managed object. This
  * object is also used for the CEE std as well.
-- 
cgit v1.2.3


From 9354d452034273a50a4fd703bea31e5d6b1fc20b Mon Sep 17 00:00:00 2001
From: Jiri Benc <jbenc@redhat.com>
Date: Thu, 2 Nov 2017 17:04:37 -0200
Subject: openvswitch: reliable interface identification in port dumps

This patch allows reliable identification of netdevice interfaces connected
to openvswitch bridges. In particular, user space queries the netdev
interfaces belonging to the ports for statistics, up/down state, etc.
Datapath dump needs to provide enough information for the user space to be
able to do that.

Currently, only interface names are returned. This is not sufficient, as
openvswitch allows its ports to be in different name spaces and the
interface name is valid only in its name space. What is needed and generally
used in other netlink APIs, is the pair ifindex+netnsid.

The solution is addition of the ifindex+netnsid pair (or only ifindex if in
the same name space) to vport get/dump operation.

On request side, ideally the ifindex+netnsid pair could be used to
get/set/del the corresponding vport. This is not implemented by this patch
and can be added later if needed.

Signed-off-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/openvswitch.h |  2 ++
 net/openvswitch/datapath.c       | 47 +++++++++++++++++++++++++++++-----------
 net/openvswitch/datapath.h       |  4 ++--
 net/openvswitch/dp_notify.c      |  4 ++--
 4 files changed, 40 insertions(+), 17 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index ffe397daad49..501e4c4e2a03 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -258,6 +258,8 @@ enum ovs_vport_attr {
 				/* receiving upcalls */
 	OVS_VPORT_ATTR_STATS,	/* struct ovs_vport_stats */
 	OVS_VPORT_ATTR_PAD,
+	OVS_VPORT_ATTR_IFINDEX,
+	OVS_VPORT_ATTR_NETNSID,
 	__OVS_VPORT_ATTR_MAX
 };
 
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index c3aec6227c91..4d38ac044cee 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -1848,7 +1848,8 @@ static struct genl_family dp_datapath_genl_family __ro_after_init = {
 
 /* Called with ovs_mutex or RCU read lock. */
 static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb,
-				   u32 portid, u32 seq, u32 flags, u8 cmd)
+				   struct net *net, u32 portid, u32 seq,
+				   u32 flags, u8 cmd)
 {
 	struct ovs_header *ovs_header;
 	struct ovs_vport_stats vport_stats;
@@ -1864,9 +1865,17 @@ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb,
 	if (nla_put_u32(skb, OVS_VPORT_ATTR_PORT_NO, vport->port_no) ||
 	    nla_put_u32(skb, OVS_VPORT_ATTR_TYPE, vport->ops->type) ||
 	    nla_put_string(skb, OVS_VPORT_ATTR_NAME,
-			   ovs_vport_name(vport)))
+			   ovs_vport_name(vport)) ||
+	    nla_put_u32(skb, OVS_VPORT_ATTR_IFINDEX, vport->dev->ifindex))
 		goto nla_put_failure;
 
+	if (!net_eq(net, dev_net(vport->dev))) {
+		int id = peernet2id_alloc(net, dev_net(vport->dev));
+
+		if (nla_put_s32(skb, OVS_VPORT_ATTR_NETNSID, id))
+			goto nla_put_failure;
+	}
+
 	ovs_vport_get_stats(vport, &vport_stats);
 	if (nla_put_64bit(skb, OVS_VPORT_ATTR_STATS,
 			  sizeof(struct ovs_vport_stats), &vport_stats,
@@ -1896,8 +1905,8 @@ static struct sk_buff *ovs_vport_cmd_alloc_info(void)
 }
 
 /* Called with ovs_mutex, only via ovs_dp_notify_wq(). */
-struct sk_buff *ovs_vport_cmd_build_info(struct vport *vport, u32 portid,
-					 u32 seq, u8 cmd)
+struct sk_buff *ovs_vport_cmd_build_info(struct vport *vport, struct net *net,
+					 u32 portid, u32 seq, u8 cmd)
 {
 	struct sk_buff *skb;
 	int retval;
@@ -1906,7 +1915,7 @@ struct sk_buff *ovs_vport_cmd_build_info(struct vport *vport, u32 portid,
 	if (!skb)
 		return ERR_PTR(-ENOMEM);
 
-	retval = ovs_vport_cmd_fill_info(vport, skb, portid, seq, 0, cmd);
+	retval = ovs_vport_cmd_fill_info(vport, skb, net, portid, seq, 0, cmd);
 	BUG_ON(retval < 0);
 
 	return skb;
@@ -1920,6 +1929,8 @@ static struct vport *lookup_vport(struct net *net,
 	struct datapath *dp;
 	struct vport *vport;
 
+	if (a[OVS_VPORT_ATTR_IFINDEX])
+		return ERR_PTR(-EOPNOTSUPP);
 	if (a[OVS_VPORT_ATTR_NAME]) {
 		vport = ovs_vport_locate(net, nla_data(a[OVS_VPORT_ATTR_NAME]));
 		if (!vport)
@@ -1944,6 +1955,7 @@ static struct vport *lookup_vport(struct net *net,
 		return vport;
 	} else
 		return ERR_PTR(-EINVAL);
+
 }
 
 /* Called with ovs_mutex */
@@ -1983,6 +1995,8 @@ static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info)
 	if (!a[OVS_VPORT_ATTR_NAME] || !a[OVS_VPORT_ATTR_TYPE] ||
 	    !a[OVS_VPORT_ATTR_UPCALL_PID])
 		return -EINVAL;
+	if (a[OVS_VPORT_ATTR_IFINDEX])
+		return -EOPNOTSUPP;
 
 	port_no = a[OVS_VPORT_ATTR_PORT_NO]
 		? nla_get_u32(a[OVS_VPORT_ATTR_PORT_NO]) : 0;
@@ -2032,8 +2046,9 @@ restart:
 		goto exit_unlock_free;
 	}
 
-	err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid,
-				      info->snd_seq, 0, OVS_VPORT_CMD_NEW);
+	err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info),
+				      info->snd_portid, info->snd_seq, 0,
+				      OVS_VPORT_CMD_NEW);
 
 	if (netdev_get_fwd_headroom(vport->dev) > dp->max_headroom)
 		update_headroom(dp);
@@ -2090,8 +2105,9 @@ static int ovs_vport_cmd_set(struct sk_buff *skb, struct genl_info *info)
 			goto exit_unlock_free;
 	}
 
-	err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid,
-				      info->snd_seq, 0, OVS_VPORT_CMD_NEW);
+	err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info),
+				      info->snd_portid, info->snd_seq, 0,
+				      OVS_VPORT_CMD_NEW);
 	BUG_ON(err < 0);
 
 	ovs_unlock();
@@ -2128,8 +2144,9 @@ static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info)
 		goto exit_unlock_free;
 	}
 
-	err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid,
-				      info->snd_seq, 0, OVS_VPORT_CMD_DEL);
+	err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info),
+				      info->snd_portid, info->snd_seq, 0,
+				      OVS_VPORT_CMD_DEL);
 	BUG_ON(err < 0);
 
 	/* the vport deletion may trigger dp headroom update */
@@ -2169,8 +2186,9 @@ static int ovs_vport_cmd_get(struct sk_buff *skb, struct genl_info *info)
 	err = PTR_ERR(vport);
 	if (IS_ERR(vport))
 		goto exit_unlock_free;
-	err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid,
-				      info->snd_seq, 0, OVS_VPORT_CMD_NEW);
+	err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info),
+				      info->snd_portid, info->snd_seq, 0,
+				      OVS_VPORT_CMD_NEW);
 	BUG_ON(err < 0);
 	rcu_read_unlock();
 
@@ -2202,6 +2220,7 @@ static int ovs_vport_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
 		hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node) {
 			if (j >= skip &&
 			    ovs_vport_cmd_fill_info(vport, skb,
+						    sock_net(skb->sk),
 						    NETLINK_CB(cb->skb).portid,
 						    cb->nlh->nlmsg_seq,
 						    NLM_F_MULTI,
@@ -2228,6 +2247,8 @@ static const struct nla_policy vport_policy[OVS_VPORT_ATTR_MAX + 1] = {
 	[OVS_VPORT_ATTR_TYPE] = { .type = NLA_U32 },
 	[OVS_VPORT_ATTR_UPCALL_PID] = { .type = NLA_U32 },
 	[OVS_VPORT_ATTR_OPTIONS] = { .type = NLA_NESTED },
+	[OVS_VPORT_ATTR_IFINDEX] = { .type = NLA_U32 },
+	[OVS_VPORT_ATTR_NETNSID] = { .type = NLA_S32 },
 };
 
 static const struct genl_ops dp_vport_genl_ops[] = {
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index 480600649d0b..4a104ef9e12c 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -200,8 +200,8 @@ int ovs_dp_upcall(struct datapath *, struct sk_buff *,
 		  uint32_t cutlen);
 
 const char *ovs_dp_name(const struct datapath *dp);
-struct sk_buff *ovs_vport_cmd_build_info(struct vport *, u32 pid, u32 seq,
-					 u8 cmd);
+struct sk_buff *ovs_vport_cmd_build_info(struct vport *vport, struct net *net,
+					 u32 portid, u32 seq, u8 cmd);
 
 int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb,
 			const struct sw_flow_actions *, struct sw_flow_key *);
diff --git a/net/openvswitch/dp_notify.c b/net/openvswitch/dp_notify.c
index 653d073bae45..f3ee2f2825c0 100644
--- a/net/openvswitch/dp_notify.c
+++ b/net/openvswitch/dp_notify.c
@@ -30,8 +30,8 @@ static void dp_detach_port_notify(struct vport *vport)
 	struct datapath *dp;
 
 	dp = vport->dp;
-	notify = ovs_vport_cmd_build_info(vport, 0, 0,
-					  OVS_VPORT_CMD_DEL);
+	notify = ovs_vport_cmd_build_info(vport, ovs_dp_get_net(dp),
+					  0, 0, OVS_VPORT_CMD_DEL);
 	ovs_dp_detach_port(vport);
 	if (IS_ERR(notify)) {
 		genl_set_err(&dp_vport_genl_family, ovs_dp_get_net(dp), 0,
-- 
cgit v1.2.3


From 79e1ad148c844f5c8b9d76b36b26e3886dca95ae Mon Sep 17 00:00:00 2001
From: Jiri Benc <jbenc@redhat.com>
Date: Thu, 2 Nov 2017 17:04:38 -0200
Subject: rtnetlink: use netnsid to query interface

Currently, when an application gets netnsid from the kernel (for example as
the result of an RTM_GETLINK call on one end of the veth pair), it's not very
useful. There's no reliable way to get to the netns fd from the netnsid, nor
does any kernel API accept netnsid.

Extend the RTM_GETLINK call to also accept netnsid. It will operate on the
netns with the given netnsid in such case. Of course, the calling process
needs to have enough capabilities in the target name space; for now, require
CAP_NET_ADMIN. This can be relaxed in the future.

To signal to the calling process that the kernel understood the new
IFLA_IF_NETNSID attribute in the query, it will include it in the response.
This is needed to detect older kernels, as they will just ignore
IFLA_IF_NETNSID and query in the current name space.

This patch implements IFLA_IF_NETNSID only for get and dump. For set
operations, this can be extended later.

Signed-off-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_link.h |   1 +
 net/core/rtnetlink.c         | 103 +++++++++++++++++++++++++++++++++++--------
 2 files changed, 86 insertions(+), 18 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index b3cf5639ac8f..19fc02660e0c 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -160,6 +160,7 @@ enum {
 	IFLA_XDP,
 	IFLA_EVENT,
 	IFLA_NEW_NETNSID,
+	IFLA_IF_NETNSID,
 	__IFLA_MAX
 };
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index de24d394c69e..8a8c51937edf 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -921,7 +921,8 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
 	       + nla_total_size(4)  /* IFLA_EVENT */
 	       + nla_total_size(4)  /* IFLA_NEW_NETNSID */
 	       + nla_total_size(1); /* IFLA_PROTO_DOWN */
-
+	       + nla_total_size(4)  /* IFLA_IF_NETNSID */
+	       + 0;
 }
 
 static int rtnl_vf_ports_fill(struct sk_buff *skb, struct net_device *dev)
@@ -1370,13 +1371,14 @@ static noinline_for_stack int nla_put_ifalias(struct sk_buff *skb,
 }
 
 static int rtnl_fill_link_netnsid(struct sk_buff *skb,
-				  const struct net_device *dev)
+				  const struct net_device *dev,
+				  struct net *src_net)
 {
 	if (dev->rtnl_link_ops && dev->rtnl_link_ops->get_link_net) {
 		struct net *link_net = dev->rtnl_link_ops->get_link_net(dev);
 
 		if (!net_eq(dev_net(dev), link_net)) {
-			int id = peernet2id_alloc(dev_net(dev), link_net);
+			int id = peernet2id_alloc(src_net, link_net);
 
 			if (nla_put_s32(skb, IFLA_LINK_NETNSID, id))
 				return -EMSGSIZE;
@@ -1427,10 +1429,11 @@ static int rtnl_fill_link_af(struct sk_buff *skb,
 	return 0;
 }
 
-static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
+static int rtnl_fill_ifinfo(struct sk_buff *skb,
+			    struct net_device *dev, struct net *src_net,
 			    int type, u32 pid, u32 seq, u32 change,
 			    unsigned int flags, u32 ext_filter_mask,
-			    u32 event, int *new_nsid)
+			    u32 event, int *new_nsid, int tgt_netnsid)
 {
 	struct ifinfomsg *ifm;
 	struct nlmsghdr *nlh;
@@ -1448,6 +1451,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
 	ifm->ifi_flags = dev_get_flags(dev);
 	ifm->ifi_change = change;
 
+	if (tgt_netnsid >= 0 && nla_put_s32(skb, IFLA_IF_NETNSID, tgt_netnsid))
+		goto nla_put_failure;
+
 	if (nla_put_string(skb, IFLA_IFNAME, dev->name) ||
 	    nla_put_u32(skb, IFLA_TXQLEN, dev->tx_queue_len) ||
 	    nla_put_u8(skb, IFLA_OPERSTATE,
@@ -1513,7 +1519,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
 			goto nla_put_failure;
 	}
 
-	if (rtnl_fill_link_netnsid(skb, dev))
+	if (rtnl_fill_link_netnsid(skb, dev, src_net))
 		goto nla_put_failure;
 
 	if (new_nsid &&
@@ -1571,6 +1577,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
 	[IFLA_XDP]		= { .type = NLA_NESTED },
 	[IFLA_EVENT]		= { .type = NLA_U32 },
 	[IFLA_GROUP]		= { .type = NLA_U32 },
+	[IFLA_IF_NETNSID]	= { .type = NLA_S32 },
 };
 
 static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
@@ -1674,9 +1681,28 @@ static bool link_dump_filtered(struct net_device *dev,
 	return false;
 }
 
+static struct net *get_target_net(struct sk_buff *skb, int netnsid)
+{
+	struct net *net;
+
+	net = get_net_ns_by_id(sock_net(skb->sk), netnsid);
+	if (!net)
+		return ERR_PTR(-EINVAL);
+
+	/* For now, the caller is required to have CAP_NET_ADMIN in
+	 * the user namespace owning the target net ns.
+	 */
+	if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) {
+		put_net(net);
+		return ERR_PTR(-EACCES);
+	}
+	return net;
+}
+
 static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	struct net *net = sock_net(skb->sk);
+	struct net *tgt_net = net;
 	int h, s_h;
 	int idx = 0, s_idx;
 	struct net_device *dev;
@@ -1686,6 +1712,7 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
 	const struct rtnl_link_ops *kind_ops = NULL;
 	unsigned int flags = NLM_F_MULTI;
 	int master_idx = 0;
+	int netnsid = -1;
 	int err;
 	int hdrlen;
 
@@ -1704,6 +1731,15 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
 
 	if (nlmsg_parse(cb->nlh, hdrlen, tb, IFLA_MAX,
 			ifla_policy, NULL) >= 0) {
+		if (tb[IFLA_IF_NETNSID]) {
+			netnsid = nla_get_s32(tb[IFLA_IF_NETNSID]);
+			tgt_net = get_target_net(skb, netnsid);
+			if (IS_ERR(tgt_net)) {
+				tgt_net = net;
+				netnsid = -1;
+			}
+		}
+
 		if (tb[IFLA_EXT_MASK])
 			ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]);
 
@@ -1719,17 +1755,19 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
 
 	for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
 		idx = 0;
-		head = &net->dev_index_head[h];
+		head = &tgt_net->dev_index_head[h];
 		hlist_for_each_entry(dev, head, index_hlist) {
 			if (link_dump_filtered(dev, master_idx, kind_ops))
 				goto cont;
 			if (idx < s_idx)
 				goto cont;
-			err = rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK,
+			err = rtnl_fill_ifinfo(skb, dev, net,
+					       RTM_NEWLINK,
 					       NETLINK_CB(cb->skb).portid,
 					       cb->nlh->nlmsg_seq, 0,
 					       flags,
-					       ext_filter_mask, 0, NULL);
+					       ext_filter_mask, 0, NULL,
+					       netnsid);
 
 			if (err < 0) {
 				if (likely(skb->len))
@@ -1748,6 +1786,8 @@ out_err:
 	cb->args[0] = h;
 	cb->seq = net->dev_base_seq;
 	nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+	if (netnsid >= 0)
+		put_net(tgt_net);
 
 	return err;
 }
@@ -2360,6 +2400,9 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (err < 0)
 		goto errout;
 
+	if (tb[IFLA_IF_NETNSID])
+		return -EOPNOTSUPP;
+
 	if (tb[IFLA_IFNAME])
 		nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
 	else
@@ -2454,6 +2497,9 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (err < 0)
 		return err;
 
+	if (tb[IFLA_IF_NETNSID])
+		return -EOPNOTSUPP;
+
 	if (tb[IFLA_IFNAME])
 		nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
 
@@ -2585,6 +2631,9 @@ replay:
 	if (err < 0)
 		return err;
 
+	if (tb[IFLA_IF_NETNSID])
+		return -EOPNOTSUPP;
+
 	if (tb[IFLA_IFNAME])
 		nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
 	else
@@ -2818,11 +2867,13 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh,
 			struct netlink_ext_ack *extack)
 {
 	struct net *net = sock_net(skb->sk);
+	struct net *tgt_net = net;
 	struct ifinfomsg *ifm;
 	char ifname[IFNAMSIZ];
 	struct nlattr *tb[IFLA_MAX+1];
 	struct net_device *dev = NULL;
 	struct sk_buff *nskb;
+	int netnsid = -1;
 	int err;
 	u32 ext_filter_mask = 0;
 
@@ -2830,35 +2881,50 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (err < 0)
 		return err;
 
+	if (tb[IFLA_IF_NETNSID]) {
+		netnsid = nla_get_s32(tb[IFLA_IF_NETNSID]);
+		tgt_net = get_target_net(skb, netnsid);
+		if (IS_ERR(tgt_net))
+			return PTR_ERR(tgt_net);
+	}
+
 	if (tb[IFLA_IFNAME])
 		nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
 
 	if (tb[IFLA_EXT_MASK])
 		ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]);
 
+	err = -EINVAL;
 	ifm = nlmsg_data(nlh);
 	if (ifm->ifi_index > 0)
-		dev = __dev_get_by_index(net, ifm->ifi_index);
+		dev = __dev_get_by_index(tgt_net, ifm->ifi_index);
 	else if (tb[IFLA_IFNAME])
-		dev = __dev_get_by_name(net, ifname);
+		dev = __dev_get_by_name(tgt_net, ifname);
 	else
-		return -EINVAL;
+		goto out;
 
+	err = -ENODEV;
 	if (dev == NULL)
-		return -ENODEV;
+		goto out;
 
+	err = -ENOBUFS;
 	nskb = nlmsg_new(if_nlmsg_size(dev, ext_filter_mask), GFP_KERNEL);
 	if (nskb == NULL)
-		return -ENOBUFS;
+		goto out;
 
-	err = rtnl_fill_ifinfo(nskb, dev, RTM_NEWLINK, NETLINK_CB(skb).portid,
-			       nlh->nlmsg_seq, 0, 0, ext_filter_mask, 0, NULL);
+	err = rtnl_fill_ifinfo(nskb, dev, net,
+			       RTM_NEWLINK, NETLINK_CB(skb).portid,
+			       nlh->nlmsg_seq, 0, 0, ext_filter_mask,
+			       0, NULL, netnsid);
 	if (err < 0) {
 		/* -EMSGSIZE implies BUG in if_nlmsg_size */
 		WARN_ON(err == -EMSGSIZE);
 		kfree_skb(nskb);
 	} else
 		err = rtnl_unicast(nskb, net, NETLINK_CB(skb).portid);
+out:
+	if (netnsid >= 0)
+		put_net(tgt_net);
 
 	return err;
 }
@@ -2948,8 +3014,9 @@ struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev,
 	if (skb == NULL)
 		goto errout;
 
-	err = rtnl_fill_ifinfo(skb, dev, type, 0, 0, change, 0, 0, event,
-			       new_nsid);
+	err = rtnl_fill_ifinfo(skb, dev, dev_net(dev),
+			       type, 0, 0, change, 0, 0, event,
+			       new_nsid, -1);
 	if (err < 0) {
 		/* -EMSGSIZE implies BUG in if_nlmsg_size() */
 		WARN_ON(err == -EMSGSIZE);
-- 
cgit v1.2.3


From ab3f0063c48c26c927851b6767824e35a716d878 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Fri, 3 Nov 2017 13:56:17 -0700
Subject: bpf: offload: add infrastructure for loading programs for a specific
 netdev

The fact that we don't know which device the program is going
to be used on is quite limiting in current eBPF infrastructure.
We have to reverse or limit the changes which kernel makes to
the loaded bytecode if we want it to be offloaded to a networking
device.  We also have to invent new APIs for debugging and
troubleshooting support.

Make it possible to load programs for a specific netdev.  This
helps us to bring the debug information closer to the core
eBPF infrastructure (e.g. we will be able to reuse the verifier
log in device JIT).  It allows device JITs to perform translation
on the original bytecode.

__bpf_prog_get() when called to get a reference for an attachment
point will now refuse to give it if program has a device assigned.
Following patches will add a version of that function which passes
the expected netdev in. @type argument in __bpf_prog_get() is
renamed to attach_type to make it clearer that it's only set on
attachment.

All calls to ndo_bpf are protected by rtnl, only verifier callbacks
are not.  We need a wait queue to make sure netdev doesn't get
destroyed while verifier is still running and calling its driver.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h          |  36 +++++++++
 include/linux/bpf_verifier.h |  10 +++
 include/linux/netdevice.h    |  14 ++++
 include/uapi/linux/bpf.h     |   1 +
 kernel/bpf/Makefile          |   1 +
 kernel/bpf/core.c            |  10 ++-
 kernel/bpf/offload.c         | 182 +++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c         |  17 +++-
 kernel/bpf/verifier.c        |  15 +++-
 9 files changed, 278 insertions(+), 8 deletions(-)
 create mode 100644 kernel/bpf/offload.c

(limited to 'include/uapi/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 520aeebe0d93..e45d43f9ec92 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -15,6 +15,7 @@
 #include <linux/err.h>
 #include <linux/rbtree_latch.h>
 #include <linux/numa.h>
+#include <linux/wait.h>
 
 struct perf_event;
 struct bpf_prog;
@@ -182,6 +183,16 @@ struct bpf_verifier_ops {
 				  struct bpf_prog *prog, u32 *target_size);
 };
 
+struct bpf_dev_offload {
+	struct bpf_prog		*prog;
+	struct net_device	*netdev;
+	void			*dev_priv;
+	struct list_head	offloads;
+	bool			dev_state;
+	bool			verifier_running;
+	wait_queue_head_t	verifier_done;
+};
+
 struct bpf_prog_aux {
 	atomic_t refcnt;
 	u32 used_map_cnt;
@@ -199,6 +210,7 @@ struct bpf_prog_aux {
 #ifdef CONFIG_SECURITY
 	void *security;
 #endif
+	struct bpf_dev_offload *offload;
 	union {
 		struct work_struct work;
 		struct rcu_head	rcu;
@@ -317,6 +329,7 @@ extern const struct file_operations bpf_prog_fops;
 #undef BPF_PROG_TYPE
 #undef BPF_MAP_TYPE
 
+extern const struct bpf_prog_ops bpf_offload_prog_ops;
 extern const struct bpf_verifier_ops tc_cls_act_analyzer_ops;
 extern const struct bpf_verifier_ops xdp_analyzer_ops;
 
@@ -491,6 +504,29 @@ static inline int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu,
 }
 #endif /* CONFIG_BPF_SYSCALL */
 
+int bpf_prog_offload_compile(struct bpf_prog *prog);
+void bpf_prog_offload_destroy(struct bpf_prog *prog);
+
+#if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL)
+int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr);
+
+static inline bool bpf_prog_is_dev_bound(struct bpf_prog_aux *aux)
+{
+	return aux->offload;
+}
+#else
+static inline int bpf_prog_offload_init(struct bpf_prog *prog,
+					union bpf_attr *attr)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline bool bpf_prog_is_dev_bound(struct bpf_prog_aux *aux)
+{
+	return false;
+}
+#endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */
+
 #if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_BPF_SYSCALL)
 struct sock  *__sock_map_lookup_elem(struct bpf_map *map, u32 key);
 int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type);
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 3b0976aaac75..e45011dbc02d 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -153,6 +153,7 @@ struct bpf_verifier_env {
 	struct bpf_verifier_state *cur_state; /* current verifier state */
 	struct bpf_verifier_state_list **explored_states; /* search pruning optimization */
 	const struct bpf_ext_analyzer_ops *analyzer_ops; /* external analyzer ops */
+	const struct bpf_ext_analyzer_ops *dev_ops; /* device analyzer ops */
 	void *analyzer_priv; /* pointer to external analyzer's private data */
 	struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */
 	u32 used_map_cnt;		/* number of used maps */
@@ -169,6 +170,15 @@ static inline struct bpf_reg_state *cur_regs(struct bpf_verifier_env *env)
 	return env->cur_state->regs;
 }
 
+#if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL)
+int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env);
+#else /* header stub: static inline avoids one definition per includer */
+static inline int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env)
+{
+	return -EOPNOTSUPP;
+}
+#endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */
+
 int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops,
 		 void *priv);
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 9af9feaaeb64..fda527ccb263 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -797,8 +797,13 @@ enum bpf_netdev_command {
 	 * is equivalent to XDP_ATTACHED_DRV.
 	 */
 	XDP_QUERY_PROG,
+	/* BPF program for offload callbacks, invoked at program load time. */
+	BPF_OFFLOAD_VERIFIER_PREP,
+	BPF_OFFLOAD_TRANSLATE,
+	BPF_OFFLOAD_DESTROY,
 };
 
+struct bpf_ext_analyzer_ops;
 struct netlink_ext_ack;
 
 struct netdev_bpf {
@@ -815,6 +820,15 @@ struct netdev_bpf {
 			u8 prog_attached;
 			u32 prog_id;
 		};
+		/* BPF_OFFLOAD_VERIFIER_PREP */
+		struct {
+			struct bpf_prog *prog;
+			const struct bpf_ext_analyzer_ops *ops; /* callee set */
+		} verifier;
+		/* BPF_OFFLOAD_TRANSLATE, BPF_OFFLOAD_DESTROY */
+		struct {
+			struct bpf_prog *prog;
+		} offload;
 	};
 };
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index a9820677c2ff..80d191a93fb0 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -260,6 +260,7 @@ union bpf_attr {
 		__u32		kern_version;	/* checked when prog_type=kprobe */
 		__u32		prog_flags;
 		char		prog_name[BPF_OBJ_NAME_LEN];
+		__u32		prog_target_ifindex;	/* ifindex of netdev to prep for */
 	};
 
 	struct { /* anonymous struct used by BPF_OBJ_* commands */
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 16e95c8e749e..e691da0b3bab 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -7,6 +7,7 @@ obj-$(CONFIG_BPF_SYSCALL) += disasm.o
 ifeq ($(CONFIG_NET),y)
 obj-$(CONFIG_BPF_SYSCALL) += devmap.o
 obj-$(CONFIG_BPF_SYSCALL) += cpumap.o
+obj-$(CONFIG_BPF_SYSCALL) += offload.o
 ifeq ($(CONFIG_STREAM_PARSER),y)
 obj-$(CONFIG_BPF_SYSCALL) += sockmap.o
 endif
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 7fe448799d76..8a6c37762330 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1380,7 +1380,13 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
 	 * valid program, which in this case would simply not
 	 * be JITed, but falls back to the interpreter.
 	 */
-	fp = bpf_int_jit_compile(fp);
+	if (!bpf_prog_is_dev_bound(fp->aux)) {
+		fp = bpf_int_jit_compile(fp);
+	} else {
+		*err = bpf_prog_offload_compile(fp);
+		if (*err)
+			return fp;
+	}
 	bpf_prog_lock_ro(fp);
 
 	/* The tail call compatibility check can only be done at
@@ -1549,6 +1555,8 @@ static void bpf_prog_free_deferred(struct work_struct *work)
 	struct bpf_prog_aux *aux;
 
 	aux = container_of(work, struct bpf_prog_aux, work);
+	if (bpf_prog_is_dev_bound(aux))
+		bpf_prog_offload_destroy(aux->prog);
 	bpf_jit_free(aux->prog);
 }
 
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
new file mode 100644
index 000000000000..5553e0e2f8b1
--- /dev/null
+++ b/kernel/bpf/offload.c
@@ -0,0 +1,182 @@
+#include <linux/bpf.h>
+#include <linux/bpf_verifier.h>
+#include <linux/bug.h>
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <linux/printk.h>
+#include <linux/rtnetlink.h>
+
+/* protected by RTNL */
+static LIST_HEAD(bpf_prog_offload_devs);
+
+int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr)
+{
+	struct net *net = current->nsproxy->net_ns;
+	struct bpf_dev_offload *offload;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (attr->prog_flags)
+		return -EINVAL;
+
+	offload = kzalloc(sizeof(*offload), GFP_USER);
+	if (!offload)
+		return -ENOMEM;
+
+	offload->prog = prog;
+	init_waitqueue_head(&offload->verifier_done);
+
+	rtnl_lock();
+	offload->netdev = __dev_get_by_index(net, attr->prog_target_ifindex);
+	if (!offload->netdev) {
+		rtnl_unlock();
+		kfree(offload);
+		return -EINVAL;
+	}
+
+	prog->aux->offload = offload;
+	list_add_tail(&offload->offloads, &bpf_prog_offload_devs);
+	rtnl_unlock();
+
+	return 0;
+}
+
+static int __bpf_offload_ndo(struct bpf_prog *prog, enum bpf_netdev_command cmd,
+			     struct netdev_bpf *data)
+{
+	struct net_device *netdev = prog->aux->offload->netdev;
+
+	ASSERT_RTNL();
+
+	if (!netdev)
+		return -ENODEV;
+	if (!netdev->netdev_ops->ndo_bpf)
+		return -EOPNOTSUPP;
+
+	data->command = cmd;
+
+	return netdev->netdev_ops->ndo_bpf(netdev, data);
+}
+
+int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env)
+{
+	struct netdev_bpf data = {};
+	int err;
+
+	data.verifier.prog = env->prog;
+
+	rtnl_lock();
+	err = __bpf_offload_ndo(env->prog, BPF_OFFLOAD_VERIFIER_PREP, &data);
+	if (err)
+		goto exit_unlock;
+
+	env->dev_ops = data.verifier.ops;
+
+	env->prog->aux->offload->dev_state = true;
+	env->prog->aux->offload->verifier_running = true;
+exit_unlock:
+	rtnl_unlock();
+	return err;
+}
+
+static void __bpf_prog_offload_destroy(struct bpf_prog *prog)
+{
+	struct bpf_dev_offload *offload = prog->aux->offload;
+	struct netdev_bpf data = {};
+
+	data.offload.prog = prog;
+
+	if (offload->verifier_running)
+		wait_event(offload->verifier_done, !offload->verifier_running);
+
+	if (offload->dev_state)
+		WARN_ON(__bpf_offload_ndo(prog, BPF_OFFLOAD_DESTROY, &data));
+
+	offload->dev_state = false;
+	list_del_init(&offload->offloads);
+	offload->netdev = NULL;
+}
+
+void bpf_prog_offload_destroy(struct bpf_prog *prog)
+{
+	struct bpf_dev_offload *offload = prog->aux->offload;
+
+	offload->verifier_running = false;
+	wake_up(&offload->verifier_done);
+
+	rtnl_lock();
+	__bpf_prog_offload_destroy(prog);
+	rtnl_unlock();
+
+	kfree(offload);
+}
+
+static int bpf_prog_offload_translate(struct bpf_prog *prog)
+{
+	struct bpf_dev_offload *offload = prog->aux->offload;
+	struct netdev_bpf data = {};
+	int ret;
+
+	data.offload.prog = prog;
+
+	offload->verifier_running = false;
+	wake_up(&offload->verifier_done);
+
+	rtnl_lock();
+	ret = __bpf_offload_ndo(prog, BPF_OFFLOAD_TRANSLATE, &data);
+	rtnl_unlock();
+
+	return ret;
+}
+
+static unsigned int bpf_prog_warn_on_exec(const void *ctx,
+					  const struct bpf_insn *insn)
+{
+	WARN(1, "attempt to execute device eBPF program on the host!");
+	return 0;
+}
+
+int bpf_prog_offload_compile(struct bpf_prog *prog)
+{
+	prog->bpf_func = bpf_prog_warn_on_exec;
+
+	return bpf_prog_offload_translate(prog);
+}
+
+const struct bpf_prog_ops bpf_offload_prog_ops = {
+};
+
+static int bpf_offload_notification(struct notifier_block *notifier,
+				    ulong event, void *ptr)
+{
+	struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
+	struct bpf_dev_offload *offload, *tmp;
+
+	ASSERT_RTNL();
+
+	switch (event) {
+	case NETDEV_UNREGISTER:
+		list_for_each_entry_safe(offload, tmp, &bpf_prog_offload_devs,
+					 offloads) {
+			if (offload->netdev == netdev)
+				__bpf_prog_offload_destroy(offload->prog);
+		}
+		break;
+	default:
+		break;
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block bpf_offload_notifier = {
+	.notifier_call = bpf_offload_notification,
+};
+
+static int __init bpf_offload_init(void)
+{
+	register_netdevice_notifier(&bpf_offload_notifier);
+	return 0;
+}
+
+subsys_initcall(bpf_offload_init);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 323be2473c4b..1574b9f0f24e 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -824,7 +824,10 @@ static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
 	if (type >= ARRAY_SIZE(bpf_prog_types) || !bpf_prog_types[type])
 		return -EINVAL;
 
-	prog->aux->ops = bpf_prog_types[type];
+	if (!bpf_prog_is_dev_bound(prog->aux))
+		prog->aux->ops = bpf_prog_types[type];
+	else
+		prog->aux->ops = &bpf_offload_prog_ops;
 	prog->type = type;
 	return 0;
 }
@@ -1054,7 +1057,7 @@ struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog)
 }
 EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero);
 
-static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type)
+static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type)
 {
 	struct fd f = fdget(ufd);
 	struct bpf_prog *prog;
@@ -1062,7 +1065,7 @@ static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type)
 	prog = ____bpf_prog_get(f);
 	if (IS_ERR(prog))
 		return prog;
-	if (type && prog->type != *type) {
+	if (attach_type && (prog->type != *attach_type || prog->aux->offload)) {
 		prog = ERR_PTR(-EINVAL);
 		goto out;
 	}
@@ -1089,7 +1092,7 @@ struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type)
 EXPORT_SYMBOL_GPL(bpf_prog_get_type);
 
 /* last field in 'union bpf_attr' used by this command */
-#define	BPF_PROG_LOAD_LAST_FIELD prog_name
+#define	BPF_PROG_LOAD_LAST_FIELD prog_target_ifindex
 
 static int bpf_prog_load(union bpf_attr *attr)
 {
@@ -1152,6 +1155,12 @@ static int bpf_prog_load(union bpf_attr *attr)
 	atomic_set(&prog->aux->refcnt, 1);
 	prog->gpl_compatible = is_gpl ? 1 : 0;
 
+	if (attr->prog_target_ifindex) {
+		err = bpf_prog_offload_init(prog, attr);
+		if (err)
+			goto free_prog;
+	}
+
 	/* find program type: socket_filter vs tracing_filter */
 	err = find_prog_type(type, prog);
 	if (err < 0)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 04357ad5a812..51aabb32ad67 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3736,10 +3736,13 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 static int ext_analyzer_insn_hook(struct bpf_verifier_env *env,
 				  int insn_idx, int prev_insn_idx)
 {
-	if (!env->analyzer_ops || !env->analyzer_ops->insn_hook)
-		return 0;
+	if (env->analyzer_ops && env->analyzer_ops->insn_hook)
+		return env->analyzer_ops->insn_hook(env, insn_idx,
+						    prev_insn_idx);
+	if (env->dev_ops && env->dev_ops->insn_hook)
+		return env->dev_ops->insn_hook(env, insn_idx, prev_insn_idx);
 
-	return env->analyzer_ops->insn_hook(env, insn_idx, prev_insn_idx);
+	return 0;
 }
 
 static int do_check(struct bpf_verifier_env *env)
@@ -4516,6 +4519,12 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
 	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
 		env->strict_alignment = true;
 
+	if (env->prog->aux->offload) {
+		ret = bpf_prog_offload_verifier_prep(env);
+		if (ret)
+			goto err_unlock;
+	}
+
 	ret = replace_map_fd_with_map_ptr(env);
 	if (ret < 0)
 		goto skip_full_check;
-- 
cgit v1.2.3


From bd601b6ada11fdfb9e277f24ad2eb54bc599156b Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Fri, 3 Nov 2017 13:56:18 -0700
Subject: bpf: report offload info to user space

Extend struct bpf_prog_info to contain information about program
being bound to a device.  Since the netdev may get destroyed while
program still exists we need a flag to indicate the program is
loaded for a device, even if the device is gone.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h      |  1 +
 include/uapi/linux/bpf.h |  6 ++++++
 kernel/bpf/offload.c     | 12 ++++++++++++
 kernel/bpf/syscall.c     |  5 +++++
 4 files changed, 24 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index e45d43f9ec92..98bacd0fa5cc 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -506,6 +506,7 @@ static inline int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu,
 
 int bpf_prog_offload_compile(struct bpf_prog *prog);
 void bpf_prog_offload_destroy(struct bpf_prog *prog);
+u32 bpf_prog_offload_ifindex(struct bpf_prog *prog);
 
 #if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL)
 int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 80d191a93fb0..4455dd195201 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -895,6 +895,10 @@ enum sk_action {
 
 #define BPF_TAG_SIZE	8
 
+enum bpf_prog_status {
+	BPF_PROG_STATUS_DEV_BOUND	= (1 << 0),
+};
+
 struct bpf_prog_info {
 	__u32 type;
 	__u32 id;
@@ -908,6 +912,8 @@ struct bpf_prog_info {
 	__u32 nr_map_ids;
 	__aligned_u64 map_ids;
 	char name[BPF_OBJ_NAME_LEN];
+	__u32 ifindex;
+	__u32 status;
 } __attribute__((aligned(8)));
 
 struct bpf_map_info {
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 5553e0e2f8b1..2816feb38be1 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -144,6 +144,18 @@ int bpf_prog_offload_compile(struct bpf_prog *prog)
 	return bpf_prog_offload_translate(prog);
 }
 
+u32 bpf_prog_offload_ifindex(struct bpf_prog *prog)
+{
+	struct bpf_dev_offload *offload = prog->aux->offload;
+	u32 ifindex;
+
+	rtnl_lock();
+	ifindex = offload->netdev ? offload->netdev->ifindex : 0;
+	rtnl_unlock();
+
+	return ifindex;
+}
+
 const struct bpf_prog_ops bpf_offload_prog_ops = {
 };
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 1574b9f0f24e..3217c20ea91b 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1592,6 +1592,11 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 			return -EFAULT;
 	}
 
+	if (bpf_prog_is_dev_bound(prog->aux)) {
+		info.status |= BPF_PROG_STATUS_DEV_BOUND;
+		info.ifindex = bpf_prog_offload_ifindex(prog);
+	}
+
 done:
 	if (copy_to_user(uinfo, &info, info_len) ||
 	    put_user(info_len, &uattr->info.info_len))
-- 
cgit v1.2.3


From ebc614f687369f9df99828572b1d85a7c2de3d92 Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guro@fb.com>
Date: Sun, 5 Nov 2017 08:15:32 -0500
Subject: bpf, cgroup: implement eBPF-based device controller for cgroup v2

Cgroup v2 lacks the device controller, provided by cgroup v1.
This patch adds a new eBPF program type which, in combination
with the previously added ability to attach multiple eBPF programs
to a cgroup, will provide similar functionality, but with some
additional flexibility.

This patch introduces a BPF_PROG_TYPE_CGROUP_DEVICE program type.
A program takes major and minor device numbers, device type
(block/character) and access type (mknod/read/write) as parameters
and returns an integer which defines if the operation should be
allowed or terminated with -EPERM.

Signed-off-by: Roman Gushchin <guro@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Tejun Heo <tj@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf-cgroup.h     | 15 ++++++++++
 include/linux/bpf_types.h      |  3 ++
 include/linux/device_cgroup.h  |  8 ++++-
 include/uapi/linux/bpf.h       | 15 ++++++++++
 kernel/bpf/cgroup.c            | 67 ++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c           |  7 +++++
 kernel/bpf/verifier.c          |  1 +
 tools/include/uapi/linux/bpf.h | 15 ++++++++++
 8 files changed, 130 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 87a7db9feb38..a7f16e0f8d68 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -67,6 +67,9 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
 				     struct bpf_sock_ops_kern *sock_ops,
 				     enum bpf_attach_type type);
 
+int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
+				      short access, enum bpf_attach_type type);
+
 /* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */
 #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb)			      \
 ({									      \
@@ -112,6 +115,17 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
 	}								       \
 	__ret;								       \
 })
+
+#define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type, major, minor, access)	      \
+({									      \
+	int __ret = 0;							      \
+	if (cgroup_bpf_enabled)						      \
+		__ret = __cgroup_bpf_check_dev_permission(type, major, minor, \
+							  access,	      \
+							  BPF_CGROUP_DEVICE); \
+									      \
+	__ret;								      \
+})
 #else
 
 struct cgroup_bpf {};
@@ -122,6 +136,7 @@ static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; }
 #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; })
 
 #endif /* CONFIG_CGROUP_BPF */
 
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 53c5b9ad7220..978c1d9c9383 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -19,6 +19,9 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe)
 BPF_PROG_TYPE(BPF_PROG_TYPE_TRACEPOINT, tracepoint)
 BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event)
 #endif
+#ifdef CONFIG_CGROUP_BPF
+BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev)
+#endif
 
 BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops)
diff --git a/include/linux/device_cgroup.h b/include/linux/device_cgroup.h
index 2d93d7ecd479..8557efe096dc 100644
--- a/include/linux/device_cgroup.h
+++ b/include/linux/device_cgroup.h
@@ -1,5 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 #include <linux/fs.h>
+#include <linux/bpf-cgroup.h>
 
 #define DEVCG_ACC_MKNOD 1
 #define DEVCG_ACC_READ  2
@@ -19,10 +20,15 @@ static inline int __devcgroup_check_permission(short type, u32 major, u32 minor,
 { return 0; }
 #endif
 
-#ifdef CONFIG_CGROUP_DEVICE
+#if defined(CONFIG_CGROUP_DEVICE) || defined(CONFIG_CGROUP_BPF)
 static inline int devcgroup_check_permission(short type, u32 major, u32 minor,
 					     short access)
 {
+	int rc = BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type, major, minor, access);
+
+	if (rc)
+		return -EPERM;
+
 	return __devcgroup_check_permission(type, major, minor, access);
 }
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 4455dd195201..e880ae6434ee 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -132,6 +132,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_LWT_XMIT,
 	BPF_PROG_TYPE_SOCK_OPS,
 	BPF_PROG_TYPE_SK_SKB,
+	BPF_PROG_TYPE_CGROUP_DEVICE,
 };
 
 enum bpf_attach_type {
@@ -141,6 +142,7 @@ enum bpf_attach_type {
 	BPF_CGROUP_SOCK_OPS,
 	BPF_SK_SKB_STREAM_PARSER,
 	BPF_SK_SKB_STREAM_VERDICT,
+	BPF_CGROUP_DEVICE,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -991,4 +993,17 @@ struct bpf_perf_event_value {
 	__u64 running;
 };
 
+#define BPF_DEVCG_ACC_MKNOD	(1ULL << 0)
+#define BPF_DEVCG_ACC_READ	(1ULL << 1)
+#define BPF_DEVCG_ACC_WRITE	(1ULL << 2)
+
+#define BPF_DEVCG_DEV_BLOCK	(1ULL << 0)
+#define BPF_DEVCG_DEV_CHAR	(1ULL << 1)
+
+struct bpf_cgroup_dev_ctx {
+	__u32 access_type; /* (access << 16) | type */
+	__u32 major;
+	__u32 minor;
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 3db5a17fcfe8..b789ab78d28f 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -522,3 +522,70 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
 	return ret == 1 ? 0 : -EPERM;
 }
 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);
+
+int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
+				      short access, enum bpf_attach_type type)
+{
+	struct cgroup *cgrp;
+	struct bpf_cgroup_dev_ctx ctx = {
+		.access_type = (access << 16) | dev_type,
+		.major = major,
+		.minor = minor,
+	};
+	int allow = 1;
+
+	rcu_read_lock();
+	cgrp = task_dfl_cgroup(current);
+	allow = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx,
+				   BPF_PROG_RUN);
+	rcu_read_unlock();
+
+	return !allow;
+}
+EXPORT_SYMBOL(__cgroup_bpf_check_dev_permission);
+
+static const struct bpf_func_proto *
+cgroup_dev_func_proto(enum bpf_func_id func_id)
+{
+	switch (func_id) {
+	case BPF_FUNC_map_lookup_elem:
+		return &bpf_map_lookup_elem_proto;
+	case BPF_FUNC_map_update_elem:
+		return &bpf_map_update_elem_proto;
+	case BPF_FUNC_map_delete_elem:
+		return &bpf_map_delete_elem_proto;
+	case BPF_FUNC_get_current_uid_gid:
+		return &bpf_get_current_uid_gid_proto;
+	case BPF_FUNC_trace_printk:
+		if (capable(CAP_SYS_ADMIN))
+			return bpf_get_trace_printk_proto();
+	default:
+		return NULL;
+	}
+}
+
+static bool cgroup_dev_is_valid_access(int off, int size,
+				       enum bpf_access_type type,
+				       struct bpf_insn_access_aux *info)
+{
+	if (type == BPF_WRITE)
+		return false;
+
+	if (off < 0 || off + size > sizeof(struct bpf_cgroup_dev_ctx))
+		return false;
+	/* The verifier guarantees that size > 0. */
+	if (off % size != 0)
+		return false;
+	if (size != sizeof(__u32))
+		return false;
+
+	return true;
+}
+
+const struct bpf_prog_ops cg_dev_prog_ops = {
+};
+
+const struct bpf_verifier_ops cg_dev_verifier_ops = {
+	.get_func_proto		= cgroup_dev_func_proto,
+	.is_valid_access	= cgroup_dev_is_valid_access,
+};
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 416d70cdfc76..09badc37e864 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1326,6 +1326,9 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 	case BPF_CGROUP_SOCK_OPS:
 		ptype = BPF_PROG_TYPE_SOCK_OPS;
 		break;
+	case BPF_CGROUP_DEVICE:
+		ptype = BPF_PROG_TYPE_CGROUP_DEVICE;
+		break;
 	case BPF_SK_SKB_STREAM_PARSER:
 	case BPF_SK_SKB_STREAM_VERDICT:
 		return sockmap_get_from_fd(attr, true);
@@ -1378,6 +1381,9 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 	case BPF_CGROUP_SOCK_OPS:
 		ptype = BPF_PROG_TYPE_SOCK_OPS;
 		break;
+	case BPF_CGROUP_DEVICE:
+		ptype = BPF_PROG_TYPE_CGROUP_DEVICE;
+		break;
 	case BPF_SK_SKB_STREAM_PARSER:
 	case BPF_SK_SKB_STREAM_VERDICT:
 		return sockmap_get_from_fd(attr, false);
@@ -1420,6 +1426,7 @@ static int bpf_prog_query(const union bpf_attr *attr,
 	case BPF_CGROUP_INET_EGRESS:
 	case BPF_CGROUP_INET_SOCK_CREATE:
 	case BPF_CGROUP_SOCK_OPS:
+	case BPF_CGROUP_DEVICE:
 		break;
 	default:
 		return -EINVAL;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index add845fe788a..4a942e2e753d 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3124,6 +3124,7 @@ static int check_return_code(struct bpf_verifier_env *env)
 	case BPF_PROG_TYPE_CGROUP_SKB:
 	case BPF_PROG_TYPE_CGROUP_SOCK:
 	case BPF_PROG_TYPE_SOCK_OPS:
+	case BPF_PROG_TYPE_CGROUP_DEVICE:
 		break;
 	default:
 		return 0;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index e92f62cf933a..b280f37cd057 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -131,6 +131,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_LWT_XMIT,
 	BPF_PROG_TYPE_SOCK_OPS,
 	BPF_PROG_TYPE_SK_SKB,
+	BPF_PROG_TYPE_CGROUP_DEVICE,
 };
 
 enum bpf_attach_type {
@@ -140,6 +141,7 @@ enum bpf_attach_type {
 	BPF_CGROUP_SOCK_OPS,
 	BPF_SK_SKB_STREAM_PARSER,
 	BPF_SK_SKB_STREAM_VERDICT,
+	BPF_CGROUP_DEVICE,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -990,4 +992,17 @@ struct bpf_perf_event_value {
 	__u64 running;
 };
 
+#define BPF_DEVCG_ACC_MKNOD	(1ULL << 0)
+#define BPF_DEVCG_ACC_READ	(1ULL << 1)
+#define BPF_DEVCG_ACC_WRITE	(1ULL << 2)
+
+#define BPF_DEVCG_DEV_BLOCK	(1ULL << 0)
+#define BPF_DEVCG_DEV_CHAR	(1ULL << 1)
+
+struct bpf_cgroup_dev_ctx {
+	__u32 access_type; /* (access << 16) | type */
+	__u32 major;
+	__u32 minor;
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
-- 
cgit v1.2.3


From 6f27f4f97ee8cbec99b429b653333f4e781a47a1 Mon Sep 17 00:00:00 2001
From: Felipe Balbi <felipe.balbi@linux.intel.com>
Date: Thu, 2 Nov 2017 10:57:38 +0200
Subject: usb: core: add Status Type definitions

USB 3.1 added a PTM_STATUS type. Let's add a define for it and
following patches will let usb_get_status() accept the new argument.

Signed-off-by: Felipe Balbi <felipe.balbi@linux.intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/core/message.c   | 4 ++--
 include/uapi/linux/usb/ch9.h | 4 ++++
 2 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/usb/core/message.c b/drivers/usb/core/message.c
index 5e8379b42f47..f35cbfa2b87b 100644
--- a/drivers/usb/core/message.c
+++ b/drivers/usb/core/message.c
@@ -946,8 +946,8 @@ int usb_get_status(struct usb_device *dev, int type, int target, void *data)
 		return -ENOMEM;
 
 	ret = usb_control_msg(dev, usb_rcvctrlpipe(dev, 0),
-		USB_REQ_GET_STATUS, USB_DIR_IN | type, 0, target, status,
-		sizeof(*status), USB_CTRL_GET_TIMEOUT);
+		USB_REQ_GET_STATUS, USB_DIR_IN | type, USB_STATUS_TYPE_STANDARD,
+		target, status, sizeof(*status), USB_CTRL_GET_TIMEOUT);
 
 	if (ret == 2) {
 		*(u16 *) data = le16_to_cpu(*status);
diff --git a/include/uapi/linux/usb/ch9.h b/include/uapi/linux/usb/ch9.h
index 2a5d63040a0b..b2167e89ae6e 100644
--- a/include/uapi/linux/usb/ch9.h
+++ b/include/uapi/linux/usb/ch9.h
@@ -143,6 +143,10 @@
 #define	TEST_PACKET	4
 #define	TEST_FORCE_EN	5
 
+/* Status Type */
+#define USB_STATUS_TYPE_STANDARD	0
+#define USB_STATUS_TYPE_PTM		1
+
 /*
  * New Feature Selectors as added by USB 3.0
  * See USB 3.0 spec Table 9-7
-- 
cgit v1.2.3


From 84287bb3285634b60c55c00a1d5ed843b44fde92 Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@quantonium.net>
Date: Sun, 5 Nov 2017 15:58:23 -0800
Subject: ila: add checksum neutral map auto

Add checksum neutral auto that performs checksum neutral mapping
without using the C-bit. This is enabled by configuration of
a mapping.

The checksum neutral function has been split into
ila_csum_do_neutral_fmt and ila_csum_do_neutral_nofmt. The former
handles the C-bit and includes it in the adjustment value. The latter
just sets the adjustment value on the locator diff only.

Added configuration for checksum neutral map auto in ila_lwt
and ila_xlat.

Signed-off-by: Tom Herbert <tom@quantonium.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/ila.h  |  1 +
 net/ipv6/ila/ila_common.c | 65 ++++++++++++++++++++++++++++-------------------
 net/ipv6/ila/ila_lwt.c    | 29 +++++++++++----------
 net/ipv6/ila/ila_xlat.c   | 10 +++++---
 4 files changed, 61 insertions(+), 44 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/ila.h b/include/uapi/linux/ila.h
index f54853288f99..0744881dcef3 100644
--- a/include/uapi/linux/ila.h
+++ b/include/uapi/linux/ila.h
@@ -41,6 +41,7 @@ enum {
 	ILA_CSUM_ADJUST_TRANSPORT,
 	ILA_CSUM_NEUTRAL_MAP,
 	ILA_CSUM_NO_ACTION,
+	ILA_CSUM_NEUTRAL_MAP_AUTO,
 };
 
 #endif /* _UAPI_LINUX_ILA_H */
diff --git a/net/ipv6/ila/ila_common.c b/net/ipv6/ila/ila_common.c
index f1d9248d8b86..8c88ecf29b93 100644
--- a/net/ipv6/ila/ila_common.c
+++ b/net/ipv6/ila/ila_common.c
@@ -37,8 +37,8 @@ static __wsum get_csum_diff(struct ipv6hdr *ip6h, struct ila_params *p)
 	return get_csum_diff_iaddr(ila_a2i(&ip6h->daddr), p);
 }
 
-static void ila_csum_do_neutral(struct ila_addr *iaddr,
-				struct ila_params *p)
+static void ila_csum_do_neutral_fmt(struct ila_addr *iaddr,
+				    struct ila_params *p)
 {
 	__sum16 *adjust = (__force __sum16 *)&iaddr->ident.v16[3];
 	__wsum diff, fval;
@@ -60,13 +60,23 @@ static void ila_csum_do_neutral(struct ila_addr *iaddr,
 	iaddr->ident.csum_neutral ^= 1;
 }
 
-static void ila_csum_adjust_transport(struct sk_buff *skb,
+static void ila_csum_do_neutral_nofmt(struct ila_addr *iaddr,
 				      struct ila_params *p)
 {
+	__sum16 *adjust = (__force __sum16 *)&iaddr->ident.v16[3];
 	__wsum diff;
-	struct ipv6hdr *ip6h = ipv6_hdr(skb);
-	struct ila_addr *iaddr = ila_a2i(&ip6h->daddr);
+
+	diff = get_csum_diff_iaddr(iaddr, p);
+
+	*adjust = ~csum_fold(csum_add(diff, csum_unfold(*adjust)));
+}
+
+static void ila_csum_adjust_transport(struct sk_buff *skb,
+				      struct ila_params *p)
+{
 	size_t nhoff = sizeof(struct ipv6hdr);
+	struct ipv6hdr *ip6h = ipv6_hdr(skb);
+	__wsum diff;
 
 	switch (ip6h->nexthdr) {
 	case NEXTHDR_TCP:
@@ -105,36 +115,39 @@ static void ila_csum_adjust_transport(struct sk_buff *skb,
 		}
 		break;
 	}
-
-	/* Now change destination address */
-	iaddr->loc = p->locator;
 }
 
 void ila_update_ipv6_locator(struct sk_buff *skb, struct ila_params *p,
-			     bool set_csum_neutral)
+			     bool sir2ila)
 {
 	struct ipv6hdr *ip6h = ipv6_hdr(skb);
 	struct ila_addr *iaddr = ila_a2i(&ip6h->daddr);
 
-	/* First deal with the transport checksum */
-	if (ila_csum_neutral_set(iaddr->ident)) {
-		/* C-bit is set in the locator indicating that this
-		 * is a locator being translated to a SIR address.
-		 * Perform (receiver) checksum-neutral translation.
-		 */
-		if (!set_csum_neutral)
-			ila_csum_do_neutral(iaddr, p);
-	} else {
-		switch (p->csum_mode) {
-		case ILA_CSUM_ADJUST_TRANSPORT:
-			ila_csum_adjust_transport(skb, p);
-			break;
-		case ILA_CSUM_NEUTRAL_MAP:
-			ila_csum_do_neutral(iaddr, p);
-			break;
-		case ILA_CSUM_NO_ACTION:
+	switch (p->csum_mode) {
+	case ILA_CSUM_ADJUST_TRANSPORT:
+		ila_csum_adjust_transport(skb, p);
+		break;
+	case ILA_CSUM_NEUTRAL_MAP:
+		if (sir2ila) {
+			if (WARN_ON(ila_csum_neutral_set(iaddr->ident))) {
+				/* Checksum flag should never be
+				 * set in a formatted SIR address.
+				 */
+				break;
+			}
+		} else if (!ila_csum_neutral_set(iaddr->ident)) {
+			/* ILA to SIR translation and C-bit isn't
+			 * set so we're good.
+			 */
 			break;
 		}
+		ila_csum_do_neutral_fmt(iaddr, p);
+		break;
+	case ILA_CSUM_NEUTRAL_MAP_AUTO:
+		ila_csum_do_neutral_nofmt(iaddr, p);
+		break;
+	case ILA_CSUM_NO_ACTION:
+		break;
 	}
 
 	/* Now change destination address */
diff --git a/net/ipv6/ila/ila_lwt.c b/net/ipv6/ila/ila_lwt.c
index 696281b4bca2..104af07d83a6 100644
--- a/net/ipv6/ila/ila_lwt.c
+++ b/net/ipv6/ila/ila_lwt.c
@@ -127,6 +127,7 @@ static int ila_build_state(struct nlattr *nla,
 	struct lwtunnel_state *newts;
 	const struct fib6_config *cfg6 = cfg;
 	struct ila_addr *iaddr;
+	u8 csum_mode = ILA_CSUM_NO_ACTION;
 	int ret;
 
 	if (family != AF_INET6)
@@ -139,15 +140,6 @@ static int ila_build_state(struct nlattr *nla,
 		return -EINVAL;
 	}
 
-	iaddr = (struct ila_addr *)&cfg6->fc_dst;
-
-	if (!ila_addr_is_ila(iaddr) || ila_csum_neutral_set(iaddr->ident)) {
-		/* Don't allow translation for a non-ILA address or checksum
-		 * neutral flag to be set.
-		 */
-		return -EINVAL;
-	}
-
 	ret = nla_parse_nested(tb, ILA_ATTR_MAX, nla, ila_nl_policy, extack);
 	if (ret < 0)
 		return ret;
@@ -155,6 +147,19 @@ static int ila_build_state(struct nlattr *nla,
 	if (!tb[ILA_ATTR_LOCATOR])
 		return -EINVAL;
 
+	iaddr = (struct ila_addr *)&cfg6->fc_dst;
+
+	if (tb[ILA_ATTR_CSUM_MODE])
+		csum_mode = nla_get_u8(tb[ILA_ATTR_CSUM_MODE]);
+
+	if (csum_mode == ILA_CSUM_NEUTRAL_MAP &&
+	    ila_csum_neutral_set(iaddr->ident)) {
+		/* Don't allow translation if checksum neutral bit is
+		 * configured and it's set in the SIR address.
+		 */
+		return -EINVAL;
+	}
+
 	newts = lwtunnel_state_alloc(sizeof(*ilwt));
 	if (!newts)
 		return -ENOMEM;
@@ -168,17 +173,13 @@ static int ila_build_state(struct nlattr *nla,
 
 	p = ila_params_lwtunnel(newts);
 
+	p->csum_mode = csum_mode;
 	p->locator.v64 = (__force __be64)nla_get_u64(tb[ILA_ATTR_LOCATOR]);
 
 	/* Precompute checksum difference for translation since we
 	 * know both the old locator and the new one.
 	 */
 	p->locator_match = iaddr->loc;
-	p->csum_diff = compute_csum_diff8(
-		(__be32 *)&p->locator_match, (__be32 *)&p->locator);
-
-	if (tb[ILA_ATTR_CSUM_MODE])
-		p->csum_mode = nla_get_u8(tb[ILA_ATTR_CSUM_MODE]);
 
 	ila_init_saved_csum(p);
 
diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c
index 3123b9de91b5..213259629e66 100644
--- a/net/ipv6/ila/ila_xlat.c
+++ b/net/ipv6/ila/ila_xlat.c
@@ -138,6 +138,8 @@ static int parse_nl_config(struct genl_info *info,
 
 	if (info->attrs[ILA_ATTR_CSUM_MODE])
 		xp->ip.csum_mode = nla_get_u8(info->attrs[ILA_ATTR_CSUM_MODE]);
+	else
+		xp->ip.csum_mode = ILA_CSUM_NO_ACTION;
 
 	if (info->attrs[ILA_ATTR_IFINDEX])
 		xp->ifindex = nla_get_s32(info->attrs[ILA_ATTR_IFINDEX]);
@@ -198,7 +200,7 @@ static void ila_free_cb(void *ptr, void *arg)
 	}
 }
 
-static int ila_xlat_addr(struct sk_buff *skb, bool set_csum_neutral);
+static int ila_xlat_addr(struct sk_buff *skb, bool sir2ila);
 
 static unsigned int
 ila_nf_input(void *priv,
@@ -396,7 +398,7 @@ static int ila_fill_info(struct ila_map *ila, struct sk_buff *msg)
 			      (__force u64)ila->xp.ip.locator_match.v64,
 			      ILA_ATTR_PAD) ||
 	    nla_put_s32(msg, ILA_ATTR_IFINDEX, ila->xp.ifindex) ||
-	    nla_put_u32(msg, ILA_ATTR_CSUM_MODE, ila->xp.ip.csum_mode))
+	    nla_put_u8(msg, ILA_ATTR_CSUM_MODE, ila->xp.ip.csum_mode))
 		return -1;
 
 	return 0;
@@ -607,7 +609,7 @@ static struct pernet_operations ila_net_ops = {
 	.size = sizeof(struct ila_net),
 };
 
-static int ila_xlat_addr(struct sk_buff *skb, bool set_csum_neutral)
+static int ila_xlat_addr(struct sk_buff *skb, bool sir2ila)
 {
 	struct ila_map *ila;
 	struct ipv6hdr *ip6h = ipv6_hdr(skb);
@@ -626,7 +628,7 @@ static int ila_xlat_addr(struct sk_buff *skb, bool set_csum_neutral)
 
 	ila = ila_lookup_wildcards(iaddr, skb->dev->ifindex, ilan);
 	if (ila)
-		ila_update_ipv6_locator(skb, &ila->xp.ip, set_csum_neutral);
+		ila_update_ipv6_locator(skb, &ila->xp.ip, sir2ila);
 
 	rcu_read_unlock();
 
-- 
cgit v1.2.3


From 70d5aef48a421a68bd9d1bf8f8267af406681580 Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@quantonium.net>
Date: Sun, 5 Nov 2017 15:58:24 -0800
Subject: ila: allow configuration of identifier type

Allow the identifier type to be explicitly configured for a mapping.
This can either be one of the identifier types specified in the
ILA draft or a value of ILA_ATYPE_USE_FORMAT which means the
identifier type is inferred from the identifier type field.
If a value other than ILA_ATYPE_USE_FORMAT is set for a
mapping then it is assumed that the identifier type field is
not present in an identifier.

Signed-off-by: Tom Herbert <tom@quantonium.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/ila.h | 13 ++++++++++++
 net/ipv6/ila/ila.h       | 12 +-----------
 net/ipv6/ila/ila_lwt.c   | 51 +++++++++++++++++++++++++++++++++++++++++-------
 net/ipv6/ila/ila_xlat.c  | 18 ++++++++++++-----
 4 files changed, 71 insertions(+), 23 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/ila.h b/include/uapi/linux/ila.h
index 0744881dcef3..8353c78a7781 100644
--- a/include/uapi/linux/ila.h
+++ b/include/uapi/linux/ila.h
@@ -17,6 +17,7 @@ enum {
 	ILA_ATTR_DIR,				/* u32 */
 	ILA_ATTR_PAD,
 	ILA_ATTR_CSUM_MODE,			/* u8 */
+	ILA_ATTR_IDENT_TYPE,			/* u8 */
 
 	__ILA_ATTR_MAX,
 };
@@ -44,4 +45,16 @@ enum {
 	ILA_CSUM_NEUTRAL_MAP_AUTO,
 };
 
+enum {
+	ILA_ATYPE_IID = 0,
+	ILA_ATYPE_LUID,
+	ILA_ATYPE_VIRT_V4,
+	ILA_ATYPE_VIRT_UNI_V6,
+	ILA_ATYPE_VIRT_MULTI_V6,
+	ILA_ATYPE_NONLOCAL_ADDR,
+	ILA_ATYPE_RSVD_1,
+	ILA_ATYPE_RSVD_2,
+
+	ILA_ATYPE_USE_FORMAT = 32, /* Get type from type field in identifier */
+};
 #endif /* _UAPI_LINUX_ILA_H */
diff --git a/net/ipv6/ila/ila.h b/net/ipv6/ila/ila.h
index e0170f62bc39..3c7a11b62334 100644
--- a/net/ipv6/ila/ila.h
+++ b/net/ipv6/ila/ila.h
@@ -55,17 +55,6 @@ struct ila_identifier {
 	};
 };
 
-enum {
-	ILA_ATYPE_IID = 0,
-	ILA_ATYPE_LUID,
-	ILA_ATYPE_VIRT_V4,
-	ILA_ATYPE_VIRT_UNI_V6,
-	ILA_ATYPE_VIRT_MULTI_V6,
-	ILA_ATYPE_RSVD_1,
-	ILA_ATYPE_RSVD_2,
-	ILA_ATYPE_RSVD_3,
-};
-
 #define CSUM_NEUTRAL_FLAG	htonl(0x10000000)
 
 struct ila_addr {
@@ -93,6 +82,7 @@ struct ila_params {
 	struct ila_locator locator_match;
 	__wsum csum_diff;
 	u8 csum_mode;
+	u8 ident_type;
 };
 
 static inline __wsum compute_csum_diff8(const __be32 *from, const __be32 *to)
diff --git a/net/ipv6/ila/ila_lwt.c b/net/ipv6/ila/ila_lwt.c
index 104af07d83a6..4b97d573f223 100644
--- a/net/ipv6/ila/ila_lwt.c
+++ b/net/ipv6/ila/ila_lwt.c
@@ -114,6 +114,7 @@ drop:
 static const struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = {
 	[ILA_ATTR_LOCATOR] = { .type = NLA_U64, },
 	[ILA_ATTR_CSUM_MODE] = { .type = NLA_U8, },
+	[ILA_ATTR_IDENT_TYPE] = { .type = NLA_U8, },
 };
 
 static int ila_build_state(struct nlattr *nla,
@@ -127,19 +128,14 @@ static int ila_build_state(struct nlattr *nla,
 	struct lwtunnel_state *newts;
 	const struct fib6_config *cfg6 = cfg;
 	struct ila_addr *iaddr;
+	u8 ident_type = ILA_ATYPE_USE_FORMAT;
 	u8 csum_mode = ILA_CSUM_NO_ACTION;
+	u8 eff_ident_type;
 	int ret;
 
 	if (family != AF_INET6)
 		return -EINVAL;
 
-	if (cfg6->fc_dst_len < 8 * sizeof(struct ila_locator) + 3) {
-		/* Need to have full locator and at least type field
-		 * included in destination
-		 */
-		return -EINVAL;
-	}
-
 	ret = nla_parse_nested(tb, ILA_ATTR_MAX, nla, ila_nl_policy, extack);
 	if (ret < 0)
 		return ret;
@@ -149,6 +145,41 @@ static int ila_build_state(struct nlattr *nla,
 
 	iaddr = (struct ila_addr *)&cfg6->fc_dst;
 
+	if (tb[ILA_ATTR_IDENT_TYPE])
+		ident_type = nla_get_u8(tb[ILA_ATTR_IDENT_TYPE]);
+
+	if (ident_type == ILA_ATYPE_USE_FORMAT) {
+		/* Infer identifier type from type field in formatted
+		 * identifier.
+		 */
+
+		if (cfg6->fc_dst_len < 8 * sizeof(struct ila_locator) + 3) {
+			/* Need to have full locator and at least type field
+			 * included in destination
+			 */
+			return -EINVAL;
+		}
+
+		eff_ident_type = iaddr->ident.type;
+	} else {
+		eff_ident_type = ident_type;
+	}
+
+	switch (eff_ident_type) {
+	case ILA_ATYPE_IID:
+		/* Don't allow ILA for IID type */
+		return -EINVAL;
+	case ILA_ATYPE_LUID:
+		break;
+	case ILA_ATYPE_VIRT_V4:
+	case ILA_ATYPE_VIRT_UNI_V6:
+	case ILA_ATYPE_VIRT_MULTI_V6:
+	case ILA_ATYPE_NONLOCAL_ADDR:
+		/* These ILA formats are not supported yet. */
+	default:
+		return -EINVAL;
+	}
+
 	if (tb[ILA_ATTR_CSUM_MODE])
 		csum_mode = nla_get_u8(tb[ILA_ATTR_CSUM_MODE]);
 
@@ -174,6 +205,7 @@ static int ila_build_state(struct nlattr *nla,
 	p = ila_params_lwtunnel(newts);
 
 	p->csum_mode = csum_mode;
+	p->ident_type = ident_type;
 	p->locator.v64 = (__force __be64)nla_get_u64(tb[ILA_ATTR_LOCATOR]);
 
 	/* Precompute checksum difference for translation since we
@@ -208,9 +240,13 @@ static int ila_fill_encap_info(struct sk_buff *skb,
 	if (nla_put_u64_64bit(skb, ILA_ATTR_LOCATOR, (__force u64)p->locator.v64,
 			      ILA_ATTR_PAD))
 		goto nla_put_failure;
+
 	if (nla_put_u8(skb, ILA_ATTR_CSUM_MODE, (__force u8)p->csum_mode))
 		goto nla_put_failure;
 
+	if (nla_put_u8(skb, ILA_ATTR_IDENT_TYPE, (__force u8)p->ident_type))
+		goto nla_put_failure;
+
 	return 0;
 
 nla_put_failure:
@@ -221,6 +257,7 @@ static int ila_encap_nlsize(struct lwtunnel_state *lwtstate)
 {
 	return nla_total_size_64bit(sizeof(u64)) + /* ILA_ATTR_LOCATOR */
 	       nla_total_size(sizeof(u8)) +        /* ILA_ATTR_CSUM_MODE */
+	       nla_total_size(sizeof(u8)) +        /* ILA_ATTR_IDENT_TYPE */
 	       0;
 }
 
diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c
index 213259629e66..6eb5e68f112a 100644
--- a/net/ipv6/ila/ila_xlat.c
+++ b/net/ipv6/ila/ila_xlat.c
@@ -121,6 +121,7 @@ static const struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = {
 	[ILA_ATTR_LOCATOR_MATCH] = { .type = NLA_U64, },
 	[ILA_ATTR_IFINDEX] = { .type = NLA_U32, },
 	[ILA_ATTR_CSUM_MODE] = { .type = NLA_U8, },
+	[ILA_ATTR_IDENT_TYPE] = { .type = NLA_U8, },
 };
 
 static int parse_nl_config(struct genl_info *info,
@@ -141,6 +142,12 @@ static int parse_nl_config(struct genl_info *info,
 	else
 		xp->ip.csum_mode = ILA_CSUM_NO_ACTION;
 
+	if (info->attrs[ILA_ATTR_IDENT_TYPE])
+		xp->ip.ident_type = nla_get_u8(
+				info->attrs[ILA_ATTR_IDENT_TYPE]);
+	else
+		xp->ip.ident_type = ILA_ATYPE_USE_FORMAT;
+
 	if (info->attrs[ILA_ATTR_IFINDEX])
 		xp->ifindex = nla_get_s32(info->attrs[ILA_ATTR_IFINDEX]);
 
@@ -398,7 +405,8 @@ static int ila_fill_info(struct ila_map *ila, struct sk_buff *msg)
 			      (__force u64)ila->xp.ip.locator_match.v64,
 			      ILA_ATTR_PAD) ||
 	    nla_put_s32(msg, ILA_ATTR_IFINDEX, ila->xp.ifindex) ||
-	    nla_put_u8(msg, ILA_ATTR_CSUM_MODE, ila->xp.ip.csum_mode))
+	    nla_put_u8(msg, ILA_ATTR_CSUM_MODE, ila->xp.ip.csum_mode) ||
+	    nla_put_u8(msg, ILA_ATTR_IDENT_TYPE, ila->xp.ip.ident_type))
 		return -1;
 
 	return 0;
@@ -619,10 +627,10 @@ static int ila_xlat_addr(struct sk_buff *skb, bool sir2ila)
 
 	/* Assumes skb contains a valid IPv6 header that is pulled */
 
-	if (!ila_addr_is_ila(iaddr)) {
-		/* Type indicates this is not an ILA address */
-		return 0;
-	}
+	/* No check here that ILA type in the mapping matches what is in the
+	 * address. We assume that whatever sender gave us can be translated.
+	 * The checksum mode however is relevant.
+	 */
 
 	rcu_read_lock();
 
-- 
cgit v1.2.3


From fddb231ebe647749782a9ebf11106a81f7168ba7 Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@quantonium.net>
Date: Sun, 5 Nov 2017 15:58:25 -0800
Subject: ila: Add a hook type for LWT routes

In LWT tunnels both an input and output route method is defined.
If both of these are executed in the same path then double translation
happens and the effect is not correct.

This patch adds a new attribute that indicates the hook type. Two
values are defined for route input and route output. ILA
translation is only done for the one that is set. The default is
to enable ILA on route output.

Signed-off-by: Tom Herbert <tom@quantonium.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/ila.h |  7 +++++++
 net/ipv6/ila/ila_lwt.c   | 39 ++++++++++++++++++++++++++++++++++++---
 2 files changed, 43 insertions(+), 3 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/ila.h b/include/uapi/linux/ila.h
index 8353c78a7781..483b77af4eb8 100644
--- a/include/uapi/linux/ila.h
+++ b/include/uapi/linux/ila.h
@@ -18,6 +18,7 @@ enum {
 	ILA_ATTR_PAD,
 	ILA_ATTR_CSUM_MODE,			/* u8 */
 	ILA_ATTR_IDENT_TYPE,			/* u8 */
+	ILA_ATTR_HOOK_TYPE,			/* u8 */
 
 	__ILA_ATTR_MAX,
 };
@@ -57,4 +58,10 @@ enum {
 
 	ILA_ATYPE_USE_FORMAT = 32, /* Get type from type field in identifier */
 };
+
+enum {
+	ILA_HOOK_ROUTE_OUTPUT,
+	ILA_HOOK_ROUTE_INPUT,
+};
+
 #endif /* _UAPI_LINUX_ILA_H */
diff --git a/net/ipv6/ila/ila_lwt.c b/net/ipv6/ila/ila_lwt.c
index 4b97d573f223..3d56a2fb6f86 100644
--- a/net/ipv6/ila/ila_lwt.c
+++ b/net/ipv6/ila/ila_lwt.c
@@ -20,6 +20,7 @@ struct ila_lwt {
 	struct ila_params p;
 	struct dst_cache dst_cache;
 	u32 connected : 1;
+	u32 lwt_output : 1;
 };
 
 static inline struct ila_lwt *ila_lwt_lwtunnel(
@@ -45,8 +46,10 @@ static int ila_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 	if (skb->protocol != htons(ETH_P_IPV6))
 		goto drop;
 
-	ila_update_ipv6_locator(skb, ila_params_lwtunnel(orig_dst->lwtstate),
-				true);
+	if (ilwt->lwt_output)
+		ila_update_ipv6_locator(skb,
+					ila_params_lwtunnel(orig_dst->lwtstate),
+					true);
 
 	if (rt->rt6i_flags & (RTF_GATEWAY | RTF_CACHE)) {
 		/* Already have a next hop address in route, no need for
@@ -98,11 +101,15 @@ drop:
 static int ila_input(struct sk_buff *skb)
 {
 	struct dst_entry *dst = skb_dst(skb);
+	struct ila_lwt *ilwt = ila_lwt_lwtunnel(dst->lwtstate);
 
 	if (skb->protocol != htons(ETH_P_IPV6))
 		goto drop;
 
-	ila_update_ipv6_locator(skb, ila_params_lwtunnel(dst->lwtstate), false);
+	if (!ilwt->lwt_output)
+		ila_update_ipv6_locator(skb,
+					ila_params_lwtunnel(dst->lwtstate),
+					false);
 
 	return dst->lwtstate->orig_input(skb);
 
@@ -115,6 +122,7 @@ static const struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = {
 	[ILA_ATTR_LOCATOR] = { .type = NLA_U64, },
 	[ILA_ATTR_CSUM_MODE] = { .type = NLA_U8, },
 	[ILA_ATTR_IDENT_TYPE] = { .type = NLA_U8, },
+	[ILA_ATTR_HOOK_TYPE] = { .type = NLA_U8, },
 };
 
 static int ila_build_state(struct nlattr *nla,
@@ -129,7 +137,9 @@ static int ila_build_state(struct nlattr *nla,
 	const struct fib6_config *cfg6 = cfg;
 	struct ila_addr *iaddr;
 	u8 ident_type = ILA_ATYPE_USE_FORMAT;
+	u8 hook_type = ILA_HOOK_ROUTE_OUTPUT;
 	u8 csum_mode = ILA_CSUM_NO_ACTION;
+	bool lwt_output = true;
 	u8 eff_ident_type;
 	int ret;
 
@@ -180,6 +190,20 @@ static int ila_build_state(struct nlattr *nla,
 		return -EINVAL;
 	}
 
+	if (tb[ILA_ATTR_HOOK_TYPE])
+		hook_type = nla_get_u8(tb[ILA_ATTR_HOOK_TYPE]);
+
+	switch (hook_type) {
+	case ILA_HOOK_ROUTE_OUTPUT:
+		lwt_output = true;
+		break;
+	case ILA_HOOK_ROUTE_INPUT:
+		lwt_output = false;
+		break;
+	default:
+		return -EINVAL;
+	}
+
 	if (tb[ILA_ATTR_CSUM_MODE])
 		csum_mode = nla_get_u8(tb[ILA_ATTR_CSUM_MODE]);
 
@@ -202,6 +226,8 @@ static int ila_build_state(struct nlattr *nla,
 		return ret;
 	}
 
+	ilwt->lwt_output = !!lwt_output;
+
 	p = ila_params_lwtunnel(newts);
 
 	p->csum_mode = csum_mode;
@@ -236,6 +262,7 @@ static int ila_fill_encap_info(struct sk_buff *skb,
 			       struct lwtunnel_state *lwtstate)
 {
 	struct ila_params *p = ila_params_lwtunnel(lwtstate);
+	struct ila_lwt *ilwt = ila_lwt_lwtunnel(lwtstate);
 
 	if (nla_put_u64_64bit(skb, ILA_ATTR_LOCATOR, (__force u64)p->locator.v64,
 			      ILA_ATTR_PAD))
@@ -247,6 +274,11 @@ static int ila_fill_encap_info(struct sk_buff *skb,
 	if (nla_put_u8(skb, ILA_ATTR_IDENT_TYPE, (__force u8)p->ident_type))
 		goto nla_put_failure;
 
+	if (nla_put_u8(skb, ILA_ATTR_HOOK_TYPE,
+		       ilwt->lwt_output ? ILA_HOOK_ROUTE_OUTPUT :
+					  ILA_HOOK_ROUTE_INPUT))
+		goto nla_put_failure;
+
 	return 0;
 
 nla_put_failure:
@@ -258,6 +290,7 @@ static int ila_encap_nlsize(struct lwtunnel_state *lwtstate)
 	return nla_total_size_64bit(sizeof(u64)) + /* ILA_ATTR_LOCATOR */
 	       nla_total_size(sizeof(u8)) +        /* ILA_ATTR_CSUM_MODE */
 	       nla_total_size(sizeof(u8)) +        /* ILA_ATTR_IDENT_TYPE */
+	       nla_total_size(sizeof(u8)) +        /* ILA_ATTR_HOOK_TYPE */
 	       0;
 }
 
-- 
cgit v1.2.3


From 602f3baf22188aad24b9a58be3209ab774b97d74 Mon Sep 17 00:00:00 2001
From: Nogah Frankel <nogahf@mellanox.com>
Date: Mon, 6 Nov 2017 07:23:41 +0100
Subject: net_sch: red: Add offload ability to RED qdisc

Add the ability to offload RED qdisc by using ndo_setup_tc.
There are four commands for RED offloading:
* TC_RED_REPLACE: handles set and change.
* TC_RED_DESTROY: handles qdisc destroy.
* TC_RED_STATS: updates the qdisc's counters (given as reference).
* TC_RED_XSTATS: returns red xstats.

Whether RED is offloaded is determined every time the dump action is
called, because a parent change of this qdisc could change its offload
state without requiring any RED function to be called.

Signed-off-by: Nogah Frankel <nogahf@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h      |  1 +
 include/net/pkt_cls.h          | 30 ++++++++++++++++
 include/uapi/linux/pkt_sched.h |  1 +
 net/sched/sch_red.c            | 79 ++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 111 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index fda527ccb263..71968a2ca9f3 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -777,6 +777,7 @@ enum tc_setup_type {
 	TC_SETUP_CLSBPF,
 	TC_SETUP_BLOCK,
 	TC_SETUP_CBS,
+	TC_SETUP_QDISC_RED,
 };
 
 /* These structures hold the attributes of bpf state that are being passed
diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 98fef3221227..03c208d3c922 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -703,4 +703,34 @@ struct tc_cookie {
 	u8  *data;
 	u32 len;
 };
+
+enum tc_red_command {
+	TC_RED_REPLACE,
+	TC_RED_DESTROY,
+	TC_RED_STATS,
+	TC_RED_XSTATS,
+};
+
+struct tc_red_qopt_offload_params {
+	u32 min;
+	u32 max;
+	u32 probability;
+	bool is_ecn;
+};
+struct tc_red_qopt_offload_stats {
+	struct gnet_stats_basic_packed *bstats;
+	struct gnet_stats_queue *qstats;
+};
+
+struct tc_red_qopt_offload {
+	enum tc_red_command command;
+	u32 handle;
+	u32 parent;
+	union {
+		struct tc_red_qopt_offload_params set;
+		struct tc_red_qopt_offload_stats stats;
+		struct red_stats *xstats;
+	};
+};
+
 #endif
diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 5002562868cc..6a2c5ea7e9c4 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -256,6 +256,7 @@ struct tc_red_qopt {
 #define TC_RED_ECN		1
 #define TC_RED_HARDDROP		2
 #define TC_RED_ADAPTATIVE	4
+#define TC_RED_OFFLOADED	8
 };
 
 struct tc_red_xstats {
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index fdfdb56aaae2..007dd8ef8aac 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -19,6 +19,7 @@
 #include <linux/kernel.h>
 #include <linux/skbuff.h>
 #include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
 #include <net/inet_ecn.h>
 #include <net/red.h>
 
@@ -148,11 +149,37 @@ static void red_reset(struct Qdisc *sch)
 	red_restart(&q->vars);
 }
 
+static int red_offload(struct Qdisc *sch, bool enable)
+{
+	struct red_sched_data *q = qdisc_priv(sch);
+	struct net_device *dev = qdisc_dev(sch);
+	struct tc_red_qopt_offload opt = {
+		.handle = sch->handle,
+		.parent = sch->parent,
+	};
+
+	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
+		return -EOPNOTSUPP;
+
+	if (enable) {
+		opt.command = TC_RED_REPLACE;
+		opt.set.min = q->parms.qth_min >> q->parms.Wlog;
+		opt.set.max = q->parms.qth_max >> q->parms.Wlog;
+		opt.set.probability = q->parms.max_P;
+		opt.set.is_ecn = red_use_ecn(q);
+	} else {
+		opt.command = TC_RED_DESTROY;
+	}
+
+	return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_RED, &opt);
+}
+
 static void red_destroy(struct Qdisc *sch)
 {
 	struct red_sched_data *q = qdisc_priv(sch);
 
 	del_timer_sync(&q->adapt_timer);
+	red_offload(sch, false);
 	qdisc_destroy(q->qdisc);
 }
 
@@ -219,6 +246,7 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt)
 		red_start_of_idle_period(&q->vars);
 
 	sch_tree_unlock(sch);
+	red_offload(sch, true);
 	return 0;
 }
 
@@ -244,6 +272,33 @@ static int red_init(struct Qdisc *sch, struct nlattr *opt)
 	return red_change(sch, opt);
 }
 
+static int red_dump_offload(struct Qdisc *sch, struct tc_red_qopt *opt)
+{
+	struct net_device *dev = qdisc_dev(sch);
+	struct tc_red_qopt_offload hw_stats = {
+		.handle = sch->handle,
+		.parent = sch->parent,
+		.command = TC_RED_STATS,
+		.stats.bstats = &sch->bstats,
+		.stats.qstats = &sch->qstats,
+	};
+	int err;
+
+	opt->flags &= ~TC_RED_OFFLOADED;
+	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
+		return 0;
+
+	err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_RED,
+					    &hw_stats);
+	if (err == -EOPNOTSUPP)
+		return 0;
+
+	if (!err)
+		opt->flags |= TC_RED_OFFLOADED;
+
+	return err;
+}
+
 static int red_dump(struct Qdisc *sch, struct sk_buff *skb)
 {
 	struct red_sched_data *q = qdisc_priv(sch);
@@ -257,8 +312,13 @@ static int red_dump(struct Qdisc *sch, struct sk_buff *skb)
 		.Plog		= q->parms.Plog,
 		.Scell_log	= q->parms.Scell_log,
 	};
+	int err;
 
 	sch->qstats.backlog = q->qdisc->qstats.backlog;
+	err = red_dump_offload(sch, &opt);
+	if (err)
+		goto nla_put_failure;
+
 	opts = nla_nest_start(skb, TCA_OPTIONS);
 	if (opts == NULL)
 		goto nla_put_failure;
@@ -275,6 +335,7 @@ nla_put_failure:
 static int red_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
 {
 	struct red_sched_data *q = qdisc_priv(sch);
+	struct net_device *dev = qdisc_dev(sch);
 	struct tc_red_xstats st = {
 		.early	= q->stats.prob_drop + q->stats.forced_drop,
 		.pdrop	= q->stats.pdrop,
@@ -282,6 +343,24 @@ static int red_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
 		.marked	= q->stats.prob_mark + q->stats.forced_mark,
 	};
 
+	if (tc_can_offload(dev) &&  dev->netdev_ops->ndo_setup_tc) {
+		struct red_stats hw_stats = {0};
+		struct tc_red_qopt_offload hw_stats_request = {
+			.handle = sch->handle,
+			.parent = sch->parent,
+			.command = TC_RED_XSTATS,
+			.xstats = &hw_stats,
+		};
+		if (!dev->netdev_ops->ndo_setup_tc(dev,
+						   TC_SETUP_QDISC_RED,
+						   &hw_stats_request)) {
+			st.early += hw_stats.prob_drop + hw_stats.forced_drop;
+			st.pdrop += hw_stats.pdrop;
+			st.other += hw_stats.other;
+			st.marked += hw_stats.prob_mark + hw_stats.forced_mark;
+		}
+	}
+
 	return gnet_stats_copy_app(d, &st, sizeof(st));
 }
 
-- 
cgit v1.2.3


From b2d0f5d5dc53532e6f07bc546a476a55ebdfe0f3 Mon Sep 17 00:00:00 2001
From: Yi Yang <yi.y.yang@intel.com>
Date: Tue, 7 Nov 2017 21:07:02 +0800
Subject: openvswitch: enable NSH support

v16->17
 - Fixed disputed check code: keep them in nsh_push and nsh_pop
   but also add them in __ovs_nla_copy_actions

v15->v16
 - Add csum recalculation for nsh_push, nsh_pop and set_nsh
   pointed out by Pravin
 - Move nsh key into the union with ipv4 and ipv6 and add
   check for nsh key in match_validate pointed out by Pravin
 - Add nsh check in validate_set and __ovs_nla_copy_actions

v14->v15
 - Check size in nsh_hdr_from_nlattr
 - Fixed four small issues pointed out By Jiri and Eric

v13->v14
 - Rename skb_push_nsh to nsh_push per Dave's comment
 - Rename skb_pop_nsh to nsh_pop per Dave's comment

v12->v13
 - Fix NSH header length check in set_nsh

v11->v12
 - Fix missing changes old comments pointed out
 - Fix new comments for v11

v10->v11
 - Fix the left three disputable comments for v9
   but not fixed in v10.

v9->v10
 - Change struct ovs_key_nsh to
       struct ovs_nsh_key_base base;
       __be32 context[NSH_MD1_CONTEXT_SIZE];
 - Fix new comments for v9

v8->v9
 - Fix build error reported by daily intel build
   because nsh module isn't selected by openvswitch

v7->v8
 - Rework nested value and mask for OVS_KEY_ATTR_NSH
 - Change pop_nsh to adapt to nsh kernel module
 - Fix many issues per comments from Jiri Benc

v6->v7
 - Remove NSH GSO patches in v6 because Jiri Benc
   reworked it as another patch series and they have
   been merged.
 - Change it to adapt to nsh kernel module added by NSH
   GSO patch series

v5->v6
 - Fix the rest comments for v4.
 - Add NSH GSO support for VxLAN-gpe + NSH and
   Eth + NSH.

v4->v5
 - Fix many comments by Jiri Benc and Eric Garver
   for v4.

v3->v4
 - Add new NSH match field ttl
 - Update NSH header to the latest format
   which will be final format and won't change
   per its author's confirmation.
 - Fix comments for v3.

v2->v3
 - Change OVS_KEY_ATTR_NSH to nested key to handle
   length-fixed attributes and length-variable
   attribute more flexibly.
 - Remove struct ovs_action_push_nsh completely
 - Add code to handle nested attribute for SET_MASKED
 - Change PUSH_NSH to use the nested OVS_KEY_ATTR_NSH
   to transfer NSH header data.
 - Fix comments and coding style issues by Jiri and Eric

v1->v2
 - Change encap_nsh and decap_nsh to push_nsh and pop_nsh
 - Dynamically allocate struct ovs_action_push_nsh for
   length-variable metadata.

The OVS master and 2.8 branches have merged the NSH userspace
patch series. This patch enables NSH support in the kernel
data path so that OVS can support NSH in compat mode by
porting this.

Signed-off-by: Yi Yang <yi.y.yang@intel.com>
Acked-by: Jiri Benc <jbenc@redhat.com>
Acked-by: Eric Garver <e@erig.me>
Acked-by: Pravin Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/nsh.h                |   3 +
 include/uapi/linux/openvswitch.h |  29 ++++
 net/nsh/nsh.c                    |  60 +++++++
 net/openvswitch/Kconfig          |   1 +
 net/openvswitch/actions.c        | 116 +++++++++++++
 net/openvswitch/flow.c           |  51 ++++++
 net/openvswitch/flow.h           |   7 +
 net/openvswitch/flow_netlink.c   | 343 ++++++++++++++++++++++++++++++++++++++-
 net/openvswitch/flow_netlink.h   |   5 +
 9 files changed, 613 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/net/nsh.h b/include/net/nsh.h
index a1eaea20be96..350b1ad11c7f 100644
--- a/include/net/nsh.h
+++ b/include/net/nsh.h
@@ -304,4 +304,7 @@ static inline void nsh_set_flags_ttl_len(struct nshhdr *nsh, u8 flags,
 			NSH_FLAGS_MASK | NSH_TTL_MASK | NSH_LEN_MASK);
 }
 
+int nsh_push(struct sk_buff *skb, const struct nshhdr *pushed_nh);
+int nsh_pop(struct sk_buff *skb);
+
 #endif /* __NET_NSH_H */
diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index 501e4c4e2a03..ec75a685f1dd 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -336,6 +336,7 @@ enum ovs_key_attr {
 	OVS_KEY_ATTR_CT_LABELS,	/* 16-octet connection tracking label */
 	OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4,   /* struct ovs_key_ct_tuple_ipv4 */
 	OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6,   /* struct ovs_key_ct_tuple_ipv6 */
+	OVS_KEY_ATTR_NSH,       /* Nested set of ovs_nsh_key_* */
 
 #ifdef __KERNEL__
 	OVS_KEY_ATTR_TUNNEL_INFO,  /* struct ip_tunnel_info */
@@ -495,6 +496,30 @@ struct ovs_key_ct_tuple_ipv6 {
 	__u8   ipv6_proto;
 };
 
+enum ovs_nsh_key_attr {
+	OVS_NSH_KEY_ATTR_UNSPEC,
+	OVS_NSH_KEY_ATTR_BASE,  /* struct ovs_nsh_key_base. */
+	OVS_NSH_KEY_ATTR_MD1,   /* struct ovs_nsh_key_md1. */
+	OVS_NSH_KEY_ATTR_MD2,   /* variable-length octets for MD type 2. */
+	__OVS_NSH_KEY_ATTR_MAX
+};
+
+#define OVS_NSH_KEY_ATTR_MAX (__OVS_NSH_KEY_ATTR_MAX - 1)
+
+struct ovs_nsh_key_base {
+	__u8 flags;
+	__u8 ttl;
+	__u8 mdtype;
+	__u8 np;
+	__be32 path_hdr;
+};
+
+#define NSH_MD1_CONTEXT_SIZE 4
+
+struct ovs_nsh_key_md1 {
+	__be32 context[NSH_MD1_CONTEXT_SIZE];
+};
+
 /**
  * enum ovs_flow_attr - attributes for %OVS_FLOW_* commands.
  * @OVS_FLOW_ATTR_KEY: Nested %OVS_KEY_ATTR_* attributes specifying the flow
@@ -811,6 +836,8 @@ struct ovs_action_push_eth {
  * @OVS_ACTION_ATTR_POP_ETH: Pop the outermost Ethernet header off the
  * packet.
  * @OVS_ACTION_ATTR_CT_CLEAR: Clear conntrack state from the packet.
+ * @OVS_ACTION_ATTR_PUSH_NSH: push NSH header to the packet.
+ * @OVS_ACTION_ATTR_POP_NSH: pop the outermost NSH header off the packet.
  *
  * Only a single header can be set with a single %OVS_ACTION_ATTR_SET.  Not all
  * fields within a header are modifiable, e.g. the IPv4 protocol and fragment
@@ -841,6 +868,8 @@ enum ovs_action_attr {
 	OVS_ACTION_ATTR_PUSH_ETH,     /* struct ovs_action_push_eth. */
 	OVS_ACTION_ATTR_POP_ETH,      /* No argument. */
 	OVS_ACTION_ATTR_CT_CLEAR,     /* No argument. */
+	OVS_ACTION_ATTR_PUSH_NSH,     /* Nested OVS_NSH_KEY_ATTR_*. */
+	OVS_ACTION_ATTR_POP_NSH,      /* No argument. */
 
 	__OVS_ACTION_ATTR_MAX,	      /* Nothing past this will be accepted
 				       * from userspace. */
diff --git a/net/nsh/nsh.c b/net/nsh/nsh.c
index 58fb827439a8..d7da99a0b0b8 100644
--- a/net/nsh/nsh.c
+++ b/net/nsh/nsh.c
@@ -14,6 +14,66 @@
 #include <net/nsh.h>
 #include <net/tun_proto.h>
 
+int nsh_push(struct sk_buff *skb, const struct nshhdr *pushed_nh)
+{
+	struct nshhdr *nh;
+	size_t length = nsh_hdr_len(pushed_nh);
+	u8 next_proto;
+
+	if (skb->mac_len) {
+		next_proto = TUN_P_ETHERNET;
+	} else {
+		next_proto = tun_p_from_eth_p(skb->protocol);
+		if (!next_proto)
+			return -EAFNOSUPPORT;
+	}
+
+	/* Add the NSH header */
+	if (skb_cow_head(skb, length) < 0)
+		return -ENOMEM;
+
+	skb_push(skb, length);
+	nh = (struct nshhdr *)(skb->data);
+	memcpy(nh, pushed_nh, length);
+	nh->np = next_proto;
+	skb_postpush_rcsum(skb, nh, length);
+
+	skb->protocol = htons(ETH_P_NSH);
+	skb_reset_mac_header(skb);
+	skb_reset_network_header(skb);
+	skb_reset_mac_len(skb);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nsh_push);
+
+int nsh_pop(struct sk_buff *skb)
+{
+	struct nshhdr *nh;
+	size_t length;
+	__be16 inner_proto;
+
+	if (!pskb_may_pull(skb, NSH_BASE_HDR_LEN))
+		return -ENOMEM;
+	nh = (struct nshhdr *)(skb->data);
+	length = nsh_hdr_len(nh);
+	inner_proto = tun_p_to_eth_p(nh->np);
+	if (!pskb_may_pull(skb, length))
+		return -ENOMEM;
+
+	if (!inner_proto)
+		return -EAFNOSUPPORT;
+
+	skb_pull_rcsum(skb, length);
+	skb_reset_mac_header(skb);
+	skb_reset_network_header(skb);
+	skb_reset_mac_len(skb);
+	skb->protocol = inner_proto;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nsh_pop);
+
 static struct sk_buff *nsh_gso_segment(struct sk_buff *skb,
 				       netdev_features_t features)
 {
diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig
index ce947292ae77..2650205cdaf9 100644
--- a/net/openvswitch/Kconfig
+++ b/net/openvswitch/Kconfig
@@ -14,6 +14,7 @@ config OPENVSWITCH
 	select MPLS
 	select NET_MPLS_GSO
 	select DST_CACHE
+	select NET_NSH
 	---help---
 	  Open vSwitch is a multilayer Ethernet switch targeted at virtualized
 	  environments.  In addition to supporting a variety of features
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index a551232daf61..9a6a6d51e421 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -43,6 +43,7 @@
 #include "flow.h"
 #include "conntrack.h"
 #include "vport.h"
+#include "flow_netlink.h"
 
 struct deferred_action {
 	struct sk_buff *skb;
@@ -380,6 +381,38 @@ static int push_eth(struct sk_buff *skb, struct sw_flow_key *key,
 	return 0;
 }
 
+static int push_nsh(struct sk_buff *skb, struct sw_flow_key *key,
+		    const struct nshhdr *nh)
+{
+	int err;
+
+	err = nsh_push(skb, nh);
+	if (err)
+		return err;
+
+	/* safe right before invalidate_flow_key */
+	key->mac_proto = MAC_PROTO_NONE;
+	invalidate_flow_key(key);
+	return 0;
+}
+
+static int pop_nsh(struct sk_buff *skb, struct sw_flow_key *key)
+{
+	int err;
+
+	err = nsh_pop(skb);
+	if (err)
+		return err;
+
+	/* safe right before invalidate_flow_key */
+	if (skb->protocol == htons(ETH_P_TEB))
+		key->mac_proto = MAC_PROTO_ETHERNET;
+	else
+		key->mac_proto = MAC_PROTO_NONE;
+	invalidate_flow_key(key);
+	return 0;
+}
+
 static void update_ip_l4_checksum(struct sk_buff *skb, struct iphdr *nh,
 				  __be32 addr, __be32 new_addr)
 {
@@ -602,6 +635,69 @@ static int set_ipv6(struct sk_buff *skb, struct sw_flow_key *flow_key,
 	return 0;
 }
 
+static int set_nsh(struct sk_buff *skb, struct sw_flow_key *flow_key,
+		   const struct nlattr *a)
+{
+	struct nshhdr *nh;
+	size_t length;
+	int err;
+	u8 flags;
+	u8 ttl;
+	int i;
+
+	struct ovs_key_nsh key;
+	struct ovs_key_nsh mask;
+
+	err = nsh_key_from_nlattr(a, &key, &mask);
+	if (err)
+		return err;
+
+	/* Make sure the NSH base header is there */
+	if (!pskb_may_pull(skb, skb_network_offset(skb) + NSH_BASE_HDR_LEN))
+		return -ENOMEM;
+
+	nh = nsh_hdr(skb);
+	length = nsh_hdr_len(nh);
+
+	/* Make sure the whole NSH header is there */
+	err = skb_ensure_writable(skb, skb_network_offset(skb) +
+				       length);
+	if (unlikely(err))
+		return err;
+
+	nh = nsh_hdr(skb);
+	skb_postpull_rcsum(skb, nh, length);
+	flags = nsh_get_flags(nh);
+	flags = OVS_MASKED(flags, key.base.flags, mask.base.flags);
+	flow_key->nsh.base.flags = flags;
+	ttl = nsh_get_ttl(nh);
+	ttl = OVS_MASKED(ttl, key.base.ttl, mask.base.ttl);
+	flow_key->nsh.base.ttl = ttl;
+	nsh_set_flags_and_ttl(nh, flags, ttl);
+	nh->path_hdr = OVS_MASKED(nh->path_hdr, key.base.path_hdr,
+				  mask.base.path_hdr);
+	flow_key->nsh.base.path_hdr = nh->path_hdr;
+	switch (nh->mdtype) {
+	case NSH_M_TYPE1:
+		for (i = 0; i < NSH_MD1_CONTEXT_SIZE; i++) {
+			nh->md1.context[i] =
+			    OVS_MASKED(nh->md1.context[i], key.context[i],
+				       mask.context[i]);
+		}
+		memcpy(flow_key->nsh.context, nh->md1.context,
+		       sizeof(nh->md1.context));
+		break;
+	case NSH_M_TYPE2:
+		memset(flow_key->nsh.context, 0,
+		       sizeof(flow_key->nsh.context));
+		break;
+	default:
+		return -EINVAL;
+	}
+	skb_postpush_rcsum(skb, nh, length);
+	return 0;
+}
+
 /* Must follow skb_ensure_writable() since that can move the skb data. */
 static void set_tp_port(struct sk_buff *skb, __be16 *port,
 			__be16 new_port, __sum16 *check)
@@ -1024,6 +1120,10 @@ static int execute_masked_set_action(struct sk_buff *skb,
 				   get_mask(a, struct ovs_key_ethernet *));
 		break;
 
+	case OVS_KEY_ATTR_NSH:
+		err = set_nsh(skb, flow_key, a);
+		break;
+
 	case OVS_KEY_ATTR_IPV4:
 		err = set_ipv4(skb, flow_key, nla_data(a),
 			       get_mask(a, struct ovs_key_ipv4 *));
@@ -1214,6 +1314,22 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
 		case OVS_ACTION_ATTR_POP_ETH:
 			err = pop_eth(skb, key);
 			break;
+
+		case OVS_ACTION_ATTR_PUSH_NSH: {
+			u8 buffer[NSH_HDR_MAX_LEN];
+			struct nshhdr *nh = (struct nshhdr *)buffer;
+
+			err = nsh_hdr_from_nlattr(nla_data(a), nh,
+						  NSH_HDR_MAX_LEN);
+			if (unlikely(err))
+				break;
+			err = push_nsh(skb, key, nh);
+			break;
+		}
+
+		case OVS_ACTION_ATTR_POP_NSH:
+			err = pop_nsh(skb, key);
+			break;
 		}
 
 		if (unlikely(err)) {
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index 8c94cef25a72..864ddb1e3642 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -46,6 +46,7 @@
 #include <net/ipv6.h>
 #include <net/mpls.h>
 #include <net/ndisc.h>
+#include <net/nsh.h>
 
 #include "conntrack.h"
 #include "datapath.h"
@@ -490,6 +491,52 @@ invalid:
 	return 0;
 }
 
+static int parse_nsh(struct sk_buff *skb, struct sw_flow_key *key)
+{
+	struct nshhdr *nh;
+	unsigned int nh_ofs = skb_network_offset(skb);
+	u8 version, length;
+	int err;
+
+	err = check_header(skb, nh_ofs + NSH_BASE_HDR_LEN);
+	if (unlikely(err))
+		return err;
+
+	nh = nsh_hdr(skb);
+	version = nsh_get_ver(nh);
+	length = nsh_hdr_len(nh);
+
+	if (version != 0)
+		return -EINVAL;
+
+	err = check_header(skb, nh_ofs + length);
+	if (unlikely(err))
+		return err;
+
+	nh = nsh_hdr(skb);
+	key->nsh.base.flags = nsh_get_flags(nh);
+	key->nsh.base.ttl = nsh_get_ttl(nh);
+	key->nsh.base.mdtype = nh->mdtype;
+	key->nsh.base.np = nh->np;
+	key->nsh.base.path_hdr = nh->path_hdr;
+	switch (key->nsh.base.mdtype) {
+	case NSH_M_TYPE1:
+		if (length != NSH_M_TYPE1_LEN)
+			return -EINVAL;
+		memcpy(key->nsh.context, nh->md1.context,
+		       sizeof(nh->md1));
+		break;
+	case NSH_M_TYPE2:
+		memset(key->nsh.context, 0,
+		       sizeof(nh->md1));
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 /**
  * key_extract - extracts a flow key from an Ethernet frame.
  * @skb: sk_buff that contains the frame, with skb->data pointing to the
@@ -735,6 +782,10 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
 				memset(&key->tp, 0, sizeof(key->tp));
 			}
 		}
+	} else if (key->eth.type == htons(ETH_P_NSH)) {
+		error = parse_nsh(skb, key);
+		if (error)
+			return error;
 	}
 	return 0;
 }
diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
index 1875bba4f865..c670dd24b8b7 100644
--- a/net/openvswitch/flow.h
+++ b/net/openvswitch/flow.h
@@ -35,6 +35,7 @@
 #include <net/inet_ecn.h>
 #include <net/ip_tunnels.h>
 #include <net/dst_metadata.h>
+#include <net/nsh.h>
 
 struct sk_buff;
 
@@ -66,6 +67,11 @@ struct vlan_head {
 	(offsetof(struct sw_flow_key, recirc_id) +	\
 	FIELD_SIZEOF(struct sw_flow_key, recirc_id))
 
+struct ovs_key_nsh {
+	struct ovs_nsh_key_base base;
+	__be32 context[NSH_MD1_CONTEXT_SIZE];
+};
+
 struct sw_flow_key {
 	u8 tun_opts[IP_TUNNEL_OPTS_MAX];
 	u8 tun_opts_len;
@@ -143,6 +149,7 @@ struct sw_flow_key {
 				} nd;
 			};
 		} ipv6;
+		struct ovs_key_nsh nsh;         /* network service header */
 	};
 	struct {
 		/* Connection tracking fields not packed above. */
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index dc0d79092e74..4201f9293af3 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -48,6 +48,7 @@
 #include <net/ndisc.h>
 #include <net/mpls.h>
 #include <net/vxlan.h>
+#include <net/tun_proto.h>
 #include <net/erspan.h>
 
 #include "flow_netlink.h"
@@ -80,9 +81,11 @@ static bool actions_may_change_flow(const struct nlattr *actions)
 		case OVS_ACTION_ATTR_HASH:
 		case OVS_ACTION_ATTR_POP_ETH:
 		case OVS_ACTION_ATTR_POP_MPLS:
+		case OVS_ACTION_ATTR_POP_NSH:
 		case OVS_ACTION_ATTR_POP_VLAN:
 		case OVS_ACTION_ATTR_PUSH_ETH:
 		case OVS_ACTION_ATTR_PUSH_MPLS:
+		case OVS_ACTION_ATTR_PUSH_NSH:
 		case OVS_ACTION_ATTR_PUSH_VLAN:
 		case OVS_ACTION_ATTR_SAMPLE:
 		case OVS_ACTION_ATTR_SET:
@@ -175,7 +178,8 @@ static bool match_validate(const struct sw_flow_match *match,
 			| (1 << OVS_KEY_ATTR_ICMPV6)
 			| (1 << OVS_KEY_ATTR_ARP)
 			| (1 << OVS_KEY_ATTR_ND)
-			| (1 << OVS_KEY_ATTR_MPLS));
+			| (1 << OVS_KEY_ATTR_MPLS)
+			| (1 << OVS_KEY_ATTR_NSH));
 
 	/* Always allowed mask fields. */
 	mask_allowed |= ((1 << OVS_KEY_ATTR_TUNNEL)
@@ -284,6 +288,14 @@ static bool match_validate(const struct sw_flow_match *match,
 		}
 	}
 
+	if (match->key->eth.type == htons(ETH_P_NSH)) {
+		key_expected |= 1 << OVS_KEY_ATTR_NSH;
+		if (match->mask &&
+		    match->mask->key.eth.type == htons(0xffff)) {
+			mask_allowed |= 1 << OVS_KEY_ATTR_NSH;
+		}
+	}
+
 	if ((key_attrs & key_expected) != key_expected) {
 		/* Key attributes check failed. */
 		OVS_NLERR(log, "Missing key (keys=%llx, expected=%llx)",
@@ -325,12 +337,25 @@ size_t ovs_tun_key_attr_size(void)
 		+ nla_total_size(4);   /* OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS */
 }
 
+size_t ovs_nsh_key_attr_size(void)
+{
+	/* Whenever adding new OVS_NSH_KEY_ FIELDS, we should consider
+	 * updating this function.
+	 */
+	return  nla_total_size(NSH_BASE_HDR_LEN) /* OVS_NSH_KEY_ATTR_BASE */
+		/* OVS_NSH_KEY_ATTR_MD1 and OVS_NSH_KEY_ATTR_MD2 are
+		 * mutually exclusive, so the bigger one can cover
+		 * the small one.
+		 */
+		+ nla_total_size(NSH_CTX_HDRS_MAX_LEN);
+}
+
 size_t ovs_key_attr_size(void)
 {
 	/* Whenever adding new OVS_KEY_ FIELDS, we should consider
 	 * updating this function.
 	 */
-	BUILD_BUG_ON(OVS_KEY_ATTR_TUNNEL_INFO != 28);
+	BUILD_BUG_ON(OVS_KEY_ATTR_TUNNEL_INFO != 29);
 
 	return    nla_total_size(4)   /* OVS_KEY_ATTR_PRIORITY */
 		+ nla_total_size(0)   /* OVS_KEY_ATTR_TUNNEL */
@@ -344,6 +369,8 @@ size_t ovs_key_attr_size(void)
 		+ nla_total_size(4)   /* OVS_KEY_ATTR_CT_MARK */
 		+ nla_total_size(16)  /* OVS_KEY_ATTR_CT_LABELS */
 		+ nla_total_size(40)  /* OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6 */
+		+ nla_total_size(0)   /* OVS_KEY_ATTR_NSH */
+		  + ovs_nsh_key_attr_size()
 		+ nla_total_size(12)  /* OVS_KEY_ATTR_ETHERNET */
 		+ nla_total_size(2)   /* OVS_KEY_ATTR_ETHERTYPE */
 		+ nla_total_size(4)   /* OVS_KEY_ATTR_VLAN */
@@ -377,6 +404,13 @@ static const struct ovs_len_tbl ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1]
 	[OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS]   = { .len = sizeof(u32) },
 };
 
+static const struct ovs_len_tbl
+ovs_nsh_key_attr_lens[OVS_NSH_KEY_ATTR_MAX + 1] = {
+	[OVS_NSH_KEY_ATTR_BASE] = { .len = sizeof(struct ovs_nsh_key_base) },
+	[OVS_NSH_KEY_ATTR_MD1]  = { .len = sizeof(struct ovs_nsh_key_md1) },
+	[OVS_NSH_KEY_ATTR_MD2]  = { .len = OVS_ATTR_VARIABLE },
+};
+
 /* The size of the argument for each %OVS_KEY_ATTR_* Netlink attribute.  */
 static const struct ovs_len_tbl ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = {
 	[OVS_KEY_ATTR_ENCAP]	 = { .len = OVS_ATTR_NESTED },
@@ -409,6 +443,8 @@ static const struct ovs_len_tbl ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = {
 		.len = sizeof(struct ovs_key_ct_tuple_ipv4) },
 	[OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6] = {
 		.len = sizeof(struct ovs_key_ct_tuple_ipv6) },
+	[OVS_KEY_ATTR_NSH]       = { .len = OVS_ATTR_NESTED,
+				     .next = ovs_nsh_key_attr_lens, },
 };
 
 static bool check_attr_len(unsigned int attr_len, unsigned int expected_len)
@@ -1227,6 +1263,221 @@ static int metadata_from_nlattrs(struct net *net, struct sw_flow_match *match,
 	return 0;
 }
 
+int nsh_hdr_from_nlattr(const struct nlattr *attr,
+			struct nshhdr *nh, size_t size)
+{
+	struct nlattr *a;
+	int rem;
+	u8 flags = 0;
+	u8 ttl = 0;
+	int mdlen = 0;
+
+	/* validate_nsh has check this, so we needn't do duplicate check here
+	 */
+	if (size < NSH_BASE_HDR_LEN)
+		return -ENOBUFS;
+
+	nla_for_each_nested(a, attr, rem) {
+		int type = nla_type(a);
+
+		switch (type) {
+		case OVS_NSH_KEY_ATTR_BASE: {
+			const struct ovs_nsh_key_base *base = nla_data(a);
+
+			flags = base->flags;
+			ttl = base->ttl;
+			nh->np = base->np;
+			nh->mdtype = base->mdtype;
+			nh->path_hdr = base->path_hdr;
+			break;
+		}
+		case OVS_NSH_KEY_ATTR_MD1:
+			mdlen = nla_len(a);
+			if (mdlen > size - NSH_BASE_HDR_LEN)
+				return -ENOBUFS;
+			memcpy(&nh->md1, nla_data(a), mdlen);
+			break;
+
+		case OVS_NSH_KEY_ATTR_MD2:
+			mdlen = nla_len(a);
+			if (mdlen > size - NSH_BASE_HDR_LEN)
+				return -ENOBUFS;
+			memcpy(&nh->md2, nla_data(a), mdlen);
+			break;
+
+		default:
+			return -EINVAL;
+		}
+	}
+
+	/* nsh header length  = NSH_BASE_HDR_LEN + mdlen */
+	nh->ver_flags_ttl_len = 0;
+	nsh_set_flags_ttl_len(nh, flags, ttl, NSH_BASE_HDR_LEN + mdlen);
+
+	return 0;
+}
+
+int nsh_key_from_nlattr(const struct nlattr *attr,
+			struct ovs_key_nsh *nsh, struct ovs_key_nsh *nsh_mask)
+{
+	struct nlattr *a;
+	int rem;
+
+	/* validate_nsh has check this, so we needn't do duplicate check here
+	 */
+	nla_for_each_nested(a, attr, rem) {
+		int type = nla_type(a);
+
+		switch (type) {
+		case OVS_NSH_KEY_ATTR_BASE: {
+			const struct ovs_nsh_key_base *base = nla_data(a);
+			const struct ovs_nsh_key_base *base_mask = base + 1;
+
+			nsh->base = *base;
+			nsh_mask->base = *base_mask;
+			break;
+		}
+		case OVS_NSH_KEY_ATTR_MD1: {
+			const struct ovs_nsh_key_md1 *md1 = nla_data(a);
+			const struct ovs_nsh_key_md1 *md1_mask = md1 + 1;
+
+			memcpy(nsh->context, md1->context, sizeof(*md1));
+			memcpy(nsh_mask->context, md1_mask->context,
+			       sizeof(*md1_mask));
+			break;
+		}
+		case OVS_NSH_KEY_ATTR_MD2:
+			/* Not supported yet */
+			return -ENOTSUPP;
+		default:
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
+static int nsh_key_put_from_nlattr(const struct nlattr *attr,
+				   struct sw_flow_match *match, bool is_mask,
+				   bool is_push_nsh, bool log)
+{
+	struct nlattr *a;
+	int rem;
+	bool has_base = false;
+	bool has_md1 = false;
+	bool has_md2 = false;
+	u8 mdtype = 0;
+	int mdlen = 0;
+
+	if (WARN_ON(is_push_nsh && is_mask))
+		return -EINVAL;
+
+	nla_for_each_nested(a, attr, rem) {
+		int type = nla_type(a);
+		int i;
+
+		if (type > OVS_NSH_KEY_ATTR_MAX) {
+			OVS_NLERR(log, "nsh attr %d is out of range max %d",
+				  type, OVS_NSH_KEY_ATTR_MAX);
+			return -EINVAL;
+		}
+
+		if (!check_attr_len(nla_len(a),
+				    ovs_nsh_key_attr_lens[type].len)) {
+			OVS_NLERR(
+			    log,
+			    "nsh attr %d has unexpected len %d expected %d",
+			    type,
+			    nla_len(a),
+			    ovs_nsh_key_attr_lens[type].len
+			);
+			return -EINVAL;
+		}
+
+		switch (type) {
+		case OVS_NSH_KEY_ATTR_BASE: {
+			const struct ovs_nsh_key_base *base = nla_data(a);
+
+			has_base = true;
+			mdtype = base->mdtype;
+			SW_FLOW_KEY_PUT(match, nsh.base.flags,
+					base->flags, is_mask);
+			SW_FLOW_KEY_PUT(match, nsh.base.ttl,
+					base->ttl, is_mask);
+			SW_FLOW_KEY_PUT(match, nsh.base.mdtype,
+					base->mdtype, is_mask);
+			SW_FLOW_KEY_PUT(match, nsh.base.np,
+					base->np, is_mask);
+			SW_FLOW_KEY_PUT(match, nsh.base.path_hdr,
+					base->path_hdr, is_mask);
+			break;
+		}
+		case OVS_NSH_KEY_ATTR_MD1: {
+			const struct ovs_nsh_key_md1 *md1 = nla_data(a);
+
+			has_md1 = true;
+			for (i = 0; i < NSH_MD1_CONTEXT_SIZE; i++)
+				SW_FLOW_KEY_PUT(match, nsh.context[i],
+						md1->context[i], is_mask);
+			break;
+		}
+		case OVS_NSH_KEY_ATTR_MD2:
+			if (!is_push_nsh) /* Not supported MD type 2 yet */
+				return -ENOTSUPP;
+
+			has_md2 = true;
+			mdlen = nla_len(a);
+			if (mdlen > NSH_CTX_HDRS_MAX_LEN || mdlen <= 0) {
+				OVS_NLERR(
+				    log,
+				    "Invalid MD length %d for MD type %d",
+				    mdlen,
+				    mdtype
+				);
+				return -EINVAL;
+			}
+			break;
+		default:
+			OVS_NLERR(log, "Unknown nsh attribute %d",
+				  type);
+			return -EINVAL;
+		}
+	}
+
+	if (rem > 0) {
+		OVS_NLERR(log, "nsh attribute has %d unknown bytes.", rem);
+		return -EINVAL;
+	}
+
+	if (has_md1 && has_md2) {
+		OVS_NLERR(
+		    1,
+		    "invalid nsh attribute: md1 and md2 are exclusive."
+		);
+		return -EINVAL;
+	}
+
+	if (!is_mask) {
+		if ((has_md1 && mdtype != NSH_M_TYPE1) ||
+		    (has_md2 && mdtype != NSH_M_TYPE2)) {
+			OVS_NLERR(1, "nsh attribute has unmatched MD type %d.",
+				  mdtype);
+			return -EINVAL;
+		}
+
+		if (is_push_nsh &&
+		    (!has_base || (!has_md1 && !has_md2))) {
+			OVS_NLERR(
+			    1,
+			    "push_nsh: missing base or metadata attributes"
+			);
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
 static int ovs_key_from_nlattrs(struct net *net, struct sw_flow_match *match,
 				u64 attrs, const struct nlattr **a,
 				bool is_mask, bool log)
@@ -1354,6 +1605,13 @@ static int ovs_key_from_nlattrs(struct net *net, struct sw_flow_match *match,
 		attrs &= ~(1 << OVS_KEY_ATTR_ARP);
 	}
 
+	if (attrs & (1 << OVS_KEY_ATTR_NSH)) {
+		if (nsh_key_put_from_nlattr(a[OVS_KEY_ATTR_NSH], match,
+					    is_mask, false, log) < 0)
+			return -EINVAL;
+		attrs &= ~(1 << OVS_KEY_ATTR_NSH);
+	}
+
 	if (attrs & (1 << OVS_KEY_ATTR_MPLS)) {
 		const struct ovs_key_mpls *mpls_key;
 
@@ -1670,6 +1928,34 @@ static int ovs_nla_put_vlan(struct sk_buff *skb, const struct vlan_head *vh,
 	return 0;
 }
 
+static int nsh_key_to_nlattr(const struct ovs_key_nsh *nsh, bool is_mask,
+			     struct sk_buff *skb)
+{
+	struct nlattr *start;
+
+	start = nla_nest_start(skb, OVS_KEY_ATTR_NSH);
+	if (!start)
+		return -EMSGSIZE;
+
+	if (nla_put(skb, OVS_NSH_KEY_ATTR_BASE, sizeof(nsh->base), &nsh->base))
+		goto nla_put_failure;
+
+	if (is_mask || nsh->base.mdtype == NSH_M_TYPE1) {
+		if (nla_put(skb, OVS_NSH_KEY_ATTR_MD1,
+			    sizeof(nsh->context), nsh->context))
+			goto nla_put_failure;
+	}
+
+	/* Don't support MD type 2 yet */
+
+	nla_nest_end(skb, start);
+
+	return 0;
+
+nla_put_failure:
+	return -EMSGSIZE;
+}
+
 static int __ovs_nla_put_key(const struct sw_flow_key *swkey,
 			     const struct sw_flow_key *output, bool is_mask,
 			     struct sk_buff *skb)
@@ -1798,6 +2084,9 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey,
 		ipv6_key->ipv6_tclass = output->ip.tos;
 		ipv6_key->ipv6_hlimit = output->ip.ttl;
 		ipv6_key->ipv6_frag = output->ip.frag;
+	} else if (swkey->eth.type == htons(ETH_P_NSH)) {
+		if (nsh_key_to_nlattr(&output->nsh, is_mask, skb))
+			goto nla_put_failure;
 	} else if (swkey->eth.type == htons(ETH_P_ARP) ||
 		   swkey->eth.type == htons(ETH_P_RARP)) {
 		struct ovs_key_arp *arp_key;
@@ -2292,6 +2581,19 @@ static int validate_and_copy_set_tun(const struct nlattr *attr,
 	return err;
 }
 
+static bool validate_nsh(const struct nlattr *attr, bool is_mask,
+			 bool is_push_nsh, bool log)
+{
+	struct sw_flow_match match;
+	struct sw_flow_key key;
+	int ret = 0;
+
+	ovs_match_init(&match, &key, true, NULL);
+	ret = nsh_key_put_from_nlattr(attr, &match, is_mask,
+				      is_push_nsh, log);
+	return !ret;
+}
+
 /* Return false if there are any non-masked bits set.
  * Mask follows data immediately, before any netlink padding.
  */
@@ -2434,6 +2736,13 @@ static int validate_set(const struct nlattr *a,
 
 		break;
 
+	case OVS_KEY_ATTR_NSH:
+		if (eth_type != htons(ETH_P_NSH))
+			return -EINVAL;
+		if (!validate_nsh(nla_data(a), masked, false, log))
+			return -EINVAL;
+		break;
+
 	default:
 		return -EINVAL;
 	}
@@ -2533,6 +2842,8 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
 			[OVS_ACTION_ATTR_TRUNC] = sizeof(struct ovs_action_trunc),
 			[OVS_ACTION_ATTR_PUSH_ETH] = sizeof(struct ovs_action_push_eth),
 			[OVS_ACTION_ATTR_POP_ETH] = 0,
+			[OVS_ACTION_ATTR_PUSH_NSH] = (u32)-1,
+			[OVS_ACTION_ATTR_POP_NSH] = 0,
 		};
 		const struct ovs_action_push_vlan *vlan;
 		int type = nla_type(a);
@@ -2690,6 +3001,34 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
 			mac_proto = MAC_PROTO_ETHERNET;
 			break;
 
+		case OVS_ACTION_ATTR_PUSH_NSH:
+			if (mac_proto != MAC_PROTO_ETHERNET) {
+				u8 next_proto;
+
+				next_proto = tun_p_from_eth_p(eth_type);
+				if (!next_proto)
+					return -EINVAL;
+			}
+			mac_proto = MAC_PROTO_NONE;
+			if (!validate_nsh(nla_data(a), false, true, true))
+				return -EINVAL;
+			break;
+
+		case OVS_ACTION_ATTR_POP_NSH: {
+			__be16 inner_proto;
+
+			if (eth_type != htons(ETH_P_NSH))
+				return -EINVAL;
+			inner_proto = tun_p_to_eth_p(key->nsh.base.np);
+			if (!inner_proto)
+				return -EINVAL;
+			if (key->nsh.base.np == TUN_P_ETHERNET)
+				mac_proto = MAC_PROTO_ETHERNET;
+			else
+				mac_proto = MAC_PROTO_NONE;
+			break;
+		}
+
 		default:
 			OVS_NLERR(log, "Unknown Action type %d", type);
 			return -EINVAL;
diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h
index 929c665ac3aa..6657606b2b47 100644
--- a/net/openvswitch/flow_netlink.h
+++ b/net/openvswitch/flow_netlink.h
@@ -79,4 +79,9 @@ int ovs_nla_put_actions(const struct nlattr *attr,
 void ovs_nla_free_flow_actions(struct sw_flow_actions *);
 void ovs_nla_free_flow_actions_rcu(struct sw_flow_actions *);
 
+int nsh_key_from_nlattr(const struct nlattr *attr, struct ovs_key_nsh *nsh,
+			struct ovs_key_nsh *nsh_mask);
+int nsh_hdr_from_nlattr(const struct nlattr *attr, struct nshhdr *nh,
+			size_t size);
+
 #endif /* flow_netlink.h */
-- 
cgit v1.2.3


From 096b85464832d2a7bd7bd6d4db2fafed2ab77244 Mon Sep 17 00:00:00 2001
From: Matthew Garrett <mjg59@google.com>
Date: Fri, 13 Oct 2017 15:09:25 -0700
Subject: EVM: Include security.apparmor in EVM measurements

Apparmor will be gaining support for security.apparmor labels, and it
would be helpful to include these in EVM validation now so appropriate
signatures can be generated even before full support is merged.

Signed-off-by: Matthew Garrett <mjg59@google.com>
Acked-by: John Johansen <John.johansen@canonical.com>
Signed-off-by: Mimi Zohar <zohar@linux.vnet.ibm.com>
---
 include/uapi/linux/xattr.h        | 3 +++
 security/integrity/evm/evm_main.c | 3 +++
 2 files changed, 6 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/xattr.h b/include/uapi/linux/xattr.h
index 1590c49cae57..e630b9cd70cb 100644
--- a/include/uapi/linux/xattr.h
+++ b/include/uapi/linux/xattr.h
@@ -65,6 +65,9 @@
 #define XATTR_NAME_SMACKTRANSMUTE XATTR_SECURITY_PREFIX XATTR_SMACK_TRANSMUTE
 #define XATTR_NAME_SMACKMMAP XATTR_SECURITY_PREFIX XATTR_SMACK_MMAP
 
+#define XATTR_APPARMOR_SUFFIX "apparmor"
+#define XATTR_NAME_APPARMOR XATTR_SECURITY_PREFIX XATTR_APPARMOR_SUFFIX
+
 #define XATTR_CAPS_SUFFIX "capability"
 #define XATTR_NAME_CAPS XATTR_SECURITY_PREFIX XATTR_CAPS_SUFFIX
 
diff --git a/security/integrity/evm/evm_main.c b/security/integrity/evm/evm_main.c
index 063d38aef64e..9826c02e2db8 100644
--- a/security/integrity/evm/evm_main.c
+++ b/security/integrity/evm/evm_main.c
@@ -49,6 +49,9 @@ char *evm_config_xattrnames[] = {
 	XATTR_NAME_SMACKMMAP,
 #endif
 #endif
+#ifdef CONFIG_SECURITY_APPARMOR
+	XATTR_NAME_APPARMOR,
+#endif
 #ifdef CONFIG_IMA_APPRAISE
 	XATTR_NAME_IMA,
 #endif
-- 
cgit v1.2.3


From 9e429d564926d3bca49907fa03031da705ad6f2c Mon Sep 17 00:00:00 2001
From: Jason Gerecke <killertofu@gmail.com>
Date: Tue, 7 Nov 2017 08:25:17 -0800
Subject: HID: wacom: generic: Send BTN_STYLUS3 when both barrel switches are
 set

The Wacom Pro Pen 3D includes a third barrel switch which is intended to
be particularly useful in applications where one frequently uses pan, zoom,
and rotate to navigate around a scene or model. The pen is compatible with
the MobileStudio Pro, 2nd-gen Intuos Pro, and Cintiq Pro. When the third
button is pressed, these devices set both the HID_DG_BARRELSWITCH and
HID_DG_BARRELSWITCH2 usages since their HID descriptors do not include a
usage specific to the button.

Rather than send both BTN_STYLUS and BTN_STYLUS2 when the third button is
pressed, userspace (libinput) has requested that we detect this condition
and report a newly-defined BTN_STYLUS3 event instead. We could define a
quirk specific to devices compatible with the Pro Pen 3D, but the likelihood
of seeing both barrel switch bits set with other pens/devices is low enough
to not worry about (pens mechanically prevent accidental activation of
multiple switches).

Signed-off-by: Jason Gerecke <jason.gerecke@wacom.com>
Acked-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Acked-by: Peter Hutterer <peter.hutterer@who-t.net>
Acked-by: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 drivers/hid/wacom_wac.c                | 18 ++++++++++++++++--
 drivers/hid/wacom_wac.h                |  2 ++
 include/uapi/linux/input-event-codes.h |  1 +
 3 files changed, 19 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/hid/wacom_wac.c b/drivers/hid/wacom_wac.c
index e3223b0c4f90..16af6886e828 100644
--- a/drivers/hid/wacom_wac.c
+++ b/drivers/hid/wacom_wac.c
@@ -2140,6 +2140,12 @@ static void wacom_wac_pen_event(struct hid_device *hdev, struct hid_field *field
 	case HID_DG_TIPSWITCH:
 		wacom_wac->hid_data.tipswitch |= value;
 		return;
+	case HID_DG_BARRELSWITCH:
+		wacom_wac->hid_data.barrelswitch = value;
+		return;
+	case HID_DG_BARRELSWITCH2:
+		wacom_wac->hid_data.barrelswitch2 = value;
+		return;
 	case HID_DG_TOOLSERIALNUMBER:
 		if (value) {
 			wacom_wac->serial[0] = (wacom_wac->serial[0] & ~0xFFFFFFFFULL);
@@ -2254,6 +2260,12 @@ static void wacom_wac_pen_report(struct hid_device *hdev,
 
 	if (!delay_pen_events(wacom_wac) && wacom_wac->tool[0]) {
 		int id = wacom_wac->id[0];
+		int sw_state = wacom_wac->hid_data.barrelswitch |
+			       (wacom_wac->hid_data.barrelswitch2 << 1);
+
+		input_report_key(input, BTN_STYLUS, sw_state == 1);
+		input_report_key(input, BTN_STYLUS2, sw_state == 2);
+		input_report_key(input, BTN_STYLUS3, sw_state == 3);
 
 		/*
 		 * Non-USI EMR tools should have their IDs mangled to
@@ -3300,9 +3312,11 @@ int wacom_setup_pen_input_capabilities(struct input_dev *input_dev,
 	else
 		__set_bit(INPUT_PROP_POINTER, input_dev->propbit);
 
-	if (features->type == HID_GENERIC)
-		/* setup has already been done */
+	if (features->type == HID_GENERIC) {
+		/* setup has already been done; apply otherwise-undetectible quirks */
+		input_set_capability(input_dev, EV_KEY, BTN_STYLUS3);
 		return 0;
+	}
 
 	__set_bit(BTN_TOUCH, input_dev->keybit);
 	__set_bit(ABS_MISC, input_dev->absbit);
diff --git a/drivers/hid/wacom_wac.h b/drivers/hid/wacom_wac.h
index 8a03654048bf..69dda27e8dde 100644
--- a/drivers/hid/wacom_wac.h
+++ b/drivers/hid/wacom_wac.h
@@ -291,6 +291,8 @@ struct hid_data {
 	bool inrange_state;
 	bool invert_state;
 	bool tipswitch;
+	bool barrelswitch;
+	bool barrelswitch2;
 	int x;
 	int y;
 	int pressure;
diff --git a/include/uapi/linux/input-event-codes.h b/include/uapi/linux/input-event-codes.h
index 179891074b3c..9b3a522f50d1 100644
--- a/include/uapi/linux/input-event-codes.h
+++ b/include/uapi/linux/input-event-codes.h
@@ -406,6 +406,7 @@
 #define BTN_TOOL_MOUSE		0x146
 #define BTN_TOOL_LENS		0x147
 #define BTN_TOOL_QUINTTAP	0x148	/* Five fingers on trackpad */
+#define BTN_STYLUS3		0x149
 #define BTN_TOUCH		0x14a
 #define BTN_STYLUS		0x14b
 #define BTN_STYLUS2		0x14c
-- 
cgit v1.2.3


From da9a1446d248f673a8560ce46251ff620214ab7b Mon Sep 17 00:00:00 2001
From: Christian Borntraeger <borntraeger@de.ibm.com>
Date: Thu, 9 Nov 2017 10:00:45 +0100
Subject: KVM: s390: provide a capability for AIS state migration

The AIS capability was introduced in 4.12, while the interface to
migrate the state was added in 4.13. Unfortunately it is not possible
for userspace to detect the migration capability without creating a flic
kvm device. Because QEMU runs the cpu model detection on the "none"
machine, this will result in cpu model issues regarding the "ais"
capability.

To get the "ais" capability properly let's add a new KVM capability that
tells userspace that AIS states can be migrated.

Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Acked-by: Halil Pasic <pasic@linux.vnet.ibm.com>
---
 Documentation/virtual/kvm/api.txt               | 9 +++++++++
 Documentation/virtual/kvm/devices/s390_flic.txt | 2 ++
 arch/s390/kvm/kvm-s390.c                        | 1 +
 include/uapi/linux/kvm.h                        | 1 +
 4 files changed, 13 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index e63a35fafef0..49540e53c4bd 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -4347,3 +4347,12 @@ This capability indicates that userspace can load HV_X64_MSR_VP_INDEX msr.  Its
 value is used to denote the target vcpu for a SynIC interrupt.  For
 compatibilty, KVM initializes this msr to KVM's internal vcpu index.  When this
 capability is absent, userspace can still query this msr's value.
+
+8.13 KVM_CAP_S390_AIS_MIGRATION
+
+Architectures: s390
+Parameters: none
+
+This capability indicates if the flic device will be able to get/set the
+AIS states for migration via the KVM_DEV_FLIC_AISM_ALL attribute and allows
+to discover this without having to create a flic device.
diff --git a/Documentation/virtual/kvm/devices/s390_flic.txt b/Documentation/virtual/kvm/devices/s390_flic.txt
index 27ad53c7149d..a4e20a090174 100644
--- a/Documentation/virtual/kvm/devices/s390_flic.txt
+++ b/Documentation/virtual/kvm/devices/s390_flic.txt
@@ -151,6 +151,8 @@ struct kvm_s390_ais_all {
     to an ISC (MSB0 bit 0 to ISC 0 and so on). The combination of simm bit and
     nimm bit presents AIS mode for a ISC.
 
+    KVM_DEV_FLIC_AISM_ALL is indicated by KVM_CAP_S390_AIS_MIGRATION.
+
 Note: The KVM_SET_DEVICE_ATTR/KVM_GET_DEVICE_ATTR device ioctls executed on
 FLIC with an unknown group or attribute gives the error code EINVAL (instead of
 ENXIO, as specified in the API documentation). It is not possible to conclude
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index de6a5b790da0..8f4b655f65d7 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -395,6 +395,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_S390_USER_INSTR0:
 	case KVM_CAP_S390_CMMA_MIGRATION:
 	case KVM_CAP_S390_AIS:
+	case KVM_CAP_S390_AIS_MIGRATION:
 		r = 1;
 		break;
 	case KVM_CAP_S390_MEM_OP:
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 838887587411..b60595696836 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -930,6 +930,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_PPC_SMT_POSSIBLE 147
 #define KVM_CAP_HYPERV_SYNIC2 148
 #define KVM_CAP_HYPERV_VP_INDEX 149
+#define KVM_CAP_S390_AIS_MIGRATION 150
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
-- 
cgit v1.2.3


From 4d63adfe12dd9cb61ed8badb4d798955399048c2 Mon Sep 17 00:00:00 2001
From: Mark Greer <mgreer@animalcreek.com>
Date: Thu, 15 Jun 2017 20:34:22 -0700
Subject: NFC: Add NFC_CMD_DEACTIVATE_TARGET support

Once an NFC target (i.e., a tag) is found, it remains active until
there is a failure reading or writing it (often caused by the target
moving out of range).  While the target is active, the NFC adapter
and antenna must remain powered.  This wastes power when the target
remains in range but the client application no longer cares whether
it is there or not.

To mitigate this, add a new netlink command that allows userspace
to deactivate an active target.  When issued, this command will cause
the NFC subsystem to act as though the target was moved out of range.
Once the command has been executed, the client application can power
off the NFC adapter to reduce power consumption.

Signed-off-by: Mark Greer <mgreer@animalcreek.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 include/uapi/linux/nfc.h |  2 ++
 net/nfc/netlink.c        | 29 +++++++++++++++++++++++++++++
 2 files changed, 31 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/nfc.h b/include/uapi/linux/nfc.h
index 399f39ff8048..f6e3c8c9c744 100644
--- a/include/uapi/linux/nfc.h
+++ b/include/uapi/linux/nfc.h
@@ -89,6 +89,7 @@
  * @NFC_CMD_ACTIVATE_TARGET: Request NFC controller to reactivate target.
  * @NFC_CMD_VENDOR: Vendor specific command, to be implemented directly
  *	from the driver in order to support hardware specific operations.
+ * @NFC_CMD_DEACTIVATE_TARGET: Request NFC controller to deactivate target.
  */
 enum nfc_commands {
 	NFC_CMD_UNSPEC,
@@ -121,6 +122,7 @@ enum nfc_commands {
 	NFC_CMD_SE_IO,
 	NFC_CMD_ACTIVATE_TARGET,
 	NFC_CMD_VENDOR,
+	NFC_CMD_DEACTIVATE_TARGET,
 /* private: internal use only */
 	__NFC_CMD_AFTER_LAST
 };
diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c
index b251fb936a27..f6359c277212 100644
--- a/net/nfc/netlink.c
+++ b/net/nfc/netlink.c
@@ -928,6 +928,30 @@ static int nfc_genl_activate_target(struct sk_buff *skb, struct genl_info *info)
 	return rc;
 }
 
+static int nfc_genl_deactivate_target(struct sk_buff *skb,
+				      struct genl_info *info)
+{
+	struct nfc_dev *dev;
+	u32 device_idx, target_idx;
+	int rc;
+
+	if (!info->attrs[NFC_ATTR_DEVICE_INDEX])
+		return -EINVAL;
+
+	device_idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]);
+
+	dev = nfc_get_device(device_idx);
+	if (!dev)
+		return -ENODEV;
+
+	target_idx = nla_get_u32(info->attrs[NFC_ATTR_TARGET_INDEX]);
+
+	rc = nfc_deactivate_target(dev, target_idx, NFC_TARGET_MODE_SLEEP);
+
+	nfc_put_device(dev);
+	return rc;
+}
+
 static int nfc_genl_dep_link_up(struct sk_buff *skb, struct genl_info *info)
 {
 	struct nfc_dev *dev;
@@ -1751,6 +1775,11 @@ static const struct genl_ops nfc_genl_ops[] = {
 		.doit = nfc_genl_vendor_cmd,
 		.policy = nfc_genl_policy,
 	},
+	{
+		.cmd = NFC_CMD_DEACTIVATE_TARGET,
+		.doit = nfc_genl_deactivate_target,
+		.policy = nfc_genl_policy,
+	},
 };
 
 static struct genl_family nfc_genl_family __ro_after_init = {
-- 
cgit v1.2.3


From 42d5e37654e4cdb9fb2e2f3ab30045fee35c42d8 Mon Sep 17 00:00:00 2001
From: Richard Guy Briggs <rgb@redhat.com>
Date: Wed, 23 Aug 2017 07:03:39 -0400
Subject: audit: filter PATH records keyed on filesystem magic

Tracefs or debugfs were causing hundreds to thousands of PATH records to
be associated with the init_module and finit_module SYSCALL records on a
few modules when the following rule was in place for startup:
	-a always,exit -F arch=x86_64 -S init_module -F key=mod-load

Provide a method to prevent this large number of PATH records from
overwhelming the logs if they are not of interest.  Introduce a new
filter list "AUDIT_FILTER_FS", with a new field type AUDIT_FSTYPE,
which keys off the filesystem 4-octet hexadecimal magic identifier to
filter specific filesystem PATH records.

An example rule would look like:
	-a never,filesystem -F fstype=0x74726163 -F key=ignore_tracefs
	-a never,filesystem -F fstype=0x64626720 -F key=ignore_debugfs

Arguably the better way to address this issue is to disable tracefs and
debugfs on boot from production systems.

See: https://github.com/linux-audit/audit-kernel/issues/16
See: https://github.com/linux-audit/audit-userspace/issues/8
Test case: https://github.com/linux-audit/audit-testsuite/issues/42

Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
[PM: fixed the whitespace damage in kernel/auditsc.c]
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/uapi/linux/audit.h |  8 ++++++--
 kernel/auditfilter.c       | 39 ++++++++++++++++++++++++++++++++-------
 kernel/auditsc.c           | 23 +++++++++++++++++++++++
 3 files changed, 61 insertions(+), 9 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h
index 0714a66f0e0c..be711341938e 100644
--- a/include/uapi/linux/audit.h
+++ b/include/uapi/linux/audit.h
@@ -155,8 +155,9 @@
 #define AUDIT_FILTER_WATCH	0x03	/* Apply rule to file system watches */
 #define AUDIT_FILTER_EXIT	0x04	/* Apply rule at syscall exit */
 #define AUDIT_FILTER_TYPE	0x05	/* Apply rule at audit_log_start */
+#define AUDIT_FILTER_FS		0x06	/* Apply rule at __audit_inode_child */
 
-#define AUDIT_NR_FILTERS	6
+#define AUDIT_NR_FILTERS	7
 
 #define AUDIT_FILTER_PREPEND	0x10	/* Prepend to front of list */
 
@@ -256,6 +257,7 @@
 #define AUDIT_OBJ_LEV_HIGH	23
 #define AUDIT_LOGINUID_SET	24
 #define AUDIT_SESSIONID	25	/* Session ID */
+#define AUDIT_FSTYPE	26	/* FileSystem Type */
 
 				/* These are ONLY useful when checking
 				 * at syscall exit time (AUDIT_AT_EXIT). */
@@ -335,13 +337,15 @@ enum {
 #define AUDIT_FEATURE_BITMAP_EXCLUDE_EXTEND	0x00000008
 #define AUDIT_FEATURE_BITMAP_SESSIONID_FILTER	0x00000010
 #define AUDIT_FEATURE_BITMAP_LOST_RESET		0x00000020
+#define AUDIT_FEATURE_BITMAP_FILTER_FS		0x00000040
 
 #define AUDIT_FEATURE_BITMAP_ALL (AUDIT_FEATURE_BITMAP_BACKLOG_LIMIT | \
 				  AUDIT_FEATURE_BITMAP_BACKLOG_WAIT_TIME | \
 				  AUDIT_FEATURE_BITMAP_EXECUTABLE_PATH | \
 				  AUDIT_FEATURE_BITMAP_EXCLUDE_EXTEND | \
 				  AUDIT_FEATURE_BITMAP_SESSIONID_FILTER | \
-				  AUDIT_FEATURE_BITMAP_LOST_RESET)
+				  AUDIT_FEATURE_BITMAP_LOST_RESET | \
+				  AUDIT_FEATURE_BITMAP_FILTER_FS)
 
 /* deprecated: AUDIT_VERSION_* */
 #define AUDIT_VERSION_LATEST 		AUDIT_FEATURE_BITMAP_ALL
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 0b0aa5854dac..4a1758adb222 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -56,7 +56,8 @@ struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
 	LIST_HEAD_INIT(audit_filter_list[3]),
 	LIST_HEAD_INIT(audit_filter_list[4]),
 	LIST_HEAD_INIT(audit_filter_list[5]),
-#if AUDIT_NR_FILTERS != 6
+	LIST_HEAD_INIT(audit_filter_list[6]),
+#if AUDIT_NR_FILTERS != 7
 #error Fix audit_filter_list initialiser
 #endif
 };
@@ -67,6 +68,7 @@ static struct list_head audit_rules_list[AUDIT_NR_FILTERS] = {
 	LIST_HEAD_INIT(audit_rules_list[3]),
 	LIST_HEAD_INIT(audit_rules_list[4]),
 	LIST_HEAD_INIT(audit_rules_list[5]),
+	LIST_HEAD_INIT(audit_rules_list[6]),
 };
 
 DEFINE_MUTEX(audit_filter_mutex);
@@ -263,6 +265,7 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule_data *
 #endif
 	case AUDIT_FILTER_USER:
 	case AUDIT_FILTER_TYPE:
+	case AUDIT_FILTER_FS:
 		;
 	}
 	if (unlikely(rule->action == AUDIT_POSSIBLE)) {
@@ -338,6 +341,21 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
 		    entry->rule.listnr != AUDIT_FILTER_USER)
 			return -EINVAL;
 		break;
+	case AUDIT_FSTYPE:
+		if (entry->rule.listnr != AUDIT_FILTER_FS)
+			return -EINVAL;
+		break;
+	}
+
+	switch(entry->rule.listnr) {
+	case AUDIT_FILTER_FS:
+		switch(f->type) {
+		case AUDIT_FSTYPE:
+		case AUDIT_FILTERKEY:
+			break;
+		default:
+			return -EINVAL;
+		}
 	}
 
 	switch(f->type) {
@@ -391,6 +409,7 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
 			return -EINVAL;
 	/* FALL THROUGH */
 	case AUDIT_ARCH:
+	case AUDIT_FSTYPE:
 		if (f->op != Audit_not_equal && f->op != Audit_equal)
 			return -EINVAL;
 		break;
@@ -910,10 +929,13 @@ static inline int audit_add_rule(struct audit_entry *entry)
 #ifdef CONFIG_AUDITSYSCALL
 	int dont_count = 0;
 
-	/* If either of these, don't count towards total */
-	if (entry->rule.listnr == AUDIT_FILTER_USER ||
-		entry->rule.listnr == AUDIT_FILTER_TYPE)
+	/* If any of these, don't count towards total */
+	switch(entry->rule.listnr) {
+	case AUDIT_FILTER_USER:
+	case AUDIT_FILTER_TYPE:
+	case AUDIT_FILTER_FS:
 		dont_count = 1;
+	}
 #endif
 
 	mutex_lock(&audit_filter_mutex);
@@ -989,10 +1011,13 @@ int audit_del_rule(struct audit_entry *entry)
 #ifdef CONFIG_AUDITSYSCALL
 	int dont_count = 0;
 
-	/* If either of these, don't count towards total */
-	if (entry->rule.listnr == AUDIT_FILTER_USER ||
-		entry->rule.listnr == AUDIT_FILTER_TYPE)
+	/* If any of these, don't count towards total */
+	switch(entry->rule.listnr) {
+	case AUDIT_FILTER_USER:
+	case AUDIT_FILTER_TYPE:
+	case AUDIT_FILTER_FS:
 		dont_count = 1;
+	}
 #endif
 
 	mutex_lock(&audit_filter_mutex);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index aac1a41f82bd..c9bb29e17335 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1869,10 +1869,33 @@ void __audit_inode_child(struct inode *parent,
 	struct inode *inode = d_backing_inode(dentry);
 	const char *dname = dentry->d_name.name;
 	struct audit_names *n, *found_parent = NULL, *found_child = NULL;
+	struct audit_entry *e;
+	struct list_head *list = &audit_filter_list[AUDIT_FILTER_FS];
+	int i;
 
 	if (!context->in_syscall)
 		return;
 
+	rcu_read_lock();
+	if (!list_empty(list)) {
+		list_for_each_entry_rcu(e, list, list) {
+			for (i = 0; i < e->rule.field_count; i++) {
+				struct audit_field *f = &e->rule.fields[i];
+
+				if (f->type == AUDIT_FSTYPE) {
+					if (audit_comparator(parent->i_sb->s_magic,
+					    f->op, f->val)) {
+						if (e->rule.action == AUDIT_NEVER) {
+							rcu_read_unlock();
+							return;
+						}
+					}
+				}
+			}
+		}
+	}
+	rcu_read_unlock();
+
 	if (inode)
 		handle_one(inode);
 
-- 
cgit v1.2.3


From dd0bb688eaa241b5655d396d45366cba9225aed9 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fb.com>
Date: Tue, 7 Nov 2017 15:28:42 -0500
Subject: bpf: add a bpf_override_function helper

Error injection is sloppy and very ad-hoc.  BPF could fill this niche
perfectly with its kprobe functionality.  We could make sure errors are
only triggered in specific call chains that we care about with very
specific situations.  Accomplish this with the bpf_override_function
helper.  This will modify the probed caller's return value to the
specified value and set the PC to an override function that simply
returns, bypassing the originally probed function.  This gives us a nice
clean way to implement systematic error injection for all of our code
paths.

Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/Kconfig                     |  3 +++
 arch/x86/Kconfig                 |  1 +
 arch/x86/include/asm/kprobes.h   |  4 ++++
 arch/x86/include/asm/ptrace.h    |  5 +++++
 arch/x86/kernel/kprobes/ftrace.c | 14 ++++++++++++++
 include/linux/filter.h           |  3 ++-
 include/linux/trace_events.h     |  1 +
 include/uapi/linux/bpf.h         |  7 ++++++-
 kernel/bpf/core.c                |  3 +++
 kernel/bpf/verifier.c            |  2 ++
 kernel/events/core.c             |  7 +++++++
 kernel/trace/Kconfig             | 11 +++++++++++
 kernel/trace/bpf_trace.c         | 35 +++++++++++++++++++++++++++++++++++
 kernel/trace/trace_kprobe.c      | 40 +++++++++++++++++++++++++++++++++-------
 kernel/trace/trace_probe.h       |  6 ++++++
 15 files changed, 133 insertions(+), 9 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/arch/Kconfig b/arch/Kconfig
index 057370a0ac4e..6e8520f09bc1 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -196,6 +196,9 @@ config HAVE_OPTPROBES
 config HAVE_KPROBES_ON_FTRACE
 	bool
 
+config HAVE_KPROBE_OVERRIDE
+	bool
+
 config HAVE_NMI
 	bool
 
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 2fdb23313dd5..51458c1a0b4a 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -153,6 +153,7 @@ config X86
 	select HAVE_KERNEL_XZ
 	select HAVE_KPROBES
 	select HAVE_KPROBES_ON_FTRACE
+	select HAVE_KPROBE_OVERRIDE
 	select HAVE_KRETPROBES
 	select HAVE_KVM
 	select HAVE_LIVEPATCH			if X86_64
diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h
index 6cf65437b5e5..c6c3b1f4306a 100644
--- a/arch/x86/include/asm/kprobes.h
+++ b/arch/x86/include/asm/kprobes.h
@@ -67,6 +67,10 @@ extern const int kretprobe_blacklist_size;
 void arch_remove_kprobe(struct kprobe *p);
 asmlinkage void kretprobe_trampoline(void);
 
+#ifdef CONFIG_KPROBES_ON_FTRACE
+extern void arch_ftrace_kprobe_override_function(struct pt_regs *regs);
+#endif
+
 /* Architecture specific copy of original instruction*/
 struct arch_specific_insn {
 	/* copy of the original instruction */
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index c0e3c45cf6ab..2370bb0149cc 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -109,6 +109,11 @@ static inline unsigned long regs_return_value(struct pt_regs *regs)
 	return regs->ax;
 }
 
+static inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc)
+{
+	regs->ax = rc;
+}
+
 /*
  * user_mode(regs) determines whether a register set came from user
  * mode.  On x86_32, this is true if V8086 mode was enabled OR if the
diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c
index 041f7b6dfa0f..3c455bf490cb 100644
--- a/arch/x86/kernel/kprobes/ftrace.c
+++ b/arch/x86/kernel/kprobes/ftrace.c
@@ -97,3 +97,17 @@ int arch_prepare_kprobe_ftrace(struct kprobe *p)
 	p->ainsn.boostable = false;
 	return 0;
 }
+
+asmlinkage void override_func(void);
+asm(
+	".type override_func, @function\n"
+	"override_func:\n"
+	"	ret\n"
+	".size override_func, .-override_func\n"
+);
+
+void arch_ftrace_kprobe_override_function(struct pt_regs *regs)
+{
+	regs->ip = (unsigned long)&override_func;
+}
+NOKPROBE_SYMBOL(arch_ftrace_kprobe_override_function);
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 0cd02ff4ae30..eaec066f99e8 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -459,7 +459,8 @@ struct bpf_prog {
 				locked:1,	/* Program image locked? */
 				gpl_compatible:1, /* Is filter GPL compatible? */
 				cb_access:1,	/* Is control block accessed? */
-				dst_needed:1;	/* Do we need dst entry? */
+				dst_needed:1,	/* Do we need dst entry? */
+				kprobe_override:1; /* Do we override a kprobe? */
 	kmemcheck_bitfield_end(meta);
 	enum bpf_prog_type	type;		/* Type of BPF program */
 	u32			len;		/* Number of filter blocks */
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 84014ecfa67f..17e5e820a84c 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -523,6 +523,7 @@ do {									\
 struct perf_event;
 
 DECLARE_PER_CPU(struct pt_regs, perf_trace_regs);
+DECLARE_PER_CPU(int, bpf_kprobe_override);
 
 extern int  perf_trace_init(struct perf_event *event);
 extern void perf_trace_destroy(struct perf_event *event);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index e880ae6434ee..adb66f78b674 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -677,6 +677,10 @@ union bpf_attr {
  *     @buf: buf to fill
  *     @buf_size: size of the buf
  *     Return : 0 on success or negative error code
+ *
+ * int bpf_override_return(pt_regs, rc)
+ *	@pt_regs: pointer to struct pt_regs
+ *	@rc: the return value to set
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -736,7 +740,8 @@ union bpf_attr {
 	FN(xdp_adjust_meta),		\
 	FN(perf_event_read_value),	\
 	FN(perf_prog_read_value),	\
-	FN(getsockopt),
+	FN(getsockopt),			\
+	FN(override_return),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 8a6c37762330..271daad31f37 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1326,6 +1326,9 @@ EVAL4(PROG_NAME_LIST, 416, 448, 480, 512)
 bool bpf_prog_array_compatible(struct bpf_array *array,
 			       const struct bpf_prog *fp)
 {
+	if (fp->kprobe_override)
+		return false;
+
 	if (!array->owner_prog_type) {
 		/* There's no owner yet where we could check for
 		 * compatibility.
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 4a942e2e753d..bc464b8ec91e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4357,6 +4357,8 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
 			prog->dst_needed = 1;
 		if (insn->imm == BPF_FUNC_get_prandom_u32)
 			bpf_user_rnd_init_once();
+		if (insn->imm == BPF_FUNC_override_return)
+			prog->kprobe_override = 1;
 		if (insn->imm == BPF_FUNC_tail_call) {
 			/* If we tail call into other programs, we
 			 * cannot make any assumptions since they can
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 42d24bd64ea4..ac240d31b5bf 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -8171,6 +8171,13 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
 		return -EINVAL;
 	}
 
+	/* Kprobe override only works for kprobes, not uprobes. */
+	if (prog->kprobe_override &&
+	    !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) {
+		bpf_prog_put(prog);
+		return -EINVAL;
+	}
+
 	if (is_tracepoint || is_syscall_tp) {
 		int off = trace_event_get_offsets(event->tp_event);
 
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 434c840e2d82..9dc0deeaad2b 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -518,6 +518,17 @@ config FUNCTION_PROFILER
 
 	  If in doubt, say N.
 
+config BPF_KPROBE_OVERRIDE
+	bool "Enable BPF programs to override a kprobed function"
+	depends on BPF_EVENTS
+	depends on KPROBES_ON_FTRACE
+	depends on HAVE_KPROBE_OVERRIDE
+	depends on DYNAMIC_FTRACE_WITH_REGS
+	default n
+	help
+	 Allows BPF to override the execution of a probed function and
+	 set a different return value.  This is used for error injection.
+
 config FTRACE_MCOUNT_RECORD
 	def_bool y
 	depends on DYNAMIC_FTRACE
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 506efe6e8ed9..1865b0d4cdeb 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -13,6 +13,10 @@
 #include <linux/filter.h>
 #include <linux/uaccess.h>
 #include <linux/ctype.h>
+#include <linux/kprobes.h>
+#include <asm/kprobes.h>
+
+#include "trace_probe.h"
 #include "trace.h"
 
 u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
@@ -76,6 +80,29 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
 }
 EXPORT_SYMBOL_GPL(trace_call_bpf);
 
+#ifdef CONFIG_BPF_KPROBE_OVERRIDE
+BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc)
+{
+	__this_cpu_write(bpf_kprobe_override, 1);
+	regs_set_return_value(regs, rc);
+	arch_ftrace_kprobe_override_function(regs);
+	return 0;
+}
+#else
+BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc)
+{
+	return -EINVAL;
+}
+#endif
+
+static const struct bpf_func_proto bpf_override_return_proto = {
+	.func		= bpf_override_return,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+};
+
 BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr)
 {
 	int ret;
@@ -551,6 +578,10 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
 		return &bpf_get_stackid_proto;
 	case BPF_FUNC_perf_event_read_value:
 		return &bpf_perf_event_read_value_proto;
+	case BPF_FUNC_override_return:
+		pr_warn_ratelimited("%s[%d] is installing a program with bpf_override_return helper that may cause unexpected behavior!",
+				    current->comm, task_pid_nr(current));
+		return &bpf_override_return_proto;
 	default:
 		return tracing_func_proto(func_id);
 	}
@@ -766,6 +797,10 @@ int perf_event_attach_bpf_prog(struct perf_event *event,
 	struct bpf_prog_array *new_array;
 	int ret = -EEXIST;
 
+	/* Kprobe override only works for ftrace based kprobes. */
+	if (prog->kprobe_override && !trace_kprobe_ftrace(event->tp_event))
+		return -EINVAL;
+
 	mutex_lock(&bpf_event_mutex);
 
 	if (event->prog)
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index abf92e478cfb..8e3c9ec1faf7 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -42,6 +42,7 @@ struct trace_kprobe {
 	(offsetof(struct trace_kprobe, tp.args) +	\
 	(sizeof(struct probe_arg) * (n)))
 
+DEFINE_PER_CPU(int, bpf_kprobe_override);
 
 static nokprobe_inline bool trace_kprobe_is_return(struct trace_kprobe *tk)
 {
@@ -87,6 +88,12 @@ static nokprobe_inline unsigned long trace_kprobe_nhit(struct trace_kprobe *tk)
 	return nhit;
 }
 
+int trace_kprobe_ftrace(struct trace_event_call *call)
+{
+	struct trace_kprobe *tk = (struct trace_kprobe *)call->data;
+	return kprobe_ftrace(&tk->rp.kp);
+}
+
 static int register_kprobe_event(struct trace_kprobe *tk);
 static int unregister_kprobe_event(struct trace_kprobe *tk);
 
@@ -1170,7 +1177,7 @@ static int kretprobe_event_define_fields(struct trace_event_call *event_call)
 #ifdef CONFIG_PERF_EVENTS
 
 /* Kprobe profile handler */
-static void
+static int
 kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
 {
 	struct trace_event_call *call = &tk->tp.call;
@@ -1179,12 +1186,29 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
 	int size, __size, dsize;
 	int rctx;
 
-	if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs))
-		return;
+	if (bpf_prog_array_valid(call)) {
+		int ret;
+
+		ret = trace_call_bpf(call, regs);
+
+		/*
+		 * We need to check and see if we modified the pc of the
+		 * pt_regs, and if so clear the kprobe and return 1 so that we
+		 * don't do the instruction skipping.  Also reset our state so
+		 * we are clean the next pass through.
+		 */
+		if (__this_cpu_read(bpf_kprobe_override)) {
+			__this_cpu_write(bpf_kprobe_override, 0);
+			reset_current_kprobe();
+			return 1;
+		}
+		if (!ret)
+			return 0;
+	}
 
 	head = this_cpu_ptr(call->perf_events);
 	if (hlist_empty(head))
-		return;
+		return 0;
 
 	dsize = __get_data_size(&tk->tp, regs);
 	__size = sizeof(*entry) + tk->tp.size + dsize;
@@ -1193,13 +1217,14 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
 
 	entry = perf_trace_buf_alloc(size, NULL, &rctx);
 	if (!entry)
-		return;
+		return 0;
 
 	entry->ip = (unsigned long)tk->rp.kp.addr;
 	memset(&entry[1], 0, dsize);
 	store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
 	perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
 			      head, NULL, NULL);
+	return 0;
 }
 NOKPROBE_SYMBOL(kprobe_perf_func);
 
@@ -1275,6 +1300,7 @@ static int kprobe_register(struct trace_event_call *event,
 static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
 {
 	struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp);
+	int ret = 0;
 
 	raw_cpu_inc(*tk->nhit);
 
@@ -1282,9 +1308,9 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
 		kprobe_trace_func(tk, regs);
 #ifdef CONFIG_PERF_EVENTS
 	if (tk->tp.flags & TP_FLAG_PROFILE)
-		kprobe_perf_func(tk, regs);
+		ret = kprobe_perf_func(tk, regs);
 #endif
-	return 0;	/* We don't tweek kernel, so just return 0 */
+	return ret;
 }
 NOKPROBE_SYMBOL(kprobe_dispatcher);
 
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 903273c93e61..adbb3f7d1fb5 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -253,6 +253,7 @@ struct symbol_cache;
 unsigned long update_symbol_cache(struct symbol_cache *sc);
 void free_symbol_cache(struct symbol_cache *sc);
 struct symbol_cache *alloc_symbol_cache(const char *sym, long offset);
+int trace_kprobe_ftrace(struct trace_event_call *call);
 #else
 /* uprobes do not support symbol fetch methods */
 #define fetch_symbol_u8			NULL
@@ -278,6 +279,11 @@ alloc_symbol_cache(const char *sym, long offset)
 {
 	return NULL;
 }
+
+static inline int trace_kprobe_ftrace(struct trace_event_call *call)
+{
+	return 0;
+}
 #endif /* CONFIG_KPROBE_EVENTS */
 
 struct probe_arg {
-- 
cgit v1.2.3


From 2210d6b2f287d738eddf6b75f432126ce05450f8 Mon Sep 17 00:00:00 2001
From: Maciej Żenczykowski <maze@google.com>
Date: Tue, 7 Nov 2017 21:52:09 -0800
Subject: net: ipv6: sysctl to specify IPv6 ND traffic class
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a per-device sysctl to specify the default traffic class to use for
kernel originated IPv6 Neighbour Discovery packets.

Currently this includes:

  - Router Solicitation (ICMPv6 type 133)
    ndisc_send_rs() -> ndisc_send_skb() -> ip6_nd_hdr()

  - Neighbour Solicitation (ICMPv6 type 135)
    ndisc_send_ns() -> ndisc_send_skb() -> ip6_nd_hdr()

  - Neighbour Advertisement (ICMPv6 type 136)
    ndisc_send_na() -> ndisc_send_skb() -> ip6_nd_hdr()

  - Redirect (ICMPv6 type 137)
    ndisc_send_redirect() -> ndisc_send_skb() -> ip6_nd_hdr()

and if the kernel ever gets around to generating RA's,
it would presumably also include:

  - Router Advertisement (ICMPv6 type 134)
    (radvd daemon could pick up on the kernel setting and use it)

Interface drivers may examine the Traffic Class value and translate
the DiffServ Code Point into a link-layer appropriate traffic
prioritization scheme.  An example of mapping IETF DSCP values to
IEEE 802.11 User Priority values can be found here:

    https://tools.ietf.org/html/draft-ietf-tsvwg-ieee-802-11

The expected primary use case is to properly prioritize ND over wifi.

Testing:
  jzem22:~# cat /proc/sys/net/ipv6/conf/eth0/ndisc_tclass
  0
  jzem22:~# echo -1 > /proc/sys/net/ipv6/conf/eth0/ndisc_tclass
  -bash: echo: write error: Invalid argument
  jzem22:~# echo 256 > /proc/sys/net/ipv6/conf/eth0/ndisc_tclass
  -bash: echo: write error: Invalid argument
  jzem22:~# echo 0 > /proc/sys/net/ipv6/conf/eth0/ndisc_tclass
  jzem22:~# echo 255 > /proc/sys/net/ipv6/conf/eth0/ndisc_tclass
  jzem22:~# cat /proc/sys/net/ipv6/conf/eth0/ndisc_tclass
  255
  jzem22:~# echo 34 > /proc/sys/net/ipv6/conf/eth0/ndisc_tclass
  jzem22:~# cat /proc/sys/net/ipv6/conf/eth0/ndisc_tclass
  34

  jzem22:~# echo $[0xDC] > /proc/sys/net/ipv6/conf/eth0/ndisc_tclass
  jzem22:~# tcpdump -v -i eth0 icmp6 and src host jzem22.pgc and dst host fe80::1
  tcpdump: listening on eth0, link-type EN10MB (Ethernet), capture size 262144 bytes
  IP6 (class 0xdc, hlim 255, next-header ICMPv6 (58) payload length: 24)
  jzem22.pgc > fe80::1: [icmp6 sum ok] ICMP6, neighbor advertisement,
  length 24, tgt is jzem22.pgc, Flags [solicited]

(based on original change written by Erik Kline, with minor changes)

v2: fix 'suspicious rcu_dereference_check() usage'
    by explicitly grabbing the rcu_read_lock.

Cc: Lorenzo Colitti <lorenzo@google.com>
Signed-off-by: Erik Kline <ek@google.com>
Signed-off-by: Maciej Żenczykowski <maze@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt |  9 +++++++++
 include/linux/ipv6.h                   |  1 +
 include/uapi/linux/ipv6.h              |  1 +
 net/ipv6/addrconf.c                    | 11 +++++++++++
 net/ipv6/ndisc.c                       |  9 ++++++++-
 5 files changed, 30 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 54410a1d4065..d8676dda7fa6 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1732,6 +1732,15 @@ ndisc_notify - BOOLEAN
 	1 - Generate unsolicited neighbour advertisements when device is brought
 	    up or hardware address changes.
 
+ndisc_tclass - INTEGER
+	The IPv6 Traffic Class to use by default when sending IPv6 Neighbor
+	Discovery (Router Solicitation, Router Advertisement, Neighbor
+	Solicitation, Neighbor Advertisement, Redirect) messages.
+	These 8 bits can be interpreted as 6 high order bits holding the DSCP
+	value and 2 low order bits representing ECN (which you probably want
+	to leave cleared).
+	0 - (default)
+
 mldv1_unsolicited_report_interval - INTEGER
 	The interval in milliseconds in which the next unsolicited
 	MLDv1 report retransmit will take place.
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index ea04ca024f0d..cb18c6290ca8 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -73,6 +73,7 @@ struct ipv6_devconf {
 	__u32		enhanced_dad;
 	__u32		addr_gen_mode;
 	__s32		disable_policy;
+	__s32           ndisc_tclass;
 
 	struct ctl_table_header *sysctl_header;
 };
diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h
index b22a9c4e1b12..9c0f4a92bcff 100644
--- a/include/uapi/linux/ipv6.h
+++ b/include/uapi/linux/ipv6.h
@@ -186,6 +186,7 @@ enum {
 	DEVCONF_ADDR_GEN_MODE,
 	DEVCONF_DISABLE_POLICY,
 	DEVCONF_ACCEPT_RA_RT_INFO_MIN_PLEN,
+	DEVCONF_NDISC_TCLASS,
 	DEVCONF_MAX
 };
 
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 6233e06fa35c..a6dffd65eb9d 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -5059,6 +5059,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
 	array[DEVCONF_ENHANCED_DAD] = cnf->enhanced_dad;
 	array[DEVCONF_ADDR_GEN_MODE] = cnf->addr_gen_mode;
 	array[DEVCONF_DISABLE_POLICY] = cnf->disable_policy;
+	array[DEVCONF_NDISC_TCLASS] = cnf->ndisc_tclass;
 }
 
 static inline size_t inet6_ifla6_size(void)
@@ -5986,6 +5987,7 @@ int addrconf_sysctl_disable_policy(struct ctl_table *ctl, int write,
 }
 
 static int minus_one = -1;
+static const int zero = 0;
 static const int one = 1;
 static const int two_five_five = 255;
 
@@ -6356,6 +6358,15 @@ static const struct ctl_table addrconf_sysctl[] = {
 		.mode           = 0644,
 		.proc_handler   = addrconf_sysctl_disable_policy,
 	},
+	{
+		.procname	= "ndisc_tclass",
+		.data		= &ipv6_devconf.ndisc_tclass,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= (void *)&zero,
+		.extra2		= (void *)&two_five_five,
+	},
 	{
 		/* sentinel */
 	}
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index f9c3ffe04382..b3cea200c85e 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -427,12 +427,19 @@ static void ip6_nd_hdr(struct sk_buff *skb,
 		       int hop_limit, int len)
 {
 	struct ipv6hdr *hdr;
+	struct inet6_dev *idev;
+	unsigned tclass;
+
+	rcu_read_lock();
+	idev = __in6_dev_get(skb->dev);
+	tclass = idev ? idev->cnf.ndisc_tclass : 0;
+	rcu_read_unlock();
 
 	skb_push(skb, sizeof(*hdr));
 	skb_reset_network_header(skb);
 	hdr = ipv6_hdr(skb);
 
-	ip6_flow_hdr(hdr, 0, 0);
+	ip6_flow_hdr(hdr, tclass, 0);
 
 	hdr->payload_len = htons(len);
 	hdr->nexthdr = IPPROTO_ICMPV6;
-- 
cgit v1.2.3


From f3edacbd697f94a743fff1a3d26910ab99948ba7 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Sat, 11 Nov 2017 18:24:55 +0900
Subject: bpf: Revert bpf_overrid_function() helper changes.

NACK'd by x86 maintainer.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/Kconfig                              |  3 ---
 arch/x86/Kconfig                          |  1 -
 arch/x86/include/asm/kprobes.h            |  4 ----
 arch/x86/include/asm/ptrace.h             |  5 ----
 arch/x86/kernel/kprobes/ftrace.c          | 14 -----------
 include/linux/filter.h                    |  3 +--
 include/linux/trace_events.h              |  1 -
 include/uapi/linux/bpf.h                  |  7 +-----
 kernel/bpf/core.c                         |  3 ---
 kernel/bpf/verifier.c                     |  2 --
 kernel/events/core.c                      |  7 ------
 kernel/trace/Kconfig                      | 11 ---------
 kernel/trace/bpf_trace.c                  | 35 ---------------------------
 kernel/trace/trace_kprobe.c               | 40 ++++++-------------------------
 kernel/trace/trace_probe.h                |  6 -----
 samples/bpf/Makefile                      |  4 ----
 samples/bpf/test_override_return.sh       | 15 ------------
 samples/bpf/tracex7_kern.c                | 16 -------------
 samples/bpf/tracex7_user.c                | 28 ----------------------
 tools/include/uapi/linux/bpf.h            |  7 +-----
 tools/testing/selftests/bpf/bpf_helpers.h |  3 +--
 21 files changed, 11 insertions(+), 204 deletions(-)
 delete mode 100755 samples/bpf/test_override_return.sh
 delete mode 100644 samples/bpf/tracex7_kern.c
 delete mode 100644 samples/bpf/tracex7_user.c

(limited to 'include/uapi/linux')

diff --git a/arch/Kconfig b/arch/Kconfig
index 6e8520f09bc1..057370a0ac4e 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -196,9 +196,6 @@ config HAVE_OPTPROBES
 config HAVE_KPROBES_ON_FTRACE
 	bool
 
-config HAVE_KPROBE_OVERRIDE
-	bool
-
 config HAVE_NMI
 	bool
 
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 51458c1a0b4a..2fdb23313dd5 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -153,7 +153,6 @@ config X86
 	select HAVE_KERNEL_XZ
 	select HAVE_KPROBES
 	select HAVE_KPROBES_ON_FTRACE
-	select HAVE_KPROBE_OVERRIDE
 	select HAVE_KRETPROBES
 	select HAVE_KVM
 	select HAVE_LIVEPATCH			if X86_64
diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h
index c6c3b1f4306a..6cf65437b5e5 100644
--- a/arch/x86/include/asm/kprobes.h
+++ b/arch/x86/include/asm/kprobes.h
@@ -67,10 +67,6 @@ extern const int kretprobe_blacklist_size;
 void arch_remove_kprobe(struct kprobe *p);
 asmlinkage void kretprobe_trampoline(void);
 
-#ifdef CONFIG_KPROBES_ON_FTRACE
-extern void arch_ftrace_kprobe_override_function(struct pt_regs *regs);
-#endif
-
 /* Architecture specific copy of original instruction*/
 struct arch_specific_insn {
 	/* copy of the original instruction */
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 2370bb0149cc..c0e3c45cf6ab 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -109,11 +109,6 @@ static inline unsigned long regs_return_value(struct pt_regs *regs)
 	return regs->ax;
 }
 
-static inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc)
-{
-	regs->ax = rc;
-}
-
 /*
  * user_mode(regs) determines whether a register set came from user
  * mode.  On x86_32, this is true if V8086 mode was enabled OR if the
diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c
index 3c455bf490cb..041f7b6dfa0f 100644
--- a/arch/x86/kernel/kprobes/ftrace.c
+++ b/arch/x86/kernel/kprobes/ftrace.c
@@ -97,17 +97,3 @@ int arch_prepare_kprobe_ftrace(struct kprobe *p)
 	p->ainsn.boostable = false;
 	return 0;
 }
-
-asmlinkage void override_func(void);
-asm(
-	".type override_func, @function\n"
-	"override_func:\n"
-	"	ret\n"
-	".size override_func, .-override_func\n"
-);
-
-void arch_ftrace_kprobe_override_function(struct pt_regs *regs)
-{
-	regs->ip = (unsigned long)&override_func;
-}
-NOKPROBE_SYMBOL(arch_ftrace_kprobe_override_function);
diff --git a/include/linux/filter.h b/include/linux/filter.h
index eaec066f99e8..0cd02ff4ae30 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -459,8 +459,7 @@ struct bpf_prog {
 				locked:1,	/* Program image locked? */
 				gpl_compatible:1, /* Is filter GPL compatible? */
 				cb_access:1,	/* Is control block accessed? */
-				dst_needed:1,	/* Do we need dst entry? */
-				kprobe_override:1; /* Do we override a kprobe? */
+				dst_needed:1;	/* Do we need dst entry? */
 	kmemcheck_bitfield_end(meta);
 	enum bpf_prog_type	type;		/* Type of BPF program */
 	u32			len;		/* Number of filter blocks */
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 17e5e820a84c..84014ecfa67f 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -523,7 +523,6 @@ do {									\
 struct perf_event;
 
 DECLARE_PER_CPU(struct pt_regs, perf_trace_regs);
-DECLARE_PER_CPU(int, bpf_kprobe_override);
 
 extern int  perf_trace_init(struct perf_event *event);
 extern void perf_trace_destroy(struct perf_event *event);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index adb66f78b674..e880ae6434ee 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -677,10 +677,6 @@ union bpf_attr {
  *     @buf: buf to fill
  *     @buf_size: size of the buf
  *     Return : 0 on success or negative error code
- *
- * int bpf_override_return(pt_regs, rc)
- *	@pt_regs: pointer to struct pt_regs
- *	@rc: the return value to set
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -740,8 +736,7 @@ union bpf_attr {
 	FN(xdp_adjust_meta),		\
 	FN(perf_event_read_value),	\
 	FN(perf_prog_read_value),	\
-	FN(getsockopt),			\
-	FN(override_return),
+	FN(getsockopt),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 271daad31f37..8a6c37762330 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1326,9 +1326,6 @@ EVAL4(PROG_NAME_LIST, 416, 448, 480, 512)
 bool bpf_prog_array_compatible(struct bpf_array *array,
 			       const struct bpf_prog *fp)
 {
-	if (fp->kprobe_override)
-		return false;
-
 	if (!array->owner_prog_type) {
 		/* There's no owner yet where we could check for
 		 * compatibility.
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index bc464b8ec91e..4a942e2e753d 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4357,8 +4357,6 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
 			prog->dst_needed = 1;
 		if (insn->imm == BPF_FUNC_get_prandom_u32)
 			bpf_user_rnd_init_once();
-		if (insn->imm == BPF_FUNC_override_return)
-			prog->kprobe_override = 1;
 		if (insn->imm == BPF_FUNC_tail_call) {
 			/* If we tail call into other programs, we
 			 * cannot make any assumptions since they can
diff --git a/kernel/events/core.c b/kernel/events/core.c
index ac240d31b5bf..42d24bd64ea4 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -8171,13 +8171,6 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
 		return -EINVAL;
 	}
 
-	/* Kprobe override only works for kprobes, not uprobes. */
-	if (prog->kprobe_override &&
-	    !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) {
-		bpf_prog_put(prog);
-		return -EINVAL;
-	}
-
 	if (is_tracepoint || is_syscall_tp) {
 		int off = trace_event_get_offsets(event->tp_event);
 
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 9dc0deeaad2b..434c840e2d82 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -518,17 +518,6 @@ config FUNCTION_PROFILER
 
 	  If in doubt, say N.
 
-config BPF_KPROBE_OVERRIDE
-	bool "Enable BPF programs to override a kprobed function"
-	depends on BPF_EVENTS
-	depends on KPROBES_ON_FTRACE
-	depends on HAVE_KPROBE_OVERRIDE
-	depends on DYNAMIC_FTRACE_WITH_REGS
-	default n
-	help
-	 Allows BPF to override the execution of a probed function and
-	 set a different return value.  This is used for error injection.
-
 config FTRACE_MCOUNT_RECORD
 	def_bool y
 	depends on DYNAMIC_FTRACE
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 1865b0d4cdeb..506efe6e8ed9 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -13,10 +13,6 @@
 #include <linux/filter.h>
 #include <linux/uaccess.h>
 #include <linux/ctype.h>
-#include <linux/kprobes.h>
-#include <asm/kprobes.h>
-
-#include "trace_probe.h"
 #include "trace.h"
 
 u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
@@ -80,29 +76,6 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
 }
 EXPORT_SYMBOL_GPL(trace_call_bpf);
 
-#ifdef CONFIG_BPF_KPROBE_OVERRIDE
-BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc)
-{
-	__this_cpu_write(bpf_kprobe_override, 1);
-	regs_set_return_value(regs, rc);
-	arch_ftrace_kprobe_override_function(regs);
-	return 0;
-}
-#else
-BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc)
-{
-	return -EINVAL;
-}
-#endif
-
-static const struct bpf_func_proto bpf_override_return_proto = {
-	.func		= bpf_override_return,
-	.gpl_only	= true,
-	.ret_type	= RET_INTEGER,
-	.arg1_type	= ARG_PTR_TO_CTX,
-	.arg2_type	= ARG_ANYTHING,
-};
-
 BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr)
 {
 	int ret;
@@ -578,10 +551,6 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
 		return &bpf_get_stackid_proto;
 	case BPF_FUNC_perf_event_read_value:
 		return &bpf_perf_event_read_value_proto;
-	case BPF_FUNC_override_return:
-		pr_warn_ratelimited("%s[%d] is installing a program with bpf_override_return helper that may cause unexpected behavior!",
-				    current->comm, task_pid_nr(current));
-		return &bpf_override_return_proto;
 	default:
 		return tracing_func_proto(func_id);
 	}
@@ -797,10 +766,6 @@ int perf_event_attach_bpf_prog(struct perf_event *event,
 	struct bpf_prog_array *new_array;
 	int ret = -EEXIST;
 
-	/* Kprobe override only works for ftrace based kprobes. */
-	if (prog->kprobe_override && !trace_kprobe_ftrace(event->tp_event))
-		return -EINVAL;
-
 	mutex_lock(&bpf_event_mutex);
 
 	if (event->prog)
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 8e3c9ec1faf7..abf92e478cfb 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -42,7 +42,6 @@ struct trace_kprobe {
 	(offsetof(struct trace_kprobe, tp.args) +	\
 	(sizeof(struct probe_arg) * (n)))
 
-DEFINE_PER_CPU(int, bpf_kprobe_override);
 
 static nokprobe_inline bool trace_kprobe_is_return(struct trace_kprobe *tk)
 {
@@ -88,12 +87,6 @@ static nokprobe_inline unsigned long trace_kprobe_nhit(struct trace_kprobe *tk)
 	return nhit;
 }
 
-int trace_kprobe_ftrace(struct trace_event_call *call)
-{
-	struct trace_kprobe *tk = (struct trace_kprobe *)call->data;
-	return kprobe_ftrace(&tk->rp.kp);
-}
-
 static int register_kprobe_event(struct trace_kprobe *tk);
 static int unregister_kprobe_event(struct trace_kprobe *tk);
 
@@ -1177,7 +1170,7 @@ static int kretprobe_event_define_fields(struct trace_event_call *event_call)
 #ifdef CONFIG_PERF_EVENTS
 
 /* Kprobe profile handler */
-static int
+static void
 kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
 {
 	struct trace_event_call *call = &tk->tp.call;
@@ -1186,29 +1179,12 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
 	int size, __size, dsize;
 	int rctx;
 
-	if (bpf_prog_array_valid(call)) {
-		int ret;
-
-		ret = trace_call_bpf(call, regs);
-
-		/*
-		 * We need to check and see if we modified the pc of the
-		 * pt_regs, and if so clear the kprobe and return 1 so that we
-		 * don't do the instruction skipping.  Also reset our state so
-		 * we are clean the next pass through.
-		 */
-		if (__this_cpu_read(bpf_kprobe_override)) {
-			__this_cpu_write(bpf_kprobe_override, 0);
-			reset_current_kprobe();
-			return 1;
-		}
-		if (!ret)
-			return 0;
-	}
+	if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs))
+		return;
 
 	head = this_cpu_ptr(call->perf_events);
 	if (hlist_empty(head))
-		return 0;
+		return;
 
 	dsize = __get_data_size(&tk->tp, regs);
 	__size = sizeof(*entry) + tk->tp.size + dsize;
@@ -1217,14 +1193,13 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
 
 	entry = perf_trace_buf_alloc(size, NULL, &rctx);
 	if (!entry)
-		return 0;
+		return;
 
 	entry->ip = (unsigned long)tk->rp.kp.addr;
 	memset(&entry[1], 0, dsize);
 	store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
 	perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
 			      head, NULL, NULL);
-	return 0;
 }
 NOKPROBE_SYMBOL(kprobe_perf_func);
 
@@ -1300,7 +1275,6 @@ static int kprobe_register(struct trace_event_call *event,
 static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
 {
 	struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp);
-	int ret = 0;
 
 	raw_cpu_inc(*tk->nhit);
 
@@ -1308,9 +1282,9 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
 		kprobe_trace_func(tk, regs);
 #ifdef CONFIG_PERF_EVENTS
 	if (tk->tp.flags & TP_FLAG_PROFILE)
-		ret = kprobe_perf_func(tk, regs);
+		kprobe_perf_func(tk, regs);
 #endif
-	return ret;
+	return 0;	/* We don't tweek kernel, so just return 0 */
 }
 NOKPROBE_SYMBOL(kprobe_dispatcher);
 
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index adbb3f7d1fb5..903273c93e61 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -253,7 +253,6 @@ struct symbol_cache;
 unsigned long update_symbol_cache(struct symbol_cache *sc);
 void free_symbol_cache(struct symbol_cache *sc);
 struct symbol_cache *alloc_symbol_cache(const char *sym, long offset);
-int trace_kprobe_ftrace(struct trace_event_call *call);
 #else
 /* uprobes do not support symbol fetch methods */
 #define fetch_symbol_u8			NULL
@@ -279,11 +278,6 @@ alloc_symbol_cache(const char *sym, long offset)
 {
 	return NULL;
 }
-
-static inline int trace_kprobe_ftrace(struct trace_event_call *call)
-{
-	return 0;
-}
 #endif /* CONFIG_KPROBE_EVENTS */
 
 struct probe_arg {
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 87db0f9a4c15..3b4945c1eab0 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -15,7 +15,6 @@ hostprogs-y += tracex3
 hostprogs-y += tracex4
 hostprogs-y += tracex5
 hostprogs-y += tracex6
-hostprogs-y += tracex7
 hostprogs-y += test_probe_write_user
 hostprogs-y += trace_output
 hostprogs-y += lathist
@@ -62,7 +61,6 @@ tracex3-objs := bpf_load.o $(LIBBPF) tracex3_user.o
 tracex4-objs := bpf_load.o $(LIBBPF) tracex4_user.o
 tracex5-objs := bpf_load.o $(LIBBPF) tracex5_user.o
 tracex6-objs := bpf_load.o $(LIBBPF) tracex6_user.o
-tracex7-objs := bpf_load.o $(LIBBPF) tracex7_user.o
 load_sock_ops-objs := bpf_load.o $(LIBBPF) load_sock_ops.o
 test_probe_write_user-objs := bpf_load.o $(LIBBPF) test_probe_write_user_user.o
 trace_output-objs := bpf_load.o $(LIBBPF) trace_output_user.o
@@ -106,7 +104,6 @@ always += tracex3_kern.o
 always += tracex4_kern.o
 always += tracex5_kern.o
 always += tracex6_kern.o
-always += tracex7_kern.o
 always += sock_flags_kern.o
 always += test_probe_write_user_kern.o
 always += trace_output_kern.o
@@ -161,7 +158,6 @@ HOSTLOADLIBES_tracex3 += -lelf
 HOSTLOADLIBES_tracex4 += -lelf -lrt
 HOSTLOADLIBES_tracex5 += -lelf
 HOSTLOADLIBES_tracex6 += -lelf
-HOSTLOADLIBES_tracex7 += -lelf
 HOSTLOADLIBES_test_cgrp2_sock2 += -lelf
 HOSTLOADLIBES_load_sock_ops += -lelf
 HOSTLOADLIBES_test_probe_write_user += -lelf
diff --git a/samples/bpf/test_override_return.sh b/samples/bpf/test_override_return.sh
deleted file mode 100755
index e68b9ee6814b..000000000000
--- a/samples/bpf/test_override_return.sh
+++ /dev/null
@@ -1,15 +0,0 @@
-#!/bin/bash
-
-rm -f testfile.img
-dd if=/dev/zero of=testfile.img bs=1M seek=1000 count=1
-DEVICE=$(losetup --show -f testfile.img)
-mkfs.btrfs -f $DEVICE
-mkdir tmpmnt
-./tracex7 $DEVICE
-if [ $? -eq 0 ]
-then
-	echo "SUCCESS!"
-else
-	echo "FAILED!"
-fi
-losetup -d $DEVICE
diff --git a/samples/bpf/tracex7_kern.c b/samples/bpf/tracex7_kern.c
deleted file mode 100644
index 1ab308a43e0f..000000000000
--- a/samples/bpf/tracex7_kern.c
+++ /dev/null
@@ -1,16 +0,0 @@
-#include <uapi/linux/ptrace.h>
-#include <uapi/linux/bpf.h>
-#include <linux/version.h>
-#include "bpf_helpers.h"
-
-SEC("kprobe/open_ctree")
-int bpf_prog1(struct pt_regs *ctx)
-{
-	unsigned long rc = -12;
-
-	bpf_override_return(ctx, rc);
-	return 0;
-}
-
-char _license[] SEC("license") = "GPL";
-u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/tracex7_user.c b/samples/bpf/tracex7_user.c
deleted file mode 100644
index 8a52ac492e8b..000000000000
--- a/samples/bpf/tracex7_user.c
+++ /dev/null
@@ -1,28 +0,0 @@
-#define _GNU_SOURCE
-
-#include <stdio.h>
-#include <linux/bpf.h>
-#include <unistd.h>
-#include "libbpf.h"
-#include "bpf_load.h"
-
-int main(int argc, char **argv)
-{
-	FILE *f;
-	char filename[256];
-	char command[256];
-	int ret;
-
-	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
-
-	if (load_bpf_file(filename)) {
-		printf("%s", bpf_log_buf);
-		return 1;
-	}
-
-	snprintf(command, 256, "mount %s tmpmnt/", argv[1]);
-	f = popen(command, "r");
-	ret = pclose(f);
-
-	return ret ? 0 : 1;
-}
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index adb66f78b674..e880ae6434ee 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -677,10 +677,6 @@ union bpf_attr {
  *     @buf: buf to fill
  *     @buf_size: size of the buf
  *     Return : 0 on success or negative error code
- *
- * int bpf_override_return(pt_regs, rc)
- *	@pt_regs: pointer to struct pt_regs
- *	@rc: the return value to set
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -740,8 +736,7 @@ union bpf_attr {
 	FN(xdp_adjust_meta),		\
 	FN(perf_event_read_value),	\
 	FN(perf_prog_read_value),	\
-	FN(getsockopt),			\
-	FN(override_return),
+	FN(getsockopt),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h
index 33cb00e46c49..fd9a17fa8a8b 100644
--- a/tools/testing/selftests/bpf/bpf_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_helpers.h
@@ -82,8 +82,7 @@ static int (*bpf_perf_event_read_value)(void *map, unsigned long long flags,
 static int (*bpf_perf_prog_read_value)(void *ctx, void *buf,
 				       unsigned int buf_size) =
 	(void *) BPF_FUNC_perf_prog_read_value;
-static int (*bpf_override_return)(void *ctx, unsigned long rc) =
-	(void *) BPF_FUNC_override_return;
+
 
 /* llvm builtin functions that eBPF C program may use to
  * emit BPF_LD_ABS and BPF_LD_IND instructions
-- 
cgit v1.2.3


From 713bafea92920103cd3d361657406cf04d0e22dd Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Wed, 8 Nov 2017 13:01:26 -0800
Subject: tcp: retire FACK loss detection

FACK loss detection has been disabled by default and the
successor RACK subsumed FACK and can handle reordering better.
This patch removes FACK to simplify TCP loss recovery.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Neal Cardwell <ncardwell@google.com>
Reviewed-by: Soheil Hassas Yeganeh <soheil@google.com>
Reviewed-by: Priyaranjan Jha <priyarjha@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt |  3 +-
 include/linux/tcp.h                    |  1 -
 include/net/tcp.h                      | 14 +--------
 include/uapi/linux/snmp.h              |  1 -
 net/ipv4/proc.c                        |  1 -
 net/ipv4/tcp.c                         |  2 --
 net/ipv4/tcp_input.c                   | 53 +++++-----------------------------
 net/ipv4/tcp_metrics.c                 |  4 +--
 net/ipv4/tcp_minisocks.c               |  5 +---
 net/ipv4/tcp_output.c                  |  5 +---
 10 files changed, 12 insertions(+), 77 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index d8676dda7fa6..46c7e1085efc 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -289,8 +289,7 @@ tcp_ecn_fallback - BOOLEAN
 	Default: 1 (fallback enabled)
 
 tcp_fack - BOOLEAN
-	Enable FACK congestion avoidance and fast retransmission.
-	The value is not used, if tcp_sack is not enabled.
+	This is a legacy option, it has no effect anymore.
 
 tcp_fin_timeout - INTEGER
 	The length of time an orphaned (no longer referenced by any
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 22f40c96a15b..9574936fe041 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -85,7 +85,6 @@ struct tcp_sack_block {
 
 /*These are used to set the sack_ok field in struct tcp_options_received */
 #define TCP_SACK_SEEN     (1 << 0)   /*1 = peer is SACK capable, */
-#define TCP_FACK_ENABLED  (1 << 1)   /*1 = FACK is enabled locally*/
 #define TCP_DSACK_SEEN    (1 << 2)   /*1 = DSACK was received from peer*/
 
 struct tcp_options_received {
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 2f2c69ad31b2..ed71511e67a6 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -384,7 +384,6 @@ void tcp_update_metrics(struct sock *sk);
 void tcp_init_metrics(struct sock *sk);
 void tcp_metrics_init(void);
 bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst);
-void tcp_disable_fack(struct tcp_sock *tp);
 void tcp_close(struct sock *sk, long timeout);
 void tcp_init_sock(struct sock *sk);
 void tcp_init_transfer(struct sock *sk, int bpf_op);
@@ -776,7 +775,7 @@ struct tcp_skb_cb {
 	};
 	__u8		tcp_flags;	/* TCP header flags. (tcp[13])	*/
 
-	__u8		sacked;		/* State flags for SACK/FACK.	*/
+	__u8		sacked;		/* State flags for SACK.	*/
 #define TCPCB_SACKED_ACKED	0x01	/* SKB ACK'd by a SACK block	*/
 #define TCPCB_SACKED_RETRANS	0x02	/* SKB retransmitted		*/
 #define TCPCB_LOST		0x04	/* SKB is lost			*/
@@ -1066,7 +1065,6 @@ void tcp_rate_check_app_limited(struct sock *sk);
  *
  * tcp_is_sack - SACK enabled
  * tcp_is_reno - No SACK
- * tcp_is_fack - FACK enabled, implies SACK enabled
  */
 static inline int tcp_is_sack(const struct tcp_sock *tp)
 {
@@ -1078,16 +1076,6 @@ static inline bool tcp_is_reno(const struct tcp_sock *tp)
 	return !tcp_is_sack(tp);
 }
 
-static inline bool tcp_is_fack(const struct tcp_sock *tp)
-{
-	return tp->rx_opt.sack_ok & TCP_FACK_ENABLED;
-}
-
-static inline void tcp_enable_fack(struct tcp_sock *tp)
-{
-	tp->rx_opt.sack_ok |= TCP_FACK_ENABLED;
-}
-
 static inline unsigned int tcp_left_out(const struct tcp_sock *tp)
 {
 	return tp->sacked_out + tp->lost_out;
diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h
index 0d941cdd8e8c..33a70ece462f 100644
--- a/include/uapi/linux/snmp.h
+++ b/include/uapi/linux/snmp.h
@@ -191,7 +191,6 @@ enum
 	LINUX_MIB_TCPRENORECOVERY,		/* TCPRenoRecovery */
 	LINUX_MIB_TCPSACKRECOVERY,		/* TCPSackRecovery */
 	LINUX_MIB_TCPSACKRENEGING,		/* TCPSACKReneging */
-	LINUX_MIB_TCPFACKREORDER,		/* TCPFACKReorder */
 	LINUX_MIB_TCPSACKREORDER,		/* TCPSACKReorder */
 	LINUX_MIB_TCPRENOREORDER,		/* TCPRenoReorder */
 	LINUX_MIB_TCPTSREORDER,			/* TCPTSReorder */
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 127153f1ed8a..9f37c4727861 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -212,7 +212,6 @@ static const struct snmp_mib snmp4_net_list[] = {
 	SNMP_MIB_ITEM("TCPRenoRecovery", LINUX_MIB_TCPRENORECOVERY),
 	SNMP_MIB_ITEM("TCPSackRecovery", LINUX_MIB_TCPSACKRECOVERY),
 	SNMP_MIB_ITEM("TCPSACKReneging", LINUX_MIB_TCPSACKRENEGING),
-	SNMP_MIB_ITEM("TCPFACKReorder", LINUX_MIB_TCPFACKREORDER),
 	SNMP_MIB_ITEM("TCPSACKReorder", LINUX_MIB_TCPSACKREORDER),
 	SNMP_MIB_ITEM("TCPRenoReorder", LINUX_MIB_TCPRENOREORDER),
 	SNMP_MIB_ITEM("TCPTSReorder", LINUX_MIB_TCPTSREORDER),
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index bc71a27d5ad9..337555076043 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2509,8 +2509,6 @@ static int tcp_repair_options_est(struct sock *sk,
 				return -EINVAL;
 
 			tp->rx_opt.sack_ok |= TCP_SACK_SEEN;
-			if (sock_net(sk)->ipv4.sysctl_tcp_fack)
-				tcp_enable_fack(tp);
 			break;
 		case TCPOPT_TIMESTAMP:
 			if (opt.opt_val != 0)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 9ceaa1fdc3ab..487e181cff86 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -842,18 +842,6 @@ __u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst)
 	return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
 }
 
-/*
- * Packet counting of FACK is based on in-order assumptions, therefore TCP
- * disables it when reordering is detected
- */
-void tcp_disable_fack(struct tcp_sock *tp)
-{
-	/* RFC3517 uses different metric in lost marker => reset on change */
-	if (tcp_is_fack(tp))
-		tp->lost_skb_hint = NULL;
-	tp->rx_opt.sack_ok &= ~TCP_FACK_ENABLED;
-}
-
 /* Take a notice that peer is sending D-SACKs */
 static void tcp_dsack_seen(struct tcp_sock *tp)
 {
@@ -881,7 +869,6 @@ static void tcp_update_reordering(struct sock *sk, const int metric,
 			 tp->sacked_out,
 			 tp->undo_marker ? tp->undo_retrans : 0);
 #endif
-		tcp_disable_fack(tp);
 	}
 
 	tp->rack.reord = 1;
@@ -891,8 +878,6 @@ static void tcp_update_reordering(struct sock *sk, const int metric,
 		mib_idx = LINUX_MIB_TCPTSREORDER;
 	else if (tcp_is_reno(tp))
 		mib_idx = LINUX_MIB_TCPRENOREORDER;
-	else if (tcp_is_fack(tp))
-		mib_idx = LINUX_MIB_TCPFACKREORDER;
 	else
 		mib_idx = LINUX_MIB_TCPSACKREORDER;
 
@@ -970,7 +955,6 @@ void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb)
  * 3. Loss detection event of two flavors:
  *	A. Scoreboard estimator decided the packet is lost.
  *	   A'. Reno "three dupacks" marks head of queue lost.
- *	   A''. Its FACK modification, head until snd.fack is lost.
  *	B. SACK arrives sacking SND.NXT at the moment, when the
  *	   segment was retransmitted.
  * 4. D-SACK added new rule: D-SACK changes any tag to S.
@@ -1248,7 +1232,7 @@ static u8 tcp_sacktag_one(struct sock *sk,
 		fack_count += pcount;
 
 		/* Lost marker hint past SACKed? Tweak RFC3517 cnt */
-		if (!tcp_is_fack(tp) && tp->lost_skb_hint &&
+		if (tp->lost_skb_hint &&
 		    before(start_seq, TCP_SKB_CB(tp->lost_skb_hint)->seq))
 			tp->lost_cnt_hint += pcount;
 
@@ -2051,10 +2035,6 @@ static inline int tcp_fackets_out(const struct tcp_sock *tp)
  * counter when SACK is enabled (without SACK, sacked_out is used for
  * that purpose).
  *
- * Instead, with FACK TCP uses fackets_out that includes both SACKed
- * segments up to the highest received SACK block so far and holes in
- * between them.
- *
  * With reordering, holes may still be in flight, so RFC3517 recovery
  * uses pure sacked_out (total number of SACKed segments) even though
  * it violates the RFC that uses duplicate ACKs, often these are equal
@@ -2064,10 +2044,10 @@ static inline int tcp_fackets_out(const struct tcp_sock *tp)
  */
 static inline int tcp_dupack_heuristics(const struct tcp_sock *tp)
 {
-	return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1;
+	return tp->sacked_out + 1;
 }
 
-/* Linux NewReno/SACK/FACK/ECN state machine.
+/* Linux NewReno/SACK/ECN state machine.
  * --------------------------------------
  *
  * "Open"	Normal state, no dubious events, fast path.
@@ -2132,16 +2112,6 @@ static inline int tcp_dupack_heuristics(const struct tcp_sock *tp)
  *		dynamically measured and adjusted. This is implemented in
  *		tcp_rack_mark_lost.
  *
- *		FACK (Disabled by default. Subsumbed by RACK):
- *		It is the simplest heuristics. As soon as we decided
- *		that something is lost, we decide that _all_ not SACKed
- *		packets until the most forward SACK are lost. I.e.
- *		lost_out = fackets_out - sacked_out and left_out = fackets_out.
- *		It is absolutely correct estimate, if network does not reorder
- *		packets. And it loses any connection to reality when reordering
- *		takes place. We use FACK by default until reordering
- *		is suspected on the path to this destination.
- *
  *		If the receiver does not support SACK:
  *
  *		NewReno (RFC6582): in Recovery we assume that one segment
@@ -2190,7 +2160,7 @@ static bool tcp_time_to_recover(struct sock *sk, int flag)
 }
 
 /* Detect loss in event "A" above by marking head of queue up as lost.
- * For FACK or non-SACK(Reno) senders, the first "packets" number of segments
+ * For non-SACK(Reno) senders, the first "packets" number of segments
  * are considered lost. For RFC3517 SACK, a segment is considered lost if it
  * has at least tp->reordering SACKed seqments above it; "packets" refers to
  * the maximum SACKed segments to pass before reaching this limit.
@@ -2226,12 +2196,12 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
 			break;
 
 		oldcnt = cnt;
-		if (tcp_is_fack(tp) || tcp_is_reno(tp) ||
+		if (tcp_is_reno(tp) ||
 		    (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
 			cnt += tcp_skb_pcount(skb);
 
 		if (cnt > packets) {
-			if ((tcp_is_sack(tp) && !tcp_is_fack(tp)) ||
+			if (tcp_is_sack(tp) ||
 			    (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) ||
 			    (oldcnt >= packets))
 				break;
@@ -2262,11 +2232,6 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
 
 	if (tcp_is_reno(tp)) {
 		tcp_mark_head_lost(sk, 1, 1);
-	} else if (tcp_is_fack(tp)) {
-		int lost = tp->fackets_out - tp->reordering;
-		if (lost <= 0)
-			lost = 1;
-		tcp_mark_head_lost(sk, lost, 0);
 	} else {
 		int sacked_upto = tp->sacked_out - tp->reordering;
 		if (sacked_upto >= 0)
@@ -3199,8 +3164,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 			if (reord < prior_fackets && reord <= tp->fackets_out)
 				tcp_update_reordering(sk, tp->fackets_out - reord, 0);
 
-			delta = tcp_is_fack(tp) ? pkts_acked :
-						  prior_sacked - tp->sacked_out;
+			delta = prior_sacked - tp->sacked_out;
 			tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta);
 		}
 
@@ -5708,9 +5672,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 			tp->tcp_header_len = sizeof(struct tcphdr);
 		}
 
-		if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_fack)
-			tcp_enable_fack(tp);
-
 		tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
 		tcp_initialize_rcv_mss(sk);
 
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 9d5ddebfd831..7097f92d16e5 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -470,10 +470,8 @@ void tcp_init_metrics(struct sock *sk)
 		tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
 	}
 	val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
-	if (val && tp->reordering != val) {
-		tcp_disable_fack(tp);
+	if (val && tp->reordering != val)
 		tp->reordering = val;
-	}
 
 	crtt = tcp_metric_get(tm, TCP_METRIC_RTT);
 	rcu_read_unlock();
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 4bb86580decd..326c9282bf94 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -509,10 +509,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
 						       keepalive_time_when(newtp));
 
 		newtp->rx_opt.tstamp_ok = ireq->tstamp_ok;
-		if ((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) {
-			if (sock_net(sk)->ipv4.sysctl_tcp_fack)
-				tcp_enable_fack(newtp);
-		}
+		newtp->rx_opt.sack_ok = ireq->sack_ok;
 		newtp->window_clamp = req->rsk_window_clamp;
 		newtp->rcv_ssthresh = req->rsk_rcv_wnd;
 		newtp->rcv_wnd = req->rsk_rcv_wnd;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 9b98d35aa0d8..094c429b4401 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1257,7 +1257,7 @@ static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int de
 
 	if (tp->lost_skb_hint &&
 	    before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->lost_skb_hint)->seq) &&
-	    (tcp_is_fack(tp) || (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)))
+	    (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
 		tp->lost_cnt_hint -= decr;
 
 	tcp_verify_left_out(tp);
@@ -2961,9 +2961,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
  * retransmitted data is acknowledged.  It tries to continue
  * resending the rest of the retransmit queue, until either
  * we've sent it all or the congestion window limit is reached.
- * If doing SACK, the first ACK which comes back for a timeout
- * based retransmit packet might feed us FACK information again.
- * If so, we use it to avoid unnecessarily retransmissions.
  */
 void tcp_xmit_retransmit_queue(struct sock *sk)
 {
-- 
cgit v1.2.3


From 99803171ef04037092bf5eb29ae801e8b4d49a75 Mon Sep 17 00:00:00 2001
From: Dave Taht <dave.taht@gmail.com>
Date: Wed, 8 Nov 2017 15:12:27 -0800
Subject: netem: add uapi to express delay and jitter in nanoseconds

netem userspace has long relied on a horrible /proc/net/psched hack
to translate the current notion of "ticks" to nanoseconds.

Expressing latency and jitter in well-defined nanoseconds instead
increases the dynamic range of emulated delays and jitter in netem.

It will also ease a transition where reducing a tick to nsec
equivalence would constrain the max delay in prior versions of
netem to only 4.3 seconds.

Signed-off-by: Dave Taht <dave.taht@gmail.com>
Suggested-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/pkt_sched.h |  2 ++
 net/sched/sch_netem.c          | 14 ++++++++++++++
 2 files changed, 16 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 6a2c5ea7e9c4..8fe6d1842bee 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -537,6 +537,8 @@ enum {
 	TCA_NETEM_ECN,
 	TCA_NETEM_RATE64,
 	TCA_NETEM_PAD,
+	TCA_NETEM_LATENCY64,
+	TCA_NETEM_JITTER64,
 	__TCA_NETEM_MAX,
 };
 
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index e64e0e0d94ff..47d6decba0ea 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -819,6 +819,8 @@ static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
 	[TCA_NETEM_LOSS]	= { .type = NLA_NESTED },
 	[TCA_NETEM_ECN]		= { .type = NLA_U32 },
 	[TCA_NETEM_RATE64]	= { .type = NLA_U64 },
+	[TCA_NETEM_LATENCY64]	= { .type = NLA_S64 },
+	[TCA_NETEM_JITTER64]	= { .type = NLA_S64 },
 };
 
 static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
@@ -916,6 +918,12 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt)
 		q->rate = max_t(u64, q->rate,
 				nla_get_u64(tb[TCA_NETEM_RATE64]));
 
+	if (tb[TCA_NETEM_LATENCY64])
+		q->latency = nla_get_s64(tb[TCA_NETEM_LATENCY64]);
+
+	if (tb[TCA_NETEM_JITTER64])
+		q->jitter = nla_get_s64(tb[TCA_NETEM_JITTER64]);
+
 	if (tb[TCA_NETEM_ECN])
 		q->ecn = nla_get_u32(tb[TCA_NETEM_ECN]);
 
@@ -1020,6 +1028,12 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
 	if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt))
 		goto nla_put_failure;
 
+	if (nla_put(skb, TCA_NETEM_LATENCY64, sizeof(q->latency), &q->latency))
+		goto nla_put_failure;
+
+	if (nla_put(skb, TCA_NETEM_JITTER64, sizeof(q->jitter), &q->jitter))
+		goto nla_put_failure;
+
 	cor.delay_corr = q->delay_cor.rho;
 	cor.loss_corr = q->loss_cor.rho;
 	cor.dup_corr = q->dup_cor.rho;
-- 
cgit v1.2.3


From 836af83b54e3e285c4a0cc06c24aeb737d3e0e18 Mon Sep 17 00:00:00 2001
From: Dave Taht <dave.taht@gmail.com>
Date: Wed, 8 Nov 2017 15:12:28 -0800
Subject: netem: support delivering packets in delayed time slots

Slotting is a crude approximation of the behaviors of shared media such
as cable, wifi, and LTE, which gather up a bunch of packets within a
varying delay window and deliver them, relative to that, nearly all at
once.

It works within the existing loss, duplication, jitter and delay
parameters of netem. Some amount of inherent latency must be specified,
regardless.

The new "slot" parameter specifies a minimum and maximum delay between
transmission attempts.

The "bytes" and "packets" parameters can be used to limit the amount of
information transferred per slot.

Examples of use:

tc qdisc add dev eth0 root netem delay 200us \
         slot 800us 10ms bytes 64k packets 42

A more correct example, using stacked netem instances and a packet limit
to emulate a tail drop wifi queue with slots and variable packet
delivery, with a 200Mbit isochronous underlying rate, and 20ms path
delay:

tc qdisc add dev eth0 root handle 1: netem delay 20ms rate 200mbit \
         limit 10000
tc qdisc add dev eth0 parent 1:1 handle 10:1 netem delay 200us \
         slot 800us 10ms bytes 64k packets 42 limit 512

Signed-off-by: Dave Taht <dave.taht@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/pkt_sched.h |  8 +++++
 net/sched/sch_netem.c          | 74 ++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 79 insertions(+), 3 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 8fe6d1842bee..af3cc2f4e1ad 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -539,6 +539,7 @@ enum {
 	TCA_NETEM_PAD,
 	TCA_NETEM_LATENCY64,
 	TCA_NETEM_JITTER64,
+	TCA_NETEM_SLOT,
 	__TCA_NETEM_MAX,
 };
 
@@ -576,6 +577,13 @@ struct tc_netem_rate {
 	__s32	cell_overhead;
 };
 
+struct tc_netem_slot {
+	__s64   min_delay; /* nsec */
+	__s64   max_delay;
+	__s32   max_packets;
+	__s32   max_bytes;
+};
+
 enum {
 	NETEM_LOSS_UNSPEC,
 	NETEM_LOSS_GI,		/* General Intuitive - 4 state model */
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 47d6decba0ea..b686e755fda9 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -135,6 +135,13 @@ struct netem_sched_data {
 		u32 a5; /* p23 used only in 4-states */
 	} clg;
 
+	struct tc_netem_slot slot_config;
+	struct slotstate {
+		u64 slot_next;
+		s32 packets_left;
+		s32 bytes_left;
+	} slot;
+
 };
 
 /* Time stamp put into socket buffer control block
@@ -591,6 +598,20 @@ finish_segs:
 	return NET_XMIT_SUCCESS;
 }
 
+/* Delay the next round with a new future slot with a
+ * correct number of bytes and packets.
+ */
+
+static void get_slot_next(struct netem_sched_data *q, u64 now)
+{
+	q->slot.slot_next = now + q->slot_config.min_delay +
+		(prandom_u32() *
+			(q->slot_config.max_delay -
+				q->slot_config.min_delay) >> 32);
+	q->slot.packets_left = q->slot_config.max_packets;
+	q->slot.bytes_left = q->slot_config.max_bytes;
+}
+
 static struct sk_buff *netem_dequeue(struct Qdisc *sch)
 {
 	struct netem_sched_data *q = qdisc_priv(sch);
@@ -608,14 +629,17 @@ deliver:
 	p = rb_first(&q->t_root);
 	if (p) {
 		u64 time_to_send;
+		u64 now = ktime_get_ns();
 
 		skb = rb_to_skb(p);
 
 		/* if more time remaining? */
 		time_to_send = netem_skb_cb(skb)->time_to_send;
-		if (time_to_send <= ktime_get_ns()) {
-			rb_erase(p, &q->t_root);
+		if (q->slot.slot_next && q->slot.slot_next < time_to_send)
+			get_slot_next(q, now);
 
+		if (time_to_send <= now &&  q->slot.slot_next <= now) {
+			rb_erase(p, &q->t_root);
 			sch->q.qlen--;
 			qdisc_qstats_backlog_dec(sch, skb);
 			skb->next = NULL;
@@ -634,6 +658,14 @@ deliver:
 				skb->tstamp = 0;
 #endif
 
+			if (q->slot.slot_next) {
+				q->slot.packets_left--;
+				q->slot.bytes_left -= qdisc_pkt_len(skb);
+				if (q->slot.packets_left <= 0 ||
+				    q->slot.bytes_left <= 0)
+					get_slot_next(q, now);
+			}
+
 			if (q->qdisc) {
 				unsigned int pkt_len = qdisc_pkt_len(skb);
 				struct sk_buff *to_free = NULL;
@@ -657,7 +689,10 @@ deliver:
 			if (skb)
 				goto deliver;
 		}
-		qdisc_watchdog_schedule_ns(&q->watchdog, time_to_send);
+
+		qdisc_watchdog_schedule_ns(&q->watchdog,
+					   max(time_to_send,
+					       q->slot.slot_next));
 	}
 
 	if (q->qdisc) {
@@ -688,6 +723,7 @@ static void dist_free(struct disttable *d)
  * Distribution data is a variable size payload containing
  * signed 16 bit values.
  */
+
 static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr)
 {
 	struct netem_sched_data *q = qdisc_priv(sch);
@@ -718,6 +754,23 @@ static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr)
 	return 0;
 }
 
+static void get_slot(struct netem_sched_data *q, const struct nlattr *attr)
+{
+	const struct tc_netem_slot *c = nla_data(attr);
+
+	q->slot_config = *c;
+	if (q->slot_config.max_packets == 0)
+		q->slot_config.max_packets = INT_MAX;
+	if (q->slot_config.max_bytes == 0)
+		q->slot_config.max_bytes = INT_MAX;
+	q->slot.packets_left = q->slot_config.max_packets;
+	q->slot.bytes_left = q->slot_config.max_bytes;
+	if (q->slot_config.min_delay | q->slot_config.max_delay)
+		q->slot.slot_next = ktime_get_ns();
+	else
+		q->slot.slot_next = 0;
+}
+
 static void get_correlation(struct netem_sched_data *q, const struct nlattr *attr)
 {
 	const struct tc_netem_corr *c = nla_data(attr);
@@ -821,6 +874,7 @@ static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
 	[TCA_NETEM_RATE64]	= { .type = NLA_U64 },
 	[TCA_NETEM_LATENCY64]	= { .type = NLA_S64 },
 	[TCA_NETEM_JITTER64]	= { .type = NLA_S64 },
+	[TCA_NETEM_SLOT]	= { .len = sizeof(struct tc_netem_slot) },
 };
 
 static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
@@ -927,6 +981,9 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt)
 	if (tb[TCA_NETEM_ECN])
 		q->ecn = nla_get_u32(tb[TCA_NETEM_ECN]);
 
+	if (tb[TCA_NETEM_SLOT])
+		get_slot(q, tb[TCA_NETEM_SLOT]);
+
 	return ret;
 }
 
@@ -1016,6 +1073,7 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
 	struct tc_netem_reorder reorder;
 	struct tc_netem_corrupt corrupt;
 	struct tc_netem_rate rate;
+	struct tc_netem_slot slot;
 
 	qopt.latency = min_t(psched_tdiff_t, PSCHED_NS2TICKS(q->latency),
 			     UINT_MAX);
@@ -1070,6 +1128,16 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
 	if (dump_loss_model(q, skb) != 0)
 		goto nla_put_failure;
 
+	if (q->slot_config.min_delay | q->slot_config.max_delay) {
+		slot = q->slot_config;
+		if (slot.max_packets == INT_MAX)
+			slot.max_packets = 0;
+		if (slot.max_bytes == INT_MAX)
+			slot.max_bytes = 0;
+		if (nla_put(skb, TCA_NETEM_SLOT, sizeof(slot), &slot))
+			goto nla_put_failure;
+	}
+
 	return nla_nest_end(skb, nla);
 
 nla_put_failure:
-- 
cgit v1.2.3


From 5794040647de4011598a6d005fdad95d24fd385b Mon Sep 17 00:00:00 2001
From: Andy Zhou <azhou@ovn.org>
Date: Fri, 10 Nov 2017 12:09:40 -0800
Subject: openvswitch: Add meter netlink definitions

Meter has its own netlink family. Define netlink messages and attributes
for communicating with the user space programs.

Signed-off-by: Andy Zhou <azhou@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/openvswitch.h | 51 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index ec75a685f1dd..d60b9a4cf3d1 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -883,4 +883,55 @@ enum ovs_action_attr {
 
 #define OVS_ACTION_ATTR_MAX (__OVS_ACTION_ATTR_MAX - 1)
 
+/* Meters. */
+#define OVS_METER_FAMILY  "ovs_meter"
+#define OVS_METER_MCGROUP "ovs_meter"
+#define OVS_METER_VERSION 0x1
+
+enum ovs_meter_cmd {
+	OVS_METER_CMD_UNSPEC,
+	OVS_METER_CMD_FEATURES,	/* Get features supported by the datapath. */
+	OVS_METER_CMD_SET,	/* Add or modify a meter. */
+	OVS_METER_CMD_DEL,	/* Delete a meter. */
+	OVS_METER_CMD_GET	/* Get meter stats. */
+};
+
+enum ovs_meter_attr {
+	OVS_METER_ATTR_UNSPEC,
+	OVS_METER_ATTR_ID,	/* u32 meter ID within datapath. */
+	OVS_METER_ATTR_KBPS,	/* No argument. If set, units in kilobits
+				 * per second. Otherwise, units in
+				 * packets per second.
+				 */
+	OVS_METER_ATTR_STATS,	/* struct ovs_flow_stats for the meter. */
+	OVS_METER_ATTR_BANDS,	/* Nested attributes for meter bands. */
+	OVS_METER_ATTR_USED,	/* u64 msecs last used in monotonic time. */
+	OVS_METER_ATTR_CLEAR,	/* Flag to clear stats, used. */
+	OVS_METER_ATTR_MAX_METERS, /* u32 number of meters supported. */
+	OVS_METER_ATTR_MAX_BANDS,  /* u32 max number of bands per meter. */
+	OVS_METER_ATTR_PAD,
+	__OVS_METER_ATTR_MAX
+};
+
+#define OVS_METER_ATTR_MAX (__OVS_METER_ATTR_MAX - 1)
+
+enum ovs_band_attr {
+	OVS_BAND_ATTR_UNSPEC,
+	OVS_BAND_ATTR_TYPE,	/* u32 OVS_METER_BAND_TYPE_* constant. */
+	OVS_BAND_ATTR_RATE,	/* u32 band rate in meter units (see above). */
+	OVS_BAND_ATTR_BURST,	/* u32 burst size in meter units. */
+	OVS_BAND_ATTR_STATS,	/* struct ovs_flow_stats for the band. */
+	__OVS_BAND_ATTR_MAX
+};
+
+#define OVS_BAND_ATTR_MAX (__OVS_BAND_ATTR_MAX - 1)
+
+enum ovs_meter_band_type {
+	OVS_METER_BAND_TYPE_UNSPEC,
+	OVS_METER_BAND_TYPE_DROP,   /* Drop exceeding packets. */
+	__OVS_METER_BAND_TYPE_MAX
+};
+
+#define OVS_METER_BAND_TYPE_MAX (__OVS_METER_BAND_TYPE_MAX - 1)
+
 #endif /* _LINUX_OPENVSWITCH_H */
-- 
cgit v1.2.3


From cd8a6c33693c1b89d2737ffdbf9611564e9ac907 Mon Sep 17 00:00:00 2001
From: Andy Zhou <azhou@ovn.org>
Date: Fri, 10 Nov 2017 12:09:43 -0800
Subject: openvswitch: Add meter action support

Implements OVS kernel meter action support.

Signed-off-by: Andy Zhou <azhou@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/openvswitch.h | 3 +++
 net/openvswitch/actions.c        | 6 ++++++
 net/openvswitch/datapath.h       | 1 +
 net/openvswitch/flow_netlink.c   | 6 ++++++
 4 files changed, 16 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index d60b9a4cf3d1..4265d7f9e1f2 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -838,6 +838,8 @@ struct ovs_action_push_eth {
  * @OVS_ACTION_ATTR_CT_CLEAR: Clear conntrack state from the packet.
  * @OVS_ACTION_ATTR_PUSH_NSH: push NSH header to the packet.
  * @OVS_ACTION_ATTR_POP_NSH: pop the outermost NSH header off the packet.
+ * @OVS_ACTION_ATTR_METER: Run packet through a meter, which may drop the
+ * packet, or modify the packet (e.g., change the DSCP field).
  *
  * Only a single header can be set with a single %OVS_ACTION_ATTR_SET.  Not all
  * fields within a header are modifiable, e.g. the IPv4 protocol and fragment
@@ -870,6 +872,7 @@ enum ovs_action_attr {
 	OVS_ACTION_ATTR_CT_CLEAR,     /* No argument. */
 	OVS_ACTION_ATTR_PUSH_NSH,     /* Nested OVS_NSH_KEY_ATTR_*. */
 	OVS_ACTION_ATTR_POP_NSH,      /* No argument. */
+	OVS_ACTION_ATTR_METER,        /* u32 meter ID. */
 
 	__OVS_ACTION_ATTR_MAX,	      /* Nothing past this will be accepted
 				       * from userspace. */
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 9a6a6d51e421..30a5df27116e 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -1330,6 +1330,12 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
 		case OVS_ACTION_ATTR_POP_NSH:
 			err = pop_nsh(skb, key);
 			break;
+
+		case OVS_ACTION_ATTR_METER:
+			if (ovs_meter_execute(dp, skb, key, nla_get_u32(a))) {
+				consume_skb(skb);
+				return 0;
+			}
 		}
 
 		if (unlikely(err)) {
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index 5d2997b42460..523d65526766 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -30,6 +30,7 @@
 #include "conntrack.h"
 #include "flow.h"
 #include "flow_table.h"
+#include "meter.h"
 #include "vport-internal_dev.h"
 
 #define DP_MAX_PORTS           USHRT_MAX
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index 4201f9293af3..bb4dae198c78 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -90,6 +90,7 @@ static bool actions_may_change_flow(const struct nlattr *actions)
 		case OVS_ACTION_ATTR_SAMPLE:
 		case OVS_ACTION_ATTR_SET:
 		case OVS_ACTION_ATTR_SET_MASKED:
+		case OVS_ACTION_ATTR_METER:
 		default:
 			return true;
 		}
@@ -2844,6 +2845,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
 			[OVS_ACTION_ATTR_POP_ETH] = 0,
 			[OVS_ACTION_ATTR_PUSH_NSH] = (u32)-1,
 			[OVS_ACTION_ATTR_POP_NSH] = 0,
+			[OVS_ACTION_ATTR_METER] = sizeof(u32),
 		};
 		const struct ovs_action_push_vlan *vlan;
 		int type = nla_type(a);
@@ -3029,6 +3031,10 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
 			break;
 		}
 
+		case OVS_ACTION_ATTR_METER:
+			/* Non-existent meters are simply ignored.  */
+			break;
+
 		default:
 			OVS_NLERR(log, "Unknown Action type %d", type);
 			return -EINVAL;
-- 
cgit v1.2.3


From f044c8847bb61eff5e1e95b6f6bb950e7f4a73a4 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 2 Nov 2017 15:27:45 +0000
Subject: afs: Lay the groundwork for supporting network namespaces

Lay the groundwork for supporting network namespaces (netns) in the AFS
filesystem by moving various global features to a network-namespace struct
(afs_net) and providing an instance of this as a temporary global variable
that everything uses via accessor functions for the moment.

The following changes have been made:

 (1) Store the netns in the superblock info.  This will be obtained from
     the mounter's nsproxy on a manual mount and inherited from the parent
     superblock on an automount.

 (2) The cell list is made per-netns.  It can be viewed through
     /proc/net/afs/cells and also be modified by writing commands to that
     file.

 (3) The local workstation cell is set per-ns in /proc/net/afs/rootcell.
     This is unset by default.

 (4) The 'rootcell' module parameter, which sets a cell and VL server list,
     modifies the init net namespace, thereby allowing an AFS root fs to be
     theoretically used.

 (5) The volume location lists and the file lock manager are made
     per-netns.

 (6) The AF_RXRPC socket and associated I/O bits are made per-ns.

The various workqueues remain global for the moment.

Changes still to be made:

 (1) /proc/fs/afs/ should be moved to /proc/net/afs/ and a symlink emplaced
     from the old name.

 (2) A per-netns subsys needs to be registered for AFS into which it can
     store its per-netns data.

 (3) Rather than the AF_RXRPC socket being opened on module init, it needs
     to be opened on the creation of a superblock in that netns.

 (4) The socket needs to be closed when the last superblock using it is
     destroyed and all outstanding client calls on it have been completed.
     This prevents a reference loop on the namespace.

 (5) It is possible that several namespaces will want to use AFS, in which
     case each one will need its own UDP port.  These can either be set
     through /proc/net/afs/cm_port or the kernel can pick one at random.
     The init_ns gets 7001 by default.

Other issues that need resolving:

 (1) The DNS keyring needs net-namespacing.

 (2) Where do upcalls go (eg. DNS request-key upcall)?

 (3) Need something like open_socket_in_file_ns() syscall so that AFS
     command line tools attempting to operate on an AFS file/volume have
     their RPC calls go to the right place.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 fs/afs/afs.h               |   9 +++
 fs/afs/callback.c          |  24 +------
 fs/afs/cell.c              | 130 ++++++++++++++++++------------------
 fs/afs/cmservice.c         |  26 ++++----
 fs/afs/flock.c             |  39 +----------
 fs/afs/fsclient.c          |  56 ++++++++++------
 fs/afs/internal.h          | 163 +++++++++++++++++++++++++++++++++------------
 fs/afs/main.c              | 153 +++++++++++++++++++++++++++++-------------
 fs/afs/proc.c              |  64 +++++++++++-------
 fs/afs/rxrpc.c             | 132 ++++++++++++++++++------------------
 fs/afs/server.c            |  82 +++++++++++------------
 fs/afs/super.c             |  45 +++++++------
 fs/afs/vlclient.c          |  10 +--
 fs/afs/vlocation.c         | 151 +++++++++++++++++++----------------------
 fs/afs/volume.c            |  10 +--
 include/uapi/linux/magic.h |   1 +
 16 files changed, 603 insertions(+), 492 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/fs/afs/afs.h b/fs/afs/afs.h
index 3c462ff6db63..93053115bcfc 100644
--- a/fs/afs/afs.h
+++ b/fs/afs/afs.h
@@ -72,6 +72,15 @@ struct afs_callback {
 
 #define AFSCBMAX 50	/* maximum callbacks transferred per bulk op */
 
+struct afs_uuid {
+	__be32		time_low;			/* low part of timestamp */
+	__be16		time_mid;			/* mid part of timestamp */
+	__be16		time_hi_and_version;		/* high part of timestamp and version  */
+	__u8		clock_seq_hi_and_reserved;	/* clock seq hi and variant */
+	__u8		clock_seq_low;			/* clock seq low */
+	__u8		node[6];			/* spatially unique node ID (MAC addr) */
+};
+
 /*
  * AFS volume information
  */
diff --git a/fs/afs/callback.c b/fs/afs/callback.c
index 25d404d22cae..d12dffb76b67 100644
--- a/fs/afs/callback.c
+++ b/fs/afs/callback.c
@@ -28,9 +28,7 @@ unsigned afs_vnode_update_timeout = 10;
 	CIRC_SPACE((server)->cb_break_head, (server)->cb_break_tail,	\
 		   ARRAY_SIZE((server)->cb_break))
 
-//static void afs_callback_updater(struct work_struct *);
-
-static struct workqueue_struct *afs_callback_update_worker;
+struct workqueue_struct *afs_callback_update_worker;
 
 /*
  * allow the fileserver to request callback state (re-)initialisation
@@ -343,7 +341,7 @@ void afs_dispatch_give_up_callbacks(struct work_struct *work)
 	 *   had callbacks entirely, and the server will call us later to break
 	 *   them
 	 */
-	afs_fs_give_up_callbacks(server, true);
+	afs_fs_give_up_callbacks(server->cell->net, server, true);
 }
 
 /*
@@ -456,21 +454,3 @@ static void afs_callback_updater(struct work_struct *work)
 	afs_put_vnode(vl);
 }
 #endif
-
-/*
- * initialise the callback update process
- */
-int __init afs_callback_update_init(void)
-{
-	afs_callback_update_worker = alloc_ordered_workqueue("kafs_callbackd",
-							     WQ_MEM_RECLAIM);
-	return afs_callback_update_worker ? 0 : -ENOMEM;
-}
-
-/*
- * shut down the callback update process
- */
-void afs_callback_update_kill(void)
-{
-	destroy_workqueue(afs_callback_update_worker);
-}
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index ca0a3cf93791..bd570fa539a0 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -18,20 +18,12 @@
 #include <keys/rxrpc-type.h>
 #include "internal.h"
 
-DECLARE_RWSEM(afs_proc_cells_sem);
-LIST_HEAD(afs_proc_cells);
-
-static LIST_HEAD(afs_cells);
-static DEFINE_RWLOCK(afs_cells_lock);
-static DECLARE_RWSEM(afs_cells_sem); /* add/remove serialisation */
-static DECLARE_WAIT_QUEUE_HEAD(afs_cells_freeable_wq);
-static struct afs_cell *afs_cell_root;
-
 /*
  * allocate a cell record and fill in its name, VL server address list and
  * allocate an anonymous key
  */
-static struct afs_cell *afs_cell_alloc(const char *name, unsigned namelen,
+static struct afs_cell *afs_cell_alloc(struct afs_net *net,
+				       const char *name, unsigned namelen,
 				       char *vllist)
 {
 	struct afs_cell *cell;
@@ -62,6 +54,7 @@ static struct afs_cell *afs_cell_alloc(const char *name, unsigned namelen,
 
 	atomic_set(&cell->usage, 1);
 	INIT_LIST_HEAD(&cell->link);
+	cell->net = net;
 	rwlock_init(&cell->servers_lock);
 	INIT_LIST_HEAD(&cell->servers);
 	init_rwsem(&cell->vl_sem);
@@ -142,12 +135,14 @@ error:
 
 /*
  * afs_cell_crate() - create a cell record
+ * @net:	The network namespace
  * @name:	is the name of the cell.
  * @namsesz:	is the strlen of the cell name.
  * @vllist:	is a colon separated list of IP addresses in "a.b.c.d" format.
  * @retref:	is T to return the cell reference when the cell exists.
  */
-struct afs_cell *afs_cell_create(const char *name, unsigned namesz,
+struct afs_cell *afs_cell_create(struct afs_net *net,
+				 const char *name, unsigned namesz,
 				 char *vllist, bool retref)
 {
 	struct afs_cell *cell;
@@ -155,23 +150,23 @@ struct afs_cell *afs_cell_create(const char *name, unsigned namesz,
 
 	_enter("%*.*s,%s", namesz, namesz, name ?: "", vllist);
 
-	down_write(&afs_cells_sem);
-	read_lock(&afs_cells_lock);
-	list_for_each_entry(cell, &afs_cells, link) {
+	down_write(&net->cells_sem);
+	read_lock(&net->cells_lock);
+	list_for_each_entry(cell, &net->cells, link) {
 		if (strncasecmp(cell->name, name, namesz) == 0)
 			goto duplicate_name;
 	}
-	read_unlock(&afs_cells_lock);
+	read_unlock(&net->cells_lock);
 
-	cell = afs_cell_alloc(name, namesz, vllist);
+	cell = afs_cell_alloc(net, name, namesz, vllist);
 	if (IS_ERR(cell)) {
 		_leave(" = %ld", PTR_ERR(cell));
-		up_write(&afs_cells_sem);
+		up_write(&net->cells_sem);
 		return cell;
 	}
 
 	/* add a proc directory for this cell */
-	ret = afs_proc_cell_setup(cell);
+	ret = afs_proc_cell_setup(net, cell);
 	if (ret < 0)
 		goto error;
 
@@ -183,20 +178,20 @@ struct afs_cell *afs_cell_create(const char *name, unsigned namesz,
 #endif
 
 	/* add to the cell lists */
-	write_lock(&afs_cells_lock);
-	list_add_tail(&cell->link, &afs_cells);
-	write_unlock(&afs_cells_lock);
+	write_lock(&net->cells_lock);
+	list_add_tail(&cell->link, &net->cells);
+	write_unlock(&net->cells_lock);
 
-	down_write(&afs_proc_cells_sem);
-	list_add_tail(&cell->proc_link, &afs_proc_cells);
-	up_write(&afs_proc_cells_sem);
-	up_write(&afs_cells_sem);
+	down_write(&net->proc_cells_sem);
+	list_add_tail(&cell->proc_link, &net->proc_cells);
+	up_write(&net->proc_cells_sem);
+	up_write(&net->cells_sem);
 
 	_leave(" = %p", cell);
 	return cell;
 
 error:
-	up_write(&afs_cells_sem);
+	up_write(&net->cells_sem);
 	key_put(cell->anonymous_key);
 	kfree(cell);
 	_leave(" = %d", ret);
@@ -206,8 +201,8 @@ duplicate_name:
 	if (retref && !IS_ERR(cell))
 		afs_get_cell(cell);
 
-	read_unlock(&afs_cells_lock);
-	up_write(&afs_cells_sem);
+	read_unlock(&net->cells_lock);
+	up_write(&net->cells_sem);
 
 	if (retref) {
 		_leave(" = %p", cell);
@@ -223,7 +218,7 @@ duplicate_name:
  * - can be called with a module parameter string
  * - can be called from a write to /proc/fs/afs/rootcell
  */
-int afs_cell_init(char *rootcell)
+int afs_cell_init(struct afs_net *net, char *rootcell)
 {
 	struct afs_cell *old_root, *new_root;
 	char *cp;
@@ -245,17 +240,17 @@ int afs_cell_init(char *rootcell)
 		*cp++ = 0;
 
 	/* allocate a cell record for the root cell */
-	new_root = afs_cell_create(rootcell, strlen(rootcell), cp, false);
+	new_root = afs_cell_create(net, rootcell, strlen(rootcell), cp, false);
 	if (IS_ERR(new_root)) {
 		_leave(" = %ld", PTR_ERR(new_root));
 		return PTR_ERR(new_root);
 	}
 
 	/* install the new cell */
-	write_lock(&afs_cells_lock);
-	old_root = afs_cell_root;
-	afs_cell_root = new_root;
-	write_unlock(&afs_cells_lock);
+	write_lock(&net->cells_lock);
+	old_root = net->ws_cell;
+	net->ws_cell = new_root;
+	write_unlock(&net->cells_lock);
 	afs_put_cell(old_root);
 
 	_leave(" = 0");
@@ -265,19 +260,20 @@ int afs_cell_init(char *rootcell)
 /*
  * lookup a cell record
  */
-struct afs_cell *afs_cell_lookup(const char *name, unsigned namesz,
+struct afs_cell *afs_cell_lookup(struct afs_net *net,
+				 const char *name, unsigned namesz,
 				 bool dns_cell)
 {
 	struct afs_cell *cell;
 
 	_enter("\"%*.*s\",", namesz, namesz, name ?: "");
 
-	down_read(&afs_cells_sem);
-	read_lock(&afs_cells_lock);
+	down_read(&net->cells_sem);
+	read_lock(&net->cells_lock);
 
 	if (name) {
 		/* if the cell was named, look for it in the cell record list */
-		list_for_each_entry(cell, &afs_cells, link) {
+		list_for_each_entry(cell, &net->cells, link) {
 			if (strncmp(cell->name, name, namesz) == 0) {
 				afs_get_cell(cell);
 				goto found;
@@ -289,7 +285,7 @@ struct afs_cell *afs_cell_lookup(const char *name, unsigned namesz,
 	found:
 		;
 	} else {
-		cell = afs_cell_root;
+		cell = net->ws_cell;
 		if (!cell) {
 			/* this should not happen unless user tries to mount
 			 * when root cell is not set. Return an impossibly
@@ -304,16 +300,16 @@ struct afs_cell *afs_cell_lookup(const char *name, unsigned namesz,
 
 	}
 
-	read_unlock(&afs_cells_lock);
-	up_read(&afs_cells_sem);
+	read_unlock(&net->cells_lock);
+	up_read(&net->cells_sem);
 	_leave(" = %p", cell);
 	return cell;
 
 create_cell:
-	read_unlock(&afs_cells_lock);
-	up_read(&afs_cells_sem);
+	read_unlock(&net->cells_lock);
+	up_read(&net->cells_sem);
 
-	cell = afs_cell_create(name, namesz, NULL, true);
+	cell = afs_cell_create(net, name, namesz, NULL, true);
 
 	_leave(" = %p", cell);
 	return cell;
@@ -325,14 +321,14 @@ create_cell:
  */
 struct afs_cell *afs_get_cell_maybe(struct afs_cell *cell)
 {
-	write_lock(&afs_cells_lock);
+	write_lock(&net->cells_lock);
 
 	if (cell && !list_empty(&cell->link))
 		afs_get_cell(cell);
 	else
 		cell = NULL;
 
-	write_unlock(&afs_cells_lock);
+	write_unlock(&net->cells_lock);
 	return cell;
 }
 #endif  /*  0  */
@@ -351,10 +347,10 @@ void afs_put_cell(struct afs_cell *cell)
 
 	/* to prevent a race, the decrement and the dequeue must be effectively
 	 * atomic */
-	write_lock(&afs_cells_lock);
+	write_lock(&cell->net->cells_lock);
 
 	if (likely(!atomic_dec_and_test(&cell->usage))) {
-		write_unlock(&afs_cells_lock);
+		write_unlock(&cell->net->cells_lock);
 		_leave("");
 		return;
 	}
@@ -362,19 +358,19 @@ void afs_put_cell(struct afs_cell *cell)
 	ASSERT(list_empty(&cell->servers));
 	ASSERT(list_empty(&cell->vl_list));
 
-	write_unlock(&afs_cells_lock);
+	wake_up(&cell->net->cells_freeable_wq);
 
-	wake_up(&afs_cells_freeable_wq);
+	write_unlock(&cell->net->cells_lock);
 
 	_leave(" [unused]");
 }
 
 /*
  * destroy a cell record
- * - must be called with the afs_cells_sem write-locked
+ * - must be called with the net->cells_sem write-locked
  * - cell->link should have been broken by the caller
  */
-static void afs_cell_destroy(struct afs_cell *cell)
+static void afs_cell_destroy(struct afs_net *net, struct afs_cell *cell)
 {
 	_enter("%p{%d,%s}", cell, atomic_read(&cell->usage), cell->name);
 
@@ -387,14 +383,14 @@ static void afs_cell_destroy(struct afs_cell *cell)
 
 		_debug("wait for cell %s", cell->name);
 		set_current_state(TASK_UNINTERRUPTIBLE);
-		add_wait_queue(&afs_cells_freeable_wq, &myself);
+		add_wait_queue(&net->cells_freeable_wq, &myself);
 
 		while (atomic_read(&cell->usage) > 0) {
 			schedule();
 			set_current_state(TASK_UNINTERRUPTIBLE);
 		}
 
-		remove_wait_queue(&afs_cells_freeable_wq, &myself);
+		remove_wait_queue(&net->cells_freeable_wq, &myself);
 		set_current_state(TASK_RUNNING);
 	}
 
@@ -403,11 +399,11 @@ static void afs_cell_destroy(struct afs_cell *cell)
 	ASSERT(list_empty(&cell->servers));
 	ASSERT(list_empty(&cell->vl_list));
 
-	afs_proc_cell_remove(cell);
+	afs_proc_cell_remove(net, cell);
 
-	down_write(&afs_proc_cells_sem);
+	down_write(&net->proc_cells_sem);
 	list_del_init(&cell->proc_link);
-	up_write(&afs_proc_cells_sem);
+	up_write(&net->proc_cells_sem);
 
 #ifdef CONFIG_AFS_FSCACHE
 	fscache_relinquish_cookie(cell->cache, 0);
@@ -422,39 +418,39 @@ static void afs_cell_destroy(struct afs_cell *cell)
  * purge in-memory cell database on module unload or afs_init() failure
  * - the timeout daemon is stopped before calling this
  */
-void afs_cell_purge(void)
+void afs_cell_purge(struct afs_net *net)
 {
 	struct afs_cell *cell;
 
 	_enter("");
 
-	afs_put_cell(afs_cell_root);
+	afs_put_cell(net->ws_cell);
 
-	down_write(&afs_cells_sem);
+	down_write(&net->cells_sem);
 
-	while (!list_empty(&afs_cells)) {
+	while (!list_empty(&net->cells)) {
 		cell = NULL;
 
 		/* remove the next cell from the front of the list */
-		write_lock(&afs_cells_lock);
+		write_lock(&net->cells_lock);
 
-		if (!list_empty(&afs_cells)) {
-			cell = list_entry(afs_cells.next,
+		if (!list_empty(&net->cells)) {
+			cell = list_entry(net->cells.next,
 					  struct afs_cell, link);
 			list_del_init(&cell->link);
 		}
 
-		write_unlock(&afs_cells_lock);
+		write_unlock(&net->cells_lock);
 
 		if (cell) {
 			_debug("PURGING CELL %s (%d)",
 			       cell->name, atomic_read(&cell->usage));
 
 			/* now the cell should be left with no references */
-			afs_cell_destroy(cell);
+			afs_cell_destroy(net, cell);
 		}
 	}
 
-	up_write(&afs_cells_sem);
+	up_write(&net->cells_sem);
 	_leave("");
 }
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index 782d4d05a53b..30ce4be4165f 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -193,7 +193,7 @@ static int afs_deliver_cb_callback(struct afs_call *call)
 
 	switch (call->unmarshall) {
 	case 0:
-		rxrpc_kernel_get_peer(afs_socket, call->rxcall, &srx);
+		rxrpc_kernel_get_peer(call->net->socket, call->rxcall, &srx);
 		call->offset = 0;
 		call->unmarshall++;
 
@@ -290,7 +290,7 @@ static int afs_deliver_cb_callback(struct afs_call *call)
 
 	/* we'll need the file server record as that tells us which set of
 	 * vnodes to operate upon */
-	server = afs_find_server(&srx);
+	server = afs_find_server(call->net, &srx);
 	if (!server)
 		return -ENOTCONN;
 	call->server = server;
@@ -324,7 +324,7 @@ static int afs_deliver_cb_init_call_back_state(struct afs_call *call)
 
 	_enter("");
 
-	rxrpc_kernel_get_peer(afs_socket, call->rxcall, &srx);
+	rxrpc_kernel_get_peer(call->net->socket, call->rxcall, &srx);
 
 	ret = afs_extract_data(call, NULL, 0, false);
 	if (ret < 0)
@@ -335,7 +335,7 @@ static int afs_deliver_cb_init_call_back_state(struct afs_call *call)
 
 	/* we'll need the file server record as that tells us which set of
 	 * vnodes to operate upon */
-	server = afs_find_server(&srx);
+	server = afs_find_server(call->net, &srx);
 	if (!server)
 		return -ENOTCONN;
 	call->server = server;
@@ -357,7 +357,7 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call)
 
 	_enter("");
 
-	rxrpc_kernel_get_peer(afs_socket, call->rxcall, &srx);
+	rxrpc_kernel_get_peer(call->net->socket, call->rxcall, &srx);
 
 	_enter("{%u}", call->unmarshall);
 
@@ -407,7 +407,7 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call)
 
 	/* we'll need the file server record as that tells us which set of
 	 * vnodes to operate upon */
-	server = afs_find_server(&srx);
+	server = afs_find_server(call->net, &srx);
 	if (!server)
 		return -ENOTCONN;
 	call->server = server;
@@ -461,7 +461,7 @@ static void SRXAFSCB_ProbeUuid(struct work_struct *work)
 
 	_enter("");
 
-	if (memcmp(r, &afs_uuid, sizeof(afs_uuid)) == 0)
+	if (memcmp(r, &call->net->uuid, sizeof(call->net->uuid)) == 0)
 		reply.match = htonl(0);
 	else
 		reply.match = htonl(1);
@@ -568,13 +568,13 @@ static void SRXAFSCB_TellMeAboutYourself(struct work_struct *work)
 	memset(&reply, 0, sizeof(reply));
 	reply.ia.nifs = htonl(nifs);
 
-	reply.ia.uuid[0] = afs_uuid.time_low;
-	reply.ia.uuid[1] = htonl(ntohs(afs_uuid.time_mid));
-	reply.ia.uuid[2] = htonl(ntohs(afs_uuid.time_hi_and_version));
-	reply.ia.uuid[3] = htonl((s8) afs_uuid.clock_seq_hi_and_reserved);
-	reply.ia.uuid[4] = htonl((s8) afs_uuid.clock_seq_low);
+	reply.ia.uuid[0] = call->net->uuid.time_low;
+	reply.ia.uuid[1] = htonl(ntohs(call->net->uuid.time_mid));
+	reply.ia.uuid[2] = htonl(ntohs(call->net->uuid.time_hi_and_version));
+	reply.ia.uuid[3] = htonl((s8) call->net->uuid.clock_seq_hi_and_reserved);
+	reply.ia.uuid[4] = htonl((s8) call->net->uuid.clock_seq_low);
 	for (loop = 0; loop < 6; loop++)
-		reply.ia.uuid[loop + 5] = htonl((s8) afs_uuid.node[loop]);
+		reply.ia.uuid[loop + 5] = htonl((s8) call->net->uuid.node[loop]);
 
 	if (ifs) {
 		for (loop = 0; loop < nifs; loop++) {
diff --git a/fs/afs/flock.c b/fs/afs/flock.c
index 3191dff2c156..559ac00af5f7 100644
--- a/fs/afs/flock.c
+++ b/fs/afs/flock.c
@@ -14,47 +14,16 @@
 #define AFS_LOCK_GRANTED	0
 #define AFS_LOCK_PENDING	1
 
+struct workqueue_struct *afs_lock_manager;
+
 static void afs_fl_copy_lock(struct file_lock *new, struct file_lock *fl);
 static void afs_fl_release_private(struct file_lock *fl);
 
-static struct workqueue_struct *afs_lock_manager;
-static DEFINE_MUTEX(afs_lock_manager_mutex);
-
 static const struct file_lock_operations afs_lock_ops = {
 	.fl_copy_lock		= afs_fl_copy_lock,
 	.fl_release_private	= afs_fl_release_private,
 };
 
-/*
- * initialise the lock manager thread if it isn't already running
- */
-static int afs_init_lock_manager(void)
-{
-	int ret;
-
-	ret = 0;
-	if (!afs_lock_manager) {
-		mutex_lock(&afs_lock_manager_mutex);
-		if (!afs_lock_manager) {
-			afs_lock_manager = alloc_workqueue("kafs_lockd",
-							   WQ_MEM_RECLAIM, 0);
-			if (!afs_lock_manager)
-				ret = -ENOMEM;
-		}
-		mutex_unlock(&afs_lock_manager_mutex);
-	}
-	return ret;
-}
-
-/*
- * destroy the lock manager thread if it's running
- */
-void __exit afs_kill_lock_manager(void)
-{
-	if (afs_lock_manager)
-		destroy_workqueue(afs_lock_manager);
-}
-
 /*
  * if the callback is broken on this vnode, then the lock may now be available
  */
@@ -264,10 +233,6 @@ static int afs_do_setlk(struct file *file, struct file_lock *fl)
 	if (fl->fl_start != 0 || fl->fl_end != OFFSET_MAX)
 		return -EINVAL;
 
-	ret = afs_init_lock_manager();
-	if (ret < 0)
-		return ret;
-
 	fl->fl_ops = &afs_lock_ops;
 	INIT_LIST_HEAD(&fl->fl_u.afs.link);
 	fl->fl_u.afs.state = AFS_LOCK_PENDING;
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index 19f76ae36982..ce6f0159e1d4 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -284,12 +284,13 @@ int afs_fs_fetch_file_status(struct afs_server *server,
 			     bool async)
 {
 	struct afs_call *call;
+	struct afs_net *net = afs_v2net(vnode);
 	__be32 *bp;
 
 	_enter(",%x,{%x:%u},,",
 	       key_serial(key), vnode->fid.vid, vnode->fid.vnode);
 
-	call = afs_alloc_flat_call(&afs_RXFSFetchStatus, 16, (21 + 3 + 6) * 4);
+	call = afs_alloc_flat_call(net, &afs_RXFSFetchStatus, 16, (21 + 3 + 6) * 4);
 	if (!call)
 		return -ENOMEM;
 
@@ -490,11 +491,12 @@ static int afs_fs_fetch_data64(struct afs_server *server,
 			       bool async)
 {
 	struct afs_call *call;
+	struct afs_net *net = afs_v2net(vnode);
 	__be32 *bp;
 
 	_enter("");
 
-	call = afs_alloc_flat_call(&afs_RXFSFetchData64, 32, (21 + 3 + 6) * 4);
+	call = afs_alloc_flat_call(net, &afs_RXFSFetchData64, 32, (21 + 3 + 6) * 4);
 	if (!call)
 		return -ENOMEM;
 
@@ -531,6 +533,7 @@ int afs_fs_fetch_data(struct afs_server *server,
 		      bool async)
 {
 	struct afs_call *call;
+	struct afs_net *net = afs_v2net(vnode);
 	__be32 *bp;
 
 	if (upper_32_bits(req->pos) ||
@@ -540,7 +543,7 @@ int afs_fs_fetch_data(struct afs_server *server,
 
 	_enter("");
 
-	call = afs_alloc_flat_call(&afs_RXFSFetchData, 24, (21 + 3 + 6) * 4);
+	call = afs_alloc_flat_call(net, &afs_RXFSFetchData, 24, (21 + 3 + 6) * 4);
 	if (!call)
 		return -ENOMEM;
 
@@ -590,7 +593,8 @@ static const struct afs_call_type afs_RXFSGiveUpCallBacks = {
  * give up a set of callbacks
  * - the callbacks are held in the server->cb_break ring
  */
-int afs_fs_give_up_callbacks(struct afs_server *server,
+int afs_fs_give_up_callbacks(struct afs_net *net,
+			     struct afs_server *server,
 			     bool async)
 {
 	struct afs_call *call;
@@ -610,7 +614,7 @@ int afs_fs_give_up_callbacks(struct afs_server *server,
 
 	_debug("break %zu callbacks", ncallbacks);
 
-	call = afs_alloc_flat_call(&afs_RXFSGiveUpCallBacks,
+	call = afs_alloc_flat_call(net, &afs_RXFSGiveUpCallBacks,
 				   12 + ncallbacks * 6 * 4, 0);
 	if (!call)
 		return -ENOMEM;
@@ -699,6 +703,7 @@ int afs_fs_create(struct afs_server *server,
 		  bool async)
 {
 	struct afs_call *call;
+	struct afs_net *net = afs_v2net(vnode);
 	size_t namesz, reqsz, padsz;
 	__be32 *bp;
 
@@ -708,7 +713,7 @@ int afs_fs_create(struct afs_server *server,
 	padsz = (4 - (namesz & 3)) & 3;
 	reqsz = (5 * 4) + namesz + padsz + (6 * 4);
 
-	call = afs_alloc_flat_call(&afs_RXFSCreateXXXX, reqsz,
+	call = afs_alloc_flat_call(net, &afs_RXFSCreateXXXX, reqsz,
 				   (3 + 21 + 21 + 3 + 6) * 4);
 	if (!call)
 		return -ENOMEM;
@@ -789,6 +794,7 @@ int afs_fs_remove(struct afs_server *server,
 		  bool async)
 {
 	struct afs_call *call;
+	struct afs_net *net = afs_v2net(vnode);
 	size_t namesz, reqsz, padsz;
 	__be32 *bp;
 
@@ -798,7 +804,7 @@ int afs_fs_remove(struct afs_server *server,
 	padsz = (4 - (namesz & 3)) & 3;
 	reqsz = (5 * 4) + namesz + padsz;
 
-	call = afs_alloc_flat_call(&afs_RXFSRemoveXXXX, reqsz, (21 + 6) * 4);
+	call = afs_alloc_flat_call(net, &afs_RXFSRemoveXXXX, reqsz, (21 + 6) * 4);
 	if (!call)
 		return -ENOMEM;
 
@@ -870,6 +876,7 @@ int afs_fs_link(struct afs_server *server,
 		bool async)
 {
 	struct afs_call *call;
+	struct afs_net *net = afs_v2net(vnode);
 	size_t namesz, reqsz, padsz;
 	__be32 *bp;
 
@@ -879,7 +886,7 @@ int afs_fs_link(struct afs_server *server,
 	padsz = (4 - (namesz & 3)) & 3;
 	reqsz = (5 * 4) + namesz + padsz + (3 * 4);
 
-	call = afs_alloc_flat_call(&afs_RXFSLink, reqsz, (21 + 21 + 6) * 4);
+	call = afs_alloc_flat_call(net, &afs_RXFSLink, reqsz, (21 + 21 + 6) * 4);
 	if (!call)
 		return -ENOMEM;
 
@@ -958,6 +965,7 @@ int afs_fs_symlink(struct afs_server *server,
 		   bool async)
 {
 	struct afs_call *call;
+	struct afs_net *net = afs_v2net(vnode);
 	size_t namesz, reqsz, padsz, c_namesz, c_padsz;
 	__be32 *bp;
 
@@ -971,7 +979,7 @@ int afs_fs_symlink(struct afs_server *server,
 
 	reqsz = (6 * 4) + namesz + padsz + c_namesz + c_padsz + (6 * 4);
 
-	call = afs_alloc_flat_call(&afs_RXFSSymlink, reqsz,
+	call = afs_alloc_flat_call(net, &afs_RXFSSymlink, reqsz,
 				   (3 + 21 + 21 + 6) * 4);
 	if (!call)
 		return -ENOMEM;
@@ -1062,6 +1070,7 @@ int afs_fs_rename(struct afs_server *server,
 		  bool async)
 {
 	struct afs_call *call;
+	struct afs_net *net = afs_v2net(orig_dvnode);
 	size_t reqsz, o_namesz, o_padsz, n_namesz, n_padsz;
 	__be32 *bp;
 
@@ -1078,7 +1087,7 @@ int afs_fs_rename(struct afs_server *server,
 		(3 * 4) +
 		4 + n_namesz + n_padsz;
 
-	call = afs_alloc_flat_call(&afs_RXFSRename, reqsz, (21 + 21 + 6) * 4);
+	call = afs_alloc_flat_call(net, &afs_RXFSRename, reqsz, (21 + 21 + 6) * 4);
 	if (!call)
 		return -ENOMEM;
 
@@ -1172,12 +1181,13 @@ static int afs_fs_store_data64(struct afs_server *server,
 {
 	struct afs_vnode *vnode = wb->vnode;
 	struct afs_call *call;
+	struct afs_net *net = afs_v2net(vnode);
 	__be32 *bp;
 
 	_enter(",%x,{%x:%u},,",
 	       key_serial(wb->key), vnode->fid.vid, vnode->fid.vnode);
 
-	call = afs_alloc_flat_call(&afs_RXFSStoreData64,
+	call = afs_alloc_flat_call(net, &afs_RXFSStoreData64,
 				   (4 + 6 + 3 * 2) * 4,
 				   (21 + 6) * 4);
 	if (!call)
@@ -1230,6 +1240,7 @@ int afs_fs_store_data(struct afs_server *server, struct afs_writeback *wb,
 {
 	struct afs_vnode *vnode = wb->vnode;
 	struct afs_call *call;
+	struct afs_net *net = afs_v2net(vnode);
 	loff_t size, pos, i_size;
 	__be32 *bp;
 
@@ -1254,7 +1265,7 @@ int afs_fs_store_data(struct afs_server *server, struct afs_writeback *wb,
 		return afs_fs_store_data64(server, wb, first, last, offset, to,
 					   size, pos, i_size, async);
 
-	call = afs_alloc_flat_call(&afs_RXFSStoreData,
+	call = afs_alloc_flat_call(net, &afs_RXFSStoreData,
 				   (4 + 6 + 3) * 4,
 				   (21 + 6) * 4);
 	if (!call)
@@ -1356,6 +1367,7 @@ static int afs_fs_setattr_size64(struct afs_server *server, struct key *key,
 				 bool async)
 {
 	struct afs_call *call;
+	struct afs_net *net = afs_v2net(vnode);
 	__be32 *bp;
 
 	_enter(",%x,{%x:%u},,",
@@ -1363,7 +1375,7 @@ static int afs_fs_setattr_size64(struct afs_server *server, struct key *key,
 
 	ASSERT(attr->ia_valid & ATTR_SIZE);
 
-	call = afs_alloc_flat_call(&afs_RXFSStoreData64_as_Status,
+	call = afs_alloc_flat_call(net, &afs_RXFSStoreData64_as_Status,
 				   (4 + 6 + 3 * 2) * 4,
 				   (21 + 6) * 4);
 	if (!call)
@@ -1404,6 +1416,7 @@ static int afs_fs_setattr_size(struct afs_server *server, struct key *key,
 			       bool async)
 {
 	struct afs_call *call;
+	struct afs_net *net = afs_v2net(vnode);
 	__be32 *bp;
 
 	_enter(",%x,{%x:%u},,",
@@ -1414,7 +1427,7 @@ static int afs_fs_setattr_size(struct afs_server *server, struct key *key,
 		return afs_fs_setattr_size64(server, key, vnode, attr,
 					     async);
 
-	call = afs_alloc_flat_call(&afs_RXFSStoreData_as_Status,
+	call = afs_alloc_flat_call(net, &afs_RXFSStoreData_as_Status,
 				   (4 + 6 + 3) * 4,
 				   (21 + 6) * 4);
 	if (!call)
@@ -1452,6 +1465,7 @@ int afs_fs_setattr(struct afs_server *server, struct key *key,
 		   bool async)
 {
 	struct afs_call *call;
+	struct afs_net *net = afs_v2net(vnode);
 	__be32 *bp;
 
 	if (attr->ia_valid & ATTR_SIZE)
@@ -1461,7 +1475,7 @@ int afs_fs_setattr(struct afs_server *server, struct key *key,
 	_enter(",%x,{%x:%u},,",
 	       key_serial(key), vnode->fid.vid, vnode->fid.vnode);
 
-	call = afs_alloc_flat_call(&afs_RXFSStoreStatus,
+	call = afs_alloc_flat_call(net, &afs_RXFSStoreStatus,
 				   (4 + 6) * 4,
 				   (21 + 6) * 4);
 	if (!call)
@@ -1687,6 +1701,7 @@ int afs_fs_get_volume_status(struct afs_server *server,
 			     bool async)
 {
 	struct afs_call *call;
+	struct afs_net *net = afs_v2net(vnode);
 	__be32 *bp;
 	void *tmpbuf;
 
@@ -1696,7 +1711,7 @@ int afs_fs_get_volume_status(struct afs_server *server,
 	if (!tmpbuf)
 		return -ENOMEM;
 
-	call = afs_alloc_flat_call(&afs_RXFSGetVolumeStatus, 2 * 4, 12 * 4);
+	call = afs_alloc_flat_call(net, &afs_RXFSGetVolumeStatus, 2 * 4, 12 * 4);
 	if (!call) {
 		kfree(tmpbuf);
 		return -ENOMEM;
@@ -1779,11 +1794,12 @@ int afs_fs_set_lock(struct afs_server *server,
 		    bool async)
 {
 	struct afs_call *call;
+	struct afs_net *net = afs_v2net(vnode);
 	__be32 *bp;
 
 	_enter("");
 
-	call = afs_alloc_flat_call(&afs_RXFSSetLock, 5 * 4, 6 * 4);
+	call = afs_alloc_flat_call(net, &afs_RXFSSetLock, 5 * 4, 6 * 4);
 	if (!call)
 		return -ENOMEM;
 
@@ -1812,11 +1828,12 @@ int afs_fs_extend_lock(struct afs_server *server,
 		       bool async)
 {
 	struct afs_call *call;
+	struct afs_net *net = afs_v2net(vnode);
 	__be32 *bp;
 
 	_enter("");
 
-	call = afs_alloc_flat_call(&afs_RXFSExtendLock, 4 * 4, 6 * 4);
+	call = afs_alloc_flat_call(net, &afs_RXFSExtendLock, 4 * 4, 6 * 4);
 	if (!call)
 		return -ENOMEM;
 
@@ -1844,11 +1861,12 @@ int afs_fs_release_lock(struct afs_server *server,
 			bool async)
 {
 	struct afs_call *call;
+	struct afs_net *net = afs_v2net(vnode);
 	__be32 *bp;
 
 	_enter("");
 
-	call = afs_alloc_flat_call(&afs_RXFSReleaseLock, 4 * 4, 6 * 4);
+	call = afs_alloc_flat_call(net, &afs_RXFSReleaseLock, 4 * 4, 6 * 4);
 	if (!call)
 		return -ENOMEM;
 
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 3f03f7888302..53bd11d73469 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -21,6 +21,7 @@
 #include <linux/fscache.h>
 #include <linux/backing-dev.h>
 #include <linux/uuid.h>
+#include <net/net_namespace.h>
 #include <net/af_rxrpc.h>
 
 #include "afs.h"
@@ -48,6 +49,7 @@ struct afs_mount_params {
 	afs_voltype_t		type;		/* type of volume requested */
 	int			volnamesz;	/* size of volume name */
 	const char		*volname;	/* name of volume to mount */
+	struct afs_net		*net;		/* Network namespace in effect */
 	struct afs_cell		*cell;		/* cell in which to find volume */
 	struct afs_volume	*volume;	/* volume record */
 	struct key		*key;		/* key to use for secure mounting */
@@ -62,6 +64,7 @@ enum afs_call_state {
 	AFS_CALL_AWAIT_ACK,	/* awaiting final ACK of incoming call */
 	AFS_CALL_COMPLETE,	/* Completed or failed */
 };
+
 /*
  * a record of an in-progress RxRPC call
  */
@@ -72,6 +75,7 @@ struct afs_call {
 	struct work_struct	work;		/* actual work processor */
 	struct rxrpc_call	*rxcall;	/* RxRPC call handle */
 	struct key		*key;		/* security for this call */
+	struct afs_net		*net;		/* The network namespace */
 	struct afs_server	*server;	/* server affected by incoming CM call */
 	void			*request;	/* request data (first part) */
 	struct address_space	*mapping;	/* page set */
@@ -173,6 +177,7 @@ struct afs_writeback {
  * - there's one superblock per volume
  */
 struct afs_super_info {
+	struct afs_net		*net;		/* Network namespace */
 	struct afs_volume	*volume;	/* volume record */
 	char			rwparent;	/* T if parent is R/W AFS volume */
 };
@@ -192,12 +197,62 @@ struct afs_cache_cell {
 	struct in_addr	vl_servers[15];		/* cached cell VL servers */
 };
 
+/*
+ * AFS network namespace record.
+ */
+struct afs_net {
+	struct afs_uuid		uuid;
+	bool			live;		/* F if this namespace is being removed */
+
+	/* AF_RXRPC I/O stuff */
+	struct socket		*socket;
+	struct afs_call		*spare_incoming_call;
+	struct work_struct	charge_preallocation_work;
+	struct mutex		socket_mutex;
+	atomic_t		nr_outstanding_calls;
+	atomic_t		nr_superblocks;
+
+	/* Cell database */
+	struct list_head	cells;
+	struct afs_cell		*ws_cell;
+	rwlock_t		cells_lock;
+	struct rw_semaphore	cells_sem;
+	wait_queue_head_t	cells_freeable_wq;
+
+	struct rw_semaphore	proc_cells_sem;
+	struct list_head	proc_cells;
+
+	/* Volume location database */
+	struct list_head	vl_updates;		/* VL records in need-update order */
+	struct list_head	vl_graveyard;		/* Inactive VL records */
+	struct delayed_work	vl_reaper;
+	struct delayed_work	vl_updater;
+	spinlock_t		vl_updates_lock;
+	spinlock_t		vl_graveyard_lock;
+
+	/* File locking renewal management */
+	struct mutex		lock_manager_mutex;
+
+	/* Server database */
+	struct rb_root		servers;		/* Active servers */
+	rwlock_t		servers_lock;
+	struct list_head	server_graveyard;	/* Inactive server LRU list */
+	spinlock_t		server_graveyard_lock;
+	struct delayed_work	server_reaper;
+
+	/* Misc */
+	struct proc_dir_entry	*proc_afs;		/* /proc/net/afs directory */
+};
+
+extern struct afs_net __afs_net;// Dummy AFS network namespace; TODO: replace with real netns
+
 /*
  * AFS cell record
  */
 struct afs_cell {
 	atomic_t		usage;
 	struct list_head	link;		/* main cell list link */
+	struct afs_net		*net;		/* The network namespace */
 	struct key		*anonymous_key;	/* anonymous user key for this cell */
 	struct list_head	proc_link;	/* /proc cell list link */
 #ifdef CONFIG_AFS_FSCACHE
@@ -411,15 +466,6 @@ struct afs_interface {
 	unsigned	mtu;		/* MTU of interface */
 };
 
-struct afs_uuid {
-	__be32		time_low;			/* low part of timestamp */
-	__be16		time_mid;			/* mid part of timestamp */
-	__be16		time_hi_and_version;		/* high part of timestamp and version  */
-	__u8		clock_seq_hi_and_reserved;	/* clock seq hi and variant */
-	__u8		clock_seq_low;			/* clock seq low */
-	__u8		node[6];			/* spatially unique node ID (MAC addr) */
-};
-
 /*****************************************************************************/
 /*
  * cache.c
@@ -440,6 +486,8 @@ extern struct fscache_cookie_def afs_vnode_cache_index_def;
 /*
  * callback.c
  */
+extern struct workqueue_struct *afs_callback_update_worker;
+
 extern void afs_init_callback_state(struct afs_server *);
 extern void afs_broken_callback_work(struct work_struct *);
 extern void afs_break_callbacks(struct afs_server *, size_t,
@@ -448,22 +496,17 @@ extern void afs_discard_callback_on_delete(struct afs_vnode *);
 extern void afs_give_up_callback(struct afs_vnode *);
 extern void afs_dispatch_give_up_callbacks(struct work_struct *);
 extern void afs_flush_callback_breaks(struct afs_server *);
-extern int __init afs_callback_update_init(void);
-extern void afs_callback_update_kill(void);
 
 /*
  * cell.c
  */
-extern struct rw_semaphore afs_proc_cells_sem;
-extern struct list_head afs_proc_cells;
-
 #define afs_get_cell(C) do { atomic_inc(&(C)->usage); } while(0)
-extern int afs_cell_init(char *);
-extern struct afs_cell *afs_cell_create(const char *, unsigned, char *, bool);
-extern struct afs_cell *afs_cell_lookup(const char *, unsigned, bool);
+extern int afs_cell_init(struct afs_net *, char *);
+extern struct afs_cell *afs_cell_create(struct afs_net *, const char *, unsigned, char *, bool);
+extern struct afs_cell *afs_cell_lookup(struct afs_net *, const char *, unsigned, bool);
 extern struct afs_cell *afs_grab_cell(struct afs_cell *);
 extern void afs_put_cell(struct afs_cell *);
-extern void afs_cell_purge(void);
+extern void __net_exit afs_cell_purge(struct afs_net *);
 
 /*
  * cmservice.c
@@ -492,7 +535,8 @@ extern void afs_put_read(struct afs_read *);
 /*
  * flock.c
  */
-extern void __exit afs_kill_lock_manager(void);
+extern struct workqueue_struct *afs_lock_manager;
+
 extern void afs_lock_work(struct work_struct *);
 extern void afs_lock_may_be_available(struct afs_vnode *);
 extern int afs_lock(struct file *, int, struct file_lock *);
@@ -504,7 +548,7 @@ extern int afs_flock(struct file *, int, struct file_lock *);
 extern int afs_fs_fetch_file_status(struct afs_server *, struct key *,
 				    struct afs_vnode *, struct afs_volsync *,
 				    bool);
-extern int afs_fs_give_up_callbacks(struct afs_server *, bool);
+extern int afs_fs_give_up_callbacks(struct afs_net *, struct afs_server *, bool);
 extern int afs_fs_fetch_data(struct afs_server *, struct key *,
 			     struct afs_vnode *, struct afs_read *, bool);
 extern int afs_fs_create(struct afs_server *, struct key *,
@@ -554,7 +598,35 @@ extern int afs_drop_inode(struct inode *);
  * main.c
  */
 extern struct workqueue_struct *afs_wq;
-extern struct afs_uuid afs_uuid;
+
+static inline struct afs_net *afs_d2net(struct dentry *dentry)
+{
+	return &__afs_net;
+}
+
+static inline struct afs_net *afs_i2net(struct inode *inode)
+{
+	return &__afs_net;
+}
+
+static inline struct afs_net *afs_v2net(struct afs_vnode *vnode)
+{
+	return &__afs_net;
+}
+
+static inline struct afs_net *afs_sock2net(struct sock *sk)
+{
+	return &__afs_net;
+}
+
+static inline struct afs_net *afs_get_net(struct afs_net *net)
+{
+	return net;
+}
+
+static inline void afs_put_net(struct afs_net *net)
+{
+}
 
 /*
  * misc.c
@@ -579,23 +651,24 @@ extern int afs_get_ipv4_interfaces(struct afs_interface *, size_t, bool);
 /*
  * proc.c
  */
-extern int afs_proc_init(void);
-extern void afs_proc_cleanup(void);
-extern int afs_proc_cell_setup(struct afs_cell *);
-extern void afs_proc_cell_remove(struct afs_cell *);
+extern int __net_init afs_proc_init(struct afs_net *);
+extern void __net_exit afs_proc_cleanup(struct afs_net *);
+extern int afs_proc_cell_setup(struct afs_net *, struct afs_cell *);
+extern void afs_proc_cell_remove(struct afs_net *, struct afs_cell *);
 
 /*
  * rxrpc.c
  */
-extern struct socket *afs_socket;
-extern atomic_t afs_outstanding_calls;
+extern struct workqueue_struct *afs_async_calls;
 
-extern int afs_open_socket(void);
-extern void afs_close_socket(void);
+extern int __net_init afs_open_socket(struct afs_net *);
+extern void __net_exit afs_close_socket(struct afs_net *);
+extern void afs_charge_preallocation(struct work_struct *);
 extern void afs_put_call(struct afs_call *);
 extern int afs_queue_call_work(struct afs_call *);
 extern int afs_make_call(struct in_addr *, struct afs_call *, gfp_t, bool);
-extern struct afs_call *afs_alloc_flat_call(const struct afs_call_type *,
+extern struct afs_call *afs_alloc_flat_call(struct afs_net *,
+					    const struct afs_call_type *,
 					    size_t, size_t);
 extern void afs_flat_call_destructor(struct afs_call *);
 extern void afs_send_empty_reply(struct afs_call *);
@@ -629,37 +702,45 @@ do {								\
 
 extern struct afs_server *afs_lookup_server(struct afs_cell *,
 					    const struct in_addr *);
-extern struct afs_server *afs_find_server(const struct sockaddr_rxrpc *);
+extern struct afs_server *afs_find_server(struct afs_net *,
+					  const struct sockaddr_rxrpc *);
 extern void afs_put_server(struct afs_server *);
-extern void __exit afs_purge_servers(void);
+extern void afs_reap_server(struct work_struct *);
+extern void __net_exit afs_purge_servers(struct afs_net *);
 
 /*
  * super.c
  */
-extern int afs_fs_init(void);
-extern void afs_fs_exit(void);
+extern int __init afs_fs_init(void);
+extern void __exit afs_fs_exit(void);
 
 /*
  * vlclient.c
  */
-extern int afs_vl_get_entry_by_name(struct in_addr *, struct key *,
+extern int afs_vl_get_entry_by_name(struct afs_net *,
+				    struct in_addr *, struct key *,
 				    const char *, struct afs_cache_vlocation *,
 				    bool);
-extern int afs_vl_get_entry_by_id(struct in_addr *, struct key *,
+extern int afs_vl_get_entry_by_id(struct afs_net *,
+				  struct in_addr *, struct key *,
 				  afs_volid_t, afs_voltype_t,
 				  struct afs_cache_vlocation *, bool);
 
 /*
  * vlocation.c
  */
+extern struct workqueue_struct *afs_vlocation_update_worker;
+
 #define afs_get_vlocation(V) do { atomic_inc(&(V)->usage); } while(0)
 
-extern int __init afs_vlocation_update_init(void);
-extern struct afs_vlocation *afs_vlocation_lookup(struct afs_cell *,
+extern struct afs_vlocation *afs_vlocation_lookup(struct afs_net *,
+						  struct afs_cell *,
 						  struct key *,
 						  const char *, size_t);
-extern void afs_put_vlocation(struct afs_vlocation *);
-extern void afs_vlocation_purge(void);
+extern void afs_put_vlocation(struct afs_net *, struct afs_vlocation *);
+extern void afs_vlocation_updater(struct work_struct *);
+extern void afs_vlocation_reaper(struct work_struct *);
+extern void __net_exit afs_vlocation_purge(struct afs_net *);
 
 /*
  * vnode.c
@@ -707,7 +788,7 @@ extern int afs_vnode_release_lock(struct afs_vnode *, struct key *);
  */
 #define afs_get_volume(V) do { atomic_inc(&(V)->usage); } while(0)
 
-extern void afs_put_volume(struct afs_volume *);
+extern void afs_put_volume(struct afs_net *, struct afs_volume *);
 extern struct afs_volume *afs_volume_lookup(struct afs_mount_params *);
 extern struct afs_server *afs_volume_pick_fileserver(struct afs_vnode *);
 extern int afs_volume_release_fileserver(struct afs_vnode *,
diff --git a/fs/afs/main.c b/fs/afs/main.c
index 9944770849da..87b1a9c8000d 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -31,30 +31,104 @@ static char *rootcell;
 module_param(rootcell, charp, 0);
 MODULE_PARM_DESC(rootcell, "root AFS cell name and VL server IP addr list");
 
-struct afs_uuid afs_uuid;
 struct workqueue_struct *afs_wq;
+struct afs_net __afs_net;
+
+/*
+ * Initialise an AFS network namespace record.
+ */
+static int __net_init afs_net_init(struct afs_net *net)
+{
+	int ret;
+
+	net->live = true;
+	generate_random_uuid((unsigned char *)&net->uuid);
+
+	INIT_WORK(&net->charge_preallocation_work, afs_charge_preallocation);
+	mutex_init(&net->socket_mutex);
+	INIT_LIST_HEAD(&net->cells);
+	rwlock_init(&net->cells_lock);
+	init_rwsem(&net->cells_sem);
+	init_waitqueue_head(&net->cells_freeable_wq);
+	init_rwsem(&net->proc_cells_sem);
+	INIT_LIST_HEAD(&net->proc_cells);
+	INIT_LIST_HEAD(&net->vl_updates);
+	INIT_LIST_HEAD(&net->vl_graveyard);
+	INIT_DELAYED_WORK(&net->vl_reaper, afs_vlocation_reaper);
+	INIT_DELAYED_WORK(&net->vl_updater, afs_vlocation_updater);
+	spin_lock_init(&net->vl_updates_lock);
+	spin_lock_init(&net->vl_graveyard_lock);
+	net->servers = RB_ROOT;
+	rwlock_init(&net->servers_lock);
+	INIT_LIST_HEAD(&net->server_graveyard);
+	spin_lock_init(&net->server_graveyard_lock);
+	INIT_DELAYED_WORK(&net->server_reaper, afs_reap_server);
+
+	/* Register the /proc stuff */
+	ret = afs_proc_init(net);
+	if (ret < 0)
+		goto error_proc;
+
+	/* Initialise the cell DB */
+	ret = afs_cell_init(net, rootcell);
+	if (ret < 0)
+		goto error_cell_init;
+
+	/* Create the RxRPC transport */
+	ret = afs_open_socket(net);
+	if (ret < 0)
+		goto error_open_socket;
+
+	return 0;
+
+error_open_socket:
+	afs_vlocation_purge(net);
+	afs_cell_purge(net);
+error_cell_init:
+	afs_proc_cleanup(net);
+error_proc:
+	return ret;
+}
+
+/*
+ * Clean up and destroy an AFS network namespace record.
+ */
+static void __net_exit afs_net_exit(struct afs_net *net)
+{
+	net->live = false;
+	afs_close_socket(net);
+	afs_purge_servers(net);
+	afs_vlocation_purge(net);
+	afs_cell_purge(net);
+	afs_proc_cleanup(net);
+}
 
 /*
  * initialise the AFS client FS module
  */
 static int __init afs_init(void)
 {
-	int ret;
+	int ret = -ENOMEM;
 
 	printk(KERN_INFO "kAFS: Red Hat AFS client v0.1 registering.\n");
 
-	generate_random_uuid((unsigned char *)&afs_uuid);
-
-	/* create workqueue */
-	ret = -ENOMEM;
 	afs_wq = alloc_workqueue("afs", 0, 0);
 	if (!afs_wq)
-		return ret;
-
-	/* register the /proc stuff */
-	ret = afs_proc_init();
-	if (ret < 0)
-		goto error_proc;
+		goto error_afs_wq;
+	afs_async_calls = alloc_workqueue("kafsd", WQ_MEM_RECLAIM, 0);
+	if (!afs_async_calls)
+		goto error_async;
+	afs_vlocation_update_worker =
+		alloc_workqueue("kafs_vlupdated", WQ_MEM_RECLAIM, 0);
+	if (!afs_vlocation_update_worker)
+		goto error_vl_up;
+	afs_callback_update_worker =
+		alloc_ordered_workqueue("kafs_callbackd", WQ_MEM_RECLAIM);
+	if (!afs_callback_update_worker)
+		goto error_callback;
+	afs_lock_manager = alloc_workqueue("kafs_lockd", WQ_MEM_RECLAIM, 0);
+	if (!afs_lock_manager)
+		goto error_lockmgr;
 
 #ifdef CONFIG_AFS_FSCACHE
 	/* we want to be able to cache */
@@ -63,25 +137,9 @@ static int __init afs_init(void)
 		goto error_cache;
 #endif
 
-	/* initialise the cell DB */
-	ret = afs_cell_init(rootcell);
-	if (ret < 0)
-		goto error_cell_init;
-
-	/* initialise the VL update process */
-	ret = afs_vlocation_update_init();
-	if (ret < 0)
-		goto error_vl_update_init;
-
-	/* initialise the callback update process */
-	ret = afs_callback_update_init();
+	ret = afs_net_init(&__afs_net);
 	if (ret < 0)
-		goto error_callback_update_init;
-
-	/* create the RxRPC transport */
-	ret = afs_open_socket();
-	if (ret < 0)
-		goto error_open_socket;
+		goto error_net;
 
 	/* register the filesystems */
 	ret = afs_fs_init();
@@ -91,21 +149,22 @@ static int __init afs_init(void)
 	return ret;
 
 error_fs:
-	afs_close_socket();
-error_open_socket:
-	afs_callback_update_kill();
-error_callback_update_init:
-	afs_vlocation_purge();
-error_vl_update_init:
-	afs_cell_purge();
-error_cell_init:
+	afs_net_exit(&__afs_net);
+error_net:
 #ifdef CONFIG_AFS_FSCACHE
 	fscache_unregister_netfs(&afs_cache_netfs);
 error_cache:
 #endif
-	afs_proc_cleanup();
-error_proc:
+	destroy_workqueue(afs_lock_manager);
+error_lockmgr:
+	destroy_workqueue(afs_callback_update_worker);
+error_callback:
+	destroy_workqueue(afs_vlocation_update_worker);
+error_vl_up:
+	destroy_workqueue(afs_async_calls);
+error_async:
 	destroy_workqueue(afs_wq);
+error_afs_wq:
 	rcu_barrier();
 	printk(KERN_ERR "kAFS: failed to register: %d\n", ret);
 	return ret;
@@ -124,17 +183,15 @@ static void __exit afs_exit(void)
 	printk(KERN_INFO "kAFS: Red Hat AFS client v0.1 unregistering.\n");
 
 	afs_fs_exit();
-	afs_kill_lock_manager();
-	afs_close_socket();
-	afs_purge_servers();
-	afs_callback_update_kill();
-	afs_vlocation_purge();
-	destroy_workqueue(afs_wq);
-	afs_cell_purge();
+	afs_net_exit(&__afs_net);
 #ifdef CONFIG_AFS_FSCACHE
 	fscache_unregister_netfs(&afs_cache_netfs);
 #endif
-	afs_proc_cleanup();
+	destroy_workqueue(afs_lock_manager);
+	destroy_workqueue(afs_callback_update_worker);
+	destroy_workqueue(afs_vlocation_update_worker);
+	destroy_workqueue(afs_async_calls);
+	destroy_workqueue(afs_wq);
 	rcu_barrier();
 }
 
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index 35efb9a31dd7..c93433460348 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -17,8 +17,15 @@
 #include <linux/uaccess.h>
 #include "internal.h"
 
-static struct proc_dir_entry *proc_afs;
+static inline struct afs_net *afs_proc2net(struct file *f)
+{
+	return &__afs_net;
+}
 
+static inline struct afs_net *afs_seq2net(struct seq_file *m)
+{
+	return &__afs_net; // TODO: use seq_file_net(m)
+}
 
 static int afs_proc_cells_open(struct inode *inode, struct file *file);
 static void *afs_proc_cells_start(struct seq_file *p, loff_t *pos);
@@ -122,23 +129,23 @@ static const struct file_operations afs_proc_cell_servers_fops = {
 /*
  * initialise the /proc/fs/afs/ directory
  */
-int afs_proc_init(void)
+int afs_proc_init(struct afs_net *net)
 {
 	_enter("");
 
-	proc_afs = proc_mkdir("fs/afs", NULL);
-	if (!proc_afs)
+	net->proc_afs = proc_mkdir("fs/afs", NULL);
+	if (!net->proc_afs)
 		goto error_dir;
 
-	if (!proc_create("cells", 0644, proc_afs, &afs_proc_cells_fops) ||
-	    !proc_create("rootcell", 0644, proc_afs, &afs_proc_rootcell_fops))
+	if (!proc_create("cells", 0644, net->proc_afs, &afs_proc_cells_fops) ||
+	    !proc_create("rootcell", 0644, net->proc_afs, &afs_proc_rootcell_fops))
 		goto error_tree;
 
 	_leave(" = 0");
 	return 0;
 
 error_tree:
-	remove_proc_subtree("fs/afs", NULL);
+	proc_remove(net->proc_afs);
 error_dir:
 	_leave(" = -ENOMEM");
 	return -ENOMEM;
@@ -147,9 +154,10 @@ error_dir:
 /*
  * clean up the /proc/fs/afs/ directory
  */
-void afs_proc_cleanup(void)
+void afs_proc_cleanup(struct afs_net *net)
 {
-	remove_proc_subtree("fs/afs", NULL);
+	proc_remove(net->proc_afs);
+	net->proc_afs = NULL;
 }
 
 /*
@@ -176,25 +184,30 @@ static int afs_proc_cells_open(struct inode *inode, struct file *file)
  */
 static void *afs_proc_cells_start(struct seq_file *m, loff_t *_pos)
 {
-	/* lock the list against modification */
-	down_read(&afs_proc_cells_sem);
-	return seq_list_start_head(&afs_proc_cells, *_pos);
+	struct afs_net *net = afs_seq2net(m);
+
+	down_read(&net->proc_cells_sem);
+	return seq_list_start_head(&net->proc_cells, *_pos);
 }
 
 /*
  * move to next cell in cells list
  */
-static void *afs_proc_cells_next(struct seq_file *p, void *v, loff_t *pos)
+static void *afs_proc_cells_next(struct seq_file *m, void *v, loff_t *pos)
 {
-	return seq_list_next(v, &afs_proc_cells, pos);
+	struct afs_net *net = afs_seq2net(m);
+
+	return seq_list_next(v, &net->proc_cells, pos);
 }
 
 /*
  * clean up after reading from the cells list
  */
-static void afs_proc_cells_stop(struct seq_file *p, void *v)
+static void afs_proc_cells_stop(struct seq_file *m, void *v)
 {
-	up_read(&afs_proc_cells_sem);
+	struct afs_net *net = afs_seq2net(m);
+
+	up_read(&net->proc_cells_sem);
 }
 
 /*
@@ -203,8 +216,9 @@ static void afs_proc_cells_stop(struct seq_file *p, void *v)
 static int afs_proc_cells_show(struct seq_file *m, void *v)
 {
 	struct afs_cell *cell = list_entry(v, struct afs_cell, proc_link);
+	struct afs_net *net = afs_seq2net(m);
 
-	if (v == &afs_proc_cells) {
+	if (v == &net->proc_cells) {
 		/* display header on line 1 */
 		seq_puts(m, "USE NAME\n");
 		return 0;
@@ -223,6 +237,7 @@ static int afs_proc_cells_show(struct seq_file *m, void *v)
 static ssize_t afs_proc_cells_write(struct file *file, const char __user *buf,
 				    size_t size, loff_t *_pos)
 {
+	struct afs_net *net = afs_proc2net(file);
 	char *kbuf, *name, *args;
 	int ret;
 
@@ -264,7 +279,7 @@ static ssize_t afs_proc_cells_write(struct file *file, const char __user *buf,
 	if (strcmp(kbuf, "add") == 0) {
 		struct afs_cell *cell;
 
-		cell = afs_cell_create(name, strlen(name), args, false);
+		cell = afs_cell_create(net, name, strlen(name), args, false);
 		if (IS_ERR(cell)) {
 			ret = PTR_ERR(cell);
 			goto done;
@@ -303,6 +318,7 @@ static ssize_t afs_proc_rootcell_write(struct file *file,
 				       const char __user *buf,
 				       size_t size, loff_t *_pos)
 {
+	struct afs_net *net = afs_proc2net(file);
 	char *kbuf, *s;
 	int ret;
 
@@ -322,7 +338,7 @@ static ssize_t afs_proc_rootcell_write(struct file *file,
 	/* determine command to perform */
 	_debug("rootcell=%s", kbuf);
 
-	ret = afs_cell_init(kbuf);
+	ret = afs_cell_init(net, kbuf);
 	if (ret >= 0)
 		ret = size;	/* consume everything, always */
 
@@ -334,13 +350,13 @@ static ssize_t afs_proc_rootcell_write(struct file *file,
 /*
  * initialise /proc/fs/afs/<cell>/
  */
-int afs_proc_cell_setup(struct afs_cell *cell)
+int afs_proc_cell_setup(struct afs_net *net, struct afs_cell *cell)
 {
 	struct proc_dir_entry *dir;
 
 	_enter("%p{%s}", cell, cell->name);
 
-	dir = proc_mkdir(cell->name, proc_afs);
+	dir = proc_mkdir(cell->name, net->proc_afs);
 	if (!dir)
 		goto error_dir;
 
@@ -356,7 +372,7 @@ int afs_proc_cell_setup(struct afs_cell *cell)
 	return 0;
 
 error_tree:
-	remove_proc_subtree(cell->name, proc_afs);
+	remove_proc_subtree(cell->name, net->proc_afs);
 error_dir:
 	_leave(" = -ENOMEM");
 	return -ENOMEM;
@@ -365,11 +381,11 @@ error_dir:
 /*
  * remove /proc/fs/afs/<cell>/
  */
-void afs_proc_cell_remove(struct afs_cell *cell)
+void afs_proc_cell_remove(struct afs_net *net, struct afs_cell *cell)
 {
 	_enter("");
 
-	remove_proc_subtree(cell->name, proc_afs);
+	remove_proc_subtree(cell->name, net->proc_afs);
 
 	_leave("");
 }
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 77f5420a1a24..656ceb285b85 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -17,10 +17,7 @@
 #include "internal.h"
 #include "afs_cm.h"
 
-struct socket *afs_socket; /* my RxRPC socket */
-static struct workqueue_struct *afs_async_calls;
-static struct afs_call *afs_spare_incoming_call;
-atomic_t afs_outstanding_calls;
+struct workqueue_struct *afs_async_calls;
 
 static void afs_wake_up_call_waiter(struct sock *, struct rxrpc_call *, unsigned long);
 static int afs_wait_for_call_to_complete(struct afs_call *);
@@ -37,15 +34,11 @@ static const struct afs_call_type afs_RXCMxxxx = {
 	.abort_to_error	= afs_abort_to_error,
 };
 
-static void afs_charge_preallocation(struct work_struct *);
-
-static DECLARE_WORK(afs_charge_preallocation_work, afs_charge_preallocation);
-
 /*
  * open an RxRPC socket and bind it to be a server for callback notifications
  * - the socket is left in blocking mode and non-blocking ops use MSG_DONTWAIT
  */
-int afs_open_socket(void)
+int afs_open_socket(struct afs_net *net)
 {
 	struct sockaddr_rxrpc srx;
 	struct socket *socket;
@@ -53,11 +46,6 @@ int afs_open_socket(void)
 
 	_enter("");
 
-	ret = -ENOMEM;
-	afs_async_calls = alloc_workqueue("kafsd", WQ_MEM_RECLAIM, 0);
-	if (!afs_async_calls)
-		goto error_0;
-
 	ret = sock_create_kern(&init_net, AF_RXRPC, SOCK_DGRAM, PF_INET, &socket);
 	if (ret < 0)
 		goto error_1;
@@ -85,16 +73,14 @@ int afs_open_socket(void)
 	if (ret < 0)
 		goto error_2;
 
-	afs_socket = socket;
-	afs_charge_preallocation(NULL);
+	net->socket = socket;
+	afs_charge_preallocation(&net->charge_preallocation_work);
 	_leave(" = 0");
 	return 0;
 
 error_2:
 	sock_release(socket);
 error_1:
-	destroy_workqueue(afs_async_calls);
-error_0:
 	_leave(" = %d", ret);
 	return ret;
 }
@@ -102,36 +88,36 @@ error_0:
 /*
  * close the RxRPC socket AFS was using
  */
-void afs_close_socket(void)
+void afs_close_socket(struct afs_net *net)
 {
 	_enter("");
 
-	kernel_listen(afs_socket, 0);
+	kernel_listen(net->socket, 0);
 	flush_workqueue(afs_async_calls);
 
-	if (afs_spare_incoming_call) {
-		afs_put_call(afs_spare_incoming_call);
-		afs_spare_incoming_call = NULL;
+	if (net->spare_incoming_call) {
+		afs_put_call(net->spare_incoming_call);
+		net->spare_incoming_call = NULL;
 	}
 
-	_debug("outstanding %u", atomic_read(&afs_outstanding_calls));
-	wait_on_atomic_t(&afs_outstanding_calls, atomic_t_wait,
+	_debug("outstanding %u", atomic_read(&net->nr_outstanding_calls));
+	wait_on_atomic_t(&net->nr_outstanding_calls, atomic_t_wait,
 			 TASK_UNINTERRUPTIBLE);
 	_debug("no outstanding calls");
 
-	kernel_sock_shutdown(afs_socket, SHUT_RDWR);
+	kernel_sock_shutdown(net->socket, SHUT_RDWR);
 	flush_workqueue(afs_async_calls);
-	sock_release(afs_socket);
+	sock_release(net->socket);
 
 	_debug("dework");
-	destroy_workqueue(afs_async_calls);
 	_leave("");
 }
 
 /*
  * Allocate a call.
  */
-static struct afs_call *afs_alloc_call(const struct afs_call_type *type,
+static struct afs_call *afs_alloc_call(struct afs_net *net,
+				       const struct afs_call_type *type,
 				       gfp_t gfp)
 {
 	struct afs_call *call;
@@ -142,11 +128,12 @@ static struct afs_call *afs_alloc_call(const struct afs_call_type *type,
 		return NULL;
 
 	call->type = type;
+	call->net = net;
 	atomic_set(&call->usage, 1);
 	INIT_WORK(&call->async_work, afs_process_async_call);
 	init_waitqueue_head(&call->waitq);
 
-	o = atomic_inc_return(&afs_outstanding_calls);
+	o = atomic_inc_return(&net->nr_outstanding_calls);
 	trace_afs_call(call, afs_call_trace_alloc, 1, o,
 		       __builtin_return_address(0));
 	return call;
@@ -157,8 +144,9 @@ static struct afs_call *afs_alloc_call(const struct afs_call_type *type,
  */
 void afs_put_call(struct afs_call *call)
 {
+	struct afs_net *net = call->net;
 	int n = atomic_dec_return(&call->usage);
-	int o = atomic_read(&afs_outstanding_calls);
+	int o = atomic_read(&net->nr_outstanding_calls);
 
 	trace_afs_call(call, afs_call_trace_put, n + 1, o,
 		       __builtin_return_address(0));
@@ -169,7 +157,7 @@ void afs_put_call(struct afs_call *call)
 		ASSERT(call->type->name != NULL);
 
 		if (call->rxcall) {
-			rxrpc_kernel_end_call(afs_socket, call->rxcall);
+			rxrpc_kernel_end_call(net->socket, call->rxcall);
 			call->rxcall = NULL;
 		}
 		if (call->type->destructor)
@@ -178,11 +166,11 @@ void afs_put_call(struct afs_call *call)
 		kfree(call->request);
 		kfree(call);
 
-		o = atomic_dec_return(&afs_outstanding_calls);
+		o = atomic_dec_return(&net->nr_outstanding_calls);
 		trace_afs_call(call, afs_call_trace_free, 0, o,
 			       __builtin_return_address(0));
 		if (o == 0)
-			wake_up_atomic_t(&afs_outstanding_calls);
+			wake_up_atomic_t(&net->nr_outstanding_calls);
 	}
 }
 
@@ -194,7 +182,7 @@ int afs_queue_call_work(struct afs_call *call)
 	int u = atomic_inc_return(&call->usage);
 
 	trace_afs_call(call, afs_call_trace_work, u,
-		       atomic_read(&afs_outstanding_calls),
+		       atomic_read(&call->net->nr_outstanding_calls),
 		       __builtin_return_address(0));
 
 	INIT_WORK(&call->work, call->type->work);
@@ -207,12 +195,13 @@ int afs_queue_call_work(struct afs_call *call)
 /*
  * allocate a call with flat request and reply buffers
  */
-struct afs_call *afs_alloc_flat_call(const struct afs_call_type *type,
+struct afs_call *afs_alloc_flat_call(struct afs_net *net,
+				     const struct afs_call_type *type,
 				     size_t request_size, size_t reply_max)
 {
 	struct afs_call *call;
 
-	call = afs_alloc_call(type, GFP_NOFS);
+	call = afs_alloc_call(net, type, GFP_NOFS);
 	if (!call)
 		goto nomem_call;
 
@@ -317,7 +306,7 @@ static int afs_send_pages(struct afs_call *call, struct msghdr *msg)
 		bytes = msg->msg_iter.count;
 		nr = msg->msg_iter.nr_segs;
 
-		ret = rxrpc_kernel_send_data(afs_socket, call->rxcall, msg,
+		ret = rxrpc_kernel_send_data(call->net->socket, call->rxcall, msg,
 					     bytes, afs_notify_end_request_tx);
 		for (loop = 0; loop < nr; loop++)
 			put_page(bv[loop].bv_page);
@@ -352,7 +341,7 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp,
 
 	_debug("____MAKE %p{%s,%x} [%d]____",
 	       call, call->type->name, key_serial(call->key),
-	       atomic_read(&afs_outstanding_calls));
+	       atomic_read(&call->net->nr_outstanding_calls));
 
 	call->async = async;
 
@@ -376,7 +365,7 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp,
 	}
 
 	/* create a call */
-	rxcall = rxrpc_kernel_begin_call(afs_socket, &srx, call->key,
+	rxcall = rxrpc_kernel_begin_call(call->net->socket, &srx, call->key,
 					 (unsigned long)call,
 					 tx_total_len, gfp,
 					 (async ?
@@ -410,7 +399,7 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp,
 	 */
 	if (!call->send_pages)
 		call->state = AFS_CALL_AWAIT_REPLY;
-	ret = rxrpc_kernel_send_data(afs_socket, rxcall,
+	ret = rxrpc_kernel_send_data(call->net->socket, rxcall,
 				     &msg, call->request_size,
 				     afs_notify_end_request_tx);
 	if (ret < 0)
@@ -432,13 +421,14 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp,
 error_do_abort:
 	call->state = AFS_CALL_COMPLETE;
 	if (ret != -ECONNABORTED) {
-		rxrpc_kernel_abort_call(afs_socket, rxcall, RX_USER_ABORT,
-					ret, "KSD");
+		rxrpc_kernel_abort_call(call->net->socket, rxcall,
+					RX_USER_ABORT, ret, "KSD");
 	} else {
 		abort_code = 0;
 		offset = 0;
-		rxrpc_kernel_recv_data(afs_socket, rxcall, NULL, 0, &offset,
-				       false, &abort_code, &call->service_id);
+		rxrpc_kernel_recv_data(call->net->socket, rxcall, NULL,
+				       0, &offset, false, &abort_code,
+				       &call->service_id);
 		ret = call->type->abort_to_error(abort_code);
 	}
 error_kill_call:
@@ -464,7 +454,8 @@ static void afs_deliver_to_call(struct afs_call *call)
 	       ) {
 		if (call->state == AFS_CALL_AWAIT_ACK) {
 			size_t offset = 0;
-			ret = rxrpc_kernel_recv_data(afs_socket, call->rxcall,
+			ret = rxrpc_kernel_recv_data(call->net->socket,
+						     call->rxcall,
 						     NULL, 0, &offset, false,
 						     &call->abort_code,
 						     &call->service_id);
@@ -492,12 +483,12 @@ static void afs_deliver_to_call(struct afs_call *call)
 			goto call_complete;
 		case -ENOTCONN:
 			abort_code = RX_CALL_DEAD;
-			rxrpc_kernel_abort_call(afs_socket, call->rxcall,
+			rxrpc_kernel_abort_call(call->net->socket, call->rxcall,
 						abort_code, ret, "KNC");
 			goto save_error;
 		case -ENOTSUPP:
 			abort_code = RXGEN_OPCODE;
-			rxrpc_kernel_abort_call(afs_socket, call->rxcall,
+			rxrpc_kernel_abort_call(call->net->socket, call->rxcall,
 						abort_code, ret, "KIV");
 			goto save_error;
 		case -ENODATA:
@@ -507,7 +498,7 @@ static void afs_deliver_to_call(struct afs_call *call)
 			abort_code = RXGEN_CC_UNMARSHAL;
 			if (call->state != AFS_CALL_AWAIT_REPLY)
 				abort_code = RXGEN_SS_UNMARSHAL;
-			rxrpc_kernel_abort_call(afs_socket, call->rxcall,
+			rxrpc_kernel_abort_call(call->net->socket, call->rxcall,
 						abort_code, -EBADMSG, "KUM");
 			goto save_error;
 		}
@@ -541,13 +532,13 @@ static int afs_wait_for_call_to_complete(struct afs_call *call)
 
 	_enter("");
 
-	rtt = rxrpc_kernel_get_rtt(afs_socket, call->rxcall);
+	rtt = rxrpc_kernel_get_rtt(call->net->socket, call->rxcall);
 	rtt2 = nsecs_to_jiffies64(rtt) * 2;
 	if (rtt2 < 2)
 		rtt2 = 2;
 
 	timeout = rtt2;
-	last_life = rxrpc_kernel_check_life(afs_socket, call->rxcall);
+	last_life = rxrpc_kernel_check_life(call->net->socket, call->rxcall);
 
 	add_wait_queue(&call->waitq, &myself);
 	for (;;) {
@@ -564,7 +555,7 @@ static int afs_wait_for_call_to_complete(struct afs_call *call)
 		if (call->state == AFS_CALL_COMPLETE)
 			break;
 
-		life = rxrpc_kernel_check_life(afs_socket, call->rxcall);
+		life = rxrpc_kernel_check_life(call->net->socket, call->rxcall);
 		if (timeout == 0 &&
 		    life == last_life && signal_pending(current))
 				break;
@@ -583,7 +574,7 @@ static int afs_wait_for_call_to_complete(struct afs_call *call)
 	/* Kill off the call if it's still live. */
 	if (call->state < AFS_CALL_COMPLETE) {
 		_debug("call interrupted");
-		rxrpc_kernel_abort_call(afs_socket, call->rxcall,
+		rxrpc_kernel_abort_call(call->net->socket, call->rxcall,
 					RX_USER_ABORT, -EINTR, "KWI");
 	}
 
@@ -621,7 +612,7 @@ static void afs_wake_up_async_call(struct sock *sk, struct rxrpc_call *rxcall,
 	u = __atomic_add_unless(&call->usage, 1, 0);
 	if (u != 0) {
 		trace_afs_call(call, afs_call_trace_wake, u,
-			       atomic_read(&afs_outstanding_calls),
+			       atomic_read(&call->net->nr_outstanding_calls),
 			       __builtin_return_address(0));
 
 		if (!queue_work(afs_async_calls, &call->async_work))
@@ -685,13 +676,15 @@ static void afs_rx_attach(struct rxrpc_call *rxcall, unsigned long user_call_ID)
 /*
  * Charge the incoming call preallocation.
  */
-static void afs_charge_preallocation(struct work_struct *work)
+void afs_charge_preallocation(struct work_struct *work)
 {
-	struct afs_call *call = afs_spare_incoming_call;
+	struct afs_net *net =
+		container_of(work, struct afs_net, charge_preallocation_work);
+	struct afs_call *call = net->spare_incoming_call;
 
 	for (;;) {
 		if (!call) {
-			call = afs_alloc_call(&afs_RXCMxxxx, GFP_KERNEL);
+			call = afs_alloc_call(net, &afs_RXCMxxxx, GFP_KERNEL);
 			if (!call)
 				break;
 
@@ -700,7 +693,7 @@ static void afs_charge_preallocation(struct work_struct *work)
 			init_waitqueue_head(&call->waitq);
 		}
 
-		if (rxrpc_kernel_charge_accept(afs_socket,
+		if (rxrpc_kernel_charge_accept(net->socket,
 					       afs_wake_up_async_call,
 					       afs_rx_attach,
 					       (unsigned long)call,
@@ -708,7 +701,7 @@ static void afs_charge_preallocation(struct work_struct *work)
 			break;
 		call = NULL;
 	}
-	afs_spare_incoming_call = call;
+	net->spare_incoming_call = call;
 }
 
 /*
@@ -729,7 +722,9 @@ static void afs_rx_discard_new_call(struct rxrpc_call *rxcall,
 static void afs_rx_new_call(struct sock *sk, struct rxrpc_call *rxcall,
 			    unsigned long user_call_ID)
 {
-	queue_work(afs_wq, &afs_charge_preallocation_work);
+	struct afs_net *net = afs_sock2net(sk);
+
+	queue_work(afs_wq, &net->charge_preallocation_work);
 }
 
 /*
@@ -784,11 +779,12 @@ static void afs_notify_end_reply_tx(struct sock *sock,
  */
 void afs_send_empty_reply(struct afs_call *call)
 {
+	struct afs_net *net = call->net;
 	struct msghdr msg;
 
 	_enter("");
 
-	rxrpc_kernel_set_tx_length(afs_socket, call->rxcall, 0);
+	rxrpc_kernel_set_tx_length(net->socket, call->rxcall, 0);
 
 	msg.msg_name		= NULL;
 	msg.msg_namelen		= 0;
@@ -798,7 +794,7 @@ void afs_send_empty_reply(struct afs_call *call)
 	msg.msg_flags		= 0;
 
 	call->state = AFS_CALL_AWAIT_ACK;
-	switch (rxrpc_kernel_send_data(afs_socket, call->rxcall, &msg, 0,
+	switch (rxrpc_kernel_send_data(net->socket, call->rxcall, &msg, 0,
 				       afs_notify_end_reply_tx)) {
 	case 0:
 		_leave(" [replied]");
@@ -806,7 +802,7 @@ void afs_send_empty_reply(struct afs_call *call)
 
 	case -ENOMEM:
 		_debug("oom");
-		rxrpc_kernel_abort_call(afs_socket, call->rxcall,
+		rxrpc_kernel_abort_call(net->socket, call->rxcall,
 					RX_USER_ABORT, -ENOMEM, "KOO");
 	default:
 		_leave(" [error]");
@@ -819,13 +815,14 @@ void afs_send_empty_reply(struct afs_call *call)
  */
 void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
 {
+	struct afs_net *net = call->net;
 	struct msghdr msg;
 	struct kvec iov[1];
 	int n;
 
 	_enter("");
 
-	rxrpc_kernel_set_tx_length(afs_socket, call->rxcall, len);
+	rxrpc_kernel_set_tx_length(net->socket, call->rxcall, len);
 
 	iov[0].iov_base		= (void *) buf;
 	iov[0].iov_len		= len;
@@ -837,7 +834,7 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
 	msg.msg_flags		= 0;
 
 	call->state = AFS_CALL_AWAIT_ACK;
-	n = rxrpc_kernel_send_data(afs_socket, call->rxcall, &msg, len,
+	n = rxrpc_kernel_send_data(net->socket, call->rxcall, &msg, len,
 				   afs_notify_end_reply_tx);
 	if (n >= 0) {
 		/* Success */
@@ -847,7 +844,7 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
 
 	if (n == -ENOMEM) {
 		_debug("oom");
-		rxrpc_kernel_abort_call(afs_socket, call->rxcall,
+		rxrpc_kernel_abort_call(net->socket, call->rxcall,
 					RX_USER_ABORT, -ENOMEM, "KOO");
 	}
 	_leave(" [error]");
@@ -859,6 +856,7 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
 int afs_extract_data(struct afs_call *call, void *buf, size_t count,
 		     bool want_more)
 {
+	struct afs_net *net = call->net;
 	int ret;
 
 	_enter("{%s,%zu},,%zu,%d",
@@ -866,7 +864,7 @@ int afs_extract_data(struct afs_call *call, void *buf, size_t count,
 
 	ASSERTCMP(call->offset, <=, count);
 
-	ret = rxrpc_kernel_recv_data(afs_socket, call->rxcall,
+	ret = rxrpc_kernel_recv_data(net->socket, call->rxcall,
 				     buf, count, &call->offset,
 				     want_more, &call->abort_code,
 				     &call->service_id);
diff --git a/fs/afs/server.c b/fs/afs/server.c
index c001b1f2455f..e47fd9bc0ddc 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -15,32 +15,22 @@
 
 static unsigned afs_server_timeout = 10;	/* server timeout in seconds */
 
-static void afs_reap_server(struct work_struct *);
-
-/* tree of all the servers, indexed by IP address */
-static struct rb_root afs_servers = RB_ROOT;
-static DEFINE_RWLOCK(afs_servers_lock);
-
-/* LRU list of all the servers not currently in use */
-static LIST_HEAD(afs_server_graveyard);
-static DEFINE_SPINLOCK(afs_server_graveyard_lock);
-static DECLARE_DELAYED_WORK(afs_server_reaper, afs_reap_server);
-
 /*
  * install a server record in the master tree
  */
 static int afs_install_server(struct afs_server *server)
 {
 	struct afs_server *xserver;
+	struct afs_net *net = server->cell->net;
 	struct rb_node **pp, *p;
 	int ret;
 
 	_enter("%p", server);
 
-	write_lock(&afs_servers_lock);
+	write_lock(&net->servers_lock);
 
 	ret = -EEXIST;
-	pp = &afs_servers.rb_node;
+	pp = &net->servers.rb_node;
 	p = NULL;
 	while (*pp) {
 		p = *pp;
@@ -55,11 +45,11 @@ static int afs_install_server(struct afs_server *server)
 	}
 
 	rb_link_node(&server->master_rb, p, pp);
-	rb_insert_color(&server->master_rb, &afs_servers);
+	rb_insert_color(&server->master_rb, &net->servers);
 	ret = 0;
 
 error:
-	write_unlock(&afs_servers_lock);
+	write_unlock(&net->servers_lock);
 	return ret;
 }
 
@@ -150,9 +140,9 @@ found_server_quickly:
 	read_unlock(&cell->servers_lock);
 no_longer_unused:
 	if (!list_empty(&server->grave)) {
-		spin_lock(&afs_server_graveyard_lock);
+		spin_lock(&cell->net->server_graveyard_lock);
 		list_del_init(&server->grave);
-		spin_unlock(&afs_server_graveyard_lock);
+		spin_unlock(&cell->net->server_graveyard_lock);
 	}
 	_leave(" = %p{%d}", server, atomic_read(&server->usage));
 	return server;
@@ -178,7 +168,8 @@ server_in_two_cells:
 /*
  * look up a server by its IP address
  */
-struct afs_server *afs_find_server(const struct sockaddr_rxrpc *srx)
+struct afs_server *afs_find_server(struct afs_net *net,
+				   const struct sockaddr_rxrpc *srx)
 {
 	struct afs_server *server = NULL;
 	struct rb_node *p;
@@ -191,9 +182,9 @@ struct afs_server *afs_find_server(const struct sockaddr_rxrpc *srx)
 		return NULL;
 	}
 
-	read_lock(&afs_servers_lock);
+	read_lock(&net->servers_lock);
 
-	p = afs_servers.rb_node;
+	p = net->servers.rb_node;
 	while (p) {
 		server = rb_entry(p, struct afs_server, master_rb);
 
@@ -211,7 +202,7 @@ struct afs_server *afs_find_server(const struct sockaddr_rxrpc *srx)
 
 	server = NULL;
 found:
-	read_unlock(&afs_servers_lock);
+	read_unlock(&net->servers_lock);
 	ASSERTIFCMP(server, server->addr.s_addr, ==, addr.s_addr);
 	_leave(" = %p", server);
 	return server;
@@ -223,6 +214,8 @@ found:
  */
 void afs_put_server(struct afs_server *server)
 {
+	struct afs_net *net = server ? server->cell->net : NULL;
+
 	if (!server)
 		return;
 
@@ -239,14 +232,14 @@ void afs_put_server(struct afs_server *server)
 
 	afs_flush_callback_breaks(server);
 
-	spin_lock(&afs_server_graveyard_lock);
+	spin_lock(&net->server_graveyard_lock);
 	if (atomic_read(&server->usage) == 0) {
-		list_move_tail(&server->grave, &afs_server_graveyard);
+		list_move_tail(&server->grave, &net->server_graveyard);
 		server->time_of_death = ktime_get_real_seconds();
-		queue_delayed_work(afs_wq, &afs_server_reaper,
-				   afs_server_timeout * HZ);
+		queue_delayed_work(afs_wq, &net->server_reaper,
+				   net->live ? afs_server_timeout * HZ : 0);
 	}
-	spin_unlock(&afs_server_graveyard_lock);
+	spin_unlock(&net->server_graveyard_lock);
 	_leave(" [dead]");
 }
 
@@ -272,42 +265,45 @@ static void afs_destroy_server(struct afs_server *server)
 /*
  * reap dead server records
  */
-static void afs_reap_server(struct work_struct *work)
+void afs_reap_server(struct work_struct *work)
 {
 	LIST_HEAD(corpses);
 	struct afs_server *server;
+	struct afs_net *net = container_of(work, struct afs_net, server_reaper.work);
 	unsigned long delay, expiry;
 	time64_t now;
 
 	now = ktime_get_real_seconds();
-	spin_lock(&afs_server_graveyard_lock);
+	spin_lock(&net->server_graveyard_lock);
 
-	while (!list_empty(&afs_server_graveyard)) {
-		server = list_entry(afs_server_graveyard.next,
+	while (!list_empty(&net->server_graveyard)) {
+		server = list_entry(net->server_graveyard.next,
 				    struct afs_server, grave);
 
 		/* the queue is ordered most dead first */
-		expiry = server->time_of_death + afs_server_timeout;
-		if (expiry > now) {
-			delay = (expiry - now) * HZ;
-			mod_delayed_work(afs_wq, &afs_server_reaper, delay);
-			break;
+		if (net->live) {
+			expiry = server->time_of_death + afs_server_timeout;
+			if (expiry > now) {
+				delay = (expiry - now) * HZ;
+				mod_delayed_work(afs_wq, &net->server_reaper, delay);
+				break;
+			}
 		}
 
 		write_lock(&server->cell->servers_lock);
-		write_lock(&afs_servers_lock);
+		write_lock(&net->servers_lock);
 		if (atomic_read(&server->usage) > 0) {
 			list_del_init(&server->grave);
 		} else {
 			list_move_tail(&server->grave, &corpses);
 			list_del_init(&server->link);
-			rb_erase(&server->master_rb, &afs_servers);
+			rb_erase(&server->master_rb, &net->servers);
 		}
-		write_unlock(&afs_servers_lock);
+		write_unlock(&net->servers_lock);
 		write_unlock(&server->cell->servers_lock);
 	}
 
-	spin_unlock(&afs_server_graveyard_lock);
+	spin_unlock(&net->server_graveyard_lock);
 
 	/* now reap the corpses we've extracted */
 	while (!list_empty(&corpses)) {
@@ -318,10 +314,10 @@ static void afs_reap_server(struct work_struct *work)
 }
 
 /*
- * discard all the server records for rmmod
+ * Discard all the server records from a net namespace when it is destroyed or
+ * the afs module is removed.
  */
-void __exit afs_purge_servers(void)
+void __net_exit afs_purge_servers(struct afs_net *net)
 {
-	afs_server_timeout = 0;
-	mod_delayed_work(afs_wq, &afs_server_reaper, 0);
+	mod_delayed_work(afs_wq, &net->server_reaper, 0);
 }
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 689173c0a682..d47a9bc46a69 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -25,11 +25,10 @@
 #include <linux/statfs.h>
 #include <linux/sched.h>
 #include <linux/nsproxy.h>
+#include <linux/magic.h>
 #include <net/net_namespace.h>
 #include "internal.h"
 
-#define AFS_FS_MAGIC 0x6B414653 /* 'kAFS' */
-
 static void afs_i_init_once(void *foo);
 static struct dentry *afs_mount(struct file_system_type *fs_type,
 		      int flags, const char *dev_name, void *data);
@@ -201,7 +200,8 @@ static int afs_parse_options(struct afs_mount_params *params,
 		token = match_token(p, afs_options_list, args);
 		switch (token) {
 		case afs_opt_cell:
-			cell = afs_cell_lookup(args[0].from,
+			cell = afs_cell_lookup(params->net,
+					       args[0].from,
 					       args[0].to - args[0].from,
 					       false);
 			if (IS_ERR(cell))
@@ -308,7 +308,7 @@ static int afs_parse_device_name(struct afs_mount_params *params,
 
 	/* lookup the cell record */
 	if (cellname || !params->cell) {
-		cell = afs_cell_lookup(cellname, cellnamesz, true);
+		cell = afs_cell_lookup(params->net, cellname, cellnamesz, true);
 		if (IS_ERR(cell)) {
 			printk(KERN_ERR "kAFS: unable to lookup cell '%*.*s'\n",
 			       cellnamesz, cellnamesz, cellname ?: "");
@@ -334,7 +334,7 @@ static int afs_test_super(struct super_block *sb, void *data)
 	struct afs_super_info *as1 = data;
 	struct afs_super_info *as = sb->s_fs_info;
 
-	return as->volume == as1->volume;
+	return as->net == as1->net && as->volume == as1->volume;
 }
 
 static int afs_set_super(struct super_block *sb, void *data)
@@ -411,6 +411,7 @@ static struct dentry *afs_mount(struct file_system_type *fs_type,
 	_enter(",,%s,%p", dev_name, options);
 
 	memset(&params, 0, sizeof(params));
+	params.net = &__afs_net;
 
 	ret = -EINVAL;
 	if (current->nsproxy->net_ns != &init_net)
@@ -444,36 +445,32 @@ static struct dentry *afs_mount(struct file_system_type *fs_type,
 	}
 
 	/* allocate a superblock info record */
+	ret = -ENOMEM;
 	as = kzalloc(sizeof(struct afs_super_info), GFP_KERNEL);
-	if (!as) {
-		ret = -ENOMEM;
-		afs_put_volume(vol);
-		goto error;
-	}
+	if (!as)
+		goto error_vol;
+
+	as->net = afs_get_net(params.net);
 	as->volume = vol;
 
 	/* allocate a deviceless superblock */
 	sb = sget(fs_type, afs_test_super, afs_set_super, flags, as);
 	if (IS_ERR(sb)) {
 		ret = PTR_ERR(sb);
-		afs_put_volume(vol);
-		kfree(as);
-		goto error;
+		goto error_as;
 	}
 
 	if (!sb->s_root) {
 		/* initial superblock/root creation */
 		_debug("create");
 		ret = afs_fill_super(sb, &params);
-		if (ret < 0) {
-			deactivate_locked_super(sb);
-			goto error;
-		}
+		if (ret < 0)
+			goto error_sb;
 		sb->s_flags |= MS_ACTIVE;
 	} else {
 		_debug("reuse");
 		ASSERTCMP(sb->s_flags, &, MS_ACTIVE);
-		afs_put_volume(vol);
+		afs_put_volume(params.net, vol);
 		kfree(as);
 	}
 
@@ -482,6 +479,14 @@ static struct dentry *afs_mount(struct file_system_type *fs_type,
 	_leave(" = 0 [%p]", sb);
 	return dget(sb->s_root);
 
+error_sb:
+	deactivate_locked_super(sb);
+	goto error;
+error_as:
+	afs_put_net(as->net);
+	kfree(as);
+error_vol:
+	afs_put_volume(params.net, vol);
 error:
 	afs_put_cell(params.cell);
 	key_put(params.key);
@@ -493,8 +498,10 @@ error:
 static void afs_kill_super(struct super_block *sb)
 {
 	struct afs_super_info *as = sb->s_fs_info;
+	struct afs_net *net = as->net;
+
 	kill_anon_super(sb);
-	afs_put_volume(as->volume);
+	afs_put_volume(net, as->volume);
 	kfree(as);
 }
 
diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c
index a5e4cc561b6c..f5a043a9ba61 100644
--- a/fs/afs/vlclient.c
+++ b/fs/afs/vlclient.c
@@ -143,7 +143,8 @@ static const struct afs_call_type afs_RXVLGetEntryById = {
 /*
  * dispatch a get volume entry by name operation
  */
-int afs_vl_get_entry_by_name(struct in_addr *addr,
+int afs_vl_get_entry_by_name(struct afs_net *net,
+			     struct in_addr *addr,
 			     struct key *key,
 			     const char *volname,
 			     struct afs_cache_vlocation *entry,
@@ -159,7 +160,7 @@ int afs_vl_get_entry_by_name(struct in_addr *addr,
 	padsz = (4 - (volnamesz & 3)) & 3;
 	reqsz = 8 + volnamesz + padsz;
 
-	call = afs_alloc_flat_call(&afs_RXVLGetEntryByName, reqsz, 384);
+	call = afs_alloc_flat_call(net, &afs_RXVLGetEntryByName, reqsz, 384);
 	if (!call)
 		return -ENOMEM;
 
@@ -183,7 +184,8 @@ int afs_vl_get_entry_by_name(struct in_addr *addr,
 /*
  * dispatch a get volume entry by ID operation
  */
-int afs_vl_get_entry_by_id(struct in_addr *addr,
+int afs_vl_get_entry_by_id(struct afs_net *net,
+			   struct in_addr *addr,
 			   struct key *key,
 			   afs_volid_t volid,
 			   afs_voltype_t voltype,
@@ -195,7 +197,7 @@ int afs_vl_get_entry_by_id(struct in_addr *addr,
 
 	_enter("");
 
-	call = afs_alloc_flat_call(&afs_RXVLGetEntryById, 12, 384);
+	call = afs_alloc_flat_call(net, &afs_RXVLGetEntryById, 12, 384);
 	if (!call)
 		return -ENOMEM;
 
diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c
index 37b7c3b342a6..ccb7aacfbeca 100644
--- a/fs/afs/vlocation.c
+++ b/fs/afs/vlocation.c
@@ -16,20 +16,11 @@
 #include <linux/sched.h>
 #include "internal.h"
 
+struct workqueue_struct *afs_vlocation_update_worker;
+
 static unsigned afs_vlocation_timeout = 10;	/* volume location timeout in seconds */
 static unsigned afs_vlocation_update_timeout = 10 * 60;
 
-static void afs_vlocation_reaper(struct work_struct *);
-static void afs_vlocation_updater(struct work_struct *);
-
-static LIST_HEAD(afs_vlocation_updates);
-static LIST_HEAD(afs_vlocation_graveyard);
-static DEFINE_SPINLOCK(afs_vlocation_updates_lock);
-static DEFINE_SPINLOCK(afs_vlocation_graveyard_lock);
-static DECLARE_DELAYED_WORK(afs_vlocation_reap, afs_vlocation_reaper);
-static DECLARE_DELAYED_WORK(afs_vlocation_update, afs_vlocation_updater);
-static struct workqueue_struct *afs_vlocation_update_worker;
-
 /*
  * iterate through the VL servers in a cell until one of them admits knowing
  * about the volume in question
@@ -52,8 +43,8 @@ static int afs_vlocation_access_vl_by_name(struct afs_vlocation *vl,
 		_debug("CellServ[%hu]: %08x", cell->vl_curr_svix, addr.s_addr);
 
 		/* attempt to access the VL server */
-		ret = afs_vl_get_entry_by_name(&addr, key, vl->vldb.name, vldb,
-					       false);
+		ret = afs_vl_get_entry_by_name(cell->net, &addr, key,
+					       vl->vldb.name, vldb, false);
 		switch (ret) {
 		case 0:
 			goto out;
@@ -110,8 +101,8 @@ static int afs_vlocation_access_vl_by_id(struct afs_vlocation *vl,
 		_debug("CellServ[%hu]: %08x", cell->vl_curr_svix, addr.s_addr);
 
 		/* attempt to access the VL server */
-		ret = afs_vl_get_entry_by_id(&addr, key, volid, voltype, vldb,
-					     false);
+		ret = afs_vl_get_entry_by_id(cell->net, &addr, key, volid,
+					     voltype, vldb, false);
 		switch (ret) {
 		case 0:
 			goto out;
@@ -335,7 +326,8 @@ static int afs_vlocation_fill_in_record(struct afs_vlocation *vl,
 /*
  * queue a vlocation record for updates
  */
-static void afs_vlocation_queue_for_updates(struct afs_vlocation *vl)
+static void afs_vlocation_queue_for_updates(struct afs_net *net,
+					    struct afs_vlocation *vl)
 {
 	struct afs_vlocation *xvl;
 
@@ -343,25 +335,25 @@ static void afs_vlocation_queue_for_updates(struct afs_vlocation *vl)
 	vl->update_at = ktime_get_real_seconds() +
 			afs_vlocation_update_timeout;
 
-	spin_lock(&afs_vlocation_updates_lock);
+	spin_lock(&net->vl_updates_lock);
 
-	if (!list_empty(&afs_vlocation_updates)) {
+	if (!list_empty(&net->vl_updates)) {
 		/* ... but wait at least 1 second more than the newest record
 		 * already queued so that we don't spam the VL server suddenly
 		 * with lots of requests
 		 */
-		xvl = list_entry(afs_vlocation_updates.prev,
+		xvl = list_entry(net->vl_updates.prev,
 				 struct afs_vlocation, update);
 		if (vl->update_at <= xvl->update_at)
 			vl->update_at = xvl->update_at + 1;
-	} else {
+	} else if (net->live) {
 		queue_delayed_work(afs_vlocation_update_worker,
-				   &afs_vlocation_update,
+				   &net->vl_updater,
 				   afs_vlocation_update_timeout * HZ);
 	}
 
-	list_add_tail(&vl->update, &afs_vlocation_updates);
-	spin_unlock(&afs_vlocation_updates_lock);
+	list_add_tail(&vl->update, &net->vl_updates);
+	spin_unlock(&net->vl_updates_lock);
 }
 
 /*
@@ -371,7 +363,8 @@ static void afs_vlocation_queue_for_updates(struct afs_vlocation *vl)
  * - lookup in the local cache if not able to find on the VL server
  * - insert/update in the local cache if did get a VL response
  */
-struct afs_vlocation *afs_vlocation_lookup(struct afs_cell *cell,
+struct afs_vlocation *afs_vlocation_lookup(struct afs_net *net,
+					   struct afs_cell *cell,
 					   struct key *key,
 					   const char *name,
 					   size_t namesz)
@@ -427,7 +420,7 @@ fill_in_record:
 #endif
 
 	/* schedule for regular updates */
-	afs_vlocation_queue_for_updates(vl);
+	afs_vlocation_queue_for_updates(net, vl);
 	goto success;
 
 found_in_memory:
@@ -436,9 +429,9 @@ found_in_memory:
 	atomic_inc(&vl->usage);
 	spin_unlock(&cell->vl_lock);
 	if (!list_empty(&vl->grave)) {
-		spin_lock(&afs_vlocation_graveyard_lock);
+		spin_lock(&net->vl_graveyard_lock);
 		list_del_init(&vl->grave);
-		spin_unlock(&afs_vlocation_graveyard_lock);
+		spin_unlock(&net->vl_graveyard_lock);
 	}
 	up_write(&cell->vl_sem);
 
@@ -481,7 +474,7 @@ error_abandon:
 	wake_up(&vl->waitq);
 error:
 	ASSERT(vl != NULL);
-	afs_put_vlocation(vl);
+	afs_put_vlocation(net, vl);
 	_leave(" = %d", ret);
 	return ERR_PTR(ret);
 }
@@ -489,7 +482,7 @@ error:
 /*
  * finish using a volume location record
  */
-void afs_put_vlocation(struct afs_vlocation *vl)
+void afs_put_vlocation(struct afs_net *net, struct afs_vlocation *vl)
 {
 	if (!vl)
 		return;
@@ -503,22 +496,22 @@ void afs_put_vlocation(struct afs_vlocation *vl)
 		return;
 	}
 
-	spin_lock(&afs_vlocation_graveyard_lock);
+	spin_lock(&net->vl_graveyard_lock);
 	if (atomic_read(&vl->usage) == 0) {
 		_debug("buried");
-		list_move_tail(&vl->grave, &afs_vlocation_graveyard);
+		list_move_tail(&vl->grave, &net->vl_graveyard);
 		vl->time_of_death = ktime_get_real_seconds();
-		queue_delayed_work(afs_wq, &afs_vlocation_reap,
+		queue_delayed_work(afs_wq, &net->vl_reaper,
 				   afs_vlocation_timeout * HZ);
 
 		/* suspend updates on this record */
 		if (!list_empty(&vl->update)) {
-			spin_lock(&afs_vlocation_updates_lock);
+			spin_lock(&net->vl_updates_lock);
 			list_del_init(&vl->update);
-			spin_unlock(&afs_vlocation_updates_lock);
+			spin_unlock(&net->vl_updates_lock);
 		}
 	}
-	spin_unlock(&afs_vlocation_graveyard_lock);
+	spin_unlock(&net->vl_graveyard_lock);
 	_leave(" [killed?]");
 }
 
@@ -539,31 +532,34 @@ static void afs_vlocation_destroy(struct afs_vlocation *vl)
 /*
  * reap dead volume location records
  */
-static void afs_vlocation_reaper(struct work_struct *work)
+void afs_vlocation_reaper(struct work_struct *work)
 {
 	LIST_HEAD(corpses);
 	struct afs_vlocation *vl;
+	struct afs_net *net = container_of(work, struct afs_net, vl_reaper.work);
 	unsigned long delay, expiry;
 	time64_t now;
 
 	_enter("");
 
 	now = ktime_get_real_seconds();
-	spin_lock(&afs_vlocation_graveyard_lock);
+	spin_lock(&net->vl_graveyard_lock);
 
-	while (!list_empty(&afs_vlocation_graveyard)) {
-		vl = list_entry(afs_vlocation_graveyard.next,
+	while (!list_empty(&net->vl_graveyard)) {
+		vl = list_entry(net->vl_graveyard.next,
 				struct afs_vlocation, grave);
 
 		_debug("check %p", vl);
 
 		/* the queue is ordered most dead first */
-		expiry = vl->time_of_death + afs_vlocation_timeout;
-		if (expiry > now) {
-			delay = (expiry - now) * HZ;
-			_debug("delay %lu", delay);
-			mod_delayed_work(afs_wq, &afs_vlocation_reap, delay);
-			break;
+		if (net->live) {
+			expiry = vl->time_of_death + afs_vlocation_timeout;
+			if (expiry > now) {
+				delay = (expiry - now) * HZ;
+				_debug("delay %lu", delay);
+				mod_delayed_work(afs_wq, &net->vl_reaper, delay);
+				break;
+			}
 		}
 
 		spin_lock(&vl->cell->vl_lock);
@@ -578,7 +574,7 @@ static void afs_vlocation_reaper(struct work_struct *work)
 		spin_unlock(&vl->cell->vl_lock);
 	}
 
-	spin_unlock(&afs_vlocation_graveyard_lock);
+	spin_unlock(&net->vl_graveyard_lock);
 
 	/* now reap the corpses we've extracted */
 	while (!list_empty(&corpses)) {
@@ -590,57 +586,47 @@ static void afs_vlocation_reaper(struct work_struct *work)
 	_leave("");
 }
 
-/*
- * initialise the VL update process
- */
-int __init afs_vlocation_update_init(void)
-{
-	afs_vlocation_update_worker = alloc_workqueue("kafs_vlupdated",
-						      WQ_MEM_RECLAIM, 0);
-	return afs_vlocation_update_worker ? 0 : -ENOMEM;
-}
-
 /*
  * discard all the volume location records for rmmod
  */
-void afs_vlocation_purge(void)
+void __net_exit afs_vlocation_purge(struct afs_net *net)
 {
-	afs_vlocation_timeout = 0;
-
-	spin_lock(&afs_vlocation_updates_lock);
-	list_del_init(&afs_vlocation_updates);
-	spin_unlock(&afs_vlocation_updates_lock);
-	mod_delayed_work(afs_vlocation_update_worker, &afs_vlocation_update, 0);
-	destroy_workqueue(afs_vlocation_update_worker);
-
-	mod_delayed_work(afs_wq, &afs_vlocation_reap, 0);
+	spin_lock(&net->vl_updates_lock);
+	list_del_init(&net->vl_updates);
+	spin_unlock(&net->vl_updates_lock);
+	mod_delayed_work(afs_vlocation_update_worker, &net->vl_updater, 0);
+	mod_delayed_work(afs_wq, &net->vl_reaper, 0);
 }
 
 /*
  * update a volume location
  */
-static void afs_vlocation_updater(struct work_struct *work)
+void afs_vlocation_updater(struct work_struct *work)
 {
 	struct afs_cache_vlocation vldb;
 	struct afs_vlocation *vl, *xvl;
+	struct afs_net *net = container_of(work, struct afs_net, vl_updater.work);
 	time64_t now;
 	long timeout;
 	int ret;
 
+	if (!net->live)
+		return;
+
 	_enter("");
 
 	now = ktime_get_real_seconds();
 
 	/* find a record to update */
-	spin_lock(&afs_vlocation_updates_lock);
+	spin_lock(&net->vl_updates_lock);
 	for (;;) {
-		if (list_empty(&afs_vlocation_updates)) {
-			spin_unlock(&afs_vlocation_updates_lock);
+		if (list_empty(&net->vl_updates) || !net->live) {
+			spin_unlock(&net->vl_updates_lock);
 			_leave(" [nothing]");
 			return;
 		}
 
-		vl = list_entry(afs_vlocation_updates.next,
+		vl = list_entry(net->vl_updates.next,
 				struct afs_vlocation, update);
 		if (atomic_read(&vl->usage) > 0)
 			break;
@@ -650,15 +636,15 @@ static void afs_vlocation_updater(struct work_struct *work)
 	timeout = vl->update_at - now;
 	if (timeout > 0) {
 		queue_delayed_work(afs_vlocation_update_worker,
-				   &afs_vlocation_update, timeout * HZ);
-		spin_unlock(&afs_vlocation_updates_lock);
+				   &net->vl_updater, timeout * HZ);
+		spin_unlock(&net->vl_updates_lock);
 		_leave(" [nothing]");
 		return;
 	}
 
 	list_del_init(&vl->update);
 	atomic_inc(&vl->usage);
-	spin_unlock(&afs_vlocation_updates_lock);
+	spin_unlock(&net->vl_updates_lock);
 
 	/* we can now perform the update */
 	_debug("update %s", vl->vldb.name);
@@ -688,18 +674,18 @@ static void afs_vlocation_updater(struct work_struct *work)
 	vl->update_at = ktime_get_real_seconds() +
 			afs_vlocation_update_timeout;
 
-	spin_lock(&afs_vlocation_updates_lock);
+	spin_lock(&net->vl_updates_lock);
 
-	if (!list_empty(&afs_vlocation_updates)) {
+	if (!list_empty(&net->vl_updates)) {
 		/* next update in 10 minutes, but wait at least 1 second more
 		 * than the newest record already queued so that we don't spam
 		 * the VL server suddenly with lots of requests
 		 */
-		xvl = list_entry(afs_vlocation_updates.prev,
+		xvl = list_entry(net->vl_updates.prev,
 				 struct afs_vlocation, update);
 		if (vl->update_at <= xvl->update_at)
 			vl->update_at = xvl->update_at + 1;
-		xvl = list_entry(afs_vlocation_updates.next,
+		xvl = list_entry(net->vl_updates.next,
 				 struct afs_vlocation, update);
 		timeout = xvl->update_at - now;
 		if (timeout < 0)
@@ -710,11 +696,10 @@ static void afs_vlocation_updater(struct work_struct *work)
 
 	ASSERT(list_empty(&vl->update));
 
-	list_add_tail(&vl->update, &afs_vlocation_updates);
+	list_add_tail(&vl->update, &net->vl_updates);
 
 	_debug("timeout %ld", timeout);
-	queue_delayed_work(afs_vlocation_update_worker,
-			   &afs_vlocation_update, timeout * HZ);
-	spin_unlock(&afs_vlocation_updates_lock);
-	afs_put_vlocation(vl);
+	queue_delayed_work(afs_vlocation_update_worker, &net->vl_updater, timeout * HZ);
+	spin_unlock(&net->vl_updates_lock);
+	afs_put_vlocation(net, vl);
 }
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index db73d6dad02b..3d5363e0b7e1 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -54,7 +54,7 @@ struct afs_volume *afs_volume_lookup(struct afs_mount_params *params)
 	       params->volnamesz, params->volnamesz, params->volname, params->rwpath);
 
 	/* lookup the volume location record */
-	vlocation = afs_vlocation_lookup(params->cell, params->key,
+	vlocation = afs_vlocation_lookup(params->net, params->cell, params->key,
 					 params->volname, params->volnamesz);
 	if (IS_ERR(vlocation)) {
 		ret = PTR_ERR(vlocation);
@@ -138,7 +138,7 @@ success:
 	_debug("kAFS selected %s volume %08x",
 	       afs_voltypes[volume->type], volume->vid);
 	up_write(&params->cell->vl_sem);
-	afs_put_vlocation(vlocation);
+	afs_put_vlocation(params->net, vlocation);
 	_leave(" = %p", volume);
 	return volume;
 
@@ -146,7 +146,7 @@ success:
 error_up:
 	up_write(&params->cell->vl_sem);
 error:
-	afs_put_vlocation(vlocation);
+	afs_put_vlocation(params->net, vlocation);
 	_leave(" = %d", ret);
 	return ERR_PTR(ret);
 
@@ -163,7 +163,7 @@ error_discard:
 /*
  * destroy a volume record
  */
-void afs_put_volume(struct afs_volume *volume)
+void afs_put_volume(struct afs_net *net, struct afs_volume *volume)
 {
 	struct afs_vlocation *vlocation;
 	int loop;
@@ -195,7 +195,7 @@ void afs_put_volume(struct afs_volume *volume)
 #ifdef CONFIG_AFS_FSCACHE
 	fscache_relinquish_cookie(volume->cache, 0);
 #endif
-	afs_put_vlocation(vlocation);
+	afs_put_vlocation(net, vlocation);
 
 	for (loop = volume->nservers - 1; loop >= 0; loop--)
 		afs_put_server(volume->servers[loop]);
diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h
index aa50113ebe5b..1a6fee974116 100644
--- a/include/uapi/linux/magic.h
+++ b/include/uapi/linux/magic.h
@@ -47,6 +47,7 @@
 #define OPENPROM_SUPER_MAGIC	0x9fa1
 #define QNX4_SUPER_MAGIC	0x002f		/* qnx4 fs detection */
 #define QNX6_SUPER_MAGIC	0x68191122	/* qnx6 fs detection */
+#define AFS_FS_MAGIC		0x6B414653
 
 #define REISERFS_SUPER_MAGIC	0x52654973	/* used by gcc */
 					/* used by file system utilities that
-- 
cgit v1.2.3


From 7f88ba4a19b91d310eca836b647edeb100c61c8d Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Fri, 10 Nov 2017 15:13:10 -0600
Subject: PCI/ASPM: Reformat ASPM register definitions

Reformat register field definitions in the style used elsewhere and align
comments with names used in the spec.  No functional change intended.

Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Vidya Sagar <vidyas@nvidia.com>
---
 include/uapi/linux/pci_regs.h | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
index f8d58045926f..4150acb4cccb 100644
--- a/include/uapi/linux/pci_regs.h
+++ b/include/uapi/linux/pci_regs.h
@@ -995,19 +995,19 @@
 #define  PCI_PTM_CTRL_ENABLE		0x00000001  /* PTM enable */
 #define  PCI_PTM_CTRL_ROOT		0x00000002  /* Root select */
 
-/* L1 PM Substates */
-#define PCI_L1SS_CAP		    4	/* capability register */
-#define  PCI_L1SS_CAP_PCIPM_L1_2	 1	/* PCI PM L1.2 Support */
-#define  PCI_L1SS_CAP_PCIPM_L1_1	 2	/* PCI PM L1.1 Support */
-#define  PCI_L1SS_CAP_ASPM_L1_2		 4	/* ASPM L1.2 Support */
-#define  PCI_L1SS_CAP_ASPM_L1_1		 8	/* ASPM L1.1 Support */
-#define  PCI_L1SS_CAP_L1_PM_SS		16	/* L1 PM Substates Support */
-#define PCI_L1SS_CTL1		    8	/* Control Register 1 */
-#define  PCI_L1SS_CTL1_PCIPM_L1_2	1	/* PCI PM L1.2 Enable */
-#define  PCI_L1SS_CTL1_PCIPM_L1_1	2	/* PCI PM L1.1 Support */
-#define  PCI_L1SS_CTL1_ASPM_L1_2	4	/* ASPM L1.2 Support */
-#define  PCI_L1SS_CTL1_ASPM_L1_1	8	/* ASPM L1.1 Support */
-#define  PCI_L1SS_CTL1_L1SS_MASK	0x0000000F
-#define PCI_L1SS_CTL2		    0xC	/* Control Register 2 */
+/* ASPM L1 PM Substates */
+#define PCI_L1SS_CAP		0x04	/* Capabilities Register */
+#define  PCI_L1SS_CAP_PCIPM_L1_2	0x00000001  /* PCI-PM L1.2 Supported */
+#define  PCI_L1SS_CAP_PCIPM_L1_1	0x00000002  /* PCI-PM L1.1 Supported */
+#define  PCI_L1SS_CAP_ASPM_L1_2		0x00000004  /* ASPM L1.2 Supported */
+#define  PCI_L1SS_CAP_ASPM_L1_1		0x00000008  /* ASPM L1.1 Supported */
+#define  PCI_L1SS_CAP_L1_PM_SS		0x00000010  /* L1 PM Substates Supported */
+#define PCI_L1SS_CTL1		0x08	/* Control 1 Register */
+#define  PCI_L1SS_CTL1_PCIPM_L1_2	0x00000001  /* PCI-PM L1.2 Enable */
+#define  PCI_L1SS_CTL1_PCIPM_L1_1	0x00000002  /* PCI-PM L1.1 Enable */
+#define  PCI_L1SS_CTL1_ASPM_L1_2	0x00000004  /* ASPM L1.2 Enable */
+#define  PCI_L1SS_CTL1_ASPM_L1_1	0x00000008  /* ASPM L1.1 Enable */
+#define  PCI_L1SS_CTL1_L1SS_MASK	0x0000000f
+#define PCI_L1SS_CTL2		0x0c	/* Control 2 Register */
 
 #endif /* LINUX_PCI_REGS_H */
-- 
cgit v1.2.3


From a48f3d5b197494d903c97ff7bc0909dac65740f8 Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Mon, 13 Nov 2017 08:36:40 -0600
Subject: PCI/ASPM: Add L1 Substates definitions

Add and use #defines for L1 Substate register fields instead of hard-coding
the masks.  Also update comments to use names from the spec.  No functional
change intended.

Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Vidya Sagar <vidyas@nvidia.com>
---
 drivers/pci/pcie/aspm.c       | 34 ++++++++++++++++++++--------------
 include/uapi/linux/pci_regs.h |  6 ++++++
 2 files changed, 26 insertions(+), 14 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c
index a378dd9d2473..d240ffab24c1 100644
--- a/drivers/pci/pcie/aspm.c
+++ b/drivers/pci/pcie/aspm.c
@@ -450,24 +450,25 @@ static void aspm_calc_l1ss_info(struct pcie_link_state *link,
 	if (!(link->aspm_support & ASPM_STATE_L1_2_MASK))
 		return;
 
-	/* Choose the greater of the two T_cmn_mode_rstr_time */
-	val1 = (upreg->l1ss_cap >> 8) & 0xFF;
-	val2 = (dwreg->l1ss_cap >> 8) & 0xFF;
+	/* Choose the greater of the two Port Common_Mode_Restore_Times */
+	val1 = (upreg->l1ss_cap & PCI_L1SS_CAP_CM_RESTORE_TIME) >> 8;
+	val2 = (dwreg->l1ss_cap & PCI_L1SS_CAP_CM_RESTORE_TIME) >> 8;
 	if (val1 > val2)
 		link->l1ss.ctl1 |= val1 << 8;
 	else
 		link->l1ss.ctl1 |= val2 << 8;
+
 	/*
 	 * We currently use LTR L1.2 threshold to be fixed constant picked from
 	 * Intel's coreboot.
 	 */
 	link->l1ss.ctl1 |= LTR_L1_2_THRESHOLD_BITS;
 
-	/* Choose the greater of the two T_pwr_on */
-	val1 = (upreg->l1ss_cap >> 19) & 0x1F;
-	scale1 = (upreg->l1ss_cap >> 16) & 0x03;
-	val2 = (dwreg->l1ss_cap >> 19) & 0x1F;
-	scale2 = (dwreg->l1ss_cap >> 16) & 0x03;
+	/* Choose the greater of the two Port T_POWER_ON times */
+	val1   = (upreg->l1ss_cap & PCI_L1SS_CAP_P_PWR_ON_VALUE) >> 19;
+	scale1 = (upreg->l1ss_cap & PCI_L1SS_CAP_P_PWR_ON_SCALE) >> 16;
+	val2   = (dwreg->l1ss_cap & PCI_L1SS_CAP_P_PWR_ON_VALUE) >> 19;
+	scale2 = (dwreg->l1ss_cap & PCI_L1SS_CAP_P_PWR_ON_SCALE) >> 16;
 
 	if (calc_l1ss_pwron(link->pdev, scale1, val1) >
 	    calc_l1ss_pwron(link->downstream, scale2, val2))
@@ -646,21 +647,26 @@ static void pcie_config_aspm_l1ss(struct pcie_link_state *link, u32 state)
 
 	if (enable_req & ASPM_STATE_L1_2_MASK) {
 
-		/* Program T_pwr_on in both ports */
+		/* Program T_POWER_ON times in both ports */
 		pci_write_config_dword(parent, up_cap_ptr + PCI_L1SS_CTL2,
 				       link->l1ss.ctl2);
 		pci_write_config_dword(child, dw_cap_ptr + PCI_L1SS_CTL2,
 				       link->l1ss.ctl2);
 
-		/* Program T_cmn_mode in parent */
+		/* Program Common_Mode_Restore_Time in upstream device */
 		pci_clear_and_set_dword(parent, up_cap_ptr + PCI_L1SS_CTL1,
-					0xFF00, link->l1ss.ctl1);
+					PCI_L1SS_CTL1_CM_RESTORE_TIME,
+					link->l1ss.ctl1);
 
-		/* Program LTR L1.2 threshold in both ports */
+		/* Program LTR_L1.2_THRESHOLD time in both ports */
 		pci_clear_and_set_dword(parent,	up_cap_ptr + PCI_L1SS_CTL1,
-					0xE3FF0000, link->l1ss.ctl1);
+					PCI_L1SS_CTL1_LTR_L12_TH_VALUE |
+					PCI_L1SS_CTL1_LTR_L12_TH_SCALE,
+					link->l1ss.ctl1);
 		pci_clear_and_set_dword(child, dw_cap_ptr + PCI_L1SS_CTL1,
-					0xE3FF0000, link->l1ss.ctl1);
+					PCI_L1SS_CTL1_LTR_L12_TH_VALUE |
+					PCI_L1SS_CTL1_LTR_L12_TH_SCALE,
+					link->l1ss.ctl1);
 	}
 
 	val = 0;
diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
index 4150acb4cccb..85a4014de42e 100644
--- a/include/uapi/linux/pci_regs.h
+++ b/include/uapi/linux/pci_regs.h
@@ -1002,12 +1002,18 @@
 #define  PCI_L1SS_CAP_ASPM_L1_2		0x00000004  /* ASPM L1.2 Supported */
 #define  PCI_L1SS_CAP_ASPM_L1_1		0x00000008  /* ASPM L1.1 Supported */
 #define  PCI_L1SS_CAP_L1_PM_SS		0x00000010  /* L1 PM Substates Supported */
+#define  PCI_L1SS_CAP_CM_RESTORE_TIME	0x0000ff00  /* Port Common_Mode_Restore_Time */
+#define  PCI_L1SS_CAP_P_PWR_ON_SCALE	0x00030000  /* Port T_POWER_ON scale */
+#define  PCI_L1SS_CAP_P_PWR_ON_VALUE	0x00f80000  /* Port T_POWER_ON value */
 #define PCI_L1SS_CTL1		0x08	/* Control 1 Register */
 #define  PCI_L1SS_CTL1_PCIPM_L1_2	0x00000001  /* PCI-PM L1.2 Enable */
 #define  PCI_L1SS_CTL1_PCIPM_L1_1	0x00000002  /* PCI-PM L1.1 Enable */
 #define  PCI_L1SS_CTL1_ASPM_L1_2	0x00000004  /* ASPM L1.2 Enable */
 #define  PCI_L1SS_CTL1_ASPM_L1_1	0x00000008  /* ASPM L1.1 Enable */
 #define  PCI_L1SS_CTL1_L1SS_MASK	0x0000000f
+#define  PCI_L1SS_CTL1_CM_RESTORE_TIME	0x0000ff00  /* Common_Mode_Restore_Time */
+#define  PCI_L1SS_CTL1_LTR_L12_TH_VALUE	0x03ff0000  /* LTR_L1.2_THRESHOLD_Value */
+#define  PCI_L1SS_CTL1_LTR_L12_TH_SCALE	0xe0000000  /* LTR_L1.2_THRESHOLD_Scale */
 #define PCI_L1SS_CTL2		0x0c	/* Control 2 Register */
 
 #endif /* LINUX_PCI_REGS_H */
-- 
cgit v1.2.3


From 373d7080896a3cb3b28ae3a2abdafb7bb87552b1 Mon Sep 17 00:00:00 2001
From: Felix Kuehling <Felix.Kuehling@amd.com>
Date: Tue, 14 Nov 2017 16:41:19 -0500
Subject: drm/amdkfd: Add CWSR support

This hardware feature allows the GPU to preempt shader execution in
the middle of a compute wave, save the state and restore it later
to resume execution.

Memory for saving the state is allocated per queue in user mode, and
the address and size are passed to the create_queue ioctl. The size
depends on the number of waves that can be in flight simultaneously
on a given ASIC.

Signed-off-by: Shaoyun.liu <shaoyun.liu@amd.com>
Signed-off-by: Yong Zhao <yong.zhao@amd.com>
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c           |  7 +-
 drivers/gpu/drm/amd/amdkfd/kfd_device.c            | 20 ++++-
 .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c  |  6 ++
 drivers/gpu/drm/amd/amdkfd/kfd_module.c            |  4 +
 drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c    | 27 +++++++
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h              | 31 +++++++-
 drivers/gpu/drm/amd/amdkfd/kfd_process.c           | 87 +++++++++++++++++++++-
 include/uapi/linux/kfd_ioctl.h                     |  3 +-
 8 files changed, 179 insertions(+), 6 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 505d39156acd..2a4612d8437a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -117,7 +117,7 @@ static int kfd_open(struct inode *inode, struct file *filep)
 		return -EPERM;
 	}
 
-	process = kfd_create_process(current);
+	process = kfd_create_process(filep);
 	if (IS_ERR(process))
 		return PTR_ERR(process);
 
@@ -206,6 +206,7 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties,
 	q_properties->ctx_save_restore_area_address =
 			args->ctx_save_restore_address;
 	q_properties->ctx_save_restore_area_size = args->ctx_save_restore_size;
+	q_properties->ctl_stack_size = args->ctl_stack_size;
 	if (args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE ||
 		args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE_AQL)
 		q_properties->type = KFD_QUEUE_TYPE_COMPUTE;
@@ -1088,6 +1089,10 @@ static int kfd_mmap(struct file *filp, struct vm_area_struct *vma)
 			KFD_MMAP_EVENTS_MASK) {
 		vma->vm_pgoff = vma->vm_pgoff ^ KFD_MMAP_EVENTS_MASK;
 		return kfd_event_mmap(process, vma);
+	} else if ((vma->vm_pgoff & KFD_MMAP_RESERVED_MEM_MASK) ==
+			KFD_MMAP_RESERVED_MEM_MASK) {
+		vma->vm_pgoff = vma->vm_pgoff ^ KFD_MMAP_RESERVED_MEM_MASK;
+		return kfd_reserved_mem_mmap(process, vma);
 	}
 
 	return -EFAULT;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 621a3b53a038..4f05eacca786 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -27,6 +27,7 @@
 #include "kfd_priv.h"
 #include "kfd_device_queue_manager.h"
 #include "kfd_pm4_headers_vi.h"
+#include "cwsr_trap_handler_gfx8.asm"
 
 #define MQD_SIZE_ALIGNED 768
 
@@ -38,7 +39,8 @@ static const struct kfd_device_info kaveri_device_info = {
 	.ih_ring_entry_size = 4 * sizeof(uint32_t),
 	.event_interrupt_class = &event_interrupt_class_cik,
 	.num_of_watch_points = 4,
-	.mqd_size_aligned = MQD_SIZE_ALIGNED
+	.mqd_size_aligned = MQD_SIZE_ALIGNED,
+	.supports_cwsr = false,
 };
 
 static const struct kfd_device_info carrizo_device_info = {
@@ -49,7 +51,8 @@ static const struct kfd_device_info carrizo_device_info = {
 	.ih_ring_entry_size = 4 * sizeof(uint32_t),
 	.event_interrupt_class = &event_interrupt_class_cik,
 	.num_of_watch_points = 4,
-	.mqd_size_aligned = MQD_SIZE_ALIGNED
+	.mqd_size_aligned = MQD_SIZE_ALIGNED,
+	.supports_cwsr = true,
 };
 
 struct kfd_deviceid {
@@ -212,6 +215,17 @@ static int iommu_invalid_ppr_cb(struct pci_dev *pdev, int pasid,
 	return AMD_IOMMU_INV_PRI_RSP_INVALID;
 }
 
+static void kfd_cwsr_init(struct kfd_dev *kfd)
+{
+	if (cwsr_enable && kfd->device_info->supports_cwsr) {
+		BUILD_BUG_ON(sizeof(cwsr_trap_gfx8_hex) > PAGE_SIZE);
+
+		kfd->cwsr_isa = cwsr_trap_gfx8_hex;
+		kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx8_hex);
+		kfd->cwsr_enabled = true;
+	}
+}
+
 bool kgd2kfd_device_init(struct kfd_dev *kfd,
 			 const struct kgd2kfd_shared_resources *gpu_resources)
 {
@@ -286,6 +300,8 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
 		goto device_iommu_pasid_error;
 	}
 
+	kfd_cwsr_init(kfd);
+
 	if (kfd_resume(kfd))
 		goto kfd_resume_error;
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index e202921c150e..5c065024e285 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -173,6 +173,9 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
 	*allocated_vmid = qpd->vmid;
 	q->properties.vmid = qpd->vmid;
 
+	q->properties.tba_addr = qpd->tba_addr;
+	q->properties.tma_addr = qpd->tma_addr;
+
 	if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE)
 		retval = create_compute_queue_nocpsch(dqm, q, qpd);
 	else if (q->properties.type == KFD_QUEUE_TYPE_SDMA)
@@ -846,6 +849,9 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
 	}
 
 	dqm->asic_ops.init_sdma_vm(dqm, q, qpd);
+
+	q->properties.tba_addr = qpd->tba_addr;
+	q->properties.tma_addr = qpd->tma_addr;
 	retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj,
 				&q->gart_mqd_addr, &q->properties);
 	if (retval)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_module.c b/drivers/gpu/drm/amd/amdkfd/kfd_module.c
index f744caeaee04..ee8adf654cd0 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_module.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_module.c
@@ -50,6 +50,10 @@ module_param(sched_policy, int, 0444);
 MODULE_PARM_DESC(sched_policy,
 	"Scheduling policy (0 = HWS (Default), 1 = HWS without over-subscription, 2 = Non-HWS (Used for debugging only)");
 
+int cwsr_enable = 1;
+module_param(cwsr_enable, int, 0444);
+MODULE_PARM_DESC(cwsr_enable, "CWSR enable (0 = Off, 1 = On (Default))");
+
 int max_num_of_queues_per_device = KFD_MAX_NUM_OF_QUEUES_PER_DEVICE_DEFAULT;
 module_param(max_num_of_queues_per_device, int, 0444);
 MODULE_PARM_DESC(max_num_of_queues_per_device,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
index 2ba7cea7b99b..00e1f1a9728b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
@@ -89,6 +89,28 @@ static int init_mqd(struct mqd_manager *mm, void **mqd,
 	if (q->format == KFD_QUEUE_FORMAT_AQL)
 		m->cp_hqd_iq_rptr = 1;
 
+	if (q->tba_addr) {
+		m->compute_tba_lo = lower_32_bits(q->tba_addr >> 8);
+		m->compute_tba_hi = upper_32_bits(q->tba_addr >> 8);
+		m->compute_tma_lo = lower_32_bits(q->tma_addr >> 8);
+		m->compute_tma_hi = upper_32_bits(q->tma_addr >> 8);
+		m->compute_pgm_rsrc2 |=
+			(1 << COMPUTE_PGM_RSRC2__TRAP_PRESENT__SHIFT);
+	}
+
+	if (mm->dev->cwsr_enabled && q->ctx_save_restore_area_address) {
+		m->cp_hqd_persistent_state |=
+			(1 << CP_HQD_PERSISTENT_STATE__QSWITCH_MODE__SHIFT);
+		m->cp_hqd_ctx_save_base_addr_lo =
+			lower_32_bits(q->ctx_save_restore_area_address);
+		m->cp_hqd_ctx_save_base_addr_hi =
+			upper_32_bits(q->ctx_save_restore_area_address);
+		m->cp_hqd_ctx_save_size = q->ctx_save_restore_area_size;
+		m->cp_hqd_cntl_stack_size = q->ctl_stack_size;
+		m->cp_hqd_cntl_stack_offset = q->ctl_stack_size;
+		m->cp_hqd_wg_state_offset = q->ctl_stack_size;
+	}
+
 	*mqd = m;
 	if (gart_addr)
 		*gart_addr = addr;
@@ -167,6 +189,11 @@ static int __update_mqd(struct mqd_manager *mm, void *mqd,
 				2 << CP_HQD_PQ_CONTROL__SLOT_BASED_WPTR__SHIFT;
 	}
 
+	if (mm->dev->cwsr_enabled && q->ctx_save_restore_area_address)
+		m->cp_hqd_ctx_save_control =
+			atc_bit << CP_HQD_CTX_SAVE_CONTROL__ATC__SHIFT |
+			mtype << CP_HQD_CTX_SAVE_CONTROL__MTYPE__SHIFT;
+
 	q->is_active = (q->queue_size > 0 &&
 			q->queue_address != 0 &&
 			q->queue_percent > 0);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 47504737ab4a..a66876467995 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -41,6 +41,7 @@
 
 #define KFD_MMAP_DOORBELL_MASK 0x8000000000000
 #define KFD_MMAP_EVENTS_MASK 0x4000000000000
+#define KFD_MMAP_RESERVED_MEM_MASK 0x2000000000000
 
 /*
  * When working with cp scheduler we should assign the HIQ manually or via
@@ -62,6 +63,15 @@
 #define KFD_MAX_NUM_OF_PROCESSES 512
 #define KFD_MAX_NUM_OF_QUEUES_PER_PROCESS 1024
 
+/*
+ * Size of the per-process TBA+TMA buffer: 2 pages
+ *
+ * The first page is the TBA used for the CWSR ISA code. The second
+ * page is used as TMA for daisy chaining a user-mode trap handler.
+ */
+#define KFD_CWSR_TBA_TMA_SIZE (PAGE_SIZE * 2)
+#define KFD_CWSR_TMA_OFFSET PAGE_SIZE
+
 /*
  * Kernel module parameter to specify maximum number of supported queues per
  * device
@@ -78,6 +88,8 @@ extern int max_num_of_queues_per_device;
 /* Kernel module parameter to specify the scheduling policy */
 extern int sched_policy;
 
+extern int cwsr_enable;
+
 /*
  * Kernel module parameter to specify whether to send sigterm to HSA process on
  * unhandled exception
@@ -131,6 +143,7 @@ struct kfd_device_info {
 	size_t ih_ring_entry_size;
 	uint8_t num_of_watch_points;
 	uint16_t mqd_size_aligned;
+	bool supports_cwsr;
 };
 
 struct kfd_mem_obj {
@@ -200,6 +213,11 @@ struct kfd_dev {
 
 	/* Debug manager */
 	struct kfd_dbgmgr           *dbgmgr;
+
+	/* CWSR */
+	bool cwsr_enabled;
+	const void *cwsr_isa;
+	unsigned int cwsr_isa_size;
 };
 
 /* KGD2KFD callbacks */
@@ -332,6 +350,9 @@ struct queue_properties {
 	uint32_t eop_ring_buffer_size;
 	uint64_t ctx_save_restore_area_address;
 	uint32_t ctx_save_restore_area_size;
+	uint32_t ctl_stack_size;
+	uint64_t tba_addr;
+	uint64_t tma_addr;
 };
 
 /**
@@ -439,6 +460,11 @@ struct qcm_process_device {
 	uint32_t num_gws;
 	uint32_t num_oac;
 	uint32_t sh_hidden_private_base;
+
+	/* CWSR memory */
+	void *cwsr_kaddr;
+	uint64_t tba_addr;
+	uint64_t tma_addr;
 };
 
 
@@ -563,7 +589,7 @@ struct amdkfd_ioctl_desc {
 
 void kfd_process_create_wq(void);
 void kfd_process_destroy_wq(void);
-struct kfd_process *kfd_create_process(const struct task_struct *);
+struct kfd_process *kfd_create_process(struct file *filep);
 struct kfd_process *kfd_get_process(const struct task_struct *);
 struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid);
 
@@ -577,6 +603,9 @@ struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev,
 struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev,
 							struct kfd_process *p);
 
+int kfd_reserved_mem_mmap(struct kfd_process *process,
+			  struct vm_area_struct *vma);
+
 /* Process device data iterator */
 struct kfd_process_device *kfd_get_first_process_device_data(
 							struct kfd_process *p);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 1bb9b2643d5a..39f4c19aaf61 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -28,6 +28,7 @@
 #include <linux/amd-iommu.h>
 #include <linux/notifier.h>
 #include <linux/compat.h>
+#include <linux/mman.h>
 
 struct mm_struct;
 
@@ -53,6 +54,8 @@ struct kfd_process_release_work {
 
 static struct kfd_process *find_process(const struct task_struct *thread);
 static struct kfd_process *create_process(const struct task_struct *thread);
+static int kfd_process_init_cwsr(struct kfd_process *p, struct file *filep);
+
 
 void kfd_process_create_wq(void)
 {
@@ -68,9 +71,10 @@ void kfd_process_destroy_wq(void)
 	}
 }
 
-struct kfd_process *kfd_create_process(const struct task_struct *thread)
+struct kfd_process *kfd_create_process(struct file *filep)
 {
 	struct kfd_process *process;
+	struct task_struct *thread = current;
 
 	if (!thread->mm)
 		return ERR_PTR(-EINVAL);
@@ -101,6 +105,8 @@ struct kfd_process *kfd_create_process(const struct task_struct *thread)
 
 	up_write(&thread->mm->mmap_sem);
 
+	kfd_process_init_cwsr(process, filep);
+
 	return process;
 }
 
@@ -168,6 +174,11 @@ static void kfd_process_wq_release(struct work_struct *work)
 			amd_iommu_unbind_pasid(pdd->dev->pdev, p->pasid);
 
 		list_del(&pdd->per_device_list);
+
+		if (pdd->qpd.cwsr_kaddr)
+			free_pages((unsigned long)pdd->qpd.cwsr_kaddr,
+				get_order(KFD_CWSR_TBA_TMA_SIZE));
+
 		kfree(pdd);
 	}
 
@@ -260,6 +271,46 @@ static const struct mmu_notifier_ops kfd_process_mmu_notifier_ops = {
 	.release = kfd_process_notifier_release,
 };
 
+static int kfd_process_init_cwsr(struct kfd_process *p, struct file *filep)
+{
+	int err = 0;
+	unsigned long  offset;
+	struct kfd_process_device *temp, *pdd = NULL;
+	struct kfd_dev *dev = NULL;
+	struct qcm_process_device *qpd = NULL;
+
+	mutex_lock(&p->mutex);
+	list_for_each_entry_safe(pdd, temp, &p->per_device_data,
+				per_device_list) {
+		dev = pdd->dev;
+		qpd = &pdd->qpd;
+		if (!dev->cwsr_enabled || qpd->cwsr_kaddr)
+			continue;
+		offset = (dev->id | KFD_MMAP_RESERVED_MEM_MASK) << PAGE_SHIFT;
+		qpd->tba_addr = (int64_t)vm_mmap(filep, 0,
+			KFD_CWSR_TBA_TMA_SIZE, PROT_READ | PROT_EXEC,
+			MAP_SHARED, offset);
+
+		if (IS_ERR_VALUE(qpd->tba_addr)) {
+			pr_err("Failure to set tba address. error -%d.\n",
+				(int)qpd->tba_addr);
+			err = qpd->tba_addr;
+			qpd->tba_addr = 0;
+			qpd->cwsr_kaddr = NULL;
+			goto out;
+		}
+
+		memcpy(qpd->cwsr_kaddr, dev->cwsr_isa, dev->cwsr_isa_size);
+
+		qpd->tma_addr = qpd->tba_addr + KFD_CWSR_TMA_OFFSET;
+		pr_debug("set tba :0x%llx, tma:0x%llx, cwsr_kaddr:%p for pqm.\n",
+			qpd->tba_addr, qpd->tma_addr, qpd->cwsr_kaddr);
+	}
+out:
+	mutex_unlock(&p->mutex);
+	return err;
+}
+
 static struct kfd_process *create_process(const struct task_struct *thread)
 {
 	struct kfd_process *process;
@@ -535,3 +586,37 @@ struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid)
 
 	return p;
 }
+
+int kfd_reserved_mem_mmap(struct kfd_process *process,
+			  struct vm_area_struct *vma)
+{
+	struct kfd_dev *dev = kfd_device_by_id(vma->vm_pgoff);
+	struct kfd_process_device *pdd;
+	struct qcm_process_device *qpd;
+
+	if (!dev)
+		return -EINVAL;
+	if ((vma->vm_end - vma->vm_start) != KFD_CWSR_TBA_TMA_SIZE) {
+		pr_err("Incorrect CWSR mapping size.\n");
+		return -EINVAL;
+	}
+
+	pdd = kfd_get_process_device_data(dev, process);
+	if (!pdd)
+		return -EINVAL;
+	qpd = &pdd->qpd;
+
+	qpd->cwsr_kaddr = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+					get_order(KFD_CWSR_TBA_TMA_SIZE));
+	if (!qpd->cwsr_kaddr) {
+		pr_err("Error allocating per process CWSR buffer.\n");
+		return -ENOMEM;
+	}
+
+	vma->vm_flags |= VM_IO | VM_DONTCOPY | VM_DONTEXPAND
+		| VM_NORESERVE | VM_DONTDUMP | VM_PFNMAP;
+	/* Mapping pages to user process */
+	return remap_pfn_range(vma, vma->vm_start,
+			       PFN_DOWN(__pa(qpd->cwsr_kaddr)),
+			       KFD_CWSR_TBA_TMA_SIZE, vma->vm_page_prot);
+}
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index 6e80501368ae..f7563ef2e883 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -58,7 +58,8 @@ struct kfd_ioctl_create_queue_args {
 	__u64 eop_buffer_address;	/* to KFD */
 	__u64 eop_buffer_size;	/* to KFD */
 	__u64 ctx_save_restore_address; /* to KFD */
-	__u64 ctx_save_restore_size;	/* to KFD */
+	__u32 ctx_save_restore_size;	/* to KFD */
+	__u32 ctl_stack_size;		/* to KFD */
 };
 
 struct kfd_ioctl_destroy_queue_args {
-- 
cgit v1.2.3


From d7b9bd2248d794275b53d34e665f7c5a08c4b396 Mon Sep 17 00:00:00 2001
From: Felix Kuehling <Felix.Kuehling@amd.com>
Date: Tue, 14 Nov 2017 16:41:20 -0500
Subject: drm/amdkfd: Add support for user-mode trap handlers

A second-level user mode trap handler can be installed. The CWSR trap
handler jumps to the secondary trap handler conditionally for any
conditions not handled by it. This can be used e.g. for debugging or
catching math exceptions.

When CWSR is disabled, the user mode trap handler is installed as
first level trap handler.

Signed-off-by: Shaoyun.liu <shaoyun.liu@amd.com>
Signed-off-by: Jay Cornwall <Jay.Cornwall@amd.com>
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c           | 37 +++++++++++++++++++++-
 .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c  | 22 +++++++++++++
 .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.h  |  5 +++
 include/uapi/linux/kfd_ioctl.h                     | 12 ++++++-
 4 files changed, 74 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 2a4612d8437a..cc61ec289880 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -432,6 +432,38 @@ out:
 	return err;
 }
 
+static int kfd_ioctl_set_trap_handler(struct file *filep,
+					struct kfd_process *p, void *data)
+{
+	struct kfd_ioctl_set_trap_handler_args *args = data;
+	struct kfd_dev *dev;
+	int err = 0;
+	struct kfd_process_device *pdd;
+
+	dev = kfd_device_by_id(args->gpu_id);
+	if (dev == NULL)
+		return -EINVAL;
+
+	mutex_lock(&p->mutex);
+
+	pdd = kfd_bind_process_to_device(dev, p);
+	if (IS_ERR(pdd)) {
+		err = -ESRCH;
+		goto out;
+	}
+
+	if (dev->dqm->ops.set_trap_handler(dev->dqm,
+					&pdd->qpd,
+					args->tba_addr,
+					args->tma_addr))
+		err = -EINVAL;
+
+out:
+	mutex_unlock(&p->mutex);
+
+	return err;
+}
+
 static int kfd_ioctl_dbg_register(struct file *filep,
 				struct kfd_process *p, void *data)
 {
@@ -980,7 +1012,10 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = {
 			kfd_ioctl_set_scratch_backing_va, 0),
 
 	AMDKFD_IOCTL_DEF(AMDKFD_IOC_GET_TILE_CONFIG,
-			kfd_ioctl_get_tile_config, 0)
+			kfd_ioctl_get_tile_config, 0),
+
+	AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_TRAP_HANDLER,
+			kfd_ioctl_set_trap_handler, 0),
 };
 
 #define AMDKFD_CORE_IOCTL_COUNT	ARRAY_SIZE(amdkfd_ioctls)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 5c065024e285..8447810c9a1e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -1116,6 +1116,26 @@ out:
 	return retval;
 }
 
+static int set_trap_handler(struct device_queue_manager *dqm,
+				struct qcm_process_device *qpd,
+				uint64_t tba_addr,
+				uint64_t tma_addr)
+{
+	uint64_t *tma;
+
+	if (dqm->dev->cwsr_enabled) {
+		/* Jump from CWSR trap handler to user trap */
+		tma = (uint64_t *)(qpd->cwsr_kaddr + KFD_CWSR_TMA_OFFSET);
+		tma[0] = tba_addr;
+		tma[1] = tma_addr;
+	} else {
+		qpd->tba_addr = tba_addr;
+		qpd->tma_addr = tma_addr;
+	}
+
+	return 0;
+}
+
 static int process_termination_nocpsch(struct device_queue_manager *dqm,
 		struct qcm_process_device *qpd)
 {
@@ -1247,6 +1267,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
 		dqm->ops.create_kernel_queue = create_kernel_queue_cpsch;
 		dqm->ops.destroy_kernel_queue = destroy_kernel_queue_cpsch;
 		dqm->ops.set_cache_memory_policy = set_cache_memory_policy;
+		dqm->ops.set_trap_handler = set_trap_handler;
 		dqm->ops.process_termination = process_termination_cpsch;
 		break;
 	case KFD_SCHED_POLICY_NO_HWS:
@@ -1262,6 +1283,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
 		dqm->ops.initialize = initialize_nocpsch;
 		dqm->ops.uninitialize = uninitialize;
 		dqm->ops.set_cache_memory_policy = set_cache_memory_policy;
+		dqm->ops.set_trap_handler = set_trap_handler;
 		dqm->ops.process_termination = process_termination_nocpsch;
 		break;
 	default:
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
index 5b77cb69f732..8752edf9cd9b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -123,6 +123,11 @@ struct device_queue_manager_ops {
 					   void __user *alternate_aperture_base,
 					   uint64_t alternate_aperture_size);
 
+	int	(*set_trap_handler)(struct device_queue_manager *dqm,
+				    struct qcm_process_device *qpd,
+				    uint64_t tba_addr,
+				    uint64_t tma_addr);
+
 	int (*process_termination)(struct device_queue_manager *dqm,
 			struct qcm_process_device *qpd);
 };
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index f7563ef2e883..f4cab5b3ba9a 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -262,6 +262,13 @@ struct kfd_ioctl_get_tile_config_args {
 	 */
 };
 
+struct kfd_ioctl_set_trap_handler_args {
+	uint64_t tba_addr;		/* to KFD */
+	uint64_t tma_addr;		/* to KFD */
+	uint32_t gpu_id;		/* to KFD */
+	uint32_t pad;
+};
+
 #define AMDKFD_IOCTL_BASE 'K'
 #define AMDKFD_IO(nr)			_IO(AMDKFD_IOCTL_BASE, nr)
 #define AMDKFD_IOR(nr, type)		_IOR(AMDKFD_IOCTL_BASE, nr, type)
@@ -322,7 +329,10 @@ struct kfd_ioctl_get_tile_config_args {
 #define AMDKFD_IOC_GET_TILE_CONFIG                                      \
 		AMDKFD_IOWR(0x12, struct kfd_ioctl_get_tile_config_args)
 
+#define AMDKFD_IOC_SET_TRAP_HANDLER		\
+		AMDKFD_IOW(0x13, struct kfd_ioctl_set_trap_handler_args)
+
 #define AMDKFD_COMMAND_START		0x01
-#define AMDKFD_COMMAND_END		0x13
+#define AMDKFD_COMMAND_END		0x14
 
 #endif
-- 
cgit v1.2.3


From 0eef304bc9f7d079a1165e8cd2f24b078e9e1f2a Mon Sep 17 00:00:00 2001
From: "Dmitry V. Levin" <ldv@altlinux.org>
Date: Mon, 13 Nov 2017 03:37:06 +0300
Subject: uapi: fix linux/rxrpc.h userspace compilation errors

Consistently use types provided by <linux/types.h> to fix the following
linux/rxrpc.h userspace compilation errors:

/usr/include/linux/rxrpc.h:24:2: error: unknown type name 'u16'
  u16  srx_service; /* service desired */
/usr/include/linux/rxrpc.h:25:2: error: unknown type name 'u16'
  u16  transport_type; /* type of transport socket (SOCK_DGRAM) */
/usr/include/linux/rxrpc.h:26:2: error: unknown type name 'u16'
  u16  transport_len; /* length of transport address */

Use __kernel_sa_family_t instead of sa_family_t the same way
as uapi/linux/in.h does, to fix the following
linux/rxrpc.h userspace compilation errors:

/usr/include/linux/rxrpc.h:23:2: error: unknown type name 'sa_family_t'
  sa_family_t srx_family; /* address family */
/usr/include/linux/rxrpc.h:28:3: error: unknown type name 'sa_family_t'
  sa_family_t family;  /* transport address family */

Fixes: 727f8914477e ("rxrpc: Expose UAPI definitions to userspace")
Cc: <stable@vger.kernel.org> # v4.14
Signed-off-by: Dmitry V. Levin <ldv@altlinux.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/rxrpc.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/rxrpc.h b/include/uapi/linux/rxrpc.h
index 9656aad8f8f7..9d4afea308a4 100644
--- a/include/uapi/linux/rxrpc.h
+++ b/include/uapi/linux/rxrpc.h
@@ -20,12 +20,12 @@
  * RxRPC socket address
  */
 struct sockaddr_rxrpc {
-	sa_family_t	srx_family;	/* address family */
-	u16		srx_service;	/* service desired */
-	u16		transport_type;	/* type of transport socket (SOCK_DGRAM) */
-	u16		transport_len;	/* length of transport address */
+	__kernel_sa_family_t	srx_family;	/* address family */
+	__u16			srx_service;	/* service desired */
+	__u16			transport_type;	/* type of transport socket (SOCK_DGRAM) */
+	__u16			transport_len;	/* length of transport address */
 	union {
-		sa_family_t family;		/* transport address family */
+		__kernel_sa_family_t family;	/* transport address family */
 		struct sockaddr_in sin;		/* IPv4 transport address */
 		struct sockaddr_in6 sin6;	/* IPv6 transport address */
 	} transport;
-- 
cgit v1.2.3


From b9f3eb499d84f8d4adcb2f9212ec655700b28228 Mon Sep 17 00:00:00 2001
From: "Dmitry V. Levin" <ldv@altlinux.org>
Date: Tue, 14 Nov 2017 06:30:11 +0300
Subject: uapi: fix linux/tls.h userspace compilation error

Move inclusion of a private kernel header <net/tcp.h>
from uapi/linux/tls.h to its only user - net/tls.h,
to fix the following linux/tls.h userspace compilation error:

/usr/include/linux/tls.h:41:21: fatal error: net/tcp.h: No such file or directory

As up to this point uapi/linux/tls.h was totally unusable for userspace,
clean up this header file further by moving other redundant includes
to net/tls.h.

Fixes: 3c4d7559159b ("tls: kernel TLS support")
Cc: <stable@vger.kernel.org> # v4.13+
Signed-off-by: Dmitry V. Levin <ldv@altlinux.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tls.h        | 4 ++++
 include/uapi/linux/tls.h | 4 ----
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/net/tls.h b/include/net/tls.h
index 70becd0a9299..936cfc5cab7d 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -35,6 +35,10 @@
 #define _TLS_OFFLOAD_H
 
 #include <linux/types.h>
+#include <asm/byteorder.h>
+#include <linux/socket.h>
+#include <linux/tcp.h>
+#include <net/tcp.h>
 
 #include <uapi/linux/tls.h>
 
diff --git a/include/uapi/linux/tls.h b/include/uapi/linux/tls.h
index d5e0682ab837..293b2cdad88d 100644
--- a/include/uapi/linux/tls.h
+++ b/include/uapi/linux/tls.h
@@ -35,10 +35,6 @@
 #define _UAPI_LINUX_TLS_H
 
 #include <linux/types.h>
-#include <asm/byteorder.h>
-#include <linux/socket.h>
-#include <linux/tcp.h>
-#include <net/tcp.h>
 
 /* TLS socket options */
 #define TLS_TX			1	/* Set transmit parameters */
-- 
cgit v1.2.3


From 6ab6a0dd228220bd00d745f8eb8ca497e522f4b6 Mon Sep 17 00:00:00 2001
From: Ahmed Abdelsalam <amsalam20@gmail.com>
Date: Wed, 15 Nov 2017 15:34:23 +0100
Subject: ipv6: sr: update the struct ipv6_sr_hdr

The IPv6 Segment Routing Header (SRH) format has been updated (revision 6
of the SRH ietf draft). The update includes the following SRH fields:

(1) The "First Segment" field changed to be "Last Entry" which contains
the index, in the Segment List, of the last element of the Segment List.

(2) The 16-bit "reserved" field is now used as a "tag" which tags a packet
as part of a class or group of packets, e.g., packets sharing the same
set of properties.

This patch updates the struct ipv6_sr_hdr so that it complies with the
updated SRH draft. The 16-bit "reserved" field is changed to be "tag". In
addition, a comment is added to the "first_segment" field, showing that it
represents the "Last Entry" field of the SRH.

Signed-off-by: Ahmed Abdelsalam <amsalam20@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/seg6.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/seg6.h b/include/uapi/linux/seg6.h
index 2f6fb0dd613c..286e8d6a8e98 100644
--- a/include/uapi/linux/seg6.h
+++ b/include/uapi/linux/seg6.h
@@ -26,9 +26,9 @@ struct ipv6_sr_hdr {
 	__u8	hdrlen;
 	__u8	type;
 	__u8	segments_left;
-	__u8	first_segment;
+	__u8	first_segment; /* Represents the last_entry field of SRH */
 	__u8	flags;
-	__u16	reserved;
+	__u16	tag;
 
 	struct in6_addr segments[0];
 };
-- 
cgit v1.2.3


From ded97d2c2b2c5f1dcced0bc57133f7753b037dfc Mon Sep 17 00:00:00 2001
From: Victor Chibotaru <tchibo@google.com>
Date: Fri, 17 Nov 2017 15:30:46 -0800
Subject: kcov: support comparison operands collection

Enables kcov to collect comparison operands from instrumented code.
This is done by using Clang's -fsanitize=trace-cmp instrumentation
(currently not available for GCC).

The comparison operands help a lot in fuzz testing. E.g. they are used
in Syzkaller to cover the interiors of conditional statements with far
fewer attempts and thus make previously unreachable code reachable.

To allow separate collection of coverage and comparison operands two
different work modes are implemented.  Mode selection is now done via a
KCOV_ENABLE ioctl call with corresponding argument value.

Link: http://lkml.kernel.org/r/20171011095459.70721-1-glider@google.com
Signed-off-by: Victor Chibotaru <tchibo@google.com>
Signed-off-by: Alexander Potapenko <glider@google.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Andrey Konovalov <andreyknvl@google.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Alexander Popov <alex.popov@linux.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Vegard Nossum <vegard.nossum@oracle.com>
Cc: Quentin Casasnovas <quentin.casasnovas@oracle.com>
Cc: <syzkaller@googlegroups.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kcov.h      |  12 ++-
 include/uapi/linux/kcov.h |  24 ++++++
 kernel/kcov.c             | 214 ++++++++++++++++++++++++++++++++++++++--------
 3 files changed, 211 insertions(+), 39 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/kcov.h b/include/linux/kcov.h
index f5d8ce4f4f86..3ecf6f5e3a5f 100644
--- a/include/linux/kcov.h
+++ b/include/linux/kcov.h
@@ -8,19 +8,23 @@ struct task_struct;
 
 #ifdef CONFIG_KCOV
 
-void kcov_task_init(struct task_struct *t);
-void kcov_task_exit(struct task_struct *t);
-
 enum kcov_mode {
 	/* Coverage collection is not enabled yet. */
 	KCOV_MODE_DISABLED = 0,
+	/* KCOV was initialized, but tracing mode hasn't been chosen yet. */
+	KCOV_MODE_INIT = 1,
 	/*
 	 * Tracing coverage collection mode.
 	 * Covered PCs are collected in a per-task buffer.
 	 */
-	KCOV_MODE_TRACE = 1,
+	KCOV_MODE_TRACE_PC = 2,
+	/* Collecting comparison operands mode. */
+	KCOV_MODE_TRACE_CMP = 3,
 };
 
+void kcov_task_init(struct task_struct *t);
+void kcov_task_exit(struct task_struct *t);
+
 #else
 
 static inline void kcov_task_init(struct task_struct *t) {}
diff --git a/include/uapi/linux/kcov.h b/include/uapi/linux/kcov.h
index 33eabbb8ada1..9529867717a8 100644
--- a/include/uapi/linux/kcov.h
+++ b/include/uapi/linux/kcov.h
@@ -8,4 +8,28 @@
 #define KCOV_ENABLE			_IO('c', 100)
 #define KCOV_DISABLE			_IO('c', 101)
 
+enum {
+	/*
+	 * Tracing coverage collection mode.
+	 * Covered PCs are collected in a per-task buffer.
+	 * In new KCOV version the mode is chosen by calling
+	 * ioctl(fd, KCOV_ENABLE, mode). In older versions the mode argument
+	 * was supposed to be 0 in such a call. So, for reasons of backward
+	 * compatibility, we have chosen the value KCOV_TRACE_PC to be 0.
+	 */
+	KCOV_TRACE_PC = 0,
+	/* Collecting comparison operands mode. */
+	KCOV_TRACE_CMP = 1,
+};
+
+/*
+ * The format for the types of collected comparisons.
+ *
+ * Bit 0 shows whether one of the arguments is a compile-time constant.
+ * Bits 1 & 2 contain log2 of the argument size, up to 8 bytes.
+ */
+#define KCOV_CMP_CONST          (1 << 0)
+#define KCOV_CMP_SIZE(n)        ((n) << 1)
+#define KCOV_CMP_MASK           KCOV_CMP_SIZE(3)
+
 #endif /* _LINUX_KCOV_IOCTLS_H */
diff --git a/kernel/kcov.c b/kernel/kcov.c
index d9f9fa9cacc6..15f33faf4013 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -22,13 +22,21 @@
 #include <linux/kcov.h>
 #include <asm/setup.h>
 
+/* Number of 64-bit words written per one comparison: */
+#define KCOV_WORDS_PER_CMP 4
+
 /*
  * kcov descriptor (one per opened debugfs file).
  * State transitions of the descriptor:
  *  - initial state after open()
  *  - then there must be a single ioctl(KCOV_INIT_TRACE) call
  *  - then, mmap() call (several calls are allowed but not useful)
- *  - then, repeated enable/disable for a task (only one task a time allowed)
+ *  - then, ioctl(KCOV_ENABLE, arg), where arg is
+ *	KCOV_TRACE_PC - to trace only the PCs
+ *	or
+ *	KCOV_TRACE_CMP - to trace only the comparison operands
+ *  - then, ioctl(KCOV_DISABLE) to disable the task.
+ * Enabling/disabling ioctls can be repeated (only one task a time allowed).
  */
 struct kcov {
 	/*
@@ -48,51 +56,176 @@ struct kcov {
 	struct task_struct	*t;
 };
 
-/*
- * Entry point from instrumented code.
- * This is called once per basic-block/edge.
- */
-void notrace __sanitizer_cov_trace_pc(void)
+static bool check_kcov_mode(enum kcov_mode needed_mode, struct task_struct *t)
 {
-	struct task_struct *t;
 	enum kcov_mode mode;
 
-	t = current;
 	/*
 	 * We are interested in code coverage as a function of a syscall inputs,
 	 * so we ignore code executed in interrupts.
 	 */
 	if (!in_task())
-		return;
+		return false;
 	mode = READ_ONCE(t->kcov_mode);
-	if (mode == KCOV_MODE_TRACE) {
-		unsigned long *area;
-		unsigned long pos;
-		unsigned long ip = _RET_IP_;
+	/*
+	 * There is some code that runs in interrupts but for which
+	 * in_interrupt() returns false (e.g. preempt_schedule_irq()).
+	 * READ_ONCE()/barrier() effectively provides load-acquire wrt
+	 * interrupts, there are paired barrier()/WRITE_ONCE() in
+	 * kcov_ioctl_locked().
+	 */
+	barrier();
+	return mode == needed_mode;
+}
 
+static unsigned long canonicalize_ip(unsigned long ip)
+{
 #ifdef CONFIG_RANDOMIZE_BASE
-		ip -= kaslr_offset();
+	ip -= kaslr_offset();
 #endif
+	return ip;
+}
 
-		/*
-		 * There is some code that runs in interrupts but for which
-		 * in_interrupt() returns false (e.g. preempt_schedule_irq()).
-		 * READ_ONCE()/barrier() effectively provides load-acquire wrt
-		 * interrupts, there are paired barrier()/WRITE_ONCE() in
-		 * kcov_ioctl_locked().
-		 */
-		barrier();
-		area = t->kcov_area;
-		/* The first word is number of subsequent PCs. */
-		pos = READ_ONCE(area[0]) + 1;
-		if (likely(pos < t->kcov_size)) {
-			area[pos] = ip;
-			WRITE_ONCE(area[0], pos);
-		}
+/*
+ * Entry point from instrumented code.
+ * This is called once per basic-block/edge.
+ */
+void notrace __sanitizer_cov_trace_pc(void)
+{
+	struct task_struct *t;
+	unsigned long *area;
+	unsigned long ip = canonicalize_ip(_RET_IP_);
+	unsigned long pos;
+
+	t = current;
+	if (!check_kcov_mode(KCOV_MODE_TRACE_PC, t))
+		return;
+
+	area = t->kcov_area;
+	/* The first 64-bit word is the number of subsequent PCs. */
+	pos = READ_ONCE(area[0]) + 1;
+	if (likely(pos < t->kcov_size)) {
+		area[pos] = ip;
+		WRITE_ONCE(area[0], pos);
 	}
 }
 EXPORT_SYMBOL(__sanitizer_cov_trace_pc);
 
+#ifdef CONFIG_KCOV_ENABLE_COMPARISONS
+static void write_comp_data(u64 type, u64 arg1, u64 arg2, u64 ip)
+{
+	struct task_struct *t;
+	u64 *area;
+	u64 count, start_index, end_pos, max_pos;
+
+	t = current;
+	if (!check_kcov_mode(KCOV_MODE_TRACE_CMP, t))
+		return;
+
+	ip = canonicalize_ip(ip);
+
+	/*
+	 * We write all comparison arguments and types as u64.
+	 * The buffer was allocated for t->kcov_size unsigned longs.
+	 */
+	area = (u64 *)t->kcov_area;
+	max_pos = t->kcov_size * sizeof(unsigned long);
+
+	count = READ_ONCE(area[0]);
+
+	/* Every record is KCOV_WORDS_PER_CMP 64-bit words. */
+	start_index = 1 + count * KCOV_WORDS_PER_CMP;
+	end_pos = (start_index + KCOV_WORDS_PER_CMP) * sizeof(u64);
+	if (likely(end_pos <= max_pos)) {
+		area[start_index] = type;
+		area[start_index + 1] = arg1;
+		area[start_index + 2] = arg2;
+		area[start_index + 3] = ip;
+		WRITE_ONCE(area[0], count + 1);
+	}
+}
+
+void notrace __sanitizer_cov_trace_cmp1(u8 arg1, u8 arg2)
+{
+	write_comp_data(KCOV_CMP_SIZE(0), arg1, arg2, _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_cmp1);
+
+void notrace __sanitizer_cov_trace_cmp2(u16 arg1, u16 arg2)
+{
+	write_comp_data(KCOV_CMP_SIZE(1), arg1, arg2, _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_cmp2);
+
+void notrace __sanitizer_cov_trace_cmp4(u16 arg1, u16 arg2)
+{
+	write_comp_data(KCOV_CMP_SIZE(2), arg1, arg2, _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_cmp4);
+
+void notrace __sanitizer_cov_trace_cmp8(u64 arg1, u64 arg2)
+{
+	write_comp_data(KCOV_CMP_SIZE(3), arg1, arg2, _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_cmp8);
+
+void notrace __sanitizer_cov_trace_const_cmp1(u8 arg1, u8 arg2)
+{
+	write_comp_data(KCOV_CMP_SIZE(0) | KCOV_CMP_CONST, arg1, arg2,
+			_RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp1);
+
+void notrace __sanitizer_cov_trace_const_cmp2(u16 arg1, u16 arg2)
+{
+	write_comp_data(KCOV_CMP_SIZE(1) | KCOV_CMP_CONST, arg1, arg2,
+			_RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp2);
+
+void notrace __sanitizer_cov_trace_const_cmp4(u16 arg1, u16 arg2)
+{
+	write_comp_data(KCOV_CMP_SIZE(2) | KCOV_CMP_CONST, arg1, arg2,
+			_RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp4);
+
+void notrace __sanitizer_cov_trace_const_cmp8(u64 arg1, u64 arg2)
+{
+	write_comp_data(KCOV_CMP_SIZE(3) | KCOV_CMP_CONST, arg1, arg2,
+			_RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp8);
+
+void notrace __sanitizer_cov_trace_switch(u64 val, u64 *cases)
+{
+	u64 i;
+	u64 count = cases[0];
+	u64 size = cases[1];
+	u64 type = KCOV_CMP_CONST;
+
+	switch (size) {
+	case 8:
+		type |= KCOV_CMP_SIZE(0);
+		break;
+	case 16:
+		type |= KCOV_CMP_SIZE(1);
+		break;
+	case 32:
+		type |= KCOV_CMP_SIZE(2);
+		break;
+	case 64:
+		type |= KCOV_CMP_SIZE(3);
+		break;
+	default:
+		return;
+	}
+	for (i = 0; i < count; i++)
+		write_comp_data(type, cases[i + 2], val, _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_switch);
+#endif /* ifdef CONFIG_KCOV_ENABLE_COMPARISONS */
+
 static void kcov_get(struct kcov *kcov)
 {
 	atomic_inc(&kcov->refcount);
@@ -129,6 +262,7 @@ void kcov_task_exit(struct task_struct *t)
 	/* Just to not leave dangling references behind. */
 	kcov_task_init(t);
 	kcov->t = NULL;
+	kcov->mode = KCOV_MODE_INIT;
 	spin_unlock(&kcov->lock);
 	kcov_put(kcov);
 }
@@ -147,7 +281,7 @@ static int kcov_mmap(struct file *filep, struct vm_area_struct *vma)
 
 	spin_lock(&kcov->lock);
 	size = kcov->size * sizeof(unsigned long);
-	if (kcov->mode == KCOV_MODE_DISABLED || vma->vm_pgoff != 0 ||
+	if (kcov->mode != KCOV_MODE_INIT || vma->vm_pgoff != 0 ||
 	    vma->vm_end - vma->vm_start != size) {
 		res = -EINVAL;
 		goto exit;
@@ -176,6 +310,7 @@ static int kcov_open(struct inode *inode, struct file *filep)
 	kcov = kzalloc(sizeof(*kcov), GFP_KERNEL);
 	if (!kcov)
 		return -ENOMEM;
+	kcov->mode = KCOV_MODE_DISABLED;
 	atomic_set(&kcov->refcount, 1);
 	spin_lock_init(&kcov->lock);
 	filep->private_data = kcov;
@@ -211,7 +346,7 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd,
 		if (size < 2 || size > INT_MAX / sizeof(unsigned long))
 			return -EINVAL;
 		kcov->size = size;
-		kcov->mode = KCOV_MODE_TRACE;
+		kcov->mode = KCOV_MODE_INIT;
 		return 0;
 	case KCOV_ENABLE:
 		/*
@@ -221,17 +356,25 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd,
 		 * at task exit or voluntary by KCOV_DISABLE. After that it can
 		 * be enabled for another task.
 		 */
-		unused = arg;
-		if (unused != 0 || kcov->mode == KCOV_MODE_DISABLED ||
-		    kcov->area == NULL)
+		if (kcov->mode != KCOV_MODE_INIT || !kcov->area)
 			return -EINVAL;
 		if (kcov->t != NULL)
 			return -EBUSY;
+		if (arg == KCOV_TRACE_PC)
+			kcov->mode = KCOV_MODE_TRACE_PC;
+		else if (arg == KCOV_TRACE_CMP)
+#ifdef CONFIG_KCOV_ENABLE_COMPARISONS
+			kcov->mode = KCOV_MODE_TRACE_CMP;
+#else
+		return -ENOTSUPP;
+#endif
+		else
+			return -EINVAL;
 		t = current;
 		/* Cache in task struct for performance. */
 		t->kcov_size = kcov->size;
 		t->kcov_area = kcov->area;
-		/* See comment in __sanitizer_cov_trace_pc(). */
+		/* See comment in check_kcov_mode(). */
 		barrier();
 		WRITE_ONCE(t->kcov_mode, kcov->mode);
 		t->kcov = kcov;
@@ -249,6 +392,7 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd,
 			return -EINVAL;
 		kcov_task_init(t);
 		kcov->t = NULL;
+		kcov->mode = KCOV_MODE_INIT;
 		kcov_put(kcov);
 		return 0;
 	default:
-- 
cgit v1.2.3


From 1f6f4cb7ba219b00a3fa9afe8049fa16444d8b52 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 20 Nov 2017 15:21:53 -0800
Subject: bpf: offload: rename the ifindex field

bpf_target_prog seems long and clunky, rename it to prog_ifindex.
We don't want to call this field just ifindex, because maps
may need a similar field in the future and bpf_attr members for
programs and maps are unnamed.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h       | 2 +-
 kernel/bpf/offload.c           | 2 +-
 kernel/bpf/syscall.c           | 4 ++--
 tools/include/uapi/linux/bpf.h | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index e880ae6434ee..3f626df42516 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -262,7 +262,7 @@ union bpf_attr {
 		__u32		kern_version;	/* checked when prog_type=kprobe */
 		__u32		prog_flags;
 		char		prog_name[BPF_OBJ_NAME_LEN];
-		__u32		prog_target_ifindex;	/* ifindex of netdev to prep for */
+		__u32		prog_ifindex;	/* ifindex of netdev to prep for */
 	};
 
 	struct { /* anonymous struct used by BPF_OBJ_* commands */
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index ac187f9ee182..a778e5df7e26 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -29,7 +29,7 @@ int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr)
 	init_waitqueue_head(&offload->verifier_done);
 
 	rtnl_lock();
-	offload->netdev = __dev_get_by_index(net, attr->prog_target_ifindex);
+	offload->netdev = __dev_get_by_index(net, attr->prog_ifindex);
 	if (!offload->netdev) {
 		rtnl_unlock();
 		kfree(offload);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 09badc37e864..8e9d065bb7cd 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1118,7 +1118,7 @@ struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type,
 EXPORT_SYMBOL_GPL(bpf_prog_get_type_dev);
 
 /* last field in 'union bpf_attr' used by this command */
-#define	BPF_PROG_LOAD_LAST_FIELD prog_target_ifindex
+#define	BPF_PROG_LOAD_LAST_FIELD prog_ifindex
 
 static int bpf_prog_load(union bpf_attr *attr)
 {
@@ -1181,7 +1181,7 @@ static int bpf_prog_load(union bpf_attr *attr)
 	atomic_set(&prog->aux->refcnt, 1);
 	prog->gpl_compatible = is_gpl ? 1 : 0;
 
-	if (attr->prog_target_ifindex) {
+	if (attr->prog_ifindex) {
 		err = bpf_prog_offload_init(prog, attr);
 		if (err)
 			goto free_prog;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index e880ae6434ee..3f626df42516 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -262,7 +262,7 @@ union bpf_attr {
 		__u32		kern_version;	/* checked when prog_type=kprobe */
 		__u32		prog_flags;
 		char		prog_name[BPF_OBJ_NAME_LEN];
-		__u32		prog_target_ifindex;	/* ifindex of netdev to prep for */
+		__u32		prog_ifindex;	/* ifindex of netdev to prep for */
 	};
 
 	struct { /* anonymous struct used by BPF_OBJ_* commands */
-- 
cgit v1.2.3


From 1ee640095f049e7ac4ec36b985abada497b98cc2 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 20 Nov 2017 15:21:59 -0800
Subject: bpf: revert report offload info to user space

This reverts commit bd601b6ada11 ("bpf: report offload info to user
space").  The ifindex by itself is not sufficient; we should also provide
information on which network namespace this ifindex belongs to.
After considering some options we concluded that it's best to just
remove this API for now, and rework it in -next.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf.h      |  1 -
 include/uapi/linux/bpf.h |  6 ------
 kernel/bpf/offload.c     | 12 ------------
 kernel/bpf/syscall.c     |  5 -----
 4 files changed, 24 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 37bbab8c0f56..76c577281d78 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -515,7 +515,6 @@ static inline struct bpf_prog *bpf_prog_get_type(u32 ufd,
 
 int bpf_prog_offload_compile(struct bpf_prog *prog);
 void bpf_prog_offload_destroy(struct bpf_prog *prog);
-u32 bpf_prog_offload_ifindex(struct bpf_prog *prog);
 
 #if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL)
 int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 3f626df42516..4c223ab30293 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -897,10 +897,6 @@ enum sk_action {
 
 #define BPF_TAG_SIZE	8
 
-enum bpf_prog_status {
-	BPF_PROG_STATUS_DEV_BOUND	= (1 << 0),
-};
-
 struct bpf_prog_info {
 	__u32 type;
 	__u32 id;
@@ -914,8 +910,6 @@ struct bpf_prog_info {
 	__u32 nr_map_ids;
 	__aligned_u64 map_ids;
 	char name[BPF_OBJ_NAME_LEN];
-	__u32 ifindex;
-	__u32 status;
 } __attribute__((aligned(8)));
 
 struct bpf_map_info {
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index d4267c674fec..68ec884440b7 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -149,18 +149,6 @@ int bpf_prog_offload_compile(struct bpf_prog *prog)
 	return bpf_prog_offload_translate(prog);
 }
 
-u32 bpf_prog_offload_ifindex(struct bpf_prog *prog)
-{
-	struct bpf_dev_offload *offload = prog->aux->offload;
-	u32 ifindex;
-
-	rtnl_lock();
-	ifindex = offload->netdev ? offload->netdev->ifindex : 0;
-	rtnl_unlock();
-
-	return ifindex;
-}
-
 const struct bpf_prog_ops bpf_offload_prog_ops = {
 };
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 41509cf825d8..2c4cfeaa8d5e 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1616,11 +1616,6 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 			return -EFAULT;
 	}
 
-	if (bpf_prog_is_dev_bound(prog->aux)) {
-		info.status |= BPF_PROG_STATUS_DEV_BOUND;
-		info.ifindex = bpf_prog_offload_ifindex(prog);
-	}
-
 done:
 	if (copy_to_user(uinfo, &info, info_len) ||
 	    put_user(info_len, &uattr->info.info_len))
-- 
cgit v1.2.3


From a158bdd3247b9656df36ba133235fff702e9fdc3 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 24 Nov 2017 10:18:41 +0000
Subject: rxrpc: Fix call timeouts

Fix the rxrpc call expiration timeouts and make them settable from
userspace.  By analogy with other rx implementations, there should be three
timeouts:

 (1) "Normal timeout"

     This is set for all calls and is triggered if we haven't received any
     packets from the peer in a while.  It is measured from the last time
     we received any packet on that call.  This is not reset by any
     connection packets (such as CHALLENGE/RESPONSE packets).

     If a service operation takes a long time, the server should generate
     PING ACKs at a duration that's substantially less than the normal
     timeout so as to keep both sides alive.  This is set at 1/6 of normal
     timeout.

 (2) "Idle timeout"

     This is set only for a service call and is triggered if we stop
     receiving the DATA packets that comprise the request data.  It is
     measured from the last time we received a DATA packet.

 (3) "Hard timeout"

     This can be set for a call and specifies the maximum lifetime of that
     call.  It should not be specified by default.  Some operations (such
     as volume transfer) take a long time.

Allow userspace to set/change the timeouts on a call with sendmsg, using a
control message:

	RXRPC_SET_CALL_TIMEOUT

The data to the message is a number of 32-bit words, not all of which need
be given:

	u32 hard_timeout;	/* sec from first packet */
	u32 idle_timeout;	/* msec from data Rx */
	u32 normal_timeout;	/* msec from packet Rx */

This can be set in combination with any other sendmsg() that affects a
call.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 include/trace/events/rxrpc.h |  69 +++++++++++------
 include/uapi/linux/rxrpc.h   |   1 +
 net/rxrpc/ar-internal.h      |  37 +++++----
 net/rxrpc/call_event.c       | 179 ++++++++++++++++++++-----------------------
 net/rxrpc/call_object.c      |  27 ++++---
 net/rxrpc/conn_client.c      |   4 +-
 net/rxrpc/input.c            |  34 +++++++-
 net/rxrpc/misc.c             |  19 ++---
 net/rxrpc/recvmsg.c          |   2 +-
 net/rxrpc/sendmsg.c          |  59 +++++++++++---
 net/rxrpc/sysctl.c           |  60 +++++++--------
 11 files changed, 290 insertions(+), 201 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h
index ebe96796027a..01dcbc2164b5 100644
--- a/include/trace/events/rxrpc.h
+++ b/include/trace/events/rxrpc.h
@@ -138,10 +138,20 @@ enum rxrpc_rtt_rx_trace {
 
 enum rxrpc_timer_trace {
 	rxrpc_timer_begin,
+	rxrpc_timer_exp_ack,
+	rxrpc_timer_exp_hard,
+	rxrpc_timer_exp_idle,
+	rxrpc_timer_exp_normal,
+	rxrpc_timer_exp_ping,
+	rxrpc_timer_exp_resend,
 	rxrpc_timer_expired,
 	rxrpc_timer_init_for_reply,
 	rxrpc_timer_init_for_send_reply,
+	rxrpc_timer_restart,
 	rxrpc_timer_set_for_ack,
+	rxrpc_timer_set_for_hard,
+	rxrpc_timer_set_for_idle,
+	rxrpc_timer_set_for_normal,
 	rxrpc_timer_set_for_ping,
 	rxrpc_timer_set_for_resend,
 	rxrpc_timer_set_for_send,
@@ -296,12 +306,22 @@ enum rxrpc_congest_change {
 #define rxrpc_timer_traces \
 	EM(rxrpc_timer_begin,			"Begin ") \
 	EM(rxrpc_timer_expired,			"*EXPR*") \
+	EM(rxrpc_timer_exp_ack,			"ExpAck") \
+	EM(rxrpc_timer_exp_hard,		"ExpHrd") \
+	EM(rxrpc_timer_exp_idle,		"ExpIdl") \
+	EM(rxrpc_timer_exp_normal,		"ExpNml") \
+	EM(rxrpc_timer_exp_ping,		"ExpPng") \
+	EM(rxrpc_timer_exp_resend,		"ExpRsn") \
 	EM(rxrpc_timer_init_for_reply,		"IniRpl") \
 	EM(rxrpc_timer_init_for_send_reply,	"SndRpl") \
+	EM(rxrpc_timer_restart,			"Restrt") \
 	EM(rxrpc_timer_set_for_ack,		"SetAck") \
+	EM(rxrpc_timer_set_for_hard,		"SetHrd") \
+	EM(rxrpc_timer_set_for_idle,		"SetIdl") \
+	EM(rxrpc_timer_set_for_normal,		"SetNml") \
 	EM(rxrpc_timer_set_for_ping,		"SetPng") \
 	EM(rxrpc_timer_set_for_resend,		"SetRTx") \
-	E_(rxrpc_timer_set_for_send,		"SetTx ")
+	E_(rxrpc_timer_set_for_send,		"SetSnd")
 
 #define rxrpc_propose_ack_traces \
 	EM(rxrpc_propose_ack_client_tx_end,	"ClTxEnd") \
@@ -932,39 +952,44 @@ TRACE_EVENT(rxrpc_rtt_rx,
 
 TRACE_EVENT(rxrpc_timer,
 	    TP_PROTO(struct rxrpc_call *call, enum rxrpc_timer_trace why,
-		     ktime_t now, unsigned long now_j),
+		     unsigned long now),
 
-	    TP_ARGS(call, why, now, now_j),
+	    TP_ARGS(call, why, now),
 
 	    TP_STRUCT__entry(
 		    __field(struct rxrpc_call *,		call		)
 		    __field(enum rxrpc_timer_trace,		why		)
-		    __field_struct(ktime_t,			now		)
-		    __field_struct(ktime_t,			expire_at	)
-		    __field_struct(ktime_t,			ack_at		)
-		    __field_struct(ktime_t,			resend_at	)
-		    __field(unsigned long,			now_j		)
-		    __field(unsigned long,			timer		)
+		    __field(long,				now		)
+		    __field(long,				ack_at		)
+		    __field(long,				resend_at	)
+		    __field(long,				ping_at		)
+		    __field(long,				expect_rx_by	)
+		    __field(long,				expect_req_by	)
+		    __field(long,				expect_term_by	)
+		    __field(long,				timer		)
 			     ),
 
 	    TP_fast_assign(
-		    __entry->call	= call;
-		    __entry->why	= why;
-		    __entry->now	= now;
-		    __entry->expire_at	= call->expire_at;
-		    __entry->ack_at	= call->ack_at;
-		    __entry->resend_at	= call->resend_at;
-		    __entry->now_j	= now_j;
-		    __entry->timer	= call->timer.expires;
+		    __entry->call		= call;
+		    __entry->why		= why;
+		    __entry->now		= now;
+		    __entry->ack_at		= call->ack_at;
+		    __entry->resend_at		= call->resend_at;
+		    __entry->expect_rx_by	= call->expect_rx_by;
+		    __entry->expect_req_by	= call->expect_req_by;
+		    __entry->expect_term_by	= call->expect_term_by;
+		    __entry->timer		= call->timer.expires;
 			   ),
 
-	    TP_printk("c=%p %s x=%lld a=%lld r=%lld t=%ld",
+	    TP_printk("c=%p %s a=%ld r=%ld xr=%ld xq=%ld xt=%ld t=%ld",
 		      __entry->call,
 		      __print_symbolic(__entry->why, rxrpc_timer_traces),
-		      ktime_to_ns(ktime_sub(__entry->expire_at, __entry->now)),
-		      ktime_to_ns(ktime_sub(__entry->ack_at, __entry->now)),
-		      ktime_to_ns(ktime_sub(__entry->resend_at, __entry->now)),
-		      __entry->timer - __entry->now_j)
+		      __entry->ack_at - __entry->now,
+		      __entry->resend_at - __entry->now,
+		      __entry->expect_rx_by - __entry->now,
+		      __entry->expect_req_by - __entry->now,
+		      __entry->expect_term_by - __entry->now,
+		      __entry->timer - __entry->now)
 	    );
 
 TRACE_EVENT(rxrpc_rx_lose,
diff --git a/include/uapi/linux/rxrpc.h b/include/uapi/linux/rxrpc.h
index 9d4afea308a4..9335d92c14a4 100644
--- a/include/uapi/linux/rxrpc.h
+++ b/include/uapi/linux/rxrpc.h
@@ -59,6 +59,7 @@ enum rxrpc_cmsg_type {
 	RXRPC_EXCLUSIVE_CALL	= 10,	/* s-: Call should be on exclusive connection */
 	RXRPC_UPGRADE_SERVICE	= 11,	/* s-: Request service upgrade for client call */
 	RXRPC_TX_LENGTH		= 12,	/* s-: Total length of Tx data */
+	RXRPC_SET_CALL_TIMEOUT	= 13,	/* s-: Set one or more call timeouts */
 	RXRPC__SUPPORTED
 };
 
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index ba63f2231107..548411371048 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -468,9 +468,9 @@ enum rxrpc_call_flag {
 enum rxrpc_call_event {
 	RXRPC_CALL_EV_ACK,		/* need to generate ACK */
 	RXRPC_CALL_EV_ABORT,		/* need to generate abort */
-	RXRPC_CALL_EV_TIMER,		/* Timer expired */
 	RXRPC_CALL_EV_RESEND,		/* Tx resend required */
 	RXRPC_CALL_EV_PING,		/* Ping send required */
+	RXRPC_CALL_EV_EXPIRED,		/* Expiry occurred */
 };
 
 /*
@@ -514,10 +514,14 @@ struct rxrpc_call {
 	struct rxrpc_peer	*peer;		/* Peer record for remote address */
 	struct rxrpc_sock __rcu	*socket;	/* socket responsible */
 	struct mutex		user_mutex;	/* User access mutex */
-	ktime_t			ack_at;		/* When deferred ACK needs to happen */
-	ktime_t			resend_at;	/* When next resend needs to happen */
-	ktime_t			ping_at;	/* When next to send a ping */
-	ktime_t			expire_at;	/* When the call times out */
+	unsigned long		ack_at;		/* When deferred ACK needs to happen */
+	unsigned long		resend_at;	/* When next resend needs to happen */
+	unsigned long		ping_at;	/* When next to send a ping */
+	unsigned long		expect_rx_by;	/* When we expect to get a packet by */
+	unsigned long		expect_req_by;	/* When we expect to get a request DATA packet by */
+	unsigned long		expect_term_by;	/* When we expect call termination by */
+	u32			next_rx_timo;	/* Timeout for next Rx packet (jif) */
+	u32			next_req_timo;	/* Timeout for next Rx request packet (jif) */
 	struct timer_list	timer;		/* Combined event timer */
 	struct work_struct	processor;	/* Event processor */
 	rxrpc_notify_rx_t	notify_rx;	/* kernel service Rx notification function */
@@ -697,12 +701,19 @@ int rxrpc_reject_call(struct rxrpc_sock *);
 /*
  * call_event.c
  */
-void __rxrpc_set_timer(struct rxrpc_call *, enum rxrpc_timer_trace, ktime_t);
-void rxrpc_set_timer(struct rxrpc_call *, enum rxrpc_timer_trace, ktime_t);
 void rxrpc_propose_ACK(struct rxrpc_call *, u8, u16, u32, bool, bool,
 		       enum rxrpc_propose_ack_trace);
 void rxrpc_process_call(struct work_struct *);
 
+static inline void rxrpc_reduce_call_timer(struct rxrpc_call *call,
+					   unsigned long expire_at,
+					   unsigned long now,
+					   enum rxrpc_timer_trace why)
+{
+	trace_rxrpc_timer(call, why, now);
+	timer_reduce(&call->timer, expire_at);
+}
+
 /*
  * call_object.c
  */
@@ -843,8 +854,8 @@ static inline bool __rxrpc_abort_eproto(struct rxrpc_call *call,
  */
 extern unsigned int rxrpc_max_client_connections;
 extern unsigned int rxrpc_reap_client_connections;
-extern unsigned int rxrpc_conn_idle_client_expiry;
-extern unsigned int rxrpc_conn_idle_client_fast_expiry;
+extern unsigned long rxrpc_conn_idle_client_expiry;
+extern unsigned long rxrpc_conn_idle_client_fast_expiry;
 extern struct idr rxrpc_client_conn_ids;
 
 void rxrpc_destroy_client_conn_ids(void);
@@ -976,13 +987,13 @@ static inline void rxrpc_queue_local(struct rxrpc_local *local)
  * misc.c
  */
 extern unsigned int rxrpc_max_backlog __read_mostly;
-extern unsigned int rxrpc_requested_ack_delay;
-extern unsigned int rxrpc_soft_ack_delay;
-extern unsigned int rxrpc_idle_ack_delay;
+extern unsigned long rxrpc_requested_ack_delay;
+extern unsigned long rxrpc_soft_ack_delay;
+extern unsigned long rxrpc_idle_ack_delay;
 extern unsigned int rxrpc_rx_window_size;
 extern unsigned int rxrpc_rx_mtu;
 extern unsigned int rxrpc_rx_jumbo_max;
-extern unsigned int rxrpc_resend_timeout;
+extern unsigned long rxrpc_resend_timeout;
 
 extern const s8 rxrpc_ack_priority[];
 
diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c
index 3574508baf9a..c14395d5ad8c 100644
--- a/net/rxrpc/call_event.c
+++ b/net/rxrpc/call_event.c
@@ -21,80 +21,6 @@
 #include <net/af_rxrpc.h>
 #include "ar-internal.h"
 
-/*
- * Set the timer
- */
-void __rxrpc_set_timer(struct rxrpc_call *call, enum rxrpc_timer_trace why,
-		       ktime_t now)
-{
-	unsigned long t_j, now_j = jiffies;
-	ktime_t t;
-	bool queue = false;
-
-	if (call->state < RXRPC_CALL_COMPLETE) {
-		t = call->expire_at;
-		if (!ktime_after(t, now)) {
-			trace_rxrpc_timer(call, why, now, now_j);
-			queue = true;
-			goto out;
-		}
-
-		if (!ktime_after(call->resend_at, now)) {
-			call->resend_at = call->expire_at;
-			if (!test_and_set_bit(RXRPC_CALL_EV_RESEND, &call->events))
-				queue = true;
-		} else if (ktime_before(call->resend_at, t)) {
-			t = call->resend_at;
-		}
-
-		if (!ktime_after(call->ack_at, now)) {
-			call->ack_at = call->expire_at;
-			if (!test_and_set_bit(RXRPC_CALL_EV_ACK, &call->events))
-				queue = true;
-		} else if (ktime_before(call->ack_at, t)) {
-			t = call->ack_at;
-		}
-
-		if (!ktime_after(call->ping_at, now)) {
-			call->ping_at = call->expire_at;
-			if (!test_and_set_bit(RXRPC_CALL_EV_PING, &call->events))
-				queue = true;
-		} else if (ktime_before(call->ping_at, t)) {
-			t = call->ping_at;
-		}
-
-		t_j = nsecs_to_jiffies(ktime_to_ns(ktime_sub(t, now)));
-		t_j += jiffies;
-
-		/* We have to make sure that the calculated jiffies value falls
-		 * at or after the nsec value, or we may loop ceaselessly
-		 * because the timer times out, but we haven't reached the nsec
-		 * timeout yet.
-		 */
-		t_j++;
-
-		if (call->timer.expires != t_j || !timer_pending(&call->timer)) {
-			mod_timer(&call->timer, t_j);
-			trace_rxrpc_timer(call, why, now, now_j);
-		}
-	}
-
-out:
-	if (queue)
-		rxrpc_queue_call(call);
-}
-
-/*
- * Set the timer
- */
-void rxrpc_set_timer(struct rxrpc_call *call, enum rxrpc_timer_trace why,
-		     ktime_t now)
-{
-	read_lock_bh(&call->state_lock);
-	__rxrpc_set_timer(call, why, now);
-	read_unlock_bh(&call->state_lock);
-}
-
 /*
  * Propose a PING ACK be sent.
  */
@@ -106,12 +32,13 @@ static void rxrpc_propose_ping(struct rxrpc_call *call,
 		    !test_and_set_bit(RXRPC_CALL_EV_PING, &call->events))
 			rxrpc_queue_call(call);
 	} else {
-		ktime_t now = ktime_get_real();
-		ktime_t ping_at = ktime_add_ms(now, rxrpc_idle_ack_delay);
+		unsigned long now = jiffies;
+		unsigned long ping_at = now + rxrpc_idle_ack_delay;
 
-		if (ktime_before(ping_at, call->ping_at)) {
-			call->ping_at = ping_at;
-			rxrpc_set_timer(call, rxrpc_timer_set_for_ping, now);
+		if (time_before(ping_at, call->ping_at)) {
+			WRITE_ONCE(call->ping_at, ping_at);
+			rxrpc_reduce_call_timer(call, ping_at, now,
+						rxrpc_timer_set_for_ping);
 		}
 	}
 }
@@ -125,8 +52,7 @@ static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason,
 				enum rxrpc_propose_ack_trace why)
 {
 	enum rxrpc_propose_ack_outcome outcome = rxrpc_propose_ack_use;
-	unsigned int expiry = rxrpc_soft_ack_delay;
-	ktime_t now, ack_at;
+	unsigned long now, ack_at, expiry = rxrpc_soft_ack_delay;
 	s8 prior = rxrpc_ack_priority[ack_reason];
 
 	/* Pings are handled specially because we don't want to accidentally
@@ -190,11 +116,12 @@ static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason,
 		    background)
 			rxrpc_queue_call(call);
 	} else {
-		now = ktime_get_real();
-		ack_at = ktime_add_ms(now, expiry);
-		if (ktime_before(ack_at, call->ack_at)) {
-			call->ack_at = ack_at;
-			rxrpc_set_timer(call, rxrpc_timer_set_for_ack, now);
+		now = jiffies;
+		ack_at = jiffies + expiry;
+		if (time_before(ack_at, call->ack_at)) {
+			WRITE_ONCE(call->ack_at, ack_at);
+			rxrpc_reduce_call_timer(call, ack_at, now,
+						rxrpc_timer_set_for_ack);
 		}
 	}
 
@@ -227,18 +154,20 @@ static void rxrpc_congestion_timeout(struct rxrpc_call *call)
 /*
  * Perform retransmission of NAK'd and unack'd packets.
  */
-static void rxrpc_resend(struct rxrpc_call *call, ktime_t now)
+static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j)
 {
 	struct rxrpc_skb_priv *sp;
 	struct sk_buff *skb;
+	unsigned long resend_at;
 	rxrpc_seq_t cursor, seq, top;
-	ktime_t max_age, oldest, ack_ts;
+	ktime_t now, max_age, oldest, ack_ts;
 	int ix;
 	u8 annotation, anno_type, retrans = 0, unacked = 0;
 
 	_enter("{%d,%d}", call->tx_hard_ack, call->tx_top);
 
-	max_age = ktime_sub_ms(now, rxrpc_resend_timeout);
+	now = ktime_get_real();
+	max_age = ktime_sub_ms(now, rxrpc_resend_timeout * 1000 / HZ);
 
 	spin_lock_bh(&call->lock);
 
@@ -282,7 +211,9 @@ static void rxrpc_resend(struct rxrpc_call *call, ktime_t now)
 				       ktime_to_ns(ktime_sub(skb->tstamp, max_age)));
 	}
 
-	call->resend_at = ktime_add_ms(oldest, rxrpc_resend_timeout);
+	resend_at = nsecs_to_jiffies(ktime_to_ns(ktime_sub(oldest, now)));
+	resend_at += jiffies + rxrpc_resend_timeout;
+	WRITE_ONCE(call->resend_at, resend_at);
 
 	if (unacked)
 		rxrpc_congestion_timeout(call);
@@ -292,7 +223,8 @@ static void rxrpc_resend(struct rxrpc_call *call, ktime_t now)
 	 * retransmitting data.
 	 */
 	if (!retrans) {
-		rxrpc_set_timer(call, rxrpc_timer_set_for_resend, now);
+		rxrpc_reduce_call_timer(call, resend_at, now,
+					rxrpc_timer_set_for_resend);
 		spin_unlock_bh(&call->lock);
 		ack_ts = ktime_sub(now, call->acks_latest_ts);
 		if (ktime_to_ns(ack_ts) < call->peer->rtt)
@@ -364,7 +296,7 @@ void rxrpc_process_call(struct work_struct *work)
 {
 	struct rxrpc_call *call =
 		container_of(work, struct rxrpc_call, processor);
-	ktime_t now;
+	unsigned long now, next, t;
 
 	rxrpc_see_call(call);
 
@@ -384,8 +316,50 @@ recheck_state:
 		goto out_put;
 	}
 
-	now = ktime_get_real();
-	if (ktime_before(call->expire_at, now)) {
+	/* Work out if any timeouts tripped */
+	now = jiffies;
+	t = READ_ONCE(call->expect_rx_by);
+	if (time_after_eq(now, t)) {
+		trace_rxrpc_timer(call, rxrpc_timer_exp_normal, now);
+		set_bit(RXRPC_CALL_EV_EXPIRED, &call->events);
+	}
+
+	t = READ_ONCE(call->expect_req_by);
+	if (call->state == RXRPC_CALL_SERVER_RECV_REQUEST &&
+	    time_after_eq(now, t)) {
+		trace_rxrpc_timer(call, rxrpc_timer_exp_idle, now);
+		set_bit(RXRPC_CALL_EV_EXPIRED, &call->events);
+	}
+
+	t = READ_ONCE(call->expect_term_by);
+	if (time_after_eq(now, t)) {
+		trace_rxrpc_timer(call, rxrpc_timer_exp_hard, now);
+		set_bit(RXRPC_CALL_EV_EXPIRED, &call->events);
+	}
+
+	t = READ_ONCE(call->ack_at);
+	if (time_after_eq(now, t)) {
+		trace_rxrpc_timer(call, rxrpc_timer_exp_ack, now);
+		cmpxchg(&call->ack_at, t, now + MAX_JIFFY_OFFSET);
+		set_bit(RXRPC_CALL_EV_ACK, &call->events);
+	}
+
+	t = READ_ONCE(call->ping_at);
+	if (time_after_eq(now, t)) {
+		trace_rxrpc_timer(call, rxrpc_timer_exp_ping, now);
+		cmpxchg(&call->ping_at, t, now + MAX_JIFFY_OFFSET);
+		set_bit(RXRPC_CALL_EV_PING, &call->events);
+	}
+
+	t = READ_ONCE(call->resend_at);
+	if (time_after_eq(now, t)) {
+		trace_rxrpc_timer(call, rxrpc_timer_exp_resend, now);
+		cmpxchg(&call->resend_at, t, now + MAX_JIFFY_OFFSET);
+		set_bit(RXRPC_CALL_EV_RESEND, &call->events);
+	}
+
+	/* Process events */
+	if (test_and_clear_bit(RXRPC_CALL_EV_EXPIRED, &call->events)) {
 		rxrpc_abort_call("EXP", call, 0, RX_USER_ABORT, -ETIME);
 		set_bit(RXRPC_CALL_EV_ABORT, &call->events);
 		goto recheck_state;
@@ -408,7 +382,22 @@ recheck_state:
 		goto recheck_state;
 	}
 
-	rxrpc_set_timer(call, rxrpc_timer_set_for_resend, now);
+	/* Make sure the timer is restarted */
+	next = call->expect_rx_by;
+
+#define set(T) { t = READ_ONCE(T); if (time_before(t, next)) next = t; }
+	
+	set(call->expect_req_by);
+	set(call->expect_term_by);
+	set(call->ack_at);
+	set(call->resend_at);
+	set(call->ping_at);
+
+	now = jiffies;
+	if (time_after_eq(now, next))
+		goto recheck_state;
+
+	rxrpc_reduce_call_timer(call, next, now, rxrpc_timer_restart);
 
 	/* other events may have been raised since we started checking */
 	if (call->events && call->state < RXRPC_CALL_COMPLETE) {
diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c
index c3e1fa854471..b305970a9b63 100644
--- a/net/rxrpc/call_object.c
+++ b/net/rxrpc/call_object.c
@@ -51,8 +51,10 @@ static void rxrpc_call_timer_expired(unsigned long _call)
 
 	_enter("%d", call->debug_id);
 
-	if (call->state < RXRPC_CALL_COMPLETE)
-		rxrpc_set_timer(call, rxrpc_timer_expired, ktime_get_real());
+	if (call->state < RXRPC_CALL_COMPLETE) {
+		trace_rxrpc_timer(call, rxrpc_timer_expired, jiffies);
+		rxrpc_queue_call(call);
+	}
 }
 
 static struct lock_class_key rxrpc_call_user_mutex_lock_class_key;
@@ -139,6 +141,8 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp)
 	atomic_set(&call->usage, 1);
 	call->debug_id = atomic_inc_return(&rxrpc_debug_id);
 	call->tx_total_len = -1;
+	call->next_rx_timo = 20 * HZ;
+	call->next_req_timo = 1 * HZ;
 
 	memset(&call->sock_node, 0xed, sizeof(call->sock_node));
 
@@ -189,15 +193,16 @@ static struct rxrpc_call *rxrpc_alloc_client_call(struct rxrpc_sock *rx,
  */
 static void rxrpc_start_call_timer(struct rxrpc_call *call)
 {
-	ktime_t now = ktime_get_real(), expire_at;
-
-	expire_at = ktime_add_ms(now, rxrpc_max_call_lifetime);
-	call->expire_at = expire_at;
-	call->ack_at = expire_at;
-	call->ping_at = expire_at;
-	call->resend_at = expire_at;
-	call->timer.expires = jiffies + LONG_MAX / 2;
-	rxrpc_set_timer(call, rxrpc_timer_begin, now);
+	unsigned long now = jiffies;
+	unsigned long j = now + MAX_JIFFY_OFFSET;
+
+	call->ack_at = j;
+	call->resend_at = j;
+	call->ping_at = j;
+	call->expect_rx_by = j;
+	call->expect_req_by = j;
+	call->expect_term_by = j;
+	call->timer.expires = now;
 }
 
 /*
diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c
index cfb997593da9..97f6a8de4845 100644
--- a/net/rxrpc/conn_client.c
+++ b/net/rxrpc/conn_client.c
@@ -85,8 +85,8 @@
 
 __read_mostly unsigned int rxrpc_max_client_connections = 1000;
 __read_mostly unsigned int rxrpc_reap_client_connections = 900;
-__read_mostly unsigned int rxrpc_conn_idle_client_expiry = 2 * 60 * HZ;
-__read_mostly unsigned int rxrpc_conn_idle_client_fast_expiry = 2 * HZ;
+__read_mostly unsigned long rxrpc_conn_idle_client_expiry = 2 * 60 * HZ;
+__read_mostly unsigned long rxrpc_conn_idle_client_fast_expiry = 2 * HZ;
 
 /*
  * We use machine-unique IDs for our client connections.
diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c
index 1b592073ec96..c89647eae86d 100644
--- a/net/rxrpc/input.c
+++ b/net/rxrpc/input.c
@@ -318,16 +318,18 @@ bad_state:
 static bool rxrpc_receiving_reply(struct rxrpc_call *call)
 {
 	struct rxrpc_ack_summary summary = { 0 };
+	unsigned long now, timo;
 	rxrpc_seq_t top = READ_ONCE(call->tx_top);
 
 	if (call->ackr_reason) {
 		spin_lock_bh(&call->lock);
 		call->ackr_reason = 0;
-		call->resend_at = call->expire_at;
-		call->ack_at = call->expire_at;
 		spin_unlock_bh(&call->lock);
-		rxrpc_set_timer(call, rxrpc_timer_init_for_reply,
-				ktime_get_real());
+		now = jiffies;
+		timo = now + MAX_JIFFY_OFFSET;
+		WRITE_ONCE(call->resend_at, timo);
+		WRITE_ONCE(call->ack_at, timo);
+		trace_rxrpc_timer(call, rxrpc_timer_init_for_reply, now);
 	}
 
 	if (!test_bit(RXRPC_CALL_TX_LAST, &call->flags))
@@ -437,6 +439,19 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb,
 	if (state >= RXRPC_CALL_COMPLETE)
 		return;
 
+	if (call->state == RXRPC_CALL_SERVER_RECV_REQUEST) {
+		unsigned long timo = READ_ONCE(call->next_req_timo);
+		unsigned long now, expect_req_by;
+
+		if (timo) {
+			now = jiffies;
+			expect_req_by = now + timo;
+			WRITE_ONCE(call->expect_req_by, expect_req_by);
+			rxrpc_reduce_call_timer(call, expect_req_by, now,
+						rxrpc_timer_set_for_idle);
+		}
+	}
+
 	/* Received data implicitly ACKs all of the request packets we sent
 	 * when we're acting as a client.
 	 */
@@ -908,9 +923,20 @@ static void rxrpc_input_call_packet(struct rxrpc_call *call,
 				    struct sk_buff *skb, u16 skew)
 {
 	struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+	unsigned long timo;
 
 	_enter("%p,%p", call, skb);
 
+	timo = READ_ONCE(call->next_rx_timo);
+	if (timo) {
+		unsigned long now = jiffies, expect_rx_by;
+
+		expect_rx_by = jiffies + timo;
+		WRITE_ONCE(call->expect_rx_by, expect_rx_by);
+		rxrpc_reduce_call_timer(call, expect_rx_by, now,
+					rxrpc_timer_set_for_normal);
+	}
+	
 	switch (sp->hdr.type) {
 	case RXRPC_PACKET_TYPE_DATA:
 		rxrpc_input_data(call, skb, skew);
diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c
index 1a2d4b112064..c1d9e7fd7448 100644
--- a/net/rxrpc/misc.c
+++ b/net/rxrpc/misc.c
@@ -20,34 +20,29 @@
  */
 unsigned int rxrpc_max_backlog __read_mostly = 10;
 
-/*
- * Maximum lifetime of a call (in mx).
- */
-unsigned int rxrpc_max_call_lifetime = 60 * 1000;
-
 /*
  * How long to wait before scheduling ACK generation after seeing a
- * packet with RXRPC_REQUEST_ACK set (in ms).
+ * packet with RXRPC_REQUEST_ACK set (in jiffies).
  */
-unsigned int rxrpc_requested_ack_delay = 1;
+unsigned long rxrpc_requested_ack_delay = 1;
 
 /*
- * How long to wait before scheduling an ACK with subtype DELAY (in ms).
+ * How long to wait before scheduling an ACK with subtype DELAY (in jiffies).
  *
  * We use this when we've received new data packets.  If those packets aren't
  * all consumed within this time we will send a DELAY ACK if an ACK was not
  * requested to let the sender know it doesn't need to resend.
  */
-unsigned int rxrpc_soft_ack_delay = 1 * 1000;
+unsigned long rxrpc_soft_ack_delay = HZ;
 
 /*
- * How long to wait before scheduling an ACK with subtype IDLE (in ms).
+ * How long to wait before scheduling an ACK with subtype IDLE (in jiffies).
  *
  * We use this when we've consumed some previously soft-ACK'd packets when
  * further packets aren't immediately received to decide when to send an IDLE
  * ACK let the other end know that it can free up its Tx buffer space.
  */
-unsigned int rxrpc_idle_ack_delay = 0.5 * 1000;
+unsigned long rxrpc_idle_ack_delay = HZ / 2;
 
 /*
  * Receive window size in packets.  This indicates the maximum number of
@@ -75,7 +70,7 @@ unsigned int rxrpc_rx_jumbo_max = 4;
 /*
  * Time till packet resend (in milliseconds).
  */
-unsigned int rxrpc_resend_timeout = 4 * 1000;
+unsigned long rxrpc_resend_timeout = 4 * HZ;
 
 const s8 rxrpc_ack_priority[] = {
 	[0]				= 0,
diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c
index be0b9ae13893..0b6609da80b7 100644
--- a/net/rxrpc/recvmsg.c
+++ b/net/rxrpc/recvmsg.c
@@ -163,7 +163,7 @@ static void rxrpc_end_rx_phase(struct rxrpc_call *call, rxrpc_serial_t serial)
 	case RXRPC_CALL_SERVER_RECV_REQUEST:
 		call->tx_phase = true;
 		call->state = RXRPC_CALL_SERVER_ACK_REQUEST;
-		call->ack_at = call->expire_at;
+		call->expect_req_by = jiffies + MAX_JIFFY_OFFSET;
 		write_unlock_bh(&call->state_lock);
 		rxrpc_propose_ACK(call, RXRPC_ACK_DELAY, 0, serial, false, true,
 				  rxrpc_propose_ack_processing_op);
diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c
index de5ab327c18a..03e0676db28c 100644
--- a/net/rxrpc/sendmsg.c
+++ b/net/rxrpc/sendmsg.c
@@ -158,6 +158,7 @@ static void rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call,
 			       rxrpc_notify_end_tx_t notify_end_tx)
 {
 	struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+	unsigned long now;
 	rxrpc_seq_t seq = sp->hdr.seq;
 	int ret, ix;
 	u8 annotation = RXRPC_TX_ANNO_UNACK;
@@ -197,11 +198,11 @@ static void rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call,
 			break;
 		case RXRPC_CALL_SERVER_ACK_REQUEST:
 			call->state = RXRPC_CALL_SERVER_SEND_REPLY;
-			call->ack_at = call->expire_at;
+			now = jiffies;
+			WRITE_ONCE(call->ack_at, now + MAX_JIFFY_OFFSET);
 			if (call->ackr_reason == RXRPC_ACK_DELAY)
 				call->ackr_reason = 0;
-			__rxrpc_set_timer(call, rxrpc_timer_init_for_send_reply,
-					  ktime_get_real());
+			trace_rxrpc_timer(call, rxrpc_timer_init_for_send_reply, now);
 			if (!last)
 				break;
 			/* Fall through */
@@ -223,14 +224,12 @@ static void rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call,
 		_debug("need instant resend %d", ret);
 		rxrpc_instant_resend(call, ix);
 	} else {
-		ktime_t now = ktime_get_real(), resend_at;
+		unsigned long now = jiffies, resend_at;
 
-		resend_at = ktime_add_ms(now, rxrpc_resend_timeout);
-
-		if (ktime_before(resend_at, call->resend_at)) {
-			call->resend_at = resend_at;
-			rxrpc_set_timer(call, rxrpc_timer_set_for_send, now);
-		}
+		resend_at = now + rxrpc_resend_timeout;
+		WRITE_ONCE(call->resend_at, resend_at);
+		rxrpc_reduce_call_timer(call, resend_at, now,
+					rxrpc_timer_set_for_send);
 	}
 
 	rxrpc_free_skb(skb, rxrpc_skb_tx_freed);
@@ -513,6 +512,19 @@ static int rxrpc_sendmsg_cmsg(struct msghdr *msg, struct rxrpc_send_params *p)
 				return -EINVAL;
 			break;
 
+		case RXRPC_SET_CALL_TIMEOUT:
+			if (len & 3 || len < 4 || len > 12)
+				return -EINVAL;
+			memcpy(&p->call.timeouts, CMSG_DATA(cmsg), len);
+			p->call.nr_timeouts = len / 4;
+			if (p->call.timeouts.hard > INT_MAX / HZ)
+				return -ERANGE;
+			if (p->call.nr_timeouts >= 2 && p->call.timeouts.idle > 60 * 60 * 1000)
+				return -ERANGE;
+			if (p->call.nr_timeouts >= 3 && p->call.timeouts.normal > 60 * 60 * 1000)
+				return -ERANGE;
+			break;
+
 		default:
 			return -EINVAL;
 		}
@@ -577,11 +589,13 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len)
 {
 	enum rxrpc_call_state state;
 	struct rxrpc_call *call;
+	unsigned long now, j;
 	int ret;
 
 	struct rxrpc_send_params p = {
 		.call.tx_total_len	= -1,
 		.call.user_call_ID	= 0,
+		.call.nr_timeouts	= 0,
 		.abort_code		= 0,
 		.command		= RXRPC_CMD_SEND_DATA,
 		.exclusive		= false,
@@ -646,6 +660,31 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len)
 		}
 	}
 
+	switch (p.call.nr_timeouts) {
+	case 3:
+		j = msecs_to_jiffies(p.call.timeouts.normal);
+		if (p.call.timeouts.normal > 0 && j == 0)
+			j = 1;
+		WRITE_ONCE(call->next_rx_timo, j);
+		/* Fall through */
+	case 2:
+		j = msecs_to_jiffies(p.call.timeouts.idle);
+		if (p.call.timeouts.idle > 0 && j == 0)
+			j = 1;
+		WRITE_ONCE(call->next_req_timo, j);
+		/* Fall through */
+	case 1:
+		if (p.call.timeouts.hard > 0) {
+			j = msecs_to_jiffies(p.call.timeouts.hard);
+			now = jiffies;
+			j += now;
+			WRITE_ONCE(call->expect_term_by, j);
+			rxrpc_reduce_call_timer(call, j, now,
+						rxrpc_timer_set_for_hard);
+		}
+		break;
+	}
+
 	state = READ_ONCE(call->state);
 	_debug("CALL %d USR %lx ST %d on CONN %p",
 	       call->debug_id, call->user_call_ID, state, call->conn);
diff --git a/net/rxrpc/sysctl.c b/net/rxrpc/sysctl.c
index 34c706d2f79c..4a7af7aff37d 100644
--- a/net/rxrpc/sysctl.c
+++ b/net/rxrpc/sysctl.c
@@ -21,6 +21,8 @@ static const unsigned int four = 4;
 static const unsigned int thirtytwo = 32;
 static const unsigned int n_65535 = 65535;
 static const unsigned int n_max_acks = RXRPC_RXTX_BUFF_SIZE - 1;
+static const unsigned long one_jiffy = 1;
+static const unsigned long max_jiffies = MAX_JIFFY_OFFSET;
 
 /*
  * RxRPC operating parameters.
@@ -29,64 +31,60 @@ static const unsigned int n_max_acks = RXRPC_RXTX_BUFF_SIZE - 1;
  * information on the individual parameters.
  */
 static struct ctl_table rxrpc_sysctl_table[] = {
-	/* Values measured in milliseconds */
+	/* Values measured in milliseconds but used in jiffies */
 	{
 		.procname	= "req_ack_delay",
 		.data		= &rxrpc_requested_ack_delay,
-		.maxlen		= sizeof(unsigned int),
+		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-		.extra1		= (void *)&zero,
+		.proc_handler	= proc_doulongvec_ms_jiffies_minmax,
+		.extra1		= (void *)&one_jiffy,
+		.extra2		= (void *)&max_jiffies,
 	},
 	{
 		.procname	= "soft_ack_delay",
 		.data		= &rxrpc_soft_ack_delay,
-		.maxlen		= sizeof(unsigned int),
+		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-		.extra1		= (void *)&one,
+		.proc_handler	= proc_doulongvec_ms_jiffies_minmax,
+		.extra1		= (void *)&one_jiffy,
+		.extra2		= (void *)&max_jiffies,
 	},
 	{
 		.procname	= "idle_ack_delay",
 		.data		= &rxrpc_idle_ack_delay,
-		.maxlen		= sizeof(unsigned int),
+		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-		.extra1		= (void *)&one,
-	},
-	{
-		.procname	= "resend_timeout",
-		.data		= &rxrpc_resend_timeout,
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-		.extra1		= (void *)&one,
+		.proc_handler	= proc_doulongvec_ms_jiffies_minmax,
+		.extra1		= (void *)&one_jiffy,
+		.extra2		= (void *)&max_jiffies,
 	},
 	{
 		.procname	= "idle_conn_expiry",
 		.data		= &rxrpc_conn_idle_client_expiry,
-		.maxlen		= sizeof(unsigned int),
+		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec_ms_jiffies,
-		.extra1		= (void *)&one,
+		.proc_handler	= proc_doulongvec_ms_jiffies_minmax,
+		.extra1		= (void *)&one_jiffy,
+		.extra2		= (void *)&max_jiffies,
 	},
 	{
 		.procname	= "idle_conn_fast_expiry",
 		.data		= &rxrpc_conn_idle_client_fast_expiry,
-		.maxlen		= sizeof(unsigned int),
+		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec_ms_jiffies,
-		.extra1		= (void *)&one,
+		.proc_handler	= proc_doulongvec_ms_jiffies_minmax,
+		.extra1		= (void *)&one_jiffy,
+		.extra2		= (void *)&max_jiffies,
 	},
-
-	/* Values measured in seconds but used in jiffies */
 	{
-		.procname	= "max_call_lifetime",
-		.data		= &rxrpc_max_call_lifetime,
-		.maxlen		= sizeof(unsigned int),
+		.procname	= "resend_timeout",
+		.data		= &rxrpc_resend_timeout,
+		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-		.extra1		= (void *)&one,
+		.proc_handler	= proc_doulongvec_ms_jiffies_minmax,
+		.extra1		= (void *)&one_jiffy,
+		.extra2		= (void *)&max_jiffies,
 	},
 
 	/* Non-time values */
-- 
cgit v1.2.3


From cf33c1ee5254c6a430bc1538232b49c3ea13e613 Mon Sep 17 00:00:00 2001
From: Huacai Chen <chenhc@lemote.com>
Date: Fri, 24 Nov 2017 15:14:25 -0800
Subject: bcache: Fix building error on MIPS

This patch tries to fix the build error on MIPS. The reason is that MIPS
already defines the PTR macro, which conflicts with the PTR macro in
include/uapi/linux/bcache.h.

[fixed by mlyle: corrected a line-length issue]

Cc: stable@vger.kernel.org
Signed-off-by: Huacai Chen <chenhc@lemote.com>
Reviewed-by: Michael Lyle <mlyle@lyle.org>
Signed-off-by: Michael Lyle <mlyle@lyle.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/alloc.c   | 2 +-
 drivers/md/bcache/extents.c | 2 +-
 drivers/md/bcache/journal.c | 2 +-
 include/uapi/linux/bcache.h | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index a27d85232ce1..a0cc1bc6d884 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -490,7 +490,7 @@ int __bch_bucket_alloc_set(struct cache_set *c, unsigned reserve,
 		if (b == -1)
 			goto err;
 
-		k->ptr[i] = PTR(ca->buckets[b].gen,
+		k->ptr[i] = MAKE_PTR(ca->buckets[b].gen,
 				bucket_to_sector(c, b),
 				ca->sb.nr_this_dev);
 
diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c
index 41c238fc3733..f9d391711595 100644
--- a/drivers/md/bcache/extents.c
+++ b/drivers/md/bcache/extents.c
@@ -585,7 +585,7 @@ static bool bch_extent_merge(struct btree_keys *bk, struct bkey *l, struct bkey
 		return false;
 
 	for (i = 0; i < KEY_PTRS(l); i++)
-		if (l->ptr[i] + PTR(0, KEY_SIZE(l), 0) != r->ptr[i] ||
+		if (l->ptr[i] + MAKE_PTR(0, KEY_SIZE(l), 0) != r->ptr[i] ||
 		    PTR_BUCKET_NR(b->c, l, i) != PTR_BUCKET_NR(b->c, r, i))
 			return false;
 
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 5018c56ebb67..a87165c1d8e5 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -512,7 +512,7 @@ static void journal_reclaim(struct cache_set *c)
 			continue;
 
 		ja->cur_idx = next;
-		k->ptr[n++] = PTR(0,
+		k->ptr[n++] = MAKE_PTR(0,
 				  bucket_to_sector(c, ca->sb.d[ja->cur_idx]),
 				  ca->sb.nr_this_dev);
 	}
diff --git a/include/uapi/linux/bcache.h b/include/uapi/linux/bcache.h
index 90fc490f973f..821f71a2e48f 100644
--- a/include/uapi/linux/bcache.h
+++ b/include/uapi/linux/bcache.h
@@ -91,7 +91,7 @@ PTR_FIELD(PTR_GEN,			0,  8)
 
 #define PTR_CHECK_DEV			((1 << PTR_DEV_BITS) - 1)
 
-#define PTR(gen, offset, dev)						\
+#define MAKE_PTR(gen, offset, dev)					\
 	((((__u64) dev) << 51) | ((__u64) offset) << 8 | gen)
 
 /* Bkey utility code */
-- 
cgit v1.2.3


From 7bbefcfac1936c8d9082a828b09f42a3839cb06e Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <stephen@networkplumber.org>
Date: Fri, 24 Nov 2017 12:08:40 -0800
Subject: uapi: add SPDX identifier to vm_sockets_diag.h

New file seems to have missed the SPDX license scan and update.

Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/vm_sockets_diag.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/vm_sockets_diag.h b/include/uapi/linux/vm_sockets_diag.h
index 14cd7dc5a187..0b4dd54f3d1e 100644
--- a/include/uapi/linux/vm_sockets_diag.h
+++ b/include/uapi/linux/vm_sockets_diag.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
 /* AF_VSOCK sock_diag(7) interface for querying open sockets */
 
 #ifndef _UAPI__VM_SOCKETS_DIAG_H__
-- 
cgit v1.2.3


From b4d085201d86af69cbda2214c6dafc0be240ef9f Mon Sep 17 00:00:00 2001
From: "Dmitry V. Levin" <ldv@altlinux.org>
Date: Mon, 13 Nov 2017 03:35:27 +0300
Subject: uapi: fix linux/kfd_ioctl.h userspace compilation errors

Consistently use types provided by <linux/types.h> via <drm/drm.h>
to fix the following linux/kfd_ioctl.h userspace compilation errors:

/usr/include/linux/kfd_ioctl.h:236:2: error: unknown type name 'uint64_t'
  uint64_t va_addr; /* to KFD */
/usr/include/linux/kfd_ioctl.h:237:2: error: unknown type name 'uint32_t'
  uint32_t gpu_id; /* to KFD */
/usr/include/linux/kfd_ioctl.h:238:2: error: unknown type name 'uint32_t'
  uint32_t pad;
/usr/include/linux/kfd_ioctl.h:243:2: error: unknown type name 'uint64_t'
  uint64_t tile_config_ptr;
/usr/include/linux/kfd_ioctl.h:245:2: error: unknown type name 'uint64_t'
  uint64_t macro_tile_config_ptr;
/usr/include/linux/kfd_ioctl.h:249:2: error: unknown type name 'uint32_t'
  uint32_t num_tile_configs;
/usr/include/linux/kfd_ioctl.h:253:2: error: unknown type name 'uint32_t'
  uint32_t num_macro_tile_configs;
/usr/include/linux/kfd_ioctl.h:255:2: error: unknown type name 'uint32_t'
  uint32_t gpu_id;  /* to KFD */
/usr/include/linux/kfd_ioctl.h:256:2: error: unknown type name 'uint32_t'
  uint32_t gb_addr_config; /* from KFD */
/usr/include/linux/kfd_ioctl.h:257:2: error: unknown type name 'uint32_t'
  uint32_t num_banks;  /* from KFD */
/usr/include/linux/kfd_ioctl.h:258:2: error: unknown type name 'uint32_t'
  uint32_t num_ranks;  /* from KFD */

Fixes: 6a1c9510694fe ("drm/amdkfd: Adding new IOCTL for scratch memory v2")
Fixes: 5d71dbc3a5886 ("drm/amdkfd: Implement image tiling mode support v2")
Signed-off-by: Dmitry V. Levin <ldv@altlinux.org>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
---
 include/uapi/linux/kfd_ioctl.h | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index 731d0df722e3..6e80501368ae 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -233,29 +233,29 @@ struct kfd_ioctl_wait_events_args {
 };
 
 struct kfd_ioctl_set_scratch_backing_va_args {
-	uint64_t va_addr;	/* to KFD */
-	uint32_t gpu_id;	/* to KFD */
-	uint32_t pad;
+	__u64 va_addr;	/* to KFD */
+	__u32 gpu_id;	/* to KFD */
+	__u32 pad;
 };
 
 struct kfd_ioctl_get_tile_config_args {
 	/* to KFD: pointer to tile array */
-	uint64_t tile_config_ptr;
+	__u64 tile_config_ptr;
 	/* to KFD: pointer to macro tile array */
-	uint64_t macro_tile_config_ptr;
+	__u64 macro_tile_config_ptr;
 	/* to KFD: array size allocated by user mode
 	 * from KFD: array size filled by kernel
 	 */
-	uint32_t num_tile_configs;
+	__u32 num_tile_configs;
 	/* to KFD: array size allocated by user mode
 	 * from KFD: array size filled by kernel
 	 */
-	uint32_t num_macro_tile_configs;
+	__u32 num_macro_tile_configs;
 
-	uint32_t gpu_id;		/* to KFD */
-	uint32_t gb_addr_config;	/* from KFD */
-	uint32_t num_banks;		/* from KFD */
-	uint32_t num_ranks;		/* from KFD */
+	__u32 gpu_id;		/* to KFD */
+	__u32 gb_addr_config;	/* from KFD */
+	__u32 num_banks;		/* from KFD */
+	__u32 num_ranks;		/* from KFD */
 	/* struct size can be extended later if needed
 	 * without breaking ABI compatibility
 	 */
-- 
cgit v1.2.3


From 1751e8a6cb935e555fcdbcb9ab4f0446e322ca3e Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Mon, 27 Nov 2017 13:05:09 -0800
Subject: Rename superblock flags (MS_xyz -> SB_xyz)

This is a pure automated search-and-replace of the internal kernel
superblock flags.

The s_flags are now called SB_*, with the names and the values for the
moment mirroring the MS_* flags that they're equivalent to.

Note how the MS_xyz flags are the ones passed to the mount system call,
while the SB_xyz flags are what we then use in sb->s_flags.

The script to do this was:

    # places to look in; re security/*: it generally should *not* be
    # touched (that stuff parses mount(2) arguments directly), but
    # there are two places where we really deal with superblock flags.
    FILES="drivers/mtd drivers/staging/lustre fs ipc mm \
            include/linux/fs.h include/uapi/linux/bfs_fs.h \
            security/apparmor/apparmorfs.c security/apparmor/include/lib.h"
    # the list of MS_... constants
    SYMS="RDONLY NOSUID NODEV NOEXEC SYNCHRONOUS REMOUNT MANDLOCK \
          DIRSYNC NOATIME NODIRATIME BIND MOVE REC VERBOSE SILENT \
          POSIXACL UNBINDABLE PRIVATE SLAVE SHARED RELATIME KERNMOUNT \
          I_VERSION STRICTATIME LAZYTIME SUBMOUNT NOREMOTELOCK NOSEC BORN \
          ACTIVE NOUSER"

    SED_PROG=
    for i in $SYMS; do SED_PROG="$SED_PROG -e s/MS_$i/SB_$i/g"; done

    # we want files that contain at least one of MS_...,
    # with fs/namespace.c and fs/pnode.c excluded.
    L=$(for i in $SYMS; do git grep -w -l MS_$i $FILES; done| sort|uniq|grep -v '^fs/namespace.c'|grep -v '^fs/pnode.c')

    for f in $L; do sed -i $f $SED_PROG; done

Requested-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/mtd/mtdsuper.c                          |  6 +--
 drivers/staging/lustre/lustre/llite/file.c      |  2 +-
 drivers/staging/lustre/lustre/llite/llite_lib.c | 14 +++----
 fs/9p/vfs_super.c                               |  6 +--
 fs/adfs/super.c                                 |  4 +-
 fs/affs/amigaffs.c                              |  2 +-
 fs/affs/bitmap.c                                |  6 +--
 fs/affs/super.c                                 | 16 ++++----
 fs/afs/super.c                                  |  4 +-
 fs/befs/ChangeLog                               |  2 +-
 fs/befs/linuxvfs.c                              |  4 +-
 fs/btrfs/ctree.h                                |  2 +-
 fs/btrfs/extent_io.c                            |  2 +-
 fs/btrfs/ioctl.c                                |  4 +-
 fs/btrfs/super.c                                | 50 ++++++++++++------------
 fs/btrfs/volumes.c                              |  4 +-
 fs/ceph/super.c                                 |  8 ++--
 fs/cifs/cifs_fs_sb.h                            |  2 +-
 fs/cifs/cifsfs.c                                | 12 +++---
 fs/cifs/cifsglob.h                              |  4 +-
 fs/cifs/inode.c                                 |  2 +-
 fs/cifs/xattr.c                                 |  8 ++--
 fs/coda/inode.c                                 |  4 +-
 fs/cramfs/inode.c                               |  4 +-
 fs/ecryptfs/main.c                              |  8 ++--
 fs/efs/super.c                                  |  4 +-
 fs/ext2/balloc.c                                |  4 +-
 fs/ext2/ialloc.c                                |  4 +-
 fs/ext2/super.c                                 | 20 +++++-----
 fs/ext4/inode.c                                 |  4 +-
 fs/ext4/super.c                                 | 52 ++++++++++++-------------
 fs/f2fs/checkpoint.c                            | 10 ++---
 fs/f2fs/f2fs.h                                  |  2 +-
 fs/f2fs/gc.c                                    |  2 +-
 fs/f2fs/recovery.c                              | 10 ++---
 fs/f2fs/super.c                                 | 28 ++++++-------
 fs/fat/fatent.c                                 |  6 +--
 fs/fat/inode.c                                  |  8 ++--
 fs/fat/misc.c                                   |  2 +-
 fs/fat/namei_msdos.c                            |  2 +-
 fs/freevxfs/vxfs_super.c                        |  4 +-
 fs/fs-writeback.c                               |  2 +-
 fs/fuse/inode.c                                 | 12 +++---
 fs/gfs2/ops_fstype.c                            | 16 ++++----
 fs/gfs2/super.c                                 | 10 ++---
 fs/gfs2/trans.c                                 |  2 +-
 fs/hfs/mdb.c                                    |  4 +-
 fs/hfs/super.c                                  | 16 ++++----
 fs/hfsplus/super.c                              | 22 +++++------
 fs/hpfs/map.c                                   |  2 +-
 fs/hpfs/super.c                                 |  8 ++--
 fs/inode.c                                      | 10 ++---
 fs/isofs/inode.c                                |  2 +-
 fs/jffs2/fs.c                                   |  4 +-
 fs/jffs2/os-linux.h                             |  2 +-
 fs/jffs2/super.c                                |  4 +-
 fs/jfs/super.c                                  | 10 ++---
 fs/kernfs/mount.c                               |  2 +-
 fs/libfs.c                                      |  6 +--
 fs/locks.c                                      |  2 +-
 fs/minix/inode.c                                |  4 +-
 fs/ncpfs/inode.c                                |  4 +-
 fs/nfs/dir.c                                    |  2 +-
 fs/nfs/inode.c                                  |  2 +-
 fs/nfs/internal.h                               |  2 +-
 fs/nfs/super.c                                  | 22 +++++------
 fs/nilfs2/segment.c                             |  2 +-
 fs/nilfs2/super.c                               | 24 ++++++------
 fs/nilfs2/the_nilfs.c                           |  6 +--
 fs/notify/fsnotify.c                            |  2 +-
 fs/nsfs.c                                       |  2 +-
 fs/ntfs/super.c                                 | 32 +++++++--------
 fs/ocfs2/file.c                                 |  2 +-
 fs/ocfs2/super.c                                | 28 ++++++-------
 fs/ocfs2/xattr.c                                |  2 +-
 fs/openpromfs/inode.c                           |  4 +-
 fs/orangefs/super.c                             |  8 ++--
 fs/overlayfs/super.c                            | 10 ++---
 fs/proc/inode.c                                 |  2 +-
 fs/proc/root.c                                  |  2 +-
 fs/proc_namespace.c                             |  8 ++--
 fs/qnx4/inode.c                                 |  4 +-
 fs/qnx6/inode.c                                 |  4 +-
 fs/reiserfs/inode.c                             |  2 +-
 fs/reiserfs/journal.c                           |  6 +--
 fs/reiserfs/prints.c                            |  4 +-
 fs/reiserfs/super.c                             | 18 ++++-----
 fs/reiserfs/xattr.c                             | 10 ++---
 fs/romfs/super.c                                |  4 +-
 fs/squashfs/super.c                             |  4 +-
 fs/statfs.c                                     |  6 +--
 fs/sysfs/mount.c                                |  2 +-
 fs/sysv/inode.c                                 |  2 +-
 fs/sysv/super.c                                 |  2 +-
 fs/ubifs/file.c                                 |  2 +-
 fs/ubifs/io.c                                   |  2 +-
 fs/ubifs/super.c                                | 20 +++++-----
 fs/ubifs/ubifs.h                                |  4 +-
 fs/udf/super.c                                  |  6 +--
 fs/ufs/balloc.c                                 |  8 ++--
 fs/ufs/ialloc.c                                 | 10 ++---
 fs/ufs/super.c                                  | 30 +++++++-------
 fs/xfs/xfs_log.c                                |  6 +--
 fs/xfs/xfs_super.c                              |  8 ++--
 fs/xfs/xfs_super.h                              |  2 +-
 include/linux/fs.h                              |  2 +-
 include/uapi/linux/bfs_fs.h                     |  2 +-
 ipc/mqueue.c                                    |  2 +-
 mm/shmem.c                                      | 10 ++---
 security/apparmor/apparmorfs.c                  |  2 +-
 security/apparmor/include/lib.h                 |  2 +-
 111 files changed, 417 insertions(+), 417 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/mtd/mtdsuper.c b/drivers/mtd/mtdsuper.c
index e43fea896d1e..d58a61c09304 100644
--- a/drivers/mtd/mtdsuper.c
+++ b/drivers/mtd/mtdsuper.c
@@ -79,14 +79,14 @@ static struct dentry *mount_mtd_aux(struct file_system_type *fs_type, int flags,
 	pr_debug("MTDSB: New superblock for device %d (\"%s\")\n",
 	      mtd->index, mtd->name);
 
-	ret = fill_super(sb, data, flags & MS_SILENT ? 1 : 0);
+	ret = fill_super(sb, data, flags & SB_SILENT ? 1 : 0);
 	if (ret < 0) {
 		deactivate_locked_super(sb);
 		return ERR_PTR(ret);
 	}
 
 	/* go */
-	sb->s_flags |= MS_ACTIVE;
+	sb->s_flags |= SB_ACTIVE;
 	return dget(sb->s_root);
 
 	/* new mountpoint for an already mounted superblock */
@@ -202,7 +202,7 @@ struct dentry *mount_mtd(struct file_system_type *fs_type, int flags,
 not_an_MTD_device:
 #endif /* CONFIG_BLOCK */
 
-	if (!(flags & MS_SILENT))
+	if (!(flags & SB_SILENT))
 		printk(KERN_NOTICE
 		       "MTD: Attempt to mount non-MTD device \"%s\"\n",
 		       dev_name);
diff --git a/drivers/staging/lustre/lustre/llite/file.c b/drivers/staging/lustre/lustre/llite/file.c
index 2d6e64dea266..938b859b6650 100644
--- a/drivers/staging/lustre/lustre/llite/file.c
+++ b/drivers/staging/lustre/lustre/llite/file.c
@@ -1016,7 +1016,7 @@ static bool file_is_noatime(const struct file *file)
 	if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
 		return true;
 
-	if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
+	if ((inode->i_sb->s_flags & SB_NODIRATIME) && S_ISDIR(inode->i_mode))
 		return true;
 
 	return false;
diff --git a/drivers/staging/lustre/lustre/llite/llite_lib.c b/drivers/staging/lustre/lustre/llite/llite_lib.c
index 65ac5128f005..8666f1e81ade 100644
--- a/drivers/staging/lustre/lustre/llite/llite_lib.c
+++ b/drivers/staging/lustre/lustre/llite/llite_lib.c
@@ -313,11 +313,11 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt,
 	}
 
 	if (data->ocd_connect_flags & OBD_CONNECT_ACL) {
-		sb->s_flags |= MS_POSIXACL;
+		sb->s_flags |= SB_POSIXACL;
 		sbi->ll_flags |= LL_SBI_ACL;
 	} else {
 		LCONSOLE_INFO("client wants to enable acl, but mdt not!\n");
-		sb->s_flags &= ~MS_POSIXACL;
+		sb->s_flags &= ~SB_POSIXACL;
 		sbi->ll_flags &= ~LL_SBI_ACL;
 	}
 
@@ -660,7 +660,7 @@ void ll_kill_super(struct super_block *sb)
 	struct ll_sb_info *sbi;
 
 	/* not init sb ?*/
-	if (!(sb->s_flags & MS_ACTIVE))
+	if (!(sb->s_flags & SB_ACTIVE))
 		return;
 
 	sbi = ll_s2sbi(sb);
@@ -2039,8 +2039,8 @@ int ll_remount_fs(struct super_block *sb, int *flags, char *data)
 	int err;
 	__u32 read_only;
 
-	if ((bool)(*flags & MS_RDONLY) != sb_rdonly(sb)) {
-		read_only = *flags & MS_RDONLY;
+	if ((bool)(*flags & SB_RDONLY) != sb_rdonly(sb)) {
+		read_only = *flags & SB_RDONLY;
 		err = obd_set_info_async(NULL, sbi->ll_md_exp,
 					 sizeof(KEY_READ_ONLY),
 					 KEY_READ_ONLY, sizeof(read_only),
@@ -2053,9 +2053,9 @@ int ll_remount_fs(struct super_block *sb, int *flags, char *data)
 		}
 
 		if (read_only)
-			sb->s_flags |= MS_RDONLY;
+			sb->s_flags |= SB_RDONLY;
 		else
-			sb->s_flags &= ~MS_RDONLY;
+			sb->s_flags &= ~SB_RDONLY;
 
 		if (sbi->ll_flags & LL_SBI_VERBOSE)
 			LCONSOLE_WARN("Remounted %s %s\n", profilenm,
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 8b75463cb211..af03c2a901eb 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -94,13 +94,13 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
 	if (v9ses->cache)
 		sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024)/PAGE_SIZE;
 
-	sb->s_flags |= MS_ACTIVE | MS_DIRSYNC | MS_NOATIME;
+	sb->s_flags |= SB_ACTIVE | SB_DIRSYNC | SB_NOATIME;
 	if (!v9ses->cache)
-		sb->s_flags |= MS_SYNCHRONOUS;
+		sb->s_flags |= SB_SYNCHRONOUS;
 
 #ifdef CONFIG_9P_FS_POSIX_ACL
 	if ((v9ses->flags & V9FS_ACL_MASK) == V9FS_POSIX_ACL)
-		sb->s_flags |= MS_POSIXACL;
+		sb->s_flags |= SB_POSIXACL;
 #endif
 
 	return 0;
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index c9fdfb112933..cfda2c7caedc 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -213,7 +213,7 @@ static int parse_options(struct super_block *sb, char *options)
 static int adfs_remount(struct super_block *sb, int *flags, char *data)
 {
 	sync_filesystem(sb);
-	*flags |= MS_NODIRATIME;
+	*flags |= SB_NODIRATIME;
 	return parse_options(sb, data);
 }
 
@@ -372,7 +372,7 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
 	struct inode *root;
 	int ret = -EINVAL;
 
-	sb->s_flags |= MS_NODIRATIME;
+	sb->s_flags |= SB_NODIRATIME;
 
 	asb = kzalloc(sizeof(*asb), GFP_KERNEL);
 	if (!asb)
diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c
index 185d5ab7e986..0f0e6925e97d 100644
--- a/fs/affs/amigaffs.c
+++ b/fs/affs/amigaffs.c
@@ -453,7 +453,7 @@ affs_error(struct super_block *sb, const char *function, const char *fmt, ...)
 	pr_crit("error (device %s): %s(): %pV\n", sb->s_id, function, &vaf);
 	if (!sb_rdonly(sb))
 		pr_warn("Remounting filesystem read-only\n");
-	sb->s_flags |= MS_RDONLY;
+	sb->s_flags |= SB_RDONLY;
 	va_end(args);
 }
 
diff --git a/fs/affs/bitmap.c b/fs/affs/bitmap.c
index 2b1399611d9e..5ba9ef2742f6 100644
--- a/fs/affs/bitmap.c
+++ b/fs/affs/bitmap.c
@@ -250,12 +250,12 @@ int affs_init_bitmap(struct super_block *sb, int *flags)
 	int i, res = 0;
 	struct affs_sb_info *sbi = AFFS_SB(sb);
 
-	if (*flags & MS_RDONLY)
+	if (*flags & SB_RDONLY)
 		return 0;
 
 	if (!AFFS_ROOT_TAIL(sb, sbi->s_root_bh)->bm_flag) {
 		pr_notice("Bitmap invalid - mounting %s read only\n", sb->s_id);
-		*flags |= MS_RDONLY;
+		*flags |= SB_RDONLY;
 		return 0;
 	}
 
@@ -288,7 +288,7 @@ int affs_init_bitmap(struct super_block *sb, int *flags)
 		if (affs_checksum_block(sb, bh)) {
 			pr_warn("Bitmap %u invalid - mounting %s read only.\n",
 				bm->bm_key, sb->s_id);
-			*flags |= MS_RDONLY;
+			*flags |= SB_RDONLY;
 			goto out;
 		}
 		pr_debug("read bitmap block %d: %d\n", blk, bm->bm_key);
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 884bedab7266..1117e36134cc 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -356,7 +356,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
 
 	sb->s_magic             = AFFS_SUPER_MAGIC;
 	sb->s_op                = &affs_sops;
-	sb->s_flags |= MS_NODIRATIME;
+	sb->s_flags |= SB_NODIRATIME;
 
 	sbi = kzalloc(sizeof(struct affs_sb_info), GFP_KERNEL);
 	if (!sbi)
@@ -466,7 +466,7 @@ got_root:
 	if ((chksum == FS_DCFFS || chksum == MUFS_DCFFS || chksum == FS_DCOFS
 	     || chksum == MUFS_DCOFS) && !sb_rdonly(sb)) {
 		pr_notice("Dircache FS - mounting %s read only\n", sb->s_id);
-		sb->s_flags |= MS_RDONLY;
+		sb->s_flags |= SB_RDONLY;
 	}
 	switch (chksum) {
 	case MUFS_FS:
@@ -488,7 +488,7 @@ got_root:
 		/* fall thru */
 	case FS_OFS:
 		affs_set_opt(sbi->s_flags, SF_OFS);
-		sb->s_flags |= MS_NOEXEC;
+		sb->s_flags |= SB_NOEXEC;
 		break;
 	case MUFS_DCOFS:
 	case MUFS_INTLOFS:
@@ -497,7 +497,7 @@ got_root:
 	case FS_INTLOFS:
 		affs_set_opt(sbi->s_flags, SF_INTL);
 		affs_set_opt(sbi->s_flags, SF_OFS);
-		sb->s_flags |= MS_NOEXEC;
+		sb->s_flags |= SB_NOEXEC;
 		break;
 	default:
 		pr_err("Unknown filesystem on device %s: %08X\n",
@@ -513,7 +513,7 @@ got_root:
 			sig, sig[3] + '0', blocksize);
 	}
 
-	sb->s_flags |= MS_NODEV | MS_NOSUID;
+	sb->s_flags |= SB_NODEV | SB_NOSUID;
 
 	sbi->s_data_blksize = sb->s_blocksize;
 	if (affs_test_opt(sbi->s_flags, SF_OFS))
@@ -570,7 +570,7 @@ affs_remount(struct super_block *sb, int *flags, char *data)
 	pr_debug("%s(flags=0x%x,opts=\"%s\")\n", __func__, *flags, data);
 
 	sync_filesystem(sb);
-	*flags |= MS_NODIRATIME;
+	*flags |= SB_NODIRATIME;
 
 	memcpy(volume, sbi->s_volume, 32);
 	if (!parse_options(data, &uid, &gid, &mode, &reserved, &root_block,
@@ -596,10 +596,10 @@ affs_remount(struct super_block *sb, int *flags, char *data)
 	memcpy(sbi->s_volume, volume, 32);
 	spin_unlock(&sbi->symlink_lock);
 
-	if ((bool)(*flags & MS_RDONLY) == sb_rdonly(sb))
+	if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb))
 		return 0;
 
-	if (*flags & MS_RDONLY)
+	if (*flags & SB_RDONLY)
 		affs_free_bitmap(sb);
 	else
 		res = affs_init_bitmap(sb, flags);
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 875b5eb02242..d3f97da61bdf 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -496,10 +496,10 @@ static struct dentry *afs_mount(struct file_system_type *fs_type,
 		if (ret < 0)
 			goto error_sb;
 		as = NULL;
-		sb->s_flags |= MS_ACTIVE;
+		sb->s_flags |= SB_ACTIVE;
 	} else {
 		_debug("reuse");
-		ASSERTCMP(sb->s_flags, &, MS_ACTIVE);
+		ASSERTCMP(sb->s_flags, &, SB_ACTIVE);
 		afs_destroy_sbi(as);
 		as = NULL;
 	}
diff --git a/fs/befs/ChangeLog b/fs/befs/ChangeLog
index 75a461cfaca6..16f2dfe8c2f7 100644
--- a/fs/befs/ChangeLog
+++ b/fs/befs/ChangeLog
@@ -365,7 +365,7 @@ Version 0.4 (2001-10-28)
 	(fs/befs/super.c)
 
 * Tell the kernel to only mount befs read-only. 
-	By setting the MS_RDONLY flag in befs_read_super().
+	By setting the SB_RDONLY flag in befs_read_super().
 	Not that it was possible to write before. But now the kernel won't even try.
 	(fs/befs/super.c)
 
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index a92355cc453b..ee236231cafa 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -841,7 +841,7 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
 	if (!sb_rdonly(sb)) {
 		befs_warning(sb,
 			     "No write support. Marking filesystem read-only");
-		sb->s_flags |= MS_RDONLY;
+		sb->s_flags |= SB_RDONLY;
 	}
 
 	/*
@@ -948,7 +948,7 @@ static int
 befs_remount(struct super_block *sb, int *flags, char *data)
 {
 	sync_filesystem(sb);
-	if (!(*flags & MS_RDONLY))
+	if (!(*flags & SB_RDONLY))
 		return -EINVAL;
 	return 0;
 }
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index f7df5536ab61..51477a537c83 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2957,7 +2957,7 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
  */
 static inline int btrfs_need_cleaner_sleep(struct btrfs_fs_info *fs_info)
 {
-	return fs_info->sb->s_flags & MS_RDONLY || btrfs_fs_closing(fs_info);
+	return fs_info->sb->s_flags & SB_RDONLY || btrfs_fs_closing(fs_info);
 }
 
 static inline void free_fs_info(struct btrfs_fs_info *fs_info)
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 16045ea86fc1..f9e9f721efe2 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1984,7 +1984,7 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
 	struct btrfs_bio *bbio = NULL;
 	int ret;
 
-	ASSERT(!(fs_info->sb->s_flags & MS_RDONLY));
+	ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
 	BUG_ON(!mirror_num);
 
 	bio = btrfs_io_bio_alloc(1);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index fd172a93d11a..d748ad1c3620 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1172,7 +1172,7 @@ again:
 	if (!i_done || ret)
 		goto out;
 
-	if (!(inode->i_sb->s_flags & MS_ACTIVE))
+	if (!(inode->i_sb->s_flags & SB_ACTIVE))
 		goto out;
 
 	/*
@@ -1333,7 +1333,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 		 * make sure we stop running if someone unmounts
 		 * the FS
 		 */
-		if (!(inode->i_sb->s_flags & MS_ACTIVE))
+		if (!(inode->i_sb->s_flags & SB_ACTIVE))
 			break;
 
 		if (btrfs_defrag_cancelled(fs_info)) {
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 65af029559b5..305cae7444a0 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -107,7 +107,7 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
 		return;
 
 	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
-		sb->s_flags |= MS_RDONLY;
+		sb->s_flags |= SB_RDONLY;
 		btrfs_info(fs_info, "forced readonly");
 		/*
 		 * Note that a running device replace operation is not
@@ -137,7 +137,7 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function
 
 	/*
 	 * Special case: if the error is EROFS, and we're already
-	 * under MS_RDONLY, then it is safe here.
+	 * under SB_RDONLY, then it is safe here.
 	 */
 	if (errno == -EROFS && sb_rdonly(sb))
   		return;
@@ -168,7 +168,7 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function
 	set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
 
 	/* Don't go through full error handling during mount */
-	if (sb->s_flags & MS_BORN)
+	if (sb->s_flags & SB_BORN)
 		btrfs_handle_error(fs_info);
 }
 
@@ -625,7 +625,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
 			break;
 		case Opt_acl:
 #ifdef CONFIG_BTRFS_FS_POSIX_ACL
-			info->sb->s_flags |= MS_POSIXACL;
+			info->sb->s_flags |= SB_POSIXACL;
 			break;
 #else
 			btrfs_err(info, "support for ACL not compiled in!");
@@ -633,7 +633,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
 			goto out;
 #endif
 		case Opt_noacl:
-			info->sb->s_flags &= ~MS_POSIXACL;
+			info->sb->s_flags &= ~SB_POSIXACL;
 			break;
 		case Opt_notreelog:
 			btrfs_set_and_info(info, NOTREELOG,
@@ -851,7 +851,7 @@ check:
 	/*
 	 * Extra check for current option against current flag
 	 */
-	if (btrfs_test_opt(info, NOLOGREPLAY) && !(new_flags & MS_RDONLY)) {
+	if (btrfs_test_opt(info, NOLOGREPLAY) && !(new_flags & SB_RDONLY)) {
 		btrfs_err(info,
 			  "nologreplay must be used with ro mount option");
 		ret = -EINVAL;
@@ -1147,7 +1147,7 @@ static int btrfs_fill_super(struct super_block *sb,
 	sb->s_xattr = btrfs_xattr_handlers;
 	sb->s_time_gran = 1;
 #ifdef CONFIG_BTRFS_FS_POSIX_ACL
-	sb->s_flags |= MS_POSIXACL;
+	sb->s_flags |= SB_POSIXACL;
 #endif
 	sb->s_flags |= SB_I_VERSION;
 	sb->s_iflags |= SB_I_CGROUPWB;
@@ -1180,7 +1180,7 @@ static int btrfs_fill_super(struct super_block *sb,
 	}
 
 	cleancache_init_fs(sb);
-	sb->s_flags |= MS_ACTIVE;
+	sb->s_flags |= SB_ACTIVE;
 	return 0;
 
 fail_close:
@@ -1277,7 +1277,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
 		seq_puts(seq, ",flushoncommit");
 	if (btrfs_test_opt(info, DISCARD))
 		seq_puts(seq, ",discard");
-	if (!(info->sb->s_flags & MS_POSIXACL))
+	if (!(info->sb->s_flags & SB_POSIXACL))
 		seq_puts(seq, ",noacl");
 	if (btrfs_test_opt(info, SPACE_CACHE))
 		seq_puts(seq, ",space_cache");
@@ -1409,11 +1409,11 @@ static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid,
 
 	mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name, newargs);
 	if (PTR_ERR_OR_ZERO(mnt) == -EBUSY) {
-		if (flags & MS_RDONLY) {
-			mnt = vfs_kern_mount(&btrfs_fs_type, flags & ~MS_RDONLY,
+		if (flags & SB_RDONLY) {
+			mnt = vfs_kern_mount(&btrfs_fs_type, flags & ~SB_RDONLY,
 					     device_name, newargs);
 		} else {
-			mnt = vfs_kern_mount(&btrfs_fs_type, flags | MS_RDONLY,
+			mnt = vfs_kern_mount(&btrfs_fs_type, flags | SB_RDONLY,
 					     device_name, newargs);
 			if (IS_ERR(mnt)) {
 				root = ERR_CAST(mnt);
@@ -1565,7 +1565,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
 	u64 subvol_objectid = 0;
 	int error = 0;
 
-	if (!(flags & MS_RDONLY))
+	if (!(flags & SB_RDONLY))
 		mode |= FMODE_WRITE;
 
 	error = btrfs_parse_early_options(data, mode, fs_type,
@@ -1619,13 +1619,13 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
 	if (error)
 		goto error_fs_info;
 
-	if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) {
+	if (!(flags & SB_RDONLY) && fs_devices->rw_devices == 0) {
 		error = -EACCES;
 		goto error_close_devices;
 	}
 
 	bdev = fs_devices->latest_bdev;
-	s = sget(fs_type, btrfs_test_super, btrfs_set_super, flags | MS_NOSEC,
+	s = sget(fs_type, btrfs_test_super, btrfs_set_super, flags | SB_NOSEC,
 		 fs_info);
 	if (IS_ERR(s)) {
 		error = PTR_ERR(s);
@@ -1635,7 +1635,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
 	if (s->s_root) {
 		btrfs_close_devices(fs_devices);
 		free_fs_info(fs_info);
-		if ((flags ^ s->s_flags) & MS_RDONLY)
+		if ((flags ^ s->s_flags) & SB_RDONLY)
 			error = -EBUSY;
 	} else {
 		snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
@@ -1702,11 +1702,11 @@ static inline void btrfs_remount_begin(struct btrfs_fs_info *fs_info,
 {
 	if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) &&
 	    (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) ||
-	     (flags & MS_RDONLY))) {
+	     (flags & SB_RDONLY))) {
 		/* wait for any defraggers to finish */
 		wait_event(fs_info->transaction_wait,
 			   (atomic_read(&fs_info->defrag_running) == 0));
-		if (flags & MS_RDONLY)
+		if (flags & SB_RDONLY)
 			sync_filesystem(fs_info->sb);
 	}
 }
@@ -1766,10 +1766,10 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
 	btrfs_resize_thread_pool(fs_info,
 		fs_info->thread_pool_size, old_thread_pool_size);
 
-	if ((bool)(*flags & MS_RDONLY) == sb_rdonly(sb))
+	if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb))
 		goto out;
 
-	if (*flags & MS_RDONLY) {
+	if (*flags & SB_RDONLY) {
 		/*
 		 * this also happens on 'umount -rf' or on shutdown, when
 		 * the filesystem is busy.
@@ -1781,10 +1781,10 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
 		/* avoid complains from lockdep et al. */
 		up(&fs_info->uuid_tree_rescan_sem);
 
-		sb->s_flags |= MS_RDONLY;
+		sb->s_flags |= SB_RDONLY;
 
 		/*
-		 * Setting MS_RDONLY will put the cleaner thread to
+		 * Setting SB_RDONLY will put the cleaner thread to
 		 * sleep at the next loop if it's already active.
 		 * If it's already asleep, we'll leave unused block
 		 * groups on disk until we're mounted read-write again
@@ -1856,7 +1856,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
 				goto restore;
 			}
 		}
-		sb->s_flags &= ~MS_RDONLY;
+		sb->s_flags &= ~SB_RDONLY;
 
 		set_bit(BTRFS_FS_OPEN, &fs_info->flags);
 	}
@@ -1866,9 +1866,9 @@ out:
 	return 0;
 
 restore:
-	/* We've hit an error - don't reset MS_RDONLY */
+	/* We've hit an error - don't reset SB_RDONLY */
 	if (sb_rdonly(sb))
-		old_flags |= MS_RDONLY;
+		old_flags |= SB_RDONLY;
 	sb->s_flags = old_flags;
 	fs_info->mount_opt = old_opts;
 	fs_info->compress_type = old_compress_type;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f1ecb938ba4d..925070b9ce03 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2384,7 +2384,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
 	set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
 
 	if (seeding_dev) {
-		sb->s_flags &= ~MS_RDONLY;
+		sb->s_flags &= ~SB_RDONLY;
 		ret = btrfs_prepare_sprout(fs_info);
 		if (ret) {
 			btrfs_abort_transaction(trans, ret);
@@ -2497,7 +2497,7 @@ error_sysfs:
 	btrfs_sysfs_rm_device_link(fs_info->fs_devices, device);
 error_trans:
 	if (seeding_dev)
-		sb->s_flags |= MS_RDONLY;
+		sb->s_flags |= SB_RDONLY;
 	if (trans)
 		btrfs_end_transaction(trans);
 	rcu_string_free(device->name);
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index fe9fbb3f13f7..a62d2a9841dc 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -331,11 +331,11 @@ static int parse_fsopt_token(char *c, void *private)
 		break;
 #ifdef CONFIG_CEPH_FS_POSIX_ACL
 	case Opt_acl:
-		fsopt->sb_flags |= MS_POSIXACL;
+		fsopt->sb_flags |= SB_POSIXACL;
 		break;
 #endif
 	case Opt_noacl:
-		fsopt->sb_flags &= ~MS_POSIXACL;
+		fsopt->sb_flags &= ~SB_POSIXACL;
 		break;
 	default:
 		BUG_ON(token);
@@ -520,7 +520,7 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
 		seq_puts(m, ",nopoolperm");
 
 #ifdef CONFIG_CEPH_FS_POSIX_ACL
-	if (fsopt->sb_flags & MS_POSIXACL)
+	if (fsopt->sb_flags & SB_POSIXACL)
 		seq_puts(m, ",acl");
 	else
 		seq_puts(m, ",noacl");
@@ -988,7 +988,7 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type,
 	dout("ceph_mount\n");
 
 #ifdef CONFIG_CEPH_FS_POSIX_ACL
-	flags |= MS_POSIXACL;
+	flags |= SB_POSIXACL;
 #endif
 	err = parse_mount_options(&fsopt, &opt, flags, data, dev_name);
 	if (err < 0) {
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index cbd216b57239..350fa55a1bf7 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -42,7 +42,7 @@
 #define CIFS_MOUNT_MULTIUSER	0x20000 /* multiuser mount */
 #define CIFS_MOUNT_STRICT_IO	0x40000 /* strict cache mode */
 #define CIFS_MOUNT_RWPIDFORWARD	0x80000 /* use pid forwarding for rw */
-#define CIFS_MOUNT_POSIXACL	0x100000 /* mirror of MS_POSIXACL in mnt_cifs_flags */
+#define CIFS_MOUNT_POSIXACL	0x100000 /* mirror of SB_POSIXACL in mnt_cifs_flags */
 #define CIFS_MOUNT_CIFS_BACKUPUID 0x200000 /* backup intent bit for a user */
 #define CIFS_MOUNT_CIFS_BACKUPGID 0x400000 /* backup intent bit for a group */
 #define CIFS_MOUNT_MAP_SFM_CHR	0x800000 /* SFM/MAC mapping for illegal chars */
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 8c8b75d33f31..31b7565b1617 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -125,7 +125,7 @@ cifs_read_super(struct super_block *sb)
 	tcon = cifs_sb_master_tcon(cifs_sb);
 
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIXACL)
-		sb->s_flags |= MS_POSIXACL;
+		sb->s_flags |= SB_POSIXACL;
 
 	if (tcon->ses->capabilities & tcon->ses->server->vals->cap_large_files)
 		sb->s_maxbytes = MAX_LFS_FILESIZE;
@@ -497,7 +497,7 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
 		seq_puts(s, ",cifsacl");
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM)
 		seq_puts(s, ",dynperm");
-	if (root->d_sb->s_flags & MS_POSIXACL)
+	if (root->d_sb->s_flags & SB_POSIXACL)
 		seq_puts(s, ",acl");
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
 		seq_puts(s, ",mfsymlinks");
@@ -573,7 +573,7 @@ static int cifs_show_stats(struct seq_file *s, struct dentry *root)
 static int cifs_remount(struct super_block *sb, int *flags, char *data)
 {
 	sync_filesystem(sb);
-	*flags |= MS_NODIRATIME;
+	*flags |= SB_NODIRATIME;
 	return 0;
 }
 
@@ -708,7 +708,7 @@ cifs_do_mount(struct file_system_type *fs_type,
 
 	rc = cifs_mount(cifs_sb, volume_info);
 	if (rc) {
-		if (!(flags & MS_SILENT))
+		if (!(flags & SB_SILENT))
 			cifs_dbg(VFS, "cifs_mount failed w/return code = %d\n",
 				 rc);
 		root = ERR_PTR(rc);
@@ -720,7 +720,7 @@ cifs_do_mount(struct file_system_type *fs_type,
 	mnt_data.flags = flags;
 
 	/* BB should we make this contingent on mount parm? */
-	flags |= MS_NODIRATIME | MS_NOATIME;
+	flags |= SB_NODIRATIME | SB_NOATIME;
 
 	sb = sget(fs_type, cifs_match_super, cifs_set_super, flags, &mnt_data);
 	if (IS_ERR(sb)) {
@@ -739,7 +739,7 @@ cifs_do_mount(struct file_system_type *fs_type,
 			goto out_super;
 		}
 
-		sb->s_flags |= MS_ACTIVE;
+		sb->s_flags |= SB_ACTIVE;
 	}
 
 	root = cifs_get_root(volume_info, sb);
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index e185b2853eab..b16583594d1a 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -559,8 +559,8 @@ struct smb_vol {
 			 CIFS_MOUNT_MULTIUSER | CIFS_MOUNT_STRICT_IO | \
 			 CIFS_MOUNT_CIFS_BACKUPUID | CIFS_MOUNT_CIFS_BACKUPGID)
 
-#define CIFS_MS_MASK (MS_RDONLY | MS_MANDLOCK | MS_NOEXEC | MS_NOSUID | \
-		      MS_NODEV | MS_SYNCHRONOUS)
+#define CIFS_MS_MASK (SB_RDONLY | SB_MANDLOCK | SB_NOEXEC | SB_NOSUID | \
+		      SB_NODEV | SB_SYNCHRONOUS)
 
 struct cifs_mnt_data {
 	struct cifs_sb_info *cifs_sb;
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 7c732cb44164..ecb99079363a 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -985,7 +985,7 @@ retry_iget5_locked:
 		}
 
 		cifs_fattr_to_inode(inode, fattr);
-		if (sb->s_flags & MS_NOATIME)
+		if (sb->s_flags & SB_NOATIME)
 			inode->i_flags |= S_NOATIME | S_NOCMTIME;
 		if (inode->i_state & I_NEW) {
 			inode->i_ino = hash;
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index 52f975d848a0..316af84674f1 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -117,7 +117,7 @@ static int cifs_xattr_set(const struct xattr_handler *handler,
 #ifdef CONFIG_CIFS_POSIX
 		if (!value)
 			goto out;
-		if (sb->s_flags & MS_POSIXACL)
+		if (sb->s_flags & SB_POSIXACL)
 			rc = CIFSSMBSetPosixACL(xid, pTcon, full_path,
 				value, (const int)size,
 				ACL_TYPE_ACCESS, cifs_sb->local_nls,
@@ -129,7 +129,7 @@ static int cifs_xattr_set(const struct xattr_handler *handler,
 #ifdef CONFIG_CIFS_POSIX
 		if (!value)
 			goto out;
-		if (sb->s_flags & MS_POSIXACL)
+		if (sb->s_flags & SB_POSIXACL)
 			rc = CIFSSMBSetPosixACL(xid, pTcon, full_path,
 				value, (const int)size,
 				ACL_TYPE_DEFAULT, cifs_sb->local_nls,
@@ -266,7 +266,7 @@ static int cifs_xattr_get(const struct xattr_handler *handler,
 
 	case XATTR_ACL_ACCESS:
 #ifdef CONFIG_CIFS_POSIX
-		if (sb->s_flags & MS_POSIXACL)
+		if (sb->s_flags & SB_POSIXACL)
 			rc = CIFSSMBGetPosixACL(xid, pTcon, full_path,
 				value, size, ACL_TYPE_ACCESS,
 				cifs_sb->local_nls,
@@ -276,7 +276,7 @@ static int cifs_xattr_get(const struct xattr_handler *handler,
 
 	case XATTR_ACL_DEFAULT:
 #ifdef CONFIG_CIFS_POSIX
-		if (sb->s_flags & MS_POSIXACL)
+		if (sb->s_flags & SB_POSIXACL)
 			rc = CIFSSMBGetPosixACL(xid, pTcon, full_path,
 				value, size, ACL_TYPE_DEFAULT,
 				cifs_sb->local_nls,
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 6f0a6a4d5faa..97424cf206c0 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -96,7 +96,7 @@ void coda_destroy_inodecache(void)
 static int coda_remount(struct super_block *sb, int *flags, char *data)
 {
 	sync_filesystem(sb);
-	*flags |= MS_NOATIME;
+	*flags |= SB_NOATIME;
 	return 0;
 }
 
@@ -188,7 +188,7 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
 	mutex_unlock(&vc->vc_mutex);
 
 	sb->s_fs_info = vc;
-	sb->s_flags |= MS_NOATIME;
+	sb->s_flags |= SB_NOATIME;
 	sb->s_blocksize = 4096;	/* XXXXX  what do we put here?? */
 	sb->s_blocksize_bits = 12;
 	sb->s_magic = CODA_SUPER_MAGIC;
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 9a2ab419ba62..017b0ab19bc4 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -505,7 +505,7 @@ static void cramfs_kill_sb(struct super_block *sb)
 static int cramfs_remount(struct super_block *sb, int *flags, char *data)
 {
 	sync_filesystem(sb);
-	*flags |= MS_RDONLY;
+	*flags |= SB_RDONLY;
 	return 0;
 }
 
@@ -592,7 +592,7 @@ static int cramfs_finalize_super(struct super_block *sb,
 	struct inode *root;
 
 	/* Set it all up.. */
-	sb->s_flags |= MS_RDONLY;
+	sb->s_flags |= SB_RDONLY;
 	sb->s_op = &cramfs_ops;
 	root = get_cramfs_inode(sb, cramfs_root, 0);
 	if (IS_ERR(root))
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index f2677c90d96e..025d66a705db 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -560,8 +560,8 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags
 	 * Set the POSIX ACL flag based on whether they're enabled in the lower
 	 * mount.
 	 */
-	s->s_flags = flags & ~MS_POSIXACL;
-	s->s_flags |= path.dentry->d_sb->s_flags & MS_POSIXACL;
+	s->s_flags = flags & ~SB_POSIXACL;
+	s->s_flags |= path.dentry->d_sb->s_flags & SB_POSIXACL;
 
 	/**
 	 * Force a read-only eCryptfs mount when:
@@ -569,7 +569,7 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags
 	 *   2) The ecryptfs_encrypted_view mount option is specified
 	 */
 	if (sb_rdonly(path.dentry->d_sb) || mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED)
-		s->s_flags |= MS_RDONLY;
+		s->s_flags |= SB_RDONLY;
 
 	s->s_maxbytes = path.dentry->d_sb->s_maxbytes;
 	s->s_blocksize = path.dentry->d_sb->s_blocksize;
@@ -602,7 +602,7 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags
 	ecryptfs_set_dentry_private(s->s_root, root_info);
 	root_info->lower_path = path;
 
-	s->s_flags |= MS_ACTIVE;
+	s->s_flags |= SB_ACTIVE;
 	return dget(s->s_root);
 
 out_free:
diff --git a/fs/efs/super.c b/fs/efs/super.c
index 65b59009555b..6ffb7ba1547a 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -116,7 +116,7 @@ static void destroy_inodecache(void)
 static int efs_remount(struct super_block *sb, int *flags, char *data)
 {
 	sync_filesystem(sb);
-	*flags |= MS_RDONLY;
+	*flags |= SB_RDONLY;
 	return 0;
 }
 
@@ -311,7 +311,7 @@ static int efs_fill_super(struct super_block *s, void *d, int silent)
 #ifdef DEBUG
 		pr_info("forcing read-only mode\n");
 #endif
-		s->s_flags |= MS_RDONLY;
+		s->s_flags |= SB_RDONLY;
 	}
 	s->s_op   = &efs_superblock_operations;
 	s->s_export_op = &efs_export_ops;
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index e1b3724bebf2..33db13365c5e 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -548,7 +548,7 @@ do_more:
 	}
 
 	mark_buffer_dirty(bitmap_bh);
-	if (sb->s_flags & MS_SYNCHRONOUS)
+	if (sb->s_flags & SB_SYNCHRONOUS)
 		sync_dirty_buffer(bitmap_bh);
 
 	group_adjust_blocks(sb, block_group, desc, bh2, group_freed);
@@ -1424,7 +1424,7 @@ allocated:
 	percpu_counter_sub(&sbi->s_freeblocks_counter, num);
 
 	mark_buffer_dirty(bitmap_bh);
-	if (sb->s_flags & MS_SYNCHRONOUS)
+	if (sb->s_flags & SB_SYNCHRONOUS)
 		sync_dirty_buffer(bitmap_bh);
 
 	*errp = 0;
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index a1fc3dabca41..6484199b35d1 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -145,7 +145,7 @@ void ext2_free_inode (struct inode * inode)
 	else
 		ext2_release_inode(sb, block_group, is_directory);
 	mark_buffer_dirty(bitmap_bh);
-	if (sb->s_flags & MS_SYNCHRONOUS)
+	if (sb->s_flags & SB_SYNCHRONOUS)
 		sync_dirty_buffer(bitmap_bh);
 
 	brelse(bitmap_bh);
@@ -517,7 +517,7 @@ repeat_in_this_group:
 	goto fail;
 got:
 	mark_buffer_dirty(bitmap_bh);
-	if (sb->s_flags & MS_SYNCHRONOUS)
+	if (sb->s_flags & SB_SYNCHRONOUS)
 		sync_dirty_buffer(bitmap_bh);
 	brelse(bitmap_bh);
 
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index e2b6be03e69b..7646818ab266 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -75,7 +75,7 @@ void ext2_error(struct super_block *sb, const char *function,
 	if (test_opt(sb, ERRORS_RO)) {
 		ext2_msg(sb, KERN_CRIT,
 			     "error: remounting filesystem read-only");
-		sb->s_flags |= MS_RDONLY;
+		sb->s_flags |= SB_RDONLY;
 	}
 }
 
@@ -656,7 +656,7 @@ static int ext2_setup_super (struct super_block * sb,
 		ext2_msg(sb, KERN_ERR,
 			"error: revision level too high, "
 			"forcing read-only mode");
-		res = MS_RDONLY;
+		res = SB_RDONLY;
 	}
 	if (read_only)
 		return res;
@@ -924,9 +924,9 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 	sbi->s_resuid = opts.s_resuid;
 	sbi->s_resgid = opts.s_resgid;
 
-	sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
+	sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
 		((EXT2_SB(sb)->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ?
-		 MS_POSIXACL : 0);
+		 SB_POSIXACL : 0);
 	sb->s_iflags |= SB_I_CGROUPWB;
 
 	if (le32_to_cpu(es->s_rev_level) == EXT2_GOOD_OLD_REV &&
@@ -1178,7 +1178,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 		ext2_msg(sb, KERN_WARNING,
 			"warning: mounting ext3 filesystem as ext2");
 	if (ext2_setup_super (sb, es, sb_rdonly(sb)))
-		sb->s_flags |= MS_RDONLY;
+		sb->s_flags |= SB_RDONLY;
 	ext2_write_super(sb);
 	return 0;
 
@@ -1341,9 +1341,9 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
 			 "dax flag with busy inodes while remounting");
 		new_opts.s_mount_opt ^= EXT2_MOUNT_DAX;
 	}
-	if ((bool)(*flags & MS_RDONLY) == sb_rdonly(sb))
+	if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb))
 		goto out_set;
-	if (*flags & MS_RDONLY) {
+	if (*flags & SB_RDONLY) {
 		if (le16_to_cpu(es->s_state) & EXT2_VALID_FS ||
 		    !(sbi->s_mount_state & EXT2_VALID_FS))
 			goto out_set;
@@ -1379,7 +1379,7 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
 		 */
 		sbi->s_mount_state = le16_to_cpu(es->s_state);
 		if (!ext2_setup_super (sb, es, 0))
-			sb->s_flags &= ~MS_RDONLY;
+			sb->s_flags &= ~SB_RDONLY;
 		spin_unlock(&sbi->s_lock);
 
 		ext2_write_super(sb);
@@ -1392,8 +1392,8 @@ out_set:
 	sbi->s_mount_opt = new_opts.s_mount_opt;
 	sbi->s_resuid = new_opts.s_resuid;
 	sbi->s_resgid = new_opts.s_resgid;
-	sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
-		((sbi->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
+	sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
+		((sbi->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ? SB_POSIXACL : 0);
 	spin_unlock(&sbi->s_lock);
 
 	return 0;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 0992d76f7ab1..7df2c5644e59 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2742,7 +2742,7 @@ static int ext4_writepages(struct address_space *mapping,
 	 * If the filesystem has aborted, it is read-only, so return
 	 * right away instead of dumping stack traces later on that
 	 * will obscure the real source of the problem.  We test
-	 * EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because
+	 * EXT4_MF_FS_ABORTED instead of sb->s_flag's SB_RDONLY because
 	 * the latter could be true if the filesystem is mounted
 	 * read-only, and in that case, ext4_writepages should
 	 * *never* be called, so if that ever happens, we would want
@@ -5183,7 +5183,7 @@ static int ext4_do_update_inode(handle_t *handle,
 
 	ext4_inode_csum_set(inode, raw_inode, ei);
 	spin_unlock(&ei->i_raw_lock);
-	if (inode->i_sb->s_flags & MS_LAZYTIME)
+	if (inode->i_sb->s_flags & SB_LAZYTIME)
 		ext4_update_other_inodes_time(inode->i_sb, inode->i_ino,
 					      bh->b_data);
 
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 0556cd036b69..7c46693a14d7 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -422,7 +422,7 @@ static void ext4_handle_error(struct super_block *sb)
 		 * before ->s_flags update
 		 */
 		smp_wmb();
-		sb->s_flags |= MS_RDONLY;
+		sb->s_flags |= SB_RDONLY;
 	}
 	if (test_opt(sb, ERRORS_PANIC)) {
 		if (EXT4_SB(sb)->s_journal &&
@@ -635,7 +635,7 @@ void __ext4_abort(struct super_block *sb, const char *function,
 		 * before ->s_flags update
 		 */
 		smp_wmb();
-		sb->s_flags |= MS_RDONLY;
+		sb->s_flags |= SB_RDONLY;
 		if (EXT4_SB(sb)->s_journal)
 			jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
 		save_error_info(sb, function, line);
@@ -1682,10 +1682,10 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
 		sb->s_flags |= SB_I_VERSION;
 		return 1;
 	case Opt_lazytime:
-		sb->s_flags |= MS_LAZYTIME;
+		sb->s_flags |= SB_LAZYTIME;
 		return 1;
 	case Opt_nolazytime:
-		sb->s_flags &= ~MS_LAZYTIME;
+		sb->s_flags &= ~SB_LAZYTIME;
 		return 1;
 	}
 
@@ -2116,7 +2116,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
 	if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) {
 		ext4_msg(sb, KERN_ERR, "revision level too high, "
 			 "forcing read-only mode");
-		res = MS_RDONLY;
+		res = SB_RDONLY;
 	}
 	if (read_only)
 		goto done;
@@ -2429,7 +2429,7 @@ static void ext4_orphan_cleanup(struct super_block *sb,
 
 	if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
 		/* don't clear list on RO mount w/ errors */
-		if (es->s_last_orphan && !(s_flags & MS_RDONLY)) {
+		if (es->s_last_orphan && !(s_flags & SB_RDONLY)) {
 			ext4_msg(sb, KERN_INFO, "Errors on filesystem, "
 				  "clearing orphan list.\n");
 			es->s_last_orphan = 0;
@@ -2438,19 +2438,19 @@ static void ext4_orphan_cleanup(struct super_block *sb,
 		return;
 	}
 
-	if (s_flags & MS_RDONLY) {
+	if (s_flags & SB_RDONLY) {
 		ext4_msg(sb, KERN_INFO, "orphan cleanup on readonly fs");
-		sb->s_flags &= ~MS_RDONLY;
+		sb->s_flags &= ~SB_RDONLY;
 	}
 #ifdef CONFIG_QUOTA
 	/* Needed for iput() to work correctly and not trash data */
-	sb->s_flags |= MS_ACTIVE;
+	sb->s_flags |= SB_ACTIVE;
 
 	/*
 	 * Turn on quotas which were not enabled for read-only mounts if
 	 * filesystem has quota feature, so that they are updated correctly.
 	 */
-	if (ext4_has_feature_quota(sb) && (s_flags & MS_RDONLY)) {
+	if (ext4_has_feature_quota(sb) && (s_flags & SB_RDONLY)) {
 		int ret = ext4_enable_quotas(sb);
 
 		if (!ret)
@@ -2539,7 +2539,7 @@ static void ext4_orphan_cleanup(struct super_block *sb,
 		}
 	}
 #endif
-	sb->s_flags = s_flags; /* Restore MS_RDONLY status */
+	sb->s_flags = s_flags; /* Restore SB_RDONLY status */
 }
 
 /*
@@ -2741,7 +2741,7 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly)
 
 	if (ext4_has_feature_readonly(sb)) {
 		ext4_msg(sb, KERN_INFO, "filesystem is read-only");
-		sb->s_flags |= MS_RDONLY;
+		sb->s_flags |= SB_RDONLY;
 		return 1;
 	}
 
@@ -3623,8 +3623,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		sb->s_iflags |= SB_I_CGROUPWB;
 	}
 
-	sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
-		(test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
+	sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
+		(test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0);
 
 	if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV &&
 	    (ext4_has_compat_features(sb) ||
@@ -4199,7 +4199,7 @@ no_journal:
 	}
 
 	if (ext4_setup_super(sb, es, sb_rdonly(sb)))
-		sb->s_flags |= MS_RDONLY;
+		sb->s_flags |= SB_RDONLY;
 
 	/* determine the minimum size of new large inodes, if present */
 	if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE &&
@@ -4693,7 +4693,7 @@ static int ext4_commit_super(struct super_block *sb, int sync)
 	 * the clock is set in the future, and this will cause e2fsck
 	 * to complain and force a full file system check.
 	 */
-	if (!(sb->s_flags & MS_RDONLY))
+	if (!(sb->s_flags & SB_RDONLY))
 		es->s_wtime = cpu_to_le32(get_seconds());
 	if (sb->s_bdev->bd_part)
 		es->s_kbytes_written =
@@ -5047,8 +5047,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 	if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED)
 		ext4_abort(sb, "Abort forced by user");
 
-	sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
-		(test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
+	sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
+		(test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0);
 
 	es = sbi->s_es;
 
@@ -5057,16 +5057,16 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 		set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
 	}
 
-	if (*flags & MS_LAZYTIME)
-		sb->s_flags |= MS_LAZYTIME;
+	if (*flags & SB_LAZYTIME)
+		sb->s_flags |= SB_LAZYTIME;
 
-	if ((bool)(*flags & MS_RDONLY) != sb_rdonly(sb)) {
+	if ((bool)(*flags & SB_RDONLY) != sb_rdonly(sb)) {
 		if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) {
 			err = -EROFS;
 			goto restore_opts;
 		}
 
-		if (*flags & MS_RDONLY) {
+		if (*flags & SB_RDONLY) {
 			err = sync_filesystem(sb);
 			if (err < 0)
 				goto restore_opts;
@@ -5078,7 +5078,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 			 * First of all, the unconditional stuff we have to do
 			 * to disable replay of the journal when we next remount
 			 */
-			sb->s_flags |= MS_RDONLY;
+			sb->s_flags |= SB_RDONLY;
 
 			/*
 			 * OK, test if we are remounting a valid rw partition
@@ -5140,7 +5140,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 				ext4_clear_journal_err(sb, es);
 			sbi->s_mount_state = le16_to_cpu(es->s_state);
 			if (!ext4_setup_super(sb, es, 0))
-				sb->s_flags &= ~MS_RDONLY;
+				sb->s_flags &= ~SB_RDONLY;
 			if (ext4_has_feature_mmp(sb))
 				if (ext4_multi_mount_protect(sb,
 						le64_to_cpu(es->s_mmp_block))) {
@@ -5164,7 +5164,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 	}
 
 	ext4_setup_system_zone(sb);
-	if (sbi->s_journal == NULL && !(old_sb_flags & MS_RDONLY))
+	if (sbi->s_journal == NULL && !(old_sb_flags & SB_RDONLY))
 		ext4_commit_super(sb, 1);
 
 #ifdef CONFIG_QUOTA
@@ -5182,7 +5182,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 	}
 #endif
 
-	*flags = (*flags & ~MS_LAZYTIME) | (sb->s_flags & MS_LAZYTIME);
+	*flags = (*flags & ~SB_LAZYTIME) | (sb->s_flags & SB_LAZYTIME);
 	ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s", orig_data);
 	kfree(orig_data);
 	return 0;
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index dd2e73e10857..4aa69bc1c70a 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -617,17 +617,17 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi)
 	if (!is_set_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG))
 		return 0;
 
-	if (s_flags & MS_RDONLY) {
+	if (s_flags & SB_RDONLY) {
 		f2fs_msg(sbi->sb, KERN_INFO, "orphan cleanup on readonly fs");
-		sbi->sb->s_flags &= ~MS_RDONLY;
+		sbi->sb->s_flags &= ~SB_RDONLY;
 	}
 
 #ifdef CONFIG_QUOTA
 	/* Needed for iput() to work correctly and not trash data */
-	sbi->sb->s_flags |= MS_ACTIVE;
+	sbi->sb->s_flags |= SB_ACTIVE;
 
 	/* Turn on quotas so that they are updated correctly */
-	quota_enabled = f2fs_enable_quota_files(sbi, s_flags & MS_RDONLY);
+	quota_enabled = f2fs_enable_quota_files(sbi, s_flags & SB_RDONLY);
 #endif
 
 	start_blk = __start_cp_addr(sbi) + 1 + __cp_payload(sbi);
@@ -658,7 +658,7 @@ out:
 	if (quota_enabled)
 		f2fs_quota_off_umount(sbi->sb);
 #endif
-	sbi->sb->s_flags = s_flags; /* Restore MS_RDONLY status */
+	sbi->sb->s_flags = s_flags; /* Restore SB_RDONLY status */
 
 	return err;
 }
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index f4e094e816c6..6abf26c31d01 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -2378,7 +2378,7 @@ static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync)
 
 static inline int f2fs_readonly(struct super_block *sb)
 {
-	return sb->s_flags & MS_RDONLY;
+	return sb->s_flags & SB_RDONLY;
 }
 
 static inline bool f2fs_cp_error(struct f2fs_sb_info *sbi)
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 5d5bba462f26..d844dcb80570 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -1005,7 +1005,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync,
 
 	cpc.reason = __get_cp_reason(sbi);
 gc_more:
-	if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE))) {
+	if (unlikely(!(sbi->sb->s_flags & SB_ACTIVE))) {
 		ret = -EINVAL;
 		goto stop;
 	}
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 92c57ace1939..b3a14b0429f2 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -598,16 +598,16 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only)
 	int quota_enabled;
 #endif
 
-	if (s_flags & MS_RDONLY) {
+	if (s_flags & SB_RDONLY) {
 		f2fs_msg(sbi->sb, KERN_INFO, "orphan cleanup on readonly fs");
-		sbi->sb->s_flags &= ~MS_RDONLY;
+		sbi->sb->s_flags &= ~SB_RDONLY;
 	}
 
 #ifdef CONFIG_QUOTA
 	/* Needed for iput() to work correctly and not trash data */
-	sbi->sb->s_flags |= MS_ACTIVE;
+	sbi->sb->s_flags |= SB_ACTIVE;
 	/* Turn on quotas so that they are updated correctly */
-	quota_enabled = f2fs_enable_quota_files(sbi, s_flags & MS_RDONLY);
+	quota_enabled = f2fs_enable_quota_files(sbi, s_flags & SB_RDONLY);
 #endif
 
 	fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry",
@@ -671,7 +671,7 @@ out:
 	if (quota_enabled)
 		f2fs_quota_off_umount(sbi->sb);
 #endif
-	sbi->sb->s_flags = s_flags; /* Restore MS_RDONLY status */
+	sbi->sb->s_flags = s_flags; /* Restore SB_RDONLY status */
 
 	return ret ? ret: err;
 }
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index a6c5dd450002..708155d9c2e4 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -534,10 +534,10 @@ static int parse_options(struct super_block *sb, char *options)
 #endif
 			break;
 		case Opt_lazytime:
-			sb->s_flags |= MS_LAZYTIME;
+			sb->s_flags |= SB_LAZYTIME;
 			break;
 		case Opt_nolazytime:
-			sb->s_flags &= ~MS_LAZYTIME;
+			sb->s_flags &= ~SB_LAZYTIME;
 			break;
 #ifdef CONFIG_QUOTA
 		case Opt_quota:
@@ -1168,7 +1168,7 @@ static void default_options(struct f2fs_sb_info *sbi)
 	set_opt(sbi, INLINE_DENTRY);
 	set_opt(sbi, EXTENT_CACHE);
 	set_opt(sbi, NOHEAP);
-	sbi->sb->s_flags |= MS_LAZYTIME;
+	sbi->sb->s_flags |= SB_LAZYTIME;
 	set_opt(sbi, FLUSH_MERGE);
 	if (f2fs_sb_mounted_blkzoned(sbi->sb)) {
 		set_opt_mode(sbi, F2FS_MOUNT_LFS);
@@ -1236,7 +1236,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
 #endif
 
 	/* recover superblocks we couldn't write due to previous RO mount */
-	if (!(*flags & MS_RDONLY) && is_sbi_flag_set(sbi, SBI_NEED_SB_WRITE)) {
+	if (!(*flags & SB_RDONLY) && is_sbi_flag_set(sbi, SBI_NEED_SB_WRITE)) {
 		err = f2fs_commit_super(sbi, false);
 		f2fs_msg(sb, KERN_INFO,
 			"Try to recover all the superblocks, ret: %d", err);
@@ -1255,17 +1255,17 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
 	 * Previous and new state of filesystem is RO,
 	 * so skip checking GC and FLUSH_MERGE conditions.
 	 */
-	if (f2fs_readonly(sb) && (*flags & MS_RDONLY))
+	if (f2fs_readonly(sb) && (*flags & SB_RDONLY))
 		goto skip;
 
 #ifdef CONFIG_QUOTA
-	if (!f2fs_readonly(sb) && (*flags & MS_RDONLY)) {
+	if (!f2fs_readonly(sb) && (*flags & SB_RDONLY)) {
 		err = dquot_suspend(sb, -1);
 		if (err < 0)
 			goto restore_opts;
 	} else {
 		/* dquot_resume needs RW */
-		sb->s_flags &= ~MS_RDONLY;
+		sb->s_flags &= ~SB_RDONLY;
 		if (sb_any_quota_suspended(sb)) {
 			dquot_resume(sb, -1);
 		} else if (f2fs_sb_has_quota_ino(sb)) {
@@ -1288,7 +1288,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
 	 * or if background_gc = off is passed in mount
 	 * option. Also sync the filesystem.
 	 */
-	if ((*flags & MS_RDONLY) || !test_opt(sbi, BG_GC)) {
+	if ((*flags & SB_RDONLY) || !test_opt(sbi, BG_GC)) {
 		if (sbi->gc_thread) {
 			stop_gc_thread(sbi);
 			need_restart_gc = true;
@@ -1300,7 +1300,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
 		need_stop_gc = true;
 	}
 
-	if (*flags & MS_RDONLY) {
+	if (*flags & SB_RDONLY) {
 		writeback_inodes_sb(sb, WB_REASON_SYNC);
 		sync_inodes_sb(sb);
 
@@ -1314,7 +1314,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
 	 * We stop issue flush thread if FS is mounted as RO
 	 * or if flush_merge is not passed in mount option.
 	 */
-	if ((*flags & MS_RDONLY) || !test_opt(sbi, FLUSH_MERGE)) {
+	if ((*flags & SB_RDONLY) || !test_opt(sbi, FLUSH_MERGE)) {
 		clear_opt(sbi, FLUSH_MERGE);
 		destroy_flush_cmd_control(sbi, false);
 	} else {
@@ -1329,8 +1329,8 @@ skip:
 		kfree(s_qf_names[i]);
 #endif
 	/* Update the POSIXACL Flag */
-	sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
-		(test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0);
+	sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
+		(test_opt(sbi, POSIX_ACL) ? SB_POSIXACL : 0);
 
 	return 0;
 restore_gc:
@@ -2472,8 +2472,8 @@ try_onemore:
 	sb->s_export_op = &f2fs_export_ops;
 	sb->s_magic = F2FS_SUPER_MAGIC;
 	sb->s_time_gran = 1;
-	sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
-		(test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0);
+	sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
+		(test_opt(sbi, POSIX_ACL) ? SB_POSIXACL : 0);
 	memcpy(&sb->s_uuid, raw_super->uuid, sizeof(raw_super->uuid));
 
 	/* init f2fs-specific super block info */
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index 48b2336692f9..bac10de678cc 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -392,7 +392,7 @@ static int fat_mirror_bhs(struct super_block *sb, struct buffer_head **bhs,
 			memcpy(c_bh->b_data, bhs[n]->b_data, sb->s_blocksize);
 			set_buffer_uptodate(c_bh);
 			mark_buffer_dirty_inode(c_bh, sbi->fat_inode);
-			if (sb->s_flags & MS_SYNCHRONOUS)
+			if (sb->s_flags & SB_SYNCHRONOUS)
 				err = sync_dirty_buffer(c_bh);
 			brelse(c_bh);
 			if (err)
@@ -597,7 +597,7 @@ int fat_free_clusters(struct inode *inode, int cluster)
 		}
 
 		if (nr_bhs + fatent.nr_bhs > MAX_BUF_PER_PAGE) {
-			if (sb->s_flags & MS_SYNCHRONOUS) {
+			if (sb->s_flags & SB_SYNCHRONOUS) {
 				err = fat_sync_bhs(bhs, nr_bhs);
 				if (err)
 					goto error;
@@ -612,7 +612,7 @@ int fat_free_clusters(struct inode *inode, int cluster)
 		fat_collect_bhs(bhs, &nr_bhs, &fatent);
 	} while (cluster != FAT_ENT_EOF);
 
-	if (sb->s_flags & MS_SYNCHRONOUS) {
+	if (sb->s_flags & SB_SYNCHRONOUS) {
 		err = fat_sync_bhs(bhs, nr_bhs);
 		if (err)
 			goto error;
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 30c52394a7ad..016c46b5e44c 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -781,12 +781,12 @@ static int fat_remount(struct super_block *sb, int *flags, char *data)
 {
 	int new_rdonly;
 	struct msdos_sb_info *sbi = MSDOS_SB(sb);
-	*flags |= MS_NODIRATIME | (sbi->options.isvfat ? 0 : MS_NOATIME);
+	*flags |= SB_NODIRATIME | (sbi->options.isvfat ? 0 : SB_NOATIME);
 
 	sync_filesystem(sb);
 
 	/* make sure we update state on remount. */
-	new_rdonly = *flags & MS_RDONLY;
+	new_rdonly = *flags & SB_RDONLY;
 	if (new_rdonly != sb_rdonly(sb)) {
 		if (new_rdonly)
 			fat_set_state(sb, 0, 0);
@@ -1352,7 +1352,7 @@ out:
 	if (opts->unicode_xlate)
 		opts->utf8 = 0;
 	if (opts->nfs == FAT_NFS_NOSTALE_RO) {
-		sb->s_flags |= MS_RDONLY;
+		sb->s_flags |= SB_RDONLY;
 		sb->s_export_op = &fat_export_ops_nostale;
 	}
 
@@ -1608,7 +1608,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
 		return -ENOMEM;
 	sb->s_fs_info = sbi;
 
-	sb->s_flags |= MS_NODIRATIME;
+	sb->s_flags |= SB_NODIRATIME;
 	sb->s_magic = MSDOS_SUPER_MAGIC;
 	sb->s_op = &fat_sops;
 	sb->s_export_op = &fat_export_ops;
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index acc3aa30ee54..f9bdc1e01c98 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -33,7 +33,7 @@ void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...)
 	if (opts->errors == FAT_ERRORS_PANIC)
 		panic("FAT-fs (%s): fs panic from previous error\n", sb->s_id);
 	else if (opts->errors == FAT_ERRORS_RO && !sb_rdonly(sb)) {
-		sb->s_flags |= MS_RDONLY;
+		sb->s_flags |= SB_RDONLY;
 		fat_msg(sb, KERN_ERR, "Filesystem has been set read-only");
 	}
 }
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index 7d6a105d601b..d24d2758a363 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -646,7 +646,7 @@ static void setup(struct super_block *sb)
 {
 	MSDOS_SB(sb)->dir_ops = &msdos_dir_inode_operations;
 	sb->s_d_op = &msdos_dentry_operations;
-	sb->s_flags |= MS_NOATIME;
+	sb->s_flags |= SB_NOATIME;
 }
 
 static int msdos_fill_super(struct super_block *sb, void *data, int silent)
diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c
index 455ce5b77e9b..f989efa051a0 100644
--- a/fs/freevxfs/vxfs_super.c
+++ b/fs/freevxfs/vxfs_super.c
@@ -116,7 +116,7 @@ vxfs_statfs(struct dentry *dentry, struct kstatfs *bufp)
 static int vxfs_remount(struct super_block *sb, int *flags, char *data)
 {
 	sync_filesystem(sb);
-	*flags |= MS_RDONLY;
+	*flags |= SB_RDONLY;
 	return 0;
 }
 
@@ -220,7 +220,7 @@ static int vxfs_fill_super(struct super_block *sbp, void *dp, int silent)
 	int ret = -EINVAL;
 	u32 j;
 
-	sbp->s_flags |= MS_RDONLY;
+	sbp->s_flags |= SB_RDONLY;
 
 	infp = kzalloc(sizeof(*infp), GFP_KERNEL);
 	if (!infp) {
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 08f5debd07d1..cea4836385b7 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -490,7 +490,7 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
 
 	/* while holding I_WB_SWITCH, no one else can update the association */
 	spin_lock(&inode->i_lock);
-	if (!(inode->i_sb->s_flags & MS_ACTIVE) ||
+	if (!(inode->i_sb->s_flags & SB_ACTIVE) ||
 	    inode->i_state & (I_WB_SWITCH | I_FREEING) ||
 	    inode_to_wb(inode) == isw->new_wb) {
 		spin_unlock(&inode->i_lock);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 2f504d615d92..624f18bbfd2b 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -130,7 +130,7 @@ static void fuse_evict_inode(struct inode *inode)
 {
 	truncate_inode_pages_final(&inode->i_data);
 	clear_inode(inode);
-	if (inode->i_sb->s_flags & MS_ACTIVE) {
+	if (inode->i_sb->s_flags & SB_ACTIVE) {
 		struct fuse_conn *fc = get_fuse_conn(inode);
 		struct fuse_inode *fi = get_fuse_inode(inode);
 		fuse_queue_forget(fc, fi->forget, fi->nodeid, fi->nlookup);
@@ -141,7 +141,7 @@ static void fuse_evict_inode(struct inode *inode)
 static int fuse_remount_fs(struct super_block *sb, int *flags, char *data)
 {
 	sync_filesystem(sb);
-	if (*flags & MS_MANDLOCK)
+	if (*flags & SB_MANDLOCK)
 		return -EINVAL;
 
 	return 0;
@@ -1056,10 +1056,10 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 	int is_bdev = sb->s_bdev != NULL;
 
 	err = -EINVAL;
-	if (sb->s_flags & MS_MANDLOCK)
+	if (sb->s_flags & SB_MANDLOCK)
 		goto err;
 
-	sb->s_flags &= ~(MS_NOSEC | SB_I_VERSION);
+	sb->s_flags &= ~(SB_NOSEC | SB_I_VERSION);
 
 	if (!parse_fuse_opt(data, &d, is_bdev))
 		goto err;
@@ -1109,9 +1109,9 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 		goto err_dev_free;
 
 	/* Handle umasking inside the fuse code */
-	if (sb->s_flags & MS_POSIXACL)
+	if (sb->s_flags & SB_POSIXACL)
 		fc->dont_mask = 1;
-	sb->s_flags |= MS_POSIXACL;
+	sb->s_flags |= SB_POSIXACL;
 
 	fc->default_permissions = d.default_permissions;
 	fc->allow_other = d.allow_other;
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index a3711f543405..ad55eb86a250 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1065,15 +1065,15 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
 	sdp->sd_args = *args;
 
 	if (sdp->sd_args.ar_spectator) {
-                sb->s_flags |= MS_RDONLY;
+                sb->s_flags |= SB_RDONLY;
 		set_bit(SDF_RORECOVERY, &sdp->sd_flags);
 	}
 	if (sdp->sd_args.ar_posix_acl)
-		sb->s_flags |= MS_POSIXACL;
+		sb->s_flags |= SB_POSIXACL;
 	if (sdp->sd_args.ar_nobarrier)
 		set_bit(SDF_NOBARRIERS, &sdp->sd_flags);
 
-	sb->s_flags |= MS_NOSEC;
+	sb->s_flags |= SB_NOSEC;
 	sb->s_magic = GFS2_MAGIC;
 	sb->s_op = &gfs2_super_ops;
 	sb->s_d_op = &gfs2_dops;
@@ -1257,7 +1257,7 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags,
 	struct gfs2_args args;
 	struct gfs2_sbd *sdp;
 
-	if (!(flags & MS_RDONLY))
+	if (!(flags & SB_RDONLY))
 		mode |= FMODE_WRITE;
 
 	bdev = blkdev_get_by_path(dev_name, mode, fs_type);
@@ -1313,15 +1313,15 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags,
 
 	if (s->s_root) {
 		error = -EBUSY;
-		if ((flags ^ s->s_flags) & MS_RDONLY)
+		if ((flags ^ s->s_flags) & SB_RDONLY)
 			goto error_super;
 	} else {
 		snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
 		sb_set_blocksize(s, block_size(bdev));
-		error = fill_super(s, &args, flags & MS_SILENT ? 1 : 0);
+		error = fill_super(s, &args, flags & SB_SILENT ? 1 : 0);
 		if (error)
 			goto error_super;
-		s->s_flags |= MS_ACTIVE;
+		s->s_flags |= SB_ACTIVE;
 		bdev->bd_super = s;
 	}
 
@@ -1365,7 +1365,7 @@ static struct dentry *gfs2_mount_meta(struct file_system_type *fs_type,
 		pr_warn("gfs2 mount does not exist\n");
 		return ERR_CAST(s);
 	}
-	if ((flags ^ s->s_flags) & MS_RDONLY) {
+	if ((flags ^ s->s_flags) & SB_RDONLY) {
 		deactivate_locked_super(s);
 		return ERR_PTR(-EBUSY);
 	}
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 9cb5c9a97d69..d81d46e19726 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1256,10 +1256,10 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
 		return -EINVAL;
 
 	if (sdp->sd_args.ar_spectator)
-		*flags |= MS_RDONLY;
+		*flags |= SB_RDONLY;
 
-	if ((sb->s_flags ^ *flags) & MS_RDONLY) {
-		if (*flags & MS_RDONLY)
+	if ((sb->s_flags ^ *flags) & SB_RDONLY) {
+		if (*flags & SB_RDONLY)
 			error = gfs2_make_fs_ro(sdp);
 		else
 			error = gfs2_make_fs_rw(sdp);
@@ -1269,9 +1269,9 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
 
 	sdp->sd_args = args;
 	if (sdp->sd_args.ar_posix_acl)
-		sb->s_flags |= MS_POSIXACL;
+		sb->s_flags |= SB_POSIXACL;
 	else
-		sb->s_flags &= ~MS_POSIXACL;
+		sb->s_flags &= ~SB_POSIXACL;
 	if (sdp->sd_args.ar_nobarrier)
 		set_bit(SDF_NOBARRIERS, &sdp->sd_flags);
 	else
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index a85ca8b2c9ba..ca8b72d0a831 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -117,7 +117,7 @@ void gfs2_trans_end(struct gfs2_sbd *sdp)
 		kfree(tr);
 	up_read(&sdp->sd_log_flush_lock);
 
-	if (sdp->sd_vfs->s_flags & MS_SYNCHRONOUS)
+	if (sdp->sd_vfs->s_flags & SB_SYNCHRONOUS)
 		gfs2_log_flush(sdp, NULL, NORMAL_FLUSH);
 	if (alloced)
 		sb_end_intwrite(sdp->sd_vfs);
diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c
index 894994d2c885..460281b1299e 100644
--- a/fs/hfs/mdb.c
+++ b/fs/hfs/mdb.c
@@ -204,11 +204,11 @@ int hfs_mdb_get(struct super_block *sb)
 	attrib = mdb->drAtrb;
 	if (!(attrib & cpu_to_be16(HFS_SB_ATTRIB_UNMNT))) {
 		pr_warn("filesystem was not cleanly unmounted, running fsck.hfs is recommended.  mounting read-only.\n");
-		sb->s_flags |= MS_RDONLY;
+		sb->s_flags |= SB_RDONLY;
 	}
 	if ((attrib & cpu_to_be16(HFS_SB_ATTRIB_SLOCK))) {
 		pr_warn("filesystem is marked locked, mounting read-only.\n");
-		sb->s_flags |= MS_RDONLY;
+		sb->s_flags |= SB_RDONLY;
 	}
 	if (!sb_rdonly(sb)) {
 		/* Mark the volume uncleanly unmounted in case we crash */
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 7e0d65e9586c..173876782f73 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -114,18 +114,18 @@ static int hfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 static int hfs_remount(struct super_block *sb, int *flags, char *data)
 {
 	sync_filesystem(sb);
-	*flags |= MS_NODIRATIME;
-	if ((bool)(*flags & MS_RDONLY) == sb_rdonly(sb))
+	*flags |= SB_NODIRATIME;
+	if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb))
 		return 0;
-	if (!(*flags & MS_RDONLY)) {
+	if (!(*flags & SB_RDONLY)) {
 		if (!(HFS_SB(sb)->mdb->drAtrb & cpu_to_be16(HFS_SB_ATTRIB_UNMNT))) {
 			pr_warn("filesystem was not cleanly unmounted, running fsck.hfs is recommended.  leaving read-only.\n");
-			sb->s_flags |= MS_RDONLY;
-			*flags |= MS_RDONLY;
+			sb->s_flags |= SB_RDONLY;
+			*flags |= SB_RDONLY;
 		} else if (HFS_SB(sb)->mdb->drAtrb & cpu_to_be16(HFS_SB_ATTRIB_SLOCK)) {
 			pr_warn("filesystem is marked locked, leaving read-only.\n");
-			sb->s_flags |= MS_RDONLY;
-			*flags |= MS_RDONLY;
+			sb->s_flags |= SB_RDONLY;
+			*flags |= SB_RDONLY;
 		}
 	}
 	return 0;
@@ -407,7 +407,7 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent)
 
 	sb->s_op = &hfs_super_operations;
 	sb->s_xattr = hfs_xattr_handlers;
-	sb->s_flags |= MS_NODIRATIME;
+	sb->s_flags |= SB_NODIRATIME;
 	mutex_init(&sbi->bitmap_lock);
 
 	res = hfs_mdb_get(sb);
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index e5bb2de2262a..1d458b716957 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -329,9 +329,9 @@ static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf)
 static int hfsplus_remount(struct super_block *sb, int *flags, char *data)
 {
 	sync_filesystem(sb);
-	if ((bool)(*flags & MS_RDONLY) == sb_rdonly(sb))
+	if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb))
 		return 0;
-	if (!(*flags & MS_RDONLY)) {
+	if (!(*flags & SB_RDONLY)) {
 		struct hfsplus_vh *vhdr = HFSPLUS_SB(sb)->s_vhdr;
 		int force = 0;
 
@@ -340,20 +340,20 @@ static int hfsplus_remount(struct super_block *sb, int *flags, char *data)
 
 		if (!(vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_UNMNT))) {
 			pr_warn("filesystem was not cleanly unmounted, running fsck.hfsplus is recommended.  leaving read-only.\n");
-			sb->s_flags |= MS_RDONLY;
-			*flags |= MS_RDONLY;
+			sb->s_flags |= SB_RDONLY;
+			*flags |= SB_RDONLY;
 		} else if (force) {
 			/* nothing */
 		} else if (vhdr->attributes &
 				cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) {
 			pr_warn("filesystem is marked locked, leaving read-only.\n");
-			sb->s_flags |= MS_RDONLY;
-			*flags |= MS_RDONLY;
+			sb->s_flags |= SB_RDONLY;
+			*flags |= SB_RDONLY;
 		} else if (vhdr->attributes &
 				cpu_to_be32(HFSPLUS_VOL_JOURNALED)) {
 			pr_warn("filesystem is marked journaled, leaving read-only.\n");
-			sb->s_flags |= MS_RDONLY;
-			*flags |= MS_RDONLY;
+			sb->s_flags |= SB_RDONLY;
+			*flags |= SB_RDONLY;
 		}
 	}
 	return 0;
@@ -455,16 +455,16 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 
 	if (!(vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_UNMNT))) {
 		pr_warn("Filesystem was not cleanly unmounted, running fsck.hfsplus is recommended.  mounting read-only.\n");
-		sb->s_flags |= MS_RDONLY;
+		sb->s_flags |= SB_RDONLY;
 	} else if (test_and_clear_bit(HFSPLUS_SB_FORCE, &sbi->flags)) {
 		/* nothing */
 	} else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) {
 		pr_warn("Filesystem is marked locked, mounting read-only.\n");
-		sb->s_flags |= MS_RDONLY;
+		sb->s_flags |= SB_RDONLY;
 	} else if ((vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_JOURNALED)) &&
 			!sb_rdonly(sb)) {
 		pr_warn("write access to a journaled filesystem is not supported, use the force option at your own risk, mounting read-only.\n");
-		sb->s_flags |= MS_RDONLY;
+		sb->s_flags |= SB_RDONLY;
 	}
 
 	err = -EINVAL;
diff --git a/fs/hpfs/map.c b/fs/hpfs/map.c
index e0e60b148400..7c49f1ef0c85 100644
--- a/fs/hpfs/map.c
+++ b/fs/hpfs/map.c
@@ -288,7 +288,7 @@ struct dnode *hpfs_map_dnode(struct super_block *s, unsigned secno,
 					goto bail;
 				}
 				if (((31 + de->namelen + de->down*4 + 3) & ~3) != le16_to_cpu(de->length)) {
-					if (((31 + de->namelen + de->down*4 + 3) & ~3) < le16_to_cpu(de->length) && s->s_flags & MS_RDONLY) goto ok;
+					if (((31 + de->namelen + de->down*4 + 3) & ~3) < le16_to_cpu(de->length) && s->s_flags & SB_RDONLY) goto ok;
 					hpfs_error(s, "namelen does not match dirent size in dnode %08x, dirent %03x, last %03x", secno, p, pp);
 					goto bail;
 				}
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index 1516fb4e28f4..c45a3b9b9ac7 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -78,7 +78,7 @@ void hpfs_error(struct super_block *s, const char *fmt, ...)
 			else {
 				pr_cont("; remounting read-only\n");
 				mark_dirty(s, 0);
-				s->s_flags |= MS_RDONLY;
+				s->s_flags |= SB_RDONLY;
 			}
 		} else if (sb_rdonly(s))
 				pr_cont("; going on - but anything won't be destroyed because it's read-only\n");
@@ -457,7 +457,7 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data)
 
 	sync_filesystem(s);
 
-	*flags |= MS_NOATIME;
+	*flags |= SB_NOATIME;
 
 	hpfs_lock(s);
 	uid = sbi->sb_uid; gid = sbi->sb_gid;
@@ -488,7 +488,7 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data)
 	sbi->sb_eas = eas; sbi->sb_chk = chk; sbi->sb_chkdsk = chkdsk;
 	sbi->sb_err = errs; sbi->sb_timeshift = timeshift;
 
-	if (!(*flags & MS_RDONLY)) mark_dirty(s, 1);
+	if (!(*flags & SB_RDONLY)) mark_dirty(s, 1);
 
 	hpfs_unlock(s);
 	return 0;
@@ -614,7 +614,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
 		goto bail4;
 	}
 
-	s->s_flags |= MS_NOATIME;
+	s->s_flags |= SB_NOATIME;
 
 	/* Fill superblock stuff */
 	s->s_magic = HPFS_SUPER_MAGIC;
diff --git a/fs/inode.c b/fs/inode.c
index fd401028a309..03102d6ef044 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -416,7 +416,7 @@ void inode_add_lru(struct inode *inode)
 {
 	if (!(inode->i_state & (I_DIRTY_ALL | I_SYNC |
 				I_FREEING | I_WILL_FREE)) &&
-	    !atomic_read(&inode->i_count) && inode->i_sb->s_flags & MS_ACTIVE)
+	    !atomic_read(&inode->i_count) && inode->i_sb->s_flags & SB_ACTIVE)
 		inode_lru_list_add(inode);
 }
 
@@ -595,7 +595,7 @@ static void dispose_list(struct list_head *head)
  * @sb:		superblock to operate on
  *
  * Make sure that no inodes with zero refcount are retained.  This is
- * called by superblock shutdown after having MS_ACTIVE flag removed,
+ * called by superblock shutdown after having SB_ACTIVE flag removed,
  * so any inode reaching zero refcount during or after that call will
  * be immediately evicted.
  */
@@ -1492,7 +1492,7 @@ static void iput_final(struct inode *inode)
 	else
 		drop = generic_drop_inode(inode);
 
-	if (!drop && (sb->s_flags & MS_ACTIVE)) {
+	if (!drop && (sb->s_flags & SB_ACTIVE)) {
 		inode_add_lru(inode);
 		spin_unlock(&inode->i_lock);
 		return;
@@ -1644,7 +1644,7 @@ int generic_update_time(struct inode *inode, struct timespec *time, int flags)
 	if (flags & S_MTIME)
 		inode->i_mtime = *time;
 
-	if (!(inode->i_sb->s_flags & MS_LAZYTIME) || (flags & S_VERSION))
+	if (!(inode->i_sb->s_flags & SB_LAZYTIME) || (flags & S_VERSION))
 		iflags |= I_DIRTY_SYNC;
 	__mark_inode_dirty(inode, iflags);
 	return 0;
@@ -1691,7 +1691,7 @@ bool __atime_needs_update(const struct path *path, struct inode *inode,
 
 	if (IS_NOATIME(inode))
 		return false;
-	if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
+	if ((inode->i_sb->s_flags & SB_NODIRATIME) && S_ISDIR(inode->i_mode))
 		return false;
 
 	if (mnt->mnt_flags & MNT_NOATIME)
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 447a24d77b89..bc258a4402f6 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -114,7 +114,7 @@ static void destroy_inodecache(void)
 static int isofs_remount(struct super_block *sb, int *flags, char *data)
 {
 	sync_filesystem(sb);
-	if (!(*flags & MS_RDONLY))
+	if (!(*flags & SB_RDONLY))
 		return -EROFS;
 	return 0;
 }
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index e96c6b05e43e..d8c274d39ddb 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -409,10 +409,10 @@ int jffs2_do_remount_fs(struct super_block *sb, int *flags, char *data)
 		mutex_unlock(&c->alloc_sem);
 	}
 
-	if (!(*flags & MS_RDONLY))
+	if (!(*flags & SB_RDONLY))
 		jffs2_start_garbage_collect_thread(c);
 
-	*flags |= MS_NOATIME;
+	*flags |= SB_NOATIME;
 	return 0;
 }
 
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index 824e61ede465..c2fbec19c616 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -59,7 +59,7 @@ static inline void jffs2_init_inode_info(struct jffs2_inode_info *f)
 }
 
 
-#define jffs2_is_readonly(c) (OFNI_BS_2SFFJ(c)->s_flags & MS_RDONLY)
+#define jffs2_is_readonly(c) (OFNI_BS_2SFFJ(c)->s_flags & SB_RDONLY)
 
 #define SECTOR_ADDR(x) ( (((unsigned long)(x) / c->sector_size) * c->sector_size) )
 #ifndef CONFIG_JFFS2_FS_WRITEBUFFER
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 153f1c6eb169..f60dee7faf03 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -301,10 +301,10 @@ static int jffs2_fill_super(struct super_block *sb, void *data, int silent)
 
 	sb->s_op = &jffs2_super_operations;
 	sb->s_export_op = &jffs2_export_ops;
-	sb->s_flags = sb->s_flags | MS_NOATIME;
+	sb->s_flags = sb->s_flags | SB_NOATIME;
 	sb->s_xattr = jffs2_xattr_handlers;
 #ifdef CONFIG_JFFS2_FS_POSIX_ACL
-	sb->s_flags |= MS_POSIXACL;
+	sb->s_flags |= SB_POSIXACL;
 #endif
 	ret = jffs2_do_fill_super(sb, data, silent);
 	return ret;
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 2f7b3af5b8b7..90373aebfdca 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -87,7 +87,7 @@ static void jfs_handle_error(struct super_block *sb)
 	else if (sbi->flag & JFS_ERR_REMOUNT_RO) {
 		jfs_err("ERROR: (device %s): remounting filesystem as read-only",
 			sb->s_id);
-		sb->s_flags |= MS_RDONLY;
+		sb->s_flags |= SB_RDONLY;
 	}
 
 	/* nothing is done for continue beyond marking the superblock dirty */
@@ -477,7 +477,7 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data)
 			return rc;
 	}
 
-	if (sb_rdonly(sb) && !(*flags & MS_RDONLY)) {
+	if (sb_rdonly(sb) && !(*flags & SB_RDONLY)) {
 		/*
 		 * Invalidate any previously read metadata.  fsck may have
 		 * changed the on-disk data since we mounted r/o
@@ -488,12 +488,12 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data)
 		ret = jfs_mount_rw(sb, 1);
 
 		/* mark the fs r/w for quota activity */
-		sb->s_flags &= ~MS_RDONLY;
+		sb->s_flags &= ~SB_RDONLY;
 
 		dquot_resume(sb, -1);
 		return ret;
 	}
-	if (!sb_rdonly(sb) && (*flags & MS_RDONLY)) {
+	if (!sb_rdonly(sb) && (*flags & SB_RDONLY)) {
 		rc = dquot_suspend(sb, -1);
 		if (rc < 0)
 			return rc;
@@ -545,7 +545,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
 	sbi->flag = flag;
 
 #ifdef CONFIG_JFS_POSIX_ACL
-	sb->s_flags |= MS_POSIXACL;
+	sb->s_flags |= SB_POSIXACL;
 #endif
 
 	if (newLVSize) {
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index 95a7c88baed9..26dd9a50f383 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -335,7 +335,7 @@ struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags,
 			deactivate_locked_super(sb);
 			return ERR_PTR(error);
 		}
-		sb->s_flags |= MS_ACTIVE;
+		sb->s_flags |= SB_ACTIVE;
 
 		mutex_lock(&kernfs_mutex);
 		list_add(&info->node, &root->supers);
diff --git a/fs/libfs.c b/fs/libfs.c
index 3aabe553fc45..7ff3cb904acd 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -246,7 +246,7 @@ struct dentry *mount_pseudo_xattr(struct file_system_type *fs_type, char *name,
 	struct inode *root;
 	struct qstr d_name = QSTR_INIT(name, strlen(name));
 
-	s = sget_userns(fs_type, NULL, set_anon_super, MS_KERNMOUNT|MS_NOUSER,
+	s = sget_userns(fs_type, NULL, set_anon_super, SB_KERNMOUNT|SB_NOUSER,
 			&init_user_ns, NULL);
 	if (IS_ERR(s))
 		return ERR_CAST(s);
@@ -277,7 +277,7 @@ struct dentry *mount_pseudo_xattr(struct file_system_type *fs_type, char *name,
 	d_instantiate(dentry, root);
 	s->s_root = dentry;
 	s->s_d_op = dops;
-	s->s_flags |= MS_ACTIVE;
+	s->s_flags |= SB_ACTIVE;
 	return dget(s->s_root);
 
 Enomem:
@@ -578,7 +578,7 @@ int simple_pin_fs(struct file_system_type *type, struct vfsmount **mount, int *c
 	spin_lock(&pin_fs_lock);
 	if (unlikely(!*mount)) {
 		spin_unlock(&pin_fs_lock);
-		mnt = vfs_kern_mount(type, MS_KERNMOUNT, type->name, NULL);
+		mnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, NULL);
 		if (IS_ERR(mnt))
 			return PTR_ERR(mnt);
 		spin_lock(&pin_fs_lock);
diff --git a/fs/locks.c b/fs/locks.c
index 1bd71c4d663a..21b4dfa289ee 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -141,7 +141,7 @@
 
 static inline bool is_remote_lock(struct file *filp)
 {
-	return likely(!(filp->f_path.dentry->d_sb->s_flags & MS_NOREMOTELOCK));
+	return likely(!(filp->f_path.dentry->d_sb->s_flags & SB_NOREMOTELOCK));
 }
 
 static bool lease_breaking(struct file_lock *fl)
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index b6829d679643..72e308c3e66b 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -125,9 +125,9 @@ static int minix_remount (struct super_block * sb, int * flags, char * data)
 
 	sync_filesystem(sb);
 	ms = sbi->s_ms;
-	if ((bool)(*flags & MS_RDONLY) == sb_rdonly(sb))
+	if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb))
 		return 0;
-	if (*flags & MS_RDONLY) {
+	if (*flags & SB_RDONLY) {
 		if (ms->s_state & MINIX_VALID_FS ||
 		    !(sbi->s_mount_state & MINIX_VALID_FS))
 			return 0;
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 129f1937fa2c..41de88cdc053 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -103,7 +103,7 @@ static void destroy_inodecache(void)
 static int ncp_remount(struct super_block *sb, int *flags, char* data)
 {
 	sync_filesystem(sb);
-	*flags |= MS_NODIRATIME;
+	*flags |= SB_NODIRATIME;
 	return 0;
 }
 
@@ -547,7 +547,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
 	else
 		default_bufsize = 1024;
 
-	sb->s_flags |= MS_NODIRATIME;	/* probably even noatime */
+	sb->s_flags |= SB_NODIRATIME;	/* probably even noatime */
 	sb->s_maxbytes = 0xFFFFFFFFU;
 	sb->s_blocksize = 1024;	/* Eh...  Is this correct? */
 	sb->s_blocksize_bits = 10;
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index e51ae52ed14f..2f3f86726f5b 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1256,7 +1256,7 @@ static int nfs_dentry_delete(const struct dentry *dentry)
 		/* Unhash it, so that ->d_iput() would be called */
 		return 1;
 	}
-	if (!(dentry->d_sb->s_flags & MS_ACTIVE)) {
+	if (!(dentry->d_sb->s_flags & SB_ACTIVE)) {
 		/* Unhash it, so that ancestors of killed async unlink
 		 * files will be cleaned up during umount */
 		return 1;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 38b93d54c02e..b992d2382ffa 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -752,7 +752,7 @@ int nfs_getattr(const struct path *path, struct kstat *stat,
 	 * Note that we only have to check the vfsmount flags here:
 	 *  - NFS always sets S_NOATIME by so checking it would give a
 	 *    bogus result
-	 *  - NFS never sets MS_NOATIME or MS_NODIRATIME so there is
+	 *  - NFS never sets SB_NOATIME or SB_NODIRATIME so there is
 	 *    no point in checking those.
 	 */
 	if ((path->mnt->mnt_flags & MNT_NOATIME) ||
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 5ab17fd4700a..8357ff69962f 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -10,7 +10,7 @@
 #include <linux/nfs_page.h>
 #include <linux/wait_bit.h>
 
-#define NFS_MS_MASK (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_SYNCHRONOUS)
+#define NFS_MS_MASK (SB_RDONLY|SB_NOSUID|SB_NODEV|SB_NOEXEC|SB_SYNCHRONOUS)
 
 extern const struct export_operations nfs_export_ops;
 
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 43cadb28db6e..29bacdc56f6a 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -813,9 +813,9 @@ int nfs_show_stats(struct seq_file *m, struct dentry *root)
 	 */
 	seq_printf(m, "\n\topts:\t");
 	seq_puts(m, sb_rdonly(root->d_sb) ? "ro" : "rw");
-	seq_puts(m, root->d_sb->s_flags & MS_SYNCHRONOUS ? ",sync" : "");
-	seq_puts(m, root->d_sb->s_flags & MS_NOATIME ? ",noatime" : "");
-	seq_puts(m, root->d_sb->s_flags & MS_NODIRATIME ? ",nodiratime" : "");
+	seq_puts(m, root->d_sb->s_flags & SB_SYNCHRONOUS ? ",sync" : "");
+	seq_puts(m, root->d_sb->s_flags & SB_NOATIME ? ",noatime" : "");
+	seq_puts(m, root->d_sb->s_flags & SB_NODIRATIME ? ",nodiratime" : "");
 	nfs_show_mount_options(m, nfss, 1);
 
 	seq_printf(m, "\n\tage:\t%lu", (jiffies - nfss->mount_time) / HZ);
@@ -2296,11 +2296,11 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data)
 	/*
 	 * noac is a special case. It implies -o sync, but that's not
 	 * necessarily reflected in the mtab options. do_remount_sb
-	 * will clear MS_SYNCHRONOUS if -o sync wasn't specified in the
+	 * will clear SB_SYNCHRONOUS if -o sync wasn't specified in the
 	 * remount options, so we have to explicitly reset it.
 	 */
 	if (data->flags & NFS_MOUNT_NOAC)
-		*flags |= MS_SYNCHRONOUS;
+		*flags |= SB_SYNCHRONOUS;
 
 	/* compare new mount options with old ones */
 	error = nfs_compare_remount_data(nfss, data);
@@ -2349,7 +2349,7 @@ void nfs_fill_super(struct super_block *sb, struct nfs_mount_info *mount_info)
 		/* The VFS shouldn't apply the umask to mode bits. We will do
 		 * so ourselves when necessary.
 		 */
-		sb->s_flags |= MS_POSIXACL;
+		sb->s_flags |= SB_POSIXACL;
 		sb->s_time_gran = 1;
 		sb->s_export_op = &nfs_export_ops;
 	}
@@ -2379,7 +2379,7 @@ static void nfs_clone_super(struct super_block *sb,
 		/* The VFS shouldn't apply the umask to mode bits. We will do
 		 * so ourselves when necessary.
 		 */
-		sb->s_flags |= MS_POSIXACL;
+		sb->s_flags |= SB_POSIXACL;
 	}
 
  	nfs_initialise_sb(sb);
@@ -2600,11 +2600,11 @@ struct dentry *nfs_fs_mount_common(struct nfs_server *server,
 
 	/* -o noac implies -o sync */
 	if (server->flags & NFS_MOUNT_NOAC)
-		sb_mntdata.mntflags |= MS_SYNCHRONOUS;
+		sb_mntdata.mntflags |= SB_SYNCHRONOUS;
 
 	if (mount_info->cloned != NULL && mount_info->cloned->sb != NULL)
-		if (mount_info->cloned->sb->s_flags & MS_SYNCHRONOUS)
-			sb_mntdata.mntflags |= MS_SYNCHRONOUS;
+		if (mount_info->cloned->sb->s_flags & SB_SYNCHRONOUS)
+			sb_mntdata.mntflags |= SB_SYNCHRONOUS;
 
 	/* Get a superblock - note that we may end up sharing one that already exists */
 	s = sget(nfs_mod->nfs_fs, compare_super, nfs_set_super, flags, &sb_mntdata);
@@ -2641,7 +2641,7 @@ struct dentry *nfs_fs_mount_common(struct nfs_server *server,
 	if (error)
 		goto error_splat_root;
 
-	s->s_flags |= MS_ACTIVE;
+	s->s_flags |= SB_ACTIVE;
 
 out:
 	return mntroot;
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index f572538dcc4f..9f3ffba41533 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -1979,7 +1979,7 @@ static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci,
 					     struct the_nilfs *nilfs)
 {
 	struct nilfs_inode_info *ii, *n;
-	int during_mount = !(sci->sc_super->s_flags & MS_ACTIVE);
+	int during_mount = !(sci->sc_super->s_flags & SB_ACTIVE);
 	int defer_iput = false;
 
 	spin_lock(&nilfs->ns_inode_lock);
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 3ce20cd44a20..3073b646e1ba 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -141,7 +141,7 @@ void __nilfs_error(struct super_block *sb, const char *function,
 
 		if (nilfs_test_opt(nilfs, ERRORS_RO)) {
 			printk(KERN_CRIT "Remounting filesystem read-only\n");
-			sb->s_flags |= MS_RDONLY;
+			sb->s_flags |= SB_RDONLY;
 		}
 	}
 
@@ -869,7 +869,7 @@ int nilfs_store_magic_and_option(struct super_block *sb,
 
 	/* FS independent flags */
 #ifdef NILFS_ATIME_DISABLE
-	sb->s_flags |= MS_NOATIME;
+	sb->s_flags |= SB_NOATIME;
 #endif
 
 	nilfs_set_default_options(sb, sbp);
@@ -1133,7 +1133,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
 		err = -EINVAL;
 		goto restore_opts;
 	}
-	sb->s_flags = (sb->s_flags & ~MS_POSIXACL);
+	sb->s_flags = (sb->s_flags & ~SB_POSIXACL);
 
 	err = -EINVAL;
 
@@ -1143,12 +1143,12 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
 		goto restore_opts;
 	}
 
-	if ((bool)(*flags & MS_RDONLY) == sb_rdonly(sb))
+	if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb))
 		goto out;
-	if (*flags & MS_RDONLY) {
+	if (*flags & SB_RDONLY) {
 		/* Shutting down log writer */
 		nilfs_detach_log_writer(sb);
-		sb->s_flags |= MS_RDONLY;
+		sb->s_flags |= SB_RDONLY;
 
 		/*
 		 * Remounting a valid RW partition RDONLY, so set
@@ -1178,7 +1178,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
 			goto restore_opts;
 		}
 
-		sb->s_flags &= ~MS_RDONLY;
+		sb->s_flags &= ~SB_RDONLY;
 
 		root = NILFS_I(d_inode(sb->s_root))->i_root;
 		err = nilfs_attach_log_writer(sb, root);
@@ -1212,7 +1212,7 @@ static int nilfs_parse_snapshot_option(const char *option,
 	const char *msg = NULL;
 	int err;
 
-	if (!(sd->flags & MS_RDONLY)) {
+	if (!(sd->flags & SB_RDONLY)) {
 		msg = "read-only option is not specified";
 		goto parse_error;
 	}
@@ -1286,7 +1286,7 @@ nilfs_mount(struct file_system_type *fs_type, int flags,
 	struct dentry *root_dentry;
 	int err, s_new = false;
 
-	if (!(flags & MS_RDONLY))
+	if (!(flags & SB_RDONLY))
 		mode |= FMODE_WRITE;
 
 	sd.bdev = blkdev_get_by_path(dev_name, mode, fs_type);
@@ -1327,14 +1327,14 @@ nilfs_mount(struct file_system_type *fs_type, int flags,
 		snprintf(s->s_id, sizeof(s->s_id), "%pg", sd.bdev);
 		sb_set_blocksize(s, block_size(sd.bdev));
 
-		err = nilfs_fill_super(s, data, flags & MS_SILENT ? 1 : 0);
+		err = nilfs_fill_super(s, data, flags & SB_SILENT ? 1 : 0);
 		if (err)
 			goto failed_super;
 
-		s->s_flags |= MS_ACTIVE;
+		s->s_flags |= SB_ACTIVE;
 	} else if (!sd.cno) {
 		if (nilfs_tree_is_busy(s->s_root)) {
-			if ((flags ^ s->s_flags) & MS_RDONLY) {
+			if ((flags ^ s->s_flags) & SB_RDONLY) {
 				nilfs_msg(s, KERN_ERR,
 					  "the device already has a %s mount.",
 					  sb_rdonly(s) ? "read-only" : "read/write");
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index afebb5067cec..1a85317e83f0 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -220,7 +220,7 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb)
 
 	if (!valid_fs) {
 		nilfs_msg(sb, KERN_WARNING, "mounting unchecked fs");
-		if (s_flags & MS_RDONLY) {
+		if (s_flags & SB_RDONLY) {
 			nilfs_msg(sb, KERN_INFO,
 				  "recovery required for readonly filesystem");
 			nilfs_msg(sb, KERN_INFO,
@@ -286,7 +286,7 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb)
 	if (valid_fs)
 		goto skip_recovery;
 
-	if (s_flags & MS_RDONLY) {
+	if (s_flags & SB_RDONLY) {
 		__u64 features;
 
 		if (nilfs_test_opt(nilfs, NORECOVERY)) {
@@ -309,7 +309,7 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb)
 			err = -EROFS;
 			goto failed_unload;
 		}
-		sb->s_flags &= ~MS_RDONLY;
+		sb->s_flags &= ~SB_RDONLY;
 	} else if (nilfs_test_opt(nilfs, NORECOVERY)) {
 		nilfs_msg(sb, KERN_ERR,
 			  "recovery cancelled because norecovery option was specified for a read/write mount");
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 81d8959b6aef..219b269c737e 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -67,7 +67,7 @@ void fsnotify_unmount_inodes(struct super_block *sb)
 
 		/*
 		 * If i_count is zero, the inode cannot have any watches and
-		 * doing an __iget/iput with MS_ACTIVE clear would actually
+		 * doing an __iget/iput with SB_ACTIVE clear would actually
 		 * evict all inodes with zero i_count from icache which is
 		 * unnecessarily violent and may in fact be illegal to do.
 		 */
diff --git a/fs/nsfs.c b/fs/nsfs.c
index ef243e14b6eb..7c6f76d29f56 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -255,5 +255,5 @@ void __init nsfs_init(void)
 	nsfs_mnt = kern_mount(&nsfs);
 	if (IS_ERR(nsfs_mnt))
 		panic("can't set nsfs up\n");
-	nsfs_mnt->mnt_sb->s_flags &= ~MS_NOUSER;
+	nsfs_mnt->mnt_sb->s_flags &= ~SB_NOUSER;
 }
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 3f70f041dbe9..bb7159f697f2 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -473,7 +473,7 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
 
 #ifndef NTFS_RW
 	/* For read-only compiled driver, enforce read-only flag. */
-	*flags |= MS_RDONLY;
+	*flags |= SB_RDONLY;
 #else /* NTFS_RW */
 	/*
 	 * For the read-write compiled driver, if we are remounting read-write,
@@ -487,7 +487,7 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
 	 * When remounting read-only, mark the volume clean if no volume errors
 	 * have occurred.
 	 */
-	if (sb_rdonly(sb) && !(*flags & MS_RDONLY)) {
+	if (sb_rdonly(sb) && !(*flags & SB_RDONLY)) {
 		static const char *es = ".  Cannot remount read-write.";
 
 		/* Remounting read-write. */
@@ -548,7 +548,7 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
 			NVolSetErrors(vol);
 			return -EROFS;
 		}
-	} else if (!sb_rdonly(sb) && (*flags & MS_RDONLY)) {
+	} else if (!sb_rdonly(sb) && (*flags & SB_RDONLY)) {
 		/* Remounting read-only. */
 		if (!NVolErrors(vol)) {
 			if (ntfs_clear_volume_flags(vol, VOLUME_IS_DIRTY))
@@ -1799,7 +1799,7 @@ static bool load_system_files(ntfs_volume *vol)
 						es3);
 				goto iput_mirr_err_out;
 			}
-			sb->s_flags |= MS_RDONLY;
+			sb->s_flags |= SB_RDONLY;
 			ntfs_error(sb, "%s.  Mounting read-only%s",
 					!vol->mftmirr_ino ? es1 : es2, es3);
 		} else
@@ -1937,7 +1937,7 @@ get_ctx_vol_failed:
 						es1, es2);
 				goto iput_vol_err_out;
 			}
-			sb->s_flags |= MS_RDONLY;
+			sb->s_flags |= SB_RDONLY;
 			ntfs_error(sb, "%s.  Mounting read-only%s", es1, es2);
 		} else
 			ntfs_warning(sb, "%s.  Will not be able to remount "
@@ -1974,7 +1974,7 @@ get_ctx_vol_failed:
 				}
 				goto iput_logfile_err_out;
 			}
-			sb->s_flags |= MS_RDONLY;
+			sb->s_flags |= SB_RDONLY;
 			ntfs_error(sb, "%s.  Mounting read-only%s", es1, es2);
 		} else
 			ntfs_warning(sb, "%s.  Will not be able to remount "
@@ -2019,7 +2019,7 @@ get_ctx_vol_failed:
 						es1, es2);
 				goto iput_root_err_out;
 			}
-			sb->s_flags |= MS_RDONLY;
+			sb->s_flags |= SB_RDONLY;
 			ntfs_error(sb, "%s.  Mounting read-only%s", es1, es2);
 		} else
 			ntfs_warning(sb, "%s.  Will not be able to remount "
@@ -2042,7 +2042,7 @@ get_ctx_vol_failed:
 			goto iput_root_err_out;
 		}
 		ntfs_error(sb, "%s.  Mounting read-only%s", es1, es2);
-		sb->s_flags |= MS_RDONLY;
+		sb->s_flags |= SB_RDONLY;
 		/*
 		 * Do not set NVolErrors() because ntfs_remount() might manage
 		 * to set the dirty flag in which case all would be well.
@@ -2055,7 +2055,7 @@ get_ctx_vol_failed:
 	 * If (still) a read-write mount, set the NT4 compatibility flag on
 	 * newer NTFS version volumes.
 	 */
-	if (!(sb->s_flags & MS_RDONLY) && (vol->major_ver > 1) &&
+	if (!(sb->s_flags & SB_RDONLY) && (vol->major_ver > 1) &&
 			ntfs_set_volume_flags(vol, VOLUME_MOUNTED_ON_NT4)) {
 		static const char *es1 = "Failed to set NT4 compatibility flag";
 		static const char *es2 = ".  Run chkdsk.";
@@ -2069,7 +2069,7 @@ get_ctx_vol_failed:
 			goto iput_root_err_out;
 		}
 		ntfs_error(sb, "%s.  Mounting read-only%s", es1, es2);
-		sb->s_flags |= MS_RDONLY;
+		sb->s_flags |= SB_RDONLY;
 		NVolSetErrors(vol);
 	}
 #endif
@@ -2087,7 +2087,7 @@ get_ctx_vol_failed:
 			goto iput_root_err_out;
 		}
 		ntfs_error(sb, "%s.  Mounting read-only%s", es1, es2);
-		sb->s_flags |= MS_RDONLY;
+		sb->s_flags |= SB_RDONLY;
 		NVolSetErrors(vol);
 	}
 #endif /* NTFS_RW */
@@ -2128,7 +2128,7 @@ get_ctx_vol_failed:
 						es1, es2);
 				goto iput_quota_err_out;
 			}
-			sb->s_flags |= MS_RDONLY;
+			sb->s_flags |= SB_RDONLY;
 			ntfs_error(sb, "%s.  Mounting read-only%s", es1, es2);
 		} else
 			ntfs_warning(sb, "%s.  Will not be able to remount "
@@ -2150,7 +2150,7 @@ get_ctx_vol_failed:
 			goto iput_quota_err_out;
 		}
 		ntfs_error(sb, "%s.  Mounting read-only%s", es1, es2);
-		sb->s_flags |= MS_RDONLY;
+		sb->s_flags |= SB_RDONLY;
 		NVolSetErrors(vol);
 	}
 	/*
@@ -2171,7 +2171,7 @@ get_ctx_vol_failed:
 						es1, es2);
 				goto iput_usnjrnl_err_out;
 			}
-			sb->s_flags |= MS_RDONLY;
+			sb->s_flags |= SB_RDONLY;
 			ntfs_error(sb, "%s.  Mounting read-only%s", es1, es2);
 		} else
 			ntfs_warning(sb, "%s.  Will not be able to remount "
@@ -2194,7 +2194,7 @@ get_ctx_vol_failed:
 			goto iput_usnjrnl_err_out;
 		}
 		ntfs_error(sb, "%s.  Mounting read-only%s", es1, es2);
-		sb->s_flags |= MS_RDONLY;
+		sb->s_flags |= SB_RDONLY;
 		NVolSetErrors(vol);
 	}
 #endif /* NTFS_RW */
@@ -2728,7 +2728,7 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
 	lockdep_off();
 	ntfs_debug("Entering.");
 #ifndef NTFS_RW
-	sb->s_flags |= MS_RDONLY;
+	sb->s_flags |= SB_RDONLY;
 #endif /* ! NTFS_RW */
 	/* Allocate a new ntfs_volume and place it in sb->s_fs_info. */
 	sb->s_fs_info = kmalloc(sizeof(ntfs_volume), GFP_NOFS);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index dc455d45a66a..a1d051055472 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -227,7 +227,7 @@ int ocfs2_should_update_atime(struct inode *inode,
 		return 0;
 
 	if ((inode->i_flags & S_NOATIME) ||
-	    ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode)))
+	    ((inode->i_sb->s_flags & SB_NODIRATIME) && S_ISDIR(inode->i_mode)))
 		return 0;
 
 	/*
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 040bbb6a6e4b..80efa5699fb0 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -675,9 +675,9 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
 	}
 
 	/* We're going to/from readonly mode. */
-	if ((bool)(*flags & MS_RDONLY) != sb_rdonly(sb)) {
+	if ((bool)(*flags & SB_RDONLY) != sb_rdonly(sb)) {
 		/* Disable quota accounting before remounting RO */
-		if (*flags & MS_RDONLY) {
+		if (*flags & SB_RDONLY) {
 			ret = ocfs2_susp_quotas(osb, 0);
 			if (ret < 0)
 				goto out;
@@ -691,8 +691,8 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
 			goto unlock_osb;
 		}
 
-		if (*flags & MS_RDONLY) {
-			sb->s_flags |= MS_RDONLY;
+		if (*flags & SB_RDONLY) {
+			sb->s_flags |= SB_RDONLY;
 			osb->osb_flags |= OCFS2_OSB_SOFT_RO;
 		} else {
 			if (osb->osb_flags & OCFS2_OSB_ERROR_FS) {
@@ -709,14 +709,14 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
 				ret = -EINVAL;
 				goto unlock_osb;
 			}
-			sb->s_flags &= ~MS_RDONLY;
+			sb->s_flags &= ~SB_RDONLY;
 			osb->osb_flags &= ~OCFS2_OSB_SOFT_RO;
 		}
 		trace_ocfs2_remount(sb->s_flags, osb->osb_flags, *flags);
 unlock_osb:
 		spin_unlock(&osb->osb_lock);
 		/* Enable quota accounting after remounting RW */
-		if (!ret && !(*flags & MS_RDONLY)) {
+		if (!ret && !(*flags & SB_RDONLY)) {
 			if (sb_any_quota_suspended(sb))
 				ret = ocfs2_susp_quotas(osb, 1);
 			else
@@ -724,7 +724,7 @@ unlock_osb:
 			if (ret < 0) {
 				/* Return back changes... */
 				spin_lock(&osb->osb_lock);
-				sb->s_flags |= MS_RDONLY;
+				sb->s_flags |= SB_RDONLY;
 				osb->osb_flags |= OCFS2_OSB_SOFT_RO;
 				spin_unlock(&osb->osb_lock);
 				goto out;
@@ -744,9 +744,9 @@ unlock_osb:
 		if (!ocfs2_is_hard_readonly(osb))
 			ocfs2_set_journal_params(osb);
 
-		sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
+		sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
 			((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ?
-							MS_POSIXACL : 0);
+							SB_POSIXACL : 0);
 	}
 out:
 	return ret;
@@ -1057,10 +1057,10 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
 
 	sb->s_magic = OCFS2_SUPER_MAGIC;
 
-	sb->s_flags = (sb->s_flags & ~(MS_POSIXACL | MS_NOSEC)) |
-		((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
+	sb->s_flags = (sb->s_flags & ~(SB_POSIXACL | SB_NOSEC)) |
+		((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? SB_POSIXACL : 0);
 
-	/* Hard readonly mode only if: bdev_read_only, MS_RDONLY,
+	/* Hard readonly mode only if: bdev_read_only, SB_RDONLY,
 	 * heartbeat=none */
 	if (bdev_read_only(sb->s_bdev)) {
 		if (!sb_rdonly(sb)) {
@@ -2057,7 +2057,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
 	sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;
 	sb->s_xattr = ocfs2_xattr_handlers;
 	sb->s_time_gran = 1;
-	sb->s_flags |= MS_NOATIME;
+	sb->s_flags |= SB_NOATIME;
 	/* this is needed to support O_LARGEFILE */
 	cbits = le32_to_cpu(di->id2.i_super.s_clustersize_bits);
 	bbits = le32_to_cpu(di->id2.i_super.s_blocksize_bits);
@@ -2568,7 +2568,7 @@ static int ocfs2_handle_error(struct super_block *sb)
 			return rv;
 
 		pr_crit("OCFS2: File system is now read-only.\n");
-		sb->s_flags |= MS_RDONLY;
+		sb->s_flags |= SB_RDONLY;
 		ocfs2_set_ro_flag(osb, 0);
 	}
 
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 5fdf269ba82e..c5898c59d411 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -901,7 +901,7 @@ static int ocfs2_xattr_list_entry(struct super_block *sb,
 
 	case OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS:
 	case OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT:
-		if (!(sb->s_flags & MS_POSIXACL))
+		if (!(sb->s_flags & SB_POSIXACL))
 			return 0;
 		break;
 
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index 13215f26e321..2200662a9bf1 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -369,7 +369,7 @@ static struct inode *openprom_iget(struct super_block *sb, ino_t ino)
 static int openprom_remount(struct super_block *sb, int *flags, char *data)
 {
 	sync_filesystem(sb);
-	*flags |= MS_NOATIME;
+	*flags |= SB_NOATIME;
 	return 0;
 }
 
@@ -386,7 +386,7 @@ static int openprom_fill_super(struct super_block *s, void *data, int silent)
 	struct op_inode_info *oi;
 	int ret;
 
-	s->s_flags |= MS_NOATIME;
+	s->s_flags |= SB_NOATIME;
 	s->s_blocksize = 1024;
 	s->s_blocksize_bits = 10;
 	s->s_magic = OPENPROM_SUPER_MAGIC;
diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c
index 366750eef201..36f1390b5ed7 100644
--- a/fs/orangefs/super.c
+++ b/fs/orangefs/super.c
@@ -40,7 +40,7 @@ static int orangefs_show_options(struct seq_file *m, struct dentry *root)
 {
 	struct orangefs_sb_info_s *orangefs_sb = ORANGEFS_SB(root->d_sb);
 
-	if (root->d_sb->s_flags & MS_POSIXACL)
+	if (root->d_sb->s_flags & SB_POSIXACL)
 		seq_puts(m, ",acl");
 	if (orangefs_sb->flags & ORANGEFS_OPT_INTR)
 		seq_puts(m, ",intr");
@@ -60,7 +60,7 @@ static int parse_mount_options(struct super_block *sb, char *options,
 	 * Force any potential flags that might be set from the mount
 	 * to zero, ie, initialize to unset.
 	 */
-	sb->s_flags &= ~MS_POSIXACL;
+	sb->s_flags &= ~SB_POSIXACL;
 	orangefs_sb->flags &= ~ORANGEFS_OPT_INTR;
 	orangefs_sb->flags &= ~ORANGEFS_OPT_LOCAL_LOCK;
 
@@ -73,7 +73,7 @@ static int parse_mount_options(struct super_block *sb, char *options,
 		token = match_token(p, tokens, args);
 		switch (token) {
 		case Opt_acl:
-			sb->s_flags |= MS_POSIXACL;
+			sb->s_flags |= SB_POSIXACL;
 			break;
 		case Opt_intr:
 			orangefs_sb->flags |= ORANGEFS_OPT_INTR;
@@ -507,7 +507,7 @@ struct dentry *orangefs_mount(struct file_system_type *fst,
 
 	ret = orangefs_fill_sb(sb,
 	      &new_op->downcall.resp.fs_mount, data,
-	      flags & MS_SILENT ? 1 : 0);
+	      flags & SB_SILENT ? 1 : 0);
 
 	if (ret) {
 		d = ERR_PTR(ret);
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index be03578181d2..288d20f9a55a 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -326,7 +326,7 @@ static int ovl_remount(struct super_block *sb, int *flags, char *data)
 {
 	struct ovl_fs *ofs = sb->s_fs_info;
 
-	if (!(*flags & MS_RDONLY) && ovl_force_readonly(ofs))
+	if (!(*flags & SB_RDONLY) && ovl_force_readonly(ofs))
 		return -EROFS;
 
 	return 0;
@@ -1190,7 +1190,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
 			goto out_err;
 
 		if (!ofs->workdir)
-			sb->s_flags |= MS_RDONLY;
+			sb->s_flags |= SB_RDONLY;
 
 		sb->s_stack_depth = ofs->upper_mnt->mnt_sb->s_stack_depth;
 		sb->s_time_gran = ofs->upper_mnt->mnt_sb->s_time_gran;
@@ -1203,7 +1203,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
 
 	/* If the upper fs is nonexistent, we mark overlayfs r/o too */
 	if (!ofs->upper_mnt)
-		sb->s_flags |= MS_RDONLY;
+		sb->s_flags |= SB_RDONLY;
 	else if (ofs->upper_mnt->mnt_sb != ofs->same_sb)
 		ofs->same_sb = NULL;
 
@@ -1213,7 +1213,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
 			goto out_free_oe;
 
 		if (!ofs->indexdir)
-			sb->s_flags |= MS_RDONLY;
+			sb->s_flags |= SB_RDONLY;
 	}
 
 	/* Show index=off/on in /proc/mounts for any of the reasons above */
@@ -1227,7 +1227,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_op = &ovl_super_operations;
 	sb->s_xattr = ovl_xattr_handlers;
 	sb->s_fs_info = ofs;
-	sb->s_flags |= MS_POSIXACL | MS_NOREMOTELOCK;
+	sb->s_flags |= SB_POSIXACL | SB_NOREMOTELOCK;
 
 	err = -ENOMEM;
 	root_dentry = d_make_root(ovl_new_inode(sb, S_IFDIR, 0));
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 225f541f7078..dd0f82622427 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -483,7 +483,7 @@ int proc_fill_super(struct super_block *s, void *data, int silent)
 
 	/* User space would break if executables or devices appear on proc */
 	s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC | SB_I_NODEV;
-	s->s_flags |= MS_NODIRATIME | MS_NOSUID | MS_NOEXEC;
+	s->s_flags |= SB_NODIRATIME | SB_NOSUID | SB_NOEXEC;
 	s->s_blocksize = 1024;
 	s->s_blocksize_bits = 10;
 	s->s_magic = PROC_SUPER_MAGIC;
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 4e42aba97f2e..ede8e64974be 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -91,7 +91,7 @@ static struct dentry *proc_mount(struct file_system_type *fs_type,
 {
 	struct pid_namespace *ns;
 
-	if (flags & MS_KERNMOUNT) {
+	if (flags & SB_KERNMOUNT) {
 		ns = data;
 		data = NULL;
 	} else {
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index 7b635d173213..b786840facd9 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -45,10 +45,10 @@ struct proc_fs_info {
 static int show_sb_opts(struct seq_file *m, struct super_block *sb)
 {
 	static const struct proc_fs_info fs_info[] = {
-		{ MS_SYNCHRONOUS, ",sync" },
-		{ MS_DIRSYNC, ",dirsync" },
-		{ MS_MANDLOCK, ",mand" },
-		{ MS_LAZYTIME, ",lazytime" },
+		{ SB_SYNCHRONOUS, ",sync" },
+		{ SB_DIRSYNC, ",dirsync" },
+		{ SB_MANDLOCK, ",mand" },
+		{ SB_LAZYTIME, ",lazytime" },
 		{ 0, NULL }
 	};
 	const struct proc_fs_info *fs_infop;
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 3a67cfb142d8..3d46fe302fcb 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -47,7 +47,7 @@ static int qnx4_remount(struct super_block *sb, int *flags, char *data)
 	sync_filesystem(sb);
 	qs = qnx4_sb(sb);
 	qs->Version = QNX4_VERSION;
-	*flags |= MS_RDONLY;
+	*flags |= SB_RDONLY;
 	return 0;
 }
 
@@ -199,7 +199,7 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent)
 
 	s->s_op = &qnx4_sops;
 	s->s_magic = QNX4_SUPER_MAGIC;
-	s->s_flags |= MS_RDONLY;	/* Yup, read-only yet */
+	s->s_flags |= SB_RDONLY;	/* Yup, read-only yet */
 
 	/* Check the superblock signature. Since the qnx4 code is
 	   dangerous, we should leave as quickly as possible
diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c
index 1192422a1c56..4aeb26bcb4d0 100644
--- a/fs/qnx6/inode.c
+++ b/fs/qnx6/inode.c
@@ -56,7 +56,7 @@ static int qnx6_show_options(struct seq_file *seq, struct dentry *root)
 static int qnx6_remount(struct super_block *sb, int *flags, char *data)
 {
 	sync_filesystem(sb);
-	*flags |= MS_RDONLY;
+	*flags |= SB_RDONLY;
 	return 0;
 }
 
@@ -427,7 +427,7 @@ mmi_success:
 	}
 	s->s_op = &qnx6_sops;
 	s->s_magic = QNX6_SUPER_MAGIC;
-	s->s_flags |= MS_RDONLY;        /* Yup, read-only yet */
+	s->s_flags |= SB_RDONLY;        /* Yup, read-only yet */
 
 	/* ease the later tree level calculations */
 	sbi = QNX6_SB(s);
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 11a48affa882..b13fc024d2ee 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -2106,7 +2106,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
 			journal_end(th);
 			goto out_inserted_sd;
 		}
-	} else if (inode->i_sb->s_flags & MS_POSIXACL) {
+	} else if (inode->i_sb->s_flags & SB_POSIXACL) {
 		reiserfs_warning(inode->i_sb, "jdm-13090",
 				 "ACLs aren't enabled in the fs, "
 				 "but vfs thinks they are!");
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 69ff280bdfe8..70057359fbaf 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -1960,7 +1960,7 @@ static int do_journal_release(struct reiserfs_transaction_handle *th,
 	/*
 	 * Cancel flushing of old commits. Note that neither of these works
 	 * will be requeued because superblock is being shutdown and doesn't
-	 * have MS_ACTIVE set.
+	 * have SB_ACTIVE set.
 	 */
 	reiserfs_cancel_old_flush(sb);
 	/* wait for all commits to finish */
@@ -4302,7 +4302,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, int flags)
 		 * Avoid queueing work when sb is being shut down. Transaction
 		 * will be flushed on journal shutdown.
 		 */
-		if (sb->s_flags & MS_ACTIVE)
+		if (sb->s_flags & SB_ACTIVE)
 			queue_delayed_work(REISERFS_SB(sb)->commit_wq,
 					   &journal->j_work, HZ / 10);
 	}
@@ -4393,7 +4393,7 @@ void reiserfs_abort_journal(struct super_block *sb, int errno)
 	if (!journal->j_errno)
 		journal->j_errno = errno;
 
-	sb->s_flags |= MS_RDONLY;
+	sb->s_flags |= SB_RDONLY;
 	set_bit(J_ABORTED, &journal->j_state);
 
 #ifdef CONFIG_REISERFS_CHECK
diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c
index 64f49cafbc5b..7e288d97adcb 100644
--- a/fs/reiserfs/prints.c
+++ b/fs/reiserfs/prints.c
@@ -390,7 +390,7 @@ void __reiserfs_error(struct super_block *sb, const char *id,
 		return;
 
 	reiserfs_info(sb, "Remounting filesystem read-only\n");
-	sb->s_flags |= MS_RDONLY;
+	sb->s_flags |= SB_RDONLY;
 	reiserfs_abort_journal(sb, -EIO);
 }
 
@@ -409,7 +409,7 @@ void reiserfs_abort(struct super_block *sb, int errno, const char *fmt, ...)
 	printk(KERN_CRIT "REISERFS abort (device %s): %s\n", sb->s_id,
 	       error_buf);
 
-	sb->s_flags |= MS_RDONLY;
+	sb->s_flags |= SB_RDONLY;
 	reiserfs_abort_journal(sb, errno);
 }
 
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 5464ec517702..020c9cacbb2f 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -121,7 +121,7 @@ void reiserfs_schedule_old_flush(struct super_block *s)
 	 * Avoid scheduling flush when sb is being shut down. It can race
 	 * with journal shutdown and free still queued delayed work.
 	 */
-	if (sb_rdonly(s) || !(s->s_flags & MS_ACTIVE))
+	if (sb_rdonly(s) || !(s->s_flags & SB_ACTIVE))
 		return;
 
 	spin_lock(&sbi->old_work_lock);
@@ -252,11 +252,11 @@ static int finish_unfinished(struct super_block *s)
 
 #ifdef CONFIG_QUOTA
 	/* Needed for iput() to work correctly and not trash data */
-	if (s->s_flags & MS_ACTIVE) {
+	if (s->s_flags & SB_ACTIVE) {
 		ms_active_set = 0;
 	} else {
 		ms_active_set = 1;
-		s->s_flags |= MS_ACTIVE;
+		s->s_flags |= SB_ACTIVE;
 	}
 	/* Turn on quotas so that they are updated correctly */
 	for (i = 0; i < REISERFS_MAXQUOTAS; i++) {
@@ -411,7 +411,7 @@ static int finish_unfinished(struct super_block *s)
 	reiserfs_write_lock(s);
 	if (ms_active_set)
 		/* Restore the flag back */
-		s->s_flags &= ~MS_ACTIVE;
+		s->s_flags &= ~SB_ACTIVE;
 #endif
 	pathrelse(&path);
 	if (done)
@@ -1521,7 +1521,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
 			goto out_err_unlock;
 	}
 
-	if (*mount_flags & MS_RDONLY) {
+	if (*mount_flags & SB_RDONLY) {
 		reiserfs_write_unlock(s);
 		reiserfs_xattr_init(s, *mount_flags);
 		/* remount read-only */
@@ -1567,7 +1567,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
 		REISERFS_SB(s)->s_mount_state = sb_umount_state(rs);
 
 		/* now it is safe to call journal_begin */
-		s->s_flags &= ~MS_RDONLY;
+		s->s_flags &= ~SB_RDONLY;
 		err = journal_begin(&th, s, 10);
 		if (err)
 			goto out_err_unlock;
@@ -1575,7 +1575,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
 		/* Mount a partition which is read-only, read-write */
 		reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
 		REISERFS_SB(s)->s_mount_state = sb_umount_state(rs);
-		s->s_flags &= ~MS_RDONLY;
+		s->s_flags &= ~SB_RDONLY;
 		set_sb_umount_state(rs, REISERFS_ERROR_FS);
 		if (!old_format_only(s))
 			set_sb_mnt_count(rs, sb_mnt_count(rs) + 1);
@@ -1590,7 +1590,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
 		goto out_err_unlock;
 
 	reiserfs_write_unlock(s);
-	if (!(*mount_flags & MS_RDONLY)) {
+	if (!(*mount_flags & SB_RDONLY)) {
 		dquot_resume(s, -1);
 		reiserfs_write_lock(s);
 		finish_unfinished(s);
@@ -2055,7 +2055,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
 	if (bdev_read_only(s->s_bdev) && !sb_rdonly(s)) {
 		SWARN(silent, s, "clm-7000",
 		      "Detected readonly device, marking FS readonly");
-		s->s_flags |= MS_RDONLY;
+		s->s_flags |= SB_RDONLY;
 	}
 	args.objectid = REISERFS_ROOT_OBJECTID;
 	args.dirid = REISERFS_ROOT_PARENT_OBJECTID;
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 46492fb37a4c..5dbf5324bdda 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -959,7 +959,7 @@ int reiserfs_lookup_privroot(struct super_block *s)
 
 /*
  * We need to take a copy of the mount flags since things like
- * MS_RDONLY don't get set until *after* we're called.
+ * SB_RDONLY don't get set until *after* we're called.
  * mount_flags != mount_options
  */
 int reiserfs_xattr_init(struct super_block *s, int mount_flags)
@@ -971,7 +971,7 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags)
 	if (err)
 		goto error;
 
-	if (d_really_is_negative(privroot) && !(mount_flags & MS_RDONLY)) {
+	if (d_really_is_negative(privroot) && !(mount_flags & SB_RDONLY)) {
 		inode_lock(d_inode(s->s_root));
 		err = create_privroot(REISERFS_SB(s)->priv_root);
 		inode_unlock(d_inode(s->s_root));
@@ -999,11 +999,11 @@ error:
 		clear_bit(REISERFS_POSIXACL, &REISERFS_SB(s)->s_mount_opt);
 	}
 
-	/* The super_block MS_POSIXACL must mirror the (no)acl mount option. */
+	/* The super_block SB_POSIXACL must mirror the (no)acl mount option. */
 	if (reiserfs_posixacl(s))
-		s->s_flags |= MS_POSIXACL;
+		s->s_flags |= SB_POSIXACL;
 	else
-		s->s_flags &= ~MS_POSIXACL;
+		s->s_flags &= ~SB_POSIXACL;
 
 	return err;
 }
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index 0186fe6d39f3..8f06fd1f3d69 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -451,7 +451,7 @@ static int romfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 static int romfs_remount(struct super_block *sb, int *flags, char *data)
 {
 	sync_filesystem(sb);
-	*flags |= MS_RDONLY;
+	*flags |= SB_RDONLY;
 	return 0;
 }
 
@@ -502,7 +502,7 @@ static int romfs_fill_super(struct super_block *sb, void *data, int silent)
 
 	sb->s_maxbytes = 0xFFFFFFFF;
 	sb->s_magic = ROMFS_MAGIC;
-	sb->s_flags |= MS_RDONLY | MS_NOATIME;
+	sb->s_flags |= SB_RDONLY | SB_NOATIME;
 	sb->s_op = &romfs_super_ops;
 
 #ifdef CONFIG_ROMFS_ON_MTD
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index cf01e15a7b16..8a73b97217c8 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -195,7 +195,7 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
 		(u64) le64_to_cpu(sblk->id_table_start));
 
 	sb->s_maxbytes = MAX_LFS_FILESIZE;
-	sb->s_flags |= MS_RDONLY;
+	sb->s_flags |= SB_RDONLY;
 	sb->s_op = &squashfs_super_ops;
 
 	err = -ENOMEM;
@@ -373,7 +373,7 @@ static int squashfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 static int squashfs_remount(struct super_block *sb, int *flags, char *data)
 {
 	sync_filesystem(sb);
-	*flags |= MS_RDONLY;
+	*flags |= SB_RDONLY;
 	return 0;
 }
 
diff --git a/fs/statfs.c b/fs/statfs.c
index b072a8bab71a..5b2a24f0f263 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -35,11 +35,11 @@ static int flags_by_mnt(int mnt_flags)
 static int flags_by_sb(int s_flags)
 {
 	int flags = 0;
-	if (s_flags & MS_SYNCHRONOUS)
+	if (s_flags & SB_SYNCHRONOUS)
 		flags |= ST_SYNCHRONOUS;
-	if (s_flags & MS_MANDLOCK)
+	if (s_flags & SB_MANDLOCK)
 		flags |= ST_MANDLOCK;
-	if (s_flags & MS_RDONLY)
+	if (s_flags & SB_RDONLY)
 		flags |= ST_RDONLY;
 	return flags;
 }
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 20b8f82e115b..fb49510c5dcf 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -30,7 +30,7 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type,
 	void *ns;
 	bool new_sb;
 
-	if (!(flags & MS_KERNMOUNT)) {
+	if (!(flags & SB_KERNMOUNT)) {
 		if (!kobj_ns_current_may_mount(KOBJ_NS_TYPE_NET))
 			return ERR_PTR(-EPERM);
 	}
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 3c47b7d5d4cf..bec9f79adb25 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -63,7 +63,7 @@ static int sysv_remount(struct super_block *sb, int *flags, char *data)
 
 	sync_filesystem(sb);
 	if (sbi->s_forced_ro)
-		*flags |= MS_RDONLY;
+		*flags |= SB_RDONLY;
 	return 0;
 }
 
diff --git a/fs/sysv/super.c b/fs/sysv/super.c
index 0d56e486b392..89765ddfb738 100644
--- a/fs/sysv/super.c
+++ b/fs/sysv/super.c
@@ -333,7 +333,7 @@ static int complete_read_super(struct super_block *sb, int silent, int size)
 	/* set up enough so that it can read an inode */
 	sb->s_op = &sysv_sops;
 	if (sbi->s_forced_ro)
-		sb->s_flags |= MS_RDONLY;
+		sb->s_flags |= SB_RDONLY;
 	if (sbi->s_truncate)
 		sb->s_d_op = &sysv_dentry_operations;
 	root_inode = sysv_iget(sb, SYSV_ROOT_INO);
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index a02aa59d1e24..dfe85069586e 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1406,7 +1406,7 @@ int ubifs_update_time(struct inode *inode, struct timespec *time,
 	if (flags & S_MTIME)
 		inode->i_mtime = *time;
 
-	if (!(inode->i_sb->s_flags & MS_LAZYTIME))
+	if (!(inode->i_sb->s_flags & SB_LAZYTIME))
 		iflags |= I_DIRTY_SYNC;
 
 	release = ui->dirty;
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index 3be28900bf37..fe77e9625e84 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -84,7 +84,7 @@ void ubifs_ro_mode(struct ubifs_info *c, int err)
 	if (!c->ro_error) {
 		c->ro_error = 1;
 		c->no_chk_data_crc = 0;
-		c->vfs_sb->s_flags |= MS_RDONLY;
+		c->vfs_sb->s_flags |= SB_RDONLY;
 		ubifs_warn(c, "switched to read-only mode, error %d", err);
 		dump_stack();
 	}
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 7503e7cdf870..0beb285b143d 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -968,7 +968,7 @@ static int parse_standard_option(const char *option)
 
 	pr_notice("UBIFS: parse %s\n", option);
 	if (!strcmp(option, "sync"))
-		return MS_SYNCHRONOUS;
+		return SB_SYNCHRONOUS;
 	return 0;
 }
 
@@ -1160,8 +1160,8 @@ static int mount_ubifs(struct ubifs_info *c)
 	size_t sz;
 
 	c->ro_mount = !!sb_rdonly(c->vfs_sb);
-	/* Suppress error messages while probing if MS_SILENT is set */
-	c->probing = !!(c->vfs_sb->s_flags & MS_SILENT);
+	/* Suppress error messages while probing if SB_SILENT is set */
+	c->probing = !!(c->vfs_sb->s_flags & SB_SILENT);
 
 	err = init_constants_early(c);
 	if (err)
@@ -1852,7 +1852,7 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
 		return err;
 	}
 
-	if (c->ro_mount && !(*flags & MS_RDONLY)) {
+	if (c->ro_mount && !(*flags & SB_RDONLY)) {
 		if (c->ro_error) {
 			ubifs_msg(c, "cannot re-mount R/W due to prior errors");
 			return -EROFS;
@@ -1864,7 +1864,7 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
 		err = ubifs_remount_rw(c);
 		if (err)
 			return err;
-	} else if (!c->ro_mount && (*flags & MS_RDONLY)) {
+	} else if (!c->ro_mount && (*flags & SB_RDONLY)) {
 		if (c->ro_error) {
 			ubifs_msg(c, "cannot re-mount R/O due to prior errors");
 			return -EROFS;
@@ -2117,7 +2117,7 @@ static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags,
 	 */
 	ubi = open_ubi(name, UBI_READONLY);
 	if (IS_ERR(ubi)) {
-		if (!(flags & MS_SILENT))
+		if (!(flags & SB_SILENT))
 			pr_err("UBIFS error (pid: %d): cannot open \"%s\", error %d",
 			       current->pid, name, (int)PTR_ERR(ubi));
 		return ERR_CAST(ubi);
@@ -2143,18 +2143,18 @@ static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags,
 		kfree(c);
 		/* A new mount point for already mounted UBIFS */
 		dbg_gen("this ubi volume is already mounted");
-		if (!!(flags & MS_RDONLY) != c1->ro_mount) {
+		if (!!(flags & SB_RDONLY) != c1->ro_mount) {
 			err = -EBUSY;
 			goto out_deact;
 		}
 	} else {
-		err = ubifs_fill_super(sb, data, flags & MS_SILENT ? 1 : 0);
+		err = ubifs_fill_super(sb, data, flags & SB_SILENT ? 1 : 0);
 		if (err)
 			goto out_deact;
 		/* We do not support atime */
-		sb->s_flags |= MS_ACTIVE;
+		sb->s_flags |= SB_ACTIVE;
 #ifndef CONFIG_UBIFS_ATIME_SUPPORT
-		sb->s_flags |= MS_NOATIME;
+		sb->s_flags |= SB_NOATIME;
 #else
 		ubifs_msg(c, "full atime support is enabled.");
 #endif
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 63c7468147eb..5ee7af879cc4 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -1201,7 +1201,7 @@ struct ubifs_debug_info;
  * @need_recovery: %1 if the file-system needs recovery
  * @replaying: %1 during journal replay
  * @mounting: %1 while mounting
- * @probing: %1 while attempting to mount if MS_SILENT mount flag is set
+ * @probing: %1 while attempting to mount if SB_SILENT mount flag is set
  * @remounting_rw: %1 while re-mounting from R/O mode to R/W mode
  * @replay_list: temporary list used during journal replay
  * @replay_buds: list of buds to replay
@@ -1850,7 +1850,7 @@ __printf(2, 3)
 void ubifs_warn(const struct ubifs_info *c, const char *fmt, ...);
 /*
  * A conditional variant of 'ubifs_err()' which doesn't output anything
- * if probing (ie. MS_SILENT set).
+ * if probing (ie. SB_SILENT set).
  */
 #define ubifs_errc(c, fmt, ...)						\
 do {									\
diff --git a/fs/udf/super.c b/fs/udf/super.c
index f80e0a0f24d3..f73239a9a97d 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -650,7 +650,7 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
 	sync_filesystem(sb);
 	if (lvidiu) {
 		int write_rev = le16_to_cpu(lvidiu->minUDFWriteRev);
-		if (write_rev > UDF_MAX_WRITE_VERSION && !(*flags & MS_RDONLY))
+		if (write_rev > UDF_MAX_WRITE_VERSION && !(*flags & SB_RDONLY))
 			return -EACCES;
 	}
 
@@ -673,10 +673,10 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
 	sbi->s_dmode = uopt.dmode;
 	write_unlock(&sbi->s_cred_lock);
 
-	if ((bool)(*flags & MS_RDONLY) == sb_rdonly(sb))
+	if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb))
 		goto out_unlock;
 
-	if (*flags & MS_RDONLY)
+	if (*flags & SB_RDONLY)
 		udf_close_lvid(sb);
 	else
 		udf_open_lvid(sb);
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index b5cd79065ef9..e727ee07dbe4 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -115,7 +115,7 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count)
 	
 	ubh_mark_buffer_dirty (USPI_UBH(uspi));
 	ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
-	if (sb->s_flags & MS_SYNCHRONOUS)
+	if (sb->s_flags & SB_SYNCHRONOUS)
 		ubh_sync_block(UCPI_UBH(ucpi));
 	ufs_mark_sb_dirty(sb);
 
@@ -205,7 +205,7 @@ do_more:
 
 	ubh_mark_buffer_dirty (USPI_UBH(uspi));
 	ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
-	if (sb->s_flags & MS_SYNCHRONOUS)
+	if (sb->s_flags & SB_SYNCHRONOUS)
 		ubh_sync_block(UCPI_UBH(ucpi));
 
 	if (overflow) {
@@ -567,7 +567,7 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment,
 	
 	ubh_mark_buffer_dirty (USPI_UBH(uspi));
 	ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
-	if (sb->s_flags & MS_SYNCHRONOUS)
+	if (sb->s_flags & SB_SYNCHRONOUS)
 		ubh_sync_block(UCPI_UBH(ucpi));
 	ufs_mark_sb_dirty(sb);
 
@@ -688,7 +688,7 @@ cg_found:
 succed:
 	ubh_mark_buffer_dirty (USPI_UBH(uspi));
 	ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
-	if (sb->s_flags & MS_SYNCHRONOUS)
+	if (sb->s_flags & SB_SYNCHRONOUS)
 		ubh_sync_block(UCPI_UBH(ucpi));
 	ufs_mark_sb_dirty(sb);
 
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index 916b4a428933..e1ef0f0a1353 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -112,7 +112,7 @@ void ufs_free_inode (struct inode * inode)
 
 	ubh_mark_buffer_dirty (USPI_UBH(uspi));
 	ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
-	if (sb->s_flags & MS_SYNCHRONOUS)
+	if (sb->s_flags & SB_SYNCHRONOUS)
 		ubh_sync_block(UCPI_UBH(ucpi));
 	
 	ufs_mark_sb_dirty(sb);
@@ -146,14 +146,14 @@ static void ufs2_init_inodes_chunk(struct super_block *sb,
 		set_buffer_uptodate(bh);
 		mark_buffer_dirty(bh);
 		unlock_buffer(bh);
-		if (sb->s_flags & MS_SYNCHRONOUS)
+		if (sb->s_flags & SB_SYNCHRONOUS)
 			sync_dirty_buffer(bh);
 		brelse(bh);
 	}
 
 	fs32_add(sb, &ucg->cg_u.cg_u2.cg_initediblk, uspi->s_inopb);
 	ubh_mark_buffer_dirty(UCPI_UBH(ucpi));
-	if (sb->s_flags & MS_SYNCHRONOUS)
+	if (sb->s_flags & SB_SYNCHRONOUS)
 		ubh_sync_block(UCPI_UBH(ucpi));
 
 	UFSD("EXIT\n");
@@ -284,7 +284,7 @@ cg_found:
 	}
 	ubh_mark_buffer_dirty (USPI_UBH(uspi));
 	ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
-	if (sb->s_flags & MS_SYNCHRONOUS)
+	if (sb->s_flags & SB_SYNCHRONOUS)
 		ubh_sync_block(UCPI_UBH(ucpi));
 	ufs_mark_sb_dirty(sb);
 
@@ -330,7 +330,7 @@ cg_found:
 		ufs2_inode->ui_birthnsec = cpu_to_fs32(sb, ts.tv_nsec);
 		mark_buffer_dirty(bh);
 		unlock_buffer(bh);
-		if (sb->s_flags & MS_SYNCHRONOUS)
+		if (sb->s_flags & SB_SYNCHRONOUS)
 			sync_dirty_buffer(bh);
 		brelse(bh);
 	}
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 6440003f8ddc..4d497e9c6883 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -282,7 +282,7 @@ void ufs_error (struct super_block * sb, const char * function,
 		usb1->fs_clean = UFS_FSBAD;
 		ubh_mark_buffer_dirty(USPI_UBH(uspi));
 		ufs_mark_sb_dirty(sb);
-		sb->s_flags |= MS_RDONLY;
+		sb->s_flags |= SB_RDONLY;
 	}
 	va_start(args, fmt);
 	vaf.fmt = fmt;
@@ -320,7 +320,7 @@ void ufs_panic (struct super_block * sb, const char * function,
 	va_start(args, fmt);
 	vaf.fmt = fmt;
 	vaf.va = &args;
-	sb->s_flags |= MS_RDONLY;
+	sb->s_flags |= SB_RDONLY;
 	pr_crit("panic (device %s): %s: %pV\n",
 		sb->s_id, function, &vaf);
 	va_end(args);
@@ -905,7 +905,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
 		if (!sb_rdonly(sb)) {
 			if (!silent)
 				pr_info("ufstype=old is supported read-only\n");
-			sb->s_flags |= MS_RDONLY;
+			sb->s_flags |= SB_RDONLY;
 		}
 		break;
 	
@@ -921,7 +921,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
 		if (!sb_rdonly(sb)) {
 			if (!silent)
 				pr_info("ufstype=nextstep is supported read-only\n");
-			sb->s_flags |= MS_RDONLY;
+			sb->s_flags |= SB_RDONLY;
 		}
 		break;
 	
@@ -937,7 +937,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
 		if (!sb_rdonly(sb)) {
 			if (!silent)
 				pr_info("ufstype=nextstep-cd is supported read-only\n");
-			sb->s_flags |= MS_RDONLY;
+			sb->s_flags |= SB_RDONLY;
 		}
 		break;
 	
@@ -953,7 +953,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
 		if (!sb_rdonly(sb)) {
 			if (!silent)
 				pr_info("ufstype=openstep is supported read-only\n");
-			sb->s_flags |= MS_RDONLY;
+			sb->s_flags |= SB_RDONLY;
 		}
 		break;
 	
@@ -968,7 +968,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
 		if (!sb_rdonly(sb)) {
 			if (!silent)
 				pr_info("ufstype=hp is supported read-only\n");
-			sb->s_flags |= MS_RDONLY;
+			sb->s_flags |= SB_RDONLY;
  		}
  		break;
 	default:
@@ -1125,21 +1125,21 @@ magic_found:
 			break;
 		case UFS_FSACTIVE:
 			pr_err("%s(): fs is active\n", __func__);
-			sb->s_flags |= MS_RDONLY;
+			sb->s_flags |= SB_RDONLY;
 			break;
 		case UFS_FSBAD:
 			pr_err("%s(): fs is bad\n", __func__);
-			sb->s_flags |= MS_RDONLY;
+			sb->s_flags |= SB_RDONLY;
 			break;
 		default:
 			pr_err("%s(): can't grok fs_clean 0x%x\n",
 			       __func__, usb1->fs_clean);
-			sb->s_flags |= MS_RDONLY;
+			sb->s_flags |= SB_RDONLY;
 			break;
 		}
 	} else {
 		pr_err("%s(): fs needs fsck\n", __func__);
-		sb->s_flags |= MS_RDONLY;
+		sb->s_flags |= SB_RDONLY;
 	}
 
 	/*
@@ -1328,7 +1328,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
 		return -EINVAL;
 	}
 
-	if ((bool)(*mount_flags & MS_RDONLY) == sb_rdonly(sb)) {
+	if ((bool)(*mount_flags & SB_RDONLY) == sb_rdonly(sb)) {
 		UFS_SB(sb)->s_mount_opt = new_mount_opt;
 		mutex_unlock(&UFS_SB(sb)->s_lock);
 		return 0;
@@ -1337,7 +1337,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
 	/*
 	 * fs was mouted as rw, remounting ro
 	 */
-	if (*mount_flags & MS_RDONLY) {
+	if (*mount_flags & SB_RDONLY) {
 		ufs_put_super_internal(sb);
 		usb1->fs_time = cpu_to_fs32(sb, get_seconds());
 		if ((flags & UFS_ST_MASK) == UFS_ST_SUN
@@ -1346,7 +1346,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
 			ufs_set_fs_state(sb, usb1, usb3,
 				UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time));
 		ubh_mark_buffer_dirty (USPI_UBH(uspi));
-		sb->s_flags |= MS_RDONLY;
+		sb->s_flags |= SB_RDONLY;
 	} else {
 	/*
 	 * fs was mounted as ro, remounting rw
@@ -1370,7 +1370,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
 			mutex_unlock(&UFS_SB(sb)->s_lock);
 			return -EPERM;
 		}
-		sb->s_flags &= ~MS_RDONLY;
+		sb->s_flags &= ~SB_RDONLY;
 #endif
 	}
 	UFS_SB(sb)->s_mount_opt = new_mount_opt;
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 38d4227895ae..a503af96d780 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -781,17 +781,17 @@ xfs_log_mount_finish(
 	 * something to an unlinked inode, the irele won't cause
 	 * premature truncation and freeing of the inode, which results
 	 * in log recovery failure.  We have to evict the unreferenced
-	 * lru inodes after clearing MS_ACTIVE because we don't
+	 * lru inodes after clearing SB_ACTIVE because we don't
 	 * otherwise clean up the lru if there's a subsequent failure in
 	 * xfs_mountfs, which leads to us leaking the inodes if nothing
 	 * else (e.g. quotacheck) references the inodes before the
 	 * mount failure occurs.
 	 */
-	mp->m_super->s_flags |= MS_ACTIVE;
+	mp->m_super->s_flags |= SB_ACTIVE;
 	error = xlog_recover_finish(mp->m_log);
 	if (!error)
 		xfs_log_work_queue(mp);
-	mp->m_super->s_flags &= ~MS_ACTIVE;
+	mp->m_super->s_flags &= ~SB_ACTIVE;
 	evict_inodes(mp->m_super);
 
 	/*
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index f663022353c0..5122d3021117 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -212,9 +212,9 @@ xfs_parseargs(
 	 */
 	if (sb_rdonly(sb))
 		mp->m_flags |= XFS_MOUNT_RDONLY;
-	if (sb->s_flags & MS_DIRSYNC)
+	if (sb->s_flags & SB_DIRSYNC)
 		mp->m_flags |= XFS_MOUNT_DIRSYNC;
-	if (sb->s_flags & MS_SYNCHRONOUS)
+	if (sb->s_flags & SB_SYNCHRONOUS)
 		mp->m_flags |= XFS_MOUNT_WSYNC;
 
 	/*
@@ -1312,7 +1312,7 @@ xfs_fs_remount(
 	}
 
 	/* ro -> rw */
-	if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & MS_RDONLY)) {
+	if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & SB_RDONLY)) {
 		if (mp->m_flags & XFS_MOUNT_NORECOVERY) {
 			xfs_warn(mp,
 		"ro->rw transition prohibited on norecovery mount");
@@ -1368,7 +1368,7 @@ xfs_fs_remount(
 	}
 
 	/* rw -> ro */
-	if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) {
+	if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & SB_RDONLY)) {
 		/* Free the per-AG metadata reservation pool. */
 		error = xfs_fs_unreserve_ag_blocks(mp);
 		if (error) {
diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h
index 5f2f32408011..fcc5dfc70aa0 100644
--- a/fs/xfs/xfs_super.h
+++ b/fs/xfs/xfs_super.h
@@ -30,7 +30,7 @@ extern void xfs_qm_exit(void);
 
 #ifdef CONFIG_XFS_POSIX_ACL
 # define XFS_ACL_STRING		"ACLs, "
-# define set_posix_acl_flag(sb)	((sb)->s_flags |= MS_POSIXACL)
+# define set_posix_acl_flag(sb)	((sb)->s_flags |= SB_POSIXACL)
 #else
 # define XFS_ACL_STRING
 # define set_posix_acl_flag(sb)	do { } while (0)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 2995a271ec46..bbd92da0946e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1872,7 +1872,7 @@ struct super_operations {
  */
 #define __IS_FLG(inode, flg)	((inode)->i_sb->s_flags & (flg))
 
-static inline bool sb_rdonly(const struct super_block *sb) { return sb->s_flags & MS_RDONLY; }
+static inline bool sb_rdonly(const struct super_block *sb) { return sb->s_flags & SB_RDONLY; }
 #define IS_RDONLY(inode)	sb_rdonly((inode)->i_sb)
 #define IS_SYNC(inode)		(__IS_FLG(inode, SB_SYNCHRONOUS) || \
 					((inode)->i_flags & S_SYNC))
diff --git a/include/uapi/linux/bfs_fs.h b/include/uapi/linux/bfs_fs.h
index 73445ef07dda..940b04772af8 100644
--- a/include/uapi/linux/bfs_fs.h
+++ b/include/uapi/linux/bfs_fs.h
@@ -76,7 +76,7 @@ struct bfs_super_block {
 #define BFS_FILEBLOCKS(ip) \
         ((ip)->i_sblock == 0 ? 0 : (le32_to_cpu((ip)->i_eblock) + 1) -  le32_to_cpu((ip)->i_sblock))
 #define BFS_UNCLEAN(bfs_sb, sb)	\
-	((le32_to_cpu(bfs_sb->s_from) != -1) && (le32_to_cpu(bfs_sb->s_to) != -1) && !(sb->s_flags & MS_RDONLY))
+	((le32_to_cpu(bfs_sb->s_from) != -1) && (le32_to_cpu(bfs_sb->s_to) != -1) && !(sb->s_flags & SB_RDONLY))
 
 
 #endif	/* _LINUX_BFS_FS_H */
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index d24025626310..9649ecd8a73a 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -331,7 +331,7 @@ static struct dentry *mqueue_mount(struct file_system_type *fs_type,
 			 void *data)
 {
 	struct ipc_namespace *ns;
-	if (flags & MS_KERNMOUNT) {
+	if (flags & SB_KERNMOUNT) {
 		ns = data;
 		data = NULL;
 	} else {
diff --git a/mm/shmem.c b/mm/shmem.c
index 4aa9307feab0..7fbe67be86fa 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -3776,7 +3776,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
 	 * tmpfs instance, limiting inodes to one per page of lowmem;
 	 * but the internal instance is left unlimited.
 	 */
-	if (!(sb->s_flags & MS_KERNMOUNT)) {
+	if (!(sb->s_flags & SB_KERNMOUNT)) {
 		sbinfo->max_blocks = shmem_default_max_blocks();
 		sbinfo->max_inodes = shmem_default_max_inodes();
 		if (shmem_parse_options(data, sbinfo, false)) {
@@ -3784,12 +3784,12 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
 			goto failed;
 		}
 	} else {
-		sb->s_flags |= MS_NOUSER;
+		sb->s_flags |= SB_NOUSER;
 	}
 	sb->s_export_op = &shmem_export_ops;
-	sb->s_flags |= MS_NOSEC;
+	sb->s_flags |= SB_NOSEC;
 #else
-	sb->s_flags |= MS_NOUSER;
+	sb->s_flags |= SB_NOUSER;
 #endif
 
 	spin_lock_init(&sbinfo->stat_lock);
@@ -3809,7 +3809,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_xattr = shmem_xattr_handlers;
 #endif
 #ifdef CONFIG_TMPFS_POSIX_ACL
-	sb->s_flags |= MS_POSIXACL;
+	sb->s_flags |= SB_POSIXACL;
 #endif
 	uuid_gen(&sb->s_uuid);
 
diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c
index 8542e9a55e1b..d4fa04d91439 100644
--- a/security/apparmor/apparmorfs.c
+++ b/security/apparmor/apparmorfs.c
@@ -2451,7 +2451,7 @@ static int __init aa_create_aafs(void)
 	aafs_mnt = kern_mount(&aafs_ops);
 	if (IS_ERR(aafs_mnt))
 		panic("can't set apparmorfs up\n");
-	aafs_mnt->mnt_sb->s_flags &= ~MS_NOUSER;
+	aafs_mnt->mnt_sb->s_flags &= ~SB_NOUSER;
 
 	/* Populate fs tree. */
 	error = entry_create_dir(&aa_sfs_entry, NULL);
diff --git a/security/apparmor/include/lib.h b/security/apparmor/include/lib.h
index f546707a2bbb..6505e1ad9e23 100644
--- a/security/apparmor/include/lib.h
+++ b/security/apparmor/include/lib.h
@@ -86,7 +86,7 @@ static inline unsigned int aa_dfa_null_transition(struct aa_dfa *dfa,
 
 static inline bool path_mediated_fs(struct dentry *dentry)
 {
-	return !(dentry->d_sb->s_flags & MS_NOUSER);
+	return !(dentry->d_sb->s_flags & SB_NOUSER);
 }
 
 
-- 
cgit v1.2.3


From 8ced390c2b18364af35e3d3f080e06f8ea96be9a Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 2 Jul 2017 22:05:03 -0400
Subject: define __poll_t, annotate constants

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/blackfin/include/uapi/asm/poll.h |  4 ++--
 arch/frv/include/uapi/asm/poll.h      |  2 +-
 arch/m68k/include/uapi/asm/poll.h     |  2 +-
 arch/mips/include/uapi/asm/poll.h     |  2 +-
 arch/sparc/include/uapi/asm/poll.h    |  8 ++++----
 arch/xtensa/include/uapi/asm/poll.h   |  4 ++--
 include/uapi/asm-generic/poll.h       | 30 +++++++++++++++---------------
 include/uapi/linux/types.h            |  6 ++++++
 8 files changed, 32 insertions(+), 26 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/arch/blackfin/include/uapi/asm/poll.h b/arch/blackfin/include/uapi/asm/poll.h
index 8b094d43e9b7..961e49056b0a 100644
--- a/arch/blackfin/include/uapi/asm/poll.h
+++ b/arch/blackfin/include/uapi/asm/poll.h
@@ -9,8 +9,8 @@
 #ifndef _UAPI__BFIN_POLL_H
 #define _UAPI__BFIN_POLL_H
 
-#define POLLWRNORM	4 /* POLLOUT */
-#define POLLWRBAND	256
+#define POLLWRNORM	(__force __poll_t)4 /* POLLOUT */
+#define POLLWRBAND	(__force __poll_t)256
 
 #include <asm-generic/poll.h>
 
diff --git a/arch/frv/include/uapi/asm/poll.h b/arch/frv/include/uapi/asm/poll.h
index 887b67288340..d7053ada7c69 100644
--- a/arch/frv/include/uapi/asm/poll.h
+++ b/arch/frv/include/uapi/asm/poll.h
@@ -3,7 +3,7 @@
 #define _ASM_POLL_H
 
 #define POLLWRNORM	POLLOUT
-#define POLLWRBAND	256
+#define POLLWRBAND	(__force __poll_t)256
 
 #include <asm-generic/poll.h>
 
diff --git a/arch/m68k/include/uapi/asm/poll.h b/arch/m68k/include/uapi/asm/poll.h
index c3e3fcc15e1d..99314937fe5c 100644
--- a/arch/m68k/include/uapi/asm/poll.h
+++ b/arch/m68k/include/uapi/asm/poll.h
@@ -3,7 +3,7 @@
 #define __m68k_POLL_H
 
 #define POLLWRNORM	POLLOUT
-#define POLLWRBAND	256
+#define POLLWRBAND	(__force __poll_t)256
 
 #include <asm-generic/poll.h>
 
diff --git a/arch/mips/include/uapi/asm/poll.h b/arch/mips/include/uapi/asm/poll.h
index ad289d7b7434..e937f8b1b007 100644
--- a/arch/mips/include/uapi/asm/poll.h
+++ b/arch/mips/include/uapi/asm/poll.h
@@ -3,7 +3,7 @@
 #define __ASM_POLL_H
 
 #define POLLWRNORM	POLLOUT
-#define POLLWRBAND	0x0100
+#define POLLWRBAND	(__force __poll_t)0x0100
 
 #include <asm-generic/poll.h>
 
diff --git a/arch/sparc/include/uapi/asm/poll.h b/arch/sparc/include/uapi/asm/poll.h
index 72356c999125..595cb12df1f1 100644
--- a/arch/sparc/include/uapi/asm/poll.h
+++ b/arch/sparc/include/uapi/asm/poll.h
@@ -3,10 +3,10 @@
 #define __SPARC_POLL_H
 
 #define POLLWRNORM	POLLOUT
-#define POLLWRBAND	256
-#define POLLMSG		512
-#define POLLREMOVE	1024
-#define POLLRDHUP       2048
+#define POLLWRBAND	(__force __poll_t)256
+#define POLLMSG		(__force __poll_t)512
+#define POLLREMOVE	(__force __poll_t)1024
+#define POLLRDHUP       (__force __poll_t)2048
 
 #include <asm-generic/poll.h>
 
diff --git a/arch/xtensa/include/uapi/asm/poll.h b/arch/xtensa/include/uapi/asm/poll.h
index 4d249040b33d..22bbc48b9f85 100644
--- a/arch/xtensa/include/uapi/asm/poll.h
+++ b/arch/xtensa/include/uapi/asm/poll.h
@@ -13,8 +13,8 @@
 #define _XTENSA_POLL_H
 
 #define POLLWRNORM	POLLOUT
-#define POLLWRBAND	0x0100
-#define POLLREMOVE	0x0800
+#define POLLWRBAND	(__force __poll_t)0x0100
+#define POLLREMOVE	(__force __poll_t)0x0800
 
 #include <asm-generic/poll.h>
 
diff --git a/include/uapi/asm-generic/poll.h b/include/uapi/asm-generic/poll.h
index fefb3d2c3fac..8c0e9dd384b6 100644
--- a/include/uapi/asm-generic/poll.h
+++ b/include/uapi/asm-generic/poll.h
@@ -3,35 +3,35 @@
 #define __ASM_GENERIC_POLL_H
 
 /* These are specified by iBCS2 */
-#define POLLIN		0x0001
-#define POLLPRI		0x0002
-#define POLLOUT		0x0004
-#define POLLERR		0x0008
-#define POLLHUP		0x0010
-#define POLLNVAL	0x0020
+#define POLLIN		(__force __poll_t)0x0001
+#define POLLPRI		(__force __poll_t)0x0002
+#define POLLOUT		(__force __poll_t)0x0004
+#define POLLERR		(__force __poll_t)0x0008
+#define POLLHUP		(__force __poll_t)0x0010
+#define POLLNVAL	(__force __poll_t)0x0020
 
 /* The rest seem to be more-or-less nonstandard. Check them! */
-#define POLLRDNORM	0x0040
-#define POLLRDBAND	0x0080
+#define POLLRDNORM	(__force __poll_t)0x0040
+#define POLLRDBAND	(__force __poll_t)0x0080
 #ifndef POLLWRNORM
-#define POLLWRNORM	0x0100
+#define POLLWRNORM	(__force __poll_t)0x0100
 #endif
 #ifndef POLLWRBAND
-#define POLLWRBAND	0x0200
+#define POLLWRBAND	(__force __poll_t)0x0200
 #endif
 #ifndef POLLMSG
-#define POLLMSG		0x0400
+#define POLLMSG		(__force __poll_t)0x0400
 #endif
 #ifndef POLLREMOVE
-#define POLLREMOVE	0x1000
+#define POLLREMOVE	(__force __poll_t)0x1000
 #endif
 #ifndef POLLRDHUP
-#define POLLRDHUP       0x2000
+#define POLLRDHUP       (__force __poll_t)0x2000
 #endif
 
-#define POLLFREE	0x4000	/* currently only for epoll */
+#define POLLFREE	(__force __poll_t)0x4000	/* currently only for epoll */
 
-#define POLL_BUSY_LOOP	0x8000
+#define POLL_BUSY_LOOP	(__force __poll_t)0x8000
 
 struct pollfd {
 	int fd;
diff --git a/include/uapi/linux/types.h b/include/uapi/linux/types.h
index e3d1d0c78f3c..cd4f0b897a48 100644
--- a/include/uapi/linux/types.h
+++ b/include/uapi/linux/types.h
@@ -49,5 +49,11 @@ typedef __u32 __bitwise __wsum;
 #define __aligned_be64 __be64 __attribute__((aligned(8)))
 #define __aligned_le64 __le64 __attribute__((aligned(8)))
 
+#ifdef __CHECK_POLL
+typedef unsigned __bitwise __poll_t;
+#else
+typedef unsigned __poll_t;
+#endif
+
 #endif /*  __ASSEMBLY__ */
 #endif /* _UAPI_LINUX_TYPES_H */
-- 
cgit v1.2.3


From 81cf4a45360f70528f1f64ba018d61cb5767249a Mon Sep 17 00:00:00 2001
From: Masakazu Mokuno <masakazu.mokuno@gmail.com>
Date: Fri, 10 Nov 2017 01:25:50 +0900
Subject: USB: core: Add type-specific length check of BOS descriptors

As most BOS descriptors are longer than their header
'struct usb_dev_cap_header', comparing the length solely against the header
size is not sufficient to avoid out-of-bounds access to BOS descriptors.

This patch adds a descriptor-type-specific length check in
usb_get_bos_descriptor() to fix the issue.

Signed-off-by: Masakazu Mokuno <masakazu.mokuno@gmail.com>
Cc: stable <stable@vger.kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/core/config.c    | 28 ++++++++++++++++++++++++----
 include/uapi/linux/usb/ch9.h |  3 +++
 2 files changed, 27 insertions(+), 4 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/usb/core/config.c b/drivers/usb/core/config.c
index da8acd980fc6..55b198ba629b 100644
--- a/drivers/usb/core/config.c
+++ b/drivers/usb/core/config.c
@@ -905,14 +905,25 @@ void usb_release_bos_descriptor(struct usb_device *dev)
 	}
 }
 
+static const __u8 bos_desc_len[256] = {
+	[USB_CAP_TYPE_WIRELESS_USB] = USB_DT_USB_WIRELESS_CAP_SIZE,
+	[USB_CAP_TYPE_EXT]          = USB_DT_USB_EXT_CAP_SIZE,
+	[USB_SS_CAP_TYPE]           = USB_DT_USB_SS_CAP_SIZE,
+	[USB_SSP_CAP_TYPE]          = USB_DT_USB_SSP_CAP_SIZE(1),
+	[CONTAINER_ID_TYPE]         = USB_DT_USB_SS_CONTN_ID_SIZE,
+	[USB_PTM_CAP_TYPE]          = USB_DT_USB_PTM_ID_SIZE,
+};
+
 /* Get BOS descriptor set */
 int usb_get_bos_descriptor(struct usb_device *dev)
 {
 	struct device *ddev = &dev->dev;
 	struct usb_bos_descriptor *bos;
 	struct usb_dev_cap_header *cap;
+	struct usb_ssp_cap_descriptor *ssp_cap;
 	unsigned char *buffer;
-	int length, total_len, num, i;
+	int length, total_len, num, i, ssac;
+	__u8 cap_type;
 	int ret;
 
 	bos = kzalloc(sizeof(struct usb_bos_descriptor), GFP_KERNEL);
@@ -965,7 +976,13 @@ int usb_get_bos_descriptor(struct usb_device *dev)
 			dev->bos->desc->bNumDeviceCaps = i;
 			break;
 		}
+		cap_type = cap->bDevCapabilityType;
 		length = cap->bLength;
+		if (bos_desc_len[cap_type] && length < bos_desc_len[cap_type]) {
+			dev->bos->desc->bNumDeviceCaps = i;
+			break;
+		}
+
 		total_len -= length;
 
 		if (cap->bDescriptorType != USB_DT_DEVICE_CAPABILITY) {
@@ -973,7 +990,7 @@ int usb_get_bos_descriptor(struct usb_device *dev)
 			continue;
 		}
 
-		switch (cap->bDevCapabilityType) {
+		switch (cap_type) {
 		case USB_CAP_TYPE_WIRELESS_USB:
 			/* Wireless USB cap descriptor is handled by wusb */
 			break;
@@ -986,8 +1003,11 @@ int usb_get_bos_descriptor(struct usb_device *dev)
 				(struct usb_ss_cap_descriptor *)buffer;
 			break;
 		case USB_SSP_CAP_TYPE:
-			dev->bos->ssp_cap =
-				(struct usb_ssp_cap_descriptor *)buffer;
+			ssp_cap = (struct usb_ssp_cap_descriptor *)buffer;
+			ssac = (le32_to_cpu(ssp_cap->bmAttributes) &
+				USB_SSP_SUBLINK_SPEED_ATTRIBS) + 1;
+			if (length >= USB_DT_USB_SSP_CAP_SIZE(ssac))
+				dev->bos->ssp_cap = ssp_cap;
 			break;
 		case CONTAINER_ID_TYPE:
 			dev->bos->ss_id =
diff --git a/include/uapi/linux/usb/ch9.h b/include/uapi/linux/usb/ch9.h
index 41a0a81b01e6..c4c79aa331bd 100644
--- a/include/uapi/linux/usb/ch9.h
+++ b/include/uapi/linux/usb/ch9.h
@@ -880,6 +880,8 @@ struct usb_wireless_cap_descriptor {	/* Ultra Wide Band */
 	__u8  bReserved;
 } __attribute__((packed));
 
+#define USB_DT_USB_WIRELESS_CAP_SIZE	11
+
 /* USB 2.0 Extension descriptor */
 #define	USB_CAP_TYPE_EXT		2
 
@@ -1072,6 +1074,7 @@ struct usb_ptm_cap_descriptor {
 	__u8  bDevCapabilityType;
 } __attribute__((packed));
 
+#define USB_DT_USB_PTM_ID_SIZE		3
 /*
  * The size of the descriptor for the Sublink Speed Attribute Count
  * (SSAC) specified in bmAttributes[4:0].
-- 
cgit v1.2.3


From 9a450484089dfa8b6348eff2a918f3c8f38585b9 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Mon, 27 Nov 2017 12:29:50 +0100
Subject: lp: support 64-bit time_t user space

Once we get a glibc with 64-bit time_t, the LPSETTIMEOUT ioctl stops
working, since the command number and data structure no longer match.

To work around that, this introduces a new command number LPSETTIMEOUT_NEW
that is used whenever the modified user space evaluates the LPSETTIMEOUT
macro.

The trick we use is a bit convoluted but necessary: we cannot check for
any macros set by the C library in linux/lp.h, because this particular
header can be included before including sys/time.h. However, we can assume
that by the time that LPSETTIMEOUT is seen in the code, the definition
for 'timeval' and 'time_t' has been seen as well, so we can use the
sizeof() operator to determine whether we should use the old or the
new definition. We use the old one not only for traditional 32-bit user
space with 32-bit time_t, but also for all 64-bit architectures and x32,
which always use a 64-bit time_t. The new definition will be used only for
32-bit user space with 64-bit time_t, which also requires a newer kernel.

The compat_ioctl() handler now implements both commands, but has to
use a special case for existing x32 binaries. The native ioctl handler
now implements both command numbers on both 32-bit and 64-bit, though
the latter version uses the same interpretation for both.

This is based on an earlier patch from Bamvor.

Cc: Bamvor Jian Zhang <bamv2005@gmail.com>
Link: http://www.spinics.net/lists/y2038/msg01162.html
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/char/lp.c       | 67 +++++++++++++++++++++++++++++++++++++------------
 include/uapi/linux/lp.h | 12 ++++++++-
 2 files changed, 62 insertions(+), 17 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/char/lp.c b/drivers/char/lp.c
index 8249762192d5..be14abf70da1 100644
--- a/drivers/char/lp.c
+++ b/drivers/char/lp.c
@@ -659,17 +659,31 @@ static int lp_do_ioctl(unsigned int minor, unsigned int cmd,
 	return retval;
 }
 
-static int lp_set_timeout(unsigned int minor, struct timeval *par_timeout)
+static int lp_set_timeout(unsigned int minor, s64 tv_sec, long tv_usec)
 {
 	long to_jiffies;
 
 	/* Convert to jiffies, place in lp_table */
-	if ((par_timeout->tv_sec < 0) ||
-	    (par_timeout->tv_usec < 0)) {
+	if (tv_sec < 0 || tv_usec < 0)
 		return -EINVAL;
+
+	/*
+	 * we used to not check, so let's not make this fatal,
+	 * but deal with user space passing a 32-bit tv_usec in
+	 * a 64-bit field, capping the timeout to 1 second
+	 * worth of microseconds, and capping the total at
+	 * MAX_JIFFY_OFFSET.
+	 */
+	if (tv_usec > 999999)
+		tv_usec = 999999;
+
+	if (tv_sec >= MAX_SEC_IN_JIFFIES - 1) {
+		to_jiffies = MAX_JIFFY_OFFSET;
+	} else {
+		to_jiffies = DIV_ROUND_UP(tv_usec, 1000000/HZ);
+		to_jiffies += tv_sec * (long) HZ;
 	}
-	to_jiffies = DIV_ROUND_UP(par_timeout->tv_usec, 1000000/HZ);
-	to_jiffies += par_timeout->tv_sec * (long) HZ;
+
 	if (to_jiffies <= 0) {
 		return -EINVAL;
 	}
@@ -677,23 +691,43 @@ static int lp_set_timeout(unsigned int minor, struct timeval *par_timeout)
 	return 0;
 }
 
+static int lp_set_timeout32(unsigned int minor, void __user *arg)
+{
+	s32 karg[2];
+
+	if (copy_from_user(karg, arg, sizeof(karg)))
+		return -EFAULT;
+
+	return lp_set_timeout(minor, karg[0], karg[1]);
+}
+
+static int lp_set_timeout64(unsigned int minor, void __user *arg)
+{
+	s64 karg[2];
+
+	if (copy_from_user(karg, arg, sizeof(karg)))
+		return -EFAULT;
+
+	return lp_set_timeout(minor, karg[0], karg[1]);
+}
+
 static long lp_ioctl(struct file *file, unsigned int cmd,
 			unsigned long arg)
 {
 	unsigned int minor;
-	struct timeval par_timeout;
 	int ret;
 
 	minor = iminor(file_inode(file));
 	mutex_lock(&lp_mutex);
 	switch (cmd) {
-	case LPSETTIMEOUT:
-		if (copy_from_user(&par_timeout, (void __user *)arg,
-					sizeof (struct timeval))) {
-			ret = -EFAULT;
+	case LPSETTIMEOUT_OLD:
+		if (BITS_PER_LONG == 32) {
+			ret = lp_set_timeout32(minor, (void __user *)arg);
 			break;
 		}
-		ret = lp_set_timeout(minor, &par_timeout);
+		/* fallthrough for 64-bit */
+	case LPSETTIMEOUT_NEW:
+		ret = lp_set_timeout64(minor, (void __user *)arg);
 		break;
 	default:
 		ret = lp_do_ioctl(minor, cmd, arg, (void __user *)arg);
@@ -709,18 +743,19 @@ static long lp_compat_ioctl(struct file *file, unsigned int cmd,
 			unsigned long arg)
 {
 	unsigned int minor;
-	struct timeval par_timeout;
 	int ret;
 
 	minor = iminor(file_inode(file));
 	mutex_lock(&lp_mutex);
 	switch (cmd) {
-	case LPSETTIMEOUT:
-		if (compat_get_timeval(&par_timeout, compat_ptr(arg))) {
-			ret = -EFAULT;
+	case LPSETTIMEOUT_OLD:
+		if (!COMPAT_USE_64BIT_TIME) {
+			ret = lp_set_timeout32(minor, (void __user *)arg);
 			break;
 		}
-		ret = lp_set_timeout(minor, &par_timeout);
+		/* fallthrough for x32 mode */
+	case LPSETTIMEOUT_NEW:
+		ret = lp_set_timeout64(minor, (void __user *)arg);
 		break;
 #ifdef LP_STATS
 	case LPGETSTATS:
diff --git a/include/uapi/linux/lp.h b/include/uapi/linux/lp.h
index dafcfe4e4834..8589a27037d7 100644
--- a/include/uapi/linux/lp.h
+++ b/include/uapi/linux/lp.h
@@ -8,6 +8,8 @@
 #ifndef _UAPI_LINUX_LP_H
 #define _UAPI_LINUX_LP_H
 
+#include <linux/types.h>
+#include <linux/ioctl.h>
 
 /*
  * Per POSIX guidelines, this module reserves the LP and lp prefixes
@@ -88,7 +90,15 @@
 #define LPGETSTATS  0x060d  /* get statistics (struct lp_stats) */
 #endif
 #define LPGETFLAGS  0x060e  /* get status flags */
-#define LPSETTIMEOUT 0x060f /* set parport timeout */
+#define LPSETTIMEOUT_OLD 0x060f /* set parport timeout */
+#define LPSETTIMEOUT_NEW \
+	_IOW(0x6, 0xf, __s64[2]) /* set parport timeout */
+#if __BITS_PER_LONG == 64
+#define LPSETTIMEOUT LPSETTIMEOUT_OLD
+#else
+#define LPSETTIMEOUT (sizeof(time_t) > sizeof(__kernel_long_t) ? \
+	LPSETTIMEOUT_NEW : LPSETTIMEOUT_OLD)
+#endif
 
 /* timeout for printk'ing a timeout, in jiffies (100ths of a second).
    This is also used for re-checking error conditions if LP_ABORT is
-- 
cgit v1.2.3


From 26500475ac1b499d8636ff281311d633909f5d20 Mon Sep 17 00:00:00 2001
From: Tycho Andersen <tycho@docker.com>
Date: Wed, 11 Oct 2017 09:39:21 -0600
Subject: ptrace, seccomp: add support for retrieving seccomp metadata

With the new SECCOMP_FILTER_FLAG_LOG, we need to be able to extract these
flags for checkpoint restore, since they describe the state of a filter.

So, let's add PTRACE_SECCOMP_GET_METADATA, similar to ..._GET_FILTER, which
returns the metadata of the nth filter (right now, just the flags).
Hopefully this will be future proof, and new per-filter metadata can be
added to this struct.

Signed-off-by: Tycho Andersen <tycho@docker.com>
CC: Kees Cook <keescook@chromium.org>
CC: Andy Lutomirski <luto@amacapital.net>
CC: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 include/linux/seccomp.h     |  8 ++++++++
 include/uapi/linux/ptrace.h |  6 ++++++
 kernel/ptrace.c             |  4 ++++
 kernel/seccomp.c            | 33 +++++++++++++++++++++++++++++++++
 4 files changed, 51 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index 10f25f7e4304..c723a5c4e3ff 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -95,11 +95,19 @@ static inline void get_seccomp_filter(struct task_struct *tsk)
 #if defined(CONFIG_SECCOMP_FILTER) && defined(CONFIG_CHECKPOINT_RESTORE)
 extern long seccomp_get_filter(struct task_struct *task,
 			       unsigned long filter_off, void __user *data);
+extern long seccomp_get_metadata(struct task_struct *task,
+				 unsigned long filter_off, void __user *data);
 #else
 static inline long seccomp_get_filter(struct task_struct *task,
 				      unsigned long n, void __user *data)
 {
 	return -EINVAL;
 }
+static inline long seccomp_get_metadata(struct task_struct *task,
+					unsigned long filter_off,
+					void __user *data)
+{
+	return -EINVAL;
+}
 #endif /* CONFIG_SECCOMP_FILTER && CONFIG_CHECKPOINT_RESTORE */
 #endif /* _LINUX_SECCOMP_H */
diff --git a/include/uapi/linux/ptrace.h b/include/uapi/linux/ptrace.h
index e3939e00980b..e46d82b91166 100644
--- a/include/uapi/linux/ptrace.h
+++ b/include/uapi/linux/ptrace.h
@@ -66,6 +66,12 @@ struct ptrace_peeksiginfo_args {
 #define PTRACE_SETSIGMASK	0x420b
 
 #define PTRACE_SECCOMP_GET_FILTER	0x420c
+#define PTRACE_SECCOMP_GET_METADATA	0x420d
+
+struct seccomp_metadata {
+	unsigned long filter_off;	/* Input: which filter */
+	unsigned int flags;		/* Output: filter's flags */
+};
 
 /* Read signals from a shared (process wide) queue */
 #define PTRACE_PEEKSIGINFO_SHARED	(1 << 0)
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 84b1367935e4..58291e9f3276 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -1092,6 +1092,10 @@ int ptrace_request(struct task_struct *child, long request,
 		ret = seccomp_get_filter(child, addr, datavp);
 		break;
 
+	case PTRACE_SECCOMP_GET_METADATA:
+		ret = seccomp_get_metadata(child, addr, datavp);
+		break;
+
 	default:
 		break;
 	}
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 99bddaf79076..61bd9dc260c8 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -1061,6 +1061,39 @@ out:
 	__put_seccomp_filter(filter);
 	return ret;
 }
+
+long seccomp_get_metadata(struct task_struct *task,
+			  unsigned long size, void __user *data)
+{
+	long ret;
+	struct seccomp_filter *filter;
+	struct seccomp_metadata kmd = {};
+
+	if (!capable(CAP_SYS_ADMIN) ||
+	    current->seccomp.mode != SECCOMP_MODE_DISABLED) {
+		return -EACCES;
+	}
+
+	size = min_t(unsigned long, size, sizeof(kmd));
+
+	if (copy_from_user(&kmd, data, size))
+		return -EFAULT;
+
+	filter = get_nth_filter(task, kmd.filter_off);
+	if (IS_ERR(filter))
+		return PTR_ERR(filter);
+
+	memset(&kmd, 0, sizeof(kmd));
+	if (filter->log)
+		kmd.flags |= SECCOMP_FILTER_FLAG_LOG;
+
+	ret = size;
+	if (copy_to_user(data, &kmd, size))
+		ret = -EFAULT;
+
+	__put_seccomp_filter(filter);
+	return ret;
+}
 #endif
 
 #ifdef CONFIG_SYSCTL
-- 
cgit v1.2.3


From f2aa97240c84b8f258710e297ba60048bd9c153e Mon Sep 17 00:00:00 2001
From: Jens Wiklander <jens.wiklander@linaro.org>
Date: Fri, 23 Dec 2016 13:13:34 +0100
Subject: tee: add TEE_IOCTL_PARAM_ATTR_META

Adds TEE_IOCTL_PARAM_ATTR_META which can be used to indicate meta
parameters when communicating with user space. These meta parameters can
be used by the supplicant to support multiple parallel requests at a time.

Reviewed-by: Etienne Carriere <etienne.carriere@linaro.org>
Signed-off-by: Jens Wiklander <jens.wiklander@linaro.org>
---
 drivers/tee/optee/supp.c | 25 +++++++++++++++++++++++++
 drivers/tee/tee_core.c   | 16 ++++++++++------
 include/uapi/linux/tee.h |  7 +++++++
 3 files changed, 42 insertions(+), 6 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/tee/optee/supp.c b/drivers/tee/optee/supp.c
index b4ea0678a436..56aa8b929b8c 100644
--- a/drivers/tee/optee/supp.c
+++ b/drivers/tee/optee/supp.c
@@ -119,6 +119,27 @@ u32 optee_supp_thrd_req(struct tee_context *ctx, u32 func, size_t num_params,
 	return ret;
 }
 
+static int supp_check_recv_params(size_t num_params, struct tee_param *params)
+{
+	size_t n;
+
+	/*
+	 * If there are memrefs we need to decrease those as they were
+	 * increased earlier and we'll even refuse to accept any below.
+	 */
+	for (n = 0; n < num_params; n++)
+		if (tee_param_is_memref(params + n) && params[n].u.memref.shm)
+			tee_shm_put(params[n].u.memref.shm);
+
+	/*
+	 * We only expect parameters as TEE_IOCTL_PARAM_ATTR_TYPE_NONE (0).
+	 */
+	for (n = 0; n < num_params; n++)
+		if (params[n].attr)
+			return -EINVAL;
+	return 0;
+}
+
 /**
  * optee_supp_recv() - receive request for supplicant
  * @ctx:	context receiving the request
@@ -137,6 +158,10 @@ int optee_supp_recv(struct tee_context *ctx, u32 *func, u32 *num_params,
 	struct optee_supp *supp = &optee->supp;
 	int rc;
 
+	rc = supp_check_recv_params(*num_params, param);
+	if (rc)
+		return rc;
+
 	/*
 	 * In case two threads in one supplicant is calling this function
 	 * simultaneously we need to protect the data with a mutex which
diff --git a/drivers/tee/tee_core.c b/drivers/tee/tee_core.c
index c78104589e42..4d0ce606f0fc 100644
--- a/drivers/tee/tee_core.c
+++ b/drivers/tee/tee_core.c
@@ -152,11 +152,11 @@ static int params_from_user(struct tee_context *ctx, struct tee_param *params,
 			return -EFAULT;
 
 		/* All unused attribute bits has to be zero */
-		if (ip.attr & ~TEE_IOCTL_PARAM_ATTR_TYPE_MASK)
+		if (ip.attr & ~TEE_IOCTL_PARAM_ATTR_MASK)
 			return -EINVAL;
 
 		params[n].attr = ip.attr;
-		switch (ip.attr) {
+		switch (ip.attr & TEE_IOCTL_PARAM_ATTR_TYPE_MASK) {
 		case TEE_IOCTL_PARAM_ATTR_TYPE_NONE:
 		case TEE_IOCTL_PARAM_ATTR_TYPE_VALUE_OUTPUT:
 			break;
@@ -394,8 +394,8 @@ static int params_to_supp(struct tee_context *ctx,
 		struct tee_ioctl_param ip;
 		struct tee_param *p = params + n;
 
-		ip.attr = p->attr & TEE_IOCTL_PARAM_ATTR_TYPE_MASK;
-		switch (p->attr) {
+		ip.attr = p->attr;
+		switch (p->attr & TEE_IOCTL_PARAM_ATTR_TYPE_MASK) {
 		case TEE_IOCTL_PARAM_ATTR_TYPE_VALUE_INPUT:
 		case TEE_IOCTL_PARAM_ATTR_TYPE_VALUE_INOUT:
 			ip.a = p->u.value.a;
@@ -459,6 +459,10 @@ static int tee_ioctl_supp_recv(struct tee_context *ctx,
 	if (!params)
 		return -ENOMEM;
 
+	rc = params_from_user(ctx, params, num_params, uarg->params);
+	if (rc)
+		goto out;
+
 	rc = ctx->teedev->desc->ops->supp_recv(ctx, &func, &num_params, params);
 	if (rc)
 		goto out;
@@ -488,11 +492,11 @@ static int params_from_supp(struct tee_param *params, size_t num_params,
 			return -EFAULT;
 
 		/* All unused attribute bits has to be zero */
-		if (ip.attr & ~TEE_IOCTL_PARAM_ATTR_TYPE_MASK)
+		if (ip.attr & ~TEE_IOCTL_PARAM_ATTR_MASK)
 			return -EINVAL;
 
 		p->attr = ip.attr;
-		switch (ip.attr) {
+		switch (ip.attr & TEE_IOCTL_PARAM_ATTR_TYPE_MASK) {
 		case TEE_IOCTL_PARAM_ATTR_TYPE_VALUE_OUTPUT:
 		case TEE_IOCTL_PARAM_ATTR_TYPE_VALUE_INOUT:
 			/* Only out and in/out values can be updated */
diff --git a/include/uapi/linux/tee.h b/include/uapi/linux/tee.h
index 688782e90140..267c12e7fd79 100644
--- a/include/uapi/linux/tee.h
+++ b/include/uapi/linux/tee.h
@@ -154,6 +154,13 @@ struct tee_ioctl_buf_data {
  */
 #define TEE_IOCTL_PARAM_ATTR_TYPE_MASK		0xff
 
+/* Meta parameter carrying extra information about the message. */
+#define TEE_IOCTL_PARAM_ATTR_META		0x100
+
+/* Mask of all known attr bits */
+#define TEE_IOCTL_PARAM_ATTR_MASK \
+	(TEE_IOCTL_PARAM_ATTR_TYPE_MASK | TEE_IOCTL_PARAM_ATTR_META)
+
 /*
  * Matches TEEC_LOGIN_* in GP TEE Client API
  * Are only defined for GP compliant TEEs
-- 
cgit v1.2.3


From 40e44a1e669d078946f46853808a60d29e6f0885 Mon Sep 17 00:00:00 2001
From: Scott Branden <scott.branden@broadcom.com>
Date: Thu, 30 Nov 2017 11:35:59 -0800
Subject: net: ethtool: add support for reset of AP inside NIC interface.

Add ETH_RESET_AP to reset the application processor(s) inside the NIC
interface.

Current ETH_RESET_MGMT supports a management processor inside this NIC.
This is typically used for remote NIC management purposes.

Application processors exist inside some SmartNICs to run various
applications inside the NIC processor - be it a simple algorithm without
an OS to as complex as hosting multiple VMs.

Signed-off-by: Scott Branden <scott.branden@broadcom.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/ethtool.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index ac71559314e7..44a0b675a6bc 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -1686,6 +1686,7 @@ enum ethtool_reset_flags {
 	ETH_RESET_PHY		= 1 << 6,	/* Transceiver/PHY */
 	ETH_RESET_RAM		= 1 << 7,	/* RAM shared between
 						 * multiple components */
+	ETH_RESET_AP		= 1 << 8,	/* Application processor */
 
 	ETH_RESET_DEDICATED	= 0x0000ffff,	/* All components dedicated to
 						 * this interface */
-- 
cgit v1.2.3


From 521a72e1f2e8141d78e7699eaacda24e308ed428 Mon Sep 17 00:00:00 2001
From: Wolfram Sang <wsa+renesas@sang-engineering.com>
Date: Sat, 4 Nov 2017 21:20:01 +0100
Subject: i2c: add a message flag for DMA safe buffers

I2C has no requirement that the buffer of a message needs to be DMA
safe. In case it is, it can now be flagged, so drivers wishing to
do DMA can use the buffer directly.

Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Signed-off-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
Signed-off-by: Wolfram Sang <wsa@the-dreams.de>
---
 include/uapi/linux/i2c.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/i2c.h b/include/uapi/linux/i2c.h
index fe648032d6b9..f71a1751cacf 100644
--- a/include/uapi/linux/i2c.h
+++ b/include/uapi/linux/i2c.h
@@ -72,6 +72,9 @@ struct i2c_msg {
 #define I2C_M_RD		0x0001	/* read data, from slave to master */
 					/* I2C_M_RD is guaranteed to be 0x0001! */
 #define I2C_M_TEN		0x0010	/* this is a ten bit chip address */
+#define I2C_M_DMA_SAFE		0x0200	/* the buffer of this message is DMA safe */
+					/* makes only sense in kernelspace */
+					/* userspace buffers are copied anyway */
 #define I2C_M_RECV_LEN		0x0400	/* length will be first received byte */
 #define I2C_M_NO_RD_ACK		0x0800	/* if I2C_FUNC_PROTOCOL_MANGLING */
 #define I2C_M_IGNORE_NAK	0x1000	/* if I2C_FUNC_PROTOCOL_MANGLING */
-- 
cgit v1.2.3


From 5acc5c063196b4a531a761a954023c1848ec832b Mon Sep 17 00:00:00 2001
From: Brijesh Singh <brijesh.singh@amd.com>
Date: Mon, 4 Dec 2017 10:57:26 -0600
Subject: KVM: Introduce KVM_MEMORY_ENCRYPT_OP ioctl
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If the hardware supports memory encryption then the
KVM_MEMORY_ENCRYPT_OP ioctl can be used by qemu to issue
platform-specific memory encryption commands.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Borislav Petkov <bp@suse.de>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: x86@kernel.org
Cc: kvm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Borislav Petkov <bp@suse.de>
---
 Documentation/virtual/kvm/api.txt | 16 ++++++++++++++++
 arch/x86/include/asm/kvm_host.h   |  2 ++
 arch/x86/kvm/x86.c                |  6 ++++++
 include/uapi/linux/kvm.h          |  2 ++
 4 files changed, 26 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index f670e4b9e7f3..c8755be35543 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -3394,6 +3394,22 @@ invalid, if invalid pages are written to (e.g. after the end of memory)
 or if no page table is present for the addresses (e.g. when using
 hugepages).
 
+4.109 KVM_MEMORY_ENCRYPT_OP
+
+Capability: basic
+Architectures: x86
+Type: vm
+Parameters: an opaque platform specific structure (in/out)
+Returns: 0 on success; -1 on error
+
+If the platform supports creating encrypted VMs then this ioctl can be used
+for issuing platform-specific memory encryption commands to manage those
+encrypted VMs.
+
+Currently, this ioctl is used for issuing Secure Encrypted Virtualization
+(SEV) commands on AMD Processors. The SEV commands are defined in
+Documentation/virtual/kvm/amd-memory-encryption.txt.
+
 5. The kvm_run structure
 ------------------------
 
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 1bfb99770c34..c87e214d55df 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1066,6 +1066,8 @@ struct kvm_x86_ops {
 	int (*pre_enter_smm)(struct kvm_vcpu *vcpu, char *smstate);
 	int (*pre_leave_smm)(struct kvm_vcpu *vcpu, u64 smbase);
 	int (*enable_smi_window)(struct kvm_vcpu *vcpu);
+
+	int (*mem_enc_op)(struct kvm *kvm, void __user *argp);
 };
 
 struct kvm_arch_async_pf {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 34c85aa2e2d1..7bbed0c0ba79 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4281,6 +4281,12 @@ set_identity_unlock:
 		r = kvm_vm_ioctl_enable_cap(kvm, &cap);
 		break;
 	}
+	case KVM_MEMORY_ENCRYPT_OP: {
+		r = -ENOTTY;
+		if (kvm_x86_ops->mem_enc_op)
+			r = kvm_x86_ops->mem_enc_op(kvm, argp);
+		break;
+	}
 	default:
 		r = -ENOTTY;
 	}
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 282d7613fce8..addd0cf4445f 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1358,6 +1358,8 @@ struct kvm_s390_ucas_mapping {
 /* Available with KVM_CAP_S390_CMMA_MIGRATION */
 #define KVM_S390_GET_CMMA_BITS      _IOWR(KVMIO, 0xb8, struct kvm_s390_cmma_log)
 #define KVM_S390_SET_CMMA_BITS      _IOW(KVMIO, 0xb9, struct kvm_s390_cmma_log)
+/* Memory Encryption Commands */
+#define KVM_MEMORY_ENCRYPT_OP      _IOWR(KVMIO, 0xba, unsigned long)
 
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
 #define KVM_DEV_ASSIGN_PCI_2_3		(1 << 1)
-- 
cgit v1.2.3


From 69eaedee411c1fc1cf123520897a96b7cf04d8a0 Mon Sep 17 00:00:00 2001
From: Brijesh Singh <brijesh.singh@amd.com>
Date: Mon, 4 Dec 2017 10:57:26 -0600
Subject: KVM: Introduce KVM_MEMORY_ENCRYPT_{UN,}REG_REGION ioctl
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If hardware supports memory encryption then KVM_MEMORY_ENCRYPT_REG_REGION
and KVM_MEMORY_ENCRYPT_UNREG_REGION ioctl's can be used by userspace to
register/unregister the guest memory regions which may contain the encrypted
data (e.g. guest RAM, PCI BAR, SMRAM etc).

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Borislav Petkov <bp@suse.de>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: x86@kernel.org
Cc: kvm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Improvements-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
Reviewed-by: Borislav Petkov <bp@suse.de>
---
 Documentation/virtual/kvm/api.txt | 34 ++++++++++++++++++++++++++++++++++
 arch/x86/include/asm/kvm_host.h   |  2 ++
 arch/x86/kvm/x86.c                | 24 ++++++++++++++++++++++++
 include/uapi/linux/kvm.h          |  8 ++++++++
 4 files changed, 68 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index c8755be35543..c2ced6a44bbb 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -3410,6 +3410,40 @@ Currently, this ioctl is used for issuing Secure Encrypted Virtualization
 (SEV) commands on AMD Processors. The SEV commands are defined in
 Documentation/virtual/kvm/amd-memory-encryption.txt.
 
+4.110 KVM_MEMORY_ENCRYPT_REG_REGION
+
+Capability: basic
+Architectures: x86
+Type: vm
+Parameters: struct kvm_enc_region (in)
+Returns: 0 on success; -1 on error
+
+This ioctl can be used to register a guest memory region which may
+contain encrypted data (e.g. guest RAM, SMRAM etc).
+
+It is used in the SEV-enabled guest. When encryption is enabled, a guest
+memory region may contain encrypted data. The SEV memory encryption
+engine uses a tweak such that two identical plaintext pages, each at
+different locations will have differing ciphertexts. So swapping or
+moving ciphertext of those pages will not result in plaintext being
+swapped. So relocating (or migrating) physical backing pages for the SEV
+guest will require some additional steps.
+
+Note: The current SEV key management spec does not provide commands to
+swap or migrate (move) ciphertext pages. Hence, for now we pin the guest
+memory region registered with the ioctl.
+
+4.111 KVM_MEMORY_ENCRYPT_UNREG_REGION
+
+Capability: basic
+Architectures: x86
+Type: vm
+Parameters: struct kvm_enc_region (in)
+Returns: 0 on success; -1 on error
+
+This ioctl can be used to unregister the guest memory region registered
+with KVM_MEMORY_ENCRYPT_REG_REGION ioctl above.
+
 5. The kvm_run structure
 ------------------------
 
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index c87e214d55df..58b7cc30466b 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1068,6 +1068,8 @@ struct kvm_x86_ops {
 	int (*enable_smi_window)(struct kvm_vcpu *vcpu);
 
 	int (*mem_enc_op)(struct kvm *kvm, void __user *argp);
+	int (*mem_enc_reg_region)(struct kvm *kvm, struct kvm_enc_region *argp);
+	int (*mem_enc_unreg_region)(struct kvm *kvm, struct kvm_enc_region *argp);
 };
 
 struct kvm_arch_async_pf {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7bbed0c0ba79..926f55cecf2e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4287,6 +4287,30 @@ set_identity_unlock:
 			r = kvm_x86_ops->mem_enc_op(kvm, argp);
 		break;
 	}
+	case KVM_MEMORY_ENCRYPT_REG_REGION: {
+		struct kvm_enc_region region;
+
+		r = -EFAULT;
+		if (copy_from_user(&region, argp, sizeof(region)))
+			goto out;
+
+		r = -ENOTTY;
+		if (kvm_x86_ops->mem_enc_reg_region)
+			r = kvm_x86_ops->mem_enc_reg_region(kvm, &region);
+		break;
+	}
+	case KVM_MEMORY_ENCRYPT_UNREG_REGION: {
+		struct kvm_enc_region region;
+
+		r = -EFAULT;
+		if (copy_from_user(&region, argp, sizeof(region)))
+			goto out;
+
+		r = -ENOTTY;
+		if (kvm_x86_ops->mem_enc_unreg_region)
+			r = kvm_x86_ops->mem_enc_unreg_region(kvm, &region);
+		break;
+	}
 	default:
 		r = -ENOTTY;
 	}
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index addd0cf4445f..c8c65190907d 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1361,6 +1361,14 @@ struct kvm_s390_ucas_mapping {
 /* Memory Encryption Commands */
 #define KVM_MEMORY_ENCRYPT_OP      _IOWR(KVMIO, 0xba, unsigned long)
 
+struct kvm_enc_region {
+	__u64 addr;
+	__u64 size;
+};
+
+#define KVM_MEMORY_ENCRYPT_REG_REGION    _IOR(KVMIO, 0xbb, struct kvm_enc_region)
+#define KVM_MEMORY_ENCRYPT_UNREG_REGION  _IOR(KVMIO, 0xbc, struct kvm_enc_region)
+
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
 #define KVM_DEV_ASSIGN_PCI_2_3		(1 << 1)
 #define KVM_DEV_ASSIGN_MASK_INTX	(1 << 2)
-- 
cgit v1.2.3


From 1d57b17c60ff245b8e50813bf0fd24143ecf26d4 Mon Sep 17 00:00:00 2001
From: Brijesh Singh <brijesh.singh@amd.com>
Date: Mon, 4 Dec 2017 10:57:27 -0600
Subject: crypto: ccp: Define SEV userspace ioctl and command id
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add an include file which defines the ioctl and command id used for
issuing SEV platform management specific commands.

Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Gary Hook <gary.hook@amd.com>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: linux-crypto@vger.kernel.org
Cc: kvm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Improvements-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
Reviewed-by: Borislav Petkov <bp@suse.de>
Acked-by: Gary R Hook <gary.hook@amd.com>
---
 include/uapi/linux/psp-sev.h | 142 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 142 insertions(+)
 create mode 100644 include/uapi/linux/psp-sev.h

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/psp-sev.h b/include/uapi/linux/psp-sev.h
new file mode 100644
index 000000000000..3d77fe91239a
--- /dev/null
+++ b/include/uapi/linux/psp-sev.h
@@ -0,0 +1,142 @@
+/*
+ * Userspace interface for AMD Secure Encrypted Virtualization (SEV)
+ * platform management commands.
+ *
+ * Copyright (C) 2016-2017 Advanced Micro Devices, Inc.
+ *
+ * Author: Brijesh Singh <brijesh.singh@amd.com>
+ *
+ * SEV spec 0.14 is available at:
+ * http://support.amd.com/TechDocs/55766_SEV-KM%20API_Specification.pdf
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef __PSP_SEV_USER_H__
+#define __PSP_SEV_USER_H__
+
+#include <linux/types.h>
+
+/**
+ * SEV platform commands
+ */
+enum {
+	SEV_FACTORY_RESET = 0,
+	SEV_PLATFORM_STATUS,
+	SEV_PEK_GEN,
+	SEV_PEK_CSR,
+	SEV_PDH_GEN,
+	SEV_PDH_CERT_EXPORT,
+	SEV_PEK_CERT_IMPORT,
+
+	SEV_MAX,
+};
+
+/**
+ * SEV Firmware status code
+ */
+typedef enum {
+	SEV_RET_SUCCESS = 0,
+	SEV_RET_INVALID_PLATFORM_STATE,
+	SEV_RET_INVALID_GUEST_STATE,
+	SEV_RET_INAVLID_CONFIG,
+	SEV_RET_INVALID_len,
+	SEV_RET_ALREADY_OWNED,
+	SEV_RET_INVALID_CERTIFICATE,
+	SEV_RET_POLICY_FAILURE,
+	SEV_RET_INACTIVE,
+	SEV_RET_INVALID_ADDRESS,
+	SEV_RET_BAD_SIGNATURE,
+	SEV_RET_BAD_MEASUREMENT,
+	SEV_RET_ASID_OWNED,
+	SEV_RET_INVALID_ASID,
+	SEV_RET_WBINVD_REQUIRED,
+	SEV_RET_DFFLUSH_REQUIRED,
+	SEV_RET_INVALID_GUEST,
+	SEV_RET_INVALID_COMMAND,
+	SEV_RET_ACTIVE,
+	SEV_RET_HWSEV_RET_PLATFORM,
+	SEV_RET_HWSEV_RET_UNSAFE,
+	SEV_RET_UNSUPPORTED,
+	SEV_RET_MAX,
+} sev_ret_code;
+
+/**
+ * struct sev_user_data_status - PLATFORM_STATUS command parameters
+ *
+ * @api_major: major API version
+ * @api_minor: minor API version
+ * @state: platform state
+ * @flags: platform config flags
+ * @build: firmware build id for API version
+ * @guest_count: number of active guests
+ */
+struct sev_user_data_status {
+	__u8 api_major;				/* Out */
+	__u8 api_minor;				/* Out */
+	__u8 state;				/* Out */
+	__u32 flags;				/* Out */
+	__u8 build;				/* Out */
+	__u32 guest_count;			/* Out */
+} __packed;
+
+/**
+ * struct sev_user_data_pek_csr - PEK_CSR command parameters
+ *
+ * @address: PEK certificate chain
+ * @length: length of certificate
+ */
+struct sev_user_data_pek_csr {
+	__u64 address;				/* In */
+	__u32 length;				/* In/Out */
+} __packed;
+
+/**
+ * struct sev_user_data_pek_cert_import - PEK_CERT_IMPORT command parameters
+ *
+ * @pek_cert_address: PEK certificate chain
+ * @pek_cert_len: length of PEK certificate
+ * @oca_cert_address: OCA certificate chain
+ * @oca_cert_len: length of OCA certificate
+ */
+struct sev_user_data_pek_cert_import {
+	__u64 pek_cert_address;			/* In */
+	__u32 pek_cert_len;			/* In */
+	__u64 oca_cert_address;			/* In */
+	__u32 oca_cert_len;			/* In */
+} __packed;
+
+/**
+ * struct sev_user_data_pdh_cert_export - PDH_CERT_EXPORT command parameters
+ *
+ * @pdh_cert_address: PDH certificate address
+ * @pdh_cert_len: length of PDH certificate
+ * @cert_chain_address: PDH certificate chain
+ * @cert_chain_len: length of PDH certificate chain
+ */
+struct sev_user_data_pdh_cert_export {
+	__u64 pdh_cert_address;			/* In */
+	__u32 pdh_cert_len;			/* In/Out */
+	__u64 cert_chain_address;		/* In */
+	__u32 cert_chain_len;			/* In/Out */
+} __packed;
+
+/**
+ * struct sev_issue_cmd - SEV ioctl parameters
+ *
+ * @cmd: SEV commands to execute
+ * @data: pointer to the command structure
+ * @error: SEV FW return code on failure
+ */
+struct sev_issue_cmd {
+	__u32 cmd;				/* In */
+	__u64 data;				/* In */
+	__u32 error;				/* Out */
+} __packed;
+
+#define SEV_IOC_TYPE		'S'
+#define SEV_ISSUE_CMD	_IOWR(SEV_IOC_TYPE, 0x0, struct sev_issue_cmd)
+
+#endif /* __PSP_SEV_USER_H__ */
-- 
cgit v1.2.3


From cdd77d3e193031cc67426cd671d8aa370f7dfee4 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 17 Nov 2017 16:23:08 -0800
Subject: nfit, libnvdimm: deprecate the generic SMART ioctl

The kernel's ND_IOCTL_SMART_THRESHOLD command is based on a payload
definition that has become broken / out-of-sync with recent versions of
the NVDIMM_FAMILY_INTEL definition. Deprecate the use of the
ND_IOCTL_SMART_THRESHOLD command in favor of the ND_CMD_CALL approach
taken by NVDIMM_FAMILY_{HPE,MSFT}, where we can manage the per-vendor
variance in userspace.

In a couple years, when the new scheme is widely deployed in userspace
packages, the ND_IOCTL_SMART_THRESHOLD support can be removed. For now
we prevent new binaries from compiling against the kernel header
definitions, but the kernel is still compatible with old binaries. The
libndctl.h [1] header is now the authoritative interface definition for
NVDIMM SMART.

[1]: https://github.com/pmem/ndctl
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/bus.c                  |  3 --
 include/uapi/linux/ndctl.h            | 54 --------------------------------
 tools/testing/nvdimm/test/nfit.c      | 39 ++++++++++++++---------
 tools/testing/nvdimm/test/nfit_test.h | 59 +++++++++++++++++++++++++++++++++++
 4 files changed, 84 insertions(+), 71 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index 0a5e6cd758fe..78eabc3a1ab1 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -1142,9 +1142,6 @@ int __init nvdimm_bus_init(void)
 {
 	int rc;
 
-	BUILD_BUG_ON(sizeof(struct nd_smart_payload) != 128);
-	BUILD_BUG_ON(sizeof(struct nd_smart_threshold_payload) != 8);
-
 	rc = bus_register(&nvdimm_bus_type);
 	if (rc)
 		return rc;
diff --git a/include/uapi/linux/ndctl.h b/include/uapi/linux/ndctl.h
index 3f03567631cb..30ef1236aafa 100644
--- a/include/uapi/linux/ndctl.h
+++ b/include/uapi/linux/ndctl.h
@@ -15,54 +15,6 @@
 
 #include <linux/types.h>
 
-struct nd_cmd_smart {
-	__u32 status;
-	__u8 data[128];
-} __packed;
-
-#define ND_SMART_HEALTH_VALID	(1 << 0)
-#define ND_SMART_SPARES_VALID	(1 << 1)
-#define ND_SMART_USED_VALID	(1 << 2)
-#define ND_SMART_TEMP_VALID 	(1 << 3)
-#define ND_SMART_CTEMP_VALID 	(1 << 4)
-#define ND_SMART_ALARM_VALID	(1 << 9)
-#define ND_SMART_SHUTDOWN_VALID	(1 << 10)
-#define ND_SMART_VENDOR_VALID	(1 << 11)
-#define ND_SMART_SPARE_TRIP	(1 << 0)
-#define ND_SMART_TEMP_TRIP	(1 << 1)
-#define ND_SMART_CTEMP_TRIP	(1 << 2)
-#define ND_SMART_NON_CRITICAL_HEALTH	(1 << 0)
-#define ND_SMART_CRITICAL_HEALTH	(1 << 1)
-#define ND_SMART_FATAL_HEALTH		(1 << 2)
-
-struct nd_smart_payload {
-	__u32 flags;
-	__u8 reserved0[4];
-	__u8 health;
-	__u8 spares;
-	__u8 life_used;
-	__u8 alarm_flags;
-	__u16 temperature;
-	__u16 ctrl_temperature;
-	__u8 reserved1[15];
-	__u8 shutdown_state;
-	__u32 vendor_size;
-	__u8 vendor_data[92];
-} __packed;
-
-struct nd_cmd_smart_threshold {
-	__u32 status;
-	__u8 data[8];
-} __packed;
-
-struct nd_smart_threshold_payload {
-	__u8 alarm_control;
-	__u8 reserved0;
-	__u16 temperature;
-	__u8 spares;
-	__u8 reserved[3];
-} __packed;
-
 struct nd_cmd_dimm_flags {
 	__u32 status;
 	__u32 flags;
@@ -211,12 +163,6 @@ static inline const char *nvdimm_cmd_name(unsigned cmd)
 
 #define ND_IOCTL 'N'
 
-#define ND_IOCTL_SMART			_IOWR(ND_IOCTL, ND_CMD_SMART,\
-					struct nd_cmd_smart)
-
-#define ND_IOCTL_SMART_THRESHOLD	_IOWR(ND_IOCTL, ND_CMD_SMART_THRESHOLD,\
-					struct nd_cmd_smart_threshold)
-
 #define ND_IOCTL_DIMM_FLAGS		_IOWR(ND_IOCTL, ND_CMD_DIMM_FLAGS,\
 					struct nd_cmd_dimm_flags)
 
diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c
index 7217b2b953b5..640c02b08a50 100644
--- a/tools/testing/nvdimm/test/nfit.c
+++ b/tools/testing/nvdimm/test/nfit.c
@@ -440,39 +440,50 @@ static int nfit_test_cmd_translate_spa(struct nvdimm_bus *bus,
 	return 0;
 }
 
-static int nfit_test_cmd_smart(struct nd_cmd_smart *smart, unsigned int buf_len)
+static int nfit_test_cmd_smart(struct nd_intel_smart *smart, unsigned int buf_len)
 {
-	static const struct nd_smart_payload smart_data = {
-		.flags = ND_SMART_HEALTH_VALID | ND_SMART_TEMP_VALID
-			| ND_SMART_SPARES_VALID | ND_SMART_ALARM_VALID
-			| ND_SMART_USED_VALID | ND_SMART_SHUTDOWN_VALID,
-		.health = ND_SMART_NON_CRITICAL_HEALTH,
-		.temperature = 23 * 16,
+	static const struct nd_intel_smart smart_data = {
+		.flags = ND_INTEL_SMART_HEALTH_VALID
+			| ND_INTEL_SMART_SPARES_VALID
+			| ND_INTEL_SMART_ALARM_VALID
+			| ND_INTEL_SMART_USED_VALID
+			| ND_INTEL_SMART_SHUTDOWN_VALID
+			| ND_INTEL_SMART_MTEMP_VALID,
+		.health = ND_INTEL_SMART_NON_CRITICAL_HEALTH,
+		.media_temperature = 23 * 16,
+		.ctrl_temperature = 30 * 16,
+		.pmic_temperature = 40 * 16,
 		.spares = 75,
-		.alarm_flags = ND_SMART_SPARE_TRIP | ND_SMART_TEMP_TRIP,
+		.alarm_flags = ND_INTEL_SMART_SPARE_TRIP
+			| ND_INTEL_SMART_TEMP_TRIP,
+		.ait_status = 1,
 		.life_used = 5,
 		.shutdown_state = 0,
 		.vendor_size = 0,
+		.shutdown_count = 100,
 	};
 
 	if (buf_len < sizeof(*smart))
 		return -EINVAL;
-	memcpy(smart->data, &smart_data, sizeof(smart_data));
+	memcpy(smart, &smart_data, sizeof(smart_data));
 	return 0;
 }
 
-static int nfit_test_cmd_smart_threshold(struct nd_cmd_smart_threshold *smart_t,
+static int nfit_test_cmd_smart_threshold(
+		struct nd_intel_smart_threshold *smart_t,
 		unsigned int buf_len)
 {
-	static const struct nd_smart_threshold_payload smart_t_data = {
-		.alarm_control = ND_SMART_SPARE_TRIP | ND_SMART_TEMP_TRIP,
-		.temperature = 40 * 16,
+	static const struct nd_intel_smart_threshold smart_t_data = {
+		.alarm_control = ND_INTEL_SMART_SPARE_TRIP
+			| ND_INTEL_SMART_TEMP_TRIP,
+		.media_temperature = 40 * 16,
+		.ctrl_temperature = 30 * 16,
 		.spares = 5,
 	};
 
 	if (buf_len < sizeof(*smart_t))
 		return -EINVAL;
-	memcpy(smart_t->data, &smart_t_data, sizeof(smart_t_data));
+	memcpy(smart_t, &smart_t_data, sizeof(smart_t_data));
 	return 0;
 }
 
diff --git a/tools/testing/nvdimm/test/nfit_test.h b/tools/testing/nvdimm/test/nfit_test.h
index 113b44675a71..b85fba2856c7 100644
--- a/tools/testing/nvdimm/test/nfit_test.h
+++ b/tools/testing/nvdimm/test/nfit_test.h
@@ -84,6 +84,65 @@ struct nd_cmd_ars_err_inj_stat {
 	} __packed record[0];
 } __packed;
 
+#define ND_INTEL_SMART 1
+#define ND_INTEL_SMART_THRESHOLD 2
+
+#define ND_INTEL_SMART_HEALTH_VALID             (1 << 0)
+#define ND_INTEL_SMART_SPARES_VALID             (1 << 1)
+#define ND_INTEL_SMART_USED_VALID               (1 << 2)
+#define ND_INTEL_SMART_MTEMP_VALID              (1 << 3)
+#define ND_INTEL_SMART_CTEMP_VALID              (1 << 4)
+#define ND_INTEL_SMART_SHUTDOWN_COUNT_VALID     (1 << 5)
+#define ND_INTEL_SMART_AIT_STATUS_VALID         (1 << 6)
+#define ND_INTEL_SMART_PTEMP_VALID              (1 << 7)
+#define ND_INTEL_SMART_ALARM_VALID              (1 << 9)
+#define ND_INTEL_SMART_SHUTDOWN_VALID           (1 << 10)
+#define ND_INTEL_SMART_VENDOR_VALID             (1 << 11)
+#define ND_INTEL_SMART_SPARE_TRIP               (1 << 0)
+#define ND_INTEL_SMART_TEMP_TRIP                (1 << 1)
+#define ND_INTEL_SMART_CTEMP_TRIP               (1 << 2)
+#define ND_INTEL_SMART_NON_CRITICAL_HEALTH      (1 << 0)
+#define ND_INTEL_SMART_CRITICAL_HEALTH          (1 << 1)
+#define ND_INTEL_SMART_FATAL_HEALTH             (1 << 2)
+
+struct nd_intel_smart {
+	__u32 status;
+	union {
+		struct {
+			__u32 flags;
+			__u8 reserved0[4];
+			__u8 health;
+			__u8 spares;
+			__u8 life_used;
+			__u8 alarm_flags;
+			__u16 media_temperature;
+			__u16 ctrl_temperature;
+			__u32 shutdown_count;
+			__u8 ait_status;
+			__u16 pmic_temperature;
+			__u8 reserved1[8];
+			__u8 shutdown_state;
+			__u32 vendor_size;
+			__u8 vendor_data[92];
+		} __packed;
+		__u8 data[128];
+	};
+} __packed;
+
+struct nd_intel_smart_threshold {
+	__u32 status;
+	union {
+		struct {
+			__u16 alarm_control;
+			__u8 spares;
+			__u16 media_temperature;
+			__u16 ctrl_temperature;
+			__u8 reserved[1];
+		} __packed;
+		__u8 data[8];
+	};
+} __packed;
+
 union acpi_object;
 typedef void *acpi_handle;
 
-- 
cgit v1.2.3


From dc48bae01e5a23ae67758e8fe31cdc439202b190 Mon Sep 17 00:00:00 2001
From: Brijesh Singh <brijesh.singh@amd.com>
Date: Mon, 4 Dec 2017 10:57:33 -0600
Subject: KVM: Define SEV key management command id
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Define Secure Encrypted Virtualization (SEV) key management command id
and structure. The command definition is available in SEV KM spec
0.14 (http://support.amd.com/TechDocs/55766_SEV-KM%20API_Specification.pdf)
and Documentation/virtual/kvm/amd-memory-encryption.txt.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Borislav Petkov <bp@suse.de>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: x86@kernel.org
Cc: kvm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Improvements-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
Reviewed-by: Borislav Petkov <bp@suse.de>
---
 .../virtual/kvm/amd-memory-encryption.rst          | 202 +++++++++++++++++++++
 include/uapi/linux/kvm.h                           |  80 ++++++++
 2 files changed, 282 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/Documentation/virtual/kvm/amd-memory-encryption.rst b/Documentation/virtual/kvm/amd-memory-encryption.rst
index a8ef21e737db..71d6d257074f 100644
--- a/Documentation/virtual/kvm/amd-memory-encryption.rst
+++ b/Documentation/virtual/kvm/amd-memory-encryption.rst
@@ -43,3 +43,205 @@ setting the SEV bit before executing VMRUN.::
 SEV hardware uses ASIDs to associate a memory encryption key with a VM.
 Hence, the ASID for the SEV-enabled guests must be from 1 to a maximum value
 defined in the CPUID 0x8000001f[ecx] field.
+
+SEV Key Management
+==================
+
+The SEV guest key management is handled by a separate processor called the AMD
+Secure Processor (AMD-SP). Firmware running inside the AMD-SP provides a secure
+key management interface to perform common hypervisor activities such as
+encrypting bootstrap code, snapshotting, migrating and debugging the guest. For more
+information, see the SEV Key Management spec [api-spec]_
+
+KVM implements the following commands to support common lifecycle events of SEV
+guests, such as launching, running, snapshotting, migrating and decommissioning.
+
+1. KVM_SEV_INIT
+---------------
+
+The KVM_SEV_INIT command is used by the hypervisor to initialize the SEV platform
+context. In a typical workflow, this command should be the first command issued.
+
+Returns: 0 on success, -negative on error
+
+2. KVM_SEV_LAUNCH_START
+-----------------------
+
+The KVM_SEV_LAUNCH_START command is used for creating the memory encryption
+context. To create the encryption context, user must provide a guest policy,
+the owner's public Diffie-Hellman (PDH) key and session information.
+
+Parameters: struct  kvm_sev_launch_start (in/out)
+
+Returns: 0 on success, -negative on error
+
+::
+
+        struct kvm_sev_launch_start {
+                __u32 handle;           /* if zero then firmware creates a new handle */
+                __u32 policy;           /* guest's policy */
+
+                __u64 dh_uaddr;         /* userspace address pointing to the guest owner's PDH key */
+                __u32 dh_len;
+
+                __u64 session_uaddr;    /* userspace address which points to the guest session information */
+                __u32 session_len;
+        };
+
+On success, the 'handle' field contains a new handle and on error, a negative value.
+
+For more details, see SEV spec Section 6.2.
+
+3. KVM_SEV_LAUNCH_UPDATE_DATA
+-----------------------------
+
+The KVM_SEV_LAUNCH_UPDATE_DATA is used for encrypting a memory region. It also
+calculates a measurement of the memory contents. The measurement is a signature
+of the memory contents that can be sent to the guest owner as an attestation
+that the memory was encrypted correctly by the firmware.
+
+Parameters (in): struct  kvm_sev_launch_update_data
+
+Returns: 0 on success, -negative on error
+
+::
+
+        struct kvm_sev_launch_update_data {
+                __u64 uaddr;    /* userspace address to be encrypted (must be 16-byte aligned) */
+                __u32 len;      /* length of the data to be encrypted (must be 16-byte aligned) */
+        };
+
+For more details, see SEV spec Section 6.3.
+
+4. KVM_SEV_LAUNCH_MEASURE
+-------------------------
+
+The KVM_SEV_LAUNCH_MEASURE command is used to retrieve the measurement of the
+data encrypted by the KVM_SEV_LAUNCH_UPDATE_DATA command. The guest owner may
+wait to provide the guest with confidential information until it can verify the
+measurement. Since the guest owner knows the initial contents of the guest at
+boot, the measurement can be verified by comparing it to what the guest owner
+expects.
+
+Parameters (in): struct  kvm_sev_launch_measure
+
+Returns: 0 on success, -negative on error
+
+::
+
+        struct kvm_sev_launch_measure {
+                __u64 uaddr;    /* where to copy the measurement */
+                __u32 len;      /* length of measurement blob */
+        };
+
+For more details on the measurement verification flow, see SEV spec Section 6.4.
+
+5. KVM_SEV_LAUNCH_FINISH
+------------------------
+
+After completion of the launch flow, the KVM_SEV_LAUNCH_FINISH command can be
+issued to make the guest ready for the execution.
+
+Returns: 0 on success, -negative on error
+
+6. KVM_SEV_GUEST_STATUS
+-----------------------
+
+The KVM_SEV_GUEST_STATUS command is used to retrieve status information about a
+SEV-enabled guest.
+
+Parameters (out): struct kvm_sev_guest_status
+
+Returns: 0 on success, -negative on error
+
+::
+
+        struct kvm_sev_guest_status {
+                __u32 handle;   /* guest handle */
+                __u32 policy;   /* guest policy */
+                __u32 state;    /* guest state (see enum below) */
+        };
+
+SEV guest state:
+
+::
+
+        enum {
+        SEV_STATE_INVALID = 0,
+        SEV_STATE_LAUNCHING,    /* guest is currently being launched */
+        SEV_STATE_SECRET,       /* guest is being launched and ready to accept the ciphertext data */
+        SEV_STATE_RUNNING,      /* guest is fully launched and running */
+        SEV_STATE_RECEIVING,    /* guest is being migrated in from another SEV machine */
+        SEV_STATE_SENDING       /* guest is getting migrated out to another SEV machine */
+        };
+
+7. KVM_SEV_DBG_DECRYPT
+----------------------
+
+The KVM_SEV_DBG_DECRYPT command can be used by the hypervisor to request the
+firmware to decrypt the data at the given memory region.
+
+Parameters (in): struct kvm_sev_dbg
+
+Returns: 0 on success, -negative on error
+
+::
+
+        struct kvm_sev_dbg {
+                __u64 src_uaddr;        /* userspace address of data to decrypt */
+                __u64 dst_uaddr;        /* userspace address of destination */
+                __u32 len;              /* length of memory region to decrypt */
+        };
+
+The command returns an error if the guest policy does not allow debugging.
+
+8. KVM_SEV_DBG_ENCRYPT
+----------------------
+
+The KVM_SEV_DBG_ENCRYPT command can be used by the hypervisor to request the
+firmware to encrypt the data at the given memory region.
+
+Parameters (in): struct kvm_sev_dbg
+
+Returns: 0 on success, -negative on error
+
+::
+
+        struct kvm_sev_dbg {
+                __u64 src_uaddr;        /* userspace address of data to encrypt */
+                __u64 dst_uaddr;        /* userspace address of destination */
+                __u32 len;              /* length of memory region to encrypt */
+        };
+
+The command returns an error if the guest policy does not allow debugging.
+
+9. KVM_SEV_LAUNCH_SECRET
+------------------------
+
+The KVM_SEV_LAUNCH_SECRET command can be used by the hypervisor to inject secret
+data after the measurement has been validated by the guest owner.
+
+Parameters (in): struct kvm_sev_launch_secret
+
+Returns: 0 on success, -negative on error
+
+::
+
+        struct kvm_sev_launch_secret {
+                __u64 hdr_uaddr;        /* userspace address containing the packet header */
+                __u32 hdr_len;
+
+                __u64 guest_uaddr;      /* the guest memory region where the secret should be injected */
+                __u32 guest_len;
+
+                __u64 trans_uaddr;      /* the hypervisor memory region which contains the secret */
+                __u32 trans_len;
+        };
+
+References
+==========
+
+.. [white-paper] http://amd-dev.wpengine.netdna-cdn.com/wordpress/media/2013/12/AMD_Memory_Encryption_Whitepaper_v7-Public.pdf
+.. [api-spec] http://support.amd.com/TechDocs/55766_SEV-KM%20API_Specification.pdf
+.. [amd-apm] http://support.amd.com/TechDocs/24593.pdf (section 15.34)
+.. [kvm-forum]  http://www.linux-kvm.org/images/7/74/02x08A-Thomas_Lendacky-AMDs_Virtualizatoin_Memory_Encryption_Technology.pdf
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index c8c65190907d..571431d3384b 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1369,6 +1369,86 @@ struct kvm_enc_region {
 #define KVM_MEMORY_ENCRYPT_REG_REGION    _IOR(KVMIO, 0xbb, struct kvm_enc_region)
 #define KVM_MEMORY_ENCRYPT_UNREG_REGION  _IOR(KVMIO, 0xbc, struct kvm_enc_region)
 
+/* Secure Encrypted Virtualization command */
+enum sev_cmd_id {
+	/* Guest initialization commands */
+	KVM_SEV_INIT = 0,
+	KVM_SEV_ES_INIT,
+	/* Guest launch commands */
+	KVM_SEV_LAUNCH_START,
+	KVM_SEV_LAUNCH_UPDATE_DATA,
+	KVM_SEV_LAUNCH_UPDATE_VMSA,
+	KVM_SEV_LAUNCH_SECRET,
+	KVM_SEV_LAUNCH_MEASURE,
+	KVM_SEV_LAUNCH_FINISH,
+	/* Guest migration commands (outgoing) */
+	KVM_SEV_SEND_START,
+	KVM_SEV_SEND_UPDATE_DATA,
+	KVM_SEV_SEND_UPDATE_VMSA,
+	KVM_SEV_SEND_FINISH,
+	/* Guest migration commands (incoming) */
+	KVM_SEV_RECEIVE_START,
+	KVM_SEV_RECEIVE_UPDATE_DATA,
+	KVM_SEV_RECEIVE_UPDATE_VMSA,
+	KVM_SEV_RECEIVE_FINISH,
+	/* Guest status and debug commands */
+	KVM_SEV_GUEST_STATUS,
+	KVM_SEV_DBG_DECRYPT,
+	KVM_SEV_DBG_ENCRYPT,
+	/* Guest certificates commands */
+	KVM_SEV_CERT_EXPORT,
+
+	KVM_SEV_NR_MAX,
+};
+
+struct kvm_sev_cmd {
+	__u32 id;
+	__u64 data;
+	__u32 error;
+	__u32 sev_fd;
+};
+
+struct kvm_sev_launch_start {
+	__u32 handle;
+	__u32 policy;
+	__u64 dh_uaddr;
+	__u32 dh_len;
+	__u64 session_uaddr;
+	__u32 session_len;
+};
+
+struct kvm_sev_launch_update_data {
+	__u64 uaddr;
+	__u32 len;
+};
+
+
+struct kvm_sev_launch_secret {
+	__u64 hdr_uaddr;
+	__u32 hdr_len;
+	__u64 guest_uaddr;
+	__u32 guest_len;
+	__u64 trans_uaddr;
+	__u32 trans_len;
+};
+
+struct kvm_sev_launch_measure {
+	__u64 uaddr;
+	__u32 len;
+};
+
+struct kvm_sev_guest_status {
+	__u32 handle;
+	__u32 policy;
+	__u32 state;
+};
+
+struct kvm_sev_dbg {
+	__u64 src_uaddr;
+	__u64 dst_uaddr;
+	__u32 len;
+};
+
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
 #define KVM_DEV_ASSIGN_PCI_2_3		(1 << 1)
 #define KVM_DEV_ASSIGN_MASK_INTX	(1 << 2)
-- 
cgit v1.2.3


From f19397a5c65665d66e3866b42056f1f58b7a366b Mon Sep 17 00:00:00 2001
From: Lawrence Brakmo <brakmo@fb.com>
Date: Fri, 1 Dec 2017 10:15:04 -0800
Subject: bpf: Add access to snd_cwnd and others in sock_ops

Adds read access to snd_cwnd and srtt_us fields of tcp_sock. Since these
fields are only valid if the socket associated with the sock_ops program
call is a full socket, the field is_fullsock is also added to the
bpf_sock_ops struct. If the socket is not a full socket, reading these
fields returns 0.

Note that in most cases it will not be necessary to check is_fullsock to
know if there is a full socket. The context of the call, as specified by
the 'op' field, can sometimes determine whether there is a full socket.

The struct bpf_sock_ops has the following fields added:

  __u32 is_fullsock;      /* Some TCP fields are only valid if
                           * there is a full socket. If not, the
                           * fields read as zero.
			   */
  __u32 snd_cwnd;
  __u32 srtt_us;          /* Averaged RTT << 3 in usecs */

There is a new macro, SOCK_OPS_GET_TCP32(NAME), to make it easier to add
read access to more 32 bit tcp_sock fields.

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/filter.h   |  1 +
 include/net/tcp.h        |  6 ++++--
 include/uapi/linux/bpf.h |  6 ++++++
 net/core/filter.c        | 36 ++++++++++++++++++++++++++++++++++++
 4 files changed, 47 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 80b5b482cb46..0062302e1285 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -985,6 +985,7 @@ struct bpf_sock_ops_kern {
 		u32 reply;
 		u32 replylong[4];
 	};
+	u32	is_fullsock;
 };
 
 #endif /* __LINUX_FILTER_H__ */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 4e09398009c1..89a656077884 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2012,10 +2012,12 @@ static inline int tcp_call_bpf(struct sock *sk, int op)
 	struct bpf_sock_ops_kern sock_ops;
 	int ret;
 
-	if (sk_fullsock(sk))
+	memset(&sock_ops, 0, sizeof(sock_ops));
+	if (sk_fullsock(sk)) {
+		sock_ops.is_fullsock = 1;
 		sock_owned_by_me(sk);
+	}
 
-	memset(&sock_ops, 0, sizeof(sock_ops));
 	sock_ops.sk = sk;
 	sock_ops.op = op;
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 4c223ab30293..80d62e88590c 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -941,6 +941,12 @@ struct bpf_sock_ops {
 	__u32 local_ip6[4];	/* Stored in network byte order */
 	__u32 remote_port;	/* Stored in network byte order */
 	__u32 local_port;	/* stored in host byte order */
+	__u32 is_fullsock;	/* Some TCP fields are only valid if
+				 * there is a full socket. If not, the
+				 * fields read as zero.
+				 */
+	__u32 snd_cwnd;
+	__u32 srtt_us;		/* Averaged RTT << 3 in usecs */
 };
 
 /* List of known BPF sock_ops operators.
diff --git a/net/core/filter.c b/net/core/filter.c
index 4d644ad17457..754abe1041b7 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4437,6 +4437,42 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
 		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
 				      offsetof(struct sock_common, skc_num));
 		break;
+
+	case offsetof(struct bpf_sock_ops, is_fullsock):
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+						struct bpf_sock_ops_kern,
+						is_fullsock),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct bpf_sock_ops_kern,
+					       is_fullsock));
+		break;
+
+/* Helper macro for adding read access to tcp_sock fields. */
+#define SOCK_OPS_GET_TCP32(FIELD_NAME)					      \
+	do {								      \
+		BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, FIELD_NAME) != 4); \
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(			      \
+						struct bpf_sock_ops_kern,     \
+						is_fullsock),		      \
+				      si->dst_reg, si->src_reg,		      \
+				      offsetof(struct bpf_sock_ops_kern,      \
+					       is_fullsock));		      \
+		*insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 2);	      \
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(			      \
+						struct bpf_sock_ops_kern, sk),\
+				      si->dst_reg, si->src_reg,		      \
+				      offsetof(struct bpf_sock_ops_kern, sk));\
+		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,        \
+				      offsetof(struct tcp_sock, FIELD_NAME)); \
+	} while (0)
+
+	case offsetof(struct bpf_sock_ops, snd_cwnd):
+		SOCK_OPS_GET_TCP32(snd_cwnd);
+		break;
+
+	case offsetof(struct bpf_sock_ops, srtt_us):
+		SOCK_OPS_GET_TCP32(srtt_us);
+		break;
 	}
 	return insn - insn_buf;
 }
-- 
cgit v1.2.3


From c895f6f703ad7dd2f99e751d9884b0aa5d0eea25 Mon Sep 17 00:00:00 2001
From: Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
Date: Mon, 4 Dec 2017 10:56:44 +0100
Subject: bpf: correct broken uapi for BPF_PROG_TYPE_PERF_EVENT program type

Commit 0515e5999a466dfe ("bpf: introduce BPF_PROG_TYPE_PERF_EVENT
program type") introduced the bpf_perf_event_data structure which
exports the pt_regs structure.  This is OK for multiple architectures
but fails for s390 and arm64, which do not export pt_regs.  Programs
using it, for example the bpf selftests, fail to compile on these
architectures.

For s390, exporting the pt_regs is not an option because s390 wants
to allow changes to it.  For arm64, there is a user_pt_regs structure
that covers parts of the pt_regs structure for use by user space.

To solve the broken uapi for s390 and arm64, introduce an abstract
type for pt_regs and add an asm/bpf_perf_event.h file that defines
the concrete type.  An asm-generic header file covers the architectures that
export pt_regs today.

The arch-specific enablement for s390 and arm64 follows in separate
commits.

Reported-by: Thomas Richter <tmricht@linux.vnet.ibm.com>
Fixes: 0515e5999a466dfe ("bpf: introduce BPF_PROG_TYPE_PERF_EVENT program type")
Signed-off-by: Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
Reviewed-and-tested-by: Thomas Richter <tmricht@linux.vnet.ibm.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 arch/alpha/include/uapi/asm/Kbuild        | 2 ++
 arch/arc/include/uapi/asm/Kbuild          | 1 +
 arch/arm/include/uapi/asm/Kbuild          | 1 +
 arch/blackfin/include/uapi/asm/Kbuild     | 1 +
 arch/c6x/include/uapi/asm/Kbuild          | 1 +
 arch/cris/include/uapi/asm/Kbuild         | 1 +
 arch/frv/include/uapi/asm/Kbuild          | 2 ++
 arch/h8300/include/uapi/asm/Kbuild        | 1 +
 arch/hexagon/include/uapi/asm/Kbuild      | 1 +
 arch/ia64/include/uapi/asm/Kbuild         | 1 +
 arch/m32r/include/uapi/asm/Kbuild         | 1 +
 arch/m68k/include/uapi/asm/Kbuild         | 1 +
 arch/metag/include/uapi/asm/Kbuild        | 1 +
 arch/microblaze/include/uapi/asm/Kbuild   | 1 +
 arch/mips/include/uapi/asm/Kbuild         | 1 +
 arch/mn10300/include/uapi/asm/Kbuild      | 1 +
 arch/nios2/include/uapi/asm/Kbuild        | 1 +
 arch/openrisc/include/uapi/asm/Kbuild     | 1 +
 arch/parisc/include/uapi/asm/Kbuild       | 1 +
 arch/powerpc/include/uapi/asm/Kbuild      | 1 +
 arch/riscv/include/uapi/asm/Kbuild        | 1 +
 arch/score/include/uapi/asm/Kbuild        | 1 +
 arch/sh/include/uapi/asm/Kbuild           | 1 +
 arch/sparc/include/uapi/asm/Kbuild        | 1 +
 arch/tile/include/uapi/asm/Kbuild         | 1 +
 arch/unicore32/include/uapi/asm/Kbuild    | 1 +
 arch/x86/include/uapi/asm/Kbuild          | 1 +
 arch/xtensa/include/uapi/asm/Kbuild       | 1 +
 include/linux/perf_event.h                | 6 +++++-
 include/uapi/asm-generic/bpf_perf_event.h | 9 +++++++++
 include/uapi/linux/bpf_perf_event.h       | 5 ++---
 kernel/events/core.c                      | 2 +-
 32 files changed, 47 insertions(+), 5 deletions(-)
 create mode 100644 include/uapi/asm-generic/bpf_perf_event.h

(limited to 'include/uapi/linux')

diff --git a/arch/alpha/include/uapi/asm/Kbuild b/arch/alpha/include/uapi/asm/Kbuild
index b15bf6bc0e94..14a2e9af97e9 100644
--- a/arch/alpha/include/uapi/asm/Kbuild
+++ b/arch/alpha/include/uapi/asm/Kbuild
@@ -1,2 +1,4 @@
 # UAPI Header export list
 include include/uapi/asm-generic/Kbuild.asm
+
+generic-y += bpf_perf_event.h
diff --git a/arch/arc/include/uapi/asm/Kbuild b/arch/arc/include/uapi/asm/Kbuild
index fa6d0ff4ff89..170b5db64afe 100644
--- a/arch/arc/include/uapi/asm/Kbuild
+++ b/arch/arc/include/uapi/asm/Kbuild
@@ -3,6 +3,7 @@ include include/uapi/asm-generic/Kbuild.asm
 
 generic-y += auxvec.h
 generic-y += bitsperlong.h
+generic-y += bpf_perf_event.h
 generic-y += errno.h
 generic-y += fcntl.h
 generic-y += ioctl.h
diff --git a/arch/arm/include/uapi/asm/Kbuild b/arch/arm/include/uapi/asm/Kbuild
index 4d53de308ee0..4d1cc1847edf 100644
--- a/arch/arm/include/uapi/asm/Kbuild
+++ b/arch/arm/include/uapi/asm/Kbuild
@@ -7,6 +7,7 @@ generated-y += unistd-oabi.h
 generated-y += unistd-eabi.h
 
 generic-y += bitsperlong.h
+generic-y += bpf_perf_event.h
 generic-y += errno.h
 generic-y += ioctl.h
 generic-y += ipcbuf.h
diff --git a/arch/blackfin/include/uapi/asm/Kbuild b/arch/blackfin/include/uapi/asm/Kbuild
index aa624b4ab655..2240b38c2915 100644
--- a/arch/blackfin/include/uapi/asm/Kbuild
+++ b/arch/blackfin/include/uapi/asm/Kbuild
@@ -3,6 +3,7 @@ include include/uapi/asm-generic/Kbuild.asm
 
 generic-y += auxvec.h
 generic-y += bitsperlong.h
+generic-y += bpf_perf_event.h
 generic-y += errno.h
 generic-y += ioctl.h
 generic-y += ipcbuf.h
diff --git a/arch/c6x/include/uapi/asm/Kbuild b/arch/c6x/include/uapi/asm/Kbuild
index 67ee896a76a7..26644e15d854 100644
--- a/arch/c6x/include/uapi/asm/Kbuild
+++ b/arch/c6x/include/uapi/asm/Kbuild
@@ -3,6 +3,7 @@ include include/uapi/asm-generic/Kbuild.asm
 
 generic-y += auxvec.h
 generic-y += bitsperlong.h
+generic-y += bpf_perf_event.h
 generic-y += errno.h
 generic-y += fcntl.h
 generic-y += ioctl.h
diff --git a/arch/cris/include/uapi/asm/Kbuild b/arch/cris/include/uapi/asm/Kbuild
index 3687b54bb18e..3470c6e9c7b9 100644
--- a/arch/cris/include/uapi/asm/Kbuild
+++ b/arch/cris/include/uapi/asm/Kbuild
@@ -3,6 +3,7 @@ include include/uapi/asm-generic/Kbuild.asm
 
 generic-y += auxvec.h
 generic-y += bitsperlong.h
+generic-y += bpf_perf_event.h
 generic-y += errno.h
 generic-y += fcntl.h
 generic-y += ioctl.h
diff --git a/arch/frv/include/uapi/asm/Kbuild b/arch/frv/include/uapi/asm/Kbuild
index b15bf6bc0e94..14a2e9af97e9 100644
--- a/arch/frv/include/uapi/asm/Kbuild
+++ b/arch/frv/include/uapi/asm/Kbuild
@@ -1,2 +1,4 @@
 # UAPI Header export list
 include include/uapi/asm-generic/Kbuild.asm
+
+generic-y += bpf_perf_event.h
diff --git a/arch/h8300/include/uapi/asm/Kbuild b/arch/h8300/include/uapi/asm/Kbuild
index 187aed820e71..2f65f78792cb 100644
--- a/arch/h8300/include/uapi/asm/Kbuild
+++ b/arch/h8300/include/uapi/asm/Kbuild
@@ -2,6 +2,7 @@
 include include/uapi/asm-generic/Kbuild.asm
 
 generic-y += auxvec.h
+generic-y += bpf_perf_event.h
 generic-y += errno.h
 generic-y += fcntl.h
 generic-y += ioctl.h
diff --git a/arch/hexagon/include/uapi/asm/Kbuild b/arch/hexagon/include/uapi/asm/Kbuild
index cb5df3aad3a8..41a176dbb53e 100644
--- a/arch/hexagon/include/uapi/asm/Kbuild
+++ b/arch/hexagon/include/uapi/asm/Kbuild
@@ -2,6 +2,7 @@
 include include/uapi/asm-generic/Kbuild.asm
 
 generic-y += auxvec.h
+generic-y += bpf_perf_event.h
 generic-y += errno.h
 generic-y += fcntl.h
 generic-y += ioctl.h
diff --git a/arch/ia64/include/uapi/asm/Kbuild b/arch/ia64/include/uapi/asm/Kbuild
index 13a97aa2285f..f5c6967a93bb 100644
--- a/arch/ia64/include/uapi/asm/Kbuild
+++ b/arch/ia64/include/uapi/asm/Kbuild
@@ -1,4 +1,5 @@
 # UAPI Header export list
 include include/uapi/asm-generic/Kbuild.asm
 
+generic-y += bpf_perf_event.h
 generic-y += kvm_para.h
diff --git a/arch/m32r/include/uapi/asm/Kbuild b/arch/m32r/include/uapi/asm/Kbuild
index 1c44d3b3eba0..451bf6071c6e 100644
--- a/arch/m32r/include/uapi/asm/Kbuild
+++ b/arch/m32r/include/uapi/asm/Kbuild
@@ -1,5 +1,6 @@
 # UAPI Header export list
 include include/uapi/asm-generic/Kbuild.asm
 
+generic-y += bpf_perf_event.h
 generic-y += kvm_para.h
 generic-y += siginfo.h
diff --git a/arch/m68k/include/uapi/asm/Kbuild b/arch/m68k/include/uapi/asm/Kbuild
index 3717b64a620d..c2e26a44c482 100644
--- a/arch/m68k/include/uapi/asm/Kbuild
+++ b/arch/m68k/include/uapi/asm/Kbuild
@@ -3,6 +3,7 @@ include include/uapi/asm-generic/Kbuild.asm
 
 generic-y += auxvec.h
 generic-y += bitsperlong.h
+generic-y += bpf_perf_event.h
 generic-y += errno.h
 generic-y += ioctl.h
 generic-y += ipcbuf.h
diff --git a/arch/metag/include/uapi/asm/Kbuild b/arch/metag/include/uapi/asm/Kbuild
index 6ac763d9a3e3..f9eaf07d29f8 100644
--- a/arch/metag/include/uapi/asm/Kbuild
+++ b/arch/metag/include/uapi/asm/Kbuild
@@ -3,6 +3,7 @@ include include/uapi/asm-generic/Kbuild.asm
 
 generic-y += auxvec.h
 generic-y += bitsperlong.h
+generic-y += bpf_perf_event.h
 generic-y += errno.h
 generic-y += fcntl.h
 generic-y += ioctl.h
diff --git a/arch/microblaze/include/uapi/asm/Kbuild b/arch/microblaze/include/uapi/asm/Kbuild
index 06609ca36115..2c6a6bffea32 100644
--- a/arch/microblaze/include/uapi/asm/Kbuild
+++ b/arch/microblaze/include/uapi/asm/Kbuild
@@ -2,6 +2,7 @@
 include include/uapi/asm-generic/Kbuild.asm
 
 generic-y += bitsperlong.h
+generic-y += bpf_perf_event.h
 generic-y += errno.h
 generic-y += fcntl.h
 generic-y += ioctl.h
diff --git a/arch/mips/include/uapi/asm/Kbuild b/arch/mips/include/uapi/asm/Kbuild
index a0266feba9e6..7a4becd8963a 100644
--- a/arch/mips/include/uapi/asm/Kbuild
+++ b/arch/mips/include/uapi/asm/Kbuild
@@ -1,4 +1,5 @@
 # UAPI Header export list
 include include/uapi/asm-generic/Kbuild.asm
 
+generic-y += bpf_perf_event.h
 generic-y += ipcbuf.h
diff --git a/arch/mn10300/include/uapi/asm/Kbuild b/arch/mn10300/include/uapi/asm/Kbuild
index c94ee54210bc..81271d3af47c 100644
--- a/arch/mn10300/include/uapi/asm/Kbuild
+++ b/arch/mn10300/include/uapi/asm/Kbuild
@@ -1,4 +1,5 @@
 # UAPI Header export list
 include include/uapi/asm-generic/Kbuild.asm
 
+generic-y	+= bpf_perf_event.h
 generic-y	+= siginfo.h
diff --git a/arch/nios2/include/uapi/asm/Kbuild b/arch/nios2/include/uapi/asm/Kbuild
index ffca24da7647..13a3d77b4d7b 100644
--- a/arch/nios2/include/uapi/asm/Kbuild
+++ b/arch/nios2/include/uapi/asm/Kbuild
@@ -3,6 +3,7 @@ include include/uapi/asm-generic/Kbuild.asm
 
 generic-y += auxvec.h
 generic-y += bitsperlong.h
+generic-y += bpf_perf_event.h
 generic-y += errno.h
 generic-y += fcntl.h
 generic-y += ioctl.h
diff --git a/arch/openrisc/include/uapi/asm/Kbuild b/arch/openrisc/include/uapi/asm/Kbuild
index 62286dbeb904..130c16ccba0a 100644
--- a/arch/openrisc/include/uapi/asm/Kbuild
+++ b/arch/openrisc/include/uapi/asm/Kbuild
@@ -3,6 +3,7 @@ include include/uapi/asm-generic/Kbuild.asm
 
 generic-y += auxvec.h
 generic-y += bitsperlong.h
+generic-y += bpf_perf_event.h
 generic-y += errno.h
 generic-y += fcntl.h
 generic-y += ioctl.h
diff --git a/arch/parisc/include/uapi/asm/Kbuild b/arch/parisc/include/uapi/asm/Kbuild
index 196d2a4efb31..286ef5a5904b 100644
--- a/arch/parisc/include/uapi/asm/Kbuild
+++ b/arch/parisc/include/uapi/asm/Kbuild
@@ -2,6 +2,7 @@
 include include/uapi/asm-generic/Kbuild.asm
 
 generic-y += auxvec.h
+generic-y += bpf_perf_event.h
 generic-y += kvm_para.h
 generic-y += param.h
 generic-y += poll.h
diff --git a/arch/powerpc/include/uapi/asm/Kbuild b/arch/powerpc/include/uapi/asm/Kbuild
index 0d960ef78a9a..1a6ed5919ffd 100644
--- a/arch/powerpc/include/uapi/asm/Kbuild
+++ b/arch/powerpc/include/uapi/asm/Kbuild
@@ -1,6 +1,7 @@
 # UAPI Header export list
 include include/uapi/asm-generic/Kbuild.asm
 
+generic-y += bpf_perf_event.h
 generic-y += param.h
 generic-y += poll.h
 generic-y += resource.h
diff --git a/arch/riscv/include/uapi/asm/Kbuild b/arch/riscv/include/uapi/asm/Kbuild
index 5ded96b06352..7e91f4850475 100644
--- a/arch/riscv/include/uapi/asm/Kbuild
+++ b/arch/riscv/include/uapi/asm/Kbuild
@@ -3,6 +3,7 @@ include include/uapi/asm-generic/Kbuild.asm
 
 generic-y += setup.h
 generic-y += unistd.h
+generic-y += bpf_perf_event.h
 generic-y += errno.h
 generic-y += fcntl.h
 generic-y += ioctl.h
diff --git a/arch/score/include/uapi/asm/Kbuild b/arch/score/include/uapi/asm/Kbuild
index c94ee54210bc..81271d3af47c 100644
--- a/arch/score/include/uapi/asm/Kbuild
+++ b/arch/score/include/uapi/asm/Kbuild
@@ -1,4 +1,5 @@
 # UAPI Header export list
 include include/uapi/asm-generic/Kbuild.asm
 
+generic-y	+= bpf_perf_event.h
 generic-y	+= siginfo.h
diff --git a/arch/sh/include/uapi/asm/Kbuild b/arch/sh/include/uapi/asm/Kbuild
index e28531333efa..ba4d39cb321d 100644
--- a/arch/sh/include/uapi/asm/Kbuild
+++ b/arch/sh/include/uapi/asm/Kbuild
@@ -2,6 +2,7 @@
 include include/uapi/asm-generic/Kbuild.asm
 
 generic-y += bitsperlong.h
+generic-y += bpf_perf_event.h
 generic-y += errno.h
 generic-y += fcntl.h
 generic-y += ioctl.h
diff --git a/arch/sparc/include/uapi/asm/Kbuild b/arch/sparc/include/uapi/asm/Kbuild
index 2178c78c7c1a..4680ba246b55 100644
--- a/arch/sparc/include/uapi/asm/Kbuild
+++ b/arch/sparc/include/uapi/asm/Kbuild
@@ -1,4 +1,5 @@
 # UAPI Header export list
 include include/uapi/asm-generic/Kbuild.asm
 
+generic-y += bpf_perf_event.h
 generic-y += types.h
diff --git a/arch/tile/include/uapi/asm/Kbuild b/arch/tile/include/uapi/asm/Kbuild
index 5711de0a1b5e..cc439612bcd5 100644
--- a/arch/tile/include/uapi/asm/Kbuild
+++ b/arch/tile/include/uapi/asm/Kbuild
@@ -1,6 +1,7 @@
 # UAPI Header export list
 include include/uapi/asm-generic/Kbuild.asm
 
+generic-y += bpf_perf_event.h
 generic-y += errno.h
 generic-y += fcntl.h
 generic-y += ioctl.h
diff --git a/arch/unicore32/include/uapi/asm/Kbuild b/arch/unicore32/include/uapi/asm/Kbuild
index 759a71411169..8611ef980554 100644
--- a/arch/unicore32/include/uapi/asm/Kbuild
+++ b/arch/unicore32/include/uapi/asm/Kbuild
@@ -3,6 +3,7 @@ include include/uapi/asm-generic/Kbuild.asm
 
 generic-y += auxvec.h
 generic-y += bitsperlong.h
+generic-y += bpf_perf_event.h
 generic-y += errno.h
 generic-y += fcntl.h
 generic-y += ioctl.h
diff --git a/arch/x86/include/uapi/asm/Kbuild b/arch/x86/include/uapi/asm/Kbuild
index da1489cb64dc..1e901e421f2d 100644
--- a/arch/x86/include/uapi/asm/Kbuild
+++ b/arch/x86/include/uapi/asm/Kbuild
@@ -1,6 +1,7 @@
 # UAPI Header export list
 include include/uapi/asm-generic/Kbuild.asm
 
+generic-y += bpf_perf_event.h
 generated-y += unistd_32.h
 generated-y += unistd_64.h
 generated-y += unistd_x32.h
diff --git a/arch/xtensa/include/uapi/asm/Kbuild b/arch/xtensa/include/uapi/asm/Kbuild
index a5bcdfb890f1..837d4dd76785 100644
--- a/arch/xtensa/include/uapi/asm/Kbuild
+++ b/arch/xtensa/include/uapi/asm/Kbuild
@@ -2,6 +2,7 @@
 include include/uapi/asm-generic/Kbuild.asm
 
 generic-y += bitsperlong.h
+generic-y += bpf_perf_event.h
 generic-y += errno.h
 generic-y += fcntl.h
 generic-y += ioctl.h
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 2c9c87d8a0c1..7546822a1d74 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -15,6 +15,7 @@
 #define _LINUX_PERF_EVENT_H
 
 #include <uapi/linux/perf_event.h>
+#include <uapi/linux/bpf_perf_event.h>
 
 /*
  * Kernel-internal data types and definitions:
@@ -787,7 +788,7 @@ struct perf_output_handle {
 };
 
 struct bpf_perf_event_data_kern {
-	struct pt_regs *regs;
+	bpf_user_pt_regs_t *regs;
 	struct perf_sample_data *data;
 	struct perf_event *event;
 };
@@ -1177,6 +1178,9 @@ extern void perf_bp_event(struct perf_event *event, void *data);
 		(user_mode(regs) ? PERF_RECORD_MISC_USER : PERF_RECORD_MISC_KERNEL)
 # define perf_instruction_pointer(regs)	instruction_pointer(regs)
 #endif
+#ifndef perf_arch_bpf_user_pt_regs
+# define perf_arch_bpf_user_pt_regs(regs) regs
+#endif
 
 static inline bool has_branch_stack(struct perf_event *event)
 {
diff --git a/include/uapi/asm-generic/bpf_perf_event.h b/include/uapi/asm-generic/bpf_perf_event.h
new file mode 100644
index 000000000000..53815d2cd047
--- /dev/null
+++ b/include/uapi/asm-generic/bpf_perf_event.h
@@ -0,0 +1,9 @@
+#ifndef _UAPI__ASM_GENERIC_BPF_PERF_EVENT_H__
+#define _UAPI__ASM_GENERIC_BPF_PERF_EVENT_H__
+
+#include <linux/ptrace.h>
+
+/* Export kernel pt_regs structure */
+typedef struct pt_regs bpf_user_pt_regs_t;
+
+#endif /* _UAPI__ASM_GENERIC_BPF_PERF_EVENT_H__ */
diff --git a/include/uapi/linux/bpf_perf_event.h b/include/uapi/linux/bpf_perf_event.h
index af549d4ecf1b..8f95303f9d80 100644
--- a/include/uapi/linux/bpf_perf_event.h
+++ b/include/uapi/linux/bpf_perf_event.h
@@ -8,11 +8,10 @@
 #ifndef _UAPI__LINUX_BPF_PERF_EVENT_H__
 #define _UAPI__LINUX_BPF_PERF_EVENT_H__
 
-#include <linux/types.h>
-#include <linux/ptrace.h>
+#include <asm/bpf_perf_event.h>
 
 struct bpf_perf_event_data {
-	struct pt_regs regs;
+	bpf_user_pt_regs_t regs;
 	__u64 sample_period;
 };
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 16beab4767e1..ba957b9812b3 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7987,11 +7987,11 @@ static void bpf_overflow_handler(struct perf_event *event,
 {
 	struct bpf_perf_event_data_kern ctx = {
 		.data = data,
-		.regs = regs,
 		.event = event,
 	};
 	int ret = 0;
 
+	ctx.regs = perf_arch_bpf_user_pt_regs(regs);
 	preempt_disable();
 	if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
 		goto out;
-- 
cgit v1.2.3


From 96f84061620c6325a2ca9a9a05b410e6461d03c3 Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Mon, 4 Dec 2017 17:31:23 +0800
Subject: tun: add eBPF based queue selection method

This patch introduces an eBPF based queue selection method. With this,
the policy could be offloaded to userspace completely through a new
ioctl TUNSETSTEERINGEBPF.

Signed-off-by: Jason Wang <jasowang@redhat.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/tun.c           | 145 +++++++++++++++++++++++++++++++++++++-------
 include/uapi/linux/if_tun.h |   1 +
 2 files changed, 123 insertions(+), 23 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 80568f81a7c8..787cc35ef89b 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -195,6 +195,11 @@ struct tun_flow_entry {
 
 #define TUN_NUM_FLOW_ENTRIES 1024
 
+struct tun_steering_prog {
+	struct rcu_head rcu;
+	struct bpf_prog *prog;
+};
+
 /* Since the socket were moved to tun_file, to preserve the behavior of persist
  * device, socket filter, sndbuf and vnet header size were restore when the
  * file were attached to a persist device.
@@ -232,6 +237,7 @@ struct tun_struct {
 	u32 rx_batched;
 	struct tun_pcpu_stats __percpu *pcpu_stats;
 	struct bpf_prog __rcu *xdp_prog;
+	struct tun_steering_prog __rcu *steering_prog;
 };
 
 static int tun_napi_receive(struct napi_struct *napi, int budget)
@@ -537,15 +543,12 @@ static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash)
  * different rxq no. here. If we could not get rxhash, then we would
  * hope the rxq no. may help here.
  */
-static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb,
-			    void *accel_priv, select_queue_fallback_t fallback)
+static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb)
 {
-	struct tun_struct *tun = netdev_priv(dev);
 	struct tun_flow_entry *e;
 	u32 txq = 0;
 	u32 numqueues = 0;
 
-	rcu_read_lock();
 	numqueues = READ_ONCE(tun->numqueues);
 
 	txq = __skb_get_hash_symmetric(skb);
@@ -563,10 +566,37 @@ static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb,
 			txq -= numqueues;
 	}
 
-	rcu_read_unlock();
 	return txq;
 }
 
+static u16 tun_ebpf_select_queue(struct tun_struct *tun, struct sk_buff *skb)
+{
+	struct tun_steering_prog *prog;
+	u16 ret = 0;
+
+	prog = rcu_dereference(tun->steering_prog);
+	if (prog)
+		ret = bpf_prog_run_clear_cb(prog->prog, skb);
+
+	return ret % tun->numqueues;
+}
+
+static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb,
+			    void *accel_priv, select_queue_fallback_t fallback)
+{
+	struct tun_struct *tun = netdev_priv(dev);
+	u16 ret;
+
+	rcu_read_lock();
+	if (rcu_dereference(tun->steering_prog))
+		ret = tun_ebpf_select_queue(tun, skb);
+	else
+		ret = tun_automq_select_queue(tun, skb);
+	rcu_read_unlock();
+
+	return ret;
+}
+
 static inline bool tun_not_capable(struct tun_struct *tun)
 {
 	const struct cred *cred = current_cred();
@@ -933,23 +963,10 @@ static int tun_net_close(struct net_device *dev)
 }
 
 /* Net device start xmit */
-static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
+static void tun_automq_xmit(struct tun_struct *tun, struct sk_buff *skb)
 {
-	struct tun_struct *tun = netdev_priv(dev);
-	int txq = skb->queue_mapping;
-	struct tun_file *tfile;
-	u32 numqueues = 0;
-
-	rcu_read_lock();
-	tfile = rcu_dereference(tun->tfiles[txq]);
-	numqueues = READ_ONCE(tun->numqueues);
-
-	/* Drop packet if interface is not attached */
-	if (txq >= numqueues)
-		goto drop;
-
 #ifdef CONFIG_RPS
-	if (numqueues == 1 && static_key_false(&rps_needed)) {
+	if (tun->numqueues == 1 && static_key_false(&rps_needed)) {
 		/* Select queue was not called for the skbuff, so we extract the
 		 * RPS hash and save it into the flow_table here.
 		 */
@@ -965,6 +982,26 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
 		}
 	}
 #endif
+}
+
+/* Net device start xmit */
+static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct tun_struct *tun = netdev_priv(dev);
+	int txq = skb->queue_mapping;
+	struct tun_file *tfile;
+	u32 numqueues = 0;
+
+	rcu_read_lock();
+	tfile = rcu_dereference(tun->tfiles[txq]);
+	numqueues = READ_ONCE(tun->numqueues);
+
+	/* Drop packet if interface is not attached */
+	if (txq >= numqueues)
+		goto drop;
+
+	if (!rcu_dereference(tun->steering_prog))
+		tun_automq_xmit(tun, skb);
 
 	tun_debug(KERN_INFO, tun, "tun_net_xmit %d\n", skb->len);
 
@@ -1547,7 +1584,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 	int copylen;
 	bool zerocopy = false;
 	int err;
-	u32 rxhash;
+	u32 rxhash = 0;
 	int skb_xdp = 1;
 	bool frags = tun_napi_frags_enabled(tun);
 
@@ -1735,7 +1772,10 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 		rcu_read_unlock();
 	}
 
-	rxhash = __skb_get_hash_symmetric(skb);
+	rcu_read_lock();
+	if (!rcu_dereference(tun->steering_prog))
+		rxhash = __skb_get_hash_symmetric(skb);
+	rcu_read_unlock();
 
 	if (frags) {
 		/* Exercise flow dissector code path. */
@@ -1779,7 +1819,9 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 	u64_stats_update_end(&stats->syncp);
 	put_cpu_ptr(stats);
 
-	tun_flow_update(tun, rxhash, tfile);
+	if (rxhash)
+		tun_flow_update(tun, rxhash, tfile);
+
 	return total_len;
 }
 
@@ -1987,6 +2029,36 @@ static ssize_t tun_chr_read_iter(struct kiocb *iocb, struct iov_iter *to)
 	return ret;
 }
 
+static void tun_steering_prog_free(struct rcu_head *rcu)
+{
+	struct tun_steering_prog *prog = container_of(rcu,
+					 struct tun_steering_prog, rcu);
+
+	bpf_prog_destroy(prog->prog);
+	kfree(prog);
+}
+
+static int __tun_set_steering_ebpf(struct tun_struct *tun,
+				   struct bpf_prog *prog)
+{
+	struct tun_steering_prog *old, *new = NULL;
+
+	if (prog) {
+		new = kmalloc(sizeof(*new), GFP_KERNEL);
+		if (!new)
+			return -ENOMEM;
+		new->prog = prog;
+	}
+
+	old = rtnl_dereference(tun->steering_prog);
+	rcu_assign_pointer(tun->steering_prog, new);
+
+	if (old)
+		call_rcu(&old->rcu, tun_steering_prog_free);
+
+	return 0;
+}
+
 static void tun_free_netdev(struct net_device *dev)
 {
 	struct tun_struct *tun = netdev_priv(dev);
@@ -1995,6 +2067,9 @@ static void tun_free_netdev(struct net_device *dev)
 	free_percpu(tun->pcpu_stats);
 	tun_flow_uninit(tun);
 	security_tun_dev_free_security(tun->security);
+	rtnl_lock();
+	__tun_set_steering_ebpf(tun, NULL);
+	rtnl_unlock();
 }
 
 static void tun_setup(struct net_device *dev)
@@ -2283,6 +2358,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
 		tun->filter_attached = false;
 		tun->sndbuf = tfile->socket.sk->sk_sndbuf;
 		tun->rx_batched = 0;
+		RCU_INIT_POINTER(tun->steering_prog, NULL);
 
 		tun->pcpu_stats = netdev_alloc_pcpu_stats(struct tun_pcpu_stats);
 		if (!tun->pcpu_stats) {
@@ -2475,6 +2551,25 @@ unlock:
 	return ret;
 }
 
+static int tun_set_steering_ebpf(struct tun_struct *tun, void __user *data)
+{
+	struct bpf_prog *prog;
+	int fd;
+
+	if (copy_from_user(&fd, data, sizeof(fd)))
+		return -EFAULT;
+
+	if (fd == -1) {
+		prog = NULL;
+	} else {
+		prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
+		if (IS_ERR(prog))
+			return PTR_ERR(prog);
+	}
+
+	return __tun_set_steering_ebpf(tun, prog);
+}
+
 static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
 			    unsigned long arg, int ifreq_len)
 {
@@ -2751,6 +2846,10 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
 		ret = 0;
 		break;
 
+	case TUNSETSTEERINGEBPF:
+		ret = tun_set_steering_ebpf(tun, argp);
+		break;
+
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h
index 030d3e6d6029..fb38c1797131 100644
--- a/include/uapi/linux/if_tun.h
+++ b/include/uapi/linux/if_tun.h
@@ -57,6 +57,7 @@
  */
 #define TUNSETVNETBE _IOW('T', 222, int)
 #define TUNGETVNETBE _IOR('T', 223, int)
+#define TUNSETSTEERINGEBPF _IOR('T', 224, int)
 
 /* TUNSETIFF ifr flags */
 #define IFF_TUN		0x0001
-- 
cgit v1.2.3


From bb64da9aba89765fee74b395967b18a7d6c364e9 Mon Sep 17 00:00:00 2001
From: Christian Borntraeger <borntraeger@de.ibm.com>
Date: Tue, 21 Nov 2017 16:02:52 +0100
Subject: KVM: s390: mark irq_state.flags as non-usable

Old kernels did not check for zero in the irq_state.flags field and old
QEMUs did not zero the flag/reserved fields when calling
KVM_S390_*_IRQ_STATE.  Let's add comments to prevent future uses of
these fields.

Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 Documentation/virtual/kvm/api.txt | 15 ++++++++++++---
 arch/s390/kvm/kvm-s390.c          |  6 ++++--
 include/uapi/linux/kvm.h          |  4 ++--
 3 files changed, 18 insertions(+), 7 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index f670e4b9e7f3..57d3ee9e4bde 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -2901,14 +2901,19 @@ userspace buffer and its length:
 
 struct kvm_s390_irq_state {
 	__u64 buf;
-	__u32 flags;
+	__u32 flags;        /* will stay unused for compatibility reasons */
 	__u32 len;
-	__u32 reserved[4];
+	__u32 reserved[4];  /* will stay unused for compatibility reasons */
 };
 
 Userspace passes in the above struct and for each pending interrupt a
 struct kvm_s390_irq is copied to the provided buffer.
 
+The structure contains a flags and a reserved field for future extensions. As
+the kernel never checked for flags == 0 and QEMU never pre-zeroed flags and
+reserved, these fields can not be used in the future without breaking
+compatibility.
+
 If -ENOBUFS is returned the buffer provided was too small and userspace
 may retry with a bigger buffer.
 
@@ -2932,10 +2937,14 @@ containing a struct kvm_s390_irq_state:
 
 struct kvm_s390_irq_state {
 	__u64 buf;
+	__u32 flags;        /* will stay unused for compatibility reasons */
 	__u32 len;
-	__u32 pad;
+	__u32 reserved[4];  /* will stay unused for compatibility reasons */
 };
 
+The restrictions for flags and reserved apply as well.
+(see KVM_S390_GET_IRQ_STATE)
+
 The userspace memory referenced by buf contains a struct kvm_s390_irq
 for each interrupt to be injected into the guest.
 If one of the interrupts could not be injected for some reason the
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 34375eed93ee..efa439f6ffb3 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -1,8 +1,8 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
- * hosting zSeries kernel virtual machines
+ * hosting IBM Z kernel virtual machines (s390x)
  *
- * Copyright IBM Corp. 2008, 2009
+ * Copyright IBM Corp. 2008, 2017
  *
  *    Author(s): Carsten Otte <cotte@de.ibm.com>
  *               Christian Borntraeger <borntraeger@de.ibm.com>
@@ -3808,6 +3808,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 			r = -EINVAL;
 			break;
 		}
+		/* do not use irq_state.flags, it will break old QEMUs */
 		r = kvm_s390_set_irq_state(vcpu,
 					   (void __user *) irq_state.buf,
 					   irq_state.len);
@@ -3823,6 +3824,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 			r = -EINVAL;
 			break;
 		}
+		/* do not use irq_state.flags, it will break old QEMUs */
 		r = kvm_s390_get_irq_state(vcpu,
 					   (__u8 __user *)  irq_state.buf,
 					   irq_state.len);
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 282d7613fce8..496e59a2738b 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -630,9 +630,9 @@ struct kvm_s390_irq {
 
 struct kvm_s390_irq_state {
 	__u64 buf;
-	__u32 flags;
+	__u32 flags;        /* will stay unused for compatibility reasons */
 	__u32 len;
-	__u32 reserved[4];
+	__u32 reserved[4];  /* will stay unused for compatibility reasons */
 };
 
 /* for KVM_SET_GUEST_DEBUG */
-- 
cgit v1.2.3


From 4982327ff6755377a8a66e84113f496f3a6c53bc Mon Sep 17 00:00:00 2001
From: Stefan Brüns <stefan.bruens@rwth-aachen.de>
Date: Thu, 9 Nov 2017 23:44:34 +0100
Subject: Input: add KEY_ROTATE_LOCK_TOGGLE
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The key has the same use as the SW_ROTATE_LOCK, but is used on devices
where the state is not tracked by the hardware but has to be handled
in software.

Signed-off-by: Stefan Brüns <stefan.bruens@rwth-aachen.de>
Signed-off-by: Darren Hart (VMware) <dvhart@infradead.org>
---
 include/uapi/linux/input-event-codes.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/input-event-codes.h b/include/uapi/linux/input-event-codes.h
index 061fa62958a2..53fbae27b280 100644
--- a/include/uapi/linux/input-event-codes.h
+++ b/include/uapi/linux/input-event-codes.h
@@ -594,6 +594,7 @@
 #define BTN_DPAD_RIGHT		0x223
 
 #define KEY_ALS_TOGGLE		0x230	/* Ambient light sensor */
+#define KEY_ROTATE_LOCK_TOGGLE	0x231	/* Display rotation lock */
 
 #define KEY_BUTTONCONFIG		0x240	/* AL Button Configuration */
 #define KEY_TASKMANAGER		0x241	/* AL Task/Project Manager */
-- 
cgit v1.2.3


From 772a58693fc3116d05b7969223a80a6376e639eb Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Fri, 8 Dec 2017 21:03:58 +0800
Subject: sctp: add stream interleave enable members and sockopt

This patch adds intl_enable in asoc and netns, and strm_interleave in
sctp_sock to indicate if stream interleave is enabled and supported.

netns intl_enable will be set via procfs, but that is not added until
all of the stream interleave code is completely implemented; asoc
intl_enable will be set during the 4-way handshake.

sp strm_interleave can be set by sockopt SCTP_INTERLEAVING_SUPPORTED
which is also added in this patch. This socket option is defined in
section 4.3.1 of RFC8260.

Note that strm_interleave can only be set by sockopt when both netns
intl_enable and sp frag_interleave are set.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netns/sctp.h   |  5 ++-
 include/net/sctp/structs.h |  2 ++
 include/uapi/linux/sctp.h  |  1 +
 net/sctp/socket.c          | 88 +++++++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 94 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/net/netns/sctp.h b/include/net/netns/sctp.h
index ebc813277662..0db7fb3e4e15 100644
--- a/include/net/netns/sctp.h
+++ b/include/net/netns/sctp.h
@@ -122,9 +122,12 @@ struct netns_sctp {
 	/* Flag to indicate if PR-CONFIG is enabled. */
 	int reconf_enable;
 
-	/* Flag to idicate if SCTP-AUTH is enabled */
+	/* Flag to indicate if SCTP-AUTH is enabled */
 	int auth_enable;
 
+	/* Flag to indicate if stream interleave is enabled */
+	int intl_enable;
+
 	/*
 	 * Policy to control SCTP IPv4 address scoping
 	 * 0   - Disable IPv4 address scoping
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 2f8f93da5dc2..7030cbe11f45 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -217,6 +217,7 @@ struct sctp_sock {
 		disable_fragments:1,
 		v4mapped:1,
 		frag_interleave:1,
+		strm_interleave:1,
 		recvrcvinfo:1,
 		recvnxtinfo:1,
 		data_ready_signalled:1;
@@ -1940,6 +1941,7 @@ struct sctp_association {
 	__u8 need_ecne:1,	/* Need to send an ECNE Chunk? */
 	     temp:1,		/* Is it a temporary association? */
 	     force_delay:1,
+	     intl_enable:1,
 	     prsctp_enable:1,
 	     reconf_enable:1;
 
diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h
index d9adab32dbee..6ed934c65a5f 100644
--- a/include/uapi/linux/sctp.h
+++ b/include/uapi/linux/sctp.h
@@ -125,6 +125,7 @@ typedef __s32 sctp_assoc_t;
 #define SCTP_SOCKOPT_PEELOFF_FLAGS 122
 #define SCTP_STREAM_SCHEDULER	123
 #define SCTP_STREAM_SCHEDULER_VALUE	124
+#define SCTP_INTERLEAVING_SUPPORTED	125
 
 /* PR-SCTP policies */
 #define SCTP_PR_SCTP_NONE	0x0000
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 3e55daa37e66..306c737bde87 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -3350,7 +3350,10 @@ static int sctp_setsockopt_fragment_interleave(struct sock *sk,
 	if (get_user(val, (int __user *)optval))
 		return -EFAULT;
 
-	sctp_sk(sk)->frag_interleave = (val == 0) ? 0 : 1;
+	sctp_sk(sk)->frag_interleave = !!val;
+
+	if (!sctp_sk(sk)->frag_interleave)
+		sctp_sk(sk)->strm_interleave = 0;
 
 	return 0;
 }
@@ -4019,6 +4022,40 @@ out:
 	return retval;
 }
 
+static int sctp_setsockopt_interleaving_supported(struct sock *sk,
+						  char __user *optval,
+						  unsigned int optlen)
+{
+	struct sctp_sock *sp = sctp_sk(sk);
+	struct net *net = sock_net(sk);
+	struct sctp_assoc_value params;
+	int retval = -EINVAL;
+
+	if (optlen < sizeof(params))
+		goto out;
+
+	optlen = sizeof(params);
+	if (copy_from_user(&params, optval, optlen)) {
+		retval = -EFAULT;
+		goto out;
+	}
+
+	if (params.assoc_id)
+		goto out;
+
+	if (!net->sctp.intl_enable || !sp->frag_interleave) {
+		retval = -EPERM;
+		goto out;
+	}
+
+	sp->strm_interleave = !!params.assoc_value;
+
+	retval = 0;
+
+out:
+	return retval;
+}
+
 /* API 6.2 setsockopt(), getsockopt()
  *
  * Applications use setsockopt() and getsockopt() to set or retrieve
@@ -4206,6 +4243,10 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname,
 	case SCTP_STREAM_SCHEDULER_VALUE:
 		retval = sctp_setsockopt_scheduler_value(sk, optval, optlen);
 		break;
+	case SCTP_INTERLEAVING_SUPPORTED:
+		retval = sctp_setsockopt_interleaving_supported(sk, optval,
+								optlen);
+		break;
 	default:
 		retval = -ENOPROTOOPT;
 		break;
@@ -6969,6 +7010,47 @@ out:
 	return retval;
 }
 
+static int sctp_getsockopt_interleaving_supported(struct sock *sk, int len,
+						  char __user *optval,
+						  int __user *optlen)
+{
+	struct sctp_assoc_value params;
+	struct sctp_association *asoc;
+	int retval = -EFAULT;
+
+	if (len < sizeof(params)) {
+		retval = -EINVAL;
+		goto out;
+	}
+
+	len = sizeof(params);
+	if (copy_from_user(&params, optval, len))
+		goto out;
+
+	asoc = sctp_id2assoc(sk, params.assoc_id);
+	if (asoc) {
+		params.assoc_value = asoc->intl_enable;
+	} else if (!params.assoc_id) {
+		struct sctp_sock *sp = sctp_sk(sk);
+
+		params.assoc_value = sp->strm_interleave;
+	} else {
+		retval = -EINVAL;
+		goto out;
+	}
+
+	if (put_user(len, optlen))
+		goto out;
+
+	if (copy_to_user(optval, &params, len))
+		goto out;
+
+	retval = 0;
+
+out:
+	return retval;
+}
+
 static int sctp_getsockopt(struct sock *sk, int level, int optname,
 			   char __user *optval, int __user *optlen)
 {
@@ -7159,6 +7241,10 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname,
 		retval = sctp_getsockopt_scheduler_value(sk, len, optval,
 							 optlen);
 		break;
+	case SCTP_INTERLEAVING_SUPPORTED:
+		retval = sctp_getsockopt_interleaving_supported(sk, len, optval,
+								optlen);
+		break;
 	default:
 		retval = -ENOPROTOOPT;
 		break;
-- 
cgit v1.2.3


From 65f5e357839e40817aead853d7a7f61ff828b52b Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Fri, 8 Dec 2017 21:04:08 +0800
Subject: sctp: implement abort_pd for sctp_stream_interleave

abort_pd is added as a member of sctp_stream_interleave, used to abort
partial delivery for data or idata, called in sctp_cmd_assoc_failed.

Since stream interleave allows partial delivery to proceed on each
stream at the same time, sctp_intl_abort_pd for idata is very different
from the old function sctp_ulpq_abort_pd for data.

Note that sctp_ulpevent_make_pdapi will support per stream in this
patch by adding pdapi_stream and pdapi_seq in sctp_pdapi_event, as
described in section 6.1.7 of RFC6458.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/stream_interleave.h |  1 +
 include/net/sctp/ulpevent.h          |  3 +-
 include/uapi/linux/sctp.h            |  2 +
 net/sctp/sm_sideeffect.c             |  2 +-
 net/sctp/stream_interleave.c         | 99 ++++++++++++++++++++++++++++++++++++
 net/sctp/ulpevent.c                  |  9 ++--
 net/sctp/ulpqueue.c                  |  2 +-
 7 files changed, 112 insertions(+), 6 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/net/sctp/stream_interleave.h b/include/net/sctp/stream_interleave.h
index 317d9b3a5299..501b2be049a3 100644
--- a/include/net/sctp/stream_interleave.h
+++ b/include/net/sctp/stream_interleave.h
@@ -46,6 +46,7 @@ struct sctp_stream_interleave {
 	void	(*renege_events)(struct sctp_ulpq *ulpq,
 				 struct sctp_chunk *chunk, gfp_t gfp);
 	void	(*start_pd)(struct sctp_ulpq *ulpq, gfp_t gfp);
+	void	(*abort_pd)(struct sctp_ulpq *ulpq, gfp_t gfp);
 };
 
 void sctp_stream_interleave_init(struct sctp_stream *stream);
diff --git a/include/net/sctp/ulpevent.h b/include/net/sctp/ulpevent.h
index ce4f2aa35d56..51b4e0626c34 100644
--- a/include/net/sctp/ulpevent.h
+++ b/include/net/sctp/ulpevent.h
@@ -122,7 +122,8 @@ struct sctp_ulpevent *sctp_ulpevent_make_shutdown_event(
 
 struct sctp_ulpevent *sctp_ulpevent_make_pdapi(
 	const struct sctp_association *asoc,
-	__u32 indication, gfp_t gfp);
+	__u32 indication, __u32 sid, __u32 seq,
+	__u32 flags, gfp_t gfp);
 
 struct sctp_ulpevent *sctp_ulpevent_make_adaptation_indication(
 	const struct sctp_association *asoc, gfp_t gfp);
diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h
index 6ed934c65a5f..4c4db14786bd 100644
--- a/include/uapi/linux/sctp.h
+++ b/include/uapi/linux/sctp.h
@@ -460,6 +460,8 @@ struct sctp_pdapi_event {
 	__u32 pdapi_length;
 	__u32 pdapi_indication;
 	sctp_assoc_t pdapi_assoc_id;
+	__u32 pdapi_stream;
+	__u32 pdapi_seq;
 };
 
 enum { SCTP_PARTIAL_DELIVERY_ABORTED=0, };
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index 36710549a4ca..8adde71fdb31 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -632,7 +632,7 @@ static void sctp_cmd_assoc_failed(struct sctp_cmd_seq *commands,
 	struct sctp_chunk *abort;
 
 	/* Cancel any partial delivery in progress. */
-	sctp_ulpq_abort_pd(&asoc->ulpq, GFP_ATOMIC);
+	asoc->stream.si->abort_pd(&asoc->ulpq, GFP_ATOMIC);
 
 	if (event_type == SCTP_EVENT_T_CHUNK && subtype.chunk == SCTP_CID_ABORT)
 		event = sctp_ulpevent_make_assoc_change(asoc, 0, SCTP_COMM_LOST,
diff --git a/net/sctp/stream_interleave.c b/net/sctp/stream_interleave.c
index 4dce8d33c5ab..d15645ea338b 100644
--- a/net/sctp/stream_interleave.c
+++ b/net/sctp/stream_interleave.c
@@ -652,6 +652,103 @@ static void sctp_renege_events(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk,
 	sk_mem_reclaim(asoc->base.sk);
 }
 
+static void sctp_intl_stream_abort_pd(struct sctp_ulpq *ulpq, __u16 sid,
+				      __u32 mid, __u16 flags, gfp_t gfp)
+{
+	struct sock *sk = ulpq->asoc->base.sk;
+	struct sctp_ulpevent *ev = NULL;
+
+	if (!sctp_ulpevent_type_enabled(SCTP_PARTIAL_DELIVERY_EVENT,
+					&sctp_sk(sk)->subscribe))
+		return;
+
+	ev = sctp_ulpevent_make_pdapi(ulpq->asoc, SCTP_PARTIAL_DELIVERY_ABORTED,
+				      sid, mid, flags, gfp);
+	if (ev) {
+		__skb_queue_tail(&sk->sk_receive_queue, sctp_event2skb(ev));
+
+		if (!sctp_sk(sk)->data_ready_signalled) {
+			sctp_sk(sk)->data_ready_signalled = 1;
+			sk->sk_data_ready(sk);
+		}
+	}
+}
+
+static void sctp_intl_reap_ordered(struct sctp_ulpq *ulpq, __u16 sid)
+{
+	struct sctp_stream *stream = &ulpq->asoc->stream;
+	struct sctp_ulpevent *cevent, *event = NULL;
+	struct sk_buff_head *lobby = &ulpq->lobby;
+	struct sk_buff *pos, *tmp;
+	struct sk_buff_head temp;
+	__u16 csid;
+	__u32 cmid;
+
+	skb_queue_head_init(&temp);
+	sctp_skb_for_each(pos, lobby, tmp) {
+		cevent = (struct sctp_ulpevent *)pos->cb;
+		csid = cevent->stream;
+		cmid = cevent->mid;
+
+		if (csid > sid)
+			break;
+
+		if (csid < sid)
+			continue;
+
+		if (!MID_lt(cmid, sctp_mid_peek(stream, in, csid)))
+			break;
+
+		__skb_unlink(pos, lobby);
+		if (!event)
+			event = sctp_skb2event(pos);
+
+		__skb_queue_tail(&temp, pos);
+	}
+
+	if (!event && pos != (struct sk_buff *)lobby) {
+		cevent = (struct sctp_ulpevent *)pos->cb;
+		csid = cevent->stream;
+		cmid = cevent->mid;
+
+		if (csid == sid && cmid == sctp_mid_peek(stream, in, csid)) {
+			sctp_mid_next(stream, in, csid);
+			__skb_unlink(pos, lobby);
+			__skb_queue_tail(&temp, pos);
+			event = sctp_skb2event(pos);
+		}
+	}
+
+	if (event) {
+		sctp_intl_retrieve_ordered(ulpq, event);
+		sctp_enqueue_event(ulpq, event);
+	}
+}
+
+static void sctp_intl_abort_pd(struct sctp_ulpq *ulpq, gfp_t gfp)
+{
+	struct sctp_stream *stream = &ulpq->asoc->stream;
+	__u16 sid;
+
+	for (sid = 0; sid < stream->incnt; sid++) {
+		struct sctp_stream_in *sin = &stream->in[sid];
+		__u32 mid;
+
+		if (sin->pd_mode) {
+			sin->pd_mode = 0;
+
+			mid = sin->mid;
+			sctp_intl_stream_abort_pd(ulpq, sid, mid, 0, gfp);
+			sctp_mid_skip(stream, in, sid, mid);
+
+			sctp_intl_reap_ordered(ulpq, sid);
+		}
+	}
+
+	/* intl abort pd happens only when all data needs to be cleaned */
+	sctp_ulpq_flush(ulpq);
+}
+
 static struct sctp_stream_interleave sctp_stream_interleave_0 = {
 	.data_chunk_len		= sizeof(struct sctp_data_chunk),
 	/* DATA process functions */
@@ -662,6 +759,7 @@ static struct sctp_stream_interleave sctp_stream_interleave_0 = {
 	.enqueue_event		= sctp_ulpq_tail_event,
 	.renege_events		= sctp_ulpq_renege,
 	.start_pd		= sctp_ulpq_partial_delivery,
+	.abort_pd		= sctp_ulpq_abort_pd,
 };
 
 static struct sctp_stream_interleave sctp_stream_interleave_1 = {
@@ -674,6 +772,7 @@ static struct sctp_stream_interleave sctp_stream_interleave_1 = {
 	.enqueue_event		= sctp_enqueue_event,
 	.renege_events		= sctp_renege_events,
 	.start_pd		= sctp_intl_start_pd,
+	.abort_pd		= sctp_intl_abort_pd,
 };
 
 void sctp_stream_interleave_init(struct sctp_stream *stream)
diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c
index d3218f3e9cf7..84207ad33e8e 100644
--- a/net/sctp/ulpevent.c
+++ b/net/sctp/ulpevent.c
@@ -730,8 +730,9 @@ fail:
  *   various events.
  */
 struct sctp_ulpevent *sctp_ulpevent_make_pdapi(
-	const struct sctp_association *asoc, __u32 indication,
-	gfp_t gfp)
+					const struct sctp_association *asoc,
+					__u32 indication, __u32 sid, __u32 seq,
+					__u32 flags, gfp_t gfp)
 {
 	struct sctp_ulpevent *event;
 	struct sctp_pdapi_event *pd;
@@ -752,7 +753,9 @@ struct sctp_ulpevent *sctp_ulpevent_make_pdapi(
 	 *   Currently unused.
 	 */
 	pd->pdapi_type = SCTP_PARTIAL_DELIVERY_EVENT;
-	pd->pdapi_flags = 0;
+	pd->pdapi_flags = flags;
+	pd->pdapi_stream = sid;
+	pd->pdapi_seq = seq;
 
 	/* pdapi_length: 32 bits (unsigned integer)
 	 *
diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c
index 76ec5149a093..dd53daab4a25 100644
--- a/net/sctp/ulpqueue.c
+++ b/net/sctp/ulpqueue.c
@@ -1144,7 +1144,7 @@ void sctp_ulpq_abort_pd(struct sctp_ulpq *ulpq, gfp_t gfp)
 				       &sctp_sk(sk)->subscribe))
 		ev = sctp_ulpevent_make_pdapi(ulpq->asoc,
 					      SCTP_PARTIAL_DELIVERY_ABORTED,
-					      gfp);
+					      0, 0, 0, gfp);
 	if (ev)
 		__skb_queue_tail(&sk->sk_receive_queue, sctp_event2skb(ev));
 
-- 
cgit v1.2.3


From aa15d3d257f9edcb8d15ed27e228d1c0080cb919 Mon Sep 17 00:00:00 2001
From: Alan Stern <stern@rowland.harvard.edu>
Date: Mon, 11 Dec 2017 11:58:21 -0500
Subject: USB: remove the URB_NO_FSBR flag

The URB_NO_FSBR flag has never really been used.  It was introduced as
a potential way for UHCI to minimize PCI bus usage (by not attempting
full-speed bulk and control transfers more than once per frame), but
the flag was not set by any drivers.

There's no point in keeping it around.  This patch simplifies the API
by removing it.  Unfortunately, it does have to be kept as part of the
usbfs ABI, but at least we can document in
include/uapi/linux/usbdevice_fs.h that it doesn't do anything.

Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
Acked-by: Shuah Khan <shuahkh@osg.samsung.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/usb/usbip_protocol.txt | 1 -
 drivers/usb/core/devio.c             | 2 --
 drivers/usb/core/urb.c               | 3 ---
 drivers/usb/host/uhci-q.c            | 3 +--
 drivers/usb/usbip/stub_rx.c          | 3 ---
 include/linux/usb.h                  | 1 -
 include/uapi/linux/usbdevice_fs.h    | 2 +-
 7 files changed, 2 insertions(+), 13 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/usb/usbip_protocol.txt b/Documentation/usb/usbip_protocol.txt
index 16b6fe27284c..c7a0f4c7e7f1 100644
--- a/Documentation/usb/usbip_protocol.txt
+++ b/Documentation/usb/usbip_protocol.txt
@@ -274,7 +274,6 @@ USBIP_CMD_SUBMIT: Submit an URB
   URB_SHORT_NOT_OK        | 0x00000001 | only in | only in   | only in  | no
   URB_ISO_ASAP            | 0x00000002 | no      | no        | no       | yes
   URB_NO_TRANSFER_DMA_MAP | 0x00000004 | yes     | yes       | yes      | yes
-  URB_NO_FSBR             | 0x00000020 | yes     | no        | no       | no
   URB_ZERO_PACKET         | 0x00000040 | no      | no        | only out | no
   URB_NO_INTERRUPT        | 0x00000080 | yes     | yes       | yes      | yes
   URB_FREE_BUFFER         | 0x00000100 | yes     | yes       | yes      | yes
diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c
index 705c573d0257..808b370f1737 100644
--- a/drivers/usb/core/devio.c
+++ b/drivers/usb/core/devio.c
@@ -1677,8 +1677,6 @@ static int proc_do_submiturb(struct usb_dev_state *ps, struct usbdevfs_urb *uurb
 		u |= URB_ISO_ASAP;
 	if (uurb->flags & USBDEVFS_URB_SHORT_NOT_OK && is_in)
 		u |= URB_SHORT_NOT_OK;
-	if (uurb->flags & USBDEVFS_URB_NO_FSBR)
-		u |= URB_NO_FSBR;
 	if (uurb->flags & USBDEVFS_URB_ZERO_PACKET)
 		u |= URB_ZERO_PACKET;
 	if (uurb->flags & USBDEVFS_URB_NO_INTERRUPT)
diff --git a/drivers/usb/core/urb.c b/drivers/usb/core/urb.c
index 9fdf137c4865..796c9b149728 100644
--- a/drivers/usb/core/urb.c
+++ b/drivers/usb/core/urb.c
@@ -479,9 +479,6 @@ int usb_submit_urb(struct urb *urb, gfp_t mem_flags)
 		if (is_out)
 			allowed |= URB_ZERO_PACKET;
 		/* FALLTHROUGH */
-	case USB_ENDPOINT_XFER_CONTROL:
-		allowed |= URB_NO_FSBR;	/* only affects UHCI */
-		/* FALLTHROUGH */
 	default:			/* all non-iso endpoints */
 		if (!is_out)
 			allowed |= URB_SHORT_NOT_OK;
diff --git a/drivers/usb/host/uhci-q.c b/drivers/usb/host/uhci-q.c
index d40438238938..35fcb826152c 100644
--- a/drivers/usb/host/uhci-q.c
+++ b/drivers/usb/host/uhci-q.c
@@ -73,8 +73,7 @@ static void uhci_add_fsbr(struct uhci_hcd *uhci, struct urb *urb)
 {
 	struct urb_priv *urbp = urb->hcpriv;
 
-	if (!(urb->transfer_flags & URB_NO_FSBR))
-		urbp->fsbr = 1;
+	urbp->fsbr = 1;
 }
 
 static void uhci_urbp_wants_fsbr(struct uhci_hcd *uhci, struct urb_priv *urbp)
diff --git a/drivers/usb/usbip/stub_rx.c b/drivers/usb/usbip/stub_rx.c
index 536e037f541f..7a9aa9ff485d 100644
--- a/drivers/usb/usbip/stub_rx.c
+++ b/drivers/usb/usbip/stub_rx.c
@@ -412,9 +412,6 @@ static void masking_bogus_flags(struct urb *urb)
 		if (is_out)
 			allowed |= URB_ZERO_PACKET;
 		/* FALLTHROUGH */
-	case USB_ENDPOINT_XFER_CONTROL:
-		allowed |= URB_NO_FSBR;	/* only affects UHCI */
-		/* FALLTHROUGH */
 	default:			/* all non-iso endpoints */
 		if (!is_out)
 			allowed |= URB_SHORT_NOT_OK;
diff --git a/include/linux/usb.h b/include/linux/usb.h
index fbbe974661f2..fe665a0d5bce 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -1293,7 +1293,6 @@ extern int usb_disabled(void);
 #define URB_ISO_ASAP		0x0002	/* iso-only; use the first unexpired
 					 * slot in the schedule */
 #define URB_NO_TRANSFER_DMA_MAP	0x0004	/* urb->transfer_dma valid on submit */
-#define URB_NO_FSBR		0x0020	/* UHCI-specific */
 #define URB_ZERO_PACKET		0x0040	/* Finish bulk OUT with short packet */
 #define URB_NO_INTERRUPT	0x0080	/* HINT: no non-error interrupt
 					 * needed */
diff --git a/include/uapi/linux/usbdevice_fs.h b/include/uapi/linux/usbdevice_fs.h
index 70ed5338d447..964e87217be4 100644
--- a/include/uapi/linux/usbdevice_fs.h
+++ b/include/uapi/linux/usbdevice_fs.h
@@ -79,7 +79,7 @@ struct usbdevfs_connectinfo {
 #define USBDEVFS_URB_SHORT_NOT_OK	0x01
 #define USBDEVFS_URB_ISO_ASAP		0x02
 #define USBDEVFS_URB_BULK_CONTINUATION	0x04
-#define USBDEVFS_URB_NO_FSBR		0x20
+#define USBDEVFS_URB_NO_FSBR		0x20	/* Not used */
 #define USBDEVFS_URB_ZERO_PACKET	0x40
 #define USBDEVFS_URB_NO_INTERRUPT	0x80
 
-- 
cgit v1.2.3


From f371b304f12e31fe30207c41ca7754564e0ea4dc Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Mon, 11 Dec 2017 11:39:02 -0800
Subject: bpf/tracing: allow user space to query prog array on the same tp

Commit e87c6bc3852b ("bpf: permit multiple bpf attachments
for a single perf event") added support to attach multiple
bpf programs to a single perf event.
Although this provides flexibility, users may want to know
what other bpf programs are attached to the same tp interface.
Besides getting visibility for the underlying bpf system,
such information may also help consolidate multiple bpf programs,
understand potential performance issues due to a large array,
and debug (e.g., one bpf program which overwrites return code
may impact subsequent program results).

Commit 2541517c32be ("tracing, perf: Implement BPF programs
attached to kprobes") utilized the existing perf ioctl
interface and added the command PERF_EVENT_IOC_SET_BPF
to attach a bpf program to a tracepoint. This patch adds a new
ioctl command, given a perf event fd, to query the bpf program
array attached to the same perf tracepoint event.

The new uapi ioctl command:
  PERF_EVENT_IOC_QUERY_BPF

The new uapi/linux/perf_event.h structure:
  struct perf_event_query_bpf {
       __u32	ids_len;
       __u32	prog_cnt;
       __u32	ids[0];
  };

User space provides buffer "ids" for kernel to copy to.
When returning from the kernel, the number of available
programs in the array is set in "prog_cnt".

The usage:
  struct perf_event_query_bpf *query =
    malloc(sizeof(*query) + sizeof(u32) * ids_len);
  query->ids_len = ids_len;
  err = ioctl(pmu_efd, PERF_EVENT_IOC_QUERY_BPF, query);
  if (err == 0) {
    /* query->prog_cnt is the number of available progs,
     * number of progs in ids: (ids_len == 0) ? 0 : query->prog_cnt
     */
  } else if (errno == ENOSPC) {
    /* query->ids_len number of progs copied,
     * query->prog_cnt is the number of available progs
     */
  } else {
      /* other errors */
  }

Signed-off-by: Yonghong Song <yhs@fb.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h             |  4 ++++
 include/uapi/linux/perf_event.h | 22 ++++++++++++++++++++++
 kernel/bpf/core.c               | 21 +++++++++++++++++++++
 kernel/events/core.c            |  3 +++
 kernel/trace/bpf_trace.c        | 23 +++++++++++++++++++++++
 5 files changed, 73 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index e55e4255a210..f812ac508e9f 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -254,6 +254,7 @@ typedef unsigned long (*bpf_ctx_copy_t)(void *dst, const void *src,
 
 u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
 		     void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy);
+int bpf_event_query_prog_array(struct perf_event *event, void __user *info);
 
 int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr,
 			  union bpf_attr __user *uattr);
@@ -285,6 +286,9 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs,
 
 void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *progs,
 				struct bpf_prog *old_prog);
+int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array,
+			     __u32 __user *prog_ids, u32 request_cnt,
+			     __u32 __user *prog_cnt);
 int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,
 			struct bpf_prog *exclude_prog,
 			struct bpf_prog *include_prog,
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index b9a4953018ed..769533696483 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -418,6 +418,27 @@ struct perf_event_attr {
 	__u16	__reserved_2;	/* align to __u64 */
 };
 
+/*
+ * Structure used by below PERF_EVENT_IOC_QUERY_BPF command
+ * to query bpf programs attached to the same perf tracepoint
+ * as the given perf event.
+ */
+struct perf_event_query_bpf {
+	/*
+	 * The below ids array length
+	 */
+	__u32	ids_len;
+	/*
+	 * Set by the kernel to indicate the number of
+	 * available programs
+	 */
+	__u32	prog_cnt;
+	/*
+	 * User provided buffer to store program ids
+	 */
+	__u32	ids[0];
+};
+
 #define perf_flags(attr)	(*(&(attr)->read_format + 1))
 
 /*
@@ -433,6 +454,7 @@ struct perf_event_attr {
 #define PERF_EVENT_IOC_ID		_IOR('$', 7, __u64 *)
 #define PERF_EVENT_IOC_SET_BPF		_IOW('$', 8, __u32)
 #define PERF_EVENT_IOC_PAUSE_OUTPUT	_IOW('$', 9, __u32)
+#define PERF_EVENT_IOC_QUERY_BPF	_IOWR('$', 10, struct perf_event_query_bpf *)
 
 enum perf_event_ioc_flags {
 	PERF_IOC_FLAG_GROUP		= 1U << 0,
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 86b50aa26ee8..b16c6f8f42b6 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1462,6 +1462,8 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs,
 	rcu_read_lock();
 	prog = rcu_dereference(progs)->progs;
 	for (; *prog; prog++) {
+		if (*prog == &dummy_bpf_prog.prog)
+			continue;
 		id = (*prog)->aux->id;
 		if (copy_to_user(prog_ids + i, &id, sizeof(id))) {
 			rcu_read_unlock();
@@ -1545,6 +1547,25 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,
 	return 0;
 }
 
+int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array,
+			     __u32 __user *prog_ids, u32 request_cnt,
+			     __u32 __user *prog_cnt)
+{
+	u32 cnt = 0;
+
+	if (array)
+		cnt = bpf_prog_array_length(array);
+
+	if (copy_to_user(prog_cnt, &cnt, sizeof(cnt)))
+		return -EFAULT;
+
+	/* return early if user requested only program count or nothing to copy */
+	if (!request_cnt || !cnt)
+		return 0;
+
+	return bpf_prog_array_copy_to_user(array, prog_ids, request_cnt);
+}
+
 static void bpf_prog_free_deferred(struct work_struct *work)
 {
 	struct bpf_prog_aux *aux;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 16beab4767e1..f10609e539d4 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4723,6 +4723,9 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon
 		rcu_read_unlock();
 		return 0;
 	}
+
+	case PERF_EVENT_IOC_QUERY_BPF:
+		return bpf_event_query_prog_array(event, (void __user *)arg);
 	default:
 		return -ENOTTY;
 	}
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 0ce99c379c30..b143f2a05aff 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -820,3 +820,26 @@ void perf_event_detach_bpf_prog(struct perf_event *event)
 unlock:
 	mutex_unlock(&bpf_event_mutex);
 }
+
+int bpf_event_query_prog_array(struct perf_event *event, void __user *info)
+{
+	struct perf_event_query_bpf __user *uquery = info;
+	struct perf_event_query_bpf query = {};
+	int ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	if (event->attr.type != PERF_TYPE_TRACEPOINT)
+		return -EINVAL;
+	if (copy_from_user(&query, uquery, sizeof(query)))
+		return -EFAULT;
+
+	mutex_lock(&bpf_event_mutex);
+	ret = bpf_prog_array_copy_info(event->tp_event->prog_array,
+				       uquery->ids,
+				       query.ids_len,
+				       &uquery->prog_cnt);
+	mutex_unlock(&bpf_event_mutex);
+
+	return ret;
+}
-- 
cgit v1.2.3


From 9802d86585db91655c7d1929a4f6bbe0952ea88e Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fb.com>
Date: Mon, 11 Dec 2017 11:36:48 -0500
Subject: bpf: add a bpf_override_function helper

Error injection is sloppy and very ad-hoc.  BPF could fill this niche
perfectly with its kprobe functionality.  We could make sure errors are
only triggered in specific call chains that we care about with very
specific situations.  Accomplish this with the bpf_override_function
helper.  This will modify the probe'd callers return value to the
specified value and set the PC to an override function that simply
returns, bypassing the originally probed function.  This gives us a nice
clean way to implement systematic error injection for all of our code
paths.

Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/Kconfig                     |  3 +++
 arch/x86/Kconfig                 |  1 +
 arch/x86/include/asm/kprobes.h   |  4 +++
 arch/x86/include/asm/ptrace.h    |  5 ++++
 arch/x86/kernel/kprobes/ftrace.c | 14 ++++++++++
 include/linux/filter.h           |  3 ++-
 include/linux/trace_events.h     |  1 +
 include/uapi/linux/bpf.h         |  7 ++++-
 kernel/bpf/core.c                |  3 +++
 kernel/bpf/verifier.c            |  2 ++
 kernel/events/core.c             |  7 +++++
 kernel/trace/Kconfig             | 11 ++++++++
 kernel/trace/bpf_trace.c         | 35 +++++++++++++++++++++++++
 kernel/trace/trace_kprobe.c      | 55 +++++++++++++++++++++++++++++++++++-----
 kernel/trace/trace_probe.h       | 12 +++++++++
 15 files changed, 154 insertions(+), 9 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/arch/Kconfig b/arch/Kconfig
index 400b9e1b2f27..d3f4aaf9cb7a 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -196,6 +196,9 @@ config HAVE_OPTPROBES
 config HAVE_KPROBES_ON_FTRACE
 	bool
 
+config HAVE_KPROBE_OVERRIDE
+	bool
+
 config HAVE_NMI
 	bool
 
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 8eed3f94bfc7..04d66e6fa447 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -154,6 +154,7 @@ config X86
 	select HAVE_KERNEL_XZ
 	select HAVE_KPROBES
 	select HAVE_KPROBES_ON_FTRACE
+	select HAVE_KPROBE_OVERRIDE
 	select HAVE_KRETPROBES
 	select HAVE_KVM
 	select HAVE_LIVEPATCH			if X86_64
diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h
index 9f2e3102e0bb..36abb23a7a35 100644
--- a/arch/x86/include/asm/kprobes.h
+++ b/arch/x86/include/asm/kprobes.h
@@ -67,6 +67,10 @@ extern const int kretprobe_blacklist_size;
 void arch_remove_kprobe(struct kprobe *p);
 asmlinkage void kretprobe_trampoline(void);
 
+#ifdef CONFIG_KPROBES_ON_FTRACE
+extern void arch_ftrace_kprobe_override_function(struct pt_regs *regs);
+#endif
+
 /* Architecture specific copy of original instruction*/
 struct arch_specific_insn {
 	/* copy of the original instruction */
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 14131dd06b29..6de1fd3d0097 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -109,6 +109,11 @@ static inline unsigned long regs_return_value(struct pt_regs *regs)
 	return regs->ax;
 }
 
+static inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc)
+{
+	regs->ax = rc;
+}
+
 /*
  * user_mode(regs) determines whether a register set came from user
  * mode.  On x86_32, this is true if V8086 mode was enabled OR if the
diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c
index 8dc0161cec8f..1ea748d682fd 100644
--- a/arch/x86/kernel/kprobes/ftrace.c
+++ b/arch/x86/kernel/kprobes/ftrace.c
@@ -97,3 +97,17 @@ int arch_prepare_kprobe_ftrace(struct kprobe *p)
 	p->ainsn.boostable = false;
 	return 0;
 }
+
+asmlinkage void override_func(void);
+asm(
+	".type override_func, @function\n"
+	"override_func:\n"
+	"	ret\n"
+	".size override_func, .-override_func\n"
+);
+
+void arch_ftrace_kprobe_override_function(struct pt_regs *regs)
+{
+	regs->ip = (unsigned long)&override_func;
+}
+NOKPROBE_SYMBOL(arch_ftrace_kprobe_override_function);
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 0062302e1285..5feb441d3dd9 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -458,7 +458,8 @@ struct bpf_prog {
 				locked:1,	/* Program image locked? */
 				gpl_compatible:1, /* Is filter GPL compatible? */
 				cb_access:1,	/* Is control block accessed? */
-				dst_needed:1;	/* Do we need dst entry? */
+				dst_needed:1,	/* Do we need dst entry? */
+				kprobe_override:1; /* Do we override a kprobe? */
 	enum bpf_prog_type	type;		/* Type of BPF program */
 	u32			len;		/* Number of filter blocks */
 	u32			jited_len;	/* Size of jited insns in bytes */
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index af44e7c2d577..5fea451f6e28 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -528,6 +528,7 @@ do {									\
 struct perf_event;
 
 DECLARE_PER_CPU(struct pt_regs, perf_trace_regs);
+DECLARE_PER_CPU(int, bpf_kprobe_override);
 
 extern int  perf_trace_init(struct perf_event *event);
 extern void perf_trace_destroy(struct perf_event *event);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 80d62e88590c..595bda120cfb 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -677,6 +677,10 @@ union bpf_attr {
  *     @buf: buf to fill
  *     @buf_size: size of the buf
  *     Return : 0 on success or negative error code
+ *
+ * int bpf_override_return(pt_regs, rc)
+ *	@pt_regs: pointer to struct pt_regs
+ *	@rc: the return value to set
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -736,7 +740,8 @@ union bpf_attr {
 	FN(xdp_adjust_meta),		\
 	FN(perf_event_read_value),	\
 	FN(perf_prog_read_value),	\
-	FN(getsockopt),
+	FN(getsockopt),			\
+	FN(override_return),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index b16c6f8f42b6..d32bebf4f2de 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1320,6 +1320,9 @@ EVAL4(PROG_NAME_LIST, 416, 448, 480, 512)
 bool bpf_prog_array_compatible(struct bpf_array *array,
 			       const struct bpf_prog *fp)
 {
+	if (fp->kprobe_override)
+		return false;
+
 	if (!array->owner_prog_type) {
 		/* There's no owner yet where we could check for
 		 * compatibility.
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 7afa92e9b409..e807bda7fe29 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4413,6 +4413,8 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
 			prog->dst_needed = 1;
 		if (insn->imm == BPF_FUNC_get_prandom_u32)
 			bpf_user_rnd_init_once();
+		if (insn->imm == BPF_FUNC_override_return)
+			prog->kprobe_override = 1;
 		if (insn->imm == BPF_FUNC_tail_call) {
 			/* If we tail call into other programs, we
 			 * cannot make any assumptions since they can
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f10609e539d4..5857c500721b 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -8080,6 +8080,13 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
 		return -EINVAL;
 	}
 
+	/* Kprobe override only works for kprobes, not uprobes. */
+	if (prog->kprobe_override &&
+	    !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) {
+		bpf_prog_put(prog);
+		return -EINVAL;
+	}
+
 	if (is_tracepoint || is_syscall_tp) {
 		int off = trace_event_get_offsets(event->tp_event);
 
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index af7dad126c13..3e6fd580fe7f 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -529,6 +529,17 @@ config FUNCTION_PROFILER
 
 	  If in doubt, say N.
 
+config BPF_KPROBE_OVERRIDE
+	bool "Enable BPF programs to override a kprobed function"
+	depends on BPF_EVENTS
+	depends on KPROBES_ON_FTRACE
+	depends on HAVE_KPROBE_OVERRIDE
+	depends on DYNAMIC_FTRACE_WITH_REGS
+	default n
+	help
+	 Allows BPF to override the execution of a probed function and
+	 set a different return value.  This is used for error injection.
+
 config FTRACE_MCOUNT_RECORD
 	def_bool y
 	depends on DYNAMIC_FTRACE
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index b143f2a05aff..e009b7ecf473 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -13,6 +13,10 @@
 #include <linux/filter.h>
 #include <linux/uaccess.h>
 #include <linux/ctype.h>
+#include <linux/kprobes.h>
+#include <asm/kprobes.h>
+
+#include "trace_probe.h"
 #include "trace.h"
 
 u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
@@ -76,6 +80,24 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
 }
 EXPORT_SYMBOL_GPL(trace_call_bpf);
 
+#ifdef CONFIG_BPF_KPROBE_OVERRIDE
+BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc)
+{
+	__this_cpu_write(bpf_kprobe_override, 1);
+	regs_set_return_value(regs, rc);
+	arch_ftrace_kprobe_override_function(regs);
+	return 0;
+}
+
+static const struct bpf_func_proto bpf_override_return_proto = {
+	.func		= bpf_override_return,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+};
+#endif
+
 BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr)
 {
 	int ret;
@@ -551,6 +573,10 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
 		return &bpf_get_stackid_proto;
 	case BPF_FUNC_perf_event_read_value:
 		return &bpf_perf_event_read_value_proto;
+#ifdef CONFIG_BPF_KPROBE_OVERRIDE
+	case BPF_FUNC_override_return:
+		return &bpf_override_return_proto;
+#endif
 	default:
 		return tracing_func_proto(func_id);
 	}
@@ -768,6 +794,15 @@ int perf_event_attach_bpf_prog(struct perf_event *event,
 	struct bpf_prog_array *new_array;
 	int ret = -EEXIST;
 
+	/*
+	 * Kprobe override only works for ftrace based kprobes, and only if they
+	 * are on the opt-in list.
+	 */
+	if (prog->kprobe_override &&
+	    (!trace_kprobe_ftrace(event->tp_event) ||
+	     !trace_kprobe_error_injectable(event->tp_event)))
+		return -EINVAL;
+
 	mutex_lock(&bpf_event_mutex);
 
 	if (event->prog)
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 492700c5fb4d..5db849809a56 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -42,6 +42,7 @@ struct trace_kprobe {
 	(offsetof(struct trace_kprobe, tp.args) +	\
 	(sizeof(struct probe_arg) * (n)))
 
+DEFINE_PER_CPU(int, bpf_kprobe_override);
 
 static nokprobe_inline bool trace_kprobe_is_return(struct trace_kprobe *tk)
 {
@@ -87,6 +88,27 @@ static nokprobe_inline unsigned long trace_kprobe_nhit(struct trace_kprobe *tk)
 	return nhit;
 }
 
+int trace_kprobe_ftrace(struct trace_event_call *call)
+{
+	struct trace_kprobe *tk = (struct trace_kprobe *)call->data;
+	return kprobe_ftrace(&tk->rp.kp);
+}
+
+int trace_kprobe_error_injectable(struct trace_event_call *call)
+{
+	struct trace_kprobe *tk = (struct trace_kprobe *)call->data;
+	unsigned long addr;
+
+	if (tk->symbol) {
+		addr = (unsigned long)
+			kallsyms_lookup_name(trace_kprobe_symbol(tk));
+		addr += tk->rp.kp.offset;
+	} else {
+		addr = (unsigned long)tk->rp.kp.addr;
+	}
+	return within_kprobe_error_injection_list(addr);
+}
+
 static int register_kprobe_event(struct trace_kprobe *tk);
 static int unregister_kprobe_event(struct trace_kprobe *tk);
 
@@ -1170,7 +1192,7 @@ static int kretprobe_event_define_fields(struct trace_event_call *event_call)
 #ifdef CONFIG_PERF_EVENTS
 
 /* Kprobe profile handler */
-static void
+static int
 kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
 {
 	struct trace_event_call *call = &tk->tp.call;
@@ -1179,12 +1201,29 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
 	int size, __size, dsize;
 	int rctx;
 
-	if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs))
-		return;
+	if (bpf_prog_array_valid(call)) {
+		int ret;
+
+		ret = trace_call_bpf(call, regs);
+
+		/*
+		 * We need to check and see if we modified the pc of the
+		 * pt_regs, and if so clear the kprobe and return 1 so that we
+		 * don't do the instruction skipping.  Also reset our state so
+		 * we are clean the next pass through.
+		 */
+		if (__this_cpu_read(bpf_kprobe_override)) {
+			__this_cpu_write(bpf_kprobe_override, 0);
+			reset_current_kprobe();
+			return 1;
+		}
+		if (!ret)
+			return 0;
+	}
 
 	head = this_cpu_ptr(call->perf_events);
 	if (hlist_empty(head))
-		return;
+		return 0;
 
 	dsize = __get_data_size(&tk->tp, regs);
 	__size = sizeof(*entry) + tk->tp.size + dsize;
@@ -1193,13 +1232,14 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
 
 	entry = perf_trace_buf_alloc(size, NULL, &rctx);
 	if (!entry)
-		return;
+		return 0;
 
 	entry->ip = (unsigned long)tk->rp.kp.addr;
 	memset(&entry[1], 0, dsize);
 	store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
 	perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
 			      head, NULL);
+	return 0;
 }
 NOKPROBE_SYMBOL(kprobe_perf_func);
 
@@ -1275,6 +1315,7 @@ static int kprobe_register(struct trace_event_call *event,
 static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
 {
 	struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp);
+	int ret = 0;
 
 	raw_cpu_inc(*tk->nhit);
 
@@ -1282,9 +1323,9 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
 		kprobe_trace_func(tk, regs);
 #ifdef CONFIG_PERF_EVENTS
 	if (tk->tp.flags & TP_FLAG_PROFILE)
-		kprobe_perf_func(tk, regs);
+		ret = kprobe_perf_func(tk, regs);
 #endif
-	return 0;	/* We don't tweek kernel, so just return 0 */
+	return ret;
 }
 NOKPROBE_SYMBOL(kprobe_dispatcher);
 
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index fb66e3eaa192..5e54d748c84c 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -252,6 +252,8 @@ struct symbol_cache;
 unsigned long update_symbol_cache(struct symbol_cache *sc);
 void free_symbol_cache(struct symbol_cache *sc);
 struct symbol_cache *alloc_symbol_cache(const char *sym, long offset);
+int trace_kprobe_ftrace(struct trace_event_call *call);
+int trace_kprobe_error_injectable(struct trace_event_call *call);
 #else
 /* uprobes do not support symbol fetch methods */
 #define fetch_symbol_u8			NULL
@@ -277,6 +279,16 @@ alloc_symbol_cache(const char *sym, long offset)
 {
 	return NULL;
 }
+
+static inline int trace_kprobe_ftrace(struct trace_event_call *call)
+{
+	return 0;
+}
+
+static inline int trace_kprobe_error_injectable(struct trace_event_call *call)
+{
+	return 0;
+}
 #endif /* CONFIG_KPROBE_EVENTS */
 
 struct probe_arg {
-- 
cgit v1.2.3


From 65adc27375a85beb57c3869cedb2a410fad9c288 Mon Sep 17 00:00:00 2001
From: Andrew Price <anprice@redhat.com>
Date: Tue, 12 Dec 2017 11:37:15 -0600
Subject: gfs2: Add a next-resource-group pointer to resource groups

Add a new rg_skip field to struct gfs2_rgrp, replacing __pad. The
rg_skip field has the following meaning:

- If rg_skip is zero, it is considered unset and not useful.
- If rg_skip is non-zero, its value will be the number of blocks between
  this rgrp's address and the next rgrp's address. This can be used as a
  hint by fsck.gfs2 when rebuilding a bad rindex, for example.

This will provide less dependency on the rindex in future, and allow
tools such as fsck.gfs2 to iterate the resource groups without keeping
the rindex around.

The field is updated in gfs2_rgrp_out() so that existing file systems
will have it set. This means that any resource groups that aren't ever
written will not be updated. The final rgrp is a special case as there
is no next rgrp, so it will always have a rg_skip of 0 (unless the fs is
extended).

Before this patch, gfs2_rgrp_out() zeroes the __pad field explicitly, so
the rg_skip field can get set back to 0 in cases where nodes with and
without this patch are mixed in a cluster. In some cases, the field may
bounce between being set by one node and then zeroed by another which
may harm performance slightly, e.g. when two nodes create many small
files. In testing this situation is rare but it becomes more likely as
the filesystem fills up and there are fewer resource groups to choose
from. The problem goes away when all nodes are running with this patch.
Dipping into the space currently occupied by the rg_reserved field would
have resulted in the same problem as it is also explicitly zeroed, so
unfortunately there is no other way around it.

Signed-off-by: Andrew Price <anprice@redhat.com>
Signed-off-by: Bob Peterson <rpeterso@redhat.com>
---
 fs/gfs2/rgrp.c                   | 6 +++++-
 include/uapi/linux/gfs2_ondisk.h | 5 ++++-
 2 files changed, 9 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index b52c5c3ac445..be2fc26029e4 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1044,12 +1044,16 @@ static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf)
 
 static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf)
 {
+	struct gfs2_rgrpd *next = gfs2_rgrpd_get_next(rgd);
 	struct gfs2_rgrp *str = buf;
 
 	str->rg_flags = cpu_to_be32(rgd->rd_flags & ~GFS2_RDF_MASK);
 	str->rg_free = cpu_to_be32(rgd->rd_free);
 	str->rg_dinodes = cpu_to_be32(rgd->rd_dinodes);
-	str->__pad = cpu_to_be32(0);
+	if (next == NULL)
+		str->rg_skip = 0;
+	else if (next->rd_addr > rgd->rd_addr)
+		str->rg_skip = cpu_to_be32(next->rd_addr - rgd->rd_addr);
 	str->rg_igeneration = cpu_to_be64(rgd->rd_igeneration);
 	memset(&str->rg_reserved, 0, sizeof(str->rg_reserved));
 }
diff --git a/include/uapi/linux/gfs2_ondisk.h b/include/uapi/linux/gfs2_ondisk.h
index 5156bad77b47..da7a30ddef72 100644
--- a/include/uapi/linux/gfs2_ondisk.h
+++ b/include/uapi/linux/gfs2_ondisk.h
@@ -187,7 +187,10 @@ struct gfs2_rgrp {
 	__be32 rg_flags;
 	__be32 rg_free;
 	__be32 rg_dinodes;
-	__be32 __pad;
+	union {
+		__be32 __pad;
+		__be32 rg_skip; /* Distance to the next rgrp in fs blocks */
+	};
 	__be64 rg_igeneration;
 
 	__u8 rg_reserved[80]; /* Several fields from gfs1 now reserved */
-- 
cgit v1.2.3


From 166725d96322473305e35f9d580591a01697ab29 Mon Sep 17 00:00:00 2001
From: Andrew Price <anprice@redhat.com>
Date: Tue, 12 Dec 2017 11:40:05 -0600
Subject: gfs2: Add rindex fields to rgrp headers

Add rg_data0, rg_data and rg_bitbytes to struct gfs2_rgrp. The fields
are identical to their counterparts in struct gfs2_rindex and are
intended to reduce the use of the rindex. For now the fields are only
written back as the in-memory equivalents in struct gfs2_rgrpd are set
using values from the rindex. However, they are needed at this point so
that userspace can make use of them, allowing a migration away from the
rindex over time.

The new fields take up previously reserved space which was explicitly
zeroed on write so, in clusters with mixed kernels, these fields could
get zeroed after being set and this should not be treated as an error.

Signed-off-by: Andrew Price <anprice@redhat.com>
Signed-off-by: Bob Peterson <rpeterso@redhat.com>
---
 fs/gfs2/rgrp.c                   | 5 +++++
 include/uapi/linux/gfs2_ondisk.h | 7 ++++++-
 2 files changed, 11 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index be2fc26029e4..a9184903a9f5 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1040,6 +1040,7 @@ static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf)
 	rgd->rd_free = be32_to_cpu(str->rg_free);
 	rgd->rd_dinodes = be32_to_cpu(str->rg_dinodes);
 	rgd->rd_igeneration = be64_to_cpu(str->rg_igeneration);
+	/* rd_data0, rd_data and rd_bitbytes already set from rindex */
 }
 
 static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf)
@@ -1055,6 +1056,10 @@ static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf)
 	else if (next->rd_addr > rgd->rd_addr)
 		str->rg_skip = cpu_to_be32(next->rd_addr - rgd->rd_addr);
 	str->rg_igeneration = cpu_to_be64(rgd->rd_igeneration);
+	str->rg_data0 = cpu_to_be64(rgd->rd_data0);
+	str->rg_data = cpu_to_be32(rgd->rd_data);
+	str->rg_bitbytes = cpu_to_be32(rgd->rd_bitbytes);
+
 	memset(&str->rg_reserved, 0, sizeof(str->rg_reserved));
 }
 
diff --git a/include/uapi/linux/gfs2_ondisk.h b/include/uapi/linux/gfs2_ondisk.h
index da7a30ddef72..648e0cbca574 100644
--- a/include/uapi/linux/gfs2_ondisk.h
+++ b/include/uapi/linux/gfs2_ondisk.h
@@ -192,8 +192,13 @@ struct gfs2_rgrp {
 		__be32 rg_skip; /* Distance to the next rgrp in fs blocks */
 	};
 	__be64 rg_igeneration;
+	/* The following 3 fields are duplicated from gfs2_rindex to reduce
+	   reliance on the rindex */
+	__be64 rg_data0;     /* First data location */
+	__be32 rg_data;      /* Number of data blocks in rgrp */
+	__be32 rg_bitbytes;  /* Number of bytes in data bitmaps */
 
-	__u8 rg_reserved[80]; /* Several fields from gfs1 now reserved */
+	__u8 rg_reserved[64]; /* Several fields from gfs1 now reserved */
 };
 
 /*
-- 
cgit v1.2.3


From 850d2d915fa69011bef9bd668499cce889fdd8b3 Mon Sep 17 00:00:00 2001
From: Andrew Price <anprice@redhat.com>
Date: Tue, 12 Dec 2017 11:42:30 -0600
Subject: gfs2: Add a crc field to resource group headers

Add the rg_crc field to store a crc32 of the gfs2_rgrp structure. This
allows us to check resource group headers' integrity and removes the
requirement to check them against the rindex entries in fsck. If this
field is found to be zero, it should be ignored (or updated with an
accurate value).

Signed-off-by: Andrew Price <anprice@redhat.com>
Signed-off-by: Bob Peterson <rpeterso@redhat.com>
---
 fs/gfs2/rgrp.c                   | 5 +++++
 include/uapi/linux/gfs2_ondisk.h | 3 ++-
 2 files changed, 7 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index a9184903a9f5..e8aba6fa1472 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -34,6 +34,7 @@
 #include "log.h"
 #include "inode.h"
 #include "trace_gfs2.h"
+#include "dir.h"
 
 #define BFITNOENT ((u32)~0)
 #define NO_BLOCK ((u64)~0)
@@ -1047,6 +1048,7 @@ static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf)
 {
 	struct gfs2_rgrpd *next = gfs2_rgrpd_get_next(rgd);
 	struct gfs2_rgrp *str = buf;
+	u32 crc;
 
 	str->rg_flags = cpu_to_be32(rgd->rd_flags & ~GFS2_RDF_MASK);
 	str->rg_free = cpu_to_be32(rgd->rd_free);
@@ -1059,6 +1061,9 @@ static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf)
 	str->rg_data0 = cpu_to_be64(rgd->rd_data0);
 	str->rg_data = cpu_to_be32(rgd->rd_data);
 	str->rg_bitbytes = cpu_to_be32(rgd->rd_bitbytes);
+	str->rg_crc = 0;
+	crc = gfs2_disk_hash(buf, sizeof(struct gfs2_rgrp));
+	str->rg_crc = cpu_to_be32(crc);
 
 	memset(&str->rg_reserved, 0, sizeof(str->rg_reserved));
 }
diff --git a/include/uapi/linux/gfs2_ondisk.h b/include/uapi/linux/gfs2_ondisk.h
index 648e0cbca574..09f0920f07e9 100644
--- a/include/uapi/linux/gfs2_ondisk.h
+++ b/include/uapi/linux/gfs2_ondisk.h
@@ -197,8 +197,9 @@ struct gfs2_rgrp {
 	__be64 rg_data0;     /* First data location */
 	__be32 rg_data;      /* Number of data blocks in rgrp */
 	__be32 rg_bitbytes;  /* Number of bytes in data bitmaps */
+	__be32 rg_crc;       /* crc32 of the structure with this field 0 */
 
-	__u8 rg_reserved[64]; /* Several fields from gfs1 now reserved */
+	__u8 rg_reserved[60]; /* Several fields from gfs1 now reserved */
 };
 
 /*
-- 
cgit v1.2.3


From 9b6192589be788dec73a0e99fe49b8f8ddaf825e Mon Sep 17 00:00:00 2001
From: Sean Young <sean@mess.org>
Date: Sat, 25 Feb 2017 06:51:29 -0500
Subject: media: lirc: implement scancode sending

This introduces a new lirc mode: scancode. Any device which can send raw IR
can now also send scancodes.

int main()
{
	int mode, fd = open("/dev/lirc0", O_RDWR);

        mode = LIRC_MODE_SCANCODE;
	if (ioctl(fd, LIRC_SET_SEND_MODE, &mode)) {
		// kernel too old or lirc does not support transmit
	}
	struct lirc_scancode scancode = {
		.scancode = 0x1e3d,
		.rc_proto = RC_PROTO_RC5,
	};
	write(fd, &scancode, sizeof(scancode));
	close(fd);
}

The other fields of lirc_scancode must be set to 0.

Note that toggle (rc5, rc6) and repeats (nec) are not implemented. Nor is
there a method for holding down a key for a period.

Signed-off-by: Sean Young <sean@mess.org>
Signed-off-by: Mauro Carvalho Chehab <mchehab@s-opensource.com>
---
 drivers/media/rc/ir-lirc-codec.c | 99 +++++++++++++++++++++++++++++-----------
 drivers/media/rc/rc-core-priv.h  |  2 +-
 include/media/rc-map.h           | 54 +---------------------
 include/uapi/linux/lirc.h        | 82 +++++++++++++++++++++++++++++++++
 4 files changed, 156 insertions(+), 81 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/media/rc/ir-lirc-codec.c b/drivers/media/rc/ir-lirc-codec.c
index 9954ad4b8e59..0a3ec693d290 100644
--- a/drivers/media/rc/ir-lirc-codec.c
+++ b/drivers/media/rc/ir-lirc-codec.c
@@ -107,7 +107,8 @@ static ssize_t ir_lirc_transmit_ir(struct file *file, const char __user *buf,
 {
 	struct lirc_codec *lirc;
 	struct rc_dev *dev;
-	unsigned int *txbuf; /* buffer with values to transmit */
+	unsigned int *txbuf = NULL;
+	struct ir_raw_event *raw = NULL;
 	ssize_t ret = -EINVAL;
 	size_t count;
 	ktime_t start;
@@ -121,16 +122,50 @@ static ssize_t ir_lirc_transmit_ir(struct file *file, const char __user *buf,
 	if (!lirc)
 		return -EFAULT;
 
-	if (n < sizeof(unsigned) || n % sizeof(unsigned))
-		return -EINVAL;
+	if (lirc->send_mode == LIRC_MODE_SCANCODE) {
+		struct lirc_scancode scan;
 
-	count = n / sizeof(unsigned);
-	if (count > LIRCBUF_SIZE || count % 2 == 0)
-		return -EINVAL;
+		if (n != sizeof(scan))
+			return -EINVAL;
 
-	txbuf = memdup_user(buf, n);
-	if (IS_ERR(txbuf))
-		return PTR_ERR(txbuf);
+		if (copy_from_user(&scan, buf, sizeof(scan)))
+			return -EFAULT;
+
+		if (scan.flags || scan.keycode || scan.timestamp)
+			return -EINVAL;
+
+		raw = kmalloc_array(LIRCBUF_SIZE, sizeof(*raw), GFP_KERNEL);
+		if (!raw)
+			return -ENOMEM;
+
+		ret = ir_raw_encode_scancode(scan.rc_proto, scan.scancode,
+					     raw, LIRCBUF_SIZE);
+		if (ret < 0)
+			goto out;
+
+		count = ret;
+
+		txbuf = kmalloc_array(count, sizeof(unsigned int), GFP_KERNEL);
+		if (!txbuf) {
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		for (i = 0; i < count; i++)
+			/* Convert from NS to US */
+			txbuf[i] = DIV_ROUND_UP(raw[i].duration, 1000);
+	} else {
+		if (n < sizeof(unsigned int) || n % sizeof(unsigned int))
+			return -EINVAL;
+
+		count = n / sizeof(unsigned int);
+		if (count > LIRCBUF_SIZE || count % 2 == 0)
+			return -EINVAL;
+
+		txbuf = memdup_user(buf, n);
+		if (IS_ERR(txbuf))
+			return PTR_ERR(txbuf);
+	}
 
 	dev = lirc->dev;
 	if (!dev) {
@@ -156,24 +191,30 @@ static ssize_t ir_lirc_transmit_ir(struct file *file, const char __user *buf,
 	if (ret < 0)
 		goto out;
 
-	for (duration = i = 0; i < ret; i++)
-		duration += txbuf[i];
-
-	ret *= sizeof(unsigned int);
-
-	/*
-	 * The lircd gap calculation expects the write function to
-	 * wait for the actual IR signal to be transmitted before
-	 * returning.
-	 */
-	towait = ktime_us_delta(ktime_add_us(start, duration), ktime_get());
-	if (towait > 0) {
-		set_current_state(TASK_INTERRUPTIBLE);
-		schedule_timeout(usecs_to_jiffies(towait));
+	if (lirc->send_mode == LIRC_MODE_SCANCODE) {
+		ret = n;
+	} else {
+		for (duration = i = 0; i < ret; i++)
+			duration += txbuf[i];
+
+		ret *= sizeof(unsigned int);
+
+		/*
+		 * The lircd gap calculation expects the write function to
+		 * wait for the actual IR signal to be transmitted before
+		 * returning.
+		 */
+		towait = ktime_us_delta(ktime_add_us(start, duration),
+					ktime_get());
+		if (towait > 0) {
+			set_current_state(TASK_INTERRUPTIBLE);
+			schedule_timeout(usecs_to_jiffies(towait));
+		}
 	}
 
 out:
 	kfree(txbuf);
+	kfree(raw);
 	return ret;
 }
 
@@ -202,20 +243,22 @@ static long ir_lirc_ioctl(struct file *filep, unsigned int cmd,
 
 	switch (cmd) {
 
-	/* legacy support */
+	/* mode support */
 	case LIRC_GET_SEND_MODE:
 		if (!dev->tx_ir)
 			return -ENOTTY;
 
-		val = LIRC_MODE_PULSE;
+		val = lirc->send_mode;
 		break;
 
 	case LIRC_SET_SEND_MODE:
 		if (!dev->tx_ir)
 			return -ENOTTY;
 
-		if (val != LIRC_MODE_PULSE)
+		if (!(val == LIRC_MODE_PULSE || val == LIRC_MODE_SCANCODE))
 			return -EINVAL;
+
+		lirc->send_mode = val;
 		return 0;
 
 	/* TX settings */
@@ -361,7 +404,7 @@ static int ir_lirc_register(struct rc_dev *dev)
 	}
 
 	if (dev->tx_ir) {
-		features |= LIRC_CAN_SEND_PULSE;
+		features |= LIRC_CAN_SEND_PULSE | LIRC_CAN_SEND_SCANCODE;
 		if (dev->s_tx_mask)
 			features |= LIRC_CAN_SET_TRANSMITTER_MASK;
 		if (dev->s_tx_carrier)
@@ -399,6 +442,8 @@ static int ir_lirc_register(struct rc_dev *dev)
 	if (rc < 0)
 		goto out;
 
+	dev->raw->lirc.send_mode = LIRC_MODE_PULSE;
+
 	dev->raw->lirc.ldev = ldev;
 	dev->raw->lirc.dev = dev;
 	return 0;
diff --git a/drivers/media/rc/rc-core-priv.h b/drivers/media/rc/rc-core-priv.h
index 564d6e13585e..d10fc998e1db 100644
--- a/drivers/media/rc/rc-core-priv.h
+++ b/drivers/media/rc/rc-core-priv.h
@@ -103,7 +103,7 @@ struct ir_raw_event_ctrl {
 		u64 gap_duration;
 		bool gap;
 		bool send_timeout_reports;
-
+		u8 send_mode;
 	} lirc;
 	struct xmp_dec {
 		int state;
diff --git a/include/media/rc-map.h b/include/media/rc-map.h
index 72197cb43781..7046734b3895 100644
--- a/include/media/rc-map.h
+++ b/include/media/rc-map.h
@@ -10,59 +10,7 @@
  */
 
 #include <linux/input.h>
-
-/**
- * enum rc_proto - the Remote Controller protocol
- *
- * @RC_PROTO_UNKNOWN: Protocol not known
- * @RC_PROTO_OTHER: Protocol known but proprietary
- * @RC_PROTO_RC5: Philips RC5 protocol
- * @RC_PROTO_RC5X_20: Philips RC5x 20 bit protocol
- * @RC_PROTO_RC5_SZ: StreamZap variant of RC5
- * @RC_PROTO_JVC: JVC protocol
- * @RC_PROTO_SONY12: Sony 12 bit protocol
- * @RC_PROTO_SONY15: Sony 15 bit protocol
- * @RC_PROTO_SONY20: Sony 20 bit protocol
- * @RC_PROTO_NEC: NEC protocol
- * @RC_PROTO_NECX: Extended NEC protocol
- * @RC_PROTO_NEC32: NEC 32 bit protocol
- * @RC_PROTO_SANYO: Sanyo protocol
- * @RC_PROTO_MCIR2_KBD: RC6-ish MCE keyboard
- * @RC_PROTO_MCIR2_MSE: RC6-ish MCE mouse
- * @RC_PROTO_RC6_0: Philips RC6-0-16 protocol
- * @RC_PROTO_RC6_6A_20: Philips RC6-6A-20 protocol
- * @RC_PROTO_RC6_6A_24: Philips RC6-6A-24 protocol
- * @RC_PROTO_RC6_6A_32: Philips RC6-6A-32 protocol
- * @RC_PROTO_RC6_MCE: MCE (Philips RC6-6A-32 subtype) protocol
- * @RC_PROTO_SHARP: Sharp protocol
- * @RC_PROTO_XMP: XMP protocol
- * @RC_PROTO_CEC: CEC protocol
- */
-enum rc_proto {
-	RC_PROTO_UNKNOWN	= 0,
-	RC_PROTO_OTHER		= 1,
-	RC_PROTO_RC5		= 2,
-	RC_PROTO_RC5X_20	= 3,
-	RC_PROTO_RC5_SZ		= 4,
-	RC_PROTO_JVC		= 5,
-	RC_PROTO_SONY12		= 6,
-	RC_PROTO_SONY15		= 7,
-	RC_PROTO_SONY20		= 8,
-	RC_PROTO_NEC		= 9,
-	RC_PROTO_NECX		= 10,
-	RC_PROTO_NEC32		= 11,
-	RC_PROTO_SANYO		= 12,
-	RC_PROTO_MCIR2_KBD	= 13,
-	RC_PROTO_MCIR2_MSE	= 14,
-	RC_PROTO_RC6_0		= 15,
-	RC_PROTO_RC6_6A_20	= 16,
-	RC_PROTO_RC6_6A_24	= 17,
-	RC_PROTO_RC6_6A_32	= 18,
-	RC_PROTO_RC6_MCE	= 19,
-	RC_PROTO_SHARP		= 20,
-	RC_PROTO_XMP		= 21,
-	RC_PROTO_CEC		= 22,
-};
+#include <uapi/linux/lirc.h>
 
 #define RC_PROTO_BIT_NONE		0ULL
 #define RC_PROTO_BIT_UNKNOWN		BIT_ULL(RC_PROTO_UNKNOWN)
diff --git a/include/uapi/linux/lirc.h b/include/uapi/linux/lirc.h
index c3aef4316fbf..4fe580d36e41 100644
--- a/include/uapi/linux/lirc.h
+++ b/include/uapi/linux/lirc.h
@@ -47,12 +47,14 @@
 #define LIRC_MODE_RAW                  0x00000001
 #define LIRC_MODE_PULSE                0x00000002
 #define LIRC_MODE_MODE2                0x00000004
+#define LIRC_MODE_SCANCODE             0x00000008
 #define LIRC_MODE_LIRCCODE             0x00000010
 
 
 #define LIRC_CAN_SEND_RAW              LIRC_MODE2SEND(LIRC_MODE_RAW)
 #define LIRC_CAN_SEND_PULSE            LIRC_MODE2SEND(LIRC_MODE_PULSE)
 #define LIRC_CAN_SEND_MODE2            LIRC_MODE2SEND(LIRC_MODE_MODE2)
+#define LIRC_CAN_SEND_SCANCODE         LIRC_MODE2SEND(LIRC_MODE_SCANCODE)
 #define LIRC_CAN_SEND_LIRCCODE         LIRC_MODE2SEND(LIRC_MODE_LIRCCODE)
 
 #define LIRC_CAN_SEND_MASK             0x0000003f
@@ -64,6 +66,7 @@
 #define LIRC_CAN_REC_RAW               LIRC_MODE2REC(LIRC_MODE_RAW)
 #define LIRC_CAN_REC_PULSE             LIRC_MODE2REC(LIRC_MODE_PULSE)
 #define LIRC_CAN_REC_MODE2             LIRC_MODE2REC(LIRC_MODE_MODE2)
+#define LIRC_CAN_REC_SCANCODE          LIRC_MODE2REC(LIRC_MODE_SCANCODE)
 #define LIRC_CAN_REC_LIRCCODE          LIRC_MODE2REC(LIRC_MODE_LIRCCODE)
 
 #define LIRC_CAN_REC_MASK              LIRC_MODE2REC(LIRC_CAN_SEND_MASK)
@@ -131,4 +134,83 @@
 
 #define LIRC_SET_WIDEBAND_RECEIVER     _IOW('i', 0x00000023, __u32)
 
+/*
+ * struct lirc_scancode - decoded scancode with protocol for use with
+ *	LIRC_MODE_SCANCODE
+ *
+ * @timestamp: Timestamp in nanoseconds using CLOCK_MONOTONIC when IR
+ *	was decoded.
+ * @flags: should be 0 for transmit. When receiving scancodes,
+ *	LIRC_SCANCODE_FLAG_TOGGLE or LIRC_SCANCODE_FLAG_REPEAT can be set
+ *	depending on the protocol
+ * @rc_proto: see enum rc_proto
+ * @keycode: the translated keycode. Set to 0 for transmit.
+ * @scancode: the scancode received or to be sent
+ */
+struct lirc_scancode {
+	__u64	timestamp;
+	__u16	flags;
+	__u16	rc_proto;
+	__u32	keycode;
+	__u64	scancode;
+};
+
+/* Set if the toggle bit of rc-5 or rc-6 is enabled */
+#define LIRC_SCANCODE_FLAG_TOGGLE	1
+/* Set if this is a nec or sanyo repeat */
+#define LIRC_SCANCODE_FLAG_REPEAT	2
+
+/**
+ * enum rc_proto - the Remote Controller protocol
+ *
+ * @RC_PROTO_UNKNOWN: Protocol not known
+ * @RC_PROTO_OTHER: Protocol known but proprietary
+ * @RC_PROTO_RC5: Philips RC5 protocol
+ * @RC_PROTO_RC5X_20: Philips RC5x 20 bit protocol
+ * @RC_PROTO_RC5_SZ: StreamZap variant of RC5
+ * @RC_PROTO_JVC: JVC protocol
+ * @RC_PROTO_SONY12: Sony 12 bit protocol
+ * @RC_PROTO_SONY15: Sony 15 bit protocol
+ * @RC_PROTO_SONY20: Sony 20 bit protocol
+ * @RC_PROTO_NEC: NEC protocol
+ * @RC_PROTO_NECX: Extended NEC protocol
+ * @RC_PROTO_NEC32: NEC 32 bit protocol
+ * @RC_PROTO_SANYO: Sanyo protocol
+ * @RC_PROTO_MCIR2_KBD: RC6-ish MCE keyboard
+ * @RC_PROTO_MCIR2_MSE: RC6-ish MCE mouse
+ * @RC_PROTO_RC6_0: Philips RC6-0-16 protocol
+ * @RC_PROTO_RC6_6A_20: Philips RC6-6A-20 protocol
+ * @RC_PROTO_RC6_6A_24: Philips RC6-6A-24 protocol
+ * @RC_PROTO_RC6_6A_32: Philips RC6-6A-32 protocol
+ * @RC_PROTO_RC6_MCE: MCE (Philips RC6-6A-32 subtype) protocol
+ * @RC_PROTO_SHARP: Sharp protocol
+ * @RC_PROTO_XMP: XMP protocol
+ * @RC_PROTO_CEC: CEC protocol
+ */
+enum rc_proto {
+	RC_PROTO_UNKNOWN	= 0,
+	RC_PROTO_OTHER		= 1,
+	RC_PROTO_RC5		= 2,
+	RC_PROTO_RC5X_20	= 3,
+	RC_PROTO_RC5_SZ		= 4,
+	RC_PROTO_JVC		= 5,
+	RC_PROTO_SONY12		= 6,
+	RC_PROTO_SONY15		= 7,
+	RC_PROTO_SONY20		= 8,
+	RC_PROTO_NEC		= 9,
+	RC_PROTO_NECX		= 10,
+	RC_PROTO_NEC32		= 11,
+	RC_PROTO_SANYO		= 12,
+	RC_PROTO_MCIR2_KBD	= 13,
+	RC_PROTO_MCIR2_MSE	= 14,
+	RC_PROTO_RC6_0		= 15,
+	RC_PROTO_RC6_6A_20	= 16,
+	RC_PROTO_RC6_6A_24	= 17,
+	RC_PROTO_RC6_6A_32	= 18,
+	RC_PROTO_RC6_MCE	= 19,
+	RC_PROTO_SHARP		= 20,
+	RC_PROTO_XMP		= 21,
+	RC_PROTO_CEC		= 22,
+};
+
 #endif
-- 
cgit v1.2.3


From 033ddf12bcf5326b93bd604f50a7474a434a35f9 Mon Sep 17 00:00:00 2001
From: Jens Wiklander <jens.wiklander@linaro.org>
Date: Wed, 29 Nov 2017 14:48:26 +0200
Subject: tee: add register user memory

Add a new ioctl that allows users to register their own buffers as shared memory.

Signed-off-by: Volodymyr Babchuk <vlad.babchuk@gmail.com>
[jw: moved tee_shm_is_registered() declaration]
[jw: added space after __tee_shm_alloc() implementation]
Signed-off-by: Jens Wiklander <jens.wiklander@linaro.org>
---
 drivers/tee/tee_core.c   |  41 +++++++++-
 drivers/tee/tee_shm.c    | 206 +++++++++++++++++++++++++++++++++++++++++------
 include/linux/tee_drv.h  |  47 ++++++++++-
 include/uapi/linux/tee.h |  30 +++++++
 4 files changed, 294 insertions(+), 30 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/tee/tee_core.c b/drivers/tee/tee_core.c
index 58a5009eacc3..295910f5cdd0 100644
--- a/drivers/tee/tee_core.c
+++ b/drivers/tee/tee_core.c
@@ -114,8 +114,6 @@ static int tee_ioctl_shm_alloc(struct tee_context *ctx,
 	if (data.flags)
 		return -EINVAL;
 
-	data.id = -1;
-
 	shm = tee_shm_alloc(ctx, data.size, TEE_SHM_MAPPED | TEE_SHM_DMA_BUF);
 	if (IS_ERR(shm))
 		return PTR_ERR(shm);
@@ -138,6 +136,43 @@ static int tee_ioctl_shm_alloc(struct tee_context *ctx,
 	return ret;
 }
 
+static int
+tee_ioctl_shm_register(struct tee_context *ctx,
+		       struct tee_ioctl_shm_register_data __user *udata)
+{
+	long ret;
+	struct tee_ioctl_shm_register_data data;
+	struct tee_shm *shm;
+
+	if (copy_from_user(&data, udata, sizeof(data)))
+		return -EFAULT;
+
+	/* Currently no input flags are supported */
+	if (data.flags)
+		return -EINVAL;
+
+	shm = tee_shm_register(ctx, data.addr, data.length,
+			       TEE_SHM_DMA_BUF | TEE_SHM_USER_MAPPED);
+	if (IS_ERR(shm))
+		return PTR_ERR(shm);
+
+	data.id = shm->id;
+	data.flags = shm->flags;
+	data.length = shm->size;
+
+	if (copy_to_user(udata, &data, sizeof(data)))
+		ret = -EFAULT;
+	else
+		ret = tee_shm_get_fd(shm);
+	/*
+	 * When user space closes the file descriptor the shared memory
+	 * should be freed or if tee_shm_get_fd() failed then it will
+	 * be freed immediately.
+	 */
+	tee_shm_put(shm);
+	return ret;
+}
+
 static int params_from_user(struct tee_context *ctx, struct tee_param *params,
 			    size_t num_params,
 			    struct tee_ioctl_param __user *uparams)
@@ -586,6 +621,8 @@ static long tee_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		return tee_ioctl_version(ctx, uarg);
 	case TEE_IOC_SHM_ALLOC:
 		return tee_ioctl_shm_alloc(ctx, uarg);
+	case TEE_IOC_SHM_REGISTER:
+		return tee_ioctl_shm_register(ctx, uarg);
 	case TEE_IOC_OPEN_SESSION:
 		return tee_ioctl_open_session(ctx, uarg);
 	case TEE_IOC_INVOKE:
diff --git a/drivers/tee/tee_shm.c b/drivers/tee/tee_shm.c
index fdda89e917f7..11d11a46d86e 100644
--- a/drivers/tee/tee_shm.c
+++ b/drivers/tee/tee_shm.c
@@ -23,7 +23,6 @@
 static void tee_shm_release(struct tee_shm *shm)
 {
 	struct tee_device *teedev = shm->teedev;
-	struct tee_shm_pool_mgr *poolm;
 
 	mutex_lock(&teedev->mutex);
 	idr_remove(&teedev->idr, shm->id);
@@ -31,12 +30,29 @@ static void tee_shm_release(struct tee_shm *shm)
 		list_del(&shm->link);
 	mutex_unlock(&teedev->mutex);
 
-	if (shm->flags & TEE_SHM_DMA_BUF)
-		poolm = teedev->pool->dma_buf_mgr;
-	else
-		poolm = teedev->pool->private_mgr;
+	if (shm->flags & TEE_SHM_POOL) {
+		struct tee_shm_pool_mgr *poolm;
+
+		if (shm->flags & TEE_SHM_DMA_BUF)
+			poolm = teedev->pool->dma_buf_mgr;
+		else
+			poolm = teedev->pool->private_mgr;
+
+		poolm->ops->free(poolm, shm);
+	} else if (shm->flags & TEE_SHM_REGISTER) {
+		size_t n;
+		int rc = teedev->desc->ops->shm_unregister(shm->ctx, shm);
+
+		if (rc)
+			dev_err(teedev->dev.parent,
+				"unregister shm %p failed: %d", shm, rc);
+
+		for (n = 0; n < shm->num_pages; n++)
+			put_page(shm->pages[n]);
+
+		kfree(shm->pages);
+	}
 
-	poolm->ops->free(poolm, shm);
 	kfree(shm);
 
 	tee_device_put(teedev);
@@ -76,6 +92,10 @@ static int tee_shm_op_mmap(struct dma_buf *dmabuf, struct vm_area_struct *vma)
 	struct tee_shm *shm = dmabuf->priv;
 	size_t size = vma->vm_end - vma->vm_start;
 
+	/* Refuse sharing shared memory provided by application */
+	if (shm->flags & TEE_SHM_REGISTER)
+		return -EINVAL;
+
 	return remap_pfn_range(vma, vma->vm_start, shm->paddr >> PAGE_SHIFT,
 			       size, vma->vm_page_prot);
 }
@@ -89,26 +109,20 @@ static const struct dma_buf_ops tee_shm_dma_buf_ops = {
 	.mmap = tee_shm_op_mmap,
 };
 
-/**
- * tee_shm_alloc() - Allocate shared memory
- * @ctx:	Context that allocates the shared memory
- * @size:	Requested size of shared memory
- * @flags:	Flags setting properties for the requested shared memory.
- *
- * Memory allocated as global shared memory is automatically freed when the
- * TEE file pointer is closed. The @flags field uses the bits defined by
- * TEE_SHM_* in <linux/tee_drv.h>. TEE_SHM_MAPPED must currently always be
- * set. If TEE_SHM_DMA_BUF global shared memory will be allocated and
- * associated with a dma-buf handle, else driver private memory.
- */
-struct tee_shm *tee_shm_alloc(struct tee_context *ctx, size_t size, u32 flags)
+struct tee_shm *__tee_shm_alloc(struct tee_context *ctx,
+				struct tee_device *teedev,
+				size_t size, u32 flags)
 {
-	struct tee_device *teedev = ctx->teedev;
 	struct tee_shm_pool_mgr *poolm = NULL;
 	struct tee_shm *shm;
 	void *ret;
 	int rc;
 
+	if (ctx && ctx->teedev != teedev) {
+		dev_err(teedev->dev.parent, "ctx and teedev mismatch\n");
+		return ERR_PTR(-EINVAL);
+	}
+
 	if (!(flags & TEE_SHM_MAPPED)) {
 		dev_err(teedev->dev.parent,
 			"only mapped allocations supported\n");
@@ -135,7 +149,7 @@ struct tee_shm *tee_shm_alloc(struct tee_context *ctx, size_t size, u32 flags)
 		goto err_dev_put;
 	}
 
-	shm->flags = flags;
+	shm->flags = flags | TEE_SHM_POOL;
 	shm->teedev = teedev;
 	shm->ctx = ctx;
 	if (flags & TEE_SHM_DMA_BUF)
@@ -171,9 +185,12 @@ struct tee_shm *tee_shm_alloc(struct tee_context *ctx, size_t size, u32 flags)
 			goto err_rem;
 		}
 	}
-	mutex_lock(&teedev->mutex);
-	list_add_tail(&shm->link, &ctx->list_shm);
-	mutex_unlock(&teedev->mutex);
+
+	if (ctx) {
+		mutex_lock(&teedev->mutex);
+		list_add_tail(&shm->link, &ctx->list_shm);
+		mutex_unlock(&teedev->mutex);
+	}
 
 	return shm;
 err_rem:
@@ -188,8 +205,140 @@ err_dev_put:
 	tee_device_put(teedev);
 	return ret;
 }
+
+/**
+ * tee_shm_alloc() - Allocate shared memory
+ * @ctx:	Context that allocates the shared memory
+ * @size:	Requested size of shared memory
+ * @flags:	Flags setting properties for the requested shared memory.
+ *
+ * Memory allocated as global shared memory is automatically freed when the
+ * TEE file pointer is closed. The @flags field uses the bits defined by
+ * TEE_SHM_* in <linux/tee_drv.h>. TEE_SHM_MAPPED must currently always be
+ * set. If TEE_SHM_DMA_BUF global shared memory will be allocated and
+ * associated with a dma-buf handle, else driver private memory.
+ */
+struct tee_shm *tee_shm_alloc(struct tee_context *ctx, size_t size, u32 flags)
+{
+	return __tee_shm_alloc(ctx, ctx->teedev, size, flags);
+}
 EXPORT_SYMBOL_GPL(tee_shm_alloc);
 
+struct tee_shm *tee_shm_priv_alloc(struct tee_device *teedev, size_t size)
+{
+	return __tee_shm_alloc(NULL, teedev, size, TEE_SHM_MAPPED);
+}
+EXPORT_SYMBOL_GPL(tee_shm_priv_alloc);
+
+struct tee_shm *tee_shm_register(struct tee_context *ctx, unsigned long addr,
+				 size_t length, u32 flags)
+{
+	struct tee_device *teedev = ctx->teedev;
+	const u32 req_flags = TEE_SHM_DMA_BUF | TEE_SHM_USER_MAPPED;
+	struct tee_shm *shm;
+	void *ret;
+	int rc;
+	int num_pages;
+	unsigned long start;
+
+	if (flags != req_flags)
+		return ERR_PTR(-ENOTSUPP);
+
+	if (!tee_device_get(teedev))
+		return ERR_PTR(-EINVAL);
+
+	if (!teedev->desc->ops->shm_register ||
+	    !teedev->desc->ops->shm_unregister) {
+		tee_device_put(teedev);
+		return ERR_PTR(-ENOTSUPP);
+	}
+
+	shm = kzalloc(sizeof(*shm), GFP_KERNEL);
+	if (!shm) {
+		ret = ERR_PTR(-ENOMEM);
+		goto err;
+	}
+
+	shm->flags = flags | TEE_SHM_REGISTER;
+	shm->teedev = teedev;
+	shm->ctx = ctx;
+	shm->id = -1;
+	start = rounddown(addr, PAGE_SIZE);
+	shm->offset = addr - start;
+	shm->size = length;
+	num_pages = (roundup(addr + length, PAGE_SIZE) - start) / PAGE_SIZE;
+	shm->pages = kcalloc(num_pages, sizeof(*shm->pages), GFP_KERNEL);
+	if (!shm->pages) {
+		ret = ERR_PTR(-ENOMEM);
+		goto err;
+	}
+
+	rc = get_user_pages_fast(start, num_pages, 1, shm->pages);
+	if (rc > 0)
+		shm->num_pages = rc;
+	if (rc != num_pages) {
+		if (rc > 0)
+			rc = -ENOMEM;
+		ret = ERR_PTR(rc);
+		goto err;
+	}
+
+	mutex_lock(&teedev->mutex);
+	shm->id = idr_alloc(&teedev->idr, shm, 1, 0, GFP_KERNEL);
+	mutex_unlock(&teedev->mutex);
+
+	if (shm->id < 0) {
+		ret = ERR_PTR(shm->id);
+		goto err;
+	}
+
+	rc = teedev->desc->ops->shm_register(ctx, shm, shm->pages,
+					     shm->num_pages);
+	if (rc) {
+		ret = ERR_PTR(rc);
+		goto err;
+	}
+
+	if (flags & TEE_SHM_DMA_BUF) {
+		DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
+
+		exp_info.ops = &tee_shm_dma_buf_ops;
+		exp_info.size = shm->size;
+		exp_info.flags = O_RDWR;
+		exp_info.priv = shm;
+
+		shm->dmabuf = dma_buf_export(&exp_info);
+		if (IS_ERR(shm->dmabuf)) {
+			ret = ERR_CAST(shm->dmabuf);
+			teedev->desc->ops->shm_unregister(ctx, shm);
+			goto err;
+		}
+	}
+
+	mutex_lock(&teedev->mutex);
+	list_add_tail(&shm->link, &ctx->list_shm);
+	mutex_unlock(&teedev->mutex);
+
+	return shm;
+err:
+	if (shm) {
+		size_t n;
+
+		if (shm->id >= 0) {
+			mutex_lock(&teedev->mutex);
+			idr_remove(&teedev->idr, shm->id);
+			mutex_unlock(&teedev->mutex);
+		}
+		for (n = 0; n < shm->num_pages; n++)
+			put_page(shm->pages[n]);
+		kfree(shm->pages);
+	}
+	kfree(shm);
+	tee_device_put(teedev);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(tee_shm_register);
+
 /**
  * tee_shm_get_fd() - Increase reference count and return file descriptor
  * @shm:	Shared memory handle
@@ -197,10 +346,9 @@ EXPORT_SYMBOL_GPL(tee_shm_alloc);
  */
 int tee_shm_get_fd(struct tee_shm *shm)
 {
-	u32 req_flags = TEE_SHM_MAPPED | TEE_SHM_DMA_BUF;
 	int fd;
 
-	if ((shm->flags & req_flags) != req_flags)
+	if (!(shm->flags & TEE_SHM_DMA_BUF))
 		return -EINVAL;
 
 	fd = dma_buf_fd(shm->dmabuf, O_CLOEXEC);
@@ -238,6 +386,8 @@ EXPORT_SYMBOL_GPL(tee_shm_free);
  */
 int tee_shm_va2pa(struct tee_shm *shm, void *va, phys_addr_t *pa)
 {
+	if (!(shm->flags & TEE_SHM_MAPPED))
+		return -EINVAL;
 	/* Check that we're in the range of the shm */
 	if ((char *)va < (char *)shm->kaddr)
 		return -EINVAL;
@@ -258,6 +408,8 @@ EXPORT_SYMBOL_GPL(tee_shm_va2pa);
  */
 int tee_shm_pa2va(struct tee_shm *shm, phys_addr_t pa, void **va)
 {
+	if (!(shm->flags & TEE_SHM_MAPPED))
+		return -EINVAL;
 	/* Check that we're in the range of the shm */
 	if (pa < shm->paddr)
 		return -EINVAL;
@@ -284,6 +436,8 @@ EXPORT_SYMBOL_GPL(tee_shm_pa2va);
  */
 void *tee_shm_get_va(struct tee_shm *shm, size_t offs)
 {
+	if (!(shm->flags & TEE_SHM_MAPPED))
+		return ERR_PTR(-EINVAL);
 	if (offs >= shm->size)
 		return ERR_PTR(-EINVAL);
 	return (char *)shm->kaddr + offs;
diff --git a/include/linux/tee_drv.h b/include/linux/tee_drv.h
index e9be4a45ff3e..7c8495607b99 100644
--- a/include/linux/tee_drv.h
+++ b/include/linux/tee_drv.h
@@ -25,8 +25,12 @@
  * specific TEE driver.
  */
 
-#define TEE_SHM_MAPPED		0x1	/* Memory mapped by the kernel */
-#define TEE_SHM_DMA_BUF		0x2	/* Memory with dma-buf handle */
+#define TEE_SHM_MAPPED		BIT(0)	/* Memory mapped by the kernel */
+#define TEE_SHM_DMA_BUF		BIT(1)	/* Memory with dma-buf handle */
+#define TEE_SHM_EXT_DMA_BUF	BIT(2)	/* Memory with dma-buf handle */
+#define TEE_SHM_REGISTER	BIT(3)  /* Memory registered in secure world */
+#define TEE_SHM_USER_MAPPED	BIT(4)  /* Memory mapped in user space */
+#define TEE_SHM_POOL		BIT(5)  /* Memory allocated from pool */
 
 struct device;
 struct tee_device;
@@ -76,6 +80,8 @@ struct tee_param {
  * @cancel_req:		request cancel of an ongoing invoke or open
  * @supp_revc:		called for supplicant to get a command
  * @supp_send:		called for supplicant to send a response
+ * @shm_register:	register shared memory buffer in TEE
+ * @shm_unregister:	unregister shared memory buffer in TEE
  */
 struct tee_driver_ops {
 	void (*get_version)(struct tee_device *teedev,
@@ -94,6 +100,9 @@ struct tee_driver_ops {
 			 struct tee_param *param);
 	int (*supp_send)(struct tee_context *ctx, u32 ret, u32 num_params,
 			 struct tee_param *param);
+	int (*shm_register)(struct tee_context *ctx, struct tee_shm *shm,
+			    struct page **pages, size_t num_pages);
+	int (*shm_unregister)(struct tee_context *ctx, struct tee_shm *shm);
 };
 
 /**
@@ -301,6 +310,40 @@ void *tee_get_drvdata(struct tee_device *teedev);
  */
 struct tee_shm *tee_shm_alloc(struct tee_context *ctx, size_t size, u32 flags);
 
+/**
+ * tee_shm_priv_alloc() - Allocate shared memory privately
+ * @teedev:	Device that allocates the shared memory
+ * @size:	Requested size of shared memory
+ *
+ * Allocates shared memory buffer that is not associated with any client
+ * context. Such buffers are owned by TEE driver and used for internal calls.
+ *
+ * @returns a pointer to 'struct tee_shm'
+ */
+struct tee_shm *tee_shm_priv_alloc(struct tee_device *teedev, size_t size);
+
+/**
+ * tee_shm_register() - Register shared memory buffer
+ * @ctx:	Context that registers the shared memory
+ * @addr:	Address in userspace of the shared buffer
+ * @length:	Length of the shared buffer
+ * @flags:	Flags setting properties for the requested shared memory.
+ *
+ * @returns a pointer to 'struct tee_shm'
+ */
+struct tee_shm *tee_shm_register(struct tee_context *ctx, unsigned long addr,
+				 size_t length, u32 flags);
+
+/**
+ * tee_shm_is_registered() - Check if shared memory object is registered in TEE
+ * @shm:	Shared memory handle
+ * @returns true if object is registered in TEE
+ */
+static inline bool tee_shm_is_registered(struct tee_shm *shm)
+{
+	return shm && (shm->flags & TEE_SHM_REGISTER);
+}
+
 /**
  * tee_shm_free() - Free shared memory
  * @shm:	Handle to shared memory to free
diff --git a/include/uapi/linux/tee.h b/include/uapi/linux/tee.h
index 688782e90140..d41a07afe3fc 100644
--- a/include/uapi/linux/tee.h
+++ b/include/uapi/linux/tee.h
@@ -50,6 +50,7 @@
 
 #define TEE_GEN_CAP_GP		(1 << 0)/* GlobalPlatform compliant TEE */
 #define TEE_GEN_CAP_PRIVILEGED	(1 << 1)/* Privileged device (for supplicant) */
+#define TEE_GEN_CAP_REG_MEM	(1 << 2)/* Supports registering shared memory */
 
 /*
  * TEE Implementation ID
@@ -332,6 +333,35 @@ struct tee_iocl_supp_send_arg {
 #define TEE_IOC_SUPPL_SEND	_IOR(TEE_IOC_MAGIC, TEE_IOC_BASE + 7, \
 				     struct tee_ioctl_buf_data)
 
+/**
+ * struct tee_ioctl_shm_register_data - Shared memory register argument
+ * @addr:      [in] Start address of shared memory to register
+ * @length:    [in/out] Length of shared memory to register
+ * @flags:     [in/out] Flags to/from registration.
+ * @id:                [out] Identifier of the shared memory
+ *
+ * The flags field should currently be zero as input. Updated by the call
+ * with actual flags as defined by TEE_IOCTL_SHM_* above.
+ * This structure is used as argument for TEE_IOC_SHM_REGISTER below.
+ */
+struct tee_ioctl_shm_register_data {
+	__u64 addr;
+	__u64 length;
+	__u32 flags;
+	__s32 id;
+};
+
+/**
+ * TEE_IOC_SHM_REGISTER - Register shared memory argument
+ *
+ * Registers shared memory between the user space process and secure OS.
+ *
+ * Returns a file descriptor on success or < 0 on failure
+ *
+ * The shared memory is unregistered when the descriptor is closed.
+ */
+#define TEE_IOC_SHM_REGISTER   _IOWR(TEE_IOC_MAGIC, TEE_IOC_BASE + 9, \
+				     struct tee_ioctl_shm_register_data)
 /*
  * Five syscalls are used when communicating with the TEE driver.
  * open(): opens the device associated with the driver
-- 
cgit v1.2.3


From 7db7d9f369a47e1a46f93c320b45cb89e81723e7 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sun, 19 Nov 2017 15:05:11 +0100
Subject: batman-adv: Add SPDX license identifier above copyright header

The "Linux kernel licensing rules" require that each file has an SPDX
license identifier as first line (and sometimes as second line).

The FSFE REUSE practices [1] would also require the same tags but have no
restrictions on the placement in the source file. Using the "Linux kernel
licensing rules" is therefore also fulfilling the FSFE REUSE practices
requirements at the same time.

[1] https://reuse.software/practices/

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 include/uapi/linux/batman_adv.h        | 1 +
 net/batman-adv/Makefile                | 2 +-
 net/batman-adv/bat_algo.c              | 1 +
 net/batman-adv/bat_algo.h              | 1 +
 net/batman-adv/bat_iv_ogm.c            | 1 +
 net/batman-adv/bat_iv_ogm.h            | 1 +
 net/batman-adv/bat_v.c                 | 1 +
 net/batman-adv/bat_v.h                 | 1 +
 net/batman-adv/bat_v_elp.c             | 1 +
 net/batman-adv/bat_v_elp.h             | 1 +
 net/batman-adv/bat_v_ogm.c             | 1 +
 net/batman-adv/bat_v_ogm.h             | 1 +
 net/batman-adv/bitarray.c              | 1 +
 net/batman-adv/bitarray.h              | 1 +
 net/batman-adv/bridge_loop_avoidance.c | 1 +
 net/batman-adv/bridge_loop_avoidance.h | 1 +
 net/batman-adv/debugfs.c               | 1 +
 net/batman-adv/debugfs.h               | 1 +
 net/batman-adv/distributed-arp-table.c | 1 +
 net/batman-adv/distributed-arp-table.h | 1 +
 net/batman-adv/fragmentation.c         | 1 +
 net/batman-adv/fragmentation.h         | 1 +
 net/batman-adv/gateway_client.c        | 1 +
 net/batman-adv/gateway_client.h        | 1 +
 net/batman-adv/gateway_common.c        | 1 +
 net/batman-adv/gateway_common.h        | 1 +
 net/batman-adv/hard-interface.c        | 1 +
 net/batman-adv/hard-interface.h        | 1 +
 net/batman-adv/hash.c                  | 1 +
 net/batman-adv/hash.h                  | 1 +
 net/batman-adv/icmp_socket.c           | 1 +
 net/batman-adv/icmp_socket.h           | 1 +
 net/batman-adv/log.c                   | 1 +
 net/batman-adv/log.h                   | 1 +
 net/batman-adv/main.c                  | 1 +
 net/batman-adv/main.h                  | 1 +
 net/batman-adv/multicast.c             | 1 +
 net/batman-adv/multicast.h             | 1 +
 net/batman-adv/netlink.c               | 1 +
 net/batman-adv/netlink.h               | 1 +
 net/batman-adv/network-coding.c        | 1 +
 net/batman-adv/network-coding.h        | 1 +
 net/batman-adv/originator.c            | 1 +
 net/batman-adv/originator.h            | 1 +
 net/batman-adv/packet.h                | 1 +
 net/batman-adv/routing.c               | 1 +
 net/batman-adv/routing.h               | 1 +
 net/batman-adv/send.c                  | 1 +
 net/batman-adv/send.h                  | 1 +
 net/batman-adv/soft-interface.c        | 1 +
 net/batman-adv/soft-interface.h        | 1 +
 net/batman-adv/sysfs.c                 | 1 +
 net/batman-adv/sysfs.h                 | 1 +
 net/batman-adv/tp_meter.c              | 1 +
 net/batman-adv/tp_meter.h              | 1 +
 net/batman-adv/translation-table.c     | 1 +
 net/batman-adv/translation-table.h     | 1 +
 net/batman-adv/tvlv.c                  | 1 +
 net/batman-adv/tvlv.h                  | 1 +
 net/batman-adv/types.h                 | 1 +
 60 files changed, 60 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h
index efd641c8a5d6..fb4533826163 100644
--- a/include/uapi/linux/batman_adv.h
+++ b/include/uapi/linux/batman_adv.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: ISC */
 /* Copyright (C) 2016-2017  B.A.T.M.A.N. contributors:
  *
  * Matthias Schiffer
diff --git a/net/batman-adv/Makefile b/net/batman-adv/Makefile
index 915987bc6d29..022f6e77307b 100644
--- a/net/batman-adv/Makefile
+++ b/net/batman-adv/Makefile
@@ -1,4 +1,4 @@
-#
+# SPDX-License-Identifier: GPL-2.0
 # Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
 #
 # Marek Lindner, Simon Wunderlich
diff --git a/net/batman-adv/bat_algo.c b/net/batman-adv/bat_algo.c
index 44fd073b7546..fa306b25a78b 100644
--- a/net/batman-adv/bat_algo.c
+++ b/net/batman-adv/bat_algo.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
diff --git a/net/batman-adv/bat_algo.h b/net/batman-adv/bat_algo.h
index 29f6312f9bf1..029221615ba3 100644
--- a/net/batman-adv/bat_algo.h
+++ b/net/batman-adv/bat_algo.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /* Copyright (C) 2011-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Linus Lüssing
diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c
index 1b659ab652fb..bff5ec66a2e1 100644
--- a/net/batman-adv/bat_iv_ogm.c
+++ b/net/batman-adv/bat_iv_ogm.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
diff --git a/net/batman-adv/bat_iv_ogm.h b/net/batman-adv/bat_iv_ogm.h
index ae2ab526bdb1..9dc0dd5c83df 100644
--- a/net/batman-adv/bat_iv_ogm.h
+++ b/net/batman-adv/bat_iv_ogm.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c
index 341ceab8338d..16709552c21e 100644
--- a/net/batman-adv/bat_v.c
+++ b/net/batman-adv/bat_v.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* Copyright (C) 2013-2017  B.A.T.M.A.N. contributors:
  *
  * Linus Lüssing, Marek Lindner
diff --git a/net/batman-adv/bat_v.h b/net/batman-adv/bat_v.h
index dd7c4b647e6b..a17ab68bbce8 100644
--- a/net/batman-adv/bat_v.h
+++ b/net/batman-adv/bat_v.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /* Copyright (C) 2011-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Linus Lüssing
diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c
index 1de992c58b35..8375fd679db3 100644
--- a/net/batman-adv/bat_v_elp.c
+++ b/net/batman-adv/bat_v_elp.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* Copyright (C) 2011-2017  B.A.T.M.A.N. contributors:
  *
  * Linus Lüssing, Marek Lindner
diff --git a/net/batman-adv/bat_v_elp.h b/net/batman-adv/bat_v_elp.h
index 376ead280ab9..5e39d0588a48 100644
--- a/net/batman-adv/bat_v_elp.h
+++ b/net/batman-adv/bat_v_elp.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /* Copyright (C) 2013-2017  B.A.T.M.A.N. contributors:
  *
  * Linus Lüssing, Marek Lindner
diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c
index c251445a42a0..22d2bafa814a 100644
--- a/net/batman-adv/bat_v_ogm.c
+++ b/net/batman-adv/bat_v_ogm.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* Copyright (C) 2013-2017  B.A.T.M.A.N. contributors:
  *
  * Antonio Quartulli
diff --git a/net/batman-adv/bat_v_ogm.h b/net/batman-adv/bat_v_ogm.h
index 2068770b542d..6a4c14ccc3c6 100644
--- a/net/batman-adv/bat_v_ogm.h
+++ b/net/batman-adv/bat_v_ogm.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /* Copyright (C) 2013-2017  B.A.T.M.A.N. contributors:
  *
  * Antonio Quartulli
diff --git a/net/batman-adv/bitarray.c b/net/batman-adv/bitarray.c
index 2b070c7e31da..125817c389e5 100644
--- a/net/batman-adv/bitarray.c
+++ b/net/batman-adv/bitarray.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* Copyright (C) 2006-2017  B.A.T.M.A.N. contributors:
  *
  * Simon Wunderlich, Marek Lindner
diff --git a/net/batman-adv/bitarray.h b/net/batman-adv/bitarray.h
index cc262c9d97e0..8cb2c874f5d3 100644
--- a/net/batman-adv/bitarray.h
+++ b/net/batman-adv/bitarray.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /* Copyright (C) 2006-2017  B.A.T.M.A.N. contributors:
  *
  * Simon Wunderlich, Marek Lindner
diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c
index cdd8e8e4df0b..007147f3ed9e 100644
--- a/net/batman-adv/bridge_loop_avoidance.c
+++ b/net/batman-adv/bridge_loop_avoidance.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* Copyright (C) 2011-2017  B.A.T.M.A.N. contributors:
  *
  * Simon Wunderlich
diff --git a/net/batman-adv/bridge_loop_avoidance.h b/net/batman-adv/bridge_loop_avoidance.h
index 234775748b8e..b568cec819c5 100644
--- a/net/batman-adv/bridge_loop_avoidance.h
+++ b/net/batman-adv/bridge_loop_avoidance.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /* Copyright (C) 2011-2017  B.A.T.M.A.N. contributors:
  *
  * Simon Wunderlich
diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c
index e32ad47c6efd..d94585dc2dbe 100644
--- a/net/batman-adv/debugfs.c
+++ b/net/batman-adv/debugfs.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* Copyright (C) 2010-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
diff --git a/net/batman-adv/debugfs.h b/net/batman-adv/debugfs.h
index 9c5d4a65b98c..90a08d35c501 100644
--- a/net/batman-adv/debugfs.h
+++ b/net/batman-adv/debugfs.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /* Copyright (C) 2010-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c
index 760c0de72582..3c2faf773335 100644
--- a/net/batman-adv/distributed-arp-table.c
+++ b/net/batman-adv/distributed-arp-table.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* Copyright (C) 2011-2017  B.A.T.M.A.N. contributors:
  *
  * Antonio Quartulli
diff --git a/net/batman-adv/distributed-arp-table.h b/net/batman-adv/distributed-arp-table.h
index ec364a3c1c66..d81a05a6e6f9 100644
--- a/net/batman-adv/distributed-arp-table.h
+++ b/net/batman-adv/distributed-arp-table.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /* Copyright (C) 2011-2017  B.A.T.M.A.N. contributors:
  *
  * Antonio Quartulli
diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c
index a98cf1104a30..22ce4c0c86c3 100644
--- a/net/batman-adv/fragmentation.c
+++ b/net/batman-adv/fragmentation.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* Copyright (C) 2013-2017  B.A.T.M.A.N. contributors:
  *
  * Martin Hundebøll <martin@hundeboll.net>
diff --git a/net/batman-adv/fragmentation.h b/net/batman-adv/fragmentation.h
index 1a2d6c308745..30ffa992fcfc 100644
--- a/net/batman-adv/fragmentation.h
+++ b/net/batman-adv/fragmentation.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /* Copyright (C) 2013-2017  B.A.T.M.A.N. contributors:
  *
  * Martin Hundebøll <martin@hundeboll.net>
diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c
index 10d521f0b17f..e8db19940ab8 100644
--- a/net/batman-adv/gateway_client.c
+++ b/net/batman-adv/gateway_client.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* Copyright (C) 2009-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
diff --git a/net/batman-adv/gateway_client.h b/net/batman-adv/gateway_client.h
index 3baa3d466e5e..981f58421a32 100644
--- a/net/batman-adv/gateway_client.h
+++ b/net/batman-adv/gateway_client.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /* Copyright (C) 2009-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c
index 2c26039c23fc..a7039503d88e 100644
--- a/net/batman-adv/gateway_common.c
+++ b/net/batman-adv/gateway_common.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* Copyright (C) 2009-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
diff --git a/net/batman-adv/gateway_common.h b/net/batman-adv/gateway_common.h
index 0a6a97d201f2..7c298b05c1dc 100644
--- a/net/batman-adv/gateway_common.h
+++ b/net/batman-adv/gateway_common.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /* Copyright (C) 2009-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c
index 4e3d5340ad96..2f067a507fd5 100644
--- a/net/batman-adv/hard-interface.c
+++ b/net/batman-adv/hard-interface.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h
index 9f9890ff7a22..ac7311a91f9d 100644
--- a/net/batman-adv/hard-interface.h
+++ b/net/batman-adv/hard-interface.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
diff --git a/net/batman-adv/hash.c b/net/batman-adv/hash.c
index b5f7e13918ac..a6dbaf2e9fc9 100644
--- a/net/batman-adv/hash.c
+++ b/net/batman-adv/hash.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* Copyright (C) 2006-2017  B.A.T.M.A.N. contributors:
  *
  * Simon Wunderlich, Marek Lindner
diff --git a/net/batman-adv/hash.h b/net/batman-adv/hash.h
index 0c905e91c5e2..81cf54eb2fad 100644
--- a/net/batman-adv/hash.h
+++ b/net/batman-adv/hash.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /* Copyright (C) 2006-2017  B.A.T.M.A.N. contributors:
  *
  * Simon Wunderlich, Marek Lindner
diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c
index bded31121d12..cc76f1365300 100644
--- a/net/batman-adv/icmp_socket.c
+++ b/net/batman-adv/icmp_socket.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
diff --git a/net/batman-adv/icmp_socket.h b/net/batman-adv/icmp_socket.h
index f3fec40aae86..84cddd01eeab 100644
--- a/net/batman-adv/icmp_socket.h
+++ b/net/batman-adv/icmp_socket.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
diff --git a/net/batman-adv/log.c b/net/batman-adv/log.c
index 4ef4bde2cc2d..148e64e846d2 100644
--- a/net/batman-adv/log.c
+++ b/net/batman-adv/log.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* Copyright (C) 2010-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
diff --git a/net/batman-adv/log.h b/net/batman-adv/log.h
index 65ce97efa6b5..6744a64143c0 100644
--- a/net/batman-adv/log.h
+++ b/net/batman-adv/log.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c
index 4daed7ad46f2..5ce2007ea11b 100644
--- a/net/batman-adv/main.c
+++ b/net/batman-adv/main.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h
index bb8003cf2296..4bdb39ab3b20 100644
--- a/net/batman-adv/main.h
+++ b/net/batman-adv/main.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c
index e553a8770a89..01546a42b7ad 100644
--- a/net/batman-adv/multicast.c
+++ b/net/batman-adv/multicast.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* Copyright (C) 2014-2017  B.A.T.M.A.N. contributors:
  *
  * Linus Lüssing
diff --git a/net/batman-adv/multicast.h b/net/batman-adv/multicast.h
index 2a78cddab0e9..51f273b5b77d 100644
--- a/net/batman-adv/multicast.h
+++ b/net/batman-adv/multicast.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /* Copyright (C) 2014-2017  B.A.T.M.A.N. contributors:
  *
  * Linus Lüssing
diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c
index ab13b4d58733..ce424fe2f24e 100644
--- a/net/batman-adv/netlink.c
+++ b/net/batman-adv/netlink.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* Copyright (C) 2016-2017  B.A.T.M.A.N. contributors:
  *
  * Matthias Schiffer
diff --git a/net/batman-adv/netlink.h b/net/batman-adv/netlink.h
index f1cd8c5da966..0e7e57b69b54 100644
--- a/net/batman-adv/netlink.h
+++ b/net/batman-adv/netlink.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /* Copyright (C) 2016-2017  B.A.T.M.A.N. contributors:
  *
  * Matthias Schiffer
diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c
index 3604d7899e2c..5cfac6e56610 100644
--- a/net/batman-adv/network-coding.c
+++ b/net/batman-adv/network-coding.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* Copyright (C) 2012-2017  B.A.T.M.A.N. contributors:
  *
  * Martin Hundebøll, Jeppe Ledet-Pedersen
diff --git a/net/batman-adv/network-coding.h b/net/batman-adv/network-coding.h
index c66efb81d2f4..adaeafa4f71e 100644
--- a/net/batman-adv/network-coding.h
+++ b/net/batman-adv/network-coding.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /* Copyright (C) 2012-2017  B.A.T.M.A.N. contributors:
  *
  * Martin Hundebøll, Jeppe Ledet-Pedersen
diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c
index 0a565d0422bb..0716daf5b9a7 100644
--- a/net/batman-adv/originator.c
+++ b/net/batman-adv/originator.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* Copyright (C) 2009-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h
index 40c7f039d5d7..b5d2164532c9 100644
--- a/net/batman-adv/originator.h
+++ b/net/batman-adv/originator.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
diff --git a/net/batman-adv/packet.h b/net/batman-adv/packet.h
index 8e8a5db197cb..4eaf4b426726 100644
--- a/net/batman-adv/packet.h
+++ b/net/batman-adv/packet.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c
index 40d9bf3e5bfe..86b0ea1e5c1c 100644
--- a/net/batman-adv/routing.c
+++ b/net/batman-adv/routing.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
diff --git a/net/batman-adv/routing.h b/net/batman-adv/routing.h
index 5ede16c32f15..a1289bc5f115 100644
--- a/net/batman-adv/routing.h
+++ b/net/batman-adv/routing.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c
index 7895323fd2a7..a6c53684ba70 100644
--- a/net/batman-adv/send.c
+++ b/net/batman-adv/send.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
diff --git a/net/batman-adv/send.h b/net/batman-adv/send.h
index a16b34f473ef..eb36820e41bc 100644
--- a/net/batman-adv/send.h
+++ b/net/batman-adv/send.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
index 9f673cdfecf8..e543024f98ef 100644
--- a/net/batman-adv/soft-interface.c
+++ b/net/batman-adv/soft-interface.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
diff --git a/net/batman-adv/soft-interface.h b/net/batman-adv/soft-interface.h
index 639c3abb214a..075c5b5b2ce1 100644
--- a/net/batman-adv/soft-interface.h
+++ b/net/batman-adv/soft-interface.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c
index aa187fd42475..ab0b95f15b36 100644
--- a/net/batman-adv/sysfs.c
+++ b/net/batman-adv/sysfs.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* Copyright (C) 2010-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
diff --git a/net/batman-adv/sysfs.h b/net/batman-adv/sysfs.h
index e487412e256b..0384cb6c406b 100644
--- a/net/batman-adv/sysfs.h
+++ b/net/batman-adv/sysfs.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /* Copyright (C) 2010-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c
index 15cd2139381e..601feb2c4ecf 100644
--- a/net/batman-adv/tp_meter.c
+++ b/net/batman-adv/tp_meter.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* Copyright (C) 2012-2017  B.A.T.M.A.N. contributors:
  *
  * Edo Monticelli, Antonio Quartulli
diff --git a/net/batman-adv/tp_meter.h b/net/batman-adv/tp_meter.h
index a8ada5c123bd..c8b8f2cb2c2b 100644
--- a/net/batman-adv/tp_meter.h
+++ b/net/batman-adv/tp_meter.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /* Copyright (C) 2012-2017  B.A.T.M.A.N. contributors:
  *
  * Edo Monticelli, Antonio Quartulli
diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c
index 8a3ce79b1307..281bd4cf7f90 100644
--- a/net/batman-adv/translation-table.c
+++ b/net/batman-adv/translation-table.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich, Antonio Quartulli
diff --git a/net/batman-adv/translation-table.h b/net/batman-adv/translation-table.h
index 411d586191da..8d9e3abec2c8 100644
--- a/net/batman-adv/translation-table.h
+++ b/net/batman-adv/translation-table.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich, Antonio Quartulli
diff --git a/net/batman-adv/tvlv.c b/net/batman-adv/tvlv.c
index 1d9e267caec9..67b2ba4b824b 100644
--- a/net/batman-adv/tvlv.c
+++ b/net/batman-adv/tvlv.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
diff --git a/net/batman-adv/tvlv.h b/net/batman-adv/tvlv.h
index 4d01400ada30..a74df33f446d 100644
--- a/net/batman-adv/tvlv.h
+++ b/net/batman-adv/tvlv.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index a62795868794..1df798b32077 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
-- 
cgit v1.2.3


From a010579273bdfbee6ee79422dbebba3dcf18ebf7 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sun, 19 Nov 2017 15:05:16 +0100
Subject: batman-adv: Change batman_adv.h license to MIT

The ISC license is listed as "not recommended" in the "Linux kernel
licensing rules". It should only be used for existing code or for importing
code from a different project with that license.

But the kernel still has the similar sounding MIT/Expat license under the
preferred licenses. Switching to this license for this relatively new file
should therefore allow batman-adv to better follow the new licensing rules.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Acked-by: Antonio Quartulli <a@unstable.cc>
Acked-by: Matthias Schiffer <mschiffer@universe-factory.net>
Acked-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 include/uapi/linux/batman_adv.h | 28 +++++++++++++++++-----------
 1 file changed, 17 insertions(+), 11 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h
index fb4533826163..ae00c99cbed0 100644
--- a/include/uapi/linux/batman_adv.h
+++ b/include/uapi/linux/batman_adv.h
@@ -1,19 +1,25 @@
-/* SPDX-License-Identifier: ISC */
+/* SPDX-License-Identifier: MIT */
 /* Copyright (C) 2016-2017  B.A.T.M.A.N. contributors:
  *
  * Matthias Schiffer
  *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
  *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
- * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
- * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
  */
 
 #ifndef _UAPI_LINUX_BATMAN_ADV_H_
-- 
cgit v1.2.3


From f551c91de262ba36b20c3ac19538afb4f4507441 Mon Sep 17 00:00:00 2001
From: William Tu <u9012063@gmail.com>
Date: Wed, 13 Dec 2017 16:38:56 -0800
Subject: net: erspan: introduce erspan v2 for ip_gre

The patch adds support for erspan version 2.  Not all features are
supported in this patch.  The SGT (security group tag), GRA (timestamp
granularity), and FT (frame type) fields are set to fixed values.  Only
the hardware ID and direction are configurable.  The optional subheader
is also not supported.

Signed-off-by: William Tu <u9012063@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/erspan.h           | 120 ++++++++++++++++++++++++++++++++++++++++-
 include/net/ip_tunnels.h       |   5 +-
 include/uapi/linux/if_ether.h  |   1 +
 include/uapi/linux/if_tunnel.h |   3 ++
 net/ipv4/ip_gre.c              | 105 ++++++++++++++++++++++++++++++------
 5 files changed, 216 insertions(+), 18 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/net/erspan.h b/include/net/erspan.h
index 70c40c7c75b2..acdf6843095d 100644
--- a/include/net/erspan.h
+++ b/include/net/erspan.h
@@ -24,11 +24,29 @@
  * |      Reserved         |                  Index                |
  * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  *
+ *
+ *  ERSPAN Version 2 (Type III) header (12 octets [42:49])
+ *  0                   1                   2                   3
+ *  0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |  Ver  |          VLAN         | COS |BSO|T|     Session ID    |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |                          Timestamp                            |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |             SGT               |P|    FT   |   Hw ID   |D|Gra|O|
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ *      Platform Specific SubHeader (8 octets, optional)
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |  Platf ID |               Platform Specific Info              |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |                  Platform Specific Info                       |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
  * GRE proto ERSPAN type II = 0x88BE, type III = 0x22EB
  */
 
 #define ERSPAN_VERSION	0x1	/* ERSPAN type II */
-
 #define VER_MASK	0xf000
 #define VLAN_MASK	0x0fff
 #define COS_MASK	0xe000
@@ -37,6 +55,28 @@
 #define ID_MASK		0x03ff
 #define INDEX_MASK	0xfffff
 
+#define ERSPAN_VERSION2	0x2	/* ERSPAN type III*/
+#define BSO_MASK	EN_MASK
+#define SGT_MASK	0xffff0000
+#define P_MASK		0x8000
+#define FT_MASK		0x7c00
+#define HWID_MASK	0x03f0
+#define DIR_MASK	0x0008
+#define GRA_MASK	0x0006
+#define O_MASK		0x0001
+
+/* ERSPAN version 2 metadata header */
+struct erspan_md2 {
+	__be32 timestamp;
+	__be16 sgt;	/* security group tag */
+	__be16 flags;
+#define P_OFFSET	15
+#define FT_OFFSET	10
+#define HWID_OFFSET	4
+#define DIR_OFFSET	3
+#define GRA_OFFSET	1
+};
+
 enum erspan_encap_type {
 	ERSPAN_ENCAP_NOVLAN = 0x0,	/* originally without VLAN tag */
 	ERSPAN_ENCAP_ISL = 0x1,		/* originally ISL encapsulated */
@@ -48,8 +88,10 @@ enum erspan_encap_type {
 #define ERSPAN_V2_MDSIZE	8
 struct erspan_metadata {
 	union {
-		__be32 index;	/* Version 1 (type II)*/
+		__be32 index;		/* Version 1 (type II)*/
+		struct erspan_md2 md2;	/* Version 2 (type III) */
 	} u;
+	int version;
 };
 
 struct erspan_base_hdr {
@@ -58,6 +100,7 @@ struct erspan_base_hdr {
 	__be16 session_id;
 #define COS_OFFSET  13
 #define EN_OFFSET   11
+#define BSO_OFFSET  EN_OFFSET
 #define T_OFFSET    10
 };
 
@@ -123,4 +166,77 @@ static inline void erspan_build_header(struct sk_buff *skb,
 	ersmd->u.index = htonl(index & INDEX_MASK);
 }
 
+/* ERSPAN GRA: timestamp granularity
+ *   00b --> granularity = 100 microseconds
+ *   01b --> granularity = 100 nanoseconds
+ *   10b --> granularity = IEEE 1588
+ * Here we only support 100 microseconds.
+ */
+static inline __be32 erspan_get_timestamp(void)
+{
+	u64 h_usecs;
+	ktime_t kt;
+
+	kt = ktime_get_real();
+	h_usecs = ktime_divns(kt, 100 * NSEC_PER_USEC);
+
+	/* The ERSPAN timestamp field is only 32 bits wide, so at
+	 * 100 usec granularity it wraps around roughly every 5 days.
+	 */
+	return htonl((u32)h_usecs);
+}
+
+static inline void erspan_build_header_v2(struct sk_buff *skb,
+					  __be32 id, u8 direction, u16 hwid,
+					  bool truncate, bool is_ipv4)
+{
+	struct ethhdr *eth = eth_hdr(skb);
+	struct erspan_base_hdr *ershdr;
+	struct erspan_metadata *md;
+	struct qtag_prefix {
+		__be16 eth_type;
+		__be16 tci;
+	} *qp;
+	u16 vlan_tci = 0;
+	u16 session_id;
+	u8 gra = 0; /* 100 usec */
+	u8 bso = 0; /* Bad/Short/Oversized */
+	u8 sgt = 0;
+	u8 tos;
+
+	tos = is_ipv4 ? ip_hdr(skb)->tos :
+			(ipv6_hdr(skb)->priority << 4) +
+			(ipv6_hdr(skb)->flow_lbl[0] >> 4);
+
+	/* Unlike v1, v2 does not have En field,
+	 * so only extract vlan tci field.
+	 */
+	if (eth->h_proto == htons(ETH_P_8021Q)) {
+		qp = (struct qtag_prefix *)(skb->data + 2 * ETH_ALEN);
+		vlan_tci = ntohs(qp->tci);
+	}
+
+	skb_push(skb, sizeof(*ershdr) + ERSPAN_V2_MDSIZE);
+	ershdr = (struct erspan_base_hdr *)skb->data;
+	memset(ershdr, 0, sizeof(*ershdr) + ERSPAN_V2_MDSIZE);
+
+	/* Build base header */
+	ershdr->ver_vlan = htons((vlan_tci & VLAN_MASK) |
+				 (ERSPAN_VERSION2 << VER_OFFSET));
+	session_id = (u16)(ntohl(id) & ID_MASK) |
+		     ((tos_to_cos(tos) << COS_OFFSET) & COS_MASK) |
+		     (bso << BSO_OFFSET & BSO_MASK) |
+		     ((truncate << T_OFFSET) & T_MASK);
+	ershdr->session_id = htons(session_id);
+
+	/* Build metadata */
+	md = (struct erspan_metadata *)(ershdr + 1);
+	md->u.md2.timestamp = erspan_get_timestamp();
+	md->u.md2.sgt = htons(sgt);
+	md->u.md2.flags = htons(((1 << P_OFFSET) & P_MASK) |
+				((hwid << HWID_OFFSET) & HWID_MASK) |
+				((direction << DIR_OFFSET) & DIR_MASK) |
+				((gra << GRA_OFFSET) & GRA_MASK));
+}
+
 #endif
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 24628f6b09bf..1f16773cfd76 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -116,8 +116,11 @@ struct ip_tunnel {
 	u32		o_seqno;	/* The last output seqno */
 	int		tun_hlen;	/* Precalculated header length */
 
-	/* This field used only by ERSPAN */
+	/* These four fields used only by ERSPAN */
 	u32		index;		/* ERSPAN type II index */
+	u8		erspan_ver;	/* ERSPAN version */
+	u8		dir;		/* ERSPAN direction */
+	u16		hwid;		/* ERSPAN hardware ID */
 
 	struct dst_cache dst_cache;
 
diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h
index 3ee3bf7c8526..87b7529fcdfe 100644
--- a/include/uapi/linux/if_ether.h
+++ b/include/uapi/linux/if_ether.h
@@ -47,6 +47,7 @@
 #define ETH_P_PUP	0x0200		/* Xerox PUP packet		*/
 #define ETH_P_PUPAT	0x0201		/* Xerox PUP Addr Trans packet	*/
 #define ETH_P_TSN	0x22F0		/* TSN (IEEE 1722) packet	*/
+#define ETH_P_ERSPAN2	0x22EB		/* ERSPAN version 2 (type III)	*/
 #define ETH_P_IP	0x0800		/* Internet Protocol packet	*/
 #define ETH_P_X25	0x0805		/* CCITT X.25			*/
 #define ETH_P_ARP	0x0806		/* Address Resolution packet	*/
diff --git a/include/uapi/linux/if_tunnel.h b/include/uapi/linux/if_tunnel.h
index e68dadbd6d45..1b3d148c4560 100644
--- a/include/uapi/linux/if_tunnel.h
+++ b/include/uapi/linux/if_tunnel.h
@@ -137,6 +137,9 @@ enum {
 	IFLA_GRE_IGNORE_DF,
 	IFLA_GRE_FWMARK,
 	IFLA_GRE_ERSPAN_INDEX,
+	IFLA_GRE_ERSPAN_VER,
+	IFLA_GRE_ERSPAN_DIR,
+	IFLA_GRE_ERSPAN_HWID,
 	__IFLA_GRE_MAX,
 };
 
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 3e37402147f3..004800b923c6 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -315,11 +315,26 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi,
 				return PACKET_REJECT;
 
 			memcpy(md, pkt_md, sizeof(*md));
+			md->version = ver;
+
 			info = &tun_dst->u.tun_info;
 			info->key.tun_flags |= TUNNEL_ERSPAN_OPT;
 			info->options_len = sizeof(*md);
 		} else {
-			tunnel->index = ntohl(pkt_md->u.index);
+			tunnel->erspan_ver = ver;
+			if (ver == 1) {
+				tunnel->index = ntohl(pkt_md->u.index);
+			} else {
+				u16 md2_flags;
+				u16 dir, hwid;
+
+				md2_flags = ntohs(pkt_md->u.md2.flags);
+				dir = (md2_flags & DIR_MASK) >> DIR_OFFSET;
+				hwid = (md2_flags & HWID_MASK) >> HWID_OFFSET;
+				tunnel->dir = dir;
+				tunnel->hwid = hwid;
+			}
+
 		}
 
 		skb_reset_mac_header(skb);
@@ -413,7 +428,8 @@ static int gre_rcv(struct sk_buff *skb)
 	if (hdr_len < 0)
 		goto drop;
 
-	if (unlikely(tpi.proto == htons(ETH_P_ERSPAN))) {
+	if (unlikely(tpi.proto == htons(ETH_P_ERSPAN) ||
+		     tpi.proto == htons(ETH_P_ERSPAN2))) {
 		if (erspan_rcv(skb, &tpi, hdr_len) == PACKET_RCVD)
 			return 0;
 	}
@@ -568,6 +584,7 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev,
 	bool truncate = false;
 	struct flowi4 fl;
 	int tunnel_hlen;
+	int version;
 	__be16 df;
 
 	tun_info = skb_tunnel_info(skb);
@@ -576,9 +593,13 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev,
 		goto err_free_skb;
 
 	key = &tun_info->key;
+	md = ip_tunnel_info_opts(tun_info);
+	if (!md)
+		goto err_free_rt;
 
 	/* ERSPAN has fixed 8 byte GRE header */
-	tunnel_hlen = 8 + sizeof(struct erspan_base_hdr) + ERSPAN_V1_MDSIZE;
+	version = md->version;
+	tunnel_hlen = 8 + erspan_hdr_len(version);
 
 	rt = prepare_fb_xmit(skb, dev, &fl, tunnel_hlen);
 	if (!rt)
@@ -592,12 +613,23 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev,
 		truncate = true;
 	}
 
-	md = ip_tunnel_info_opts(tun_info);
-	if (!md)
-		goto err_free_rt;
+	if (version == 1) {
+		erspan_build_header(skb, tunnel_id_to_key32(key->tun_id),
+				    ntohl(md->u.index), truncate, true);
+	} else if (version == 2) {
+		u16 md2_flags;
+		u8 direction;
+		u16 hwid;
 
-	erspan_build_header(skb, tunnel_id_to_key32(key->tun_id),
-			    ntohl(md->u.index), truncate, true);
+		md2_flags = ntohs(md->u.md2.flags);
+		direction = (md2_flags & DIR_MASK) >> DIR_OFFSET;
+		hwid = (md2_flags & HWID_MASK) >> HWID_OFFSET;
+
+		erspan_build_header_v2(skb, tunnel_id_to_key32(key->tun_id),
+				       direction, hwid,	truncate, true);
+	} else {
+		goto err_free_rt;
+	}
 
 	gre_build_header(skb, 8, TUNNEL_SEQ,
 			 htons(ETH_P_ERSPAN), 0, htonl(tunnel->o_seqno++));
@@ -699,8 +731,14 @@ static netdev_tx_t erspan_xmit(struct sk_buff *skb,
 	}
 
 	/* Push ERSPAN header */
-	erspan_build_header(skb, tunnel->parms.o_key, tunnel->index,
-			    truncate, true);
+	if (tunnel->erspan_ver == 1)
+		erspan_build_header(skb, tunnel->parms.o_key, tunnel->index,
+				    truncate, true);
+	else
+		erspan_build_header_v2(skb, tunnel->parms.o_key,
+				       tunnel->dir, tunnel->hwid,
+				       truncate, true);
+
 	tunnel->parms.o_flags &= ~TUNNEL_KEY;
 	__gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_ERSPAN));
 	return NETDEV_TX_OK;
@@ -1172,13 +1210,32 @@ static int ipgre_netlink_parms(struct net_device *dev,
 	if (data[IFLA_GRE_FWMARK])
 		*fwmark = nla_get_u32(data[IFLA_GRE_FWMARK]);
 
-	if (data[IFLA_GRE_ERSPAN_INDEX]) {
-		t->index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]);
+	if (data[IFLA_GRE_ERSPAN_VER]) {
+		t->erspan_ver = nla_get_u8(data[IFLA_GRE_ERSPAN_VER]);
 
-		if (t->index & ~INDEX_MASK)
+		if (t->erspan_ver != 1 && t->erspan_ver != 2)
 			return -EINVAL;
 	}
 
+	if (t->erspan_ver == 1) {
+		if (data[IFLA_GRE_ERSPAN_INDEX]) {
+			t->index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]);
+			if (t->index & ~INDEX_MASK)
+				return -EINVAL;
+		}
+	} else if (t->erspan_ver == 2) {
+		if (data[IFLA_GRE_ERSPAN_DIR]) {
+			t->dir = nla_get_u8(data[IFLA_GRE_ERSPAN_DIR]);
+			if (t->dir & ~(DIR_MASK >> DIR_OFFSET))
+				return -EINVAL;
+		}
+		if (data[IFLA_GRE_ERSPAN_HWID]) {
+			t->hwid = nla_get_u16(data[IFLA_GRE_ERSPAN_HWID]);
+			if (t->hwid & ~(HWID_MASK >> HWID_OFFSET))
+				return -EINVAL;
+		}
+	}
+
 	return 0;
 }
 
@@ -1245,7 +1302,7 @@ static int erspan_tunnel_init(struct net_device *dev)
 	tunnel->tun_hlen = 8;
 	tunnel->parms.iph.protocol = IPPROTO_GRE;
 	tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen +
-		       sizeof(struct erspan_base_hdr) + ERSPAN_V1_MDSIZE;
+		       erspan_hdr_len(tunnel->erspan_ver);
 	t_hlen = tunnel->hlen + sizeof(struct iphdr);
 
 	dev->needed_headroom = LL_MAX_HEADER + t_hlen + 4;
@@ -1375,6 +1432,12 @@ static size_t ipgre_get_size(const struct net_device *dev)
 		nla_total_size(4) +
 		/* IFLA_GRE_ERSPAN_INDEX */
 		nla_total_size(4) +
+		/* IFLA_GRE_ERSPAN_VER */
+		nla_total_size(1) +
+		/* IFLA_GRE_ERSPAN_DIR */
+		nla_total_size(1) +
+		/* IFLA_GRE_ERSPAN_HWID */
+		nla_total_size(2) +
 		0;
 }
 
@@ -1417,9 +1480,18 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
 			goto nla_put_failure;
 	}
 
-	if (t->index)
+	if (nla_put_u8(skb, IFLA_GRE_ERSPAN_VER, t->erspan_ver))
+		goto nla_put_failure;
+
+	if (t->erspan_ver == 1) {
 		if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, t->index))
 			goto nla_put_failure;
+	} else if (t->erspan_ver == 2) {
+		if (nla_put_u8(skb, IFLA_GRE_ERSPAN_DIR, t->dir))
+			goto nla_put_failure;
+		if (nla_put_u16(skb, IFLA_GRE_ERSPAN_HWID, t->hwid))
+			goto nla_put_failure;
+	}
 
 	return 0;
 
@@ -1455,6 +1527,9 @@ static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
 	[IFLA_GRE_IGNORE_DF]	= { .type = NLA_U8 },
 	[IFLA_GRE_FWMARK]	= { .type = NLA_U32 },
 	[IFLA_GRE_ERSPAN_INDEX]	= { .type = NLA_U32 },
+	[IFLA_GRE_ERSPAN_VER]	= { .type = NLA_U8 },
+	[IFLA_GRE_ERSPAN_DIR]	= { .type = NLA_U8 },
+	[IFLA_GRE_ERSPAN_HWID]	= { .type = NLA_U16 },
 };
 
 static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
-- 
cgit v1.2.3


From 7a4fa29106d9a38ef005f5ab15d493c259f269c0 Mon Sep 17 00:00:00 2001
From: Yuval Mintz <yuvalm@mellanox.com>
Date: Thu, 14 Dec 2017 15:54:29 +0200
Subject: net: sched: Add TCA_HW_OFFLOAD

Qdiscs can be offloaded to HW, but current implementation isn't uniform.
Instead, qdiscs either pass information about offload status via their
TCA_OPTIONS or omit it altogether.

Introduce a new attribute - TCA_HW_OFFLOAD that would form a uniform
uAPI for the offloading status of qdiscs.

Signed-off-by: Yuval Mintz <yuvalm@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h      | 1 +
 include/uapi/linux/rtnetlink.h | 1 +
 net/sched/sch_api.c            | 2 ++
 3 files changed, 4 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 65d0d25f2648..83a3e47d5845 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -71,6 +71,7 @@ struct Qdisc {
 				      * qdisc_tree_decrease_qlen() should stop.
 				      */
 #define TCQ_F_INVISIBLE		0x80 /* invisible by default in dump */
+#define TCQ_F_OFFLOADED		0x200 /* qdisc is offloaded to HW */
 	u32			limit;
 	const struct Qdisc_ops	*ops;
 	struct qdisc_size_table	__rcu *stab;
diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index d8b5f80c2ea6..843e29aa3cac 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -557,6 +557,7 @@ enum {
 	TCA_PAD,
 	TCA_DUMP_INVISIBLE,
 	TCA_CHAIN,
+	TCA_HW_OFFLOAD,
 	__TCA_MAX
 };
 
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index b6c4f536876b..0f1eab99ff4e 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -795,6 +795,8 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
 	tcm->tcm_info = refcount_read(&q->refcnt);
 	if (nla_put_string(skb, TCA_KIND, q->ops->id))
 		goto nla_put_failure;
+	if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
+		goto nla_put_failure;
 	if (q->ops->dump && q->ops->dump(q, skb) < 0)
 		goto nla_put_failure;
 	qlen = q->q.qlen;
-- 
cgit v1.2.3


From 4a98795bc8ea148b1ebbbf001283e06430cffe36 Mon Sep 17 00:00:00 2001
From: Yuval Mintz <yuvalm@mellanox.com>
Date: Thu, 14 Dec 2017 15:54:31 +0200
Subject: pkt_sched: Remove TC_RED_OFFLOADED from uapi

Following the previous patch, RED is now using the new uniform uapi
for indicating it's offloaded. As a result, TC_RED_OFFLOADED is no
longer utilized by kernel and can be removed [as it's still not
part of any stable release].

Fixes: 602f3baf2218 ("net_sch: red: Add offload ability to RED qdisc")
Signed-off-by: Yuval Mintz <yuvalm@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/pkt_sched.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index af3cc2f4e1ad..37b5096ae97b 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -256,7 +256,6 @@ struct tc_red_qopt {
 #define TC_RED_ECN		1
 #define TC_RED_HARDDROP		2
 #define TC_RED_ADAPTATIVE	4
-#define TC_RED_OFFLOADED	8
 };
 
 struct tc_red_xstats {
-- 
cgit v1.2.3


From cc8b0b92a1699bc32f7fec71daa2bfc90de43a4d Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Thu, 14 Dec 2017 17:55:05 -0800
Subject: bpf: introduce function calls (function boundaries)

Allow arbitrary function calls from bpf function to another bpf function.

Since the beginning of bpf all bpf programs were represented as a single function
and program authors were forced to use always_inline for all functions
in their C code. That was causing llvm to unnecessarily inflate the code size
and forcing developers to move code to header files with little code reuse.

With a bit of additional complexity teach verifier to recognize
arbitrary function calls from one bpf function to another as long as
all of functions are presented to the verifier as a single bpf program.
New program layout:
r6 = r1    // some code
..
r1 = ..    // arg1
r2 = ..    // arg2
call pc+1  // function call pc-relative
exit
.. = r1    // access arg1
.. = r2    // access arg2
..
call pc+20 // second level of function call
...

It allows for better optimized code and finally makes it possible to introduce
the core bpf libraries that can be reused in different projects,
since programs are no longer limited to a single elf file.
With function calls bpf can be compiled into multiple .o files.

This patch is the first step. It detects programs that contain
multiple functions and checks that calls between them are valid.
It splits the sequence of bpf instructions (one program) into a set
of bpf functions that call each other. Calls to only known
functions are allowed. In the future the verifier may allow
calls to unresolved functions and will do dynamic linking.
This logic supports statically linked bpf functions only.

Such function boundary detection could have been done as part of
control flow graph building in check_cfg(), but it's cleaner to
separate function boundary detection vs control flow checks within
a subprogram (function) into logically independent steps.
Follow up patches may split check_cfg() further, but not check_subprogs().

Only allow bpf-to-bpf calls for root only and for non-hw-offloaded programs.
These restrictions can be relaxed in the future.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf_verifier.h |   5 +-
 include/uapi/linux/bpf.h     |   6 ++
 kernel/bpf/disasm.c          |   8 ++-
 kernel/bpf/verifier.c        | 141 ++++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 155 insertions(+), 5 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index c561b986bab0..91a583bb3fa7 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -141,6 +141,8 @@ struct bpf_ext_analyzer_ops {
 			 int insn_idx, int prev_insn_idx);
 };
 
+#define BPF_MAX_SUBPROGS 256
+
 /* single container for all structs
  * one verifier_env per bpf_check() call
  */
@@ -159,8 +161,9 @@ struct bpf_verifier_env {
 	bool allow_ptr_leaks;
 	bool seen_direct_write;
 	struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */
-
 	struct bpf_verifer_log log;
+	u32 subprog_starts[BPF_MAX_SUBPROGS];
+	u32 subprog_cnt;
 };
 
 static inline struct bpf_reg_state *cur_regs(struct bpf_verifier_env *env)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 595bda120cfb..d01f1cb3cfc0 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -197,8 +197,14 @@ enum bpf_attach_type {
  */
 #define BPF_F_STRICT_ALIGNMENT	(1U << 0)
 
+/* when bpf_ldimm64->src_reg == BPF_PSEUDO_MAP_FD, bpf_ldimm64->imm == fd */
 #define BPF_PSEUDO_MAP_FD	1
 
+/* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative
+ * offset to another bpf function
+ */
+#define BPF_PSEUDO_CALL		1
+
 /* flags for BPF_MAP_UPDATE_ELEM command */
 #define BPF_ANY		0 /* create new element or update existing */
 #define BPF_NOEXIST	1 /* create new element if it didn't exist */
diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
index e682850c9715..883f88fa5bfc 100644
--- a/kernel/bpf/disasm.c
+++ b/kernel/bpf/disasm.c
@@ -189,8 +189,12 @@ void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env,
 		u8 opcode = BPF_OP(insn->code);
 
 		if (opcode == BPF_CALL) {
-			verbose(env, "(%02x) call %s#%d\n", insn->code,
-				func_id_name(insn->imm), insn->imm);
+			if (insn->src_reg == BPF_PSEUDO_CALL)
+				verbose(env, "(%02x) call pc%+d\n", insn->code,
+					insn->imm);
+			else
+				verbose(env, "(%02x) call %s#%d\n", insn->code,
+					func_id_name(insn->imm), insn->imm);
 		} else if (insn->code == (BPF_JMP | BPF_JA)) {
 			verbose(env, "(%02x) goto pc%+d\n",
 				insn->code, insn->off);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index e807bda7fe29..1d0f7ff0b9a9 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -20,6 +20,8 @@
 #include <linux/file.h>
 #include <linux/vmalloc.h>
 #include <linux/stringify.h>
+#include <linux/bsearch.h>
+#include <linux/sort.h>
 
 #include "disasm.h"
 
@@ -636,6 +638,113 @@ enum reg_arg_type {
 	DST_OP_NO_MARK	/* same as above, check only, don't mark */
 };
 
+static int cmp_subprogs(const void *a, const void *b)
+{
+	return *(int *)a - *(int *)b;
+}
+
+static int find_subprog(struct bpf_verifier_env *env, int off)
+{
+	u32 *p;
+
+	p = bsearch(&off, env->subprog_starts, env->subprog_cnt,
+		    sizeof(env->subprog_starts[0]), cmp_subprogs);
+	if (!p)
+		return -ENOENT;
+	return p - env->subprog_starts;
+
+}
+
+static int add_subprog(struct bpf_verifier_env *env, int off)
+{
+	int insn_cnt = env->prog->len;
+	int ret;
+
+	if (off >= insn_cnt || off < 0) {
+		verbose(env, "call to invalid destination\n");
+		return -EINVAL;
+	}
+	ret = find_subprog(env, off);
+	if (ret >= 0)
+		return 0;
+	if (env->subprog_cnt >= BPF_MAX_SUBPROGS) {
+		verbose(env, "too many subprograms\n");
+		return -E2BIG;
+	}
+	env->subprog_starts[env->subprog_cnt++] = off;
+	sort(env->subprog_starts, env->subprog_cnt,
+	     sizeof(env->subprog_starts[0]), cmp_subprogs, NULL);
+	return 0;
+}
+
+static int check_subprogs(struct bpf_verifier_env *env)
+{
+	int i, ret, subprog_start, subprog_end, off, cur_subprog = 0;
+	struct bpf_insn *insn = env->prog->insnsi;
+	int insn_cnt = env->prog->len;
+
+	/* determine subprog starts. The end is one before the next starts */
+	for (i = 0; i < insn_cnt; i++) {
+		if (insn[i].code != (BPF_JMP | BPF_CALL))
+			continue;
+		if (insn[i].src_reg != BPF_PSEUDO_CALL)
+			continue;
+		if (!env->allow_ptr_leaks) {
+			verbose(env, "function calls to other bpf functions are allowed for root only\n");
+			return -EPERM;
+		}
+		if (bpf_prog_is_dev_bound(env->prog->aux)) {
+			verbose(env, "funcation calls in offloaded programs are not supported yet\n");
+			return -EINVAL;
+		}
+		ret = add_subprog(env, i + insn[i].imm + 1);
+		if (ret < 0)
+			return ret;
+	}
+
+	if (env->log.level > 1)
+		for (i = 0; i < env->subprog_cnt; i++)
+			verbose(env, "func#%d @%d\n", i, env->subprog_starts[i]);
+
+	/* now check that all jumps are within the same subprog */
+	subprog_start = 0;
+	if (env->subprog_cnt == cur_subprog)
+		subprog_end = insn_cnt;
+	else
+		subprog_end = env->subprog_starts[cur_subprog++];
+	for (i = 0; i < insn_cnt; i++) {
+		u8 code = insn[i].code;
+
+		if (BPF_CLASS(code) != BPF_JMP)
+			goto next;
+		if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL)
+			goto next;
+		off = i + insn[i].off + 1;
+		if (off < subprog_start || off >= subprog_end) {
+			verbose(env, "jump out of range from insn %d to %d\n", i, off);
+			return -EINVAL;
+		}
+next:
+		if (i == subprog_end - 1) {
+			/* to avoid fall-through from one subprog into another
+			 * the last insn of the subprog should be either exit
+			 * or unconditional jump back
+			 */
+			if (code != (BPF_JMP | BPF_EXIT) &&
+			    code != (BPF_JMP | BPF_JA)) {
+				verbose(env, "last insn is not an exit or jmp\n");
+				return -EINVAL;
+			}
+			subprog_start = subprog_end;
+			if (env->subprog_cnt == cur_subprog)
+				subprog_end = insn_cnt;
+			else
+				subprog_end = env->subprog_starts[cur_subprog++];
+		}
+	}
+	return 0;
+}
+
 static void mark_reg_read(const struct bpf_verifier_state *state, u32 regno)
 {
 	struct bpf_verifier_state *parent = state->parent;
@@ -3284,6 +3393,10 @@ static int check_cfg(struct bpf_verifier_env *env)
 	int ret = 0;
 	int i, t;
 
+	ret = check_subprogs(env);
+	if (ret < 0)
+		return ret;
+
 	insn_state = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
 	if (!insn_state)
 		return -ENOMEM;
@@ -3316,6 +3429,14 @@ peek_stack:
 				goto err_free;
 			if (t + 1 < insn_cnt)
 				env->explored_states[t + 1] = STATE_LIST_MARK;
+			if (insns[t].src_reg == BPF_PSEUDO_CALL) {
+				env->explored_states[t] = STATE_LIST_MARK;
+				ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env);
+				if (ret == 1)
+					goto peek_stack;
+				else if (ret < 0)
+					goto err_free;
+			}
 		} else if (opcode == BPF_JA) {
 			if (BPF_SRC(insns[t].code) != BPF_K) {
 				ret = -EINVAL;
@@ -4245,6 +4366,19 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len,
 	return 0;
 }
 
+static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len)
+{
+	int i;
+
+	if (len == 1)
+		return;
+	for (i = 0; i < env->subprog_cnt; i++) {
+		if (env->subprog_starts[i] < off)
+			continue;
+		env->subprog_starts[i] += len - 1;
+	}
+}
+
 static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 off,
 					    const struct bpf_insn *patch, u32 len)
 {
@@ -4255,6 +4389,7 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of
 		return NULL;
 	if (adjust_insn_aux_data(env, new_prog->len, off, len))
 		return NULL;
+	adjust_subprog_starts(env, off, len);
 	return new_prog;
 }
 
@@ -4408,6 +4543,8 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
 	for (i = 0; i < insn_cnt; i++, insn++) {
 		if (insn->code != (BPF_JMP | BPF_CALL))
 			continue;
+		if (insn->src_reg == BPF_PSEUDO_CALL)
+			continue;
 
 		if (insn->imm == BPF_FUNC_get_route_realm)
 			prog->dst_needed = 1;
@@ -4589,12 +4726,12 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
 	if (!env->explored_states)
 		goto skip_full_check;
 
+	env->allow_ptr_leaks = capable(CAP_SYS_ADMIN);
+
 	ret = check_cfg(env);
 	if (ret < 0)
 		goto skip_full_check;
 
-	env->allow_ptr_leaks = capable(CAP_SYS_ADMIN);
-
 	ret = do_check(env);
 	if (env->cur_state) {
 		free_verifier_state(env->cur_state, true);
-- 
cgit v1.2.3


From 0bd9298557eefbf63d169cc8f853fa3f21078cab Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab@s-opensource.com>
Date: Wed, 27 Sep 2017 09:22:01 -0400
Subject: media: frontend: describe nested structs

There are some nested structs in this header which aren't
properly documented.

This should solve some warnings after the addition of
a patch to kernel-doc adding support for nested structs/unions.

Signed-off-by: Mauro Carvalho Chehab <mchehab@s-opensource.com>
---
 include/uapi/linux/dvb/frontend.h | 35 +++++++++++++++++------------------
 1 file changed, 17 insertions(+), 18 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/dvb/frontend.h b/include/uapi/linux/dvb/frontend.h
index b297b65845d6..9dad6c66cc34 100644
--- a/include/uapi/linux/dvb/frontend.h
+++ b/include/uapi/linux/dvb/frontend.h
@@ -756,16 +756,15 @@ enum fecap_scale_params {
 /**
  * struct dtv_stats - Used for reading a DTV status property
  *
- * @scale:	Filled with enum fecap_scale_params - the scale
- *		in usage for that parameter
+ * @scale:
+ *	Filled with enum fecap_scale_params - the scale in usage
+ *	for that parameter
  *
- * The ``{unnamed_union}`` may have either one of the values below:
- *
- * %svalue
+ * @svalue:
  *	integer value of the measure, for %FE_SCALE_DECIBEL,
  *	used for dB measures. The unit is 0.001 dB.
  *
- * %uvalue
+ * @uvalue:
  *	unsigned integer value of the measure, used when @scale is
  *	either %FE_SCALE_RELATIVE or %FE_SCALE_COUNTER.
  *
@@ -828,19 +827,19 @@ struct dtv_fe_stats {
 /**
  * struct dtv_property - store one of frontend command and its value
  *
- * @cmd:	Digital TV command.
- * @reserved:	Not used.
- * @u:		Union with the values for the command.
- * @result:	Unused
- *
- * The @u union may have either one of the values below:
+ * @cmd:		Digital TV command.
+ * @reserved:		Not used.
+ * @u:			Union with the values for the command.
+ * @u.data:		A unsigned 32 bits integer with command value.
+ * @u.buffer:		Struct to store bigger properties.
+ *			Currently unused.
+ * @u.buffer.data:	an unsigned 32-bits array.
+ * @u.buffer.len:	number of elements of the buffer.
+ * @u.buffer.reserved1:	Reserved.
+ * @u.buffer.reserved2:	Reserved.
+ * @u.st:		a &struct dtv_fe_stats array of statistics.
+ * @result:		Currently unused.
  *
- * %data
- *	an unsigned 32-bits number.
- * %st
- *	a &struct dtv_fe_stats array of statistics.
- * %buffer
- *	a buffer of up to 32 characters (currently unused).
  */
 struct dtv_property {
 	__u32 cmd;
-- 
cgit v1.2.3


From f6ddd094f5793447d594aa9f42032a7aba12b4d2 Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Thu, 30 Nov 2017 17:01:25 +0100
Subject: virt: Add vboxguest driver for Virtual Box Guest integration UAPI

This commit adds the headers describing the ioctl API for the
/dev/vboxguest device used by the Virtual Box Guest Additions
in Virtual Box virtual machines.

The driver providing the /dev/vboxguest device will allow Virtual Box
Guest Additions features such as copy-and-paste, seamless mode and
OpenGL pass-through.

Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Reviewed-by: Larry Finger <Larry.Finger@lwfinger.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 MAINTAINERS                            |   7 +
 include/uapi/linux/vbox_err.h          | 151 +++++++++++++++
 include/uapi/linux/vbox_vmmdev_types.h | 226 ++++++++++++++++++++++
 include/uapi/linux/vboxguest.h         | 330 +++++++++++++++++++++++++++++++++
 4 files changed, 714 insertions(+)
 create mode 100644 include/uapi/linux/vbox_err.h
 create mode 100644 include/uapi/linux/vbox_vmmdev_types.h
 create mode 100644 include/uapi/linux/vboxguest.h

(limited to 'include/uapi/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index 8d83ed21a62b..405d4d28b612 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -14557,6 +14557,13 @@ S:	Maintained
 F:	drivers/virtio/virtio_input.c
 F:	include/uapi/linux/virtio_input.h
 
+VIRTUAL BOX GUEST DEVICE DRIVER
+M:	Hans de Goede <hdegoede@redhat.com>
+M:	Arnd Bergmann <arnd@arndb.de>
+M:	Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+S:	Maintained
+F:	include/uapi/linux/vbox*.h
+
 VIRTUAL SERIO DEVICE DRIVER
 M:	Stephen Chandler Paul <thatslyude@gmail.com>
 S:	Maintained
diff --git a/include/uapi/linux/vbox_err.h b/include/uapi/linux/vbox_err.h
new file mode 100644
index 000000000000..7eae536ff1e6
--- /dev/null
+++ b/include/uapi/linux/vbox_err.h
@@ -0,0 +1,151 @@
+/* SPDX-License-Identifier: MIT */
+/* Copyright (C) 2017 Oracle Corporation */
+
+#ifndef __UAPI_VBOX_ERR_H__
+#define __UAPI_VBOX_ERR_H__
+
+#define VINF_SUCCESS                        0
+#define VERR_GENERAL_FAILURE                (-1)
+#define VERR_INVALID_PARAMETER              (-2)
+#define VERR_INVALID_MAGIC                  (-3)
+#define VERR_INVALID_HANDLE                 (-4)
+#define VERR_LOCK_FAILED                    (-5)
+#define VERR_INVALID_POINTER                (-6)
+#define VERR_IDT_FAILED                     (-7)
+#define VERR_NO_MEMORY                      (-8)
+#define VERR_ALREADY_LOADED                 (-9)
+#define VERR_PERMISSION_DENIED              (-10)
+#define VERR_VERSION_MISMATCH               (-11)
+#define VERR_NOT_IMPLEMENTED                (-12)
+#define VERR_INVALID_FLAGS                  (-13)
+
+#define VERR_NOT_EQUAL                      (-18)
+#define VERR_NOT_SYMLINK                    (-19)
+#define VERR_NO_TMP_MEMORY                  (-20)
+#define VERR_INVALID_FMODE                  (-21)
+#define VERR_WRONG_ORDER                    (-22)
+#define VERR_NO_TLS_FOR_SELF                (-23)
+#define VERR_FAILED_TO_SET_SELF_TLS         (-24)
+#define VERR_NO_CONT_MEMORY                 (-26)
+#define VERR_NO_PAGE_MEMORY                 (-27)
+#define VERR_THREAD_IS_DEAD                 (-29)
+#define VERR_THREAD_NOT_WAITABLE            (-30)
+#define VERR_PAGE_TABLE_NOT_PRESENT         (-31)
+#define VERR_INVALID_CONTEXT                (-32)
+#define VERR_TIMER_BUSY                     (-33)
+#define VERR_ADDRESS_CONFLICT               (-34)
+#define VERR_UNRESOLVED_ERROR               (-35)
+#define VERR_INVALID_FUNCTION               (-36)
+#define VERR_NOT_SUPPORTED                  (-37)
+#define VERR_ACCESS_DENIED                  (-38)
+#define VERR_INTERRUPTED                    (-39)
+#define VERR_TIMEOUT                        (-40)
+#define VERR_BUFFER_OVERFLOW                (-41)
+#define VERR_TOO_MUCH_DATA                  (-42)
+#define VERR_MAX_THRDS_REACHED              (-43)
+#define VERR_MAX_PROCS_REACHED              (-44)
+#define VERR_SIGNAL_REFUSED                 (-45)
+#define VERR_SIGNAL_PENDING                 (-46)
+#define VERR_SIGNAL_INVALID                 (-47)
+#define VERR_STATE_CHANGED                  (-48)
+#define VERR_INVALID_UUID_FORMAT            (-49)
+#define VERR_PROCESS_NOT_FOUND              (-50)
+#define VERR_PROCESS_RUNNING                (-51)
+#define VERR_TRY_AGAIN                      (-52)
+#define VERR_PARSE_ERROR                    (-53)
+#define VERR_OUT_OF_RANGE                   (-54)
+#define VERR_NUMBER_TOO_BIG                 (-55)
+#define VERR_NO_DIGITS                      (-56)
+#define VERR_NEGATIVE_UNSIGNED              (-57)
+#define VERR_NO_TRANSLATION                 (-58)
+
+#define VERR_NOT_FOUND                      (-78)
+#define VERR_INVALID_STATE                  (-79)
+#define VERR_OUT_OF_RESOURCES               (-80)
+
+#define VERR_FILE_NOT_FOUND                 (-102)
+#define VERR_PATH_NOT_FOUND                 (-103)
+#define VERR_INVALID_NAME                   (-104)
+#define VERR_ALREADY_EXISTS                 (-105)
+#define VERR_TOO_MANY_OPEN_FILES            (-106)
+#define VERR_SEEK                           (-107)
+#define VERR_NEGATIVE_SEEK                  (-108)
+#define VERR_SEEK_ON_DEVICE                 (-109)
+#define VERR_EOF                            (-110)
+#define VERR_READ_ERROR                     (-111)
+#define VERR_WRITE_ERROR                    (-112)
+#define VERR_WRITE_PROTECT                  (-113)
+#define VERR_SHARING_VIOLATION              (-114)
+#define VERR_FILE_LOCK_FAILED               (-115)
+#define VERR_FILE_LOCK_VIOLATION            (-116)
+#define VERR_CANT_CREATE                    (-117)
+#define VERR_CANT_DELETE_DIRECTORY          (-118)
+#define VERR_NOT_SAME_DEVICE                (-119)
+#define VERR_FILENAME_TOO_LONG              (-120)
+#define VERR_MEDIA_NOT_PRESENT              (-121)
+#define VERR_MEDIA_NOT_RECOGNIZED           (-122)
+#define VERR_FILE_NOT_LOCKED                (-123)
+#define VERR_FILE_LOCK_LOST                 (-124)
+#define VERR_DIR_NOT_EMPTY                  (-125)
+#define VERR_NOT_A_DIRECTORY                (-126)
+#define VERR_IS_A_DIRECTORY                 (-127)
+#define VERR_FILE_TOO_BIG                   (-128)
+
+#define VERR_NET_IO_ERROR                       (-400)
+#define VERR_NET_OUT_OF_RESOURCES               (-401)
+#define VERR_NET_HOST_NOT_FOUND                 (-402)
+#define VERR_NET_PATH_NOT_FOUND                 (-403)
+#define VERR_NET_PRINT_ERROR                    (-404)
+#define VERR_NET_NO_NETWORK                     (-405)
+#define VERR_NET_NOT_UNIQUE_NAME                (-406)
+
+#define VERR_NET_IN_PROGRESS                    (-436)
+#define VERR_NET_ALREADY_IN_PROGRESS            (-437)
+#define VERR_NET_NOT_SOCKET                     (-438)
+#define VERR_NET_DEST_ADDRESS_REQUIRED          (-439)
+#define VERR_NET_MSG_SIZE                       (-440)
+#define VERR_NET_PROTOCOL_TYPE                  (-441)
+#define VERR_NET_PROTOCOL_NOT_AVAILABLE         (-442)
+#define VERR_NET_PROTOCOL_NOT_SUPPORTED         (-443)
+#define VERR_NET_SOCKET_TYPE_NOT_SUPPORTED      (-444)
+#define VERR_NET_OPERATION_NOT_SUPPORTED        (-445)
+#define VERR_NET_PROTOCOL_FAMILY_NOT_SUPPORTED  (-446)
+#define VERR_NET_ADDRESS_FAMILY_NOT_SUPPORTED   (-447)
+#define VERR_NET_ADDRESS_IN_USE                 (-448)
+#define VERR_NET_ADDRESS_NOT_AVAILABLE          (-449)
+#define VERR_NET_DOWN                           (-450)
+#define VERR_NET_UNREACHABLE                    (-451)
+#define VERR_NET_CONNECTION_RESET               (-452)
+#define VERR_NET_CONNECTION_ABORTED             (-453)
+#define VERR_NET_CONNECTION_RESET_BY_PEER       (-454)
+#define VERR_NET_NO_BUFFER_SPACE                (-455)
+#define VERR_NET_ALREADY_CONNECTED              (-456)
+#define VERR_NET_NOT_CONNECTED                  (-457)
+#define VERR_NET_SHUTDOWN                       (-458)
+#define VERR_NET_TOO_MANY_REFERENCES            (-459)
+#define VERR_NET_CONNECTION_TIMED_OUT           (-460)
+#define VERR_NET_CONNECTION_REFUSED             (-461)
+#define VERR_NET_HOST_DOWN                      (-464)
+#define VERR_NET_HOST_UNREACHABLE               (-465)
+#define VERR_NET_PROTOCOL_ERROR                 (-466)
+#define VERR_NET_INCOMPLETE_TX_PACKET           (-467)
+
+/* misc. unsorted codes */
+#define VERR_RESOURCE_BUSY                      (-138)
+#define VERR_DISK_FULL                          (-152)
+#define VERR_TOO_MANY_SYMLINKS                  (-156)
+#define VERR_NO_MORE_FILES                      (-201)
+#define VERR_INTERNAL_ERROR                     (-225)
+#define VERR_INTERNAL_ERROR_2                   (-226)
+#define VERR_INTERNAL_ERROR_3                   (-227)
+#define VERR_INTERNAL_ERROR_4                   (-228)
+#define VERR_DEV_IO_ERROR                       (-250)
+#define VERR_IO_BAD_LENGTH                      (-255)
+#define VERR_BROKEN_PIPE                        (-301)
+#define VERR_NO_DATA                            (-304)
+#define VERR_SEM_DESTROYED                      (-363)
+#define VERR_DEADLOCK                           (-365)
+#define VERR_BAD_EXE_FORMAT                     (-608)
+#define VINF_HGCM_ASYNC_EXECUTE                 (2903)
+
+#endif
diff --git a/include/uapi/linux/vbox_vmmdev_types.h b/include/uapi/linux/vbox_vmmdev_types.h
new file mode 100644
index 000000000000..0e68024f36c7
--- /dev/null
+++ b/include/uapi/linux/vbox_vmmdev_types.h
@@ -0,0 +1,226 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR CDDL-1.0) */
+/*
+ * Virtual Device for Guest <-> VMM/Host communication, type definitions
+ * which are also used for the vboxguest ioctl interface / by vboxsf
+ *
+ * Copyright (C) 2006-2016 Oracle Corporation
+ */
+
+#ifndef __UAPI_VBOX_VMMDEV_TYPES_H__
+#define __UAPI_VBOX_VMMDEV_TYPES_H__
+
+#include <asm/bitsperlong.h>
+#include <linux/types.h>
+
+/*
+ * We cannot use linux' compiletime_assert here because it expects to be used
+ * inside a function only. Use a typedef to a char array with a negative size.
+ */
+#define VMMDEV_ASSERT_SIZE(type, size) \
+	typedef char type ## _asrt_size[1 - 2*!!(sizeof(struct type) != (size))]
+
+/** enum vmmdev_request_type - VMMDev request types. */
+enum vmmdev_request_type {
+	VMMDEVREQ_INVALID_REQUEST              =  0,
+	VMMDEVREQ_GET_MOUSE_STATUS             =  1,
+	VMMDEVREQ_SET_MOUSE_STATUS             =  2,
+	VMMDEVREQ_SET_POINTER_SHAPE            =  3,
+	VMMDEVREQ_GET_HOST_VERSION             =  4,
+	VMMDEVREQ_IDLE                         =  5,
+	VMMDEVREQ_GET_HOST_TIME                = 10,
+	VMMDEVREQ_GET_HYPERVISOR_INFO          = 20,
+	VMMDEVREQ_SET_HYPERVISOR_INFO          = 21,
+	VMMDEVREQ_REGISTER_PATCH_MEMORY        = 22, /* since version 3.0.6 */
+	VMMDEVREQ_DEREGISTER_PATCH_MEMORY      = 23, /* since version 3.0.6 */
+	VMMDEVREQ_SET_POWER_STATUS             = 30,
+	VMMDEVREQ_ACKNOWLEDGE_EVENTS           = 41,
+	VMMDEVREQ_CTL_GUEST_FILTER_MASK        = 42,
+	VMMDEVREQ_REPORT_GUEST_INFO            = 50,
+	VMMDEVREQ_REPORT_GUEST_INFO2           = 58, /* since version 3.2.0 */
+	VMMDEVREQ_REPORT_GUEST_STATUS          = 59, /* since version 3.2.8 */
+	VMMDEVREQ_REPORT_GUEST_USER_STATE      = 74, /* since version 4.3 */
+	/* Retrieve a display resize request sent by the host, deprecated. */
+	VMMDEVREQ_GET_DISPLAY_CHANGE_REQ       = 51,
+	VMMDEVREQ_VIDEMODE_SUPPORTED           = 52,
+	VMMDEVREQ_GET_HEIGHT_REDUCTION         = 53,
+	/**
+	 * @VMMDEVREQ_GET_DISPLAY_CHANGE_REQ2:
+	 * Retrieve a display resize request sent by the host.
+	 *
+	 * Queries a display resize request sent from the host.  If the
+	 * event_ack member is set to true and there is an unqueried request
+	 * available for one of the virtual displays then that request will
+	 * be returned.  If several displays have unqueried requests the lowest
+	 * numbered display will be chosen first.  Only the most recent unseen
+	 * request for each display is remembered.
+	 * If event_ack is set to false, the last host request queried with
+	 * event_ack set is resent, or failing that the most recent received
+	 * from the host.  If no host request was ever received then all zeros
+	 * are returned.
+	 */
+	VMMDEVREQ_GET_DISPLAY_CHANGE_REQ2      = 54,
+	VMMDEVREQ_REPORT_GUEST_CAPABILITIES    = 55,
+	VMMDEVREQ_SET_GUEST_CAPABILITIES       = 56,
+	VMMDEVREQ_VIDEMODE_SUPPORTED2          = 57, /* since version 3.2.0 */
+	VMMDEVREQ_GET_DISPLAY_CHANGE_REQEX     = 80, /* since version 4.2.4 */
+	VMMDEVREQ_HGCM_CONNECT                 = 60,
+	VMMDEVREQ_HGCM_DISCONNECT              = 61,
+	VMMDEVREQ_HGCM_CALL32                  = 62,
+	VMMDEVREQ_HGCM_CALL64                  = 63,
+	VMMDEVREQ_HGCM_CANCEL                  = 64,
+	VMMDEVREQ_HGCM_CANCEL2                 = 65,
+	VMMDEVREQ_VIDEO_ACCEL_ENABLE           = 70,
+	VMMDEVREQ_VIDEO_ACCEL_FLUSH            = 71,
+	VMMDEVREQ_VIDEO_SET_VISIBLE_REGION     = 72,
+	VMMDEVREQ_GET_SEAMLESS_CHANGE_REQ      = 73,
+	VMMDEVREQ_QUERY_CREDENTIALS            = 100,
+	VMMDEVREQ_REPORT_CREDENTIALS_JUDGEMENT = 101,
+	VMMDEVREQ_REPORT_GUEST_STATS           = 110,
+	VMMDEVREQ_GET_MEMBALLOON_CHANGE_REQ    = 111,
+	VMMDEVREQ_GET_STATISTICS_CHANGE_REQ    = 112,
+	VMMDEVREQ_CHANGE_MEMBALLOON            = 113,
+	VMMDEVREQ_GET_VRDPCHANGE_REQ           = 150,
+	VMMDEVREQ_LOG_STRING                   = 200,
+	VMMDEVREQ_GET_CPU_HOTPLUG_REQ          = 210,
+	VMMDEVREQ_SET_CPU_HOTPLUG_STATUS       = 211,
+	VMMDEVREQ_REGISTER_SHARED_MODULE       = 212,
+	VMMDEVREQ_UNREGISTER_SHARED_MODULE     = 213,
+	VMMDEVREQ_CHECK_SHARED_MODULES         = 214,
+	VMMDEVREQ_GET_PAGE_SHARING_STATUS      = 215,
+	VMMDEVREQ_DEBUG_IS_PAGE_SHARED         = 216,
+	VMMDEVREQ_GET_SESSION_ID               = 217, /* since version 3.2.8 */
+	VMMDEVREQ_WRITE_COREDUMP               = 218,
+	VMMDEVREQ_GUEST_HEARTBEAT              = 219,
+	VMMDEVREQ_HEARTBEAT_CONFIGURE          = 220,
+	/* Ensure the enum is a 32 bit data-type */
+	VMMDEVREQ_SIZEHACK                     = 0x7fffffff
+};
+
+#if __BITS_PER_LONG == 64
+#define VMMDEVREQ_HGCM_CALL VMMDEVREQ_HGCM_CALL64
+#else
+#define VMMDEVREQ_HGCM_CALL VMMDEVREQ_HGCM_CALL32
+#endif
+
+/** HGCM service location types. */
+enum vmmdev_hgcm_service_location_type {
+	VMMDEV_HGCM_LOC_INVALID    = 0,
+	VMMDEV_HGCM_LOC_LOCALHOST  = 1,
+	VMMDEV_HGCM_LOC_LOCALHOST_EXISTING = 2,
+	/* Ensure the enum is a 32 bit data-type */
+	VMMDEV_HGCM_LOC_SIZEHACK   = 0x7fffffff
+};
+
+/** HGCM host service location. */
+struct vmmdev_hgcm_service_location_localhost {
+	/** Service name */
+	char service_name[128];
+};
+VMMDEV_ASSERT_SIZE(vmmdev_hgcm_service_location_localhost, 128);
+
+/** HGCM service location. */
+struct vmmdev_hgcm_service_location {
+	/** Type of the location. */
+	enum vmmdev_hgcm_service_location_type type;
+
+	union {
+		struct vmmdev_hgcm_service_location_localhost localhost;
+	} u;
+};
+VMMDEV_ASSERT_SIZE(vmmdev_hgcm_service_location, 128 + 4);
+
+/** HGCM function parameter type. */
+enum vmmdev_hgcm_function_parameter_type {
+	VMMDEV_HGCM_PARM_TYPE_INVALID            = 0,
+	VMMDEV_HGCM_PARM_TYPE_32BIT              = 1,
+	VMMDEV_HGCM_PARM_TYPE_64BIT              = 2,
+	/** Deprecated. Doesn't work, use PAGELIST. */
+	VMMDEV_HGCM_PARM_TYPE_PHYSADDR           = 3,
+	/** In and Out, user-memory */
+	VMMDEV_HGCM_PARM_TYPE_LINADDR            = 4,
+	/** In, user-memory  (read;  host<-guest) */
+	VMMDEV_HGCM_PARM_TYPE_LINADDR_IN         = 5,
+	/** Out, user-memory (write; host->guest) */
+	VMMDEV_HGCM_PARM_TYPE_LINADDR_OUT        = 6,
+	/** In and Out, kernel-memory */
+	VMMDEV_HGCM_PARM_TYPE_LINADDR_KERNEL     = 7,
+	/** In, kernel-memory  (read;  host<-guest) */
+	VMMDEV_HGCM_PARM_TYPE_LINADDR_KERNEL_IN  = 8,
+	/** Out, kernel-memory (write; host->guest) */
+	VMMDEV_HGCM_PARM_TYPE_LINADDR_KERNEL_OUT = 9,
+	/** Physical addresses of locked pages for a buffer. */
+	VMMDEV_HGCM_PARM_TYPE_PAGELIST           = 10,
+	/* Ensure the enum is a 32 bit data-type */
+	VMMDEV_HGCM_PARM_TYPE_SIZEHACK           = 0x7fffffff
+};
+
+/** HGCM function parameter, 32-bit client. */
+struct vmmdev_hgcm_function_parameter32 {
+	enum vmmdev_hgcm_function_parameter_type type;
+	union {
+		__u32 value32;
+		__u64 value64;
+		struct {
+			__u32 size;
+			union {
+				__u32 phys_addr;
+				__u32 linear_addr;
+			} u;
+		} pointer;
+		struct {
+			/** Size of the buffer described by the page list. */
+			__u32 size;
+			/** Relative to the request header. */
+			__u32 offset;
+		} page_list;
+	} u;
+} __packed;
+VMMDEV_ASSERT_SIZE(vmmdev_hgcm_function_parameter32, 4 + 8);
+
+/** HGCM function parameter, 64-bit client. */
+struct vmmdev_hgcm_function_parameter64 {
+	enum vmmdev_hgcm_function_parameter_type type;
+	union {
+		__u32 value32;
+		__u64 value64;
+		struct {
+			__u32 size;
+			union {
+				__u64 phys_addr;
+				__u64 linear_addr;
+			} u;
+		} __packed pointer;
+		struct {
+			/** Size of the buffer described by the page list. */
+			__u32 size;
+			/** Relative to the request header. */
+			__u32 offset;
+		} page_list;
+	} __packed u;
+} __packed;
+VMMDEV_ASSERT_SIZE(vmmdev_hgcm_function_parameter64, 4 + 12);
+
+#if __BITS_PER_LONG == 64
+#define vmmdev_hgcm_function_parameter vmmdev_hgcm_function_parameter64
+#else
+#define vmmdev_hgcm_function_parameter vmmdev_hgcm_function_parameter32
+#endif
+
+#define VMMDEV_HGCM_F_PARM_DIRECTION_NONE      0x00000000U
+#define VMMDEV_HGCM_F_PARM_DIRECTION_TO_HOST   0x00000001U
+#define VMMDEV_HGCM_F_PARM_DIRECTION_FROM_HOST 0x00000002U
+#define VMMDEV_HGCM_F_PARM_DIRECTION_BOTH      0x00000003U
+
+/**
+ * struct vmmdev_hgcm_pagelist - VMMDEV_HGCM_PARM_TYPE_PAGELIST parameters
+ * point to this structure to actually describe the buffer.
+ */
+struct vmmdev_hgcm_pagelist {
+	__u32 flags;             /** VMMDEV_HGCM_F_PARM_*. */
+	__u16 offset_first_page; /** Data offset in the first page. */
+	__u16 page_count;        /** Number of pages. */
+	__u64 pages[1];          /** Page addresses. */
+};
+VMMDEV_ASSERT_SIZE(vmmdev_hgcm_pagelist, 4 + 2 + 2 + 8);
+
+#endif
diff --git a/include/uapi/linux/vboxguest.h b/include/uapi/linux/vboxguest.h
new file mode 100644
index 000000000000..612f0c7d3558
--- /dev/null
+++ b/include/uapi/linux/vboxguest.h
@@ -0,0 +1,330 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR CDDL-1.0) */
+/*
+ * VBoxGuest - VirtualBox Guest Additions Driver Interface.
+ *
+ * Copyright (C) 2006-2016 Oracle Corporation
+ */
+
+#ifndef __UAPI_VBOXGUEST_H__
+#define __UAPI_VBOXGUEST_H__
+
+#include <asm/bitsperlong.h>
+#include <linux/ioctl.h>
+#include <linux/vbox_err.h>
+#include <linux/vbox_vmmdev_types.h>
+
+/* Version of vbg_ioctl_hdr structure. */
+#define VBG_IOCTL_HDR_VERSION		0x10001
+/* Default request type.  Use this for non-VMMDev requests. */
+#define VBG_IOCTL_HDR_TYPE_DEFAULT		0
+
+/**
+ * Common ioctl header.
+ *
+ * This is a mirror of vmmdev_request_header to prevent duplicating data and
+ * needing to verify things multiple times.
+ */
+struct vbg_ioctl_hdr {
+	/** IN: The request input size, and output size if size_out is zero. */
+	__u32 size_in;
+	/** IN: Structure version (VBG_IOCTL_HDR_VERSION) */
+	__u32 version;
+	/** IN: The VMMDev request type or VBG_IOCTL_HDR_TYPE_DEFAULT. */
+	__u32 type;
+	/**
+	 * OUT: The VBox status code of the operation, out direction only.
+	 * This is a VINF_ or VERR_ value as defined in vbox_err.h.
+	 */
+	__s32 rc;
+	/** IN: Output size. Set to zero to use size_in as output size. */
+	__u32 size_out;
+	/** Reserved, MBZ. */
+	__u32 reserved;
+};
+VMMDEV_ASSERT_SIZE(vbg_ioctl_hdr, 24);
+
+
+/*
+ * The VBoxGuest I/O control version.
+ *
+ * As usual, the high word contains the major version and changes to it
+ * signify incompatible changes.
+ *
+ * The lower word is the minor version number, it is increased when new
+ * functions are added or existing changed in a backwards compatible manner.
+ */
+#define VBG_IOC_VERSION		0x00010000u
+
+/**
+ * VBG_IOCTL_DRIVER_VERSION_INFO data structure
+ *
+ * Note VBG_IOCTL_DRIVER_VERSION_INFO may switch the session to a backwards
+ * compatible interface version if req_version indicates older client code.
+ */
+struct vbg_ioctl_driver_version_info {
+	/** The header. */
+	struct vbg_ioctl_hdr hdr;
+	union {
+		struct {
+			/** Requested interface version (VBG_IOC_VERSION). */
+			__u32 req_version;
+			/**
+			 * Minimum interface version number (typically the
+			 * major version part of VBG_IOC_VERSION).
+			 */
+			__u32 min_version;
+			/** Reserved, MBZ. */
+			__u32 reserved1;
+			/** Reserved, MBZ. */
+			__u32 reserved2;
+		} in;
+		struct {
+			/** Version for this session (typ. VBG_IOC_VERSION). */
+			__u32 session_version;
+			/** Version of the IDC interface (VBG_IOC_VERSION). */
+			__u32 driver_version;
+			/** The SVN revision of the driver, or 0. */
+			__u32 driver_revision;
+			/** Reserved \#1 (zero until defined). */
+			__u32 reserved1;
+			/** Reserved \#2 (zero until defined). */
+			__u32 reserved2;
+		} out;
+	} u;
+};
+VMMDEV_ASSERT_SIZE(vbg_ioctl_driver_version_info, 24 + 20);
+
+#define VBG_IOCTL_DRIVER_VERSION_INFO \
+	_IOWR('V', 0, struct vbg_ioctl_driver_version_info)
+
+
+/* IOCTL to perform a VMM Device request less than 1KB in size. */
+#define VBG_IOCTL_VMMDEV_REQUEST(s)	_IOC(_IOC_READ | _IOC_WRITE, 'V', 2, s)
+
+
+/* IOCTL to perform a VMM Device request larger than 1KB. */
+#define VBG_IOCTL_VMMDEV_REQUEST_BIG	_IOC(_IOC_READ | _IOC_WRITE, 'V', 3, 0)
+
+
+/** VBG_IOCTL_HGCM_CONNECT data structure. */
+struct vbg_ioctl_hgcm_connect {
+	struct vbg_ioctl_hdr hdr;
+	union {
+		struct {
+			struct vmmdev_hgcm_service_location loc;
+		} in;
+		struct {
+			__u32 client_id;
+		} out;
+	} u;
+};
+VMMDEV_ASSERT_SIZE(vbg_ioctl_hgcm_connect, 24 + 132);
+
+#define VBG_IOCTL_HGCM_CONNECT \
+	_IOWR('V', 4, struct vbg_ioctl_hgcm_connect)
+
+
+/** VBG_IOCTL_HGCM_DISCONNECT data structure. */
+struct vbg_ioctl_hgcm_disconnect {
+	struct vbg_ioctl_hdr hdr;
+	union {
+		struct {
+			__u32 client_id;
+		} in;
+	} u;
+};
+VMMDEV_ASSERT_SIZE(vbg_ioctl_hgcm_disconnect, 24 + 4);
+
+#define VBG_IOCTL_HGCM_DISCONNECT \
+	_IOWR('V', 5, struct vbg_ioctl_hgcm_disconnect)
+
+
+/** VBG_IOCTL_HGCM_CALL data structure. */
+struct vbg_ioctl_hgcm_call {
+	/** The header. */
+	struct vbg_ioctl_hdr hdr;
+	/** Input: The id of the caller. */
+	__u32 client_id;
+	/** Input: Function number. */
+	__u32 function;
+	/**
+	 * Input: How long to wait (milliseconds) for completion before
+	 * cancelling the call. Set to -1 to wait indefinitely.
+	 */
+	__u32 timeout_ms;
+	/** Interruptible flag, ignored for userspace calls. */
+	__u8 interruptible;
+	/** Explicit padding, MBZ. */
+	__u8 reserved;
+	/**
+	 * Input: How many parameters following this structure.
+	 *
+	 * The parameters are either vmmdev_hgcm_function_parameter64 or 32,
+	 * depending on whether we're receiving a 64-bit or 32-bit request.
+	 *
+	 * The current maximum is 61 parameters (given a 1KB max request size,
+	 * and a 64-bit parameter size of 16 bytes).
+	 */
+	__u16 parm_count;
+	/*
+	 * Parameters follow in form:
+	 * struct vmmdev_hgcm_function_parameter<32|64> parms[parm_count]
+	 */
+};
+VMMDEV_ASSERT_SIZE(vbg_ioctl_hgcm_call, 24 + 16);
+
+#define VBG_IOCTL_HGCM_CALL_32(s)	_IOC(_IOC_READ | _IOC_WRITE, 'V', 6, s)
+#define VBG_IOCTL_HGCM_CALL_64(s)	_IOC(_IOC_READ | _IOC_WRITE, 'V', 7, s)
+#if __BITS_PER_LONG == 64
+#define VBG_IOCTL_HGCM_CALL(s)		VBG_IOCTL_HGCM_CALL_64(s)
+#else
+#define VBG_IOCTL_HGCM_CALL(s)		VBG_IOCTL_HGCM_CALL_32(s)
+#endif
+
+
+/** VBG_IOCTL_LOG data structure. */
+struct vbg_ioctl_log {
+	/** The header. */
+	struct vbg_ioctl_hdr hdr;
+	union {
+		struct {
+			/**
+			 * The log message, this may be zero terminated. If it
+			 * is not zero terminated then the length is determined
+			 * from the input size.
+			 */
+			char msg[1];
+		} in;
+	} u;
+};
+
+#define VBG_IOCTL_LOG(s)		_IOC(_IOC_READ | _IOC_WRITE, 'V', 9, s)
+
+
+/** VBG_IOCTL_WAIT_FOR_EVENTS data structure. */
+struct vbg_ioctl_wait_for_events {
+	/** The header. */
+	struct vbg_ioctl_hdr hdr;
+	union {
+		struct {
+			/** Timeout in milliseconds. */
+			__u32 timeout_ms;
+			/** Events to wait for. */
+			__u32 events;
+		} in;
+		struct {
+			/** Events that occurred. */
+			__u32 events;
+		} out;
+	} u;
+};
+VMMDEV_ASSERT_SIZE(vbg_ioctl_wait_for_events, 24 + 8);
+
+#define VBG_IOCTL_WAIT_FOR_EVENTS \
+	_IOWR('V', 10, struct vbg_ioctl_wait_for_events)
+
+
+/*
+ * IOCTL to VBoxGuest to interrupt (cancel) any pending
+ * VBG_IOCTL_WAIT_FOR_EVENTS and return.
+ *
+ * Handled inside the vboxguest driver and not seen by the host at all.
+ * After calling this, VBG_IOCTL_WAIT_FOR_EVENTS should no longer be called in
+ * the same session. Any VBG_IOCTL_WAIT_FOR_EVENTS calls in the same session
+ * done after calling this will directly exit with -EINTR.
+ */
+#define VBG_IOCTL_INTERRUPT_ALL_WAIT_FOR_EVENTS \
+	_IOWR('V', 11, struct vbg_ioctl_hdr)
+
+
+/** VBG_IOCTL_CHANGE_FILTER_MASK data structure. */
+struct vbg_ioctl_change_filter {
+	/** The header. */
+	struct vbg_ioctl_hdr hdr;
+	union {
+		struct {
+			/** Flags to set. */
+			__u32 or_mask;
+			/** Flags to remove. */
+			__u32 not_mask;
+		} in;
+	} u;
+};
+VMMDEV_ASSERT_SIZE(vbg_ioctl_change_filter, 24 + 8);
+
+/* IOCTL to VBoxGuest to control the event filter mask. */
+#define VBG_IOCTL_CHANGE_FILTER_MASK \
+	_IOWR('V', 12, struct vbg_ioctl_change_filter)
+
+
+/** VBG_IOCTL_CHANGE_GUEST_CAPABILITIES data structure. */
+struct vbg_ioctl_set_guest_caps {
+	/** The header. */
+	struct vbg_ioctl_hdr hdr;
+	union {
+		struct {
+			/** Capabilities to set (VMMDEV_GUEST_SUPPORTS_XXX). */
+			__u32 or_mask;
+			/** Capabilities to drop (VMMDEV_GUEST_SUPPORTS_XXX). */
+			__u32 not_mask;
+		} in;
+		struct {
+			/** Capabilities held by the session after the call. */
+			__u32 session_caps;
+			/** Capabilities for all the sessions after the call. */
+			__u32 global_caps;
+		} out;
+	} u;
+};
+VMMDEV_ASSERT_SIZE(vbg_ioctl_set_guest_caps, 24 + 8);
+
+#define VBG_IOCTL_CHANGE_GUEST_CAPABILITIES \
+	_IOWR('V', 14, struct vbg_ioctl_set_guest_caps)
+
+
+/** VBG_IOCTL_CHECK_BALLOON data structure. */
+struct vbg_ioctl_check_balloon {
+	/** The header. */
+	struct vbg_ioctl_hdr hdr;
+	union {
+		struct {
+			/** The size of the balloon in chunks of 1MB. */
+			__u32 balloon_chunks;
+			/**
+			 * false = handled in R0, no further action required.
+			 *  true = allocate balloon memory in R3.
+			 */
+			__u8 handle_in_r3;
+			/** Explicit padding, MBZ. */
+			__u8 padding[3];
+		} out;
+	} u;
+};
+VMMDEV_ASSERT_SIZE(vbg_ioctl_check_balloon, 24 + 8);
+
+/*
+ * IOCTL to check memory ballooning.
+ *
+ * The guest kernel module will ask the host for the current size of the
+ * balloon and adjust the size. Or it will set handle_in_r3 = true and R3 is
+ * responsible for allocating memory and calling VBG_IOCTL_CHANGE_BALLOON.
+ */
+#define VBG_IOCTL_CHECK_BALLOON \
+	_IOWR('V', 17, struct vbg_ioctl_check_balloon)
+
+
+/** VBG_IOCTL_WRITE_CORE_DUMP data structure. */
+struct vbg_ioctl_write_coredump {
+	struct vbg_ioctl_hdr hdr;
+	union {
+		struct {
+			__u32 flags; /** Flags (reserved, MBZ). */
+		} in;
+	} u;
+};
+VMMDEV_ASSERT_SIZE(vbg_ioctl_write_coredump, 24 + 4);
+
+#define VBG_IOCTL_WRITE_CORE_DUMP \
+	_IOWR('V', 19, struct vbg_ioctl_write_coredump)
+
+#endif
-- 
cgit v1.2.3


From 06ef0ccb5a36e1feba9b413ff59a04ecc4407c1c Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Mon, 18 Dec 2017 10:13:44 -0800
Subject: bpf/cgroup: fix a verification error for a CGROUP_DEVICE type prog

The tools/testing/selftests/bpf test program
test_dev_cgroup fails with the following error
when compiled with llvm 6.0. (I did not try
with earlier versions.)

  libbpf: load bpf program failed: Permission denied
  libbpf: -- BEGIN DUMP LOG ---
  libbpf:
  0: (61) r2 = *(u32 *)(r1 +4)
  1: (b7) r0 = 0
  2: (55) if r2 != 0x1 goto pc+8
   R0=inv0 R1=ctx(id=0,off=0,imm=0) R2=inv1 R10=fp0
  3: (69) r2 = *(u16 *)(r1 +0)
  invalid bpf_context access off=0 size=2
  ...

The culprit is the following statement in dev_cgroup.c:
  short type = ctx->access_type & 0xFFFF;
This code is typical as the ctx->access_type is assigned
as below in kernel/bpf/cgroup.c:
  struct bpf_cgroup_dev_ctx ctx = {
        .access_type = (access << 16) | dev_type,
        .major = major,
        .minor = minor,
  };

The compiler converts it to u16 access while
the verifier cgroup_dev_is_valid_access rejects
any non u32 access.

This patch permits the field access_type to be accessible
with type u16 and u8 as well.

Signed-off-by: Yonghong Song <yhs@fb.com>
Tested-by: Roman Gushchin <guro@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h |  3 ++-
 kernel/bpf/cgroup.c      | 15 +++++++++++++--
 2 files changed, 15 insertions(+), 3 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index d01f1cb3cfc0..69eabfcb9bdb 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1012,7 +1012,8 @@ struct bpf_perf_event_value {
 #define BPF_DEVCG_DEV_CHAR	(1ULL << 1)
 
 struct bpf_cgroup_dev_ctx {
-	__u32 access_type; /* (access << 16) | type */
+	/* access_type encoded as (BPF_DEVCG_ACC_* << 16) | BPF_DEVCG_DEV_* */
+	__u32 access_type;
 	__u32 major;
 	__u32 minor;
 };
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index b789ab78d28f..c1c0b60d3f2f 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -568,6 +568,8 @@ static bool cgroup_dev_is_valid_access(int off, int size,
 				       enum bpf_access_type type,
 				       struct bpf_insn_access_aux *info)
 {
+	const int size_default = sizeof(__u32);
+
 	if (type == BPF_WRITE)
 		return false;
 
@@ -576,8 +578,17 @@ static bool cgroup_dev_is_valid_access(int off, int size,
 	/* The verifier guarantees that size > 0. */
 	if (off % size != 0)
 		return false;
-	if (size != sizeof(__u32))
-		return false;
+
+	switch (off) {
+	case bpf_ctx_range(struct bpf_cgroup_dev_ctx, access_type):
+		bpf_ctx_record_field_size(info, size_default);
+		if (!bpf_ctx_narrow_access_ok(off, size, size_default))
+			return false;
+		break;
+	default:
+		if (size != size_default)
+			return false;
+	}
 
 	return true;
 }
-- 
cgit v1.2.3


From f0edce7a7f4c236f2d05040746b388cfac8796a1 Mon Sep 17 00:00:00 2001
From: Logan Gunthorpe <logang@deltatee.com>
Date: Wed, 29 Nov 2017 10:28:43 -0700
Subject: switchtec: Add Global Fabric Manager Server (GFMS) event

Add a new event type that is newly exposed by recent firmware. The event
will never occur if the firmware is too old. If user space tries to use
this event in an older kernel, it will just get an EINVAL which is
perfectly acceptable in the existing user space code.

Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/pci/switch/switchtec.c       | 1 +
 include/linux/switchtec.h            | 3 +++
 include/uapi/linux/switchtec_ioctl.h | 3 ++-
 3 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/pci/switch/switchtec.c b/drivers/pci/switch/switchtec.c
index 730cc897b94d..7668d270725d 100644
--- a/drivers/pci/switch/switchtec.c
+++ b/drivers/pci/switch/switchtec.c
@@ -707,6 +707,7 @@ static const struct event_reg {
 	EV_GLB(SWITCHTEC_IOCTL_EVENT_CLI_MRPC_COMP_ASYNC,
 	       cli_mrpc_comp_async_hdr),
 	EV_GLB(SWITCHTEC_IOCTL_EVENT_GPIO_INT, gpio_interrupt_hdr),
+	EV_GLB(SWITCHTEC_IOCTL_EVENT_GFMS, gfms_event_hdr),
 	EV_PAR(SWITCHTEC_IOCTL_EVENT_PART_RESET, part_reset_hdr),
 	EV_PAR(SWITCHTEC_IOCTL_EVENT_MRPC_COMP, mrpc_comp_hdr),
 	EV_PAR(SWITCHTEC_IOCTL_EVENT_MRPC_COMP_ASYNC, mrpc_comp_async_hdr),
diff --git a/include/linux/switchtec.h b/include/linux/switchtec.h
index 09d73d0d1aa8..42d121174fe2 100644
--- a/include/linux/switchtec.h
+++ b/include/linux/switchtec.h
@@ -100,6 +100,9 @@ struct sw_event_regs {
 	u32 gpio_interrupt_hdr;
 	u32 gpio_interrupt_data;
 	u32 reserved16[4];
+	u32 gfms_event_hdr;
+	u32 gfms_event_data;
+	u32 reserved17[4];
 } __packed;
 
 enum {
diff --git a/include/uapi/linux/switchtec_ioctl.h b/include/uapi/linux/switchtec_ioctl.h
index 75df44373034..4f4daf8db954 100644
--- a/include/uapi/linux/switchtec_ioctl.h
+++ b/include/uapi/linux/switchtec_ioctl.h
@@ -88,7 +88,8 @@ struct switchtec_ioctl_event_summary {
 #define SWITCHTEC_IOCTL_EVENT_FORCE_SPEED		26
 #define SWITCHTEC_IOCTL_EVENT_CREDIT_TIMEOUT		27
 #define SWITCHTEC_IOCTL_EVENT_LINK_STATE		28
-#define SWITCHTEC_IOCTL_MAX_EVENTS			29
+#define SWITCHTEC_IOCTL_EVENT_GFMS			29
+#define SWITCHTEC_IOCTL_MAX_EVENTS			30
 
 #define SWITCHTEC_IOCTL_EVENT_LOCAL_PART_IDX -1
 #define SWITCHTEC_IOCTL_EVENT_IDX_ALL -2
-- 
cgit v1.2.3


From 983dafaab799511e092ffd006f3a064b37ccbccf Mon Sep 17 00:00:00 2001
From: Sunil Dutt <usdutt@qti.qualcomm.com>
Date: Wed, 13 Dec 2017 19:51:36 +0200
Subject: cfg80211: Scan results to also report the per chain signal strength

This commit enhances the scan results to report the per chain signal
strength based on the latest BSS update. This provides similar
information to what is already available through STA information.

Signed-off-by: Sunil Dutt <usdutt@qti.qualcomm.com>
Signed-off-by: Jouni Malinen <jouni@qca.qualcomm.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       | 8 ++++++++
 include/uapi/linux/nl80211.h | 4 ++++
 net/wireless/nl80211.c       | 5 +++++
 net/wireless/scan.c          | 5 +++++
 4 files changed, 22 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index d7f8e7b96bcb..3a4a1a903a4d 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -1773,6 +1773,8 @@ enum cfg80211_signal_type {
  *	by %parent_bssid.
  * @parent_bssid: the BSS according to which %parent_tsf is set. This is set to
  *	the BSS that requested the scan in which the beacon/probe was received.
+ * @chains: bitmask for filled values in @chain_signal.
+ * @chain_signal: per-chain signal strength of last received BSS in dBm.
  */
 struct cfg80211_inform_bss {
 	struct ieee80211_channel *chan;
@@ -1781,6 +1783,8 @@ struct cfg80211_inform_bss {
 	u64 boottime_ns;
 	u64 parent_tsf;
 	u8 parent_bssid[ETH_ALEN] __aligned(2);
+	u8 chains;
+	s8 chain_signal[IEEE80211_MAX_CHAINS];
 };
 
 /**
@@ -1824,6 +1828,8 @@ struct cfg80211_bss_ies {
  *	that holds the beacon data. @beacon_ies is still valid, of course, and
  *	points to the same data as hidden_beacon_bss->beacon_ies in that case.
  * @signal: signal strength value (type depends on the wiphy's signal_type)
+ * @chains: bitmask for filled values in @chain_signal.
+ * @chain_signal: per-chain signal strength of last received BSS in dBm.
  * @priv: private area for driver use, has at least wiphy->bss_priv_size bytes
  */
 struct cfg80211_bss {
@@ -1842,6 +1848,8 @@ struct cfg80211_bss {
 	u16 capability;
 
 	u8 bssid[ETH_ALEN];
+	u8 chains;
+	s8 chain_signal[IEEE80211_MAX_CHAINS];
 
 	u8 priv[0] __aligned(sizeof(void *));
 };
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index f882fe1f9709..c587a61c32bf 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -3862,6 +3862,9 @@ enum nl80211_bss_scan_width {
  *	@NL80211_BSS_PARENT_BSSID. (u64).
  * @NL80211_BSS_PARENT_BSSID: the BSS according to which @NL80211_BSS_PARENT_TSF
  *	is set.
+ * @NL80211_BSS_CHAIN_SIGNAL: per-chain signal strength of last BSS update.
+ *	Contains a nested array of signal strength attributes (u8, dBm),
+ *	using the nesting index as the antenna number.
  * @__NL80211_BSS_AFTER_LAST: internal
  * @NL80211_BSS_MAX: highest BSS attribute
  */
@@ -3885,6 +3888,7 @@ enum nl80211_bss {
 	NL80211_BSS_PAD,
 	NL80211_BSS_PARENT_TSF,
 	NL80211_BSS_PARENT_BSSID,
+	NL80211_BSS_CHAIN_SIGNAL,
 
 	/* keep last */
 	__NL80211_BSS_AFTER_LAST,
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index e4dddfb64ced..b3f8970c3a47 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -7839,6 +7839,11 @@ static int nl80211_send_bss(struct sk_buff *msg, struct netlink_callback *cb,
 			      intbss->ts_boottime, NL80211_BSS_PAD))
 		goto nla_put_failure;
 
+	if (!nl80211_put_signal(msg, intbss->pub.chains,
+				intbss->pub.chain_signal,
+				NL80211_BSS_CHAIN_SIGNAL))
+		goto nla_put_failure;
+
 	switch (rdev->wiphy.signal_type) {
 	case CFG80211_SIGNAL_TYPE_MBM:
 		if (nla_put_u32(msg, NL80211_BSS_SIGNAL_MBM, res->signal))
diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index f6c5fe482506..d36c3eb7b931 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -981,6 +981,9 @@ cfg80211_bss_update(struct cfg80211_registered_device *rdev,
 		found->ts = tmp->ts;
 		found->ts_boottime = tmp->ts_boottime;
 		found->parent_tsf = tmp->parent_tsf;
+		found->pub.chains = tmp->pub.chains;
+		memcpy(found->pub.chain_signal, tmp->pub.chain_signal,
+		       IEEE80211_MAX_CHAINS);
 		ether_addr_copy(found->parent_bssid, tmp->parent_bssid);
 	} else {
 		struct cfg80211_internal_bss *new;
@@ -1233,6 +1236,8 @@ cfg80211_inform_bss_frame_data(struct wiphy *wiphy,
 	tmp.pub.capability = le16_to_cpu(mgmt->u.probe_resp.capab_info);
 	tmp.ts_boottime = data->boottime_ns;
 	tmp.parent_tsf = data->parent_tsf;
+	tmp.pub.chains = data->chains;
+	memcpy(tmp.pub.chain_signal, data->chain_signal, IEEE80211_MAX_CHAINS);
 	ether_addr_copy(tmp.parent_bssid, data->parent_bssid);
 
 	signal_valid = abs(data->chan->center_freq - channel->center_freq) <=
-- 
cgit v1.2.3


From f9d79126195374c285035777b9d6abd24ceba363 Mon Sep 17 00:00:00 2001
From: Athanasios Oikonomou <athoik@gmail.com>
Date: Sat, 16 Dec 2017 07:23:38 -0500
Subject: media: dvb_frontend: add physical layer scrambling support

This commit adds a new property DTV_SCRAMBLING_SEQUENCE_INDEX.

This 18 bit field, when present, carries the index of the DVB-S2 physical
layer scrambling sequence as defined in clause 5.5.4 of EN 302 307.
There is no explicit signalling method to convey scrambling sequence index
to the receiver. If S2 satellite delivery system descriptor is available
it can be used to read the scrambling sequence index (EN 300 468 table 41).

By default, gold scrambling sequence index 0 is used. The valid scrambling
sequence index range is from 0 to 262142.

Increase the DVB API version in order for userspace to be aware of the changes.

Signed-off-by: Athanasios Oikonomou <athoik@gmail.com>
Acked-by: Ralph Metzler <rjkm@metzlerbros.de>
Signed-off-by: Mauro Carvalho Chehab <mchehab@s-opensource.com>
---
 .../media/uapi/dvb/fe_property_parameters.rst          | 18 ++++++++++++++++++
 .../uapi/dvb/frontend-property-satellite-systems.rst   |  2 ++
 drivers/media/dvb-core/dvb_frontend.c                  | 12 ++++++++++++
 drivers/media/dvb-core/dvb_frontend.h                  |  5 +++++
 include/uapi/linux/dvb/frontend.h                      |  5 ++++-
 include/uapi/linux/dvb/version.h                       |  2 +-
 6 files changed, 42 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/media/uapi/dvb/fe_property_parameters.rst b/Documentation/media/uapi/dvb/fe_property_parameters.rst
index 6eef507fea50..3524dcae4604 100644
--- a/Documentation/media/uapi/dvb/fe_property_parameters.rst
+++ b/Documentation/media/uapi/dvb/fe_property_parameters.rst
@@ -987,3 +987,21 @@ Possible values: 0, 1, LNA_AUTO
 1, LNA on
 
 use the special macro LNA_AUTO to set LNA auto
+
+
+.. _DTV-SCRAMBLING-SEQUENCE-INDEX:
+
+DTV_SCRAMBLING_SEQUENCE_INDEX
+=============================
+
+Used on DVB-S2.
+
+This 18 bit field, when present, carries the index of the DVB-S2 physical
+layer scrambling sequence as defined in clause 5.5.4 of EN 302 307.
+There is no explicit signalling method to convey scrambling sequence index
+to the receiver. If S2 satellite delivery system descriptor is available
+it can be used to read the scrambling sequence index (EN 300 468 table 41).
+
+By default, gold scrambling sequence index 0 is used.
+
+The valid scrambling sequence index range is from 0 to 262142.
diff --git a/Documentation/media/uapi/dvb/frontend-property-satellite-systems.rst b/Documentation/media/uapi/dvb/frontend-property-satellite-systems.rst
index 1f40399c68ff..2929e6999a7a 100644
--- a/Documentation/media/uapi/dvb/frontend-property-satellite-systems.rst
+++ b/Documentation/media/uapi/dvb/frontend-property-satellite-systems.rst
@@ -60,6 +60,8 @@ following parameters:
 
 -  :ref:`DTV_STREAM_ID <DTV-STREAM-ID>`
 
+-  :ref:`DTV_SCRAMBLING_SEQUENCE_INDEX <DTV-SCRAMBLING-SEQUENCE-INDEX>`
+
 In addition, the :ref:`DTV QoS statistics <frontend-stat-properties>`
 are also valid.
 
diff --git a/drivers/media/dvb-core/dvb_frontend.c b/drivers/media/dvb-core/dvb_frontend.c
index ee3ea4dcd9c1..5547b9830bbc 100644
--- a/drivers/media/dvb-core/dvb_frontend.c
+++ b/drivers/media/dvb-core/dvb_frontend.c
@@ -982,6 +982,7 @@ static int dvb_frontend_clear_cache(struct dvb_frontend *fe)
 	}
 
 	c->stream_id = NO_STREAM_ID_FILTER;
+	c->scrambling_sequence_index = 0;/* default sequence */
 
 	switch (c->delivery_system) {
 	case SYS_DVBS:
@@ -1072,6 +1073,7 @@ static struct dtv_cmds_h dtv_cmds[DTV_MAX_COMMAND + 1] = {
 
 	_DTV_CMD(DTV_STREAM_ID, 1, 0),
 	_DTV_CMD(DTV_DVBT2_PLP_ID_LEGACY, 1, 0),
+	_DTV_CMD(DTV_SCRAMBLING_SEQUENCE_INDEX, 1, 0),
 	_DTV_CMD(DTV_LNA, 1, 0),
 
 	/* Get */
@@ -1417,6 +1419,11 @@ static int dtv_property_process_get(struct dvb_frontend *fe,
 		tvp->u.data = c->stream_id;
 		break;
 
+	/* Physical layer scrambling support */
+	case DTV_SCRAMBLING_SEQUENCE_INDEX:
+		tvp->u.data = c->scrambling_sequence_index;
+		break;
+
 	/* ATSC-MH */
 	case DTV_ATSCMH_FIC_VER:
 		tvp->u.data = fe->dtv_property_cache.atscmh_fic_ver;
@@ -1900,6 +1907,11 @@ static int dtv_property_process_set(struct dvb_frontend *fe,
 		c->stream_id = data;
 		break;
 
+	/* Physical layer scrambling support */
+	case DTV_SCRAMBLING_SEQUENCE_INDEX:
+		c->scrambling_sequence_index = data;
+		break;
+
 	/* ATSC-MH */
 	case DTV_ATSCMH_PARADE_ID:
 		fe->dtv_property_cache.atscmh_parade_id = data;
diff --git a/drivers/media/dvb-core/dvb_frontend.h b/drivers/media/dvb-core/dvb_frontend.h
index ace0c2fb26c2..2bc25f1e425b 100644
--- a/drivers/media/dvb-core/dvb_frontend.h
+++ b/drivers/media/dvb-core/dvb_frontend.h
@@ -513,6 +513,8 @@ struct dvb_fe_events {
  * @layer.interleaving:	 per layer interleaving.
  * @stream_id:		If different than zero, enable substream filtering, if
  *			hardware supports (DVB-S2 and DVB-T2).
+ * @scrambling_sequence_index:	Carries the index of the DVB-S2 physical layer
+ *				scrambling sequence.
  * @atscmh_fic_ver:	Version number of the FIC (Fast Information Channel)
  *			signaling data (only ATSC-M/H)
  * @atscmh_parade_id:	Parade identification number (only ATSC-M/H)
@@ -591,6 +593,9 @@ struct dtv_frontend_properties {
 	/* Multistream specifics */
 	u32			stream_id;
 
+	/* Physical Layer Scrambling specifics */
+	u32			scrambling_sequence_index;
+
 	/* ATSC-MH specifics */
 	u8			atscmh_fic_ver;
 	u8			atscmh_parade_id;
diff --git a/include/uapi/linux/dvb/frontend.h b/include/uapi/linux/dvb/frontend.h
index 9dad6c66cc34..4f9b4551c534 100644
--- a/include/uapi/linux/dvb/frontend.h
+++ b/include/uapi/linux/dvb/frontend.h
@@ -547,7 +547,10 @@ enum fe_interleaving {
 #define DTV_STAT_ERROR_BLOCK_COUNT	68
 #define DTV_STAT_TOTAL_BLOCK_COUNT	69
 
-#define DTV_MAX_COMMAND		DTV_STAT_TOTAL_BLOCK_COUNT
+/* Physical layer scrambling */
+#define DTV_SCRAMBLING_SEQUENCE_INDEX	70
+
+#define DTV_MAX_COMMAND		DTV_SCRAMBLING_SEQUENCE_INDEX
 
 /**
  * enum fe_pilot - Type of pilot tone
diff --git a/include/uapi/linux/dvb/version.h b/include/uapi/linux/dvb/version.h
index 02e32ea83984..2c5cffe6d2a0 100644
--- a/include/uapi/linux/dvb/version.h
+++ b/include/uapi/linux/dvb/version.h
@@ -25,6 +25,6 @@
 #define _DVBVERSION_H_
 
 #define DVB_API_VERSION 5
-#define DVB_API_VERSION_MINOR 10
+#define DVB_API_VERSION_MINOR 11
 
 #endif /*_DVBVERSION_H_*/
-- 
cgit v1.2.3


From a32295c612c57990d17fb0f41e7134394b2f35f6 Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Wed, 13 Dec 2017 13:31:31 +1100
Subject: vfio-pci: Allow mapping MSIX BAR

By default VFIO disables mapping of MSIX BAR to the userspace as
the userspace may program it in a way allowing spurious interrupts;
instead the userspace uses the VFIO_DEVICE_SET_IRQS ioctl.
In order to eliminate guessing from the userspace about what is
mmapable, VFIO also advertises a sparse list of regions allowed to mmap.

This works fine as long as the system page size equals to the MSIX
alignment requirement which is 4KB. However with a bigger page size
the existing code prohibits mapping non-MSIX parts of a page with MSIX
structures so these parts have to be emulated via slow reads/writes on
a VFIO device fd. If these emulated bits are accessed often, this has
serious impact on performance.

This allows mmap of the entire BAR containing MSIX vector table.

This removes the sparse capability for PCI devices as it becomes useless.

As the userspace needs to know for sure whether mmapping of the MSIX
vector containing data can succeed, this adds a new capability -
VFIO_REGION_INFO_CAP_MSIX_MAPPABLE - which explicitly tells the userspace
that the entire BAR can be mmapped.

This does not touch the MSIX mangling in the BAR read/write handlers as
we are doing this just to enable direct access to non MSIX registers.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
[aw - fixup whitespace, trim function name]
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 drivers/vfio/pci/vfio_pci.c | 64 ++++++---------------------------------------
 include/uapi/linux/vfio.h   | 10 +++++++
 2 files changed, 18 insertions(+), 56 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index de48acd29a84..b0f759476900 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -565,47 +565,15 @@ static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev,
 	return walk.ret;
 }
 
-static int msix_sparse_mmap_cap(struct vfio_pci_device *vdev,
-				struct vfio_info_cap *caps)
+static int msix_mmappable_cap(struct vfio_pci_device *vdev,
+			      struct vfio_info_cap *caps)
 {
-	struct vfio_region_info_cap_sparse_mmap *sparse;
-	size_t end, size;
-	int nr_areas = 2, i = 0, ret;
-
-	end = pci_resource_len(vdev->pdev, vdev->msix_bar);
-
-	/* If MSI-X table is aligned to the start or end, only one area */
-	if (((vdev->msix_offset & PAGE_MASK) == 0) ||
-	    (PAGE_ALIGN(vdev->msix_offset + vdev->msix_size) >= end))
-		nr_areas = 1;
-
-	size = sizeof(*sparse) + (nr_areas * sizeof(*sparse->areas));
-
-	sparse = kzalloc(size, GFP_KERNEL);
-	if (!sparse)
-		return -ENOMEM;
-
-	sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP;
-	sparse->header.version = 1;
-	sparse->nr_areas = nr_areas;
-
-	if (vdev->msix_offset & PAGE_MASK) {
-		sparse->areas[i].offset = 0;
-		sparse->areas[i].size = vdev->msix_offset & PAGE_MASK;
-		i++;
-	}
-
-	if (PAGE_ALIGN(vdev->msix_offset + vdev->msix_size) < end) {
-		sparse->areas[i].offset = PAGE_ALIGN(vdev->msix_offset +
-						     vdev->msix_size);
-		sparse->areas[i].size = end - sparse->areas[i].offset;
-		i++;
-	}
-
-	ret = vfio_info_add_capability(caps, &sparse->header, size);
-	kfree(sparse);
+	struct vfio_info_cap_header header = {
+		.id = VFIO_REGION_INFO_CAP_MSIX_MAPPABLE,
+		.version = 1
+	};
 
-	return ret;
+	return vfio_info_add_capability(caps, &header, sizeof(header));
 }
 
 int vfio_pci_register_dev_region(struct vfio_pci_device *vdev,
@@ -696,7 +664,7 @@ static long vfio_pci_ioctl(void *device_data,
 			if (vdev->bar_mmap_supported[info.index]) {
 				info.flags |= VFIO_REGION_INFO_FLAG_MMAP;
 				if (info.index == vdev->msix_bar) {
-					ret = msix_sparse_mmap_cap(vdev, &caps);
+					ret = msix_mmappable_cap(vdev, &caps);
 					if (ret)
 						return ret;
 				}
@@ -1127,22 +1095,6 @@ static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma)
 	if (req_start + req_len > phys_len)
 		return -EINVAL;
 
-	if (index == vdev->msix_bar) {
-		/*
-		 * Disallow mmaps overlapping the MSI-X table; users don't
-		 * get to touch this directly.  We could find somewhere
-		 * else to map the overlap, but page granularity is only
-		 * a recommendation, not a requirement, so the user needs
-		 * to know which bits are real.  Requiring them to mmap
-		 * around the table makes that clear.
-		 */
-
-		/* If neither entirely above nor below, then it overlaps */
-		if (!(req_start >= vdev->msix_offset + vdev->msix_size ||
-		      req_start + req_len <= vdev->msix_offset))
-			return -EINVAL;
-	}
-
 	/*
 	 * Even though we don't make use of the barmap for the mmap,
 	 * we need to request the region and the barmap tracks that.
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index e3301dbd27d4..0d914350f7bf 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -301,6 +301,16 @@ struct vfio_region_info_cap_type {
 #define VFIO_REGION_SUBTYPE_INTEL_IGD_HOST_CFG	(2)
 #define VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG	(3)
 
+/*
+ * The MSIX mappable capability informs that MSIX data of a BAR can be mmapped
+ * which allows direct access to non-MSIX registers which happened to be within
+ * the same system page.
+ *
+ * Even though the userspace gets direct access to the MSIX data, the existing
+ * VFIO_DEVICE_SET_IRQS interface must still be used for MSIX configuration.
+ */
+#define VFIO_REGION_INFO_CAP_MSIX_MAPPABLE	3
+
 /**
  * VFIO_DEVICE_GET_IRQ_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 9,
  *				    struct vfio_irq_info)
-- 
cgit v1.2.3


From 4c82fd0abb87e20d0d68ef5237e74732352806c8 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 20 Dec 2017 12:08:33 +0100
Subject: netfilter: uapi: correct UNTRACKED conntrack state bit number

nft_ct exposes this bit to userspace.  This used to be

  #define NF_CT_STATE_UNTRACKED_BIT              (1 << (IP_CT_NUMBER + 1))
  (IP_CT_NUMBER is 5, so this was 0x40)

.. but this got changed to 8 (0x100) when the untracked object got removed.
Replace this with a literal 6 to prevent further incompatible changes
in case IP_CT_NUMBER ever increases.

Fixes: cc41c84b7e7f2 ("netfilter: kill the fake untracked conntrack objects")
Reported-by: Li Shuang <shuali@redhat.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_conntrack_common.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/netfilter/nf_conntrack_common.h b/include/uapi/linux/netfilter/nf_conntrack_common.h
index 3fea7709a441..57ccfb32e87f 100644
--- a/include/uapi/linux/netfilter/nf_conntrack_common.h
+++ b/include/uapi/linux/netfilter/nf_conntrack_common.h
@@ -36,7 +36,7 @@ enum ip_conntrack_info {
 
 #define NF_CT_STATE_INVALID_BIT			(1 << 0)
 #define NF_CT_STATE_BIT(ctinfo)			(1 << ((ctinfo) % IP_CT_IS_REPLY + 1))
-#define NF_CT_STATE_UNTRACKED_BIT		(1 << (IP_CT_UNTRACKED + 1))
+#define NF_CT_STATE_UNTRACKED_BIT		(1 << 6)
 
 /* Bitset representing status of connection. */
 enum ip_conntrack_status {
-- 
cgit v1.2.3


From fec149f5d3234c037ec761d1db4cc8c0550e9964 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven.eckelmann@openmesh.com>
Date: Thu, 21 Dec 2017 10:17:41 +0100
Subject: batman-adv: Convert packet.h to uapi header

The header file is used by different userspace programs to inject packets
or to decode sniffed packets. It should therefore be available to them as
userspace header.

Also other components in the kernel (like the flow dissector) require
access to the packet definitions to be able to decode ETH_P_BATMAN ethernet
packets.

Signed-off-by: Sven Eckelmann <sven.eckelmann@openmesh.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 MAINTAINERS                            |   1 +
 include/uapi/linux/batadv_packet.h     | 644 +++++++++++++++++++++++++++++++++
 net/batman-adv/bat_iv_ogm.c            |   2 +-
 net/batman-adv/bat_v.c                 |   2 +-
 net/batman-adv/bat_v_elp.c             |   2 +-
 net/batman-adv/bat_v_ogm.c             |   2 +-
 net/batman-adv/bridge_loop_avoidance.c |   2 +-
 net/batman-adv/distributed-arp-table.h |   2 +-
 net/batman-adv/fragmentation.c         |   2 +-
 net/batman-adv/gateway_client.c        |   2 +-
 net/batman-adv/gateway_common.c        |   2 +-
 net/batman-adv/hard-interface.c        |   2 +-
 net/batman-adv/icmp_socket.c           |   2 +-
 net/batman-adv/main.c                  |   2 +-
 net/batman-adv/main.h                  |   2 +-
 net/batman-adv/multicast.c             |   2 +-
 net/batman-adv/netlink.c               |   2 +-
 net/batman-adv/network-coding.c        |   2 +-
 net/batman-adv/packet.h                | 644 ---------------------------------
 net/batman-adv/routing.c               |   2 +-
 net/batman-adv/send.h                  |   3 +-
 net/batman-adv/soft-interface.c        |   2 +-
 net/batman-adv/sysfs.c                 |   2 +-
 net/batman-adv/tp_meter.c              |   2 +-
 net/batman-adv/translation-table.c     |   2 +-
 net/batman-adv/tvlv.c                  |   2 +-
 net/batman-adv/types.h                 |   3 +-
 27 files changed, 669 insertions(+), 670 deletions(-)
 create mode 100644 include/uapi/linux/batadv_packet.h
 delete mode 100644 net/batman-adv/packet.h

(limited to 'include/uapi/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index 129c591e0f34..753799d24cd9 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2564,6 +2564,7 @@ S:	Maintained
 F:	Documentation/ABI/testing/sysfs-class-net-batman-adv
 F:	Documentation/ABI/testing/sysfs-class-net-mesh
 F:	Documentation/networking/batman-adv.rst
+F:	include/uapi/linux/batadv_packet.h
 F:	include/uapi/linux/batman_adv.h
 F:	net/batman-adv/
 
diff --git a/include/uapi/linux/batadv_packet.h b/include/uapi/linux/batadv_packet.h
new file mode 100644
index 000000000000..5cb360be2a11
--- /dev/null
+++ b/include/uapi/linux/batadv_packet.h
@@ -0,0 +1,644 @@
+/* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) */
+/* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner, Simon Wunderlich
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _UAPI_LINUX_BATADV_PACKET_H_
+#define _UAPI_LINUX_BATADV_PACKET_H_
+
+#include <asm/byteorder.h>
+#include <linux/if_ether.h>
+#include <linux/types.h>
+
+/**
+ * batadv_tp_is_error() - Check throughput meter return code for error
+ * @n: throughput meter return code
+ *
+ * Return: 0 when no error was detected, != 0 otherwise
+ */
+#define batadv_tp_is_error(n) ((__u8)(n) > 127 ? 1 : 0)
+
+/**
+ * enum batadv_packettype - types for batman-adv encapsulated packets
+ * @BATADV_IV_OGM: originator messages for B.A.T.M.A.N. IV
+ * @BATADV_BCAST: broadcast packets carrying broadcast payload
+ * @BATADV_CODED: network coded packets
+ * @BATADV_ELP: echo location packets for B.A.T.M.A.N. V
+ * @BATADV_OGM2: originator messages for B.A.T.M.A.N. V
+ *
+ * @BATADV_UNICAST: unicast packets carrying unicast payload traffic
+ * @BATADV_UNICAST_FRAG: unicast packets carrying a fragment of the original
+ *     payload packet
+ * @BATADV_UNICAST_4ADDR: unicast packet including the originator address of
+ *     the sender
+ * @BATADV_ICMP: unicast packet like IP ICMP used for ping or traceroute
+ * @BATADV_UNICAST_TVLV: unicast packet carrying TVLV containers
+ */
+enum batadv_packettype {
+	/* 0x00 - 0x3f: local packets or special rules for handling */
+	BATADV_IV_OGM           = 0x00,
+	BATADV_BCAST            = 0x01,
+	BATADV_CODED            = 0x02,
+	BATADV_ELP		= 0x03,
+	BATADV_OGM2		= 0x04,
+	/* 0x40 - 0x7f: unicast */
+#define BATADV_UNICAST_MIN     0x40
+	BATADV_UNICAST          = 0x40,
+	BATADV_UNICAST_FRAG     = 0x41,
+	BATADV_UNICAST_4ADDR    = 0x42,
+	BATADV_ICMP             = 0x43,
+	BATADV_UNICAST_TVLV     = 0x44,
+#define BATADV_UNICAST_MAX     0x7f
+	/* 0x80 - 0xff: reserved */
+};
+
+/**
+ * enum batadv_subtype - packet subtype for unicast4addr
+ * @BATADV_P_DATA: user payload
+ * @BATADV_P_DAT_DHT_GET: DHT request message
+ * @BATADV_P_DAT_DHT_PUT: DHT store message
+ * @BATADV_P_DAT_CACHE_REPLY: ARP reply generated by DAT
+ */
+enum batadv_subtype {
+	BATADV_P_DATA			= 0x01,
+	BATADV_P_DAT_DHT_GET		= 0x02,
+	BATADV_P_DAT_DHT_PUT		= 0x03,
+	BATADV_P_DAT_CACHE_REPLY	= 0x04,
+};
+
+/* this file is included by batctl which needs these defines */
+#define BATADV_COMPAT_VERSION 15
+
+/**
+ * enum batadv_iv_flags - flags used in B.A.T.M.A.N. IV OGM packets
+ * @BATADV_NOT_BEST_NEXT_HOP: flag is set when ogm packet is forwarded and was
+ *     previously received from someone else than the best neighbor.
+ * @BATADV_PRIMARIES_FIRST_HOP: flag unused.
+ * @BATADV_DIRECTLINK: flag is for the first hop or if rebroadcasted from a
+ *     one hop neighbor on the interface where it was originally received.
+ */
+enum batadv_iv_flags {
+	BATADV_NOT_BEST_NEXT_HOP   = 1UL << 0,
+	BATADV_PRIMARIES_FIRST_HOP = 1UL << 1,
+	BATADV_DIRECTLINK          = 1UL << 2,
+};
+
+/**
+ * enum batadv_icmp_packettype - ICMP message types
+ * @BATADV_ECHO_REPLY: success reply to BATADV_ECHO_REQUEST
+ * @BATADV_DESTINATION_UNREACHABLE: failure when route to destination not found
+ * @BATADV_ECHO_REQUEST: request BATADV_ECHO_REPLY from destination
+ * @BATADV_TTL_EXCEEDED: error after BATADV_ECHO_REQUEST traversed too many hops
+ * @BATADV_PARAMETER_PROBLEM: return code for malformed messages
+ * @BATADV_TP: throughput meter packet
+ */
+enum batadv_icmp_packettype {
+	BATADV_ECHO_REPLY	       = 0,
+	BATADV_DESTINATION_UNREACHABLE = 3,
+	BATADV_ECHO_REQUEST	       = 8,
+	BATADV_TTL_EXCEEDED	       = 11,
+	BATADV_PARAMETER_PROBLEM       = 12,
+	BATADV_TP		       = 15,
+};
+
+/**
+ * enum batadv_mcast_flags - flags for multicast capabilities and settings
+ * @BATADV_MCAST_WANT_ALL_UNSNOOPABLES: we want all packets destined for
+ *  224.0.0.0/24 or ff02::1
+ * @BATADV_MCAST_WANT_ALL_IPV4: we want all IPv4 multicast packets
+ * @BATADV_MCAST_WANT_ALL_IPV6: we want all IPv6 multicast packets
+ */
+enum batadv_mcast_flags {
+	BATADV_MCAST_WANT_ALL_UNSNOOPABLES	= 1UL << 0,
+	BATADV_MCAST_WANT_ALL_IPV4		= 1UL << 1,
+	BATADV_MCAST_WANT_ALL_IPV6		= 1UL << 2,
+};
+
+/* tt data subtypes */
+#define BATADV_TT_DATA_TYPE_MASK 0x0F
+
+/**
+ * enum batadv_tt_data_flags - flags for tt data tvlv
+ * @BATADV_TT_OGM_DIFF: TT diff propagated through OGM
+ * @BATADV_TT_REQUEST: TT request message
+ * @BATADV_TT_RESPONSE: TT response message
+ * @BATADV_TT_FULL_TABLE: contains full table to replace existing table
+ */
+enum batadv_tt_data_flags {
+	BATADV_TT_OGM_DIFF   = 1UL << 0,
+	BATADV_TT_REQUEST    = 1UL << 1,
+	BATADV_TT_RESPONSE   = 1UL << 2,
+	BATADV_TT_FULL_TABLE = 1UL << 4,
+};
+
+/**
+ * enum batadv_vlan_flags - flags for the four MSB of any vlan ID field
+ * @BATADV_VLAN_HAS_TAG: whether the field contains a valid vlan tag or not
+ */
+enum batadv_vlan_flags {
+	BATADV_VLAN_HAS_TAG	= 1UL << 15,
+};
+
+/**
+ * enum batadv_bla_claimframe - claim frame types for the bridge loop avoidance
+ * @BATADV_CLAIM_TYPE_CLAIM: claim of a client mac address
+ * @BATADV_CLAIM_TYPE_UNCLAIM: unclaim of a client mac address
+ * @BATADV_CLAIM_TYPE_ANNOUNCE: announcement of backbone with current crc
+ * @BATADV_CLAIM_TYPE_REQUEST: request of full claim table
+ * @BATADV_CLAIM_TYPE_LOOPDETECT: mesh-traversing loop detect packet
+ */
+enum batadv_bla_claimframe {
+	BATADV_CLAIM_TYPE_CLAIM		= 0x00,
+	BATADV_CLAIM_TYPE_UNCLAIM	= 0x01,
+	BATADV_CLAIM_TYPE_ANNOUNCE	= 0x02,
+	BATADV_CLAIM_TYPE_REQUEST	= 0x03,
+	BATADV_CLAIM_TYPE_LOOPDETECT	= 0x04,
+};
+
+/**
+ * enum batadv_tvlv_type - tvlv type definitions
+ * @BATADV_TVLV_GW: gateway tvlv
+ * @BATADV_TVLV_DAT: distributed arp table tvlv
+ * @BATADV_TVLV_NC: network coding tvlv
+ * @BATADV_TVLV_TT: translation table tvlv
+ * @BATADV_TVLV_ROAM: roaming advertisement tvlv
+ * @BATADV_TVLV_MCAST: multicast capability tvlv
+ */
+enum batadv_tvlv_type {
+	BATADV_TVLV_GW		= 0x01,
+	BATADV_TVLV_DAT		= 0x02,
+	BATADV_TVLV_NC		= 0x03,
+	BATADV_TVLV_TT		= 0x04,
+	BATADV_TVLV_ROAM	= 0x05,
+	BATADV_TVLV_MCAST	= 0x06,
+};
+
+#pragma pack(2)
+/* the destination hardware field in the ARP frame is used to
+ * transport the claim type and the group id
+ */
+struct batadv_bla_claim_dst {
+	__u8   magic[3];	/* FF:43:05 */
+	__u8   type;		/* bla_claimframe */
+	__be16 group;		/* group id */
+};
+
+#pragma pack()
+
+/**
+ * struct batadv_ogm_packet - ogm (routing protocol) packet
+ * @packet_type: batman-adv packet type, part of the general header
+ * @version: batman-adv protocol version, part of the general header
+ * @ttl: time to live for this packet, part of the general header
+ * @flags: contains routing relevant flags - see enum batadv_iv_flags
+ * @seqno: sequence identification
+ * @orig: address of the source node
+ * @prev_sender: address of the previous sender
+ * @reserved: reserved byte for alignment
+ * @tq: transmission quality
+ * @tvlv_len: length of tvlv data following the ogm header
+ */
+struct batadv_ogm_packet {
+	__u8   packet_type;
+	__u8   version;
+	__u8   ttl;
+	__u8   flags;
+	__be32 seqno;
+	__u8   orig[ETH_ALEN];
+	__u8   prev_sender[ETH_ALEN];
+	__u8   reserved;
+	__u8   tq;
+	__be16 tvlv_len;
+	/* __packed is not needed as the struct size is divisible by 4,
+	 * and the largest data type in this struct has a size of 4.
+	 */
+};
+
+#define BATADV_OGM_HLEN sizeof(struct batadv_ogm_packet)
+
+/**
+ * struct batadv_ogm2_packet - ogm2 (routing protocol) packet
+ * @packet_type: batman-adv packet type, part of the general header
+ * @version: batman-adv protocol version, part of the general header
+ * @ttl: time to live for this packet, part of the general header
+ * @flags: reserved for routing relevant flags - currently always 0
+ * @seqno: sequence number
+ * @orig: originator mac address
+ * @tvlv_len: length of the appended tvlv buffer (in bytes)
+ * @throughput: the currently flooded path throughput
+ */
+struct batadv_ogm2_packet {
+	__u8   packet_type;
+	__u8   version;
+	__u8   ttl;
+	__u8   flags;
+	__be32 seqno;
+	__u8   orig[ETH_ALEN];
+	__be16 tvlv_len;
+	__be32 throughput;
+	/* __packed is not needed as the struct size is divisible by 4,
+	 * and the largest data type in this struct has a size of 4.
+	 */
+};
+
+#define BATADV_OGM2_HLEN sizeof(struct batadv_ogm2_packet)
+
+/**
+ * struct batadv_elp_packet - elp (neighbor discovery) packet
+ * @packet_type: batman-adv packet type, part of the general header
+ * @version: batman-adv protocol version, part of the general header
+ * @orig: originator mac address
+ * @seqno: sequence number
+ * @elp_interval: currently used ELP sending interval in ms
+ */
+struct batadv_elp_packet {
+	__u8   packet_type;
+	__u8   version;
+	__u8   orig[ETH_ALEN];
+	__be32 seqno;
+	__be32 elp_interval;
+};
+
+#define BATADV_ELP_HLEN sizeof(struct batadv_elp_packet)
+
+/**
+ * struct batadv_icmp_header - common members among all the ICMP packets
+ * @packet_type: batman-adv packet type, part of the general header
+ * @version: batman-adv protocol version, part of the general header
+ * @ttl: time to live for this packet, part of the general header
+ * @msg_type: ICMP packet type
+ * @dst: address of the destination node
+ * @orig: address of the source node
+ * @uid: local ICMP socket identifier
+ * @align: not used - useful for alignment purposes only
+ *
+ * This structure is used for ICMP packets parsing only and it is never sent
+ * over the wire. The alignment field at the end is there to ensure that
+ * members are padded the same way as they are in real packets.
+ */
+struct batadv_icmp_header {
+	__u8 packet_type;
+	__u8 version;
+	__u8 ttl;
+	__u8 msg_type; /* see ICMP message types above */
+	__u8 dst[ETH_ALEN];
+	__u8 orig[ETH_ALEN];
+	__u8 uid;
+	__u8 align[3];
+};
+
+/**
+ * struct batadv_icmp_packet - ICMP packet
+ * @packet_type: batman-adv packet type, part of the general header
+ * @version: batman-adv protocol version, part of the general header
+ * @ttl: time to live for this packet, part of the general header
+ * @msg_type: ICMP packet type
+ * @dst: address of the destination node
+ * @orig: address of the source node
+ * @uid: local ICMP socket identifier
+ * @reserved: not used - useful for alignment
+ * @seqno: ICMP sequence number
+ */
+struct batadv_icmp_packet {
+	__u8   packet_type;
+	__u8   version;
+	__u8   ttl;
+	__u8   msg_type; /* see ICMP message types above */
+	__u8   dst[ETH_ALEN];
+	__u8   orig[ETH_ALEN];
+	__u8   uid;
+	__u8   reserved;
+	__be16 seqno;
+};
+
+/**
+ * struct batadv_icmp_tp_packet - ICMP TP Meter packet
+ * @packet_type: batman-adv packet type, part of the general header
+ * @version: batman-adv protocol version, part of the general header
+ * @ttl: time to live for this packet, part of the general header
+ * @msg_type: ICMP packet type
+ * @dst: address of the destination node
+ * @orig: address of the source node
+ * @uid: local ICMP socket identifier
+ * @subtype: TP packet subtype (see batadv_icmp_tp_subtype)
+ * @session: TP session identifier
+ * @seqno: the TP sequence number
+ * @timestamp: time when the packet has been sent. This value is filled in a
+ *  TP_MSG and echoed back in the next TP_ACK so that the sender can compute the
+ *  RTT. Since it is read only by the host which wrote it, there is no need to
+ *  store it using network order
+ */
+struct batadv_icmp_tp_packet {
+	__u8   packet_type;
+	__u8   version;
+	__u8   ttl;
+	__u8   msg_type; /* see ICMP message types above */
+	__u8   dst[ETH_ALEN];
+	__u8   orig[ETH_ALEN];
+	__u8   uid;
+	__u8   subtype;
+	__u8   session[2];
+	__be32 seqno;
+	__be32 timestamp;
+};
+
+/**
+ * enum batadv_icmp_tp_subtype - ICMP TP Meter packet subtypes
+ * @BATADV_TP_MSG: Msg from sender to receiver
+ * @BATADV_TP_ACK: acknowledgment from receiver to sender
+ */
+enum batadv_icmp_tp_subtype {
+	BATADV_TP_MSG	= 0,
+	BATADV_TP_ACK,
+};
+
+#define BATADV_RR_LEN 16
+
+/**
+ * struct batadv_icmp_packet_rr - ICMP RouteRecord packet
+ * @packet_type: batman-adv packet type, part of the general header
+ * @version: batman-adv protocol version, part of the general header
+ * @ttl: time to live for this packet, part of the general header
+ * @msg_type: ICMP packet type
+ * @dst: address of the destination node
+ * @orig: address of the source node
+ * @uid: local ICMP socket identifier
+ * @rr_cur: number of entries in the rr array
+ * @seqno: ICMP sequence number
+ * @rr: route record array
+ */
+struct batadv_icmp_packet_rr {
+	__u8   packet_type;
+	__u8   version;
+	__u8   ttl;
+	__u8   msg_type; /* see ICMP message types above */
+	__u8   dst[ETH_ALEN];
+	__u8   orig[ETH_ALEN];
+	__u8   uid;
+	__u8   rr_cur;
+	__be16 seqno;
+	__u8   rr[BATADV_RR_LEN][ETH_ALEN];
+};
+
+#define BATADV_ICMP_MAX_PACKET_SIZE	sizeof(struct batadv_icmp_packet_rr)
+
+/* All packet headers in front of an ethernet header have to be completely
+ * divisible by 2 but not by 4 to make the payload after the ethernet
+ * header again 4 bytes boundary aligned.
+ *
+ * A packing of 2 is necessary to avoid extra padding at the end of the struct
+ * caused by a structure member which is larger than two bytes. Otherwise
+ * the structure would not fulfill the previously mentioned rule to avoid the
+ * misalignment of the payload after the ethernet header. It may also lead to
+ * leakage of information when the padding is not initialized before sending.
+ */
+#pragma pack(2)
+
+/**
+ * struct batadv_unicast_packet - unicast packet for network payload
+ * @packet_type: batman-adv packet type, part of the general header
+ * @version: batman-adv protocol version, part of the general header
+ * @ttl: time to live for this packet, part of the general header
+ * @ttvn: translation table version number
+ * @dest: originator destination of the unicast packet
+ */
+struct batadv_unicast_packet {
+	__u8 packet_type;
+	__u8 version;
+	__u8 ttl;
+	__u8 ttvn; /* destination translation table version number */
+	__u8 dest[ETH_ALEN];
+	/* "4 bytes boundary + 2 bytes" long to make the payload after the
+	 * following ethernet header again 4 bytes boundary aligned
+	 */
+};
+
+/**
+ * struct batadv_unicast_4addr_packet - extended unicast packet
+ * @u: common unicast packet header
+ * @src: address of the source
+ * @subtype: packet subtype
+ * @reserved: reserved byte for alignment
+ */
+struct batadv_unicast_4addr_packet {
+	struct batadv_unicast_packet u;
+	__u8 src[ETH_ALEN];
+	__u8 subtype;
+	__u8 reserved;
+	/* "4 bytes boundary + 2 bytes" long to make the payload after the
+	 * following ethernet header again 4 bytes boundary aligned
+	 */
+};
+
+/**
+ * struct batadv_frag_packet - fragmented packet
+ * @packet_type: batman-adv packet type, part of the general header
+ * @version: batman-adv protocol version, part of the general header
+ * @ttl: time to live for this packet, part of the general header
+ * @dest: final destination used when routing fragments
+ * @orig: originator of the fragment used when merging the packet
+ * @no: fragment number within this sequence
+ * @priority: priority of frame, from ToS IP precedence or 802.1p
+ * @reserved: reserved byte for alignment
+ * @seqno: sequence identification
+ * @total_size: size of the merged packet
+ */
+struct batadv_frag_packet {
+	__u8   packet_type;
+	__u8   version;  /* batman version field */
+	__u8   ttl;
+#if defined(__BIG_ENDIAN_BITFIELD)
+	__u8   no:4;
+	__u8   priority:3;
+	__u8   reserved:1;
+#elif defined(__LITTLE_ENDIAN_BITFIELD)
+	__u8   reserved:1;
+	__u8   priority:3;
+	__u8   no:4;
+#else
+#error "unknown bitfield endianness"
+#endif
+	__u8   dest[ETH_ALEN];
+	__u8   orig[ETH_ALEN];
+	__be16 seqno;
+	__be16 total_size;
+};
+
+/**
+ * struct batadv_bcast_packet - broadcast packet for network payload
+ * @packet_type: batman-adv packet type, part of the general header
+ * @version: batman-adv protocol version, part of the general header
+ * @ttl: time to live for this packet, part of the general header
+ * @reserved: reserved byte for alignment
+ * @seqno: sequence identification
+ * @orig: originator of the broadcast packet
+ */
+struct batadv_bcast_packet {
+	__u8   packet_type;
+	__u8   version;  /* batman version field */
+	__u8   ttl;
+	__u8   reserved;
+	__be32 seqno;
+	__u8   orig[ETH_ALEN];
+	/* "4 bytes boundary + 2 bytes" long to make the payload after the
+	 * following ethernet header again 4 bytes boundary aligned
+	 */
+};
+
+/**
+ * struct batadv_coded_packet - network coded packet
+ * @packet_type: batman-adv packet type, part of the general header
+ * @version: batman-adv protocol version, part of the general header
+ * @ttl: time to live for this packet, part of the general header
+ * @first_source: original source of first included packet
+ * @first_orig_dest: original destination of first included packet
+ * @first_crc: checksum of first included packet
+ * @first_ttvn: tt-version number of first included packet
+ * @second_ttl: ttl of second packet
+ * @second_dest: second receiver of this coded packet
+ * @second_source: original source of second included packet
+ * @second_orig_dest: original destination of second included packet
+ * @second_crc: checksum of second included packet
+ * @second_ttvn: tt version number of second included packet
+ * @coded_len: length of network coded part of the payload
+ */
+struct batadv_coded_packet {
+	__u8   packet_type;
+	__u8   version;  /* batman version field */
+	__u8   ttl;
+	__u8   first_ttvn;
+	/* __u8 first_dest[ETH_ALEN]; - saved in mac header destination */
+	__u8   first_source[ETH_ALEN];
+	__u8   first_orig_dest[ETH_ALEN];
+	__be32 first_crc;
+	__u8   second_ttl;
+	__u8   second_ttvn;
+	__u8   second_dest[ETH_ALEN];
+	__u8   second_source[ETH_ALEN];
+	__u8   second_orig_dest[ETH_ALEN];
+	__be32 second_crc;
+	__be16 coded_len;
+};
+
+#pragma pack()
+
+/**
+ * struct batadv_unicast_tvlv_packet - generic unicast packet with tvlv payload
+ * @packet_type: batman-adv packet type, part of the general header
+ * @version: batman-adv protocol version, part of the general header
+ * @ttl: time to live for this packet, part of the general header
+ * @reserved: reserved field (for packet alignment)
+ * @src: address of the source
+ * @dst: address of the destination
+ * @tvlv_len: length of tvlv data following the unicast tvlv header
+ * @align: 2 bytes to align the header to a 4 byte boundary
+ */
+struct batadv_unicast_tvlv_packet {
+	__u8   packet_type;
+	__u8   version;  /* batman version field */
+	__u8   ttl;
+	__u8   reserved;
+	__u8   dst[ETH_ALEN];
+	__u8   src[ETH_ALEN];
+	__be16 tvlv_len;
+	__u16  align;
+};
+
+/**
+ * struct batadv_tvlv_hdr - base tvlv header struct
+ * @type: tvlv container type (see batadv_tvlv_type)
+ * @version: tvlv container version
+ * @len: tvlv container length
+ */
+struct batadv_tvlv_hdr {
+	__u8   type;
+	__u8   version;
+	__be16 len;
+};
+
+/**
+ * struct batadv_tvlv_gateway_data - gateway data propagated through gw tvlv
+ *  container
+ * @bandwidth_down: advertised uplink download bandwidth
+ * @bandwidth_up: advertised uplink upload bandwidth
+ */
+struct batadv_tvlv_gateway_data {
+	__be32 bandwidth_down;
+	__be32 bandwidth_up;
+};
+
+/**
+ * struct batadv_tvlv_tt_data - tt data propagated through the tt tvlv container
+ * @flags: translation table flags (see batadv_tt_data_flags)
+ * @ttvn: translation table version number
+ * @num_vlan: number of announced VLANs. In the TVLV this struct is followed by
+ *  one batadv_tvlv_tt_vlan_data object per announced vlan
+ */
+struct batadv_tvlv_tt_data {
+	__u8   flags;
+	__u8   ttvn;
+	__be16 num_vlan;
+};
+
+/**
+ * struct batadv_tvlv_tt_vlan_data - vlan specific tt data propagated through
+ *  the tt tvlv container
+ * @crc: crc32 checksum of the entries belonging to this vlan
+ * @vid: vlan identifier
+ * @reserved: unused, useful for alignment purposes
+ */
+struct batadv_tvlv_tt_vlan_data {
+	__be32 crc;
+	__be16 vid;
+	__u16  reserved;
+};
+
+/**
+ * struct batadv_tvlv_tt_change - translation table diff data
+ * @flags: status indicators concerning the non-mesh client (see
+ *  batadv_tt_client_flags)
+ * @reserved: reserved field - useful for alignment purposes only
+ * @addr: mac address of non-mesh client that triggered this tt change
+ * @vid: VLAN identifier
+ */
+struct batadv_tvlv_tt_change {
+	__u8   flags;
+	__u8   reserved[3];
+	__u8   addr[ETH_ALEN];
+	__be16 vid;
+};
+
+/**
+ * struct batadv_tvlv_roam_adv - roaming advertisement
+ * @client: mac address of roaming client
+ * @vid: VLAN identifier
+ */
+struct batadv_tvlv_roam_adv {
+	__u8   client[ETH_ALEN];
+	__be16 vid;
+};
+
+/**
+ * struct batadv_tvlv_mcast_data - payload of a multicast tvlv
+ * @flags: multicast flags announced by the orig node
+ * @reserved: reserved field
+ */
+struct batadv_tvlv_mcast_data {
+	__u8 flags;
+	__u8 reserved[3];
+};
+
+#endif /* _UAPI_LINUX_BATADV_PACKET_H_ */
diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c
index 84c36430c25a..79e326383726 100644
--- a/net/batman-adv/bat_iv_ogm.c
+++ b/net/batman-adv/bat_iv_ogm.c
@@ -52,6 +52,7 @@
 #include <linux/workqueue.h>
 #include <net/genetlink.h>
 #include <net/netlink.h>
+#include <uapi/linux/batadv_packet.h>
 #include <uapi/linux/batman_adv.h>
 
 #include "bat_algo.h"
@@ -63,7 +64,6 @@
 #include "netlink.h"
 #include "network-coding.h"
 #include "originator.h"
-#include "packet.h"
 #include "routing.h"
 #include "send.h"
 #include "translation-table.h"
diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c
index 78ddf3afa83a..27e165ac9302 100644
--- a/net/batman-adv/bat_v.c
+++ b/net/batman-adv/bat_v.c
@@ -37,6 +37,7 @@
 #include <linux/workqueue.h>
 #include <net/genetlink.h>
 #include <net/netlink.h>
+#include <uapi/linux/batadv_packet.h>
 #include <uapi/linux/batman_adv.h>
 
 #include "bat_algo.h"
@@ -49,7 +50,6 @@
 #include "log.h"
 #include "netlink.h"
 #include "originator.h"
-#include "packet.h"
 
 struct sk_buff;
 
diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c
index 59ae96cef596..a83478c46597 100644
--- a/net/batman-adv/bat_v_elp.c
+++ b/net/batman-adv/bat_v_elp.c
@@ -42,13 +42,13 @@
 #include <linux/types.h>
 #include <linux/workqueue.h>
 #include <net/cfg80211.h>
+#include <uapi/linux/batadv_packet.h>
 
 #include "bat_algo.h"
 #include "bat_v_ogm.h"
 #include "hard-interface.h"
 #include "log.h"
 #include "originator.h"
-#include "packet.h"
 #include "routing.h"
 #include "send.h"
 
diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c
index e415974c540d..ba59b77c605d 100644
--- a/net/batman-adv/bat_v_ogm.c
+++ b/net/batman-adv/bat_v_ogm.c
@@ -39,13 +39,13 @@
 #include <linux/string.h>
 #include <linux/types.h>
 #include <linux/workqueue.h>
+#include <uapi/linux/batadv_packet.h>
 
 #include "bat_algo.h"
 #include "hard-interface.h"
 #include "hash.h"
 #include "log.h"
 #include "originator.h"
-#include "packet.h"
 #include "routing.h"
 #include "send.h"
 #include "translation-table.h"
diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c
index e647450e5d0f..fad47853ad3c 100644
--- a/net/batman-adv/bridge_loop_avoidance.c
+++ b/net/batman-adv/bridge_loop_avoidance.c
@@ -50,6 +50,7 @@
 #include <net/genetlink.h>
 #include <net/netlink.h>
 #include <net/sock.h>
+#include <uapi/linux/batadv_packet.h>
 #include <uapi/linux/batman_adv.h>
 
 #include "hard-interface.h"
@@ -57,7 +58,6 @@
 #include "log.h"
 #include "netlink.h"
 #include "originator.h"
-#include "packet.h"
 #include "soft-interface.h"
 #include "sysfs.h"
 #include "translation-table.h"
diff --git a/net/batman-adv/distributed-arp-table.h b/net/batman-adv/distributed-arp-table.h
index 3d47bedaf661..12897eb46268 100644
--- a/net/batman-adv/distributed-arp-table.h
+++ b/net/batman-adv/distributed-arp-table.h
@@ -24,9 +24,9 @@
 #include <linux/compiler.h>
 #include <linux/netdevice.h>
 #include <linux/types.h>
+#include <uapi/linux/batadv_packet.h>
 
 #include "originator.h"
-#include "packet.h"
 
 struct seq_file;
 struct sk_buff;
diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c
index 4979350af9a7..22dde42fd80e 100644
--- a/net/batman-adv/fragmentation.c
+++ b/net/batman-adv/fragmentation.c
@@ -33,10 +33,10 @@
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/string.h>
+#include <uapi/linux/batadv_packet.h>
 
 #include "hard-interface.h"
 #include "originator.h"
-#include "packet.h"
 #include "routing.h"
 #include "send.h"
 #include "soft-interface.h"
diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c
index 2488e25d0eef..37fe9a644f22 100644
--- a/net/batman-adv/gateway_client.c
+++ b/net/batman-adv/gateway_client.c
@@ -43,6 +43,7 @@
 #include <linux/stddef.h>
 #include <linux/udp.h>
 #include <net/sock.h>
+#include <uapi/linux/batadv_packet.h>
 #include <uapi/linux/batman_adv.h>
 
 #include "gateway_common.h"
@@ -50,7 +51,6 @@
 #include "log.h"
 #include "netlink.h"
 #include "originator.h"
-#include "packet.h"
 #include "routing.h"
 #include "soft-interface.h"
 #include "sysfs.h"
diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c
index 83bfeecf661c..b3e156af2256 100644
--- a/net/batman-adv/gateway_common.c
+++ b/net/batman-adv/gateway_common.c
@@ -27,10 +27,10 @@
 #include <linux/netdevice.h>
 #include <linux/stddef.h>
 #include <linux/string.h>
+#include <uapi/linux/batadv_packet.h>
 
 #include "gateway_client.h"
 #include "log.h"
-#include "packet.h"
 #include "tvlv.h"
 
 /**
diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c
index 13d04dba0b3a..5f186bff284a 100644
--- a/net/batman-adv/hard-interface.c
+++ b/net/batman-adv/hard-interface.c
@@ -38,6 +38,7 @@
 #include <linux/spinlock.h>
 #include <net/net_namespace.h>
 #include <net/rtnetlink.h>
+#include <uapi/linux/batadv_packet.h>
 
 #include "bat_v.h"
 #include "bridge_loop_avoidance.h"
@@ -46,7 +47,6 @@
 #include "gateway_client.h"
 #include "log.h"
 #include "originator.h"
-#include "packet.h"
 #include "send.h"
 #include "soft-interface.h"
 #include "sysfs.h"
diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c
index f2ef75b7fa73..8041cf106c37 100644
--- a/net/batman-adv/icmp_socket.c
+++ b/net/batman-adv/icmp_socket.c
@@ -44,11 +44,11 @@
 #include <linux/string.h>
 #include <linux/uaccess.h>
 #include <linux/wait.h>
+#include <uapi/linux/batadv_packet.h>
 
 #include "hard-interface.h"
 #include "log.h"
 #include "originator.h"
-#include "packet.h"
 #include "send.h"
 
 static struct batadv_socket_client *batadv_socket_client_hash[256];
diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c
index 8bee4279d579..d31c8266e244 100644
--- a/net/batman-adv/main.c
+++ b/net/batman-adv/main.c
@@ -46,6 +46,7 @@
 #include <linux/workqueue.h>
 #include <net/dsfield.h>
 #include <net/rtnetlink.h>
+#include <uapi/linux/batadv_packet.h>
 #include <uapi/linux/batman_adv.h>
 
 #include "bat_algo.h"
@@ -63,7 +64,6 @@
 #include "netlink.h"
 #include "network-coding.h"
 #include "originator.h"
-#include "packet.h"
 #include "routing.h"
 #include "send.h"
 #include "soft-interface.h"
diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h
index d5484ac381d3..f7ba3f96d8f3 100644
--- a/net/batman-adv/main.h
+++ b/net/batman-adv/main.h
@@ -223,8 +223,8 @@ enum batadv_uev_type {
 #include <linux/jiffies.h>
 #include <linux/percpu.h>
 #include <linux/types.h>
+#include <uapi/linux/batadv_packet.h>
 
-#include "packet.h"
 #include "types.h"
 
 struct net_device;
diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c
index 8a503c526b90..cbdeb47ec3f6 100644
--- a/net/batman-adv/multicast.c
+++ b/net/batman-adv/multicast.c
@@ -55,11 +55,11 @@
 #include <net/if_inet6.h>
 #include <net/ip.h>
 #include <net/ipv6.h>
+#include <uapi/linux/batadv_packet.h>
 
 #include "hard-interface.h"
 #include "hash.h"
 #include "log.h"
-#include "packet.h"
 #include "translation-table.h"
 #include "tvlv.h"
 
diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c
index 103d4bdcdbdb..a823d3899bad 100644
--- a/net/batman-adv/netlink.c
+++ b/net/batman-adv/netlink.c
@@ -40,6 +40,7 @@
 #include <net/genetlink.h>
 #include <net/netlink.h>
 #include <net/sock.h>
+#include <uapi/linux/batadv_packet.h>
 #include <uapi/linux/batman_adv.h>
 
 #include "bat_algo.h"
@@ -47,7 +48,6 @@
 #include "gateway_client.h"
 #include "hard-interface.h"
 #include "originator.h"
-#include "packet.h"
 #include "soft-interface.h"
 #include "tp_meter.h"
 #include "translation-table.h"
diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c
index 3758be7fd881..b48116bb24ef 100644
--- a/net/batman-adv/network-coding.c
+++ b/net/batman-adv/network-coding.c
@@ -49,12 +49,12 @@
 #include <linux/stddef.h>
 #include <linux/string.h>
 #include <linux/workqueue.h>
+#include <uapi/linux/batadv_packet.h>
 
 #include "hard-interface.h"
 #include "hash.h"
 #include "log.h"
 #include "originator.h"
-#include "packet.h"
 #include "routing.h"
 #include "send.h"
 #include "tvlv.h"
diff --git a/net/batman-adv/packet.h b/net/batman-adv/packet.h
deleted file mode 100644
index 3b2d2db993aa..000000000000
--- a/net/batman-adv/packet.h
+++ /dev/null
@@ -1,644 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
- *
- * Marek Lindner, Simon Wunderlich
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
- */
-
-#ifndef _NET_BATMAN_ADV_PACKET_H_
-#define _NET_BATMAN_ADV_PACKET_H_
-
-#include <asm/byteorder.h>
-#include <linux/if_ether.h>
-#include <linux/types.h>
-
-/**
- * batadv_tp_is_error() - Check throughput meter return code for error
- * @n: throughput meter return code
- *
- * Return: 0 when not error was detected, != 0 otherwise
- */
-#define batadv_tp_is_error(n) ((__u8)(n) > 127 ? 1 : 0)
-
-/**
- * enum batadv_packettype - types for batman-adv encapsulated packets
- * @BATADV_IV_OGM: originator messages for B.A.T.M.A.N. IV
- * @BATADV_BCAST: broadcast packets carrying broadcast payload
- * @BATADV_CODED: network coded packets
- * @BATADV_ELP: echo location packets for B.A.T.M.A.N. V
- * @BATADV_OGM2: originator messages for B.A.T.M.A.N. V
- *
- * @BATADV_UNICAST: unicast packets carrying unicast payload traffic
- * @BATADV_UNICAST_FRAG: unicast packets carrying a fragment of the original
- *     payload packet
- * @BATADV_UNICAST_4ADDR: unicast packet including the originator address of
- *     the sender
- * @BATADV_ICMP: unicast packet like IP ICMP used for ping or traceroute
- * @BATADV_UNICAST_TVLV: unicast packet carrying TVLV containers
- */
-enum batadv_packettype {
-	/* 0x00 - 0x3f: local packets or special rules for handling */
-	BATADV_IV_OGM           = 0x00,
-	BATADV_BCAST            = 0x01,
-	BATADV_CODED            = 0x02,
-	BATADV_ELP		= 0x03,
-	BATADV_OGM2		= 0x04,
-	/* 0x40 - 0x7f: unicast */
-#define BATADV_UNICAST_MIN     0x40
-	BATADV_UNICAST          = 0x40,
-	BATADV_UNICAST_FRAG     = 0x41,
-	BATADV_UNICAST_4ADDR    = 0x42,
-	BATADV_ICMP             = 0x43,
-	BATADV_UNICAST_TVLV     = 0x44,
-#define BATADV_UNICAST_MAX     0x7f
-	/* 0x80 - 0xff: reserved */
-};
-
-/**
- * enum batadv_subtype - packet subtype for unicast4addr
- * @BATADV_P_DATA: user payload
- * @BATADV_P_DAT_DHT_GET: DHT request message
- * @BATADV_P_DAT_DHT_PUT: DHT store message
- * @BATADV_P_DAT_CACHE_REPLY: ARP reply generated by DAT
- */
-enum batadv_subtype {
-	BATADV_P_DATA			= 0x01,
-	BATADV_P_DAT_DHT_GET		= 0x02,
-	BATADV_P_DAT_DHT_PUT		= 0x03,
-	BATADV_P_DAT_CACHE_REPLY	= 0x04,
-};
-
-/* this file is included by batctl which needs these defines */
-#define BATADV_COMPAT_VERSION 15
-
-/**
- * enum batadv_iv_flags - flags used in B.A.T.M.A.N. IV OGM packets
- * @BATADV_NOT_BEST_NEXT_HOP: flag is set when ogm packet is forwarded and was
- *     previously received from someone else than the best neighbor.
- * @BATADV_PRIMARIES_FIRST_HOP: flag unused.
- * @BATADV_DIRECTLINK: flag is for the first hop or if rebroadcasted from a
- *     one hop neighbor on the interface where it was originally received.
- */
-enum batadv_iv_flags {
-	BATADV_NOT_BEST_NEXT_HOP   = 1UL << 0,
-	BATADV_PRIMARIES_FIRST_HOP = 1UL << 1,
-	BATADV_DIRECTLINK          = 1UL << 2,
-};
-
-/**
- * enum batadv_icmp_packettype - ICMP message types
- * @BATADV_ECHO_REPLY: success reply to BATADV_ECHO_REQUEST
- * @BATADV_DESTINATION_UNREACHABLE: failure when route to destination not found
- * @BATADV_ECHO_REQUEST: request BATADV_ECHO_REPLY from destination
- * @BATADV_TTL_EXCEEDED: error after BATADV_ECHO_REQUEST traversed too many hops
- * @BATADV_PARAMETER_PROBLEM: return code for malformed messages
- * @BATADV_TP: throughput meter packet
- */
-enum batadv_icmp_packettype {
-	BATADV_ECHO_REPLY	       = 0,
-	BATADV_DESTINATION_UNREACHABLE = 3,
-	BATADV_ECHO_REQUEST	       = 8,
-	BATADV_TTL_EXCEEDED	       = 11,
-	BATADV_PARAMETER_PROBLEM       = 12,
-	BATADV_TP		       = 15,
-};
-
-/**
- * enum batadv_mcast_flags - flags for multicast capabilities and settings
- * @BATADV_MCAST_WANT_ALL_UNSNOOPABLES: we want all packets destined for
- *  224.0.0.0/24 or ff02::1
- * @BATADV_MCAST_WANT_ALL_IPV4: we want all IPv4 multicast packets
- * @BATADV_MCAST_WANT_ALL_IPV6: we want all IPv6 multicast packets
- */
-enum batadv_mcast_flags {
-	BATADV_MCAST_WANT_ALL_UNSNOOPABLES	= 1UL << 0,
-	BATADV_MCAST_WANT_ALL_IPV4		= 1UL << 1,
-	BATADV_MCAST_WANT_ALL_IPV6		= 1UL << 2,
-};
-
-/* tt data subtypes */
-#define BATADV_TT_DATA_TYPE_MASK 0x0F
-
-/**
- * enum batadv_tt_data_flags - flags for tt data tvlv
- * @BATADV_TT_OGM_DIFF: TT diff propagated through OGM
- * @BATADV_TT_REQUEST: TT request message
- * @BATADV_TT_RESPONSE: TT response message
- * @BATADV_TT_FULL_TABLE: contains full table to replace existing table
- */
-enum batadv_tt_data_flags {
-	BATADV_TT_OGM_DIFF   = 1UL << 0,
-	BATADV_TT_REQUEST    = 1UL << 1,
-	BATADV_TT_RESPONSE   = 1UL << 2,
-	BATADV_TT_FULL_TABLE = 1UL << 4,
-};
-
-/**
- * enum batadv_vlan_flags - flags for the four MSB of any vlan ID field
- * @BATADV_VLAN_HAS_TAG: whether the field contains a valid vlan tag or not
- */
-enum batadv_vlan_flags {
-	BATADV_VLAN_HAS_TAG	= 1UL << 15,
-};
-
-/**
- * enum batadv_bla_claimframe - claim frame types for the bridge loop avoidance
- * @BATADV_CLAIM_TYPE_CLAIM: claim of a client mac address
- * @BATADV_CLAIM_TYPE_UNCLAIM: unclaim of a client mac address
- * @BATADV_CLAIM_TYPE_ANNOUNCE: announcement of backbone with current crc
- * @BATADV_CLAIM_TYPE_REQUEST: request of full claim table
- * @BATADV_CLAIM_TYPE_LOOPDETECT: mesh-traversing loop detect packet
- */
-enum batadv_bla_claimframe {
-	BATADV_CLAIM_TYPE_CLAIM		= 0x00,
-	BATADV_CLAIM_TYPE_UNCLAIM	= 0x01,
-	BATADV_CLAIM_TYPE_ANNOUNCE	= 0x02,
-	BATADV_CLAIM_TYPE_REQUEST	= 0x03,
-	BATADV_CLAIM_TYPE_LOOPDETECT	= 0x04,
-};
-
-/**
- * enum batadv_tvlv_type - tvlv type definitions
- * @BATADV_TVLV_GW: gateway tvlv
- * @BATADV_TVLV_DAT: distributed arp table tvlv
- * @BATADV_TVLV_NC: network coding tvlv
- * @BATADV_TVLV_TT: translation table tvlv
- * @BATADV_TVLV_ROAM: roaming advertisement tvlv
- * @BATADV_TVLV_MCAST: multicast capability tvlv
- */
-enum batadv_tvlv_type {
-	BATADV_TVLV_GW		= 0x01,
-	BATADV_TVLV_DAT		= 0x02,
-	BATADV_TVLV_NC		= 0x03,
-	BATADV_TVLV_TT		= 0x04,
-	BATADV_TVLV_ROAM	= 0x05,
-	BATADV_TVLV_MCAST	= 0x06,
-};
-
-#pragma pack(2)
-/* the destination hardware field in the ARP frame is used to
- * transport the claim type and the group id
- */
-struct batadv_bla_claim_dst {
-	__u8   magic[3];	/* FF:43:05 */
-	__u8   type;		/* bla_claimframe */
-	__be16 group;		/* group id */
-};
-
-#pragma pack()
-
-/**
- * struct batadv_ogm_packet - ogm (routing protocol) packet
- * @packet_type: batman-adv packet type, part of the general header
- * @version: batman-adv protocol version, part of the genereal header
- * @ttl: time to live for this packet, part of the genereal header
- * @flags: contains routing relevant flags - see enum batadv_iv_flags
- * @seqno: sequence identification
- * @orig: address of the source node
- * @prev_sender: address of the previous sender
- * @reserved: reserved byte for alignment
- * @tq: transmission quality
- * @tvlv_len: length of tvlv data following the ogm header
- */
-struct batadv_ogm_packet {
-	__u8   packet_type;
-	__u8   version;
-	__u8   ttl;
-	__u8   flags;
-	__be32 seqno;
-	__u8   orig[ETH_ALEN];
-	__u8   prev_sender[ETH_ALEN];
-	__u8   reserved;
-	__u8   tq;
-	__be16 tvlv_len;
-	/* __packed is not needed as the struct size is divisible by 4,
-	 * and the largest data type in this struct has a size of 4.
-	 */
-};
-
-#define BATADV_OGM_HLEN sizeof(struct batadv_ogm_packet)
-
-/**
- * struct batadv_ogm2_packet - ogm2 (routing protocol) packet
- * @packet_type: batman-adv packet type, part of the general header
- * @version: batman-adv protocol version, part of the general header
- * @ttl: time to live for this packet, part of the general header
- * @flags: reseved for routing relevant flags - currently always 0
- * @seqno: sequence number
- * @orig: originator mac address
- * @tvlv_len: length of the appended tvlv buffer (in bytes)
- * @throughput: the currently flooded path throughput
- */
-struct batadv_ogm2_packet {
-	__u8   packet_type;
-	__u8   version;
-	__u8   ttl;
-	__u8   flags;
-	__be32 seqno;
-	__u8   orig[ETH_ALEN];
-	__be16 tvlv_len;
-	__be32 throughput;
-	/* __packed is not needed as the struct size is divisible by 4,
-	 * and the largest data type in this struct has a size of 4.
-	 */
-};
-
-#define BATADV_OGM2_HLEN sizeof(struct batadv_ogm2_packet)
-
-/**
- * struct batadv_elp_packet - elp (neighbor discovery) packet
- * @packet_type: batman-adv packet type, part of the general header
- * @version: batman-adv protocol version, part of the genereal header
- * @orig: originator mac address
- * @seqno: sequence number
- * @elp_interval: currently used ELP sending interval in ms
- */
-struct batadv_elp_packet {
-	__u8   packet_type;
-	__u8   version;
-	__u8   orig[ETH_ALEN];
-	__be32 seqno;
-	__be32 elp_interval;
-};
-
-#define BATADV_ELP_HLEN sizeof(struct batadv_elp_packet)
-
-/**
- * struct batadv_icmp_header - common members among all the ICMP packets
- * @packet_type: batman-adv packet type, part of the general header
- * @version: batman-adv protocol version, part of the genereal header
- * @ttl: time to live for this packet, part of the genereal header
- * @msg_type: ICMP packet type
- * @dst: address of the destination node
- * @orig: address of the source node
- * @uid: local ICMP socket identifier
- * @align: not used - useful for alignment purposes only
- *
- * This structure is used for ICMP packets parsing only and it is never sent
- * over the wire. The alignment field at the end is there to ensure that
- * members are padded the same way as they are in real packets.
- */
-struct batadv_icmp_header {
-	__u8 packet_type;
-	__u8 version;
-	__u8 ttl;
-	__u8 msg_type; /* see ICMP message types above */
-	__u8 dst[ETH_ALEN];
-	__u8 orig[ETH_ALEN];
-	__u8 uid;
-	__u8 align[3];
-};
-
-/**
- * struct batadv_icmp_packet - ICMP packet
- * @packet_type: batman-adv packet type, part of the general header
- * @version: batman-adv protocol version, part of the genereal header
- * @ttl: time to live for this packet, part of the genereal header
- * @msg_type: ICMP packet type
- * @dst: address of the destination node
- * @orig: address of the source node
- * @uid: local ICMP socket identifier
- * @reserved: not used - useful for alignment
- * @seqno: ICMP sequence number
- */
-struct batadv_icmp_packet {
-	__u8   packet_type;
-	__u8   version;
-	__u8   ttl;
-	__u8   msg_type; /* see ICMP message types above */
-	__u8   dst[ETH_ALEN];
-	__u8   orig[ETH_ALEN];
-	__u8   uid;
-	__u8   reserved;
-	__be16 seqno;
-};
-
-/**
- * struct batadv_icmp_tp_packet - ICMP TP Meter packet
- * @packet_type: batman-adv packet type, part of the general header
- * @version: batman-adv protocol version, part of the genereal header
- * @ttl: time to live for this packet, part of the genereal header
- * @msg_type: ICMP packet type
- * @dst: address of the destination node
- * @orig: address of the source node
- * @uid: local ICMP socket identifier
- * @subtype: TP packet subtype (see batadv_icmp_tp_subtype)
- * @session: TP session identifier
- * @seqno: the TP sequence number
- * @timestamp: time when the packet has been sent. This value is filled in a
- *  TP_MSG and echoed back in the next TP_ACK so that the sender can compute the
- *  RTT. Since it is read only by the host which wrote it, there is no need to
- *  store it using network order
- */
-struct batadv_icmp_tp_packet {
-	__u8   packet_type;
-	__u8   version;
-	__u8   ttl;
-	__u8   msg_type; /* see ICMP message types above */
-	__u8   dst[ETH_ALEN];
-	__u8   orig[ETH_ALEN];
-	__u8   uid;
-	__u8   subtype;
-	__u8   session[2];
-	__be32 seqno;
-	__be32 timestamp;
-};
-
-/**
- * enum batadv_icmp_tp_subtype - ICMP TP Meter packet subtypes
- * @BATADV_TP_MSG: Msg from sender to receiver
- * @BATADV_TP_ACK: acknowledgment from receiver to sender
- */
-enum batadv_icmp_tp_subtype {
-	BATADV_TP_MSG	= 0,
-	BATADV_TP_ACK,
-};
-
-#define BATADV_RR_LEN 16
-
-/**
- * struct batadv_icmp_packet_rr - ICMP RouteRecord packet
- * @packet_type: batman-adv packet type, part of the general header
- * @version: batman-adv protocol version, part of the genereal header
- * @ttl: time to live for this packet, part of the genereal header
- * @msg_type: ICMP packet type
- * @dst: address of the destination node
- * @orig: address of the source node
- * @uid: local ICMP socket identifier
- * @rr_cur: number of entries the rr array
- * @seqno: ICMP sequence number
- * @rr: route record array
- */
-struct batadv_icmp_packet_rr {
-	__u8   packet_type;
-	__u8   version;
-	__u8   ttl;
-	__u8   msg_type; /* see ICMP message types above */
-	__u8   dst[ETH_ALEN];
-	__u8   orig[ETH_ALEN];
-	__u8   uid;
-	__u8   rr_cur;
-	__be16 seqno;
-	__u8   rr[BATADV_RR_LEN][ETH_ALEN];
-};
-
-#define BATADV_ICMP_MAX_PACKET_SIZE	sizeof(struct batadv_icmp_packet_rr)
-
-/* All packet headers in front of an ethernet header have to be completely
- * divisible by 2 but not by 4 to make the payload after the ethernet
- * header again 4 bytes boundary aligned.
- *
- * A packing of 2 is necessary to avoid extra padding at the end of the struct
- * caused by a structure member which is larger than two bytes. Otherwise
- * the structure would not fulfill the previously mentioned rule to avoid the
- * misalignment of the payload after the ethernet header. It may also lead to
- * leakage of information when the padding it not initialized before sending.
- */
-#pragma pack(2)
-
-/**
- * struct batadv_unicast_packet - unicast packet for network payload
- * @packet_type: batman-adv packet type, part of the general header
- * @version: batman-adv protocol version, part of the genereal header
- * @ttl: time to live for this packet, part of the genereal header
- * @ttvn: translation table version number
- * @dest: originator destination of the unicast packet
- */
-struct batadv_unicast_packet {
-	__u8 packet_type;
-	__u8 version;
-	__u8 ttl;
-	__u8 ttvn; /* destination translation table version number */
-	__u8 dest[ETH_ALEN];
-	/* "4 bytes boundary + 2 bytes" long to make the payload after the
-	 * following ethernet header again 4 bytes boundary aligned
-	 */
-};
-
-/**
- * struct batadv_unicast_4addr_packet - extended unicast packet
- * @u: common unicast packet header
- * @src: address of the source
- * @subtype: packet subtype
- * @reserved: reserved byte for alignment
- */
-struct batadv_unicast_4addr_packet {
-	struct batadv_unicast_packet u;
-	__u8 src[ETH_ALEN];
-	__u8 subtype;
-	__u8 reserved;
-	/* "4 bytes boundary + 2 bytes" long to make the payload after the
-	 * following ethernet header again 4 bytes boundary aligned
-	 */
-};
-
-/**
- * struct batadv_frag_packet - fragmented packet
- * @packet_type: batman-adv packet type, part of the general header
- * @version: batman-adv protocol version, part of the genereal header
- * @ttl: time to live for this packet, part of the genereal header
- * @dest: final destination used when routing fragments
- * @orig: originator of the fragment used when merging the packet
- * @no: fragment number within this sequence
- * @priority: priority of frame, from ToS IP precedence or 802.1p
- * @reserved: reserved byte for alignment
- * @seqno: sequence identification
- * @total_size: size of the merged packet
- */
-struct batadv_frag_packet {
-	__u8   packet_type;
-	__u8   version;  /* batman version field */
-	__u8   ttl;
-#if defined(__BIG_ENDIAN_BITFIELD)
-	__u8   no:4;
-	__u8   priority:3;
-	__u8   reserved:1;
-#elif defined(__LITTLE_ENDIAN_BITFIELD)
-	__u8   reserved:1;
-	__u8   priority:3;
-	__u8   no:4;
-#else
-#error "unknown bitfield endianness"
-#endif
-	__u8   dest[ETH_ALEN];
-	__u8   orig[ETH_ALEN];
-	__be16 seqno;
-	__be16 total_size;
-};
-
-/**
- * struct batadv_bcast_packet - broadcast packet for network payload
- * @packet_type: batman-adv packet type, part of the general header
- * @version: batman-adv protocol version, part of the genereal header
- * @ttl: time to live for this packet, part of the genereal header
- * @reserved: reserved byte for alignment
- * @seqno: sequence identification
- * @orig: originator of the broadcast packet
- */
-struct batadv_bcast_packet {
-	__u8   packet_type;
-	__u8   version;  /* batman version field */
-	__u8   ttl;
-	__u8   reserved;
-	__be32 seqno;
-	__u8   orig[ETH_ALEN];
-	/* "4 bytes boundary + 2 bytes" long to make the payload after the
-	 * following ethernet header again 4 bytes boundary aligned
-	 */
-};
-
-/**
- * struct batadv_coded_packet - network coded packet
- * @packet_type: batman-adv packet type, part of the general header
- * @version: batman-adv protocol version, part of the genereal header
- * @ttl: time to live for this packet, part of the genereal header
- * @first_source: original source of first included packet
- * @first_orig_dest: original destinal of first included packet
- * @first_crc: checksum of first included packet
- * @first_ttvn: tt-version number of first included packet
- * @second_ttl: ttl of second packet
- * @second_dest: second receiver of this coded packet
- * @second_source: original source of second included packet
- * @second_orig_dest: original destination of second included packet
- * @second_crc: checksum of second included packet
- * @second_ttvn: tt version number of second included packet
- * @coded_len: length of network coded part of the payload
- */
-struct batadv_coded_packet {
-	__u8   packet_type;
-	__u8   version;  /* batman version field */
-	__u8   ttl;
-	__u8   first_ttvn;
-	/* __u8 first_dest[ETH_ALEN]; - saved in mac header destination */
-	__u8   first_source[ETH_ALEN];
-	__u8   first_orig_dest[ETH_ALEN];
-	__be32 first_crc;
-	__u8   second_ttl;
-	__u8   second_ttvn;
-	__u8   second_dest[ETH_ALEN];
-	__u8   second_source[ETH_ALEN];
-	__u8   second_orig_dest[ETH_ALEN];
-	__be32 second_crc;
-	__be16 coded_len;
-};
-
-#pragma pack()
-
-/**
- * struct batadv_unicast_tvlv_packet - generic unicast packet with tvlv payload
- * @packet_type: batman-adv packet type, part of the general header
- * @version: batman-adv protocol version, part of the genereal header
- * @ttl: time to live for this packet, part of the genereal header
- * @reserved: reserved field (for packet alignment)
- * @src: address of the source
- * @dst: address of the destination
- * @tvlv_len: length of tvlv data following the unicast tvlv header
- * @align: 2 bytes to align the header to a 4 byte boundary
- */
-struct batadv_unicast_tvlv_packet {
-	__u8   packet_type;
-	__u8   version;  /* batman version field */
-	__u8   ttl;
-	__u8   reserved;
-	__u8   dst[ETH_ALEN];
-	__u8   src[ETH_ALEN];
-	__be16 tvlv_len;
-	__u16  align;
-};
-
-/**
- * struct batadv_tvlv_hdr - base tvlv header struct
- * @type: tvlv container type (see batadv_tvlv_type)
- * @version: tvlv container version
- * @len: tvlv container length
- */
-struct batadv_tvlv_hdr {
-	__u8   type;
-	__u8   version;
-	__be16 len;
-};
-
-/**
- * struct batadv_tvlv_gateway_data - gateway data propagated through gw tvlv
- *  container
- * @bandwidth_down: advertised uplink download bandwidth
- * @bandwidth_up: advertised uplink upload bandwidth
- */
-struct batadv_tvlv_gateway_data {
-	__be32 bandwidth_down;
-	__be32 bandwidth_up;
-};
-
-/**
- * struct batadv_tvlv_tt_data - tt data propagated through the tt tvlv container
- * @flags: translation table flags (see batadv_tt_data_flags)
- * @ttvn: translation table version number
- * @num_vlan: number of announced VLANs. In the TVLV this struct is followed by
- *  one batadv_tvlv_tt_vlan_data object per announced vlan
- */
-struct batadv_tvlv_tt_data {
-	__u8   flags;
-	__u8   ttvn;
-	__be16 num_vlan;
-};
-
-/**
- * struct batadv_tvlv_tt_vlan_data - vlan specific tt data propagated through
- *  the tt tvlv container
- * @crc: crc32 checksum of the entries belonging to this vlan
- * @vid: vlan identifier
- * @reserved: unused, useful for alignment purposes
- */
-struct batadv_tvlv_tt_vlan_data {
-	__be32 crc;
-	__be16 vid;
-	__u16  reserved;
-};
-
-/**
- * struct batadv_tvlv_tt_change - translation table diff data
- * @flags: status indicators concerning the non-mesh client (see
- *  batadv_tt_client_flags)
- * @reserved: reserved field - useful for alignment purposes only
- * @addr: mac address of non-mesh client that triggered this tt change
- * @vid: VLAN identifier
- */
-struct batadv_tvlv_tt_change {
-	__u8   flags;
-	__u8   reserved[3];
-	__u8   addr[ETH_ALEN];
-	__be16 vid;
-};
-
-/**
- * struct batadv_tvlv_roam_adv - roaming advertisement
- * @client: mac address of roaming client
- * @vid: VLAN identifier
- */
-struct batadv_tvlv_roam_adv {
-	__u8   client[ETH_ALEN];
-	__be16 vid;
-};
-
-/**
- * struct batadv_tvlv_mcast_data - payload of a multicast tvlv
- * @flags: multicast flags announced by the orig node
- * @reserved: reserved field
- */
-struct batadv_tvlv_mcast_data {
-	__u8 flags;
-	__u8 reserved[3];
-};
-
-#endif /* _NET_BATMAN_ADV_PACKET_H_ */
diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c
index eb835bde502a..b6891e8b741c 100644
--- a/net/batman-adv/routing.c
+++ b/net/batman-adv/routing.c
@@ -34,6 +34,7 @@
 #include <linux/skbuff.h>
 #include <linux/spinlock.h>
 #include <linux/stddef.h>
+#include <uapi/linux/batadv_packet.h>
 
 #include "bitarray.h"
 #include "bridge_loop_avoidance.h"
@@ -44,7 +45,6 @@
 #include "log.h"
 #include "network-coding.h"
 #include "originator.h"
-#include "packet.h"
 #include "send.h"
 #include "soft-interface.h"
 #include "tp_meter.h"
diff --git a/net/batman-adv/send.h b/net/batman-adv/send.h
index 8c7399dd06ca..1e8c79093623 100644
--- a/net/batman-adv/send.h
+++ b/net/batman-adv/send.h
@@ -24,8 +24,7 @@
 #include <linux/compiler.h>
 #include <linux/spinlock.h>
 #include <linux/types.h>
-
-#include "packet.h"
+#include <uapi/linux/batadv_packet.h>
 
 struct sk_buff;
 
diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
index 1eb5555c5fe4..900c5ce21cd4 100644
--- a/net/batman-adv/soft-interface.c
+++ b/net/batman-adv/soft-interface.c
@@ -49,6 +49,7 @@
 #include <linux/stddef.h>
 #include <linux/string.h>
 #include <linux/types.h>
+#include <uapi/linux/batadv_packet.h>
 
 #include "bat_algo.h"
 #include "bridge_loop_avoidance.h"
@@ -60,7 +61,6 @@
 #include "multicast.h"
 #include "network-coding.h"
 #include "originator.h"
-#include "packet.h"
 #include "send.h"
 #include "sysfs.h"
 #include "translation-table.h"
diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c
index 56fb42551453..c1578fa0b952 100644
--- a/net/batman-adv/sysfs.c
+++ b/net/batman-adv/sysfs.c
@@ -39,6 +39,7 @@
 #include <linux/string.h>
 #include <linux/stringify.h>
 #include <linux/workqueue.h>
+#include <uapi/linux/batadv_packet.h>
 
 #include "bridge_loop_avoidance.h"
 #include "distributed-arp-table.h"
@@ -47,7 +48,6 @@
 #include "hard-interface.h"
 #include "log.h"
 #include "network-coding.h"
-#include "packet.h"
 #include "soft-interface.h"
 
 static struct net_device *batadv_kobj_to_netdev(struct kobject *obj)
diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c
index 7dcf2aa4deb5..8b576712d0c1 100644
--- a/net/batman-adv/tp_meter.c
+++ b/net/batman-adv/tp_meter.c
@@ -49,13 +49,13 @@
 #include <linux/timer.h>
 #include <linux/wait.h>
 #include <linux/workqueue.h>
+#include <uapi/linux/batadv_packet.h>
 #include <uapi/linux/batman_adv.h>
 
 #include "hard-interface.h"
 #include "log.h"
 #include "netlink.h"
 #include "originator.h"
-#include "packet.h"
 #include "send.h"
 
 /**
diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c
index 0e53be3f8df0..7550a9ccd695 100644
--- a/net/batman-adv/translation-table.c
+++ b/net/batman-adv/translation-table.c
@@ -52,6 +52,7 @@
 #include <net/genetlink.h>
 #include <net/netlink.h>
 #include <net/sock.h>
+#include <uapi/linux/batadv_packet.h>
 #include <uapi/linux/batman_adv.h>
 
 #include "bridge_loop_avoidance.h"
@@ -60,7 +61,6 @@
 #include "log.h"
 #include "netlink.h"
 #include "originator.h"
-#include "packet.h"
 #include "soft-interface.h"
 #include "tvlv.h"
 
diff --git a/net/batman-adv/tvlv.c b/net/batman-adv/tvlv.c
index e189f026974c..5ffcb45ac6ff 100644
--- a/net/batman-adv/tvlv.c
+++ b/net/batman-adv/tvlv.c
@@ -36,9 +36,9 @@
 #include <linux/stddef.h>
 #include <linux/string.h>
 #include <linux/types.h>
+#include <uapi/linux/batadv_packet.h>
 
 #include "originator.h"
-#include "packet.h"
 #include "send.h"
 #include "tvlv.h"
 
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index 77b145eba193..bb1578410e0c 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -35,10 +35,9 @@
 #include <linux/types.h>
 #include <linux/wait.h>
 #include <linux/workqueue.h>
+#include <uapi/linux/batadv_packet.h>
 #include <uapi/linux/batman_adv.h>
 
-#include "packet.h"
-
 struct seq_file;
 
 #ifdef CONFIG_BATMAN_ADV_DAT
-- 
cgit v1.2.3


From f15bc54eeecd86dfba3885aab839cd1f45172a38 Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
Date: Fri, 22 Dec 2017 15:10:18 +0100
Subject: l2tp: add peer_offset parameter

Introduce peer_offset parameter in order to add the capability
to specify two different values for payload offset on tx/rx side.
If just offset is provided by userspace, use it for rx side as well
in order to maintain compatibility with older l2tp versions.

Signed-off-by: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/l2tp.h |  1 +
 net/l2tp/l2tp_core.c      |  3 ++-
 net/l2tp/l2tp_core.h      | 13 ++++++++++---
 net/l2tp/l2tp_debugfs.c   |  8 +++++---
 net/l2tp/l2tp_netlink.c   | 21 ++++++++++++++++++++-
 5 files changed, 38 insertions(+), 8 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/l2tp.h b/include/uapi/linux/l2tp.h
index d84ce5c1c9aa..d6fee55dbded 100644
--- a/include/uapi/linux/l2tp.h
+++ b/include/uapi/linux/l2tp.h
@@ -127,6 +127,7 @@ enum {
 	L2TP_ATTR_UDP_ZERO_CSUM6_TX,	/* flag */
 	L2TP_ATTR_UDP_ZERO_CSUM6_RX,	/* flag */
 	L2TP_ATTR_PAD,
+	L2TP_ATTR_PEER_OFFSET,		/* u16 */
 	__L2TP_ATTR_MAX,
 };
 
diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
index 115918ad8eca..6ff64717da1e 100644
--- a/net/l2tp/l2tp_core.c
+++ b/net/l2tp/l2tp_core.c
@@ -792,7 +792,7 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb,
 			ptr += 2 + offset;
 		}
 	} else
-		ptr += session->offset;
+		ptr += session->peer_offset;
 
 	offset = ptr - optr;
 	if (!pskb_may_pull(skb, offset))
@@ -1785,6 +1785,7 @@ struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunn
 			session->lns_mode = cfg->lns_mode;
 			session->reorder_timeout = cfg->reorder_timeout;
 			session->offset = cfg->offset;
+			session->peer_offset = cfg->peer_offset;
 			session->l2specific_type = cfg->l2specific_type;
 			session->l2specific_len = cfg->l2specific_len;
 			session->cookie_len = cfg->cookie_len;
diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h
index 9534e16965cc..c6fe7cc42a05 100644
--- a/net/l2tp/l2tp_core.h
+++ b/net/l2tp/l2tp_core.h
@@ -59,7 +59,8 @@ struct l2tp_session_cfg {
 	int			debug;		/* bitmask of debug message
 						 * categories */
 	u16			vlan_id;	/* VLAN pseudowire only */
-	u16			offset;		/* offset to payload */
+	u16			offset;		/* offset to tx payload */
+	u16			peer_offset;	/* offset to rx payload */
 	u16			l2specific_len;	/* Layer 2 specific length */
 	u16			l2specific_type; /* Layer 2 specific type */
 	u8			cookie[8];	/* optional cookie */
@@ -86,8 +87,14 @@ struct l2tp_session {
 	int			cookie_len;
 	u8			peer_cookie[8];
 	int			peer_cookie_len;
-	u16			offset;		/* offset from end of L2TP header
-						   to beginning of data */
+	u16			offset;		/* offset from end of L2TP
+						 * header to beginning of
+						 * tx data
+						 */
+	u16			peer_offset;	/* offset from end of L2TP
+						 * header to beginning of
+						 * rx data
+						 */
 	u16			l2specific_len;
 	u16			l2specific_type;
 	u16			hdr_len;
diff --git a/net/l2tp/l2tp_debugfs.c b/net/l2tp/l2tp_debugfs.c
index eb69411bcb47..4cc30b38aba4 100644
--- a/net/l2tp/l2tp_debugfs.c
+++ b/net/l2tp/l2tp_debugfs.c
@@ -180,8 +180,9 @@ static void l2tp_dfs_seq_session_show(struct seq_file *m, void *v)
 		   session->lns_mode ? "LNS" : "LAC",
 		   session->debug,
 		   jiffies_to_msecs(session->reorder_timeout));
-	seq_printf(m, "   offset %hu l2specific %hu/%hu\n",
-		   session->offset, session->l2specific_type, session->l2specific_len);
+	seq_printf(m, "   offset %hu peer_offset %hu l2specific %hu/%hu\n",
+		   session->offset, session->peer_offset,
+		   session->l2specific_type, session->l2specific_len);
 	if (session->cookie_len) {
 		seq_printf(m, "   cookie %02x%02x%02x%02x",
 			   session->cookie[0], session->cookie[1],
@@ -228,7 +229,8 @@ static int l2tp_dfs_seq_show(struct seq_file *m, void *v)
 		seq_puts(m, " debug tx-pkts/bytes/errs rx-pkts/bytes/errs\n");
 		seq_puts(m, "  SESSION ID, peer ID, PWTYPE\n");
 		seq_puts(m, "   refcnt cnt\n");
-		seq_puts(m, "   offset OFFSET l2specific TYPE/LEN\n");
+		seq_puts(m, "   offset OFFSET peer_offset OFFSET");
+		seq_puts(m, " l2specific TYPE/LEN\n");
 		seq_puts(m, "   [ cookie ]\n");
 		seq_puts(m, "   [ peer cookie ]\n");
 		seq_puts(m, "   config mtu/mru/rcvseq/sendseq/dataseq/lns debug reorderto\n");
diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c
index 7e9c50125556..d7d4d7a7a54d 100644
--- a/net/l2tp/l2tp_netlink.c
+++ b/net/l2tp/l2tp_netlink.c
@@ -547,9 +547,25 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf
 	}
 
 	if (tunnel->version > 2) {
-		if (info->attrs[L2TP_ATTR_OFFSET])
+		if (info->attrs[L2TP_ATTR_PEER_OFFSET]) {
+			struct nlattr *peer_offset;
+
+			peer_offset = info->attrs[L2TP_ATTR_PEER_OFFSET];
+			cfg.peer_offset = nla_get_u16(peer_offset);
+		}
+
+		if (info->attrs[L2TP_ATTR_OFFSET]) {
 			cfg.offset = nla_get_u16(info->attrs[L2TP_ATTR_OFFSET]);
 
+			/* in order to maintain compatibility with older
+			 * versions where offset was used for both tx and
+			 * rx side, update rx side with offset if peer_offset
+			 * is not provided by userspace
+			 */
+			if (!info->attrs[L2TP_ATTR_PEER_OFFSET])
+				cfg.peer_offset = cfg.offset;
+		}
+
 		if (info->attrs[L2TP_ATTR_DATA_SEQ])
 			cfg.data_seq = nla_get_u8(info->attrs[L2TP_ATTR_DATA_SEQ]);
 
@@ -763,6 +779,8 @@ static int l2tp_nl_session_send(struct sk_buff *skb, u32 portid, u32 seq, int fl
 	     nla_put_string(skb, L2TP_ATTR_IFNAME, session->ifname)) ||
 	    (session->offset &&
 	     nla_put_u16(skb, L2TP_ATTR_OFFSET, session->offset)) ||
+	    (session->peer_offset &&
+	     nla_put_u16(skb, L2TP_ATTR_PEER_OFFSET, session->peer_offset)) ||
 	    (session->cookie_len &&
 	     nla_put(skb, L2TP_ATTR_COOKIE, session->cookie_len,
 		     &session->cookie[0])) ||
@@ -903,6 +921,7 @@ static const struct nla_policy l2tp_nl_policy[L2TP_ATTR_MAX + 1] = {
 	[L2TP_ATTR_PW_TYPE]		= { .type = NLA_U16, },
 	[L2TP_ATTR_ENCAP_TYPE]		= { .type = NLA_U16, },
 	[L2TP_ATTR_OFFSET]		= { .type = NLA_U16, },
+	[L2TP_ATTR_PEER_OFFSET]		= { .type = NLA_U16, },
 	[L2TP_ATTR_DATA_SEQ]		= { .type = NLA_U8, },
 	[L2TP_ATTR_L2SPEC_TYPE]		= { .type = NLA_U8, },
 	[L2TP_ATTR_L2SPEC_LEN]		= { .type = NLA_U8, },
-- 
cgit v1.2.3


From 57868acc369ab73ec8f6b43a0c6749077376b189 Mon Sep 17 00:00:00 2001
From: Satendra Singh Thakur <satendra.t@samsung.com>
Date: Mon, 18 Dec 2017 22:35:53 -0500
Subject: media: videobuf2: Add new uAPI for DVB streaming I/O

Adds a new uAPI for DVB to use streaming I/O which is implemented
based on videobuf2, using those new ioctls:

- DMX_REQBUFS:  Request kernel to allocate buffers whose count and size
	        are specified by the user.
- DMX_QUERYBUF: Get the buffer information, such as the memory offset that
		will be passed to mmap() and shared with user-space.
- DMX_EXPBUF:   Just for testing whether buffer exporting succeeds or not.
- DMX_QBUF:     Pass the buffer to kernel-space.
- DMX_DQBUF:    Get back the buffer which may contain TS data.

Originally developed by: Junghak Sung <jh1009.sung@samsung.com>, as
seen at:
	https://patchwork.linuxtv.org/patch/31613/
	https://patchwork.kernel.org/patch/7334301/

The original patch was written before merging VB2-core functionalities
upstream. When that series was added, several adjustments were made,
fixing some issues with V4L2, causing the original patch to be
non-trivially rebased.

After rebasing, a few bugs in the patch were fixed. The patch was
also enhanced and polling functionality was added.

The main changes over the original patch are:

dvb_vb2_fill_buffer():
	- Set the size of the outgoing buffer after while loop using
	  vb2_set_plane_payload;

	- Added NULL check for source buffer: as per the normal convention
	  of the demux driver, this is called twice, the first time with a
	  valid buffer and the second time with a NULL pointer; if it's not
	  handled, it will result in a crash

	- Restricted spinlock for only list_* operations

dvb_vb2_init():
	- Restricted q->io_modes to only VB2_MMAP as it's the only
	  supported mode

dvb_vb2_release():
	- Replaced the && in if condition with &, because otherwise
	  it was always getting satisfied.

dvb_vb2_stream_off():
	- Added list_del code for enqueued buffers upon stream off

dvb_vb2_poll():
	- Added this new function in order to support polling

dvb_demux_poll() and dvb_dvr_poll()
	- dvb_vb2_poll() is now called from these functions

- Ported this patch and latest videobuf2 to lower kernel versions and
  tested auto scan.

Co-developed-by: Junghak Sung <jh1009.sung@samsung.com>

[mchehab@s-opensource.com: checkpatch fixes]
Signed-off-by: Junghak Sung <jh1009.sung@samsung.com>
Signed-off-by: Geunyoung Kim <nenggun.kim@samsung.com>
Acked-by: Seung-Woo Kim <sw0312.kim@samsung.com>
Acked-by: Inki Dae <inki.dae@samsung.com>
Signed-off-by: Satendra Singh Thakur <satendra.t@samsung.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@s-opensource.com>
---
 drivers/media/dvb-core/Makefile  |   2 +-
 drivers/media/dvb-core/dmxdev.c  | 196 +++++++++++++++---
 drivers/media/dvb-core/dmxdev.h  |   4 +
 drivers/media/dvb-core/dvb_vb2.c | 423 +++++++++++++++++++++++++++++++++++++++
 drivers/media/dvb-core/dvb_vb2.h |  72 +++++++
 include/uapi/linux/dvb/dmx.h     |  66 +++++-
 6 files changed, 733 insertions(+), 30 deletions(-)
 create mode 100644 drivers/media/dvb-core/dvb_vb2.c
 create mode 100644 drivers/media/dvb-core/dvb_vb2.h

(limited to 'include/uapi/linux')

diff --git a/drivers/media/dvb-core/Makefile b/drivers/media/dvb-core/Makefile
index 47e2e391bfb8..bbc65dfa0a8e 100644
--- a/drivers/media/dvb-core/Makefile
+++ b/drivers/media/dvb-core/Makefile
@@ -7,6 +7,6 @@ dvb-net-$(CONFIG_DVB_NET) := dvb_net.o
 
 dvb-core-objs := dvbdev.o dmxdev.o dvb_demux.o		 	\
 		 dvb_ca_en50221.o dvb_frontend.o 		\
-		 $(dvb-net-y) dvb_ringbuffer.o dvb_math.o
+		 $(dvb-net-y) dvb_ringbuffer.o dvb_vb2.o dvb_math.o
 
 obj-$(CONFIG_DVB_CORE) += dvb-core.o
diff --git a/drivers/media/dvb-core/dmxdev.c b/drivers/media/dvb-core/dmxdev.c
index 3ddd44e1ee77..0ddf58adb172 100644
--- a/drivers/media/dvb-core/dmxdev.c
+++ b/drivers/media/dvb-core/dmxdev.c
@@ -28,6 +28,7 @@
 #include <linux/wait.h>
 #include <linux/uaccess.h>
 #include "dmxdev.h"
+#include "dvb_vb2.h"
 
 static int debug;
 
@@ -138,14 +139,8 @@ static int dvb_dvr_open(struct inode *inode, struct file *file)
 		return -ENODEV;
 	}
 
-	if ((file->f_flags & O_ACCMODE) == O_RDWR) {
-		if (!(dmxdev->capabilities & DMXDEV_CAP_DUPLEX)) {
-			mutex_unlock(&dmxdev->mutex);
-			return -EOPNOTSUPP;
-		}
-	}
-
-	if ((file->f_flags & O_ACCMODE) == O_RDONLY) {
+	if (((file->f_flags & O_ACCMODE) == O_RDONLY) ||
+	    ((file->f_flags & O_ACCMODE) == O_RDWR)) {
 		void *mem;
 
 		if (!dvbdev->readers) {
@@ -158,6 +153,8 @@ static int dvb_dvr_open(struct inode *inode, struct file *file)
 			return -ENOMEM;
 		}
 		dvb_ringbuffer_init(&dmxdev->dvr_buffer, mem, DVR_BUFFER_SIZE);
+		dvb_vb2_init(&dmxdev->dvr_vb2_ctx, "dvr",
+			     file->f_flags & O_NONBLOCK);
 		dvbdev->readers--;
 	}
 
@@ -195,7 +192,11 @@ static int dvb_dvr_release(struct inode *inode, struct file *file)
 		dmxdev->demux->connect_frontend(dmxdev->demux,
 						dmxdev->dvr_orig_fe);
 	}
-	if ((file->f_flags & O_ACCMODE) == O_RDONLY) {
+	if (((file->f_flags & O_ACCMODE) == O_RDONLY) ||
+	    ((file->f_flags & O_ACCMODE) == O_RDWR)) {
+		if (dvb_vb2_is_streaming(&dmxdev->dvr_vb2_ctx))
+			dvb_vb2_stream_off(&dmxdev->dvr_vb2_ctx);
+		dvb_vb2_release(&dmxdev->dvr_vb2_ctx);
 		dvbdev->readers++;
 		if (dmxdev->dvr_buffer.data) {
 			void *mem = dmxdev->dvr_buffer.data;
@@ -358,8 +359,8 @@ static int dvb_dmxdev_section_callback(const u8 *buffer1, size_t buffer1_len,
 {
 	struct dmxdev_filter *dmxdevfilter = filter->priv;
 	int ret;
-
-	if (dmxdevfilter->buffer.error) {
+	if (!dvb_vb2_is_streaming(&dmxdevfilter->vb2_ctx) &&
+	    dmxdevfilter->buffer.error) {
 		wake_up(&dmxdevfilter->buffer.queue);
 		return 0;
 	}
@@ -370,11 +371,19 @@ static int dvb_dmxdev_section_callback(const u8 *buffer1, size_t buffer1_len,
 	}
 	del_timer(&dmxdevfilter->timer);
 	dprintk("section callback %*ph\n", 6, buffer1);
-	ret = dvb_dmxdev_buffer_write(&dmxdevfilter->buffer, buffer1,
-				      buffer1_len);
-	if (ret == buffer1_len) {
-		ret = dvb_dmxdev_buffer_write(&dmxdevfilter->buffer, buffer2,
-					      buffer2_len);
+	if (dvb_vb2_is_streaming(&dmxdevfilter->vb2_ctx)) {
+		ret = dvb_vb2_fill_buffer(&dmxdevfilter->vb2_ctx,
+					  buffer1, buffer1_len);
+		if (ret == buffer1_len)
+			ret = dvb_vb2_fill_buffer(&dmxdevfilter->vb2_ctx,
+						  buffer2, buffer2_len);
+	} else {
+		ret = dvb_dmxdev_buffer_write(&dmxdevfilter->buffer,
+					      buffer1, buffer1_len);
+		if (ret == buffer1_len) {
+			ret = dvb_dmxdev_buffer_write(&dmxdevfilter->buffer,
+						      buffer2, buffer2_len);
+		}
 	}
 	if (ret < 0)
 		dmxdevfilter->buffer.error = ret;
@@ -391,6 +400,7 @@ static int dvb_dmxdev_ts_callback(const u8 *buffer1, size_t buffer1_len,
 {
 	struct dmxdev_filter *dmxdevfilter = feed->priv;
 	struct dvb_ringbuffer *buffer;
+	struct dvb_vb2_ctx *ctx;
 	int ret;
 
 	spin_lock(&dmxdevfilter->dev->lock);
@@ -399,19 +409,30 @@ static int dvb_dmxdev_ts_callback(const u8 *buffer1, size_t buffer1_len,
 		return 0;
 	}
 
-	if (dmxdevfilter->params.pes.output == DMX_OUT_TAP
-	    || dmxdevfilter->params.pes.output == DMX_OUT_TSDEMUX_TAP)
+	if (dmxdevfilter->params.pes.output == DMX_OUT_TAP ||
+	    dmxdevfilter->params.pes.output == DMX_OUT_TSDEMUX_TAP) {
 		buffer = &dmxdevfilter->buffer;
-	else
+		ctx = &dmxdevfilter->vb2_ctx;
+	} else {
 		buffer = &dmxdevfilter->dev->dvr_buffer;
-	if (buffer->error) {
-		spin_unlock(&dmxdevfilter->dev->lock);
-		wake_up(&buffer->queue);
-		return 0;
+		ctx = &dmxdevfilter->dev->dvr_vb2_ctx;
+	}
+
+	if (dvb_vb2_is_streaming(ctx)) {
+		ret = dvb_vb2_fill_buffer(ctx, buffer1, buffer1_len);
+		if (ret == buffer1_len)
+			ret = dvb_vb2_fill_buffer(ctx, buffer2, buffer2_len);
+	} else {
+		if (buffer->error) {
+			spin_unlock(&dmxdevfilter->dev->lock);
+			wake_up(&buffer->queue);
+			return 0;
+		}
+		ret = dvb_dmxdev_buffer_write(buffer, buffer1, buffer1_len);
+		if (ret == buffer1_len)
+			ret = dvb_dmxdev_buffer_write(buffer,
+						      buffer2, buffer2_len);
 	}
-	ret = dvb_dmxdev_buffer_write(buffer, buffer1, buffer1_len);
-	if (ret == buffer1_len)
-		ret = dvb_dmxdev_buffer_write(buffer, buffer2, buffer2_len);
 	if (ret < 0)
 		buffer->error = ret;
 	spin_unlock(&dmxdevfilter->dev->lock);
@@ -750,6 +771,8 @@ static int dvb_demux_open(struct inode *inode, struct file *file)
 	file->private_data = dmxdevfilter;
 
 	dvb_ringbuffer_init(&dmxdevfilter->buffer, NULL, 8192);
+	dvb_vb2_init(&dmxdevfilter->vb2_ctx, "demux_filter",
+		     file->f_flags & O_NONBLOCK);
 	dmxdevfilter->type = DMXDEV_TYPE_NONE;
 	dvb_dmxdev_filter_state_set(dmxdevfilter, DMXDEV_STATE_ALLOCATED);
 	timer_setup(&dmxdevfilter->timer, dvb_dmxdev_filter_timeout, 0);
@@ -765,6 +788,10 @@ static int dvb_dmxdev_filter_free(struct dmxdev *dmxdev,
 {
 	mutex_lock(&dmxdev->mutex);
 	mutex_lock(&dmxdevfilter->mutex);
+	if (dvb_vb2_is_streaming(&dmxdevfilter->vb2_ctx))
+		dvb_vb2_stream_off(&dmxdevfilter->vb2_ctx);
+	dvb_vb2_release(&dmxdevfilter->vb2_ctx);
+
 
 	dvb_dmxdev_filter_stop(dmxdevfilter);
 	dvb_dmxdev_filter_reset(dmxdevfilter);
@@ -1052,6 +1079,53 @@ static int dvb_demux_do_ioctl(struct file *file,
 		mutex_unlock(&dmxdevfilter->mutex);
 		break;
 
+	case DMX_REQBUFS:
+		if (mutex_lock_interruptible(&dmxdevfilter->mutex)) {
+			mutex_unlock(&dmxdev->mutex);
+			return -ERESTARTSYS;
+		}
+		ret = dvb_vb2_reqbufs(&dmxdevfilter->vb2_ctx, parg);
+		mutex_unlock(&dmxdevfilter->mutex);
+		break;
+
+	case DMX_QUERYBUF:
+		if (mutex_lock_interruptible(&dmxdevfilter->mutex)) {
+			mutex_unlock(&dmxdev->mutex);
+			return -ERESTARTSYS;
+		}
+		ret = dvb_vb2_querybuf(&dmxdevfilter->vb2_ctx, parg);
+		mutex_unlock(&dmxdevfilter->mutex);
+		break;
+
+	case DMX_EXPBUF:
+		if (mutex_lock_interruptible(&dmxdevfilter->mutex)) {
+			mutex_unlock(&dmxdev->mutex);
+			return -ERESTARTSYS;
+		}
+		ret = dvb_vb2_expbuf(&dmxdevfilter->vb2_ctx, parg);
+		mutex_unlock(&dmxdevfilter->mutex);
+		break;
+
+	case DMX_QBUF:
+		if (mutex_lock_interruptible(&dmxdevfilter->mutex)) {
+			mutex_unlock(&dmxdev->mutex);
+			return -ERESTARTSYS;
+		}
+		ret = dvb_vb2_qbuf(&dmxdevfilter->vb2_ctx, parg);
+		if (ret == 0 && !dvb_vb2_is_streaming(&dmxdevfilter->vb2_ctx))
+			ret = dvb_vb2_stream_on(&dmxdevfilter->vb2_ctx);
+		mutex_unlock(&dmxdevfilter->mutex);
+		break;
+
+	case DMX_DQBUF:
+		if (mutex_lock_interruptible(&dmxdevfilter->mutex)) {
+			mutex_unlock(&dmxdev->mutex);
+			return -ERESTARTSYS;
+		}
+		ret = dvb_vb2_dqbuf(&dmxdevfilter->vb2_ctx, parg);
+		mutex_unlock(&dmxdevfilter->mutex);
+		break;
+
 	default:
 		ret = -EINVAL;
 		break;
@@ -1073,6 +1147,8 @@ static unsigned int dvb_demux_poll(struct file *file, poll_table *wait)
 
 	if ((!dmxdevfilter) || dmxdevfilter->dev->exit)
 		return POLLERR;
+	if (dvb_vb2_is_streaming(&dmxdevfilter->vb2_ctx))
+		return dvb_vb2_poll(&dmxdevfilter->vb2_ctx, file, wait);
 
 	poll_wait(file, &dmxdevfilter->buffer.queue, wait);
 
@@ -1090,11 +1166,31 @@ static unsigned int dvb_demux_poll(struct file *file, poll_table *wait)
 	return mask;
 }
 
-static int dvb_demux_release(struct inode *inode, struct file *file)
+static int dvb_demux_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	struct dmxdev_filter *dmxdevfilter = file->private_data;
 	struct dmxdev *dmxdev = dmxdevfilter->dev;
+	int ret;
+
+	if (mutex_lock_interruptible(&dmxdev->mutex))
+		return -ERESTARTSYS;
+
+	if (mutex_lock_interruptible(&dmxdevfilter->mutex)) {
+		mutex_unlock(&dmxdev->mutex);
+		return -ERESTARTSYS;
+	}
+	ret = dvb_vb2_mmap(&dmxdevfilter->vb2_ctx, vma);
 
+	mutex_unlock(&dmxdevfilter->mutex);
+	mutex_unlock(&dmxdev->mutex);
+
+	return ret;
+}
+
+static int dvb_demux_release(struct inode *inode, struct file *file)
+{
+	struct dmxdev_filter *dmxdevfilter = file->private_data;
+	struct dmxdev *dmxdev = dmxdevfilter->dev;
 	int ret;
 
 	ret = dvb_dmxdev_filter_free(dmxdev, dmxdevfilter);
@@ -1118,6 +1214,7 @@ static const struct file_operations dvb_demux_fops = {
 	.release = dvb_demux_release,
 	.poll = dvb_demux_poll,
 	.llseek = default_llseek,
+	.mmap = dvb_demux_mmap,
 };
 
 static const struct dvb_device dvbdev_demux = {
@@ -1146,6 +1243,28 @@ static int dvb_dvr_do_ioctl(struct file *file,
 		ret = dvb_dvr_set_buffer_size(dmxdev, arg);
 		break;
 
+	case DMX_REQBUFS:
+		ret = dvb_vb2_reqbufs(&dmxdev->dvr_vb2_ctx, parg);
+		break;
+
+	case DMX_QUERYBUF:
+		ret = dvb_vb2_querybuf(&dmxdev->dvr_vb2_ctx, parg);
+		break;
+
+	case DMX_EXPBUF:
+		ret = dvb_vb2_expbuf(&dmxdev->dvr_vb2_ctx, parg);
+		break;
+
+	case DMX_QBUF:
+		ret = dvb_vb2_qbuf(&dmxdev->dvr_vb2_ctx, parg);
+		if (ret == 0 && !dvb_vb2_is_streaming(&dmxdev->dvr_vb2_ctx))
+			ret = dvb_vb2_stream_on(&dmxdev->dvr_vb2_ctx);
+		break;
+
+	case DMX_DQBUF:
+		ret = dvb_vb2_dqbuf(&dmxdev->dvr_vb2_ctx, parg);
+		break;
+
 	default:
 		ret = -EINVAL;
 		break;
@@ -1170,10 +1289,13 @@ static unsigned int dvb_dvr_poll(struct file *file, poll_table *wait)
 
 	if (dmxdev->exit)
 		return POLLERR;
+	if (dvb_vb2_is_streaming(&dmxdev->dvr_vb2_ctx))
+		return dvb_vb2_poll(&dmxdev->dvr_vb2_ctx, file, wait);
 
 	poll_wait(file, &dmxdev->dvr_buffer.queue, wait);
 
-	if ((file->f_flags & O_ACCMODE) == O_RDONLY) {
+	if (((file->f_flags & O_ACCMODE) == O_RDONLY) ||
+	    ((file->f_flags & O_ACCMODE) == O_RDWR)) {
 		if (dmxdev->dvr_buffer.error)
 			mask |= (POLLIN | POLLRDNORM | POLLPRI | POLLERR);
 
@@ -1185,6 +1307,23 @@ static unsigned int dvb_dvr_poll(struct file *file, poll_table *wait)
 	return mask;
 }
 
+static int dvb_dvr_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	struct dvb_device *dvbdev = file->private_data;
+	struct dmxdev *dmxdev = dvbdev->priv;
+	int ret;
+
+	if (dmxdev->exit)
+		return -ENODEV;
+
+	if (mutex_lock_interruptible(&dmxdev->mutex))
+		return -ERESTARTSYS;
+
+	ret = dvb_vb2_mmap(&dmxdev->dvr_vb2_ctx, vma);
+	mutex_unlock(&dmxdev->mutex);
+	return ret;
+}
+
 static const struct file_operations dvb_dvr_fops = {
 	.owner = THIS_MODULE,
 	.read = dvb_dvr_read,
@@ -1194,6 +1333,7 @@ static const struct file_operations dvb_dvr_fops = {
 	.release = dvb_dvr_release,
 	.poll = dvb_dvr_poll,
 	.llseek = default_llseek,
+	.mmap = dvb_dvr_mmap,
 };
 
 static const struct dvb_device dvbdev_dvr = {
diff --git a/drivers/media/dvb-core/dmxdev.h b/drivers/media/dvb-core/dmxdev.h
index a89d552620a7..a77064d6e2c4 100644
--- a/drivers/media/dvb-core/dmxdev.h
+++ b/drivers/media/dvb-core/dmxdev.h
@@ -35,6 +35,7 @@
 #include "dvbdev.h"
 #include "demux.h"
 #include "dvb_ringbuffer.h"
+#include "dvb_vb2.h"
 
 /**
  * enum dmxdev_type - type of demux filter type.
@@ -140,6 +141,7 @@ struct dmxdev_filter {
 	enum dmxdev_state state;
 	struct dmxdev *dev;
 	struct dvb_ringbuffer buffer;
+	struct dvb_vb2_ctx vb2_ctx;
 
 	struct mutex mutex;
 
@@ -183,6 +185,8 @@ struct dmxdev {
 	struct dvb_ringbuffer dvr_buffer;
 #define DVR_BUFFER_SIZE (10*188*1024)
 
+	struct dvb_vb2_ctx dvr_vb2_ctx;
+
 	struct mutex mutex;
 	spinlock_t lock;
 };
diff --git a/drivers/media/dvb-core/dvb_vb2.c b/drivers/media/dvb-core/dvb_vb2.c
new file mode 100644
index 000000000000..34193a4acc47
--- /dev/null
+++ b/drivers/media/dvb-core/dvb_vb2.c
@@ -0,0 +1,423 @@
+/*
+ * dvb-vb2.c - dvb-vb2
+ *
+ * Copyright (C) 2015 Samsung Electronics
+ *
+ * Author: jh1009.sung@samsung.com
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.
+ */
+
+#include <linux/err.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+
+#include "dvbdev.h"
+#include "dvb_vb2.h"
+
+static int vb2_debug;
+module_param(vb2_debug, int, 0644);
+
+#define dprintk(level, fmt, arg...)					      \
+	do {								      \
+		if (vb2_debug >= level)					      \
+			pr_info("vb2: %s: " fmt, __func__, ## arg); \
+	} while (0)
+
+static int _queue_setup(struct vb2_queue *vq,
+			unsigned int *nbuffers, unsigned int *nplanes,
+			unsigned int sizes[], struct device *alloc_devs[])
+{
+	struct dvb_vb2_ctx *ctx = vb2_get_drv_priv(vq);
+
+	*nbuffers = ctx->buf_cnt;
+	*nplanes = 1;
+	sizes[0] = ctx->buf_siz;
+
+	/*
+	 * videobuf2-vmalloc allocator is context-less so no need to set
+	 * alloc_ctxs array.
+	 */
+
+	dprintk(3, "[%s] count=%d, size=%d\n", ctx->name,
+		*nbuffers, sizes[0]);
+
+	return 0;
+}
+
+static int _buffer_prepare(struct vb2_buffer *vb)
+{
+	struct dvb_vb2_ctx *ctx = vb2_get_drv_priv(vb->vb2_queue);
+	unsigned long size = ctx->buf_siz;
+
+	if (vb2_plane_size(vb, 0) < size) {
+		dprintk(1, "[%s] data will not fit into plane (%lu < %lu)\n",
+			ctx->name, vb2_plane_size(vb, 0), size);
+		return -EINVAL;
+	}
+
+	vb2_set_plane_payload(vb, 0, size);
+	dprintk(3, "[%s]\n", ctx->name);
+
+	return 0;
+}
+
+static void _buffer_queue(struct vb2_buffer *vb)
+{
+	struct dvb_vb2_ctx *ctx = vb2_get_drv_priv(vb->vb2_queue);
+	struct dvb_buffer *buf = container_of(vb, struct dvb_buffer, vb);
+	unsigned long flags = 0;
+
+	spin_lock_irqsave(&ctx->slock, flags);
+	list_add_tail(&buf->list, &ctx->dvb_q);
+	spin_unlock_irqrestore(&ctx->slock, flags);
+
+	dprintk(3, "[%s]\n", ctx->name);
+}
+
+static int _start_streaming(struct vb2_queue *vq, unsigned int count)
+{
+	struct dvb_vb2_ctx *ctx = vb2_get_drv_priv(vq);
+
+	dprintk(3, "[%s] count=%d\n", ctx->name, count);
+	return 0;
+}
+
+static void _stop_streaming(struct vb2_queue *vq)
+{
+	struct dvb_vb2_ctx *ctx = vb2_get_drv_priv(vq);
+
+	dprintk(3, "[%s]\n", ctx->name);
+}
+
+static void _dmxdev_lock(struct vb2_queue *vq)
+{
+	struct dvb_vb2_ctx *ctx = vb2_get_drv_priv(vq);
+
+	mutex_lock(&ctx->mutex);
+	dprintk(3, "[%s]\n", ctx->name);
+}
+
+static void _dmxdev_unlock(struct vb2_queue *vq)
+{
+	struct dvb_vb2_ctx *ctx = vb2_get_drv_priv(vq);
+
+	if (mutex_is_locked(&ctx->mutex))
+		mutex_unlock(&ctx->mutex);
+	dprintk(3, "[%s]\n", ctx->name);
+}
+
+static const struct vb2_ops dvb_vb2_qops = {
+	.queue_setup		= _queue_setup,
+	.buf_prepare		= _buffer_prepare,
+	.buf_queue		= _buffer_queue,
+	.start_streaming	= _start_streaming,
+	.stop_streaming		= _stop_streaming,
+	.wait_prepare		= _dmxdev_unlock,
+	.wait_finish		= _dmxdev_lock,
+};
+
+static void _fill_dmx_buffer(struct vb2_buffer *vb, void *pb)
+{
+	struct dvb_vb2_ctx *ctx = vb2_get_drv_priv(vb->vb2_queue);
+	struct dmx_buffer *b = pb;
+
+	b->index = vb->index;
+	b->length = vb->planes[0].length;
+	b->bytesused = vb->planes[0].bytesused;
+	b->offset = vb->planes[0].m.offset;
+	memset(b->reserved, 0, sizeof(b->reserved));
+	dprintk(3, "[%s]\n", ctx->name);
+}
+
+static int _fill_vb2_buffer(struct vb2_buffer *vb,
+			    const void *pb, struct vb2_plane *planes)
+{
+	struct dvb_vb2_ctx *ctx = vb2_get_drv_priv(vb->vb2_queue);
+
+	planes[0].bytesused = 0;
+	dprintk(3, "[%s]\n", ctx->name);
+
+	return 0;
+}
+
+static const struct vb2_buf_ops dvb_vb2_buf_ops = {
+	.fill_user_buffer	= _fill_dmx_buffer,
+	.fill_vb2_buffer	= _fill_vb2_buffer,
+};
+
+/*
+ * Videobuf operations. dvb_vb2_init() zeroes @ctx, sets up its locks and
+ * vb2 queue, and returns 0 or the error from vb2_core_queue_init().
+ */
+int dvb_vb2_init(struct dvb_vb2_ctx *ctx, const char *name, int nonblocking)
+{
+	struct vb2_queue *q = &ctx->vb_q;
+	int ret;
+
+	memset(ctx, 0, sizeof(struct dvb_vb2_ctx));
+	/* Set name and locks first; the error path below logs the name. */
+	strscpy(ctx->name, name, sizeof(ctx->name));
+	ctx->nonblocking = nonblocking;
+	mutex_init(&ctx->mutex);
+	spin_lock_init(&ctx->slock);
+	INIT_LIST_HEAD(&ctx->dvb_q);
+
+	/* Capture-only queue; MMAP is the only I/O mode supported for now. */
+	q->type = DVB_BUF_TYPE_CAPTURE;
+	q->is_output = 0;
+	q->io_modes = VB2_MMAP;
+	q->drv_priv = ctx;
+	q->buf_struct_size = sizeof(struct dvb_buffer);
+	q->min_buffers_needed = 1;
+	q->ops = &dvb_vb2_qops;
+	q->mem_ops = &vb2_vmalloc_memops;
+	q->buf_ops = &dvb_vb2_buf_ops;
+	q->num_buffers = 0;
+	ret = vb2_core_queue_init(q);
+	if (ret) {
+		ctx->state = DVB_VB2_STATE_NONE;
+		dprintk(1, "[%s] errno=%d\n", ctx->name, ret);
+		return ret;
+	}
+
+	ctx->state = DVB_VB2_STATE_INIT;
+	dprintk(3, "[%s]\n", ctx->name);
+
+	return 0;
+}
+
+int dvb_vb2_release(struct dvb_vb2_ctx *ctx)
+{
+	/* Release the vb2 queue only when the INIT state bit is set. */
+	if (ctx->state & DVB_VB2_STATE_INIT)
+		vb2_core_queue_release(&ctx->vb_q);
+
+	/* The context is marked uninitialised unconditionally. */
+	ctx->state = DVB_VB2_STATE_NONE;
+	dprintk(3, "[%s]\n", ctx->name);
+
+	return 0;
+}
+
+int dvb_vb2_stream_on(struct dvb_vb2_ctx *ctx)
+{
+	struct vb2_queue *q = &ctx->vb_q;
+	int ret;
+
+	ret = vb2_core_streamon(q, q->type);
+	if (ret) {
+		ctx->state = DVB_VB2_STATE_NONE;
+		dprintk(1, "[%s] errno=%d\n", ctx->name, ret);
+		return ret;
+	}
+	ctx->state |= DVB_VB2_STATE_STREAMON;
+	dprintk(3, "[%s]\n", ctx->name);
+
+	return 0;
+}
+
+int dvb_vb2_stream_off(struct dvb_vb2_ctx *ctx)
+{
+	struct vb2_queue *q = (struct vb2_queue *)&ctx->vb_q;
+	int ret;
+	unsigned long flags = 0;
+
+	ctx->state &= ~DVB_VB2_STATE_STREAMON;
+	spin_lock_irqsave(&ctx->slock, flags);
+	while (!list_empty(&ctx->dvb_q)) {
+		struct dvb_buffer       *buf;
+
+		buf = list_entry(ctx->dvb_q.next,
+				 struct dvb_buffer, list);
+		list_del(&buf->list);
+		spin_unlock_irqrestore(&ctx->slock, flags);
+		vb2_buffer_done(&buf->vb, VB2_BUF_STATE_ERROR);
+		spin_lock_irqsave(&ctx->slock, flags);
+	}
+	spin_unlock_irqrestore(&ctx->slock, flags);
+	ret = vb2_core_streamoff(q, q->type);
+	if (ret) {
+		ctx->state = DVB_VB2_STATE_NONE;
+		dprintk(1, "[%s] errno=%d\n", ctx->name, ret);
+		return ret;
+	}
+	dprintk(3, "[%s]\n", ctx->name);
+
+	return 0;
+}
+
+int dvb_vb2_is_streaming(struct dvb_vb2_ctx *ctx)
+{
+	return (ctx->state & DVB_VB2_STATE_STREAMON);
+}
+
+int dvb_vb2_fill_buffer(struct dvb_vb2_ctx *ctx,
+			const unsigned char *src, int len)
+{
+	unsigned long flags = 0;
+	void *vbuf = NULL;
+	int todo = len;	/* bytes of src not yet copied */
+	unsigned char *psrc = (unsigned char *)src;
+	int ll = 0;	/* size of the chunk copied in the current pass */
+
+	dprintk(3, "[%s] %d bytes are rcvd\n", ctx->name, len);
+	if (!src) {
+		dprintk(3, "[%s]:NULL pointer src\n", ctx->name);
+		/* Normal case: the demux driver calls this function twice,
+		 * once with a valid src pointer and once with a NULL pointer.
+		 */
+		return 0;
+	}
+	while (todo) {
+		if (!ctx->buf) {	/* none in progress: take the next queued */
+			spin_lock_irqsave(&ctx->slock, flags);
+			if (list_empty(&ctx->dvb_q)) {
+				spin_unlock_irqrestore(&ctx->slock, flags);
+				dprintk(3, "[%s] Buffer overflow!!!\n",
+					ctx->name);
+				break;
+			}
+
+			ctx->buf = list_entry(ctx->dvb_q.next,
+					      struct dvb_buffer, list);
+			list_del(&ctx->buf->list);
+			spin_unlock_irqrestore(&ctx->slock, flags);
+			ctx->remain = vb2_plane_size(&ctx->buf->vb, 0);
+			ctx->offset = 0;
+		}
+
+		if (!dvb_vb2_is_streaming(ctx)) {	/* stream stopped */
+			vb2_buffer_done(&ctx->buf->vb, VB2_BUF_STATE_ERROR);
+			ctx->buf = NULL;
+			break;
+		}
+
+		/* Copy as much of the remaining data as the buffer can hold */
+		ll = min(todo, ctx->remain);
+		vbuf = vb2_plane_vaddr(&ctx->buf->vb, 0);
+		memcpy(vbuf + ctx->offset, psrc, ll);
+		todo -= ll;
+		psrc += ll;
+
+		ctx->remain -= ll;
+		ctx->offset += ll;
+
+		if (ctx->remain == 0) {	/* buffer full: complete it */
+			vb2_buffer_done(&ctx->buf->vb, VB2_BUF_STATE_DONE);
+			ctx->buf = NULL;
+		}
+	}
+
+	if (ctx->nonblocking && ctx->buf) {	/* flush a partly filled buffer */
+		vb2_set_plane_payload(&ctx->buf->vb, 0, ll);
+		vb2_buffer_done(&ctx->buf->vb, VB2_BUF_STATE_DONE);
+		ctx->buf = NULL;
+	}
+
+	if (todo)
+		dprintk(1, "[%s] %d bytes are dropped.\n", ctx->name, todo);
+	else
+		dprintk(3, "[%s]\n", ctx->name);
+
+	dprintk(3, "[%s] %d bytes are copied\n", ctx->name, len - todo);
+	return (len - todo);
+}
+
+int dvb_vb2_reqbufs(struct dvb_vb2_ctx *ctx, struct dmx_requestbuffers *req)
+{
+	int ret;
+
+	ctx->buf_siz = req->size;
+	ctx->buf_cnt = req->count;
+	ret = vb2_core_reqbufs(&ctx->vb_q, VB2_MEMORY_MMAP, &req->count);
+	if (ret) {
+		ctx->state = DVB_VB2_STATE_NONE;
+		dprintk(1, "[%s] count=%d size=%d errno=%d\n", ctx->name,
+			ctx->buf_cnt, ctx->buf_siz, ret);
+		return ret;
+	}
+	ctx->state |= DVB_VB2_STATE_REQBUFS;
+	dprintk(3, "[%s] count=%d size=%d\n", ctx->name,
+		ctx->buf_cnt, ctx->buf_siz);
+
+	return 0;
+}
+
+int dvb_vb2_querybuf(struct dvb_vb2_ctx *ctx, struct dmx_buffer *b)
+{
+	/* b->index is user-supplied; vb2_core_querybuf() does not verify it */
+	if (b->index >= ctx->vb_q.num_buffers)
+		return -EINVAL;
+	vb2_core_querybuf(&ctx->vb_q, b->index, b);
+	dprintk(3, "[%s] index=%d\n", ctx->name, b->index);
+	return 0;
+}
+
+int dvb_vb2_expbuf(struct dvb_vb2_ctx *ctx, struct dmx_exportbuffer *exp)
+{
+	int ret = vb2_core_expbuf(&ctx->vb_q, &exp->fd, ctx->vb_q.type,
+				  exp->index, 0, exp->flags);
+
+	if (ret) {
+		dprintk(1, "[%s] index=%d errno=%d\n", ctx->name,
+			exp->index, ret);
+		return ret;
+	}
+	dprintk(3, "[%s] index=%d fd=%d\n", ctx->name, exp->index, exp->fd);
+	return 0;
+}
+
+int dvb_vb2_qbuf(struct dvb_vb2_ctx *ctx, struct dmx_buffer *b)
+{
+	int ret = -EINVAL;	/* b->index is user-supplied; reject if too big */
+
+	if (b->index < ctx->vb_q.num_buffers)
+		ret = vb2_core_qbuf(&ctx->vb_q, b->index, b);
+	if (ret) {
+		dprintk(1, "[%s] index=%d errno=%d\n", ctx->name,
+			b->index, ret);
+		return ret;
+	}
+	dprintk(5, "[%s] index=%d\n", ctx->name, b->index);
+	return 0;
+}
+
+int dvb_vb2_dqbuf(struct dvb_vb2_ctx *ctx, struct dmx_buffer *b)
+{
+	int ret;
+
+	ret = vb2_core_dqbuf(&ctx->vb_q, &b->index, b, ctx->nonblocking);
+	if (ret) {
+		dprintk(1, "[%s] errno=%d\n", ctx->name, ret);
+		return ret;
+	}
+	dprintk(5, "[%s] index=%d\n", ctx->name, b->index);
+
+	return 0;
+}
+
+int dvb_vb2_mmap(struct dvb_vb2_ctx *ctx, struct vm_area_struct *vma)
+{
+	int ret;
+
+	ret = vb2_mmap(&ctx->vb_q, vma);
+	if (ret) {
+		dprintk(1, "[%s] errno=%d\n", ctx->name, ret);
+		return ret;
+	}
+	dprintk(3, "[%s] ret=%d\n", ctx->name, ret);
+
+	return 0;
+}
+
+unsigned int dvb_vb2_poll(struct dvb_vb2_ctx *ctx, struct file *file,
+			  poll_table *wait)
+{
+	dprintk(3, "[%s]\n", ctx->name);
+	return vb2_core_poll(&ctx->vb_q, file, wait);
+}
+
diff --git a/drivers/media/dvb-core/dvb_vb2.h b/drivers/media/dvb-core/dvb_vb2.h
new file mode 100644
index 000000000000..d68653926d91
--- /dev/null
+++ b/drivers/media/dvb-core/dvb_vb2.h
@@ -0,0 +1,72 @@
+/*
+ * dvb-vb2.h - DVB driver helper framework for streaming I/O
+ *
+ * Copyright (C) 2015 Samsung Electronics
+ *
+ * Author: jh1009.sung@samsung.com
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.
+ */
+
+#ifndef _DVB_VB2_H
+#define _DVB_VB2_H
+
+#include <linux/mutex.h>
+#include <linux/poll.h>
+#include <linux/dvb/dmx.h>
+#include <media/videobuf2-core.h>
+#include <media/videobuf2-dma-contig.h>
+#include <media/videobuf2-vmalloc.h>
+
+enum dvb_buf_type {
+	DVB_BUF_TYPE_CAPTURE        = 1,
+	DVB_BUF_TYPE_OUTPUT         = 2,
+};
+
+#define DVB_VB2_STATE_NONE (0x0)
+#define DVB_VB2_STATE_INIT (0x1)
+#define DVB_VB2_STATE_REQBUFS (0x2)
+#define DVB_VB2_STATE_STREAMON (0x4)
+
+#define DVB_VB2_NAME_MAX (20)
+
+struct dvb_buffer {
+	struct vb2_buffer	vb;
+	struct list_head	list;
+};
+
+struct dvb_vb2_ctx {
+	struct vb2_queue	vb_q;
+	struct mutex		mutex;
+	spinlock_t		slock;
+	struct list_head	dvb_q;
+	struct dvb_buffer	*buf;
+	int	offset;
+	int	remain;
+	int	state;
+	int	buf_siz;
+	int	buf_cnt;
+	int	nonblocking;
+	char	name[DVB_VB2_NAME_MAX + 1];
+};
+
+int dvb_vb2_init(struct dvb_vb2_ctx *ctx, const char *name, int non_blocking);
+int dvb_vb2_release(struct dvb_vb2_ctx *ctx);
+int dvb_vb2_stream_on(struct dvb_vb2_ctx *ctx);
+int dvb_vb2_stream_off(struct dvb_vb2_ctx *ctx);
+int dvb_vb2_is_streaming(struct dvb_vb2_ctx *ctx);
+int dvb_vb2_fill_buffer(struct dvb_vb2_ctx *ctx,
+			const unsigned char *src, int len);
+
+int dvb_vb2_reqbufs(struct dvb_vb2_ctx *ctx, struct dmx_requestbuffers *req);
+int dvb_vb2_querybuf(struct dvb_vb2_ctx *ctx, struct dmx_buffer *b);
+int dvb_vb2_expbuf(struct dvb_vb2_ctx *ctx, struct dmx_exportbuffer *exp);
+int dvb_vb2_qbuf(struct dvb_vb2_ctx *ctx, struct dmx_buffer *b);
+int dvb_vb2_dqbuf(struct dvb_vb2_ctx *ctx, struct dmx_buffer *b);
+int dvb_vb2_mmap(struct dvb_vb2_ctx *ctx, struct vm_area_struct *vma);
+unsigned int dvb_vb2_poll(struct dvb_vb2_ctx *ctx, struct file *file,
+			  poll_table *wait);
+
+#endif /* _DVB_VB2_H */
diff --git a/include/uapi/linux/dvb/dmx.h b/include/uapi/linux/dvb/dmx.h
index c10f1324b4ca..e212aa18ad78 100644
--- a/include/uapi/linux/dvb/dmx.h
+++ b/include/uapi/linux/dvb/dmx.h
@@ -211,6 +211,64 @@ struct dmx_stc {
 	__u64 stc;
 };
 
+/**
+ * struct dmx_buffer - dmx buffer info
+ *
+ * @index:	id number of the buffer
+ * @bytesused:	number of bytes occupied by data in the buffer (payload);
+ * @offset:	for buffers with memory == DMX_MEMORY_MMAP;
+ *		offset from the start of the device memory for this plane,
+ *		(or a "cookie" that should be passed to mmap() as offset)
+ * @length:	size in bytes of the buffer
+ *
+ * Contains data exchanged by application and driver using one of the streaming
+ * I/O methods.
+ */
+struct dmx_buffer {
+	__u32			index;
+	__u32			bytesused;
+	__u32			offset;
+	__u32			length;
+	__u32			reserved[4];
+};
+
+/**
+ * struct dmx_requestbuffers - request dmx buffer information
+ *
+ * @count:	number of requested buffers,
+ * @size:	size in bytes of the requested buffer
+ *
+ * Contains data used for requesting a dmx buffer.
+ * All reserved fields must be set to zero.
+ */
+struct dmx_requestbuffers {
+	__u32			count;
+	__u32			size;
+	__u32			reserved[2];
+};
+
+/**
+ * struct dmx_exportbuffer - export of dmx buffer as DMABUF file descriptor
+ *
+ * @index:	id number of the buffer
+ * @flags:	flags for newly created file, currently only O_CLOEXEC is
+ *		supported, refer to manual of open syscall for more details
+ * @fd:		file descriptor associated with DMABUF (set by driver)
+ *
+ * Contains data used for exporting a dmx buffer as DMABUF file descriptor.
+ * The buffer is identified by a 'cookie' returned by DMX_QUERYBUF
+ * (identical to the cookie used to mmap() the buffer to userspace). All
+ * reserved fields must be set to zero. The field reserved0 is expected to
+ * become a structure 'type' allowing an alternative layout of the structure
+ * content. Therefore this field should not be used for any other extensions.
+ */
+struct dmx_exportbuffer {
+	__u32		index;
+	__u32		flags;
+	__s32		fd;
+	__u32		reserved[5];
+};
+
 #define DMX_START                _IO('o', 41)
 #define DMX_STOP                 _IO('o', 42)
 #define DMX_SET_FILTER           _IOW('o', 43, struct dmx_sct_filter_params)
@@ -231,4 +289,10 @@ typedef struct dmx_filter dmx_filter_t;
 
 #endif
 
-#endif /* _UAPI_DVBDMX_H_ */
+#define DMX_REQBUFS              _IOWR('o', 60, struct dmx_requestbuffers)
+#define DMX_QUERYBUF             _IOWR('o', 61, struct dmx_buffer)
+#define DMX_EXPBUF               _IOWR('o', 62, struct dmx_exportbuffer)
+#define DMX_QBUF                 _IOWR('o', 63, struct dmx_buffer)
+#define DMX_DQBUF                _IOWR('o', 64, struct dmx_buffer)
+
+#endif /* _UAPI_DVBDMX_H_ */
-- 
cgit v1.2.3


From a114a585be4f3173fe454921a0918fb7e71633b0 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab@s-opensource.com>
Date: Thu, 21 Dec 2017 10:57:13 -0500
Subject: media: dvb-core: get rid of mmap reserved field

The "reserved" field was a way, used in the V4L2 API, to add new
data to existing structs without breaking userspace. However,
there are now cleverer ways of doing that, without needing to add
unneeded overhead. So, get rid of these fields.

Signed-off-by: Mauro Carvalho Chehab <mchehab@s-opensource.com>
---
 Documentation/media/uapi/dvb/dmx-expbuf.rst   | 2 --
 Documentation/media/uapi/dvb/dmx-qbuf.rst     | 2 --
 Documentation/media/uapi/dvb/dmx-querybuf.rst | 2 --
 Documentation/media/uapi/dvb/dmx-reqbufs.rst  | 2 --
 drivers/media/dvb-core/dvb_vb2.c              | 1 -
 include/uapi/linux/dvb/dmx.h                  | 3 ---
 6 files changed, 12 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/media/uapi/dvb/dmx-expbuf.rst b/Documentation/media/uapi/dvb/dmx-expbuf.rst
index 51df34c6fb59..2d96cfe891df 100644
--- a/Documentation/media/uapi/dvb/dmx-expbuf.rst
+++ b/Documentation/media/uapi/dvb/dmx-expbuf.rst
@@ -36,8 +36,6 @@ This ioctl is an extension to the memory mapping I/O method.
 It can be used to export a buffer as a DMABUF file at any time after
 buffers have been allocated with the :ref:`DMX_REQBUFS` ioctl.
 
-The ``reserved`` array must be zeroed before calling it.
-
 To export a buffer, applications fill struct :c:type:`dmx_exportbuffer`.
 Applications must set the ``index`` field. Valid index numbers
 range from zero to the number of buffers allocated with :ref:`DMX_REQBUFS`
diff --git a/Documentation/media/uapi/dvb/dmx-qbuf.rst b/Documentation/media/uapi/dvb/dmx-qbuf.rst
index b20b8153d48d..b48c4931658e 100644
--- a/Documentation/media/uapi/dvb/dmx-qbuf.rst
+++ b/Documentation/media/uapi/dvb/dmx-qbuf.rst
@@ -45,8 +45,6 @@ numbers range from zero to the number of buffers allocated with
 one. The contents of the struct :c:type:`dmx_buffer` returned
 by a :ref:`DMX_QUERYBUF` ioctl will do as well.
 
-The and ``reserved`` field must be set to 0.
-
 When ``DMX_QBUF`` is called with a pointer to this structure, it locks the
 memory pages of the buffer in physical memory, so they cannot be swapped
 out to disk. Buffers remain locked until dequeued, until the
diff --git a/Documentation/media/uapi/dvb/dmx-querybuf.rst b/Documentation/media/uapi/dvb/dmx-querybuf.rst
index 46a50f191b10..89481e24bb86 100644
--- a/Documentation/media/uapi/dvb/dmx-querybuf.rst
+++ b/Documentation/media/uapi/dvb/dmx-querybuf.rst
@@ -36,8 +36,6 @@ This ioctl is part of the mmap streaming I/O method. It can
 be used to query the status of a buffer at any time after buffers have
 been allocated with the :ref:`DMX_REQBUFS` ioctl.
 
-The ``reserved`` array must be zeroed before calling it.
-
 Applications set the ``index`` field. Valid index numbers range from zero
 to the number of buffers allocated with :ref:`DMX_REQBUFS`
 (struct :c:type:`dvb_requestbuffers` ``count``) minus one.
diff --git a/Documentation/media/uapi/dvb/dmx-reqbufs.rst b/Documentation/media/uapi/dvb/dmx-reqbufs.rst
index 0749623d9d83..14b80d60bf35 100644
--- a/Documentation/media/uapi/dvb/dmx-reqbufs.rst
+++ b/Documentation/media/uapi/dvb/dmx-reqbufs.rst
@@ -42,8 +42,6 @@ allocated by applications through a device driver, and this ioctl only
 configures the driver into DMABUF I/O mode without performing any direct
 allocation.
 
-The ``reserved`` array must be zeroed before calling it.
-
 To allocate device buffers applications initialize all fields of the
 struct :c:type:`dmx_requestbuffers` structure. They set the  ``count`` field
 to the desired number of buffers,  and ``size`` to the size of each
diff --git a/drivers/media/dvb-core/dvb_vb2.c b/drivers/media/dvb-core/dvb_vb2.c
index fa1dcde74e81..68c59a497925 100644
--- a/drivers/media/dvb-core/dvb_vb2.c
+++ b/drivers/media/dvb-core/dvb_vb2.c
@@ -143,7 +143,6 @@ static void _fill_dmx_buffer(struct vb2_buffer *vb, void *pb)
 	b->length = vb->planes[0].length;
 	b->bytesused = vb->planes[0].bytesused;
 	b->offset = vb->planes[0].m.offset;
-	memset(b->reserved, 0, sizeof(b->reserved));
 	dprintk(3, "[%s]\n", ctx->name);
 }
 
diff --git a/include/uapi/linux/dvb/dmx.h b/include/uapi/linux/dvb/dmx.h
index e212aa18ad78..5f3c5a918f00 100644
--- a/include/uapi/linux/dvb/dmx.h
+++ b/include/uapi/linux/dvb/dmx.h
@@ -229,7 +229,6 @@ struct dmx_buffer {
 	__u32			bytesused;
 	__u32			offset;
 	__u32			length;
-	__u32			reserved[4];
 };
 
 /**
@@ -244,7 +243,6 @@ struct dmx_buffer {
 struct dmx_requestbuffers {
 	__u32			count;
 	__u32			size;
-	__u32			reserved[2];
 };
 
 /**
@@ -266,7 +264,6 @@ struct dmx_exportbuffer {
 	__u32		index;
 	__u32		flags;
 	__s32		fd;
-	__u32		reserved[5];
 };
 
 #define DMX_START                _IO('o', 41)
-- 
cgit v1.2.3


From e8391b7654a9d21323fe0226770741c0e42c53f4 Mon Sep 17 00:00:00 2001
From: Yong Zhi <yong.zhi@intel.com>
Date: Wed, 8 Nov 2017 19:30:36 -0500
Subject: media: videodev2.h, v4l2-ioctl: add IPU3 raw10 color format

Add IPU3 specific formats:

	V4L2_PIX_FMT_IPU3_SBGGR10
	V4L2_PIX_FMT_IPU3_SGBRG10
	V4L2_PIX_FMT_IPU3_SGRBG10
	V4L2_PIX_FMT_IPU3_SRGGB10

Signed-off-by: Yong Zhi <yong.zhi@intel.com>
Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@s-opensource.com>
---
 drivers/media/v4l2-core/v4l2-ioctl.c | 4 ++++
 include/uapi/linux/videodev2.h       | 6 ++++++
 2 files changed, 10 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/drivers/media/v4l2-core/v4l2-ioctl.c b/drivers/media/v4l2-core/v4l2-ioctl.c
index 79614992ee21..3937945b12dc 100644
--- a/drivers/media/v4l2-core/v4l2-ioctl.c
+++ b/drivers/media/v4l2-core/v4l2-ioctl.c
@@ -1202,6 +1202,10 @@ static void v4l_fill_fmtdesc(struct v4l2_fmtdesc *fmt)
 	case V4L2_PIX_FMT_SGBRG10P:	descr = "10-bit Bayer GBGB/RGRG Packed"; break;
 	case V4L2_PIX_FMT_SGRBG10P:	descr = "10-bit Bayer GRGR/BGBG Packed"; break;
 	case V4L2_PIX_FMT_SRGGB10P:	descr = "10-bit Bayer RGRG/GBGB Packed"; break;
+	case V4L2_PIX_FMT_IPU3_SBGGR10: descr = "10-bit bayer BGGR IPU3 Packed"; break;
+	case V4L2_PIX_FMT_IPU3_SGBRG10: descr = "10-bit bayer GBRG IPU3 Packed"; break;
+	case V4L2_PIX_FMT_IPU3_SGRBG10: descr = "10-bit bayer GRBG IPU3 Packed"; break;
+	case V4L2_PIX_FMT_IPU3_SRGGB10: descr = "10-bit bayer RGGB IPU3 Packed"; break;
 	case V4L2_PIX_FMT_SBGGR10ALAW8:	descr = "8-bit Bayer BGBG/GRGR (A-law)"; break;
 	case V4L2_PIX_FMT_SGBRG10ALAW8:	descr = "8-bit Bayer GBGB/RGRG (A-law)"; break;
 	case V4L2_PIX_FMT_SGRBG10ALAW8:	descr = "8-bit Bayer GRGR/BGBG (A-law)"; break;
diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h
index 1c095b5a99c5..b26160e0b483 100644
--- a/include/uapi/linux/videodev2.h
+++ b/include/uapi/linux/videodev2.h
@@ -669,6 +669,12 @@ struct v4l2_pix_format {
 #define V4L2_PIX_FMT_MT21C    v4l2_fourcc('M', 'T', '2', '1') /* Mediatek compressed block mode  */
 #define V4L2_PIX_FMT_INZI     v4l2_fourcc('I', 'N', 'Z', 'I') /* Intel Planar Greyscale 10-bit and Depth 16-bit */
 
+/* 10bit raw bayer packed, 32 bytes for every 25 pixels, last LSB 6 bits unused */
+#define V4L2_PIX_FMT_IPU3_SBGGR10	v4l2_fourcc('i', 'p', '3', 'b') /* IPU3 packed 10-bit BGGR bayer */
+#define V4L2_PIX_FMT_IPU3_SGBRG10	v4l2_fourcc('i', 'p', '3', 'g') /* IPU3 packed 10-bit GBRG bayer */
+#define V4L2_PIX_FMT_IPU3_SGRBG10	v4l2_fourcc('i', 'p', '3', 'G') /* IPU3 packed 10-bit GRBG bayer */
+#define V4L2_PIX_FMT_IPU3_SRGGB10	v4l2_fourcc('i', 'p', '3', 'r') /* IPU3 packed 10-bit RGGB bayer */
+
 /* SDR formats - used only for Software Defined Radio devices */
 #define V4L2_SDR_FMT_CU8          v4l2_fourcc('C', 'U', '0', '8') /* IQ u8 */
 #define V4L2_SDR_FMT_CU16LE       v4l2_fourcc('C', 'U', '1', '6') /* IQ u16le */
-- 
cgit v1.2.3


From 675fc275a3a2d905535207237402c6d8dcb5fa4b Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 27 Dec 2017 18:39:09 -0800
Subject: bpf: offload: report device information for offloaded programs

Report to the user ifindex and namespace information of offloaded
programs.  If device has disappeared return -ENODEV.  Specify the
namespace using dev/inode combination.

CC: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf.h            |  2 ++
 include/uapi/linux/bpf.h       |  3 +++
 kernel/bpf/offload.c           | 59 ++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c           |  6 +++++
 tools/include/uapi/linux/bpf.h |  3 +++
 5 files changed, 73 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 9a916ab34299..7810ae57b357 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -531,6 +531,8 @@ static inline struct bpf_prog *bpf_prog_get_type(u32 ufd,
 
 int bpf_prog_offload_compile(struct bpf_prog *prog);
 void bpf_prog_offload_destroy(struct bpf_prog *prog);
+int bpf_prog_offload_info_fill(struct bpf_prog_info *info,
+			       struct bpf_prog *prog);
 
 #if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL)
 int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 69eabfcb9bdb..f2f8b36e2ad4 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -921,6 +921,9 @@ struct bpf_prog_info {
 	__u32 nr_map_ids;
 	__aligned_u64 map_ids;
 	char name[BPF_OBJ_NAME_LEN];
+	__u32 ifindex;
+	__u64 netns_dev;
+	__u64 netns_ino;
 } __attribute__((aligned(8)));
 
 struct bpf_map_info {
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index e4f1668a021c..040d4e0edf3f 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -16,9 +16,11 @@
 #include <linux/bpf.h>
 #include <linux/bpf_verifier.h>
 #include <linux/bug.h>
+#include <linux/kdev_t.h>
 #include <linux/list.h>
 #include <linux/netdevice.h>
 #include <linux/printk.h>
+#include <linux/proc_ns.h>
 #include <linux/rtnetlink.h>
 #include <linux/rwsem.h>
 
@@ -176,6 +178,63 @@ int bpf_prog_offload_compile(struct bpf_prog *prog)
 	return bpf_prog_offload_translate(prog);
 }
 
+struct ns_get_path_bpf_prog_args {
+	struct bpf_prog *prog;
+	struct bpf_prog_info *info;
+};
+
+static struct ns_common *bpf_prog_offload_info_fill_ns(void *private_data)
+{
+	struct ns_get_path_bpf_prog_args *args = private_data;
+	struct bpf_prog_aux *aux = args->prog->aux;
+	struct ns_common *ns;
+	struct net *net;
+
+	rtnl_lock();
+	down_read(&bpf_devs_lock);
+
+	if (aux->offload) {
+		args->info->ifindex = aux->offload->netdev->ifindex;
+		net = dev_net(aux->offload->netdev);
+		get_net(net);
+		ns = &net->ns;
+	} else {
+		args->info->ifindex = 0;
+		ns = NULL;
+	}
+
+	up_read(&bpf_devs_lock);
+	rtnl_unlock();
+
+	return ns;
+}
+
+int bpf_prog_offload_info_fill(struct bpf_prog_info *info,
+			       struct bpf_prog *prog)
+{
+	struct ns_get_path_bpf_prog_args args = {
+		.prog	= prog,
+		.info	= info,
+	};
+	struct inode *ns_inode;
+	struct path ns_path;
+	void *res;
+
+	res = ns_get_path_cb(&ns_path, bpf_prog_offload_info_fill_ns, &args);
+	if (IS_ERR(res)) {
+		if (!info->ifindex)
+			return -ENODEV;
+		return PTR_ERR(res);
+	}
+
+	ns_inode = ns_path.dentry->d_inode;
+	info->netns_dev = new_encode_dev(ns_inode->i_sb->s_dev);
+	info->netns_ino = ns_inode->i_ino;
+	path_put(&ns_path);
+
+	return 0;
+}
+
 const struct bpf_prog_ops bpf_offload_prog_ops = {
 };
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index e02dafa6f402..ebf0fb23e237 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1707,6 +1707,12 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 			return -EFAULT;
 	}
 
+	if (bpf_prog_is_dev_bound(prog->aux)) {
+		err = bpf_prog_offload_info_fill(&info, prog);
+		if (err)
+			return err;
+	}
+
 done:
 	if (copy_to_user(uinfo, &info, info_len) ||
 	    put_user(info_len, &uattr->info.info_len))
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index db1b0923a308..4e8c60acfa32 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -921,6 +921,9 @@ struct bpf_prog_info {
 	__u32 nr_map_ids;
 	__aligned_u64 map_ids;
 	char name[BPF_OBJ_NAME_LEN];
+	__u32 ifindex;
+	__u64 netns_dev;
+	__u64 netns_ino;
 } __attribute__((aligned(8)));
 
 struct bpf_map_info {
-- 
cgit v1.2.3


From bbb6189df4077cde8592cd2f804bb1122067dd32 Mon Sep 17 00:00:00 2001
From: Kristian Evensen <kristian.evensen@gmail.com>
Date: Wed, 27 Dec 2017 18:27:58 +0100
Subject: inet_diag: Add equal-operator for ports

inet_diag currently provides less/greater than or equal operators for
comparing ports when filtering sockets. An equal comparison can be
performed by combining the two existing operators, or a user can for
example request a port range and then do the final filtering in
userspace. However, these approaches both have drawbacks. Implementing
equal using LE/GE causes the size and complexity of a filter to grow
quickly as the number of ports increases, while on busy machines it would
be great if the kernel only returned information about relevant sockets.

This patch introduces source and destination port equal operators.
INET_DIAG_BC_S_EQ is used to match a source port, INET_DIAG_BC_D_EQ a
destination port, and usage is the same as for the existing port
operators.  I.e., the port to match is stored in the no-member of the
next inet_diag_bc_op-struct in the filter.

Signed-off-by: Kristian Evensen <kristian.evensen@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/inet_diag.h | 2 ++
 net/ipv4/inet_diag.c           | 8 ++++++++
 2 files changed, 10 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h
index 817d807e9481..14565d703291 100644
--- a/include/uapi/linux/inet_diag.h
+++ b/include/uapi/linux/inet_diag.h
@@ -92,6 +92,8 @@ enum {
 	INET_DIAG_BC_D_COND,
 	INET_DIAG_BC_DEV_COND,   /* u32 ifindex */
 	INET_DIAG_BC_MARK_COND,
+	INET_DIAG_BC_S_EQ,
+	INET_DIAG_BC_D_EQ,
 };
 
 struct inet_diag_hostcond {
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index c9c35b61a027..a383f299ce24 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -564,12 +564,18 @@ static int inet_diag_bc_run(const struct nlattr *_bc,
 		case INET_DIAG_BC_JMP:
 			yes = 0;
 			break;
+		case INET_DIAG_BC_S_EQ:
+			yes = entry->sport == op[1].no;
+			break;
 		case INET_DIAG_BC_S_GE:
 			yes = entry->sport >= op[1].no;
 			break;
 		case INET_DIAG_BC_S_LE:
 			yes = entry->sport <= op[1].no;
 			break;
+		case INET_DIAG_BC_D_EQ:
+			yes = entry->dport == op[1].no;
+			break;
 		case INET_DIAG_BC_D_GE:
 			yes = entry->dport >= op[1].no;
 			break;
@@ -802,8 +808,10 @@ static int inet_diag_bc_audit(const struct nlattr *attr,
 			if (!valid_devcond(bc, len, &min_len))
 				return -EINVAL;
 			break;
+		case INET_DIAG_BC_S_EQ:
 		case INET_DIAG_BC_S_GE:
 		case INET_DIAG_BC_S_LE:
+		case INET_DIAG_BC_D_EQ:
 		case INET_DIAG_BC_D_GE:
 		case INET_DIAG_BC_D_LE:
 			if (!valid_port_comparison(bc, len, &min_len))
-- 
cgit v1.2.3


From c0bace798436bca0fdc221ff61143f1376a9c3de Mon Sep 17 00:00:00 2001
From: Felix Janda <felix.janda@posteo.de>
Date: Mon, 1 Jan 2018 19:33:20 +0100
Subject: uapi libc compat: add fallback for unsupported libcs

libc-compat.h aims to prevent symbol collisions between uapi and libc
headers for each supported libc. This requires continuous coordination
between them.

The goal of this commit is to improve the situation for libcs (such as
musl) which are not yet supported and/or do not wish to be explicitly
supported, while not affecting supported libcs. More precisely, with
this commit, unsupported libcs can request the suppression of any
specific uapi definition by defining the corresponding __UAPI_DEF_*
macro as 0. This can fix symbol collisions for them, as long as the
libc headers are included before the uapi headers. Inclusion in the
other order is outside the scope of this commit.

All infrastructure in order to enable this fallback for unsupported
libcs is already in place, except that libc-compat.h unconditionally
defines all __UAPI_DEF_* macros to 1 for all unsupported libcs so that
any previous definitions are ignored. In order to fix this, this commit
merely makes these definitions conditional.

This commit together with the musl libc commit

http://git.musl-libc.org/cgit/musl/commit/?id=04983f2272382af92eb8f8838964ff944fbb8258

fixes for example the following compiler errors when <linux/in6.h> is
included after musl's <netinet/in.h>:

./linux/in6.h:32:8: error: redefinition of 'struct in6_addr'
./linux/in6.h:49:8: error: redefinition of 'struct sockaddr_in6'
./linux/in6.h:59:8: error: redefinition of 'struct ipv6_mreq'

The comments referencing glibc are still correct, but this file is not
only used for glibc any more.

Signed-off-by: Felix Janda <felix.janda@posteo.de>
Reviewed-by: Hauke Mehrtens <hauke@hauke-m.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/libc-compat.h | 55 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 54 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/libc-compat.h b/include/uapi/linux/libc-compat.h
index 282875cf8056..8254c937c9f4 100644
--- a/include/uapi/linux/libc-compat.h
+++ b/include/uapi/linux/libc-compat.h
@@ -168,46 +168,99 @@
 
 /* If we did not see any headers from any supported C libraries,
  * or we are being included in the kernel, then define everything
- * that we need. */
+ * that we need. Check for previous __UAPI_* definitions to give
+ * unsupported C libraries a way to opt out of any kernel definition. */
 #else /* !defined(__GLIBC__) */
 
 /* Definitions for if.h */
+#ifndef __UAPI_DEF_IF_IFCONF
 #define __UAPI_DEF_IF_IFCONF 1
+#endif
+#ifndef __UAPI_DEF_IF_IFMAP
 #define __UAPI_DEF_IF_IFMAP 1
+#endif
+#ifndef __UAPI_DEF_IF_IFNAMSIZ
 #define __UAPI_DEF_IF_IFNAMSIZ 1
+#endif
+#ifndef __UAPI_DEF_IF_IFREQ
 #define __UAPI_DEF_IF_IFREQ 1
+#endif
 /* Everything up to IFF_DYNAMIC, matches net/if.h until glibc 2.23 */
+#ifndef __UAPI_DEF_IF_NET_DEVICE_FLAGS
 #define __UAPI_DEF_IF_NET_DEVICE_FLAGS 1
+#endif
 /* For the future if glibc adds IFF_LOWER_UP, IFF_DORMANT and IFF_ECHO */
+#ifndef __UAPI_DEF_IF_NET_DEVICE_FLAGS_LOWER_UP_DORMANT_ECHO
 #define __UAPI_DEF_IF_NET_DEVICE_FLAGS_LOWER_UP_DORMANT_ECHO 1
+#endif
 
 /* Definitions for in.h */
+#ifndef __UAPI_DEF_IN_ADDR
 #define __UAPI_DEF_IN_ADDR		1
+#endif
+#ifndef __UAPI_DEF_IN_IPPROTO
 #define __UAPI_DEF_IN_IPPROTO		1
+#endif
+#ifndef __UAPI_DEF_IN_PKTINFO
 #define __UAPI_DEF_IN_PKTINFO		1
+#endif
+#ifndef __UAPI_DEF_IP_MREQ
 #define __UAPI_DEF_IP_MREQ		1
+#endif
+#ifndef __UAPI_DEF_SOCKADDR_IN
 #define __UAPI_DEF_SOCKADDR_IN		1
+#endif
+#ifndef __UAPI_DEF_IN_CLASS
 #define __UAPI_DEF_IN_CLASS		1
+#endif
 
 /* Definitions for in6.h */
+#ifndef __UAPI_DEF_IN6_ADDR
 #define __UAPI_DEF_IN6_ADDR		1
+#endif
+#ifndef __UAPI_DEF_IN6_ADDR_ALT
 #define __UAPI_DEF_IN6_ADDR_ALT		1
+#endif
+#ifndef __UAPI_DEF_SOCKADDR_IN6
 #define __UAPI_DEF_SOCKADDR_IN6		1
+#endif
+#ifndef __UAPI_DEF_IPV6_MREQ
 #define __UAPI_DEF_IPV6_MREQ		1
+#endif
+#ifndef __UAPI_DEF_IPPROTO_V6
 #define __UAPI_DEF_IPPROTO_V6		1
+#endif
+#ifndef __UAPI_DEF_IPV6_OPTIONS
 #define __UAPI_DEF_IPV6_OPTIONS		1
+#endif
+#ifndef __UAPI_DEF_IN6_PKTINFO
 #define __UAPI_DEF_IN6_PKTINFO		1
+#endif
+#ifndef __UAPI_DEF_IP6_MTUINFO
 #define __UAPI_DEF_IP6_MTUINFO		1
+#endif
 
 /* Definitions for ipx.h */
+#ifndef __UAPI_DEF_SOCKADDR_IPX
 #define __UAPI_DEF_SOCKADDR_IPX			1
+#endif
+#ifndef __UAPI_DEF_IPX_ROUTE_DEFINITION
 #define __UAPI_DEF_IPX_ROUTE_DEFINITION		1
+#endif
+#ifndef __UAPI_DEF_IPX_INTERFACE_DEFINITION
 #define __UAPI_DEF_IPX_INTERFACE_DEFINITION	1
+#endif
+#ifndef __UAPI_DEF_IPX_CONFIG_DATA
 #define __UAPI_DEF_IPX_CONFIG_DATA		1
+#endif
+#ifndef __UAPI_DEF_IPX_ROUTE_DEF
 #define __UAPI_DEF_IPX_ROUTE_DEF		1
+#endif
 
 /* Definitions for xattr.h */
+#ifndef __UAPI_DEF_XATTR
 #define __UAPI_DEF_XATTR		1
+#endif
 
 #endif /* __GLIBC__ */
 
-- 
cgit v1.2.3


From 563a01e1012dddbe120d1e1e1c466ea0639a098b Mon Sep 17 00:00:00 2001
From: Guennadi Liakhovetski <guennadi.liakhovetski@intel.com>
Date: Wed, 8 Nov 2017 11:00:12 -0500
Subject: media: v4l: Add a UVC Metadata format

Add a pixel format, used by the UVC driver to stream metadata.

Signed-off-by: Guennadi Liakhovetski <guennadi.liakhovetski@intel.com>
Signed-off-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@s-opensource.com>
---
 Documentation/media/uapi/v4l/meta-formats.rst    |  1 +
 Documentation/media/uapi/v4l/pixfmt-meta-uvc.rst | 51 ++++++++++++++++++++++++
 drivers/media/v4l2-core/v4l2-ioctl.c             |  1 +
 include/uapi/linux/videodev2.h                   |  1 +
 4 files changed, 54 insertions(+)
 create mode 100644 Documentation/media/uapi/v4l/pixfmt-meta-uvc.rst

(limited to 'include/uapi/linux')

diff --git a/Documentation/media/uapi/v4l/meta-formats.rst b/Documentation/media/uapi/v4l/meta-formats.rst
index 01e24e3df571..0c4e1ecf5879 100644
--- a/Documentation/media/uapi/v4l/meta-formats.rst
+++ b/Documentation/media/uapi/v4l/meta-formats.rst
@@ -12,5 +12,6 @@ These formats are used for the :ref:`metadata` interface only.
 .. toctree::
     :maxdepth: 1
 
+    pixfmt-meta-uvc
     pixfmt-meta-vsp1-hgo
     pixfmt-meta-vsp1-hgt
diff --git a/Documentation/media/uapi/v4l/pixfmt-meta-uvc.rst b/Documentation/media/uapi/v4l/pixfmt-meta-uvc.rst
new file mode 100644
index 000000000000..b5165dc090c2
--- /dev/null
+++ b/Documentation/media/uapi/v4l/pixfmt-meta-uvc.rst
@@ -0,0 +1,51 @@
+.. -*- coding: utf-8; mode: rst -*-
+
+.. _v4l2-meta-fmt-uvc:
+
+*******************************
+V4L2_META_FMT_UVC ('UVCH')
+*******************************
+
+UVC Payload Header Data
+
+
+Description
+===========
+
+This format describes standard UVC metadata, extracted from UVC packet headers
+and provided by the UVC driver through metadata video nodes. That data includes
+exact copies of the standard part of UVC Payload Header contents and auxiliary
+timing information, required for precise interpretation of timestamps, contained
+in those headers. See section "2.4.3.3 Video and Still Image Payload Headers" of
+the "UVC 1.5 Class specification" for details.
+
+Each UVC payload header can be between 2 and 12 bytes large. Buffers can
+contain multiple headers, if multiple such headers have been transmitted by the
+camera for the respective frame. However, the driver may drop headers when the
+buffer is full, when they contain no useful information (e.g. those without the
+SCR field or with that field identical to the previous header), or generally to
+perform rate limiting when the device sends a large number of headers.
+
+Each individual block contains the following fields:
+
+.. flat-table:: UVC Metadata Block
+    :widths: 1 4
+    :header-rows:  1
+    :stub-columns: 0
+
+    * - Field
+      - Description
+    * - __u64 ts;
+      - system timestamp in host byte order, measured by the driver upon
+        reception of the payload
+    * - __u16 sof;
+      - USB Frame Number in host byte order, also obtained by the driver as
+        close as possible to the above timestamp to enable correlation between
+        them
+    * - :cspan:`1` *The rest is an exact copy of the UVC payload header:*
+    * - __u8 length;
+      - length of the rest of the block, including this field
+    * - __u8 flags;
+      - Flags, indicating presence of other standard UVC fields
+    * - __u8 buf[];
+      - The rest of the header, possibly including UVC PTS and SCR fields
diff --git a/drivers/media/v4l2-core/v4l2-ioctl.c b/drivers/media/v4l2-core/v4l2-ioctl.c
index 3937945b12dc..1d7c2ea78c3e 100644
--- a/drivers/media/v4l2-core/v4l2-ioctl.c
+++ b/drivers/media/v4l2-core/v4l2-ioctl.c
@@ -1250,6 +1250,7 @@ static void v4l_fill_fmtdesc(struct v4l2_fmtdesc *fmt)
 	case V4L2_TCH_FMT_TU08:		descr = "8-bit unsigned touch data"; break;
 	case V4L2_META_FMT_VSP1_HGO:	descr = "R-Car VSP1 1-D Histogram"; break;
 	case V4L2_META_FMT_VSP1_HGT:	descr = "R-Car VSP1 2-D Histogram"; break;
+	case V4L2_META_FMT_UVC:		descr = "UVC payload header metadata"; break;
 
 	default:
 		/* Compressed formats */
diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h
index b26160e0b483..faa97fda588a 100644
--- a/include/uapi/linux/videodev2.h
+++ b/include/uapi/linux/videodev2.h
@@ -694,6 +694,7 @@ struct v4l2_pix_format {
 /* Meta-data formats */
 #define V4L2_META_FMT_VSP1_HGO    v4l2_fourcc('V', 'S', 'P', 'H') /* R-Car VSP1 1-D Histogram */
 #define V4L2_META_FMT_VSP1_HGT    v4l2_fourcc('V', 'S', 'P', 'T') /* R-Car VSP1 2-D Histogram */
+#define V4L2_META_FMT_UVC         v4l2_fourcc('U', 'V', 'C', 'H') /* UVC Payload Header metadata */
 
 /* priv field value to indicates that subsequent fields are valid. */
 #define V4L2_PIX_FMT_PRIV_MAGIC		0xfeedcafe
-- 
cgit v1.2.3


From 088ead25524583e2200aa99111bea2f66a86545a Mon Sep 17 00:00:00 2001
From: Guennadi Liakhovetski <guennadi.liakhovetski@intel.com>
Date: Wed, 6 Dec 2017 10:15:40 -0500
Subject: media: uvcvideo: Add a metadata device node

Some UVC video cameras contain metadata in their payload headers. This
patch extracts that data, adding more clock synchronisation information,
on both bulk and isochronous endpoints and makes it available to the user
space on a separate video node, using the V4L2_CAP_META_CAPTURE capability
and the V4L2_BUF_TYPE_META_CAPTURE buffer queue type. By default, only the
V4L2_META_FMT_UVC pixel format is available from those nodes. However,
cameras can be added to the device ID table to additionally specify their
own metadata format, in which case that format will also become available
from the metadata node.

[Use put_unaligned instead of __put_unaligned_cpu64]
[Use put_unaligned for the sof field as well]

Signed-off-by: Guennadi Liakhovetski <guennadi.liakhovetski@intel.com>
Signed-off-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@s-opensource.com>
---
 drivers/media/usb/uvc/Makefile       |   2 +-
 drivers/media/usb/uvc/uvc_driver.c   |  15 ++-
 drivers/media/usb/uvc/uvc_isight.c   |   2 +-
 drivers/media/usb/uvc/uvc_metadata.c | 179 +++++++++++++++++++++++++++++++++++
 drivers/media/usb/uvc/uvc_queue.c    |  44 +++++++--
 drivers/media/usb/uvc/uvc_video.c    | 134 ++++++++++++++++++++++++--
 drivers/media/usb/uvc/uvcvideo.h     |  16 +++-
 include/uapi/linux/uvcvideo.h        |  26 +++++
 8 files changed, 396 insertions(+), 22 deletions(-)
 create mode 100644 drivers/media/usb/uvc/uvc_metadata.c

(limited to 'include/uapi/linux')

diff --git a/drivers/media/usb/uvc/Makefile b/drivers/media/usb/uvc/Makefile
index a4fe5b5d533f..4f9eee4f81ab 100644
--- a/drivers/media/usb/uvc/Makefile
+++ b/drivers/media/usb/uvc/Makefile
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
 uvcvideo-objs  := uvc_driver.o uvc_queue.o uvc_v4l2.o uvc_video.o uvc_ctrl.o \
-		  uvc_status.o uvc_isight.o uvc_debugfs.o
+		  uvc_status.o uvc_isight.o uvc_debugfs.o uvc_metadata.o
 ifeq ($(CONFIG_MEDIA_CONTROLLER),y)
 uvcvideo-objs  += uvc_entity.o
 endif
diff --git a/drivers/media/usb/uvc/uvc_driver.c b/drivers/media/usb/uvc/uvc_driver.c
index 1509bbc77a2f..2886c100abf2 100644
--- a/drivers/media/usb/uvc/uvc_driver.c
+++ b/drivers/media/usb/uvc/uvc_driver.c
@@ -1883,6 +1883,7 @@ static void uvc_unregister_video(struct uvc_device *dev)
 			continue;
 
 		video_unregister_device(&stream->vdev);
+		video_unregister_device(&stream->meta.vdev);
 
 		uvc_debugfs_cleanup_stream(stream);
 	}
@@ -1930,6 +1931,9 @@ int uvc_register_video_device(struct uvc_device *dev,
 	case V4L2_BUF_TYPE_VIDEO_OUTPUT:
 		vdev->device_caps = V4L2_CAP_VIDEO_OUTPUT | V4L2_CAP_STREAMING;
 		break;
+	case V4L2_BUF_TYPE_META_CAPTURE:
+		vdev->device_caps = V4L2_CAP_META_CAPTURE | V4L2_CAP_STREAMING;
+		break;
 	}
 
 	strlcpy(vdev->name, dev->name, sizeof vdev->name);
@@ -1965,7 +1969,8 @@ static int uvc_register_video(struct uvc_device *dev,
 	}
 
 	if (stream->type == V4L2_BUF_TYPE_VIDEO_CAPTURE)
-		stream->chain->caps |= V4L2_CAP_VIDEO_CAPTURE;
+		stream->chain->caps |= V4L2_CAP_VIDEO_CAPTURE
+			| V4L2_CAP_META_CAPTURE;
 	else
 		stream->chain->caps |= V4L2_CAP_VIDEO_OUTPUT;
 
@@ -2003,6 +2008,11 @@ static int uvc_register_terms(struct uvc_device *dev,
 		if (ret < 0)
 			return ret;
 
+		/* Register a metadata node, but ignore a possible failure,
+		 * complete registration of video nodes anyway.
+		 */
+		uvc_meta_register(stream);
+
 		term->vdev = &stream->vdev;
 	}
 
@@ -2037,6 +2047,7 @@ static int uvc_register_chains(struct uvc_device *dev)
 
 struct uvc_device_info {
 	u32	quirks;
+	u32	meta_format;
 };
 
 static int uvc_probe(struct usb_interface *intf,
@@ -2074,6 +2085,8 @@ static int uvc_probe(struct usb_interface *intf,
 	dev->intfnum = intf->cur_altsetting->desc.bInterfaceNumber;
 	dev->quirks = (uvc_quirks_param == -1)
 		    ? quirks : uvc_quirks_param;
+	if (info)
+		dev->meta_format = info->meta_format;
 
 	if (udev->product != NULL)
 		strlcpy(dev->name, udev->product, sizeof dev->name);
diff --git a/drivers/media/usb/uvc/uvc_isight.c b/drivers/media/usb/uvc/uvc_isight.c
index 8510e7259e76..fb940cfae575 100644
--- a/drivers/media/usb/uvc/uvc_isight.c
+++ b/drivers/media/usb/uvc/uvc_isight.c
@@ -100,7 +100,7 @@ static int isight_decode(struct uvc_video_queue *queue, struct uvc_buffer *buf,
 }
 
 void uvc_video_decode_isight(struct urb *urb, struct uvc_streaming *stream,
-		struct uvc_buffer *buf)
+			struct uvc_buffer *buf, struct uvc_buffer *meta_buf)
 {
 	int ret, i;
 
diff --git a/drivers/media/usb/uvc/uvc_metadata.c b/drivers/media/usb/uvc/uvc_metadata.c
new file mode 100644
index 000000000000..cd1aec19cc5b
--- /dev/null
+++ b/drivers/media/usb/uvc/uvc_metadata.c
@@ -0,0 +1,179 @@
+/*
+ *      uvc_metadata.c  --  USB Video Class driver - Metadata handling
+ *
+ *      Copyright (C) 2016
+ *          Guennadi Liakhovetski (guennadi.liakhovetski@intel.com)
+ *
+ *      This program is free software; you can redistribute it and/or modify
+ *      it under the terms of the GNU General Public License as published by
+ *      the Free Software Foundation; either version 2 of the License, or
+ *      (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/usb.h>
+#include <linux/videodev2.h>
+
+#include <media/v4l2-ioctl.h>
+#include <media/videobuf2-v4l2.h>
+#include <media/videobuf2-vmalloc.h>
+
+#include "uvcvideo.h"
+
+/* -----------------------------------------------------------------------------
+ * V4L2 ioctls
+ */
+
+static int uvc_meta_v4l2_querycap(struct file *file, void *fh,
+				  struct v4l2_capability *cap)
+{
+	struct v4l2_fh *vfh = file->private_data;
+	struct uvc_streaming *stream = video_get_drvdata(vfh->vdev);
+	struct uvc_video_chain *chain = stream->chain;
+
+	strlcpy(cap->driver, "uvcvideo", sizeof(cap->driver));
+	strlcpy(cap->card, vfh->vdev->name, sizeof(cap->card));
+	usb_make_path(stream->dev->udev, cap->bus_info, sizeof(cap->bus_info));
+	cap->capabilities = V4L2_CAP_DEVICE_CAPS | V4L2_CAP_STREAMING
+			  | chain->caps;
+
+	return 0;
+}
+
+static int uvc_meta_v4l2_get_format(struct file *file, void *fh,
+				    struct v4l2_format *format)
+{
+	struct v4l2_fh *vfh = file->private_data;
+	struct uvc_streaming *stream = video_get_drvdata(vfh->vdev);
+	struct v4l2_meta_format *fmt = &format->fmt.meta;
+
+	if (format->type != vfh->vdev->queue->type)
+		return -EINVAL;
+
+	memset(fmt, 0, sizeof(*fmt));
+
+	fmt->dataformat = stream->meta.format;
+	fmt->buffersize = UVC_METATADA_BUF_SIZE;
+
+	return 0;
+}
+
+static int uvc_meta_v4l2_try_format(struct file *file, void *fh,
+				    struct v4l2_format *format)
+{
+	struct v4l2_fh *vfh = file->private_data;
+	struct uvc_streaming *stream = video_get_drvdata(vfh->vdev);
+	struct uvc_device *dev = stream->dev;
+	struct v4l2_meta_format *fmt = &format->fmt.meta;
+	u32 fmeta = fmt->dataformat;
+
+	if (format->type != vfh->vdev->queue->type)
+		return -EINVAL;
+
+	memset(fmt, 0, sizeof(*fmt));
+
+	fmt->dataformat = fmeta == dev->meta_format ? fmeta : V4L2_META_FMT_UVC;
+	fmt->buffersize = UVC_METATADA_BUF_SIZE;
+
+	return 0;
+}
+
+static int uvc_meta_v4l2_set_format(struct file *file, void *fh,
+				    struct v4l2_format *format)
+{
+	struct v4l2_fh *vfh = file->private_data;
+	struct uvc_streaming *stream = video_get_drvdata(vfh->vdev);
+	struct v4l2_meta_format *fmt = &format->fmt.meta;
+	int ret;
+
+	ret = uvc_meta_v4l2_try_format(file, fh, format);
+	if (ret < 0)
+		return ret;
+
+	/*
+	 * We could in principle switch at any time, also during streaming.
+	 * Metadata buffers would still be perfectly parseable, but it's more
+	 * consistent and cleaner to disallow that.
+	 */
+	mutex_lock(&stream->mutex);
+
+	if (uvc_queue_allocated(&stream->queue))
+		ret = -EBUSY;
+	else
+		stream->meta.format = fmt->dataformat;
+
+	mutex_unlock(&stream->mutex);
+
+	return ret;
+}
+
+static int uvc_meta_v4l2_enum_formats(struct file *file, void *fh,
+				      struct v4l2_fmtdesc *fdesc)
+{
+	struct v4l2_fh *vfh = file->private_data;
+	struct uvc_streaming *stream = video_get_drvdata(vfh->vdev);
+	struct uvc_device *dev = stream->dev;
+	u32 index = fdesc->index;
+
+	if (fdesc->type != vfh->vdev->queue->type ||
+	    index > 1U || (index && !dev->meta_format))
+		return -EINVAL;
+
+	memset(fdesc, 0, sizeof(*fdesc));
+
+	fdesc->type = vfh->vdev->queue->type;
+	fdesc->index = index;
+	fdesc->pixelformat = index ? dev->meta_format : V4L2_META_FMT_UVC;
+
+	return 0;
+}
+
+static const struct v4l2_ioctl_ops uvc_meta_ioctl_ops = {
+	.vidioc_querycap		= uvc_meta_v4l2_querycap,
+	.vidioc_g_fmt_meta_cap		= uvc_meta_v4l2_get_format,
+	.vidioc_s_fmt_meta_cap		= uvc_meta_v4l2_set_format,
+	.vidioc_try_fmt_meta_cap	= uvc_meta_v4l2_try_format,
+	.vidioc_enum_fmt_meta_cap	= uvc_meta_v4l2_enum_formats,
+	.vidioc_reqbufs			= vb2_ioctl_reqbufs,
+	.vidioc_querybuf		= vb2_ioctl_querybuf,
+	.vidioc_qbuf			= vb2_ioctl_qbuf,
+	.vidioc_dqbuf			= vb2_ioctl_dqbuf,
+	.vidioc_create_bufs		= vb2_ioctl_create_bufs,
+	.vidioc_prepare_buf		= vb2_ioctl_prepare_buf,
+	.vidioc_streamon		= vb2_ioctl_streamon,
+	.vidioc_streamoff		= vb2_ioctl_streamoff,
+};
+
+/* -----------------------------------------------------------------------------
+ * V4L2 File Operations
+ */
+
+static const struct v4l2_file_operations uvc_meta_fops = {
+	.owner = THIS_MODULE,
+	.unlocked_ioctl = video_ioctl2,
+	.open = v4l2_fh_open,
+	.release = vb2_fop_release,
+	.poll = vb2_fop_poll,
+	.mmap = vb2_fop_mmap,
+};
+
+int uvc_meta_register(struct uvc_streaming *stream)
+{
+	struct uvc_device *dev = stream->dev;
+	struct video_device *vdev = &stream->meta.vdev;
+	struct uvc_video_queue *queue = &stream->meta.queue;
+
+	stream->meta.format = V4L2_META_FMT_UVC;
+
+	/*
+	 * The video interface queue uses manual locking and thus does not set
+	 * the queue pointer. Set it manually here.
+	 */
+	vdev->queue = &queue->queue;
+
+	return uvc_register_video_device(dev, stream, vdev, queue,
+					 V4L2_BUF_TYPE_META_CAPTURE,
+					 &uvc_meta_fops, &uvc_meta_ioctl_ops);
+}
diff --git a/drivers/media/usb/uvc/uvc_queue.c b/drivers/media/usb/uvc/uvc_queue.c
index c8d78b2f3de4..cd2ea5a23086 100644
--- a/drivers/media/usb/uvc/uvc_queue.c
+++ b/drivers/media/usb/uvc/uvc_queue.c
@@ -79,8 +79,19 @@ static int uvc_queue_setup(struct vb2_queue *vq,
 			   unsigned int sizes[], struct device *alloc_devs[])
 {
 	struct uvc_video_queue *queue = vb2_get_drv_priv(vq);
-	struct uvc_streaming *stream = uvc_queue_to_stream(queue);
-	unsigned size = stream->ctrl.dwMaxVideoFrameSize;
+	struct uvc_streaming *stream;
+	unsigned int size;
+
+	switch (vq->type) {
+	case V4L2_BUF_TYPE_META_CAPTURE:
+		size = UVC_METATADA_BUF_SIZE;
+		break;
+
+	default:
+		stream = uvc_queue_to_stream(queue);
+		size = stream->ctrl.dwMaxVideoFrameSize;
+		break;
+	}
 
 	/*
 	 * When called with plane sizes, validate them. The driver supports
@@ -114,7 +125,7 @@ static int uvc_buffer_prepare(struct vb2_buffer *vb)
 	buf->error = 0;
 	buf->mem = vb2_plane_vaddr(vb, 0);
 	buf->length = vb2_plane_size(vb, 0);
-	if (vb->type == V4L2_BUF_TYPE_VIDEO_CAPTURE)
+	if (vb->type != V4L2_BUF_TYPE_VIDEO_OUTPUT)
 		buf->bytesused = 0;
 	else
 		buf->bytesused = vb2_get_plane_payload(vb, 0);
@@ -177,10 +188,10 @@ static int uvc_start_streaming(struct vb2_queue *vq, unsigned int count)
 static void uvc_stop_streaming(struct vb2_queue *vq)
 {
 	struct uvc_video_queue *queue = vb2_get_drv_priv(vq);
-	struct uvc_streaming *stream = uvc_queue_to_stream(queue);
 	unsigned long flags;
 
-	uvc_video_enable(stream, 0);
+	if (vq->type != V4L2_BUF_TYPE_META_CAPTURE)
+		uvc_video_enable(uvc_queue_to_stream(queue), 0);
 
 	spin_lock_irqsave(&queue->irqlock, flags);
 	uvc_queue_return_buffers(queue, UVC_BUF_STATE_ERROR);
@@ -198,20 +209,39 @@ static const struct vb2_ops uvc_queue_qops = {
 	.stop_streaming = uvc_stop_streaming,
 };
 
+static const struct vb2_ops uvc_meta_queue_qops = {
+	.queue_setup = uvc_queue_setup,
+	.buf_prepare = uvc_buffer_prepare,
+	.buf_queue = uvc_buffer_queue,
+	.wait_prepare = vb2_ops_wait_prepare,
+	.wait_finish = vb2_ops_wait_finish,
+	.stop_streaming = uvc_stop_streaming,
+};
+
 int uvc_queue_init(struct uvc_video_queue *queue, enum v4l2_buf_type type,
 		    int drop_corrupted)
 {
 	int ret;
 
 	queue->queue.type = type;
-	queue->queue.io_modes = VB2_MMAP | VB2_USERPTR | VB2_DMABUF;
+	queue->queue.io_modes = VB2_MMAP | VB2_USERPTR;
 	queue->queue.drv_priv = queue;
 	queue->queue.buf_struct_size = sizeof(struct uvc_buffer);
-	queue->queue.ops = &uvc_queue_qops;
 	queue->queue.mem_ops = &vb2_vmalloc_memops;
 	queue->queue.timestamp_flags = V4L2_BUF_FLAG_TIMESTAMP_MONOTONIC
 		| V4L2_BUF_FLAG_TSTAMP_SRC_SOE;
 	queue->queue.lock = &queue->mutex;
+
+	switch (type) {
+	case V4L2_BUF_TYPE_META_CAPTURE:
+		queue->queue.ops = &uvc_meta_queue_qops;
+		break;
+	default:
+		queue->queue.io_modes |= VB2_DMABUF;
+		queue->queue.ops = &uvc_queue_qops;
+		break;
+	}
+
 	ret = vb2_queue_init(&queue->queue);
 	if (ret)
 		return ret;
diff --git a/drivers/media/usb/uvc/uvc_video.c b/drivers/media/usb/uvc/uvc_video.c
index 13f459ea239c..5441553f74e1 100644
--- a/drivers/media/usb/uvc/uvc_video.c
+++ b/drivers/media/usb/uvc/uvc_video.c
@@ -1119,6 +1119,84 @@ static int uvc_video_encode_data(struct uvc_streaming *stream,
 	return nbytes;
 }
 
+/* ------------------------------------------------------------------------
+ * Metadata
+ */
+
+/*
+ * In addition to the payload headers we also want to provide the user with USB
+ * Frame Numbers and system time values. The resulting buffer is thus composed
+ * of blocks, containing a 64-bit timestamp in nanoseconds, a 16-bit USB Frame
+ * Number, and a copy of the payload header.
+ *
+ * Ideally we want to capture all payload headers for each frame. However, their
+ * number is unknown and unbounded. We thus drop headers that contain no vendor
+ * data and that either contain no SCR value or an SCR value identical to the
+ * previous header.
+ */
+static void uvc_video_decode_meta(struct uvc_streaming *stream,
+				  struct uvc_buffer *meta_buf,
+				  const u8 *mem, unsigned int length)
+{
+	struct uvc_meta_buf *meta;
+	size_t len_std = 2;
+	bool has_pts, has_scr;
+	unsigned long flags;
+	unsigned int sof;
+	ktime_t time;
+	const u8 *scr;
+
+	if (!meta_buf || length == 2)
+		return;
+
+	if (meta_buf->length - meta_buf->bytesused <
+	    length + sizeof(meta->ns) + sizeof(meta->sof)) {
+		meta_buf->error = 1;
+		return;
+	}
+
+	has_pts = mem[1] & UVC_STREAM_PTS;
+	has_scr = mem[1] & UVC_STREAM_SCR;
+
+	if (has_pts) {
+		len_std += 4;
+		scr = mem + 6;
+	} else {
+		scr = mem + 2;
+	}
+
+	if (has_scr)
+		len_std += 6;
+
+	if (stream->meta.format == V4L2_META_FMT_UVC)
+		length = len_std;
+
+	if (length == len_std && (!has_scr ||
+				  !memcmp(scr, stream->clock.last_scr, 6)))
+		return;
+
+	meta = (struct uvc_meta_buf *)((u8 *)meta_buf->mem + meta_buf->bytesused);
+	local_irq_save(flags);
+	time = uvc_video_get_time();
+	sof = usb_get_current_frame_number(stream->dev->udev);
+	local_irq_restore(flags);
+	put_unaligned(ktime_to_ns(time), &meta->ns);
+	put_unaligned(sof, &meta->sof);
+
+	if (has_scr)
+		memcpy(stream->clock.last_scr, scr, 6);
+
+	memcpy(&meta->length, mem, length);
+	meta_buf->bytesused += length + sizeof(meta->ns) + sizeof(meta->sof);
+
+	uvc_trace(UVC_TRACE_FRAME,
+		  "%s(): t-sys %lluns, SOF %u, len %u, flags 0x%x, PTS %u, STC %u frame SOF %u\n",
+		  __func__, time, meta->sof, meta->length, meta->flags,
+		  has_pts ? *(u32 *)meta->buf : 0,
+		  has_scr ? *(u32 *)scr : 0,
+		  has_scr ? *(u32 *)(scr + 4) & 0x7ff : 0);
+}
+
 /* ------------------------------------------------------------------------
  * URB handling
  */
@@ -1137,8 +1215,29 @@ static void uvc_video_validate_buffer(const struct uvc_streaming *stream,
 /*
  * Completion handler for video URBs.
  */
+
+static void uvc_video_next_buffers(struct uvc_streaming *stream,
+		struct uvc_buffer **video_buf, struct uvc_buffer **meta_buf)
+{
+	if (*meta_buf) {
+		struct vb2_v4l2_buffer *vb2_meta = &(*meta_buf)->buf;
+		const struct vb2_v4l2_buffer *vb2_video = &(*video_buf)->buf;
+
+		vb2_meta->sequence = vb2_video->sequence;
+		vb2_meta->field = vb2_video->field;
+		vb2_meta->vb2_buf.timestamp = vb2_video->vb2_buf.timestamp;
+
+		(*meta_buf)->state = UVC_BUF_STATE_READY;
+		if (!(*meta_buf)->error)
+			(*meta_buf)->error = (*video_buf)->error;
+		*meta_buf = uvc_queue_next_buffer(&stream->meta.queue,
+						  *meta_buf);
+	}
+	*video_buf = uvc_queue_next_buffer(&stream->queue, *video_buf);
+}
+
 static void uvc_video_decode_isoc(struct urb *urb, struct uvc_streaming *stream,
-	struct uvc_buffer *buf)
+			struct uvc_buffer *buf, struct uvc_buffer *meta_buf)
 {
 	u8 *mem;
 	int ret, i;
@@ -1160,14 +1259,15 @@ static void uvc_video_decode_isoc(struct urb *urb, struct uvc_streaming *stream,
 				urb->iso_frame_desc[i].actual_length);
 			if (ret == -EAGAIN) {
 				uvc_video_validate_buffer(stream, buf);
-				buf = uvc_queue_next_buffer(&stream->queue,
-							    buf);
+				uvc_video_next_buffers(stream, &buf, &meta_buf);
 			}
 		} while (ret == -EAGAIN);
 
 		if (ret < 0)
 			continue;
 
+		uvc_video_decode_meta(stream, meta_buf, mem, ret);
+
 		/* Decode the payload data. */
 		uvc_video_decode_data(stream, buf, mem + ret,
 			urb->iso_frame_desc[i].actual_length - ret);
@@ -1178,13 +1278,13 @@ static void uvc_video_decode_isoc(struct urb *urb, struct uvc_streaming *stream,
 
 		if (buf->state == UVC_BUF_STATE_READY) {
 			uvc_video_validate_buffer(stream, buf);
-			buf = uvc_queue_next_buffer(&stream->queue, buf);
+			uvc_video_next_buffers(stream, &buf, &meta_buf);
 		}
 	}
 }
 
 static void uvc_video_decode_bulk(struct urb *urb, struct uvc_streaming *stream,
-	struct uvc_buffer *buf)
+			struct uvc_buffer *buf, struct uvc_buffer *meta_buf)
 {
 	u8 *mem;
 	int len, ret;
@@ -1207,8 +1307,7 @@ static void uvc_video_decode_bulk(struct urb *urb, struct uvc_streaming *stream,
 		do {
 			ret = uvc_video_decode_start(stream, buf, mem, len);
 			if (ret == -EAGAIN)
-				buf = uvc_queue_next_buffer(&stream->queue,
-							    buf);
+				uvc_video_next_buffers(stream, &buf, &meta_buf);
 		} while (ret == -EAGAIN);
 
 		/* If an error occurred skip the rest of the payload. */
@@ -1218,6 +1317,8 @@ static void uvc_video_decode_bulk(struct urb *urb, struct uvc_streaming *stream,
 			memcpy(stream->bulk.header, mem, ret);
 			stream->bulk.header_size = ret;
 
+			uvc_video_decode_meta(stream, meta_buf, mem, ret);
+
 			mem += ret;
 			len -= ret;
 		}
@@ -1241,7 +1342,7 @@ static void uvc_video_decode_bulk(struct urb *urb, struct uvc_streaming *stream,
 			uvc_video_decode_end(stream, buf, stream->bulk.header,
 				stream->bulk.payload_size);
 			if (buf->state == UVC_BUF_STATE_READY)
-				uvc_queue_next_buffer(&stream->queue, buf);
+				uvc_video_next_buffers(stream, &buf, &meta_buf);
 		}
 
 		stream->bulk.header_size = 0;
@@ -1251,7 +1352,7 @@ static void uvc_video_decode_bulk(struct urb *urb, struct uvc_streaming *stream,
 }
 
 static void uvc_video_encode_bulk(struct urb *urb, struct uvc_streaming *stream,
-	struct uvc_buffer *buf)
+	struct uvc_buffer *buf, struct uvc_buffer *meta_buf)
 {
 	u8 *mem = urb->transfer_buffer;
 	int len = stream->urb_size, ret;
@@ -1297,7 +1398,10 @@ static void uvc_video_complete(struct urb *urb)
 {
 	struct uvc_streaming *stream = urb->context;
 	struct uvc_video_queue *queue = &stream->queue;
+	struct uvc_video_queue *qmeta = &stream->meta.queue;
+	struct vb2_queue *vb2_qmeta = stream->meta.vdev.queue;
 	struct uvc_buffer *buf = NULL;
+	struct uvc_buffer *buf_meta = NULL;
 	unsigned long flags;
 	int ret;
 
@@ -1316,6 +1420,8 @@ static void uvc_video_complete(struct urb *urb)
 	case -ECONNRESET:	/* usb_unlink_urb() called. */
 	case -ESHUTDOWN:	/* The endpoint is being disabled. */
 		uvc_queue_cancel(queue, urb->status == -ESHUTDOWN);
+		if (vb2_qmeta)
+			uvc_queue_cancel(qmeta, urb->status == -ESHUTDOWN);
 		return;
 	}
 
@@ -1325,7 +1431,15 @@ static void uvc_video_complete(struct urb *urb)
 				       queue);
 	spin_unlock_irqrestore(&queue->irqlock, flags);
 
-	stream->decode(urb, stream, buf);
+	if (vb2_qmeta) {
+		spin_lock_irqsave(&qmeta->irqlock, flags);
+		if (!list_empty(&qmeta->irqqueue))
+			buf_meta = list_first_entry(&qmeta->irqqueue,
+						    struct uvc_buffer, queue);
+		spin_unlock_irqrestore(&qmeta->irqlock, flags);
+	}
+
+	stream->decode(urb, stream, buf, buf_meta);
 
 	if ((ret = usb_submit_urb(urb, GFP_ATOMIC)) < 0) {
 		uvc_printk(KERN_ERR, "Failed to resubmit video URB (%d).\n",
diff --git a/drivers/media/usb/uvc/uvcvideo.h b/drivers/media/usb/uvc/uvcvideo.h
index c36fa0991141..961d7c13b27b 100644
--- a/drivers/media/usb/uvc/uvcvideo.h
+++ b/drivers/media/usb/uvc/uvcvideo.h
@@ -479,6 +479,8 @@ struct uvc_stats_stream {
 	unsigned int max_sof;		/* Maximum STC.SOF value */
 };
 
+#define UVC_METATADA_BUF_SIZE 1024
+
 struct uvc_streaming {
 	struct list_head list;
 	struct uvc_device *dev;
@@ -510,7 +512,13 @@ struct uvc_streaming {
 	unsigned int frozen : 1;
 	struct uvc_video_queue queue;
 	void (*decode) (struct urb *urb, struct uvc_streaming *video,
-			struct uvc_buffer *buf);
+			struct uvc_buffer *buf, struct uvc_buffer *meta_buf);
+
+	struct {
+		struct video_device vdev;
+		struct uvc_video_queue queue;
+		__u32 format;
+	} meta;
 
 	/* Context data used by the bulk completion handler. */
 	struct {
@@ -552,6 +560,8 @@ struct uvc_streaming {
 		u16 last_sof;
 		u16 sof_offset;
 
+		u8 last_scr[6];
+
 		spinlock_t lock;
 	} clock;
 };
@@ -561,6 +571,7 @@ struct uvc_device {
 	struct usb_interface *intf;
 	unsigned long warnings;
 	__u32 quirks;
+	__u32 meta_format;
 	int intfnum;
 	char name[32];
 
@@ -715,6 +726,7 @@ extern int uvc_query_ctrl(struct uvc_device *dev, __u8 query, __u8 unit,
 void uvc_video_clock_update(struct uvc_streaming *stream,
 			    struct vb2_v4l2_buffer *vbuf,
 			    struct uvc_buffer *buf);
+int uvc_meta_register(struct uvc_streaming *stream);
 
 int uvc_register_video_device(struct uvc_device *dev,
 			      struct uvc_streaming *stream,
@@ -777,7 +789,7 @@ extern struct usb_host_endpoint *uvc_find_endpoint(
 
 /* Quirks support */
 void uvc_video_decode_isight(struct urb *urb, struct uvc_streaming *stream,
-		struct uvc_buffer *buf);
+		struct uvc_buffer *buf, struct uvc_buffer *meta_buf);
 
 /* debugfs and statistics */
 void uvc_debugfs_init(void);
diff --git a/include/uapi/linux/uvcvideo.h b/include/uapi/linux/uvcvideo.h
index e80b4655d8cd..020714d2c5bd 100644
--- a/include/uapi/linux/uvcvideo.h
+++ b/include/uapi/linux/uvcvideo.h
@@ -68,4 +68,30 @@ struct uvc_xu_control_query {
 #define UVCIOC_CTRL_MAP		_IOWR('u', 0x20, struct uvc_xu_control_mapping)
 #define UVCIOC_CTRL_QUERY	_IOWR('u', 0x21, struct uvc_xu_control_query)
 
+/*
+ * Metadata node
+ */
+
+/**
+ * struct uvc_meta_buf - metadata buffer building block
+ * @ns: system timestamp of the payload in nanoseconds
+ * @sof: USB Frame Number
+ * @length: length of the payload header
+ * @flags: payload header flags
+ * @buf: optional device-specific header data
+ *
+ * UVC metadata nodes fill buffers with possibly multiple instances of this
+ * struct. The first two fields are added by the driver, they can be used for
+ * clock synchronisation. The rest is an exact copy of a UVC payload header.
+ * Only complete objects with complete buffers are included. Therefore it's
+ * always sizeof(meta->ns) + sizeof(meta->sof) + meta->length bytes large.
+ */
+struct uvc_meta_buf {
+	__u64 ns;
+	__u16 sof;
+	__u8 length;
+	__u8 flags;
+	__u8 buf[];
+} __packed;
+
 #endif
-- 
cgit v1.2.3


From 6e6a8b5a38cb04d5ef35d4eb57836126b954e7c8 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab@s-opensource.com>
Date: Thu, 4 Jan 2018 13:08:56 -0500
Subject: media: replace all <spaces><tab> occurrences

There are a lot of places where sequences of space/tabs are
found. Get rid of all spaces before tabs.

Signed-off-by: Mauro Carvalho Chehab <mchehab@s-opensource.com>
---
 drivers/media/common/saa7146/saa7146_video.c       |   8 +-
 drivers/media/dvb-core/Makefile                    |   2 +-
 drivers/media/dvb-frontends/au8522_priv.h          | 218 ++++++++---------
 drivers/media/dvb-frontends/drx39xyj/drx_driver.h  |   2 +-
 drivers/media/dvb-frontends/stb0899_drv.c          |  10 +-
 drivers/media/dvb-frontends/stb0899_drv.h          |   2 +-
 drivers/media/dvb-frontends/stb0899_priv.h         |   2 +-
 drivers/media/dvb-frontends/stv0900_core.c         |   2 +-
 drivers/media/dvb-frontends/stv0900_init.h         |  34 +--
 drivers/media/dvb-frontends/stv0900_priv.h         |   2 +-
 drivers/media/dvb-frontends/stv090x.c              |  12 +-
 drivers/media/dvb-frontends/stv090x_priv.h         |   2 +-
 drivers/media/dvb-frontends/stv6110x.c             |   2 +-
 drivers/media/dvb-frontends/stv6110x_priv.h        |   6 +-
 drivers/media/dvb-frontends/tda10023.c             |   2 +-
 drivers/media/firewire/firedtv-avc.c               |   4 +-
 drivers/media/firewire/firedtv-fe.c                |   6 +-
 drivers/media/i2c/cx25840/cx25840-core.c           |   2 +-
 drivers/media/i2c/cx25840/cx25840-core.h           |   2 +-
 drivers/media/i2c/cx25840/cx25840-ir.c             |   2 +-
 drivers/media/i2c/ks0127.c                         |   2 +-
 drivers/media/i2c/ov7670.c                         |  38 +--
 drivers/media/i2c/saa6752hs.c                      |   8 +-
 drivers/media/i2c/saa7115.c                        |   2 +-
 drivers/media/i2c/saa7127.c                        | 162 ++++++-------
 drivers/media/i2c/saa717x.c                        |  12 +-
 drivers/media/i2c/ths7303.c                        |   2 +-
 drivers/media/i2c/tvaudio.c                        |   2 +-
 drivers/media/i2c/tvp7002_reg.h                    |   6 +-
 drivers/media/i2c/vpx3220.c                        |   2 +-
 drivers/media/pci/bt8xx/bttv-cards.c               | 266 ++++++++++-----------
 drivers/media/pci/bt8xx/bttv-input.c               |   8 +-
 drivers/media/pci/bt8xx/bttv.h                     |   4 +-
 drivers/media/pci/bt8xx/bttvp.h                    |   6 +-
 drivers/media/pci/cx18/cx18-alsa-pcm.c             |   2 +-
 drivers/media/pci/cx18/cx18-av-audio.c             |   2 +-
 drivers/media/pci/cx18/cx18-av-core.c              |  18 +-
 drivers/media/pci/cx18/cx18-av-core.h              |   2 +-
 drivers/media/pci/cx18/cx18-cards.c                |   8 +-
 drivers/media/pci/cx18/cx18-cards.h                |  32 +--
 drivers/media/pci/cx18/cx18-driver.h               |  46 ++--
 drivers/media/pci/cx18/cx18-firmware.c             |  96 ++++----
 drivers/media/pci/cx18/cx18-mailbox.c              |   8 +-
 drivers/media/pci/cx18/cx18-streams.c              |   2 +-
 drivers/media/pci/cx18/cx18-vbi.c                  |   2 +-
 drivers/media/pci/cx18/cx23418.h                   |  88 +++----
 drivers/media/pci/cx23885/cimax2.c                 |   2 +-
 drivers/media/pci/cx23885/cx23885-video.c          |   2 +-
 drivers/media/pci/cx23885/cx23885.h                |   4 +-
 drivers/media/pci/cx23885/cx23888-ir.c             |   2 +-
 drivers/media/pci/ivtv/ivtv-cards.h                | 126 +++++-----
 drivers/media/pci/ivtv/ivtv-driver.h               | 102 ++++----
 drivers/media/pci/ivtv/ivtv-firmware.c             |  36 +--
 drivers/media/pci/ivtv/ivtv-i2c.c                  |  26 +-
 drivers/media/pci/ivtv/ivtv-ioctl.c                |  74 +++---
 drivers/media/pci/ivtv/ivtv-mailbox.c              | 182 +++++++-------
 drivers/media/pci/mantis/mantis_reg.h              |   6 +-
 drivers/media/pci/mantis/mantis_vp1041.c           | 210 ++++++++--------
 drivers/media/pci/meye/meye.c                      |   2 +-
 drivers/media/pci/saa7134/saa7134-cards.c          |  64 ++---
 drivers/media/pci/saa7134/saa7134-dvb.c            |   4 +-
 drivers/media/pci/saa7134/saa7134-video.c          |   4 +-
 drivers/media/pci/saa7134/saa7134.h                |   8 +-
 drivers/media/pci/saa7146/hexium_gemini.c          |  22 +-
 drivers/media/pci/saa7146/hexium_orion.c           |  18 +-
 drivers/media/pci/saa7146/mxb.c                    |  24 +-
 drivers/media/pci/ttpci/av7110.h                   |   2 +-
 drivers/media/pci/ttpci/budget-av.c                |   6 +-
 drivers/media/pci/ttpci/budget-ci.c                | 210 ++++++++--------
 drivers/media/pci/zoran/zoran_driver.c             |  38 +--
 drivers/media/pci/zoran/zr36057.h                  |   4 +-
 drivers/media/platform/Makefile                    |  14 +-
 drivers/media/platform/arv.c                       |  50 ++--
 drivers/media/platform/coda/coda_regs.h            |   2 +-
 drivers/media/platform/davinci/dm355_ccdc_regs.h   |   6 +-
 drivers/media/platform/davinci/dm644x_ccdc_regs.h  |   4 +-
 drivers/media/platform/davinci/isif_regs.h         |   6 +-
 drivers/media/platform/davinci/vpfe_capture.c      |   2 +-
 drivers/media/platform/davinci/vpif.h              |   4 +-
 drivers/media/platform/davinci/vpss.c              |  10 +-
 drivers/media/platform/exynos4-is/fimc-core.c      |   2 +-
 drivers/media/platform/m2m-deinterlace.c           |  12 +-
 drivers/media/platform/omap/omap_vout.c            |  12 +-
 drivers/media/platform/sh_vou.c                    |   2 +-
 drivers/media/radio/radio-aimslab.c                |   2 +-
 drivers/media/radio/radio-aztech.c                 |   2 +-
 drivers/media/radio/radio-cadet.c                  |   4 +-
 drivers/media/radio/radio-gemtek.c                 |   8 +-
 drivers/media/radio/radio-rtrack2.c                |   2 +-
 drivers/media/radio/radio-sf16fmi.c                |   4 +-
 drivers/media/radio/radio-sf16fmr2.c               |   2 +-
 drivers/media/radio/radio-tea5764.c                |   2 +-
 drivers/media/radio/radio-terratec.c               |   6 +-
 drivers/media/radio/tea575x.c                      |   2 +-
 drivers/media/rc/keymaps/rc-behold-columbus.c      |   6 +-
 drivers/media/rc/keymaps/rc-winfast-usbii-deluxe.c |   2 +-
 drivers/media/tuners/mxl5005s.c                    |   6 +-
 drivers/media/tuners/tda827x.h                     |   2 +-
 drivers/media/tuners/tda9887.c                     |   4 +-
 drivers/media/tuners/tuner-simple.c                |   2 +-
 drivers/media/tuners/tuner-xc2028.c                |   6 +-
 drivers/media/tuners/tuner-xc2028.h                |   2 +-
 drivers/media/usb/au0828/au0828-cards.h            |   2 +-
 drivers/media/usb/au0828/au0828-video.c            |   2 +-
 drivers/media/usb/au0828/au0828.h                  |   6 +-
 drivers/media/usb/cpia2/cpia2_usb.c                |  14 +-
 drivers/media/usb/cx231xx/cx231xx-audio.c          |   6 +-
 drivers/media/usb/cx231xx/cx231xx-avcore.c         |   2 +-
 drivers/media/usb/cx231xx/cx231xx-core.c           |   2 +-
 drivers/media/usb/cx231xx/cx231xx-i2c.c            |   2 +-
 drivers/media/usb/cx231xx/cx231xx-pcb-cfg.h        |   2 +-
 drivers/media/usb/cx231xx/cx231xx-reg.h            |  20 +-
 drivers/media/usb/dvb-usb/az6027.c                 | 216 ++++++++---------
 drivers/media/usb/gspca/stv06xx/stv06xx.c          |   2 +-
 drivers/media/usb/hdpvr/hdpvr-video.c              |  26 +-
 drivers/media/usb/hdpvr/hdpvr.h                    |  18 +-
 drivers/media/usb/pwc/pwc.h                        |   6 +-
 drivers/media/usb/siano/smsusb.c                   |   2 +-
 drivers/media/usb/stk1160/Makefile                 |   2 +-
 drivers/media/usb/stkwebcam/stk-sensor.c           |  44 ++--
 drivers/media/usb/uvc/uvc_driver.c                 |  14 +-
 drivers/media/usb/uvc/uvc_isight.c                 |  10 +-
 drivers/media/v4l2-core/v4l2-compat-ioctl32.c      |   6 +-
 drivers/media/v4l2-core/v4l2-ioctl.c               |  62 ++---
 include/media/drv-intf/cx2341x.h                   | 144 +++++------
 include/media/drv-intf/msp3400.h                   |  62 ++---
 include/media/drv-intf/saa7146.h                   |   2 +-
 include/media/i2c/bt819.h                          |   4 +-
 include/media/i2c/m52790.h                         |  52 ++--
 include/media/i2c/saa7115.h                        |  12 +-
 include/media/i2c/upd64031a.h                      |   6 +-
 include/media/v4l2-common.h                        |   8 +-
 include/uapi/linux/dvb/video.h                     |  20 +-
 include/uapi/linux/v4l2-controls.h                 |  96 ++++----
 include/uapi/linux/videodev2.h                     |  56 ++---
 135 files changed, 1722 insertions(+), 1722 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/media/common/saa7146/saa7146_video.c b/drivers/media/common/saa7146/saa7146_video.c
index 5dfc1f27d1cf..0dfa0c09d646 100644
--- a/drivers/media/common/saa7146/saa7146_video.c
+++ b/drivers/media/common/saa7146/saa7146_video.c
@@ -1001,9 +1001,9 @@ const struct v4l2_ioctl_ops saa7146_video_ioctl_ops = {
 	.vidioc_try_fmt_vid_overlay  = vidioc_try_fmt_vid_overlay,
 	.vidioc_s_fmt_vid_overlay    = vidioc_s_fmt_vid_overlay,
 
-	.vidioc_overlay 	     = vidioc_overlay,
-	.vidioc_g_fbuf  	     = vidioc_g_fbuf,
-	.vidioc_s_fbuf  	     = vidioc_s_fbuf,
+	.vidioc_overlay		     = vidioc_overlay,
+	.vidioc_g_fbuf		     = vidioc_g_fbuf,
+	.vidioc_s_fbuf		     = vidioc_s_fbuf,
 	.vidioc_reqbufs              = vidioc_reqbufs,
 	.vidioc_querybuf             = vidioc_querybuf,
 	.vidioc_qbuf                 = vidioc_qbuf,
@@ -1012,7 +1012,7 @@ const struct v4l2_ioctl_ops saa7146_video_ioctl_ops = {
 	.vidioc_s_std                = vidioc_s_std,
 	.vidioc_streamon             = vidioc_streamon,
 	.vidioc_streamoff            = vidioc_streamoff,
-	.vidioc_g_parm 		     = vidioc_g_parm,
+	.vidioc_g_parm		     = vidioc_g_parm,
 	.vidioc_subscribe_event      = v4l2_ctrl_subscribe_event,
 	.vidioc_unsubscribe_event    = v4l2_event_unsubscribe,
 };
diff --git a/drivers/media/dvb-core/Makefile b/drivers/media/dvb-core/Makefile
index 05827ee2a406..3a105d82019a 100644
--- a/drivers/media/dvb-core/Makefile
+++ b/drivers/media/dvb-core/Makefile
@@ -7,7 +7,7 @@ dvb-net-$(CONFIG_DVB_NET) := dvb_net.o
 dvb-vb2-$(CONFIG_DVB_MMSP) := dvb_vb2.o
 
 dvb-core-objs := dvbdev.o dmxdev.o dvb_demux.o			\
-		 dvb_ca_en50221.o dvb_frontend.o 		\
+		 dvb_ca_en50221.o dvb_frontend.o		\
 		 $(dvb-net-y) dvb_ringbuffer.o $(dvb-vb2-y) dvb_math.o
 
 obj-$(CONFIG_DVB_CORE) += dvb-core.o
diff --git a/drivers/media/dvb-frontends/au8522_priv.h b/drivers/media/dvb-frontends/au8522_priv.h
index f02dac958db6..2043c1744753 100644
--- a/drivers/media/dvb-frontends/au8522_priv.h
+++ b/drivers/media/dvb-frontends/au8522_priv.h
@@ -99,7 +99,7 @@ int au8522_led_ctrl(struct au8522_state *state, int led);
 #define AU8522_SYSTEM_MODULE_CONTROL_1_REG0A5H		0x0A5
 #define AU8522_AGC_CONTROL_RANGE_REG0A6H		0x0A6
 #define AU8522_SYSTEM_GAIN_CONTROL_REG0A7H		0x0A7
-#define AU8522_TUNER_AGC_RF_STOP_REG0A8H  		0x0A8
+#define AU8522_TUNER_AGC_RF_STOP_REG0A8H		0x0A8
 #define AU8522_TUNER_AGC_RF_START_REG0A9H		0x0A9
 #define AU8522_TUNER_RF_AGC_DEFAULT_REG0AAH		0x0AA
 #define AU8522_TUNER_AGC_IF_STOP_REG0ABH		0x0AB
@@ -110,18 +110,18 @@ int au8522_led_ctrl(struct au8522_state *state, int led);
 
 /* Receiver registers */
 #define AU8522_FRMREGTHRD1_REG0B0H			0x0B0
-#define AU8522_FRMREGAGC1H_REG0B1H 			0x0B1
-#define AU8522_FRMREGSHIFT1_REG0B2H 			0x0B2
-#define AU8522_TOREGAGC1_REG0B3H 			0x0B3
-#define AU8522_TOREGASHIFT1_REG0B4H 			0x0B4
+#define AU8522_FRMREGAGC1H_REG0B1H			0x0B1
+#define AU8522_FRMREGSHIFT1_REG0B2H			0x0B2
+#define AU8522_TOREGAGC1_REG0B3H			0x0B3
+#define AU8522_TOREGASHIFT1_REG0B4H			0x0B4
 #define AU8522_FRMREGBBH_REG0B5H			0x0B5
-#define AU8522_FRMREGBBM_REG0B6H 			0x0B6
-#define AU8522_FRMREGBBL_REG0B7H     			0x0B7
+#define AU8522_FRMREGBBM_REG0B6H			0x0B6
+#define AU8522_FRMREGBBL_REG0B7H			0x0B7
 /* 0xB8 TO 0xD7 are the filter coefficients */
-#define AU8522_FRMREGTHRD2_REG0D8H 			0x0D8
-#define AU8522_FRMREGAGC2H_REG0D9H 			0x0D9
-#define AU8522_TOREGAGC2_REG0DAH 			0x0DA
-#define AU8522_TOREGSHIFT2_REG0DBH 			0x0DB
+#define AU8522_FRMREGTHRD2_REG0D8H			0x0D8
+#define AU8522_FRMREGAGC2H_REG0D9H			0x0D9
+#define AU8522_TOREGAGC2_REG0DAH			0x0DA
+#define AU8522_TOREGSHIFT2_REG0DBH			0x0DB
 #define AU8522_FRMREGPILOTH_REG0DCH			0x0DC
 #define AU8522_FRMREGPILOTM_REG0DDH			0x0DD
 #define AU8522_FRMREGPILOTL_REG0DEH			0x0DE
@@ -134,9 +134,9 @@ int au8522_led_ctrl(struct au8522_state *state, int led);
 #define AU8522_CHIP_MODE_REG0FEH			0x0FE
 
 /* I2C bus control registers */
-#define AU8522_I2C_CONTROL_REG0_REG090H    		0x090
-#define AU8522_I2C_CONTROL_REG1_REG091H    		0x091
-#define AU8522_I2C_STATUS_REG092H          		0x092
+#define AU8522_I2C_CONTROL_REG0_REG090H			0x090
+#define AU8522_I2C_CONTROL_REG1_REG091H			0x091
+#define AU8522_I2C_STATUS_REG092H			0x092
 #define AU8522_I2C_WR_DATA0_REG093H			0x093
 #define AU8522_I2C_WR_DATA1_REG094H			0x094
 #define AU8522_I2C_WR_DATA2_REG095H			0x095
@@ -156,48 +156,48 @@ int au8522_led_ctrl(struct au8522_state *state, int led);
 
 #define AU8522_ENA_USB_REG101H				0x101
 
-#define AU8522_I2S_CTRL_0_REG110H  			0x110
-#define AU8522_I2S_CTRL_1_REG111H 			0x111
-#define AU8522_I2S_CTRL_2_REG112H 			0x112
+#define AU8522_I2S_CTRL_0_REG110H			0x110
+#define AU8522_I2S_CTRL_1_REG111H			0x111
+#define AU8522_I2S_CTRL_2_REG112H			0x112
 
-#define AU8522_FRMREGFFECONTROL_REG121H    		0x121
-#define AU8522_FRMREGDFECONTROL_REG122H    		0x122
+#define AU8522_FRMREGFFECONTROL_REG121H			0x121
+#define AU8522_FRMREGDFECONTROL_REG122H			0x122
 
-#define AU8522_CARRFREQOFFSET0_REG201H 			0x201
+#define AU8522_CARRFREQOFFSET0_REG201H			0x201
 #define AU8522_CARRFREQOFFSET1_REG202H			0x202
 
 #define AU8522_DECIMATION_GAIN_REG21AH			0x21A
-#define AU8522_FRMREGIFSLP_REG21BH 			0x21B
-#define AU8522_FRMREGTHRDL2_REG21CH 			0x21C
-#define AU8522_FRMREGSTEP3DB_REG21DH 			0x21D
+#define AU8522_FRMREGIFSLP_REG21BH			0x21B
+#define AU8522_FRMREGTHRDL2_REG21CH			0x21C
+#define AU8522_FRMREGSTEP3DB_REG21DH			0x21D
 #define AU8522_DAGC_GAIN_ADJUSTMENT_REG21EH		0x21E
-#define AU8522_FRMREGPLLMODE_REG21FH 			0x21F
-#define AU8522_FRMREGCSTHRD_REG220H 			0x220
-#define AU8522_FRMREGCRLOCKDMAX_REG221H 		0x221
-#define AU8522_FRMREGCRPERIODMASK_REG222H 		0x222
-#define AU8522_FRMREGCRLOCK0THH_REG223H 		0x223
-#define AU8522_FRMREGCRLOCK1THH_REG224H 		0x224
-#define AU8522_FRMREGCRLOCK0THL_REG225H 		0x225
-#define AU8522_FRMREGCRLOCK1THL_REG226H 		0x226
+#define AU8522_FRMREGPLLMODE_REG21FH			0x21F
+#define AU8522_FRMREGCSTHRD_REG220H			0x220
+#define AU8522_FRMREGCRLOCKDMAX_REG221H			0x221
+#define AU8522_FRMREGCRPERIODMASK_REG222H		0x222
+#define AU8522_FRMREGCRLOCK0THH_REG223H			0x223
+#define AU8522_FRMREGCRLOCK1THH_REG224H			0x224
+#define AU8522_FRMREGCRLOCK0THL_REG225H			0x225
+#define AU8522_FRMREGCRLOCK1THL_REG226H			0x226
 #define AU_FRMREGPLLACQPHASESCL_REG227H			0x227
-#define AU8522_FRMREGFREQFBCTRL_REG228H 		0x228
+#define AU8522_FRMREGFREQFBCTRL_REG228H			0x228
 
 /* Analog TV Decoder */
 #define AU8522_TVDEC_STATUS_REG000H			0x000
 #define AU8522_TVDEC_INT_STATUS_REG001H			0x001
-#define AU8522_TVDEC_MACROVISION_STATUS_REG002H 	0x002
+#define AU8522_TVDEC_MACROVISION_STATUS_REG002H		0x002
 #define AU8522_TVDEC_SHARPNESSREG009H			0x009
 #define AU8522_TVDEC_BRIGHTNESS_REG00AH			0x00A
 #define AU8522_TVDEC_CONTRAST_REG00BH			0x00B
 #define AU8522_TVDEC_SATURATION_CB_REG00CH		0x00C
 #define AU8522_TVDEC_SATURATION_CR_REG00DH		0x00D
 #define AU8522_TVDEC_HUE_H_REG00EH			0x00E
-#define AU8522_TVDEC_HUE_L_REG00FH                   	0x00F
+#define AU8522_TVDEC_HUE_L_REG00FH			0x00F
 #define AU8522_TVDEC_INT_MASK_REG010H			0x010
 #define AU8522_VIDEO_MODE_REG011H			0x011
 #define AU8522_TVDEC_PGA_REG012H			0x012
 #define AU8522_TVDEC_COMB_MODE_REG015H			0x015
-#define AU8522_REG016H                            	0x016
+#define AU8522_REG016H					0x016
 #define AU8522_TVDED_DBG_MODE_REG060H			0x060
 #define AU8522_TVDEC_FORMAT_CTRL1_REG061H		0x061
 #define AU8522_TVDEC_FORMAT_CTRL2_REG062H		0x062
@@ -207,13 +207,13 @@ int au8522_led_ctrl(struct au8522_state *state, int led);
 #define AU8522_TVDEC_COMB_VDIF_THR2_REG066H		0x066
 #define AU8522_TVDEC_COMB_VDIF_THR3_REG067H		0x067
 #define AU8522_TVDEC_COMB_NOTCH_THR_REG068H		0x068
-#define AU8522_TVDEC_COMB_HDIF_THR1_REG069H   		0x069
+#define AU8522_TVDEC_COMB_HDIF_THR1_REG069H		0x069
 #define AU8522_TVDEC_COMB_HDIF_THR2_REG06AH		0x06A
-#define AU8522_TVDEC_COMB_HDIF_THR3_REG06BH   		0x06B
-#define AU8522_TVDEC_COMB_DCDIF_THR1_REG06CH  		0x06C
-#define AU8522_TVDEC_COMB_DCDIF_THR2_REG06DH 		0x06D
-#define AU8522_TVDEC_COMB_DCDIF_THR3_REG06EH       	0x06E
-#define AU8522_TVDEC_UV_SEP_THR_REG06FH  		0x06F
+#define AU8522_TVDEC_COMB_HDIF_THR3_REG06BH		0x06B
+#define AU8522_TVDEC_COMB_DCDIF_THR1_REG06CH		0x06C
+#define AU8522_TVDEC_COMB_DCDIF_THR2_REG06DH		0x06D
+#define AU8522_TVDEC_COMB_DCDIF_THR3_REG06EH		0x06E
+#define AU8522_TVDEC_UV_SEP_THR_REG06FH			0x06F
 #define AU8522_TVDEC_COMB_DC_THR1_NTSC_REG070H		0x070
 #define AU8522_TVDEC_COMB_DC_THR2_NTSC_REG073H		0x073
 #define AU8522_TVDEC_DCAGC_CTRL_REG077H			0x077
@@ -229,42 +229,42 @@ int au8522_led_ctrl(struct au8522_state *state, int led);
 
 #define AU8522_TVDEC_CHROMA_AGC_REG401H		0x401
 #define AU8522_TVDEC_CHROMA_SFT_REG402H		0x402
-#define AU8522_FILTER_COEF_R410     		0x410
-#define AU8522_FILTER_COEF_R411     		0x411
-#define AU8522_FILTER_COEF_R412     		0x412
-#define AU8522_FILTER_COEF_R413     		0x413
-#define AU8522_FILTER_COEF_R414     		0x414
-#define AU8522_FILTER_COEF_R415     		0x415
-#define AU8522_FILTER_COEF_R416     		0x416
-#define AU8522_FILTER_COEF_R417     		0x417
-#define AU8522_FILTER_COEF_R418     		0x418
-#define AU8522_FILTER_COEF_R419     		0x419
-#define AU8522_FILTER_COEF_R41A     		0x41A
-#define AU8522_FILTER_COEF_R41B     		0x41B
-#define AU8522_FILTER_COEF_R41C     		0x41C
-#define AU8522_FILTER_COEF_R41D     		0x41D
-#define AU8522_FILTER_COEF_R41E     		0x41E
-#define AU8522_FILTER_COEF_R41F     		0x41F
-#define AU8522_FILTER_COEF_R420     		0x420
-#define AU8522_FILTER_COEF_R421     		0x421
-#define AU8522_FILTER_COEF_R422     		0x422
-#define AU8522_FILTER_COEF_R423     		0x423
-#define AU8522_FILTER_COEF_R424     		0x424
-#define AU8522_FILTER_COEF_R425     		0x425
-#define AU8522_FILTER_COEF_R426     		0x426
-#define AU8522_FILTER_COEF_R427     		0x427
-#define AU8522_FILTER_COEF_R428     		0x428
-#define AU8522_FILTER_COEF_R429     		0x429
-#define AU8522_FILTER_COEF_R42A     		0x42A
-#define AU8522_FILTER_COEF_R42B     		0x42B
-#define AU8522_FILTER_COEF_R42C     		0x42C
-#define AU8522_FILTER_COEF_R42D     		0x42D
+#define AU8522_FILTER_COEF_R410			0x410
+#define AU8522_FILTER_COEF_R411			0x411
+#define AU8522_FILTER_COEF_R412			0x412
+#define AU8522_FILTER_COEF_R413			0x413
+#define AU8522_FILTER_COEF_R414			0x414
+#define AU8522_FILTER_COEF_R415			0x415
+#define AU8522_FILTER_COEF_R416			0x416
+#define AU8522_FILTER_COEF_R417			0x417
+#define AU8522_FILTER_COEF_R418			0x418
+#define AU8522_FILTER_COEF_R419			0x419
+#define AU8522_FILTER_COEF_R41A			0x41A
+#define AU8522_FILTER_COEF_R41B			0x41B
+#define AU8522_FILTER_COEF_R41C			0x41C
+#define AU8522_FILTER_COEF_R41D			0x41D
+#define AU8522_FILTER_COEF_R41E			0x41E
+#define AU8522_FILTER_COEF_R41F			0x41F
+#define AU8522_FILTER_COEF_R420			0x420
+#define AU8522_FILTER_COEF_R421			0x421
+#define AU8522_FILTER_COEF_R422			0x422
+#define AU8522_FILTER_COEF_R423			0x423
+#define AU8522_FILTER_COEF_R424			0x424
+#define AU8522_FILTER_COEF_R425			0x425
+#define AU8522_FILTER_COEF_R426			0x426
+#define AU8522_FILTER_COEF_R427			0x427
+#define AU8522_FILTER_COEF_R428			0x428
+#define AU8522_FILTER_COEF_R429			0x429
+#define AU8522_FILTER_COEF_R42A			0x42A
+#define AU8522_FILTER_COEF_R42B			0x42B
+#define AU8522_FILTER_COEF_R42C			0x42C
+#define AU8522_FILTER_COEF_R42D			0x42D
 
 /* VBI Control Registers */
-#define AU8522_TVDEC_VBI_RX_FIFO_CONTAIN_REG004H  	0x004
-#define AU8522_TVDEC_VBI_TX_FIFO_CONTAIN_REG005H  	0x005
-#define AU8522_TVDEC_VBI_RX_FIFO_READ_REG006H      	0x006
-#define AU8522_TVDEC_VBI_FIFO_STATUS_REG007H       	0x007
+#define AU8522_TVDEC_VBI_RX_FIFO_CONTAIN_REG004H	0x004
+#define AU8522_TVDEC_VBI_TX_FIFO_CONTAIN_REG005H	0x005
+#define AU8522_TVDEC_VBI_RX_FIFO_READ_REG006H		0x006
+#define AU8522_TVDEC_VBI_FIFO_STATUS_REG007H		0x007
 #define AU8522_TVDEC_VBI_CTRL_H_REG017H			0x017
 #define AU8522_TVDEC_VBI_CTRL_L_REG018H			0x018
 #define AU8522_TVDEC_VBI_USER_TOTAL_BITS_REG019H	0x019
@@ -272,10 +272,10 @@ int au8522_led_ctrl(struct au8522_state *state, int led);
 #define AU8522_TVDEC_VBI_USER_TUNIT_L_REG01BH		0x01B
 #define AU8522_TVDEC_VBI_USER_THRESH1_REG01CH		0x01C
 #define AU8522_TVDEC_VBI_USER_FRAME_PAT2_REG01EH	0x01E
-#define AU8522_TVDEC_VBI_USER_FRAME_PAT1_REG01FH   	0x01F
-#define AU8522_TVDEC_VBI_USER_FRAME_PAT0_REG020H   	0x020
-#define AU8522_TVDEC_VBI_USER_FRAME_MASK2_REG021H 	0x021
-#define AU8522_TVDEC_VBI_USER_FRAME_MASK1_REG022H  	0x022
+#define AU8522_TVDEC_VBI_USER_FRAME_PAT1_REG01FH	0x01F
+#define AU8522_TVDEC_VBI_USER_FRAME_PAT0_REG020H	0x020
+#define AU8522_TVDEC_VBI_USER_FRAME_MASK2_REG021H	0x021
+#define AU8522_TVDEC_VBI_USER_FRAME_MASK1_REG022H	0x022
 #define AU8522_TVDEC_VBI_USER_FRAME_MASK0_REG023H	0x023
 
 #define AU8522_REG071H					0x071
@@ -315,17 +315,17 @@ int au8522_led_ctrl(struct au8522_state *state, int led);
 #define AU8522_GPIO_DATA_REG0E2H			0x0E2
 
 /* Audio Control Registers */
-#define AU8522_AUDIOAGC_REG0EEH 			0x0EE
-#define AU8522_AUDIO_STATUS_REG0F0H 			0x0F0
-#define AU8522_AUDIO_MODE_REG0F1H 			0x0F1
-#define AU8522_AUDIO_VOLUME_L_REG0F2H 			0x0F2
-#define AU8522_AUDIO_VOLUME_R_REG0F3H 			0x0F3
-#define AU8522_AUDIO_VOLUME_REG0F4H 			0x0F4
-#define AU8522_FRMREGAUPHASE_REG0F7H 			0x0F7
+#define AU8522_AUDIOAGC_REG0EEH				0x0EE
+#define AU8522_AUDIO_STATUS_REG0F0H			0x0F0
+#define AU8522_AUDIO_MODE_REG0F1H			0x0F1
+#define AU8522_AUDIO_VOLUME_L_REG0F2H			0x0F2
+#define AU8522_AUDIO_VOLUME_R_REG0F3H			0x0F3
+#define AU8522_AUDIO_VOLUME_REG0F4H			0x0F4
+#define AU8522_FRMREGAUPHASE_REG0F7H			0x0F7
 #define AU8522_REG0F9H					0x0F9
 
-#define AU8522_AUDIOAGC2_REG605H 			0x605
-#define AU8522_AUDIOFREQ_REG606H 			0x606
+#define AU8522_AUDIOAGC2_REG605H			0x605
+#define AU8522_AUDIOFREQ_REG606H			0x606
 
 
 /**************************************************************/
@@ -356,53 +356,53 @@ int au8522_led_ctrl(struct au8522_state *state, int led);
 #define AU8522_TVDEC_FORMAT_CTRL2_REG062H_STD_PAL_M		0x02
 
 
-#define AU8522_INPUT_CONTROL_REG081H_ATSC               	0xC4
+#define AU8522_INPUT_CONTROL_REG081H_ATSC			0xC4
 #define AU8522_INPUT_CONTROL_REG081H_ATVRF			0xC4
 #define AU8522_INPUT_CONTROL_REG081H_ATVRF13			0xC4
-#define AU8522_INPUT_CONTROL_REG081H_J83B64             	0xC4
-#define AU8522_INPUT_CONTROL_REG081H_J83B256            	0xC4
-#define AU8522_INPUT_CONTROL_REG081H_CVBS               	0x20
+#define AU8522_INPUT_CONTROL_REG081H_J83B64			0xC4
+#define AU8522_INPUT_CONTROL_REG081H_J83B256			0xC4
+#define AU8522_INPUT_CONTROL_REG081H_CVBS			0x20
 #define AU8522_INPUT_CONTROL_REG081H_CVBS_CH1			0xA2
 #define AU8522_INPUT_CONTROL_REG081H_CVBS_CH2			0xA0
 #define AU8522_INPUT_CONTROL_REG081H_CVBS_CH3			0x69
 #define AU8522_INPUT_CONTROL_REG081H_CVBS_CH4			0x68
-#define AU8522_INPUT_CONTROL_REG081H_CVBS_CH4_SIF        	0x28
+#define AU8522_INPUT_CONTROL_REG081H_CVBS_CH4_SIF		0x28
 /* CH1 AS Y,CH3 AS C */
-#define AU8522_INPUT_CONTROL_REG081H_SVIDEO_CH13        	0x23
+#define AU8522_INPUT_CONTROL_REG081H_SVIDEO_CH13		0x23
 /* CH2 AS Y,CH4 AS C */
-#define AU8522_INPUT_CONTROL_REG081H_SVIDEO_CH24        	0x20
-#define AU8522_MODULE_CLOCK_CONTROL_REG0A3H_ATSC        	0x0C
-#define AU8522_MODULE_CLOCK_CONTROL_REG0A3H_J83B64      	0x09
-#define AU8522_MODULE_CLOCK_CONTROL_REG0A3H_J83B256    		0x09
-#define AU8522_MODULE_CLOCK_CONTROL_REG0A3H_CVBS        	0x12
-#define AU8522_MODULE_CLOCK_CONTROL_REG0A3H_ATVRF       	0x1A
+#define AU8522_INPUT_CONTROL_REG081H_SVIDEO_CH24		0x20
+#define AU8522_MODULE_CLOCK_CONTROL_REG0A3H_ATSC		0x0C
+#define AU8522_MODULE_CLOCK_CONTROL_REG0A3H_J83B64		0x09
+#define AU8522_MODULE_CLOCK_CONTROL_REG0A3H_J83B256		0x09
+#define AU8522_MODULE_CLOCK_CONTROL_REG0A3H_CVBS		0x12
+#define AU8522_MODULE_CLOCK_CONTROL_REG0A3H_ATVRF		0x1A
 #define AU8522_MODULE_CLOCK_CONTROL_REG0A3H_ATVRF13		0x1A
 #define AU8522_MODULE_CLOCK_CONTROL_REG0A3H_SVIDEO		0x02
 
 #define AU8522_SYSTEM_MODULE_CONTROL_0_REG0A4H_CLEAR		0x00
 #define AU8522_SYSTEM_MODULE_CONTROL_0_REG0A4H_SVIDEO		0x9C
-#define AU8522_SYSTEM_MODULE_CONTROL_0_REG0A4H_CVBS     	0x9D
+#define AU8522_SYSTEM_MODULE_CONTROL_0_REG0A4H_CVBS		0x9D
 #define AU8522_SYSTEM_MODULE_CONTROL_0_REG0A4H_ATSC		0xE8
-#define AU8522_SYSTEM_MODULE_CONTROL_0_REG0A4H_J83B256 		0xCA
-#define AU8522_SYSTEM_MODULE_CONTROL_0_REG0A4H_J83B64  		0xCA
-#define AU8522_SYSTEM_MODULE_CONTROL_0_REG0A4H_ATVRF   		0xDD
+#define AU8522_SYSTEM_MODULE_CONTROL_0_REG0A4H_J83B256		0xCA
+#define AU8522_SYSTEM_MODULE_CONTROL_0_REG0A4H_J83B64		0xCA
+#define AU8522_SYSTEM_MODULE_CONTROL_0_REG0A4H_ATVRF		0xDD
 #define AU8522_SYSTEM_MODULE_CONTROL_0_REG0A4H_ATVRF13		0xDD
 #define AU8522_SYSTEM_MODULE_CONTROL_0_REG0A4H_PAL		0xDD
 #define AU8522_SYSTEM_MODULE_CONTROL_0_REG0A4H_FM		0xDD
 
 #define AU8522_SYSTEM_MODULE_CONTROL_1_REG0A5H_ATSC		0x80
-#define AU8522_SYSTEM_MODULE_CONTROL_1_REG0A5H_J83B256 		0x80
-#define AU8522_SYSTEM_MODULE_CONTROL_1_REG0A5H_J83B64  		0x80
+#define AU8522_SYSTEM_MODULE_CONTROL_1_REG0A5H_J83B256		0x80
+#define AU8522_SYSTEM_MODULE_CONTROL_1_REG0A5H_J83B64		0x80
 #define AU8522_SYSTEM_MODULE_CONTROL_1_REG0A5H_DONGLE_ATSC	0x40
 #define AU8522_SYSTEM_MODULE_CONTROL_1_REG0A5H_DONGLE_J83B256	0x40
 #define AU8522_SYSTEM_MODULE_CONTROL_1_REG0A5H_DONGLE_J83B64	0x40
 #define AU8522_SYSTEM_MODULE_CONTROL_1_REG0A5H_DONGLE_CLEAR	0x00
 #define AU8522_SYSTEM_MODULE_CONTROL_1_REG0A5H_ATVRF		0x01
 #define AU8522_SYSTEM_MODULE_CONTROL_1_REG0A5H_ATVRF13		0x01
-#define AU8522_SYSTEM_MODULE_CONTROL_1_REG0A5H_SVIDEO  		0x04
+#define AU8522_SYSTEM_MODULE_CONTROL_1_REG0A5H_SVIDEO		0x04
 #define AU8522_SYSTEM_MODULE_CONTROL_1_REG0A5H_CVBS		0x01
-#define AU8522_SYSTEM_MODULE_CONTROL_1_REG0A5H_PWM     		0x03
-#define AU8522_SYSTEM_MODULE_CONTROL_1_REG0A5H_IIS      	0x09
+#define AU8522_SYSTEM_MODULE_CONTROL_1_REG0A5H_PWM		0x03
+#define AU8522_SYSTEM_MODULE_CONTROL_1_REG0A5H_IIS		0x09
 #define AU8522_SYSTEM_MODULE_CONTROL_1_REG0A5H_PAL		0x01
 #define AU8522_SYSTEM_MODULE_CONTROL_1_REG0A5H_FM		0x01
 
diff --git a/drivers/media/dvb-frontends/drx39xyj/drx_driver.h b/drivers/media/dvb-frontends/drx39xyj/drx_driver.h
index 855685b6b386..1ec20eecc433 100644
--- a/drivers/media/dvb-frontends/drx39xyj/drx_driver.h
+++ b/drivers/media/dvb-frontends/drx39xyj/drx_driver.h
@@ -932,7 +932,7 @@ STRUCTS
  * Used by DRX_CTRL_LOAD_UCODE and DRX_CTRL_VERIFY_UCODE
  */
 struct drxu_code_info {
-	char 			*mc_file;
+	char			*mc_file;
 };
 
 /*
diff --git a/drivers/media/dvb-frontends/stb0899_drv.c b/drivers/media/dvb-frontends/stb0899_drv.c
index 2c5427c77db7..3c654ae16e78 100644
--- a/drivers/media/dvb-frontends/stb0899_drv.c
+++ b/drivers/media/dvb-frontends/stb0899_drv.c
@@ -1583,15 +1583,15 @@ static enum dvbfe_algo stb0899_frontend_algo(struct dvb_frontend *fe)
 static const struct dvb_frontend_ops stb0899_ops = {
 	.delsys = { SYS_DVBS, SYS_DVBS2, SYS_DSS },
 	.info = {
-		.name 			= "STB0899 Multistandard",
+		.name			= "STB0899 Multistandard",
 		.frequency_min		= 950000,
-		.frequency_max 		= 2150000,
+		.frequency_max		= 2150000,
 		.frequency_stepsize	= 0,
 		.frequency_tolerance	= 0,
-		.symbol_rate_min 	=  5000000,
-		.symbol_rate_max 	= 45000000,
+		.symbol_rate_min	=  5000000,
+		.symbol_rate_max	= 45000000,
 
-		.caps 			= FE_CAN_INVERSION_AUTO	|
+		.caps			= FE_CAN_INVERSION_AUTO	|
 					  FE_CAN_FEC_AUTO	|
 					  FE_CAN_2G_MODULATION	|
 					  FE_CAN_QPSK
diff --git a/drivers/media/dvb-frontends/stb0899_drv.h b/drivers/media/dvb-frontends/stb0899_drv.h
index 6c285aee7edf..f65f9a8266f8 100644
--- a/drivers/media/dvb-frontends/stb0899_drv.h
+++ b/drivers/media/dvb-frontends/stb0899_drv.h
@@ -82,7 +82,7 @@ enum stb0899_inversion {
  * 1. POWER ON/OFF		(index 0)
  * 2. FE_HAS_LOCK/LOCK_LOSS	(index 1)
  *
- * @gpio 	= one of the above listed GPIO's
+ * @gpio	= one of the above listed GPIO's
  * @level	= output state: pulled up or low
  */
 struct stb0899_postproc {
diff --git a/drivers/media/dvb-frontends/stb0899_priv.h b/drivers/media/dvb-frontends/stb0899_priv.h
index 86d140e4c5ed..3285cd1ba60a 100644
--- a/drivers/media/dvb-frontends/stb0899_priv.h
+++ b/drivers/media/dvb-frontends/stb0899_priv.h
@@ -252,7 +252,7 @@ extern int stb0899_write_s2reg(struct stb0899_state *state,
 extern int stb0899_i2c_gate_ctrl(struct dvb_frontend *fe, int enable);
 
 
-#define STB0899_READ_S2REG(DEVICE, REG) 	(_stb0899_read_s2reg(state, DEVICE, STB0899_BASE_##REG, STB0899_OFF0_##REG))
+#define STB0899_READ_S2REG(DEVICE, REG)		(_stb0899_read_s2reg(state, DEVICE, STB0899_BASE_##REG, STB0899_OFF0_##REG))
 //#define STB0899_WRITE_S2REG(DEVICE, REG, DATA)	(_stb0899_write_s2reg(state, DEVICE, STB0899_BASE_##REG, STB0899_OFF0_##REG, DATA))
 
 /* stb0899_algo.c	*/
diff --git a/drivers/media/dvb-frontends/stv0900_core.c b/drivers/media/dvb-frontends/stv0900_core.c
index 0b739725e3c0..72f17b97ca04 100644
--- a/drivers/media/dvb-frontends/stv0900_core.c
+++ b/drivers/media/dvb-frontends/stv0900_core.c
@@ -1929,7 +1929,7 @@ struct dvb_frontend *stv0900_attach(const struct stv0900_config *config,
 	switch (demod) {
 	case 0:
 	case 1:
-		init_params.dmd_ref_clk  	= config->xtal;
+		init_params.dmd_ref_clk		= config->xtal;
 		init_params.demod_mode		= config->demod_mode;
 		init_params.rolloff		= STV0900_35;
 		init_params.path1_ts_clock	= config->path1_mode;
diff --git a/drivers/media/dvb-frontends/stv0900_init.h b/drivers/media/dvb-frontends/stv0900_init.h
index 411941442086..550ef4a0f654 100644
--- a/drivers/media/dvb-frontends/stv0900_init.h
+++ b/drivers/media/dvb-frontends/stv0900_init.h
@@ -148,8 +148,8 @@ struct stv0900_short_frames_car_loop_optim_vs_mod {
 
 /* Cut 1.x Tracking carrier loop carrier QPSK 1/2 to 8PSK 9/10 long Frame */
 static const struct stv0900_car_loop_optim FE_STV0900_S2CarLoop[14] = {
-	/*Modcod		2MPon 	2MPoff	5MPon 	5MPoff	10MPon
-				10MPoff	20MPon 	20MPoff	30MPon 	30MPoff */
+	/*Modcod		2MPon	2MPoff	5MPon	5MPoff	10MPon
+				10MPoff	20MPon	20MPoff	30MPon	30MPoff */
 	{ STV0900_QPSK_12,	0x1C,	0x0D,	0x1B,	0x2C,	0x3A,
 				0x1C,	0x2A,	0x3B,	0x2A,	0x1B },
 	{ STV0900_QPSK_35,	0x2C,	0x0D,	0x2B,	0x2C,	0x3A,
@@ -176,15 +176,15 @@ static const struct stv0900_car_loop_optim FE_STV0900_S2CarLoop[14] = {
 				0x0B,	0x39,	0x1A,	0x19,	0x0A },
 	{ STV0900_8PSK_89,	0x3B,	0x3B,	0x0B,	0x2B,	0x2A,
 				0x0B,	0x39,	0x1A,	0x29,	0x39 },
-	{ STV0900_8PSK_910,	0x3B,	0x3B, 	0x0B,	0x2B, 	0x2A,
+	{ STV0900_8PSK_910,	0x3B,	0x3B,	0x0B,	0x2B,	0x2A,
 				0x0B,	0x39,	0x1A,	0x29,	0x39 }
 };
 
 
 /* Cut 2.0 Tracking carrier loop carrier QPSK 1/2 to 8PSK 9/10 long Frame */
 static const struct stv0900_car_loop_optim FE_STV0900_S2CarLoopCut20[14] = {
-	/* Modcod		2MPon 	2MPoff	5MPon 	5MPoff	10MPon
-				10MPoff	20MPon 	20MPoff	30MPon 	30MPoff */
+	/* Modcod		2MPon	2MPoff	5MPon	5MPoff	10MPon
+				10MPoff	20MPon	20MPoff	30MPon	30MPoff */
 	{ STV0900_QPSK_12,	0x1F,	0x3F,	0x1E,	0x3F,	0x3D,
 				0x1F,	0x3D,	0x3E,	0x3D,	0x1E },
 	{ STV0900_QPSK_35,	0x2F,	0x3F,	0x2E,	0x2F,	0x3D,
@@ -211,7 +211,7 @@ static const struct stv0900_car_loop_optim FE_STV0900_S2CarLoopCut20[14] = {
 				0x1e,	0x3c,	0x2d,	0x2c,	0x1d },
 	{ STV0900_8PSK_89,	0x3e,	0x3e,	0x1e,	0x2e,	0x3d,
 				0x1e,	0x0d,	0x2d,	0x3c,	0x1d },
-	{ STV0900_8PSK_910,	0x3e,	0x3e, 	0x1e,	0x2e, 	0x3d,
+	{ STV0900_8PSK_910,	0x3e,	0x3e,	0x1e,	0x2e,	0x3d,
 				0x1e,	0x1d,	0x2d,	0x0d,	0x1d },
 };
 
@@ -219,8 +219,8 @@ static const struct stv0900_car_loop_optim FE_STV0900_S2CarLoopCut20[14] = {
 
 /* Cut 2.0 Tracking carrier loop carrier 16APSK 2/3 to 32APSK 9/10 long Frame */
 static const struct stv0900_car_loop_optim FE_STV0900_S2APSKCarLoopCut20[11] = {
-	/* Modcod		2MPon 	2MPoff	5MPon 	5MPoff	10MPon
-				10MPoff	20MPon 	20MPoff	30MPon 	30MPoff */
+	/* Modcod		2MPon	2MPoff	5MPon	5MPoff	10MPon
+				10MPoff	20MPon	20MPoff	30MPon	30MPoff */
 	{ STV0900_16APSK_23,	0x0C,	0x0C,	0x0C,	0x0C,	0x1D,
 				0x0C,	0x3C,	0x0C,	0x2C,	0x0C },
 	{ STV0900_16APSK_34,	0x0C,	0x0C,	0x0C,	0x0C,	0x0E,
@@ -248,8 +248,8 @@ static const struct stv0900_car_loop_optim FE_STV0900_S2APSKCarLoopCut20[11] = {
 
 /* Cut 2.0 Tracking carrier loop carrier QPSK 1/4 to QPSK 2/5 long Frame */
 static const struct stv0900_car_loop_optim FE_STV0900_S2LowQPCarLoopCut20[3] = {
-	/* Modcod		2MPon 	2MPoff	5MPon 	5MPoff	10MPon
-				10MPoff	20MPon 	20MPoff	30MPon 	30MPoff */
+	/* Modcod		2MPon	2MPoff	5MPon	5MPoff	10MPon
+				10MPoff	20MPon	20MPoff	30MPon	30MPoff */
 	{ STV0900_QPSK_14,	0x0F,	0x3F,	0x0E,	0x3F,	0x2D,
 				0x2F,	0x2D,	0x1F,	0x3D,	0x3E },
 	{ STV0900_QPSK_13,	0x0F,	0x3F,	0x0E,	0x3F,	0x2D,
@@ -275,10 +275,10 @@ struct stv0900_short_frames_car_loop_optim FE_STV0900_S2ShortCarLoop[4] = {
 };
 
 static	const struct stv0900_car_loop_optim FE_STV0900_S2CarLoopCut30[14] = {
-	/*Modcod		2MPon 	2MPoff	5MPon 	5MPoff	10MPon
-				10MPoff	20MPon 	20MPoff	30MPon 	30MPoff	*/
+	/*Modcod		2MPon	2MPoff	5MPon	5MPoff	10MPon
+				10MPoff	20MPon	20MPoff	30MPon	30MPoff	*/
 	{ STV0900_QPSK_12,	0x3C,	0x2C,	0x0C,	0x2C,	0x1B,
-				0x2C,	0x1B,	0x1C,	0x0B, 	0x3B },
+				0x2C,	0x1B,	0x1C,	0x0B,	0x3B },
 	{ STV0900_QPSK_35,	0x0D,	0x0D,	0x0C,	0x0D,	0x1B,
 				0x3C,	0x1B,	0x1C,	0x0B,	0x3B },
 	{ STV0900_QPSK_23,	0x1D,	0x0D,	0x0C,	0x1D,	0x2B,
@@ -309,8 +309,8 @@ static	const struct stv0900_car_loop_optim FE_STV0900_S2CarLoopCut30[14] = {
 
 static	const
 struct stv0900_car_loop_optim FE_STV0900_S2APSKCarLoopCut30[11] = {
-	/*Modcod		2MPon 	2MPoff	5MPon 	5MPoff	10MPon
-				10MPoff	20MPon 	20MPoff	30MPon 	30MPoff	*/
+	/*Modcod		2MPon	2MPoff	5MPon	5MPoff	10MPon
+				10MPoff	20MPon	20MPoff	30MPon	30MPoff	*/
 	{ STV0900_16APSK_23,	0x0A,	0x0A,	0x0A,	0x0A,	0x1A,
 				0x0A,	0x3A,	0x0A,	0x2A,	0x0A },
 	{ STV0900_16APSK_34,	0x0A,	0x0A,	0x0A,	0x0A,	0x0B,
@@ -337,8 +337,8 @@ struct stv0900_car_loop_optim FE_STV0900_S2APSKCarLoopCut30[11] = {
 
 static	const
 struct stv0900_car_loop_optim FE_STV0900_S2LowQPCarLoopCut30[3] = {
-	/*Modcod		2MPon 	2MPoff	5MPon 	5MPoff	10MPon
-				10MPoff	20MPon 	20MPoff	30MPon 	30MPoff*/
+	/*Modcod		2MPon	2MPoff	5MPon	5MPoff	10MPon
+				10MPoff	20MPon	20MPoff	30MPon	30MPoff*/
 	{ STV0900_QPSK_14,	0x0C,	0x3C,	0x0B,	0x3C,	0x2A,
 				0x2C,	0x2A,	0x1C,	0x3A,	0x3B },
 	{ STV0900_QPSK_13,	0x0C,	0x3C,	0x0B,	0x3C,	0x2A,
diff --git a/drivers/media/dvb-frontends/stv0900_priv.h b/drivers/media/dvb-frontends/stv0900_priv.h
index 7a95f955627b..d1fc06ff27d3 100644
--- a/drivers/media/dvb-frontends/stv0900_priv.h
+++ b/drivers/media/dvb-frontends/stv0900_priv.h
@@ -243,7 +243,7 @@ struct stv0900_init_params{
 
 	u8	tun1_maddress;
 	int	tuner1_adc;
-	int 	tuner1_type;
+	int	tuner1_type;
 
 	/* IQ from the tuner1 to the demod */
 	enum stv0900_iq_inversion	tun1_iq_inv;
diff --git a/drivers/media/dvb-frontends/stv090x.c b/drivers/media/dvb-frontends/stv090x.c
index 20641bd2f977..9133f65d4623 100644
--- a/drivers/media/dvb-frontends/stv090x.c
+++ b/drivers/media/dvb-frontends/stv090x.c
@@ -677,7 +677,7 @@ static struct stv090x_short_frame_crloop stv090x_s2_short_crl_cut20[] = {
 
 /* Cut 3.0 Short Frame Tracking CR Loop */
 static struct stv090x_short_frame_crloop stv090x_s2_short_crl_cut30[] = {
-	/* MODCOD  	  2M	5M    10M   20M	  30M */
+	/* MODCOD	  2M	5M    10M   20M	  30M */
 	{ STV090x_QPSK,   0x2C, 0x2B, 0x0B, 0x0B, 0x3A },
 	{ STV090x_8PSK,   0x3B, 0x0B, 0x2A, 0x0A, 0x39 },
 	{ STV090x_16APSK, 0x1B, 0x1B, 0x1B, 0x3A, 0x2A },
@@ -701,7 +701,7 @@ static int stv090x_read_reg(struct stv090x_state *state, unsigned int reg)
 	u8 buf;
 
 	struct i2c_msg msg[] = {
-		{ .addr	= config->address, .flags	= 0, 		.buf = b0,   .len = 2 },
+		{ .addr	= config->address, .flags	= 0,		.buf = b0,   .len = 2 },
 		{ .addr	= config->address, .flags	= I2C_M_RD,	.buf = &buf, .len = 1 }
 	};
 
@@ -4906,11 +4906,11 @@ static const struct dvb_frontend_ops stv090x_ops = {
 	.info = {
 		.name			= "STV090x Multistandard",
 		.frequency_min		= 950000,
-		.frequency_max 		= 2150000,
+		.frequency_max		= 2150000,
 		.frequency_stepsize	= 0,
 		.frequency_tolerance	= 0,
-		.symbol_rate_min 	= 1000000,
-		.symbol_rate_max 	= 45000000,
+		.symbol_rate_min	= 1000000,
+		.symbol_rate_max	= 45000000,
 		.caps			= FE_CAN_INVERSION_AUTO |
 					  FE_CAN_FEC_AUTO       |
 					  FE_CAN_QPSK           |
@@ -4953,7 +4953,7 @@ struct dvb_frontend *stv090x_attach(struct stv090x_config *config,
 	state->frontend.ops			= stv090x_ops;
 	state->frontend.demodulator_priv	= state;
 	state->demod				= demod;
-	state->demod_mode 			= config->demod_mode; /* Single or Dual mode */
+	state->demod_mode			= config->demod_mode; /* Single or Dual mode */
 	state->device				= config->device;
 	state->rolloff				= STV090x_RO_35; /* default */
 
diff --git a/drivers/media/dvb-frontends/stv090x_priv.h b/drivers/media/dvb-frontends/stv090x_priv.h
index 37c9f93a8a6a..fdda2185db9d 100644
--- a/drivers/media/dvb-frontends/stv090x_priv.h
+++ b/drivers/media/dvb-frontends/stv090x_priv.h
@@ -231,7 +231,7 @@ struct stv090x_tab {
 };
 
 struct stv090x_internal {
-	struct i2c_adapter 	*i2c_adap;
+	struct i2c_adapter	*i2c_adap;
 	u8			i2c_addr;
 
 	struct mutex		demod_lock; /* Lock access to shared register */
diff --git a/drivers/media/dvb-frontends/stv6110x.c b/drivers/media/dvb-frontends/stv6110x.c
index d4ac29ac9b4f..d8950028d021 100644
--- a/drivers/media/dvb-frontends/stv6110x.c
+++ b/drivers/media/dvb-frontends/stv6110x.c
@@ -46,7 +46,7 @@ static int stv6110x_read_reg(struct stv6110x_state *stv6110x, u8 reg, u8 *data)
 	u8 b0[] = { reg };
 	u8 b1[] = { 0 };
 	struct i2c_msg msg[] = {
-		{ .addr = config->addr, .flags = 0, 	   .buf = b0, .len = 1 },
+		{ .addr = config->addr, .flags = 0,	   .buf = b0, .len = 1 },
 		{ .addr = config->addr, .flags = I2C_M_RD, .buf = b1, .len = 1 }
 	};
 
diff --git a/drivers/media/dvb-frontends/stv6110x_priv.h b/drivers/media/dvb-frontends/stv6110x_priv.h
index a993aba27b7e..109dfaf4ba42 100644
--- a/drivers/media/dvb-frontends/stv6110x_priv.h
+++ b/drivers/media/dvb-frontends/stv6110x_priv.h
@@ -48,11 +48,11 @@
 
 #define STV6110x_SETFIELD(mask, bitf, val)				\
 	(mask = (mask & (~(((1 << STV6110x_WIDTH_##bitf) - 1) <<	\
-				  STV6110x_OFFST_##bitf))) | 		\
+				  STV6110x_OFFST_##bitf))) |		\
 			  (val << STV6110x_OFFST_##bitf))
 
 #define STV6110x_GETFIELD(bitf, val)					\
-	((val >> STV6110x_OFFST_##bitf) & 				\
+	((val >> STV6110x_OFFST_##bitf) &				\
 	((1 << STV6110x_WIDTH_##bitf) - 1))
 
 #define MAKEWORD16(a, b)			(((a) << 8) | (b))
@@ -68,7 +68,7 @@
 struct stv6110x_state {
 	struct i2c_adapter		*i2c;
 	const struct stv6110x_config	*config;
-	u8 				regs[8];
+	u8				regs[8];
 
 	const struct stv6110x_devctl	*devctl;
 };
diff --git a/drivers/media/dvb-frontends/tda10023.c b/drivers/media/dvb-frontends/tda10023.c
index abe27029fe93..6c84916234e3 100644
--- a/drivers/media/dvb-frontends/tda10023.c
+++ b/drivers/media/dvb-frontends/tda10023.c
@@ -211,7 +211,7 @@ static int tda10023_set_symbolrate (struct tda10023_state* state, u32 sr)
 
 		BDRX=1<<(24+NDEC);
 		BDRX*=sr;
-		do_div(BDRX, state->sysclk); 	/* BDRX/=SYSCLK; */
+		do_div(BDRX, state->sysclk);	/* BDRX/=SYSCLK; */
 
 		BDR=(s32)BDRX;
 	}
diff --git a/drivers/media/firewire/firedtv-avc.c b/drivers/media/firewire/firedtv-avc.c
index 37db04f8104d..1c933b2cf760 100644
--- a/drivers/media/firewire/firedtv-avc.c
+++ b/drivers/media/firewire/firedtv-avc.c
@@ -47,7 +47,7 @@
 #define AVC_OPCODE_DSIT			0xc8
 #define AVC_OPCODE_DSD			0xcb
 
-#define DESCRIPTOR_TUNER_STATUS 	0x80
+#define DESCRIPTOR_TUNER_STATUS		0x80
 #define DESCRIPTOR_SUBUNIT_IDENTIFIER	0x00
 
 #define SFE_VENDOR_DE_COMPANYID_0	0x00 /* OUI of Digital Everywhere */
@@ -688,7 +688,7 @@ int avc_tuner_get_ts(struct firedtv *fdtv)
 	c->operand[2] = 0xff;	/* status */
 	c->operand[3] = 0x20;	/* system id = DVB */
 	c->operand[4] = 0x00;	/* antenna number */
-	c->operand[5] = 0x0; 	/* system_specific_search_flags */
+	c->operand[5] = 0x0;	/* system_specific_search_flags */
 	c->operand[6] = sl;	/* system_specific_multiplex selection_length */
 	/*
 	 * operand[7]: valid_flags[0]
diff --git a/drivers/media/firewire/firedtv-fe.c b/drivers/media/firewire/firedtv-fe.c
index 86efeb10d2f2..a2ef4ede8ebe 100644
--- a/drivers/media/firewire/firedtv-fe.c
+++ b/drivers/media/firewire/firedtv-fe.c
@@ -165,7 +165,7 @@ void fdtv_frontend_init(struct firedtv *fdtv, const char *name)
 	ops->read_snr			= fdtv_read_snr;
 	ops->read_ucblocks		= fdtv_read_uncorrected_blocks;
 
-	ops->diseqc_send_master_cmd 	= fdtv_diseqc_send_master_cmd;
+	ops->diseqc_send_master_cmd	= fdtv_diseqc_send_master_cmd;
 	ops->diseqc_send_burst		= fdtv_diseqc_send_burst;
 	ops->set_tone			= fdtv_set_tone;
 	ops->set_voltage		= fdtv_set_voltage;
@@ -220,7 +220,7 @@ void fdtv_frontend_init(struct firedtv *fdtv, const char *name)
 		fi->symbol_rate_min	= 870000;
 		fi->symbol_rate_max	= 6900000;
 
-		fi->caps 		= FE_CAN_INVERSION_AUTO |
+		fi->caps		= FE_CAN_INVERSION_AUTO |
 					  FE_CAN_QAM_16		|
 					  FE_CAN_QAM_32		|
 					  FE_CAN_QAM_64		|
@@ -236,7 +236,7 @@ void fdtv_frontend_init(struct firedtv *fdtv, const char *name)
 		fi->frequency_max	= 861000000;
 		fi->frequency_stepsize	= 62500;
 
-		fi->caps 		= FE_CAN_INVERSION_AUTO		|
+		fi->caps		= FE_CAN_INVERSION_AUTO		|
 					  FE_CAN_FEC_2_3		|
 					  FE_CAN_TRANSMISSION_MODE_AUTO |
 					  FE_CAN_GUARD_INTERVAL_AUTO	|
diff --git a/drivers/media/i2c/cx25840/cx25840-core.c b/drivers/media/i2c/cx25840/cx25840-core.c
index 4a9c137095fe..98be63ae8590 100644
--- a/drivers/media/i2c/cx25840/cx25840-core.c
+++ b/drivers/media/i2c/cx25840/cx25840-core.c
@@ -1263,7 +1263,7 @@ static int set_input(struct i2c_client *client, enum cx25840_video_input vid_inp
 static int set_v4lstd(struct i2c_client *client)
 {
 	struct cx25840_state *state = to_state(i2c_get_clientdata(client));
-	u8 fmt = 0; 	/* zero is autodetect */
+	u8 fmt = 0;	/* zero is autodetect */
 	u8 pal_m = 0;
 
 	/* First tests should be against specific std */
diff --git a/drivers/media/i2c/cx25840/cx25840-core.h b/drivers/media/i2c/cx25840/cx25840-core.h
index 55432ed42714..fb13a624d2e3 100644
--- a/drivers/media/i2c/cx25840/cx25840-core.h
+++ b/drivers/media/i2c/cx25840/cx25840-core.h
@@ -118,7 +118,7 @@ static inline bool is_cx23888(struct cx25840_state *state)
 }
 
 /* ----------------------------------------------------------------------- */
-/* cx25850-core.c 							   */
+/* cx25850-core.c							   */
 int cx25840_write(struct i2c_client *client, u16 addr, u8 value);
 int cx25840_write4(struct i2c_client *client, u16 addr, u32 value);
 u8 cx25840_read(struct i2c_client *client, u16 addr);
diff --git a/drivers/media/i2c/cx25840/cx25840-ir.c b/drivers/media/i2c/cx25840/cx25840-ir.c
index 548382b2b2e6..ad7f66c7aac8 100644
--- a/drivers/media/i2c/cx25840/cx25840-ir.c
+++ b/drivers/media/i2c/cx25840/cx25840-ir.c
@@ -28,7 +28,7 @@ static unsigned int ir_debug;
 module_param(ir_debug, int, 0644);
 MODULE_PARM_DESC(ir_debug, "enable integrated IR debug messages");
 
-#define CX25840_IR_REG_BASE 	0x200
+#define CX25840_IR_REG_BASE	0x200
 
 #define CX25840_IR_CNTRL_REG	0x200
 #define CNTRL_WIN_3_3	0x00000000
diff --git a/drivers/media/i2c/ks0127.c b/drivers/media/i2c/ks0127.c
index ab536c4a7115..5905ed6f8397 100644
--- a/drivers/media/i2c/ks0127.c
+++ b/drivers/media/i2c/ks0127.c
@@ -195,7 +195,7 @@ struct adjust {
 struct ks0127 {
 	struct v4l2_subdev sd;
 	v4l2_std_id	norm;
-	u8 		regs[256];
+	u8		regs[256];
 };
 
 static inline struct ks0127 *to_ks0127(struct v4l2_subdev *sd)
diff --git a/drivers/media/i2c/ov7670.c b/drivers/media/i2c/ov7670.c
index c6c32f649777..fd229bc8a0e5 100644
--- a/drivers/media/i2c/ov7670.c
+++ b/drivers/media/i2c/ov7670.c
@@ -412,12 +412,12 @@ static struct regval_list ov7670_fmt_yuv422[] = {
 	{ REG_COM1, 0 },	/* CCIR601 */
 	{ REG_COM15, COM15_R00FF },
 	{ REG_COM9, 0x48 }, /* 32x gain ceiling; 0x8 is reserved bit */
-	{ 0x4f, 0x80 }, 	/* "matrix coefficient 1" */
-	{ 0x50, 0x80 }, 	/* "matrix coefficient 2" */
+	{ 0x4f, 0x80 },		/* "matrix coefficient 1" */
+	{ 0x50, 0x80 },		/* "matrix coefficient 2" */
 	{ 0x51, 0    },		/* vb */
-	{ 0x52, 0x22 }, 	/* "matrix coefficient 4" */
-	{ 0x53, 0x5e }, 	/* "matrix coefficient 5" */
-	{ 0x54, 0x80 }, 	/* "matrix coefficient 6" */
+	{ 0x52, 0x22 },		/* "matrix coefficient 4" */
+	{ 0x53, 0x5e },		/* "matrix coefficient 5" */
+	{ 0x54, 0x80 },		/* "matrix coefficient 6" */
 	{ REG_COM13, COM13_GAMMA|COM13_UVSAT },
 	{ 0xff, 0xff },
 };
@@ -427,13 +427,13 @@ static struct regval_list ov7670_fmt_rgb565[] = {
 	{ REG_RGB444, 0 },	/* No RGB444 please */
 	{ REG_COM1, 0x0 },	/* CCIR601 */
 	{ REG_COM15, COM15_RGB565 },
-	{ REG_COM9, 0x38 }, 	/* 16x gain ceiling; 0x8 is reserved bit */
-	{ 0x4f, 0xb3 }, 	/* "matrix coefficient 1" */
-	{ 0x50, 0xb3 }, 	/* "matrix coefficient 2" */
+	{ REG_COM9, 0x38 },	/* 16x gain ceiling; 0x8 is reserved bit */
+	{ 0x4f, 0xb3 },		/* "matrix coefficient 1" */
+	{ 0x50, 0xb3 },		/* "matrix coefficient 2" */
 	{ 0x51, 0    },		/* vb */
-	{ 0x52, 0x3d }, 	/* "matrix coefficient 4" */
-	{ 0x53, 0xa7 }, 	/* "matrix coefficient 5" */
-	{ 0x54, 0xe4 }, 	/* "matrix coefficient 6" */
+	{ 0x52, 0x3d },		/* "matrix coefficient 4" */
+	{ 0x53, 0xa7 },		/* "matrix coefficient 5" */
+	{ 0x54, 0xe4 },		/* "matrix coefficient 6" */
 	{ REG_COM13, COM13_GAMMA|COM13_UVSAT },
 	{ 0xff, 0xff },
 };
@@ -443,13 +443,13 @@ static struct regval_list ov7670_fmt_rgb444[] = {
 	{ REG_RGB444, R444_ENABLE },	/* Enable xxxxrrrr ggggbbbb */
 	{ REG_COM1, 0x0 },	/* CCIR601 */
 	{ REG_COM15, COM15_R01FE|COM15_RGB565 }, /* Data range needed? */
-	{ REG_COM9, 0x38 }, 	/* 16x gain ceiling; 0x8 is reserved bit */
-	{ 0x4f, 0xb3 }, 	/* "matrix coefficient 1" */
-	{ 0x50, 0xb3 }, 	/* "matrix coefficient 2" */
+	{ REG_COM9, 0x38 },	/* 16x gain ceiling; 0x8 is reserved bit */
+	{ 0x4f, 0xb3 },		/* "matrix coefficient 1" */
+	{ 0x50, 0xb3 },		/* "matrix coefficient 2" */
 	{ 0x51, 0    },		/* vb */
-	{ 0x52, 0x3d }, 	/* "matrix coefficient 4" */
-	{ 0x53, 0xa7 }, 	/* "matrix coefficient 5" */
-	{ 0x54, 0xe4 }, 	/* "matrix coefficient 6" */
+	{ 0x52, 0x3d },		/* "matrix coefficient 4" */
+	{ 0x53, 0xa7 },		/* "matrix coefficient 5" */
+	{ 0x54, 0xe4 },		/* "matrix coefficient 6" */
 	{ REG_COM13, COM13_GAMMA|COM13_UVSAT|0x2 },  /* Magic rsvd bit */
 	{ 0xff, 0xff },
 };
@@ -667,7 +667,7 @@ static struct ov7670_format_struct {
 	{
 		.mbus_code	= MEDIA_BUS_FMT_YUYV8_2X8,
 		.colorspace	= V4L2_COLORSPACE_SRGB,
-		.regs 		= ov7670_fmt_yuv422,
+		.regs		= ov7670_fmt_yuv422,
 		.cmatrix	= { 128, -128, 0, -34, -94, 128 },
 	},
 	{
@@ -685,7 +685,7 @@ static struct ov7670_format_struct {
 	{
 		.mbus_code	= MEDIA_BUS_FMT_SBGGR8_1X8,
 		.colorspace	= V4L2_COLORSPACE_SRGB,
-		.regs 		= ov7670_fmt_raw,
+		.regs		= ov7670_fmt_raw,
 		.cmatrix	= { 0, 0, 0, 0, 0, 0 },
 	},
 };
diff --git a/drivers/media/i2c/saa6752hs.c b/drivers/media/i2c/saa6752hs.c
index 7202d3a3219a..170cc65c4f23 100644
--- a/drivers/media/i2c/saa6752hs.c
+++ b/drivers/media/i2c/saa6752hs.c
@@ -72,8 +72,8 @@ struct saa6752hs_mpeg_params {
 	/* video */
 	enum v4l2_mpeg_video_aspect	vi_aspect;
 	enum v4l2_mpeg_video_bitrate_mode vi_bitrate_mode;
-	__u32 				vi_bitrate;
-	__u32 				vi_bitrate_peak;
+	__u32				vi_bitrate;
+	__u32				vi_bitrate_peak;
 };
 
 static const struct v4l2_format v4l2_format_table[] =
@@ -98,8 +98,8 @@ struct saa6752hs_state {
 		struct v4l2_ctrl *video_bitrate;
 		struct v4l2_ctrl *video_bitrate_peak;
 	};
-	u32 			      revision;
-	int 			      has_ac3;
+	u32			      revision;
+	int			      has_ac3;
 	struct saa6752hs_mpeg_params  params;
 	enum saa6752hs_videoformat    video_format;
 	v4l2_std_id                   standard;
diff --git a/drivers/media/i2c/saa7115.c b/drivers/media/i2c/saa7115.c
index 7dd6cff6d811..e216cd768409 100644
--- a/drivers/media/i2c/saa7115.c
+++ b/drivers/media/i2c/saa7115.c
@@ -748,7 +748,7 @@ static int saa711x_s_clock_freq(struct v4l2_subdev *sd, u32 freq)
 	u32 acni;
 	u32 hz;
 	u64 f;
-	u8 acc = 0; 	/* reg 0x3a, audio clock control */
+	u8 acc = 0;	/* reg 0x3a, audio clock control */
 
 	/* Checks for chips that don't have audio clock (saa7111, saa7113) */
 	if (!saa711x_has_reg(state->ident, R_30_AUD_MAST_CLK_CYCLES_PER_FIELD))
diff --git a/drivers/media/i2c/saa7127.c b/drivers/media/i2c/saa7127.c
index 01784d441ae6..e58a150cec5c 100644
--- a/drivers/media/i2c/saa7127.c
+++ b/drivers/media/i2c/saa7127.c
@@ -132,109 +132,109 @@ struct i2c_reg_value {
 };
 
 static const struct i2c_reg_value saa7129_init_config_extra[] = {
-	{ SAA7127_REG_OUTPUT_PORT_CONTROL, 		0x38 },
-	{ SAA7127_REG_VTRIG, 				0xfa },
+	{ SAA7127_REG_OUTPUT_PORT_CONTROL,		0x38 },
+	{ SAA7127_REG_VTRIG,				0xfa },
 	{ 0, 0 }
 };
 
 static const struct i2c_reg_value saa7127_init_config_common[] = {
-	{ SAA7127_REG_WIDESCREEN_CONFIG, 		0x0d },
-	{ SAA7127_REG_WIDESCREEN_ENABLE, 		0x00 },
-	{ SAA7127_REG_COPYGEN_0, 			0x77 },
-	{ SAA7127_REG_COPYGEN_1, 			0x41 },
-	{ SAA7127_REG_COPYGEN_2, 			0x00 },	/* Macrovision enable/disable */
-	{ SAA7127_REG_OUTPUT_PORT_CONTROL, 		0xbf },
-	{ SAA7127_REG_GAIN_LUMINANCE_RGB, 		0x00 },
-	{ SAA7127_REG_GAIN_COLORDIFF_RGB, 		0x00 },
-	{ SAA7127_REG_INPUT_PORT_CONTROL_1, 		0x80 },	/* for color bars */
-	{ SAA7127_REG_LINE_21_ODD_0, 			0x77 },
-	{ SAA7127_REG_LINE_21_ODD_1, 			0x41 },
-	{ SAA7127_REG_LINE_21_EVEN_0, 			0x88 },
-	{ SAA7127_REG_LINE_21_EVEN_1, 			0x41 },
-	{ SAA7127_REG_RCV_PORT_CONTROL, 		0x12 },
-	{ SAA7127_REG_VTRIG, 				0xf9 },
-	{ SAA7127_REG_HTRIG_HI, 			0x00 },
-	{ SAA7127_REG_RCV2_OUTPUT_START, 		0x41 },
-	{ SAA7127_REG_RCV2_OUTPUT_END, 			0xc3 },
-	{ SAA7127_REG_RCV2_OUTPUT_MSBS, 		0x00 },
-	{ SAA7127_REG_TTX_REQUEST_H_START, 		0x3e },
-	{ SAA7127_REG_TTX_REQUEST_H_DELAY_LENGTH, 	0xb8 },
-	{ SAA7127_REG_CSYNC_ADVANCE_VSYNC_SHIFT,  	0x03 },
-	{ SAA7127_REG_TTX_ODD_REQ_VERT_START, 		0x15 },
-	{ SAA7127_REG_TTX_ODD_REQ_VERT_END, 		0x16 },
-	{ SAA7127_REG_TTX_EVEN_REQ_VERT_START, 		0x15 },
-	{ SAA7127_REG_TTX_EVEN_REQ_VERT_END, 		0x16 },
-	{ SAA7127_REG_FIRST_ACTIVE, 			0x1a },
-	{ SAA7127_REG_LAST_ACTIVE, 			0x01 },
-	{ SAA7127_REG_MSB_VERTICAL, 			0xc0 },
-	{ SAA7127_REG_DISABLE_TTX_LINE_LO_0, 		0x00 },
-	{ SAA7127_REG_DISABLE_TTX_LINE_LO_1, 		0x00 },
+	{ SAA7127_REG_WIDESCREEN_CONFIG,		0x0d },
+	{ SAA7127_REG_WIDESCREEN_ENABLE,		0x00 },
+	{ SAA7127_REG_COPYGEN_0,			0x77 },
+	{ SAA7127_REG_COPYGEN_1,			0x41 },
+	{ SAA7127_REG_COPYGEN_2,			0x00 },	/* Macrovision enable/disable */
+	{ SAA7127_REG_OUTPUT_PORT_CONTROL,		0xbf },
+	{ SAA7127_REG_GAIN_LUMINANCE_RGB,		0x00 },
+	{ SAA7127_REG_GAIN_COLORDIFF_RGB,		0x00 },
+	{ SAA7127_REG_INPUT_PORT_CONTROL_1,		0x80 },	/* for color bars */
+	{ SAA7127_REG_LINE_21_ODD_0,			0x77 },
+	{ SAA7127_REG_LINE_21_ODD_1,			0x41 },
+	{ SAA7127_REG_LINE_21_EVEN_0,			0x88 },
+	{ SAA7127_REG_LINE_21_EVEN_1,			0x41 },
+	{ SAA7127_REG_RCV_PORT_CONTROL,			0x12 },
+	{ SAA7127_REG_VTRIG,				0xf9 },
+	{ SAA7127_REG_HTRIG_HI,				0x00 },
+	{ SAA7127_REG_RCV2_OUTPUT_START,		0x41 },
+	{ SAA7127_REG_RCV2_OUTPUT_END,			0xc3 },
+	{ SAA7127_REG_RCV2_OUTPUT_MSBS,			0x00 },
+	{ SAA7127_REG_TTX_REQUEST_H_START,		0x3e },
+	{ SAA7127_REG_TTX_REQUEST_H_DELAY_LENGTH,	0xb8 },
+	{ SAA7127_REG_CSYNC_ADVANCE_VSYNC_SHIFT,	0x03 },
+	{ SAA7127_REG_TTX_ODD_REQ_VERT_START,		0x15 },
+	{ SAA7127_REG_TTX_ODD_REQ_VERT_END,		0x16 },
+	{ SAA7127_REG_TTX_EVEN_REQ_VERT_START,		0x15 },
+	{ SAA7127_REG_TTX_EVEN_REQ_VERT_END,		0x16 },
+	{ SAA7127_REG_FIRST_ACTIVE,			0x1a },
+	{ SAA7127_REG_LAST_ACTIVE,			0x01 },
+	{ SAA7127_REG_MSB_VERTICAL,			0xc0 },
+	{ SAA7127_REG_DISABLE_TTX_LINE_LO_0,		0x00 },
+	{ SAA7127_REG_DISABLE_TTX_LINE_LO_1,		0x00 },
 	{ 0, 0 }
 };
 
 #define SAA7127_60HZ_DAC_CONTROL 0x15
 static const struct i2c_reg_value saa7127_init_config_60hz[] = {
-	{ SAA7127_REG_BURST_START, 			0x19 },
+	{ SAA7127_REG_BURST_START,			0x19 },
 	/* BURST_END is also used as a chip ID in saa7127_probe */
-	{ SAA7127_REG_BURST_END, 			0x1d },
-	{ SAA7127_REG_CHROMA_PHASE, 			0xa3 },
-	{ SAA7127_REG_GAINU, 				0x98 },
-	{ SAA7127_REG_GAINV, 				0xd3 },
-	{ SAA7127_REG_BLACK_LEVEL, 			0x39 },
-	{ SAA7127_REG_BLANKING_LEVEL, 			0x2e },
-	{ SAA7127_REG_VBI_BLANKING, 			0x2e },
-	{ SAA7127_REG_DAC_CONTROL, 			0x15 },
-	{ SAA7127_REG_BURST_AMP, 			0x4d },
-	{ SAA7127_REG_SUBC3, 				0x1f },
-	{ SAA7127_REG_SUBC2, 				0x7c },
-	{ SAA7127_REG_SUBC1, 				0xf0 },
-	{ SAA7127_REG_SUBC0, 				0x21 },
-	{ SAA7127_REG_MULTI, 				0x90 },
-	{ SAA7127_REG_CLOSED_CAPTION, 			0x11 },
+	{ SAA7127_REG_BURST_END,			0x1d },
+	{ SAA7127_REG_CHROMA_PHASE,			0xa3 },
+	{ SAA7127_REG_GAINU,				0x98 },
+	{ SAA7127_REG_GAINV,				0xd3 },
+	{ SAA7127_REG_BLACK_LEVEL,			0x39 },
+	{ SAA7127_REG_BLANKING_LEVEL,			0x2e },
+	{ SAA7127_REG_VBI_BLANKING,			0x2e },
+	{ SAA7127_REG_DAC_CONTROL,			0x15 },
+	{ SAA7127_REG_BURST_AMP,			0x4d },
+	{ SAA7127_REG_SUBC3,				0x1f },
+	{ SAA7127_REG_SUBC2,				0x7c },
+	{ SAA7127_REG_SUBC1,				0xf0 },
+	{ SAA7127_REG_SUBC0,				0x21 },
+	{ SAA7127_REG_MULTI,				0x90 },
+	{ SAA7127_REG_CLOSED_CAPTION,			0x11 },
 	{ 0, 0 }
 };
 
 #define SAA7127_50HZ_PAL_DAC_CONTROL 0x02
 static struct i2c_reg_value saa7127_init_config_50hz_pal[] = {
-	{ SAA7127_REG_BURST_START, 			0x21 },
+	{ SAA7127_REG_BURST_START,			0x21 },
 	/* BURST_END is also used as a chip ID in saa7127_probe */
-	{ SAA7127_REG_BURST_END, 			0x1d },
-	{ SAA7127_REG_CHROMA_PHASE, 			0x3f },
-	{ SAA7127_REG_GAINU, 				0x7d },
-	{ SAA7127_REG_GAINV, 				0xaf },
-	{ SAA7127_REG_BLACK_LEVEL, 			0x33 },
-	{ SAA7127_REG_BLANKING_LEVEL, 			0x35 },
-	{ SAA7127_REG_VBI_BLANKING, 			0x35 },
-	{ SAA7127_REG_DAC_CONTROL, 			0x02 },
-	{ SAA7127_REG_BURST_AMP, 			0x2f },
-	{ SAA7127_REG_SUBC3, 				0xcb },
-	{ SAA7127_REG_SUBC2, 				0x8a },
-	{ SAA7127_REG_SUBC1, 				0x09 },
-	{ SAA7127_REG_SUBC0, 				0x2a },
-	{ SAA7127_REG_MULTI, 				0xa0 },
-	{ SAA7127_REG_CLOSED_CAPTION, 			0x00 },
+	{ SAA7127_REG_BURST_END,			0x1d },
+	{ SAA7127_REG_CHROMA_PHASE,			0x3f },
+	{ SAA7127_REG_GAINU,				0x7d },
+	{ SAA7127_REG_GAINV,				0xaf },
+	{ SAA7127_REG_BLACK_LEVEL,			0x33 },
+	{ SAA7127_REG_BLANKING_LEVEL,			0x35 },
+	{ SAA7127_REG_VBI_BLANKING,			0x35 },
+	{ SAA7127_REG_DAC_CONTROL,			0x02 },
+	{ SAA7127_REG_BURST_AMP,			0x2f },
+	{ SAA7127_REG_SUBC3,				0xcb },
+	{ SAA7127_REG_SUBC2,				0x8a },
+	{ SAA7127_REG_SUBC1,				0x09 },
+	{ SAA7127_REG_SUBC0,				0x2a },
+	{ SAA7127_REG_MULTI,				0xa0 },
+	{ SAA7127_REG_CLOSED_CAPTION,			0x00 },
 	{ 0, 0 }
 };
 
 #define SAA7127_50HZ_SECAM_DAC_CONTROL 0x08
 static struct i2c_reg_value saa7127_init_config_50hz_secam[] = {
-	{ SAA7127_REG_BURST_START, 			0x21 },
+	{ SAA7127_REG_BURST_START,			0x21 },
 	/* BURST_END is also used as a chip ID in saa7127_probe */
-	{ SAA7127_REG_BURST_END, 			0x1d },
-	{ SAA7127_REG_CHROMA_PHASE, 			0x3f },
-	{ SAA7127_REG_GAINU, 				0x6a },
-	{ SAA7127_REG_GAINV, 				0x81 },
-	{ SAA7127_REG_BLACK_LEVEL, 			0x33 },
-	{ SAA7127_REG_BLANKING_LEVEL, 			0x35 },
-	{ SAA7127_REG_VBI_BLANKING, 			0x35 },
-	{ SAA7127_REG_DAC_CONTROL, 			0x08 },
-	{ SAA7127_REG_BURST_AMP, 			0x2f },
-	{ SAA7127_REG_SUBC3, 				0xb2 },
-	{ SAA7127_REG_SUBC2, 				0x3b },
-	{ SAA7127_REG_SUBC1, 				0xa3 },
-	{ SAA7127_REG_SUBC0, 				0x28 },
-	{ SAA7127_REG_MULTI, 				0x90 },
-	{ SAA7127_REG_CLOSED_CAPTION, 			0x00 },
+	{ SAA7127_REG_BURST_END,			0x1d },
+	{ SAA7127_REG_CHROMA_PHASE,			0x3f },
+	{ SAA7127_REG_GAINU,				0x6a },
+	{ SAA7127_REG_GAINV,				0x81 },
+	{ SAA7127_REG_BLACK_LEVEL,			0x33 },
+	{ SAA7127_REG_BLANKING_LEVEL,			0x35 },
+	{ SAA7127_REG_VBI_BLANKING,			0x35 },
+	{ SAA7127_REG_DAC_CONTROL,			0x08 },
+	{ SAA7127_REG_BURST_AMP,			0x2f },
+	{ SAA7127_REG_SUBC3,				0xb2 },
+	{ SAA7127_REG_SUBC2,				0x3b },
+	{ SAA7127_REG_SUBC1,				0xa3 },
+	{ SAA7127_REG_SUBC0,				0x28 },
+	{ SAA7127_REG_MULTI,				0x90 },
+	{ SAA7127_REG_CLOSED_CAPTION,			0x00 },
 	{ 0, 0 }
 };
 
diff --git a/drivers/media/i2c/saa717x.c b/drivers/media/i2c/saa717x.c
index 102467e00fb3..668c39cc29e8 100644
--- a/drivers/media/i2c/saa717x.c
+++ b/drivers/media/i2c/saa717x.c
@@ -82,13 +82,13 @@ static inline struct v4l2_subdev *to_sd(struct v4l2_ctrl *ctrl)
 /* ----------------------------------------------------------------------- */
 
 /* for audio mode */
-#define TUNER_AUDIO_MONO   	0  /* LL */
-#define TUNER_AUDIO_STEREO 	1  /* LR */
-#define TUNER_AUDIO_LANG1  	2  /* LL */
-#define TUNER_AUDIO_LANG2  	3  /* RR */
+#define TUNER_AUDIO_MONO	0  /* LL */
+#define TUNER_AUDIO_STEREO	1  /* LR */
+#define TUNER_AUDIO_LANG1	2  /* LL */
+#define TUNER_AUDIO_LANG2	3  /* RR */
 
-#define SAA717X_NTSC_WIDTH   	(704)
-#define SAA717X_NTSC_HEIGHT  	(480)
+#define SAA717X_NTSC_WIDTH	(704)
+#define SAA717X_NTSC_HEIGHT	(480)
 
 /* ----------------------------------------------------------------------- */
 
diff --git a/drivers/media/i2c/ths7303.c b/drivers/media/i2c/ths7303.c
index 71a31352135c..8206bf7a5a8f 100644
--- a/drivers/media/i2c/ths7303.c
+++ b/drivers/media/i2c/ths7303.c
@@ -319,7 +319,7 @@ static const struct v4l2_subdev_core_ops ths7303_core_ops = {
 
 static const struct v4l2_subdev_ops ths7303_ops = {
 	.core	= &ths7303_core_ops,
-	.video 	= &ths7303_video_ops,
+	.video	= &ths7303_video_ops,
 };
 
 static int ths7303_probe(struct i2c_client *client,
diff --git a/drivers/media/i2c/tvaudio.c b/drivers/media/i2c/tvaudio.c
index e6edda524856..772164b848ef 100644
--- a/drivers/media/i2c/tvaudio.c
+++ b/drivers/media/i2c/tvaudio.c
@@ -134,7 +134,7 @@ struct CHIPSTATE {
 	/* thread */
 	struct task_struct   *thread;
 	struct timer_list    wt;
-	int 		     audmode;
+	int		     audmode;
 };
 
 static inline struct CHIPSTATE *to_state(struct v4l2_subdev *sd)
diff --git a/drivers/media/i2c/tvp7002_reg.h b/drivers/media/i2c/tvp7002_reg.h
index 933673561fa2..3c8c8b0a6a4c 100644
--- a/drivers/media/i2c/tvp7002_reg.h
+++ b/drivers/media/i2c/tvp7002_reg.h
@@ -109,15 +109,15 @@
 #define TVP7002_L_FRAME_STAT_LSBS	0x37
 #define TVP7002_L_FRAME_STAT_MSBS	0x38
 #define TVP7002_CLK_L_STAT_LSBS		0x39
-#define TVP7002_CLK_L_STAT_MSBS      	0x3a
+#define TVP7002_CLK_L_STAT_MSBS		0x3a
 #define TVP7002_HSYNC_W			0x3b
 #define TVP7002_VSYNC_W                 0x3c
-#define TVP7002_L_LENGTH_TOL 		0x3d
+#define TVP7002_L_LENGTH_TOL		0x3d
 /* Reserved 0x3e */
 #define TVP7002_VIDEO_BWTH_CTL		0x3f
 #define TVP7002_AVID_START_PIXEL_LSBS	0x40
 #define TVP7002_AVID_START_PIXEL_MSBS   0x41
-#define TVP7002_AVID_STOP_PIXEL_LSBS  	0x42
+#define TVP7002_AVID_STOP_PIXEL_LSBS	0x42
 #define TVP7002_AVID_STOP_PIXEL_MSBS    0x43
 #define TVP7002_VBLK_F_0_START_L_OFF	0x44
 #define TVP7002_VBLK_F_1_START_L_OFF    0x45
diff --git a/drivers/media/i2c/vpx3220.c b/drivers/media/i2c/vpx3220.c
index 67de79b2d550..c3549fa55b62 100644
--- a/drivers/media/i2c/vpx3220.c
+++ b/drivers/media/i2c/vpx3220.c
@@ -201,7 +201,7 @@ static const unsigned short init_pal[] = {
 				 * skipped by the VFE) */
 	0x8b, 16,		/* Horizontal begin */
 	0x8c, 768,		/* Horizontal length */
-	0x8d, 784, 		/* Number of pixels
+	0x8d, 784,		/* Number of pixels
 				 * Must be >= Horizontal begin + Horizontal length */
 	0x8f, 0xc00,		/* Disable window 2 */
 	0xf0, 0x77,		/* 13.5 MHz transport, Forced
diff --git a/drivers/media/pci/bt8xx/bttv-cards.c b/drivers/media/pci/bt8xx/bttv-cards.c
index 7dcf509e66d9..1902732f90e1 100644
--- a/drivers/media/pci/bt8xx/bttv-cards.c
+++ b/drivers/media/pci/bt8xx/bttv-cards.c
@@ -373,8 +373,8 @@ struct tvcard bttv_tvcards[] = {
 		.svhs		= 2,
 		.gpiomask	= 15,
 		.muxsel		= MUXSEL(2, 3, 1, 1),
-		.gpiomux 	= { 2, 0, 0, 0 },
-		.gpiomute 	= 10,
+		.gpiomux	= { 2, 0, 0, 0 },
+		.gpiomute	= 10,
 		.tuner_type	= UNSET,
 		.tuner_addr	= ADDR_UNSET,
 	},
@@ -385,8 +385,8 @@ struct tvcard bttv_tvcards[] = {
 		.svhs		= 2,
 		.gpiomask	= 7,
 		.muxsel		= MUXSEL(2, 3, 1, 1),
-		.gpiomux 	= { 0, 1, 2, 3 },
-		.gpiomute 	= 4,
+		.gpiomux	= { 0, 1, 2, 3 },
+		.gpiomute	= 4,
 		.tuner_type	= UNSET,
 		.tuner_addr	= ADDR_UNSET,
 	},
@@ -397,8 +397,8 @@ struct tvcard bttv_tvcards[] = {
 		.svhs		= 2,
 		.gpiomask	= 7,
 		.muxsel		= MUXSEL(2, 3, 1, 1),
-		.gpiomux 	= { 4, 0, 2, 3 },
-		.gpiomute 	= 1,
+		.gpiomux	= { 4, 0, 2, 3 },
+		.gpiomute	= 1,
 		.no_msp34xx	= 1,
 		.tuner_type     = TUNER_PHILIPS_NTSC,
 		.tuner_addr	= ADDR_UNSET,
@@ -414,7 +414,7 @@ struct tvcard bttv_tvcards[] = {
 		.svhs		= 2,
 		.gpiomask	= 0,
 		.muxsel		= MUXSEL(2, 3, 1, 1),
-		.gpiomux 	= { 0 },
+		.gpiomux	= { 0 },
 		.tuner_type	= TUNER_ABSENT,
 		.tuner_addr	= ADDR_UNSET,
 	},
@@ -425,8 +425,8 @@ struct tvcard bttv_tvcards[] = {
 		.svhs		= 2,
 		.gpiomask	= 3,
 		.muxsel		= MUXSEL(2, 3, 1, 0),
-		.gpiomux 	= { 0, 1, 0, 1 },
-		.gpiomute 	= 3,
+		.gpiomux	= { 0, 1, 0, 1 },
+		.gpiomute	= 3,
 		.tuner_type	= UNSET,
 		.tuner_addr	= ADDR_UNSET,
 	},
@@ -437,7 +437,7 @@ struct tvcard bttv_tvcards[] = {
 		.svhs		= 3,
 		.muxsel		= MUXSEL(2, 3, 1, 1),
 		.gpiomask	= 0x0f,
-		.gpiomux 	= { 0x0c, 0x04, 0x08, 0x04 },
+		.gpiomux	= { 0x0c, 0x04, 0x08, 0x04 },
 		/*                0x04 for some cards ?? */
 		.tuner_type	= UNSET,
 		.tuner_addr	= ADDR_UNSET,
@@ -451,7 +451,7 @@ struct tvcard bttv_tvcards[] = {
 		.svhs		= 3,
 		.gpiomask	= 0,
 		.muxsel		= MUXSEL(2, 3, 1, 0, 0),
-		.gpiomux 	= { 0 },
+		.gpiomux	= { 0 },
 		.tuner_type	= TUNER_ABSENT,
 		.tuner_addr	= ADDR_UNSET,
 	},
@@ -464,8 +464,8 @@ struct tvcard bttv_tvcards[] = {
 		.svhs		= 2,
 		.gpiomask	= 0xc00,
 		.muxsel		= MUXSEL(2, 3, 1, 1),
-		.gpiomux 	= { 0, 0xc00, 0x800, 0x400 },
-		.gpiomute 	= 0xc00,
+		.gpiomux	= { 0, 0xc00, 0x800, 0x400 },
+		.gpiomute	= 0xc00,
 		.pll		= PLL_28,
 		.tuner_type	= UNSET,
 		.tuner_addr	= ADDR_UNSET,
@@ -477,7 +477,7 @@ struct tvcard bttv_tvcards[] = {
 		.svhs		= 2,
 		.gpiomask	= 3,
 		.muxsel		= MUXSEL(2, 3, 1, 1),
-		.gpiomux 	= { 1, 1, 2, 3 },
+		.gpiomux	= { 1, 1, 2, 3 },
 		.pll		= PLL_28,
 		.tuner_type	= TUNER_TEMIC_PAL,
 		.tuner_addr	= ADDR_UNSET,
@@ -489,8 +489,8 @@ struct tvcard bttv_tvcards[] = {
 		.svhs		= 2,
 		.gpiomask	= 0x0f, /* old: 7 */
 		.muxsel		= MUXSEL(2, 0, 1, 1),
-		.gpiomux 	= { 0, 1, 2, 3 },
-		.gpiomute 	= 4,
+		.gpiomux	= { 0, 1, 2, 3 },
+		.gpiomute	= 4,
 		.pll		= PLL_28,
 		.tuner_type	= UNSET,
 		.tuner_addr	= ADDR_UNSET,
@@ -502,8 +502,8 @@ struct tvcard bttv_tvcards[] = {
 		.svhs		= 2,
 		.gpiomask	= 0x3014f,
 		.muxsel		= MUXSEL(2, 3, 1, 1),
-		.gpiomux 	= { 0x20001,0x10001, 0, 0 },
-		.gpiomute 	= 10,
+		.gpiomux	= { 0x20001,0x10001, 0, 0 },
+		.gpiomute	= 10,
 		.tuner_type	= UNSET,
 		.tuner_addr	= ADDR_UNSET,
 	},
@@ -516,7 +516,7 @@ struct tvcard bttv_tvcards[] = {
 		.svhs		= 2,
 		.gpiomask	= 15,
 		.muxsel		= MUXSEL(2, 3, 1, 1),
-		.gpiomux 	= { 13, 14, 11, 7 },
+		.gpiomux	= { 13, 14, 11, 7 },
 		.tuner_type	= UNSET,
 		.tuner_addr	= ADDR_UNSET,
 	},
@@ -527,7 +527,7 @@ struct tvcard bttv_tvcards[] = {
 		.svhs		= 2,
 		.gpiomask	= 15,
 		.muxsel		= MUXSEL(2, 3, 1, 1),
-		.gpiomux 	= { 13, 14, 11, 7 },
+		.gpiomux	= { 13, 14, 11, 7 },
 		.msp34xx_alt    = 1,
 		.pll		= PLL_28,
 		.tuner_type	= TUNER_PHILIPS_PAL,
@@ -542,8 +542,8 @@ struct tvcard bttv_tvcards[] = {
 		.svhs		= 2,
 		.gpiomask	= 7,
 		.muxsel		= MUXSEL(2, 3, 1, 1),
-		.gpiomux 	= { 0, 2, 1, 3 }, /* old: {0, 1, 2, 3, 4} */
-		.gpiomute 	= 4,
+		.gpiomux	= { 0, 2, 1, 3 }, /* old: {0, 1, 2, 3, 4} */
+		.gpiomute	= 4,
 		.pll		= PLL_28,
 		.tuner_type	= UNSET,
 		.tuner_addr	= ADDR_UNSET,
@@ -555,8 +555,8 @@ struct tvcard bttv_tvcards[] = {
 		.svhs		= 2,
 		.gpiomask	= 15,
 		.muxsel		= MUXSEL(2, 3, 1, 1),
-		.gpiomux 	= { 0, 0, 1, 0 },
-		.gpiomute 	= 10,
+		.gpiomux	= { 0, 0, 1, 0 },
+		.gpiomute	= 10,
 		.tuner_type	= UNSET,
 		.tuner_addr	= ADDR_UNSET,
 	},
@@ -571,7 +571,7 @@ struct tvcard bttv_tvcards[] = {
 		.muxsel		= MUXSEL(2, 3, 1, 1),
 		/* 2003-10-20 by "Anton A. Arapov" <arapov@mail.ru> */
 		.gpiomux        = { 0x001e00, 0, 0x018000, 0x014000 },
-		.gpiomute 	= 0x002000,
+		.gpiomute	= 0x002000,
 		.pll		= PLL_28,
 		.tuner_type	= UNSET,
 		.tuner_addr     = ADDR_UNSET,
@@ -583,8 +583,8 @@ struct tvcard bttv_tvcards[] = {
 		.svhs		= 2,
 		.gpiomask	= 0x8300f8,
 		.muxsel		= MUXSEL(2, 3, 1, 1, 0),
-		.gpiomux 	= { 0x4fa007,0xcfa007,0xcfa007,0xcfa007 },
-		.gpiomute 	= 0xcfa007,
+		.gpiomux	= { 0x4fa007,0xcfa007,0xcfa007,0xcfa007 },
+		.gpiomute	= 0xcfa007,
 		.tuner_type	= UNSET,
 		.tuner_addr	= ADDR_UNSET,
 		.volume_gpio	= winview_volume,
@@ -597,7 +597,7 @@ struct tvcard bttv_tvcards[] = {
 		.svhs		= 2,
 		.gpiomask	= 0,
 		.muxsel		= MUXSEL(2, 3, 1, 1),
-		.gpiomux 	= { 1, 0, 0, 0 },
+		.gpiomux	= { 1, 0, 0, 0 },
 		.tuner_type	= UNSET,
 		.tuner_addr	= ADDR_UNSET,
 	},
@@ -608,7 +608,7 @@ struct tvcard bttv_tvcards[] = {
 		.svhs		= NO_SVHS,
 		.gpiomask	= 0x8dff00,
 		.muxsel		= MUXSEL(2, 3, 1, 1),
-		.gpiomux 	= { 0 },
+		.gpiomux	= { 0 },
 		.no_msp34xx	= 1,
 		.tuner_type	= TUNER_ABSENT,
 		.tuner_addr	= ADDR_UNSET,
@@ -631,8 +631,8 @@ struct tvcard bttv_tvcards[] = {
 		.svhs		= 2,
 		.gpiomask	= 0x1800,
 		.muxsel		= MUXSEL(2, 3, 1, 1),
-		.gpiomux 	= { 0, 0x800, 0x1000, 0x1000 },
-		.gpiomute 	= 0x1800,
+		.gpiomux	= { 0, 0x800, 0x1000, 0x1000 },
+		.gpiomute	= 0x1800,
 		.pll		= PLL_28,
 		.tuner_type	= TUNER_PHILIPS_PAL_I,
 		.tuner_addr	= ADDR_UNSET,
@@ -644,8 +644,8 @@ struct tvcard bttv_tvcards[] = {
 		.svhs		= 2,
 		.gpiomask	= 0xc00,
 		.muxsel		= MUXSEL(2, 3, 1, 1),
-		.gpiomux 	= { 0, 1, 0x800, 0x400 },
-		.gpiomute 	= 0xc00,
+		.gpiomux	= { 0, 1, 0x800, 0x400 },
+		.gpiomute	= 0xc00,
 		.pll		= PLL_28,
 		.tuner_type	= UNSET,
 		.tuner_addr	= ADDR_UNSET,
@@ -659,7 +659,7 @@ struct tvcard bttv_tvcards[] = {
 		.gpiomask	= 7,
 		.muxsel		= MUXSEL(2, 3, 0), /* input 2 is digital */
 		/* .digital_mode= DIGITAL_MODE_CAMERA, */
-		.gpiomux 	= { 0, 0, 0, 0 },
+		.gpiomux	= { 0, 0, 0, 0 },
 		.no_msp34xx	= 1,
 		.pll            = PLL_28,
 		.tuner_type     = TUNER_ALPS_TSBB5_PAL_I,
@@ -674,8 +674,8 @@ struct tvcard bttv_tvcards[] = {
 		.svhs		= 2,
 		.gpiomask	= 0xe00,
 		.muxsel		= MUXSEL(2, 3, 1, 1),
-		.gpiomux 	= {0x400, 0x400, 0x400, 0x400 },
-		.gpiomute 	= 0xc00,
+		.gpiomux	= {0x400, 0x400, 0x400, 0x400 },
+		.gpiomute	= 0xc00,
 		.pll		= PLL_28,
 		.tuner_type	= UNSET,
 		.tuner_addr	= ADDR_UNSET,
@@ -690,7 +690,7 @@ struct tvcard bttv_tvcards[] = {
 		.gpiomask       = 0x1f0fff,
 		.muxsel		= MUXSEL(2, 3, 1, 1),
 		.gpiomux        = { 0x20000, 0x30000, 0x10000, 0 },
-		.gpiomute 	= 0x40000,
+		.gpiomute	= 0x40000,
 		.tuner_type	= TUNER_PHILIPS_PAL,
 		.tuner_addr	= ADDR_UNSET,
 		.audio_mode_gpio= terratv_audio,
@@ -702,8 +702,8 @@ struct tvcard bttv_tvcards[] = {
 		.svhs		= 3,
 		.gpiomask	= 7,
 		.muxsel		= MUXSEL(2, 0, 1, 1),
-		.gpiomux 	= { 0, 1, 2, 3 },
-		.gpiomute 	= 4,
+		.gpiomux	= { 0, 1, 2, 3 },
+		.gpiomute	= 4,
 		.tuner_type	= UNSET,
 		.tuner_addr	= ADDR_UNSET,
 	},
@@ -714,8 +714,8 @@ struct tvcard bttv_tvcards[] = {
 		.svhs		= 2,
 		.gpiomask	= 0x1800,
 		.muxsel		= MUXSEL(2, 3, 1, 1),
-		.gpiomux 	= { 0, 0x800, 0x1000, 0x1000 },
-		.gpiomute 	= 0x1800,
+		.gpiomux	= { 0, 0x800, 0x1000, 0x1000 },
+		.gpiomute	= 0x1800,
 		.pll            = PLL_28,
 		.tuner_type	= TUNER_PHILIPS_SECAM,
 		.tuner_addr	= ADDR_UNSET,
@@ -729,8 +729,8 @@ struct tvcard bttv_tvcards[] = {
 		.svhs		= 2,
 		.gpiomask	= 0x1f0fff,
 		.muxsel		= MUXSEL(2, 3, 1, 1),
-		.gpiomux 	= { 0x20000, 0x30000, 0x10000, 0x00000 },
-		.gpiomute 	= 0x40000,
+		.gpiomux	= { 0x20000, 0x30000, 0x10000, 0x00000 },
+		.gpiomute	= 0x40000,
 		.tuner_type	= TUNER_PHILIPS_PAL,
 		.tuner_addr	= ADDR_UNSET,
 		.audio_mode_gpio= terratv_audio,
@@ -774,7 +774,7 @@ struct tvcard bttv_tvcards[] = {
 		.svhs		= 1, /* was: 4 */
 		.gpiomask	= 0,
 		.muxsel		= MUXSEL(2, 3, 1, 0, 0),
-		.gpiomux 	= { 0 },
+		.gpiomux	= { 0 },
 		.tuner_type	= TUNER_ABSENT,
 		.tuner_addr	= ADDR_UNSET,
 		.muxsel_hook    = PXC200_muxsel,
@@ -787,8 +787,8 @@ struct tvcard bttv_tvcards[] = {
 		.svhs		= 2,
 		.gpiomask	= 0x1800,  /* 0x8dfe00 */
 		.muxsel		= MUXSEL(2, 3, 1, 1),
-		.gpiomux 	= { 0, 0x0800, 0x1000, 0x1000 },
-		.gpiomute 	= 0x1800,
+		.gpiomux	= { 0, 0x0800, 0x1000, 0x1000 },
+		.gpiomute	= 0x1800,
 		.pll            = PLL_28,
 		.tuner_type	= UNSET,
 		.tuner_addr	= ADDR_UNSET,
@@ -800,7 +800,7 @@ struct tvcard bttv_tvcards[] = {
 		.svhs		= 3,
 		.gpiomask	= 1,
 		.muxsel		= MUXSEL(2, 3, 1, 1),
-		.gpiomux 	= { 1, 0, 0, 0 },
+		.gpiomux	= { 1, 0, 0, 0 },
 		.pll            = PLL_28,
 		.tuner_type	= TUNER_PHILIPS_PAL,
 		.tuner_addr	= ADDR_UNSET,
@@ -814,7 +814,7 @@ struct tvcard bttv_tvcards[] = {
 		.svhs		= 2,
 		.gpiomask	= 0,
 		.muxsel		= MUXSEL(2, 3, 1, 1),
-		.gpiomux 	= { 0 },
+		.gpiomux	= { 0 },
 		.tuner_type	= TUNER_ABSENT,
 		.tuner_addr	= ADDR_UNSET,
 	},
@@ -825,8 +825,8 @@ struct tvcard bttv_tvcards[] = {
 		.svhs		= 2,
 		.gpiomask	= 0xffff00,
 		.muxsel		= MUXSEL(2, 3, 1, 1),
-		.gpiomux 	= { 0x500, 0, 0x300, 0x900 },
-		.gpiomute 	= 0x900,
+		.gpiomux	= { 0x500, 0, 0x300, 0x900 },
+		.gpiomute	= 0x900,
 		.pll		= PLL_28,
 		.tuner_type	= TUNER_PHILIPS_PAL,
 		.tuner_addr	= ADDR_UNSET,
@@ -840,8 +840,8 @@ struct tvcard bttv_tvcards[] = {
 		.muxsel		= MUXSEL(2, 3, 1, 1, 0),
 		/* Alexander Varakin <avarakin@hotmail.com> [stereo version] */
 		.gpiomask	= 0xb33000,
-		.gpiomux 	= { 0x122000,0x1000,0x0000,0x620000 },
-		.gpiomute 	= 0x800000,
+		.gpiomux	= { 0x122000,0x1000,0x0000,0x620000 },
+		.gpiomute	= 0x800000,
 		/* Audio Routing for "WinFast 2000 XP" (no tv stereo !)
 			gpio23 -- hef4052:nEnable (0x800000)
 			gpio12 -- hef4052:A1
@@ -867,8 +867,8 @@ struct tvcard bttv_tvcards[] = {
 		.svhs		= 2,
 		.gpiomask	= 0x1800,
 		.muxsel		= MUXSEL(2, 3, 1, 1),
-		.gpiomux 	= { 0, 0x800, 0x1000, 0x1000 },
-		.gpiomute 	= 0x1800,
+		.gpiomux	= { 0, 0x800, 0x1000, 0x1000 },
+		.gpiomute	= 0x1800,
 		.pll		= PLL_28,
 		.tuner_type	= UNSET,
 		.tuner_addr	= ADDR_UNSET,
@@ -882,8 +882,8 @@ struct tvcard bttv_tvcards[] = {
 		.svhs		= 2,
 		.gpiomask	= 0x1800,
 		.muxsel		= MUXSEL(2, 3, 1, 1),
-		.gpiomux 	= { 0, 0x800, 0x1000, 0x1000 },
-		.gpiomute 	= 0x1800,
+		.gpiomux	= { 0, 0x800, 0x1000, 0x1000 },
+		.gpiomute	= 0x1800,
 		.pll		= PLL_28,
 		.tuner_type	= UNSET,
 		.tuner_addr	= ADDR_UNSET,
@@ -896,8 +896,8 @@ struct tvcard bttv_tvcards[] = {
 		.svhs		= 2,
 		.gpiomask	= 0xff,
 		.muxsel		= MUXSEL(2, 3, 1, 1),
-		.gpiomux 	= { 0x21, 0x20, 0x24, 0x2c },
-		.gpiomute 	= 0x29,
+		.gpiomux	= { 0x21, 0x20, 0x24, 0x2c },
+		.gpiomute	= 0x29,
 		.no_msp34xx	= 1,
 		.pll		= PLL_28,
 		.tuner_type	= UNSET,
@@ -910,8 +910,8 @@ struct tvcard bttv_tvcards[] = {
 		.svhs		= 2,
 		.gpiomask	= 0x551e00,
 		.muxsel		= MUXSEL(2, 3, 1, 0),
-		.gpiomux 	= { 0x551400, 0x551200, 0, 0 },
-		.gpiomute 	= 0x551c00,
+		.gpiomux	= { 0x551400, 0x551200, 0, 0 },
+		.gpiomute	= 0x551c00,
 		.pll		= PLL_28,
 		.tuner_type	= TUNER_PHILIPS_PAL_I,
 		.tuner_addr	= ADDR_UNSET,
@@ -924,8 +924,8 @@ struct tvcard bttv_tvcards[] = {
 		.svhs		= 2,
 		.gpiomask	= 0x03000F,
 		.muxsel		= MUXSEL(2, 3, 1, 1),
-		.gpiomux 	= { 2, 0xd0001, 0, 0 },
-		.gpiomute 	= 1,
+		.gpiomux	= { 2, 0xd0001, 0, 0 },
+		.gpiomute	= 1,
 		.pll		= PLL_28,
 		.tuner_type	= UNSET,
 		.tuner_addr	= ADDR_UNSET,
@@ -939,8 +939,8 @@ struct tvcard bttv_tvcards[] = {
 		.svhs		= 2,
 		.gpiomask	= 7,
 		.muxsel		= MUXSEL(2, 3, 1, 1),
-		.gpiomux 	= { 4, 0, 2, 3 },
-		.gpiomute 	= 1,
+		.gpiomux	= { 4, 0, 2, 3 },
+		.gpiomute	= 1,
 		.no_msp34xx	= 1,
 		.tuner_type     = TUNER_PHILIPS_NTSC,
 		.tuner_addr	= ADDR_UNSET,
@@ -954,7 +954,7 @@ struct tvcard bttv_tvcards[] = {
 		.svhs		= 2,
 		.gpiomask	= 15,
 		.muxsel		= MUXSEL(2, 3, 1, 1),
-		.gpiomux 	= { 13, 4, 11, 7 },
+		.gpiomux	= { 13, 4, 11, 7 },
 		.pll		= PLL_28,
 		.tuner_type	= UNSET,
 		.tuner_addr	= ADDR_UNSET,
@@ -968,7 +968,7 @@ struct tvcard bttv_tvcards[] = {
 		.svhs		= 2,
 		.gpiomask	= 0,
 		.muxsel		= MUXSEL(2, 3, 1, 1),
-		.gpiomux 	= { 0, 0, 0, 0},
+		.gpiomux	= { 0, 0, 0, 0},
 		.no_msp34xx	= 1,
 		.pll		= PLL_28,
 		.tuner_type	= TUNER_PHILIPS_PAL_I,
@@ -981,8 +981,8 @@ struct tvcard bttv_tvcards[] = {
 		.svhs		= 2,
 		.gpiomask	= 0xe00b,
 		.muxsel		= MUXSEL(2, 3, 1, 1),
-		.gpiomux 	= { 0xff9ff6, 0xff9ff6, 0xff1ff7, 0 },
-		.gpiomute 	= 0xff3ffc,
+		.gpiomux	= { 0xff9ff6, 0xff9ff6, 0xff1ff7, 0 },
+		.gpiomute	= 0xff3ffc,
 		.no_msp34xx	= 1,
 		.tuner_type	= UNSET,
 		.tuner_addr	= ADDR_UNSET,
@@ -996,8 +996,8 @@ struct tvcard bttv_tvcards[] = {
 		.svhs		= NO_SVHS,
 		.gpiomask	= 3,
 		.muxsel		= MUXSEL(2, 3, 1, 1),
-		.gpiomux 	= { 1, 1, 0, 2 },
-		.gpiomute 	= 3,
+		.gpiomux	= { 1, 1, 0, 2 },
+		.gpiomute	= 3,
 		.no_msp34xx	= 1,
 		.pll		= PLL_NONE,
 		.tuner_type	= UNSET,
@@ -1010,7 +1010,7 @@ struct tvcard bttv_tvcards[] = {
 		.svhs		= 3,
 		.gpiomask	= 0,
 		.muxsel		= MUXSEL(2, 3, 1, 0, 0),
-		.gpiomux 	= { 0 },
+		.gpiomux	= { 0 },
 		.no_msp34xx	= 1,
 		.pll		= PLL_28,
 		.tuner_type	= TUNER_ABSENT,
@@ -1023,8 +1023,8 @@ struct tvcard bttv_tvcards[] = {
 		.svhs		= 2,
 		.gpiomask	= 0xbcf03f,
 		.muxsel		= MUXSEL(2, 3, 1, 1),
-		.gpiomux 	= { 0xbc803f, 0xbc903f, 0xbcb03f, 0 },
-		.gpiomute 	= 0xbcb03f,
+		.gpiomux	= { 0xbc803f, 0xbc903f, 0xbcb03f, 0 },
+		.gpiomute	= 0xbcb03f,
 		.no_msp34xx	= 1,
 		.pll		= PLL_28,
 		.tuner_type	= TUNER_TEMIC_4039FR5_NTSC,
@@ -1037,8 +1037,8 @@ struct tvcard bttv_tvcards[] = {
 		.svhs		= 2,
 		.gpiomask	= 0x70000,
 		.muxsel		= MUXSEL(2, 3, 1, 1),
-		.gpiomux 	= { 0x20000, 0x30000, 0x10000, 0 },
-		.gpiomute 	= 0x40000,
+		.gpiomux	= { 0x20000, 0x30000, 0x10000, 0 },
+		.gpiomute	= 0x40000,
 		.no_msp34xx	= 1,
 		.pll		= PLL_35,
 		.tuner_type	= TUNER_PHILIPS_PAL_I,
@@ -1054,8 +1054,8 @@ struct tvcard bttv_tvcards[] = {
 		.svhs		= 2,
 		.gpiomask	= 15,
 		.muxsel		= MUXSEL(2, 3, 1, 1),
-		.gpiomux 	= {2,0,0,0 },
-		.gpiomute 	= 1,
+		.gpiomux	= {2,0,0,0 },
+		.gpiomute	= 1,
 		.pll		= PLL_28,
 		.tuner_type	= UNSET,
 		.tuner_addr	= ADDR_UNSET,
@@ -1067,7 +1067,7 @@ struct tvcard bttv_tvcards[] = {
 		.svhs		= 2,
 		.gpiomask	= 0x010f00,
 		.muxsel		= MUXSEL(2, 3, 0, 0),
-		.gpiomux 	= {0x10000, 0, 0x10000, 0 },
+		.gpiomux	= {0x10000, 0, 0x10000, 0 },
 		.no_msp34xx	= 1,
 		.pll		= PLL_28,
 		.tuner_type	= TUNER_ALPS_TSHC6_NTSC,
@@ -1083,8 +1083,8 @@ struct tvcard bttv_tvcards[] = {
 		.gpiomask	= 0xAA0000,
 		.muxsel		= MUXSEL(2, 3, 1, 1, 0), /* in 4 is digital */
 		/* .digital_mode= DIGITAL_MODE_CAMERA, */
-		.gpiomux 	= { 0x20000, 0, 0x80000, 0x80000 },
-		.gpiomute 	= 0xa8000,
+		.gpiomux	= { 0x20000, 0, 0x80000, 0x80000 },
+		.gpiomute	= 0xa8000,
 		.no_msp34xx	= 1,
 		.pll		= PLL_28,
 		.tuner_type	= TUNER_PHILIPS_PAL_I,
@@ -1108,7 +1108,7 @@ struct tvcard bttv_tvcards[] = {
 		.gpiomask       = 7,
 		.muxsel         = MUXSEL(2, 0, 1, 1),
 		.gpiomux        = { 0, 1, 2, 3 },
-		.gpiomute 	= 4,
+		.gpiomute	= 4,
 		.pll            = PLL_28,
 		.tuner_type     = UNSET /* TUNER_ALPS_TMDH2_NTSC */,
 		.tuner_addr	= ADDR_UNSET,
@@ -1123,8 +1123,8 @@ struct tvcard bttv_tvcards[] = {
 		.svhs           = 3,
 		.gpiomask       = 0x03000F,
 		.muxsel		= MUXSEL(2, 3, 1, 1),
-		.gpiomux 	= { 1, 0xd0001, 0, 0 },
-		.gpiomute 	= 10,
+		.gpiomux	= { 1, 0xd0001, 0, 0 },
+		.gpiomute	= 10,
 				/* sound path (5 sources):
 				MUX1 (mask 0x03), Enable Pin 0x08 (0=enable, 1=disable)
 					0= ext. Audio IN
@@ -1147,8 +1147,8 @@ struct tvcard bttv_tvcards[] = {
 		.svhs		= 2,
 		.gpiomask	= 0x1c,
 		.muxsel		= MUXSEL(2, 3, 1, 1),
-		.gpiomux 	= { 0, 0, 0x10, 8 },
-		.gpiomute 	= 4,
+		.gpiomux	= { 0, 0, 0x10, 8 },
+		.gpiomute	= 4,
 		.pll		= PLL_28,
 		.tuner_type	= TUNER_PHILIPS_PAL,
 		.tuner_addr	= ADDR_UNSET,
@@ -1166,8 +1166,8 @@ struct tvcard bttv_tvcards[] = {
 		.svhs		= 2,
 		.gpiomask	= 0x18e0,
 		.muxsel		= MUXSEL(2, 3, 1, 1),
-		.gpiomux 	= { 0x0000,0x0800,0x1000,0x1000 },
-		.gpiomute 	= 0x18e0,
+		.gpiomux	= { 0x0000,0x0800,0x1000,0x1000 },
+		.gpiomute	= 0x18e0,
 			/* For cards with tda9820/tda9821:
 				0x0000: Tuner normal stereo
 				0x0080: Tuner A2 SAP (second audio program = Zweikanalton)
@@ -1186,7 +1186,7 @@ struct tvcard bttv_tvcards[] = {
 		.gpiomask       = 0xF,
 		.muxsel         = MUXSEL(2, 3, 1, 0),
 		.gpiomux        = { 2, 0, 0, 0 },
-		.gpiomute 	= 10,
+		.gpiomute	= 10,
 		.pll		= PLL_28,
 		.tuner_type	= TUNER_TEMIC_PAL,
 		.tuner_addr	= ADDR_UNSET,
@@ -1202,7 +1202,7 @@ struct tvcard bttv_tvcards[] = {
 		.gpiomask       = 0x1800,
 		.muxsel         = MUXSEL(2, 3, 1, 1),
 		.gpiomux        = { 0, 0x800, 0x1000, 0x1000 },
-		.gpiomute 	= 0x1800,
+		.gpiomute	= 0x1800,
 		.pll            = PLL_28,
 		.tuner_type     = TUNER_PHILIPS_PAL,
 		.tuner_addr	= ADDR_UNSET,
@@ -1232,7 +1232,7 @@ struct tvcard bttv_tvcards[] = {
 		.gpiomask       = 0xe00,
 		.muxsel         = MUXSEL(2, 3, 1, 1),
 		.gpiomux        = { 0x400, 0x400, 0x400, 0x400 },
-		.gpiomute 	= 0x800,
+		.gpiomute	= 0x800,
 		.pll            = PLL_28,
 		.tuner_type     = TUNER_TEMIC_4036FY5_NTSC,
 		.tuner_addr	= ADDR_UNSET,
@@ -1246,7 +1246,7 @@ struct tvcard bttv_tvcards[] = {
 		.gpiomask       = 0x03000F,
 		.muxsel		= MUXSEL(2, 3, 1, 0),
 		.gpiomux        = { 2, 0, 0, 0 },
-		.gpiomute 	= 1,
+		.gpiomute	= 1,
 		.pll            = PLL_28,
 		.tuner_type	= TUNER_TEMIC_PAL,
 		.tuner_addr	= ADDR_UNSET,
@@ -1263,7 +1263,7 @@ struct tvcard bttv_tvcards[] = {
 		.gpiomask       = 11,
 		.muxsel         = MUXSEL(2, 3, 1, 1),
 		.gpiomux        = { 2, 0, 0, 1 },
-		.gpiomute 	= 8,
+		.gpiomute	= 8,
 		.pll            = PLL_35,
 		.tuner_type     = TUNER_TEMIC_PAL,
 		.tuner_addr	= ADDR_UNSET,
@@ -1293,7 +1293,7 @@ struct tvcard bttv_tvcards[] = {
 		.gpiomask       = 0xFF,
 		.muxsel         = MUXSEL(2, 3, 1, 0),
 		.gpiomux        = { 1, 0, 4, 4 },
-		.gpiomute 	= 9,
+		.gpiomute	= 9,
 		.pll		= PLL_28,
 		.tuner_type	= TUNER_PHILIPS_PAL,
 		.tuner_addr	= ADDR_UNSET,
@@ -1306,8 +1306,8 @@ struct tvcard bttv_tvcards[] = {
 		.svhs		= 2,
 		.gpiomask	= 0xf03f,
 		.muxsel		= MUXSEL(2, 3, 1, 0),
-		.gpiomux 	= { 0xbffe, 0, 0xbfff, 0 },
-		.gpiomute 	= 0xbffe,
+		.gpiomux	= { 0xbffe, 0, 0xbfff, 0 },
+		.gpiomute	= 0xbffe,
 		.pll		= PLL_28,
 		.tuner_type	= TUNER_TEMIC_4006FN5_MULTI_PAL,
 		.tuner_addr	= ADDR_UNSET,
@@ -1322,7 +1322,7 @@ struct tvcard bttv_tvcards[] = {
 		.svhs		= NO_SVHS,
 		.gpiomask	= 1,
 		.muxsel		= MUXSEL(2, 3, 0, 1),
-		.gpiomux 	= { 0, 0, 1, 0 },
+		.gpiomux	= { 0, 0, 1, 0 },
 		.no_msp34xx	= 1,
 		.pll		= PLL_28,
 		.tuner_type	= TUNER_TEMIC_4006FN5_MULTI_PAL,
@@ -1339,8 +1339,8 @@ struct tvcard bttv_tvcards[] = {
 				/* Radio changed from 1e80 to 0x800 to make
 				FlyVideo2000S in .hu happy (gm)*/
 				/* -dk-???: set mute=0x1800 for tda9874h daughterboard */
-		.gpiomux 	= { 0x0000,0x0800,0x1000,0x1000 },
-		.gpiomute 	= 0x1800,
+		.gpiomux	= { 0x0000,0x0800,0x1000,0x1000 },
+		.gpiomute	= 0x1800,
 		.audio_mode_gpio= fv2000s_audio,
 		.no_msp34xx	= 1,
 		.pll            = PLL_28,
@@ -1354,8 +1354,8 @@ struct tvcard bttv_tvcards[] = {
 		.svhs		= 2,
 		.gpiomask	= 0xffff00,
 		.muxsel		= MUXSEL(2, 3, 1, 1),
-		.gpiomux 	= { 0x500, 0x500, 0x300, 0x900 },
-		.gpiomute 	= 0x900,
+		.gpiomux	= { 0x500, 0x500, 0x300, 0x900 },
+		.gpiomute	= 0x900,
 		.pll		= PLL_28,
 		.tuner_type	= TUNER_PHILIPS_PAL,
 		.tuner_addr	= ADDR_UNSET,
@@ -1389,7 +1389,7 @@ struct tvcard bttv_tvcards[] = {
 		/* 0x100000: 1=MSP enabled (0=disable again)
 		* 0x010000: Connected to "S0" on tda9880 (0=Pal/BG, 1=NTSC) */
 		.gpiomux        = {0x947fff, 0x987fff,0x947fff,0x947fff },
-		.gpiomute 	= 0x947fff,
+		.gpiomute	= 0x947fff,
 		/* tvtuner, radio,   external,internal, mute,  stereo
 		* tuner, Composit, SVid, Composit-on-Svid-adapter */
 		.muxsel         = MUXSEL(2, 3, 0, 1),
@@ -1409,7 +1409,7 @@ struct tvcard bttv_tvcards[] = {
 		/* 0x100000: 1=MSP enabled (0=disable again)
 		* 0x010000: Connected to "S0" on tda9880 (0=Pal/BG, 1=NTSC) */
 		.gpiomux        = {0x947fff, 0x987fff,0x947fff,0x947fff },
-		.gpiomute 	= 0x947fff,
+		.gpiomute	= 0x947fff,
 		/* tvtuner, radio,   external,internal, mute,  stereo
 		* tuner, Composit, SVid, Composit-on-Svid-adapter */
 		.muxsel         = MUXSEL(2, 3, 0, 1),
@@ -1438,7 +1438,7 @@ struct tvcard bttv_tvcards[] = {
 		.gpiomask       = 15,
 		.muxsel         = MUXSEL(2, 3, 1, 1),
 		.gpiomux        = { 0, 0, 11, 7 }, /* TV and Radio with same GPIO ! */
-		.gpiomute 	= 13,
+		.gpiomute	= 13,
 		.pll            = PLL_28,
 		.tuner_type     = TUNER_LG_PAL_I_FM,
 		.tuner_addr	= ADDR_UNSET,
@@ -1473,8 +1473,8 @@ struct tvcard bttv_tvcards[] = {
 		.svhs		= 2,
 		.gpiomask	= 0x3f,
 		.muxsel		= MUXSEL(2, 3, 1, 1),
-		.gpiomux 	= { 0x01, 0x00, 0x03, 0x03 },
-		.gpiomute 	= 0x09,
+		.gpiomux	= { 0x01, 0x00, 0x03, 0x03 },
+		.gpiomute	= 0x09,
 		.no_msp34xx	= 1,
 		.pll		= PLL_28,
 		.tuner_type	= TUNER_PHILIPS_PAL,
@@ -1525,7 +1525,7 @@ struct tvcard bttv_tvcards[] = {
 		.gpiomask       = 0x1C800F,  /* Bit0-2: Audio select, 8-12:remote control 14:remote valid 15:remote reset */
 		.muxsel         = MUXSEL(2, 1, 1),
 		.gpiomux        = { 0, 1, 2, 2 },
-		.gpiomute 	= 4,
+		.gpiomute	= 4,
 		.tuner_type     = TUNER_PHILIPS_PAL,
 		.tuner_addr	= ADDR_UNSET,
 		.pll		= PLL_28,
@@ -1542,7 +1542,7 @@ struct tvcard bttv_tvcards[] = {
 		.gpiomask       = 0x140007,
 		.muxsel         = MUXSEL(2, 3, 1, 1),
 		.gpiomux        = { 0, 1, 2, 3 },
-		.gpiomute 	= 4,
+		.gpiomute	= 4,
 		.tuner_type     = TUNER_PHILIPS_NTSC,
 		.tuner_addr	= ADDR_UNSET,
 		.audio_mode_gpio= windvr_audio,
@@ -1575,7 +1575,7 @@ struct tvcard bttv_tvcards[] = {
 						* gpiomux =1: lower volume, 2+3: mute
 						* btwincap uses 0x80000/0x80003
 						*/
-		.gpiomute 	= 4,
+		.gpiomute	= 4,
 		.no_msp34xx     = 1,
 		.pll            = PLL_28,
 		.tuner_type     = TUNER_PHILIPS_PAL,
@@ -1626,7 +1626,7 @@ struct tvcard bttv_tvcards[] = {
 		.gpiomask       = 0x0f0f80,
 		.muxsel         = MUXSEL(2, 3, 1, 0),
 		.gpiomux        = {0x030000, 0x010000, 0, 0 },
-		.gpiomute 	= 0x020000,
+		.gpiomute	= 0x020000,
 		.no_msp34xx     = 1,
 		.pll            = PLL_28,
 		.tuner_type     = TUNER_PHILIPS_NTSC_M,
@@ -1829,7 +1829,7 @@ struct tvcard bttv_tvcards[] = {
 		.gpiomask       = 7,
 		.muxsel         = MUXSEL(2, 3, 1, 1),
 		.gpiomux        = { 0, 1, 2, 3},
-		.gpiomute 	= 4,
+		.gpiomute	= 4,
 		.tuner_type     = TUNER_PHILIPS_PAL,
 		.tuner_addr	= ADDR_UNSET,
 		.pll            = PLL_28,
@@ -1872,7 +1872,7 @@ struct tvcard bttv_tvcards[] = {
 		.muxsel         = MUXSEL(2, 3, 1, 0),
 		/*                  Tuner, Radio, external, internal, off,  on */
 		.gpiomux        = { 0x08,  0x0f,  0x0a,     0x08 },
-		.gpiomute 	= 0x0f,
+		.gpiomute	= 0x0f,
 		.no_msp34xx     = 1,
 		.pll            = PLL_28,
 		.tuner_type     = TUNER_PHILIPS_NTSC,
@@ -2139,7 +2139,7 @@ struct tvcard bttv_tvcards[] = {
 		.gpiomask       = 0x008007,
 		.muxsel         = MUXSEL(2, 3, 0, 0),
 		.gpiomux        = { 0, 0, 0, 0 },
-		.gpiomute 	= 0x000003,
+		.gpiomute	= 0x000003,
 		.pll            = PLL_28,
 		.tuner_type     = TUNER_PHILIPS_PAL,
 		.tuner_addr	= ADDR_UNSET,
@@ -2182,7 +2182,7 @@ struct tvcard bttv_tvcards[] = {
 		.gpiomask       = 0x008007,
 		.muxsel         = MUXSEL(2, 3, 1, 1),
 		.gpiomux        = { 0, 1, 2, 2 },
-		.gpiomute 	= 3,
+		.gpiomute	= 3,
 		.pll            = PLL_28,
 		.tuner_type     = TUNER_PHILIPS_PAL,
 		.tuner_addr	= ADDR_UNSET,
@@ -2297,7 +2297,7 @@ struct tvcard bttv_tvcards[] = {
 		.gpiomask       = 0xFF,
 		.muxsel         = MUXSEL(2, 3, 1, 1),
 		.gpiomux        = { 2, 0, 0, 0 },
-		.gpiomute 	= 10,
+		.gpiomute	= 10,
 		.pll            = PLL_28,
 		.tuner_type     = TUNER_PHILIPS_PAL,
 		.tuner_addr	= ADDR_UNSET,
@@ -2326,7 +2326,7 @@ struct tvcard bttv_tvcards[] = {
 		.gpiomask       = 0x3f,
 		.muxsel         = MUXSEL(2, 3, 1, 0),
 		.gpiomux        = {0x31, 0x31, 0x31, 0x31 },
-		.gpiomute 	= 0x31,
+		.gpiomute	= 0x31,
 		.no_msp34xx     = 1,
 		.pll            = PLL_28,
 		.tuner_type     = TUNER_PHILIPS_NTSC_M,
@@ -2440,7 +2440,7 @@ struct tvcard bttv_tvcards[] = {
 		.muxsel		= MUXSEL(2, 3, 1),
 		.gpiomask       = 0x00e00007,
 		.gpiomux        = { 0x00400005, 0, 0x00000001, 0 },
-		.gpiomute 	= 0x00c00007,
+		.gpiomute	= 0x00c00007,
 		.no_msp34xx     = 1,
 		.no_tda7432     = 1,
 		.has_dvb        = 1,
@@ -2455,7 +2455,7 @@ struct tvcard bttv_tvcards[] = {
 		.gpiomask	= 0x01fe00,
 		.muxsel		= MUXSEL(2, 3, 1, 1),
 		.gpiomux        = { 0x001e00, 0, 0x018000, 0x014000 },
-		.gpiomute 	= 0x002000,
+		.gpiomute	= 0x002000,
 		.pll		= PLL_28,
 		.tuner_type	= TUNER_YMEC_TVF66T5_B_DFF,
 		.tuner_addr	= 0xc1 >>1,
@@ -2470,7 +2470,7 @@ struct tvcard bttv_tvcards[] = {
 		.gpiomask       = 0x001c0007,
 		.muxsel         = MUXSEL(2, 3, 1, 1),
 		.gpiomux        = { 0, 1, 2, 2 },
-		.gpiomute 	= 3,
+		.gpiomute	= 3,
 		.pll            = PLL_28,
 		.tuner_type     = TUNER_TENA_9533_DI,
 		.tuner_addr	= ADDR_UNSET,
@@ -2505,7 +2505,7 @@ struct tvcard bttv_tvcards[] = {
 		.gpiomask       = 0x3f,
 		.muxsel         = MUXSEL(2, 3, 1, 1),
 		.gpiomux        = { 0x21, 0x20, 0x24, 0x2c },
-		.gpiomute 	= 0x29,
+		.gpiomute	= 0x29,
 		.no_msp34xx     = 1,
 		.pll            = PLL_28,
 		.tuner_type     = TUNER_YMEC_TVF_5533MF,
@@ -2549,8 +2549,8 @@ struct tvcard bttv_tvcards[] = {
 		.svhs		= 2,
 		.gpiomask	= 15,
 		.muxsel		= MUXSEL(2, 3, 1, 1),
-		.gpiomux 	= { 2, 0, 0, 0 },
-		.gpiomute 	= 1,
+		.gpiomux	= { 2, 0, 0, 0 },
+		.gpiomute	= 1,
 		.pll		= PLL_28,
 		.tuner_type	= TUNER_PHILIPS_NTSC,
 		.tuner_addr	= ADDR_UNSET,
@@ -2563,7 +2563,7 @@ struct tvcard bttv_tvcards[] = {
 		.svhs		= 2,
 		.gpiomask	= 0x108007,
 		.muxsel		= MUXSEL(2, 3, 1, 1),
-		.gpiomux 	= { 100000, 100002, 100002, 100000 },
+		.gpiomux	= { 100000, 100002, 100002, 100000 },
 		.no_msp34xx	= 1,
 		.no_tda7432     = 1,
 		.pll		= PLL_28,
@@ -2599,7 +2599,7 @@ struct tvcard bttv_tvcards[] = {
 		.gpiomask       = 7,
 		.muxsel         = MUXSEL(2, 3, 1, 1),
 		.gpiomux        = { 0, 1, 2, 3 },
-		.gpiomute 	= 4,
+		.gpiomute	= 4,
 		.tuner_type     = TUNER_TEMIC_4009FR5_PAL,
 		.tuner_addr     = ADDR_UNSET,
 		.pll            = PLL_28,
@@ -2635,7 +2635,7 @@ struct tvcard bttv_tvcards[] = {
 		.muxsel		= MUXSEL(2, 3, 1),
 		.gpiomask       = 0x00e00007,
 		.gpiomux        = { 0x00400005, 0, 0x00000001, 0 },
-		.gpiomute 	= 0x00c00007,
+		.gpiomute	= 0x00c00007,
 		.no_msp34xx     = 1,
 		.no_tda7432     = 1,
 	},
@@ -2679,7 +2679,7 @@ struct tvcard bttv_tvcards[] = {
 		.gpiomask       = 0x008007,
 		.muxsel         = MUXSEL(2, 3, 1, 1),
 		.gpiomux        = { 0, 1, 2, 2 }, /* CONTVFMi */
-		.gpiomute 	= 3, /* CONTVFMi */
+		.gpiomute	= 3, /* CONTVFMi */
 		.tuner_type     = TUNER_PHILIPS_FM1216ME_MK3, /* TCL MK3 */
 		.tuner_addr     = ADDR_UNSET,
 		.pll            = PLL_28,
@@ -2702,7 +2702,7 @@ struct tvcard bttv_tvcards[] = {
 		.gpiomask       = 0x060040,
 		.muxsel         = MUXSEL(2, 3, 3),
 		.gpiomux        = { 0x60000, 0x60000, 0x20000, 0x20000 },
-		.gpiomute 	= 0,
+		.gpiomute	= 0,
 		.tuner_type	= TUNER_TCL_MF02GIP_5N,
 		.tuner_addr     = ADDR_UNSET,
 		.pll            = PLL_28,
@@ -2752,8 +2752,8 @@ struct tvcard bttv_tvcards[] = {
 		/* Bruno Christo <bchristo@inf.ufsm.br>
 		 *
 		 * GeoVision GV-800(S) has 4 Conexant Fusion 878A:
-		 * 	1 audio input  per BT878A = 4 audio inputs
-		 * 	4 video inputs per BT878A = 16 video inputs
+		 *	1 audio input  per BT878A = 4 audio inputs
+		 *	4 video inputs per BT878A = 16 video inputs
 		 * This is the first BT878A chip of the GV-800(S). It's the
 		 * "master" chip and it controls the video inputs through an
 		 * analog multiplexer (a CD22M3494) via some GPIO pins. The
@@ -2779,8 +2779,8 @@ struct tvcard bttv_tvcards[] = {
 		/* Bruno Christo <bchristo@inf.ufsm.br>
 		 *
 		 * GeoVision GV-800(S) has 4 Conexant Fusion 878A:
-		 * 	1 audio input  per BT878A = 4 audio inputs
-		 * 	4 video inputs per BT878A = 16 video inputs
+		 *	1 audio input  per BT878A = 4 audio inputs
+		 *	4 video inputs per BT878A = 16 video inputs
 		 * The 3 other BT878A chips are "slave" chips of the GV-800(S)
 		 * and should use this card type.
 		 * The audio input is not working yet.
@@ -4784,9 +4784,9 @@ static void gv800s_write(struct bttv *btv,
 	* GPIO bits 0-9 are used for the analog switch:
 	*   00 - 03:	camera selector
 	*   04 - 06:	878A (controller) selector
-	*   16: 	cselect
+	*   16:		cselect
 	*   17:		strobe
-	*   18: 	data (1->on, 0->off)
+	*   18:		data (1->on, 0->off)
 	*   19:		reset
 	*/
 	const u32 ADDRESS = ((xaddr&0xf) | (yaddr&3)<<4);
@@ -4882,7 +4882,7 @@ void __init bttv_check_chipset(void)
 	int pcipci_fail = 0;
 	struct pci_dev *dev = NULL;
 
-	if (pci_pci_problems & (PCIPCI_FAIL|PCIAGP_FAIL)) 	/* should check if target is AGP */
+	if (pci_pci_problems & (PCIPCI_FAIL|PCIAGP_FAIL))	/* should check if target is AGP */
 		pcipci_fail = 1;
 	if (pci_pci_problems & (PCIPCI_TRITON|PCIPCI_NATOMA|PCIPCI_VIAETBF))
 		triton1 = 1;
diff --git a/drivers/media/pci/bt8xx/bttv-input.c b/drivers/media/pci/bt8xx/bttv-input.c
index ac7674700685..da49c5567db5 100644
--- a/drivers/media/pci/bt8xx/bttv-input.c
+++ b/drivers/media/pci/bt8xx/bttv-input.c
@@ -349,12 +349,12 @@ static int get_key_pv951(struct IR_i2c *ir, enum rc_proto *protocol,
 	 * NOTE:
 	 * lirc_i2c maps the pv951 code as:
 	 *	addr = 0x61D6
-	 * 	cmd = bit_reverse (b)
+	 *	cmd = bit_reverse (b)
 	 * So, it seems that this device uses NEC extended
 	 * I decided to not fix the table, due to two reasons:
-	 * 	1) Without the actual device, this is only a guess;
-	 * 	2) As the addr is not reported via I2C, nor can be changed,
-	 * 	   the device is bound to the vendor-provided RC.
+	 *	1) Without the actual device, this is only a guess;
+	 *	2) As the addr is not reported via I2C, nor can be changed,
+	 *	   the device is bound to the vendor-provided RC.
 	 */
 
 	*protocol = RC_PROTO_UNKNOWN;
diff --git a/drivers/media/pci/bt8xx/bttv.h b/drivers/media/pci/bt8xx/bttv.h
index cc555a4d4462..a27384adadd2 100644
--- a/drivers/media/pci/bt8xx/bttv.h
+++ b/drivers/media/pci/bt8xx/bttv.h
@@ -165,7 +165,7 @@
 #define BTTV_BOARD_PV_M4900                0x8b
 #define BTTV_BOARD_OSPREY440               0x8c
 #define BTTV_BOARD_ASOUND_SKYEYE	   0x8d
-#define BTTV_BOARD_SABRENT_TVFM   	   0x8e
+#define BTTV_BOARD_SABRENT_TVFM		   0x8e
 #define BTTV_BOARD_HAUPPAUGE_IMPACTVCB     0x8f
 #define BTTV_BOARD_MACHTV_MAGICTV          0x90
 #define BTTV_BOARD_SSAI_SECURITY	   0x91
@@ -265,7 +265,7 @@ extern struct tvcard bttv_tvcards[];
  * that they are changed to octal.  One should not use hex number, macros, or
  * anything else with this macro.  Just use plain integers from 0 to 3.
  */
-#define _MUXSELf(a)      	0##a << 30
+#define _MUXSELf(a)		0##a << 30
 #define _MUXSELe(a, b...)	0##a << 28 | _MUXSELf(b)
 #define _MUXSELd(a, b...)	0##a << 26 | _MUXSELe(b)
 #define _MUXSELc(a, b...)	0##a << 24 | _MUXSELd(b)
diff --git a/drivers/media/pci/bt8xx/bttvp.h b/drivers/media/pci/bt8xx/bttvp.h
index cb1b5e611130..7a86e7295166 100644
--- a/drivers/media/pci/bt8xx/bttvp.h
+++ b/drivers/media/pci/bt8xx/bttvp.h
@@ -141,7 +141,7 @@ struct bttv_ir {
 	bool			rc5_gpio;   /* Is RC5 legacy GPIO enabled? */
 	u32                     last_bit;   /* last raw bit seen */
 	u32                     code;       /* raw code under construction */
-	ktime_t          				base_time;  /* time of last seen code */
+	ktime_t						base_time;  /* time of last seen code */
 	bool                    active;     /* building raw code */
 };
 
@@ -400,8 +400,8 @@ struct bttv {
 	int                        i2c_state, i2c_rc;
 	int                        i2c_done;
 	wait_queue_head_t          i2c_queue;
-	struct v4l2_subdev 	  *sd_msp34xx;
-	struct v4l2_subdev 	  *sd_tvaudio;
+	struct v4l2_subdev	  *sd_msp34xx;
+	struct v4l2_subdev	  *sd_tvaudio;
 	struct v4l2_subdev	  *sd_tda7432;
 
 	/* video4linux (1) */
diff --git a/drivers/media/pci/cx18/cx18-alsa-pcm.c b/drivers/media/pci/cx18/cx18-alsa-pcm.c
index aadd76466aec..4f31042a442a 100644
--- a/drivers/media/pci/cx18/cx18-alsa-pcm.c
+++ b/drivers/media/pci/cx18/cx18-alsa-pcm.c
@@ -41,7 +41,7 @@ MODULE_PARM_DESC(pcm_debug, "enable debug messages for pcm");
 #define dprintk(fmt, arg...) do {					\
 	    if (pcm_debug)						\
 		printk(KERN_INFO "cx18-alsa-pcm %s: " fmt,		\
-				  __func__, ##arg); 			\
+				  __func__, ##arg);			\
 	} while (0)
 
 static const struct snd_pcm_hardware snd_cx18_hw_capture = {
diff --git a/drivers/media/pci/cx18/cx18-av-audio.c b/drivers/media/pci/cx18/cx18-av-audio.c
index 8b95e9aae576..3abc54cbe4a1 100644
--- a/drivers/media/pci/cx18/cx18-av-audio.c
+++ b/drivers/media/pci/cx18/cx18-av-audio.c
@@ -31,7 +31,7 @@ static int set_audclk_freq(struct cx18 *cx, u32 freq)
 	 * would ideally be:
 	 *
 	 * NTSC Color subcarrier freq * 8 =
-	 * 	4.5 MHz/286 * 455/2 * 8 = 28.63636363... MHz
+	 *	4.5 MHz/286 * 455/2 * 8 = 28.63636363... MHz
 	 *
 	 * The accidents of history and rationale that explain from where this
 	 * combination of magic numbers originate can be found in:
diff --git a/drivers/media/pci/cx18/cx18-av-core.c b/drivers/media/pci/cx18/cx18-av-core.c
index cf8817e9c8b9..eda343322ee0 100644
--- a/drivers/media/pci/cx18/cx18-av-core.c
+++ b/drivers/media/pci/cx18/cx18-av-core.c
@@ -236,10 +236,10 @@ static void cx18_av_initialize(struct v4l2_subdev *sd)
 	 */
 	cx18_av_and_or4(cx, CXADEC_AFE_CTRL, 0xFF000000, 0x00005D00);
 
-/* 	if(dwEnable && dw3DCombAvailable) { */
-/*      	CxDevWrReg(CXADEC_SRC_COMB_CFG, 0x7728021F); */
+/*	if(dwEnable && dw3DCombAvailable) { */
+/*		CxDevWrReg(CXADEC_SRC_COMB_CFG, 0x7728021F); */
 /*    } else { */
-/*      	CxDevWrReg(CXADEC_SRC_COMB_CFG, 0x6628021F); */
+/*		CxDevWrReg(CXADEC_SRC_COMB_CFG, 0x6628021F); */
 /*    } */
 	cx18_av_write4(cx, CXADEC_SRC_COMB_CFG, 0x6628021F);
 	default_volume = cx18_av_read(cx, 0x8d4);
@@ -319,13 +319,13 @@ void cx18_av_std_setup(struct cx18 *cx)
 		 * vblank656: half lines after line 625/mid-313 of blanked video
 		 * vblank:    half lines, after line 5/317, of blanked video
 		 * vactive:   half lines of active video +
-		 * 		5 half lines after the end of active video
+		 *		5 half lines after the end of active video
 		 *
 		 * As far as I can tell:
 		 * vblank656 starts counting from the falling edge of the first
-		 * 	vsync pulse (start of line 1 or mid-313)
+		 *	vsync pulse (start of line 1 or mid-313)
 		 * vblank starts counting from the after the 5 vsync pulses and
-		 * 	5 or 4 equalization pulses (start of line 6 or 318)
+		 *	5 or 4 equalization pulses (start of line 6 or 318)
 		 *
 		 * For 625 line systems the driver will extract VBI information
 		 * from lines 6-23 and lines 318-335 (but the slicer can only
@@ -395,9 +395,9 @@ void cx18_av_std_setup(struct cx18 *cx)
 		 *
 		 * As far as I can tell:
 		 * vblank656 starts counting from the falling edge of the first
-		 * 	vsync pulse (start of line 4 or mid-266)
+		 *	vsync pulse (start of line 4 or mid-266)
 		 * vblank starts counting from the after the 6 vsync pulses and
-		 * 	6 or 5 equalization pulses (start of line 10 or 272)
+		 *	6 or 5 equalization pulses (start of line 10 or 272)
 		 *
 		 * For 525 line systems the driver will extract VBI information
 		 * from lines 10-21 and lines 273-284.
@@ -851,7 +851,7 @@ static int cx18_av_s_std(struct v4l2_subdev *sd, v4l2_std_id norm)
 	struct cx18_av_state *state = to_cx18_av_state(sd);
 	struct cx18 *cx = v4l2_get_subdevdata(sd);
 
-	u8 fmt = 0; 	/* zero is autodetect */
+	u8 fmt = 0;	/* zero is autodetect */
 	u8 pal_m = 0;
 
 	if (state->radio == 0 && state->std == norm)
diff --git a/drivers/media/pci/cx18/cx18-av-core.h b/drivers/media/pci/cx18/cx18-av-core.h
index c976ce6e7a78..1a37f269d2e4 100644
--- a/drivers/media/pci/cx18/cx18-av-core.h
+++ b/drivers/media/pci/cx18/cx18-av-core.h
@@ -349,7 +349,7 @@ static inline struct v4l2_subdev *to_sd(struct v4l2_ctrl *ctrl)
 }
 
 /* ----------------------------------------------------------------------- */
-/* cx18_av-core.c 							   */
+/* cx18_av-core.c							   */
 int cx18_av_write(struct cx18 *cx, u16 addr, u8 value);
 int cx18_av_write4(struct cx18 *cx, u16 addr, u32 value);
 int cx18_av_write4_noretry(struct cx18 *cx, u16 addr, u32 value);
diff --git a/drivers/media/pci/cx18/cx18-cards.c b/drivers/media/pci/cx18/cx18-cards.c
index 11e898e66ce9..c2cf965d639e 100644
--- a/drivers/media/pci/cx18/cx18-cards.c
+++ b/drivers/media/pci/cx18/cx18-cards.c
@@ -388,7 +388,7 @@ static const struct cx18_card cx18_card_cnxt_raptor_pal = {
 		{ CX18_CARD_INPUT_COMPOSITE2, 2, CX18_AV_COMPOSITE6 },
 	},
 	.audio_inputs = {
-		{ CX18_CARD_INPUT_AUD_TUNER, CX18_AV_AUDIO5, 	    0 },
+		{ CX18_CARD_INPUT_AUD_TUNER, CX18_AV_AUDIO5,	    0 },
 		{ CX18_CARD_INPUT_LINE_IN1,  CX18_AV_AUDIO_SERIAL1, 1 },
 		{ CX18_CARD_INPUT_LINE_IN2,  CX18_AV_AUDIO_SERIAL2, 1 },
 	},
@@ -439,7 +439,7 @@ static const struct cx18_card cx18_card_toshiba_qosmio_dvbt = {
 		{ CX18_CARD_INPUT_COMPOSITE1, 1, CX18_AV_COMPOSITE1 },
 	},
 	.audio_inputs = {
-		{ CX18_CARD_INPUT_AUD_TUNER, CX18_AV_AUDIO5, 	    0 },
+		{ CX18_CARD_INPUT_AUD_TUNER, CX18_AV_AUDIO5,	    0 },
 		{ CX18_CARD_INPUT_LINE_IN1,  CX18_AV_AUDIO_SERIAL1, 1 },
 	},
 	.tuners = {
@@ -485,7 +485,7 @@ static const struct cx18_card cx18_card_leadtek_pvr2100 = {
 		{ CX18_CARD_INPUT_COMPONENT1, 1, CX18_AV_COMPONENT1 },
 	},
 	.audio_inputs = {
-		{ CX18_CARD_INPUT_AUD_TUNER, CX18_AV_AUDIO5, 	    0 },
+		{ CX18_CARD_INPUT_AUD_TUNER, CX18_AV_AUDIO5,	    0 },
 		{ CX18_CARD_INPUT_LINE_IN1,  CX18_AV_AUDIO_SERIAL1, 1 },
 	},
 	.tuners = {
@@ -538,7 +538,7 @@ static const struct cx18_card cx18_card_leadtek_dvr3100h = {
 		{ CX18_CARD_INPUT_COMPONENT1, 1, CX18_AV_COMPONENT1 },
 	},
 	.audio_inputs = {
-		{ CX18_CARD_INPUT_AUD_TUNER, CX18_AV_AUDIO5, 	    0 },
+		{ CX18_CARD_INPUT_AUD_TUNER, CX18_AV_AUDIO5,	    0 },
 		{ CX18_CARD_INPUT_LINE_IN1,  CX18_AV_AUDIO_SERIAL1, 1 },
 	},
 	.tuners = {
diff --git a/drivers/media/pci/cx18/cx18-cards.h b/drivers/media/pci/cx18/cx18-cards.h
index 5478f62b5cf3..02d0fb703a41 100644
--- a/drivers/media/pci/cx18/cx18-cards.h
+++ b/drivers/media/pci/cx18/cx18-cards.h
@@ -29,20 +29,20 @@
 
 /* video inputs */
 #define	CX18_CARD_INPUT_VID_TUNER	1
-#define	CX18_CARD_INPUT_SVIDEO1 	2
-#define	CX18_CARD_INPUT_SVIDEO2 	3
-#define	CX18_CARD_INPUT_COMPOSITE1 	4
-#define	CX18_CARD_INPUT_COMPOSITE2 	5
-#define	CX18_CARD_INPUT_COMPONENT1 	6
+#define	CX18_CARD_INPUT_SVIDEO1		2
+#define	CX18_CARD_INPUT_SVIDEO2		3
+#define	CX18_CARD_INPUT_COMPOSITE1	4
+#define	CX18_CARD_INPUT_COMPOSITE2	5
+#define	CX18_CARD_INPUT_COMPONENT1	6
 
 /* audio inputs */
 #define	CX18_CARD_INPUT_AUD_TUNER	1
-#define	CX18_CARD_INPUT_LINE_IN1 	2
-#define	CX18_CARD_INPUT_LINE_IN2 	3
+#define	CX18_CARD_INPUT_LINE_IN1	2
+#define	CX18_CARD_INPUT_LINE_IN2	3
 
 #define CX18_CARD_MAX_VIDEO_INPUTS 6
 #define CX18_CARD_MAX_AUDIO_INPUTS 3
-#define CX18_CARD_MAX_TUNERS  	   2
+#define CX18_CARD_MAX_TUNERS	   2
 
 /* V4L2 capability aliases */
 #define CX18_CAP_ENCODER (V4L2_CAP_VIDEO_CAPTURE | V4L2_CAP_TUNER | \
@@ -51,7 +51,7 @@
 			  V4L2_CAP_SLICED_VBI_CAPTURE)
 
 struct cx18_card_video_input {
-	u8  video_type; 	/* video input type */
+	u8  video_type;		/* video input type */
 	u8  audio_index;	/* index in cx18_card_audio_input array */
 	u32 video_input;	/* hardware video input */
 };
@@ -74,7 +74,7 @@ struct cx18_card_pci_info {
 /* The mask is the set of bits used by the operation */
 
 struct cx18_gpio_init { /* set initial GPIO DIR and OUT values */
-	u32 direction; 	/* DIR setting. Leave to 0 if no init is needed */
+	u32 direction;	/* DIR setting. Leave to 0 if no init is needed */
 	u32 initial_value;
 };
 
@@ -86,16 +86,16 @@ struct cx18_gpio_i2c_slave_reset {
 	u32 ir_reset_mask;  /* GPIO to reset the Zilog Z8F0811 IR contoller */
 };
 
-struct cx18_gpio_audio_input { 	/* select tuner/line in input */
-	u32 mask; 		/* leave to 0 if not supported */
+struct cx18_gpio_audio_input {	/* select tuner/line in input */
+	u32 mask;		/* leave to 0 if not supported */
 	u32 tuner;
 	u32 linein;
 	u32 radio;
 };
 
 struct cx18_card_tuner {
-	v4l2_std_id std; 	/* standard for which the tuner is suitable */
-	int 	    tuner; 	/* tuner ID (from tuner.h) */
+	v4l2_std_id std;	/* standard for which the tuner is suitable */
+	int	    tuner;	/* tuner ID (from tuner.h) */
 };
 
 struct cx18_card_tuner_i2c {
@@ -128,8 +128,8 @@ struct cx18_card {
 	struct cx18_card_audio_input radio_input;
 
 	/* GPIO card-specific settings */
-	u8 xceive_pin; 		/* XCeive tuner GPIO reset pin */
-	struct cx18_gpio_init 		 gpio_init;
+	u8 xceive_pin;		/* XCeive tuner GPIO reset pin */
+	struct cx18_gpio_init		 gpio_init;
 	struct cx18_gpio_i2c_slave_reset gpio_i2c_slave_reset;
 	struct cx18_gpio_audio_input    gpio_audio_input;
 
diff --git a/drivers/media/pci/cx18/cx18-driver.h b/drivers/media/pci/cx18/cx18-driver.h
index 3492023a8675..0b707faca543 100644
--- a/drivers/media/pci/cx18/cx18-driver.h
+++ b/drivers/media/pci/cx18/cx18-driver.h
@@ -75,8 +75,8 @@
 /* Supported cards */
 #define CX18_CARD_HVR_1600_ESMT	      0	/* Hauppauge HVR 1600 (ESMT memory) */
 #define CX18_CARD_HVR_1600_SAMSUNG    1	/* Hauppauge HVR 1600 (Samsung memory) */
-#define CX18_CARD_COMPRO_H900 	      2	/* Compro VideoMate H900 */
-#define CX18_CARD_YUAN_MPC718 	      3	/* Yuan MPC718 */
+#define CX18_CARD_COMPRO_H900	      2	/* Compro VideoMate H900 */
+#define CX18_CARD_YUAN_MPC718	      3	/* Yuan MPC718 */
 #define CX18_CARD_CNXT_RAPTOR_PAL     4	/* Conexant Raptor PAL */
 #define CX18_CARD_TOSHIBA_QOSMIO_DVBT 5 /* Toshiba Qosmio Interal DVB-T/Analog*/
 #define CX18_CARD_LEADTEK_PVR2100     6 /* Leadtek WinFast PVR2100 */
@@ -99,9 +99,9 @@
 #define PCI_DEVICE_ID_CX23418 0x5b7a
 
 /* subsystem vendor ID */
-#define CX18_PCI_ID_HAUPPAUGE 		0x0070
-#define CX18_PCI_ID_COMPRO 		0x185b
-#define CX18_PCI_ID_YUAN 		0x12ab
+#define CX18_PCI_ID_HAUPPAUGE		0x0070
+#define CX18_PCI_ID_COMPRO		0x185b
+#define CX18_PCI_ID_YUAN		0x12ab
 #define CX18_PCI_ID_CONEXANT		0x14f1
 #define CX18_PCI_ID_TOSHIBA		0x1179
 #define CX18_PCI_ID_LEADTEK		0x107D
@@ -260,7 +260,7 @@ struct cx18_options {
 #define CX18_F_M_NEED_SWAP  0	/* mdl buffer data must be endianness swapped */
 
 /* per-stream, s_flags */
-#define CX18_F_S_CLAIMED 	3	/* this stream is claimed */
+#define CX18_F_S_CLAIMED	3	/* this stream is claimed */
 #define CX18_F_S_STREAMING      4	/* the fw is decoding/encoding this stream */
 #define CX18_F_S_INTERNAL_USE	5	/* this stream is used internally (sliced VBI processing) */
 #define CX18_F_S_STREAMOFF	7	/* signal end of stream EOS */
@@ -268,12 +268,12 @@ struct cx18_options {
 #define CX18_F_S_STOPPING	9	/* telling the fw to stop capturing */
 
 /* per-cx18, i_flags */
-#define CX18_F_I_LOADED_FW		0 	/* Loaded firmware 1st time */
-#define CX18_F_I_EOS			4 	/* End of encoder stream */
-#define CX18_F_I_RADIO_USER		5 	/* radio tuner is selected */
-#define CX18_F_I_ENC_PAUSED		13 	/* the encoder is paused */
-#define CX18_F_I_INITED			21 	/* set after first open */
-#define CX18_F_I_FAILED			22 	/* set if first open failed */
+#define CX18_F_I_LOADED_FW		0	/* Loaded firmware 1st time */
+#define CX18_F_I_EOS			4	/* End of encoder stream */
+#define CX18_F_I_RADIO_USER		5	/* radio tuner is selected */
+#define CX18_F_I_ENC_PAUSED		13	/* the encoder is paused */
+#define CX18_F_I_INITED			21	/* set after first open */
+#define CX18_F_I_FAILED			22	/* set if first open failed */
 
 /* These are the VBI types as they appear in the embedded VBI private packets. */
 #define CX18_SLICED_TYPE_TELETEXT_B     (1)
@@ -370,7 +370,7 @@ struct cx18_stream {
 	   is not actually created. */
 	struct video_device video_dev;	/* v4l2_dev is NULL when stream not created */
 	struct cx18_dvb *dvb;		/* DVB / Digital Transport */
-	struct cx18 *cx; 		/* for ease of use */
+	struct cx18 *cx;		/* for ease of use */
 	const char *name;		/* name of the stream */
 	int type;			/* stream type */
 	u32 handle;			/* task handle */
@@ -525,14 +525,14 @@ struct vbi_info {
 	 * into the MPEG PS stream.
 	 *
 	 * In each sliced_mpeg_data[] buffer is:
-	 * 	16 byte MPEG-2 PS Program Pack Header
-	 * 	16 byte MPEG-2 Private Stream 1 PES Header
-	 * 	 4 byte magic number: "itv0" or "ITV0"
-	 * 	 4 byte first  field line mask, if "itv0"
-	 * 	 4 byte second field line mask, if "itv0"
-	 * 	36 lines, if "ITV0"; or <36 lines, if "itv0"; of sliced VBI data
+	 *	16 byte MPEG-2 PS Program Pack Header
+	 *	16 byte MPEG-2 Private Stream 1 PES Header
+	 *	 4 byte magic number: "itv0" or "ITV0"
+	 *	 4 byte first  field line mask, if "itv0"
+	 *	 4 byte second field line mask, if "itv0"
+	 *	36 lines, if "ITV0"; or <36 lines, if "itv0"; of sliced VBI data
 	 *
-	 * 	Each line in the payload is
+	 *	Each line in the payload is
 	 *	 1 byte line header derived from the SDID (WSS, CC, VPS, etc.)
 	 *	42 bytes of line data
 	 *
@@ -583,7 +583,7 @@ struct cx18 {
 	u8 nof_inputs;		/* number of video inputs */
 	u8 nof_audio_inputs;	/* number of audio inputs */
 	u32 v4l2_cap;		/* V4L2 capabilities of card */
-	u32 hw_flags; 		/* Hardware description of the board */
+	u32 hw_flags;		/* Hardware description of the board */
 	unsigned int free_mdl_idx;
 	struct cx18_scb __iomem *scb; /* pointer to SCB */
 	struct mutex epu2apu_mb_lock; /* protect driver to chip mailbox in SCB*/
@@ -602,10 +602,10 @@ struct cx18 {
 	u32 dualwatch_stereo_mode;
 
 	struct mutex serialize_lock;    /* mutex used to serialize open/close/start/stop/ioctl operations */
-	struct cx18_options options; 	/* User options */
+	struct cx18_options options;	/* User options */
 	int stream_buffers[CX18_MAX_STREAMS]; /* # of buffers for each stream */
 	int stream_buf_size[CX18_MAX_STREAMS]; /* Stream buffer size */
-	struct cx18_stream streams[CX18_MAX_STREAMS]; 	/* Stream data */
+	struct cx18_stream streams[CX18_MAX_STREAMS];	/* Stream data */
 	struct snd_cx18_card *alsa; /* ALSA interface for PCM capture stream */
 	void (*pcm_announce_callback)(struct snd_cx18_card *card, u8 *pcm_data,
 				      size_t num_bytes);
diff --git a/drivers/media/pci/cx18/cx18-firmware.c b/drivers/media/pci/cx18/cx18-firmware.c
index 1b34ea1c3730..498a1854b3b0 100644
--- a/drivers/media/pci/cx18/cx18-firmware.c
+++ b/drivers/media/pci/cx18/cx18-firmware.c
@@ -23,65 +23,65 @@
 #include "cx18-cards.h"
 #include <linux/firmware.h>
 
-#define CX18_PROC_SOFT_RESET 		0xc70010
-#define CX18_DDR_SOFT_RESET          	0xc70014
-#define CX18_CLOCK_SELECT1           	0xc71000
-#define CX18_CLOCK_SELECT2           	0xc71004
-#define CX18_HALF_CLOCK_SELECT1      	0xc71008
-#define CX18_HALF_CLOCK_SELECT2      	0xc7100C
-#define CX18_CLOCK_POLARITY1         	0xc71010
-#define CX18_CLOCK_POLARITY2         	0xc71014
-#define CX18_ADD_DELAY_ENABLE1       	0xc71018
-#define CX18_ADD_DELAY_ENABLE2       	0xc7101C
-#define CX18_CLOCK_ENABLE1           	0xc71020
-#define CX18_CLOCK_ENABLE2           	0xc71024
-
-#define CX18_REG_BUS_TIMEOUT_EN      	0xc72024
-
-#define CX18_FAST_CLOCK_PLL_INT      	0xc78000
-#define CX18_FAST_CLOCK_PLL_FRAC     	0xc78004
-#define CX18_FAST_CLOCK_PLL_POST     	0xc78008
-#define CX18_FAST_CLOCK_PLL_PRESCALE 	0xc7800C
+#define CX18_PROC_SOFT_RESET		0xc70010
+#define CX18_DDR_SOFT_RESET		0xc70014
+#define CX18_CLOCK_SELECT1		0xc71000
+#define CX18_CLOCK_SELECT2		0xc71004
+#define CX18_HALF_CLOCK_SELECT1		0xc71008
+#define CX18_HALF_CLOCK_SELECT2		0xc7100C
+#define CX18_CLOCK_POLARITY1		0xc71010
+#define CX18_CLOCK_POLARITY2		0xc71014
+#define CX18_ADD_DELAY_ENABLE1		0xc71018
+#define CX18_ADD_DELAY_ENABLE2		0xc7101C
+#define CX18_CLOCK_ENABLE1		0xc71020
+#define CX18_CLOCK_ENABLE2		0xc71024
+
+#define CX18_REG_BUS_TIMEOUT_EN		0xc72024
+
+#define CX18_FAST_CLOCK_PLL_INT		0xc78000
+#define CX18_FAST_CLOCK_PLL_FRAC	0xc78004
+#define CX18_FAST_CLOCK_PLL_POST	0xc78008
+#define CX18_FAST_CLOCK_PLL_PRESCALE	0xc7800C
 #define CX18_FAST_CLOCK_PLL_ADJUST_BANDWIDTH 0xc78010
 
-#define CX18_SLOW_CLOCK_PLL_INT      	0xc78014
-#define CX18_SLOW_CLOCK_PLL_FRAC     	0xc78018
-#define CX18_SLOW_CLOCK_PLL_POST     	0xc7801C
+#define CX18_SLOW_CLOCK_PLL_INT		0xc78014
+#define CX18_SLOW_CLOCK_PLL_FRAC	0xc78018
+#define CX18_SLOW_CLOCK_PLL_POST	0xc7801C
 #define CX18_MPEG_CLOCK_PLL_INT		0xc78040
 #define CX18_MPEG_CLOCK_PLL_FRAC	0xc78044
 #define CX18_MPEG_CLOCK_PLL_POST	0xc78048
-#define CX18_PLL_POWER_DOWN          	0xc78088
+#define CX18_PLL_POWER_DOWN		0xc78088
 #define CX18_SW1_INT_STATUS             0xc73104
 #define CX18_SW1_INT_ENABLE_PCI         0xc7311C
 #define CX18_SW2_INT_SET                0xc73140
 #define CX18_SW2_INT_STATUS             0xc73144
-#define CX18_ADEC_CONTROL            	0xc78120
+#define CX18_ADEC_CONTROL		0xc78120
 
-#define CX18_DDR_REQUEST_ENABLE      	0xc80000
-#define CX18_DDR_CHIP_CONFIG         	0xc80004
-#define CX18_DDR_REFRESH            	0xc80008
-#define CX18_DDR_TIMING1             	0xc8000C
-#define CX18_DDR_TIMING2             	0xc80010
+#define CX18_DDR_REQUEST_ENABLE		0xc80000
+#define CX18_DDR_CHIP_CONFIG		0xc80004
+#define CX18_DDR_REFRESH		0xc80008
+#define CX18_DDR_TIMING1		0xc8000C
+#define CX18_DDR_TIMING2		0xc80010
 #define CX18_DDR_POWER_REG		0xc8001C
 
-#define CX18_DDR_TUNE_LANE           	0xc80048
-#define CX18_DDR_INITIAL_EMRS        	0xc80054
-#define CX18_DDR_MB_PER_ROW_7        	0xc8009C
-#define CX18_DDR_BASE_63_ADDR        	0xc804FC
-
-#define CX18_WMB_CLIENT02            	0xc90108
-#define CX18_WMB_CLIENT05            	0xc90114
-#define CX18_WMB_CLIENT06            	0xc90118
-#define CX18_WMB_CLIENT07            	0xc9011C
-#define CX18_WMB_CLIENT08            	0xc90120
-#define CX18_WMB_CLIENT09            	0xc90124
-#define CX18_WMB_CLIENT10            	0xc90128
-#define CX18_WMB_CLIENT11            	0xc9012C
-#define CX18_WMB_CLIENT12            	0xc90130
-#define CX18_WMB_CLIENT13            	0xc90134
-#define CX18_WMB_CLIENT14            	0xc90138
-
-#define CX18_DSP0_INTERRUPT_MASK     	0xd0004C
+#define CX18_DDR_TUNE_LANE		0xc80048
+#define CX18_DDR_INITIAL_EMRS		0xc80054
+#define CX18_DDR_MB_PER_ROW_7		0xc8009C
+#define CX18_DDR_BASE_63_ADDR		0xc804FC
+
+#define CX18_WMB_CLIENT02		0xc90108
+#define CX18_WMB_CLIENT05		0xc90114
+#define CX18_WMB_CLIENT06		0xc90118
+#define CX18_WMB_CLIENT07		0xc9011C
+#define CX18_WMB_CLIENT08		0xc90120
+#define CX18_WMB_CLIENT09		0xc90124
+#define CX18_WMB_CLIENT10		0xc90128
+#define CX18_WMB_CLIENT11		0xc9012C
+#define CX18_WMB_CLIENT12		0xc90130
+#define CX18_WMB_CLIENT13		0xc90134
+#define CX18_WMB_CLIENT14		0xc90138
+
+#define CX18_DSP0_INTERRUPT_MASK	0xd0004C
 
 #define APU_ROM_SYNC1 0x6D676553 /* "mgeS" */
 #define APU_ROM_SYNC2 0x72646548 /* "rdeH" */
@@ -229,7 +229,7 @@ void cx18_init_power(struct cx18 *cx, int lowpwr)
 	 * would ideally be:
 	 *
 	 * NTSC Color subcarrier freq * 8 =
-	 * 	4.5 MHz/286 * 455/2 * 8 = 28.63636363... MHz
+	 *	4.5 MHz/286 * 455/2 * 8 = 28.63636363... MHz
 	 *
 	 * The accidents of history and rationale that explain from where this
 	 * combination of magic numbers originate can be found in:
diff --git a/drivers/media/pci/cx18/cx18-mailbox.c b/drivers/media/pci/cx18/cx18-mailbox.c
index 763f960fc918..f66dd63e1994 100644
--- a/drivers/media/pci/cx18/cx18-mailbox.c
+++ b/drivers/media/pci/cx18/cx18-mailbox.c
@@ -35,7 +35,7 @@ struct cx18_api_info {
 	u32 cmd;
 	u8 flags;		/* Flags, see above */
 	u8 rpu;			/* Processing unit */
-	const char *name; 	/* The name of the command */
+	const char *name;	/* The name of the command */
 };
 
 #define API_ENTRY(rpu, x, f) { (x), (f), (rpu), #x }
@@ -43,9 +43,9 @@ struct cx18_api_info {
 static const struct cx18_api_info api_info[] = {
 	/* MPEG encoder API */
 	API_ENTRY(CPU, CX18_CPU_SET_CHANNEL_TYPE,		0),
-	API_ENTRY(CPU, CX18_EPU_DEBUG, 				0),
-	API_ENTRY(CPU, CX18_CREATE_TASK, 			0),
-	API_ENTRY(CPU, CX18_DESTROY_TASK, 			0),
+	API_ENTRY(CPU, CX18_EPU_DEBUG,				0),
+	API_ENTRY(CPU, CX18_CREATE_TASK,			0),
+	API_ENTRY(CPU, CX18_DESTROY_TASK,			0),
 	API_ENTRY(CPU, CX18_CPU_CAPTURE_START,                  API_SLOW),
 	API_ENTRY(CPU, CX18_CPU_CAPTURE_STOP,                   API_SLOW),
 	API_ENTRY(CPU, CX18_CPU_CAPTURE_PAUSE,                  0),
diff --git a/drivers/media/pci/cx18/cx18-streams.c b/drivers/media/pci/cx18/cx18-streams.c
index b9c6831c21c3..a594cfdeca20 100644
--- a/drivers/media/pci/cx18/cx18-streams.c
+++ b/drivers/media/pci/cx18/cx18-streams.c
@@ -29,7 +29,7 @@
 #include "cx18-scb.h"
 #include "cx18-dvb.h"
 
-#define CX18_DSP0_INTERRUPT_MASK     	0xd0004C
+#define CX18_DSP0_INTERRUPT_MASK	0xd0004C
 
 static const struct v4l2_file_operations cx18_v4l2_enc_fops = {
 	.owner = THIS_MODULE,
diff --git a/drivers/media/pci/cx18/cx18-vbi.c b/drivers/media/pci/cx18/cx18-vbi.c
index 72c74d60c6fb..81f1e27436fd 100644
--- a/drivers/media/pci/cx18/cx18-vbi.c
+++ b/drivers/media/pci/cx18/cx18-vbi.c
@@ -47,7 +47,7 @@ static void copy_vbi_data(struct cx18 *cx, int lines, u32 pts_stamp)
 		0x00, 0x00, 0x01, 0xbd,		    /* Priv Stream 1 start */
 		0x00, 0x1a,			    /* length */
 		0x84, 0x80, 0x07,		    /* flags, hdr data len */
-		0x21, 0x00, 0x5d, 0x63, 0xa7, 	    /* PTS, markers */
+		0x21, 0x00, 0x5d, 0x63, 0xa7,	    /* PTS, markers */
 		0xff, 0xff			    /* stuffing */
 	};
 	const int sd = sizeof(mpeg_hdr_data);	/* start of vbi data */
diff --git a/drivers/media/pci/cx18/cx23418.h b/drivers/media/pci/cx18/cx23418.h
index 901ed7fac10f..15205b662952 100644
--- a/drivers/media/pci/cx18/cx23418.h
+++ b/drivers/media/pci/cx18/cx23418.h
@@ -19,10 +19,10 @@
 
 #include <media/drv-intf/cx2341x.h>
 
-#define MGR_CMD_MASK            		0x40000000
+#define MGR_CMD_MASK				0x40000000
 /* The MSB of the command code indicates that this is the completion of a
    command */
-#define MGR_CMD_MASK_ACK        		(MGR_CMD_MASK | 0x80000000)
+#define MGR_CMD_MASK_ACK			(MGR_CMD_MASK | 0x80000000)
 
 /* Description: This command creates a new instance of a certain task
    IN[0]  - Task ID. This is one of the XPU_CMD_MASK_YYY where XPU is
@@ -30,26 +30,26 @@
    OUT[0] - Task handle. This handle is passed along with commands to
 	    dispatch to the right instance of the task
    ReturnCode - One of the ERR_SYS_... */
-#define CX18_CREATE_TASK      			(MGR_CMD_MASK | 0x0001)
+#define CX18_CREATE_TASK			(MGR_CMD_MASK | 0x0001)
 
 /* Description: This command destroys an instance of a task
    IN[0] - Task handle. Hanlde of the task to destroy
    ReturnCode - One of the ERR_SYS_... */
-#define CX18_DESTROY_TASK     			(MGR_CMD_MASK | 0x0002)
+#define CX18_DESTROY_TASK			(MGR_CMD_MASK | 0x0002)
 
 /* All commands for CPU have the following mask set */
-#define CPU_CMD_MASK                        	0x20000000
-#define CPU_CMD_MASK_DEBUG       		(CPU_CMD_MASK | 0x00000000)
-#define CPU_CMD_MASK_ACK                    	(CPU_CMD_MASK | 0x80000000)
-#define CPU_CMD_MASK_CAPTURE                	(CPU_CMD_MASK | 0x00020000)
-#define CPU_CMD_MASK_TS                     	(CPU_CMD_MASK | 0x00040000)
+#define CPU_CMD_MASK				0x20000000
+#define CPU_CMD_MASK_DEBUG			(CPU_CMD_MASK | 0x00000000)
+#define CPU_CMD_MASK_ACK			(CPU_CMD_MASK | 0x80000000)
+#define CPU_CMD_MASK_CAPTURE			(CPU_CMD_MASK | 0x00020000)
+#define CPU_CMD_MASK_TS				(CPU_CMD_MASK | 0x00040000)
 
-#define EPU_CMD_MASK                        	0x02000000
-#define EPU_CMD_MASK_DEBUG       		(EPU_CMD_MASK | 0x000000)
-#define EPU_CMD_MASK_DE                     	(EPU_CMD_MASK | 0x040000)
+#define EPU_CMD_MASK				0x02000000
+#define EPU_CMD_MASK_DEBUG			(EPU_CMD_MASK | 0x000000)
+#define EPU_CMD_MASK_DE				(EPU_CMD_MASK | 0x040000)
 
-#define APU_CMD_MASK 				0x10000000
-#define APU_CMD_MASK_ACK 			(APU_CMD_MASK | 0x80000000)
+#define APU_CMD_MASK				0x10000000
+#define APU_CMD_MASK_ACK			(APU_CMD_MASK | 0x80000000)
 
 #define CX18_APU_ENCODING_METHOD_MPEG		(0 << 28)
 #define CX18_APU_ENCODING_METHOD_AC3		(1 << 28)
@@ -67,7 +67,7 @@
 
 /* Description: Command APU to reset the AI
    ReturnCode - ??? */
-#define CX18_APU_RESETAI 			(APU_CMD_MASK | 0x05)
+#define CX18_APU_RESETAI			(APU_CMD_MASK | 0x05)
 
 /* Description: This command indicates that a Memory Descriptor List has been
    filled with the requested channel type
@@ -75,13 +75,13 @@
    IN[1] - Offset of the MDL_ACK from the beginning of the local DDR.
    IN[2] - Number of CNXT_MDL_ACK structures in the array pointed to by IN[1]
    ReturnCode - One of the ERR_DE_... */
-#define CX18_EPU_DMA_DONE              		(EPU_CMD_MASK_DE | 0x0001)
+#define CX18_EPU_DMA_DONE			(EPU_CMD_MASK_DE | 0x0001)
 
 /* Something interesting happened
    IN[0] - A value to log
    IN[1] - An offset of a string in the MiniMe memory;
 	   0/zero/NULL means "I have nothing to say" */
-#define CX18_EPU_DEBUG 				(EPU_CMD_MASK_DEBUG | 0x0003)
+#define CX18_EPU_DEBUG				(EPU_CMD_MASK_DEBUG | 0x0003)
 
 /* Reads memory/registers (32-bit)
    IN[0] - Address
@@ -91,40 +91,40 @@
 /* Description: This command starts streaming with the set channel type
    IN[0] - Task handle. Handle of the task to start
    ReturnCode - One of the ERR_CAPTURE_... */
-#define CX18_CPU_CAPTURE_START               	(CPU_CMD_MASK_CAPTURE | 0x0002)
+#define CX18_CPU_CAPTURE_START			(CPU_CMD_MASK_CAPTURE | 0x0002)
 
 /* Description: This command stops streaming with the set channel type
    IN[0] - Task handle. Handle of the task to stop
    IN[1] - 0 = stop at end of GOP, 1 = stop at end of frame (MPEG only)
    ReturnCode - One of the ERR_CAPTURE_... */
-#define CX18_CPU_CAPTURE_STOP                	(CPU_CMD_MASK_CAPTURE | 0x0003)
+#define CX18_CPU_CAPTURE_STOP			(CPU_CMD_MASK_CAPTURE | 0x0003)
 
 /* Description: This command pauses streaming with the set channel type
    IN[0] - Task handle. Handle of the task to pause
    ReturnCode - One of the ERR_CAPTURE_... */
-#define CX18_CPU_CAPTURE_PAUSE               	(CPU_CMD_MASK_CAPTURE | 0x0007)
+#define CX18_CPU_CAPTURE_PAUSE			(CPU_CMD_MASK_CAPTURE | 0x0007)
 
 /* Description: This command resumes streaming with the set channel type
    IN[0] - Task handle. Handle of the task to resume
    ReturnCode - One of the ERR_CAPTURE_... */
-#define CX18_CPU_CAPTURE_RESUME              	(CPU_CMD_MASK_CAPTURE | 0x0008)
-
-#define CAPTURE_CHANNEL_TYPE_NONE  		0
-#define CAPTURE_CHANNEL_TYPE_MPEG  		1
-#define CAPTURE_CHANNEL_TYPE_INDEX 		2
-#define CAPTURE_CHANNEL_TYPE_YUV   		3
-#define CAPTURE_CHANNEL_TYPE_PCM   		4
-#define CAPTURE_CHANNEL_TYPE_VBI   		5
+#define CX18_CPU_CAPTURE_RESUME			(CPU_CMD_MASK_CAPTURE | 0x0008)
+
+#define CAPTURE_CHANNEL_TYPE_NONE		0
+#define CAPTURE_CHANNEL_TYPE_MPEG		1
+#define CAPTURE_CHANNEL_TYPE_INDEX		2
+#define CAPTURE_CHANNEL_TYPE_YUV		3
+#define CAPTURE_CHANNEL_TYPE_PCM		4
+#define CAPTURE_CHANNEL_TYPE_VBI		5
 #define CAPTURE_CHANNEL_TYPE_SLICED_VBI		6
 #define CAPTURE_CHANNEL_TYPE_TS			7
-#define CAPTURE_CHANNEL_TYPE_MAX   		15
+#define CAPTURE_CHANNEL_TYPE_MAX		15
 
 /* Description: This command sets the channel type. This can only be done
    when stopped.
    IN[0] - Task handle. Handle of the task to start
    IN[1] - Channel Type. See Below.
    ReturnCode - One of the ERR_CAPTURE_... */
-#define CX18_CPU_SET_CHANNEL_TYPE      		(CPU_CMD_MASK_CAPTURE + 1)
+#define CX18_CPU_SET_CHANNEL_TYPE		(CPU_CMD_MASK_CAPTURE + 1)
 
 /* Description: Set stream output type
    IN[0] - task handle. Handle of the task to start
@@ -140,7 +140,7 @@
    IN[4] - reserved
    IN[5] - frame rate, 0 - 29.97f/s, 1 - 25f/s
    ReturnCode - One of the ERR_CAPTURE_... */
-#define CX18_CPU_SET_VIDEO_IN                	(CPU_CMD_MASK_CAPTURE | 0x0004)
+#define CX18_CPU_SET_VIDEO_IN			(CPU_CMD_MASK_CAPTURE | 0x0004)
 
 /* Description: Set video frame rate
    IN[0] - task handle. Handle of the task to start
@@ -149,7 +149,7 @@
    IN[3] - video peak rate
    IN[4] - system mux rate
    ReturnCode - One of the ERR_CAPTURE_... */
-#define CX18_CPU_SET_VIDEO_RATE              	(CPU_CMD_MASK_CAPTURE | 0x0005)
+#define CX18_CPU_SET_VIDEO_RATE			(CPU_CMD_MASK_CAPTURE | 0x0005)
 
 /* Description: Set video output resolution
    IN[0] - task handle
@@ -166,7 +166,7 @@
 				3 = horizontal/vertical, 4 = diagonal
    IN[3] - strength, temporal 0 - 31, spatial 0 - 15
    ReturnCode - One of the ERR_CAPTURE_... */
-#define CX18_CPU_SET_FILTER_PARAM            	(CPU_CMD_MASK_CAPTURE | 0x0009)
+#define CX18_CPU_SET_FILTER_PARAM		(CPU_CMD_MASK_CAPTURE | 0x0009)
 
 /* Description: This command set spatial filter type
    IN[0] - Task handle.
@@ -174,7 +174,7 @@
 		      3 = 2D H/V separable, 4 = 2D symmetric non-separable
    IN[2] - chroma type: 0 - disable, 1 = 1D horizontal
    ReturnCode - One of the ERR_CAPTURE_... */
-#define CX18_CPU_SET_SPATIAL_FILTER_TYPE     	(CPU_CMD_MASK_CAPTURE | 0x000C)
+#define CX18_CPU_SET_SPATIAL_FILTER_TYPE	(CPU_CMD_MASK_CAPTURE | 0x000C)
 
 /* Description: This command set coring levels for median filter
    IN[0] - Task handle.
@@ -183,16 +183,16 @@
    IN[3] - chroma_high
    IN[4] - chroma_low
    ReturnCode - One of the ERR_CAPTURE_... */
-#define CX18_CPU_SET_MEDIAN_CORING           	(CPU_CMD_MASK_CAPTURE | 0x000E)
+#define CX18_CPU_SET_MEDIAN_CORING		(CPU_CMD_MASK_CAPTURE | 0x000E)
 
 /* Description: This command set the picture type mask for index file
    IN[0] - Task handle (ignored by firmware)
-   IN[1] - 	0 = disable index file output
+   IN[1] -	0 = disable index file output
 			1 = output I picture
 			2 = P picture
 			4 = B picture
 			other = illegal */
-#define CX18_CPU_SET_INDEXTABLE         	(CPU_CMD_MASK_CAPTURE | 0x0010)
+#define CX18_CPU_SET_INDEXTABLE			(CPU_CMD_MASK_CAPTURE | 0x0010)
 
 /* Description: Set audio parameters
    IN[0] - task handle. Handle of the task to start
@@ -218,7 +218,7 @@
 /* Description: Set stream output type
    IN[0] - task handle. Handle of the task to start
    IN[1] - subType
-	    SET_INITIAL_SCR      		1
+	    SET_INITIAL_SCR			1
 	    SET_QUALITY_MODE            2
 	    SET_VIM_PROTECT_MODE        3
 	    SET_PTS_CORRECTION          4
@@ -311,7 +311,7 @@
 			bit 0:	output user data, 1 - enable
 			bit 1:	output private stream, 1 - enable
 			bit 2:	mux option, 0 - in GOP, 1 - in picture
-			bit[7:0] 	private stream ID
+			bit[7:0]	private stream ID
    IN[5] - insertion period while mux option is in picture
    ReturnCode - VBI data offset */
 #define CX18_CPU_SET_SLICED_VBI_PARAM		(CPU_CMD_MASK_CAPTURE | 0x0020)
@@ -344,13 +344,13 @@
 #define CX18_CPU_SET_VFC_PARAM                  (CPU_CMD_MASK_CAPTURE | 0x0023)
 
 /* Below is the list of commands related to the data exchange */
-#define CPU_CMD_MASK_DE 			(CPU_CMD_MASK | 0x040000)
+#define CPU_CMD_MASK_DE				(CPU_CMD_MASK | 0x040000)
 
 /* Description: This command provides the physical base address of the local
    DDR as viewed by EPU
    IN[0] - Physical offset where EPU has the local DDR mapped
    ReturnCode - One of the ERR_DE_... */
-#define CPU_CMD_DE_SetBase 			(CPU_CMD_MASK_DE | 0x0001)
+#define CPU_CMD_DE_SetBase			(CPU_CMD_MASK_DE | 0x0001)
 
 /* Description: This command provides the offsets in the device memory where
    the 2 cx18_mdl_ack blocks reside
@@ -360,7 +360,7 @@
    IN[2] - Offset of the second cx18_mdl_ack from the beginning of the
 	   local DDR.
    ReturnCode - One of the ERR_DE_... */
-#define CX18_CPU_DE_SET_MDL_ACK                	(CPU_CMD_MASK_DE | 0x0002)
+#define CX18_CPU_DE_SET_MDL_ACK			(CPU_CMD_MASK_DE | 0x0002)
 
 /* Description: This command provides the offset to a Memory Descriptor List
    IN[0] - Task handle. Handle of the task to start
@@ -369,13 +369,13 @@
    IN[3] - Buffer ID
    IN[4] - Total buffer length
    ReturnCode - One of the ERR_DE_... */
-#define CX18_CPU_DE_SET_MDL                   	(CPU_CMD_MASK_DE | 0x0005)
+#define CX18_CPU_DE_SET_MDL			(CPU_CMD_MASK_DE | 0x0005)
 
 /* Description: This command requests return of all current Memory
    Descriptor Lists to the driver
    IN[0] - Task handle. Handle of the task to start
    ReturnCode - One of the ERR_DE_... */
-#define CX18_CPU_DE_RELEASE_MDL               	(CPU_CMD_MASK_DE | 0x0006)
+#define CX18_CPU_DE_RELEASE_MDL			(CPU_CMD_MASK_DE | 0x0006)
 
 /* Description: This command signals the cpu that the dat buffer has been
    consumed and ready for re-use.
diff --git a/drivers/media/pci/cx23885/cimax2.c b/drivers/media/pci/cx23885/cimax2.c
index 96f75f658b1b..19c005f4a57d 100644
--- a/drivers/media/pci/cx23885/cimax2.c
+++ b/drivers/media/pci/cx23885/cimax2.c
@@ -54,7 +54,7 @@
 #define NETUP_CI_CTL		0x04
 #define NETUP_CI_RD		1
 
-#define NETUP_IRQ_DETAM 	0x1
+#define NETUP_IRQ_DETAM		0x1
 #define NETUP_IRQ_IRQAM		0x4
 
 static unsigned int ci_dbg;
diff --git a/drivers/media/pci/cx23885/cx23885-video.c b/drivers/media/pci/cx23885/cx23885-video.c
index ecc580af0148..a03dcb662953 100644
--- a/drivers/media/pci/cx23885/cx23885-video.c
+++ b/drivers/media/pci/cx23885/cx23885-video.c
@@ -1146,7 +1146,7 @@ static struct video_device cx23885_vbi_template;
 static struct video_device cx23885_video_template = {
 	.name                 = "cx23885-video",
 	.fops                 = &video_fops,
-	.ioctl_ops 	      = &video_ioctl_ops,
+	.ioctl_ops	      = &video_ioctl_ops,
 	.tvnorms              = CX23885_NORMS,
 };
 
diff --git a/drivers/media/pci/cx23885/cx23885.h b/drivers/media/pci/cx23885/cx23885.h
index 6aab713e0476..2a17209eb4f6 100644
--- a/drivers/media/pci/cx23885/cx23885.h
+++ b/drivers/media/pci/cx23885/cx23885.h
@@ -357,7 +357,7 @@ struct cx23885_audio_dev {
 
 struct cx23885_dev {
 	atomic_t                   refcount;
-	struct v4l2_device 	   v4l2_dev;
+	struct v4l2_device	   v4l2_dev;
 	struct v4l2_ctrl_handler   ctrl_handler;
 
 	/* pci stuff */
@@ -407,7 +407,7 @@ struct cx23885_dev {
 	unsigned int               tuner_bus;
 	unsigned int               radio_type;
 	unsigned char              radio_addr;
-	struct v4l2_subdev 	   *sd_cx25840;
+	struct v4l2_subdev	   *sd_cx25840;
 	struct work_struct	   cx25840_work;
 
 	/* Infrared */
diff --git a/drivers/media/pci/cx23885/cx23888-ir.c b/drivers/media/pci/cx23885/cx23888-ir.c
index b0d4e4437b87..00329f668b59 100644
--- a/drivers/media/pci/cx23885/cx23888-ir.c
+++ b/drivers/media/pci/cx23885/cx23888-ir.c
@@ -29,7 +29,7 @@ static unsigned int ir_888_debug;
 module_param(ir_888_debug, int, 0644);
 MODULE_PARM_DESC(ir_888_debug, "enable debug messages [CX23888 IR controller]");
 
-#define CX23888_IR_REG_BASE 	0x170000
+#define CX23888_IR_REG_BASE	0x170000
 /*
  * These CX23888 register offsets have a straightforward one to one mapping
  * to the CX23885 register offsets of 0x200 through 0x218
diff --git a/drivers/media/pci/ivtv/ivtv-cards.h b/drivers/media/pci/ivtv/ivtv-cards.h
index 06e7b4ed6444..1557a6ea4dd9 100644
--- a/drivers/media/pci/ivtv/ivtv-cards.h
+++ b/drivers/media/pci/ivtv/ivtv-cards.h
@@ -22,26 +22,26 @@
 #define IVTV_CARDS_H
 
 /* Supported cards */
-#define IVTV_CARD_PVR_250 	      0	/* WinTV PVR 250 */
-#define IVTV_CARD_PVR_350 	      1	/* encoder, decoder, tv-out */
-#define IVTV_CARD_PVR_150 	      2	/* WinTV PVR 150 and PVR 500 (really just two
+#define IVTV_CARD_PVR_250	      0	/* WinTV PVR 250 */
+#define IVTV_CARD_PVR_350	      1	/* encoder, decoder, tv-out */
+#define IVTV_CARD_PVR_150	      2	/* WinTV PVR 150 and PVR 500 (really just two
 					   PVR150s on one PCI board) */
-#define IVTV_CARD_M179    	      3	/* AVerMedia M179 (encoder only) */
-#define IVTV_CARD_MPG600  	      4	/* Kuroutoshikou ITVC16-STVLP/YUAN MPG600, encoder only */
-#define IVTV_CARD_MPG160  	      5	/* Kuroutoshikou ITVC15-STVLP/YUAN MPG160
+#define IVTV_CARD_M179		      3	/* AVerMedia M179 (encoder only) */
+#define IVTV_CARD_MPG600	      4	/* Kuroutoshikou ITVC16-STVLP/YUAN MPG600, encoder only */
+#define IVTV_CARD_MPG160	      5	/* Kuroutoshikou ITVC15-STVLP/YUAN MPG160
 					   cx23415 based, but does not have tv-out */
-#define IVTV_CARD_PG600 	      6	/* YUAN PG600/DIAMONDMM PVR-550 based on the CX Falcon 2 */
-#define IVTV_CARD_AVC2410 	      7	/* Adaptec AVC-2410 */
-#define IVTV_CARD_AVC2010 	      8	/* Adaptec AVD-2010 (No Tuner) */
-#define IVTV_CARD_TG5000TV   	      9 /* NAGASE TRANSGEAR 5000TV, encoder only */
+#define IVTV_CARD_PG600		      6	/* YUAN PG600/DIAMONDMM PVR-550 based on the CX Falcon 2 */
+#define IVTV_CARD_AVC2410	      7	/* Adaptec AVC-2410 */
+#define IVTV_CARD_AVC2010	      8	/* Adaptec AVD-2010 (No Tuner) */
+#define IVTV_CARD_TG5000TV	      9 /* NAGASE TRANSGEAR 5000TV, encoder only */
 #define IVTV_CARD_VA2000MAX_SNT6     10 /* VA2000MAX-STN6 */
-#define IVTV_CARD_CX23416GYC 	     11 /* Kuroutoshikou CX23416GYC-STVLP (Yuan MPG600GR OEM) */
-#define IVTV_CARD_GV_MVPRX   	     12 /* I/O Data GV-MVP/RX, RX2, RX2W */
-#define IVTV_CARD_GV_MVPRX2E 	     13 /* I/O Data GV-MVP/RX2E */
+#define IVTV_CARD_CX23416GYC	     11 /* Kuroutoshikou CX23416GYC-STVLP (Yuan MPG600GR OEM) */
+#define IVTV_CARD_GV_MVPRX	     12 /* I/O Data GV-MVP/RX, RX2, RX2W */
+#define IVTV_CARD_GV_MVPRX2E	     13 /* I/O Data GV-MVP/RX2E */
 #define IVTV_CARD_GOTVIEW_PCI_DVD    14	/* GotView PCI DVD */
 #define IVTV_CARD_GOTVIEW_PCI_DVD2   15	/* GotView PCI DVD2 */
 #define IVTV_CARD_YUAN_MPC622        16	/* Yuan MPC622 miniPCI */
-#define IVTV_CARD_DCTMTVP1 	     17 /* DIGITAL COWBOY DCT-MTVP1 */
+#define IVTV_CARD_DCTMTVP1	     17 /* DIGITAL COWBOY DCT-MTVP1 */
 #define IVTV_CARD_PG600V2	     18 /* Yuan PG600V2/GotView PCI DVD Lite */
 #define IVTV_CARD_CLUB3D	     19 /* Club3D ZAP-TV1x01 */
 #define IVTV_CARD_AVERTV_MCE116	     20 /* AVerTV MCE 116 Plus */
@@ -52,7 +52,7 @@
 #define IVTV_CARD_BUFFALO_MV5L       25 /* Buffalo PC-MV5L/PCI card */
 #define IVTV_CARD_AVER_ULTRA1500MCE  26 /* AVerMedia UltraTV 1500 MCE */
 #define IVTV_CARD_KIKYOU             27 /* Sony VAIO Giga Pocket (ENX Kikyou) */
-#define IVTV_CARD_LAST 		     27
+#define IVTV_CARD_LAST		     27
 
 /* Variants of existing cards but with the same PCI IDs. The driver
    detects these based on other device information.
@@ -61,7 +61,7 @@
    must be adjusted accordingly. */
 
 /* PVR-350 V1 (uses saa7114) */
-#define IVTV_CARD_PVR_350_V1 	     (IVTV_CARD_LAST+1)
+#define IVTV_CARD_PVR_350_V1	     (IVTV_CARD_LAST+1)
 /* 2 variants of Kuroutoshikou CX23416GYC-STVLP (Yuan MPG600GR OEM) */
 #define IVTV_CARD_CX23416GYC_NOGR    (IVTV_CARD_LAST+2)
 #define IVTV_CARD_CX23416GYC_NOGRYCS (IVTV_CARD_LAST+3)
@@ -72,22 +72,22 @@
 #define PCI_DEVICE_ID_IVTV16 0x0016
 
 /* subsystem vendor ID */
-#define IVTV_PCI_ID_HAUPPAUGE 		0x0070
-#define IVTV_PCI_ID_HAUPPAUGE_ALT1 	0x0270
-#define IVTV_PCI_ID_HAUPPAUGE_ALT2 	0x4070
-#define IVTV_PCI_ID_ADAPTEC 		0x9005
-#define IVTV_PCI_ID_ASUSTEK 		0x1043
-#define IVTV_PCI_ID_AVERMEDIA 		0x1461
+#define IVTV_PCI_ID_HAUPPAUGE		0x0070
+#define IVTV_PCI_ID_HAUPPAUGE_ALT1	0x0270
+#define IVTV_PCI_ID_HAUPPAUGE_ALT2	0x4070
+#define IVTV_PCI_ID_ADAPTEC		0x9005
+#define IVTV_PCI_ID_ASUSTEK		0x1043
+#define IVTV_PCI_ID_AVERMEDIA		0x1461
 #define IVTV_PCI_ID_YUAN1		0x12ab
-#define IVTV_PCI_ID_YUAN2 		0xff01
-#define IVTV_PCI_ID_YUAN3 		0xffab
-#define IVTV_PCI_ID_YUAN4 		0xfbab
-#define IVTV_PCI_ID_DIAMONDMM 		0xff92
-#define IVTV_PCI_ID_IODATA 		0x10fc
-#define IVTV_PCI_ID_MELCO 		0x1154
+#define IVTV_PCI_ID_YUAN2		0xff01
+#define IVTV_PCI_ID_YUAN3		0xffab
+#define IVTV_PCI_ID_YUAN4		0xfbab
+#define IVTV_PCI_ID_DIAMONDMM		0xff92
+#define IVTV_PCI_ID_IODATA		0x10fc
+#define IVTV_PCI_ID_MELCO		0x1154
 #define IVTV_PCI_ID_GOTVIEW1		0xffac
-#define IVTV_PCI_ID_GOTVIEW2 		0xffad
-#define IVTV_PCI_ID_SONY 		0x104d
+#define IVTV_PCI_ID_GOTVIEW2		0xffad
+#define IVTV_PCI_ID_SONY		0x104d
 
 /* hardware flags, no gaps allowed */
 #define IVTV_HW_CX25840			(1 << 0)
@@ -122,20 +122,20 @@
 
 /* video inputs */
 #define	IVTV_CARD_INPUT_VID_TUNER	1
-#define	IVTV_CARD_INPUT_SVIDEO1 	2
-#define	IVTV_CARD_INPUT_SVIDEO2 	3
-#define	IVTV_CARD_INPUT_COMPOSITE1 	4
-#define	IVTV_CARD_INPUT_COMPOSITE2 	5
-#define	IVTV_CARD_INPUT_COMPOSITE3 	6
+#define	IVTV_CARD_INPUT_SVIDEO1		2
+#define	IVTV_CARD_INPUT_SVIDEO2		3
+#define	IVTV_CARD_INPUT_COMPOSITE1	4
+#define	IVTV_CARD_INPUT_COMPOSITE2	5
+#define	IVTV_CARD_INPUT_COMPOSITE3	6
 
 /* audio inputs */
 #define	IVTV_CARD_INPUT_AUD_TUNER	1
-#define	IVTV_CARD_INPUT_LINE_IN1 	2
-#define	IVTV_CARD_INPUT_LINE_IN2 	3
+#define	IVTV_CARD_INPUT_LINE_IN1	2
+#define	IVTV_CARD_INPUT_LINE_IN2	3
 
 #define IVTV_CARD_MAX_VIDEO_INPUTS 6
 #define IVTV_CARD_MAX_AUDIO_INPUTS 3
-#define IVTV_CARD_MAX_TUNERS  	   3
+#define IVTV_CARD_MAX_TUNERS	   3
 
 /* SAA71XX HW inputs */
 #define IVTV_SAA71XX_COMPOSITE0 0
@@ -172,7 +172,7 @@
 			  V4L2_CAP_SLICED_VBI_OUTPUT | V4L2_CAP_VIDEO_OUTPUT_OVERLAY)
 
 struct ivtv_card_video_input {
-	u8  video_type; 	/* video input type */
+	u8  video_type;		/* video input type */
 	u8  audio_index;	/* index in ivtv_card_audio_input array */
 	u16 video_input;	/* hardware video input */
 };
@@ -199,55 +199,55 @@ struct ivtv_card_pci_info {
 
 /* The mask is the set of bits used by the operation */
 
-struct ivtv_gpio_init { 	/* set initial GPIO DIR and OUT values */
-	u16 direction; 		/* DIR setting. Leave to 0 if no init is needed */
+struct ivtv_gpio_init {		/* set initial GPIO DIR and OUT values */
+	u16 direction;		/* DIR setting. Leave to 0 if no init is needed */
 	u16 initial_value;
 };
 
-struct ivtv_gpio_video_input { 	/* select tuner/line in input */
-	u16 mask; 		/* leave to 0 if not supported */
+struct ivtv_gpio_video_input {	/* select tuner/line in input */
+	u16 mask;		/* leave to 0 if not supported */
 	u16 tuner;
 	u16 composite;
 	u16 svideo;
 };
 
-struct ivtv_gpio_audio_input { 	/* select tuner/line in input */
-	u16 mask; 		/* leave to 0 if not supported */
+struct ivtv_gpio_audio_input {	/* select tuner/line in input */
+	u16 mask;		/* leave to 0 if not supported */
 	u16 tuner;
 	u16 linein;
 	u16 radio;
 };
 
 struct ivtv_gpio_audio_mute {
-	u16 mask; 		/* leave to 0 if not supported */
+	u16 mask;		/* leave to 0 if not supported */
 	u16 mute;		/* set this value to mute, 0 to unmute */
 };
 
 struct ivtv_gpio_audio_mode {
-	u16 mask; 		/* leave to 0 if not supported */
-	u16 mono; 		/* set audio to mono */
-	u16 stereo; 		/* set audio to stereo */
+	u16 mask;		/* leave to 0 if not supported */
+	u16 mono;		/* set audio to mono */
+	u16 stereo;		/* set audio to stereo */
 	u16 lang1;		/* set audio to the first language */
 	u16 lang2;		/* set audio to the second language */
-	u16 both; 		/* both languages are output */
+	u16 both;		/* both languages are output */
 };
 
 struct ivtv_gpio_audio_freq {
-	u16 mask; 		/* leave to 0 if not supported */
+	u16 mask;		/* leave to 0 if not supported */
 	u16 f32000;
 	u16 f44100;
 	u16 f48000;
 };
 
 struct ivtv_gpio_audio_detect {
-	u16 mask; 		/* leave to 0 if not supported */
-	u16 stereo; 		/* if the input matches this value then
+	u16 mask;		/* leave to 0 if not supported */
+	u16 stereo;		/* if the input matches this value then
 				   stereo is detected */
 };
 
 struct ivtv_card_tuner {
-	v4l2_std_id std; 	/* standard for which the tuner is suitable */
-	int 	    tuner; 	/* tuner ID (from tuner.h) */
+	v4l2_std_id std;	/* standard for which the tuner is suitable */
+	int	    tuner;	/* tuner ID (from tuner.h) */
 };
 
 struct ivtv_card_tuner_i2c {
@@ -272,17 +272,17 @@ struct ivtv_card {
 	struct ivtv_card_audio_input radio_input;
 	int nof_outputs;
 	const struct ivtv_card_output *video_outputs;
-	u8 gr_config; 		/* config byte for the ghost reduction device */
-	u8 xceive_pin; 		/* XCeive tuner GPIO reset pin */
+	u8 gr_config;		/* config byte for the ghost reduction device */
+	u8 xceive_pin;		/* XCeive tuner GPIO reset pin */
 
 	/* GPIO card-specific settings */
-	struct ivtv_gpio_init 		gpio_init;
+	struct ivtv_gpio_init		gpio_init;
 	struct ivtv_gpio_video_input	gpio_video_input;
-	struct ivtv_gpio_audio_input 	gpio_audio_input;
-	struct ivtv_gpio_audio_mute 	gpio_audio_mute;
-	struct ivtv_gpio_audio_mode 	gpio_audio_mode;
-	struct ivtv_gpio_audio_freq 	gpio_audio_freq;
-	struct ivtv_gpio_audio_detect 	gpio_audio_detect;
+	struct ivtv_gpio_audio_input	gpio_audio_input;
+	struct ivtv_gpio_audio_mute	gpio_audio_mute;
+	struct ivtv_gpio_audio_mode	gpio_audio_mode;
+	struct ivtv_gpio_audio_freq	gpio_audio_freq;
+	struct ivtv_gpio_audio_detect	gpio_audio_detect;
 
 	struct ivtv_card_tuner tuners[IVTV_CARD_MAX_TUNERS];
 	struct ivtv_card_tuner_i2c *i2c;
diff --git a/drivers/media/pci/ivtv/ivtv-driver.h b/drivers/media/pci/ivtv/ivtv-driver.h
index d27c5c2c07ea..cafba6b1055d 100644
--- a/drivers/media/pci/ivtv/ivtv-driver.h
+++ b/drivers/media/pci/ivtv/ivtv-driver.h
@@ -76,7 +76,7 @@
 #define IVTV_ENCODER_SIZE	0x00800000	/* Total size is 0x01000000, but only first half is used */
 #define IVTV_DECODER_OFFSET	0x01000000
 #define IVTV_DECODER_SIZE	0x00800000	/* Total size is 0x01000000, but only first half is used */
-#define IVTV_REG_OFFSET 	0x02000000
+#define IVTV_REG_OFFSET		0x02000000
 #define IVTV_REG_SIZE		0x00010000
 
 /* Maximum ivtv driver instances. Some people have a huge number of
@@ -97,26 +97,26 @@
 #define IVTV_DMA_SG_OSD_ENT	(2883584/PAGE_SIZE)	/* sg entities */
 
 /* DMA Registers */
-#define IVTV_REG_DMAXFER 	(0x0000)
-#define IVTV_REG_DMASTATUS 	(0x0004)
-#define IVTV_REG_DECDMAADDR 	(0x0008)
-#define IVTV_REG_ENCDMAADDR 	(0x000c)
-#define IVTV_REG_DMACONTROL 	(0x0010)
-#define IVTV_REG_IRQSTATUS 	(0x0040)
-#define IVTV_REG_IRQMASK 	(0x0048)
+#define IVTV_REG_DMAXFER	(0x0000)
+#define IVTV_REG_DMASTATUS	(0x0004)
+#define IVTV_REG_DECDMAADDR	(0x0008)
+#define IVTV_REG_ENCDMAADDR	(0x000c)
+#define IVTV_REG_DMACONTROL	(0x0010)
+#define IVTV_REG_IRQSTATUS	(0x0040)
+#define IVTV_REG_IRQMASK	(0x0048)
 
 /* Setup Registers */
-#define IVTV_REG_ENC_SDRAM_REFRESH 	(0x07F8)
-#define IVTV_REG_ENC_SDRAM_PRECHARGE 	(0x07FC)
-#define IVTV_REG_DEC_SDRAM_REFRESH 	(0x08F8)
-#define IVTV_REG_DEC_SDRAM_PRECHARGE 	(0x08FC)
-#define IVTV_REG_VDM 			(0x2800)
-#define IVTV_REG_AO 			(0x2D00)
-#define IVTV_REG_BYTEFLUSH 		(0x2D24)
-#define IVTV_REG_SPU 			(0x9050)
-#define IVTV_REG_HW_BLOCKS 		(0x9054)
-#define IVTV_REG_VPU 			(0x9058)
-#define IVTV_REG_APU 			(0xA064)
+#define IVTV_REG_ENC_SDRAM_REFRESH	(0x07F8)
+#define IVTV_REG_ENC_SDRAM_PRECHARGE	(0x07FC)
+#define IVTV_REG_DEC_SDRAM_REFRESH	(0x08F8)
+#define IVTV_REG_DEC_SDRAM_PRECHARGE	(0x08FC)
+#define IVTV_REG_VDM			(0x2800)
+#define IVTV_REG_AO			(0x2D00)
+#define IVTV_REG_BYTEFLUSH		(0x2D24)
+#define IVTV_REG_SPU			(0x9050)
+#define IVTV_REG_HW_BLOCKS		(0x9054)
+#define IVTV_REG_VPU			(0x9058)
+#define IVTV_REG_APU			(0xA064)
 
 /* Other registers */
 #define IVTV_REG_DEC_LINE_FIELD		(0x28C0)
@@ -158,7 +158,7 @@ extern int ivtv_fw_debug;
 
 #define IVTV_DEBUG_HIGH_VOL(x, type, fmt, args...) \
 	do { \
-		if (((x) & ivtv_debug) && (ivtv_debug & IVTV_DBGFLG_HIGHVOL)) 	\
+		if (((x) & ivtv_debug) && (ivtv_debug & IVTV_DBGFLG_HIGHVOL))	\
 			v4l2_info(&itv->v4l2_dev, " " type ": " fmt , ##args);	\
 	} while (0)
 #define IVTV_DEBUG_HI_WARN(fmt, args...)  IVTV_DEBUG_HIGH_VOL(IVTV_DBGFLG_WARN,  "warn",  fmt , ## args)
@@ -226,9 +226,9 @@ struct ivtv_mailbox_data {
 /* per-stream, s_flags */
 #define IVTV_F_S_DMA_PENDING	0	/* this stream has pending DMA */
 #define IVTV_F_S_DMA_HAS_VBI	1       /* the current DMA request also requests VBI data */
-#define IVTV_F_S_NEEDS_DATA	2 	/* this decoding stream needs more data */
+#define IVTV_F_S_NEEDS_DATA	2	/* this decoding stream needs more data */
 
-#define IVTV_F_S_CLAIMED 	3	/* this stream is claimed */
+#define IVTV_F_S_CLAIMED	3	/* this stream is claimed */
 #define IVTV_F_S_STREAMING      4	/* the fw is decoding/encoding this stream */
 #define IVTV_F_S_INTERNAL_USE	5	/* this stream is used internally (sliced VBI processing) */
 #define IVTV_F_S_PASSTHROUGH	6	/* this stream is in passthrough mode */
@@ -239,35 +239,35 @@ struct ivtv_mailbox_data {
 #define IVTV_F_S_PIO_HAS_VBI	1       /* the current PIO request also requests VBI data */
 
 /* per-ivtv, i_flags */
-#define IVTV_F_I_DMA		   0 	/* DMA in progress */
-#define IVTV_F_I_UDMA		   1 	/* UDMA in progress */
-#define IVTV_F_I_UDMA_PENDING	   2 	/* UDMA pending */
-#define IVTV_F_I_SPEED_CHANGE	   3 	/* a speed change is in progress */
-#define IVTV_F_I_EOS		   4 	/* end of encoder stream reached */
-#define IVTV_F_I_RADIO_USER	   5 	/* the radio tuner is selected */
-#define IVTV_F_I_DIG_RST	   6 	/* reset digitizer */
-#define IVTV_F_I_DEC_YUV	   7 	/* YUV instead of MPG is being decoded */
-#define IVTV_F_I_UPDATE_CC	   9  	/* CC should be updated */
-#define IVTV_F_I_UPDATE_WSS	   10 	/* WSS should be updated */
-#define IVTV_F_I_UPDATE_VPS	   11 	/* VPS should be updated */
-#define IVTV_F_I_DECODING_YUV	   12 	/* this stream is YUV frame decoding */
-#define IVTV_F_I_ENC_PAUSED	   13 	/* the encoder is paused */
-#define IVTV_F_I_VALID_DEC_TIMINGS 14 	/* last_dec_timing is valid */
-#define IVTV_F_I_HAVE_WORK  	   15	/* used in the interrupt handler: there is work to be done */
+#define IVTV_F_I_DMA		   0	/* DMA in progress */
+#define IVTV_F_I_UDMA		   1	/* UDMA in progress */
+#define IVTV_F_I_UDMA_PENDING	   2	/* UDMA pending */
+#define IVTV_F_I_SPEED_CHANGE	   3	/* a speed change is in progress */
+#define IVTV_F_I_EOS		   4	/* end of encoder stream reached */
+#define IVTV_F_I_RADIO_USER	   5	/* the radio tuner is selected */
+#define IVTV_F_I_DIG_RST	   6	/* reset digitizer */
+#define IVTV_F_I_DEC_YUV	   7	/* YUV instead of MPG is being decoded */
+#define IVTV_F_I_UPDATE_CC	   9	/* CC should be updated */
+#define IVTV_F_I_UPDATE_WSS	   10	/* WSS should be updated */
+#define IVTV_F_I_UPDATE_VPS	   11	/* VPS should be updated */
+#define IVTV_F_I_DECODING_YUV	   12	/* this stream is YUV frame decoding */
+#define IVTV_F_I_ENC_PAUSED	   13	/* the encoder is paused */
+#define IVTV_F_I_VALID_DEC_TIMINGS 14	/* last_dec_timing is valid */
+#define IVTV_F_I_HAVE_WORK	   15	/* used in the interrupt handler: there is work to be done */
 #define IVTV_F_I_WORK_HANDLER_VBI  16	/* there is work to be done for VBI */
 #define IVTV_F_I_WORK_HANDLER_YUV  17	/* there is work to be done for YUV */
 #define IVTV_F_I_WORK_HANDLER_PIO  18	/* there is work to be done for PIO */
 #define IVTV_F_I_PIO		   19	/* PIO in progress */
-#define IVTV_F_I_DEC_PAUSED	   20 	/* the decoder is paused */
-#define IVTV_F_I_INITED		   21 	/* set after first open */
-#define IVTV_F_I_FAILED		   22 	/* set if first open failed */
+#define IVTV_F_I_DEC_PAUSED	   20	/* the decoder is paused */
+#define IVTV_F_I_INITED		   21	/* set after first open */
+#define IVTV_F_I_FAILED		   22	/* set if first open failed */
 #define IVTV_F_I_WORK_HANDLER_PCM  23	/* there is work to be done for PCM */
 
 /* Event notifications */
 #define IVTV_F_I_EV_DEC_STOPPED	   28	/* decoder stopped event */
-#define IVTV_F_I_EV_VSYNC	   29 	/* VSYNC event */
-#define IVTV_F_I_EV_VSYNC_FIELD    30 	/* VSYNC event field (0 = first, 1 = second field) */
-#define IVTV_F_I_EV_VSYNC_ENABLED  31 	/* VSYNC event enabled */
+#define IVTV_F_I_EV_VSYNC	   29	/* VSYNC event */
+#define IVTV_F_I_EV_VSYNC_FIELD    30	/* VSYNC event field (0 = first, 1 = second field) */
+#define IVTV_F_I_EV_VSYNC_ENABLED  31	/* VSYNC event enabled */
 
 /* Scatter-Gather array element, used in DMA transfers */
 struct ivtv_sg_element {
@@ -330,13 +330,13 @@ struct ivtv_stream {
 	/* These first four fields are always set, even if the stream
 	   is not actually created. */
 	struct video_device vdev;	/* vdev.v4l2_dev is NULL if there is no device */
-	struct ivtv *itv; 		/* for ease of use */
+	struct ivtv *itv;		/* for ease of use */
 	const char *name;		/* name of the stream */
 	int type;			/* stream type */
 	u32 caps;			/* V4L2 capabilities */
 
 	struct v4l2_fh *fh;		/* pointer to the streaming filehandle */
-	spinlock_t qlock; 		/* locks access to the queues */
+	spinlock_t qlock;		/* locks access to the queues */
 	unsigned long s_flags;		/* status flags, see above */
 	int dma;			/* can be PCI_DMA_TODEVICE, PCI_DMA_FROMDEVICE or PCI_DMA_NONE */
 	u32 pending_offset;
@@ -564,7 +564,7 @@ struct vbi_info {
 
 	/* Raw VBI compatibility hack */
 
-	u32 frame; 				/* frame counter hack needed for backwards compatibility
+	u32 frame;				/* frame counter hack needed for backwards compatibility
 						   of old VBI software */
 
 	/* Sliced VBI output data */
@@ -620,7 +620,7 @@ struct ivtv {
 	u8 nof_inputs;			/* number of video inputs */
 	u8 nof_audio_inputs;		/* number of audio inputs */
 	u32 v4l2_cap;			/* V4L2 capabilities of card */
-	u32 hw_flags; 			/* hardware description of the board */
+	u32 hw_flags;			/* hardware description of the board */
 	v4l2_std_id tuner_std;		/* the norm of the card's tuner (fixed) */
 	struct v4l2_subdev *sd_video;	/* controlling video decoder subdev */
 	struct v4l2_subdev *sd_audio;	/* controlling audio subdev */
@@ -629,7 +629,7 @@ struct ivtv {
 	volatile void __iomem *enc_mem; /* pointer to mapped encoder memory */
 	volatile void __iomem *dec_mem; /* pointer to mapped decoder memory */
 	volatile void __iomem *reg_mem; /* pointer to mapped registers */
-	struct ivtv_options options; 	/* user options */
+	struct ivtv_options options;	/* user options */
 
 	struct v4l2_device v4l2_dev;
 	struct cx2341x_handler cxhdl;
@@ -668,7 +668,7 @@ struct ivtv {
 
 	/* Streams */
 	int stream_buf_size[IVTV_MAX_STREAMS];          /* stream buffer size */
-	struct ivtv_stream streams[IVTV_MAX_STREAMS]; 	/* stream data */
+	struct ivtv_stream streams[IVTV_MAX_STREAMS];	/* stream data */
 	atomic_t capturing;		/* count number of active capture streams */
 	atomic_t decoding;		/* count number of active decoding streams */
 
@@ -704,7 +704,7 @@ struct ivtv {
 	/* Mailbox */
 	struct ivtv_mailbox_data enc_mbox;              /* encoder mailboxes */
 	struct ivtv_mailbox_data dec_mbox;              /* decoder mailboxes */
-	struct ivtv_api_cache api_cache[256]; 		/* cached API commands */
+	struct ivtv_api_cache api_cache[256];		/* cached API commands */
 
 
 	/* I2C */
@@ -828,7 +828,7 @@ static inline int ivtv_raw_vbi(const struct ivtv *itv)
 
 /* Call the specified callback for all subdevs matching hw (if 0, then
    match them all). Ignore any errors. */
-#define ivtv_call_hw(itv, hw, o, f, args...) 				\
+#define ivtv_call_hw(itv, hw, o, f, args...)				\
 	v4l2_device_mask_call_all(&(itv)->v4l2_dev, hw, o, f, ##args)
 
 #define ivtv_call_all(itv, o, f, args...) ivtv_call_hw(itv, 0, o, f , ##args)
diff --git a/drivers/media/pci/ivtv/ivtv-firmware.c b/drivers/media/pci/ivtv/ivtv-firmware.c
index ba279fdb3df8..9f05472fca20 100644
--- a/drivers/media/pci/ivtv/ivtv-firmware.c
+++ b/drivers/media/pci/ivtv/ivtv-firmware.c
@@ -28,26 +28,26 @@
 #include <linux/firmware.h>
 #include <media/i2c/saa7127.h>
 
-#define IVTV_MASK_SPU_ENABLE 		0xFFFFFFFE
-#define IVTV_MASK_VPU_ENABLE15 		0xFFFFFFF6
-#define IVTV_MASK_VPU_ENABLE16 		0xFFFFFFFB
-#define IVTV_CMD_VDM_STOP 		0x00000000
-#define IVTV_CMD_AO_STOP 		0x00000005
-#define IVTV_CMD_APU_PING 		0x00000000
-#define IVTV_CMD_VPU_STOP15 		0xFFFFFFFE
-#define IVTV_CMD_VPU_STOP16 		0xFFFFFFEE
-#define IVTV_CMD_HW_BLOCKS_RST 		0xFFFFFFFF
-#define IVTV_CMD_SPU_STOP 		0x00000001
-#define IVTV_CMD_SDRAM_PRECHARGE_INIT 	0x0000001A
-#define IVTV_CMD_SDRAM_REFRESH_INIT 	0x80000640
-#define IVTV_SDRAM_SLEEPTIME 		600
-
-#define IVTV_DECODE_INIT_MPEG_FILENAME 	"v4l-cx2341x-init.mpg"
-#define IVTV_DECODE_INIT_MPEG_SIZE 	(152*1024)
+#define IVTV_MASK_SPU_ENABLE		0xFFFFFFFE
+#define IVTV_MASK_VPU_ENABLE15		0xFFFFFFF6
+#define IVTV_MASK_VPU_ENABLE16		0xFFFFFFFB
+#define IVTV_CMD_VDM_STOP		0x00000000
+#define IVTV_CMD_AO_STOP		0x00000005
+#define IVTV_CMD_APU_PING		0x00000000
+#define IVTV_CMD_VPU_STOP15		0xFFFFFFFE
+#define IVTV_CMD_VPU_STOP16		0xFFFFFFEE
+#define IVTV_CMD_HW_BLOCKS_RST		0xFFFFFFFF
+#define IVTV_CMD_SPU_STOP		0x00000001
+#define IVTV_CMD_SDRAM_PRECHARGE_INIT	0x0000001A
+#define IVTV_CMD_SDRAM_REFRESH_INIT	0x80000640
+#define IVTV_SDRAM_SLEEPTIME		600
+
+#define IVTV_DECODE_INIT_MPEG_FILENAME	"v4l-cx2341x-init.mpg"
+#define IVTV_DECODE_INIT_MPEG_SIZE	(152*1024)
 
 /* Encoder/decoder firmware sizes */
-#define IVTV_FW_ENC_SIZE 		(376836)
-#define IVTV_FW_DEC_SIZE 		(256*1024)
+#define IVTV_FW_ENC_SIZE		(376836)
+#define IVTV_FW_DEC_SIZE		(256*1024)
 
 static int load_fw_direct(const char *fn, volatile u8 __iomem *mem, struct ivtv *itv, long size)
 {
diff --git a/drivers/media/pci/ivtv/ivtv-i2c.c b/drivers/media/pci/ivtv/ivtv-i2c.c
index 66696e6ee587..522cd111e399 100644
--- a/drivers/media/pci/ivtv/ivtv-i2c.c
+++ b/drivers/media/pci/ivtv/ivtv-i2c.c
@@ -76,22 +76,22 @@
 
 #define IVTV_CS53L32A_I2C_ADDR		0x11
 #define IVTV_M52790_I2C_ADDR		0x48
-#define IVTV_CX25840_I2C_ADDR 		0x44
-#define IVTV_SAA7115_I2C_ADDR 		0x21
-#define IVTV_SAA7127_I2C_ADDR 		0x44
-#define IVTV_SAA717x_I2C_ADDR 		0x21
-#define IVTV_MSP3400_I2C_ADDR 		0x40
-#define IVTV_HAUPPAUGE_I2C_ADDR 	0x50
-#define IVTV_WM8739_I2C_ADDR 		0x1a
+#define IVTV_CX25840_I2C_ADDR		0x44
+#define IVTV_SAA7115_I2C_ADDR		0x21
+#define IVTV_SAA7127_I2C_ADDR		0x44
+#define IVTV_SAA717x_I2C_ADDR		0x21
+#define IVTV_MSP3400_I2C_ADDR		0x40
+#define IVTV_HAUPPAUGE_I2C_ADDR		0x50
+#define IVTV_WM8739_I2C_ADDR		0x1a
 #define IVTV_WM8775_I2C_ADDR		0x1b
 #define IVTV_TEA5767_I2C_ADDR		0x60
-#define IVTV_UPD64031A_I2C_ADDR 	0x12
-#define IVTV_UPD64083_I2C_ADDR 		0x5c
-#define IVTV_VP27SMPX_I2C_ADDR      	0x5b
-#define IVTV_M52790_I2C_ADDR      	0x48
+#define IVTV_UPD64031A_I2C_ADDR		0x12
+#define IVTV_UPD64083_I2C_ADDR		0x5c
+#define IVTV_VP27SMPX_I2C_ADDR		0x5b
+#define IVTV_M52790_I2C_ADDR		0x48
 #define IVTV_AVERMEDIA_IR_RX_I2C_ADDR	0x40
-#define IVTV_HAUP_EXT_IR_RX_I2C_ADDR 	0x1a
-#define IVTV_HAUP_INT_IR_RX_I2C_ADDR 	0x18
+#define IVTV_HAUP_EXT_IR_RX_I2C_ADDR	0x1a
+#define IVTV_HAUP_INT_IR_RX_I2C_ADDR	0x18
 #define IVTV_Z8F0811_IR_TX_I2C_ADDR	0x70
 #define IVTV_Z8F0811_IR_RX_I2C_ADDR	0x71
 #define IVTV_ADAPTEC_IR_ADDR		0x6b
diff --git a/drivers/media/pci/ivtv/ivtv-ioctl.c b/drivers/media/pci/ivtv/ivtv-ioctl.c
index 670462d195b5..4cdc6d2be85d 100644
--- a/drivers/media/pci/ivtv/ivtv-ioctl.c
+++ b/drivers/media/pci/ivtv/ivtv-ioctl.c
@@ -1884,65 +1884,65 @@ static long ivtv_default(struct file *file, void *fh, bool valid_prio,
 }
 
 static const struct v4l2_ioctl_ops ivtv_ioctl_ops = {
-	.vidioc_querycap    		    = ivtv_querycap,
-	.vidioc_s_audio     		    = ivtv_s_audio,
-	.vidioc_g_audio     		    = ivtv_g_audio,
-	.vidioc_enumaudio   		    = ivtv_enumaudio,
-	.vidioc_s_audout     		    = ivtv_s_audout,
-	.vidioc_g_audout     		    = ivtv_g_audout,
-	.vidioc_enum_input   		    = ivtv_enum_input,
-	.vidioc_enum_output   		    = ivtv_enum_output,
-	.vidioc_enumaudout   		    = ivtv_enumaudout,
-	.vidioc_cropcap       		    = ivtv_cropcap,
+	.vidioc_querycap		    = ivtv_querycap,
+	.vidioc_s_audio			    = ivtv_s_audio,
+	.vidioc_g_audio			    = ivtv_g_audio,
+	.vidioc_enumaudio		    = ivtv_enumaudio,
+	.vidioc_s_audout		    = ivtv_s_audout,
+	.vidioc_g_audout		    = ivtv_g_audout,
+	.vidioc_enum_input		    = ivtv_enum_input,
+	.vidioc_enum_output		    = ivtv_enum_output,
+	.vidioc_enumaudout		    = ivtv_enumaudout,
+	.vidioc_cropcap			    = ivtv_cropcap,
 	.vidioc_s_selection		    = ivtv_s_selection,
 	.vidioc_g_selection		    = ivtv_g_selection,
-	.vidioc_g_input      		    = ivtv_g_input,
-	.vidioc_s_input      		    = ivtv_s_input,
-	.vidioc_g_output     		    = ivtv_g_output,
-	.vidioc_s_output     		    = ivtv_s_output,
-	.vidioc_g_frequency 		    = ivtv_g_frequency,
-	.vidioc_s_frequency  		    = ivtv_s_frequency,
-	.vidioc_s_tuner      		    = ivtv_s_tuner,
-	.vidioc_g_tuner      		    = ivtv_g_tuner,
-	.vidioc_g_enc_index 		    = ivtv_g_enc_index,
+	.vidioc_g_input			    = ivtv_g_input,
+	.vidioc_s_input			    = ivtv_s_input,
+	.vidioc_g_output		    = ivtv_g_output,
+	.vidioc_s_output		    = ivtv_s_output,
+	.vidioc_g_frequency		    = ivtv_g_frequency,
+	.vidioc_s_frequency		    = ivtv_s_frequency,
+	.vidioc_s_tuner			    = ivtv_s_tuner,
+	.vidioc_g_tuner			    = ivtv_g_tuner,
+	.vidioc_g_enc_index		    = ivtv_g_enc_index,
 	.vidioc_g_fbuf			    = ivtv_g_fbuf,
 	.vidioc_s_fbuf			    = ivtv_s_fbuf,
-	.vidioc_g_std 			    = ivtv_g_std,
-	.vidioc_s_std 			    = ivtv_s_std,
+	.vidioc_g_std			    = ivtv_g_std,
+	.vidioc_s_std			    = ivtv_s_std,
 	.vidioc_overlay			    = ivtv_overlay,
 	.vidioc_log_status		    = ivtv_log_status,
-	.vidioc_enum_fmt_vid_cap 	    = ivtv_enum_fmt_vid_cap,
-	.vidioc_encoder_cmd  		    = ivtv_encoder_cmd,
-	.vidioc_try_encoder_cmd 	    = ivtv_try_encoder_cmd,
+	.vidioc_enum_fmt_vid_cap	    = ivtv_enum_fmt_vid_cap,
+	.vidioc_encoder_cmd		    = ivtv_encoder_cmd,
+	.vidioc_try_encoder_cmd		    = ivtv_try_encoder_cmd,
 	.vidioc_decoder_cmd		    = ivtv_decoder_cmd,
 	.vidioc_try_decoder_cmd		    = ivtv_try_decoder_cmd,
-	.vidioc_enum_fmt_vid_out 	    = ivtv_enum_fmt_vid_out,
-	.vidioc_g_fmt_vid_cap 		    = ivtv_g_fmt_vid_cap,
+	.vidioc_enum_fmt_vid_out	    = ivtv_enum_fmt_vid_out,
+	.vidioc_g_fmt_vid_cap		    = ivtv_g_fmt_vid_cap,
 	.vidioc_g_fmt_vbi_cap		    = ivtv_g_fmt_vbi_cap,
 	.vidioc_g_fmt_sliced_vbi_cap        = ivtv_g_fmt_sliced_vbi_cap,
 	.vidioc_g_fmt_vid_out               = ivtv_g_fmt_vid_out,
 	.vidioc_g_fmt_vid_out_overlay       = ivtv_g_fmt_vid_out_overlay,
 	.vidioc_g_fmt_sliced_vbi_out        = ivtv_g_fmt_sliced_vbi_out,
-	.vidioc_s_fmt_vid_cap  		    = ivtv_s_fmt_vid_cap,
-	.vidioc_s_fmt_vbi_cap 		    = ivtv_s_fmt_vbi_cap,
+	.vidioc_s_fmt_vid_cap		    = ivtv_s_fmt_vid_cap,
+	.vidioc_s_fmt_vbi_cap		    = ivtv_s_fmt_vbi_cap,
 	.vidioc_s_fmt_sliced_vbi_cap        = ivtv_s_fmt_sliced_vbi_cap,
 	.vidioc_s_fmt_vid_out               = ivtv_s_fmt_vid_out,
 	.vidioc_s_fmt_vid_out_overlay       = ivtv_s_fmt_vid_out_overlay,
 	.vidioc_s_fmt_sliced_vbi_out        = ivtv_s_fmt_sliced_vbi_out,
-	.vidioc_try_fmt_vid_cap  	    = ivtv_try_fmt_vid_cap,
+	.vidioc_try_fmt_vid_cap		    = ivtv_try_fmt_vid_cap,
 	.vidioc_try_fmt_vbi_cap		    = ivtv_try_fmt_vbi_cap,
 	.vidioc_try_fmt_sliced_vbi_cap      = ivtv_try_fmt_sliced_vbi_cap,
-	.vidioc_try_fmt_vid_out 	    = ivtv_try_fmt_vid_out,
+	.vidioc_try_fmt_vid_out		    = ivtv_try_fmt_vid_out,
 	.vidioc_try_fmt_vid_out_overlay     = ivtv_try_fmt_vid_out_overlay,
-	.vidioc_try_fmt_sliced_vbi_out 	    = ivtv_try_fmt_sliced_vbi_out,
-	.vidioc_g_sliced_vbi_cap 	    = ivtv_g_sliced_vbi_cap,
+	.vidioc_try_fmt_sliced_vbi_out	    = ivtv_try_fmt_sliced_vbi_out,
+	.vidioc_g_sliced_vbi_cap	    = ivtv_g_sliced_vbi_cap,
 #ifdef CONFIG_VIDEO_ADV_DEBUG
-	.vidioc_g_register 		    = ivtv_g_register,
-	.vidioc_s_register 		    = ivtv_s_register,
+	.vidioc_g_register		    = ivtv_g_register,
+	.vidioc_s_register		    = ivtv_s_register,
 #endif
-	.vidioc_default 		    = ivtv_default,
-	.vidioc_subscribe_event 	    = ivtv_subscribe_event,
-	.vidioc_unsubscribe_event 	    = v4l2_event_unsubscribe,
+	.vidioc_default			    = ivtv_default,
+	.vidioc_subscribe_event		    = ivtv_subscribe_event,
+	.vidioc_unsubscribe_event	    = v4l2_event_unsubscribe,
 };
 
 void ivtv_set_funcs(struct video_device *vdev)
diff --git a/drivers/media/pci/ivtv/ivtv-mailbox.c b/drivers/media/pci/ivtv/ivtv-mailbox.c
index 9a2506a5edbe..f317c8f0938d 100644
--- a/drivers/media/pci/ivtv/ivtv-mailbox.c
+++ b/drivers/media/pci/ivtv/ivtv-mailbox.c
@@ -28,118 +28,118 @@
 #define IVTV_MBOX_FIRMWARE_DONE 0x00000004
 #define IVTV_MBOX_DRIVER_DONE   0x00000002
 #define IVTV_MBOX_DRIVER_BUSY   0x00000001
-#define IVTV_MBOX_FREE 		0x00000000
+#define IVTV_MBOX_FREE		0x00000000
 
 /* Firmware mailbox standard timeout */
-#define IVTV_API_STD_TIMEOUT 	0x02000000
+#define IVTV_API_STD_TIMEOUT	0x02000000
 
-#define API_CACHE 	 (1 << 0) 	/* Allow the command to be stored in the cache */
-#define API_RESULT	 (1 << 1) 	/* Allow 1 second for this cmd to end */
+#define API_CACHE	 (1 << 0)	/* Allow the command to be stored in the cache */
+#define API_RESULT	 (1 << 1)	/* Allow 1 second for this cmd to end */
 #define API_FAST_RESULT	 (3 << 1)	/* Allow 0.1 second for this cmd to end */
-#define API_DMA 	 (1 << 3)	/* DMA mailbox, has special handling */
-#define API_HIGH_VOL 	 (1 << 5)	/* High volume command (i.e. called during encoding or decoding) */
-#define API_NO_WAIT_MB 	 (1 << 4)	/* Command may not wait for a free mailbox */
+#define API_DMA		 (1 << 3)	/* DMA mailbox, has special handling */
+#define API_HIGH_VOL	 (1 << 5)	/* High volume command (i.e. called during encoding or decoding) */
+#define API_NO_WAIT_MB	 (1 << 4)	/* Command may not wait for a free mailbox */
 #define API_NO_WAIT_RES	 (1 << 5)	/* Command may not wait for the result */
 #define API_NO_POLL	 (1 << 6)	/* Avoid pointless polling */
 
 struct ivtv_api_info {
 	int flags;		/* Flags, see above */
-	const char *name; 	/* The name of the command */
+	const char *name;	/* The name of the command */
 };
 
 #define API_ENTRY(x, f) [x] = { (f), #x }
 
 static const struct ivtv_api_info api_info[256] = {
 	/* MPEG encoder API */
-	API_ENTRY(CX2341X_ENC_PING_FW, 			API_FAST_RESULT),
-	API_ENTRY(CX2341X_ENC_START_CAPTURE, 		API_RESULT | API_NO_POLL),
-	API_ENTRY(CX2341X_ENC_STOP_CAPTURE, 		API_RESULT),
-	API_ENTRY(CX2341X_ENC_SET_AUDIO_ID, 		API_CACHE),
-	API_ENTRY(CX2341X_ENC_SET_VIDEO_ID, 		API_CACHE),
-	API_ENTRY(CX2341X_ENC_SET_PCR_ID, 		API_CACHE),
-	API_ENTRY(CX2341X_ENC_SET_FRAME_RATE, 		API_CACHE),
-	API_ENTRY(CX2341X_ENC_SET_FRAME_SIZE, 		API_CACHE),
-	API_ENTRY(CX2341X_ENC_SET_BIT_RATE, 		API_CACHE),
-	API_ENTRY(CX2341X_ENC_SET_GOP_PROPERTIES, 	API_CACHE),
-	API_ENTRY(CX2341X_ENC_SET_ASPECT_RATIO, 	API_CACHE),
-	API_ENTRY(CX2341X_ENC_SET_DNR_FILTER_MODE, 	API_CACHE),
-	API_ENTRY(CX2341X_ENC_SET_DNR_FILTER_PROPS, 	API_CACHE),
-	API_ENTRY(CX2341X_ENC_SET_CORING_LEVELS, 	API_CACHE),
-	API_ENTRY(CX2341X_ENC_SET_SPATIAL_FILTER_TYPE, 	API_CACHE),
-	API_ENTRY(CX2341X_ENC_SET_VBI_LINE, 		API_RESULT),
-	API_ENTRY(CX2341X_ENC_SET_STREAM_TYPE, 		API_CACHE),
-	API_ENTRY(CX2341X_ENC_SET_OUTPUT_PORT, 		API_CACHE),
-	API_ENTRY(CX2341X_ENC_SET_AUDIO_PROPERTIES, 	API_CACHE),
-	API_ENTRY(CX2341X_ENC_HALT_FW, 			API_FAST_RESULT),
-	API_ENTRY(CX2341X_ENC_GET_VERSION, 		API_FAST_RESULT),
-	API_ENTRY(CX2341X_ENC_SET_GOP_CLOSURE, 		API_CACHE),
-	API_ENTRY(CX2341X_ENC_GET_SEQ_END, 		API_RESULT),
-	API_ENTRY(CX2341X_ENC_SET_PGM_INDEX_INFO, 	API_FAST_RESULT),
-	API_ENTRY(CX2341X_ENC_SET_VBI_CONFIG, 		API_RESULT),
-	API_ENTRY(CX2341X_ENC_SET_DMA_BLOCK_SIZE, 	API_CACHE),
-	API_ENTRY(CX2341X_ENC_GET_PREV_DMA_INFO_MB_10, 	API_FAST_RESULT),
-	API_ENTRY(CX2341X_ENC_GET_PREV_DMA_INFO_MB_9, 	API_FAST_RESULT),
-	API_ENTRY(CX2341X_ENC_SCHED_DMA_TO_HOST, 	API_DMA | API_HIGH_VOL),
-	API_ENTRY(CX2341X_ENC_INITIALIZE_INPUT, 	API_RESULT),
-	API_ENTRY(CX2341X_ENC_SET_FRAME_DROP_RATE, 	API_CACHE),
-	API_ENTRY(CX2341X_ENC_PAUSE_ENCODER, 		API_RESULT),
-	API_ENTRY(CX2341X_ENC_REFRESH_INPUT, 		API_NO_WAIT_MB | API_HIGH_VOL),
-	API_ENTRY(CX2341X_ENC_SET_COPYRIGHT, 		API_CACHE),
-	API_ENTRY(CX2341X_ENC_SET_EVENT_NOTIFICATION, 	API_RESULT),
-	API_ENTRY(CX2341X_ENC_SET_NUM_VSYNC_LINES, 	API_CACHE),
-	API_ENTRY(CX2341X_ENC_SET_PLACEHOLDER, 		API_CACHE),
-	API_ENTRY(CX2341X_ENC_MUTE_VIDEO, 		API_RESULT),
-	API_ENTRY(CX2341X_ENC_MUTE_AUDIO, 		API_RESULT),
+	API_ENTRY(CX2341X_ENC_PING_FW,			API_FAST_RESULT),
+	API_ENTRY(CX2341X_ENC_START_CAPTURE,		API_RESULT | API_NO_POLL),
+	API_ENTRY(CX2341X_ENC_STOP_CAPTURE,		API_RESULT),
+	API_ENTRY(CX2341X_ENC_SET_AUDIO_ID,		API_CACHE),
+	API_ENTRY(CX2341X_ENC_SET_VIDEO_ID,		API_CACHE),
+	API_ENTRY(CX2341X_ENC_SET_PCR_ID,		API_CACHE),
+	API_ENTRY(CX2341X_ENC_SET_FRAME_RATE,		API_CACHE),
+	API_ENTRY(CX2341X_ENC_SET_FRAME_SIZE,		API_CACHE),
+	API_ENTRY(CX2341X_ENC_SET_BIT_RATE,		API_CACHE),
+	API_ENTRY(CX2341X_ENC_SET_GOP_PROPERTIES,	API_CACHE),
+	API_ENTRY(CX2341X_ENC_SET_ASPECT_RATIO,		API_CACHE),
+	API_ENTRY(CX2341X_ENC_SET_DNR_FILTER_MODE,	API_CACHE),
+	API_ENTRY(CX2341X_ENC_SET_DNR_FILTER_PROPS,	API_CACHE),
+	API_ENTRY(CX2341X_ENC_SET_CORING_LEVELS,	API_CACHE),
+	API_ENTRY(CX2341X_ENC_SET_SPATIAL_FILTER_TYPE,	API_CACHE),
+	API_ENTRY(CX2341X_ENC_SET_VBI_LINE,		API_RESULT),
+	API_ENTRY(CX2341X_ENC_SET_STREAM_TYPE,		API_CACHE),
+	API_ENTRY(CX2341X_ENC_SET_OUTPUT_PORT,		API_CACHE),
+	API_ENTRY(CX2341X_ENC_SET_AUDIO_PROPERTIES,	API_CACHE),
+	API_ENTRY(CX2341X_ENC_HALT_FW,			API_FAST_RESULT),
+	API_ENTRY(CX2341X_ENC_GET_VERSION,		API_FAST_RESULT),
+	API_ENTRY(CX2341X_ENC_SET_GOP_CLOSURE,		API_CACHE),
+	API_ENTRY(CX2341X_ENC_GET_SEQ_END,		API_RESULT),
+	API_ENTRY(CX2341X_ENC_SET_PGM_INDEX_INFO,	API_FAST_RESULT),
+	API_ENTRY(CX2341X_ENC_SET_VBI_CONFIG,		API_RESULT),
+	API_ENTRY(CX2341X_ENC_SET_DMA_BLOCK_SIZE,	API_CACHE),
+	API_ENTRY(CX2341X_ENC_GET_PREV_DMA_INFO_MB_10,	API_FAST_RESULT),
+	API_ENTRY(CX2341X_ENC_GET_PREV_DMA_INFO_MB_9,	API_FAST_RESULT),
+	API_ENTRY(CX2341X_ENC_SCHED_DMA_TO_HOST,	API_DMA | API_HIGH_VOL),
+	API_ENTRY(CX2341X_ENC_INITIALIZE_INPUT,		API_RESULT),
+	API_ENTRY(CX2341X_ENC_SET_FRAME_DROP_RATE,	API_CACHE),
+	API_ENTRY(CX2341X_ENC_PAUSE_ENCODER,		API_RESULT),
+	API_ENTRY(CX2341X_ENC_REFRESH_INPUT,		API_NO_WAIT_MB | API_HIGH_VOL),
+	API_ENTRY(CX2341X_ENC_SET_COPYRIGHT,		API_CACHE),
+	API_ENTRY(CX2341X_ENC_SET_EVENT_NOTIFICATION,	API_RESULT),
+	API_ENTRY(CX2341X_ENC_SET_NUM_VSYNC_LINES,	API_CACHE),
+	API_ENTRY(CX2341X_ENC_SET_PLACEHOLDER,		API_CACHE),
+	API_ENTRY(CX2341X_ENC_MUTE_VIDEO,		API_RESULT),
+	API_ENTRY(CX2341X_ENC_MUTE_AUDIO,		API_RESULT),
 	API_ENTRY(CX2341X_ENC_SET_VERT_CROP_LINE,	API_FAST_RESULT),
-	API_ENTRY(CX2341X_ENC_MISC, 			API_FAST_RESULT),
+	API_ENTRY(CX2341X_ENC_MISC,			API_FAST_RESULT),
 	/* Obsolete PULLDOWN API command */
-	API_ENTRY(0xb1, 				API_CACHE),
+	API_ENTRY(0xb1,					API_CACHE),
 
 	/* MPEG decoder API */
-	API_ENTRY(CX2341X_DEC_PING_FW, 			API_FAST_RESULT),
-	API_ENTRY(CX2341X_DEC_START_PLAYBACK, 		API_RESULT | API_NO_POLL),
-	API_ENTRY(CX2341X_DEC_STOP_PLAYBACK, 		API_RESULT),
-	API_ENTRY(CX2341X_DEC_SET_PLAYBACK_SPEED, 	API_RESULT),
-	API_ENTRY(CX2341X_DEC_STEP_VIDEO, 		API_RESULT),
-	API_ENTRY(CX2341X_DEC_SET_DMA_BLOCK_SIZE, 	API_CACHE),
-	API_ENTRY(CX2341X_DEC_GET_XFER_INFO, 		API_FAST_RESULT),
-	API_ENTRY(CX2341X_DEC_GET_DMA_STATUS, 		API_FAST_RESULT),
-	API_ENTRY(CX2341X_DEC_SCHED_DMA_FROM_HOST, 	API_DMA | API_HIGH_VOL),
-	API_ENTRY(CX2341X_DEC_PAUSE_PLAYBACK, 		API_RESULT),
-	API_ENTRY(CX2341X_DEC_HALT_FW, 			API_FAST_RESULT),
-	API_ENTRY(CX2341X_DEC_SET_STANDARD, 		API_CACHE),
-	API_ENTRY(CX2341X_DEC_GET_VERSION, 		API_FAST_RESULT),
-	API_ENTRY(CX2341X_DEC_SET_STREAM_INPUT, 	API_CACHE),
-	API_ENTRY(CX2341X_DEC_GET_TIMING_INFO, 		API_RESULT /*| API_NO_WAIT_RES*/),
-	API_ENTRY(CX2341X_DEC_SET_AUDIO_MODE, 		API_CACHE),
-	API_ENTRY(CX2341X_DEC_SET_EVENT_NOTIFICATION, 	API_RESULT),
-	API_ENTRY(CX2341X_DEC_SET_DISPLAY_BUFFERS, 	API_CACHE),
-	API_ENTRY(CX2341X_DEC_EXTRACT_VBI, 		API_RESULT),
-	API_ENTRY(CX2341X_DEC_SET_DECODER_SOURCE, 	API_FAST_RESULT),
-	API_ENTRY(CX2341X_DEC_SET_PREBUFFERING, 	API_CACHE),
+	API_ENTRY(CX2341X_DEC_PING_FW,			API_FAST_RESULT),
+	API_ENTRY(CX2341X_DEC_START_PLAYBACK,		API_RESULT | API_NO_POLL),
+	API_ENTRY(CX2341X_DEC_STOP_PLAYBACK,		API_RESULT),
+	API_ENTRY(CX2341X_DEC_SET_PLAYBACK_SPEED,	API_RESULT),
+	API_ENTRY(CX2341X_DEC_STEP_VIDEO,		API_RESULT),
+	API_ENTRY(CX2341X_DEC_SET_DMA_BLOCK_SIZE,	API_CACHE),
+	API_ENTRY(CX2341X_DEC_GET_XFER_INFO,		API_FAST_RESULT),
+	API_ENTRY(CX2341X_DEC_GET_DMA_STATUS,		API_FAST_RESULT),
+	API_ENTRY(CX2341X_DEC_SCHED_DMA_FROM_HOST,	API_DMA | API_HIGH_VOL),
+	API_ENTRY(CX2341X_DEC_PAUSE_PLAYBACK,		API_RESULT),
+	API_ENTRY(CX2341X_DEC_HALT_FW,			API_FAST_RESULT),
+	API_ENTRY(CX2341X_DEC_SET_STANDARD,		API_CACHE),
+	API_ENTRY(CX2341X_DEC_GET_VERSION,		API_FAST_RESULT),
+	API_ENTRY(CX2341X_DEC_SET_STREAM_INPUT,		API_CACHE),
+	API_ENTRY(CX2341X_DEC_GET_TIMING_INFO,		API_RESULT /*| API_NO_WAIT_RES*/),
+	API_ENTRY(CX2341X_DEC_SET_AUDIO_MODE,		API_CACHE),
+	API_ENTRY(CX2341X_DEC_SET_EVENT_NOTIFICATION,	API_RESULT),
+	API_ENTRY(CX2341X_DEC_SET_DISPLAY_BUFFERS,	API_CACHE),
+	API_ENTRY(CX2341X_DEC_EXTRACT_VBI,		API_RESULT),
+	API_ENTRY(CX2341X_DEC_SET_DECODER_SOURCE,	API_FAST_RESULT),
+	API_ENTRY(CX2341X_DEC_SET_PREBUFFERING,		API_CACHE),
 
 	/* OSD API */
-	API_ENTRY(CX2341X_OSD_GET_FRAMEBUFFER, 		API_FAST_RESULT),
-	API_ENTRY(CX2341X_OSD_GET_PIXEL_FORMAT, 	API_FAST_RESULT),
-	API_ENTRY(CX2341X_OSD_SET_PIXEL_FORMAT, 	API_CACHE),
-	API_ENTRY(CX2341X_OSD_GET_STATE, 		API_FAST_RESULT),
-	API_ENTRY(CX2341X_OSD_SET_STATE, 		API_CACHE),
-	API_ENTRY(CX2341X_OSD_GET_OSD_COORDS, 		API_FAST_RESULT),
-	API_ENTRY(CX2341X_OSD_SET_OSD_COORDS, 		API_CACHE),
-	API_ENTRY(CX2341X_OSD_GET_SCREEN_COORDS, 	API_FAST_RESULT),
-	API_ENTRY(CX2341X_OSD_SET_SCREEN_COORDS, 	API_CACHE),
-	API_ENTRY(CX2341X_OSD_GET_GLOBAL_ALPHA, 	API_FAST_RESULT),
-	API_ENTRY(CX2341X_OSD_SET_GLOBAL_ALPHA, 	API_CACHE),
-	API_ENTRY(CX2341X_OSD_SET_BLEND_COORDS, 	API_CACHE),
-	API_ENTRY(CX2341X_OSD_GET_FLICKER_STATE, 	API_FAST_RESULT),
-	API_ENTRY(CX2341X_OSD_SET_FLICKER_STATE, 	API_CACHE),
-	API_ENTRY(CX2341X_OSD_BLT_COPY, 		API_RESULT),
-	API_ENTRY(CX2341X_OSD_BLT_FILL, 		API_RESULT),
-	API_ENTRY(CX2341X_OSD_BLT_TEXT, 		API_RESULT),
-	API_ENTRY(CX2341X_OSD_SET_FRAMEBUFFER_WINDOW, 	API_CACHE),
-	API_ENTRY(CX2341X_OSD_SET_CHROMA_KEY, 		API_CACHE),
-	API_ENTRY(CX2341X_OSD_GET_ALPHA_CONTENT_INDEX, 	API_FAST_RESULT),
-	API_ENTRY(CX2341X_OSD_SET_ALPHA_CONTENT_INDEX, 	API_CACHE)
+	API_ENTRY(CX2341X_OSD_GET_FRAMEBUFFER,		API_FAST_RESULT),
+	API_ENTRY(CX2341X_OSD_GET_PIXEL_FORMAT,		API_FAST_RESULT),
+	API_ENTRY(CX2341X_OSD_SET_PIXEL_FORMAT,		API_CACHE),
+	API_ENTRY(CX2341X_OSD_GET_STATE,		API_FAST_RESULT),
+	API_ENTRY(CX2341X_OSD_SET_STATE,		API_CACHE),
+	API_ENTRY(CX2341X_OSD_GET_OSD_COORDS,		API_FAST_RESULT),
+	API_ENTRY(CX2341X_OSD_SET_OSD_COORDS,		API_CACHE),
+	API_ENTRY(CX2341X_OSD_GET_SCREEN_COORDS,	API_FAST_RESULT),
+	API_ENTRY(CX2341X_OSD_SET_SCREEN_COORDS,	API_CACHE),
+	API_ENTRY(CX2341X_OSD_GET_GLOBAL_ALPHA,		API_FAST_RESULT),
+	API_ENTRY(CX2341X_OSD_SET_GLOBAL_ALPHA,		API_CACHE),
+	API_ENTRY(CX2341X_OSD_SET_BLEND_COORDS,		API_CACHE),
+	API_ENTRY(CX2341X_OSD_GET_FLICKER_STATE,	API_FAST_RESULT),
+	API_ENTRY(CX2341X_OSD_SET_FLICKER_STATE,	API_CACHE),
+	API_ENTRY(CX2341X_OSD_BLT_COPY,			API_RESULT),
+	API_ENTRY(CX2341X_OSD_BLT_FILL,			API_RESULT),
+	API_ENTRY(CX2341X_OSD_BLT_TEXT,			API_RESULT),
+	API_ENTRY(CX2341X_OSD_SET_FRAMEBUFFER_WINDOW,	API_CACHE),
+	API_ENTRY(CX2341X_OSD_SET_CHROMA_KEY,		API_CACHE),
+	API_ENTRY(CX2341X_OSD_GET_ALPHA_CONTENT_INDEX,	API_FAST_RESULT),
+	API_ENTRY(CX2341X_OSD_SET_ALPHA_CONTENT_INDEX,	API_CACHE)
 };
 
 static int try_mailbox(struct ivtv *itv, struct ivtv_mailbox_data *mbdata, int mb)
diff --git a/drivers/media/pci/mantis/mantis_reg.h b/drivers/media/pci/mantis/mantis_reg.h
index 7761f9dc7fe0..762ed9f7a08e 100644
--- a/drivers/media/pci/mantis/mantis_reg.h
+++ b/drivers/media/pci/mantis/mantis_reg.h
@@ -166,12 +166,12 @@
 #define MANTIS_CARD_PLUGOUT		(0x01 <<  0)
 
 #define MANTIS_GPIF_BRADDR		0xa0
-#define MANTIS_GPIF_PCMCIAREG		(0x01 		<< 27)
-#define MANTIS_GPIF_PCMCIAIOM		(0x01 		<< 26)
+#define MANTIS_GPIF_PCMCIAREG		(0x01		<< 27)
+#define MANTIS_GPIF_PCMCIAIOM		(0x01		<< 26)
 #define MANTIS_GPIF_BR_ADDR		(0xfffffff	<<  0)
 
 #define MANTIS_GPIF_BRBYTES		0xa4
-#define MANTIS_GPIF_BRCNT		(0xfff 		<<  0)
+#define MANTIS_GPIF_BRCNT		(0xfff		<<  0)
 
 #define MANTIS_PCMCIA_RESET		0xa8
 #define MANTIS_PCMCIA_RSTVAL		(0xff << 0)
diff --git a/drivers/media/pci/mantis/mantis_vp1041.c b/drivers/media/pci/mantis/mantis_vp1041.c
index 47e0c48c3abc..0eeccc2d19a5 100644
--- a/drivers/media/pci/mantis/mantis_vp1041.c
+++ b/drivers/media/pci/mantis/mantis_vp1041.c
@@ -47,70 +47,70 @@ static const struct stb0899_s1_reg vp1041_stb0899_s1_init_1[] = {
 	/* 0x0000000b, *//* SYSREG */
 	{ STB0899_DEV_ID		, 0x30 },
 	{ STB0899_DISCNTRL1		, 0x32 },
-	{ STB0899_DISCNTRL2     	, 0x80 },
-	{ STB0899_DISRX_ST0     	, 0x04 },
-	{ STB0899_DISRX_ST1     	, 0x00 },
-	{ STB0899_DISPARITY     	, 0x00 },
+	{ STB0899_DISCNTRL2		, 0x80 },
+	{ STB0899_DISRX_ST0		, 0x04 },
+	{ STB0899_DISRX_ST1		, 0x00 },
+	{ STB0899_DISPARITY		, 0x00 },
 	{ STB0899_DISSTATUS		, 0x20 },
-	{ STB0899_DISF22        	, 0x99 },
-	{ STB0899_DISF22RX      	, 0xa8 },
+	{ STB0899_DISF22		, 0x99 },
+	{ STB0899_DISF22RX		, 0xa8 },
 	/* SYSREG ? */
-	{ STB0899_ACRPRESC      	, 0x11 },
-	{ STB0899_ACRDIV1       	, 0x0a },
-	{ STB0899_ACRDIV2       	, 0x05 },
-	{ STB0899_DACR1         	, 0x00 },
-	{ STB0899_DACR2         	, 0x00 },
-	{ STB0899_OUTCFG        	, 0x00 },
-	{ STB0899_MODECFG       	, 0x00 },
+	{ STB0899_ACRPRESC		, 0x11 },
+	{ STB0899_ACRDIV1		, 0x0a },
+	{ STB0899_ACRDIV2		, 0x05 },
+	{ STB0899_DACR1			, 0x00 },
+	{ STB0899_DACR2			, 0x00 },
+	{ STB0899_OUTCFG		, 0x00 },
+	{ STB0899_MODECFG		, 0x00 },
 	{ STB0899_IRQSTATUS_3		, 0xfe },
 	{ STB0899_IRQSTATUS_2		, 0x03 },
 	{ STB0899_IRQSTATUS_1		, 0x7c },
 	{ STB0899_IRQSTATUS_0		, 0xf4 },
-	{ STB0899_IRQMSK_3      	, 0xf3 },
-	{ STB0899_IRQMSK_2      	, 0xfc },
-	{ STB0899_IRQMSK_1      	, 0xff },
+	{ STB0899_IRQMSK_3		, 0xf3 },
+	{ STB0899_IRQMSK_2		, 0xfc },
+	{ STB0899_IRQMSK_1		, 0xff },
 	{ STB0899_IRQMSK_0		, 0xff },
 	{ STB0899_IRQCFG		, 0x00 },
-	{ STB0899_I2CCFG        	, 0x88 },
-	{ STB0899_I2CRPT        	, 0x58 },
+	{ STB0899_I2CCFG		, 0x88 },
+	{ STB0899_I2CRPT		, 0x58 },
 	{ STB0899_IOPVALUE5		, 0x00 },
 	{ STB0899_IOPVALUE4		, 0x33 },
 	{ STB0899_IOPVALUE3		, 0x6d },
 	{ STB0899_IOPVALUE2		, 0x90 },
 	{ STB0899_IOPVALUE1		, 0x60 },
 	{ STB0899_IOPVALUE0		, 0x00 },
-	{ STB0899_GPIO00CFG     	, 0x82 },
-	{ STB0899_GPIO01CFG     	, 0x82 },
-	{ STB0899_GPIO02CFG     	, 0x82 },
-	{ STB0899_GPIO03CFG     	, 0x82 },
-	{ STB0899_GPIO04CFG     	, 0x82 },
-	{ STB0899_GPIO05CFG     	, 0x82 },
-	{ STB0899_GPIO06CFG     	, 0x82 },
-	{ STB0899_GPIO07CFG     	, 0x82 },
-	{ STB0899_GPIO08CFG     	, 0x82 },
-	{ STB0899_GPIO09CFG     	, 0x82 },
-	{ STB0899_GPIO10CFG     	, 0x82 },
-	{ STB0899_GPIO11CFG     	, 0x82 },
-	{ STB0899_GPIO12CFG     	, 0x82 },
-	{ STB0899_GPIO13CFG     	, 0x82 },
-	{ STB0899_GPIO14CFG     	, 0x82 },
-	{ STB0899_GPIO15CFG     	, 0x82 },
-	{ STB0899_GPIO16CFG     	, 0x82 },
-	{ STB0899_GPIO17CFG     	, 0x82 },
-	{ STB0899_GPIO18CFG     	, 0x82 },
-	{ STB0899_GPIO19CFG     	, 0x82 },
-	{ STB0899_GPIO20CFG     	, 0x82 },
-	{ STB0899_SDATCFG       	, 0xb8 },
-	{ STB0899_SCLTCFG       	, 0xba },
-	{ STB0899_AGCRFCFG      	, 0x1c }, /* 0x11 */
-	{ STB0899_GPIO22        	, 0x82 }, /* AGCBB2CFG */
-	{ STB0899_GPIO21        	, 0x91 }, /* AGCBB1CFG */
-	{ STB0899_DIRCLKCFG     	, 0x82 },
-	{ STB0899_CLKOUT27CFG   	, 0x7e },
-	{ STB0899_STDBYCFG      	, 0x82 },
-	{ STB0899_CS0CFG        	, 0x82 },
-	{ STB0899_CS1CFG        	, 0x82 },
-	{ STB0899_DISEQCOCFG    	, 0x20 },
+	{ STB0899_GPIO00CFG		, 0x82 },
+	{ STB0899_GPIO01CFG		, 0x82 },
+	{ STB0899_GPIO02CFG		, 0x82 },
+	{ STB0899_GPIO03CFG		, 0x82 },
+	{ STB0899_GPIO04CFG		, 0x82 },
+	{ STB0899_GPIO05CFG		, 0x82 },
+	{ STB0899_GPIO06CFG		, 0x82 },
+	{ STB0899_GPIO07CFG		, 0x82 },
+	{ STB0899_GPIO08CFG		, 0x82 },
+	{ STB0899_GPIO09CFG		, 0x82 },
+	{ STB0899_GPIO10CFG		, 0x82 },
+	{ STB0899_GPIO11CFG		, 0x82 },
+	{ STB0899_GPIO12CFG		, 0x82 },
+	{ STB0899_GPIO13CFG		, 0x82 },
+	{ STB0899_GPIO14CFG		, 0x82 },
+	{ STB0899_GPIO15CFG		, 0x82 },
+	{ STB0899_GPIO16CFG		, 0x82 },
+	{ STB0899_GPIO17CFG		, 0x82 },
+	{ STB0899_GPIO18CFG		, 0x82 },
+	{ STB0899_GPIO19CFG		, 0x82 },
+	{ STB0899_GPIO20CFG		, 0x82 },
+	{ STB0899_SDATCFG		, 0xb8 },
+	{ STB0899_SCLTCFG		, 0xba },
+	{ STB0899_AGCRFCFG		, 0x1c }, /* 0x11 */
+	{ STB0899_GPIO22		, 0x82 }, /* AGCBB2CFG */
+	{ STB0899_GPIO21		, 0x91 }, /* AGCBB1CFG */
+	{ STB0899_DIRCLKCFG		, 0x82 },
+	{ STB0899_CLKOUT27CFG		, 0x7e },
+	{ STB0899_STDBYCFG		, 0x82 },
+	{ STB0899_CS0CFG		, 0x82 },
+	{ STB0899_CS1CFG		, 0x82 },
+	{ STB0899_DISEQCOCFG		, 0x20 },
 	{ STB0899_GPIO32CFG		, 0x82 },
 	{ STB0899_GPIO33CFG		, 0x82 },
 	{ STB0899_GPIO34CFG		, 0x82 },
@@ -119,35 +119,35 @@ static const struct stb0899_s1_reg vp1041_stb0899_s1_init_1[] = {
 	{ STB0899_GPIO37CFG		, 0x82 },
 	{ STB0899_GPIO38CFG		, 0x82 },
 	{ STB0899_GPIO39CFG		, 0x82 },
-	{ STB0899_NCOARSE       	, 0x17 }, /* 0x15 = 27 Mhz Clock, F/3 = 198MHz, F/6 = 99MHz */
-	{ STB0899_SYNTCTRL      	, 0x02 }, /* 0x00 = CLK from CLKI, 0x02 = CLK from XTALI */
-	{ STB0899_FILTCTRL      	, 0x00 },
-	{ STB0899_SYSCTRL       	, 0x01 },
-	{ STB0899_STOPCLK1      	, 0x20 },
-	{ STB0899_STOPCLK2      	, 0x00 },
+	{ STB0899_NCOARSE		, 0x17 }, /* 0x15 = 27 Mhz Clock, F/3 = 198MHz, F/6 = 99MHz */
+	{ STB0899_SYNTCTRL		, 0x02 }, /* 0x00 = CLK from CLKI, 0x02 = CLK from XTALI */
+	{ STB0899_FILTCTRL		, 0x00 },
+	{ STB0899_SYSCTRL		, 0x01 },
+	{ STB0899_STOPCLK1		, 0x20 },
+	{ STB0899_STOPCLK2		, 0x00 },
 	{ STB0899_INTBUFSTATUS		, 0x00 },
-	{ STB0899_INTBUFCTRL    	, 0x0a },
+	{ STB0899_INTBUFCTRL		, 0x0a },
 	{ 0xffff			, 0xff },
 };
 
 static const struct stb0899_s1_reg vp1041_stb0899_s1_init_3[] = {
-	{ STB0899_DEMOD         	, 0x00 },
-	{ STB0899_RCOMPC        	, 0xc9 },
-	{ STB0899_AGC1CN        	, 0x01 },
-	{ STB0899_AGC1REF       	, 0x10 },
+	{ STB0899_DEMOD			, 0x00 },
+	{ STB0899_RCOMPC		, 0xc9 },
+	{ STB0899_AGC1CN		, 0x01 },
+	{ STB0899_AGC1REF		, 0x10 },
 	{ STB0899_RTC			, 0x23 },
-	{ STB0899_TMGCFG        	, 0x4e },
-	{ STB0899_AGC2REF       	, 0x34 },
-	{ STB0899_TLSR          	, 0x84 },
-	{ STB0899_CFD           	, 0xf7 },
+	{ STB0899_TMGCFG		, 0x4e },
+	{ STB0899_AGC2REF		, 0x34 },
+	{ STB0899_TLSR			, 0x84 },
+	{ STB0899_CFD			, 0xf7 },
 	{ STB0899_ACLC			, 0x87 },
-	{ STB0899_BCLC          	, 0x94 },
-	{ STB0899_EQON          	, 0x41 },
-	{ STB0899_LDT           	, 0xf1 },
-	{ STB0899_LDT2          	, 0xe3 },
-	{ STB0899_EQUALREF      	, 0xb4 },
-	{ STB0899_TMGRAMP       	, 0x10 },
-	{ STB0899_TMGTHD        	, 0x30 },
+	{ STB0899_BCLC			, 0x94 },
+	{ STB0899_EQON			, 0x41 },
+	{ STB0899_LDT			, 0xf1 },
+	{ STB0899_LDT2			, 0xe3 },
+	{ STB0899_EQUALREF		, 0xb4 },
+	{ STB0899_TMGRAMP		, 0x10 },
+	{ STB0899_TMGTHD		, 0x30 },
 	{ STB0899_IDCCOMP		, 0xfd },
 	{ STB0899_QDCCOMP		, 0xff },
 	{ STB0899_POWERI		, 0x0c },
@@ -166,12 +166,12 @@ static const struct stb0899_s1_reg vp1041_stb0899_s1_init_3[] = {
 	{ STB0899_NIRL			, 0x80 },
 	{ STB0899_ISYMB			, 0x1d },
 	{ STB0899_QSYMB			, 0xa6 },
-	{ STB0899_SFRH          	, 0x2f },
-	{ STB0899_SFRM          	, 0x68 },
-	{ STB0899_SFRL          	, 0x40 },
-	{ STB0899_SFRUPH        	, 0x2f },
-	{ STB0899_SFRUPM        	, 0x68 },
-	{ STB0899_SFRUPL        	, 0x40 },
+	{ STB0899_SFRH			, 0x2f },
+	{ STB0899_SFRM			, 0x68 },
+	{ STB0899_SFRL			, 0x40 },
+	{ STB0899_SFRUPH		, 0x2f },
+	{ STB0899_SFRUPM		, 0x68 },
+	{ STB0899_SFRUPL		, 0x40 },
 	{ STB0899_EQUAI1		, 0x02 },
 	{ STB0899_EQUAQ1		, 0xff },
 	{ STB0899_EQUAI2		, 0x04 },
@@ -183,7 +183,7 @@ static const struct stb0899_s1_reg vp1041_stb0899_s1_init_3[] = {
 	{ STB0899_EQUAI5		, 0x08 },
 	{ STB0899_EQUAQ5		, 0xf5 },
 	{ STB0899_DSTATUS2		, 0x00 },
-	{ STB0899_VSTATUS       	, 0x00 },
+	{ STB0899_VSTATUS		, 0x00 },
 	{ STB0899_VERROR		, 0x86 },
 	{ STB0899_IQSWAP		, 0x2a },
 	{ STB0899_ECNT1M		, 0x00 },
@@ -192,26 +192,26 @@ static const struct stb0899_s1_reg vp1041_stb0899_s1_init_3[] = {
 	{ STB0899_ECNT2L		, 0x00 },
 	{ STB0899_ECNT3M		, 0x0a },
 	{ STB0899_ECNT3L		, 0xad },
-	{ STB0899_FECAUTO1      	, 0x06 },
+	{ STB0899_FECAUTO1		, 0x06 },
 	{ STB0899_FECM			, 0x01 },
-	{ STB0899_VTH12         	, 0xb0 },
-	{ STB0899_VTH23         	, 0x7a },
+	{ STB0899_VTH12			, 0xb0 },
+	{ STB0899_VTH23			, 0x7a },
 	{ STB0899_VTH34			, 0x58 },
-	{ STB0899_VTH56         	, 0x38 },
-	{ STB0899_VTH67         	, 0x34 },
-	{ STB0899_VTH78         	, 0x24 },
-	{ STB0899_PRVIT         	, 0xff },
-	{ STB0899_VITSYNC       	, 0x19 },
-	{ STB0899_RSULC         	, 0xb1 }, /* DVB = 0xb1, DSS = 0xa1 */
-	{ STB0899_TSULC         	, 0x42 },
-	{ STB0899_RSLLC         	, 0x41 },
+	{ STB0899_VTH56			, 0x38 },
+	{ STB0899_VTH67			, 0x34 },
+	{ STB0899_VTH78			, 0x24 },
+	{ STB0899_PRVIT			, 0xff },
+	{ STB0899_VITSYNC		, 0x19 },
+	{ STB0899_RSULC			, 0xb1 }, /* DVB = 0xb1, DSS = 0xa1 */
+	{ STB0899_TSULC			, 0x42 },
+	{ STB0899_RSLLC			, 0x41 },
 	{ STB0899_TSLPL			, 0x12 },
-	{ STB0899_TSCFGH        	, 0x0c },
-	{ STB0899_TSCFGM        	, 0x00 },
-	{ STB0899_TSCFGL        	, 0x00 },
+	{ STB0899_TSCFGH		, 0x0c },
+	{ STB0899_TSCFGM		, 0x00 },
+	{ STB0899_TSCFGL		, 0x00 },
 	{ STB0899_TSOUT			, 0x69 }, /* 0x0d for CAM */
-	{ STB0899_RSSYNCDEL     	, 0x00 },
-	{ STB0899_TSINHDELH     	, 0x02 },
+	{ STB0899_RSSYNCDEL		, 0x00 },
+	{ STB0899_TSINHDELH		, 0x02 },
 	{ STB0899_TSINHDELM		, 0x00 },
 	{ STB0899_TSINHDELL		, 0x00 },
 	{ STB0899_TSLLSTKM		, 0x1b },
@@ -222,18 +222,18 @@ static const struct stb0899_s1_reg vp1041_stb0899_s1_init_3[] = {
 	{ STB0899_PCKLENLL		, 0xcc },
 	{ STB0899_RSPCKLEN		, 0xbd },
 	{ STB0899_TSSTATUS		, 0x90 },
-	{ STB0899_ERRCTRL1      	, 0xb6 },
-	{ STB0899_ERRCTRL2      	, 0x95 },
-	{ STB0899_ERRCTRL3      	, 0x8d },
+	{ STB0899_ERRCTRL1		, 0xb6 },
+	{ STB0899_ERRCTRL2		, 0x95 },
+	{ STB0899_ERRCTRL3		, 0x8d },
 	{ STB0899_DMONMSK1		, 0x27 },
 	{ STB0899_DMONMSK0		, 0x03 },
-	{ STB0899_DEMAPVIT      	, 0x5c },
+	{ STB0899_DEMAPVIT		, 0x5c },
 	{ STB0899_PLPARM		, 0x19 },
-	{ STB0899_PDELCTRL      	, 0x48 },
-	{ STB0899_PDELCTRL2     	, 0x00 },
-	{ STB0899_BBHCTRL1      	, 0x00 },
-	{ STB0899_BBHCTRL2      	, 0x00 },
-	{ STB0899_HYSTTHRESH    	, 0x77 },
+	{ STB0899_PDELCTRL		, 0x48 },
+	{ STB0899_PDELCTRL2		, 0x00 },
+	{ STB0899_BBHCTRL1		, 0x00 },
+	{ STB0899_BBHCTRL2		, 0x00 },
+	{ STB0899_HYSTTHRESH		, 0x77 },
 	{ STB0899_MATCSTM		, 0x00 },
 	{ STB0899_MATCSTL		, 0x00 },
 	{ STB0899_UPLCSTM		, 0x00 },
@@ -270,7 +270,7 @@ static struct stb0899_config vp1041_stb0899_config = {
 	.init_s2_fec		= stb0899_s2_init_4,
 	.init_tst		= stb0899_s1_init_5,
 
-	.demod_address 		= 0x68, /*  0xd0 >> 1 */
+	.demod_address		= 0x68, /*  0xd0 >> 1 */
 
 	.xtal_freq		= 27000000,
 	.inversion		= IQ_SWAP_ON,
diff --git a/drivers/media/pci/meye/meye.c b/drivers/media/pci/meye/meye.c
index 23999a8cef37..be860ec129b6 100644
--- a/drivers/media/pci/meye/meye.c
+++ b/drivers/media/pci/meye/meye.c
@@ -1536,7 +1536,7 @@ static const struct v4l2_ioctl_ops meye_ioctl_ops = {
 static const struct video_device meye_template = {
 	.name		= "meye",
 	.fops		= &meye_fops,
-	.ioctl_ops 	= &meye_ioctl_ops,
+	.ioctl_ops	= &meye_ioctl_ops,
 	.release	= video_device_release_empty,
 };
 
diff --git a/drivers/media/pci/saa7134/saa7134-cards.c b/drivers/media/pci/saa7134/saa7134-cards.c
index 9965d3531c80..9d6688a82b50 100644
--- a/drivers/media/pci/saa7134/saa7134-cards.c
+++ b/drivers/media/pci/saa7134/saa7134-cards.c
@@ -323,7 +323,7 @@ struct saa7134_board saa7134_boards[] = {
 		.radio_type     = UNSET,
 		.tuner_addr	= ADDR_UNSET,
 		.radio_addr	= ADDR_UNSET,
-		.empress_addr 	= 0x20,
+		.empress_addr	= 0x20,
 
 		.inputs         = {{
 			.type = SAA7134_INPUT_COMPOSITE1,
@@ -454,7 +454,7 @@ struct saa7134_board saa7134_boards[] = {
 		.radio_type     = UNSET,
 		.tuner_addr	= ADDR_UNSET,
 		.radio_addr	= ADDR_UNSET,
-		.empress_addr 	= 0x20,
+		.empress_addr	= 0x20,
 		.tda9887_conf	= TDA9887_PRESENT,
 		.gpiomask	= 0x820000,
 		.inputs		= {{
@@ -849,7 +849,7 @@ struct saa7134_board saa7134_boards[] = {
 		.radio_type     = UNSET,
 		.tuner_addr	= ADDR_UNSET,
 		.radio_addr	= ADDR_UNSET,
-		.empress_addr 	= 0x20,
+		.empress_addr	= 0x20,
 		.inputs         = {{
 			.type = SAA7134_INPUT_COMPOSITE1,
 			.vmux = 4,
@@ -1006,7 +1006,7 @@ struct saa7134_board saa7134_boards[] = {
 		.radio_type     = UNSET,
 		.tuner_addr	= ADDR_UNSET,
 		.radio_addr	= ADDR_UNSET,
-		.empress_addr 	= 0x20,
+		.empress_addr	= 0x20,
 		.inputs         = {{
 			.type = SAA7134_INPUT_COMPOSITE1,
 			.vmux = 1,
@@ -1767,7 +1767,7 @@ struct saa7134_board saa7134_boards[] = {
 		.radio_type     = UNSET,
 		.tuner_addr	= ADDR_UNSET,
 		.radio_addr	= ADDR_UNSET,
-		.rds_addr 	= 0x10,
+		.rds_addr	= 0x10,
 		.tda9887_conf   = TDA9887_PRESENT,
 		.inputs         = {{
 			.type = SAA7134_INPUT_TV,
@@ -2412,7 +2412,7 @@ struct saa7134_board saa7134_boards[] = {
 		.radio_type	= UNSET,
 		.tuner_addr	= ADDR_UNSET,
 		.radio_addr	= ADDR_UNSET,
-		.empress_addr 	= 0x21,
+		.empress_addr	= 0x21,
 		.inputs		= {{
 			.type = SAA7134_INPUT_COMPOSITE0,
 			.vmux   = 0,
@@ -3978,13 +3978,13 @@ struct saa7134_board saa7134_boards[] = {
 	[SAA7134_BOARD_BEHOLD_407] = {
 		/*       Beholder Intl. Ltd. 2008      */
 		/*Dmitry Belimov <d.belimov@gmail.com> */
-		.name 		= "Beholder BeholdTV 407",
-		.audio_clock 	= 0x00187de7,
-		.tuner_type 	= TUNER_PHILIPS_FM1216ME_MK3,
-		.radio_type 	= UNSET,
-		.tuner_addr 	= ADDR_UNSET,
-		.radio_addr 	= ADDR_UNSET,
-		.tda9887_conf 	= TDA9887_PRESENT,
+		.name		= "Beholder BeholdTV 407",
+		.audio_clock	= 0x00187de7,
+		.tuner_type	= TUNER_PHILIPS_FM1216ME_MK3,
+		.radio_type	= UNSET,
+		.tuner_addr	= ADDR_UNSET,
+		.radio_addr	= ADDR_UNSET,
+		.tda9887_conf	= TDA9887_PRESENT,
 		.gpiomask       = 0x00008000,
 		.inputs = {{
 			.type = SAA7134_INPUT_SVIDEO,
@@ -4006,13 +4006,13 @@ struct saa7134_board saa7134_boards[] = {
 	[SAA7134_BOARD_BEHOLD_407FM] = {
 		/*       Beholder Intl. Ltd. 2008      */
 		/*Dmitry Belimov <d.belimov@gmail.com> */
-		.name 		= "Beholder BeholdTV 407 FM",
-		.audio_clock 	= 0x00187de7,
-		.tuner_type 	= TUNER_PHILIPS_FM1216ME_MK3,
-		.radio_type 	= UNSET,
-		.tuner_addr 	= ADDR_UNSET,
-		.radio_addr 	= ADDR_UNSET,
-		.tda9887_conf 	= TDA9887_PRESENT,
+		.name		= "Beholder BeholdTV 407 FM",
+		.audio_clock	= 0x00187de7,
+		.tuner_type	= TUNER_PHILIPS_FM1216ME_MK3,
+		.radio_type	= UNSET,
+		.tuner_addr	= ADDR_UNSET,
+		.radio_addr	= ADDR_UNSET,
+		.tda9887_conf	= TDA9887_PRESENT,
 		.gpiomask       = 0x00008000,
 		.inputs = {{
 			.type = SAA7134_INPUT_SVIDEO,
@@ -4103,7 +4103,7 @@ struct saa7134_board saa7134_boards[] = {
 		.radio_type     = UNSET,
 		.tuner_addr     = ADDR_UNSET,
 		.radio_addr     = ADDR_UNSET,
-		.rds_addr 	= 0x10,
+		.rds_addr	= 0x10,
 		.tda9887_conf   = TDA9887_PRESENT,
 		.gpiomask       = 0x00008000,
 		.inputs         = {{
@@ -4166,7 +4166,7 @@ struct saa7134_board saa7134_boards[] = {
 		.radio_type     = UNSET,
 		.tuner_addr     = ADDR_UNSET,
 		.radio_addr     = ADDR_UNSET,
-		.rds_addr 	= 0x10,
+		.rds_addr	= 0x10,
 		.tda9887_conf   = TDA9887_PRESENT,
 		.gpiomask       = 0x00008000,
 		.inputs         = {{
@@ -4196,7 +4196,7 @@ struct saa7134_board saa7134_boards[] = {
 		.radio_type     = UNSET,
 		.tuner_addr     = ADDR_UNSET,
 		.radio_addr     = ADDR_UNSET,
-		.rds_addr 	= 0x10,
+		.rds_addr	= 0x10,
 		.tda9887_conf   = TDA9887_PRESENT,
 		.gpiomask       = 0x00008000,
 		.inputs         = {{
@@ -4366,7 +4366,7 @@ struct saa7134_board saa7134_boards[] = {
 		.radio_type     = UNSET,
 		.tuner_addr     = ADDR_UNSET,
 		.radio_addr     = ADDR_UNSET,
-		.rds_addr 	= 0x10,
+		.rds_addr	= 0x10,
 		.tda9887_conf   = TDA9887_PRESENT,
 		.inputs         = {{
 			.type = SAA7134_INPUT_TV,
@@ -4394,7 +4394,7 @@ struct saa7134_board saa7134_boards[] = {
 		.radio_type     = UNSET,
 		.tuner_addr     = ADDR_UNSET,
 		.radio_addr     = ADDR_UNSET,
-		.rds_addr 	= 0x10,
+		.rds_addr	= 0x10,
 		.tda9887_conf   = TDA9887_PRESENT,
 		.inputs         = {{
 			.type = SAA7134_INPUT_TV,
@@ -4422,7 +4422,7 @@ struct saa7134_board saa7134_boards[] = {
 		.radio_type     = UNSET,
 		.tuner_addr     = ADDR_UNSET,
 		.radio_addr     = ADDR_UNSET,
-		.rds_addr 	= 0x10,
+		.rds_addr	= 0x10,
 		.tda9887_conf   = TDA9887_PRESENT,
 		.inputs         = {{
 			.type = SAA7134_INPUT_TV,
@@ -4450,7 +4450,7 @@ struct saa7134_board saa7134_boards[] = {
 		.radio_type     = UNSET,
 		.tuner_addr     = ADDR_UNSET,
 		.radio_addr     = ADDR_UNSET,
-		.rds_addr 	= 0x10,
+		.rds_addr	= 0x10,
 		.tda9887_conf   = TDA9887_PRESENT,
 		.inputs         = {{
 			.type = SAA7134_INPUT_TV,
@@ -4481,7 +4481,7 @@ struct saa7134_board saa7134_boards[] = {
 		.radio_type     = UNSET,
 		.tuner_addr     = ADDR_UNSET,
 		.radio_addr     = ADDR_UNSET,
-		.empress_addr 	= 0x20,
+		.empress_addr	= 0x20,
 		.tda9887_conf   = TDA9887_PRESENT,
 		.inputs         = { {
 			.type = SAA7134_INPUT_TV,
@@ -4517,7 +4517,7 @@ struct saa7134_board saa7134_boards[] = {
 		.radio_type     = UNSET,
 		.tuner_addr     = ADDR_UNSET,
 		.radio_addr     = ADDR_UNSET,
-		.empress_addr 	= 0x20,
+		.empress_addr	= 0x20,
 		.tda9887_conf   = TDA9887_PRESENT,
 		.inputs         = { {
 			.type = SAA7134_INPUT_TV,
@@ -4554,8 +4554,8 @@ struct saa7134_board saa7134_boards[] = {
 		.radio_type     = UNSET,
 		.tuner_addr     = ADDR_UNSET,
 		.radio_addr     = ADDR_UNSET,
-		.rds_addr 	= 0x10,
-		.empress_addr 	= 0x20,
+		.rds_addr	= 0x10,
+		.empress_addr	= 0x20,
 		.tda9887_conf   = TDA9887_PRESENT,
 		.inputs         = { {
 			.type = SAA7134_INPUT_TV,
@@ -5297,7 +5297,7 @@ struct saa7134_board saa7134_boards[] = {
 		.radio_type     = UNSET,
 		.tuner_addr     = ADDR_UNSET,
 		.radio_addr     = ADDR_UNSET,
-		.rds_addr 	= 0x10,
+		.rds_addr	= 0x10,
 		.tda9887_conf   = TDA9887_PRESENT,
 		.gpiomask       = 0x00008000,
 		.inputs         = {{
diff --git a/drivers/media/pci/saa7134/saa7134-dvb.c b/drivers/media/pci/saa7134/saa7134-dvb.c
index b55f9a1d9a63..a7a63d608dde 100644
--- a/drivers/media/pci/saa7134/saa7134-dvb.c
+++ b/drivers/media/pci/saa7134/saa7134-dvb.c
@@ -1389,7 +1389,7 @@ static int dvb_init(struct saa7134_dev *dev)
 			if (configure_tda827x_fe(dev, &lifeview_trio_config,
 						 &tda827x_cfg_0) < 0)
 				goto detach_frontend;
-		} else {  		/* satellite */
+		} else {		/* satellite */
 			fe0->dvb.frontend = dvb_attach(tda10086_attach, &flydvbs, &dev->i2c_adap);
 			if (fe0->dvb.frontend) {
 				if (dvb_attach(tda826x_attach, fe0->dvb.frontend, 0x63,
@@ -1659,7 +1659,7 @@ static int dvb_init(struct saa7134_dev *dev)
 			if (configure_tda827x_fe(dev, &asus_tiger_3in1_config,
 							&tda827x_cfg_2) < 0)
 				goto detach_frontend;
-		} else {  		/* satellite */
+		} else {		/* satellite */
 			fe0->dvb.frontend = dvb_attach(tda10086_attach,
 						&flydvbs, &dev->i2c_adap);
 			if (fe0->dvb.frontend) {
diff --git a/drivers/media/pci/saa7134/saa7134-video.c b/drivers/media/pci/saa7134/saa7134-video.c
index 1ae5d2dac3bf..052e101d898c 100644
--- a/drivers/media/pci/saa7134/saa7134-video.c
+++ b/drivers/media/pci/saa7134/saa7134-video.c
@@ -2043,14 +2043,14 @@ static const struct v4l2_ioctl_ops radio_ioctl_ops = {
 struct video_device saa7134_video_template = {
 	.name				= "saa7134-video",
 	.fops				= &video_fops,
-	.ioctl_ops 			= &video_ioctl_ops,
+	.ioctl_ops			= &video_ioctl_ops,
 	.tvnorms			= SAA7134_NORMS,
 };
 
 struct video_device saa7134_radio_template = {
 	.name			= "saa7134-radio",
 	.fops			= &radio_fops,
-	.ioctl_ops 		= &radio_ioctl_ops,
+	.ioctl_ops		= &radio_ioctl_ops,
 };
 
 static const struct v4l2_ctrl_ops saa7134_ctrl_ops = {
diff --git a/drivers/media/pci/saa7134/saa7134.h b/drivers/media/pci/saa7134/saa7134.h
index 39c36e6aefbe..d99e937a98c1 100644
--- a/drivers/media/pci/saa7134/saa7134.h
+++ b/drivers/media/pci/saa7134/saa7134.h
@@ -261,8 +261,8 @@ struct saa7134_card_ir {
 #define SAA7134_BOARD_SABRENT_TV_PCB05     115
 #define SAA7134_BOARD_10MOONSTVMASTER3     116
 #define SAA7134_BOARD_AVERMEDIA_SUPER_007  117
-#define SAA7134_BOARD_BEHOLD_401  	118
-#define SAA7134_BOARD_BEHOLD_403  	119
+#define SAA7134_BOARD_BEHOLD_401	118
+#define SAA7134_BOARD_BEHOLD_403	119
 #define SAA7134_BOARD_BEHOLD_403FM	120
 #define SAA7134_BOARD_BEHOLD_405	121
 #define SAA7134_BOARD_BEHOLD_405FM	122
@@ -581,7 +581,7 @@ struct saa7134_dev {
 	/* config info */
 	unsigned int               board;
 	unsigned int               tuner_type;
-	unsigned int 		   radio_type;
+	unsigned int		   radio_type;
 	unsigned char		   tuner_addr;
 	unsigned char		   radio_addr;
 
@@ -592,7 +592,7 @@ struct saa7134_dev {
 	struct i2c_adapter         i2c_adap;
 	struct i2c_client          i2c_client;
 	unsigned char              eedata[256];
-	int 			   has_rds;
+	int			   has_rds;
 
 	/* video overlay */
 	struct v4l2_framebuffer    ovbuf;
diff --git a/drivers/media/pci/saa7146/hexium_gemini.c b/drivers/media/pci/saa7146/hexium_gemini.c
index 39357eddee32..5817d9cde4d0 100644
--- a/drivers/media/pci/saa7146/hexium_gemini.c
+++ b/drivers/media/pci/saa7146/hexium_gemini.c
@@ -70,8 +70,8 @@ struct hexium
 	struct video_device	video_dev;
 	struct i2c_adapter	i2c_adapter;
 
-	int 		cur_input;	/* current input */
-	v4l2_std_id 	cur_std;	/* current standard */
+	int		cur_input;	/* current input */
+	v4l2_std_id	cur_std;	/* current standard */
 };
 
 /* Samsung KS0127B decoder default registers */
@@ -138,19 +138,19 @@ static struct hexium_data hexium_input_select[] = {
    are currently *not* supported*/
 static struct saa7146_standard hexium_standards[] = {
 	{
-		.name	= "PAL", 	.id	= V4L2_STD_PAL,
-		.v_offset	= 28,	.v_field 	= 288,
-		.h_offset	= 1,	.h_pixels 	= 680,
+		.name	= "PAL",	.id	= V4L2_STD_PAL,
+		.v_offset	= 28,	.v_field	= 288,
+		.h_offset	= 1,	.h_pixels	= 680,
 		.v_max_out	= 576,	.h_max_out	= 768,
 	}, {
-		.name	= "NTSC", 	.id	= V4L2_STD_NTSC,
-		.v_offset	= 28,	.v_field 	= 240,
-		.h_offset	= 1,	.h_pixels 	= 640,
+		.name	= "NTSC",	.id	= V4L2_STD_NTSC,
+		.v_offset	= 28,	.v_field	= 240,
+		.h_offset	= 1,	.h_pixels	= 640,
 		.v_max_out	= 480,	.h_max_out	= 640,
 	}, {
-		.name	= "SECAM", 	.id	= V4L2_STD_SECAM,
-		.v_offset	= 28,	.v_field 	= 288,
-		.h_offset	= 1,	.h_pixels 	= 720,
+		.name	= "SECAM",	.id	= V4L2_STD_SECAM,
+		.v_offset	= 28,	.v_field	= 288,
+		.h_offset	= 1,	.h_pixels	= 720,
 		.v_max_out	= 576,	.h_max_out	= 768,
 	}
 };
diff --git a/drivers/media/pci/saa7146/hexium_orion.c b/drivers/media/pci/saa7146/hexium_orion.c
index 461e421080f3..0a05176c18ab 100644
--- a/drivers/media/pci/saa7146/hexium_orion.c
+++ b/drivers/media/pci/saa7146/hexium_orion.c
@@ -188,19 +188,19 @@ static struct {
 
 static struct saa7146_standard hexium_standards[] = {
 	{
-		.name	= "PAL", 	.id	= V4L2_STD_PAL,
-		.v_offset	= 16,	.v_field 	= 288,
-		.h_offset	= 1,	.h_pixels 	= 680,
+		.name	= "PAL",	.id	= V4L2_STD_PAL,
+		.v_offset	= 16,	.v_field	= 288,
+		.h_offset	= 1,	.h_pixels	= 680,
 		.v_max_out	= 576,	.h_max_out	= 768,
 	}, {
-		.name	= "NTSC", 	.id	= V4L2_STD_NTSC,
-		.v_offset	= 16,	.v_field 	= 240,
-		.h_offset	= 1,	.h_pixels 	= 640,
+		.name	= "NTSC",	.id	= V4L2_STD_NTSC,
+		.v_offset	= 16,	.v_field	= 240,
+		.h_offset	= 1,	.h_pixels	= 640,
 		.v_max_out	= 480,	.h_max_out	= 640,
 	}, {
-		.name	= "SECAM", 	.id	= V4L2_STD_SECAM,
-		.v_offset	= 16,	.v_field 	= 288,
-		.h_offset	= 1,	.h_pixels 	= 720,
+		.name	= "SECAM",	.id	= V4L2_STD_SECAM,
+		.v_offset	= 16,	.v_field	= 288,
+		.h_offset	= 1,	.h_pixels	= 720,
 		.v_max_out	= 576,	.h_max_out	= 768,
 	}
 };
diff --git a/drivers/media/pci/saa7146/mxb.c b/drivers/media/pci/saa7146/mxb.c
index 2526fc051b65..6b5582b7c595 100644
--- a/drivers/media/pci/saa7146/mxb.c
+++ b/drivers/media/pci/saa7146/mxb.c
@@ -793,24 +793,24 @@ static int std_callback(struct saa7146_dev *dev, struct saa7146_standard *standa
 
 static struct saa7146_standard standard[] = {
 	{
-		.name	= "PAL-BG", 	.id	= V4L2_STD_PAL_BG,
-		.v_offset	= 0x17,	.v_field 	= 288,
-		.h_offset	= 0x14,	.h_pixels 	= 680,
+		.name	= "PAL-BG",	.id	= V4L2_STD_PAL_BG,
+		.v_offset	= 0x17,	.v_field	= 288,
+		.h_offset	= 0x14,	.h_pixels	= 680,
 		.v_max_out	= 576,	.h_max_out	= 768,
 	}, {
-		.name	= "PAL-I", 	.id	= V4L2_STD_PAL_I,
-		.v_offset	= 0x17,	.v_field 	= 288,
-		.h_offset	= 0x14,	.h_pixels 	= 680,
+		.name	= "PAL-I",	.id	= V4L2_STD_PAL_I,
+		.v_offset	= 0x17,	.v_field	= 288,
+		.h_offset	= 0x14,	.h_pixels	= 680,
 		.v_max_out	= 576,	.h_max_out	= 768,
 	}, {
-		.name	= "NTSC", 	.id	= V4L2_STD_NTSC,
-		.v_offset	= 0x16,	.v_field 	= 240,
-		.h_offset	= 0x06,	.h_pixels 	= 708,
+		.name	= "NTSC",	.id	= V4L2_STD_NTSC,
+		.v_offset	= 0x16,	.v_field	= 240,
+		.h_offset	= 0x06,	.h_pixels	= 708,
 		.v_max_out	= 480,	.h_max_out	= 640,
 	}, {
-		.name	= "SECAM", 	.id	= V4L2_STD_SECAM,
-		.v_offset	= 0x14,	.v_field 	= 288,
-		.h_offset	= 0x14,	.h_pixels 	= 720,
+		.name	= "SECAM",	.id	= V4L2_STD_SECAM,
+		.v_offset	= 0x14,	.v_field	= 288,
+		.h_offset	= 0x14,	.h_pixels	= 720,
 		.v_max_out	= 576,	.h_max_out	= 768,
 	}
 };
diff --git a/drivers/media/pci/ttpci/av7110.h b/drivers/media/pci/ttpci/av7110.h
index 9bfbb1471717..8606ef5ebbe2 100644
--- a/drivers/media/pci/ttpci/av7110.h
+++ b/drivers/media/pci/ttpci/av7110.h
@@ -52,7 +52,7 @@ extern int av7110_debug;
 enum {AV_PES_STREAM, PS_STREAM, TS_STREAM, PES_STREAM};
 
 enum av7110_video_mode {
-	AV7110_VIDEO_MODE_PAL 	= 0,
+	AV7110_VIDEO_MODE_PAL	= 0,
 	AV7110_VIDEO_MODE_NTSC	= 1
 };
 
diff --git a/drivers/media/pci/ttpci/budget-av.c b/drivers/media/pci/ttpci/budget-av.c
index 6b0e09ca01dc..abc98f1ad26c 100644
--- a/drivers/media/pci/ttpci/budget-av.c
+++ b/drivers/media/pci/ttpci/budget-av.c
@@ -1181,14 +1181,14 @@ static u8 read_pwm(struct budget_av *budget_av)
 #define SUBID_DVBS_KNC1_PLUS		0x0011
 #define SUBID_DVBS_TYPHOON		0x4f56
 #define SUBID_DVBS_CINERGY1200		0x1154
-#define SUBID_DVBS_CYNERGY1200N 	0x1155
+#define SUBID_DVBS_CYNERGY1200N		0x1155
 #define SUBID_DVBS_TV_STAR		0x0014
 #define SUBID_DVBS_TV_STAR_PLUS_X4	0x0015
 #define SUBID_DVBS_TV_STAR_CI		0x0016
 #define SUBID_DVBS2_KNC1		0x0018
 #define SUBID_DVBS2_KNC1_OEM		0x0019
-#define SUBID_DVBS_EASYWATCH_1  	0x001a
-#define SUBID_DVBS_EASYWATCH_2  	0x001b
+#define SUBID_DVBS_EASYWATCH_1		0x001a
+#define SUBID_DVBS_EASYWATCH_2		0x001b
 #define SUBID_DVBS2_EASYWATCH		0x001d
 #define SUBID_DVBS_EASYWATCH		0x001e
 
diff --git a/drivers/media/pci/ttpci/budget-ci.c b/drivers/media/pci/ttpci/budget-ci.c
index f67ed118f273..ec8f92540f7c 100644
--- a/drivers/media/pci/ttpci/budget-ci.c
+++ b/drivers/media/pci/ttpci/budget-ci.c
@@ -1050,70 +1050,70 @@ static const struct stb0899_s1_reg tt3200_stb0899_s1_init_1[] = {
 
 	{ STB0899_DEV_ID		, 0x81 },
 	{ STB0899_DISCNTRL1		, 0x32 },
-	{ STB0899_DISCNTRL2     	, 0x80 },
-	{ STB0899_DISRX_ST0     	, 0x04 },
-	{ STB0899_DISRX_ST1     	, 0x00 },
-	{ STB0899_DISPARITY     	, 0x00 },
+	{ STB0899_DISCNTRL2		, 0x80 },
+	{ STB0899_DISRX_ST0		, 0x04 },
+	{ STB0899_DISRX_ST1		, 0x00 },
+	{ STB0899_DISPARITY		, 0x00 },
 	{ STB0899_DISSTATUS		, 0x20 },
-	{ STB0899_DISF22        	, 0x8c },
-	{ STB0899_DISF22RX      	, 0x9a },
+	{ STB0899_DISF22		, 0x8c },
+	{ STB0899_DISF22RX		, 0x9a },
 	{ STB0899_SYSREG		, 0x0b },
-	{ STB0899_ACRPRESC      	, 0x11 },
-	{ STB0899_ACRDIV1       	, 0x0a },
-	{ STB0899_ACRDIV2       	, 0x05 },
-	{ STB0899_DACR1         	, 0x00 },
-	{ STB0899_DACR2         	, 0x00 },
-	{ STB0899_OUTCFG        	, 0x00 },
-	{ STB0899_MODECFG       	, 0x00 },
+	{ STB0899_ACRPRESC		, 0x11 },
+	{ STB0899_ACRDIV1		, 0x0a },
+	{ STB0899_ACRDIV2		, 0x05 },
+	{ STB0899_DACR1			, 0x00 },
+	{ STB0899_DACR2			, 0x00 },
+	{ STB0899_OUTCFG		, 0x00 },
+	{ STB0899_MODECFG		, 0x00 },
 	{ STB0899_IRQSTATUS_3		, 0x30 },
 	{ STB0899_IRQSTATUS_2		, 0x00 },
 	{ STB0899_IRQSTATUS_1		, 0x00 },
 	{ STB0899_IRQSTATUS_0		, 0x00 },
-	{ STB0899_IRQMSK_3      	, 0xf3 },
-	{ STB0899_IRQMSK_2      	, 0xfc },
-	{ STB0899_IRQMSK_1      	, 0xff },
+	{ STB0899_IRQMSK_3		, 0xf3 },
+	{ STB0899_IRQMSK_2		, 0xfc },
+	{ STB0899_IRQMSK_1		, 0xff },
 	{ STB0899_IRQMSK_0		, 0xff },
 	{ STB0899_IRQCFG		, 0x00 },
-	{ STB0899_I2CCFG        	, 0x88 },
-	{ STB0899_I2CRPT        	, 0x48 }, /* 12k Pullup, Repeater=16, Stop=disabled */
+	{ STB0899_I2CCFG		, 0x88 },
+	{ STB0899_I2CRPT		, 0x48 }, /* 12k Pullup, Repeater=16, Stop=disabled */
 	{ STB0899_IOPVALUE5		, 0x00 },
 	{ STB0899_IOPVALUE4		, 0x20 },
 	{ STB0899_IOPVALUE3		, 0xc9 },
 	{ STB0899_IOPVALUE2		, 0x90 },
 	{ STB0899_IOPVALUE1		, 0x40 },
 	{ STB0899_IOPVALUE0		, 0x00 },
-	{ STB0899_GPIO00CFG     	, 0x82 },
-	{ STB0899_GPIO01CFG     	, 0x82 },
-	{ STB0899_GPIO02CFG     	, 0x82 },
-	{ STB0899_GPIO03CFG     	, 0x82 },
-	{ STB0899_GPIO04CFG     	, 0x82 },
-	{ STB0899_GPIO05CFG     	, 0x82 },
-	{ STB0899_GPIO06CFG     	, 0x82 },
-	{ STB0899_GPIO07CFG     	, 0x82 },
-	{ STB0899_GPIO08CFG     	, 0x82 },
-	{ STB0899_GPIO09CFG     	, 0x82 },
-	{ STB0899_GPIO10CFG     	, 0x82 },
-	{ STB0899_GPIO11CFG     	, 0x82 },
-	{ STB0899_GPIO12CFG     	, 0x82 },
-	{ STB0899_GPIO13CFG     	, 0x82 },
-	{ STB0899_GPIO14CFG     	, 0x82 },
-	{ STB0899_GPIO15CFG     	, 0x82 },
-	{ STB0899_GPIO16CFG     	, 0x82 },
-	{ STB0899_GPIO17CFG     	, 0x82 },
-	{ STB0899_GPIO18CFG     	, 0x82 },
-	{ STB0899_GPIO19CFG     	, 0x82 },
-	{ STB0899_GPIO20CFG     	, 0x82 },
-	{ STB0899_SDATCFG       	, 0xb8 },
-	{ STB0899_SCLTCFG       	, 0xba },
-	{ STB0899_AGCRFCFG      	, 0x1c }, /* 0x11 */
-	{ STB0899_GPIO22        	, 0x82 }, /* AGCBB2CFG */
-	{ STB0899_GPIO21        	, 0x91 }, /* AGCBB1CFG */
-	{ STB0899_DIRCLKCFG     	, 0x82 },
-	{ STB0899_CLKOUT27CFG   	, 0x7e },
-	{ STB0899_STDBYCFG      	, 0x82 },
-	{ STB0899_CS0CFG        	, 0x82 },
-	{ STB0899_CS1CFG        	, 0x82 },
-	{ STB0899_DISEQCOCFG    	, 0x20 },
+	{ STB0899_GPIO00CFG		, 0x82 },
+	{ STB0899_GPIO01CFG		, 0x82 },
+	{ STB0899_GPIO02CFG		, 0x82 },
+	{ STB0899_GPIO03CFG		, 0x82 },
+	{ STB0899_GPIO04CFG		, 0x82 },
+	{ STB0899_GPIO05CFG		, 0x82 },
+	{ STB0899_GPIO06CFG		, 0x82 },
+	{ STB0899_GPIO07CFG		, 0x82 },
+	{ STB0899_GPIO08CFG		, 0x82 },
+	{ STB0899_GPIO09CFG		, 0x82 },
+	{ STB0899_GPIO10CFG		, 0x82 },
+	{ STB0899_GPIO11CFG		, 0x82 },
+	{ STB0899_GPIO12CFG		, 0x82 },
+	{ STB0899_GPIO13CFG		, 0x82 },
+	{ STB0899_GPIO14CFG		, 0x82 },
+	{ STB0899_GPIO15CFG		, 0x82 },
+	{ STB0899_GPIO16CFG		, 0x82 },
+	{ STB0899_GPIO17CFG		, 0x82 },
+	{ STB0899_GPIO18CFG		, 0x82 },
+	{ STB0899_GPIO19CFG		, 0x82 },
+	{ STB0899_GPIO20CFG		, 0x82 },
+	{ STB0899_SDATCFG		, 0xb8 },
+	{ STB0899_SCLTCFG		, 0xba },
+	{ STB0899_AGCRFCFG		, 0x1c }, /* 0x11 */
+	{ STB0899_GPIO22		, 0x82 }, /* AGCBB2CFG */
+	{ STB0899_GPIO21		, 0x91 }, /* AGCBB1CFG */
+	{ STB0899_DIRCLKCFG		, 0x82 },
+	{ STB0899_CLKOUT27CFG		, 0x7e },
+	{ STB0899_STDBYCFG		, 0x82 },
+	{ STB0899_CS0CFG		, 0x82 },
+	{ STB0899_CS1CFG		, 0x82 },
+	{ STB0899_DISEQCOCFG		, 0x20 },
 	{ STB0899_GPIO32CFG		, 0x82 },
 	{ STB0899_GPIO33CFG		, 0x82 },
 	{ STB0899_GPIO34CFG		, 0x82 },
@@ -1122,35 +1122,35 @@ static const struct stb0899_s1_reg tt3200_stb0899_s1_init_1[] = {
 	{ STB0899_GPIO37CFG		, 0x82 },
 	{ STB0899_GPIO38CFG		, 0x82 },
 	{ STB0899_GPIO39CFG		, 0x82 },
-	{ STB0899_NCOARSE       	, 0x15 }, /* 0x15 = 27 Mhz Clock, F/3 = 198MHz, F/6 = 99MHz */
-	{ STB0899_SYNTCTRL      	, 0x02 }, /* 0x00 = CLK from CLKI, 0x02 = CLK from XTALI */
-	{ STB0899_FILTCTRL      	, 0x00 },
-	{ STB0899_SYSCTRL       	, 0x00 },
-	{ STB0899_STOPCLK1      	, 0x20 },
-	{ STB0899_STOPCLK2      	, 0x00 },
+	{ STB0899_NCOARSE		, 0x15 }, /* 0x15 = 27 Mhz Clock, F/3 = 198MHz, F/6 = 99MHz */
+	{ STB0899_SYNTCTRL		, 0x02 }, /* 0x00 = CLK from CLKI, 0x02 = CLK from XTALI */
+	{ STB0899_FILTCTRL		, 0x00 },
+	{ STB0899_SYSCTRL		, 0x00 },
+	{ STB0899_STOPCLK1		, 0x20 },
+	{ STB0899_STOPCLK2		, 0x00 },
 	{ STB0899_INTBUFSTATUS		, 0x00 },
-	{ STB0899_INTBUFCTRL    	, 0x0a },
+	{ STB0899_INTBUFCTRL		, 0x0a },
 	{ 0xffff			, 0xff },
 };
 
 static const struct stb0899_s1_reg tt3200_stb0899_s1_init_3[] = {
-	{ STB0899_DEMOD         	, 0x00 },
-	{ STB0899_RCOMPC        	, 0xc9 },
-	{ STB0899_AGC1CN        	, 0x41 },
-	{ STB0899_AGC1REF       	, 0x10 },
+	{ STB0899_DEMOD			, 0x00 },
+	{ STB0899_RCOMPC		, 0xc9 },
+	{ STB0899_AGC1CN		, 0x41 },
+	{ STB0899_AGC1REF		, 0x10 },
 	{ STB0899_RTC			, 0x7a },
-	{ STB0899_TMGCFG        	, 0x4e },
-	{ STB0899_AGC2REF       	, 0x34 },
-	{ STB0899_TLSR          	, 0x84 },
-	{ STB0899_CFD           	, 0xc7 },
+	{ STB0899_TMGCFG		, 0x4e },
+	{ STB0899_AGC2REF		, 0x34 },
+	{ STB0899_TLSR			, 0x84 },
+	{ STB0899_CFD			, 0xc7 },
 	{ STB0899_ACLC			, 0x87 },
-	{ STB0899_BCLC          	, 0x94 },
-	{ STB0899_EQON          	, 0x41 },
-	{ STB0899_LDT           	, 0xdd },
-	{ STB0899_LDT2          	, 0xc9 },
-	{ STB0899_EQUALREF      	, 0xb4 },
-	{ STB0899_TMGRAMP       	, 0x10 },
-	{ STB0899_TMGTHD        	, 0x30 },
+	{ STB0899_BCLC			, 0x94 },
+	{ STB0899_EQON			, 0x41 },
+	{ STB0899_LDT			, 0xdd },
+	{ STB0899_LDT2			, 0xc9 },
+	{ STB0899_EQUALREF		, 0xb4 },
+	{ STB0899_TMGRAMP		, 0x10 },
+	{ STB0899_TMGTHD		, 0x30 },
 	{ STB0899_IDCCOMP		, 0xfb },
 	{ STB0899_QDCCOMP		, 0x03 },
 	{ STB0899_POWERI		, 0x3b },
@@ -1169,12 +1169,12 @@ static const struct stb0899_s1_reg tt3200_stb0899_s1_init_3[] = {
 	{ STB0899_NIRL			, 0x05 },
 	{ STB0899_ISYMB			, 0x17 },
 	{ STB0899_QSYMB			, 0xfa },
-	{ STB0899_SFRH          	, 0x2f },
-	{ STB0899_SFRM          	, 0x68 },
-	{ STB0899_SFRL          	, 0x40 },
-	{ STB0899_SFRUPH        	, 0x2f },
-	{ STB0899_SFRUPM        	, 0x68 },
-	{ STB0899_SFRUPL        	, 0x40 },
+	{ STB0899_SFRH			, 0x2f },
+	{ STB0899_SFRM			, 0x68 },
+	{ STB0899_SFRL			, 0x40 },
+	{ STB0899_SFRUPH		, 0x2f },
+	{ STB0899_SFRUPM		, 0x68 },
+	{ STB0899_SFRUPL		, 0x40 },
 	{ STB0899_EQUAI1		, 0xfd },
 	{ STB0899_EQUAQ1		, 0x04 },
 	{ STB0899_EQUAI2		, 0x0f },
@@ -1186,7 +1186,7 @@ static const struct stb0899_s1_reg tt3200_stb0899_s1_init_3[] = {
 	{ STB0899_EQUAI5		, 0xbd },
 	{ STB0899_EQUAQ5		, 0xf7 },
 	{ STB0899_DSTATUS2		, 0x00 },
-	{ STB0899_VSTATUS       	, 0x00 },
+	{ STB0899_VSTATUS		, 0x00 },
 	{ STB0899_VERROR		, 0xff },
 	{ STB0899_IQSWAP		, 0x2a },
 	{ STB0899_ECNT1M		, 0x00 },
@@ -1195,26 +1195,26 @@ static const struct stb0899_s1_reg tt3200_stb0899_s1_init_3[] = {
 	{ STB0899_ECNT2L		, 0x00 },
 	{ STB0899_ECNT3M		, 0x00 },
 	{ STB0899_ECNT3L		, 0x00 },
-	{ STB0899_FECAUTO1      	, 0x06 },
+	{ STB0899_FECAUTO1		, 0x06 },
 	{ STB0899_FECM			, 0x01 },
-	{ STB0899_VTH12         	, 0xf0 },
-	{ STB0899_VTH23         	, 0xa0 },
+	{ STB0899_VTH12			, 0xf0 },
+	{ STB0899_VTH23			, 0xa0 },
 	{ STB0899_VTH34			, 0x78 },
-	{ STB0899_VTH56         	, 0x4e },
-	{ STB0899_VTH67         	, 0x48 },
-	{ STB0899_VTH78         	, 0x38 },
-	{ STB0899_PRVIT         	, 0xff },
-	{ STB0899_VITSYNC       	, 0x19 },
-	{ STB0899_RSULC         	, 0xb1 }, /* DVB = 0xb1, DSS = 0xa1 */
-	{ STB0899_TSULC         	, 0x42 },
-	{ STB0899_RSLLC         	, 0x40 },
+	{ STB0899_VTH56			, 0x4e },
+	{ STB0899_VTH67			, 0x48 },
+	{ STB0899_VTH78			, 0x38 },
+	{ STB0899_PRVIT			, 0xff },
+	{ STB0899_VITSYNC		, 0x19 },
+	{ STB0899_RSULC			, 0xb1 }, /* DVB = 0xb1, DSS = 0xa1 */
+	{ STB0899_TSULC			, 0x42 },
+	{ STB0899_RSLLC			, 0x40 },
 	{ STB0899_TSLPL			, 0x12 },
-	{ STB0899_TSCFGH        	, 0x0c },
-	{ STB0899_TSCFGM        	, 0x00 },
-	{ STB0899_TSCFGL        	, 0x0c },
+	{ STB0899_TSCFGH		, 0x0c },
+	{ STB0899_TSCFGM		, 0x00 },
+	{ STB0899_TSCFGL		, 0x0c },
 	{ STB0899_TSOUT			, 0x4d }, /* 0x0d for CAM */
-	{ STB0899_RSSYNCDEL     	, 0x00 },
-	{ STB0899_TSINHDELH     	, 0x02 },
+	{ STB0899_RSSYNCDEL		, 0x00 },
+	{ STB0899_TSINHDELH		, 0x02 },
 	{ STB0899_TSINHDELM		, 0x00 },
 	{ STB0899_TSINHDELL		, 0x00 },
 	{ STB0899_TSLLSTKM		, 0x00 },
@@ -1225,18 +1225,18 @@ static const struct stb0899_s1_reg tt3200_stb0899_s1_init_3[] = {
 	{ STB0899_PCKLENLL		, 0xcc },
 	{ STB0899_RSPCKLEN		, 0xcc },
 	{ STB0899_TSSTATUS		, 0x80 },
-	{ STB0899_ERRCTRL1      	, 0xb6 },
-	{ STB0899_ERRCTRL2      	, 0x96 },
-	{ STB0899_ERRCTRL3      	, 0x89 },
+	{ STB0899_ERRCTRL1		, 0xb6 },
+	{ STB0899_ERRCTRL2		, 0x96 },
+	{ STB0899_ERRCTRL3		, 0x89 },
 	{ STB0899_DMONMSK1		, 0x27 },
 	{ STB0899_DMONMSK0		, 0x03 },
-	{ STB0899_DEMAPVIT      	, 0x5c },
+	{ STB0899_DEMAPVIT		, 0x5c },
 	{ STB0899_PLPARM		, 0x1f },
-	{ STB0899_PDELCTRL      	, 0x48 },
-	{ STB0899_PDELCTRL2     	, 0x00 },
-	{ STB0899_BBHCTRL1      	, 0x00 },
-	{ STB0899_BBHCTRL2      	, 0x00 },
-	{ STB0899_HYSTTHRESH    	, 0x77 },
+	{ STB0899_PDELCTRL		, 0x48 },
+	{ STB0899_PDELCTRL2		, 0x00 },
+	{ STB0899_BBHCTRL1		, 0x00 },
+	{ STB0899_BBHCTRL2		, 0x00 },
+	{ STB0899_HYSTTHRESH		, 0x77 },
 	{ STB0899_MATCSTM		, 0x00 },
 	{ STB0899_MATCSTL		, 0x00 },
 	{ STB0899_UPLCSTM		, 0x00 },
@@ -1275,7 +1275,7 @@ static struct stb0899_config tt3200_config = {
 
 	.postproc		= NULL,
 
-	.demod_address 		= 0x68,
+	.demod_address		= 0x68,
 
 	.xtal_freq		= 27000000,
 	.inversion		= IQ_SWAP_ON,
diff --git a/drivers/media/pci/zoran/zoran_driver.c b/drivers/media/pci/zoran/zoran_driver.c
index d07840072337..10fefdf2f1e2 100644
--- a/drivers/media/pci/zoran/zoran_driver.c
+++ b/drivers/media/pci/zoran/zoran_driver.c
@@ -2792,21 +2792,21 @@ zoran_mmap (struct file           *file,
 }
 
 static const struct v4l2_ioctl_ops zoran_ioctl_ops = {
-	.vidioc_querycap    		    = zoran_querycap,
+	.vidioc_querycap		    = zoran_querycap,
 	.vidioc_s_selection		    = zoran_s_selection,
 	.vidioc_g_selection		    = zoran_g_selection,
-	.vidioc_enum_input     		    = zoran_enum_input,
-	.vidioc_g_input      		    = zoran_g_input,
-	.vidioc_s_input      		    = zoran_s_input,
-	.vidioc_enum_output    		    = zoran_enum_output,
-	.vidioc_g_output     		    = zoran_g_output,
-	.vidioc_s_output     		    = zoran_s_output,
+	.vidioc_enum_input		    = zoran_enum_input,
+	.vidioc_g_input			    = zoran_g_input,
+	.vidioc_s_input			    = zoran_s_input,
+	.vidioc_enum_output		    = zoran_enum_output,
+	.vidioc_g_output		    = zoran_g_output,
+	.vidioc_s_output		    = zoran_s_output,
 	.vidioc_g_fbuf			    = zoran_g_fbuf,
 	.vidioc_s_fbuf			    = zoran_s_fbuf,
-	.vidioc_g_std 			    = zoran_g_std,
-	.vidioc_s_std 			    = zoran_s_std,
-	.vidioc_g_jpegcomp 		    = zoran_g_jpegcomp,
-	.vidioc_s_jpegcomp 		    = zoran_s_jpegcomp,
+	.vidioc_g_std			    = zoran_g_std,
+	.vidioc_s_std			    = zoran_s_std,
+	.vidioc_g_jpegcomp		    = zoran_g_jpegcomp,
+	.vidioc_s_jpegcomp		    = zoran_s_jpegcomp,
 	.vidioc_overlay			    = zoran_overlay,
 	.vidioc_reqbufs			    = zoran_reqbufs,
 	.vidioc_querybuf		    = zoran_querybuf,
@@ -2814,18 +2814,18 @@ static const struct v4l2_ioctl_ops zoran_ioctl_ops = {
 	.vidioc_dqbuf			    = zoran_dqbuf,
 	.vidioc_streamon		    = zoran_streamon,
 	.vidioc_streamoff		    = zoran_streamoff,
-	.vidioc_enum_fmt_vid_cap 	    = zoran_enum_fmt_vid_cap,
-	.vidioc_enum_fmt_vid_out 	    = zoran_enum_fmt_vid_out,
-	.vidioc_enum_fmt_vid_overlay 	    = zoran_enum_fmt_vid_overlay,
-	.vidioc_g_fmt_vid_cap 		    = zoran_g_fmt_vid_cap,
+	.vidioc_enum_fmt_vid_cap	    = zoran_enum_fmt_vid_cap,
+	.vidioc_enum_fmt_vid_out	    = zoran_enum_fmt_vid_out,
+	.vidioc_enum_fmt_vid_overlay	    = zoran_enum_fmt_vid_overlay,
+	.vidioc_g_fmt_vid_cap		    = zoran_g_fmt_vid_cap,
 	.vidioc_g_fmt_vid_out               = zoran_g_fmt_vid_out,
 	.vidioc_g_fmt_vid_overlay           = zoran_g_fmt_vid_overlay,
-	.vidioc_s_fmt_vid_cap  		    = zoran_s_fmt_vid_cap,
+	.vidioc_s_fmt_vid_cap		    = zoran_s_fmt_vid_cap,
 	.vidioc_s_fmt_vid_out               = zoran_s_fmt_vid_out,
 	.vidioc_s_fmt_vid_overlay           = zoran_s_fmt_vid_overlay,
-	.vidioc_try_fmt_vid_cap  	    = zoran_try_fmt_vid_cap,
-	.vidioc_try_fmt_vid_out 	    = zoran_try_fmt_vid_out,
-	.vidioc_try_fmt_vid_overlay 	    = zoran_try_fmt_vid_overlay,
+	.vidioc_try_fmt_vid_cap		    = zoran_try_fmt_vid_cap,
+	.vidioc_try_fmt_vid_out		    = zoran_try_fmt_vid_out,
+	.vidioc_try_fmt_vid_overlay	    = zoran_try_fmt_vid_overlay,
 	.vidioc_subscribe_event             = v4l2_ctrl_subscribe_event,
 	.vidioc_unsubscribe_event           = v4l2_event_unsubscribe,
 };
diff --git a/drivers/media/pci/zoran/zr36057.h b/drivers/media/pci/zoran/zr36057.h
index c9ffef15532d..c8acb21dcb5c 100644
--- a/drivers/media/pci/zoran/zr36057.h
+++ b/drivers/media/pci/zoran/zr36057.h
@@ -103,8 +103,8 @@
 #define ZR36057_ICR_IntPinEn            (1<<24)
 
 #define ZR36057_I2CBR           0x044	/* I2C Bus Register */
-#define ZR36057_I2CBR_SDA       	(1<<1)
-#define ZR36057_I2CBR_SCL       	(1<<0)
+#define ZR36057_I2CBR_SDA		(1<<1)
+#define ZR36057_I2CBR_SCL		(1<<0)
 
 #define ZR36057_JMC             0x100	/* JPEG Mode and Control */
 #define ZR36057_JMC_JPG                 (1 << 31)
diff --git a/drivers/media/platform/Makefile b/drivers/media/platform/Makefile
index 347fba8177b5..7f3080437be6 100644
--- a/drivers/media/platform/Makefile
+++ b/drivers/media/platform/Makefile
@@ -23,7 +23,7 @@ obj-$(CONFIG_VIDEO_TI_VPE)		+= ti-vpe/
 obj-$(CONFIG_VIDEO_TI_CAL)		+= ti-vpe/
 
 obj-$(CONFIG_VIDEO_MX2_EMMAPRP)		+= mx2_emmaprp.o
-obj-$(CONFIG_VIDEO_CODA) 		+= coda/
+obj-$(CONFIG_VIDEO_CODA)		+= coda/
 
 obj-$(CONFIG_VIDEO_SH_VEU)		+= sh_veu.o
 
@@ -33,8 +33,8 @@ obj-$(CONFIG_VIDEO_MEM2MEM_DEINTERLACE)	+= m2m-deinterlace.o
 
 obj-$(CONFIG_VIDEO_MUX)			+= video-mux.o
 
-obj-$(CONFIG_VIDEO_S3C_CAMIF) 		+= s3c-camif/
-obj-$(CONFIG_VIDEO_SAMSUNG_EXYNOS4_IS) 	+= exynos4-is/
+obj-$(CONFIG_VIDEO_S3C_CAMIF)		+= s3c-camif/
+obj-$(CONFIG_VIDEO_SAMSUNG_EXYNOS4_IS)	+= exynos4-is/
 obj-$(CONFIG_VIDEO_SAMSUNG_S5P_JPEG)	+= s5p-jpeg/
 obj-$(CONFIG_VIDEO_SAMSUNG_S5P_MFC)	+= s5p-mfc/
 
@@ -45,13 +45,13 @@ obj-$(CONFIG_VIDEO_SAMSUNG_EXYNOS_GSC)	+= exynos-gsc/
 obj-$(CONFIG_VIDEO_STI_BDISP)		+= sti/bdisp/
 obj-$(CONFIG_VIDEO_STI_HVA)		+= sti/hva/
 obj-$(CONFIG_DVB_C8SECTPFE)		+= sti/c8sectpfe/
-obj-$(CONFIG_VIDEO_STI_HDMI_CEC) 	+= sti/cec/
+obj-$(CONFIG_VIDEO_STI_HDMI_CEC)	+= sti/cec/
 
 obj-$(CONFIG_VIDEO_STI_DELTA)		+= sti/delta/
 
 obj-$(CONFIG_VIDEO_TEGRA_HDMI_CEC)	+= tegra-cec/
 
-obj-y 					+= stm32/
+obj-y					+= stm32/
 
 obj-y                                   += blackfin/
 
@@ -62,9 +62,9 @@ obj-$(CONFIG_VIDEO_SH_VOU)		+= sh_vou.o
 obj-$(CONFIG_SOC_CAMERA)		+= soc_camera/
 
 obj-$(CONFIG_VIDEO_RCAR_DRIF)		+= rcar_drif.o
-obj-$(CONFIG_VIDEO_RENESAS_FCP) 	+= rcar-fcp.o
+obj-$(CONFIG_VIDEO_RENESAS_FCP)		+= rcar-fcp.o
 obj-$(CONFIG_VIDEO_RENESAS_FDP1)	+= rcar_fdp1.o
-obj-$(CONFIG_VIDEO_RENESAS_JPU) 	+= rcar_jpu.o
+obj-$(CONFIG_VIDEO_RENESAS_JPU)		+= rcar_jpu.o
 obj-$(CONFIG_VIDEO_RENESAS_VSP1)	+= vsp1/
 
 obj-$(CONFIG_VIDEO_ROCKCHIP_RGA)	+= rockchip/rga/
diff --git a/drivers/media/platform/arv.c b/drivers/media/platform/arv.c
index 1351374bb1ef..1e865fea803c 100644
--- a/drivers/media/platform/arv.c
+++ b/drivers/media/platform/arv.c
@@ -56,7 +56,7 @@
 
 #define VERSION	"0.0.5"
 
-#define ar_inl(addr) 		inl((unsigned long)(addr))
+#define ar_inl(addr)		inl((unsigned long)(addr))
 #define ar_outl(val, addr)	outl((unsigned long)(val), (unsigned long)(addr))
 
 extern struct cpuinfo_m32r	boot_cpu_data;
@@ -210,8 +210,8 @@ static void init_iic(void)
 	 * ICU Setting (iic)
 	 */
 	/* I2C Setting */
-	ar_outl(0x0, PLDI2CCR);      	/* I2CCR Disable                   */
-	ar_outl(0x0300, PLDI2CMOD); 	/* I2CMOD ACK/8b-data/7b-addr/auto */
+	ar_outl(0x0, PLDI2CCR);		/* I2CCR Disable                   */
+	ar_outl(0x0300, PLDI2CMOD);	/* I2CMOD ACK/8b-data/7b-addr/auto */
 	ar_outl(0x1, PLDI2CACK);	/* I2CACK ACK                      */
 
 	/* I2C CLK */
@@ -222,7 +222,7 @@ static void init_iic(void)
 		ar_outl(244, PLDI2CFREQ);	/* BCLK = 50MHz */
 	else
 		ar_outl(244, PLDI2CFREQ);	/* default: BCLK = 50MHz */
-	ar_outl(0x1, PLDI2CCR); 	/* I2CCR Enable */
+	ar_outl(0x1, PLDI2CCR);		/* I2CCR Enable */
 }
 
 /**************************************************************************
@@ -300,9 +300,9 @@ static ssize_t ar_read(struct file *file, char *buf, size_t count, loff_t *ppos)
 	ar_outl(ARDATA32, M32R_DMA0CSA_PORTL);
 	ar_outl(ARDATA32, M32R_DMA0RSA_PORTL);
 	ar_outl(ar->line_buff, M32R_DMA0CDA_PORTL);	/* destination addr. */
-	ar_outl(ar->line_buff, M32R_DMA0RDA_PORTL); 	/* reload address */
-	ar_outl(ar->line_bytes, M32R_DMA0CBCUT_PORTL); 	/* byte count (bytes) */
-	ar_outl(ar->line_bytes, M32R_DMA0RBCUT_PORTL); 	/* reload count (bytes) */
+	ar_outl(ar->line_buff, M32R_DMA0RDA_PORTL);	/* reload address */
+	ar_outl(ar->line_bytes, M32R_DMA0CBCUT_PORTL);	/* byte count (bytes) */
+	ar_outl(ar->line_bytes, M32R_DMA0RBCUT_PORTL);	/* reload count (bytes) */
 
 	/*
 	 * Okay, kick AR LSI to invoke an interrupt
@@ -364,7 +364,7 @@ static ssize_t ar_read(struct file *file, char *buf, size_t count, loff_t *ppos)
 
 	/*
 	 * convert YUV422 to YUV422P
-	 * 	+--------------------+
+	 *	+--------------------+
 	 *	|  Y0,Y1,...	     |
 	 *	|  ..............Yn  |
 	 *	+--------------------+
@@ -533,9 +533,9 @@ static void ar_interrupt(int irq, void *dev)
 	line_count = ar_inl(ARVHCOUNT);			/* line number */
 	if (ar->mode == AR_MODE_INTERLACE && ar->size == AR_SIZE_VGA) {
 		/* operations for interlace mode */
-		if (line_count < (AR_HEIGHT_VGA / 2)) 	/* even line */
+		if (line_count < (AR_HEIGHT_VGA / 2))	/* even line */
 			line_number = (line_count << 1);
-		else 					/* odd line */
+		else					/* odd line */
 			line_number =
 			(((line_count - (AR_HEIGHT_VGA / 2)) << 1) + 1);
 	} else {
@@ -568,7 +568,7 @@ static void ar_interrupt(int irq, void *dev)
 		 * if captured all line of a frame, disable AR interrupt
 		 * and wake a process up.
 		 */
-		if (line_number == (ar->height - 1)) { 	/* end  of line */
+		if (line_number == (ar->height - 1)) {	/* end  of line */
 
 			ar->start_capture = 0;
 
@@ -718,14 +718,14 @@ static const struct v4l2_file_operations ar_fops = {
 };
 
 static const struct v4l2_ioctl_ops ar_ioctl_ops = {
-	.vidioc_querycap    		    = ar_querycap,
-	.vidioc_g_input      		    = ar_g_input,
-	.vidioc_s_input      		    = ar_s_input,
-	.vidioc_enum_input   		    = ar_enum_input,
-	.vidioc_enum_fmt_vid_cap 	    = ar_enum_fmt_vid_cap,
-	.vidioc_g_fmt_vid_cap 		    = ar_g_fmt_vid_cap,
-	.vidioc_s_fmt_vid_cap  		    = ar_s_fmt_vid_cap,
-	.vidioc_try_fmt_vid_cap  	    = ar_try_fmt_vid_cap,
+	.vidioc_querycap		    = ar_querycap,
+	.vidioc_g_input			    = ar_g_input,
+	.vidioc_s_input			    = ar_s_input,
+	.vidioc_enum_input		    = ar_enum_input,
+	.vidioc_enum_fmt_vid_cap	    = ar_enum_fmt_vid_cap,
+	.vidioc_g_fmt_vid_cap		    = ar_g_fmt_vid_cap,
+	.vidioc_s_fmt_vid_cap		    = ar_s_fmt_vid_cap,
+	.vidioc_try_fmt_vid_cap		    = ar_try_fmt_vid_cap,
 };
 
 #define ALIGN4(x)	((((int)(x)) & 0x3) == 0)
@@ -776,9 +776,9 @@ static int __init ar_init(void)
 	video_set_drvdata(&ar->vdev, ar);
 
 	if (vga) {
-		ar->width 	= AR_WIDTH_VGA;
-		ar->height 	= AR_HEIGHT_VGA;
-		ar->size 	= AR_SIZE_VGA;
+		ar->width	= AR_WIDTH_VGA;
+		ar->height	= AR_HEIGHT_VGA;
+		ar->size	= AR_SIZE_VGA;
 		ar->frame_bytes = AR_FRAME_BYTES_VGA;
 		ar->line_bytes	= AR_LINE_BYTES_VGA;
 		if (vga_interlace)
@@ -786,9 +786,9 @@ static int __init ar_init(void)
 		else
 			ar->mode = AR_MODE_NORMAL;
 	} else {
-		ar->width 	= AR_WIDTH_QVGA;
-		ar->height 	= AR_HEIGHT_QVGA;
-		ar->size 	= AR_SIZE_QVGA;
+		ar->width	= AR_WIDTH_QVGA;
+		ar->height	= AR_HEIGHT_QVGA;
+		ar->size	= AR_SIZE_QVGA;
 		ar->frame_bytes = AR_FRAME_BYTES_QVGA;
 		ar->line_bytes	= AR_LINE_BYTES_QVGA;
 		ar->mode	= AR_MODE_INTERLACE;
diff --git a/drivers/media/platform/coda/coda_regs.h b/drivers/media/platform/coda/coda_regs.h
index 35e620c7f1f4..3b650b8aabe9 100644
--- a/drivers/media/platform/coda/coda_regs.h
+++ b/drivers/media/platform/coda/coda_regs.h
@@ -125,7 +125,7 @@
 #define		CODA9_MODE_ENCODE_H264		8
 #define		CODA9_MODE_ENCODE_MP4		11
 #define		CODA9_MODE_ENCODE_MJPG		13
-#define 	CODA_MODE_INVALID		0xffff
+#define		CODA_MODE_INVALID		0xffff
 #define CODA_REG_BIT_INT_ENABLE		0x170
 #define		CODA_INT_INTERRUPT_ENABLE	(1 << 3)
 #define CODA_REG_BIT_INT_REASON			0x174
diff --git a/drivers/media/platform/davinci/dm355_ccdc_regs.h b/drivers/media/platform/davinci/dm355_ccdc_regs.h
index a753ce262583..20ba390763b5 100644
--- a/drivers/media/platform/davinci/dm355_ccdc_regs.h
+++ b/drivers/media/platform/davinci/dm355_ccdc_regs.h
@@ -107,7 +107,7 @@
 #define CCDC_RAW_IP_MODE			0
 #define CCDC_VDHDOUT_INPUT			0
 #define CCDC_YCINSWP_RAW			(0 << 4)
-#define CCDC_EXWEN_DISABLE 			0
+#define CCDC_EXWEN_DISABLE			0
 #define CCDC_DATAPOL_NORMAL			0
 #define CCDC_CCDCFG_FIDMD_LATCH_VSYNC		0
 #define CCDC_CCDCFG_FIDMD_NO_LATCH_VSYNC	(1 << 6)
@@ -152,7 +152,7 @@
 #define CCDC_ALAW_GAMMA_WD_MASK			7
 #define CCDC_REC656IF_BT656_EN			3
 
-#define CCDC_FMTCFG_FMTMODE_MASK 		3
+#define CCDC_FMTCFG_FMTMODE_MASK		3
 #define CCDC_FMTCFG_FMTMODE_SHIFT		1
 #define CCDC_FMTCFG_LNUM_MASK			3
 #define CCDC_FMTCFG_LNUM_SHIFT			4
@@ -196,7 +196,7 @@
 #define CCDC_LATCH_ON_VSYNC_DISABLE		(1 << 15)
 #define CCDC_LATCH_ON_VSYNC_ENABLE		(0 << 15)
 #define CCDC_FPC_ENABLE				(1 << 15)
-#define CCDC_FPC_FPC_NUM_MASK 			0x7FFF
+#define CCDC_FPC_FPC_NUM_MASK			0x7FFF
 #define CCDC_DATA_PACK_ENABLE			(1 << 11)
 #define CCDC_FMT_HORZ_FMTLNH_MASK		0x1FFF
 #define CCDC_FMT_HORZ_FMTSPH_MASK		0x1FFF
diff --git a/drivers/media/platform/davinci/dm644x_ccdc_regs.h b/drivers/media/platform/davinci/dm644x_ccdc_regs.h
index bece0bd9c9de..ffd89c7ea2b6 100644
--- a/drivers/media/platform/davinci/dm644x_ccdc_regs.h
+++ b/drivers/media/platform/davinci/dm644x_ccdc_regs.h
@@ -97,7 +97,7 @@
 #define CCDC_LATCH_ON_VSYNC_DISABLE		(1 << 15)
 #define CCDC_FPC_ENABLE				(1 << 15)
 #define CCDC_FPC_DISABLE			0
-#define CCDC_FPC_FPC_NUM_MASK 			0x7FFF
+#define CCDC_FPC_FPC_NUM_MASK			0x7FFF
 #define CCDC_DATA_PACK_ENABLE			(1 << 11)
 #define CCDC_FMTCFG_VPIN_MASK			7
 #define CCDC_FMTCFG_VPIN_SHIFT			12
@@ -143,7 +143,7 @@
 #define CCDC_REC656IF_BT656_EN			3
 #define CCDC_SYN_MODE_VD_POL_NEGATIVE		(1 << 2)
 #define CCDC_CCDCFG_Y8POS_SHIFT			11
-#define CCDC_CCDCFG_BW656_10BIT 		(1 << 5)
+#define CCDC_CCDCFG_BW656_10BIT			(1 << 5)
 #define CCDC_SDOFST_FIELD_INTERLEAVED		0x249
 #define CCDC_NO_CULLING				0xffff00ff
 #endif
diff --git a/drivers/media/platform/davinci/isif_regs.h b/drivers/media/platform/davinci/isif_regs.h
index a3564abe08ae..97d3ba1614d6 100644
--- a/drivers/media/platform/davinci/isif_regs.h
+++ b/drivers/media/platform/davinci/isif_regs.h
@@ -35,7 +35,7 @@
 #define LINCFG0					0x44
 #define LINCFG1					0x48
 #define CCOLP					0x4c
-#define CRGAIN 					0x50
+#define CRGAIN					0x50
 #define CGRGAIN					0x54
 #define CGBGAIN					0x58
 #define CBGAIN					0x5c
@@ -46,7 +46,7 @@
 #define VDINT0					0x70
 #define VDINT1					0x74
 #define VDINT2					0x78
-#define MISC 					0x7c
+#define MISC					0x7c
 #define CGAMMAWD				0x80
 #define REC656IF				0x84
 #define CCDCFG					0x88
@@ -191,7 +191,7 @@
 #define ISIF_VD_POL_SHIFT			2
 #define ISIF_DATAPOL_NORMAL			0
 #define ISIF_DATAPOL_SHIFT			6
-#define ISIF_EXWEN_DISABLE 			0
+#define ISIF_EXWEN_DISABLE			0
 #define ISIF_EXWEN_SHIFT			5
 #define ISIF_FRM_FMT_SHIFT			7
 #define ISIF_DATASFT_SHIFT			8
diff --git a/drivers/media/platform/davinci/vpfe_capture.c b/drivers/media/platform/davinci/vpfe_capture.c
index 498f69b53de3..7d08f0f283a5 100644
--- a/drivers/media/platform/davinci/vpfe_capture.c
+++ b/drivers/media/platform/davinci/vpfe_capture.c
@@ -1794,7 +1794,7 @@ static int vpfe_probe(struct platform_device *pdev)
 	vfd->fops		= &vpfe_fops;
 	vfd->ioctl_ops		= &vpfe_ioctl_ops;
 	vfd->tvnorms		= 0;
-	vfd->v4l2_dev 		= &vpfe_dev->v4l2_dev;
+	vfd->v4l2_dev		= &vpfe_dev->v4l2_dev;
 	snprintf(vfd->name, sizeof(vfd->name),
 		 "%s_V%d.%d.%d",
 		 CAPTURE_DRV_NAME,
diff --git a/drivers/media/platform/davinci/vpif.h b/drivers/media/platform/davinci/vpif.h
index 9956e6788693..2466c7c77deb 100644
--- a/drivers/media/platform/davinci/vpif.h
+++ b/drivers/media/platform/davinci/vpif.h
@@ -226,11 +226,11 @@ static inline void vpif_clr_bit(u32 reg, u32 bit)
 	(VPIF_INT_BOTH << VPIF_CH1_INT_CTRL_SHIFT)), VPIF_CH1_CTRL))
 
 /* enabled interrupt on both the fields on vpid_ch0_ctrl register */
-#define channel2_intr_assert() 	(regw((regr(VPIF_CH2_CTRL)|\
+#define channel2_intr_assert()	(regw((regr(VPIF_CH2_CTRL)|\
 	(VPIF_INT_BOTH << VPIF_CH2_INT_CTRL_SHIFT)), VPIF_CH2_CTRL))
 
 /* enabled interrupt on both the fields on vpid_ch1_ctrl register */
-#define channel3_intr_assert() 	(regw((regr(VPIF_CH3_CTRL)|\
+#define channel3_intr_assert()	(regw((regr(VPIF_CH3_CTRL)|\
 	(VPIF_INT_BOTH << VPIF_CH3_INT_CTRL_SHIFT)), VPIF_CH3_CTRL))
 
 #define VPIF_CH_FID_MASK	(0x20)
diff --git a/drivers/media/platform/davinci/vpss.c b/drivers/media/platform/davinci/vpss.c
index f2d27b932999..b73886519f4f 100644
--- a/drivers/media/platform/davinci/vpss.c
+++ b/drivers/media/platform/davinci/vpss.c
@@ -59,9 +59,9 @@ MODULE_AUTHOR("Texas Instruments");
 #define DM365_ISP5_INTSEL1		0x10
 #define DM365_ISP5_INTSEL2		0x14
 #define DM365_ISP5_INTSEL3		0x18
-#define DM365_ISP5_CCDCMUX 		0x20
-#define DM365_ISP5_PG_FRAME_SIZE 	0x28
-#define DM365_VPBE_CLK_CTRL 		0x00
+#define DM365_ISP5_CCDCMUX		0x20
+#define DM365_ISP5_PG_FRAME_SIZE	0x28
+#define DM365_VPBE_CLK_CTRL		0x00
 
 #define VPSS_CLK_CTRL			0x01c40044
 #define VPSS_CLK_CTRL_VENCCLKEN		BIT(3)
@@ -78,8 +78,8 @@ MODULE_AUTHOR("Texas Instruments");
 #define DM365_ISP5_INTSEL3_DEFAULT	0x00000015
 
 /* masks and shifts for DM365*/
-#define DM365_CCDC_PG_VD_POL_SHIFT 	0
-#define DM365_CCDC_PG_HD_POL_SHIFT 	1
+#define DM365_CCDC_PG_VD_POL_SHIFT	0
+#define DM365_CCDC_PG_HD_POL_SHIFT	1
 
 #define CCD_SRC_SEL_MASK		(BIT_MASK(5) | BIT_MASK(4))
 #define CCD_SRC_SEL_SHIFT		4
diff --git a/drivers/media/platform/exynos4-is/fimc-core.c b/drivers/media/platform/exynos4-is/fimc-core.c
index 7ae239f2b0fd..d8d8c9902b19 100644
--- a/drivers/media/platform/exynos4-is/fimc-core.c
+++ b/drivers/media/platform/exynos4-is/fimc-core.c
@@ -1246,7 +1246,7 @@ static struct platform_driver fimc_driver = {
 	.driver = {
 		.of_match_table = fimc_of_match,
 		.name		= FIMC_DRIVER_NAME,
-		.pm     	= &fimc_pm_ops,
+		.pm		= &fimc_pm_ops,
 	}
 };
 
diff --git a/drivers/media/platform/m2m-deinterlace.c b/drivers/media/platform/m2m-deinterlace.c
index c8a12493f395..5f5c34ed4359 100644
--- a/drivers/media/platform/m2m-deinterlace.c
+++ b/drivers/media/platform/m2m-deinterlace.c
@@ -384,16 +384,16 @@ static void deinterlace_device_run(void *priv)
 	 * 4 possible field conversions are possible at the moment:
 	 *  V4L2_FIELD_SEQ_TB --> V4L2_FIELD_INTERLACED_TB:
 	 *	two separate fields in the same input buffer are interlaced
-	 * 	in the output buffer using weaving. Top field comes first.
+	 *	in the output buffer using weaving. Top field comes first.
 	 *  V4L2_FIELD_SEQ_TB --> V4L2_FIELD_NONE:
-	 * 	top field from the input buffer is copied to the output buffer
-	 * 	using line doubling. Bottom field from the input buffer is discarded.
+	 *	top field from the input buffer is copied to the output buffer
+	 *	using line doubling. Bottom field from the input buffer is discarded.
 	 * V4L2_FIELD_SEQ_BT --> V4L2_FIELD_INTERLACED_BT:
 	 *	two separate fields in the same input buffer are interlaced
-	 * 	in the output buffer using weaving. Bottom field comes first.
+	 *	in the output buffer using weaving. Bottom field comes first.
 	 * V4L2_FIELD_SEQ_BT --> V4L2_FIELD_NONE:
-	 * 	bottom field from the input buffer is copied to the output buffer
-	 * 	using line doubling. Top field from the input buffer is discarded.
+	 *	bottom field from the input buffer is copied to the output buffer
+	 *	using line doubling. Top field from the input buffer is discarded.
 	 */
 	switch (dst_q_data->fmt->fourcc) {
 	case V4L2_PIX_FMT_YUV420:
diff --git a/drivers/media/platform/omap/omap_vout.c b/drivers/media/platform/omap/omap_vout.c
index 6f1b0c799e58..7278a1ef2931 100644
--- a/drivers/media/platform/omap/omap_vout.c
+++ b/drivers/media/platform/omap/omap_vout.c
@@ -1774,8 +1774,8 @@ static int vidioc_g_fbuf(struct file *file, void *fh,
 }
 
 static const struct v4l2_ioctl_ops vout_ioctl_ops = {
-	.vidioc_querycap      			= vidioc_querycap,
-	.vidioc_enum_fmt_vid_out 		= vidioc_enum_fmt_vid_out,
+	.vidioc_querycap			= vidioc_querycap,
+	.vidioc_enum_fmt_vid_out		= vidioc_enum_fmt_vid_out,
 	.vidioc_g_fmt_vid_out			= vidioc_g_fmt_vid_out,
 	.vidioc_try_fmt_vid_out			= vidioc_try_fmt_vid_out,
 	.vidioc_s_fmt_vid_out			= vidioc_s_fmt_vid_out,
@@ -1795,12 +1795,12 @@ static const struct v4l2_ioctl_ops vout_ioctl_ops = {
 };
 
 static const struct v4l2_file_operations omap_vout_fops = {
-	.owner 		= THIS_MODULE,
+	.owner		= THIS_MODULE,
 	.poll		= omap_vout_poll,
 	.unlocked_ioctl	= video_ioctl2,
-	.mmap 		= omap_vout_mmap,
-	.open 		= omap_vout_open,
-	.release 	= omap_vout_release,
+	.mmap		= omap_vout_mmap,
+	.open		= omap_vout_open,
+	.release	= omap_vout_release,
 };
 
 /* Init functions used during driver initialization */
diff --git a/drivers/media/platform/sh_vou.c b/drivers/media/platform/sh_vou.c
index 871da2a2a91c..4dccf29e9d78 100644
--- a/drivers/media/platform/sh_vou.c
+++ b/drivers/media/platform/sh_vou.c
@@ -1181,7 +1181,7 @@ static int sh_vou_release(struct file *file)
 
 /* sh_vou display ioctl operations */
 static const struct v4l2_ioctl_ops sh_vou_ioctl_ops = {
-	.vidioc_querycap        	= sh_vou_querycap,
+	.vidioc_querycap		= sh_vou_querycap,
 	.vidioc_enum_fmt_vid_out	= sh_vou_enum_fmt_vid_out,
 	.vidioc_g_fmt_vid_out		= sh_vou_g_fmt_vid_out,
 	.vidioc_s_fmt_vid_out		= sh_vou_s_fmt_vid_out,
diff --git a/drivers/media/radio/radio-aimslab.c b/drivers/media/radio/radio-aimslab.c
index ea9308796741..5ef635e72e10 100644
--- a/drivers/media/radio/radio-aimslab.c
+++ b/drivers/media/radio/radio-aimslab.c
@@ -26,7 +26,7 @@
  * Fully tested with the Keene USB FM Transmitter and the v4l2-compliance tool.
  */
 
-#include <linux/module.h>	/* Modules 			*/
+#include <linux/module.h>	/* Modules			*/
 #include <linux/init.h>		/* Initdata			*/
 #include <linux/ioport.h>	/* request_region		*/
 #include <linux/delay.h>	/* msleep			*/
diff --git a/drivers/media/radio/radio-aztech.c b/drivers/media/radio/radio-aztech.c
index f445327f282d..9e12c6027359 100644
--- a/drivers/media/radio/radio-aztech.c
+++ b/drivers/media/radio/radio-aztech.c
@@ -15,7 +15,7 @@
  * Fully tested with the Keene USB FM Transmitter and the v4l2-compliance tool.
 */
 
-#include <linux/module.h>	/* Modules 			*/
+#include <linux/module.h>	/* Modules			*/
 #include <linux/init.h>		/* Initdata			*/
 #include <linux/ioport.h>	/* request_region		*/
 #include <linux/delay.h>	/* udelay			*/
diff --git a/drivers/media/radio/radio-cadet.c b/drivers/media/radio/radio-cadet.c
index 7575e5370a49..ec5c88801402 100644
--- a/drivers/media/radio/radio-cadet.c
+++ b/drivers/media/radio/radio-cadet.c
@@ -30,7 +30,7 @@
  *		Changed API to V4L2
  */
 
-#include <linux/module.h>	/* Modules 			*/
+#include <linux/module.h>	/* Modules			*/
 #include <linux/init.h>		/* Initdata			*/
 #include <linux/ioport.h>	/* request_region		*/
 #include <linux/delay.h>	/* udelay			*/
@@ -503,7 +503,7 @@ static unsigned int cadet_poll(struct file *file, struct poll_table_struct *wait
 static const struct v4l2_file_operations cadet_fops = {
 	.owner		= THIS_MODULE,
 	.open		= cadet_open,
-	.release       	= cadet_release,
+	.release	= cadet_release,
 	.read		= cadet_read,
 	.unlocked_ioctl	= video_ioctl2,
 	.poll		= cadet_poll,
diff --git a/drivers/media/radio/radio-gemtek.c b/drivers/media/radio/radio-gemtek.c
index ddc12b16f77c..3ff4c4e1435f 100644
--- a/drivers/media/radio/radio-gemtek.c
+++ b/drivers/media/radio/radio-gemtek.c
@@ -22,7 +22,7 @@
  * Fully tested with the Keene USB FM Transmitter and the v4l2-compliance tool.
  */
 
-#include <linux/module.h>	/* Modules 			*/
+#include <linux/module.h>	/* Modules			*/
 #include <linux/init.h>		/* Initdata			*/
 #include <linux/ioport.h>	/* request_region		*/
 #include <linux/delay.h>	/* udelay			*/
@@ -102,9 +102,9 @@ struct gemtek {
 	u32 bu2614data;
 };
 
-#define BU2614_FREQ_BITS 	16 /* D0..D15, Frequency data		*/
+#define BU2614_FREQ_BITS	16 /* D0..D15, Frequency data		*/
 #define BU2614_PORT_BITS	3 /* P0..P2, Output port control data	*/
-#define BU2614_VOID_BITS	4 /* unused 				*/
+#define BU2614_VOID_BITS	4 /* unused				*/
 #define BU2614_FMES_BITS	1 /* CT, Frequency measurement beginning data */
 #define BU2614_STDF_BITS	3 /* R0..R2, Standard frequency data	*/
 #define BU2614_SWIN_BITS	1 /* S, Switch between FMIN / AMIN	*/
@@ -113,7 +113,7 @@ struct gemtek {
 #define BU2614_FMUN_BITS	1 /* GT, Frequency measurement time & unlock */
 #define BU2614_TEST_BITS	1 /* TS, Test data is input		*/
 
-#define BU2614_FREQ_SHIFT 	0
+#define BU2614_FREQ_SHIFT	0
 #define BU2614_PORT_SHIFT	(BU2614_FREQ_BITS + BU2614_FREQ_SHIFT)
 #define BU2614_VOID_SHIFT	(BU2614_PORT_BITS + BU2614_PORT_SHIFT)
 #define BU2614_FMES_SHIFT	(BU2614_VOID_BITS + BU2614_VOID_SHIFT)
diff --git a/drivers/media/radio/radio-rtrack2.c b/drivers/media/radio/radio-rtrack2.c
index 09cfbc373c92..abeaedd8d437 100644
--- a/drivers/media/radio/radio-rtrack2.c
+++ b/drivers/media/radio/radio-rtrack2.c
@@ -12,7 +12,7 @@
  * Fully tested with actual hardware and the v4l2-compliance tool.
  */
 
-#include <linux/module.h>	/* Modules 			*/
+#include <linux/module.h>	/* Modules			*/
 #include <linux/init.h>		/* Initdata			*/
 #include <linux/ioport.h>	/* request_region		*/
 #include <linux/delay.h>	/* udelay			*/
diff --git a/drivers/media/radio/radio-sf16fmi.c b/drivers/media/radio/radio-sf16fmi.c
index 28a89466cddc..fc4e63d36e4c 100644
--- a/drivers/media/radio/radio-sf16fmi.c
+++ b/drivers/media/radio/radio-sf16fmi.c
@@ -17,7 +17,7 @@
  */
 
 #include <linux/kernel.h>	/* __setup			*/
-#include <linux/module.h>	/* Modules 			*/
+#include <linux/module.h>	/* Modules			*/
 #include <linux/init.h>		/* Initdata			*/
 #include <linux/ioport.h>	/* request_region		*/
 #include <linux/delay.h>	/* udelay			*/
@@ -110,7 +110,7 @@ static inline int fmi_getsigstr(struct fmi *fmi)
 	val = fmi->mute ? 0x00 : 0x08;	/* mute/unmute */
 	outb(val, fmi->io);
 	outb(val | 0x10, fmi->io);
-	msleep(143); 		/* was schedule_timeout(HZ/7) */
+	msleep(143);		/* was schedule_timeout(HZ/7) */
 	res = (int)inb(fmi->io + 1);
 	outb(val, fmi->io);
 
diff --git a/drivers/media/radio/radio-sf16fmr2.c b/drivers/media/radio/radio-sf16fmr2.c
index de79d5569c2a..7b07d42a9909 100644
--- a/drivers/media/radio/radio-sf16fmr2.c
+++ b/drivers/media/radio/radio-sf16fmr2.c
@@ -7,7 +7,7 @@
  */
 
 #include <linux/delay.h>
-#include <linux/module.h>	/* Modules 			*/
+#include <linux/module.h>	/* Modules			*/
 #include <linux/init.h>		/* Initdata			*/
 #include <linux/slab.h>
 #include <linux/ioport.h>	/* request_region		*/
diff --git a/drivers/media/radio/radio-tea5764.c b/drivers/media/radio/radio-tea5764.c
index bc7e69e7e32e..afb763256fd6 100644
--- a/drivers/media/radio/radio-tea5764.c
+++ b/drivers/media/radio/radio-tea5764.c
@@ -417,7 +417,7 @@ static const struct v4l2_ioctl_ops tea5764_ioctl_ops = {
 static const struct video_device tea5764_radio_template = {
 	.name		= "TEA5764 FM-Radio",
 	.fops           = &tea5764_fops,
-	.ioctl_ops 	= &tea5764_ioctl_ops,
+	.ioctl_ops	= &tea5764_ioctl_ops,
 	.release	= video_device_release_empty,
 };
 
diff --git a/drivers/media/radio/radio-terratec.c b/drivers/media/radio/radio-terratec.c
index be10a802e3a9..4f116ea294fb 100644
--- a/drivers/media/radio/radio-terratec.c
+++ b/drivers/media/radio/radio-terratec.c
@@ -20,7 +20,7 @@
  * Converted to V4L2 API by Mauro Carvalho Chehab <mchehab@infradead.org>
  */
 
-#include <linux/module.h>	/* Modules 			*/
+#include <linux/module.h>	/* Modules			*/
 #include <linux/init.h>		/* Initdata			*/
 #include <linux/ioport.h>	/* request_region		*/
 #include <linux/videodev2.h>	/* kernel radio structs		*/
@@ -45,12 +45,12 @@ static int radio_nr = -1;
 module_param(radio_nr, int, 0444);
 MODULE_PARM_DESC(radio_nr, "Radio device number");
 
-#define WRT_DIS 	0x00
+#define WRT_DIS		0x00
 #define CLK_OFF		0x00
 #define IIC_DATA	0x01
 #define IIC_CLK		0x02
 #define DATA		0x04
-#define CLK_ON 		0x08
+#define CLK_ON		0x08
 #define WRT_EN		0x10
 
 static struct radio_isa_card *terratec_alloc(void)
diff --git a/drivers/media/radio/tea575x.c b/drivers/media/radio/tea575x.c
index 4dc2067bce14..7412fe1b10c6 100644
--- a/drivers/media/radio/tea575x.c
+++ b/drivers/media/radio/tea575x.c
@@ -498,7 +498,7 @@ static const struct v4l2_ioctl_ops tea575x_ioctl_ops = {
 };
 
 static const struct video_device tea575x_radio = {
-	.ioctl_ops 	= &tea575x_ioctl_ops,
+	.ioctl_ops	= &tea575x_ioctl_ops,
 	.release        = video_device_release_empty,
 };
 
diff --git a/drivers/media/rc/keymaps/rc-behold-columbus.c b/drivers/media/rc/keymaps/rc-behold-columbus.c
index 61f679fec45c..e73057945bd1 100644
--- a/drivers/media/rc/keymaps/rc-behold-columbus.c
+++ b/drivers/media/rc/keymaps/rc-behold-columbus.c
@@ -30,12 +30,12 @@ static struct rc_map_table behold_columbus[] = {
 
 	/*  0x01    0x02    0x03  0x0D    *
 	 *   1       2       3   Stereo   *
-	 *                        	  *
+	 *				  *
 	 *  0x04    0x05    0x06  0x19    *
 	 *   4       5       6   Snapshot *
-	 *                        	  *
+	 *				  *
 	 *  0x07    0x08    0x09  0x10    *
-	 *   7       8       9    Zoom 	  *
+	 *   7       8       9    Zoom	  *
 	 *                                */
 	{ 0x01, KEY_1 },
 	{ 0x02, KEY_2 },
diff --git a/drivers/media/rc/keymaps/rc-winfast-usbii-deluxe.c b/drivers/media/rc/keymaps/rc-winfast-usbii-deluxe.c
index 30495673cddd..e443192dbe14 100644
--- a/drivers/media/rc/keymaps/rc-winfast-usbii-deluxe.c
+++ b/drivers/media/rc/keymaps/rc-winfast-usbii-deluxe.c
@@ -37,7 +37,7 @@ static struct rc_map_table winfast_usbii_deluxe[] = {
 	{ 0x60, KEY_CHANNELDOWN},	/* CHANNELDOWN */
 	{ 0x61, KEY_LAST},		/* LAST CHANNEL (RECALL) */
 
-	{ 0x72, KEY_VIDEO}, 		/* INPUT MODES (TV/FM) */
+	{ 0x72, KEY_VIDEO},		/* INPUT MODES (TV/FM) */
 
 	{ 0x70, KEY_POWER2},		/* TV ON/OFF */
 
diff --git a/drivers/media/tuners/mxl5005s.c b/drivers/media/tuners/mxl5005s.c
index 57c6d9061072..355ef2959b7d 100644
--- a/drivers/media/tuners/mxl5005s.c
+++ b/drivers/media/tuners/mxl5005s.c
@@ -1677,10 +1677,10 @@ static u16 MXL5005_TunerConfig(struct dvb_frontend *fe,
 	u8	AGC_Mode,	/* AGC Mode - Dual AGC: 0, Single AGC: 1 */
 	u16	TOP,		/* 0: Dual AGC; Value: take over point */
 	u16	IF_OUT_LOAD,	/* IF Out Load Resistor (200 / 300 Ohms) */
-	u8	CLOCK_OUT, 	/* 0: turn off clk out; 1: turn on clock out */
+	u8	CLOCK_OUT,	/* 0: turn off clk out; 1: turn on clock out */
 	u8	DIV_OUT,	/* 0: Div-1; 1: Div-4 */
-	u8	CAPSELECT, 	/* 0: disable On-Chip pulling cap; 1: enable */
-	u8	EN_RSSI, 	/* 0: disable RSSI; 1: enable RSSI */
+	u8	CAPSELECT,	/* 0: disable On-Chip pulling cap; 1: enable */
+	u8	EN_RSSI,	/* 0: disable RSSI; 1: enable RSSI */
 
 	/* Modulation Type; */
 	/* 0 - Default;	1 - DVB-T; 2 - ATSC; 3 - QAM; 4 - Analog Cable */
diff --git a/drivers/media/tuners/tda827x.h b/drivers/media/tuners/tda827x.h
index 264e80bd7e24..a08d3f9fcea1 100644
--- a/drivers/media/tuners/tda827x.h
+++ b/drivers/media/tuners/tda827x.h
@@ -36,7 +36,7 @@ struct tda827x_config
 
 	/* interface to tda829x driver */
 	enum tda8290_lna config;
-	int 	     switch_addr;
+	int	     switch_addr;
 
 	void (*agcf)(struct dvb_frontend *fe);
 };
diff --git a/drivers/media/tuners/tda9887.c b/drivers/media/tuners/tda9887.c
index c0e815f8b951..9777da03e308 100644
--- a/drivers/media/tuners/tda9887.c
+++ b/drivers/media/tuners/tda9887.c
@@ -31,7 +31,7 @@ struct tda9887_priv {
 	struct tuner_i2c_props i2c_props;
 	struct list_head hybrid_tuner_instance_list;
 
-	unsigned char 	   data[4];
+	unsigned char	   data[4];
 	unsigned int       config;
 	unsigned int       mode;
 	unsigned int       audmode;
@@ -94,7 +94,7 @@ struct tvnorm {
 #define cAudioGain6             0x80    // bit c7
 
 #define cTopMask                0x1f    // bit c0:4
-#define cTopDefault		0x10 	// bit c0:4
+#define cTopDefault		0x10	// bit c0:4
 
 //// third reg (e)
 #define cAudioIF_4_5             0x00    // bit e0:1
diff --git a/drivers/media/tuners/tuner-simple.c b/drivers/media/tuners/tuner-simple.c
index cf44d3657f55..36b88f820239 100644
--- a/drivers/media/tuners/tuner-simple.c
+++ b/drivers/media/tuners/tuner-simple.c
@@ -53,7 +53,7 @@ MODULE_PARM_DESC(dtv_input, "specify dtv rf input, 0 for autoselect");
 /* tv tuner system standard selection for Philips FQ1216ME
    this value takes the low bits of control byte 2
    from datasheet "1999 Nov 16" (supersedes "1999 Mar 23")
-     standard 		BG	DK	I	L	L`
+     standard		BG	DK	I	L	L`
      picture carrier	38.90	38.90	38.90	38.90	33.95
      colour		34.47	34.47	34.47	34.47	38.38
      sound 1		33.40	32.40	32.90	32.40	40.45
diff --git a/drivers/media/tuners/tuner-xc2028.c b/drivers/media/tuners/tuner-xc2028.c
index 8cda36a0b20b..fca85e08ebd7 100644
--- a/drivers/media/tuners/tuner-xc2028.c
+++ b/drivers/media/tuners/tuner-xc2028.c
@@ -87,7 +87,7 @@ struct firmware_properties {
 	v4l2_std_id	std_req;
 	__u16		int_freq;
 	unsigned int	scode_table;
-	int 		scode_nr;
+	int		scode_nr;
 };
 
 enum xc2028_state {
@@ -137,7 +137,7 @@ struct xc2028_data {
 				       ibuf, isize);			\
 	if (isize != _rc)						\
 		tuner_err("i2c input error: rc = %d (should be %d)\n",	\
-			   _rc, (int)isize); 				\
+			   _rc, (int)isize);				\
 	if (priv->ctrl.msleep)						\
 		msleep(priv->ctrl.msleep);				\
 	_rc;								\
@@ -172,7 +172,7 @@ static int xc2028_get_reg(struct xc2028_data *priv, u16 reg, u16 *val)
 	return 0;
 }
 
-#define dump_firm_type(t) 	dump_firm_type_and_int_freq(t, 0)
+#define dump_firm_type(t)	dump_firm_type_and_int_freq(t, 0)
 static void dump_firm_type_and_int_freq(unsigned int type, u16 int_freq)
 {
 	if (type & BASE)
diff --git a/drivers/media/tuners/tuner-xc2028.h b/drivers/media/tuners/tuner-xc2028.h
index cd96288aff54..03fd6d4233a4 100644
--- a/drivers/media/tuners/tuner-xc2028.h
+++ b/drivers/media/tuners/tuner-xc2028.h
@@ -48,7 +48,7 @@ struct xc2028_ctrl {
 
 struct xc2028_config {
 	struct i2c_adapter *i2c_adap;
-	u8 		   i2c_addr;
+	u8		   i2c_addr;
 	struct xc2028_ctrl *ctrl;
 };
 
diff --git a/drivers/media/usb/au0828/au0828-cards.h b/drivers/media/usb/au0828/au0828-cards.h
index 1f4412ee6da4..dbd8a90ee76f 100644
--- a/drivers/media/usb/au0828/au0828-cards.h
+++ b/drivers/media/usb/au0828/au0828-cards.h
@@ -17,7 +17,7 @@
 
 #define AU0828_BOARD_UNKNOWN		0
 #define AU0828_BOARD_HAUPPAUGE_HVR950Q	1
-#define AU0828_BOARD_HAUPPAUGE_HVR850 	2
+#define AU0828_BOARD_HAUPPAUGE_HVR850	2
 #define AU0828_BOARD_DVICO_FUSIONHDTV7	3
 #define AU0828_BOARD_HAUPPAUGE_HVR950Q_MXL	4
 #define AU0828_BOARD_HAUPPAUGE_WOODBURY	5
diff --git a/drivers/media/usb/au0828/au0828-video.c b/drivers/media/usb/au0828/au0828-video.c
index a240153821e0..c765d546114d 100644
--- a/drivers/media/usb/au0828/au0828-video.c
+++ b/drivers/media/usb/au0828/au0828-video.c
@@ -1797,7 +1797,7 @@ static const struct v4l2_ioctl_ops video_ioctl_ops = {
 static const struct video_device au0828_video_template = {
 	.fops                       = &au0828_v4l_fops,
 	.release                    = video_device_release_empty,
-	.ioctl_ops 		    = &video_ioctl_ops,
+	.ioctl_ops		    = &video_ioctl_ops,
 	.tvnorms                    = V4L2_STD_NTSC_M | V4L2_STD_PAL_M,
 };
 
diff --git a/drivers/media/usb/au0828/au0828.h b/drivers/media/usb/au0828/au0828.h
index 9e3c1237a274..004eadef55c7 100644
--- a/drivers/media/usb/au0828/au0828.h
+++ b/drivers/media/usb/au0828/au0828.h
@@ -190,7 +190,7 @@ struct au0828_dev {
 	struct i2c_adapter		i2c_adap;
 	struct i2c_algorithm		i2c_algo;
 	struct i2c_client		i2c_client;
-	u32 				i2c_rc;
+	u32				i2c_rc;
 
 	/* Digital */
 	struct au0828_dvb		dvb;
@@ -293,8 +293,8 @@ struct au0828_dev {
 /* ----------------------------------------------------------- */
 #define au0828_read(dev, reg) au0828_readreg(dev, reg)
 #define au0828_write(dev, reg, value) au0828_writereg(dev, reg, value)
-#define au0828_andor(dev, reg, mask, value) 				\
-	 au0828_writereg(dev, reg, 					\
+#define au0828_andor(dev, reg, mask, value)				\
+	 au0828_writereg(dev, reg,					\
 	(au0828_readreg(dev, reg) & ~(mask)) | ((value) & (mask)))
 
 #define au0828_set(dev, reg, bit) au0828_andor(dev, (reg), (bit), (bit))
diff --git a/drivers/media/usb/cpia2/cpia2_usb.c b/drivers/media/usb/cpia2/cpia2_usb.c
index 6089036049d9..f3a1e5b1e57c 100644
--- a/drivers/media/usb/cpia2/cpia2_usb.c
+++ b/drivers/media/usb/cpia2/cpia2_usb.c
@@ -33,13 +33,13 @@
 
 static int frame_sizes[] = {
 	0,	// USBIF_CMDONLY
-	0, 	// USBIF_BULK
-	128, 	// USBIF_ISO_1
-	384, 	// USBIF_ISO_2
-	640, 	// USBIF_ISO_3
-	768, 	// USBIF_ISO_4
-	896, 	// USBIF_ISO_5
-	1023, 	// USBIF_ISO_6
+	0,	// USBIF_BULK
+	128,	// USBIF_ISO_1
+	384,	// USBIF_ISO_2
+	640,	// USBIF_ISO_3
+	768,	// USBIF_ISO_4
+	896,	// USBIF_ISO_5
+	1023,	// USBIF_ISO_6
 };
 
 #define FRAMES_PER_DESC    10
diff --git a/drivers/media/usb/cx231xx/cx231xx-audio.c b/drivers/media/usb/cx231xx/cx231xx-audio.c
index 06f10d7fc4b0..d96236d786d1 100644
--- a/drivers/media/usb/cx231xx/cx231xx-audio.c
+++ b/drivers/media/usb/cx231xx/cx231xx-audio.c
@@ -404,9 +404,9 @@ static int snd_pcm_alloc_vmalloc_buffer(struct snd_pcm_substream *subs,
 }
 
 static const struct snd_pcm_hardware snd_cx231xx_hw_capture = {
-	.info = SNDRV_PCM_INFO_BLOCK_TRANSFER 	|
-	    SNDRV_PCM_INFO_MMAP 		|
-	    SNDRV_PCM_INFO_INTERLEAVED 		|
+	.info = SNDRV_PCM_INFO_BLOCK_TRANSFER	|
+	    SNDRV_PCM_INFO_MMAP			|
+	    SNDRV_PCM_INFO_INTERLEAVED		|
 	    SNDRV_PCM_INFO_MMAP_VALID,
 
 	.formats = SNDRV_PCM_FMTBIT_S16_LE,
diff --git a/drivers/media/usb/cx231xx/cx231xx-avcore.c b/drivers/media/usb/cx231xx/cx231xx-avcore.c
index 0df62d3951cf..fdd3c221fa0d 100644
--- a/drivers/media/usb/cx231xx/cx231xx-avcore.c
+++ b/drivers/media/usb/cx231xx/cx231xx-avcore.c
@@ -2168,7 +2168,7 @@ int cx231xx_tuner_post_channel_change(struct cx231xx *dev)
 }
 
 /******************************************************************************
- *        	    I 2 S - B L O C K    C O N T R O L   functions            *
+ *		    I 2 S - B L O C K    C O N T R O L   functions            *
  ******************************************************************************/
 int cx231xx_i2s_blk_initialize(struct cx231xx *dev)
 {
diff --git a/drivers/media/usb/cx231xx/cx231xx-core.c b/drivers/media/usb/cx231xx/cx231xx-core.c
index f372ad3917a8..4f43668df15d 100644
--- a/drivers/media/usb/cx231xx/cx231xx-core.c
+++ b/drivers/media/usb/cx231xx/cx231xx-core.c
@@ -56,7 +56,7 @@ MODULE_PARM_DESC(alt, "alternate setting to use for video endpoint");
 			 dev->name, __func__ , ##arg); } while (0)
 
 /*****************************************************************
-*             Device control list functions     				 *
+*             Device control list functions					 *
 ******************************************************************/
 
 LIST_HEAD(cx231xx_devlist);
diff --git a/drivers/media/usb/cx231xx/cx231xx-i2c.c b/drivers/media/usb/cx231xx/cx231xx-i2c.c
index 23648dab7be8..6e1bef2a45bb 100644
--- a/drivers/media/usb/cx231xx/cx231xx-i2c.c
+++ b/drivers/media/usb/cx231xx/cx231xx-i2c.c
@@ -51,7 +51,7 @@ do {							\
 	if (i2c_debug >= lvl) {				\
 		printk(KERN_DEBUG "%s at %s: " fmt,	\
 		       dev->name, __func__ , ##args);	\
-      } 						\
+      }							\
 } while (0)
 
 static inline int get_real_i2c_port(struct cx231xx *dev, int bus_nr)
diff --git a/drivers/media/usb/cx231xx/cx231xx-pcb-cfg.h b/drivers/media/usb/cx231xx/cx231xx-pcb-cfg.h
index 4511dc5d199c..8f00b1d38277 100644
--- a/drivers/media/usb/cx231xx/cx231xx-pcb-cfg.h
+++ b/drivers/media/usb/cx231xx/cx231xx-pcb-cfg.h
@@ -144,7 +144,7 @@ enum AVDEC_STATUS{
 #define SOURCE_EXTERNAL         0x8
 #define SOURCE_TS_BDA			0x10
 #define SOURCE_TS_ENCODE		0x20
-#define SOURCE_TS_EXTERNAL   	0x40
+#define SOURCE_TS_EXTERNAL	0x40
 
 /***************************************************************************
 				* interface information define *
diff --git a/drivers/media/usb/cx231xx/cx231xx-reg.h b/drivers/media/usb/cx231xx/cx231xx-reg.h
index 750c5d37d569..db5af8d51b61 100644
--- a/drivers/media/usb/cx231xx/cx231xx-reg.h
+++ b/drivers/media/usb/cx231xx/cx231xx-reg.h
@@ -1433,16 +1433,16 @@
 #define      FLD_AC97_SHUTDOWN        0x00000001
 
 /* Cx231xx redefine */
-#define      QPSK_IAGC_CTL1  		0x94c
-#define      QPSK_IAGC_CTL2  		0x950
-#define      QPSK_FEPR_FREQ  		0x954
-#define      QPSK_BTL_CTL1  		0x958
-#define      QPSK_BTL_CTL2  		0x95c
-#define      QPSK_CTL_CTL1  		0x960
-#define      QPSK_CTL_CTL2  		0x964
-#define      QPSK_MF_FAGC_CTL 		0x968
-#define      QPSK_EQ_CTL  		0x96c
-#define      QPSK_LOCK_CTL  		0x970
+#define      QPSK_IAGC_CTL1		0x94c
+#define      QPSK_IAGC_CTL2		0x950
+#define      QPSK_FEPR_FREQ		0x954
+#define      QPSK_BTL_CTL1		0x958
+#define      QPSK_BTL_CTL2		0x95c
+#define      QPSK_CTL_CTL1		0x960
+#define      QPSK_CTL_CTL2		0x964
+#define      QPSK_MF_FAGC_CTL		0x968
+#define      QPSK_EQ_CTL		0x96c
+#define      QPSK_LOCK_CTL		0x970
 
 /*****************************************************************************/
 #define      FM1_DFT_CTL              0x9a8
diff --git a/drivers/media/usb/dvb-usb/az6027.c b/drivers/media/usb/dvb-usb/az6027.c
index 96bbb53a4a91..f0d10ac03a37 100644
--- a/drivers/media/usb/dvb-usb/az6027.c
+++ b/drivers/media/usb/dvb-usb/az6027.c
@@ -36,70 +36,70 @@ static const struct stb0899_s1_reg az6027_stb0899_s1_init_1[] = {
 	/* 0x0000000b, SYSREG */
 	{ STB0899_DEV_ID		, 0x30 },
 	{ STB0899_DISCNTRL1		, 0x32 },
-	{ STB0899_DISCNTRL2     	, 0x80 },
-	{ STB0899_DISRX_ST0     	, 0x04 },
-	{ STB0899_DISRX_ST1     	, 0x00 },
-	{ STB0899_DISPARITY     	, 0x00 },
+	{ STB0899_DISCNTRL2		, 0x80 },
+	{ STB0899_DISRX_ST0		, 0x04 },
+	{ STB0899_DISRX_ST1		, 0x00 },
+	{ STB0899_DISPARITY		, 0x00 },
 	{ STB0899_DISSTATUS		, 0x20 },
-	{ STB0899_DISF22        	, 0x99 },
-	{ STB0899_DISF22RX      	, 0xa8 },
+	{ STB0899_DISF22		, 0x99 },
+	{ STB0899_DISF22RX		, 0xa8 },
 	/* SYSREG ? */
-	{ STB0899_ACRPRESC      	, 0x11 },
-	{ STB0899_ACRDIV1       	, 0x0a },
-	{ STB0899_ACRDIV2       	, 0x05 },
-	{ STB0899_DACR1         	, 0x00 },
-	{ STB0899_DACR2         	, 0x00 },
-	{ STB0899_OUTCFG        	, 0x00 },
-	{ STB0899_MODECFG       	, 0x00 },
+	{ STB0899_ACRPRESC		, 0x11 },
+	{ STB0899_ACRDIV1		, 0x0a },
+	{ STB0899_ACRDIV2		, 0x05 },
+	{ STB0899_DACR1			, 0x00 },
+	{ STB0899_DACR2			, 0x00 },
+	{ STB0899_OUTCFG		, 0x00 },
+	{ STB0899_MODECFG		, 0x00 },
 	{ STB0899_IRQSTATUS_3		, 0xfe },
 	{ STB0899_IRQSTATUS_2		, 0x03 },
 	{ STB0899_IRQSTATUS_1		, 0x7c },
 	{ STB0899_IRQSTATUS_0		, 0xf4 },
-	{ STB0899_IRQMSK_3      	, 0xf3 },
-	{ STB0899_IRQMSK_2      	, 0xfc },
-	{ STB0899_IRQMSK_1      	, 0xff },
+	{ STB0899_IRQMSK_3		, 0xf3 },
+	{ STB0899_IRQMSK_2		, 0xfc },
+	{ STB0899_IRQMSK_1		, 0xff },
 	{ STB0899_IRQMSK_0		, 0xff },
 	{ STB0899_IRQCFG		, 0x00 },
-	{ STB0899_I2CCFG        	, 0x88 },
-	{ STB0899_I2CRPT        	, 0x58 },
+	{ STB0899_I2CCFG		, 0x88 },
+	{ STB0899_I2CRPT		, 0x58 },
 	{ STB0899_IOPVALUE5		, 0x00 },
 	{ STB0899_IOPVALUE4		, 0x33 },
 	{ STB0899_IOPVALUE3		, 0x6d },
 	{ STB0899_IOPVALUE2		, 0x90 },
 	{ STB0899_IOPVALUE1		, 0x60 },
 	{ STB0899_IOPVALUE0		, 0x00 },
-	{ STB0899_GPIO00CFG     	, 0x82 },
-	{ STB0899_GPIO01CFG     	, 0x82 },
-	{ STB0899_GPIO02CFG     	, 0x82 },
-	{ STB0899_GPIO03CFG     	, 0x82 },
-	{ STB0899_GPIO04CFG     	, 0x82 },
-	{ STB0899_GPIO05CFG     	, 0x82 },
-	{ STB0899_GPIO06CFG     	, 0x82 },
-	{ STB0899_GPIO07CFG     	, 0x82 },
-	{ STB0899_GPIO08CFG     	, 0x82 },
-	{ STB0899_GPIO09CFG     	, 0x82 },
-	{ STB0899_GPIO10CFG     	, 0x82 },
-	{ STB0899_GPIO11CFG     	, 0x82 },
-	{ STB0899_GPIO12CFG     	, 0x82 },
-	{ STB0899_GPIO13CFG     	, 0x82 },
-	{ STB0899_GPIO14CFG     	, 0x82 },
-	{ STB0899_GPIO15CFG     	, 0x82 },
-	{ STB0899_GPIO16CFG     	, 0x82 },
-	{ STB0899_GPIO17CFG     	, 0x82 },
-	{ STB0899_GPIO18CFG     	, 0x82 },
-	{ STB0899_GPIO19CFG     	, 0x82 },
-	{ STB0899_GPIO20CFG     	, 0x82 },
-	{ STB0899_SDATCFG       	, 0xb8 },
-	{ STB0899_SCLTCFG       	, 0xba },
-	{ STB0899_AGCRFCFG      	, 0x1c }, /* 0x11 */
-	{ STB0899_GPIO22        	, 0x82 }, /* AGCBB2CFG */
-	{ STB0899_GPIO21        	, 0x91 }, /* AGCBB1CFG */
-	{ STB0899_DIRCLKCFG     	, 0x82 },
-	{ STB0899_CLKOUT27CFG   	, 0x7e },
-	{ STB0899_STDBYCFG      	, 0x82 },
-	{ STB0899_CS0CFG        	, 0x82 },
-	{ STB0899_CS1CFG        	, 0x82 },
-	{ STB0899_DISEQCOCFG    	, 0x20 },
+	{ STB0899_GPIO00CFG		, 0x82 },
+	{ STB0899_GPIO01CFG		, 0x82 },
+	{ STB0899_GPIO02CFG		, 0x82 },
+	{ STB0899_GPIO03CFG		, 0x82 },
+	{ STB0899_GPIO04CFG		, 0x82 },
+	{ STB0899_GPIO05CFG		, 0x82 },
+	{ STB0899_GPIO06CFG		, 0x82 },
+	{ STB0899_GPIO07CFG		, 0x82 },
+	{ STB0899_GPIO08CFG		, 0x82 },
+	{ STB0899_GPIO09CFG		, 0x82 },
+	{ STB0899_GPIO10CFG		, 0x82 },
+	{ STB0899_GPIO11CFG		, 0x82 },
+	{ STB0899_GPIO12CFG		, 0x82 },
+	{ STB0899_GPIO13CFG		, 0x82 },
+	{ STB0899_GPIO14CFG		, 0x82 },
+	{ STB0899_GPIO15CFG		, 0x82 },
+	{ STB0899_GPIO16CFG		, 0x82 },
+	{ STB0899_GPIO17CFG		, 0x82 },
+	{ STB0899_GPIO18CFG		, 0x82 },
+	{ STB0899_GPIO19CFG		, 0x82 },
+	{ STB0899_GPIO20CFG		, 0x82 },
+	{ STB0899_SDATCFG		, 0xb8 },
+	{ STB0899_SCLTCFG		, 0xba },
+	{ STB0899_AGCRFCFG		, 0x1c }, /* 0x11 */
+	{ STB0899_GPIO22		, 0x82 }, /* AGCBB2CFG */
+	{ STB0899_GPIO21		, 0x91 }, /* AGCBB1CFG */
+	{ STB0899_DIRCLKCFG		, 0x82 },
+	{ STB0899_CLKOUT27CFG		, 0x7e },
+	{ STB0899_STDBYCFG		, 0x82 },
+	{ STB0899_CS0CFG		, 0x82 },
+	{ STB0899_CS1CFG		, 0x82 },
+	{ STB0899_DISEQCOCFG		, 0x20 },
 	{ STB0899_GPIO32CFG		, 0x82 },
 	{ STB0899_GPIO33CFG		, 0x82 },
 	{ STB0899_GPIO34CFG		, 0x82 },
@@ -108,35 +108,35 @@ static const struct stb0899_s1_reg az6027_stb0899_s1_init_1[] = {
 	{ STB0899_GPIO37CFG		, 0x82 },
 	{ STB0899_GPIO38CFG		, 0x82 },
 	{ STB0899_GPIO39CFG		, 0x82 },
-	{ STB0899_NCOARSE       	, 0x17 }, /* 0x15 = 27 Mhz Clock, F/3 = 198MHz, F/6 = 99MHz */
-	{ STB0899_SYNTCTRL      	, 0x02 }, /* 0x00 = CLK from CLKI, 0x02 = CLK from XTALI */
-	{ STB0899_FILTCTRL      	, 0x00 },
-	{ STB0899_SYSCTRL       	, 0x01 },
-	{ STB0899_STOPCLK1      	, 0x20 },
-	{ STB0899_STOPCLK2      	, 0x00 },
+	{ STB0899_NCOARSE		, 0x17 }, /* 0x15 = 27 Mhz Clock, F/3 = 198MHz, F/6 = 99MHz */
+	{ STB0899_SYNTCTRL		, 0x02 }, /* 0x00 = CLK from CLKI, 0x02 = CLK from XTALI */
+	{ STB0899_FILTCTRL		, 0x00 },
+	{ STB0899_SYSCTRL		, 0x01 },
+	{ STB0899_STOPCLK1		, 0x20 },
+	{ STB0899_STOPCLK2		, 0x00 },
 	{ STB0899_INTBUFSTATUS		, 0x00 },
-	{ STB0899_INTBUFCTRL    	, 0x0a },
+	{ STB0899_INTBUFCTRL		, 0x0a },
 	{ 0xffff			, 0xff },
 };
 
 static const struct stb0899_s1_reg az6027_stb0899_s1_init_3[] = {
-	{ STB0899_DEMOD         	, 0x00 },
-	{ STB0899_RCOMPC        	, 0xc9 },
-	{ STB0899_AGC1CN        	, 0x01 },
-	{ STB0899_AGC1REF       	, 0x10 },
+	{ STB0899_DEMOD			, 0x00 },
+	{ STB0899_RCOMPC		, 0xc9 },
+	{ STB0899_AGC1CN		, 0x01 },
+	{ STB0899_AGC1REF		, 0x10 },
 	{ STB0899_RTC			, 0x23 },
-	{ STB0899_TMGCFG        	, 0x4e },
-	{ STB0899_AGC2REF       	, 0x34 },
-	{ STB0899_TLSR          	, 0x84 },
-	{ STB0899_CFD           	, 0xf7 },
+	{ STB0899_TMGCFG		, 0x4e },
+	{ STB0899_AGC2REF		, 0x34 },
+	{ STB0899_TLSR			, 0x84 },
+	{ STB0899_CFD			, 0xf7 },
 	{ STB0899_ACLC			, 0x87 },
-	{ STB0899_BCLC          	, 0x94 },
-	{ STB0899_EQON          	, 0x41 },
-	{ STB0899_LDT           	, 0xf1 },
-	{ STB0899_LDT2          	, 0xe3 },
-	{ STB0899_EQUALREF      	, 0xb4 },
-	{ STB0899_TMGRAMP       	, 0x10 },
-	{ STB0899_TMGTHD        	, 0x30 },
+	{ STB0899_BCLC			, 0x94 },
+	{ STB0899_EQON			, 0x41 },
+	{ STB0899_LDT			, 0xf1 },
+	{ STB0899_LDT2			, 0xe3 },
+	{ STB0899_EQUALREF		, 0xb4 },
+	{ STB0899_TMGRAMP		, 0x10 },
+	{ STB0899_TMGTHD		, 0x30 },
 	{ STB0899_IDCCOMP		, 0xfd },
 	{ STB0899_QDCCOMP		, 0xff },
 	{ STB0899_POWERI		, 0x0c },
@@ -155,12 +155,12 @@ static const struct stb0899_s1_reg az6027_stb0899_s1_init_3[] = {
 	{ STB0899_NIRL			, 0x80 },
 	{ STB0899_ISYMB			, 0x1d },
 	{ STB0899_QSYMB			, 0xa6 },
-	{ STB0899_SFRH          	, 0x2f },
-	{ STB0899_SFRM          	, 0x68 },
-	{ STB0899_SFRL          	, 0x40 },
-	{ STB0899_SFRUPH        	, 0x2f },
-	{ STB0899_SFRUPM        	, 0x68 },
-	{ STB0899_SFRUPL        	, 0x40 },
+	{ STB0899_SFRH			, 0x2f },
+	{ STB0899_SFRM			, 0x68 },
+	{ STB0899_SFRL			, 0x40 },
+	{ STB0899_SFRUPH		, 0x2f },
+	{ STB0899_SFRUPM		, 0x68 },
+	{ STB0899_SFRUPL		, 0x40 },
 	{ STB0899_EQUAI1		, 0x02 },
 	{ STB0899_EQUAQ1		, 0xff },
 	{ STB0899_EQUAI2		, 0x04 },
@@ -172,7 +172,7 @@ static const struct stb0899_s1_reg az6027_stb0899_s1_init_3[] = {
 	{ STB0899_EQUAI5		, 0x08 },
 	{ STB0899_EQUAQ5		, 0xf5 },
 	{ STB0899_DSTATUS2		, 0x00 },
-	{ STB0899_VSTATUS       	, 0x00 },
+	{ STB0899_VSTATUS		, 0x00 },
 	{ STB0899_VERROR		, 0x86 },
 	{ STB0899_IQSWAP		, 0x2a },
 	{ STB0899_ECNT1M		, 0x00 },
@@ -181,26 +181,26 @@ static const struct stb0899_s1_reg az6027_stb0899_s1_init_3[] = {
 	{ STB0899_ECNT2L		, 0x00 },
 	{ STB0899_ECNT3M		, 0x0a },
 	{ STB0899_ECNT3L		, 0xad },
-	{ STB0899_FECAUTO1      	, 0x06 },
+	{ STB0899_FECAUTO1		, 0x06 },
 	{ STB0899_FECM			, 0x01 },
-	{ STB0899_VTH12         	, 0xb0 },
-	{ STB0899_VTH23         	, 0x7a },
+	{ STB0899_VTH12			, 0xb0 },
+	{ STB0899_VTH23			, 0x7a },
 	{ STB0899_VTH34			, 0x58 },
-	{ STB0899_VTH56         	, 0x38 },
-	{ STB0899_VTH67         	, 0x34 },
-	{ STB0899_VTH78         	, 0x24 },
-	{ STB0899_PRVIT         	, 0xff },
-	{ STB0899_VITSYNC       	, 0x19 },
-	{ STB0899_RSULC         	, 0xb1 }, /* DVB = 0xb1, DSS = 0xa1 */
-	{ STB0899_TSULC         	, 0x42 },
-	{ STB0899_RSLLC         	, 0x41 },
+	{ STB0899_VTH56			, 0x38 },
+	{ STB0899_VTH67			, 0x34 },
+	{ STB0899_VTH78			, 0x24 },
+	{ STB0899_PRVIT			, 0xff },
+	{ STB0899_VITSYNC		, 0x19 },
+	{ STB0899_RSULC			, 0xb1 }, /* DVB = 0xb1, DSS = 0xa1 */
+	{ STB0899_TSULC			, 0x42 },
+	{ STB0899_RSLLC			, 0x41 },
 	{ STB0899_TSLPL			, 0x12 },
-	{ STB0899_TSCFGH        	, 0x0c },
-	{ STB0899_TSCFGM        	, 0x00 },
-	{ STB0899_TSCFGL        	, 0x00 },
+	{ STB0899_TSCFGH		, 0x0c },
+	{ STB0899_TSCFGM		, 0x00 },
+	{ STB0899_TSCFGL		, 0x00 },
 	{ STB0899_TSOUT			, 0x69 }, /* 0x0d for CAM */
-	{ STB0899_RSSYNCDEL     	, 0x00 },
-	{ STB0899_TSINHDELH     	, 0x02 },
+	{ STB0899_RSSYNCDEL		, 0x00 },
+	{ STB0899_TSINHDELH		, 0x02 },
 	{ STB0899_TSINHDELM		, 0x00 },
 	{ STB0899_TSINHDELL		, 0x00 },
 	{ STB0899_TSLLSTKM		, 0x1b },
@@ -211,18 +211,18 @@ static const struct stb0899_s1_reg az6027_stb0899_s1_init_3[] = {
 	{ STB0899_PCKLENLL		, 0xcc },
 	{ STB0899_RSPCKLEN		, 0xbd },
 	{ STB0899_TSSTATUS		, 0x90 },
-	{ STB0899_ERRCTRL1      	, 0xb6 },
-	{ STB0899_ERRCTRL2      	, 0x95 },
-	{ STB0899_ERRCTRL3      	, 0x8d },
+	{ STB0899_ERRCTRL1		, 0xb6 },
+	{ STB0899_ERRCTRL2		, 0x95 },
+	{ STB0899_ERRCTRL3		, 0x8d },
 	{ STB0899_DMONMSK1		, 0x27 },
 	{ STB0899_DMONMSK0		, 0x03 },
-	{ STB0899_DEMAPVIT      	, 0x5c },
+	{ STB0899_DEMAPVIT		, 0x5c },
 	{ STB0899_PLPARM		, 0x19 },
-	{ STB0899_PDELCTRL      	, 0x48 },
-	{ STB0899_PDELCTRL2     	, 0x00 },
-	{ STB0899_BBHCTRL1      	, 0x00 },
-	{ STB0899_BBHCTRL2      	, 0x00 },
-	{ STB0899_HYSTTHRESH    	, 0x77 },
+	{ STB0899_PDELCTRL		, 0x48 },
+	{ STB0899_PDELCTRL2		, 0x00 },
+	{ STB0899_BBHCTRL1		, 0x00 },
+	{ STB0899_BBHCTRL2		, 0x00 },
+	{ STB0899_HYSTTHRESH		, 0x77 },
 	{ STB0899_MATCSTM		, 0x00 },
 	{ STB0899_MATCSTL		, 0x00 },
 	{ STB0899_UPLCSTM		, 0x00 },
@@ -261,7 +261,7 @@ static struct stb0899_config az6027_stb0899_config = {
 	.init_s2_fec		= stb0899_s2_init_4,
 	.init_tst		= stb0899_s1_init_5,
 
-	.demod_address 		= 0xd0, /* 0x68, 0xd0 >> 1 */
+	.demod_address		= 0xd0, /* 0x68, 0xd0 >> 1 */
 
 	.xtal_freq		= 27000000,
 	.inversion		= IQ_SWAP_ON,
@@ -1181,9 +1181,9 @@ static struct dvb_usb_device_properties az6027_properties = {
 /* usb specific object needed to register this driver with the usb subsystem */
 static struct usb_driver az6027_usb_driver = {
 	.name		= "dvb_usb_az6027",
-	.probe 		= az6027_usb_probe,
-	.disconnect 	= az6027_usb_disconnect,
-	.id_table 	= az6027_usb_table,
+	.probe		= az6027_usb_probe,
+	.disconnect	= az6027_usb_disconnect,
+	.id_table	= az6027_usb_table,
 };
 
 module_usb_driver(az6027_usb_driver);
diff --git a/drivers/media/usb/gspca/stv06xx/stv06xx.c b/drivers/media/usb/gspca/stv06xx/stv06xx.c
index 2715218fe436..6080a35310ca 100644
--- a/drivers/media/usb/gspca/stv06xx/stv06xx.c
+++ b/drivers/media/usb/gspca/stv06xx/stv06xx.c
@@ -579,7 +579,7 @@ static int stv06xx_config(struct gspca_dev *gspca_dev,
 
 /* -- module initialisation -- */
 static const struct usb_device_id device_table[] = {
-	{USB_DEVICE(0x046d, 0x0840), .driver_info = BRIDGE_STV600 }, 	/* QuickCam Express */
+	{USB_DEVICE(0x046d, 0x0840), .driver_info = BRIDGE_STV600 },	/* QuickCam Express */
 	{USB_DEVICE(0x046d, 0x0850), .driver_info = BRIDGE_STV610 },	/* LEGO cam / QuickCam Web */
 	{USB_DEVICE(0x046d, 0x0870), .driver_info = BRIDGE_STV602 },	/* Dexxa WebCam USB */
 	{USB_DEVICE(0x046D, 0x08F0), .driver_info = BRIDGE_ST6422 },	/* QuickCam Messenger */
diff --git a/drivers/media/usb/hdpvr/hdpvr-video.c b/drivers/media/usb/hdpvr/hdpvr-video.c
index 7fb036d6a86e..5e5f63f2652b 100644
--- a/drivers/media/usb/hdpvr/hdpvr-video.c
+++ b/drivers/media/usb/hdpvr/hdpvr-video.c
@@ -941,18 +941,18 @@ static int hdpvr_s_ctrl(struct v4l2_ctrl *ctrl)
 		return 0;
 	case V4L2_CID_MPEG_VIDEO_ENCODING:
 		return 0;
-/* 	case V4L2_CID_MPEG_VIDEO_B_FRAMES: */
-/* 		if (ctrl->value == 0 && !(opt->gop_mode & 0x2)) { */
-/* 			opt->gop_mode |= 0x2; */
-/* 			hdpvr_config_call(dev, CTRL_GOP_MODE_VALUE, */
-/* 					  opt->gop_mode); */
-/* 		} */
-/* 		if (ctrl->value == 128 && opt->gop_mode & 0x2) { */
-/* 			opt->gop_mode &= ~0x2; */
-/* 			hdpvr_config_call(dev, CTRL_GOP_MODE_VALUE, */
-/* 					  opt->gop_mode); */
-/* 		} */
-/* 		break; */
+/*	case V4L2_CID_MPEG_VIDEO_B_FRAMES: */
+/*		if (ctrl->value == 0 && !(opt->gop_mode & 0x2)) { */
+/*			opt->gop_mode |= 0x2; */
+/*			hdpvr_config_call(dev, CTRL_GOP_MODE_VALUE, */
+/*					  opt->gop_mode); */
+/*		} */
+/*		if (ctrl->value == 128 && opt->gop_mode & 0x2) { */
+/*			opt->gop_mode &= ~0x2; */
+/*			hdpvr_config_call(dev, CTRL_GOP_MODE_VALUE, */
+/*					  opt->gop_mode); */
+/*		} */
+/*		break; */
 	case V4L2_CID_MPEG_VIDEO_BITRATE_MODE: {
 		uint peak_bitrate = dev->video_bitrate_peak->val / 100000;
 		uint bitrate = dev->video_bitrate->val / 100000;
@@ -1154,7 +1154,7 @@ static void hdpvr_device_release(struct video_device *vdev)
 static const struct video_device hdpvr_video_template = {
 	.fops			= &hdpvr_fops,
 	.release		= hdpvr_device_release,
-	.ioctl_ops 		= &hdpvr_ioctl_ops,
+	.ioctl_ops		= &hdpvr_ioctl_ops,
 	.tvnorms		= V4L2_STD_ALL,
 };
 
diff --git a/drivers/media/usb/hdpvr/hdpvr.h b/drivers/media/usb/hdpvr/hdpvr.h
index 96e36a8e5f43..1d65b4185f57 100644
--- a/drivers/media/usb/hdpvr/hdpvr.h
+++ b/drivers/media/usb/hdpvr/hdpvr.h
@@ -232,15 +232,15 @@ enum {
 
 
 /* :0 s 38 d3 0000 0000 0001 1 = 00 */
-/* 		ret = usb_control_msg(dev->udev, */
-/* 				      usb_sndctrlpipe(dev->udev, 0), */
-/* 				      0xd3, 0x38, */
-/* 				      0, 0, */
-/* 				      "\0", 1, */
-/* 				      1000); */
-
-/* 		info("control request returned %d", ret); */
-/* 		msleep(5000); */
+/*		ret = usb_control_msg(dev->udev, */
+/*				      usb_sndctrlpipe(dev->udev, 0), */
+/*				      0xd3, 0x38, */
+/*				      0, 0, */
+/*				      "\0", 1, */
+/*				      1000); */
+
+/*		info("control request returned %d", ret); */
+/*		msleep(5000); */
 
 
 	/* :0 s b8 81 1400 0003 0005 5 <
diff --git a/drivers/media/usb/pwc/pwc.h b/drivers/media/usb/pwc/pwc.h
index 3c73bdaae450..67010010d2a2 100644
--- a/drivers/media/usb/pwc/pwc.h
+++ b/drivers/media/usb/pwc/pwc.h
@@ -50,7 +50,7 @@
 
 /* Version block */
 #define PWC_VERSION	"10.0.15"
-#define PWC_NAME 	"pwc"
+#define PWC_NAME	"pwc"
 #define PFX		PWC_NAME ": "
 
 
@@ -120,10 +120,10 @@
 #define MAX_ISO_BUFS		3
 #define ISO_FRAMES_PER_DESC	10
 #define ISO_MAX_FRAME_SIZE	960
-#define ISO_BUFFER_SIZE 	(ISO_FRAMES_PER_DESC * ISO_MAX_FRAME_SIZE)
+#define ISO_BUFFER_SIZE		(ISO_FRAMES_PER_DESC * ISO_MAX_FRAME_SIZE)
 
 /* Maximum size after decompression is 640x480 YUV data, 1.5 * 640 * 480 */
-#define PWC_FRAME_SIZE 		(460800 + TOUCAM_HEADER_SIZE + TOUCAM_TRAILER_SIZE)
+#define PWC_FRAME_SIZE		(460800 + TOUCAM_HEADER_SIZE + TOUCAM_TRAILER_SIZE)
 
 /* Absolute minimum and maximum number of buffers available for mmap() */
 #define MIN_FRAMES		2
diff --git a/drivers/media/usb/siano/smsusb.c b/drivers/media/usb/siano/smsusb.c
index d07349cf9489..f13e4b01b5a5 100644
--- a/drivers/media/usb/siano/smsusb.c
+++ b/drivers/media/usb/siano/smsusb.c
@@ -61,7 +61,7 @@ struct smsusb_device_t {
 	struct usb_device *udev;
 	struct smscore_device_t *coredev;
 
-	struct smsusb_urb_t 	surbs[MAX_URBS];
+	struct smsusb_urb_t	surbs[MAX_URBS];
 
 	int		response_alignment;
 	int		buffer_size;
diff --git a/drivers/media/usb/stk1160/Makefile b/drivers/media/usb/stk1160/Makefile
index 8e6c22fb1803..b943db01ccf7 100644
--- a/drivers/media/usb/stk1160/Makefile
+++ b/drivers/media/usb/stk1160/Makefile
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0
-stk1160-y := 	stk1160-core.o \
+stk1160-y :=	stk1160-core.o \
 		stk1160-v4l.o \
 		stk1160-video.o \
 		stk1160-i2c.o \
diff --git a/drivers/media/usb/stkwebcam/stk-sensor.c b/drivers/media/usb/stkwebcam/stk-sensor.c
index c1d4505f84ea..9a7dbeff1337 100644
--- a/drivers/media/usb/stkwebcam/stk-sensor.c
+++ b/drivers/media/usb/stkwebcam/stk-sensor.c
@@ -397,12 +397,12 @@ int stk_sensor_init(struct stk_camera *dev)
 /* V4L2_PIX_FMT_UYVY */
 static struct regval ov_fmt_uyvy[] = {
 	{REG_TSLB, TSLB_YLAST|0x08 },
-	{ 0x4f, 0x80 }, 	/* "matrix coefficient 1" */
-	{ 0x50, 0x80 }, 	/* "matrix coefficient 2" */
+	{ 0x4f, 0x80 },		/* "matrix coefficient 1" */
+	{ 0x50, 0x80 },		/* "matrix coefficient 2" */
 	{ 0x51, 0    },		/* vb */
-	{ 0x52, 0x22 }, 	/* "matrix coefficient 4" */
-	{ 0x53, 0x5e }, 	/* "matrix coefficient 5" */
-	{ 0x54, 0x80 }, 	/* "matrix coefficient 6" */
+	{ 0x52, 0x22 },		/* "matrix coefficient 4" */
+	{ 0x53, 0x5e },		/* "matrix coefficient 5" */
+	{ 0x54, 0x80 },		/* "matrix coefficient 6" */
 	{REG_COM13, COM13_UVSAT|COM13_CMATRIX},
 	{REG_COM15, COM15_R00FF },
 	{0xff, 0xff}, /* END MARKER */
@@ -410,12 +410,12 @@ static struct regval ov_fmt_uyvy[] = {
 /* V4L2_PIX_FMT_YUYV */
 static struct regval ov_fmt_yuyv[] = {
 	{REG_TSLB, 0 },
-	{ 0x4f, 0x80 }, 	/* "matrix coefficient 1" */
-	{ 0x50, 0x80 }, 	/* "matrix coefficient 2" */
+	{ 0x4f, 0x80 },		/* "matrix coefficient 1" */
+	{ 0x50, 0x80 },		/* "matrix coefficient 2" */
 	{ 0x51, 0    },		/* vb */
-	{ 0x52, 0x22 }, 	/* "matrix coefficient 4" */
-	{ 0x53, 0x5e }, 	/* "matrix coefficient 5" */
-	{ 0x54, 0x80 }, 	/* "matrix coefficient 6" */
+	{ 0x52, 0x22 },		/* "matrix coefficient 4" */
+	{ 0x53, 0x5e },		/* "matrix coefficient 5" */
+	{ 0x54, 0x80 },		/* "matrix coefficient 6" */
 	{REG_COM13, COM13_UVSAT|COM13_CMATRIX},
 	{REG_COM15, COM15_R00FF },
 	{0xff, 0xff}, /* END MARKER */
@@ -426,13 +426,13 @@ static struct regval ov_fmt_rgbr[] = {
 	{ REG_RGB444, 0 },	/* No RGB444 please */
 	{REG_TSLB, 0x00},
 	{ REG_COM1, 0x0 },
-	{ REG_COM9, 0x38 }, 	/* 16x gain ceiling; 0x8 is reserved bit */
-	{ 0x4f, 0xb3 }, 	/* "matrix coefficient 1" */
-	{ 0x50, 0xb3 }, 	/* "matrix coefficient 2" */
+	{ REG_COM9, 0x38 },	/* 16x gain ceiling; 0x8 is reserved bit */
+	{ 0x4f, 0xb3 },		/* "matrix coefficient 1" */
+	{ 0x50, 0xb3 },		/* "matrix coefficient 2" */
 	{ 0x51, 0    },		/* vb */
-	{ 0x52, 0x3d }, 	/* "matrix coefficient 4" */
-	{ 0x53, 0xa7 }, 	/* "matrix coefficient 5" */
-	{ 0x54, 0xe4 }, 	/* "matrix coefficient 6" */
+	{ 0x52, 0x3d },		/* "matrix coefficient 4" */
+	{ 0x53, 0xa7 },		/* "matrix coefficient 5" */
+	{ 0x54, 0xe4 },		/* "matrix coefficient 6" */
 	{ REG_COM13, COM13_GAMMA },
 	{ REG_COM15, COM15_RGB565|COM15_R00FF },
 	{ 0xff, 0xff },
@@ -443,13 +443,13 @@ static struct regval ov_fmt_rgbp[] = {
 	{ REG_RGB444, 0 },	/* No RGB444 please */
 	{REG_TSLB, TSLB_BYTEORD },
 	{ REG_COM1, 0x0 },
-	{ REG_COM9, 0x38 }, 	/* 16x gain ceiling; 0x8 is reserved bit */
-	{ 0x4f, 0xb3 }, 	/* "matrix coefficient 1" */
-	{ 0x50, 0xb3 }, 	/* "matrix coefficient 2" */
+	{ REG_COM9, 0x38 },	/* 16x gain ceiling; 0x8 is reserved bit */
+	{ 0x4f, 0xb3 },		/* "matrix coefficient 1" */
+	{ 0x50, 0xb3 },		/* "matrix coefficient 2" */
 	{ 0x51, 0    },		/* vb */
-	{ 0x52, 0x3d }, 	/* "matrix coefficient 4" */
-	{ 0x53, 0xa7 }, 	/* "matrix coefficient 5" */
-	{ 0x54, 0xe4 }, 	/* "matrix coefficient 6" */
+	{ 0x52, 0x3d },		/* "matrix coefficient 4" */
+	{ 0x53, 0xa7 },		/* "matrix coefficient 5" */
+	{ 0x54, 0xe4 },		/* "matrix coefficient 6" */
 	{ REG_COM13, COM13_GAMMA },
 	{ REG_COM15, COM15_RGB565|COM15_R00FF },
 	{ 0xff, 0xff },
diff --git a/drivers/media/usb/uvc/uvc_driver.c b/drivers/media/usb/uvc/uvc_driver.c
index 108577c50097..fd387bf3f02d 100644
--- a/drivers/media/usb/uvc/uvc_driver.c
+++ b/drivers/media/usb/uvc/uvc_driver.c
@@ -963,11 +963,11 @@ static int uvc_parse_vendor_control(struct uvc_device *dev,
 		 *	Size of this descriptor, in bytes: 24+p+n*2
 		 * ----------------------------------------------------------
 		 * 23+p+n	bmControlsType	N	Bitmap
-		 * 	Individual bits in the set are defined:
-		 * 	0: Absolute
-		 * 	1: Relative
+		 *	Individual bits in the set are defined:
+		 *	0: Absolute
+		 *	1: Relative
 		 *
-		 * 	This bitset is mapped exactly the same as bmControls.
+		 *	This bitset is mapped exactly the same as bmControls.
 		 * ----------------------------------------------------------
 		 * 23+p+n*2	bReserved	1	Boolean
 		 * ----------------------------------------------------------
@@ -2481,7 +2481,7 @@ static const struct usb_device_id uvc_ids[] = {
 	  .bInterfaceClass	= USB_CLASS_VIDEO,
 	  .bInterfaceSubClass	= 1,
 	  .bInterfaceProtocol	= 0,
-	  .driver_info 		= (kernel_ulong_t)&uvc_quirk_probe_def },
+	  .driver_info		= (kernel_ulong_t)&uvc_quirk_probe_def },
 	/* Dell SP2008WFP Monitor */
 	{ .match_flags		= USB_DEVICE_ID_MATCH_DEVICE
 				| USB_DEVICE_ID_MATCH_INT_INFO,
@@ -2490,7 +2490,7 @@ static const struct usb_device_id uvc_ids[] = {
 	  .bInterfaceClass	= USB_CLASS_VIDEO,
 	  .bInterfaceSubClass	= 1,
 	  .bInterfaceProtocol	= 0,
-	  .driver_info 		= (kernel_ulong_t)&uvc_quirk_probe_def },
+	  .driver_info		= (kernel_ulong_t)&uvc_quirk_probe_def },
 	/* Dell Alienware X51 */
 	{ .match_flags		= USB_DEVICE_ID_MATCH_DEVICE
 				| USB_DEVICE_ID_MATCH_INT_INFO,
@@ -2526,7 +2526,7 @@ static const struct usb_device_id uvc_ids[] = {
 	  .bInterfaceClass	= USB_CLASS_VIDEO,
 	  .bInterfaceSubClass	= 1,
 	  .bInterfaceProtocol	= 0,
-	  .driver_info 		= UVC_QUIRK_INFO(UVC_QUIRK_PROBE_MINMAX
+	  .driver_info		= UVC_QUIRK_INFO(UVC_QUIRK_PROBE_MINMAX
 					| UVC_QUIRK_BUILTIN_ISIGHT) },
 	/* Apple Built-In iSight via iBridge */
 	{ .match_flags		= USB_DEVICE_ID_MATCH_DEVICE
diff --git a/drivers/media/usb/uvc/uvc_isight.c b/drivers/media/usb/uvc/uvc_isight.c
index fb940cfae575..5059fbf41020 100644
--- a/drivers/media/usb/uvc/uvc_isight.c
+++ b/drivers/media/usb/uvc/uvc_isight.c
@@ -27,11 +27,11 @@
  *
  * Offset   Size (bytes)	Description
  * ------------------------------------------------------------------
- * 0x00 	1   	Header length
- * 0x01 	1   	Flags (UVC-compliant)
- * 0x02 	4   	Always equal to '11223344'
- * 0x06 	8   	Always equal to 'deadbeefdeadface'
- * 0x0e 	16  	Unknown
+ * 0x00	1	Header length
+ * 0x01	1	Flags (UVC-compliant)
+ * 0x02	4	Always equal to '11223344'
+ * 0x06	8	Always equal to 'deadbeefdeadface'
+ * 0x0e	16	Unknown
  *
  * The header can be prefixed by an optional, unknown-purpose byte.
  */
diff --git a/drivers/media/v4l2-core/v4l2-compat-ioctl32.c b/drivers/media/v4l2-core/v4l2-compat-ioctl32.c
index 8d79691b1dce..e48d59046086 100644
--- a/drivers/media/v4l2-core/v4l2-compat-ioctl32.c
+++ b/drivers/media/v4l2-core/v4l2-compat-ioctl32.c
@@ -33,7 +33,7 @@ static long native_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 
 struct v4l2_clip32 {
 	struct v4l2_rect        c;
-	compat_caddr_t 		next;
+	compat_caddr_t		next;
 };
 
 struct v4l2_window32 {
@@ -582,7 +582,7 @@ static int put_v4l2_buffer32(struct v4l2_buffer *kp, struct v4l2_buffer32 __user
 struct v4l2_framebuffer32 {
 	__u32			capability;
 	__u32			flags;
-	compat_caddr_t 		base;
+	compat_caddr_t		base;
 	struct {
 		__u32		width;
 		__u32		height;
@@ -857,7 +857,7 @@ static int put_v4l2_edid32(struct v4l2_edid *kp, struct v4l2_edid32 __user *up)
 #define VIDIOC_ENUMINPUT32	_IOWR('V', 26, struct v4l2_input32)
 #define VIDIOC_G_EDID32		_IOWR('V', 40, struct v4l2_edid32)
 #define VIDIOC_S_EDID32		_IOWR('V', 41, struct v4l2_edid32)
-#define VIDIOC_TRY_FMT32      	_IOWR('V', 64, struct v4l2_format32)
+#define VIDIOC_TRY_FMT32	_IOWR('V', 64, struct v4l2_format32)
 #define VIDIOC_G_EXT_CTRLS32    _IOWR('V', 71, struct v4l2_ext_controls32)
 #define VIDIOC_S_EXT_CTRLS32    _IOWR('V', 72, struct v4l2_ext_controls32)
 #define VIDIOC_TRY_EXT_CTRLS32  _IOWR('V', 73, struct v4l2_ext_controls32)
diff --git a/drivers/media/v4l2-core/v4l2-ioctl.c b/drivers/media/v4l2-core/v4l2-ioctl.c
index 1d7c2ea78c3e..59d2100eeff6 100644
--- a/drivers/media/v4l2-core/v4l2-ioctl.c
+++ b/drivers/media/v4l2-core/v4l2-ioctl.c
@@ -46,37 +46,37 @@ struct std_descr {
 };
 
 static const struct std_descr standards[] = {
-	{ V4L2_STD_NTSC, 	"NTSC"      },
-	{ V4L2_STD_NTSC_M, 	"NTSC-M"    },
-	{ V4L2_STD_NTSC_M_JP, 	"NTSC-M-JP" },
+	{ V4L2_STD_NTSC,	"NTSC"      },
+	{ V4L2_STD_NTSC_M,	"NTSC-M"    },
+	{ V4L2_STD_NTSC_M_JP,	"NTSC-M-JP" },
 	{ V4L2_STD_NTSC_M_KR,	"NTSC-M-KR" },
-	{ V4L2_STD_NTSC_443, 	"NTSC-443"  },
-	{ V4L2_STD_PAL, 	"PAL"       },
-	{ V4L2_STD_PAL_BG, 	"PAL-BG"    },
-	{ V4L2_STD_PAL_B, 	"PAL-B"     },
-	{ V4L2_STD_PAL_B1, 	"PAL-B1"    },
-	{ V4L2_STD_PAL_G, 	"PAL-G"     },
-	{ V4L2_STD_PAL_H, 	"PAL-H"     },
-	{ V4L2_STD_PAL_I, 	"PAL-I"     },
-	{ V4L2_STD_PAL_DK, 	"PAL-DK"    },
-	{ V4L2_STD_PAL_D, 	"PAL-D"     },
-	{ V4L2_STD_PAL_D1, 	"PAL-D1"    },
-	{ V4L2_STD_PAL_K, 	"PAL-K"     },
-	{ V4L2_STD_PAL_M, 	"PAL-M"     },
-	{ V4L2_STD_PAL_N, 	"PAL-N"     },
-	{ V4L2_STD_PAL_Nc, 	"PAL-Nc"    },
-	{ V4L2_STD_PAL_60, 	"PAL-60"    },
-	{ V4L2_STD_SECAM, 	"SECAM"     },
-	{ V4L2_STD_SECAM_B, 	"SECAM-B"   },
-	{ V4L2_STD_SECAM_G, 	"SECAM-G"   },
-	{ V4L2_STD_SECAM_H, 	"SECAM-H"   },
-	{ V4L2_STD_SECAM_DK, 	"SECAM-DK"  },
-	{ V4L2_STD_SECAM_D, 	"SECAM-D"   },
-	{ V4L2_STD_SECAM_K, 	"SECAM-K"   },
-	{ V4L2_STD_SECAM_K1, 	"SECAM-K1"  },
-	{ V4L2_STD_SECAM_L, 	"SECAM-L"   },
-	{ V4L2_STD_SECAM_LC, 	"SECAM-Lc"  },
-	{ 0, 			"Unknown"   }
+	{ V4L2_STD_NTSC_443,	"NTSC-443"  },
+	{ V4L2_STD_PAL,		"PAL"       },
+	{ V4L2_STD_PAL_BG,	"PAL-BG"    },
+	{ V4L2_STD_PAL_B,	"PAL-B"     },
+	{ V4L2_STD_PAL_B1,	"PAL-B1"    },
+	{ V4L2_STD_PAL_G,	"PAL-G"     },
+	{ V4L2_STD_PAL_H,	"PAL-H"     },
+	{ V4L2_STD_PAL_I,	"PAL-I"     },
+	{ V4L2_STD_PAL_DK,	"PAL-DK"    },
+	{ V4L2_STD_PAL_D,	"PAL-D"     },
+	{ V4L2_STD_PAL_D1,	"PAL-D1"    },
+	{ V4L2_STD_PAL_K,	"PAL-K"     },
+	{ V4L2_STD_PAL_M,	"PAL-M"     },
+	{ V4L2_STD_PAL_N,	"PAL-N"     },
+	{ V4L2_STD_PAL_Nc,	"PAL-Nc"    },
+	{ V4L2_STD_PAL_60,	"PAL-60"    },
+	{ V4L2_STD_SECAM,	"SECAM"     },
+	{ V4L2_STD_SECAM_B,	"SECAM-B"   },
+	{ V4L2_STD_SECAM_G,	"SECAM-G"   },
+	{ V4L2_STD_SECAM_H,	"SECAM-H"   },
+	{ V4L2_STD_SECAM_DK,	"SECAM-DK"  },
+	{ V4L2_STD_SECAM_D,	"SECAM-D"   },
+	{ V4L2_STD_SECAM_K,	"SECAM-K"   },
+	{ V4L2_STD_SECAM_K1,	"SECAM-K1"  },
+	{ V4L2_STD_SECAM_L,	"SECAM-L"   },
+	{ V4L2_STD_SECAM_LC,	"SECAM-Lc"  },
+	{ 0,			"Unknown"   }
 };
 
 /* video4linux standard ID conversion to standard name
@@ -2544,7 +2544,7 @@ struct v4l2_ioctl_info {
 #define INFO_FL_CLEAR(v4l2_struct, field)			\
 	((offsetof(struct v4l2_struct, field) +			\
 	  sizeof(((struct v4l2_struct *)0)->field)) << 16)
-#define INFO_FL_CLEAR_MASK 	(_IOC_SIZEMASK << 16)
+#define INFO_FL_CLEAR_MASK	(_IOC_SIZEMASK << 16)
 
 #define IOCTL_INFO_STD(_ioctl, _vidioc, _debug, _flags)			\
 	[_IOC_NR(_ioctl)] = {						\
diff --git a/include/media/drv-intf/cx2341x.h b/include/media/drv-intf/cx2341x.h
index 9635eebaab09..33a97bfcea58 100644
--- a/include/media/drv-intf/cx2341x.h
+++ b/include/media/drv-intf/cx2341x.h
@@ -29,8 +29,8 @@ enum cx2341x_port {
 
 enum cx2341x_cap {
 	CX2341X_CAP_HAS_SLICED_VBI = 1 << 0,
-	CX2341X_CAP_HAS_TS 	   = 1 << 1,
-	CX2341X_CAP_HAS_AC3 	   = 1 << 2,
+	CX2341X_CAP_HAS_TS	   = 1 << 1,
+	CX2341X_CAP_HAS_AC3	   = 1 << 2,
 };
 
 struct cx2341x_mpeg_params {
@@ -204,92 +204,92 @@ void cx2341x_handler_set_busy(struct cx2341x_handler *cxhdl, int busy);
 /* Firmware API commands */
 
 /* MPEG decoder API, specific to the cx23415 */
-#define CX2341X_DEC_PING_FW 			0x00
-#define CX2341X_DEC_START_PLAYBACK 		0x01
-#define CX2341X_DEC_STOP_PLAYBACK 		0x02
-#define CX2341X_DEC_SET_PLAYBACK_SPEED 		0x03
-#define CX2341X_DEC_STEP_VIDEO 			0x05
-#define CX2341X_DEC_SET_DMA_BLOCK_SIZE 		0x08
+#define CX2341X_DEC_PING_FW			0x00
+#define CX2341X_DEC_START_PLAYBACK		0x01
+#define CX2341X_DEC_STOP_PLAYBACK		0x02
+#define CX2341X_DEC_SET_PLAYBACK_SPEED		0x03
+#define CX2341X_DEC_STEP_VIDEO			0x05
+#define CX2341X_DEC_SET_DMA_BLOCK_SIZE		0x08
 #define CX2341X_DEC_GET_XFER_INFO		0x09
 #define CX2341X_DEC_GET_DMA_STATUS		0x0a
 #define CX2341X_DEC_SCHED_DMA_FROM_HOST		0x0b
-#define CX2341X_DEC_PAUSE_PLAYBACK 		0x0d
-#define CX2341X_DEC_HALT_FW 			0x0e
-#define CX2341X_DEC_SET_STANDARD 		0x10
+#define CX2341X_DEC_PAUSE_PLAYBACK		0x0d
+#define CX2341X_DEC_HALT_FW			0x0e
+#define CX2341X_DEC_SET_STANDARD		0x10
 #define CX2341X_DEC_GET_VERSION			0x11
-#define CX2341X_DEC_SET_STREAM_INPUT 		0x14
-#define CX2341X_DEC_GET_TIMING_INFO 		0x15
-#define CX2341X_DEC_SET_AUDIO_MODE 		0x16
+#define CX2341X_DEC_SET_STREAM_INPUT		0x14
+#define CX2341X_DEC_GET_TIMING_INFO		0x15
+#define CX2341X_DEC_SET_AUDIO_MODE		0x16
 #define CX2341X_DEC_SET_EVENT_NOTIFICATION	0x17
 #define CX2341X_DEC_SET_DISPLAY_BUFFERS		0x18
-#define CX2341X_DEC_EXTRACT_VBI 		0x19
-#define CX2341X_DEC_SET_DECODER_SOURCE 		0x1a
+#define CX2341X_DEC_EXTRACT_VBI			0x19
+#define CX2341X_DEC_SET_DECODER_SOURCE		0x1a
 #define CX2341X_DEC_SET_PREBUFFERING		0x1e
 
 /* MPEG encoder API */
-#define CX2341X_ENC_PING_FW 			0x80
-#define CX2341X_ENC_START_CAPTURE 		0x81
-#define CX2341X_ENC_STOP_CAPTURE 		0x82
-#define CX2341X_ENC_SET_AUDIO_ID 		0x89
-#define CX2341X_ENC_SET_VIDEO_ID 		0x8b
-#define CX2341X_ENC_SET_PCR_ID 			0x8d
-#define CX2341X_ENC_SET_FRAME_RATE 		0x8f
-#define CX2341X_ENC_SET_FRAME_SIZE 		0x91
-#define CX2341X_ENC_SET_BIT_RATE 		0x95
-#define CX2341X_ENC_SET_GOP_PROPERTIES 		0x97
-#define CX2341X_ENC_SET_ASPECT_RATIO 		0x99
-#define CX2341X_ENC_SET_DNR_FILTER_MODE 	0x9b
-#define CX2341X_ENC_SET_DNR_FILTER_PROPS 	0x9d
-#define CX2341X_ENC_SET_CORING_LEVELS 		0x9f
-#define CX2341X_ENC_SET_SPATIAL_FILTER_TYPE 	0xa1
-#define CX2341X_ENC_SET_VBI_LINE 		0xb7
-#define CX2341X_ENC_SET_STREAM_TYPE 		0xb9
-#define CX2341X_ENC_SET_OUTPUT_PORT 		0xbb
-#define CX2341X_ENC_SET_AUDIO_PROPERTIES 	0xbd
-#define CX2341X_ENC_HALT_FW 			0xc3
+#define CX2341X_ENC_PING_FW			0x80
+#define CX2341X_ENC_START_CAPTURE		0x81
+#define CX2341X_ENC_STOP_CAPTURE		0x82
+#define CX2341X_ENC_SET_AUDIO_ID		0x89
+#define CX2341X_ENC_SET_VIDEO_ID		0x8b
+#define CX2341X_ENC_SET_PCR_ID			0x8d
+#define CX2341X_ENC_SET_FRAME_RATE		0x8f
+#define CX2341X_ENC_SET_FRAME_SIZE		0x91
+#define CX2341X_ENC_SET_BIT_RATE		0x95
+#define CX2341X_ENC_SET_GOP_PROPERTIES		0x97
+#define CX2341X_ENC_SET_ASPECT_RATIO		0x99
+#define CX2341X_ENC_SET_DNR_FILTER_MODE		0x9b
+#define CX2341X_ENC_SET_DNR_FILTER_PROPS	0x9d
+#define CX2341X_ENC_SET_CORING_LEVELS		0x9f
+#define CX2341X_ENC_SET_SPATIAL_FILTER_TYPE	0xa1
+#define CX2341X_ENC_SET_VBI_LINE		0xb7
+#define CX2341X_ENC_SET_STREAM_TYPE		0xb9
+#define CX2341X_ENC_SET_OUTPUT_PORT		0xbb
+#define CX2341X_ENC_SET_AUDIO_PROPERTIES	0xbd
+#define CX2341X_ENC_HALT_FW			0xc3
 #define CX2341X_ENC_GET_VERSION			0xc4
-#define CX2341X_ENC_SET_GOP_CLOSURE 		0xc5
-#define CX2341X_ENC_GET_SEQ_END 		0xc6
-#define CX2341X_ENC_SET_PGM_INDEX_INFO 		0xc7
+#define CX2341X_ENC_SET_GOP_CLOSURE		0xc5
+#define CX2341X_ENC_GET_SEQ_END			0xc6
+#define CX2341X_ENC_SET_PGM_INDEX_INFO		0xc7
 #define CX2341X_ENC_SET_VBI_CONFIG		0xc8
-#define CX2341X_ENC_SET_DMA_BLOCK_SIZE 		0xc9
+#define CX2341X_ENC_SET_DMA_BLOCK_SIZE		0xc9
 #define CX2341X_ENC_GET_PREV_DMA_INFO_MB_10	0xca
 #define CX2341X_ENC_GET_PREV_DMA_INFO_MB_9	0xcb
-#define CX2341X_ENC_SCHED_DMA_TO_HOST 		0xcc
-#define CX2341X_ENC_INITIALIZE_INPUT 		0xcd
-#define CX2341X_ENC_SET_FRAME_DROP_RATE 	0xd0
-#define CX2341X_ENC_PAUSE_ENCODER 		0xd2
-#define CX2341X_ENC_REFRESH_INPUT 		0xd3
+#define CX2341X_ENC_SCHED_DMA_TO_HOST		0xcc
+#define CX2341X_ENC_INITIALIZE_INPUT		0xcd
+#define CX2341X_ENC_SET_FRAME_DROP_RATE		0xd0
+#define CX2341X_ENC_PAUSE_ENCODER		0xd2
+#define CX2341X_ENC_REFRESH_INPUT		0xd3
 #define CX2341X_ENC_SET_COPYRIGHT		0xd4
-#define CX2341X_ENC_SET_EVENT_NOTIFICATION 	0xd5
-#define CX2341X_ENC_SET_NUM_VSYNC_LINES 	0xd6
-#define CX2341X_ENC_SET_PLACEHOLDER 		0xd7
-#define CX2341X_ENC_MUTE_VIDEO 			0xd9
-#define CX2341X_ENC_MUTE_AUDIO 			0xda
+#define CX2341X_ENC_SET_EVENT_NOTIFICATION	0xd5
+#define CX2341X_ENC_SET_NUM_VSYNC_LINES		0xd6
+#define CX2341X_ENC_SET_PLACEHOLDER		0xd7
+#define CX2341X_ENC_MUTE_VIDEO			0xd9
+#define CX2341X_ENC_MUTE_AUDIO			0xda
 #define CX2341X_ENC_SET_VERT_CROP_LINE		0xdb
-#define CX2341X_ENC_MISC 			0xdc
+#define CX2341X_ENC_MISC			0xdc
 
 /* OSD API, specific to the cx23415 */
-#define CX2341X_OSD_GET_FRAMEBUFFER 		0x41
-#define CX2341X_OSD_GET_PIXEL_FORMAT 		0x42
-#define CX2341X_OSD_SET_PIXEL_FORMAT 		0x43
-#define CX2341X_OSD_GET_STATE 			0x44
-#define CX2341X_OSD_SET_STATE 			0x45
-#define CX2341X_OSD_GET_OSD_COORDS 		0x46
-#define CX2341X_OSD_SET_OSD_COORDS 		0x47
-#define CX2341X_OSD_GET_SCREEN_COORDS 		0x48
-#define CX2341X_OSD_SET_SCREEN_COORDS 		0x49
-#define CX2341X_OSD_GET_GLOBAL_ALPHA 		0x4a
-#define CX2341X_OSD_SET_GLOBAL_ALPHA 		0x4b
-#define CX2341X_OSD_SET_BLEND_COORDS 		0x4c
-#define CX2341X_OSD_GET_FLICKER_STATE 		0x4f
-#define CX2341X_OSD_SET_FLICKER_STATE 		0x50
-#define CX2341X_OSD_BLT_COPY 			0x52
-#define CX2341X_OSD_BLT_FILL 			0x53
-#define CX2341X_OSD_BLT_TEXT 			0x54
-#define CX2341X_OSD_SET_FRAMEBUFFER_WINDOW 	0x56
-#define CX2341X_OSD_SET_CHROMA_KEY 		0x60
-#define CX2341X_OSD_GET_ALPHA_CONTENT_INDEX 	0x61
-#define CX2341X_OSD_SET_ALPHA_CONTENT_INDEX 	0x62
+#define CX2341X_OSD_GET_FRAMEBUFFER		0x41
+#define CX2341X_OSD_GET_PIXEL_FORMAT		0x42
+#define CX2341X_OSD_SET_PIXEL_FORMAT		0x43
+#define CX2341X_OSD_GET_STATE			0x44
+#define CX2341X_OSD_SET_STATE			0x45
+#define CX2341X_OSD_GET_OSD_COORDS		0x46
+#define CX2341X_OSD_SET_OSD_COORDS		0x47
+#define CX2341X_OSD_GET_SCREEN_COORDS		0x48
+#define CX2341X_OSD_SET_SCREEN_COORDS		0x49
+#define CX2341X_OSD_GET_GLOBAL_ALPHA		0x4a
+#define CX2341X_OSD_SET_GLOBAL_ALPHA		0x4b
+#define CX2341X_OSD_SET_BLEND_COORDS		0x4c
+#define CX2341X_OSD_GET_FLICKER_STATE		0x4f
+#define CX2341X_OSD_SET_FLICKER_STATE		0x50
+#define CX2341X_OSD_BLT_COPY			0x52
+#define CX2341X_OSD_BLT_FILL			0x53
+#define CX2341X_OSD_BLT_TEXT			0x54
+#define CX2341X_OSD_SET_FRAMEBUFFER_WINDOW	0x56
+#define CX2341X_OSD_SET_CHROMA_KEY		0x60
+#define CX2341X_OSD_GET_ALPHA_CONTENT_INDEX	0x61
+#define CX2341X_OSD_SET_ALPHA_CONTENT_INDEX	0x62
 
 #endif /* CX2341X_H */
diff --git a/include/media/drv-intf/msp3400.h b/include/media/drv-intf/msp3400.h
index 1e6e80213a77..db98ce49e17b 100644
--- a/include/media/drv-intf/msp3400.h
+++ b/include/media/drv-intf/msp3400.h
@@ -80,17 +80,17 @@
  */
 
 /* SCART input to DSP selection */
-#define MSP_IN_SCART1  		0  /* Pin SC1_IN */
-#define MSP_IN_SCART2  		1  /* Pin SC2_IN */
-#define MSP_IN_SCART3  		2  /* Pin SC3_IN */
-#define MSP_IN_SCART4  		3  /* Pin SC4_IN */
-#define MSP_IN_MONO     	6  /* Pin MONO_IN */
-#define MSP_IN_MUTE     	7  /* Mute DSP input */
-#define MSP_SCART_TO_DSP(in) 	(in)
+#define MSP_IN_SCART1		0  /* Pin SC1_IN */
+#define MSP_IN_SCART2		1  /* Pin SC2_IN */
+#define MSP_IN_SCART3		2  /* Pin SC3_IN */
+#define MSP_IN_SCART4		3  /* Pin SC4_IN */
+#define MSP_IN_MONO		6  /* Pin MONO_IN */
+#define MSP_IN_MUTE		7  /* Mute DSP input */
+#define MSP_SCART_TO_DSP(in)	(in)
 /* Tuner input to demodulator and DSP selection */
-#define MSP_IN_TUNER1 		0  /* Analog Sound IF input pin ANA_IN1 */
-#define MSP_IN_TUNER2 		1  /* Analog Sound IF input pin ANA_IN2 */
-#define MSP_TUNER_TO_DSP(in) 	((in) << 3)
+#define MSP_IN_TUNER1		0  /* Analog Sound IF input pin ANA_IN1 */
+#define MSP_IN_TUNER2		1  /* Analog Sound IF input pin ANA_IN2 */
+#define MSP_TUNER_TO_DSP(in)	((in) << 3)
 
 /* The msp has up to 5 DSP outputs, each output can independently select
    a DSP input.
@@ -109,30 +109,30 @@
    DSP. This is currently not implemented. Also not implemented is the
    multi-channel capable I2S3 input of the 44x0G. If someone can demonstrate
    a need for one of those features then additional support can be added. */
-#define MSP_DSP_IN_TUNER 	0  /* Tuner DSP input */
-#define MSP_DSP_IN_SCART 	2  /* SCART DSP input */
-#define MSP_DSP_IN_I2S1 	5  /* I2S1 DSP input */
-#define MSP_DSP_IN_I2S2 	6  /* I2S2 DSP input */
-#define MSP_DSP_IN_I2S3    	7  /* I2S3 DSP input */
-#define MSP_DSP_IN_MAIN_AVC 	11 /* MAIN AVC processed DSP input */
-#define MSP_DSP_IN_MAIN 	12 /* MAIN DSP input */
-#define MSP_DSP_IN_AUX 		13 /* AUX DSP input */
-#define MSP_DSP_TO_MAIN(in)   	((in) << 4)
-#define MSP_DSP_TO_AUX(in)    	((in) << 8)
-#define MSP_DSP_TO_SCART1(in) 	((in) << 12)
-#define MSP_DSP_TO_SCART2(in) 	((in) << 16)
-#define MSP_DSP_TO_I2S(in)    	((in) << 20)
+#define MSP_DSP_IN_TUNER	0  /* Tuner DSP input */
+#define MSP_DSP_IN_SCART	2  /* SCART DSP input */
+#define MSP_DSP_IN_I2S1		5  /* I2S1 DSP input */
+#define MSP_DSP_IN_I2S2		6  /* I2S2 DSP input */
+#define MSP_DSP_IN_I2S3		7  /* I2S3 DSP input */
+#define MSP_DSP_IN_MAIN_AVC	11 /* MAIN AVC processed DSP input */
+#define MSP_DSP_IN_MAIN		12 /* MAIN DSP input */
+#define MSP_DSP_IN_AUX		13 /* AUX DSP input */
+#define MSP_DSP_TO_MAIN(in)	((in) << 4)
+#define MSP_DSP_TO_AUX(in)	((in) << 8)
+#define MSP_DSP_TO_SCART1(in)	((in) << 12)
+#define MSP_DSP_TO_SCART2(in)	((in) << 16)
+#define MSP_DSP_TO_I2S(in)	((in) << 20)
 
 /* Output SCART select: the SCART outputs can select which input
    to use. */
-#define MSP_SC_IN_SCART1 	0  /* SCART1 input, bypassing the DSP */
-#define MSP_SC_IN_SCART2 	1  /* SCART2 input, bypassing the DSP */
-#define MSP_SC_IN_SCART3 	2  /* SCART3 input, bypassing the DSP */
-#define MSP_SC_IN_SCART4 	3  /* SCART4 input, bypassing the DSP */
-#define MSP_SC_IN_DSP_SCART1 	4  /* DSP SCART1 input */
-#define MSP_SC_IN_DSP_SCART2 	5  /* DSP SCART2 input */
-#define MSP_SC_IN_MONO 		6  /* MONO input, bypassing the DSP */
-#define MSP_SC_IN_MUTE 		7  /* MUTE output */
+#define MSP_SC_IN_SCART1	0  /* SCART1 input, bypassing the DSP */
+#define MSP_SC_IN_SCART2	1  /* SCART2 input, bypassing the DSP */
+#define MSP_SC_IN_SCART3	2  /* SCART3 input, bypassing the DSP */
+#define MSP_SC_IN_SCART4	3  /* SCART4 input, bypassing the DSP */
+#define MSP_SC_IN_DSP_SCART1	4  /* DSP SCART1 input */
+#define MSP_SC_IN_DSP_SCART2	5  /* DSP SCART2 input */
+#define MSP_SC_IN_MONO		6  /* MONO input, bypassing the DSP */
+#define MSP_SC_IN_MUTE		7  /* MUTE output */
 #define MSP_SC_TO_SCART1(in)	(in)
 #define MSP_SC_TO_SCART2(in)	((in) << 4)
 
diff --git a/include/media/drv-intf/saa7146.h b/include/media/drv-intf/saa7146.h
index 769c6cf7eb4c..a7bf2c4a2e4d 100644
--- a/include/media/drv-intf/saa7146.h
+++ b/include/media/drv-intf/saa7146.h
@@ -118,7 +118,7 @@ struct saa7146_dev
 {
 	struct module			*module;
 
-	struct v4l2_device 		v4l2_dev;
+	struct v4l2_device		v4l2_dev;
 	struct v4l2_ctrl_handler	ctrl_handler;
 
 	/* different device locks */
diff --git a/include/media/i2c/bt819.h b/include/media/i2c/bt819.h
index 8025f4bc2bb6..1bcf0dbeb516 100644
--- a/include/media/i2c/bt819.h
+++ b/include/media/i2c/bt819.h
@@ -30,7 +30,7 @@
 
    Note: these ioctls that internal to the kernel and are never called
    from userspace. */
-#define BT819_FIFO_RESET_LOW 	_IO('b', 0)
-#define BT819_FIFO_RESET_HIGH 	_IO('b', 1)
+#define BT819_FIFO_RESET_LOW	_IO('b', 0)
+#define BT819_FIFO_RESET_HIGH	_IO('b', 1)
 
 #endif
diff --git a/include/media/i2c/m52790.h b/include/media/i2c/m52790.h
index 7ddffae31a67..8d9db3cf6fab 100644
--- a/include/media/i2c/m52790.h
+++ b/include/media/i2c/m52790.h
@@ -23,57 +23,57 @@
 
 /* Input routing switch 1 */
 
-#define M52790_SW1_IN_MASK 	0x0003
-#define M52790_SW1_IN_TUNER 	0x0000
-#define M52790_SW1_IN_V2    	0x0001
-#define M52790_SW1_IN_V3    	0x0002
-#define M52790_SW1_IN_V4    	0x0003
+#define M52790_SW1_IN_MASK	0x0003
+#define M52790_SW1_IN_TUNER	0x0000
+#define M52790_SW1_IN_V2	0x0001
+#define M52790_SW1_IN_V3	0x0002
+#define M52790_SW1_IN_V4	0x0003
 
 /* Selects component input instead of composite */
-#define M52790_SW1_YCMIX    	0x0004
+#define M52790_SW1_YCMIX	0x0004
 
 
 /* Input routing switch 2 */
 
-#define M52790_SW2_IN_MASK 	0x0300
-#define M52790_SW2_IN_TUNER 	0x0000
-#define M52790_SW2_IN_V2    	0x0100
-#define M52790_SW2_IN_V3    	0x0200
-#define M52790_SW2_IN_V4    	0x0300
+#define M52790_SW2_IN_MASK	0x0300
+#define M52790_SW2_IN_TUNER	0x0000
+#define M52790_SW2_IN_V2	0x0100
+#define M52790_SW2_IN_V3	0x0200
+#define M52790_SW2_IN_V4	0x0300
 
 /* Selects component input instead of composite */
-#define M52790_SW2_YCMIX    	0x0400
+#define M52790_SW2_YCMIX	0x0400
 
 
 /* Output routing switch 1 */
 
 /* Enable 6dB amplifier for composite out */
-#define M52790_SW1_V_AMP    	0x0008
+#define M52790_SW1_V_AMP	0x0008
 
 /* Enable 6dB amplifier for component out */
-#define M52790_SW1_YC_AMP   	0x0010
+#define M52790_SW1_YC_AMP	0x0010
 
 /* Audio output mode */
-#define M52790_SW1_AUDIO_MASK 	0x00c0
-#define M52790_SW1_AUDIO_MUTE 	0x0000
-#define M52790_SW1_AUDIO_R 	0x0040
-#define M52790_SW1_AUDIO_L 	0x0080
+#define M52790_SW1_AUDIO_MASK	0x00c0
+#define M52790_SW1_AUDIO_MUTE	0x0000
+#define M52790_SW1_AUDIO_R	0x0040
+#define M52790_SW1_AUDIO_L	0x0080
 #define M52790_SW1_AUDIO_STEREO 0x00c0
 
 
 /* Output routing switch 2 */
 
 /* Enable 6dB amplifier for composite out */
-#define M52790_SW2_V_AMP    	0x0800
+#define M52790_SW2_V_AMP	0x0800
 
 /* Enable 6dB amplifier for component out */
-#define M52790_SW2_YC_AMP   	0x1000
+#define M52790_SW2_YC_AMP	0x1000
 
 /* Audio output mode */
-#define M52790_SW2_AUDIO_MASK 	0xc000
-#define M52790_SW2_AUDIO_MUTE 	0x0000
-#define M52790_SW2_AUDIO_R 	0x4000
-#define M52790_SW2_AUDIO_L 	0x8000
+#define M52790_SW2_AUDIO_MASK	0xc000
+#define M52790_SW2_AUDIO_MUTE	0x0000
+#define M52790_SW2_AUDIO_R	0x4000
+#define M52790_SW2_AUDIO_L	0x8000
 #define M52790_SW2_AUDIO_STEREO 0xc000
 
 
@@ -83,9 +83,9 @@
 #define M52790_IN_V3    (M52790_SW1_IN_V3 | M52790_SW2_IN_V3)
 #define M52790_IN_V4    (M52790_SW1_IN_V4 | M52790_SW2_IN_V4)
 
-#define M52790_OUT_STEREO 	(M52790_SW1_AUDIO_STEREO | \
+#define M52790_OUT_STEREO	(M52790_SW1_AUDIO_STEREO | \
 				 M52790_SW2_AUDIO_STEREO)
-#define M52790_OUT_AMP_STEREO 	(M52790_SW1_AUDIO_STEREO | \
+#define M52790_OUT_AMP_STEREO	(M52790_SW1_AUDIO_STEREO | \
 				 M52790_SW1_V_AMP | \
 				 M52790_SW2_AUDIO_STEREO | \
 				 M52790_SW2_V_AMP)
diff --git a/include/media/i2c/saa7115.h b/include/media/i2c/saa7115.h
index 53954c90e7f6..a0cda423509d 100644
--- a/include/media/i2c/saa7115.h
+++ b/include/media/i2c/saa7115.h
@@ -36,15 +36,15 @@
 #define SAA7115_SVIDEO3    9
 
 /* outputs */
-#define SAA7115_IPORT_ON    	1
-#define SAA7115_IPORT_OFF   	0
+#define SAA7115_IPORT_ON	1
+#define SAA7115_IPORT_OFF	0
 
 /* SAA7111 specific outputs. */
-#define SAA7111_VBI_BYPASS 	2
+#define SAA7111_VBI_BYPASS	2
 #define SAA7111_FMT_YUV422      0x00
-#define SAA7111_FMT_RGB 	0x40
-#define SAA7111_FMT_CCIR 	0x80
-#define SAA7111_FMT_YUV411 	0xc0
+#define SAA7111_FMT_RGB		0x40
+#define SAA7111_FMT_CCIR	0x80
+#define SAA7111_FMT_YUV411	0xc0
 
 /* config flags */
 /*
diff --git a/include/media/i2c/upd64031a.h b/include/media/i2c/upd64031a.h
index 48ec03c4ef23..1eba24dfee48 100644
--- a/include/media/i2c/upd64031a.h
+++ b/include/media/i2c/upd64031a.h
@@ -18,9 +18,9 @@
 #define _UPD64031A_H_
 
 /* Ghost reduction modes */
-#define UPD64031A_GR_ON 	0
-#define UPD64031A_GR_OFF 	1
-#define UPD64031A_GR_THROUGH 	3
+#define UPD64031A_GR_ON		0
+#define UPD64031A_GR_OFF	1
+#define UPD64031A_GR_THROUGH	3
 
 /* Direct 3D/YCS Connection */
 #define UPD64031A_3DYCS_DISABLE   (0 << 2)
diff --git a/include/media/v4l2-common.h b/include/media/v4l2-common.h
index 7fc0bc6b8007..e0d95a7c5d48 100644
--- a/include/media/v4l2-common.h
+++ b/include/media/v4l2-common.h
@@ -50,7 +50,7 @@
 /* These three macros assume that the debug level is set with a module
    parameter called 'debug'. */
 #define v4l_dbg(level, debug, client, fmt, arg...)			     \
-	do { 								     \
+	do {								     \
 		if (debug >= (level))					     \
 			v4l_client_printk(KERN_DEBUG, client, fmt , ## arg); \
 	} while (0)
@@ -80,9 +80,9 @@
 /* These three macros assume that the debug level is set with a module
    parameter called 'debug'. */
 #define v4l2_dbg(level, debug, dev, fmt, arg...)			\
-	do { 								\
+	do {								\
 		if (debug >= (level))					\
-			v4l2_printk(KERN_DEBUG, dev, fmt , ## arg); 	\
+			v4l2_printk(KERN_DEBUG, dev, fmt , ## arg);	\
 	} while (0)
 
 /**
@@ -266,7 +266,7 @@ struct v4l2_priv_tun_config {
 };
 #define TUNER_SET_CONFIG           _IOW('d', 92, struct v4l2_priv_tun_config)
 
-#define VIDIOC_INT_RESET            	_IOW ('d', 102, u32)
+#define VIDIOC_INT_RESET		_IOW ('d', 102, u32)
 
 /* ------------------------------------------------------------------------- */
 
diff --git a/include/uapi/linux/dvb/video.h b/include/uapi/linux/dvb/video.h
index 4d51f98182bb..df3d7028c807 100644
--- a/include/uapi/linux/dvb/video.h
+++ b/include/uapi/linux/dvb/video.h
@@ -83,11 +83,11 @@ typedef enum {
 #define VIDEO_CMD_CONTINUE    (3)
 
 /* Flags for VIDEO_CMD_FREEZE */
-#define VIDEO_CMD_FREEZE_TO_BLACK     	(1 << 0)
+#define VIDEO_CMD_FREEZE_TO_BLACK	(1 << 0)
 
 /* Flags for VIDEO_CMD_STOP */
-#define VIDEO_CMD_STOP_TO_BLACK      	(1 << 0)
-#define VIDEO_CMD_STOP_IMMEDIATELY     	(1 << 1)
+#define VIDEO_CMD_STOP_TO_BLACK		(1 << 0)
+#define VIDEO_CMD_STOP_IMMEDIATELY	(1 << 1)
 
 /* Play input formats: */
 /* The decoder has no special format requirements */
@@ -124,8 +124,8 @@ struct video_command {
 /* FIELD_UNKNOWN can be used if the hardware does not know whether
    the Vsync is for an odd, even or progressive (i.e. non-interlaced)
    field. */
-#define VIDEO_VSYNC_FIELD_UNKNOWN  	(0)
-#define VIDEO_VSYNC_FIELD_ODD 		(1)
+#define VIDEO_VSYNC_FIELD_UNKNOWN	(0)
+#define VIDEO_VSYNC_FIELD_ODD		(1)
 #define VIDEO_VSYNC_FIELD_EVEN		(2)
 #define VIDEO_VSYNC_FIELD_PROGRESSIVE	(3)
 
@@ -133,8 +133,8 @@ struct video_event {
 	__s32 type;
 #define VIDEO_EVENT_SIZE_CHANGED	1
 #define VIDEO_EVENT_FRAME_RATE_CHANGED	2
-#define VIDEO_EVENT_DECODER_STOPPED 	3
-#define VIDEO_EVENT_VSYNC 		4
+#define VIDEO_EVENT_DECODER_STOPPED	3
+#define VIDEO_EVENT_VSYNC		4
 	/* unused, make sure to use atomic time for y2038 if it ever gets used */
 	long timestamp;
 	union {
@@ -268,9 +268,9 @@ typedef __u16 video_attributes_t;
 #define VIDEO_GET_PTS              _IOR('o', 57, __u64)
 
 /* Read the number of displayed frames since the decoder was started */
-#define VIDEO_GET_FRAME_COUNT  	   _IOR('o', 58, __u64)
+#define VIDEO_GET_FRAME_COUNT	   _IOR('o', 58, __u64)
 
-#define VIDEO_COMMAND     	   _IOWR('o', 59, struct video_command)
-#define VIDEO_TRY_COMMAND 	   _IOWR('o', 60, struct video_command)
+#define VIDEO_COMMAND		   _IOWR('o', 59, struct video_command)
+#define VIDEO_TRY_COMMAND	   _IOWR('o', 60, struct video_command)
 
 #endif /* _UAPI_DVBVIDEO_H_ */
diff --git a/include/uapi/linux/v4l2-controls.h b/include/uapi/linux/v4l2-controls.h
index a692623e0236..cbbb750d87d1 100644
--- a/include/uapi/linux/v4l2-controls.h
+++ b/include/uapi/linux/v4l2-controls.h
@@ -67,8 +67,8 @@
 /* User-class control IDs */
 
 #define V4L2_CID_BASE			(V4L2_CTRL_CLASS_USER | 0x900)
-#define V4L2_CID_USER_BASE 		V4L2_CID_BASE
-#define V4L2_CID_USER_CLASS 		(V4L2_CTRL_CLASS_USER | 1)
+#define V4L2_CID_USER_BASE		V4L2_CID_BASE
+#define V4L2_CID_USER_CLASS		(V4L2_CTRL_CLASS_USER | 1)
 #define V4L2_CID_BRIGHTNESS		(V4L2_CID_BASE+0)
 #define V4L2_CID_CONTRAST		(V4L2_CID_BASE+1)
 #define V4L2_CID_SATURATION		(V4L2_CID_BASE+2)
@@ -102,7 +102,7 @@ enum v4l2_power_line_frequency {
 #define V4L2_CID_HUE_AUTO			(V4L2_CID_BASE+25)
 #define V4L2_CID_WHITE_BALANCE_TEMPERATURE	(V4L2_CID_BASE+26)
 #define V4L2_CID_SHARPNESS			(V4L2_CID_BASE+27)
-#define V4L2_CID_BACKLIGHT_COMPENSATION 	(V4L2_CID_BASE+28)
+#define V4L2_CID_BACKLIGHT_COMPENSATION		(V4L2_CID_BASE+28)
 #define V4L2_CID_CHROMA_AGC                     (V4L2_CID_BASE+29)
 #define V4L2_CID_COLOR_KILLER                   (V4L2_CID_BASE+30)
 #define V4L2_CID_COLORFX			(V4L2_CID_BASE+31)
@@ -194,11 +194,11 @@ enum v4l2_colorfx {
 /* The MPEG controls are applicable to all codec controls
  * and the 'MPEG' part of the define is historical */
 
-#define V4L2_CID_MPEG_BASE 			(V4L2_CTRL_CLASS_MPEG | 0x900)
-#define V4L2_CID_MPEG_CLASS 			(V4L2_CTRL_CLASS_MPEG | 1)
+#define V4L2_CID_MPEG_BASE			(V4L2_CTRL_CLASS_MPEG | 0x900)
+#define V4L2_CID_MPEG_CLASS			(V4L2_CTRL_CLASS_MPEG | 1)
 
 /*  MPEG streams, specific to multiplexed streams */
-#define V4L2_CID_MPEG_STREAM_TYPE 		(V4L2_CID_MPEG_BASE+0)
+#define V4L2_CID_MPEG_STREAM_TYPE		(V4L2_CID_MPEG_BASE+0)
 enum v4l2_mpeg_stream_type {
 	V4L2_MPEG_STREAM_TYPE_MPEG2_PS   = 0, /* MPEG-2 program stream */
 	V4L2_MPEG_STREAM_TYPE_MPEG2_TS   = 1, /* MPEG-2 transport stream */
@@ -207,26 +207,26 @@ enum v4l2_mpeg_stream_type {
 	V4L2_MPEG_STREAM_TYPE_MPEG1_VCD  = 4, /* MPEG-1 VCD-compatible stream */
 	V4L2_MPEG_STREAM_TYPE_MPEG2_SVCD = 5, /* MPEG-2 SVCD-compatible stream */
 };
-#define V4L2_CID_MPEG_STREAM_PID_PMT 		(V4L2_CID_MPEG_BASE+1)
-#define V4L2_CID_MPEG_STREAM_PID_AUDIO 		(V4L2_CID_MPEG_BASE+2)
-#define V4L2_CID_MPEG_STREAM_PID_VIDEO 		(V4L2_CID_MPEG_BASE+3)
-#define V4L2_CID_MPEG_STREAM_PID_PCR 		(V4L2_CID_MPEG_BASE+4)
-#define V4L2_CID_MPEG_STREAM_PES_ID_AUDIO 	(V4L2_CID_MPEG_BASE+5)
-#define V4L2_CID_MPEG_STREAM_PES_ID_VIDEO 	(V4L2_CID_MPEG_BASE+6)
-#define V4L2_CID_MPEG_STREAM_VBI_FMT 		(V4L2_CID_MPEG_BASE+7)
+#define V4L2_CID_MPEG_STREAM_PID_PMT		(V4L2_CID_MPEG_BASE+1)
+#define V4L2_CID_MPEG_STREAM_PID_AUDIO		(V4L2_CID_MPEG_BASE+2)
+#define V4L2_CID_MPEG_STREAM_PID_VIDEO		(V4L2_CID_MPEG_BASE+3)
+#define V4L2_CID_MPEG_STREAM_PID_PCR		(V4L2_CID_MPEG_BASE+4)
+#define V4L2_CID_MPEG_STREAM_PES_ID_AUDIO	(V4L2_CID_MPEG_BASE+5)
+#define V4L2_CID_MPEG_STREAM_PES_ID_VIDEO	(V4L2_CID_MPEG_BASE+6)
+#define V4L2_CID_MPEG_STREAM_VBI_FMT		(V4L2_CID_MPEG_BASE+7)
 enum v4l2_mpeg_stream_vbi_fmt {
 	V4L2_MPEG_STREAM_VBI_FMT_NONE = 0,  /* No VBI in the MPEG stream */
 	V4L2_MPEG_STREAM_VBI_FMT_IVTV = 1,  /* VBI in private packets, IVTV format */
 };
 
 /*  MPEG audio controls specific to multiplexed streams  */
-#define V4L2_CID_MPEG_AUDIO_SAMPLING_FREQ 	(V4L2_CID_MPEG_BASE+100)
+#define V4L2_CID_MPEG_AUDIO_SAMPLING_FREQ	(V4L2_CID_MPEG_BASE+100)
 enum v4l2_mpeg_audio_sampling_freq {
 	V4L2_MPEG_AUDIO_SAMPLING_FREQ_44100 = 0,
 	V4L2_MPEG_AUDIO_SAMPLING_FREQ_48000 = 1,
 	V4L2_MPEG_AUDIO_SAMPLING_FREQ_32000 = 2,
 };
-#define V4L2_CID_MPEG_AUDIO_ENCODING 		(V4L2_CID_MPEG_BASE+101)
+#define V4L2_CID_MPEG_AUDIO_ENCODING		(V4L2_CID_MPEG_BASE+101)
 enum v4l2_mpeg_audio_encoding {
 	V4L2_MPEG_AUDIO_ENCODING_LAYER_1 = 0,
 	V4L2_MPEG_AUDIO_ENCODING_LAYER_2 = 1,
@@ -234,7 +234,7 @@ enum v4l2_mpeg_audio_encoding {
 	V4L2_MPEG_AUDIO_ENCODING_AAC     = 3,
 	V4L2_MPEG_AUDIO_ENCODING_AC3     = 4,
 };
-#define V4L2_CID_MPEG_AUDIO_L1_BITRATE 		(V4L2_CID_MPEG_BASE+102)
+#define V4L2_CID_MPEG_AUDIO_L1_BITRATE		(V4L2_CID_MPEG_BASE+102)
 enum v4l2_mpeg_audio_l1_bitrate {
 	V4L2_MPEG_AUDIO_L1_BITRATE_32K  = 0,
 	V4L2_MPEG_AUDIO_L1_BITRATE_64K  = 1,
@@ -251,7 +251,7 @@ enum v4l2_mpeg_audio_l1_bitrate {
 	V4L2_MPEG_AUDIO_L1_BITRATE_416K = 12,
 	V4L2_MPEG_AUDIO_L1_BITRATE_448K = 13,
 };
-#define V4L2_CID_MPEG_AUDIO_L2_BITRATE 		(V4L2_CID_MPEG_BASE+103)
+#define V4L2_CID_MPEG_AUDIO_L2_BITRATE		(V4L2_CID_MPEG_BASE+103)
 enum v4l2_mpeg_audio_l2_bitrate {
 	V4L2_MPEG_AUDIO_L2_BITRATE_32K  = 0,
 	V4L2_MPEG_AUDIO_L2_BITRATE_48K  = 1,
@@ -268,7 +268,7 @@ enum v4l2_mpeg_audio_l2_bitrate {
 	V4L2_MPEG_AUDIO_L2_BITRATE_320K = 12,
 	V4L2_MPEG_AUDIO_L2_BITRATE_384K = 13,
 };
-#define V4L2_CID_MPEG_AUDIO_L3_BITRATE 		(V4L2_CID_MPEG_BASE+104)
+#define V4L2_CID_MPEG_AUDIO_L3_BITRATE		(V4L2_CID_MPEG_BASE+104)
 enum v4l2_mpeg_audio_l3_bitrate {
 	V4L2_MPEG_AUDIO_L3_BITRATE_32K  = 0,
 	V4L2_MPEG_AUDIO_L3_BITRATE_40K  = 1,
@@ -285,32 +285,32 @@ enum v4l2_mpeg_audio_l3_bitrate {
 	V4L2_MPEG_AUDIO_L3_BITRATE_256K = 12,
 	V4L2_MPEG_AUDIO_L3_BITRATE_320K = 13,
 };
-#define V4L2_CID_MPEG_AUDIO_MODE 		(V4L2_CID_MPEG_BASE+105)
+#define V4L2_CID_MPEG_AUDIO_MODE		(V4L2_CID_MPEG_BASE+105)
 enum v4l2_mpeg_audio_mode {
 	V4L2_MPEG_AUDIO_MODE_STEREO       = 0,
 	V4L2_MPEG_AUDIO_MODE_JOINT_STEREO = 1,
 	V4L2_MPEG_AUDIO_MODE_DUAL         = 2,
 	V4L2_MPEG_AUDIO_MODE_MONO         = 3,
 };
-#define V4L2_CID_MPEG_AUDIO_MODE_EXTENSION 	(V4L2_CID_MPEG_BASE+106)
+#define V4L2_CID_MPEG_AUDIO_MODE_EXTENSION	(V4L2_CID_MPEG_BASE+106)
 enum v4l2_mpeg_audio_mode_extension {
 	V4L2_MPEG_AUDIO_MODE_EXTENSION_BOUND_4  = 0,
 	V4L2_MPEG_AUDIO_MODE_EXTENSION_BOUND_8  = 1,
 	V4L2_MPEG_AUDIO_MODE_EXTENSION_BOUND_12 = 2,
 	V4L2_MPEG_AUDIO_MODE_EXTENSION_BOUND_16 = 3,
 };
-#define V4L2_CID_MPEG_AUDIO_EMPHASIS 		(V4L2_CID_MPEG_BASE+107)
+#define V4L2_CID_MPEG_AUDIO_EMPHASIS		(V4L2_CID_MPEG_BASE+107)
 enum v4l2_mpeg_audio_emphasis {
 	V4L2_MPEG_AUDIO_EMPHASIS_NONE         = 0,
 	V4L2_MPEG_AUDIO_EMPHASIS_50_DIV_15_uS = 1,
 	V4L2_MPEG_AUDIO_EMPHASIS_CCITT_J17    = 2,
 };
-#define V4L2_CID_MPEG_AUDIO_CRC 		(V4L2_CID_MPEG_BASE+108)
+#define V4L2_CID_MPEG_AUDIO_CRC			(V4L2_CID_MPEG_BASE+108)
 enum v4l2_mpeg_audio_crc {
 	V4L2_MPEG_AUDIO_CRC_NONE  = 0,
 	V4L2_MPEG_AUDIO_CRC_CRC16 = 1,
 };
-#define V4L2_CID_MPEG_AUDIO_MUTE 		(V4L2_CID_MPEG_BASE+109)
+#define V4L2_CID_MPEG_AUDIO_MUTE		(V4L2_CID_MPEG_BASE+109)
 #define V4L2_CID_MPEG_AUDIO_AAC_BITRATE		(V4L2_CID_MPEG_BASE+110)
 #define V4L2_CID_MPEG_AUDIO_AC3_BITRATE		(V4L2_CID_MPEG_BASE+111)
 enum v4l2_mpeg_audio_ac3_bitrate {
@@ -346,33 +346,33 @@ enum v4l2_mpeg_audio_dec_playback {
 #define V4L2_CID_MPEG_AUDIO_DEC_MULTILINGUAL_PLAYBACK (V4L2_CID_MPEG_BASE+113)
 
 /*  MPEG video controls specific to multiplexed streams */
-#define V4L2_CID_MPEG_VIDEO_ENCODING 		(V4L2_CID_MPEG_BASE+200)
+#define V4L2_CID_MPEG_VIDEO_ENCODING		(V4L2_CID_MPEG_BASE+200)
 enum v4l2_mpeg_video_encoding {
 	V4L2_MPEG_VIDEO_ENCODING_MPEG_1     = 0,
 	V4L2_MPEG_VIDEO_ENCODING_MPEG_2     = 1,
 	V4L2_MPEG_VIDEO_ENCODING_MPEG_4_AVC = 2,
 };
-#define V4L2_CID_MPEG_VIDEO_ASPECT 		(V4L2_CID_MPEG_BASE+201)
+#define V4L2_CID_MPEG_VIDEO_ASPECT		(V4L2_CID_MPEG_BASE+201)
 enum v4l2_mpeg_video_aspect {
 	V4L2_MPEG_VIDEO_ASPECT_1x1     = 0,
 	V4L2_MPEG_VIDEO_ASPECT_4x3     = 1,
 	V4L2_MPEG_VIDEO_ASPECT_16x9    = 2,
 	V4L2_MPEG_VIDEO_ASPECT_221x100 = 3,
 };
-#define V4L2_CID_MPEG_VIDEO_B_FRAMES 		(V4L2_CID_MPEG_BASE+202)
-#define V4L2_CID_MPEG_VIDEO_GOP_SIZE 		(V4L2_CID_MPEG_BASE+203)
-#define V4L2_CID_MPEG_VIDEO_GOP_CLOSURE 	(V4L2_CID_MPEG_BASE+204)
-#define V4L2_CID_MPEG_VIDEO_PULLDOWN 		(V4L2_CID_MPEG_BASE+205)
-#define V4L2_CID_MPEG_VIDEO_BITRATE_MODE 	(V4L2_CID_MPEG_BASE+206)
+#define V4L2_CID_MPEG_VIDEO_B_FRAMES		(V4L2_CID_MPEG_BASE+202)
+#define V4L2_CID_MPEG_VIDEO_GOP_SIZE		(V4L2_CID_MPEG_BASE+203)
+#define V4L2_CID_MPEG_VIDEO_GOP_CLOSURE		(V4L2_CID_MPEG_BASE+204)
+#define V4L2_CID_MPEG_VIDEO_PULLDOWN		(V4L2_CID_MPEG_BASE+205)
+#define V4L2_CID_MPEG_VIDEO_BITRATE_MODE	(V4L2_CID_MPEG_BASE+206)
 enum v4l2_mpeg_video_bitrate_mode {
 	V4L2_MPEG_VIDEO_BITRATE_MODE_VBR = 0,
 	V4L2_MPEG_VIDEO_BITRATE_MODE_CBR = 1,
 };
-#define V4L2_CID_MPEG_VIDEO_BITRATE 		(V4L2_CID_MPEG_BASE+207)
-#define V4L2_CID_MPEG_VIDEO_BITRATE_PEAK 	(V4L2_CID_MPEG_BASE+208)
+#define V4L2_CID_MPEG_VIDEO_BITRATE		(V4L2_CID_MPEG_BASE+207)
+#define V4L2_CID_MPEG_VIDEO_BITRATE_PEAK	(V4L2_CID_MPEG_BASE+208)
 #define V4L2_CID_MPEG_VIDEO_TEMPORAL_DECIMATION (V4L2_CID_MPEG_BASE+209)
-#define V4L2_CID_MPEG_VIDEO_MUTE 		(V4L2_CID_MPEG_BASE+210)
-#define V4L2_CID_MPEG_VIDEO_MUTE_YUV 		(V4L2_CID_MPEG_BASE+211)
+#define V4L2_CID_MPEG_VIDEO_MUTE		(V4L2_CID_MPEG_BASE+210)
+#define V4L2_CID_MPEG_VIDEO_MUTE_YUV		(V4L2_CID_MPEG_BASE+211)
 #define V4L2_CID_MPEG_VIDEO_DECODER_SLICE_INTERFACE		(V4L2_CID_MPEG_BASE+212)
 #define V4L2_CID_MPEG_VIDEO_DECODER_MPEG4_DEBLOCK_FILTER	(V4L2_CID_MPEG_BASE+213)
 #define V4L2_CID_MPEG_VIDEO_CYCLIC_INTRA_REFRESH_MB		(V4L2_CID_MPEG_BASE+214)
@@ -590,14 +590,14 @@ enum v4l2_vp8_golden_frame_sel {
 #define V4L2_CID_MPEG_VIDEO_VPX_PROFILE			(V4L2_CID_MPEG_BASE+511)
 
 /*  MPEG-class control IDs specific to the CX2341x driver as defined by V4L2 */
-#define V4L2_CID_MPEG_CX2341X_BASE 				(V4L2_CTRL_CLASS_MPEG | 0x1000)
-#define V4L2_CID_MPEG_CX2341X_VIDEO_SPATIAL_FILTER_MODE 	(V4L2_CID_MPEG_CX2341X_BASE+0)
+#define V4L2_CID_MPEG_CX2341X_BASE				(V4L2_CTRL_CLASS_MPEG | 0x1000)
+#define V4L2_CID_MPEG_CX2341X_VIDEO_SPATIAL_FILTER_MODE		(V4L2_CID_MPEG_CX2341X_BASE+0)
 enum v4l2_mpeg_cx2341x_video_spatial_filter_mode {
 	V4L2_MPEG_CX2341X_VIDEO_SPATIAL_FILTER_MODE_MANUAL = 0,
 	V4L2_MPEG_CX2341X_VIDEO_SPATIAL_FILTER_MODE_AUTO   = 1,
 };
-#define V4L2_CID_MPEG_CX2341X_VIDEO_SPATIAL_FILTER 		(V4L2_CID_MPEG_CX2341X_BASE+1)
-#define V4L2_CID_MPEG_CX2341X_VIDEO_LUMA_SPATIAL_FILTER_TYPE 	(V4L2_CID_MPEG_CX2341X_BASE+2)
+#define V4L2_CID_MPEG_CX2341X_VIDEO_SPATIAL_FILTER		(V4L2_CID_MPEG_CX2341X_BASE+1)
+#define V4L2_CID_MPEG_CX2341X_VIDEO_LUMA_SPATIAL_FILTER_TYPE	(V4L2_CID_MPEG_CX2341X_BASE+2)
 enum v4l2_mpeg_cx2341x_video_luma_spatial_filter_type {
 	V4L2_MPEG_CX2341X_VIDEO_LUMA_SPATIAL_FILTER_TYPE_OFF                  = 0,
 	V4L2_MPEG_CX2341X_VIDEO_LUMA_SPATIAL_FILTER_TYPE_1D_HOR               = 1,
@@ -605,18 +605,18 @@ enum v4l2_mpeg_cx2341x_video_luma_spatial_filter_type {
 	V4L2_MPEG_CX2341X_VIDEO_LUMA_SPATIAL_FILTER_TYPE_2D_HV_SEPARABLE      = 3,
 	V4L2_MPEG_CX2341X_VIDEO_LUMA_SPATIAL_FILTER_TYPE_2D_SYM_NON_SEPARABLE = 4,
 };
-#define V4L2_CID_MPEG_CX2341X_VIDEO_CHROMA_SPATIAL_FILTER_TYPE 	(V4L2_CID_MPEG_CX2341X_BASE+3)
+#define V4L2_CID_MPEG_CX2341X_VIDEO_CHROMA_SPATIAL_FILTER_TYPE	(V4L2_CID_MPEG_CX2341X_BASE+3)
 enum v4l2_mpeg_cx2341x_video_chroma_spatial_filter_type {
 	V4L2_MPEG_CX2341X_VIDEO_CHROMA_SPATIAL_FILTER_TYPE_OFF    = 0,
 	V4L2_MPEG_CX2341X_VIDEO_CHROMA_SPATIAL_FILTER_TYPE_1D_HOR = 1,
 };
-#define V4L2_CID_MPEG_CX2341X_VIDEO_TEMPORAL_FILTER_MODE 	(V4L2_CID_MPEG_CX2341X_BASE+4)
+#define V4L2_CID_MPEG_CX2341X_VIDEO_TEMPORAL_FILTER_MODE	(V4L2_CID_MPEG_CX2341X_BASE+4)
 enum v4l2_mpeg_cx2341x_video_temporal_filter_mode {
 	V4L2_MPEG_CX2341X_VIDEO_TEMPORAL_FILTER_MODE_MANUAL = 0,
 	V4L2_MPEG_CX2341X_VIDEO_TEMPORAL_FILTER_MODE_AUTO   = 1,
 };
-#define V4L2_CID_MPEG_CX2341X_VIDEO_TEMPORAL_FILTER 		(V4L2_CID_MPEG_CX2341X_BASE+5)
-#define V4L2_CID_MPEG_CX2341X_VIDEO_MEDIAN_FILTER_TYPE 		(V4L2_CID_MPEG_CX2341X_BASE+6)
+#define V4L2_CID_MPEG_CX2341X_VIDEO_TEMPORAL_FILTER		(V4L2_CID_MPEG_CX2341X_BASE+5)
+#define V4L2_CID_MPEG_CX2341X_VIDEO_MEDIAN_FILTER_TYPE		(V4L2_CID_MPEG_CX2341X_BASE+6)
 enum v4l2_mpeg_cx2341x_video_median_filter_type {
 	V4L2_MPEG_CX2341X_VIDEO_MEDIAN_FILTER_TYPE_OFF      = 0,
 	V4L2_MPEG_CX2341X_VIDEO_MEDIAN_FILTER_TYPE_HOR      = 1,
@@ -624,11 +624,11 @@ enum v4l2_mpeg_cx2341x_video_median_filter_type {
 	V4L2_MPEG_CX2341X_VIDEO_MEDIAN_FILTER_TYPE_HOR_VERT = 3,
 	V4L2_MPEG_CX2341X_VIDEO_MEDIAN_FILTER_TYPE_DIAG     = 4,
 };
-#define V4L2_CID_MPEG_CX2341X_VIDEO_LUMA_MEDIAN_FILTER_BOTTOM 	(V4L2_CID_MPEG_CX2341X_BASE+7)
-#define V4L2_CID_MPEG_CX2341X_VIDEO_LUMA_MEDIAN_FILTER_TOP 	(V4L2_CID_MPEG_CX2341X_BASE+8)
+#define V4L2_CID_MPEG_CX2341X_VIDEO_LUMA_MEDIAN_FILTER_BOTTOM	(V4L2_CID_MPEG_CX2341X_BASE+7)
+#define V4L2_CID_MPEG_CX2341X_VIDEO_LUMA_MEDIAN_FILTER_TOP	(V4L2_CID_MPEG_CX2341X_BASE+8)
 #define V4L2_CID_MPEG_CX2341X_VIDEO_CHROMA_MEDIAN_FILTER_BOTTOM	(V4L2_CID_MPEG_CX2341X_BASE+9)
-#define V4L2_CID_MPEG_CX2341X_VIDEO_CHROMA_MEDIAN_FILTER_TOP 	(V4L2_CID_MPEG_CX2341X_BASE+10)
-#define V4L2_CID_MPEG_CX2341X_STREAM_INSERT_NAV_PACKETS 	(V4L2_CID_MPEG_CX2341X_BASE+11)
+#define V4L2_CID_MPEG_CX2341X_VIDEO_CHROMA_MEDIAN_FILTER_TOP	(V4L2_CID_MPEG_CX2341X_BASE+10)
+#define V4L2_CID_MPEG_CX2341X_STREAM_INSERT_NAV_PACKETS		(V4L2_CID_MPEG_CX2341X_BASE+11)
 
 /*  MPEG-class control IDs specific to the Samsung MFC 5.1 driver as defined by V4L2 */
 #define V4L2_CID_MPEG_MFC51_BASE				(V4L2_CTRL_CLASS_MPEG | 0x1100)
@@ -660,8 +660,8 @@ enum v4l2_mpeg_mfc51_video_force_frame_type {
 
 /*  Camera class control IDs */
 
-#define V4L2_CID_CAMERA_CLASS_BASE 	(V4L2_CTRL_CLASS_CAMERA | 0x900)
-#define V4L2_CID_CAMERA_CLASS 		(V4L2_CTRL_CLASS_CAMERA | 1)
+#define V4L2_CID_CAMERA_CLASS_BASE	(V4L2_CTRL_CLASS_CAMERA | 0x900)
+#define V4L2_CID_CAMERA_CLASS		(V4L2_CTRL_CLASS_CAMERA | 1)
 
 #define V4L2_CID_EXPOSURE_AUTO			(V4L2_CID_CAMERA_CLASS_BASE+1)
 enum  v4l2_exposure_auto_type {
diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h
index faa97fda588a..982718965180 100644
--- a/include/uapi/linux/videodev2.h
+++ b/include/uapi/linux/videodev2.h
@@ -107,14 +107,14 @@ enum v4l2_field {
 					 transmitted first */
 };
 #define V4L2_FIELD_HAS_TOP(field)	\
-	((field) == V4L2_FIELD_TOP 	||\
+	((field) == V4L2_FIELD_TOP	||\
 	 (field) == V4L2_FIELD_INTERLACED ||\
 	 (field) == V4L2_FIELD_INTERLACED_TB ||\
 	 (field) == V4L2_FIELD_INTERLACED_BT ||\
 	 (field) == V4L2_FIELD_SEQ_TB	||\
 	 (field) == V4L2_FIELD_SEQ_BT)
 #define V4L2_FIELD_HAS_BOTTOM(field)	\
-	((field) == V4L2_FIELD_BOTTOM 	||\
+	((field) == V4L2_FIELD_BOTTOM	||\
 	 (field) == V4L2_FIELD_INTERLACED ||\
 	 (field) == V4L2_FIELD_INTERLACED_TB ||\
 	 (field) == V4L2_FIELD_INTERLACED_BT ||\
@@ -467,12 +467,12 @@ struct v4l2_capability {
  *	V I D E O   I M A G E   F O R M A T
  */
 struct v4l2_pix_format {
-	__u32         		width;
+	__u32			width;
 	__u32			height;
 	__u32			pixelformat;
 	__u32			field;		/* enum v4l2_field */
-	__u32            	bytesperline;	/* for padding, zero if unused */
-	__u32          		sizeimage;
+	__u32			bytesperline;	/* for padding, zero if unused */
+	__u32			sizeimage;
 	__u32			colorspace;	/* enum v4l2_colorspace */
 	__u32			priv;		/* private data, depends on pixelformat */
 	__u32			flags;		/* format flags (V4L2_PIX_FMT_FLAG_*) */
@@ -1173,7 +1173,7 @@ typedef __u64 v4l2_std_id;
 				 V4L2_STD_NTSC_M_JP     |\
 				 V4L2_STD_NTSC_M_KR)
 /* Secam macros */
-#define V4L2_STD_SECAM_DK      	(V4L2_STD_SECAM_D	|\
+#define V4L2_STD_SECAM_DK	(V4L2_STD_SECAM_D	|\
 				 V4L2_STD_SECAM_K	|\
 				 V4L2_STD_SECAM_K1)
 /* All Secam Standards */
@@ -1254,7 +1254,7 @@ struct v4l2_standard {
 };
 
 /*
- *	D V 	B T	T I M I N G S
+ *	D V	B T	T I M I N G S
  */
 
 /** struct v4l2_bt_timings - BT.656/BT.1120 timing data
@@ -1595,7 +1595,7 @@ struct v4l2_ext_controls {
 	struct v4l2_ext_control *controls;
 };
 
-#define V4L2_CTRL_ID_MASK      	  (0x0fffffff)
+#define V4L2_CTRL_ID_MASK	  (0x0fffffff)
 #ifndef __KERNEL__
 #define V4L2_CTRL_ID2CLASS(id)    ((id) & 0x0fff0000UL)
 #endif
@@ -1667,11 +1667,11 @@ struct v4l2_querymenu {
 /*  Control flags  */
 #define V4L2_CTRL_FLAG_DISABLED		0x0001
 #define V4L2_CTRL_FLAG_GRABBED		0x0002
-#define V4L2_CTRL_FLAG_READ_ONLY 	0x0004
-#define V4L2_CTRL_FLAG_UPDATE 		0x0008
-#define V4L2_CTRL_FLAG_INACTIVE 	0x0010
-#define V4L2_CTRL_FLAG_SLIDER 		0x0020
-#define V4L2_CTRL_FLAG_WRITE_ONLY 	0x0040
+#define V4L2_CTRL_FLAG_READ_ONLY	0x0004
+#define V4L2_CTRL_FLAG_UPDATE		0x0008
+#define V4L2_CTRL_FLAG_INACTIVE		0x0010
+#define V4L2_CTRL_FLAG_SLIDER		0x0020
+#define V4L2_CTRL_FLAG_WRITE_ONLY	0x0040
 #define V4L2_CTRL_FLAG_VOLATILE		0x0080
 #define V4L2_CTRL_FLAG_HAS_PAYLOAD	0x0100
 #define V4L2_CTRL_FLAG_EXECUTE_ON_WRITE	0x0200
@@ -1785,21 +1785,21 @@ struct v4l2_hw_freq_seek {
  */
 
 struct v4l2_rds_data {
-	__u8 	lsb;
-	__u8 	msb;
-	__u8 	block;
+	__u8	lsb;
+	__u8	msb;
+	__u8	block;
 } __attribute__ ((packed));
 
-#define V4L2_RDS_BLOCK_MSK 	 0x7
-#define V4L2_RDS_BLOCK_A 	 0
-#define V4L2_RDS_BLOCK_B 	 1
-#define V4L2_RDS_BLOCK_C 	 2
-#define V4L2_RDS_BLOCK_D 	 3
-#define V4L2_RDS_BLOCK_C_ALT 	 4
-#define V4L2_RDS_BLOCK_INVALID 	 7
+#define V4L2_RDS_BLOCK_MSK	 0x7
+#define V4L2_RDS_BLOCK_A	 0
+#define V4L2_RDS_BLOCK_B	 1
+#define V4L2_RDS_BLOCK_C	 2
+#define V4L2_RDS_BLOCK_D	 3
+#define V4L2_RDS_BLOCK_C_ALT	 4
+#define V4L2_RDS_BLOCK_INVALID	 7
 
 #define V4L2_RDS_BLOCK_CORRECTED 0x40
-#define V4L2_RDS_BLOCK_ERROR 	 0x80
+#define V4L2_RDS_BLOCK_ERROR	 0x80
 
 /*
  *	A U D I O
@@ -2355,8 +2355,8 @@ struct v4l2_create_buffers {
 #define VIDIOC_S_CROP		 _IOW('V', 60, struct v4l2_crop)
 #define VIDIOC_G_JPEGCOMP	 _IOR('V', 61, struct v4l2_jpegcompression)
 #define VIDIOC_S_JPEGCOMP	 _IOW('V', 62, struct v4l2_jpegcompression)
-#define VIDIOC_QUERYSTD      	 _IOR('V', 63, v4l2_std_id)
-#define VIDIOC_TRY_FMT      	_IOWR('V', 64, struct v4l2_format)
+#define VIDIOC_QUERYSTD		 _IOR('V', 63, v4l2_std_id)
+#define VIDIOC_TRY_FMT		_IOWR('V', 64, struct v4l2_format)
 #define VIDIOC_ENUMAUDIO	_IOWR('V', 65, struct v4l2_audio)
 #define VIDIOC_ENUMAUDOUT	_IOWR('V', 66, struct v4l2_audioout)
 #define VIDIOC_G_PRIORITY	 _IOR('V', 67, __u32) /* enum v4l2_priority */
@@ -2377,8 +2377,8 @@ struct v4l2_create_buffers {
  * Only implemented if CONFIG_VIDEO_ADV_DEBUG is defined.
  * You must be root to use these ioctls. Never use these in applications!
  */
-#define	VIDIOC_DBG_S_REGISTER 	 _IOW('V', 79, struct v4l2_dbg_register)
-#define	VIDIOC_DBG_G_REGISTER 	_IOWR('V', 80, struct v4l2_dbg_register)
+#define	VIDIOC_DBG_S_REGISTER	 _IOW('V', 79, struct v4l2_dbg_register)
+#define	VIDIOC_DBG_G_REGISTER	_IOWR('V', 80, struct v4l2_dbg_register)
 
 #define VIDIOC_S_HW_FREQ_SEEK	 _IOW('V', 82, struct v4l2_hw_freq_seek)
 #define	VIDIOC_S_DV_TIMINGS	_IOWR('V', 87, struct v4l2_dv_timings)
-- 
cgit v1.2.3


From 6926e041a8920c8ec27e4e155efa760aa01551fd Mon Sep 17 00:00:00 2001
From: Hauke Mehrtens <hauke@hauke-m.de>
Date: Wed, 3 Jan 2018 23:14:21 +0100
Subject: uapi/if_ether.h: prevent redefinition of struct ethhdr

Musl provides its own ethhdr struct definition. Add a guard to suppress
the kernel's definition if the appropriate musl header has already been
included.

glibc does not implement this header, but if it does so in the future it
can simply define __UAPI_DEF_ETHHDR to 0 to make it work with the
kernel.

Signed-off-by: Hauke Mehrtens <hauke@hauke-m.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_ether.h    | 3 +++
 include/uapi/linux/libc-compat.h | 6 ++++++
 2 files changed, 9 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h
index 3ee3bf7c8526..144de4d2f385 100644
--- a/include/uapi/linux/if_ether.h
+++ b/include/uapi/linux/if_ether.h
@@ -23,6 +23,7 @@
 #define _UAPI_LINUX_IF_ETHER_H
 
 #include <linux/types.h>
+#include <linux/libc-compat.h>
 
 /*
  *	IEEE 802.3 Ethernet magic constants.  The frame sizes omit the preamble
@@ -149,11 +150,13 @@
  *	This is an Ethernet frame header.
  */
 
+#if __UAPI_DEF_ETHHDR
 struct ethhdr {
 	unsigned char	h_dest[ETH_ALEN];	/* destination eth addr	*/
 	unsigned char	h_source[ETH_ALEN];	/* source ether addr	*/
 	__be16		h_proto;		/* packet type ID field	*/
 } __attribute__((packed));
+#endif
 
 
 #endif /* _UAPI_LINUX_IF_ETHER_H */
diff --git a/include/uapi/linux/libc-compat.h b/include/uapi/linux/libc-compat.h
index 8254c937c9f4..fc29efaa918c 100644
--- a/include/uapi/linux/libc-compat.h
+++ b/include/uapi/linux/libc-compat.h
@@ -264,4 +264,10 @@
 
 #endif /* __GLIBC__ */
 
+/* Definitions for if_ether.h */
+/* allow libcs like musl to deactivate this, glibc does not implement this. */
+#ifndef __UAPI_DEF_ETHHDR
+#define __UAPI_DEF_ETHHDR		1
+#endif
+
 #endif /* _UAPI_LIBC_COMPAT_H */
-- 
cgit v1.2.3


From e53927393b9987b7c986b6364c27111077f0ea3e Mon Sep 17 00:00:00 2001
From: Javier González <javier@cnexlabs.com>
Date: Fri, 5 Jan 2018 14:16:14 +0100
Subject: lightnvm: set target over-provision on create ioctl
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Allow the over-provision percentage to be set on target creation. If no
value is provided, fall back to the default value set by the target.

In pblk, set the default OP to 11% of the total size of the device.

Signed-off-by: Javier González <javier@cnexlabs.com>
Signed-off-by: Hans Holmberg <hans.holmberg@cnexlabs.com>
Signed-off-by: Matias Bjørling <m@bjorling.me>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/lightnvm/core.c       | 106 +++++++++++++++++++++++++++++++++---------
 drivers/lightnvm/pblk-init.c  |   5 +-
 drivers/lightnvm/pblk.h       |   2 +
 include/linux/lightnvm.h      |   6 +++
 include/uapi/linux/lightnvm.h |   9 ++++
 5 files changed, 104 insertions(+), 24 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c
index d5f231c9339e..dcc9e621e651 100644
--- a/drivers/lightnvm/core.c
+++ b/drivers/lightnvm/core.c
@@ -140,7 +140,8 @@ static void nvm_remove_tgt_dev(struct nvm_tgt_dev *tgt_dev, int clear)
 }
 
 static struct nvm_tgt_dev *nvm_create_tgt_dev(struct nvm_dev *dev,
-					      int lun_begin, int lun_end)
+					      u16 lun_begin, u16 lun_end,
+					      u16 op)
 {
 	struct nvm_tgt_dev *tgt_dev = NULL;
 	struct nvm_dev_map *dev_rmap = dev->rmap;
@@ -219,6 +220,7 @@ static struct nvm_tgt_dev *nvm_create_tgt_dev(struct nvm_dev *dev,
 	tgt_dev->geo.nr_chnls = nr_chnls;
 	tgt_dev->geo.all_luns = nr_luns;
 	tgt_dev->geo.nr_luns = (lun_balanced) ? prev_nr_luns : -1;
+	tgt_dev->geo.op = op;
 	tgt_dev->total_secs = nr_luns * tgt_dev->geo.sec_per_lun;
 	tgt_dev->q = dev->q;
 	tgt_dev->map = dev_map;
@@ -266,9 +268,57 @@ static struct nvm_tgt_type *nvm_find_target_type(const char *name)
 	return tt;
 }
 
+static int nvm_config_check_luns(struct nvm_geo *geo, int lun_begin,
+				 int lun_end)
+{
+	if (lun_begin > lun_end || lun_end >= geo->all_luns) {
+		pr_err("nvm: lun out of bound (%u:%u > %u)\n",
+			lun_begin, lun_end, geo->all_luns - 1);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int __nvm_config_simple(struct nvm_dev *dev,
+			       struct nvm_ioctl_create_simple *s)
+{
+	struct nvm_geo *geo = &dev->geo;
+
+	if (s->lun_begin == -1 && s->lun_end == -1) {
+		s->lun_begin = 0;
+		s->lun_end = geo->all_luns - 1;
+	}
+
+	return nvm_config_check_luns(geo, s->lun_begin, s->lun_end);
+}
+
+static int __nvm_config_extended(struct nvm_dev *dev,
+				 struct nvm_ioctl_create_extended *e)
+{
+	struct nvm_geo *geo = &dev->geo;
+
+	if (e->lun_begin == 0xFFFF && e->lun_end == 0xFFFF) {
+		e->lun_begin = 0;
+		e->lun_end = dev->geo.all_luns - 1;
+	}
+
+	/* op not set falls into target's default */
+	if (e->op == 0xFFFF)
+		e->op = NVM_TARGET_DEFAULT_OP;
+
+	if (e->op < NVM_TARGET_MIN_OP ||
+	    e->op > NVM_TARGET_MAX_OP) {
+		pr_err("nvm: invalid over provisioning value\n");
+		return -EINVAL;
+	}
+
+	return nvm_config_check_luns(geo, e->lun_begin, e->lun_end);
+}
+
 static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)
 {
-	struct nvm_ioctl_create_simple *s = &create->conf.s;
+	struct nvm_ioctl_create_extended e;
 	struct request_queue *tqueue;
 	struct gendisk *tdisk;
 	struct nvm_tgt_type *tt;
@@ -277,6 +327,28 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)
 	void *targetdata;
 	int ret;
 
+	switch (create->conf.type) {
+	case NVM_CONFIG_TYPE_SIMPLE:
+		ret = __nvm_config_simple(dev, &create->conf.s);
+		if (ret)
+			return ret;
+
+		e.lun_begin = create->conf.s.lun_begin;
+		e.lun_end = create->conf.s.lun_end;
+		e.op = NVM_TARGET_DEFAULT_OP;
+		break;
+	case NVM_CONFIG_TYPE_EXTENDED:
+		ret = __nvm_config_extended(dev, &create->conf.e);
+		if (ret)
+			return ret;
+
+		e = create->conf.e;
+		break;
+	default:
+		pr_err("nvm: config type not valid\n");
+		return -EINVAL;
+	}
+
 	tt = nvm_find_target_type(create->tgttype);
 	if (!tt) {
 		pr_err("nvm: target type %s not found\n", create->tgttype);
@@ -289,7 +361,7 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)
 		return -EINVAL;
 	}
 
-	ret = nvm_reserve_luns(dev, s->lun_begin, s->lun_end);
+	ret = nvm_reserve_luns(dev, e.lun_begin, e.lun_end);
 	if (ret)
 		return ret;
 
@@ -299,7 +371,7 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)
 		goto err_reserve;
 	}
 
-	tgt_dev = nvm_create_tgt_dev(dev, s->lun_begin, s->lun_end);
+	tgt_dev = nvm_create_tgt_dev(dev, e.lun_begin, e.lun_end, e.op);
 	if (!tgt_dev) {
 		pr_err("nvm: could not create target device\n");
 		ret = -ENOMEM;
@@ -369,7 +441,7 @@ err_dev:
 err_t:
 	kfree(t);
 err_reserve:
-	nvm_release_luns_err(dev, s->lun_begin, s->lun_end);
+	nvm_release_luns_err(dev, e.lun_begin, e.lun_end);
 	return ret;
 }
 
@@ -949,7 +1021,6 @@ EXPORT_SYMBOL(nvm_unregister);
 static int __nvm_configure_create(struct nvm_ioctl_create *create)
 {
 	struct nvm_dev *dev;
-	struct nvm_ioctl_create_simple *s;
 
 	down_write(&nvm_lock);
 	dev = nvm_find_nvm_dev(create->dev);
@@ -960,23 +1031,6 @@ static int __nvm_configure_create(struct nvm_ioctl_create *create)
 		return -EINVAL;
 	}
 
-	if (create->conf.type != NVM_CONFIG_TYPE_SIMPLE) {
-		pr_err("nvm: config type not valid\n");
-		return -EINVAL;
-	}
-	s = &create->conf.s;
-
-	if (s->lun_begin == -1 && s->lun_end == -1) {
-		s->lun_begin = 0;
-		s->lun_end = dev->geo.all_luns - 1;
-	}
-
-	if (s->lun_begin > s->lun_end || s->lun_end >= dev->geo.all_luns) {
-		pr_err("nvm: lun out of bound (%u:%u > %u)\n",
-			s->lun_begin, s->lun_end, dev->geo.all_luns - 1);
-		return -EINVAL;
-	}
-
 	return nvm_create_tgt(dev, create);
 }
 
@@ -1076,6 +1130,12 @@ static long nvm_ioctl_dev_create(struct file *file, void __user *arg)
 	if (copy_from_user(&create, arg, sizeof(struct nvm_ioctl_create)))
 		return -EFAULT;
 
+	if (create.conf.type == NVM_CONFIG_TYPE_EXTENDED &&
+	    create.conf.e.rsv != 0) {
+		pr_err("nvm: reserved config field in use\n");
+		return -EINVAL;
+	}
+
 	create.dev[DISK_NAME_LEN - 1] = '\0';
 	create.tgttype[NVM_TTYPE_NAME_MAX - 1] = '\0';
 	create.tgtname[DISK_NAME_LEN - 1] = '\0';
diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c
index c8a718249e26..533f6908e238 100644
--- a/drivers/lightnvm/pblk-init.c
+++ b/drivers/lightnvm/pblk-init.c
@@ -585,7 +585,10 @@ static void pblk_set_provision(struct pblk *pblk, long nr_free_blks)
 	sector_t provisioned;
 	int sec_meta, blk_meta;
 
-	pblk->op = 20;
+	if (geo->op == NVM_TARGET_DEFAULT_OP)
+		pblk->op = PBLK_DEFAULT_OP;
+	else
+		pblk->op = geo->op;
 
 	provisioned = nr_free_blks;
 	provisioned *= (100 - pblk->op);
diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h
index 1e719d4181ce..19e622c65e92 100644
--- a/drivers/lightnvm/pblk.h
+++ b/drivers/lightnvm/pblk.h
@@ -54,6 +54,8 @@
 /* Static pool sizes */
 #define PBLK_GEN_WS_POOL_SIZE (2)
 
+#define PBLK_DEFAULT_OP (11)
+
 enum {
 	PBLK_READ		= READ,
 	PBLK_WRITE		= WRITE,/* Write from write buffer */
diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index 8e43bfebd38d..7f4b60abdf27 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -218,6 +218,10 @@ struct nvm_target {
 
 #define ADDR_EMPTY (~0ULL)
 
+#define NVM_TARGET_DEFAULT_OP (101)
+#define NVM_TARGET_MIN_OP (3)
+#define NVM_TARGET_MAX_OP (80)
+
 #define NVM_VERSION_MAJOR 1
 #define NVM_VERSION_MINOR 0
 #define NVM_VERSION_PATCH 0
@@ -291,6 +295,8 @@ struct nvm_geo {
 
 	int max_rq_size;
 
+	int op;
+
 	struct nvm_addr_format ppaf;
 
 	/* Legacy 1.2 specific geometry */
diff --git a/include/uapi/linux/lightnvm.h b/include/uapi/linux/lightnvm.h
index 42d1a434af29..f9a1be7fc696 100644
--- a/include/uapi/linux/lightnvm.h
+++ b/include/uapi/linux/lightnvm.h
@@ -75,14 +75,23 @@ struct nvm_ioctl_create_simple {
 	__u32 lun_end;
 };
 
+struct nvm_ioctl_create_extended {
+	__u16 lun_begin;
+	__u16 lun_end;
+	__u16 op;
+	__u16 rsv;
+};
+
 enum {
 	NVM_CONFIG_TYPE_SIMPLE = 0,
+	NVM_CONFIG_TYPE_EXTENDED = 1,
 };
 
 struct nvm_ioctl_create_conf {
 	__u32 type;
 	union {
 		struct nvm_ioctl_create_simple s;
+		struct nvm_ioctl_create_extended e;
 	};
 };
 
-- 
cgit v1.2.3


From 863def15b9755d9016df4d93addf3127f1dc67f4 Mon Sep 17 00:00:00 2001
From: James Chapman <jchapman@katalix.com>
Date: Wed, 3 Jan 2018 22:48:04 +0000
Subject: l2tp: revert "l2tp: add peer_offset parameter"

Revert commit f15bc54eeecd ("l2tp: add peer_offset parameter"). This
is removed because it is adding another configurable offset and
configurable offsets are being removed.

Signed-off-by: James Chapman <jchapman@katalix.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/l2tp.h |  1 -
 net/l2tp/l2tp_core.c      |  3 +--
 net/l2tp/l2tp_core.h      | 13 +++----------
 net/l2tp/l2tp_debugfs.c   |  8 +++-----
 net/l2tp/l2tp_netlink.c   | 21 +--------------------
 5 files changed, 8 insertions(+), 38 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/l2tp.h b/include/uapi/linux/l2tp.h
index d6fee55dbded..d84ce5c1c9aa 100644
--- a/include/uapi/linux/l2tp.h
+++ b/include/uapi/linux/l2tp.h
@@ -127,7 +127,6 @@ enum {
 	L2TP_ATTR_UDP_ZERO_CSUM6_TX,	/* flag */
 	L2TP_ATTR_UDP_ZERO_CSUM6_RX,	/* flag */
 	L2TP_ATTR_PAD,
-	L2TP_ATTR_PEER_OFFSET,		/* u16 */
 	__L2TP_ATTR_MAX,
 };
 
diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
index 6ff64717da1e..115918ad8eca 100644
--- a/net/l2tp/l2tp_core.c
+++ b/net/l2tp/l2tp_core.c
@@ -792,7 +792,7 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb,
 			ptr += 2 + offset;
 		}
 	} else
-		ptr += session->peer_offset;
+		ptr += session->offset;
 
 	offset = ptr - optr;
 	if (!pskb_may_pull(skb, offset))
@@ -1785,7 +1785,6 @@ struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunn
 			session->lns_mode = cfg->lns_mode;
 			session->reorder_timeout = cfg->reorder_timeout;
 			session->offset = cfg->offset;
-			session->peer_offset = cfg->peer_offset;
 			session->l2specific_type = cfg->l2specific_type;
 			session->l2specific_len = cfg->l2specific_len;
 			session->cookie_len = cfg->cookie_len;
diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h
index c6fe7cc42a05..9534e16965cc 100644
--- a/net/l2tp/l2tp_core.h
+++ b/net/l2tp/l2tp_core.h
@@ -59,8 +59,7 @@ struct l2tp_session_cfg {
 	int			debug;		/* bitmask of debug message
 						 * categories */
 	u16			vlan_id;	/* VLAN pseudowire only */
-	u16			offset;		/* offset to tx payload */
-	u16			peer_offset;	/* offset to rx payload */
+	u16			offset;		/* offset to payload */
 	u16			l2specific_len;	/* Layer 2 specific length */
 	u16			l2specific_type; /* Layer 2 specific type */
 	u8			cookie[8];	/* optional cookie */
@@ -87,14 +86,8 @@ struct l2tp_session {
 	int			cookie_len;
 	u8			peer_cookie[8];
 	int			peer_cookie_len;
-	u16			offset;		/* offset from end of L2TP
-						 * header to beginning of
-						 * tx data
-						 */
-	u16			peer_offset;	/* offset from end of L2TP
-						 * header to beginning of
-						 * rx data
-						 */
+	u16			offset;		/* offset from end of L2TP header
+						   to beginning of data */
 	u16			l2specific_len;
 	u16			l2specific_type;
 	u16			hdr_len;
diff --git a/net/l2tp/l2tp_debugfs.c b/net/l2tp/l2tp_debugfs.c
index 4cc30b38aba4..eb69411bcb47 100644
--- a/net/l2tp/l2tp_debugfs.c
+++ b/net/l2tp/l2tp_debugfs.c
@@ -180,9 +180,8 @@ static void l2tp_dfs_seq_session_show(struct seq_file *m, void *v)
 		   session->lns_mode ? "LNS" : "LAC",
 		   session->debug,
 		   jiffies_to_msecs(session->reorder_timeout));
-	seq_printf(m, "   offset %hu peer_offset %hu l2specific %hu/%hu\n",
-		   session->offset, session->peer_offset,
-		   session->l2specific_type, session->l2specific_len);
+	seq_printf(m, "   offset %hu l2specific %hu/%hu\n",
+		   session->offset, session->l2specific_type, session->l2specific_len);
 	if (session->cookie_len) {
 		seq_printf(m, "   cookie %02x%02x%02x%02x",
 			   session->cookie[0], session->cookie[1],
@@ -229,8 +228,7 @@ static int l2tp_dfs_seq_show(struct seq_file *m, void *v)
 		seq_puts(m, " debug tx-pkts/bytes/errs rx-pkts/bytes/errs\n");
 		seq_puts(m, "  SESSION ID, peer ID, PWTYPE\n");
 		seq_puts(m, "   refcnt cnt\n");
-		seq_puts(m, "   offset OFFSET peer_offset OFFSET");
-		seq_puts(m, " l2specific TYPE/LEN\n");
+		seq_puts(m, "   offset OFFSET l2specific TYPE/LEN\n");
 		seq_puts(m, "   [ cookie ]\n");
 		seq_puts(m, "   [ peer cookie ]\n");
 		seq_puts(m, "   config mtu/mru/rcvseq/sendseq/dataseq/lns debug reorderto\n");
diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c
index d7d4d7a7a54d..7e9c50125556 100644
--- a/net/l2tp/l2tp_netlink.c
+++ b/net/l2tp/l2tp_netlink.c
@@ -547,25 +547,9 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf
 	}
 
 	if (tunnel->version > 2) {
-		if (info->attrs[L2TP_ATTR_PEER_OFFSET]) {
-			struct nlattr *peer_offset;
-
-			peer_offset = info->attrs[L2TP_ATTR_PEER_OFFSET];
-			cfg.peer_offset = nla_get_u16(peer_offset);
-		}
-
-		if (info->attrs[L2TP_ATTR_OFFSET]) {
+		if (info->attrs[L2TP_ATTR_OFFSET])
 			cfg.offset = nla_get_u16(info->attrs[L2TP_ATTR_OFFSET]);
 
-			/* in order to maintain compatibility with older
-			 * versions where offset was used for both tx and
-			 * rx side, update rx side with offset if peer_offset
-			 * is not provided by userspace
-			 */
-			if (!info->attrs[L2TP_ATTR_PEER_OFFSET])
-				cfg.peer_offset = cfg.offset;
-		}
-
 		if (info->attrs[L2TP_ATTR_DATA_SEQ])
 			cfg.data_seq = nla_get_u8(info->attrs[L2TP_ATTR_DATA_SEQ]);
 
@@ -779,8 +763,6 @@ static int l2tp_nl_session_send(struct sk_buff *skb, u32 portid, u32 seq, int fl
 	     nla_put_string(skb, L2TP_ATTR_IFNAME, session->ifname)) ||
 	    (session->offset &&
 	     nla_put_u16(skb, L2TP_ATTR_OFFSET, session->offset)) ||
-	    (session->peer_offset &&
-	     nla_put_u16(skb, L2TP_ATTR_PEER_OFFSET, session->peer_offset)) ||
 	    (session->cookie_len &&
 	     nla_put(skb, L2TP_ATTR_COOKIE, session->cookie_len,
 		     &session->cookie[0])) ||
@@ -921,7 +903,6 @@ static const struct nla_policy l2tp_nl_policy[L2TP_ATTR_MAX + 1] = {
 	[L2TP_ATTR_PW_TYPE]		= { .type = NLA_U16, },
 	[L2TP_ATTR_ENCAP_TYPE]		= { .type = NLA_U16, },
 	[L2TP_ATTR_OFFSET]		= { .type = NLA_U16, },
-	[L2TP_ATTR_PEER_OFFSET]		= { .type = NLA_U16, },
 	[L2TP_ATTR_DATA_SEQ]		= { .type = NLA_U8, },
 	[L2TP_ATTR_L2SPEC_TYPE]		= { .type = NLA_U8, },
 	[L2TP_ATTR_L2SPEC_LEN]		= { .type = NLA_U8, },
-- 
cgit v1.2.3


From 4887d8933a8dfdfa6602e0faaa0e31cd343ccefe Mon Sep 17 00:00:00 2001
From: James Chapman <jchapman@katalix.com>
Date: Wed, 3 Jan 2018 22:48:07 +0000
Subject: l2tp: add comment in API header that L2TP_ATTR_OFFSET is not used

Signed-off-by: James Chapman <jchapman@katalix.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/l2tp.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/l2tp.h b/include/uapi/linux/l2tp.h
index d84ce5c1c9aa..f78eef4cc56a 100644
--- a/include/uapi/linux/l2tp.h
+++ b/include/uapi/linux/l2tp.h
@@ -94,7 +94,7 @@ enum {
 	L2TP_ATTR_NONE,			/* no data */
 	L2TP_ATTR_PW_TYPE,		/* u16, enum l2tp_pwtype */
 	L2TP_ATTR_ENCAP_TYPE,		/* u16, enum l2tp_encap_type */
-	L2TP_ATTR_OFFSET,		/* u16 */
+	L2TP_ATTR_OFFSET,		/* u16 (not used) */
 	L2TP_ATTR_DATA_SEQ,		/* u16 */
 	L2TP_ATTR_L2SPEC_TYPE,		/* u8, enum l2tp_l2spec_type */
 	L2TP_ATTR_L2SPEC_LEN,		/* u8, enum l2tp_l2spec_type */
-- 
cgit v1.2.3


From 02dd3291b2f095bbc88e1d2628fd5bf2e92de69b Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Wed, 3 Jan 2018 11:26:14 +0100
Subject: bpf: finally expose xdp_rxq_info to XDP bpf-programs

Now all XDP drivers have been updated to set up xdp_rxq_info and assign
it to xdp_buff->rxq.  Thus, it is now safe to enable access to some
of the xdp_rxq_info struct members.

This patch extends xdp_md and exposes UAPI to userspace for
ingress_ifindex and rx_queue_index.  Access happens via a bpf
instruction rewrite that loads data directly from struct xdp_rxq_info.

* ingress_ifindex map to xdp_rxq_info->dev->ifindex
* rx_queue_index  map to xdp_rxq_info->queue_index

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h |  3 +++
 net/core/filter.c        | 19 +++++++++++++++++++
 2 files changed, 22 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index f2f8b36e2ad4..405317f9c064 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -899,6 +899,9 @@ struct xdp_md {
 	__u32 data;
 	__u32 data_end;
 	__u32 data_meta;
+	/* Below access go though struct xdp_rxq_info */
+	__u32 ingress_ifindex; /* rxq->dev->ifindex */
+	__u32 rx_queue_index;  /* rxq->queue_index  */
 };
 
 enum sk_action {
diff --git a/net/core/filter.c b/net/core/filter.c
index 130b842c3a15..acdb94c0e97f 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4304,6 +4304,25 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type,
 				      si->dst_reg, si->src_reg,
 				      offsetof(struct xdp_buff, data_end));
 		break;
+	case offsetof(struct xdp_md, ingress_ifindex):
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct xdp_buff, rxq));
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_rxq_info, dev),
+				      si->dst_reg, si->dst_reg,
+				      offsetof(struct xdp_rxq_info, dev));
+		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+				      bpf_target_off(struct net_device,
+						     ifindex, 4, target_size));
+		break;
+	case offsetof(struct xdp_md, rx_queue_index):
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct xdp_buff, rxq));
+		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+				      bpf_target_off(struct xdp_rxq_info,
+						queue_index, 4, target_size));
+		break;
 	}
 
 	return insn - insn_buf;
-- 
cgit v1.2.3


From 81df978c49379481716aef591de77313c286d747 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Sun, 7 Jan 2018 17:03:48 +0100
Subject: perf: Add sample_id to PERF_RECORD_ITRACE_START event comment

Adding missing sample_id line into PERF_RECORD_ITRACE_START
event comment.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: David Ahern <dsahern@gmail.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20180107160356.28203-5-jolsa@kernel.org
[ Update the tools/include/uapi/linux copy ]
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 include/uapi/linux/perf_event.h       | 1 +
 tools/include/uapi/linux/perf_event.h | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index b9a4953018ed..8bb66e8da945 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -864,6 +864,7 @@ enum perf_event_type {
 	 *	struct perf_event_header	header;
 	 *	u32				pid;
 	 *	u32				tid;
+	 *	struct sample_id		sample_id;
 	 * };
 	 */
 	PERF_RECORD_ITRACE_START		= 12,
diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h
index b9a4953018ed..8bb66e8da945 100644
--- a/tools/include/uapi/linux/perf_event.h
+++ b/tools/include/uapi/linux/perf_event.h
@@ -864,6 +864,7 @@ enum perf_event_type {
 	 *	struct perf_event_header	header;
 	 *	u32				pid;
 	 *	u32				tid;
+	 *	struct sample_id		sample_id;
 	 * };
 	 */
 	PERF_RECORD_ITRACE_START		= 12,
-- 
cgit v1.2.3


From 972c14884728bf5f69ec69cfb1beeec1a9cd29ee Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Sun, 7 Jan 2018 17:03:51 +0100
Subject: perf: Update PERF_RECORD_MISC_* comment for perf_event_header::misc
 bit 13

The perf_event_header::misc bit 13 is shared by different events, and
the next patch adds yet another bit 13 user.  Update the comment to
make it more structured and to make clear which events use bit 13.

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: David Ahern <dsahern@gmail.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Link: http://lkml.kernel.org/r/20180107160356.28203-8-jolsa@kernel.org
[ Update the tools/include/uapi/linux copy ]
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 include/uapi/linux/perf_event.h       | 9 ++++++---
 tools/include/uapi/linux/perf_event.h | 9 ++++++---
 2 files changed, 12 insertions(+), 6 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 8bb66e8da945..c77c9a2ebbbb 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -612,9 +612,12 @@ struct perf_event_mmap_page {
  */
 #define PERF_RECORD_MISC_PROC_MAP_PARSE_TIMEOUT	(1 << 12)
 /*
- * PERF_RECORD_MISC_MMAP_DATA and PERF_RECORD_MISC_COMM_EXEC are used on
- * different events so can reuse the same bit position.
- * Ditto PERF_RECORD_MISC_SWITCH_OUT.
+ * Following PERF_RECORD_MISC_* are used on different
+ * events, so can reuse the same bit position:
+ *
+ *   PERF_RECORD_MISC_MMAP_DATA  - PERF_RECORD_MMAP* events
+ *   PERF_RECORD_MISC_COMM_EXEC  - PERF_RECORD_COMM event
+ *   PERF_RECORD_MISC_SWITCH_OUT - PERF_RECORD_SWITCH* events
  */
 #define PERF_RECORD_MISC_MMAP_DATA		(1 << 13)
 #define PERF_RECORD_MISC_COMM_EXEC		(1 << 13)
diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h
index 8bb66e8da945..c77c9a2ebbbb 100644
--- a/tools/include/uapi/linux/perf_event.h
+++ b/tools/include/uapi/linux/perf_event.h
@@ -612,9 +612,12 @@ struct perf_event_mmap_page {
  */
 #define PERF_RECORD_MISC_PROC_MAP_PARSE_TIMEOUT	(1 << 12)
 /*
- * PERF_RECORD_MISC_MMAP_DATA and PERF_RECORD_MISC_COMM_EXEC are used on
- * different events so can reuse the same bit position.
- * Ditto PERF_RECORD_MISC_SWITCH_OUT.
+ * Following PERF_RECORD_MISC_* are used on different
+ * events, so can reuse the same bit position:
+ *
+ *   PERF_RECORD_MISC_MMAP_DATA  - PERF_RECORD_MMAP* events
+ *   PERF_RECORD_MISC_COMM_EXEC  - PERF_RECORD_COMM event
+ *   PERF_RECORD_MISC_SWITCH_OUT - PERF_RECORD_SWITCH* events
  */
 #define PERF_RECORD_MISC_MMAP_DATA		(1 << 13)
 #define PERF_RECORD_MISC_COMM_EXEC		(1 << 13)
-- 
cgit v1.2.3


From e58f33cc84bc089c430ac955f3cad6380ae98591 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 7 Dec 2017 16:28:23 +0100
Subject: netfilter: add defines for arp/decnet max hooks

The kernel already has defines for this, but they are in uapi exposed
headers.

Including these from netns.h causes build errors and also adds
dependencies on headers that we don't need.

So move these defines to netfilter_defs.h and place the uapi ones
in ifndef __KERNEL__ to keep them for userspace.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter_defs.h        | 6 ++++++
 include/uapi/linux/netfilter_arp.h    | 3 +++
 include/uapi/linux/netfilter_decnet.h | 4 +++-
 3 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/netfilter_defs.h b/include/linux/netfilter_defs.h
index dc6111adea06..fdcdf2bf34df 100644
--- a/include/linux/netfilter_defs.h
+++ b/include/linux/netfilter_defs.h
@@ -7,4 +7,10 @@
 /* Largest hook number + 1, see uapi/linux/netfilter_decnet.h */
 #define NF_MAX_HOOKS 8
 
+/* in/out/forward only */
+#define NF_ARP_NUMHOOKS 3
+
+/* max hook is NF_DN_ROUTE (6), also see uapi/linux/netfilter_decnet.h */
+#define NF_DN_NUMHOOKS 7
+
 #endif
diff --git a/include/uapi/linux/netfilter_arp.h b/include/uapi/linux/netfilter_arp.h
index 81b6a4cbcb72..791dfc5ae907 100644
--- a/include/uapi/linux/netfilter_arp.h
+++ b/include/uapi/linux/netfilter_arp.h
@@ -15,6 +15,9 @@
 #define NF_ARP_IN	0
 #define NF_ARP_OUT	1
 #define NF_ARP_FORWARD	2
+
+#ifndef __KERNEL__
 #define NF_ARP_NUMHOOKS	3
+#endif
 
 #endif /* __LINUX_ARP_NETFILTER_H */
diff --git a/include/uapi/linux/netfilter_decnet.h b/include/uapi/linux/netfilter_decnet.h
index 9089c38f6abe..61f1c7dfd033 100644
--- a/include/uapi/linux/netfilter_decnet.h
+++ b/include/uapi/linux/netfilter_decnet.h
@@ -24,6 +24,9 @@
 #define NFC_DN_IF_IN		0x0004
 /* Output device. */
 #define NFC_DN_IF_OUT		0x0008
+
+/* kernel define is in netfilter_defs.h */
+#define NF_DN_NUMHOOKS		7
 #endif /* ! __KERNEL__ */
 
 /* DECnet Hooks */
@@ -41,7 +44,6 @@
 #define NF_DN_HELLO		5
 /* Input Routing Packets */
 #define NF_DN_ROUTE		6
-#define NF_DN_NUMHOOKS		7
 
 enum nf_dn_hook_priorities {
 	NF_DN_PRI_FIRST = INT_MIN,
-- 
cgit v1.2.3


From 625c556118f3c2fd28bb8ef6da18c53bd4037be4 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Sat, 9 Dec 2017 21:01:08 +0100
Subject: netfilter: connlimit: split xt_connlimit into front and backend

This allows reusing the xt_connlimit infrastructure from nf_tables.
The upcoming nf_tables frontend can just pass in an nftables register
as the input key; this allows limiting by any nft-supported key,
including concatenations.

For xt_connlimit, pass in the zone and the ip/ipv6 address.

With help from Yi-Hung Wei.

Signed-off-by: Florian Westphal <fw@strlen.de>
Acked-by: Yi-Hung Wei <yihung.wei@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_count.h  |  17 ++
 include/uapi/linux/netfilter/xt_connlimit.h |   2 +-
 net/netfilter/Kconfig                       |   3 +
 net/netfilter/Makefile                      |   2 +
 net/netfilter/nf_conncount.c                | 373 ++++++++++++++++++++++++++++
 net/netfilter/xt_connlimit.c                | 369 ++-------------------------
 6 files changed, 420 insertions(+), 346 deletions(-)
 create mode 100644 include/net/netfilter/nf_conntrack_count.h
 create mode 100644 net/netfilter/nf_conncount.c

(limited to 'include/uapi/linux')

diff --git a/include/net/netfilter/nf_conntrack_count.h b/include/net/netfilter/nf_conntrack_count.h
new file mode 100644
index 000000000000..adf8db44cf86
--- /dev/null
+++ b/include/net/netfilter/nf_conntrack_count.h
@@ -0,0 +1,17 @@
+#ifndef _NF_CONNTRACK_COUNT_H
+#define _NF_CONNTRACK_COUNT_H
+
+struct nf_conncount_data;
+
+struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int family,
+					    unsigned int keylen);
+void nf_conncount_destroy(struct net *net, unsigned int family,
+			  struct nf_conncount_data *data);
+
+unsigned int nf_conncount_count(struct net *net,
+				struct nf_conncount_data *data,
+				const u32 *key,
+				unsigned int family,
+				const struct nf_conntrack_tuple *tuple,
+				const struct nf_conntrack_zone *zone);
+#endif
diff --git a/include/uapi/linux/netfilter/xt_connlimit.h b/include/uapi/linux/netfilter/xt_connlimit.h
index 07e5e9d47882..d4d1943dcd11 100644
--- a/include/uapi/linux/netfilter/xt_connlimit.h
+++ b/include/uapi/linux/netfilter/xt_connlimit.h
@@ -27,7 +27,7 @@ struct xt_connlimit_info {
 	__u32 flags;
 
 	/* Used internally by the kernel */
-	struct xt_connlimit_data *data __attribute__((aligned(8)));
+	struct nf_conncount_data *data __attribute__((aligned(8)));
 };
 
 #endif /* _XT_CONNLIMIT_H */
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 263609a7e010..af3d9f721b3f 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -68,6 +68,8 @@ config NF_LOG_NETDEV
 	select NF_LOG_COMMON
 
 if NF_CONNTRACK
+config NETFILTER_CONNCOUNT
+	tristate
 
 config NF_CONNTRACK_MARK
 	bool  'Connection mark tracking support'
@@ -1126,6 +1128,7 @@ config NETFILTER_XT_MATCH_CONNLIMIT
 	tristate '"connlimit" match support'
 	depends on NF_CONNTRACK
 	depends on NETFILTER_ADVANCED
+	select NETFILTER_CONNCOUNT
 	---help---
 	  This match allows you to match against the number of parallel
 	  connections to a server per client IP address (or address block).
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index f78ed2470831..490a55e7166d 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -67,6 +67,8 @@ obj-$(CONFIG_NF_NAT_TFTP) += nf_nat_tftp.o
 # SYNPROXY
 obj-$(CONFIG_NETFILTER_SYNPROXY) += nf_synproxy_core.o
 
+obj-$(CONFIG_NETFILTER_CONNCOUNT) += nf_conncount.o
+
 # generic packet duplication from netdev family
 obj-$(CONFIG_NF_DUP_NETDEV)	+= nf_dup_netdev.o
 
diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c
new file mode 100644
index 000000000000..a95518261168
--- /dev/null
+++ b/net/netfilter/nf_conncount.c
@@ -0,0 +1,373 @@
+/*
+ * count the number of connections matching an arbitrary key.
+ *
+ * (C) 2017 Red Hat GmbH
+ * Author: Florian Westphal <fw@strlen.de>
+ *
+ * split from xt_connlimit.c:
+ *   (c) 2000 Gerd Knorr <kraxel@bytesex.org>
+ *   Nov 2002: Martin Bene <martin.bene@icomedias.com>:
+ *		only ignore TIME_WAIT or gone connections
+ *   (C) CC Computer Consultants GmbH, 2007
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/jhash.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/rbtree.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/skbuff.h>
+#include <linux/spinlock.h>
+#include <linux/netfilter/nf_conntrack_tcp.h>
+#include <linux/netfilter/x_tables.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_count.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_tuple.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+
+#define CONNCOUNT_SLOTS		256U
+
+#ifdef CONFIG_LOCKDEP
+#define CONNCOUNT_LOCK_SLOTS	8U
+#else
+#define CONNCOUNT_LOCK_SLOTS	256U
+#endif
+
+#define CONNCOUNT_GC_MAX_NODES	8
+#define MAX_KEYLEN		5
+
+/* we will save the tuples of all connections we care about */
+struct nf_conncount_tuple {
+	struct hlist_node		node;
+	struct nf_conntrack_tuple	tuple;
+};
+
+struct nf_conncount_rb {
+	struct rb_node node;
+	struct hlist_head hhead; /* connections/hosts in same subnet */
+	u32 key[MAX_KEYLEN];
+};
+
+static spinlock_t nf_conncount_locks[CONNCOUNT_LOCK_SLOTS] __cacheline_aligned_in_smp;
+
+struct nf_conncount_data {
+	unsigned int keylen;
+	struct rb_root root[CONNCOUNT_SLOTS];
+};
+
+static u_int32_t conncount_rnd __read_mostly;
+static struct kmem_cache *conncount_rb_cachep __read_mostly;
+static struct kmem_cache *conncount_conn_cachep __read_mostly;
+
+static inline bool already_closed(const struct nf_conn *conn)
+{
+	if (nf_ct_protonum(conn) == IPPROTO_TCP)
+		return conn->proto.tcp.state == TCP_CONNTRACK_TIME_WAIT ||
+		       conn->proto.tcp.state == TCP_CONNTRACK_CLOSE;
+	else
+		return 0;
+}
+
+static int key_diff(const u32 *a, const u32 *b, unsigned int klen)
+{
+	return memcmp(a, b, klen * sizeof(u32));
+}
+
+static bool add_hlist(struct hlist_head *head,
+		      const struct nf_conntrack_tuple *tuple)
+{
+	struct nf_conncount_tuple *conn;
+
+	conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC);
+	if (conn == NULL)
+		return false;
+	conn->tuple = *tuple;
+	hlist_add_head(&conn->node, head);
+	return true;
+}
+
+static unsigned int check_hlist(struct net *net,
+				struct hlist_head *head,
+				const struct nf_conntrack_tuple *tuple,
+				const struct nf_conntrack_zone *zone,
+				bool *addit)
+{
+	const struct nf_conntrack_tuple_hash *found;
+	struct nf_conncount_tuple *conn;
+	struct hlist_node *n;
+	struct nf_conn *found_ct;
+	unsigned int length = 0;
+
+	*addit = true;
+
+	/* check the saved connections */
+	hlist_for_each_entry_safe(conn, n, head, node) {
+		found = nf_conntrack_find_get(net, zone, &conn->tuple);
+		if (found == NULL) {
+			hlist_del(&conn->node);
+			kmem_cache_free(conncount_conn_cachep, conn);
+			continue;
+		}
+
+		found_ct = nf_ct_tuplehash_to_ctrack(found);
+
+		if (nf_ct_tuple_equal(&conn->tuple, tuple)) {
+			/*
+			 * Just to be sure we have it only once in the list.
+			 * We should not see tuples twice unless someone hooks
+			 * this into a table without "-p tcp --syn".
+			 */
+			*addit = false;
+		} else if (already_closed(found_ct)) {
+			/*
+			 * we do not care about connections which are
+			 * closed already -> ditch it
+			 */
+			nf_ct_put(found_ct);
+			hlist_del(&conn->node);
+			kmem_cache_free(conncount_conn_cachep, conn);
+			continue;
+		}
+
+		nf_ct_put(found_ct);
+		length++;
+	}
+
+	return length;
+}
+
+static void tree_nodes_free(struct rb_root *root,
+			    struct nf_conncount_rb *gc_nodes[],
+			    unsigned int gc_count)
+{
+	struct nf_conncount_rb *rbconn;
+
+	while (gc_count) {
+		rbconn = gc_nodes[--gc_count];
+		rb_erase(&rbconn->node, root);
+		kmem_cache_free(conncount_rb_cachep, rbconn);
+	}
+}
+
+static unsigned int
+count_tree(struct net *net, struct rb_root *root,
+	   const u32 *key, u8 keylen,
+	   u8 family,
+	   const struct nf_conntrack_tuple *tuple,
+	   const struct nf_conntrack_zone *zone)
+{
+	struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES];
+	struct rb_node **rbnode, *parent;
+	struct nf_conncount_rb *rbconn;
+	struct nf_conncount_tuple *conn;
+	unsigned int gc_count;
+	bool no_gc = false;
+
+ restart:
+	gc_count = 0;
+	parent = NULL;
+	rbnode = &(root->rb_node);
+	while (*rbnode) {
+		int diff;
+		bool addit;
+
+		rbconn = rb_entry(*rbnode, struct nf_conncount_rb, node);
+
+		parent = *rbnode;
+		diff = key_diff(key, rbconn->key, keylen);
+		if (diff < 0) {
+			rbnode = &((*rbnode)->rb_left);
+		} else if (diff > 0) {
+			rbnode = &((*rbnode)->rb_right);
+		} else {
+			/* same source network -> be counted! */
+			unsigned int count;
+			count = check_hlist(net, &rbconn->hhead, tuple, zone, &addit);
+
+			tree_nodes_free(root, gc_nodes, gc_count);
+			if (!addit)
+				return count;
+
+			if (!add_hlist(&rbconn->hhead, tuple))
+				return 0; /* hotdrop */
+
+			return count + 1;
+		}
+
+		if (no_gc || gc_count >= ARRAY_SIZE(gc_nodes))
+			continue;
+
+		/* only used for GC on hhead, retval and 'addit' ignored */
+		check_hlist(net, &rbconn->hhead, tuple, zone, &addit);
+		if (hlist_empty(&rbconn->hhead))
+			gc_nodes[gc_count++] = rbconn;
+	}
+
+	if (gc_count) {
+		no_gc = true;
+		tree_nodes_free(root, gc_nodes, gc_count);
+		/* tree_node_free before new allocation permits
+		 * allocator to re-use newly free'd object.
+		 *
+		 * This is a rare event; in most cases we will find
+		 * existing node to re-use. (or gc_count is 0).
+		 */
+		goto restart;
+	}
+
+	/* no match, need to insert new node */
+	rbconn = kmem_cache_alloc(conncount_rb_cachep, GFP_ATOMIC);
+	if (rbconn == NULL)
+		return 0;
+
+	conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC);
+	if (conn == NULL) {
+		kmem_cache_free(conncount_rb_cachep, rbconn);
+		return 0;
+	}
+
+	conn->tuple = *tuple;
+	memcpy(rbconn->key, key, sizeof(u32) * keylen);
+
+	INIT_HLIST_HEAD(&rbconn->hhead);
+	hlist_add_head(&conn->node, &rbconn->hhead);
+
+	rb_link_node(&rbconn->node, parent, rbnode);
+	rb_insert_color(&rbconn->node, root);
+	return 1;
+}
+
+unsigned int nf_conncount_count(struct net *net,
+				struct nf_conncount_data *data,
+				const u32 *key,
+				unsigned int family,
+				const struct nf_conntrack_tuple *tuple,
+				const struct nf_conntrack_zone *zone)
+{
+	struct rb_root *root;
+	int count;
+	u32 hash;
+
+	hash = jhash2(key, data->keylen, conncount_rnd) % CONNCOUNT_SLOTS;
+	root = &data->root[hash];
+
+	spin_lock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]);
+
+	count = count_tree(net, root, key, data->keylen, family, tuple, zone);
+
+	spin_unlock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]);
+
+	return count;
+}
+EXPORT_SYMBOL_GPL(nf_conncount_count);
+
+struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int family,
+					    unsigned int keylen)
+{
+	struct nf_conncount_data *data;
+	int ret, i;
+
+	if (keylen % sizeof(u32) ||
+	    keylen / sizeof(u32) > MAX_KEYLEN ||
+	    keylen == 0)
+		return ERR_PTR(-EINVAL);
+
+	net_get_random_once(&conncount_rnd, sizeof(conncount_rnd));
+
+	data = kmalloc(sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return ERR_PTR(-ENOMEM);
+
+	ret = nf_ct_netns_get(net, family);
+	if (ret < 0) {
+		kfree(data);
+		return ERR_PTR(ret);
+	}
+
+	for (i = 0; i < ARRAY_SIZE(data->root); ++i)
+		data->root[i] = RB_ROOT;
+
+	data->keylen = keylen / sizeof(u32);
+
+	return data;
+}
+EXPORT_SYMBOL_GPL(nf_conncount_init);
+
+static void destroy_tree(struct rb_root *r)
+{
+	struct nf_conncount_tuple *conn;
+	struct nf_conncount_rb *rbconn;
+	struct hlist_node *n;
+	struct rb_node *node;
+
+	while ((node = rb_first(r)) != NULL) {
+		rbconn = rb_entry(node, struct nf_conncount_rb, node);
+
+		rb_erase(node, r);
+
+		hlist_for_each_entry_safe(conn, n, &rbconn->hhead, node)
+			kmem_cache_free(conncount_conn_cachep, conn);
+
+		kmem_cache_free(conncount_rb_cachep, rbconn);
+	}
+}
+
+void nf_conncount_destroy(struct net *net, unsigned int family,
+			  struct nf_conncount_data *data)
+{
+	unsigned int i;
+
+	nf_ct_netns_put(net, family);
+
+	for (i = 0; i < ARRAY_SIZE(data->root); ++i)
+		destroy_tree(&data->root[i]);
+
+	kfree(data);
+}
+EXPORT_SYMBOL_GPL(nf_conncount_destroy);
+
+static int __init nf_conncount_modinit(void)
+{
+	int i;
+
+	BUILD_BUG_ON(CONNCOUNT_LOCK_SLOTS > CONNCOUNT_SLOTS);
+	BUILD_BUG_ON((CONNCOUNT_SLOTS % CONNCOUNT_LOCK_SLOTS) != 0);
+
+	for (i = 0; i < CONNCOUNT_LOCK_SLOTS; ++i)
+		spin_lock_init(&nf_conncount_locks[i]);
+
+	conncount_conn_cachep = kmem_cache_create("nf_conncount_tuple",
+					   sizeof(struct nf_conncount_tuple),
+					   0, 0, NULL);
+	if (!conncount_conn_cachep)
+		return -ENOMEM;
+
+	conncount_rb_cachep = kmem_cache_create("nf_conncount_rb",
+					   sizeof(struct nf_conncount_rb),
+					   0, 0, NULL);
+	if (!conncount_rb_cachep) {
+		kmem_cache_destroy(conncount_conn_cachep);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static void __exit nf_conncount_modexit(void)
+{
+	kmem_cache_destroy(conncount_conn_cachep);
+	kmem_cache_destroy(conncount_rb_cachep);
+}
+
+module_init(nf_conncount_modinit);
+module_exit(nf_conncount_modexit);
+MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>");
+MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
+MODULE_DESCRIPTION("netfilter: count number of connections matching a key");
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c
index a6214f235333..b1b17b9353e1 100644
--- a/net/netfilter/xt_connlimit.c
+++ b/net/netfilter/xt_connlimit.c
@@ -12,292 +12,30 @@
  * GPL (C) 1999  Rusty Russell (rusty@rustcorp.com.au).
  */
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <linux/in.h>
-#include <linux/in6.h>
-#include <linux/ip.h>
-#include <linux/ipv6.h>
-#include <linux/jhash.h>
-#include <linux/slab.h>
-#include <linux/list.h>
-#include <linux/rbtree.h>
+
 #include <linux/module.h>
-#include <linux/random.h>
 #include <linux/skbuff.h>
-#include <linux/spinlock.h>
-#include <linux/netfilter/nf_conntrack_tcp.h>
 #include <linux/netfilter/x_tables.h>
 #include <linux/netfilter/xt_connlimit.h>
+
 #include <net/netfilter/nf_conntrack.h>
 #include <net/netfilter/nf_conntrack_core.h>
 #include <net/netfilter/nf_conntrack_tuple.h>
 #include <net/netfilter/nf_conntrack_zones.h>
-
-#define CONNLIMIT_SLOTS		256U
-
-#ifdef CONFIG_LOCKDEP
-#define CONNLIMIT_LOCK_SLOTS	8U
-#else
-#define CONNLIMIT_LOCK_SLOTS	256U
-#endif
-
-#define CONNLIMIT_GC_MAX_NODES	8
-
-/* we will save the tuples of all connections we care about */
-struct xt_connlimit_conn {
-	struct hlist_node		node;
-	struct nf_conntrack_tuple	tuple;
-};
-
-struct xt_connlimit_rb {
-	struct rb_node node;
-	struct hlist_head hhead; /* connections/hosts in same subnet */
-	union nf_inet_addr addr; /* search key */
-};
-
-static spinlock_t xt_connlimit_locks[CONNLIMIT_LOCK_SLOTS] __cacheline_aligned_in_smp;
-
-struct xt_connlimit_data {
-	struct rb_root climit_root[CONNLIMIT_SLOTS];
-};
-
-static u_int32_t connlimit_rnd __read_mostly;
-static struct kmem_cache *connlimit_rb_cachep __read_mostly;
-static struct kmem_cache *connlimit_conn_cachep __read_mostly;
-
-static inline unsigned int connlimit_iphash(__be32 addr)
-{
-	return jhash_1word((__force __u32)addr,
-			    connlimit_rnd) % CONNLIMIT_SLOTS;
-}
-
-static inline unsigned int
-connlimit_iphash6(const union nf_inet_addr *addr)
-{
-	return jhash2((u32 *)addr->ip6, ARRAY_SIZE(addr->ip6),
-		       connlimit_rnd) % CONNLIMIT_SLOTS;
-}
-
-static inline bool already_closed(const struct nf_conn *conn)
-{
-	if (nf_ct_protonum(conn) == IPPROTO_TCP)
-		return conn->proto.tcp.state == TCP_CONNTRACK_TIME_WAIT ||
-		       conn->proto.tcp.state == TCP_CONNTRACK_CLOSE;
-	else
-		return 0;
-}
-
-static int
-same_source(const union nf_inet_addr *addr,
-	    const union nf_inet_addr *u3, u_int8_t family)
-{
-	if (family == NFPROTO_IPV4)
-		return ntohl(addr->ip) - ntohl(u3->ip);
-
-	return memcmp(addr->ip6, u3->ip6, sizeof(addr->ip6));
-}
-
-static bool add_hlist(struct hlist_head *head,
-		      const struct nf_conntrack_tuple *tuple,
-		      const union nf_inet_addr *addr)
-{
-	struct xt_connlimit_conn *conn;
-
-	conn = kmem_cache_alloc(connlimit_conn_cachep, GFP_ATOMIC);
-	if (conn == NULL)
-		return false;
-	conn->tuple = *tuple;
-	hlist_add_head(&conn->node, head);
-	return true;
-}
-
-static unsigned int check_hlist(struct net *net,
-				struct hlist_head *head,
-				const struct nf_conntrack_tuple *tuple,
-				const struct nf_conntrack_zone *zone,
-				bool *addit)
-{
-	const struct nf_conntrack_tuple_hash *found;
-	struct xt_connlimit_conn *conn;
-	struct hlist_node *n;
-	struct nf_conn *found_ct;
-	unsigned int length = 0;
-
-	*addit = true;
-
-	/* check the saved connections */
-	hlist_for_each_entry_safe(conn, n, head, node) {
-		found = nf_conntrack_find_get(net, zone, &conn->tuple);
-		if (found == NULL) {
-			hlist_del(&conn->node);
-			kmem_cache_free(connlimit_conn_cachep, conn);
-			continue;
-		}
-
-		found_ct = nf_ct_tuplehash_to_ctrack(found);
-
-		if (nf_ct_tuple_equal(&conn->tuple, tuple)) {
-			/*
-			 * Just to be sure we have it only once in the list.
-			 * We should not see tuples twice unless someone hooks
-			 * this into a table without "-p tcp --syn".
-			 */
-			*addit = false;
-		} else if (already_closed(found_ct)) {
-			/*
-			 * we do not care about connections which are
-			 * closed already -> ditch it
-			 */
-			nf_ct_put(found_ct);
-			hlist_del(&conn->node);
-			kmem_cache_free(connlimit_conn_cachep, conn);
-			continue;
-		}
-
-		nf_ct_put(found_ct);
-		length++;
-	}
-
-	return length;
-}
-
-static void tree_nodes_free(struct rb_root *root,
-			    struct xt_connlimit_rb *gc_nodes[],
-			    unsigned int gc_count)
-{
-	struct xt_connlimit_rb *rbconn;
-
-	while (gc_count) {
-		rbconn = gc_nodes[--gc_count];
-		rb_erase(&rbconn->node, root);
-		kmem_cache_free(connlimit_rb_cachep, rbconn);
-	}
-}
-
-static unsigned int
-count_tree(struct net *net, struct rb_root *root,
-	   const struct nf_conntrack_tuple *tuple,
-	   const union nf_inet_addr *addr,
-	   u8 family, const struct nf_conntrack_zone *zone)
-{
-	struct xt_connlimit_rb *gc_nodes[CONNLIMIT_GC_MAX_NODES];
-	struct rb_node **rbnode, *parent;
-	struct xt_connlimit_rb *rbconn;
-	struct xt_connlimit_conn *conn;
-	unsigned int gc_count;
-	bool no_gc = false;
-
- restart:
-	gc_count = 0;
-	parent = NULL;
-	rbnode = &(root->rb_node);
-	while (*rbnode) {
-		int diff;
-		bool addit;
-
-		rbconn = rb_entry(*rbnode, struct xt_connlimit_rb, node);
-
-		parent = *rbnode;
-		diff = same_source(addr, &rbconn->addr, family);
-		if (diff < 0) {
-			rbnode = &((*rbnode)->rb_left);
-		} else if (diff > 0) {
-			rbnode = &((*rbnode)->rb_right);
-		} else {
-			/* same source network -> be counted! */
-			unsigned int count;
-			count = check_hlist(net, &rbconn->hhead, tuple, zone, &addit);
-
-			tree_nodes_free(root, gc_nodes, gc_count);
-			if (!addit)
-				return count;
-
-			if (!add_hlist(&rbconn->hhead, tuple, addr))
-				return 0; /* hotdrop */
-
-			return count + 1;
-		}
-
-		if (no_gc || gc_count >= ARRAY_SIZE(gc_nodes))
-			continue;
-
-		/* only used for GC on hhead, retval and 'addit' ignored */
-		check_hlist(net, &rbconn->hhead, tuple, zone, &addit);
-		if (hlist_empty(&rbconn->hhead))
-			gc_nodes[gc_count++] = rbconn;
-	}
-
-	if (gc_count) {
-		no_gc = true;
-		tree_nodes_free(root, gc_nodes, gc_count);
-		/* tree_node_free before new allocation permits
-		 * allocator to re-use newly free'd object.
-		 *
-		 * This is a rare event; in most cases we will find
-		 * existing node to re-use. (or gc_count is 0).
-		 */
-		goto restart;
-	}
-
-	/* no match, need to insert new node */
-	rbconn = kmem_cache_alloc(connlimit_rb_cachep, GFP_ATOMIC);
-	if (rbconn == NULL)
-		return 0;
-
-	conn = kmem_cache_alloc(connlimit_conn_cachep, GFP_ATOMIC);
-	if (conn == NULL) {
-		kmem_cache_free(connlimit_rb_cachep, rbconn);
-		return 0;
-	}
-
-	conn->tuple = *tuple;
-	rbconn->addr = *addr;
-
-	INIT_HLIST_HEAD(&rbconn->hhead);
-	hlist_add_head(&conn->node, &rbconn->hhead);
-
-	rb_link_node(&rbconn->node, parent, rbnode);
-	rb_insert_color(&rbconn->node, root);
-	return 1;
-}
-
-static int count_them(struct net *net,
-		      struct xt_connlimit_data *data,
-		      const struct nf_conntrack_tuple *tuple,
-		      const union nf_inet_addr *addr,
-		      u_int8_t family,
-		      const struct nf_conntrack_zone *zone)
-{
-	struct rb_root *root;
-	int count;
-	u32 hash;
-
-	if (family == NFPROTO_IPV6)
-		hash = connlimit_iphash6(addr);
-	else
-		hash = connlimit_iphash(addr->ip);
-	root = &data->climit_root[hash];
-
-	spin_lock_bh(&xt_connlimit_locks[hash % CONNLIMIT_LOCK_SLOTS]);
-
-	count = count_tree(net, root, tuple, addr, family, zone);
-
-	spin_unlock_bh(&xt_connlimit_locks[hash % CONNLIMIT_LOCK_SLOTS]);
-
-	return count;
-}
+#include <net/netfilter/nf_conntrack_count.h>
 
 static bool
 connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
 {
 	struct net *net = xt_net(par);
 	const struct xt_connlimit_info *info = par->matchinfo;
-	union nf_inet_addr addr;
 	struct nf_conntrack_tuple tuple;
 	const struct nf_conntrack_tuple *tuple_ptr = &tuple;
 	const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt;
 	enum ip_conntrack_info ctinfo;
 	const struct nf_conn *ct;
 	unsigned int connections;
+	u32 key[5];
 
 	ct = nf_ct_get(skb, &ctinfo);
 	if (ct != NULL) {
@@ -310,6 +48,7 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
 
 	if (xt_family(par) == NFPROTO_IPV6) {
 		const struct ipv6hdr *iph = ipv6_hdr(skb);
+		union nf_inet_addr addr;
 		unsigned int i;
 
 		memcpy(&addr.ip6, (info->flags & XT_CONNLIMIT_DADDR) ?
@@ -317,22 +56,24 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
 
 		for (i = 0; i < ARRAY_SIZE(addr.ip6); ++i)
 			addr.ip6[i] &= info->mask.ip6[i];
+		memcpy(key, &addr, sizeof(addr.ip6));
+		key[4] = zone->id;
 	} else {
 		const struct iphdr *iph = ip_hdr(skb);
-		addr.ip = (info->flags & XT_CONNLIMIT_DADDR) ?
+		key[0] = (info->flags & XT_CONNLIMIT_DADDR) ?
 			  iph->daddr : iph->saddr;
 
-		addr.ip &= info->mask.ip;
+		key[0] &= info->mask.ip;
+		key[1] = zone->id;
 	}
 
-	connections = count_them(net, info->data, tuple_ptr, &addr,
-				 xt_family(par), zone);
+	connections = nf_conncount_count(net, info->data, key,
+					 xt_family(par), tuple_ptr, zone);
 	if (connections == 0)
 		/* kmalloc failed, drop it entirely */
 		goto hotdrop;
 
-	return (connections > info->limit) ^
-	       !!(info->flags & XT_CONNLIMIT_INVERT);
+	return (connections > info->limit) ^ !!(info->flags & XT_CONNLIMIT_INVERT);
 
  hotdrop:
 	par->hotdrop = true;
@@ -342,61 +83,27 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
 static int connlimit_mt_check(const struct xt_mtchk_param *par)
 {
 	struct xt_connlimit_info *info = par->matchinfo;
-	unsigned int i;
-	int ret;
+	unsigned int keylen;
 
-	net_get_random_once(&connlimit_rnd, sizeof(connlimit_rnd));
-
-	ret = nf_ct_netns_get(par->net, par->family);
-	if (ret < 0) {
-		pr_info("cannot load conntrack support for "
-			"address family %u\n", par->family);
-		return ret;
-	}
+	keylen = sizeof(u32);
+	if (par->family == NFPROTO_IPV6)
+		keylen += sizeof(struct in6_addr);
+	else
+		keylen += sizeof(struct in_addr);
 
 	/* init private data */
-	info->data = kmalloc(sizeof(struct xt_connlimit_data), GFP_KERNEL);
-	if (info->data == NULL) {
-		nf_ct_netns_put(par->net, par->family);
-		return -ENOMEM;
-	}
-
-	for (i = 0; i < ARRAY_SIZE(info->data->climit_root); ++i)
-		info->data->climit_root[i] = RB_ROOT;
+	info->data = nf_conncount_init(par->net, par->family, keylen);
+	if (IS_ERR(info->data))
+		return PTR_ERR(info->data);
 
 	return 0;
 }
 
-static void destroy_tree(struct rb_root *r)
-{
-	struct xt_connlimit_conn *conn;
-	struct xt_connlimit_rb *rbconn;
-	struct hlist_node *n;
-	struct rb_node *node;
-
-	while ((node = rb_first(r)) != NULL) {
-		rbconn = rb_entry(node, struct xt_connlimit_rb, node);
-
-		rb_erase(node, r);
-
-		hlist_for_each_entry_safe(conn, n, &rbconn->hhead, node)
-			kmem_cache_free(connlimit_conn_cachep, conn);
-
-		kmem_cache_free(connlimit_rb_cachep, rbconn);
-	}
-}
-
 static void connlimit_mt_destroy(const struct xt_mtdtor_param *par)
 {
 	const struct xt_connlimit_info *info = par->matchinfo;
-	unsigned int i;
-
-	nf_ct_netns_put(par->net, par->family);
-
-	for (i = 0; i < ARRAY_SIZE(info->data->climit_root); ++i)
-		destroy_tree(&info->data->climit_root[i]);
 
-	kfree(info->data);
+	nf_conncount_destroy(par->net, par->family, info->data);
 }
 
 static struct xt_match connlimit_mt_reg __read_mostly = {
@@ -413,40 +120,12 @@ static struct xt_match connlimit_mt_reg __read_mostly = {
 
 static int __init connlimit_mt_init(void)
 {
-	int ret, i;
-
-	BUILD_BUG_ON(CONNLIMIT_LOCK_SLOTS > CONNLIMIT_SLOTS);
-	BUILD_BUG_ON((CONNLIMIT_SLOTS % CONNLIMIT_LOCK_SLOTS) != 0);
-
-	for (i = 0; i < CONNLIMIT_LOCK_SLOTS; ++i)
-		spin_lock_init(&xt_connlimit_locks[i]);
-
-	connlimit_conn_cachep = kmem_cache_create("xt_connlimit_conn",
-					   sizeof(struct xt_connlimit_conn),
-					   0, 0, NULL);
-	if (!connlimit_conn_cachep)
-		return -ENOMEM;
-
-	connlimit_rb_cachep = kmem_cache_create("xt_connlimit_rb",
-					   sizeof(struct xt_connlimit_rb),
-					   0, 0, NULL);
-	if (!connlimit_rb_cachep) {
-		kmem_cache_destroy(connlimit_conn_cachep);
-		return -ENOMEM;
-	}
-	ret = xt_register_match(&connlimit_mt_reg);
-	if (ret != 0) {
-		kmem_cache_destroy(connlimit_conn_cachep);
-		kmem_cache_destroy(connlimit_rb_cachep);
-	}
-	return ret;
+	return xt_register_match(&connlimit_mt_reg);
 }
 
 static void __exit connlimit_mt_exit(void)
 {
 	xt_unregister_match(&connlimit_mt_reg);
-	kmem_cache_destroy(connlimit_conn_cachep);
-	kmem_cache_destroy(connlimit_rb_cachep);
 }
 
 module_init(connlimit_mt_init);
-- 
cgit v1.2.3


From f6931f5f5b713705c3cc91e4f9c222f2b181e2ef Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 6 Dec 2017 16:18:16 +0100
Subject: netfilter: meta: secpath support

This is a replacement for iptables "-m policy --dir in --policy {ipsec,none}".

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h |  2 ++
 net/netfilter/nft_meta.c                 | 43 ++++++++++++++++++++++++++++++++
 2 files changed, 45 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index a3ee277b17a1..2efbf9744c2a 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -777,6 +777,7 @@ enum nft_exthdr_attributes {
  * @NFT_META_OIFGROUP: packet output interface group
  * @NFT_META_CGROUP: socket control group (skb->sk->sk_classid)
  * @NFT_META_PRANDOM: a 32bit pseudo-random number
+ * @NFT_META_SECPATH: boolean, secpath_exists (!!skb->sp)
  */
 enum nft_meta_keys {
 	NFT_META_LEN,
@@ -804,6 +805,7 @@ enum nft_meta_keys {
 	NFT_META_OIFGROUP,
 	NFT_META_CGROUP,
 	NFT_META_PRANDOM,
+	NFT_META_SECPATH,
 };
 
 /**
diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c
index 5a60eb23a7ed..1a91e676f13e 100644
--- a/net/netfilter/nft_meta.c
+++ b/net/netfilter/nft_meta.c
@@ -210,6 +210,11 @@ void nft_meta_get_eval(const struct nft_expr *expr,
 		*dest = prandom_u32_state(state);
 		break;
 	}
+#ifdef CONFIG_XFRM
+	case NFT_META_SECPATH:
+		nft_reg_store8(dest, !!skb->sp);
+		break;
+#endif
 	default:
 		WARN_ON(1);
 		goto err;
@@ -308,6 +313,11 @@ int nft_meta_get_init(const struct nft_ctx *ctx,
 		prandom_init_once(&nft_prandom_state);
 		len = sizeof(u32);
 		break;
+#ifdef CONFIG_XFRM
+	case NFT_META_SECPATH:
+		len = sizeof(u8);
+		break;
+#endif
 	default:
 		return -EOPNOTSUPP;
 	}
@@ -318,6 +328,38 @@ int nft_meta_get_init(const struct nft_ctx *ctx,
 }
 EXPORT_SYMBOL_GPL(nft_meta_get_init);
 
+static int nft_meta_get_validate(const struct nft_ctx *ctx,
+				 const struct nft_expr *expr,
+				 const struct nft_data **data)
+{
+#ifdef CONFIG_XFRM
+	const struct nft_meta *priv = nft_expr_priv(expr);
+	unsigned int hooks;
+
+	if (priv->key != NFT_META_SECPATH)
+		return 0;
+
+	switch (ctx->afi->family) {
+	case NFPROTO_NETDEV:
+		hooks = 1 << NF_NETDEV_INGRESS;
+		break;
+	case NFPROTO_IPV4:
+	case NFPROTO_IPV6:
+	case NFPROTO_INET:
+		hooks = (1 << NF_INET_PRE_ROUTING) |
+			(1 << NF_INET_LOCAL_IN) |
+			(1 << NF_INET_FORWARD);
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	return nft_chain_validate_hooks(ctx->chain, hooks);
+#else
+	return 0;
+#endif
+}
+
 int nft_meta_set_validate(const struct nft_ctx *ctx,
 			  const struct nft_expr *expr,
 			  const struct nft_data **data)
@@ -434,6 +476,7 @@ static const struct nft_expr_ops nft_meta_get_ops = {
 	.eval		= nft_meta_get_eval,
 	.init		= nft_meta_get_init,
 	.dump		= nft_meta_get_dump,
+	.validate	= nft_meta_get_validate,
 };
 
 static const struct nft_expr_ops nft_meta_set_ops = {
-- 
cgit v1.2.3


From 90964016e5d34758033e75884e41d68ccb93212e Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Sun, 7 Jan 2018 01:03:56 +0100
Subject: netfilter: nf_conntrack: add IPS_OFFLOAD status bit

This new bit tells us that the conntrack entry is owned by the flow
table offload infrastructure.

 # cat /proc/net/nf_conntrack
 ipv4     2 tcp      6 src=10.141.10.2 dst=147.75.205.195 sport=36392 dport=443 src=147.75.205.195 dst=192.168.2.195 sport=443 dport=36392 [OFFLOAD] mark=0 zone=0 use=2

Note the [OFFLOAD] tag in the listing.

The timers of such conntrack entries look stopped from userspace.
In practise, to make sure the conntrack entry does not go away, the
conntrack timer is periodically set to an arbitrary large value that
gets refreshed on every iteration from the garbage collector, so it
never expires- and they display no internal state in the case of TCP
flows. This allows us to save a bitcheck from the packet path via
nf_ct_is_expired().

Conntrack entries that have been offloaded to the flow table
infrastructure cannot be deleted/flushed via ctnetlink. The flow table
infrastructure is also responsible for releasing this conntrack entry.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_conntrack_common.h |  6 +++++-
 net/netfilter/nf_conntrack_core.c                  | 20 ++++++++++++++++++++
 net/netfilter/nf_conntrack_netlink.c               | 15 ++++++++++++++-
 net/netfilter/nf_conntrack_proto_tcp.c             |  3 +++
 net/netfilter/nf_conntrack_standalone.c            | 12 ++++++++----
 5 files changed, 50 insertions(+), 6 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/netfilter/nf_conntrack_common.h b/include/uapi/linux/netfilter/nf_conntrack_common.h
index 3fea7709a441..fc8c15a24a43 100644
--- a/include/uapi/linux/netfilter/nf_conntrack_common.h
+++ b/include/uapi/linux/netfilter/nf_conntrack_common.h
@@ -101,12 +101,16 @@ enum ip_conntrack_status {
 	IPS_HELPER_BIT = 13,
 	IPS_HELPER = (1 << IPS_HELPER_BIT),
 
+	/* Conntrack has been offloaded to flow table. */
+	IPS_OFFLOAD_BIT = 14,
+	IPS_OFFLOAD = (1 << IPS_OFFLOAD_BIT),
+
 	/* Be careful here, modifying these bits can make things messy,
 	 * so don't let users modify them directly.
 	 */
 	IPS_UNCHANGEABLE_MASK = (IPS_NAT_DONE_MASK | IPS_NAT_MASK |
 				 IPS_EXPECTED | IPS_CONFIRMED | IPS_DYING |
-				 IPS_SEQ_ADJUST | IPS_TEMPLATE),
+				 IPS_SEQ_ADJUST | IPS_TEMPLATE | IPS_OFFLOAD),
 
 	__IPS_MAX_BIT = 14,
 };
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 85f643c1e227..6a64d528d076 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -901,6 +901,9 @@ static unsigned int early_drop_list(struct net *net,
 	hlist_nulls_for_each_entry_rcu(h, n, head, hnnode) {
 		tmp = nf_ct_tuplehash_to_ctrack(h);
 
+		if (test_bit(IPS_OFFLOAD_BIT, &tmp->status))
+			continue;
+
 		if (nf_ct_is_expired(tmp)) {
 			nf_ct_gc_expired(tmp);
 			continue;
@@ -975,6 +978,18 @@ static bool gc_worker_can_early_drop(const struct nf_conn *ct)
 	return false;
 }
 
+#define	DAY	(86400 * HZ)
+
+/* Set an arbitrary timeout large enough not to ever expire, this saves
+ * us a check for the IPS_OFFLOAD_BIT from the packet path via
+ * nf_ct_is_expired().
+ */
+static void nf_ct_offload_timeout(struct nf_conn *ct)
+{
+	if (nf_ct_expires(ct) < DAY / 2)
+		ct->timeout = nfct_time_stamp + DAY;
+}
+
 static void gc_worker(struct work_struct *work)
 {
 	unsigned int min_interval = max(HZ / GC_MAX_BUCKETS_DIV, 1u);
@@ -1011,6 +1026,11 @@ static void gc_worker(struct work_struct *work)
 			tmp = nf_ct_tuplehash_to_ctrack(h);
 
 			scanned++;
+			if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) {
+				nf_ct_offload_timeout(tmp);
+				continue;
+			}
+
 			if (nf_ct_is_expired(tmp)) {
 				nf_ct_gc_expired(tmp);
 				expired_count++;
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 316bbdc4a158..7c7921a53b13 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -1110,6 +1110,14 @@ static const struct nla_policy ct_nla_policy[CTA_MAX+1] = {
 				    .len = NF_CT_LABELS_MAX_SIZE },
 };
 
+static int ctnetlink_flush_iterate(struct nf_conn *ct, void *data)
+{
+	if (test_bit(IPS_OFFLOAD_BIT, &ct->status))
+		return 0;
+
+	return ctnetlink_filter_match(ct, data);
+}
+
 static int ctnetlink_flush_conntrack(struct net *net,
 				     const struct nlattr * const cda[],
 				     u32 portid, int report)
@@ -1122,7 +1130,7 @@ static int ctnetlink_flush_conntrack(struct net *net,
 			return PTR_ERR(filter);
 	}
 
-	nf_ct_iterate_cleanup_net(net, ctnetlink_filter_match, filter,
+	nf_ct_iterate_cleanup_net(net, ctnetlink_flush_iterate, filter,
 				  portid, report);
 	kfree(filter);
 
@@ -1168,6 +1176,11 @@ static int ctnetlink_del_conntrack(struct net *net, struct sock *ctnl,
 
 	ct = nf_ct_tuplehash_to_ctrack(h);
 
+	if (test_bit(IPS_OFFLOAD_BIT, &ct->status)) {
+		nf_ct_put(ct);
+		return -EBUSY;
+	}
+
 	if (cda[CTA_ID]) {
 		u_int32_t id = ntohl(nla_get_be32(cda[CTA_ID]));
 		if (id != (u32)(unsigned long)ct) {
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 684cc29010a0..e97cdc1cf98c 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -305,6 +305,9 @@ static bool tcp_invert_tuple(struct nf_conntrack_tuple *tuple,
 /* Print out the private part of the conntrack. */
 static void tcp_print_conntrack(struct seq_file *s, struct nf_conn *ct)
 {
+	if (test_bit(IPS_OFFLOAD_BIT, &ct->status))
+		return;
+
 	seq_printf(s, "%s ", tcp_conntrack_names[ct->proto.tcp.state]);
 }
 #endif
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 5a101caa3e12..46d32baad095 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -309,10 +309,12 @@ static int ct_seq_show(struct seq_file *s, void *v)
 	WARN_ON(!l4proto);
 
 	ret = -ENOSPC;
-	seq_printf(s, "%-8s %u %-8s %u %ld ",
+	seq_printf(s, "%-8s %u %-8s %u ",
 		   l3proto_name(l3proto->l3proto), nf_ct_l3num(ct),
-		   l4proto_name(l4proto->l4proto), nf_ct_protonum(ct),
-		   nf_ct_expires(ct)  / HZ);
+		   l4proto_name(l4proto->l4proto), nf_ct_protonum(ct));
+
+	if (!test_bit(IPS_OFFLOAD_BIT, &ct->status))
+		seq_printf(s, "%ld ", nf_ct_expires(ct)  / HZ);
 
 	if (l4proto->print_conntrack)
 		l4proto->print_conntrack(s, ct);
@@ -339,7 +341,9 @@ static int ct_seq_show(struct seq_file *s, void *v)
 	if (seq_print_acct(s, ct, IP_CT_DIR_REPLY))
 		goto release;
 
-	if (test_bit(IPS_ASSURED_BIT, &ct->status))
+	if (test_bit(IPS_OFFLOAD_BIT, &ct->status))
+		seq_puts(s, "[OFFLOAD] ");
+	else if (test_bit(IPS_ASSURED_BIT, &ct->status))
 		seq_puts(s, "[ASSURED] ");
 
 	if (seq_has_overflowed(s))
-- 
cgit v1.2.3


From 3b49e2e94e6ebb8b23d0955d9e898254455734f8 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Sun, 7 Jan 2018 01:04:07 +0100
Subject: netfilter: nf_tables: add flow table netlink frontend

This patch introduces a netlink control plane to create, delete and dump
flow tables. Flow tables are identified by name, this name is used from
rules to refer to a specific flow table. Flow tables use the rhashtable
class and a generic garbage collector to remove expired entries.

This also adds the infrastructure to add different flow table types, so
we can add one for each layer 3 protocol family.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_flow_table.h    |  23 +
 include/net/netfilter/nf_tables.h        |  48 ++
 include/uapi/linux/netfilter/nf_tables.h |  53 +++
 net/netfilter/nf_tables_api.c            | 747 ++++++++++++++++++++++++++++++-
 4 files changed, 870 insertions(+), 1 deletion(-)
 create mode 100644 include/net/netfilter/nf_flow_table.h

(limited to 'include/uapi/linux')

diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h
new file mode 100644
index 000000000000..3a0779589281
--- /dev/null
+++ b/include/net/netfilter/nf_flow_table.h
@@ -0,0 +1,23 @@
+#ifndef _NF_FLOW_TABLE_H
+#define _NF_FLOW_TABLE_H
+
+#include <linux/rhashtable.h>
+
+struct nf_flowtable;
+
+struct nf_flowtable_type {
+	struct list_head		list;
+	int				family;
+	void				(*gc)(struct work_struct *work);
+	const struct rhashtable_params	*params;
+	nf_hookfn			*hook;
+	struct module			*owner;
+};
+
+struct nf_flowtable {
+	struct rhashtable		rhashtable;
+	const struct nf_flowtable_type	*type;
+	struct delayed_work		gc_work;
+};
+
+#endif /* _NF_FLOW_TABLE_H */
diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index e3ec02fd0f67..dd238950df81 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -9,6 +9,7 @@
 #include <linux/netfilter/x_tables.h>
 #include <linux/netfilter/nf_tables.h>
 #include <linux/u64_stats_sync.h>
+#include <net/netfilter/nf_flow_table.h>
 #include <net/netlink.h>
 
 #define NFT_JUMP_STACK_SIZE	16
@@ -943,6 +944,7 @@ unsigned int nft_do_chain(struct nft_pktinfo *pkt, void *priv);
  *	@chains: chains in the table
  *	@sets: sets in the table
  *	@objects: stateful objects in the table
+ *	@flowtables: flow tables in the table
  *	@hgenerator: handle generator state
  *	@use: number of chain references to this table
  *	@flags: table flag (see enum nft_table_flags)
@@ -954,6 +956,7 @@ struct nft_table {
 	struct list_head		chains;
 	struct list_head		sets;
 	struct list_head		objects;
+	struct list_head		flowtables;
 	u64				hgenerator;
 	u32				use;
 	u16				flags:14,
@@ -1084,6 +1087,44 @@ struct nft_object_ops {
 int nft_register_obj(struct nft_object_type *obj_type);
 void nft_unregister_obj(struct nft_object_type *obj_type);
 
+/**
+ *	struct nft_flowtable - nf_tables flow table
+ *
+ *	@list: flow table list node in table list
+ * 	@table: the table the flow table is contained in
+ *	@name: name of this flow table
+ *	@hooknum: hook number
+ *	@priority: hook priority
+ *	@ops_len: number of hooks in array
+ *	@genmask: generation mask
+ *	@use: number of references to this flow table
+ *	@data: rhashtable and garbage collector
+ * 	@ops: array of hooks
+ */
+struct nft_flowtable {
+	struct list_head		list;
+	struct nft_table		*table;
+	char				*name;
+	int				hooknum;
+	int				priority;
+	int				ops_len;
+	u32				genmask:2,
+					use:30;
+	/* runtime data below here */
+	struct nf_hook_ops		*ops ____cacheline_aligned;
+	struct nf_flowtable		data;
+};
+
+struct nft_flowtable *nf_tables_flowtable_lookup(const struct nft_table *table,
+						 const struct nlattr *nla,
+						 u8 genmask);
+void nft_flow_table_iterate(struct net *net,
+			    void (*iter)(struct nf_flowtable *flowtable, void *data),
+			    void *data);
+
+void nft_register_flowtable_type(struct nf_flowtable_type *type);
+void nft_unregister_flowtable_type(struct nf_flowtable_type *type);
+
 /**
  *	struct nft_traceinfo - nft tracing information and state
  *
@@ -1317,4 +1358,11 @@ struct nft_trans_obj {
 #define nft_trans_obj(trans)	\
 	(((struct nft_trans_obj *)trans->data)->obj)
 
+struct nft_trans_flowtable {
+	struct nft_flowtable		*flowtable;
+};
+
+#define nft_trans_flowtable(trans)	\
+	(((struct nft_trans_flowtable *)trans->data)->flowtable)
+
 #endif /* _NET_NF_TABLES_H */
diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 2efbf9744c2a..591b53bce070 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -92,6 +92,9 @@ enum nft_verdicts {
  * @NFT_MSG_GETOBJ: get a stateful object (enum nft_obj_attributes)
  * @NFT_MSG_DELOBJ: delete a stateful object (enum nft_obj_attributes)
  * @NFT_MSG_GETOBJ_RESET: get and reset a stateful object (enum nft_obj_attributes)
+ * @NFT_MSG_NEWFLOWTABLE: add new flow table (enum nft_flowtable_attributes)
+ * @NFT_MSG_GETFLOWTABLE: get flow table (enum nft_flowtable_attributes)
+ * @NFT_MSG_DELFLOWTABLE: delete flow table (enum nft_flowtable_attributes)
  */
 enum nf_tables_msg_types {
 	NFT_MSG_NEWTABLE,
@@ -116,6 +119,9 @@ enum nf_tables_msg_types {
 	NFT_MSG_GETOBJ,
 	NFT_MSG_DELOBJ,
 	NFT_MSG_GETOBJ_RESET,
+	NFT_MSG_NEWFLOWTABLE,
+	NFT_MSG_GETFLOWTABLE,
+	NFT_MSG_DELFLOWTABLE,
 	NFT_MSG_MAX,
 };
 
@@ -1309,6 +1315,53 @@ enum nft_object_attributes {
 };
 #define NFTA_OBJ_MAX		(__NFTA_OBJ_MAX - 1)
 
+/**
+ * enum nft_flowtable_attributes - nf_tables flow table netlink attributes
+ *
+ * @NFTA_FLOWTABLE_TABLE: name of the table containing the flow table (NLA_STRING)
+ * @NFTA_FLOWTABLE_NAME: name of this flow table (NLA_STRING)
+ * @NFTA_FLOWTABLE_HOOK: netfilter hook configuration (NLA_NESTED)
+ * @NFTA_FLOWTABLE_USE: number of references to this flow table (NLA_U32)
+ */
+enum nft_flowtable_attributes {
+	NFTA_FLOWTABLE_UNSPEC,
+	NFTA_FLOWTABLE_TABLE,
+	NFTA_FLOWTABLE_NAME,
+	NFTA_FLOWTABLE_HOOK,
+	NFTA_FLOWTABLE_USE,
+	__NFTA_FLOWTABLE_MAX
+};
+#define NFTA_FLOWTABLE_MAX	(__NFTA_FLOWTABLE_MAX - 1)
+
+/**
+ * enum nft_flowtable_hook_attributes - nf_tables flow table hook netlink attributes
+ *
+ * @NFTA_FLOWTABLE_HOOK_NUM: netfilter hook number (NLA_U32)
+ * @NFTA_FLOWTABLE_HOOK_PRIORITY: netfilter hook priority (NLA_U32)
+ * @NFTA_FLOWTABLE_HOOK_DEVS: input devices this flow table is bound to (NLA_NESTED)
+ */
+enum nft_flowtable_hook_attributes {
+	NFTA_FLOWTABLE_HOOK_UNSPEC,
+	NFTA_FLOWTABLE_HOOK_NUM,
+	NFTA_FLOWTABLE_HOOK_PRIORITY,
+	NFTA_FLOWTABLE_HOOK_DEVS,
+	__NFTA_FLOWTABLE_HOOK_MAX
+};
+#define NFTA_FLOWTABLE_HOOK_MAX	(__NFTA_FLOWTABLE_HOOK_MAX - 1)
+
+/**
+ * enum nft_devices_attributes - nf_tables device netlink attributes
+ *
+ * @NFTA_DEVICE_NAME: name of this device (NLA_STRING)
+ */
+enum nft_devices_attributes {
+	NFTA_DEVICE_UNSPEC,
+	NFTA_DEVICE_NAME,
+	__NFTA_DEVICE_MAX
+};
+#define NFTA_DEVICE_MAX		(__NFTA_DEVICE_MAX - 1)
+
+
 /**
  * enum nft_trace_attributes - nf_tables trace netlink attributes
  *
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index fa564dac66a2..db0933256ec9 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -17,6 +17,7 @@
 #include <linux/netfilter.h>
 #include <linux/netfilter/nfnetlink.h>
 #include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_flow_table.h>
 #include <net/netfilter/nf_tables_core.h>
 #include <net/netfilter/nf_tables.h>
 #include <net/net_namespace.h>
@@ -24,6 +25,7 @@
 
 static LIST_HEAD(nf_tables_expressions);
 static LIST_HEAD(nf_tables_objects);
+static LIST_HEAD(nf_tables_flowtables);
 
 /**
  *	nft_register_afinfo - register nf_tables address family info
@@ -345,6 +347,40 @@ static int nft_delobj(struct nft_ctx *ctx, struct nft_object *obj)
 	return err;
 }
 
+static int nft_trans_flowtable_add(struct nft_ctx *ctx, int msg_type,
+				   struct nft_flowtable *flowtable)
+{
+	struct nft_trans *trans;
+
+	trans = nft_trans_alloc(ctx, msg_type,
+				sizeof(struct nft_trans_flowtable));
+	if (trans == NULL)
+		return -ENOMEM;
+
+	if (msg_type == NFT_MSG_NEWFLOWTABLE)
+		nft_activate_next(ctx->net, flowtable);
+
+	nft_trans_flowtable(trans) = flowtable;
+	list_add_tail(&trans->list, &ctx->net->nft.commit_list);
+
+	return 0;
+}
+
+static int nft_delflowtable(struct nft_ctx *ctx,
+			    struct nft_flowtable *flowtable)
+{
+	int err;
+
+	err = nft_trans_flowtable_add(ctx, NFT_MSG_DELFLOWTABLE, flowtable);
+	if (err < 0)
+		return err;
+
+	nft_deactivate_next(ctx->net, flowtable);
+	ctx->table->use--;
+
+	return err;
+}
+
 /*
  * Tables
  */
@@ -728,6 +764,7 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk,
 	INIT_LIST_HEAD(&table->chains);
 	INIT_LIST_HEAD(&table->sets);
 	INIT_LIST_HEAD(&table->objects);
+	INIT_LIST_HEAD(&table->flowtables);
 	table->flags = flags;
 
 	nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla);
@@ -749,10 +786,11 @@ err1:
 
 static int nft_flush_table(struct nft_ctx *ctx)
 {
-	int err;
+	struct nft_flowtable *flowtable, *nft;
 	struct nft_chain *chain, *nc;
 	struct nft_object *obj, *ne;
 	struct nft_set *set, *ns;
+	int err;
 
 	list_for_each_entry(chain, &ctx->table->chains, list) {
 		if (!nft_is_active_next(ctx->net, chain))
@@ -778,6 +816,12 @@ static int nft_flush_table(struct nft_ctx *ctx)
 			goto out;
 	}
 
+	list_for_each_entry_safe(flowtable, nft, &ctx->table->flowtables, list) {
+		err = nft_delflowtable(ctx, flowtable);
+		if (err < 0)
+			goto out;
+	}
+
 	list_for_each_entry_safe(obj, ne, &ctx->table->objects, list) {
 		err = nft_delobj(ctx, obj);
 		if (err < 0)
@@ -4839,6 +4883,605 @@ static void nf_tables_obj_notify(const struct nft_ctx *ctx,
 		       ctx->afi->family, ctx->report, GFP_KERNEL);
 }
 
+/*
+ * Flow tables
+ */
+void nft_register_flowtable_type(struct nf_flowtable_type *type)
+{
+	nfnl_lock(NFNL_SUBSYS_NFTABLES);
+	list_add_tail_rcu(&type->list, &nf_tables_flowtables);
+	nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+}
+EXPORT_SYMBOL_GPL(nft_register_flowtable_type);
+
+void nft_unregister_flowtable_type(struct nf_flowtable_type *type)
+{
+	nfnl_lock(NFNL_SUBSYS_NFTABLES);
+	list_del_rcu(&type->list);
+	nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+}
+EXPORT_SYMBOL_GPL(nft_unregister_flowtable_type);
+
+static const struct nla_policy nft_flowtable_policy[NFTA_FLOWTABLE_MAX + 1] = {
+	[NFTA_FLOWTABLE_TABLE]		= { .type = NLA_STRING,
+					    .len = NFT_NAME_MAXLEN - 1 },
+	[NFTA_FLOWTABLE_NAME]		= { .type = NLA_STRING,
+					    .len = NFT_NAME_MAXLEN - 1 },
+	[NFTA_FLOWTABLE_HOOK]		= { .type = NLA_NESTED },
+};
+
+struct nft_flowtable *nf_tables_flowtable_lookup(const struct nft_table *table,
+						 const struct nlattr *nla,
+						 u8 genmask)
+{
+	struct nft_flowtable *flowtable;
+
+	list_for_each_entry(flowtable, &table->flowtables, list) {
+		if (!nla_strcmp(nla, flowtable->name) &&
+		    nft_active_genmask(flowtable, genmask))
+			return flowtable;
+	}
+	return ERR_PTR(-ENOENT);
+}
+EXPORT_SYMBOL_GPL(nf_tables_flowtable_lookup);
+
+#define NFT_FLOWTABLE_DEVICE_MAX	8
+
+static int nf_tables_parse_devices(const struct nft_ctx *ctx,
+				   const struct nlattr *attr,
+				   struct net_device *dev_array[], int *len)
+{
+	const struct nlattr *tmp;
+	struct net_device *dev;
+	char ifname[IFNAMSIZ];
+	int rem, n = 0, err;
+
+	nla_for_each_nested(tmp, attr, rem) {
+		if (nla_type(tmp) != NFTA_DEVICE_NAME) {
+			err = -EINVAL;
+			goto err1;
+		}
+
+		nla_strlcpy(ifname, tmp, IFNAMSIZ);
+		dev = dev_get_by_name(ctx->net, ifname);
+		if (!dev) {
+			err = -ENOENT;
+			goto err1;
+		}
+
+		dev_array[n++] = dev;
+		if (n == NFT_FLOWTABLE_DEVICE_MAX) {
+			err = -EFBIG;
+			goto err1;
+		}
+	}
+	if (!len)
+		return -EINVAL;
+
+	err = 0;
+err1:
+	*len = n;
+	return err;
+}
+
+static const struct nla_policy nft_flowtable_hook_policy[NFTA_FLOWTABLE_HOOK_MAX + 1] = {
+	[NFTA_FLOWTABLE_HOOK_NUM]	= { .type = NLA_U32 },
+	[NFTA_FLOWTABLE_HOOK_PRIORITY]	= { .type = NLA_U32 },
+	[NFTA_FLOWTABLE_HOOK_DEVS]	= { .type = NLA_NESTED },
+};
+
+static int nf_tables_flowtable_parse_hook(const struct nft_ctx *ctx,
+					  const struct nlattr *attr,
+					  struct nft_flowtable *flowtable)
+{
+	struct net_device *dev_array[NFT_FLOWTABLE_DEVICE_MAX];
+	struct nlattr *tb[NFTA_FLOWTABLE_HOOK_MAX + 1];
+	struct nf_hook_ops *ops;
+	int hooknum, priority;
+	int err, n = 0, i;
+
+	err = nla_parse_nested(tb, NFTA_FLOWTABLE_HOOK_MAX, attr,
+			       nft_flowtable_hook_policy, NULL);
+	if (err < 0)
+		return err;
+
+	if (!tb[NFTA_FLOWTABLE_HOOK_NUM] ||
+	    !tb[NFTA_FLOWTABLE_HOOK_PRIORITY] ||
+	    !tb[NFTA_FLOWTABLE_HOOK_DEVS])
+		return -EINVAL;
+
+	hooknum = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_NUM]));
+	if (hooknum >= ctx->afi->nhooks)
+		return -EINVAL;
+
+	priority = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_PRIORITY]));
+
+	err = nf_tables_parse_devices(ctx, tb[NFTA_FLOWTABLE_HOOK_DEVS],
+				      dev_array, &n);
+	if (err < 0)
+		goto err1;
+
+	ops = kzalloc(sizeof(struct nf_hook_ops) * n, GFP_KERNEL);
+	if (!ops) {
+		err = -ENOMEM;
+		goto err1;
+	}
+
+	flowtable->ops		= ops;
+	flowtable->ops_len	= n;
+
+	for (i = 0; i < n; i++) {
+		flowtable->ops[i].pf		= NFPROTO_NETDEV;
+		flowtable->ops[i].hooknum	= hooknum;
+		flowtable->ops[i].priority	= priority;
+		flowtable->ops[i].priv		= &flowtable->data.rhashtable;
+		flowtable->ops[i].hook		= flowtable->data.type->hook;
+		flowtable->ops[i].dev		= dev_array[i];
+	}
+
+	err = 0;
+err1:
+	for (i = 0; i < n; i++)
+		dev_put(dev_array[i]);
+
+	return err;
+}
+
+static const struct nf_flowtable_type *
+__nft_flowtable_type_get(const struct nft_af_info *afi)
+{
+	const struct nf_flowtable_type *type;
+
+	list_for_each_entry(type, &nf_tables_flowtables, list) {
+		if (afi->family == type->family)
+			return type;
+	}
+	return NULL;
+}
+
+static const struct nf_flowtable_type *
+nft_flowtable_type_get(const struct nft_af_info *afi)
+{
+	const struct nf_flowtable_type *type;
+
+	type = __nft_flowtable_type_get(afi);
+	if (type != NULL && try_module_get(type->owner))
+		return type;
+
+#ifdef CONFIG_MODULES
+	if (type == NULL) {
+		nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+		request_module("nf-flowtable-%u", afi->family);
+		nfnl_lock(NFNL_SUBSYS_NFTABLES);
+		if (__nft_flowtable_type_get(afi))
+			return ERR_PTR(-EAGAIN);
+	}
+#endif
+	return ERR_PTR(-ENOENT);
+}
+
+void nft_flow_table_iterate(struct net *net,
+			    void (*iter)(struct nf_flowtable *flowtable, void *data),
+			    void *data)
+{
+	struct nft_flowtable *flowtable;
+	const struct nft_af_info *afi;
+	const struct nft_table *table;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(afi, &net->nft.af_info, list) {
+		list_for_each_entry_rcu(table, &afi->tables, list) {
+			list_for_each_entry_rcu(flowtable, &table->flowtables, list) {
+				iter(&flowtable->data, data);
+			}
+		}
+	}
+	rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(nft_flow_table_iterate);
+
+static void nft_unregister_flowtable_net_hooks(struct net *net,
+					       struct nft_flowtable *flowtable)
+{
+	int i;
+
+	for (i = 0; i < flowtable->ops_len; i++) {
+		if (!flowtable->ops[i].dev)
+			continue;
+
+		nf_unregister_net_hook(net, &flowtable->ops[i]);
+	}
+}
+
+static int nf_tables_newflowtable(struct net *net, struct sock *nlsk,
+				  struct sk_buff *skb,
+				  const struct nlmsghdr *nlh,
+				  const struct nlattr * const nla[],
+				  struct netlink_ext_ack *extack)
+{
+	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	const struct nf_flowtable_type *type;
+	u8 genmask = nft_genmask_next(net);
+	int family = nfmsg->nfgen_family;
+	struct nft_flowtable *flowtable;
+	struct nft_af_info *afi;
+	struct nft_table *table;
+	struct nft_ctx ctx;
+	int err, i, k;
+
+	if (!nla[NFTA_FLOWTABLE_TABLE] ||
+	    !nla[NFTA_FLOWTABLE_NAME] ||
+	    !nla[NFTA_FLOWTABLE_HOOK])
+		return -EINVAL;
+
+	afi = nf_tables_afinfo_lookup(net, family, true);
+	if (IS_ERR(afi))
+		return PTR_ERR(afi);
+
+	table = nf_tables_table_lookup(afi, nla[NFTA_FLOWTABLE_TABLE], genmask);
+	if (IS_ERR(table))
+		return PTR_ERR(table);
+
+	flowtable = nf_tables_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME],
+					       genmask);
+	if (IS_ERR(flowtable)) {
+		err = PTR_ERR(flowtable);
+		if (err != -ENOENT)
+			return err;
+	} else {
+		if (nlh->nlmsg_flags & NLM_F_EXCL)
+			return -EEXIST;
+
+		return 0;
+	}
+
+	nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla);
+
+	flowtable = kzalloc(sizeof(*flowtable), GFP_KERNEL);
+	if (!flowtable)
+		return -ENOMEM;
+
+	flowtable->table = table;
+	flowtable->name = nla_strdup(nla[NFTA_FLOWTABLE_NAME], GFP_KERNEL);
+	if (!flowtable->name) {
+		err = -ENOMEM;
+		goto err1;
+	}
+
+	type = nft_flowtable_type_get(afi);
+	if (IS_ERR(type)) {
+		err = PTR_ERR(type);
+		goto err2;
+	}
+
+	flowtable->data.type = type;
+	err = rhashtable_init(&flowtable->data.rhashtable, type->params);
+	if (err < 0)
+		goto err3;
+
+	err = nf_tables_flowtable_parse_hook(&ctx, nla[NFTA_FLOWTABLE_HOOK],
+					     flowtable);
+	if (err < 0)
+		goto err3;
+
+	for (i = 0; i < flowtable->ops_len; i++) {
+		err = nf_register_net_hook(net, &flowtable->ops[i]);
+		if (err < 0)
+			goto err4;
+	}
+
+	err = nft_trans_flowtable_add(&ctx, NFT_MSG_NEWFLOWTABLE, flowtable);
+	if (err < 0)
+		goto err5;
+
+	INIT_DEFERRABLE_WORK(&flowtable->data.gc_work, type->gc);
+	queue_delayed_work(system_power_efficient_wq,
+			   &flowtable->data.gc_work, HZ);
+
+	list_add_tail_rcu(&flowtable->list, &table->flowtables);
+	table->use++;
+
+	return 0;
+err5:
+	i = flowtable->ops_len;
+err4:
+	for (k = i - 1; k >= 0; k--)
+		nf_unregister_net_hook(net, &flowtable->ops[k]);
+
+	kfree(flowtable->ops);
+err3:
+	module_put(type->owner);
+err2:
+	kfree(flowtable->name);
+err1:
+	kfree(flowtable);
+	return err;
+}
+
+static int nf_tables_delflowtable(struct net *net, struct sock *nlsk,
+				  struct sk_buff *skb,
+				  const struct nlmsghdr *nlh,
+				  const struct nlattr * const nla[],
+				  struct netlink_ext_ack *extack)
+{
+	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	u8 genmask = nft_genmask_next(net);
+	int family = nfmsg->nfgen_family;
+	struct nft_flowtable *flowtable;
+	struct nft_af_info *afi;
+	struct nft_table *table;
+	struct nft_ctx ctx;
+
+	afi = nf_tables_afinfo_lookup(net, family, true);
+	if (IS_ERR(afi))
+		return PTR_ERR(afi);
+
+	table = nf_tables_table_lookup(afi, nla[NFTA_FLOWTABLE_TABLE], genmask);
+	if (IS_ERR(table))
+		return PTR_ERR(table);
+
+	flowtable = nf_tables_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME],
+					       genmask);
+	if (IS_ERR(flowtable))
+                return PTR_ERR(flowtable);
+	if (flowtable->use > 0)
+		return -EBUSY;
+
+	nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla);
+
+	return nft_delflowtable(&ctx, flowtable);
+}
+
+static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net,
+					 u32 portid, u32 seq, int event,
+					 u32 flags, int family,
+					 struct nft_flowtable *flowtable)
+{
+	struct nlattr *nest, *nest_devs;
+	struct nfgenmsg *nfmsg;
+	struct nlmsghdr *nlh;
+	int i;
+
+	event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event);
+	nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags);
+	if (nlh == NULL)
+		goto nla_put_failure;
+
+	nfmsg = nlmsg_data(nlh);
+	nfmsg->nfgen_family	= family;
+	nfmsg->version		= NFNETLINK_V0;
+	nfmsg->res_id		= htons(net->nft.base_seq & 0xffff);
+
+	if (nla_put_string(skb, NFTA_FLOWTABLE_TABLE, flowtable->table->name) ||
+	    nla_put_string(skb, NFTA_FLOWTABLE_NAME, flowtable->name) ||
+	    nla_put_be32(skb, NFTA_FLOWTABLE_USE, htonl(flowtable->use)))
+		goto nla_put_failure;
+
+	nest = nla_nest_start(skb, NFTA_FLOWTABLE_HOOK);
+	if (nla_put_be32(skb, NFTA_FLOWTABLE_HOOK_NUM, htonl(flowtable->hooknum)) ||
+	    nla_put_be32(skb, NFTA_FLOWTABLE_HOOK_PRIORITY, htonl(flowtable->priority)))
+		goto nla_put_failure;
+
+	nest_devs = nla_nest_start(skb, NFTA_FLOWTABLE_HOOK_DEVS);
+	if (!nest_devs)
+		goto nla_put_failure;
+
+	for (i = 0; i < flowtable->ops_len; i++) {
+		if (flowtable->ops[i].dev &&
+		    nla_put_string(skb, NFTA_DEVICE_NAME,
+				   flowtable->ops[i].dev->name))
+			goto nla_put_failure;
+	}
+	nla_nest_end(skb, nest_devs);
+	nla_nest_end(skb, nest);
+
+	nlmsg_end(skb, nlh);
+	return 0;
+
+nla_put_failure:
+	nlmsg_trim(skb, nlh);
+	return -1;
+}
+
+struct nft_flowtable_filter {
+	char		*table;
+};
+
+static int nf_tables_dump_flowtable(struct sk_buff *skb,
+				    struct netlink_callback *cb)
+{
+	const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
+	struct nft_flowtable_filter *filter = cb->data;
+	unsigned int idx = 0, s_idx = cb->args[0];
+	struct net *net = sock_net(skb->sk);
+	int family = nfmsg->nfgen_family;
+	struct nft_flowtable *flowtable;
+	const struct nft_af_info *afi;
+	const struct nft_table *table;
+
+	rcu_read_lock();
+	cb->seq = net->nft.base_seq;
+
+	list_for_each_entry_rcu(afi, &net->nft.af_info, list) {
+		if (family != NFPROTO_UNSPEC && family != afi->family)
+			continue;
+
+		list_for_each_entry_rcu(table, &afi->tables, list) {
+			list_for_each_entry_rcu(flowtable, &table->flowtables, list) {
+				if (!nft_is_active(net, flowtable))
+					goto cont;
+				if (idx < s_idx)
+					goto cont;
+				if (idx > s_idx)
+					memset(&cb->args[1], 0,
+					       sizeof(cb->args) - sizeof(cb->args[0]));
+				if (filter && filter->table[0] &&
+				    strcmp(filter->table, table->name))
+					goto cont;
+
+				if (nf_tables_fill_flowtable_info(skb, net, NETLINK_CB(cb->skb).portid,
+								  cb->nlh->nlmsg_seq,
+								  NFT_MSG_NEWFLOWTABLE,
+								  NLM_F_MULTI | NLM_F_APPEND,
+								  afi->family, flowtable) < 0)
+					goto done;
+
+				nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+cont:
+				idx++;
+			}
+		}
+	}
+done:
+	rcu_read_unlock();
+
+	cb->args[0] = idx;
+	return skb->len;
+}
+
+static int nf_tables_dump_flowtable_done(struct netlink_callback *cb)
+{
+	struct nft_flowtable_filter *filter = cb->data;
+
+	if (!filter)
+		return 0;
+
+	kfree(filter->table);
+	kfree(filter);
+
+	return 0;
+}
+
+static struct nft_flowtable_filter *
+nft_flowtable_filter_alloc(const struct nlattr * const nla[])
+{
+	struct nft_flowtable_filter *filter;
+
+	filter = kzalloc(sizeof(*filter), GFP_KERNEL);
+	if (!filter)
+		return ERR_PTR(-ENOMEM);
+
+	if (nla[NFTA_FLOWTABLE_TABLE]) {
+		filter->table = nla_strdup(nla[NFTA_FLOWTABLE_TABLE],
+					   GFP_KERNEL);
+		if (!filter->table) {
+			kfree(filter);
+			return ERR_PTR(-ENOMEM);
+		}
+	}
+	return filter;
+}
+
+static int nf_tables_getflowtable(struct net *net, struct sock *nlsk,
+				  struct sk_buff *skb,
+				  const struct nlmsghdr *nlh,
+				  const struct nlattr * const nla[],
+				  struct netlink_ext_ack *extack)
+{
+	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	u8 genmask = nft_genmask_cur(net);
+	int family = nfmsg->nfgen_family;
+	struct nft_flowtable *flowtable;
+	const struct nft_af_info *afi;
+	const struct nft_table *table;
+	struct sk_buff *skb2;
+	int err;
+
+	if (nlh->nlmsg_flags & NLM_F_DUMP) {
+		struct netlink_dump_control c = {
+			.dump = nf_tables_dump_flowtable,
+			.done = nf_tables_dump_flowtable_done,
+		};
+
+		if (nla[NFTA_FLOWTABLE_TABLE]) {
+			struct nft_flowtable_filter *filter;
+
+			filter = nft_flowtable_filter_alloc(nla);
+			if (IS_ERR(filter))
+				return -ENOMEM;
+
+			c.data = filter;
+		}
+		return netlink_dump_start(nlsk, skb, nlh, &c);
+	}
+
+	if (!nla[NFTA_FLOWTABLE_NAME])
+		return -EINVAL;
+
+	afi = nf_tables_afinfo_lookup(net, family, false);
+	if (IS_ERR(afi))
+		return PTR_ERR(afi);
+
+	table = nf_tables_table_lookup(afi, nla[NFTA_FLOWTABLE_TABLE], genmask);
+	if (IS_ERR(table))
+		return PTR_ERR(table);
+
+	flowtable = nf_tables_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME],
+					       genmask);
+	if (IS_ERR(flowtable))
+		return PTR_ERR(flowtable);
+
+	skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!skb2)
+		return -ENOMEM;
+
+	err = nf_tables_fill_flowtable_info(skb2, net, NETLINK_CB(skb).portid,
+					    nlh->nlmsg_seq,
+					    NFT_MSG_NEWFLOWTABLE, 0, family,
+					    flowtable);
+	if (err < 0)
+		goto err;
+
+	return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid);
+err:
+	kfree_skb(skb2);
+	return err;
+}
+
+static void nf_tables_flowtable_notify(struct nft_ctx *ctx,
+				       struct nft_flowtable *flowtable,
+				       int event)
+{
+	struct sk_buff *skb;
+	int err;
+
+	if (!ctx->report &&
+	    !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES))
+		return;
+
+	skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (skb == NULL)
+		goto err;
+
+	err = nf_tables_fill_flowtable_info(skb, ctx->net, ctx->portid,
+					    ctx->seq, event, 0,
+					    ctx->afi->family, flowtable);
+	if (err < 0) {
+		kfree_skb(skb);
+		goto err;
+	}
+
+	nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES,
+		       ctx->report, GFP_KERNEL);
+	return;
+err:
+	nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS);
+}
+
+static void nft_flowtable_destroy(void *ptr, void *arg)
+{
+	kfree(ptr);
+}
+
+static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable)
+{
+	cancel_delayed_work_sync(&flowtable->data.gc_work);
+	kfree(flowtable->name);
+	rhashtable_free_and_destroy(&flowtable->data.rhashtable,
+				    nft_flowtable_destroy, NULL);
+	module_put(flowtable->data.type->owner);
+}
+
 static int nf_tables_fill_gen_info(struct sk_buff *skb, struct net *net,
 				   u32 portid, u32 seq)
 {
@@ -4869,6 +5512,49 @@ nla_put_failure:
 	return -EMSGSIZE;
 }
 
+static void nft_flowtable_event(unsigned long event, struct net_device *dev,
+				struct nft_flowtable *flowtable)
+{
+	int i;
+
+	for (i = 0; i < flowtable->ops_len; i++) {
+		if (flowtable->ops[i].dev != dev)
+			continue;
+
+		nf_unregister_net_hook(dev_net(dev), &flowtable->ops[i]);
+		flowtable->ops[i].dev = NULL;
+		break;
+	}
+}
+
+static int nf_tables_flowtable_event(struct notifier_block *this,
+				     unsigned long event, void *ptr)
+{
+	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+	struct nft_flowtable *flowtable;
+	struct nft_table *table;
+	struct nft_af_info *afi;
+
+	if (event != NETDEV_UNREGISTER)
+		return NOTIFY_DONE;	/* only device removal is handled */
+
+	nfnl_lock(NFNL_SUBSYS_NFTABLES);
+	list_for_each_entry(afi, &dev_net(dev)->nft.af_info, list) {
+		list_for_each_entry(table, &afi->tables, list) {
+			list_for_each_entry(flowtable, &table->flowtables, list) {
+				nft_flowtable_event(event, dev, flowtable);
+			}
+		}
+	}
+	nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block nf_tables_flowtable_notifier = {
+	.notifier_call	= nf_tables_flowtable_event,
+};
+
 static void nf_tables_gen_notify(struct net *net, struct sk_buff *skb,
 				 int event)
 {
@@ -5021,6 +5707,21 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
 		.attr_count	= NFTA_OBJ_MAX,
 		.policy		= nft_obj_policy,
 	},
+	[NFT_MSG_NEWFLOWTABLE] = {
+		.call_batch	= nf_tables_newflowtable,
+		.attr_count	= NFTA_FLOWTABLE_MAX,
+		.policy		= nft_flowtable_policy,
+	},
+	[NFT_MSG_GETFLOWTABLE] = {
+		.call		= nf_tables_getflowtable,
+		.attr_count	= NFTA_FLOWTABLE_MAX,
+		.policy		= nft_flowtable_policy,
+	},
+	[NFT_MSG_DELFLOWTABLE] = {
+		.call_batch	= nf_tables_delflowtable,
+		.attr_count	= NFTA_FLOWTABLE_MAX,
+		.policy		= nft_flowtable_policy,
+	},
 };
 
 static void nft_chain_commit_update(struct nft_trans *trans)
@@ -5066,6 +5767,9 @@ static void nf_tables_commit_release(struct nft_trans *trans)
 	case NFT_MSG_DELOBJ:
 		nft_obj_destroy(nft_trans_obj(trans));
 		break;
+	case NFT_MSG_DELFLOWTABLE:
+		nf_tables_flowtable_destroy(nft_trans_flowtable(trans));
+		break;
 	}
 	kfree(trans);
 }
@@ -5183,6 +5887,21 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
 			nf_tables_obj_notify(&trans->ctx, nft_trans_obj(trans),
 					     NFT_MSG_DELOBJ);
 			break;
+		case NFT_MSG_NEWFLOWTABLE:
+			nft_clear(net, nft_trans_flowtable(trans));
+			nf_tables_flowtable_notify(&trans->ctx,
+						   nft_trans_flowtable(trans),
+						   NFT_MSG_NEWFLOWTABLE);
+			nft_trans_destroy(trans);
+			break;
+		case NFT_MSG_DELFLOWTABLE:
+			list_del_rcu(&nft_trans_flowtable(trans)->list);
+			nf_tables_flowtable_notify(&trans->ctx,
+						   nft_trans_flowtable(trans),
+						   NFT_MSG_DELFLOWTABLE);
+			nft_unregister_flowtable_net_hooks(net,
+					nft_trans_flowtable(trans));
+			break;
 		}
 	}
 
@@ -5220,6 +5939,9 @@ static void nf_tables_abort_release(struct nft_trans *trans)
 	case NFT_MSG_NEWOBJ:
 		nft_obj_destroy(nft_trans_obj(trans));
 		break;
+	case NFT_MSG_NEWFLOWTABLE:
+		nf_tables_flowtable_destroy(nft_trans_flowtable(trans));
+		break;
 	}
 	kfree(trans);
 }
@@ -5309,6 +6031,17 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb)
 			nft_clear(trans->ctx.net, nft_trans_obj(trans));
 			nft_trans_destroy(trans);
 			break;
+		case NFT_MSG_NEWFLOWTABLE:
+			trans->ctx.table->use--;
+			list_del_rcu(&nft_trans_flowtable(trans)->list);
+			nft_unregister_flowtable_net_hooks(net,
+					nft_trans_flowtable(trans));
+			break;
+		case NFT_MSG_DELFLOWTABLE:
+			trans->ctx.table->use++;
+			nft_clear(trans->ctx.net, nft_trans_flowtable(trans));
+			nft_trans_destroy(trans);
+			break;
 		}
 	}
 
@@ -5865,6 +6598,7 @@ EXPORT_SYMBOL_GPL(__nft_release_basechain);
 /* Called by nft_unregister_afinfo() from __net_exit path, nfnl_lock is held. */
 static void __nft_release_afinfo(struct net *net, struct nft_af_info *afi)
 {
+	struct nft_flowtable *flowtable, *nf;
 	struct nft_table *table, *nt;
 	struct nft_chain *chain, *nc;
 	struct nft_object *obj, *ne;
@@ -5878,6 +6612,9 @@ static void __nft_release_afinfo(struct net *net, struct nft_af_info *afi)
 	list_for_each_entry_safe(table, nt, &afi->tables, list) {
 		list_for_each_entry(chain, &table->chains, list)
 			nf_tables_unregister_hook(net, table, chain);
+		list_for_each_entry(flowtable, &table->flowtables, list)
+			nf_unregister_net_hooks(net, flowtable->ops,
+						flowtable->ops_len);
 		/* No packets are walking on these chains anymore. */
 		ctx.table = table;
 		list_for_each_entry(chain, &table->chains, list) {
@@ -5888,6 +6625,11 @@ static void __nft_release_afinfo(struct net *net, struct nft_af_info *afi)
 				nf_tables_rule_destroy(&ctx, rule);
 			}
 		}
+		list_for_each_entry_safe(flowtable, nf, &table->flowtables, list) {
+			list_del(&flowtable->list);
+			table->use--;
+			nf_tables_flowtable_destroy(flowtable);
+		}
 		list_for_each_entry_safe(set, ns, &table->sets, list) {
 			list_del(&set->list);
 			table->use--;
@@ -5932,6 +6674,8 @@ static int __init nf_tables_module_init(void)
 	if (err < 0)
 		goto err3;
 
+	register_netdevice_notifier(&nf_tables_flowtable_notifier);
+
 	pr_info("nf_tables: (c) 2007-2009 Patrick McHardy <kaber@trash.net>\n");
 	return register_pernet_subsys(&nf_tables_net_ops);
 err3:
@@ -5946,6 +6690,7 @@ static void __exit nf_tables_module_exit(void)
 {
 	unregister_pernet_subsys(&nf_tables_net_ops);
 	nfnetlink_subsys_unregister(&nf_tables_subsys);
+	unregister_netdevice_notifier(&nf_tables_flowtable_notifier);
 	rcu_barrier();
 	nf_tables_core_module_exit();
 	kfree(info);
-- 
cgit v1.2.3


From a3c90f7a2323b331ae816d5b0633e68148e25d04 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Sun, 7 Jan 2018 01:04:26 +0100
Subject: netfilter: nf_tables: flow offload expression

Add new instruction for the nf_tables VM that allows us to specify what
flows are offloaded into a given flow table via name. This new
instruction creates the flow entry and adds it to the flow table.

Only established flows, ie. we have seen traffic in both directions, are
added to the flow table. You can still decide to offload entries at a
later stage via packet counting or checking the ct status in case you
want to offload assured conntracks.

This new extension depends on the conntrack subsystem.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h |  11 ++
 net/netfilter/Kconfig                    |   7 +
 net/netfilter/Makefile                   |   1 +
 net/netfilter/nft_flow_offload.c         | 264 +++++++++++++++++++++++++++++++
 4 files changed, 283 insertions(+)
 create mode 100644 net/netfilter/nft_flow_offload.c

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 591b53bce070..53e8dd2a3a03 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -957,6 +957,17 @@ enum nft_ct_attributes {
 };
 #define NFTA_CT_MAX		(__NFTA_CT_MAX - 1)
 
+/**
+ * enum nft_offload_attributes - ct offload expression attributes
+ * @NFTA_FLOW_TABLE_NAME: flow table name (NLA_STRING)
+ */
+enum nft_offload_attributes {
+	NFTA_FLOW_UNSPEC,
+	NFTA_FLOW_TABLE_NAME,
+	__NFTA_FLOW_MAX,
+};
+#define NFTA_FLOW_MAX		(__NFTA_FLOW_MAX - 1)
+
 enum nft_limit_type {
 	NFT_LIMIT_PKTS,
 	NFT_LIMIT_PKT_BYTES
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 272803079bf2..0ee0fcf3abbf 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -505,6 +505,13 @@ config NFT_CT
 	  This option adds the "ct" expression that you can use to match
 	  connection tracking information such as the flow state.
 
+config NFT_FLOW_OFFLOAD
+	depends on NF_CONNTRACK
+	tristate "Netfilter nf_tables hardware flow offload module"
+	help
+	  This option adds the "flow_offload" expression that you can use to
+	  choose what flows are placed into the hardware.
+
 config NFT_SET_RBTREE
 	tristate "Netfilter nf_tables rbtree set module"
 	help
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 061365875cde..5d9b8b959e58 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -86,6 +86,7 @@ obj-$(CONFIG_NFT_META)		+= nft_meta.o
 obj-$(CONFIG_NFT_RT)		+= nft_rt.o
 obj-$(CONFIG_NFT_NUMGEN)	+= nft_numgen.o
 obj-$(CONFIG_NFT_CT)		+= nft_ct.o
+obj-$(CONFIG_NFT_FLOW_OFFLOAD)	+= nft_flow_offload.o
 obj-$(CONFIG_NFT_LIMIT)		+= nft_limit.o
 obj-$(CONFIG_NFT_NAT)		+= nft_nat.o
 obj-$(CONFIG_NFT_OBJREF)	+= nft_objref.o
diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c
new file mode 100644
index 000000000000..dd38785dfed9
--- /dev/null
+++ b/net/netfilter/nft_flow_offload.c
@@ -0,0 +1,264 @@
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/workqueue.h>
+#include <linux/spinlock.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/ip.h> /* for ipv4 options. */
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <linux/netfilter/nf_conntrack_common.h>
+#include <net/netfilter/nf_flow_table.h>
+
+struct nft_flow_offload {
+	struct nft_flowtable	*flowtable;
+};
+
+static int nft_flow_route(const struct nft_pktinfo *pkt,
+			  const struct nf_conn *ct,
+			  struct nf_flow_route *route,
+			  enum ip_conntrack_dir dir)
+{
+	struct dst_entry *this_dst = skb_dst(pkt->skb);
+	struct dst_entry *other_dst = NULL;
+	struct flowi fl;
+
+	memset(&fl, 0, sizeof(fl));
+	switch (nft_pf(pkt)) {
+	case NFPROTO_IPV4:
+		fl.u.ip4.daddr = ct->tuplehash[!dir].tuple.dst.u3.ip;
+		break;
+	case NFPROTO_IPV6:
+		fl.u.ip6.daddr = ct->tuplehash[!dir].tuple.dst.u3.in6;
+		break;
+	}
+
+	nf_route(nft_net(pkt), &other_dst, &fl, false, nft_pf(pkt));
+	if (!other_dst)
+		return -ENOENT;
+
+	route->tuple[dir].dst		= this_dst;
+	route->tuple[dir].ifindex	= nft_in(pkt)->ifindex;
+	route->tuple[!dir].dst		= other_dst;
+	route->tuple[!dir].ifindex	= nft_out(pkt)->ifindex;
+
+	return 0;
+}
+
+static bool nft_flow_offload_skip(struct sk_buff *skb)
+{
+	struct ip_options *opt  = &(IPCB(skb)->opt);
+
+	if (unlikely(opt->optlen))
+		return true;
+	if (skb_sec_path(skb))
+		return true;
+
+	return false;
+}
+
+static void nft_flow_offload_eval(const struct nft_expr *expr,
+				  struct nft_regs *regs,
+				  const struct nft_pktinfo *pkt)
+{
+	struct nft_flow_offload *priv = nft_expr_priv(expr);
+	struct nf_flowtable *flowtable = &priv->flowtable->data;
+	enum ip_conntrack_info ctinfo;
+	struct nf_flow_route route;
+	struct flow_offload *flow;
+	enum ip_conntrack_dir dir;
+	struct nf_conn *ct;
+	int ret;
+
+	if (nft_flow_offload_skip(pkt->skb))
+		goto out;
+
+	ct = nf_ct_get(pkt->skb, &ctinfo);
+	if (!ct)
+		goto out;
+
+	switch (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum) {
+	case IPPROTO_TCP:
+	case IPPROTO_UDP:
+		break;
+	default:
+		goto out;
+	}
+
+	if (test_bit(IPS_HELPER_BIT, &ct->status))
+		goto out;
+
+	if (ctinfo == IP_CT_NEW ||
+	    ctinfo == IP_CT_RELATED)
+		goto out;
+
+	if (test_and_set_bit(IPS_OFFLOAD_BIT, &ct->status))
+		goto out;
+
+	dir = CTINFO2DIR(ctinfo);
+	if (nft_flow_route(pkt, ct, &route, dir) < 0)
+		goto err_flow_route;
+
+	flow = flow_offload_alloc(ct, &route);
+	if (!flow)
+		goto err_flow_alloc;
+
+	ret = flow_offload_add(flowtable, flow);
+	if (ret < 0)
+		goto err_flow_add;
+
+	return;
+
+err_flow_add:
+	flow_offload_free(flow);
+err_flow_alloc:
+	dst_release(route.tuple[!dir].dst);
+err_flow_route:
+	clear_bit(IPS_OFFLOAD_BIT, &ct->status);
+out:
+	regs->verdict.code = NFT_BREAK;
+}
+
+static int nft_flow_offload_validate(const struct nft_ctx *ctx,
+				     const struct nft_expr *expr,
+				     const struct nft_data **data)
+{
+	unsigned int hook_mask = (1 << NF_INET_FORWARD);
+
+	return nft_chain_validate_hooks(ctx->chain, hook_mask);
+}
+
+static int nft_flow_offload_init(const struct nft_ctx *ctx,
+				 const struct nft_expr *expr,
+				 const struct nlattr * const tb[])
+{
+	struct nft_flow_offload *priv = nft_expr_priv(expr);
+	u8 genmask = nft_genmask_next(ctx->net);
+	struct nft_flowtable *flowtable;
+
+	if (!tb[NFTA_FLOW_TABLE_NAME])
+		return -EINVAL;
+
+	flowtable = nf_tables_flowtable_lookup(ctx->table,
+					       tb[NFTA_FLOW_TABLE_NAME],
+					       genmask);
+	if (IS_ERR(flowtable))
+		return PTR_ERR(flowtable);
+
+	priv->flowtable = flowtable;
+	flowtable->use++;
+
+	return nf_ct_netns_get(ctx->net, ctx->afi->family);
+}
+
+static void nft_flow_offload_destroy(const struct nft_ctx *ctx,
+				     const struct nft_expr *expr)
+{
+	struct nft_flow_offload *priv = nft_expr_priv(expr);
+
+	priv->flowtable->use--;
+	nf_ct_netns_put(ctx->net, ctx->afi->family);
+}
+
+static int nft_flow_offload_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	struct nft_flow_offload *priv = nft_expr_priv(expr);
+
+	if (nla_put_string(skb, NFTA_FLOW_TABLE_NAME, priv->flowtable->name))
+		goto nla_put_failure;
+
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static struct nft_expr_type nft_flow_offload_type;
+static const struct nft_expr_ops nft_flow_offload_ops = {
+	.type		= &nft_flow_offload_type,
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_flow_offload)),
+	.eval		= nft_flow_offload_eval,
+	.init		= nft_flow_offload_init,
+	.destroy	= nft_flow_offload_destroy,
+	.validate	= nft_flow_offload_validate,
+	.dump		= nft_flow_offload_dump,
+};
+
+static struct nft_expr_type nft_flow_offload_type __read_mostly = {
+	.name		= "flow_offload",
+	.ops		= &nft_flow_offload_ops,
+	.maxattr	= NFTA_FLOW_MAX,
+	.owner		= THIS_MODULE,
+};
+
+static void flow_offload_iterate_cleanup(struct flow_offload *flow, void *data)
+{
+	struct net_device *dev = data;
+
+	if (dev && flow->tuplehash[0].tuple.iifidx != dev->ifindex)
+		return;
+
+	flow_offload_dead(flow);
+}
+
+static void nft_flow_offload_iterate_cleanup(struct nf_flowtable *flowtable,
+					     void *data)
+{
+	nf_flow_table_iterate(flowtable, flow_offload_iterate_cleanup, data);
+}
+
+static int flow_offload_netdev_event(struct notifier_block *this,
+				     unsigned long event, void *ptr)
+{
+	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+
+	if (event != NETDEV_DOWN)
+		return NOTIFY_DONE;
+
+	nft_flow_table_iterate(dev_net(dev), nft_flow_offload_iterate_cleanup, dev);
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block flow_offload_netdev_notifier = {
+	.notifier_call	= flow_offload_netdev_event,
+};
+
+static int __init nft_flow_offload_module_init(void)
+{
+	int err;
+
+	register_netdevice_notifier(&flow_offload_netdev_notifier);
+
+	err = nft_register_expr(&nft_flow_offload_type);
+	if (err < 0)
+		goto register_expr;
+
+	return 0;
+
+register_expr:
+	unregister_netdevice_notifier(&flow_offload_netdev_notifier);
+	return err;
+}
+
+static void __exit nft_flow_offload_module_exit(void)
+{
+	struct net *net;
+
+	nft_unregister_expr(&nft_flow_offload_type);
+	unregister_netdevice_notifier(&flow_offload_netdev_notifier);
+	rtnl_lock();
+	for_each_net(net)
+		nft_flow_table_iterate(net, nft_flow_offload_iterate_cleanup, NULL);
+	rtnl_unlock();
+}
+
+module_init(nft_flow_offload_module_init);
+module_exit(nft_flow_offload_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_ALIAS_NFT_EXPR("flow_offload");
-- 
cgit v1.2.3


From 23fe846f9a48d5375722b3bd060e0a02ad1ca7f1 Mon Sep 17 00:00:00 2001
From: Guillaume Nault <g.nault@alphalink.fr>
Date: Fri, 5 Jan 2018 19:47:14 +0100
Subject: l2tp: adjust comments about L2TPv3 offsets

The "offset" option has been removed by
commit 900631ee6a26 ("l2tp: remove configurable payload offset").

Signed-off-by: Guillaume Nault <g.nault@alphalink.fr>
Acked-by: James Chapman <jchapman@katalix.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/l2tp.h | 2 +-
 net/l2tp/l2tp_core.c      | 7 +++----
 2 files changed, 4 insertions(+), 5 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/l2tp.h b/include/uapi/linux/l2tp.h
index f78eef4cc56a..71e62795104d 100644
--- a/include/uapi/linux/l2tp.h
+++ b/include/uapi/linux/l2tp.h
@@ -65,7 +65,7 @@ struct sockaddr_l2tpip6 {
  * TUNNEL_MODIFY	- CONN_ID, udpcsum
  * TUNNEL_GETSTATS	- CONN_ID, (stats)
  * TUNNEL_GET		- CONN_ID, (...)
- * SESSION_CREATE	- SESSION_ID, PW_TYPE, offset, data_seq, cookie, peer_cookie, offset, l2spec
+ * SESSION_CREATE	- SESSION_ID, PW_TYPE, data_seq, cookie, peer_cookie, l2spec
  * SESSION_DELETE	- SESSION_ID
  * SESSION_MODIFY	- SESSION_ID, data_seq
  * SESSION_GET		- SESSION_ID, (...)
diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
index 786cd7f6a5e8..62285fc6eb59 100644
--- a/net/l2tp/l2tp_core.c
+++ b/net/l2tp/l2tp_core.c
@@ -662,10 +662,9 @@ discard:
  * |x|S|x|x|x|x|x|x|              Sequence Number                  |
  * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  *
- * Cookie value, sublayer format and offset (pad) are negotiated with
- * the peer when the session is set up. Unlike L2TPv2, we do not need
- * to parse the packet header to determine if optional fields are
- * present.
+ * Cookie value and sublayer format are negotiated with the peer when
+ * the session is set up. Unlike L2TPv2, we do not need to parse the
+ * packet header to determine if optional fields are present.
  *
  * Caller must already have parsed the frame and determined that it is
  * a data (not control) frame before coming here. Fields up to the
-- 
cgit v1.2.3


From c5a9f6f0ab4054082dd5ce9bbdaa8e8ff05cf365 Mon Sep 17 00:00:00 2001
From: Eugenia Emantayev <eugenia@mellanox.com>
Date: Mon, 17 Jul 2017 13:47:07 +0300
Subject: net/core: Add drop counters to VF statistics

Modern hardware can decide to drop packets going to/from a VF.
Add receive and transmit drop counters to be displayed at hypervisor
layer in iproute2 per VF statistics.

Signed-off-by: Eugenia Emantayev <eugenia@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/linux/if_link.h      |  2 ++
 include/uapi/linux/if_link.h |  2 ++
 net/core/rtnetlink.c         | 10 +++++++++-
 3 files changed, 13 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/if_link.h b/include/linux/if_link.h
index 4c54611e03e9..622658dfbf0a 100644
--- a/include/linux/if_link.h
+++ b/include/linux/if_link.h
@@ -13,6 +13,8 @@ struct ifla_vf_stats {
 	__u64 tx_bytes;
 	__u64 broadcast;
 	__u64 multicast;
+	__u64 rx_dropped;
+	__u64 tx_dropped;
 };
 
 struct ifla_vf_info {
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 19fc02660e0c..f8f04fed6186 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -732,6 +732,8 @@ enum {
 	IFLA_VF_STATS_BROADCAST,
 	IFLA_VF_STATS_MULTICAST,
 	IFLA_VF_STATS_PAD,
+	IFLA_VF_STATS_RX_DROPPED,
+	IFLA_VF_STATS_TX_DROPPED,
 	__IFLA_VF_STATS_MAX,
 };
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index c688dc564b11..5421a3fd3ba1 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -904,6 +904,10 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev,
 			 nla_total_size_64bit(sizeof(__u64)) +
 			 /* IFLA_VF_STATS_MULTICAST */
 			 nla_total_size_64bit(sizeof(__u64)) +
+			 /* IFLA_VF_STATS_RX_DROPPED */
+			 nla_total_size_64bit(sizeof(__u64)) +
+			 /* IFLA_VF_STATS_TX_DROPPED */
+			 nla_total_size_64bit(sizeof(__u64)) +
 			 nla_total_size(sizeof(struct ifla_vf_trust)));
 		return size;
 	} else
@@ -1258,7 +1262,11 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
 	    nla_put_u64_64bit(skb, IFLA_VF_STATS_BROADCAST,
 			      vf_stats.broadcast, IFLA_VF_STATS_PAD) ||
 	    nla_put_u64_64bit(skb, IFLA_VF_STATS_MULTICAST,
-			      vf_stats.multicast, IFLA_VF_STATS_PAD)) {
+			      vf_stats.multicast, IFLA_VF_STATS_PAD) ||
+	    nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_DROPPED,
+			      vf_stats.rx_dropped, IFLA_VF_STATS_PAD) ||
+	    nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_DROPPED,
+			      vf_stats.tx_dropped, IFLA_VF_STATS_PAD)) {
 		nla_nest_cancel(skb, vfstats);
 		goto nla_put_vf_failure;
 	}
-- 
cgit v1.2.3


From 7c0143153cd33a0a267908ca419e2adc40ee513a Mon Sep 17 00:00:00 2001
From: Mathias Nyman <mathias.nyman@linux.intel.com>
Date: Tue, 19 Dec 2017 13:28:30 +0200
Subject: USB: clarify USB_DT_USB_SSP_CAP_SIZE(ssac) definition

USB_DT_USB_SSP_CAP_SIZE(ssac) gives the size of the SSP capability
descriptor. The descriptor consists of 12 bytes plus an array of
SSA entries.

The number of SSA entries is stored in an SSAC value in the first 12 bytes.
The USB3.1 specification 9.6.2.5 defines SSAC as zero based:
"The number of Sublink Speed Attributes = SSAC + 1." This is not
intuitive and has already caused some confusion.

Make a small modification to the USB_DT_USB_SSP_CAP_SIZE(ssac)
definition to make it a bit clearer.

Signed-off-by: Mathias Nyman <mathias.nyman@linux.intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/uapi/linux/usb/ch9.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/usb/ch9.h b/include/uapi/linux/usb/ch9.h
index c4c79aa331bd..d5a5caec8fbc 100644
--- a/include/uapi/linux/usb/ch9.h
+++ b/include/uapi/linux/usb/ch9.h
@@ -1077,9 +1077,9 @@ struct usb_ptm_cap_descriptor {
 #define USB_DT_USB_PTM_ID_SIZE		3
 /*
  * The size of the descriptor for the Sublink Speed Attribute Count
- * (SSAC) specified in bmAttributes[4:0].
+ * (SSAC) specified in bmAttributes[4:0]. SSAC is zero-based
  */
-#define USB_DT_USB_SSP_CAP_SIZE(ssac)	(16 + ssac * 4)
+#define USB_DT_USB_SSP_CAP_SIZE(ssac)	(12 + ((ssac) + 1) * 4)
 
 /*-------------------------------------------------------------------------*/
 
-- 
cgit v1.2.3


From ccfdec9089229503d3a305e02accac01817d293e Mon Sep 17 00:00:00 2001
From: Felix Walter <felix.walter@cloudandheat.com>
Date: Fri, 5 Jan 2018 14:33:31 +0100
Subject: macsec: Add support for GCM-AES-256 cipher suite

This adds support for the GCM-AES-256 cipher suite as specified in
IEEE 802.1AEbn-2011. The prepared cipher suite selection mechanism is used,
with GCM-AES-128 being the default cipher suite as defined in the standard.

Signed-off-by: Felix Walter <felix.walter@cloudandheat.com>
Cc: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/macsec.c           | 72 ++++++++++++++++++++++++++++++++++--------
 include/uapi/linux/if_macsec.h | 11 +++++--
 2 files changed, 67 insertions(+), 16 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c
index 1d025ab9568f..f522715c6595 100644
--- a/drivers/net/macsec.c
+++ b/drivers/net/macsec.c
@@ -393,7 +393,12 @@ static struct macsec_cb *macsec_skb_cb(struct sk_buff *skb)
 #define MACSEC_PORT_SCB (0x0000)
 #define MACSEC_UNDEF_SCI ((__force sci_t)0xffffffffffffffffULL)
 
-#define DEFAULT_SAK_LEN 16
+#define MACSEC_GCM_AES_128_SAK_LEN 16
+#define MACSEC_GCM_AES_256_SAK_LEN 32
+
+#define MAX_SAK_LEN MACSEC_GCM_AES_256_SAK_LEN
+
+#define DEFAULT_SAK_LEN MACSEC_GCM_AES_128_SAK_LEN
 #define DEFAULT_SEND_SCI true
 #define DEFAULT_ENCRYPT false
 #define DEFAULT_ENCODING_SA 0
@@ -1600,7 +1605,7 @@ static const struct nla_policy macsec_genl_sa_policy[NUM_MACSEC_SA_ATTR] = {
 	[MACSEC_SA_ATTR_KEYID] = { .type = NLA_BINARY,
 				   .len = MACSEC_KEYID_LEN, },
 	[MACSEC_SA_ATTR_KEY] = { .type = NLA_BINARY,
-				 .len = MACSEC_MAX_KEY_LEN, },
+				 .len = MAX_SAK_LEN, },
 };
 
 static int parse_sa_config(struct nlattr **attrs, struct nlattr **tb_sa)
@@ -2362,15 +2367,26 @@ static int nla_put_secy(struct macsec_secy *secy, struct sk_buff *skb)
 {
 	struct macsec_tx_sc *tx_sc = &secy->tx_sc;
 	struct nlattr *secy_nest = nla_nest_start(skb, MACSEC_ATTR_SECY);
+	u64 csid;
 
 	if (!secy_nest)
 		return 1;
 
+	switch (secy->key_len) {
+	case MACSEC_GCM_AES_128_SAK_LEN:
+		csid = MACSEC_CIPHER_ID_GCM_AES_128;
+		break;
+	case MACSEC_GCM_AES_256_SAK_LEN:
+		csid = MACSEC_CIPHER_ID_GCM_AES_256;
+		break;
+	default:
+		goto cancel;
+	}
+
 	if (nla_put_sci(skb, MACSEC_SECY_ATTR_SCI, secy->sci,
 			MACSEC_SECY_ATTR_PAD) ||
 	    nla_put_u64_64bit(skb, MACSEC_SECY_ATTR_CIPHER_SUITE,
-			      MACSEC_DEFAULT_CIPHER_ID,
-			      MACSEC_SECY_ATTR_PAD) ||
+			      csid, MACSEC_SECY_ATTR_PAD) ||
 	    nla_put_u8(skb, MACSEC_SECY_ATTR_ICV_LEN, secy->icv_len) ||
 	    nla_put_u8(skb, MACSEC_SECY_ATTR_OPER, secy->operational) ||
 	    nla_put_u8(skb, MACSEC_SECY_ATTR_PROTECT, secy->protect_frames) ||
@@ -3015,8 +3031,8 @@ static void macsec_setup(struct net_device *dev)
 	eth_zero_addr(dev->broadcast);
 }
 
-static void macsec_changelink_common(struct net_device *dev,
-				     struct nlattr *data[])
+static int macsec_changelink_common(struct net_device *dev,
+				    struct nlattr *data[])
 {
 	struct macsec_secy *secy;
 	struct macsec_tx_sc *tx_sc;
@@ -3056,6 +3072,22 @@ static void macsec_changelink_common(struct net_device *dev,
 
 	if (data[IFLA_MACSEC_VALIDATION])
 		secy->validate_frames = nla_get_u8(data[IFLA_MACSEC_VALIDATION]);
+
+	if (data[IFLA_MACSEC_CIPHER_SUITE]) {
+		switch (nla_get_u64(data[IFLA_MACSEC_CIPHER_SUITE])) {
+		case MACSEC_CIPHER_ID_GCM_AES_128:
+		case MACSEC_DEFAULT_CIPHER_ALT:
+			secy->key_len = MACSEC_GCM_AES_128_SAK_LEN;
+			break;
+		case MACSEC_CIPHER_ID_GCM_AES_256:
+			secy->key_len = MACSEC_GCM_AES_256_SAK_LEN;
+			break;
+		default:
+			return -EINVAL;
+		}
+	}
+
+	return 0;
 }
 
 static int macsec_changelink(struct net_device *dev, struct nlattr *tb[],
@@ -3071,9 +3103,7 @@ static int macsec_changelink(struct net_device *dev, struct nlattr *tb[],
 	    data[IFLA_MACSEC_PORT])
 		return -EINVAL;
 
-	macsec_changelink_common(dev, data);
-
-	return 0;
+	return macsec_changelink_common(dev, data);
 }
 
 static void macsec_del_dev(struct macsec_dev *macsec)
@@ -3270,8 +3300,11 @@ static int macsec_newlink(struct net *net, struct net_device *dev,
 	if (err)
 		goto unlink;
 
-	if (data)
-		macsec_changelink_common(dev, data);
+	if (data) {
+		err = macsec_changelink_common(dev, data);
+		if (err)
+			goto del_dev;
+	}
 
 	err = register_macsec_dev(real_dev, dev);
 	if (err < 0)
@@ -3320,7 +3353,8 @@ static int macsec_validate_attr(struct nlattr *tb[], struct nlattr *data[],
 	}
 
 	switch (csid) {
-	case MACSEC_DEFAULT_CIPHER_ID:
+	case MACSEC_CIPHER_ID_GCM_AES_128:
+	case MACSEC_CIPHER_ID_GCM_AES_256:
 	case MACSEC_DEFAULT_CIPHER_ALT:
 		if (icv_len < MACSEC_MIN_ICV_LEN ||
 		    icv_len > MACSEC_STD_ICV_LEN)
@@ -3390,12 +3424,24 @@ static int macsec_fill_info(struct sk_buff *skb,
 {
 	struct macsec_secy *secy = &macsec_priv(dev)->secy;
 	struct macsec_tx_sc *tx_sc = &secy->tx_sc;
+	u64 csid;
+
+	switch (secy->key_len) {
+	case MACSEC_GCM_AES_128_SAK_LEN:
+		csid = MACSEC_CIPHER_ID_GCM_AES_128;
+		break;
+	case MACSEC_GCM_AES_256_SAK_LEN:
+		csid = MACSEC_CIPHER_ID_GCM_AES_256;
+		break;
+	default:
+		goto nla_put_failure;
+	}
 
 	if (nla_put_sci(skb, IFLA_MACSEC_SCI, secy->sci,
 			IFLA_MACSEC_PAD) ||
 	    nla_put_u8(skb, IFLA_MACSEC_ICV_LEN, secy->icv_len) ||
 	    nla_put_u64_64bit(skb, IFLA_MACSEC_CIPHER_SUITE,
-			      MACSEC_DEFAULT_CIPHER_ID, IFLA_MACSEC_PAD) ||
+			      csid, IFLA_MACSEC_PAD) ||
 	    nla_put_u8(skb, IFLA_MACSEC_ENCODING_SA, tx_sc->encoding_sa) ||
 	    nla_put_u8(skb, IFLA_MACSEC_ENCRYPT, tx_sc->encrypt) ||
 	    nla_put_u8(skb, IFLA_MACSEC_PROTECT, secy->protect_frames) ||
diff --git a/include/uapi/linux/if_macsec.h b/include/uapi/linux/if_macsec.h
index 719d243471f4..2e522835a4af 100644
--- a/include/uapi/linux/if_macsec.h
+++ b/include/uapi/linux/if_macsec.h
@@ -18,12 +18,17 @@
 #define MACSEC_GENL_NAME "macsec"
 #define MACSEC_GENL_VERSION 1
 
-#define MACSEC_MAX_KEY_LEN 128
+#define MACSEC_MAX_KEY_LEN 256
 
 #define MACSEC_KEYID_LEN 16
 
-#define MACSEC_DEFAULT_CIPHER_ID   0x0080020001000001ULL
-#define MACSEC_DEFAULT_CIPHER_ALT  0x0080C20001000001ULL
+/* cipher IDs as per IEEE802.1AEbn-2011 */
+#define MACSEC_CIPHER_ID_GCM_AES_128 0x0080C20001000001ULL
+#define MACSEC_CIPHER_ID_GCM_AES_256 0x0080C20001000002ULL
+
+#define MACSEC_DEFAULT_CIPHER_ID     MACSEC_CIPHER_ID_GCM_AES_128
+/* deprecated cipher ID for GCM-AES-128 */
+#define MACSEC_DEFAULT_CIPHER_ALT    0x0080020001000001ULL
 
 #define MACSEC_MIN_ICV_LEN 8
 #define MACSEC_MAX_ICV_LEN 32
-- 
cgit v1.2.3


From faa9b39f0e9ddfa8c83725b3f230784976dd3c7f Mon Sep 17 00:00:00 2001
From: Jason Baron <jbaron@akamai.com>
Date: Fri, 5 Jan 2018 17:44:54 -0500
Subject: virtio_net: propagate linkspeed/duplex settings from the hypervisor

The ability to set speed and duplex for virtio_net is useful in various
scenarios as described here:

16032be virtio_net: add ethtool support for set and get of settings

However, it would be nice to be able to set this from the hypervisor,
such that virtio_net doesn't require custom guest ethtool commands.

Introduce a new feature flag, VIRTIO_NET_F_SPEED_DUPLEX, which allows
the hypervisor to export a linkspeed and duplex setting. The user can
subsequently overwrite it if desired via 'ethtool -s'.

Note that VIRTIO_NET_F_SPEED_DUPLEX is defined as bit 63, the intention
is that device feature bits are to grow down from bit 63, since the
transports are starting from bit 24 and growing up.

Signed-off-by: Jason Baron <jbaron@akamai.com>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Jason Wang <jasowang@redhat.com>
Cc: virtio-dev@lists.oasis-open.org
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/virtio_net.c        | 23 ++++++++++++++++++++++-
 include/uapi/linux/virtio_net.h | 13 +++++++++++++
 2 files changed, 35 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index ed8299343728..12dfc5fee58e 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -1906,6 +1906,24 @@ static void virtnet_init_settings(struct net_device *dev)
 	vi->duplex = DUPLEX_UNKNOWN;
 }
 
+static void virtnet_update_settings(struct virtnet_info *vi)
+{
+	u32 speed;
+	u8 duplex;
+
+	if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_SPEED_DUPLEX))
+		return;
+
+	speed = virtio_cread32(vi->vdev, offsetof(struct virtio_net_config,
+						  speed));
+	if (ethtool_validate_speed(speed))
+		vi->speed = speed;
+	duplex = virtio_cread8(vi->vdev, offsetof(struct virtio_net_config,
+						  duplex));
+	if (ethtool_validate_duplex(duplex))
+		vi->duplex = duplex;
+}
+
 static const struct ethtool_ops virtnet_ethtool_ops = {
 	.get_drvinfo = virtnet_get_drvinfo,
 	.get_link = ethtool_op_get_link,
@@ -2159,6 +2177,7 @@ static void virtnet_config_changed_work(struct work_struct *work)
 	vi->status = v;
 
 	if (vi->status & VIRTIO_NET_S_LINK_UP) {
+		virtnet_update_settings(vi);
 		netif_carrier_on(vi->dev);
 		netif_tx_wake_all_queues(vi->dev);
 	} else {
@@ -2707,6 +2726,7 @@ static int virtnet_probe(struct virtio_device *vdev)
 		schedule_work(&vi->config_work);
 	} else {
 		vi->status = VIRTIO_NET_S_LINK_UP;
+		virtnet_update_settings(vi);
 		netif_carrier_on(dev);
 	}
 
@@ -2808,7 +2828,8 @@ static struct virtio_device_id id_table[] = {
 	VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN, \
 	VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ, \
 	VIRTIO_NET_F_CTRL_MAC_ADDR, \
-	VIRTIO_NET_F_MTU, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS
+	VIRTIO_NET_F_MTU, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS, \
+	VIRTIO_NET_F_SPEED_DUPLEX
 
 static unsigned int features[] = {
 	VIRTNET_FEATURES,
diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h
index fc353b518288..5de6ed37695b 100644
--- a/include/uapi/linux/virtio_net.h
+++ b/include/uapi/linux/virtio_net.h
@@ -57,6 +57,8 @@
 					 * Steering */
 #define VIRTIO_NET_F_CTRL_MAC_ADDR 23	/* Set MAC address */
 
+#define VIRTIO_NET_F_SPEED_DUPLEX 63	/* Device set linkspeed and duplex */
+
 #ifndef VIRTIO_NET_NO_LEGACY
 #define VIRTIO_NET_F_GSO	6	/* Host handles pkts w/ any GSO type */
 #endif /* VIRTIO_NET_NO_LEGACY */
@@ -76,6 +78,17 @@ struct virtio_net_config {
 	__u16 max_virtqueue_pairs;
 	/* Default maximum transmit unit advice */
 	__u16 mtu;
+	/*
+	 * speed, in units of 1Mb. All values 0 to INT_MAX are legal.
+	 * Any other value stands for unknown.
+	 */
+	__u32 speed;
+	/*
+	 * 0x00 - half duplex
+	 * 0x01 - full duplex
+	 * Any other value stands for unknown.
+	 */
+	__u8 duplex;
 } __attribute__((packed));
 
 /*
-- 
cgit v1.2.3


From 232d07b74a33b9f5d48516dc1d8ce41723ada593 Mon Sep 17 00:00:00 2001
From: Jon Maloy <jon.maloy@ericsson.com>
Date: Mon, 8 Jan 2018 21:03:30 +0100
Subject: tipc: improve groupcast scope handling

When a member joins a group, it also indicates a binding scope. This
makes it possible to create both node local groups, invisible to other
nodes, as well as cluster global groups, visible everywhere.

In order to avoid that different members end up having permanently
differing views of group size and membership, we must inhibit locally
and globally bound members from joining the same group.

We do this by using the binding scope as an additional separator between
groups. I.e., a member must ignore all membership events from sockets
using a different scope than itself, and all lookups for message
destinations must require an exact match between the message's lookup
scope and the potential target's binding scope.

Apart from making it possible to create local groups using the same
identity on different nodes, a side effect of this is that it now also
becomes possible to create a cluster global group with the same identity
across the same nodes, without interfering with the local groups.

Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/tipc.h |  7 ++--
 net/tipc/group.c          | 13 ++++---
 net/tipc/name_table.c     | 40 ++++++++++-----------
 net/tipc/name_table.h     |  4 +--
 net/tipc/server.c         |  4 +--
 net/tipc/server.h         |  6 ++--
 net/tipc/socket.c         | 88 ++++++++++++++++++++++++++++-------------------
 net/tipc/subscr.c         | 10 ++++--
 net/tipc/subscr.h         |  2 +-
 9 files changed, 99 insertions(+), 75 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/tipc.h b/include/uapi/linux/tipc.h
index 35f79d1f8c3a..14bacc7e6cef 100644
--- a/include/uapi/linux/tipc.h
+++ b/include/uapi/linux/tipc.h
@@ -117,10 +117,9 @@ static inline unsigned int tipc_node(__u32 addr)
 /*
  * Publication scopes when binding port names and port name sequences
  */
-
-#define TIPC_ZONE_SCOPE		1
-#define TIPC_CLUSTER_SCOPE	2
-#define TIPC_NODE_SCOPE		3
+#define TIPC_ZONE_SCOPE         1
+#define TIPC_CLUSTER_SCOPE      2
+#define TIPC_NODE_SCOPE         3
 
 /*
  * Limiting values for messages
diff --git a/net/tipc/group.c b/net/tipc/group.c
index cf996bd6ec98..1908773c9fca 100644
--- a/net/tipc/group.c
+++ b/net/tipc/group.c
@@ -87,7 +87,6 @@ struct tipc_group {
 	int subid;
 	u32 type;
 	u32 instance;
-	u32 domain;
 	u32 scope;
 	u32 portid;
 	u16 member_cnt;
@@ -158,6 +157,8 @@ int tipc_group_size(struct tipc_group *grp)
 struct tipc_group *tipc_group_create(struct net *net, u32 portid,
 				     struct tipc_group_req *mreq)
 {
+	u32 filter = TIPC_SUB_PORTS | TIPC_SUB_NO_STATUS;
+	bool global = mreq->scope != TIPC_NODE_SCOPE;
 	struct tipc_group *grp;
 	u32 type = mreq->type;
 
@@ -171,15 +172,14 @@ struct tipc_group *tipc_group_create(struct net *net, u32 portid,
 	grp->members = RB_ROOT;
 	grp->net = net;
 	grp->portid = portid;
-	grp->domain = addr_domain(net, mreq->scope);
 	grp->type = type;
 	grp->instance = mreq->instance;
 	grp->scope = mreq->scope;
 	grp->loopback = mreq->flags & TIPC_GROUP_LOOPBACK;
 	grp->events = mreq->flags & TIPC_GROUP_MEMBER_EVTS;
-	if (tipc_topsrv_kern_subscr(net, portid, type,
-				    TIPC_SUB_PORTS | TIPC_SUB_NO_STATUS,
-				    0, ~0, &grp->subid))
+	filter |= global ? TIPC_SUB_CLUSTER_SCOPE : TIPC_SUB_NODE_SCOPE;
+	if (tipc_topsrv_kern_subscr(net, portid, type, 0, ~0,
+				    filter, &grp->subid))
 		return grp;
 	kfree(grp);
 	return NULL;
@@ -732,6 +732,9 @@ void tipc_group_proto_rcv(struct tipc_group *grp, bool *usr_wakeup,
 	if (!grp)
 		return;
 
+	if (grp->scope == TIPC_NODE_SCOPE && node != tipc_own_addr(grp->net))
+		return;
+
 	m = tipc_group_find_member(grp, node, port);
 
 	switch (msg_type(hdr)) {
diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c
index 60af9885f160..64cdd3c302b0 100644
--- a/net/tipc/name_table.c
+++ b/net/tipc/name_table.c
@@ -328,7 +328,8 @@ static struct publication *tipc_nameseq_insert_publ(struct net *net,
 	list_for_each_entry_safe(s, st, &nseq->subscriptions, nameseq_list) {
 		tipc_subscrp_report_overlap(s, publ->lower, publ->upper,
 					    TIPC_PUBLISHED, publ->ref,
-					    publ->node, created_subseq);
+					    publ->node, publ->scope,
+					    created_subseq);
 	}
 	return publ;
 }
@@ -398,7 +399,8 @@ found:
 	list_for_each_entry_safe(s, st, &nseq->subscriptions, nameseq_list) {
 		tipc_subscrp_report_overlap(s, publ->lower, publ->upper,
 					    TIPC_WITHDRAWN, publ->ref,
-					    publ->node, removed_subseq);
+					    publ->node, publ->scope,
+					    removed_subseq);
 	}
 
 	return publ;
@@ -435,6 +437,7 @@ static void tipc_nameseq_subscribe(struct name_seq *nseq,
 							    sseq->upper,
 							    TIPC_PUBLISHED,
 							    crs->ref, crs->node,
+							    crs->scope,
 							    must_report);
 				must_report = 0;
 			}
@@ -598,7 +601,7 @@ not_found:
 	return ref;
 }
 
-bool tipc_nametbl_lookup(struct net *net, u32 type, u32 instance, u32 domain,
+bool tipc_nametbl_lookup(struct net *net, u32 type, u32 instance, u32 scope,
 			 struct list_head *dsts, int *dstcnt, u32 exclude,
 			 bool all)
 {
@@ -608,9 +611,6 @@ bool tipc_nametbl_lookup(struct net *net, u32 type, u32 instance, u32 domain,
 	struct name_seq *seq;
 	struct sub_seq *sseq;
 
-	if (!tipc_in_scope(domain, self))
-		return false;
-
 	*dstcnt = 0;
 	rcu_read_lock();
 	seq = nametbl_find_seq(net, type);
@@ -621,7 +621,7 @@ bool tipc_nametbl_lookup(struct net *net, u32 type, u32 instance, u32 domain,
 	if (likely(sseq)) {
 		info = sseq->info;
 		list_for_each_entry(publ, &info->zone_list, zone_list) {
-			if (!tipc_in_scope(domain, publ->node))
+			if (publ->scope != scope)
 				continue;
 			if (publ->ref == exclude && publ->node == self)
 				continue;
@@ -639,13 +639,14 @@ exit:
 	return !list_empty(dsts);
 }
 
-int tipc_nametbl_mc_translate(struct net *net, u32 type, u32 lower, u32 upper,
-			      u32 limit, struct list_head *dports)
+int tipc_nametbl_mc_lookup(struct net *net, u32 type, u32 lower, u32 upper,
+			   u32 scope, bool exact, struct list_head *dports)
 {
-	struct name_seq *seq;
-	struct sub_seq *sseq;
 	struct sub_seq *sseq_stop;
 	struct name_info *info;
+	struct publication *p;
+	struct name_seq *seq;
+	struct sub_seq *sseq;
 	int res = 0;
 
 	rcu_read_lock();
@@ -657,15 +658,12 @@ int tipc_nametbl_mc_translate(struct net *net, u32 type, u32 lower, u32 upper,
 	sseq = seq->sseqs + nameseq_locate_subseq(seq, lower);
 	sseq_stop = seq->sseqs + seq->first_free;
 	for (; sseq != sseq_stop; sseq++) {
-		struct publication *publ;
-
 		if (sseq->lower > upper)
 			break;
-
 		info = sseq->info;
-		list_for_each_entry(publ, &info->node_list, node_list) {
-			if (publ->scope <= limit)
-				tipc_dest_push(dports, 0, publ->ref);
+		list_for_each_entry(p, &info->node_list, node_list) {
+			if (p->scope == scope || (!exact && p->scope < scope))
+				tipc_dest_push(dports, 0, p->ref);
 		}
 
 		if (info->cluster_list_size != info->node_list_size)
@@ -682,7 +680,7 @@ exit:
  * - Determines if any node local ports overlap
  */
 void tipc_nametbl_lookup_dst_nodes(struct net *net, u32 type, u32 lower,
-				   u32 upper, u32 domain,
+				   u32 upper, u32 scope,
 				   struct tipc_nlist *nodes)
 {
 	struct sub_seq *sseq, *stop;
@@ -701,7 +699,7 @@ void tipc_nametbl_lookup_dst_nodes(struct net *net, u32 type, u32 lower,
 	for (; sseq != stop && sseq->lower <= upper; sseq++) {
 		info = sseq->info;
 		list_for_each_entry(publ, &info->zone_list, zone_list) {
-			if (tipc_in_scope(domain, publ->node))
+			if (publ->scope == scope)
 				tipc_nlist_add(nodes, publ->node);
 		}
 	}
@@ -713,7 +711,7 @@ exit:
 /* tipc_nametbl_build_group - build list of communication group members
  */
 void tipc_nametbl_build_group(struct net *net, struct tipc_group *grp,
-			      u32 type, u32 domain)
+			      u32 type, u32 scope)
 {
 	struct sub_seq *sseq, *stop;
 	struct name_info *info;
@@ -731,7 +729,7 @@ void tipc_nametbl_build_group(struct net *net, struct tipc_group *grp,
 	for (; sseq != stop; sseq++) {
 		info = sseq->info;
 		list_for_each_entry(p, &info->zone_list, zone_list) {
-			if (!tipc_in_scope(domain, p->node))
+			if (p->scope != scope)
 				continue;
 			tipc_group_add_member(grp, p->node, p->ref, p->lower);
 		}
diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h
index 73a148c85c15..b595d8aa00f0 100644
--- a/net/tipc/name_table.h
+++ b/net/tipc/name_table.h
@@ -100,8 +100,8 @@ struct name_table {
 int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb);
 
 u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance, u32 *node);
-int tipc_nametbl_mc_translate(struct net *net, u32 type, u32 lower, u32 upper,
-			      u32 limit, struct list_head *dports);
+int tipc_nametbl_mc_lookup(struct net *net, u32 type, u32 lower, u32 upper,
+			   u32 scope, bool exact, struct list_head *dports);
 void tipc_nametbl_build_group(struct net *net, struct tipc_group *grp,
 			      u32 type, u32 domain);
 void tipc_nametbl_lookup_dst_nodes(struct net *net, u32 type, u32 lower,
diff --git a/net/tipc/server.c b/net/tipc/server.c
index 950c54cbcf3a..8ee5e86b7870 100644
--- a/net/tipc/server.c
+++ b/net/tipc/server.c
@@ -489,8 +489,8 @@ void tipc_conn_terminate(struct tipc_server *s, int conid)
 	}
 }
 
-bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type,
-			     u32 filter, u32 lower, u32 upper, int *conid)
+bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, u32 lower,
+			     u32 upper, u32 filter, int *conid)
 {
 	struct tipc_subscriber *scbr;
 	struct tipc_subscr sub;
diff --git a/net/tipc/server.h b/net/tipc/server.h
index ea1effbff23e..17f49ee44cfd 100644
--- a/net/tipc/server.h
+++ b/net/tipc/server.h
@@ -41,6 +41,8 @@
 #include <net/net_namespace.h>
 
 #define TIPC_SERVER_NAME_LEN	32
+#define TIPC_SUB_CLUSTER_SCOPE  0x20
+#define TIPC_SUB_NODE_SCOPE     0x40
 #define TIPC_SUB_NO_STATUS      0x80
 
 /**
@@ -84,8 +86,8 @@ struct tipc_server {
 int tipc_conn_sendmsg(struct tipc_server *s, int conid,
 		      struct sockaddr_tipc *addr, void *data, size_t len);
 
-bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type,
-			     u32 filter, u32 lower, u32 upper, int *conid);
+bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, u32 lower,
+			     u32 upper, u32 filter, int *conid);
 void tipc_topsrv_kern_unsubscr(struct net *net, int conid);
 
 /**
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index e3a02f1fcab5..b24dab3996c9 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -928,21 +928,22 @@ static int tipc_send_group_anycast(struct socket *sock, struct msghdr *m,
 	struct list_head *cong_links = &tsk->cong_links;
 	int blks = tsk_blocks(GROUP_H_SIZE + dlen);
 	struct tipc_group *grp = tsk->group;
+	struct tipc_msg *hdr = &tsk->phdr;
 	struct tipc_member *first = NULL;
 	struct tipc_member *mbr = NULL;
 	struct net *net = sock_net(sk);
 	u32 node, port, exclude;
-	u32 type, inst, domain;
 	struct list_head dsts;
+	u32 type, inst, scope;
 	int lookups = 0;
 	int dstcnt, rc;
 	bool cong;
 
 	INIT_LIST_HEAD(&dsts);
 
-	type = dest->addr.name.name.type;
+	type = msg_nametype(hdr);
 	inst = dest->addr.name.name.instance;
-	domain = addr_domain(net, dest->scope);
+	scope = msg_lookup_scope(hdr);
 	exclude = tipc_group_exclude(grp);
 
 	while (++lookups < 4) {
@@ -950,7 +951,7 @@ static int tipc_send_group_anycast(struct socket *sock, struct msghdr *m,
 
 		/* Look for a non-congested destination member, if any */
 		while (1) {
-			if (!tipc_nametbl_lookup(net, type, inst, domain, &dsts,
+			if (!tipc_nametbl_lookup(net, type, inst, scope, &dsts,
 						 &dstcnt, exclude, false))
 				return -EHOSTUNREACH;
 			tipc_dest_pop(&dsts, &node, &port);
@@ -1079,22 +1080,23 @@ static int tipc_send_group_mcast(struct socket *sock, struct msghdr *m,
 {
 	struct sock *sk = sock->sk;
 	DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name);
-	struct tipc_name_seq *seq = &dest->addr.nameseq;
 	struct tipc_sock *tsk = tipc_sk(sk);
 	struct tipc_group *grp = tsk->group;
+	struct tipc_msg *hdr = &tsk->phdr;
 	struct net *net = sock_net(sk);
-	u32 domain, exclude, dstcnt;
+	u32 type, inst, scope, exclude;
 	struct list_head dsts;
+	u32 dstcnt;
 
 	INIT_LIST_HEAD(&dsts);
 
-	if (seq->lower != seq->upper)
-		return -ENOTSUPP;
-
-	domain = addr_domain(net, dest->scope);
+	type = msg_nametype(hdr);
+	inst = dest->addr.name.name.instance;
+	scope = msg_lookup_scope(hdr);
 	exclude = tipc_group_exclude(grp);
-	if (!tipc_nametbl_lookup(net, seq->type, seq->lower, domain,
-				 &dsts, &dstcnt, exclude, true))
+
+	if (!tipc_nametbl_lookup(net, type, inst, scope, &dsts,
+				 &dstcnt, exclude, true))
 		return -EHOSTUNREACH;
 
 	if (dstcnt == 1) {
@@ -1116,24 +1118,29 @@ static int tipc_send_group_mcast(struct socket *sock, struct msghdr *m,
 void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq,
 		       struct sk_buff_head *inputq)
 {
-	u32 scope = TIPC_CLUSTER_SCOPE;
 	u32 self = tipc_own_addr(net);
+	u32 type, lower, upper, scope;
 	struct sk_buff *skb, *_skb;
-	u32 lower = 0, upper = ~0;
-	struct sk_buff_head tmpq;
 	u32 portid, oport, onode;
+	struct sk_buff_head tmpq;
 	struct list_head dports;
-	struct tipc_msg *msg;
-	int user, mtyp, hsz;
+	struct tipc_msg *hdr;
+	int user, mtyp, hlen;
+	bool exact;
 
 	__skb_queue_head_init(&tmpq);
 	INIT_LIST_HEAD(&dports);
 
 	skb = tipc_skb_peek(arrvq, &inputq->lock);
 	for (; skb; skb = tipc_skb_peek(arrvq, &inputq->lock)) {
-		msg = buf_msg(skb);
-		user = msg_user(msg);
-		mtyp = msg_type(msg);
+		hdr = buf_msg(skb);
+		user = msg_user(hdr);
+		mtyp = msg_type(hdr);
+		hlen = skb_headroom(skb) + msg_hdr_sz(hdr);
+		oport = msg_origport(hdr);
+		onode = msg_orignode(hdr);
+		type = msg_nametype(hdr);
+
 		if (mtyp == TIPC_GRP_UCAST_MSG || user == GROUP_PROTOCOL) {
 			spin_lock_bh(&inputq->lock);
 			if (skb_peek(arrvq) == skb) {
@@ -1144,21 +1151,31 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq,
 			spin_unlock_bh(&inputq->lock);
 			continue;
 		}
-		hsz = skb_headroom(skb) + msg_hdr_sz(msg);
-		oport = msg_origport(msg);
-		onode = msg_orignode(msg);
-		if (onode == self)
-			scope = TIPC_NODE_SCOPE;
-
-		/* Create destination port list and message clones: */
-		if (!msg_in_group(msg)) {
-			lower = msg_namelower(msg);
-			upper = msg_nameupper(msg);
+
+		/* Group messages require exact scope match */
+		if (msg_in_group(hdr)) {
+			lower = 0;
+			upper = ~0;
+			scope = msg_lookup_scope(hdr);
+			exact = true;
+		} else {
+			/* TIPC_NODE_SCOPE means "any scope" in this context */
+			if (onode == self)
+				scope = TIPC_NODE_SCOPE;
+			else
+				scope = TIPC_CLUSTER_SCOPE;
+			exact = false;
+			lower = msg_namelower(hdr);
+			upper = msg_nameupper(hdr);
 		}
-		tipc_nametbl_mc_translate(net, msg_nametype(msg), lower, upper,
-					  scope, &dports);
+
+		/* Create destination port list: */
+		tipc_nametbl_mc_lookup(net, type, lower, upper,
+				       scope, exact, &dports);
+
+		/* Clone message per destination */
 		while (tipc_dest_pop(&dports, NULL, &portid)) {
-			_skb = __pskb_copy(skb, hsz, GFP_ATOMIC);
+			_skb = __pskb_copy(skb, hlen, GFP_ATOMIC);
 			if (_skb) {
 				msg_set_destport(buf_msg(_skb), portid);
 				__skb_queue_tail(&tmpq, _skb);
@@ -2731,7 +2748,6 @@ void tipc_sk_rht_destroy(struct net *net)
 static int tipc_sk_join(struct tipc_sock *tsk, struct tipc_group_req *mreq)
 {
 	struct net *net = sock_net(&tsk->sk);
-	u32 domain = addr_domain(net, mreq->scope);
 	struct tipc_group *grp = tsk->group;
 	struct tipc_msg *hdr = &tsk->phdr;
 	struct tipc_name_seq seq;
@@ -2739,6 +2755,8 @@ static int tipc_sk_join(struct tipc_sock *tsk, struct tipc_group_req *mreq)
 
 	if (mreq->type < TIPC_RESERVED_TYPES)
 		return -EACCES;
+	if (mreq->scope > TIPC_NODE_SCOPE)
+		return -EINVAL;
 	if (grp)
 		return -EACCES;
 	grp = tipc_group_create(net, tsk->portid, mreq);
@@ -2751,7 +2769,7 @@ static int tipc_sk_join(struct tipc_sock *tsk, struct tipc_group_req *mreq)
 	seq.type = mreq->type;
 	seq.lower = mreq->instance;
 	seq.upper = seq.lower;
-	tipc_nametbl_build_group(net, grp, mreq->type, domain);
+	tipc_nametbl_build_group(net, grp, mreq->type, mreq->scope);
 	rc = tipc_sk_publish(tsk, mreq->scope, &seq);
 	if (rc) {
 		tipc_group_delete(net, grp);
diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c
index 1052341a0ea9..44df528ed6ab 100644
--- a/net/tipc/subscr.c
+++ b/net/tipc/subscr.c
@@ -118,15 +118,19 @@ void tipc_subscrp_convert_seq(struct tipc_name_seq *in, int swap,
 
 void tipc_subscrp_report_overlap(struct tipc_subscription *sub, u32 found_lower,
 				 u32 found_upper, u32 event, u32 port_ref,
-				 u32 node, int must)
+				 u32 node, u32 scope, int must)
 {
+	u32 filter = htohl(sub->evt.s.filter, sub->swap);
 	struct tipc_name_seq seq;
 
 	tipc_subscrp_convert_seq(&sub->evt.s.seq, sub->swap, &seq);
 	if (!tipc_subscrp_check_overlap(&seq, found_lower, found_upper))
 		return;
-	if (!must &&
-	    !(htohl(sub->evt.s.filter, sub->swap) & TIPC_SUB_PORTS))
+	if (!must && !(filter & TIPC_SUB_PORTS))
+		return;
+	if (filter & TIPC_SUB_CLUSTER_SCOPE && scope == TIPC_NODE_SCOPE)
+		return;
+	if (filter & TIPC_SUB_NODE_SCOPE && scope != TIPC_NODE_SCOPE)
 		return;
 
 	tipc_subscrp_send_event(sub, found_lower, found_upper, event, port_ref,
diff --git a/net/tipc/subscr.h b/net/tipc/subscr.h
index ee52957dc952..f3edca775d9f 100644
--- a/net/tipc/subscr.h
+++ b/net/tipc/subscr.h
@@ -71,7 +71,7 @@ int tipc_subscrp_check_overlap(struct tipc_name_seq *seq, u32 found_lower,
 			       u32 found_upper);
 void tipc_subscrp_report_overlap(struct tipc_subscription *sub,
 				 u32 found_lower, u32 found_upper, u32 event,
-				 u32 port_ref, u32 node, int must);
+				 u32 port_ref, u32 node, u32 scope, int must);
 void tipc_subscrp_convert_seq(struct tipc_name_seq *in, int swap,
 			      struct tipc_name_seq *out);
 u32 tipc_subscrp_convert_seq_type(u32 type, int swap);
-- 
cgit v1.2.3


From 34be39305a77b8b1ec9f279163c7cdb6cc719b91 Mon Sep 17 00:00:00 2001
From: Juri Lelli <juri.lelli@gmail.com>
Date: Tue, 12 Dec 2017 12:10:24 +0100
Subject: sched/deadline: Implement "runtime overrun signal" support

This patch adds the possibility of getting the delivery of a SIGXCPU
signal whenever there is a runtime overrun. The request is done through
the sched_flags field within the sched_attr structure.

Forward port of https://lkml.org/lkml/2009/10/16/170

Tested-by: Mathieu Poirier <mathieu.poirier@linaro.org>
Signed-off-by: Juri Lelli <juri.lelli@gmail.com>
Signed-off-by: Claudio Scordino <claudio@evidence.eu.com>
Signed-off-by: Luca Abeni <luca.abeni@santannapisa.it>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tommaso Cucinotta <tommaso.cucinotta@sssup.it>
Link: http://lkml.kernel.org/r/1513077024-25461-1-git-send-email-claudio@evidence.eu.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h          |  4 ++++
 include/uapi/linux/sched.h     |  5 +++++
 kernel/sched/core.c            |  3 +--
 kernel/sched/deadline.c        |  7 +++++++
 kernel/time/posix-cpu-timers.c | 18 ++++++++++++++++++
 5 files changed, 35 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index d2588263a989..274a449c805a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -472,11 +472,15 @@ struct sched_dl_entity {
 	 * has not been executed yet. This flag is useful to avoid race
 	 * conditions between the inactive timer handler and the wakeup
 	 * code.
+	 *
+	 * @dl_overrun tells if the task asked to be informed about runtime
+	 * overruns.
 	 */
 	unsigned int			dl_throttled      : 1;
 	unsigned int			dl_boosted        : 1;
 	unsigned int			dl_yielded        : 1;
 	unsigned int			dl_non_contending : 1;
+	unsigned int			dl_overrun	  : 1;
 
 	/*
 	 * Bandwidth enforcement timer. Each -deadline task has its
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index 30a9e51bbb1e..22627f80063e 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -49,5 +49,10 @@
  */
 #define SCHED_FLAG_RESET_ON_FORK	0x01
 #define SCHED_FLAG_RECLAIM		0x02
+#define SCHED_FLAG_DL_OVERRUN		0x04
+
+#define SCHED_FLAG_ALL	(SCHED_FLAG_RESET_ON_FORK	| \
+			 SCHED_FLAG_RECLAIM		| \
+			 SCHED_FLAG_DL_OVERRUN)
 
 #endif /* _UAPI_LINUX_SCHED_H */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a794f8155cd5..e28391bf8b04 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4085,8 +4085,7 @@ recheck:
 			return -EINVAL;
 	}
 
-	if (attr->sched_flags &
-		~(SCHED_FLAG_RESET_ON_FORK | SCHED_FLAG_RECLAIM))
+	if (attr->sched_flags & ~SCHED_FLAG_ALL)
 		return -EINVAL;
 
 	/*
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 2473736c7616..4c666dbe5038 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1155,6 +1155,12 @@ static void update_curr_dl(struct rq *rq)
 throttle:
 	if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) {
 		dl_se->dl_throttled = 1;
+
+		/* If requested, inform the user about runtime overruns. */
+		if (dl_runtime_exceeded(dl_se) &&
+		    (dl_se->flags & SCHED_FLAG_DL_OVERRUN))
+			dl_se->dl_overrun = 1;
+
 		__dequeue_task_dl(rq, curr, 0);
 		if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr)))
 			enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);
@@ -2566,6 +2572,7 @@ void __dl_clear_params(struct task_struct *p)
 	dl_se->dl_throttled = 0;
 	dl_se->dl_yielded = 0;
 	dl_se->dl_non_contending = 0;
+	dl_se->dl_overrun = 0;
 }
 
 bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 1f27887aa194..cf50ea34dbd1 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -14,6 +14,7 @@
 #include <linux/tick.h>
 #include <linux/workqueue.h>
 #include <linux/compat.h>
+#include <linux/sched/deadline.h>
 
 #include "posix-timers.h"
 
@@ -791,6 +792,14 @@ check_timers_list(struct list_head *timers,
 	return 0;
 }
 
+static inline void check_dl_overrun(struct task_struct *tsk)
+{
+	if (tsk->dl.dl_overrun) {
+		tsk->dl.dl_overrun = 0;
+		__group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
+	}
+}
+
 /*
  * Check for any per-thread CPU timers that have fired and move them off
  * the tsk->cpu_timers[N] list onto the firing list.  Here we update the
@@ -804,6 +813,9 @@ static void check_thread_timers(struct task_struct *tsk,
 	u64 expires;
 	unsigned long soft;
 
+	if (dl_task(tsk))
+		check_dl_overrun(tsk);
+
 	/*
 	 * If cputime_expires is zero, then there are no active
 	 * per thread CPU timers.
@@ -906,6 +918,9 @@ static void check_process_timers(struct task_struct *tsk,
 	struct task_cputime cputime;
 	unsigned long soft;
 
+	if (dl_task(tsk))
+		check_dl_overrun(tsk);
+
 	/*
 	 * If cputimer is not running, then there are no active
 	 * process wide timers (POSIX 1.b, itimers, RLIMIT_CPU).
@@ -1111,6 +1126,9 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
 			return 1;
 	}
 
+	if (dl_task(tsk) && tsk->dl.dl_overrun)
+		return 1;
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From 202a8ff545ccdaa5ac2000d9201df3453c8816be Mon Sep 17 00:00:00 2001
From: Ahmed Abdelsalam <amsalam20@gmail.com>
Date: Sun, 7 Jan 2018 19:22:02 +0100
Subject: netfilter: add IPv6 segment routing header 'srh' match

It allows matching packets based on Segment Routing Header
(SRH) information.
The implementation considers revision 7 of the SRH draft.
https://tools.ietf.org/html/draft-ietf-6man-segment-routing-header-07

Currently supported match options include:
(1) Next Header
(2) Hdr Ext Len
(3) Segments Left
(4) Last Entry
(5) Tag value of SRH

Signed-off-by: Ahmed Abdelsalam <amsalam20@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter_ipv6/ip6t_srh.h |  57 ++++++++++
 net/ipv6/netfilter/Kconfig                   |   9 ++
 net/ipv6/netfilter/Makefile                  |   1 +
 net/ipv6/netfilter/ip6t_srh.c                | 161 +++++++++++++++++++++++++++
 4 files changed, 228 insertions(+)
 create mode 100644 include/uapi/linux/netfilter_ipv6/ip6t_srh.h
 create mode 100644 net/ipv6/netfilter/ip6t_srh.c

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/netfilter_ipv6/ip6t_srh.h b/include/uapi/linux/netfilter_ipv6/ip6t_srh.h
new file mode 100644
index 000000000000..f3cc0ef514a7
--- /dev/null
+++ b/include/uapi/linux/netfilter_ipv6/ip6t_srh.h
@@ -0,0 +1,57 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _IP6T_SRH_H
+#define _IP6T_SRH_H
+
+#include <linux/types.h>
+#include <linux/netfilter.h>
+
+/* Values for "mt_flags" field in struct ip6t_srh */
+#define IP6T_SRH_NEXTHDR        0x0001
+#define IP6T_SRH_LEN_EQ         0x0002
+#define IP6T_SRH_LEN_GT         0x0004
+#define IP6T_SRH_LEN_LT         0x0008
+#define IP6T_SRH_SEGS_EQ        0x0010
+#define IP6T_SRH_SEGS_GT        0x0020
+#define IP6T_SRH_SEGS_LT        0x0040
+#define IP6T_SRH_LAST_EQ        0x0080
+#define IP6T_SRH_LAST_GT        0x0100
+#define IP6T_SRH_LAST_LT        0x0200
+#define IP6T_SRH_TAG            0x0400
+#define IP6T_SRH_MASK           0x07FF
+
+/* Values for "mt_invflags" field in struct ip6t_srh */
+#define IP6T_SRH_INV_NEXTHDR    0x0001
+#define IP6T_SRH_INV_LEN_EQ     0x0002
+#define IP6T_SRH_INV_LEN_GT     0x0004
+#define IP6T_SRH_INV_LEN_LT     0x0008
+#define IP6T_SRH_INV_SEGS_EQ    0x0010
+#define IP6T_SRH_INV_SEGS_GT    0x0020
+#define IP6T_SRH_INV_SEGS_LT    0x0040
+#define IP6T_SRH_INV_LAST_EQ    0x0080
+#define IP6T_SRH_INV_LAST_GT    0x0100
+#define IP6T_SRH_INV_LAST_LT    0x0200
+#define IP6T_SRH_INV_TAG        0x0400
+#define IP6T_SRH_INV_MASK       0x07FF
+
+/**
+ *      struct ip6t_srh - SRH match options
+ *      @ next_hdr: Next header field of SRH
+ *      @ hdr_len: Extension header length field of SRH
+ *      @ segs_left: Segments left field of SRH
+ *      @ last_entry: Last entry field of SRH
+ *      @ tag: Tag field of SRH
+ *      @ mt_flags: match options
+ *      @ mt_invflags: Invert the sense of match options
+ */
+
+struct ip6t_srh {
+	__u8                    next_hdr;
+	__u8                    hdr_len;
+	__u8                    segs_left;
+	__u8                    last_entry;
+	__u16                   tag;
+	__u16                   mt_flags;
+	__u16                   mt_invflags;
+};
+
+#endif /*_IP6T_SRH_H*/
diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
index 806e95375ec8..b6f5edf926d2 100644
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -240,6 +240,15 @@ config IP6_NF_MATCH_RT
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
+config IP6_NF_MATCH_SRH
+        tristate '"srh" Segment Routing header match support'
+        depends on NETFILTER_ADVANCED
+        help
+          srh matching allows you to match packets based on the segment
+	  routing header of the packet.
+
+          To compile it as a module, choose M here.  If unsure, say N.
+
 # The targets
 config IP6_NF_TARGET_HL
 	tristate '"HL" hoplimit target support'
diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile
index 95611c4b39b0..d984057b8395 100644
--- a/net/ipv6/netfilter/Makefile
+++ b/net/ipv6/netfilter/Makefile
@@ -57,6 +57,7 @@ obj-$(CONFIG_IP6_NF_MATCH_MH) += ip6t_mh.o
 obj-$(CONFIG_IP6_NF_MATCH_OPTS) += ip6t_hbh.o
 obj-$(CONFIG_IP6_NF_MATCH_RPFILTER) += ip6t_rpfilter.o
 obj-$(CONFIG_IP6_NF_MATCH_RT) += ip6t_rt.o
+obj-$(CONFIG_IP6_NF_MATCH_SRH) += ip6t_srh.o
 
 # targets
 obj-$(CONFIG_IP6_NF_TARGET_MASQUERADE) += ip6t_MASQUERADE.o
diff --git a/net/ipv6/netfilter/ip6t_srh.c b/net/ipv6/netfilter/ip6t_srh.c
new file mode 100644
index 000000000000..9642164107ce
--- /dev/null
+++ b/net/ipv6/netfilter/ip6t_srh.c
@@ -0,0 +1,161 @@
+/* Kernel module to match Segment Routing Header (SRH) parameters. */
+
+/* Author:
+ * Ahmed Abdelsalam <amsalam20@gmail.com>
+ *
+ *  This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version 2
+ *	of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ipv6.h>
+#include <linux/types.h>
+#include <net/ipv6.h>
+#include <net/seg6.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_ipv6/ip6t_srh.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+
+/* Test a struct->mt_invflags and a boolean for inequality */
+#define NF_SRH_INVF(ptr, flag, boolean)	\
+	((boolean) ^ !!((ptr)->mt_invflags & (flag)))
+
+static bool srh_mt6(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct ip6t_srh *srhinfo = par->matchinfo;
+	struct ipv6_sr_hdr *srh;
+	struct ipv6_sr_hdr _srh;
+	int hdrlen, srhoff = 0;
+
+	if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
+		return false;
+	srh = skb_header_pointer(skb, srhoff, sizeof(_srh), &_srh);
+	if (!srh)
+		return false;
+
+	hdrlen = ipv6_optlen(srh);
+	if (skb->len - srhoff < hdrlen)
+		return false;
+
+	if (srh->type != IPV6_SRCRT_TYPE_4)
+		return false;
+
+	if (srh->segments_left > srh->first_segment)
+		return false;
+
+	/* Next Header matching */
+	if (srhinfo->mt_flags & IP6T_SRH_NEXTHDR)
+		if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_NEXTHDR,
+				!(srh->nexthdr == srhinfo->next_hdr)))
+			return false;
+
+	/* Header Extension Length matching */
+	if (srhinfo->mt_flags & IP6T_SRH_LEN_EQ)
+		if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LEN_EQ,
+				!(srh->hdrlen == srhinfo->hdr_len)))
+			return false;
+
+	if (srhinfo->mt_flags & IP6T_SRH_LEN_GT)
+		if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LEN_GT,
+				!(srh->hdrlen > srhinfo->hdr_len)))
+			return false;
+
+	if (srhinfo->mt_flags & IP6T_SRH_LEN_LT)
+		if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LEN_LT,
+				!(srh->hdrlen < srhinfo->hdr_len)))
+			return false;
+
+	/* Segments Left matching */
+	if (srhinfo->mt_flags & IP6T_SRH_SEGS_EQ)
+		if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_SEGS_EQ,
+				!(srh->segments_left == srhinfo->segs_left)))
+			return false;
+
+	if (srhinfo->mt_flags & IP6T_SRH_SEGS_GT)
+		if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_SEGS_GT,
+				!(srh->segments_left > srhinfo->segs_left)))
+			return false;
+
+	if (srhinfo->mt_flags & IP6T_SRH_SEGS_LT)
+		if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_SEGS_LT,
+				!(srh->segments_left < srhinfo->segs_left)))
+			return false;
+
+	/**
+	 * Last Entry matching
+	 * Last_Entry field was introduced in revision 6 of the SRH draft.
+	 * It was called First_Segment in the previous revision
+	 */
+	if (srhinfo->mt_flags & IP6T_SRH_LAST_EQ)
+		if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LAST_EQ,
+				!(srh->first_segment == srhinfo->last_entry)))
+			return false;
+
+	if (srhinfo->mt_flags & IP6T_SRH_LAST_GT)
+		if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LAST_GT,
+				!(srh->first_segment > srhinfo->last_entry)))
+			return false;
+
+	if (srhinfo->mt_flags & IP6T_SRH_LAST_LT)
+		if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LAST_LT,
+				!(srh->first_segment < srhinfo->last_entry)))
+			return false;
+
+	/**
+	 * Tag matching
+	 * Tag field was introduced in revision 6 of the SRH draft.
+	 */
+	if (srhinfo->mt_flags & IP6T_SRH_TAG)
+		if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_TAG,
+				!(srh->tag == srhinfo->tag)))
+			return false;
+	return true;
+}
+
+static int srh_mt6_check(const struct xt_mtchk_param *par)
+{
+	const struct ip6t_srh *srhinfo = par->matchinfo;
+
+	if (srhinfo->mt_flags & ~IP6T_SRH_MASK) {
+		pr_err("unknown srh match flags  %X\n", srhinfo->mt_flags);
+		return -EINVAL;
+	}
+
+	if (srhinfo->mt_invflags & ~IP6T_SRH_INV_MASK) {
+		pr_err("unknown srh invflags %X\n", srhinfo->mt_invflags);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static struct xt_match srh_mt6_reg __read_mostly = {
+	.name		= "srh",
+	.family		= NFPROTO_IPV6,
+	.match		= srh_mt6,
+	.matchsize	= sizeof(struct ip6t_srh),
+	.checkentry	= srh_mt6_check,
+	.me		= THIS_MODULE,
+};
+
+static int __init srh_mt6_init(void)
+{
+	return xt_register_match(&srh_mt6_reg);
+}
+
+static void __exit srh_mt6_exit(void)
+{
+	xt_unregister_match(&srh_mt6_reg);
+}
+
+module_init(srh_mt6_init);
+module_exit(srh_mt6_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Xtables: IPv6 Segment Routing Header match");
+MODULE_AUTHOR("Ahmed Abdelsalam <amsalam20@gmail.com>");
-- 
cgit v1.2.3


From fdabc3fe998203038a78763c1b3d6ace517e0eea Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Fri, 15 Dec 2017 15:31:30 -0600
Subject: PCI: Add #defines for Completion Timeout Disable feature

Add #defines for the Completion Timeout Disable feature and use them.  No
functional change intended.

Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Acked-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/powernv/eeh-powernv.c | 6 +++---
 include/uapi/linux/pci_regs.h                | 2 ++
 2 files changed, 5 insertions(+), 3 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c
index 4650fb294e7a..2f7cd0ef3cdc 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -1654,14 +1654,14 @@ static int pnv_eeh_restore_vf_config(struct pci_dn *pdn)
 		eeh_ops->write_config(pdn, edev->pcie_cap + PCI_EXP_DEVCTL,
 				      2, devctl);
 
-		/* Disable Completion Timeout */
+		/* Disable Completion Timeout if possible */
 		eeh_ops->read_config(pdn, edev->pcie_cap + PCI_EXP_DEVCAP2,
 				     4, &cap2);
-		if (cap2 & 0x10) {
+		if (cap2 & PCI_EXP_DEVCAP2_COMP_TMOUT_DIS) {
 			eeh_ops->read_config(pdn,
 					     edev->pcie_cap + PCI_EXP_DEVCTL2,
 					     4, &cap2);
-			cap2 |= 0x10;
+			cap2 |= PCI_EXP_DEVCTL2_COMP_TMOUT_DIS;
 			eeh_ops->write_config(pdn,
 					      edev->pcie_cap + PCI_EXP_DEVCTL2,
 					      4, cap2);
diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
index 70c2b2ade048..9dc67643fc18 100644
--- a/include/uapi/linux/pci_regs.h
+++ b/include/uapi/linux/pci_regs.h
@@ -622,6 +622,7 @@
  * safely.
  */
 #define PCI_EXP_DEVCAP2		36	/* Device Capabilities 2 */
+#define  PCI_EXP_DEVCAP2_COMP_TMOUT_DIS	0x00000010 /* Completion Timeout Disable supported */
 #define  PCI_EXP_DEVCAP2_ARI		0x00000020 /* Alternative Routing-ID */
 #define  PCI_EXP_DEVCAP2_ATOMIC_ROUTE	0x00000040 /* Atomic Op routing */
 #define PCI_EXP_DEVCAP2_ATOMIC_COMP64	0x00000100 /* Atomic 64-bit compare */
@@ -631,6 +632,7 @@
 #define  PCI_EXP_DEVCAP2_OBFF_WAKE	0x00080000 /* Re-use WAKE# for OBFF */
 #define PCI_EXP_DEVCTL2		40	/* Device Control 2 */
 #define  PCI_EXP_DEVCTL2_COMP_TIMEOUT	0x000f	/* Completion Timeout Value */
+#define  PCI_EXP_DEVCTL2_COMP_TMOUT_DIS	0x0010	/* Completion Timeout Disable */
 #define  PCI_EXP_DEVCTL2_ARI		0x0020	/* Alternative Routing-ID */
 #define PCI_EXP_DEVCTL2_ATOMIC_REQ	0x0040	/* Set Atomic requests */
 #define PCI_EXP_DEVCTL2_ATOMIC_EGRESS_BLOCK 0x0080 /* Block atomic egress */
-- 
cgit v1.2.3


From 902d6a4c2a4f411582689e53fb101895ffe99028 Mon Sep 17 00:00:00 2001
From: Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
Date: Wed, 10 Jan 2018 20:51:57 -0700
Subject: netfilter: nf_defrag: Skip defrag if NOTRACK is set

conntrack defrag is needed only if some module like CONNTRACK or NAT
explicitly requests it. For plain forwarding scenarios, defrag is
not needed and can be skipped if NOTRACK is set in a rule.

Since conntrack defrag is currently higher priority than raw table,
setting NOTRACK is not sufficient. We need to move raw to a higher
priority for iptables only.

This is achieved by introducing a module parameter "raw_before_defrag"
which allows changing the priority of the raw table to place it before
defrag. By default, the parameter is disabled and the priority of raw
table is NF_IP_PRI_RAW to support legacy behavior. If the module
parameter is enabled, then the priority of the raw table is set to
NF_IP_PRI_RAW_BEFORE_DEFRAG.

Signed-off-by: Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter_ipv4.h       |  1 +
 include/uapi/linux/netfilter_ipv6.h       |  1 +
 net/ipv4/netfilter/iptable_raw.c          | 13 ++++++++++++-
 net/ipv4/netfilter/nf_defrag_ipv4.c       |  2 +-
 net/ipv6/netfilter/ip6table_raw.c         | 13 ++++++++++++-
 net/ipv6/netfilter/nf_defrag_ipv6_hooks.c |  3 +++
 6 files changed, 30 insertions(+), 3 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/netfilter_ipv4.h b/include/uapi/linux/netfilter_ipv4.h
index e6b1a84f5dd3..c3b060775e13 100644
--- a/include/uapi/linux/netfilter_ipv4.h
+++ b/include/uapi/linux/netfilter_ipv4.h
@@ -57,6 +57,7 @@
 
 enum nf_ip_hook_priorities {
 	NF_IP_PRI_FIRST = INT_MIN,
+	NF_IP_PRI_RAW_BEFORE_DEFRAG = -450,
 	NF_IP_PRI_CONNTRACK_DEFRAG = -400,
 	NF_IP_PRI_RAW = -300,
 	NF_IP_PRI_SELINUX_FIRST = -225,
diff --git a/include/uapi/linux/netfilter_ipv6.h b/include/uapi/linux/netfilter_ipv6.h
index 2f9724611cc2..dc624fd24d25 100644
--- a/include/uapi/linux/netfilter_ipv6.h
+++ b/include/uapi/linux/netfilter_ipv6.h
@@ -62,6 +62,7 @@
 
 enum nf_ip6_hook_priorities {
 	NF_IP6_PRI_FIRST = INT_MIN,
+	NF_IP6_PRI_RAW_BEFORE_DEFRAG = -450,
 	NF_IP6_PRI_CONNTRACK_DEFRAG = -400,
 	NF_IP6_PRI_RAW = -300,
 	NF_IP6_PRI_SELINUX_FIRST = -225,
diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c
index a869d1fea7d9..29b64d3024e0 100644
--- a/net/ipv4/netfilter/iptable_raw.c
+++ b/net/ipv4/netfilter/iptable_raw.c
@@ -3,6 +3,7 @@
  *
  * Copyright (C) 2003 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
  */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 #include <linux/module.h>
 #include <linux/netfilter_ipv4/ip_tables.h>
 #include <linux/slab.h>
@@ -12,7 +13,11 @@
 
 static int __net_init iptable_raw_table_init(struct net *net);
 
-static const struct xt_table packet_raw = {
+static bool raw_before_defrag __read_mostly;
+MODULE_PARM_DESC(raw_before_defrag, "Enable raw table before defrag");
+module_param(raw_before_defrag, bool, 0000);
+
+static struct xt_table packet_raw = {
 	.name = "raw",
 	.valid_hooks =  RAW_VALID_HOOKS,
 	.me = THIS_MODULE,
@@ -64,6 +69,12 @@ static int __init iptable_raw_init(void)
 {
 	int ret;
 
+	if (raw_before_defrag) {
+		packet_raw.priority = NF_IP_PRI_RAW_BEFORE_DEFRAG;
+
+		pr_info("Enabling raw table before defrag\n");
+	}
+
 	rawtable_ops = xt_hook_ops_alloc(&packet_raw, iptable_raw_hook);
 	if (IS_ERR(rawtable_ops))
 		return PTR_ERR(rawtable_ops);
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
index 37fe1616ca0b..cbd987f6b1f8 100644
--- a/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -80,7 +80,7 @@ static unsigned int ipv4_conntrack_defrag(void *priv,
 #endif
 #endif
 	/* Gather fragments. */
-	if (ip_is_fragment(ip_hdr(skb))) {
+	if (skb->_nfct != IP_CT_UNTRACKED && ip_is_fragment(ip_hdr(skb))) {
 		enum ip_defrag_users user =
 			nf_ct_defrag_user(state->hook, skb);
 
diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c
index d4bc56443dc1..3df7383f96d0 100644
--- a/net/ipv6/netfilter/ip6table_raw.c
+++ b/net/ipv6/netfilter/ip6table_raw.c
@@ -3,6 +3,7 @@
  *
  * Copyright (C) 2003 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
  */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 #include <linux/module.h>
 #include <linux/netfilter_ipv6/ip6_tables.h>
 #include <linux/slab.h>
@@ -11,7 +12,11 @@
 
 static int __net_init ip6table_raw_table_init(struct net *net);
 
-static const struct xt_table packet_raw = {
+static bool raw_before_defrag __read_mostly;
+MODULE_PARM_DESC(raw_before_defrag, "Enable raw table before defrag");
+module_param(raw_before_defrag, bool, 0000);
+
+static struct xt_table packet_raw = {
 	.name = "raw",
 	.valid_hooks = RAW_VALID_HOOKS,
 	.me = THIS_MODULE,
@@ -63,6 +68,12 @@ static int __init ip6table_raw_init(void)
 {
 	int ret;
 
+	if (raw_before_defrag) {
+		packet_raw.priority = NF_IP6_PRI_RAW_BEFORE_DEFRAG;
+
+		pr_info("Enabling raw table before defrag\n");
+	}
+
 	/* Register hooks */
 	rawtable_ops = xt_hook_ops_alloc(&packet_raw, ip6table_raw_hook);
 	if (IS_ERR(rawtable_ops))
diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
index b326da59257f..87b503a8f5ef 100644
--- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
+++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
@@ -65,6 +65,9 @@ static unsigned int ipv6_defrag(void *priv,
 		return NF_ACCEPT;
 #endif
 
+	if (skb->_nfct == IP_CT_UNTRACKED)
+		return NF_ACCEPT;
+
 	err = nf_ct_frag6_gather(state->net, skb,
 				 nf_ct6_defrag_user(state->hook, skb));
 	/* queued */
-- 
cgit v1.2.3


From daaf24c634ab951cad3dcef28492001ef9c931d0 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Thu, 11 Jan 2018 17:39:09 +0100
Subject: bpf: simplify xdp_convert_ctx_access for xdp_rxq_info

As pointed out by Daniel Borkmann, using bpf_target_off() is not
necessary for xdp_rxq_info when extracting queue_index and
ifindex, as these members are u32 like BPF_W.

Also fix trivial spelling mistake introduced in same commit.

Fixes: 02dd3291b2f0 ("bpf: finally expose xdp_rxq_info to XDP bpf-programs")
Reported-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h | 2 +-
 net/core/filter.c        | 7 +++----
 2 files changed, 4 insertions(+), 5 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 405317f9c064..395d261948de 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -899,7 +899,7 @@ struct xdp_md {
 	__u32 data;
 	__u32 data_end;
 	__u32 data_meta;
-	/* Below access go though struct xdp_rxq_info */
+	/* Below access go through struct xdp_rxq_info */
 	__u32 ingress_ifindex; /* rxq->dev->ifindex */
 	__u32 rx_queue_index;  /* rxq->queue_index  */
 };
diff --git a/net/core/filter.c b/net/core/filter.c
index d4b190e63b79..db2ee8c7e1bd 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4310,16 +4310,15 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type,
 				      si->dst_reg, si->dst_reg,
 				      offsetof(struct xdp_rxq_info, dev));
 		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
-				      bpf_target_off(struct net_device,
-						     ifindex, 4, target_size));
+				      offsetof(struct net_device, ifindex));
 		break;
 	case offsetof(struct xdp_md, rx_queue_index):
 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq),
 				      si->dst_reg, si->src_reg,
 				      offsetof(struct xdp_buff, rxq));
 		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
-				      bpf_target_off(struct xdp_rxq_info,
-						queue_index, 4, target_size));
+				      offsetof(struct xdp_rxq_info,
+					       queue_index));
 		break;
 	}
 
-- 
cgit v1.2.3


From ad6eb31ef90355993eb55ff77e0e855ae7d91e4c Mon Sep 17 00:00:00 2001
From: James Morse <james.morse@arm.com>
Date: Mon, 8 Jan 2018 15:38:09 +0000
Subject: firmware: arm_sdei: Add driver for Software Delegated Exceptions

The Software Delegated Exception Interface (SDEI) is an ARM standard
for registering callbacks from the platform firmware into the OS.
This is typically used to implement firmware notifications (such as
firmware-first RAS) or to deliver an IRQ that has been promoted to a
firmware-assisted NMI.

Add the code for detecting the SDEI version and the framework for
registering and unregistering events. Subsequent patches will add the
arch-specific backend code and the necessary power management hooks.

Only shared events are supported, power management, private events and
discovery for ACPI systems will be added by later patches.

Signed-off-by: James Morse <james.morse@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 MAINTAINERS                   |   9 +
 arch/arm64/include/asm/sdei.h |   8 +
 drivers/firmware/Kconfig      |   8 +
 drivers/firmware/Makefile     |   1 +
 drivers/firmware/arm_sdei.c   | 619 ++++++++++++++++++++++++++++++++++++++++++
 include/linux/arm_sdei.h      |  79 ++++++
 include/uapi/linux/arm_sdei.h |  73 +++++
 7 files changed, 797 insertions(+)
 create mode 100644 arch/arm64/include/asm/sdei.h
 create mode 100644 drivers/firmware/arm_sdei.c
 create mode 100644 include/linux/arm_sdei.h
 create mode 100644 include/uapi/linux/arm_sdei.h

(limited to 'include/uapi/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index 82ad0eabce4f..c06885c4c0f1 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -12618,6 +12618,15 @@ L:	linux-media@vger.kernel.org
 S:	Supported
 F:	drivers/media/pci/solo6x10/
 
+SOFTWARE DELEGATED EXCEPTION INTERFACE (SDEI)
+M:	James Morse <james.morse@arm.com>
+L:	linux-arm-kernel@lists.infradead.org
+S:	Maintained
+F:	Documentation/devicetree/bindings/arm/firmware/sdei.txt
+F:	drivers/firmware/arm_sdei.c
+F:	include/linux/arm_sdei.h
+F:	include/uapi/linux/arm_sdei.h
+
 SOFTWARE RAID (Multiple Disks) SUPPORT
 M:	Shaohua Li <shli@kernel.org>
 L:	linux-raid@vger.kernel.org
diff --git a/arch/arm64/include/asm/sdei.h b/arch/arm64/include/asm/sdei.h
new file mode 100644
index 000000000000..59f26b6e673d
--- /dev/null
+++ b/arch/arm64/include/asm/sdei.h
@@ -0,0 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (C) 2017 Arm Ltd.
+#ifndef __ASM_SDEI_H
+#define __ASM_SDEI_H
+
+/* Later patches add the arch specific bits */
+
+#endif /* __ASM_SDEI_H */
diff --git a/drivers/firmware/Kconfig b/drivers/firmware/Kconfig
index fa87a055905e..e77f77caa0f3 100644
--- a/drivers/firmware/Kconfig
+++ b/drivers/firmware/Kconfig
@@ -48,6 +48,14 @@ config ARM_SCPI_POWER_DOMAIN
 	  This enables support for the SCPI power domains which can be
 	  enabled or disabled via the SCP firmware
 
+config ARM_SDE_INTERFACE
+	bool "ARM Software Delegated Exception Interface (SDEI)"
+	depends on ARM64
+	help
+	  The Software Delegated Exception Interface (SDEI) is an ARM
+	  standard for registering callbacks from the platform firmware
+	  into the OS. This is typically used to implement RAS notifications.
+
 config EDD
 	tristate "BIOS Enhanced Disk Drive calls determine boot disk"
 	depends on X86
diff --git a/drivers/firmware/Makefile b/drivers/firmware/Makefile
index feaa890197f3..b248238ddc6a 100644
--- a/drivers/firmware/Makefile
+++ b/drivers/firmware/Makefile
@@ -6,6 +6,7 @@ obj-$(CONFIG_ARM_PSCI_FW)	+= psci.o
 obj-$(CONFIG_ARM_PSCI_CHECKER)	+= psci_checker.o
 obj-$(CONFIG_ARM_SCPI_PROTOCOL)	+= arm_scpi.o
 obj-$(CONFIG_ARM_SCPI_POWER_DOMAIN) += scpi_pm_domain.o
+obj-$(CONFIG_ARM_SDE_INTERFACE)	+= arm_sdei.o
 obj-$(CONFIG_DMI)		+= dmi_scan.o
 obj-$(CONFIG_DMI_SYSFS)		+= dmi-sysfs.o
 obj-$(CONFIG_EDD)		+= edd.o
diff --git a/drivers/firmware/arm_sdei.c b/drivers/firmware/arm_sdei.c
new file mode 100644
index 000000000000..8da173cc7e43
--- /dev/null
+++ b/drivers/firmware/arm_sdei.c
@@ -0,0 +1,619 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (C) 2017 Arm Ltd.
+#define pr_fmt(fmt) "sdei: " fmt
+
+#include <linux/acpi.h>
+#include <linux/arm_sdei.h>
+#include <linux/arm-smccc.h>
+#include <linux/bitops.h>
+#include <linux/compiler.h>
+#include <linux/errno.h>
+#include <linux/hardirq.h>
+#include <linux/kernel.h>
+#include <linux/kprobes.h>
+#include <linux/kvm_host.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/of.h>
+#include <linux/of_platform.h>
+#include <linux/percpu.h>
+#include <linux/platform_device.h>
+#include <linux/ptrace.h>
+#include <linux/preempt.h>
+#include <linux/slab.h>
+#include <linux/smp.h>
+#include <linux/spinlock.h>
+#include <linux/uaccess.h>
+
+/*
+ * The call to use to reach the firmware.
+ */
+static asmlinkage void (*sdei_firmware_call)(unsigned long function_id,
+		      unsigned long arg0, unsigned long arg1,
+		      unsigned long arg2, unsigned long arg3,
+		      unsigned long arg4, struct arm_smccc_res *res);
+
+/* entry point from firmware to arch asm code */
+static unsigned long sdei_entry_point;
+
+struct sdei_event {
+	struct list_head	list;
+	u32			event_num;
+	u8			type;
+	u8			priority;
+
+	/* This pointer is handed to firmware as the event argument. */
+	struct sdei_registered_event *registered;
+};
+
+/* Take the mutex for any API call or modification. Take the mutex first. */
+static DEFINE_MUTEX(sdei_events_lock);
+
+/* and then hold this when modifying the list */
+static DEFINE_SPINLOCK(sdei_list_lock);
+static LIST_HEAD(sdei_list);
+
+static int sdei_to_linux_errno(unsigned long sdei_err)
+{
+	switch (sdei_err) {
+	case SDEI_NOT_SUPPORTED:
+		return -EOPNOTSUPP;
+	case SDEI_INVALID_PARAMETERS:
+		return -EINVAL;
+	case SDEI_DENIED:
+		return -EPERM;
+	case SDEI_PENDING:
+		return -EINPROGRESS;
+	case SDEI_OUT_OF_RESOURCE:
+		return -ENOMEM;
+	}
+
+	/* Not an error value ... */
+	return sdei_err;
+}
+
+/*
+ * If x0 is any of these values, then the call failed, use sdei_to_linux_errno()
+ * to translate.
+ */
+static int sdei_is_err(struct arm_smccc_res *res)
+{
+	switch (res->a0) {
+	case SDEI_NOT_SUPPORTED:
+	case SDEI_INVALID_PARAMETERS:
+	case SDEI_DENIED:
+	case SDEI_PENDING:
+	case SDEI_OUT_OF_RESOURCE:
+		return true;
+	}
+
+	return false;
+}
+
+static int invoke_sdei_fn(unsigned long function_id, unsigned long arg0,
+			  unsigned long arg1, unsigned long arg2,
+			  unsigned long arg3, unsigned long arg4,
+			  u64 *result)
+{
+	int err = 0;
+	struct arm_smccc_res res;
+
+	if (sdei_firmware_call) {
+		sdei_firmware_call(function_id, arg0, arg1, arg2, arg3, arg4,
+				   &res);
+		if (sdei_is_err(&res))
+			err = sdei_to_linux_errno(res.a0);
+	} else {
+		/*
+		 * !sdei_firmware_call means we failed to probe or called
+		 * sdei_mark_interface_broken(). -EIO is not an error returned
+		 * by sdei_to_linux_errno() and is used to suppress messages
+		 * from this driver.
+		 */
+		err = -EIO;
+		res.a0 = SDEI_NOT_SUPPORTED;
+	}
+
+	if (result)
+		*result = res.a0;
+
+	return err;
+}
+
+static struct sdei_event *sdei_event_find(u32 event_num)
+{
+	struct sdei_event *e, *found = NULL;
+
+	lockdep_assert_held(&sdei_events_lock);
+
+	spin_lock(&sdei_list_lock);
+	list_for_each_entry(e, &sdei_list, list) {
+		if (e->event_num == event_num) {
+			found = e;
+			break;
+		}
+	}
+	spin_unlock(&sdei_list_lock);
+
+	return found;
+}
+
+int sdei_api_event_context(u32 query, u64 *result)
+{
+	return invoke_sdei_fn(SDEI_1_0_FN_SDEI_EVENT_CONTEXT, query, 0, 0, 0, 0,
+			      result);
+}
+NOKPROBE_SYMBOL(sdei_api_event_context);
+
+static int sdei_api_event_get_info(u32 event, u32 info, u64 *result)
+{
+	return invoke_sdei_fn(SDEI_1_0_FN_SDEI_EVENT_GET_INFO, event, info, 0,
+			      0, 0, result);
+}
+
+static struct sdei_event *sdei_event_create(u32 event_num,
+					    sdei_event_callback *cb,
+					    void *cb_arg)
+{
+	int err;
+	u64 result;
+	struct sdei_event *event;
+	struct sdei_registered_event *reg;
+
+	lockdep_assert_held(&sdei_events_lock);
+
+	event = kzalloc(sizeof(*event), GFP_KERNEL);
+	if (!event)
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD(&event->list);
+	event->event_num = event_num;
+
+	err = sdei_api_event_get_info(event_num, SDEI_EVENT_INFO_EV_PRIORITY,
+				      &result);
+	if (err) {
+		kfree(event);
+		return ERR_PTR(err);
+	}
+	event->priority = result;
+
+	err = sdei_api_event_get_info(event_num, SDEI_EVENT_INFO_EV_TYPE,
+				      &result);
+	if (err) {
+		kfree(event);
+		return ERR_PTR(err);
+	}
+	event->type = result;
+
+	if (event->type == SDEI_EVENT_TYPE_SHARED) {
+		reg = kzalloc(sizeof(*reg), GFP_KERNEL);
+		if (!reg) {
+			kfree(event);
+			return ERR_PTR(-ENOMEM);
+		}
+
+		reg->event_num = event_num;
+		reg->priority = event->priority;
+
+		reg->callback = cb;
+		reg->callback_arg = cb_arg;
+		event->registered = reg;
+	}
+
+	if (sdei_event_find(event_num)) {
+		kfree(event->registered);
+		kfree(event);
+		event = ERR_PTR(-EBUSY);
+	} else {
+		spin_lock(&sdei_list_lock);
+		list_add(&event->list, &sdei_list);
+		spin_unlock(&sdei_list_lock);
+	}
+
+	return event;
+}
+
+static void sdei_event_destroy(struct sdei_event *event)
+{
+	lockdep_assert_held(&sdei_events_lock);
+
+	spin_lock(&sdei_list_lock);
+	list_del(&event->list);
+	spin_unlock(&sdei_list_lock);
+
+	if (event->type == SDEI_EVENT_TYPE_SHARED)
+		kfree(event->registered);
+
+	kfree(event);
+}
+
+static int sdei_api_get_version(u64 *version)
+{
+	return invoke_sdei_fn(SDEI_1_0_FN_SDEI_VERSION, 0, 0, 0, 0, 0, version);
+}
+
+int sdei_mask_local_cpu(void)
+{
+	int err;
+
+	WARN_ON_ONCE(preemptible());
+
+	err = invoke_sdei_fn(SDEI_1_0_FN_SDEI_PE_MASK, 0, 0, 0, 0, 0, NULL);
+	if (err && err != -EIO) {
+		pr_warn_once("failed to mask CPU[%u]: %d\n",
+			      smp_processor_id(), err);
+		return err;
+	}
+
+	return 0;
+}
+
+static void _ipi_mask_cpu(void *ignored)
+{
+	sdei_mask_local_cpu();
+}
+
+int sdei_unmask_local_cpu(void)
+{
+	int err;
+
+	WARN_ON_ONCE(preemptible());
+
+	err = invoke_sdei_fn(SDEI_1_0_FN_SDEI_PE_UNMASK, 0, 0, 0, 0, 0, NULL);
+	if (err && err != -EIO) {
+		pr_warn_once("failed to unmask CPU[%u]: %d\n",
+			     smp_processor_id(), err);
+		return err;
+	}
+
+	return 0;
+}
+
+static void _ipi_unmask_cpu(void *ignored)
+{
+	sdei_unmask_local_cpu();
+}
+
+static void _ipi_private_reset(void *ignored)
+{
+	int err;
+
+	err = invoke_sdei_fn(SDEI_1_0_FN_SDEI_PRIVATE_RESET, 0, 0, 0, 0, 0,
+			     NULL);
+	if (err && err != -EIO)
+		pr_warn_once("failed to reset CPU[%u]: %d\n",
+			     smp_processor_id(), err);
+}
+
+static int sdei_api_shared_reset(void)
+{
+	return invoke_sdei_fn(SDEI_1_0_FN_SDEI_SHARED_RESET, 0, 0, 0, 0, 0,
+			      NULL);
+}
+
+static void sdei_mark_interface_broken(void)
+{
+	pr_err("disabling SDEI firmware interface\n");
+	on_each_cpu(&_ipi_mask_cpu, NULL, true);
+	sdei_firmware_call = NULL;
+}
+
+static int sdei_platform_reset(void)
+{
+	int err;
+
+	on_each_cpu(&_ipi_private_reset, NULL, true);
+	err = sdei_api_shared_reset();
+	if (err) {
+		pr_err("Failed to reset platform: %d\n", err);
+		sdei_mark_interface_broken();
+	}
+
+	return err;
+}
+
+static int sdei_api_event_enable(u32 event_num)
+{
+	return invoke_sdei_fn(SDEI_1_0_FN_SDEI_EVENT_ENABLE, event_num, 0, 0, 0,
+			      0, NULL);
+}
+
+int sdei_event_enable(u32 event_num)
+{
+	int err = -EINVAL;
+	struct sdei_event *event;
+
+	mutex_lock(&sdei_events_lock);
+	event = sdei_event_find(event_num);
+	if (!event) {
+		mutex_unlock(&sdei_events_lock);
+		return -ENOENT;
+	}
+
+	if (event->type == SDEI_EVENT_TYPE_SHARED)
+		err = sdei_api_event_enable(event->event_num);
+	mutex_unlock(&sdei_events_lock);
+
+	return err;
+}
+EXPORT_SYMBOL(sdei_event_enable);
+
+static int sdei_api_event_disable(u32 event_num)
+{
+	return invoke_sdei_fn(SDEI_1_0_FN_SDEI_EVENT_DISABLE, event_num, 0, 0,
+			      0, 0, NULL);
+}
+
+int sdei_event_disable(u32 event_num)
+{
+	int err = -EINVAL;
+	struct sdei_event *event;
+
+	mutex_lock(&sdei_events_lock);
+	event = sdei_event_find(event_num);
+	if (!event) {
+		mutex_unlock(&sdei_events_lock);
+		return -ENOENT;
+	}
+
+	if (event->type == SDEI_EVENT_TYPE_SHARED)
+		err = sdei_api_event_disable(event->event_num);
+	mutex_unlock(&sdei_events_lock);
+
+	return err;
+}
+EXPORT_SYMBOL(sdei_event_disable);
+
+static int sdei_api_event_unregister(u32 event_num)
+{
+	return invoke_sdei_fn(SDEI_1_0_FN_SDEI_EVENT_UNREGISTER, event_num, 0,
+			      0, 0, 0, NULL);
+}
+
+static int _sdei_event_unregister(struct sdei_event *event)
+{
+	lockdep_assert_held(&sdei_events_lock);
+
+	if (event->type == SDEI_EVENT_TYPE_SHARED)
+		return sdei_api_event_unregister(event->event_num);
+
+	return -EINVAL;
+}
+
+int sdei_event_unregister(u32 event_num)
+{
+	int err;
+	struct sdei_event *event;
+
+	WARN_ON(in_nmi());
+
+	mutex_lock(&sdei_events_lock);
+	event = sdei_event_find(event_num);
+	do {
+		if (!event) {
+			pr_warn("Event %u not registered\n", event_num);
+			err = -ENOENT;
+			break;
+		}
+
+		err = _sdei_event_unregister(event);
+		if (err)
+			break;
+
+		sdei_event_destroy(event);
+	} while (0);
+	mutex_unlock(&sdei_events_lock);
+
+	return err;
+}
+EXPORT_SYMBOL(sdei_event_unregister);
+
+static int sdei_api_event_register(u32 event_num, unsigned long entry_point,
+				   void *arg, u64 flags, u64 affinity)
+{
+	return invoke_sdei_fn(SDEI_1_0_FN_SDEI_EVENT_REGISTER, event_num,
+			      (unsigned long)entry_point, (unsigned long)arg,
+			      flags, affinity, NULL);
+}
+
+static int _sdei_event_register(struct sdei_event *event)
+{
+	lockdep_assert_held(&sdei_events_lock);
+
+	if (event->type == SDEI_EVENT_TYPE_SHARED)
+		return sdei_api_event_register(event->event_num,
+					       sdei_entry_point,
+					       event->registered,
+					       SDEI_EVENT_REGISTER_RM_ANY, 0);
+
+	return -EINVAL;
+}
+
+int sdei_event_register(u32 event_num, sdei_event_callback *cb, void *arg)
+{
+	int err;
+	struct sdei_event *event;
+
+	WARN_ON(in_nmi());
+
+	mutex_lock(&sdei_events_lock);
+	do {
+		if (sdei_event_find(event_num)) {
+			pr_warn("Event %u already registered\n", event_num);
+			err = -EBUSY;
+			break;
+		}
+
+		event = sdei_event_create(event_num, cb, arg);
+		if (IS_ERR(event)) {
+			err = PTR_ERR(event);
+			pr_warn("Failed to create event %u: %d\n", event_num,
+				err);
+			break;
+		}
+
+		err = _sdei_event_register(event);
+		if (err) {
+			sdei_event_destroy(event);
+			pr_warn("Failed to register event %u: %d\n", event_num,
+				err);
+		}
+	} while (0);
+	mutex_unlock(&sdei_events_lock);
+
+	return err;
+}
+EXPORT_SYMBOL(sdei_event_register);
+
+static void sdei_smccc_smc(unsigned long function_id,
+			   unsigned long arg0, unsigned long arg1,
+			   unsigned long arg2, unsigned long arg3,
+			   unsigned long arg4, struct arm_smccc_res *res)
+{
+	arm_smccc_smc(function_id, arg0, arg1, arg2, arg3, arg4, 0, 0, res);
+}
+
+static void sdei_smccc_hvc(unsigned long function_id,
+			   unsigned long arg0, unsigned long arg1,
+			   unsigned long arg2, unsigned long arg3,
+			   unsigned long arg4, struct arm_smccc_res *res)
+{
+	arm_smccc_hvc(function_id, arg0, arg1, arg2, arg3, arg4, 0, 0, res);
+}
+
+static int sdei_get_conduit(struct platform_device *pdev)
+{
+	const char *method;
+	struct device_node *np = pdev->dev.of_node;
+
+	sdei_firmware_call = NULL;
+	if (np) {
+		if (of_property_read_string(np, "method", &method)) {
+			pr_warn("missing \"method\" property\n");
+			return CONDUIT_INVALID;
+		}
+
+		if (!strcmp("hvc", method)) {
+			sdei_firmware_call = &sdei_smccc_hvc;
+			return CONDUIT_HVC;
+		} else if (!strcmp("smc", method)) {
+			sdei_firmware_call = &sdei_smccc_smc;
+			return CONDUIT_SMC;
+		}
+
+		pr_warn("invalid \"method\" property: %s\n", method);
+	}
+
+	return CONDUIT_INVALID;
+}
+
+static int sdei_probe(struct platform_device *pdev)
+{
+	int err;
+	u64 ver = 0;
+	int conduit;
+
+	conduit = sdei_get_conduit(pdev);
+	if (!sdei_firmware_call)
+		return 0;
+
+	err = sdei_api_get_version(&ver);
+	if (err == -EOPNOTSUPP)
+		pr_err("advertised but not implemented in platform firmware\n");
+	if (err) {
+		pr_err("Failed to get SDEI version: %d\n", err);
+		sdei_mark_interface_broken();
+		return err;
+	}
+
+	pr_info("SDEIv%d.%d (0x%x) detected in firmware.\n",
+		(int)SDEI_VERSION_MAJOR(ver), (int)SDEI_VERSION_MINOR(ver),
+		(int)SDEI_VERSION_VENDOR(ver));
+
+	if (SDEI_VERSION_MAJOR(ver) != 1) {
+		pr_warn("Conflicting SDEI version detected.\n");
+		sdei_mark_interface_broken();
+		return -EINVAL;
+	}
+
+	err = sdei_platform_reset();
+	if (err)
+		return err;
+
+	sdei_entry_point = sdei_arch_get_entry_point(conduit);
+	if (!sdei_entry_point) {
+		/* Not supported due to hardware or boot configuration */
+		sdei_mark_interface_broken();
+		return 0;
+	}
+
+	on_each_cpu(&_ipi_unmask_cpu, NULL, false);
+
+	return 0;
+}
+
+static const struct of_device_id sdei_of_match[] = {
+	{ .compatible = "arm,sdei-1.0" },
+	{}
+};
+
+static struct platform_driver sdei_driver = {
+	.driver		= {
+		.name			= "sdei",
+		.of_match_table		= sdei_of_match,
+	},
+	.probe		= sdei_probe,
+};
+
+static bool __init sdei_present_dt(void)
+{
+	struct platform_device *pdev;
+	struct device_node *np, *fw_np;
+
+	fw_np = of_find_node_by_name(NULL, "firmware");
+	if (!fw_np)
+		return false;
+
+	np = of_find_matching_node(fw_np, sdei_of_match);
+	/* of_find_matching_node() already dropped the fw_np reference. */
+	if (!np)
+		return false;
+
+	pdev = of_platform_device_create(np, sdei_driver.driver.name, NULL);
+	of_node_put(np);
+	if (!pdev)	/* NULL, never an ERR_PTR, on failure */
+		return false;
+
+	return true;
+}
+
+static int __init sdei_init(void)
+{
+	if (sdei_present_dt())
+		platform_driver_register(&sdei_driver);
+
+	return 0;
+}
+
+subsys_initcall_sync(sdei_init);
+
+int sdei_event_handler(struct pt_regs *regs,
+		       struct sdei_registered_event *arg)
+{
+	int err;
+	mm_segment_t orig_addr_limit;
+	u32 event_num = arg->event_num;
+
+	orig_addr_limit = get_fs();
+	set_fs(USER_DS);
+
+	err = arg->callback(event_num, regs, arg->callback_arg);
+	if (err)
+		pr_err_ratelimited("event %u on CPU %u failed with error: %d\n",
+				   event_num, smp_processor_id(), err);
+
+	set_fs(orig_addr_limit);
+
+	return err;
+}
+NOKPROBE_SYMBOL(sdei_event_handler);
diff --git a/include/linux/arm_sdei.h b/include/linux/arm_sdei.h
new file mode 100644
index 000000000000..942afbd544b7
--- /dev/null
+++ b/include/linux/arm_sdei.h
@@ -0,0 +1,79 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (C) 2017 Arm Ltd.
+#ifndef __LINUX_ARM_SDEI_H
+#define __LINUX_ARM_SDEI_H
+
+#include <uapi/linux/arm_sdei.h>
+
+enum sdei_conduit_types {
+	CONDUIT_INVALID = 0,
+	CONDUIT_SMC,
+	CONDUIT_HVC,
+};
+
+#include <asm/sdei.h>
+
+/* Arch code should override this to set the entry point from firmware... */
+#ifndef sdei_arch_get_entry_point
+#define sdei_arch_get_entry_point(conduit)	(0)
+#endif
+
+/*
+ * When an event occurs sdei_event_handler() will call a user-provided callback
+ * like this in NMI context on the CPU that received the event.
+ */
+typedef int (sdei_event_callback)(u32 event, struct pt_regs *regs, void *arg);
+
+/*
+ * Register your callback to claim an event. The event must be described
+ * by firmware.
+ */
+int sdei_event_register(u32 event_num, sdei_event_callback *cb, void *arg);
+
+/*
+ * Calls to sdei_event_unregister() may return EINPROGRESS. Keep calling
+ * it until it succeeds.
+ */
+int sdei_event_unregister(u32 event_num);
+
+int sdei_event_enable(u32 event_num);
+int sdei_event_disable(u32 event_num);
+
+#ifdef CONFIG_ARM_SDE_INTERFACE
+/* For use by arch code when CPU hotplug notifiers are not appropriate. */
+int sdei_mask_local_cpu(void);
+int sdei_unmask_local_cpu(void);
+#else
+static inline int sdei_mask_local_cpu(void) { return 0; }
+static inline int sdei_unmask_local_cpu(void) { return 0; }
+#endif /* CONFIG_ARM_SDE_INTERFACE */
+
+
+/*
+ * This struct represents an event that has been registered. The driver
+ * maintains a list of all events, and which ones are registered. (Private
+ * events have one entry in the list, but are registered on each CPU).
+ * A pointer to this struct is passed to firmware, and back to the event
+ * handler. The event handler can then use this to invoke the registered
+ * callback, without having to walk the list.
+ *
+ * For CPU private events, this structure is per-cpu.
+ */
+struct sdei_registered_event {
+	/* For use by arch code: */
+	struct pt_regs          interrupted_regs;
+
+	sdei_event_callback	*callback;
+	void			*callback_arg;
+	u32			 event_num;
+	u8			 priority;
+};
+
+/* The arch code entry point should then call this when an event arrives. */
+int notrace sdei_event_handler(struct pt_regs *regs,
+			       struct sdei_registered_event *arg);
+
+/* arch code may use this to retrieve the extra registers. */
+int sdei_api_event_context(u32 query, u64 *result);
+
+#endif /* __LINUX_ARM_SDEI_H */
diff --git a/include/uapi/linux/arm_sdei.h b/include/uapi/linux/arm_sdei.h
new file mode 100644
index 000000000000..af0630ba5437
--- /dev/null
+++ b/include/uapi/linux/arm_sdei.h
@@ -0,0 +1,73 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/* Copyright (C) 2017 Arm Ltd. */
+#ifndef _UAPI_LINUX_ARM_SDEI_H
+#define _UAPI_LINUX_ARM_SDEI_H
+
+#define SDEI_1_0_FN_BASE			0xC4000020
+#define SDEI_1_0_MASK				0xFFFFFFE0
+#define SDEI_1_0_FN(n)				(SDEI_1_0_FN_BASE + (n))
+
+#define SDEI_1_0_FN_SDEI_VERSION			SDEI_1_0_FN(0x00)
+#define SDEI_1_0_FN_SDEI_EVENT_REGISTER			SDEI_1_0_FN(0x01)
+#define SDEI_1_0_FN_SDEI_EVENT_ENABLE			SDEI_1_0_FN(0x02)
+#define SDEI_1_0_FN_SDEI_EVENT_DISABLE			SDEI_1_0_FN(0x03)
+#define SDEI_1_0_FN_SDEI_EVENT_CONTEXT			SDEI_1_0_FN(0x04)
+#define SDEI_1_0_FN_SDEI_EVENT_COMPLETE			SDEI_1_0_FN(0x05)
+#define SDEI_1_0_FN_SDEI_EVENT_COMPLETE_AND_RESUME	SDEI_1_0_FN(0x06)
+#define SDEI_1_0_FN_SDEI_EVENT_UNREGISTER		SDEI_1_0_FN(0x07)
+#define SDEI_1_0_FN_SDEI_EVENT_STATUS			SDEI_1_0_FN(0x08)
+#define SDEI_1_0_FN_SDEI_EVENT_GET_INFO			SDEI_1_0_FN(0x09)
+#define SDEI_1_0_FN_SDEI_EVENT_ROUTING_SET		SDEI_1_0_FN(0x0A)
+#define SDEI_1_0_FN_SDEI_PE_MASK			SDEI_1_0_FN(0x0B)
+#define SDEI_1_0_FN_SDEI_PE_UNMASK			SDEI_1_0_FN(0x0C)
+#define SDEI_1_0_FN_SDEI_INTERRUPT_BIND			SDEI_1_0_FN(0x0D)
+#define SDEI_1_0_FN_SDEI_INTERRUPT_RELEASE		SDEI_1_0_FN(0x0E)
+#define SDEI_1_0_FN_SDEI_PRIVATE_RESET			SDEI_1_0_FN(0x11)
+#define SDEI_1_0_FN_SDEI_SHARED_RESET			SDEI_1_0_FN(0x12)
+
+#define SDEI_VERSION_MAJOR_SHIFT			48
+#define SDEI_VERSION_MAJOR_MASK				0x7fff
+#define SDEI_VERSION_MINOR_SHIFT			32
+#define SDEI_VERSION_MINOR_MASK				0xffff
+#define SDEI_VERSION_VENDOR_SHIFT			0
+#define SDEI_VERSION_VENDOR_MASK			0xffffffff
+/* Extract fields from a version word; any expression is safe as x. */
+#define SDEI_VERSION_MAJOR(x)	(((x) >> SDEI_VERSION_MAJOR_SHIFT) & SDEI_VERSION_MAJOR_MASK)
+#define SDEI_VERSION_MINOR(x)	(((x) >> SDEI_VERSION_MINOR_SHIFT) & SDEI_VERSION_MINOR_MASK)
+#define SDEI_VERSION_VENDOR(x)	(((x) >> SDEI_VERSION_VENDOR_SHIFT) & SDEI_VERSION_VENDOR_MASK)
+
+/* SDEI return values */
+#define SDEI_SUCCESS		0
+#define SDEI_NOT_SUPPORTED	-1
+#define SDEI_INVALID_PARAMETERS	-2
+#define SDEI_DENIED		-3
+#define SDEI_PENDING		-5
+#define SDEI_OUT_OF_RESOURCE	-10
+
+/* EVENT_REGISTER flags */
+#define SDEI_EVENT_REGISTER_RM_ANY	0
+#define SDEI_EVENT_REGISTER_RM_PE	1
+
+/* EVENT_STATUS return value bits */
+#define SDEI_EVENT_STATUS_RUNNING	2
+#define SDEI_EVENT_STATUS_ENABLED	1
+#define SDEI_EVENT_STATUS_REGISTERED	0
+
+/* EVENT_COMPLETE status values */
+#define SDEI_EV_HANDLED	0
+#define SDEI_EV_FAILED	1
+
+/* GET_INFO values */
+#define SDEI_EVENT_INFO_EV_TYPE			0
+#define SDEI_EVENT_INFO_EV_SIGNALED		1
+#define SDEI_EVENT_INFO_EV_PRIORITY		2
+#define SDEI_EVENT_INFO_EV_ROUTING_MODE		3
+#define SDEI_EVENT_INFO_EV_ROUTING_AFF		4
+
+/* and their results */
+#define SDEI_EVENT_TYPE_PRIVATE			0
+#define SDEI_EVENT_TYPE_SHARED			1
+#define SDEI_EVENT_PRIORITY_NORMAL		0
+#define SDEI_EVENT_PRIORITY_CRITICAL		1
+
+#endif /* _UAPI_LINUX_ARM_SDEI_H */
-- 
cgit v1.2.3


From a38845729ea3985db5d2544ec3ef3dc8f6313a27 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Thu, 11 Jan 2018 20:29:09 -0800
Subject: bpf: offload: add map offload infrastructure

BPF map offload follows a similar path to program offload.  At creation
time users may specify ifindex of the device on which they want to
create the map.  Map will be validated by the kernel's
.map_alloc_check callback and device driver will be called for the
actual allocation.  Map will have an empty set of operations
associated with it (save for alloc and free callbacks).  The real
device callbacks are kept in map->offload->dev_ops because they
have slightly different signatures.  Map operations are called in
process context so the driver may communicate with HW freely,
msleep(), wait() etc.

Map alloc and free callbacks are muxed via existing .ndo_bpf, and
are always called with rtnl lock held.  Maps and programs are
guaranteed to be destroyed before .ndo_uninit (i.e. before
unregister_netdev() returns).  Map callbacks are invoked with
bpf_devs_lock *read* locked, drivers must take care of exclusive
locking if necessary.

All offload-specific branches are marked with unlikely() (through
bpf_map_is_dev_bound()), given that branch penalty will be
negligible compared to IO anyway, and we don't want to penalize
SW path unnecessarily.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf.h            |  59 +++++++++++++
 include/linux/netdevice.h      |   6 ++
 include/uapi/linux/bpf.h       |   1 +
 kernel/bpf/offload.c           | 188 +++++++++++++++++++++++++++++++++++++++--
 kernel/bpf/syscall.c           |  44 ++++++++--
 kernel/bpf/verifier.c          |   7 ++
 tools/include/uapi/linux/bpf.h |   1 +
 7 files changed, 293 insertions(+), 13 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 9fff1ace1d8e..5c2c104dc2c5 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -74,6 +74,33 @@ struct bpf_map {
 	char name[BPF_OBJ_NAME_LEN];
 };
 
+struct bpf_offloaded_map;
+
+struct bpf_map_dev_ops {
+	int (*map_get_next_key)(struct bpf_offloaded_map *map,
+				void *key, void *next_key);
+	int (*map_lookup_elem)(struct bpf_offloaded_map *map,
+			       void *key, void *value);
+	int (*map_update_elem)(struct bpf_offloaded_map *map,
+			       void *key, void *value, u64 flags);
+	int (*map_delete_elem)(struct bpf_offloaded_map *map, void *key);
+};
+
+struct bpf_offloaded_map {
+	struct bpf_map map;
+	struct net_device *netdev;
+	const struct bpf_map_dev_ops *dev_ops;
+	void *dev_priv;
+	struct list_head offloads;
+};
+
+static inline struct bpf_offloaded_map *map_to_offmap(struct bpf_map *map)
+{
+	return container_of(map, struct bpf_offloaded_map, map);
+}
+
+extern const struct bpf_map_ops bpf_map_offload_ops;
+
 /* function argument constraints */
 enum bpf_arg_type {
 	ARG_DONTCARE = 0,	/* unused argument in helper function */
@@ -369,6 +396,7 @@ int __bpf_prog_charge(struct user_struct *user, u32 pages);
 void __bpf_prog_uncharge(struct user_struct *user, u32 pages);
 
 void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock);
+void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock);
 
 struct bpf_map *bpf_map_get_with_uref(u32 ufd);
 struct bpf_map *__bpf_map_get(struct fd f);
@@ -556,6 +584,15 @@ void bpf_prog_offload_destroy(struct bpf_prog *prog);
 int bpf_prog_offload_info_fill(struct bpf_prog_info *info,
 			       struct bpf_prog *prog);
 
+int bpf_map_offload_lookup_elem(struct bpf_map *map, void *key, void *value);
+int bpf_map_offload_update_elem(struct bpf_map *map,
+				void *key, void *value, u64 flags);
+int bpf_map_offload_delete_elem(struct bpf_map *map, void *key);
+int bpf_map_offload_get_next_key(struct bpf_map *map,
+				 void *key, void *next_key);
+
+bool bpf_offload_dev_match(struct bpf_prog *prog, struct bpf_map *map);
+
 #if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL)
 int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr);
 
@@ -563,6 +600,14 @@ static inline bool bpf_prog_is_dev_bound(struct bpf_prog_aux *aux)
 {
 	return aux->offload_requested;
 }
+
+static inline bool bpf_map_is_dev_bound(struct bpf_map *map)
+{
+	return unlikely(map->ops == &bpf_map_offload_ops);
+}
+
+struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr);
+void bpf_map_offload_map_free(struct bpf_map *map);
 #else
 static inline int bpf_prog_offload_init(struct bpf_prog *prog,
 					union bpf_attr *attr)
@@ -574,6 +619,20 @@ static inline bool bpf_prog_is_dev_bound(struct bpf_prog_aux *aux)
 {
 	return false;
 }
+
+static inline bool bpf_map_is_dev_bound(struct bpf_map *map)
+{
+	return false;
+}
+
+static inline struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr)
+{
+	return ERR_PTR(-EOPNOTSUPP);
+}
+
+static inline void bpf_map_offload_map_free(struct bpf_map *map)
+{
+}
 #endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */
 
 #if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_INET)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index ef7b348e8498..0b3ab42d50fe 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -804,6 +804,8 @@ enum bpf_netdev_command {
 	BPF_OFFLOAD_VERIFIER_PREP,
 	BPF_OFFLOAD_TRANSLATE,
 	BPF_OFFLOAD_DESTROY,
+	BPF_OFFLOAD_MAP_ALLOC,
+	BPF_OFFLOAD_MAP_FREE,
 };
 
 struct bpf_prog_offload_ops;
@@ -834,6 +836,10 @@ struct netdev_bpf {
 		struct {
 			struct bpf_prog *prog;
 		} offload;
+		/* BPF_OFFLOAD_MAP_ALLOC, BPF_OFFLOAD_MAP_FREE */
+		struct {
+			struct bpf_offloaded_map *offmap;
+		};
 	};
 };
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 395d261948de..7c2259e8bc54 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -245,6 +245,7 @@ union bpf_attr {
 					 * BPF_F_NUMA_NODE is set).
 					 */
 		char	map_name[BPF_OBJ_NAME_LEN];
+		__u32	map_ifindex;	/* ifindex of netdev to create on */
 	};
 
 	struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index cdd1e19a668b..453785fa1881 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -24,11 +24,13 @@
 #include <linux/rtnetlink.h>
 #include <linux/rwsem.h>
 
-/* Protects bpf_prog_offload_devs and offload members of all progs.
+/* Protects bpf_prog_offload_devs, bpf_map_offload_devs and offload members
+ * of all progs.
  * RTNL lock cannot be taken when holding this lock.
  */
 static DECLARE_RWSEM(bpf_devs_lock);
 static LIST_HEAD(bpf_prog_offload_devs);
+static LIST_HEAD(bpf_map_offload_devs);
 
 static int bpf_dev_offload_check(struct net_device *netdev)
 {
@@ -250,11 +252,186 @@ int bpf_prog_offload_info_fill(struct bpf_prog_info *info,
 const struct bpf_prog_ops bpf_offload_prog_ops = {
 };
 
+static int bpf_map_offload_ndo(struct bpf_offloaded_map *offmap,
+			       enum bpf_netdev_command cmd)
+{
+	struct netdev_bpf data = {};
+	struct net_device *netdev;
+
+	ASSERT_RTNL();
+
+	data.command = cmd;
+	data.offmap = offmap;
+	/* Caller must make sure netdev is valid */
+	netdev = offmap->netdev;
+
+	return netdev->netdev_ops->ndo_bpf(netdev, &data);
+}
+
+struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr)
+{
+	struct net *net = current->nsproxy->net_ns;
+	struct bpf_offloaded_map *offmap;
+	int err;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return ERR_PTR(-EPERM);
+	if (attr->map_type != BPF_MAP_TYPE_HASH)
+		return ERR_PTR(-EINVAL);
+
+	offmap = kzalloc(sizeof(*offmap), GFP_USER);
+	if (!offmap)
+		return ERR_PTR(-ENOMEM);
+
+	bpf_map_init_from_attr(&offmap->map, attr);
+
+	rtnl_lock();
+	down_write(&bpf_devs_lock);
+	offmap->netdev = __dev_get_by_index(net, attr->map_ifindex);
+	err = bpf_dev_offload_check(offmap->netdev);
+	if (err)
+		goto err_unlock;
+
+	err = bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_ALLOC);
+	if (err)
+		goto err_unlock;
+
+	list_add_tail(&offmap->offloads, &bpf_map_offload_devs);
+	up_write(&bpf_devs_lock);
+	rtnl_unlock();
+
+	return &offmap->map;
+
+err_unlock:
+	up_write(&bpf_devs_lock);
+	rtnl_unlock();
+	kfree(offmap);
+	return ERR_PTR(err);
+}
+
+static void __bpf_map_offload_destroy(struct bpf_offloaded_map *offmap)
+{
+	WARN_ON(bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_FREE));
+	/* Make sure BPF_MAP_GET_NEXT_ID can't find this dead map */
+	bpf_map_free_id(&offmap->map, true);
+	list_del_init(&offmap->offloads);
+	offmap->netdev = NULL;
+}
+
+void bpf_map_offload_map_free(struct bpf_map *map)
+{
+	struct bpf_offloaded_map *offmap = map_to_offmap(map);
+
+	rtnl_lock();
+	down_write(&bpf_devs_lock);
+	if (offmap->netdev)
+		__bpf_map_offload_destroy(offmap);
+	up_write(&bpf_devs_lock);
+	rtnl_unlock();
+
+	kfree(offmap);
+}
+
+int bpf_map_offload_lookup_elem(struct bpf_map *map, void *key, void *value)
+{
+	struct bpf_offloaded_map *offmap = map_to_offmap(map);
+	int ret = -ENODEV;
+
+	down_read(&bpf_devs_lock);
+	if (offmap->netdev)
+		ret = offmap->dev_ops->map_lookup_elem(offmap, key, value);
+	up_read(&bpf_devs_lock);
+
+	return ret;
+}
+
+int bpf_map_offload_update_elem(struct bpf_map *map,
+				void *key, void *value, u64 flags)
+{
+	struct bpf_offloaded_map *offmap = map_to_offmap(map);
+	int ret = -ENODEV;
+
+	if (unlikely(flags > BPF_EXIST))
+		return -EINVAL;
+
+	down_read(&bpf_devs_lock);
+	if (offmap->netdev)
+		ret = offmap->dev_ops->map_update_elem(offmap, key, value,
+						       flags);
+	up_read(&bpf_devs_lock);
+
+	return ret;
+}
+
+int bpf_map_offload_delete_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_offloaded_map *offmap = map_to_offmap(map);
+	int ret = -ENODEV;
+
+	down_read(&bpf_devs_lock);
+	if (offmap->netdev)
+		ret = offmap->dev_ops->map_delete_elem(offmap, key);
+	up_read(&bpf_devs_lock);
+
+	return ret;
+}
+
+int bpf_map_offload_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+	struct bpf_offloaded_map *offmap = map_to_offmap(map);
+	int ret = -ENODEV;
+
+	down_read(&bpf_devs_lock);
+	if (offmap->netdev)
+		ret = offmap->dev_ops->map_get_next_key(offmap, key, next_key);
+	up_read(&bpf_devs_lock);
+
+	return ret;
+}
+
+bool bpf_offload_dev_match(struct bpf_prog *prog, struct bpf_map *map)
+{
+	struct bpf_offloaded_map *offmap;
+	struct bpf_prog_offload *offload;
+	bool ret;
+
+	if (!!bpf_prog_is_dev_bound(prog->aux) != !!bpf_map_is_dev_bound(map))
+		return false;
+	if (!bpf_prog_is_dev_bound(prog->aux))
+		return true;
+
+	down_read(&bpf_devs_lock);
+	offload = prog->aux->offload;
+	offmap = map_to_offmap(map);
+
+	ret = offload && offload->netdev == offmap->netdev;
+	up_read(&bpf_devs_lock);
+
+	return ret;
+}
+
+static void bpf_offload_orphan_all_progs(struct net_device *netdev)
+{
+	struct bpf_prog_offload *offload, *tmp;
+
+	list_for_each_entry_safe(offload, tmp, &bpf_prog_offload_devs, offloads)
+		if (offload->netdev == netdev)
+			__bpf_prog_offload_destroy(offload->prog);
+}
+
+static void bpf_offload_orphan_all_maps(struct net_device *netdev)
+{
+	struct bpf_offloaded_map *offmap, *tmp;
+
+	list_for_each_entry_safe(offmap, tmp, &bpf_map_offload_devs, offloads)
+		if (offmap->netdev == netdev)
+			__bpf_map_offload_destroy(offmap);
+}
+
 static int bpf_offload_notification(struct notifier_block *notifier,
 				    ulong event, void *ptr)
 {
 	struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
-	struct bpf_prog_offload *offload, *tmp;
 
 	ASSERT_RTNL();
 
@@ -265,11 +442,8 @@ static int bpf_offload_notification(struct notifier_block *notifier,
 			break;
 
 		down_write(&bpf_devs_lock);
-		list_for_each_entry_safe(offload, tmp, &bpf_prog_offload_devs,
-					 offloads) {
-			if (offload->netdev == netdev)
-				__bpf_prog_offload_destroy(offload->prog);
-		}
+		bpf_offload_orphan_all_progs(netdev);
+		bpf_offload_orphan_all_maps(netdev);
 		up_write(&bpf_devs_lock);
 		break;
 	default:
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index a3f726bb42ea..c691b9e972e3 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -94,6 +94,11 @@ static int check_uarg_tail_zero(void __user *uaddr,
 	return 0;
 }
 
+const struct bpf_map_ops bpf_map_offload_ops = {
+	.map_alloc = bpf_map_offload_map_alloc,
+	.map_free = bpf_map_offload_map_free,
+};
+
 static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
 {
 	const struct bpf_map_ops *ops;
@@ -111,6 +116,8 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
 		if (err)
 			return ERR_PTR(err);
 	}
+	if (attr->map_ifindex)
+		ops = &bpf_map_offload_ops;
 	map = ops->map_alloc(attr);
 	if (IS_ERR(map))
 		return map;
@@ -208,16 +215,25 @@ static int bpf_map_alloc_id(struct bpf_map *map)
 	return id > 0 ? 0 : id;
 }
 
-static void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
+void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
 {
 	unsigned long flags;
 
+	/* Offloaded maps are removed from the IDR store when their device
+	 * disappears - even if someone holds an fd to them they are unusable,
+	 * the memory is gone, all ops will fail; they are simply waiting for
+	 * refcnt to drop to be freed.
+	 */
+	if (!map->id)
+		return;
+
 	if (do_idr_lock)
 		spin_lock_irqsave(&map_idr_lock, flags);
 	else
 		__acquire(&map_idr_lock);
 
 	idr_remove(&map_idr, map->id);
+	map->id = 0;
 
 	if (do_idr_lock)
 		spin_unlock_irqrestore(&map_idr_lock, flags);
@@ -397,7 +413,7 @@ static int bpf_obj_name_cpy(char *dst, const char *src)
 	return 0;
 }
 
-#define BPF_MAP_CREATE_LAST_FIELD map_name
+#define BPF_MAP_CREATE_LAST_FIELD map_ifindex
 /* called via syscall */
 static int map_create(union bpf_attr *attr)
 {
@@ -585,8 +601,10 @@ static int map_lookup_elem(union bpf_attr *attr)
 	if (!value)
 		goto free_key;
 
-	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
-	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
+	if (bpf_map_is_dev_bound(map)) {
+		err = bpf_map_offload_lookup_elem(map, key, value);
+	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+		   map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
 		err = bpf_percpu_hash_copy(map, key, value);
 	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
 		err = bpf_percpu_array_copy(map, key, value);
@@ -673,7 +691,10 @@ static int map_update_elem(union bpf_attr *attr)
 		goto free_value;
 
 	/* Need to create a kthread, thus must support schedule */
-	if (map->map_type == BPF_MAP_TYPE_CPUMAP) {
+	if (bpf_map_is_dev_bound(map)) {
+		err = bpf_map_offload_update_elem(map, key, value, attr->flags);
+		goto out;
+	} else if (map->map_type == BPF_MAP_TYPE_CPUMAP) {
 		err = map->ops->map_update_elem(map, key, value, attr->flags);
 		goto out;
 	}
@@ -750,6 +771,11 @@ static int map_delete_elem(union bpf_attr *attr)
 		goto err_put;
 	}
 
+	if (bpf_map_is_dev_bound(map)) {
+		err = bpf_map_offload_delete_elem(map, key);
+		goto out;
+	}
+
 	preempt_disable();
 	__this_cpu_inc(bpf_prog_active);
 	rcu_read_lock();
@@ -757,7 +783,7 @@ static int map_delete_elem(union bpf_attr *attr)
 	rcu_read_unlock();
 	__this_cpu_dec(bpf_prog_active);
 	preempt_enable();
-
+out:
 	if (!err)
 		trace_bpf_map_delete_elem(map, ufd, key);
 	kfree(key);
@@ -807,9 +833,15 @@ static int map_get_next_key(union bpf_attr *attr)
 	if (!next_key)
 		goto free_key;
 
+	if (bpf_map_is_dev_bound(map)) {
+		err = bpf_map_offload_get_next_key(map, key, next_key);
+		goto out;
+	}
+
 	rcu_read_lock();
 	err = map->ops->map_get_next_key(map, key, next_key);
 	rcu_read_unlock();
+out:
 	if (err)
 		goto free_next_key;
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 48b61caa94cb..ceabb394d2dc 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4816,6 +4816,13 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
 			return -EINVAL;
 		}
 	}
+
+	if ((bpf_prog_is_dev_bound(prog->aux) || bpf_map_is_dev_bound(map)) &&
+	    !bpf_offload_dev_match(prog, map)) {
+		verbose(env, "offload device mismatch between prog and map\n");
+		return -EINVAL;
+	}
+
 	return 0;
 }
 
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 4e8c60acfa32..69f96af4a569 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -245,6 +245,7 @@ union bpf_attr {
 					 * BPF_F_NUMA_NODE is set).
 					 */
 		char	map_name[BPF_OBJ_NAME_LEN];
+		__u32	map_ifindex;	/* ifindex of netdev to create on */
 	};
 
 	struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
-- 
cgit v1.2.3


From fb455baad6fc4de77d762e89dae75c2e2aa98559 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 4 Dec 2017 14:13:30 -0500
Subject: nfs: Define NFS_RDMA_PORT

The NFS/RDMA port assignment is specified in Section 9 of RFC 8267.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 include/uapi/linux/nfs.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/nfs.h b/include/uapi/linux/nfs.h
index 057d22a48416..946cb62d64b0 100644
--- a/include/uapi/linux/nfs.h
+++ b/include/uapi/linux/nfs.h
@@ -12,6 +12,7 @@
 
 #define NFS_PROGRAM	100003
 #define NFS_PORT	2049
+#define NFS_RDMA_PORT	20049
 #define NFS_MAXDATA	8192
 #define NFS_MAXPATHLEN	1024
 #define NFS_MAXNAMLEN	255
-- 
cgit v1.2.3


From 95a332088ecb113c2e8753fa3f1df9b0dda9beec Mon Sep 17 00:00:00 2001
From: William Tu <u9012063@gmail.com>
Date: Fri, 12 Jan 2018 12:29:22 -0800
Subject: Revert "openvswitch: Add erspan tunnel support."

This reverts commit ceaa001a170e43608854d5290a48064f57b565ed.

The OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS attr should be designed
as a nested attribute to support all ERSPAN v1 and v2's fields.
The current attr is a be32 supporting only one field.  Thus, this
patch reverts it and a later patch will redo it using a nested attr.

Signed-off-by: William Tu <u9012063@gmail.com>
Cc: Jiri Benc <jbenc@redhat.com>
Cc: Pravin Shelar <pshelar@ovn.org>
Acked-by: Jiri Benc <jbenc@redhat.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/openvswitch.h |  1 -
 net/openvswitch/flow_netlink.c   | 51 +---------------------------------------
 2 files changed, 1 insertion(+), 51 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index 4265d7f9e1f2..dcfab5e3b55c 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -363,7 +363,6 @@ enum ovs_tunnel_key_attr {
 	OVS_TUNNEL_KEY_ATTR_IPV6_SRC,		/* struct in6_addr src IPv6 address. */
 	OVS_TUNNEL_KEY_ATTR_IPV6_DST,		/* struct in6_addr dst IPv6 address. */
 	OVS_TUNNEL_KEY_ATTR_PAD,
-	OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS,	/* be32 ERSPAN index. */
 	__OVS_TUNNEL_KEY_ATTR_MAX
 };
 
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index 624ea74353dd..f143908b651d 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -49,7 +49,6 @@
 #include <net/mpls.h>
 #include <net/vxlan.h>
 #include <net/tun_proto.h>
-#include <net/erspan.h>
 
 #include "flow_netlink.h"
 
@@ -334,8 +333,7 @@ size_t ovs_tun_key_attr_size(void)
 		 * OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS and covered by it.
 		 */
 		+ nla_total_size(2)    /* OVS_TUNNEL_KEY_ATTR_TP_SRC */
-		+ nla_total_size(2)    /* OVS_TUNNEL_KEY_ATTR_TP_DST */
-		+ nla_total_size(4);   /* OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS */
+		+ nla_total_size(2);   /* OVS_TUNNEL_KEY_ATTR_TP_DST */
 }
 
 static size_t ovs_nsh_key_attr_size(void)
@@ -402,7 +400,6 @@ static const struct ovs_len_tbl ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1]
 						.next = ovs_vxlan_ext_key_lens },
 	[OVS_TUNNEL_KEY_ATTR_IPV6_SRC]      = { .len = sizeof(struct in6_addr) },
 	[OVS_TUNNEL_KEY_ATTR_IPV6_DST]      = { .len = sizeof(struct in6_addr) },
-	[OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS]   = { .len = sizeof(u32) },
 };
 
 static const struct ovs_len_tbl
@@ -634,33 +631,6 @@ static int vxlan_tun_opt_from_nlattr(const struct nlattr *attr,
 	return 0;
 }
 
-static int erspan_tun_opt_from_nlattr(const struct nlattr *attr,
-				      struct sw_flow_match *match, bool is_mask,
-				      bool log)
-{
-	unsigned long opt_key_offset;
-	struct erspan_metadata opts;
-
-	BUILD_BUG_ON(sizeof(opts) > sizeof(match->key->tun_opts));
-
-	memset(&opts, 0, sizeof(opts));
-	opts.index = nla_get_be32(attr);
-
-	/* Index has only 20-bit */
-	if (ntohl(opts.index) & ~INDEX_MASK) {
-		OVS_NLERR(log, "ERSPAN index number %x too large.",
-			  ntohl(opts.index));
-		return -EINVAL;
-	}
-
-	SW_FLOW_KEY_PUT(match, tun_opts_len, sizeof(opts), is_mask);
-	opt_key_offset = TUN_METADATA_OFFSET(sizeof(opts));
-	SW_FLOW_KEY_MEMCPY_OFFSET(match, opt_key_offset, &opts, sizeof(opts),
-				  is_mask);
-
-	return 0;
-}
-
 static int ip_tun_from_nlattr(const struct nlattr *attr,
 			      struct sw_flow_match *match, bool is_mask,
 			      bool log)
@@ -768,19 +738,6 @@ static int ip_tun_from_nlattr(const struct nlattr *attr,
 			break;
 		case OVS_TUNNEL_KEY_ATTR_PAD:
 			break;
-		case OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS:
-			if (opts_type) {
-				OVS_NLERR(log, "Multiple metadata blocks provided");
-				return -EINVAL;
-			}
-
-			err = erspan_tun_opt_from_nlattr(a, match, is_mask, log);
-			if (err)
-				return err;
-
-			tun_flags |= TUNNEL_ERSPAN_OPT;
-			opts_type = type;
-			break;
 		default:
 			OVS_NLERR(log, "Unknown IP tunnel attribute %d",
 				  type);
@@ -905,10 +862,6 @@ static int __ip_tun_to_nlattr(struct sk_buff *skb,
 		else if (output->tun_flags & TUNNEL_VXLAN_OPT &&
 			 vxlan_opt_to_nlattr(skb, tun_opts, swkey_tun_opts_len))
 			return -EMSGSIZE;
-		else if (output->tun_flags & TUNNEL_ERSPAN_OPT &&
-			 nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS,
-				      ((struct erspan_metadata *)tun_opts)->index))
-			return -EMSGSIZE;
 	}
 
 	return 0;
@@ -2533,8 +2486,6 @@ static int validate_and_copy_set_tun(const struct nlattr *attr,
 			break;
 		case OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS:
 			break;
-		case OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS:
-			break;
 		}
 	};
 
-- 
cgit v1.2.3


From 2290aefa2e90a43af8555ad6431d49de43259aa3 Mon Sep 17 00:00:00 2001
From: Franklin S Cooper Jr <fcooper@ti.com>
Date: Wed, 10 Jan 2018 16:25:18 +0530
Subject: can: dev: Add support for limiting configured bitrate

Various CAN or CAN-FD IP may be able to run at a faster rate than
the transceiver the CAN node is connected to supports. This can lead
to unexpected errors. However, CAN transceivers typically have fixed
limitations and provide no means to discover these limitations at
runtime. Therefore, add support for a can-transceiver node that
can be reused by other CAN peripheral drivers to determine, for both
CAN and CAN-FD, the max bitrate that can be used. If the user tries
to configure CAN to exceed this maximum bitrate, an error is
returned.

Also add support for reading bitrate_max via the netlink interface.

Reviewed-by: Suman Anna <s-anna@ti.com>
Signed-off-by: Franklin S Cooper Jr <fcooper@ti.com>
[nsekhar@ti.com: fix build error with !CONFIG_OF]
Signed-off-by: Sekhar Nori <nsekhar@ti.com>
Signed-off-by: Faiz Abbas <faiz_abbas@ti.com>
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 drivers/net/can/dev.c            | 45 +++++++++++++++++++++++++++++++++++++++-
 include/linux/can/dev.h          |  7 +++++++
 include/uapi/linux/can/netlink.h |  1 +
 3 files changed, 52 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/net/can/dev.c b/drivers/net/can/dev.c
index 365a8cc62405..cc94604b23e0 100644
--- a/drivers/net/can/dev.c
+++ b/drivers/net/can/dev.c
@@ -27,6 +27,7 @@
 #include <linux/can/skb.h>
 #include <linux/can/netlink.h>
 #include <linux/can/led.h>
+#include <linux/of.h>
 #include <net/rtnetlink.h>
 
 #define MOD_DESC "CAN device driver interface"
@@ -814,6 +815,29 @@ int open_candev(struct net_device *dev)
 }
 EXPORT_SYMBOL_GPL(open_candev);
 
+#ifdef CONFIG_OF
+/* Common function that can be used to understand the limitation of
+ * a transceiver when it provides no means to determine these limitations
+ * at runtime.
+ */
+void of_can_transceiver(struct net_device *dev)
+{
+	struct device_node *dn;
+	struct can_priv *priv = netdev_priv(dev);
+	struct device_node *np = dev->dev.parent->of_node;
+	int ret;
+
+	dn = of_get_child_by_name(np, "can-transceiver");
+	if (!dn)
+		return;
+
+	ret = of_property_read_u32(dn, "max-bitrate", &priv->bitrate_max);
+	if ((ret && ret != -EINVAL) || (!ret && !priv->bitrate_max))
+		netdev_warn(dev, "Invalid value for transceiver max bitrate. Ignoring bitrate limit.\n");
+}
+EXPORT_SYMBOL_GPL(of_can_transceiver);
+#endif
+
 /*
  * Common close function for cleanup before the device gets closed.
  *
@@ -913,6 +937,13 @@ static int can_changelink(struct net_device *dev, struct nlattr *tb[],
 					priv->bitrate_const_cnt);
 		if (err)
 			return err;
+
+		if (priv->bitrate_max && bt.bitrate > priv->bitrate_max) {
+			netdev_err(dev, "arbitration bitrate surpasses transceiver capabilities of %d bps\n",
+				   priv->bitrate_max);
+			return -EINVAL;
+		}
+
 		memcpy(&priv->bittiming, &bt, sizeof(bt));
 
 		if (priv->do_set_bittiming) {
@@ -997,6 +1028,13 @@ static int can_changelink(struct net_device *dev, struct nlattr *tb[],
 					priv->data_bitrate_const_cnt);
 		if (err)
 			return err;
+
+		if (priv->bitrate_max && dbt.bitrate > priv->bitrate_max) {
+			netdev_err(dev, "canfd data bitrate surpasses transceiver capabilities of %d bps\n",
+				   priv->bitrate_max);
+			return -EINVAL;
+		}
+
 		memcpy(&priv->data_bittiming, &dbt, sizeof(dbt));
 
 		if (priv->do_set_data_bittiming) {
@@ -1064,6 +1102,7 @@ static size_t can_get_size(const struct net_device *dev)
 	if (priv->data_bitrate_const)				/* IFLA_CAN_DATA_BITRATE_CONST */
 		size += nla_total_size(sizeof(*priv->data_bitrate_const) *
 				       priv->data_bitrate_const_cnt);
+	size += sizeof(priv->bitrate_max);			/* IFLA_CAN_BITRATE_MAX */
 
 	return size;
 }
@@ -1121,7 +1160,11 @@ static int can_fill_info(struct sk_buff *skb, const struct net_device *dev)
 	     nla_put(skb, IFLA_CAN_DATA_BITRATE_CONST,
 		     sizeof(*priv->data_bitrate_const) *
 		     priv->data_bitrate_const_cnt,
-		     priv->data_bitrate_const))
+		     priv->data_bitrate_const)) ||
+
+	    (nla_put(skb, IFLA_CAN_BITRATE_MAX,
+		     sizeof(priv->bitrate_max),
+		     &priv->bitrate_max))
 	    )
 
 		return -EMSGSIZE;
diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h
index 61f1cf2d9f44..055aaf5ed9af 100644
--- a/include/linux/can/dev.h
+++ b/include/linux/can/dev.h
@@ -46,6 +46,7 @@ struct can_priv {
 	unsigned int bitrate_const_cnt;
 	const u32 *data_bitrate_const;
 	unsigned int data_bitrate_const_cnt;
+	u32 bitrate_max;
 	struct can_clock clock;
 
 	enum can_state state;
@@ -166,6 +167,12 @@ void can_put_echo_skb(struct sk_buff *skb, struct net_device *dev,
 unsigned int can_get_echo_skb(struct net_device *dev, unsigned int idx);
 void can_free_echo_skb(struct net_device *dev, unsigned int idx);
 
+#ifdef CONFIG_OF
+void of_can_transceiver(struct net_device *dev);
+#else
+static inline void of_can_transceiver(struct net_device *dev) { }
+#endif
+
 struct sk_buff *alloc_can_skb(struct net_device *dev, struct can_frame **cf);
 struct sk_buff *alloc_canfd_skb(struct net_device *dev,
 				struct canfd_frame **cfd);
diff --git a/include/uapi/linux/can/netlink.h b/include/uapi/linux/can/netlink.h
index 96710e76d5ce..9f56fad4785b 100644
--- a/include/uapi/linux/can/netlink.h
+++ b/include/uapi/linux/can/netlink.h
@@ -132,6 +132,7 @@ enum {
 	IFLA_CAN_TERMINATION_CONST,
 	IFLA_CAN_BITRATE_CONST,
 	IFLA_CAN_DATA_BITRATE_CONST,
+	IFLA_CAN_BITRATE_MAX,
 	__IFLA_CAN_MAX
 };
 
-- 
cgit v1.2.3


From 1ff2775a32ef105d9bdbb5f00f20293244a2accc Mon Sep 17 00:00:00 2001
From: Finn Thain <fthain@telegraphics.com.au>
Date: Sat, 13 Jan 2018 17:37:13 -0500
Subject: nubus: Fix up header split

Due to the '#ifdef __KERNEL__' being located in the wrong place, some
definitions from the kernel API were placed in the UAPI header during
the scripted header split. Fix this. Also, remove the duplicate comment
which is only relevant to the UAPI header.

Fixes: 607ca46e97a1 ("UAPI: (Scripted) Disintegrate include/linux")
Tested-by: Stan Johnson <userm57@yahoo.com>
Signed-off-by: Finn Thain <fthain@telegraphics.com.au>
Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
---
 include/linux/nubus.h      | 27 +++++++++++++++++++++++----
 include/uapi/linux/nubus.h | 23 -----------------------
 2 files changed, 23 insertions(+), 27 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/nubus.h b/include/linux/nubus.h
index d8d63370a28c..55b9a4569a69 100644
--- a/include/linux/nubus.h
+++ b/include/linux/nubus.h
@@ -5,16 +5,28 @@
   Originally written by Alan Cox.
 
   Hacked to death by C. Scott Ananian and David Huggins-Daines.
-  
-  Some of the constants in here are from the corresponding
-  NetBSD/OpenBSD header file, by Allen Briggs.  We figured out the
-  rest of them on our own. */
+*/
+
 #ifndef LINUX_NUBUS_H
 #define LINUX_NUBUS_H
 
 #include <asm/nubus.h>
 #include <uapi/linux/nubus.h>
 
+struct nubus_dir {
+	unsigned char *base;
+	unsigned char *ptr;
+	int done;
+	int mask;
+};
+
+struct nubus_dirent {
+	unsigned char *base;
+	unsigned char type;
+	__u32 data;	/* Actually 24 bits used */
+	int mask;
+};
+
 struct nubus_board {
 	struct nubus_board* next;
 	struct nubus_dev* first_dev;
@@ -130,4 +142,11 @@ void nubus_get_rsrc_mem(void *dest, const struct nubus_dirent *dirent,
 			unsigned int len);
 void nubus_get_rsrc_str(char *dest, const struct nubus_dirent *dirent,
 			unsigned int maxlen);
+
+/* Returns a pointer to the "standard" slot space. */
+static inline void *nubus_slot_addr(int slot)
+{
+	return (void *)(0xF0000000 | (slot << 24));
+}
+
 #endif /* LINUX_NUBUS_H */
diff --git a/include/uapi/linux/nubus.h b/include/uapi/linux/nubus.h
index f3776cc80f4d..48031e7858f1 100644
--- a/include/uapi/linux/nubus.h
+++ b/include/uapi/linux/nubus.h
@@ -221,27 +221,4 @@ enum nubus_display_res_id {
 	NUBUS_RESID_SIXTHMODE   = 0x0085
 };
 
-struct nubus_dir
-{
-	unsigned char *base;
-	unsigned char *ptr;
-	int done;
-	int mask;
-};
-
-struct nubus_dirent
-{
-	unsigned char *base;
-	unsigned char type;
-	__u32 data;	/* Actually 24bits used */
-	int mask;
-};
-
-
-/* We'd like to get rid of this eventually.  Only daynaport.c uses it now. */
-static inline void *nubus_slot_addr(int slot)
-{
-	return (void *)(0xF0000000|(slot<<24));
-}
-
 #endif /* _UAPILINUX_NUBUS_H */
-- 
cgit v1.2.3


From d9f9b9a4d05fab693fd23a9ecaa330e03ebe2c31 Mon Sep 17 00:00:00 2001
From: Arkadi Sharshevsky <arkadis@mellanox.com>
Date: Mon, 15 Jan 2018 08:59:03 +0100
Subject: devlink: Add support for resource abstraction

Add support for hardware resource abstraction over devlink. Each resource
is identified via id, furthermore it contains information regarding its
size and its related sub resources. Each resource can also provide its
current occupancy.

In some cases the sizes of some resources can be changed, yet for those
changes to take place a hot driver reload may be needed. The reload
capability will be introduced in the next patch.

Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/devlink.h        |  96 +++++++++++
 include/uapi/linux/devlink.h |  18 +++
 net/core/devlink.c           | 374 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 488 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index 4d2c6fc94837..ceb1895d119b 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -26,6 +26,7 @@ struct devlink {
 	struct list_head port_list;
 	struct list_head sb_list;
 	struct list_head dpipe_table_list;
+	struct list_head resource_list;
 	struct devlink_dpipe_headers *dpipe_headers;
 	const struct devlink_ops *ops;
 	struct device *dev;
@@ -224,6 +225,61 @@ struct devlink_dpipe_headers {
 	unsigned int headers_count;
 };
 
+/**
+ * struct devlink_resource_ops - resource ops
+ * @occ_get: get the occupied size
+ * @size_validate: validate the size of the resource before update, reload
+ *                 is needed for changes to take place
+ */
+struct devlink_resource_ops {
+	u64 (*occ_get)(struct devlink *devlink);
+	int (*size_validate)(struct devlink *devlink, u64 size,
+			     struct netlink_ext_ack *extack);
+};
+
+/**
+ * struct devlink_resource_size_params - resource's size parameters
+ * @size_min: minimum size which can be set
+ * @size_max: maximum size which can be set
+ * @size_granularity: size granularity
+ * @unit: resource's basic unit
+ */
+struct devlink_resource_size_params {
+	u64 size_min;
+	u64 size_max;
+	u64 size_granularity;
+	enum devlink_resource_unit unit;
+};
+
+/**
+ * struct devlink_resource - devlink resource
+ * @name: name of the resource
+ * @id: id, per devlink instance
+ * @size: size of the resource
+ * @size_new: updated size of the resource, reload is needed
+ * @size_valid: valid in case the total size of the resource is valid
+ *              including its children
+ * @parent: parent resource
+ * @size_params: size parameters
+ * @list: parent list
+ * @resource_list: list of child resources
+ * @resource_ops: resource ops
+ */
+struct devlink_resource {
+	const char *name;
+	u64 id;
+	u64 size;
+	u64 size_new;
+	bool size_valid;
+	struct devlink_resource *parent;
+	struct devlink_resource_size_params *size_params;
+	struct list_head list;
+	struct list_head resource_list;
+	const struct devlink_resource_ops *resource_ops;
+};
+
+#define DEVLINK_RESOURCE_ID_PARENT_TOP 0
+
 struct devlink_ops {
 	int (*port_type_set)(struct devlink_port *devlink_port,
 			     enum devlink_port_type port_type);
@@ -333,6 +389,20 @@ extern struct devlink_dpipe_header devlink_dpipe_header_ethernet;
 extern struct devlink_dpipe_header devlink_dpipe_header_ipv4;
 extern struct devlink_dpipe_header devlink_dpipe_header_ipv6;
 
+int devlink_resource_register(struct devlink *devlink,
+			      const char *resource_name,
+			      bool top_hierarchy,
+			      u64 resource_size,
+			      u64 resource_id,
+			      u64 parent_resource_id,
+			      struct devlink_resource_size_params *size_params,
+			      const struct devlink_resource_ops *resource_ops);
+void devlink_resources_unregister(struct devlink *devlink,
+				  struct devlink_resource *resource);
+int devlink_resource_size_get(struct devlink *devlink,
+			      u64 resource_id,
+			      u64 *p_resource_size);
+
 #else
 
 static inline struct devlink *devlink_alloc(const struct devlink_ops *ops,
@@ -469,6 +539,32 @@ devlink_dpipe_match_put(struct sk_buff *skb,
 	return 0;
 }
 
+static inline int
+devlink_resource_register(struct devlink *devlink,
+			  const char *resource_name,
+			  bool top_hierarchy,
+			  u64 resource_size,
+			  u64 resource_id,
+			  u64 parent_resource_id,
+			  struct devlink_resource_size_params *size_params,
+			  const struct devlink_resource_ops *resource_ops)
+{
+	return 0;
+}
+
+static inline void
+devlink_resources_unregister(struct devlink *devlink,
+			     struct devlink_resource *resource)
+{
+}
+
+static inline int
+devlink_resource_size_get(struct devlink *devlink, u64 resource_id,
+			  u64 *p_resource_size)
+{
+	return -EOPNOTSUPP;
+}
+
 #endif
 
 #endif /* _NET_DEVLINK_H_ */
diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index 6665df69e26a..f89950443e17 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -70,6 +70,8 @@ enum devlink_command {
 	DEVLINK_CMD_DPIPE_ENTRIES_GET,
 	DEVLINK_CMD_DPIPE_HEADERS_GET,
 	DEVLINK_CMD_DPIPE_TABLE_COUNTERS_SET,
+	DEVLINK_CMD_RESOURCE_SET,
+	DEVLINK_CMD_RESOURCE_DUMP,
 
 	/* add new commands above here */
 	__DEVLINK_CMD_MAX,
@@ -202,6 +204,18 @@ enum devlink_attr {
 	DEVLINK_ATTR_PAD,
 
 	DEVLINK_ATTR_ESWITCH_ENCAP_MODE,	/* u8 */
+	DEVLINK_ATTR_RESOURCE_LIST,		/* nested */
+	DEVLINK_ATTR_RESOURCE,			/* nested */
+	DEVLINK_ATTR_RESOURCE_NAME,		/* string */
+	DEVLINK_ATTR_RESOURCE_ID,		/* u64 */
+	DEVLINK_ATTR_RESOURCE_SIZE,		/* u64 */
+	DEVLINK_ATTR_RESOURCE_SIZE_NEW,		/* u64 */
+	DEVLINK_ATTR_RESOURCE_SIZE_VALID,	/* u8 */
+	DEVLINK_ATTR_RESOURCE_SIZE_MIN,		/* u64 */
+	DEVLINK_ATTR_RESOURCE_SIZE_MAX,		/* u64 */
+	DEVLINK_ATTR_RESOURCE_SIZE_GRAN,        /* u64 */
+	DEVLINK_ATTR_RESOURCE_UNIT,		/* u8 */
+	DEVLINK_ATTR_RESOURCE_OCC,		/* u64 */
 
 	/* add new attributes above here, update the policy in devlink.c */
 
@@ -245,4 +259,8 @@ enum devlink_dpipe_header_id {
 	DEVLINK_DPIPE_HEADER_IPV6,
 };
 
+enum devlink_resource_unit {
+	DEVLINK_RESOURCE_UNIT_ENTRY,
+};
+
 #endif /* _UAPI_LINUX_DEVLINK_H_ */
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 2f71734c4ff6..89b3704fa450 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -2288,6 +2288,233 @@ static int devlink_nl_cmd_dpipe_table_counters_set(struct sk_buff *skb,
 						counters_enable);
 }
 
+struct devlink_resource *
+devlink_resource_find(struct devlink *devlink,
+		      struct devlink_resource *resource, u64 resource_id)
+{
+	struct list_head *resource_list;
+
+	if (resource)
+		resource_list = &resource->resource_list;
+	else
+		resource_list = &devlink->resource_list;
+
+	list_for_each_entry(resource, resource_list, list) {
+		struct devlink_resource *child_resource;
+
+		if (resource->id == resource_id)
+			return resource;
+
+		child_resource = devlink_resource_find(devlink, resource,
+						       resource_id);
+		if (child_resource)
+			return child_resource;
+	}
+	return NULL;
+}
+
+void devlink_resource_validate_children(struct devlink_resource *resource)
+{
+	struct devlink_resource *child_resource;
+	bool size_valid = true;
+	u64 parts_size = 0;
+
+	if (list_empty(&resource->resource_list))
+		goto out;
+
+	list_for_each_entry(child_resource, &resource->resource_list, list)
+		parts_size += child_resource->size_new;
+
+	if (parts_size > resource->size)
+		size_valid = false;
+out:
+	resource->size_valid = size_valid;
+}
+
+static int devlink_nl_cmd_resource_set(struct sk_buff *skb,
+				       struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	struct devlink_resource *resource;
+	u64 resource_id;
+	u64 size;
+	int err;
+
+	if (!info->attrs[DEVLINK_ATTR_RESOURCE_ID] ||
+	    !info->attrs[DEVLINK_ATTR_RESOURCE_SIZE])
+		return -EINVAL;
+	resource_id = nla_get_u64(info->attrs[DEVLINK_ATTR_RESOURCE_ID]);
+
+	resource = devlink_resource_find(devlink, NULL, resource_id);
+	if (!resource)
+		return -EINVAL;
+
+	if (!resource->resource_ops->size_validate)
+		return -EINVAL;
+
+	size = nla_get_u64(info->attrs[DEVLINK_ATTR_RESOURCE_SIZE]);
+	err = resource->resource_ops->size_validate(devlink, size,
+						    info->extack);
+	if (err)
+		return err;
+
+	resource->size_new = size;
+	devlink_resource_validate_children(resource);
+	if (resource->parent)
+		devlink_resource_validate_children(resource->parent);
+	return 0;
+}
+
+static void
+devlink_resource_size_params_put(struct devlink_resource *resource,
+				 struct sk_buff *skb)
+{
+	struct devlink_resource_size_params *size_params;
+
+	size_params = resource->size_params;
+	nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_GRAN,
+			  size_params->size_granularity, DEVLINK_ATTR_PAD);
+	nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_MAX,
+			  size_params->size_max, DEVLINK_ATTR_PAD);
+	nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_MIN,
+			  size_params->size_min, DEVLINK_ATTR_PAD);
+	nla_put_u8(skb, DEVLINK_ATTR_RESOURCE_UNIT, size_params->unit);
+}
+
+static int devlink_resource_put(struct devlink *devlink, struct sk_buff *skb,
+				struct devlink_resource *resource)
+{
+	struct devlink_resource *child_resource;
+	struct nlattr *child_resource_attr;
+	struct nlattr *resource_attr;
+
+	resource_attr = nla_nest_start(skb, DEVLINK_ATTR_RESOURCE);
+	if (!resource_attr)
+		return -EMSGSIZE;
+
+	if (nla_put_string(skb, DEVLINK_ATTR_RESOURCE_NAME, resource->name) ||
+	    nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE, resource->size,
+			      DEVLINK_ATTR_PAD) ||
+	    nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_ID, resource->id,
+			      DEVLINK_ATTR_PAD))
+		goto nla_put_failure;
+	if (resource->size != resource->size_new)
+		nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_NEW,
+				  resource->size_new, DEVLINK_ATTR_PAD);
+	if (resource->resource_ops && resource->resource_ops->occ_get)
+		nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_OCC,
+				  resource->resource_ops->occ_get(devlink),
+				  DEVLINK_ATTR_PAD);
+	devlink_resource_size_params_put(resource, skb);
+	if (list_empty(&resource->resource_list))
+		goto out;
+
+	if (nla_put_u8(skb, DEVLINK_ATTR_RESOURCE_SIZE_VALID,
+		       resource->size_valid))
+		goto nla_put_failure;
+
+	child_resource_attr = nla_nest_start(skb, DEVLINK_ATTR_RESOURCE_LIST);
+	if (!child_resource_attr)
+		goto nla_put_failure;
+
+	list_for_each_entry(child_resource, &resource->resource_list, list) {
+		if (devlink_resource_put(devlink, skb, child_resource))
+			goto resource_put_failure;
+	}
+
+	nla_nest_end(skb, child_resource_attr);
+out:
+	nla_nest_end(skb, resource_attr);
+	return 0;
+
+resource_put_failure:
+	nla_nest_cancel(skb, child_resource_attr);
+nla_put_failure:
+	nla_nest_cancel(skb, resource_attr);
+	return -EMSGSIZE;
+}
+
+static int devlink_resource_fill(struct genl_info *info,
+				 enum devlink_command cmd, int flags)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	struct devlink_resource *resource;
+	struct nlattr *resources_attr;
+	struct sk_buff *skb = NULL;
+	struct nlmsghdr *nlh;
+	bool incomplete;
+	void *hdr;
+	int i;
+	int err;
+
+	resource = list_first_entry(&devlink->resource_list,
+				    struct devlink_resource, list);
+start_again:
+	err = devlink_dpipe_send_and_alloc_skb(&skb, info);
+	if (err)
+		return err;
+
+	hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq,
+			  &devlink_nl_family, NLM_F_MULTI, cmd);
+	if (!hdr) {
+		nlmsg_free(skb);
+		return -EMSGSIZE;
+	}
+
+	if (devlink_nl_put_handle(skb, devlink))
+		goto nla_put_failure;
+
+	resources_attr = nla_nest_start(skb, DEVLINK_ATTR_RESOURCE_LIST);
+	if (!resources_attr)
+		goto nla_put_failure;
+
+	incomplete = false;
+	i = 0;
+	list_for_each_entry_from(resource, &devlink->resource_list, list) {
+		err = devlink_resource_put(devlink, skb, resource);
+		if (err) {
+			if (!i)
+				goto err_resource_put;
+			incomplete = true;
+			break;
+		}
+		i++;
+	}
+	nla_nest_end(skb, resources_attr);
+	genlmsg_end(skb, hdr);
+	if (incomplete)
+		goto start_again;
+send_done:
+	nlh = nlmsg_put(skb, info->snd_portid, info->snd_seq,
+			NLMSG_DONE, 0, flags | NLM_F_MULTI);
+	if (!nlh) {
+		err = devlink_dpipe_send_and_alloc_skb(&skb, info);
+		if (err)
+			goto err_skb_send_alloc;
+		goto send_done;
+	}
+	return genlmsg_reply(skb, info);
+
+nla_put_failure:
+	err = -EMSGSIZE;
+err_resource_put:
+err_skb_send_alloc:
+	genlmsg_cancel(skb, hdr);
+	nlmsg_free(skb);
+	return err;
+}
+
+static int devlink_nl_cmd_resource_dump(struct sk_buff *skb,
+					struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+
+	if (list_empty(&devlink->resource_list))
+		return -EOPNOTSUPP;
+
+	return devlink_resource_fill(info, DEVLINK_CMD_RESOURCE_DUMP, 0);
+}
+
 static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
 	[DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING },
 	[DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING },
@@ -2306,6 +2533,8 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
 	[DEVLINK_ATTR_ESWITCH_ENCAP_MODE] = { .type = NLA_U8 },
 	[DEVLINK_ATTR_DPIPE_TABLE_NAME] = { .type = NLA_NUL_STRING },
 	[DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED] = { .type = NLA_U8 },
+	[DEVLINK_ATTR_RESOURCE_ID] = { .type = NLA_U64},
+	[DEVLINK_ATTR_RESOURCE_SIZE] = { .type = NLA_U64},
 };
 
 static const struct genl_ops devlink_nl_ops[] = {
@@ -2466,6 +2695,20 @@ static const struct genl_ops devlink_nl_ops[] = {
 		.flags = GENL_ADMIN_PERM,
 		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
 	},
+	{
+		.cmd = DEVLINK_CMD_RESOURCE_SET,
+		.doit = devlink_nl_cmd_resource_set,
+		.policy = devlink_nl_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+	},
+	{
+		.cmd = DEVLINK_CMD_RESOURCE_DUMP,
+		.doit = devlink_nl_cmd_resource_dump,
+		.policy = devlink_nl_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+	},
 };
 
 static struct genl_family devlink_nl_family __ro_after_init = {
@@ -2503,6 +2746,7 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size)
 	INIT_LIST_HEAD(&devlink->port_list);
 	INIT_LIST_HEAD(&devlink->sb_list);
 	INIT_LIST_HEAD_RCU(&devlink->dpipe_table_list);
+	INIT_LIST_HEAD(&devlink->resource_list);
 	mutex_init(&devlink->lock);
 	return devlink;
 }
@@ -2833,6 +3077,136 @@ unlock:
 }
 EXPORT_SYMBOL_GPL(devlink_dpipe_table_unregister);
 
+/**
+ *	devlink_resource_register - devlink resource register
+ *
+ *	@devlink: devlink
+ *	@resource_name: resource's name
+ *	@top_hierarchy: top hierarchy
+ *	@reload_required: reload is required for new configuration to
+ *			  apply
+ *	@resource_size: resource's size
+ *	@resource_id: resource's id
+ *	@parent_resource_id: resource's parent id
+ *	@size_params: size parameters
+ *	@resource_ops: resource ops
+ */
+int devlink_resource_register(struct devlink *devlink,
+			      const char *resource_name,
+			      bool top_hierarchy,
+			      u64 resource_size,
+			      u64 resource_id,
+			      u64 parent_resource_id,
+			      struct devlink_resource_size_params *size_params,
+			      const struct devlink_resource_ops *resource_ops)
+{
+	struct devlink_resource *resource;
+	struct list_head *resource_list;
+	int err = 0;
+
+	mutex_lock(&devlink->lock);
+	resource = devlink_resource_find(devlink, NULL, resource_id);
+	if (resource) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	resource = kzalloc(sizeof(*resource), GFP_KERNEL);
+	if (!resource) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	if (top_hierarchy) {
+		resource_list = &devlink->resource_list;
+	} else {
+		struct devlink_resource *parent_resource;
+
+		parent_resource = devlink_resource_find(devlink, NULL,
+							parent_resource_id);
+		if (parent_resource) {
+			resource_list = &parent_resource->resource_list;
+			resource->parent = parent_resource;
+		} else {
+			err = -EINVAL;
+			goto out;
+		}
+	}
+
+	resource->name = resource_name;
+	resource->size = resource_size;
+	resource->size_new = resource_size;
+	resource->id = resource_id;
+	resource->resource_ops = resource_ops;
+	resource->size_valid = true;
+	resource->size_params = size_params;
+	INIT_LIST_HEAD(&resource->resource_list);
+	list_add_tail(&resource->list, resource_list);
+out:
+	mutex_unlock(&devlink->lock);
+	return err;
+}
+EXPORT_SYMBOL_GPL(devlink_resource_register);
+
+/**
+ *	devlink_resources_unregister - free all resources
+ *
+ *	@devlink: devlink
+ *	@resource: resource
+ */
+void devlink_resources_unregister(struct devlink *devlink,
+				  struct devlink_resource *resource)
+{
+	struct devlink_resource *tmp, *child_resource;
+	struct list_head *resource_list;
+
+	if (resource)
+		resource_list = &resource->resource_list;
+	else
+		resource_list = &devlink->resource_list;
+
+	if (!resource)
+		mutex_lock(&devlink->lock);
+
+	list_for_each_entry_safe(child_resource, tmp, resource_list, list) {
+		devlink_resources_unregister(devlink, child_resource);
+		list_del(&child_resource->list);
+		kfree(child_resource);
+	}
+
+	if (!resource)
+		mutex_unlock(&devlink->lock);
+}
+EXPORT_SYMBOL_GPL(devlink_resources_unregister);
+
+/**
+ *	devlink_resource_size_get - get and update size
+ *
+ *	@devlink: devlink
+ *	@resource_id: the requested resource id
+ *	@p_resource_size: ptr to update
+ */
+int devlink_resource_size_get(struct devlink *devlink,
+			      u64 resource_id,
+			      u64 *p_resource_size)
+{
+	struct devlink_resource *resource;
+	int err = 0;
+
+	mutex_lock(&devlink->lock);
+	resource = devlink_resource_find(devlink, NULL, resource_id);
+	if (!resource) {
+		err = -EINVAL;
+		goto out;
+	}
+	*p_resource_size = resource->size_new;
+	resource->size = resource->size_new;
+out:
+	mutex_unlock(&devlink->lock);
+	return err;
+}
+EXPORT_SYMBOL_GPL(devlink_resource_size_get);
+
 static int __init devlink_module_init(void)
 {
 	return genl_register_family(&devlink_nl_family);
-- 
cgit v1.2.3


From 2d8dc5bbf4e7603747875eb5cadcd67c1fa8b1bb Mon Sep 17 00:00:00 2001
From: Arkadi Sharshevsky <arkadis@mellanox.com>
Date: Mon, 15 Jan 2018 08:59:04 +0100
Subject: devlink: Add support for reload

Add support for performing driver hot reload.

Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/devlink.h        |  1 +
 include/uapi/linux/devlink.h |  5 +++++
 net/core/devlink.c           | 47 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 53 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index ceb1895d119b..c698883fb0bb 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -281,6 +281,7 @@ struct devlink_resource {
 #define DEVLINK_RESOURCE_ID_PARENT_TOP 0
 
 struct devlink_ops {
+	int (*reload)(struct devlink *devlink);
 	int (*port_type_set)(struct devlink_port *devlink_port,
 			     enum devlink_port_type port_type);
 	int (*port_split)(struct devlink *devlink, unsigned int port_index,
diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index f89950443e17..555ddcaf0be2 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -73,6 +73,11 @@ enum devlink_command {
 	DEVLINK_CMD_RESOURCE_SET,
 	DEVLINK_CMD_RESOURCE_DUMP,
 
+	/* Hot driver reload, makes configuration changes take place. The
+	 * devlink instance is not released during the process.
+	 */
+	DEVLINK_CMD_RELOAD,
+
 	/* add new commands above here */
 	__DEVLINK_CMD_MAX,
 	DEVLINK_CMD_MAX = __DEVLINK_CMD_MAX - 1
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 89b3704fa450..4c3d85560436 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -2515,6 +2515,45 @@ static int devlink_nl_cmd_resource_dump(struct sk_buff *skb,
 	return devlink_resource_fill(info, DEVLINK_CMD_RESOURCE_DUMP, 0);
 }
 
+static int
+devlink_resources_validate(struct devlink *devlink,
+			   struct devlink_resource *resource,
+			   struct genl_info *info)
+{
+	struct list_head *resource_list;
+	int err = 0;
+
+	if (resource)
+		resource_list = &resource->resource_list;
+	else
+		resource_list = &devlink->resource_list;
+
+	list_for_each_entry(resource, resource_list, list) {
+		if (!resource->size_valid)
+			return -EINVAL;
+		err = devlink_resources_validate(devlink, resource, info);
+		if (err)
+			return err;
+	}
+	return err;
+}
+
+static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	int err;
+
+	if (!devlink->ops->reload)
+		return -EOPNOTSUPP;
+
+	err = devlink_resources_validate(devlink, NULL, info);
+	if (err) {
+		NL_SET_ERR_MSG_MOD(info->extack, "resources size validation failed");
+		return err;
+	}
+	return devlink->ops->reload(devlink);
+}
+
 static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
 	[DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING },
 	[DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING },
@@ -2709,6 +2748,14 @@ static const struct genl_ops devlink_nl_ops[] = {
 		.flags = GENL_ADMIN_PERM,
 		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
 	},
+	{
+		.cmd = DEVLINK_CMD_RELOAD,
+		.doit = devlink_nl_cmd_reload,
+		.policy = devlink_nl_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
+				  DEVLINK_NL_FLAG_NO_LOCK,
+	},
 };
 
 static struct genl_family devlink_nl_family __ro_after_init = {
-- 
cgit v1.2.3


From 56dc7cd0a87a1ff4f49ee1e67bd88e768385d51a Mon Sep 17 00:00:00 2001
From: Arkadi Sharshevsky <arkadis@mellanox.com>
Date: Mon, 15 Jan 2018 08:59:05 +0100
Subject: devlink: Add relation between dpipe and resource

The hardware processes which are modeled via dpipe commonly use some
internal hardware resources. Such relation can improve the understanding
of hardware limitations. The number of resource units consumed per
table entry is also provided for each table.

Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/devlink.h        | 17 +++++++++++++++++
 include/uapi/linux/devlink.h |  2 ++
 net/core/devlink.c           | 37 +++++++++++++++++++++++++++++++++++++
 3 files changed, 56 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index c698883fb0bb..6545b03e97f7 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -183,6 +183,9 @@ struct devlink_dpipe_table_ops;
  * @counters_enabled: indicates if counters are active
  * @counter_control_extern: indicates if counter control is in dpipe or
  *			    external tool
+ * @resource_valid: Indicate that the resource id is valid
+ * @resource_id: relative resource this table is related to
+ * @resource_units: number of resource's unit consumed per table's entry
  * @table_ops: table operations
  * @rcu: rcu
  */
@@ -192,6 +195,9 @@ struct devlink_dpipe_table {
 	const char *name;
 	bool counters_enabled;
 	bool counter_control_extern;
+	bool resource_valid;
+	u64 resource_id;
+	u64 resource_units;
 	struct devlink_dpipe_table_ops *table_ops;
 	struct rcu_head rcu;
 };
@@ -403,6 +409,9 @@ void devlink_resources_unregister(struct devlink *devlink,
 int devlink_resource_size_get(struct devlink *devlink,
 			      u64 resource_id,
 			      u64 *p_resource_size);
+int devlink_dpipe_table_resource_set(struct devlink *devlink,
+				     const char *table_name, u64 resource_id,
+				     u64 resource_units);
 
 #else
 
@@ -566,6 +575,14 @@ devlink_resource_size_get(struct devlink *devlink, u64 resource_id,
 	return -EOPNOTSUPP;
 }
 
+static inline int
+devlink_dpipe_table_resource_set(struct devlink *devlink,
+				 const char *table_name, u64 resource_id,
+				 u64 resource_units)
+{
+	return -EOPNOTSUPP;
+}
+
 #endif
 
 #endif /* _NET_DEVLINK_H_ */
diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index 555ddcaf0be2..1df65a4c2044 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -221,6 +221,8 @@ enum devlink_attr {
 	DEVLINK_ATTR_RESOURCE_SIZE_GRAN,        /* u64 */
 	DEVLINK_ATTR_RESOURCE_UNIT,		/* u8 */
 	DEVLINK_ATTR_RESOURCE_OCC,		/* u64 */
+	DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_ID,	/* u64 */
+	DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_UNITS,/* u64 */
 
 	/* add new attributes above here, update the policy in devlink.c */
 
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 4c3d85560436..dd7d6dd07bfb 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -1694,6 +1694,12 @@ static int devlink_dpipe_table_put(struct sk_buff *skb,
 		       table->counters_enabled))
 		goto nla_put_failure;
 
+	if (table->resource_valid) {
+		nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_ID,
+				  table->resource_id, DEVLINK_ATTR_PAD);
+		nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_UNITS,
+				  table->resource_units, DEVLINK_ATTR_PAD);
+	}
 	if (devlink_dpipe_matches_put(table, skb))
 		goto nla_put_failure;
 
@@ -3254,6 +3260,37 @@ out:
 }
 EXPORT_SYMBOL_GPL(devlink_resource_size_get);
 
+/**
+ *	devlink_dpipe_table_resource_set - set the resource id
+ *
+ *	@devlink: devlink
+ *	@table_name: table name
+ *	@resource_id: resource id
+ *	@resource_units: number of resource's units consumed per table's entry
+ */
+int devlink_dpipe_table_resource_set(struct devlink *devlink,
+				     const char *table_name, u64 resource_id,
+				     u64 resource_units)
+{
+	struct devlink_dpipe_table *table;
+	int err = 0;
+
+	mutex_lock(&devlink->lock);
+	table = devlink_dpipe_table_find(&devlink->dpipe_table_list,
+					 table_name);
+	if (!table) {
+		err = -EINVAL;
+		goto out;
+	}
+	table->resource_id = resource_id;
+	table->resource_units = resource_units;
+	table->resource_valid = true;
+out:
+	mutex_unlock(&devlink->lock);
+	return err;
+}
+EXPORT_SYMBOL_GPL(devlink_dpipe_table_resource_set);
+
 static int __init devlink_module_init(void)
 {
 	return genl_register_family(&devlink_nl_family);
-- 
cgit v1.2.3


From 7960d1daf278cbe23bb48974fe6ae6a1e44c3c15 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Wed, 17 Jan 2018 11:46:51 +0100
Subject: net: sched: use block index as a handle instead of qdisc when block
 is shared

As the tcm_ifindex with value TCM_IFINDEX_MAGIC_BLOCK is invalid ifindex,
use it to indicate that we work with block, instead of qdisc.
So if tcm_ifindex is set to TCM_IFINDEX_MAGIC_BLOCK, tcm_parent is used
to carry block_index.

If the block is set to be shared between at least 2 qdiscs, it is
forbidden to use the qdisc handle to add/delete filters. In that case,
userspace has to pass block_index.

Also, for dump of the filters, in case the block is shared between at
least 2 qdiscs, each filter is dumped with tcm_ifindex value
TCM_IFINDEX_MAGIC_BLOCK and tcm_parent set to block_index. That gives
the user clear indication, that the filter belongs to a shared block
and not only to one qdisc under which it is dumped.

Suggested-by: David Ahern <dsahern@gmail.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Acked-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/rtnetlink.h |  10 ++
 net/sched/cls_api.c            | 202 ++++++++++++++++++++++++-----------------
 2 files changed, 128 insertions(+), 84 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index 843e29aa3cac..da878f2e7c39 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -541,9 +541,19 @@ struct tcmsg {
 	int		tcm_ifindex;
 	__u32		tcm_handle;
 	__u32		tcm_parent;
+/* tcm_block_index is used instead of tcm_parent
+ * in case tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK
+ */
+#define tcm_block_index tcm_parent
 	__u32		tcm_info;
 };
 
+/* For manipulation of filters in shared block, tcm_ifindex is set to
+ * TCM_IFINDEX_MAGIC_BLOCK, and tcm_parent is aliased to tcm_block_index
+ * which is the block index.
+ */
+#define TCM_IFINDEX_MAGIC_BLOCK (0xFFFFFFFFU)
+
 enum {
 	TCA_UNSPEC,
 	TCA_KIND,
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 03e2fa092d9e..e500d11da9cd 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -865,8 +865,9 @@ static struct tcf_proto *tcf_chain_tp_find(struct tcf_chain *chain,
 }
 
 static int tcf_fill_node(struct net *net, struct sk_buff *skb,
-			 struct tcf_proto *tp, struct Qdisc *q, u32 parent,
-			 void *fh, u32 portid, u32 seq, u16 flags, int event)
+			 struct tcf_proto *tp, struct tcf_block *block,
+			 struct Qdisc *q, u32 parent, void *fh,
+			 u32 portid, u32 seq, u16 flags, int event)
 {
 	struct tcmsg *tcm;
 	struct nlmsghdr  *nlh;
@@ -879,8 +880,13 @@ static int tcf_fill_node(struct net *net, struct sk_buff *skb,
 	tcm->tcm_family = AF_UNSPEC;
 	tcm->tcm__pad1 = 0;
 	tcm->tcm__pad2 = 0;
-	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
-	tcm->tcm_parent = parent;
+	if (q) {
+		tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
+		tcm->tcm_parent = parent;
+	} else {
+		tcm->tcm_ifindex = TCM_IFINDEX_MAGIC_BLOCK;
+		tcm->tcm_block_index = block->index;
+	}
 	tcm->tcm_info = TC_H_MAKE(tp->prio, tp->protocol);
 	if (nla_put_string(skb, TCA_KIND, tp->ops->kind))
 		goto nla_put_failure;
@@ -903,8 +909,8 @@ nla_put_failure:
 
 static int tfilter_notify(struct net *net, struct sk_buff *oskb,
 			  struct nlmsghdr *n, struct tcf_proto *tp,
-			  struct Qdisc *q, u32 parent,
-			  void *fh, int event, bool unicast)
+			  struct tcf_block *block, struct Qdisc *q,
+			  u32 parent, void *fh, int event, bool unicast)
 {
 	struct sk_buff *skb;
 	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
@@ -913,8 +919,8 @@ static int tfilter_notify(struct net *net, struct sk_buff *oskb,
 	if (!skb)
 		return -ENOBUFS;
 
-	if (tcf_fill_node(net, skb, tp, q, parent, fh, portid, n->nlmsg_seq,
-			  n->nlmsg_flags, event) <= 0) {
+	if (tcf_fill_node(net, skb, tp, block, q, parent, fh, portid,
+			  n->nlmsg_seq, n->nlmsg_flags, event) <= 0) {
 		kfree_skb(skb);
 		return -EINVAL;
 	}
@@ -928,8 +934,8 @@ static int tfilter_notify(struct net *net, struct sk_buff *oskb,
 
 static int tfilter_del_notify(struct net *net, struct sk_buff *oskb,
 			      struct nlmsghdr *n, struct tcf_proto *tp,
-			      struct Qdisc *q, u32 parent,
-			      void *fh, bool unicast, bool *last)
+			      struct tcf_block *block, struct Qdisc *q,
+			      u32 parent, void *fh, bool unicast, bool *last)
 {
 	struct sk_buff *skb;
 	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
@@ -939,8 +945,8 @@ static int tfilter_del_notify(struct net *net, struct sk_buff *oskb,
 	if (!skb)
 		return -ENOBUFS;
 
-	if (tcf_fill_node(net, skb, tp, q, parent, fh, portid, n->nlmsg_seq,
-			  n->nlmsg_flags, RTM_DELTFILTER) <= 0) {
+	if (tcf_fill_node(net, skb, tp, block, q, parent, fh, portid,
+			  n->nlmsg_seq, n->nlmsg_flags, RTM_DELTFILTER) <= 0) {
 		kfree_skb(skb);
 		return -EINVAL;
 	}
@@ -959,15 +965,16 @@ static int tfilter_del_notify(struct net *net, struct sk_buff *oskb,
 }
 
 static void tfilter_notify_chain(struct net *net, struct sk_buff *oskb,
-				 struct Qdisc *q, u32 parent,
-				 struct nlmsghdr *n,
+				 struct tcf_block *block, struct Qdisc *q,
+				 u32 parent, struct nlmsghdr *n,
 				 struct tcf_chain *chain, int event)
 {
 	struct tcf_proto *tp;
 
 	for (tp = rtnl_dereference(chain->filter_chain);
 	     tp; tp = rtnl_dereference(tp->next))
-		tfilter_notify(net, oskb, n, tp, q, parent, 0, event, false);
+		tfilter_notify(net, oskb, n, tp, block,
+			       q, parent, 0, event, false);
 }
 
 /* Add/change/delete/get a filter node */
@@ -983,13 +990,11 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
 	bool prio_allocate;
 	u32 parent;
 	u32 chain_index;
-	struct net_device *dev;
-	struct Qdisc  *q;
+	struct Qdisc *q = NULL;
 	struct tcf_chain_info chain_info;
 	struct tcf_chain *chain = NULL;
 	struct tcf_block *block;
 	struct tcf_proto *tp;
-	const struct Qdisc_class_ops *cops;
 	unsigned long cl;
 	void *fh;
 	int err;
@@ -1036,41 +1041,58 @@ replay:
 
 	/* Find head of filter chain. */
 
-	/* Find link */
-	dev = __dev_get_by_index(net, t->tcm_ifindex);
-	if (dev == NULL)
-		return -ENODEV;
-
-	/* Find qdisc */
-	if (!parent) {
-		q = dev->qdisc;
-		parent = q->handle;
+	if (t->tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK) {
+		block = tcf_block_lookup(net, t->tcm_block_index);
+		if (!block) {
+			NL_SET_ERR_MSG(extack, "Block of given index was not found");
+			err = -EINVAL;
+			goto errout;
+		}
 	} else {
-		q = qdisc_lookup(dev, TC_H_MAJ(t->tcm_parent));
-		if (q == NULL)
-			return -EINVAL;
-	}
+		const struct Qdisc_class_ops *cops;
+		struct net_device *dev;
 
-	/* Is it classful? */
-	cops = q->ops->cl_ops;
-	if (!cops)
-		return -EINVAL;
+		/* Find link */
+		dev = __dev_get_by_index(net, t->tcm_ifindex);
+		if (!dev)
+			return -ENODEV;
 
-	if (!cops->tcf_block)
-		return -EOPNOTSUPP;
+		/* Find qdisc */
+		if (!parent) {
+			q = dev->qdisc;
+			parent = q->handle;
+		} else {
+			q = qdisc_lookup(dev, TC_H_MAJ(t->tcm_parent));
+			if (!q)
+				return -EINVAL;
+		}
 
-	/* Do we search for filter, attached to class? */
-	if (TC_H_MIN(parent)) {
-		cl = cops->find(q, parent);
-		if (cl == 0)
-			return -ENOENT;
-	}
+		/* Is it classful? */
+		cops = q->ops->cl_ops;
+		if (!cops)
+			return -EINVAL;
 
-	/* And the last stroke */
-	block = cops->tcf_block(q, cl, extack);
-	if (!block) {
-		err = -EINVAL;
-		goto errout;
+		if (!cops->tcf_block)
+			return -EOPNOTSUPP;
+
+		/* Do we search for filter, attached to class? */
+		if (TC_H_MIN(parent)) {
+			cl = cops->find(q, parent);
+			if (cl == 0)
+				return -ENOENT;
+		}
+
+		/* And the last stroke */
+		block = cops->tcf_block(q, cl, extack);
+		if (!block) {
+			err = -EINVAL;
+			goto errout;
+		}
+		if (tcf_block_shared(block)) {
+			NL_SET_ERR_MSG(extack, "This filter block is shared. Please use the block index to manipulate the filters");
+			err = -EOPNOTSUPP;
+			goto errout;
+		}
 	}
 
 	chain_index = tca[TCA_CHAIN] ? nla_get_u32(tca[TCA_CHAIN]) : 0;
@@ -1086,7 +1108,7 @@ replay:
 	}
 
 	if (n->nlmsg_type == RTM_DELTFILTER && prio == 0) {
-		tfilter_notify_chain(net, skb, q, parent, n,
+		tfilter_notify_chain(net, skb, block, q, parent, n,
 				     chain, RTM_DELTFILTER);
 		tcf_chain_flush(chain);
 		err = 0;
@@ -1134,7 +1156,7 @@ replay:
 	if (!fh) {
 		if (n->nlmsg_type == RTM_DELTFILTER && t->tcm_handle == 0) {
 			tcf_chain_tp_remove(chain, &chain_info, tp);
-			tfilter_notify(net, skb, n, tp, q, parent, fh,
+			tfilter_notify(net, skb, n, tp, block, q, parent, fh,
 				       RTM_DELTFILTER, false);
 			tcf_proto_destroy(tp);
 			err = 0;
@@ -1159,8 +1181,8 @@ replay:
 			}
 			break;
 		case RTM_DELTFILTER:
-			err = tfilter_del_notify(net, skb, n, tp, q, parent,
-						 fh, false, &last);
+			err = tfilter_del_notify(net, skb, n, tp, block,
+						 q, parent, fh, false, &last);
 			if (err)
 				goto errout;
 			if (last) {
@@ -1169,8 +1191,8 @@ replay:
 			}
 			goto errout;
 		case RTM_GETTFILTER:
-			err = tfilter_notify(net, skb, n, tp, q, parent, fh,
-					     RTM_NEWTFILTER, true);
+			err = tfilter_notify(net, skb, n, tp, block, q, parent,
+					     fh, RTM_NEWTFILTER, true);
 			goto errout;
 		default:
 			err = -EINVAL;
@@ -1183,7 +1205,7 @@ replay:
 	if (err == 0) {
 		if (tp_created)
 			tcf_chain_tp_insert(chain, &chain_info, tp);
-		tfilter_notify(net, skb, n, tp, q, parent, fh,
+		tfilter_notify(net, skb, n, tp, block, q, parent, fh,
 			       RTM_NEWTFILTER, false);
 	} else {
 		if (tp_created)
@@ -1203,6 +1225,7 @@ struct tcf_dump_args {
 	struct tcf_walker w;
 	struct sk_buff *skb;
 	struct netlink_callback *cb;
+	struct tcf_block *block;
 	struct Qdisc *q;
 	u32 parent;
 };
@@ -1212,7 +1235,7 @@ static int tcf_node_dump(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
 	struct tcf_dump_args *a = (void *)arg;
 	struct net *net = sock_net(a->skb->sk);
 
-	return tcf_fill_node(net, a->skb, tp, a->q, a->parent,
+	return tcf_fill_node(net, a->skb, tp, a->block, a->q, a->parent,
 			     n, NETLINK_CB(a->cb->skb).portid,
 			     a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
 			     RTM_NEWTFILTER);
@@ -1223,6 +1246,7 @@ static bool tcf_chain_dump(struct tcf_chain *chain, struct Qdisc *q, u32 parent,
 			   long index_start, long *p_index)
 {
 	struct net *net = sock_net(skb->sk);
+	struct tcf_block *block = chain->block;
 	struct tcmsg *tcm = nlmsg_data(cb->nlh);
 	struct tcf_dump_args arg;
 	struct tcf_proto *tp;
@@ -1241,7 +1265,7 @@ static bool tcf_chain_dump(struct tcf_chain *chain, struct Qdisc *q, u32 parent,
 			memset(&cb->args[1], 0,
 			       sizeof(cb->args) - sizeof(cb->args[0]));
 		if (cb->args[1] == 0) {
-			if (tcf_fill_node(net, skb, tp, q, parent, 0,
+			if (tcf_fill_node(net, skb, tp, block, q, parent, 0,
 					  NETLINK_CB(cb->skb).portid,
 					  cb->nlh->nlmsg_seq, NLM_F_MULTI,
 					  RTM_NEWTFILTER) <= 0)
@@ -1254,6 +1278,7 @@ static bool tcf_chain_dump(struct tcf_chain *chain, struct Qdisc *q, u32 parent,
 		arg.w.fn = tcf_node_dump;
 		arg.skb = skb;
 		arg.cb = cb;
+		arg.block = block;
 		arg.q = q;
 		arg.parent = parent;
 		arg.w.stop = 0;
@@ -1272,13 +1297,10 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	struct net *net = sock_net(skb->sk);
 	struct nlattr *tca[TCA_MAX + 1];
-	struct net_device *dev;
-	struct Qdisc *q;
+	struct Qdisc *q = NULL;
 	struct tcf_block *block;
 	struct tcf_chain *chain;
 	struct tcmsg *tcm = nlmsg_data(cb->nlh);
-	unsigned long cl = 0;
-	const struct Qdisc_class_ops *cops;
 	long index_start;
 	long index;
 	u32 parent;
@@ -1291,32 +1313,44 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
 	if (err)
 		return err;
 
-	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
-	if (!dev)
-		return skb->len;
-
-	parent = tcm->tcm_parent;
-	if (!parent) {
-		q = dev->qdisc;
-		parent = q->handle;
+	if (tcm->tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK) {
+		block = tcf_block_lookup(net, tcm->tcm_block_index);
+		if (!block)
+			goto out;
 	} else {
-		q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent));
-	}
-	if (!q)
-		goto out;
-	cops = q->ops->cl_ops;
-	if (!cops)
-		goto out;
-	if (!cops->tcf_block)
-		goto out;
-	if (TC_H_MIN(tcm->tcm_parent)) {
-		cl = cops->find(q, tcm->tcm_parent);
-		if (cl == 0)
+		const struct Qdisc_class_ops *cops;
+		struct net_device *dev;
+		unsigned long cl = 0;
+
+		dev = __dev_get_by_index(net, tcm->tcm_ifindex);
+		if (!dev)
+			return skb->len;
+
+		parent = tcm->tcm_parent;
+		if (!parent) {
+			q = dev->qdisc;
+			parent = q->handle;
+		} else {
+			q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent));
+		}
+		if (!q)
 			goto out;
+		cops = q->ops->cl_ops;
+		if (!cops)
+			goto out;
+		if (!cops->tcf_block)
+			goto out;
+		if (TC_H_MIN(tcm->tcm_parent)) {
+			cl = cops->find(q, tcm->tcm_parent);
+			if (cl == 0)
+				goto out;
+		}
+		block = cops->tcf_block(q, cl, NULL);
+		if (!block)
+			goto out;
+		if (tcf_block_shared(block))
+			q = NULL;
 	}
-	block = cops->tcf_block(q, cl, NULL);
-	if (!block)
-		goto out;
 
 	index_start = cb->args[0];
 	index = 0;
-- 
cgit v1.2.3


From d47a6b0e7c492a4ba4524d557db388e34fd0a47a Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Wed, 17 Jan 2018 11:46:52 +0100
Subject: net: sched: introduce ingress/egress block index attributes for qdisc

Introduce two new attributes to be used for qdisc creation and dumping:
one for the ingress block, one for the egress block. Introduce a set of
ops that a qdisc which supports block sharing would implement.

Passing block indexes in a qdisc change operation is not supported yet;
this is checked for and forbidden.

In future, these attributes are to be reused for specifying block
indexes for classes as well. As of this moment however, it is not
supported so a check is in place to forbid it.

Suggested-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Acked-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h      |  7 +++++
 include/uapi/linux/rtnetlink.h |  2 ++
 net/sched/sch_api.c            | 60 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 69 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index bf5cc0a1d0f6..cfc19d0ba2ad 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -204,6 +204,13 @@ struct Qdisc_ops {
 	int			(*dump)(struct Qdisc *, struct sk_buff *);
 	int			(*dump_stats)(struct Qdisc *, struct gnet_dump *);
 
+	void			(*ingress_block_set)(struct Qdisc *sch,
+						     u32 block_index);
+	void			(*egress_block_set)(struct Qdisc *sch,
+						    u32 block_index);
+	u32			(*ingress_block_get)(struct Qdisc *sch);
+	u32			(*egress_block_get)(struct Qdisc *sch);
+
 	struct module		*owner;
 };
 
diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index da878f2e7c39..9b15005955fa 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -568,6 +568,8 @@ enum {
 	TCA_DUMP_INVISIBLE,
 	TCA_CHAIN,
 	TCA_HW_OFFLOAD,
+	TCA_INGRESS_BLOCK,
+	TCA_EGRESS_BLOCK,
 	__TCA_MAX
 };
 
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 7dffa9dce28b..d512f49ee83c 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -791,6 +791,7 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
 	unsigned char *b = skb_tail_pointer(skb);
 	struct gnet_dump d;
 	struct qdisc_size_table *stab;
+	u32 block_index;
 	__u32 qlen;
 
 	cond_resched();
@@ -807,6 +808,18 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
 	tcm->tcm_info = refcount_read(&q->refcnt);
 	if (nla_put_string(skb, TCA_KIND, q->ops->id))
 		goto nla_put_failure;
+	if (q->ops->ingress_block_get) {
+		block_index = q->ops->ingress_block_get(q);
+		if (block_index &&
+		    nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index))
+			goto nla_put_failure;
+	}
+	if (q->ops->egress_block_get) {
+		block_index = q->ops->egress_block_get(q);
+		if (block_index &&
+		    nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index))
+			goto nla_put_failure;
+	}
 	if (q->ops->dump && q->ops->dump(q, skb) < 0)
 		goto nla_put_failure;
 	if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
@@ -994,6 +1007,40 @@ skip:
 	return err;
 }
 
+static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
+				   struct netlink_ext_ack *extack)
+{
+	u32 block_index;
+
+	if (tca[TCA_INGRESS_BLOCK]) {
+		block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]);
+
+		if (!block_index) {
+			NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0");
+			return -EINVAL;
+		}
+		if (!sch->ops->ingress_block_set) {
+			NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported");
+			return -EOPNOTSUPP;
+		}
+		sch->ops->ingress_block_set(sch, block_index);
+	}
+	if (tca[TCA_EGRESS_BLOCK]) {
+		block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]);
+
+		if (!block_index) {
+			NL_SET_ERR_MSG(extack, "Egress block index cannot be 0");
+			return -EINVAL;
+		}
+		if (!sch->ops->egress_block_set) {
+			NL_SET_ERR_MSG(extack, "Egress block sharing is not supported");
+			return -EOPNOTSUPP;
+		}
+		sch->ops->egress_block_set(sch, block_index);
+	}
+	return 0;
+}
+
 /* lockdep annotation is needed for ingress; egress gets it only for name */
 static struct lock_class_key qdisc_tx_lock;
 static struct lock_class_key qdisc_rx_lock;
@@ -1088,6 +1135,10 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
 		netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
 	}
 
+	err = qdisc_block_indexes_set(sch, tca, extack);
+	if (err)
+		goto err_out3;
+
 	if (ops->init) {
 		err = ops->init(sch, tca[TCA_OPTIONS], extack);
 		if (err != 0)
@@ -1169,6 +1220,10 @@ static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
 			NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc");
 			return -EINVAL;
 		}
+		if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
+			NL_SET_ERR_MSG(extack, "Change of blocks is not supported");
+			return -EOPNOTSUPP;
+		}
 		err = sch->ops->change(sch, tca[TCA_OPTIONS], extack);
 		if (err)
 			return err;
@@ -1894,6 +1949,11 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
 		}
 	}
 
+	if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
+		NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes");
+		return -EOPNOTSUPP;
+	}
+
 	new_cl = cl;
 	err = -EOPNOTSUPP;
 	if (cops->change)
-- 
cgit v1.2.3


From aff3d70a07fffc0abb53663e4a4acb059d2f36af Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Tue, 16 Jan 2018 16:31:02 +0800
Subject: tun: allow to attach ebpf socket filter

This patch allows userspace to attach an eBPF filter to tun. This makes
it possible to implement VM dataplane filtering more efficiently than
with a cBPF filter, by allowing either qemu or libvirt to attach an
eBPF filter to tun.

Signed-off-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/tun.c           | 38 ++++++++++++++++++++++++++++++++++----
 include/uapi/linux/if_tun.h |  1 +
 2 files changed, 35 insertions(+), 4 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 76197ede22a9..170a3e89b5af 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -239,6 +239,12 @@ struct tun_struct {
 	struct tun_pcpu_stats __percpu *pcpu_stats;
 	struct bpf_prog __rcu *xdp_prog;
 	struct tun_prog __rcu *steering_prog;
+	struct tun_prog __rcu *filter_prog;
+};
+
+struct veth {
+	__be16 h_vlan_proto;
+	__be16 h_vlan_TCI;
 };
 
 bool tun_is_xdp_buff(void *ptr)
@@ -1036,12 +1042,25 @@ static void tun_automq_xmit(struct tun_struct *tun, struct sk_buff *skb)
 #endif
 }
 
+static unsigned int run_ebpf_filter(struct tun_struct *tun,
+				    struct sk_buff *skb,
+				    int len)
+{
+	struct tun_prog *prog = rcu_dereference(tun->filter_prog);
+
+	if (prog)
+		len = bpf_prog_run_clear_cb(prog->prog, skb);
+
+	return len;
+}
+
 /* Net device start xmit */
 static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct tun_struct *tun = netdev_priv(dev);
 	int txq = skb->queue_mapping;
 	struct tun_file *tfile;
+	int len = skb->len;
 
 	rcu_read_lock();
 	tfile = rcu_dereference(tun->tfiles[txq]);
@@ -1067,6 +1086,15 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
 	    sk_filter(tfile->socket.sk, skb))
 		goto drop;
 
+	len = run_ebpf_filter(tun, skb, len);
+
+	/* Trim extra bytes since we may insert vlan proto & TCI
+	 * in tun_put_user().
+	 */
+	len -= skb_vlan_tag_present(skb) ? sizeof(struct veth) : 0;
+	if (len <= 0 || pskb_trim(skb, len))
+		goto drop;
+
 	if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
 		goto drop;
 
@@ -2054,10 +2082,7 @@ static ssize_t tun_put_user(struct tun_struct *tun,
 
 	if (vlan_hlen) {
 		int ret;
-		struct {
-			__be16 h_vlan_proto;
-			__be16 h_vlan_TCI;
-		} veth;
+		struct veth veth;
 
 		veth.h_vlan_proto = skb->vlan_proto;
 		veth.h_vlan_TCI = htons(skb_vlan_tag_get(skb));
@@ -2225,6 +2250,7 @@ static void tun_free_netdev(struct net_device *dev)
 	tun_flow_uninit(tun);
 	security_tun_dev_free_security(tun->security);
 	__tun_set_ebpf(tun, &tun->steering_prog, NULL);
+	__tun_set_ebpf(tun, &tun->filter_prog, NULL);
 }
 
 static void tun_setup(struct net_device *dev)
@@ -3019,6 +3045,10 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
 		ret = tun_set_ebpf(tun, &tun->steering_prog, argp);
 		break;
 
+	case TUNSETFILTEREBPF:
+		ret = tun_set_ebpf(tun, &tun->filter_prog, argp);
+		break;
+
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h
index fb38c1797131..ee432cd3018c 100644
--- a/include/uapi/linux/if_tun.h
+++ b/include/uapi/linux/if_tun.h
@@ -58,6 +58,7 @@
 #define TUNSETVNETBE _IOW('T', 222, int)
 #define TUNGETVNETBE _IOR('T', 223, int)
 #define TUNSETSTEERINGEBPF _IOR('T', 224, int)
+#define TUNSETFILTEREBPF _IOR('T', 225, int)
 
 /* TUNSETIFF ifr flags */
 #define IFF_TUN		0x0001
-- 
cgit v1.2.3


From cb5f7334d479414adc6afe60105283277e297489 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Wed, 17 Jan 2018 12:05:36 +0100
Subject: bpf: add comments to BPF ld/ldx sizes

Document the BPF ld/ldx size defines as comments in the code, as that
makes them faster to look up in a programming/review setting than
looking up the sizes in Documentation/networking/filter.txt.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h        | 2 +-
 include/uapi/linux/bpf_common.h | 7 ++++---
 2 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 7c2259e8bc54..74dc4dc98681 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -17,7 +17,7 @@
 #define BPF_ALU64	0x07	/* alu mode in double word width */
 
 /* ld/ldx fields */
-#define BPF_DW		0x18	/* double word */
+#define BPF_DW		0x18	/* double word (64-bit) */
 #define BPF_XADD	0xc0	/* exclusive add */
 
 /* alu/jmp fields */
diff --git a/include/uapi/linux/bpf_common.h b/include/uapi/linux/bpf_common.h
index 18be90725ab0..ee97668bdadb 100644
--- a/include/uapi/linux/bpf_common.h
+++ b/include/uapi/linux/bpf_common.h
@@ -15,9 +15,10 @@
 
 /* ld/ldx fields */
 #define BPF_SIZE(code)  ((code) & 0x18)
-#define		BPF_W		0x00
-#define		BPF_H		0x08
-#define		BPF_B		0x10
+#define		BPF_W		0x00 /* 32-bit */
+#define		BPF_H		0x08 /* 16-bit */
+#define		BPF_B		0x10 /*  8-bit */
+/* eBPF		BPF_DW		0x18    64-bit */
 #define BPF_MODE(code)  ((code) & 0xe0)
 #define		BPF_IMM		0x00
 #define		BPF_ABS		0x20
-- 
cgit v1.2.3


From 52775b33bb5072fbc07b02c0cf4fe8da1f7ee7cd Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 17 Jan 2018 19:13:28 -0800
Subject: bpf: offload: report device information about offloaded maps

Tell user space about the device on which the map was created.
The unfortunate reality of the user ABI makes sharing this code
with program offload difficult, but the information is the
same.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf.h            |  2 ++
 include/uapi/linux/bpf.h       |  3 +++
 kernel/bpf/offload.c           | 55 ++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c           |  6 +++++
 tools/include/uapi/linux/bpf.h |  3 +++
 5 files changed, 69 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 025b1c2f8053..66df387106de 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -586,6 +586,8 @@ void bpf_prog_offload_destroy(struct bpf_prog *prog);
 int bpf_prog_offload_info_fill(struct bpf_prog_info *info,
 			       struct bpf_prog *prog);
 
+int bpf_map_offload_info_fill(struct bpf_map_info *info, struct bpf_map *map);
+
 int bpf_map_offload_lookup_elem(struct bpf_map *map, void *key, void *value);
 int bpf_map_offload_update_elem(struct bpf_map *map,
 				void *key, void *value, u64 flags);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 74dc4dc98681..406c19d6016b 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -938,6 +938,9 @@ struct bpf_map_info {
 	__u32 max_entries;
 	__u32 map_flags;
 	char  name[BPF_OBJ_NAME_LEN];
+	__u32 ifindex;
+	__u64 netns_dev;
+	__u64 netns_ino;
 } __attribute__((aligned(8)));
 
 /* User bpf_sock_ops struct to access socket values and specify request ops
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 2657976aec2a..c9401075b58c 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -413,6 +413,61 @@ int bpf_map_offload_get_next_key(struct bpf_map *map, void *key, void *next_key)
 	return ret;
 }
 
+struct ns_get_path_bpf_map_args {
+	struct bpf_offloaded_map *offmap;
+	struct bpf_map_info *info;
+};
+
+static struct ns_common *bpf_map_offload_info_fill_ns(void *private_data)
+{
+	struct ns_get_path_bpf_map_args *args = private_data;
+	struct ns_common *ns;
+	struct net *net;
+
+	rtnl_lock();
+	down_read(&bpf_devs_lock);
+
+	if (args->offmap->netdev) {
+		args->info->ifindex = args->offmap->netdev->ifindex;
+		net = dev_net(args->offmap->netdev);
+		get_net(net);
+		ns = &net->ns;
+	} else {
+		args->info->ifindex = 0;
+		ns = NULL;
+	}
+
+	up_read(&bpf_devs_lock);
+	rtnl_unlock();
+
+	return ns;
+}
+
+int bpf_map_offload_info_fill(struct bpf_map_info *info, struct bpf_map *map)
+{
+	struct ns_get_path_bpf_map_args args = {
+		.offmap	= map_to_offmap(map),
+		.info	= info,
+	};
+	struct inode *ns_inode;
+	struct path ns_path;
+	void *res;
+
+	res = ns_get_path_cb(&ns_path, bpf_map_offload_info_fill_ns, &args);
+	if (IS_ERR(res)) {
+		if (!info->ifindex)
+			return -ENODEV;
+		return PTR_ERR(res);
+	}
+
+	ns_inode = ns_path.dentry->d_inode;
+	info->netns_dev = new_encode_dev(ns_inode->i_sb->s_dev);
+	info->netns_ino = ns_inode->i_ino;
+	path_put(&ns_path);
+
+	return 0;
+}
+
 bool bpf_offload_dev_match(struct bpf_prog *prog, struct bpf_map *map)
 {
 	struct bpf_offloaded_map *offmap;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 97a825ffc763..5bdb0cc84ad2 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1801,6 +1801,12 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map,
 	info.map_flags = map->map_flags;
 	memcpy(info.name, map->name, sizeof(map->name));
 
+	if (bpf_map_is_dev_bound(map)) {
+		err = bpf_map_offload_info_fill(&info, map);
+		if (err)
+			return err;
+	}
+
 	if (copy_to_user(uinfo, &info, info_len) ||
 	    put_user(info_len, &uattr->info.info_len))
 		return -EFAULT;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 7c2259e8bc54..af1f49ad8b88 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -938,6 +938,9 @@ struct bpf_map_info {
 	__u32 max_entries;
 	__u32 map_flags;
 	char  name[BPF_OBJ_NAME_LEN];
+	__u32 ifindex;
+	__u64 netns_dev;
+	__u64 netns_ino;
 } __attribute__((aligned(8)));
 
 /* User bpf_sock_ops struct to access socket values and specify request ops
-- 
cgit v1.2.3


From 3214d01f139b7544e870fc0b7fcce8da13c1cb51 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Mon, 15 Jan 2018 16:06:47 +1100
Subject: KVM: PPC: Book3S: Provide information about hardware/firmware CVE
 workarounds

This adds a new ioctl, KVM_PPC_GET_CPU_CHAR, that gives userspace
information about the underlying machine's level of vulnerability
to the recently announced vulnerabilities CVE-2017-5715,
CVE-2017-5753 and CVE-2017-5754, and whether the machine provides
instructions to assist software to work around the vulnerabilities.

The ioctl returns two u64 words describing characteristics of the
CPU and required software behaviour respectively, plus two mask
words which indicate which bits have been filled in by the kernel,
for extensibility.  The bit definitions are the same as for the
new H_GET_CPU_CHARACTERISTICS hypercall.

There is also a new capability, KVM_CAP_PPC_GET_CPU_CHAR, which
indicates whether the new ioctl is available.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 Documentation/virtual/kvm/api.txt   |  46 +++++++++++++
 arch/powerpc/include/uapi/asm/kvm.h |  25 +++++++
 arch/powerpc/kvm/powerpc.c          | 131 ++++++++++++++++++++++++++++++++++++
 include/uapi/linux/kvm.h            |   3 +
 4 files changed, 205 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 57d3ee9e4bde..fc3ae951bc07 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -3403,6 +3403,52 @@ invalid, if invalid pages are written to (e.g. after the end of memory)
 or if no page table is present for the addresses (e.g. when using
 hugepages).
 
+4.108 KVM_PPC_GET_CPU_CHAR
+
+Capability: KVM_CAP_PPC_GET_CPU_CHAR
+Architectures: powerpc
+Type: vm ioctl
+Parameters: struct kvm_ppc_cpu_char (out)
+Returns: 0 on successful completion
+	 -EFAULT if struct kvm_ppc_cpu_char cannot be written
+
+This ioctl gives userspace information about certain characteristics
+of the CPU relating to speculative execution of instructions and
+possible information leakage resulting from speculative execution (see
+CVE-2017-5715, CVE-2017-5753 and CVE-2017-5754).  The information is
+returned in struct kvm_ppc_cpu_char, which looks like this:
+
+struct kvm_ppc_cpu_char {
+	__u64	character;		/* characteristics of the CPU */
+	__u64	behaviour;		/* recommended software behaviour */
+	__u64	character_mask;		/* valid bits in character */
+	__u64	behaviour_mask;		/* valid bits in behaviour */
+};
+
+For extensibility, the character_mask and behaviour_mask fields
+indicate which bits of character and behaviour have been filled in by
+the kernel.  If the set of defined bits is extended in future then
+userspace will be able to tell whether it is running on a kernel that
+knows about the new bits.
+
+The character field describes attributes of the CPU which can help
+with preventing inadvertent information disclosure - specifically,
+whether there is an instruction to flash-invalidate the L1 data cache
+(ori 30,30,0 or mtspr SPRN_TRIG2,rN), whether the L1 data cache is set
+to a mode where entries can only be used by the thread that created
+them, whether the bcctr[l] instruction prevents speculation, and
+whether a speculation barrier instruction (ori 31,31,0) is provided.
+
+The behaviour field describes actions that software should take to
+prevent inadvertent information disclosure, and thus describes which
+vulnerabilities the hardware is subject to; specifically whether the
+L1 data cache should be flushed when returning to user mode from the
+kernel, and whether a speculation barrier should be placed between an
+array bounds check and the array access.
+
+These fields use the same bit definitions as the new
+H_GET_CPU_CHARACTERISTICS hypercall.
+
 5. The kvm_run structure
 ------------------------
 
diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
index 61d6049f4c1e..637b7263cb86 100644
--- a/arch/powerpc/include/uapi/asm/kvm.h
+++ b/arch/powerpc/include/uapi/asm/kvm.h
@@ -443,6 +443,31 @@ struct kvm_ppc_rmmu_info {
 	__u32	ap_encodings[8];
 };
 
+/* For KVM_PPC_GET_CPU_CHAR */
+struct kvm_ppc_cpu_char {
+	__u64	character;		/* characteristics of the CPU */
+	__u64	behaviour;		/* recommended software behaviour */
+	__u64	character_mask;		/* valid bits in character */
+	__u64	behaviour_mask;		/* valid bits in behaviour */
+};
+
+/*
+ * Values for character and character_mask.
+ * These are identical to the values used by H_GET_CPU_CHARACTERISTICS.
+ */
+#define KVM_PPC_CPU_CHAR_SPEC_BAR_ORI31		(1ULL << 63)
+#define KVM_PPC_CPU_CHAR_BCCTRL_SERIALISED	(1ULL << 62)
+#define KVM_PPC_CPU_CHAR_L1D_FLUSH_ORI30	(1ULL << 61)
+#define KVM_PPC_CPU_CHAR_L1D_FLUSH_TRIG2	(1ULL << 60)
+#define KVM_PPC_CPU_CHAR_L1D_THREAD_PRIV	(1ULL << 59)
+#define KVM_PPC_CPU_CHAR_BR_HINT_HONOURED	(1ULL << 58)
+#define KVM_PPC_CPU_CHAR_MTTRIG_THR_RECONF	(1ULL << 57)
+#define KVM_PPC_CPU_CHAR_COUNT_CACHE_DIS	(1ULL << 56)
+
+#define KVM_PPC_CPU_BEHAV_FAVOUR_SECURITY	(1ULL << 63)
+#define KVM_PPC_CPU_BEHAV_L1D_FLUSH_PR		(1ULL << 62)
+#define KVM_PPC_CPU_BEHAV_BNDS_CHK_SPEC_BAR	(1ULL << 61)
+
 /* Per-vcpu XICS interrupt controller state */
 #define KVM_REG_PPC_ICP_STATE	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x8c)
 
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 1915e86cef6f..0a7c88786ec0 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -39,6 +39,10 @@
 #include <asm/iommu.h>
 #include <asm/switch_to.h>
 #include <asm/xive.h>
+#ifdef CONFIG_PPC_PSERIES
+#include <asm/hvcall.h>
+#include <asm/plpar_wrappers.h>
+#endif
 
 #include "timing.h"
 #include "irq.h"
@@ -548,6 +552,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 #ifdef CONFIG_KVM_XICS
 	case KVM_CAP_IRQ_XICS:
 #endif
+	case KVM_CAP_PPC_GET_CPU_CHAR:
 		r = 1;
 		break;
 
@@ -1759,6 +1764,124 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
 	return r;
 }
 
+#ifdef CONFIG_PPC_BOOK3S_64
+/*
+ * These functions check whether the underlying hardware is safe
+ * against attacks based on observing the effects of speculatively
+ * executed instructions, and whether it supplies instructions for
+ * use in workarounds.  The information comes from firmware, either
+ * via the device tree on powernv platforms or from an hcall on
+ * pseries platforms.
+ */
+#ifdef CONFIG_PPC_PSERIES
+static int pseries_get_cpu_char(struct kvm_ppc_cpu_char *cp)
+{
+	struct h_cpu_char_result c;
+	unsigned long rc;
+
+	if (!machine_is(pseries))
+		return -ENOTTY;
+
+	rc = plpar_get_cpu_characteristics(&c);
+	if (rc == H_SUCCESS) {
+		cp->character = c.character;
+		cp->behaviour = c.behaviour;
+		cp->character_mask = KVM_PPC_CPU_CHAR_SPEC_BAR_ORI31 |
+			KVM_PPC_CPU_CHAR_BCCTRL_SERIALISED |
+			KVM_PPC_CPU_CHAR_L1D_FLUSH_ORI30 |
+			KVM_PPC_CPU_CHAR_L1D_FLUSH_TRIG2 |
+			KVM_PPC_CPU_CHAR_L1D_THREAD_PRIV |
+			KVM_PPC_CPU_CHAR_BR_HINT_HONOURED |
+			KVM_PPC_CPU_CHAR_MTTRIG_THR_RECONF |
+			KVM_PPC_CPU_CHAR_COUNT_CACHE_DIS;
+		cp->behaviour_mask = KVM_PPC_CPU_BEHAV_FAVOUR_SECURITY |
+			KVM_PPC_CPU_BEHAV_L1D_FLUSH_PR |
+			KVM_PPC_CPU_BEHAV_BNDS_CHK_SPEC_BAR;
+	}
+	return 0;
+}
+#else
+static int pseries_get_cpu_char(struct kvm_ppc_cpu_char *cp)
+{
+	return -ENOTTY;
+}
+#endif
+
+static inline bool have_fw_feat(struct device_node *fw_features,
+				const char *state, const char *name)
+{
+	struct device_node *np;
+	bool r = false;
+
+	np = of_get_child_by_name(fw_features, name);
+	if (np) {
+		r = of_property_read_bool(np, state);
+		of_node_put(np);
+	}
+	return r;
+}
+
+static int kvmppc_get_cpu_char(struct kvm_ppc_cpu_char *cp)
+{
+	struct device_node *np, *fw_features;
+	int r;
+
+	memset(cp, 0, sizeof(*cp));
+	r = pseries_get_cpu_char(cp);
+	if (r != -ENOTTY)
+		return r;
+
+	np = of_find_node_by_name(NULL, "ibm,opal");
+	if (np) {
+		fw_features = of_get_child_by_name(np, "fw-features");
+		of_node_put(np);
+		if (!fw_features)
+			return 0;
+		if (have_fw_feat(fw_features, "enabled",
+				 "inst-spec-barrier-ori31,31,0"))
+			cp->character |= KVM_PPC_CPU_CHAR_SPEC_BAR_ORI31;
+		if (have_fw_feat(fw_features, "enabled",
+				 "fw-bcctrl-serialized"))
+			cp->character |= KVM_PPC_CPU_CHAR_BCCTRL_SERIALISED;
+		if (have_fw_feat(fw_features, "enabled",
+				 "inst-l1d-flush-ori30,30,0"))
+			cp->character |= KVM_PPC_CPU_CHAR_L1D_FLUSH_ORI30;
+		if (have_fw_feat(fw_features, "enabled",
+				 "inst-l1d-flush-trig2"))
+			cp->character |= KVM_PPC_CPU_CHAR_L1D_FLUSH_TRIG2;
+		if (have_fw_feat(fw_features, "enabled",
+				 "fw-l1d-thread-split"))
+			cp->character |= KVM_PPC_CPU_CHAR_L1D_THREAD_PRIV;
+		if (have_fw_feat(fw_features, "enabled",
+				 "fw-count-cache-disabled"))
+			cp->character |= KVM_PPC_CPU_CHAR_COUNT_CACHE_DIS;
+		cp->character_mask = KVM_PPC_CPU_CHAR_SPEC_BAR_ORI31 |
+			KVM_PPC_CPU_CHAR_BCCTRL_SERIALISED |
+			KVM_PPC_CPU_CHAR_L1D_FLUSH_ORI30 |
+			KVM_PPC_CPU_CHAR_L1D_FLUSH_TRIG2 |
+			KVM_PPC_CPU_CHAR_L1D_THREAD_PRIV |
+			KVM_PPC_CPU_CHAR_COUNT_CACHE_DIS;
+
+		if (have_fw_feat(fw_features, "enabled",
+				 "speculation-policy-favor-security"))
+			cp->behaviour |= KVM_PPC_CPU_BEHAV_FAVOUR_SECURITY;
+		if (!have_fw_feat(fw_features, "disabled",
+				  "needs-l1d-flush-msr-pr-0-to-1"))
+			cp->behaviour |= KVM_PPC_CPU_BEHAV_L1D_FLUSH_PR;
+		if (!have_fw_feat(fw_features, "disabled",
+				  "needs-spec-barrier-for-bound-checks"))
+			cp->behaviour |= KVM_PPC_CPU_BEHAV_BNDS_CHK_SPEC_BAR;
+		cp->behaviour_mask = KVM_PPC_CPU_BEHAV_FAVOUR_SECURITY |
+			KVM_PPC_CPU_BEHAV_L1D_FLUSH_PR |
+			KVM_PPC_CPU_BEHAV_BNDS_CHK_SPEC_BAR;
+
+		of_node_put(fw_features);
+	}
+
+	return 0;
+}
+#endif
+
 long kvm_arch_vm_ioctl(struct file *filp,
                        unsigned int ioctl, unsigned long arg)
 {
@@ -1861,6 +1984,14 @@ long kvm_arch_vm_ioctl(struct file *filp,
 			r = -EFAULT;
 		break;
 	}
+	case KVM_PPC_GET_CPU_CHAR: {
+		struct kvm_ppc_cpu_char cpuchar;
+
+		r = kvmppc_get_cpu_char(&cpuchar);
+		if (r >= 0 && copy_to_user(argp, &cpuchar, sizeof(cpuchar)))
+			r = -EFAULT;
+		break;
+	}
 	default: {
 		struct kvm *kvm = filp->private_data;
 		r = kvm->arch.kvm_ops->arch_vm_ioctl(filp, ioctl, arg);
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 496e59a2738b..7a99b98cf88e 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -932,6 +932,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_HYPERV_SYNIC2 148
 #define KVM_CAP_HYPERV_VP_INDEX 149
 #define KVM_CAP_S390_AIS_MIGRATION 150
+#define KVM_CAP_PPC_GET_CPU_CHAR 151
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -1261,6 +1262,8 @@ struct kvm_s390_ucas_mapping {
 #define KVM_PPC_CONFIGURE_V3_MMU  _IOW(KVMIO,  0xaf, struct kvm_ppc_mmuv3_cfg)
 /* Available with KVM_CAP_PPC_RADIX_MMU */
 #define KVM_PPC_GET_RMMU_INFO	  _IOW(KVMIO,  0xb0, struct kvm_ppc_rmmu_info)
+/* Available with KVM_CAP_PPC_GET_CPU_CHAR */
+#define KVM_PPC_GET_CPU_CHAR	  _IOR(KVMIO,  0xb1, struct kvm_ppc_cpu_char)
 
 /* ioctl for vm fd */
 #define KVM_CREATE_DEVICE	  _IOWR(KVMIO,  0xe0, struct kvm_create_device)
-- 
cgit v1.2.3


From 3ecbfd65f50e5ff9c538c1bfa3356ef52cc66586 Mon Sep 17 00:00:00 2001
From: Harsha Sharma <harshasharmaiitr@gmail.com>
Date: Wed, 27 Dec 2017 00:59:00 +0530
Subject: netfilter: nf_tables: allocate handle and delete objects via handle

This patch allows deletion of objects via their unique handle, which can
be listed via the '-a' option.

Signed-off-by: Harsha Sharma <harshasharmaiitr@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h        |  11 ++-
 include/uapi/linux/netfilter/nf_tables.h |  10 +++
 net/netfilter/nf_tables_api.c            | 146 ++++++++++++++++++++++++++++---
 3 files changed, 153 insertions(+), 14 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 4aca413367ee..663b015dace5 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -374,6 +374,7 @@ void nft_unregister_set(struct nft_set_type *type);
  *	@list: table set list node
  *	@bindings: list of set bindings
  * 	@name: name of the set
+ *	@handle: unique handle of the set
  * 	@ktype: key type (numeric type defined by userspace, not used in the kernel)
  * 	@dtype: data type (verdict or numeric type defined by userspace)
  * 	@objtype: object type (see NFT_OBJECT_* definitions)
@@ -396,6 +397,7 @@ struct nft_set {
 	struct list_head		list;
 	struct list_head		bindings;
 	char				*name;
+	u64				handle;
 	u32				ktype;
 	u32				dtype;
 	u32				objtype;
@@ -946,6 +948,7 @@ unsigned int nft_do_chain(struct nft_pktinfo *pkt, void *priv);
  *	@objects: stateful objects in the table
  *	@flowtables: flow tables in the table
  *	@hgenerator: handle generator state
+ *	@handle: table handle
  *	@use: number of chain references to this table
  *	@flags: table flag (see enum nft_table_flags)
  *	@genmask: generation mask
@@ -959,6 +962,7 @@ struct nft_table {
 	struct list_head		objects;
 	struct list_head		flowtables;
 	u64				hgenerator;
+	u64				handle;
 	u32				use;
 	u16				family:6,
 					flags:8,
@@ -983,9 +987,9 @@ int nft_verdict_dump(struct sk_buff *skb, int type,
  *	@name: name of this stateful object
  *	@genmask: generation mask
  *	@use: number of references to this stateful object
- * 	@data: object data, layout depends on type
+ *	@handle: unique object handle
  *	@ops: object operations
- *	@data: pointer to object data
+ * 	@data: object data, layout depends on type
  */
 struct nft_object {
 	struct list_head		list;
@@ -993,6 +997,7 @@ struct nft_object {
 	struct nft_table		*table;
 	u32				genmask:2,
 					use:30;
+	u64				handle;
 	/* runtime data below here */
 	const struct nft_object_ops	*ops ____cacheline_aligned;
 	unsigned char			data[]
@@ -1074,6 +1079,7 @@ void nft_unregister_obj(struct nft_object_type *obj_type);
  *	@ops_len: number of hooks in array
  *	@genmask: generation mask
  *	@use: number of references to this flow table
+ * 	@handle: unique object handle
  *	@data: rhashtable and garbage collector
  * 	@ops: array of hooks
  */
@@ -1086,6 +1092,7 @@ struct nft_flowtable {
 	int				ops_len;
 	u32				genmask:2,
 					use:30;
+	u64				handle;
 	/* runtime data below here */
 	struct nf_hook_ops		*ops ____cacheline_aligned;
 	struct nf_flowtable		data;
diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 53e8dd2a3a03..66dceee0ae30 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -174,6 +174,8 @@ enum nft_table_attributes {
 	NFTA_TABLE_NAME,
 	NFTA_TABLE_FLAGS,
 	NFTA_TABLE_USE,
+	NFTA_TABLE_HANDLE,
+	NFTA_TABLE_PAD,
 	__NFTA_TABLE_MAX
 };
 #define NFTA_TABLE_MAX		(__NFTA_TABLE_MAX - 1)
@@ -317,6 +319,7 @@ enum nft_set_desc_attributes {
  * @NFTA_SET_GC_INTERVAL: garbage collection interval (NLA_U32)
  * @NFTA_SET_USERDATA: user data (NLA_BINARY)
  * @NFTA_SET_OBJ_TYPE: stateful object type (NLA_U32: NFT_OBJECT_*)
+ * @NFTA_SET_HANDLE: set handle (NLA_U64)
  */
 enum nft_set_attributes {
 	NFTA_SET_UNSPEC,
@@ -335,6 +338,7 @@ enum nft_set_attributes {
 	NFTA_SET_USERDATA,
 	NFTA_SET_PAD,
 	NFTA_SET_OBJ_TYPE,
+	NFTA_SET_HANDLE,
 	__NFTA_SET_MAX
 };
 #define NFTA_SET_MAX		(__NFTA_SET_MAX - 1)
@@ -1314,6 +1318,7 @@ enum nft_ct_helper_attributes {
  * @NFTA_OBJ_TYPE: stateful object type (NLA_U32)
  * @NFTA_OBJ_DATA: stateful object data (NLA_NESTED)
  * @NFTA_OBJ_USE: number of references to this expression (NLA_U32)
+ * @NFTA_OBJ_HANDLE: object handle (NLA_U64)
  */
 enum nft_object_attributes {
 	NFTA_OBJ_UNSPEC,
@@ -1322,6 +1327,8 @@ enum nft_object_attributes {
 	NFTA_OBJ_TYPE,
 	NFTA_OBJ_DATA,
 	NFTA_OBJ_USE,
+	NFTA_OBJ_HANDLE,
+	NFTA_OBJ_PAD,
 	__NFTA_OBJ_MAX
 };
 #define NFTA_OBJ_MAX		(__NFTA_OBJ_MAX - 1)
@@ -1333,6 +1340,7 @@ enum nft_object_attributes {
  * @NFTA_FLOWTABLE_NAME: name of this flow table (NLA_STRING)
  * @NFTA_FLOWTABLE_HOOK: netfilter hook configuration(NLA_U32)
  * @NFTA_FLOWTABLE_USE: number of references to this flow table (NLA_U32)
+ * @NFTA_FLOWTABLE_HANDLE: object handle (NLA_U64)
  */
 enum nft_flowtable_attributes {
 	NFTA_FLOWTABLE_UNSPEC,
@@ -1340,6 +1348,8 @@ enum nft_flowtable_attributes {
 	NFTA_FLOWTABLE_NAME,
 	NFTA_FLOWTABLE_HOOK,
 	NFTA_FLOWTABLE_USE,
+	NFTA_FLOWTABLE_HANDLE,
+	NFTA_FLOWTABLE_PAD,
 	__NFTA_FLOWTABLE_MAX
 };
 #define NFTA_FLOWTABLE_MAX	(__NFTA_FLOWTABLE_MAX - 1)
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index b541e5094dce..1addc401ff7d 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -26,6 +26,7 @@
 static LIST_HEAD(nf_tables_expressions);
 static LIST_HEAD(nf_tables_objects);
 static LIST_HEAD(nf_tables_flowtables);
+static u64 table_handle;
 
 static void nft_ctx_init(struct nft_ctx *ctx,
 			 struct net *net,
@@ -332,6 +333,20 @@ static struct nft_table *nft_table_lookup(const struct net *net,
 	return NULL;
 }
 
+static struct nft_table *nft_table_lookup_byhandle(const struct net *net,
+						   const struct nlattr *nla,
+						   u8 genmask)
+{
+	struct nft_table *table;
+
+	list_for_each_entry(table, &net->nft.tables, list) {
+		if (be64_to_cpu(nla_get_be64(nla)) == table->handle &&
+		    nft_active_genmask(table, genmask))
+			return table;
+	}
+	return NULL;
+}
+
 static struct nft_table *nf_tables_table_lookup(const struct net *net,
 						const struct nlattr *nla,
 						u8 family, u8 genmask)
@@ -348,6 +363,22 @@ static struct nft_table *nf_tables_table_lookup(const struct net *net,
 	return ERR_PTR(-ENOENT);
 }
 
+static struct nft_table *nf_tables_table_lookup_byhandle(const struct net *net,
+							 const struct nlattr *nla,
+							 u8 genmask)
+{
+	struct nft_table *table;
+
+	if (nla == NULL)
+		return ERR_PTR(-EINVAL);
+
+	table = nft_table_lookup_byhandle(net, nla, genmask);
+	if (table != NULL)
+		return table;
+
+	return ERR_PTR(-ENOENT);
+}
+
 static inline u64 nf_tables_alloc_handle(struct nft_table *table)
 {
 	return ++table->hgenerator;
@@ -394,6 +425,7 @@ static const struct nla_policy nft_table_policy[NFTA_TABLE_MAX + 1] = {
 	[NFTA_TABLE_NAME]	= { .type = NLA_STRING,
 				    .len = NFT_TABLE_MAXNAMELEN - 1 },
 	[NFTA_TABLE_FLAGS]	= { .type = NLA_U32 },
+	[NFTA_TABLE_HANDLE]	= { .type = NLA_U64 },
 };
 
 static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net,
@@ -415,7 +447,9 @@ static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net,
 
 	if (nla_put_string(skb, NFTA_TABLE_NAME, table->name) ||
 	    nla_put_be32(skb, NFTA_TABLE_FLAGS, htonl(table->flags)) ||
-	    nla_put_be32(skb, NFTA_TABLE_USE, htonl(table->use)))
+	    nla_put_be32(skb, NFTA_TABLE_USE, htonl(table->use)) ||
+	    nla_put_be64(skb, NFTA_TABLE_HANDLE, cpu_to_be64(table->handle),
+			 NFTA_TABLE_PAD))
 		goto nla_put_failure;
 
 	nlmsg_end(skb, nlh);
@@ -674,6 +708,7 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk,
 	INIT_LIST_HEAD(&table->flowtables);
 	table->family = family;
 	table->flags = flags;
+	table->handle = ++table_handle;
 
 	nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla);
 	err = nft_trans_table_add(&ctx, NFT_MSG_NEWTABLE);
@@ -791,11 +826,18 @@ static int nf_tables_deltable(struct net *net, struct sock *nlsk,
 	struct nft_ctx ctx;
 
 	nft_ctx_init(&ctx, net, skb, nlh, 0, NULL, NULL, nla);
-	if (family == AF_UNSPEC || nla[NFTA_TABLE_NAME] == NULL)
+	if (family == AF_UNSPEC ||
+	    (!nla[NFTA_TABLE_NAME] && !nla[NFTA_TABLE_HANDLE]))
 		return nft_flush(&ctx, family);
 
-	table = nf_tables_table_lookup(net, nla[NFTA_TABLE_NAME], family,
-				       genmask);
+	if (nla[NFTA_TABLE_HANDLE])
+		table = nf_tables_table_lookup_byhandle(net,
+							nla[NFTA_TABLE_HANDLE],
+							genmask);
+	else
+		table = nf_tables_table_lookup(net, nla[NFTA_TABLE_NAME],
+					       family, genmask);
+
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
@@ -1539,6 +1581,7 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk,
 	struct nft_rule *rule;
 	int family = nfmsg->nfgen_family;
 	struct nft_ctx ctx;
+	u64 handle;
 	u32 use;
 	int err;
 
@@ -1547,7 +1590,12 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk,
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
-	chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask);
+	if (nla[NFTA_CHAIN_HANDLE]) {
+		handle = be64_to_cpu(nla_get_be64(nla[NFTA_CHAIN_HANDLE]));
+		chain = nf_tables_chain_lookup_byhandle(table, handle, genmask);
+	} else {
+		chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask);
+	}
 	if (IS_ERR(chain))
 		return PTR_ERR(chain);
 
@@ -2503,6 +2551,7 @@ static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = {
 	[NFTA_SET_USERDATA]		= { .type = NLA_BINARY,
 					    .len  = NFT_USERDATA_MAXLEN },
 	[NFTA_SET_OBJ_TYPE]		= { .type = NLA_U32 },
+	[NFTA_SET_HANDLE]		= { .type = NLA_U64 },
 };
 
 static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = {
@@ -2546,6 +2595,22 @@ static struct nft_set *nf_tables_set_lookup(const struct nft_table *table,
 	return ERR_PTR(-ENOENT);
 }
 
+static struct nft_set *nf_tables_set_lookup_byhandle(const struct nft_table *table,
+						     const struct nlattr *nla, u8 genmask)
+{
+	struct nft_set *set;
+
+	if (nla == NULL)
+		return ERR_PTR(-EINVAL);
+
+	list_for_each_entry(set, &table->sets, list) {
+		if (be64_to_cpu(nla_get_be64(nla)) == set->handle &&
+		    nft_active_genmask(set, genmask))
+			return set;
+	}
+	return ERR_PTR(-ENOENT);
+}
+
 static struct nft_set *nf_tables_set_lookup_byid(const struct net *net,
 						 const struct nlattr *nla,
 						 u8 genmask)
@@ -2661,6 +2726,9 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
 		goto nla_put_failure;
 	if (nla_put_string(skb, NFTA_SET_NAME, set->name))
 		goto nla_put_failure;
+	if (nla_put_be64(skb, NFTA_SET_HANDLE, cpu_to_be64(set->handle),
+			 NFTA_SET_PAD))
+		goto nla_put_failure;
 	if (set->flags != 0)
 		if (nla_put_be32(skb, NFTA_SET_FLAGS, htonl(set->flags)))
 			goto nla_put_failure;
@@ -3069,6 +3137,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
 	set->udata  = udata;
 	set->timeout = timeout;
 	set->gc_int = gc_int;
+	set->handle = nf_tables_alloc_handle(table);
 
 	err = ops->init(set, &desc, nla);
 	if (err < 0)
@@ -3126,7 +3195,10 @@ static int nf_tables_delset(struct net *net, struct sock *nlsk,
 	if (err < 0)
 		return err;
 
-	set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME], genmask);
+	if (nla[NFTA_SET_HANDLE])
+		set = nf_tables_set_lookup_byhandle(ctx.table, nla[NFTA_SET_HANDLE], genmask);
+	else
+		set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME], genmask);
 	if (IS_ERR(set))
 		return PTR_ERR(set);
 
@@ -4256,6 +4328,21 @@ struct nft_object *nf_tables_obj_lookup(const struct nft_table *table,
 }
 EXPORT_SYMBOL_GPL(nf_tables_obj_lookup);
 
+struct nft_object *nf_tables_obj_lookup_byhandle(const struct nft_table *table,
+						 const struct nlattr *nla,
+						 u32 objtype, u8 genmask)
+{
+	struct nft_object *obj;
+
+	list_for_each_entry(obj, &table->objects, list) {
+		if (be64_to_cpu(nla_get_be64(nla)) == obj->handle &&
+		    objtype == obj->ops->type->type &&
+		    nft_active_genmask(obj, genmask))
+			return obj;
+	}
+	return ERR_PTR(-ENOENT);
+}
+
 static const struct nla_policy nft_obj_policy[NFTA_OBJ_MAX + 1] = {
 	[NFTA_OBJ_TABLE]	= { .type = NLA_STRING,
 				    .len = NFT_TABLE_MAXNAMELEN - 1 },
@@ -4263,6 +4350,7 @@ static const struct nla_policy nft_obj_policy[NFTA_OBJ_MAX + 1] = {
 				    .len = NFT_OBJ_MAXNAMELEN - 1 },
 	[NFTA_OBJ_TYPE]		= { .type = NLA_U32 },
 	[NFTA_OBJ_DATA]		= { .type = NLA_NESTED },
+	[NFTA_OBJ_HANDLE]	= { .type = NLA_U64},
 };
 
 static struct nft_object *nft_obj_init(const struct nft_ctx *ctx,
@@ -4410,6 +4498,8 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk,
 		goto err1;
 	}
 	obj->table = table;
+	obj->handle = nf_tables_alloc_handle(table);
+
 	obj->name = nla_strdup(nla[NFTA_OBJ_NAME], GFP_KERNEL);
 	if (!obj->name) {
 		err = -ENOMEM;
@@ -4456,7 +4546,9 @@ static int nf_tables_fill_obj_info(struct sk_buff *skb, struct net *net,
 	    nla_put_string(skb, NFTA_OBJ_NAME, obj->name) ||
 	    nla_put_be32(skb, NFTA_OBJ_TYPE, htonl(obj->ops->type->type)) ||
 	    nla_put_be32(skb, NFTA_OBJ_USE, htonl(obj->use)) ||
-	    nft_object_dump(skb, NFTA_OBJ_DATA, obj, reset))
+	    nft_object_dump(skb, NFTA_OBJ_DATA, obj, reset) ||
+	    nla_put_be64(skb, NFTA_OBJ_HANDLE, cpu_to_be64(obj->handle),
+			 NFTA_OBJ_PAD))
 		goto nla_put_failure;
 
 	nlmsg_end(skb, nlh);
@@ -4654,7 +4746,7 @@ static int nf_tables_delobj(struct net *net, struct sock *nlsk,
 	u32 objtype;
 
 	if (!nla[NFTA_OBJ_TYPE] ||
-	    !nla[NFTA_OBJ_NAME])
+	    (!nla[NFTA_OBJ_NAME] && !nla[NFTA_OBJ_HANDLE]))
 		return -EINVAL;
 
 	table = nf_tables_table_lookup(net, nla[NFTA_OBJ_TABLE], family,
@@ -4663,7 +4755,12 @@ static int nf_tables_delobj(struct net *net, struct sock *nlsk,
 		return PTR_ERR(table);
 
 	objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE]));
-	obj = nf_tables_obj_lookup(table, nla[NFTA_OBJ_NAME], objtype, genmask);
+	if (nla[NFTA_OBJ_HANDLE])
+		obj = nf_tables_obj_lookup_byhandle(table, nla[NFTA_OBJ_HANDLE],
+						    objtype, genmask);
+	else
+		obj = nf_tables_obj_lookup(table, nla[NFTA_OBJ_NAME],
+					   objtype, genmask);
 	if (IS_ERR(obj))
 		return PTR_ERR(obj);
 	if (obj->use > 0)
@@ -4735,6 +4832,7 @@ static const struct nla_policy nft_flowtable_policy[NFTA_FLOWTABLE_MAX + 1] = {
 	[NFTA_FLOWTABLE_NAME]		= { .type = NLA_STRING,
 					    .len = NFT_NAME_MAXLEN - 1 },
 	[NFTA_FLOWTABLE_HOOK]		= { .type = NLA_NESTED },
+	[NFTA_FLOWTABLE_HANDLE]		= { .type = NLA_U64 },
 };
 
 struct nft_flowtable *nf_tables_flowtable_lookup(const struct nft_table *table,
@@ -4752,6 +4850,20 @@ struct nft_flowtable *nf_tables_flowtable_lookup(const struct nft_table *table,
 }
 EXPORT_SYMBOL_GPL(nf_tables_flowtable_lookup);
 
+struct nft_flowtable *
+nf_tables_flowtable_lookup_byhandle(const struct nft_table *table,
+				    const struct nlattr *nla, u8 genmask)
+{
+       struct nft_flowtable *flowtable;
+
+       list_for_each_entry(flowtable, &table->flowtables, list) {
+               if (be64_to_cpu(nla_get_be64(nla)) == flowtable->handle &&
+                   nft_active_genmask(flowtable, genmask))
+                       return flowtable;
+       }
+       return ERR_PTR(-ENOENT);
+}
+
 #define NFT_FLOWTABLE_DEVICE_MAX	8
 
 static int nf_tables_parse_devices(const struct nft_ctx *ctx,
@@ -4960,6 +5072,8 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk,
 		return -ENOMEM;
 
 	flowtable->table = table;
+	flowtable->handle = nf_tables_alloc_handle(table);
+
 	flowtable->name = nla_strdup(nla[NFTA_FLOWTABLE_NAME], GFP_KERNEL);
 	if (!flowtable->name) {
 		err = -ENOMEM;
@@ -5034,8 +5148,14 @@ static int nf_tables_delflowtable(struct net *net, struct sock *nlsk,
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
-	flowtable = nf_tables_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME],
-					       genmask);
+	if (nla[NFTA_FLOWTABLE_HANDLE])
+		flowtable = nf_tables_flowtable_lookup_byhandle(table,
+								nla[NFTA_FLOWTABLE_HANDLE],
+								genmask);
+	else
+		flowtable = nf_tables_flowtable_lookup(table,
+						       nla[NFTA_FLOWTABLE_NAME],
+						       genmask);
 	if (IS_ERR(flowtable))
                 return PTR_ERR(flowtable);
 	if (flowtable->use > 0)
@@ -5068,7 +5188,9 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net,
 
 	if (nla_put_string(skb, NFTA_FLOWTABLE_TABLE, flowtable->table->name) ||
 	    nla_put_string(skb, NFTA_FLOWTABLE_NAME, flowtable->name) ||
-	    nla_put_be32(skb, NFTA_FLOWTABLE_USE, htonl(flowtable->use)))
+	    nla_put_be32(skb, NFTA_FLOWTABLE_USE, htonl(flowtable->use)) ||
+	    nla_put_be64(skb, NFTA_FLOWTABLE_HANDLE, cpu_to_be64(flowtable->handle),
+			 NFTA_FLOWTABLE_PAD))
 		goto nla_put_failure;
 
 	nest = nla_nest_start(skb, NFTA_FLOWTABLE_HOOK);
-- 
cgit v1.2.3


From 4db5a802e565f0a60e08bd39a055f0095689802b Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
Date: Tue, 16 Jan 2018 23:01:57 +0100
Subject: l2tp: mark L2TP_ATTR_L2SPEC_LEN as not used

Reviewed-by: Guillaume Nault <g.nault@alphalink.fr>
Tested-by: Guillaume Nault <g.nault@alphalink.fr>
Signed-off-by: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/l2tp.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/l2tp.h b/include/uapi/linux/l2tp.h
index 71e62795104d..7d570c7bd117 100644
--- a/include/uapi/linux/l2tp.h
+++ b/include/uapi/linux/l2tp.h
@@ -97,7 +97,7 @@ enum {
 	L2TP_ATTR_OFFSET,		/* u16 (not used) */
 	L2TP_ATTR_DATA_SEQ,		/* u16 */
 	L2TP_ATTR_L2SPEC_TYPE,		/* u8, enum l2tp_l2spec_type */
-	L2TP_ATTR_L2SPEC_LEN,		/* u8, enum l2tp_l2spec_type */
+	L2TP_ATTR_L2SPEC_LEN,		/* u8 (not used) */
 	L2TP_ATTR_PROTO_VERSION,	/* u8 */
 	L2TP_ATTR_IFNAME,		/* string */
 	L2TP_ATTR_CONN_ID,		/* u32 */
-- 
cgit v1.2.3


From c5cc1f4df6b16646f8fae7aab523c1820bf916e8 Mon Sep 17 00:00:00 2001
From: Thiago Jung Bauermann <bauerman@linux.vnet.ibm.com>
Date: Thu, 18 Jan 2018 17:50:43 -0800
Subject: powerpc/ptrace: Add memory protection key regset

The AMR/IAMR/UAMOR are part of the program context.
Allow them to be accessed via ptrace and through core files.

Signed-off-by: Ram Pai <linuxram@us.ibm.com>
Signed-off-by: Thiago Jung Bauermann <bauerman@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/pkeys.h    |  5 +++
 arch/powerpc/include/uapi/asm/elf.h |  1 +
 arch/powerpc/kernel/ptrace.c        | 66 +++++++++++++++++++++++++++++++++++++
 arch/powerpc/kernel/traps.c         |  7 ++++
 include/uapi/linux/elf.h            |  1 +
 5 files changed, 80 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/arch/powerpc/include/asm/pkeys.h b/arch/powerpc/include/asm/pkeys.h
index 2298771b066b..c3cbad824e5a 100644
--- a/arch/powerpc/include/asm/pkeys.h
+++ b/arch/powerpc/include/asm/pkeys.h
@@ -202,6 +202,11 @@ static inline int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
 	return __arch_set_user_pkey_access(tsk, pkey, init_val);
 }
 
+static inline bool arch_pkeys_enabled(void)
+{
+	return !static_branch_likely(&pkey_disabled);
+}
+
 extern void pkey_mm_init(struct mm_struct *mm);
 extern void thread_pkey_regs_save(struct thread_struct *thread);
 extern void thread_pkey_regs_restore(struct thread_struct *new_thread,
diff --git a/arch/powerpc/include/uapi/asm/elf.h b/arch/powerpc/include/uapi/asm/elf.h
index 5f201d40bcca..860c59291bfc 100644
--- a/arch/powerpc/include/uapi/asm/elf.h
+++ b/arch/powerpc/include/uapi/asm/elf.h
@@ -97,6 +97,7 @@
 #define ELF_NTMSPRREG	3	/* include tfhar, tfiar, texasr */
 #define ELF_NEBB	3	/* includes ebbrr, ebbhr, bescr */
 #define ELF_NPMU	5	/* includes siar, sdar, sier, mmcr2, mmcr0 */
+#define ELF_NPKEY	3	/* includes amr, iamr, uamor */
 
 typedef unsigned long elf_greg_t64;
 typedef elf_greg_t64 elf_gregset_t64[ELF_NGREG];
diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c
index aef08e579946..ca72d7391d40 100644
--- a/arch/powerpc/kernel/ptrace.c
+++ b/arch/powerpc/kernel/ptrace.c
@@ -35,6 +35,7 @@
 #include <linux/context_tracking.h>
 
 #include <linux/uaccess.h>
+#include <linux/pkeys.h>
 #include <asm/page.h>
 #include <asm/pgtable.h>
 #include <asm/switch_to.h>
@@ -1787,6 +1788,61 @@ static int pmu_set(struct task_struct *target,
 	return ret;
 }
 #endif
+
+#ifdef CONFIG_PPC_MEM_KEYS
+static int pkey_active(struct task_struct *target,
+		       const struct user_regset *regset)
+{
+	if (!arch_pkeys_enabled())
+		return -ENODEV;
+
+	return regset->n;
+}
+
+static int pkey_get(struct task_struct *target,
+		    const struct user_regset *regset,
+		    unsigned int pos, unsigned int count,
+		    void *kbuf, void __user *ubuf)
+{
+	BUILD_BUG_ON(TSO(amr) + sizeof(unsigned long) != TSO(iamr));
+	BUILD_BUG_ON(TSO(iamr) + sizeof(unsigned long) != TSO(uamor));
+
+	if (!arch_pkeys_enabled())
+		return -ENODEV;
+
+	return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+				   &target->thread.amr, 0,
+				   ELF_NPKEY * sizeof(unsigned long));
+}
+
+static int pkey_set(struct task_struct *target,
+		      const struct user_regset *regset,
+		      unsigned int pos, unsigned int count,
+		      const void *kbuf, const void __user *ubuf)
+{
+	u64 new_amr;
+	int ret;
+
+	if (!arch_pkeys_enabled())
+		return -ENODEV;
+
+	/* Only the AMR can be set from userspace */
+	if (pos != 0 || count != sizeof(new_amr))
+		return -EINVAL;
+
+	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+				 &new_amr, 0, sizeof(new_amr));
+	if (ret)
+		return ret;
+
+	/* UAMOR determines which bits of the AMR can be set from userspace. */
+	target->thread.amr = (new_amr & target->thread.uamor) |
+		(target->thread.amr & ~target->thread.uamor);
+
+	return 0;
+}
+#endif /* CONFIG_PPC_MEM_KEYS */
+
 /*
  * These are our native regset flavors.
  */
@@ -1821,6 +1877,9 @@ enum powerpc_regset {
 	REGSET_EBB,		/* EBB registers */
 	REGSET_PMR,		/* Performance Monitor Registers */
 #endif
+#ifdef CONFIG_PPC_MEM_KEYS
+	REGSET_PKEY,		/* AMR register */
+#endif
 };
 
 static const struct user_regset native_regsets[] = {
@@ -1926,6 +1985,13 @@ static const struct user_regset native_regsets[] = {
 		.active = pmu_active, .get = pmu_get, .set = pmu_set
 	},
 #endif
+#ifdef CONFIG_PPC_MEM_KEYS
+	[REGSET_PKEY] = {
+		.core_note_type = NT_PPC_PKEY, .n = ELF_NPKEY,
+		.size = sizeof(u64), .align = sizeof(u64),
+		.active = pkey_active, .get = pkey_get, .set = pkey_set
+	},
+#endif
 };
 
 static const struct user_regset_view user_ppc_native_view = {
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 4b1a8e2ec023..122a3c883f4e 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -292,6 +292,13 @@ void _exception_pkey(int signr, struct pt_regs *regs, int code,
 		local_irq_enable();
 
 	current->thread.trap_nr = code;
+
+	/*
+	 * Save all the pkey registers AMR/IAMR/UAMOR. Eg: Core dumps need
+	 * to capture the content, if the task gets killed.
+	 */
+	thread_pkey_regs_save(&current->thread);
+
 	memset(&info, 0, sizeof(info));
 	info.si_signo = signr;
 	info.si_code = code;
diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h
index bb6836986200..3bf73fb58045 100644
--- a/include/uapi/linux/elf.h
+++ b/include/uapi/linux/elf.h
@@ -396,6 +396,7 @@ typedef struct elf64_shdr {
 #define NT_PPC_TM_CTAR	0x10d		/* TM checkpointed Target Address Register */
 #define NT_PPC_TM_CPPR	0x10e		/* TM checkpointed Program Priority Register */
 #define NT_PPC_TM_CDSCR	0x10f		/* TM checkpointed Data Stream Control Register */
+#define NT_PPC_PKEY	0x110		/* Memory Protection Keys registers */
 #define NT_386_TLS	0x200		/* i386 TLS slots (struct user_desc) */
 #define NT_386_IOPERM	0x201		/* x86 io permission bitmap (1=deny) */
 #define NT_X86_XSTATE	0x202		/* x86 extended state using xsave */
-- 
cgit v1.2.3


From 35b3fde6203b932b2b1a5b53b3d8808abc9c4f60 Mon Sep 17 00:00:00 2001
From: Christian Borntraeger <borntraeger@de.ibm.com>
Date: Wed, 17 Jan 2018 14:44:34 +0100
Subject: KVM: s390: wire up bpb feature
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The new firmware interfaces for branch prediction behaviour changes
are transparently available for the guest. Nevertheless, there is
new state attached that should be migrated and properly reset.
Provide a mechanism for handling reset, migration and VSIE.

Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
[Changed capability number to 152. - Radim]
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 arch/s390/include/asm/kvm_host.h |  3 ++-
 arch/s390/include/uapi/asm/kvm.h |  5 ++++-
 arch/s390/kvm/kvm-s390.c         | 12 ++++++++++++
 arch/s390/kvm/vsie.c             | 10 ++++++++++
 include/uapi/linux/kvm.h         |  1 +
 5 files changed, 29 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index e14f381757f6..c1b0a9ac1dc8 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -207,7 +207,8 @@ struct kvm_s390_sie_block {
 	__u16	ipa;			/* 0x0056 */
 	__u32	ipb;			/* 0x0058 */
 	__u32	scaoh;			/* 0x005c */
-	__u8	reserved60;		/* 0x0060 */
+#define FPF_BPBC 	0x20
+	__u8	fpf;			/* 0x0060 */
 #define ECB_GS		0x40
 #define ECB_TE		0x10
 #define ECB_SRSI	0x04
diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h
index 38535a57fef8..4cdaa55fabfe 100644
--- a/arch/s390/include/uapi/asm/kvm.h
+++ b/arch/s390/include/uapi/asm/kvm.h
@@ -224,6 +224,7 @@ struct kvm_guest_debug_arch {
 #define KVM_SYNC_RICCB  (1UL << 7)
 #define KVM_SYNC_FPRS   (1UL << 8)
 #define KVM_SYNC_GSCB   (1UL << 9)
+#define KVM_SYNC_BPBC   (1UL << 10)
 /* length and alignment of the sdnx as a power of two */
 #define SDNXC 8
 #define SDNXL (1UL << SDNXC)
@@ -247,7 +248,9 @@ struct kvm_sync_regs {
 	};
 	__u8  reserved[512];	/* for future vector expansion */
 	__u32 fpc;		/* valid on KVM_SYNC_VRS or KVM_SYNC_FPRS */
-	__u8 padding1[52];	/* riccb needs to be 64byte aligned */
+	__u8 bpbc : 1;		/* bp mode */
+	__u8 reserved2 : 7;
+	__u8 padding1[51];	/* riccb needs to be 64byte aligned */
 	__u8 riccb[64];		/* runtime instrumentation controls block */
 	__u8 padding2[192];	/* sdnx needs to be 256byte aligned */
 	union {
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 2c93cbbcd15e..2598cf243b86 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -421,6 +421,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_S390_GS:
 		r = test_facility(133);
 		break;
+	case KVM_CAP_S390_BPB:
+		r = test_facility(82);
+		break;
 	default:
 		r = 0;
 	}
@@ -2198,6 +2201,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 	kvm_s390_set_prefix(vcpu, 0);
 	if (test_kvm_facility(vcpu->kvm, 64))
 		vcpu->run->kvm_valid_regs |= KVM_SYNC_RICCB;
+	if (test_kvm_facility(vcpu->kvm, 82))
+		vcpu->run->kvm_valid_regs |= KVM_SYNC_BPBC;
 	if (test_kvm_facility(vcpu->kvm, 133))
 		vcpu->run->kvm_valid_regs |= KVM_SYNC_GSCB;
 	/* fprs can be synchronized via vrs, even if the guest has no vx. With
@@ -2339,6 +2344,7 @@ static void kvm_s390_vcpu_initial_reset(struct kvm_vcpu *vcpu)
 	current->thread.fpu.fpc = 0;
 	vcpu->arch.sie_block->gbea = 1;
 	vcpu->arch.sie_block->pp = 0;
+	vcpu->arch.sie_block->fpf &= ~FPF_BPBC;
 	vcpu->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID;
 	kvm_clear_async_pf_completion_queue(vcpu);
 	if (!kvm_s390_user_cpu_state_ctrl(vcpu->kvm))
@@ -3298,6 +3304,11 @@ static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		vcpu->arch.sie_block->ecd |= ECD_HOSTREGMGMT;
 		vcpu->arch.gs_enabled = 1;
 	}
+	if ((kvm_run->kvm_dirty_regs & KVM_SYNC_BPBC) &&
+	    test_kvm_facility(vcpu->kvm, 82)) {
+		vcpu->arch.sie_block->fpf &= ~FPF_BPBC;
+		vcpu->arch.sie_block->fpf |= kvm_run->s.regs.bpbc ? FPF_BPBC : 0;
+	}
 	save_access_regs(vcpu->arch.host_acrs);
 	restore_access_regs(vcpu->run->s.regs.acrs);
 	/* save host (userspace) fprs/vrs */
@@ -3344,6 +3355,7 @@ static void store_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	kvm_run->s.regs.pft = vcpu->arch.pfault_token;
 	kvm_run->s.regs.pfs = vcpu->arch.pfault_select;
 	kvm_run->s.regs.pfc = vcpu->arch.pfault_compare;
+	kvm_run->s.regs.bpbc = (vcpu->arch.sie_block->fpf & FPF_BPBC) == FPF_BPBC;
 	save_access_regs(vcpu->run->s.regs.acrs);
 	restore_access_regs(vcpu->arch.host_acrs);
 	/* Save guest register state */
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index 5d6ae0326d9e..751348348477 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -223,6 +223,12 @@ static void unshadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 	memcpy(scb_o->gcr, scb_s->gcr, 128);
 	scb_o->pp = scb_s->pp;
 
+	/* branch prediction */
+	if (test_kvm_facility(vcpu->kvm, 82)) {
+		scb_o->fpf &= ~FPF_BPBC;
+		scb_o->fpf |= scb_s->fpf & FPF_BPBC;
+	}
+
 	/* interrupt intercept */
 	switch (scb_s->icptcode) {
 	case ICPT_PROGI:
@@ -265,6 +271,7 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 	scb_s->ecb3 = 0;
 	scb_s->ecd = 0;
 	scb_s->fac = 0;
+	scb_s->fpf = 0;
 
 	rc = prepare_cpuflags(vcpu, vsie_page);
 	if (rc)
@@ -324,6 +331,9 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 			prefix_unmapped(vsie_page);
 		scb_s->ecb |= scb_o->ecb & ECB_TE;
 	}
+	/* branch prediction */
+	if (test_kvm_facility(vcpu->kvm, 82))
+		scb_s->fpf |= scb_o->fpf & FPF_BPBC;
 	/* SIMD */
 	if (test_kvm_facility(vcpu->kvm, 129)) {
 		scb_s->eca |= scb_o->eca & ECA_VX;
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 7a99b98cf88e..8fb90a0819c3 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -933,6 +933,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_HYPERV_VP_INDEX 149
 #define KVM_CAP_S390_AIS_MIGRATION 150
 #define KVM_CAP_PPC_GET_CPU_CHAR 151
+#define KVM_CAP_S390_BPB 152
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
-- 
cgit v1.2.3


From ad8bc4d005576e3f380ba2dab24c183519f4e9fa Mon Sep 17 00:00:00 2001
From: Anand Jain <anand.jain@oracle.com>
Date: Wed, 6 Dec 2017 11:40:10 +0800
Subject: btrfs: put btrfs_ioctl_vol_args_v2 related defines together

Just a code spatial rearrangement, no functional change.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 include/uapi/linux/btrfs.h | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
index ce615b75e855..c8d99b9ca550 100644
--- a/include/uapi/linux/btrfs.h
+++ b/include/uapi/linux/btrfs.h
@@ -33,7 +33,12 @@ struct btrfs_ioctl_vol_args {
 	char name[BTRFS_PATH_NAME_MAX + 1];
 };
 
-#define BTRFS_DEVICE_PATH_NAME_MAX 1024
+#define BTRFS_DEVICE_PATH_NAME_MAX	1024
+#define BTRFS_SUBVOL_NAME_MAX 		4039
+
+#define BTRFS_SUBVOL_CREATE_ASYNC	(1ULL << 0)
+#define BTRFS_SUBVOL_RDONLY		(1ULL << 1)
+#define BTRFS_SUBVOL_QGROUP_INHERIT	(1ULL << 2)
 
 #define BTRFS_DEVICE_SPEC_BY_ID		(1ULL << 3)
 
@@ -101,11 +106,7 @@ struct btrfs_ioctl_qgroup_limit_args {
  * - BTRFS_IOC_SUBVOL_GETFLAGS
  * - BTRFS_IOC_SUBVOL_SETFLAGS
  */
-#define BTRFS_SUBVOL_CREATE_ASYNC	(1ULL << 0)
-#define BTRFS_SUBVOL_RDONLY		(1ULL << 1)
-#define BTRFS_SUBVOL_QGROUP_INHERIT	(1ULL << 2)
 
-#define BTRFS_SUBVOL_NAME_MAX 4039
 struct btrfs_ioctl_vol_args_v2 {
 	__s64 fd;
 	__u64 transid;
-- 
cgit v1.2.3


From e2731e55884f2138a252b0a3d7b24d57e49c3c59 Mon Sep 17 00:00:00 2001
From: Anand Jain <Anand.Jain@oracle.com>
Date: Tue, 9 Jan 2018 09:05:41 +0800
Subject: btrfs: define SUPER_FLAG_METADUMP_V2

btrfs-progs uses super flag bit BTRFS_SUPER_FLAG_METADUMP_V2 (1ULL << 34).
So just define that in the kernel so that we know it's been used.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/disk-io.c              | 3 ++-
 include/uapi/linux/btrfs_tree.h | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 73ab44159d82..1916016e9fbb 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -61,7 +61,8 @@
 				 BTRFS_HEADER_FLAG_RELOC |\
 				 BTRFS_SUPER_FLAG_ERROR |\
 				 BTRFS_SUPER_FLAG_SEEDING |\
-				 BTRFS_SUPER_FLAG_METADUMP)
+				 BTRFS_SUPER_FLAG_METADUMP |\
+				 BTRFS_SUPER_FLAG_METADUMP_V2)
 
 static const struct extent_io_ops btree_extent_io_ops;
 static void end_workqueue_fn(struct btrfs_work *work);
diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h
index 6d6e5da51527..38ab0e06259a 100644
--- a/include/uapi/linux/btrfs_tree.h
+++ b/include/uapi/linux/btrfs_tree.h
@@ -456,6 +456,7 @@ struct btrfs_free_space_header {
 
 #define BTRFS_SUPER_FLAG_SEEDING	(1ULL << 32)
 #define BTRFS_SUPER_FLAG_METADUMP	(1ULL << 33)
+#define BTRFS_SUPER_FLAG_METADUMP_V2	(1ULL << 34)
 
 
 /*
-- 
cgit v1.2.3


From 98820a7e244b17b8a4d9e9d1ff9d3b4e5bfca58b Mon Sep 17 00:00:00 2001
From: Anand Jain <Anand.Jain@oracle.com>
Date: Tue, 9 Jan 2018 09:05:42 +0800
Subject: btrfs: add support for SUPER_FLAG_CHANGING_FSID

The UUID change by btrfstune sets SUPER_FLAG_CHANGING_FSID and resets it
only when changing the fsid is complete. It's not a good idea to mount the
device at any point in between, as reading metadata blocks would fail
with a UUID mismatch.

This patch doesn't add SUPER_FLAG_CHANGING_FSID into
BTRFS_SUPER_FLAG_SUPP list, so mount will fail (along with the fix in
the next patch) when SUPER_FLAG_CHANGING_FSID is set.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Reviewed-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
[ update changelog ]
Signed-off-by: David Sterba <dsterba@suse.com>
---
 include/uapi/linux/btrfs_tree.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h
index 38ab0e06259a..aff1356c2bb8 100644
--- a/include/uapi/linux/btrfs_tree.h
+++ b/include/uapi/linux/btrfs_tree.h
@@ -457,6 +457,7 @@ struct btrfs_free_space_header {
 #define BTRFS_SUPER_FLAG_SEEDING	(1ULL << 32)
 #define BTRFS_SUPER_FLAG_METADUMP	(1ULL << 33)
 #define BTRFS_SUPER_FLAG_METADUMP_V2	(1ULL << 34)
+#define BTRFS_SUPER_FLAG_CHANGING_FSID	(1ULL << 35)
 
 
 /*
-- 
cgit v1.2.3


From e8660ded7f5a9889395d33ce3d5e8c729a462bf5 Mon Sep 17 00:00:00 2001
From: Sabrina Dubroca <sd@queasysnail.net>
Date: Thu, 18 Jan 2018 17:48:18 +0100
Subject: macsec: restore uAPI after addition of GCM-AES-256

Commit ccfdec908922 ("macsec: Add support for GCM-AES-256 cipher suite")
changed a few values in the uapi headers for MACsec.

Because of existing userspace implementations, we need to preserve the
value of MACSEC_DEFAULT_CIPHER_ID. Not doing that resulted in
wpa_supplicant segfaults when a secure channel was created using the
default cipher. Thus, swap MACSEC_DEFAULT_CIPHER_{ID,ALT} back to their
original values.

Changing the maximum length of the MACSEC_SA_ATTR_KEY attribute is
unnecessary, as the previous value (MACSEC_MAX_KEY_LEN, which was 128B)
is large enough to carry 32-byte keys. This patch reverts
MACSEC_MAX_KEY_LEN to 128B and restores the old length check on
MACSEC_SA_ATTR_KEY.

Fixes: ccfdec908922 ("macsec: Add support for GCM-AES-256 cipher suite")
Signed-off-by: Davide Caratti <dcaratti@redhat.com>
Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/macsec.c           | 12 +++++-------
 include/uapi/linux/if_macsec.h |  6 +++---
 2 files changed, 8 insertions(+), 10 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c
index f522715c6595..7de88b33d5b9 100644
--- a/drivers/net/macsec.c
+++ b/drivers/net/macsec.c
@@ -396,8 +396,6 @@ static struct macsec_cb *macsec_skb_cb(struct sk_buff *skb)
 #define MACSEC_GCM_AES_128_SAK_LEN 16
 #define MACSEC_GCM_AES_256_SAK_LEN 32
 
-#define MAX_SAK_LEN MACSEC_GCM_AES_256_SAK_LEN
-
 #define DEFAULT_SAK_LEN MACSEC_GCM_AES_128_SAK_LEN
 #define DEFAULT_SEND_SCI true
 #define DEFAULT_ENCRYPT false
@@ -1605,7 +1603,7 @@ static const struct nla_policy macsec_genl_sa_policy[NUM_MACSEC_SA_ATTR] = {
 	[MACSEC_SA_ATTR_KEYID] = { .type = NLA_BINARY,
 				   .len = MACSEC_KEYID_LEN, },
 	[MACSEC_SA_ATTR_KEY] = { .type = NLA_BINARY,
-				 .len = MAX_SAK_LEN, },
+				 .len = MACSEC_MAX_KEY_LEN, },
 };
 
 static int parse_sa_config(struct nlattr **attrs, struct nlattr **tb_sa)
@@ -2374,7 +2372,7 @@ static int nla_put_secy(struct macsec_secy *secy, struct sk_buff *skb)
 
 	switch (secy->key_len) {
 	case MACSEC_GCM_AES_128_SAK_LEN:
-		csid = MACSEC_CIPHER_ID_GCM_AES_128;
+		csid = MACSEC_DEFAULT_CIPHER_ID;
 		break;
 	case MACSEC_GCM_AES_256_SAK_LEN:
 		csid = MACSEC_CIPHER_ID_GCM_AES_256;
@@ -3076,7 +3074,7 @@ static int macsec_changelink_common(struct net_device *dev,
 	if (data[IFLA_MACSEC_CIPHER_SUITE]) {
 		switch (nla_get_u64(data[IFLA_MACSEC_CIPHER_SUITE])) {
 		case MACSEC_CIPHER_ID_GCM_AES_128:
-		case MACSEC_DEFAULT_CIPHER_ALT:
+		case MACSEC_DEFAULT_CIPHER_ID:
 			secy->key_len = MACSEC_GCM_AES_128_SAK_LEN;
 			break;
 		case MACSEC_CIPHER_ID_GCM_AES_256:
@@ -3355,7 +3353,7 @@ static int macsec_validate_attr(struct nlattr *tb[], struct nlattr *data[],
 	switch (csid) {
 	case MACSEC_CIPHER_ID_GCM_AES_128:
 	case MACSEC_CIPHER_ID_GCM_AES_256:
-	case MACSEC_DEFAULT_CIPHER_ALT:
+	case MACSEC_DEFAULT_CIPHER_ID:
 		if (icv_len < MACSEC_MIN_ICV_LEN ||
 		    icv_len > MACSEC_STD_ICV_LEN)
 			return -EINVAL;
@@ -3428,7 +3426,7 @@ static int macsec_fill_info(struct sk_buff *skb,
 
 	switch (secy->key_len) {
 	case MACSEC_GCM_AES_128_SAK_LEN:
-		csid = MACSEC_CIPHER_ID_GCM_AES_128;
+		csid = MACSEC_DEFAULT_CIPHER_ID;
 		break;
 	case MACSEC_GCM_AES_256_SAK_LEN:
 		csid = MACSEC_CIPHER_ID_GCM_AES_256;
diff --git a/include/uapi/linux/if_macsec.h b/include/uapi/linux/if_macsec.h
index 2e522835a4af..98e4d5d7c45c 100644
--- a/include/uapi/linux/if_macsec.h
+++ b/include/uapi/linux/if_macsec.h
@@ -18,7 +18,7 @@
 #define MACSEC_GENL_NAME "macsec"
 #define MACSEC_GENL_VERSION 1
 
-#define MACSEC_MAX_KEY_LEN 256
+#define MACSEC_MAX_KEY_LEN 128
 
 #define MACSEC_KEYID_LEN 16
 
@@ -26,9 +26,9 @@
 #define MACSEC_CIPHER_ID_GCM_AES_128 0x0080C20001000001ULL
 #define MACSEC_CIPHER_ID_GCM_AES_256 0x0080C20001000002ULL
 
-#define MACSEC_DEFAULT_CIPHER_ID     MACSEC_CIPHER_ID_GCM_AES_128
 /* deprecated cipher ID for GCM-AES-128 */
-#define MACSEC_DEFAULT_CIPHER_ALT    0x0080020001000001ULL
+#define MACSEC_DEFAULT_CIPHER_ID     0x0080020001000001ULL
+#define MACSEC_DEFAULT_CIPHER_ALT    MACSEC_CIPHER_ID_GCM_AES_128
 
 #define MACSEC_MIN_ICV_LEN 8
 #define MACSEC_MAX_ICV_LEN 32
-- 
cgit v1.2.3


From b2d3bcfa26a7a8de41f358a6cae8b848673b3c6e Mon Sep 17 00:00:00 2001
From: David Decotigny <decot@googlers.com>
Date: Thu, 18 Jan 2018 09:59:13 -0800
Subject: net: core: Expose number of link up/down transitions

Expose the number of times the link has gone UP or DOWN, and
update the "carrier_changes" counter to be the sum of these two events.
While at it, also update the sysfs-class-net documentation to cover:
carrier_changes (3.15), carrier_up_count (4.16) and carrier_down_count
(4.16)

Signed-off-by: David Decotigny <decot@googlers.com>
[Florian:
* rebase
* add documentation
* merge carrier_changes with up/down counters]
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/ABI/testing/sysfs-class-net | 24 ++++++++++++++++++++++++
 include/linux/netdevice.h                 |  6 ++++--
 include/uapi/linux/if_link.h              |  2 ++
 net/core/net-sysfs.c                      | 25 ++++++++++++++++++++++++-
 net/core/rtnetlink.c                      | 13 +++++++++++--
 net/sched/sch_generic.c                   |  4 ++--
 6 files changed, 67 insertions(+), 7 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/ABI/testing/sysfs-class-net b/Documentation/ABI/testing/sysfs-class-net
index 6856da99b6f7..2f1788111cd9 100644
--- a/Documentation/ABI/testing/sysfs-class-net
+++ b/Documentation/ABI/testing/sysfs-class-net
@@ -259,3 +259,27 @@ Contact:	netdev@vger.kernel.org
 Description:
 		Symbolic link to the PHY device this network device is attached
 		to.
+
+What:		/sys/class/net/<iface>/carrier_changes
+Date:		Mar 2014
+KernelVersion:	3.15
+Contact:	netdev@vger.kernel.org
+Description:
+		32-bit unsigned integer counting the number of times the link has
+		seen a change from UP to DOWN and vice versa
+
+What:		/sys/class/net/<iface>/carrier_up_count
+Date:		Jan 2018
+KernelVersion:	4.16
+Contact:	netdev@vger.kernel.org
+Description:
+		32-bit unsigned integer counting the number of times the link has
+		been up
+
+What:		/sys/class/net/<iface>/carrier_down_count
+Date:		Jan 2018
+KernelVersion:	4.16
+Contact:	netdev@vger.kernel.org
+Description:
+		32-bit unsigned integer counting the number of times the link has
+		been down
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index ed0799a12bf2..837e9cb7e358 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1680,8 +1680,6 @@ struct net_device {
 	unsigned long		base_addr;
 	int			irq;
 
-	atomic_t		carrier_changes;
-
 	/*
 	 *	Some hardware also needs these fields (state,dev_list,
 	 *	napi_list,unreg_list,close_list) but they are not
@@ -1719,6 +1717,10 @@ struct net_device {
 	atomic_long_t		tx_dropped;
 	atomic_long_t		rx_nohandler;
 
+	/* Stats to monitor link on/off, flapping */
+	atomic_t		carrier_up_count;
+	atomic_t		carrier_down_count;
+
 #ifdef CONFIG_WIRELESS_EXT
 	const struct iw_handler_def *wireless_handlers;
 	struct iw_public_data	*wireless_data;
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index f8f04fed6186..8616131e2c61 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -161,6 +161,8 @@ enum {
 	IFLA_EVENT,
 	IFLA_NEW_NETNSID,
 	IFLA_IF_NETNSID,
+	IFLA_CARRIER_UP_COUNT,
+	IFLA_CARRIER_DOWN_COUNT,
 	__IFLA_MAX
 };
 
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 7bf8b85ade16..c4a28f4667b6 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -295,10 +295,31 @@ static ssize_t carrier_changes_show(struct device *dev,
 	struct net_device *netdev = to_net_dev(dev);
 
 	return sprintf(buf, fmt_dec,
-		       atomic_read(&netdev->carrier_changes));
+		       atomic_read(&netdev->carrier_up_count) +
+		       atomic_read(&netdev->carrier_down_count));
 }
 static DEVICE_ATTR_RO(carrier_changes);
 
+static ssize_t carrier_up_count_show(struct device *dev,
+				     struct device_attribute *attr,
+				     char *buf)
+{
+	struct net_device *netdev = to_net_dev(dev);
+
+	return sprintf(buf, fmt_dec, atomic_read(&netdev->carrier_up_count));
+}
+static DEVICE_ATTR_RO(carrier_up_count);
+
+static ssize_t carrier_down_count_show(struct device *dev,
+				       struct device_attribute *attr,
+				       char *buf)
+{
+	struct net_device *netdev = to_net_dev(dev);
+
+	return sprintf(buf, fmt_dec, atomic_read(&netdev->carrier_down_count));
+}
+static DEVICE_ATTR_RO(carrier_down_count);
+
 /* read-write attributes */
 
 static int change_mtu(struct net_device *dev, unsigned long new_mtu)
@@ -547,6 +568,8 @@ static struct attribute *net_class_attrs[] __ro_after_init = {
 	&dev_attr_phys_port_name.attr,
 	&dev_attr_phys_switch_id.attr,
 	&dev_attr_proto_down.attr,
+	&dev_attr_carrier_up_count.attr,
+	&dev_attr_carrier_down_count.attr,
 	NULL,
 };
 ATTRIBUTE_GROUPS(net_class);
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 16d644a4f974..97874daa1336 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -990,6 +990,8 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
 	       + nla_total_size(4)  /* IFLA_NEW_NETNSID */
 	       + nla_total_size(1)  /* IFLA_PROTO_DOWN */
 	       + nla_total_size(4)  /* IFLA_IF_NETNSID */
+	       + nla_total_size(4)  /* IFLA_CARRIER_UP_COUNT */
+	       + nla_total_size(4)  /* IFLA_CARRIER_DOWN_COUNT */
 	       + 0;
 }
 
@@ -1551,8 +1553,13 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,
 	     nla_put_string(skb, IFLA_QDISC, dev->qdisc->ops->id)) ||
 	    nla_put_ifalias(skb, dev) ||
 	    nla_put_u32(skb, IFLA_CARRIER_CHANGES,
-			atomic_read(&dev->carrier_changes)) ||
-	    nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down))
+			atomic_read(&dev->carrier_up_count) +
+			atomic_read(&dev->carrier_down_count)) ||
+	    nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down) ||
+	    nla_put_u32(skb, IFLA_CARRIER_UP_COUNT,
+			atomic_read(&dev->carrier_up_count)) ||
+	    nla_put_u32(skb, IFLA_CARRIER_DOWN_COUNT,
+			atomic_read(&dev->carrier_down_count)))
 		goto nla_put_failure;
 
 	if (event != IFLA_EVENT_NONE) {
@@ -1656,6 +1663,8 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
 	[IFLA_EVENT]		= { .type = NLA_U32 },
 	[IFLA_GROUP]		= { .type = NLA_U32 },
 	[IFLA_IF_NETNSID]	= { .type = NLA_S32 },
+	[IFLA_CARRIER_UP_COUNT]	= { .type = NLA_U32 },
+	[IFLA_CARRIER_DOWN_COUNT] = { .type = NLA_U32 },
 };
 
 static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index ef8b4ecde2ac..1816bde47256 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -510,7 +510,7 @@ void netif_carrier_on(struct net_device *dev)
 	if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
 		if (dev->reg_state == NETREG_UNINITIALIZED)
 			return;
-		atomic_inc(&dev->carrier_changes);
+		atomic_inc(&dev->carrier_up_count);
 		linkwatch_fire_event(dev);
 		if (netif_running(dev))
 			__netdev_watchdog_up(dev);
@@ -529,7 +529,7 @@ void netif_carrier_off(struct net_device *dev)
 	if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
 		if (dev->reg_state == NETREG_UNINITIALIZED)
 			return;
-		atomic_inc(&dev->carrier_changes);
+		atomic_inc(&dev->carrier_down_count);
 		linkwatch_fire_event(dev);
 	}
 }
-- 
cgit v1.2.3


From c1696fb85d33194cf65c7ebfc82a75696299c3a3 Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Wed, 17 Jan 2018 00:01:33 +0100
Subject: GFS2: Introduce new gfs2_log_header_v2

This patch adds a new structure called gfs2_log_header_v2 which is used
to store expanded fields into previously unused areas of the log headers
(i.e., this change is backwards compatible).  Some of these are used for
debug purposes so we can backtrack when problems occur.  Others are
reserved for future expansion.

This patch is based on a prototype from Steve Whitehouse.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
---
 fs/gfs2/Kconfig                  |  2 ++
 fs/gfs2/aops.c                   |  2 +-
 fs/gfs2/file.c                   |  3 +-
 fs/gfs2/glops.c                  | 13 +++----
 fs/gfs2/log.c                    | 75 ++++++++++++++++++++++++++++++----------
 fs/gfs2/log.h                    | 12 ++-----
 fs/gfs2/lops.c                   | 16 +++++----
 fs/gfs2/lops.h                   |  3 ++
 fs/gfs2/ops_fstype.c             |  2 +-
 fs/gfs2/quota.c                  |  3 +-
 fs/gfs2/recovery.c               | 17 +++++----
 fs/gfs2/rgrp.c                   |  2 +-
 fs/gfs2/super.c                  |  9 ++---
 fs/gfs2/trans.c                  |  2 +-
 include/uapi/linux/gfs2_ondisk.h | 26 ++++++++++++--
 15 files changed, 128 insertions(+), 59 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index 43c827a7cce5..c0225d4b5435 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -3,6 +3,8 @@ config GFS2_FS
 	depends on (64BIT || LBDAF)
 	select FS_POSIX_ACL
 	select CRC32
+	select CRYPTO
+	select CRYPTO_CRC32C
 	select QUOTACTL
 	select FS_IOMAP
 	help
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index ac4a1e89da1e..462c3fd55929 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -448,7 +448,7 @@ static int gfs2_jdata_writepages(struct address_space *mapping,
 
 	ret = gfs2_write_cache_jdata(mapping, wbc);
 	if (ret == 0 && wbc->sync_mode == WB_SYNC_ALL) {
-		gfs2_log_flush(sdp, ip->i_gl, NORMAL_FLUSH);
+		gfs2_log_flush(sdp, ip->i_gl, GFS2_LOG_HEAD_FLUSH_NORMAL);
 		ret = gfs2_write_cache_jdata(mapping, wbc);
 	}
 	return ret;
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index bd60dc682676..7a02b4e6e9f3 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -246,7 +246,8 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
 	}
 	if ((flags ^ new_flags) & GFS2_DIF_JDATA) {
 		if (new_flags & GFS2_DIF_JDATA)
-			gfs2_log_flush(sdp, ip->i_gl, NORMAL_FLUSH);
+			gfs2_log_flush(sdp, ip->i_gl,
+				       GFS2_LOG_HEAD_FLUSH_NORMAL);
 		error = filemap_fdatawrite(inode->i_mapping);
 		if (error)
 			goto out;
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index cdd1c5f06f45..2daab13a9e0b 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -107,7 +107,7 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
 	__gfs2_ail_flush(gl, 0, tr.tr_revokes);
 
 	gfs2_trans_end(sdp);
-	gfs2_log_flush(sdp, NULL, NORMAL_FLUSH);
+	gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL);
 }
 
 void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync)
@@ -128,7 +128,7 @@ void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync)
 		return;
 	__gfs2_ail_flush(gl, fsync, max_revokes);
 	gfs2_trans_end(sdp);
-	gfs2_log_flush(sdp, NULL, NORMAL_FLUSH);
+	gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL);
 }
 
 /**
@@ -157,7 +157,7 @@ static void rgrp_go_sync(struct gfs2_glock *gl)
 		return;
 	GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXCLUSIVE);
 
-	gfs2_log_flush(sdp, gl, NORMAL_FLUSH);
+	gfs2_log_flush(sdp, gl, GFS2_LOG_HEAD_FLUSH_NORMAL);
 	filemap_fdatawrite_range(mapping, gl->gl_vm.start, gl->gl_vm.end);
 	error = filemap_fdatawait_range(mapping, gl->gl_vm.start, gl->gl_vm.end);
 	mapping_set_error(mapping, error);
@@ -252,7 +252,7 @@ static void inode_go_sync(struct gfs2_glock *gl)
 
 	GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXCLUSIVE);
 
-	gfs2_log_flush(gl->gl_name.ln_sbd, gl, NORMAL_FLUSH);
+	gfs2_log_flush(gl->gl_name.ln_sbd, gl, GFS2_LOG_HEAD_FLUSH_NORMAL);
 	filemap_fdatawrite(metamapping);
 	if (isreg) {
 		struct address_space *mapping = ip->i_inode.i_mapping;
@@ -303,7 +303,8 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags)
 	}
 
 	if (ip == GFS2_I(gl->gl_name.ln_sbd->sd_rindex)) {
-		gfs2_log_flush(gl->gl_name.ln_sbd, NULL, NORMAL_FLUSH);
+		gfs2_log_flush(gl->gl_name.ln_sbd, NULL,
+			       GFS2_LOG_HEAD_FLUSH_NORMAL);
 		gl->gl_name.ln_sbd->sd_rindex_uptodate = 0;
 	}
 	if (ip && S_ISREG(ip->i_inode.i_mode))
@@ -495,7 +496,7 @@ static void freeze_go_sync(struct gfs2_glock *gl)
 			gfs2_assert_withdraw(sdp, 0);
 		}
 		queue_work(gfs2_freeze_wq, &sdp->sd_freeze_work);
-		gfs2_log_flush(sdp, NULL, FREEZE_FLUSH);
+		gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_FREEZE);
 	}
 }
 
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index c27cbcebfe88..a2eb13c04591 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -14,6 +14,7 @@
 #include <linux/buffer_head.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/crc32.h>
+#include <linux/crc32c.h>
 #include <linux/delay.h>
 #include <linux/kthread.h>
 #include <linux/freezer.h>
@@ -653,20 +654,25 @@ out_of_blocks:
 /**
  * write_log_header - Write a journal log header buffer at sd_log_flush_head
  * @sdp: The GFS2 superblock
+ * @jd: journal descriptor of the journal to which we are writing
  * @seq: sequence number
  * @tail: tail of the log
- * @flags: log header flags
+ * @flags: log header flags GFS2_LOG_HEAD_*
  * @op_flags: flags to pass to the bio
  *
  * Returns: the initialized log buffer descriptor
  */
 
-void gfs2_write_log_header(struct gfs2_sbd *sdp, u64 seq, u32 tail,
-			   u32 flags, int op_flags)
+void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd,
+			   u64 seq, u32 tail, u32 flags, int op_flags)
 {
 	struct gfs2_log_header *lh;
-	u32 hash;
+	u32 hash, crc;
 	struct page *page = mempool_alloc(gfs2_page_pool, GFP_NOIO);
+	struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
+	struct timespec64 tv;
+	struct super_block *sb = sdp->sd_vfs;
+	u64 addr;
 
 	lh = page_address(page);
 	clear_page(lh);
@@ -680,10 +686,39 @@ void gfs2_write_log_header(struct gfs2_sbd *sdp, u64 seq, u32 tail,
 	lh->lh_flags = cpu_to_be32(flags);
 	lh->lh_tail = cpu_to_be32(tail);
 	lh->lh_blkno = cpu_to_be32(sdp->sd_log_flush_head);
-	hash = ~crc32(~0, lh, sizeof(*lh));
+	hash = ~crc32(~0, lh, LH_V1_SIZE);
 	lh->lh_hash = cpu_to_be32(hash);
 
-	gfs2_log_write_page(sdp, page);
+	tv = current_kernel_time64();
+	lh->lh_nsec = cpu_to_be32(tv.tv_nsec);
+	lh->lh_sec = cpu_to_be64(tv.tv_sec);
+	addr = gfs2_log_bmap(sdp);
+	lh->lh_addr = cpu_to_be64(addr);
+	lh->lh_jinode = cpu_to_be64(GFS2_I(jd->jd_inode)->i_no_addr);
+
+	/* We may only write local statfs, quota, etc., when writing to our
+	   own journal. The values are left 0 when recovering a journal
+	   different from our own. */
+	if (!(flags & GFS2_LOG_HEAD_RECOVERY)) {
+		lh->lh_statfs_addr =
+			cpu_to_be64(GFS2_I(sdp->sd_sc_inode)->i_no_addr);
+		lh->lh_quota_addr =
+			cpu_to_be64(GFS2_I(sdp->sd_qc_inode)->i_no_addr);
+
+		spin_lock(&sdp->sd_statfs_spin);
+		lh->lh_local_total = cpu_to_be64(l_sc->sc_total);
+		lh->lh_local_free = cpu_to_be64(l_sc->sc_free);
+		lh->lh_local_dinodes = cpu_to_be64(l_sc->sc_dinodes);
+		spin_unlock(&sdp->sd_statfs_spin);
+	}
+
+	BUILD_BUG_ON(offsetof(struct gfs2_log_header, lh_crc) != LH_V1_SIZE);
+
+	crc = crc32c(~0, (void *)lh + LH_V1_SIZE + 4,
+		     sb->s_blocksize - LH_V1_SIZE - 4);
+	lh->lh_crc = cpu_to_be32(crc);
+
+	gfs2_log_write(sdp, page, sb->s_blocksize, 0, addr);
 	gfs2_log_flush_bio(sdp, REQ_OP_WRITE, op_flags);
 	log_flush_wait(sdp);
 }
@@ -691,6 +726,7 @@ void gfs2_write_log_header(struct gfs2_sbd *sdp, u64 seq, u32 tail,
 /**
  * log_write_header - Get and initialize a journal header buffer
  * @sdp: The GFS2 superblock
+ * @flags: The log header flags, including log header origin
  *
  * Returns: the initialized log buffer descriptor
  */
@@ -710,8 +746,8 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags)
 		op_flags = REQ_SYNC | REQ_META | REQ_PRIO;
 	}
 	sdp->sd_log_idle = (tail == sdp->sd_log_flush_head);
-	gfs2_write_log_header(sdp, sdp->sd_log_sequence++, tail, flags,
-			      op_flags);
+	gfs2_write_log_header(sdp, sdp->sd_jdesc, sdp->sd_log_sequence++, tail,
+			      flags, op_flags);
 
 	if (sdp->sd_log_tail != tail)
 		log_pull_tail(sdp, tail);
@@ -721,11 +757,11 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags)
  * gfs2_log_flush - flush incore transaction(s)
  * @sdp: the filesystem
  * @gl: The glock structure to flush.  If NULL, flush the whole incore log
+ * @flags: The log header flags: GFS2_LOG_HEAD_FLUSH_*
  *
  */
 
-void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl,
-		    enum gfs2_flush_type type)
+void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags)
 {
 	struct gfs2_trans *tr;
 	enum gfs2_freeze_state state = atomic_read(&sdp->sd_freeze_state);
@@ -739,7 +775,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl,
 	}
 	trace_gfs2_log_flush(sdp, 1);
 
-	if (type == SHUTDOWN_FLUSH)
+	if (flags & GFS2_LOG_HEAD_FLUSH_SHUTDOWN)
 		clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
 
 	sdp->sd_log_flush_head = sdp->sd_log_head;
@@ -764,11 +800,11 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl,
 
 	if (sdp->sd_log_head != sdp->sd_log_flush_head) {
 		log_flush_wait(sdp);
-		log_write_header(sdp, 0);
+		log_write_header(sdp, flags);
 	} else if (sdp->sd_log_tail != current_tail(sdp) && !sdp->sd_log_idle){
 		atomic_dec(&sdp->sd_log_blks_free); /* Adjust for unreserved buffer */
 		trace_gfs2_log_blocks(sdp, -1);
-		log_write_header(sdp, 0);
+		log_write_header(sdp, flags);
 	}
 	lops_after_commit(sdp, tr);
 
@@ -785,7 +821,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl,
 	spin_unlock(&sdp->sd_ail_lock);
 	gfs2_log_unlock(sdp);
 
-	if (type != NORMAL_FLUSH) {
+	if (!(flags & GFS2_LOG_HEAD_FLUSH_NORMAL)) {
 		if (!sdp->sd_log_idle) {
 			for (;;) {
 				gfs2_ail1_start(sdp);
@@ -795,12 +831,13 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl,
 			}
 			atomic_dec(&sdp->sd_log_blks_free); /* Adjust for unreserved buffer */
 			trace_gfs2_log_blocks(sdp, -1);
-			log_write_header(sdp, 0);
+			log_write_header(sdp, flags);
 			sdp->sd_log_head = sdp->sd_log_flush_head;
 		}
-		if (type == SHUTDOWN_FLUSH || type == FREEZE_FLUSH)
+		if (flags & (GFS2_LOG_HEAD_FLUSH_SHUTDOWN |
+			     GFS2_LOG_HEAD_FLUSH_FREEZE))
 			gfs2_log_shutdown(sdp);
-		if (type == FREEZE_FLUSH)
+		if (flags & GFS2_LOG_HEAD_FLUSH_FREEZE)
 			atomic_set(&sdp->sd_freeze_state, SFS_FROZEN);
 	}
 
@@ -956,7 +993,7 @@ int gfs2_logd(void *data)
 		did_flush = false;
 		if (gfs2_jrnl_flush_reqd(sdp) || t == 0) {
 			gfs2_ail1_empty(sdp);
-			gfs2_log_flush(sdp, NULL, NORMAL_FLUSH);
+			gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL);
 			did_flush = true;
 		}
 
@@ -964,7 +1001,7 @@ int gfs2_logd(void *data)
 			gfs2_ail1_start(sdp);
 			gfs2_ail1_wait(sdp);
 			gfs2_ail1_empty(sdp);
-			gfs2_log_flush(sdp, NULL, NORMAL_FLUSH);
+			gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL);
 			did_flush = true;
 		}
 
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index 619de9a1ff4f..93b52ac1ca1f 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -65,16 +65,10 @@ extern unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
 
 extern void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks);
 extern int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks);
-enum gfs2_flush_type {
-	NORMAL_FLUSH = 0,
-	SYNC_FLUSH,
-	SHUTDOWN_FLUSH,
-	FREEZE_FLUSH
-};
-extern void gfs2_write_log_header(struct gfs2_sbd *sdp, u64 seq, u32 tail,
-				  u32 flags, int op_flags);
+extern void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd,
+				  u64 seq, u32 tail, u32 flags, int op_flags);
 extern void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl,
-			   enum gfs2_flush_type type);
+			   u32 type);
 extern void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans);
 extern void gfs2_remove_from_ail(struct gfs2_bufdata *bd);
 extern void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc);
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index c8ff7b7954f0..4a60221c678f 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -18,6 +18,7 @@
 #include <linux/fs.h>
 #include <linux/list_sort.h>
 
+#include "dir.h"
 #include "gfs2.h"
 #include "incore.h"
 #include "inode.h"
@@ -138,7 +139,7 @@ static void gfs2_log_incr_head(struct gfs2_sbd *sdp)
 		sdp->sd_log_flush_head = 0;
 }
 
-static u64 gfs2_log_bmap(struct gfs2_sbd *sdp)
+u64 gfs2_log_bmap(struct gfs2_sbd *sdp)
 {
 	unsigned int lbn = sdp->sd_log_flush_head;
 	struct gfs2_journal_extent *je;
@@ -306,23 +307,22 @@ static struct bio *gfs2_log_get_bio(struct gfs2_sbd *sdp, u64 blkno)
 	return gfs2_log_alloc_bio(sdp, blkno);
 }
 
-
 /**
  * gfs2_log_write - write to log
  * @sdp: the filesystem
  * @page: the page to write
  * @size: the size of the data to write
  * @offset: the offset within the page 
+ * @blkno: block number of the log entry
  *
  * Try and add the page segment to the current bio. If that fails,
  * submit the current bio to the device and create a new one, and
  * then add the page segment to that.
  */
 
-static void gfs2_log_write(struct gfs2_sbd *sdp, struct page *page,
-			   unsigned size, unsigned offset)
+void gfs2_log_write(struct gfs2_sbd *sdp, struct page *page,
+		    unsigned size, unsigned offset, u64 blkno)
 {
-	u64 blkno = gfs2_log_bmap(sdp);
 	struct bio *bio;
 	int ret;
 
@@ -348,7 +348,8 @@ static void gfs2_log_write(struct gfs2_sbd *sdp, struct page *page,
 
 static void gfs2_log_write_bh(struct gfs2_sbd *sdp, struct buffer_head *bh)
 {
-	gfs2_log_write(sdp, bh->b_page, bh->b_size, bh_offset(bh));
+	gfs2_log_write(sdp, bh->b_page, bh->b_size, bh_offset(bh),
+		       gfs2_log_bmap(sdp));
 }
 
 /**
@@ -365,7 +366,8 @@ static void gfs2_log_write_bh(struct gfs2_sbd *sdp, struct buffer_head *bh)
 void gfs2_log_write_page(struct gfs2_sbd *sdp, struct page *page)
 {
 	struct super_block *sb = sdp->sd_vfs;
-	gfs2_log_write(sdp, page, sb->s_blocksize, 0);
+	gfs2_log_write(sdp, page, sb->s_blocksize, 0,
+		       gfs2_log_bmap(sdp));
 }
 
 static struct page *gfs2_get_log_desc(struct gfs2_sbd *sdp, u32 ld_type,
diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h
index e529f536c117..e4949394f054 100644
--- a/fs/gfs2/lops.h
+++ b/fs/gfs2/lops.h
@@ -26,6 +26,9 @@ extern const struct gfs2_log_operations gfs2_revoke_lops;
 extern const struct gfs2_log_operations gfs2_databuf_lops;
 
 extern const struct gfs2_log_operations *gfs2_log_ops[];
+extern u64 gfs2_log_bmap(struct gfs2_sbd *sdp);
+extern void gfs2_log_write(struct gfs2_sbd *sdp, struct page *page,
+			   unsigned size, unsigned offset, u64 blkno);
 extern void gfs2_log_write_page(struct gfs2_sbd *sdp, struct page *page);
 extern void gfs2_log_flush_bio(struct gfs2_sbd *sdp, int op, int op_flags);
 extern void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index ad55eb86a250..d6e620beb9db 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1382,7 +1382,7 @@ static void gfs2_kill_sb(struct super_block *sb)
 		return;
 	}
 
-	gfs2_log_flush(sdp, NULL, SYNC_FLUSH);
+	gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_SYNC);
 	dput(sdp->sd_root_dir);
 	dput(sdp->sd_master_dir);
 	sdp->sd_root_dir = NULL;
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index e700fb162664..2092df19e433 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -955,7 +955,8 @@ out:
 		gfs2_glock_dq_uninit(&ghs[qx]);
 	inode_unlock(&ip->i_inode);
 	kfree(ghs);
-	gfs2_log_flush(ip->i_gl->gl_name.ln_sbd, ip->i_gl, NORMAL_FLUSH);
+	gfs2_log_flush(ip->i_gl->gl_name.ln_sbd, ip->i_gl,
+		       GFS2_LOG_HEAD_FLUSH_NORMAL);
 	return error;
 }
 
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 975f32166dfe..b6b258998bcd 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -14,6 +14,7 @@
 #include <linux/buffer_head.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/crc32.h>
+#include <linux/crc32c.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -137,7 +138,7 @@ static int get_log_header(struct gfs2_jdesc *jd, unsigned int blk,
 {
 	struct gfs2_log_header *lh;
 	struct buffer_head *bh;
-	u32 hash;
+	u32 hash, crc;
 	int error;
 
 	error = gfs2_replay_read_block(jd, blk, &bh);
@@ -145,13 +146,17 @@ static int get_log_header(struct gfs2_jdesc *jd, unsigned int blk,
 		return error;
 	lh = (void *)bh->b_data;
 
-	hash = crc32(~0, lh, sizeof(*lh) - 4);
+	hash = crc32(~0, lh, LH_V1_SIZE - 4);
 	hash = ~crc32_le_shift(hash, 4);  /* assume lh_hash is zero */
 
+	crc = crc32c(~0, (void *)lh + LH_V1_SIZE + 4,
+		     bh->b_size - LH_V1_SIZE - 4);
+
 	error = lh->lh_header.mh_magic != cpu_to_be32(GFS2_MAGIC) ||
 		lh->lh_header.mh_type != cpu_to_be32(GFS2_METATYPE_LH) ||
 		be32_to_cpu(lh->lh_blkno) != blk ||
-		be32_to_cpu(lh->lh_hash) != hash;
+		be32_to_cpu(lh->lh_hash) != hash ||
+		(lh->lh_crc != 0 && be32_to_cpu(lh->lh_crc) != crc);
 
 	brelse(bh);
 
@@ -372,9 +377,9 @@ static void clean_journal(struct gfs2_jdesc *jd,
 
 	sdp->sd_log_flush_head = head->lh_blkno;
 	gfs2_replay_incr_blk(jd, &sdp->sd_log_flush_head);
-	gfs2_write_log_header(sdp, head->lh_sequence + 1, 0,
-			      GFS2_LOG_HEAD_UNMOUNT, REQ_PREFLUSH |
-			      REQ_FUA | REQ_META | REQ_SYNC);
+	gfs2_write_log_header(sdp, jd, head->lh_sequence + 1, 0,
+			      GFS2_LOG_HEAD_UNMOUNT | GFS2_LOG_HEAD_RECOVERY,
+			      REQ_PREFLUSH | REQ_FUA | REQ_META | REQ_SYNC);
 }
 
 
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 6dea72f49316..00eab6c0525c 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -2093,7 +2093,7 @@ next_rgrp:
 		}
 		/* Flushing the log may release space */
 		if (loops == 2)
-			gfs2_log_flush(sdp, NULL, NORMAL_FLUSH);
+			gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL);
 	}
 
 	return -ENOSPC;
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index d81d46e19726..fa3a19eaf0eb 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -757,7 +757,8 @@ static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc)
 	bool flush_all = (wbc->sync_mode == WB_SYNC_ALL || gfs2_is_jdata(ip));
 
 	if (flush_all)
-		gfs2_log_flush(GFS2_SB(inode), ip->i_gl, NORMAL_FLUSH);
+		gfs2_log_flush(GFS2_SB(inode), ip->i_gl,
+			       GFS2_LOG_HEAD_FLUSH_NORMAL);
 	if (bdi->wb.dirty_exceeded)
 		gfs2_ail1_flush(sdp, wbc);
 	else
@@ -853,7 +854,7 @@ static int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
 	gfs2_quota_sync(sdp->sd_vfs, 0);
 	gfs2_statfs_sync(sdp->sd_vfs, 0);
 
-	gfs2_log_flush(sdp, NULL, SHUTDOWN_FLUSH);
+	gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_SHUTDOWN);
 	wait_event(sdp->sd_reserving_log_wait, atomic_read(&sdp->sd_reserving_log) == 0);
 	gfs2_assert_warn(sdp, atomic_read(&sdp->sd_log_blks_free) == sdp->sd_jdesc->jd_blocks);
 
@@ -946,7 +947,7 @@ static int gfs2_sync_fs(struct super_block *sb, int wait)
 
 	gfs2_quota_sync(sb, -1);
 	if (wait)
-		gfs2_log_flush(sdp, NULL, NORMAL_FLUSH);
+		gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL);
 	return sdp->sd_log_error;
 }
 
@@ -1650,7 +1651,7 @@ alloc_failed:
 	goto out_unlock;
 
 out_truncate:
-	gfs2_log_flush(sdp, ip->i_gl, NORMAL_FLUSH);
+	gfs2_log_flush(sdp, ip->i_gl, GFS2_LOG_HEAD_FLUSH_NORMAL);
 	metamapping = gfs2_glock2aspace(ip->i_gl);
 	if (test_bit(GLF_DIRTY, &ip->i_gl->gl_flags)) {
 		filemap_fdatawrite(metamapping);
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index b95ebd166cac..7aec6d3434fa 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -117,7 +117,7 @@ void gfs2_trans_end(struct gfs2_sbd *sdp)
 	up_read(&sdp->sd_log_flush_lock);
 
 	if (sdp->sd_vfs->s_flags & SB_SYNCHRONOUS)
-		gfs2_log_flush(sdp, NULL, NORMAL_FLUSH);
+		gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL);
 	if (alloced)
 		sb_end_intwrite(sdp->sd_vfs);
 }
diff --git a/include/uapi/linux/gfs2_ondisk.h b/include/uapi/linux/gfs2_ondisk.h
index 09f0920f07e9..9a81d520f54a 100644
--- a/include/uapi/linux/gfs2_ondisk.h
+++ b/include/uapi/linux/gfs2_ondisk.h
@@ -403,7 +403,15 @@ struct gfs2_ea_header {
  * Log header structure
  */
 
-#define GFS2_LOG_HEAD_UNMOUNT	0x00000001	/* log is clean */
+#define GFS2_LOG_HEAD_UNMOUNT		0x00000001 /* log is clean */
+#define GFS2_LOG_HEAD_FLUSH_NORMAL	0x00000002 /* normal log flush */
+#define GFS2_LOG_HEAD_FLUSH_SYNC	0x00000004 /* Sync log flush */
+#define GFS2_LOG_HEAD_FLUSH_SHUTDOWN	0x00000008 /* Shutdown log flush */
+#define GFS2_LOG_HEAD_FLUSH_FREEZE	0x00000010 /* Freeze flush */
+#define GFS2_LOG_HEAD_RECOVERY		0x00000020 /* Journal recovery */
+#define GFS2_LOG_HEAD_USERSPACE		0x80000000 /* Written by gfs2-utils */
+
+#define LH_V1_SIZE (offsetofend(struct gfs2_log_header, lh_hash))
 
 struct gfs2_log_header {
 	struct gfs2_meta_header lh_header;
@@ -412,7 +420,21 @@ struct gfs2_log_header {
 	__be32 lh_flags;	/* GFS2_LOG_HEAD_... */
 	__be32 lh_tail;		/* Block number of log tail */
 	__be32 lh_blkno;
-	__be32 lh_hash;
+	__be32 lh_hash;		/* crc up to here with this field 0 */
+
+	/* Version 2 additional fields start here */
+	__be32 lh_crc;		/* crc32c from lh_nsec to end of block */
+	__be32 lh_nsec;		/* Nanoseconds of timestamp */
+	__be64 lh_sec;		/* Seconds of timestamp */
+	__be64 lh_addr;		/* Block addr of this log header (absolute) */
+	__be64 lh_jinode;	/* Journal inode number */
+	__be64 lh_statfs_addr;	/* Local statfs inode number */
+	__be64 lh_quota_addr;	/* Local quota change inode number */
+
+	/* Statfs local changes (i.e. diff from global statfs) */
+	__be64 lh_local_total;
+	__be64 lh_local_free;
+	__be64 lh_local_dinodes;
 };
 
 /*
-- 
cgit v1.2.3


From 805c090750a315c5443c14e06304e19a01c697a0 Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Mon, 8 Jan 2018 10:34:17 -0500
Subject: GFS2: Log the reason for log flushes in every log header

This patch just adds the capability for GFS2 to track which function
called gfs2_log_flush. This should make it easier to diagnose
problems based on the sequence of events found in the journals.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Reviewed-by: Andreas Gruenbacher <agruenba@redhat.com>
---
 fs/gfs2/aops.c                   |  3 ++-
 fs/gfs2/file.c                   |  3 ++-
 fs/gfs2/glops.c                  | 18 ++++++++++++------
 fs/gfs2/log.c                    | 14 ++++++++------
 fs/gfs2/ops_fstype.c             |  2 +-
 fs/gfs2/quota.c                  |  2 +-
 fs/gfs2/rgrp.c                   |  3 ++-
 fs/gfs2/super.c                  | 12 ++++++++----
 fs/gfs2/trace_gfs2.h             | 11 +++++++----
 fs/gfs2/trans.c                  |  3 ++-
 include/uapi/linux/gfs2_ondisk.h | 21 +++++++++++++++++++++
 11 files changed, 66 insertions(+), 26 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 462c3fd55929..2f725b4a386b 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -448,7 +448,8 @@ static int gfs2_jdata_writepages(struct address_space *mapping,
 
 	ret = gfs2_write_cache_jdata(mapping, wbc);
 	if (ret == 0 && wbc->sync_mode == WB_SYNC_ALL) {
-		gfs2_log_flush(sdp, ip->i_gl, GFS2_LOG_HEAD_FLUSH_NORMAL);
+		gfs2_log_flush(sdp, ip->i_gl, GFS2_LOG_HEAD_FLUSH_NORMAL |
+			       GFS2_LFC_JDATA_WPAGES);
 		ret = gfs2_write_cache_jdata(mapping, wbc);
 	}
 	return ret;
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 7a02b4e6e9f3..4f88e201b3f0 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -247,7 +247,8 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
 	if ((flags ^ new_flags) & GFS2_DIF_JDATA) {
 		if (new_flags & GFS2_DIF_JDATA)
 			gfs2_log_flush(sdp, ip->i_gl,
-				       GFS2_LOG_HEAD_FLUSH_NORMAL);
+				       GFS2_LOG_HEAD_FLUSH_NORMAL |
+				       GFS2_LFC_SET_FLAGS);
 		error = filemap_fdatawrite(inode->i_mapping);
 		if (error)
 			goto out;
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 2daab13a9e0b..d8782a7a1e7d 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -107,7 +107,8 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
 	__gfs2_ail_flush(gl, 0, tr.tr_revokes);
 
 	gfs2_trans_end(sdp);
-	gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL);
+	gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL |
+		       GFS2_LFC_AIL_EMPTY_GL);
 }
 
 void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync)
@@ -128,7 +129,8 @@ void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync)
 		return;
 	__gfs2_ail_flush(gl, fsync, max_revokes);
 	gfs2_trans_end(sdp);
-	gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL);
+	gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL |
+		       GFS2_LFC_AIL_FLUSH);
 }
 
 /**
@@ -157,7 +159,8 @@ static void rgrp_go_sync(struct gfs2_glock *gl)
 		return;
 	GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXCLUSIVE);
 
-	gfs2_log_flush(sdp, gl, GFS2_LOG_HEAD_FLUSH_NORMAL);
+	gfs2_log_flush(sdp, gl, GFS2_LOG_HEAD_FLUSH_NORMAL |
+		       GFS2_LFC_RGRP_GO_SYNC);
 	filemap_fdatawrite_range(mapping, gl->gl_vm.start, gl->gl_vm.end);
 	error = filemap_fdatawait_range(mapping, gl->gl_vm.start, gl->gl_vm.end);
 	mapping_set_error(mapping, error);
@@ -252,7 +255,8 @@ static void inode_go_sync(struct gfs2_glock *gl)
 
 	GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXCLUSIVE);
 
-	gfs2_log_flush(gl->gl_name.ln_sbd, gl, GFS2_LOG_HEAD_FLUSH_NORMAL);
+	gfs2_log_flush(gl->gl_name.ln_sbd, gl, GFS2_LOG_HEAD_FLUSH_NORMAL |
+		       GFS2_LFC_INODE_GO_SYNC);
 	filemap_fdatawrite(metamapping);
 	if (isreg) {
 		struct address_space *mapping = ip->i_inode.i_mapping;
@@ -304,7 +308,8 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags)
 
 	if (ip == GFS2_I(gl->gl_name.ln_sbd->sd_rindex)) {
 		gfs2_log_flush(gl->gl_name.ln_sbd, NULL,
-			       GFS2_LOG_HEAD_FLUSH_NORMAL);
+			       GFS2_LOG_HEAD_FLUSH_NORMAL |
+			       GFS2_LFC_INODE_GO_INVAL);
 		gl->gl_name.ln_sbd->sd_rindex_uptodate = 0;
 	}
 	if (ip && S_ISREG(ip->i_inode.i_mode))
@@ -496,7 +501,8 @@ static void freeze_go_sync(struct gfs2_glock *gl)
 			gfs2_assert_withdraw(sdp, 0);
 		}
 		queue_work(gfs2_freeze_wq, &sdp->sd_freeze_work);
-		gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_FREEZE);
+		gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_FREEZE |
+			       GFS2_LFC_FREEZE_GO_SYNC);
 	}
 }
 
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index a2eb13c04591..cf6b46247df4 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -757,7 +757,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags)
  * gfs2_log_flush - flush incore transaction(s)
  * @sdp: the filesystem
  * @gl: The glock structure to flush.  If NULL, flush the whole incore log
- * @flags: The log header flags: GFS2_LOG_HEAD_FLUSH_*
+ * @flags: The log header flags: GFS2_LOG_HEAD_FLUSH_* and debug flags
  *
  */
 
@@ -773,7 +773,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags)
 		up_write(&sdp->sd_log_flush_lock);
 		return;
 	}
-	trace_gfs2_log_flush(sdp, 1);
+	trace_gfs2_log_flush(sdp, 1, flags);
 
 	if (flags & GFS2_LOG_HEAD_FLUSH_SHUTDOWN)
 		clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
@@ -841,7 +841,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags)
 			atomic_set(&sdp->sd_freeze_state, SFS_FROZEN);
 	}
 
-	trace_gfs2_log_flush(sdp, 0);
+	trace_gfs2_log_flush(sdp, 0, flags);
 	up_write(&sdp->sd_log_flush_lock);
 
 	kfree(tr);
@@ -937,7 +937,7 @@ void gfs2_log_shutdown(struct gfs2_sbd *sdp)
 
 	sdp->sd_log_flush_head = sdp->sd_log_head;
 
-	log_write_header(sdp, GFS2_LOG_HEAD_UNMOUNT);
+	log_write_header(sdp, GFS2_LOG_HEAD_UNMOUNT | GFS2_LFC_SHUTDOWN);
 
 	gfs2_assert_warn(sdp, sdp->sd_log_head == sdp->sd_log_tail);
 	gfs2_assert_warn(sdp, list_empty(&sdp->sd_ail2_list));
@@ -993,7 +993,8 @@ int gfs2_logd(void *data)
 		did_flush = false;
 		if (gfs2_jrnl_flush_reqd(sdp) || t == 0) {
 			gfs2_ail1_empty(sdp);
-			gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL);
+			gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL |
+				       GFS2_LFC_LOGD_JFLUSH_REQD);
 			did_flush = true;
 		}
 
@@ -1001,7 +1002,8 @@ int gfs2_logd(void *data)
 			gfs2_ail1_start(sdp);
 			gfs2_ail1_wait(sdp);
 			gfs2_ail1_empty(sdp);
-			gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL);
+			gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL |
+				       GFS2_LFC_LOGD_AIL_FLUSH_REQD);
 			did_flush = true;
 		}
 
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index d6e620beb9db..e6a0a8a89ea7 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1382,7 +1382,7 @@ static void gfs2_kill_sb(struct super_block *sb)
 		return;
 	}
 
-	gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_SYNC);
+	gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_SYNC | GFS2_LFC_KILL_SB);
 	dput(sdp->sd_root_dir);
 	dput(sdp->sd_master_dir);
 	sdp->sd_root_dir = NULL;
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 2092df19e433..7a98abd340ee 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -956,7 +956,7 @@ out:
 	inode_unlock(&ip->i_inode);
 	kfree(ghs);
 	gfs2_log_flush(ip->i_gl->gl_name.ln_sbd, ip->i_gl,
-		       GFS2_LOG_HEAD_FLUSH_NORMAL);
+		       GFS2_LOG_HEAD_FLUSH_NORMAL | GFS2_LFC_DO_SYNC);
 	return error;
 }
 
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 00eab6c0525c..078b002e0a68 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -2093,7 +2093,8 @@ next_rgrp:
 		}
 		/* Flushing the log may release space */
 		if (loops == 2)
-			gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL);
+			gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL |
+				       GFS2_LFC_INPLACE_RESERVE);
 	}
 
 	return -ENOSPC;
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index fa3a19eaf0eb..50a297b920fc 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -758,7 +758,8 @@ static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc)
 
 	if (flush_all)
 		gfs2_log_flush(GFS2_SB(inode), ip->i_gl,
-			       GFS2_LOG_HEAD_FLUSH_NORMAL);
+			       GFS2_LOG_HEAD_FLUSH_NORMAL |
+			       GFS2_LFC_WRITE_INODE);
 	if (bdi->wb.dirty_exceeded)
 		gfs2_ail1_flush(sdp, wbc);
 	else
@@ -854,7 +855,8 @@ static int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
 	gfs2_quota_sync(sdp->sd_vfs, 0);
 	gfs2_statfs_sync(sdp->sd_vfs, 0);
 
-	gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_SHUTDOWN);
+	gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_SHUTDOWN |
+		       GFS2_LFC_MAKE_FS_RO);
 	wait_event(sdp->sd_reserving_log_wait, atomic_read(&sdp->sd_reserving_log) == 0);
 	gfs2_assert_warn(sdp, atomic_read(&sdp->sd_log_blks_free) == sdp->sd_jdesc->jd_blocks);
 
@@ -947,7 +949,8 @@ static int gfs2_sync_fs(struct super_block *sb, int wait)
 
 	gfs2_quota_sync(sb, -1);
 	if (wait)
-		gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL);
+		gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL |
+			       GFS2_LFC_SYNC_FS);
 	return sdp->sd_log_error;
 }
 
@@ -1651,7 +1654,8 @@ alloc_failed:
 	goto out_unlock;
 
 out_truncate:
-	gfs2_log_flush(sdp, ip->i_gl, GFS2_LOG_HEAD_FLUSH_NORMAL);
+	gfs2_log_flush(sdp, ip->i_gl, GFS2_LOG_HEAD_FLUSH_NORMAL |
+		       GFS2_LFC_EVICT_INODE);
 	metamapping = gfs2_glock2aspace(ip->i_gl);
 	if (test_bit(GLF_DIRTY, &ip->i_gl->gl_flags)) {
 		filemap_fdatawrite(metamapping);
diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h
index f67a709589d3..b9318b49ff8f 100644
--- a/fs/gfs2/trace_gfs2.h
+++ b/fs/gfs2/trace_gfs2.h
@@ -353,26 +353,29 @@ TRACE_EVENT(gfs2_pin,
 /* Flushing the log */
 TRACE_EVENT(gfs2_log_flush,
 
-	TP_PROTO(const struct gfs2_sbd *sdp, int start),
+	TP_PROTO(const struct gfs2_sbd *sdp, int start, u32 flags),
 
-	TP_ARGS(sdp, start),
+	TP_ARGS(sdp, start, flags),
 
 	TP_STRUCT__entry(
 		__field(        dev_t,  dev                     )
 		__field(	int,	start			)
 		__field(	u64,	log_seq			)
+		__field(	u32,	flags			)
 	),
 
 	TP_fast_assign(
 		__entry->dev            = sdp->sd_vfs->s_dev;
 		__entry->start		= start;
 		__entry->log_seq	= sdp->sd_log_sequence;
+		__entry->flags		= flags;
 	),
 
-	TP_printk("%u,%u log flush %s %llu",
+	TP_printk("%u,%u log flush %s %llu %llx",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->start ? "start" : "end",
-		  (unsigned long long)__entry->log_seq)
+		  (unsigned long long)__entry->log_seq,
+		  (unsigned long long)__entry->flags)
 );
 
 /* Reserving/releasing blocks in the log */
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index 7aec6d3434fa..c75cacaa349b 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -117,7 +117,8 @@ void gfs2_trans_end(struct gfs2_sbd *sdp)
 	up_read(&sdp->sd_log_flush_lock);
 
 	if (sdp->sd_vfs->s_flags & SB_SYNCHRONOUS)
-		gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL);
+		gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL |
+			       GFS2_LFC_TRANS_END);
 	if (alloced)
 		sb_end_intwrite(sdp->sd_vfs);
 }
diff --git a/include/uapi/linux/gfs2_ondisk.h b/include/uapi/linux/gfs2_ondisk.h
index 9a81d520f54a..2dc10a034de1 100644
--- a/include/uapi/linux/gfs2_ondisk.h
+++ b/include/uapi/linux/gfs2_ondisk.h
@@ -411,6 +411,27 @@ struct gfs2_ea_header {
 #define GFS2_LOG_HEAD_RECOVERY		0x00000020 /* Journal recovery */
 #define GFS2_LOG_HEAD_USERSPACE		0x80000000 /* Written by gfs2-utils */
 
+/* Log flush callers */
+#define GFS2_LFC_SHUTDOWN		0x00000100
+#define GFS2_LFC_JDATA_WPAGES		0x00000200
+#define GFS2_LFC_SET_FLAGS		0x00000400
+#define GFS2_LFC_AIL_EMPTY_GL		0x00000800
+#define GFS2_LFC_AIL_FLUSH		0x00001000
+#define GFS2_LFC_RGRP_GO_SYNC		0x00002000
+#define GFS2_LFC_INODE_GO_SYNC		0x00004000
+#define GFS2_LFC_INODE_GO_INVAL		0x00008000
+#define GFS2_LFC_FREEZE_GO_SYNC		0x00010000
+#define GFS2_LFC_KILL_SB		0x00020000
+#define GFS2_LFC_DO_SYNC		0x00040000
+#define GFS2_LFC_INPLACE_RESERVE	0x00080000
+#define GFS2_LFC_WRITE_INODE		0x00100000
+#define GFS2_LFC_MAKE_FS_RO		0x00200000
+#define GFS2_LFC_SYNC_FS		0x00400000
+#define GFS2_LFC_EVICT_INODE		0x00800000
+#define GFS2_LFC_TRANS_END		0x01000000
+#define GFS2_LFC_LOGD_JFLUSH_REQD	0x02000000
+#define GFS2_LFC_LOGD_AIL_FLUSH_REQD	0x04000000
+
 #define LH_V1_SIZE (offsetofend(struct gfs2_log_header, lh_hash))
 
 struct gfs2_log_header {
-- 
cgit v1.2.3


From 430a23689dea2e36ae5a0fc75a67301fd46b18bf Mon Sep 17 00:00:00 2001
From: Jay Cornwall <Jay.Cornwall@amd.com>
Date: Thu, 4 Jan 2018 19:44:59 -0500
Subject: PCI: Add pci_enable_atomic_ops_to_root()

The Atomic Operations feature (PCIe r4.0, sec 6.15) allows atomic
transactions to be requested by, routed through and completed by PCIe
components. Routing and completion do not require software support.
Component support for each is detectable via the DEVCAP2 register.

A Requester may use AtomicOps only if its PCI_EXP_DEVCTL2_ATOMIC_REQ is
set. This should be set only if the Completer and all intermediate routing
elements support AtomicOps.

A concrete example is the AMD Fiji-class GPU (which is capable of making
AtomicOp requests), below a PLX 8747 switch (advertising AtomicOp routing)
with a Haswell host bridge (advertising AtomicOp completion support).

Add pci_enable_atomic_ops_to_root() for per-device control over AtomicOp
requests. This checks to be sure the Root Port supports completion of the
desired AtomicOp sizes and the path to the Root Port supports routing the
AtomicOps.

Signed-off-by: Jay Cornwall <Jay.Cornwall@amd.com>
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
[bhelgaas: changelog, comments, whitespace]
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/pci/pci.c             | 75 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/pci.h           |  1 +
 include/uapi/linux/pci_regs.h |  4 ++-
 3 files changed, 79 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 4a7c6864fdf4..6112dd8d68b6 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -3065,6 +3065,81 @@ int pci_rebar_set_size(struct pci_dev *pdev, int bar, int size)
 	return 0;
 }
 
+/**
+ * pci_enable_atomic_ops_to_root - enable AtomicOp requests to root port
+ * @dev: the PCI device
+ * @cap_mask: mask of desired AtomicOp sizes, including one or more of:
+ *	PCI_EXP_DEVCAP2_ATOMIC_COMP32
+ *	PCI_EXP_DEVCAP2_ATOMIC_COMP64
+ *	PCI_EXP_DEVCAP2_ATOMIC_COMP128
+ *
+ * Return 0 if all upstream bridges support AtomicOp routing, egress
+ * blocking is disabled on all upstream ports, and the root port supports
+ * the requested completion capabilities (32-bit, 64-bit and/or 128-bit
+ * AtomicOp completion), or negative otherwise.
+ */
+int pci_enable_atomic_ops_to_root(struct pci_dev *dev, u32 cap_mask)
+{
+	struct pci_bus *bus = dev->bus;
+	struct pci_dev *bridge;
+	u32 cap, ctl2;
+
+	if (!pci_is_pcie(dev))
+		return -EINVAL;
+
+	/*
+	 * Per PCIe r4.0, sec 6.15, endpoints and root ports may be
+	 * AtomicOp requesters.  For now, we only support endpoints as
+	 * requesters and root ports as completers.  No endpoints as
+	 * completers, and no peer-to-peer.
+	 */
+
+	switch (pci_pcie_type(dev)) {
+	case PCI_EXP_TYPE_ENDPOINT:
+	case PCI_EXP_TYPE_LEG_END:
+	case PCI_EXP_TYPE_RC_END:
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	while (bus->parent) {
+		bridge = bus->self;
+
+		pcie_capability_read_dword(bridge, PCI_EXP_DEVCAP2, &cap);
+
+		switch (pci_pcie_type(bridge)) {
+		/* Ensure switch ports support AtomicOp routing */
+		case PCI_EXP_TYPE_UPSTREAM:
+		case PCI_EXP_TYPE_DOWNSTREAM:
+			if (!(cap & PCI_EXP_DEVCAP2_ATOMIC_ROUTE))
+				return -EINVAL;
+			break;
+
+		/* Ensure root port supports all the sizes we care about */
+		case PCI_EXP_TYPE_ROOT_PORT:
+			if ((cap & cap_mask) != cap_mask)
+				return -EINVAL;
+			break;
+		}
+
+		/* Ensure upstream ports don't block AtomicOps on egress */
+		if (!bridge->has_secondary_link) {
+			pcie_capability_read_dword(bridge, PCI_EXP_DEVCTL2,
+						   &ctl2);
+			if (ctl2 & PCI_EXP_DEVCTL2_ATOMIC_EGRESS_BLOCK)
+				return -EINVAL;
+		}
+
+		bus = bus->parent;
+	}
+
+	pcie_capability_set_word(dev, PCI_EXP_DEVCTL2,
+				 PCI_EXP_DEVCTL2_ATOMIC_REQ);
+	return 0;
+}
+EXPORT_SYMBOL(pci_enable_atomic_ops_to_root);
+
 /**
  * pci_swizzle_interrupt_pin - swizzle INTx for device behind bridge
  * @dev: the PCI device
diff --git a/include/linux/pci.h b/include/linux/pci.h
index c170c9250c8b..ab3d12a7dfed 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -2061,6 +2061,7 @@ void pci_request_acs(void);
 bool pci_acs_enabled(struct pci_dev *pdev, u16 acs_flags);
 bool pci_acs_path_enabled(struct pci_dev *start,
 			  struct pci_dev *end, u16 acs_flags);
+int pci_enable_atomic_ops_to_root(struct pci_dev *dev, u32 cap_mask);
 
 #define PCI_VPD_LRDT			0x80	/* Large Resource Data Type */
 #define PCI_VPD_LRDT_ID(x)		((x) | PCI_VPD_LRDT)
diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
index 70c2b2ade048..f31b56b21714 100644
--- a/include/uapi/linux/pci_regs.h
+++ b/include/uapi/linux/pci_regs.h
@@ -624,7 +624,9 @@
 #define PCI_EXP_DEVCAP2		36	/* Device Capabilities 2 */
 #define  PCI_EXP_DEVCAP2_ARI		0x00000020 /* Alternative Routing-ID */
 #define  PCI_EXP_DEVCAP2_ATOMIC_ROUTE	0x00000040 /* Atomic Op routing */
-#define PCI_EXP_DEVCAP2_ATOMIC_COMP64	0x00000100 /* Atomic 64-bit compare */
+#define  PCI_EXP_DEVCAP2_ATOMIC_COMP32	0x00000080 /* 32b AtomicOp completion */
+#define  PCI_EXP_DEVCAP2_ATOMIC_COMP64	0x00000100 /* 64b AtomicOp completion */
+#define  PCI_EXP_DEVCAP2_ATOMIC_COMP128	0x00000200 /* 128b AtomicOp completion */
 #define  PCI_EXP_DEVCAP2_LTR		0x00000800 /* Latency tolerance reporting */
 #define  PCI_EXP_DEVCAP2_OBFF_MASK	0x000c0000 /* OBFF support mechanism */
 #define  PCI_EXP_DEVCAP2_OBFF_MSG	0x00040000 /* New message signaling */
-- 
cgit v1.2.3


From e1fc742e14e01d84d9693c4aca4ab23da65811fb Mon Sep 17 00:00:00 2001
From: Jürg Billeter <j@bitron.ch>
Date: Fri, 29 Sep 2017 14:07:17 +0200
Subject: fs: add RWF_APPEND
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is the per-I/O equivalent of O_APPEND to support atomic append
operations on any open file.

If a file is opened with O_APPEND, pwrite() ignores the offset and
always appends data to the end of the file. RWF_APPEND enables atomic
append and pwrite() with offset on a single file descriptor.

Signed-off-by: Jürg Billeter <j@bitron.ch>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/fs.h      | 2 ++
 include/uapi/linux/fs.h | 6 +++++-
 2 files changed, 7 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 6276f8315e5b..85c8ddc55760 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3224,6 +3224,8 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags)
 		ki->ki_flags |= IOCB_DSYNC;
 	if (flags & RWF_SYNC)
 		ki->ki_flags |= (IOCB_DSYNC | IOCB_SYNC);
+	if (flags & RWF_APPEND)
+		ki->ki_flags |= IOCB_APPEND;
 	return 0;
 }
 
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 4199f8acbce5..d2a8313fabd7 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -377,7 +377,11 @@ typedef int __bitwise __kernel_rwf_t;
 /* per-IO, return -EAGAIN if operation would block */
 #define RWF_NOWAIT	((__force __kernel_rwf_t)0x00000008)
 
+/* per-IO O_APPEND */
+#define RWF_APPEND	((__force __kernel_rwf_t)0x00000010)
+
 /* mask of flags supported by the kernel */
-#define RWF_SUPPORTED	(RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT)
+#define RWF_SUPPORTED	(RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\
+			 RWF_APPEND)
 
 #endif /* _UAPI_LINUX_FS_H */
-- 
cgit v1.2.3


From de525be2ca2734865d29c4b67ddd29913b214906 Mon Sep 17 00:00:00 2001
From: Lawrence Brakmo <brakmo@fb.com>
Date: Thu, 25 Jan 2018 16:14:09 -0800
Subject: bpf: Support passing args to sock_ops bpf function

Adds support for passing up to 4 arguments to sock_ops bpf functions. It
reuses the reply union, so the bpf_sock_ops structures are not
increased in size.

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/filter.h   |  1 +
 include/net/tcp.h        | 40 +++++++++++++++++++++++++++++++++++-----
 include/uapi/linux/bpf.h |  5 +++--
 net/ipv4/tcp.c           |  2 +-
 net/ipv4/tcp_nv.c        |  2 +-
 net/ipv4/tcp_output.c    |  2 +-
 6 files changed, 42 insertions(+), 10 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index daa5a676335f..20384c4bed25 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1003,6 +1003,7 @@ struct bpf_sock_ops_kern {
 	struct	sock *sk;
 	u32	op;
 	union {
+		u32 args[4];
 		u32 reply;
 		u32 replylong[4];
 	};
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 6092eaff61cf..093e967a2960 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2006,7 +2006,7 @@ void tcp_cleanup_ulp(struct sock *sk);
  * program loaded).
  */
 #ifdef CONFIG_BPF
-static inline int tcp_call_bpf(struct sock *sk, int op)
+static inline int tcp_call_bpf(struct sock *sk, int op, u32 nargs, u32 *args)
 {
 	struct bpf_sock_ops_kern sock_ops;
 	int ret;
@@ -2019,6 +2019,8 @@ static inline int tcp_call_bpf(struct sock *sk, int op)
 
 	sock_ops.sk = sk;
 	sock_ops.op = op;
+	if (nargs > 0)
+		memcpy(sock_ops.args, args, nargs * sizeof(*args));
 
 	ret = BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops);
 	if (ret == 0)
@@ -2027,18 +2029,46 @@ static inline int tcp_call_bpf(struct sock *sk, int op)
 		ret = -1;
 	return ret;
 }
+
+static inline int tcp_call_bpf_2arg(struct sock *sk, int op, u32 arg1, u32 arg2)
+{
+	u32 args[2] = {arg1, arg2};
+
+	return tcp_call_bpf(sk, op, 2, args);
+}
+
+static inline int tcp_call_bpf_3arg(struct sock *sk, int op, u32 arg1, u32 arg2,
+				    u32 arg3)
+{
+	u32 args[3] = {arg1, arg2, arg3};
+
+	return tcp_call_bpf(sk, op, 3, args);
+}
+
 #else
-static inline int tcp_call_bpf(struct sock *sk, int op)
+static inline int tcp_call_bpf(struct sock *sk, int op, u32 nargs, u32 *args)
 {
 	return -EPERM;
 }
+
+static inline int tcp_call_bpf_2arg(struct sock *sk, int op, u32 arg1, u32 arg2)
+{
+	return -EPERM;
+}
+
+static inline int tcp_call_bpf_3arg(struct sock *sk, int op, u32 arg1, u32 arg2,
+				    u32 arg3)
+{
+	return -EPERM;
+}
+
 #endif
 
 static inline u32 tcp_timeout_init(struct sock *sk)
 {
 	int timeout;
 
-	timeout = tcp_call_bpf(sk, BPF_SOCK_OPS_TIMEOUT_INIT);
+	timeout = tcp_call_bpf(sk, BPF_SOCK_OPS_TIMEOUT_INIT, 0, NULL);
 
 	if (timeout <= 0)
 		timeout = TCP_TIMEOUT_INIT;
@@ -2049,7 +2079,7 @@ static inline u32 tcp_rwnd_init_bpf(struct sock *sk)
 {
 	int rwnd;
 
-	rwnd = tcp_call_bpf(sk, BPF_SOCK_OPS_RWND_INIT);
+	rwnd = tcp_call_bpf(sk, BPF_SOCK_OPS_RWND_INIT, 0, NULL);
 
 	if (rwnd < 0)
 		rwnd = 0;
@@ -2058,7 +2088,7 @@ static inline u32 tcp_rwnd_init_bpf(struct sock *sk)
 
 static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk)
 {
-	return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN) == 1);
+	return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN, 0, NULL) == 1);
 }
 
 #if IS_ENABLED(CONFIG_SMC)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 406c19d6016b..8d5874c2c4ff 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -952,8 +952,9 @@ struct bpf_map_info {
 struct bpf_sock_ops {
 	__u32 op;
 	union {
-		__u32 reply;
-		__u32 replylong[4];
+		__u32 args[4];		/* Optionally passed to bpf program */
+		__u32 reply;		/* Returned by bpf program	    */
+		__u32 replylong[4];	/* Optionally returned by bpf prog  */
 	};
 	__u32 family;
 	__u32 remote_ip4;	/* Stored in network byte order */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index d7cf861bf699..88b62441e7e9 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -463,7 +463,7 @@ void tcp_init_transfer(struct sock *sk, int bpf_op)
 	tcp_mtup_init(sk);
 	icsk->icsk_af_ops->rebuild_header(sk);
 	tcp_init_metrics(sk);
-	tcp_call_bpf(sk, bpf_op);
+	tcp_call_bpf(sk, bpf_op, 0, NULL);
 	tcp_init_congestion_control(sk);
 	tcp_init_buffer_space(sk);
 }
diff --git a/net/ipv4/tcp_nv.c b/net/ipv4/tcp_nv.c
index 0b5a05bd82e3..ddbce73edae8 100644
--- a/net/ipv4/tcp_nv.c
+++ b/net/ipv4/tcp_nv.c
@@ -146,7 +146,7 @@ static void tcpnv_init(struct sock *sk)
 	 * within a datacenter, where we have reasonable estimates of
 	 * RTTs
 	 */
-	base_rtt = tcp_call_bpf(sk, BPF_SOCK_OPS_BASE_RTT);
+	base_rtt = tcp_call_bpf(sk, BPF_SOCK_OPS_BASE_RTT, 0, NULL);
 	if (base_rtt > 0) {
 		ca->nv_base_rtt = base_rtt;
 		ca->nv_lower_bound_rtt = (base_rtt * 205) >> 8; /* 80% */
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 95461f02ac9a..d12f7f71c1c4 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3469,7 +3469,7 @@ int tcp_connect(struct sock *sk)
 	struct sk_buff *buff;
 	int err;
 
-	tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB);
+	tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB, 0, NULL);
 
 	if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
 		return -EHOSTUNREACH; /* Routing failure or similar. */
-- 
cgit v1.2.3


From b13d880721729384757f235166068c315326f4a1 Mon Sep 17 00:00:00 2001
From: Lawrence Brakmo <brakmo@fb.com>
Date: Thu, 25 Jan 2018 16:14:10 -0800
Subject: bpf: Adds field bpf_sock_ops_cb_flags to tcp_sock

Adds field bpf_sock_ops_cb_flags to tcp_sock and bpf_sock_ops. Its primary
use is to determine if there should be calls to sock_ops bpf program at
various points in the TCP code. The field is initialized to zero,
disabling the calls. A sock_ops BPF program can set it, per connection and
as necessary, when the connection is established.

It also adds support for reading and writing the field within a
sock_ops BPF program. Reading is done by accessing the field directly.
However, writing is done through the helper function
bpf_sock_ops_cb_flags_set, in order to return an error if a BPF program
is trying to set a callback that is not supported in the current kernel
(i.e. running an older kernel). The helper function returns 0 if it was
able to set all of the bits set in the argument, a positive number
containing the bits that could not be set, or -EINVAL if the socket is
not a full TCP socket.

Examples of where one could call the bpf program:

1) When RTO fires
2) When a packet is retransmitted
3) When the connection terminates
4) When a packet is sent
5) When a packet is received

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/tcp.h      | 11 +++++++++++
 include/uapi/linux/bpf.h | 17 ++++++++++++++++-
 net/core/filter.c        | 34 ++++++++++++++++++++++++++++++++++
 3 files changed, 61 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 4f93f0953c41..8f4c54986f97 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -335,6 +335,17 @@ struct tcp_sock {
 
 	int			linger2;
 
+
+/* Sock_ops bpf program related variables */
+#ifdef CONFIG_BPF
+	u8	bpf_sock_ops_cb_flags;  /* Control calling BPF programs
+					 * values defined in uapi/linux/tcp.h
+					 */
+#define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) (TP->bpf_sock_ops_cb_flags & ARG)
+#else
+#define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) 0
+#endif
+
 /* Receiver side RTT estimation */
 	struct {
 		u32	rtt_us;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 8d5874c2c4ff..aa128407c44d 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -642,6 +642,14 @@ union bpf_attr {
  *     @optlen: length of optval in bytes
  *     Return: 0 or negative error
  *
+ * int bpf_sock_ops_cb_flags_set(bpf_sock_ops, flags)
+ *     Set callback flags for sock_ops
+ *     @bpf_sock_ops: pointer to bpf_sock_ops_kern struct
+ *     @flags: flags value
+ *     Return: 0 for no error
+ *             -EINVAL if there is no full tcp socket
+ *             bits in flags that are not supported by current kernel
+ *
  * int bpf_skb_adjust_room(skb, len_diff, mode, flags)
  *     Grow or shrink room in sk_buff.
  *     @skb: pointer to skb
@@ -748,7 +756,8 @@ union bpf_attr {
 	FN(perf_event_read_value),	\
 	FN(perf_prog_read_value),	\
 	FN(getsockopt),			\
-	FN(override_return),
+	FN(override_return),		\
+	FN(sock_ops_cb_flags_set),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -969,8 +978,14 @@ struct bpf_sock_ops {
 				 */
 	__u32 snd_cwnd;
 	__u32 srtt_us;		/* Averaged RTT << 3 in usecs */
+	__u32 bpf_sock_ops_cb_flags; /* flags defined in uapi/linux/tcp.h */
 };
 
+/* Definitions for bpf_sock_ops_cb_flags */
+#define BPF_SOCK_OPS_ALL_CB_FLAGS       0		/* Mask of all currently
+							 * supported cb flags
+							 */
+
 /* List of known BPF sock_ops operators.
  * New entries can only be added at the end
  */
diff --git a/net/core/filter.c b/net/core/filter.c
index c356ec02b1a5..6936d19ac736 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3328,6 +3328,33 @@ static const struct bpf_func_proto bpf_getsockopt_proto = {
 	.arg5_type	= ARG_CONST_SIZE,
 };
 
+BPF_CALL_2(bpf_sock_ops_cb_flags_set, struct bpf_sock_ops_kern *, bpf_sock,
+	   int, argval)
+{
+	struct sock *sk = bpf_sock->sk;
+	int val = argval & BPF_SOCK_OPS_ALL_CB_FLAGS;
+
+	if (!sk_fullsock(sk))
+		return -EINVAL;
+
+#ifdef CONFIG_INET
+	if (val)
+		tcp_sk(sk)->bpf_sock_ops_cb_flags = val;
+
+	return argval & (~BPF_SOCK_OPS_ALL_CB_FLAGS);
+#else
+	return -EINVAL;
+#endif
+}
+
+static const struct bpf_func_proto bpf_sock_ops_cb_flags_set_proto = {
+	.func		= bpf_sock_ops_cb_flags_set,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+};
+
 static const struct bpf_func_proto *
 bpf_base_func_proto(enum bpf_func_id func_id)
 {
@@ -3510,6 +3537,8 @@ static const struct bpf_func_proto *
 		return &bpf_setsockopt_proto;
 	case BPF_FUNC_getsockopt:
 		return &bpf_getsockopt_proto;
+	case BPF_FUNC_sock_ops_cb_flags_set:
+		return &bpf_sock_ops_cb_flags_set_proto;
 	case BPF_FUNC_sock_map_update:
 		return &bpf_sock_map_update_proto;
 	default:
@@ -4546,6 +4575,11 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
 	case offsetof(struct bpf_sock_ops, srtt_us):
 		SOCK_OPS_GET_FIELD(srtt_us, srtt_us, struct tcp_sock);
 		break;
+
+	case offsetof(struct bpf_sock_ops, bpf_sock_ops_cb_flags):
+		SOCK_OPS_GET_FIELD(bpf_sock_ops_cb_flags, bpf_sock_ops_cb_flags,
+				   struct tcp_sock);
+		break;
 	}
 	return insn - insn_buf;
 }
-- 
cgit v1.2.3


From f89013f66d0f1a0dad44c513318efb706399a36b Mon Sep 17 00:00:00 2001
From: Lawrence Brakmo <brakmo@fb.com>
Date: Thu, 25 Jan 2018 16:14:11 -0800
Subject: bpf: Add sock_ops RTO callback

Adds an optional call to the sock_ops BPF program based on whether the
BPF_SOCK_OPS_RTO_CB_FLAG is set in bpf_sock_ops_cb_flags.
The BPF program is passed 3 arguments: icsk_retransmits, icsk_rto, and
whether the RTO has expired.

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h | 8 +++++++-
 net/ipv4/tcp_timer.c     | 7 +++++++
 2 files changed, 14 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index aa128407c44d..c8cecf9cf5bd 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -982,7 +982,8 @@ struct bpf_sock_ops {
 };
 
 /* Definitions for bpf_sock_ops_cb_flags */
-#define BPF_SOCK_OPS_ALL_CB_FLAGS       0		/* Mask of all currently
+#define BPF_SOCK_OPS_RTO_CB_FLAG	(1<<0)
+#define BPF_SOCK_OPS_ALL_CB_FLAGS       0x1		/* Mask of all currently
 							 * supported cb flags
 							 */
 
@@ -1019,6 +1020,11 @@ enum {
 					 * a congestion threshold. RTTs above
 					 * this indicate congestion
 					 */
+	BPF_SOCK_OPS_RTO_CB,		/* Called when an RTO has triggered.
+					 * Arg1: value of icsk_retransmits
+					 * Arg2: value of icsk_rto
+					 * Arg3: whether RTO has expired
+					 */
 };
 
 #define TCP_BPF_IW		1001	/* Set TCP initial congestion window */
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 6db3124cdbda..257abdde23b0 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -213,11 +213,18 @@ static int tcp_write_timeout(struct sock *sk)
 						icsk->icsk_user_timeout);
 	}
 	tcp_fastopen_active_detect_blackhole(sk, expired);
+
+	if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RTO_CB_FLAG))
+		tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RTO_CB,
+				  icsk->icsk_retransmits,
+				  icsk->icsk_rto, (int)expired);
+
 	if (expired) {
 		/* Has it gone just too far? */
 		tcp_write_err(sk);
 		return 1;
 	}
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From 44f0e43037d3a17b043843ba67610ac7c7e37db6 Mon Sep 17 00:00:00 2001
From: Lawrence Brakmo <brakmo@fb.com>
Date: Thu, 25 Jan 2018 16:14:12 -0800
Subject: bpf: Add support for reading sk_state and more

Add support for reading many more tcp_sock fields

  state,	same as sk->sk_state
  rtt_min	same as tcp_sk(sk)->rtt_min.s[0].v (current rtt_min)
  snd_ssthresh
  rcv_nxt
  snd_nxt
  snd_una
  mss_cache
  ecn_flags
  rate_delivered
  rate_interval_us
  packets_out
  retrans_out
  total_retrans
  segs_in
  data_segs_in
  segs_out
  data_segs_out
  lost_out
  sacked_out
  sk_txhash
  bytes_received (__u64)
  bytes_acked    (__u64)

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h |  22 ++++++++
 net/core/filter.c        | 143 +++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 154 insertions(+), 11 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index c8cecf9cf5bd..46520eae37fa 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -979,6 +979,28 @@ struct bpf_sock_ops {
 	__u32 snd_cwnd;
 	__u32 srtt_us;		/* Averaged RTT << 3 in usecs */
 	__u32 bpf_sock_ops_cb_flags; /* flags defined in uapi/linux/tcp.h */
+	__u32 state;
+	__u32 rtt_min;
+	__u32 snd_ssthresh;
+	__u32 rcv_nxt;
+	__u32 snd_nxt;
+	__u32 snd_una;
+	__u32 mss_cache;
+	__u32 ecn_flags;
+	__u32 rate_delivered;
+	__u32 rate_interval_us;
+	__u32 packets_out;
+	__u32 retrans_out;
+	__u32 total_retrans;
+	__u32 segs_in;
+	__u32 data_segs_in;
+	__u32 segs_out;
+	__u32 data_segs_out;
+	__u32 lost_out;
+	__u32 sacked_out;
+	__u32 sk_txhash;
+	__u64 bytes_received;
+	__u64 bytes_acked;
 };
 
 /* Definitions for bpf_sock_ops_cb_flags */
diff --git a/net/core/filter.c b/net/core/filter.c
index 6936d19ac736..a858ebc4ece4 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3855,33 +3855,43 @@ void bpf_warn_invalid_xdp_action(u32 act)
 }
 EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action);
 
-static bool __is_valid_sock_ops_access(int off, int size)
+static bool sock_ops_is_valid_access(int off, int size,
+				     enum bpf_access_type type,
+				     struct bpf_insn_access_aux *info)
 {
+	const int size_default = sizeof(__u32);
+
 	if (off < 0 || off >= sizeof(struct bpf_sock_ops))
 		return false;
+
 	/* The verifier guarantees that size > 0. */
 	if (off % size != 0)
 		return false;
-	if (size != sizeof(__u32))
-		return false;
-
-	return true;
-}
 
-static bool sock_ops_is_valid_access(int off, int size,
-				     enum bpf_access_type type,
-				     struct bpf_insn_access_aux *info)
-{
 	if (type == BPF_WRITE) {
 		switch (off) {
 		case offsetof(struct bpf_sock_ops, reply):
+			if (size != size_default)
+				return false;
 			break;
 		default:
 			return false;
 		}
+	} else {
+		switch (off) {
+		case bpf_ctx_range_till(struct bpf_sock_ops, bytes_received,
+					bytes_acked):
+			if (size != sizeof(__u64))
+				return false;
+			break;
+		default:
+			if (size != size_default)
+				return false;
+			break;
+		}
 	}
 
-	return __is_valid_sock_ops_access(off, size);
+	return true;
 }
 
 static int sk_skb_prologue(struct bpf_insn *insn_buf, bool direct_write,
@@ -4498,6 +4508,32 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
 					       is_fullsock));
 		break;
 
+	case offsetof(struct bpf_sock_ops, state):
+		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_state) != 1);
+
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+						struct bpf_sock_ops_kern, sk),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct bpf_sock_ops_kern, sk));
+		*insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->dst_reg,
+				      offsetof(struct sock_common, skc_state));
+		break;
+
+	case offsetof(struct bpf_sock_ops, rtt_min):
+		BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, rtt_min) !=
+			     sizeof(struct minmax));
+		BUILD_BUG_ON(sizeof(struct minmax) <
+			     sizeof(struct minmax_sample));
+
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+						struct bpf_sock_ops_kern, sk),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct bpf_sock_ops_kern, sk));
+		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+				      offsetof(struct tcp_sock, rtt_min) +
+				      FIELD_SIZEOF(struct minmax_sample, t));
+		break;
+
 /* Helper macro for adding read access to tcp_sock or sock fields. */
 #define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ)			      \
 	do {								      \
@@ -4580,6 +4616,91 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
 		SOCK_OPS_GET_FIELD(bpf_sock_ops_cb_flags, bpf_sock_ops_cb_flags,
 				   struct tcp_sock);
 		break;
+
+	case offsetof(struct bpf_sock_ops, snd_ssthresh):
+		SOCK_OPS_GET_FIELD(snd_ssthresh, snd_ssthresh, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, rcv_nxt):
+		SOCK_OPS_GET_FIELD(rcv_nxt, rcv_nxt, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, snd_nxt):
+		SOCK_OPS_GET_FIELD(snd_nxt, snd_nxt, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, snd_una):
+		SOCK_OPS_GET_FIELD(snd_una, snd_una, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, mss_cache):
+		SOCK_OPS_GET_FIELD(mss_cache, mss_cache, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, ecn_flags):
+		SOCK_OPS_GET_FIELD(ecn_flags, ecn_flags, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, rate_delivered):
+		SOCK_OPS_GET_FIELD(rate_delivered, rate_delivered,
+				   struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, rate_interval_us):
+		SOCK_OPS_GET_FIELD(rate_interval_us, rate_interval_us,
+				   struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, packets_out):
+		SOCK_OPS_GET_FIELD(packets_out, packets_out, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, retrans_out):
+		SOCK_OPS_GET_FIELD(retrans_out, retrans_out, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, total_retrans):
+		SOCK_OPS_GET_FIELD(total_retrans, total_retrans,
+				   struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, segs_in):
+		SOCK_OPS_GET_FIELD(segs_in, segs_in, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, data_segs_in):
+		SOCK_OPS_GET_FIELD(data_segs_in, data_segs_in, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, segs_out):
+		SOCK_OPS_GET_FIELD(segs_out, segs_out, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, data_segs_out):
+		SOCK_OPS_GET_FIELD(data_segs_out, data_segs_out,
+				   struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, lost_out):
+		SOCK_OPS_GET_FIELD(lost_out, lost_out, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, sacked_out):
+		SOCK_OPS_GET_FIELD(sacked_out, sacked_out, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, sk_txhash):
+		SOCK_OPS_GET_FIELD(sk_txhash, sk_txhash, struct sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, bytes_received):
+		SOCK_OPS_GET_FIELD(bytes_received, bytes_received,
+				   struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, bytes_acked):
+		SOCK_OPS_GET_FIELD(bytes_acked, bytes_acked, struct tcp_sock);
+		break;
 	}
 	return insn - insn_buf;
 }
-- 
cgit v1.2.3


From a31ad29e6a30cb0b9084a9425b819cdcd97273ce Mon Sep 17 00:00:00 2001
From: Lawrence Brakmo <brakmo@fb.com>
Date: Thu, 25 Jan 2018 16:14:14 -0800
Subject: bpf: Add BPF_SOCK_OPS_RETRANS_CB

Adds support for calling sock_ops BPF program when there is a
retransmission. Three arguments are used; one for the sequence number,
another for the number of segments retransmitted, and the last one for
the return value of tcp_transmit_skb (0 => success).
Does not include syn-ack retransmissions.

New op: BPF_SOCK_OPS_RETRANS_CB.

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h | 9 ++++++++-
 net/ipv4/tcp_output.c    | 4 ++++
 2 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 46520eae37fa..31c93a0bdbc2 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1005,7 +1005,8 @@ struct bpf_sock_ops {
 
 /* Definitions for bpf_sock_ops_cb_flags */
 #define BPF_SOCK_OPS_RTO_CB_FLAG	(1<<0)
-#define BPF_SOCK_OPS_ALL_CB_FLAGS       0x1		/* Mask of all currently
+#define BPF_SOCK_OPS_RETRANS_CB_FLAG	(1<<1)
+#define BPF_SOCK_OPS_ALL_CB_FLAGS       0x3		/* Mask of all currently
 							 * supported cb flags
 							 */
 
@@ -1047,6 +1048,12 @@ enum {
 					 * Arg2: value of icsk_rto
 					 * Arg3: whether RTO has expired
 					 */
+	BPF_SOCK_OPS_RETRANS_CB,	/* Called when skb is retransmitted.
+					 * Arg1: sequence number of 1st byte
+					 * Arg2: # segments
+					 * Arg3: return value of
+					 *       tcp_transmit_skb (0 => success)
+					 */
 };
 
 #define TCP_BPF_IW		1001	/* Set TCP initial congestion window */
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index d12f7f71c1c4..e9f985e42405 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2905,6 +2905,10 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 		err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
 	}
 
+	if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RETRANS_CB_FLAG))
+		tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RETRANS_CB,
+				  TCP_SKB_CB(skb)->seq, segs, err);
+
 	if (likely(!err)) {
 		TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
 		trace_tcp_retransmit_skb(sk, skb);
-- 
cgit v1.2.3


From d44874910a26f3a8f81edf873a2473363f07f660 Mon Sep 17 00:00:00 2001
From: Lawrence Brakmo <brakmo@fb.com>
Date: Thu, 25 Jan 2018 16:14:15 -0800
Subject: bpf: Add BPF_SOCK_OPS_STATE_CB

Adds support for calling sock_ops BPF program when there is a TCP state
change. Two arguments are used; one for the old state and another for
the new state.

There is a new enum in include/uapi/linux/bpf.h that exports the TCP
states, prepending BPF_ to the current TCP state names. If it is ever
necessary to change the internal TCP state values (other than adding
more to the end), then it will become necessary to convert from the
internal TCP state value to the BPF value before calling the BPF
sock_ops function. There are a set of compile checks added in tcp.c
to detect if the internal and BPF values differ so we can make the
necessary fixes.

New op: BPF_SOCK_OPS_STATE_CB.

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h | 29 ++++++++++++++++++++++++++++-
 net/ipv4/tcp.c           | 24 ++++++++++++++++++++++++
 2 files changed, 52 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 31c93a0bdbc2..db6bdc375126 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1006,7 +1006,8 @@ struct bpf_sock_ops {
 /* Definitions for bpf_sock_ops_cb_flags */
 #define BPF_SOCK_OPS_RTO_CB_FLAG	(1<<0)
 #define BPF_SOCK_OPS_RETRANS_CB_FLAG	(1<<1)
-#define BPF_SOCK_OPS_ALL_CB_FLAGS       0x3		/* Mask of all currently
+#define BPF_SOCK_OPS_STATE_CB_FLAG	(1<<2)
+#define BPF_SOCK_OPS_ALL_CB_FLAGS       0x7		/* Mask of all currently
 							 * supported cb flags
 							 */
 
@@ -1054,6 +1055,32 @@ enum {
 					 * Arg3: return value of
 					 *       tcp_transmit_skb (0 => success)
 					 */
+	BPF_SOCK_OPS_STATE_CB,		/* Called when TCP changes state.
+					 * Arg1: old_state
+					 * Arg2: new_state
+					 */
+};
+
+/* List of TCP states. There is a build check in net/ipv4/tcp.c to detect
+ * changes between the TCP and BPF versions. Ideally this should never happen.
+ * If it does, we need to add code to convert them before calling
+ * the BPF sock_ops function.
+ */
+enum {
+	BPF_TCP_ESTABLISHED = 1,
+	BPF_TCP_SYN_SENT,
+	BPF_TCP_SYN_RECV,
+	BPF_TCP_FIN_WAIT1,
+	BPF_TCP_FIN_WAIT2,
+	BPF_TCP_TIME_WAIT,
+	BPF_TCP_CLOSE,
+	BPF_TCP_CLOSE_WAIT,
+	BPF_TCP_LAST_ACK,
+	BPF_TCP_LISTEN,
+	BPF_TCP_CLOSING,	/* Now a valid state */
+	BPF_TCP_NEW_SYN_RECV,
+
+	BPF_TCP_MAX_STATES	/* Leave at the end! */
 };
 
 #define TCP_BPF_IW		1001	/* Set TCP initial congestion window */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 88b62441e7e9..f013ddc191e0 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2042,6 +2042,30 @@ void tcp_set_state(struct sock *sk, int state)
 {
 	int oldstate = sk->sk_state;
 
+	/* We defined a new enum for TCP states that are exported in BPF
+	 * so as not force the internal TCP states to be frozen. The
+	 * following checks will detect if an internal state value ever
+	 * differs from the BPF value. If this ever happens, then we will
+	 * need to remap the internal value to the BPF value before calling
+	 * tcp_call_bpf_2arg.
+	 */
+	BUILD_BUG_ON((int)BPF_TCP_ESTABLISHED != (int)TCP_ESTABLISHED);
+	BUILD_BUG_ON((int)BPF_TCP_SYN_SENT != (int)TCP_SYN_SENT);
+	BUILD_BUG_ON((int)BPF_TCP_SYN_RECV != (int)TCP_SYN_RECV);
+	BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT1 != (int)TCP_FIN_WAIT1);
+	BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT2 != (int)TCP_FIN_WAIT2);
+	BUILD_BUG_ON((int)BPF_TCP_TIME_WAIT != (int)TCP_TIME_WAIT);
+	BUILD_BUG_ON((int)BPF_TCP_CLOSE != (int)TCP_CLOSE);
+	BUILD_BUG_ON((int)BPF_TCP_CLOSE_WAIT != (int)TCP_CLOSE_WAIT);
+	BUILD_BUG_ON((int)BPF_TCP_LAST_ACK != (int)TCP_LAST_ACK);
+	BUILD_BUG_ON((int)BPF_TCP_LISTEN != (int)TCP_LISTEN);
+	BUILD_BUG_ON((int)BPF_TCP_CLOSING != (int)TCP_CLOSING);
+	BUILD_BUG_ON((int)BPF_TCP_NEW_SYN_RECV != (int)TCP_NEW_SYN_RECV);
+	BUILD_BUG_ON((int)BPF_TCP_MAX_STATES != (int)TCP_MAX_STATES);
+
+	if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_STATE_CB_FLAG))
+		tcp_call_bpf_2arg(sk, BPF_SOCK_OPS_STATE_CB, oldstate, state);
+
 	switch (state) {
 	case TCP_ESTABLISHED:
 		if (oldstate != TCP_ESTABLISHED)
-- 
cgit v1.2.3


From d350a823020e71e20a10d1dfa44f1d1d653b0334 Mon Sep 17 00:00:00 2001
From: William Tu <u9012063@gmail.com>
Date: Thu, 25 Jan 2018 13:20:10 -0800
Subject: net: erspan: create erspan metadata uapi header

The patch adds a new uapi header file, erspan.h, and moves
the 'struct erspan_metadata' from internal erspan.h to it.

Signed-off-by: William Tu <u9012063@gmail.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/erspan.h        | 32 ++--------------------------
 include/uapi/linux/erspan.h | 52 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 54 insertions(+), 30 deletions(-)
 create mode 100644 include/uapi/linux/erspan.h

(limited to 'include/uapi/linux')

diff --git a/include/net/erspan.h b/include/net/erspan.h
index 6d30fe898286..5daa4866412b 100644
--- a/include/net/erspan.h
+++ b/include/net/erspan.h
@@ -46,6 +46,8 @@
  * GRE proto ERSPAN type II = 0x88BE, type III = 0x22EB
  */
 
+#include <uapi/linux/erspan.h>
+
 #define ERSPAN_VERSION	0x1	/* ERSPAN type II */
 #define VER_MASK	0xf000
 #define VLAN_MASK	0x0fff
@@ -68,29 +70,6 @@
 #define HWID_OFFSET    4
 #define DIR_OFFSET     3
 
-/* ERSPAN version 2 metadata header */
-struct erspan_md2 {
-	__be32 timestamp;
-	__be16 sgt;	/* security group tag */
-#if defined(__LITTLE_ENDIAN_BITFIELD)
-	__u8	hwid_upper:2,
-		ft:5,
-		p:1;
-	__u8	o:1,
-		gra:2,
-		dir:1,
-		hwid:4;
-#elif defined(__BIG_ENDIAN_BITFIELD)
-	__u8	p:1,
-		ft:5,
-		hwid_upper:2;
-	__u8	hwid:4,
-		dir:1,
-		gra:2,
-		o:1;
-#endif
-};
-
 enum erspan_encap_type {
 	ERSPAN_ENCAP_NOVLAN = 0x0,	/* originally without VLAN tag */
 	ERSPAN_ENCAP_ISL = 0x1,		/* originally ISL encapsulated */
@@ -100,13 +79,6 @@ enum erspan_encap_type {
 
 #define ERSPAN_V1_MDSIZE	4
 #define ERSPAN_V2_MDSIZE	8
-struct erspan_metadata {
-	union {
-		__be32 index;		/* Version 1 (type II)*/
-		struct erspan_md2 md2;	/* Version 2 (type III) */
-	} u;
-	int version;
-};
 
 struct erspan_base_hdr {
 #if defined(__LITTLE_ENDIAN_BITFIELD)
diff --git a/include/uapi/linux/erspan.h b/include/uapi/linux/erspan.h
new file mode 100644
index 000000000000..841573019ae1
--- /dev/null
+++ b/include/uapi/linux/erspan.h
@@ -0,0 +1,52 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * ERSPAN Tunnel Metadata
+ *
+ * Copyright (c) 2018 VMware
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * Userspace API for metadata mode ERSPAN tunnel
+ */
+#ifndef _UAPI_ERSPAN_H
+#define _UAPI_ERSPAN_H
+
+#include <linux/types.h>	/* For __beXX in userspace */
+#include <asm/byteorder.h>
+
+/* ERSPAN version 2 metadata header */
+struct erspan_md2 {
+	__be32 timestamp;
+	__be16 sgt;	/* security group tag */
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	__u8	hwid_upper:2,
+		ft:5,
+		p:1;
+	__u8	o:1,
+		gra:2,
+		dir:1,
+		hwid:4;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+	__u8	p:1,
+		ft:5,
+		hwid_upper:2;
+	__u8	hwid:4,
+		dir:1,
+		gra:2,
+		o:1;
+#else
+#error "Please fix <asm/byteorder.h>"
+#endif
+};
+
+struct erspan_metadata {
+	int version;
+	union {
+		__be32 index;		/* Version 1 (type II)*/
+		struct erspan_md2 md2;	/* Version 2 (type III) */
+	} u;
+};
+
+#endif /* _UAPI_ERSPAN_H */
-- 
cgit v1.2.3


From fc1372f89ffe1f58b589643b75f679e452350703 Mon Sep 17 00:00:00 2001
From: William Tu <u9012063@gmail.com>
Date: Thu, 25 Jan 2018 13:20:11 -0800
Subject: openvswitch: add erspan version I and II support

The patch adds support for openvswitch to configure erspan
v1 and v2.  The OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS attr is added
to uapi as a binary blob to support all ERSPAN v1 and v2's
fields.  Note that the previous commit "openvswitch: Add erspan tunnel
support." was reverted because it was not designed properly.

Signed-off-by: William Tu <u9012063@gmail.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/openvswitch.h |  1 +
 net/openvswitch/flow_netlink.c   | 52 +++++++++++++++++++++++++++++++++++++++-
 2 files changed, 52 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index dcfab5e3b55c..713e56ce681f 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -363,6 +363,7 @@ enum ovs_tunnel_key_attr {
 	OVS_TUNNEL_KEY_ATTR_IPV6_SRC,		/* struct in6_addr src IPv6 address. */
 	OVS_TUNNEL_KEY_ATTR_IPV6_DST,		/* struct in6_addr dst IPv6 address. */
 	OVS_TUNNEL_KEY_ATTR_PAD,
+	OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS,	/* struct erspan_metadata */
 	__OVS_TUNNEL_KEY_ATTR_MAX
 };
 
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index eb55f1b3d047..7322aa1e382e 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -49,6 +49,7 @@
 #include <net/mpls.h>
 #include <net/vxlan.h>
 #include <net/tun_proto.h>
+#include <net/erspan.h>
 
 #include "flow_netlink.h"
 
@@ -329,7 +330,8 @@ size_t ovs_tun_key_attr_size(void)
 		+ nla_total_size(0)    /* OVS_TUNNEL_KEY_ATTR_CSUM */
 		+ nla_total_size(0)    /* OVS_TUNNEL_KEY_ATTR_OAM */
 		+ nla_total_size(256)  /* OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS */
-		/* OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS is mutually exclusive with
+		/* OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS and
+		 * OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS is mutually exclusive with
 		 * OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS and covered by it.
 		 */
 		+ nla_total_size(2)    /* OVS_TUNNEL_KEY_ATTR_TP_SRC */
@@ -400,6 +402,7 @@ static const struct ovs_len_tbl ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1]
 						.next = ovs_vxlan_ext_key_lens },
 	[OVS_TUNNEL_KEY_ATTR_IPV6_SRC]      = { .len = sizeof(struct in6_addr) },
 	[OVS_TUNNEL_KEY_ATTR_IPV6_DST]      = { .len = sizeof(struct in6_addr) },
+	[OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS]   = { .len = OVS_ATTR_VARIABLE },
 };
 
 static const struct ovs_len_tbl
@@ -631,6 +634,33 @@ static int vxlan_tun_opt_from_nlattr(const struct nlattr *attr,
 	return 0;
 }
 
+static int erspan_tun_opt_from_nlattr(const struct nlattr *a,
+				      struct sw_flow_match *match, bool is_mask,
+				      bool log)
+{
+	unsigned long opt_key_offset;
+
+	BUILD_BUG_ON(sizeof(struct erspan_metadata) >
+		     sizeof(match->key->tun_opts));
+
+	if (nla_len(a) > sizeof(match->key->tun_opts)) {
+		OVS_NLERR(log, "ERSPAN option length err (len %d, max %zu).",
+			  nla_len(a), sizeof(match->key->tun_opts));
+		return -EINVAL;
+	}
+
+	if (!is_mask)
+		SW_FLOW_KEY_PUT(match, tun_opts_len,
+				sizeof(struct erspan_metadata), false);
+	else
+		SW_FLOW_KEY_PUT(match, tun_opts_len, 0xff, true);
+
+	opt_key_offset = TUN_METADATA_OFFSET(nla_len(a));
+	SW_FLOW_KEY_MEMCPY_OFFSET(match, opt_key_offset, nla_data(a),
+				  nla_len(a), is_mask);
+	return 0;
+}
+
 static int ip_tun_from_nlattr(const struct nlattr *attr,
 			      struct sw_flow_match *match, bool is_mask,
 			      bool log)
@@ -738,6 +768,20 @@ static int ip_tun_from_nlattr(const struct nlattr *attr,
 			break;
 		case OVS_TUNNEL_KEY_ATTR_PAD:
 			break;
+		case OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS:
+			if (opts_type) {
+				OVS_NLERR(log, "Multiple metadata blocks provided");
+				return -EINVAL;
+			}
+
+			err = erspan_tun_opt_from_nlattr(a, match, is_mask,
+							 log);
+			if (err)
+				return err;
+
+			tun_flags |= TUNNEL_ERSPAN_OPT;
+			opts_type = type;
+			break;
 		default:
 			OVS_NLERR(log, "Unknown IP tunnel attribute %d",
 				  type);
@@ -862,6 +906,10 @@ static int __ip_tun_to_nlattr(struct sk_buff *skb,
 		else if (output->tun_flags & TUNNEL_VXLAN_OPT &&
 			 vxlan_opt_to_nlattr(skb, tun_opts, swkey_tun_opts_len))
 			return -EMSGSIZE;
+		else if (output->tun_flags & TUNNEL_ERSPAN_OPT &&
+			 nla_put(skb, OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS,
+				 swkey_tun_opts_len, tun_opts))
+			return -EMSGSIZE;
 	}
 
 	return 0;
@@ -2486,6 +2534,8 @@ static int validate_and_copy_set_tun(const struct nlattr *attr,
 			break;
 		case OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS:
 			break;
+		case OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS:
+			break;
 		}
 	}
 
-- 
cgit v1.2.3


From 38e01b30563a5b5ade7b54e5d739d16a2b02fe82 Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Thu, 25 Jan 2018 15:01:39 +0100
Subject: dev: advertise the new ifindex when the netns iface changes

The goal is to let the user follow an interface that moves to another
netns.

CC: Jiri Benc <jbenc@redhat.com>
CC: Christian Brauner <christian.brauner@ubuntu.com>
Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Reviewed-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/rtnetlink.h    |  5 +++--
 include/uapi/linux/if_link.h |  1 +
 net/core/dev.c               | 19 ++++++++++++-------
 net/core/rtnetlink.c         | 31 ++++++++++++++++++++-----------
 4 files changed, 36 insertions(+), 20 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 62d508b31f56..0514cc36ac34 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -19,10 +19,11 @@ extern int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst,
 
 void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change, gfp_t flags);
 void rtmsg_ifinfo_newnet(int type, struct net_device *dev, unsigned int change,
-			 gfp_t flags, int *new_nsid);
+			 gfp_t flags, int *new_nsid, int new_ifindex);
 struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev,
 				       unsigned change, u32 event,
-				       gfp_t flags, int *new_nsid);
+				       gfp_t flags, int *new_nsid,
+				       int new_ifindex);
 void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev,
 		       gfp_t flags);
 
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 8616131e2c61..6d9447700e18 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -163,6 +163,7 @@ enum {
 	IFLA_IF_NETNSID,
 	IFLA_CARRIER_UP_COUNT,
 	IFLA_CARRIER_DOWN_COUNT,
+	IFLA_NEW_IFINDEX,
 	__IFLA_MAX
 };
 
diff --git a/net/core/dev.c b/net/core/dev.c
index 59987eb6511a..858501b12869 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -7360,7 +7360,7 @@ static void rollback_registered_many(struct list_head *head)
 		if (!dev->rtnl_link_ops ||
 		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
 			skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,
-						     GFP_KERNEL, NULL);
+						     GFP_KERNEL, NULL, 0);
 
 		/*
 		 *	Flush the unicast and multicast chains
@@ -8473,7 +8473,7 @@ EXPORT_SYMBOL(unregister_netdev);
 
 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
 {
-	int err, new_nsid;
+	int err, new_nsid, new_ifindex;
 
 	ASSERT_RTNL();
 
@@ -8529,8 +8529,16 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
 	rcu_barrier();
 	call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
+
 	new_nsid = peernet2id_alloc(dev_net(dev), net);
-	rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid);
+	/* If there is an ifindex conflict assign a new one */
+	if (__dev_get_by_index(net, dev->ifindex))
+		new_ifindex = dev_new_index(net);
+	else
+		new_ifindex = dev->ifindex;
+
+	rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid,
+			    new_ifindex);
 
 	/*
 	 *	Flush the unicast and multicast chains
@@ -8544,10 +8552,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
 
 	/* Actually switch the network namespace */
 	dev_net_set(dev, net);
-
-	/* If there is an ifindex conflict assign a new one */
-	if (__dev_get_by_index(net, dev->ifindex))
-		dev->ifindex = dev_new_index(net);
+	dev->ifindex = new_ifindex;
 
 	/* Send a netdev-add uevent to the new namespace */
 	kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index f111557958bb..e04af7b7f448 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -988,6 +988,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
 	       + rtnl_xdp_size() /* IFLA_XDP */
 	       + nla_total_size(4)  /* IFLA_EVENT */
 	       + nla_total_size(4)  /* IFLA_NEW_NETNSID */
+	       + nla_total_size(4)  /* IFLA_NEW_IFINDEX */
 	       + nla_total_size(1)  /* IFLA_PROTO_DOWN */
 	       + nla_total_size(4)  /* IFLA_IF_NETNSID */
 	       + nla_total_size(4)  /* IFLA_CARRIER_UP_COUNT */
@@ -1511,7 +1512,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,
 			    struct net_device *dev, struct net *src_net,
 			    int type, u32 pid, u32 seq, u32 change,
 			    unsigned int flags, u32 ext_filter_mask,
-			    u32 event, int *new_nsid, int tgt_netnsid)
+			    u32 event, int *new_nsid, int new_ifindex,
+			    int tgt_netnsid)
 {
 	struct ifinfomsg *ifm;
 	struct nlmsghdr *nlh;
@@ -1608,6 +1610,10 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,
 	if (new_nsid &&
 	    nla_put_s32(skb, IFLA_NEW_NETNSID, *new_nsid) < 0)
 		goto nla_put_failure;
+	if (new_ifindex &&
+	    nla_put_s32(skb, IFLA_NEW_IFINDEX, new_ifindex) < 0)
+		goto nla_put_failure;
+
 
 	rcu_read_lock();
 	if (rtnl_fill_link_af(skb, dev, ext_filter_mask))
@@ -1853,7 +1859,7 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
 					       NETLINK_CB(cb->skb).portid,
 					       cb->nlh->nlmsg_seq, 0,
 					       flags,
-					       ext_filter_mask, 0, NULL,
+					       ext_filter_mask, 0, NULL, 0,
 					       netnsid);
 
 			if (err < 0) {
@@ -3088,7 +3094,7 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh,
 	err = rtnl_fill_ifinfo(nskb, dev, net,
 			       RTM_NEWLINK, NETLINK_CB(skb).portid,
 			       nlh->nlmsg_seq, 0, 0, ext_filter_mask,
-			       0, NULL, netnsid);
+			       0, NULL, 0, netnsid);
 	if (err < 0) {
 		/* -EMSGSIZE implies BUG in if_nlmsg_size */
 		WARN_ON(err == -EMSGSIZE);
@@ -3184,7 +3190,8 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
 
 struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev,
 				       unsigned int change,
-				       u32 event, gfp_t flags, int *new_nsid)
+				       u32 event, gfp_t flags, int *new_nsid,
+				       int new_ifindex)
 {
 	struct net *net = dev_net(dev);
 	struct sk_buff *skb;
@@ -3197,7 +3204,7 @@ struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev,
 
 	err = rtnl_fill_ifinfo(skb, dev, dev_net(dev),
 			       type, 0, 0, change, 0, 0, event,
-			       new_nsid, -1);
+			       new_nsid, new_ifindex, -1);
 	if (err < 0) {
 		/* -EMSGSIZE implies BUG in if_nlmsg_size() */
 		WARN_ON(err == -EMSGSIZE);
@@ -3220,14 +3227,15 @@ void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags)
 
 static void rtmsg_ifinfo_event(int type, struct net_device *dev,
 			       unsigned int change, u32 event,
-			       gfp_t flags, int *new_nsid)
+			       gfp_t flags, int *new_nsid, int new_ifindex)
 {
 	struct sk_buff *skb;
 
 	if (dev->reg_state != NETREG_REGISTERED)
 		return;
 
-	skb = rtmsg_ifinfo_build_skb(type, dev, change, event, flags, new_nsid);
+	skb = rtmsg_ifinfo_build_skb(type, dev, change, event, flags, new_nsid,
+				     new_ifindex);
 	if (skb)
 		rtmsg_ifinfo_send(skb, dev, flags);
 }
@@ -3235,14 +3243,15 @@ static void rtmsg_ifinfo_event(int type, struct net_device *dev,
 void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change,
 		  gfp_t flags)
 {
-	rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags, NULL);
+	rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags,
+			   NULL, 0);
 }
 
 void rtmsg_ifinfo_newnet(int type, struct net_device *dev, unsigned int change,
-			 gfp_t flags, int *new_nsid)
+			 gfp_t flags, int *new_nsid, int new_ifindex)
 {
 	rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags,
-			   new_nsid);
+			   new_nsid, new_ifindex);
 }
 
 static int nlmsg_populate_fdb_fill(struct sk_buff *skb,
@@ -4642,7 +4651,7 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi
 	case NETDEV_CHANGELOWERSTATE:
 	case NETDEV_CHANGE_TX_QUEUE_LEN:
 		rtmsg_ifinfo_event(RTM_NEWLINK, dev, 0, rtnl_get_event(event),
-				   GFP_KERNEL, NULL);
+				   GFP_KERNEL, NULL, 0);
 		break;
 	default:
 		break;
-- 
cgit v1.2.3


From 01060e3d4e423146ecf9d308814e16a357671ddf Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Tue, 16 Jan 2018 17:37:50 -0600
Subject: PCI/DPC: Add and use DPC Status register field definitions

Add definitions for DPC Status register fields and use them in the code.
No functional change intended.

Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Keith Busch <keith.busch@intel.com>
Reviewed-by: Sinan Kaya <okaya@codeaurora.org>
---
 drivers/pci/pcie/pcie-dpc.c   | 4 ++--
 include/uapi/linux/pci_regs.h | 2 ++
 2 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/pci/pcie/pcie-dpc.c b/drivers/pci/pcie/pcie-dpc.c
index 0f4eb6111ab4..7eb9bc7d4bfd 100644
--- a/drivers/pci/pcie/pcie-dpc.c
+++ b/drivers/pci/pcie/pcie-dpc.c
@@ -211,8 +211,8 @@ static irqreturn_t dpc_irq(int irq, void *context)
 	dev_info(dev, "DPC containment event, status:%#06x source:%#06x\n",
 		status, source);
 
-	reason = (status >> 1) & 0x3;
-	ext_reason = (status >> 5) & 0x3;
+	reason = (status & PCI_EXP_DPC_STATUS_TRIGGER_RSN) >> 1;
+	ext_reason = (status & PCI_EXP_DPC_STATUS_TRIGGER_RSN_EXT) >> 5;
 
 	dev_warn(dev, "DPC %s detected, remove downstream devices\n",
 		 (reason == 0) ? "unmasked uncorrectable error" :
diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
index 70c2b2ade048..970a0dc535c4 100644
--- a/include/uapi/linux/pci_regs.h
+++ b/include/uapi/linux/pci_regs.h
@@ -979,8 +979,10 @@
 
 #define PCI_EXP_DPC_STATUS		8	/* DPC Status */
 #define  PCI_EXP_DPC_STATUS_TRIGGER	0x01	/* Trigger Status */
+#define  PCI_EXP_DPC_STATUS_TRIGGER_RSN	0x06	/* Trigger Reason */
 #define  PCI_EXP_DPC_STATUS_INTERRUPT	0x08	/* Interrupt Status */
 #define  PCI_EXP_DPC_RP_BUSY		0x10	/* Root Port Busy */
+#define  PCI_EXP_DPC_STATUS_TRIGGER_RSN_EXT 0x60 /* Trig Reason Extension */
 
 #define PCI_EXP_DPC_SOURCE_ID		10	/* DPC Source Identifier */
 
-- 
cgit v1.2.3


From 65d5e9135f3b0281b97f1749d10dcfc7233e65ab Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Tue, 16 Jan 2018 17:34:12 -0600
Subject: PCI/DPC: Reformat DPC register definitions

Reformat DPC register definitions to follow the convention that register
field masks indicate the register width, e.g., a field of a 16-bit register
uses a mask of 4 hex digits, with leading zeros included as needed.
No functional change intended.

Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Keith Busch <keith.busch@intel.com>
Reviewed-by: Sinan Kaya <okaya@codeaurora.org>
---
 include/uapi/linux/pci_regs.h | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
index 970a0dc535c4..66d71461d2f0 100644
--- a/include/uapi/linux/pci_regs.h
+++ b/include/uapi/linux/pci_regs.h
@@ -966,28 +966,28 @@
 
 /* Downstream Port Containment */
 #define PCI_EXP_DPC_CAP			4	/* DPC Capability */
-#define PCI_EXP_DPC_IRQ			0x1f	/* DPC Interrupt Message Number */
-#define  PCI_EXP_DPC_CAP_RP_EXT		0x20	/* Root Port Extensions for DPC */
-#define  PCI_EXP_DPC_CAP_POISONED_TLP	0x40	/* Poisoned TLP Egress Blocking Supported */
-#define  PCI_EXP_DPC_CAP_SW_TRIGGER	0x80	/* Software Triggering Supported */
-#define  PCI_EXP_DPC_RP_PIO_LOG_SIZE	0xF00	/* RP PIO log size */
+#define PCI_EXP_DPC_IRQ			0x001F	/* Interrupt Message Number */
+#define  PCI_EXP_DPC_CAP_RP_EXT		0x0020	/* Root Port Extensions */
+#define  PCI_EXP_DPC_CAP_POISONED_TLP	0x0040	/* Poisoned TLP Egress Blocking Supported */
+#define  PCI_EXP_DPC_CAP_SW_TRIGGER	0x0080	/* Software Triggering Supported */
+#define  PCI_EXP_DPC_RP_PIO_LOG_SIZE	0x0F00	/* RP PIO Log Size */
 #define  PCI_EXP_DPC_CAP_DL_ACTIVE	0x1000	/* ERR_COR signal on DL_Active supported */
 
 #define PCI_EXP_DPC_CTL			6	/* DPC control */
-#define  PCI_EXP_DPC_CTL_EN_NONFATAL 	0x02	/* Enable trigger on ERR_NONFATAL message */
-#define  PCI_EXP_DPC_CTL_INT_EN 	0x08	/* DPC Interrupt Enable */
+#define  PCI_EXP_DPC_CTL_EN_NONFATAL 	0x0002	/* Enable trigger on ERR_NONFATAL message */
+#define  PCI_EXP_DPC_CTL_INT_EN 	0x0008	/* DPC Interrupt Enable */
 
 #define PCI_EXP_DPC_STATUS		8	/* DPC Status */
-#define  PCI_EXP_DPC_STATUS_TRIGGER	0x01	/* Trigger Status */
-#define  PCI_EXP_DPC_STATUS_TRIGGER_RSN	0x06	/* Trigger Reason */
-#define  PCI_EXP_DPC_STATUS_INTERRUPT	0x08	/* Interrupt Status */
-#define  PCI_EXP_DPC_RP_BUSY		0x10	/* Root Port Busy */
-#define  PCI_EXP_DPC_STATUS_TRIGGER_RSN_EXT 0x60 /* Trig Reason Extension */
+#define  PCI_EXP_DPC_STATUS_TRIGGER	    0x0001 /* Trigger Status */
+#define  PCI_EXP_DPC_STATUS_TRIGGER_RSN	    0x0006 /* Trigger Reason */
+#define  PCI_EXP_DPC_STATUS_INTERRUPT	    0x0008 /* Interrupt Status */
+#define  PCI_EXP_DPC_RP_BUSY		    0x0010 /* Root Port Busy */
+#define  PCI_EXP_DPC_STATUS_TRIGGER_RSN_EXT 0x0060 /* Trig Reason Extension */
 
 #define PCI_EXP_DPC_SOURCE_ID		10	/* DPC Source Identifier */
 
 #define PCI_EXP_DPC_RP_PIO_STATUS	 0x0C	/* RP PIO Status */
-#define PCI_EXP_DPC_RP_PIO_MASK		 0x10	/* RP PIO MASK */
+#define PCI_EXP_DPC_RP_PIO_MASK		 0x10	/* RP PIO Mask */
 #define PCI_EXP_DPC_RP_PIO_SEVERITY	 0x14	/* RP PIO Severity */
 #define PCI_EXP_DPC_RP_PIO_SYSERROR	 0x18	/* RP PIO SysError */
 #define PCI_EXP_DPC_RP_PIO_EXCEPTION	 0x1C	/* RP PIO Exception */
-- 
cgit v1.2.3


From 4d32029b8ddb7be4d1699c6d8e1675ff5476d149 Mon Sep 17 00:00:00 2001
From: Tomáš Golembiovský <tgolembi@redhat.com>
Date: Sun, 12 Nov 2017 13:05:38 +0100
Subject: virtio_balloon: include disk/file caches memory statistics
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a new field VIRTIO_BALLOON_S_CACHES to virtio_balloon memory
statistics protocol. The value represents all disk/file caches.

In this case it corresponds to the sum of values
Buffers+Cached+SwapCached from /proc/meminfo.

Signed-off-by: Tomáš Golembiovský <tgolembi@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/virtio/virtio_balloon.c     | 4 ++++
 include/uapi/linux/virtio_balloon.h | 3 ++-
 2 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index a1fb52cb3f0a..dfe5684000be 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -257,11 +257,13 @@ static unsigned int update_balloon_stats(struct virtio_balloon *vb)
 	struct sysinfo i;
 	unsigned int idx = 0;
 	long available;
+	unsigned long caches;
 
 	all_vm_events(events);
 	si_meminfo(&i);
 
 	available = si_mem_available();
+	caches = global_node_page_state(NR_FILE_PAGES);
 
 #ifdef CONFIG_VM_EVENT_COUNTERS
 	update_stat(vb, idx++, VIRTIO_BALLOON_S_SWAP_IN,
@@ -277,6 +279,8 @@ static unsigned int update_balloon_stats(struct virtio_balloon *vb)
 				pages_to_bytes(i.totalram));
 	update_stat(vb, idx++, VIRTIO_BALLOON_S_AVAIL,
 				pages_to_bytes(available));
+	update_stat(vb, idx++, VIRTIO_BALLOON_S_CACHES,
+				pages_to_bytes(caches));
 
 	return idx;
 }
diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h
index 343d7ddefe04..4e8b8304b793 100644
--- a/include/uapi/linux/virtio_balloon.h
+++ b/include/uapi/linux/virtio_balloon.h
@@ -52,7 +52,8 @@ struct virtio_balloon_config {
 #define VIRTIO_BALLOON_S_MEMFREE  4   /* Total amount of free memory */
 #define VIRTIO_BALLOON_S_MEMTOT   5   /* Total amount of memory */
 #define VIRTIO_BALLOON_S_AVAIL    6   /* Available memory as in /proc */
-#define VIRTIO_BALLOON_S_NR       7
+#define VIRTIO_BALLOON_S_CACHES   7   /* Disk caches */
+#define VIRTIO_BALLOON_S_NR       8
 
 /*
  * Memory statistics structure.
-- 
cgit v1.2.3


From 65aaf87b3aa2d049c6b9fd85221858a895df3393 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 1 Feb 2018 11:00:50 -0500
Subject: add EPOLLNVAL, annotate EPOLL... and event_poll->event

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/uapi/linux/eventpoll.h | 33 +++++++++++++++++----------------
 1 file changed, 17 insertions(+), 16 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/eventpoll.h b/include/uapi/linux/eventpoll.h
index 63e21be30f15..bf48e71f2634 100644
--- a/include/uapi/linux/eventpoll.h
+++ b/include/uapi/linux/eventpoll.h
@@ -28,20 +28,21 @@
 #define EPOLL_CTL_MOD 3
 
 /* Epoll event masks */
-#define EPOLLIN		0x00000001
-#define EPOLLPRI	0x00000002
-#define EPOLLOUT	0x00000004
-#define EPOLLERR	0x00000008
-#define EPOLLHUP	0x00000010
-#define EPOLLRDNORM	0x00000040
-#define EPOLLRDBAND	0x00000080
-#define EPOLLWRNORM	0x00000100
-#define EPOLLWRBAND	0x00000200
-#define EPOLLMSG	0x00000400
-#define EPOLLRDHUP	0x00002000
+#define EPOLLIN		(__force __poll_t)0x00000001
+#define EPOLLPRI	(__force __poll_t)0x00000002
+#define EPOLLOUT	(__force __poll_t)0x00000004
+#define EPOLLERR	(__force __poll_t)0x00000008
+#define EPOLLHUP	(__force __poll_t)0x00000010
+#define EPOLLNVAL	(__force __poll_t)0x00000020
+#define EPOLLRDNORM	(__force __poll_t)0x00000040
+#define EPOLLRDBAND	(__force __poll_t)0x00000080
+#define EPOLLWRNORM	(__force __poll_t)0x00000100
+#define EPOLLWRBAND	(__force __poll_t)0x00000200
+#define EPOLLMSG	(__force __poll_t)0x00000400
+#define EPOLLRDHUP	(__force __poll_t)0x00002000
 
 /* Set exclusive wakeup mode for the target file descriptor */
-#define EPOLLEXCLUSIVE (1U << 28)
+#define EPOLLEXCLUSIVE (__force __poll_t)(1U << 28)
 
 /*
  * Request the handling of system wakeup events so as to prevent system suspends
@@ -53,13 +54,13 @@
  *
  * Requires CAP_BLOCK_SUSPEND
  */
-#define EPOLLWAKEUP (1U << 29)
+#define EPOLLWAKEUP (__force __poll_t)(1U << 29)
 
 /* Set the One Shot behaviour for the target file descriptor */
-#define EPOLLONESHOT (1U << 30)
+#define EPOLLONESHOT (__force __poll_t)(1U << 30)
 
 /* Set the Edge Triggered behaviour for the target file descriptor */
-#define EPOLLET (1U << 31)
+#define EPOLLET (__force __poll_t)(1U << 31)
 
 /* 
  * On x86-64 make the 64bit structure have the same alignment as the
@@ -74,7 +75,7 @@
 #endif
 
 struct epoll_event {
-	__u32 events;
+	__poll_t events;
 	__u64 data;
 } EPOLL_PACKED;
 
-- 
cgit v1.2.3


From f2ba5a5baecf795c2150826bd0c95fc3f7f3d226 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 1 Feb 2018 21:27:22 -0800
Subject: libnvdimm, namespace: make min namespace size 4K

The arbitrary 4MB minimum namespace size turns out to be too large for
some environments. Quoting Cheng-mean Liu:

    In the case of emulated NVDIMM devices in the VM environment, there
    are scenarios that NVDIMM device with much smaller sizes are
    desired, for example, we might use a single enumerated NVDIMM DAX
    device for representing each container layer, which in some cases
    could be just a few KBs size.

PAGE_SIZE is the minimum where we can still support DAX of at least
a single page.

Cc: Matthew Wilcox <willy@infradead.org>
Reported-by: Cheng-mean Liu <soccerl@microsoft.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/uapi/linux/ndctl.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/ndctl.h b/include/uapi/linux/ndctl.h
index 30ef1236aafa..7e27070b9440 100644
--- a/include/uapi/linux/ndctl.h
+++ b/include/uapi/linux/ndctl.h
@@ -209,7 +209,7 @@ enum nd_driver_flags {
 };
 
 enum {
-	ND_MIN_NAMESPACE_SIZE = 0x00400000,
+	ND_MIN_NAMESPACE_SIZE = PAGE_SIZE,
 };
 
 enum ars_masks {
-- 
cgit v1.2.3


From c5f58bd58f432be5d92df33c5458e0bcbee3aadf Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Mon, 29 Jan 2018 15:20:13 -0500
Subject: membarrier: Provide GLOBAL_EXPEDITED command

Allow expedited membarrier to be used for data shared between processes
through shared memory.

Processes wishing to receive the membarriers register with
MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED. Those which want to issue
membarrier invoke MEMBARRIER_CMD_GLOBAL_EXPEDITED.

This allows an extremely simple kernel-level implementation: we have almost
everything we need with the PRIVATE_EXPEDITED barrier code. All we need
to do is to add a flag in the mm_struct that will be used to check
whether we need to send the IPI to the current thread of each CPU.

There is a slight downside to this approach compared to targeting
specific shared memory users: when performing a membarrier operation,
all registered "global" receivers will get the barrier, even if they
don't share a memory mapping with the sender issuing
MEMBARRIER_CMD_GLOBAL_EXPEDITED.

This registration approach seems to fit the requirement of not
disturbing processes that really deeply care about real-time: they
simply should not register with MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED.

In order to align the membarrier command names, the "MEMBARRIER_CMD_SHARED"
command is renamed to "MEMBARRIER_CMD_GLOBAL", keeping an alias of
MEMBARRIER_CMD_SHARED to MEMBARRIER_CMD_GLOBAL for UAPI header backward
compatibility.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrea Parri <parri.andrea@gmail.com>
Cc: Andrew Hunter <ahh@google.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Avi Kivity <avi@scylladb.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Dave Watson <davejwatson@fb.com>
Cc: David Sehr <sehr@google.com>
Cc: Greg Hackmann <ghackmann@google.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Maged Michael <maged.michael@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Will Deacon <will.deacon@arm.com>
Cc: linux-api@vger.kernel.org
Link: http://lkml.kernel.org/r/20180129202020.8515-5-mathieu.desnoyers@efficios.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/powerpc/include/asm/membarrier.h |   3 +-
 include/linux/sched/mm.h              |   6 +-
 include/uapi/linux/membarrier.h       |  42 ++++++++++--
 kernel/sched/membarrier.c             | 120 +++++++++++++++++++++++++++++++---
 4 files changed, 153 insertions(+), 18 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/arch/powerpc/include/asm/membarrier.h b/arch/powerpc/include/asm/membarrier.h
index 98ff4f1fcf2b..6e20bb5c74ea 100644
--- a/arch/powerpc/include/asm/membarrier.h
+++ b/arch/powerpc/include/asm/membarrier.h
@@ -13,7 +13,8 @@ static inline void membarrier_arch_switch_mm(struct mm_struct *prev,
 	 * store to rq->curr.
 	 */
 	if (likely(!(atomic_read(&next->membarrier_state) &
-		     MEMBARRIER_STATE_PRIVATE_EXPEDITED) || !prev))
+		     (MEMBARRIER_STATE_PRIVATE_EXPEDITED |
+		      MEMBARRIER_STATE_GLOBAL_EXPEDITED)) || !prev))
 		return;
 
 	/*
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index b84e0fde1d72..1c4e40c5efaf 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -219,8 +219,10 @@ static inline void memalloc_noreclaim_restore(unsigned int flags)
 
 #ifdef CONFIG_MEMBARRIER
 enum {
-	MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY	= (1U << 0),
-	MEMBARRIER_STATE_PRIVATE_EXPEDITED		= (1U << 1),
+	MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY		= (1U << 0),
+	MEMBARRIER_STATE_PRIVATE_EXPEDITED			= (1U << 1),
+	MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY			= (1U << 2),
+	MEMBARRIER_STATE_GLOBAL_EXPEDITED			= (1U << 3),
 };
 
 #ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS
diff --git a/include/uapi/linux/membarrier.h b/include/uapi/linux/membarrier.h
index 4e01ad7ffe98..d252506e1b5e 100644
--- a/include/uapi/linux/membarrier.h
+++ b/include/uapi/linux/membarrier.h
@@ -31,7 +31,7 @@
  * enum membarrier_cmd - membarrier system call command
  * @MEMBARRIER_CMD_QUERY:   Query the set of supported commands. It returns
  *                          a bitmask of valid commands.
- * @MEMBARRIER_CMD_SHARED:  Execute a memory barrier on all running threads.
+ * @MEMBARRIER_CMD_GLOBAL:  Execute a memory barrier on all running threads.
  *                          Upon return from system call, the caller thread
  *                          is ensured that all running threads have passed
  *                          through a state where all memory accesses to
@@ -40,6 +40,28 @@
  *                          (non-running threads are de facto in such a
  *                          state). This covers threads from all processes
  *                          running on the system. This command returns 0.
+ * @MEMBARRIER_CMD_GLOBAL_EXPEDITED:
+ *                          Execute a memory barrier on all running threads
+ *                          of all processes which previously registered
+ *                          with MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED.
+ *                          Upon return from system call, the caller thread
+ *                          is ensured that all running threads have passed
+ *                          through a state where all memory accesses to
+ *                          user-space addresses match program order between
+ *                          entry to and return from the system call
+ *                          (non-running threads are de facto in such a
+ *                          state). This only covers threads from processes
+ *                          which registered with
+ *                          MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED.
+ *                          This command returns 0. Given that
+ *                          registration is about the intent to receive
+ *                          the barriers, it is valid to invoke
+ *                          MEMBARRIER_CMD_GLOBAL_EXPEDITED from a
+ *                          non-registered process.
+ * @MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
+ *                          Register the process intent to receive
+ *                          MEMBARRIER_CMD_GLOBAL_EXPEDITED memory
+ *                          barriers. Always returns 0.
  * @MEMBARRIER_CMD_PRIVATE_EXPEDITED:
  *                          Execute a memory barrier on each running
  *                          thread belonging to the same process as the current
@@ -64,18 +86,24 @@
  *                          Register the process intent to use
  *                          MEMBARRIER_CMD_PRIVATE_EXPEDITED. Always
  *                          returns 0.
+ * @MEMBARRIER_CMD_SHARED:
+ *                          Alias to MEMBARRIER_CMD_GLOBAL. Provided for
+ *                          header backward compatibility.
  *
  * Command to be passed to the membarrier system call. The commands need to
  * be a single bit each, except for MEMBARRIER_CMD_QUERY which is assigned to
  * the value 0.
  */
 enum membarrier_cmd {
-	MEMBARRIER_CMD_QUERY				= 0,
-	MEMBARRIER_CMD_SHARED				= (1 << 0),
-	/* reserved for MEMBARRIER_CMD_SHARED_EXPEDITED (1 << 1) */
-	/* reserved for MEMBARRIER_CMD_PRIVATE (1 << 2) */
-	MEMBARRIER_CMD_PRIVATE_EXPEDITED		= (1 << 3),
-	MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED	= (1 << 4),
+	MEMBARRIER_CMD_QUERY					= 0,
+	MEMBARRIER_CMD_GLOBAL					= (1 << 0),
+	MEMBARRIER_CMD_GLOBAL_EXPEDITED				= (1 << 1),
+	MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED		= (1 << 2),
+	MEMBARRIER_CMD_PRIVATE_EXPEDITED			= (1 << 3),
+	MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED		= (1 << 4),
+
+	/* Alias for header backward compatibility. */
+	MEMBARRIER_CMD_SHARED			= MEMBARRIER_CMD_GLOBAL,
 };
 
 #endif /* _UAPI_LINUX_MEMBARRIER_H */
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index 678577267a9a..d2087d5f9837 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -27,7 +27,9 @@
  * except MEMBARRIER_CMD_QUERY.
  */
 #define MEMBARRIER_CMD_BITMASK	\
-	(MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED	\
+	(MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED \
+	| MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \
+	| MEMBARRIER_CMD_PRIVATE_EXPEDITED	\
 	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED)
 
 static void ipi_mb(void *info)
@@ -35,6 +37,73 @@ static void ipi_mb(void *info)
 	smp_mb();	/* IPIs should be serializing but paranoid. */
 }
 
+static int membarrier_global_expedited(void)
+{
+	int cpu;
+	bool fallback = false;
+	cpumask_var_t tmpmask;
+
+	if (num_online_cpus() == 1)
+		return 0;
+
+	/*
+	 * Matches memory barriers around rq->curr modification in
+	 * scheduler.
+	 */
+	smp_mb();	/* system call entry is not a mb. */
+
+	/*
+	 * Expedited membarrier commands guarantee that they won't
+	 * block, hence the GFP_NOWAIT allocation flag and fallback
+	 * implementation.
+	 */
+	if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) {
+		/* Fallback for OOM. */
+		fallback = true;
+	}
+
+	cpus_read_lock();
+	for_each_online_cpu(cpu) {
+		struct task_struct *p;
+
+		/*
+		 * Skipping the current CPU is OK even though we can be
+		 * migrated at any point. The current CPU, at the point
+		 * where we read raw_smp_processor_id(), is ensured to
+		 * be in program order with respect to the caller
+		 * thread. Therefore, we can skip this CPU from the
+		 * iteration.
+		 */
+		if (cpu == raw_smp_processor_id())
+			continue;
+		rcu_read_lock();
+		p = task_rcu_dereference(&cpu_rq(cpu)->curr);
+		if (p && p->mm && (atomic_read(&p->mm->membarrier_state) &
+				   MEMBARRIER_STATE_GLOBAL_EXPEDITED)) {
+			if (!fallback)
+				__cpumask_set_cpu(cpu, tmpmask);
+			else
+				smp_call_function_single(cpu, ipi_mb, NULL, 1);
+		}
+		rcu_read_unlock();
+	}
+	if (!fallback) {
+		preempt_disable();
+		smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
+		preempt_enable();
+		free_cpumask_var(tmpmask);
+	}
+	cpus_read_unlock();
+
+	/*
+	 * Memory barrier on the caller thread _after_ we finished
+	 * waiting for the last IPI. Matches memory barriers around
+	 * rq->curr modification in scheduler.
+	 */
+	smp_mb();	/* exit from system call is not a mb */
+	return 0;
+}
+
 static int membarrier_private_expedited(void)
 {
 	int cpu;
@@ -105,7 +174,38 @@ static int membarrier_private_expedited(void)
 	return 0;
 }
 
-static void membarrier_register_private_expedited(void)
+static int membarrier_register_global_expedited(void)
+{
+	struct task_struct *p = current;
+	struct mm_struct *mm = p->mm;
+
+	if (atomic_read(&mm->membarrier_state) &
+	    MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY)
+		return 0;
+	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state);
+	if (atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1) {
+		/*
+		 * For single mm user, single threaded process, we can
+		 * simply issue a memory barrier after setting
+		 * MEMBARRIER_STATE_GLOBAL_EXPEDITED to guarantee that
+		 * no memory access following registration is reordered
+		 * before registration.
+		 */
+		smp_mb();
+	} else {
+		/*
+		 * For multi-mm user threads, we need to ensure all
+		 * future scheduler executions will observe the new
+		 * thread flag state for this mm.
+		 */
+		synchronize_sched();
+	}
+	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
+		  &mm->membarrier_state);
+	return 0;
+}
+
+static int membarrier_register_private_expedited(void)
 {
 	struct task_struct *p = current;
 	struct mm_struct *mm = p->mm;
@@ -117,7 +217,7 @@ static void membarrier_register_private_expedited(void)
 	 */
 	if (atomic_read(&mm->membarrier_state)
 			& MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY)
-		return;
+		return 0;
 	atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED, &mm->membarrier_state);
 	if (!(atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1)) {
 		/*
@@ -128,6 +228,7 @@ static void membarrier_register_private_expedited(void)
 	}
 	atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY,
 			&mm->membarrier_state);
+	return 0;
 }
 
 /**
@@ -167,21 +268,24 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
 		int cmd_mask = MEMBARRIER_CMD_BITMASK;
 
 		if (tick_nohz_full_enabled())
-			cmd_mask &= ~MEMBARRIER_CMD_SHARED;
+			cmd_mask &= ~MEMBARRIER_CMD_GLOBAL;
 		return cmd_mask;
 	}
-	case MEMBARRIER_CMD_SHARED:
-		/* MEMBARRIER_CMD_SHARED is not compatible with nohz_full. */
+	case MEMBARRIER_CMD_GLOBAL:
+		/* MEMBARRIER_CMD_GLOBAL is not compatible with nohz_full. */
 		if (tick_nohz_full_enabled())
 			return -EINVAL;
 		if (num_online_cpus() > 1)
 			synchronize_sched();
 		return 0;
+	case MEMBARRIER_CMD_GLOBAL_EXPEDITED:
+		return membarrier_global_expedited();
+	case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
+		return membarrier_register_global_expedited();
 	case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
 		return membarrier_private_expedited();
 	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
-		membarrier_register_private_expedited();
-		return 0;
+		return membarrier_register_private_expedited();
 	default:
 		return -EINVAL;
 	}
-- 
cgit v1.2.3


From 70216e18e519a54a2f13569e8caff99a092a92d6 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Mon, 29 Jan 2018 15:20:17 -0500
Subject: membarrier: Provide core serializing command, *_SYNC_CORE

Provide core serializing membarrier command to support memory reclaim
by JIT.

Each architecture needs to explicitly opt into that support by
documenting in their architecture code how they provide the core
serializing instructions required when returning from the membarrier
IPI, and after the scheduler has updated the curr->mm pointer (before
going back to user-space). They should then select
ARCH_HAS_MEMBARRIER_SYNC_CORE to enable support for that command on
their architecture.

Architectures selecting this feature need to either document that
they issue core serializing instructions when returning to user-space,
or implement their architecture-specific sync_core_before_usermode().

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrea Parri <parri.andrea@gmail.com>
Cc: Andrew Hunter <ahh@google.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Avi Kivity <avi@scylladb.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Dave Watson <davejwatson@fb.com>
Cc: David Sehr <sehr@google.com>
Cc: Greg Hackmann <ghackmann@google.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Maged Michael <maged.michael@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Will Deacon <will.deacon@arm.com>
Cc: linux-api@vger.kernel.org
Cc: linux-arch@vger.kernel.org
Link: http://lkml.kernel.org/r/20180129202020.8515-9-mathieu.desnoyers@efficios.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched/mm.h        | 18 ++++++++++++++
 include/uapi/linux/membarrier.h | 32 ++++++++++++++++++++++++-
 init/Kconfig                    |  3 +++
 kernel/sched/core.c             | 18 ++++++++++----
 kernel/sched/membarrier.c       | 53 +++++++++++++++++++++++++++++++----------
 5 files changed, 106 insertions(+), 18 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 1c4e40c5efaf..03a169087a18 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -7,6 +7,7 @@
 #include <linux/sched.h>
 #include <linux/mm_types.h>
 #include <linux/gfp.h>
+#include <linux/sync_core.h>
 
 /*
  * Routines for handling mm_structs
@@ -223,12 +224,26 @@ enum {
 	MEMBARRIER_STATE_PRIVATE_EXPEDITED			= (1U << 1),
 	MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY			= (1U << 2),
 	MEMBARRIER_STATE_GLOBAL_EXPEDITED			= (1U << 3),
+	MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY	= (1U << 4),
+	MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE		= (1U << 5),
+};
+
+enum {
+	MEMBARRIER_FLAG_SYNC_CORE	= (1U << 0),
 };
 
 #ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS
 #include <asm/membarrier.h>
 #endif
 
+static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
+{
+	if (likely(!(atomic_read(&mm->membarrier_state) &
+		     MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE)))
+		return;
+	sync_core_before_usermode();
+}
+
 static inline void membarrier_execve(struct task_struct *t)
 {
 	atomic_set(&t->mm->membarrier_state, 0);
@@ -244,6 +259,9 @@ static inline void membarrier_arch_switch_mm(struct mm_struct *prev,
 static inline void membarrier_execve(struct task_struct *t)
 {
 }
+static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
+{
+}
 #endif
 
 #endif /* _LINUX_SCHED_MM_H */
diff --git a/include/uapi/linux/membarrier.h b/include/uapi/linux/membarrier.h
index d252506e1b5e..5891d7614c8c 100644
--- a/include/uapi/linux/membarrier.h
+++ b/include/uapi/linux/membarrier.h
@@ -73,7 +73,7 @@
  *                          to and return from the system call
  *                          (non-running threads are de facto in such a
  *                          state). This only covers threads from the
- *                          same processes as the caller thread. This
+ *                          same process as the caller thread. This
  *                          command returns 0 on success. The
  *                          "expedited" commands complete faster than
  *                          the non-expedited ones, they never block,
@@ -86,6 +86,34 @@
  *                          Register the process intent to use
  *                          MEMBARRIER_CMD_PRIVATE_EXPEDITED. Always
  *                          returns 0.
+ * @MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
+ *                          In addition to providing the memory ordering
+ *                          guarantees described in
+ *                          MEMBARRIER_CMD_PRIVATE_EXPEDITED, guarantee
+ *                          to the caller thread, upon return from system
+ *                          call, that all its running thread siblings
+ *                          have executed a core serializing
+ *                          instruction. (architectures are required to
+ *                          guarantee that non-running threads issue
+ *                          core serializing instructions before they
+ *                          resume user-space execution). This only
+ *                          covers threads from the same process as the
+ *                          caller thread. This command returns 0 on
+ *                          success. The "expedited" commands complete
+ *                          faster than the non-expedited ones, they
+ *                          never block, but have the downside of
+ *                          causing extra overhead. If this command is
+ *                          not implemented by an architecture, -EINVAL
+ *                          is returned. A process needs to register its
+ *                          intent to use the private expedited sync
+ *                          core command prior to using it, otherwise
+ *                          this command returns -EPERM.
+ * @MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
+ *                          Register the process intent to use
+ *                          MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE.
+ *                          If this command is not implemented by an
+ *                          architecture, -EINVAL is returned.
+ *                          Returns 0 on success.
  * @MEMBARRIER_CMD_SHARED:
  *                          Alias to MEMBARRIER_CMD_GLOBAL. Provided for
  *                          header backward compatibility.
@@ -101,6 +129,8 @@ enum membarrier_cmd {
 	MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED		= (1 << 2),
 	MEMBARRIER_CMD_PRIVATE_EXPEDITED			= (1 << 3),
 	MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED		= (1 << 4),
+	MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE		= (1 << 5),
+	MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE	= (1 << 6),
 
 	/* Alias for header backward compatibility. */
 	MEMBARRIER_CMD_SHARED			= MEMBARRIER_CMD_GLOBAL,
diff --git a/init/Kconfig b/init/Kconfig
index 535421facea8..e37f4b2a6445 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1415,6 +1415,9 @@ config USERFAULTFD
 config ARCH_HAS_MEMBARRIER_CALLBACKS
 	bool
 
+config ARCH_HAS_MEMBARRIER_SYNC_CORE
+	bool
+
 config EMBEDDED
 	bool "Embedded system"
 	option allnoconfig_y
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 11bf4d48d2d3..ee420d78e674 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2704,13 +2704,21 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 
 	fire_sched_in_preempt_notifiers(current);
 	/*
-	 * When transitioning from a kernel thread to a userspace
-	 * thread, mmdrop()'s implicit full barrier is required by the
-	 * membarrier system call, because the current ->active_mm can
-	 * become the current mm without going through switch_mm().
+	 * When switching through a kernel thread, the loop in
+	 * membarrier_{private,global}_expedited() may have observed that
+	 * kernel thread and not issued an IPI. It is therefore possible to
+	 * schedule between user->kernel->user threads without passing through
+	 * switch_mm(). Membarrier requires a barrier after storing to
+	 * rq->curr, before returning to userspace, so provide them here:
+	 *
+	 * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly
+	 *   provided by mmdrop(),
+	 * - a sync_core for SYNC_CORE.
 	 */
-	if (mm)
+	if (mm) {
+		membarrier_mm_sync_core_before_usermode(mm);
 		mmdrop(mm);
+	}
 	if (unlikely(prev_state == TASK_DEAD)) {
 		if (prev->sched_class->task_dead)
 			prev->sched_class->task_dead(prev);
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index d2087d5f9837..5d0762633639 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -26,11 +26,20 @@
  * Bitmask made from a "or" of all commands within enum membarrier_cmd,
  * except MEMBARRIER_CMD_QUERY.
  */
+#ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE
+#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK	\
+	(MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE \
+	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE)
+#else
+#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK	0
+#endif
+
 #define MEMBARRIER_CMD_BITMASK	\
 	(MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED \
 	| MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \
 	| MEMBARRIER_CMD_PRIVATE_EXPEDITED	\
-	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED)
+	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED	\
+	| MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK)
 
 static void ipi_mb(void *info)
 {
@@ -104,15 +113,23 @@ static int membarrier_global_expedited(void)
 	return 0;
 }
 
-static int membarrier_private_expedited(void)
+static int membarrier_private_expedited(int flags)
 {
 	int cpu;
 	bool fallback = false;
 	cpumask_var_t tmpmask;
 
-	if (!(atomic_read(&current->mm->membarrier_state)
-			& MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
-		return -EPERM;
+	if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
+		if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
+			return -EINVAL;
+		if (!(atomic_read(&current->mm->membarrier_state) &
+		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
+			return -EPERM;
+	} else {
+		if (!(atomic_read(&current->mm->membarrier_state) &
+		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
+			return -EPERM;
+	}
 
 	if (num_online_cpus() == 1)
 		return 0;
@@ -205,20 +222,29 @@ static int membarrier_register_global_expedited(void)
 	return 0;
 }
 
-static int membarrier_register_private_expedited(void)
+static int membarrier_register_private_expedited(int flags)
 {
 	struct task_struct *p = current;
 	struct mm_struct *mm = p->mm;
+	int state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY;
+
+	if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
+		if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
+			return -EINVAL;
+		state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
+	}
 
 	/*
 	 * We need to consider threads belonging to different thread
 	 * groups, which use the same mm. (CLONE_VM but not
 	 * CLONE_THREAD).
 	 */
-	if (atomic_read(&mm->membarrier_state)
-			& MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY)
+	if (atomic_read(&mm->membarrier_state) & state)
 		return 0;
 	atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED, &mm->membarrier_state);
+	if (flags & MEMBARRIER_FLAG_SYNC_CORE)
+		atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE,
+			  &mm->membarrier_state);
 	if (!(atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1)) {
 		/*
 		 * Ensure all future scheduler executions will observe the
@@ -226,8 +252,7 @@ static int membarrier_register_private_expedited(void)
 		 */
 		synchronize_sched();
 	}
-	atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY,
-			&mm->membarrier_state);
+	atomic_or(state, &mm->membarrier_state);
 	return 0;
 }
 
@@ -283,9 +308,13 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
 	case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
 		return membarrier_register_global_expedited();
 	case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
-		return membarrier_private_expedited();
+		return membarrier_private_expedited(0);
 	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
-		return membarrier_register_private_expedited();
+		return membarrier_register_private_expedited(0);
+	case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
+		return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
+	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
+		return membarrier_register_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
 	default:
 		return -EINVAL;
 	}
-- 
cgit v1.2.3


From d0a144f12a7ca8368933eae6583c096c363ec506 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Tue, 6 Feb 2018 17:56:09 +0000
Subject: arm/arm64: KVM: Add PSCI_VERSION helper

As we're about to trigger a PSCI version explosion, it doesn't
hurt to introduce a PSCI_VERSION helper that is going to be
used everywhere.

Reviewed-by: Christoffer Dall <christoffer.dall@linaro.org>
Tested-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 include/kvm/arm_psci.h    | 6 ++++--
 include/uapi/linux/psci.h | 3 +++
 virt/kvm/arm/psci.c       | 4 +---
 3 files changed, 8 insertions(+), 5 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/kvm/arm_psci.h b/include/kvm/arm_psci.h
index 2042bb909474..5659343580a3 100644
--- a/include/kvm/arm_psci.h
+++ b/include/kvm/arm_psci.h
@@ -18,8 +18,10 @@
 #ifndef __KVM_ARM_PSCI_H__
 #define __KVM_ARM_PSCI_H__
 
-#define KVM_ARM_PSCI_0_1	1
-#define KVM_ARM_PSCI_0_2	2
+#include <uapi/linux/psci.h>
+
+#define KVM_ARM_PSCI_0_1	PSCI_VERSION(0, 1)
+#define KVM_ARM_PSCI_0_2	PSCI_VERSION(0, 2)
 
 int kvm_psci_version(struct kvm_vcpu *vcpu);
 int kvm_psci_call(struct kvm_vcpu *vcpu);
diff --git a/include/uapi/linux/psci.h b/include/uapi/linux/psci.h
index 760e52a9640f..b3bcabe380da 100644
--- a/include/uapi/linux/psci.h
+++ b/include/uapi/linux/psci.h
@@ -88,6 +88,9 @@
 		(((ver) & PSCI_VERSION_MAJOR_MASK) >> PSCI_VERSION_MAJOR_SHIFT)
 #define PSCI_VERSION_MINOR(ver)			\
 		((ver) & PSCI_VERSION_MINOR_MASK)
+#define PSCI_VERSION(maj, min)						\
+	((((maj) << PSCI_VERSION_MAJOR_SHIFT) & PSCI_VERSION_MAJOR_MASK) | \
+	 ((min) & PSCI_VERSION_MINOR_MASK))
 
 /* PSCI features decoding (>=1.0) */
 #define PSCI_1_0_FEATURES_CPU_SUSPEND_PF_SHIFT	1
diff --git a/virt/kvm/arm/psci.c b/virt/kvm/arm/psci.c
index b322e46fd142..999f94d6bb98 100644
--- a/virt/kvm/arm/psci.c
+++ b/virt/kvm/arm/psci.c
@@ -25,8 +25,6 @@
 
 #include <kvm/arm_psci.h>
 
-#include <uapi/linux/psci.h>
-
 /*
  * This is an implementation of the Power State Coordination Interface
  * as described in ARM document number ARM DEN 0022A.
@@ -222,7 +220,7 @@ static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu)
 		 * Bits[31:16] = Major Version = 0
 		 * Bits[15:0] = Minor Version = 2
 		 */
-		val = 2;
+		val = KVM_ARM_PSCI_0_2;
 		break;
 	case PSCI_0_2_FN_CPU_SUSPEND:
 	case PSCI_0_2_FN64_CPU_SUSPEND:
-- 
cgit v1.2.3


From dfbc3c6cb747c074aa2ba0a10bbeea588d6dfda6 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 6 Feb 2018 15:37:48 -0800
Subject: uuid: cleanup <uapi/linux/uuid.h>

Exported header doesn't use anything from <linux/string.h>,
it is <linux/uuid.h> which uses memcmp().

Link: http://lkml.kernel.org/r/20171225171121.GA22754@avx2
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Reviewed-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/uuid.h      | 1 +
 include/uapi/linux/uuid.h | 1 -
 2 files changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/uuid.h b/include/linux/uuid.h
index 33b0bdbb613c..d9c4a6cce3c2 100644
--- a/include/linux/uuid.h
+++ b/include/linux/uuid.h
@@ -17,6 +17,7 @@
 #define _LINUX_UUID_H_
 
 #include <uapi/linux/uuid.h>
+#include <linux/string.h>
 
 #define UUID_SIZE 16
 
diff --git a/include/uapi/linux/uuid.h b/include/uapi/linux/uuid.h
index 5c04130bb524..e5a7eecef7c3 100644
--- a/include/uapi/linux/uuid.h
+++ b/include/uapi/linux/uuid.h
@@ -19,7 +19,6 @@
 #define _UAPI_LINUX_UUID_H_
 
 #include <linux/types.h>
-#include <linux/string.h>
 
 typedef struct {
 	__u8 b[16];
-- 
cgit v1.2.3


From da360299b6734135a5f66d7db458dcc7801c826a Mon Sep 17 00:00:00 2001
From: Hauke Mehrtens <hauke@hauke-m.de>
Date: Mon, 12 Feb 2018 23:59:51 +0100
Subject: uapi/if_ether.h: move __UAPI_DEF_ETHHDR libc define

This fixes a compile problem of some user space applications by not
including linux/libc-compat.h in uapi/if_ether.h.

linux/libc-compat.h checks which "features" the header files included
from the libc provide, so that the Linux kernel uapi header files define
only structures and enums that do not conflict with them. If a user
application mixes kernel headers and libc headers, linux/libc-compat.h
may get included too early, before all the other libc headers have been
included. In that case linux/libc-compat.h cannot prevent all the
redefinitions and we run into compile problems.
This patch removes the include of linux/libc-compat.h from
uapi/if_ether.h to fix the recently introduced case; it does not fix all
such cases, as that is more or less impossible.

It is no problem to do the check directly in the if_ether.h file and not
in libc-compat.h, as this does not need any fancy glibc header detection:
glibc has never provided struct ethhdr, and it should define
__UAPI_DEF_ETHHDR itself if it ever starts providing it.

The following test program did not compile correctly any more:

#include <linux/if_ether.h>
#include <netinet/in.h>
#include <linux/in.h>

int main(void)
{
	return 0;
}

Fixes: 6926e041a892 ("uapi/if_ether.h: prevent redefinition of struct ethhdr")
Reported-by: Guillaume Nault <g.nault@alphalink.fr>
Cc: <stable@vger.kernel.org> # 4.15
Signed-off-by: Hauke Mehrtens <hauke@hauke-m.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_ether.h    | 6 +++++-
 include/uapi/linux/libc-compat.h | 6 ------
 2 files changed, 5 insertions(+), 7 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h
index f8cb5760ea4f..8bbbcb5cd94b 100644
--- a/include/uapi/linux/if_ether.h
+++ b/include/uapi/linux/if_ether.h
@@ -23,7 +23,6 @@
 #define _UAPI_LINUX_IF_ETHER_H
 
 #include <linux/types.h>
-#include <linux/libc-compat.h>
 
 /*
  *	IEEE 802.3 Ethernet magic constants.  The frame sizes omit the preamble
@@ -151,6 +150,11 @@
  *	This is an Ethernet frame header.
  */
 
+/* allow libcs like musl to deactivate this, glibc does not implement this. */
+#ifndef __UAPI_DEF_ETHHDR
+#define __UAPI_DEF_ETHHDR		1
+#endif
+
 #if __UAPI_DEF_ETHHDR
 struct ethhdr {
 	unsigned char	h_dest[ETH_ALEN];	/* destination eth addr	*/
diff --git a/include/uapi/linux/libc-compat.h b/include/uapi/linux/libc-compat.h
index fc29efaa918c..8254c937c9f4 100644
--- a/include/uapi/linux/libc-compat.h
+++ b/include/uapi/linux/libc-compat.h
@@ -264,10 +264,4 @@
 
 #endif /* __GLIBC__ */
 
-/* Definitions for if_ether.h */
-/* allow libcs like musl to deactivate this, glibc does not implement this. */
-#ifndef __UAPI_DEF_ETHHDR
-#define __UAPI_DEF_ETHHDR		1
-#endif
-
 #endif /* _UAPI_LIBC_COMPAT_H */
-- 
cgit v1.2.3


From 2a040f9f39d3b020c79e08dec26d12a7ce131c10 Mon Sep 17 00:00:00 2001
From: Tycho Andersen <tycho@tycho.ws>
Date: Tue, 20 Feb 2018 19:47:45 -0700
Subject: seccomp, ptrace: switch get_metadata types to arch independent

Commit 26500475ac1b ("ptrace, seccomp: add support for retrieving seccomp
metadata") introduced `struct seccomp_metadata`, which contained unsigned
longs that should be arch independent. The type of the flags member was
chosen to match the corresponding argument to seccomp(), and so we need
something at least as big as unsigned long. My understanding is that __u64
should fit the bill, so let's switch both types to that.

While this is userspace facing, it was only introduced in 4.16-rc2, and so
should be safe assuming it goes in before then.

Reported-by: "Dmitry V. Levin" <ldv@altlinux.org>
Signed-off-by: Tycho Andersen <tycho@tycho.ws>
CC: Kees Cook <keescook@chromium.org>
CC: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: "Dmitry V. Levin" <ldv@altlinux.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 include/uapi/linux/ptrace.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/ptrace.h b/include/uapi/linux/ptrace.h
index e46d82b91166..d5a1b8a492b9 100644
--- a/include/uapi/linux/ptrace.h
+++ b/include/uapi/linux/ptrace.h
@@ -69,8 +69,8 @@ struct ptrace_peeksiginfo_args {
 #define PTRACE_SECCOMP_GET_METADATA	0x420d
 
 struct seccomp_metadata {
-	unsigned long filter_off;	/* Input: which filter */
-	unsigned int flags;		/* Output: filter's flags */
+	__u64 filter_off;	/* Input: which filter */
+	__u64 flags;		/* Output: filter's flags */
 };
 
 /* Read signals from a shared (process wide) queue */
-- 
cgit v1.2.3


From 9c171cdf22d1486da1608abd7612fabe2a8262ca Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab@s-opensource.com>
Date: Fri, 9 Feb 2018 05:51:19 -0500
Subject: media: dvb: add continuity error indicators for memory mapped buffers

While userspace can detect discontinuity errors, it is useful to
also let kernelspace report discontinuities, as this helps to identify
whether the data loss happened on the kernel side or the userspace side.

Update documentation accordingly.

Signed-off-by: Mauro Carvalho Chehab <mchehab@s-opensource.com>
---
 Documentation/media/dmx.h.rst.exceptions  | 14 +++++++++----
 Documentation/media/uapi/dvb/dmx-qbuf.rst |  7 ++++---
 include/uapi/linux/dvb/dmx.h              | 35 +++++++++++++++++++++++++++++++
 3 files changed, 49 insertions(+), 7 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/media/dmx.h.rst.exceptions b/Documentation/media/dmx.h.rst.exceptions
index 63f55a9ae2b1..a8c4239ed95b 100644
--- a/Documentation/media/dmx.h.rst.exceptions
+++ b/Documentation/media/dmx.h.rst.exceptions
@@ -50,9 +50,15 @@ replace typedef dmx_filter_t :c:type:`dmx_filter`
 replace typedef dmx_pes_type_t :c:type:`dmx_pes_type`
 replace typedef dmx_input_t :c:type:`dmx_input`
 
-ignore symbol DMX_OUT_DECODER
-ignore symbol DMX_OUT_TAP
-ignore symbol DMX_OUT_TS_TAP
-ignore symbol DMX_OUT_TSDEMUX_TAP
+replace symbol DMX_BUFFER_FLAG_HAD_CRC32_DISCARD :c:type:`dmx_buffer_flags`
+replace	symbol DMX_BUFFER_FLAG_TEI :c:type:`dmx_buffer_flags`
+replace	symbol DMX_BUFFER_PKT_COUNTER_MISMATCH :c:type:`dmx_buffer_flags`
+replace	symbol DMX_BUFFER_FLAG_DISCONTINUITY_DETECTED :c:type:`dmx_buffer_flags`
+replace	symbol DMX_BUFFER_FLAG_DISCONTINUITY_INDICATOR :c:type:`dmx_buffer_flags`
+
+replace symbol DMX_OUT_DECODER :c:type:`dmx_output`
+replace symbol DMX_OUT_TAP :c:type:`dmx_output`
+replace symbol DMX_OUT_TS_TAP :c:type:`dmx_output`
+replace symbol DMX_OUT_TSDEMUX_TAP :c:type:`dmx_output`
 
 replace ioctl DMX_DQBUF dmx_qbuf
diff --git a/Documentation/media/uapi/dvb/dmx-qbuf.rst b/Documentation/media/uapi/dvb/dmx-qbuf.rst
index b48c4931658e..be5a4c6f1904 100644
--- a/Documentation/media/uapi/dvb/dmx-qbuf.rst
+++ b/Documentation/media/uapi/dvb/dmx-qbuf.rst
@@ -51,9 +51,10 @@ out to disk. Buffers remain locked until dequeued, until the
 the device is closed.
 
 Applications call the ``DMX_DQBUF`` ioctl to dequeue a filled
-(capturing) buffer from the driver's outgoing queue. They just set the ``reserved`` field array to zero. When ``DMX_DQBUF`` is called with a
-pointer to this structure, the driver fills the remaining fields or
-returns an error code.
+(capturing) buffer from the driver's outgoing queue.
+They just set the ``index`` field with the buffer ID to be queued.
+When ``DMX_DQBUF`` is called with a pointer to struct :c:type:`dmx_buffer`,
+the driver fills the remaining fields or returns an error code.
 
 By default ``DMX_DQBUF`` blocks when no buffer is in the outgoing
 queue. When the ``O_NONBLOCK`` flag was given to the
diff --git a/include/uapi/linux/dvb/dmx.h b/include/uapi/linux/dvb/dmx.h
index 5f3c5a918f00..b4112f0b6dd3 100644
--- a/include/uapi/linux/dvb/dmx.h
+++ b/include/uapi/linux/dvb/dmx.h
@@ -211,6 +211,32 @@ struct dmx_stc {
 	__u64 stc;
 };
 
+/**
+ * enum dmx_buffer_flags - DMX memory-mapped buffer flags
+ *
+ * @DMX_BUFFER_FLAG_HAD_CRC32_DISCARD:
+ *	Indicates that the Kernel discarded one or more frames due to wrong
+ *	CRC32 checksum.
+ * @DMX_BUFFER_FLAG_TEI:
+ *	Indicates that the Kernel has detected a Transport Error indicator
+ *	(TEI) on a filtered pid.
+ * @DMX_BUFFER_PKT_COUNTER_MISMATCH:
+ *	Indicates that the Kernel has detected a packet counter mismatch
+ *	on a filtered pid.
+ * @DMX_BUFFER_FLAG_DISCONTINUITY_DETECTED:
+ *	Indicates that the Kernel has detected at least one frame discontinuity.
+ * @DMX_BUFFER_FLAG_DISCONTINUITY_INDICATOR:
+ *	Received at least one packet with a frame discontinuity indicator.
+ */
+
+enum dmx_buffer_flags {
+	DMX_BUFFER_FLAG_HAD_CRC32_DISCARD		= 1 << 0,
+	DMX_BUFFER_FLAG_TEI				= 1 << 1,
+	DMX_BUFFER_PKT_COUNTER_MISMATCH			= 1 << 2,
+	DMX_BUFFER_FLAG_DISCONTINUITY_DETECTED		= 1 << 3,
+	DMX_BUFFER_FLAG_DISCONTINUITY_INDICATOR		= 1 << 4,
+};
+
 /**
  * struct dmx_buffer - dmx buffer info
  *
@@ -220,15 +246,24 @@ struct dmx_stc {
  *		offset from the start of the device memory for this plane,
  *		(or a "cookie" that should be passed to mmap() as offset)
  * @length:	size in bytes of the buffer
+ * @flags:	bit array of buffer flags as defined by &enum dmx_buffer_flags.
+ *		Filled only at &DMX_DQBUF.
+ * @count:	monotonic counter for filled buffers. Helps to identify
+ *		data stream losses. Filled only at &DMX_DQBUF.
  *
  * Contains data exchanged by application and driver using one of the streaming
  * I/O methods.
+ *
+ * Please notice that, for &DMX_QBUF, only @index should be filled.
+ * On &DMX_DQBUF calls, all fields will be filled by the Kernel.
  */
 struct dmx_buffer {
 	__u32			index;
 	__u32			bytesused;
 	__u32			offset;
 	__u32			length;
+	__u32			flags;
+	__u32			count;
 };
 
 /**
-- 
cgit v1.2.3


From 45d0be876308bf2f858559e84455219eadd9ddc7 Mon Sep 17 00:00:00 2001
From: Brijesh Singh <brijesh.singh@amd.com>
Date: Mon, 15 Jan 2018 07:32:04 -0600
Subject: include: psp-sev: Capitalize invalid length enum
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Commit 1d57b17c60ff ("crypto: ccp: Define SEV userspace ioctl and command
id") added the invalid length enum but we missed capitalizing it.

Fixes: 1d57b17c60ff (crypto: ccp: Define SEV userspace ioctl ...)
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
CC: Gary R Hook <gary.hook@amd.com>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/uapi/linux/psp-sev.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/psp-sev.h b/include/uapi/linux/psp-sev.h
index 3d77fe91239a..9008f31c7eb6 100644
--- a/include/uapi/linux/psp-sev.h
+++ b/include/uapi/linux/psp-sev.h
@@ -42,7 +42,7 @@ typedef enum {
 	SEV_RET_INVALID_PLATFORM_STATE,
 	SEV_RET_INVALID_GUEST_STATE,
 	SEV_RET_INAVLID_CONFIG,
-	SEV_RET_INVALID_len,
+	SEV_RET_INVALID_LEN,
 	SEV_RET_ALREADY_OWNED,
 	SEV_RET_INVALID_CERTIFICATE,
 	SEV_RET_POLICY_FAILURE,
-- 
cgit v1.2.3


From 9c72258870a95671aa301e21ea6639d1d3ec4111 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Fri, 26 Jan 2018 16:58:06 -0800
Subject: blktrace_api.h: fix comment for struct blk_user_trace_setup

'struct blk_user_trace_setup' is passed to BLKTRACESETUP, not
BLKTRACESTART.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/blktrace_api.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/blktrace_api.h b/include/uapi/linux/blktrace_api.h
index 20d1490d6377..3c50e07ee833 100644
--- a/include/uapi/linux/blktrace_api.h
+++ b/include/uapi/linux/blktrace_api.h
@@ -131,7 +131,7 @@ enum {
 #define BLKTRACE_BDEV_SIZE	32
 
 /*
- * User setup structure passed with BLKTRACESTART
+ * User setup structure passed with BLKTRACESETUP
  */
 struct blk_user_trace_setup {
 	char name[BLKTRACE_BDEV_SIZE];	/* output */
-- 
cgit v1.2.3


From 801e459a6f3a63af9d447e6249088c76ae16efc4 Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Wed, 21 Feb 2018 13:39:51 -0600
Subject: KVM: x86: Add a framework for supporting MSR-based features
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Provide a new KVM capability that allows bits within MSRs to be recognized
as features.  Two new ioctls are added to the /dev/kvm ioctl routine to
retrieve the list of these MSRs and then retrieve their values. A kvm_x86_ops
callback is used to determine support for the listed MSR-based features.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
[Tweaked documentation. - Radim]
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 Documentation/virtual/kvm/api.txt | 40 ++++++++++++++-------
 arch/x86/include/asm/kvm_host.h   |  2 ++
 arch/x86/kvm/svm.c                |  6 ++++
 arch/x86/kvm/vmx.c                |  6 ++++
 arch/x86/kvm/x86.c                | 75 ++++++++++++++++++++++++++++++++++++---
 include/uapi/linux/kvm.h          |  2 ++
 6 files changed, 114 insertions(+), 17 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 792fa8717d13..d6b3ff51a14f 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -123,14 +123,15 @@ memory layout to fit in user mode), check KVM_CAP_MIPS_VZ and use the
 flag KVM_VM_MIPS_VZ.
 
 
-4.3 KVM_GET_MSR_INDEX_LIST
+4.3 KVM_GET_MSR_INDEX_LIST, KVM_GET_MSR_FEATURE_INDEX_LIST
 
-Capability: basic
+Capability: basic, KVM_CAP_GET_MSR_FEATURES for KVM_GET_MSR_FEATURE_INDEX_LIST
 Architectures: x86
-Type: system
+Type: system ioctl
 Parameters: struct kvm_msr_list (in/out)
 Returns: 0 on success; -1 on error
 Errors:
+  EFAULT:    the msr index list cannot be read from or written to
   E2BIG:     the msr index list is to be to fit in the array specified by
              the user.
 
@@ -139,16 +140,23 @@ struct kvm_msr_list {
 	__u32 indices[0];
 };
 
-This ioctl returns the guest msrs that are supported.  The list varies
-by kvm version and host processor, but does not change otherwise.  The
-user fills in the size of the indices array in nmsrs, and in return
-kvm adjusts nmsrs to reflect the actual number of msrs and fills in
-the indices array with their numbers.
+The user fills in the size of the indices array in nmsrs, and in return
+kvm adjusts nmsrs to reflect the actual number of msrs and fills in the
+indices array with their numbers.
+
+KVM_GET_MSR_INDEX_LIST returns the guest msrs that are supported.  The list
+varies by kvm version and host processor, but does not change otherwise.
 
 Note: if kvm indicates supports MCE (KVM_CAP_MCE), then the MCE bank MSRs are
 not returned in the MSR list, as different vcpus can have a different number
 of banks, as set via the KVM_X86_SETUP_MCE ioctl.
 
+KVM_GET_MSR_FEATURE_INDEX_LIST returns the list of MSRs that can be passed
+to the KVM_GET_MSRS system ioctl.  This lets userspace probe host capabilities
+and processor features that are exposed via MSRs (e.g., VMX capabilities).
+This list also varies by kvm version and host processor, but does not change
+otherwise.
+
 
 4.4 KVM_CHECK_EXTENSION
 
@@ -475,14 +483,22 @@ Support for this has been removed.  Use KVM_SET_GUEST_DEBUG instead.
 
 4.18 KVM_GET_MSRS
 
-Capability: basic
+Capability: basic (vcpu), KVM_CAP_GET_MSR_FEATURES (system)
 Architectures: x86
-Type: vcpu ioctl
+Type: system ioctl, vcpu ioctl
 Parameters: struct kvm_msrs (in/out)
-Returns: 0 on success, -1 on error
+Returns: number of msrs successfully returned;
+        -1 on error
+
+When used as a system ioctl:
+Reads the values of MSR-based features that are available for the VM.  This
+is similar to KVM_GET_SUPPORTED_CPUID, but it returns MSR indices and values.
+The list of msr-based features can be obtained using KVM_GET_MSR_FEATURE_INDEX_LIST
+in a system ioctl.
 
+When used as a vcpu ioctl:
 Reads model-specific registers from the vcpu.  Supported msr indices can
-be obtained using KVM_GET_MSR_INDEX_LIST.
+be obtained using KVM_GET_MSR_INDEX_LIST in a system ioctl.
 
 struct kvm_msrs {
 	__u32 nmsrs; /* number of msrs in entries */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 0a9e330b34f0..bab0694b35c3 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1095,6 +1095,8 @@ struct kvm_x86_ops {
 	int (*mem_enc_op)(struct kvm *kvm, void __user *argp);
 	int (*mem_enc_reg_region)(struct kvm *kvm, struct kvm_enc_region *argp);
 	int (*mem_enc_unreg_region)(struct kvm *kvm, struct kvm_enc_region *argp);
+
+	int (*get_msr_feature)(struct kvm_msr_entry *entry);
 };
 
 struct kvm_arch_async_pf {
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 3d8377f75eda..d8db947acf70 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3869,6 +3869,11 @@ static int cr8_write_interception(struct vcpu_svm *svm)
 	return 0;
 }
 
+static int svm_get_msr_feature(struct kvm_msr_entry *msr)
+{
+	return 1;
+}
+
 static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
@@ -6832,6 +6837,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
 	.vcpu_unblocking = svm_vcpu_unblocking,
 
 	.update_bp_intercept = update_bp_intercept,
+	.get_msr_feature = svm_get_msr_feature,
 	.get_msr = svm_get_msr,
 	.set_msr = svm_set_msr,
 	.get_segment_base = svm_get_segment_base,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ec14f2319a87..fafc1f6d8987 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3226,6 +3226,11 @@ static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu,
 	return !(val & ~valid_bits);
 }
 
+static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
+{
+	return 1;
+}
+
 /*
  * Reads an msr value (of 'msr_index') into 'pdata'.
  * Returns 0 on success, non-0 otherwise.
@@ -12296,6 +12301,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
 	.vcpu_put = vmx_vcpu_put,
 
 	.update_bp_intercept = update_exception_bitmap,
+	.get_msr_feature = vmx_get_msr_feature,
 	.get_msr = vmx_get_msr,
 	.set_msr = vmx_set_msr,
 	.get_segment_base = vmx_get_segment_base,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 96edda878dbf..239fc1fd7845 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1049,6 +1049,28 @@ static u32 emulated_msrs[] = {
 
 static unsigned num_emulated_msrs;
 
+/*
+ * List of msr numbers which are used to expose MSR-based features that
+ * can be used by a hypervisor to validate requested CPU features.
+ */
+static u32 msr_based_features[] = {
+};
+
+static unsigned int num_msr_based_features;
+
+static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
+{
+	struct kvm_msr_entry msr;
+
+	msr.index = index;
+	if (kvm_x86_ops->get_msr_feature(&msr))
+		return 1;
+
+	*data = msr.data;
+
+	return 0;
+}
+
 bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
 {
 	if (efer & efer_reserved_bits)
@@ -2680,13 +2702,11 @@ static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
 		    int (*do_msr)(struct kvm_vcpu *vcpu,
 				  unsigned index, u64 *data))
 {
-	int i, idx;
+	int i;
 
-	idx = srcu_read_lock(&vcpu->kvm->srcu);
 	for (i = 0; i < msrs->nmsrs; ++i)
 		if (do_msr(vcpu, entries[i].index, &entries[i].data))
 			break;
-	srcu_read_unlock(&vcpu->kvm->srcu, idx);
 
 	return i;
 }
@@ -2785,6 +2805,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_SET_BOOT_CPU_ID:
  	case KVM_CAP_SPLIT_IRQCHIP:
 	case KVM_CAP_IMMEDIATE_EXIT:
+	case KVM_CAP_GET_MSR_FEATURES:
 		r = 1;
 		break;
 	case KVM_CAP_ADJUST_CLOCK:
@@ -2899,6 +2920,31 @@ long kvm_arch_dev_ioctl(struct file *filp,
 			goto out;
 		r = 0;
 		break;
+	case KVM_GET_MSR_FEATURE_INDEX_LIST: {
+		struct kvm_msr_list __user *user_msr_list = argp;
+		struct kvm_msr_list msr_list;
+		unsigned int n;
+
+		r = -EFAULT;
+		if (copy_from_user(&msr_list, user_msr_list, sizeof(msr_list)))
+			goto out;
+		n = msr_list.nmsrs;
+		msr_list.nmsrs = num_msr_based_features;
+		if (copy_to_user(user_msr_list, &msr_list, sizeof(msr_list)))
+			goto out;
+		r = -E2BIG;
+		if (n < msr_list.nmsrs)
+			goto out;
+		r = -EFAULT;
+		if (copy_to_user(user_msr_list->indices, &msr_based_features,
+				 num_msr_based_features * sizeof(u32)))
+			goto out;
+		r = 0;
+		break;
+	}
+	case KVM_GET_MSRS:
+		r = msr_io(NULL, argp, do_get_msr_feature, 1);
+		break;
 	}
 	default:
 		r = -EINVAL;
@@ -3636,12 +3682,18 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		r = 0;
 		break;
 	}
-	case KVM_GET_MSRS:
+	case KVM_GET_MSRS: {
+		int idx = srcu_read_lock(&vcpu->kvm->srcu);
 		r = msr_io(vcpu, argp, do_get_msr, 1);
+		srcu_read_unlock(&vcpu->kvm->srcu, idx);
 		break;
-	case KVM_SET_MSRS:
+	}
+	case KVM_SET_MSRS: {
+		int idx = srcu_read_lock(&vcpu->kvm->srcu);
 		r = msr_io(vcpu, argp, do_set_msr, 0);
+		srcu_read_unlock(&vcpu->kvm->srcu, idx);
 		break;
+	}
 	case KVM_TPR_ACCESS_REPORTING: {
 		struct kvm_tpr_access_ctl tac;
 
@@ -4464,6 +4516,19 @@ static void kvm_init_msr_list(void)
 		j++;
 	}
 	num_emulated_msrs = j;
+
+	for (i = j = 0; i < ARRAY_SIZE(msr_based_features); i++) {
+		struct kvm_msr_entry msr;
+
+		msr.index = msr_based_features[i];
+		if (kvm_x86_ops->get_msr_feature(&msr))
+			continue;
+
+		if (j < i)
+			msr_based_features[j] = msr_based_features[i];
+		j++;
+	}
+	num_msr_based_features = j;
 }
 
 static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 0fb5ef939732..7b26d4b0b052 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -761,6 +761,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_TRACE_PAUSE           __KVM_DEPRECATED_MAIN_0x07
 #define KVM_TRACE_DISABLE         __KVM_DEPRECATED_MAIN_0x08
 #define KVM_GET_EMULATED_CPUID	  _IOWR(KVMIO, 0x09, struct kvm_cpuid2)
+#define KVM_GET_MSR_FEATURE_INDEX_LIST    _IOWR(KVMIO, 0x0a, struct kvm_msr_list)
 
 /*
  * Extension capability list.
@@ -934,6 +935,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_S390_AIS_MIGRATION 150
 #define KVM_CAP_PPC_GET_CPU_CHAR 151
 #define KVM_CAP_S390_BPB 152
+#define KVM_CAP_GET_MSR_FEATURES 153
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
-- 
cgit v1.2.3