From 1ab92da32e37758c0e2e2a455f06d5f40609f14e Mon Sep 17 00:00:00 2001 From: Okash Khawaja Date: Mon, 15 May 2017 18:45:33 +0100 Subject: staging: speakup: add tty-based comms functions This adds the spk_ttyio.c file. It contains a set of functions that implement the spk_synth struct methods related to sending bytes out over serial comms, but these implementations use the TTY subsystem instead of direct serial access. Currently, synths access serial ports directly, poking standard ISA ports and trying to steal them from the serial driver. Some ISA cards actually need this, but most other synthesizers don't and can work through the proper TTY subsystem via a new N_SPEAKUP line discipline. So this adds the methods drivers need to switch to accessing serial ports through the TTY subsystem, whenever appropriate. Signed-off-by: Okash Khawaja Reviewed-by: Samuel Thibault Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/tty.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/tty.h b/include/uapi/linux/tty.h index 01c4410352ff..e7855dffd592 100644 --- a/include/uapi/linux/tty.h +++ b/include/uapi/linux/tty.h @@ -35,5 +35,6 @@ #define N_TRACESINK 23 /* Trace data routing for MIPI P1149.7 */ #define N_TRACEROUTER 24 /* Trace data routing for MIPI P1149.7 */ #define N_NCI 25 /* NFC NCI UART */ +#define N_SPEAKUP 26 /* Speakup communication with synths */ #endif /* _UAPI_LINUX_TTY_H */ -- cgit v1.2.3 From 5bc1701881e395cec51811d07ec6961f3d1b2612 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 17 May 2017 11:08:01 +0200 Subject: net: sched: introduce multichain support for filters Instead of having only one filter chain per block, introduce a list of chains for every block, and create chain 0 by default. The UAPI is extended so the user can specify which chain to change; if the new attribute is not specified, chain 0 is used, which maintains backward compatibility. If the user wants to manipulate a chain that does not exist, a new chain is created with the specified index. Also, when the last filter is removed from a chain, the chain is destroyed. Signed-off-by: Jiri Pirko Acked-by: Jamal Hadi Salim Signed-off-by: David S. 
Miller --- include/net/pkt_cls.h | 2 + include/net/sch_generic.h | 9 +++- include/uapi/linux/rtnetlink.h | 1 + net/sched/cls_api.c | 104 ++++++++++++++++++++++++++++++++++------- 4 files changed, 98 insertions(+), 18 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index e56e7157c280..2c213a69c196 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -18,6 +18,8 @@ int register_tcf_proto_ops(struct tcf_proto_ops *ops); int unregister_tcf_proto_ops(struct tcf_proto_ops *ops); #ifdef CONFIG_NET_CLS +struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index); +void tcf_chain_put(struct tcf_chain *chain); int tcf_block_get(struct tcf_block **p_block, struct tcf_proto __rcu **p_filter_chain); void tcf_block_put(struct tcf_block *block); diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 52bceede534b..569b5654c30c 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -236,7 +237,7 @@ struct tcf_proto { struct Qdisc *q; void *data; const struct tcf_proto_ops *ops; - struct tcf_block *block; + struct tcf_chain *chain; struct rcu_head rcu; }; @@ -251,10 +252,14 @@ struct qdisc_skb_cb { struct tcf_chain { struct tcf_proto __rcu *filter_chain; struct tcf_proto __rcu **p_filter_chain; + struct list_head list; + struct tcf_block *block; + u32 index; /* chain index */ + unsigned int refcnt; }; struct tcf_block { - struct tcf_chain *chain; + struct list_head chain_list; }; static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz) diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index cce061382e40..6487b21b2c1e 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -549,6 +549,7 @@ enum { TCA_STAB, TCA_PAD, TCA_DUMP_INVISIBLE, + TCA_CHAIN, __TCA_MAX }; diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 63aa2ea5f00c..adacaf299c4a 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -129,7 +129,7 @@ static inline u32 tcf_auto_prio(struct tcf_proto *tp) static struct tcf_proto *tcf_proto_create(const char *kind, u32 protocol, u32 prio, u32 parent, struct Qdisc *q, - struct tcf_block *block) + struct tcf_chain *chain) { struct tcf_proto *tp; int err; @@ -165,7 +165,7 @@ static struct tcf_proto *tcf_proto_create(const char *kind, u32 protocol, tp->prio = prio; tp->classid = parent; tp->q = q; - tp->block = block; + tp->chain = chain; err = tp->ops->init(tp); if (err) { @@ -186,15 +186,26 @@ static void tcf_proto_destroy(struct tcf_proto *tp) kfree_rcu(tp, rcu); } -static struct tcf_chain *tcf_chain_create(void) +static struct tcf_chain *tcf_chain_create(struct tcf_block *block, + u32 chain_index) { - return kzalloc(sizeof(struct tcf_chain), GFP_KERNEL); + struct tcf_chain *chain; + + chain = kzalloc(sizeof(*chain), GFP_KERNEL); + if (!chain) + return NULL; + list_add_tail(&chain->list, &block->chain_list); + chain->block = block; + chain->index = chain_index; + chain->refcnt = 1; + return chain; } static void tcf_chain_destroy(struct tcf_chain *chain) { struct tcf_proto *tp; + list_del(&chain->list); while ((tp = rtnl_dereference(chain->filter_chain)) != NULL) { RCU_INIT_POINTER(chain->filter_chain, tp->next); tcf_proto_destroy(tp); @@ -202,6 +213,30 @@ static void tcf_chain_destroy(struct tcf_chain *chain) kfree(chain); } +struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index) +{ + struct tcf_chain 
*chain; + + list_for_each_entry(chain, &block->chain_list, list) { + if (chain->index == chain_index) { + chain->refcnt++; + return chain; + } + } + return tcf_chain_create(block, chain_index); +} +EXPORT_SYMBOL(tcf_chain_get); + +void tcf_chain_put(struct tcf_chain *chain) +{ + /* Destroy unused chain, with exception of chain 0, which is the + * default one and has to be always present. + */ + if (--chain->refcnt == 0 && !chain->filter_chain && chain->index != 0) + tcf_chain_destroy(chain); +} +EXPORT_SYMBOL(tcf_chain_put); + static void tcf_chain_filter_chain_ptr_set(struct tcf_chain *chain, struct tcf_proto __rcu **p_filter_chain) @@ -213,16 +248,19 @@ int tcf_block_get(struct tcf_block **p_block, struct tcf_proto __rcu **p_filter_chain) { struct tcf_block *block = kzalloc(sizeof(*block), GFP_KERNEL); + struct tcf_chain *chain; int err; if (!block) return -ENOMEM; - block->chain = tcf_chain_create(); - if (!block->chain) { + INIT_LIST_HEAD(&block->chain_list); + /* Create chain 0 by default, it has to be always present. */ + chain = tcf_chain_create(block, 0); + if (!chain) { err = -ENOMEM; goto err_chain_create; } - tcf_chain_filter_chain_ptr_set(block->chain, p_filter_chain); + tcf_chain_filter_chain_ptr_set(chain, p_filter_chain); *p_block = block; return 0; @@ -234,9 +272,13 @@ EXPORT_SYMBOL(tcf_block_get); void tcf_block_put(struct tcf_block *block) { + struct tcf_chain *chain, *tmp; + if (!block) return; - tcf_chain_destroy(block->chain); + + list_for_each_entry_safe(chain, tmp, &block->chain_list, list) + tcf_chain_destroy(chain); kfree(block); } EXPORT_SYMBOL(tcf_block_put); @@ -360,10 +402,11 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n, u32 prio; bool prio_allocate; u32 parent; + u32 chain_index; struct net_device *dev; struct Qdisc *q; struct tcf_chain_info chain_info; - struct tcf_chain *chain; + struct tcf_chain *chain = NULL; struct tcf_block *block; struct tcf_proto *tp; const struct Qdisc_class_ops *cops; @@ -449,7 +492,17 @@ replay: err = -EINVAL; goto errout; } - chain = block->chain; + + chain_index = tca[TCA_CHAIN] ? 
nla_get_u32(tca[TCA_CHAIN]) : 0; + if (chain_index > TC_ACT_EXT_VAL_MASK) { + err = -EINVAL; + goto errout; + } + chain = tcf_chain_get(block, chain_index); + if (!chain) { + err = -ENOMEM; + goto errout; + } if (n->nlmsg_type == RTM_DELTFILTER && prio == 0) { tfilter_notify_chain(net, skb, n, chain, RTM_DELTFILTER); @@ -483,7 +536,7 @@ replay: prio = tcf_auto_prio(tcf_chain_tp_prev(&chain_info)); tp = tcf_proto_create(nla_data(tca[TCA_KIND]), - protocol, prio, parent, q, block); + protocol, prio, parent, q, chain); if (IS_ERR(tp)) { err = PTR_ERR(tp); goto errout; @@ -556,6 +609,8 @@ replay: } errout: + if (chain) + tcf_chain_put(chain); if (cl) cops->put(q, cl); if (err == -EAGAIN) @@ -584,6 +639,8 @@ static int tcf_fill_node(struct net *net, struct sk_buff *skb, tcm->tcm_info = TC_H_MAKE(tp->prio, tp->protocol); if (nla_put_string(skb, TCA_KIND, tp->ops->kind)) goto nla_put_failure; + if (nla_put_u32(skb, TCA_CHAIN, tp->chain->index)) + goto nla_put_failure; tcm->tcm_handle = fh; if (RTM_DELTFILTER != event) { tcm->tcm_handle = 0; @@ -640,7 +697,7 @@ static int tcf_node_dump(struct tcf_proto *tp, unsigned long n, RTM_NEWTFILTER); } -static void tcf_chain_dump(struct tcf_chain *chain, struct sk_buff *skb, +static bool tcf_chain_dump(struct tcf_chain *chain, struct sk_buff *skb, struct netlink_callback *cb, long index_start, long *p_index) { @@ -667,7 +724,7 @@ static void tcf_chain_dump(struct tcf_chain *chain, struct sk_buff *skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTFILTER) <= 0) - break; + return false; cb->args[1] = 1; } @@ -682,14 +739,16 @@ static void tcf_chain_dump(struct tcf_chain *chain, struct sk_buff *skb, tp->ops->walk(tp, &arg.w); cb->args[1] = arg.w.count + 1; if (arg.w.stop) - break; + return false; } + return true; } /* called with RTNL */ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); + struct nlattr *tca[TCA_MAX + 1]; struct net_device *dev; struct Qdisc *q; struct tcf_block *block; @@ -699,9 +758,15 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) const struct Qdisc_class_ops *cops; long index_start; long index; + int err; if (nlmsg_len(cb->nlh) < sizeof(*tcm)) return skb->len; + + err = nlmsg_parse(cb->nlh, sizeof(*tcm), tca, TCA_MAX, NULL, NULL); + if (err) + return err; + dev = __dev_get_by_index(net, tcm->tcm_ifindex); if (!dev) return skb->len; @@ -725,11 +790,18 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) block = cops->tcf_block(q, cl); if (!block) goto errout; - chain = block->chain; index_start = cb->args[0]; index = 0; - tcf_chain_dump(chain, skb, cb, index_start, &index); + + list_for_each_entry(chain, &block->chain_list, list) { + if (tca[TCA_CHAIN] && + nla_get_u32(tca[TCA_CHAIN]) != chain->index) + continue; + if (!tcf_chain_dump(chain, skb, cb, index_start, &index)) + break; + } + cb->args[0] = index; errout: -- cgit v1.2.3 From db50514f9a9c7ef1f17e9921b1cc0902746872f3 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 17 May 2017 11:08:03 +0200 Subject: net: sched: add termination action to allow goto chain Introduce new type of termination action called "goto_chain". This allows user to specify a chain to be processed. This action type is then processed as a return value in tcf_classify loop in similar way as "reclassify" is, only it does not reset to the first filter in chain but rather reset to the first filter of the desired chain. 
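As an aside (not part of this patch), here is a minimal userspace sketch of how a "goto chain" action value is encoded with the UAPI macros involved here; it assumes kernel headers that already contain TC_ACT_GOTO_CHAIN, and the chain index 7 is an arbitrary example:

#include <linux/pkt_cls.h>
#include <stdio.h>

int main(void)
{
	__u32 chain_index = 7;	/* arbitrary example chain */

	/* The opcode sits in the high bits; the target chain index goes in
	 * the low bits covered by TC_ACT_EXT_VAL_MASK.
	 */
	__u32 action = TC_ACT_GOTO_CHAIN | (chain_index & TC_ACT_EXT_VAL_MASK);

	if (TC_ACT_EXT_CMP(action, TC_ACT_GOTO_CHAIN))
		printf("goto chain %u\n", action & TC_ACT_EXT_VAL_MASK);
	return 0;
}

In the kernel, act_api.c decodes the same value: tcf_action_goto_chain_init() masks out the chain index and takes a reference on that chain via tcf_chain_get().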
Signed-off-by: Jiri Pirko Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/net/act_api.h | 1 + include/net/sch_generic.h | 9 +++++++-- include/uapi/linux/pkt_cls.h | 1 + net/sched/act_api.c | 40 ++++++++++++++++++++++++++++++++++++++++ net/sched/cls_api.c | 6 +++++- 5 files changed, 54 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/act_api.h b/include/net/act_api.h index b22c6f3d6710..26ffd8333f50 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -42,6 +42,7 @@ struct tc_action { struct gnet_stats_basic_cpu __percpu *cpu_bstats; struct gnet_stats_queue __percpu *cpu_qstats; struct tc_cookie *act_cookie; + struct tcf_chain *goto_chain; }; #define tcf_head common.tcfa_head #define tcf_index common.tcfa_index diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 569b5654c30c..368850194c94 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -193,8 +193,13 @@ struct Qdisc_ops { struct tcf_result { - unsigned long class; - u32 classid; + union { + struct { + unsigned long class; + u32 classid; + }; + const struct tcf_proto *goto_tp; + }; }; struct tcf_proto_ops { diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index d613be3b3239..1b9aa9e6b4fd 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -51,6 +51,7 @@ enum { (((combined) & (~TC_ACT_EXT_VAL_MASK)) == opcode) #define TC_ACT_JUMP __TC_ACT_EXT(1) +#define TC_ACT_GOTO_CHAIN __TC_ACT_EXT(2) /* Action type identifiers*/ enum { diff --git a/net/sched/act_api.c b/net/sched/act_api.c index e389eb45b484..0ecf2a858767 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -28,6 +28,31 @@ #include #include +static int tcf_action_goto_chain_init(struct tc_action *a, struct tcf_proto *tp) +{ + u32 chain_index = a->tcfa_action & TC_ACT_EXT_VAL_MASK; + + if (!tp) + return -EINVAL; + a->goto_chain = tcf_chain_get(tp->chain->block, chain_index); + if (!a->goto_chain) + return -ENOMEM; + return 0; +} + +static void tcf_action_goto_chain_fini(struct tc_action *a) +{ + tcf_chain_put(a->goto_chain); +} + +static void tcf_action_goto_chain_exec(const struct tc_action *a, + struct tcf_result *res) +{ + const struct tcf_chain *chain = a->goto_chain; + + res->goto_tp = rcu_dereference_bh(chain->filter_chain); +} + static void free_tcf(struct rcu_head *head) { struct tc_action *p = container_of(head, struct tc_action, tcfa_rcu); @@ -39,6 +64,8 @@ static void free_tcf(struct rcu_head *head) kfree(p->act_cookie->data); kfree(p->act_cookie); } + if (p->goto_chain) + tcf_action_goto_chain_fini(p); kfree(p); } @@ -465,6 +492,8 @@ repeat: else /* faulty graph, stop pipeline */ return TC_ACT_OK; } + } else if (TC_ACT_EXT_CMP(ret, TC_ACT_GOTO_CHAIN)) { + tcf_action_goto_chain_exec(a, res); } if (ret != TC_ACT_PIPE) @@ -657,6 +686,17 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, if (err != ACT_P_CREATED) module_put(a_o->owner); + if (TC_ACT_EXT_CMP(a->tcfa_action, TC_ACT_GOTO_CHAIN)) { + err = tcf_action_goto_chain_init(a, tp); + if (err) { + LIST_HEAD(actions); + + list_add_tail(&a->list, &actions); + tcf_action_destroy(&actions, bind); + return ERR_PTR(err); + } + } + return a; err_mod: diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 9e0c4bb82528..4020b8d932a1 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -307,8 +307,12 @@ reclassify: err = tp->classify(skb, tp, res); #ifdef CONFIG_NET_CLS_ACT - if (unlikely(err == TC_ACT_RECLASSIFY && 
!compat_mode)) + if (unlikely(err == TC_ACT_RECLASSIFY && !compat_mode)) { goto reset; + } else if (unlikely(TC_ACT_EXT_CMP(err, TC_ACT_GOTO_CHAIN))) { + old_tp = res->goto_tp; + goto reset; + } #endif if (err >= 0) return err; -- cgit v1.2.3 From e8759ad17d41913dfeb49736ca7fbfbc96f32c54 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Sat, 13 May 2017 09:29:04 +0200 Subject: serial: uapi: Add support for bus termination The Siemens IOT2040 comes with a RS485 interface that allows to enable or disable bus termination via software. Add a bit to the flags field of serial_rs485 that applications can set in order to request this feature from the hardware. This seems generic enough to add it for everyone. Existing driver will simply ignore it when set. Signed-off-by: Sascha Weisenberger Signed-off-by: Jan Kiszka Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/serial.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/serial.h b/include/uapi/linux/serial.h index 5d59c3ebf459..d2667ecd54ac 100644 --- a/include/uapi/linux/serial.h +++ b/include/uapi/linux/serial.h @@ -122,6 +122,9 @@ struct serial_rs485 { #define SER_RS485_RTS_AFTER_SEND (1 << 2) /* Logical level for RTS pin after sent*/ #define SER_RS485_RX_DURING_TX (1 << 4) +#define SER_RS485_TERMINATE_BUS (1 << 5) /* Enable bus + termination + (if supported) */ __u32 delay_rts_before_send; /* Delay before send (milliseconds) */ __u32 delay_rts_after_send; /* Delay after send (milliseconds) */ __u32 padding[5]; /* Memory is cheap, new structs -- cgit v1.2.3 From b8210a9e4bea6354eccc5d8a50ecc21ea7486dc9 Mon Sep 17 00:00:00 2001 From: Miroslav Lichvar Date: Fri, 19 May 2017 17:52:35 +0200 Subject: net: define receive timestamp filter for NTP Add HWTSTAMP_FILTER_NTP_ALL to the hwtstamp_rx_filters enum for timestamping of NTP packets. There is currently only one driver (phyter) that could support it directly. CC: Richard Cochran CC: Willem de Bruijn Signed-off-by: Miroslav Lichvar Signed-off-by: David S. Miller --- include/uapi/linux/net_tstamp.h | 3 +++ net/core/dev_ioctl.c | 2 ++ 2 files changed, 5 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/net_tstamp.h b/include/uapi/linux/net_tstamp.h index 464dcca5ed68..0749fb13e517 100644 --- a/include/uapi/linux/net_tstamp.h +++ b/include/uapi/linux/net_tstamp.h @@ -125,6 +125,9 @@ enum hwtstamp_rx_filters { HWTSTAMP_FILTER_PTP_V2_SYNC, /* PTP v2/802.AS1, any layer, Delay_req packet */ HWTSTAMP_FILTER_PTP_V2_DELAY_REQ, + + /* NTP, UDP, all versions and packet modes */ + HWTSTAMP_FILTER_NTP_ALL, }; #endif /* _NET_TIMESTAMPING_H */ diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index b94b1d293506..8f036a76b92e 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -227,6 +227,8 @@ static int net_hwtstamp_validate(struct ifreq *ifr) case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ: rx_filter_valid = 1; break; + case HWTSTAMP_FILTER_NTP_ALL: + break; } if (!tx_type_valid || !rx_filter_valid) -- cgit v1.2.3 From aad9c8c470f2a8321a99eb053630ce0e199558d6 Mon Sep 17 00:00:00 2001 From: Miroslav Lichvar Date: Fri, 19 May 2017 17:52:38 +0200 Subject: net: add new control message for incoming HW-timestamped packets Add SOF_TIMESTAMPING_OPT_PKTINFO option to request a new control message for incoming packets with hardware timestamps. It contains the index of the real interface which received the packet and the length of the packet at layer 2. 
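As an illustration (not part of this patch), a minimal userspace sketch of requesting and reading the new control message; it assumes a libc and kernel headers that already expose SO_TIMESTAMPING, SOF_TIMESTAMPING_OPT_PKTINFO, SCM_TIMESTAMPING_PKTINFO and struct scm_ts_pktinfo, and that hardware RX timestamping has been enabled on the NIC (e.g. via SIOCSHWTSTAMP):

#include <linux/net_tstamp.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <string.h>
#include <stdio.h>

static void read_rx_pktinfo(int fd)
{
	int flags = SOF_TIMESTAMPING_RX_HARDWARE |
		    SOF_TIMESTAMPING_RAW_HARDWARE |
		    SOF_TIMESTAMPING_OPT_PKTINFO;
	char data[2048], control[512];
	struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = control, .msg_controllen = sizeof(control),
	};
	struct cmsghdr *cmsg;

	if (setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &flags, sizeof(flags)) < 0)
		return;
	if (recvmsg(fd, &msg, 0) < 0)
		return;

	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
		if (cmsg->cmsg_level == SOL_SOCKET &&
		    cmsg->cmsg_type == SCM_TIMESTAMPING_PKTINFO) {
			struct scm_ts_pktinfo info;

			/* if_index: real receiving device; pkt_length: L2 length */
			memcpy(&info, CMSG_DATA(cmsg), sizeof(info));
			printf("ifindex %u, L2 length %u\n",
			       info.if_index, info.pkt_length);
		}
	}
}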
The index is useful with bonding, bridges and other interfaces, where IP_PKTINFO doesn't allow applications to determine which PHC made the timestamp. With the L2 length (and link speed) it is possible to transpose preamble timestamps to trailer timestamps, which are used in the NTP protocol. While this information could be provided by two new socket options independently from timestamping, it doesn't look like they would be very useful. With this option any performance impact is limited to hardware timestamping. Use dev_get_by_napi_id() to get the device and its index. On kernels with disabled CONFIG_NET_RX_BUSY_POLL or drivers not using NAPI, a zero index will be returned in the control message. CC: Richard Cochran Acked-by: Willem de Bruijn Signed-off-by: Miroslav Lichvar Signed-off-by: David S. Miller --- Documentation/networking/timestamping.txt | 10 ++++++++++ include/uapi/asm-generic/socket.h | 2 ++ include/uapi/linux/net_tstamp.h | 11 ++++++++++- net/socket.c | 27 ++++++++++++++++++++++++++- 4 files changed, 48 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/networking/timestamping.txt b/Documentation/networking/timestamping.txt index 96f50694a748..ce11e3a08c0d 100644 --- a/Documentation/networking/timestamping.txt +++ b/Documentation/networking/timestamping.txt @@ -193,6 +193,16 @@ SOF_TIMESTAMPING_OPT_STATS: the transmit timestamps, such as how long a certain block of data was limited by peer's receiver window. +SOF_TIMESTAMPING_OPT_PKTINFO: + + Enable the SCM_TIMESTAMPING_PKTINFO control message for incoming + packets with hardware timestamps. The message contains struct + scm_ts_pktinfo, which supplies the index of the real interface which + received the packet and its length at layer 2. A valid (non-zero) + interface index will be returned only if CONFIG_NET_RX_BUSY_POLL is + enabled and the driver is using NAPI. The struct contains also two + other fields, but they are reserved and undefined. + New applications are encouraged to pass SOF_TIMESTAMPING_OPT_ID to disambiguate timestamps and SOF_TIMESTAMPING_OPT_TSONLY to operate regardless of the setting of sysctl net.core.tstamp_allow_data. 
diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h index 2b488565599d..a5f6e819fafd 100644 --- a/include/uapi/asm-generic/socket.h +++ b/include/uapi/asm-generic/socket.h @@ -100,4 +100,6 @@ #define SO_COOKIE 57 +#define SCM_TIMESTAMPING_PKTINFO 58 + #endif /* __ASM_GENERIC_SOCKET_H */ diff --git a/include/uapi/linux/net_tstamp.h b/include/uapi/linux/net_tstamp.h index 0749fb13e517..dee74d39da94 100644 --- a/include/uapi/linux/net_tstamp.h +++ b/include/uapi/linux/net_tstamp.h @@ -9,6 +9,7 @@ #ifndef _NET_TIMESTAMPING_H #define _NET_TIMESTAMPING_H +#include #include /* for SO_TIMESTAMPING */ /* SO_TIMESTAMPING gets an integer bit field comprised of these values */ @@ -26,8 +27,9 @@ enum { SOF_TIMESTAMPING_OPT_CMSG = (1<<10), SOF_TIMESTAMPING_OPT_TSONLY = (1<<11), SOF_TIMESTAMPING_OPT_STATS = (1<<12), + SOF_TIMESTAMPING_OPT_PKTINFO = (1<<13), - SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_OPT_STATS, + SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_OPT_PKTINFO, SOF_TIMESTAMPING_MASK = (SOF_TIMESTAMPING_LAST - 1) | SOF_TIMESTAMPING_LAST }; @@ -130,4 +132,11 @@ enum hwtstamp_rx_filters { HWTSTAMP_FILTER_NTP_ALL, }; +/* SCM_TIMESTAMPING_PKTINFO control message */ +struct scm_ts_pktinfo { + __u32 if_index; + __u32 pkt_length; + __u32 reserved[2]; +}; + #endif /* _NET_TIMESTAMPING_H */ diff --git a/net/socket.c b/net/socket.c index c2564eb25c6b..67db7d8a3b81 100644 --- a/net/socket.c +++ b/net/socket.c @@ -662,6 +662,27 @@ static bool skb_is_err_queue(const struct sk_buff *skb) return skb->pkt_type == PACKET_OUTGOING; } +static void put_ts_pktinfo(struct msghdr *msg, struct sk_buff *skb) +{ + struct scm_ts_pktinfo ts_pktinfo; + struct net_device *orig_dev; + + if (!skb_mac_header_was_set(skb)) + return; + + memset(&ts_pktinfo, 0, sizeof(ts_pktinfo)); + + rcu_read_lock(); + orig_dev = dev_get_by_napi_id(skb_napi_id(skb)); + if (orig_dev) + ts_pktinfo.if_index = orig_dev->ifindex; + rcu_read_unlock(); + + ts_pktinfo.pkt_length = skb->len - skb_mac_offset(skb); + put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPING_PKTINFO, + sizeof(ts_pktinfo), &ts_pktinfo); +} + /* * called from sock_recv_timestamp() if sock_flag(sk, SOCK_RCVTSTAMP) */ @@ -699,8 +720,12 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, empty = 0; if (shhwtstamps && (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) && - ktime_to_timespec_cond(shhwtstamps->hwtstamp, tss.ts + 2)) + ktime_to_timespec_cond(shhwtstamps->hwtstamp, tss.ts + 2)) { empty = 0; + if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_PKTINFO) && + !skb_is_err_queue(skb)) + put_ts_pktinfo(msg, skb); + } if (!empty) { put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPING, sizeof(tss), &tss); -- cgit v1.2.3 From b50a5c70ffa4fd6b6da324ab54c84adf48fb17d9 Mon Sep 17 00:00:00 2001 From: Miroslav Lichvar Date: Fri, 19 May 2017 17:52:40 +0200 Subject: net: allow simultaneous SW and HW transmit timestamping Add SOF_TIMESTAMPING_OPT_TX_SWHW option to allow an outgoing packet to be looped to the socket's error queue with a software timestamp even when a hardware transmit timestamp is expected to be provided by the driver. Applications using this option will receive two separate messages from the error queue, one with a software timestamp and the other with a hardware timestamp. 
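A minimal sketch (again, not part of the patch) of the socket setup this enables, assuming headers that define SOF_TIMESTAMPING_OPT_TX_SWHW:

#include <linux/net_tstamp.h>
#include <sys/socket.h>

static int request_sw_and_hw_tx_tstamps(int fd)
{
	int flags = SOF_TIMESTAMPING_TX_HARDWARE |	/* generate HW TX stamp */
		    SOF_TIMESTAMPING_TX_SOFTWARE |	/* generate SW TX stamp */
		    SOF_TIMESTAMPING_SOFTWARE |		/* report SW stamps */
		    SOF_TIMESTAMPING_RAW_HARDWARE |	/* report HW stamps */
		    SOF_TIMESTAMPING_OPT_TSONLY |
		    SOF_TIMESTAMPING_OPT_TX_SWHW;	/* loop both, separately */

	/* Each timestamp then arrives in its own SCM_TIMESTAMPING control
	 * message read from the error queue with recvmsg(fd, &msg, MSG_ERRQUEUE).
	 */
	return setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &flags, sizeof(flags));
}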
As the hardware timestamp is saved to the shared skb info, which may happen before the first message with software timestamp is received by the application, the hardware timestamp is copied to the SCM_TIMESTAMPING control message only when the skb has no software timestamp or it is an incoming packet. While changing sw_tx_timestamp(), inline it in skb_tx_timestamp() as there are no other users. CC: Richard Cochran CC: Willem de Bruijn Signed-off-by: Miroslav Lichvar Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- Documentation/networking/timestamping.txt | 8 ++++++++ include/linux/skbuff.h | 10 ++-------- include/uapi/linux/net_tstamp.h | 3 ++- net/core/skbuff.c | 4 ++++ net/socket.c | 20 ++++++++++++++++++-- 5 files changed, 34 insertions(+), 11 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/networking/timestamping.txt b/Documentation/networking/timestamping.txt index 50eb0e554778..196ba17cc344 100644 --- a/Documentation/networking/timestamping.txt +++ b/Documentation/networking/timestamping.txt @@ -203,6 +203,14 @@ SOF_TIMESTAMPING_OPT_PKTINFO: enabled and the driver is using NAPI. The struct contains also two other fields, but they are reserved and undefined. +SOF_TIMESTAMPING_OPT_TX_SWHW: + + Request both hardware and software timestamps for outgoing packets + when SOF_TIMESTAMPING_TX_HARDWARE and SOF_TIMESTAMPING_TX_SOFTWARE + are enabled at the same time. If both timestamps are generated, + two separate messages will be looped to the socket's error queue, + each containing just one timestamp. + New applications are encouraged to pass SOF_TIMESTAMPING_OPT_ID to disambiguate timestamps and SOF_TIMESTAMPING_OPT_TSONLY to operate regardless of the setting of sysctl net.core.tstamp_allow_data. diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 8acce7143f6a..45a59c1e0cc7 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3259,13 +3259,6 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb, void skb_tstamp_tx(struct sk_buff *orig_skb, struct skb_shared_hwtstamps *hwtstamps); -static inline void sw_tx_timestamp(struct sk_buff *skb) -{ - if (skb_shinfo(skb)->tx_flags & SKBTX_SW_TSTAMP && - !(skb_shinfo(skb)->tx_flags & SKBTX_IN_PROGRESS)) - skb_tstamp_tx(skb, NULL); -} - /** * skb_tx_timestamp() - Driver hook for transmit timestamping * @@ -3281,7 +3274,8 @@ static inline void sw_tx_timestamp(struct sk_buff *skb) static inline void skb_tx_timestamp(struct sk_buff *skb) { skb_clone_tx_timestamp(skb); - sw_tx_timestamp(skb); + if (skb_shinfo(skb)->tx_flags & SKBTX_SW_TSTAMP) + skb_tstamp_tx(skb, NULL); } /** diff --git a/include/uapi/linux/net_tstamp.h b/include/uapi/linux/net_tstamp.h index dee74d39da94..3d421d912193 100644 --- a/include/uapi/linux/net_tstamp.h +++ b/include/uapi/linux/net_tstamp.h @@ -28,8 +28,9 @@ enum { SOF_TIMESTAMPING_OPT_TSONLY = (1<<11), SOF_TIMESTAMPING_OPT_STATS = (1<<12), SOF_TIMESTAMPING_OPT_PKTINFO = (1<<13), + SOF_TIMESTAMPING_OPT_TX_SWHW = (1<<14), - SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_OPT_PKTINFO, + SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_OPT_TX_SWHW, SOF_TIMESTAMPING_MASK = (SOF_TIMESTAMPING_LAST - 1) | SOF_TIMESTAMPING_LAST }; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index d5c98117cbce..780b7c1563d0 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3901,6 +3901,10 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb, if (!sk) return; + if (!hwtstamps && !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) && + skb_shinfo(orig_skb)->tx_flags & SKBTX_IN_PROGRESS) + return; + 
tsonly = sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TSONLY; if (!skb_may_tx_timestamp(sk, tsonly)) return; diff --git a/net/socket.c b/net/socket.c index 67db7d8a3b81..cb355a7ef135 100644 --- a/net/socket.c +++ b/net/socket.c @@ -662,6 +662,19 @@ static bool skb_is_err_queue(const struct sk_buff *skb) return skb->pkt_type == PACKET_OUTGOING; } +/* On transmit, software and hardware timestamps are returned independently. + * As the two skb clones share the hardware timestamp, which may be updated + * before the software timestamp is received, a hardware TX timestamp may be + * returned only if there is no software TX timestamp. Ignore false software + * timestamps, which may be made in the __sock_recv_timestamp() call when the + * option SO_TIMESTAMP(NS) is enabled on the socket, even when the skb has a + * hardware timestamp. + */ +static bool skb_is_swtx_tstamp(const struct sk_buff *skb, int false_tstamp) +{ + return skb->tstamp && !false_tstamp && skb_is_err_queue(skb); +} + static void put_ts_pktinfo(struct msghdr *msg, struct sk_buff *skb) { struct scm_ts_pktinfo ts_pktinfo; @@ -691,14 +704,16 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, { int need_software_tstamp = sock_flag(sk, SOCK_RCVTSTAMP); struct scm_timestamping tss; - int empty = 1; + int empty = 1, false_tstamp = 0; struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb); /* Race occurred between timestamp enabling and packet receiving. Fill in the current time for now. */ - if (need_software_tstamp && skb->tstamp == 0) + if (need_software_tstamp && skb->tstamp == 0) { __net_timestamp(skb); + false_tstamp = 1; + } if (need_software_tstamp) { if (!sock_flag(sk, SOCK_RCVTSTAMPNS)) { @@ -720,6 +735,7 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, empty = 0; if (shhwtstamps && (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) && + !skb_is_swtx_tstamp(skb, false_tstamp) && ktime_to_timespec_cond(shhwtstamps->hwtstamp, tss.ts + 2)) { empty = 0; if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_PKTINFO) && -- cgit v1.2.3 From fdfc7dd6ca39b117c709dceee8d32ac4447294d6 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 23 May 2017 18:40:45 +0200 Subject: net/sched: flower: add support for matching on tcp flags Benefit from the support of tcp flags dissection and allow user to insert rules matching on tcp flags. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/uapi/linux/pkt_cls.h | 3 +++ net/sched/cls_flower.c | 13 ++++++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 1b9aa9e6b4fd..c6e8cf5e9c40 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -451,6 +451,9 @@ enum { TCA_FLOWER_KEY_MPLS_TC, /* u8 - 3 bits */ TCA_FLOWER_KEY_MPLS_LABEL, /* be32 - 20 bits */ + TCA_FLOWER_KEY_TCP_FLAGS, /* be16 */ + TCA_FLOWER_KEY_TCP_FLAGS_MASK, /* be16 */ + __TCA_FLOWER_MAX, }; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index ca526c0881bd..fb74a47830f4 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -49,6 +49,7 @@ struct fl_flow_key { }; struct flow_dissector_key_ports enc_tp; struct flow_dissector_key_mpls mpls; + struct flow_dissector_key_tcp tcp; } __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. 
*/ struct fl_flow_mask_range { @@ -424,6 +425,8 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { [TCA_FLOWER_KEY_MPLS_BOS] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_MPLS_TC] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_MPLS_LABEL] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_TCP_FLAGS] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_TCP_FLAGS_MASK] = { .type = NLA_U16 }, }; static void fl_set_key_val(struct nlattr **tb, @@ -596,6 +599,9 @@ static int fl_set_key(struct net *net, struct nlattr **tb, fl_set_key_val(tb, &key->tp.dst, TCA_FLOWER_KEY_TCP_DST, &mask->tp.dst, TCA_FLOWER_KEY_TCP_DST_MASK, sizeof(key->tp.dst)); + fl_set_key_val(tb, &key->tcp.flags, TCA_FLOWER_KEY_TCP_FLAGS, + &mask->tcp.flags, TCA_FLOWER_KEY_TCP_FLAGS_MASK, + sizeof(key->tcp.flags)); } else if (key->basic.ip_proto == IPPROTO_UDP) { fl_set_key_val(tb, &key->tp.src, TCA_FLOWER_KEY_UDP_SRC, &mask->tp.src, TCA_FLOWER_KEY_UDP_SRC_MASK, @@ -766,6 +772,8 @@ static void fl_init_dissector(struct cls_fl_head *head, FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6); FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, FLOW_DISSECTOR_KEY_PORTS, tp); + FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FLOW_DISSECTOR_KEY_TCP, tcp); FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, FLOW_DISSECTOR_KEY_ICMP, icmp); FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, @@ -1215,7 +1223,10 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, sizeof(key->tp.src)) || fl_dump_key_val(skb, &key->tp.dst, TCA_FLOWER_KEY_TCP_DST, &mask->tp.dst, TCA_FLOWER_KEY_TCP_DST_MASK, - sizeof(key->tp.dst)))) + sizeof(key->tp.dst)) || + fl_dump_key_val(skb, &key->tcp.flags, TCA_FLOWER_KEY_TCP_FLAGS, + &mask->tcp.flags, TCA_FLOWER_KEY_TCP_FLAGS_MASK, + sizeof(key->tcp.flags)))) goto nla_put_failure; else if (key->basic.ip_proto == IPPROTO_UDP && (fl_dump_key_val(skb, &key->tp.src, TCA_FLOWER_KEY_UDP_SRC, -- cgit v1.2.3 From 07903ada96139ced48f2f893fe57a26a8fbc6043 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 26 May 2017 12:03:10 +0300 Subject: mmtimer: Remove the SGI SN2 mmtimer driver This driver supports direct system clock access on the ancient SGI SN2 IA64 systems, and implement the only non-builtin k_clock instance. Remove it as any remaining IA64 altix user will be running just as old distros anyway. Dimitri Sivanich stated: "Since this is SN2 specific, this can be removed." Note that this does not affect the never uv_mmtimer driver for x86-based Altix systems. [ tglx: Added comment to CLOCK_SGI_CYCLE ] Signed-off-by: Christoph Hellwig Signed-off-by: Thomas Gleixner Cc: Mike Travis Cc: Dimitri Sivanich Link: http://lkml.kernel.org/r/20170526090311.3377-2-hch@lst.de --- drivers/char/Kconfig | 9 - drivers/char/Makefile | 1 - drivers/char/mmtimer.c | 858 ---------------------------------------------- include/uapi/linux/time.h | 6 +- 4 files changed, 5 insertions(+), 869 deletions(-) delete mode 100644 drivers/char/mmtimer.c (limited to 'include/uapi/linux') diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig index 31adbebf812e..2af70014ee5a 100644 --- a/drivers/char/Kconfig +++ b/drivers/char/Kconfig @@ -539,15 +539,6 @@ config HANGCHECK_TIMER out to lunch past a certain margin. It can reboot the system or merely print a warning. -config MMTIMER - tristate "MMTIMER Memory mapped RTC for SGI Altix" - depends on IA64_GENERIC || IA64_SGI_SN2 - depends on POSIX_TIMERS - default y - help - The mmtimer device allows direct userspace access to the - Altix system timer. 
- config UV_MMTIMER tristate "UV_MMTIMER Memory mapped RTC for SGI UV" depends on X86_UV diff --git a/drivers/char/Makefile b/drivers/char/Makefile index 6e6c244a66a0..53e33720818c 100644 --- a/drivers/char/Makefile +++ b/drivers/char/Makefile @@ -10,7 +10,6 @@ obj-$(CONFIG_VIRTIO_CONSOLE) += virtio_console.o obj-$(CONFIG_RAW_DRIVER) += raw.o obj-$(CONFIG_SGI_SNSC) += snsc.o snsc_event.o obj-$(CONFIG_MSPEC) += mspec.o -obj-$(CONFIG_MMTIMER) += mmtimer.o obj-$(CONFIG_UV_MMTIMER) += uv_mmtimer.o obj-$(CONFIG_IBM_BSR) += bsr.o obj-$(CONFIG_SGI_MBCS) += mbcs.o diff --git a/drivers/char/mmtimer.c b/drivers/char/mmtimer.c deleted file mode 100644 index 0e7fcb04f01e..000000000000 --- a/drivers/char/mmtimer.c +++ /dev/null @@ -1,858 +0,0 @@ -/* - * Timer device implementation for SGI SN platforms. - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - * - * Copyright (c) 2001-2006 Silicon Graphics, Inc. All rights reserved. - * - * This driver exports an API that should be supportable by any HPET or IA-PC - * multimedia timer. The code below is currently specific to the SGI Altix - * SHub RTC, however. - * - * 11/01/01 - jbarnes - initial revision - * 9/10/04 - Christoph Lameter - remove interrupt support for kernel inclusion - * 10/1/04 - Christoph Lameter - provide posix clock CLOCK_SGI_CYCLE - * 10/13/04 - Christoph Lameter, Dimitri Sivanich - provide timer interrupt - * support via the posix timer interface - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -MODULE_AUTHOR("Jesse Barnes "); -MODULE_DESCRIPTION("SGI Altix RTC Timer"); -MODULE_LICENSE("GPL"); - -/* name of the device, usually in /dev */ -#define MMTIMER_NAME "mmtimer" -#define MMTIMER_DESC "SGI Altix RTC Timer" -#define MMTIMER_VERSION "2.1" - -#define RTC_BITS 55 /* 55 bits for this implementation */ - -static struct k_clock sgi_clock; - -extern unsigned long sn_rtc_cycles_per_second; - -#define RTC_COUNTER_ADDR ((long *)LOCAL_MMR_ADDR(SH_RTC)) - -#define rtc_time() (*RTC_COUNTER_ADDR) - -static DEFINE_MUTEX(mmtimer_mutex); -static long mmtimer_ioctl(struct file *file, unsigned int cmd, - unsigned long arg); -static int mmtimer_mmap(struct file *file, struct vm_area_struct *vma); - -/* - * Period in femtoseconds (10^-15 s) - */ -static unsigned long mmtimer_femtoperiod = 0; - -static const struct file_operations mmtimer_fops = { - .owner = THIS_MODULE, - .mmap = mmtimer_mmap, - .unlocked_ioctl = mmtimer_ioctl, - .llseek = noop_llseek, -}; - -/* - * We only have comparison registers RTC1-4 currently available per - * node. RTC0 is used by SAL. 
- */ -/* Check for an RTC interrupt pending */ -static int mmtimer_int_pending(int comparator) -{ - if (HUB_L((unsigned long *)LOCAL_MMR_ADDR(SH_EVENT_OCCURRED)) & - SH_EVENT_OCCURRED_RTC1_INT_MASK << comparator) - return 1; - else - return 0; -} - -/* Clear the RTC interrupt pending bit */ -static void mmtimer_clr_int_pending(int comparator) -{ - HUB_S((u64 *)LOCAL_MMR_ADDR(SH_EVENT_OCCURRED_ALIAS), - SH_EVENT_OCCURRED_RTC1_INT_MASK << comparator); -} - -/* Setup timer on comparator RTC1 */ -static void mmtimer_setup_int_0(int cpu, u64 expires) -{ - u64 val; - - /* Disable interrupt */ - HUB_S((u64 *)LOCAL_MMR_ADDR(SH_RTC1_INT_ENABLE), 0UL); - - /* Initialize comparator value */ - HUB_S((u64 *)LOCAL_MMR_ADDR(SH_INT_CMPB), -1L); - - /* Clear pending bit */ - mmtimer_clr_int_pending(0); - - val = ((u64)SGI_MMTIMER_VECTOR << SH_RTC1_INT_CONFIG_IDX_SHFT) | - ((u64)cpu_physical_id(cpu) << - SH_RTC1_INT_CONFIG_PID_SHFT); - - /* Set configuration */ - HUB_S((u64 *)LOCAL_MMR_ADDR(SH_RTC1_INT_CONFIG), val); - - /* Enable RTC interrupts */ - HUB_S((u64 *)LOCAL_MMR_ADDR(SH_RTC1_INT_ENABLE), 1UL); - - /* Initialize comparator value */ - HUB_S((u64 *)LOCAL_MMR_ADDR(SH_INT_CMPB), expires); - - -} - -/* Setup timer on comparator RTC2 */ -static void mmtimer_setup_int_1(int cpu, u64 expires) -{ - u64 val; - - HUB_S((u64 *)LOCAL_MMR_ADDR(SH_RTC2_INT_ENABLE), 0UL); - - HUB_S((u64 *)LOCAL_MMR_ADDR(SH_INT_CMPC), -1L); - - mmtimer_clr_int_pending(1); - - val = ((u64)SGI_MMTIMER_VECTOR << SH_RTC2_INT_CONFIG_IDX_SHFT) | - ((u64)cpu_physical_id(cpu) << - SH_RTC2_INT_CONFIG_PID_SHFT); - - HUB_S((u64 *)LOCAL_MMR_ADDR(SH_RTC2_INT_CONFIG), val); - - HUB_S((u64 *)LOCAL_MMR_ADDR(SH_RTC2_INT_ENABLE), 1UL); - - HUB_S((u64 *)LOCAL_MMR_ADDR(SH_INT_CMPC), expires); -} - -/* Setup timer on comparator RTC3 */ -static void mmtimer_setup_int_2(int cpu, u64 expires) -{ - u64 val; - - HUB_S((u64 *)LOCAL_MMR_ADDR(SH_RTC3_INT_ENABLE), 0UL); - - HUB_S((u64 *)LOCAL_MMR_ADDR(SH_INT_CMPD), -1L); - - mmtimer_clr_int_pending(2); - - val = ((u64)SGI_MMTIMER_VECTOR << SH_RTC3_INT_CONFIG_IDX_SHFT) | - ((u64)cpu_physical_id(cpu) << - SH_RTC3_INT_CONFIG_PID_SHFT); - - HUB_S((u64 *)LOCAL_MMR_ADDR(SH_RTC3_INT_CONFIG), val); - - HUB_S((u64 *)LOCAL_MMR_ADDR(SH_RTC3_INT_ENABLE), 1UL); - - HUB_S((u64 *)LOCAL_MMR_ADDR(SH_INT_CMPD), expires); -} - -/* - * This function must be called with interrupts disabled and preemption off - * in order to insure that the setup succeeds in a deterministic time frame. - * It will check if the interrupt setup succeeded. - */ -static int mmtimer_setup(int cpu, int comparator, unsigned long expires, - u64 *set_completion_time) -{ - switch (comparator) { - case 0: - mmtimer_setup_int_0(cpu, expires); - break; - case 1: - mmtimer_setup_int_1(cpu, expires); - break; - case 2: - mmtimer_setup_int_2(cpu, expires); - break; - } - /* We might've missed our expiration time */ - *set_completion_time = rtc_time(); - if (*set_completion_time <= expires) - return 1; - - /* - * If an interrupt is already pending then its okay - * if not then we failed - */ - return mmtimer_int_pending(comparator); -} - -static int mmtimer_disable_int(long nasid, int comparator) -{ - switch (comparator) { - case 0: - nasid == -1 ? HUB_S((u64 *)LOCAL_MMR_ADDR(SH_RTC1_INT_ENABLE), - 0UL) : REMOTE_HUB_S(nasid, SH_RTC1_INT_ENABLE, 0UL); - break; - case 1: - nasid == -1 ? HUB_S((u64 *)LOCAL_MMR_ADDR(SH_RTC2_INT_ENABLE), - 0UL) : REMOTE_HUB_S(nasid, SH_RTC2_INT_ENABLE, 0UL); - break; - case 2: - nasid == -1 ? 
HUB_S((u64 *)LOCAL_MMR_ADDR(SH_RTC3_INT_ENABLE), - 0UL) : REMOTE_HUB_S(nasid, SH_RTC3_INT_ENABLE, 0UL); - break; - default: - return -EFAULT; - } - return 0; -} - -#define COMPARATOR 1 /* The comparator to use */ - -#define TIMER_OFF 0xbadcabLL /* Timer is not setup */ -#define TIMER_SET 0 /* Comparator is set for this timer */ - -#define MMTIMER_INTERVAL_RETRY_INCREMENT_DEFAULT 40 - -/* There is one of these for each timer */ -struct mmtimer { - struct rb_node list; - struct k_itimer *timer; - int cpu; -}; - -struct mmtimer_node { - spinlock_t lock ____cacheline_aligned; - struct rb_root timer_head; - struct rb_node *next; - struct tasklet_struct tasklet; -}; -static struct mmtimer_node *timers; - -static unsigned mmtimer_interval_retry_increment = - MMTIMER_INTERVAL_RETRY_INCREMENT_DEFAULT; -module_param(mmtimer_interval_retry_increment, uint, 0644); -MODULE_PARM_DESC(mmtimer_interval_retry_increment, - "RTC ticks to add to expiration on interval retry (default 40)"); - -/* - * Add a new mmtimer struct to the node's mmtimer list. - * This function assumes the struct mmtimer_node is locked. - */ -static void mmtimer_add_list(struct mmtimer *n) -{ - int nodeid = n->timer->it.mmtimer.node; - unsigned long expires = n->timer->it.mmtimer.expires; - struct rb_node **link = &timers[nodeid].timer_head.rb_node; - struct rb_node *parent = NULL; - struct mmtimer *x; - - /* - * Find the right place in the rbtree: - */ - while (*link) { - parent = *link; - x = rb_entry(parent, struct mmtimer, list); - - if (expires < x->timer->it.mmtimer.expires) - link = &(*link)->rb_left; - else - link = &(*link)->rb_right; - } - - /* - * Insert the timer to the rbtree and check whether it - * replaces the first pending timer - */ - rb_link_node(&n->list, parent, link); - rb_insert_color(&n->list, &timers[nodeid].timer_head); - - if (!timers[nodeid].next || expires < rb_entry(timers[nodeid].next, - struct mmtimer, list)->timer->it.mmtimer.expires) - timers[nodeid].next = &n->list; -} - -/* - * Set the comparator for the next timer. - * This function assumes the struct mmtimer_node is locked. - */ -static void mmtimer_set_next_timer(int nodeid) -{ - struct mmtimer_node *n = &timers[nodeid]; - struct mmtimer *x; - struct k_itimer *t; - u64 expires, exp, set_completion_time; - int i; - -restart: - if (n->next == NULL) - return; - - x = rb_entry(n->next, struct mmtimer, list); - t = x->timer; - if (!t->it.mmtimer.incr) { - /* Not an interval timer */ - if (!mmtimer_setup(x->cpu, COMPARATOR, - t->it.mmtimer.expires, - &set_completion_time)) { - /* Late setup, fire now */ - tasklet_schedule(&n->tasklet); - } - return; - } - - /* Interval timer */ - i = 0; - expires = exp = t->it.mmtimer.expires; - while (!mmtimer_setup(x->cpu, COMPARATOR, expires, - &set_completion_time)) { - int to; - - i++; - expires = set_completion_time + - mmtimer_interval_retry_increment + (1 << i); - /* Calculate overruns as we go. */ - to = ((u64)(expires - exp) / t->it.mmtimer.incr); - if (to) { - t->it_overrun += to; - t->it.mmtimer.expires += t->it.mmtimer.incr * to; - exp = t->it.mmtimer.expires; - } - if (i > 20) { - printk(KERN_ALERT "mmtimer: cannot reschedule timer\n"); - t->it.mmtimer.clock = TIMER_OFF; - n->next = rb_next(&x->list); - rb_erase(&x->list, &n->timer_head); - kfree(x); - goto restart; - } - } -} - -/** - * mmtimer_ioctl - ioctl interface for /dev/mmtimer - * @file: file structure for the device - * @cmd: command to execute - * @arg: optional argument to command - * - * Executes the command specified by @cmd. 
Returns 0 for success, < 0 for - * failure. - * - * Valid commands: - * - * %MMTIMER_GETOFFSET - Should return the offset (relative to the start - * of the page where the registers are mapped) for the counter in question. - * - * %MMTIMER_GETRES - Returns the resolution of the clock in femto (10^-15) - * seconds - * - * %MMTIMER_GETFREQ - Copies the frequency of the clock in Hz to the address - * specified by @arg - * - * %MMTIMER_GETBITS - Returns the number of bits in the clock's counter - * - * %MMTIMER_MMAPAVAIL - Returns 1 if the registers can be mmap'd into userspace - * - * %MMTIMER_GETCOUNTER - Gets the current value in the counter and places it - * in the address specified by @arg. - */ -static long mmtimer_ioctl(struct file *file, unsigned int cmd, - unsigned long arg) -{ - int ret = 0; - - mutex_lock(&mmtimer_mutex); - - switch (cmd) { - case MMTIMER_GETOFFSET: /* offset of the counter */ - /* - * SN RTC registers are on their own 64k page - */ - if(PAGE_SIZE <= (1 << 16)) - ret = (((long)RTC_COUNTER_ADDR) & (PAGE_SIZE-1)) / 8; - else - ret = -ENOSYS; - break; - - case MMTIMER_GETRES: /* resolution of the clock in 10^-15 s */ - if(copy_to_user((unsigned long __user *)arg, - &mmtimer_femtoperiod, sizeof(unsigned long))) - ret = -EFAULT; - break; - - case MMTIMER_GETFREQ: /* frequency in Hz */ - if(copy_to_user((unsigned long __user *)arg, - &sn_rtc_cycles_per_second, - sizeof(unsigned long))) - ret = -EFAULT; - break; - - case MMTIMER_GETBITS: /* number of bits in the clock */ - ret = RTC_BITS; - break; - - case MMTIMER_MMAPAVAIL: /* can we mmap the clock into userspace? */ - ret = (PAGE_SIZE <= (1 << 16)) ? 1 : 0; - break; - - case MMTIMER_GETCOUNTER: - if(copy_to_user((unsigned long __user *)arg, - RTC_COUNTER_ADDR, sizeof(unsigned long))) - ret = -EFAULT; - break; - default: - ret = -ENOTTY; - break; - } - mutex_unlock(&mmtimer_mutex); - return ret; -} - -/** - * mmtimer_mmap - maps the clock's registers into userspace - * @file: file structure for the device - * @vma: VMA to map the registers into - * - * Calls remap_pfn_range() to map the clock's registers into - * the calling process' address space. 
- */ -static int mmtimer_mmap(struct file *file, struct vm_area_struct *vma) -{ - unsigned long mmtimer_addr; - - if (vma->vm_end - vma->vm_start != PAGE_SIZE) - return -EINVAL; - - if (vma->vm_flags & VM_WRITE) - return -EPERM; - - if (PAGE_SIZE > (1 << 16)) - return -ENOSYS; - - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - - mmtimer_addr = __pa(RTC_COUNTER_ADDR); - mmtimer_addr &= ~(PAGE_SIZE - 1); - mmtimer_addr &= 0xfffffffffffffffUL; - - if (remap_pfn_range(vma, vma->vm_start, mmtimer_addr >> PAGE_SHIFT, - PAGE_SIZE, vma->vm_page_prot)) { - printk(KERN_ERR "remap_pfn_range failed in mmtimer.c\n"); - return -EAGAIN; - } - - return 0; -} - -static struct miscdevice mmtimer_miscdev = { - .minor = SGI_MMTIMER, - .name = MMTIMER_NAME, - .fops = &mmtimer_fops -}; - -static struct timespec sgi_clock_offset; -static int sgi_clock_period; - -/* - * Posix Timer Interface - */ - -static struct timespec sgi_clock_offset; -static int sgi_clock_period; - -static int sgi_clock_get(clockid_t clockid, struct timespec64 *tp) -{ - u64 nsec; - - nsec = rtc_time() * sgi_clock_period - + sgi_clock_offset.tv_nsec; - *tp = ns_to_timespec64(nsec); - tp->tv_sec += sgi_clock_offset.tv_sec; - return 0; -}; - -static int sgi_clock_set(const clockid_t clockid, const struct timespec64 *tp) -{ - - u64 nsec; - u32 rem; - - nsec = rtc_time() * sgi_clock_period; - - sgi_clock_offset.tv_sec = tp->tv_sec - div_u64_rem(nsec, NSEC_PER_SEC, &rem); - - if (rem <= tp->tv_nsec) - sgi_clock_offset.tv_nsec = tp->tv_sec - rem; - else { - sgi_clock_offset.tv_nsec = tp->tv_sec + NSEC_PER_SEC - rem; - sgi_clock_offset.tv_sec--; - } - return 0; -} - -/** - * mmtimer_interrupt - timer interrupt handler - * @irq: irq received - * @dev_id: device the irq came from - * - * Called when one of the comarators matches the counter, This - * routine will send signals to processes that have requested - * them. - * - * This interrupt is run in an interrupt context - * by the SHUB. It is therefore safe to locally access SHub - * registers. 
- */ -static irqreturn_t -mmtimer_interrupt(int irq, void *dev_id) -{ - unsigned long expires = 0; - int result = IRQ_NONE; - unsigned indx = cpu_to_node(smp_processor_id()); - struct mmtimer *base; - - spin_lock(&timers[indx].lock); - base = rb_entry(timers[indx].next, struct mmtimer, list); - if (base == NULL) { - spin_unlock(&timers[indx].lock); - return result; - } - - if (base->cpu == smp_processor_id()) { - if (base->timer) - expires = base->timer->it.mmtimer.expires; - /* expires test won't work with shared irqs */ - if ((mmtimer_int_pending(COMPARATOR) > 0) || - (expires && (expires <= rtc_time()))) { - mmtimer_clr_int_pending(COMPARATOR); - tasklet_schedule(&timers[indx].tasklet); - result = IRQ_HANDLED; - } - } - spin_unlock(&timers[indx].lock); - return result; -} - -static void mmtimer_tasklet(unsigned long data) -{ - int nodeid = data; - struct mmtimer_node *mn = &timers[nodeid]; - struct mmtimer *x; - struct k_itimer *t; - unsigned long flags; - - /* Send signal and deal with periodic signals */ - spin_lock_irqsave(&mn->lock, flags); - if (!mn->next) - goto out; - - x = rb_entry(mn->next, struct mmtimer, list); - t = x->timer; - - if (t->it.mmtimer.clock == TIMER_OFF) - goto out; - - t->it_overrun = 0; - - mn->next = rb_next(&x->list); - rb_erase(&x->list, &mn->timer_head); - - if (posix_timer_event(t, 0) != 0) - t->it_overrun++; - - if(t->it.mmtimer.incr) { - t->it.mmtimer.expires += t->it.mmtimer.incr; - mmtimer_add_list(x); - } else { - /* Ensure we don't false trigger in mmtimer_interrupt */ - t->it.mmtimer.clock = TIMER_OFF; - t->it.mmtimer.expires = 0; - kfree(x); - } - /* Set comparator for next timer, if there is one */ - mmtimer_set_next_timer(nodeid); - - t->it_overrun_last = t->it_overrun; -out: - spin_unlock_irqrestore(&mn->lock, flags); -} - -static int sgi_timer_create(struct k_itimer *timer) -{ - /* Insure that a newly created timer is off */ - timer->it.mmtimer.clock = TIMER_OFF; - return 0; -} - -/* This does not really delete a timer. 
It just insures - * that the timer is not active - * - * Assumption: it_lock is already held with irq's disabled - */ -static int sgi_timer_del(struct k_itimer *timr) -{ - cnodeid_t nodeid = timr->it.mmtimer.node; - unsigned long irqflags; - - spin_lock_irqsave(&timers[nodeid].lock, irqflags); - if (timr->it.mmtimer.clock != TIMER_OFF) { - unsigned long expires = timr->it.mmtimer.expires; - struct rb_node *n = timers[nodeid].timer_head.rb_node; - struct mmtimer *uninitialized_var(t); - int r = 0; - - timr->it.mmtimer.clock = TIMER_OFF; - timr->it.mmtimer.expires = 0; - - while (n) { - t = rb_entry(n, struct mmtimer, list); - if (t->timer == timr) - break; - - if (expires < t->timer->it.mmtimer.expires) - n = n->rb_left; - else - n = n->rb_right; - } - - if (!n) { - spin_unlock_irqrestore(&timers[nodeid].lock, irqflags); - return 0; - } - - if (timers[nodeid].next == n) { - timers[nodeid].next = rb_next(n); - r = 1; - } - - rb_erase(n, &timers[nodeid].timer_head); - kfree(t); - - if (r) { - mmtimer_disable_int(cnodeid_to_nasid(nodeid), - COMPARATOR); - mmtimer_set_next_timer(nodeid); - } - } - spin_unlock_irqrestore(&timers[nodeid].lock, irqflags); - return 0; -} - -/* Assumption: it_lock is already held with irq's disabled */ -static void sgi_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) -{ - - if (timr->it.mmtimer.clock == TIMER_OFF) { - cur_setting->it_interval.tv_nsec = 0; - cur_setting->it_interval.tv_sec = 0; - cur_setting->it_value.tv_nsec = 0; - cur_setting->it_value.tv_sec =0; - return; - } - - cur_setting->it_interval = ns_to_timespec64(timr->it.mmtimer.incr * sgi_clock_period); - cur_setting->it_value = ns_to_timespec64((timr->it.mmtimer.expires - rtc_time()) * sgi_clock_period); -} - - -static int sgi_timer_set(struct k_itimer *timr, int flags, - struct itimerspec64 *new_setting, - struct itimerspec64 *old_setting) -{ - unsigned long when, period, irqflags; - int err = 0; - cnodeid_t nodeid; - struct mmtimer *base; - struct rb_node *n; - - if (old_setting) - sgi_timer_get(timr, old_setting); - - sgi_timer_del(timr); - when = timespec64_to_ns(&new_setting->it_value); - period = timespec64_to_ns(&new_setting->it_interval); - - if (when == 0) - /* Clear timer */ - return 0; - - base = kmalloc(sizeof(struct mmtimer), GFP_KERNEL); - if (base == NULL) - return -ENOMEM; - - if (flags & TIMER_ABSTIME) { - struct timespec64 n; - unsigned long now; - - getnstimeofday64(&n); - now = timespec64_to_ns(&n); - if (when > now) - when -= now; - else - /* Fire the timer immediately */ - when = 0; - } - - /* - * Convert to sgi clock period. Need to keep rtc_time() as near as possible - * to getnstimeofday() in order to be as faithful as possible to the time - * specified. - */ - when = (when + sgi_clock_period - 1) / sgi_clock_period + rtc_time(); - period = (period + sgi_clock_period - 1) / sgi_clock_period; - - /* - * We are allocating a local SHub comparator. If we would be moved to another - * cpu then another SHub may be local to us. Prohibit that by switching off - * preemption. 
- */ - preempt_disable(); - - nodeid = cpu_to_node(smp_processor_id()); - - /* Lock the node timer structure */ - spin_lock_irqsave(&timers[nodeid].lock, irqflags); - - base->timer = timr; - base->cpu = smp_processor_id(); - - timr->it.mmtimer.clock = TIMER_SET; - timr->it.mmtimer.node = nodeid; - timr->it.mmtimer.incr = period; - timr->it.mmtimer.expires = when; - - n = timers[nodeid].next; - - /* Add the new struct mmtimer to node's timer list */ - mmtimer_add_list(base); - - if (timers[nodeid].next == n) { - /* No need to reprogram comparator for now */ - spin_unlock_irqrestore(&timers[nodeid].lock, irqflags); - preempt_enable(); - return err; - } - - /* We need to reprogram the comparator */ - if (n) - mmtimer_disable_int(cnodeid_to_nasid(nodeid), COMPARATOR); - - mmtimer_set_next_timer(nodeid); - - /* Unlock the node timer structure */ - spin_unlock_irqrestore(&timers[nodeid].lock, irqflags); - - preempt_enable(); - - return err; -} - -static int sgi_clock_getres(const clockid_t which_clock, struct timespec64 *tp) -{ - tp->tv_sec = 0; - tp->tv_nsec = sgi_clock_period; - return 0; -} - -static struct k_clock sgi_clock = { - .clock_set = sgi_clock_set, - .clock_get = sgi_clock_get, - .clock_getres = sgi_clock_getres, - .timer_create = sgi_timer_create, - .timer_set = sgi_timer_set, - .timer_del = sgi_timer_del, - .timer_get = sgi_timer_get -}; - -/** - * mmtimer_init - device initialization routine - * - * Does initial setup for the mmtimer device. - */ -static int __init mmtimer_init(void) -{ - cnodeid_t node, maxn = -1; - - if (!ia64_platform_is("sn2")) - return 0; - - /* - * Sanity check the cycles/sec variable - */ - if (sn_rtc_cycles_per_second < 100000) { - printk(KERN_ERR "%s: unable to determine clock frequency\n", - MMTIMER_NAME); - goto out1; - } - - mmtimer_femtoperiod = ((unsigned long)1E15 + sn_rtc_cycles_per_second / - 2) / sn_rtc_cycles_per_second; - - if (request_irq(SGI_MMTIMER_VECTOR, mmtimer_interrupt, IRQF_PERCPU, MMTIMER_NAME, NULL)) { - printk(KERN_WARNING "%s: unable to allocate interrupt.", - MMTIMER_NAME); - goto out1; - } - - if (misc_register(&mmtimer_miscdev)) { - printk(KERN_ERR "%s: failed to register device\n", - MMTIMER_NAME); - goto out2; - } - - /* Get max numbered node, calculate slots needed */ - for_each_online_node(node) { - maxn = node; - } - maxn++; - - /* Allocate list of node ptrs to mmtimer_t's */ - timers = kzalloc(sizeof(struct mmtimer_node)*maxn, GFP_KERNEL); - if (!timers) { - printk(KERN_ERR "%s: failed to allocate memory for device\n", - MMTIMER_NAME); - goto out3; - } - - /* Initialize struct mmtimer's for each online node */ - for_each_online_node(node) { - spin_lock_init(&timers[node].lock); - tasklet_init(&timers[node].tasklet, mmtimer_tasklet, - (unsigned long) node); - } - - sgi_clock_period = NSEC_PER_SEC / sn_rtc_cycles_per_second; - posix_timers_register_clock(CLOCK_SGI_CYCLE, &sgi_clock); - - printk(KERN_INFO "%s: v%s, %ld MHz\n", MMTIMER_DESC, MMTIMER_VERSION, - sn_rtc_cycles_per_second/(unsigned long)1E6); - - return 0; - -out3: - misc_deregister(&mmtimer_miscdev); -out2: - free_irq(SGI_MMTIMER_VECTOR, NULL); -out1: - return -1; -} - -module_init(mmtimer_init); diff --git a/include/uapi/linux/time.h b/include/uapi/linux/time.h index e75e1b6ff27f..09299fcb842a 100644 --- a/include/uapi/linux/time.h +++ b/include/uapi/linux/time.h @@ -54,7 +54,11 @@ struct itimerval { #define CLOCK_BOOTTIME 7 #define CLOCK_REALTIME_ALARM 8 #define CLOCK_BOOTTIME_ALARM 9 -#define CLOCK_SGI_CYCLE 10 /* Hardware specific */ +/* + * The driver 
implementing this got removed. The clock ID is kept as a + * place holder. Do not reuse! + */ +#define CLOCK_SGI_CYCLE 10 #define CLOCK_TAI 11 #define MAX_CLOCKS 16 -- cgit v1.2.3 From 0be1b305d9b808e5b28e74f4ef807851c14c39f2 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Thu, 25 May 2017 10:42:38 -0700 Subject: net: ipv4: add new RTM_F_FIB_MATCH flag for use with RTM_GETROUTE This flag when specified will return matched fib result in response to a RTM_GETROUTE query. Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- include/uapi/linux/rtnetlink.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 6487b21b2c1e..564790e854f7 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -278,6 +278,7 @@ enum rt_scope_t { #define RTM_F_EQUALIZE 0x400 /* Multipath equalizer: NI */ #define RTM_F_PREFIX 0x800 /* Prefix addresses */ #define RTM_F_LOOKUP_TABLE 0x1000 /* set rtm_table to FIB lookup result */ +#define RTM_F_FIB_MATCH 0x2000 /* return full fib lookup match */ /* Reserved table identifiers */ -- cgit v1.2.3 From 3d3ea5af5c0b382bc9d9aed378fd814fb5d4a011 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Sat, 27 May 2017 10:14:34 -0400 Subject: rtnl: Add support for netdev event to link messages When netdev events happen, a rtnetlink_event() handler will send messages for every event in it's white list. These messages contain current information about a particular device, but they do not include the iformation about which event just happened. So, it is impossible to tell what just happend for these events. This patch adds a new extension to RTM_NEWLINK message called IFLA_EVENT that would have an encoding of event that triggered this message. This would allow the the message consumer to easily determine if it needs to perform certain actions. Signed-off-by: Vladislav Yasevich Acked-by: David Ahern Signed-off-by: David S. Miller --- include/linux/rtnetlink.h | 3 +- include/uapi/linux/if_link.h | 11 ++++++++ net/core/dev.c | 2 +- net/core/rtnetlink.c | 65 ++++++++++++++++++++++++++++++++++++++------ 4 files changed, 70 insertions(+), 11 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 57e54847b0b9..dea59c8eec54 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -18,7 +18,8 @@ extern int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change, gfp_t flags); struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, - unsigned change, gfp_t flags); + unsigned change, u32 event, + gfp_t flags); void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 15ac20382aba..8ed679fe603f 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -157,6 +157,7 @@ enum { IFLA_GSO_MAX_SIZE, IFLA_PAD, IFLA_XDP, + IFLA_EVENT, __IFLA_MAX }; @@ -911,4 +912,14 @@ enum { #define IFLA_XDP_MAX (__IFLA_XDP_MAX - 1) +enum { + IFLA_EVENT_NONE, + IFLA_EVENT_REBOOT, /* internal reset / reboot */ + IFLA_EVENT_FEATURES, /* change in offload features */ + IFLA_EVENT_BONDING_FAILOVER, /* change in active slave */ + IFLA_EVENT_NOTIFY_PEERS, /* re-sent grat. 
arp/ndisc */ + IFLA_EVENT_IGMP_RESEND, /* re-sent IGMP JOIN */ + IFLA_EVENT_BONDING_OPTIONS, /* change in bonding options */ +}; + #endif /* _UAPI_LINUX_IF_LINK_H */ diff --git a/net/core/dev.c b/net/core/dev.c index 3d98fbf4cbb0..06e0a7492df8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7084,7 +7084,7 @@ static void rollback_registered_many(struct list_head *head) if (!dev->rtnl_link_ops || dev->rtnl_link_state == RTNL_LINK_INITIALIZED) - skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, + skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0, GFP_KERNEL); /* diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 64953af4a3b1..9da53e43750c 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -941,6 +941,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_SWITCH_ID */ + nla_total_size(IFNAMSIZ) /* IFLA_PHYS_PORT_NAME */ + rtnl_xdp_size() /* IFLA_XDP */ + + nla_total_size(4) /* IFLA_EVENT */ + nla_total_size(1); /* IFLA_PROTO_DOWN */ } @@ -1282,9 +1283,40 @@ err_cancel: return err; } +static u32 rtnl_get_event(unsigned long event) +{ + u32 rtnl_event_type = IFLA_EVENT_NONE; + + switch (event) { + case NETDEV_REBOOT: + rtnl_event_type = IFLA_EVENT_REBOOT; + break; + case NETDEV_FEAT_CHANGE: + rtnl_event_type = IFLA_EVENT_FEATURES; + break; + case NETDEV_BONDING_FAILOVER: + rtnl_event_type = IFLA_EVENT_BONDING_FAILOVER; + break; + case NETDEV_NOTIFY_PEERS: + rtnl_event_type = IFLA_EVENT_NOTIFY_PEERS; + break; + case NETDEV_RESEND_IGMP: + rtnl_event_type = IFLA_EVENT_IGMP_RESEND; + break; + case NETDEV_CHANGEINFODATA: + rtnl_event_type = IFLA_EVENT_BONDING_OPTIONS; + break; + default: + break; + } + + return rtnl_event_type; +} + static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, int type, u32 pid, u32 seq, u32 change, - unsigned int flags, u32 ext_filter_mask) + unsigned int flags, u32 ext_filter_mask, + u32 event) { struct ifinfomsg *ifm; struct nlmsghdr *nlh; @@ -1333,6 +1365,11 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down)) goto nla_put_failure; + if (event != IFLA_EVENT_NONE) { + if (nla_put_u32(skb, IFLA_EVENT, event)) + goto nla_put_failure; + } + if (rtnl_fill_link_ifmap(skb, dev)) goto nla_put_failure; @@ -1467,6 +1504,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_LINK_NETNSID] = { .type = NLA_S32 }, [IFLA_PROTO_DOWN] = { .type = NLA_U8 }, [IFLA_XDP] = { .type = NLA_NESTED }, + [IFLA_EVENT] = { .type = NLA_U32 }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { @@ -1626,7 +1664,7 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, 0, flags, - ext_filter_mask); + ext_filter_mask, 0); if (err < 0) { if (likely(skb->len)) @@ -2736,7 +2774,7 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh, return -ENOBUFS; err = rtnl_fill_ifinfo(nskb, dev, RTM_NEWLINK, NETLINK_CB(skb).portid, - nlh->nlmsg_seq, 0, 0, ext_filter_mask); + nlh->nlmsg_seq, 0, 0, ext_filter_mask, 0); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_size */ WARN_ON(err == -EMSGSIZE); @@ -2808,7 +2846,8 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) } struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, - unsigned int change, gfp_t flags) + unsigned int change, + u32 event, gfp_t flags) { struct net *net = dev_net(dev); struct sk_buff 
*skb; @@ -2819,7 +2858,7 @@ struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, if (skb == NULL) goto errout; - err = rtnl_fill_ifinfo(skb, dev, type, 0, 0, change, 0, 0); + err = rtnl_fill_ifinfo(skb, dev, type, 0, 0, change, 0, 0, event); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); @@ -2840,18 +2879,25 @@ void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags) rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, flags); } -void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, - gfp_t flags) +static void rtmsg_ifinfo_event(int type, struct net_device *dev, + unsigned int change, u32 event, + gfp_t flags) { struct sk_buff *skb; if (dev->reg_state != NETREG_REGISTERED) return; - skb = rtmsg_ifinfo_build_skb(type, dev, change, flags); + skb = rtmsg_ifinfo_build_skb(type, dev, change, event, flags); if (skb) rtmsg_ifinfo_send(skb, dev, flags); } + +void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, + gfp_t flags) +{ + rtmsg_ifinfo_event(type, dev, change, IFLA_EVENT_NONE, flags); +} EXPORT_SYMBOL(rtmsg_ifinfo); static int nlmsg_populate_fdb_fill(struct sk_buff *skb, @@ -4168,7 +4214,8 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi case NETDEV_NOTIFY_PEERS: case NETDEV_RESEND_IGMP: case NETDEV_CHANGEINFODATA: - rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL); + rtmsg_ifinfo_event(RTM_NEWLINK, dev, 0, rtnl_get_event(event), + GFP_KERNEL); break; default: break; -- cgit v1.2.3 From 685c9b24ad5090e7a74781c4784fc12e0a04a176 Mon Sep 17 00:00:00 2001 From: Shaun McDowell Date: Thu, 25 May 2017 23:55:54 -0400 Subject: nbd: add FUA op support NBD userland client and server have FUA (forced unit access) support and flags defined. Make NBD kernel module recognize NBD_FLAG_SEND_FUA, enable FUA on the queue, and forward FUA requests to the server. Signed-off-by: Shaun McDowell Reviewed-by: Josef Bacik Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 16 +++++++++++++--- include/uapi/linux/nbd.h | 4 ++++ 2 files changed, 17 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 9a7bb2c29447..c5e52f66d3d4 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -400,6 +400,7 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index) unsigned long size = blk_rq_bytes(req); struct bio *bio; u32 type; + u32 nbd_cmd_flags = 0; u32 tag = blk_mq_unique_tag(req); int sent = nsock->sent, skip = 0; @@ -429,6 +430,9 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index) return -EIO; } + if (req->cmd_flags & REQ_FUA) + nbd_cmd_flags |= NBD_CMD_FLAG_FUA; + /* We did a partial send previously, and we at least sent the whole * request struct, so just go and send the rest of the pages in the * request. 
@@ -442,7 +446,7 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index) } cmd->index = index; cmd->cookie = nsock->cookie; - request.type = htonl(type); + request.type = htonl(type | nbd_cmd_flags); if (type != NBD_CMD_FLUSH) { request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9); request.len = htonl(size); @@ -965,8 +969,12 @@ static void nbd_parse_flags(struct nbd_device *nbd) set_disk_ro(nbd->disk, false); if (config->flags & NBD_FLAG_SEND_TRIM) queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue); - if (config->flags & NBD_FLAG_SEND_FLUSH) - blk_queue_write_cache(nbd->disk->queue, true, false); + if (config->flags & NBD_FLAG_SEND_FLUSH) { + if (config->flags & NBD_FLAG_SEND_FUA) + blk_queue_write_cache(nbd->disk->queue, true, true); + else + blk_queue_write_cache(nbd->disk->queue, true, false); + } else blk_queue_write_cache(nbd->disk->queue, false, false); } @@ -1309,6 +1317,8 @@ static int nbd_dbg_flags_show(struct seq_file *s, void *unused) seq_puts(s, "NBD_FLAG_READ_ONLY\n"); if (flags & NBD_FLAG_SEND_FLUSH) seq_puts(s, "NBD_FLAG_SEND_FLUSH\n"); + if (flags & NBD_FLAG_SEND_FUA) + seq_puts(s, "NBD_FLAG_SEND_FUA\n"); if (flags & NBD_FLAG_SEND_TRIM) seq_puts(s, "NBD_FLAG_SEND_TRIM\n"); diff --git a/include/uapi/linux/nbd.h b/include/uapi/linux/nbd.h index 155e33f81913..a50527ebf671 100644 --- a/include/uapi/linux/nbd.h +++ b/include/uapi/linux/nbd.h @@ -41,10 +41,14 @@ enum { #define NBD_FLAG_HAS_FLAGS (1 << 0) /* nbd-server supports flags */ #define NBD_FLAG_READ_ONLY (1 << 1) /* device is read-only */ #define NBD_FLAG_SEND_FLUSH (1 << 2) /* can flush writeback cache */ +#define NBD_FLAG_SEND_FUA (1 << 3) /* send FUA (forced unit access) */ /* there is a gap here to match userspace */ #define NBD_FLAG_SEND_TRIM (1 << 5) /* send trim/discard */ #define NBD_FLAG_CAN_MULTI_CONN (1 << 8) /* Server supports multiple connections per export. */ +/* values for cmd flags in the upper 16 bits of request type */ +#define NBD_CMD_FLAG_FUA (1 << 16) /* FUA (forced unit access) op */ + /* These are client behavior specific flags. */ #define NBD_CFLAG_DESTROY_ON_DISCONNECT (1 << 0) /* delete the nbd device on disconnect. */ -- cgit v1.2.3 From 222155de45573e978cda988b7efc7d4e7b9a8ff9 Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Wed, 19 Apr 2017 18:23:38 -0700 Subject: usb: gadget: function: f_fs: Let ffs_epfile_ioctl wait for enable. This allows users to make an ioctl call as the first action on a connection. Ex, some functions might want to get endpoint size before making any i/os. Previously, calling ioctls before read/write would depending on the timing of endpoints being enabled. ESHUTDOWN is now a possible return value and ENODEV is not, so change docs accordingly. 
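
For illustration only, a minimal userspace sketch of the behaviour this patch enables follows. It is not part of the patch: the mount point and endpoint file name are hypothetical, it assumes the function's descriptors and strings have already been written to ep0 in the usual FunctionFS way, and it only handles the error cases named above (with O_NONBLOCK the ioctl would instead return -EAGAIN while the endpoint is still disabled).

/*
 * Hedged sketch, not taken from the patch: query an endpoint descriptor
 * as the very first action on a freshly opened FunctionFS endpoint file.
 * The path "/dev/ffs/ep1" is a placeholder for wherever the gadget
 * function is mounted.
 */
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/usb/ch9.h>
#include <linux/usb/functionfs.h>

int main(void)
{
	struct usb_endpoint_descriptor desc;
	int fd = open("/dev/ffs/ep1", O_RDWR);	/* blocking open */

	if (fd < 0) {
		perror("open");
		return 1;
	}

	/*
	 * With this patch the ioctl waits for the endpoint to be enabled
	 * instead of failing with -ENODEV when issued too early.
	 */
	if (ioctl(fd, FUNCTIONFS_ENDPOINT_DESC, &desc) < 0) {
		if (errno == ESHUTDOWN)
			fprintf(stderr, "endpoint went away during the call\n");
		else
			perror("FUNCTIONFS_ENDPOINT_DESC");
		close(fd);
		return 1;
	}

	printf("wMaxPacketSize = %u\n", desc.wMaxPacketSize);
	close(fd);
	return 0;
}
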
Acked-by: Michal Nazarewicz Signed-off-by: Jerry Zhang Signed-off-by: Felipe Balbi --- drivers/usb/gadget/function/f_fs.c | 93 +++++++++++++++++++++---------------- include/uapi/linux/usb/functionfs.h | 7 +-- 2 files changed, 58 insertions(+), 42 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c index 71dd27c0d7f2..a24f9bf9c1c0 100644 --- a/drivers/usb/gadget/function/f_fs.c +++ b/drivers/usb/gadget/function/f_fs.c @@ -1189,6 +1189,7 @@ static long ffs_epfile_ioctl(struct file *file, unsigned code, unsigned long value) { struct ffs_epfile *epfile = file->private_data; + struct ffs_ep *ep; int ret; ENTER(); @@ -1196,50 +1197,64 @@ static long ffs_epfile_ioctl(struct file *file, unsigned code, if (WARN_ON(epfile->ffs->state != FFS_ACTIVE)) return -ENODEV; + /* Wait for endpoint to be enabled */ + ep = epfile->ep; + if (!ep) { + if (file->f_flags & O_NONBLOCK) + return -EAGAIN; + + ret = wait_event_interruptible(epfile->wait, (ep = epfile->ep)); + if (ret) + return -EINTR; + } + spin_lock_irq(&epfile->ffs->eps_lock); - if (likely(epfile->ep)) { - switch (code) { - case FUNCTIONFS_FIFO_STATUS: - ret = usb_ep_fifo_status(epfile->ep->ep); - break; - case FUNCTIONFS_FIFO_FLUSH: - usb_ep_fifo_flush(epfile->ep->ep); - ret = 0; - break; - case FUNCTIONFS_CLEAR_HALT: - ret = usb_ep_clear_halt(epfile->ep->ep); - break; - case FUNCTIONFS_ENDPOINT_REVMAP: - ret = epfile->ep->num; - break; - case FUNCTIONFS_ENDPOINT_DESC: - { - int desc_idx; - struct usb_endpoint_descriptor *desc; - switch (epfile->ffs->gadget->speed) { - case USB_SPEED_SUPER: - desc_idx = 2; - break; - case USB_SPEED_HIGH: - desc_idx = 1; - break; - default: - desc_idx = 0; - } - desc = epfile->ep->descs[desc_idx]; + /* In the meantime, endpoint got disabled or changed. */ + if (epfile->ep != ep) { + spin_unlock_irq(&epfile->ffs->eps_lock); + return -ESHUTDOWN; + } - spin_unlock_irq(&epfile->ffs->eps_lock); - ret = copy_to_user((void *)value, desc, desc->bLength); - if (ret) - ret = -EFAULT; - return ret; - } + switch (code) { + case FUNCTIONFS_FIFO_STATUS: + ret = usb_ep_fifo_status(epfile->ep->ep); + break; + case FUNCTIONFS_FIFO_FLUSH: + usb_ep_fifo_flush(epfile->ep->ep); + ret = 0; + break; + case FUNCTIONFS_CLEAR_HALT: + ret = usb_ep_clear_halt(epfile->ep->ep); + break; + case FUNCTIONFS_ENDPOINT_REVMAP: + ret = epfile->ep->num; + break; + case FUNCTIONFS_ENDPOINT_DESC: + { + int desc_idx; + struct usb_endpoint_descriptor *desc; + + switch (epfile->ffs->gadget->speed) { + case USB_SPEED_SUPER: + desc_idx = 2; + break; + case USB_SPEED_HIGH: + desc_idx = 1; + break; default: - ret = -ENOTTY; + desc_idx = 0; } - } else { - ret = -ENODEV; + desc = epfile->ep->descs[desc_idx]; + + spin_unlock_irq(&epfile->ffs->eps_lock); + ret = copy_to_user((void *)value, desc, desc->bLength); + if (ret) + ret = -EFAULT; + return ret; + } + default: + ret = -ENOTTY; } spin_unlock_irq(&epfile->ffs->eps_lock); diff --git a/include/uapi/linux/usb/functionfs.h b/include/uapi/linux/usb/functionfs.h index 062606f02309..f913d08ab7bb 100644 --- a/include/uapi/linux/usb/functionfs.h +++ b/include/uapi/linux/usb/functionfs.h @@ -275,13 +275,14 @@ struct usb_functionfs_event { #define FUNCTIONFS_INTERFACE_REVMAP _IO('g', 128) /* - * Returns real bEndpointAddress of an endpoint. If function is not - * active returns -ENODEV. + * Returns real bEndpointAddress of an endpoint. If endpoint shuts down + * during the call, returns -ESHUTDOWN. 
*/ #define FUNCTIONFS_ENDPOINT_REVMAP _IO('g', 129) /* - * Returns endpoint descriptor. If function is not active returns -ENODEV. + * Returns endpoint descriptor. If endpoint shuts down during the call, + * returns -ESHUTDOWN. */ #define FUNCTIONFS_ENDPOINT_DESC _IOR('g', 130, \ struct usb_endpoint_descriptor) -- cgit v1.2.3 From 47f58e32a27c647de0963386d2714d570b38e3d3 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sat, 3 Jun 2017 18:41:07 +0900 Subject: tty: reserve N_SPEAKUP number Over in the staging tree, N_SPEAKUP is added, so to make life easier for merging and other development, also reserve it in the tty tree. Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/tty.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/tty.h b/include/uapi/linux/tty.h index 01c4410352ff..e7855dffd592 100644 --- a/include/uapi/linux/tty.h +++ b/include/uapi/linux/tty.h @@ -35,5 +35,6 @@ #define N_TRACESINK 23 /* Trace data routing for MIPI P1149.7 */ #define N_TRACEROUTER 24 /* Trace data routing for MIPI P1149.7 */ #define N_NCI 25 /* NFC NCI UART */ +#define N_SPEAKUP 26 /* Speakup communication with synths */ #endif /* _UAPI_LINUX_TTY_H */ -- cgit v1.2.3 From 8a8dabf2dd68caff842d38057097c23bc514ea6e Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Fri, 2 Jun 2017 13:49:30 +0100 Subject: tty: handle the case where we cannot restore a line discipline Historically the N_TTY driver could never fail but this has become broken over time. Rather than trying to rewrite half the ldisc layer to fix the breakage introduce a second level of fallback with an N_NULL ldisc which cannot fail, and thus restore the guarantees required by the ldisc layer. We still try and fail to N_TTY first. It's much more useful to find yourself back in your old ldisc (first attempt) or in N_TTY (second attempt), and while I'm not aware of any code out there that makes those assumptions it's good to drive(r) defensively. Signed-off-by: Alan Cox Reported-by: Dmitry Vyukov Tested-by: Dmitry Vyukov Signed-off-by: Greg Kroah-Hartman --- drivers/tty/Makefile | 3 +- drivers/tty/n_null.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++++ drivers/tty/tty_ldisc.c | 44 +++++++++++++++++--------- include/uapi/linux/tty.h | 1 + 4 files changed, 113 insertions(+), 15 deletions(-) create mode 100644 drivers/tty/n_null.c (limited to 'include/uapi/linux') diff --git a/drivers/tty/Makefile b/drivers/tty/Makefile index f02becdb3e33..8689279afdf1 100644 --- a/drivers/tty/Makefile +++ b/drivers/tty/Makefile @@ -1,6 +1,7 @@ obj-$(CONFIG_TTY) += tty_io.o n_tty.o tty_ioctl.o tty_ldisc.o \ tty_buffer.o tty_port.o tty_mutex.o \ - tty_ldsem.o tty_baudrate.o tty_jobctrl.o + tty_ldsem.o tty_baudrate.o tty_jobctrl.o \ + n_null.o obj-$(CONFIG_LEGACY_PTYS) += pty.o obj-$(CONFIG_UNIX98_PTYS) += pty.o obj-$(CONFIG_AUDIT) += tty_audit.o diff --git a/drivers/tty/n_null.c b/drivers/tty/n_null.c new file mode 100644 index 000000000000..d63261c36e42 --- /dev/null +++ b/drivers/tty/n_null.c @@ -0,0 +1,80 @@ +#include +#include +#include +#include + +/* + * n_null.c - Null line discipline used in the failure path + * + * Copyright (C) Intel 2017 + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + */ + +static int n_null_open(struct tty_struct *tty) +{ + return 0; +} + +static void n_null_close(struct tty_struct *tty) +{ +} + +static ssize_t n_null_read(struct tty_struct *tty, struct file *file, + unsigned char __user * buf, size_t nr) +{ + return -EOPNOTSUPP; +} + +static ssize_t n_null_write(struct tty_struct *tty, struct file *file, + const unsigned char *buf, size_t nr) +{ + return -EOPNOTSUPP; +} + +static void n_null_receivebuf(struct tty_struct *tty, + const unsigned char *cp, char *fp, + int cnt) +{ +} + +static struct tty_ldisc_ops null_ldisc = { + .owner = THIS_MODULE, + .magic = TTY_LDISC_MAGIC, + .name = "n_null", + .open = n_null_open, + .close = n_null_close, + .read = n_null_read, + .write = n_null_write, + .receive_buf = n_null_receivebuf +}; + +static int __init n_null_init(void) +{ + BUG_ON(tty_register_ldisc(N_NULL, &null_ldisc)); + return 0; +} + +static void __exit n_null_exit(void) +{ + tty_unregister_ldisc(N_NULL); +} + +module_init(n_null_init); +module_exit(n_null_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Alan Cox"); +MODULE_ALIAS_LDISC(N_NULL); +MODULE_DESCRIPTION("Null ldisc driver"); diff --git a/drivers/tty/tty_ldisc.c b/drivers/tty/tty_ldisc.c index e4603b09863a..4a04567d9aef 100644 --- a/drivers/tty/tty_ldisc.c +++ b/drivers/tty/tty_ldisc.c @@ -491,6 +491,29 @@ static void tty_ldisc_close(struct tty_struct *tty, struct tty_ldisc *ld) tty_ldisc_debug(tty, "%p: closed\n", ld); } +/** + * tty_ldisc_failto - helper for ldisc failback + * @tty: tty to open the ldisc on + * @ld: ldisc we are trying to fail back to + * + * Helper to try and recover a tty when switching back to the old + * ldisc fails and we need something attached. 
+ */ + +static int tty_ldisc_failto(struct tty_struct *tty, int ld) +{ + struct tty_ldisc *disc = tty_ldisc_get(tty, ld); + int r; + + if (IS_ERR(disc)) + return PTR_ERR(disc); + tty->ldisc = disc; + tty_set_termios_ldisc(tty, ld); + if ((r = tty_ldisc_open(tty, disc)) < 0) + tty_ldisc_put(disc); + return r; +} + /** * tty_ldisc_restore - helper for tty ldisc change * @tty: tty to recover @@ -502,9 +525,6 @@ static void tty_ldisc_close(struct tty_struct *tty, struct tty_ldisc *ld) static void tty_ldisc_restore(struct tty_struct *tty, struct tty_ldisc *old) { - struct tty_ldisc *new_ldisc; - int r; - /* There is an outstanding reference here so this is safe */ old = tty_ldisc_get(tty, old->ops->num); WARN_ON(IS_ERR(old)); @@ -512,17 +532,13 @@ static void tty_ldisc_restore(struct tty_struct *tty, struct tty_ldisc *old) tty_set_termios_ldisc(tty, old->ops->num); if (tty_ldisc_open(tty, old) < 0) { tty_ldisc_put(old); - /* This driver is always present */ - new_ldisc = tty_ldisc_get(tty, N_TTY); - if (IS_ERR(new_ldisc)) - panic("n_tty: get"); - tty->ldisc = new_ldisc; - tty_set_termios_ldisc(tty, N_TTY); - r = tty_ldisc_open(tty, new_ldisc); - if (r < 0) - panic("Couldn't open N_TTY ldisc for " - "%s --- error %d.", - tty_name(tty), r); + /* The traditional behaviour is to fall back to N_TTY, we + want to avoid falling back to N_NULL unless we have no + choice to avoid the risk of breaking anything */ + if (tty_ldisc_failto(tty, N_TTY) < 0 && + tty_ldisc_failto(tty, N_NULL) < 0) + panic("Couldn't open N_NULL ldisc for %s.", + tty_name(tty)); } } diff --git a/include/uapi/linux/tty.h b/include/uapi/linux/tty.h index e7855dffd592..cf1455396df0 100644 --- a/include/uapi/linux/tty.h +++ b/include/uapi/linux/tty.h @@ -36,5 +36,6 @@ #define N_TRACEROUTER 24 /* Trace data routing for MIPI P1149.7 */ #define N_NCI 25 /* NFC NCI UART */ #define N_SPEAKUP 26 /* Speakup communication with synths */ +#define N_NULL 27 /* Null ldisc used for error handling */ #endif /* _UAPI_LINUX_TTY_H */ -- cgit v1.2.3 From 4d80cc0aaaab9efac14c9d3d702b69961800de20 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Thu, 1 Jun 2017 21:37:38 +0300 Subject: net/sched: cls_flower: add support for matching on ip tos and ttl Benefit from the support of ip header fields dissection and allow users to set rules matching on ipv4 tos and ttl or ipv6 traffic-class and hoplimit. Signed-off-by: Or Gerlitz Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- include/uapi/linux/pkt_cls.h | 5 +++++ net/sched/cls_flower.c | 39 +++++++++++++++++++++++++++++++++++++-- 2 files changed, 42 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index c6e8cf5e9c40..edf43ddf47b0 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -454,6 +454,11 @@ enum { TCA_FLOWER_KEY_TCP_FLAGS, /* be16 */ TCA_FLOWER_KEY_TCP_FLAGS_MASK, /* be16 */ + TCA_FLOWER_KEY_IP_TOS, /* u8 */ + TCA_FLOWER_KEY_IP_TOS_MASK, /* u8 */ + TCA_FLOWER_KEY_IP_TTL, /* u8 */ + TCA_FLOWER_KEY_IP_TTL_MASK, /* u8 */ + __TCA_FLOWER_MAX, }; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index fb74a47830f4..33feaee197cf 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -50,6 +50,7 @@ struct fl_flow_key { struct flow_dissector_key_ports enc_tp; struct flow_dissector_key_mpls mpls; struct flow_dissector_key_tcp tcp; + struct flow_dissector_key_ip ip; } __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. 
*/ struct fl_flow_mask_range { @@ -427,6 +428,10 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { [TCA_FLOWER_KEY_MPLS_LABEL] = { .type = NLA_U32 }, [TCA_FLOWER_KEY_TCP_FLAGS] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_TCP_FLAGS_MASK] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_IP_TOS] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_IP_TOS_MASK] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_IP_TTL] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_IP_TTL_MASK] = { .type = NLA_U8 }, }; static void fl_set_key_val(struct nlattr **tb, @@ -528,6 +533,19 @@ static int fl_set_key_flags(struct nlattr **tb, return 0; } +static void fl_set_key_ip(struct nlattr **tb, + struct flow_dissector_key_ip *key, + struct flow_dissector_key_ip *mask) +{ + fl_set_key_val(tb, &key->tos, TCA_FLOWER_KEY_IP_TOS, + &mask->tos, TCA_FLOWER_KEY_IP_TOS_MASK, + sizeof(key->tos)); + + fl_set_key_val(tb, &key->ttl, TCA_FLOWER_KEY_IP_TTL, + &mask->ttl, TCA_FLOWER_KEY_IP_TTL_MASK, + sizeof(key->ttl)); +} + static int fl_set_key(struct net *net, struct nlattr **tb, struct fl_flow_key *key, struct fl_flow_key *mask) { @@ -570,6 +588,7 @@ static int fl_set_key(struct net *net, struct nlattr **tb, fl_set_key_val(tb, &key->basic.ip_proto, TCA_FLOWER_KEY_IP_PROTO, &mask->basic.ip_proto, TCA_FLOWER_UNSPEC, sizeof(key->basic.ip_proto)); + fl_set_key_ip(tb, &key->ip, &mask->ip); } if (tb[TCA_FLOWER_KEY_IPV4_SRC] || tb[TCA_FLOWER_KEY_IPV4_DST]) { @@ -772,6 +791,8 @@ static void fl_init_dissector(struct cls_fl_head *head, FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6); FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, FLOW_DISSECTOR_KEY_PORTS, tp); + FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FLOW_DISSECTOR_KEY_IP, ip); FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, FLOW_DISSECTOR_KEY_TCP, tcp); FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, @@ -1082,6 +1103,19 @@ static int fl_dump_key_mpls(struct sk_buff *skb, return 0; } +static int fl_dump_key_ip(struct sk_buff *skb, + struct flow_dissector_key_ip *key, + struct flow_dissector_key_ip *mask) +{ + if (fl_dump_key_val(skb, &key->tos, TCA_FLOWER_KEY_IP_TOS, &mask->tos, + TCA_FLOWER_KEY_IP_TOS_MASK, sizeof(key->tos)) || + fl_dump_key_val(skb, &key->ttl, TCA_FLOWER_KEY_IP_TTL, &mask->ttl, + TCA_FLOWER_KEY_IP_TTL_MASK, sizeof(key->ttl))) + return -1; + + return 0; +} + static int fl_dump_key_vlan(struct sk_buff *skb, struct flow_dissector_key_vlan *vlan_key, struct flow_dissector_key_vlan *vlan_mask) @@ -1195,9 +1229,10 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, if ((key->basic.n_proto == htons(ETH_P_IP) || key->basic.n_proto == htons(ETH_P_IPV6)) && - fl_dump_key_val(skb, &key->basic.ip_proto, TCA_FLOWER_KEY_IP_PROTO, + (fl_dump_key_val(skb, &key->basic.ip_proto, TCA_FLOWER_KEY_IP_PROTO, &mask->basic.ip_proto, TCA_FLOWER_UNSPEC, - sizeof(key->basic.ip_proto))) + sizeof(key->basic.ip_proto)) || + fl_dump_key_ip(skb, &key->ip, &mask->ip))) goto nla_put_failure; if (key->control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS && -- cgit v1.2.3 From b7d3ed5be9bd7e0689eee0f0f36702937cd8f7c8 Mon Sep 17 00:00:00 2001 From: Teng Qin Date: Fri, 2 Jun 2017 21:03:54 -0700 Subject: bpf: update perf event helper functions documentation This commit updates documentation of the bpf_perf_event_output and bpf_perf_event_read helpers to match their implementation. Signed-off-by: Teng Qin Signed-off-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- include/uapi/linux/bpf.h | 11 +++++++---- tools/include/uapi/linux/bpf.h | 11 +++++++---- 2 files changed, 14 insertions(+), 8 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 94dfa9def355..e78aece03628 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -313,8 +313,11 @@ union bpf_attr { * @flags: room for future extensions * Return: 0 on success or negative error * - * u64 bpf_perf_event_read(&map, index) - * Return: Number events read or error code + * u64 bpf_perf_event_read(map, flags) + * read perf event counter value + * @map: pointer to perf_event_array map + * @flags: index of event in the map or bitmask flags + * Return: value of perf event counter read or error code * * int bpf_redirect(ifindex, flags) * redirect to another netdev @@ -328,11 +331,11 @@ union bpf_attr { * @skb: pointer to skb * Return: realm if != 0 * - * int bpf_perf_event_output(ctx, map, index, data, size) + * int bpf_perf_event_output(ctx, map, flags, data, size) * output perf raw sample * @ctx: struct pt_regs* * @map: pointer to perf_event_array map - * @index: index of event in the map + * @flags: index of event in the map or bitmask flags * @data: data on stack to be output as raw data * @size: size of data * Return: 0 on success or negative error diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 94dfa9def355..e78aece03628 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -313,8 +313,11 @@ union bpf_attr { * @flags: room for future extensions * Return: 0 on success or negative error * - * u64 bpf_perf_event_read(&map, index) - * Return: Number events read or error code + * u64 bpf_perf_event_read(map, flags) + * read perf event counter value + * @map: pointer to perf_event_array map + * @flags: index of event in the map or bitmask flags + * Return: value of perf event counter read or error code * * int bpf_redirect(ifindex, flags) * redirect to another netdev @@ -328,11 +331,11 @@ union bpf_attr { * @skb: pointer to skb * Return: realm if != 0 * - * int bpf_perf_event_output(ctx, map, index, data, size) + * int bpf_perf_event_output(ctx, map, flags, data, size) * output perf raw sample * @ctx: struct pt_regs* * @map: pointer to perf_event_array map - * @index: index of event in the map + * @flags: index of event in the map or bitmask flags * @data: data on stack to be output as raw data * @size: size of data * Return: 0 on success or negative error -- cgit v1.2.3 From 60927bc314363f91616c1f4577541c2a2e27aba3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 17 May 2017 09:56:45 +0200 Subject: uuid: remove uuid_be defintions from the uapi header We don't use uuid_be and the UUID_BE constants in any uapi headers, so make them private to the kernel. 
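
For illustration only (not part of the patch): in-kernel users are unaffected by the move as long as they include <linux/uuid.h> rather than the UAPI header. A hedged kernel-side sketch, using a UUID value borrowed from lib/test_uuid.c, might look like this:

/*
 * Hedged sketch: uuid_be, UUID_BE, NULL_UUID_BE and uuid_be_cmp() are
 * now provided by <linux/uuid.h> only, not by <uapi/linux/uuid.h>.
 */
#include <linux/types.h>
#include <linux/uuid.h>

static const uuid_be example_be =
	UUID_BE(0xc33f4995, 0x3701, 0x450e,
		0x9f, 0xbf, 0x20, 0x6a, 0x2e, 0x98, 0xe5, 0x76);

static bool is_example(const uuid_be *u)
{
	return uuid_be_cmp(*u, example_be) == 0;
}

static bool is_null_uuid(const uuid_be *u)
{
	return uuid_be_cmp(*u, NULL_UUID_BE) == 0;
}
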
Signed-off-by: Christoph Hellwig Reviewed-by: Amir Goldstein Reviewed-by: Andy Shevchenko --- include/linux/uuid.h | 15 +++++++++++++++ include/uapi/linux/uuid.h | 16 ---------------- 2 files changed, 15 insertions(+), 16 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/uuid.h b/include/linux/uuid.h index 2d095fc60204..30fb13018e29 100644 --- a/include/linux/uuid.h +++ b/include/linux/uuid.h @@ -18,6 +18,21 @@ #include +typedef struct { + __u8 b[16]; +} uuid_be; + +#define UUID_BE(a, b, c, d0, d1, d2, d3, d4, d5, d6, d7) \ +((uuid_be) \ +{{ ((a) >> 24) & 0xff, ((a) >> 16) & 0xff, ((a) >> 8) & 0xff, (a) & 0xff, \ + ((b) >> 8) & 0xff, (b) & 0xff, \ + ((c) >> 8) & 0xff, (c) & 0xff, \ + (d0), (d1), (d2), (d3), (d4), (d5), (d6), (d7) }}) + +#define NULL_UUID_BE \ + UUID_BE(0x00000000, 0x0000, 0x0000, 0x00, 0x00, 0x00, 0x00, \ + 0x00, 0x00, 0x00, 0x00) + /* * The length of a UUID string ("aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee") * not including trailing NUL. diff --git a/include/uapi/linux/uuid.h b/include/uapi/linux/uuid.h index 3738e5fb6a4d..0099756c4bac 100644 --- a/include/uapi/linux/uuid.h +++ b/include/uapi/linux/uuid.h @@ -24,10 +24,6 @@ typedef struct { __u8 b[16]; } uuid_le; -typedef struct { - __u8 b[16]; -} uuid_be; - #define UUID_LE(a, b, c, d0, d1, d2, d3, d4, d5, d6, d7) \ ((uuid_le) \ {{ (a) & 0xff, ((a) >> 8) & 0xff, ((a) >> 16) & 0xff, ((a) >> 24) & 0xff, \ @@ -35,20 +31,8 @@ typedef struct { (c) & 0xff, ((c) >> 8) & 0xff, \ (d0), (d1), (d2), (d3), (d4), (d5), (d6), (d7) }}) -#define UUID_BE(a, b, c, d0, d1, d2, d3, d4, d5, d6, d7) \ -((uuid_be) \ -{{ ((a) >> 24) & 0xff, ((a) >> 16) & 0xff, ((a) >> 8) & 0xff, (a) & 0xff, \ - ((b) >> 8) & 0xff, (b) & 0xff, \ - ((c) >> 8) & 0xff, (c) & 0xff, \ - (d0), (d1), (d2), (d3), (d4), (d5), (d6), (d7) }}) - #define NULL_UUID_LE \ UUID_LE(0x00000000, 0x0000, 0x0000, 0x00, 0x00, 0x00, 0x00, \ 0x00, 0x00, 0x00, 0x00) -#define NULL_UUID_BE \ - UUID_BE(0x00000000, 0x0000, 0x0000, 0x00, 0x00, 0x00, 0x00, \ - 0x00, 0x00, 0x00, 0x00) - - #endif /* _UAPI_LINUX_UUID_H_ */ -- cgit v1.2.3 From f9727a17db9bab71ddae91f74f11a8a2f9a0ece6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 17 May 2017 10:02:48 +0200 Subject: uuid: rename uuid types Our "little endian" UUID really is a Wintel GUID, so rename it and its helpers such (guid_t). The big endian UUID is the only true one, so give it the name uuid_t. The uuid_le and uuid_be names are retained for now, but will hopefully go away soon. The exception to that are the _cmp helpers that will be replaced by better primitives ASAP and thus don't get the new names. Also the _to_bin helpers are named to match the better named uuid_parse routine in userspace. Also remove the existing typedef in XFS that's now been superceeded by the generic type name. Signed-off-by: Christoph Hellwig [andy: also update the UUID_LE/UUID_BE macros including fallout] Signed-off-by: Andy Shevchenko Reviewed-by: Amir Goldstein Reviewed-by: Darrick J. 
Wong Reviewed-by: Andy Shevchenko Signed-off-by: Christoph Hellwig --- fs/xfs/xfs_linux.h | 2 -- include/linux/uuid.h | 55 +++++++++++++++++++++++++++-------------------- include/uapi/linux/uuid.h | 12 +++++++---- lib/test_uuid.c | 32 +++++++++++++-------------- lib/uuid.c | 30 +++++++++++++------------- lib/vsprintf.c | 4 ++-- 6 files changed, 73 insertions(+), 62 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 89ee5ec66837..2c33d915e550 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -39,8 +39,6 @@ typedef __s64 xfs_daddr_t; /* type */ typedef __u32 xfs_dev_t; typedef __u32 xfs_nlink_t; -typedef uuid_be uuid_t; - #include "xfs_types.h" #include "kmem.h" diff --git a/include/linux/uuid.h b/include/linux/uuid.h index 30fb13018e29..c2adb8046095 100644 --- a/include/linux/uuid.h +++ b/include/linux/uuid.h @@ -20,46 +20,55 @@ typedef struct { __u8 b[16]; -} uuid_be; +} uuid_t; -#define UUID_BE(a, b, c, d0, d1, d2, d3, d4, d5, d6, d7) \ -((uuid_be) \ +#define UUID_INIT(a, b, c, d0, d1, d2, d3, d4, d5, d6, d7) \ +((uuid_t) \ {{ ((a) >> 24) & 0xff, ((a) >> 16) & 0xff, ((a) >> 8) & 0xff, (a) & 0xff, \ ((b) >> 8) & 0xff, (b) & 0xff, \ ((c) >> 8) & 0xff, (c) & 0xff, \ (d0), (d1), (d2), (d3), (d4), (d5), (d6), (d7) }}) -#define NULL_UUID_BE \ - UUID_BE(0x00000000, 0x0000, 0x0000, 0x00, 0x00, 0x00, 0x00, \ - 0x00, 0x00, 0x00, 0x00) - /* * The length of a UUID string ("aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee") * not including trailing NUL. */ #define UUID_STRING_LEN 36 -static inline int uuid_le_cmp(const uuid_le u1, const uuid_le u2) -{ - return memcmp(&u1, &u2, sizeof(uuid_le)); -} - -static inline int uuid_be_cmp(const uuid_be u1, const uuid_be u2) -{ - return memcmp(&u1, &u2, sizeof(uuid_be)); -} - void generate_random_uuid(unsigned char uuid[16]); -extern void uuid_le_gen(uuid_le *u); -extern void uuid_be_gen(uuid_be *u); +extern void guid_gen(guid_t *u); +extern void uuid_gen(uuid_t *u); bool __must_check uuid_is_valid(const char *uuid); -extern const u8 uuid_le_index[16]; -extern const u8 uuid_be_index[16]; +extern const u8 guid_index[16]; +extern const u8 uuid_index[16]; + +int guid_parse(const char *uuid, guid_t *u); +int uuid_parse(const char *uuid, uuid_t *u); -int uuid_le_to_bin(const char *uuid, uuid_le *u); -int uuid_be_to_bin(const char *uuid, uuid_be *u); +/* backwards compatibility, don't use in new code */ +typedef uuid_t uuid_be; +#define UUID_BE(a, _b, c, d0, d1, d2, d3, d4, d5, d6, d7) \ + UUID_INIT(a, _b, c, d0, d1, d2, d3, d4, d5, d6, d7) +#define NULL_UUID_BE \ + UUID_BE(0x00000000, 0x0000, 0x0000, 0x00, 0x00, 0x00, 0x00, \ + 0x00, 0x00, 0x00, 0x00) + +#define uuid_le_gen(u) guid_gen(u) +#define uuid_be_gen(u) uuid_gen(u) +#define uuid_le_to_bin(guid, u) guid_parse(guid, u) +#define uuid_be_to_bin(uuid, u) uuid_parse(uuid, u) + +static inline int uuid_le_cmp(const guid_t u1, const guid_t u2) +{ + return memcmp(&u1, &u2, sizeof(guid_t)); +} + +static inline int uuid_be_cmp(const uuid_t u1, const uuid_t u2) +{ + return memcmp(&u1, &u2, sizeof(uuid_t)); +} #endif diff --git a/include/uapi/linux/uuid.h b/include/uapi/linux/uuid.h index 0099756c4bac..8ef82f433877 100644 --- a/include/uapi/linux/uuid.h +++ b/include/uapi/linux/uuid.h @@ -22,17 +22,21 @@ typedef struct { __u8 b[16]; -} uuid_le; +} guid_t; -#define UUID_LE(a, b, c, d0, d1, d2, d3, d4, d5, d6, d7) \ -((uuid_le) \ +#define GUID_INIT(a, b, c, d0, d1, d2, d3, d4, d5, d6, d7) \ +((guid_t) \ {{ (a) & 0xff, ((a) >> 8) & 0xff, ((a) >> 16) & 0xff, ((a) >> 24) & 
0xff, \ (b) & 0xff, ((b) >> 8) & 0xff, \ (c) & 0xff, ((c) >> 8) & 0xff, \ (d0), (d1), (d2), (d3), (d4), (d5), (d6), (d7) }}) +/* backwards compatibility, don't use in new code */ +typedef guid_t uuid_le; +#define UUID_LE(a, b, c, d0, d1, d2, d3, d4, d5, d6, d7) \ + GUID_INIT(a, b, c, d0, d1, d2, d3, d4, d5, d6, d7) #define NULL_UUID_LE \ UUID_LE(0x00000000, 0x0000, 0x0000, 0x00, 0x00, 0x00, 0x00, \ - 0x00, 0x00, 0x00, 0x00) + 0x00, 0x00, 0x00, 0x00) #endif /* _UAPI_LINUX_UUID_H_ */ diff --git a/lib/test_uuid.c b/lib/test_uuid.c index 547d3127a3cf..ff36f3240e90 100644 --- a/lib/test_uuid.c +++ b/lib/test_uuid.c @@ -11,25 +11,25 @@ struct test_uuid_data { const char *uuid; - uuid_le le; - uuid_be be; + guid_t le; + uuid_t be; }; static const struct test_uuid_data test_uuid_test_data[] = { { .uuid = "c33f4995-3701-450e-9fbf-206a2e98e576", - .le = UUID_LE(0xc33f4995, 0x3701, 0x450e, 0x9f, 0xbf, 0x20, 0x6a, 0x2e, 0x98, 0xe5, 0x76), - .be = UUID_BE(0xc33f4995, 0x3701, 0x450e, 0x9f, 0xbf, 0x20, 0x6a, 0x2e, 0x98, 0xe5, 0x76), + .le = GUID_INIT(0xc33f4995, 0x3701, 0x450e, 0x9f, 0xbf, 0x20, 0x6a, 0x2e, 0x98, 0xe5, 0x76), + .be = UUID_INIT(0xc33f4995, 0x3701, 0x450e, 0x9f, 0xbf, 0x20, 0x6a, 0x2e, 0x98, 0xe5, 0x76), }, { .uuid = "64b4371c-77c1-48f9-8221-29f054fc023b", - .le = UUID_LE(0x64b4371c, 0x77c1, 0x48f9, 0x82, 0x21, 0x29, 0xf0, 0x54, 0xfc, 0x02, 0x3b), - .be = UUID_BE(0x64b4371c, 0x77c1, 0x48f9, 0x82, 0x21, 0x29, 0xf0, 0x54, 0xfc, 0x02, 0x3b), + .le = GUID_INIT(0x64b4371c, 0x77c1, 0x48f9, 0x82, 0x21, 0x29, 0xf0, 0x54, 0xfc, 0x02, 0x3b), + .be = UUID_INIT(0x64b4371c, 0x77c1, 0x48f9, 0x82, 0x21, 0x29, 0xf0, 0x54, 0xfc, 0x02, 0x3b), }, { .uuid = "0cb4ddff-a545-4401-9d06-688af53e7f84", - .le = UUID_LE(0x0cb4ddff, 0xa545, 0x4401, 0x9d, 0x06, 0x68, 0x8a, 0xf5, 0x3e, 0x7f, 0x84), - .be = UUID_BE(0x0cb4ddff, 0xa545, 0x4401, 0x9d, 0x06, 0x68, 0x8a, 0xf5, 0x3e, 0x7f, 0x84), + .le = GUID_INIT(0x0cb4ddff, 0xa545, 0x4401, 0x9d, 0x06, 0x68, 0x8a, 0xf5, 0x3e, 0x7f, 0x84), + .be = UUID_INIT(0x0cb4ddff, 0xa545, 0x4401, 0x9d, 0x06, 0x68, 0x8a, 0xf5, 0x3e, 0x7f, 0x84), }, }; @@ -61,13 +61,13 @@ static void __init test_uuid_failed(const char *prefix, bool wrong, bool be, static void __init test_uuid_test(const struct test_uuid_data *data) { - uuid_le le; - uuid_be be; + guid_t le; + uuid_t be; char buf[48]; /* LE */ total_tests++; - if (uuid_le_to_bin(data->uuid, &le)) + if (guid_parse(data->uuid, &le)) test_uuid_failed("conversion", false, false, data->uuid, NULL); total_tests++; @@ -78,7 +78,7 @@ static void __init test_uuid_test(const struct test_uuid_data *data) /* BE */ total_tests++; - if (uuid_be_to_bin(data->uuid, &be)) + if (uuid_parse(data->uuid, &be)) test_uuid_failed("conversion", false, true, data->uuid, NULL); total_tests++; @@ -90,17 +90,17 @@ static void __init test_uuid_test(const struct test_uuid_data *data) static void __init test_uuid_wrong(const char *data) { - uuid_le le; - uuid_be be; + guid_t le; + uuid_t be; /* LE */ total_tests++; - if (!uuid_le_to_bin(data, &le)) + if (!guid_parse(data, &le)) test_uuid_failed("negative", true, false, data, NULL); /* BE */ total_tests++; - if (!uuid_be_to_bin(data, &be)) + if (!uuid_parse(data, &be)) test_uuid_failed("negative", true, true, data, NULL); } diff --git a/lib/uuid.c b/lib/uuid.c index 37687af77ff8..90bee73f7bd7 100644 --- a/lib/uuid.c +++ b/lib/uuid.c @@ -21,10 +21,10 @@ #include #include -const u8 uuid_le_index[16] = {3,2,1,0,5,4,7,6,8,9,10,11,12,13,14,15}; -EXPORT_SYMBOL(uuid_le_index); -const u8 uuid_be_index[16] = 
{0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}; -EXPORT_SYMBOL(uuid_be_index); +const u8 guid_index[16] = {3,2,1,0,5,4,7,6,8,9,10,11,12,13,14,15}; +EXPORT_SYMBOL(guid_index); +const u8 uuid_index[16] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}; +EXPORT_SYMBOL(uuid_index); /*************************************************************** * Random UUID interface @@ -53,21 +53,21 @@ static void __uuid_gen_common(__u8 b[16]) b[8] = (b[8] & 0x3F) | 0x80; } -void uuid_le_gen(uuid_le *lu) +void guid_gen(guid_t *lu) { __uuid_gen_common(lu->b); /* version 4 : random generation */ lu->b[7] = (lu->b[7] & 0x0F) | 0x40; } -EXPORT_SYMBOL_GPL(uuid_le_gen); +EXPORT_SYMBOL_GPL(guid_gen); -void uuid_be_gen(uuid_be *bu) +void uuid_gen(uuid_t *bu) { __uuid_gen_common(bu->b); /* version 4 : random generation */ bu->b[6] = (bu->b[6] & 0x0F) | 0x40; } -EXPORT_SYMBOL_GPL(uuid_be_gen); +EXPORT_SYMBOL_GPL(uuid_gen); /** * uuid_is_valid - checks if UUID string valid @@ -97,7 +97,7 @@ bool uuid_is_valid(const char *uuid) } EXPORT_SYMBOL(uuid_is_valid); -static int __uuid_to_bin(const char *uuid, __u8 b[16], const u8 ei[16]) +static int __uuid_parse(const char *uuid, __u8 b[16], const u8 ei[16]) { static const u8 si[16] = {0,2,4,6,9,11,14,16,19,21,24,26,28,30,32,34}; unsigned int i; @@ -115,14 +115,14 @@ static int __uuid_to_bin(const char *uuid, __u8 b[16], const u8 ei[16]) return 0; } -int uuid_le_to_bin(const char *uuid, uuid_le *u) +int guid_parse(const char *uuid, guid_t *u) { - return __uuid_to_bin(uuid, u->b, uuid_le_index); + return __uuid_parse(uuid, u->b, guid_index); } -EXPORT_SYMBOL(uuid_le_to_bin); +EXPORT_SYMBOL(guid_parse); -int uuid_be_to_bin(const char *uuid, uuid_be *u) +int uuid_parse(const char *uuid, uuid_t *u) { - return __uuid_to_bin(uuid, u->b, uuid_be_index); + return __uuid_parse(uuid, u->b, uuid_index); } -EXPORT_SYMBOL(uuid_be_to_bin); +EXPORT_SYMBOL(uuid_parse); diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 2d41de3f98a1..9f37d6208e99 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -1308,14 +1308,14 @@ char *uuid_string(char *buf, char *end, const u8 *addr, char uuid[UUID_STRING_LEN + 1]; char *p = uuid; int i; - const u8 *index = uuid_be_index; + const u8 *index = uuid_index; bool uc = false; switch (*(++fmt)) { case 'L': uc = true; /* fall-through */ case 'l': - index = uuid_le_index; + index = guid_index; break; case 'B': uc = true; -- cgit v1.2.3 From e25ea21ffa66a029acfa89d2611c0e7ef23e7d8c Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 6 Jun 2017 14:12:02 +0200 Subject: net: sched: introduce a TRAP control action There is need to instruct the HW offloaded path to push certain matched packets to cpu/kernel for further analysis. So this patch introduces a new TRAP control action to TC. For kernel datapath, this action does not make much sense. So with the same logic as in HW, new TRAP behaves similar to STOLEN. The skb is just dropped in the datapath (and virtually ejected to an upper level, which does not exist in case of kernel). Signed-off-by: Jiri Pirko Reviewed-by: Yotam Gigi Reviewed-by: Andrew Lunn Signed-off-by: David S. 
Miller --- include/uapi/linux/pkt_cls.h | 7 +++++++ net/core/dev.c | 2 ++ net/sched/cls_bpf.c | 1 + net/sched/sch_atm.c | 1 + net/sched/sch_cbq.c | 1 + net/sched/sch_drr.c | 1 + net/sched/sch_dsmark.c | 1 + net/sched/sch_fq_codel.c | 1 + net/sched/sch_hfsc.c | 1 + net/sched/sch_htb.c | 1 + net/sched/sch_multiq.c | 1 + net/sched/sch_prio.c | 1 + net/sched/sch_qfq.c | 1 + net/sched/sch_sfb.c | 1 + net/sched/sch_sfq.c | 1 + 15 files changed, 22 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index edf43ddf47b0..2055783e6ee9 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -37,6 +37,13 @@ enum { #define TC_ACT_QUEUED 5 #define TC_ACT_REPEAT 6 #define TC_ACT_REDIRECT 7 +#define TC_ACT_TRAP 8 /* For hw path, this means "trap to cpu" + * and don't further process the frame + * in hardware. For sw path, this is + * equivalent of TC_ACT_STOLEN - drop + * the skb and act like everything + * is alright. + */ /* There is a special kind of actions called "extended actions", * which need a value parameter. These have a local opcode located in diff --git a/net/core/dev.c b/net/core/dev.c index 06e0a7492df8..8f72f4a9c6ac 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3269,6 +3269,7 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) return NULL; case TC_ACT_STOLEN: case TC_ACT_QUEUED: + case TC_ACT_TRAP: *ret = NET_XMIT_SUCCESS; consume_skb(skb); return NULL; @@ -4038,6 +4039,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, return NULL; case TC_ACT_STOLEN: case TC_ACT_QUEUED: + case TC_ACT_TRAP: consume_skb(skb); return NULL; case TC_ACT_REDIRECT: diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 5ebeae996e63..a9c56ad4533a 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -70,6 +70,7 @@ static int cls_bpf_exec_opcode(int code) case TC_ACT_OK: case TC_ACT_SHOT: case TC_ACT_STOLEN: + case TC_ACT_TRAP: case TC_ACT_REDIRECT: case TC_ACT_UNSPEC: return code; diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index f435546c3864..de162592eee0 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -406,6 +406,7 @@ done: switch (result) { case TC_ACT_QUEUED: case TC_ACT_STOLEN: + case TC_ACT_TRAP: __qdisc_drop(skb, to_free); return NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; case TC_ACT_SHOT: diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index 8dd6d0aca678..481036f6b54e 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -254,6 +254,7 @@ cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) switch (result) { case TC_ACT_QUEUED: case TC_ACT_STOLEN: + case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; case TC_ACT_SHOT: return NULL; diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index 5db2a2843c66..a413dc1c2098 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -339,6 +339,7 @@ static struct drr_class *drr_classify(struct sk_buff *skb, struct Qdisc *sch, switch (result) { case TC_ACT_QUEUED: case TC_ACT_STOLEN: + case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; case TC_ACT_SHOT: return NULL; diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index 7ccdd825d34e..6d94fcc3592a 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -243,6 +243,7 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch, #ifdef CONFIG_NET_CLS_ACT case TC_ACT_QUEUED: case TC_ACT_STOLEN: + case TC_ACT_TRAP: __qdisc_drop(skb, to_free); 
return NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index f201e73947fb..337f2d6d81e4 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -103,6 +103,7 @@ static unsigned int fq_codel_classify(struct sk_buff *skb, struct Qdisc *sch, switch (result) { case TC_ACT_STOLEN: case TC_ACT_QUEUED: + case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; case TC_ACT_SHOT: return 0; diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index a324f84b1ccd..b52f74610dc7 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -1155,6 +1155,7 @@ hfsc_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) switch (result) { case TC_ACT_QUEUED: case TC_ACT_STOLEN: + case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; case TC_ACT_SHOT: return NULL; diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 195bbca9eb0b..203286ab4427 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -238,6 +238,7 @@ static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch, switch (result) { case TC_ACT_QUEUED: case TC_ACT_STOLEN: + case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; case TC_ACT_SHOT: return NULL; diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index 604767482ad0..f143b7bbaa0d 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -52,6 +52,7 @@ multiq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) switch (err) { case TC_ACT_STOLEN: case TC_ACT_QUEUED: + case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; case TC_ACT_SHOT: return NULL; diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index a2404688dd01..e3e364cc9a70 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -48,6 +48,7 @@ prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) switch (err) { case TC_ACT_STOLEN: case TC_ACT_QUEUED: + case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; case TC_ACT_SHOT: return NULL; diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index 076ad032befb..0e16dfda0bd7 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -726,6 +726,7 @@ static struct qfq_class *qfq_classify(struct sk_buff *skb, struct Qdisc *sch, switch (result) { case TC_ACT_QUEUED: case TC_ACT_STOLEN: + case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; case TC_ACT_SHOT: return NULL; diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index 9756b1ccd345..11fb6ec878d6 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -266,6 +266,7 @@ static bool sfb_classify(struct sk_buff *skb, struct tcf_proto *fl, switch (result) { case TC_ACT_STOLEN: case TC_ACT_QUEUED: + case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; case TC_ACT_SHOT: return false; diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 66dfd15b7946..f80ea2cc5f1f 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -187,6 +187,7 @@ static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch, switch (result) { case TC_ACT_STOLEN: case TC_ACT_QUEUED: + case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; case TC_ACT_SHOT: return 0; -- cgit v1.2.3 From 34ad5580f8f9c86cb273ebea25c149613cd1667e Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 5 Jun 2017 12:15:48 -0700 Subject: bpf: Add BPF_(PROG|MAP)_GET_NEXT_ID command This patch adds BPF_PROG_GET_NEXT_ID and BPF_MAP_GET_NEXT_ID to allow userspace to iterate all bpf_prog IDs and bpf_map IDs. 
The API is trying to be consistent with the existing BPF_MAP_GET_NEXT_KEY. It is currently limited to CAP_SYS_ADMIN which we can consider to lift it in followup patches. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 7 +++++++ kernel/bpf/syscall.c | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e78aece03628..629747a3f273 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -82,6 +82,8 @@ enum bpf_cmd { BPF_PROG_ATTACH, BPF_PROG_DETACH, BPF_PROG_TEST_RUN, + BPF_PROG_GET_NEXT_ID, + BPF_MAP_GET_NEXT_ID, }; enum bpf_map_type { @@ -209,6 +211,11 @@ union bpf_attr { __u32 repeat; __u32 duration; } test; + + struct { /* anonymous struct used by BPF_*_GET_NEXT_ID */ + __u32 start_id; + __u32 next_id; + }; } __attribute__((aligned(8))); /* BPF helper function descriptions: diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 4c3075b5d840..2405feedb8c1 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -166,6 +166,7 @@ static void bpf_map_put_uref(struct bpf_map *map) void bpf_map_put(struct bpf_map *map) { if (atomic_dec_and_test(&map->refcnt)) { + /* bpf_map_free_id() must be called first */ bpf_map_free_id(map); INIT_WORK(&map->work, bpf_map_free_deferred); schedule_work(&map->work); @@ -726,6 +727,7 @@ void bpf_prog_put(struct bpf_prog *prog) { if (atomic_dec_and_test(&prog->aux->refcnt)) { trace_bpf_prog_put_rcu(prog); + /* bpf_prog_free_id() must be called first */ bpf_prog_free_id(prog); bpf_prog_kallsyms_del(prog); call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); @@ -1069,6 +1071,34 @@ static int bpf_prog_test_run(const union bpf_attr *attr, return ret; } +#define BPF_OBJ_GET_NEXT_ID_LAST_FIELD next_id + +static int bpf_obj_get_next_id(const union bpf_attr *attr, + union bpf_attr __user *uattr, + struct idr *idr, + spinlock_t *lock) +{ + u32 next_id = attr->start_id; + int err = 0; + + if (CHECK_ATTR(BPF_OBJ_GET_NEXT_ID) || next_id >= INT_MAX) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + next_id++; + spin_lock_bh(lock); + if (!idr_get_next(idr, &next_id)) + err = -ENOENT; + spin_unlock_bh(lock); + + if (!err) + err = put_user(next_id, &uattr->next_id); + + return err; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr = {}; @@ -1146,6 +1176,14 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_PROG_TEST_RUN: err = bpf_prog_test_run(&attr, uattr); break; + case BPF_PROG_GET_NEXT_ID: + err = bpf_obj_get_next_id(&attr, uattr, + &prog_idr, &prog_idr_lock); + break; + case BPF_MAP_GET_NEXT_ID: + err = bpf_obj_get_next_id(&attr, uattr, + &map_idr, &map_idr_lock); + break; default: err = -EINVAL; break; -- cgit v1.2.3 From b16d9aa4c2b90af8d2c3201e245150f8c430c3bc Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 5 Jun 2017 12:15:49 -0700 Subject: bpf: Add BPF_PROG_GET_FD_BY_ID Add BPF_PROG_GET_FD_BY_ID command to allow user to get a fd from a bpf_prog's ID. bpf_prog_inc_not_zero() is added and is called with prog_idr_lock held. __bpf_prog_put() is also added which has the 'bool do_idr_lock' param to decide if the prog_idr_lock should be acquired when freeing the prog->id. 
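
Taken together with the BPF_(PROG|MAP)_GET_NEXT_ID patch above, the intended userspace usage is roughly the sketch below. It is illustrative only: it uses the raw bpf(2) syscall so that no particular wrapper library is assumed, it needs UAPI headers that already contain these new commands, and it requires CAP_SYS_ADMIN as stated in these commit messages.

/*
 * Hedged sketch: walk all loaded bpf_prog IDs with BPF_PROG_GET_NEXT_ID
 * and turn each ID into a file descriptor with BPF_PROG_GET_FD_BY_ID.
 */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

static int sys_bpf(int cmd, union bpf_attr *attr)
{
	return syscall(__NR_bpf, cmd, attr, sizeof(*attr));
}

int main(void)
{
	__u32 id = 0;

	for (;;) {
		union bpf_attr attr;
		int fd;

		memset(&attr, 0, sizeof(attr));
		attr.start_id = id;
		if (sys_bpf(BPF_PROG_GET_NEXT_ID, &attr)) {
			if (errno != ENOENT)	/* ENOENT means no more programs */
				perror("BPF_PROG_GET_NEXT_ID");
			break;
		}
		id = attr.next_id;

		memset(&attr, 0, sizeof(attr));
		attr.prog_id = id;
		fd = sys_bpf(BPF_PROG_GET_FD_BY_ID, &attr);
		if (fd < 0) {
			perror("BPF_PROG_GET_FD_BY_ID");
			continue;	/* the program may already be gone */
		}
		printf("prog id %u -> fd %d\n", id, fd);
		close(fd);
	}
	return 0;
}
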
In the error path of bpf_prog_inc_not_zero(), it may have to call __bpf_prog_put(map, false) which does not need to take the prog_idr_lock when freeing the prog->id. It is currently limited to CAP_SYS_ADMIN which we can consider to lift it in followup patches. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 8 +++-- kernel/bpf/syscall.c | 91 ++++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 87 insertions(+), 12 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 629747a3f273..d70cfed19d5e 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -84,6 +84,7 @@ enum bpf_cmd { BPF_PROG_TEST_RUN, BPF_PROG_GET_NEXT_ID, BPF_MAP_GET_NEXT_ID, + BPF_PROG_GET_FD_BY_ID, }; enum bpf_map_type { @@ -212,8 +213,11 @@ union bpf_attr { __u32 duration; } test; - struct { /* anonymous struct used by BPF_*_GET_NEXT_ID */ - __u32 start_id; + struct { /* anonymous struct used by BPF_*_GET_*_ID */ + union { + __u32 start_id; + __u32 prog_id; + }; __u32 next_id; }; } __attribute__((aligned(8))); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2405feedb8c1..dc6253bb8ebb 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -703,15 +703,23 @@ static int bpf_prog_alloc_id(struct bpf_prog *prog) return id > 0 ? 0 : id; } -static void bpf_prog_free_id(struct bpf_prog *prog) +static void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock) { /* cBPF to eBPF migrations are currently not in the idr store. */ if (!prog->aux->id) return; - spin_lock_bh(&prog_idr_lock); + if (do_idr_lock) + spin_lock_bh(&prog_idr_lock); + else + __acquire(&prog_idr_lock); + idr_remove(&prog_idr, prog->aux->id); - spin_unlock_bh(&prog_idr_lock); + + if (do_idr_lock) + spin_unlock_bh(&prog_idr_lock); + else + __release(&prog_idr_lock); } static void __bpf_prog_put_rcu(struct rcu_head *rcu) @@ -723,16 +731,21 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu) bpf_prog_free(aux->prog); } -void bpf_prog_put(struct bpf_prog *prog) +static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) { if (atomic_dec_and_test(&prog->aux->refcnt)) { trace_bpf_prog_put_rcu(prog); /* bpf_prog_free_id() must be called first */ - bpf_prog_free_id(prog); + bpf_prog_free_id(prog, do_idr_lock); bpf_prog_kallsyms_del(prog); call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); } } + +void bpf_prog_put(struct bpf_prog *prog) +{ + __bpf_prog_put(prog, true); +} EXPORT_SYMBOL_GPL(bpf_prog_put); static int bpf_prog_release(struct inode *inode, struct file *filp) @@ -814,6 +827,24 @@ struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog) } EXPORT_SYMBOL_GPL(bpf_prog_inc); +/* prog_idr_lock should have been held */ +static struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog) +{ + int refold; + + refold = __atomic_add_unless(&prog->aux->refcnt, 1, 0); + + if (refold >= BPF_MAX_REFCNT) { + __bpf_prog_put(prog, false); + return ERR_PTR(-EBUSY); + } + + if (!refold) + return ERR_PTR(-ENOENT); + + return prog; +} + static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type) { struct fd f = fdget(ufd); @@ -928,16 +959,21 @@ static int bpf_prog_load(union bpf_attr *attr) goto free_used_maps; err = bpf_prog_new_fd(prog); - if (err < 0) - /* failed to allocate fd */ - goto free_id; + if (err < 0) { + /* failed to allocate fd. 
+ * bpf_prog_put() is needed because the above + * bpf_prog_alloc_id() has published the prog + * to the userspace and the userspace may + * have refcnt-ed it through BPF_PROG_GET_FD_BY_ID. + */ + bpf_prog_put(prog); + return err; + } bpf_prog_kallsyms_add(prog); trace_bpf_prog_load(prog, err); return err; -free_id: - bpf_prog_free_id(prog); free_used_maps: free_used_maps(prog->aux); free_prog: @@ -1099,6 +1135,38 @@ static int bpf_obj_get_next_id(const union bpf_attr *attr, return err; } +#define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id + +static int bpf_prog_get_fd_by_id(const union bpf_attr *attr) +{ + struct bpf_prog *prog; + u32 id = attr->prog_id; + int fd; + + if (CHECK_ATTR(BPF_PROG_GET_FD_BY_ID)) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + spin_lock_bh(&prog_idr_lock); + prog = idr_find(&prog_idr, id); + if (prog) + prog = bpf_prog_inc_not_zero(prog); + else + prog = ERR_PTR(-ENOENT); + spin_unlock_bh(&prog_idr_lock); + + if (IS_ERR(prog)) + return PTR_ERR(prog); + + fd = bpf_prog_new_fd(prog); + if (fd < 0) + bpf_prog_put(prog); + + return fd; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr = {}; @@ -1184,6 +1252,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz err = bpf_obj_get_next_id(&attr, uattr, &map_idr, &map_idr_lock); break; + case BPF_PROG_GET_FD_BY_ID: + err = bpf_prog_get_fd_by_id(&attr); + break; default: err = -EINVAL; break; -- cgit v1.2.3 From bd5f5f4ecb78e2698dad655645b6d6a2f7012a8c Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 5 Jun 2017 12:15:50 -0700 Subject: bpf: Add BPF_MAP_GET_FD_BY_ID Add BPF_MAP_GET_FD_BY_ID command to allow user to get a fd from a bpf_map's ID. bpf_map_inc_not_zero() is added and is called with map_idr_lock held. __bpf_map_put() is also added which has the 'bool do_idr_lock' param to decide if the map_idr_lock should be acquired when freeing the map->id. In the error path of bpf_map_inc_not_zero(), it may have to call __bpf_map_put(map, false) which does not need to take the map_idr_lock when freeing the map->id. It is currently limited to CAP_SYS_ADMIN which we can consider to lift it in followup patches. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 2 + kernel/bpf/syscall.c | 95 +++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 87 insertions(+), 10 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d70cfed19d5e..dd23f47ff00c 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -85,6 +85,7 @@ enum bpf_cmd { BPF_PROG_GET_NEXT_ID, BPF_MAP_GET_NEXT_ID, BPF_PROG_GET_FD_BY_ID, + BPF_MAP_GET_FD_BY_ID, }; enum bpf_map_type { @@ -217,6 +218,7 @@ union bpf_attr { union { __u32 start_id; __u32 prog_id; + __u32 map_id; }; __u32 next_id; }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index dc6253bb8ebb..1802bb9c47d9 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -135,11 +135,19 @@ static int bpf_map_alloc_id(struct bpf_map *map) return id > 0 ? 
0 : id; } -static void bpf_map_free_id(struct bpf_map *map) +static void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock) { - spin_lock_bh(&map_idr_lock); + if (do_idr_lock) + spin_lock_bh(&map_idr_lock); + else + __acquire(&map_idr_lock); + idr_remove(&map_idr, map->id); - spin_unlock_bh(&map_idr_lock); + + if (do_idr_lock) + spin_unlock_bh(&map_idr_lock); + else + __release(&map_idr_lock); } /* called from workqueue */ @@ -163,16 +171,21 @@ static void bpf_map_put_uref(struct bpf_map *map) /* decrement map refcnt and schedule it for freeing via workqueue * (unrelying map implementation ops->map_free() might sleep) */ -void bpf_map_put(struct bpf_map *map) +static void __bpf_map_put(struct bpf_map *map, bool do_idr_lock) { if (atomic_dec_and_test(&map->refcnt)) { /* bpf_map_free_id() must be called first */ - bpf_map_free_id(map); + bpf_map_free_id(map, do_idr_lock); INIT_WORK(&map->work, bpf_map_free_deferred); schedule_work(&map->work); } } +void bpf_map_put(struct bpf_map *map) +{ + __bpf_map_put(map, true); +} + void bpf_map_put_with_uref(struct bpf_map *map) { bpf_map_put_uref(map); @@ -271,15 +284,20 @@ static int map_create(union bpf_attr *attr) goto free_map; err = bpf_map_new_fd(map); - if (err < 0) - /* failed to allocate fd */ - goto free_id; + if (err < 0) { + /* failed to allocate fd. + * bpf_map_put() is needed because the above + * bpf_map_alloc_id() has published the map + * to the userspace and the userspace may + * have refcnt-ed it through BPF_MAP_GET_FD_BY_ID. + */ + bpf_map_put(map); + return err; + } trace_bpf_map_create(map, err); return err; -free_id: - bpf_map_free_id(map); free_map: bpf_map_uncharge_memlock(map); free_map_nouncharge: @@ -331,6 +349,28 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd) return map; } +/* map_idr_lock should have been held */ +static struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map, + bool uref) +{ + int refold; + + refold = __atomic_add_unless(&map->refcnt, 1, 0); + + if (refold >= BPF_MAX_REFCNT) { + __bpf_map_put(map, false); + return ERR_PTR(-EBUSY); + } + + if (!refold) + return ERR_PTR(-ENOENT); + + if (uref) + atomic_inc(&map->usercnt); + + return map; +} + int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) { return -ENOTSUPP; @@ -1167,6 +1207,38 @@ static int bpf_prog_get_fd_by_id(const union bpf_attr *attr) return fd; } +#define BPF_MAP_GET_FD_BY_ID_LAST_FIELD map_id + +static int bpf_map_get_fd_by_id(const union bpf_attr *attr) +{ + struct bpf_map *map; + u32 id = attr->map_id; + int fd; + + if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID)) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + spin_lock_bh(&map_idr_lock); + map = idr_find(&map_idr, id); + if (map) + map = bpf_map_inc_not_zero(map, true); + else + map = ERR_PTR(-ENOENT); + spin_unlock_bh(&map_idr_lock); + + if (IS_ERR(map)) + return PTR_ERR(map); + + fd = bpf_map_new_fd(map); + if (fd < 0) + bpf_map_put(map); + + return fd; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr = {}; @@ -1255,6 +1327,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_PROG_GET_FD_BY_ID: err = bpf_prog_get_fd_by_id(&attr); break; + case BPF_MAP_GET_FD_BY_ID: + err = bpf_map_get_fd_by_id(&attr); + break; default: err = -EINVAL; break; -- cgit v1.2.3 From 1e270976908686ec25fb91b8a34145be54137976 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 5 Jun 2017 12:15:52 -0700 Subject: bpf: Add BPF_OBJ_GET_INFO_BY_FD A single 
BPF_OBJ_GET_INFO_BY_FD cmd is used to obtain the info for both bpf_prog and bpf_map. The kernel can figure out the fd is associated with a bpf_prog or bpf_map. The suggested struct bpf_prog_info and struct bpf_map_info are not meant to be a complete list and it is not the goal of this patch. New fields can be added in the future patch. The focus of this patch is to create the interface, BPF_OBJ_GET_INFO_BY_FD cmd for exposing the bpf_prog's and bpf_map's info. The obj's info, which will be extended (and get bigger) over time, is separated from the bpf_attr to avoid bloating the bpf_attr. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/filter.h | 2 - include/uapi/linux/bpf.h | 28 ++++++++ kernel/bpf/syscall.c | 163 ++++++++++++++++++++++++++++++++++++++++++----- 3 files changed, 174 insertions(+), 19 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 1e2dddf21f3b..1fa26dc562ce 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -69,8 +69,6 @@ struct bpf_prog_aux; /* BPF program can access up to 512 bytes of stack space. */ #define MAX_BPF_STACK 512 -#define BPF_TAG_SIZE 8 - /* Helper macros for filter block array initializers. */ /* ALU ops on registers, bpf_add|sub|...: dst_reg += src_reg */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index dd23f47ff00c..9b2c10b45733 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -86,6 +86,7 @@ enum bpf_cmd { BPF_MAP_GET_NEXT_ID, BPF_PROG_GET_FD_BY_ID, BPF_MAP_GET_FD_BY_ID, + BPF_OBJ_GET_INFO_BY_FD, }; enum bpf_map_type { @@ -222,6 +223,12 @@ union bpf_attr { }; __u32 next_id; }; + + struct { /* anonymous struct used by BPF_OBJ_GET_INFO_BY_FD */ + __u32 bpf_fd; + __u32 info_len; + __aligned_u64 info; + } info; } __attribute__((aligned(8))); /* BPF helper function descriptions: @@ -686,4 +693,25 @@ struct xdp_md { __u32 data_end; }; +#define BPF_TAG_SIZE 8 + +struct bpf_prog_info { + __u32 type; + __u32 id; + __u8 tag[BPF_TAG_SIZE]; + __u32 jited_prog_len; + __u32 xlated_prog_len; + __aligned_u64 jited_prog_insns; + __aligned_u64 xlated_prog_insns; +} __attribute__((aligned(8))); + +struct bpf_map_info { + __u32 type; + __u32 id; + __u32 key_size; + __u32 value_size; + __u32 max_entries; + __u32 map_flags; +} __attribute__((aligned(8))); + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 1802bb9c47d9..8942c820d620 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1239,6 +1239,145 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr) return fd; } +static int check_uarg_tail_zero(void __user *uaddr, + size_t expected_size, + size_t actual_size) +{ + unsigned char __user *addr; + unsigned char __user *end; + unsigned char val; + int err; + + if (actual_size <= expected_size) + return 0; + + addr = uaddr + expected_size; + end = uaddr + actual_size; + + for (; addr < end; addr++) { + err = get_user(val, addr); + if (err) + return err; + if (val) + return -E2BIG; + } + + return 0; +} + +static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info); + struct bpf_prog_info info = {}; + u32 info_len = attr->info.info_len; + char __user *uinsns; + u32 ulen; + int err; + + err = check_uarg_tail_zero(uinfo, sizeof(info), info_len); + if (err) + return err; + 
info_len = min_t(u32, sizeof(info), info_len); + + if (copy_from_user(&info, uinfo, info_len)) + return err; + + info.type = prog->type; + info.id = prog->aux->id; + + memcpy(info.tag, prog->tag, sizeof(prog->tag)); + + if (!capable(CAP_SYS_ADMIN)) { + info.jited_prog_len = 0; + info.xlated_prog_len = 0; + goto done; + } + + ulen = info.jited_prog_len; + info.jited_prog_len = prog->jited_len; + if (info.jited_prog_len && ulen) { + uinsns = u64_to_user_ptr(info.jited_prog_insns); + ulen = min_t(u32, info.jited_prog_len, ulen); + if (copy_to_user(uinsns, prog->bpf_func, ulen)) + return -EFAULT; + } + + ulen = info.xlated_prog_len; + info.xlated_prog_len = bpf_prog_size(prog->len); + if (info.xlated_prog_len && ulen) { + uinsns = u64_to_user_ptr(info.xlated_prog_insns); + ulen = min_t(u32, info.xlated_prog_len, ulen); + if (copy_to_user(uinsns, prog->insnsi, ulen)) + return -EFAULT; + } + +done: + if (copy_to_user(uinfo, &info, info_len) || + put_user(info_len, &uattr->info.info_len)) + return -EFAULT; + + return 0; +} + +static int bpf_map_get_info_by_fd(struct bpf_map *map, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + struct bpf_map_info __user *uinfo = u64_to_user_ptr(attr->info.info); + struct bpf_map_info info = {}; + u32 info_len = attr->info.info_len; + int err; + + err = check_uarg_tail_zero(uinfo, sizeof(info), info_len); + if (err) + return err; + info_len = min_t(u32, sizeof(info), info_len); + + info.type = map->map_type; + info.id = map->id; + info.key_size = map->key_size; + info.value_size = map->value_size; + info.max_entries = map->max_entries; + info.map_flags = map->map_flags; + + if (copy_to_user(uinfo, &info, info_len) || + put_user(info_len, &uattr->info.info_len)) + return -EFAULT; + + return 0; +} + +#define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info + +static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + int ufd = attr->info.bpf_fd; + struct fd f; + int err; + + if (CHECK_ATTR(BPF_OBJ_GET_INFO_BY_FD)) + return -EINVAL; + + f = fdget(ufd); + if (!f.file) + return -EBADFD; + + if (f.file->f_op == &bpf_prog_fops) + err = bpf_prog_get_info_by_fd(f.file->private_data, attr, + uattr); + else if (f.file->f_op == &bpf_map_fops) + err = bpf_map_get_info_by_fd(f.file->private_data, attr, + uattr); + else + err = -EINVAL; + + fdput(f); + return err; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr = {}; @@ -1258,23 +1397,10 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz * user-space does not rely on any kernel feature * extensions we dont know about yet. 
*/ - if (size > sizeof(attr)) { - unsigned char __user *addr; - unsigned char __user *end; - unsigned char val; - - addr = (void __user *)uattr + sizeof(attr); - end = (void __user *)uattr + size; - - for (; addr < end; addr++) { - err = get_user(val, addr); - if (err) - return err; - if (val) - return -E2BIG; - } - size = sizeof(attr); - } + err = check_uarg_tail_zero(uattr, sizeof(attr), size); + if (err) + return err; + size = min_t(u32, size, sizeof(attr)); /* copy attributes from user space, may be less than sizeof(bpf_attr) */ if (copy_from_user(&attr, uattr, size) != 0) @@ -1330,6 +1456,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_MAP_GET_FD_BY_ID: err = bpf_map_get_fd_by_id(&attr); break; + case BPF_OBJ_GET_INFO_BY_FD: + err = bpf_obj_get_info_by_fd(&attr, uattr); + break; default: err = -EINVAL; break; -- cgit v1.2.3 From 2d4283e9d583a3ee8cfb1cbb9c1270614df4c29d Mon Sep 17 00:00:00 2001 From: Luca Abeni Date: Thu, 18 May 2017 22:13:33 +0200 Subject: sched/deadline: Make GRUB a task's flag This patch introduces the SCHED_FLAG_RECLAIM flag to specify that a DL task is allowed to reclaim unused CPU time (using the GRUB algorithm). Tested-by: Daniel Bristot de Oliveira Signed-off-by: Luca Abeni Signed-off-by: Peter Zijlstra (Intel) Cc: Claudio Scordino Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Mathieu Poirier Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Tommaso Cucinotta Link: http://lkml.kernel.org/r/1495138417-6203-7-git-send-email-luca.abeni@santannapisa.it Signed-off-by: Ingo Molnar --- include/uapi/linux/sched.h | 1 + kernel/sched/core.c | 3 ++- kernel/sched/deadline.c | 3 ++- 3 files changed, 5 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index 5f0fe019a720..e2a6c7b3510b 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -47,5 +47,6 @@ * For the sched_{set,get}attr() calls */ #define SCHED_FLAG_RESET_ON_FORK 0x01 +#define SCHED_FLAG_RECLAIM 0x02 #endif /* _UAPI_LINUX_SCHED_H */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7abd06400a98..8d1a5a625814 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4195,7 +4195,8 @@ recheck: return -EINVAL; } - if (attr->sched_flags & ~(SCHED_FLAG_RESET_ON_FORK)) + if (attr->sched_flags & + ~(SCHED_FLAG_RESET_ON_FORK | SCHED_FLAG_RECLAIM)) return -EINVAL; /* diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 6a0614b9c8d7..61ea3039cdc1 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -985,7 +985,8 @@ static void update_curr_dl(struct rq *rq) sched_rt_avg_update(rq, delta_exec); - delta_exec = grub_reclaim(delta_exec, rq); + if (unlikely(dl_se->flags & SCHED_FLAG_RECLAIM)) + delta_exec = grub_reclaim(delta_exec, rq); dl_se->runtime -= delta_exec; throttle: -- cgit v1.2.3 From f2c6df7dbf9a60e1cd9941f9fb376d4d9ad1e8dd Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Thu, 8 Jun 2017 13:46:45 +0200 Subject: loop: support 4k physical blocksize When generating bootable VM images certain systems (most notably s390x) require devices with 4k blocksize. This patch implements a new flag 'LO_FLAGS_BLOCKSIZE' which will set the physical blocksize to that of the underlying device, and allow to change the logical blocksize for up to the physical blocksize. 
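To make the new loop flag concrete, here is a rough user-space sketch (not part of the patch) of requesting a 4k logical block size through LOOP_SET_STATUS64 on a loop device that is already bound to a backing file; the helper name, device path handling and minimal error handling are illustrative assumptions.

#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/loop.h>

/* Ask for a 4096-byte logical block size on an already-bound loop device. */
static int loop_set_4k(const char *dev)
{
	struct loop_info64 info;
	int fd = open(dev, O_RDWR);

	if (fd < 0)
		return -1;
	memset(&info, 0, sizeof(info));
	if (ioctl(fd, LOOP_GET_STATUS64, &info) < 0)
		goto out_err;
	info.lo_flags |= LO_FLAGS_BLOCKSIZE;
	info.lo_init[0] = 4096;		/* consumed via LO_INFO_BLOCKSIZE() */
	if (ioctl(fd, LOOP_SET_STATUS64, &info) < 0)
		goto out_err;
	close(fd);
	return 0;

out_err:
	close(fd);
	return -1;
}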
Signed-off-by: Hannes Reinecke Signed-off-by: Jens Axboe --- drivers/block/loop.c | 43 +++++++++++++++++++++++++++++++++++++------ drivers/block/loop.h | 1 + include/uapi/linux/loop.h | 3 +++ 3 files changed, 41 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/block/loop.c b/drivers/block/loop.c index fc706adff6a4..4d376c10a97a 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -221,7 +221,8 @@ static void __loop_update_dio(struct loop_device *lo, bool dio) } static int -figure_loop_size(struct loop_device *lo, loff_t offset, loff_t sizelimit) +figure_loop_size(struct loop_device *lo, loff_t offset, loff_t sizelimit, + loff_t logical_blocksize) { loff_t size = get_size(offset, sizelimit, lo->lo_backing_file); sector_t x = (sector_t)size; @@ -233,6 +234,12 @@ figure_loop_size(struct loop_device *lo, loff_t offset, loff_t sizelimit) lo->lo_offset = offset; if (lo->lo_sizelimit != sizelimit) lo->lo_sizelimit = sizelimit; + if (lo->lo_flags & LO_FLAGS_BLOCKSIZE) { + lo->lo_logical_blocksize = logical_blocksize; + blk_queue_physical_block_size(lo->lo_queue, lo->lo_blocksize); + blk_queue_logical_block_size(lo->lo_queue, + lo->lo_logical_blocksize); + } set_capacity(lo->lo_disk, x); bd_set_size(bdev, (loff_t)get_capacity(bdev->bd_disk) << 9); /* let user-space know about the new size */ @@ -810,6 +817,7 @@ static void loop_config_discard(struct loop_device *lo) struct file *file = lo->lo_backing_file; struct inode *inode = file->f_mapping->host; struct request_queue *q = lo->lo_queue; + int lo_bits = 9; /* * We use punch hole to reclaim the free space used by the @@ -829,8 +837,11 @@ static void loop_config_discard(struct loop_device *lo) q->limits.discard_granularity = inode->i_sb->s_blocksize; q->limits.discard_alignment = 0; - blk_queue_max_discard_sectors(q, UINT_MAX >> 9); - blk_queue_max_write_zeroes_sectors(q, UINT_MAX >> 9); + if (lo->lo_flags & LO_FLAGS_BLOCKSIZE) + lo_bits = blksize_bits(lo->lo_logical_blocksize); + + blk_queue_max_discard_sectors(q, UINT_MAX >> lo_bits); + blk_queue_max_write_zeroes_sectors(q, UINT_MAX >> lo_bits); queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); } @@ -918,6 +929,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, lo->use_dio = false; lo->lo_blocksize = lo_blocksize; + lo->lo_logical_blocksize = 512; lo->lo_device = bdev; lo->lo_flags = lo_flags; lo->lo_backing_file = file; @@ -1083,6 +1095,7 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) int err; struct loop_func_table *xfer; kuid_t uid = current_uid(); + int lo_flags = lo->lo_flags; if (lo->lo_encrypt_key_size && !uid_eq(lo->lo_key_owner, uid) && @@ -1115,9 +1128,26 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) if (err) goto exit; + if (info->lo_flags & LO_FLAGS_BLOCKSIZE) { + if (!(lo->lo_flags & LO_FLAGS_BLOCKSIZE)) + lo->lo_logical_blocksize = 512; + lo->lo_flags |= LO_FLAGS_BLOCKSIZE; + if (LO_INFO_BLOCKSIZE(info) != 512 && + LO_INFO_BLOCKSIZE(info) != 1024 && + LO_INFO_BLOCKSIZE(info) != 2048 && + LO_INFO_BLOCKSIZE(info) != 4096) + return -EINVAL; + if (LO_INFO_BLOCKSIZE(info) > lo->lo_blocksize) + return -EINVAL; + } + if (lo->lo_offset != info->lo_offset || - lo->lo_sizelimit != info->lo_sizelimit) - if (figure_loop_size(lo, info->lo_offset, info->lo_sizelimit)) { + lo->lo_sizelimit != info->lo_sizelimit || + lo->lo_flags != lo_flags || + ((lo->lo_flags & LO_FLAGS_BLOCKSIZE) && + lo->lo_logical_blocksize != LO_INFO_BLOCKSIZE(info))) { + if (figure_loop_size(lo, 
info->lo_offset, info->lo_sizelimit, + LO_INFO_BLOCKSIZE(info))) err = -EFBIG; goto exit; } @@ -1308,7 +1338,8 @@ static int loop_set_capacity(struct loop_device *lo) if (unlikely(lo->lo_state != Lo_bound)) return -ENXIO; - return figure_loop_size(lo, lo->lo_offset, lo->lo_sizelimit); + return figure_loop_size(lo, lo->lo_offset, lo->lo_sizelimit, + lo->lo_logical_blocksize); } static int loop_set_dio(struct loop_device *lo, unsigned long arg) diff --git a/drivers/block/loop.h b/drivers/block/loop.h index fecd3f97ef8c..2c096b9a17b8 100644 --- a/drivers/block/loop.h +++ b/drivers/block/loop.h @@ -49,6 +49,7 @@ struct loop_device { struct file * lo_backing_file; struct block_device *lo_device; unsigned lo_blocksize; + unsigned lo_logical_blocksize; void *key_data; gfp_t old_gfp_mask; diff --git a/include/uapi/linux/loop.h b/include/uapi/linux/loop.h index c8125ec1f4f2..a3960f98679c 100644 --- a/include/uapi/linux/loop.h +++ b/include/uapi/linux/loop.h @@ -22,6 +22,7 @@ enum { LO_FLAGS_AUTOCLEAR = 4, LO_FLAGS_PARTSCAN = 8, LO_FLAGS_DIRECT_IO = 16, + LO_FLAGS_BLOCKSIZE = 32, }; #include /* for __kernel_old_dev_t */ @@ -59,6 +60,8 @@ struct loop_info64 { __u64 lo_init[2]; }; +#define LO_INFO_BLOCKSIZE(l) (l)->lo_init[0] + /* * Loop filter types */ -- cgit v1.2.3 From 0604475119de5f80dc051a5db055c6a2a75bd542 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 7 Jun 2017 13:29:12 -0700 Subject: tcp: add TCPMemoryPressuresChrono counter DRAM supply shortage and poor memory pressure tracking in TCP stack makes any change in SO_SNDBUF/SO_RCVBUF (or equivalent autotuning limits) and tcp_mem[] quite hazardous. TCPMemoryPressures SNMP counter is an indication of tcp_mem sysctl limits being hit, but only tracking number of transitions. If TCP stack behavior under stress was perfect : 1) It would maintain memory usage close to the limit. 2) Memory pressure state would be entered for short times. We certainly prefer 100 events lasting 10ms compared to one event lasting 200 seconds. This patch adds a new SNMP counter tracking cumulative duration of memory pressure events, given in ms units. $ cat /proc/sys/net/ipv4/tcp_mem 3088 4117 6176 $ grep TCP /proc/net/sockstat TCP: inuse 180 orphan 0 tw 2 alloc 234 mem 4140 $ nstat -n ; sleep 10 ; nstat |grep Pressure TcpExtTCPMemoryPressures 1700 TcpExtTCPMemoryPressuresChrono 5209 v2: Used EXPORT_SYMBOL_GPL() instead of EXPORT_SYMBOL() as David instructed. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 22 ++-------------------- include/net/tcp.h | 3 ++- include/uapi/linux/snmp.h | 1 + net/core/sock.c | 20 ++++++++++++++++++++ net/decnet/af_decnet.c | 2 +- net/ipv4/proc.c | 1 + net/ipv4/tcp.c | 31 +++++++++++++++++++++++++------ net/ipv4/tcp_ipv4.c | 1 + net/ipv6/tcp_ipv6.c | 1 + net/sctp/socket.c | 2 +- 10 files changed, 55 insertions(+), 29 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/sock.h b/include/net/sock.h index 3467d9e89e7d..858891c36f94 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1080,6 +1080,7 @@ struct proto { bool (*stream_memory_free)(const struct sock *sk); /* Memory pressure */ void (*enter_memory_pressure)(struct sock *sk); + void (*leave_memory_pressure)(struct sock *sk); atomic_long_t *memory_allocated; /* Current allocated memory. */ struct percpu_counter *sockets_allocated; /* Current number of sockets. */ /* @@ -1088,7 +1089,7 @@ struct proto { * All the __sk_mem_schedule() is of this nature: accounting * is strict, actions are advisory and have some latency. 
*/ - int *memory_pressure; + unsigned long *memory_pressure; long *sysctl_mem; int *sysctl_wmem; int *sysctl_rmem; @@ -1193,25 +1194,6 @@ static inline bool sk_under_memory_pressure(const struct sock *sk) return !!*sk->sk_prot->memory_pressure; } -static inline void sk_leave_memory_pressure(struct sock *sk) -{ - int *memory_pressure = sk->sk_prot->memory_pressure; - - if (!memory_pressure) - return; - - if (*memory_pressure) - *memory_pressure = 0; -} - -static inline void sk_enter_memory_pressure(struct sock *sk) -{ - if (!sk->sk_prot->enter_memory_pressure) - return; - - sk->sk_prot->enter_memory_pressure(sk); -} - static inline long sk_memory_allocated(const struct sock *sk) { diff --git a/include/net/tcp.h b/include/net/tcp.h index aec092560d9b..3ab677d11d02 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -276,7 +276,7 @@ extern int sysctl_tcp_pacing_ca_ratio; extern atomic_long_t tcp_memory_allocated; extern struct percpu_counter tcp_sockets_allocated; -extern int tcp_memory_pressure; +extern unsigned long tcp_memory_pressure; /* optimized version of sk_under_memory_pressure() for TCP sockets */ static inline bool tcp_under_memory_pressure(const struct sock *sk) @@ -1320,6 +1320,7 @@ extern void tcp_openreq_init_rwin(struct request_sock *req, const struct dst_entry *dst); void tcp_enter_memory_pressure(struct sock *sk); +void tcp_leave_memory_pressure(struct sock *sk); static inline int keepalive_intvl_when(const struct tcp_sock *tp) { diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index 95cffcb21dfd..d85693295798 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -228,6 +228,7 @@ enum LINUX_MIB_TCPABORTONLINGER, /* TCPAbortOnLinger */ LINUX_MIB_TCPABORTFAILED, /* TCPAbortFailed */ LINUX_MIB_TCPMEMORYPRESSURES, /* TCPMemoryPressures */ + LINUX_MIB_TCPMEMORYPRESSURESCHRONO, /* TCPMemoryPressuresChrono */ LINUX_MIB_TCPSACKDISCARD, /* TCPSACKDiscard */ LINUX_MIB_TCPDSACKIGNOREDOLD, /* TCPSACKIgnoredOld */ LINUX_MIB_TCPDSACKIGNOREDNOUNDO, /* TCPSACKIgnoredNoUndo */ diff --git a/net/core/sock.c b/net/core/sock.c index bef844127e01..ad8a4bc84126 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2076,6 +2076,26 @@ int sock_cmsg_send(struct sock *sk, struct msghdr *msg, } EXPORT_SYMBOL(sock_cmsg_send); +static void sk_enter_memory_pressure(struct sock *sk) +{ + if (!sk->sk_prot->enter_memory_pressure) + return; + + sk->sk_prot->enter_memory_pressure(sk); +} + +static void sk_leave_memory_pressure(struct sock *sk) +{ + if (sk->sk_prot->leave_memory_pressure) { + sk->sk_prot->leave_memory_pressure(sk); + } else { + unsigned long *memory_pressure = sk->sk_prot->memory_pressure; + + if (memory_pressure && *memory_pressure) + *memory_pressure = 0; + } +} + /* On 32bit arches, an skb frag is limited to 2^15 */ #define SKB_FRAG_PAGE_ORDER get_order(32768) diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index 405483a07efc..73a0399dc7a2 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -447,7 +447,7 @@ static void dn_destruct(struct sock *sk) dst_release(rcu_dereference_check(sk->sk_dst_cache, 1)); } -static int dn_memory_pressure; +static unsigned long dn_memory_pressure; static void dn_enter_memory_pressure(struct sock *sk) { diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index fa44e752a9a3..43eb6567b3a0 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -250,6 +250,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPAbortOnLinger", LINUX_MIB_TCPABORTONLINGER), 
SNMP_MIB_ITEM("TCPAbortFailed", LINUX_MIB_TCPABORTFAILED), SNMP_MIB_ITEM("TCPMemoryPressures", LINUX_MIB_TCPMEMORYPRESSURES), + SNMP_MIB_ITEM("TCPMemoryPressuresChrono", LINUX_MIB_TCPMEMORYPRESSURESCHRONO), SNMP_MIB_ITEM("TCPSACKDiscard", LINUX_MIB_TCPSACKDISCARD), SNMP_MIB_ITEM("TCPDSACKIgnoredOld", LINUX_MIB_TCPDSACKIGNOREDOLD), SNMP_MIB_ITEM("TCPDSACKIgnoredNoUndo", LINUX_MIB_TCPDSACKIGNOREDNOUNDO), diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 87981fcdfcf2..cc8fd8b747a4 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -320,17 +320,36 @@ struct tcp_splice_state { * All the __sk_mem_schedule() is of this nature: accounting * is strict, actions are advisory and have some latency. */ -int tcp_memory_pressure __read_mostly; -EXPORT_SYMBOL(tcp_memory_pressure); +unsigned long tcp_memory_pressure __read_mostly; +EXPORT_SYMBOL_GPL(tcp_memory_pressure); void tcp_enter_memory_pressure(struct sock *sk) { - if (!tcp_memory_pressure) { + unsigned long val; + + if (tcp_memory_pressure) + return; + val = jiffies; + + if (!val) + val--; + if (!cmpxchg(&tcp_memory_pressure, 0, val)) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES); - tcp_memory_pressure = 1; - } } -EXPORT_SYMBOL(tcp_enter_memory_pressure); +EXPORT_SYMBOL_GPL(tcp_enter_memory_pressure); + +void tcp_leave_memory_pressure(struct sock *sk) +{ + unsigned long val; + + if (!tcp_memory_pressure) + return; + val = xchg(&tcp_memory_pressure, 0); + if (val) + NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURESCHRONO, + jiffies_to_msecs(jiffies - val)); +} +EXPORT_SYMBOL_GPL(tcp_leave_memory_pressure); /* Convert seconds to retransmits based on initial and max timeout */ static u8 secs_to_retrans(int seconds, int timeout, int rto_max) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 13c7ae7d4504..1dc8c449e16a 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2387,6 +2387,7 @@ struct proto tcp_prot = { .unhash = inet_unhash, .get_port = inet_csk_get_port, .enter_memory_pressure = tcp_enter_memory_pressure, + .leave_memory_pressure = tcp_leave_memory_pressure, .stream_memory_free = tcp_stream_memory_free, .sockets_allocated = &tcp_sockets_allocated, .orphan_count = &tcp_orphan_count, diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 5a525426fe93..0840543fc245 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1910,6 +1910,7 @@ struct proto tcpv6_prot = { .unhash = inet_unhash, .get_port = inet_csk_get_port, .enter_memory_pressure = tcp_enter_memory_pressure, + .leave_memory_pressure = tcp_leave_memory_pressure, .stream_memory_free = tcp_stream_memory_free, .sockets_allocated = &tcp_sockets_allocated, .memory_allocated = &tcp_memory_allocated, diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 0822046e4f3f..5f58dd03e3ac 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -103,7 +103,7 @@ static int sctp_autobind(struct sock *sk); static void sctp_sock_migrate(struct sock *, struct sock *, struct sctp_association *, sctp_socket_type_t); -static int sctp_memory_pressure; +static unsigned long sctp_memory_pressure; static atomic_long_t sctp_memory_allocated; struct percpu_counter sctp_sockets_allocated; -- cgit v1.2.3 From 9fe8bcec0dbc19604acc3a2cd469febf96f0d59a Mon Sep 17 00:00:00 2001 From: Arkadi Sharshevsky Date: Thu, 8 Jun 2017 08:44:15 +0200 Subject: net: bridge: Receive notification about successful FDB offload When a new static FDB is added to the bridge a notification is sent to the driver for offload. 
In case of successful offload the driver should notify the bridge back, which in turn should mark the FDB as offloaded. Currently, externally learned is equivalent for being offloaded which is not correct due to the fact that FDBs which are added from user-space are also marked as externally learned. In order to specify if an FDB was successfully offloaded a new flag is introduced. Signed-off-by: Arkadi Sharshevsky Reviewed-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/switchdev.h | 1 + include/uapi/linux/neighbour.h | 1 + net/bridge/br.c | 11 ++++++++++- net/bridge/br_fdb.c | 22 +++++++++++++++++++++- net/bridge/br_private.h | 5 ++++- 5 files changed, 37 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/switchdev.h b/include/net/switchdev.h index 8165ed93c58b..c784a6ac6ef1 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -159,6 +159,7 @@ enum switchdev_notifier_type { SWITCHDEV_FDB_DEL_TO_BRIDGE, SWITCHDEV_FDB_ADD_TO_DEVICE, SWITCHDEV_FDB_DEL_TO_DEVICE, + SWITCHDEV_FDB_OFFLOADED, }; struct switchdev_notifier_info { diff --git a/include/uapi/linux/neighbour.h b/include/uapi/linux/neighbour.h index f3d16dbe09d6..3199d28980b3 100644 --- a/include/uapi/linux/neighbour.h +++ b/include/uapi/linux/neighbour.h @@ -41,6 +41,7 @@ enum { #define NTF_MASTER 0x04 #define NTF_PROXY 0x08 /* == ATF_PUBL */ #define NTF_EXT_LEARNED 0x10 +#define NTF_OFFLOADED 0x20 #define NTF_ROUTER 0x80 /* diff --git a/net/bridge/br.c b/net/bridge/br.c index 96d209caf6db..1407d1ba7577 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -142,8 +142,12 @@ static int br_switchdev_event(struct notifier_block *unused, fdb_info = ptr; err = br_fdb_external_learn_add(br, p, fdb_info->addr, fdb_info->vid); - if (err) + if (err) { err = notifier_from_errno(err); + break; + } + br_fdb_offloaded_set(br, p, fdb_info->addr, + fdb_info->vid); break; case SWITCHDEV_FDB_DEL_TO_BRIDGE: fdb_info = ptr; @@ -152,6 +156,11 @@ static int br_switchdev_event(struct notifier_block *unused, if (err) err = notifier_from_errno(err); break; + case SWITCHDEV_FDB_OFFLOADED: + fdb_info = ptr; + br_fdb_offloaded_set(br, p, fdb_info->addr, + fdb_info->vid); + break; } out: diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 26a1dae2d434..fef7872a320b 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -511,6 +511,7 @@ static struct net_bridge_fdb_entry *fdb_create(struct hlist_head *head, fdb->is_static = is_static; fdb->added_by_user = 0; fdb->added_by_external_learn = 0; + fdb->offloaded = 0; fdb->updated = fdb->used = jiffies; hlist_add_head_rcu(&fdb->hlist, head); } @@ -647,11 +648,16 @@ static int fdb_fill_info(struct sk_buff *skb, const struct net_bridge *br, ndm->ndm_family = AF_BRIDGE; ndm->ndm_pad1 = 0; ndm->ndm_pad2 = 0; - ndm->ndm_flags = fdb->added_by_external_learn ? NTF_EXT_LEARNED : 0; + ndm->ndm_flags = 0; ndm->ndm_type = 0; ndm->ndm_ifindex = fdb->dst ? 
fdb->dst->dev->ifindex : br->dev->ifindex; ndm->ndm_state = fdb_to_nud(br, fdb); + if (fdb->offloaded) + ndm->ndm_flags |= NTF_OFFLOADED; + if (fdb->added_by_external_learn) + ndm->ndm_flags |= NTF_EXT_LEARNED; + if (nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->addr)) goto nla_put_failure; if (nla_put_u32(skb, NDA_MASTER, br->dev->ifindex)) @@ -1123,3 +1129,17 @@ int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p, return err; } + +void br_fdb_offloaded_set(struct net_bridge *br, struct net_bridge_port *p, + const unsigned char *addr, u16 vid) +{ + struct net_bridge_fdb_entry *fdb; + + spin_lock_bh(&br->hash_lock); + + fdb = br_fdb_find(br, addr, vid); + if (fdb) + fdb->offloaded = 1; + + spin_unlock_bh(&br->hash_lock); +} diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 98410ea032cb..c18682f804a0 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -169,7 +169,8 @@ struct net_bridge_fdb_entry { unsigned char is_local:1, is_static:1, added_by_user:1, - added_by_external_learn:1; + added_by_external_learn:1, + offloaded:1; /* write-heavy members should not affect lookups */ unsigned long updated ____cacheline_aligned_in_smp; @@ -536,6 +537,8 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p, const unsigned char *addr, u16 vid); int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p, const unsigned char *addr, u16 vid); +void br_fdb_offloaded_set(struct net_bridge *br, struct net_bridge_port *p, + const unsigned char *addr, u16 vid); /* br_forward.c */ enum br_pkt_type { -- cgit v1.2.3 From 772c344dbb23b2ce4568ac30afae92a842fa6d8f Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Wed, 7 Jun 2017 18:02:32 +0300 Subject: net: ipmr: add getlink support Currently there's no way to dump the VIF table for an ipmr table other than the default (via proc). This is a major issue when debugging ipmr issues and in general it is good to know which interfaces are configured. This patch adds support for RTM_GETLINK for the ipmr family so we can dump the VIF table and the ipmr table's current config for each table. We're protected by rtnl so no need to acquire RCU or mrt_lock. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. 
Miller --- include/uapi/linux/mroute.h | 42 +++++++++++++++ net/ipv4/ipmr.c | 126 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 168 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/mroute.h b/include/uapi/linux/mroute.h index 1fe4c1e7d66e..f904367c0cee 100644 --- a/include/uapi/linux/mroute.h +++ b/include/uapi/linux/mroute.h @@ -110,6 +110,48 @@ struct igmpmsg { struct in_addr im_src,im_dst; }; +/* ipmr netlink table attributes */ +enum { + IPMRA_TABLE_UNSPEC, + IPMRA_TABLE_ID, + IPMRA_TABLE_CACHE_RES_QUEUE_LEN, + IPMRA_TABLE_MROUTE_REG_VIF_NUM, + IPMRA_TABLE_MROUTE_DO_ASSERT, + IPMRA_TABLE_MROUTE_DO_PIM, + IPMRA_TABLE_VIFS, + __IPMRA_TABLE_MAX +}; +#define IPMRA_TABLE_MAX (__IPMRA_TABLE_MAX - 1) + +/* ipmr netlink vif attribute format + * [ IPMRA_TABLE_VIFS ] - nested attribute + * [ IPMRA_VIF ] - nested attribute + * [ IPMRA_VIFA_xxx ] + */ +enum { + IPMRA_VIF_UNSPEC, + IPMRA_VIF, + __IPMRA_VIF_MAX +}; +#define IPMRA_VIF_MAX (__IPMRA_VIF_MAX - 1) + +/* vif-specific attributes */ +enum { + IPMRA_VIFA_UNSPEC, + IPMRA_VIFA_IFINDEX, + IPMRA_VIFA_VIF_ID, + IPMRA_VIFA_FLAGS, + IPMRA_VIFA_BYTES_IN, + IPMRA_VIFA_BYTES_OUT, + IPMRA_VIFA_PACKETS_IN, + IPMRA_VIFA_PACKETS_OUT, + IPMRA_VIFA_LOCAL_ADDR, + IPMRA_VIFA_REMOTE_ADDR, + IPMRA_VIFA_PAD, + __IPMRA_VIFA_MAX +}; +#define IPMRA_VIFA_MAX (__IPMRA_VIFA_MAX - 1) + /* That's all usermode folks */ #define MFC_ASSERT_THRESH (3*HZ) /* Maximal freq. of asserts */ diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 551de4d023a8..9374b99c7c17 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -2528,6 +2528,129 @@ static int ipmr_rtm_route(struct sk_buff *skb, struct nlmsghdr *nlh, return ipmr_mfc_delete(tbl, &mfcc, parent); } +static bool ipmr_fill_table(struct mr_table *mrt, struct sk_buff *skb) +{ + u32 queue_len = atomic_read(&mrt->cache_resolve_queue_len); + + if (nla_put_u32(skb, IPMRA_TABLE_ID, mrt->id) || + nla_put_u32(skb, IPMRA_TABLE_CACHE_RES_QUEUE_LEN, queue_len) || + nla_put_s32(skb, IPMRA_TABLE_MROUTE_REG_VIF_NUM, + mrt->mroute_reg_vif_num) || + nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_ASSERT, + mrt->mroute_do_assert) || + nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_PIM, mrt->mroute_do_pim)) + return false; + + return true; +} + +static bool ipmr_fill_vif(struct mr_table *mrt, u32 vifid, struct sk_buff *skb) +{ + struct nlattr *vif_nest; + struct vif_device *vif; + + /* if the VIF doesn't exist just continue */ + if (!VIF_EXISTS(mrt, vifid)) + return true; + + vif = &mrt->vif_table[vifid]; + vif_nest = nla_nest_start(skb, IPMRA_VIF); + if (!vif_nest) + return false; + if (nla_put_u32(skb, IPMRA_VIFA_IFINDEX, vif->dev->ifindex) || + nla_put_u32(skb, IPMRA_VIFA_VIF_ID, vifid) || + nla_put_u16(skb, IPMRA_VIFA_FLAGS, vif->flags) || + nla_put_u64_64bit(skb, IPMRA_VIFA_BYTES_IN, vif->bytes_in, + IPMRA_VIFA_PAD) || + nla_put_u64_64bit(skb, IPMRA_VIFA_BYTES_OUT, vif->bytes_out, + IPMRA_VIFA_PAD) || + nla_put_u64_64bit(skb, IPMRA_VIFA_PACKETS_IN, vif->pkt_in, + IPMRA_VIFA_PAD) || + nla_put_u64_64bit(skb, IPMRA_VIFA_PACKETS_OUT, vif->pkt_out, + IPMRA_VIFA_PAD) || + nla_put_be32(skb, IPMRA_VIFA_LOCAL_ADDR, vif->local) || + nla_put_be32(skb, IPMRA_VIFA_REMOTE_ADDR, vif->remote)) { + nla_nest_cancel(skb, vif_nest); + return false; + } + nla_nest_end(skb, vif_nest); + + return true; +} + +static int ipmr_rtm_dumplink(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + struct nlmsghdr *nlh = NULL; + unsigned int t = 0, s_t; + unsigned int e = 0, s_e; + struct mr_table *mrt; 
+ + s_t = cb->args[0]; + s_e = cb->args[1]; + + ipmr_for_each_table(mrt, net) { + struct nlattr *vifs, *af; + struct ifinfomsg *hdr; + u32 i; + + if (t < s_t) + goto skip_table; + nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, RTM_NEWLINK, + sizeof(*hdr), NLM_F_MULTI); + if (!nlh) + break; + + hdr = nlmsg_data(nlh); + memset(hdr, 0, sizeof(*hdr)); + hdr->ifi_family = RTNL_FAMILY_IPMR; + + af = nla_nest_start(skb, IFLA_AF_SPEC); + if (!af) { + nlmsg_cancel(skb, nlh); + goto out; + } + + if (!ipmr_fill_table(mrt, skb)) { + nlmsg_cancel(skb, nlh); + goto out; + } + + vifs = nla_nest_start(skb, IPMRA_TABLE_VIFS); + if (!vifs) { + nla_nest_end(skb, af); + nlmsg_end(skb, nlh); + goto out; + } + for (i = 0; i < mrt->maxvif; i++) { + if (e < s_e) + goto skip_entry; + if (!ipmr_fill_vif(mrt, i, skb)) { + nla_nest_end(skb, vifs); + nla_nest_end(skb, af); + nlmsg_end(skb, nlh); + goto out; + } +skip_entry: + e++; + } + s_e = 0; + e = 0; + nla_nest_end(skb, vifs); + nla_nest_end(skb, af); + nlmsg_end(skb, nlh); +skip_table: + t++; + } + +out: + cb->args[1] = e; + cb->args[0] = t; + + return skb->len; +} + #ifdef CONFIG_PROC_FS /* The /proc interfaces to multicast routing : * /proc/net/ip_mr_cache & /proc/net/ip_mr_vif @@ -2870,6 +2993,9 @@ int __init ip_mr_init(void) ipmr_rtm_route, NULL, NULL); rtnl_register(RTNL_FAMILY_IPMR, RTM_DELROUTE, ipmr_rtm_route, NULL, NULL); + + rtnl_register(RTNL_FAMILY_IPMR, RTM_GETLINK, + NULL, ipmr_rtm_dumplink, NULL); return 0; #ifdef CONFIG_IP_PIMSM_V2 -- cgit v1.2.3 From a481f4d917835cad86701fc0d1e620c74bb5cd5f Mon Sep 17 00:00:00 2001 From: John Johansen Date: Thu, 25 May 2017 05:52:56 -0700 Subject: apparmor: add custom apparmorfs that will be used by policy namespace files AppArmor policy needs to be able to be resolved based on the policy namespace a task is confined by. Add a base apparmorfs filesystem that (like nsfs) will exist as a kern mount and be accessed via jump_link through a securityfs file. Setup the base apparmorfs fns and data, but don't use it yet. Signed-off-by: John Johansen Reviewed-by: Seth Arnold Reviewed-by: Kees Cook --- include/uapi/linux/magic.h | 2 + security/apparmor/apparmorfs.c | 353 +++++++++++++++++++++++++++++++++++++++-- 2 files changed, 338 insertions(+), 17 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h index e230af2e6855..a0908f1d2760 100644 --- a/include/uapi/linux/magic.h +++ b/include/uapi/linux/magic.h @@ -80,6 +80,8 @@ #define BTRFS_TEST_MAGIC 0x73727279 #define NSFS_MAGIC 0x6e736673 #define BPF_FS_MAGIC 0xcafe4a11 +#define AAFS_MAGIC 0x5a3c69f0 + /* Since UDF 2.01 is ISO 13346 based... 
*/ #define UDF_SUPER_MAGIC 0x15013346 #define BALLOON_KVM_MAGIC 0x13661366 diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c index 16680d15d43e..7e4b7f28ee20 100644 --- a/security/apparmor/apparmorfs.c +++ b/security/apparmor/apparmorfs.c @@ -22,8 +22,9 @@ #include #include #include -#include #include +#include +#include #include "include/apparmor.h" #include "include/apparmorfs.h" @@ -74,6 +75,265 @@ static int mangle_name(const char *name, char *target) return t - target; } + +/* + * aafs - core fns and data for the policy tree + */ + +#define AAFS_NAME "apparmorfs" +static struct vfsmount *aafs_mnt; +static int aafs_count; + + +static int aafs_show_path(struct seq_file *seq, struct dentry *dentry) +{ + struct inode *inode = d_inode(dentry); + + seq_printf(seq, "%s:[%lu]", AAFS_NAME, inode->i_ino); + return 0; +} + +static void aafs_evict_inode(struct inode *inode) +{ + truncate_inode_pages_final(&inode->i_data); + clear_inode(inode); + if (S_ISLNK(inode->i_mode)) + kfree(inode->i_link); +} + +static const struct super_operations aafs_super_ops = { + .statfs = simple_statfs, + .evict_inode = aafs_evict_inode, + .show_path = aafs_show_path, +}; + +static int fill_super(struct super_block *sb, void *data, int silent) +{ + static struct tree_descr files[] = { {""} }; + int error; + + error = simple_fill_super(sb, AAFS_MAGIC, files); + if (error) + return error; + sb->s_op = &aafs_super_ops; + + return 0; +} + +static struct dentry *aafs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_single(fs_type, flags, data, fill_super); +} + +static struct file_system_type aafs_ops = { + .owner = THIS_MODULE, + .name = AAFS_NAME, + .mount = aafs_mount, + .kill_sb = kill_anon_super, +}; + +/** + * __aafs_setup_d_inode - basic inode setup for apparmorfs + * @dir: parent directory for the dentry + * @dentry: dentry we are seting the inode up for + * @mode: permissions the file should have + * @data: data to store on inode.i_private, available in open() + * @link: if symlink, symlink target string + * @fops: struct file_operations that should be used + * @iops: struct of inode_operations that should be used + */ +static int __aafs_setup_d_inode(struct inode *dir, struct dentry *dentry, + umode_t mode, void *data, char *link, + const struct file_operations *fops, + const struct inode_operations *iops) +{ + struct inode *inode = new_inode(dir->i_sb); + + AA_BUG(!dir); + AA_BUG(!dentry); + + if (!inode) + return -ENOMEM; + + inode->i_ino = get_next_ino(); + inode->i_mode = mode; + inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_private = data; + if (S_ISDIR(mode)) { + inode->i_op = iops ? iops : &simple_dir_inode_operations; + inode->i_fop = &simple_dir_operations; + inc_nlink(inode); + inc_nlink(dir); + } else if (S_ISLNK(mode)) { + inode->i_op = iops ? 
iops : &simple_symlink_inode_operations; + inode->i_link = link; + } else { + inode->i_fop = fops; + } + d_instantiate(dentry, inode); + dget(dentry); + + return 0; +} + +/** + * aafs_create - create a dentry in the apparmorfs filesystem + * + * @name: name of dentry to create + * @mode: permissions the file should have + * @parent: parent directory for this dentry + * @data: data to store on inode.i_private, available in open() + * @link: if symlink, symlink target string + * @fops: struct file_operations that should be used for + * @iops: struct of inode_operations that should be used + * + * This is the basic "create a xxx" function for apparmorfs. + * + * Returns a pointer to a dentry if it succeeds, that must be free with + * aafs_remove(). Will return ERR_PTR on failure. + */ +static struct dentry *aafs_create(const char *name, umode_t mode, + struct dentry *parent, void *data, void *link, + const struct file_operations *fops, + const struct inode_operations *iops) +{ + struct dentry *dentry; + struct inode *dir; + int error; + + AA_BUG(!name); + AA_BUG(!parent); + + if (!(mode & S_IFMT)) + mode = (mode & S_IALLUGO) | S_IFREG; + + error = simple_pin_fs(&aafs_ops, &aafs_mnt, &aafs_count); + if (error) + return ERR_PTR(error); + + dir = d_inode(parent); + + inode_lock(dir); + dentry = lookup_one_len(name, parent, strlen(name)); + if (IS_ERR(dentry)) + goto fail_lock; + + if (d_really_is_positive(dentry)) { + error = -EEXIST; + goto fail_dentry; + } + + error = __aafs_setup_d_inode(dir, dentry, mode, data, link, fops, iops); + if (error) + goto fail_dentry; + inode_unlock(dir); + + return dentry; + +fail_dentry: + dput(dentry); + +fail_lock: + inode_unlock(dir); + simple_release_fs(&aafs_mnt, &aafs_count); + + return ERR_PTR(error); +} + +/** + * aafs_create_file - create a file in the apparmorfs filesystem + * + * @name: name of dentry to create + * @mode: permissions the file should have + * @parent: parent directory for this dentry + * @data: data to store on inode.i_private, available in open() + * @fops: struct file_operations that should be used for + * + * see aafs_create + */ +static struct dentry *aafs_create_file(const char *name, umode_t mode, + struct dentry *parent, void *data, + const struct file_operations *fops) +{ + return aafs_create(name, mode, parent, data, NULL, fops, NULL); +} + +/** + * aafs_create_dir - create a directory in the apparmorfs filesystem + * + * @name: name of dentry to create + * @parent: parent directory for this dentry + * + * see aafs_create + */ +static struct dentry *aafs_create_dir(const char *name, struct dentry *parent) +{ + return aafs_create(name, S_IFDIR | 0755, parent, NULL, NULL, NULL, + NULL); +} + +/** + * aafs_create_symlink - create a symlink in the apparmorfs filesystem + * @name: name of dentry to create + * @parent: parent directory for this dentry + * @target: if symlink, symlink target string + * @iops: struct of inode_operations that should be used + * + * If @target parameter is %NULL, then the @iops parameter needs to be + * setup to handle .readlink and .get_link inode_operations. 
+ */ +static struct dentry *aafs_create_symlink(const char *name, + struct dentry *parent, + const char *target, + const struct inode_operations *iops) +{ + struct dentry *dent; + char *link = NULL; + + if (target) { + link = kstrdup(target, GFP_KERNEL); + if (!link) + return ERR_PTR(-ENOMEM); + } + dent = aafs_create(name, S_IFLNK | 0444, parent, NULL, link, NULL, + iops); + if (IS_ERR(dent)) + kfree(link); + + return dent; +} + +/** + * aafs_remove - removes a file or directory from the apparmorfs filesystem + * + * @dentry: dentry of the file/directory/symlink to removed. + */ +static void aafs_remove(struct dentry *dentry) +{ + struct inode *dir; + + if (!dentry || IS_ERR(dentry)) + return; + + dir = d_inode(dentry->d_parent); + inode_lock(dir); + if (simple_positive(dentry)) { + if (d_is_dir(dentry)) + simple_rmdir(dir, dentry); + else + simple_unlink(dir, dentry); + dput(dentry); + } + inode_unlock(dir); + simple_release_fs(&aafs_mnt, &aafs_count); +} + + +/* + * aa_fs - policy load/replace/remove + */ + /** * aa_simple_write_to_buffer - common routine for getting policy from user * @userbuf: user buffer to copy data from (NOT NULL) @@ -1369,14 +1629,14 @@ static struct aa_fs_entry aa_fs_entry = AA_FS_DIR("apparmor", aa_fs_entry_apparmor); /** - * aafs_create_file - create a file entry in the apparmor securityfs + * entry_create_file - create a file entry in the apparmor securityfs * @fs_file: aa_fs_entry to build an entry for (NOT NULL) * @parent: the parent dentry in the securityfs * - * Use aafs_remove_file to remove entries created with this fn. + * Use entry_remove_file to remove entries created with this fn. */ -static int __init aafs_create_file(struct aa_fs_entry *fs_file, - struct dentry *parent) +static int __init entry_create_file(struct aa_fs_entry *fs_file, + struct dentry *parent) { int error = 0; @@ -1391,15 +1651,15 @@ static int __init aafs_create_file(struct aa_fs_entry *fs_file, return error; } -static void __init aafs_remove_dir(struct aa_fs_entry *fs_dir); +static void __init entry_remove_dir(struct aa_fs_entry *fs_dir); /** - * aafs_create_dir - recursively create a directory entry in the securityfs + * entry_create_dir - recursively create a directory entry in the securityfs * @fs_dir: aa_fs_entry (and all child entries) to build (NOT NULL) * @parent: the parent dentry in the securityfs * - * Use aafs_remove_dir to remove entries created with this fn. + * Use entry_remove_dir to remove entries created with this fn. 
*/ -static int __init aafs_create_dir(struct aa_fs_entry *fs_dir, +static int __init entry_create_dir(struct aa_fs_entry *fs_dir, struct dentry *parent) { struct aa_fs_entry *fs_file; @@ -1413,9 +1673,9 @@ static int __init aafs_create_dir(struct aa_fs_entry *fs_dir, for (fs_file = fs_dir->v.files; fs_file && fs_file->name; ++fs_file) { if (fs_file->v_type == AA_FS_TYPE_DIR) - error = aafs_create_dir(fs_file, fs_dir->dentry); + error = entry_create_dir(fs_file, fs_dir->dentry); else - error = aafs_create_file(fs_file, fs_dir->dentry); + error = entry_create_file(fs_file, fs_dir->dentry); if (error) goto failed; } @@ -1423,7 +1683,7 @@ static int __init aafs_create_dir(struct aa_fs_entry *fs_dir, return 0; failed: - aafs_remove_dir(fs_dir); + entry_remove_dir(fs_dir); return error; } @@ -1442,16 +1702,16 @@ static void __init aafs_remove_file(struct aa_fs_entry *fs_file) } /** - * aafs_remove_dir - recursively drop a directory entry from the securityfs + * entry_remove_dir - recursively drop a directory entry from the securityfs * @fs_dir: aa_fs_entry (and all child entries) to detach (NOT NULL) */ -static void __init aafs_remove_dir(struct aa_fs_entry *fs_dir) +static void __init entry_remove_dir(struct aa_fs_entry *fs_dir) { struct aa_fs_entry *fs_file; for (fs_file = fs_dir->v.files; fs_file && fs_file->name; ++fs_file) { if (fs_file->v_type == AA_FS_TYPE_DIR) - aafs_remove_dir(fs_file); + entry_remove_dir(fs_file); else aafs_remove_file(fs_file); } @@ -1466,7 +1726,7 @@ static void __init aafs_remove_dir(struct aa_fs_entry *fs_dir) */ void __init aa_destroy_aafs(void) { - aafs_remove_dir(&aa_fs_entry); + entry_remove_dir(&aa_fs_entry); } @@ -1515,6 +1775,59 @@ out: return error; } + + +static const char *policy_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) +{ + struct aa_ns *ns; + struct path path; + + if (!dentry) + return ERR_PTR(-ECHILD); + ns = aa_get_current_ns(); + path.mnt = mntget(aafs_mnt); + path.dentry = dget(ns_dir(ns)); + nd_jump_link(&path); + aa_put_ns(ns); + + return NULL; +} + +static int ns_get_name(char *buf, size_t size, struct aa_ns *ns, + struct inode *inode) +{ + int res = snprintf(buf, size, "%s:[%lu]", AAFS_NAME, inode->i_ino); + + if (res < 0 || res >= size) + res = -ENOENT; + + return res; +} + +static int policy_readlink(struct dentry *dentry, char __user *buffer, + int buflen) +{ + struct aa_ns *ns; + char name[32]; + int res; + + ns = aa_get_current_ns(); + res = ns_get_name(name, sizeof(name), ns, d_inode(dentry)); + if (res >= 0) + res = readlink_copy(buffer, buflen, name); + aa_put_ns(ns); + + return res; +} + +static const struct inode_operations policy_link_iops = { + .readlink = policy_readlink, + .get_link = policy_get_link, +}; + + /** * aa_create_aafs - create the apparmor security filesystem * @@ -1535,8 +1848,14 @@ static int __init aa_create_aafs(void) return -EEXIST; } + /* setup apparmorfs used to virtualize policy/ */ + aafs_mnt = kern_mount(&aafs_ops); + if (IS_ERR(aafs_mnt)) + panic("can't set apparmorfs up\n"); + aafs_mnt->mnt_sb->s_flags &= ~MS_NOUSER; + /* Populate fs tree. */ - error = aafs_create_dir(&aa_fs_entry, NULL); + error = entry_create_dir(&aa_fs_entry, NULL); if (error) goto error; -- cgit v1.2.3 From daa6630a310fe2ad90ce5f7d2d196cd0353ef4fa Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Thu, 8 Jun 2017 10:37:45 +0200 Subject: openvswitch: warn about missing first netlink attribute The first netlink attribute (value 0) must always be defined as none/unspec. 
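As a hedged illustration of the convention being recommended here (the names below are hypothetical and not part of any existing uapi), a new attribute set would reserve value 0 like this:

enum ovs_example_attr {
	OVS_EXAMPLE_ATTR_UNSPEC,	/* value 0: reserved, never put on the wire */
	OVS_EXAMPLE_ATTR_ID,		/* u32 */
	OVS_EXAMPLE_ATTR_NAME,		/* string */
	__OVS_EXAMPLE_ATTR_MAX
};
#define OVS_EXAMPLE_ATTR_MAX (__OVS_EXAMPLE_ATTR_MAX - 1)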
Because we cannot change an existing UAPI, I add a comment to point the mistake and avoid to propagate it in a new ovs API in the future. Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 61b7d36dfe34..156ee4cab82e 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -343,6 +343,7 @@ enum ovs_key_attr { #define OVS_KEY_ATTR_MAX (__OVS_KEY_ATTR_MAX - 1) enum ovs_tunnel_key_attr { + /* OVS_TUNNEL_KEY_ATTR_NONE, standard nl API requires this attribute! */ OVS_TUNNEL_KEY_ATTR_ID, /* be64 Tunnel ID */ OVS_TUNNEL_KEY_ATTR_IPV4_SRC, /* be32 src IP address. */ OVS_TUNNEL_KEY_ATTR_IPV4_DST, /* be32 dst IP address. */ -- cgit v1.2.3 From 297fb414d0d190ca82bf0b46fb19d7fda1598737 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Thu, 8 Jun 2017 11:18:13 +0200 Subject: ethtool.h: remind to update 802.3ad when adding new speeds Each time a new speed is added, the bonding 802.3ad isn't updated. Add a comment to remind the developer to update this driver. Signed-off-by: Nicolas Dichtel Acked-by: Andy Gospodarek Signed-off-by: David S. Miller --- include/uapi/linux/ethtool.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index d179d7767f51..7d4a594d5d58 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -1486,8 +1486,10 @@ enum ethtool_link_mode_bit_indices { * it was forced up into this mode or autonegotiated. */ -/* The forced speed, in units of 1Mb. All values 0 to INT_MAX are legal. */ -/* Update drivers/net/phy/phy.c:phy_speed_to_str() when adding new values */ +/* The forced speed, in units of 1Mb. All values 0 to INT_MAX are legal. + * Update drivers/net/phy/phy.c:phy_speed_to_str() and + * drivers/net/bonding/bond_3ad.c:__get_link_speed() when adding new values. + */ #define SPEED_10 10 #define SPEED_100 100 #define SPEED_1000 1000 -- cgit v1.2.3 From a77395447b0aeab9473a066ff28fbee01130206b Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 8 Jun 2017 14:49:57 +0100 Subject: KEYS: DH: add __user annotations to keyctl_kdf_params Signed-off-by: Eric Biggers Signed-off-by: David Howells Acked-by: Stephan Mueller Signed-off-by: James Morris --- include/uapi/linux/keyctl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/keyctl.h b/include/uapi/linux/keyctl.h index 201c6644b237..ef16df06642a 100644 --- a/include/uapi/linux/keyctl.h +++ b/include/uapi/linux/keyctl.h @@ -70,8 +70,8 @@ struct keyctl_dh_params { }; struct keyctl_kdf_params { - char *hashname; - char *otherinfo; + char __user *hashname; + char __user *otherinfo; __u32 otherinfolen; __u32 __spare[8]; }; -- cgit v1.2.3 From 72ce5732eeca023abb04e40eb77a6bc1169d9b9d Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 7 Jun 2017 18:19:31 +0300 Subject: tty/serial: atmel: Remove AVR32 bits from the driver AVR32 is gone. Now it's time to clean up the driver by removing leftovers that was used by AVR32 related code. 
Signed-off-by: Andy Shevchenko Acked-by: Richard Genoud Signed-off-by: Greg Kroah-Hartman --- MAINTAINERS | 2 +- drivers/tty/serial/Kconfig | 16 ++++++++-------- drivers/tty/serial/atmel_serial.c | 20 +------------------- include/uapi/linux/serial_core.h | 2 +- 4 files changed, 11 insertions(+), 29 deletions(-) (limited to 'include/uapi/linux') diff --git a/MAINTAINERS b/MAINTAINERS index 7a28acd7f525..10c6faf7c3b6 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8438,7 +8438,7 @@ T: git git://git.monstr.eu/linux-2.6-microblaze.git S: Supported F: arch/microblaze/ -MICROCHIP / ATMEL AT91 / AT32 SERIAL DRIVER +MICROCHIP / ATMEL AT91 SERIAL DRIVER M: Richard Genoud S: Maintained F: drivers/tty/serial/atmel_serial.c diff --git a/drivers/tty/serial/Kconfig b/drivers/tty/serial/Kconfig index 5c8850f7a2a0..07812a7ea2a4 100644 --- a/drivers/tty/serial/Kconfig +++ b/drivers/tty/serial/Kconfig @@ -114,32 +114,32 @@ config SERIAL_SB1250_DUART_CONSOLE If unsure, say Y. config SERIAL_ATMEL - bool "AT91 / AT32 on-chip serial port support" + bool "AT91 on-chip serial port support" depends on HAS_DMA - depends on ARCH_AT91 || AVR32 || COMPILE_TEST + depends on ARCH_AT91 || COMPILE_TEST select SERIAL_CORE select SERIAL_MCTRL_GPIO if GPIOLIB help This enables the driver for the on-chip UARTs of the Atmel - AT91 and AT32 processors. + AT91 processors. config SERIAL_ATMEL_CONSOLE - bool "Support for console on AT91 / AT32 serial port" + bool "Support for console on AT91 serial port" depends on SERIAL_ATMEL=y select SERIAL_CORE_CONSOLE help Say Y here if you wish to use an on-chip UART on a Atmel - AT91 or AT32 processor as the system console (the system + AT91 processor as the system console (the system console is the device which receives all kernel messages and warnings and which allows logins in single user mode). config SERIAL_ATMEL_PDC - bool "Support DMA transfers on AT91 / AT32 serial port" + bool "Support DMA transfers on AT91 serial port" depends on SERIAL_ATMEL default y help Say Y here if you wish to use the PDC to do DMA transfers to - and from the Atmel AT91 / AT32 serial port. In order to + and from the Atmel AT91 serial port. In order to actually use DMA transfers, make sure that the use_dma_tx and use_dma_rx members in the atmel_uart_data struct is set appropriately for each port. @@ -152,7 +152,7 @@ config SERIAL_ATMEL_TTYAT bool "Install as device ttyATn instead of ttySn" depends on SERIAL_ATMEL=y help - Say Y here if you wish to have the internal AT91 / AT32 UARTs + Say Y here if you wish to have the internal AT91 UARTs appear as /dev/ttyATn (major 204, minor starting at 154) instead of the normal /dev/ttySn (major 4, minor starting at 64). This is necessary if you also want other UARTs, such as diff --git a/drivers/tty/serial/atmel_serial.c b/drivers/tty/serial/atmel_serial.c index d25f044158ff..a7909a5b60d8 100644 --- a/drivers/tty/serial/atmel_serial.c +++ b/drivers/tty/serial/atmel_serial.c @@ -1,5 +1,5 @@ /* - * Driver for Atmel AT91 / AT32 Serial ports + * Driver for Atmel AT91 Serial ports * Copyright (C) 2003 Rick Bronson * * Based on drivers/char/serial_sa1100.c, by Deep Blue Solutions Ltd. 
@@ -119,7 +119,6 @@ struct atmel_uart_char { /* * at91: 6 USARTs and one DBGU port (SAM9260) - * avr32: 4 * samx7: 3 USARTs and 5 UARTs */ #define ATMEL_MAX_UART 8 @@ -229,21 +228,6 @@ static inline void atmel_uart_writel(struct uart_port *port, u32 reg, u32 value) __raw_writel(value, port->membase + reg); } -#ifdef CONFIG_AVR32 - -/* AVR32 cannot handle 8 or 16bit I/O accesses but only 32bit I/O accesses */ -static inline u8 atmel_uart_read_char(struct uart_port *port) -{ - return __raw_readl(port->membase + ATMEL_US_RHR); -} - -static inline void atmel_uart_write_char(struct uart_port *port, u8 value) -{ - __raw_writel(value, port->membase + ATMEL_US_THR); -} - -#else - static inline u8 atmel_uart_read_char(struct uart_port *port) { return __raw_readb(port->membase + ATMEL_US_RHR); @@ -254,8 +238,6 @@ static inline void atmel_uart_write_char(struct uart_port *port, u8 value) __raw_writeb(value, port->membase + ATMEL_US_THR); } -#endif - #ifdef CONFIG_SERIAL_ATMEL_PDC static bool atmel_use_pdc_rx(struct uart_port *port) { diff --git a/include/uapi/linux/serial_core.h b/include/uapi/linux/serial_core.h index 9ec741b133fe..c34a2a3eeff5 100644 --- a/include/uapi/linux/serial_core.h +++ b/include/uapi/linux/serial_core.h @@ -83,7 +83,7 @@ /* Parisc type numbers. */ #define PORT_MUX 48 -/* Atmel AT91 / AT32 SoC */ +/* Atmel AT91 SoC */ #define PORT_ATMEL 49 /* Macintosh Zilog type numbers */ -- cgit v1.2.3 From 22ec656bcc3f38207ad5476ebad1e5005fb0f1ff Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 9 Jun 2017 11:02:40 -0400 Subject: dm: bump DM_VERSION_MINOR in response to target method error code changes Signed-off-by: Mike Snitzer Signed-off-by: Jens Axboe --- include/uapi/linux/dm-ioctl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/dm-ioctl.h b/include/uapi/linux/dm-ioctl.h index 4bf9f1eabffc..2f6c77aebe1a 100644 --- a/include/uapi/linux/dm-ioctl.h +++ b/include/uapi/linux/dm-ioctl.h @@ -267,9 +267,9 @@ enum { #define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl) #define DM_VERSION_MAJOR 4 -#define DM_VERSION_MINOR 35 +#define DM_VERSION_MINOR 36 #define DM_VERSION_PATCHLEVEL 0 -#define DM_VERSION_EXTRA "-ioctl (2016-06-23)" +#define DM_VERSION_EXTRA "-ioctl (2017-06-09)" /* Status bits */ #define DM_READONLY_FLAG (1 << 0) /* In/Out */ -- cgit v1.2.3 From ded092cd73c2c56a394b936f86897f29b2e131c0 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 11 Jun 2017 00:50:47 +0200 Subject: bpf: add bpf_set_hash helper for tc progs Allow for tc BPF programs to set a skb->hash, apart from clearing and triggering a recalc that we have right now. It allows for BPF to implement a custom hashing routine for skb_get_hash(). Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 8 +++++++- net/core/filter.c | 20 ++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 8 +++++++- 3 files changed, 34 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 9b2c10b45733..f94b48b168dc 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -513,6 +513,11 @@ union bpf_attr { * Get the owner uid of the socket stored inside sk_buff. * @skb: pointer to skb * Return: uid of the socket owner on success or overflowuid if failed. + * + * u32 bpf_set_hash(skb, hash) + * Set full skb->hash. 
+ * @skb: pointer to skb + * @hash: hash to set */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -562,7 +567,8 @@ union bpf_attr { FN(xdp_adjust_head), \ FN(probe_read_str), \ FN(get_socket_cookie), \ - FN(get_socket_uid), + FN(get_socket_uid), \ + FN(set_hash), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/net/core/filter.c b/net/core/filter.c index 4867391126e4..a65a3b25e104 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1874,6 +1874,24 @@ static const struct bpf_func_proto bpf_set_hash_invalid_proto = { .arg1_type = ARG_PTR_TO_CTX, }; +BPF_CALL_2(bpf_set_hash, struct sk_buff *, skb, u32, hash) +{ + /* Set user specified hash as L4(+), so that it gets returned + * on skb_get_hash() call unless BPF prog later on triggers a + * skb_clear_hash(). + */ + __skb_set_sw_hash(skb, hash, true); + return 0; +} + +static const struct bpf_func_proto bpf_set_hash_proto = { + .func = bpf_set_hash, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto, u16, vlan_tci) { @@ -2744,6 +2762,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) return &bpf_get_hash_recalc_proto; case BPF_FUNC_set_hash_invalid: return &bpf_set_hash_invalid_proto; + case BPF_FUNC_set_hash: + return &bpf_set_hash_proto; case BPF_FUNC_perf_event_output: return &bpf_skb_event_output_proto; case BPF_FUNC_get_smp_processor_id: diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 9b2c10b45733..f94b48b168dc 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -513,6 +513,11 @@ union bpf_attr { * Get the owner uid of the socket stored inside sk_buff. * @skb: pointer to skb * Return: uid of the socket owner on success or overflowuid if failed. + * + * u32 bpf_set_hash(skb, hash) + * Set full skb->hash. + * @skb: pointer to skb + * @hash: hash to set */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -562,7 +567,8 @@ union bpf_attr { FN(xdp_adjust_head), \ FN(probe_read_str), \ FN(get_socket_cookie), \ - FN(get_socket_uid), + FN(get_socket_uid), \ + FN(set_hash), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.2.3 From 91b5ab628929d97357108594610e7c07be93e2fd Mon Sep 17 00:00:00 2001 From: Eliad Peller Date: Fri, 9 Jun 2017 13:08:42 +0100 Subject: cfg80211: support 4-way handshake offloading for WPA/WPA2-PSK Let drivers advertise support for station-mode 4-way handshake offloading with a new NL80211_EXT_FEATURE_4WAY_HANDSHAKE_STA_PSK flag. Extend use of NL80211_ATTR_PMK attribute indicating it might be passed as part of NL80211_CMD_CONNECT command, and contain the PSK (which is the PMK, hence the name.) The driver/device is assumed to handle the 4-way handshake by itself in this case (including key derivations, etc.), instead of relying on the supplicant. This patch is somewhat based on this one (by Vladimir Kondratiev): https://patchwork.kernel.org/patch/1309561/. 
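As a rough driver-side sketch (not taken from the patch; drv_program_psk() and drv_connect_fallback() are placeholder names), a driver offering this offload would advertise the capability and pick the PSK up from the connect parameters:

    /* at wiphy registration time */
    wiphy_ext_feature_set(wiphy, NL80211_EXT_FEATURE_4WAY_HANDSHAKE_STA_PSK);

    /* in the driver's cfg80211 .connect handler */
    static int drv_connect(struct wiphy *wiphy, struct net_device *dev,
                           struct cfg80211_connect_params *sme)
    {
            if (sme->crypto.psk)
                    /* nl80211 has already checked that the attribute is
                     * WLAN_PMK_LEN bytes long and that the feature flag
                     * above is set for this wiphy
                     */
                    return drv_program_psk(dev, sme->crypto.psk, WLAN_PMK_LEN);

            /* no PSK supplied: let the host supplicant run the handshake */
            return drv_connect_fallback(dev, sme);
    }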
Signed-off-by: Vladimir Kondratiev Signed-off-by: Eliad Peller Signed-off-by: Luca Coelho [arend.vanspriel@broadcom.com rebase dealing with existing ATTR_PMK] Signed-off-by: Arend van Spriel [reword NL80211_EXT_FEATURE_4WAY_HANDSHAKE_STA_PSK docs to indicate that this offload might be required] Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 1 + include/net/cfg80211.h | 2 ++ include/uapi/linux/nl80211.h | 22 ++++++++++++++++++++-- net/wireless/nl80211.c | 9 +++++++++ 4 files changed, 32 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 69033353d0d1..e97ca3a9a67b 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -2401,6 +2401,7 @@ enum ieee80211_sa_query_action { #define WLAN_MAX_KEY_LEN 32 #define WLAN_PMKID_LEN 16 +#define WLAN_PMK_LEN 32 #define WLAN_OUI_WFA 0x506f9a #define WLAN_OUI_TYPE_WFA_P2P 9 diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index fa25fbb67cb6..1b288bac5d1a 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -649,6 +649,7 @@ struct survey_info { * @wep_keys: static WEP keys, if not NULL points to an array of * CFG80211_MAX_WEP_KEYS WEP keys * @wep_tx_key: key index (0..3) of the default TX static WEP key + * @psk: PSK (for devices supporting 4-way-handshake offload) */ struct cfg80211_crypto_settings { u32 wpa_versions; @@ -662,6 +663,7 @@ struct cfg80211_crypto_settings { bool control_port_no_encrypt; struct key_params *wep_keys; int wep_tx_key; + const u8 *psk; }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index b8c44b98f12d..f1f7da25bca4 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -172,6 +172,18 @@ * Multiple such rules can be created. */ +/** + * DOC: WPA/WPA2 EAPOL handshake offload + * + * By setting @NL80211_EXT_FEATURE_4WAY_HANDSHAKE_STA_PSK flag drivers + * can indicate they support offloading EAPOL handshakes for WPA/WPA2 + * preshared key authentication. In %NL80211_CMD_CONNECT the preshared + * key should be specified using %NL80211_ATTR_PMK. Drivers supporting + * this offload may reject the %NL80211_CMD_CONNECT when no preshared + * key material is provided, for example when that driver does not + * support setting the temporal keys through %CMD_NEW_KEY. + */ + /** * DOC: FILS shared key authentication offload * @@ -2080,8 +2092,10 @@ enum nl80211_commands { * identifying the scope of PMKSAs. This is used with * @NL80211_CMD_SET_PMKSA and @NL80211_CMD_DEL_PMKSA. * - * @NL80211_ATTR_PMK: PMK for the PMKSA identified by %NL80211_ATTR_PMKID. - * This is used with @NL80211_CMD_SET_PMKSA. + * @NL80211_ATTR_PMK: attribute for passing PMK key material. Used with + * %NL80211_CMD_SET_PMKSA for the PMKSA identified by %NL80211_ATTR_PMKID. + * For %NL80211_CMD_CONNECT it is used to provide PSK for offloading 4-way + * handshake for WPA/WPA2-PSK networks. * * @NL80211_ATTR_SCHED_SCAN_MULTI: flag attribute which user-space shall use to * indicate that it supports multiple active scheduled scan requests. @@ -4852,6 +4866,9 @@ enum nl80211_feature_flags { * RSSI threshold values to monitor rather than exactly one threshold. * @NL80211_EXT_FEATURE_FILS_SK_OFFLOAD: Driver SME supports FILS shared key * authentication with %NL80211_CMD_CONNECT. 
+ * @NL80211_EXT_FEATURE_4WAY_HANDSHAKE_STA_PSK: Device wants to do 4-way + * handshake with PSK in station mode (PSK is passed as part of the connect + * and associate commands), doing it in the host might not be supported. * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. @@ -4872,6 +4889,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_SCHED_SCAN_RELATIVE_RSSI, NL80211_EXT_FEATURE_CQM_RSSI_LIST, NL80211_EXT_FEATURE_FILS_SK_OFFLOAD, + NL80211_EXT_FEATURE_4WAY_HANDSHAKE_STA_PSK, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 9eb59196a378..2c6863aee4e4 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -8168,6 +8168,15 @@ static int nl80211_crypto_settings(struct cfg80211_registered_device *rdev, memcpy(settings->akm_suites, data, len); } + if (info->attrs[NL80211_ATTR_PMK]) { + if (nla_len(info->attrs[NL80211_ATTR_PMK]) != WLAN_PMK_LEN) + return -EINVAL; + if (!wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_4WAY_HANDSHAKE_STA_PSK)) + return -EINVAL; + settings->psk = nla_data(info->attrs[NL80211_ATTR_PMK]); + } + return 0; } -- cgit v1.2.3 From 3a00df5707b6af715e78c26569800e0c2eb615fe Mon Sep 17 00:00:00 2001 From: Avraham Stern Date: Fri, 9 Jun 2017 13:08:43 +0100 Subject: cfg80211: support 4-way handshake offloading for 802.1X Add API for setting the PMK to the driver. For FT support, allow setting also the PMK-R0 Name. This can be used by drivers that support 4-Way handshake offload while IEEE802.1X authentication is managed by upper layers. Signed-off-by: Avraham Stern Signed-off-by: Johannes Berg [arend.vanspriel@broadcom.com: add WANT_1X_4WAY_HS attribute] Signed-off-by: Arend van Spriel [reword NL80211_EXT_FEATURE_4WAY_HANDSHAKE_STA_1X docs a bit to say that the device may require it] Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 3 ++ include/net/cfg80211.h | 32 +++++++++++++ include/uapi/linux/nl80211.h | 39 +++++++++++++++- net/wireless/core.c | 5 +++ net/wireless/nl80211.c | 105 +++++++++++++++++++++++++++++++++++++++++++ net/wireless/rdev-ops.h | 25 +++++++++++ net/wireless/trace.h | 60 +++++++++++++++++++++++++ 7 files changed, 268 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index e97ca3a9a67b..34e1bcd2d7ff 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -2400,8 +2400,11 @@ enum ieee80211_sa_query_action { #define WLAN_MAX_KEY_LEN 32 +#define WLAN_PMK_NAME_LEN 16 #define WLAN_PMKID_LEN 16 +#define WLAN_PMK_LEN_EAP_LEAP 16 #define WLAN_PMK_LEN 32 +#define WLAN_PMK_LEN_SUITE_B_192 48 #define WLAN_OUI_WFA 0x506f9a #define WLAN_OUI_TYPE_WFA_P2P 9 diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 1b288bac5d1a..2174e51c6595 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2112,6 +2112,8 @@ struct cfg80211_bss_selection { * @fils_erp_rrk: ERP re-authentication Root Key (rRK) used to derive additional * keys in FILS or %NULL if not specified. * @fils_erp_rrk_len: Length of @fils_erp_rrk in octets. + * @want_1x: indicates user-space supports and wants to use 802.1X driver + * offload of 4-way handshake. 
*/ struct cfg80211_connect_params { struct ieee80211_channel *channel; @@ -2144,6 +2146,7 @@ struct cfg80211_connect_params { u16 fils_erp_next_seq_num; const u8 *fils_erp_rrk; size_t fils_erp_rrk_len; + bool want_1x; }; /** @@ -2565,6 +2568,23 @@ struct cfg80211_nan_func { u64 cookie; }; +/** + * struct cfg80211_pmk_conf - PMK configuration + * + * @aa: authenticator address + * @pmk_len: PMK length in bytes. + * @pmk: the PMK material + * @pmk_r0_name: PMK-R0 Name. NULL if not applicable (i.e., the PMK + * is not PMK-R0). When pmk_r0_name is not NULL, the pmk field + * holds PMK-R0. + */ +struct cfg80211_pmk_conf { + const u8 *aa; + u8 pmk_len; + const u8 *pmk; + const u8 *pmk_r0_name; +}; + /** * struct cfg80211_ops - backend description for wireless configuration * @@ -2881,6 +2901,13 @@ struct cfg80211_nan_func { * All other parameters must be ignored. * * @set_multicast_to_unicast: configure multicast to unicast conversion for BSS + * + * @set_pmk: configure the PMK to be used for offloaded 802.1X 4-Way handshake. + * If not deleted through @del_pmk the PMK remains valid until disconnect + * upon which the driver should clear it. + * (invoked with the wireless_dev mutex held) + * @del_pmk: delete the previously configured PMK for the given authenticator. + * (invoked with the wireless_dev mutex held) */ struct cfg80211_ops { int (*suspend)(struct wiphy *wiphy, struct cfg80211_wowlan *wow); @@ -3169,6 +3196,11 @@ struct cfg80211_ops { int (*set_multicast_to_unicast)(struct wiphy *wiphy, struct net_device *dev, const bool enabled); + + int (*set_pmk)(struct wiphy *wiphy, struct net_device *dev, + const struct cfg80211_pmk_conf *conf); + int (*del_pmk)(struct wiphy *wiphy, struct net_device *dev, + const u8 *aa); }; /* diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index f1f7da25bca4..073e26850195 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -182,6 +182,17 @@ * this offload may reject the %NL80211_CMD_CONNECT when no preshared * key material is provided, for example when that driver does not * support setting the temporal keys through %CMD_NEW_KEY. + * + * Similarly @NL80211_EXT_FEATURE_4WAY_HANDSHAKE_STA_1X flag can be + * set by drivers indicating offload support of the PTK/GTK EAPOL + * handshakes during 802.1X authentication. In order to use the offload + * the %NL80211_CMD_CONNECT should have %NL80211_ATTR_WANT_1X_4WAY_HS + * attribute flag. Drivers supporting this offload may reject the + * %NL80211_CMD_CONNECT when the attribute flag is not present. + * + * For 802.1X the PMK or PMK-R0 are set by providing %NL80211_ATTR_PMK + * using %NL80211_CMD_SET_PMK. For offloaded FT support also + * %NL80211_ATTR_PMKR0_NAME must be provided. */ /** @@ -959,6 +970,14 @@ * does not result in a change for the current association. Currently, * only the %NL80211_ATTR_IE data is used and updated with this command. * + * @NL80211_CMD_SET_PMK: For offloaded 4-Way handshake, set the PMK or PMK-R0 + * for the given authenticator address (specified with &NL80211_ATTR_MAC). + * When &NL80211_ATTR_PMKR0_NAME is set, &NL80211_ATTR_PMK specifies the + * PMK-R0, otherwise it specifies the PMK. + * @NL80211_CMD_DEL_PMK: For offloaded 4-Way handshake, delete the previously + * configured PMK for the authenticator address identified by + * &NL80211_ATTR_MAC. 
+ * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use */ @@ -1158,6 +1177,9 @@ enum nl80211_commands { NL80211_CMD_UPDATE_CONNECT_PARAMS, + NL80211_CMD_SET_PMK, + NL80211_CMD_DEL_PMK, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ @@ -2095,13 +2117,20 @@ enum nl80211_commands { * @NL80211_ATTR_PMK: attribute for passing PMK key material. Used with * %NL80211_CMD_SET_PMKSA for the PMKSA identified by %NL80211_ATTR_PMKID. * For %NL80211_CMD_CONNECT it is used to provide PSK for offloading 4-way - * handshake for WPA/WPA2-PSK networks. + * handshake for WPA/WPA2-PSK networks. For 802.1X authentication it is + * used with %NL80211_CMD_SET_PMK. For offloaded FT support this attribute + * specifies the PMK-R0 if NL80211_ATTR_PMKR0_NAME is included as well. * * @NL80211_ATTR_SCHED_SCAN_MULTI: flag attribute which user-space shall use to * indicate that it supports multiple active scheduled scan requests. * @NL80211_ATTR_SCHED_SCAN_MAX_REQS: indicates maximum number of scheduled * scan request that may be active for the device (u32). * + * @NL80211_ATTR_WANT_1X_4WAY_HS: flag attribute which user-space can include + * in %NL80211_CMD_CONNECT to indicate that for 802.1X authentication it + * wants to use the supported offload of the 4-way handshake. + * @NL80211_ATTR_PMKR0_NAME: PMK-R0 Name for offloaded FT. + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -2524,6 +2553,9 @@ enum nl80211_attrs { NL80211_ATTR_SCHED_SCAN_MULTI, NL80211_ATTR_SCHED_SCAN_MAX_REQS, + NL80211_ATTR_WANT_1X_4WAY_HS, + NL80211_ATTR_PMKR0_NAME, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -4869,6 +4901,10 @@ enum nl80211_feature_flags { * @NL80211_EXT_FEATURE_4WAY_HANDSHAKE_STA_PSK: Device wants to do 4-way * handshake with PSK in station mode (PSK is passed as part of the connect * and associate commands), doing it in the host might not be supported. + * @NL80211_EXT_FEATURE_4WAY_HANDSHAKE_STA_1X: Device wants to do doing 4-way + * handshake with 802.1X in station mode (will pass EAP frames to the host + * and accept the set_pmk/del_pmk commands), doing it in the host might not + * be supported. * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. 
@@ -4890,6 +4926,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_CQM_RSSI_LIST, NL80211_EXT_FEATURE_FILS_SK_OFFLOAD, NL80211_EXT_FEATURE_4WAY_HANDSHAKE_STA_PSK, + NL80211_EXT_FEATURE_4WAY_HANDSHAKE_STA_1X, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, diff --git a/net/wireless/core.c b/net/wireless/core.c index 83ea164f16b3..7b33e8c366bc 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -711,6 +711,11 @@ int wiphy_register(struct wiphy *wiphy) (wiphy->bss_select_support & ~(BIT(__NL80211_BSS_SELECT_ATTR_AFTER_LAST) - 2)))) return -EINVAL; + if (WARN_ON(wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_4WAY_HANDSHAKE_STA_1X) && + (!rdev->ops->set_pmk || !rdev->ops->del_pmk))) + return -EINVAL; + if (wiphy->addresses) memcpy(wiphy->perm_addr, wiphy->addresses[0].addr, ETH_ALEN); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 2c6863aee4e4..8148b01bcdd2 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -8881,6 +8881,12 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info) connect.privacy = info->attrs[NL80211_ATTR_PRIVACY]; + if (info->attrs[NL80211_ATTR_WANT_1X_4WAY_HS] && + !wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_4WAY_HANDSHAKE_STA_1X)) + return -EINVAL; + connect.want_1x = info->attrs[NL80211_ATTR_WANT_1X_4WAY_HS]; + err = nl80211_crypto_settings(rdev, info, &connect.crypto, NL80211_MAX_NR_CIPHER_SUITES); if (err) @@ -12265,6 +12271,90 @@ static int nl80211_set_multicast_to_unicast(struct sk_buff *skb, return rdev_set_multicast_to_unicast(rdev, dev, enabled); } +static int nl80211_set_pmk(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_pmk_conf pmk_conf = {}; + int ret; + + if (wdev->iftype != NL80211_IFTYPE_STATION && + wdev->iftype != NL80211_IFTYPE_P2P_CLIENT) + return -EOPNOTSUPP; + + if (!wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_4WAY_HANDSHAKE_STA_1X)) + return -EOPNOTSUPP; + + if (!info->attrs[NL80211_ATTR_MAC] || !info->attrs[NL80211_ATTR_PMK]) + return -EINVAL; + + wdev_lock(wdev); + if (!wdev->current_bss) { + ret = -ENOTCONN; + goto out; + } + + pmk_conf.aa = nla_data(info->attrs[NL80211_ATTR_MAC]); + if (memcmp(pmk_conf.aa, wdev->current_bss->pub.bssid, ETH_ALEN)) { + ret = -EINVAL; + goto out; + } + + pmk_conf.pmk = nla_data(info->attrs[NL80211_ATTR_PMK]); + pmk_conf.pmk_len = nla_len(info->attrs[NL80211_ATTR_PMK]); + if (pmk_conf.pmk_len != WLAN_PMK_LEN && + pmk_conf.pmk_len != WLAN_PMK_LEN_SUITE_B_192) { + ret = -EINVAL; + goto out; + } + + if (info->attrs[NL80211_ATTR_PMKR0_NAME]) { + int r0_name_len = nla_len(info->attrs[NL80211_ATTR_PMKR0_NAME]); + + if (r0_name_len != WLAN_PMK_NAME_LEN) { + ret = -EINVAL; + goto out; + } + + pmk_conf.pmk_r0_name = + nla_data(info->attrs[NL80211_ATTR_PMKR0_NAME]); + } + + ret = rdev_set_pmk(rdev, dev, &pmk_conf); +out: + wdev_unlock(wdev); + return ret; +} + +static int nl80211_del_pmk(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct wireless_dev *wdev = dev->ieee80211_ptr; + const u8 *aa; + int ret; + + if (wdev->iftype != NL80211_IFTYPE_STATION && + wdev->iftype != NL80211_IFTYPE_P2P_CLIENT) + return -EOPNOTSUPP; + + if (!wiphy_ext_feature_isset(&rdev->wiphy, + 
NL80211_EXT_FEATURE_4WAY_HANDSHAKE_STA_1X)) + return -EOPNOTSUPP; + + if (!info->attrs[NL80211_ATTR_MAC]) + return -EINVAL; + + wdev_lock(wdev); + aa = nla_data(info->attrs[NL80211_ATTR_MAC]); + ret = rdev_del_pmk(rdev, dev, aa); + wdev_unlock(wdev); + + return ret; +} + #define NL80211_FLAG_NEED_WIPHY 0x01 #define NL80211_FLAG_NEED_NETDEV 0x02 #define NL80211_FLAG_NEED_RTNL 0x04 @@ -13140,6 +13230,21 @@ static const struct genl_ops nl80211_ops[] = { .internal_flags = NL80211_FLAG_NEED_NETDEV | NL80211_FLAG_NEED_RTNL, }, + { + .cmd = NL80211_CMD_SET_PMK, + .doit = nl80211_set_pmk, + .policy = nl80211_policy, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_DEL_PMK, + .doit = nl80211_del_pmk, + .policy = nl80211_policy, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, + }; static struct genl_family nl80211_fam __ro_after_init = { diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index 0598c1e5d0ad..ce23d7d49960 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -1164,4 +1164,29 @@ rdev_set_coalesce(struct cfg80211_registered_device *rdev, trace_rdev_return_int(&rdev->wiphy, ret); return ret; } + +static inline int rdev_set_pmk(struct cfg80211_registered_device *rdev, + struct net_device *dev, + struct cfg80211_pmk_conf *pmk_conf) +{ + int ret = -EOPNOTSUPP; + + trace_rdev_set_pmk(&rdev->wiphy, dev, pmk_conf); + if (rdev->ops->set_pmk) + ret = rdev->ops->set_pmk(&rdev->wiphy, dev, pmk_conf); + trace_rdev_return_int(&rdev->wiphy, ret); + return ret; +} + +static inline int rdev_del_pmk(struct cfg80211_registered_device *rdev, + struct net_device *dev, const u8 *aa) +{ + int ret = -EOPNOTSUPP; + + trace_rdev_del_pmk(&rdev->wiphy, dev, aa); + if (rdev->ops->del_pmk) + ret = rdev->ops->del_pmk(&rdev->wiphy, dev, aa); + trace_rdev_return_int(&rdev->wiphy, ret); + return ret; +} #endif /* __CFG80211_RDEV_OPS */ diff --git a/net/wireless/trace.h b/net/wireless/trace.h index ca8b2059f92c..0f8db41eaddb 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -2258,6 +2258,66 @@ TRACE_EVENT(rdev_tdls_cancel_channel_switch, WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(addr)) ); +TRACE_EVENT(rdev_set_pmk, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + struct cfg80211_pmk_conf *pmk_conf), + + TP_ARGS(wiphy, netdev, pmk_conf), + + TP_STRUCT__entry( + WIPHY_ENTRY + NETDEV_ENTRY + MAC_ENTRY(aa) + __field(u8, pmk_len) + __field(u8, pmk_r0_name_len) + __dynamic_array(u8, pmk, pmk_conf->pmk_len) + __dynamic_array(u8, pmk_r0_name, WLAN_PMK_NAME_LEN) + ), + + TP_fast_assign( + WIPHY_ASSIGN; + NETDEV_ASSIGN; + MAC_ASSIGN(aa, pmk_conf->aa); + __entry->pmk_len = pmk_conf->pmk_len; + __entry->pmk_r0_name_len = + pmk_conf->pmk_r0_name ? WLAN_PMK_NAME_LEN : 0; + memcpy(__get_dynamic_array(pmk), pmk_conf->pmk, + pmk_conf->pmk_len); + memcpy(__get_dynamic_array(pmk_r0_name), pmk_conf->pmk_r0_name, + pmk_conf->pmk_r0_name ? WLAN_PMK_NAME_LEN : 0); + ), + + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " MAC_PR_FMT + "pmk_len=%u, pmk: %s pmk_r0_name: %s", WIPHY_PR_ARG, + NETDEV_PR_ARG, MAC_PR_ARG(aa), __entry->pmk_len, + __print_array(__get_dynamic_array(pmk), + __get_dynamic_array_len(pmk), 1), + __entry->pmk_r0_name_len ? 
+ __print_array(__get_dynamic_array(pmk_r0_name), + __get_dynamic_array_len(pmk_r0_name), 1) : "") +); + +TRACE_EVENT(rdev_del_pmk, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, const u8 *aa), + + TP_ARGS(wiphy, netdev, aa), + + TP_STRUCT__entry( + WIPHY_ENTRY + NETDEV_ENTRY + MAC_ENTRY(aa) + ), + + TP_fast_assign( + WIPHY_ASSIGN; + NETDEV_ASSIGN; + MAC_ASSIGN(aa, aa); + ), + + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " MAC_PR_FMT, + WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(aa)) +); + /************************************************************* * cfg80211 exported functions traces * *************************************************************/ -- cgit v1.2.3 From c01b244ad848ac7f0faa141182db80650a8a761a Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Mon, 5 Jun 2017 10:28:01 -0400 Subject: USB: add usbfs ioctl to retrieve the connection speed The usbfs interface does not provide any way for the user to learn the speed at which a device is connected. The current API includes a USBDEVFS_CONNECTINFO ioctl, but all it provides is the device's address and a one-bit value indicating whether the connection is low speed. That may have sufficed in the era of USB-1.1, but it isn't good enough today. This patch introduces a new ioctl, USBDEVFS_GET_SPEED, which returns a numeric value indicating the speed of the connection: unknown, low, full, high, wireless, super, or super-plus. Similar information (not exactly the same) is available through sysfs, but it seems reasonable to provide the actual value in usbfs. Signed-off-by: Alan Stern Reported-by: Reinhard Huck Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/devio.c | 3 +++ include/uapi/linux/usbdevice_fs.h | 6 ++++++ 2 files changed, 9 insertions(+) (limited to 'include/uapi/linux') diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c index 8e6ef671be9b..0e7d0e81a7cb 100644 --- a/drivers/usb/core/devio.c +++ b/drivers/usb/core/devio.c @@ -2537,6 +2537,9 @@ static long usbdev_do_ioctl(struct file *file, unsigned int cmd, case USBDEVFS_DROP_PRIVILEGES: ret = proc_drop_privileges(ps, p); break; + case USBDEVFS_GET_SPEED: + ret = ps->dev->speed; + break; } done: diff --git a/include/uapi/linux/usbdevice_fs.h b/include/uapi/linux/usbdevice_fs.h index a8653a6f40df..0bbfd4abd2e3 100644 --- a/include/uapi/linux/usbdevice_fs.h +++ b/include/uapi/linux/usbdevice_fs.h @@ -156,6 +156,11 @@ struct usbdevfs_streams { unsigned char eps[0]; }; +/* + * USB_SPEED_* values returned by USBDEVFS_GET_SPEED are defined in + * linux/usb/ch9.h + */ + #define USBDEVFS_CONTROL _IOWR('U', 0, struct usbdevfs_ctrltransfer) #define USBDEVFS_CONTROL32 _IOWR('U', 0, struct usbdevfs_ctrltransfer32) #define USBDEVFS_BULK _IOWR('U', 2, struct usbdevfs_bulktransfer) @@ -190,5 +195,6 @@ struct usbdevfs_streams { #define USBDEVFS_ALLOC_STREAMS _IOR('U', 28, struct usbdevfs_streams) #define USBDEVFS_FREE_STREAMS _IOR('U', 29, struct usbdevfs_streams) #define USBDEVFS_DROP_PRIVILEGES _IOW('U', 30, __u32) +#define USBDEVFS_GET_SPEED _IO('U', 31) #endif /* _UAPI_LINUX_USBDEVICE_FS_H */ -- cgit v1.2.3 From f45cbe6e691fcdeda480ecc9c66533a8277f0ca4 Mon Sep 17 00:00:00 2001 From: Avraham Stern Date: Fri, 9 Jun 2017 13:08:45 +0100 Subject: nl80211: add authorized flag to ROAM event Drivers that initiate roaming while being connected to a network that uses 802.1X authentication need to inform user space if 802.1X authentication is further required after roaming. 
For example, when using the Fast transition protocol, roaming within the mobility domain does not require new 802.1X authentication, but roaming to another mobility domain does. In addition, some drivers may not support 802.1X authentication (so it has to be done in user space), while other drivers do. Add a flag to the roaming notification to indicate if user space is required to do 802.1X authentication after the roaming or not. This flag will only be used for networks that use 802.1X authentication. For networks that do not use 802.1X authentication it is assumed that no further action is required from user space after the roaming notification. Signed-off-by: Avraham Stern Signed-off-by: Luca Coelho [arend.vanspriel@broadcom.com reuse NL80211_ATTR_PORT_AUTHORIZED] Signed-off-by: Arend van Spriel [rebase to apply w/o the flag in CONNECT] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 4 ++++ include/uapi/linux/nl80211.h | 11 +++++++++++ net/wireless/nl80211.c | 4 +++- net/wireless/sme.c | 1 + 4 files changed, 19 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 2174e51c6595..f12fa5245a45 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -5441,6 +5441,9 @@ cfg80211_connect_timeout(struct net_device *dev, const u8 *bssid, * @req_ie_len: association request IEs length * @resp_ie: association response IEs (may be %NULL) * @resp_ie_len: assoc response IEs length + * @authorized: true if the 802.1X authentication was done by the driver or is + * not needed (e.g., when Fast Transition protocol was used), false + * otherwise. Ignored for networks that don't use 802.1X authentication. */ struct cfg80211_roam_info { struct ieee80211_channel *channel; @@ -5450,6 +5453,7 @@ struct cfg80211_roam_info { size_t req_ie_len; const u8 *resp_ie; size_t resp_ie_len; + bool authorized; }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 073e26850195..72f15c3fc5a6 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -571,6 +571,12 @@ * well to remain backwards compatible. * @NL80211_CMD_ROAM: request that the card roam (currently not implemented), * sent as an event when the card/driver roamed by itself. + * When used as an event, and the driver roamed in a network that requires + * 802.1X authentication, %NL80211_ATTR_PORT_AUTHORIZED should be set + * if the 802.1X authentication was done by the driver or if roaming was + * done using Fast Transition protocol (in which case 802.1X authentication + * is not needed). If %NL80211_ATTR_PORT_AUTHORIZED is not set, user space + * is responsible for the 802.1X authentication. * @NL80211_CMD_DISCONNECT: drop a given connection; also used to notify * userspace that a connection was dropped by the AP or due to other * reasons, for this the %NL80211_ATTR_DISCONNECTED_BY_AP and @@ -2130,6 +2136,10 @@ enum nl80211_commands { * in %NL80211_CMD_CONNECT to indicate that for 802.1X authentication it * wants to use the supported offload of the 4-way handshake. * @NL80211_ATTR_PMKR0_NAME: PMK-R0 Name for offloaded FT. + * @NL80211_ATTR_PORT_AUTHORIZED: flag attribute used in %NL80211_CMD_ROAMED + * notification indicating that that 802.1X authentication was done by + * the driver or is not needed (because roaming used the Fast Transition + * protocol). 
* * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined @@ -2555,6 +2565,7 @@ enum nl80211_attrs { NL80211_ATTR_WANT_1X_4WAY_HS, NL80211_ATTR_PMKR0_NAME, + NL80211_ATTR_PORT_AUTHORIZED, /* add attributes here, update the policy in nl80211.c */ diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 8148b01bcdd2..5487cd775b6f 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -13800,7 +13800,9 @@ void nl80211_send_roamed(struct cfg80211_registered_device *rdev, info->req_ie)) || (info->resp_ie && nla_put(msg, NL80211_ATTR_RESP_IE, info->resp_ie_len, - info->resp_ie))) + info->resp_ie)) || + (info->authorized && + nla_put_flag(msg, NL80211_ATTR_PORT_AUTHORIZED))) goto nla_put_failure; genlmsg_end(msg, hdr); diff --git a/net/wireless/sme.c b/net/wireless/sme.c index 532a0007ce82..0a49b88070d0 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -960,6 +960,7 @@ void cfg80211_roamed(struct net_device *dev, struct cfg80211_roam_info *info, ev->rm.resp_ie_len = info->resp_ie_len; memcpy((void *)ev->rm.resp_ie, info->resp_ie, info->resp_ie_len); ev->rm.bss = info->bss; + ev->rm.authorized = info->authorized; spin_lock_irqsave(&wdev->event_lock, flags); list_add_tail(&ev->list, &wdev->event_list); -- cgit v1.2.3 From ea304a99b06e6c05a61c85f05c75aac6ff545806 Mon Sep 17 00:00:00 2001 From: Arend Van Spriel Date: Fri, 9 Jun 2017 13:08:46 +0100 Subject: nl80211: remove desciption about request from NL80211_CMD_ROAM The description of NL80211_CMD_ROAM indicated possibility for a request to roam issued by user-space. However, it also states that as not being implemented right now. This has been so since commit b23aa676ab9d ("cfg80211: connect/disconnect API") added in 2009. So it seems safe to assume it will not be added any time soon and thus remove it. Signed-off-by: Arend van Spriel Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 72f15c3fc5a6..828aa4703e22 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -569,14 +569,13 @@ * authentication/association or not receiving a response from the AP. * Non-zero %NL80211_ATTR_STATUS_CODE value is indicated in that case as * well to remain backwards compatible. - * @NL80211_CMD_ROAM: request that the card roam (currently not implemented), - * sent as an event when the card/driver roamed by itself. - * When used as an event, and the driver roamed in a network that requires - * 802.1X authentication, %NL80211_ATTR_PORT_AUTHORIZED should be set - * if the 802.1X authentication was done by the driver or if roaming was - * done using Fast Transition protocol (in which case 802.1X authentication - * is not needed). If %NL80211_ATTR_PORT_AUTHORIZED is not set, user space - * is responsible for the 802.1X authentication. + * @NL80211_CMD_ROAM: notifcation indicating the card/driver roamed by itself. + * When the driver roamed in a network that requires 802.1X authentication, + * %NL80211_ATTR_PORT_AUTHORIZED should be set if the 802.1X authentication + * was done by the driver or if roaming was done using Fast Transition + * protocol (in which case 802.1X authentication is not needed). If + * %NL80211_ATTR_PORT_AUTHORIZED is not set, user space is responsible for + * the 802.1X authentication. 
* @NL80211_CMD_DISCONNECT: drop a given connection; also used to notify * userspace that a connection was dropped by the AP or due to other * reasons, for this the %NL80211_ATTR_DISCONNECTED_BY_AP and -- cgit v1.2.3 From be4c9acfe2976b6e024d15656254d2eb207b83a8 Mon Sep 17 00:00:00 2001 From: Stefan Berger Date: Wed, 24 May 2017 17:39:40 -0400 Subject: tpm: vtpm_proxy: Implement request_locality function. Implement the request_locality function. To set the locality on the backend we define vendor-specific TPM 1.2 and TPM 2 ordinals and send a command to the backend to set the locality for the next commands. To avoid recursing into requesting the locality, we set the TPM_TRANSMIT_RAW flag when calling tpm_transmit_cmd. To avoid recursing into TPM 2 space related commands, we set the space parameter to NULL. Signed-off-by: Stefan Berger Reviewed-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen --- drivers/char/tpm/tpm-interface.c | 1 + drivers/char/tpm/tpm_vtpm_proxy.c | 36 ++++++++++++++++++++++++++++++++++++ include/uapi/linux/vtpm_proxy.h | 4 ++++ 3 files changed, 41 insertions(+) (limited to 'include/uapi/linux') diff --git a/drivers/char/tpm/tpm-interface.c b/drivers/char/tpm/tpm-interface.c index 8ef5e1723efb..d2b4df6d9894 100644 --- a/drivers/char/tpm/tpm-interface.c +++ b/drivers/char/tpm/tpm-interface.c @@ -538,6 +538,7 @@ ssize_t tpm_transmit_cmd(struct tpm_chip *chip, struct tpm_space *space, return 0; } +EXPORT_SYMBOL_GPL(tpm_transmit_cmd); #define TPM_DIGEST_SIZE 20 #define TPM_RET_CODE_IDX 6 diff --git a/drivers/char/tpm/tpm_vtpm_proxy.c b/drivers/char/tpm/tpm_vtpm_proxy.c index 751059d2140a..66024bf92097 100644 --- a/drivers/char/tpm/tpm_vtpm_proxy.c +++ b/drivers/char/tpm/tpm_vtpm_proxy.c @@ -371,6 +371,41 @@ static bool vtpm_proxy_tpm_req_canceled(struct tpm_chip *chip, u8 status) return ret; } +static int vtpm_proxy_request_locality(struct tpm_chip *chip, int locality) +{ + struct tpm_buf buf; + int rc; + const struct tpm_output_header *header; + + if (chip->flags & TPM_CHIP_FLAG_TPM2) + rc = tpm_buf_init(&buf, TPM2_ST_SESSIONS, + TPM2_CC_SET_LOCALITY); + else + rc = tpm_buf_init(&buf, TPM_TAG_RQU_COMMAND, + TPM_ORD_SET_LOCALITY); + if (rc) + return rc; + tpm_buf_append_u8(&buf, locality); + + rc = tpm_transmit_cmd(chip, NULL, buf.data, tpm_buf_length(&buf), 0, + TPM_TRANSMIT_UNLOCKED | TPM_TRANSMIT_RAW, + "attempting to set locality"); + if (rc < 0) { + locality = rc; + goto out; + } + + header = (const struct tpm_output_header *)buf.data; + rc = be32_to_cpu(header->return_code); + if (rc) + locality = -1; + +out: + tpm_buf_destroy(&buf); + + return locality; +} + static const struct tpm_class_ops vtpm_proxy_tpm_ops = { .flags = TPM_OPS_AUTO_STARTUP, .recv = vtpm_proxy_tpm_op_recv, @@ -380,6 +415,7 @@ static const struct tpm_class_ops vtpm_proxy_tpm_ops = { .req_complete_mask = VTPM_PROXY_REQ_COMPLETE_FLAG, .req_complete_val = VTPM_PROXY_REQ_COMPLETE_FLAG, .req_canceled = vtpm_proxy_tpm_req_canceled, + .request_locality = vtpm_proxy_request_locality, }; /* diff --git a/include/uapi/linux/vtpm_proxy.h b/include/uapi/linux/vtpm_proxy.h index a69e991eb080..58ac73cd38fe 100644 --- a/include/uapi/linux/vtpm_proxy.h +++ b/include/uapi/linux/vtpm_proxy.h @@ -46,4 +46,8 @@ struct vtpm_proxy_new_dev { #define VTPM_PROXY_IOC_NEW_DEV _IOWR(0xa1, 0x00, struct vtpm_proxy_new_dev) +/* vendor specific commands to set locality */ +#define TPM2_CC_SET_LOCALITY 0x20001000 +#define TPM_ORD_SET_LOCALITY 0x20001000 + #endif /* _UAPI_LINUX_VTPM_PROXY_H */ -- cgit v1.2.3 From 
734942cc4ea6478eed125af258da1bdbb4afe578 Mon Sep 17 00:00:00 2001 From: Dave Watson Date: Wed, 14 Jun 2017 11:37:14 -0700 Subject: tcp: ULP infrastructure Add the infrustructure for attaching Upper Layer Protocols (ULPs) over TCP sockets. Based on a similar infrastructure in tcp_cong. The idea is that any ULP can add its own logic by changing the TCP proto_ops structure to its own methods. Example usage: setsockopt(sock, SOL_TCP, TCP_ULP, "tls", sizeof("tls")); modules will call: tcp_register_ulp(&tcp_tls_ulp_ops); to register/unregister their ulp, with an init function and name. A list of registered ulps will be returned by tcp_get_available_ulp, which is hooked up to /proc. Example: $ cat /proc/sys/net/ipv4/tcp_available_ulp tls There is currently no functionality to remove or chain ULPs, but it should be possible to add these in the future if needed. Signed-off-by: Boris Pismenny Signed-off-by: Dave Watson Signed-off-by: David S. Miller --- include/net/inet_connection_sock.h | 4 ++ include/net/tcp.h | 25 +++++++ include/uapi/linux/tcp.h | 1 + net/ipv4/Makefile | 2 +- net/ipv4/sysctl_net_ipv4.c | 25 +++++++ net/ipv4/tcp.c | 28 ++++++++ net/ipv4/tcp_ipv4.c | 2 + net/ipv4/tcp_ulp.c | 134 +++++++++++++++++++++++++++++++++++++ 8 files changed, 220 insertions(+), 1 deletion(-) create mode 100644 net/ipv4/tcp_ulp.c (limited to 'include/uapi/linux') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index c7a577976bec..13e4c89a8231 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -75,6 +75,8 @@ struct inet_connection_sock_af_ops { * @icsk_pmtu_cookie Last pmtu seen by socket * @icsk_ca_ops Pluggable congestion control hook * @icsk_af_ops Operations which are AF_INET{4,6} specific + * @icsk_ulp_ops Pluggable ULP control hook + * @icsk_ulp_data ULP private data * @icsk_ca_state: Congestion control state * @icsk_retransmits: Number of unrecovered [RTO] timeouts * @icsk_pending: Scheduled timer event @@ -97,6 +99,8 @@ struct inet_connection_sock { __u32 icsk_pmtu_cookie; const struct tcp_congestion_ops *icsk_ca_ops; const struct inet_connection_sock_af_ops *icsk_af_ops; + const struct tcp_ulp_ops *icsk_ulp_ops; + void *icsk_ulp_data; unsigned int (*icsk_sync_mss)(struct sock *sk, u32 pmtu); __u8 icsk_ca_state:6, icsk_ca_setsockopt:1, diff --git a/include/net/tcp.h b/include/net/tcp.h index 3ab677d11d02..b439f46f149c 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1991,4 +1991,29 @@ static inline void tcp_listendrop(const struct sock *sk) enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer); +/* + * Interface for adding Upper Level Protocols over TCP + */ + +#define TCP_ULP_NAME_MAX 16 +#define TCP_ULP_MAX 128 +#define TCP_ULP_BUF_MAX (TCP_ULP_NAME_MAX*TCP_ULP_MAX) + +struct tcp_ulp_ops { + struct list_head list; + + /* initialize ulp */ + int (*init)(struct sock *sk); + /* cleanup ulp */ + void (*release)(struct sock *sk); + + char name[TCP_ULP_NAME_MAX]; + struct module *owner; +}; +int tcp_register_ulp(struct tcp_ulp_ops *type); +void tcp_unregister_ulp(struct tcp_ulp_ops *type); +int tcp_set_ulp(struct sock *sk, const char *name); +void tcp_get_available_ulp(char *buf, size_t len); +void tcp_cleanup_ulp(struct sock *sk); + #endif /* _TCP_H */ diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 38a2b07afdff..8204dcebc6f3 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -117,6 +117,7 @@ enum { #define TCP_SAVED_SYN 28 /* Get SYN headers recorded for connection */ 
#define TCP_REPAIR_WINDOW 29 /* Get/set window parameters */ #define TCP_FASTOPEN_CONNECT 30 /* Attempt FastOpen with connect */ +#define TCP_ULP 31 /* Attach a ULP to a TCP connection */ struct tcp_repair_opt { __u32 opt_code; diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index f83de23a30e7..afcb435adfbe 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -8,7 +8,7 @@ obj-y := route.o inetpeer.o protocol.o \ inet_timewait_sock.o inet_connection_sock.o \ tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \ - tcp_rate.o tcp_recovery.o \ + tcp_rate.o tcp_recovery.o tcp_ulp.o \ tcp_offload.o datagram.o raw.o udp.o udplite.o \ udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \ fib_frontend.o fib_semantics.o fib_trie.o fib_notifier.o \ diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 7065234a89a5..9bf809726066 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -360,6 +360,25 @@ static int proc_tfo_blackhole_detect_timeout(struct ctl_table *table, ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (write && ret == 0) tcp_fastopen_active_timeout_reset(); + + return ret; +} + +static int proc_tcp_available_ulp(struct ctl_table *ctl, + int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + struct ctl_table tbl = { .maxlen = TCP_ULP_BUF_MAX, }; + int ret; + + tbl.data = kmalloc(tbl.maxlen, GFP_USER); + if (!tbl.data) + return -ENOMEM; + tcp_get_available_ulp(tbl.data, TCP_ULP_BUF_MAX); + ret = proc_dostring(&tbl, write, buffer, lenp, ppos); + kfree(tbl.data); + return ret; } @@ -685,6 +704,12 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec_ms_jiffies, }, + { + .procname = "tcp_available_ulp", + .maxlen = TCP_ULP_BUF_MAX, + .mode = 0444, + .proc_handler = proc_tcp_available_ulp, + }, { .procname = "icmp_msgs_per_sec", .data = &sysctl_icmp_msgs_per_sec, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index cc8fd8b747a4..b06ee3086a0e 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2482,6 +2482,24 @@ static int do_tcp_setsockopt(struct sock *sk, int level, release_sock(sk); return err; } + case TCP_ULP: { + char name[TCP_ULP_NAME_MAX]; + + if (optlen < 1) + return -EINVAL; + + val = strncpy_from_user(name, optval, + min_t(long, TCP_ULP_NAME_MAX - 1, + optlen)); + if (val < 0) + return -EFAULT; + name[val] = 0; + + lock_sock(sk); + err = tcp_set_ulp(sk, name); + release_sock(sk); + return err; + } default: /* fallthru */ break; @@ -3038,6 +3056,16 @@ static int do_tcp_getsockopt(struct sock *sk, int level, return -EFAULT; return 0; + case TCP_ULP: + if (get_user(len, optlen)) + return -EFAULT; + len = min_t(unsigned int, len, TCP_ULP_NAME_MAX); + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, icsk->icsk_ulp_ops->name, len)) + return -EFAULT; + return 0; + case TCP_THIN_LINEAR_TIMEOUTS: val = tp->thin_lto; break; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 1dc8c449e16a..eec2ff907279 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1860,6 +1860,8 @@ void tcp_v4_destroy_sock(struct sock *sk) tcp_cleanup_congestion_control(sk); + tcp_cleanup_ulp(sk); + /* Cleanup up the write buffer. */ tcp_write_queue_purge(sk); diff --git a/net/ipv4/tcp_ulp.c b/net/ipv4/tcp_ulp.c new file mode 100644 index 000000000000..e855ea70819b --- /dev/null +++ b/net/ipv4/tcp_ulp.c @@ -0,0 +1,134 @@ +/* + * Pluggable TCP upper layer protocol support. 
+ * + * Copyright (c) 2016-2017, Mellanox Technologies. All rights reserved. + * Copyright (c) 2016-2017, Dave Watson . All rights reserved. + * + */ + +#include +#include +#include +#include +#include +#include + +static DEFINE_SPINLOCK(tcp_ulp_list_lock); +static LIST_HEAD(tcp_ulp_list); + +/* Simple linear search, don't expect many entries! */ +static struct tcp_ulp_ops *tcp_ulp_find(const char *name) +{ + struct tcp_ulp_ops *e; + + list_for_each_entry_rcu(e, &tcp_ulp_list, list) { + if (strcmp(e->name, name) == 0) + return e; + } + + return NULL; +} + +static const struct tcp_ulp_ops *__tcp_ulp_find_autoload(const char *name) +{ + const struct tcp_ulp_ops *ulp = NULL; + + rcu_read_lock(); + ulp = tcp_ulp_find(name); + +#ifdef CONFIG_MODULES + if (!ulp && capable(CAP_NET_ADMIN)) { + rcu_read_unlock(); + request_module("%s", name); + rcu_read_lock(); + ulp = tcp_ulp_find(name); + } +#endif + if (!ulp || !try_module_get(ulp->owner)) + ulp = NULL; + + rcu_read_unlock(); + return ulp; +} + +/* Attach new upper layer protocol to the list + * of available protocols. + */ +int tcp_register_ulp(struct tcp_ulp_ops *ulp) +{ + int ret = 0; + + spin_lock(&tcp_ulp_list_lock); + if (tcp_ulp_find(ulp->name)) { + pr_notice("%s already registered or non-unique name\n", + ulp->name); + ret = -EEXIST; + } else { + list_add_tail_rcu(&ulp->list, &tcp_ulp_list); + } + spin_unlock(&tcp_ulp_list_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(tcp_register_ulp); + +void tcp_unregister_ulp(struct tcp_ulp_ops *ulp) +{ + spin_lock(&tcp_ulp_list_lock); + list_del_rcu(&ulp->list); + spin_unlock(&tcp_ulp_list_lock); + + synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(tcp_unregister_ulp); + +/* Build string with list of available upper layer protocl values */ +void tcp_get_available_ulp(char *buf, size_t maxlen) +{ + struct tcp_ulp_ops *ulp_ops; + size_t offs = 0; + + rcu_read_lock(); + list_for_each_entry_rcu(ulp_ops, &tcp_ulp_list, list) { + offs += snprintf(buf + offs, maxlen - offs, + "%s%s", + offs == 0 ? "" : " ", ulp_ops->name); + } + rcu_read_unlock(); +} + +void tcp_cleanup_ulp(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + if (!icsk->icsk_ulp_ops) + return; + + if (icsk->icsk_ulp_ops->release) + icsk->icsk_ulp_ops->release(sk); + module_put(icsk->icsk_ulp_ops->owner); +} + +/* Change upper layer protocol for socket */ +int tcp_set_ulp(struct sock *sk, const char *name) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + const struct tcp_ulp_ops *ulp_ops; + int err = 0; + + if (icsk->icsk_ulp_ops) + return -EEXIST; + + ulp_ops = __tcp_ulp_find_autoload(name); + if (!ulp_ops) + err = -ENOENT; + else + err = ulp_ops->init(sk); + + if (err) + goto out; + + icsk->icsk_ulp_ops = ulp_ops; + out: + return err; +} -- cgit v1.2.3 From 3c4d7559159bfe1e3b94df3a657b2cda3a34e218 Mon Sep 17 00:00:00 2001 From: Dave Watson Date: Wed, 14 Jun 2017 11:37:39 -0700 Subject: tls: kernel TLS support Software implementation of transport layer security, implemented using ULP infrastructure. tcp proto_ops are replaced with tls equivalents of sendmsg and sendpage. Only symmetric crypto is done in the kernel, keys are passed by setsockopt after the handshake is complete. All control messages are supported via CMSG data - the actual symmetric encryption is the same, just the message type needs to be passed separately. For user API, please see Documentation patch. 
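A minimal userspace sketch of that API, assuming the uapi headers from this series are installed (otherwise TCP_ULP and SOL_TLS must be defined locally, as guarded below) and using dummy key material in place of the values negotiated during the TLS handshake:

    #include <string.h>
    #include <sys/socket.h>
    #include <netinet/tcp.h>
    #include <linux/tls.h>

    #ifndef TCP_ULP
    #define TCP_ULP 31      /* from the linux/tcp.h change in this series */
    #endif
    #ifndef SOL_TLS
    #define SOL_TLS 282     /* from the linux/socket.h change in this series */
    #endif

    static int enable_ktls_tx(int sock)
    {
            struct tls12_crypto_info_aes_gcm_128 ci = {
                    .info.version = TLS_1_2_VERSION,
                    .info.cipher_type = TLS_CIPHER_AES_GCM_128,
            };

            /* dummy key/IV/salt/record sequence, for illustration only */
            memset(ci.key, 0x11, TLS_CIPHER_AES_GCM_128_KEY_SIZE);
            memset(ci.iv, 0x22, TLS_CIPHER_AES_GCM_128_IV_SIZE);
            memset(ci.salt, 0x33, TLS_CIPHER_AES_GCM_128_SALT_SIZE);
            memset(ci.rec_seq, 0x00, TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE);

            /* attach the "tls" ULP, then hand the TX crypto state to the kernel */
            if (setsockopt(sock, SOL_TCP, TCP_ULP, "tls", sizeof("tls")) < 0)
                    return -1;
            return setsockopt(sock, SOL_TLS, TLS_TX, &ci, sizeof(ci));
    }

Once this returns 0, ordinary send()/write() calls on the socket go out as TLS records encrypted by the kernel; non-data records are sent with a CMSG carrying TLS_SET_RECORD_TYPE, as described above.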
Pieces that can be shared between hw and sw implementation are in tls_main.c Signed-off-by: Boris Pismenny Signed-off-by: Ilya Lesokhin Signed-off-by: Aviad Yehezkel Signed-off-by: Dave Watson Signed-off-by: David S. Miller --- MAINTAINERS | 10 + include/linux/socket.h | 1 + include/net/tls.h | 237 +++++++++++++++ include/uapi/linux/tls.h | 79 +++++ net/Kconfig | 1 + net/Makefile | 1 + net/tls/Kconfig | 12 + net/tls/Makefile | 7 + net/tls/tls_main.c | 487 ++++++++++++++++++++++++++++++ net/tls/tls_sw.c | 772 +++++++++++++++++++++++++++++++++++++++++++++++ 10 files changed, 1607 insertions(+) create mode 100644 include/net/tls.h create mode 100644 include/uapi/linux/tls.h create mode 100644 net/tls/Kconfig create mode 100644 net/tls/Makefile create mode 100644 net/tls/tls_main.c create mode 100644 net/tls/tls_sw.c (limited to 'include/uapi/linux') diff --git a/MAINTAINERS b/MAINTAINERS index 10f158ee95a3..71a74555afdf 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8978,6 +8978,16 @@ F: net/ipv6/ F: include/net/ip* F: arch/x86/net/* +NETWORKING [TLS] +M: Ilya Lesokhin +M: Aviad Yehezkel +M: Dave Watson +L: netdev@vger.kernel.org +S: Maintained +F: net/tls/* +F: include/uapi/linux/tls.h +F: include/net/tls.h + NETWORKING [IPSEC] M: Steffen Klassert M: Herbert Xu diff --git a/include/linux/socket.h b/include/linux/socket.h index 082027457825..8b13db5163cc 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -334,6 +334,7 @@ struct ucred { #define SOL_ALG 279 #define SOL_NFC 280 #define SOL_KCM 281 +#define SOL_TLS 282 /* IPX options */ #define IPX_TYPE 1 diff --git a/include/net/tls.h b/include/net/tls.h new file mode 100644 index 000000000000..b89d397dd62f --- /dev/null +++ b/include/net/tls.h @@ -0,0 +1,237 @@ +/* + * Copyright (c) 2016-2017, Mellanox Technologies. All rights reserved. + * Copyright (c) 2016-2017, Dave Watson . All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef _TLS_OFFLOAD_H +#define _TLS_OFFLOAD_H + +#include + +#include + + +/* Maximum data size carried in a TLS record */ +#define TLS_MAX_PAYLOAD_SIZE ((size_t)1 << 14) + +#define TLS_HEADER_SIZE 5 +#define TLS_NONCE_OFFSET TLS_HEADER_SIZE + +#define TLS_CRYPTO_INFO_READY(info) ((info)->cipher_type) + +#define TLS_RECORD_TYPE_DATA 0x17 + +#define TLS_AAD_SPACE_SIZE 13 + +struct tls_sw_context { + struct crypto_aead *aead_send; + + /* Sending context */ + char aad_space[TLS_AAD_SPACE_SIZE]; + + unsigned int sg_plaintext_size; + int sg_plaintext_num_elem; + struct scatterlist sg_plaintext_data[MAX_SKB_FRAGS]; + + unsigned int sg_encrypted_size; + int sg_encrypted_num_elem; + struct scatterlist sg_encrypted_data[MAX_SKB_FRAGS]; + + /* AAD | sg_plaintext_data | sg_tag */ + struct scatterlist sg_aead_in[2]; + /* AAD | sg_encrypted_data (data contain overhead for hdr&iv&tag) */ + struct scatterlist sg_aead_out[2]; +}; + +enum { + TLS_PENDING_CLOSED_RECORD +}; + +struct tls_context { + union { + struct tls_crypto_info crypto_send; + struct tls12_crypto_info_aes_gcm_128 crypto_send_aes_gcm_128; + }; + + void *priv_ctx; + + u16 prepend_size; + u16 tag_size; + u16 overhead_size; + u16 iv_size; + char *iv; + u16 rec_seq_size; + char *rec_seq; + + struct scatterlist *partially_sent_record; + u16 partially_sent_offset; + unsigned long flags; + + u16 pending_open_record_frags; + int (*push_pending_record)(struct sock *sk, int flags); + void (*free_resources)(struct sock *sk); + + void (*sk_write_space)(struct sock *sk); + void (*sk_proto_close)(struct sock *sk, long timeout); + + int (*setsockopt)(struct sock *sk, int level, + int optname, char __user *optval, + unsigned int optlen); + int (*getsockopt)(struct sock *sk, int level, + int optname, char __user *optval, + int __user *optlen); +}; + +int wait_on_pending_writer(struct sock *sk, long *timeo); +int tls_sk_query(struct sock *sk, int optname, char __user *optval, + int __user *optlen); +int tls_sk_attach(struct sock *sk, int optname, char __user *optval, + unsigned int optlen); + + +int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx); +int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); +int tls_sw_sendpage(struct sock *sk, struct page *page, + int offset, size_t size, int flags); +void tls_sw_close(struct sock *sk, long timeout); + +void tls_sk_destruct(struct sock *sk, struct tls_context *ctx); +void tls_icsk_clean_acked(struct sock *sk); + +int tls_push_sg(struct sock *sk, struct tls_context *ctx, + struct scatterlist *sg, u16 first_offset, + int flags); +int tls_push_pending_closed_record(struct sock *sk, struct tls_context *ctx, + int flags, long *timeo); + +static inline bool tls_is_pending_closed_record(struct tls_context *ctx) +{ + return test_bit(TLS_PENDING_CLOSED_RECORD, &ctx->flags); +} + +static inline int tls_complete_pending_work(struct sock *sk, + struct tls_context *ctx, + int flags, long *timeo) +{ + int rc = 0; + + if (unlikely(sk->sk_write_pending)) + rc = wait_on_pending_writer(sk, timeo); + + if (!rc && tls_is_pending_closed_record(ctx)) + rc = tls_push_pending_closed_record(sk, ctx, flags, timeo); + + return rc; +} + +static inline bool tls_is_partially_sent_record(struct tls_context *ctx) +{ + return !!ctx->partially_sent_record; +} + +static inline bool tls_is_pending_open_record(struct tls_context *tls_ctx) +{ + return tls_ctx->pending_open_record_frags; +} + +static inline void tls_err_abort(struct sock *sk) +{ + sk->sk_err = -EBADMSG; + sk->sk_error_report(sk); +} + +static 
inline bool tls_bigint_increment(unsigned char *seq, int len) +{ + int i; + + for (i = len - 1; i >= 0; i--) { + ++seq[i]; + if (seq[i] != 0) + break; + } + + return (i == -1); +} + +static inline void tls_advance_record_sn(struct sock *sk, + struct tls_context *ctx) +{ + if (tls_bigint_increment(ctx->rec_seq, ctx->rec_seq_size)) + tls_err_abort(sk); + tls_bigint_increment(ctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, + ctx->iv_size); +} + +static inline void tls_fill_prepend(struct tls_context *ctx, + char *buf, + size_t plaintext_len, + unsigned char record_type) +{ + size_t pkt_len, iv_size = ctx->iv_size; + + pkt_len = plaintext_len + iv_size + ctx->tag_size; + + /* we cover nonce explicit here as well, so buf should be of + * size KTLS_DTLS_HEADER_SIZE + KTLS_DTLS_NONCE_EXPLICIT_SIZE + */ + buf[0] = record_type; + buf[1] = TLS_VERSION_MINOR(ctx->crypto_send.version); + buf[2] = TLS_VERSION_MAJOR(ctx->crypto_send.version); + /* we can use IV for nonce explicit according to spec */ + buf[3] = pkt_len >> 8; + buf[4] = pkt_len & 0xFF; + memcpy(buf + TLS_NONCE_OFFSET, + ctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv_size); +} + +static inline struct tls_context *tls_get_ctx(const struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + return icsk->icsk_ulp_data; +} + +static inline struct tls_sw_context *tls_sw_ctx( + const struct tls_context *tls_ctx) +{ + return (struct tls_sw_context *)tls_ctx->priv_ctx; +} + +static inline struct tls_offload_context *tls_offload_ctx( + const struct tls_context *tls_ctx) +{ + return (struct tls_offload_context *)tls_ctx->priv_ctx; +} + +int tls_proccess_cmsg(struct sock *sk, struct msghdr *msg, + unsigned char *record_type); + +#endif /* _TLS_OFFLOAD_H */ diff --git a/include/uapi/linux/tls.h b/include/uapi/linux/tls.h new file mode 100644 index 000000000000..cc1d21db35d8 --- /dev/null +++ b/include/uapi/linux/tls.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2016-2017, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef _UAPI_LINUX_TLS_H +#define _UAPI_LINUX_TLS_H + +#include +#include +#include +#include +#include + +/* TLS socket options */ +#define TLS_TX 1 /* Set transmit parameters */ + +/* Supported versions */ +#define TLS_VERSION_MINOR(ver) ((ver) & 0xFF) +#define TLS_VERSION_MAJOR(ver) (((ver) >> 8) & 0xFF) + +#define TLS_VERSION_NUMBER(id) ((((id##_VERSION_MAJOR) & 0xFF) << 8) | \ + ((id##_VERSION_MINOR) & 0xFF)) + +#define TLS_1_2_VERSION_MAJOR 0x3 +#define TLS_1_2_VERSION_MINOR 0x3 +#define TLS_1_2_VERSION TLS_VERSION_NUMBER(TLS_1_2) + +/* Supported ciphers */ +#define TLS_CIPHER_AES_GCM_128 51 +#define TLS_CIPHER_AES_GCM_128_IV_SIZE 8 +#define TLS_CIPHER_AES_GCM_128_KEY_SIZE 16 +#define TLS_CIPHER_AES_GCM_128_SALT_SIZE 4 +#define TLS_CIPHER_AES_GCM_128_TAG_SIZE 16 +#define TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE 8 + +#define TLS_SET_RECORD_TYPE 1 + +struct tls_crypto_info { + __u16 version; + __u16 cipher_type; +}; + +struct tls12_crypto_info_aes_gcm_128 { + struct tls_crypto_info info; + unsigned char iv[TLS_CIPHER_AES_GCM_128_IV_SIZE]; + unsigned char key[TLS_CIPHER_AES_GCM_128_KEY_SIZE]; + unsigned char salt[TLS_CIPHER_AES_GCM_128_SALT_SIZE]; + unsigned char rec_seq[TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE]; +}; + +#endif /* _UAPI_LINUX_TLS_H */ diff --git a/net/Kconfig b/net/Kconfig index 102f781a0131..7d57ef34b79c 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -55,6 +55,7 @@ menu "Networking options" source "net/packet/Kconfig" source "net/unix/Kconfig" +source "net/tls/Kconfig" source "net/xfrm/Kconfig" source "net/iucv/Kconfig" source "net/smc/Kconfig" diff --git a/net/Makefile b/net/Makefile index 9086ffbb5085..bed80fa398b7 100644 --- a/net/Makefile +++ b/net/Makefile @@ -15,6 +15,7 @@ obj-$(CONFIG_LLC) += llc/ obj-$(CONFIG_NET) += ethernet/ 802/ sched/ netlink/ bpf/ obj-$(CONFIG_NETFILTER) += netfilter/ obj-$(CONFIG_INET) += ipv4/ +obj-$(CONFIG_TLS) += tls/ obj-$(CONFIG_XFRM) += xfrm/ obj-$(CONFIG_UNIX) += unix/ obj-$(CONFIG_NET) += ipv6/ diff --git a/net/tls/Kconfig b/net/tls/Kconfig new file mode 100644 index 000000000000..61e532964c82 --- /dev/null +++ b/net/tls/Kconfig @@ -0,0 +1,12 @@ +# +# TLS configuration +# +config TLS + tristate "Transport Layer Security support" + depends on NET + default m + ---help--- + Enable kernel support for TLS protocol. This allows symmetric + encryption handling of the TLS protocol to be done in-kernel. + + If unsure, say M. diff --git a/net/tls/Makefile b/net/tls/Makefile new file mode 100644 index 000000000000..a930fd1c4f7b --- /dev/null +++ b/net/tls/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the TLS subsystem. +# + +obj-$(CONFIG_TLS) += tls.o + +tls-y := tls_main.o tls_sw.o diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c new file mode 100644 index 000000000000..2ebc328bda96 --- /dev/null +++ b/net/tls/tls_main.c @@ -0,0 +1,487 @@ +/* + * Copyright (c) 2016-2017, Mellanox Technologies. All rights reserved. + * Copyright (c) 2016-2017, Dave Watson . All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include + +#include +#include +#include +#include +#include + +#include + +MODULE_AUTHOR("Mellanox Technologies"); +MODULE_DESCRIPTION("Transport Layer Security Support"); +MODULE_LICENSE("Dual BSD/GPL"); + +static struct proto tls_base_prot; +static struct proto tls_sw_prot; + +int wait_on_pending_writer(struct sock *sk, long *timeo) +{ + int rc = 0; + DEFINE_WAIT_FUNC(wait, woken_wake_function); + + add_wait_queue(sk_sleep(sk), &wait); + while (1) { + if (!*timeo) { + rc = -EAGAIN; + break; + } + + if (signal_pending(current)) { + rc = sock_intr_errno(*timeo); + break; + } + + if (sk_wait_event(sk, timeo, !sk->sk_write_pending, &wait)) + break; + } + remove_wait_queue(sk_sleep(sk), &wait); + return rc; +} + +int tls_push_sg(struct sock *sk, + struct tls_context *ctx, + struct scatterlist *sg, + u16 first_offset, + int flags) +{ + int sendpage_flags = flags | MSG_SENDPAGE_NOTLAST; + int ret = 0; + struct page *p; + size_t size; + int offset = first_offset; + + size = sg->length - offset; + offset += sg->offset; + + while (1) { + if (sg_is_last(sg)) + sendpage_flags = flags; + + /* is sending application-limited? 
*/ + tcp_rate_check_app_limited(sk); + p = sg_page(sg); +retry: + ret = do_tcp_sendpages(sk, p, offset, size, sendpage_flags); + + if (ret != size) { + if (ret > 0) { + offset += ret; + size -= ret; + goto retry; + } + + offset -= sg->offset; + ctx->partially_sent_offset = offset; + ctx->partially_sent_record = (void *)sg; + return ret; + } + + put_page(p); + sk_mem_uncharge(sk, sg->length); + sg = sg_next(sg); + if (!sg) + break; + + offset = sg->offset; + size = sg->length; + } + + clear_bit(TLS_PENDING_CLOSED_RECORD, &ctx->flags); + + return 0; +} + +static int tls_handle_open_record(struct sock *sk, int flags) +{ + struct tls_context *ctx = tls_get_ctx(sk); + + if (tls_is_pending_open_record(ctx)) + return ctx->push_pending_record(sk, flags); + + return 0; +} + +int tls_proccess_cmsg(struct sock *sk, struct msghdr *msg, + unsigned char *record_type) +{ + struct cmsghdr *cmsg; + int rc = -EINVAL; + + for_each_cmsghdr(cmsg, msg) { + if (!CMSG_OK(msg, cmsg)) + return -EINVAL; + if (cmsg->cmsg_level != SOL_TLS) + continue; + + switch (cmsg->cmsg_type) { + case TLS_SET_RECORD_TYPE: + if (cmsg->cmsg_len < CMSG_LEN(sizeof(*record_type))) + return -EINVAL; + + if (msg->msg_flags & MSG_MORE) + return -EINVAL; + + rc = tls_handle_open_record(sk, msg->msg_flags); + if (rc) + return rc; + + *record_type = *(unsigned char *)CMSG_DATA(cmsg); + rc = 0; + break; + default: + return -EINVAL; + } + } + + return rc; +} + +int tls_push_pending_closed_record(struct sock *sk, struct tls_context *ctx, + int flags, long *timeo) +{ + struct scatterlist *sg; + u16 offset; + + if (!tls_is_partially_sent_record(ctx)) + return ctx->push_pending_record(sk, flags); + + sg = ctx->partially_sent_record; + offset = ctx->partially_sent_offset; + + ctx->partially_sent_record = NULL; + return tls_push_sg(sk, ctx, sg, offset, flags); +} + +static void tls_write_space(struct sock *sk) +{ + struct tls_context *ctx = tls_get_ctx(sk); + + if (!sk->sk_write_pending && tls_is_pending_closed_record(ctx)) { + gfp_t sk_allocation = sk->sk_allocation; + int rc; + long timeo = 0; + + sk->sk_allocation = GFP_ATOMIC; + rc = tls_push_pending_closed_record(sk, ctx, + MSG_DONTWAIT | + MSG_NOSIGNAL, + &timeo); + sk->sk_allocation = sk_allocation; + + if (rc < 0) + return; + } + + ctx->sk_write_space(sk); +} + +static void tls_sk_proto_close(struct sock *sk, long timeout) +{ + struct tls_context *ctx = tls_get_ctx(sk); + long timeo = sock_sndtimeo(sk, 0); + void (*sk_proto_close)(struct sock *sk, long timeout); + + lock_sock(sk); + + if (!tls_complete_pending_work(sk, ctx, 0, &timeo)) + tls_handle_open_record(sk, 0); + + if (ctx->partially_sent_record) { + struct scatterlist *sg = ctx->partially_sent_record; + + while (1) { + put_page(sg_page(sg)); + sk_mem_uncharge(sk, sg->length); + + if (sg_is_last(sg)) + break; + sg++; + } + } + ctx->free_resources(sk); + kfree(ctx->rec_seq); + kfree(ctx->iv); + + sk_proto_close = ctx->sk_proto_close; + kfree(ctx); + + release_sock(sk); + sk_proto_close(sk, timeout); +} + +static int do_tls_getsockopt_tx(struct sock *sk, char __user *optval, + int __user *optlen) +{ + int rc = 0; + struct tls_context *ctx = tls_get_ctx(sk); + struct tls_crypto_info *crypto_info; + int len; + + if (get_user(len, optlen)) + return -EFAULT; + + if (!optval || (len < sizeof(*crypto_info))) { + rc = -EINVAL; + goto out; + } + + if (!ctx) { + rc = -EBUSY; + goto out; + } + + /* get user crypto info */ + crypto_info = &ctx->crypto_send; + + if (!TLS_CRYPTO_INFO_READY(crypto_info)) { + rc = -EBUSY; + goto out; + } + + if (len 
== sizeof(crypto_info)) { + rc = copy_to_user(optval, crypto_info, sizeof(*crypto_info)); + goto out; + } + + switch (crypto_info->cipher_type) { + case TLS_CIPHER_AES_GCM_128: { + struct tls12_crypto_info_aes_gcm_128 * + crypto_info_aes_gcm_128 = + container_of(crypto_info, + struct tls12_crypto_info_aes_gcm_128, + info); + + if (len != sizeof(*crypto_info_aes_gcm_128)) { + rc = -EINVAL; + goto out; + } + lock_sock(sk); + memcpy(crypto_info_aes_gcm_128->iv, ctx->iv, + TLS_CIPHER_AES_GCM_128_IV_SIZE); + release_sock(sk); + rc = copy_to_user(optval, + crypto_info_aes_gcm_128, + sizeof(*crypto_info_aes_gcm_128)); + break; + } + default: + rc = -EINVAL; + } + +out: + return rc; +} + +static int do_tls_getsockopt(struct sock *sk, int optname, + char __user *optval, int __user *optlen) +{ + int rc = 0; + + switch (optname) { + case TLS_TX: + rc = do_tls_getsockopt_tx(sk, optval, optlen); + break; + default: + rc = -ENOPROTOOPT; + break; + } + return rc; +} + +static int tls_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct tls_context *ctx = tls_get_ctx(sk); + + if (level != SOL_TLS) + return ctx->getsockopt(sk, level, optname, optval, optlen); + + return do_tls_getsockopt(sk, optname, optval, optlen); +} + +static int do_tls_setsockopt_tx(struct sock *sk, char __user *optval, + unsigned int optlen) +{ + struct tls_crypto_info *crypto_info, tmp_crypto_info; + struct tls_context *ctx = tls_get_ctx(sk); + struct proto *prot = NULL; + int rc = 0; + + if (!optval || (optlen < sizeof(*crypto_info))) { + rc = -EINVAL; + goto out; + } + + rc = copy_from_user(&tmp_crypto_info, optval, sizeof(*crypto_info)); + if (rc) { + rc = -EFAULT; + goto out; + } + + /* check version */ + if (tmp_crypto_info.version != TLS_1_2_VERSION) { + rc = -ENOTSUPP; + goto out; + } + + /* get user crypto info */ + crypto_info = &ctx->crypto_send; + + /* Currently we don't support set crypto info more than one time */ + if (TLS_CRYPTO_INFO_READY(crypto_info)) + goto out; + + switch (tmp_crypto_info.cipher_type) { + case TLS_CIPHER_AES_GCM_128: { + if (optlen != sizeof(struct tls12_crypto_info_aes_gcm_128)) { + rc = -EINVAL; + goto out; + } + rc = copy_from_user( + crypto_info, + optval, + sizeof(struct tls12_crypto_info_aes_gcm_128)); + + if (rc) { + rc = -EFAULT; + goto err_crypto_info; + } + break; + } + default: + rc = -EINVAL; + goto out; + } + + ctx->sk_write_space = sk->sk_write_space; + sk->sk_write_space = tls_write_space; + + ctx->sk_proto_close = sk->sk_prot->close; + + /* currently SW is default, we will have ethtool in future */ + rc = tls_set_sw_offload(sk, ctx); + prot = &tls_sw_prot; + if (rc) + goto err_crypto_info; + + sk->sk_prot = prot; + goto out; + +err_crypto_info: + memset(crypto_info, 0, sizeof(*crypto_info)); +out: + return rc; +} + +static int do_tls_setsockopt(struct sock *sk, int optname, + char __user *optval, unsigned int optlen) +{ + int rc = 0; + + switch (optname) { + case TLS_TX: + lock_sock(sk); + rc = do_tls_setsockopt_tx(sk, optval, optlen); + release_sock(sk); + break; + default: + rc = -ENOPROTOOPT; + break; + } + return rc; +} + +static int tls_setsockopt(struct sock *sk, int level, int optname, + char __user *optval, unsigned int optlen) +{ + struct tls_context *ctx = tls_get_ctx(sk); + + if (level != SOL_TLS) + return ctx->setsockopt(sk, level, optname, optval, optlen); + + return do_tls_setsockopt(sk, optname, optval, optlen); +} + +static int tls_init(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + 
struct tls_context *ctx; + int rc = 0; + + /* allocate tls context */ + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) { + rc = -ENOMEM; + goto out; + } + icsk->icsk_ulp_data = ctx; + ctx->setsockopt = sk->sk_prot->setsockopt; + ctx->getsockopt = sk->sk_prot->getsockopt; + sk->sk_prot = &tls_base_prot; +out: + return rc; +} + +static struct tcp_ulp_ops tcp_tls_ulp_ops __read_mostly = { + .name = "tls", + .owner = THIS_MODULE, + .init = tls_init, +}; + +static int __init tls_register(void) +{ + tls_base_prot = tcp_prot; + tls_base_prot.setsockopt = tls_setsockopt; + tls_base_prot.getsockopt = tls_getsockopt; + + tls_sw_prot = tls_base_prot; + tls_sw_prot.sendmsg = tls_sw_sendmsg; + tls_sw_prot.sendpage = tls_sw_sendpage; + tls_sw_prot.close = tls_sk_proto_close; + + tcp_register_ulp(&tcp_tls_ulp_ops); + + return 0; +} + +static void __exit tls_unregister(void) +{ + tcp_unregister_ulp(&tcp_tls_ulp_ops); +} + +module_init(tls_register); +module_exit(tls_unregister); diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c new file mode 100644 index 000000000000..fa596fa71ba7 --- /dev/null +++ b/net/tls/tls_sw.c @@ -0,0 +1,772 @@ +/* + * Copyright (c) 2016-2017, Mellanox Technologies. All rights reserved. + * Copyright (c) 2016-2017, Dave Watson . All rights reserved. + * Copyright (c) 2016-2017, Lance Chao . All rights reserved. + * Copyright (c) 2016, Fridolin Pokorny . All rights reserved. + * Copyright (c) 2016, Nikos Mavrogiannopoulos . All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include + +#include + +static inline void tls_make_aad(int recv, + char *buf, + size_t size, + char *record_sequence, + int record_sequence_size, + unsigned char record_type) +{ + memcpy(buf, record_sequence, record_sequence_size); + + buf[8] = record_type; + buf[9] = TLS_1_2_VERSION_MAJOR; + buf[10] = TLS_1_2_VERSION_MINOR; + buf[11] = size >> 8; + buf[12] = size & 0xFF; +} + +static void trim_sg(struct sock *sk, struct scatterlist *sg, + int *sg_num_elem, unsigned int *sg_size, int target_size) +{ + int i = *sg_num_elem - 1; + int trim = *sg_size - target_size; + + if (trim <= 0) { + WARN_ON(trim < 0); + return; + } + + *sg_size = target_size; + while (trim >= sg[i].length) { + trim -= sg[i].length; + sk_mem_uncharge(sk, sg[i].length); + put_page(sg_page(&sg[i])); + i--; + + if (i < 0) + goto out; + } + + sg[i].length -= trim; + sk_mem_uncharge(sk, trim); + +out: + *sg_num_elem = i + 1; +} + +static void trim_both_sgl(struct sock *sk, int target_size) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + + trim_sg(sk, ctx->sg_plaintext_data, + &ctx->sg_plaintext_num_elem, + &ctx->sg_plaintext_size, + target_size); + + if (target_size > 0) + target_size += tls_ctx->overhead_size; + + trim_sg(sk, ctx->sg_encrypted_data, + &ctx->sg_encrypted_num_elem, + &ctx->sg_encrypted_size, + target_size); +} + +static int alloc_sg(struct sock *sk, int len, struct scatterlist *sg, + int *sg_num_elem, unsigned int *sg_size, + int first_coalesce) +{ + struct page_frag *pfrag; + unsigned int size = *sg_size; + int num_elem = *sg_num_elem, use = 0, rc = 0; + struct scatterlist *sge; + unsigned int orig_offset; + + len -= size; + pfrag = sk_page_frag(sk); + + while (len > 0) { + if (!sk_page_frag_refill(sk, pfrag)) { + rc = -ENOMEM; + goto out; + } + + use = min_t(int, len, pfrag->size - pfrag->offset); + + if (!sk_wmem_schedule(sk, use)) { + rc = -ENOMEM; + goto out; + } + + sk_mem_charge(sk, use); + size += use; + orig_offset = pfrag->offset; + pfrag->offset += use; + + sge = sg + num_elem - 1; + if (num_elem > first_coalesce && sg_page(sg) == pfrag->page && + sg->offset + sg->length == orig_offset) { + sg->length += use; + } else { + sge++; + sg_unmark_end(sge); + sg_set_page(sge, pfrag->page, use, orig_offset); + get_page(pfrag->page); + ++num_elem; + if (num_elem == MAX_SKB_FRAGS) { + rc = -ENOSPC; + break; + } + } + + len -= use; + } + goto out; + +out: + *sg_size = size; + *sg_num_elem = num_elem; + return rc; +} + +static int alloc_encrypted_sg(struct sock *sk, int len) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + int rc = 0; + + rc = alloc_sg(sk, len, ctx->sg_encrypted_data, + &ctx->sg_encrypted_num_elem, &ctx->sg_encrypted_size, 0); + + return rc; +} + +static int alloc_plaintext_sg(struct sock *sk, int len) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + int rc = 0; + + rc = alloc_sg(sk, len, ctx->sg_plaintext_data, + &ctx->sg_plaintext_num_elem, &ctx->sg_plaintext_size, + tls_ctx->pending_open_record_frags); + + return rc; +} + +static void free_sg(struct sock *sk, struct scatterlist *sg, + int *sg_num_elem, unsigned int *sg_size) +{ + int i, n = *sg_num_elem; + + for (i = 0; i < n; ++i) { + sk_mem_uncharge(sk, sg[i].length); + put_page(sg_page(&sg[i])); + } + *sg_num_elem = 0; + *sg_size = 0; +} + +static void tls_free_both_sg(struct sock *sk) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct 
tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + + free_sg(sk, ctx->sg_encrypted_data, &ctx->sg_encrypted_num_elem, + &ctx->sg_encrypted_size); + + free_sg(sk, ctx->sg_plaintext_data, &ctx->sg_plaintext_num_elem, + &ctx->sg_plaintext_size); +} + +static int tls_do_encryption(struct tls_context *tls_ctx, + struct tls_sw_context *ctx, size_t data_len, + gfp_t flags) +{ + unsigned int req_size = sizeof(struct aead_request) + + crypto_aead_reqsize(ctx->aead_send); + struct aead_request *aead_req; + int rc; + + aead_req = kmalloc(req_size, flags); + if (!aead_req) + return -ENOMEM; + + ctx->sg_encrypted_data[0].offset += tls_ctx->prepend_size; + ctx->sg_encrypted_data[0].length -= tls_ctx->prepend_size; + + aead_request_set_tfm(aead_req, ctx->aead_send); + aead_request_set_ad(aead_req, TLS_AAD_SPACE_SIZE); + aead_request_set_crypt(aead_req, ctx->sg_aead_in, ctx->sg_aead_out, + data_len, tls_ctx->iv); + rc = crypto_aead_encrypt(aead_req); + + ctx->sg_encrypted_data[0].offset -= tls_ctx->prepend_size; + ctx->sg_encrypted_data[0].length += tls_ctx->prepend_size; + + kfree(aead_req); + return rc; +} + +static int tls_push_record(struct sock *sk, int flags, + unsigned char record_type) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + int rc; + + sg_mark_end(ctx->sg_plaintext_data + ctx->sg_plaintext_num_elem - 1); + sg_mark_end(ctx->sg_encrypted_data + ctx->sg_encrypted_num_elem - 1); + + tls_make_aad(0, ctx->aad_space, ctx->sg_plaintext_size, + tls_ctx->rec_seq, tls_ctx->rec_seq_size, + record_type); + + tls_fill_prepend(tls_ctx, + page_address(sg_page(&ctx->sg_encrypted_data[0])) + + ctx->sg_encrypted_data[0].offset, + ctx->sg_plaintext_size, record_type); + + tls_ctx->pending_open_record_frags = 0; + set_bit(TLS_PENDING_CLOSED_RECORD, &tls_ctx->flags); + + rc = tls_do_encryption(tls_ctx, ctx, ctx->sg_plaintext_size, + sk->sk_allocation); + if (rc < 0) { + /* If we are called from write_space and + * we fail, we need to set this SOCK_NOSPACE + * to trigger another write_space in the future. 
+ */ + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + return rc; + } + + free_sg(sk, ctx->sg_plaintext_data, &ctx->sg_plaintext_num_elem, + &ctx->sg_plaintext_size); + + ctx->sg_encrypted_num_elem = 0; + ctx->sg_encrypted_size = 0; + + /* Only pass through MSG_DONTWAIT and MSG_NOSIGNAL flags */ + rc = tls_push_sg(sk, tls_ctx, ctx->sg_encrypted_data, 0, flags); + if (rc < 0 && rc != -EAGAIN) + tls_err_abort(sk); + + tls_advance_record_sn(sk, tls_ctx); + return rc; +} + +static int tls_sw_push_pending_record(struct sock *sk, int flags) +{ + return tls_push_record(sk, flags, TLS_RECORD_TYPE_DATA); +} + +static int zerocopy_from_iter(struct sock *sk, struct iov_iter *from, + int length) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + struct page *pages[MAX_SKB_FRAGS]; + + size_t offset; + ssize_t copied, use; + int i = 0; + unsigned int size = ctx->sg_plaintext_size; + int num_elem = ctx->sg_plaintext_num_elem; + int rc = 0; + int maxpages; + + while (length > 0) { + i = 0; + maxpages = ARRAY_SIZE(ctx->sg_plaintext_data) - num_elem; + if (maxpages == 0) { + rc = -EFAULT; + goto out; + } + copied = iov_iter_get_pages(from, pages, + length, + maxpages, &offset); + if (copied <= 0) { + rc = -EFAULT; + goto out; + } + + iov_iter_advance(from, copied); + + length -= copied; + size += copied; + while (copied) { + use = min_t(int, copied, PAGE_SIZE - offset); + + sg_set_page(&ctx->sg_plaintext_data[num_elem], + pages[i], use, offset); + sg_unmark_end(&ctx->sg_plaintext_data[num_elem]); + sk_mem_charge(sk, use); + + offset = 0; + copied -= use; + + ++i; + ++num_elem; + } + } + +out: + ctx->sg_plaintext_size = size; + ctx->sg_plaintext_num_elem = num_elem; + return rc; +} + +static int memcopy_from_iter(struct sock *sk, struct iov_iter *from, + int bytes) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + struct scatterlist *sg = ctx->sg_plaintext_data; + int copy, i, rc = 0; + + for (i = tls_ctx->pending_open_record_frags; + i < ctx->sg_plaintext_num_elem; ++i) { + copy = sg[i].length; + if (copy_from_iter( + page_address(sg_page(&sg[i])) + sg[i].offset, + copy, from) != copy) { + rc = -EFAULT; + goto out; + } + bytes -= copy; + + ++tls_ctx->pending_open_record_frags; + + if (!bytes) + break; + } + +out: + return rc; +} + +int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + int ret = 0; + int required_size; + long timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); + bool eor = !(msg->msg_flags & MSG_MORE); + size_t try_to_copy, copied = 0; + unsigned char record_type = TLS_RECORD_TYPE_DATA; + int record_room; + bool full_record; + int orig_size; + + if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL)) + return -ENOTSUPP; + + lock_sock(sk); + + if (tls_complete_pending_work(sk, tls_ctx, msg->msg_flags, &timeo)) + goto send_end; + + if (unlikely(msg->msg_controllen)) { + ret = tls_proccess_cmsg(sk, msg, &record_type); + if (ret) + goto send_end; + } + + while (msg_data_left(msg)) { + if (sk->sk_err) { + ret = sk->sk_err; + goto send_end; + } + + orig_size = ctx->sg_plaintext_size; + full_record = false; + try_to_copy = msg_data_left(msg); + record_room = TLS_MAX_PAYLOAD_SIZE - ctx->sg_plaintext_size; + if (try_to_copy >= record_room) { + try_to_copy = record_room; + full_record = true; + } + + required_size = ctx->sg_plaintext_size + try_to_copy + + 
tls_ctx->overhead_size; + + if (!sk_stream_memory_free(sk)) + goto wait_for_sndbuf; +alloc_encrypted: + ret = alloc_encrypted_sg(sk, required_size); + if (ret) { + if (ret != -ENOSPC) + goto wait_for_memory; + + /* Adjust try_to_copy according to the amount that was + * actually allocated. The difference is due + * to max sg elements limit + */ + try_to_copy -= required_size - ctx->sg_encrypted_size; + full_record = true; + } + + if (full_record || eor) { + ret = zerocopy_from_iter(sk, &msg->msg_iter, + try_to_copy); + if (ret) + goto fallback_to_reg_send; + + copied += try_to_copy; + ret = tls_push_record(sk, msg->msg_flags, record_type); + if (!ret) + continue; + if (ret == -EAGAIN) + goto send_end; + + copied -= try_to_copy; +fallback_to_reg_send: + iov_iter_revert(&msg->msg_iter, + ctx->sg_plaintext_size - orig_size); + trim_sg(sk, ctx->sg_plaintext_data, + &ctx->sg_plaintext_num_elem, + &ctx->sg_plaintext_size, + orig_size); + } + + required_size = ctx->sg_plaintext_size + try_to_copy; +alloc_plaintext: + ret = alloc_plaintext_sg(sk, required_size); + if (ret) { + if (ret != -ENOSPC) + goto wait_for_memory; + + /* Adjust try_to_copy according to the amount that was + * actually allocated. The difference is due + * to max sg elements limit + */ + try_to_copy -= required_size - ctx->sg_plaintext_size; + full_record = true; + + trim_sg(sk, ctx->sg_encrypted_data, + &ctx->sg_encrypted_num_elem, + &ctx->sg_encrypted_size, + ctx->sg_plaintext_size + + tls_ctx->overhead_size); + } + + ret = memcopy_from_iter(sk, &msg->msg_iter, try_to_copy); + if (ret) + goto trim_sgl; + + copied += try_to_copy; + if (full_record || eor) { +push_record: + ret = tls_push_record(sk, msg->msg_flags, record_type); + if (ret) { + if (ret == -ENOMEM) + goto wait_for_memory; + + goto send_end; + } + } + + continue; + +wait_for_sndbuf: + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); +wait_for_memory: + ret = sk_stream_wait_memory(sk, &timeo); + if (ret) { +trim_sgl: + trim_both_sgl(sk, orig_size); + goto send_end; + } + + if (tls_is_pending_closed_record(tls_ctx)) + goto push_record; + + if (ctx->sg_encrypted_size < required_size) + goto alloc_encrypted; + + goto alloc_plaintext; + } + +send_end: + ret = sk_stream_error(sk, msg->msg_flags, ret); + + release_sock(sk); + return copied ? copied : ret; +} + +int tls_sw_sendpage(struct sock *sk, struct page *page, + int offset, size_t size, int flags) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + int ret = 0; + long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); + bool eor; + size_t orig_size = size; + unsigned char record_type = TLS_RECORD_TYPE_DATA; + struct scatterlist *sg; + bool full_record; + int record_room; + + if (flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | + MSG_SENDPAGE_NOTLAST)) + return -ENOTSUPP; + + /* No MSG_EOR from splice, only look at MSG_MORE */ + eor = !(flags & (MSG_MORE | MSG_SENDPAGE_NOTLAST)); + + lock_sock(sk); + + sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); + + if (tls_complete_pending_work(sk, tls_ctx, flags, &timeo)) + goto sendpage_end; + + /* Call the sk_stream functions to manage the sndbuf mem. 
*/ + while (size > 0) { + size_t copy, required_size; + + if (sk->sk_err) { + ret = sk->sk_err; + goto sendpage_end; + } + + full_record = false; + record_room = TLS_MAX_PAYLOAD_SIZE - ctx->sg_plaintext_size; + copy = size; + if (copy >= record_room) { + copy = record_room; + full_record = true; + } + required_size = ctx->sg_plaintext_size + copy + + tls_ctx->overhead_size; + + if (!sk_stream_memory_free(sk)) + goto wait_for_sndbuf; +alloc_payload: + ret = alloc_encrypted_sg(sk, required_size); + if (ret) { + if (ret != -ENOSPC) + goto wait_for_memory; + + /* Adjust copy according to the amount that was + * actually allocated. The difference is due + * to max sg elements limit + */ + copy -= required_size - ctx->sg_plaintext_size; + full_record = true; + } + + get_page(page); + sg = ctx->sg_plaintext_data + ctx->sg_plaintext_num_elem; + sg_set_page(sg, page, copy, offset); + ctx->sg_plaintext_num_elem++; + + sk_mem_charge(sk, copy); + offset += copy; + size -= copy; + ctx->sg_plaintext_size += copy; + tls_ctx->pending_open_record_frags = ctx->sg_plaintext_num_elem; + + if (full_record || eor || + ctx->sg_plaintext_num_elem == + ARRAY_SIZE(ctx->sg_plaintext_data)) { +push_record: + ret = tls_push_record(sk, flags, record_type); + if (ret) { + if (ret == -ENOMEM) + goto wait_for_memory; + + goto sendpage_end; + } + } + continue; +wait_for_sndbuf: + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); +wait_for_memory: + ret = sk_stream_wait_memory(sk, &timeo); + if (ret) { + trim_both_sgl(sk, ctx->sg_plaintext_size); + goto sendpage_end; + } + + if (tls_is_pending_closed_record(tls_ctx)) + goto push_record; + + goto alloc_payload; + } + +sendpage_end: + if (orig_size > size) + ret = orig_size - size; + else + ret = sk_stream_error(sk, flags, ret); + + release_sock(sk); + return ret; +} + +void tls_sw_free_resources(struct sock *sk) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + + if (ctx->aead_send) + crypto_free_aead(ctx->aead_send); + + tls_free_both_sg(sk); + + kfree(ctx); +} + +int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx) +{ + char keyval[TLS_CIPHER_AES_GCM_128_KEY_SIZE]; + struct tls_crypto_info *crypto_info; + struct tls12_crypto_info_aes_gcm_128 *gcm_128_info; + struct tls_sw_context *sw_ctx; + u16 nonce_size, tag_size, iv_size, rec_seq_size; + char *iv, *rec_seq; + int rc = 0; + + if (!ctx) { + rc = -EINVAL; + goto out; + } + + if (ctx->priv_ctx) { + rc = -EEXIST; + goto out; + } + + sw_ctx = kzalloc(sizeof(*sw_ctx), GFP_KERNEL); + if (!sw_ctx) { + rc = -ENOMEM; + goto out; + } + + ctx->priv_ctx = (struct tls_offload_context *)sw_ctx; + ctx->free_resources = tls_sw_free_resources; + + crypto_info = &ctx->crypto_send; + switch (crypto_info->cipher_type) { + case TLS_CIPHER_AES_GCM_128: { + nonce_size = TLS_CIPHER_AES_GCM_128_IV_SIZE; + tag_size = TLS_CIPHER_AES_GCM_128_TAG_SIZE; + iv_size = TLS_CIPHER_AES_GCM_128_IV_SIZE; + iv = ((struct tls12_crypto_info_aes_gcm_128 *)crypto_info)->iv; + rec_seq_size = TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE; + rec_seq = + ((struct tls12_crypto_info_aes_gcm_128 *)crypto_info)->rec_seq; + gcm_128_info = + (struct tls12_crypto_info_aes_gcm_128 *)crypto_info; + break; + } + default: + rc = -EINVAL; + goto out; + } + + ctx->prepend_size = TLS_HEADER_SIZE + nonce_size; + ctx->tag_size = tag_size; + ctx->overhead_size = ctx->prepend_size + ctx->tag_size; + ctx->iv_size = iv_size; + ctx->iv = kmalloc(iv_size + TLS_CIPHER_AES_GCM_128_SALT_SIZE, + GFP_KERNEL); + if (!ctx->iv) { + rc = 
-ENOMEM; + goto out; + } + memcpy(ctx->iv, gcm_128_info->salt, TLS_CIPHER_AES_GCM_128_SALT_SIZE); + memcpy(ctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv, iv_size); + ctx->rec_seq_size = rec_seq_size; + ctx->rec_seq = kmalloc(rec_seq_size, GFP_KERNEL); + if (!ctx->rec_seq) { + rc = -ENOMEM; + goto free_iv; + } + memcpy(ctx->rec_seq, rec_seq, rec_seq_size); + + sg_init_table(sw_ctx->sg_encrypted_data, + ARRAY_SIZE(sw_ctx->sg_encrypted_data)); + sg_init_table(sw_ctx->sg_plaintext_data, + ARRAY_SIZE(sw_ctx->sg_plaintext_data)); + + sg_init_table(sw_ctx->sg_aead_in, 2); + sg_set_buf(&sw_ctx->sg_aead_in[0], sw_ctx->aad_space, + sizeof(sw_ctx->aad_space)); + sg_unmark_end(&sw_ctx->sg_aead_in[1]); + sg_chain(sw_ctx->sg_aead_in, 2, sw_ctx->sg_plaintext_data); + sg_init_table(sw_ctx->sg_aead_out, 2); + sg_set_buf(&sw_ctx->sg_aead_out[0], sw_ctx->aad_space, + sizeof(sw_ctx->aad_space)); + sg_unmark_end(&sw_ctx->sg_aead_out[1]); + sg_chain(sw_ctx->sg_aead_out, 2, sw_ctx->sg_encrypted_data); + + if (!sw_ctx->aead_send) { + sw_ctx->aead_send = crypto_alloc_aead("gcm(aes)", 0, 0); + if (IS_ERR(sw_ctx->aead_send)) { + rc = PTR_ERR(sw_ctx->aead_send); + sw_ctx->aead_send = NULL; + goto free_rec_seq; + } + } + + ctx->push_pending_record = tls_sw_push_pending_record; + + memcpy(keyval, gcm_128_info->key, TLS_CIPHER_AES_GCM_128_KEY_SIZE); + + rc = crypto_aead_setkey(sw_ctx->aead_send, keyval, + TLS_CIPHER_AES_GCM_128_KEY_SIZE); + if (rc) + goto free_aead; + + rc = crypto_aead_setauthsize(sw_ctx->aead_send, ctx->tag_size); + if (!rc) + goto out; + +free_aead: + crypto_free_aead(sw_ctx->aead_send); + sw_ctx->aead_send = NULL; +free_rec_seq: + kfree(ctx->rec_seq); + ctx->rec_seq = NULL; +free_iv: + kfree(ctx->iv); + ctx->iv = NULL; +out: + return rc; +} -- cgit v1.2.3 From 86087e170cd1f19e9b25e5d944d9f52fad9470f4 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Wed, 14 Jun 2017 21:19:31 +0200 Subject: net: sched: act_tunnel_key: make UDP checksum configurable Allow requesting of zero UDP checksum for encapsulated packets. The name and meaning of the attribute is "NO_CSUM" in order to have the same meaning of the attribute missing and being 0. Signed-off-by: Jiri Benc Signed-off-by: David S. 
Miller --- include/uapi/linux/tc_act/tc_tunnel_key.h | 1 + net/sched/act_tunnel_key.c | 15 ++++++++++++--- 2 files changed, 13 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/tc_act/tc_tunnel_key.h b/include/uapi/linux/tc_act/tc_tunnel_key.h index 84ea55e1076b..afcd4be953e2 100644 --- a/include/uapi/linux/tc_act/tc_tunnel_key.h +++ b/include/uapi/linux/tc_act/tc_tunnel_key.h @@ -34,6 +34,7 @@ enum { TCA_TUNNEL_KEY_ENC_KEY_ID, /* be64 */ TCA_TUNNEL_KEY_PAD, TCA_TUNNEL_KEY_ENC_DST_PORT, /* be16 */ + TCA_TUNNEL_KEY_NO_CSUM, /* u8 */ __TCA_TUNNEL_KEY_MAX, }; diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index b90effa10eb5..fd7e75679c69 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -67,6 +67,7 @@ static const struct nla_policy tunnel_key_policy[TCA_TUNNEL_KEY_MAX + 1] = { [TCA_TUNNEL_KEY_ENC_IPV6_DST] = { .len = sizeof(struct in6_addr) }, [TCA_TUNNEL_KEY_ENC_KEY_ID] = { .type = NLA_U32 }, [TCA_TUNNEL_KEY_ENC_DST_PORT] = {.type = NLA_U16}, + [TCA_TUNNEL_KEY_NO_CSUM] = { .type = NLA_U8 }, }; static int tunnel_key_init(struct net *net, struct nlattr *nla, @@ -83,6 +84,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, bool exists = false; __be16 dst_port = 0; __be64 key_id; + __be16 flags; int ret = 0; int err; @@ -113,6 +115,11 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, key_id = key32_to_tunnel_id(nla_get_be32(tb[TCA_TUNNEL_KEY_ENC_KEY_ID])); + flags = TUNNEL_KEY | TUNNEL_CSUM; + if (tb[TCA_TUNNEL_KEY_NO_CSUM] && + nla_get_u8(tb[TCA_TUNNEL_KEY_NO_CSUM])) + flags &= ~TUNNEL_CSUM; + if (tb[TCA_TUNNEL_KEY_ENC_DST_PORT]) dst_port = nla_get_be16(tb[TCA_TUNNEL_KEY_ENC_DST_PORT]); @@ -125,7 +132,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, daddr = nla_get_in_addr(tb[TCA_TUNNEL_KEY_ENC_IPV4_DST]); metadata = __ip_tun_set_dst(saddr, daddr, 0, 0, - dst_port, TUNNEL_KEY | TUNNEL_CSUM, + dst_port, flags, key_id, 0); } else if (tb[TCA_TUNNEL_KEY_ENC_IPV6_SRC] && tb[TCA_TUNNEL_KEY_ENC_IPV6_DST]) { @@ -136,7 +143,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, daddr = nla_get_in6_addr(tb[TCA_TUNNEL_KEY_ENC_IPV6_DST]); metadata = __ipv6_tun_set_dst(&saddr, &daddr, 0, 0, dst_port, - 0, TUNNEL_KEY | TUNNEL_CSUM, + 0, flags, key_id, 0); } @@ -266,7 +273,9 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a, if (nla_put_be32(skb, TCA_TUNNEL_KEY_ENC_KEY_ID, key_id) || tunnel_key_dump_addresses(skb, ¶ms->tcft_enc_metadata->u.tun_info) || - nla_put_be16(skb, TCA_TUNNEL_KEY_ENC_DST_PORT, key->tp_dst)) + nla_put_be16(skb, TCA_TUNNEL_KEY_ENC_DST_PORT, key->tp_dst) || + nla_put_u8(skb, TCA_TUNNEL_KEY_NO_CSUM, + !(key->tun_flags & TUNNEL_CSUM))) goto nla_put_failure; } -- cgit v1.2.3 From 58038695e62b4473e4d70e1503933579c640cd52 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 15 Jun 2017 17:29:09 -0700 Subject: net: Add IFLA_XDP_PROG_ID Expose prog_id through IFLA_XDP_PROG_ID. This patch makes modification to generic_xdp. The later patches will modify other xdp-supported drivers. prog_id is added to struct net_dev_xdp. iproute2 patch will be followed. Here is how the 'ip link' will look like: > ip link show eth0 3: eth0: mtu 1500 xdp(prog_id:1) qdisc fq_codel state UP mode DEFAULT group default qlen 1000 Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 7 +++++-- include/uapi/linux/if_link.h | 1 + net/core/dev.c | 19 +++++++++++-------- net/core/rtnetlink.c | 27 +++++++++++++++++++++------ 4 files changed, 38 insertions(+), 16 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ad98a83f1332..7c7118b3bd69 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -824,7 +824,10 @@ struct netdev_xdp { struct netlink_ext_ack *extack; }; /* XDP_QUERY_PROG */ - bool prog_attached; + struct { + bool prog_attached; + u32 prog_id; + }; }; }; @@ -3302,7 +3305,7 @@ struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, typedef int (*xdp_op_t)(struct net_device *dev, struct netdev_xdp *xdp); int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, int fd, u32 flags); -bool __dev_xdp_attached(struct net_device *dev, xdp_op_t xdp_op); +bool __dev_xdp_attached(struct net_device *dev, xdp_op_t xdp_op, u32 *prog_id); int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb); int dev_forward_skb(struct net_device *dev, struct sk_buff *skb); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 8ed679fe603f..dd88375a6580 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -907,6 +907,7 @@ enum { IFLA_XDP_FD, IFLA_XDP_ATTACHED, IFLA_XDP_FLAGS, + IFLA_XDP_PROG_ID, __IFLA_XDP_MAX, }; diff --git a/net/core/dev.c b/net/core/dev.c index 8658074ecad6..b8d6dd9e8b5c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4342,13 +4342,12 @@ static struct static_key generic_xdp_needed __read_mostly; static int generic_xdp_install(struct net_device *dev, struct netdev_xdp *xdp) { + struct bpf_prog *old = rtnl_dereference(dev->xdp_prog); struct bpf_prog *new = xdp->prog; int ret = 0; switch (xdp->command) { - case XDP_SETUP_PROG: { - struct bpf_prog *old = rtnl_dereference(dev->xdp_prog); - + case XDP_SETUP_PROG: rcu_assign_pointer(dev->xdp_prog, new); if (old) bpf_prog_put(old); @@ -4360,10 +4359,10 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_xdp *xdp) dev_disable_lro(dev); } break; - } case XDP_QUERY_PROG: - xdp->prog_attached = !!rcu_access_pointer(dev->xdp_prog); + xdp->prog_attached = !!old; + xdp->prog_id = old ? old->aux->id : 0; break; default: @@ -6937,7 +6936,8 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down) } EXPORT_SYMBOL(dev_change_proto_down); -bool __dev_xdp_attached(struct net_device *dev, xdp_op_t xdp_op) +bool __dev_xdp_attached(struct net_device *dev, xdp_op_t xdp_op, + u32 *prog_id) { struct netdev_xdp xdp; @@ -6946,6 +6946,9 @@ bool __dev_xdp_attached(struct net_device *dev, xdp_op_t xdp_op) /* Query must always succeed. 
*/ WARN_ON(xdp_op(dev, &xdp) < 0); + if (prog_id) + *prog_id = xdp.prog_id; + return xdp.prog_attached; } @@ -6991,10 +6994,10 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, xdp_chk = generic_xdp_install; if (fd >= 0) { - if (xdp_chk && __dev_xdp_attached(dev, xdp_chk)) + if (xdp_chk && __dev_xdp_attached(dev, xdp_chk, NULL)) return -EEXIST; if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && - __dev_xdp_attached(dev, xdp_op)) + __dev_xdp_attached(dev, xdp_op, NULL)) return -EBUSY; prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 2769ad9834d1..3aa57848a895 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -39,6 +39,7 @@ #include #include #include +#include #include @@ -899,7 +900,8 @@ static size_t rtnl_port_size(const struct net_device *dev, static size_t rtnl_xdp_size(void) { size_t xdp_size = nla_total_size(0) + /* nest IFLA_XDP */ - nla_total_size(1); /* XDP_ATTACHED */ + nla_total_size(1) + /* XDP_ATTACHED */ + nla_total_size(4); /* XDP_PROG_ID */ return xdp_size; } @@ -1248,15 +1250,20 @@ static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev) return 0; } -static u8 rtnl_xdp_attached_mode(struct net_device *dev) +static u8 rtnl_xdp_attached_mode(struct net_device *dev, u32 *prog_id) { const struct net_device_ops *ops = dev->netdev_ops; + const struct bpf_prog *generic_xdp_prog; ASSERT_RTNL(); - if (rcu_access_pointer(dev->xdp_prog)) + *prog_id = 0; + generic_xdp_prog = rtnl_dereference(dev->xdp_prog); + if (generic_xdp_prog) { + *prog_id = generic_xdp_prog->aux->id; return XDP_ATTACHED_SKB; - if (ops->ndo_xdp && __dev_xdp_attached(dev, ops->ndo_xdp)) + } + if (ops->ndo_xdp && __dev_xdp_attached(dev, ops->ndo_xdp, prog_id)) return XDP_ATTACHED_DRV; return XDP_ATTACHED_NONE; @@ -1265,6 +1272,7 @@ static u8 rtnl_xdp_attached_mode(struct net_device *dev) static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev) { struct nlattr *xdp; + u32 prog_id; int err; xdp = nla_nest_start(skb, IFLA_XDP); @@ -1272,10 +1280,16 @@ static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev) return -EMSGSIZE; err = nla_put_u8(skb, IFLA_XDP_ATTACHED, - rtnl_xdp_attached_mode(dev)); + rtnl_xdp_attached_mode(dev, &prog_id)); if (err) goto err_cancel; + if (prog_id) { + err = nla_put_u32(skb, IFLA_XDP_PROG_ID, prog_id); + if (err) + goto err_cancel; + } + nla_nest_end(skb, xdp); return 0; @@ -1553,6 +1567,7 @@ static const struct nla_policy ifla_xdp_policy[IFLA_XDP_MAX + 1] = { [IFLA_XDP_FD] = { .type = NLA_S32 }, [IFLA_XDP_ATTACHED] = { .type = NLA_U8 }, [IFLA_XDP_FLAGS] = { .type = NLA_U32 }, + [IFLA_XDP_PROG_ID] = { .type = NLA_U32 }, }; static const struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla) @@ -2225,7 +2240,7 @@ static int do_setlink(const struct sk_buff *skb, if (err < 0) goto errout; - if (xdp[IFLA_XDP_ATTACHED]) { + if (xdp[IFLA_XDP_ATTACHED] || xdp[IFLA_XDP_PROG_ID]) { err = -EINVAL; goto errout; } -- cgit v1.2.3 From fc1841e1c15d72b0897ecfc1627ecdc284f0ec95 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 5 May 2017 11:12:52 -0700 Subject: dm ioctl: add a new DM_DEV_ARM_POLL ioctl This ioctl will record the current global event number in the structure dm_file, so that next select or poll call will wait until new events arrived since this ioctl. The DM_DEV_ARM_POLL ioctl has the same effect as closing and reopening the handle. 
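A minimal user-space sketch of the re-arm loop described in the usage steps below (an illustration only: it assumes the usual dm-ioctl convention of passing a version-initialised struct dm_ioctl, assumes the control device supports poll, and omits error handling):

#include <fcntl.h>
#include <poll.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/dm-ioctl.h>

int main(void)
{
	struct dm_ioctl dmi;
	struct pollfd pfd;
	int fd = open("/dev/mapper/control", O_RDWR);

	if (fd < 0)
		return 1;
	pfd.fd = fd;
	pfd.events = POLLIN;

	for (;;) {
		/* re-arm: remember the current global event number */
		memset(&dmi, 0, sizeof(dmi));
		dmi.version[0] = DM_VERSION_MAJOR;
		dmi.version[1] = DM_VERSION_MINOR;
		dmi.version[2] = DM_VERSION_PATCHLEVEL;
		dmi.data_size = sizeof(dmi);
		if (ioctl(fd, DM_DEV_ARM_POLL, &dmi) < 0)
			break;

		/* scan the event numbers of the devices of interest here */

		/* sleep until an event newer than the armed one arrives */
		if (poll(&pfd, 1, -1) < 0)
			break;
	}
	close(fd);
	return 0;
}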
Using the DM_DEV_ARM_POLL ioctl is optional - if the userspace is OK with closing and reopening the /dev/mapper/control handle after select or poll, there is no need to re-arm via ioctl. Usage: 1. open the /dev/mapper/control device 2. send the DM_DEV_ARM_POLL ioctl 3. scan the event numbers of all devices we are interested in and process them 4. call select, poll or epoll on the handle (it waits until some new event happens since the DM_DEV_ARM_POLL ioctl) 5. go to step 2 Signed-off-by: Mikulas Patocka Signed-off-by: Andy Grover Signed-off-by: Mike Snitzer --- drivers/md/dm-ioctl.c | 56 +++++++++++++++++++++++++++---------------- include/uapi/linux/dm-ioctl.h | 4 +++- 2 files changed, 38 insertions(+), 22 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 6b65a538d91d..a69658b18dc9 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -28,7 +28,7 @@ struct dm_file { * poll will wait until the global event number is greater than * this value. */ - unsigned global_event_nr; + volatile unsigned global_event_nr; }; /*----------------------------------------------------------------- @@ -464,9 +464,9 @@ void dm_deferred_remove(void) * All the ioctl commands get dispatched to functions with this * prototype. */ -typedef int (*ioctl_fn)(struct dm_ioctl *param, size_t param_size); +typedef int (*ioctl_fn)(struct file *filp, struct dm_ioctl *param, size_t param_size); -static int remove_all(struct dm_ioctl *param, size_t param_size) +static int remove_all(struct file *filp, struct dm_ioctl *param, size_t param_size) { dm_hash_remove_all(true, !!(param->flags & DM_DEFERRED_REMOVE), false); param->data_size = 0; @@ -499,7 +499,7 @@ static void *get_result_buffer(struct dm_ioctl *param, size_t param_size, return ((void *) param) + param->data_start; } -static int list_devices(struct dm_ioctl *param, size_t param_size) +static int list_devices(struct file *filp, struct dm_ioctl *param, size_t param_size) { unsigned int i; struct hash_cell *hc; @@ -590,7 +590,7 @@ static void list_version_get_info(struct target_type *tt, void *param) info->vers = align_ptr(((void *) ++info->vers) + strlen(tt->name) + 1); } -static int list_versions(struct dm_ioctl *param, size_t param_size) +static int list_versions(struct file *filp, struct dm_ioctl *param, size_t param_size) { size_t len, needed = 0; struct dm_target_versions *vers; @@ -732,7 +732,7 @@ static void __dev_status(struct mapped_device *md, struct dm_ioctl *param) } } -static int dev_create(struct dm_ioctl *param, size_t param_size) +static int dev_create(struct file *filp, struct dm_ioctl *param, size_t param_size) { int r, m = DM_ANY_MINOR; struct mapped_device *md; @@ -824,7 +824,7 @@ static struct mapped_device *find_device(struct dm_ioctl *param) return md; } -static int dev_remove(struct dm_ioctl *param, size_t param_size) +static int dev_remove(struct file *filp, struct dm_ioctl *param, size_t param_size) { struct hash_cell *hc; struct mapped_device *md; @@ -889,7 +889,7 @@ static int invalid_str(char *str, void *end) return -EINVAL; } -static int dev_rename(struct dm_ioctl *param, size_t param_size) +static int dev_rename(struct file *filp, struct dm_ioctl *param, size_t param_size) { int r; char *new_data = (char *) param + param->data_start; @@ -919,7 +919,7 @@ static int dev_rename(struct dm_ioctl *param, size_t param_size) return 0; } -static int dev_set_geometry(struct dm_ioctl *param, size_t param_size) +static int dev_set_geometry(struct file *filp, struct dm_ioctl 
*param, size_t param_size) { int r = -EINVAL, x; struct mapped_device *md; @@ -1068,7 +1068,7 @@ static int do_resume(struct dm_ioctl *param) * Set or unset the suspension state of a device. * If the device already is in the requested state we just return its status. */ -static int dev_suspend(struct dm_ioctl *param, size_t param_size) +static int dev_suspend(struct file *filp, struct dm_ioctl *param, size_t param_size) { if (param->flags & DM_SUSPEND_FLAG) return do_suspend(param); @@ -1080,7 +1080,7 @@ static int dev_suspend(struct dm_ioctl *param, size_t param_size) * Copies device info back to user space, used by * the create and info ioctls. */ -static int dev_status(struct dm_ioctl *param, size_t param_size) +static int dev_status(struct file *filp, struct dm_ioctl *param, size_t param_size) { struct mapped_device *md; @@ -1171,7 +1171,7 @@ static void retrieve_status(struct dm_table *table, /* * Wait for a device to report an event */ -static int dev_wait(struct dm_ioctl *param, size_t param_size) +static int dev_wait(struct file *filp, struct dm_ioctl *param, size_t param_size) { int r = 0; struct mapped_device *md; @@ -1208,6 +1208,19 @@ out: return r; } +/* + * Remember the global event number and make it possible to poll + * for further events. + */ +static int dev_arm_poll(struct file *filp, struct dm_ioctl *param, size_t param_size) +{ + struct dm_file *priv = filp->private_data; + + priv->global_event_nr = atomic_read(&dm_global_event_nr); + + return 0; +} + static inline fmode_t get_mode(struct dm_ioctl *param) { fmode_t mode = FMODE_READ | FMODE_WRITE; @@ -1277,7 +1290,7 @@ static bool is_valid_type(enum dm_queue_mode cur, enum dm_queue_mode new) return false; } -static int table_load(struct dm_ioctl *param, size_t param_size) +static int table_load(struct file *filp, struct dm_ioctl *param, size_t param_size) { int r; struct hash_cell *hc; @@ -1364,7 +1377,7 @@ err: return r; } -static int table_clear(struct dm_ioctl *param, size_t param_size) +static int table_clear(struct file *filp, struct dm_ioctl *param, size_t param_size) { struct hash_cell *hc; struct mapped_device *md; @@ -1438,7 +1451,7 @@ static void retrieve_deps(struct dm_table *table, param->data_size = param->data_start + needed; } -static int table_deps(struct dm_ioctl *param, size_t param_size) +static int table_deps(struct file *filp, struct dm_ioctl *param, size_t param_size) { struct mapped_device *md; struct dm_table *table; @@ -1464,7 +1477,7 @@ static int table_deps(struct dm_ioctl *param, size_t param_size) * Return the status of a device as a text string for each * target. */ -static int table_status(struct dm_ioctl *param, size_t param_size) +static int table_status(struct file *filp, struct dm_ioctl *param, size_t param_size) { struct mapped_device *md; struct dm_table *table; @@ -1519,7 +1532,7 @@ static int message_for_md(struct mapped_device *md, unsigned argc, char **argv, /* * Pass a message to the target that's at the supplied device offset. 
*/ -static int target_message(struct dm_ioctl *param, size_t param_size) +static int target_message(struct file *filp, struct dm_ioctl *param, size_t param_size) { int r, argc; char **argv; @@ -1636,7 +1649,8 @@ static ioctl_fn lookup_ioctl(unsigned int cmd, int *ioctl_flags) {DM_LIST_VERSIONS_CMD, 0, list_versions}, {DM_TARGET_MSG_CMD, 0, target_message}, - {DM_DEV_SET_GEOMETRY_CMD, 0, dev_set_geometry} + {DM_DEV_SET_GEOMETRY_CMD, 0, dev_set_geometry}, + {DM_DEV_ARM_POLL, IOCTL_FLAGS_NO_PARAMS, dev_arm_poll}, }; if (unlikely(cmd >= ARRAY_SIZE(_ioctls))) @@ -1791,7 +1805,7 @@ static int validate_params(uint cmd, struct dm_ioctl *param) return 0; } -static int ctl_ioctl(uint command, struct dm_ioctl __user *user) +static int ctl_ioctl(struct file *file, uint command, struct dm_ioctl __user *user) { int r = 0; int ioctl_flags; @@ -1845,7 +1859,7 @@ static int ctl_ioctl(uint command, struct dm_ioctl __user *user) goto out; param->data_size = offsetof(struct dm_ioctl, data); - r = fn(param, input_param_size); + r = fn(file, param, input_param_size); if (unlikely(param->flags & DM_BUFFER_FULL_FLAG) && unlikely(ioctl_flags & IOCTL_FLAGS_NO_PARAMS)) @@ -1864,7 +1878,7 @@ out: static long dm_ctl_ioctl(struct file *file, uint command, ulong u) { - return (long)ctl_ioctl(command, (struct dm_ioctl __user *)u); + return (long)ctl_ioctl(file, command, (struct dm_ioctl __user *)u); } #ifdef CONFIG_COMPAT diff --git a/include/uapi/linux/dm-ioctl.h b/include/uapi/linux/dm-ioctl.h index 2f6c77aebe1a..412c06a624c8 100644 --- a/include/uapi/linux/dm-ioctl.h +++ b/include/uapi/linux/dm-ioctl.h @@ -240,7 +240,8 @@ enum { /* Added later */ DM_LIST_VERSIONS_CMD, DM_TARGET_MSG_CMD, - DM_DEV_SET_GEOMETRY_CMD + DM_DEV_SET_GEOMETRY_CMD, + DM_DEV_ARM_POLL_CMD, }; #define DM_IOCTL 0xfd @@ -255,6 +256,7 @@ enum { #define DM_DEV_SUSPEND _IOWR(DM_IOCTL, DM_DEV_SUSPEND_CMD, struct dm_ioctl) #define DM_DEV_STATUS _IOWR(DM_IOCTL, DM_DEV_STATUS_CMD, struct dm_ioctl) #define DM_DEV_WAIT _IOWR(DM_IOCTL, DM_DEV_WAIT_CMD, struct dm_ioctl) +#define DM_DEV_ARM_POLL _IOWR(DM_IOCTL, DM_DEV_ARM_POLL_CMD, struct dm_ioctl) #define DM_TABLE_LOAD _IOWR(DM_IOCTL, DM_TABLE_LOAD_CMD, struct dm_ioctl) #define DM_TABLE_CLEAR _IOWR(DM_IOCTL, DM_TABLE_CLEAR_CMD, struct dm_ioctl) -- cgit v1.2.3 From 1a63143dc18b2c6a2233d1c5afce95205d2676f4 Mon Sep 17 00:00:00 2001 From: Hans van Kranenburg Date: Tue, 6 Jun 2017 00:20:32 +0200 Subject: Btrfs: btrfs_ioctl_search_key documentation A programmer who is trying to implement calling the btrfs SEARCH or SEARCH_V2 ioctl will probably soon end up reading this struct definition. Properly document the input fields to prevent common misconceptions: 1. The search space is linear, not 3 dimensional. The invidual min/max values for objectid, type and offset cannot be used to filter the result, they only define the endpoints of an interval. 2. The transaction id (a.k.a. generation) filter applies only on transaction id of the last COW operation on a whole metadata page, not on individual items. Ad 1. The first misunderstanding was helped by the previous misleading comments on min/max type and offset: "keys returned will be >= min and <= max". Ad 2. For example, running btrfs balance will happily cause rewriting of metadata pages that contain a filesystem tree of a read only subvolume, causing transids to be increased. Also, improve descriptions of tree_id and nr_items and add in/out annotations. 
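As a hedged illustration of these semantics, a caller that wants every item of one tree sets a single [min_key, max_key] interval rather than per-field filters. The sketch assumes the pre-existing BTRFS_IOC_TREE_SEARCH ioctl and its btrfs_ioctl_search_args wrapper from the btrfs uapi header; neither is modified by this patch:

#include <string.h>
#include <sys/ioctl.h>
#include <linux/btrfs.h>

/*
 * Sketch: search every item of one tree in ascending key order.
 * 'fd' is an open descriptor on any inode of the filesystem.
 */
static int dump_whole_tree(int fd, __u64 tree_id)
{
	struct btrfs_ioctl_search_args args;

	memset(&args, 0, sizeof(args));
	args.key.tree_id = tree_id;	/* 0 would mean "subvolume tree of fd" */

	/* One linear interval [min_key, max_key], not three independent filters. */
	args.key.min_objectid = 0;
	args.key.min_type = 0;
	args.key.min_offset = 0;
	args.key.max_objectid = (__u64)-1;
	args.key.max_type = (__u32)-1;
	args.key.max_offset = (__u64)-1;

	/* No transid filtering: accept metadata pages of any generation. */
	args.key.min_transid = 0;
	args.key.max_transid = (__u64)-1;

	args.key.nr_items = 4096;	/* in: upper bound; out: items returned */

	return ioctl(fd, BTRFS_IOC_TREE_SEARCH, &args);
}

To iterate a large tree, the caller would then advance the min_* fields just past the last key returned in args.buf and repeat until fewer than nr_items items come back.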
Signed-off-by: Hans van Kranenburg Signed-off-by: David Sterba --- include/uapi/linux/btrfs.h | 63 +++++++++++++++++++++++++++++++--------------- 1 file changed, 43 insertions(+), 20 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index a456e5309238..9aa74f317747 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -426,31 +426,54 @@ struct btrfs_ioctl_ino_lookup_args { char name[BTRFS_INO_LOOKUP_PATH_MAX]; }; +/* Search criteria for the btrfs SEARCH ioctl family. */ struct btrfs_ioctl_search_key { - /* which root are we searching. 0 is the tree of tree roots */ - __u64 tree_id; - - /* keys returned will be >= min and <= max */ - __u64 min_objectid; - __u64 max_objectid; - - /* keys returned will be >= min and <= max */ - __u64 min_offset; - __u64 max_offset; - - /* max and min transids to search for */ - __u64 min_transid; - __u64 max_transid; + /* + * The tree we're searching in. 1 is the tree of tree roots, 2 is the + * extent tree, etc... + * + * A special tree_id value of 0 will cause a search in the subvolume + * tree that the inode which is passed to the ioctl is part of. + */ + __u64 tree_id; /* in */ - /* keys returned will be >= min and <= max */ - __u32 min_type; - __u32 max_type; + /* + * When doing a tree search, we're actually taking a slice from a + * linear search space of 136-bit keys. + * + * A full 136-bit tree key is composed as: + * (objectid << 72) + (type << 64) + offset + * + * The individual min and max values for objectid, type and offset + * define the min_key and max_key values for the search range. All + * metadata items with a key in the interval [min_key, max_key] will be + * returned. + * + * Additionally, we can filter the items returned on transaction id of + * the metadata block they're stored in by specifying a transid range. + * Be aware that this transaction id only denotes when the metadata + * page that currently contains the item got written the last time as + * result of a COW operation. The number does not have any meaning + * related to the transaction in which an individual item that is being + * returned was created or changed. + */ + __u64 min_objectid; /* in */ + __u64 max_objectid; /* in */ + __u64 min_offset; /* in */ + __u64 max_offset; /* in */ + __u64 min_transid; /* in */ + __u64 max_transid; /* in */ + __u32 min_type; /* in */ + __u32 max_type; /* in */ /* - * how many items did userland ask for, and how many are we - * returning + * input: The maximum amount of results desired. + * output: The actual amount of items returned, restricted by any of: + * - reaching the upper bound of the search range + * - reaching the input nr_items amount of items + * - completely filling the supplied memory buffer */ - __u32 nr_items; + __u32 nr_items; /* in/out */ /* align to 64 bits */ __u32 unused; -- cgit v1.2.3 From 8917a777be3ba566377be05117f71b93a5fd909d Mon Sep 17 00:00:00 2001 From: Ivan Delalande Date: Thu, 15 Jun 2017 18:07:07 -0700 Subject: tcp: md5: add TCP_MD5SIG_EXT socket option to set a key address prefix Replace first padding in the tcp_md5sig structure with a new flag field and address prefix length so it can be specified when configuring a new key for TCP MD5 signature. The tcpm_flags field will only be used if the socket option is TCP_MD5SIG_EXT to avoid breaking existing programs, and tcpm_prefixlen only when the TCP_MD5SIG_FLAG_PREFIX flag is set. 
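For illustration, a hedged user-space sketch that installs one key covering an entire IPv4 /24 with the new option (the peer network and key material below are placeholders; field and flag names follow the uapi change in this patch):

#include <string.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/tcp.h>

/* Sketch: sign traffic to/from 10.0.0.0/24 with one key on socket 'fd'. */
static int set_subnet_md5_key(int fd)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
	static const char key[] = "example-key";	/* placeholder secret */

	memset(&cmd, 0, sizeof(cmd));
	sin->sin_family = AF_INET;
	inet_pton(AF_INET, "10.0.0.0", &sin->sin_addr);

	cmd.tcpm_flags = TCP_MD5SIG_FLAG_PREFIX;	/* tcpm_prefixlen is valid */
	cmd.tcpm_prefixlen = 24;			/* match the whole /24 */
	cmd.tcpm_keylen = sizeof(key) - 1;
	memcpy(cmd.tcpm_key, key, cmd.tcpm_keylen);

	return setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG_EXT, &cmd, sizeof(cmd));
}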
Signed-off-by: Bob Gilligan Signed-off-by: Eric Mowat Signed-off-by: Ivan Delalande Signed-off-by: David S. Miller --- include/net/tcp.h | 1 + include/uapi/linux/tcp.h | 11 ++++++++--- net/ipv4/tcp.c | 3 ++- net/ipv4/tcp_ipv4.c | 16 ++++++++++++---- net/ipv6/tcp_ipv6.c | 25 ++++++++++++++++++------- 5 files changed, 41 insertions(+), 15 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/tcp.h b/include/net/tcp.h index 8f4076d31669..d0751b79d99c 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1848,6 +1848,7 @@ struct tcp_sock_af_ops { const struct sock *sk, const struct sk_buff *skb); int (*md5_parse)(struct sock *sk, + int optname, char __user *optval, int optlen); #endif diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 8204dcebc6f3..a5507c977497 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -117,7 +117,8 @@ enum { #define TCP_SAVED_SYN 28 /* Get SYN headers recorded for connection */ #define TCP_REPAIR_WINDOW 29 /* Get/set window parameters */ #define TCP_FASTOPEN_CONNECT 30 /* Attempt FastOpen with connect */ -#define TCP_ULP 31 /* Attach a ULP to a TCP connection */ +#define TCP_ULP 31 /* Attach a ULP to a TCP connection */ +#define TCP_MD5SIG_EXT 32 /* TCP MD5 Signature with extensions */ struct tcp_repair_opt { __u32 opt_code; @@ -235,11 +236,15 @@ enum { /* for TCP_MD5SIG socket option */ #define TCP_MD5SIG_MAXKEYLEN 80 +/* tcp_md5sig extension flags for TCP_MD5SIG_EXT */ +#define TCP_MD5SIG_FLAG_PREFIX 1 /* address prefix length */ + struct tcp_md5sig { struct __kernel_sockaddr_storage tcpm_addr; /* address associated */ - __u16 __tcpm_pad1; /* zero */ + __u8 tcpm_flags; /* extension flags */ + __u8 tcpm_prefixlen; /* address prefix */ __u16 tcpm_keylen; /* key length */ - __u32 __tcpm_pad2; /* zero */ + __u32 __tcpm_pad; /* zero */ __u8 tcpm_key[TCP_MD5SIG_MAXKEYLEN]; /* key (binary) */ }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 11e4ee281aa0..058f509ca98e 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2714,8 +2714,9 @@ static int do_tcp_setsockopt(struct sock *sk, int level, #ifdef CONFIG_TCP_MD5SIG case TCP_MD5SIG: + case TCP_MD5SIG_EXT: /* Read the IP->Key mappings from userspace */ - err = tp->af_specific->md5_parse(sk, optval, optlen); + err = tp->af_specific->md5_parse(sk, optname, optval, optlen); break; #endif case TCP_USER_TIMEOUT: diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a3c67866b780..bf407f3e20dd 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1066,11 +1066,12 @@ static void tcp_clear_md5_list(struct sock *sk) } } -static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval, - int optlen) +static int tcp_v4_parse_md5_keys(struct sock *sk, int optname, + char __user *optval, int optlen) { struct tcp_md5sig cmd; struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr; + u8 prefixlen = 32; if (optlen < sizeof(cmd)) return -EINVAL; @@ -1081,15 +1082,22 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval, if (sin->sin_family != AF_INET) return -EINVAL; + if (optname == TCP_MD5SIG_EXT && + cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) { + prefixlen = cmd.tcpm_prefixlen; + if (prefixlen > 32) + return -EINVAL; + } + if (!cmd.tcpm_keylen) return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr, - AF_INET, 32); + AF_INET, prefixlen); if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN) return -EINVAL; return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr, - AF_INET, 32, cmd.tcpm_key, 
cmd.tcpm_keylen, + AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL); } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 261689310408..68dc7472b44d 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -515,11 +515,12 @@ static struct tcp_md5sig_key *tcp_v6_md5_lookup(const struct sock *sk, return tcp_v6_md5_do_lookup(sk, &addr_sk->sk_v6_daddr); } -static int tcp_v6_parse_md5_keys(struct sock *sk, char __user *optval, - int optlen) +static int tcp_v6_parse_md5_keys(struct sock *sk, int optname, + char __user *optval, int optlen) { struct tcp_md5sig cmd; struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&cmd.tcpm_addr; + u8 prefixlen; if (optlen < sizeof(cmd)) return -EINVAL; @@ -530,12 +531,22 @@ static int tcp_v6_parse_md5_keys(struct sock *sk, char __user *optval, if (sin6->sin6_family != AF_INET6) return -EINVAL; + if (optname == TCP_MD5SIG_EXT && + cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) { + prefixlen = cmd.tcpm_prefixlen; + if (prefixlen > 128 || (ipv6_addr_v4mapped(&sin6->sin6_addr) && + prefixlen > 32)) + return -EINVAL; + } else { + prefixlen = ipv6_addr_v4mapped(&sin6->sin6_addr) ? 32 : 128; + } + if (!cmd.tcpm_keylen) { if (ipv6_addr_v4mapped(&sin6->sin6_addr)) return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin6->sin6_addr.s6_addr32[3], - AF_INET, 32); + AF_INET, prefixlen); return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin6->sin6_addr, - AF_INET6, 128); + AF_INET6, prefixlen); } if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN) @@ -543,12 +554,12 @@ static int tcp_v6_parse_md5_keys(struct sock *sk, char __user *optval, if (ipv6_addr_v4mapped(&sin6->sin6_addr)) return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin6->sin6_addr.s6_addr32[3], - AF_INET, 32, cmd.tcpm_key, + AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL); return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin6->sin6_addr, - AF_INET6, 128, cmd.tcpm_key, cmd.tcpm_keylen, - GFP_KERNEL); + AF_INET6, prefixlen, cmd.tcpm_key, + cmd.tcpm_keylen, GFP_KERNEL); } static int tcp_v6_md5_hash_headers(struct tcp_md5sig_pool *hp, -- cgit v1.2.3 From 56c1af4606f04048e3ae9ab2027a708b9684ff37 Mon Sep 17 00:00:00 2001 From: Wong Vee Khee Date: Thu, 1 Jun 2017 17:43:06 +0800 Subject: PCI: Add sysfs max_link_speed/width, current_link_speed/width, etc Expose PCIe bridges attributes such as secondary bus number, subordinate bus number, max link speed and link width, current link speed and link width via sysfs in /sys/bus/pci/devices/... This information is available via lspci, but that requires root privilege. 
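For illustration only (not part of this patch), an unprivileged program can now read these attributes straight from sysfs; the device address below is just an example:

#include <stdio.h>

/* Print e.g. "0000:00:1c.0: 8 GT/s" using the new current_link_speed file. */
static int print_link_speed(const char *bdf)
{
    char path[128], buf[64];
    FILE *f;

    snprintf(path, sizeof(path),
             "/sys/bus/pci/devices/%s/current_link_speed", bdf);
    f = fopen(path, "r");
    if (!f)
        return -1;
    if (fgets(buf, sizeof(buf), f))
        printf("%s: %s", bdf, buf);
    fclose(f);
    return 0;
}

/* Example call: print_link_speed("0000:00:1c.0"); */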
Signed-off-by: Wong Vee Khee Signed-off-by: Hui Chun Ong [bhelgaas: changelog, return errors early to unindent usual case, return errors with same style throughout] Signed-off-by: Bjorn Helgaas --- drivers/pci/pci-sysfs.c | 199 +++++++++++++++++++++++++++++++++++++++++- include/uapi/linux/pci_regs.h | 1 + 2 files changed, 196 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index 31e99613a12e..a3537cf58a20 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -154,6 +154,129 @@ static ssize_t resource_show(struct device *dev, struct device_attribute *attr, } static DEVICE_ATTR_RO(resource); +static ssize_t max_link_speed_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct pci_dev *pci_dev = to_pci_dev(dev); + u32 linkcap; + int err; + const char *speed; + + err = pcie_capability_read_dword(pci_dev, PCI_EXP_LNKCAP, &linkcap); + if (err) + return -EINVAL; + + switch (linkcap & PCI_EXP_LNKCAP_SLS) { + case PCI_EXP_LNKCAP_SLS_8_0GB: + speed = "8 GT/s"; + break; + case PCI_EXP_LNKCAP_SLS_5_0GB: + speed = "5 GT/s"; + break; + case PCI_EXP_LNKCAP_SLS_2_5GB: + speed = "2.5 GT/s"; + break; + default: + speed = "Unknown speed"; + } + + return sprintf(buf, "%s\n", speed); +} +static DEVICE_ATTR_RO(max_link_speed); + +static ssize_t max_link_width_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct pci_dev *pci_dev = to_pci_dev(dev); + u32 linkcap; + int err; + + err = pcie_capability_read_dword(pci_dev, PCI_EXP_LNKCAP, &linkcap); + if (err) + return -EINVAL; + + return sprintf(buf, "%u\n", (linkcap & PCI_EXP_LNKCAP_MLW) >> 4); +} +static DEVICE_ATTR_RO(max_link_width); + +static ssize_t current_link_speed_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct pci_dev *pci_dev = to_pci_dev(dev); + u16 linkstat; + int err; + const char *speed; + + err = pcie_capability_read_word(pci_dev, PCI_EXP_LNKSTA, &linkstat); + if (err) + return -EINVAL; + + switch (linkstat & PCI_EXP_LNKSTA_CLS) { + case PCI_EXP_LNKSTA_CLS_8_0GB: + speed = "8 GT/s"; + break; + case PCI_EXP_LNKSTA_CLS_5_0GB: + speed = "5 GT/s"; + break; + case PCI_EXP_LNKSTA_CLS_2_5GB: + speed = "2.5 GT/s"; + break; + default: + speed = "Unknown speed"; + } + + return sprintf(buf, "%s\n", speed); +} +static DEVICE_ATTR_RO(current_link_speed); + +static ssize_t current_link_width_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct pci_dev *pci_dev = to_pci_dev(dev); + u16 linkstat; + int err; + + err = pcie_capability_read_word(pci_dev, PCI_EXP_LNKSTA, &linkstat); + if (err) + return -EINVAL; + + return sprintf(buf, "%u\n", + (linkstat & PCI_EXP_LNKSTA_NLW) >> PCI_EXP_LNKSTA_NLW_SHIFT); +} +static DEVICE_ATTR_RO(current_link_width); + +static ssize_t secondary_bus_number_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct pci_dev *pci_dev = to_pci_dev(dev); + u8 sec_bus; + int err; + + err = pci_read_config_byte(pci_dev, PCI_SECONDARY_BUS, &sec_bus); + if (err) + return -EINVAL; + + return sprintf(buf, "%u\n", sec_bus); +} +static DEVICE_ATTR_RO(secondary_bus_number); + +static ssize_t subordinate_bus_number_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct pci_dev *pci_dev = to_pci_dev(dev); + u8 sub_bus; + int err; + + err = pci_read_config_byte(pci_dev, PCI_SUBORDINATE_BUS, &sub_bus); + if (err) + return -EINVAL; + + return sprintf(buf, "%u\n", sub_bus); +} +static 
DEVICE_ATTR_RO(subordinate_bus_number); + static ssize_t modalias_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -629,12 +752,17 @@ static struct attribute *pci_dev_attrs[] = { NULL, }; -static const struct attribute_group pci_dev_group = { - .attrs = pci_dev_attrs, +static struct attribute *pci_bridge_attrs[] = { + &dev_attr_subordinate_bus_number.attr, + &dev_attr_secondary_bus_number.attr, + NULL, }; -const struct attribute_group *pci_dev_groups[] = { - &pci_dev_group, +static struct attribute *pcie_dev_attrs[] = { + &dev_attr_current_link_speed.attr, + &dev_attr_current_link_width.attr, + &dev_attr_max_link_width.attr, + &dev_attr_max_link_speed.attr, NULL, }; @@ -1557,6 +1685,57 @@ static umode_t pci_dev_hp_attrs_are_visible(struct kobject *kobj, return a->mode; } +static umode_t pci_bridge_attrs_are_visible(struct kobject *kobj, + struct attribute *a, int n) +{ + struct device *dev = kobj_to_dev(kobj); + struct pci_dev *pdev = to_pci_dev(dev); + + if (pci_is_bridge(pdev)) + return a->mode; + + return 0; +} + +static umode_t pcie_dev_attrs_are_visible(struct kobject *kobj, + struct attribute *a, int n) +{ + struct device *dev = kobj_to_dev(kobj); + struct pci_dev *pdev = to_pci_dev(dev); + + if (pci_is_pcie(pdev)) + return a->mode; + + return 0; +} + +static const struct attribute_group pci_dev_group = { + .attrs = pci_dev_attrs, +}; + +const struct attribute_group *pci_dev_groups[] = { + &pci_dev_group, + NULL, +}; + +static const struct attribute_group pci_bridge_group = { + .attrs = pci_bridge_attrs, +}; + +const struct attribute_group *pci_bridge_groups[] = { + &pci_bridge_group, + NULL, +}; + +static const struct attribute_group pcie_dev_group = { + .attrs = pcie_dev_attrs, +}; + +const struct attribute_group *pcie_dev_groups[] = { + &pcie_dev_group, + NULL, +}; + static struct attribute_group pci_dev_hp_attr_group = { .attrs = pci_dev_hp_attrs, .is_visible = pci_dev_hp_attrs_are_visible, @@ -1592,12 +1771,24 @@ static struct attribute_group pci_dev_attr_group = { .is_visible = pci_dev_attrs_are_visible, }; +static struct attribute_group pci_bridge_attr_group = { + .attrs = pci_bridge_attrs, + .is_visible = pci_bridge_attrs_are_visible, +}; + +static struct attribute_group pcie_dev_attr_group = { + .attrs = pcie_dev_attrs, + .is_visible = pcie_dev_attrs_are_visible, +}; + static const struct attribute_group *pci_dev_attr_groups[] = { &pci_dev_attr_group, &pci_dev_hp_attr_group, #ifdef CONFIG_PCI_IOV &sriov_dev_attr_group, #endif + &pci_bridge_attr_group, + &pcie_dev_attr_group, NULL, }; diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index d56bb0051009..c22d3ebaca20 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -517,6 +517,7 @@ #define PCI_EXP_LNKCAP_SLS 0x0000000f /* Supported Link Speeds */ #define PCI_EXP_LNKCAP_SLS_2_5GB 0x00000001 /* LNKCAP2 SLS Vector bit 0 */ #define PCI_EXP_LNKCAP_SLS_5_0GB 0x00000002 /* LNKCAP2 SLS Vector bit 1 */ +#define PCI_EXP_LNKCAP_SLS_8_0GB 0x00000003 /* LNKCAP2 SLS Vector bit 2 */ #define PCI_EXP_LNKCAP_MLW 0x000003f0 /* Maximum Link Width */ #define PCI_EXP_LNKCAP_ASPMS 0x00000c00 /* ASPM Support */ #define PCI_EXP_LNKCAP_L0SEL 0x00007000 /* L0s Exit Latency */ -- cgit v1.2.3 From f902c1e95d5dfe5102f8467d69dc51d505f832ee Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Wed, 7 Jun 2017 11:46:12 -0300 Subject: [media] cec: add CEC_CAP_NEEDS_HPD Add a new capability CEC_CAP_NEEDS_HPD. 
If this capability is set then the hardware can only use CEC if the HDMI Hotplug Detect pin is high. Such hardware cannot handle the corner case in the CEC specification where it is possible to transmit messages even if no hotplug signal is present (needed for some displays that turn off the HPD when in standby, but still have CEC enabled). Typically hardware that needs this capability have the HPD wired to the CEC block, often to a 'power' or 'active' pin. Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/media/cec/cec-adap.c | 20 ++++++++++++++------ drivers/media/cec/cec-api.c | 5 ++++- drivers/media/cec/cec-core.c | 1 + include/media/cec.h | 1 + include/uapi/linux/cec.h | 2 ++ 5 files changed, 22 insertions(+), 7 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/media/cec/cec-adap.c b/drivers/media/cec/cec-adap.c index bd76c15ade4f..bf45977b2823 100644 --- a/drivers/media/cec/cec-adap.c +++ b/drivers/media/cec/cec-adap.c @@ -368,6 +368,8 @@ int cec_thread_func(void *_adap) * transmit should be canceled. */ err = wait_event_interruptible_timeout(adap->kthread_waitq, + (adap->needs_hpd && + (!adap->is_configured && !adap->is_configuring)) || kthread_should_stop() || (!adap->transmitting && !list_empty(&adap->transmit_queue)), @@ -383,7 +385,9 @@ int cec_thread_func(void *_adap) mutex_lock(&adap->lock); - if (kthread_should_stop()) { + if ((adap->needs_hpd && + (!adap->is_configured && !adap->is_configuring)) || + kthread_should_stop()) { cec_flush(adap); goto unlock; } @@ -682,7 +686,7 @@ int cec_transmit_msg_fh(struct cec_adapter *adap, struct cec_msg *msg, return -EINVAL; } if (!adap->is_configured && !adap->is_configuring) { - if (msg->msg[0] != 0xf0) { + if (adap->needs_hpd || msg->msg[0] != 0xf0) { dprintk(1, "%s: adapter is unconfigured\n", __func__); return -ENONET; } @@ -1158,7 +1162,9 @@ static int cec_config_log_addr(struct cec_adapter *adap, */ static void cec_adap_unconfigure(struct cec_adapter *adap) { - WARN_ON(adap->ops->adap_log_addr(adap, CEC_LOG_ADDR_INVALID)); + if (!adap->needs_hpd || + adap->phys_addr != CEC_PHYS_ADDR_INVALID) + WARN_ON(adap->ops->adap_log_addr(adap, CEC_LOG_ADDR_INVALID)); adap->log_addrs.log_addr_mask = 0; adap->is_configuring = false; adap->is_configured = false; @@ -1387,6 +1393,8 @@ void __cec_s_phys_addr(struct cec_adapter *adap, u16 phys_addr, bool block) if (phys_addr == adap->phys_addr || adap->devnode.unregistered) return; + dprintk(1, "new physical address %x.%x.%x.%x\n", + cec_phys_addr_exp(phys_addr)); if (phys_addr == CEC_PHYS_ADDR_INVALID || adap->phys_addr != CEC_PHYS_ADDR_INVALID) { adap->phys_addr = CEC_PHYS_ADDR_INVALID; @@ -1396,7 +1404,7 @@ void __cec_s_phys_addr(struct cec_adapter *adap, u16 phys_addr, bool block) if (adap->monitor_all_cnt) WARN_ON(call_op(adap, adap_monitor_all_enable, false)); mutex_lock(&adap->devnode.lock); - if (list_empty(&adap->devnode.fhs)) + if (adap->needs_hpd || list_empty(&adap->devnode.fhs)) WARN_ON(adap->ops->adap_enable(adap, false)); mutex_unlock(&adap->devnode.lock); if (phys_addr == CEC_PHYS_ADDR_INVALID) @@ -1404,7 +1412,7 @@ void __cec_s_phys_addr(struct cec_adapter *adap, u16 phys_addr, bool block) } mutex_lock(&adap->devnode.lock); - if (list_empty(&adap->devnode.fhs) && + if ((adap->needs_hpd || list_empty(&adap->devnode.fhs)) && adap->ops->adap_enable(adap, true)) { mutex_unlock(&adap->devnode.lock); return; @@ -1412,7 +1420,7 @@ void __cec_s_phys_addr(struct cec_adapter *adap, u16 phys_addr, bool block) if (adap->monitor_all_cnt && 
call_op(adap, adap_monitor_all_enable, true)) { - if (list_empty(&adap->devnode.fhs)) + if (adap->needs_hpd || list_empty(&adap->devnode.fhs)) WARN_ON(adap->ops->adap_enable(adap, false)); mutex_unlock(&adap->devnode.lock); return; diff --git a/drivers/media/cec/cec-api.c b/drivers/media/cec/cec-api.c index 0860fb458757..1359c3977101 100644 --- a/drivers/media/cec/cec-api.c +++ b/drivers/media/cec/cec-api.c @@ -202,7 +202,8 @@ static long cec_transmit(struct cec_adapter *adap, struct cec_fh *fh, err = -EPERM; else if (adap->is_configuring) err = -ENONET; - else if (!adap->is_configured && msg.msg[0] != 0xf0) + else if (!adap->is_configured && + (adap->needs_hpd || msg.msg[0] != 0xf0)) err = -ENONET; else if (cec_is_busy(adap, fh)) err = -EBUSY; @@ -521,6 +522,7 @@ static int cec_open(struct inode *inode, struct file *filp) mutex_lock(&devnode->lock); if (list_empty(&devnode->fhs) && + !adap->needs_hpd && adap->phys_addr == CEC_PHYS_ADDR_INVALID) { err = adap->ops->adap_enable(adap, true); if (err) { @@ -565,6 +567,7 @@ static int cec_release(struct inode *inode, struct file *filp) mutex_lock(&devnode->lock); list_del(&fh->list); if (list_empty(&devnode->fhs) && + !adap->needs_hpd && adap->phys_addr == CEC_PHYS_ADDR_INVALID) { WARN_ON(adap->ops->adap_enable(adap, false)); } diff --git a/drivers/media/cec/cec-core.c b/drivers/media/cec/cec-core.c index 2f87748ba4fc..b516d599d6c4 100644 --- a/drivers/media/cec/cec-core.c +++ b/drivers/media/cec/cec-core.c @@ -230,6 +230,7 @@ struct cec_adapter *cec_allocate_adapter(const struct cec_adap_ops *ops, adap->log_addrs.cec_version = CEC_OP_CEC_VERSION_2_0; adap->log_addrs.vendor_id = CEC_VENDOR_ID_NONE; adap->capabilities = caps; + adap->needs_hpd = caps & CEC_CAP_NEEDS_HPD; adap->available_log_addrs = available_las; adap->sequence = 0; adap->ops = ops; diff --git a/include/media/cec.h b/include/media/cec.h index a2e184d1df00..7e32e80b243e 100644 --- a/include/media/cec.h +++ b/include/media/cec.h @@ -164,6 +164,7 @@ struct cec_adapter { u8 available_log_addrs; u16 phys_addr; + bool needs_hpd; bool is_configuring; bool is_configured; u32 monitor_all_cnt; diff --git a/include/uapi/linux/cec.h b/include/uapi/linux/cec.h index a0dfe27bc6c7..44579a24f95d 100644 --- a/include/uapi/linux/cec.h +++ b/include/uapi/linux/cec.h @@ -336,6 +336,8 @@ static inline int cec_is_unconfigured(__u16 log_addr_mask) #define CEC_CAP_RC (1 << 4) /* Hardware can monitor all messages, not just directed and broadcast. */ #define CEC_CAP_MONITOR_ALL (1 << 5) +/* Hardware can use CEC only if the HDMI HPD pin is high. */ +#define CEC_CAP_NEEDS_HPD (1 << 6) /** * struct cec_caps - CEC capabilities structure. -- cgit v1.2.3 From 8d67ae25a9ea206f1ad53561511c1810d7838666 Mon Sep 17 00:00:00 2001 From: Ramesh Shanmugasundaram Date: Mon, 12 Jun 2017 10:26:13 -0300 Subject: [media] media: v4l2-ctrls: Reserve controls for MAX217X Reserve controls for MAX217X RF to Bits tuner family. These hybrid radio receiver chips are highly programmable and hence reserving 32 controls. 
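For illustration only (not part of this patch), a reserved base like this is consumed by the driver's uapi header, which defines its private control IDs as offsets from the base so they cannot collide with controls of other drivers; the MAX2175 driver added in the following commit does exactly that for V4L2_CID_MAX2175_I2S_ENABLE and friends. The names and offsets in this sketch are hypothetical:

#include <linux/v4l2-controls.h>

/* Hypothetical driver-private control IDs carved out of the reserved range. */
#define V4L2_CID_MAX217X_EXAMPLE_FIRST   (V4L2_CID_USER_MAX217X_BASE + 0x01)
#define V4L2_CID_MAX217X_EXAMPLE_SECOND  (V4L2_CID_USER_MAX217X_BASE + 0x02)
/* ...up to 32 IDs fit before the next reserved base. */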
Signed-off-by: Ramesh Shanmugasundaram Acked-by: Laurent Pinchart Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- include/uapi/linux/v4l2-controls.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/v4l2-controls.h b/include/uapi/linux/v4l2-controls.h index 0d2e1e01fbd5..83b28b41123f 100644 --- a/include/uapi/linux/v4l2-controls.h +++ b/include/uapi/linux/v4l2-controls.h @@ -180,6 +180,11 @@ enum v4l2_colorfx { * We reserve 16 controls for this driver. */ #define V4L2_CID_USER_TC358743_BASE (V4L2_CID_USER_BASE + 0x1080) +/* The base for the max217x driver controls. + * We reserve 32 controls for this driver + */ +#define V4L2_CID_USER_MAX217X_BASE (V4L2_CID_USER_BASE + 0x1090) + /* MPEG-class control IDs */ /* The MPEG controls are applicable to all codec controls * and the 'MPEG' part of the define is historical */ -- cgit v1.2.3 From b47b79d8a231d137ec9f9a5bef05f9e2f19a4347 Mon Sep 17 00:00:00 2001 From: Ramesh Shanmugasundaram Date: Tue, 13 Jun 2017 09:54:47 -0300 Subject: [media] media: i2c: max2175: Add MAX2175 support This patch adds driver support for the MAX2175 chip. This is Maxim Integrated's RF to Bits tuner front end chip designed for software-defined radio solutions. This driver exposes the tuner as a sub-device instance with standard and custom controls to configure the device. Signed-off-by: Ramesh Shanmugasundaram Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- Documentation/media/v4l-drivers/index.rst | 1 + Documentation/media/v4l-drivers/max2175.rst | 62 ++ drivers/media/i2c/Kconfig | 12 + drivers/media/i2c/Makefile | 2 + drivers/media/i2c/max2175.c | 1453 +++++++++++++++++++++++++++ drivers/media/i2c/max2175.h | 109 ++ include/uapi/linux/max2175.h | 28 + 7 files changed, 1667 insertions(+) create mode 100644 Documentation/media/v4l-drivers/max2175.rst create mode 100644 drivers/media/i2c/max2175.c create mode 100644 drivers/media/i2c/max2175.h create mode 100644 include/uapi/linux/max2175.h (limited to 'include/uapi/linux') diff --git a/Documentation/media/v4l-drivers/index.rst b/Documentation/media/v4l-drivers/index.rst index 90fe22a6414a..2e24d6806052 100644 --- a/Documentation/media/v4l-drivers/index.rst +++ b/Documentation/media/v4l-drivers/index.rst @@ -42,6 +42,7 @@ For more details see the file COPYING in the source distribution of Linux. davinci-vpbe fimc ivtv + max2175 meye omap3isp omap4_camera diff --git a/Documentation/media/v4l-drivers/max2175.rst b/Documentation/media/v4l-drivers/max2175.rst new file mode 100644 index 000000000000..04478c25d57a --- /dev/null +++ b/Documentation/media/v4l-drivers/max2175.rst @@ -0,0 +1,62 @@ +Maxim Integrated MAX2175 RF to bits tuner driver +================================================ + +The MAX2175 driver implements the following driver-specific controls: + +``V4L2_CID_MAX2175_I2S_ENABLE`` +------------------------------- + Enable/Disable I2S output of the tuner. This is a private control + that can be accessed only using the subdev interface. + Refer to Documentation/media/kapi/v4l2-controls for more details. + +.. flat-table:: + :header-rows: 0 + :stub-columns: 0 + :widths: 1 4 + + * - ``(0)`` + - I2S output is disabled. + * - ``(1)`` + - I2S output is enabled. + +``V4L2_CID_MAX2175_HSLS`` +------------------------- + The high-side/low-side (HSLS) control of the tuner for a given band. + +.. 
flat-table:: + :header-rows: 0 + :stub-columns: 0 + :widths: 1 4 + + * - ``(0)`` + - The LO frequency position is below the desired frequency. + * - ``(1)`` + - The LO frequency position is above the desired frequency. + +``V4L2_CID_MAX2175_RX_MODE (menu)`` +----------------------------------- + The Rx mode controls a number of preset parameters of the tuner like + sample clock (sck), sampling rate etc. These multiple settings are + provided under one single label called Rx mode in the datasheet. The + list below shows the supported modes with a brief description. + +.. flat-table:: + :header-rows: 0 + :stub-columns: 0 + :widths: 1 4 + + * - ``"Europe modes"`` + * - ``"FM 1.2" (0)`` + - This configures FM band with a sample rate of 0.512 million + samples/sec with a 10.24 MHz sck. + * - ``"DAB 1.2" (1)`` + - This configures VHF band with a sample rate of 2.048 million + samples/sec with a 32.768 MHz sck. + + * - ``"North America modes"`` + * - ``"FM 1.0" (0)`` + - This configures FM band with a sample rate of 0.7441875 million + samples/sec with a 14.88375 MHz sck. + * - ``"DAB 1.2" (1)`` + - This configures FM band with a sample rate of 0.372 million + samples/sec with a 7.441875 MHz sck. diff --git a/drivers/media/i2c/Kconfig b/drivers/media/i2c/Kconfig index c380e2475c82..c0e6e78883b0 100644 --- a/drivers/media/i2c/Kconfig +++ b/drivers/media/i2c/Kconfig @@ -796,6 +796,18 @@ config VIDEO_SAA6752HS To compile this driver as a module, choose M here: the module will be called saa6752hs. +comment "SDR tuner chips" + +config SDR_MAX2175 + tristate "Maxim 2175 RF to Bits tuner" + depends on VIDEO_V4L2 && MEDIA_SDR_SUPPORT && I2C + ---help--- + Support for Maxim 2175 tuner. It is an advanced analog/digital + radio receiver with RF-to-Bits front-end designed for SDR solutions. + + To compile this driver as a module, choose M here; the + module will be called max2175. + comment "Miscellaneous helper chips" config VIDEO_THS7303 diff --git a/drivers/media/i2c/Makefile b/drivers/media/i2c/Makefile index 62323ec66be8..5a4a761f7383 100644 --- a/drivers/media/i2c/Makefile +++ b/drivers/media/i2c/Makefile @@ -86,3 +86,5 @@ obj-$(CONFIG_VIDEO_IR_I2C) += ir-kbd-i2c.o obj-$(CONFIG_VIDEO_ML86V7667) += ml86v7667.o obj-$(CONFIG_VIDEO_OV2659) += ov2659.o obj-$(CONFIG_VIDEO_TC358743) += tc358743.o + +obj-$(CONFIG_SDR_MAX2175) += max2175.o diff --git a/drivers/media/i2c/max2175.c b/drivers/media/i2c/max2175.c new file mode 100644 index 000000000000..0d28a80f8ed2 --- /dev/null +++ b/drivers/media/i2c/max2175.c @@ -0,0 +1,1453 @@ +/* + * Maxim Integrated MAX2175 RF to Bits tuner driver + * + * This driver & most of the hard coded values are based on the reference + * application delivered by Maxim for this device. + * + * Copyright (C) 2016 Maxim Integrated Products + * Copyright (C) 2017 Renesas Electronics Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "max2175.h" + +#define DRIVER_NAME "max2175" + +#define mxm_dbg(ctx, fmt, arg...) 
dev_dbg(&ctx->client->dev, fmt, ## arg) +#define mxm_err(ctx, fmt, arg...) dev_err(&ctx->client->dev, fmt, ## arg) + +/* Rx mode */ +struct max2175_rxmode { + enum max2175_band band; /* Associated band */ + u32 freq; /* Default freq in Hz */ + u8 i2s_word_size; /* Bit value */ +}; + +/* Register map to define preset values */ +struct max2175_reg_map { + u8 idx; /* Register index */ + u8 val; /* Register value */ +}; + +static const struct max2175_rxmode eu_rx_modes[] = { + /* EU modes */ + [MAX2175_EU_FM_1_2] = { MAX2175_BAND_FM, 98256000, 1 }, + [MAX2175_DAB_1_2] = { MAX2175_BAND_VHF, 182640000, 0 }, +}; + +static const struct max2175_rxmode na_rx_modes[] = { + /* NA modes */ + [MAX2175_NA_FM_1_0] = { MAX2175_BAND_FM, 98255520, 1 }, + [MAX2175_NA_FM_2_0] = { MAX2175_BAND_FM, 98255520, 6 }, +}; + +/* + * Preset values: + * Based on Maxim MAX2175 Register Table revision: 130p10 + */ +static const u8 full_fm_eu_1p0[] = { + 0x15, 0x04, 0xb8, 0xe3, 0x35, 0x18, 0x7c, 0x00, + 0x00, 0x7d, 0x40, 0x08, 0x70, 0x7a, 0x88, 0x91, + 0x61, 0x61, 0x61, 0x61, 0x5a, 0x0f, 0x34, 0x1c, + 0x14, 0x88, 0x33, 0x02, 0x00, 0x09, 0x00, 0x65, + 0x9f, 0x2b, 0x80, 0x00, 0x95, 0x05, 0x2c, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, + 0x4a, 0x08, 0xa8, 0x0e, 0x0e, 0x2f, 0x7e, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0xab, 0x5e, 0xa9, + 0xae, 0xbb, 0x57, 0x18, 0x3b, 0x03, 0x3b, 0x64, + 0x40, 0x60, 0x00, 0x2a, 0xbf, 0x3f, 0xff, 0x9f, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0a, 0x00, + 0xff, 0xfc, 0xef, 0x1c, 0x40, 0x00, 0x00, 0x02, + 0x00, 0x00, 0xe0, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0xac, 0x40, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x75, 0x00, 0x00, + 0x00, 0x47, 0x00, 0x00, 0x11, 0x3f, 0x22, 0x00, + 0xf1, 0x00, 0x41, 0x03, 0xb0, 0x00, 0x00, 0x00, + 0x1b, +}; + +static const u8 full_fm_na_1p0[] = { + 0x13, 0x08, 0x8d, 0xc0, 0x35, 0x18, 0x7d, 0x3f, + 0x7d, 0x75, 0x40, 0x08, 0x70, 0x7a, 0x88, 0x91, + 0x61, 0x61, 0x61, 0x61, 0x5c, 0x0f, 0x34, 0x1c, + 0x14, 0x88, 0x33, 0x02, 0x00, 0x01, 0x00, 0x65, + 0x9f, 0x2b, 0x80, 0x00, 0x95, 0x05, 0x2c, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, + 0x4a, 0x08, 0xa8, 0x0e, 0x0e, 0xaf, 0x7e, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0xab, 0x5e, 0xa9, + 0xae, 0xbb, 0x57, 0x18, 0x3b, 0x03, 0x3b, 0x64, + 0x40, 0x60, 0x00, 0x2a, 0xbf, 0x3f, 0xff, 0x9f, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0a, 0x00, + 0xff, 0xfc, 0xef, 0x1c, 0x40, 0x00, 0x00, 0x02, + 0x00, 0x00, 0xe0, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0xa6, 0x40, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x75, 0x00, 0x00, + 0x00, 0x35, 0x00, 0x00, 0x11, 0x3f, 0x22, 0x00, + 0xf1, 0x00, 0x41, 0x03, 0xb0, 0x00, 0x00, 0x00, + 0x1b, +}; + +/* DAB1.2 settings */ +static const struct max2175_reg_map dab12_map[] = { + { 0x01, 0x13 }, { 0x02, 0x0d }, { 0x03, 0x15 }, { 0x04, 0x55 }, + { 0x05, 0x0a }, { 0x06, 0xa0 }, { 0x07, 0x40 }, { 0x08, 0x00 }, + { 0x09, 0x00 }, { 0x0a, 0x7d }, { 0x0b, 0x4a }, { 0x0c, 0x28 }, + { 0x0e, 0x43 }, { 0x0f, 0xb5 }, { 0x10, 0x31 }, { 0x11, 0x9e }, + { 0x12, 0x68 }, { 0x13, 0x9e }, { 0x14, 0x68 }, { 0x15, 0x58 }, + { 0x16, 0x2f }, { 0x17, 0x3f }, { 0x18, 0x40 }, { 0x1a, 0x88 }, + { 0x1b, 0xaa }, { 0x1c, 0x9a }, { 0x1d, 0x00 }, { 0x1e, 0x00 }, + { 0x23, 0x80 }, { 0x24, 0x00 }, { 0x25, 0x00 }, { 0x26, 0x00 }, + { 0x27, 0x00 }, { 0x32, 0x08 }, { 0x33, 0xf8 }, { 0x36, 0x2d }, + { 0x37, 0x7e }, { 0x55, 0xaf }, { 0x56, 0x3f }, { 0x57, 0xf8 }, + { 0x58, 0x99 }, { 
0x76, 0x00 }, { 0x77, 0x00 }, { 0x78, 0x02 }, + { 0x79, 0x40 }, { 0x82, 0x00 }, { 0x83, 0x00 }, { 0x85, 0x00 }, + { 0x86, 0x20 }, +}; + +/* EU FM 1.2 settings */ +static const struct max2175_reg_map fmeu1p2_map[] = { + { 0x01, 0x15 }, { 0x02, 0x04 }, { 0x03, 0xb8 }, { 0x04, 0xe3 }, + { 0x05, 0x35 }, { 0x06, 0x18 }, { 0x07, 0x7c }, { 0x08, 0x00 }, + { 0x09, 0x00 }, { 0x0a, 0x73 }, { 0x0b, 0x40 }, { 0x0c, 0x08 }, + { 0x0e, 0x7a }, { 0x0f, 0x88 }, { 0x10, 0x91 }, { 0x11, 0x61 }, + { 0x12, 0x61 }, { 0x13, 0x61 }, { 0x14, 0x61 }, { 0x15, 0x5a }, + { 0x16, 0x0f }, { 0x17, 0x34 }, { 0x18, 0x1c }, { 0x1a, 0x88 }, + { 0x1b, 0x33 }, { 0x1c, 0x02 }, { 0x1d, 0x00 }, { 0x1e, 0x01 }, + { 0x23, 0x80 }, { 0x24, 0x00 }, { 0x25, 0x95 }, { 0x26, 0x05 }, + { 0x27, 0x2c }, { 0x32, 0x08 }, { 0x33, 0xa8 }, { 0x36, 0x2f }, + { 0x37, 0x7e }, { 0x55, 0xbf }, { 0x56, 0x3f }, { 0x57, 0xff }, + { 0x58, 0x9f }, { 0x76, 0xac }, { 0x77, 0x40 }, { 0x78, 0x00 }, + { 0x79, 0x00 }, { 0x82, 0x47 }, { 0x83, 0x00 }, { 0x85, 0x11 }, + { 0x86, 0x3f }, +}; + +/* FM NA 1.0 settings */ +static const struct max2175_reg_map fmna1p0_map[] = { + { 0x01, 0x13 }, { 0x02, 0x08 }, { 0x03, 0x8d }, { 0x04, 0xc0 }, + { 0x05, 0x35 }, { 0x06, 0x18 }, { 0x07, 0x7d }, { 0x08, 0x3f }, + { 0x09, 0x7d }, { 0x0a, 0x75 }, { 0x0b, 0x40 }, { 0x0c, 0x08 }, + { 0x0e, 0x7a }, { 0x0f, 0x88 }, { 0x10, 0x91 }, { 0x11, 0x61 }, + { 0x12, 0x61 }, { 0x13, 0x61 }, { 0x14, 0x61 }, { 0x15, 0x5c }, + { 0x16, 0x0f }, { 0x17, 0x34 }, { 0x18, 0x1c }, { 0x1a, 0x88 }, + { 0x1b, 0x33 }, { 0x1c, 0x02 }, { 0x1d, 0x00 }, { 0x1e, 0x01 }, + { 0x23, 0x80 }, { 0x24, 0x00 }, { 0x25, 0x95 }, { 0x26, 0x05 }, + { 0x27, 0x2c }, { 0x32, 0x08 }, { 0x33, 0xa8 }, { 0x36, 0xaf }, + { 0x37, 0x7e }, { 0x55, 0xbf }, { 0x56, 0x3f }, { 0x57, 0xff }, + { 0x58, 0x9f }, { 0x76, 0xa6 }, { 0x77, 0x40 }, { 0x78, 0x00 }, + { 0x79, 0x00 }, { 0x82, 0x35 }, { 0x83, 0x00 }, { 0x85, 0x11 }, + { 0x86, 0x3f }, +}; + +/* FM NA 2.0 settings */ +static const struct max2175_reg_map fmna2p0_map[] = { + { 0x01, 0x13 }, { 0x02, 0x08 }, { 0x03, 0x8d }, { 0x04, 0xc0 }, + { 0x05, 0x35 }, { 0x06, 0x18 }, { 0x07, 0x7c }, { 0x08, 0x54 }, + { 0x09, 0xa7 }, { 0x0a, 0x55 }, { 0x0b, 0x42 }, { 0x0c, 0x48 }, + { 0x0e, 0x7a }, { 0x0f, 0x88 }, { 0x10, 0x91 }, { 0x11, 0x61 }, + { 0x12, 0x61 }, { 0x13, 0x61 }, { 0x14, 0x61 }, { 0x15, 0x5c }, + { 0x16, 0x0f }, { 0x17, 0x34 }, { 0x18, 0x1c }, { 0x1a, 0x88 }, + { 0x1b, 0x33 }, { 0x1c, 0x02 }, { 0x1d, 0x00 }, { 0x1e, 0x01 }, + { 0x23, 0x80 }, { 0x24, 0x00 }, { 0x25, 0x95 }, { 0x26, 0x05 }, + { 0x27, 0x2c }, { 0x32, 0x08 }, { 0x33, 0xa8 }, { 0x36, 0xaf }, + { 0x37, 0x7e }, { 0x55, 0xbf }, { 0x56, 0x3f }, { 0x57, 0xff }, + { 0x58, 0x9f }, { 0x76, 0xac }, { 0x77, 0xc0 }, { 0x78, 0x00 }, + { 0x79, 0x00 }, { 0x82, 0x6b }, { 0x83, 0x00 }, { 0x85, 0x11 }, + { 0x86, 0x3f }, +}; + +static const u16 ch_coeff_dab1[] = { + 0x001c, 0x0007, 0xffcd, 0x0056, 0xffa4, 0x0033, 0x0027, 0xff61, + 0x010e, 0xfec0, 0x0106, 0xffb8, 0xff1c, 0x023c, 0xfcb2, 0x039b, + 0xfd4e, 0x0055, 0x036a, 0xf7de, 0x0d21, 0xee72, 0x1499, 0x6a51, +}; + +static const u16 ch_coeff_fmeu[] = { + 0x0000, 0xffff, 0x0001, 0x0002, 0xfffa, 0xffff, 0x0015, 0xffec, + 0xffde, 0x0054, 0xfff9, 0xff52, 0x00b8, 0x00a2, 0xfe0a, 0x00af, + 0x02e3, 0xfc14, 0xfe89, 0x089d, 0xfa2e, 0xf30f, 0x25be, 0x4eb6, +}; + +static const u16 eq_coeff_fmeu1_ra02_m6db[] = { + 0x0040, 0xffc6, 0xfffa, 0x002c, 0x000d, 0xff90, 0x0037, 0x006e, + 0xffc0, 0xff5b, 0x006a, 0x00f0, 0xff57, 0xfe94, 0x0112, 0x0252, + 0xfe0c, 0xfc6a, 0x0385, 0x0553, 0xfa49, 0xf789, 0x0b91, 
0x1a10, +}; + +static const u16 ch_coeff_fmna[] = { + 0x0001, 0x0003, 0xfffe, 0xfff4, 0x0000, 0x001f, 0x000c, 0xffbc, + 0xffd3, 0x007d, 0x0075, 0xff33, 0xff01, 0x0131, 0x01ef, 0xfe60, + 0xfc7a, 0x020e, 0x0656, 0xfd94, 0xf395, 0x02ab, 0x2857, 0x3d3f, +}; + +static const u16 eq_coeff_fmna1_ra02_m6db[] = { + 0xfff1, 0xffe1, 0xffef, 0x000e, 0x0030, 0x002f, 0xfff6, 0xffa7, + 0xff9d, 0x000a, 0x00a2, 0x00b5, 0xffea, 0xfed9, 0xfec5, 0x003d, + 0x0217, 0x021b, 0xff5a, 0xfc2b, 0xfcbd, 0x02c4, 0x0ac3, 0x0e85, +}; + +static const u8 adc_presets[2][23] = { + { + 0x83, 0x00, 0xcf, 0xb4, 0x0f, 0x2c, 0x0c, 0x49, + 0x00, 0x00, 0x00, 0x8c, 0x02, 0x02, 0x00, 0x04, + 0xec, 0x82, 0x4b, 0xcc, 0x01, 0x88, 0x0c, + }, + { + 0x83, 0x00, 0xcf, 0xb4, 0x0f, 0x2c, 0x0c, 0x49, + 0x00, 0x00, 0x00, 0x8c, 0x02, 0x20, 0x33, 0x8c, + 0x57, 0xd7, 0x59, 0xb7, 0x65, 0x0e, 0x0c, + }, +}; + +/* Tuner bands */ +static const struct v4l2_frequency_band eu_bands_rf = { + .tuner = 0, + .type = V4L2_TUNER_RF, + .index = 0, + .capability = V4L2_TUNER_CAP_1HZ | V4L2_TUNER_CAP_FREQ_BANDS, + .rangelow = 65000000, + .rangehigh = 240000000, +}; + +static const struct v4l2_frequency_band na_bands_rf = { + .tuner = 0, + .type = V4L2_TUNER_RF, + .index = 0, + .capability = V4L2_TUNER_CAP_1HZ | V4L2_TUNER_CAP_FREQ_BANDS, + .rangelow = 65000000, + .rangehigh = 108000000, +}; + +/* Regmap settings */ +static const struct regmap_range max2175_regmap_volatile_range[] = { + regmap_reg_range(0x30, 0x35), + regmap_reg_range(0x3a, 0x45), + regmap_reg_range(0x59, 0x5e), + regmap_reg_range(0x73, 0x75), +}; + +static const struct regmap_access_table max2175_volatile_regs = { + .yes_ranges = max2175_regmap_volatile_range, + .n_yes_ranges = ARRAY_SIZE(max2175_regmap_volatile_range), +}; + +static const struct reg_default max2175_reg_defaults[] = { + { 0x00, 0x07}, +}; + +static const struct regmap_config max2175_regmap_config = { + .reg_bits = 8, + .val_bits = 8, + .max_register = 0xff, + .reg_defaults = max2175_reg_defaults, + .num_reg_defaults = ARRAY_SIZE(max2175_reg_defaults), + .volatile_table = &max2175_volatile_regs, + .cache_type = REGCACHE_FLAT, +}; + +struct max2175 { + struct v4l2_subdev sd; /* Sub-device */ + struct i2c_client *client; /* I2C client */ + + /* Controls */ + struct v4l2_ctrl_handler ctrl_hdl; + struct v4l2_ctrl *lna_gain; /* LNA gain value */ + struct v4l2_ctrl *if_gain; /* I/F gain value */ + struct v4l2_ctrl *pll_lock; /* PLL lock */ + struct v4l2_ctrl *i2s_en; /* I2S output enable */ + struct v4l2_ctrl *hsls; /* High-side/Low-side polarity */ + struct v4l2_ctrl *rx_mode; /* Receive mode */ + + /* Regmap */ + struct regmap *regmap; + + /* Cached configuration */ + u32 freq; /* Tuned freq In Hz */ + const struct max2175_rxmode *rx_modes; /* EU or NA modes */ + const struct v4l2_frequency_band *bands_rf; /* EU or NA bands */ + + /* Device settings */ + unsigned long xtal_freq; /* Ref Oscillator freq in Hz */ + u32 decim_ratio; + bool master; /* Master/Slave */ + bool am_hiz; /* AM Hi-Z filter */ + + /* ROM values */ + u8 rom_bbf_bw_am; + u8 rom_bbf_bw_fm; + u8 rom_bbf_bw_dab; + + /* Driver private variables */ + bool mode_resolved; /* Flag to sanity check settings */ +}; + +static inline struct max2175 *max2175_from_sd(struct v4l2_subdev *sd) +{ + return container_of(sd, struct max2175, sd); +} + +static inline struct max2175 *max2175_from_ctrl_hdl(struct v4l2_ctrl_handler *h) +{ + return container_of(h, struct max2175, ctrl_hdl); +} + +/* Get bitval of a given val */ +static inline u8 max2175_get_bitval(u8 val, u8 msb, u8 lsb) +{ + 
return (val & GENMASK(msb, lsb)) >> lsb; +} + +/* Read/Write bit(s) on top of regmap */ +static int max2175_read(struct max2175 *ctx, u8 idx, u8 *val) +{ + u32 regval; + int ret; + + ret = regmap_read(ctx->regmap, idx, ®val); + if (ret) + mxm_err(ctx, "read ret(%d): idx 0x%02x\n", ret, idx); + else + *val = regval; + + return ret; +} + +static int max2175_write(struct max2175 *ctx, u8 idx, u8 val) +{ + int ret; + + ret = regmap_write(ctx->regmap, idx, val); + if (ret) + mxm_err(ctx, "write ret(%d): idx 0x%02x val 0x%02x\n", + ret, idx, val); + + return ret; +} + +static u8 max2175_read_bits(struct max2175 *ctx, u8 idx, u8 msb, u8 lsb) +{ + u8 val; + + if (max2175_read(ctx, idx, &val)) + return 0; + + return max2175_get_bitval(val, msb, lsb); +} + +static int max2175_write_bits(struct max2175 *ctx, u8 idx, + u8 msb, u8 lsb, u8 newval) +{ + int ret = regmap_update_bits(ctx->regmap, idx, GENMASK(msb, lsb), + newval << lsb); + + if (ret) + mxm_err(ctx, "wbits ret(%d): idx 0x%02x\n", ret, idx); + + return ret; +} + +static int max2175_write_bit(struct max2175 *ctx, u8 idx, u8 bit, u8 newval) +{ + return max2175_write_bits(ctx, idx, bit, bit, newval); +} + +/* Checks expected pattern every msec until timeout */ +static int max2175_poll_timeout(struct max2175 *ctx, u8 idx, u8 msb, u8 lsb, + u8 exp_bitval, u32 timeout_us) +{ + unsigned int val; + + return regmap_read_poll_timeout(ctx->regmap, idx, val, + (max2175_get_bitval(val, msb, lsb) == exp_bitval), + 1000, timeout_us); +} + +static int max2175_poll_csm_ready(struct max2175 *ctx) +{ + int ret; + + ret = max2175_poll_timeout(ctx, 69, 1, 1, 0, 50000); + if (ret) + mxm_err(ctx, "csm not ready\n"); + + return ret; +} + +#define MAX2175_IS_BAND_AM(ctx) \ + (max2175_read_bits(ctx, 5, 1, 0) == MAX2175_BAND_AM) + +#define MAX2175_IS_BAND_VHF(ctx) \ + (max2175_read_bits(ctx, 5, 1, 0) == MAX2175_BAND_VHF) + +#define MAX2175_IS_FM_MODE(ctx) \ + (max2175_read_bits(ctx, 12, 5, 4) == 0) + +#define MAX2175_IS_FMHD_MODE(ctx) \ + (max2175_read_bits(ctx, 12, 5, 4) == 1) + +#define MAX2175_IS_DAB_MODE(ctx) \ + (max2175_read_bits(ctx, 12, 5, 4) == 2) + +static int max2175_band_from_freq(u32 freq) +{ + if (freq >= 144000 && freq <= 26100000) + return MAX2175_BAND_AM; + else if (freq >= 65000000 && freq <= 108000000) + return MAX2175_BAND_FM; + + return MAX2175_BAND_VHF; +} + +static void max2175_i2s_enable(struct max2175 *ctx, bool enable) +{ + if (enable) + /* Stuff bits are zeroed */ + max2175_write_bits(ctx, 104, 3, 0, 2); + else + /* Keep SCK alive */ + max2175_write_bits(ctx, 104, 3, 0, 9); + mxm_dbg(ctx, "i2s %sabled\n", enable ? 
"en" : "dis"); +} + +static void max2175_set_filter_coeffs(struct max2175 *ctx, u8 m_sel, + u8 bank, const u16 *coeffs) +{ + unsigned int i; + u8 coeff_addr, upper_address = 24; + + mxm_dbg(ctx, "set_filter_coeffs: m_sel %d bank %d\n", m_sel, bank); + max2175_write_bits(ctx, 114, 5, 4, m_sel); + + if (m_sel == 2) + upper_address = 12; + + for (i = 0; i < upper_address; i++) { + coeff_addr = i + bank * 24; + max2175_write(ctx, 115, coeffs[i] >> 8); + max2175_write(ctx, 116, coeffs[i]); + max2175_write(ctx, 117, coeff_addr | 1 << 7); + } + max2175_write_bit(ctx, 117, 7, 0); +} + +static void max2175_load_fmeu_1p2(struct max2175 *ctx) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(fmeu1p2_map); i++) + max2175_write(ctx, fmeu1p2_map[i].idx, fmeu1p2_map[i].val); + + ctx->decim_ratio = 36; + + /* Load the Channel Filter Coefficients into channel filter bank #2 */ + max2175_set_filter_coeffs(ctx, MAX2175_CH_MSEL, 0, ch_coeff_fmeu); + max2175_set_filter_coeffs(ctx, MAX2175_EQ_MSEL, 0, + eq_coeff_fmeu1_ra02_m6db); +} + +static void max2175_load_dab_1p2(struct max2175 *ctx) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(dab12_map); i++) + max2175_write(ctx, dab12_map[i].idx, dab12_map[i].val); + + ctx->decim_ratio = 1; + + /* Load the Channel Filter Coefficients into channel filter bank #2 */ + max2175_set_filter_coeffs(ctx, MAX2175_CH_MSEL, 2, ch_coeff_dab1); +} + +static void max2175_load_fmna_1p0(struct max2175 *ctx) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(fmna1p0_map); i++) + max2175_write(ctx, fmna1p0_map[i].idx, fmna1p0_map[i].val); +} + +static void max2175_load_fmna_2p0(struct max2175 *ctx) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(fmna2p0_map); i++) + max2175_write(ctx, fmna2p0_map[i].idx, fmna2p0_map[i].val); +} + +static void max2175_set_bbfilter(struct max2175 *ctx) +{ + if (MAX2175_IS_BAND_AM(ctx)) { + max2175_write_bits(ctx, 12, 3, 0, ctx->rom_bbf_bw_am); + mxm_dbg(ctx, "set_bbfilter AM: rom %d\n", ctx->rom_bbf_bw_am); + } else if (MAX2175_IS_DAB_MODE(ctx)) { + max2175_write_bits(ctx, 12, 3, 0, ctx->rom_bbf_bw_dab); + mxm_dbg(ctx, "set_bbfilter DAB: rom %d\n", ctx->rom_bbf_bw_dab); + } else { + max2175_write_bits(ctx, 12, 3, 0, ctx->rom_bbf_bw_fm); + mxm_dbg(ctx, "set_bbfilter FM: rom %d\n", ctx->rom_bbf_bw_fm); + } +} + +static bool max2175_set_csm_mode(struct max2175 *ctx, + enum max2175_csm_mode new_mode) +{ + int ret = max2175_poll_csm_ready(ctx); + + if (ret) + return ret; + + max2175_write_bits(ctx, 0, 2, 0, new_mode); + mxm_dbg(ctx, "set csm new mode %d\n", new_mode); + + /* Wait for a fixed settle down time depending on new mode */ + switch (new_mode) { + case MAX2175_PRESET_TUNE: + usleep_range(51100, 51500); /* 51.1ms */ + break; + /* + * Other mode switches need different sleep values depending on band & + * mode + */ + default: + break; + } + + return max2175_poll_csm_ready(ctx); +} + +static int max2175_csm_action(struct max2175 *ctx, + enum max2175_csm_mode action) +{ + int ret; + + mxm_dbg(ctx, "csm_action: %d\n", action); + + /* Other actions can be added in future when needed */ + ret = max2175_set_csm_mode(ctx, MAX2175_LOAD_TO_BUFFER); + if (ret) + return ret; + + return max2175_set_csm_mode(ctx, MAX2175_PRESET_TUNE); +} + +static int max2175_set_lo_freq(struct max2175 *ctx, u32 lo_freq) +{ + u8 lo_mult, loband_bits = 0, vcodiv_bits = 0; + u32 int_desired, frac_desired; + enum max2175_band band; + int ret; + + band = max2175_read_bits(ctx, 5, 1, 0); + switch (band) { + case MAX2175_BAND_AM: + lo_mult = 16; + break; + case 
MAX2175_BAND_FM: + if (lo_freq <= 74700000) { + lo_mult = 16; + } else if (lo_freq > 74700000 && lo_freq <= 110000000) { + loband_bits = 1; + lo_mult = 8; + } else { + loband_bits = 1; + vcodiv_bits = 3; + lo_mult = 8; + } + break; + case MAX2175_BAND_VHF: + if (lo_freq <= 210000000) + vcodiv_bits = 2; + else + vcodiv_bits = 1; + + loband_bits = 2; + lo_mult = 4; + break; + default: + loband_bits = 3; + vcodiv_bits = 2; + lo_mult = 2; + break; + } + + if (band == MAX2175_BAND_L) + lo_freq /= lo_mult; + else + lo_freq *= lo_mult; + + int_desired = lo_freq / ctx->xtal_freq; + frac_desired = div_u64((u64)(lo_freq % ctx->xtal_freq) << 20, + ctx->xtal_freq); + + /* Check CSM is not busy */ + ret = max2175_poll_csm_ready(ctx); + if (ret) + return ret; + + mxm_dbg(ctx, "lo_mult %u int %u frac %u\n", + lo_mult, int_desired, frac_desired); + + /* Write the calculated values to the appropriate registers */ + max2175_write(ctx, 1, int_desired); + max2175_write_bits(ctx, 2, 3, 0, (frac_desired >> 16) & 0xf); + max2175_write(ctx, 3, frac_desired >> 8); + max2175_write(ctx, 4, frac_desired); + max2175_write_bits(ctx, 5, 3, 2, loband_bits); + max2175_write_bits(ctx, 6, 7, 6, vcodiv_bits); + + return ret; +} + +/* + * Helper similar to DIV_ROUND_CLOSEST but an inline function that accepts s64 + * dividend and s32 divisor + */ +static inline s64 max2175_round_closest(s64 dividend, s32 divisor) +{ + if ((dividend > 0 && divisor > 0) || (dividend < 0 && divisor < 0)) + return div_s64(dividend + divisor / 2, divisor); + + return div_s64(dividend - divisor / 2, divisor); +} + +static int max2175_set_nco_freq(struct max2175 *ctx, s32 nco_freq) +{ + s32 clock_rate = ctx->xtal_freq / ctx->decim_ratio; + u32 nco_reg, abs_nco_freq = abs(nco_freq); + s64 nco_val_desired; + int ret; + + if (abs_nco_freq < clock_rate / 2) { + nco_val_desired = 2 * nco_freq; + } else { + nco_val_desired = 2 * (clock_rate - abs_nco_freq); + if (nco_freq < 0) + nco_val_desired = -nco_val_desired; + } + + nco_reg = max2175_round_closest(nco_val_desired << 20, clock_rate); + + if (nco_freq < 0) + nco_reg += 0x200000; + + /* Check CSM is not busy */ + ret = max2175_poll_csm_ready(ctx); + if (ret) + return ret; + + mxm_dbg(ctx, "freq %d desired %lld reg %u\n", + nco_freq, nco_val_desired, nco_reg); + + /* Write the calculated values to the appropriate registers */ + max2175_write_bits(ctx, 7, 4, 0, (nco_reg >> 16) & 0x1f); + max2175_write(ctx, 8, nco_reg >> 8); + max2175_write(ctx, 9, nco_reg); + + return ret; +} + +static int max2175_set_rf_freq_non_am_bands(struct max2175 *ctx, u64 freq, + u32 lo_pos) +{ + s64 adj_freq, low_if_freq; + int ret; + + mxm_dbg(ctx, "rf_freq: non AM bands\n"); + + if (MAX2175_IS_FM_MODE(ctx)) + low_if_freq = 128000; + else if (MAX2175_IS_FMHD_MODE(ctx)) + low_if_freq = 228000; + else + return max2175_set_lo_freq(ctx, freq); + + if (MAX2175_IS_BAND_VHF(ctx) == (lo_pos == MAX2175_LO_ABOVE_DESIRED)) + adj_freq = freq + low_if_freq; + else + adj_freq = freq - low_if_freq; + + ret = max2175_set_lo_freq(ctx, adj_freq); + if (ret) + return ret; + + return max2175_set_nco_freq(ctx, -low_if_freq); +} + +static int max2175_set_rf_freq(struct max2175 *ctx, u64 freq, u32 lo_pos) +{ + int ret; + + if (MAX2175_IS_BAND_AM(ctx)) + ret = max2175_set_nco_freq(ctx, freq); + else + ret = max2175_set_rf_freq_non_am_bands(ctx, freq, lo_pos); + + mxm_dbg(ctx, "set_rf_freq: ret %d freq %llu\n", ret, freq); + + return ret; +} + +static int max2175_tune_rf_freq(struct max2175 *ctx, u64 freq, u32 hsls) +{ + int ret; + + ret = 
max2175_set_rf_freq(ctx, freq, hsls); + if (ret) + return ret; + + ret = max2175_csm_action(ctx, MAX2175_BUFFER_PLUS_PRESET_TUNE); + if (ret) + return ret; + + mxm_dbg(ctx, "tune_rf_freq: old %u new %llu\n", ctx->freq, freq); + ctx->freq = freq; + + return ret; +} + +static void max2175_set_hsls(struct max2175 *ctx, u32 lo_pos) +{ + mxm_dbg(ctx, "set_hsls: lo_pos %u\n", lo_pos); + + if ((lo_pos == MAX2175_LO_BELOW_DESIRED) == MAX2175_IS_BAND_VHF(ctx)) + max2175_write_bit(ctx, 5, 4, 1); + else + max2175_write_bit(ctx, 5, 4, 0); +} + +static void max2175_set_eu_rx_mode(struct max2175 *ctx, u32 rx_mode) +{ + switch (rx_mode) { + case MAX2175_EU_FM_1_2: + max2175_load_fmeu_1p2(ctx); + break; + + case MAX2175_DAB_1_2: + max2175_load_dab_1p2(ctx); + break; + } + /* Master is the default setting */ + if (!ctx->master) + max2175_write_bit(ctx, 30, 7, 1); +} + +static void max2175_set_na_rx_mode(struct max2175 *ctx, u32 rx_mode) +{ + switch (rx_mode) { + case MAX2175_NA_FM_1_0: + max2175_load_fmna_1p0(ctx); + break; + case MAX2175_NA_FM_2_0: + max2175_load_fmna_2p0(ctx); + break; + } + /* Master is the default setting */ + if (!ctx->master) + max2175_write_bit(ctx, 30, 7, 1); + + ctx->decim_ratio = 27; + + /* Load the Channel Filter Coefficients into channel filter bank #2 */ + max2175_set_filter_coeffs(ctx, MAX2175_CH_MSEL, 0, ch_coeff_fmna); + max2175_set_filter_coeffs(ctx, MAX2175_EQ_MSEL, 0, + eq_coeff_fmna1_ra02_m6db); +} + +static int max2175_set_rx_mode(struct max2175 *ctx, u32 rx_mode) +{ + mxm_dbg(ctx, "set_rx_mode: %u am_hiz %u\n", rx_mode, ctx->am_hiz); + if (ctx->xtal_freq == MAX2175_EU_XTAL_FREQ) + max2175_set_eu_rx_mode(ctx, rx_mode); + else + max2175_set_na_rx_mode(ctx, rx_mode); + + if (ctx->am_hiz) { + mxm_dbg(ctx, "setting AM HiZ related config\n"); + max2175_write_bit(ctx, 50, 5, 1); + max2175_write_bit(ctx, 90, 7, 1); + max2175_write_bits(ctx, 73, 1, 0, 2); + max2175_write_bits(ctx, 80, 5, 0, 33); + } + + /* Load BB filter trim values saved in ROM */ + max2175_set_bbfilter(ctx); + + /* Set HSLS */ + max2175_set_hsls(ctx, ctx->hsls->cur.val); + + /* Use i2s enable settings */ + max2175_i2s_enable(ctx, ctx->i2s_en->cur.val); + + ctx->mode_resolved = true; + + return 0; +} + +static int max2175_rx_mode_from_freq(struct max2175 *ctx, u32 freq, u32 *mode) +{ + unsigned int i; + int band = max2175_band_from_freq(freq); + + /* Pick the first match always */ + for (i = 0; i <= ctx->rx_mode->maximum; i++) { + if (ctx->rx_modes[i].band == band) { + *mode = i; + mxm_dbg(ctx, "rx_mode_from_freq: freq %u mode %d\n", + freq, *mode); + return 0; + } + } + + return -EINVAL; +} + +static bool max2175_freq_rx_mode_valid(struct max2175 *ctx, + u32 mode, u32 freq) +{ + int band = max2175_band_from_freq(freq); + + return (ctx->rx_modes[mode].band == band); +} + +static void max2175_load_adc_presets(struct max2175 *ctx) +{ + unsigned int i, j; + + for (i = 0; i < ARRAY_SIZE(adc_presets); i++) + for (j = 0; j < ARRAY_SIZE(adc_presets[0]); j++) + max2175_write(ctx, 146 + j + i * 55, adc_presets[i][j]); +} + +static int max2175_init_power_manager(struct max2175 *ctx) +{ + int ret; + + /* Execute on-chip power-up/calibration */ + max2175_write_bit(ctx, 99, 2, 0); + usleep_range(1000, 1500); + max2175_write_bit(ctx, 99, 2, 1); + + /* Wait for the power manager to finish. 
*/ + ret = max2175_poll_timeout(ctx, 69, 7, 7, 1, 50000); + if (ret) + mxm_err(ctx, "init pm failed\n"); + + return ret; +} + +static int max2175_recalibrate_adc(struct max2175 *ctx) +{ + int ret; + + /* ADC Re-calibration */ + max2175_write(ctx, 150, 0xff); + max2175_write(ctx, 205, 0xff); + max2175_write(ctx, 147, 0x20); + max2175_write(ctx, 147, 0x00); + max2175_write(ctx, 202, 0x20); + max2175_write(ctx, 202, 0x00); + + ret = max2175_poll_timeout(ctx, 69, 4, 3, 3, 50000); + if (ret) + mxm_err(ctx, "adc recalibration failed\n"); + + return ret; +} + +static u8 max2175_read_rom(struct max2175 *ctx, u8 row) +{ + u8 data = 0; + + max2175_write_bit(ctx, 56, 4, 0); + max2175_write_bits(ctx, 56, 3, 0, row); + + usleep_range(2000, 2500); + max2175_read(ctx, 58, &data); + + max2175_write_bits(ctx, 56, 3, 0, 0); + + mxm_dbg(ctx, "read_rom: row %d data 0x%02x\n", row, data); + + return data; +} + +static void max2175_load_from_rom(struct max2175 *ctx) +{ + u8 data = 0; + + data = max2175_read_rom(ctx, 0); + ctx->rom_bbf_bw_am = data & 0x0f; + max2175_write_bits(ctx, 81, 3, 0, data >> 4); + + data = max2175_read_rom(ctx, 1); + ctx->rom_bbf_bw_fm = data & 0x0f; + ctx->rom_bbf_bw_dab = data >> 4; + + data = max2175_read_rom(ctx, 2); + max2175_write_bits(ctx, 82, 4, 0, data & 0x1f); + max2175_write_bits(ctx, 82, 7, 5, data >> 5); + + data = max2175_read_rom(ctx, 3); + if (ctx->am_hiz) { + data &= 0x0f; + data |= (max2175_read_rom(ctx, 7) & 0x40) >> 2; + if (!data) + data |= 2; + } else { + data = (data & 0xf0) >> 4; + data |= (max2175_read_rom(ctx, 7) & 0x80) >> 3; + if (!data) + data |= 30; + } + max2175_write_bits(ctx, 80, 5, 0, data + 31); + + data = max2175_read_rom(ctx, 6); + max2175_write_bits(ctx, 81, 7, 6, data >> 6); +} + +static void max2175_load_full_fm_eu_1p0(struct max2175 *ctx) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(full_fm_eu_1p0); i++) + max2175_write(ctx, i + 1, full_fm_eu_1p0[i]); + + usleep_range(5000, 5500); + ctx->decim_ratio = 36; +} + +static void max2175_load_full_fm_na_1p0(struct max2175 *ctx) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(full_fm_na_1p0); i++) + max2175_write(ctx, i + 1, full_fm_na_1p0[i]); + + usleep_range(5000, 5500); + ctx->decim_ratio = 27; +} + +static int max2175_core_init(struct max2175 *ctx, u32 refout_bits) +{ + int ret; + + /* MAX2175 uses 36.864MHz clock for EU & 40.154MHz for NA region */ + if (ctx->xtal_freq == MAX2175_EU_XTAL_FREQ) + max2175_load_full_fm_eu_1p0(ctx); + else + max2175_load_full_fm_na_1p0(ctx); + + /* The default settings assume master */ + if (!ctx->master) + max2175_write_bit(ctx, 30, 7, 1); + + mxm_dbg(ctx, "refout_bits %u\n", refout_bits); + + /* Set REFOUT */ + max2175_write_bits(ctx, 56, 7, 5, refout_bits); + + /* ADC Reset */ + max2175_write_bit(ctx, 99, 1, 0); + usleep_range(1000, 1500); + max2175_write_bit(ctx, 99, 1, 1); + + /* Load ADC preset values */ + max2175_load_adc_presets(ctx); + + /* Initialize the power management state machine */ + ret = max2175_init_power_manager(ctx); + if (ret) + return ret; + + /* Recalibrate ADC */ + ret = max2175_recalibrate_adc(ctx); + if (ret) + return ret; + + /* Load ROM values to appropriate registers */ + max2175_load_from_rom(ctx); + + if (ctx->xtal_freq == MAX2175_EU_XTAL_FREQ) { + /* Load FIR coefficients into bank 0 */ + max2175_set_filter_coeffs(ctx, MAX2175_CH_MSEL, 0, + ch_coeff_fmeu); + max2175_set_filter_coeffs(ctx, MAX2175_EQ_MSEL, 0, + eq_coeff_fmeu1_ra02_m6db); + } else { + /* Load FIR coefficients into bank 0 */ + max2175_set_filter_coeffs(ctx, 
MAX2175_CH_MSEL, 0, + ch_coeff_fmna); + max2175_set_filter_coeffs(ctx, MAX2175_EQ_MSEL, 0, + eq_coeff_fmna1_ra02_m6db); + } + mxm_dbg(ctx, "core initialized\n"); + + return 0; +} + +static void max2175_s_ctrl_rx_mode(struct max2175 *ctx, u32 rx_mode) +{ + /* Load mode. Range check already done */ + max2175_set_rx_mode(ctx, rx_mode); + + mxm_dbg(ctx, "s_ctrl_rx_mode: %u curr freq %u\n", rx_mode, ctx->freq); + + /* Check if current freq valid for mode & update */ + if (max2175_freq_rx_mode_valid(ctx, rx_mode, ctx->freq)) + max2175_tune_rf_freq(ctx, ctx->freq, ctx->hsls->cur.val); + else + /* Use default freq of mode if current freq is not valid */ + max2175_tune_rf_freq(ctx, ctx->rx_modes[rx_mode].freq, + ctx->hsls->cur.val); +} + +static int max2175_s_ctrl(struct v4l2_ctrl *ctrl) +{ + struct max2175 *ctx = max2175_from_ctrl_hdl(ctrl->handler); + + mxm_dbg(ctx, "s_ctrl: id 0x%x, val %u\n", ctrl->id, ctrl->val); + switch (ctrl->id) { + case V4L2_CID_MAX2175_I2S_ENABLE: + max2175_i2s_enable(ctx, ctrl->val); + break; + case V4L2_CID_MAX2175_HSLS: + max2175_set_hsls(ctx, ctrl->val); + break; + case V4L2_CID_MAX2175_RX_MODE: + max2175_s_ctrl_rx_mode(ctx, ctrl->val); + break; + } + + return 0; +} + +static u32 max2175_get_lna_gain(struct max2175 *ctx) +{ + enum max2175_band band = max2175_read_bits(ctx, 5, 1, 0); + + switch (band) { + case MAX2175_BAND_AM: + return max2175_read_bits(ctx, 51, 3, 0); + case MAX2175_BAND_FM: + return max2175_read_bits(ctx, 50, 3, 0); + case MAX2175_BAND_VHF: + return max2175_read_bits(ctx, 52, 5, 0); + default: + return 0; + } +} + +static int max2175_g_volatile_ctrl(struct v4l2_ctrl *ctrl) +{ + struct max2175 *ctx = max2175_from_ctrl_hdl(ctrl->handler); + + switch (ctrl->id) { + case V4L2_CID_RF_TUNER_LNA_GAIN: + ctrl->val = max2175_get_lna_gain(ctx); + break; + case V4L2_CID_RF_TUNER_IF_GAIN: + ctrl->val = max2175_read_bits(ctx, 49, 4, 0); + break; + case V4L2_CID_RF_TUNER_PLL_LOCK: + ctrl->val = (max2175_read_bits(ctx, 60, 7, 6) == 3); + break; + } + + return 0; +}; + +static int max2175_set_freq_and_mode(struct max2175 *ctx, u32 freq) +{ + u32 rx_mode; + int ret; + + /* Get band from frequency */ + ret = max2175_rx_mode_from_freq(ctx, freq, &rx_mode); + if (ret) + return ret; + + mxm_dbg(ctx, "set_freq_and_mode: freq %u rx_mode %d\n", freq, rx_mode); + + /* Load mode */ + max2175_set_rx_mode(ctx, rx_mode); + ctx->rx_mode->cur.val = rx_mode; + + /* Tune to the new freq given */ + return max2175_tune_rf_freq(ctx, freq, ctx->hsls->cur.val); +} + +static int max2175_s_frequency(struct v4l2_subdev *sd, + const struct v4l2_frequency *vf) +{ + struct max2175 *ctx = max2175_from_sd(sd); + u32 freq; + int ret = 0; + + mxm_dbg(ctx, "s_freq: new %u curr %u, mode_resolved %d\n", + vf->frequency, ctx->freq, ctx->mode_resolved); + + if (vf->tuner != 0) + return -EINVAL; + + freq = clamp(vf->frequency, ctx->bands_rf->rangelow, + ctx->bands_rf->rangehigh); + + /* Check new freq valid for rx_mode if already resolved */ + if (ctx->mode_resolved && + max2175_freq_rx_mode_valid(ctx, ctx->rx_mode->cur.val, freq)) + ret = max2175_tune_rf_freq(ctx, freq, ctx->hsls->cur.val); + else + /* Find default rx_mode for freq and tune to it */ + ret = max2175_set_freq_and_mode(ctx, freq); + + mxm_dbg(ctx, "s_freq: ret %d curr %u mode_resolved %d mode %u\n", + ret, ctx->freq, ctx->mode_resolved, ctx->rx_mode->cur.val); + + return ret; +} + +static int max2175_g_frequency(struct v4l2_subdev *sd, + struct v4l2_frequency *vf) +{ + struct max2175 *ctx = max2175_from_sd(sd); + int ret = 0; + + if 
(vf->tuner != 0) + return -EINVAL; + + /* RF freq */ + vf->type = V4L2_TUNER_RF; + vf->frequency = ctx->freq; + + return ret; +} + +static int max2175_enum_freq_bands(struct v4l2_subdev *sd, + struct v4l2_frequency_band *band) +{ + struct max2175 *ctx = max2175_from_sd(sd); + + if (band->tuner != 0 || band->index != 0) + return -EINVAL; + + *band = *ctx->bands_rf; + + return 0; +} + +static int max2175_g_tuner(struct v4l2_subdev *sd, struct v4l2_tuner *vt) +{ + struct max2175 *ctx = max2175_from_sd(sd); + + if (vt->index > 0) + return -EINVAL; + + strlcpy(vt->name, "RF", sizeof(vt->name)); + vt->type = V4L2_TUNER_RF; + vt->capability = V4L2_TUNER_CAP_1HZ | V4L2_TUNER_CAP_FREQ_BANDS; + vt->rangelow = ctx->bands_rf->rangelow; + vt->rangehigh = ctx->bands_rf->rangehigh; + + return 0; +} + +static int max2175_s_tuner(struct v4l2_subdev *sd, const struct v4l2_tuner *vt) +{ + /* Check tuner index is valid */ + if (vt->index > 0) + return -EINVAL; + + return 0; +} + +static const struct v4l2_subdev_tuner_ops max2175_tuner_ops = { + .s_frequency = max2175_s_frequency, + .g_frequency = max2175_g_frequency, + .enum_freq_bands = max2175_enum_freq_bands, + .g_tuner = max2175_g_tuner, + .s_tuner = max2175_s_tuner, +}; + +static const struct v4l2_subdev_ops max2175_ops = { + .tuner = &max2175_tuner_ops, +}; + +static const struct v4l2_ctrl_ops max2175_ctrl_ops = { + .s_ctrl = max2175_s_ctrl, + .g_volatile_ctrl = max2175_g_volatile_ctrl, +}; + +/* + * I2S output enable/disable configuration. This is a private control. + * Refer to Documentation/media/v4l-drivers/max2175 for more details. + */ +static const struct v4l2_ctrl_config max2175_i2s_en = { + .ops = &max2175_ctrl_ops, + .id = V4L2_CID_MAX2175_I2S_ENABLE, + .name = "I2S Enable", + .type = V4L2_CTRL_TYPE_BOOLEAN, + .min = 0, + .max = 1, + .step = 1, + .def = 1, + .is_private = 1, +}; + +/* + * HSLS value control LO freq adjacent location configuration. + * Refer to Documentation/media/v4l-drivers/max2175 for more details. + */ +static const struct v4l2_ctrl_config max2175_hsls = { + .ops = &max2175_ctrl_ops, + .id = V4L2_CID_MAX2175_HSLS, + .name = "HSLS Above/Below Desired", + .type = V4L2_CTRL_TYPE_BOOLEAN, + .min = 0, + .max = 1, + .step = 1, + .def = 1, +}; + +/* + * Rx modes below are a set of preset configurations that decides the tuner's + * sck and sample rate of transmission. They are separate for EU & NA regions. + * Refer to Documentation/media/v4l-drivers/max2175 for more details. 
+ */ +static const char * const max2175_ctrl_eu_rx_modes[] = { + [MAX2175_EU_FM_1_2] = "EU FM 1.2", + [MAX2175_DAB_1_2] = "DAB 1.2", +}; + +static const char * const max2175_ctrl_na_rx_modes[] = { + [MAX2175_NA_FM_1_0] = "NA FM 1.0", + [MAX2175_NA_FM_2_0] = "NA FM 2.0", +}; + +static const struct v4l2_ctrl_config max2175_eu_rx_mode = { + .ops = &max2175_ctrl_ops, + .id = V4L2_CID_MAX2175_RX_MODE, + .name = "RX Mode", + .type = V4L2_CTRL_TYPE_MENU, + .max = ARRAY_SIZE(max2175_ctrl_eu_rx_modes) - 1, + .def = 0, + .qmenu = max2175_ctrl_eu_rx_modes, +}; + +static const struct v4l2_ctrl_config max2175_na_rx_mode = { + .ops = &max2175_ctrl_ops, + .id = V4L2_CID_MAX2175_RX_MODE, + .name = "RX Mode", + .type = V4L2_CTRL_TYPE_MENU, + .max = ARRAY_SIZE(max2175_ctrl_na_rx_modes) - 1, + .def = 0, + .qmenu = max2175_ctrl_na_rx_modes, +}; + +static int max2175_refout_load_to_bits(struct i2c_client *client, u32 load, + u32 *bits) +{ + if (load >= 0 && load <= 40) + *bits = load / 10; + else if (load >= 60 && load <= 70) + *bits = load / 10 - 1; + else + return -EINVAL; + + return 0; +} + +static int max2175_probe(struct i2c_client *client, + const struct i2c_device_id *id) +{ + bool master = true, am_hiz = false; + u32 refout_load, refout_bits = 0; /* REFOUT disabled */ + struct v4l2_ctrl_handler *hdl; + struct fwnode_handle *fwnode; + struct device_node *np; + struct v4l2_subdev *sd; + struct regmap *regmap; + struct max2175 *ctx; + struct clk *clk; + int ret; + + /* Parse DT properties */ + np = of_parse_phandle(client->dev.of_node, "maxim,master", 0); + if (np) { + master = false; /* Slave tuner */ + of_node_put(np); + } + + fwnode = of_fwnode_handle(client->dev.of_node); + if (fwnode_property_present(fwnode, "maxim,am-hiz-filter")) + am_hiz = true; + + if (!fwnode_property_read_u32(fwnode, "maxim,refout-load", + &refout_load)) { + ret = max2175_refout_load_to_bits(client, refout_load, + &refout_bits); + if (ret) { + dev_err(&client->dev, "invalid refout_load %u\n", + refout_load); + return -EINVAL; + } + } + + clk = devm_clk_get(&client->dev, NULL); + if (IS_ERR(clk)) { + ret = PTR_ERR(clk); + dev_err(&client->dev, "cannot get clock %d\n", ret); + return -ENODEV; + } + + regmap = devm_regmap_init_i2c(client, &max2175_regmap_config); + if (IS_ERR(regmap)) { + ret = PTR_ERR(regmap); + dev_err(&client->dev, "regmap init failed %d\n", ret); + return -ENODEV; + } + + /* Alloc tuner context */ + ctx = devm_kzalloc(&client->dev, sizeof(*ctx), GFP_KERNEL); + if (ctx == NULL) + return -ENOMEM; + + sd = &ctx->sd; + ctx->master = master; + ctx->am_hiz = am_hiz; + ctx->mode_resolved = false; + ctx->regmap = regmap; + ctx->xtal_freq = clk_get_rate(clk); + dev_info(&client->dev, "xtal freq %luHz\n", ctx->xtal_freq); + + v4l2_i2c_subdev_init(sd, client, &max2175_ops); + ctx->client = client; + + sd->flags = V4L2_SUBDEV_FL_HAS_DEVNODE; + + /* Controls */ + hdl = &ctx->ctrl_hdl; + ret = v4l2_ctrl_handler_init(hdl, 7); + if (ret) + return ret; + + ctx->lna_gain = v4l2_ctrl_new_std(hdl, &max2175_ctrl_ops, + V4L2_CID_RF_TUNER_LNA_GAIN, + 0, 63, 1, 0); + ctx->lna_gain->flags |= (V4L2_CTRL_FLAG_VOLATILE | + V4L2_CTRL_FLAG_READ_ONLY); + ctx->if_gain = v4l2_ctrl_new_std(hdl, &max2175_ctrl_ops, + V4L2_CID_RF_TUNER_IF_GAIN, + 0, 31, 1, 0); + ctx->if_gain->flags |= (V4L2_CTRL_FLAG_VOLATILE | + V4L2_CTRL_FLAG_READ_ONLY); + ctx->pll_lock = v4l2_ctrl_new_std(hdl, &max2175_ctrl_ops, + V4L2_CID_RF_TUNER_PLL_LOCK, + 0, 1, 1, 0); + ctx->pll_lock->flags |= (V4L2_CTRL_FLAG_VOLATILE | + V4L2_CTRL_FLAG_READ_ONLY); + ctx->i2s_en = 
v4l2_ctrl_new_custom(hdl, &max2175_i2s_en, NULL); + ctx->hsls = v4l2_ctrl_new_custom(hdl, &max2175_hsls, NULL); + + if (ctx->xtal_freq == MAX2175_EU_XTAL_FREQ) { + ctx->rx_mode = v4l2_ctrl_new_custom(hdl, + &max2175_eu_rx_mode, NULL); + ctx->rx_modes = eu_rx_modes; + ctx->bands_rf = &eu_bands_rf; + } else { + ctx->rx_mode = v4l2_ctrl_new_custom(hdl, + &max2175_na_rx_mode, NULL); + ctx->rx_modes = na_rx_modes; + ctx->bands_rf = &na_bands_rf; + } + ctx->sd.ctrl_handler = &ctx->ctrl_hdl; + + /* Set the defaults */ + ctx->freq = ctx->bands_rf->rangelow; + + /* Register subdev */ + ret = v4l2_async_register_subdev(sd); + if (ret) { + dev_err(&client->dev, "register subdev failed\n"); + goto err_reg; + } + + /* Initialize device */ + ret = max2175_core_init(ctx, refout_bits); + if (ret) + goto err_init; + + ret = v4l2_ctrl_handler_setup(hdl); + if (ret) + goto err_init; + + return 0; + +err_init: + v4l2_async_unregister_subdev(sd); +err_reg: + v4l2_ctrl_handler_free(&ctx->ctrl_hdl); + + return ret; +} + +static int max2175_remove(struct i2c_client *client) +{ + struct v4l2_subdev *sd = i2c_get_clientdata(client); + struct max2175 *ctx = max2175_from_sd(sd); + + v4l2_ctrl_handler_free(&ctx->ctrl_hdl); + v4l2_async_unregister_subdev(sd); + + return 0; +} + +static const struct i2c_device_id max2175_id[] = { + { DRIVER_NAME, 0}, + {}, +}; +MODULE_DEVICE_TABLE(i2c, max2175_id); + +static const struct of_device_id max2175_of_ids[] = { + { .compatible = "maxim,max2175", }, + { } +}; +MODULE_DEVICE_TABLE(of, max2175_of_ids); + +static struct i2c_driver max2175_driver = { + .driver = { + .name = DRIVER_NAME, + .of_match_table = max2175_of_ids, + }, + .probe = max2175_probe, + .remove = max2175_remove, + .id_table = max2175_id, +}; + +module_i2c_driver(max2175_driver); + +MODULE_DESCRIPTION("Maxim MAX2175 RF to Bits tuner driver"); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Ramesh Shanmugasundaram "); diff --git a/drivers/media/i2c/max2175.h b/drivers/media/i2c/max2175.h new file mode 100644 index 000000000000..eb43373ce7e2 --- /dev/null +++ b/drivers/media/i2c/max2175.h @@ -0,0 +1,109 @@ +/* + * Maxim Integrated MAX2175 RF to Bits tuner driver + * + * This driver & most of the hard coded values are based on the reference + * application delivered by Maxim for this device. + * + * Copyright (C) 2016 Maxim Integrated Products + * Copyright (C) 2017 Renesas Electronics Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ + +#ifndef __MAX2175_H__ +#define __MAX2175_H__ + +#define MAX2175_EU_XTAL_FREQ 36864000 /* In Hz */ +#define MAX2175_NA_XTAL_FREQ 40186125 /* In Hz */ + +enum max2175_region { + MAX2175_REGION_EU = 0, /* Europe */ + MAX2175_REGION_NA, /* North America */ +}; + +enum max2175_band { + MAX2175_BAND_AM = 0, + MAX2175_BAND_FM, + MAX2175_BAND_VHF, + MAX2175_BAND_L, +}; + +enum max2175_eu_mode { + /* EU modes */ + MAX2175_EU_FM_1_2 = 0, + MAX2175_DAB_1_2, + + /* + * Other possible modes to add in future + * MAX2175_DAB_1_0, + * MAX2175_DAB_1_3, + * MAX2175_EU_FM_2_2, + * MAX2175_EU_FMHD_4_0, + * MAX2175_EU_AM_1_0, + * MAX2175_EU_AM_2_2, + */ +}; + +enum max2175_na_mode { + /* NA modes */ + MAX2175_NA_FM_1_0 = 0, + MAX2175_NA_FM_2_0, + + /* + * Other possible modes to add in future + * MAX2175_NA_FMHD_1_0, + * MAX2175_NA_FMHD_1_2, + * MAX2175_NA_AM_1_0, + * MAX2175_NA_AM_1_2, + */ +}; + +/* Supported I2S modes */ +enum { + MAX2175_I2S_MODE0 = 0, + MAX2175_I2S_MODE1, + MAX2175_I2S_MODE2, + MAX2175_I2S_MODE3, + MAX2175_I2S_MODE4, +}; + +/* Coefficient table groups */ +enum { + MAX2175_CH_MSEL = 0, + MAX2175_EQ_MSEL, + MAX2175_AA_MSEL, +}; + +/* HSLS LO injection polarity */ +enum { + MAX2175_LO_BELOW_DESIRED = 0, + MAX2175_LO_ABOVE_DESIRED, +}; + +/* Channel FSM modes */ +enum max2175_csm_mode { + MAX2175_LOAD_TO_BUFFER = 0, + MAX2175_PRESET_TUNE, + MAX2175_SEARCH, + MAX2175_AF_UPDATE, + MAX2175_JUMP_FAST_TUNE, + MAX2175_CHECK, + MAX2175_LOAD_AND_SWAP, + MAX2175_END, + MAX2175_BUFFER_PLUS_PRESET_TUNE, + MAX2175_BUFFER_PLUS_SEARCH, + MAX2175_BUFFER_PLUS_AF_UPDATE, + MAX2175_BUFFER_PLUS_JUMP_FAST_TUNE, + MAX2175_BUFFER_PLUS_CHECK, + MAX2175_BUFFER_PLUS_LOAD_AND_SWAP, + MAX2175_NO_ACTION +}; + +#endif /* __MAX2175_H__ */ diff --git a/include/uapi/linux/max2175.h b/include/uapi/linux/max2175.h new file mode 100644 index 000000000000..3ef5d264440f --- /dev/null +++ b/include/uapi/linux/max2175.h @@ -0,0 +1,28 @@ +/* + * max2175.h + * + * Maxim Integrated MAX2175 RF to Bits tuner driver - user space header file. + * + * Copyright (C) 2016 Maxim Integrated Products + * Copyright (C) 2017 Renesas Electronics Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef __UAPI_MAX2175_H_ +#define __UAPI_MAX2175_H_ + +#include + +#define V4L2_CID_MAX2175_I2S_ENABLE (V4L2_CID_USER_MAX217X_BASE + 0x01) +#define V4L2_CID_MAX2175_HSLS (V4L2_CID_USER_MAX217X_BASE + 0x02) +#define V4L2_CID_MAX2175_RX_MODE (V4L2_CID_USER_MAX217X_BASE + 0x03) + +#endif /* __UAPI_MAX2175_H_ */ -- cgit v1.2.3 From c28f2118a2129f8e2c1cdf2454ffe4833885edff Mon Sep 17 00:00:00 2001 From: Ramesh Shanmugasundaram Date: Mon, 12 Jun 2017 10:26:16 -0300 Subject: [media] media: Add new SDR formats PC16, PC18 & PC20 This patch adds support for the three new SDR formats. These formats were prefixed with "planar" indicating I & Q data are not interleaved as in other formats. Here, I & Q data constitutes the top half and bottom half of the received buffer respectively. V4L2_SDR_FMT_PCU16BE - 14-bit complex (I & Q) unsigned big-endian sample inside 16-bit. 
V4L2 FourCC: PC16 V4L2_SDR_FMT_PCU18BE - 16-bit complex (I & Q) unsigned big-endian sample inside 18-bit. V4L2 FourCC: PC18 V4L2_SDR_FMT_PCU20BE - 18-bit complex (I & Q) unsigned big-endian sample inside 20-bit. V4L2 FourCC: PC20 Signed-off-by: Ramesh Shanmugasundaram Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/media/v4l2-core/v4l2-ioctl.c | 3 +++ include/uapi/linux/videodev2.h | 3 +++ 2 files changed, 6 insertions(+) (limited to 'include/uapi/linux') diff --git a/drivers/media/v4l2-core/v4l2-ioctl.c b/drivers/media/v4l2-core/v4l2-ioctl.c index 4f27cfa134a1..ce40183d9daa 100644 --- a/drivers/media/v4l2-core/v4l2-ioctl.c +++ b/drivers/media/v4l2-core/v4l2-ioctl.c @@ -1229,6 +1229,9 @@ static void v4l_fill_fmtdesc(struct v4l2_fmtdesc *fmt) case V4L2_SDR_FMT_CS8: descr = "Complex S8"; break; case V4L2_SDR_FMT_CS14LE: descr = "Complex S14LE"; break; case V4L2_SDR_FMT_RU12LE: descr = "Real U12LE"; break; + case V4L2_SDR_FMT_PCU16BE: descr = "Planar Complex U16BE"; break; + case V4L2_SDR_FMT_PCU18BE: descr = "Planar Complex U18BE"; break; + case V4L2_SDR_FMT_PCU20BE: descr = "Planar Complex U20BE"; break; case V4L2_TCH_FMT_DELTA_TD16: descr = "16-bit signed deltas"; break; case V4L2_TCH_FMT_DELTA_TD08: descr = "8-bit signed deltas"; break; case V4L2_TCH_FMT_TU16: descr = "16-bit unsigned touch data"; break; diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index 2b8feb86d09e..45cf7359822c 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -669,6 +669,9 @@ struct v4l2_pix_format { #define V4L2_SDR_FMT_CS8 v4l2_fourcc('C', 'S', '0', '8') /* complex s8 */ #define V4L2_SDR_FMT_CS14LE v4l2_fourcc('C', 'S', '1', '4') /* complex s14le */ #define V4L2_SDR_FMT_RU12LE v4l2_fourcc('R', 'U', '1', '2') /* real u12le */ +#define V4L2_SDR_FMT_PCU16BE v4l2_fourcc('P', 'C', '1', '6') /* planar complex u16be */ +#define V4L2_SDR_FMT_PCU18BE v4l2_fourcc('P', 'C', '1', '8') /* planar complex u18be */ +#define V4L2_SDR_FMT_PCU20BE v4l2_fourcc('P', 'C', '2', '0') /* planar complex u20be */ /* Touch formats - used for Touch devices */ #define V4L2_TCH_FMT_DELTA_TD16 v4l2_fourcc('T', 'D', '1', '6') /* 16-bit signed deltas */ -- cgit v1.2.3 From b45cd756368823ce9e19bcb8c69d575595df5c5a Mon Sep 17 00:00:00 2001 From: Philipp Zabel Date: Wed, 7 Jun 2017 15:33:54 -0300 Subject: [media] add mux and video interface bridge entity functions Add two new media entity function definitions for video multiplexers and video interface bridges. - renamed MEDIA_ENT_F_MUX to MEDIA_ENT_F_VID_MUX Signed-off-by: Philipp Zabel Signed-off-by: Steve Longerbeam Acked-by: Sakari Ailus Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- Documentation/media/uapi/mediactl/media-types.rst | 21 +++++++++++++++++++++ include/uapi/linux/media.h | 6 ++++++ 2 files changed, 27 insertions(+) (limited to 'include/uapi/linux') diff --git a/Documentation/media/uapi/mediactl/media-types.rst b/Documentation/media/uapi/mediactl/media-types.rst index 2a5164aea2b4..71078565d644 100644 --- a/Documentation/media/uapi/mediactl/media-types.rst +++ b/Documentation/media/uapi/mediactl/media-types.rst @@ -299,6 +299,27 @@ Types and flags used to represent the media graph elements received on its sink pad and outputs the statistics data on its source pad. + - .. row 29 + + .. _MEDIA-ENT-F-VID-MUX: + + - ``MEDIA_ENT_F_VID_MUX`` + + - Video multiplexer. 
An entity capable of multiplexing must have at + least two sink pads and one source pad, and must pass the video + frame(s) received from the active sink pad to the source pad. + + - .. row 30 + + .. _MEDIA-ENT-F-VID-IF-BRIDGE: + + - ``MEDIA_ENT_F_VID_IF_BRIDGE`` + + - Video interface bridge. A video interface bridge entity must have at + least one sink pad and at least one source pad. It receives video + frames on its sink pad from an input video bus of one type (HDMI, eDP, + MIPI CSI-2, ...), and outputs them on its source pad to an output + video bus of another type (eDP, MIPI CSI-2, parallel, ...). .. tabularcolumns:: |p{5.5cm}|p{12.0cm}| diff --git a/include/uapi/linux/media.h b/include/uapi/linux/media.h index 4890787731b8..fac96c64fe51 100644 --- a/include/uapi/linux/media.h +++ b/include/uapi/linux/media.h @@ -104,6 +104,12 @@ struct media_device_info { #define MEDIA_ENT_F_PROC_VIDEO_SCALER (MEDIA_ENT_F_BASE + 0x4005) #define MEDIA_ENT_F_PROC_VIDEO_STATISTICS (MEDIA_ENT_F_BASE + 0x4006) +/* + * Switch and bridge entitites + */ +#define MEDIA_ENT_F_VID_MUX (MEDIA_ENT_F_BASE + 0x5001) +#define MEDIA_ENT_F_VID_IF_BRIDGE (MEDIA_ENT_F_BASE + 0x5002) + /* * Connectors */ -- cgit v1.2.3 From ac6424b981bce1c4bc55675c6ce11bfe1bbfa64f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 20 Jun 2017 12:06:13 +0200 Subject: sched/wait: Rename wait_queue_t => wait_queue_entry_t Rename: wait_queue_t => wait_queue_entry_t 'wait_queue_t' was always a slight misnomer: its name implies that it's a "queue", but in reality it's a queue *entry*. The 'real' queue is the wait queue head, which had to carry the name. Start sorting this out by renaming it to 'wait_queue_entry_t'. This also allows the real structure name 'struct __wait_queue' to lose its double underscore and become 'struct wait_queue_entry', which is the more canonical nomenclature for such data types. 
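For illustration only (not part of the original commit): a minimal sketch of the open-coded wait pattern that this rename touches throughout the tree, spelled with the new type name. It mirrors the callers converted below (for example sleep_cond() in octeon_main.h); the wait-queue helpers themselves (init_waitqueue_entry(), add_wait_queue(), remove_wait_queue(), wake_up()) are unchanged by this patch, only the per-waiter entry type is renamed. The my_wq head and my_cond flag are hypothetical, and a real caller would protect the condition with appropriate locking.

#include <linux/wait.h>
#include <linux/sched.h>

static DECLARE_WAIT_QUEUE_HEAD(my_wq);	/* the head type, wait_queue_head_t, keeps its name */
static bool my_cond;

/* Open-coded waiter, written against the renamed entry type. */
static void my_wait_for_cond(void)
{
	wait_queue_entry_t wait;	/* was: wait_queue_t wait; */

	init_waitqueue_entry(&wait, current);
	add_wait_queue(&my_wq, &wait);

	for (;;) {
		/* set state before re-checking to avoid missing a wakeup */
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (my_cond)
			break;
		schedule();
	}

	__set_current_state(TASK_RUNNING);
	remove_wait_queue(&my_wq, &wait);
}

/* The waker side is untouched by the rename. */
static void my_signal_cond(void)
{
	my_cond = true;
	wake_up(&my_wq);
}

Only the declaration of the entry changes; everything else in the pattern is exactly what the mechanical substitutions in the diff below perform, file by file.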
Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- Documentation/DocBook/kernel-hacking.tmpl | 2 +- Documentation/filesystems/autofs4.txt | 12 ++-- block/blk-mq.c | 2 +- block/blk-wbt.c | 2 +- block/kyber-iosched.c | 8 +-- drivers/bluetooth/btmrvl_main.c | 2 +- drivers/char/ipmi/ipmi_watchdog.c | 2 +- drivers/gpu/drm/i915/i915_gem_request.h | 2 +- drivers/gpu/drm/i915/i915_sw_fence.c | 14 ++--- drivers/gpu/drm/i915/i915_sw_fence.h | 2 +- drivers/gpu/drm/radeon/radeon.h | 2 +- drivers/gpu/drm/radeon/radeon_fence.c | 2 +- drivers/gpu/vga/vgaarb.c | 2 +- drivers/infiniband/hw/i40iw/i40iw_main.c | 2 +- drivers/md/bcache/btree.h | 2 +- drivers/net/ethernet/cavium/liquidio/octeon_main.h | 4 +- drivers/net/wireless/cisco/airo.c | 2 +- .../net/wireless/intersil/hostap/hostap_ioctl.c | 2 +- drivers/net/wireless/marvell/libertas/main.c | 2 +- drivers/scsi/dpt/dpti_i2o.h | 2 +- drivers/scsi/ips.c | 12 ++-- drivers/scsi/ips.h | 4 +- .../staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c | 6 +- .../staging/lustre/lnet/klnds/socklnd/socklnd_cb.c | 4 +- drivers/staging/lustre/lnet/libcfs/debug.c | 2 +- drivers/staging/lustre/lnet/libcfs/tracefile.c | 2 +- drivers/staging/lustre/lnet/lnet/lib-eq.c | 2 +- drivers/staging/lustre/lnet/lnet/lib-socket.c | 2 +- drivers/staging/lustre/lustre/fid/fid_request.c | 6 +- drivers/staging/lustre/lustre/include/lustre_lib.h | 4 +- drivers/staging/lustre/lustre/llite/lcommon_cl.c | 2 +- .../staging/lustre/lustre/lov/lov_cl_internal.h | 2 +- drivers/staging/lustre/lustre/lov/lov_object.c | 2 +- drivers/staging/lustre/lustre/obdclass/lu_object.c | 6 +- drivers/tty/synclink_gt.c | 2 +- drivers/vfio/virqfd.c | 2 +- drivers/vhost/vhost.c | 2 +- drivers/vhost/vhost.h | 2 +- fs/autofs4/autofs_i.h | 2 +- fs/autofs4/waitq.c | 18 +++--- fs/cachefiles/internal.h | 2 +- fs/cachefiles/namei.c | 2 +- fs/cachefiles/rdwr.c | 2 +- fs/dax.c | 4 +- fs/eventfd.c | 2 +- fs/eventpoll.c | 10 ++-- fs/fs_pin.c | 2 +- fs/nfs/nfs4proc.c | 4 +- fs/nilfs2/segment.c | 2 +- fs/orangefs/orangefs-bufmap.c | 4 +- fs/reiserfs/journal.c | 2 +- fs/select.c | 4 +- fs/signalfd.c | 2 +- fs/userfaultfd.c | 8 +-- include/linux/blk-mq.h | 2 +- include/linux/eventfd.h | 4 +- include/linux/kvm_irqfd.h | 2 +- include/linux/pagemap.h | 2 +- include/linux/poll.h | 2 +- include/linux/vfio.h | 2 +- include/linux/wait.h | 67 +++++++++++----------- include/net/af_unix.h | 2 +- include/uapi/linux/auto_fs.h | 4 +- include/uapi/linux/auto_fs4.h | 4 +- kernel/exit.c | 4 +- kernel/futex.c | 2 +- kernel/sched/completion.c | 2 +- kernel/sched/core.c | 2 +- kernel/sched/wait.c | 42 +++++++------- kernel/workqueue.c | 4 +- mm/filemap.c | 10 ++-- mm/memcontrol.c | 8 +-- mm/mempool.c | 2 +- mm/shmem.c | 2 +- net/9p/trans_fd.c | 4 +- net/bluetooth/bnep/core.c | 2 +- net/bluetooth/cmtp/core.c | 2 +- net/bluetooth/hidp/core.c | 2 +- net/core/datagram.c | 2 +- net/unix/af_unix.c | 4 +- sound/core/control.c | 2 +- sound/core/hwdep.c | 2 +- sound/core/init.c | 2 +- sound/core/oss/pcm_oss.c | 4 +- sound/core/pcm_lib.c | 2 +- sound/core/pcm_native.c | 4 +- sound/core/rawmidi.c | 8 +-- sound/core/seq/seq_fifo.c | 2 +- sound/core/seq/seq_memory.c | 2 +- sound/core/timer.c | 2 +- sound/isa/wavefront/wavefront_synth.c | 2 +- sound/pci/mixart/mixart_core.c | 4 +- sound/pci/ymfpci/ymfpci_main.c | 2 +- virt/kvm/eventfd.c | 2 +- 94 files changed, 216 insertions(+), 213 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/DocBook/kernel-hacking.tmpl 
b/Documentation/DocBook/kernel-hacking.tmpl index da5c087462b1..c3c705591532 100644 --- a/Documentation/DocBook/kernel-hacking.tmpl +++ b/Documentation/DocBook/kernel-hacking.tmpl @@ -819,7 +819,7 @@ printk(KERN_INFO "my ip: %pI4\n", &ipaddress); certain condition is true. They must be used carefully to ensure there is no race condition. You declare a wait_queue_head_t, and then processes which want to - wait for that condition declare a wait_queue_t + wait for that condition declare a wait_queue_entry_t referring to themselves, and place that in the queue. diff --git a/Documentation/filesystems/autofs4.txt b/Documentation/filesystems/autofs4.txt index f10dd590f69f..8444dc3d57e8 100644 --- a/Documentation/filesystems/autofs4.txt +++ b/Documentation/filesystems/autofs4.txt @@ -316,7 +316,7 @@ For version 5, the format of the message is: struct autofs_v5_packet { int proto_version; /* Protocol version */ int type; /* Type of packet */ - autofs_wqt_t wait_queue_token; + autofs_wqt_t wait_queue_entry_token; __u32 dev; __u64 ino; __u32 uid; @@ -341,12 +341,12 @@ The pipe will be set to "packet mode" (equivalent to passing `O_DIRECT`) to _pipe2(2)_ so that a read from the pipe will return at most one packet, and any unread portion of a packet will be discarded. -The `wait_queue_token` is a unique number which can identify a +The `wait_queue_entry_token` is a unique number which can identify a particular request to be acknowledged. When a message is sent over the pipe the affected dentry is marked as either "active" or "expiring" and other accesses to it block until the message is acknowledged using one of the ioctls below and the relevant -`wait_queue_token`. +`wait_queue_entry_token`. Communicating with autofs: root directory ioctls ------------------------------------------------ @@ -358,7 +358,7 @@ capability, or must be the automount daemon. The available ioctl commands are: - **AUTOFS_IOC_READY**: a notification has been handled. The argument - to the ioctl command is the "wait_queue_token" number + to the ioctl command is the "wait_queue_entry_token" number corresponding to the notification being acknowledged. - **AUTOFS_IOC_FAIL**: similar to above, but indicates failure with the error code `ENOENT`. @@ -382,14 +382,14 @@ The available ioctl commands are: struct autofs_packet_expire_multi { int proto_version; /* Protocol version */ int type; /* Type of packet */ - autofs_wqt_t wait_queue_token; + autofs_wqt_t wait_queue_entry_token; int len; char name[NAME_MAX+1]; }; is required. This is filled in with the name of something that can be unmounted or removed. If nothing can be expired, - `errno` is set to `EAGAIN`. Even though a `wait_queue_token` + `errno` is set to `EAGAIN`. Even though a `wait_queue_entry_token` is present in the structure, no "wait queue" is established and no acknowledgment is needed. 
- **AUTOFS_IOC_EXPIRE_MULTI**: This is similar to diff --git a/block/blk-mq.c b/block/blk-mq.c index bb66c96850b1..a083f95e04b1 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -926,7 +926,7 @@ static bool reorder_tags_to_front(struct list_head *list) return first != NULL; } -static int blk_mq_dispatch_wake(wait_queue_t *wait, unsigned mode, int flags, +static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, int flags, void *key) { struct blk_mq_hw_ctx *hctx; diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 17676f4d7fd1..5f3a37c2784c 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -503,7 +503,7 @@ static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw) } static inline bool may_queue(struct rq_wb *rwb, struct rq_wait *rqw, - wait_queue_t *wait, unsigned long rw) + wait_queue_entry_t *wait, unsigned long rw) { /* * inc it here even if disabled, since we'll dec it at completion. diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index b9faabc75fdb..b95d6bd714c0 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c @@ -99,7 +99,7 @@ struct kyber_hctx_data { struct list_head rqs[KYBER_NUM_DOMAINS]; unsigned int cur_domain; unsigned int batching; - wait_queue_t domain_wait[KYBER_NUM_DOMAINS]; + wait_queue_entry_t domain_wait[KYBER_NUM_DOMAINS]; atomic_t wait_index[KYBER_NUM_DOMAINS]; }; @@ -507,7 +507,7 @@ static void kyber_flush_busy_ctxs(struct kyber_hctx_data *khd, } } -static int kyber_domain_wake(wait_queue_t *wait, unsigned mode, int flags, +static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags, void *key) { struct blk_mq_hw_ctx *hctx = READ_ONCE(wait->private); @@ -523,7 +523,7 @@ static int kyber_get_domain_token(struct kyber_queue_data *kqd, { unsigned int sched_domain = khd->cur_domain; struct sbitmap_queue *domain_tokens = &kqd->domain_tokens[sched_domain]; - wait_queue_t *wait = &khd->domain_wait[sched_domain]; + wait_queue_entry_t *wait = &khd->domain_wait[sched_domain]; struct sbq_wait_state *ws; int nr; @@ -734,7 +734,7 @@ static int kyber_##name##_waiting_show(void *data, struct seq_file *m) \ { \ struct blk_mq_hw_ctx *hctx = data; \ struct kyber_hctx_data *khd = hctx->sched_data; \ - wait_queue_t *wait = &khd->domain_wait[domain]; \ + wait_queue_entry_t *wait = &khd->domain_wait[domain]; \ \ seq_printf(m, "%d\n", !list_empty_careful(&wait->task_list)); \ return 0; \ diff --git a/drivers/bluetooth/btmrvl_main.c b/drivers/bluetooth/btmrvl_main.c index c38cb5b91291..fe850f0567cb 100644 --- a/drivers/bluetooth/btmrvl_main.c +++ b/drivers/bluetooth/btmrvl_main.c @@ -602,7 +602,7 @@ static int btmrvl_service_main_thread(void *data) struct btmrvl_thread *thread = data; struct btmrvl_private *priv = thread->priv; struct btmrvl_adapter *adapter = priv->adapter; - wait_queue_t wait; + wait_queue_entry_t wait; struct sk_buff *skb; ulong flags; diff --git a/drivers/char/ipmi/ipmi_watchdog.c b/drivers/char/ipmi/ipmi_watchdog.c index d165af8abe36..a5c6cfe71a8e 100644 --- a/drivers/char/ipmi/ipmi_watchdog.c +++ b/drivers/char/ipmi/ipmi_watchdog.c @@ -821,7 +821,7 @@ static ssize_t ipmi_read(struct file *file, loff_t *ppos) { int rv = 0; - wait_queue_t wait; + wait_queue_entry_t wait; if (count <= 0) return 0; diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h index 129c58bb4805..a4a920c4c454 100644 --- a/drivers/gpu/drm/i915/i915_gem_request.h +++ b/drivers/gpu/drm/i915/i915_gem_request.h @@ -123,7 +123,7 @@ struct drm_i915_gem_request { * It is used by the 
driver to then queue the request for execution. */ struct i915_sw_fence submit; - wait_queue_t submitq; + wait_queue_entry_t submitq; wait_queue_head_t execute; /* A list of everyone we wait upon, and everyone who waits upon us. diff --git a/drivers/gpu/drm/i915/i915_sw_fence.c b/drivers/gpu/drm/i915/i915_sw_fence.c index a277f8eb7beb..8669bfa33064 100644 --- a/drivers/gpu/drm/i915/i915_sw_fence.c +++ b/drivers/gpu/drm/i915/i915_sw_fence.c @@ -152,7 +152,7 @@ static void __i915_sw_fence_wake_up_all(struct i915_sw_fence *fence, struct list_head *continuation) { wait_queue_head_t *x = &fence->wait; - wait_queue_t *pos, *next; + wait_queue_entry_t *pos, *next; unsigned long flags; debug_fence_deactivate(fence); @@ -254,7 +254,7 @@ void i915_sw_fence_commit(struct i915_sw_fence *fence) __i915_sw_fence_commit(fence); } -static int i915_sw_fence_wake(wait_queue_t *wq, unsigned mode, int flags, void *key) +static int i915_sw_fence_wake(wait_queue_entry_t *wq, unsigned mode, int flags, void *key) { list_del(&wq->task_list); __i915_sw_fence_complete(wq->private, key); @@ -267,7 +267,7 @@ static int i915_sw_fence_wake(wait_queue_t *wq, unsigned mode, int flags, void * static bool __i915_sw_fence_check_if_after(struct i915_sw_fence *fence, const struct i915_sw_fence * const signaler) { - wait_queue_t *wq; + wait_queue_entry_t *wq; if (__test_and_set_bit(I915_SW_FENCE_CHECKED_BIT, &fence->flags)) return false; @@ -288,7 +288,7 @@ static bool __i915_sw_fence_check_if_after(struct i915_sw_fence *fence, static void __i915_sw_fence_clear_checked_bit(struct i915_sw_fence *fence) { - wait_queue_t *wq; + wait_queue_entry_t *wq; if (!__test_and_clear_bit(I915_SW_FENCE_CHECKED_BIT, &fence->flags)) return; @@ -320,7 +320,7 @@ static bool i915_sw_fence_check_if_after(struct i915_sw_fence *fence, static int __i915_sw_fence_await_sw_fence(struct i915_sw_fence *fence, struct i915_sw_fence *signaler, - wait_queue_t *wq, gfp_t gfp) + wait_queue_entry_t *wq, gfp_t gfp) { unsigned long flags; int pending; @@ -359,7 +359,7 @@ static int __i915_sw_fence_await_sw_fence(struct i915_sw_fence *fence, spin_lock_irqsave(&signaler->wait.lock, flags); if (likely(!i915_sw_fence_done(signaler))) { - __add_wait_queue_tail(&signaler->wait, wq); + __add_wait_queue_entry_tail(&signaler->wait, wq); pending = 1; } else { i915_sw_fence_wake(wq, 0, 0, NULL); @@ -372,7 +372,7 @@ static int __i915_sw_fence_await_sw_fence(struct i915_sw_fence *fence, int i915_sw_fence_await_sw_fence(struct i915_sw_fence *fence, struct i915_sw_fence *signaler, - wait_queue_t *wq) + wait_queue_entry_t *wq) { return __i915_sw_fence_await_sw_fence(fence, signaler, wq, 0); } diff --git a/drivers/gpu/drm/i915/i915_sw_fence.h b/drivers/gpu/drm/i915/i915_sw_fence.h index d31cefbbcc04..fd3c3bf6c8b7 100644 --- a/drivers/gpu/drm/i915/i915_sw_fence.h +++ b/drivers/gpu/drm/i915/i915_sw_fence.h @@ -66,7 +66,7 @@ void i915_sw_fence_commit(struct i915_sw_fence *fence); int i915_sw_fence_await_sw_fence(struct i915_sw_fence *fence, struct i915_sw_fence *after, - wait_queue_t *wq); + wait_queue_entry_t *wq); int i915_sw_fence_await_sw_fence_gfp(struct i915_sw_fence *fence, struct i915_sw_fence *after, gfp_t gfp); diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h index c1c8e2208a21..e562a78510ff 100644 --- a/drivers/gpu/drm/radeon/radeon.h +++ b/drivers/gpu/drm/radeon/radeon.h @@ -375,7 +375,7 @@ struct radeon_fence { unsigned ring; bool is_vm_update; - wait_queue_t fence_wake; + wait_queue_entry_t fence_wake; }; int 
radeon_fence_driver_start_ring(struct radeon_device *rdev, int ring); diff --git a/drivers/gpu/drm/radeon/radeon_fence.c b/drivers/gpu/drm/radeon/radeon_fence.c index ef09f0a63754..e86f2bd38410 100644 --- a/drivers/gpu/drm/radeon/radeon_fence.c +++ b/drivers/gpu/drm/radeon/radeon_fence.c @@ -158,7 +158,7 @@ int radeon_fence_emit(struct radeon_device *rdev, * for the fence locking itself, so unlocked variants are used for * fence_signal, and remove_wait_queue. */ -static int radeon_fence_check_signaled(wait_queue_t *wait, unsigned mode, int flags, void *key) +static int radeon_fence_check_signaled(wait_queue_entry_t *wait, unsigned mode, int flags, void *key) { struct radeon_fence *fence; u64 seq; diff --git a/drivers/gpu/vga/vgaarb.c b/drivers/gpu/vga/vgaarb.c index 92f1452dad57..76875f6299b8 100644 --- a/drivers/gpu/vga/vgaarb.c +++ b/drivers/gpu/vga/vgaarb.c @@ -417,7 +417,7 @@ int vga_get(struct pci_dev *pdev, unsigned int rsrc, int interruptible) { struct vga_device *vgadev, *conflict; unsigned long flags; - wait_queue_t wait; + wait_queue_entry_t wait; int rc = 0; vga_check_first_use(); diff --git a/drivers/infiniband/hw/i40iw/i40iw_main.c b/drivers/infiniband/hw/i40iw/i40iw_main.c index a3f18a22f5ed..e0f47cc2effc 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_main.c +++ b/drivers/infiniband/hw/i40iw/i40iw_main.c @@ -1939,7 +1939,7 @@ static int i40iw_virtchnl_receive(struct i40e_info *ldev, bool i40iw_vf_clear_to_send(struct i40iw_sc_dev *dev) { struct i40iw_device *iwdev; - wait_queue_t wait; + wait_queue_entry_t wait; iwdev = dev->back_dev; diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 9b80417cd547..73da1f5626cb 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -207,7 +207,7 @@ void bkey_put(struct cache_set *c, struct bkey *k); struct btree_op { /* for waiting on btree reserve in btree_split() */ - wait_queue_t wait; + wait_queue_entry_t wait; /* Btree level at which we start taking write locks */ short lock; diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_main.h b/drivers/net/ethernet/cavium/liquidio/octeon_main.h index bed9ef17bc26..7ccffbb0019e 100644 --- a/drivers/net/ethernet/cavium/liquidio/octeon_main.h +++ b/drivers/net/ethernet/cavium/liquidio/octeon_main.h @@ -144,7 +144,7 @@ static inline int sleep_cond(wait_queue_head_t *wait_queue, int *condition) { int errno = 0; - wait_queue_t we; + wait_queue_entry_t we; init_waitqueue_entry(&we, current); add_wait_queue(wait_queue, &we); @@ -171,7 +171,7 @@ sleep_timeout_cond(wait_queue_head_t *wait_queue, int *condition, int timeout) { - wait_queue_t we; + wait_queue_entry_t we; init_waitqueue_entry(&we, current); add_wait_queue(wait_queue, &we); diff --git a/drivers/net/wireless/cisco/airo.c b/drivers/net/wireless/cisco/airo.c index 1b7e125a28e2..6a13303af2b7 100644 --- a/drivers/net/wireless/cisco/airo.c +++ b/drivers/net/wireless/cisco/airo.c @@ -3066,7 +3066,7 @@ static int airo_thread(void *data) { if (ai->jobs) { locked = down_interruptible(&ai->sem); } else { - wait_queue_t wait; + wait_queue_entry_t wait; init_waitqueue_entry(&wait, current); add_wait_queue(&ai->thr_wait, &wait); diff --git a/drivers/net/wireless/intersil/hostap/hostap_ioctl.c b/drivers/net/wireless/intersil/hostap/hostap_ioctl.c index b2c6b065b542..ff153ce29539 100644 --- a/drivers/net/wireless/intersil/hostap/hostap_ioctl.c +++ b/drivers/net/wireless/intersil/hostap/hostap_ioctl.c @@ -2544,7 +2544,7 @@ static int prism2_ioctl_priv_prism2_param(struct net_device *dev, ret = -EINVAL; } if 
(local->iw_mode == IW_MODE_MASTER) { - wait_queue_t __wait; + wait_queue_entry_t __wait; init_waitqueue_entry(&__wait, current); add_wait_queue(&local->hostscan_wq, &__wait); set_current_state(TASK_INTERRUPTIBLE); diff --git a/drivers/net/wireless/marvell/libertas/main.c b/drivers/net/wireless/marvell/libertas/main.c index e3500203715c..dde065d0d5c1 100644 --- a/drivers/net/wireless/marvell/libertas/main.c +++ b/drivers/net/wireless/marvell/libertas/main.c @@ -453,7 +453,7 @@ static int lbs_thread(void *data) { struct net_device *dev = data; struct lbs_private *priv = dev->ml_priv; - wait_queue_t wait; + wait_queue_entry_t wait; lbs_deb_enter(LBS_DEB_THREAD); diff --git a/drivers/scsi/dpt/dpti_i2o.h b/drivers/scsi/dpt/dpti_i2o.h index bd9e31e16249..16fc380b5512 100644 --- a/drivers/scsi/dpt/dpti_i2o.h +++ b/drivers/scsi/dpt/dpti_i2o.h @@ -48,7 +48,7 @@ #include typedef wait_queue_head_t adpt_wait_queue_head_t; #define ADPT_DECLARE_WAIT_QUEUE_HEAD(wait) DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wait) -typedef wait_queue_t adpt_wait_queue_t; +typedef wait_queue_entry_t adpt_wait_queue_entry_t; /* * message structures diff --git a/drivers/scsi/ips.c b/drivers/scsi/ips.c index 3419e1bcdff6..67621308eb9c 100644 --- a/drivers/scsi/ips.c +++ b/drivers/scsi/ips.c @@ -301,13 +301,13 @@ static uint32_t ips_statupd_copperhead_memio(ips_ha_t *); static uint32_t ips_statupd_morpheus(ips_ha_t *); static ips_scb_t *ips_getscb(ips_ha_t *); static void ips_putq_scb_head(ips_scb_queue_t *, ips_scb_t *); -static void ips_putq_wait_tail(ips_wait_queue_t *, struct scsi_cmnd *); +static void ips_putq_wait_tail(ips_wait_queue_entry_t *, struct scsi_cmnd *); static void ips_putq_copp_tail(ips_copp_queue_t *, ips_copp_wait_item_t *); static ips_scb_t *ips_removeq_scb_head(ips_scb_queue_t *); static ips_scb_t *ips_removeq_scb(ips_scb_queue_t *, ips_scb_t *); -static struct scsi_cmnd *ips_removeq_wait_head(ips_wait_queue_t *); -static struct scsi_cmnd *ips_removeq_wait(ips_wait_queue_t *, +static struct scsi_cmnd *ips_removeq_wait_head(ips_wait_queue_entry_t *); +static struct scsi_cmnd *ips_removeq_wait(ips_wait_queue_entry_t *, struct scsi_cmnd *); static ips_copp_wait_item_t *ips_removeq_copp(ips_copp_queue_t *, ips_copp_wait_item_t *); @@ -2871,7 +2871,7 @@ ips_removeq_scb(ips_scb_queue_t * queue, ips_scb_t * item) /* ASSUMED to be called from within the HA lock */ /* */ /****************************************************************************/ -static void ips_putq_wait_tail(ips_wait_queue_t *queue, struct scsi_cmnd *item) +static void ips_putq_wait_tail(ips_wait_queue_entry_t *queue, struct scsi_cmnd *item) { METHOD_TRACE("ips_putq_wait_tail", 1); @@ -2902,7 +2902,7 @@ static void ips_putq_wait_tail(ips_wait_queue_t *queue, struct scsi_cmnd *item) /* ASSUMED to be called from within the HA lock */ /* */ /****************************************************************************/ -static struct scsi_cmnd *ips_removeq_wait_head(ips_wait_queue_t *queue) +static struct scsi_cmnd *ips_removeq_wait_head(ips_wait_queue_entry_t *queue) { struct scsi_cmnd *item; @@ -2936,7 +2936,7 @@ static struct scsi_cmnd *ips_removeq_wait_head(ips_wait_queue_t *queue) /* ASSUMED to be called from within the HA lock */ /* */ /****************************************************************************/ -static struct scsi_cmnd *ips_removeq_wait(ips_wait_queue_t *queue, +static struct scsi_cmnd *ips_removeq_wait(ips_wait_queue_entry_t *queue, struct scsi_cmnd *item) { struct scsi_cmnd *p; diff --git a/drivers/scsi/ips.h 
b/drivers/scsi/ips.h index b782bb60baf0..366be3b2f9b4 100644 --- a/drivers/scsi/ips.h +++ b/drivers/scsi/ips.h @@ -989,7 +989,7 @@ typedef struct ips_wait_queue { struct scsi_cmnd *head; struct scsi_cmnd *tail; int count; -} ips_wait_queue_t; +} ips_wait_queue_entry_t; typedef struct ips_copp_wait_item { struct scsi_cmnd *scsi_cmd; @@ -1035,7 +1035,7 @@ typedef struct ips_ha { ips_stat_t sp; /* Status packer pointer */ struct ips_scb *scbs; /* Array of all CCBS */ struct ips_scb *scb_freelist; /* SCB free list */ - ips_wait_queue_t scb_waitlist; /* Pending SCB list */ + ips_wait_queue_entry_t scb_waitlist; /* Pending SCB list */ ips_copp_queue_t copp_waitlist; /* Pending PT list */ ips_scb_queue_t scb_activelist; /* Active SCB list */ IPS_IO_CMD *dummy; /* dummy command */ diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c index 0db662d6abdd..85b242ec5f9b 100644 --- a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c +++ b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c @@ -3267,7 +3267,7 @@ int kiblnd_connd(void *arg) { spinlock_t *lock = &kiblnd_data.kib_connd_lock; - wait_queue_t wait; + wait_queue_entry_t wait; unsigned long flags; struct kib_conn *conn; int timeout; @@ -3521,7 +3521,7 @@ kiblnd_scheduler(void *arg) long id = (long)arg; struct kib_sched_info *sched; struct kib_conn *conn; - wait_queue_t wait; + wait_queue_entry_t wait; unsigned long flags; struct ib_wc wc; int did_something; @@ -3656,7 +3656,7 @@ kiblnd_failover_thread(void *arg) { rwlock_t *glock = &kiblnd_data.kib_global_lock; struct kib_dev *dev; - wait_queue_t wait; + wait_queue_entry_t wait; unsigned long flags; int rc; diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c index 3ed3b08c122c..6b38d5a8fe92 100644 --- a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c +++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c @@ -2166,7 +2166,7 @@ ksocknal_connd(void *arg) { spinlock_t *connd_lock = &ksocknal_data.ksnd_connd_lock; struct ksock_connreq *cr; - wait_queue_t wait; + wait_queue_entry_t wait; int nloops = 0; int cons_retry = 0; @@ -2554,7 +2554,7 @@ ksocknal_check_peer_timeouts(int idx) int ksocknal_reaper(void *arg) { - wait_queue_t wait; + wait_queue_entry_t wait; struct ksock_conn *conn; struct ksock_sched *sched; struct list_head enomem_conns; diff --git a/drivers/staging/lustre/lnet/libcfs/debug.c b/drivers/staging/lustre/lnet/libcfs/debug.c index c56e9922cd5b..49deb448b044 100644 --- a/drivers/staging/lustre/lnet/libcfs/debug.c +++ b/drivers/staging/lustre/lnet/libcfs/debug.c @@ -361,7 +361,7 @@ static int libcfs_debug_dumplog_thread(void *arg) void libcfs_debug_dumplog(void) { - wait_queue_t wait; + wait_queue_entry_t wait; struct task_struct *dumper; /* we're being careful to ensure that the kernel thread is diff --git a/drivers/staging/lustre/lnet/libcfs/tracefile.c b/drivers/staging/lustre/lnet/libcfs/tracefile.c index 9599b7441feb..27082d2f7938 100644 --- a/drivers/staging/lustre/lnet/libcfs/tracefile.c +++ b/drivers/staging/lustre/lnet/libcfs/tracefile.c @@ -990,7 +990,7 @@ static int tracefiled(void *arg) complete(&tctl->tctl_start); while (1) { - wait_queue_t __wait; + wait_queue_entry_t __wait; pc.pc_want_daemon_pages = 0; collect_pages(&pc); diff --git a/drivers/staging/lustre/lnet/lnet/lib-eq.c b/drivers/staging/lustre/lnet/lnet/lib-eq.c index ce4b83584e17..9ebba4ef5f90 100644 --- a/drivers/staging/lustre/lnet/lnet/lib-eq.c 
+++ b/drivers/staging/lustre/lnet/lnet/lib-eq.c @@ -312,7 +312,7 @@ __must_hold(&the_lnet.ln_eq_wait_lock) { int tms = *timeout_ms; int wait; - wait_queue_t wl; + wait_queue_entry_t wl; unsigned long now; if (!tms) diff --git a/drivers/staging/lustre/lnet/lnet/lib-socket.c b/drivers/staging/lustre/lnet/lnet/lib-socket.c index 9fca8d225ee0..f075706bba6d 100644 --- a/drivers/staging/lustre/lnet/lnet/lib-socket.c +++ b/drivers/staging/lustre/lnet/lnet/lib-socket.c @@ -516,7 +516,7 @@ lnet_sock_listen(struct socket **sockp, __u32 local_ip, int local_port, int lnet_sock_accept(struct socket **newsockp, struct socket *sock) { - wait_queue_t wait; + wait_queue_entry_t wait; struct socket *newsock; int rc; diff --git a/drivers/staging/lustre/lustre/fid/fid_request.c b/drivers/staging/lustre/lustre/fid/fid_request.c index 999f250ceed0..bf31bc200d27 100644 --- a/drivers/staging/lustre/lustre/fid/fid_request.c +++ b/drivers/staging/lustre/lustre/fid/fid_request.c @@ -192,7 +192,7 @@ static int seq_client_alloc_seq(const struct lu_env *env, } static int seq_fid_alloc_prep(struct lu_client_seq *seq, - wait_queue_t *link) + wait_queue_entry_t *link) { if (seq->lcs_update) { add_wait_queue(&seq->lcs_waitq, link); @@ -223,7 +223,7 @@ static void seq_fid_alloc_fini(struct lu_client_seq *seq) int seq_client_alloc_fid(const struct lu_env *env, struct lu_client_seq *seq, struct lu_fid *fid) { - wait_queue_t link; + wait_queue_entry_t link; int rc; LASSERT(seq); @@ -290,7 +290,7 @@ EXPORT_SYMBOL(seq_client_alloc_fid); */ void seq_client_flush(struct lu_client_seq *seq) { - wait_queue_t link; + wait_queue_entry_t link; LASSERT(seq); init_waitqueue_entry(&link, current); diff --git a/drivers/staging/lustre/lustre/include/lustre_lib.h b/drivers/staging/lustre/lustre/include/lustre_lib.h index b04d613846ee..f24970da8323 100644 --- a/drivers/staging/lustre/lustre/include/lustre_lib.h +++ b/drivers/staging/lustre/lustre/include/lustre_lib.h @@ -201,7 +201,7 @@ struct l_wait_info { sigmask(SIGALRM)) /** - * wait_queue_t of Linux (version < 2.6.34) is a FIFO list for exclusively + * wait_queue_entry_t of Linux (version < 2.6.34) is a FIFO list for exclusively * waiting threads, which is not always desirable because all threads will * be waken up again and again, even user only needs a few of them to be * active most time. 
This is not good for performance because cache can @@ -228,7 +228,7 @@ struct l_wait_info { */ #define __l_wait_event(wq, condition, info, ret, l_add_wait) \ do { \ - wait_queue_t __wait; \ + wait_queue_entry_t __wait; \ long __timeout = info->lwi_timeout; \ sigset_t __blocked; \ int __allow_intr = info->lwi_allow_intr; \ diff --git a/drivers/staging/lustre/lustre/llite/lcommon_cl.c b/drivers/staging/lustre/lustre/llite/lcommon_cl.c index 8af611033e12..96515b839436 100644 --- a/drivers/staging/lustre/lustre/llite/lcommon_cl.c +++ b/drivers/staging/lustre/lustre/llite/lcommon_cl.c @@ -207,7 +207,7 @@ int cl_file_inode_init(struct inode *inode, struct lustre_md *md) static void cl_object_put_last(struct lu_env *env, struct cl_object *obj) { struct lu_object_header *header = obj->co_lu.lo_header; - wait_queue_t waiter; + wait_queue_entry_t waiter; if (unlikely(atomic_read(&header->loh_ref) != 1)) { struct lu_site *site = obj->co_lu.lo_dev->ld_site; diff --git a/drivers/staging/lustre/lustre/lov/lov_cl_internal.h b/drivers/staging/lustre/lustre/lov/lov_cl_internal.h index 391c632365ae..e889d3a7de9c 100644 --- a/drivers/staging/lustre/lustre/lov/lov_cl_internal.h +++ b/drivers/staging/lustre/lustre/lov/lov_cl_internal.h @@ -370,7 +370,7 @@ struct lov_thread_info { struct ost_lvb lti_lvb; struct cl_2queue lti_cl2q; struct cl_page_list lti_plist; - wait_queue_t lti_waiter; + wait_queue_entry_t lti_waiter; struct cl_attr lti_attr; }; diff --git a/drivers/staging/lustre/lustre/lov/lov_object.c b/drivers/staging/lustre/lustre/lov/lov_object.c index ab3ecfeeadc8..eddabbe31e5c 100644 --- a/drivers/staging/lustre/lustre/lov/lov_object.c +++ b/drivers/staging/lustre/lustre/lov/lov_object.c @@ -371,7 +371,7 @@ static void lov_subobject_kill(const struct lu_env *env, struct lov_object *lov, struct lov_layout_raid0 *r0; struct lu_site *site; struct lu_site_bkt_data *bkt; - wait_queue_t *waiter; + wait_queue_entry_t *waiter; r0 = &lov->u.raid0; LASSERT(r0->lo_sub[idx] == los); diff --git a/drivers/staging/lustre/lustre/obdclass/lu_object.c b/drivers/staging/lustre/lustre/obdclass/lu_object.c index abcf951208d2..76ae600ae2c8 100644 --- a/drivers/staging/lustre/lustre/obdclass/lu_object.c +++ b/drivers/staging/lustre/lustre/obdclass/lu_object.c @@ -556,7 +556,7 @@ EXPORT_SYMBOL(lu_object_print); static struct lu_object *htable_lookup(struct lu_site *s, struct cfs_hash_bd *bd, const struct lu_fid *f, - wait_queue_t *waiter, + wait_queue_entry_t *waiter, __u64 *version) { struct lu_site_bkt_data *bkt; @@ -670,7 +670,7 @@ static struct lu_object *lu_object_find_try(const struct lu_env *env, struct lu_device *dev, const struct lu_fid *f, const struct lu_object_conf *conf, - wait_queue_t *waiter) + wait_queue_entry_t *waiter) { struct lu_object *o; struct lu_object *shadow; @@ -750,7 +750,7 @@ struct lu_object *lu_object_find_at(const struct lu_env *env, { struct lu_site_bkt_data *bkt; struct lu_object *obj; - wait_queue_t wait; + wait_queue_entry_t wait; while (1) { obj = lu_object_find_try(env, dev, f, conf, &wait); diff --git a/drivers/tty/synclink_gt.c b/drivers/tty/synclink_gt.c index 31885f20fc15..cc047de72e2a 100644 --- a/drivers/tty/synclink_gt.c +++ b/drivers/tty/synclink_gt.c @@ -184,7 +184,7 @@ static void hdlcdev_exit(struct slgt_info *info); struct cond_wait { struct cond_wait *next; wait_queue_head_t q; - wait_queue_t wait; + wait_queue_entry_t wait; unsigned int data; }; static void init_cond_wait(struct cond_wait *w, unsigned int data); diff --git a/drivers/vfio/virqfd.c b/drivers/vfio/virqfd.c 
index 27c89cd5d70b..4797217e5e72 100644 --- a/drivers/vfio/virqfd.c +++ b/drivers/vfio/virqfd.c @@ -43,7 +43,7 @@ static void virqfd_deactivate(struct virqfd *virqfd) queue_work(vfio_irqfd_cleanup_wq, &virqfd->shutdown); } -static int virqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key) +static int virqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { struct virqfd *virqfd = container_of(wait, struct virqfd, wait); unsigned long flags = (unsigned long)key; diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 042030e5a035..e4613a3c362d 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -165,7 +165,7 @@ static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh, add_wait_queue(wqh, &poll->wait); } -static int vhost_poll_wakeup(wait_queue_t *wait, unsigned mode, int sync, +static int vhost_poll_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { struct vhost_poll *poll = container_of(wait, struct vhost_poll, wait); diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index f55671d53f28..f72095868b93 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -31,7 +31,7 @@ struct vhost_work { struct vhost_poll { poll_table table; wait_queue_head_t *wqh; - wait_queue_t wait; + wait_queue_entry_t wait; struct vhost_work work; unsigned long mask; struct vhost_dev *dev; diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index beef981aa54f..974f5346458a 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -83,7 +83,7 @@ struct autofs_info { struct autofs_wait_queue { wait_queue_head_t queue; struct autofs_wait_queue *next; - autofs_wqt_t wait_queue_token; + autofs_wqt_t wait_queue_entry_token; /* We use the following to see what we are waiting for */ struct qstr name; u32 dev; diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 24a58bf9ca72..7071895b0678 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -104,7 +104,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, size_t pktsz; pr_debug("wait id = 0x%08lx, name = %.*s, type=%d\n", - (unsigned long) wq->wait_queue_token, + (unsigned long) wq->wait_queue_entry_token, wq->name.len, wq->name.name, type); memset(&pkt, 0, sizeof(pkt)); /* For security reasons */ @@ -120,7 +120,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, pktsz = sizeof(*mp); - mp->wait_queue_token = wq->wait_queue_token; + mp->wait_queue_entry_token = wq->wait_queue_entry_token; mp->len = wq->name.len; memcpy(mp->name, wq->name.name, wq->name.len); mp->name[wq->name.len] = '\0'; @@ -133,7 +133,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, pktsz = sizeof(*ep); - ep->wait_queue_token = wq->wait_queue_token; + ep->wait_queue_entry_token = wq->wait_queue_entry_token; ep->len = wq->name.len; memcpy(ep->name, wq->name.name, wq->name.len); ep->name[wq->name.len] = '\0'; @@ -153,7 +153,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, pktsz = sizeof(*packet); - packet->wait_queue_token = wq->wait_queue_token; + packet->wait_queue_entry_token = wq->wait_queue_entry_token; packet->len = wq->name.len; memcpy(packet->name, wq->name.name, wq->name.len); packet->name[wq->name.len] = '\0'; @@ -428,7 +428,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, return -ENOMEM; } - wq->wait_queue_token = autofs4_next_wait_queue; + wq->wait_queue_entry_token = autofs4_next_wait_queue; if (++autofs4_next_wait_queue == 0) autofs4_next_wait_queue = 1; wq->next = sbi->queues; @@ -461,7 +461,7 
@@ int autofs4_wait(struct autofs_sb_info *sbi, } pr_debug("new wait id = 0x%08lx, name = %.*s, nfy=%d\n", - (unsigned long) wq->wait_queue_token, wq->name.len, + (unsigned long) wq->wait_queue_entry_token, wq->name.len, wq->name.name, notify); /* @@ -471,7 +471,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, } else { wq->wait_ctr++; pr_debug("existing wait id = 0x%08lx, name = %.*s, nfy=%d\n", - (unsigned long) wq->wait_queue_token, wq->name.len, + (unsigned long) wq->wait_queue_entry_token, wq->name.len, wq->name.name, notify); mutex_unlock(&sbi->wq_mutex); kfree(qstr.name); @@ -550,13 +550,13 @@ int autofs4_wait(struct autofs_sb_info *sbi, } -int autofs4_wait_release(struct autofs_sb_info *sbi, autofs_wqt_t wait_queue_token, int status) +int autofs4_wait_release(struct autofs_sb_info *sbi, autofs_wqt_t wait_queue_entry_token, int status) { struct autofs_wait_queue *wq, **wql; mutex_lock(&sbi->wq_mutex); for (wql = &sbi->queues; (wq = *wql) != NULL; wql = &wq->next) { - if (wq->wait_queue_token == wait_queue_token) + if (wq->wait_queue_entry_token == wait_queue_entry_token) break; } diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index 9bf90bcc56ac..54a4fcd679ed 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -97,7 +97,7 @@ struct cachefiles_cache { * backing file read tracking */ struct cachefiles_one_read { - wait_queue_t monitor; /* link into monitored waitqueue */ + wait_queue_entry_t monitor; /* link into monitored waitqueue */ struct page *back_page; /* backing file page we're waiting for */ struct page *netfs_page; /* netfs page we're going to fill */ struct fscache_retrieval *op; /* retrieval op covering this */ diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 41df8a27d7eb..3978b324cbca 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -204,7 +204,7 @@ wait_for_old_object: wait_queue_head_t *wq; signed long timeout = 60 * HZ; - wait_queue_t wait; + wait_queue_entry_t wait; bool requeue; /* if the object we're waiting for is queued for processing, diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index afbdc418966d..8be33b33b981 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -21,7 +21,7 @@ * - we use this to detect read completion of backing pages * - the caller holds the waitqueue lock */ -static int cachefiles_read_waiter(wait_queue_t *wait, unsigned mode, +static int cachefiles_read_waiter(wait_queue_entry_t *wait, unsigned mode, int sync, void *_key) { struct cachefiles_one_read *monitor = diff --git a/fs/dax.c b/fs/dax.c index 2a6889b3585f..323ea481d4a8 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -84,7 +84,7 @@ struct exceptional_entry_key { }; struct wait_exceptional_entry_queue { - wait_queue_t wait; + wait_queue_entry_t wait; struct exceptional_entry_key key; }; @@ -108,7 +108,7 @@ static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping, return wait_table + hash; } -static int wake_exceptional_entry_func(wait_queue_t *wait, unsigned int mode, +static int wake_exceptional_entry_func(wait_queue_entry_t *wait, unsigned int mode, int sync, void *keyp) { struct exceptional_entry_key *key = keyp; diff --git a/fs/eventfd.c b/fs/eventfd.c index 68b9fffcb2c8..9736df2ce89d 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -191,7 +191,7 @@ static void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt) * This is used to atomically remove a wait queue entry from the eventfd wait * queue head, and read/reset the counter value. 
*/ -int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_t *wait, +int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait, __u64 *cnt) { unsigned long flags; diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 5420767c9b68..5ac1cba5ef72 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -244,7 +244,7 @@ struct eppoll_entry { * Wait queue item that will be linked to the target file wait * queue head. */ - wait_queue_t wait; + wait_queue_entry_t wait; /* The wait queue head that linked the "wait" wait queue item */ wait_queue_head_t *whead; @@ -347,13 +347,13 @@ static inline int ep_is_linked(struct list_head *p) return !list_empty(p); } -static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_t *p) +static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_entry_t *p) { return container_of(p, struct eppoll_entry, wait); } /* Get the "struct epitem" from a wait queue pointer */ -static inline struct epitem *ep_item_from_wait(wait_queue_t *p) +static inline struct epitem *ep_item_from_wait(wait_queue_entry_t *p) { return container_of(p, struct eppoll_entry, wait)->base; } @@ -1078,7 +1078,7 @@ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd) * mechanism. It is called by the stored file descriptors when they * have events to report. */ -static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key) +static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { int pwake = 0; unsigned long flags; @@ -1699,7 +1699,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, int res = 0, eavail, timed_out = 0; unsigned long flags; u64 slack = 0; - wait_queue_t wait; + wait_queue_entry_t wait; ktime_t expires, *to = NULL; if (timeout > 0) { diff --git a/fs/fs_pin.c b/fs/fs_pin.c index 611b5408f6ec..7b447a245760 100644 --- a/fs/fs_pin.c +++ b/fs/fs_pin.c @@ -34,7 +34,7 @@ void pin_insert(struct fs_pin *pin, struct vfsmount *m) void pin_kill(struct fs_pin *p) { - wait_queue_t wait; + wait_queue_entry_t wait; if (!p) { rcu_read_unlock(); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index c08c46a3b8cd..be5a8f84e5bb 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -6372,7 +6372,7 @@ struct nfs4_lock_waiter { }; static int -nfs4_wake_lock_waiter(wait_queue_t *wait, unsigned int mode, int flags, void *key) +nfs4_wake_lock_waiter(wait_queue_entry_t *wait, unsigned int mode, int flags, void *key) { int ret; struct cb_notify_lock_args *cbnl = key; @@ -6415,7 +6415,7 @@ nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) .inode = state->inode, .owner = &owner, .notified = false }; - wait_queue_t wait; + wait_queue_entry_t wait; /* Don't bother with waitqueue if we don't expect a callback */ if (!test_bit(NFS_STATE_MAY_NOTIFY_LOCK, &state->flags)) diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index febed1217b3f..775304e7f96f 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -2161,7 +2161,7 @@ void nilfs_flush_segment(struct super_block *sb, ino_t ino) } struct nilfs_segctor_wait_request { - wait_queue_t wq; + wait_queue_entry_t wq; __u32 seq; int err; atomic_t done; diff --git a/fs/orangefs/orangefs-bufmap.c b/fs/orangefs/orangefs-bufmap.c index 83b506020718..9e37b7028ea4 100644 --- a/fs/orangefs/orangefs-bufmap.c +++ b/fs/orangefs/orangefs-bufmap.c @@ -47,7 +47,7 @@ static void run_down(struct slot_map *m) if (m->c != -1) { for (;;) { if (likely(list_empty(&wait.task_list))) - 
__add_wait_queue_tail(&m->q, &wait); + __add_wait_queue_entry_tail(&m->q, &wait); set_current_state(TASK_UNINTERRUPTIBLE); if (m->c == -1) @@ -85,7 +85,7 @@ static int wait_for_free(struct slot_map *m) do { long n = left, t; if (likely(list_empty(&wait.task_list))) - __add_wait_queue_tail_exclusive(&m->q, &wait); + __add_wait_queue_entry_tail_exclusive(&m->q, &wait); set_current_state(TASK_INTERRUPTIBLE); if (m->c > 0) diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 39bb1e838d8d..a11d773e5ff3 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -2956,7 +2956,7 @@ void reiserfs_wait_on_write_block(struct super_block *s) static void queue_log_writer(struct super_block *s) { - wait_queue_t wait; + wait_queue_entry_t wait; struct reiserfs_journal *journal = SB_JOURNAL(s); set_bit(J_WRITERS_QUEUED, &journal->j_state); diff --git a/fs/select.c b/fs/select.c index d6c652a31e99..5b524a977d91 100644 --- a/fs/select.c +++ b/fs/select.c @@ -180,7 +180,7 @@ static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p) return table->entry++; } -static int __pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key) +static int __pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { struct poll_wqueues *pwq = wait->private; DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task); @@ -206,7 +206,7 @@ static int __pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key) return default_wake_function(&dummy_wait, mode, sync, key); } -static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key) +static int pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { struct poll_table_entry *entry; diff --git a/fs/signalfd.c b/fs/signalfd.c index 7e3d71109f51..593b022ac11b 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -43,7 +43,7 @@ void signalfd_cleanup(struct sighand_struct *sighand) if (likely(!waitqueue_active(wqh))) return; - /* wait_queue_t->func(POLLFREE) should do remove_wait_queue() */ + /* wait_queue_entry_t->func(POLLFREE) should do remove_wait_queue() */ wake_up_poll(wqh, POLLHUP | POLLFREE); } diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 1d622f276e3a..bda64fcd8a0c 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -81,7 +81,7 @@ struct userfaultfd_unmap_ctx { struct userfaultfd_wait_queue { struct uffd_msg msg; - wait_queue_t wq; + wait_queue_entry_t wq; struct userfaultfd_ctx *ctx; bool waken; }; @@ -91,7 +91,7 @@ struct userfaultfd_wake_range { unsigned long len; }; -static int userfaultfd_wake_function(wait_queue_t *wq, unsigned mode, +static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode, int wake_flags, void *key) { struct userfaultfd_wake_range *range = key; @@ -860,7 +860,7 @@ wakeup: static inline struct userfaultfd_wait_queue *find_userfault_in( wait_queue_head_t *wqh) { - wait_queue_t *wq; + wait_queue_entry_t *wq; struct userfaultfd_wait_queue *uwq; VM_BUG_ON(!spin_is_locked(&wqh->lock)); @@ -1747,7 +1747,7 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd, static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f) { struct userfaultfd_ctx *ctx = f->private_data; - wait_queue_t *wq; + wait_queue_entry_t *wq; struct userfaultfd_wait_queue *uwq; unsigned long pending = 0, total = 0; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index fcd641032f8d..95ba83806c5d 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -33,7 +33,7 @@ struct blk_mq_hw_ctx { struct blk_mq_ctx **ctxs; unsigned int nr_ctx; - 
wait_queue_t dispatch_wait; + wait_queue_entry_t dispatch_wait; atomic_t wait_index; struct blk_mq_tags *tags; diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h index ff0b981f078e..9e4befd95bc7 100644 --- a/include/linux/eventfd.h +++ b/include/linux/eventfd.h @@ -37,7 +37,7 @@ struct eventfd_ctx *eventfd_ctx_fdget(int fd); struct eventfd_ctx *eventfd_ctx_fileget(struct file *file); __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n); ssize_t eventfd_ctx_read(struct eventfd_ctx *ctx, int no_wait, __u64 *cnt); -int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_t *wait, +int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait, __u64 *cnt); #else /* CONFIG_EVENTFD */ @@ -73,7 +73,7 @@ static inline ssize_t eventfd_ctx_read(struct eventfd_ctx *ctx, int no_wait, } static inline int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, - wait_queue_t *wait, __u64 *cnt) + wait_queue_entry_t *wait, __u64 *cnt) { return -ENOSYS; } diff --git a/include/linux/kvm_irqfd.h b/include/linux/kvm_irqfd.h index 0c1de05098c8..76c2fbc59f35 100644 --- a/include/linux/kvm_irqfd.h +++ b/include/linux/kvm_irqfd.h @@ -46,7 +46,7 @@ struct kvm_kernel_irqfd_resampler { struct kvm_kernel_irqfd { /* Used for MSI fast-path */ struct kvm *kvm; - wait_queue_t wait; + wait_queue_entry_t wait; /* Update side is protected by irqfds.lock */ struct kvm_kernel_irq_routing_entry irq_entry; seqcount_t irq_entry_sc; diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 316a19f6b635..e7bbd9d4dc6c 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -524,7 +524,7 @@ void page_endio(struct page *page, bool is_write, int err); /* * Add an arbitrary waiter to a page's wait queue */ -extern void add_page_wait_queue(struct page *page, wait_queue_t *waiter); +extern void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter); /* * Fault everything in given userspace address range in. 
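Several of the conversions above (cachefiles, dax, eventpoll, and later filemap) follow the same pattern: a wait entry embedded in a private structure with a custom wake callback. As a rough illustration of what that pattern looks like after the rename (the structure and function names below are invented for the example; only the wait_queue_entry_t type and the callback signature come from this patch), a minimal sketch:

#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/types.h>
#include <linux/wait.h>

/* Invented example type; mirrors the shape of e.g. struct wait_exceptional_entry_queue. */
struct my_waiter {
	wait_queue_entry_t	wait;	/* was: wait_queue_t wait; */
	bool			woken;
};

/* Wake callbacks now take wait_queue_entry_t *; the body is unchanged by the rename. */
static int my_wake_fn(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
	struct my_waiter *w = container_of(wait, struct my_waiter, wait);

	w->woken = true;
	return autoremove_wake_function(wait, mode, sync, key);
}

static void my_waiter_arm(wait_queue_head_t *wq, struct my_waiter *w)
{
	w->woken = false;
	init_waitqueue_func_entry(&w->wait, my_wake_fn);
	w->wait.private = current;	/* autoremove_wake_function() wakes ->private */
	add_wait_queue(wq, &w->wait);
}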
diff --git a/include/linux/poll.h b/include/linux/poll.h index 75ffc5729e4c..2889f09a1c60 100644 --- a/include/linux/poll.h +++ b/include/linux/poll.h @@ -75,7 +75,7 @@ static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc) struct poll_table_entry { struct file *filp; unsigned long key; - wait_queue_t wait; + wait_queue_entry_t wait; wait_queue_head_t *wait_address; }; diff --git a/include/linux/vfio.h b/include/linux/vfio.h index edf9b2cad277..f57076b958b7 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -183,7 +183,7 @@ struct virqfd { void (*thread)(void *, void *); void *data; struct work_struct inject; - wait_queue_t wait; + wait_queue_entry_t wait; poll_table pt; struct work_struct shutdown; struct virqfd **pvirqfd; diff --git a/include/linux/wait.h b/include/linux/wait.h index db076ca7f11d..5889f0c86ff7 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -10,15 +10,18 @@ #include #include -typedef struct __wait_queue wait_queue_t; -typedef int (*wait_queue_func_t)(wait_queue_t *wait, unsigned mode, int flags, void *key); -int default_wake_function(wait_queue_t *wait, unsigned mode, int flags, void *key); +typedef struct wait_queue_entry wait_queue_entry_t; +typedef int (*wait_queue_func_t)(wait_queue_entry_t *wait, unsigned mode, int flags, void *key); +int default_wake_function(wait_queue_entry_t *wait, unsigned mode, int flags, void *key); -/* __wait_queue::flags */ +/* wait_queue_entry::flags */ #define WQ_FLAG_EXCLUSIVE 0x01 #define WQ_FLAG_WOKEN 0x02 -struct __wait_queue { +/* + * A single wait-queue entry structure: + */ +struct wait_queue_entry { unsigned int flags; void *private; wait_queue_func_t func; @@ -34,7 +37,7 @@ struct wait_bit_key { struct wait_bit_queue { struct wait_bit_key key; - wait_queue_t wait; + wait_queue_entry_t wait; }; struct __wait_queue_head { @@ -55,7 +58,7 @@ struct task_struct; .task_list = { NULL, NULL } } #define DECLARE_WAITQUEUE(name, tsk) \ - wait_queue_t name = __WAITQUEUE_INITIALIZER(name, tsk) + wait_queue_entry_t name = __WAITQUEUE_INITIALIZER(name, tsk) #define __WAIT_QUEUE_HEAD_INITIALIZER(name) { \ .lock = __SPIN_LOCK_UNLOCKED(name.lock), \ @@ -88,7 +91,7 @@ extern void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct # define DECLARE_WAIT_QUEUE_HEAD_ONSTACK(name) DECLARE_WAIT_QUEUE_HEAD(name) #endif -static inline void init_waitqueue_entry(wait_queue_t *q, struct task_struct *p) +static inline void init_waitqueue_entry(wait_queue_entry_t *q, struct task_struct *p) { q->flags = 0; q->private = p; @@ -96,7 +99,7 @@ static inline void init_waitqueue_entry(wait_queue_t *q, struct task_struct *p) } static inline void -init_waitqueue_func_entry(wait_queue_t *q, wait_queue_func_t func) +init_waitqueue_func_entry(wait_queue_entry_t *q, wait_queue_func_t func) { q->flags = 0; q->private = NULL; @@ -159,11 +162,11 @@ static inline bool wq_has_sleeper(wait_queue_head_t *wq) return waitqueue_active(wq); } -extern void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait); -extern void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait); -extern void remove_wait_queue(wait_queue_head_t *q, wait_queue_t *wait); +extern void add_wait_queue(wait_queue_head_t *q, wait_queue_entry_t *wait); +extern void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_entry_t *wait); +extern void remove_wait_queue(wait_queue_head_t *q, wait_queue_entry_t *wait); -static inline void __add_wait_queue(wait_queue_head_t *head, wait_queue_t *new) +static inline void 
__add_wait_queue(wait_queue_head_t *head, wait_queue_entry_t *new) { list_add(&new->task_list, &head->task_list); } @@ -172,27 +175,27 @@ static inline void __add_wait_queue(wait_queue_head_t *head, wait_queue_t *new) * Used for wake-one threads: */ static inline void -__add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait) +__add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_entry_t *wait) { wait->flags |= WQ_FLAG_EXCLUSIVE; __add_wait_queue(q, wait); } -static inline void __add_wait_queue_tail(wait_queue_head_t *head, - wait_queue_t *new) +static inline void __add_wait_queue_entry_tail(wait_queue_head_t *head, + wait_queue_entry_t *new) { list_add_tail(&new->task_list, &head->task_list); } static inline void -__add_wait_queue_tail_exclusive(wait_queue_head_t *q, wait_queue_t *wait) +__add_wait_queue_entry_tail_exclusive(wait_queue_head_t *q, wait_queue_entry_t *wait) { wait->flags |= WQ_FLAG_EXCLUSIVE; - __add_wait_queue_tail(q, wait); + __add_wait_queue_entry_tail(q, wait); } static inline void -__remove_wait_queue(wait_queue_head_t *head, wait_queue_t *old) +__remove_wait_queue(wait_queue_head_t *head, wait_queue_entry_t *old) { list_del(&old->task_list); } @@ -249,7 +252,7 @@ wait_queue_head_t *bit_waitqueue(void *, int); (!__builtin_constant_p(state) || \ state == TASK_INTERRUPTIBLE || state == TASK_KILLABLE) \ -extern void init_wait_entry(wait_queue_t *__wait, int flags); +extern void init_wait_entry(wait_queue_entry_t *__wait, int flags); /* * The below macro ___wait_event() has an explicit shadow of the __ret @@ -266,7 +269,7 @@ extern void init_wait_entry(wait_queue_t *__wait, int flags); #define ___wait_event(wq, condition, state, exclusive, ret, cmd) \ ({ \ __label__ __out; \ - wait_queue_t __wait; \ + wait_queue_entry_t __wait; \ long __ret = ret; /* explicit shadow */ \ \ init_wait_entry(&__wait, exclusive ? 
WQ_FLAG_EXCLUSIVE : 0); \ @@ -620,8 +623,8 @@ do { \ __ret; \ }) -extern int do_wait_intr(wait_queue_head_t *, wait_queue_t *); -extern int do_wait_intr_irq(wait_queue_head_t *, wait_queue_t *); +extern int do_wait_intr(wait_queue_head_t *, wait_queue_entry_t *); +extern int do_wait_intr_irq(wait_queue_head_t *, wait_queue_entry_t *); #define __wait_event_interruptible_locked(wq, condition, exclusive, fn) \ ({ \ @@ -967,17 +970,17 @@ do { \ /* * Waitqueues which are removed from the waitqueue_head at wakeup time */ -void prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state); -void prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state); -long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state); -void finish_wait(wait_queue_head_t *q, wait_queue_t *wait); -long wait_woken(wait_queue_t *wait, unsigned mode, long timeout); -int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key); -int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key); -int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key); +void prepare_to_wait(wait_queue_head_t *q, wait_queue_entry_t *wait, int state); +void prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_entry_t *wait, int state); +long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_entry_t *wait, int state); +void finish_wait(wait_queue_head_t *q, wait_queue_entry_t *wait); +long wait_woken(wait_queue_entry_t *wait, unsigned mode, long timeout); +int woken_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key); +int autoremove_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key); +int wake_bit_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key); #define DEFINE_WAIT_FUNC(name, function) \ - wait_queue_t name = { \ + wait_queue_entry_t name = { \ .private = current, \ .func = function, \ .task_list = LIST_HEAD_INIT((name).task_list), \ diff --git a/include/net/af_unix.h b/include/net/af_unix.h index fd60eccb59a6..75e612a45824 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -62,7 +62,7 @@ struct unix_sock { #define UNIX_GC_CANDIDATE 0 #define UNIX_GC_MAYBE_CYCLE 1 struct socket_wq peer_wq; - wait_queue_t peer_wake; + wait_queue_entry_t peer_wake; }; static inline struct unix_sock *unix_sk(const struct sock *sk) diff --git a/include/uapi/linux/auto_fs.h b/include/uapi/linux/auto_fs.h index aa63451ef20a..1953f8d6063b 100644 --- a/include/uapi/linux/auto_fs.h +++ b/include/uapi/linux/auto_fs.h @@ -26,7 +26,7 @@ #define AUTOFS_MIN_PROTO_VERSION AUTOFS_PROTO_VERSION /* - * The wait_queue_token (autofs_wqt_t) is part of a structure which is passed + * The wait_queue_entry_token (autofs_wqt_t) is part of a structure which is passed * back to the kernel via ioctl from userspace. 
On architectures where 32- and * 64-bit userspace binaries can be executed it's important that the size of * autofs_wqt_t stays constant between 32- and 64-bit Linux kernels so that we @@ -49,7 +49,7 @@ struct autofs_packet_hdr { struct autofs_packet_missing { struct autofs_packet_hdr hdr; - autofs_wqt_t wait_queue_token; + autofs_wqt_t wait_queue_entry_token; int len; char name[NAME_MAX+1]; }; diff --git a/include/uapi/linux/auto_fs4.h b/include/uapi/linux/auto_fs4.h index 7c6da423d54e..65b72d0222e7 100644 --- a/include/uapi/linux/auto_fs4.h +++ b/include/uapi/linux/auto_fs4.h @@ -108,7 +108,7 @@ enum autofs_notify { /* v4 multi expire (via pipe) */ struct autofs_packet_expire_multi { struct autofs_packet_hdr hdr; - autofs_wqt_t wait_queue_token; + autofs_wqt_t wait_queue_entry_token; int len; char name[NAME_MAX+1]; }; @@ -123,7 +123,7 @@ union autofs_packet_union { /* autofs v5 common packet struct */ struct autofs_v5_packet { struct autofs_packet_hdr hdr; - autofs_wqt_t wait_queue_token; + autofs_wqt_t wait_queue_entry_token; __u32 dev; __u64 ino; __u32 uid; diff --git a/kernel/exit.c b/kernel/exit.c index 516acdb0e0ec..7d694437ab44 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1004,7 +1004,7 @@ struct wait_opts { int __user *wo_stat; struct rusage __user *wo_rusage; - wait_queue_t child_wait; + wait_queue_entry_t child_wait; int notask_error; }; @@ -1541,7 +1541,7 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk) return 0; } -static int child_wait_callback(wait_queue_t *wait, unsigned mode, +static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { struct wait_opts *wo = container_of(wait, struct wait_opts, diff --git a/kernel/futex.c b/kernel/futex.c index 357348a6cf6b..d6cf71d08f21 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -225,7 +225,7 @@ struct futex_pi_state { * @requeue_pi_key: the requeue_pi target futex key * @bitset: bitset for the optional bitmasked wakeup * - * We use this hashed waitqueue, instead of a normal wait_queue_t, so + * We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so * we can wake only the relevant ones (hashed queues may be shared). * * A futex_q has a woken state, just like tasks have TASK_RUNNING. 
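For the plain sleep/wake case, only the declared type changes; the helpers keep their names (apart from the __add_wait_queue_tail family, renamed elsewhere in this patch). A minimal sketch with an invented flag and function name, using only calls declared in the <linux/wait.h> hunk above:

#include <linux/sched.h>
#include <linux/types.h>
#include <linux/wait.h>

/* Invented helper for illustration: sleep until *flag becomes true. */
static void wait_for_flag(wait_queue_head_t *wq, bool *flag)
{
	wait_queue_entry_t wait;	/* was: wait_queue_t wait; */

	init_waitqueue_entry(&wait, current);
	add_wait_queue(wq, &wait);
	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (READ_ONCE(*flag))
			break;
		schedule();
	}
	__set_current_state(TASK_RUNNING);
	remove_wait_queue(wq, &wait);
}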
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index 53f9558fa925..13fc5ae9bf2f 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -66,7 +66,7 @@ do_wait_for_common(struct completion *x, if (!x->done) { DECLARE_WAITQUEUE(wait, current); - __add_wait_queue_tail_exclusive(&x->wait, &wait); + __add_wait_queue_entry_tail_exclusive(&x->wait, &wait); do { if (signal_pending_state(state, current)) { timeout = -ERESTARTSYS; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 326d4f88e2b1..5b36644536ab 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3687,7 +3687,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void) exception_exit(prev_state); } -int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, +int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, void *key) { return try_to_wake_up(curr->private, mode, wake_flags); diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index b8c84c6dee64..301ea02dede0 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -21,7 +21,7 @@ void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_c EXPORT_SYMBOL(__init_waitqueue_head); -void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) +void add_wait_queue(wait_queue_head_t *q, wait_queue_entry_t *wait) { unsigned long flags; @@ -32,18 +32,18 @@ void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) } EXPORT_SYMBOL(add_wait_queue); -void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait) +void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_entry_t *wait) { unsigned long flags; wait->flags |= WQ_FLAG_EXCLUSIVE; spin_lock_irqsave(&q->lock, flags); - __add_wait_queue_tail(q, wait); + __add_wait_queue_entry_tail(q, wait); spin_unlock_irqrestore(&q->lock, flags); } EXPORT_SYMBOL(add_wait_queue_exclusive); -void remove_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) +void remove_wait_queue(wait_queue_head_t *q, wait_queue_entry_t *wait) { unsigned long flags; @@ -66,7 +66,7 @@ EXPORT_SYMBOL(remove_wait_queue); static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, int nr_exclusive, int wake_flags, void *key) { - wait_queue_t *curr, *next; + wait_queue_entry_t *curr, *next; list_for_each_entry_safe(curr, next, &q->task_list, task_list) { unsigned flags = curr->flags; @@ -170,7 +170,7 @@ EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ * loads to move into the critical region). 
*/ void -prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state) +prepare_to_wait(wait_queue_head_t *q, wait_queue_entry_t *wait, int state) { unsigned long flags; @@ -184,20 +184,20 @@ prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state) EXPORT_SYMBOL(prepare_to_wait); void -prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state) +prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_entry_t *wait, int state) { unsigned long flags; wait->flags |= WQ_FLAG_EXCLUSIVE; spin_lock_irqsave(&q->lock, flags); if (list_empty(&wait->task_list)) - __add_wait_queue_tail(q, wait); + __add_wait_queue_entry_tail(q, wait); set_current_state(state); spin_unlock_irqrestore(&q->lock, flags); } EXPORT_SYMBOL(prepare_to_wait_exclusive); -void init_wait_entry(wait_queue_t *wait, int flags) +void init_wait_entry(wait_queue_entry_t *wait, int flags) { wait->flags = flags; wait->private = current; @@ -206,7 +206,7 @@ void init_wait_entry(wait_queue_t *wait, int flags) } EXPORT_SYMBOL(init_wait_entry); -long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state) +long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_entry_t *wait, int state) { unsigned long flags; long ret = 0; @@ -230,7 +230,7 @@ long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state) } else { if (list_empty(&wait->task_list)) { if (wait->flags & WQ_FLAG_EXCLUSIVE) - __add_wait_queue_tail(q, wait); + __add_wait_queue_entry_tail(q, wait); else __add_wait_queue(q, wait); } @@ -249,10 +249,10 @@ EXPORT_SYMBOL(prepare_to_wait_event); * condition in the caller before they add the wait * entry to the wake queue. */ -int do_wait_intr(wait_queue_head_t *wq, wait_queue_t *wait) +int do_wait_intr(wait_queue_head_t *wq, wait_queue_entry_t *wait) { if (likely(list_empty(&wait->task_list))) - __add_wait_queue_tail(wq, wait); + __add_wait_queue_entry_tail(wq, wait); set_current_state(TASK_INTERRUPTIBLE); if (signal_pending(current)) @@ -265,10 +265,10 @@ int do_wait_intr(wait_queue_head_t *wq, wait_queue_t *wait) } EXPORT_SYMBOL(do_wait_intr); -int do_wait_intr_irq(wait_queue_head_t *wq, wait_queue_t *wait) +int do_wait_intr_irq(wait_queue_head_t *wq, wait_queue_entry_t *wait) { if (likely(list_empty(&wait->task_list))) - __add_wait_queue_tail(wq, wait); + __add_wait_queue_entry_tail(wq, wait); set_current_state(TASK_INTERRUPTIBLE); if (signal_pending(current)) @@ -290,7 +290,7 @@ EXPORT_SYMBOL(do_wait_intr_irq); * the wait descriptor from the given waitqueue if still * queued. 
*/ -void finish_wait(wait_queue_head_t *q, wait_queue_t *wait) +void finish_wait(wait_queue_head_t *q, wait_queue_entry_t *wait) { unsigned long flags; @@ -316,7 +316,7 @@ void finish_wait(wait_queue_head_t *q, wait_queue_t *wait) } EXPORT_SYMBOL(finish_wait); -int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) +int autoremove_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { int ret = default_wake_function(wait, mode, sync, key); @@ -351,7 +351,7 @@ static inline bool is_kthread_should_stop(void) * remove_wait_queue(&wq, &wait); * */ -long wait_woken(wait_queue_t *wait, unsigned mode, long timeout) +long wait_woken(wait_queue_entry_t *wait, unsigned mode, long timeout) { set_current_state(mode); /* A */ /* @@ -375,7 +375,7 @@ long wait_woken(wait_queue_t *wait, unsigned mode, long timeout) } EXPORT_SYMBOL(wait_woken); -int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) +int woken_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { /* * Although this function is called under waitqueue lock, LOCK @@ -391,7 +391,7 @@ int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) } EXPORT_SYMBOL(woken_wake_function); -int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *arg) +int wake_bit_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg) { struct wait_bit_key *key = arg; struct wait_bit_queue *wait_bit @@ -534,7 +534,7 @@ static inline wait_queue_head_t *atomic_t_waitqueue(atomic_t *p) return bit_waitqueue(p, 0); } -static int wake_atomic_t_function(wait_queue_t *wait, unsigned mode, int sync, +static int wake_atomic_t_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg) { struct wait_bit_key *key = arg; diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c74bf39ef764..a86688fabc55 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2864,11 +2864,11 @@ bool flush_work(struct work_struct *work) EXPORT_SYMBOL_GPL(flush_work); struct cwt_wait { - wait_queue_t wait; + wait_queue_entry_t wait; struct work_struct *work; }; -static int cwt_wakefn(wait_queue_t *wait, unsigned mode, int sync, void *key) +static int cwt_wakefn(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { struct cwt_wait *cwait = container_of(wait, struct cwt_wait, wait); diff --git a/mm/filemap.c b/mm/filemap.c index 6f1be573a5e6..80c19ee81e95 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -768,10 +768,10 @@ struct wait_page_key { struct wait_page_queue { struct page *page; int bit_nr; - wait_queue_t wait; + wait_queue_entry_t wait; }; -static int wake_page_function(wait_queue_t *wait, unsigned mode, int sync, void *arg) +static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg) { struct wait_page_key *key = arg; struct wait_page_queue *wait_page @@ -834,7 +834,7 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, struct page *page, int bit_nr, int state, bool lock) { struct wait_page_queue wait_page; - wait_queue_t *wait = &wait_page.wait; + wait_queue_entry_t *wait = &wait_page.wait; int ret = 0; init_wait(wait); @@ -847,7 +847,7 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, if (likely(list_empty(&wait->task_list))) { if (lock) - __add_wait_queue_tail_exclusive(q, wait); + __add_wait_queue_entry_tail_exclusive(q, wait); else __add_wait_queue(q, wait); SetPageWaiters(page); @@ -907,7 +907,7 @@ int wait_on_page_bit_killable(struct page *page, 
int bit_nr) * * Add an arbitrary @waiter to the wait queue for the nominated @page. */ -void add_page_wait_queue(struct page *page, wait_queue_t *waiter) +void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter) { wait_queue_head_t *q = page_waitqueue(page); unsigned long flags; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 94172089f52f..9a90b096dc6b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -170,7 +170,7 @@ struct mem_cgroup_event { */ poll_table pt; wait_queue_head_t *wqh; - wait_queue_t wait; + wait_queue_entry_t wait; struct work_struct remove; }; @@ -1479,10 +1479,10 @@ static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); struct oom_wait_info { struct mem_cgroup *memcg; - wait_queue_t wait; + wait_queue_entry_t wait; }; -static int memcg_oom_wake_function(wait_queue_t *wait, +static int memcg_oom_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg) { struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg; @@ -3725,7 +3725,7 @@ static void memcg_event_remove(struct work_struct *work) * * Called with wqh->lock held and interrupts disabled. */ -static int memcg_event_wake(wait_queue_t *wait, unsigned mode, +static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { struct mem_cgroup_event *event = diff --git a/mm/mempool.c b/mm/mempool.c index 47a659dedd44..1c0294858527 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -312,7 +312,7 @@ void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask) { void *element; unsigned long flags; - wait_queue_t wait; + wait_queue_entry_t wait; gfp_t gfp_temp; VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO); diff --git a/mm/shmem.c b/mm/shmem.c index e67d6ba4e98e..a6c7dece4660 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1902,7 +1902,7 @@ unlock: * entry unconditionally - even if something else had already woken the * target. 
*/ -static int synchronous_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) +static int synchronous_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { int ret = default_wake_function(wait, mode, sync, key); list_del_init(&wait->task_list); diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index 7bc2208b6cc4..dca3cdd1a014 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -95,7 +95,7 @@ enum { struct p9_poll_wait { struct p9_conn *conn; - wait_queue_t wait; + wait_queue_entry_t wait; wait_queue_head_t *wait_addr; }; @@ -522,7 +522,7 @@ error: clear_bit(Wworksched, &m->wsched); } -static int p9_pollwake(wait_queue_t *wait, unsigned int mode, int sync, void *key) +static int p9_pollwake(wait_queue_entry_t *wait, unsigned int mode, int sync, void *key) { struct p9_poll_wait *pwait = container_of(wait, struct p9_poll_wait, wait); diff --git a/net/bluetooth/bnep/core.c b/net/bluetooth/bnep/core.c index fbf251fef70f..5c4808b3da2d 100644 --- a/net/bluetooth/bnep/core.c +++ b/net/bluetooth/bnep/core.c @@ -484,7 +484,7 @@ static int bnep_session(void *arg) struct net_device *dev = s->dev; struct sock *sk = s->sock->sk; struct sk_buff *skb; - wait_queue_t wait; + wait_queue_entry_t wait; BT_DBG(""); diff --git a/net/bluetooth/cmtp/core.c b/net/bluetooth/cmtp/core.c index 9e59b6654126..14f7c8135c31 100644 --- a/net/bluetooth/cmtp/core.c +++ b/net/bluetooth/cmtp/core.c @@ -280,7 +280,7 @@ static int cmtp_session(void *arg) struct cmtp_session *session = arg; struct sock *sk = session->sock->sk; struct sk_buff *skb; - wait_queue_t wait; + wait_queue_entry_t wait; BT_DBG("session %p", session); diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c index 0bec4588c3c8..fc31161e98f2 100644 --- a/net/bluetooth/hidp/core.c +++ b/net/bluetooth/hidp/core.c @@ -1244,7 +1244,7 @@ static void hidp_session_run(struct hidp_session *session) static int hidp_session_thread(void *arg) { struct hidp_session *session = arg; - wait_queue_t ctrl_wait, intr_wait; + wait_queue_entry_t ctrl_wait, intr_wait; BT_DBG("session %p", session); diff --git a/net/core/datagram.c b/net/core/datagram.c index db1866f2ffcf..34678828e2bb 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -68,7 +68,7 @@ static inline int connection_based(struct sock *sk) return sk->sk_type == SOCK_SEQPACKET || sk->sk_type == SOCK_STREAM; } -static int receiver_wake_function(wait_queue_t *wait, unsigned int mode, int sync, +static int receiver_wake_function(wait_queue_entry_t *wait, unsigned int mode, int sync, void *key) { unsigned long bits = (unsigned long)key; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 1a0c961f4ffe..c77ced0109b7 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -343,7 +343,7 @@ found: * are still connected to it and there's no way to inform "a polling * implementation" that it should let go of a certain wait queue * - * In order to propagate a wake up, a wait_queue_t of the client + * In order to propagate a wake up, a wait_queue_entry_t of the client * socket is enqueued on the peer_wait queue of the server socket * whose wake function does a wake_up on the ordinary client socket * wait queue. This connection is established whenever a write (or @@ -352,7 +352,7 @@ found: * was relayed. 
*/ -static int unix_dgram_peer_wake_relay(wait_queue_t *q, unsigned mode, int flags, +static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags, void *key) { struct unix_sock *u; diff --git a/sound/core/control.c b/sound/core/control.c index c109b82eef4b..6362da17ac3f 100644 --- a/sound/core/control.c +++ b/sound/core/control.c @@ -1577,7 +1577,7 @@ static ssize_t snd_ctl_read(struct file *file, char __user *buffer, struct snd_ctl_event ev; struct snd_kctl_event *kev; while (list_empty(&ctl->events)) { - wait_queue_t wait; + wait_queue_entry_t wait; if ((file->f_flags & O_NONBLOCK) != 0 || result > 0) { err = -EAGAIN; goto __end_lock; diff --git a/sound/core/hwdep.c b/sound/core/hwdep.c index 9602a7e38d8a..a73baa1242be 100644 --- a/sound/core/hwdep.c +++ b/sound/core/hwdep.c @@ -85,7 +85,7 @@ static int snd_hwdep_open(struct inode *inode, struct file * file) int major = imajor(inode); struct snd_hwdep *hw; int err; - wait_queue_t wait; + wait_queue_entry_t wait; if (major == snd_major) { hw = snd_lookup_minor_data(iminor(inode), diff --git a/sound/core/init.c b/sound/core/init.c index 6bda8436d765..d61d2b3cd521 100644 --- a/sound/core/init.c +++ b/sound/core/init.c @@ -989,7 +989,7 @@ EXPORT_SYMBOL(snd_card_file_remove); */ int snd_power_wait(struct snd_card *card, unsigned int power_state) { - wait_queue_t wait; + wait_queue_entry_t wait; int result = 0; /* fastpath */ diff --git a/sound/core/oss/pcm_oss.c b/sound/core/oss/pcm_oss.c index 36baf962f9b0..cd8b7bef8d06 100644 --- a/sound/core/oss/pcm_oss.c +++ b/sound/core/oss/pcm_oss.c @@ -1554,7 +1554,7 @@ static int snd_pcm_oss_sync1(struct snd_pcm_substream *substream, size_t size) ssize_t result = 0; snd_pcm_state_t state; long res; - wait_queue_t wait; + wait_queue_entry_t wait; runtime = substream->runtime; init_waitqueue_entry(&wait, current); @@ -2387,7 +2387,7 @@ static int snd_pcm_oss_open(struct inode *inode, struct file *file) struct snd_pcm_oss_file *pcm_oss_file; struct snd_pcm_oss_setup setup[2]; int nonblock; - wait_queue_t wait; + wait_queue_entry_t wait; err = nonseekable_open(inode, file); if (err < 0) diff --git a/sound/core/pcm_lib.c b/sound/core/pcm_lib.c index 5088d4b8db22..dd5254077ef7 100644 --- a/sound/core/pcm_lib.c +++ b/sound/core/pcm_lib.c @@ -1904,7 +1904,7 @@ static int wait_for_avail(struct snd_pcm_substream *substream, { struct snd_pcm_runtime *runtime = substream->runtime; int is_playback = substream->stream == SNDRV_PCM_STREAM_PLAYBACK; - wait_queue_t wait; + wait_queue_entry_t wait; int err = 0; snd_pcm_uframes_t avail = 0; long wait_time, tout; diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c index 13dec5ec93f2..faa2e2be6f2e 100644 --- a/sound/core/pcm_native.c +++ b/sound/core/pcm_native.c @@ -1652,7 +1652,7 @@ static int snd_pcm_drain(struct snd_pcm_substream *substream, struct snd_card *card; struct snd_pcm_runtime *runtime; struct snd_pcm_substream *s; - wait_queue_t wait; + wait_queue_entry_t wait; int result = 0; int nonblock = 0; @@ -2353,7 +2353,7 @@ static int snd_pcm_capture_open(struct inode *inode, struct file *file) static int snd_pcm_open(struct file *file, struct snd_pcm *pcm, int stream) { int err; - wait_queue_t wait; + wait_queue_entry_t wait; if (pcm == NULL) { err = -ENODEV; diff --git a/sound/core/rawmidi.c b/sound/core/rawmidi.c index ab890336175f..32588ad05653 100644 --- a/sound/core/rawmidi.c +++ b/sound/core/rawmidi.c @@ -368,7 +368,7 @@ static int snd_rawmidi_open(struct inode *inode, struct file *file) int err; struct snd_rawmidi 
*rmidi; struct snd_rawmidi_file *rawmidi_file = NULL; - wait_queue_t wait; + wait_queue_entry_t wait; if ((file->f_flags & O_APPEND) && !(file->f_flags & O_NONBLOCK)) return -EINVAL; /* invalid combination */ @@ -1002,7 +1002,7 @@ static ssize_t snd_rawmidi_read(struct file *file, char __user *buf, size_t coun while (count > 0) { spin_lock_irq(&runtime->lock); while (!snd_rawmidi_ready(substream)) { - wait_queue_t wait; + wait_queue_entry_t wait; if ((file->f_flags & O_NONBLOCK) != 0 || result > 0) { spin_unlock_irq(&runtime->lock); return result > 0 ? result : -EAGAIN; @@ -1306,7 +1306,7 @@ static ssize_t snd_rawmidi_write(struct file *file, const char __user *buf, while (count > 0) { spin_lock_irq(&runtime->lock); while (!snd_rawmidi_ready_append(substream, count)) { - wait_queue_t wait; + wait_queue_entry_t wait; if (file->f_flags & O_NONBLOCK) { spin_unlock_irq(&runtime->lock); return result > 0 ? result : -EAGAIN; @@ -1338,7 +1338,7 @@ static ssize_t snd_rawmidi_write(struct file *file, const char __user *buf, if (file->f_flags & O_DSYNC) { spin_lock_irq(&runtime->lock); while (runtime->avail != runtime->buffer_size) { - wait_queue_t wait; + wait_queue_entry_t wait; unsigned int last_avail = runtime->avail; init_waitqueue_entry(&wait, current); add_wait_queue(&runtime->sleep, &wait); diff --git a/sound/core/seq/seq_fifo.c b/sound/core/seq/seq_fifo.c index 01c4cfe30c9f..a8c2822e0198 100644 --- a/sound/core/seq/seq_fifo.c +++ b/sound/core/seq/seq_fifo.c @@ -179,7 +179,7 @@ int snd_seq_fifo_cell_out(struct snd_seq_fifo *f, { struct snd_seq_event_cell *cell; unsigned long flags; - wait_queue_t wait; + wait_queue_entry_t wait; if (snd_BUG_ON(!f)) return -EINVAL; diff --git a/sound/core/seq/seq_memory.c b/sound/core/seq/seq_memory.c index d4c61ec9be13..d6e9aacdc36b 100644 --- a/sound/core/seq/seq_memory.c +++ b/sound/core/seq/seq_memory.c @@ -227,7 +227,7 @@ static int snd_seq_cell_alloc(struct snd_seq_pool *pool, struct snd_seq_event_cell *cell; unsigned long flags; int err = -EAGAIN; - wait_queue_t wait; + wait_queue_entry_t wait; if (pool == NULL) return -EINVAL; diff --git a/sound/core/timer.c b/sound/core/timer.c index cd67d1c12cf1..884c3066b028 100644 --- a/sound/core/timer.c +++ b/sound/core/timer.c @@ -1964,7 +1964,7 @@ static ssize_t snd_timer_user_read(struct file *file, char __user *buffer, spin_lock_irq(&tu->qlock); while ((long)count - result >= unit) { while (!tu->qused) { - wait_queue_t wait; + wait_queue_entry_t wait; if ((file->f_flags & O_NONBLOCK) != 0 || result > 0) { err = -EAGAIN; diff --git a/sound/isa/wavefront/wavefront_synth.c b/sound/isa/wavefront/wavefront_synth.c index 4dae9ff9ef5a..0b1e4b34b299 100644 --- a/sound/isa/wavefront/wavefront_synth.c +++ b/sound/isa/wavefront/wavefront_synth.c @@ -1782,7 +1782,7 @@ wavefront_should_cause_interrupt (snd_wavefront_t *dev, int val, int port, unsigned long timeout) { - wait_queue_t wait; + wait_queue_entry_t wait; init_waitqueue_entry(&wait, current); spin_lock_irq(&dev->irq_lock); diff --git a/sound/pci/mixart/mixart_core.c b/sound/pci/mixart/mixart_core.c index dccf3db48fe0..8bf2ce32d4a8 100644 --- a/sound/pci/mixart/mixart_core.c +++ b/sound/pci/mixart/mixart_core.c @@ -239,7 +239,7 @@ int snd_mixart_send_msg(struct mixart_mgr *mgr, struct mixart_msg *request, int struct mixart_msg resp; u32 msg_frame = 0; /* set to 0, so it's no notification to wait for, but the answer */ int err; - wait_queue_t wait; + wait_queue_entry_t wait; long timeout; init_waitqueue_entry(&wait, current); @@ -284,7 +284,7 @@ int 
snd_mixart_send_msg_wait_notif(struct mixart_mgr *mgr, struct mixart_msg *request, u32 notif_event) { int err; - wait_queue_t wait; + wait_queue_entry_t wait; long timeout; if (snd_BUG_ON(!notif_event)) diff --git a/sound/pci/ymfpci/ymfpci_main.c b/sound/pci/ymfpci/ymfpci_main.c index fe4ba463b57c..1114166c685c 100644 --- a/sound/pci/ymfpci/ymfpci_main.c +++ b/sound/pci/ymfpci/ymfpci_main.c @@ -781,7 +781,7 @@ static snd_pcm_uframes_t snd_ymfpci_capture_pointer(struct snd_pcm_substream *su static void snd_ymfpci_irq_wait(struct snd_ymfpci *chip) { - wait_queue_t wait; + wait_queue_entry_t wait; int loops = 4; while (loops-- > 0) { diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index a8d540398bbd..9120edf3c94b 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -184,7 +184,7 @@ int __attribute__((weak)) kvm_arch_set_irq_inatomic( * Called with wqh->lock held and interrupts disabled */ static int -irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key) +irqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { struct kvm_kernel_irqfd *irqfd = container_of(wait, struct kvm_kernel_irqfd, wait); -- cgit v1.2.3 From e130291212df5ce8160cd2e35387c96439863ad3 Mon Sep 17 00:00:00 2001 From: Steve Longerbeam Date: Sat, 10 Jun 2017 16:00:29 -0300 Subject: [media] media: Add i.MX media core driver Add the core media driver for i.MX SOC. Switch from the v4l2_of_ APIs to the v4l2_fwnode_ APIs. Add the bayer formats to imx-media's list of supported pixel and bus formats. Signed-off-by: Steve Longerbeam Signed-off-by: Philipp Zabel Signed-off-by: Russell King Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- Documentation/media/v4l-drivers/imx.rst | 614 ++++++++++++++++ drivers/staging/media/Kconfig | 2 + drivers/staging/media/Makefile | 1 + drivers/staging/media/imx/Kconfig | 7 + drivers/staging/media/imx/Makefile | 5 + drivers/staging/media/imx/imx-media-dev.c | 667 +++++++++++++++++ drivers/staging/media/imx/imx-media-fim.c | 494 +++++++++++++ drivers/staging/media/imx/imx-media-internal-sd.c | 349 +++++++++ drivers/staging/media/imx/imx-media-of.c | 270 +++++++ drivers/staging/media/imx/imx-media-utils.c | 834 ++++++++++++++++++++++ drivers/staging/media/imx/imx-media.h | 323 +++++++++ include/media/imx.h | 15 + include/uapi/linux/v4l2-controls.h | 4 + 13 files changed, 3585 insertions(+) create mode 100644 Documentation/media/v4l-drivers/imx.rst create mode 100644 drivers/staging/media/imx/Kconfig create mode 100644 drivers/staging/media/imx/Makefile create mode 100644 drivers/staging/media/imx/imx-media-dev.c create mode 100644 drivers/staging/media/imx/imx-media-fim.c create mode 100644 drivers/staging/media/imx/imx-media-internal-sd.c create mode 100644 drivers/staging/media/imx/imx-media-of.c create mode 100644 drivers/staging/media/imx/imx-media-utils.c create mode 100644 drivers/staging/media/imx/imx-media.h create mode 100644 include/media/imx.h (limited to 'include/uapi/linux') diff --git a/Documentation/media/v4l-drivers/imx.rst b/Documentation/media/v4l-drivers/imx.rst new file mode 100644 index 000000000000..e0ee0f1aeb05 --- /dev/null +++ b/Documentation/media/v4l-drivers/imx.rst @@ -0,0 +1,614 @@ +i.MX Video Capture Driver +========================= + +Introduction +------------ + +The Freescale i.MX5/6 contains an Image Processing Unit (IPU), which +handles the flow of image frames to and from capture devices and +display devices. 
+ +For image capture, the IPU contains the following internal subunits: + +- Image DMA Controller (IDMAC) +- Camera Serial Interface (CSI) +- Image Converter (IC) +- Sensor Multi-FIFO Controller (SMFC) +- Image Rotator (IRT) +- Video De-Interlacing or Combining Block (VDIC) + +The IDMAC is the DMA controller for transfer of image frames to and from +memory. Various dedicated DMA channels exist for both video capture and +display paths. During transfer, the IDMAC is also capable of vertical +image flip, 8x8 block transfer (see IRT description), pixel component +re-ordering (for example UYVY to YUYV) within the same colorspace, and +even packed <--> planar conversion. It can also perform a simple +de-interlacing by interleaving even and odd lines during transfer +(without motion compensation which requires the VDIC). + +The CSI is the backend capture unit that interfaces directly with +camera sensors over Parallel, BT.656/1120, and MIPI CSI-2 busses. + +The IC handles color-space conversion, resizing (downscaling and +upscaling), horizontal flip, and 90/270 degree rotation operations. + +There are three independent "tasks" within the IC that can carry out +conversions concurrently: pre-process encoding, pre-process viewfinder, +and post-processing. Within each task, conversions are split into three +sections: downsizing section, main section (upsizing, flip, colorspace +conversion, and graphics plane combining), and rotation section. + +The IPU time-shares the IC task operations. The time-slice granularity +is one burst of eight pixels in the downsizing section, one image line +in the main processing section, one image frame in the rotation section. + +The SMFC is composed of four independent FIFOs that each can transfer +captured frames from sensors directly to memory concurrently via four +IDMAC channels. + +The IRT carries out 90 and 270 degree image rotation operations. The +rotation operation is carried out on 8x8 pixel blocks at a time. This +operation is supported by the IDMAC which handles the 8x8 block transfer +along with block reordering, in coordination with vertical flip. + +The VDIC handles the conversion of interlaced video to progressive, with +support for different motion compensation modes (low, medium, and high +motion). The deinterlaced output frames from the VDIC can be sent to the +IC pre-process viewfinder task for further conversions. The VDIC also +contains a Combiner that combines two image planes, with alpha blending +and color keying. + +In addition to the IPU internal subunits, there are also two units +outside the IPU that are also involved in video capture on i.MX: + +- MIPI CSI-2 Receiver for camera sensors with the MIPI CSI-2 bus + interface. This is a Synopsys DesignWare core. +- Two video multiplexers for selecting among multiple sensor inputs + to send to a CSI. + +For more info, refer to the latest versions of the i.MX5/6 reference +manuals [#f1]_ and [#f2]_. + + +Features +-------- + +Some of the features of this driver include: + +- Many different pipelines can be configured via media controller API, + that correspond to the hardware video capture pipelines supported in + the i.MX. + +- Supports parallel, BT.565, and MIPI CSI-2 interfaces. + +- Concurrent independent streams, by configuring pipelines to multiple + video capture interfaces using independent entities. + +- Scaling, color-space conversion, horizontal and vertical flip, and + image rotation via IC task subdevs. 
+ +- Many pixel formats supported (RGB, packed and planar YUV, partial + planar YUV). + +- The VDIC subdev supports motion compensated de-interlacing, with three + motion compensation modes: low, medium, and high motion. Pipelines are + defined that allow sending frames to the VDIC subdev directly from the + CSI. There is also support in the future for sending frames to the + VDIC from memory buffers via a output/mem2mem devices. + +- Includes a Frame Interval Monitor (FIM) that can correct vertical sync + problems with the ADV718x video decoders. + + +Entities +-------- + +imx6-mipi-csi2 +-------------- + +This is the MIPI CSI-2 receiver entity. It has one sink pad to receive +the MIPI CSI-2 stream (usually from a MIPI CSI-2 camera sensor). It has +four source pads, corresponding to the four MIPI CSI-2 demuxed virtual +channel outputs. Multpiple source pads can be enabled to independently +stream from multiple virtual channels. + +This entity actually consists of two sub-blocks. One is the MIPI CSI-2 +core. This is a Synopsys Designware MIPI CSI-2 core. The other sub-block +is a "CSI-2 to IPU gasket". The gasket acts as a demultiplexer of the +four virtual channels streams, providing four separate parallel buses +containing each virtual channel that are routed to CSIs or video +multiplexers as described below. + +On i.MX6 solo/dual-lite, all four virtual channel buses are routed to +two video multiplexers. Both CSI0 and CSI1 can receive any virtual +channel, as selected by the video multiplexers. + +On i.MX6 Quad, virtual channel 0 is routed to IPU1-CSI0 (after selected +by a video mux), virtual channels 1 and 2 are hard-wired to IPU1-CSI1 +and IPU2-CSI0, respectively, and virtual channel 3 is routed to +IPU2-CSI1 (again selected by a video mux). + +ipuX_csiY_mux +------------- + +These are the video multiplexers. They have two or more sink pads to +select from either camera sensors with a parallel interface, or from +MIPI CSI-2 virtual channels from imx6-mipi-csi2 entity. They have a +single source pad that routes to a CSI (ipuX_csiY entities). + +On i.MX6 solo/dual-lite, there are two video mux entities. One sits +in front of IPU1-CSI0 to select between a parallel sensor and any of +the four MIPI CSI-2 virtual channels (a total of five sink pads). The +other mux sits in front of IPU1-CSI1, and again has five sink pads to +select between a parallel sensor and any of the four MIPI CSI-2 virtual +channels. + +On i.MX6 Quad, there are two video mux entities. One sits in front of +IPU1-CSI0 to select between a parallel sensor and MIPI CSI-2 virtual +channel 0 (two sink pads). The other mux sits in front of IPU2-CSI1 to +select between a parallel sensor and MIPI CSI-2 virtual channel 3 (two +sink pads). + +ipuX_csiY +--------- + +These are the CSI entities. They have a single sink pad receiving from +either a video mux or from a MIPI CSI-2 virtual channel as described +above. + +This entity has two source pads. The first source pad can link directly +to the ipuX_vdic entity or the ipuX_ic_prp entity, using hardware links +that require no IDMAC memory buffer transfer. + +When the direct source pad is routed to the ipuX_ic_prp entity, frames +from the CSI can be processed by one or both of the IC pre-processing +tasks. + +When the direct source pad is routed to the ipuX_vdic entity, the VDIC +will carry out motion-compensated de-interlace using "high motion" mode +(see description of ipuX_vdic entity). 
+ +The second source pad sends video frames directly to memory buffers +via the SMFC and an IDMAC channel, bypassing IC pre-processing. This +source pad is routed to a capture device node, with a node name of the +format "ipuX_csiY capture". + +Note that since the IDMAC source pad makes use of an IDMAC channel, it +can do pixel reordering within the same colorspace. For example, the +sink pad can take UYVY2X8, but the IDMAC source pad can output YUYV2X8. +If the sink pad is receiving YUV, the output at the capture device can +also be converted to a planar YUV format such as YUV420. + +It will also perform simple de-interlace without motion compensation, +which is activated if the sink pad's field type is an interlaced type, +and the IDMAC source pad field type is set to none. + +This subdev can generate the following event when enabling the second +IDMAC source pad: + +- V4L2_EVENT_IMX_FRAME_INTERVAL_ERROR + +The user application can subscribe to this event from the ipuX_csiY +subdev node. This event is generated by the Frame Interval Monitor +(see below for more on the FIM). + +Cropping in ipuX_csiY +--------------------- + +The CSI supports cropping the incoming raw sensor frames. This is +implemented in the ipuX_csiY entities at the sink pad, using the +crop selection subdev API. + +The CSI also supports fixed divide-by-two downscaling indepently in +width and height. This is implemented in the ipuX_csiY entities at +the sink pad, using the compose selection subdev API. + +The output rectangle at the ipuX_csiY source pad is the same as +the compose rectangle at the sink pad. So the source pad rectangle +cannot be negotiated, it must be set using the compose selection +API at sink pad (if /2 downscale is desired, otherwise source pad +rectangle is equal to incoming rectangle). + +To give an example of crop and /2 downscale, this will crop a +1280x960 input frame to 640x480, and then /2 downscale in both +dimensions to 320x240 (assumes ipu1_csi0 is linked to ipu1_csi0_mux): + +media-ctl -V "'ipu1_csi0_mux':2[fmt:UYVY2X8/1280x960]" +media-ctl -V "'ipu1_csi0':0[crop:(0,0)/640x480]" +media-ctl -V "'ipu1_csi0':0[compose:(0,0)/320x240]" + +Frame Skipping in ipuX_csiY +--------------------------- + +The CSI supports frame rate decimation, via frame skipping. Frame +rate decimation is specified by setting the frame intervals at +sink and source pads. The ipuX_csiY entity then applies the best +frame skip setting to the CSI to achieve the desired frame rate +at the source pad. + +The following example reduces an assumed incoming 60 Hz frame +rate by half at the IDMAC output source pad: + +media-ctl -V "'ipu1_csi0':0[fmt:UYVY2X8/640x480@1/60]" +media-ctl -V "'ipu1_csi0':2[fmt:UYVY2X8/640x480@1/30]" + +Frame Interval Monitor in ipuX_csiY +----------------------------------- + +The adv718x decoders can occasionally send corrupt fields during +NTSC/PAL signal re-sync (too little or too many video lines). When +this happens, the IPU triggers a mechanism to re-establish vertical +sync by adding 1 dummy line every frame, which causes a rolling effect +from image to image, and can last a long time before a stable image is +recovered. Or sometimes the mechanism doesn't work at all, causing a +permanent split image (one frame contains lines from two consecutive +captured images). 
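As noted earlier in the ipuX_csiY description, applications subscribe to V4L2_EVENT_IMX_FRAME_INTERVAL_ERROR on the ipuX_csiY subdev device node and typically restart streaming when the event fires. A rough userspace sketch of that subscription (the subdev path is an assumption, the include of media/imx.h refers to the kernel-tree header this patch adds and would need to be on the include path, and a real application would poll() and restart the stream rather than just print):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/videodev2.h>
#include <media/imx.h>	/* V4L2_EVENT_IMX_FRAME_INTERVAL_ERROR, added by this patch */

static int watch_fim_events(const char *csi_subdev)	/* e.g. "/dev/v4l-subdev1", assumed */
{
	struct v4l2_event_subscription sub = {
		.type = V4L2_EVENT_IMX_FRAME_INTERVAL_ERROR,
	};
	struct v4l2_event ev;
	int fd = open(csi_subdev, O_RDWR);

	if (fd < 0)
		return -1;
	if (ioctl(fd, VIDIOC_SUBSCRIBE_EVENT, &sub) < 0) {
		close(fd);
		return -1;
	}
	/* Blocks until an event arrives; a real application would poll() for POLLPRI. */
	if (ioctl(fd, VIDIOC_DQEVENT, &ev) == 0 &&
	    ev.type == V4L2_EVENT_IMX_FRAME_INTERVAL_ERROR)
		fprintf(stderr, "FIM: bad frame interval, restart streaming\n");

	close(fd);
	return 0;
}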
+ +From experiment it was found that during image rolling, the frame +intervals (elapsed time between two EOF's) drop below the nominal +value for the current standard, by about one frame time (60 usec), +and remain at that value until rolling stops. + +While the reason for this observation isn't known (the IPU dummy +line mechanism should show an increase in the intervals by 1 line +time every frame, not a fixed value), we can use it to detect the +corrupt fields using a frame interval monitor. If the FIM detects a +bad frame interval, the ipuX_csiY subdev will send the event +V4L2_EVENT_IMX_FRAME_INTERVAL_ERROR. Userland can register with +the FIM event notification on the ipuX_csiY subdev device node. +Userland can issue a streaming restart when this event is received +to correct the rolling/split image. + +The ipuX_csiY subdev includes custom controls to tweak some dials for +FIM. If one of these controls is changed during streaming, the FIM will +be reset and will continue at the new settings. + +- V4L2_CID_IMX_FIM_ENABLE + +Enable/disable the FIM. + +- V4L2_CID_IMX_FIM_NUM + +How many frame interval measurements to average before comparing against +the nominal frame interval reported by the sensor. This can reduce noise +caused by interrupt latency. + +- V4L2_CID_IMX_FIM_TOLERANCE_MIN + +If the averaged intervals fall outside nominal by this amount, in +microseconds, the V4L2_EVENT_IMX_FRAME_INTERVAL_ERROR event is sent. + +- V4L2_CID_IMX_FIM_TOLERANCE_MAX + +If any intervals are higher than this value, those samples are +discarded and do not enter into the average. This can be used to +discard really high interval errors that might be due to interrupt +latency from high system load. + +- V4L2_CID_IMX_FIM_NUM_SKIP + +How many frames to skip after a FIM reset or stream restart before +FIM begins to average intervals. + +- V4L2_CID_IMX_FIM_ICAP_CHANNEL +- V4L2_CID_IMX_FIM_ICAP_EDGE + +These controls will configure an input capture channel as the method +for measuring frame intervals. This is superior to the default method +of measuring frame intervals via EOF interrupt, since it is not subject +to uncertainty errors introduced by interrupt latency. + +Input capture requires hardware support. A VSYNC signal must be routed +to one of the i.MX6 input capture channel pads. + +V4L2_CID_IMX_FIM_ICAP_CHANNEL configures which i.MX6 input capture +channel to use. This must be 0 or 1. + +V4L2_CID_IMX_FIM_ICAP_EDGE configures which signal edge will trigger +input capture events. By default the input capture method is disabled +with a value of IRQ_TYPE_NONE. Set this control to IRQ_TYPE_EDGE_RISING, +IRQ_TYPE_EDGE_FALLING, or IRQ_TYPE_EDGE_BOTH to enable input capture, +triggered on the given signal edge(s). + +When input capture is disabled, frame intervals will be measured via +EOF interrupt. + + +ipuX_vdic +--------- + +The VDIC carries out motion compensated de-interlacing, with three +motion compensation modes: low, medium, and high motion. The mode is +specified with the menu control V4L2_CID_DEINTERLACING_MODE. It has +two sink pads and a single source pad. + +The direct sink pad receives from an ipuX_csiY direct pad. With this +link the VDIC can only operate in high motion mode. + +When the IDMAC sink pad is activated, it receives from an output +or mem2mem device node. With this pipeline, it can also operate +in low and medium modes, because these modes require receiving +frames from memory buffers. 
Note that an output or mem2mem device +is not implemented yet, so this sink pad currently has no links. + +The source pad routes to the IC pre-processing entity ipuX_ic_prp. + +ipuX_ic_prp +----------- + +This is the IC pre-processing entity. It acts as a router, routing +data from its sink pad to one or both of its source pads. + +It has a single sink pad. The sink pad can receive from the ipuX_csiY +direct pad, or from ipuX_vdic. + +This entity has two source pads. One source pad routes to the +pre-process encode task entity (ipuX_ic_prpenc), the other to the +pre-process viewfinder task entity (ipuX_ic_prpvf). Both source pads +can be activated at the same time if the sink pad is receiving from +ipuX_csiY. Only the source pad to the pre-process viewfinder task entity +can be activated if the sink pad is receiving from ipuX_vdic (frames +from the VDIC can only be processed by the pre-process viewfinder task). + +ipuX_ic_prpenc +-------------- + +This is the IC pre-processing encode entity. It has a single sink +pad from ipuX_ic_prp, and a single source pad. The source pad is +routed to a capture device node, with a node name of the format +"ipuX_ic_prpenc capture". + +This entity performs the IC pre-process encode task operations: +color-space conversion, resizing (downscaling and upscaling), +horizontal and vertical flip, and 90/270 degree rotation. Flip +and rotation are provided via standard V4L2 controls. + +Like the ipuX_csiY IDMAC source, it can also perform simple de-interlace +without motion compensation, and pixel reordering. + +ipuX_ic_prpvf +------------- + +This is the IC pre-processing viewfinder entity. It has a single sink +pad from ipuX_ic_prp, and a single source pad. The source pad is routed +to a capture device node, with a node name of the format +"ipuX_ic_prpvf capture". + +It is identical in operation to ipuX_ic_prpenc, with the same resizing +and CSC operations and flip/rotation controls. It will receive and +process de-interlaced frames from the ipuX_vdic if ipuX_ic_prp is +receiving from ipuX_vdic. + +Like the ipuX_csiY IDMAC source, it can perform simple de-interlace +without motion compensation. However, note that if the ipuX_vdic is +included in the pipeline (ipuX_ic_prp is receiving from ipuX_vdic), +it's not possible to use simple de-interlace in ipuX_ic_prpvf, since +the ipuX_vdic has already carried out de-interlacing (with motion +compensation) and therefore the field type output from ipuX_ic_prp can +only be none. + +Capture Pipelines +----------------- + +The following describe the various use-cases supported by the pipelines. + +The links shown do not include the backend sensor, video mux, or mipi +csi-2 receiver links. This depends on the type of sensor interface +(parallel or mipi csi-2). So these pipelines begin with: + +sensor -> ipuX_csiY_mux -> ... + +for parallel sensors, or: + +sensor -> imx6-mipi-csi2 -> (ipuX_csiY_mux) -> ... + +for mipi csi-2 sensors. The imx6-mipi-csi2 receiver may need to route +to the video mux (ipuX_csiY_mux) before sending to the CSI, depending +on the mipi csi-2 virtual channel, hence ipuX_csiY_mux is shown in +parenthesis. 
+
+Unprocessed Video Capture:
+--------------------------
+
+Send frames directly from the sensor to the camera device interface
+node, with no conversions, via the ipuX_csiY IDMAC source pad:
+
+-> ipuX_csiY:2 -> ipuX_csiY capture
+
+IC Direct Conversions:
+----------------------
+
+This pipeline uses the pre-process encode entity to route frames
+directly from the CSI to the IC, to carry out scaling up to 1024x1024
+resolution, CSC, flipping, and image rotation:
+
+-> ipuX_csiY:1 -> 0:ipuX_ic_prp:1 -> 0:ipuX_ic_prpenc:1 ->
+   ipuX_ic_prpenc capture
+
+Motion Compensated De-interlace:
+--------------------------------
+
+This pipeline routes frames from the CSI direct pad to the VDIC entity to
+support motion-compensated de-interlacing (high motion mode only),
+scaling up to 1024x1024, CSC, flip, and rotation:
+
+-> ipuX_csiY:1 -> 0:ipuX_vdic:2 -> 0:ipuX_ic_prp:2 ->
+   0:ipuX_ic_prpvf:1 -> ipuX_ic_prpvf capture
+
+
+Usage Notes
+-----------
+
+To aid in configuration and for backward compatibility with V4L2
+applications that access controls only from video device nodes, the
+capture device interfaces inherit controls from the active entities
+in the current pipeline, so controls can be accessed either directly
+from the subdev or from the active capture device interface. For
+example, the FIM controls are available either from the ipuX_csiY
+subdevs or from the active capture device.
+
+The following are specific usage notes for the Sabre* reference
+boards:
+
+
+SabreLite with OV5642 and OV5640
+--------------------------------
+
+This platform requires the OmniVision OV5642 module with a parallel
+camera interface, and the OV5640 module with a MIPI CSI-2
+interface. Both modules are available from Boundary Devices:
+
+https://boundarydevices.com/product/nit6x_5mp
+https://boundarydevices.com/product/nit6x_5mp_mipi
+
+Note that if only one camera module is available, the other sensor
+node can be disabled in the device tree.
+
+The OV5642 module is connected to the parallel bus input on the i.MX
+internal video mux to IPU1 CSI0. Its i2c bus connects to i2c bus 2.
+
+The MIPI CSI-2 OV5640 module is connected to the i.MX internal MIPI CSI-2
+receiver, and the four virtual channel outputs from the receiver are
+routed as follows: vc0 to the IPU1 CSI0 mux, vc1 directly to IPU1 CSI1,
+vc2 directly to IPU2 CSI0, and vc3 to the IPU2 CSI1 mux. The OV5640 is
+also connected to i2c bus 2 on the SabreLite, so the OV5642 and OV5640
+must not share the same i2c slave address.
+
+The following basic example configures unprocessed video capture
+pipelines for both sensors. The OV5642 is routed to ipu1_csi0, and
+the OV5640, transmitting on MIPI CSI-2 virtual channel 1 (which is
+imx6-mipi-csi2 pad 2), is routed to ipu1_csi1. Both sensors are
+configured to output 640x480, and the OV5642 outputs YUYV2X8, the
+OV5640 UYVY2X8:
+
+..
code-block:: none + + # Setup links for OV5642 + media-ctl -l "'ov5642 1-0042':0 -> 'ipu1_csi0_mux':1[1]" + media-ctl -l "'ipu1_csi0_mux':2 -> 'ipu1_csi0':0[1]" + media-ctl -l "'ipu1_csi0':2 -> 'ipu1_csi0 capture':0[1]" + # Setup links for OV5640 + media-ctl -l "'ov5640 1-0040':0 -> 'imx6-mipi-csi2':0[1]" + media-ctl -l "'imx6-mipi-csi2':2 -> 'ipu1_csi1':0[1]" + media-ctl -l "'ipu1_csi1':2 -> 'ipu1_csi1 capture':0[1]" + # Configure pads for OV5642 pipeline + media-ctl -V "'ov5642 1-0042':0 [fmt:YUYV2X8/640x480 field:none]" + media-ctl -V "'ipu1_csi0_mux':2 [fmt:YUYV2X8/640x480 field:none]" + media-ctl -V "'ipu1_csi0':2 [fmt:AYUV32/640x480 field:none]" + # Configure pads for OV5640 pipeline + media-ctl -V "'ov5640 1-0040':0 [fmt:UYVY2X8/640x480 field:none]" + media-ctl -V "'imx6-mipi-csi2':2 [fmt:UYVY2X8/640x480 field:none]" + media-ctl -V "'ipu1_csi1':2 [fmt:AYUV32/640x480 field:none]" + +Streaming can then begin independently on the capture device nodes +"ipu1_csi0 capture" and "ipu1_csi1 capture". The v4l2-ctl tool can +be used to select any supported YUV pixelformat on the capture device +nodes, including planar. + +SabreAuto with ADV7180 decoder +------------------------------ + +On the SabreAuto, an on-board ADV7180 SD decoder is connected to the +parallel bus input on the internal video mux to IPU1 CSI0. + +The following example configures a pipeline to capture from the ADV7180 +video decoder, assuming NTSC 720x480 input signals, with Motion +Compensated de-interlacing. Pad field types assume the adv7180 outputs +"interlaced". $outputfmt can be any format supported by the ipu1_ic_prpvf +entity at its output pad: + +.. code-block:: none + + # Setup links + media-ctl -l "'adv7180 3-0021':0 -> 'ipu1_csi0_mux':1[1]" + media-ctl -l "'ipu1_csi0_mux':2 -> 'ipu1_csi0':0[1]" + media-ctl -l "'ipu1_csi0':1 -> 'ipu1_vdic':0[1]" + media-ctl -l "'ipu1_vdic':2 -> 'ipu1_ic_prp':0[1]" + media-ctl -l "'ipu1_ic_prp':2 -> 'ipu1_ic_prpvf':0[1]" + media-ctl -l "'ipu1_ic_prpvf':1 -> 'ipu1_ic_prpvf capture':0[1]" + # Configure pads + media-ctl -V "'adv7180 3-0021':0 [fmt:UYVY2X8/720x480]" + media-ctl -V "'ipu1_csi0_mux':2 [fmt:UYVY2X8/720x480 field:interlaced]" + media-ctl -V "'ipu1_csi0':1 [fmt:AYUV32/720x480 field:interlaced]" + media-ctl -V "'ipu1_vdic':2 [fmt:AYUV32/720x480 field:none]" + media-ctl -V "'ipu1_ic_prp':2 [fmt:AYUV32/720x480 field:none]" + media-ctl -V "'ipu1_ic_prpvf':1 [fmt:$outputfmt field:none]" + +Streaming can then begin on the capture device node at +"ipu1_ic_prpvf capture". The v4l2-ctl tool can be used to select any +supported YUV or RGB pixelformat on the capture device node. + +This platform accepts Composite Video analog inputs to the ADV7180 on +Ain1 (connector J42). + +SabreSD with MIPI CSI-2 OV5640 +------------------------------ + +Similarly to SabreLite, the SabreSD supports a parallel interface +OV5642 module on IPU1 CSI0, and a MIPI CSI-2 OV5640 module. The OV5642 +connects to i2c bus 1 and the OV5640 to i2c bus 2. + +The device tree for SabreSD includes OF graphs for both the parallel +OV5642 and the MIPI CSI-2 OV5640, but as of this writing only the MIPI +CSI-2 OV5640 has been tested, so the OV5642 node is currently disabled. +The OV5640 module connects to MIPI connector J5 (sorry I don't have the +compatible module part number or URL). + +The following example configures a direct conversion pipeline to capture +from the OV5640, transmitting on MIPI CSI-2 virtual channel 1. $sensorfmt +can be any format supported by the OV5640. 
$sensordim is the frame +dimension part of $sensorfmt (minus the mbus pixel code). $outputfmt can +be any format supported by the ipu1_ic_prpenc entity at its output pad: + +.. code-block:: none + + # Setup links + media-ctl -l "'ov5640 1-003c':0 -> 'imx6-mipi-csi2':0[1]" + media-ctl -l "'imx6-mipi-csi2':2 -> 'ipu1_csi1':0[1]" + media-ctl -l "'ipu1_csi1':1 -> 'ipu1_ic_prp':0[1]" + media-ctl -l "'ipu1_ic_prp':1 -> 'ipu1_ic_prpenc':0[1]" + media-ctl -l "'ipu1_ic_prpenc':1 -> 'ipu1_ic_prpenc capture':0[1]" + # Configure pads + media-ctl -V "'ov5640 1-003c':0 [fmt:$sensorfmt field:none]" + media-ctl -V "'imx6-mipi-csi2':2 [fmt:$sensorfmt field:none]" + media-ctl -V "'ipu1_csi1':1 [fmt:AYUV32/$sensordim field:none]" + media-ctl -V "'ipu1_ic_prp':1 [fmt:AYUV32/$sensordim field:none]" + media-ctl -V "'ipu1_ic_prpenc':1 [fmt:$outputfmt field:none]" + +Streaming can then begin on "ipu1_ic_prpenc capture" node. The v4l2-ctl +tool can be used to select any supported YUV or RGB pixelformat on the +capture device node. + + +Known Issues +------------ + +1. When using 90 or 270 degree rotation control at capture resolutions + near the IC resizer limit of 1024x1024, and combined with planar + pixel formats (YUV420, YUV422p), frame capture will often fail with + no end-of-frame interrupts from the IDMAC channel. To work around + this, use lower resolution and/or packed formats (YUYV, RGB3, etc.) + when 90 or 270 rotations are needed. + + +File list +--------- + +drivers/staging/media/imx/ +include/media/imx.h +include/linux/imx-media.h + +References +---------- + +.. [#f1] http://www.nxp.com/assets/documents/data/en/reference-manuals/IMX6DQRM.pdf +.. [#f2] http://www.nxp.com/assets/documents/data/en/reference-manuals/IMX6SDLRM.pdf + + +Authors +------- +Steve Longerbeam +Philipp Zabel +Russell King + +Copyright (C) 2012-2017 Mentor Graphics Inc. diff --git a/drivers/staging/media/Kconfig b/drivers/staging/media/Kconfig index dbda4d9a08e7..f8c25ee082ef 100644 --- a/drivers/staging/media/Kconfig +++ b/drivers/staging/media/Kconfig @@ -27,6 +27,8 @@ source "drivers/staging/media/cxd2099/Kconfig" source "drivers/staging/media/davinci_vpfe/Kconfig" +source "drivers/staging/media/imx/Kconfig" + source "drivers/staging/media/omap4iss/Kconfig" # Keep LIRC at the end, as it has sub-menus diff --git a/drivers/staging/media/Makefile b/drivers/staging/media/Makefile index c04600c81264..ac090c5fce30 100644 --- a/drivers/staging/media/Makefile +++ b/drivers/staging/media/Makefile @@ -1,5 +1,6 @@ obj-$(CONFIG_I2C_BCM2048) += bcm2048/ obj-$(CONFIG_DVB_CXD2099) += cxd2099/ +obj-$(CONFIG_VIDEO_IMX_MEDIA) += imx/ obj-$(CONFIG_LIRC_STAGING) += lirc/ obj-$(CONFIG_VIDEO_DM365_VPFE) += davinci_vpfe/ obj-$(CONFIG_VIDEO_OMAP4) += omap4iss/ diff --git a/drivers/staging/media/imx/Kconfig b/drivers/staging/media/imx/Kconfig new file mode 100644 index 000000000000..5e79a36ce225 --- /dev/null +++ b/drivers/staging/media/imx/Kconfig @@ -0,0 +1,7 @@ +config VIDEO_IMX_MEDIA + tristate "i.MX5/6 V4L2 media core driver" + depends on MEDIA_CONTROLLER && VIDEO_V4L2 && ARCH_MXC && IMX_IPUV3_CORE + select V4L2_FWNODE + ---help--- + Say yes here to enable support for video4linux media controller + driver for the i.MX5/6 SOC. 
diff --git a/drivers/staging/media/imx/Makefile b/drivers/staging/media/imx/Makefile new file mode 100644 index 000000000000..ddd7d94dbac9 --- /dev/null +++ b/drivers/staging/media/imx/Makefile @@ -0,0 +1,5 @@ +imx-media-objs := imx-media-dev.o imx-media-internal-sd.o imx-media-of.o +imx-media-common-objs := imx-media-utils.o imx-media-fim.o + +obj-$(CONFIG_VIDEO_IMX_MEDIA) += imx-media.o +obj-$(CONFIG_VIDEO_IMX_MEDIA) += imx-media-common.o diff --git a/drivers/staging/media/imx/imx-media-dev.c b/drivers/staging/media/imx/imx-media-dev.c new file mode 100644 index 000000000000..48cbc7716758 --- /dev/null +++ b/drivers/staging/media/imx/imx-media-dev.c @@ -0,0 +1,667 @@ +/* + * V4L2 Media Controller Driver for Freescale i.MX5/6 SOC + * + * Copyright (c) 2016 Mentor Graphics Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include