From 7016e0627171878810798a842a416dddee4e3329 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 13 Sep 2017 13:58:15 -0700 Subject: net: Convert int functions to bool Global function inet_rcv_saddr_equal and static functions ipv6_rcv_saddr_equal and ipv4_rcv_saddr_equal currently return int. bool is slightly more descriptive for these functions, so change their return type from int to bool. Signed-off-by: Joe Perches Signed-off-by: David S. Miller --- include/net/addrconf.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/addrconf.h b/include/net/addrconf.h index f44ff2476758..87981cd63180 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -94,8 +94,8 @@ int __ipv6_get_lladdr(struct inet6_dev *idev, struct in6_addr *addr, u32 banned_flags); int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr, u32 banned_flags); -int inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, - bool match_wildcard); +bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, + bool match_wildcard); void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr); void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr *addr); -- cgit v1.2.3 From bffa72cf7f9df842f0016ba03586039296b4caaf Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 19 Sep 2017 05:14:24 -0700 Subject: net: sk_buff rbnode reorg skb->rbnode shares space with skb->next, skb->prev and skb->tstamp Current uses (TCP receive ofo queue and netem) need to save/restore tstamp, while skb->dev is either NULL (TCP) or a constant for a given queue (netem). Since we plan to use an RB tree for TCP retransmit queue to speedup SACK processing with large BDP, this patch exchanges skb->dev and skb->tstamp. This saves some overhead in both TCP and netem. v2: removes the swtstamp field from struct tcp_skb_cb Signed-off-by: Eric Dumazet Cc: Soheil Hassas Yeganeh Cc: Wei Wang Cc: Willem de Bruijn Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- include/linux/skbuff.h | 16 ++++++++-------- include/net/tcp.h | 6 ------ net/ipv4/tcp_input.c | 27 +++++---------------------- net/sched/sch_netem.c | 7 ++++--- 4 files changed, 17 insertions(+), 39 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 72299ef00061..492828801acb 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -661,8 +661,12 @@ struct sk_buff { struct sk_buff *prev; union { - ktime_t tstamp; - u64 skb_mstamp; + struct net_device *dev; + /* Some protocols might use this space to store information, + * while device pointer would be NULL. + * UDP receive path is one user. + */ + unsigned long dev_scratch; }; }; struct rb_node rbnode; /* used in netem & tcp stack */ @@ -670,12 +674,8 @@ struct sk_buff { struct sock *sk; union { - struct net_device *dev; - /* Some protocols might use this space to store information, - * while device pointer would be NULL. - * UDP receive path is one user. - */ - unsigned long dev_scratch; + ktime_t tstamp; + u64 skb_mstamp; }; /* * This is the control buffer. It is free to use for every diff --git a/include/net/tcp.h b/include/net/tcp.h index b510f284427a..49a8a46466f3 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -797,12 +797,6 @@ struct tcp_skb_cb { u16 tcp_gso_segs; u16 tcp_gso_size; }; - - /* Used to stash the receive timestamp while this skb is in the * out of order queue, as skb->tstamp is overwritten by the * rbnode.
- */ - ktime_t swtstamp; }; __u8 tcp_flags; /* TCP header flags. (tcp[13]) */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index bddf724f5c02..db9bb46b5776 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4266,11 +4266,6 @@ static void tcp_sack_remove(struct tcp_sock *tp) tp->rx_opt.num_sacks = num_sacks; } -enum tcp_queue { - OOO_QUEUE, - RCV_QUEUE, -}; - /** * tcp_try_coalesce - try to merge skb to prior one * @sk: socket @@ -4286,7 +4281,6 @@ enum tcp_queue { * Returns true if caller should free @from instead of queueing it */ static bool tcp_try_coalesce(struct sock *sk, - enum tcp_queue dest, struct sk_buff *to, struct sk_buff *from, bool *fragstolen) @@ -4311,10 +4305,7 @@ static bool tcp_try_coalesce(struct sock *sk, if (TCP_SKB_CB(from)->has_rxtstamp) { TCP_SKB_CB(to)->has_rxtstamp = true; - if (dest == OOO_QUEUE) - TCP_SKB_CB(to)->swtstamp = TCP_SKB_CB(from)->swtstamp; - else - to->tstamp = from->tstamp; + to->tstamp = from->tstamp; } return true; @@ -4351,9 +4342,6 @@ static void tcp_ofo_queue(struct sock *sk) } p = rb_next(p); rb_erase(&skb->rbnode, &tp->out_of_order_queue); - /* Replace tstamp which was stomped by rbnode */ - if (TCP_SKB_CB(skb)->has_rxtstamp) - skb->tstamp = TCP_SKB_CB(skb)->swtstamp; if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) { SOCK_DEBUG(sk, "ofo packet was already received\n"); @@ -4365,8 +4353,7 @@ static void tcp_ofo_queue(struct sock *sk) TCP_SKB_CB(skb)->end_seq); tail = skb_peek_tail(&sk->sk_receive_queue); - eaten = tail && tcp_try_coalesce(sk, RCV_QUEUE, - tail, skb, &fragstolen); + eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen); tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; if (!eaten) @@ -4420,10 +4407,6 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) return; } - /* Stash tstamp to avoid being stomped on by rbnode */ - if (TCP_SKB_CB(skb)->has_rxtstamp) - TCP_SKB_CB(skb)->swtstamp = skb->tstamp; - /* Disable header prediction. */ tp->pred_flags = 0; inet_csk_schedule_ack(sk); @@ -4451,7 +4434,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) /* In the typical case, we are adding an skb to the end of the list. * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup. */ - if (tcp_try_coalesce(sk, OOO_QUEUE, tp->ooo_last_skb, + if (tcp_try_coalesce(sk, tp->ooo_last_skb, skb, &fragstolen)) { coalesce_done: tcp_grow_window(sk, skb); @@ -4502,7 +4485,7 @@ coalesce_done: __kfree_skb(skb1); goto merge_right; } - } else if (tcp_try_coalesce(sk, OOO_QUEUE, skb1, + } else if (tcp_try_coalesce(sk, skb1, skb, &fragstolen)) { goto coalesce_done; } @@ -4554,7 +4537,7 @@ static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int __skb_pull(skb, hdrlen); eaten = (tail && - tcp_try_coalesce(sk, RCV_QUEUE, tail, + tcp_try_coalesce(sk, tail, skb, fragstolen)) ? 
1 : 0; tcp_rcv_nxt_update(tcp_sk(sk), TCP_SKB_CB(skb)->end_seq); if (!eaten) { diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index b1266e75ca43..063a4bdb9ee6 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -146,7 +146,6 @@ struct netem_sched_data { */ struct netem_skb_cb { psched_time_t time_to_send; - ktime_t tstamp_save; }; @@ -561,7 +560,6 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, } cb->time_to_send = now + delay; - cb->tstamp_save = skb->tstamp; ++q->counter; tfifo_enqueue(skb, sch); } else { @@ -629,7 +627,10 @@ deliver: qdisc_qstats_backlog_dec(sch, skb); skb->next = NULL; skb->prev = NULL; - skb->tstamp = netem_skb_cb(skb)->tstamp_save; + /* skb->dev shares skb->rbnode area, + * we need to restore its value. + */ + skb->dev = qdisc_dev(sch); #ifdef CONFIG_NET_CLS_ACT /* -- cgit v1.2.3 From f5619866592c65adc087364cc1a3ba709201ea26 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Tue, 19 Sep 2017 11:56:57 -0400 Subject: net: dsa: remove copy of master ethtool_ops There is no need to store a copy of the master ethtool ops, storing the original pointer in DSA and the new one in the master netdev itself is enough. In the meantime, set orig_ethtool_ops to NULL when restoring the master ethtool ops and check the presence of the master original ethtool ops as well as its needed functions before calling them. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 1 - net/dsa/dsa.c | 8 ++++---- net/dsa/slave.c | 19 +++++++++++-------- 3 files changed, 15 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index dd44d6ce1097..8dee216a5a9b 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -188,7 +188,6 @@ struct dsa_port { /* * Original copy of the master netdev ethtool_ops */ - struct ethtool_ops ethtool_ops; const struct ethtool_ops *orig_ethtool_ops; }; diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 03c58b0eb082..abadf7b49236 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -124,11 +124,10 @@ int dsa_cpu_port_ethtool_setup(struct dsa_port *cpu_dp) if (!cpu_ops) return -ENOMEM; - memcpy(&cpu_dp->ethtool_ops, master->ethtool_ops, - sizeof(struct ethtool_ops)); cpu_dp->orig_ethtool_ops = master->ethtool_ops; - memcpy(cpu_ops, &cpu_dp->ethtool_ops, - sizeof(struct ethtool_ops)); + if (cpu_dp->orig_ethtool_ops) + memcpy(cpu_ops, cpu_dp->orig_ethtool_ops, sizeof(*cpu_ops)); + dsa_cpu_port_ethtool_init(cpu_ops); master->ethtool_ops = cpu_ops; @@ -138,6 +137,7 @@ int dsa_cpu_port_ethtool_setup(struct dsa_port *cpu_dp) void dsa_cpu_port_ethtool_restore(struct dsa_port *cpu_dp) { cpu_dp->netdev->ethtool_ops = cpu_dp->orig_ethtool_ops; + cpu_dp->orig_ethtool_ops = NULL; } void dsa_cpu_dsa_destroy(struct dsa_port *port) diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 2afa99506f8b..2ff4f907d137 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -574,12 +574,13 @@ static void dsa_cpu_port_get_ethtool_stats(struct net_device *dev, struct dsa_switch_tree *dst = dev->dsa_ptr; struct dsa_port *cpu_dp = dsa_get_cpu_port(dst); struct dsa_switch *ds = cpu_dp->ds; + const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops; s8 cpu_port = cpu_dp->index; int count = 0; - if (cpu_dp->ethtool_ops.get_sset_count) { - count = cpu_dp->ethtool_ops.get_sset_count(dev, ETH_SS_STATS); - cpu_dp->ethtool_ops.get_ethtool_stats(dev, stats, data); + if (ops && ops->get_sset_count && ops->get_ethtool_stats) { + count = 
ops->get_sset_count(dev, ETH_SS_STATS); + ops->get_ethtool_stats(dev, stats, data); } if (ds->ops->get_ethtool_stats) @@ -591,10 +592,11 @@ static int dsa_cpu_port_get_sset_count(struct net_device *dev, int sset) struct dsa_switch_tree *dst = dev->dsa_ptr; struct dsa_port *cpu_dp = dsa_get_cpu_port(dst); struct dsa_switch *ds = cpu_dp->ds; + const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops; int count = 0; - if (cpu_dp->ethtool_ops.get_sset_count) - count += cpu_dp->ethtool_ops.get_sset_count(dev, sset); + if (ops && ops->get_sset_count) + count += ops->get_sset_count(dev, sset); if (sset == ETH_SS_STATS && ds->ops->get_sset_count) count += ds->ops->get_sset_count(ds); @@ -608,6 +610,7 @@ static void dsa_cpu_port_get_strings(struct net_device *dev, struct dsa_switch_tree *dst = dev->dsa_ptr; struct dsa_port *cpu_dp = dsa_get_cpu_port(dst); struct dsa_switch *ds = cpu_dp->ds; + const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops; s8 cpu_port = cpu_dp->index; int len = ETH_GSTRING_LEN; int mcount = 0, count; @@ -619,9 +622,9 @@ static void dsa_cpu_port_get_strings(struct net_device *dev, /* We do not want to be NULL-terminated, since this is a prefix */ pfx[sizeof(pfx) - 1] = '_'; - if (cpu_dp->ethtool_ops.get_sset_count) { - mcount = cpu_dp->ethtool_ops.get_sset_count(dev, ETH_SS_STATS); - cpu_dp->ethtool_ops.get_strings(dev, stringset, data); + if (ops && ops->get_sset_count && ops->get_strings) { + mcount = ops->get_sset_count(dev, ETH_SS_STATS); + ops->get_strings(dev, stringset, data); } if (stringset == ETH_SS_STATS && ds->ops->get_strings) { -- cgit v1.2.3 From 752fbcc33405d6f8249465e4b2c4e420091bb825 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 19 Sep 2017 13:15:42 -0700 Subject: net_sched: no need to free qdisc in RCU callback gen estimator has been rewritten in commit 1c0d32fde5bd ("net_sched: gen_estimator: complete rewrite of rate estimators"), the caller no longer needs to wait for a grace period. So this patch gets rid of it. Cc: Jamal Hadi Salim Cc: Eric Dumazet Signed-off-by: Cong Wang Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sch_generic.h | 1 - net/sched/sch_generic.c | 10 ++-------- 2 files changed, 2 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 135f5a2dd931..684d8ed27eaa 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -93,7 +93,6 @@ struct Qdisc { unsigned long state; struct Qdisc *next_sched; struct sk_buff *skb_bad_txq; - struct rcu_head rcu_head; int padded; refcount_t refcnt; diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 92237e75dbbc..1fb0c754b7fd 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -688,10 +688,8 @@ void qdisc_reset(struct Qdisc *qdisc) } EXPORT_SYMBOL(qdisc_reset); -static void qdisc_rcu_free(struct rcu_head *head) +static void qdisc_free(struct Qdisc *qdisc) { - struct Qdisc *qdisc = container_of(head, struct Qdisc, rcu_head); - if (qdisc_is_percpu_stats(qdisc)) { free_percpu(qdisc->cpu_bstats); free_percpu(qdisc->cpu_qstats); @@ -724,11 +722,7 @@ void qdisc_destroy(struct Qdisc *qdisc) kfree_skb_list(qdisc->gso_skb); kfree_skb(qdisc->skb_bad_txq); - /* - * gen_estimator est_timer() might access qdisc->q.lock, - * wait a RCU grace period before freeing qdisc. 
- */ - call_rcu(&qdisc->rcu_head, qdisc_rcu_free); + qdisc_free(qdisc); } EXPORT_SYMBOL(qdisc_destroy); -- cgit v1.2.3 From a90c9347e90ed1e9323d71402ed18023bc910cd8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 19 Sep 2017 16:27:06 -0700 Subject: ipv6: addrlabel: per netns list Having a global list of labels does not scale to thousands of netns in the cloud era. This causes quadratic behavior on netns creation and deletion. It is time to have a per-netns list of ~10 labels. Tested: $ time perf record (for f in `seq 1 3000` ; do ip netns add tast$f; done) [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 3.637 MB perf.data (~158898 samples) ] real 0m20.837s # instead of 0m24.227s user 0m0.328s sys 0m20.338s # instead of 0m23.753s 16.17% ip [kernel.kallsyms] [k] netlink_broadcast_filtered 12.30% ip [kernel.kallsyms] [k] netlink_has_listeners 6.76% ip [kernel.kallsyms] [k] _raw_spin_lock_irqsave 5.78% ip [kernel.kallsyms] [k] memset_erms 5.77% ip [kernel.kallsyms] [k] kobject_uevent_env 5.18% ip [kernel.kallsyms] [k] refcount_sub_and_test 4.96% ip [kernel.kallsyms] [k] _raw_read_lock 3.82% ip [kernel.kallsyms] [k] refcount_inc_not_zero 3.33% ip [kernel.kallsyms] [k] _raw_spin_unlock_irqrestore 2.11% ip [kernel.kallsyms] [k] unmap_page_range 1.77% ip [kernel.kallsyms] [k] __wake_up 1.69% ip [kernel.kallsyms] [k] strlen 1.17% ip [kernel.kallsyms] [k] __wake_up_common 1.09% ip [kernel.kallsyms] [k] insert_header 1.04% ip [kernel.kallsyms] [k] page_remove_rmap 1.01% ip [kernel.kallsyms] [k] consume_skb 0.98% ip [kernel.kallsyms] [k] netlink_trim 0.51% ip [kernel.kallsyms] [k] kernfs_link_sibling 0.51% ip [kernel.kallsyms] [k] filemap_map_pages 0.46% ip [kernel.kallsyms] [k] memcpy_erms Signed-off-by: Eric Dumazet Signed-off-by: David S.
Miller --- include/net/netns/ipv6.h | 5 +++ net/ipv6/addrlabel.c | 81 ++++++++++++++++++------------------------------ 2 files changed, 35 insertions(+), 51 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 2544f9760a42..2ea1ed341ef8 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -89,6 +89,11 @@ struct netns_ipv6 { atomic_t fib6_sernum; struct seg6_pernet_data *seg6_data; struct fib_notifier_ops *notifier_ops; + struct { + struct hlist_head head; + spinlock_t lock; + u32 seq; + } ip6addrlbl_table; }; #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c index b055bc79f56d..c6311d7108f6 100644 --- a/net/ipv6/addrlabel.c +++ b/net/ipv6/addrlabel.c @@ -30,7 +30,6 @@ * Policy Table */ struct ip6addrlbl_entry { - possible_net_t lbl_net; struct in6_addr prefix; int prefixlen; int ifindex; @@ -41,19 +40,6 @@ struct ip6addrlbl_entry { struct rcu_head rcu; }; -static struct ip6addrlbl_table -{ - struct hlist_head head; - spinlock_t lock; - u32 seq; -} ip6addrlbl_table; - -static inline -struct net *ip6addrlbl_net(const struct ip6addrlbl_entry *lbl) -{ - return read_pnet(&lbl->lbl_net); -} - /* * Default policy table (RFC6724 + extensions) * @@ -148,13 +134,10 @@ static inline void ip6addrlbl_put(struct ip6addrlbl_entry *p) } /* Find label */ -static bool __ip6addrlbl_match(struct net *net, - const struct ip6addrlbl_entry *p, +static bool __ip6addrlbl_match(const struct ip6addrlbl_entry *p, const struct in6_addr *addr, int addrtype, int ifindex) { - if (!net_eq(ip6addrlbl_net(p), net)) - return false; if (p->ifindex && p->ifindex != ifindex) return false; if (p->addrtype && p->addrtype != addrtype) @@ -169,8 +152,9 @@ static struct ip6addrlbl_entry *__ipv6_addr_label(struct net *net, int type, int ifindex) { struct ip6addrlbl_entry *p; - hlist_for_each_entry_rcu(p, &ip6addrlbl_table.head, list) { - if (__ip6addrlbl_match(net, p, addr, type, ifindex)) + + hlist_for_each_entry_rcu(p, &net->ipv6.ip6addrlbl_table.head, list) { + if (__ip6addrlbl_match(p, addr, type, ifindex)) return p; } return NULL; @@ -196,8 +180,7 @@ u32 ipv6_addr_label(struct net *net, } /* allocate one entry */ -static struct ip6addrlbl_entry *ip6addrlbl_alloc(struct net *net, - const struct in6_addr *prefix, +static struct ip6addrlbl_entry *ip6addrlbl_alloc(const struct in6_addr *prefix, int prefixlen, int ifindex, u32 label) { @@ -236,24 +219,23 @@ static struct ip6addrlbl_entry *ip6addrlbl_alloc(struct net *net, newp->addrtype = addrtype; newp->label = label; INIT_HLIST_NODE(&newp->list); - write_pnet(&newp->lbl_net, net); refcount_set(&newp->refcnt, 1); return newp; } /* add a label */ -static int __ip6addrlbl_add(struct ip6addrlbl_entry *newp, int replace) +static int __ip6addrlbl_add(struct net *net, struct ip6addrlbl_entry *newp, + int replace) { - struct hlist_node *n; struct ip6addrlbl_entry *last = NULL, *p = NULL; + struct hlist_node *n; int ret = 0; ADDRLABEL(KERN_DEBUG "%s(newp=%p, replace=%d)\n", __func__, newp, replace); - hlist_for_each_entry_safe(p, n, &ip6addrlbl_table.head, list) { + hlist_for_each_entry_safe(p, n, &net->ipv6.ip6addrlbl_table.head, list) { if (p->prefixlen == newp->prefixlen && - net_eq(ip6addrlbl_net(p), ip6addrlbl_net(newp)) && p->ifindex == newp->ifindex && ipv6_addr_equal(&p->prefix, &newp->prefix)) { if (!replace) { @@ -273,10 +255,10 @@ static int __ip6addrlbl_add(struct ip6addrlbl_entry *newp, int replace) if (last) hlist_add_behind_rcu(&newp->list, &last->list); else 
- hlist_add_head_rcu(&newp->list, &ip6addrlbl_table.head); + hlist_add_head_rcu(&newp->list, &net->ipv6.ip6addrlbl_table.head); out: if (!ret) - ip6addrlbl_table.seq++; + net->ipv6.ip6addrlbl_table.seq++; return ret; } @@ -292,12 +274,12 @@ static int ip6addrlbl_add(struct net *net, __func__, prefix, prefixlen, ifindex, (unsigned int)label, replace); - newp = ip6addrlbl_alloc(net, prefix, prefixlen, ifindex, label); + newp = ip6addrlbl_alloc(prefix, prefixlen, ifindex, label); if (IS_ERR(newp)) return PTR_ERR(newp); - spin_lock(&ip6addrlbl_table.lock); - ret = __ip6addrlbl_add(newp, replace); - spin_unlock(&ip6addrlbl_table.lock); + spin_lock(&net->ipv6.ip6addrlbl_table.lock); + ret = __ip6addrlbl_add(net, newp, replace); + spin_unlock(&net->ipv6.ip6addrlbl_table.lock); if (ret) ip6addrlbl_free(newp); return ret; @@ -315,9 +297,8 @@ static int __ip6addrlbl_del(struct net *net, ADDRLABEL(KERN_DEBUG "%s(prefix=%pI6, prefixlen=%d, ifindex=%d)\n", __func__, prefix, prefixlen, ifindex); - hlist_for_each_entry_safe(p, n, &ip6addrlbl_table.head, list) { + hlist_for_each_entry_safe(p, n, &net->ipv6.ip6addrlbl_table.head, list) { if (p->prefixlen == prefixlen && - net_eq(ip6addrlbl_net(p), net) && p->ifindex == ifindex && ipv6_addr_equal(&p->prefix, prefix)) { hlist_del_rcu(&p->list); @@ -340,9 +321,9 @@ static int ip6addrlbl_del(struct net *net, __func__, prefix, prefixlen, ifindex); ipv6_addr_prefix(&prefix_buf, prefix, prefixlen); - spin_lock(&ip6addrlbl_table.lock); + spin_lock(&net->ipv6.ip6addrlbl_table.lock); ret = __ip6addrlbl_del(net, &prefix_buf, prefixlen, ifindex); - spin_unlock(&ip6addrlbl_table.lock); + spin_unlock(&net->ipv6.ip6addrlbl_table.lock); return ret; } @@ -354,6 +335,9 @@ static int __net_init ip6addrlbl_net_init(struct net *net) ADDRLABEL(KERN_DEBUG "%s\n", __func__); + spin_lock_init(&net->ipv6.ip6addrlbl_table.lock); + INIT_HLIST_HEAD(&net->ipv6.ip6addrlbl_table.head); + for (i = 0; i < ARRAY_SIZE(ip6addrlbl_init_table); i++) { int ret = ip6addrlbl_add(net, ip6addrlbl_init_table[i].prefix, @@ -373,14 +357,12 @@ static void __net_exit ip6addrlbl_net_exit(struct net *net) struct hlist_node *n; /* Remove all labels belonging to the exiting net */ - spin_lock(&ip6addrlbl_table.lock); - hlist_for_each_entry_safe(p, n, &ip6addrlbl_table.head, list) { - if (net_eq(ip6addrlbl_net(p), net)) { - hlist_del_rcu(&p->list); - ip6addrlbl_put(p); - } + spin_lock(&net->ipv6.ip6addrlbl_table.lock); + hlist_for_each_entry_safe(p, n, &net->ipv6.ip6addrlbl_table.head, list) { + hlist_del_rcu(&p->list); + ip6addrlbl_put(p); } - spin_unlock(&ip6addrlbl_table.lock); + spin_unlock(&net->ipv6.ip6addrlbl_table.lock); } static struct pernet_operations ipv6_addr_label_ops = { @@ -390,8 +372,6 @@ static struct pernet_operations ipv6_addr_label_ops = { int __init ipv6_addr_label_init(void) { - spin_lock_init(&ip6addrlbl_table.lock); - return register_pernet_subsys(&ipv6_addr_label_ops); } @@ -510,11 +490,10 @@ static int ip6addrlbl_dump(struct sk_buff *skb, struct netlink_callback *cb) int err; rcu_read_lock(); - hlist_for_each_entry_rcu(p, &ip6addrlbl_table.head, list) { - if (idx >= s_idx && - net_eq(ip6addrlbl_net(p), net)) { + hlist_for_each_entry_rcu(p, &net->ipv6.ip6addrlbl_table.head, list) { + if (idx >= s_idx) { err = ip6addrlbl_fill(skb, p, - ip6addrlbl_table.seq, + net->ipv6.ip6addrlbl_table.seq, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWADDRLABEL, @@ -571,7 +550,7 @@ static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, p = __ipv6_addr_label(net, addr, 
ipv6_addr_type(addr), ifal->ifal_index); if (p && !ip6addrlbl_hold(p)) p = NULL; - lseq = ip6addrlbl_table.seq; + lseq = net->ipv6.ip6addrlbl_table.seq; rcu_read_unlock(); if (!p) { -- cgit v1.2.3 From 64bc17811b72758753e2b64cd8f2a63812c61fe1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 19 Sep 2017 16:27:09 -0700 Subject: ipv4: speedup ipv4 tunnels dismantle Implement exit_batch() method to dismantle more devices per round. (rtnl_lock() ... unregister_netdevice_many() ... rtnl_unlock()) Tested: $ cat add_del_unshare.sh for i in `seq 1 40` do (for j in `seq 1 100` ; do unshare -n /bin/true >/dev/null ; done) & done wait ; grep net_namespace /proc/slabinfo Before patch : $ time ./add_del_unshare.sh net_namespace 126 282 5504 1 2 : tunables 8 4 0 : slabdata 126 282 0 real 1m38.965s user 0m0.688s sys 0m37.017s After patch: $ time ./add_del_unshare.sh net_namespace 135 291 5504 1 2 : tunables 8 4 0 : slabdata 135 291 0 real 0m22.117s user 0m0.728s sys 0m35.328s Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ip_tunnels.h | 3 ++- net/ipv4/ip_gre.c | 22 +++++++++------------- net/ipv4/ip_tunnel.c | 12 +++++++++--- net/ipv4/ip_vti.c | 7 +++---- net/ipv4/ipip.c | 7 +++---- 5 files changed, 26 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 992652856fe8..b41a1e057fce 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -258,7 +258,8 @@ int ip_tunnel_get_iflink(const struct net_device *dev); int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id, struct rtnl_link_ops *ops, char *devname); -void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops); +void ip_tunnel_delete_nets(struct list_head *list_net, unsigned int id, + struct rtnl_link_ops *ops); void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, const struct iphdr *tnl_params, const u8 protocol); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 0162fb955b33..9cee986ac6b8 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1013,15 +1013,14 @@ static int __net_init ipgre_init_net(struct net *net) return ip_tunnel_init_net(net, ipgre_net_id, &ipgre_link_ops, NULL); } -static void __net_exit ipgre_exit_net(struct net *net) +static void __net_exit ipgre_exit_batch_net(struct list_head *list_net) { - struct ip_tunnel_net *itn = net_generic(net, ipgre_net_id); - ip_tunnel_delete_net(itn, &ipgre_link_ops); + ip_tunnel_delete_nets(list_net, ipgre_net_id, &ipgre_link_ops); } static struct pernet_operations ipgre_net_ops = { .init = ipgre_init_net, - .exit = ipgre_exit_net, + .exit_batch = ipgre_exit_batch_net, .id = &ipgre_net_id, .size = sizeof(struct ip_tunnel_net), }; @@ -1540,15 +1539,14 @@ static int __net_init ipgre_tap_init_net(struct net *net) return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, "gretap0"); } -static void __net_exit ipgre_tap_exit_net(struct net *net) +static void __net_exit ipgre_tap_exit_batch_net(struct list_head *list_net) { - struct ip_tunnel_net *itn = net_generic(net, gre_tap_net_id); - ip_tunnel_delete_net(itn, &ipgre_tap_ops); + ip_tunnel_delete_nets(list_net, gre_tap_net_id, &ipgre_tap_ops); } static struct pernet_operations ipgre_tap_net_ops = { .init = ipgre_tap_init_net, - .exit = ipgre_tap_exit_net, + .exit_batch = ipgre_tap_exit_batch_net, .id = &gre_tap_net_id, .size = sizeof(struct ip_tunnel_net), }; @@ -1559,16 +1557,14 @@ static int __net_init erspan_init_net(struct net *net) { return ip_tunnel_init_net(net, erspan_net_id, &erspan_link_ops, "erspan0"); }
-static void __net_exit erspan_exit_net(struct net *net) +static void __net_exit erspan_exit_batch_net(struct list_head *net_list) { - struct ip_tunnel_net *itn = net_generic(net, erspan_net_id); - - ip_tunnel_delete_net(itn, &erspan_link_ops); + ip_tunnel_delete_nets(net_list, erspan_net_id, &erspan_link_ops); } static struct pernet_operations erspan_net_ops = { .init = erspan_init_net, - .exit = erspan_exit_net, + .exit_batch = erspan_exit_batch_net, .id = &erspan_net_id, .size = sizeof(struct ip_tunnel_net), }; diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index e9805ad664ac..fe6fee728ce4 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -1061,16 +1061,22 @@ static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head, } } -void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops) +void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id, + struct rtnl_link_ops *ops) { + struct ip_tunnel_net *itn; + struct net *net; LIST_HEAD(list); rtnl_lock(); - ip_tunnel_destroy(itn, &list, ops); + list_for_each_entry(net, net_list, exit_list) { + itn = net_generic(net, id); + ip_tunnel_destroy(itn, &list, ops); + } unregister_netdevice_many(&list); rtnl_unlock(); } -EXPORT_SYMBOL_GPL(ip_tunnel_delete_net); +EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets); int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], struct ip_tunnel_parm *p, __u32 fwmark) diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 5ed63d250950..02d70ca99db1 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -452,15 +452,14 @@ static int __net_init vti_init_net(struct net *net) return 0; } -static void __net_exit vti_exit_net(struct net *net) +static void __net_exit vti_exit_batch_net(struct list_head *list_net) { - struct ip_tunnel_net *itn = net_generic(net, vti_net_id); - ip_tunnel_delete_net(itn, &vti_link_ops); + ip_tunnel_delete_nets(list_net, vti_net_id, &vti_link_ops); } static struct pernet_operations vti_net_ops = { .init = vti_init_net, - .exit = vti_exit_net, + .exit_batch = vti_exit_batch_net, .id = &vti_net_id, .size = sizeof(struct ip_tunnel_net), }; diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index fb1ad22b5e29..1e47818e38c7 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -634,15 +634,14 @@ static int __net_init ipip_init_net(struct net *net) return ip_tunnel_init_net(net, ipip_net_id, &ipip_link_ops, "tunl0"); } -static void __net_exit ipip_exit_net(struct net *net) +static void __net_exit ipip_exit_batch_net(struct list_head *list_net) { - struct ip_tunnel_net *itn = net_generic(net, ipip_net_id); - ip_tunnel_delete_net(itn, &ipip_link_ops); + ip_tunnel_delete_nets(list_net, ipip_net_id, &ipip_link_ops); } static struct pernet_operations ipip_net_ops = { .init = ipip_init_net, - .exit = ipip_exit_net, + .exit_batch = ipip_exit_batch_net, .id = &ipip_net_id, .size = sizeof(struct ip_tunnel_net), }; -- cgit v1.2.3 From 2512b1b18d0748d867bb22387db7c86b903291ad Mon Sep 17 00:00:00 2001 From: Liad Kaufman Date: Sat, 5 Aug 2017 11:44:31 +0300 Subject: mac80211: extend ieee80211_ie_split to support EXTENSION Current ieee80211_ie_split() implementation doesn't account for elements that are sub-elements of the EXTENSION IE. To extend support to these IEs as well, treat the WLAN_EID_EXTENSION ids in the %ids array as indicating that the next id in the array is a sub-element of the EXTENSION IE. 
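Illustration, not part of the patch: with this change, a caller allows an extended element before the split point by listing WLAN_EID_EXTENSION immediately followed by the extension ID in its %ids array. A minimal sketch, assuming ies/ielen/offset come from the caller's context and using WLAN_EID_EXT_FILS_REQ_PARAMS purely as an example sub-element:

	static const u8 before_ids[] = {
		WLAN_EID_SSID,
		WLAN_EID_SUPP_RATES,
		/* the next entry is matched against ies[pos + 1],
		 * i.e. the extension ID, not against ies[pos]
		 */
		WLAN_EID_EXTENSION, WLAN_EID_EXT_FILS_REQ_PARAMS,
	};

	noffset = ieee80211_ie_split(ies, ielen, before_ids,
				     ARRAY_SIZE(before_ids), offset);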
Signed-off-by: Liad Kaufman Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 6 ++++-- net/wireless/util.c | 54 ++++++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 50 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index f12fa5245a45..aa9d993e519a 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -5934,7 +5934,8 @@ int cfg80211_get_p2p_attr(const u8 *ies, unsigned int len, * @ies: the IE buffer * @ielen: the length of the IE buffer * @ids: an array with element IDs that are allowed before - * the split + * the split. A WLAN_EID_EXTENSION value means that the next + * EID in the list is a sub-element of the EXTENSION IE. * @n_ids: the size of the element ID array * @after_ric: array IE types that come after the RIC element * @n_after_ric: size of the @after_ric array * @@ -5965,7 +5966,8 @@ size_t ieee80211_ie_split_ric(const u8 *ies, size_t ielen, * @ies: the IE buffer * @ielen: the length of the IE buffer * @ids: an array with element IDs that are allowed before - * the split + * the split. A WLAN_EID_EXTENSION value means that the next + * EID in the list is a sub-element of the EXTENSION IE. * @n_ids: the size of the element ID array * @offset: offset where to start splitting in the buffer * diff --git a/net/wireless/util.c b/net/wireless/util.c index bcb1284c3415..4aab793c2f00 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -1367,13 +1367,29 @@ int cfg80211_get_p2p_attr(const u8 *ies, unsigned int len, } EXPORT_SYMBOL(cfg80211_get_p2p_attr); -static bool ieee80211_id_in_list(const u8 *ids, int n_ids, u8 id) +static bool ieee80211_id_in_list(const u8 *ids, int n_ids, u8 id, bool id_ext) { int i; - for (i = 0; i < n_ids; i++) - if (ids[i] == id) + /* Make sure array values are legal */ + if (WARN_ON(ids[n_ids - 1] == WLAN_EID_EXTENSION)) + return false; + + i = 0; + while (i < n_ids) { + if (ids[i] == WLAN_EID_EXTENSION) { + if (id_ext && (ids[i + 1] == id)) + return true; + + i += 2; + continue; + } + + if (ids[i] == id && !id_ext) return true; + + i++; + } return false; } @@ -1403,14 +1419,36 @@ size_t ieee80211_ie_split_ric(const u8 *ies, size_t ielen, { size_t pos = offset; - while (pos < ielen && ieee80211_id_in_list(ids, n_ids, ies[pos])) { + while (pos < ielen) { + u8 ext = 0; + + if (ies[pos] == WLAN_EID_EXTENSION) + ext = 2; + if ((pos + ext) >= ielen) + break; + + if (!ieee80211_id_in_list(ids, n_ids, ies[pos + ext], + ies[pos] == WLAN_EID_EXTENSION)) + break; + if (ies[pos] == WLAN_EID_RIC_DATA && n_after_ric) { pos = skip_ie(ies, ielen, pos); - while (pos < ielen && - !ieee80211_id_in_list(after_ric, n_after_ric, - ies[pos])) - pos = skip_ie(ies, ielen, pos); + while (pos < ielen) { + if (ies[pos] == WLAN_EID_EXTENSION) + ext = 2; + else + ext = 0; + + if ((pos + ext) >= ielen) + break; + + if (!ieee80211_id_in_list(after_ric, + n_after_ric, + ies[pos + ext], + ext == 2)) + pos = skip_ie(ies, ielen, pos); + else + break; + } } else { pos = skip_ie(ies, ielen, pos); } -- cgit v1.2.3 From 2d23d0736e3a4a0fdb92b8e46ea476639f16aae8 Mon Sep 17 00:00:00 2001 From: Roee Zamir Date: Sun, 6 Aug 2017 11:38:22 +0300 Subject: nl80211: add OCE scan and capability flags Add Optimized Connectivity Experience (OCE) scan and capability flags. Some of them are unique to OCE and some are stand-alone. Also add scan flags to enable/disable them.
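Illustration, not part of the patch: a driver that actually implements these probe-request behaviours would advertise the new capabilities at wiphy registration time, so that the corresponding scan flags pass the feature checks added below in nl80211_check_scan_flags(). A minimal sketch:

	/* in the driver's wiphy setup path */
	wiphy_ext_feature_set(wiphy, NL80211_EXT_FEATURE_FILS_MAX_CHANNEL_TIME);
	wiphy_ext_feature_set(wiphy, NL80211_EXT_FEATURE_ACCEPT_BCAST_PROBE_RESP);
	wiphy_ext_feature_set(wiphy, NL80211_EXT_FEATURE_OCE_PROBE_REQ_HIGH_TX_RATE);
	wiphy_ext_feature_set(wiphy,
			      NL80211_EXT_FEATURE_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION);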
Signed-off-by: Roee Zamir Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 37 ++++++++++-- net/wireless/nl80211.c | 137 ++++++++++++++++++++++++------------------- 2 files changed, 111 insertions(+), 63 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 51626b4175c0..76404d8a8863 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -4914,6 +4914,15 @@ enum nl80211_feature_flags { * handshake with 802.1X in station mode (will pass EAP frames to the host * and accept the set_pmk/del_pmk commands), doing it in the host might not * be supported. + * @NL80211_EXT_FEATURE_FILS_MAX_CHANNEL_TIME: Driver is capable of overriding + * the max channel attribute in the FILS request params IE with the + * actual dwell time. + * @NL80211_EXT_FEATURE_ACCEPT_BCAST_PROBE_RESP: Driver accepts broadcast probe + * response + * @NL80211_EXT_FEATURE_OCE_PROBE_REQ_HIGH_TX_RATE: Driver supports sending + * the first probe request in each channel at rate of at least 5.5Mbps. + * @NL80211_EXT_FEATURE_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION: Driver supports + * probe request tx deferral and suppression * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. @@ -4936,6 +4945,10 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_FILS_SK_OFFLOAD, NL80211_EXT_FEATURE_4WAY_HANDSHAKE_STA_PSK, NL80211_EXT_FEATURE_4WAY_HANDSHAKE_STA_1X, + NL80211_EXT_FEATURE_FILS_MAX_CHANNEL_TIME, + NL80211_EXT_FEATURE_ACCEPT_BCAST_PROBE_RESP, + NL80211_EXT_FEATURE_OCE_PROBE_REQ_HIGH_TX_RATE, + NL80211_EXT_FEATURE_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, @@ -5012,12 +5025,28 @@ enum nl80211_timeout_reason { * locally administered 1, multicast 0) is assumed. * This flag must not be requested when the feature isn't supported, check * the nl80211 feature flags for the device. + * @NL80211_SCAN_FLAG_FILS_MAX_CHANNEL_TIME: fill the dwell time in the FILS + * request parameters IE in the probe request + * @NL80211_SCAN_FLAG_ACCEPT_BCAST_PROBE_RESP: accept broadcast probe responses + * @NL80211_SCAN_FLAG_OCE_PROBE_REQ_HIGH_TX_RATE: send probe request frames at + * rate of at least 5.5M. In case non OCE AP is dicovered in the channel, + * only the first probe req in the channel will be sent in high rate. 
+ * @NL80211_SCAN_FLAG_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION: allow probe request + * tx deferral (dot11FILSProbeDelay shall be set to 15ms) + * and suppression (if it has received a broadcast Probe Response frame, + * Beacon frame or FILS Discovery frame from an AP that the STA considers + * a suitable candidate for (re-)association - suitable in terms of + * SSID and/or RSSI */ enum nl80211_scan_flags { - NL80211_SCAN_FLAG_LOW_PRIORITY = 1<<0, - NL80211_SCAN_FLAG_FLUSH = 1<<1, - NL80211_SCAN_FLAG_AP = 1<<2, - NL80211_SCAN_FLAG_RANDOM_ADDR = 1<<3, + NL80211_SCAN_FLAG_LOW_PRIORITY = 1<<0, + NL80211_SCAN_FLAG_FLUSH = 1<<1, + NL80211_SCAN_FLAG_AP = 1<<2, + NL80211_SCAN_FLAG_RANDOM_ADDR = 1<<3, + NL80211_SCAN_FLAG_FILS_MAX_CHANNEL_TIME = 1<<4, + NL80211_SCAN_FLAG_ACCEPT_BCAST_PROBE_RESP = 1<<5, + NL80211_SCAN_FLAG_OCE_PROBE_REQ_HIGH_TX_RATE = 1<<6, + NL80211_SCAN_FLAG_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION = 1<<7, }; /** diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 66e97136ab44..2e6f5f4065f9 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -6619,6 +6619,77 @@ static bool cfg80211_off_channel_oper_allowed(struct wireless_dev *wdev) return regulatory_pre_cac_allowed(wdev->wiphy); } +static int +nl80211_check_scan_flags(struct wiphy *wiphy, struct wireless_dev *wdev, + void *request, struct nlattr **attrs, + bool is_sched_scan) +{ + u8 *mac_addr, *mac_addr_mask; + u32 *flags; + enum nl80211_feature_flags randomness_flag; + + if (!attrs[NL80211_ATTR_SCAN_FLAGS]) + return 0; + + if (is_sched_scan) { + struct cfg80211_sched_scan_request *req = request; + + randomness_flag = wdev ? + NL80211_FEATURE_SCHED_SCAN_RANDOM_MAC_ADDR : + NL80211_FEATURE_ND_RANDOM_MAC_ADDR; + flags = &req->flags; + mac_addr = req->mac_addr; + mac_addr_mask = req->mac_addr_mask; + } else { + struct cfg80211_scan_request *req = request; + + randomness_flag = NL80211_FEATURE_SCAN_RANDOM_MAC_ADDR; + flags = &req->flags; + mac_addr = req->mac_addr; + mac_addr_mask = req->mac_addr_mask; + } + + *flags = nla_get_u32(attrs[NL80211_ATTR_SCAN_FLAGS]); + + if ((*flags & NL80211_SCAN_FLAG_LOW_PRIORITY) && + !(wiphy->features & NL80211_FEATURE_LOW_PRIORITY_SCAN)) + return -EOPNOTSUPP; + + if (*flags & NL80211_SCAN_FLAG_RANDOM_ADDR) { + int err; + + if (!(wiphy->features & randomness_flag) || + (wdev && wdev->current_bss)) + return -EOPNOTSUPP; + + err = nl80211_parse_random_mac(attrs, mac_addr, mac_addr_mask); + if (err) + return err; + } + + if ((*flags & NL80211_SCAN_FLAG_FILS_MAX_CHANNEL_TIME) && + !wiphy_ext_feature_isset(wiphy, + NL80211_EXT_FEATURE_FILS_MAX_CHANNEL_TIME)) + return -EOPNOTSUPP; + + if ((*flags & NL80211_SCAN_FLAG_ACCEPT_BCAST_PROBE_RESP) && + !wiphy_ext_feature_isset(wiphy, + NL80211_EXT_FEATURE_ACCEPT_BCAST_PROBE_RESP)) + return -EOPNOTSUPP; + + if ((*flags & NL80211_SCAN_FLAG_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION) && + !wiphy_ext_feature_isset(wiphy, + NL80211_EXT_FEATURE_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION)) + return -EOPNOTSUPP; + + if ((*flags & NL80211_SCAN_FLAG_OCE_PROBE_REQ_HIGH_TX_RATE) && + !wiphy_ext_feature_isset(wiphy, + NL80211_EXT_FEATURE_OCE_PROBE_REQ_HIGH_TX_RATE)) + return -EOPNOTSUPP; + + return 0; +} + static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; @@ -6824,34 +6895,10 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) nla_get_flag(info->attrs[NL80211_ATTR_MEASUREMENT_DURATION_MANDATORY]); } - if (info->attrs[NL80211_ATTR_SCAN_FLAGS]) { - 
request->flags = nla_get_u32( - info->attrs[NL80211_ATTR_SCAN_FLAGS]); - if ((request->flags & NL80211_SCAN_FLAG_LOW_PRIORITY) && - !(wiphy->features & NL80211_FEATURE_LOW_PRIORITY_SCAN)) { - err = -EOPNOTSUPP; - goto out_free; - } - - if (request->flags & NL80211_SCAN_FLAG_RANDOM_ADDR) { - if (!(wiphy->features & - NL80211_FEATURE_SCAN_RANDOM_MAC_ADDR)) { - err = -EOPNOTSUPP; - goto out_free; - } - - if (wdev->current_bss) { - err = -EOPNOTSUPP; - goto out_free; - } - - err = nl80211_parse_random_mac(info->attrs, - request->mac_addr, - request->mac_addr_mask); - if (err) - goto out_free; - } - } + err = nl80211_check_scan_flags(wiphy, wdev, request, info->attrs, + false); + if (err) + goto out_free; request->no_cck = nla_get_flag(info->attrs[NL80211_ATTR_TX_NO_CCK_RATE]); @@ -7299,37 +7346,9 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev, request->ie_len); } - if (attrs[NL80211_ATTR_SCAN_FLAGS]) { - request->flags = nla_get_u32( - attrs[NL80211_ATTR_SCAN_FLAGS]); - if ((request->flags & NL80211_SCAN_FLAG_LOW_PRIORITY) && - !(wiphy->features & NL80211_FEATURE_LOW_PRIORITY_SCAN)) { - err = -EOPNOTSUPP; - goto out_free; - } - - if (request->flags & NL80211_SCAN_FLAG_RANDOM_ADDR) { - u32 flg = NL80211_FEATURE_SCHED_SCAN_RANDOM_MAC_ADDR; - - if (!wdev) /* must be net-detect */ - flg = NL80211_FEATURE_ND_RANDOM_MAC_ADDR; - - if (!(wiphy->features & flg)) { - err = -EOPNOTSUPP; - goto out_free; - } - - if (wdev && wdev->current_bss) { - err = -EOPNOTSUPP; - goto out_free; - } - - err = nl80211_parse_random_mac(attrs, request->mac_addr, - request->mac_addr_mask); - if (err) - goto out_free; - } - } + err = nl80211_check_scan_flags(wiphy, wdev, request, attrs, true); + if (err) + goto out_free; if (attrs[NL80211_ATTR_SCHED_SCAN_DELAY]) request->delay = -- cgit v1.2.3 From 1272c5d89b597995cb10db87dd4a1adc91d36006 Mon Sep 17 00:00:00 2001 From: Luca Coelho Date: Fri, 18 Aug 2017 15:33:56 +0300 Subject: mac80211: add documentation to ieee80211_rx_ba_offl() Add documentation to ieee80211_rx_ba_offl() function and, while at it, rename the bit argument to tid, for consistency. 
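For context: the tid argument doubles as a bit index into the tid_rx_manage_offl bitmap set below in agg-rx.c. A sketch of the inline wrappers layered on top of this helper in the same header, reconstructed from the surrounding code (exact bodies may differ): start requests pass the TID directly, stop requests offset it by IEEE80211_NUM_TIDS:

	static inline void
	ieee80211_start_rx_ba_session_offl(struct ieee80211_vif *vif,
					   const u8 *addr, u16 tid)
	{
		ieee80211_manage_rx_ba_offl(vif, addr, tid);
	}

	static inline void
	ieee80211_stop_rx_ba_session_offl(struct ieee80211_vif *vif,
					  const u8 *addr, u16 tid)
	{
		ieee80211_manage_rx_ba_offl(vif, addr, tid + IEEE80211_NUM_TIDS);
	}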
Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- include/net/mac80211.h | 8 +++++++- net/mac80211/agg-rx.c | 4 ++-- 2 files changed, 9 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 885690fa39c8..cc9073e45be9 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -5441,8 +5441,14 @@ void ieee80211_mark_rx_ba_filtered_frames(struct ieee80211_sta *pubsta, u8 tid, */ void ieee80211_send_bar(struct ieee80211_vif *vif, u8 *ra, u16 tid, u16 ssn); +/** + * ieee80211_manage_rx_ba_offl - helper to queue an RX BA work + * @vif: &struct ieee80211_vif pointer from the add_interface callback + * @addr: station mac address + * @tid: the rx tid + */ void ieee80211_manage_rx_ba_offl(struct ieee80211_vif *vif, const u8 *addr, - unsigned int bit); + unsigned int tid); /** * ieee80211_start_rx_ba_session_offl - start a Rx BA session diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index 2849a1fc41c5..88cc1ae935ea 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -459,7 +459,7 @@ void ieee80211_process_addba_request(struct ieee80211_local *local, } void ieee80211_manage_rx_ba_offl(struct ieee80211_vif *vif, - const u8 *addr, unsigned int bit) + const u8 *addr, unsigned int tid) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct ieee80211_local *local = sdata->local; @@ -470,7 +470,7 @@ void ieee80211_manage_rx_ba_offl(struct ieee80211_vif *vif, if (!sta) goto unlock; - set_bit(bit, sta->ampdu_mlme.tid_rx_manage_offl); + set_bit(tid, sta->ampdu_mlme.tid_rx_manage_offl); ieee80211_queue_work(&local->hw, &sta->ampdu_mlme.work); unlock: rcu_read_unlock(); -- cgit v1.2.3 From a6bcda44843c6dfced0fb973e2607c2a98addfa9 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 19 Sep 2017 11:52:43 +0200 Subject: cfg80211: remove unused function ieee80211_data_from_8023() This function hasn't been used since the removal of iwmc3200wifi in 2012. It also appears to have a bug when qos=True, since then it'll copy uninitialized stack memory to the SKB. Just remove the function entirely. Reported-by: Jouni Malinen Signed-off-by: Johannes Berg --- Documentation/driver-api/80211/cfg80211.rst | 3 - include/net/cfg80211.h | 13 ---- net/wireless/util.c | 115 ---------------------------- 3 files changed, 131 deletions(-) (limited to 'include') diff --git a/Documentation/driver-api/80211/cfg80211.rst b/Documentation/driver-api/80211/cfg80211.rst index 8ffac57e1f5b..eeab91b59457 100644 --- a/Documentation/driver-api/80211/cfg80211.rst +++ b/Documentation/driver-api/80211/cfg80211.rst @@ -299,9 +299,6 @@ Data path helpers .. kernel-doc:: include/net/cfg80211.h :functions: ieee80211_data_to_8023 -.. kernel-doc:: include/net/cfg80211.h - :functions: ieee80211_data_from_8023 - .. kernel-doc:: include/net/cfg80211.h :functions: ieee80211_amsdu_to_8023s diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index aa9d993e519a..cc1996081463 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -4346,19 +4346,6 @@ static inline int ieee80211_data_to_8023(struct sk_buff *skb, const u8 *addr, return ieee80211_data_to_8023_exthdr(skb, NULL, addr, iftype); } -/** - * ieee80211_data_from_8023 - convert an 802.3 frame to 802.11 - * @skb: the 802.3 frame - * @addr: the device MAC address - * @iftype: the virtual interface type - * @bssid: the network bssid (used only for iftype STATION and ADHOC) - * @qos: build 802.11 QoS data frame - * Return: 0 on success, or a negative error code. 
- */ -int ieee80211_data_from_8023(struct sk_buff *skb, const u8 *addr, - enum nl80211_iftype iftype, const u8 *bssid, - bool qos); - /** * ieee80211_amsdu_to_8023s - decode an IEEE 802.11n A-MSDU frame * diff --git a/net/wireless/util.c b/net/wireless/util.c index 4aab793c2f00..7dcdf67cba29 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -529,121 +529,6 @@ int ieee80211_data_to_8023_exthdr(struct sk_buff *skb, struct ethhdr *ehdr, } EXPORT_SYMBOL(ieee80211_data_to_8023_exthdr); -int ieee80211_data_from_8023(struct sk_buff *skb, const u8 *addr, - enum nl80211_iftype iftype, - const u8 *bssid, bool qos) -{ - struct ieee80211_hdr hdr; - u16 hdrlen, ethertype; - __le16 fc; - const u8 *encaps_data; - int encaps_len, skip_header_bytes; - int nh_pos, h_pos; - int head_need; - - if (unlikely(skb->len < ETH_HLEN)) - return -EINVAL; - - nh_pos = skb_network_header(skb) - skb->data; - h_pos = skb_transport_header(skb) - skb->data; - - /* convert Ethernet header to proper 802.11 header (based on - * operation mode) */ - ethertype = (skb->data[12] << 8) | skb->data[13]; - fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_DATA); - - switch (iftype) { - case NL80211_IFTYPE_AP: - case NL80211_IFTYPE_AP_VLAN: - case NL80211_IFTYPE_P2P_GO: - fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS); - /* DA BSSID SA */ - memcpy(hdr.addr1, skb->data, ETH_ALEN); - memcpy(hdr.addr2, addr, ETH_ALEN); - memcpy(hdr.addr3, skb->data + ETH_ALEN, ETH_ALEN); - hdrlen = 24; - break; - case NL80211_IFTYPE_STATION: - case NL80211_IFTYPE_P2P_CLIENT: - fc |= cpu_to_le16(IEEE80211_FCTL_TODS); - /* BSSID SA DA */ - memcpy(hdr.addr1, bssid, ETH_ALEN); - memcpy(hdr.addr2, skb->data + ETH_ALEN, ETH_ALEN); - memcpy(hdr.addr3, skb->data, ETH_ALEN); - hdrlen = 24; - break; - case NL80211_IFTYPE_OCB: - case NL80211_IFTYPE_ADHOC: - /* DA SA BSSID */ - memcpy(hdr.addr1, skb->data, ETH_ALEN); - memcpy(hdr.addr2, skb->data + ETH_ALEN, ETH_ALEN); - memcpy(hdr.addr3, bssid, ETH_ALEN); - hdrlen = 24; - break; - default: - return -EOPNOTSUPP; - } - - if (qos) { - fc |= cpu_to_le16(IEEE80211_STYPE_QOS_DATA); - hdrlen += 2; - } - - hdr.frame_control = fc; - hdr.duration_id = 0; - hdr.seq_ctrl = 0; - - skip_header_bytes = ETH_HLEN; - if (ethertype == ETH_P_AARP || ethertype == ETH_P_IPX) { - encaps_data = bridge_tunnel_header; - encaps_len = sizeof(bridge_tunnel_header); - skip_header_bytes -= 2; - } else if (ethertype >= ETH_P_802_3_MIN) { - encaps_data = rfc1042_header; - encaps_len = sizeof(rfc1042_header); - skip_header_bytes -= 2; - } else { - encaps_data = NULL; - encaps_len = 0; - } - - skb_pull(skb, skip_header_bytes); - nh_pos -= skip_header_bytes; - h_pos -= skip_header_bytes; - - head_need = hdrlen + encaps_len - skb_headroom(skb); - - if (head_need > 0 || skb_cloned(skb)) { - head_need = max(head_need, 0); - if (head_need) - skb_orphan(skb); - - if (pskb_expand_head(skb, head_need, 0, GFP_ATOMIC)) - return -ENOMEM; - } - - if (encaps_data) { - memcpy(skb_push(skb, encaps_len), encaps_data, encaps_len); - nh_pos += encaps_len; - h_pos += encaps_len; - } - - memcpy(skb_push(skb, hdrlen), &hdr, hdrlen); - - nh_pos += hdrlen; - h_pos += hdrlen; - - /* Update skb pointers to various headers since this modified frame - * is going to go through Linux networking code that may potentially - * need things like pointer to IP header. 
*/ - skb_reset_mac_header(skb); - skb_set_network_header(skb, nh_pos); - skb_set_transport_header(skb, h_pos); - - return 0; -} -EXPORT_SYMBOL(ieee80211_data_from_8023); - static void __frame_add_frag(struct sk_buff *skb, struct page *page, void *ptr, int len, int size) -- cgit v1.2.3 From 65026002d69de006e273749bb799d3b01b757eb0 Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Fri, 18 Aug 2017 15:31:41 +0300 Subject: nl80211: add an option to allow MFP without requiring it The user space can now allow the kernel to associate to an AP that requires MFP or that doesn't have MFP enabled in the same NL80211_CMD_CONNECT command, by using a new NL80211_MFP_OPTIONAL flag. The driver / firmware will decide whether to use it or not. Include a feature bit to advertise support for NL80211_MFP_OPTIONAL. This allows new user space to run on old kernels and know that it cannot use the new attribute if it isn't supported. Signed-off-by: Emmanuel Grumbach Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 13 +++++++++++-- net/wireless/nl80211.c | 8 +++++++- 2 files changed, 18 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 76404d8a8863..59ba6ca66a0d 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1407,8 +1407,12 @@ enum nl80211_commands { * * @NL80211_ATTR_USE_MFP: Whether management frame protection (IEEE 802.11w) is * used for the association (&enum nl80211_mfp, represented as a u32); - * this attribute can be used - * with %NL80211_CMD_ASSOCIATE and %NL80211_CMD_CONNECT requests + * this attribute can be used with %NL80211_CMD_ASSOCIATE and + * %NL80211_CMD_CONNECT requests. %NL80211_MFP_OPTIONAL is not allowed for + * %NL80211_CMD_ASSOCIATE since user space SME is expected and hence, it + * must have decided whether to use management frame protection or not. + * Setting %NL80211_MFP_OPTIONAL with a %NL80211_CMD_CONNECT request will + * let the driver (or the firmware) decide whether to use MFP or not. * * @NL80211_ATTR_STA_FLAGS2: Attribute containing a * &struct nl80211_sta_flag_update. @@ -3947,10 +3951,12 @@ enum nl80211_key_type { * enum nl80211_mfp - Management frame protection state * @NL80211_MFP_NO: Management frame protection not used * @NL80211_MFP_REQUIRED: Management frame protection required + * @NL80211_MFP_OPTIONAL: Management frame protection is optional */ enum nl80211_mfp { NL80211_MFP_NO, NL80211_MFP_REQUIRED, + NL80211_MFP_OPTIONAL, }; enum nl80211_wpa_versions { @@ -4923,6 +4929,8 @@ enum nl80211_feature_flags { * the first probe request in each channel at rate of at least 5.5Mbps. * @NL80211_EXT_FEATURE_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION: Driver supports * probe request tx deferral and suppression + * @NL80211_EXT_FEATURE_MFP_OPTIONAL: Driver supports the %NL80211_MFP_OPTIONAL + * value in %NL80211_ATTR_USE_MFP. * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. 
@@ -4949,6 +4957,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_ACCEPT_BCAST_PROBE_RESP, NL80211_EXT_FEATURE_OCE_PROBE_REQ_HIGH_TX_RATE, NL80211_EXT_FEATURE_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION, + NL80211_EXT_FEATURE_MFP_OPTIONAL, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 2e6f5f4065f9..1e39ba3cfd06 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -8952,8 +8952,14 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL80211_ATTR_USE_MFP]) { connect.mfp = nla_get_u32(info->attrs[NL80211_ATTR_USE_MFP]); + if (connect.mfp == NL80211_MFP_OPTIONAL && + !wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_MFP_OPTIONAL)) + return -EOPNOTSUPP; + if (connect.mfp != NL80211_MFP_REQUIRED && - connect.mfp != NL80211_MFP_NO) + connect.mfp != NL80211_MFP_NO && + connect.mfp != NL80211_MFP_OPTIONAL) return -EINVAL; } else { connect.mfp = NL80211_MFP_NO; -- cgit v1.2.3 From 6e617de84e87d626d1e976fc30e1322239fd4d2d Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 20 Sep 2017 18:26:53 +0200 Subject: net: avoid a full fib lookup when rp_filter is disabled. Since commit 1dced6a85482 ("ipv4: Restore accept_local behaviour in fib_validate_source()") a full fib lookup is needed even if the rp_filter is disabled, if accept_local is false - which is the default. What we really need in the above scenario is just checking that the source IP address is not local, and in most cases we can do that in a cheaper way, by looking up the ifaddr hash table. This commit adds a helper for such lookup, and uses it to validate the src address when rp_filter is disabled and no 'local' routes are created by the user space in the relevant namespace. A new ipv4 netns flag is added to account for such routes. We need that to preserve the same behavior we had before this patch. It also drops the checks to bail early from __fib_validate_source, added by the commit 1dced6a85482 ("ipv4: Restore accept_local behaviour in fib_validate_source()"): they do not give any measurable performance improvement, since when we do the lookup we are already on a slower path. This improves UDP performance for unconnected sockets when rp_filter is disabled by 5% and also gives small but measurable performance improvement for TCP flood scenarios. v1 -> v2: - use the ifaddr lookup helper in __ip_dev_find(), as suggested by Eric - fall-back to full lookup if custom local routes are present Signed-off-by: Paolo Abeni Signed-off-by: David S.
Miller --- include/linux/inetdevice.h | 1 + include/net/netns/ipv4.h | 1 + net/ipv4/devinet.c | 30 ++++++++++++++++++------------ net/ipv4/fib_frontend.c | 22 +++++++++++++++++----- 4 files changed, 37 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h index fb3f809e34e4..751d051f0bc7 100644 --- a/include/linux/inetdevice.h +++ b/include/linux/inetdevice.h @@ -179,6 +179,7 @@ __be32 inet_confirm_addr(struct net *net, struct in_device *in_dev, __be32 dst, __be32 local, int scope); struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix, __be32 mask); +struct in_ifaddr *inet_lookup_ifaddr_rcu(struct net *net, __be32 addr); static __inline__ bool inet_ifa_match(__be32 addr, struct in_ifaddr *ifa) { return !((addr^ifa->ifa_address)&ifa->ifa_mask); diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 20d061c805e3..20720721da4b 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -49,6 +49,7 @@ struct netns_ipv4 { #ifdef CONFIG_IP_MULTIPLE_TABLES struct fib_rules_ops *rules_ops; bool fib_has_custom_rules; + bool fib_has_custom_local_routes; struct fib_table __rcu *fib_main; struct fib_table __rcu *fib_default; #endif diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index d7adc0616599..7ce22a2c07ce 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -137,22 +137,12 @@ static void inet_hash_remove(struct in_ifaddr *ifa) */ struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref) { - u32 hash = inet_addr_hash(net, addr); struct net_device *result = NULL; struct in_ifaddr *ifa; rcu_read_lock(); - hlist_for_each_entry_rcu(ifa, &inet_addr_lst[hash], hash) { - if (ifa->ifa_local == addr) { - struct net_device *dev = ifa->ifa_dev->dev; - - if (!net_eq(dev_net(dev), net)) - continue; - result = dev; - break; - } - } - if (!result) { + ifa = inet_lookup_ifaddr_rcu(net, addr); + if (!ifa) { struct flowi4 fl4 = { .daddr = addr }; struct fib_result res = { 0 }; struct fib_table *local; @@ -165,6 +155,8 @@ struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref) !fib_table_lookup(local, &fl4, &res, FIB_LOOKUP_NOREF) && res.type == RTN_LOCAL) result = FIB_RES_DEV(res); + } else { + result = ifa->ifa_dev->dev; } if (result && devref) dev_hold(result); @@ -173,6 +165,20 @@ struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref) } EXPORT_SYMBOL(__ip_dev_find); +/* called under RCU lock */ +struct in_ifaddr *inet_lookup_ifaddr_rcu(struct net *net, __be32 addr) +{ + u32 hash = inet_addr_hash(net, addr); + struct in_ifaddr *ifa; + + hlist_for_each_entry_rcu(ifa, &inet_addr_lst[hash], hash) + if (ifa->ifa_local == addr && + net_eq(dev_net(ifa->ifa_dev->dev), net)) + return ifa; + + return NULL; +} + static void rtmsg_ifa(int event, struct in_ifaddr *, struct nlmsghdr *, u32); static BLOCKING_NOTIFIER_HEAD(inetaddr_chain); diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 37819ab4cc74..f02819134ba2 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -345,9 +345,6 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, if (res.type != RTN_UNICAST && (res.type != RTN_LOCAL || !IN_DEV_ACCEPT_LOCAL(idev))) goto e_inval; - if (!rpf && !fib_num_tclassid_users(net) && - (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev))) - goto last_resort; fib_combine_itag(itag, &res); dev_match = false; @@ -402,13 +399,26 @@ int fib_validate_source(struct sk_buff *skb, __be32 src, 
__be32 dst, struct in_device *idev, u32 *itag) { int r = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev); + struct net *net = dev_net(dev); - if (!r && !fib_num_tclassid_users(dev_net(dev)) && - IN_DEV_ACCEPT_LOCAL(idev) && + if (!r && !fib_num_tclassid_users(net) && (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev))) { + if (IN_DEV_ACCEPT_LOCAL(idev)) + goto ok; + /* if no local routes are added from user space we can check + * for local addresses looking-up the ifaddr table + */ + if (net->ipv4.fib_has_custom_local_routes) + goto full_check; + if (inet_lookup_ifaddr_rcu(net, src)) + return -EINVAL; + +ok: *itag = 0; return 0; } + +full_check: return __fib_validate_source(skb, src, dst, tos, oif, dev, r, idev, itag); } @@ -759,6 +769,8 @@ static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, } err = fib_table_insert(net, tb, &cfg, extack); + if (!err && cfg.fc_type == RTN_LOCAL) + net->ipv4.fib_has_custom_local_routes = true; errout: return err; } -- cgit v1.2.3 From a1f3316dd7b5ce740c774697c664e2e60d095794 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 21 Sep 2017 18:18:23 -0700 Subject: ipv4: Move fib_has_custom_local_routes outside of IP_MULTIPLE_TABLES. > net/ipv4/fib_frontend.c: In function 'fib_validate_source': > net/ipv4/fib_frontend.c:411:16: error: 'struct netns_ipv4' has no member named 'fib_has_custom_local_routes' > if (net->ipv4.fib_has_custom_local_routes) > ^ > net/ipv4/fib_frontend.c: In function 'inet_rtm_newroute': > net/ipv4/fib_frontend.c:773:12: error: 'struct netns_ipv4' has no member named 'fib_has_custom_local_routes' > net->ipv4.fib_has_custom_local_routes = true; > ^ Fixes: 6e617de84e87 ("net: avoid a full fib lookup when rp_filter is disabled.") Reported-by: Stephen Rothwell Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 20720721da4b..8387f099115e 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -49,10 +49,10 @@ struct netns_ipv4 { #ifdef CONFIG_IP_MULTIPLE_TABLES struct fib_rules_ops *rules_ops; bool fib_has_custom_rules; - bool fib_has_custom_local_routes; struct fib_table __rcu *fib_main; struct fib_table __rcu *fib_default; #endif + bool fib_has_custom_local_routes; #ifdef CONFIG_IP_ROUTE_CLASSID int fib_num_tclassid_users; #endif -- cgit v1.2.3 From 242c1a28eb61cb34974e8aa05235d84355940a8a Mon Sep 17 00:00:00 2001 From: Gao Feng Date: Fri, 22 Sep 2017 10:25:22 +0800 Subject: net: Remove useless function skb_header_release There is no caller left that invokes the function skb_header_release, so just remove it now. Signed-off-by: Gao Feng Signed-off-by: David S. Miller --- drivers/net/usb/asix_common.c | 2 +- include/linux/skbuff.h | 19 ------------------- net/batman-adv/soft-interface.c | 2 +- 3 files changed, 2 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/drivers/net/usb/asix_common.c b/drivers/net/usb/asix_common.c index 522d2900cd1d..f4d7362eb325 100644 --- a/drivers/net/usb/asix_common.c +++ b/drivers/net/usb/asix_common.c @@ -245,7 +245,7 @@ struct sk_buff *asix_tx_fixup(struct usbnet *dev, struct sk_buff *skb, * - We are allowed to put 4 bytes at tail if skb_cloned() * is false (and if we have 4 bytes of tailroom) * - * TCP packets for example are cloned, but skb_header_release() + * TCP packets for example are cloned, but __skb_header_release() * was called in tcp stack, allowing us to use headroom for our needs.
*/ if (!skb_header_cloned(skb) && diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 492828801acb..f9db5539a6fb 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1456,28 +1456,9 @@ static inline int skb_header_unclone(struct sk_buff *skb, gfp_t pri) return 0; } -/** - * skb_header_release - release reference to header - * @skb: buffer to operate on - * - * Drop a reference to the header part of the buffer. This is done - * by acquiring a payload reference. You must not read from the header - * part of skb->data after this. - * Note : Check if you can use __skb_header_release() instead. - */ -static inline void skb_header_release(struct sk_buff *skb) -{ - BUG_ON(skb->nohdr); - skb->nohdr = 1; - atomic_add(1 << SKB_DATAREF_SHIFT, &skb_shinfo(skb)->dataref); -} - /** * __skb_header_release - release reference to header * @skb: buffer to operate on - * - * Variant of skb_header_release() assuming skb is private to caller. - * We can avoid one atomic operation. */ static inline void __skb_header_release(struct sk_buff *skb) { diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 10f7edfb176e..c2c986746d0b 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -69,7 +69,7 @@ int batadv_skb_head_push(struct sk_buff *skb, unsigned int len) int result; /* TODO: We must check if we can release all references to non-payload - * data using skb_header_release in our skbs to allow skb_cow_header to + * data using __skb_header_release in our skbs to allow skb_cow_header to * work optimally. This means that those skbs are not allowed to read * or write any data which is before the current position of skb->data * after that call and thus allow other skbs with the same data buffer -- cgit v1.2.3 From 373b8eeb0c15d4ce58f62afb12f213b1b5bbc3d3 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 21 Sep 2017 23:45:43 +0300 Subject: xfrm: make aead_len() return unsigned int Key lengths can't be negative. Comparison with nla_len() is left signed just in case negative value can sneak in there. Signed-off-by: Alexey Dobriyan Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 2 +- net/xfrm/xfrm_user.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index f002a2c5e33c..0be4c547e383 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1764,7 +1764,7 @@ static inline int xfrm_acquire_is_on(struct net *net) } #endif -static inline int aead_len(struct xfrm_algo_aead *alg) +static inline unsigned int aead_len(struct xfrm_algo_aead *alg) { return sizeof(*alg) + ((alg->alg_key_len + 7) / 8); } diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 2bfbd9121e3b..32c67b80c3ce 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -84,7 +84,7 @@ static int verify_aead(struct nlattr **attrs) return 0; algp = nla_data(rt); - if (nla_len(rt) < aead_len(algp)) + if (nla_len(rt) < (int)aead_len(algp)) return -EINVAL; algp->alg_name[sizeof(algp->alg_name) - 1] = '\0'; -- cgit v1.2.3 From 06cd22f830f28023b82455c82c7db65fc6cf9c16 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 21 Sep 2017 23:46:30 +0300 Subject: xfrm: make xfrm_alg_len() return unsigned int Key lengths can't be negative. Comparison with nla_len() is left signed just in case negative value can sneak in there. 
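For illustration, here is a tiny standalone sketch of the pitfall that keeping the comparison signed avoids (illustration only, not kernel code; the values are invented). If the comparison were allowed to become unsigned, a negative attribute length would wrap around and wrongly pass the check:

#include <stdio.h>

int main(void)
{
	int nla_len = -4;		/* hypothetical negative length sneaking in */
	unsigned int aead_len = 36;	/* e.g. sizeof(*alg) plus key bytes */

	/* Without a cast, the usual arithmetic conversions promote the
	 * signed side to unsigned: -4 wraps to a huge value, so the check
	 * "nla_len < aead_len" would be false and the bogus length accepted.
	 */
	if (nla_len < (int)aead_len)	/* signed compare, as in the patch */
		printf("bogus length rejected\n");
	return 0;
}

The same reasoning applies to the three sibling conversions that follow.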
Signed-off-by: Alexey Dobriyan Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 2 +- net/xfrm/xfrm_user.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 0be4c547e383..2abc0e117f11 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1769,7 +1769,7 @@ static inline unsigned int aead_len(struct xfrm_algo_aead *alg) return sizeof(*alg) + ((alg->alg_key_len + 7) / 8); } -static inline int xfrm_alg_len(const struct xfrm_algo *alg) +static inline unsigned int xfrm_alg_len(const struct xfrm_algo *alg) { return sizeof(*alg) + ((alg->alg_key_len + 7) / 8); } diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 32c67b80c3ce..09512d90e6a5 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -42,7 +42,7 @@ static int verify_one_alg(struct nlattr **attrs, enum xfrm_attr_type_t type) return 0; algp = nla_data(rt); - if (nla_len(rt) < xfrm_alg_len(algp)) + if (nla_len(rt) < (int)xfrm_alg_len(algp)) return -EINVAL; switch (type) { -- cgit v1.2.3 From 1bd963a72e859d194d87a5a2a8839efee7e23102 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 21 Sep 2017 23:47:09 +0300 Subject: xfrm: make xfrm_alg_auth_len() return unsigned int Key lengths can't be negative. Comparison with nla_len() is left signed just in case negative value can sneak in there. Signed-off-by: Alexey Dobriyan Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 2 +- net/xfrm/xfrm_user.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 2abc0e117f11..5d5e11b653eb 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1774,7 +1774,7 @@ static inline unsigned int xfrm_alg_len(const struct xfrm_algo *alg) return sizeof(*alg) + ((alg->alg_key_len + 7) / 8); } -static inline int xfrm_alg_auth_len(const struct xfrm_algo_auth *alg) +static inline unsigned int xfrm_alg_auth_len(const struct xfrm_algo_auth *alg) { return sizeof(*alg) + ((alg->alg_key_len + 7) / 8); } diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 09512d90e6a5..465c23d4ea78 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -68,7 +68,7 @@ static int verify_auth_trunc(struct nlattr **attrs) return 0; algp = nla_data(rt); - if (nla_len(rt) < xfrm_alg_auth_len(algp)) + if (nla_len(rt) < (int)xfrm_alg_auth_len(algp)) return -EINVAL; algp->alg_name[sizeof(algp->alg_name) - 1] = '\0'; -- cgit v1.2.3 From 5e708e47c44366453c33373940455a75fd33f635 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 21 Sep 2017 23:47:50 +0300 Subject: xfrm: make xfrm_replay_state_esn_len() return unsigned int Replay detection bitmaps can't have negative length. Comparisons with nla_len() are left signed just in case negative value can sneak in there. 
Propagate unsignedness for code size savings: add/remove: 0/0 grow/shrink: 0/5 up/down: 0/-38 (-38) function old new delta xfrm_state_construct 1802 1800 -2 xfrm_update_ae_params 295 289 -6 xfrm_state_migrate 1345 1339 -6 xfrm_replay_notify_esn 349 337 -12 xfrm_replay_notify_bmp 345 333 -12 Signed-off-by: Alexey Dobriyan Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 2 +- net/xfrm/xfrm_user.c | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 5d5e11b653eb..3cb618bbcfa5 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1779,7 +1779,7 @@ static inline unsigned int xfrm_alg_auth_len(const struct xfrm_algo_auth *alg) return sizeof(*alg) + ((alg->alg_key_len + 7) / 8); } -static inline int xfrm_replay_state_esn_len(struct xfrm_replay_state_esn *replay_esn) +static inline unsigned int xfrm_replay_state_esn_len(struct xfrm_replay_state_esn *replay_esn) { return sizeof(*replay_esn) + replay_esn->bmp_len * sizeof(__u32); } diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 465c23d4ea78..83718db5ec9c 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -130,7 +130,7 @@ static inline int verify_replay(struct xfrm_usersa_info *p, if (rs->bmp_len > XFRMA_REPLAY_ESN_MAX / sizeof(rs->bmp[0]) / 8) return -EINVAL; - if (nla_len(rt) < xfrm_replay_state_esn_len(rs) && + if (nla_len(rt) < (int)xfrm_replay_state_esn_len(rs) && nla_len(rt) != sizeof(*rs)) return -EINVAL; } @@ -404,7 +404,7 @@ static inline int xfrm_replay_verify_len(struct xfrm_replay_state_esn *replay_es struct nlattr *rp) { struct xfrm_replay_state_esn *up; - int ulen; + unsigned int ulen; if (!replay_esn || !rp) return 0; @@ -414,7 +414,7 @@ static inline int xfrm_replay_verify_len(struct xfrm_replay_state_esn *replay_es /* Check the overall length and the internal bitmap length to avoid * potential overflow. */ - if (nla_len(rp) < ulen || + if (nla_len(rp) < (int)ulen || xfrm_replay_state_esn_len(replay_esn) != ulen || replay_esn->bmp_len != up->bmp_len) return -EINVAL; @@ -430,14 +430,14 @@ static int xfrm_alloc_replay_state_esn(struct xfrm_replay_state_esn **replay_esn struct nlattr *rta) { struct xfrm_replay_state_esn *p, *pp, *up; - int klen, ulen; + unsigned int klen, ulen; if (!rta) return 0; up = nla_data(rta); klen = xfrm_replay_state_esn_len(up); - ulen = nla_len(rta) >= klen ? klen : sizeof(*up); + ulen = nla_len(rta) >= (int)klen ? klen : sizeof(*up); p = kzalloc(klen, GFP_KERNEL); if (!p) -- cgit v1.2.3 From 943170998b200190f99d3fe7e771437e2c51f319 Mon Sep 17 00:00:00 2001 From: Petar Penkov Date: Fri, 22 Sep 2017 13:49:14 -0700 Subject: tun: enable NAPI for TUN/TAP driver Changes TUN driver to use napi_gro_receive() upon receiving packets rather than netif_rx_ni(). Adds flag IFF_NAPI that enables these changes and operation is not affected if the flag is disabled. SKBs are constructed upon packet arrival and are queued to be processed later. The new path was evaluated with a benchmark with the following setup: Open two tap devices and a receiver thread that reads in a loop for each device. Start one sender thread and pin all threads to different CPUs. 
Send 1M minimum UDP packets to each device and measure sending time for each of the sending methods: napi_gro_receive(): 4.90s netif_rx_ni(): 4.90s netif_receive_skb(): 7.20s Signed-off-by: Petar Penkov Cc: Eric Dumazet Cc: Mahesh Bandewar Cc: Willem de Bruijn Cc: davem@davemloft.net Cc: ppenkov@stanford.edu Acked-by: Mahesh Bandewar Signed-off-by: David S. Miller --- drivers/net/tun.c | 133 +++++++++++++++++++++++++++++++++++++++----- include/uapi/linux/if_tun.h | 1 + 2 files changed, 119 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 3c9985f29950..f16407242b18 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -121,7 +121,7 @@ do { \ #define TUN_VNET_BE 0x40000000 #define TUN_FEATURES (IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR | \ - IFF_MULTI_QUEUE) + IFF_MULTI_QUEUE | IFF_NAPI) #define GOODCOPY_LEN 128 #define FLT_EXACT_COUNT 8 @@ -172,6 +172,7 @@ struct tun_file { u16 queue_index; unsigned int ifindex; }; + struct napi_struct napi; struct list_head next; struct tun_struct *detached; struct skb_array tx_array; @@ -229,6 +230,68 @@ struct tun_struct { struct bpf_prog __rcu *xdp_prog; }; +static int tun_napi_receive(struct napi_struct *napi, int budget) +{ + struct tun_file *tfile = container_of(napi, struct tun_file, napi); + struct sk_buff_head *queue = &tfile->sk.sk_write_queue; + struct sk_buff_head process_queue; + struct sk_buff *skb; + int received = 0; + + __skb_queue_head_init(&process_queue); + + spin_lock(&queue->lock); + skb_queue_splice_tail_init(queue, &process_queue); + spin_unlock(&queue->lock); + + while (received < budget && (skb = __skb_dequeue(&process_queue))) { + napi_gro_receive(napi, skb); + ++received; + } + + if (!skb_queue_empty(&process_queue)) { + spin_lock(&queue->lock); + skb_queue_splice(&process_queue, queue); + spin_unlock(&queue->lock); + } + + return received; +} + +static int tun_napi_poll(struct napi_struct *napi, int budget) +{ + unsigned int received; + + received = tun_napi_receive(napi, budget); + + if (received < budget) + napi_complete_done(napi, received); + + return received; +} + +static void tun_napi_init(struct tun_struct *tun, struct tun_file *tfile, + bool napi_en) +{ + if (napi_en) { + netif_napi_add(tun->dev, &tfile->napi, tun_napi_poll, + NAPI_POLL_WEIGHT); + napi_enable(&tfile->napi); + } +} + +static void tun_napi_disable(struct tun_struct *tun, struct tun_file *tfile) +{ + if (tun->flags & IFF_NAPI) + napi_disable(&tfile->napi); +} + +static void tun_napi_del(struct tun_struct *tun, struct tun_file *tfile) +{ + if (tun->flags & IFF_NAPI) + netif_napi_del(&tfile->napi); +} + #ifdef CONFIG_TUN_VNET_CROSS_LE static inline bool tun_legacy_is_little_endian(struct tun_struct *tun) { @@ -541,6 +604,11 @@ static void __tun_detach(struct tun_file *tfile, bool clean) tun = rtnl_dereference(tfile->tun); + if (tun && clean) { + tun_napi_disable(tun, tfile); + tun_napi_del(tun, tfile); + } + if (tun && !tfile->detached) { u16 index = tfile->queue_index; BUG_ON(index >= tun->numqueues); @@ -598,6 +666,7 @@ static void tun_detach_all(struct net_device *dev) for (i = 0; i < n; i++) { tfile = rtnl_dereference(tun->tfiles[i]); BUG_ON(!tfile); + tun_napi_disable(tun, tfile); tfile->socket.sk->sk_shutdown = RCV_SHUTDOWN; tfile->socket.sk->sk_data_ready(tfile->socket.sk); RCU_INIT_POINTER(tfile->tun, NULL); @@ -613,6 +682,7 @@ static void tun_detach_all(struct net_device *dev) synchronize_net(); for (i = 0; i < n; i++) { tfile = rtnl_dereference(tun->tfiles[i]); + tun_napi_del(tun, 
tfile); /* Drop read queue */ tun_queue_purge(tfile); sock_put(&tfile->sk); @@ -631,7 +701,8 @@ static void tun_detach_all(struct net_device *dev) module_put(THIS_MODULE); } -static int tun_attach(struct tun_struct *tun, struct file *file, bool skip_filter) +static int tun_attach(struct tun_struct *tun, struct file *file, + bool skip_filter, bool napi) { struct tun_file *tfile = file->private_data; struct net_device *dev = tun->dev; @@ -677,10 +748,12 @@ static int tun_attach(struct tun_struct *tun, struct file *file, bool skip_filte rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile); tun->numqueues++; - if (tfile->detached) + if (tfile->detached) { tun_enable_queue(tfile); - else + } else { sock_hold(&tfile->sk); + tun_napi_init(tun, tfile, napi); + } tun_set_real_num_queues(tun); @@ -956,13 +1029,28 @@ static void tun_poll_controller(struct net_device *dev) * Tun only receives frames when: * 1) the char device endpoint gets data from user space * 2) the tun socket gets a sendmsg call from user space - * Since both of those are synchronous operations, we are guaranteed - * never to have pending data when we poll for it - * so there is nothing to do here but return. + * If NAPI is not enabled, since both of those are synchronous + * operations, we are guaranteed never to have pending data when we poll + * for it so there is nothing to do here but return. * We need this though so netpoll recognizes us as an interface that * supports polling, which enables bridge devices in virt setups to * still use netconsole + * If NAPI is enabled, however, we need to schedule polling for all + * queues. */ + struct tun_struct *tun = netdev_priv(dev); + + if (tun->flags & IFF_NAPI) { + struct tun_file *tfile; + int i; + + rcu_read_lock(); + for (i = 0; i < tun->numqueues; i++) { + tfile = rcu_dereference(tun->tfiles[i]); + napi_schedule(&tfile->napi); + } + rcu_read_unlock(); + } return; } #endif @@ -1549,11 +1637,25 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, } rxhash = __skb_get_hash_symmetric(skb); -#ifndef CONFIG_4KSTACKS - tun_rx_batched(tun, tfile, skb, more); -#else - netif_rx_ni(skb); -#endif + + if (tun->flags & IFF_NAPI) { + struct sk_buff_head *queue = &tfile->sk.sk_write_queue; + int queue_len; + + spin_lock_bh(&queue->lock); + __skb_queue_tail(queue, skb); + queue_len = skb_queue_len(queue); + spin_unlock(&queue->lock); + + if (!more || queue_len > NAPI_POLL_WEIGHT) + napi_schedule(&tfile->napi); + + local_bh_enable(); + } else if (!IS_ENABLED(CONFIG_4KSTACKS)) { + tun_rx_batched(tun, tfile, skb, more); + } else { + netif_rx_ni(skb); + } stats = get_cpu_ptr(tun->pcpu_stats); u64_stats_update_begin(&stats->syncp); @@ -1980,7 +2082,8 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) if (err < 0) return err; - err = tun_attach(tun, file, ifr->ifr_flags & IFF_NOFILTER); + err = tun_attach(tun, file, ifr->ifr_flags & IFF_NOFILTER, + ifr->ifr_flags & IFF_NAPI); if (err < 0) return err; @@ -2066,7 +2169,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) NETIF_F_HW_VLAN_STAG_TX); INIT_LIST_HEAD(&tun->disabled); - err = tun_attach(tun, file, false); + err = tun_attach(tun, file, false, ifr->ifr_flags & IFF_NAPI); if (err < 0) goto err_free_flow; @@ -2216,7 +2319,7 @@ static int tun_set_queue(struct file *file, struct ifreq *ifr) ret = security_tun_dev_attach_queue(tun->security); if (ret < 0) goto unlock; - ret = tun_attach(tun, file, false); + ret = tun_attach(tun, file, false, tun->flags & 
IFF_NAPI); } else if (ifr->ifr_flags & IFF_DETACH_QUEUE) { tun = rtnl_dereference(tfile->tun); if (!tun || !(tun->flags & IFF_MULTI_QUEUE) || tfile->detached) diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h index 3cb5e1d85ddd..30b6184884eb 100644 --- a/include/uapi/linux/if_tun.h +++ b/include/uapi/linux/if_tun.h @@ -60,6 +60,7 @@ /* TUNSETIFF ifr flags */ #define IFF_TUN 0x0001 #define IFF_TAP 0x0002 +#define IFF_NAPI 0x0010 #define IFF_NO_PI 0x1000 /* This flag has no real effect */ #define IFF_ONE_QUEUE 0x2000 -- cgit v1.2.3 From 90e33d45940793def6f773b2d528e9f3c84ffdc7 Mon Sep 17 00:00:00 2001 From: Petar Penkov Date: Fri, 22 Sep 2017 13:49:15 -0700 Subject: tun: enable napi_gro_frags() for TUN/TAP driver Add a TUN/TAP receive mode that exercises the napi_gro_frags() interface. This mode is available only in TAP mode, as the interface expects packets with Ethernet headers. Furthermore, packets follow the layout of the iovec_iter that was received. The first iovec is the linear data, and every one after the first is a fragment. If there are more fragments than the max number, drop the packet. Additionally, invoke eth_get_headlen() to exercise flow dissector code and to verify that the header resides in the linear data. The napi_gro_frags() mode requires setting the IFF_NAPI_FRAGS option. This is imposed because this mode is intended for testing via tools like syzkaller and packetdrill, and the increased flexibility it provides can introduce security vulnerabilities. This flag is accepted only if the device is in TAP mode and has the IFF_NAPI flag set as well. This is done because both of these are explicit requirements for correct operation in this mode. Signed-off-by: Petar Penkov Cc: Eric Dumazet Cc: Mahesh Bandewar Cc: Willem de Bruijn Cc: davem@davemloft.net Cc: ppenkov@stanford.edu Acked-by: Mahesh Bandewar Signed-off-by: David S. 
Miller --- drivers/net/tun.c | 134 ++++++++++++++++++++++++++++++++++++++++++-- include/uapi/linux/if_tun.h | 1 + 2 files changed, 129 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index f16407242b18..9880b3bc8fa5 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -75,6 +75,7 @@ #include #include #include +#include #include @@ -121,7 +122,8 @@ do { \ #define TUN_VNET_BE 0x40000000 #define TUN_FEATURES (IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR | \ - IFF_MULTI_QUEUE | IFF_NAPI) + IFF_MULTI_QUEUE | IFF_NAPI | IFF_NAPI_FRAGS) + #define GOODCOPY_LEN 128 #define FLT_EXACT_COUNT 8 @@ -173,6 +175,7 @@ struct tun_file { unsigned int ifindex; }; struct napi_struct napi; + struct mutex napi_mutex; /* Protects access to the above napi */ struct list_head next; struct tun_struct *detached; struct skb_array tx_array; @@ -277,6 +280,7 @@ static void tun_napi_init(struct tun_struct *tun, struct tun_file *tfile, netif_napi_add(tun->dev, &tfile->napi, tun_napi_poll, NAPI_POLL_WEIGHT); napi_enable(&tfile->napi); + mutex_init(&tfile->napi_mutex); } } @@ -292,6 +296,11 @@ static void tun_napi_del(struct tun_struct *tun, struct tun_file *tfile) netif_napi_del(&tfile->napi); } +static bool tun_napi_frags_enabled(const struct tun_struct *tun) +{ + return READ_ONCE(tun->flags) & IFF_NAPI_FRAGS; +} + #ifdef CONFIG_TUN_VNET_CROSS_LE static inline bool tun_legacy_is_little_endian(struct tun_struct *tun) { @@ -1036,7 +1045,8 @@ static void tun_poll_controller(struct net_device *dev) * supports polling, which enables bridge devices in virt setups to * still use netconsole * If NAPI is enabled, however, we need to schedule polling for all - * queues. + * queues unless we are using napi_gro_frags(), which we call in + * process context and not in NAPI context. */ struct tun_struct *tun = netdev_priv(dev); @@ -1044,6 +1054,9 @@ static void tun_poll_controller(struct net_device *dev) struct tun_file *tfile; int i; + if (tun_napi_frags_enabled(tun)) + return; + rcu_read_lock(); for (i = 0; i < tun->numqueues; i++) { tfile = rcu_dereference(tun->tfiles[i]); @@ -1266,6 +1279,64 @@ static unsigned int tun_chr_poll(struct file *file, poll_table *wait) return mask; } +static struct sk_buff *tun_napi_alloc_frags(struct tun_file *tfile, + size_t len, + const struct iov_iter *it) +{ + struct sk_buff *skb; + size_t linear; + int err; + int i; + + if (it->nr_segs > MAX_SKB_FRAGS + 1) + return ERR_PTR(-ENOMEM); + + local_bh_disable(); + skb = napi_get_frags(&tfile->napi); + local_bh_enable(); + if (!skb) + return ERR_PTR(-ENOMEM); + + linear = iov_iter_single_seg_count(it); + err = __skb_grow(skb, linear); + if (err) + goto free; + + skb->len = len; + skb->data_len = len - linear; + skb->truesize += skb->data_len; + + for (i = 1; i < it->nr_segs; i++) { + size_t fragsz = it->iov[i].iov_len; + unsigned long offset; + struct page *page; + void *data; + + if (fragsz == 0 || fragsz > PAGE_SIZE) { + err = -EINVAL; + goto free; + } + + local_bh_disable(); + data = napi_alloc_frag(fragsz); + local_bh_enable(); + if (!data) { + err = -ENOMEM; + goto free; + } + + page = virt_to_head_page(data); + offset = data - page_address(page); + skb_fill_page_desc(skb, i - 1, page, offset, fragsz); + } + + return skb; +free: + /* frees skb and all frags allocated with napi_alloc_frag() */ + napi_free_frags(&tfile->napi); + return ERR_PTR(err); +} + /* prepad is the amount to reserve at front. len is length after that. * linear is a hint as to how much to copy (usually headers). 
*/ static struct sk_buff *tun_alloc_skb(struct tun_file *tfile, @@ -1478,6 +1549,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, int err; u32 rxhash; int skb_xdp = 1; + bool frags = tun_napi_frags_enabled(tun); if (!(tun->dev->flags & IFF_UP)) return -EIO; @@ -1535,7 +1607,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, zerocopy = true; } - if (tun_can_build_skb(tun, tfile, len, noblock, zerocopy)) { + if (!frags && tun_can_build_skb(tun, tfile, len, noblock, zerocopy)) { /* For the packet that is not easy to be processed * (e.g gso or jumbo packet), we will do it at after * skb was created with generic XDP routine. @@ -1556,10 +1628,24 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, linear = tun16_to_cpu(tun, gso.hdr_len); } - skb = tun_alloc_skb(tfile, align, copylen, linear, noblock); + if (frags) { + mutex_lock(&tfile->napi_mutex); + skb = tun_napi_alloc_frags(tfile, copylen, from); + /* tun_napi_alloc_frags() enforces a layout for the skb. + * If zerocopy is enabled, then this layout will be + * overwritten by zerocopy_sg_from_iter(). + */ + zerocopy = false; + } else { + skb = tun_alloc_skb(tfile, align, copylen, linear, + noblock); + } + if (IS_ERR(skb)) { if (PTR_ERR(skb) != -EAGAIN) this_cpu_inc(tun->pcpu_stats->rx_dropped); + if (frags) + mutex_unlock(&tfile->napi_mutex); return PTR_ERR(skb); } @@ -1571,6 +1657,11 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, if (err) { this_cpu_inc(tun->pcpu_stats->rx_dropped); kfree_skb(skb); + if (frags) { + tfile->napi.skb = NULL; + mutex_unlock(&tfile->napi_mutex); + } + return -EFAULT; } } @@ -1578,6 +1669,11 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, if (virtio_net_hdr_to_skb(skb, &gso, tun_is_little_endian(tun))) { this_cpu_inc(tun->pcpu_stats->rx_frame_errors); kfree_skb(skb); + if (frags) { + tfile->napi.skb = NULL; + mutex_unlock(&tfile->napi_mutex); + } + return -EINVAL; } @@ -1603,7 +1699,8 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, skb->dev = tun->dev; break; case IFF_TAP: - skb->protocol = eth_type_trans(skb, tun->dev); + if (!frags) + skb->protocol = eth_type_trans(skb, tun->dev); break; } @@ -1638,7 +1735,23 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, rxhash = __skb_get_hash_symmetric(skb); - if (tun->flags & IFF_NAPI) { + if (frags) { + /* Exercise flow dissector code path. 
*/ + u32 headlen = eth_get_headlen(skb->data, skb_headlen(skb)); + + if (headlen > skb_headlen(skb) || headlen < ETH_HLEN) { + this_cpu_inc(tun->pcpu_stats->rx_dropped); + napi_free_frags(&tfile->napi); + mutex_unlock(&tfile->napi_mutex); + WARN_ON(1); + return -ENOMEM; + } + + local_bh_disable(); + napi_gro_frags(&tfile->napi); + local_bh_enable(); + mutex_unlock(&tfile->napi_mutex); + } else if (tun->flags & IFF_NAPI) { struct sk_buff_head *queue = &tfile->sk.sk_write_queue; int queue_len; @@ -2061,6 +2174,15 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) if (tfile->detached) return -EINVAL; + if ((ifr->ifr_flags & IFF_NAPI_FRAGS)) { + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (!(ifr->ifr_flags & IFF_NAPI) || + (ifr->ifr_flags & TUN_TYPE_MASK) != IFF_TAP) + return -EINVAL; + } + dev = __dev_get_by_name(net, ifr->ifr_name); if (dev) { if (ifr->ifr_flags & IFF_TUN_EXCL) diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h index 30b6184884eb..365ade5685c9 100644 --- a/include/uapi/linux/if_tun.h +++ b/include/uapi/linux/if_tun.h @@ -61,6 +61,7 @@ #define IFF_TUN 0x0001 #define IFF_TAP 0x0002 #define IFF_NAPI 0x0010 +#define IFF_NAPI_FRAGS 0x0020 #define IFF_NO_PI 0x1000 /* This flag has no real effect */ #define IFF_ONE_QUEUE 0x2000 -- cgit v1.2.3 From e451ae8e4f6b3f6bd3b83a5595657b5421b3bf69 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sat, 23 Sep 2017 23:01:06 +0300 Subject: neigh: make struct neigh_table::entry_size unsigned int Neigh entry size can't be negative. Space savings: add/remove: 0/0 grow/shrink: 0/5 up/down: 0/-7 (-7) function old new delta lowpan_neigh_construct 25 24 -1 clip_seq_sub_iter 152 151 -1 clip_ioctl 1475 1474 -1 clip_constructor 93 92 -1 __neigh_create 2455 2452 -3 Signed-off-by: Alexey Dobriyan Signed-off-by: David S. Miller --- include/net/neighbour.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 9816df225af3..9a25512e0a6e 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -190,7 +190,7 @@ struct neigh_hash_table { struct neigh_table { int family; - int entry_size; + unsigned int entry_size; int key_len; __be16 protocol; __u32 (*hash)(const void *pkey, -- cgit v1.2.3 From 01ccdf126ca5f9d4fe0889f65ee67afac910f19c Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sat, 23 Sep 2017 23:03:04 +0300 Subject: neigh: make struct neigh_table::key_len unsigned int Key length can't be negative. Leave comparisons against nla_len() signed just in case truncated attribute can sneak in there. Space savings: add/remove: 0/0 grow/shrink: 0/7 up/down: 0/-7 (-7) function old new delta pneigh_delete 273 272 -1 mlx5e_rep_netevent_event 1415 1414 -1 mlx5e_create_encap_header_ipv6 1194 1193 -1 mlx5e_create_encap_header_ipv4 1071 1070 -1 cxgb4_l2t_get 1104 1103 -1 __pneigh_lookup 69 68 -1 __neigh_create 2452 2451 -1 Signed-off-by: Alexey Dobriyan Signed-off-by: David S.
Miller --- drivers/net/ethernet/chelsio/cxgb4/l2t.c | 4 ++-- include/net/neighbour.h | 2 +- net/core/neighbour.c | 18 +++++++++--------- 3 files changed, 12 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/chelsio/cxgb4/l2t.c b/drivers/net/ethernet/chelsio/cxgb4/l2t.c index f7ef8871dd0b..1817a0307d26 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/l2t.c +++ b/drivers/net/ethernet/chelsio/cxgb4/l2t.c @@ -422,7 +422,7 @@ struct l2t_entry *cxgb4_l2t_get(struct l2t_data *d, struct neighbour *neigh, u8 lport; u16 vlan; struct l2t_entry *e; - int addr_len = neigh->tbl->key_len; + unsigned int addr_len = neigh->tbl->key_len; u32 *addr = (u32 *)neigh->primary_key; int ifidx = neigh->dev->ifindex; int hash = addr_hash(d, addr, addr_len, ifidx); @@ -536,7 +536,7 @@ void t4_l2t_update(struct adapter *adap, struct neighbour *neigh) struct l2t_entry *e; struct sk_buff_head *arpq = NULL; struct l2t_data *d = adap->l2t; - int addr_len = neigh->tbl->key_len; + unsigned int addr_len = neigh->tbl->key_len; u32 *addr = (u32 *) neigh->primary_key; int ifidx = neigh->dev->ifindex; int hash = addr_hash(d, addr, addr_len, ifidx); diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 9a25512e0a6e..2492000e1035 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -191,7 +191,7 @@ struct neigh_hash_table { struct neigh_table { int family; unsigned int entry_size; - int key_len; + unsigned int key_len; __be16 protocol; __u32 (*hash)(const void *pkey, const struct net_device *dev, diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 16a1a4c4eb57..6ea3a1a7f36a 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -457,7 +457,7 @@ struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net, const void *pkey) { struct neighbour *n; - int key_len = tbl->key_len; + unsigned int key_len = tbl->key_len; u32 hash_val; struct neigh_hash_table *nht; @@ -488,7 +488,7 @@ struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, struct net_device *dev, bool want_ref) { u32 hash_val; - int key_len = tbl->key_len; + unsigned int key_len = tbl->key_len; int error; struct neighbour *n1, *rc, *n = neigh_alloc(tbl, dev); struct neigh_hash_table *nht; @@ -572,7 +572,7 @@ out_neigh_release: } EXPORT_SYMBOL(__neigh_create); -static u32 pneigh_hash(const void *pkey, int key_len) +static u32 pneigh_hash(const void *pkey, unsigned int key_len) { u32 hash_val = *(u32 *)(pkey + key_len - 4); hash_val ^= (hash_val >> 16); @@ -585,7 +585,7 @@ static u32 pneigh_hash(const void *pkey, int key_len) static struct pneigh_entry *__pneigh_lookup_1(struct pneigh_entry *n, struct net *net, const void *pkey, - int key_len, + unsigned int key_len, struct net_device *dev) { while (n) { @@ -601,7 +601,7 @@ static struct pneigh_entry *__pneigh_lookup_1(struct pneigh_entry *n, struct pneigh_entry *__pneigh_lookup(struct neigh_table *tbl, struct net *net, const void *pkey, struct net_device *dev) { - int key_len = tbl->key_len; + unsigned int key_len = tbl->key_len; u32 hash_val = pneigh_hash(pkey, key_len); return __pneigh_lookup_1(tbl->phash_buckets[hash_val], @@ -614,7 +614,7 @@ struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl, struct net_device *dev, int creat) { struct pneigh_entry *n; - int key_len = tbl->key_len; + unsigned int key_len = tbl->key_len; u32 hash_val = pneigh_hash(pkey, key_len); read_lock_bh(&tbl->lock); @@ -659,7 +659,7 @@ int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey, 
struct net_device *dev) { struct pneigh_entry *n, **np; - int key_len = tbl->key_len; + unsigned int key_len = tbl->key_len; u32 hash_val = pneigh_hash(pkey, key_len); write_lock_bh(&tbl->lock); @@ -1662,7 +1662,7 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, if (tbl == NULL) return -EAFNOSUPPORT; - if (nla_len(dst_attr) < tbl->key_len) + if (nla_len(dst_attr) < (int)tbl->key_len) goto out; if (ndm->ndm_flags & NTF_PROXY) { @@ -1730,7 +1730,7 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, if (tbl == NULL) return -EAFNOSUPPORT; - if (nla_len(tb[NDA_DST]) < tbl->key_len) + if (nla_len(tb[NDA_DST]) < (int)tbl->key_len) goto out; dst = nla_data(tb[NDA_DST]); lladdr = tb[NDA_LLADDR] ? nla_data(tb[NDA_LLADDR]) : NULL; -- cgit v1.2.3 From 1e99c497012cd8647972876f1bd18545bc907aea Mon Sep 17 00:00:00 2001 From: Michal Kalderon Date: Sun, 24 Sep 2017 12:09:45 +0300 Subject: qed: iWARP - Add check for errors on a SYN packet A SYN packet which arrives with errors from FW should be dropped. This required adding an additional field to the ll2 rx completion data. Signed-off-by: Michal Kalderon Signed-off-by: Ariel Elior Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_iwarp.c | 8 ++++++++ drivers/net/ethernet/qlogic/qed/qed_ll2.c | 1 + include/linux/qed/qed_ll2_if.h | 1 + 3 files changed, 10 insertions(+) (limited to 'include') diff --git a/drivers/net/ethernet/qlogic/qed/qed_iwarp.c b/drivers/net/ethernet/qlogic/qed/qed_iwarp.c index 568e9853cc8d..8fc9c811f6e3 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_iwarp.c +++ b/drivers/net/ethernet/qlogic/qed/qed_iwarp.c @@ -1733,6 +1733,14 @@ qed_iwarp_ll2_comp_syn_pkt(void *cxt, struct qed_ll2_comp_rx_data *data) memset(&cm_info, 0, sizeof(cm_info)); ll2_syn_handle = p_hwfn->p_rdma_info->iwarp.ll2_syn_handle; + + /* Check if packet was received with errors... */ + if (data->err_flags) { + DP_NOTICE(p_hwfn, "Error received on SYN packet: 0x%x\n", + data->err_flags); + goto err; + } + if (GET_FIELD(data->parse_flags, PARSING_AND_ERR_FLAGS_L4CHKSMWASCALCULATED) && GET_FIELD(data->parse_flags, PARSING_AND_ERR_FLAGS_L4CHKSMERROR)) { diff --git a/drivers/net/ethernet/qlogic/qed/qed_ll2.c b/drivers/net/ethernet/qlogic/qed/qed_ll2.c index c06ad4f0758e..250afa5486cf 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_ll2.c +++ b/drivers/net/ethernet/qlogic/qed/qed_ll2.c @@ -413,6 +413,7 @@ static void qed_ll2_rxq_parse_reg(struct qed_hwfn *p_hwfn, struct qed_ll2_comp_rx_data *data) { data->parse_flags = le16_to_cpu(p_cqe->rx_cqe_fp.parse_flags.flags); + data->err_flags = le16_to_cpu(p_cqe->rx_cqe_fp.err_flags.flags); data->length.packet_length = le16_to_cpu(p_cqe->rx_cqe_fp.packet_length); data->vlan = le16_to_cpu(p_cqe->rx_cqe_fp.vlan); diff --git a/include/linux/qed/qed_ll2_if.h b/include/linux/qed/qed_ll2_if.h index dd7a3b86bb9e..89fa0bbd54f3 100644 --- a/include/linux/qed/qed_ll2_if.h +++ b/include/linux/qed/qed_ll2_if.h @@ -101,6 +101,7 @@ struct qed_ll2_comp_rx_data { void *cookie; dma_addr_t rx_buf_addr; u16 parse_flags; + u16 err_flags; u16 vlan; bool b_last_packet; u8 connection_handle; -- cgit v1.2.3 From 6aaae2b6c4330a46204bca042f1d2f41e8e18dea Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 25 Sep 2017 02:25:50 +0200 Subject: bpf: rename bpf_compute_data_end into bpf_compute_data_pointers Just do the rename into bpf_compute_data_pointers() as we'll add one more pointer here to recompute. 
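As a hedged sketch of the calling convention this keeps (not part of the patch; locking and error handling elided), each call site refreshes the cached pointers immediately before running the program:

/* Sketch only: mirrors the pattern at the call sites in the hunks below. */
static u32 run_bpf_on_skb(const struct bpf_prog *prog, struct sk_buff *skb)
{
	/* Recompute the pointers cached in skb->cb (today just data_end;
	 * the follow-up patch adds a data_meta pointer), since helpers
	 * run earlier may have moved skb->data.
	 */
	bpf_compute_data_pointers(skb);
	return BPF_PROG_RUN(prog, skb);
}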
Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: John Fastabend Signed-off-by: David S. Miller --- include/linux/filter.h | 9 ++++++--- kernel/bpf/sockmap.c | 4 ++-- net/bpf/test_run.c | 2 +- net/core/filter.c | 14 +++++++------- net/core/lwt_bpf.c | 2 +- net/sched/act_bpf.c | 4 ++-- net/sched/cls_bpf.c | 4 ++-- 7 files changed, 21 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index d29e58fde364..052bab3d62e7 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -496,10 +496,13 @@ struct xdp_buff { void *data_hard_start; }; -/* compute the linear packet data range [data, data_end) which - * will be accessed by cls_bpf, act_bpf and lwt programs +/* Compute the linear packet data range [data, data_end) which + * will be accessed by various program types (cls_bpf, act_bpf, + * lwt, ...). Subsystems allowing direct data access must (!) + * ensure that cb[] area can be written to when BPF program is + * invoked (otherwise cb[] save/restore is necessary). */ -static inline void bpf_compute_data_end(struct sk_buff *skb) +static inline void bpf_compute_data_pointers(struct sk_buff *skb) { struct bpf_skb_data_end *cb = (struct bpf_skb_data_end *)skb->cb; diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 6424ce0e4969..a298d6666698 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -102,7 +102,7 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb) skb_orphan(skb); skb->sk = psock->sock; - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); rc = (*prog->bpf_func)(skb, prog->insnsi); skb->sk = NULL; @@ -369,7 +369,7 @@ static int smap_parse_func_strparser(struct strparser *strp, * any socket yet. */ skb->sk = psock->sock; - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); rc = (*prog->bpf_func)(skb, prog->insnsi); skb->sk = NULL; rcu_read_unlock(); diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 6be41a44d688..df672517b4fd 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -133,7 +133,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, if (is_l2) __skb_push(skb, ETH_HLEN); if (is_direct_pkt_access) - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); retval = bpf_test_run(prog, skb, repeat, &duration); if (!is_l2) __skb_push(skb, ETH_HLEN); diff --git a/net/core/filter.c b/net/core/filter.c index 82edad58d066..c468e7cfad19 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1402,7 +1402,7 @@ static inline int bpf_try_make_writable(struct sk_buff *skb, { int err = __bpf_try_make_writable(skb, write_len); - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); return err; } @@ -1962,7 +1962,7 @@ BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto, ret = skb_vlan_push(skb, vlan_proto, vlan_tci); bpf_pull_mac_rcsum(skb); - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); return ret; } @@ -1984,7 +1984,7 @@ BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb) ret = skb_vlan_pop(skb); bpf_pull_mac_rcsum(skb); - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); return ret; } @@ -2178,7 +2178,7 @@ BPF_CALL_3(bpf_skb_change_proto, struct sk_buff *, skb, __be16, proto, * need to be verified first. */ ret = bpf_skb_proto_xlat(skb, proto); - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); return ret; } @@ -2303,7 +2303,7 @@ static int bpf_skb_adjust_net(struct sk_buff *skb, s32 len_diff) ret = shrink ? 
bpf_skb_net_shrink(skb, len_diff_abs) : bpf_skb_net_grow(skb, len_diff_abs); - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); return ret; } @@ -2394,7 +2394,7 @@ BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len, skb_gso_reset(skb); } - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); return ret; } @@ -2434,7 +2434,7 @@ BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room, skb_reset_mac_header(skb); } - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); return 0; } diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c index 1307731ddfe4..e7e626fb87bb 100644 --- a/net/core/lwt_bpf.c +++ b/net/core/lwt_bpf.c @@ -51,7 +51,7 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt, */ preempt_disable(); rcu_read_lock(); - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); ret = bpf_prog_run_save_cb(lwt->prog, skb); rcu_read_unlock(); diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index c0c707eb2c96..5ef8ce8c83d4 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -49,11 +49,11 @@ static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act, filter = rcu_dereference(prog->filter); if (at_ingress) { __skb_push(skb, skb->mac_len); - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); filter_res = BPF_PROG_RUN(filter, skb); __skb_pull(skb, skb->mac_len); } else { - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); filter_res = BPF_PROG_RUN(filter, skb); } rcu_read_unlock(); diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 520c5027646a..36671b0fb125 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -99,11 +99,11 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, } else if (at_ingress) { /* It is safe to push/pull even if skb_shared() */ __skb_push(skb, skb->mac_len); - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); filter_res = BPF_PROG_RUN(prog->filter, skb); __skb_pull(skb, skb->mac_len); } else { - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); filter_res = BPF_PROG_RUN(prog->filter, skb); } -- cgit v1.2.3 From de8f3a83b0a0fddb2cf56e7a718127e9619ea3da Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 25 Sep 2017 02:25:51 +0200 Subject: bpf: add meta pointer for direct access This work enables generic transfer of metadata from XDP into skb. The basic idea is that we can make use of the fact that the resulting skb must be linear and already comes with a larger headroom for supporting bpf_xdp_adjust_head(), which mangles xdp->data. Here, we base our work on a similar principle and introduce a small helper bpf_xdp_adjust_meta() for adjusting a new pointer called xdp->data_meta. Thus, the packet has a flexible and programmable room for meta data, followed by the actual packet data. struct xdp_buff is therefore laid out that we first point to data_hard_start, then data_meta directly prepended to data followed by data_end marking the end of packet. bpf_xdp_adjust_head() takes into account whether we have meta data already prepended and if so, memmove()s this along with the given offset provided there's enough room. xdp->data_meta is optional and programs are not required to use it. The rationale is that when we process the packet in XDP (e.g. as DoS filter), we can push further meta data along with it for the XDP_PASS case, and give the guarantee that a clsact ingress BPF program on the same device can pick this up for further post-processing. 
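As a usage sketch (illustration only, not part of this commit; assumes the usual libbpf headers, the SEC() macro and the BPF helper declarations), an XDP program could reserve four bytes of metadata for a later tc program roughly like this:

SEC("xdp")
int xdp_store_meta(struct xdp_md *ctx)
{
	__u32 *meta;

	/* Grow the metadata area by four bytes in front of data. */
	if (bpf_xdp_adjust_meta(ctx, -(int)sizeof(*meta)))
		return XDP_PASS;	/* driver without data_meta support */

	meta = (void *)(long)ctx->data_meta;
	/* Bounds check against ctx->data, as the verifier demands. */
	if ((void *)(meta + 1) > (void *)(long)ctx->data)
		return XDP_PASS;

	*meta = 0xcafe;		/* scratch value for the clsact side to read */
	return XDP_PASS;
}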
Since we work with skb there, we can also set skb->mark, skb->priority or other skb meta data out of BPF, thus having this scratch space generic and programmable allows for more flexibility than defining a direct 1:1 transfer of potentially new XDP members into skb (it's also more efficient as we don't need to initialize/handle each of such new members). The facility also works together with GRO aggregation. The scratch space at the head of the packet can be a multiple of 4 bytes, up to 32 bytes large. Drivers not yet supporting xdp->data_meta can simply be set up with xdp->data_meta as xdp->data + 1 as bpf_xdp_adjust_meta() will detect this and bail out, such that the subsequent match against xdp->data for later access is guaranteed to fail. The verifier treats xdp->data_meta/xdp->data the same way as we treat xdp->data/xdp->data_end pointer comparisons. The requirement for doing the compare against xdp->data is that it hasn't been modified from its original address we got from ctx access. It may have a range marking already from prior successful xdp->data/xdp->data_end pointer comparisons though. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: John Fastabend Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c | 1 + drivers/net/ethernet/cavium/thunder/nicvf_main.c | 1 + drivers/net/ethernet/intel/i40e/i40e_txrx.c | 1 + drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 1 + drivers/net/ethernet/mellanox/mlx4/en_rx.c | 1 + drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 1 + .../net/ethernet/netronome/nfp/nfp_net_common.c | 1 + drivers/net/ethernet/qlogic/qede/qede_fp.c | 1 + drivers/net/tun.c | 1 + drivers/net/virtio_net.c | 2 + include/linux/bpf.h | 1 + include/linux/filter.h | 21 +++- include/linux/skbuff.h | 68 +++++++++++- include/uapi/linux/bpf.h | 13 ++- kernel/bpf/verifier.c | 114 ++++++++++++++++----- net/bpf/test_run.c | 1 + net/core/dev.c | 31 +++++- net/core/filter.c | 77 +++++++++++++- net/core/skbuff.c | 2 + 19 files changed, 297 insertions(+), 42 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c index d8f0c837b72c..06ce63c00821 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c @@ -94,6 +94,7 @@ bool bnxt_rx_xdp(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, u16 cons, xdp.data_hard_start = *data_ptr - offset; xdp.data = *data_ptr; + xdp_set_data_meta_invalid(&xdp); xdp.data_end = *data_ptr + *len; orig_data = xdp.data; mapping = rx_buf->mapping - bp->rx_dma_offset; diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c index 49b80da51ba7..d68478afccbf 100644 --- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c +++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c @@ -523,6 +523,7 @@ static inline bool nicvf_xdp_rx(struct nicvf *nic, struct bpf_prog *prog, xdp.data_hard_start = page_address(page); xdp.data = (void *)cpu_addr; + xdp_set_data_meta_invalid(&xdp); xdp.data_end = xdp.data + len; orig_data = xdp.data; diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index 1519dfb851d0..f426762bd83a 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -2107,6 +2107,7 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget) if (!skb) { xdp.data = page_address(rx_buffer->page) + rx_buffer->page_offset; +
xdp_set_data_meta_invalid(&xdp); xdp.data_hard_start = xdp.data - i40e_rx_offset(rx_ring); xdp.data_end = xdp.data + size; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index d962368d08d0..04bb03bda1cd 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -2326,6 +2326,7 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector, if (!skb) { xdp.data = page_address(rx_buffer->page) + rx_buffer->page_offset; + xdp_set_data_meta_invalid(&xdp); xdp.data_hard_start = xdp.data - ixgbe_rx_offset(rx_ring); xdp.data_end = xdp.data + size; diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c index b97a55c827eb..8f9cb8abc497 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c @@ -762,6 +762,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud xdp.data_hard_start = va - frags[0].page_offset; xdp.data = va; + xdp_set_data_meta_invalid(&xdp); xdp.data_end = xdp.data + length; orig_data = xdp.data; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index f1dd638384d3..30b3f3fbd719 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -794,6 +794,7 @@ static inline int mlx5e_xdp_handle(struct mlx5e_rq *rq, return false; xdp.data = va + *rx_headroom; + xdp_set_data_meta_invalid(&xdp); xdp.data_end = xdp.data + *len; xdp.data_hard_start = va; diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index 1c0187f0af51..e3a38be3600a 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -1583,6 +1583,7 @@ static int nfp_net_run_xdp(struct bpf_prog *prog, void *data, void *hard_start, xdp.data_hard_start = hard_start; xdp.data = data + *off; + xdp_set_data_meta_invalid(&xdp); xdp.data_end = data + *off + *len; orig_data = xdp.data; diff --git a/drivers/net/ethernet/qlogic/qede/qede_fp.c b/drivers/net/ethernet/qlogic/qede/qede_fp.c index 6fc854b120b0..48ec4c56cddf 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_fp.c +++ b/drivers/net/ethernet/qlogic/qede/qede_fp.c @@ -1004,6 +1004,7 @@ static bool qede_rx_xdp(struct qede_dev *edev, xdp.data_hard_start = page_address(bd->data); xdp.data = xdp.data_hard_start + *data_offset; + xdp_set_data_meta_invalid(&xdp); xdp.data_end = xdp.data + *len; /* Queues always have a full reset currently, so for the time diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 2c36f6ebad79..a6e0bffe3d29 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1468,6 +1468,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun, xdp.data_hard_start = buf; xdp.data = buf + pad; + xdp_set_data_meta_invalid(&xdp); xdp.data_end = xdp.data + len; orig_data = xdp.data; act = bpf_prog_run_xdp(xdp_prog, &xdp); diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index dd14a4547932..fc059f193e7d 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -554,6 +554,7 @@ static struct sk_buff *receive_small(struct net_device *dev, xdp.data_hard_start = buf + VIRTNET_RX_PAD + vi->hdr_len; xdp.data = xdp.data_hard_start + xdp_headroom; + xdp_set_data_meta_invalid(&xdp); xdp.data_end = xdp.data + len; orig_data = xdp.data; act = 
bpf_prog_run_xdp(xdp_prog, &xdp); @@ -686,6 +687,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, data = page_address(xdp_page) + offset; xdp.data_hard_start = data - VIRTIO_XDP_HEADROOM + vi->hdr_len; xdp.data = data + vi->hdr_len; + xdp_set_data_meta_invalid(&xdp); xdp.data_end = xdp.data + (len - vi->hdr_len); act = bpf_prog_run_xdp(xdp_prog, &xdp); diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 8390859e79e7..2b672c50f160 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -137,6 +137,7 @@ enum bpf_reg_type { PTR_TO_MAP_VALUE, /* reg points to map element value */ PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */ PTR_TO_STACK, /* reg == frame_pointer + offset */ + PTR_TO_PACKET_META, /* skb->data - meta_len */ PTR_TO_PACKET, /* reg points to skb->data */ PTR_TO_PACKET_END, /* skb->data + headlen */ }; diff --git a/include/linux/filter.h b/include/linux/filter.h index 052bab3d62e7..911d454af107 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -487,12 +487,14 @@ struct sk_filter { struct bpf_skb_data_end { struct qdisc_skb_cb qdisc_cb; + void *data_meta; void *data_end; }; struct xdp_buff { void *data; void *data_end; + void *data_meta; void *data_hard_start; }; @@ -507,7 +509,8 @@ static inline void bpf_compute_data_pointers(struct sk_buff *skb) struct bpf_skb_data_end *cb = (struct bpf_skb_data_end *)skb->cb; BUILD_BUG_ON(sizeof(*cb) > FIELD_SIZEOF(struct sk_buff, cb)); - cb->data_end = skb->data + skb_headlen(skb); + cb->data_meta = skb->data - skb_metadata_len(skb); + cb->data_end = skb->data + skb_headlen(skb); } static inline u8 *bpf_skb_cb(struct sk_buff *skb) @@ -728,8 +731,22 @@ int xdp_do_redirect(struct net_device *dev, struct bpf_prog *prog); void xdp_do_flush_map(void); +/* Drivers not supporting XDP metadata can use this helper, which + * rejects any room expansion for metadata as a result. + */ +static __always_inline void +xdp_set_data_meta_invalid(struct xdp_buff *xdp) +{ + xdp->data_meta = xdp->data + 1; +} + +static __always_inline bool +xdp_data_meta_unsupported(const struct xdp_buff *xdp) +{ + return unlikely(xdp->data_meta > xdp->data); +} + void bpf_warn_invalid_xdp_action(u32 act); -void bpf_warn_invalid_xdp_redirect(u32 ifindex); struct sock *do_sk_redirect_map(void); diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index f9db5539a6fb..19e64bfb1a66 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -489,8 +489,9 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, * the end of the header data, ie. at skb->end. */ struct skb_shared_info { - unsigned short _unused; - unsigned char nr_frags; + __u8 __unused; + __u8 meta_len; + __u8 nr_frags; + __u8 tx_flags; unsigned short gso_size; /* Warning: this field is not always filled in (UFO)! */ @@ -3400,6 +3401,69 @@ static inline ktime_t net_invalid_timestamp(void) return 0; } +static inline u8 skb_metadata_len(const struct sk_buff *skb) +{ + return skb_shinfo(skb)->meta_len; +} + +static inline void *skb_metadata_end(const struct sk_buff *skb) +{ + return skb_mac_header(skb); +} + +static inline bool __skb_metadata_differs(const struct sk_buff *skb_a, + const struct sk_buff *skb_b, + u8 meta_len) +{ + const void *a = skb_metadata_end(skb_a); + const void *b = skb_metadata_end(skb_b); + /* Using more efficient variant than plain call to memcmp(). 
*/ +#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 + u64 diffs = 0; + + switch (meta_len) { +#define __it(x, op) (x -= sizeof(u##op)) +#define __it_diff(a, b, op) (*(u##op *)__it(a, op)) ^ (*(u##op *)__it(b, op)) + case 32: diffs |= __it_diff(a, b, 64); + case 24: diffs |= __it_diff(a, b, 64); + case 16: diffs |= __it_diff(a, b, 64); + case 8: diffs |= __it_diff(a, b, 64); + break; + case 28: diffs |= __it_diff(a, b, 64); + case 20: diffs |= __it_diff(a, b, 64); + case 12: diffs |= __it_diff(a, b, 64); + case 4: diffs |= __it_diff(a, b, 32); + break; + } + return diffs; +#else + return memcmp(a - meta_len, b - meta_len, meta_len); +#endif +} + +static inline bool skb_metadata_differs(const struct sk_buff *skb_a, + const struct sk_buff *skb_b) +{ + u8 len_a = skb_metadata_len(skb_a); + u8 len_b = skb_metadata_len(skb_b); + + if (!(len_a | len_b)) + return false; + + return len_a != len_b ? + true : __skb_metadata_differs(skb_a, skb_b, len_a); +} + +static inline void skb_metadata_set(struct sk_buff *skb, u8 meta_len) +{ + skb_shinfo(skb)->meta_len = meta_len; +} + +static inline void skb_metadata_clear(struct sk_buff *skb) +{ + skb_metadata_set(skb, 0); +} + struct sk_buff *skb_clone_sk(struct sk_buff *skb); #ifdef CONFIG_NETWORK_PHY_TIMESTAMPING diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 43ab5c402f98..e43491ac4823 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -582,6 +582,12 @@ union bpf_attr { * @map: pointer to sockmap to update * @key: key to insert/update sock in map * @flags: same flags as map update elem + * + * int bpf_xdp_adjust_meta(xdp_md, delta) + * Adjust the xdp_md.data_meta by delta + * @xdp_md: pointer to xdp_md + * @delta: A positive/negative integer to be added to xdp_md.data_meta + * Return: 0 on success or negative on error */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -638,6 +644,7 @@ union bpf_attr { FN(redirect_map), \ FN(sk_redirect_map), \ FN(sock_map_update), \ + FN(xdp_adjust_meta), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -715,7 +722,7 @@ struct __sk_buff { __u32 data_end; __u32 napi_id; - /* accessed by BPF_PROG_TYPE_sk_skb types */ + /* Accessed by BPF_PROG_TYPE_sk_skb types from here to ... */ __u32 family; __u32 remote_ip4; /* Stored in network byte order */ __u32 local_ip4; /* Stored in network byte order */ @@ -723,6 +730,9 @@ struct __sk_buff { __u32 local_ip6[4]; /* Stored in network byte order */ __u32 remote_port; /* Stored in network byte order */ __u32 local_port; /* stored in host byte order */ + /* ... here. */ + + __u32 data_meta; }; struct bpf_tunnel_key { @@ -783,6 +793,7 @@ enum xdp_action { struct xdp_md { __u32 data; __u32 data_end; + __u32 data_meta; }; enum sk_action { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b914fbe1383e..f849eca36052 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -177,6 +177,12 @@ static __printf(1, 2) void verbose(const char *fmt, ...)
va_end(args); } +static bool type_is_pkt_pointer(enum bpf_reg_type type) +{ + return type == PTR_TO_PACKET || + type == PTR_TO_PACKET_META; +} + /* string representation of 'enum bpf_reg_type' */ static const char * const reg_type_str[] = { [NOT_INIT] = "?", @@ -187,6 +193,7 @@ static const char * const reg_type_str[] = { [PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null", [PTR_TO_STACK] = "fp", [PTR_TO_PACKET] = "pkt", + [PTR_TO_PACKET_META] = "pkt_meta", [PTR_TO_PACKET_END] = "pkt_end", }; @@ -226,7 +233,7 @@ static void print_verifier_state(struct bpf_verifier_state *state) verbose("(id=%d", reg->id); if (t != SCALAR_VALUE) verbose(",off=%d", reg->off); - if (t == PTR_TO_PACKET) + if (type_is_pkt_pointer(t)) verbose(",r=%d", reg->range); else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE || @@ -519,6 +526,31 @@ static void mark_reg_known_zero(struct bpf_reg_state *regs, u32 regno) __mark_reg_known_zero(regs + regno); } +static bool reg_is_pkt_pointer(const struct bpf_reg_state *reg) +{ + return type_is_pkt_pointer(reg->type); +} + +static bool reg_is_pkt_pointer_any(const struct bpf_reg_state *reg) +{ + return reg_is_pkt_pointer(reg) || + reg->type == PTR_TO_PACKET_END; +} + +/* Unmodified PTR_TO_PACKET[_META,_END] register from ctx access. */ +static bool reg_is_init_pkt_pointer(const struct bpf_reg_state *reg, + enum bpf_reg_type which) +{ + /* The register can already have a range from prior markings. + * This is fine as long as it hasn't been advanced from its + * origin. + */ + return reg->type == which && + reg->id == 0 && + reg->off == 0 && + tnum_equals_const(reg->var_off, 0); +} + /* Attempts to improve min/max values based on var_off information */ static void __update_reg_bounds(struct bpf_reg_state *reg) { @@ -702,6 +734,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_STACK: case PTR_TO_CTX: case PTR_TO_PACKET: + case PTR_TO_PACKET_META: case PTR_TO_PACKET_END: case CONST_PTR_TO_MAP: return true; @@ -1047,7 +1080,10 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, switch (reg->type) { case PTR_TO_PACKET: - /* special case, because of NET_IP_ALIGN */ + case PTR_TO_PACKET_META: + /* Special case, because of NET_IP_ALIGN. Given metadata sits + * right in front, treat it the very same way. + */ return check_pkt_ptr_alignment(reg, off, size, strict); case PTR_TO_MAP_VALUE: pointer_desc = "value "; @@ -1124,8 +1160,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn err = check_ctx_access(env, insn_idx, off, size, t, ®_type); if (!err && t == BPF_READ && value_regno >= 0) { /* ctx access returns either a scalar, or a - * PTR_TO_PACKET[_END]. In the latter case, we know - * the offset is zero. + * PTR_TO_PACKET[_META,_END]. In the latter + * case, we know the offset is zero. 
*/ if (reg_type == SCALAR_VALUE) mark_reg_unknown(state->regs, value_regno); @@ -1170,7 +1206,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } else { err = check_stack_read(state, off, size, value_regno); } - } else if (reg->type == PTR_TO_PACKET) { + } else if (reg_is_pkt_pointer(reg)) { if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) { verbose("cannot write into packet\n"); return -EACCES; @@ -1310,6 +1346,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, switch (reg->type) { case PTR_TO_PACKET: + case PTR_TO_PACKET_META: return check_packet_access(env, regno, reg->off, access_size); case PTR_TO_MAP_VALUE: return check_map_access(env, regno, reg->off, access_size); @@ -1342,7 +1379,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, return 0; } - if (type == PTR_TO_PACKET && + if (type_is_pkt_pointer(type) && !may_access_direct_pkt_data(env, meta, BPF_READ)) { verbose("helper access to the packet is not allowed\n"); return -EACCES; @@ -1351,7 +1388,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, if (arg_type == ARG_PTR_TO_MAP_KEY || arg_type == ARG_PTR_TO_MAP_VALUE) { expected_type = PTR_TO_STACK; - if (type != PTR_TO_PACKET && type != expected_type) + if (!type_is_pkt_pointer(type) && + type != expected_type) goto err_type; } else if (arg_type == ARG_CONST_SIZE || arg_type == ARG_CONST_SIZE_OR_ZERO) { @@ -1375,7 +1413,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, */ if (register_is_null(*reg)) /* final test in check_stack_boundary() */; - else if (type != PTR_TO_PACKET && type != PTR_TO_MAP_VALUE && + else if (!type_is_pkt_pointer(type) && + type != PTR_TO_MAP_VALUE && type != expected_type) goto err_type; meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM; @@ -1401,7 +1440,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, verbose("invalid map_ptr to access map->key\n"); return -EACCES; } - if (type == PTR_TO_PACKET) + if (type_is_pkt_pointer(type)) err = check_packet_access(env, regno, reg->off, meta->map_ptr->key_size); else @@ -1417,7 +1456,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, verbose("invalid map_ptr to access map->value\n"); return -EACCES; } - if (type == PTR_TO_PACKET) + if (type_is_pkt_pointer(type)) err = check_packet_access(env, regno, reg->off, meta->map_ptr->value_size); else @@ -1590,8 +1629,8 @@ static int check_raw_mode(const struct bpf_func_proto *fn) return count > 1 ? -EINVAL : 0; } -/* Packet data might have moved, any old PTR_TO_PACKET[_END] are now invalid, - * so turn them into unknown SCALAR_VALUE. +/* Packet data might have moved, any old PTR_TO_PACKET[_META,_END] + * are now invalid, so turn them into unknown SCALAR_VALUE. 
*/ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) { @@ -1600,18 +1639,15 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) int i; for (i = 0; i < MAX_BPF_REG; i++) - if (regs[i].type == PTR_TO_PACKET || - regs[i].type == PTR_TO_PACKET_END) + if (reg_is_pkt_pointer_any(®s[i])) mark_reg_unknown(regs, i); for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { if (state->stack_slot_type[i] != STACK_SPILL) continue; reg = &state->spilled_regs[i / BPF_REG_SIZE]; - if (reg->type != PTR_TO_PACKET && - reg->type != PTR_TO_PACKET_END) - continue; - __mark_reg_unknown(reg); + if (reg_is_pkt_pointer_any(reg)) + __mark_reg_unknown(reg); } } @@ -1871,7 +1907,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, } dst_reg->var_off = tnum_add(ptr_reg->var_off, off_reg->var_off); dst_reg->off = ptr_reg->off; - if (ptr_reg->type == PTR_TO_PACKET) { + if (reg_is_pkt_pointer(ptr_reg)) { dst_reg->id = ++env->id_gen; /* something was added to pkt_ptr, set range to zero */ dst_reg->range = 0; @@ -1931,7 +1967,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, } dst_reg->var_off = tnum_sub(ptr_reg->var_off, off_reg->var_off); dst_reg->off = ptr_reg->off; - if (ptr_reg->type == PTR_TO_PACKET) { + if (reg_is_pkt_pointer(ptr_reg)) { dst_reg->id = ++env->id_gen; /* something was added to pkt_ptr, set range to zero */ if (smin_val < 0) @@ -2421,7 +2457,8 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } static void find_good_pkt_pointers(struct bpf_verifier_state *state, - struct bpf_reg_state *dst_reg) + struct bpf_reg_state *dst_reg, + enum bpf_reg_type type) { struct bpf_reg_state *regs = state->regs, *reg; int i; @@ -2483,7 +2520,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state, * dst_reg->off is known < MAX_PACKET_OFF, therefore it fits in a u16. 
*/ for (i = 0; i < MAX_BPF_REG; i++) - if (regs[i].type == PTR_TO_PACKET && regs[i].id == dst_reg->id) + if (regs[i].type == type && regs[i].id == dst_reg->id) /* keep the maximum range already checked */ regs[i].range = max_t(u16, regs[i].range, dst_reg->off); @@ -2491,7 +2528,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state, if (state->stack_slot_type[i] != STACK_SPILL) continue; reg = &state->spilled_regs[i / BPF_REG_SIZE]; - if (reg->type == PTR_TO_PACKET && reg->id == dst_reg->id) + if (reg->type == type && reg->id == dst_reg->id) reg->range = max_t(u16, reg->range, dst_reg->off); } } @@ -2856,19 +2893,39 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT && dst_reg->type == PTR_TO_PACKET && regs[insn->src_reg].type == PTR_TO_PACKET_END) { - find_good_pkt_pointers(this_branch, dst_reg); + find_good_pkt_pointers(this_branch, dst_reg, PTR_TO_PACKET); } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT && dst_reg->type == PTR_TO_PACKET && regs[insn->src_reg].type == PTR_TO_PACKET_END) { - find_good_pkt_pointers(other_branch, dst_reg); + find_good_pkt_pointers(other_branch, dst_reg, PTR_TO_PACKET); } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE && dst_reg->type == PTR_TO_PACKET_END && regs[insn->src_reg].type == PTR_TO_PACKET) { - find_good_pkt_pointers(other_branch, ®s[insn->src_reg]); + find_good_pkt_pointers(other_branch, ®s[insn->src_reg], + PTR_TO_PACKET); } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE && dst_reg->type == PTR_TO_PACKET_END && regs[insn->src_reg].type == PTR_TO_PACKET) { - find_good_pkt_pointers(this_branch, ®s[insn->src_reg]); + find_good_pkt_pointers(this_branch, ®s[insn->src_reg], + PTR_TO_PACKET); + } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT && + dst_reg->type == PTR_TO_PACKET_META && + reg_is_init_pkt_pointer(®s[insn->src_reg], PTR_TO_PACKET)) { + find_good_pkt_pointers(this_branch, dst_reg, PTR_TO_PACKET_META); + } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT && + dst_reg->type == PTR_TO_PACKET_META && + reg_is_init_pkt_pointer(®s[insn->src_reg], PTR_TO_PACKET)) { + find_good_pkt_pointers(other_branch, dst_reg, PTR_TO_PACKET_META); + } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE && + reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && + regs[insn->src_reg].type == PTR_TO_PACKET_META) { + find_good_pkt_pointers(other_branch, ®s[insn->src_reg], + PTR_TO_PACKET_META); + } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE && + reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && + regs[insn->src_reg].type == PTR_TO_PACKET_META) { + find_good_pkt_pointers(this_branch, ®s[insn->src_reg], + PTR_TO_PACKET_META); } else if (is_pointer_value(env, insn->dst_reg)) { verbose("R%d pointer comparison prohibited\n", insn->dst_reg); return -EACCES; @@ -3298,8 +3355,9 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, return false; /* Check our ids match any regs they're supposed to */ return check_ids(rold->id, rcur->id, idmap); + case PTR_TO_PACKET_META: case PTR_TO_PACKET: - if (rcur->type != PTR_TO_PACKET) + if (rcur->type != rold->type) return false; /* We must have at least as much range as the old ptr * did, so that any accesses which were safe before are diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index df672517b4fd..a86e6687026e 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -162,6 +162,7 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, 
const union bpf_attr *kattr, xdp.data_hard_start = data; xdp.data = data + XDP_PACKET_HEADROOM + NET_IP_ALIGN; + xdp.data_meta = xdp.data; xdp.data_end = xdp.data + size; retval = bpf_test_run(prog, &xdp, repeat, &duration); diff --git a/net/core/dev.c b/net/core/dev.c index 97abddd9039a..e350c768d4b5 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3864,8 +3864,8 @@ drop: static u32 netif_receive_generic_xdp(struct sk_buff *skb, struct bpf_prog *xdp_prog) { + u32 metalen, act = XDP_DROP; struct xdp_buff xdp; - u32 act = XDP_DROP; void *orig_data; int hlen, off; u32 mac_len; @@ -3876,8 +3876,25 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, if (skb_cloned(skb)) return XDP_PASS; - if (skb_linearize(skb)) - goto do_drop; + /* XDP packets must be linear and must have sufficient headroom + * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also + * native XDP provides, thus we need to do it here as well. + */ + if (skb_is_nonlinear(skb) || + skb_headroom(skb) < XDP_PACKET_HEADROOM) { + int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb); + int troom = skb->tail + skb->data_len - skb->end; + + /* In case we have to go down the path and also linearize, + * then lets do the pskb_expand_head() work just once here. + */ + if (pskb_expand_head(skb, + hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0, + troom > 0 ? troom + 128 : 0, GFP_ATOMIC)) + goto do_drop; + if (troom > 0 && __skb_linearize(skb)) + goto do_drop; + } /* The XDP program wants to see the packet starting at the MAC * header. @@ -3885,6 +3902,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, mac_len = skb->data - skb_mac_header(skb); hlen = skb_headlen(skb) + mac_len; xdp.data = skb->data - mac_len; + xdp.data_meta = xdp.data; xdp.data_end = xdp.data + hlen; xdp.data_hard_start = skb->data - skb_headroom(skb); orig_data = xdp.data; @@ -3902,10 +3920,12 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, case XDP_REDIRECT: case XDP_TX: __skb_push(skb, mac_len); - /* fall through */ + break; case XDP_PASS: + metalen = xdp.data - xdp.data_meta; + if (metalen) + skb_metadata_set(skb, metalen); break; - default: bpf_warn_invalid_xdp_action(act); /* fall through */ @@ -4695,6 +4715,7 @@ static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb) diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; diffs |= p->vlan_tci ^ skb->vlan_tci; diffs |= skb_metadata_dst_cmp(p, skb); + diffs |= skb_metadata_differs(p, skb); if (maclen == ETH_HLEN) diffs |= compare_ether_header(skb_mac_header(p), skb_mac_header(skb)); diff --git a/net/core/filter.c b/net/core/filter.c index c468e7cfad19..9b6e7e84aafd 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2447,14 +2447,26 @@ static const struct bpf_func_proto bpf_skb_change_head_proto = { .arg3_type = ARG_ANYTHING, }; +static unsigned long xdp_get_metalen(const struct xdp_buff *xdp) +{ + return xdp_data_meta_unsupported(xdp) ? 
0 : + xdp->data - xdp->data_meta; +} + BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset) { + unsigned long metalen = xdp_get_metalen(xdp); + void *data_start = xdp->data_hard_start + metalen; void *data = xdp->data + offset; - if (unlikely(data < xdp->data_hard_start || + if (unlikely(data < data_start || data > xdp->data_end - ETH_HLEN)) return -EINVAL; + if (metalen) + memmove(xdp->data_meta + offset, + xdp->data_meta, metalen); + xdp->data_meta += offset; xdp->data = data; return 0; @@ -2468,6 +2480,33 @@ static const struct bpf_func_proto bpf_xdp_adjust_head_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_2(bpf_xdp_adjust_meta, struct xdp_buff *, xdp, int, offset) +{ + void *meta = xdp->data_meta + offset; + unsigned long metalen = xdp->data - meta; + + if (xdp_data_meta_unsupported(xdp)) + return -ENOTSUPP; + if (unlikely(meta < xdp->data_hard_start || + meta > xdp->data)) + return -EINVAL; + if (unlikely((metalen & (sizeof(__u32) - 1)) || + (metalen > 32))) + return -EACCES; + + xdp->data_meta = meta; + + return 0; +} + +static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = { + .func = bpf_xdp_adjust_meta, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + static int __bpf_tx_xdp(struct net_device *dev, struct bpf_map *map, struct xdp_buff *xdp, @@ -2692,7 +2731,8 @@ bool bpf_helper_changes_pkt_data(void *func) func == bpf_clone_redirect || func == bpf_l3_csum_replace || func == bpf_l4_csum_replace || - func == bpf_xdp_adjust_head) + func == bpf_xdp_adjust_head || + func == bpf_xdp_adjust_meta) return true; return false; @@ -3288,6 +3328,8 @@ xdp_func_proto(enum bpf_func_id func_id) return &bpf_get_smp_processor_id_proto; case BPF_FUNC_xdp_adjust_head: return &bpf_xdp_adjust_head_proto; + case BPF_FUNC_xdp_adjust_meta: + return &bpf_xdp_adjust_meta_proto; case BPF_FUNC_redirect: return &bpf_xdp_redirect_proto; case BPF_FUNC_redirect_map: @@ -3418,6 +3460,7 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type case bpf_ctx_range_till(struct __sk_buff, remote_ip4, remote_ip4): case bpf_ctx_range_till(struct __sk_buff, local_ip4, local_ip4): case bpf_ctx_range(struct __sk_buff, data): + case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, data_end): if (size != size_default) return false; @@ -3444,6 +3487,7 @@ static bool sk_filter_is_valid_access(int off, int size, switch (off) { case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, data): + case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, data_end): case bpf_ctx_range_till(struct __sk_buff, family, local_port): return false; @@ -3468,6 +3512,7 @@ static bool lwt_is_valid_access(int off, int size, switch (off) { case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range_till(struct __sk_buff, family, local_port): + case bpf_ctx_range(struct __sk_buff, data_meta): return false; } @@ -3586,6 +3631,9 @@ static bool tc_cls_act_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, data): info->reg_type = PTR_TO_PACKET; break; + case bpf_ctx_range(struct __sk_buff, data_meta): + info->reg_type = PTR_TO_PACKET_META; + break; case bpf_ctx_range(struct __sk_buff, data_end): info->reg_type = PTR_TO_PACKET_END; break; @@ -3619,6 +3667,9 @@ static bool xdp_is_valid_access(int off, int size, case offsetof(struct xdp_md, data): info->reg_type = PTR_TO_PACKET; break; + case offsetof(struct xdp_md, 
data_meta): + info->reg_type = PTR_TO_PACKET_META; + break; case offsetof(struct xdp_md, data_end): info->reg_type = PTR_TO_PACKET_END; break; @@ -3677,6 +3728,12 @@ static bool sk_skb_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { + switch (off) { + case bpf_ctx_range(struct __sk_buff, tc_classid): + case bpf_ctx_range(struct __sk_buff, data_meta): + return false; + } + if (type == BPF_WRITE) { switch (off) { case bpf_ctx_range(struct __sk_buff, mark): @@ -3689,8 +3746,6 @@ static bool sk_skb_is_valid_access(int off, int size, } switch (off) { - case bpf_ctx_range(struct __sk_buff, tc_classid): - return false; case bpf_ctx_range(struct __sk_buff, data): info->reg_type = PTR_TO_PACKET; break; @@ -3847,6 +3902,15 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, offsetof(struct sk_buff, data)); break; + case offsetof(struct __sk_buff, data_meta): + off = si->off; + off -= offsetof(struct __sk_buff, data_meta); + off += offsetof(struct sk_buff, cb); + off += offsetof(struct bpf_skb_data_end, data_meta); + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, + si->src_reg, off); + break; + case offsetof(struct __sk_buff, data_end): off = si->off; off -= offsetof(struct __sk_buff, data_end); @@ -4095,6 +4159,11 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, si->dst_reg, si->src_reg, offsetof(struct xdp_buff, data)); break; + case offsetof(struct xdp_md, data_meta): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_meta), + si->dst_reg, si->src_reg, + offsetof(struct xdp_buff, data_meta)); + break; case offsetof(struct xdp_md, data_end): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_end), si->dst_reg, si->src_reg, diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 000ce735fa8d..d98c2e3ce2bf 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1509,6 +1509,8 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, skb->nohdr = 0; atomic_set(&skb_shinfo(skb)->dataref, 1); + skb_metadata_clear(skb); + /* It is not generally safe to change skb->truesize. * For the moment, we really care of rx path, or * when skb is orphaned (not attached to a socket). -- cgit v1.2.3
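A minimal sketch of how an XDP/tc program pair might use the new metadata area (illustrative only: the section names, the 0x42 value and the usual samples/bpf helper declarations in "bpf_helpers.h" are assumptions, not part of this patch):

#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include "bpf_helpers.h"

SEC("xdp_mark")
int xdp_mark_prog(struct xdp_md *ctx)
{
	void *data, *data_meta;
	__u32 *mark;

	/* reserve room for one u32 in front of the packet data */
	if (bpf_xdp_adjust_meta(ctx, -(int)sizeof(*mark)))
		return XDP_PASS;

	/* pointers must be reloaded and bounds-checked after the move */
	data      = (void *)(unsigned long)ctx->data;
	data_meta = (void *)(unsigned long)ctx->data_meta;
	mark = data_meta;
	if ((void *)(mark + 1) > data)
		return XDP_PASS;

	*mark = 0x42;	/* value handed to the tc classifier below */
	return XDP_PASS;
}

SEC("tc_mark")
int tc_mark_prog(struct __sk_buff *skb)
{
	void *data      = (void *)(unsigned long)skb->data;
	void *data_meta = (void *)(unsigned long)skb->data_meta;
	__u32 *mark = data_meta;

	if ((void *)(mark + 1) > data)
		return TC_ACT_OK;

	skb->mark = *mark;	/* consume the XDP-provided value */
	return TC_ACT_OK;
}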
From 3b8e9238a8d194e82f0202a5fb68a63686ebe420 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 25 Sep 2017 10:58:21 +0200 Subject: net: sched: introduce helper to identify gact pass action Introduce a helper called is_tcf_gact_ok which can be used to tell whether an action is gact pass (TC_ACT_OK) or not. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/tc_act/tc_gact.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/net/tc_act/tc_gact.h b/include/net/tc_act/tc_gact.h index 41afe1ce7b16..d979a0d48f9e 100644 --- a/include/net/tc_act/tc_gact.h +++ b/include/net/tc_act/tc_gact.h @@ -33,6 +33,11 @@ static inline bool __is_tcf_gact_act(const struct tc_action *a, int act, return false; } +static inline bool is_tcf_gact_ok(const struct tc_action *a) +{ + return __is_tcf_gact_act(a, TC_ACT_OK, false); +} + static inline bool is_tcf_gact_shot(const struct tc_action *a) { return __is_tcf_gact_act(a, TC_ACT_SHOT, false); -- cgit v1.2.3 From 85e482285bbbd508483cbe08de69c8fe00cdbbfe Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Wed, 27 Sep 2017 08:23:11 +0200 Subject: fib: notifier: Add VIF add and delete event types For an interface to forward packets according to the kernel multicast routing table, it must be configured with a VIF index via the mroute user API. The VIF index is then used to refer to that interface in the mroute user API, for example, to set the iif and oifs of an MFC entry. Drivers that want to offload multicast routes therefore have to track the VIF add and delete notifications: a specific VIF can be deleted and re-added pointing at another netdevice, after which the MFC routes that reference it forward their matching packets to the new netdevice. A driver willing to offload MFC cache entries must thus handle the VIF add and delete events in addition to the MFC route notifications. Signed-off-by: Yotam Gigi Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/net/fib_notifier.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/fib_notifier.h b/include/net/fib_notifier.h index 669b9716dc7a..54cd6b839d2f 100644 --- a/include/net/fib_notifier.h +++ b/include/net/fib_notifier.h @@ -20,6 +20,8 @@ enum fib_event_type { FIB_EVENT_RULE_DEL, FIB_EVENT_NH_ADD, FIB_EVENT_NH_DEL, + FIB_EVENT_VIF_ADD, + FIB_EVENT_VIF_DEL, }; struct fib_notifier_ops { -- cgit v1.2.3 From 310ebbba3b7396b00bce08a33f1d2de2c74fa257 Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Wed, 27 Sep 2017 08:23:12 +0200 Subject: ipmr: Add reference count to MFC entries Next commits will introduce MFC notifications through the atomic fib_notification chain, thus allowing modules to be aware of MFC entries. Since modules may need to hold a reference to an MFC entry, add a reference count to MFC entries to prevent them from being freed while these modules use them. The reference counting is currently done only on resolved MFC entries. Signed-off-by: Yotam Gigi Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S. Miller
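As a sketch of the intended hold/put pairing (illustrative; struct mydrv_work and its handler are made-up placeholders, not part of this patch), a module that programs hardware asynchronously would hold the entry for the lifetime of the queued work:

#include <linux/mroute.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct mydrv_work {
	struct work_struct work;
	struct mfc_cache *mfc;
};

static void mydrv_mroute_work(struct work_struct *w)
{
	struct mydrv_work *mw = container_of(w, struct mydrv_work, work);

	/* ... program the hardware from mw->mfc ... */

	ipmr_cache_put(mw->mfc);	/* may free the entry via RCU */
	kfree(mw);
}

static void mydrv_schedule_mroute_work(struct mydrv_work *mw,
				       struct mfc_cache *mfc)
{
	ipmr_cache_hold(mfc);	/* entry must outlive the queued work */
	mw->mfc = mfc;
	INIT_WORK(&mw->work, mydrv_mroute_work);
	schedule_work(&mw->work);
}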
--- include/linux/mroute.h | 21 +++++++++++++++++++++ net/ipv4/ipmr.c | 8 +++++--- 2 files changed, 26 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/mroute.h b/include/linux/mroute.h index d7f63339ef0b..10028f208efb 100644 --- a/include/linux/mroute.h +++ b/include/linux/mroute.h @@ -109,6 +109,7 @@ struct mfc_cache_cmp_arg { * @wrong_if: number of wrong source interface hits * @lastuse: time of last use of the group (traffic or update) * @ttls: OIF TTL threshold array + * @refcount: reference count for this entry * @list: global entry list * @rcu: used for entry destruction */ @@ -138,6 +139,7 @@ struct mfc_cache { unsigned long wrong_if; unsigned long lastuse; unsigned char ttls[MAXVIFS]; + refcount_t refcount; } res; } mfc_un; struct list_head list; @@ -148,4 +150,23 @@ struct rtmsg; int ipmr_get_route(struct net *net, struct sk_buff *skb, __be32 saddr, __be32 daddr, struct rtmsg *rtm, u32 portid); + +#ifdef CONFIG_IP_MROUTE +void ipmr_cache_free(struct mfc_cache *mfc_cache); +#else +static inline void ipmr_cache_free(struct mfc_cache *mfc_cache) +{ +} +#endif + +static inline void ipmr_cache_put(struct mfc_cache *c) +{ + if (refcount_dec_and_test(&c->mfc_un.res.refcount)) + ipmr_cache_free(c); +} +static inline void ipmr_cache_hold(struct mfc_cache *c) +{ + refcount_inc(&c->mfc_un.res.refcount); +} + #endif diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index c9b3e6e069ae..86dc5f98c5dd 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -652,10 +652,11 @@ static void ipmr_cache_free_rcu(struct rcu_head *head) kmem_cache_free(mrt_cachep, c); } -static inline void ipmr_cache_free(struct mfc_cache *c) +void ipmr_cache_free(struct mfc_cache *c) { call_rcu(&c->rcu, ipmr_cache_free_rcu); } +EXPORT_SYMBOL(ipmr_cache_free); /* Destroy an unresolved cache entry, killing queued skbs * and reporting error to netlink readers. @@ -949,6 +950,7 @@ static struct mfc_cache *ipmr_cache_alloc(void) if (c) { c->mfc_un.res.last_assert = jiffies - MFC_ASSERT_THRESH - 1; c->mfc_un.res.minvif = MAXVIFS; + refcount_set(&c->mfc_un.res.refcount, 1); } return c; } @@ -1162,7 +1164,7 @@ static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent) rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params); list_del_rcu(&c->list); mroute_netlink_event(mrt, c, RTM_DELROUTE); - ipmr_cache_free(c); + ipmr_cache_put(c); return 0; } @@ -1264,7 +1266,7 @@ static void mroute_clean_tables(struct mr_table *mrt, bool all) rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params); list_del_rcu(&c->list); mroute_netlink_event(mrt, c, RTM_DELROUTE); - ipmr_cache_free(c); + ipmr_cache_put(c); } if (atomic_read(&mrt->cache_resolve_queue_len) != 0) { -- cgit v1.2.3 From 4d65b9487831170e699b2fc64a91b839d729bd78 Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Wed, 27 Sep 2017 08:23:13 +0200 Subject: ipmr: Add FIB notification access functions Make the ipmr module register as a FIB notifier. To do that, implement both the ipmr_seq_read and ipmr_dump ops. The ipmr_seq_read op returns a sequence counter that is incremented on every notification related operation done by the ipmr. To implement that, add a sequence counter in the netns_ipv4 struct and increment it whenever a new MFC route or VIF are added or deleted. The sequence operations are protected by the RTNL lock. The ipmr_dump iterates the list of MFC routes and the list of VIF entries and sends notifications about them.
The MFC entries dump is done under RCU, while the VIF dump also takes the mrt_lock, as the vif->dev field can change under RCU. Signed-off-by: Yotam Gigi Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/mroute.h | 15 ++++++ include/net/netns/ipv4.h | 3 ++ net/ipv4/ipmr.c | 137 ++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 153 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mroute.h b/include/linux/mroute.h index 10028f208efb..54c5cb82ddcb 100644 --- a/include/linux/mroute.h +++ b/include/linux/mroute.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #ifdef CONFIG_IP_MROUTE @@ -58,6 +59,14 @@ struct vif_device { int link; /* Physical interface index */ }; +struct vif_entry_notifier_info { + struct fib_notifier_info info; + struct net_device *dev; + vifi_t vif_index; + unsigned short vif_flags; + u32 tb_id; +}; + #define VIFF_STATIC 0x8000 #define VIF_EXISTS(_mrt, _idx) ((_mrt)->vif_table[_idx].dev != NULL) @@ -146,6 +155,12 @@ struct mfc_cache { struct rcu_head rcu; }; +struct mfc_entry_notifier_info { + struct fib_notifier_info info; + struct mfc_cache *mfc; + u32 tb_id; +}; + struct rtmsg; int ipmr_get_route(struct net *net, struct sk_buff *skb, __be32 saddr, __be32 daddr, diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 8387f099115e..abc84d986da4 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -163,6 +163,9 @@ struct netns_ipv4 { struct fib_notifier_ops *notifier_ops; unsigned int fib_seq; /* protected by rtnl_mutex */ + struct fib_notifier_ops *ipmr_notifier_ops; + unsigned int ipmr_seq; /* protected by rtnl_mutex */ + atomic_t rt_genid; }; #endif diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 86dc5f98c5dd..49879c338357 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -264,6 +264,16 @@ static void __net_exit ipmr_rules_exit(struct net *net) fib_rules_unregister(net->ipv4.mr_rules_ops); rtnl_unlock(); } + +static int ipmr_rules_dump(struct net *net, struct notifier_block *nb) +{ + return fib_rules_dump(net, nb, RTNL_FAMILY_IPMR); +} + +static unsigned int ipmr_rules_seq_read(struct net *net) +{ + return fib_rules_seq_read(net, RTNL_FAMILY_IPMR); +} #else #define ipmr_for_each_table(mrt, net) \ for (mrt = net->ipv4.mrt; mrt; mrt = NULL) @@ -298,6 +308,16 @@ static void __net_exit ipmr_rules_exit(struct net *net) net->ipv4.mrt = NULL; rtnl_unlock(); } + +static int ipmr_rules_dump(struct net *net, struct notifier_block *nb) +{ + return 0; +} + +static unsigned int ipmr_rules_seq_read(struct net *net) +{ + return 0; +} #endif static inline int ipmr_hash_cmp(struct rhashtable_compare_arg *arg, @@ -587,6 +607,43 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt) } #endif +static int call_ipmr_vif_entry_notifier(struct notifier_block *nb, + struct net *net, + enum fib_event_type event_type, + struct vif_device *vif, + vifi_t vif_index, u32 tb_id) +{ + struct vif_entry_notifier_info info = { + .info = { + .family = RTNL_FAMILY_IPMR, + .net = net, + }, + .dev = vif->dev, + .vif_index = vif_index, + .vif_flags = vif->flags, + .tb_id = tb_id, + }; + + return call_fib_notifier(nb, net, event_type, &info.info); +} + +static int call_ipmr_mfc_entry_notifier(struct notifier_block *nb, + struct net *net, + enum fib_event_type event_type, + struct mfc_cache *mfc, u32 tb_id) +{ + struct mfc_entry_notifier_info info = { + .info = { + .family = RTNL_FAMILY_IPMR, + .net = net, + },
+ .mfc = mfc, + .tb_id = tb_id + }; + + return call_fib_notifier(nb, net, event_type, &info.info); +} + /** * vif_delete - Delete a VIF entry * @notify: Set to 1, if the caller is a notifier_call @@ -3050,14 +3107,87 @@ static const struct net_protocol pim_protocol = { }; #endif +static unsigned int ipmr_seq_read(struct net *net) +{ + ASSERT_RTNL(); + + return net->ipv4.ipmr_seq + ipmr_rules_seq_read(net); +} + +static int ipmr_dump(struct net *net, struct notifier_block *nb) +{ + struct mr_table *mrt; + int err; + + err = ipmr_rules_dump(net, nb); + if (err) + return err; + + ipmr_for_each_table(mrt, net) { + struct vif_device *v = &mrt->vif_table[0]; + struct mfc_cache *mfc; + int vifi; + + /* Notify on table VIF entries */ + read_lock(&mrt_lock); + for (vifi = 0; vifi < mrt->maxvif; vifi++, v++) { + if (!v->dev) + continue; + + call_ipmr_vif_entry_notifier(nb, net, FIB_EVENT_VIF_ADD, + v, vifi, mrt->id); + } + read_unlock(&mrt_lock); + + /* Notify on table MFC entries */ + list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) + call_ipmr_mfc_entry_notifier(nb, net, + FIB_EVENT_ENTRY_ADD, mfc, + mrt->id); + } + + return 0; +} + +static const struct fib_notifier_ops ipmr_notifier_ops_template = { + .family = RTNL_FAMILY_IPMR, + .fib_seq_read = ipmr_seq_read, + .fib_dump = ipmr_dump, + .owner = THIS_MODULE, +}; + +int __net_init ipmr_notifier_init(struct net *net) +{ + struct fib_notifier_ops *ops; + + net->ipv4.ipmr_seq = 0; + + ops = fib_notifier_ops_register(&ipmr_notifier_ops_template, net); + if (IS_ERR(ops)) + return PTR_ERR(ops); + net->ipv4.ipmr_notifier_ops = ops; + + return 0; +} + +static void __net_exit ipmr_notifier_exit(struct net *net) +{ + fib_notifier_ops_unregister(net->ipv4.ipmr_notifier_ops); + net->ipv4.ipmr_notifier_ops = NULL; +} + /* Setup for IP multicast routing */ static int __net_init ipmr_net_init(struct net *net) { int err; + err = ipmr_notifier_init(net); + if (err) + goto ipmr_notifier_fail; + err = ipmr_rules_init(net); if (err < 0) - goto fail; + goto ipmr_rules_fail; #ifdef CONFIG_PROC_FS err = -ENOMEM; @@ -3074,7 +3204,9 @@ proc_cache_fail: proc_vif_fail: ipmr_rules_exit(net); #endif -fail: +ipmr_rules_fail: + ipmr_notifier_exit(net); +ipmr_notifier_fail: return err; } @@ -3084,6 +3216,7 @@ static void __net_exit ipmr_net_exit(struct net *net) remove_proc_entry("ip_mr_cache", net->proc_net); remove_proc_entry("ip_mr_vif", net->proc_net); #endif + ipmr_notifier_exit(net); ipmr_rules_exit(net); } -- cgit v1.2.3 From c7c0bbeae9501a7e42f2fd306d6a6399aca688b6 Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Wed, 27 Sep 2017 08:23:15 +0200 Subject: net: ipmr: Add MFC offload indication Allow drivers registered to the fib notification chain to indicate whether a multicast MFC route is offloaded or not, similarly to unicast routes. The indication of whether a route is offloaded is done using the mfc_flags field on an mfc_cache struct, and the information is sent to userspace via the RTNetlink interface only. Currently, MFC routes are either offloaded or not, thus there is no need to add per-VIF offload indication. Signed-off-by: Yotam Gigi Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S. Miller
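An illustrative driver-side use of the new flag (mydrv_program_mroute() is a placeholder; the notifier info structures are the ones added earlier in this series):

static int mydrv_fib_event(struct notifier_block *nb,
			   unsigned long event, void *ptr)
{
	struct fib_notifier_info *info = ptr;
	struct mfc_entry_notifier_info *men_info;

	if (info->family != RTNL_FAMILY_IPMR ||
	    event != FIB_EVENT_ENTRY_ADD)
		return NOTIFY_DONE;

	men_info = container_of(info, struct mfc_entry_notifier_info, info);

	/* flag the route as offloaded only once the hardware accepted it */
	if (!mydrv_program_mroute(men_info->mfc))
		men_info->mfc->mfc_flags |= MFC_OFFLOAD;

	return NOTIFY_DONE;
}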
--- include/linux/mroute.h | 2 ++ net/ipv4/ipmr.c | 3 +++ 2 files changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/mroute.h b/include/linux/mroute.h index 54c5cb82ddcb..5566580811ce 100644 --- a/include/linux/mroute.h +++ b/include/linux/mroute.h @@ -90,9 +90,11 @@ struct mr_table { /* mfc_flags: * MFC_STATIC - the entry was added statically (not by a routing daemon) + * MFC_OFFLOAD - the entry was offloaded to the hardware */ enum { MFC_STATIC = BIT(0), + MFC_OFFLOAD = BIT(1), }; struct mfc_cache_cmp_arg { diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index ba71bc402336..2a795d2c0502 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -2268,6 +2268,9 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, nla_put_u32(skb, RTA_IIF, mrt->vif_table[c->mfc_parent].dev->ifindex) < 0) return -EMSGSIZE; + if (c->mfc_flags & MFC_OFFLOAD) + rtm->rtm_flags |= RTNH_F_OFFLOAD; + if (!(mp_attr = nla_nest_start(skb, RTA_MULTIPATH))) return -EMSGSIZE; -- cgit v1.2.3 From 478e4c2f0067d57d7c17059caafab026ca32084a Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Wed, 27 Sep 2017 08:23:16 +0200 Subject: net: mroute: Check if rule is a default rule When the ipmr starts, it adds one default FIB rule that matches all packets and sends them to the DEFAULT (multicast) FIB table. A more complex rule can be added by the user to specify that, for a specific interface, a packet should be looked up either in an arbitrary table or according to the l3mdev of the interface. For drivers that are willing to offload the ipmr logic into hardware but do not want to offload all of the FIB rules functionality, provide a function that indicates whether a FIB rule is the default multicast rule, in which case only one routing table is needed. This way, a driver can register to the FIB notification chain, get notified about added FIB rules, and trigger an internal abort mechanism when a non-default rule is added by the user. Signed-off-by: Yotam Gigi Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S. Miller
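A sketch of the intended use (illustrative; struct mydrv and mydrv_abort_mroute_offload() are placeholders, not part of this patch):

static int mydrv_fib_rule_event(struct mydrv *priv,
				struct fib_rule_notifier_info *fr_info,
				unsigned long event)
{
	if (event != FIB_EVENT_RULE_ADD)
		return 0;

	/* anything beyond the default multicast rule would need more
	 * than one routing table, so fall back to software forwarding
	 */
	if (fr_info->info.family == RTNL_FAMILY_IPMR &&
	    !ipmr_rule_default(fr_info->rule))
		mydrv_abort_mroute_offload(priv);

	return 0;
}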
--- include/linux/mroute.h | 7 +++++++ net/ipv4/ipmr.c | 12 ++++++++++++ 2 files changed, 19 insertions(+) (limited to 'include') diff --git a/include/linux/mroute.h b/include/linux/mroute.h index 5566580811ce..b072a84fbe1c 100644 --- a/include/linux/mroute.h +++ b/include/linux/mroute.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -19,6 +20,7 @@ int ip_mroute_getsockopt(struct sock *, int, char __user *, int __user *); int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg); int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg); int ip_mr_init(void); +bool ipmr_rule_default(const struct fib_rule *rule); #else static inline int ip_mroute_setsockopt(struct sock *sock, int optname, char __user *optval, unsigned int optlen) @@ -46,6 +48,11 @@ static inline int ip_mroute_opt(int opt) { return 0; } + +static inline bool ipmr_rule_default(const struct fib_rule *rule) +{ + return true; +} #endif struct vif_device { diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 2a795d2c0502..292a8e80bdfa 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -274,6 +274,12 @@ static unsigned int ipmr_rules_seq_read(struct net *net) { return fib_rules_seq_read(net, RTNL_FAMILY_IPMR); } + +bool ipmr_rule_default(const struct fib_rule *rule) +{ + return fib_rule_matchall(rule) && rule->table == RT_TABLE_DEFAULT; +} +EXPORT_SYMBOL(ipmr_rule_default); #else #define ipmr_for_each_table(mrt, net) \ for (mrt = net->ipv4.mrt; mrt; mrt = NULL) @@ -318,6 +324,12 @@ static unsigned int ipmr_rules_seq_read(struct net *net) { return 0; } + +bool ipmr_rule_default(const struct fib_rule *rule) +{ + return true; +} +EXPORT_SYMBOL(ipmr_rule_default); #endif static inline int ipmr_hash_cmp(struct rhashtable_compare_arg *arg, -- cgit v1.2.3 From 6ade97da601f8af793f6c7a861af754d0f0b6767 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 26 Sep 2017 23:12:28 +0300 Subject: arp: make arp_hdr_len() return unsigned int Negative ARP header lengths are not a thing. Constify arguments while I'm at it. Space savings: add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-3 (-3) function old new delta arpt_do_table 1163 1160 -3 Signed-off-by: Alexey Dobriyan Signed-off-by: David S.
Miller --- drivers/net/bonding/bond_main.c | 3 ++- include/linux/if_arp.h | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index c99dc59d729b..d2e94b8559f0 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -2491,7 +2491,8 @@ int bond_arp_rcv(const struct sk_buff *skb, struct bonding *bond, struct slave *curr_active_slave, *curr_arp_slave; unsigned char *arp_ptr; __be32 sip, tip; - int alen, is_arp = skb->protocol == __cpu_to_be16(ETH_P_ARP); + int is_arp = skb->protocol == __cpu_to_be16(ETH_P_ARP); + unsigned int alen; if (!slave_do_arp_validate(bond, slave)) { if ((slave_do_arp_validate_only(bond) && is_arp) || diff --git a/include/linux/if_arp.h b/include/linux/if_arp.h index 3355efc89781..6756fea18b69 100644 --- a/include/linux/if_arp.h +++ b/include/linux/if_arp.h @@ -31,7 +31,7 @@ static inline struct arphdr *arp_hdr(const struct sk_buff *skb) return (struct arphdr *)skb_network_header(skb); } -static inline int arp_hdr_len(struct net_device *dev) +static inline unsigned int arp_hdr_len(const struct net_device *dev) { switch (dev->type) { #if IS_ENABLED(CONFIG_FIREWIRE_NET) -- cgit v1.2.3 From 5af48b59f35cf712793badabe1a574a0d0ce3bd3 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Wed, 27 Sep 2017 16:12:44 +0300 Subject: net: bridge: add per-port group_fwd_mask with less restrictions We need to be able to transparently forward most link-local frames via tunnels (e.g. vxlan, qinq). Currently the bridge's group_fwd_mask has a mask which restricts the forwarding of STP and LACP, but we need to be able to forward these over tunnels and control that forwarding on a per-port basis thus add a new per-port group_fwd_mask option which only disallows mac pause frames to be forwarded (they're always dropped anyway). The patch does not change the current default situation - all of the others are still restricted unless configured for forwarding. We have successfully tested this patch with LACP and STP forwarding over VxLAN and qinq tunnels. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. 
Miller --- include/uapi/linux/if_link.h | 1 + net/bridge/br_input.c | 1 + net/bridge/br_netlink.c | 14 +++++++++++++- net/bridge/br_private.h | 10 +++++++++- net/bridge/br_sysfs_if.c | 18 ++++++++++++++++++ 5 files changed, 42 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 8d062c58d5cb..ea87bd708ee9 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -325,6 +325,7 @@ enum { IFLA_BRPORT_MCAST_TO_UCAST, IFLA_BRPORT_VLAN_TUNNEL, IFLA_BRPORT_BCAST_FLOOD, + IFLA_BRPORT_GROUP_FWD_MASK, __IFLA_BRPORT_MAX }; #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 7637f58c1226..7cb613776b31 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -289,6 +289,7 @@ rx_handler_result_t br_handle_frame(struct sk_buff **pskb) * * Others reserved for future standardization */ + fwd_mask |= p->group_fwd_mask; switch (dest[5]) { case 0x00: /* Bridge Group Address */ /* If STP is turned off, diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 3bc890716c89..dea88a255d26 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -152,6 +152,7 @@ static inline size_t br_port_info_size(void) #ifdef CONFIG_BRIDGE_IGMP_SNOOPING + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_MULTICAST_ROUTER */ #endif + + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_GROUP_FWD_MASK */ + 0; } @@ -208,7 +209,8 @@ static int br_port_fill_attrs(struct sk_buff *skb, p->topology_change_ack) || nla_put_u8(skb, IFLA_BRPORT_CONFIG_PENDING, p->config_pending) || nla_put_u8(skb, IFLA_BRPORT_VLAN_TUNNEL, !!(p->flags & - BR_VLAN_TUNNEL))) + BR_VLAN_TUNNEL)) || + nla_put_u16(skb, IFLA_BRPORT_GROUP_FWD_MASK, p->group_fwd_mask)) return -EMSGSIZE; timerval = br_timer_value(&p->message_age_timer); @@ -637,6 +639,7 @@ static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = { [IFLA_BRPORT_MCAST_TO_UCAST] = { .type = NLA_U8 }, [IFLA_BRPORT_MCAST_FLOOD] = { .type = NLA_U8 }, [IFLA_BRPORT_BCAST_FLOOD] = { .type = NLA_U8 }, + [IFLA_BRPORT_GROUP_FWD_MASK] = { .type = NLA_U16 }, }; /* Change the state of the port and notify spanning tree */ @@ -773,6 +776,15 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[]) return err; } #endif + + if (tb[IFLA_BRPORT_GROUP_FWD_MASK]) { + u16 fwd_mask = nla_get_u16(tb[IFLA_BRPORT_GROUP_FWD_MASK]); + + if (fwd_mask & BR_GROUPFWD_MACPAUSE) + return -EINVAL; + p->group_fwd_mask = fwd_mask; + } + br_port_flags_change(p, old_flags ^ p->flags); return 0; } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index e870cfc85b14..020c709a017f 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -36,7 +36,14 @@ /* Control of forwarding link local multicast */ #define BR_GROUPFWD_DEFAULT 0 /* Don't allow forwarding of control protocols like STP, MAC PAUSE and LACP */ -#define BR_GROUPFWD_RESTRICTED 0x0007u +enum { + BR_GROUPFWD_STP = BIT(0), + BR_GROUPFWD_MACPAUSE = BIT(1), + BR_GROUPFWD_LACP = BIT(2), +}; + +#define BR_GROUPFWD_RESTRICTED (BR_GROUPFWD_STP | BR_GROUPFWD_MACPAUSE | \ + BR_GROUPFWD_LACP) /* The Nearest Customer Bridge Group Address, 01-80-C2-00-00-[00,0B,0C,0D,0F] */ #define BR_GROUPFWD_8021AD 0xB801u @@ -268,6 +275,7 @@ struct net_bridge_port { #ifdef CONFIG_NET_SWITCHDEV int offload_fwd_mark; #endif + u16 group_fwd_mask; }; #define br_auto_port(p) ((p)->flags & BR_AUTO_MASK) diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c index 
5d5d413a6cf8..9110d5e56085 100644 --- a/net/bridge/br_sysfs_if.c +++ b/net/bridge/br_sysfs_if.c @@ -165,6 +165,23 @@ static int store_flush(struct net_bridge_port *p, unsigned long v) } static BRPORT_ATTR(flush, S_IWUSR, NULL, store_flush); +static ssize_t show_group_fwd_mask(struct net_bridge_port *p, char *buf) +{ + return sprintf(buf, "%#x\n", p->group_fwd_mask); +} + +static int store_group_fwd_mask(struct net_bridge_port *p, + unsigned long v) +{ + if (v & BR_GROUPFWD_MACPAUSE) + return -EINVAL; + p->group_fwd_mask = v; + + return 0; +} +static BRPORT_ATTR(group_fwd_mask, S_IRUGO | S_IWUSR, show_group_fwd_mask, + store_group_fwd_mask); + BRPORT_ATTR_FLAG(hairpin_mode, BR_HAIRPIN_MODE); BRPORT_ATTR_FLAG(bpdu_guard, BR_BPDU_GUARD); BRPORT_ATTR_FLAG(root_block, BR_ROOT_BLOCK); @@ -223,6 +240,7 @@ static const struct brport_attribute *brport_attrs[] = { &brport_attr_proxyarp_wifi, &brport_attr_multicast_flood, &brport_attr_broadcast_flood, + &brport_attr_group_fwd_mask, NULL }; -- cgit v1.2.3 From cb4d2b3f03d8eed90be3a194e5b54b734ec4bbe9 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 27 Sep 2017 14:37:52 -0700 Subject: bpf: Add name, load_time, uid and map_ids to bpf_prog_info The patch adds name and load_time to struct bpf_prog_aux. They are also exported to bpf_prog_info. The bpf_prog's name is passed by userspace during BPF_PROG_LOAD. The kernel only stores the first (BPF_OBJ_NAME_LEN - 1) bytes and the name stored in the kernel is always \0 terminated. The kernel will reject a name that contains characters other than isalnum() and '_'. It will also reject a name that is not null terminated. The existing 'user->uid' of the bpf_prog_aux is also exported to the bpf_prog_info as created_by_uid. The existing 'used_maps' of the bpf_prog_aux is exported to the newly added members 'nr_map_ids' and 'map_ids' of the bpf_prog_info. On the input, nr_map_ids tells how big the userspace's map_ids buffer is. On the output, nr_map_ids tells the exact number of maps used by the program, and the kernel copies at most as many map ids as the userspace buffer allows. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller
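A userspace sketch of querying the new fields (illustrative; assumes a kernel with this patch and uses the raw bpf(2) syscall directly):

#include <stdio.h>
#include <unistd.h>
#include <linux/bpf.h>
#include <sys/syscall.h>

static int print_prog_info(int prog_fd)
{
	__u32 map_ids[16];
	struct bpf_prog_info info = {};
	union bpf_attr attr = {};

	info.nr_map_ids = 16;	/* tell the kernel how big our buffer is */
	info.map_ids = (__u64)(unsigned long)map_ids;

	attr.info.bpf_fd = prog_fd;
	attr.info.info_len = sizeof(info);
	attr.info.info = (__u64)(unsigned long)&info;

	if (syscall(__NR_bpf, BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr)))
		return -1;

	/* name is always NUL terminated by the kernel */
	printf("name=%s uid=%u maps=%u\n", (char *)info.name,
	       info.created_by_uid, info.nr_map_ids);
	return 0;
}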
--- include/linux/bpf.h | 2 ++ include/uapi/linux/bpf.h | 8 ++++++++ kernel/bpf/syscall.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 60 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2b672c50f160..33ccc474fb04 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -187,6 +187,8 @@ struct bpf_prog_aux { struct bpf_map **used_maps; struct bpf_prog *prog; struct user_struct *user; + u64 load_time; /* ns since boottime */ + u8 name[BPF_OBJ_NAME_LEN]; union { struct work_struct work; struct rcu_head rcu; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e43491ac4823..bd6348269bf5 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -175,6 +175,8 @@ enum bpf_attach_type { /* Specify numa node during map creation */ #define BPF_F_NUMA_NODE (1U << 2) +#define BPF_OBJ_NAME_LEN 16U + union bpf_attr { struct { /* anonymous struct used by BPF_MAP_CREATE command */ __u32 map_type; /* one of enum bpf_map_type */ @@ -210,6 +212,7 @@ union bpf_attr { __aligned_u64 log_buf; /* user supplied buffer */ __u32 kern_version; /* checked when prog_type=kprobe */ __u32 prog_flags; + __u8 prog_name[BPF_OBJ_NAME_LEN]; }; struct { /* anonymous struct used by BPF_OBJ_* commands */ @@ -812,6 +815,11 @@ struct bpf_prog_info { __u32 xlated_prog_len; __aligned_u64 jited_prog_insns; __aligned_u64 xlated_prog_insns; + __u64 load_time; /* ns since boottime */ + __u32 created_by_uid; + __u32 nr_map_ids; + __aligned_u64 map_ids; + __u8 name[BPF_OBJ_NAME_LEN]; } __attribute__((aligned(8))); struct bpf_map_info { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 25d074920a00..45970df3f820 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -23,6 +23,9 @@ #include #include #include +#include +#include +#include #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ @@ -312,6 +315,30 @@ int bpf_map_new_fd(struct bpf_map *map) offsetof(union bpf_attr, CMD##_LAST_FIELD) - \ sizeof(attr->CMD##_LAST_FIELD)) != NULL +/* dst and src must each be at least BPF_OBJ_NAME_LEN bytes. + * Return 0 on success and < 0 on error.
+ */ +static int bpf_obj_name_cpy(char *dst, const char *src) +{ + const char *end = src + BPF_OBJ_NAME_LEN; + + /* Copy all isalnum() and '_' char */ + while (src < end && *src) { + if (!isalnum(*src) && *src != '_') + return -EINVAL; + *dst++ = *src++; + } + + /* No '\0' found in BPF_OBJ_NAME_LEN number of bytes */ + if (src == end) + return -EINVAL; + + /* '\0' terminates dst */ + *dst = 0; + + return 0; +} + #define BPF_MAP_CREATE_LAST_FIELD numa_node /* called via syscall */ static int map_create(union bpf_attr *attr) @@ -973,7 +1000,7 @@ struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type) EXPORT_SYMBOL_GPL(bpf_prog_get_type); /* last field in 'union bpf_attr' used by this command */ -#define BPF_PROG_LOAD_LAST_FIELD prog_flags +#define BPF_PROG_LOAD_LAST_FIELD prog_name static int bpf_prog_load(union bpf_attr *attr) { @@ -1037,6 +1064,11 @@ static int bpf_prog_load(union bpf_attr *attr) if (err < 0) goto free_prog; + prog->aux->load_time = ktime_get_boot_ns(); + err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name); + if (err) + goto free_prog; + /* run eBPF verifier */ err = bpf_check(&prog, attr); if (err < 0) @@ -1358,8 +1390,25 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, info.type = prog->type; info.id = prog->aux->id; + info.load_time = prog->aux->load_time; + info.created_by_uid = from_kuid_munged(current_user_ns(), + prog->aux->user->uid); memcpy(info.tag, prog->tag, sizeof(prog->tag)); + memcpy(info.name, prog->aux->name, sizeof(prog->aux->name)); + + ulen = info.nr_map_ids; + info.nr_map_ids = prog->aux->used_map_cnt; + ulen = min_t(u32, info.nr_map_ids, ulen); + if (ulen) { + u32 *user_map_ids = (u32 *)info.map_ids; + u32 i; + + for (i = 0; i < ulen; i++) + if (put_user(prog->aux->used_maps[i]->id, + &user_map_ids[i])) + return -EFAULT; + } if (!capable(CAP_SYS_ADMIN)) { info.jited_prog_len = 0; -- cgit v1.2.3 From ad5b177bd73f5107d97c36f56395c4281fb6f089 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 27 Sep 2017 14:37:53 -0700 Subject: bpf: Add map_name to bpf_map_info This patch allows userspace to specify a name for a map during BPF_MAP_CREATE. The map's name can later be exported to user space via BPF_OBJ_GET_INFO_BY_FD. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 2 ++ kernel/bpf/syscall.c | 7 ++++++- 3 files changed, 9 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 33ccc474fb04..252f4bc9eb25 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -56,6 +56,7 @@ struct bpf_map { struct work_struct work; atomic_t usercnt; struct bpf_map *inner_map_meta; + u8 name[BPF_OBJ_NAME_LEN]; }; /* function argument constraints */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index bd6348269bf5..6d2137b4cf38 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -190,6 +190,7 @@ union bpf_attr { __u32 numa_node; /* numa node (effective only if * BPF_F_NUMA_NODE is set). 
*/ + __u8 map_name[BPF_OBJ_NAME_LEN]; }; struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ @@ -829,6 +830,7 @@ struct bpf_map_info { __u32 value_size; __u32 max_entries; __u32 map_flags; + __u8 name[BPF_OBJ_NAME_LEN]; } __attribute__((aligned(8))); /* User bpf_sock_ops struct to access socket values and specify request ops diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 45970df3f820..11a7f82a55d1 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -339,7 +339,7 @@ static int bpf_obj_name_cpy(char *dst, const char *src) return 0; } -#define BPF_MAP_CREATE_LAST_FIELD numa_node +#define BPF_MAP_CREATE_LAST_FIELD map_name /* called via syscall */ static int map_create(union bpf_attr *attr) { @@ -361,6 +361,10 @@ static int map_create(union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); + err = bpf_obj_name_cpy(map->name, attr->map_name); + if (err) + goto free_map_nouncharge; + atomic_set(&map->refcnt, 1); atomic_set(&map->usercnt, 1); @@ -1462,6 +1466,7 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map, info.value_size = map->value_size; info.max_entries = map->max_entries; info.map_flags = map->map_flags; + memcpy(info.name, map->name, sizeof(map->name)); if (copy_to_user(uinfo, &info, info_len) || put_user(info_len, &uattr->info.info_len)) -- cgit v1.2.3 From c7c3e5913bf18eda3cf38932bebdce48351baac9 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 27 Sep 2017 19:08:00 -0700 Subject: net: ipv4: remove fib_weight fib_weight in fib_info is set but not used. Remove it and the helpers for setting it. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip_fib.h | 3 --- net/ipv4/fib_semantics.c | 9 --------- 2 files changed, 12 deletions(-) (limited to 'include') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 1a7f7e424320..f80524396c06 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -122,9 +122,6 @@ struct fib_info { #define fib_rtt fib_metrics->metrics[RTAX_RTT-1] #define fib_advmss fib_metrics->metrics[RTAX_ADVMSS-1] int fib_nhs; -#ifdef CONFIG_IP_ROUTE_MULTIPATH - int fib_weight; -#endif struct rcu_head rcu; struct fib_nh fib_nh[0]; #define fib_dev fib_nh[0].nh_dev diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 57a5d48acee8..be0874620ecc 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -601,17 +601,9 @@ static void fib_rebalance(struct fib_info *fi) atomic_set(&nexthop_nh->nh_upper_bound, upper_bound); } endfor_nexthops(fi); } - -static inline void fib_add_weight(struct fib_info *fi, - const struct fib_nh *nh) -{ - fi->fib_weight += nh->nh_weight; -} - #else /* CONFIG_IP_ROUTE_MULTIPATH */ #define fib_rebalance(fi) do { } while (0) -#define fib_add_weight(fi, nh) do { } while (0) #endif /* CONFIG_IP_ROUTE_MULTIPATH */ @@ -1275,7 +1267,6 @@ struct fib_info *fib_create_info(struct fib_config *cfg, change_nexthops(fi) { fib_info_update_nh_saddr(net, nexthop_nh); - fib_add_weight(fi, nexthop_nh); } endfor_nexthops(fi) fib_rebalance(fi); -- cgit v1.2.3 From a3f5aa907340b5d7b54223ddbaa90410f168864d Mon Sep 17 00:00:00 2001 From: Alan Brady Date: Fri, 14 Jul 2017 09:27:08 -0400 Subject: i40e: Enable VF to negotiate number of allocated queues Currently the PF allocates a default number of queues for each VF, and this number cannot be changed. This patch enables the VF to request a different number of queues allocated to it. This patch also adds a new virtchnl op and capability flag to facilitate this negotiation. After the PF receives a request message, it will set a requested number of queues for that VF. Then when the VF resets, its VSI will get a new number of queues allocated to it. This is a best effort request; since we only allocate a guaranteed default number, if the VF asks for more than the guaranteed number, there may not be enough left in HW to accommodate it unless queues for other VFs are freed. It should also be noted that decreasing the number of queues allocated to a VF below the default will NOT enable the allocation of more than 32 VFs per PF and will not free queues guaranteed to each VF by default. Signed-off-by: Alan Brady Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher
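On the VF side, a driver sketch of issuing the request might look as follows (illustrative; the adapter layout and mydrv_send_pf_msg() are placeholders, not actual VF driver code):

static int mydrv_request_queues(struct mydrv_adapter *adapter, u16 num)
{
	struct virtchnl_vf_res_request vfres = {
		.num_queue_pairs = num,
	};

	/* only meaningful once the capability has been negotiated */
	if (!(adapter->vf_res->vf_cap_flags & VIRTCHNL_VF_OFFLOAD_REQ_QUEUES))
		return -EOPNOTSUPP;

	/* the PF answers with the granted count; the VF then resets so
	 * its VSI picks up the new allocation
	 */
	return mydrv_send_pf_msg(adapter, VIRTCHNL_OP_REQUEST_QUEUES,
				 (u8 *)&vfres, sizeof(vfres));
}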
--- drivers/net/ethernet/intel/i40e/i40e.h | 1 + drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 75 ++++++++++++++++++++++ drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h | 1 + include/linux/avf/virtchnl.h | 20 ++++++ 4 files changed, 97 insertions(+) (limited to 'include') diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h index b7a539cdca00..439c63cb2a0c 100644 --- a/drivers/net/ethernet/intel/i40e/i40e.h +++ b/drivers/net/ethernet/intel/i40e/i40e.h @@ -77,6 +77,7 @@ #define i40e_default_queues_per_vmdq(pf) \ (((pf)->hw_features & I40E_HW_RSS_AQ_CAPABLE) ? 4 : 1) #define I40E_DEFAULT_QUEUES_PER_VF 4 +#define I40E_MAX_VF_QUEUES 16 #define I40E_DEFAULT_QUEUES_PER_TC 1 /* should be a power of 2 */ #define i40e_pf_get_max_q_per_tc(pf) \ (((pf)->hw_features & I40E_HW_128_QP_RSS_CAPABLE) ? 128 : 64) diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index 4d1e670f490e..a75396c157d9 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -815,6 +815,14 @@ static void i40e_free_vf_res(struct i40e_vf *vf) */ clear_bit(I40E_VF_STATE_INIT, &vf->vf_states); + /* It's possible the VF had requested more queues than the default so + * do the accounting here when we're about to free them. + */ + if (vf->num_queue_pairs > I40E_DEFAULT_QUEUES_PER_VF) { + pf->queues_left += vf->num_queue_pairs - + I40E_DEFAULT_QUEUES_PER_VF; + } + /* free vsi & disconnect it from the parent uplink */ if (vf->lan_vsi_idx) { i40e_vsi_release(pf->vsi[vf->lan_vsi_idx]); @@ -868,12 +876,27 @@ static int i40e_alloc_vf_res(struct i40e_vf *vf) int total_queue_pairs = 0; int ret; + if (vf->num_req_queues && + vf->num_req_queues <= pf->queues_left + I40E_DEFAULT_QUEUES_PER_VF) + pf->num_vf_qps = vf->num_req_queues; + else + pf->num_vf_qps = I40E_DEFAULT_QUEUES_PER_VF; + /* allocate hw vsi context & associated resources */ ret = i40e_alloc_vsi_res(vf, I40E_VSI_SRIOV); if (ret) goto error_alloc; total_queue_pairs += pf->vsi[vf->lan_vsi_idx]->alloc_queue_pairs; + /* We account for each VF to get a default number of queue pairs. If + * the VF has now requested more, we need to account for that to make + * certain we never request more queues than we actually have left in
+ */ + if (total_queue_pairs > I40E_DEFAULT_QUEUES_PER_VF) + pf->queues_left -= + total_queue_pairs - I40E_DEFAULT_QUEUES_PER_VF; + if (vf->trusted) set_bit(I40E_VIRTCHNL_VF_CAP_PRIVILEGE, &vf->vf_caps); else @@ -1579,6 +1602,9 @@ static int i40e_vc_get_vf_resources_msg(struct i40e_vf *vf, u8 *msg) VIRTCHNL_VF_OFFLOAD_WB_ON_ITR; } + if (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_REQ_QUEUES) + vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_REQ_QUEUES; + vfres->num_vsis = num_vsis; vfres->num_queue_pairs = vf->num_queue_pairs; vfres->max_vectors = pf->hw.func_caps.num_msix_vectors_vf; @@ -1986,6 +2012,52 @@ error_param: aq_ret); } +/** + * i40e_vc_request_queues_msg + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer + * @msglen: msg length + * + * VFs get a default number of queues but can use this message to request a + * different number. Will respond with either the number requested or the + * maximum we can support. + **/ +static int i40e_vc_request_queues_msg(struct i40e_vf *vf, u8 *msg, int msglen) +{ + struct virtchnl_vf_res_request *vfres = + (struct virtchnl_vf_res_request *)msg; + int req_pairs = vfres->num_queue_pairs; + int cur_pairs = vf->num_queue_pairs; + struct i40e_pf *pf = vf->pf; + + if (!test_bit(I40E_VF_STATE_ACTIVE, &vf->vf_states)) + return -EINVAL; + + if (req_pairs <= 0) { + dev_err(&pf->pdev->dev, + "VF %d tried to request %d queues. Ignoring.\n", + vf->vf_id, req_pairs); + } else if (req_pairs > I40E_MAX_VF_QUEUES) { + dev_err(&pf->pdev->dev, + "VF %d tried to request more than %d queues.\n", + vf->vf_id, + I40E_MAX_VF_QUEUES); + vfres->num_queue_pairs = I40E_MAX_VF_QUEUES; + } else if (req_pairs - cur_pairs > pf->queues_left) { + dev_warn(&pf->pdev->dev, + "VF %d requested %d more queues, but only %d left.\n", + vf->vf_id, + req_pairs - cur_pairs, + pf->queues_left); + vfres->num_queue_pairs = pf->queues_left + cur_pairs; + } else { + vf->num_req_queues = req_pairs; + } + + return i40e_vc_send_msg_to_vf(vf, VIRTCHNL_OP_REQUEST_QUEUES, 0, + (u8 *)vfres, sizeof(vfres)); +} + /** * i40e_vc_get_stats_msg * @vf: pointer to the VF info @@ -2708,6 +2780,9 @@ int i40e_vc_process_vf_msg(struct i40e_pf *pf, s16 vf_id, u32 v_opcode, case VIRTCHNL_OP_DISABLE_VLAN_STRIPPING: ret = i40e_vc_disable_vlan_stripping(vf, msg, msglen); break; + case VIRTCHNL_OP_REQUEST_QUEUES: + ret = i40e_vc_request_queues_msg(vf, msg, msglen); + break; case VIRTCHNL_OP_UNKNOWN: default: diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h index 1f4b0c504368..5111d05d5f2f 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h @@ -97,6 +97,7 @@ struct i40e_vf { u16 lan_vsi_id; /* ID as used by firmware */ u8 num_queue_pairs; /* num of qps assigned to VF vsis */ + u8 num_req_queues; /* num of requested qps */ u64 num_mdd_events; /* num of mdd events detected */ /* num of continuous malformed or invalid msgs detected */ u64 num_invalid_msgs; diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h index 2b038442c352..60e5d90cb18a 100644 --- a/include/linux/avf/virtchnl.h +++ b/include/linux/avf/virtchnl.h @@ -135,6 +135,7 @@ enum virtchnl_ops { VIRTCHNL_OP_SET_RSS_HENA = 26, VIRTCHNL_OP_ENABLE_VLAN_STRIPPING = 27, VIRTCHNL_OP_DISABLE_VLAN_STRIPPING = 28, + VIRTCHNL_OP_REQUEST_QUEUES = 29, }; /* This macro is used to generate a compilation error if a structure @@ -235,6 +236,7 @@ VIRTCHNL_CHECK_STRUCT_LEN(16, virtchnl_vsi_resource); #define 
VIRTCHNL_VF_OFFLOAD_RSS_AQ 0x00000008 #define VIRTCHNL_VF_OFFLOAD_RSS_REG 0x00000010 #define VIRTCHNL_VF_OFFLOAD_WB_ON_ITR 0x00000020 +#define VIRTCHNL_VF_OFFLOAD_REQ_QUEUES 0x00000040 #define VIRTCHNL_VF_OFFLOAD_VLAN 0x00010000 #define VIRTCHNL_VF_OFFLOAD_RX_POLLING 0x00020000 #define VIRTCHNL_VF_OFFLOAD_RSS_PCTYPE_V2 0x00040000 @@ -325,6 +327,21 @@ struct virtchnl_vsi_queue_config_info { struct virtchnl_queue_pair_info qpair[1]; }; +/* VIRTCHNL_OP_REQUEST_QUEUES + * VF sends this message to request the PF to allocate additional queues to + * this VF. Each VF gets a guaranteed number of queues on init but asking for + * additional queues must be negotiated. This is a best effort request as it + * is possible the PF does not have enough queues left to support the request. + * If the PF cannot support the number requested it will respond with the + * maximum number it is able to support; otherwise it will respond with the + * number requested. + */ + +/* VF resource request */ +struct virtchnl_vf_res_request { + u16 num_queue_pairs; +}; + VIRTCHNL_CHECK_STRUCT_LEN(72, virtchnl_vsi_queue_config_info); /* VIRTCHNL_OP_CONFIG_IRQ_MAP @@ -691,6 +708,9 @@ virtchnl_vc_validate_vf_msg(struct virtchnl_version_info *ver, u32 v_opcode, case VIRTCHNL_OP_ENABLE_VLAN_STRIPPING: case VIRTCHNL_OP_DISABLE_VLAN_STRIPPING: break; + case VIRTCHNL_OP_REQUEST_QUEUES: + valid_len = sizeof(struct virtchnl_vf_res_request); + break; /* These are always errors coming from the VF. */ case VIRTCHNL_OP_EVENT: case VIRTCHNL_OP_UNKNOWN: -- cgit v1.2.3 From 84e14fe353de7624872e582887712079ba0b2d56 Mon Sep 17 00:00:00 2001 From: Maciej Żenczykowski Date: Tue, 26 Sep 2017 21:32:42 -0700 Subject: net-ipv6: add support for sockopt(SOL_IPV6, IPV6_FREEBIND) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit So far we've been relying on sockopt(SOL_IP, IP_FREEBIND) being usable even on IPv6 sockets. However, it turns out it is perfectly reasonable to want to set freebind on an AF_INET6 SOCK_RAW socket - but there is no way to set any SOL_IP socket option on such a socket (they're all blindly errored out). One use case for this is to allow spoofing src ip on a raw socket via sendmsg cmsg. Tested: built, and booted # python >>> import socket >>> SOL_IP = socket.SOL_IP >>> SOL_IPV6 = socket.IPPROTO_IPV6 >>> IP_FREEBIND = 15 >>> IPV6_FREEBIND = 78 >>> s = socket.socket(socket.AF_INET6, socket.SOCK_DGRAM, 0) >>> s.getsockopt(SOL_IP, IP_FREEBIND) 0 >>> s.getsockopt(SOL_IPV6, IPV6_FREEBIND) 0 >>> s.setsockopt(SOL_IPV6, IPV6_FREEBIND, 1) >>> s.getsockopt(SOL_IP, IP_FREEBIND) 1 >>> s.getsockopt(SOL_IPV6, IPV6_FREEBIND) 1 Signed-off-by: Maciej Żenczykowski Signed-off-by: David S. 
Miller --- include/uapi/linux/in6.h | 1 + net/ipv6/ipv6_sockglue.c | 12 ++++++++++++ 2 files changed, 13 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/in6.h b/include/uapi/linux/in6.h index 46444f8fbee4..4f8f3eb0699f 100644 --- a/include/uapi/linux/in6.h +++ b/include/uapi/linux/in6.h @@ -284,6 +284,7 @@ struct in6_flowlabel_req { #define IPV6_TRANSPARENT 75 #define IPV6_UNICAST_IF 76 #define IPV6_RECVFRAGSIZE 77 +#define IPV6_FREEBIND 78 /* * Multicast Routing: diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index a5e466d4e093..b9404feabd78 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -377,6 +377,14 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, retv = 0; break; + case IPV6_FREEBIND: + if (optlen < sizeof(int)) + goto e_inval; + /* we also don't have a separate freebind bit for IPV6 */ + inet_sk(sk)->freebind = valbool; + retv = 0; + break; + case IPV6_RECVORIGDSTADDR: if (optlen < sizeof(int)) goto e_inval; @@ -1214,6 +1222,10 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, val = inet_sk(sk)->transparent; break; + case IPV6_FREEBIND: + val = inet_sk(sk)->freebind; + break; + case IPV6_RECVORIGDSTADDR: val = np->rxopt.bits.rxorigdstaddr; break; -- cgit v1.2.3 From 152402483ed75b167d5628d414e876ffa7a6d4c4 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 29 Sep 2017 17:19:18 -0400 Subject: net: dsa: add tagging ops to port The DSA tagging protocol operations are specific to each CPU port, thus the dsa_device_ops pointer belongs to the dsa_port structure. From now on, assign a slave's xmit copy from its CPU port tagging operations. This will ease the future support for multiple CPU ports. Also keep the tag_ops at the beginning of the dsa_port structure so that we ensure copies for the hot path are in cacheline 1. Signed-off-by: Vivien Didelot Signed-off-by: David S.
Miller --- include/net/dsa.h | 3 +++ net/dsa/dsa2.c | 1 + net/dsa/dsa_priv.h | 2 +- net/dsa/legacy.c | 1 + net/dsa/slave.c | 3 +-- 5 files changed, 7 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 8dee216a5a9b..4d1df2f086e8 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -175,6 +175,9 @@ struct dsa_mall_tc_entry { struct dsa_port { + /* CPU port tagging operations used by master or slave devices */ + const struct dsa_device_ops *tag_ops; + struct dsa_switch *ds; unsigned int index; const char *name; diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 6a10c5c1639f..9eac4726dc0c 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -522,6 +522,7 @@ static int dsa_cpu_parse(struct dsa_port *port, u32 index, return PTR_ERR(tag_ops); } + dst->cpu_dp->tag_ops = tag_ops; dst->tag_ops = tag_ops; dst->rcv = dst->tag_ops->rcv; diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index d429505dc4e7..9397291bb3aa 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -66,7 +66,7 @@ struct dsa_notifier_vlan_info { }; struct dsa_slave_priv { - /* Copy of dp->ds->dst->tag_ops->xmit for faster access in hot path */ + /* Copy of CPU port xmit for faster access in slave transmit hot path */ struct sk_buff * (*xmit)(struct sk_buff *skb, struct net_device *dev); diff --git a/net/dsa/legacy.c b/net/dsa/legacy.c index 8e849013f69d..4d374541815a 100644 --- a/net/dsa/legacy.c +++ b/net/dsa/legacy.c @@ -152,6 +152,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, if (IS_ERR(tag_ops)) return PTR_ERR(tag_ops); + dst->cpu_dp->tag_ops = tag_ops; dst->tag_ops = tag_ops; dst->rcv = dst->tag_ops->rcv; } diff --git a/net/dsa/slave.c b/net/dsa/slave.c index bf8800de13c1..4b634db05cee 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1117,7 +1117,6 @@ int dsa_slave_resume(struct net_device *slave_dev) int dsa_slave_create(struct dsa_port *port, const char *name) { struct dsa_switch *ds = port->ds; - struct dsa_switch_tree *dst = ds->dst; struct net_device *master; struct net_device *slave_dev; struct dsa_slave_priv *p; @@ -1162,7 +1161,7 @@ int dsa_slave_create(struct dsa_port *port, const char *name) } p->dp = port; INIT_LIST_HEAD(&p->mall_tc_list); - p->xmit = dst->tag_ops->xmit; + p->xmit = cpu_dp->tag_ops->xmit; p->old_pause = -1; p->old_link = -1; -- cgit v1.2.3 From 3e41f93b358a8800194b87995ad076fc50919719 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 29 Sep 2017 17:19:19 -0400 Subject: net: dsa: prepare master receive hot path In preparation to make DSA master devices point to their corresponding CPU port instead of the whole tree, add copies of dst and rcv in the dsa_port structure so that we keep fast access in the receive hot path. Also keep the copies at the beginning of the dsa_port structure in order to ensure they are available in cacheline 1. Signed-off-by: Vivien Didelot Signed-off-by: David S. 
Miller --- include/net/dsa.h | 5 +++++ net/dsa/dsa2.c | 4 ++++ net/dsa/legacy.c | 4 ++++ 3 files changed, 13 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 4d1df2f086e8..6bda01fa5747 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -178,6 +178,11 @@ struct dsa_port { /* CPU port tagging operations used by master or slave devices */ const struct dsa_device_ops *tag_ops; + /* Copies for faster access in master receive hot path */ + struct dsa_switch_tree *dst; + struct sk_buff *(*rcv)(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt); + struct dsa_switch *ds; unsigned int index; const char *name; diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 9eac4726dc0c..b71e3bb478e4 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -524,7 +524,11 @@ static int dsa_cpu_parse(struct dsa_port *port, u32 index, dst->cpu_dp->tag_ops = tag_ops; dst->tag_ops = tag_ops; + + /* Make a few copies for faster access in master receive hot path */ + dst->cpu_dp->rcv = dst->cpu_dp->tag_ops->rcv; dst->rcv = dst->tag_ops->rcv; + dst->cpu_dp->dst = dst; return 0; } diff --git a/net/dsa/legacy.c b/net/dsa/legacy.c index 4d374541815a..96c7e3f8b8bb 100644 --- a/net/dsa/legacy.c +++ b/net/dsa/legacy.c @@ -154,7 +154,11 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, dst->cpu_dp->tag_ops = tag_ops; dst->tag_ops = tag_ops; + + /* Few copies for faster access in master receive hot path */ + dst->cpu_dp->rcv = dst->cpu_dp->tag_ops->rcv; dst->rcv = dst->tag_ops->rcv; + dst->cpu_dp->dst = dst; } memcpy(ds->rtable, cd->rtable, sizeof(ds->rtable)); -- cgit v1.2.3 From 2f657a600409f1961d67642fe384a9d4be71d36a Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 29 Sep 2017 17:19:20 -0400 Subject: net: dsa: change dsa_ptr for a dsa_port With DSA, a master net device (CPU facing interface) has a dsa_ptr pointer to which hangs a dsa_switch_tree. This is not correct because a master interface is wired to a dedicated switch port, and because we can theoretically have several master interfaces pointing to several CPU ports of the same switch fabric. Change the master interface's dsa_ptr for the CPU dsa_port pointer. This is a step towards supporting multiple CPU ports. Signed-off-by: Vivien Didelot Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 4 ++-- net/dsa/dsa.c | 6 +++--- net/dsa/dsa2.c | 2 +- net/dsa/dsa_priv.h | 3 ++- net/dsa/legacy.c | 2 +- net/dsa/master.c | 15 +++++---------- 6 files changed, 14 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f535779d9dc1..e1d6ef130611 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -55,7 +55,7 @@ struct netpoll_info; struct device; struct phy_device; -struct dsa_switch_tree; +struct dsa_port; /* 802.11 specific */ struct wireless_dev; @@ -1752,7 +1752,7 @@ struct net_device { struct vlan_info __rcu *vlan_info; #endif #if IS_ENABLED(CONFIG_NET_DSA) - struct dsa_switch_tree *dsa_ptr; + struct dsa_port *dsa_ptr; #endif #if IS_ENABLED(CONFIG_TIPC) struct tipc_bearer __rcu *tipc_ptr; diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 81c852e32821..51ca2a524a27 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -160,12 +160,12 @@ EXPORT_SYMBOL_GPL(dsa_dev_to_net_device); static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *unused) { - struct dsa_switch_tree *dst = dev->dsa_ptr; + struct dsa_port *cpu_dp = dev->dsa_ptr; struct sk_buff *nskb = NULL; struct pcpu_sw_netstats *s; struct dsa_slave_priv *p; - if (unlikely(dst == NULL)) { + if (unlikely(!cpu_dp)) { kfree_skb(skb); return 0; } @@ -174,7 +174,7 @@ static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev, if (!skb) return 0; - nskb = dst->rcv(skb, dev, pt); + nskb = cpu_dp->rcv(skb, dev, pt); if (!nskb) { kfree_skb(skb); return 0; diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index b71e3bb478e4..62302558f38c 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -438,7 +438,7 @@ static int dsa_dst_apply(struct dsa_switch_tree *dst) * sent to the tag format's receive function. */ wmb(); - dst->cpu_dp->netdev->dsa_ptr = dst; + dst->cpu_dp->netdev->dsa_ptr = dst->cpu_dp; err = dsa_master_ethtool_setup(dst->cpu_dp->netdev); if (err) diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 9397291bb3aa..2850077cc9cc 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -116,7 +116,8 @@ void dsa_master_ethtool_restore(struct net_device *dev); static inline struct net_device *dsa_master_get_slave(struct net_device *dev, int device, int port) { - struct dsa_switch_tree *dst = dev->dsa_ptr; + struct dsa_port *cpu_dp = dev->dsa_ptr; + struct dsa_switch_tree *dst = cpu_dp->dst; struct dsa_switch *ds; if (device < 0 || device >= DSA_MAX_SWITCHES) diff --git a/net/dsa/legacy.c b/net/dsa/legacy.c index 96c7e3f8b8bb..71917505a5cc 100644 --- a/net/dsa/legacy.c +++ b/net/dsa/legacy.c @@ -607,7 +607,7 @@ static int dsa_setup_dst(struct dsa_switch_tree *dst, struct net_device *dev, * sent to the tag format's receive function. 
*/ wmb(); - dev->dsa_ptr = dst; + dev->dsa_ptr = dst->cpu_dp; return dsa_master_ethtool_setup(dst->cpu_dp->netdev); } diff --git a/net/dsa/master.c b/net/dsa/master.c index ef15d35f1574..5f3f57e372e0 100644 --- a/net/dsa/master.c +++ b/net/dsa/master.c @@ -16,8 +16,7 @@ static void dsa_master_get_ethtool_stats(struct net_device *dev, struct ethtool_stats *stats, uint64_t *data) { - struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_port *cpu_dp = dst->cpu_dp; + struct dsa_port *cpu_dp = dev->dsa_ptr; const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops; struct dsa_switch *ds = cpu_dp->ds; int port = cpu_dp->index; @@ -34,8 +33,7 @@ static void dsa_master_get_ethtool_stats(struct net_device *dev, static int dsa_master_get_sset_count(struct net_device *dev, int sset) { - struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_port *cpu_dp = dst->cpu_dp; + struct dsa_port *cpu_dp = dev->dsa_ptr; const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops; struct dsa_switch *ds = cpu_dp->ds; int count = 0; @@ -52,8 +50,7 @@ static int dsa_master_get_sset_count(struct net_device *dev, int sset) static void dsa_master_get_strings(struct net_device *dev, uint32_t stringset, uint8_t *data) { - struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_port *cpu_dp = dst->cpu_dp; + struct dsa_port *cpu_dp = dev->dsa_ptr; const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops; struct dsa_switch *ds = cpu_dp->ds; int port = cpu_dp->index; @@ -90,8 +87,7 @@ static void dsa_master_get_strings(struct net_device *dev, uint32_t stringset, int dsa_master_ethtool_setup(struct net_device *dev) { - struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_port *cpu_dp = dst->cpu_dp; + struct dsa_port *cpu_dp = dev->dsa_ptr; struct dsa_switch *ds = cpu_dp->ds; struct ethtool_ops *ops; @@ -114,8 +110,7 @@ int dsa_master_ethtool_setup(struct net_device *dev) void dsa_master_ethtool_restore(struct net_device *dev) { - struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_port *cpu_dp = dst->cpu_dp; + struct dsa_port *cpu_dp = dev->dsa_ptr; dev->ethtool_ops = cpu_dp->orig_ethtool_ops; cpu_dp->orig_ethtool_ops = NULL; -- cgit v1.2.3 From aa193d9b1d7ea6893ce24a9d141f676950563987 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 29 Sep 2017 17:19:21 -0400 Subject: net: dsa: remove tag ops from the switch tree Now that the dsa_ptr is a dsa_port instance, there is no need to keep the tag operations in the dsa_switch_tree structure. Remove it. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 11 ----------- net/dsa/dsa2.c | 2 -- net/dsa/legacy.c | 2 -- 3 files changed, 15 deletions(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 6bda01fa5747..10dceccd9ce8 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -130,11 +130,6 @@ struct dsa_switch_tree { */ struct dsa_platform_data *pd; - /* Copy of tag_ops->rcv for faster access in hot path */ - struct sk_buff * (*rcv)(struct sk_buff *skb, - struct net_device *dev, - struct packet_type *pt); - /* * The switch port to which the CPU is attached. */ @@ -144,12 +139,6 @@ struct dsa_switch_tree { * Data for the individual switch chips. */ struct dsa_switch *ds[DSA_MAX_SWITCHES]; - - /* - * Tagging protocol operations for adding and removing an - * encapsulation tag. 
- */ - const struct dsa_device_ops *tag_ops; }; /* TC matchall action types, only mirroring for now */ diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 62302558f38c..54ed054777bd 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -523,11 +523,9 @@ static int dsa_cpu_parse(struct dsa_port *port, u32 index, } dst->cpu_dp->tag_ops = tag_ops; - dst->tag_ops = tag_ops; /* Make a few copies for faster access in master receive hot path */ dst->cpu_dp->rcv = dst->cpu_dp->tag_ops->rcv; - dst->rcv = dst->tag_ops->rcv; dst->cpu_dp->dst = dst; return 0; } diff --git a/net/dsa/legacy.c b/net/dsa/legacy.c index 71917505a5cc..19ff6e0a21dc 100644 --- a/net/dsa/legacy.c +++ b/net/dsa/legacy.c @@ -153,11 +153,9 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, return PTR_ERR(tag_ops); dst->cpu_dp->tag_ops = tag_ops; - dst->tag_ops = tag_ops; /* Few copies for faster access in master receive hot path */ dst->cpu_dp->rcv = dst->cpu_dp->tag_ops->rcv; - dst->rcv = dst->tag_ops->rcv; dst->cpu_dp->dst = dst; } -- cgit v1.2.3 From e1cfcbe82b4534bd0f99fef92a6d33843fd85e0e Mon Sep 17 00:00:00 2001 From: Haishuang Yan Date: Wed, 27 Sep 2017 11:35:40 +0800 Subject: ipv4: Namespaceify tcp_fastopen knob Applications in different namespaces might require enabling the TCP Fast Open feature independently of the host. This patch series continues making more of the TCP Fast Open related sysctl knobs per net-namespace. Reported-by: Luca BRUNO Signed-off-by: Haishuang Yan Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - net/ipv4/af_inet.c | 7 ++++--- net/ipv4/sysctl_net_ipv4.c | 14 +++++++------- net/ipv4/tcp.c | 4 ++-- net/ipv4/tcp_fastopen.c | 11 +++++------ net/ipv4/tcp_ipv4.c | 2 ++ 7 files changed, 21 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index abc84d986da4..16420ccaef15 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -128,6 +128,7 @@ struct netns_ipv4 { int sysctl_tcp_timestamps; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; + int sysctl_tcp_fastopen; #ifdef CONFIG_NET_L3_MASTER_DEV int sysctl_udp_l3mdev_accept; diff --git a/include/net/tcp.h b/include/net/tcp.h index 770b608c8439..9e414a99034f 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -240,7 +240,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); /* sysctl variables for tcp */ -extern int sysctl_tcp_fastopen; extern int sysctl_tcp_retrans_collapse; extern int sysctl_tcp_stdurg; extern int sysctl_tcp_rfc1337; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index e31108e5ef79..ddd126d120ac 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -195,7 +195,7 @@ int inet_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; unsigned char old_state; - int err; + int err, tcp_fastopen; lock_sock(sk); @@ -217,8 +217,9 @@ int inet_listen(struct socket *sock, int backlog) * because the socket was in TCP_LISTEN state previously but * was shutdown() rather than close().
*/ - if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) && - (sysctl_tcp_fastopen & TFO_SERVER_ENABLE) && + tcp_fastopen = sock_net(sk)->ipv4.sysctl_tcp_fastopen; + if ((tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) && + (tcp_fastopen & TFO_SERVER_ENABLE) && !inet_csk(sk)->icsk_accept_queue.fastopenq.max_qlen) { fastopen_queue_tune(sk, backlog); tcp_fastopen_init_key_once(true); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 0d3c038d7b04..e31e853cf486 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -400,13 +400,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, - { - .procname = "tcp_fastopen", - .data = &sysctl_tcp_fastopen, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, { .procname = "tcp_fastopen_key", .mode = 0600, @@ -1085,6 +1078,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "tcp_fastopen", + .data = &init_net.ipv4.sysctl_tcp_fastopen, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, #ifdef CONFIG_IP_ROUTE_MULTIPATH { .procname = "fib_multipath_use_neigh", diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 5091402720ab..dac56c4ad357 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1126,7 +1126,7 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, struct sockaddr *uaddr = msg->msg_name; int err, flags; - if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) || + if (!(sock_net(sk)->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) || (uaddr && msg->msg_namelen >= sizeof(uaddr->sa_family) && uaddr->sa_family == AF_UNSPEC)) return -EOPNOTSUPP; @@ -2759,7 +2759,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, case TCP_FASTOPEN_CONNECT: if (val > 1 || val < 0) { err = -EINVAL; - } else if (sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) { + } else if (net->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) { if (sk->sk_state == TCP_CLOSE) tp->fastopen_connect = val; else diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index e3c33220c418..31b08ec38cb8 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -9,8 +9,6 @@ #include #include -int sysctl_tcp_fastopen __read_mostly = TFO_CLIENT_ENABLE; - struct tcp_fastopen_context __rcu *tcp_fastopen_ctx; static DEFINE_SPINLOCK(tcp_fastopen_ctx_lock); @@ -279,21 +277,22 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct tcp_fastopen_cookie *foc) { - struct tcp_fastopen_cookie valid_foc = { .len = -1 }; bool syn_data = TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1; + int tcp_fastopen = sock_net(sk)->ipv4.sysctl_tcp_fastopen; + struct tcp_fastopen_cookie valid_foc = { .len = -1 }; struct sock *child; if (foc->len == 0) /* Client requests a cookie */ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENCOOKIEREQD); - if (!((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) && + if (!((tcp_fastopen & TFO_SERVER_ENABLE) && (syn_data || foc->len >= 0) && tcp_fastopen_queue_check(sk))) { foc->len = -1; return NULL; } - if (syn_data && (sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD)) + if (syn_data && (tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD)) goto fastopen; if (foc->len >= 0 && /* Client presents or requests a cookie */ @@ -347,7 +346,7 @@ bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss, return false; } - if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE) { + if (sock_net(sk)->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE) { 
cookie->len = -1; return true; } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index d9416b5162bc..88409b13c9d2 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2472,6 +2472,8 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_window_scaling = 1; net->ipv4.sysctl_tcp_timestamps = 1; + net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; + return 0; fail: tcp_sk_exit(net); -- cgit v1.2.3 From dd000598a39b6937fcefdf143720ec9fb5250e72 Mon Sep 17 00:00:00 2001 From: Haishuang Yan Date: Wed, 27 Sep 2017 11:35:41 +0800 Subject: ipv4: Remove the 'publish' logic in tcp_fastopen_init_key_once MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 'publish' logic is not necessary after commit dfea2aa65424 ("tcp: Do not call tcp_fastopen_reset_cipher from interrupt context"), because tcp_fastopen_cookie_gen no longer calls tcp_fastopen_init_key_once. Signed-off-by: Haishuang Yan Signed-off-by: David S. Miller --- include/net/tcp.h | 2 +- net/ipv4/af_inet.c | 2 +- net/ipv4/sysctl_net_ipv4.c | 5 ----- net/ipv4/tcp.c | 2 +- net/ipv4/tcp_fastopen.c | 4 ++-- 5 files changed, 5 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 9e414a99034f..d9376e2458e9 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1555,7 +1555,7 @@ void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb); struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct tcp_fastopen_cookie *foc); -void tcp_fastopen_init_key_once(bool publish); +void tcp_fastopen_init_key_once(void); bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss, struct tcp_fastopen_cookie *cookie); bool tcp_fastopen_defer_connect(struct sock *sk, int *err); diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index ddd126d120ac..e73ce79d7176 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -222,7 +222,7 @@ int inet_listen(struct socket *sock, int backlog) (tcp_fastopen & TFO_SERVER_ENABLE) && !inet_csk(sk)->icsk_accept_queue.fastopenq.max_qlen) { fastopen_queue_tune(sk, backlog); - tcp_fastopen_init_key_once(true); + tcp_fastopen_init_key_once(); } err = inet_csk_listen_start(sk, backlog); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index e31e853cf486..f6324ead0e19 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -282,11 +282,6 @@ static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write, ret = -EINVAL; goto bad_key; } - /* Generate a dummy secret but don't publish it.
This - * is needed so we don't regenerate a new key on the - * first invocation of tcp_fastopen_cookie_gen - */ - tcp_fastopen_init_key_once(false); tcp_fastopen_reset_cipher(user_key, TCP_FASTOPEN_KEY_LENGTH); } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index dac56c4ad357..4e395452d69f 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2749,7 +2749,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, case TCP_FASTOPEN: if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) { - tcp_fastopen_init_key_once(true); + tcp_fastopen_init_key_once(); fastopen_queue_tune(sk, val); } else { diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 31b08ec38cb8..8c8f0f0af59d 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -13,7 +13,7 @@ struct tcp_fastopen_context __rcu *tcp_fastopen_ctx; static DEFINE_SPINLOCK(tcp_fastopen_ctx_lock); -void tcp_fastopen_init_key_once(bool publish) +void tcp_fastopen_init_key_once(void) { static u8 key[TCP_FASTOPEN_KEY_LENGTH]; @@ -23,7 +23,7 @@ void tcp_fastopen_init_key_once(bool publish) * All call sites of tcp_fastopen_cookie_gen also check * for a valid cookie, so this is an acceptable risk. */ - if (net_get_random_once(key, sizeof(key)) && publish) + if (net_get_random_once(key, sizeof(key))) tcp_fastopen_reset_cipher(key, sizeof(key)); } -- cgit v1.2.3 From 437138485656c41e32b8c63c0987cfa0348be0e6 Mon Sep 17 00:00:00 2001 From: Haishuang Yan Date: Wed, 27 Sep 2017 11:35:42 +0800 Subject: ipv4: Namespaceify tcp_fastopen_key knob Applications in different namespaces might require a different tcp_fastopen_key independently of the host. David Miller pointed out there is a leak without releasing the context of tcp_fastopen_key during netns teardown. So add the release action in the exit_batch path. Tested: 1. Container namespace: # cat /proc/sys/net/ipv4/tcp_fastopen_key: 2817fff2-f803cf97-eadfd1f3-78c0992b cookie key in tcp syn packets: Fast Open Cookie Kind: TCP Fast Open Cookie (34) Length: 10 Fast Open Cookie: 1e5dd82a8c492ca9 2. Host: # cat /proc/sys/net/ipv4/tcp_fastopen_key: 107d7c5f-68eb2ac7-02fb06e6-ed341702 cookie key in tcp syn packets: Fast Open Cookie Kind: TCP Fast Open Cookie (34) Length: 10 Fast Open Cookie: e213c02bf0afbc8a Signed-off-by: Haishuang Yan Signed-off-by: David S.
Miller --- include/net/netns/ipv4.h | 4 +++ include/net/tcp.h | 6 ++--- net/ipv4/af_inet.c | 2 +- net/ipv4/sysctl_net_ipv4.c | 21 ++++++++------- net/ipv4/tcp.c | 2 +- net/ipv4/tcp_fastopen.c | 64 +++++++++++++++++++++++++++++++--------------- net/ipv4/tcp_ipv4.c | 6 +++++ 7 files changed, 70 insertions(+), 35 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 16420ccaef15..7bb9603ff66c 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -36,6 +36,8 @@ struct inet_timewait_death_row { int sysctl_max_tw_buckets; }; +struct tcp_fastopen_context; + struct netns_ipv4 { #ifdef CONFIG_SYSCTL struct ctl_table_header *forw_hdr; @@ -129,6 +131,8 @@ struct netns_ipv4 { struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; + struct tcp_fastopen_context __rcu *tcp_fastopen_ctx; + spinlock_t tcp_fastopen_ctx_lock; #ifdef CONFIG_NET_L3_MASTER_DEV int sysctl_udp_l3mdev_accept; diff --git a/include/net/tcp.h b/include/net/tcp.h index d9376e2458e9..6d25d8305054 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1549,13 +1549,13 @@ struct tcp_fastopen_request { }; void tcp_free_fastopen_req(struct tcp_sock *tp); -extern struct tcp_fastopen_context __rcu *tcp_fastopen_ctx; -int tcp_fastopen_reset_cipher(void *key, unsigned int len); +void tcp_fastopen_ctx_destroy(struct net *net); +int tcp_fastopen_reset_cipher(struct net *net, void *key, unsigned int len); void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb); struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct tcp_fastopen_cookie *foc); -void tcp_fastopen_init_key_once(void); +void tcp_fastopen_init_key_once(struct net *net); bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss, struct tcp_fastopen_cookie *cookie); bool tcp_fastopen_defer_connect(struct sock *sk, int *err); diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index e73ce79d7176..43a1bbed7a42 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -222,7 +222,7 @@ int inet_listen(struct socket *sock, int backlog) (tcp_fastopen & TFO_SERVER_ENABLE) && !inet_csk(sk)->icsk_accept_queue.fastopenq.max_qlen) { fastopen_queue_tune(sk, backlog); - tcp_fastopen_init_key_once(); + tcp_fastopen_init_key_once(sock_net(sk)); } err = inet_csk_listen_start(sk, backlog); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index f6324ead0e19..20e19fe78dbd 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -251,10 +251,12 @@ static int proc_allowed_congestion_control(struct ctl_table *ctl, return ret; } -static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write, +static int proc_tcp_fastopen_key(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { + struct net *net = container_of(table->data, struct net, + ipv4.sysctl_tcp_fastopen); struct ctl_table tbl = { .maxlen = (TCP_FASTOPEN_KEY_LENGTH * 2 + 10) }; struct tcp_fastopen_context *ctxt; int ret; @@ -265,7 +267,7 @@ static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write, return -ENOMEM; rcu_read_lock(); - ctxt = rcu_dereference(tcp_fastopen_ctx); + ctxt = rcu_dereference(net->ipv4.tcp_fastopen_ctx); if (ctxt) memcpy(user_key, ctxt->key, TCP_FASTOPEN_KEY_LENGTH); else @@ -282,7 +284,7 @@ static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write, ret = -EINVAL; goto bad_key; } - tcp_fastopen_reset_cipher(user_key, TCP_FASTOPEN_KEY_LENGTH); + 
tcp_fastopen_reset_cipher(net, user_key, TCP_FASTOPEN_KEY_LENGTH); } bad_key: @@ -395,12 +397,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, - { - .procname = "tcp_fastopen_key", - .mode = 0600, - .maxlen = ((TCP_FASTOPEN_KEY_LENGTH * 2) + 10), - .proc_handler = proc_tcp_fastopen_key, - }, { .procname = "tcp_fastopen_blackhole_timeout_sec", .data = &sysctl_tcp_fastopen_blackhole_timeout, @@ -1080,6 +1076,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "tcp_fastopen_key", + .mode = 0600, + .data = &init_net.ipv4.sysctl_tcp_fastopen, + .maxlen = ((TCP_FASTOPEN_KEY_LENGTH * 2) + 10), + .proc_handler = proc_tcp_fastopen_key, + }, #ifdef CONFIG_IP_ROUTE_MULTIPATH { .procname = "fib_multipath_use_neigh", diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 4e395452d69f..23225c98d287 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2749,7 +2749,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, case TCP_FASTOPEN: if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) { - tcp_fastopen_init_key_once(); + tcp_fastopen_init_key_once(net); fastopen_queue_tune(sk, val); } else { diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 8c8f0f0af59d..4eae44ac3cb0 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -9,13 +9,18 @@ #include #include -struct tcp_fastopen_context __rcu *tcp_fastopen_ctx; - -static DEFINE_SPINLOCK(tcp_fastopen_ctx_lock); - -void tcp_fastopen_init_key_once(void) +void tcp_fastopen_init_key_once(struct net *net) { - static u8 key[TCP_FASTOPEN_KEY_LENGTH]; + u8 key[TCP_FASTOPEN_KEY_LENGTH]; + struct tcp_fastopen_context *ctxt; + + rcu_read_lock(); + ctxt = rcu_dereference(net->ipv4.tcp_fastopen_ctx); + if (ctxt) { + rcu_read_unlock(); + return; + } + rcu_read_unlock(); /* tcp_fastopen_reset_cipher publishes the new context * atomically, so we allow this race happening here. @@ -23,8 +28,8 @@ void tcp_fastopen_init_key_once(void) * All call sites of tcp_fastopen_cookie_gen also check * for a valid cookie, so this is an acceptable risk. 
*/ - if (net_get_random_once(key, sizeof(key))) - tcp_fastopen_reset_cipher(key, sizeof(key)); + get_random_bytes(key, sizeof(key)); + tcp_fastopen_reset_cipher(net, key, sizeof(key)); } static void tcp_fastopen_ctx_free(struct rcu_head *head) @@ -35,7 +40,22 @@ static void tcp_fastopen_ctx_free(struct rcu_head *head) kfree(ctx); } -int tcp_fastopen_reset_cipher(void *key, unsigned int len) +void tcp_fastopen_ctx_destroy(struct net *net) +{ + struct tcp_fastopen_context *ctxt; + + spin_lock(&net->ipv4.tcp_fastopen_ctx_lock); + + ctxt = rcu_dereference_protected(net->ipv4.tcp_fastopen_ctx, + lockdep_is_held(&net->ipv4.tcp_fastopen_ctx_lock)); + rcu_assign_pointer(net->ipv4.tcp_fastopen_ctx, NULL); + spin_unlock(&net->ipv4.tcp_fastopen_ctx_lock); + + if (ctxt) + call_rcu(&ctxt->rcu, tcp_fastopen_ctx_free); +} + +int tcp_fastopen_reset_cipher(struct net *net, void *key, unsigned int len) { int err; struct tcp_fastopen_context *ctx, *octx; @@ -59,26 +79,27 @@ error: kfree(ctx); } memcpy(ctx->key, key, len); - spin_lock(&tcp_fastopen_ctx_lock); + spin_lock(&net->ipv4.tcp_fastopen_ctx_lock); - octx = rcu_dereference_protected(tcp_fastopen_ctx, - lockdep_is_held(&tcp_fastopen_ctx_lock)); - rcu_assign_pointer(tcp_fastopen_ctx, ctx); - spin_unlock(&tcp_fastopen_ctx_lock); + octx = rcu_dereference_protected(net->ipv4.tcp_fastopen_ctx, + lockdep_is_held(&net->ipv4.tcp_fastopen_ctx_lock)); + rcu_assign_pointer(net->ipv4.tcp_fastopen_ctx, ctx); + spin_unlock(&net->ipv4.tcp_fastopen_ctx_lock); if (octx) call_rcu(&octx->rcu, tcp_fastopen_ctx_free); return err; } -static bool __tcp_fastopen_cookie_gen(const void *path, +static bool __tcp_fastopen_cookie_gen(struct net *net, + const void *path, struct tcp_fastopen_cookie *foc) { struct tcp_fastopen_context *ctx; bool ok = false; rcu_read_lock(); - ctx = rcu_dereference(tcp_fastopen_ctx); + ctx = rcu_dereference(net->ipv4.tcp_fastopen_ctx); if (ctx) { crypto_cipher_encrypt_one(ctx->tfm, foc->val, path); foc->len = TCP_FASTOPEN_COOKIE_SIZE; @@ -94,7 +115,8 @@ static bool __tcp_fastopen_cookie_gen(const void *path, * * XXX (TFO) - refactor when TCP_FASTOPEN_COOKIE_SIZE != AES_BLOCK_SIZE. 
*/ -static bool tcp_fastopen_cookie_gen(struct request_sock *req, +static bool tcp_fastopen_cookie_gen(struct net *net, + struct request_sock *req, struct sk_buff *syn, struct tcp_fastopen_cookie *foc) { @@ -102,7 +124,7 @@ static bool tcp_fastopen_cookie_gen(struct net *net, const struct iphdr *iph = ip_hdr(syn); __be32 path[4] = { iph->saddr, iph->daddr, 0, 0 }; - return __tcp_fastopen_cookie_gen(path, foc); + return __tcp_fastopen_cookie_gen(net, path, foc); } #if IS_ENABLED(CONFIG_IPV6) @@ -110,13 +132,13 @@ static bool tcp_fastopen_cookie_gen(struct net *net, const struct ipv6hdr *ip6h = ipv6_hdr(syn); struct tcp_fastopen_cookie tmp; - if (__tcp_fastopen_cookie_gen(&ip6h->saddr, &tmp)) { + if (__tcp_fastopen_cookie_gen(net, &ip6h->saddr, &tmp)) { struct in6_addr *buf = &tmp.addr; int i; for (i = 0; i < 4; i++) buf->s6_addr32[i] ^= ip6h->daddr.s6_addr32[i]; - return __tcp_fastopen_cookie_gen(buf, foc); + return __tcp_fastopen_cookie_gen(net, buf, foc); } } #endif @@ -296,7 +318,7 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, goto fastopen; if (foc->len >= 0 && /* Client presents or requests a cookie */ - tcp_fastopen_cookie_gen(req, skb, &valid_foc) && + tcp_fastopen_cookie_gen(sock_net(sk), req, skb, &valid_foc) && foc->len == TCP_FASTOPEN_COOKIE_SIZE && foc->len == valid_foc.len && !memcmp(foc->val, valid_foc.val, foc->len)) { diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 88409b13c9d2..49c74c0d0d21 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2473,6 +2473,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_timestamps = 1; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; + spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); return 0; fail: @@ -2483,7 +2484,12 @@ fail: static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) { + struct net *net; + inet_twsk_purge(&tcp_hashinfo, AF_INET); + + list_for_each_entry(net, net_exit_list, exit_list) + tcp_fastopen_ctx_destroy(net); } static struct pernet_operations __net_initdata tcp_sk_ops = { -- cgit v1.2.3 From 3733be14a32bae288b61ed28341e593baba983af Mon Sep 17 00:00:00 2001 From: Haishuang Yan Date: Wed, 27 Sep 2017 11:35:43 +0800 Subject: ipv4: Namespaceify tcp_fastopen_blackhole_timeout knob Applications in different namespaces might require a different time period in seconds to disable Fastopen on active TCP sockets. Tested: Simulate the following situation in which the server's data gets dropped after the 3WHS. C ---- syn-data ---> S C <--- syn/ack ----- S C ---- ack --------> S S (accept & write) C? X <- data ------ S [retry and timeout] Then print the TCPFastOpenBlackhole counter from netstat; it increased as expected when the firewall blackhole issue was detected and active TFO was disabled. # cat /proc/net/netstat | awk '{print $91}' TCPFastOpenBlackhole 1 Signed-off-by: Haishuang Yan Signed-off-by: David S.
Miller --- include/net/netns/ipv4.h | 3 +++ net/ipv4/sysctl_net_ipv4.c | 20 +++++++++++--------- net/ipv4/tcp_fastopen.c | 30 +++++++++++------------------- net/ipv4/tcp_ipv4.c | 2 ++ 4 files changed, 27 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 7bb9603ff66c..2c4222a5d102 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -133,6 +133,9 @@ struct netns_ipv4 { int sysctl_tcp_fastopen; struct tcp_fastopen_context __rcu *tcp_fastopen_ctx; spinlock_t tcp_fastopen_ctx_lock; + unsigned int sysctl_tcp_fastopen_blackhole_timeout; + atomic_t tfo_active_disable_times; + unsigned long tfo_active_disable_stamp; #ifdef CONFIG_NET_L3_MASTER_DEV int sysctl_udp_l3mdev_accept; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 20e19fe78dbd..cac8dd309f39 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -355,11 +355,13 @@ static int proc_tfo_blackhole_detect_timeout(struct ctl_table *table, void __user *buffer, size_t *lenp, loff_t *ppos) { + struct net *net = container_of(table->data, struct net, + ipv4.sysctl_tcp_fastopen_blackhole_timeout); int ret; ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (write && ret == 0) - tcp_fastopen_active_timeout_reset(); + atomic_set(&net->ipv4.tfo_active_disable_times, 0); return ret; } @@ -397,14 +399,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, - { - .procname = "tcp_fastopen_blackhole_timeout_sec", - .data = &sysctl_tcp_fastopen_blackhole_timeout, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_tfo_blackhole_detect_timeout, - .extra1 = &zero, - }, { .procname = "tcp_abort_on_overflow", .data = &sysctl_tcp_abort_on_overflow, @@ -1083,6 +1077,14 @@ static struct ctl_table ipv4_net_table[] = { .maxlen = ((TCP_FASTOPEN_KEY_LENGTH * 2) + 10), .proc_handler = proc_tcp_fastopen_key, }, + { + .procname = "tcp_fastopen_blackhole_timeout_sec", + .data = &init_net.ipv4.sysctl_tcp_fastopen_blackhole_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_tfo_blackhole_detect_timeout, + .extra1 = &zero, + }, #ifdef CONFIG_IP_ROUTE_MULTIPATH { .procname = "fib_multipath_use_neigh", diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 4eae44ac3cb0..de470e7e586f 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -422,25 +422,16 @@ EXPORT_SYMBOL(tcp_fastopen_defer_connect); * TFO connection with data exchanges. 
*/ -/* Default to 1hr */ -unsigned int sysctl_tcp_fastopen_blackhole_timeout __read_mostly = 60 * 60; -static atomic_t tfo_active_disable_times __read_mostly = ATOMIC_INIT(0); -static unsigned long tfo_active_disable_stamp __read_mostly; - /* Disable active TFO and record current jiffies and * tfo_active_disable_times */ void tcp_fastopen_active_disable(struct sock *sk) { - atomic_inc(&tfo_active_disable_times); - tfo_active_disable_stamp = jiffies; - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENBLACKHOLE); -} + struct net *net = sock_net(sk); -/* Reset tfo_active_disable_times to 0 */ -void tcp_fastopen_active_timeout_reset(void) -{ - atomic_set(&tfo_active_disable_times, 0); + atomic_inc(&net->ipv4.tfo_active_disable_times); + net->ipv4.tfo_active_disable_stamp = jiffies; + NET_INC_STATS(net, LINUX_MIB_TCPFASTOPENBLACKHOLE); } /* Calculate timeout for tfo active disable @@ -449,17 +440,18 @@ void tcp_fastopen_active_timeout_reset(void) */ bool tcp_fastopen_active_should_disable(struct sock *sk) { - int tfo_da_times = atomic_read(&tfo_active_disable_times); - int multiplier; + unsigned int tfo_bh_timeout = sock_net(sk)->ipv4.sysctl_tcp_fastopen_blackhole_timeout; + int tfo_da_times = atomic_read(&sock_net(sk)->ipv4.tfo_active_disable_times); unsigned long timeout; + int multiplier; if (!tfo_da_times) return false; /* Limit timout to max: 2^6 * initial timeout */ multiplier = 1 << min(tfo_da_times - 1, 6); - timeout = multiplier * sysctl_tcp_fastopen_blackhole_timeout * HZ; - if (time_before(jiffies, tfo_active_disable_stamp + timeout)) + timeout = multiplier * tfo_bh_timeout * HZ; + if (time_before(jiffies, sock_net(sk)->ipv4.tfo_active_disable_stamp + timeout)) return true; /* Mark check bit so we can check for successful active TFO @@ -495,10 +487,10 @@ void tcp_fastopen_active_disable_ofo_check(struct sock *sk) } } } else if (tp->syn_fastopen_ch && - atomic_read(&tfo_active_disable_times)) { + atomic_read(&sock_net(sk)->ipv4.tfo_active_disable_times)) { dst = sk_dst_get(sk); if (!(dst && dst->dev && (dst->dev->flags & IFF_LOOPBACK))) - tcp_fastopen_active_timeout_reset(); + atomic_set(&sock_net(sk)->ipv4.tfo_active_disable_times, 0); dst_release(dst); } } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 49c74c0d0d21..ad3b5bbaf942 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2474,6 +2474,8 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); + net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60; + atomic_set(&net->ipv4.tfo_active_disable_times, 0); return 0; fail: -- cgit v1.2.3 From b80ccfe9bbcac70e66fdfaef73f0988a27f9a68c Mon Sep 17 00:00:00 2001 From: Maciej Żenczykowski Date: Tue, 26 Sep 2017 20:37:22 -0700 Subject: net-ipv6: remove unused IP6_ECN_clear() function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This function is unused, and furthermore it is buggy since it suffers from the same issue that requires IP6_ECN_set_ce() to take a pointer to the skb so that it may (in case of CHECKSUM_COMPLETE) update skb->csum Instead of fixing it, let's just outright remove it. Tested: builds, and 'git grep IP6_ECN_clear' comes up empty Signed-off-by: Maciej Żenczykowski Signed-off-by: David S. 
Miller --- include/net/inet_ecn.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/net/inet_ecn.h b/include/net/inet_ecn.h index dce2d586d9ce..f5ff16d72fe6 100644 --- a/include/net/inet_ecn.h +++ b/include/net/inet_ecn.h @@ -133,11 +133,6 @@ static inline int IP6_ECN_set_ce(struct sk_buff *skb, struct ipv6hdr *iph) return 1; } -static inline void IP6_ECN_clear(struct ipv6hdr *iph) -{ - *(__be32*)iph &= ~htonl(INET_ECN_MASK << 20); -} - static inline void ipv6_copy_dscp(unsigned int dscp, struct ipv6hdr *inner) { dscp &= ~INET_ECN_MASK; -- cgit v1.2.3 From 66b1bedf662518e9b6367990a87e9601b35a94c1 Mon Sep 17 00:00:00 2001 From: Avraham Stern Date: Fri, 29 Sep 2017 14:21:14 +0300 Subject: ieee80211: Add WFA TPC report element OUI type Add Transmit Power Control OUI type definition for WLAN_OUI_MICROSOFT. Signed-off-by: Avraham Stern Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 55a604ad459f..ee6657a0ed69 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -2445,6 +2445,7 @@ enum ieee80211_sa_query_action { #define WLAN_OUI_TYPE_MICROSOFT_WPA 1 #define WLAN_OUI_TYPE_MICROSOFT_WMM 2 #define WLAN_OUI_TYPE_MICROSOFT_WPS 4 +#define WLAN_OUI_TYPE_MICROSOFT_TPC 8 /* * WMM/802.11e Tspec Element -- cgit v1.2.3 From 503c1fb98ba3859c13863957c7c65c92371a9e50 Mon Sep 17 00:00:00 2001 From: Avraham Stern Date: Fri, 29 Sep 2017 14:21:49 +0200 Subject: cfg80211/nl80211: add a port authorized event Add an event that indicates that a connection is authorized (i.e. the 4 way handshake was performed by the driver). This event should be sent by the driver after sending a connect/roamed event. This is useful for networks that require 802.1X authentication. In cases that the driver supports 4 way handshake offload, but the 802.1X authentication is managed by user space, the driver needs to inform user space right after the 802.11 association was completed so user space can initialize its 802.1X state machine etc. However, it is also possible that the AP will choose to skip the 802.1X authentication (e.g. when PMKSA caching is used) and proceed with the 4 way handshake immediately. In this case the driver needs to inform user space that 802.1X authentication is no longer required (e.g. to prevent user space from disconnecting since it did not get any EAPOLs from the AP). This is also useful for roaming, in which case it is possible that the driver used the Fast Transition protocol so 802.1X is not required. Since there will now be a dedicated notification indicating that the connection is authorized, the authorized flag can be removed from the roamed event. Drivers can send the new port authorized event right after sending the roamed event to indicate the new AP is already authorized. This therefore reserves the old PORT_AUTHORIZED attribute. 
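To make the expected driver call flow concrete, here is a minimal sketch (an editorial illustration, not part of this patch; the driver function and the firmware-event context are hypothetical, and only cfg80211_port_authorized() itself comes from this change):

#include <net/cfg80211.h>

/* Hypothetical driver snippet: called once the device firmware reports
 * that the offloaded 4 way handshake completed for the current BSS. The
 * connect/roamed event for this BSSID must already have been sent.
 */
static void drv_fw_handshake_done(struct net_device *dev, const u8 *bssid)
{
	/* GFP_KERNEL assumes a sleepable context; an atomic caller would
	 * pass GFP_ATOMIC instead.
	 */
	cfg80211_port_authorized(dev, bssid, GFP_KERNEL);
}

If the offloaded handshake fails, the driver is expected to indicate a disconnection instead, as the NL80211_CMD_PORT_AUTHORIZED documentation above notes.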
Signed-off-by: Avraham Stern Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 21 +++++++++++++++++---- include/uapi/linux/nl80211.h | 28 ++++++++++++++++----------- net/wireless/core.h | 5 +++++ net/wireless/nl80211.c | 34 ++++++++++++++++++++++++++++++--- net/wireless/nl80211.h | 2 ++ net/wireless/sme.c | 45 +++++++++++++++++++++++++++++++++++++++++++- net/wireless/util.c | 3 +++ 7 files changed, 119 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index cc1996081463..8b8118a7fadb 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -5428,9 +5428,6 @@ cfg80211_connect_timeout(struct net_device *dev, const u8 *bssid, * @req_ie_len: association request IEs length * @resp_ie: association response IEs (may be %NULL) * @resp_ie_len: assoc response IEs length - * @authorized: true if the 802.1X authentication was done by the driver or is - * not needed (e.g., when Fast Transition protocol was used), false - * otherwise. Ignored for networks that don't use 802.1X authentication. */ struct cfg80211_roam_info { struct ieee80211_channel *channel; @@ -5440,7 +5437,6 @@ struct cfg80211_roam_info { size_t req_ie_len; const u8 *resp_ie; size_t resp_ie_len; - bool authorized; }; /** @@ -5464,6 +5460,23 @@ struct cfg80211_roam_info { void cfg80211_roamed(struct net_device *dev, struct cfg80211_roam_info *info, gfp_t gfp); +/** + * cfg80211_port_authorized - notify cfg80211 of successful security association + * + * @dev: network device + * @bssid: the BSSID of the AP + * @gfp: allocation flags + * + * This function should be called by a driver that supports 4 way handshake + * offload after a security association was successfully established (i.e., + * the 4 way handshake was completed successfully). The call to this function + * should be preceded with a call to cfg80211_connect_result(), + * cfg80211_connect_done(), cfg80211_connect_bss() or cfg80211_roamed() to + * indicate the 802.11 association. + */ +void cfg80211_port_authorized(struct net_device *dev, const u8 *bssid, + gfp_t gfp); + /** * cfg80211_disconnected - notify cfg80211 that connection was dropped * diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 59ba6ca66a0d..95832ce03a44 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -569,13 +569,14 @@ * authentication/association or not receiving a response from the AP. * Non-zero %NL80211_ATTR_STATUS_CODE value is indicated in that case as * well to remain backwards compatible. - * @NL80211_CMD_ROAM: notifcation indicating the card/driver roamed by itself. - * When the driver roamed in a network that requires 802.1X authentication, - * %NL80211_ATTR_PORT_AUTHORIZED should be set if the 802.1X authentication - * was done by the driver or if roaming was done using Fast Transition - * protocol (in which case 802.1X authentication is not needed). If - * %NL80211_ATTR_PORT_AUTHORIZED is not set, user space is responsible for - * the 802.1X authentication. + * When establishing a security association, drivers that support 4 way + * handshake offload should send %NL80211_CMD_PORT_AUTHORIZED event when + * the 4 way handshake is completed successfully. + * @NL80211_CMD_ROAM: Notification indicating the card/driver roamed by itself. + * When a security association was established with the new AP (e.g. 
if + * the FT protocol was used for roaming or the driver completed the 4 way + * handshake), this event should be followed by an + * %NL80211_CMD_PORT_AUTHORIZED event. * @NL80211_CMD_DISCONNECT: drop a given connection; also used to notify * userspace that a connection was dropped by the AP or due to other * reasons, for this the %NL80211_ATTR_DISCONNECTED_BY_AP and @@ -982,6 +983,12 @@ * @NL80211_CMD_DEL_PMK: For offloaded 4-Way handshake, delete the previously * configured PMK for the authenticator address identified by * &NL80211_ATTR_MAC. + * @NL80211_CMD_PORT_AUTHORIZED: An event that indicates that the 4 way + * handshake was completed successfully by the driver. The BSSID is + * specified with &NL80211_ATTR_MAC. Drivers that support 4 way handshake + * offload should send this event after indicating 802.11 association with + * &NL80211_CMD_CONNECT or &NL80211_CMD_ROAM. If the 4 way handshake failed + * &NL80211_CMD_DISCONNECT should be indicated instead. * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use @@ -1185,6 +1192,8 @@ enum nl80211_commands { NL80211_CMD_SET_PMK, NL80211_CMD_DEL_PMK, + NL80211_CMD_PORT_AUTHORIZED, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ @@ -2138,10 +2147,7 @@ enum nl80211_commands { * in %NL80211_CMD_CONNECT to indicate that for 802.1X authentication it * wants to use the supported offload of the 4-way handshake. * @NL80211_ATTR_PMKR0_NAME: PMK-R0 Name for offloaded FT. - * @NL80211_ATTR_PORT_AUTHORIZED: flag attribute used in %NL80211_CMD_ROAMED - * notification indicating that that 802.1X authentication was done by - * the driver or is not needed (because roaming used the Fast Transition - * protocol). + * @NL80211_ATTR_PORT_AUTHORIZED: (reserved) * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined diff --git a/net/wireless/core.h b/net/wireless/core.h index 6e809325af3b..35165f42c2a8 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -216,6 +216,7 @@ enum cfg80211_event_type { EVENT_DISCONNECTED, EVENT_IBSS_JOINED, EVENT_STOPPED, + EVENT_PORT_AUTHORIZED, }; struct cfg80211_event { @@ -235,6 +236,9 @@ struct cfg80211_event { u8 bssid[ETH_ALEN]; struct ieee80211_channel *channel; } ij; + struct { + u8 bssid[ETH_ALEN]; + } pa; }; }; @@ -385,6 +389,7 @@ int cfg80211_disconnect(struct cfg80211_registered_device *rdev, bool wextev); void __cfg80211_roamed(struct wireless_dev *wdev, struct cfg80211_roam_info *info); +void __cfg80211_port_authorized(struct wireless_dev *wdev, const u8 *bssid); int cfg80211_mgd_wext_connect(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev); void cfg80211_autodisconnect_wk(struct work_struct *work); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 1e39ba3cfd06..90e212db6889 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -13830,9 +13830,7 @@ void nl80211_send_roamed(struct cfg80211_registered_device *rdev, info->req_ie)) || (info->resp_ie && nla_put(msg, NL80211_ATTR_RESP_IE, info->resp_ie_len, - info->resp_ie)) || - (info->authorized && - nla_put_flag(msg, NL80211_ATTR_PORT_AUTHORIZED))) + info->resp_ie))) goto nla_put_failure; genlmsg_end(msg, hdr); @@ -13846,6 +13844,36 @@ void nl80211_send_roamed(struct cfg80211_registered_device *rdev, nlmsg_free(msg); } +void nl80211_send_port_authorized(struct cfg80211_registered_device *rdev, + struct net_device *netdev, const u8 *bssid) +{ + struct sk_buff *msg; + void 
*hdr; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return; + + hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_PORT_AUTHORIZED); + if (!hdr) { + nlmsg_free(msg); + return; + } + + if (nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, bssid)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + + genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0, + NL80211_MCGRP_MLME, GFP_KERNEL); + return; + + nla_put_failure: + genlmsg_cancel(msg, hdr); + nlmsg_free(msg); +} + void nl80211_send_disconnected(struct cfg80211_registered_device *rdev, struct net_device *netdev, u16 reason, const u8 *ie, size_t ie_len, bool from_ap) diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h index b96933322077..bf9e772a30b9 100644 --- a/net/wireless/nl80211.h +++ b/net/wireless/nl80211.h @@ -58,6 +58,8 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev, void nl80211_send_roamed(struct cfg80211_registered_device *rdev, struct net_device *netdev, struct cfg80211_roam_info *info, gfp_t gfp); +void nl80211_send_port_authorized(struct cfg80211_registered_device *rdev, + struct net_device *netdev, const u8 *bssid); void nl80211_send_disconnected(struct cfg80211_registered_device *rdev, struct net_device *netdev, u16 reason, const u8 *ie, size_t ie_len, bool from_ap); diff --git a/net/wireless/sme.c b/net/wireless/sme.c index 0a49b88070d0..f38ed490e42b 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -960,7 +960,6 @@ void cfg80211_roamed(struct net_device *dev, struct cfg80211_roam_info *info, ev->rm.resp_ie_len = info->resp_ie_len; memcpy((void *)ev->rm.resp_ie, info->resp_ie, info->resp_ie_len); ev->rm.bss = info->bss; - ev->rm.authorized = info->authorized; spin_lock_irqsave(&wdev->event_lock, flags); list_add_tail(&ev->list, &wdev->event_list); @@ -969,6 +968,50 @@ void cfg80211_roamed(struct net_device *dev, struct cfg80211_roam_info *info, } EXPORT_SYMBOL(cfg80211_roamed); +void __cfg80211_port_authorized(struct wireless_dev *wdev, const u8 *bssid) +{ + ASSERT_WDEV_LOCK(wdev); + + if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION)) + return; + + if (WARN_ON(!wdev->current_bss) || + WARN_ON(!ether_addr_equal(wdev->current_bss->pub.bssid, bssid))) + return; + + nl80211_send_port_authorized(wiphy_to_rdev(wdev->wiphy), wdev->netdev, + bssid); +} + +void cfg80211_port_authorized(struct net_device *dev, const u8 *bssid, + gfp_t gfp) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); + struct cfg80211_event *ev; + unsigned long flags; + + if (WARN_ON(!bssid)) + return; + + ev = kzalloc(sizeof(*ev), gfp); + if (!ev) + return; + + ev->type = EVENT_PORT_AUTHORIZED; + memcpy(ev->pa.bssid, bssid, ETH_ALEN); + + /* + * Use the wdev event list so that if there are pending + * connected/roamed events, they will be reported first. 
+ */ + spin_lock_irqsave(&wdev->event_lock, flags); + list_add_tail(&ev->list, &wdev->event_list); + spin_unlock_irqrestore(&wdev->event_lock, flags); + queue_work(cfg80211_wq, &rdev->event_work); +} +EXPORT_SYMBOL(cfg80211_port_authorized); + void __cfg80211_disconnected(struct net_device *dev, const u8 *ie, size_t ie_len, u16 reason, bool from_ap) { diff --git a/net/wireless/util.c b/net/wireless/util.c index 7a1fcc6ee060..ff21c314a609 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -846,6 +846,9 @@ void cfg80211_process_wdev_events(struct wireless_dev *wdev) case EVENT_STOPPED: __cfg80211_leave(wiphy_to_rdev(wdev->wiphy), wdev); break; + case EVENT_PORT_AUTHORIZED: + __cfg80211_port_authorized(wdev, ev->pa.bssid); + break; } wdev_unlock(wdev); -- cgit v1.2.3 From 32f16369e59fcc505c5ed93a6a8cad3d5636b463 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Mon, 2 Oct 2017 10:41:15 +0200 Subject: net/dst: Make skb parameter of skb{metadata_dst, tunnel_info}() const Make the skb parameter of skb_metadata_dst() and skb_tunnel_info() const as they are not modified. This is in preparation for using them in call-sites where skb is const. Signed-off-by: Simon Horman Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/dst_metadata.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index a803129a4849..9fba2ebf6dda 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -24,7 +24,7 @@ struct metadata_dst { } u; }; -static inline struct metadata_dst *skb_metadata_dst(struct sk_buff *skb) +static inline struct metadata_dst *skb_metadata_dst(const struct sk_buff *skb) { struct metadata_dst *md_dst = (struct metadata_dst *) skb_dst(skb); @@ -34,7 +34,8 @@ static inline struct metadata_dst *skb_metadata_dst(struct sk_buff *skb) return NULL; } -static inline struct ip_tunnel_info *skb_tunnel_info(struct sk_buff *skb) +static inline struct ip_tunnel_info * +skb_tunnel_info(const struct sk_buff *skb) { struct metadata_dst *md_dst = skb_metadata_dst(skb); struct dst_entry *dst; -- cgit v1.2.3 From f2f2efb807d339513199b1bb771806c90cce83ae Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 2 Oct 2017 13:38:28 +0300 Subject: byteorder: Move {cpu_to_be32, be32_to_cpu}_array() from Thunderbolt to core We will be using these when communicating XDomain discovery protocol over Thunderbolt link but they might be useful for other drivers as well. Make them available through byteorder/generic.h. Suggested-by: Andy Shevchenko Signed-off-by: Mika Westerberg Reviewed-by: Michael Jamet Reviewed-by: Yehezkel Bernat Reviewed-by: Andy Shevchenko Signed-off-by: David S. 
Miller --- drivers/thunderbolt/ctl.c | 14 -------------- include/linux/byteorder/generic.h | 16 ++++++++++++++++ 2 files changed, 16 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/drivers/thunderbolt/ctl.c b/drivers/thunderbolt/ctl.c index fb40dd0588b9..e6a4c9458c76 100644 --- a/drivers/thunderbolt/ctl.c +++ b/drivers/thunderbolt/ctl.c @@ -289,20 +289,6 @@ static void tb_cfg_print_error(struct tb_ctl *ctl, } } -static void cpu_to_be32_array(__be32 *dst, const u32 *src, size_t len) -{ - int i; - for (i = 0; i < len; i++) - dst[i] = cpu_to_be32(src[i]); -} - -static void be32_to_cpu_array(u32 *dst, __be32 *src, size_t len) -{ - int i; - for (i = 0; i < len; i++) - dst[i] = be32_to_cpu(src[i]); -} - static __be32 tb_crc(const void *data, size_t len) { return cpu_to_be32(~__crc32c_le(~0, data, len)); diff --git a/include/linux/byteorder/generic.h b/include/linux/byteorder/generic.h index 89f67c1c3160..805d16654459 100644 --- a/include/linux/byteorder/generic.h +++ b/include/linux/byteorder/generic.h @@ -170,4 +170,20 @@ static inline void be64_add_cpu(__be64 *var, u64 val) *var = cpu_to_be64(be64_to_cpu(*var) + val); } +static inline void cpu_to_be32_array(__be32 *dst, const u32 *src, size_t len) +{ + int i; + + for (i = 0; i < len; i++) + dst[i] = cpu_to_be32(src[i]); +} + +static inline void be32_to_cpu_array(u32 *dst, const __be32 *src, size_t len) +{ + int i; + + for (i = 0; i < len; i++) + dst[i] = be32_to_cpu(src[i]); +} + #endif /* _LINUX_BYTEORDER_GENERIC_H */ -- cgit v1.2.3 From cdae7c07e3e3509eaabc18c1640a55dc5b99c179 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 2 Oct 2017 13:38:30 +0300 Subject: thunderbolt: Add support for XDomain properties Thunderbolt XDomain discovery protocol uses directories which contain properties and other directories to exchange information about what capabilities the remote host supports. This also includes identification information like device ID and name. This adds support for parsing and formatting these properties and establishes an API drivers can use in addition to the core Thunderbolt driver. This API is exposed in a new header: include/linux/thunderbolt.h. This code is based on the work done by Amir Levy and Michael Jamet. Signed-off-by: Michael Jamet Signed-off-by: Mika Westerberg Reviewed-by: Yehezkel Bernat Reviewed-by: Andy Shevchenko Signed-off-by: David S. 
Miller --- drivers/thunderbolt/Makefile | 2 +- drivers/thunderbolt/property.c | 670 +++++++++++++++++++++++++++++++++++++++++ include/linux/thunderbolt.h | 89 ++++++ 3 files changed, 760 insertions(+), 1 deletion(-) create mode 100644 drivers/thunderbolt/property.c create mode 100644 include/linux/thunderbolt.h (limited to 'include') diff --git a/drivers/thunderbolt/Makefile b/drivers/thunderbolt/Makefile index 4900febc6c8a..7afd21f5383a 100644 --- a/drivers/thunderbolt/Makefile +++ b/drivers/thunderbolt/Makefile @@ -1,3 +1,3 @@ obj-${CONFIG_THUNDERBOLT} := thunderbolt.o thunderbolt-objs := nhi.o ctl.o tb.o switch.o cap.o path.o tunnel_pci.o eeprom.o -thunderbolt-objs += domain.o dma_port.o icm.o +thunderbolt-objs += domain.o dma_port.o icm.o property.o diff --git a/drivers/thunderbolt/property.c b/drivers/thunderbolt/property.c new file mode 100644 index 000000000000..8fe913a95b4a --- /dev/null +++ b/drivers/thunderbolt/property.c @@ -0,0 +1,670 @@ +/* + * Thunderbolt XDomain property support + * + * Copyright (C) 2017, Intel Corporation + * Authors: Michael Jamet + * Mika Westerberg + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include + +struct tb_property_entry { + u32 key_hi; + u32 key_lo; + u16 length; + u8 reserved; + u8 type; + u32 value; +}; + +struct tb_property_rootdir_entry { + u32 magic; + u32 length; + struct tb_property_entry entries[]; +}; + +struct tb_property_dir_entry { + u32 uuid[4]; + struct tb_property_entry entries[]; +}; + +#define TB_PROPERTY_ROOTDIR_MAGIC 0x55584401 + +static struct tb_property_dir *__tb_property_parse_dir(const u32 *block, + size_t block_len, unsigned int dir_offset, size_t dir_len, + bool is_root); + +static inline void parse_dwdata(void *dst, const void *src, size_t dwords) +{ + be32_to_cpu_array(dst, src, dwords); +} + +static inline void format_dwdata(void *dst, const void *src, size_t dwords) +{ + cpu_to_be32_array(dst, src, dwords); +} + +static bool tb_property_entry_valid(const struct tb_property_entry *entry, + size_t block_len) +{ + switch (entry->type) { + case TB_PROPERTY_TYPE_DIRECTORY: + case TB_PROPERTY_TYPE_DATA: + case TB_PROPERTY_TYPE_TEXT: + if (entry->length > block_len) + return false; + if (entry->value + entry->length > block_len) + return false; + break; + + case TB_PROPERTY_TYPE_VALUE: + if (entry->length != 1) + return false; + break; + } + + return true; +} + +static bool tb_property_key_valid(const char *key) +{ + return key && strlen(key) <= TB_PROPERTY_KEY_SIZE; +} + +static struct tb_property * +tb_property_alloc(const char *key, enum tb_property_type type) +{ + struct tb_property *property; + + property = kzalloc(sizeof(*property), GFP_KERNEL); + if (!property) + return NULL; + + strcpy(property->key, key); + property->type = type; + INIT_LIST_HEAD(&property->list); + + return property; +} + +static struct tb_property *tb_property_parse(const u32 *block, size_t block_len, + const struct tb_property_entry *entry) +{ + char key[TB_PROPERTY_KEY_SIZE + 1]; + struct tb_property *property; + struct tb_property_dir *dir; + + if (!tb_property_entry_valid(entry, block_len)) + return NULL; + + parse_dwdata(key, entry, 2); + key[TB_PROPERTY_KEY_SIZE] = '\0'; + + property = tb_property_alloc(key, entry->type); + if (!property) + return NULL; + + property->length = entry->length; + + switch (property->type) { + case 
TB_PROPERTY_TYPE_DIRECTORY: + dir = __tb_property_parse_dir(block, block_len, entry->value, + entry->length, false); + if (!dir) { + kfree(property); + return NULL; + } + property->value.dir = dir; + break; + + case TB_PROPERTY_TYPE_DATA: + property->value.data = kcalloc(property->length, sizeof(u32), + GFP_KERNEL); + if (!property->value.data) { + kfree(property); + return NULL; + } + parse_dwdata(property->value.data, block + entry->value, + entry->length); + break; + + case TB_PROPERTY_TYPE_TEXT: + property->value.text = kcalloc(property->length, sizeof(u32), + GFP_KERNEL); + if (!property->value.text) { + kfree(property); + return NULL; + } + parse_dwdata(property->value.text, block + entry->value, + entry->length); + /* Force null termination */ + property->value.text[property->length * 4 - 1] = '\0'; + break; + + case TB_PROPERTY_TYPE_VALUE: + property->value.immediate = entry->value; + break; + + default: + property->type = TB_PROPERTY_TYPE_UNKNOWN; + break; + } + + return property; +} + +static struct tb_property_dir *__tb_property_parse_dir(const u32 *block, + size_t block_len, unsigned int dir_offset, size_t dir_len, bool is_root) +{ + const struct tb_property_entry *entries; + size_t i, content_len, nentries; + unsigned int content_offset; + struct tb_property_dir *dir; + + dir = kzalloc(sizeof(*dir), GFP_KERNEL); + if (!dir) + return NULL; + + if (is_root) { + content_offset = dir_offset + 2; + content_len = dir_len; + } else { + dir->uuid = kmemdup(&block[dir_offset], sizeof(*dir->uuid), + GFP_KERNEL); + content_offset = dir_offset + 4; + content_len = dir_len - 4; /* Length includes UUID */ + } + + entries = (const struct tb_property_entry *)&block[content_offset]; + nentries = content_len / (sizeof(*entries) / 4); + + INIT_LIST_HEAD(&dir->properties); + + for (i = 0; i < nentries; i++) { + struct tb_property *property; + + property = tb_property_parse(block, block_len, &entries[i]); + if (!property) { + tb_property_free_dir(dir); + return NULL; + } + + list_add_tail(&property->list, &dir->properties); + } + + return dir; +} + +/** + * tb_property_parse_dir() - Parses properties from given property block + * @block: Property block to parse + * @block_len: Number of dword elements in the property block + * + * This function parses the XDomain properties data block into format that + * can be traversed using the helper functions provided by this module. + * Upon success returns the parsed directory. In case of error returns + * %NULL. The resulting &struct tb_property_dir needs to be released by + * calling tb_property_free_dir() when not needed anymore. + * + * The @block is expected to be root directory. + */ +struct tb_property_dir *tb_property_parse_dir(const u32 *block, + size_t block_len) +{ + const struct tb_property_rootdir_entry *rootdir = + (const struct tb_property_rootdir_entry *)block; + + if (rootdir->magic != TB_PROPERTY_ROOTDIR_MAGIC) + return NULL; + if (rootdir->length > block_len) + return NULL; + + return __tb_property_parse_dir(block, block_len, 0, rootdir->length, + true); +} + +/** + * tb_property_create_dir() - Creates new property directory + * @uuid: UUID used to identify the particular directory + * + * Creates new, empty property directory. If @uuid is %NULL then the + * directory is assumed to be root directory. 
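Taken together, the directory API added by this patch can be exercised as in the sketch below (kernel context assumed; error handling abbreviated; the key names are arbitrary examples within the 8-character key limit):

static int example_export_properties(void)
{
	struct tb_property_dir *root;
	ssize_t len;
	u32 *block;
	int ret;

	root = tb_property_create_dir(NULL);	/* NULL UUID: root directory */
	if (!root)
		return -ENOMEM;

	ret = tb_property_add_immediate(root, "prtcvers", 1);
	if (!ret)
		ret = tb_property_add_text(root, "deviceid", "Example");
	if (ret)
		goto out_free;

	/* A NULL @block asks for the required size in dwords */
	len = tb_property_format_dir(root, NULL, 0);

	block = kcalloc(len, sizeof(u32), GFP_KERNEL);
	if (!block) {
		ret = -ENOMEM;
		goto out_free;
	}

	ret = tb_property_format_dir(root, block, len);	/* 0 on success */
	kfree(block);
out_free:
	tb_property_free_dir(root);
	return ret;
}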
+ */ +struct tb_property_dir *tb_property_create_dir(const uuid_t *uuid) +{ + struct tb_property_dir *dir; + + dir = kzalloc(sizeof(*dir), GFP_KERNEL); + if (!dir) + return NULL; + + INIT_LIST_HEAD(&dir->properties); + if (uuid) { + dir->uuid = kmemdup(uuid, sizeof(*dir->uuid), GFP_KERNEL); + if (!dir->uuid) { + kfree(dir); + return NULL; + } + } + + return dir; +} +EXPORT_SYMBOL_GPL(tb_property_create_dir); + +static void tb_property_free(struct tb_property *property) +{ + switch (property->type) { + case TB_PROPERTY_TYPE_DIRECTORY: + tb_property_free_dir(property->value.dir); + break; + + case TB_PROPERTY_TYPE_DATA: + kfree(property->value.data); + break; + + case TB_PROPERTY_TYPE_TEXT: + kfree(property->value.text); + break; + + default: + break; + } + + kfree(property); +} + +/** + * tb_property_free_dir() - Release memory allocated for property directory + * @dir: Directory to release + * + * This will release all the memory the directory occupies including all + * descendants. It is OK to pass %NULL @dir, then the function does + * nothing. + */ +void tb_property_free_dir(struct tb_property_dir *dir) +{ + struct tb_property *property, *tmp; + + if (!dir) + return; + + list_for_each_entry_safe(property, tmp, &dir->properties, list) { + list_del(&property->list); + tb_property_free(property); + } + kfree(dir->uuid); + kfree(dir); +} +EXPORT_SYMBOL_GPL(tb_property_free_dir); + +static size_t tb_property_dir_length(const struct tb_property_dir *dir, + bool recurse, size_t *data_len) +{ + const struct tb_property *property; + size_t len = 0; + + if (dir->uuid) + len += sizeof(*dir->uuid) / 4; + else + len += sizeof(struct tb_property_rootdir_entry) / 4; + + list_for_each_entry(property, &dir->properties, list) { + len += sizeof(struct tb_property_entry) / 4; + + switch (property->type) { + case TB_PROPERTY_TYPE_DIRECTORY: + if (recurse) { + len += tb_property_dir_length( + property->value.dir, recurse, data_len); + } + /* Reserve dword padding after each directory */ + if (data_len) + *data_len += 1; + break; + + case TB_PROPERTY_TYPE_DATA: + case TB_PROPERTY_TYPE_TEXT: + if (data_len) + *data_len += property->length; + break; + + default: + break; + } + } + + return len; +} + +static ssize_t __tb_property_format_dir(const struct tb_property_dir *dir, + u32 *block, unsigned int start_offset, size_t block_len) +{ + unsigned int data_offset, dir_end; + const struct tb_property *property; + struct tb_property_entry *entry; + size_t dir_len, data_len = 0; + int ret; + + /* + * The structure of property block looks like following. Leaf + * data/text is included right after the directory and each + * directory follows each other (even nested ones). + * + * +----------+ <-- start_offset + * | header | <-- root directory header + * +----------+ --- + * | entry 0 | -^--------------------. + * +----------+ | | + * | entry 1 | -|--------------------|--. + * +----------+ | | | + * | entry 2 | -|-----------------. | | + * +----------+ | | | | + * : : | dir_len | | | + * . . | | | | + * : : | | | | + * +----------+ | | | | + * | entry n | v | | | + * +----------+ <-- data_offset | | | + * | data 0 | <------------------|--' | + * +----------+ | | + * | data 1 | <------------------|-----' + * +----------+ | + * | 00000000 | padding | + * +----------+ <-- dir_end <------' + * | UUID | <-- directory UUID (child directory) + * +----------+ + * | entry 0 | + * +----------+ + * | entry 1 | + * +----------+ + * : : + * . . 
+ * : : + * +----------+ + * | entry n | + * +----------+ + * | data 0 | + * +----------+ + * + * We use dir_end to hold pointer to the end of the directory. It + * will increase as we add directories and each directory should be + * added starting from previous dir_end. + */ + dir_len = tb_property_dir_length(dir, false, &data_len); + data_offset = start_offset + dir_len; + dir_end = start_offset + data_len + dir_len; + + if (data_offset > dir_end) + return -EINVAL; + if (dir_end > block_len) + return -EINVAL; + + /* Write headers first */ + if (dir->uuid) { + struct tb_property_dir_entry *pe; + + pe = (struct tb_property_dir_entry *)&block[start_offset]; + memcpy(pe->uuid, dir->uuid, sizeof(pe->uuid)); + entry = pe->entries; + } else { + struct tb_property_rootdir_entry *re; + + re = (struct tb_property_rootdir_entry *)&block[start_offset]; + re->magic = TB_PROPERTY_ROOTDIR_MAGIC; + re->length = dir_len - sizeof(*re) / 4; + entry = re->entries; + } + + list_for_each_entry(property, &dir->properties, list) { + const struct tb_property_dir *child; + + format_dwdata(entry, property->key, 2); + entry->type = property->type; + + switch (property->type) { + case TB_PROPERTY_TYPE_DIRECTORY: + child = property->value.dir; + ret = __tb_property_format_dir(child, block, dir_end, + block_len); + if (ret < 0) + return ret; + entry->length = tb_property_dir_length(child, false, + NULL); + entry->value = dir_end; + dir_end = ret; + break; + + case TB_PROPERTY_TYPE_DATA: + format_dwdata(&block[data_offset], property->value.data, + property->length); + entry->length = property->length; + entry->value = data_offset; + data_offset += entry->length; + break; + + case TB_PROPERTY_TYPE_TEXT: + format_dwdata(&block[data_offset], property->value.text, + property->length); + entry->length = property->length; + entry->value = data_offset; + data_offset += entry->length; + break; + + case TB_PROPERTY_TYPE_VALUE: + entry->length = property->length; + entry->value = property->value.immediate; + break; + + default: + break; + } + + entry++; + } + + return dir_end; +} + +/** + * tb_property_format_dir() - Formats directory to the packed XDomain format + * @dir: Directory to format + * @block: Property block where the packed data is placed + * @block_len: Length of the property block + * + * This function formats the directory to the packed format that can be + * then send over the thunderbolt fabric to receiving host. Returns %0 in + * case of success and negative errno on faulure. Passing %NULL in @block + * returns number of entries the block takes. + */ +ssize_t tb_property_format_dir(const struct tb_property_dir *dir, u32 *block, + size_t block_len) +{ + ssize_t ret; + + if (!block) { + size_t dir_len, data_len = 0; + + dir_len = tb_property_dir_length(dir, true, &data_len); + return dir_len + data_len; + } + + ret = __tb_property_format_dir(dir, block, 0, block_len); + return ret < 0 ? 
ret : 0; +} + +/** + * tb_property_add_immediate() - Add immediate property to directory + * @parent: Directory to add the property + * @key: Key for the property + * @value: Immediate value to store with the property + */ +int tb_property_add_immediate(struct tb_property_dir *parent, const char *key, + u32 value) +{ + struct tb_property *property; + + if (!tb_property_key_valid(key)) + return -EINVAL; + + property = tb_property_alloc(key, TB_PROPERTY_TYPE_VALUE); + if (!property) + return -ENOMEM; + + property->length = 1; + property->value.immediate = value; + + list_add_tail(&property->list, &parent->properties); + return 0; +} +EXPORT_SYMBOL_GPL(tb_property_add_immediate); + +/** + * tb_property_add_data() - Adds arbitrary data property to directory + * @parent: Directory to add the property + * @key: Key for the property + * @buf: Data buffer to add + * @buflen: Number of bytes in the data buffer + * + * Function takes a copy of @buf and adds it to the directory. + */ +int tb_property_add_data(struct tb_property_dir *parent, const char *key, + const void *buf, size_t buflen) +{ + /* Need to pad to dword boundary */ + size_t size = round_up(buflen, 4); + struct tb_property *property; + + if (!tb_property_key_valid(key)) + return -EINVAL; + + property = tb_property_alloc(key, TB_PROPERTY_TYPE_DATA); + if (!property) + return -ENOMEM; + + property->length = size / 4; + property->value.data = kzalloc(size, GFP_KERNEL); + memcpy(property->value.data, buf, buflen); + + list_add_tail(&property->list, &parent->properties); + return 0; +} +EXPORT_SYMBOL_GPL(tb_property_add_data); + +/** + * tb_property_add_text() - Adds string property to directory + * @parent: Directory to add the property + * @key: Key for the property + * @text: String to add + * + * Function takes a copy of @text and adds it to the directory. + */ +int tb_property_add_text(struct tb_property_dir *parent, const char *key, + const char *text) +{ + /* Need to pad to dword boundary */ + size_t size = round_up(strlen(text) + 1, 4); + struct tb_property *property; + + if (!tb_property_key_valid(key)) + return -EINVAL; + + property = tb_property_alloc(key, TB_PROPERTY_TYPE_TEXT); + if (!property) + return -ENOMEM; + + property->length = size / 4; + property->value.data = kzalloc(size, GFP_KERNEL); + strcpy(property->value.text, text); + + list_add_tail(&property->list, &parent->properties); + return 0; +} +EXPORT_SYMBOL_GPL(tb_property_add_text); + +/** + * tb_property_add_dir() - Adds a directory to the parent directory + * @parent: Directory to add the property + * @key: Key for the property + * @dir: Directory to add + */ +int tb_property_add_dir(struct tb_property_dir *parent, const char *key, + struct tb_property_dir *dir) +{ + struct tb_property *property; + + if (!tb_property_key_valid(key)) + return -EINVAL; + + property = tb_property_alloc(key, TB_PROPERTY_TYPE_DIRECTORY); + if (!property) + return -ENOMEM; + + property->value.dir = dir; + + list_add_tail(&property->list, &parent->properties); + return 0; +} +EXPORT_SYMBOL_GPL(tb_property_add_dir); + +/** + * tb_property_remove() - Removes property from a parent directory + * @property: Property to remove + * + * Note memory for @property is released as well so it is not allowed to + * touch the object after call to this function. 
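A matching lookup/removal sketch (dir is assumed to be a previously parsed or created directory; "deviceid" is an example key):

	struct tb_property *p;

	p = tb_property_find(dir, "deviceid", TB_PROPERTY_TYPE_TEXT);
	if (p)
		tb_property_remove(p);	/* unlinks @p and releases it */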
+ */ +void tb_property_remove(struct tb_property *property) +{ + list_del(&property->list); + kfree(property); +} +EXPORT_SYMBOL_GPL(tb_property_remove); + +/** + * tb_property_find() - Find a property from a directory + * @dir: Directory where the property is searched + * @key: Key to look for + * @type: Type of the property + * + * Finds and returns property from the given directory. Does not recurse + * into sub-directories. Returns %NULL if the property was not found. + */ +struct tb_property *tb_property_find(struct tb_property_dir *dir, + const char *key, enum tb_property_type type) +{ + struct tb_property *property; + + list_for_each_entry(property, &dir->properties, list) { + if (property->type == type && !strcmp(property->key, key)) + return property; + } + + return NULL; +} +EXPORT_SYMBOL_GPL(tb_property_find); + +/** + * tb_property_get_next() - Get next property from directory + * @dir: Directory holding properties + * @prev: Previous property in the directory (%NULL returns the first) + */ +struct tb_property *tb_property_get_next(struct tb_property_dir *dir, + struct tb_property *prev) +{ + if (prev) { + if (list_is_last(&prev->list, &dir->properties)) + return NULL; + return list_next_entry(prev, list); + } + return list_first_entry_or_null(&dir->properties, struct tb_property, + list); +} +EXPORT_SYMBOL_GPL(tb_property_get_next); diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h new file mode 100644 index 000000000000..96561c1265ae --- /dev/null +++ b/include/linux/thunderbolt.h @@ -0,0 +1,89 @@ +/* + * Thunderbolt service API + * + * Copyright (C) 2017, Intel Corporation + * Authors: Michael Jamet + * Mika Westerberg + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef THUNDERBOLT_H_ +#define THUNDERBOLT_H_ + +#include +#include + +/** + * struct tb_property_dir - XDomain property directory + * @uuid: Directory UUID or %NULL if root directory + * @properties: List of properties in this directory + * + * User needs to provide serialization if needed. + */ +struct tb_property_dir { + const uuid_t *uuid; + struct list_head properties; +}; + +enum tb_property_type { + TB_PROPERTY_TYPE_UNKNOWN = 0x00, + TB_PROPERTY_TYPE_DIRECTORY = 0x44, + TB_PROPERTY_TYPE_DATA = 0x64, + TB_PROPERTY_TYPE_TEXT = 0x74, + TB_PROPERTY_TYPE_VALUE = 0x76, +}; + +#define TB_PROPERTY_KEY_SIZE 8 + +/** + * struct tb_property - XDomain property + * @list: Used to link properties together in a directory + * @key: Key for the property (always terminated). + * @type: Type of the property + * @length: Length of the property data in dwords + * @value: Property value + * + * Users use @type to determine which field in @value is filled. 
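A consumer sketch of that union, combined with the tb_property_for_each() iterator declared below (the pr_info() reporting is arbitrary):

static void example_show_dir(struct tb_property_dir *dir)
{
	struct tb_property *p;

	tb_property_for_each(dir, p) {
		switch (p->type) {
		case TB_PROPERTY_TYPE_VALUE:
			pr_info("%s: value %#x\n", p->key, p->value.immediate);
			break;
		case TB_PROPERTY_TYPE_TEXT:
			pr_info("%s: text \"%s\"\n", p->key, p->value.text);
			break;
		case TB_PROPERTY_TYPE_DATA:
			pr_info("%s: %zu dwords of data\n", p->key, p->length);
			break;
		case TB_PROPERTY_TYPE_DIRECTORY:
			pr_info("%s: sub-directory\n", p->key);
			break;
		default:
			break;
		}
	}
}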
+ */ +struct tb_property { + struct list_head list; + char key[TB_PROPERTY_KEY_SIZE + 1]; + enum tb_property_type type; + size_t length; + union { + struct tb_property_dir *dir; + u8 *data; + char *text; + u32 immediate; + } value; +}; + +struct tb_property_dir *tb_property_parse_dir(const u32 *block, + size_t block_len); +ssize_t tb_property_format_dir(const struct tb_property_dir *dir, u32 *block, + size_t block_len); +struct tb_property_dir *tb_property_create_dir(const uuid_t *uuid); +void tb_property_free_dir(struct tb_property_dir *dir); +int tb_property_add_immediate(struct tb_property_dir *parent, const char *key, + u32 value); +int tb_property_add_data(struct tb_property_dir *parent, const char *key, + const void *buf, size_t buflen); +int tb_property_add_text(struct tb_property_dir *parent, const char *key, + const char *text); +int tb_property_add_dir(struct tb_property_dir *parent, const char *key, + struct tb_property_dir *dir); +void tb_property_remove(struct tb_property *tb_property); +struct tb_property *tb_property_find(struct tb_property_dir *dir, + const char *key, enum tb_property_type type); +struct tb_property *tb_property_get_next(struct tb_property_dir *dir, + struct tb_property *prev); + +#define tb_property_for_each(dir, property) \ + for (property = tb_property_get_next(dir, NULL); \ + property; \ + property = tb_property_get_next(dir, property)) + +#endif /* THUNDERBOLT_H_ */ -- cgit v1.2.3 From eaf8ff35a345449207ad116e2574c19780ec9a98 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 2 Oct 2017 13:38:31 +0300 Subject: thunderbolt: Move enum tb_cfg_pkg_type to thunderbolt.h These will be needed by Thunderbolt services when sending and receiving XDomain control messages. While there change TB_CFG_PKG_PREPARE_TO_SLEEP value to be decimal in order to be consistent with other members. Signed-off-by: Mika Westerberg Reviewed-by: Michael Jamet Reviewed-by: Yehezkel Bernat Reviewed-by: Andy Shevchenko Signed-off-by: David S. 
Miller --- drivers/thunderbolt/ctl.h | 1 + drivers/thunderbolt/tb_msgs.h | 17 ----------------- include/linux/thunderbolt.h | 17 +++++++++++++++++ 3 files changed, 18 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/drivers/thunderbolt/ctl.h b/drivers/thunderbolt/ctl.h index 36fd28b1c1c5..d0f21e1e0b8b 100644 --- a/drivers/thunderbolt/ctl.h +++ b/drivers/thunderbolt/ctl.h @@ -8,6 +8,7 @@ #define _TB_CFG #include +#include #include "nhi.h" #include "tb_msgs.h" diff --git a/drivers/thunderbolt/tb_msgs.h b/drivers/thunderbolt/tb_msgs.h index f3adf58a40ce..f2b2550cd97c 100644 --- a/drivers/thunderbolt/tb_msgs.h +++ b/drivers/thunderbolt/tb_msgs.h @@ -15,23 +15,6 @@ #include #include -enum tb_cfg_pkg_type { - TB_CFG_PKG_READ = 1, - TB_CFG_PKG_WRITE = 2, - TB_CFG_PKG_ERROR = 3, - TB_CFG_PKG_NOTIFY_ACK = 4, - TB_CFG_PKG_EVENT = 5, - TB_CFG_PKG_XDOMAIN_REQ = 6, - TB_CFG_PKG_XDOMAIN_RESP = 7, - TB_CFG_PKG_OVERRIDE = 8, - TB_CFG_PKG_RESET = 9, - TB_CFG_PKG_ICM_EVENT = 10, - TB_CFG_PKG_ICM_CMD = 11, - TB_CFG_PKG_ICM_RESP = 12, - TB_CFG_PKG_PREPARE_TO_SLEEP = 0xd, - -}; - enum tb_cfg_space { TB_CFG_HOPS = 0, TB_CFG_PORT = 1, diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h index 96561c1265ae..b512b1e2b4f2 100644 --- a/include/linux/thunderbolt.h +++ b/include/linux/thunderbolt.h @@ -1,6 +1,7 @@ /* * Thunderbolt service API * + * Copyright (C) 2014 Andreas Noever * Copyright (C) 2017, Intel Corporation * Authors: Michael Jamet * Mika Westerberg @@ -16,6 +17,22 @@ #include #include +enum tb_cfg_pkg_type { + TB_CFG_PKG_READ = 1, + TB_CFG_PKG_WRITE = 2, + TB_CFG_PKG_ERROR = 3, + TB_CFG_PKG_NOTIFY_ACK = 4, + TB_CFG_PKG_EVENT = 5, + TB_CFG_PKG_XDOMAIN_REQ = 6, + TB_CFG_PKG_XDOMAIN_RESP = 7, + TB_CFG_PKG_OVERRIDE = 8, + TB_CFG_PKG_RESET = 9, + TB_CFG_PKG_ICM_EVENT = 10, + TB_CFG_PKG_ICM_CMD = 11, + TB_CFG_PKG_ICM_RESP = 12, + TB_CFG_PKG_PREPARE_TO_SLEEP = 13, +}; + /** * struct tb_property_dir - XDomain property directory * @uuid: Directory UUID or %NULL if root directory -- cgit v1.2.3 From 9e99b9f4d5c36340dabda6d14053195b2a43796b Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 2 Oct 2017 13:38:32 +0300 Subject: thunderbolt: Move thunderbolt domain structure to thunderbolt.h These are needed by Thunderbolt services so move them to thunderbolt.h to make sure they are available outside of drivers/thunderbolt. Signed-off-by: Mika Westerberg Reviewed-by: Michael Jamet Reviewed-by: Yehezkel Bernat Reviewed-by: Andy Shevchenko Signed-off-by: David S. 
Miller --- drivers/thunderbolt/tb.h | 42 ------------------------------------------ include/linux/thunderbolt.h | 45 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 42 deletions(-) (limited to 'include') diff --git a/drivers/thunderbolt/tb.h b/drivers/thunderbolt/tb.h index e0deee4f1eb0..2fefe76621ca 100644 --- a/drivers/thunderbolt/tb.h +++ b/drivers/thunderbolt/tb.h @@ -39,20 +39,6 @@ struct tb_switch_nvm { bool authenticating; }; -/** - * enum tb_security_level - Thunderbolt security level - * @TB_SECURITY_NONE: No security, legacy mode - * @TB_SECURITY_USER: User approval required at minimum - * @TB_SECURITY_SECURE: One time saved key required at minimum - * @TB_SECURITY_DPONLY: Only tunnel Display port (and USB) - */ -enum tb_security_level { - TB_SECURITY_NONE, - TB_SECURITY_USER, - TB_SECURITY_SECURE, - TB_SECURITY_DPONLY, -}; - #define TB_SWITCH_KEY_SIZE 32 /* Each physical port contains 2 links on modern controllers */ #define TB_SWITCH_LINKS_PER_PHY_PORT 2 @@ -223,33 +209,6 @@ struct tb_cm_ops { int (*disconnect_pcie_paths)(struct tb *tb); }; -/** - * struct tb - main thunderbolt bus structure - * @dev: Domain device - * @lock: Big lock. Must be held when accessing any struct - * tb_switch / struct tb_port. - * @nhi: Pointer to the NHI structure - * @ctl: Control channel for this domain - * @wq: Ordered workqueue for all domain specific work - * @root_switch: Root switch of this domain - * @cm_ops: Connection manager specific operations vector - * @index: Linux assigned domain number - * @security_level: Current security level - * @privdata: Private connection manager specific data - */ -struct tb { - struct device dev; - struct mutex lock; - struct tb_nhi *nhi; - struct tb_ctl *ctl; - struct workqueue_struct *wq; - struct tb_switch *root_switch; - const struct tb_cm_ops *cm_ops; - int index; - enum tb_security_level security_level; - unsigned long privdata[0]; -}; - static inline void *tb_priv(struct tb *tb) { return (void *)tb->privdata; @@ -368,7 +327,6 @@ static inline int tb_port_write(struct tb_port *port, const void *buffer, struct tb *icm_probe(struct tb_nhi *nhi); struct tb *tb_probe(struct tb_nhi *nhi); -extern struct bus_type tb_bus_type; extern struct device_type tb_domain_type; extern struct device_type tb_switch_type; diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h index b512b1e2b4f2..910b1bf92112 100644 --- a/include/linux/thunderbolt.h +++ b/include/linux/thunderbolt.h @@ -14,7 +14,9 @@ #ifndef THUNDERBOLT_H_ #define THUNDERBOLT_H_ +#include #include +#include #include enum tb_cfg_pkg_type { @@ -33,6 +35,49 @@ enum tb_cfg_pkg_type { TB_CFG_PKG_PREPARE_TO_SLEEP = 13, }; +/** + * enum tb_security_level - Thunderbolt security level + * @TB_SECURITY_NONE: No security, legacy mode + * @TB_SECURITY_USER: User approval required at minimum + * @TB_SECURITY_SECURE: One time saved key required at minimum + * @TB_SECURITY_DPONLY: Only tunnel Display port (and USB) + */ +enum tb_security_level { + TB_SECURITY_NONE, + TB_SECURITY_USER, + TB_SECURITY_SECURE, + TB_SECURITY_DPONLY, +}; + +/** + * struct tb - main thunderbolt bus structure + * @dev: Domain device + * @lock: Big lock. Must be held when accessing any struct + * tb_switch / struct tb_port. 
+ * @nhi: Pointer to the NHI structure + * @ctl: Control channel for this domain + * @wq: Ordered workqueue for all domain specific work + * @root_switch: Root switch of this domain + * @cm_ops: Connection manager specific operations vector + * @index: Linux assigned domain number + * @security_level: Current security level + * @privdata: Private connection manager specific data + */ +struct tb { + struct device dev; + struct mutex lock; + struct tb_nhi *nhi; + struct tb_ctl *ctl; + struct workqueue_struct *wq; + struct tb_switch *root_switch; + const struct tb_cm_ops *cm_ops; + int index; + enum tb_security_level security_level; + unsigned long privdata[0]; +}; + +extern struct bus_type tb_bus_type; + /** * struct tb_property_dir - XDomain property directory * @uuid: Directory UUID or %NULL if root directory -- cgit v1.2.3 From e69b71f8458b78a2ef44e3d07374a8f46e45123d Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 2 Oct 2017 13:38:33 +0300 Subject: thunderbolt: Move tb_switch_phy_port_from_link() to thunderbolt.h A Thunderbolt service might need to find the physical port from a link the cable is connected to. For instance, the networking driver uses this information to generate a MAC address according to the Apple ThunderboltIP protocol. Move this function to thunderbolt.h and rename it to tb_phy_port_from_link() to reflect the fact that it does not take a switch as a parameter. Signed-off-by: Mika Westerberg Reviewed-by: Michael Jamet Reviewed-by: Yehezkel Bernat Reviewed-by: Andy Shevchenko Signed-off-by: David S. Miller --- drivers/thunderbolt/icm.c | 2 +- drivers/thunderbolt/tb.h | 7 ------- include/linux/thunderbolt.h | 7 +++++++ 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/drivers/thunderbolt/icm.c b/drivers/thunderbolt/icm.c index 53250fc057e1..8c22b91ed040 100644 --- a/drivers/thunderbolt/icm.c +++ b/drivers/thunderbolt/icm.c @@ -89,7 +89,7 @@ static inline struct tb *icm_to_tb(struct icm *icm) static inline u8 phy_port_from_route(u64 route, u8 depth) { - return tb_switch_phy_port_from_link(route >> ((depth - 1) * 8)); + return tb_phy_port_from_link(route >> ((depth - 1) * 8)); } static inline u8 dual_link_from_link(u8 link) diff --git a/drivers/thunderbolt/tb.h b/drivers/thunderbolt/tb.h index 2fefe76621ca..ea21d927bd09 100644 --- a/drivers/thunderbolt/tb.h +++ b/drivers/thunderbolt/tb.h @@ -40,8 +40,6 @@ struct tb_switch_nvm { }; #define TB_SWITCH_KEY_SIZE 32 -/* Each physical port contains 2 links on modern controllers */ -#define TB_SWITCH_LINKS_PER_PHY_PORT 2 /** * struct tb_switch - a thunderbolt switch @@ -367,11 +365,6 @@ struct tb_switch *tb_switch_find_by_link_depth(struct tb *tb, u8 link, u8 depth); struct tb_switch *tb_switch_find_by_uuid(struct tb *tb, const uuid_t *uuid); -static inline unsigned int tb_switch_phy_port_from_link(unsigned int link) -{ - return (link - 1) / TB_SWITCH_LINKS_PER_PHY_PORT; -} - static inline void tb_switch_put(struct tb_switch *sw) { put_device(&sw->dev); } diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h index 910b1bf92112..43b8d1e09341 100644 --- a/include/linux/thunderbolt.h +++ b/include/linux/thunderbolt.h @@ -78,6 +78,13 @@ struct tb { extern struct bus_type tb_bus_type; +#define TB_LINKS_PER_PHY_PORT 2 + +static inline unsigned int tb_phy_port_from_link(unsigned int link) +{ + return (link - 1) / TB_LINKS_PER_PHY_PORT; +} + /** * struct tb_property_dir - XDomain property directory * @uuid: Directory UUID or %NULL if root directory -- cgit v1.2.3 From
d1ff70241a275133e1a0258b7c23588b122276c8 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 2 Oct 2017 13:38:34 +0300 Subject: thunderbolt: Add support for XDomain discovery protocol When two hosts are connected over a Thunderbolt cable, there is a protocol they can use to communicate capabilities supported by the host. The discovery protocol uses the automatically configured control channel (ring 0) and is built on top of request/response transactions using special XDomain primitives provided by the Thunderbolt base protocol. The capabilities consist of a root directory block of basic properties used for identification of the host, and then there can be zero or more directories each describing a Thunderbolt service and its capabilities. Once both sides have discovered what is supported, the two hosts can set up high-speed DMA paths and transfer data to the other side using whatever protocol was agreed based on the properties. The software protocol used to communicate which DMA paths to enable is service specific. This patch adds support for the XDomain discovery protocol to the Thunderbolt bus. We model each remote host connection as a Linux XDomain device. For each Thunderbolt service found supported on the XDomain device, we create a Linux Thunderbolt service device which Thunderbolt service drivers can then bind to based on the protocol identification information retrieved from the property directory describing the service. This code is based on the work done by Amir Levy and Michael Jamet. Signed-off-by: Michael Jamet Signed-off-by: Mika Westerberg Reviewed-by: Yehezkel Bernat Reviewed-by: Andy Shevchenko Signed-off-by: David S. Miller --- Documentation/ABI/testing/sysfs-bus-thunderbolt | 48 + drivers/thunderbolt/Makefile | 2 +- drivers/thunderbolt/ctl.c | 11 +- drivers/thunderbolt/ctl.h | 2 +- drivers/thunderbolt/domain.c | 197 ++- drivers/thunderbolt/icm.c | 218 +++- drivers/thunderbolt/nhi.h | 2 + drivers/thunderbolt/switch.c | 7 +- drivers/thunderbolt/tb.h | 39 +- drivers/thunderbolt/tb_msgs.h | 123 ++ drivers/thunderbolt/xdomain.c | 1576 +++++++++++++++++++++++ include/linux/mod_devicetable.h | 26 + include/linux/thunderbolt.h | 242 ++++ scripts/mod/devicetable-offsets.c | 7 + scripts/mod/file2alias.c | 25 + 15 files changed, 2507 insertions(+), 18 deletions(-) create mode 100644 drivers/thunderbolt/xdomain.c (limited to 'include') diff --git a/Documentation/ABI/testing/sysfs-bus-thunderbolt b/Documentation/ABI/testing/sysfs-bus-thunderbolt index 392bef5bd399..93798c02e28b 100644 --- a/Documentation/ABI/testing/sysfs-bus-thunderbolt +++ b/Documentation/ABI/testing/sysfs-bus-thunderbolt @@ -110,3 +110,51 @@ Description: When new NVM image is written to the non-active NVM is directly the status value from the DMA configuration based mailbox before the device is power cycled. Writing 0 here clears the status. + +What: /sys/bus/thunderbolt/devices/<xdomain>.<service>/key +Date: Jan 2018 +KernelVersion: 4.15 +Contact: thunderbolt-software@lists.01.org +Description: This contains the name of the property directory the XDomain + service exposes. This entry describes the protocol in + question.
The following directories are already reserved by + the Apple XDomain specification: + + network: IP/ethernet over Thunderbolt + targetdm: Target disk mode protocol over Thunderbolt + extdisp: External display mode protocol over Thunderbolt + +What: /sys/bus/thunderbolt/devices/<xdomain>.<service>/modalias +Date: Jan 2018 +KernelVersion: 4.15 +Contact: thunderbolt-software@lists.01.org +Description: Stores the same MODALIAS value emitted by uevent for + the XDomain service. Format: tbtsvc:kSpNvNrN + +What: /sys/bus/thunderbolt/devices/<xdomain>.<service>/prtcid +Date: Jan 2018 +KernelVersion: 4.15 +Contact: thunderbolt-software@lists.01.org +Description: This contains the XDomain protocol identifier the XDomain + service supports. + +What: /sys/bus/thunderbolt/devices/<xdomain>.<service>/prtcvers +Date: Jan 2018 +KernelVersion: 4.15 +Contact: thunderbolt-software@lists.01.org +Description: This contains the XDomain protocol version the XDomain + service supports. + +What: /sys/bus/thunderbolt/devices/<xdomain>.<service>/prtcrevs +Date: Jan 2018 +KernelVersion: 4.15 +Contact: thunderbolt-software@lists.01.org +Description: This contains the XDomain software version the XDomain + service supports. + +What: /sys/bus/thunderbolt/devices/<xdomain>.<service>/prtcstns +Date: Jan 2018 +KernelVersion: 4.15 +Contact: thunderbolt-software@lists.01.org +Description: This contains XDomain service specific settings as + a bitmask. Format: %x diff --git a/drivers/thunderbolt/Makefile b/drivers/thunderbolt/Makefile index 7afd21f5383a..f2f0de27252b 100644 --- a/drivers/thunderbolt/Makefile +++ b/drivers/thunderbolt/Makefile @@ -1,3 +1,3 @@ obj-${CONFIG_THUNDERBOLT} := thunderbolt.o thunderbolt-objs := nhi.o ctl.o tb.o switch.o cap.o path.o tunnel_pci.o eeprom.o -thunderbolt-objs += domain.o dma_port.o icm.o property.o +thunderbolt-objs += domain.o dma_port.o icm.o property.o xdomain.o diff --git a/drivers/thunderbolt/ctl.c b/drivers/thunderbolt/ctl.c index e6a4c9458c76..46e393c5fd1d 100644 --- a/drivers/thunderbolt/ctl.c +++ b/drivers/thunderbolt/ctl.c @@ -368,10 +368,10 @@ static int tb_ctl_tx(struct tb_ctl *ctl, const void *data, size_t len, /** * tb_ctl_handle_event() - acknowledge a plug event, invoke ctl->callback */ -static void tb_ctl_handle_event(struct tb_ctl *ctl, enum tb_cfg_pkg_type type, +static bool tb_ctl_handle_event(struct tb_ctl *ctl, enum tb_cfg_pkg_type type, struct ctl_pkg *pkg, size_t size) { - ctl->callback(ctl->callback_data, type, pkg->buffer, size); + return ctl->callback(ctl->callback_data, type, pkg->buffer, size); } static void tb_ctl_rx_submit(struct ctl_pkg *pkg) @@ -444,6 +444,8 @@ static void tb_ctl_rx_callback(struct tb_ring *ring, struct ring_frame *frame, break; case TB_CFG_PKG_EVENT: + case TB_CFG_PKG_XDOMAIN_RESP: + case TB_CFG_PKG_XDOMAIN_REQ: if (*(__be32 *)(pkg->buffer + frame->size) != crc32) { tb_ctl_err(pkg->ctl, "RX: checksum mismatch, dropping packet\n"); @@ -451,8 +453,9 @@ static void tb_ctl_rx_callback(struct tb_ring *ring, struct ring_frame *frame, } /* Fall through */ case TB_CFG_PKG_ICM_EVENT: - tb_ctl_handle_event(pkg->ctl, frame->eof, pkg, frame->size); - goto rx; + if (tb_ctl_handle_event(pkg->ctl, frame->eof, pkg, frame->size)) + goto rx; + break; default: break; diff --git a/drivers/thunderbolt/ctl.h b/drivers/thunderbolt/ctl.h index d0f21e1e0b8b..85c49dd301ea 100644 --- a/drivers/thunderbolt/ctl.h +++ b/drivers/thunderbolt/ctl.h @@ -16,7 +16,7 @@ /* control channel */ struct tb_ctl; -typedef void (*event_cb)(void *data, enum tb_cfg_pkg_type type, +typedef bool (*event_cb)(void *data, enum tb_cfg_pkg_type type, const void *buf, size_t size); struct tb_ctl
*tb_ctl_alloc(struct tb_nhi *nhi, event_cb cb, void *cb_data); diff --git a/drivers/thunderbolt/domain.c b/drivers/thunderbolt/domain.c index 9f2dcd48974d..9b90115319ce 100644 --- a/drivers/thunderbolt/domain.c +++ b/drivers/thunderbolt/domain.c @@ -20,6 +20,98 @@ static DEFINE_IDA(tb_domain_ida); +static bool match_service_id(const struct tb_service_id *id, + const struct tb_service *svc) +{ + if (id->match_flags & TBSVC_MATCH_PROTOCOL_KEY) { + if (strcmp(id->protocol_key, svc->key)) + return false; + } + + if (id->match_flags & TBSVC_MATCH_PROTOCOL_ID) { + if (id->protocol_id != svc->prtcid) + return false; + } + + if (id->match_flags & TBSVC_MATCH_PROTOCOL_VERSION) { + if (id->protocol_version != svc->prtcvers) + return false; + } + + if (id->match_flags & TBSVC_MATCH_PROTOCOL_VERSION) { + if (id->protocol_revision != svc->prtcrevs) + return false; + } + + return true; +} + +static const struct tb_service_id *__tb_service_match(struct device *dev, + struct device_driver *drv) +{ + struct tb_service_driver *driver; + const struct tb_service_id *ids; + struct tb_service *svc; + + svc = tb_to_service(dev); + if (!svc) + return NULL; + + driver = container_of(drv, struct tb_service_driver, driver); + if (!driver->id_table) + return NULL; + + for (ids = driver->id_table; ids->match_flags != 0; ids++) { + if (match_service_id(ids, svc)) + return ids; + } + + return NULL; +} + +static int tb_service_match(struct device *dev, struct device_driver *drv) +{ + return !!__tb_service_match(dev, drv); +} + +static int tb_service_probe(struct device *dev) +{ + struct tb_service *svc = tb_to_service(dev); + struct tb_service_driver *driver; + const struct tb_service_id *id; + + driver = container_of(dev->driver, struct tb_service_driver, driver); + id = __tb_service_match(dev, &driver->driver); + + return driver->probe(svc, id); +} + +static int tb_service_remove(struct device *dev) +{ + struct tb_service *svc = tb_to_service(dev); + struct tb_service_driver *driver; + + driver = container_of(dev->driver, struct tb_service_driver, driver); + if (driver->remove) + driver->remove(svc); + + return 0; +} + +static void tb_service_shutdown(struct device *dev) +{ + struct tb_service_driver *driver; + struct tb_service *svc; + + svc = tb_to_service(dev); + if (!svc || !dev->driver) + return; + + driver = container_of(dev->driver, struct tb_service_driver, driver); + if (driver->shutdown) + driver->shutdown(svc); +} + static const char * const tb_security_names[] = { [TB_SECURITY_NONE] = "none", [TB_SECURITY_USER] = "user", @@ -52,6 +144,10 @@ static const struct attribute_group *domain_attr_groups[] = { struct bus_type tb_bus_type = { .name = "thunderbolt", + .match = tb_service_match, + .probe = tb_service_probe, + .remove = tb_service_remove, + .shutdown = tb_service_shutdown, }; static void tb_domain_release(struct device *dev) @@ -128,17 +224,26 @@ err_free: return NULL; } -static void tb_domain_event_cb(void *data, enum tb_cfg_pkg_type type, +static bool tb_domain_event_cb(void *data, enum tb_cfg_pkg_type type, const void *buf, size_t size) { struct tb *tb = data; if (!tb->cm_ops->handle_event) { tb_warn(tb, "domain does not have event handler\n"); - return; + return true; } - tb->cm_ops->handle_event(tb, type, buf, size); + switch (type) { + case TB_CFG_PKG_XDOMAIN_REQ: + case TB_CFG_PKG_XDOMAIN_RESP: + return tb_xdomain_handle_request(tb, type, buf, size); + + default: + tb->cm_ops->handle_event(tb, type, buf, size); + } + + return true; } /** @@ -443,9 +548,92 @@ int 
tb_domain_disconnect_pcie_paths(struct tb *tb) { if (!tb->cm_ops->disconnect_pcie_paths) return -ENOTSUPP; return tb->cm_ops->disconnect_pcie_paths(tb); } +/** + * tb_domain_approve_xdomain_paths() - Enable DMA paths for XDomain + * @tb: Domain enabling the DMA paths + * @xd: XDomain the DMA paths are created to + * + * Calls connection manager specific method to enable DMA paths to the + * XDomain in question. + * + * Return: %0 in case of success and negative errno otherwise. In + * particular returns %-ENOTSUPP if the connection manager + * implementation does not support XDomains. + */ +int tb_domain_approve_xdomain_paths(struct tb *tb, struct tb_xdomain *xd) +{ + if (!tb->cm_ops->approve_xdomain_paths) + return -ENOTSUPP; + + return tb->cm_ops->approve_xdomain_paths(tb, xd); +} + +/** + * tb_domain_disconnect_xdomain_paths() - Disable DMA paths for XDomain + * @tb: Domain disabling the DMA paths + * @xd: XDomain whose DMA paths are disconnected + * + * Calls connection manager specific method to disconnect DMA paths to + * the XDomain in question. + * + * Return: %0 in case of success and negative errno otherwise. In + * particular returns %-ENOTSUPP if the connection manager + * implementation does not support XDomains. + */ +int tb_domain_disconnect_xdomain_paths(struct tb *tb, struct tb_xdomain *xd) +{ + if (!tb->cm_ops->disconnect_xdomain_paths) + return -ENOTSUPP; + + return tb->cm_ops->disconnect_xdomain_paths(tb, xd); +} + +static int disconnect_xdomain(struct device *dev, void *data) +{ + struct tb_xdomain *xd; + struct tb *tb = data; + int ret = 0; + + xd = tb_to_xdomain(dev); + if (xd && xd->tb == tb) + ret = tb_xdomain_disable_paths(xd); + + return ret; +} + +/** + * tb_domain_disconnect_all_paths() - Disconnect all paths for the domain + * @tb: Domain whose paths are disconnected + * + * This function can be used to disconnect all paths (PCIe, XDomain) for + * example in preparation for a host NVM firmware upgrade. After this is + * called the paths cannot be established without resetting the switch. + * + * Return: %0 in case of success and negative errno otherwise.
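These helpers only dispatch to the connection manager; a CM implementation opts in by filling the two new tb_cm_ops hooks added later in this patch (my_approve_xdomain_paths() and my_disconnect_xdomain_paths() are hypothetical callback names):

static const struct tb_cm_ops my_cm_ops = {
	/* ... existing hooks ... */
	.approve_xdomain_paths = my_approve_xdomain_paths,
	.disconnect_xdomain_paths = my_disconnect_xdomain_paths,
};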
+ */ +int tb_domain_disconnect_all_paths(struct tb *tb) +{ + int ret; + + ret = tb_domain_disconnect_pcie_paths(tb); + if (ret) + return ret; + + return bus_for_each_dev(&tb_bus_type, NULL, tb, disconnect_xdomain); +} + int tb_domain_init(void) { - return bus_register(&tb_bus_type); + int ret; + + ret = tb_xdomain_init(); + if (ret) + return ret; + ret = bus_register(&tb_bus_type); + if (ret) + tb_xdomain_exit(); + + return ret; } void tb_domain_exit(void) @@ -453,4 +641,5 @@ void tb_domain_exit(void) bus_unregister(&tb_bus_type); ida_destroy(&tb_domain_ida); tb_switch_exit(); + tb_xdomain_exit(); } diff --git a/drivers/thunderbolt/icm.c b/drivers/thunderbolt/icm.c index 8c22b91ed040..ab02d13f40b7 100644 --- a/drivers/thunderbolt/icm.c +++ b/drivers/thunderbolt/icm.c @@ -60,6 +60,8 @@ * @get_route: Find a route string for given switch * @device_connected: Handle device connected ICM message * @device_disconnected: Handle device disconnected ICM message + * @xdomain_connected - Handle XDomain connected ICM message + * @xdomain_disconnected - Handle XDomain disconnected ICM message */ struct icm { struct mutex request_lock; @@ -74,6 +76,10 @@ struct icm { const struct icm_pkg_header *hdr); void (*device_disconnected)(struct tb *tb, const struct icm_pkg_header *hdr); + void (*xdomain_connected)(struct tb *tb, + const struct icm_pkg_header *hdr); + void (*xdomain_disconnected)(struct tb *tb, + const struct icm_pkg_header *hdr); }; struct icm_notification { @@ -89,7 +95,10 @@ static inline struct tb *icm_to_tb(struct icm *icm) static inline u8 phy_port_from_route(u64 route, u8 depth) { - return tb_phy_port_from_link(route >> ((depth - 1) * 8)); + u8 link; + + link = depth ? route >> ((depth - 1) * 8) : route; + return tb_phy_port_from_link(link); } static inline u8 dual_link_from_link(u8 link) @@ -320,6 +329,51 @@ static int icm_fr_challenge_switch_key(struct tb *tb, struct tb_switch *sw, return 0; } +static int icm_fr_approve_xdomain_paths(struct tb *tb, struct tb_xdomain *xd) +{ + struct icm_fr_pkg_approve_xdomain_response reply; + struct icm_fr_pkg_approve_xdomain request; + int ret; + + memset(&request, 0, sizeof(request)); + request.hdr.code = ICM_APPROVE_XDOMAIN; + request.link_info = xd->depth << ICM_LINK_INFO_DEPTH_SHIFT | xd->link; + memcpy(&request.remote_uuid, xd->remote_uuid, sizeof(*xd->remote_uuid)); + + request.transmit_path = xd->transmit_path; + request.transmit_ring = xd->transmit_ring; + request.receive_path = xd->receive_path; + request.receive_ring = xd->receive_ring; + + memset(&reply, 0, sizeof(reply)); + ret = icm_request(tb, &request, sizeof(request), &reply, sizeof(reply), + 1, ICM_TIMEOUT); + if (ret) + return ret; + + if (reply.hdr.flags & ICM_FLAGS_ERROR) + return -EIO; + + return 0; +} + +static int icm_fr_disconnect_xdomain_paths(struct tb *tb, struct tb_xdomain *xd) +{ + u8 phy_port; + u8 cmd; + + phy_port = tb_phy_port_from_link(xd->link); + if (phy_port == 0) + cmd = NHI_MAILBOX_DISCONNECT_PA; + else + cmd = NHI_MAILBOX_DISCONNECT_PB; + + nhi_mailbox_cmd(tb->nhi, cmd, 1); + usleep_range(10, 50); + nhi_mailbox_cmd(tb->nhi, cmd, 2); + return 0; +} + static void remove_switch(struct tb_switch *sw) { struct tb_switch *parent_sw; @@ -475,6 +529,141 @@ icm_fr_device_disconnected(struct tb *tb, const struct icm_pkg_header *hdr) tb_switch_put(sw); } +static void remove_xdomain(struct tb_xdomain *xd) +{ + struct tb_switch *sw; + + sw = tb_to_switch(xd->dev.parent); + tb_port_at(xd->route, sw)->xdomain = NULL; + tb_xdomain_remove(xd); +} + +static void 
+icm_fr_xdomain_connected(struct tb *tb, const struct icm_pkg_header *hdr) +{ + const struct icm_fr_event_xdomain_connected *pkg = + (const struct icm_fr_event_xdomain_connected *)hdr; + struct tb_xdomain *xd; + struct tb_switch *sw; + u8 link, depth; + bool approved; + u64 route; + + /* + * After NVM upgrade adding root switch device fails because we + * initiated reset. During that time ICM might still send + * XDomain connected message which we ignore here. + */ + if (!tb->root_switch) + return; + + link = pkg->link_info & ICM_LINK_INFO_LINK_MASK; + depth = (pkg->link_info & ICM_LINK_INFO_DEPTH_MASK) >> + ICM_LINK_INFO_DEPTH_SHIFT; + approved = pkg->link_info & ICM_LINK_INFO_APPROVED; + + if (link > ICM_MAX_LINK || depth > ICM_MAX_DEPTH) { + tb_warn(tb, "invalid topology %u.%u, ignoring\n", link, depth); + return; + } + + route = get_route(pkg->local_route_hi, pkg->local_route_lo); + + xd = tb_xdomain_find_by_uuid(tb, &pkg->remote_uuid); + if (xd) { + u8 xd_phy_port, phy_port; + + xd_phy_port = phy_port_from_route(xd->route, xd->depth); + phy_port = phy_port_from_route(route, depth); + + if (xd->depth == depth && xd_phy_port == phy_port) { + xd->link = link; + xd->route = route; + xd->is_unplugged = false; + tb_xdomain_put(xd); + return; + } + + /* + * If we find an existing XDomain connection remove it + * now. We need to go through login handshake and + * everything anyway to be able to re-establish the + * connection. + */ + remove_xdomain(xd); + tb_xdomain_put(xd); + } + + /* + * Look if there already exists an XDomain in the same place + * than the new one and in that case remove it because it is + * most likely another host that got disconnected. + */ + xd = tb_xdomain_find_by_link_depth(tb, link, depth); + if (!xd) { + u8 dual_link; + + dual_link = dual_link_from_link(link); + if (dual_link) + xd = tb_xdomain_find_by_link_depth(tb, dual_link, + depth); + } + if (xd) { + remove_xdomain(xd); + tb_xdomain_put(xd); + } + + /* + * If the user disconnected a switch during suspend and + * connected another host to the same port, remove the switch + * first. + */ + sw = get_switch_at_route(tb->root_switch, route); + if (sw) + remove_switch(sw); + + sw = tb_switch_find_by_link_depth(tb, link, depth); + if (!sw) { + tb_warn(tb, "no switch exists at %u.%u, ignoring\n", link, + depth); + return; + } + + xd = tb_xdomain_alloc(sw->tb, &sw->dev, route, + &pkg->local_uuid, &pkg->remote_uuid); + if (!xd) { + tb_switch_put(sw); + return; + } + + xd->link = link; + xd->depth = depth; + + tb_port_at(route, sw)->xdomain = xd; + + tb_xdomain_add(xd); + tb_switch_put(sw); +} + +static void +icm_fr_xdomain_disconnected(struct tb *tb, const struct icm_pkg_header *hdr) +{ + const struct icm_fr_event_xdomain_disconnected *pkg = + (const struct icm_fr_event_xdomain_disconnected *)hdr; + struct tb_xdomain *xd; + + /* + * If the connection is through one or multiple devices, the + * XDomain device is removed along with them so it is fine if we + * cannot find it here. 
+ */ + xd = tb_xdomain_find_by_uuid(tb, &pkg->remote_uuid); + if (xd) { + remove_xdomain(xd); + tb_xdomain_put(xd); + } +} + static struct pci_dev *get_upstream_port(struct pci_dev *pdev) { struct pci_dev *parent; @@ -594,6 +783,12 @@ static void icm_handle_notification(struct work_struct *work) case ICM_EVENT_DEVICE_DISCONNECTED: icm->device_disconnected(tb, n->pkg); break; + case ICM_EVENT_XDOMAIN_CONNECTED: + icm->xdomain_connected(tb, n->pkg); + break; + case ICM_EVENT_XDOMAIN_DISCONNECTED: + icm->xdomain_disconnected(tb, n->pkg); + break; } mutex_unlock(&tb->lock); @@ -927,6 +1122,10 @@ static void icm_unplug_children(struct tb_switch *sw) if (tb_is_upstream_port(port)) continue; + if (port->xdomain) { + port->xdomain->is_unplugged = true; + continue; + } if (!port->remote) continue; @@ -943,6 +1142,13 @@ static void icm_free_unplugged_children(struct tb_switch *sw) if (tb_is_upstream_port(port)) continue; + + if (port->xdomain && port->xdomain->is_unplugged) { + tb_xdomain_remove(port->xdomain); + port->xdomain = NULL; + continue; + } + if (!port->remote) continue; @@ -1009,8 +1215,10 @@ static int icm_start(struct tb *tb) tb->root_switch->no_nvm_upgrade = x86_apple_machine; ret = tb_switch_add(tb->root_switch); - if (ret) + if (ret) { tb_switch_put(tb->root_switch); + tb->root_switch = NULL; + } return ret; } @@ -1042,6 +1250,8 @@ static const struct tb_cm_ops icm_fr_ops = { .add_switch_key = icm_fr_add_switch_key, .challenge_switch_key = icm_fr_challenge_switch_key, .disconnect_pcie_paths = icm_disconnect_pcie_paths, + .approve_xdomain_paths = icm_fr_approve_xdomain_paths, + .disconnect_xdomain_paths = icm_fr_disconnect_xdomain_paths, }; struct tb *icm_probe(struct tb_nhi *nhi) @@ -1064,6 +1274,8 @@ struct tb *icm_probe(struct tb_nhi *nhi) icm->get_route = icm_fr_get_route; icm->device_connected = icm_fr_device_connected; icm->device_disconnected = icm_fr_device_disconnected; + icm->xdomain_connected = icm_fr_xdomain_connected; + icm->xdomain_disconnected = icm_fr_xdomain_disconnected; tb->cm_ops = &icm_fr_ops; break; @@ -1077,6 +1289,8 @@ struct tb *icm_probe(struct tb_nhi *nhi) icm->get_route = icm_ar_get_route; icm->device_connected = icm_fr_device_connected; icm->device_disconnected = icm_fr_device_disconnected; + icm->xdomain_connected = icm_fr_xdomain_connected; + icm->xdomain_disconnected = icm_fr_xdomain_disconnected; tb->cm_ops = &icm_fr_ops; break; } diff --git a/drivers/thunderbolt/nhi.h b/drivers/thunderbolt/nhi.h index 5b5bb2c436be..0e05828983db 100644 --- a/drivers/thunderbolt/nhi.h +++ b/drivers/thunderbolt/nhi.h @@ -157,6 +157,8 @@ enum nhi_mailbox_cmd { NHI_MAILBOX_SAVE_DEVS = 0x05, NHI_MAILBOX_DISCONNECT_PCIE_PATHS = 0x06, NHI_MAILBOX_DRV_UNLOADS = 0x07, + NHI_MAILBOX_DISCONNECT_PA = 0x10, + NHI_MAILBOX_DISCONNECT_PB = 0x11, NHI_MAILBOX_ALLOW_ALL_DEVS = 0x23, }; diff --git a/drivers/thunderbolt/switch.c b/drivers/thunderbolt/switch.c index 53f40c57df59..dfc357d33e1e 100644 --- a/drivers/thunderbolt/switch.c +++ b/drivers/thunderbolt/switch.c @@ -171,11 +171,11 @@ static int nvm_authenticate_host(struct tb_switch *sw) /* * Root switch NVM upgrade requires that we disconnect the - * existing PCIe paths first (in case it is not in safe mode + * existing paths first (in case it is not in safe mode * already). 
*/ if (!sw->safe_mode) { - ret = tb_domain_disconnect_pcie_paths(sw->tb); + ret = tb_domain_disconnect_all_paths(sw->tb); if (ret) return ret; /* @@ -1363,6 +1363,9 @@ void tb_switch_remove(struct tb_switch *sw) if (sw->ports[i].remote) tb_switch_remove(sw->ports[i].remote->sw); sw->ports[i].remote = NULL; + if (sw->ports[i].xdomain) + tb_xdomain_remove(sw->ports[i].xdomain); + sw->ports[i].xdomain = NULL; } if (!sw->is_unplugged) diff --git a/drivers/thunderbolt/tb.h b/drivers/thunderbolt/tb.h index ea21d927bd09..74af9d4929ab 100644 --- a/drivers/thunderbolt/tb.h +++ b/drivers/thunderbolt/tb.h @@ -9,6 +9,7 @@ #include #include +#include #include #include "tb_regs.h" @@ -109,14 +110,25 @@ struct tb_switch { /** * struct tb_port - a thunderbolt port, part of a tb_switch + * @config: Cached port configuration read from registers + * @sw: Switch the port belongs to + * @remote: Remote port (%NULL if not connected) + * @xdomain: Remote host (%NULL if not connected) + * @cap_phy: Offset, zero if not found + * @port: Port number on switch + * @disabled: Disabled by eeprom + * @dual_link_port: If the switch is connected using two ports, points + * to the other port. + * @link_nr: Is this primary or secondary port on the dual_link. */ struct tb_port { struct tb_regs_port_header config; struct tb_switch *sw; - struct tb_port *remote; /* remote port, NULL if not connected */ - int cap_phy; /* offset, zero if not found */ - u8 port; /* port number on switch */ - bool disabled; /* disabled by eeprom */ + struct tb_port *remote; + struct tb_xdomain *xdomain; + int cap_phy; + u8 port; + bool disabled; struct tb_port *dual_link_port; u8 link_nr:1; }; @@ -189,6 +201,8 @@ struct tb_path { * @add_switch_key: Add key to switch * @challenge_switch_key: Challenge switch using key * @disconnect_pcie_paths: Disconnects PCIe paths before NVM update + * @approve_xdomain_paths: Approve (establish) XDomain DMA paths + * @disconnect_xdomain_paths: Disconnect XDomain DMA paths */ struct tb_cm_ops { int (*driver_ready)(struct tb *tb); @@ -205,6 +219,8 @@ struct tb_cm_ops { int (*challenge_switch_key)(struct tb *tb, struct tb_switch *sw, const u8 *challenge, u8 *response); int (*disconnect_pcie_paths)(struct tb *tb); + int (*approve_xdomain_paths)(struct tb *tb, struct tb_xdomain *xd); + int (*disconnect_xdomain_paths)(struct tb *tb, struct tb_xdomain *xd); }; static inline void *tb_priv(struct tb *tb) @@ -331,6 +347,8 @@ extern struct device_type tb_switch_type; int tb_domain_init(void); void tb_domain_exit(void); void tb_switch_exit(void); +int tb_xdomain_init(void); +void tb_xdomain_exit(void); struct tb *tb_domain_alloc(struct tb_nhi *nhi, size_t privsize); int tb_domain_add(struct tb *tb); @@ -343,6 +361,9 @@ int tb_domain_approve_switch(struct tb *tb, struct tb_switch *sw); int tb_domain_approve_switch_key(struct tb *tb, struct tb_switch *sw); int tb_domain_challenge_switch_key(struct tb *tb, struct tb_switch *sw); int tb_domain_disconnect_pcie_paths(struct tb *tb); +int tb_domain_approve_xdomain_paths(struct tb *tb, struct tb_xdomain *xd); +int tb_domain_disconnect_xdomain_paths(struct tb *tb, struct tb_xdomain *xd); +int tb_domain_disconnect_all_paths(struct tb *tb); static inline void tb_domain_put(struct tb *tb) { @@ -422,4 +443,14 @@ static inline u64 tb_downstream_route(struct tb_port *port) | ((u64) port->port << (port->sw->config.depth * 8)); } +bool tb_xdomain_handle_request(struct tb *tb, enum tb_cfg_pkg_type type, + const void *buf, size_t size); +struct tb_xdomain *tb_xdomain_alloc(struct tb *tb, 
struct device *parent, + u64 route, const uuid_t *local_uuid, + const uuid_t *remote_uuid); +void tb_xdomain_add(struct tb_xdomain *xd); +void tb_xdomain_remove(struct tb_xdomain *xd); +struct tb_xdomain *tb_xdomain_find_by_link_depth(struct tb *tb, u8 link, + u8 depth); + #endif diff --git a/drivers/thunderbolt/tb_msgs.h b/drivers/thunderbolt/tb_msgs.h index f2b2550cd97c..b0a092baa605 100644 --- a/drivers/thunderbolt/tb_msgs.h +++ b/drivers/thunderbolt/tb_msgs.h @@ -101,11 +101,14 @@ enum icm_pkg_code { ICM_CHALLENGE_DEVICE = 0x5, ICM_ADD_DEVICE_KEY = 0x6, ICM_GET_ROUTE = 0xa, + ICM_APPROVE_XDOMAIN = 0x10, }; enum icm_event_code { ICM_EVENT_DEVICE_CONNECTED = 3, ICM_EVENT_DEVICE_DISCONNECTED = 4, + ICM_EVENT_XDOMAIN_CONNECTED = 6, + ICM_EVENT_XDOMAIN_DISCONNECTED = 7, }; struct icm_pkg_header { @@ -188,6 +191,25 @@ struct icm_fr_event_device_disconnected { u16 link_info; }; +struct icm_fr_event_xdomain_connected { + struct icm_pkg_header hdr; + u16 reserved; + u16 link_info; + uuid_t remote_uuid; + uuid_t local_uuid; + u32 local_route_hi; + u32 local_route_lo; + u32 remote_route_hi; + u32 remote_route_lo; +}; + +struct icm_fr_event_xdomain_disconnected { + struct icm_pkg_header hdr; + u16 reserved; + u16 link_info; + uuid_t remote_uuid; +}; + struct icm_fr_pkg_add_device_key { struct icm_pkg_header hdr; uuid_t ep_uuid; @@ -224,6 +246,28 @@ struct icm_fr_pkg_challenge_device_response { u32 response[8]; }; +struct icm_fr_pkg_approve_xdomain { + struct icm_pkg_header hdr; + u16 reserved; + u16 link_info; + uuid_t remote_uuid; + u16 transmit_path; + u16 transmit_ring; + u16 receive_path; + u16 receive_ring; +}; + +struct icm_fr_pkg_approve_xdomain_response { + struct icm_pkg_header hdr; + u16 reserved; + u16 link_info; + uuid_t remote_uuid; + u16 transmit_path; + u16 transmit_ring; + u16 receive_path; + u16 receive_ring; +}; + /* Alpine Ridge only messages */ struct icm_ar_pkg_get_route { @@ -240,4 +284,83 @@ struct icm_ar_pkg_get_route_response { u32 route_lo; }; +/* XDomain messages */ + +struct tb_xdomain_header { + u32 route_hi; + u32 route_lo; + u32 length_sn; +}; + +#define TB_XDOMAIN_LENGTH_MASK GENMASK(5, 0) +#define TB_XDOMAIN_SN_MASK GENMASK(28, 27) +#define TB_XDOMAIN_SN_SHIFT 27 + +enum tb_xdp_type { + UUID_REQUEST_OLD = 1, + UUID_RESPONSE = 2, + PROPERTIES_REQUEST, + PROPERTIES_RESPONSE, + PROPERTIES_CHANGED_REQUEST, + PROPERTIES_CHANGED_RESPONSE, + ERROR_RESPONSE, + UUID_REQUEST = 12, +}; + +struct tb_xdp_header { + struct tb_xdomain_header xd_hdr; + uuid_t uuid; + u32 type; +}; + +struct tb_xdp_properties { + struct tb_xdp_header hdr; + uuid_t src_uuid; + uuid_t dst_uuid; + u16 offset; + u16 reserved; +}; + +struct tb_xdp_properties_response { + struct tb_xdp_header hdr; + uuid_t src_uuid; + uuid_t dst_uuid; + u16 offset; + u16 data_length; + u32 generation; + u32 data[0]; +}; + +/* + * Max length of data array single XDomain property response is allowed + * to carry. 
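+ * (A control channel frame is at most 256 bytes; subtracting the 4
+ * bytes that are presumably the frame CRC and the size of the
+ * response structure itself, then dividing by four, gives the number
+ * of property dwords that fit in a single response.)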
+ */ +#define TB_XDP_PROPERTIES_MAX_DATA_LENGTH \ + (((256 - 4 - sizeof(struct tb_xdp_properties_response))) / 4) + +/* Maximum size of the total property block in dwords we allow */ +#define TB_XDP_PROPERTIES_MAX_LENGTH 500 + +struct tb_xdp_properties_changed { + struct tb_xdp_header hdr; + uuid_t src_uuid; +}; + +struct tb_xdp_properties_changed_response { + struct tb_xdp_header hdr; +}; + +enum tb_xdp_error { + ERROR_SUCCESS, + ERROR_UNKNOWN_PACKET, + ERROR_UNKNOWN_DOMAIN, + ERROR_NOT_SUPPORTED, + ERROR_NOT_READY, +}; + +struct tb_xdp_error_response { + struct tb_xdp_header hdr; + u32 error; +}; + #endif diff --git a/drivers/thunderbolt/xdomain.c b/drivers/thunderbolt/xdomain.c new file mode 100644 index 000000000000..f2d06f6f7be9 --- /dev/null +++ b/drivers/thunderbolt/xdomain.c @@ -0,0 +1,1576 @@ +/* + * Thunderbolt XDomain discovery protocol support + * + * Copyright (C) 2017, Intel Corporation + * Authors: Michael Jamet + * Mika Westerberg + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include + +#include "tb.h" + +#define XDOMAIN_DEFAULT_TIMEOUT 5000 /* ms */ +#define XDOMAIN_PROPERTIES_RETRIES 60 +#define XDOMAIN_PROPERTIES_CHANGED_RETRIES 10 + +struct xdomain_request_work { + struct work_struct work; + struct tb_xdp_header *pkg; + struct tb *tb; +}; + +/* Serializes access to the properties and protocol handlers below */ +static DEFINE_MUTEX(xdomain_lock); + +/* Properties exposed to the remote domains */ +static struct tb_property_dir *xdomain_property_dir; +static u32 *xdomain_property_block; +static u32 xdomain_property_block_len; +static u32 xdomain_property_block_gen; + +/* Additional protocol handlers */ +static LIST_HEAD(protocol_handlers); + +/* UUID for XDomain discovery protocol: b638d70e-42ff-40bb-97c2-90e2c0b2ff07 */ +static const uuid_t tb_xdp_uuid = + UUID_INIT(0xb638d70e, 0x42ff, 0x40bb, + 0x97, 0xc2, 0x90, 0xe2, 0xc0, 0xb2, 0xff, 0x07); + +static bool tb_xdomain_match(const struct tb_cfg_request *req, + const struct ctl_pkg *pkg) +{ + switch (pkg->frame.eof) { + case TB_CFG_PKG_ERROR: + return true; + + case TB_CFG_PKG_XDOMAIN_RESP: { + const struct tb_xdp_header *res_hdr = pkg->buffer; + const struct tb_xdp_header *req_hdr = req->request; + u8 req_seq, res_seq; + + if (pkg->frame.size < req->response_size / 4) + return false; + + /* Make sure route matches */ + if ((res_hdr->xd_hdr.route_hi & ~BIT(31)) != + req_hdr->xd_hdr.route_hi) + return false; + if ((res_hdr->xd_hdr.route_lo) != req_hdr->xd_hdr.route_lo) + return false; + + /* Then check that the sequence number matches */ + res_seq = res_hdr->xd_hdr.length_sn & TB_XDOMAIN_SN_MASK; + res_seq >>= TB_XDOMAIN_SN_SHIFT; + req_seq = req_hdr->xd_hdr.length_sn & TB_XDOMAIN_SN_MASK; + req_seq >>= TB_XDOMAIN_SN_SHIFT; + if (res_seq != req_seq) + return false; + + /* Check that the XDomain protocol matches */ + if (!uuid_equal(&res_hdr->uuid, &req_hdr->uuid)) + return false; + + return true; + } + + default: + return false; + } +} + +static bool tb_xdomain_copy(struct tb_cfg_request *req, + const struct ctl_pkg *pkg) +{ + memcpy(req->response, pkg->buffer, req->response_size); + req->result.err = 0; + return true; +} + +static void response_ready(void *data) +{ + tb_cfg_request_put(data); +} + +static int __tb_xdomain_response(struct tb_ctl *ctl, const void *response, + size_t size, enum tb_cfg_pkg_type type) +{ + 
struct tb_cfg_request *req;
+
+ req = tb_cfg_request_alloc();
+ if (!req)
+ return -ENOMEM;
+
+ req->match = tb_xdomain_match;
+ req->copy = tb_xdomain_copy;
+ req->request = response;
+ req->request_size = size;
+ req->request_type = type;
+
+ return tb_cfg_request(ctl, req, response_ready, req);
+}
+
+/**
+ * tb_xdomain_response() - Send an XDomain response message
+ * @xd: XDomain to send the message
+ * @response: Response to send
+ * @size: Size of the response
+ * @type: PDF type of the response
+ *
+ * This can be used to send an XDomain response message to the other
+ * domain. No response for the message is expected.
+ *
+ * Return: %0 in case of success and negative errno in case of failure
+ */
+int tb_xdomain_response(struct tb_xdomain *xd, const void *response,
+ size_t size, enum tb_cfg_pkg_type type)
+{
+ return __tb_xdomain_response(xd->tb->ctl, response, size, type);
+}
+EXPORT_SYMBOL_GPL(tb_xdomain_response);
+
+static int __tb_xdomain_request(struct tb_ctl *ctl, const void *request,
+ size_t request_size, enum tb_cfg_pkg_type request_type, void *response,
+ size_t response_size, enum tb_cfg_pkg_type response_type,
+ unsigned int timeout_msec)
+{
+ struct tb_cfg_request *req;
+ struct tb_cfg_result res;
+
+ req = tb_cfg_request_alloc();
+ if (!req)
+ return -ENOMEM;
+
+ req->match = tb_xdomain_match;
+ req->copy = tb_xdomain_copy;
+ req->request = request;
+ req->request_size = request_size;
+ req->request_type = request_type;
+ req->response = response;
+ req->response_size = response_size;
+ req->response_type = response_type;
+
+ res = tb_cfg_request_sync(ctl, req, timeout_msec);
+
+ tb_cfg_request_put(req);
+
+ return res.err == 1 ? -EIO : res.err;
+}
+
+/**
+ * tb_xdomain_request() - Send an XDomain request
+ * @xd: XDomain to send the request
+ * @request: Request to send
+ * @request_size: Size of the request in bytes
+ * @request_type: PDF type of the request
+ * @response: Response is copied here
+ * @response_size: Expected size of the response in bytes
+ * @response_type: Expected PDF type of the response
+ * @timeout_msec: Timeout in milliseconds to wait for the response
+ *
+ * This function can be used to send XDomain control channel messages to
+ * the other domain. The function waits until the response is received
+ * or the timeout triggers, whichever comes first.
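+ *
+ * A minimal call sketch from a hypothetical service driver; struct
+ * my_req and struct my_res are illustrative and not part of this
+ * patch:
+ *
+ * struct my_req req = { 0 };
+ * struct my_res res;
+ * int ret;
+ *
+ * ret = tb_xdomain_request(xd, &req, sizeof(req),
+ * TB_CFG_PKG_XDOMAIN_REQ,
+ * &res, sizeof(res),
+ * TB_CFG_PKG_XDOMAIN_RESP, 5000);
+ * if (ret)
+ * return ret;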
+ * + * Return: %0 in case of success and negative errno in case of failure + */ +int tb_xdomain_request(struct tb_xdomain *xd, const void *request, + size_t request_size, enum tb_cfg_pkg_type request_type, + void *response, size_t response_size, + enum tb_cfg_pkg_type response_type, unsigned int timeout_msec) +{ + return __tb_xdomain_request(xd->tb->ctl, request, request_size, + request_type, response, response_size, + response_type, timeout_msec); +} +EXPORT_SYMBOL_GPL(tb_xdomain_request); + +static inline void tb_xdp_fill_header(struct tb_xdp_header *hdr, u64 route, + u8 sequence, enum tb_xdp_type type, size_t size) +{ + u32 length_sn; + + length_sn = (size - sizeof(hdr->xd_hdr)) / 4; + length_sn |= (sequence << TB_XDOMAIN_SN_SHIFT) & TB_XDOMAIN_SN_MASK; + + hdr->xd_hdr.route_hi = upper_32_bits(route); + hdr->xd_hdr.route_lo = lower_32_bits(route); + hdr->xd_hdr.length_sn = length_sn; + hdr->type = type; + memcpy(&hdr->uuid, &tb_xdp_uuid, sizeof(tb_xdp_uuid)); +} + +static int tb_xdp_handle_error(const struct tb_xdp_header *hdr) +{ + const struct tb_xdp_error_response *error; + + if (hdr->type != ERROR_RESPONSE) + return 0; + + error = (const struct tb_xdp_error_response *)hdr; + + switch (error->error) { + case ERROR_UNKNOWN_PACKET: + case ERROR_UNKNOWN_DOMAIN: + return -EIO; + case ERROR_NOT_SUPPORTED: + return -ENOTSUPP; + case ERROR_NOT_READY: + return -EAGAIN; + default: + break; + } + + return 0; +} + +static int tb_xdp_error_response(struct tb_ctl *ctl, u64 route, u8 sequence, + enum tb_xdp_error error) +{ + struct tb_xdp_error_response res; + + memset(&res, 0, sizeof(res)); + tb_xdp_fill_header(&res.hdr, route, sequence, ERROR_RESPONSE, + sizeof(res)); + res.error = error; + + return __tb_xdomain_response(ctl, &res, sizeof(res), + TB_CFG_PKG_XDOMAIN_RESP); +} + +static int tb_xdp_properties_request(struct tb_ctl *ctl, u64 route, + const uuid_t *src_uuid, const uuid_t *dst_uuid, int retry, + u32 **block, u32 *generation) +{ + struct tb_xdp_properties_response *res; + struct tb_xdp_properties req; + u16 data_len, len; + size_t total_size; + u32 *data = NULL; + int ret; + + total_size = sizeof(*res) + TB_XDP_PROPERTIES_MAX_DATA_LENGTH * 4; + res = kzalloc(total_size, GFP_KERNEL); + if (!res) + return -ENOMEM; + + memset(&req, 0, sizeof(req)); + tb_xdp_fill_header(&req.hdr, route, retry % 4, PROPERTIES_REQUEST, + sizeof(req)); + memcpy(&req.src_uuid, src_uuid, sizeof(*src_uuid)); + memcpy(&req.dst_uuid, dst_uuid, sizeof(*dst_uuid)); + + len = 0; + data_len = 0; + + do { + ret = __tb_xdomain_request(ctl, &req, sizeof(req), + TB_CFG_PKG_XDOMAIN_REQ, res, + total_size, TB_CFG_PKG_XDOMAIN_RESP, + XDOMAIN_DEFAULT_TIMEOUT); + if (ret) + goto err; + + ret = tb_xdp_handle_error(&res->hdr); + if (ret) + goto err; + + /* + * Package length includes the whole payload without the + * XDomain header. Validate first that the package is at + * least size of the response structure. + */ + len = res->hdr.xd_hdr.length_sn & TB_XDOMAIN_LENGTH_MASK; + if (len < sizeof(*res) / 4) { + ret = -EINVAL; + goto err; + } + + len += sizeof(res->hdr.xd_hdr) / 4; + len -= sizeof(*res) / 4; + + if (res->offset != req.offset) { + ret = -EINVAL; + goto err; + } + + /* + * First time allocate block that has enough space for + * the whole properties block. 
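+ * Subsequent iterations then advance req.offset by the number of
+ * dwords received, so property blocks larger than one frame are
+ * fetched in TB_XDP_PROPERTIES_MAX_DATA_LENGTH sized chunks until
+ * data_length dwords have been copied.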
+ */ + if (!data) { + data_len = res->data_length; + if (data_len > TB_XDP_PROPERTIES_MAX_LENGTH) { + ret = -E2BIG; + goto err; + } + + data = kcalloc(data_len, sizeof(u32), GFP_KERNEL); + if (!data) { + ret = -ENOMEM; + goto err; + } + } + + memcpy(data + req.offset, res->data, len * 4); + req.offset += len; + } while (!data_len || req.offset < data_len); + + *block = data; + *generation = res->generation; + + kfree(res); + + return data_len; + +err: + kfree(data); + kfree(res); + + return ret; +} + +static int tb_xdp_properties_response(struct tb *tb, struct tb_ctl *ctl, + u64 route, u8 sequence, const uuid_t *src_uuid, + const struct tb_xdp_properties *req) +{ + struct tb_xdp_properties_response *res; + size_t total_size; + u16 len; + int ret; + + /* + * Currently we expect all requests to be directed to us. The + * protocol supports forwarding, though which we might add + * support later on. + */ + if (!uuid_equal(src_uuid, &req->dst_uuid)) { + tb_xdp_error_response(ctl, route, sequence, + ERROR_UNKNOWN_DOMAIN); + return 0; + } + + mutex_lock(&xdomain_lock); + + if (req->offset >= xdomain_property_block_len) { + mutex_unlock(&xdomain_lock); + return -EINVAL; + } + + len = xdomain_property_block_len - req->offset; + len = min_t(u16, len, TB_XDP_PROPERTIES_MAX_DATA_LENGTH); + total_size = sizeof(*res) + len * 4; + + res = kzalloc(total_size, GFP_KERNEL); + if (!res) { + mutex_unlock(&xdomain_lock); + return -ENOMEM; + } + + tb_xdp_fill_header(&res->hdr, route, sequence, PROPERTIES_RESPONSE, + total_size); + res->generation = xdomain_property_block_gen; + res->data_length = xdomain_property_block_len; + res->offset = req->offset; + uuid_copy(&res->src_uuid, src_uuid); + uuid_copy(&res->dst_uuid, &req->src_uuid); + memcpy(res->data, &xdomain_property_block[req->offset], len * 4); + + mutex_unlock(&xdomain_lock); + + ret = __tb_xdomain_response(ctl, res, total_size, + TB_CFG_PKG_XDOMAIN_RESP); + + kfree(res); + return ret; +} + +static int tb_xdp_properties_changed_request(struct tb_ctl *ctl, u64 route, + int retry, const uuid_t *uuid) +{ + struct tb_xdp_properties_changed_response res; + struct tb_xdp_properties_changed req; + int ret; + + memset(&req, 0, sizeof(req)); + tb_xdp_fill_header(&req.hdr, route, retry % 4, + PROPERTIES_CHANGED_REQUEST, sizeof(req)); + uuid_copy(&req.src_uuid, uuid); + + memset(&res, 0, sizeof(res)); + ret = __tb_xdomain_request(ctl, &req, sizeof(req), + TB_CFG_PKG_XDOMAIN_REQ, &res, sizeof(res), + TB_CFG_PKG_XDOMAIN_RESP, + XDOMAIN_DEFAULT_TIMEOUT); + if (ret) + return ret; + + return tb_xdp_handle_error(&res.hdr); +} + +static int +tb_xdp_properties_changed_response(struct tb_ctl *ctl, u64 route, u8 sequence) +{ + struct tb_xdp_properties_changed_response res; + + memset(&res, 0, sizeof(res)); + tb_xdp_fill_header(&res.hdr, route, sequence, + PROPERTIES_CHANGED_RESPONSE, sizeof(res)); + return __tb_xdomain_response(ctl, &res, sizeof(res), + TB_CFG_PKG_XDOMAIN_RESP); +} + +/** + * tb_register_protocol_handler() - Register protocol handler + * @handler: Handler to register + * + * This allows XDomain service drivers to hook into incoming XDomain + * messages. After this function is called the service driver needs to + * be able to handle calls to callback whenever a package with the + * registered protocol is received. 
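+ *
+ * A registration sketch; the UUID, callback and context names are
+ * made up for illustration, and the callback returns %1 once it has
+ * consumed a message:
+ *
+ * static struct tb_protocol_handler my_handler = {
+ * .uuid = &my_proto_uuid,
+ * .callback = my_callback,
+ * .data = &my_ctx,
+ * };
+ *
+ * ret = tb_register_protocol_handler(&my_handler);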
+ */
+int tb_register_protocol_handler(struct tb_protocol_handler *handler)
+{
+ if (!handler->uuid || !handler->callback)
+ return -EINVAL;
+ if (uuid_equal(handler->uuid, &tb_xdp_uuid))
+ return -EINVAL;
+
+ mutex_lock(&xdomain_lock);
+ list_add_tail(&handler->list, &protocol_handlers);
+ mutex_unlock(&xdomain_lock);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(tb_register_protocol_handler);
+
+/**
+ * tb_unregister_protocol_handler() - Unregister protocol handler
+ * @handler: Handler to unregister
+ *
+ * Removes the previously registered protocol handler.
+ */
+void tb_unregister_protocol_handler(struct tb_protocol_handler *handler)
+{
+ mutex_lock(&xdomain_lock);
+ list_del_init(&handler->list);
+ mutex_unlock(&xdomain_lock);
+}
+EXPORT_SYMBOL_GPL(tb_unregister_protocol_handler);
+
+static void tb_xdp_handle_request(struct work_struct *work)
+{
+ struct xdomain_request_work *xw = container_of(work, typeof(*xw), work);
+ const struct tb_xdp_header *pkg = xw->pkg;
+ const struct tb_xdomain_header *xhdr = &pkg->xd_hdr;
+ struct tb *tb = xw->tb;
+ struct tb_ctl *ctl = tb->ctl;
+ const uuid_t *uuid;
+ int ret = 0;
+ u8 sequence;
+ u64 route;
+
+ route = ((u64)xhdr->route_hi << 32 | xhdr->route_lo) & ~BIT_ULL(63);
+ sequence = xhdr->length_sn & TB_XDOMAIN_SN_MASK;
+ sequence >>= TB_XDOMAIN_SN_SHIFT;
+
+ mutex_lock(&tb->lock);
+ if (tb->root_switch)
+ uuid = tb->root_switch->uuid;
+ else
+ uuid = NULL;
+ mutex_unlock(&tb->lock);
+
+ if (!uuid) {
+ tb_xdp_error_response(ctl, route, sequence, ERROR_NOT_READY);
+ goto out;
+ }
+
+ switch (pkg->type) {
+ case PROPERTIES_REQUEST:
+ ret = tb_xdp_properties_response(tb, ctl, route, sequence, uuid,
+ (const struct tb_xdp_properties *)pkg);
+ break;
+
+ case PROPERTIES_CHANGED_REQUEST: {
+ const struct tb_xdp_properties_changed *xchg =
+ (const struct tb_xdp_properties_changed *)pkg;
+ struct tb_xdomain *xd;
+
+ ret = tb_xdp_properties_changed_response(ctl, route, sequence);
+
+ /*
+ * Since the properties have been changed, let's update
+ * the xdomain related to this connection as well in
+ * case there is a change in services it offers.
+ */
+ xd = tb_xdomain_find_by_uuid_locked(tb, &xchg->src_uuid);
+ if (xd) {
+ queue_delayed_work(tb->wq, &xd->get_properties_work,
+ msecs_to_jiffies(50));
+ tb_xdomain_put(xd);
+ }
+
+ break;
+ }
+
+ default:
+ break;
+ }
+
+ if (ret) {
+ tb_warn(tb, "failed to send XDomain response for %#x\n",
+ pkg->type);
+ }
+
+out:
+ kfree(xw->pkg);
+ kfree(xw);
+}
+
+static void
+tb_xdp_schedule_request(struct tb *tb, const struct tb_xdp_header *hdr,
+ size_t size)
+{
+ struct xdomain_request_work *xw;
+
+ xw = kmalloc(sizeof(*xw), GFP_KERNEL);
+ if (!xw)
+ return;
+
+ INIT_WORK(&xw->work, tb_xdp_handle_request);
+ xw->pkg = kmemdup(hdr, size, GFP_KERNEL);
+ if (!xw->pkg) {
+ /* Without the copied packet there is nothing to handle */
+ kfree(xw);
+ return;
+ }
+ xw->tb = tb;
+
+ queue_work(tb->wq, &xw->work);
+}
+
+/**
+ * tb_register_service_driver() - Register XDomain service driver
+ * @drv: Driver to register
+ *
+ * Registers the given service driver to the bus.
+ */
+int tb_register_service_driver(struct tb_service_driver *drv)
+{
+ drv->driver.bus = &tb_bus_type;
+ return driver_register(&drv->driver);
+}
+EXPORT_SYMBOL_GPL(tb_register_service_driver);
+
+/**
+ * tb_unregister_service_driver() - Unregister XDomain service driver
+ * @drv: Driver to unregister
+ *
+ * Unregisters XDomain service driver from the bus.
+ */ +void tb_unregister_service_driver(struct tb_service_driver *drv) +{ + driver_unregister(&drv->driver); +} +EXPORT_SYMBOL_GPL(tb_unregister_service_driver); + +static ssize_t key_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct tb_service *svc = container_of(dev, struct tb_service, dev); + + /* + * It should be null terminated but anything else is pretty much + * allowed. + */ + return sprintf(buf, "%*pEp\n", (int)strlen(svc->key), svc->key); +} +static DEVICE_ATTR_RO(key); + +static int get_modalias(struct tb_service *svc, char *buf, size_t size) +{ + return snprintf(buf, size, "tbsvc:k%sp%08Xv%08Xr%08X", svc->key, + svc->prtcid, svc->prtcvers, svc->prtcrevs); +} + +static ssize_t modalias_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct tb_service *svc = container_of(dev, struct tb_service, dev); + + /* Full buffer size except new line and null termination */ + get_modalias(svc, buf, PAGE_SIZE - 2); + return sprintf(buf, "%s\n", buf); +} +static DEVICE_ATTR_RO(modalias); + +static ssize_t prtcid_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct tb_service *svc = container_of(dev, struct tb_service, dev); + + return sprintf(buf, "%u\n", svc->prtcid); +} +static DEVICE_ATTR_RO(prtcid); + +static ssize_t prtcvers_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct tb_service *svc = container_of(dev, struct tb_service, dev); + + return sprintf(buf, "%u\n", svc->prtcvers); +} +static DEVICE_ATTR_RO(prtcvers); + +static ssize_t prtcrevs_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct tb_service *svc = container_of(dev, struct tb_service, dev); + + return sprintf(buf, "%u\n", svc->prtcrevs); +} +static DEVICE_ATTR_RO(prtcrevs); + +static ssize_t prtcstns_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct tb_service *svc = container_of(dev, struct tb_service, dev); + + return sprintf(buf, "0x%08x\n", svc->prtcstns); +} +static DEVICE_ATTR_RO(prtcstns); + +static struct attribute *tb_service_attrs[] = { + &dev_attr_key.attr, + &dev_attr_modalias.attr, + &dev_attr_prtcid.attr, + &dev_attr_prtcvers.attr, + &dev_attr_prtcrevs.attr, + &dev_attr_prtcstns.attr, + NULL, +}; + +static struct attribute_group tb_service_attr_group = { + .attrs = tb_service_attrs, +}; + +static const struct attribute_group *tb_service_attr_groups[] = { + &tb_service_attr_group, + NULL, +}; + +static int tb_service_uevent(struct device *dev, struct kobj_uevent_env *env) +{ + struct tb_service *svc = container_of(dev, struct tb_service, dev); + char modalias[64]; + + get_modalias(svc, modalias, sizeof(modalias)); + return add_uevent_var(env, "MODALIAS=%s", modalias); +} + +static void tb_service_release(struct device *dev) +{ + struct tb_service *svc = container_of(dev, struct tb_service, dev); + struct tb_xdomain *xd = tb_service_parent(svc); + + ida_simple_remove(&xd->service_ids, svc->id); + kfree(svc->key); + kfree(svc); +} + +struct device_type tb_service_type = { + .name = "thunderbolt_service", + .groups = tb_service_attr_groups, + .uevent = tb_service_uevent, + .release = tb_service_release, +}; +EXPORT_SYMBOL_GPL(tb_service_type); + +static int remove_missing_service(struct device *dev, void *data) +{ + struct tb_xdomain *xd = data; + struct tb_service *svc; + + svc = tb_to_service(dev); + if (!svc) + return 0; + + if (!tb_property_find(xd->properties, svc->key, + TB_PROPERTY_TYPE_DIRECTORY)) + device_unregister(dev); + + return 
0; +} + +static int find_service(struct device *dev, void *data) +{ + const struct tb_property *p = data; + struct tb_service *svc; + + svc = tb_to_service(dev); + if (!svc) + return 0; + + return !strcmp(svc->key, p->key); +} + +static int populate_service(struct tb_service *svc, + struct tb_property *property) +{ + struct tb_property_dir *dir = property->value.dir; + struct tb_property *p; + + /* Fill in standard properties */ + p = tb_property_find(dir, "prtcid", TB_PROPERTY_TYPE_VALUE); + if (p) + svc->prtcid = p->value.immediate; + p = tb_property_find(dir, "prtcvers", TB_PROPERTY_TYPE_VALUE); + if (p) + svc->prtcvers = p->value.immediate; + p = tb_property_find(dir, "prtcrevs", TB_PROPERTY_TYPE_VALUE); + if (p) + svc->prtcrevs = p->value.immediate; + p = tb_property_find(dir, "prtcstns", TB_PROPERTY_TYPE_VALUE); + if (p) + svc->prtcstns = p->value.immediate; + + svc->key = kstrdup(property->key, GFP_KERNEL); + if (!svc->key) + return -ENOMEM; + + return 0; +} + +static void enumerate_services(struct tb_xdomain *xd) +{ + struct tb_service *svc; + struct tb_property *p; + struct device *dev; + + /* + * First remove all services that are not available anymore in + * the updated property block. + */ + device_for_each_child_reverse(&xd->dev, xd, remove_missing_service); + + /* Then re-enumerate properties creating new services as we go */ + tb_property_for_each(xd->properties, p) { + if (p->type != TB_PROPERTY_TYPE_DIRECTORY) + continue; + + /* If the service exists already we are fine */ + dev = device_find_child(&xd->dev, p, find_service); + if (dev) { + put_device(dev); + continue; + } + + svc = kzalloc(sizeof(*svc), GFP_KERNEL); + if (!svc) + break; + + if (populate_service(svc, p)) { + kfree(svc); + break; + } + + svc->id = ida_simple_get(&xd->service_ids, 0, 0, GFP_KERNEL); + svc->dev.bus = &tb_bus_type; + svc->dev.type = &tb_service_type; + svc->dev.parent = &xd->dev; + dev_set_name(&svc->dev, "%s.%d", dev_name(&xd->dev), svc->id); + + if (device_register(&svc->dev)) { + put_device(&svc->dev); + break; + } + } +} + +static int populate_properties(struct tb_xdomain *xd, + struct tb_property_dir *dir) +{ + const struct tb_property *p; + + /* Required properties */ + p = tb_property_find(dir, "deviceid", TB_PROPERTY_TYPE_VALUE); + if (!p) + return -EINVAL; + xd->device = p->value.immediate; + + p = tb_property_find(dir, "vendorid", TB_PROPERTY_TYPE_VALUE); + if (!p) + return -EINVAL; + xd->vendor = p->value.immediate; + + kfree(xd->device_name); + xd->device_name = NULL; + kfree(xd->vendor_name); + xd->vendor_name = NULL; + + /* Optional properties */ + p = tb_property_find(dir, "deviceid", TB_PROPERTY_TYPE_TEXT); + if (p) + xd->device_name = kstrdup(p->value.text, GFP_KERNEL); + p = tb_property_find(dir, "vendorid", TB_PROPERTY_TYPE_TEXT); + if (p) + xd->vendor_name = kstrdup(p->value.text, GFP_KERNEL); + + return 0; +} + +/* Called with @xd->lock held */ +static void tb_xdomain_restore_paths(struct tb_xdomain *xd) +{ + if (!xd->resume) + return; + + xd->resume = false; + if (xd->transmit_path) { + dev_dbg(&xd->dev, "re-establishing DMA path\n"); + tb_domain_approve_xdomain_paths(xd->tb, xd); + } +} + +static void tb_xdomain_get_properties(struct work_struct *work) +{ + struct tb_xdomain *xd = container_of(work, typeof(*xd), + get_properties_work.work); + struct tb_property_dir *dir; + struct tb *tb = xd->tb; + bool update = false; + u32 *block = NULL; + u32 gen = 0; + int ret; + + ret = tb_xdp_properties_request(tb->ctl, xd->route, xd->local_uuid, + xd->remote_uuid, 
xd->properties_retries,
+ &block, &gen);
+ if (ret < 0) {
+ if (xd->properties_retries-- > 0) {
+ queue_delayed_work(xd->tb->wq, &xd->get_properties_work,
+ msecs_to_jiffies(1000));
+ } else {
+ /* Give up now */
+ dev_err(&xd->dev,
+ "failed to read XDomain properties from %pUb\n",
+ xd->remote_uuid);
+ }
+ return;
+ }
+
+ xd->properties_retries = XDOMAIN_PROPERTIES_RETRIES;
+
+ mutex_lock(&xd->lock);
+
+ /* Only accept newer generation properties */
+ if (xd->properties && gen <= xd->property_block_gen) {
+ /*
+ * On resume it is likely that the properties block is
+ * not changed (unless the other end added or removed
+ * services). However, we need to make sure the existing
+ * DMA paths are restored properly.
+ */
+ tb_xdomain_restore_paths(xd);
+ goto err_free_block;
+ }
+
+ dir = tb_property_parse_dir(block, ret);
+ if (!dir) {
+ dev_err(&xd->dev, "failed to parse XDomain properties\n");
+ goto err_free_block;
+ }
+
+ ret = populate_properties(xd, dir);
+ if (ret) {
+ dev_err(&xd->dev, "missing XDomain properties in response\n");
+ goto err_free_dir;
+ }
+
+ /* Release the existing one */
+ if (xd->properties) {
+ tb_property_free_dir(xd->properties);
+ update = true;
+ }
+
+ xd->properties = dir;
+ xd->property_block_gen = gen;
+
+ tb_xdomain_restore_paths(xd);
+
+ mutex_unlock(&xd->lock);
+
+ kfree(block);
+
+ /*
+ * Now the device should be ready enough so we can add it to the
+ * bus and let userspace know about it. If the device is already
+ * registered, we notify userspace that it has changed.
+ */
+ if (!update) {
+ if (device_add(&xd->dev)) {
+ dev_err(&xd->dev, "failed to add XDomain device\n");
+ return;
+ }
+ } else {
+ kobject_uevent(&xd->dev.kobj, KOBJ_CHANGE);
+ }
+
+ enumerate_services(xd);
+ return;
+
+err_free_dir:
+ tb_property_free_dir(dir);
+err_free_block:
+ kfree(block);
+ mutex_unlock(&xd->lock);
+}
+
+static void tb_xdomain_properties_changed(struct work_struct *work)
+{
+ struct tb_xdomain *xd = container_of(work, typeof(*xd),
+ properties_changed_work.work);
+ int ret;
+
+ ret = tb_xdp_properties_changed_request(xd->tb->ctl, xd->route,
+ xd->properties_changed_retries, xd->local_uuid);
+ if (ret) {
+ if (xd->properties_changed_retries-- > 0)
+ queue_delayed_work(xd->tb->wq,
+ &xd->properties_changed_work,
+ msecs_to_jiffies(1000));
+ return;
+ }
+
+ xd->properties_changed_retries = XDOMAIN_PROPERTIES_CHANGED_RETRIES;
+}
+
+static ssize_t device_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct tb_xdomain *xd = container_of(dev, struct tb_xdomain, dev);
+
+ return sprintf(buf, "%#x\n", xd->device);
+}
+static DEVICE_ATTR_RO(device);
+
+static ssize_t
+device_name_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ struct tb_xdomain *xd = container_of(dev, struct tb_xdomain, dev);
+ int ret;
+
+ if (mutex_lock_interruptible(&xd->lock))
+ return -ERESTARTSYS;
+ ret = sprintf(buf, "%s\n", xd->device_name ?
xd->device_name : ""); + mutex_unlock(&xd->lock); + + return ret; +} +static DEVICE_ATTR_RO(device_name); + +static ssize_t vendor_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct tb_xdomain *xd = container_of(dev, struct tb_xdomain, dev); + + return sprintf(buf, "%#x\n", xd->vendor); +} +static DEVICE_ATTR_RO(vendor); + +static ssize_t +vendor_name_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct tb_xdomain *xd = container_of(dev, struct tb_xdomain, dev); + int ret; + + if (mutex_lock_interruptible(&xd->lock)) + return -ERESTARTSYS; + ret = sprintf(buf, "%s\n", xd->vendor_name ? xd->vendor_name : ""); + mutex_unlock(&xd->lock); + + return ret; +} +static DEVICE_ATTR_RO(vendor_name); + +static ssize_t unique_id_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct tb_xdomain *xd = container_of(dev, struct tb_xdomain, dev); + + return sprintf(buf, "%pUb\n", xd->remote_uuid); +} +static DEVICE_ATTR_RO(unique_id); + +static struct attribute *xdomain_attrs[] = { + &dev_attr_device.attr, + &dev_attr_device_name.attr, + &dev_attr_unique_id.attr, + &dev_attr_vendor.attr, + &dev_attr_vendor_name.attr, + NULL, +}; + +static struct attribute_group xdomain_attr_group = { + .attrs = xdomain_attrs, +}; + +static const struct attribute_group *xdomain_attr_groups[] = { + &xdomain_attr_group, + NULL, +}; + +static void tb_xdomain_release(struct device *dev) +{ + struct tb_xdomain *xd = container_of(dev, struct tb_xdomain, dev); + + put_device(xd->dev.parent); + + tb_property_free_dir(xd->properties); + ida_destroy(&xd->service_ids); + + kfree(xd->local_uuid); + kfree(xd->remote_uuid); + kfree(xd->device_name); + kfree(xd->vendor_name); + kfree(xd); +} + +static void start_handshake(struct tb_xdomain *xd) +{ + xd->properties_retries = XDOMAIN_PROPERTIES_RETRIES; + xd->properties_changed_retries = XDOMAIN_PROPERTIES_CHANGED_RETRIES; + + /* Start exchanging properties with the other host */ + queue_delayed_work(xd->tb->wq, &xd->properties_changed_work, + msecs_to_jiffies(100)); + queue_delayed_work(xd->tb->wq, &xd->get_properties_work, + msecs_to_jiffies(1000)); +} + +static void stop_handshake(struct tb_xdomain *xd) +{ + xd->properties_retries = 0; + xd->properties_changed_retries = 0; + + cancel_delayed_work_sync(&xd->get_properties_work); + cancel_delayed_work_sync(&xd->properties_changed_work); +} + +static int __maybe_unused tb_xdomain_suspend(struct device *dev) +{ + stop_handshake(tb_to_xdomain(dev)); + return 0; +} + +static int __maybe_unused tb_xdomain_resume(struct device *dev) +{ + struct tb_xdomain *xd = tb_to_xdomain(dev); + + /* + * Ask tb_xdomain_get_properties() restore any existing DMA + * paths after properties are re-read. + */ + xd->resume = true; + start_handshake(xd); + + return 0; +} + +static const struct dev_pm_ops tb_xdomain_pm_ops = { + SET_SYSTEM_SLEEP_PM_OPS(tb_xdomain_suspend, tb_xdomain_resume) +}; + +struct device_type tb_xdomain_type = { + .name = "thunderbolt_xdomain", + .release = tb_xdomain_release, + .pm = &tb_xdomain_pm_ops, +}; +EXPORT_SYMBOL_GPL(tb_xdomain_type); + +/** + * tb_xdomain_alloc() - Allocate new XDomain object + * @tb: Domain where the XDomain belongs + * @parent: Parent device (the switch through the connection to the + * other domain is reached). + * @route: Route string used to reach the other domain + * @local_uuid: Our local domain UUID + * @remote_uuid: UUID of the other domain + * + * Allocates new XDomain structure and returns pointer to that. 
The + * object must be released by calling tb_xdomain_put(). + */ +struct tb_xdomain *tb_xdomain_alloc(struct tb *tb, struct device *parent, + u64 route, const uuid_t *local_uuid, + const uuid_t *remote_uuid) +{ + struct tb_xdomain *xd; + + xd = kzalloc(sizeof(*xd), GFP_KERNEL); + if (!xd) + return NULL; + + xd->tb = tb; + xd->route = route; + ida_init(&xd->service_ids); + mutex_init(&xd->lock); + INIT_DELAYED_WORK(&xd->get_properties_work, tb_xdomain_get_properties); + INIT_DELAYED_WORK(&xd->properties_changed_work, + tb_xdomain_properties_changed); + + xd->local_uuid = kmemdup(local_uuid, sizeof(uuid_t), GFP_KERNEL); + if (!xd->local_uuid) + goto err_free; + + xd->remote_uuid = kmemdup(remote_uuid, sizeof(uuid_t), GFP_KERNEL); + if (!xd->remote_uuid) + goto err_free_local_uuid; + + device_initialize(&xd->dev); + xd->dev.parent = get_device(parent); + xd->dev.bus = &tb_bus_type; + xd->dev.type = &tb_xdomain_type; + xd->dev.groups = xdomain_attr_groups; + dev_set_name(&xd->dev, "%u-%llx", tb->index, route); + + return xd; + +err_free_local_uuid: + kfree(xd->local_uuid); +err_free: + kfree(xd); + + return NULL; +} + +/** + * tb_xdomain_add() - Add XDomain to the bus + * @xd: XDomain to add + * + * This function starts XDomain discovery protocol handshake and + * eventually adds the XDomain to the bus. After calling this function + * the caller needs to call tb_xdomain_remove() in order to remove and + * release the object regardless whether the handshake succeeded or not. + */ +void tb_xdomain_add(struct tb_xdomain *xd) +{ + /* Start exchanging properties with the other host */ + start_handshake(xd); +} + +static int unregister_service(struct device *dev, void *data) +{ + device_unregister(dev); + return 0; +} + +/** + * tb_xdomain_remove() - Remove XDomain from the bus + * @xd: XDomain to remove + * + * This will stop all ongoing configuration work and remove the XDomain + * along with any services from the bus. When the last reference to @xd + * is released the object will be released as well. + */ +void tb_xdomain_remove(struct tb_xdomain *xd) +{ + stop_handshake(xd); + + device_for_each_child_reverse(&xd->dev, xd, unregister_service); + + if (!device_is_registered(&xd->dev)) + put_device(&xd->dev); + else + device_unregister(&xd->dev); +} + +/** + * tb_xdomain_enable_paths() - Enable DMA paths for XDomain connection + * @xd: XDomain connection + * @transmit_path: HopID of the transmit path the other end is using to + * send packets + * @transmit_ring: DMA ring used to receive packets from the other end + * @receive_path: HopID of the receive path the other end is using to + * receive packets + * @receive_ring: DMA ring used to send packets to the other end + * + * The function enables DMA paths accordingly so that after successful + * return the caller can send and receive packets using high-speed DMA + * path. + * + * Return: %0 in case of success and negative errno in case of error + */ +int tb_xdomain_enable_paths(struct tb_xdomain *xd, u16 transmit_path, + u16 transmit_ring, u16 receive_path, + u16 receive_ring) +{ + int ret; + + mutex_lock(&xd->lock); + + if (xd->transmit_path) { + ret = xd->transmit_path == transmit_path ? 
0 : -EBUSY; + goto exit_unlock; + } + + xd->transmit_path = transmit_path; + xd->transmit_ring = transmit_ring; + xd->receive_path = receive_path; + xd->receive_ring = receive_ring; + + ret = tb_domain_approve_xdomain_paths(xd->tb, xd); + +exit_unlock: + mutex_unlock(&xd->lock); + + return ret; +} +EXPORT_SYMBOL_GPL(tb_xdomain_enable_paths); + +/** + * tb_xdomain_disable_paths() - Disable DMA paths for XDomain connection + * @xd: XDomain connection + * + * This does the opposite of tb_xdomain_enable_paths(). After call to + * this the caller is not expected to use the rings anymore. + * + * Return: %0 in case of success and negative errno in case of error + */ +int tb_xdomain_disable_paths(struct tb_xdomain *xd) +{ + int ret = 0; + + mutex_lock(&xd->lock); + if (xd->transmit_path) { + xd->transmit_path = 0; + xd->transmit_ring = 0; + xd->receive_path = 0; + xd->receive_ring = 0; + + ret = tb_domain_disconnect_xdomain_paths(xd->tb, xd); + } + mutex_unlock(&xd->lock); + + return ret; +} +EXPORT_SYMBOL_GPL(tb_xdomain_disable_paths); + +struct tb_xdomain_lookup { + const uuid_t *uuid; + u8 link; + u8 depth; +}; + +static struct tb_xdomain *switch_find_xdomain(struct tb_switch *sw, + const struct tb_xdomain_lookup *lookup) +{ + int i; + + for (i = 1; i <= sw->config.max_port_number; i++) { + struct tb_port *port = &sw->ports[i]; + struct tb_xdomain *xd; + + if (tb_is_upstream_port(port)) + continue; + + if (port->xdomain) { + xd = port->xdomain; + + if (lookup->uuid) { + if (uuid_equal(xd->remote_uuid, lookup->uuid)) + return xd; + } else if (lookup->link == xd->link && + lookup->depth == xd->depth) { + return xd; + } + } else if (port->remote) { + xd = switch_find_xdomain(port->remote->sw, lookup); + if (xd) + return xd; + } + } + + return NULL; +} + +/** + * tb_xdomain_find_by_uuid() - Find an XDomain by UUID + * @tb: Domain where the XDomain belongs to + * @uuid: UUID to look for + * + * Finds XDomain by walking through the Thunderbolt topology below @tb. + * The returned XDomain will have its reference count increased so the + * caller needs to call tb_xdomain_put() when it is done with the + * object. + * + * This will find all XDomains including the ones that are not yet added + * to the bus (handshake is still in progress). + * + * The caller needs to hold @tb->lock. + */ +struct tb_xdomain *tb_xdomain_find_by_uuid(struct tb *tb, const uuid_t *uuid) +{ + struct tb_xdomain_lookup lookup; + struct tb_xdomain *xd; + + memset(&lookup, 0, sizeof(lookup)); + lookup.uuid = uuid; + + xd = switch_find_xdomain(tb->root_switch, &lookup); + if (xd) { + get_device(&xd->dev); + return xd; + } + + return NULL; +} +EXPORT_SYMBOL_GPL(tb_xdomain_find_by_uuid); + +/** + * tb_xdomain_find_by_link_depth() - Find an XDomain by link and depth + * @tb: Domain where the XDomain belongs to + * @link: Root switch link number + * @depth: Depth in the link + * + * Finds XDomain by walking through the Thunderbolt topology below @tb. + * The returned XDomain will have its reference count increased so the + * caller needs to call tb_xdomain_put() when it is done with the + * object. + * + * This will find all XDomains including the ones that are not yet added + * to the bus (handshake is still in progress). + * + * The caller needs to hold @tb->lock. 
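+ *
+ * A lookup sketch (the caller already holds @tb->lock; link and
+ * depth would typically come from an ICM event):
+ *
+ * xd = tb_xdomain_find_by_link_depth(tb, link, depth);
+ * if (xd) {
+ * dev_dbg(&xd->dev, "XDomain found at %u.%u\n", link, depth);
+ * tb_xdomain_put(xd);
+ * }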
+ */ +struct tb_xdomain *tb_xdomain_find_by_link_depth(struct tb *tb, u8 link, + u8 depth) +{ + struct tb_xdomain_lookup lookup; + struct tb_xdomain *xd; + + memset(&lookup, 0, sizeof(lookup)); + lookup.link = link; + lookup.depth = depth; + + xd = switch_find_xdomain(tb->root_switch, &lookup); + if (xd) { + get_device(&xd->dev); + return xd; + } + + return NULL; +} + +bool tb_xdomain_handle_request(struct tb *tb, enum tb_cfg_pkg_type type, + const void *buf, size_t size) +{ + const struct tb_protocol_handler *handler, *tmp; + const struct tb_xdp_header *hdr = buf; + unsigned int length; + int ret = 0; + + /* We expect the packet is at least size of the header */ + length = hdr->xd_hdr.length_sn & TB_XDOMAIN_LENGTH_MASK; + if (length != size / 4 - sizeof(hdr->xd_hdr) / 4) + return true; + if (length < sizeof(*hdr) / 4 - sizeof(hdr->xd_hdr) / 4) + return true; + + /* + * Handle XDomain discovery protocol packets directly here. For + * other protocols (based on their UUID) we call registered + * handlers in turn. + */ + if (uuid_equal(&hdr->uuid, &tb_xdp_uuid)) { + if (type == TB_CFG_PKG_XDOMAIN_REQ) { + tb_xdp_schedule_request(tb, hdr, size); + return true; + } + return false; + } + + mutex_lock(&xdomain_lock); + list_for_each_entry_safe(handler, tmp, &protocol_handlers, list) { + if (!uuid_equal(&hdr->uuid, handler->uuid)) + continue; + + mutex_unlock(&xdomain_lock); + ret = handler->callback(buf, size, handler->data); + mutex_lock(&xdomain_lock); + + if (ret) + break; + } + mutex_unlock(&xdomain_lock); + + return ret > 0; +} + +static int rebuild_property_block(void) +{ + u32 *block, len; + int ret; + + ret = tb_property_format_dir(xdomain_property_dir, NULL, 0); + if (ret < 0) + return ret; + + len = ret; + + block = kcalloc(len, sizeof(u32), GFP_KERNEL); + if (!block) + return -ENOMEM; + + ret = tb_property_format_dir(xdomain_property_dir, block, len); + if (ret) { + kfree(block); + return ret; + } + + kfree(xdomain_property_block); + xdomain_property_block = block; + xdomain_property_block_len = len; + xdomain_property_block_gen++; + + return 0; +} + +static int update_xdomain(struct device *dev, void *data) +{ + struct tb_xdomain *xd; + + xd = tb_to_xdomain(dev); + if (xd) { + queue_delayed_work(xd->tb->wq, &xd->properties_changed_work, + msecs_to_jiffies(50)); + } + + return 0; +} + +static void update_all_xdomains(void) +{ + bus_for_each_dev(&tb_bus_type, NULL, NULL, update_xdomain); +} + +static bool remove_directory(const char *key, const struct tb_property_dir *dir) +{ + struct tb_property *p; + + p = tb_property_find(xdomain_property_dir, key, + TB_PROPERTY_TYPE_DIRECTORY); + if (p && p->value.dir == dir) { + tb_property_remove(p); + return true; + } + return false; +} + +/** + * tb_register_property_dir() - Register property directory to the host + * @key: Key (name) of the directory to add + * @dir: Directory to add + * + * Service drivers can use this function to add new property directory + * to the host available properties. The other connected hosts are + * notified so they can re-read properties of this host if they are + * interested. 
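+ *
+ * A publishing sketch for a hypothetical service driver; the
+ * directory key and the property added are illustrative:
+ *
+ * struct tb_property_dir *dir;
+ *
+ * dir = tb_property_create_dir(NULL);
+ * if (!dir)
+ * return -ENOMEM;
+ * tb_property_add_immediate(dir, "prtcid", 1);
+ * ret = tb_register_property_dir("network", dir);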
+ * + * Return: %0 on success and negative errno on failure + */ +int tb_register_property_dir(const char *key, struct tb_property_dir *dir) +{ + int ret; + + if (!key || strlen(key) > 8) + return -EINVAL; + + mutex_lock(&xdomain_lock); + if (tb_property_find(xdomain_property_dir, key, + TB_PROPERTY_TYPE_DIRECTORY)) { + ret = -EEXIST; + goto err_unlock; + } + + ret = tb_property_add_dir(xdomain_property_dir, key, dir); + if (ret) + goto err_unlock; + + ret = rebuild_property_block(); + if (ret) { + remove_directory(key, dir); + goto err_unlock; + } + + mutex_unlock(&xdomain_lock); + update_all_xdomains(); + return 0; + +err_unlock: + mutex_unlock(&xdomain_lock); + return ret; +} +EXPORT_SYMBOL_GPL(tb_register_property_dir); + +/** + * tb_unregister_property_dir() - Removes property directory from host + * @key: Key (name) of the directory + * @dir: Directory to remove + * + * This will remove the existing directory from this host and notify the + * connected hosts about the change. + */ +void tb_unregister_property_dir(const char *key, struct tb_property_dir *dir) +{ + int ret = 0; + + mutex_lock(&xdomain_lock); + if (remove_directory(key, dir)) + ret = rebuild_property_block(); + mutex_unlock(&xdomain_lock); + + if (!ret) + update_all_xdomains(); +} +EXPORT_SYMBOL_GPL(tb_unregister_property_dir); + +int tb_xdomain_init(void) +{ + int ret; + + xdomain_property_dir = tb_property_create_dir(NULL); + if (!xdomain_property_dir) + return -ENOMEM; + + /* + * Initialize standard set of properties without any service + * directories. Those will be added by service drivers + * themselves when they are loaded. + */ + tb_property_add_immediate(xdomain_property_dir, "vendorid", + PCI_VENDOR_ID_INTEL); + tb_property_add_text(xdomain_property_dir, "vendorid", "Intel Corp."); + tb_property_add_immediate(xdomain_property_dir, "deviceid", 0x1); + tb_property_add_text(xdomain_property_dir, "deviceid", + utsname()->nodename); + tb_property_add_immediate(xdomain_property_dir, "devicerv", 0x80000100); + + ret = rebuild_property_block(); + if (ret) { + tb_property_free_dir(xdomain_property_dir); + xdomain_property_dir = NULL; + } + + return ret; +} + +void tb_xdomain_exit(void) +{ + kfree(xdomain_property_block); + tb_property_free_dir(xdomain_property_dir); +} diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index 694cebb50f72..7625c3b81f84 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -683,5 +683,31 @@ struct fsl_mc_device_id { const char obj_type[16]; }; +/** + * struct tb_service_id - Thunderbolt service identifiers + * @match_flags: Flags used to match the structure + * @protocol_key: Protocol key the service supports + * @protocol_id: Protocol id the service supports + * @protocol_version: Version of the protocol + * @protocol_revision: Revision of the protocol software + * @driver_data: Driver specific data + * + * Thunderbolt XDomain services are exposed as devices where each device + * carries the protocol information the service supports. Thunderbolt + * XDomain service drivers match against that information. 
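+ *
+ * For instance, a driver matching a hypothetical "network" protocol
+ * key with protocol ID 1 (match_flags being TBSVC_MATCH_PROTOCOL_KEY
+ * | TBSVC_MATCH_PROTOCOL_ID) would get the module alias
+ * "tbsvc:knetworkp00000001v*r*", as generated by the file2alias.c
+ * handling added in this patch.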
+ */
+struct tb_service_id {
+ __u32 match_flags;
+ char protocol_key[8 + 1];
+ __u32 protocol_id;
+ __u32 protocol_version;
+ __u32 protocol_revision;
+ kernel_ulong_t driver_data;
+};
+
+#define TBSVC_MATCH_PROTOCOL_KEY 0x0001
+#define TBSVC_MATCH_PROTOCOL_ID 0x0002
+#define TBSVC_MATCH_PROTOCOL_VERSION 0x0004
+#define TBSVC_MATCH_PROTOCOL_REVISION 0x0008
+
 #endif /* LINUX_MOD_DEVICETABLE_H */
diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h
index 43b8d1e09341..18c0e3d5e85c 100644
--- a/include/linux/thunderbolt.h
+++ b/include/linux/thunderbolt.h
@@ -17,6 +17,7 @@
 #include
 #include
 #include
+#include
 #include

 enum tb_cfg_pkg_type {
@@ -77,6 +78,8 @@ struct tb {
 };

 extern struct bus_type tb_bus_type;
+extern struct device_type tb_service_type;
+extern struct device_type tb_xdomain_type;

 #define TB_LINKS_PER_PHY_PORT 2

@@ -155,4 +158,243 @@ struct tb_property *tb_property_get_next(struct tb_property_dir *dir,
 property; \
 property = tb_property_get_next(dir, property)

+int tb_register_property_dir(const char *key, struct tb_property_dir *dir);
+void tb_unregister_property_dir(const char *key, struct tb_property_dir *dir);
+
+/**
+ * struct tb_xdomain - Cross-domain (XDomain) connection
+ * @dev: XDomain device
+ * @tb: Pointer to the domain
+ * @remote_uuid: UUID of the remote domain (host)
+ * @local_uuid: Cached local UUID
+ * @route: Route string with which the other domain can be reached
+ * @vendor: Vendor ID of the remote domain
+ * @device: Device ID of the remote domain
+ * @lock: Lock to serialize access to the following fields of this structure
+ * @vendor_name: Name of the vendor (or %NULL if not known)
+ * @device_name: Name of the device (or %NULL if not known)
+ * @is_unplugged: The XDomain is unplugged
+ * @resume: The XDomain is being resumed
+ * @transmit_path: HopID which the remote end expects us to transmit
+ * @transmit_ring: Local ring (hop) where outgoing packets are pushed
+ * @receive_path: HopID which we expect the remote end to transmit
+ * @receive_ring: Local ring (hop) where incoming packets arrive
+ * @service_ids: Used to generate IDs for the services
+ * @properties: Properties exported by the remote domain
+ * @property_block_gen: Generation of @properties
+ * @get_properties_work: Work used to get remote domain properties
+ * @properties_retries: Number of times left to read properties
+ * @properties_changed_work: Work used to notify the remote domain that
+ * our properties have changed
+ * @properties_changed_retries: Number of times left to send properties
+ * changed notification
+ * @link: Root switch link the remote domain is connected to (ICM only)
+ * @depth: Depth in the chain the remote domain is connected at (ICM only)
+ *
+ * This structure represents a connection across two domains (hosts).
+ * Each XDomain contains zero or more services which are exposed as
+ * &struct tb_service objects.
+ *
+ * Service drivers may access this structure if they need to enumerate
+ * non-standard properties but they need to hold @lock when doing so
+ * because properties can be changed asynchronously in response to
+ * changes in the remote domain.
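+ *
+ * For example, a service driver reading a non-standard property
+ * might do (the property key here is illustrative):
+ *
+ * struct tb_property *p;
+ *
+ * mutex_lock(&xd->lock);
+ * p = tb_property_find(xd->properties, "maxframe",
+ * TB_PROPERTY_TYPE_VALUE);
+ * if (p)
+ * max_frame = p->value.immediate;
+ * mutex_unlock(&xd->lock);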
+ */
+struct tb_xdomain {
+ struct device dev;
+ struct tb *tb;
+ uuid_t *remote_uuid;
+ const uuid_t *local_uuid;
+ u64 route;
+ u16 vendor;
+ u16 device;
+ struct mutex lock;
+ const char *vendor_name;
+ const char *device_name;
+ bool is_unplugged;
+ bool resume;
+ u16 transmit_path;
+ u16 transmit_ring;
+ u16 receive_path;
+ u16 receive_ring;
+ struct ida service_ids;
+ struct tb_property_dir *properties;
+ u32 property_block_gen;
+ struct delayed_work get_properties_work;
+ int properties_retries;
+ struct delayed_work properties_changed_work;
+ int properties_changed_retries;
+ u8 link;
+ u8 depth;
+};
+
+int tb_xdomain_enable_paths(struct tb_xdomain *xd, u16 transmit_path,
+ u16 transmit_ring, u16 receive_path,
+ u16 receive_ring);
+int tb_xdomain_disable_paths(struct tb_xdomain *xd);
+struct tb_xdomain *tb_xdomain_find_by_uuid(struct tb *tb, const uuid_t *uuid);
+
+static inline struct tb_xdomain *
+tb_xdomain_find_by_uuid_locked(struct tb *tb, const uuid_t *uuid)
+{
+ struct tb_xdomain *xd;
+
+ mutex_lock(&tb->lock);
+ xd = tb_xdomain_find_by_uuid(tb, uuid);
+ mutex_unlock(&tb->lock);
+
+ return xd;
+}
+
+static inline struct tb_xdomain *tb_xdomain_get(struct tb_xdomain *xd)
+{
+ if (xd)
+ get_device(&xd->dev);
+ return xd;
+}
+
+static inline void tb_xdomain_put(struct tb_xdomain *xd)
+{
+ if (xd)
+ put_device(&xd->dev);
+}
+
+static inline bool tb_is_xdomain(const struct device *dev)
+{
+ return dev->type == &tb_xdomain_type;
+}
+
+static inline struct tb_xdomain *tb_to_xdomain(struct device *dev)
+{
+ if (tb_is_xdomain(dev))
+ return container_of(dev, struct tb_xdomain, dev);
+ return NULL;
+}
+
+int tb_xdomain_response(struct tb_xdomain *xd, const void *response,
+ size_t size, enum tb_cfg_pkg_type type);
+int tb_xdomain_request(struct tb_xdomain *xd, const void *request,
+ size_t request_size, enum tb_cfg_pkg_type request_type,
+ void *response, size_t response_size,
+ enum tb_cfg_pkg_type response_type,
+ unsigned int timeout_msec);
+
+/**
+ * struct tb_protocol_handler - Protocol specific handler
+ * @uuid: XDomain messages with this UUID are dispatched to this handler
+ * @callback: Callback called with the XDomain message. Returning %1
+ * here tells the XDomain core that the message was handled
+ * by this handler and should not be forwarded to other
+ * handlers.
+ * @data: Data passed with the callback
+ * @list: Handlers are linked using this
+ *
+ * Thunderbolt services can hook into incoming XDomain requests by
+ * registering a protocol handler. The only limitation is that the
+ * XDomain discovery protocol UUID cannot be registered since it is
+ * handled by the core XDomain code.
+ *
+ * The @callback must check that the message is really directed to the
+ * service the driver implements.
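+ *
+ * One plausible shape for such a callback (all names illustrative;
+ * the core has already matched the protocol @uuid before calling):
+ *
+ * static int my_callback(const void *buf, size_t size, void *data)
+ * {
+ * const struct tb_xdp_header *hdr = buf;
+ *
+ * if (size < sizeof(*hdr))
+ * return 0;
+ * return my_handle_message(hdr, size, data);
+ * }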
+ */ +struct tb_protocol_handler { + const uuid_t *uuid; + int (*callback)(const void *buf, size_t size, void *data); + void *data; + struct list_head list; +}; + +int tb_register_protocol_handler(struct tb_protocol_handler *handler); +void tb_unregister_protocol_handler(struct tb_protocol_handler *handler); + +/** + * struct tb_service - Thunderbolt service + * @dev: XDomain device + * @id: ID of the service (shown in sysfs) + * @key: Protocol key from the properties directory + * @prtcid: Protocol ID from the properties directory + * @prtcvers: Protocol version from the properties directory + * @prtcrevs: Protocol software revision from the properties directory + * @prtcstns: Protocol settings mask from the properties directory + * + * Each domain exposes set of services it supports as collection of + * properties. For each service there will be one corresponding + * &struct tb_service. Service drivers are bound to these. + */ +struct tb_service { + struct device dev; + int id; + const char *key; + u32 prtcid; + u32 prtcvers; + u32 prtcrevs; + u32 prtcstns; +}; + +static inline struct tb_service *tb_service_get(struct tb_service *svc) +{ + if (svc) + get_device(&svc->dev); + return svc; +} + +static inline void tb_service_put(struct tb_service *svc) +{ + if (svc) + put_device(&svc->dev); +} + +static inline bool tb_is_service(const struct device *dev) +{ + return dev->type == &tb_service_type; +} + +static inline struct tb_service *tb_to_service(struct device *dev) +{ + if (tb_is_service(dev)) + return container_of(dev, struct tb_service, dev); + return NULL; +} + +/** + * tb_service_driver - Thunderbolt service driver + * @driver: Driver structure + * @probe: Called when the driver is probed + * @remove: Called when the driver is removed (optional) + * @shutdown: Called at shutdown time to stop the service (optional) + * @id_table: Table of service identifiers the driver supports + */ +struct tb_service_driver { + struct device_driver driver; + int (*probe)(struct tb_service *svc, const struct tb_service_id *id); + void (*remove)(struct tb_service *svc); + void (*shutdown)(struct tb_service *svc); + const struct tb_service_id *id_table; +}; + +#define TB_SERVICE(key, id) \ + .match_flags = TBSVC_MATCH_PROTOCOL_KEY | \ + TBSVC_MATCH_PROTOCOL_ID, \ + .protocol_key = (key), \ + .protocol_id = (id) + +int tb_register_service_driver(struct tb_service_driver *drv); +void tb_unregister_service_driver(struct tb_service_driver *drv); + +static inline void *tb_service_get_drvdata(const struct tb_service *svc) +{ + return dev_get_drvdata(&svc->dev); +} + +static inline void tb_service_set_drvdata(struct tb_service *svc, void *data) +{ + dev_set_drvdata(&svc->dev, data); +} + +static inline struct tb_xdomain *tb_service_parent(struct tb_service *svc) +{ + return tb_to_xdomain(svc->dev.parent); +} + #endif /* THUNDERBOLT_H_ */ diff --git a/scripts/mod/devicetable-offsets.c b/scripts/mod/devicetable-offsets.c index e4d90e50f6fe..57263f2f8f2f 100644 --- a/scripts/mod/devicetable-offsets.c +++ b/scripts/mod/devicetable-offsets.c @@ -206,5 +206,12 @@ int main(void) DEVID_FIELD(fsl_mc_device_id, vendor); DEVID_FIELD(fsl_mc_device_id, obj_type); + DEVID(tb_service_id); + DEVID_FIELD(tb_service_id, match_flags); + DEVID_FIELD(tb_service_id, protocol_key); + DEVID_FIELD(tb_service_id, protocol_id); + DEVID_FIELD(tb_service_id, protocol_version); + DEVID_FIELD(tb_service_id, protocol_revision); + return 0; } diff --git a/scripts/mod/file2alias.c b/scripts/mod/file2alias.c index 29d6699d5a06..6ef6e63f96fd 
100644 --- a/scripts/mod/file2alias.c +++ b/scripts/mod/file2alias.c @@ -1301,6 +1301,31 @@ static int do_fsl_mc_entry(const char *filename, void *symval, } ADD_TO_DEVTABLE("fslmc", fsl_mc_device_id, do_fsl_mc_entry); +/* Looks like: tbsvc:kSpNvNrN */ +static int do_tbsvc_entry(const char *filename, void *symval, char *alias) +{ + DEF_FIELD(symval, tb_service_id, match_flags); + DEF_FIELD_ADDR(symval, tb_service_id, protocol_key); + DEF_FIELD(symval, tb_service_id, protocol_id); + DEF_FIELD(symval, tb_service_id, protocol_version); + DEF_FIELD(symval, tb_service_id, protocol_revision); + + strcpy(alias, "tbsvc:"); + if (match_flags & TBSVC_MATCH_PROTOCOL_KEY) + sprintf(alias + strlen(alias), "k%s", *protocol_key); + else + strcat(alias + strlen(alias), "k*"); + ADD(alias, "p", match_flags & TBSVC_MATCH_PROTOCOL_ID, protocol_id); + ADD(alias, "v", match_flags & TBSVC_MATCH_PROTOCOL_VERSION, + protocol_version); + ADD(alias, "r", match_flags & TBSVC_MATCH_PROTOCOL_REVISION, + protocol_revision); + + add_wildcard(alias); + return 1; +} +ADD_TO_DEVTABLE("tbsvc", tb_service_id, do_tbsvc_entry); + /* Does namelen bytes of name exactly match the symbol? */ static bool sym_is(const char *name, unsigned namelen, const char *symbol) { -- cgit v1.2.3 From 3b3d9f4da96493e4f68d0a80ab210763a24f8b33 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 2 Oct 2017 13:38:37 +0300 Subject: thunderbolt: Export ring handling functions to modules These are used by Thunderbolt services to send and receive frames over the high-speed DMA rings. We also put the functions to tb_ namespace to make sure we do not collide with others and add missing kernel-doc comments for the exported functions. Signed-off-by: Mika Westerberg Reviewed-by: Michael Jamet Reviewed-by: Yehezkel Bernat Reviewed-by: Andy Shevchenko Signed-off-by: David S. Miller --- drivers/thunderbolt/ctl.c | 20 +++--- drivers/thunderbolt/nhi.c | 62 +++++++++++------ drivers/thunderbolt/nhi.h | 147 +---------------------------------------- include/linux/thunderbolt.h | 158 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 211 insertions(+), 176 deletions(-) (limited to 'include') diff --git a/drivers/thunderbolt/ctl.c b/drivers/thunderbolt/ctl.c index 05400b77dcd7..dd10789e1dbb 100644 --- a/drivers/thunderbolt/ctl.c +++ b/drivers/thunderbolt/ctl.c @@ -359,7 +359,7 @@ static int tb_ctl_tx(struct tb_ctl *ctl, const void *data, size_t len, cpu_to_be32_array(pkg->buffer, data, len / 4); *(__be32 *) (pkg->buffer + len) = tb_crc(pkg->buffer, len); - res = ring_tx(ctl->tx, &pkg->frame); + res = tb_ring_tx(ctl->tx, &pkg->frame); if (res) /* ring is stopped */ tb_ctl_pkg_free(pkg); return res; @@ -376,7 +376,7 @@ static bool tb_ctl_handle_event(struct tb_ctl *ctl, enum tb_cfg_pkg_type type, static void tb_ctl_rx_submit(struct ctl_pkg *pkg) { - ring_rx(pkg->ctl->rx, &pkg->frame); /* + tb_ring_rx(pkg->ctl->rx, &pkg->frame); /* * We ignore failures during stop. 
* All rx packets are referenced * from ctl->rx_packets, so we do @@ -614,11 +614,11 @@ struct tb_ctl *tb_ctl_alloc(struct tb_nhi *nhi, event_cb cb, void *cb_data) if (!ctl->frame_pool) goto err; - ctl->tx = ring_alloc_tx(nhi, 0, 10, RING_FLAG_NO_SUSPEND); + ctl->tx = tb_ring_alloc_tx(nhi, 0, 10, RING_FLAG_NO_SUSPEND); if (!ctl->tx) goto err; - ctl->rx = ring_alloc_rx(nhi, 0, 10, RING_FLAG_NO_SUSPEND, 0xffff, + ctl->rx = tb_ring_alloc_rx(nhi, 0, 10, RING_FLAG_NO_SUSPEND, 0xffff, 0xffff); if (!ctl->rx) goto err; @@ -652,9 +652,9 @@ void tb_ctl_free(struct tb_ctl *ctl) return; if (ctl->rx) - ring_free(ctl->rx); + tb_ring_free(ctl->rx); if (ctl->tx) - ring_free(ctl->tx); + tb_ring_free(ctl->tx); /* free RX packets */ for (i = 0; i < TB_CTL_RX_PKG_COUNT; i++) @@ -673,8 +673,8 @@ void tb_ctl_start(struct tb_ctl *ctl) { int i; tb_ctl_info(ctl, "control channel starting...\n"); - ring_start(ctl->tx); /* is used to ack hotplug packets, start first */ - ring_start(ctl->rx); + tb_ring_start(ctl->tx); /* is used to ack hotplug packets, start first */ + tb_ring_start(ctl->rx); for (i = 0; i < TB_CTL_RX_PKG_COUNT; i++) tb_ctl_rx_submit(ctl->rx_packets[i]); @@ -695,8 +695,8 @@ void tb_ctl_stop(struct tb_ctl *ctl) ctl->running = false; mutex_unlock(&ctl->request_queue_lock); - ring_stop(ctl->rx); - ring_stop(ctl->tx); + tb_ring_stop(ctl->rx); + tb_ring_stop(ctl->tx); if (!list_empty(&ctl->request_queue)) tb_ctl_WARN(ctl, "dangling request in request_queue\n"); diff --git a/drivers/thunderbolt/nhi.c b/drivers/thunderbolt/nhi.c index bebcad3d2c1f..e0a47f7581cb 100644 --- a/drivers/thunderbolt/nhi.c +++ b/drivers/thunderbolt/nhi.c @@ -253,7 +253,7 @@ invoke_callback: } } -int __ring_enqueue(struct tb_ring *ring, struct ring_frame *frame) +int __tb_ring_enqueue(struct tb_ring *ring, struct ring_frame *frame) { int ret = 0; mutex_lock(&ring->lock); @@ -266,6 +266,7 @@ int __ring_enqueue(struct tb_ring *ring, struct ring_frame *frame) mutex_unlock(&ring->lock); return ret; } +EXPORT_SYMBOL_GPL(__tb_ring_enqueue); static irqreturn_t ring_msix(int irq, void *data) { @@ -309,9 +310,9 @@ static void ring_release_msix(struct tb_ring *ring) ring->irq = 0; } -static struct tb_ring *ring_alloc(struct tb_nhi *nhi, u32 hop, int size, - bool transmit, unsigned int flags, - u16 sof_mask, u16 eof_mask) +static struct tb_ring *tb_ring_alloc(struct tb_nhi *nhi, u32 hop, int size, + bool transmit, unsigned int flags, + u16 sof_mask, u16 eof_mask) { struct tb_ring *ring = NULL; dev_info(&nhi->pdev->dev, "allocating %s ring %d of size %d\n", @@ -377,24 +378,42 @@ err: return NULL; } -struct tb_ring *ring_alloc_tx(struct tb_nhi *nhi, int hop, int size, - unsigned int flags) +/** + * tb_ring_alloc_tx() - Allocate DMA ring for transmit + * @nhi: Pointer to the NHI the ring is to be allocated + * @hop: HopID (ring) to allocate + * @size: Number of entries in the ring + * @flags: Flags for the ring + */ +struct tb_ring *tb_ring_alloc_tx(struct tb_nhi *nhi, int hop, int size, + unsigned int flags) { - return ring_alloc(nhi, hop, size, true, flags, 0, 0); + return tb_ring_alloc(nhi, hop, size, true, flags, 0, 0); } +EXPORT_SYMBOL_GPL(tb_ring_alloc_tx); -struct tb_ring *ring_alloc_rx(struct tb_nhi *nhi, int hop, int size, - unsigned int flags, u16 sof_mask, u16 eof_mask) +/** + * tb_ring_alloc_rx() - Allocate DMA ring for receive + * @nhi: Pointer to the NHI the ring is to be allocated + * @hop: HopID (ring) to allocate + * @size: Number of entries in the ring + * @flags: Flags for the ring + * @sof_mask: Mask of PDF values that start a 
frame + * @eof_mask: Mask of PDF values that end a frame + */ +struct tb_ring *tb_ring_alloc_rx(struct tb_nhi *nhi, int hop, int size, + unsigned int flags, u16 sof_mask, u16 eof_mask) { - return ring_alloc(nhi, hop, size, false, flags, sof_mask, eof_mask); + return tb_ring_alloc(nhi, hop, size, false, flags, sof_mask, eof_mask); } +EXPORT_SYMBOL_GPL(tb_ring_alloc_rx); /** - * ring_start() - enable a ring + * tb_ring_start() - enable a ring * - * Must not be invoked in parallel with ring_stop(). + * Must not be invoked in parallel with tb_ring_stop(). */ -void ring_start(struct tb_ring *ring) +void tb_ring_start(struct tb_ring *ring) { u16 frame_size; u32 flags; @@ -450,21 +469,22 @@ err: mutex_unlock(&ring->lock); mutex_unlock(&ring->nhi->lock); } - +EXPORT_SYMBOL_GPL(tb_ring_start); /** - * ring_stop() - shutdown a ring + * tb_ring_stop() - shutdown a ring * * Must not be invoked from a callback. * - * This method will disable the ring. Further calls to ring_tx/ring_rx will - * return -ESHUTDOWN until ring_stop has been called. + * This method will disable the ring. Further calls to + * tb_ring_tx/tb_ring_rx will return -ESHUTDOWN until ring_stop has been + * called. * * All enqueued frames will be canceled and their callbacks will be executed * with frame->canceled set to true (on the callback thread). This method * returns only after all callback invocations have finished. */ -void ring_stop(struct tb_ring *ring) +void tb_ring_stop(struct tb_ring *ring) { mutex_lock(&ring->nhi->lock); mutex_lock(&ring->lock); @@ -497,9 +517,10 @@ err: schedule_work(&ring->work); flush_work(&ring->work); } +EXPORT_SYMBOL_GPL(tb_ring_stop); /* - * ring_free() - free ring + * tb_ring_free() - free ring * * When this method returns all invocations of ring->callback will have * finished. @@ -508,7 +529,7 @@ err: * * Must NOT be called from ring_frame->callback! */ -void ring_free(struct tb_ring *ring) +void tb_ring_free(struct tb_ring *ring) { mutex_lock(&ring->nhi->lock); /* @@ -550,6 +571,7 @@ void ring_free(struct tb_ring *ring) mutex_destroy(&ring->lock); kfree(ring); } +EXPORT_SYMBOL_GPL(tb_ring_free); /** * nhi_mailbox_cmd() - Send a command through NHI mailbox diff --git a/drivers/thunderbolt/nhi.h b/drivers/thunderbolt/nhi.h index 4503ddbeccb3..771d09ca5dc5 100644 --- a/drivers/thunderbolt/nhi.h +++ b/drivers/thunderbolt/nhi.h @@ -7,152 +7,7 @@ #ifndef DSL3510_H_ #define DSL3510_H_ -#include -#include -#include - -/** - * struct tb_nhi - thunderbolt native host interface - * @lock: Must be held during ring creation/destruction. Is acquired by - * interrupt_work when dispatching interrupts to individual rings. - * @pdev: Pointer to the PCI device - * @iobase: MMIO space of the NHI - * @tx_rings: All Tx rings available on this host controller - * @rx_rings: All Rx rings available on this host controller - * @msix_ida: Used to allocate MSI-X vectors for rings - * @going_away: The host controller device is about to disappear so when - * this flag is set, avoid touching the hardware anymore. - * @interrupt_work: Work scheduled to handle ring interrupt when no - * MSI-X is used. - * @hop_count: Number of rings (end point hops) supported by NHI. 
- */ -struct tb_nhi { - struct mutex lock; - struct pci_dev *pdev; - void __iomem *iobase; - struct tb_ring **tx_rings; - struct tb_ring **rx_rings; - struct ida msix_ida; - bool going_away; - struct work_struct interrupt_work; - u32 hop_count; -}; - -/** - * struct tb_ring - thunderbolt TX or RX ring associated with a NHI - * @lock: Lock serializing actions to this ring. Must be acquired after - * nhi->lock. - * @nhi: Pointer to the native host controller interface - * @size: Size of the ring - * @hop: Hop (DMA channel) associated with this ring - * @head: Head of the ring (write next descriptor here) - * @tail: Tail of the ring (complete next descriptor here) - * @descriptors: Allocated descriptors for this ring - * @queue: Queue holding frames to be transferred over this ring - * @in_flight: Queue holding frames that are currently in flight - * @work: Interrupt work structure - * @is_tx: Is the ring Tx or Rx - * @running: Is the ring running - * @irq: MSI-X irq number if the ring uses MSI-X. %0 otherwise. - * @vector: MSI-X vector number the ring uses (only set if @irq is > 0) - * @flags: Ring specific flags - * @sof_mask: Bit mask used to detect start of frame PDF - * @eof_mask: Bit mask used to detect end of frame PDF - */ -struct tb_ring { - struct mutex lock; - struct tb_nhi *nhi; - int size; - int hop; - int head; - int tail; - struct ring_desc *descriptors; - dma_addr_t descriptors_dma; - struct list_head queue; - struct list_head in_flight; - struct work_struct work; - bool is_tx:1; - bool running:1; - int irq; - u8 vector; - unsigned int flags; - u16 sof_mask; - u16 eof_mask; -}; - -/* Leave ring interrupt enabled on suspend */ -#define RING_FLAG_NO_SUSPEND BIT(0) -/* Configure the ring to be in frame mode */ -#define RING_FLAG_FRAME BIT(1) -/* Enable end-to-end flow control */ -#define RING_FLAG_E2E BIT(2) - -struct ring_frame; -typedef void (*ring_cb)(struct tb_ring*, struct ring_frame*, bool canceled); - -/** - * struct ring_frame - for use with ring_rx/ring_tx - */ -struct ring_frame { - dma_addr_t buffer_phy; - ring_cb callback; - struct list_head list; - u32 size:12; /* TX: in, RX: out*/ - u32 flags:12; /* RX: out */ - u32 eof:4; /* TX:in, RX: out */ - u32 sof:4; /* TX:in, RX: out */ -}; - -#define TB_FRAME_SIZE 0x100 /* minimum size for ring_rx */ - -struct tb_ring *ring_alloc_tx(struct tb_nhi *nhi, int hop, int size, - unsigned int flags); -struct tb_ring *ring_alloc_rx(struct tb_nhi *nhi, int hop, int size, - unsigned int flags, u16 sof_mask, u16 eof_mask); -void ring_start(struct tb_ring *ring); -void ring_stop(struct tb_ring *ring); -void ring_free(struct tb_ring *ring); - -int __ring_enqueue(struct tb_ring *ring, struct ring_frame *frame); - -/** - * ring_rx() - enqueue a frame on an RX ring - * - * frame->buffer, frame->buffer_phy and frame->callback have to be set. The - * buffer must contain at least TB_FRAME_SIZE bytes. - * - * frame->callback will be invoked with frame->size, frame->flags, frame->eof, - * frame->sof set once the frame has been received. - * - * If ring_stop is called after the packet has been enqueued frame->callback - * will be called with canceled set to true. - * - * Return: Returns ESHUTDOWN if ring_stop has been called. Zero otherwise. 
- */ -static inline int ring_rx(struct tb_ring *ring, struct ring_frame *frame) -{ - WARN_ON(ring->is_tx); - return __ring_enqueue(ring, frame); -} - -/** - * ring_tx() - enqueue a frame on an TX ring - * - * frame->buffer, frame->buffer_phy, frame->callback, frame->size, frame->eof - * and frame->sof have to be set. - * - * frame->callback will be invoked with once the frame has been transmitted. - * - * If ring_stop is called after the packet has been enqueued frame->callback - * will be called with canceled set to true. - * - * Return: Returns ESHUTDOWN if ring_stop has been called. Zero otherwise. - */ -static inline int ring_tx(struct tb_ring *ring, struct ring_frame *frame) -{ - WARN_ON(!ring->is_tx); - return __ring_enqueue(ring, frame); -} +#include enum nhi_fw_mode { NHI_FW_SAFE_MODE, diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h index 18c0e3d5e85c..9ddb83ad890f 100644 --- a/include/linux/thunderbolt.h +++ b/include/linux/thunderbolt.h @@ -15,10 +15,12 @@ #define THUNDERBOLT_H_ #include +#include #include #include #include #include +#include enum tb_cfg_pkg_type { TB_CFG_PKG_READ = 1, @@ -397,4 +399,160 @@ static inline struct tb_xdomain *tb_service_parent(struct tb_service *svc) return tb_to_xdomain(svc->dev.parent); } +/** + * struct tb_nhi - thunderbolt native host interface + * @lock: Must be held during ring creation/destruction. Is acquired by + * interrupt_work when dispatching interrupts to individual rings. + * @pdev: Pointer to the PCI device + * @iobase: MMIO space of the NHI + * @tx_rings: All Tx rings available on this host controller + * @rx_rings: All Rx rings available on this host controller + * @msix_ida: Used to allocate MSI-X vectors for rings + * @going_away: The host controller device is about to disappear so when + * this flag is set, avoid touching the hardware anymore. + * @interrupt_work: Work scheduled to handle ring interrupt when no + * MSI-X is used. + * @hop_count: Number of rings (end point hops) supported by NHI. + */ +struct tb_nhi { + struct mutex lock; + struct pci_dev *pdev; + void __iomem *iobase; + struct tb_ring **tx_rings; + struct tb_ring **rx_rings; + struct ida msix_ida; + bool going_away; + struct work_struct interrupt_work; + u32 hop_count; +}; + +/** + * struct tb_ring - thunderbolt TX or RX ring associated with a NHI + * @lock: Lock serializing actions to this ring. Must be acquired after + * nhi->lock. + * @nhi: Pointer to the native host controller interface + * @size: Size of the ring + * @hop: Hop (DMA channel) associated with this ring + * @head: Head of the ring (write next descriptor here) + * @tail: Tail of the ring (complete next descriptor here) + * @descriptors: Allocated descriptors for this ring + * @queue: Queue holding frames to be transferred over this ring + * @in_flight: Queue holding frames that are currently in flight + * @work: Interrupt work structure + * @is_tx: Is the ring Tx or Rx + * @running: Is the ring running + * @irq: MSI-X irq number if the ring uses MSI-X. %0 otherwise. 
+ * @vector: MSI-X vector number the ring uses (only set if @irq is > 0) + * @flags: Ring specific flags + * @sof_mask: Bit mask used to detect start of frame PDF + * @eof_mask: Bit mask used to detect end of frame PDF + */ +struct tb_ring { + struct mutex lock; + struct tb_nhi *nhi; + int size; + int hop; + int head; + int tail; + struct ring_desc *descriptors; + dma_addr_t descriptors_dma; + struct list_head queue; + struct list_head in_flight; + struct work_struct work; + bool is_tx:1; + bool running:1; + int irq; + u8 vector; + unsigned int flags; + u16 sof_mask; + u16 eof_mask; +}; + +/* Leave ring interrupt enabled on suspend */ +#define RING_FLAG_NO_SUSPEND BIT(0) +/* Configure the ring to be in frame mode */ +#define RING_FLAG_FRAME BIT(1) +/* Enable end-to-end flow control */ +#define RING_FLAG_E2E BIT(2) + +struct ring_frame; +typedef void (*ring_cb)(struct tb_ring *, struct ring_frame *, bool canceled); + +/** + * struct ring_frame - For use with ring_rx/ring_tx + * @buffer_phy: DMA mapped address of the frame + * @callback: Callback called when the frame is finished + * @list: Frame is linked to a queue using this + * @size: Size of the frame in bytes (%0 means %4096) + * @flags: Flags for the frame (see &enum ring_desc_flags) + * @eof: End of frame protocol defined field + * @sof: Start of frame protocol defined field + */ +struct ring_frame { + dma_addr_t buffer_phy; + ring_cb callback; + struct list_head list; + u32 size:12; + u32 flags:12; + u32 eof:4; + u32 sof:4; +}; + +/* Minimum size for ring_rx */ +#define TB_FRAME_SIZE 0x100 + +struct tb_ring *tb_ring_alloc_tx(struct tb_nhi *nhi, int hop, int size, + unsigned int flags); +struct tb_ring *tb_ring_alloc_rx(struct tb_nhi *nhi, int hop, int size, + unsigned int flags, u16 sof_mask, + u16 eof_mask); +void tb_ring_start(struct tb_ring *ring); +void tb_ring_stop(struct tb_ring *ring); +void tb_ring_free(struct tb_ring *ring); + +int __tb_ring_enqueue(struct tb_ring *ring, struct ring_frame *frame); + +/** + * tb_ring_rx() - enqueue a frame on an RX ring + * @ring: Ring to enqueue the frame + * @frame: Frame to enqueue + * + * @frame->buffer, @frame->buffer_phy and @frame->callback have to be set. The + * buffer must contain at least %TB_FRAME_SIZE bytes. + * + * @frame->callback will be invoked with @frame->size, @frame->flags, + * @frame->eof, @frame->sof set once the frame has been received. + * + * If ring_stop() is called after the packet has been enqueued + * @frame->callback will be called with canceled set to true. + * + * Return: Returns %-ESHUTDOWN if ring_stop has been called. Zero otherwise. + */ +static inline int tb_ring_rx(struct tb_ring *ring, struct ring_frame *frame) +{ + WARN_ON(ring->is_tx); + return __tb_ring_enqueue(ring, frame); +} + +/** + * tb_ring_tx() - enqueue a frame on a TX ring + * @ring: Ring to enqueue the frame + * @frame: Frame to enqueue + * + * @frame->buffer, @frame->buffer_phy, @frame->callback, @frame->size, + * @frame->eof and @frame->sof have to be set. + * + * @frame->callback will be invoked once the frame has been transmitted. + * + * If ring_stop() is called after the packet has been enqueued @frame->callback + * will be called with canceled set to true. + * + * Return: Returns %-ESHUTDOWN if ring_stop has been called. Zero otherwise.
+ */ +static inline int tb_ring_tx(struct tb_ring *ring, struct ring_frame *frame) +{ + WARN_ON(!ring->is_tx); + return __tb_ring_enqueue(ring, frame); +} + #endif /* THUNDERBOLT_H_ */ -- cgit v1.2.3 From 2a91ec63f8a11e70d4b958dd4df867fec0247179 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 2 Oct 2017 13:38:38 +0300 Subject: thunderbolt: Move ring descriptor flags to thunderbolt.h A Thunderbolt service driver might need to check if there was an error with the descriptor when in frame mode. We also add two Rx specific error flags RING_DESC_CRC_ERROR and RING_DESC_BUFFER_OVERRUN. Signed-off-by: Mika Westerberg Reviewed-by: Michael Jamet Reviewed-by: Yehezkel Bernat Reviewed-by: Andy Shevchenko Signed-off-by: David S. Miller --- drivers/thunderbolt/nhi_regs.h | 7 ------- include/linux/thunderbolt.h | 18 ++++++++++++++++++ 2 files changed, 18 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/thunderbolt/nhi_regs.h b/drivers/thunderbolt/nhi_regs.h index 491a4c0c18fc..5ed6934e31e7 100644 --- a/drivers/thunderbolt/nhi_regs.h +++ b/drivers/thunderbolt/nhi_regs.h @@ -17,13 +17,6 @@ enum ring_flags { RING_FLAG_ENABLE = 1 << 31, }; -enum ring_desc_flags { - RING_DESC_ISOCH = 0x1, /* TX only? */ - RING_DESC_COMPLETED = 0x2, /* set by NHI */ - RING_DESC_POSTED = 0x4, /* always set this */ - RING_DESC_INTERRUPT = 0x8, /* request an interrupt on completion */ -}; - /** * struct ring_desc - TX/RX ring entry * diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h index 9ddb83ad890f..e3b9af7be0ad 100644 --- a/include/linux/thunderbolt.h +++ b/include/linux/thunderbolt.h @@ -478,6 +478,24 @@ struct tb_ring { struct ring_frame; typedef void (*ring_cb)(struct tb_ring *, struct ring_frame *, bool canceled); +/** + * enum ring_desc_flags - Flags for DMA ring descriptor + * %RING_DESC_ISOCH: Enable isochronous DMA (Tx only) + * %RING_DESC_CRC_ERROR: In frame mode CRC check failed for the frame (Rx only) + * %RING_DESC_COMPLETED: Descriptor completed (set by NHI) + * %RING_DESC_POSTED: Always set this + * %RING_DESC_BUFFER_OVERRUN: RX buffer overrun + * %RING_DESC_INTERRUPT: Request an interrupt on completion + */ +enum ring_desc_flags { + RING_DESC_ISOCH = 0x1, + RING_DESC_CRC_ERROR = 0x1, + RING_DESC_COMPLETED = 0x2, + RING_DESC_POSTED = 0x4, + RING_DESC_BUFFER_OVERRUN = 0x04, + RING_DESC_INTERRUPT = 0x8, +}; + /** * struct ring_frame - For use with ring_rx/ring_tx * @buffer_phy: DMA mapped address of the frame -- cgit v1.2.3 From 22b7de1000e66d739c431d6be4e7e97c69fa7c98 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 2 Oct 2017 13:38:39 +0300 Subject: thunderbolt: Use spinlock in ring serialization This makes it possible to enqueue frames also from atomic context which is needed, for example, when networking packets are sent over a Thunderbolt cable. Signed-off-by: Mika Westerberg Reviewed-by: Michael Jamet Reviewed-by: Yehezkel Bernat Reviewed-by: Andy Shevchenko Signed-off-by: David S.
Miller --- drivers/thunderbolt/nhi.c | 26 ++++++++++++++------------ include/linux/thunderbolt.h | 2 +- 2 files changed, 15 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/drivers/thunderbolt/nhi.c b/drivers/thunderbolt/nhi.c index e0a47f7581cb..7d1891ec3c47 100644 --- a/drivers/thunderbolt/nhi.c +++ b/drivers/thunderbolt/nhi.c @@ -212,8 +212,10 @@ static void ring_work(struct work_struct *work) struct tb_ring *ring = container_of(work, typeof(*ring), work); struct ring_frame *frame; bool canceled = false; + unsigned long flags; LIST_HEAD(done); - mutex_lock(&ring->lock); + + spin_lock_irqsave(&ring->lock, flags); if (!ring->running) { /* Move all frames to done and mark them as canceled. */ @@ -241,7 +243,8 @@ static void ring_work(struct work_struct *work) ring_write_descriptors(ring); invoke_callback: - mutex_unlock(&ring->lock); /* allow callbacks to schedule new work */ + /* allow callbacks to schedule new work */ + spin_unlock_irqrestore(&ring->lock, flags); while (!list_empty(&done)) { frame = list_first_entry(&done, typeof(*frame), list); /* @@ -255,15 +258,17 @@ invoke_callback: int __tb_ring_enqueue(struct tb_ring *ring, struct ring_frame *frame) { + unsigned long flags; int ret = 0; - mutex_lock(&ring->lock); + + spin_lock_irqsave(&ring->lock, flags); if (ring->running) { list_add_tail(&frame->list, &ring->queue); ring_write_descriptors(ring); } else { ret = -ESHUTDOWN; } - mutex_unlock(&ring->lock); + spin_unlock_irqrestore(&ring->lock, flags); return ret; } EXPORT_SYMBOL_GPL(__tb_ring_enqueue); @@ -338,7 +343,7 @@ static struct tb_ring *tb_ring_alloc(struct tb_nhi *nhi, u32 hop, int size, if (!ring) goto err; - mutex_init(&ring->lock); + spin_lock_init(&ring->lock); INIT_LIST_HEAD(&ring->queue); INIT_LIST_HEAD(&ring->in_flight); INIT_WORK(&ring->work, ring_work); @@ -371,8 +376,6 @@ static struct tb_ring *tb_ring_alloc(struct tb_nhi *nhi, u32 hop, int size, return ring; err: - if (ring) - mutex_destroy(&ring->lock); kfree(ring); mutex_unlock(&nhi->lock); return NULL; @@ -419,7 +422,7 @@ void tb_ring_start(struct tb_ring *ring) u32 flags; mutex_lock(&ring->nhi->lock); - mutex_lock(&ring->lock); + spin_lock_irq(&ring->lock); if (ring->nhi->going_away) goto err; if (ring->running) { @@ -466,7 +469,7 @@ void tb_ring_start(struct tb_ring *ring) ring_interrupt_active(ring, true); ring->running = true; err: - mutex_unlock(&ring->lock); + spin_unlock_irq(&ring->lock); mutex_unlock(&ring->nhi->lock); } EXPORT_SYMBOL_GPL(tb_ring_start); @@ -487,7 +490,7 @@ EXPORT_SYMBOL_GPL(tb_ring_start); void tb_ring_stop(struct tb_ring *ring) { mutex_lock(&ring->nhi->lock); - mutex_lock(&ring->lock); + spin_lock_irq(&ring->lock); dev_info(&ring->nhi->pdev->dev, "stopping %s %d\n", RING_TYPE(ring), ring->hop); if (ring->nhi->going_away) @@ -508,7 +511,7 @@ void tb_ring_stop(struct tb_ring *ring) ring->running = false; err: - mutex_unlock(&ring->lock); + spin_unlock_irq(&ring->lock); mutex_unlock(&ring->nhi->lock); /* @@ -568,7 +571,6 @@ void tb_ring_free(struct tb_ring *ring) * to finish before freeing the ring. 
*/ flush_work(&ring->work); - mutex_destroy(&ring->lock); kfree(ring); } EXPORT_SYMBOL_GPL(tb_ring_free); diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h index e3b9af7be0ad..cf9e42db780f 100644 --- a/include/linux/thunderbolt.h +++ b/include/linux/thunderbolt.h @@ -448,7 +448,7 @@ struct tb_nhi { * @eof_mask: Bit mask used to detect end of frame PDF */ struct tb_ring { - struct mutex lock; + spinlock_t lock; struct tb_nhi *nhi; int size; int hop; -- cgit v1.2.3 From 59120e06101db72442acf4c8b364a0c76d8faa68 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 2 Oct 2017 13:38:40 +0300 Subject: thunderbolt: Use spinlock in NHI serialization This is needed because ring polling functionality can be called from atomic contexts when networking and other high-speed traffic is transferred over a Thunderbolt cable. Signed-off-by: Mika Westerberg Reviewed-by: Michael Jamet Reviewed-by: Yehezkel Bernat Reviewed-by: Andy Shevchenko Signed-off-by: David S. Miller --- drivers/thunderbolt/nhi.c | 75 +++++++++++++++++++++++++-------------------- include/linux/thunderbolt.h | 2 +- 2 files changed, 42 insertions(+), 35 deletions(-) (limited to 'include') diff --git a/drivers/thunderbolt/nhi.c b/drivers/thunderbolt/nhi.c index 7d1891ec3c47..0b3c0640048b 100644 --- a/drivers/thunderbolt/nhi.c +++ b/drivers/thunderbolt/nhi.c @@ -327,21 +327,9 @@ static struct tb_ring *tb_ring_alloc(struct tb_nhi *nhi, u32 hop, int size, if (transmit && hop == RING_E2E_UNUSED_HOPID) return NULL; - mutex_lock(&nhi->lock); - if (hop >= nhi->hop_count) { - dev_WARN(&nhi->pdev->dev, "invalid hop: %d\n", hop); - goto err; - } - if (transmit && nhi->tx_rings[hop]) { - dev_WARN(&nhi->pdev->dev, "TX hop %d already allocated\n", hop); - goto err; - } else if (!transmit && nhi->rx_rings[hop]) { - dev_WARN(&nhi->pdev->dev, "RX hop %d already allocated\n", hop); - goto err; - } ring = kzalloc(sizeof(*ring), GFP_KERNEL); if (!ring) - goto err; + return NULL; spin_lock_init(&ring->lock); INIT_LIST_HEAD(&ring->queue); @@ -359,25 +347,45 @@ static struct tb_ring *tb_ring_alloc(struct tb_nhi *nhi, u32 hop, int size, ring->tail = 0; ring->running = false; - if (ring_request_msix(ring, flags & RING_FLAG_NO_SUSPEND)) - goto err; - ring->descriptors = dma_alloc_coherent(&ring->nhi->pdev->dev, size * sizeof(*ring->descriptors), &ring->descriptors_dma, GFP_KERNEL | __GFP_ZERO); if (!ring->descriptors) - goto err; + goto err_free_ring; + if (ring_request_msix(ring, flags & RING_FLAG_NO_SUSPEND)) + goto err_free_descs; + + spin_lock_irq(&nhi->lock); + if (hop >= nhi->hop_count) { + dev_WARN(&nhi->pdev->dev, "invalid hop: %d\n", hop); + goto err_release_msix; + } + if (transmit && nhi->tx_rings[hop]) { + dev_WARN(&nhi->pdev->dev, "TX hop %d already allocated\n", hop); + goto err_release_msix; + } else if (!transmit && nhi->rx_rings[hop]) { + dev_WARN(&nhi->pdev->dev, "RX hop %d already allocated\n", hop); + goto err_release_msix; + } if (transmit) nhi->tx_rings[hop] = ring; else nhi->rx_rings[hop] = ring; - mutex_unlock(&nhi->lock); + spin_unlock_irq(&nhi->lock); + return ring; -err: +err_release_msix: + spin_unlock_irq(&nhi->lock); + ring_release_msix(ring); +err_free_descs: + dma_free_coherent(&ring->nhi->pdev->dev, + ring->size * sizeof(*ring->descriptors), + ring->descriptors, ring->descriptors_dma); +err_free_ring: kfree(ring); - mutex_unlock(&nhi->lock); + return NULL; } @@ -421,8 +429,8 @@ void tb_ring_start(struct tb_ring *ring) u16 frame_size; u32 flags; - mutex_lock(&ring->nhi->lock); - spin_lock_irq(&ring->lock); + 
spin_lock_irq(&ring->nhi->lock); + spin_lock(&ring->lock); if (ring->nhi->going_away) goto err; if (ring->running) { @@ -469,8 +477,8 @@ ring_interrupt_active(ring, true); ring->running = true; err: - spin_unlock_irq(&ring->lock); - mutex_unlock(&ring->nhi->lock); + spin_unlock(&ring->lock); + spin_unlock_irq(&ring->nhi->lock); } EXPORT_SYMBOL_GPL(tb_ring_start); @@ -489,8 +497,8 @@ EXPORT_SYMBOL_GPL(tb_ring_start); */ void tb_ring_stop(struct tb_ring *ring) { - mutex_lock(&ring->nhi->lock); - spin_lock_irq(&ring->lock); + spin_lock_irq(&ring->nhi->lock); + spin_lock(&ring->lock); dev_info(&ring->nhi->pdev->dev, "stopping %s %d\n", RING_TYPE(ring), ring->hop); if (ring->nhi->going_away) @@ -511,8 +519,8 @@ void tb_ring_stop(struct tb_ring *ring) ring->running = false; err: - spin_unlock_irq(&ring->lock); - mutex_unlock(&ring->nhi->lock); + spin_unlock(&ring->lock); + spin_unlock_irq(&ring->nhi->lock); /* * schedule ring->work to invoke callbacks on all remaining frames. @@ -534,7 +542,7 @@ EXPORT_SYMBOL_GPL(tb_ring_stop); */ void tb_ring_free(struct tb_ring *ring) { - mutex_lock(&ring->nhi->lock); + spin_lock_irq(&ring->nhi->lock); /* * Dissociate the ring from the NHI. This also ensures that * nhi_interrupt_work cannot reschedule ring->work. @@ -564,7 +572,7 @@ void tb_ring_free(struct tb_ring *ring) RING_TYPE(ring), ring->hop); - mutex_unlock(&ring->nhi->lock); + spin_unlock_irq(&ring->nhi->lock); /** * ring->work can no longer be scheduled (it is scheduled only * by nhi_interrupt_work, ring_stop and ring_msix). Wait for it @@ -639,7 +647,7 @@ static void nhi_interrupt_work(struct work_struct *work) int type = 0; /* current interrupt type 0: TX, 1: RX, 2: RX overflow */ struct tb_ring *ring; - mutex_lock(&nhi->lock); + spin_lock_irq(&nhi->lock); /* * Starting at REG_RING_NOTIFY_BASE there are three status bitfields @@ -677,7 +685,7 @@ static void nhi_interrupt_work(struct work_struct *work) /* we do not check ring->running, this is done in ring->work */ schedule_work(&ring->work); } - mutex_unlock(&nhi->lock); + spin_unlock_irq(&nhi->lock); } static irqreturn_t nhi_msi(int irq, void *data) @@ -767,7 +775,6 @@ static void nhi_shutdown(struct tb_nhi *nhi) devm_free_irq(&nhi->pdev->dev, nhi->pdev->irq, nhi); flush_work(&nhi->interrupt_work); } - mutex_destroy(&nhi->lock); ida_destroy(&nhi->msix_ida); } @@ -856,7 +863,7 @@ static int nhi_probe(struct pci_dev *pdev, const struct pci_device_id *id) return res; } - mutex_init(&nhi->lock); + spin_lock_init(&nhi->lock); pci_set_master(pdev); diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h index cf9e42db780f..d59e3f9a35c4 100644 --- a/include/linux/thunderbolt.h +++ b/include/linux/thunderbolt.h @@ -415,7 +415,7 @@ static inline struct tb_xdomain *tb_service_parent(struct tb_service *svc) * @hop_count: Number of rings (end point hops) supported by NHI. */ struct tb_nhi { - struct mutex lock; + spinlock_t lock; struct pci_dev *pdev; void __iomem *iobase; struct tb_ring **tx_rings; -- cgit v1.2.3 From 4ffe722eefcb07c76701f03e0d759fbaecedf79f Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 2 Oct 2017 13:38:41 +0300 Subject: thunderbolt: Add polling mode for rings In order to support things like networking over a Thunderbolt cable, there needs to be a way to switch the ring to a mode where it can be polled with the interrupt masked. We implement such a mode so that the caller can allocate a ring by passing a pointer to a function that is then called when an interrupt is triggered.
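[Editor's note: the following is a hedged sketch of how a service driver might use this polling mode, based only on the interfaces added by this patch. The my_* names, the deferred-work choice, and the hop/size/mask values are hypothetical; a real consumer could just as well kick NAPI from the start_poll hook.

#include <linux/errno.h>
#include <linux/thunderbolt.h>
#include <linux/workqueue.h>

struct my_priv {				/* hypothetical driver context */
	struct tb_ring *rx_ring;
	struct work_struct poll_work;
};

static void my_handle_frame(struct my_priv *priv,
			    struct ring_frame *frame);	/* hypothetical consumer */

/* start_poll runs with the ring interrupt masked; defer the real work. */
static void my_start_poll(void *data)
{
	struct my_priv *priv = data;

	schedule_work(&priv->poll_work);
}

static void my_poll_work(struct work_struct *work)
{
	struct my_priv *priv = container_of(work, struct my_priv, poll_work);
	struct ring_frame *frame;

	/* Drain every completed frame... */
	while ((frame = tb_ring_poll(priv->rx_ring)))
		my_handle_frame(priv, frame);

	/* ...then unmask the ring interrupt again. */
	tb_ring_poll_complete(priv->rx_ring);
}

static int my_rings_init(struct tb_nhi *nhi, struct my_priv *priv)
{
	INIT_WORK(&priv->poll_work, my_poll_work);
	priv->rx_ring = tb_ring_alloc_rx(nhi, 1, 256, RING_FLAG_FRAME,
					 0xffff, 0xffff, my_start_poll, priv);
	return priv->rx_ring ? 0 : -ENOMEM;
}

End of editorial note.]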
Completed frames can be fetched using tb_ring_poll() and the interrupt can be re-enabled when the caller is finished with polling by using tb_ring_poll_complete(). Signed-off-by: Mika Westerberg Reviewed-by: Michael Jamet Reviewed-by: Yehezkel Bernat Reviewed-by: Andy Shevchenko Signed-off-by: David S. Miller --- drivers/thunderbolt/ctl.c | 2 +- drivers/thunderbolt/nhi.c | 126 ++++++++++++++++++++++++++++++++++++++++---- include/linux/thunderbolt.h | 23 +++++--- 3 files changed, 134 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/drivers/thunderbolt/ctl.c b/drivers/thunderbolt/ctl.c index dd10789e1dbb..d079dbba2c03 100644 --- a/drivers/thunderbolt/ctl.c +++ b/drivers/thunderbolt/ctl.c @@ -619,7 +619,7 @@ struct tb_ctl *tb_ctl_alloc(struct tb_nhi *nhi, event_cb cb, void *cb_data) goto err; ctl->rx = tb_ring_alloc_rx(nhi, 0, 10, RING_FLAG_NO_SUSPEND, 0xffff, - 0xffff); + 0xffff, NULL, NULL); if (!ctl->rx) goto err; diff --git a/drivers/thunderbolt/nhi.c b/drivers/thunderbolt/nhi.c index 0b3c0640048b..af0a80ddf594 100644 --- a/drivers/thunderbolt/nhi.c +++ b/drivers/thunderbolt/nhi.c @@ -252,7 +252,8 @@ invoke_callback: * Do not hold on to it. */ list_del_init(&frame->list); - frame->callback(ring, frame, canceled); + if (frame->callback) + frame->callback(ring, frame, canceled); } } @@ -273,11 +274,106 @@ int __tb_ring_enqueue(struct tb_ring *ring, struct ring_frame *frame) } EXPORT_SYMBOL_GPL(__tb_ring_enqueue); +/** + * tb_ring_poll() - Poll one completed frame from the ring + * @ring: Ring to poll + * + * This function can be called when the @start_poll callback of the @ring + * has been called. It will read one completed frame from the ring and + * return it to the caller. Returns %NULL if there are no more completed + * frames. + */ +struct ring_frame *tb_ring_poll(struct tb_ring *ring) +{ + struct ring_frame *frame = NULL; + unsigned long flags; + + spin_lock_irqsave(&ring->lock, flags); + if (!ring->running) + goto unlock; + if (ring_empty(ring)) + goto unlock; + + if (ring->descriptors[ring->tail].flags & RING_DESC_COMPLETED) { + frame = list_first_entry(&ring->in_flight, typeof(*frame), + list); + list_del_init(&frame->list); + + if (!ring->is_tx) { + frame->size = ring->descriptors[ring->tail].length; + frame->eof = ring->descriptors[ring->tail].eof; + frame->sof = ring->descriptors[ring->tail].sof; + frame->flags = ring->descriptors[ring->tail].flags; + } + + ring->tail = (ring->tail + 1) % ring->size; + } + +unlock: + spin_unlock_irqrestore(&ring->lock, flags); + return frame; +} +EXPORT_SYMBOL_GPL(tb_ring_poll); + +static void __ring_interrupt_mask(struct tb_ring *ring, bool mask) +{ + int idx = ring_interrupt_index(ring); + int reg = REG_RING_INTERRUPT_BASE + idx / 32 * 4; + int bit = idx % 32; + u32 val; + + val = ioread32(ring->nhi->iobase + reg); + if (mask) + val &= ~BIT(bit); + else + val |= BIT(bit); + iowrite32(val, ring->nhi->iobase + reg); +} + +/* Both @nhi->lock and @ring->lock should be held */ +static void __ring_interrupt(struct tb_ring *ring) +{ + if (!ring->running) + return; + + if (ring->start_poll) { + __ring_interrupt_mask(ring, false); + ring->start_poll(ring->poll_data); + } else { + schedule_work(&ring->work); + } +} + +/** + * tb_ring_poll_complete() - Re-start interrupt for the ring + * @ring: Ring to re-start the interrupt + * + * This will re-start (unmask) the ring interrupt once the user is done + * with polling.
+ */ +void tb_ring_poll_complete(struct tb_ring *ring) +{ + unsigned long flags; + + spin_lock_irqsave(&ring->nhi->lock, flags); + spin_lock(&ring->lock); + if (ring->start_poll) + __ring_interrupt_mask(ring, false); + spin_unlock(&ring->lock); + spin_unlock_irqrestore(&ring->nhi->lock, flags); +} +EXPORT_SYMBOL_GPL(tb_ring_poll_complete); + static irqreturn_t ring_msix(int irq, void *data) { struct tb_ring *ring = data; - schedule_work(&ring->work); + spin_lock(&ring->nhi->lock); + spin_lock(&ring->lock); + __ring_interrupt(ring); + spin_unlock(&ring->lock); + spin_unlock(&ring->nhi->lock); + return IRQ_HANDLED; } @@ -317,7 +413,9 @@ static void ring_release_msix(struct tb_ring *ring) static struct tb_ring *tb_ring_alloc(struct tb_nhi *nhi, u32 hop, int size, bool transmit, unsigned int flags, - u16 sof_mask, u16 eof_mask) + u16 sof_mask, u16 eof_mask, + void (*start_poll)(void *), + void *poll_data) { struct tb_ring *ring = NULL; dev_info(&nhi->pdev->dev, "allocating %s ring %d of size %d\n", @@ -346,6 +444,8 @@ static struct tb_ring *tb_ring_alloc(struct tb_nhi *nhi, u32 hop, int size, ring->head = 0; ring->tail = 0; ring->running = false; + ring->start_poll = start_poll; + ring->poll_data = poll_data; ring->descriptors = dma_alloc_coherent(&ring->nhi->pdev->dev, size * sizeof(*ring->descriptors), @@ -399,7 +499,7 @@ err_free_ring: struct tb_ring *tb_ring_alloc_tx(struct tb_nhi *nhi, int hop, int size, unsigned int flags) { - return tb_ring_alloc(nhi, hop, size, true, flags, 0, 0); + return tb_ring_alloc(nhi, hop, size, true, flags, 0, 0, NULL, NULL); } EXPORT_SYMBOL_GPL(tb_ring_alloc_tx); @@ -411,11 +511,17 @@ EXPORT_SYMBOL_GPL(tb_ring_alloc_tx); * @flags: Flags for the ring * @sof_mask: Mask of PDF values that start a frame * @eof_mask: Mask of PDF values that end a frame + * @start_poll: If not %NULL the ring will call this function when an + * interrupt is triggered and masked, instead of callback + * in each Rx frame. + * @poll_data: Optional data passed to @start_poll */ struct tb_ring *tb_ring_alloc_rx(struct tb_nhi *nhi, int hop, int size, - unsigned int flags, u16 sof_mask, u16 eof_mask) + unsigned int flags, u16 sof_mask, u16 eof_mask, + void (*start_poll)(void *), void *poll_data) { - return tb_ring_alloc(nhi, hop, size, false, flags, sof_mask, eof_mask); + return tb_ring_alloc(nhi, hop, size, false, flags, sof_mask, eof_mask, + start_poll, poll_data); } EXPORT_SYMBOL_GPL(tb_ring_alloc_rx); @@ -556,6 +662,7 @@ void tb_ring_free(struct tb_ring *ring) dev_WARN(&ring->nhi->pdev->dev, "%s %d still running\n", RING_TYPE(ring), ring->hop); } + spin_unlock_irq(&ring->nhi->lock); ring_release_msix(ring); @@ -572,7 +679,6 @@ void tb_ring_free(struct tb_ring *ring) RING_TYPE(ring), ring->hop); - spin_unlock_irq(&ring->nhi->lock); /** * ring->work can no longer be scheduled (it is scheduled only * by nhi_interrupt_work, ring_stop and ring_msix). 
Wait for it @@ -682,8 +788,10 @@ static void nhi_interrupt_work(struct work_struct *work) hop); continue; } - /* we do not check ring->running, this is done in ring->work */ - schedule_work(&ring->work); + + spin_lock(&ring->lock); + __ring_interrupt(ring); + spin_unlock(&ring->lock); } spin_unlock_irq(&nhi->lock); } diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h index d59e3f9a35c4..36925e3aec7c 100644 --- a/include/linux/thunderbolt.h +++ b/include/linux/thunderbolt.h @@ -446,6 +446,9 @@ struct tb_nhi { * @flags: Ring specific flags * @sof_mask: Bit mask used to detect start of frame PDF * @eof_mask: Bit mask used to detect end of frame PDF + * @start_poll: Called when ring interrupt is triggered to start + * polling. Passing %NULL keeps the ring in interrupt mode. + * @poll_data: Data passed to @start_poll */ struct tb_ring { spinlock_t lock; @@ -466,6 +469,8 @@ struct tb_ring { unsigned int flags; u16 sof_mask; u16 eof_mask; + void (*start_poll)(void *data); + void *poll_data; }; /* Leave ring interrupt enabled on suspend */ @@ -499,7 +504,7 @@ enum ring_desc_flags { /** * struct ring_frame - For use with ring_rx/ring_tx * @buffer_phy: DMA mapped address of the frame - * @callback: Callback called when the frame is finished + * @callback: Callback called when the frame is finished (optional) * @list: Frame is linked to a queue using this * @size: Size of the frame in bytes (%0 means %4096) * @flags: Flags for the frame (see &enum ring_desc_flags) @@ -522,8 +527,8 @@ struct ring_frame { struct tb_ring *tb_ring_alloc_tx(struct tb_nhi *nhi, int hop, int size, unsigned int flags); struct tb_ring *tb_ring_alloc_rx(struct tb_nhi *nhi, int hop, int size, - unsigned int flags, u16 sof_mask, - u16 eof_mask); + unsigned int flags, u16 sof_mask, u16 eof_mask, + void (*start_poll)(void *), void *poll_data); void tb_ring_start(struct tb_ring *ring); void tb_ring_stop(struct tb_ring *ring); void tb_ring_free(struct tb_ring *ring); @@ -535,8 +540,8 @@ int __tb_ring_enqueue(struct tb_ring *ring, struct ring_frame *frame); * @ring: Ring to enqueue the frame * @frame: Frame to enqueue * - * @frame->buffer, @frame->buffer_phy and @frame->callback have to be set. The - * buffer must contain at least %TB_FRAME_SIZE bytes. + * @frame->buffer and @frame->buffer_phy have to be set. The buffer must + * contain at least %TB_FRAME_SIZE bytes. * * @frame->callback will be invoked with @frame->size, @frame->flags, * @frame->eof, @frame->sof set once the frame has been received. @@ -557,8 +562,8 @@ static inline int tb_ring_rx(struct tb_ring *ring, struct ring_frame *frame) * @ring: Ring to enqueue the frame * @frame: Frame to enqueue * - * @frame->buffer, @frame->buffer_phy, @frame->callback, @frame->size, - * @frame->eof and @frame->sof have to be set. + * @frame->buffer, @frame->buffer_phy, @frame->size, @frame->eof and + * @frame->sof have to be set. * * @frame->callback will be invoked once the frame has been transmitted.
* @@ -573,4 +578,8 @@ static inline int tb_ring_tx(struct tb_ring *ring, struct ring_frame *frame) return __tb_ring_enqueue(ring, frame); } +/* Used only when the ring is in polling mode */ +struct ring_frame *tb_ring_poll(struct tb_ring *ring); +void tb_ring_poll_complete(struct tb_ring *ring); + #endif /* THUNDERBOLT_H_ */ -- cgit v1.2.3 From 3304559e353f098d7e0ed5ca981e26c406513e12 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 2 Oct 2017 13:38:42 +0300 Subject: thunderbolt: Add function to retrieve DMA device for the ring This is needed when Thunderbolt service drivers need to DMA map memory before it is passed down to the ring. Signed-off-by: Mika Westerberg Reviewed-by: Michael Jamet Reviewed-by: Yehezkel Bernat Reviewed-by: Andy Shevchenko Signed-off-by: David S. Miller --- include/linux/thunderbolt.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include') diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h index 36925e3aec7c..7b69853188b1 100644 --- a/include/linux/thunderbolt.h +++ b/include/linux/thunderbolt.h @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -582,4 +583,16 @@ static inline int tb_ring_tx(struct tb_ring *ring, struct ring_frame *frame) struct ring_frame *tb_ring_poll(struct tb_ring *ring); void tb_ring_poll_complete(struct tb_ring *ring); +/** + * tb_ring_dma_device() - Return device used for DMA mapping + * @ring: Ring whose DMA device is retrieved + * + * Use this function when you are mapping DMA for buffers that are + * passed to the ring for sending/receiving. + */ +static inline struct device *tb_ring_dma_device(struct tb_ring *ring) +{ + return &ring->nhi->pdev->dev; +} + #endif /* THUNDERBOLT_H_ */ -- cgit v1.2.3 From abf4bb6b63d0a54266f8e7eff3720c1974063971 Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Tue, 3 Oct 2017 09:58:06 +0200 Subject: skbuff: Add the offload_mr_fwd_mark field Similarly to the offload_fwd_mark field, the offload_mr_fwd_mark field is used to allow partial offloading of MFC multicast routes. Switchdev drivers can offload MFC multicast routes to the hardware by registering to the FIB notification chain. When one of the route output interfaces is not offload-able, i.e. has a different parent ID, the route cannot be fully offloaded by the hardware. Examples of non-offload-able devices are a management NIC, dummy device, pimreg device, etc. A similar problem exists in the bridge module, as one bridge can hold interfaces with different parent IDs. At the bridge, the problem is solved by the offload_fwd_mark skb field. Currently, when a route cannot go through full offload, the only solution for a switchdev driver is not to offload it at all and let the packet go through slow path. Using the offload_mr_fwd_mark field, a driver can indicate that a packet was already forwarded by hardware to all the devices with the same parent ID as the input device. Further patches in this patch-set are going to enhance ipmr to skip multicast forwarding to devices with the same parent ID if a packet is marked with that field. The reason why the already existing "offload_fwd_mark" bit cannot be used is that a switchdev driver would want to make the distinction between a packet that has already gone through L2 forwarding but did not go through multicast forwarding, and a packet that has already gone through both L2 and multicast forwarding.
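[Editor's note: a minimal, hypothetical sketch of how a switchdev driver might apply the distinction described above when completing a trapped packet. It assumes CONFIG_NET_SWITCHDEV; the trap-reason names are invented for illustration, and the two cases correspond to the scenarios spelled out next.

#include <linux/skbuff.h>

enum example_trap_reason {			/* hypothetical trap reasons */
	EXAMPLE_TRAP_MC_MTU_ERROR,		/* exception during MC forwarding */
	EXAMPLE_TRAP_MC_PIMREG,			/* pimreg/dummy output interface */
};

static void example_mark_trapped_skb(struct sk_buff *skb,
				     enum example_trap_reason reason)
{
	switch (reason) {
	case EXAMPLE_TRAP_MC_MTU_ERROR:
		/* Hardware finished L2 forwarding but not multicast. */
		skb->offload_fwd_mark = 1;
		break;
	case EXAMPLE_TRAP_MC_PIMREG:
		/* Hardware finished both L2 and (partial) multicast. */
		skb->offload_fwd_mark = 1;
		skb->offload_mr_fwd_mark = 1;
		break;
	}
}

End of editorial note.]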
For example: when a packet is ingressing from a switchport enslaved to a bridge, which is configured with multicast forwarding, the following scenarios are possible: - The packet can be trapped to the CPU due to an exception during multicast forwarding (for example, an MTU error). In that case, it had already gone through L2 forwarding in the hardware, thus a switchdev driver would want to set the skb->offload_fwd_mark and not the skb->offload_mr_fwd_mark. - The packet can also be trapped due to a pimreg/dummy device used as one of the output interfaces. In that case, it can go through both L2 and (partial) multicast forwarding inside the hardware, thus a switchdev driver would want to set both the skb->offload_fwd_mark and skb->offload_mr_fwd_mark. Signed-off-by: Yotam Gigi Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/skbuff.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 19e64bfb1a66..ada821466e88 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -772,6 +772,7 @@ struct sk_buff { __u8 remcsum_offload:1; #ifdef CONFIG_NET_SWITCHDEV __u8 offload_fwd_mark:1; + __u8 offload_mr_fwd_mark:1; #endif #ifdef CONFIG_NET_CLS_ACT __u8 tc_skip_classify:1; -- cgit v1.2.3 From 5d8b3e69fc5e5ccafc9db1251bb7c78a8622fddd Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Tue, 3 Oct 2017 09:58:07 +0200 Subject: ipv4: ipmr: Add the parent ID field to VIF struct In order to allow the ipmr module to do partial multicast forwarding according to the device parent ID, add the device parent ID field to the VIF struct. This way, the forwarding path can use the parent ID field without invoking switchdev calls, which requires the RTNL lock. When a new VIF is added, set the device parent ID field in it by invoking the switchdev_port_attr_get call. Signed-off-by: Yotam Gigi Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S.
Miller --- include/linux/mroute.h | 1 + net/ipv4/ipmr.c | 11 +++++++++++ 2 files changed, 12 insertions(+) (limited to 'include') diff --git a/include/linux/mroute.h b/include/linux/mroute.h index b072a84fbe1c..8242d05df35e 100644 --- a/include/linux/mroute.h +++ b/include/linux/mroute.h @@ -57,6 +57,7 @@ static inline bool ipmr_rule_default(const struct fib_rule *rule) struct vif_device { struct net_device *dev; /* Device we are using */ + struct netdev_phys_item_id dev_parent_id; /* Device parent ID */ unsigned long bytes_in,bytes_out; unsigned long pkt_in,pkt_out; /* Statistics */ unsigned long rate_limit; /* Traffic shaping (NI) */ diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index a844738b38bd..1b161ada7ae6 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -67,6 +67,7 @@ #include #include #include +#include struct ipmr_rule { struct fib_rule common; @@ -868,6 +869,9 @@ static int vif_add(struct net *net, struct mr_table *mrt, struct vifctl *vifc, int mrtsock) { int vifi = vifc->vifc_vifi; + struct switchdev_attr attr = { + .id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID, + }; struct vif_device *v = &mrt->vif_table[vifi]; struct net_device *dev; struct in_device *in_dev; @@ -942,6 +946,13 @@ static int vif_add(struct net *net, struct mr_table *mrt, /* Fill in the VIF structures */ + attr.orig_dev = dev; + if (!switchdev_port_attr_get(dev, &attr)) { + memcpy(v->dev_parent_id.id, attr.u.ppid.id, attr.u.ppid.id_len); + v->dev_parent_id.id_len = attr.u.ppid.id_len; + } else { + v->dev_parent_id.id_len = 0; + } v->rate_limit = vifc->vifc_rate_limit; v->local = vifc->vifc_lcl_addr.s_addr; v->remote = vifc->vifc_rmt_addr.s_addr; -- cgit v1.2.3 From 6c5570016b972d9b1f0f6c2dca9cc0422b1f92bf Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 2 Oct 2017 23:50:05 +0200 Subject: net: core: decouple ifalias get/set from rtnl lock Device alias can be set by either rtnetlink (rtnl is held) or sysfs. rtnetlink holds the rtnl mutex, sysfs acquires it for this purpose. Add an extra mutex for it and use rcu to protect concurrent accesses. This allows the sysfs path to not take rtnl and would later allow not holding it when dumping ifalias. Based on a suggestion from Eric Dumazet. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/linux/netdevice.h | 8 +++++++- net/core/dev.c | 52 +++++++++++++++++++++++++++++++++++------------ net/core/net-sysfs.c | 17 ++++++++-------- net/core/rtnetlink.c | 13 ++++++++++-- 4 files changed, 65 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e1d6ef130611..d04424cfffba 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -826,6 +826,11 @@ struct xfrmdev_ops { }; #endif +struct dev_ifalias { + struct rcu_head rcuhead; + char ifalias[]; +}; + /* * This structure defines the management hooks for network devices.
* The following hooks can be defined; unless noted otherwise, they are @@ -1632,7 +1637,7 @@ struct net_device { char name[IFNAMSIZ]; struct hlist_node name_hlist; - char *ifalias; + struct dev_ifalias __rcu *ifalias; /* * I/O specific fields * FIXME: Merge these and struct ifmap into one @@ -3275,6 +3280,7 @@ void __dev_notify_flags(struct net_device *, unsigned int old_flags, unsigned int gchanges); int dev_change_name(struct net_device *, const char *); int dev_set_alias(struct net_device *, const char *, size_t); +int dev_get_alias(const struct net_device *, char *, size_t); int dev_change_net_namespace(struct net_device *, struct net *, const char *); int __dev_set_mtu(struct net_device *, int); int dev_set_mtu(struct net_device *, int); diff --git a/net/core/dev.c b/net/core/dev.c index e350c768d4b5..1770097cfd86 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -188,6 +188,8 @@ static struct napi_struct *napi_by_id(unsigned int napi_id); DEFINE_RWLOCK(dev_base_lock); EXPORT_SYMBOL(dev_base_lock); +static DEFINE_MUTEX(ifalias_mutex); + /* protects napi_hash addition/deletion and napi_gen_id */ static DEFINE_SPINLOCK(napi_hash_lock); @@ -1265,29 +1267,53 @@ rollback: */ int dev_set_alias(struct net_device *dev, const char *alias, size_t len) { - char *new_ifalias; - - ASSERT_RTNL(); + struct dev_ifalias *new_alias = NULL; if (len >= IFALIASZ) return -EINVAL; - if (!len) { - kfree(dev->ifalias); - dev->ifalias = NULL; - return 0; + if (len) { + new_alias = kmalloc(sizeof(*new_alias) + len + 1, GFP_KERNEL); + if (!new_alias) + return -ENOMEM; + + memcpy(new_alias->ifalias, alias, len); + new_alias->ifalias[len] = 0; } - new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL); - if (!new_ifalias) - return -ENOMEM; - dev->ifalias = new_ifalias; - memcpy(dev->ifalias, alias, len); - dev->ifalias[len] = 0; + mutex_lock(&ifalias_mutex); + rcu_swap_protected(dev->ifalias, new_alias, + mutex_is_locked(&ifalias_mutex)); + mutex_unlock(&ifalias_mutex); + + if (new_alias) + kfree_rcu(new_alias, rcuhead); return len; } +/** + * dev_get_alias - get ifalias of a device + * @dev: device + * @name: buffer to store name of ifalias + * @len: size of buffer + * + * get ifalias for a device. Caller must make sure dev cannot go + * away, e.g. rcu read lock or own a reference count to device. + */ +int dev_get_alias(const struct net_device *dev, char *name, size_t len) +{ + const struct dev_ifalias *alias; + int ret = 0; + + rcu_read_lock(); + alias = rcu_dereference(dev->ifalias); + if (alias) + ret = snprintf(name, len, "%s", alias->ifalias); + rcu_read_unlock(); + + return ret; +} /** * netdev_features_change - device changes features diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 927a6dcbad96..51d5836d8fb9 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -391,10 +391,7 @@ static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr, if (len > 0 && buf[len - 1] == '\n') --count; - if (!rtnl_trylock()) - return restart_syscall(); ret = dev_set_alias(netdev, buf, count); - rtnl_unlock(); return ret < 0 ?
ret : len; } @@ -403,13 +400,12 @@ static ssize_t ifalias_show(struct device *dev, struct device_attribute *attr, char *buf) { const struct net_device *netdev = to_net_dev(dev); + char tmp[IFALIASZ]; ssize_t ret = 0; - if (!rtnl_trylock()) - return restart_syscall(); - if (netdev->ifalias) - ret = sprintf(buf, "%s\n", netdev->ifalias); - rtnl_unlock(); + ret = dev_get_alias(netdev, tmp, sizeof(tmp)); + if (ret > 0) + ret = sprintf(buf, "%s\n", tmp); return ret; } static DEVICE_ATTR_RW(ifalias); @@ -1488,7 +1484,10 @@ static void netdev_release(struct device *d) BUG_ON(dev->reg_state != NETREG_RELEASED); - kfree(dev->ifalias); + /* no need to wait for rcu grace period: + * device is dead and about to be freed. + */ + kfree(rcu_access_pointer(dev->ifalias)); netdev_freemem(dev); } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index e6955da0d58d..3961f87cdc76 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1366,6 +1366,16 @@ static int nla_put_iflink(struct sk_buff *skb, const struct net_device *dev) return nla_put_u32(skb, IFLA_LINK, ifindex); } +static noinline_for_stack int nla_put_ifalias(struct sk_buff *skb, + struct net_device *dev) +{ + char buf[IFALIASZ]; + int ret; + + ret = dev_get_alias(dev, buf, sizeof(buf)); + return ret > 0 ? nla_put_string(skb, IFLA_IFALIAS, buf) : 0; +} + static int rtnl_fill_link_netnsid(struct sk_buff *skb, const struct net_device *dev) { @@ -1425,8 +1435,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, nla_put_u8(skb, IFLA_CARRIER, netif_carrier_ok(dev)) || (dev->qdisc && nla_put_string(skb, IFLA_QDISC, dev->qdisc->ops->id)) || - (dev->ifalias && - nla_put_string(skb, IFLA_IFALIAS, dev->ifalias)) || + nla_put_ifalias(skb, dev) || nla_put_u32(skb, IFLA_CARRIER_CHANGES, atomic_read(&dev->carrier_changes)) || nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down)) -- cgit v1.2.3 From f952be79cebd49d04154781d99408867a069d375 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Tue, 3 Oct 2017 19:20:11 -0300 Subject: sctp: introduce struct sctp_stream_out_ext With the stream schedulers, sctp_stream_out will become too big to be allocated by kmalloc and as we need to allocate with BH disabled, we cannot use __vmalloc in sctp_stream_init(). This patch moves out the stats from sctp_stream_out to sctp_stream_out_ext, which will be allocated only when the application tries to sendmsg something on it. Just the introduction of sctp_stream_out_ext would already fix the issue described above by splitting the allocation in two. Moving the stats to it also reduces the pressure on the allocator as we will ask for less memory atomically when creating the socket and we will use GFP_KERNEL later. Then, for stream schedulers, we will just use sctp_stream_out_ext. Tested-by: Xin Long Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. 
Miller --- include/net/sctp/structs.h | 10 ++++++++-- net/sctp/chunk.c | 6 +++--- net/sctp/outqueue.c | 4 ++-- net/sctp/socket.c | 27 +++++++++++++++++++++------ net/sctp/stream.c | 16 ++++++++++++++++ 5 files changed, 50 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 0477945de1a3..9b2b30b3ba4d 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -84,6 +84,7 @@ struct sctp_ulpq; struct sctp_ep_common; struct crypto_shash; struct sctp_stream; +struct sctp_stream_out; #include @@ -380,6 +381,7 @@ struct sctp_sender_hb_info { int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt, gfp_t gfp); +int sctp_stream_init_ext(struct sctp_stream *stream, __u16 sid); void sctp_stream_free(struct sctp_stream *stream); void sctp_stream_clear(struct sctp_stream *stream); void sctp_stream_update(struct sctp_stream *stream, struct sctp_stream *new); @@ -1315,11 +1317,15 @@ struct sctp_inithdr_host { __u32 initial_tsn; }; +struct sctp_stream_out_ext { + __u64 abandoned_unsent[SCTP_PR_INDEX(MAX) + 1]; + __u64 abandoned_sent[SCTP_PR_INDEX(MAX) + 1]; +}; + struct sctp_stream_out { __u16 ssn; __u8 state; - __u64 abandoned_unsent[SCTP_PR_INDEX(MAX) + 1]; - __u64 abandoned_sent[SCTP_PR_INDEX(MAX) + 1]; + struct sctp_stream_out_ext *ext; }; struct sctp_stream_in { diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index 3afac275ee82..7b261afc47b9 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -311,10 +311,10 @@ int sctp_chunk_abandoned(struct sctp_chunk *chunk) if (chunk->sent_count) { chunk->asoc->abandoned_sent[SCTP_PR_INDEX(TTL)]++; - streamout->abandoned_sent[SCTP_PR_INDEX(TTL)]++; + streamout->ext->abandoned_sent[SCTP_PR_INDEX(TTL)]++; } else { chunk->asoc->abandoned_unsent[SCTP_PR_INDEX(TTL)]++; - streamout->abandoned_unsent[SCTP_PR_INDEX(TTL)]++; + streamout->ext->abandoned_unsent[SCTP_PR_INDEX(TTL)]++; } return 1; } else if (SCTP_PR_RTX_ENABLED(chunk->sinfo.sinfo_flags) && @@ -323,7 +323,7 @@ int sctp_chunk_abandoned(struct sctp_chunk *chunk) &chunk->asoc->stream.out[chunk->sinfo.sinfo_stream]; chunk->asoc->abandoned_sent[SCTP_PR_INDEX(RTX)]++; - streamout->abandoned_sent[SCTP_PR_INDEX(RTX)]++; + streamout->ext->abandoned_sent[SCTP_PR_INDEX(RTX)]++; return 1; } else if (!SCTP_PR_POLICY(chunk->sinfo.sinfo_flags) && chunk->msg->expires_at && diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index 2966ff400755..746b07b7937d 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -366,7 +366,7 @@ static int sctp_prsctp_prune_sent(struct sctp_association *asoc, streamout = &asoc->stream.out[chk->sinfo.sinfo_stream]; asoc->sent_cnt_removable--; asoc->abandoned_sent[SCTP_PR_INDEX(PRIO)]++; - streamout->abandoned_sent[SCTP_PR_INDEX(PRIO)]++; + streamout->ext->abandoned_sent[SCTP_PR_INDEX(PRIO)]++; if (!chk->tsn_gap_acked) { if (chk->transport) @@ -404,7 +404,7 @@ static int sctp_prsctp_prune_unsent(struct sctp_association *asoc, struct sctp_stream_out *streamout = &asoc->stream.out[chk->sinfo.sinfo_stream]; - streamout->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++; + streamout->ext->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++; } msg_len -= SCTP_DATA_SNDSIZE(chk) + diff --git a/net/sctp/socket.c b/net/sctp/socket.c index d4730ada7f32..d207734326b0 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1927,6 +1927,13 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) goto out_free; } + /* Allocate sctp_stream_out_ext if not already done */ + if 
(unlikely(!asoc->stream.out[sinfo->sinfo_stream].ext)) { + err = sctp_stream_init_ext(&asoc->stream, sinfo->sinfo_stream); + if (err) + goto out_free; + } + if (sctp_wspace(asoc) < msg_len) sctp_prsctp_prune(asoc, sinfo, msg_len - sctp_wspace(asoc)); @@ -6645,7 +6652,7 @@ static int sctp_getsockopt_pr_streamstatus(struct sock *sk, int len, char __user *optval, int __user *optlen) { - struct sctp_stream_out *streamout; + struct sctp_stream_out_ext *streamoute; struct sctp_association *asoc; struct sctp_prstatus params; int retval = -EINVAL; @@ -6668,21 +6675,29 @@ static int sctp_getsockopt_pr_streamstatus(struct sock *sk, int len, if (!asoc || params.sprstat_sid >= asoc->stream.outcnt) goto out; - streamout = &asoc->stream.out[params.sprstat_sid]; + streamoute = asoc->stream.out[params.sprstat_sid].ext; + if (!streamoute) { + /* Not allocated yet, means all stats are 0 */ + params.sprstat_abandoned_unsent = 0; + params.sprstat_abandoned_sent = 0; + retval = 0; + goto out; + } + if (policy == SCTP_PR_SCTP_NONE) { params.sprstat_abandoned_unsent = 0; params.sprstat_abandoned_sent = 0; for (policy = 0; policy <= SCTP_PR_INDEX(MAX); policy++) { params.sprstat_abandoned_unsent += - streamout->abandoned_unsent[policy]; + streamoute->abandoned_unsent[policy]; params.sprstat_abandoned_sent += - streamout->abandoned_sent[policy]; + streamoute->abandoned_sent[policy]; } } else { params.sprstat_abandoned_unsent = - streamout->abandoned_unsent[__SCTP_PR_INDEX(policy)]; + streamoute->abandoned_unsent[__SCTP_PR_INDEX(policy)]; params.sprstat_abandoned_sent = - streamout->abandoned_sent[__SCTP_PR_INDEX(policy)]; + streamoute->abandoned_sent[__SCTP_PR_INDEX(policy)]; } if (put_user(len, optlen) || copy_to_user(optval, &params, len)) { diff --git a/net/sctp/stream.c b/net/sctp/stream.c index 952437d656cc..055ca25bbc91 100644 --- a/net/sctp/stream.c +++ b/net/sctp/stream.c @@ -121,8 +121,24 @@ in: return 0; } +int sctp_stream_init_ext(struct sctp_stream *stream, __u16 sid) +{ + struct sctp_stream_out_ext *soute; + + soute = kzalloc(sizeof(*soute), GFP_KERNEL); + if (!soute) + return -ENOMEM; + stream->out[sid].ext = soute; + + return 0; +} + void sctp_stream_free(struct sctp_stream *stream) { + int i; + + for (i = 0; i < stream->outcnt; i++) + kfree(stream->out[i].ext); kfree(stream->out); kfree(stream->in); } -- cgit v1.2.3 From 2fc019f790312e703efa1a44204c586112a430dc Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Tue, 3 Oct 2017 19:20:12 -0300 Subject: sctp: introduce sctp_chunk_stream_no Add a helper to fetch the stream number from a given chunk. Tested-by: Xin Long Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S.
Miller --- include/net/sctp/structs.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 9b2b30b3ba4d..c48f7999fe9b 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -642,6 +642,11 @@ void sctp_init_addrs(struct sctp_chunk *, union sctp_addr *, union sctp_addr *); const union sctp_addr *sctp_source(const struct sctp_chunk *chunk); +static inline __u16 sctp_chunk_stream_no(struct sctp_chunk *ch) +{ + return ntohs(ch->subh.data_hdr->stream); +} + enum { SCTP_ADDR_NEW, /* new address added to assoc/ep */ SCTP_ADDR_SRC, /* address can be used as source */ -- cgit v1.2.3 From 5bbbbe32a43199c2b9ea5ea66fab6241c64beb51 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Tue, 3 Oct 2017 19:20:13 -0300 Subject: sctp: introduce stream scheduler foundations This patch introduces the hooks necessary to do stream scheduling, as per RFC Draft ndata. It also introduces the first scheduler, which is what we do today but now factored out: first come first served (FCFS). With stream scheduling we now have to track which chunk was enqueued on which stream and be able to select a chunk other than the one at the front of the main outqueue. So we introduce a list on the sctp_stream_out_ext structure for this purpose. We reuse sctp_chunk->transmitted_list space for the list above, as the chunk cannot belong to the two lists at the same time. By using the union in there, we can have distinct names for these moments. sctp_sched_ops are the operations expected to be implemented by each scheduler. The dequeueing is a bit particular to this implementation, but it matches how we dequeue packets today: we first dequeue and then check if it fits the packet; if not, we requeue it at the head. That is why we don't have a peek operation but have dequeue_done instead, which is called once the chunk can be safely considered as transmitted. The check removed from sctp_outq_flush is now performed by sctp_stream_outq_migrate, which is only called during assoc setup. (sctp_sendmsg() also checks for it) The only operation that is foreseen but not yet added here is a way to signal that a new packet is starting or that the packet is done, for a round robin per-packet scheduler; it is intentionally left to the patch that actually implements it. Support for I-DATA chunks, also described in this RFC, with user message interleaving is straightforward as it just requires the schedulers to probe for the feature and ignore datamsg boundaries when dequeueing. See-also: https://tools.ietf.org/html/draft-ietf-tsvwg-sctp-ndata-13 Tested-by: Xin Long Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/stream_sched.h | 72 +++++++++++ include/net/sctp/structs.h | 15 ++- include/uapi/linux/sctp.h | 6 + net/sctp/Makefile | 2 +- net/sctp/outqueue.c | 59 +++++---- net/sctp/sm_sideeffect.c | 3 + net/sctp/stream.c | 88 +++++++++++-- net/sctp/stream_sched.c | 270 ++++++++++++++++++++++++++++++++++++++++ 8 files changed, 477 insertions(+), 38 deletions(-) create mode 100644 include/net/sctp/stream_sched.h create mode 100644 net/sctp/stream_sched.c (limited to 'include') diff --git a/include/net/sctp/stream_sched.h b/include/net/sctp/stream_sched.h new file mode 100644 index 000000000000..c676550a4c7d --- /dev/null +++ b/include/net/sctp/stream_sched.h @@ -0,0 +1,72 @@ +/* SCTP kernel implementation + * (C) Copyright Red Hat Inc.
2017 + * + * These are definitions used by the stream schedulers, defined in RFC + * draft ndata (https://tools.ietf.org/html/draft-ietf-tsvwg-sctp-ndata-11) + * + * This SCTP implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This SCTP implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, see + * . + * + * Please send any bug reports or fixes you make to the + * email addresses: + * lksctp developers + * + * Written or modified by: + * Marcelo Ricardo Leitner + */ + +#ifndef __sctp_stream_sched_h__ +#define __sctp_stream_sched_h__ + +struct sctp_sched_ops { + /* Property handling for a given stream */ + int (*set)(struct sctp_stream *stream, __u16 sid, __u16 value, + gfp_t gfp); + int (*get)(struct sctp_stream *stream, __u16 sid, __u16 *value); + + /* Init the specific scheduler */ + int (*init)(struct sctp_stream *stream); + /* Init a stream */ + int (*init_sid)(struct sctp_stream *stream, __u16 sid, gfp_t gfp); + /* Frees the entire thing */ + void (*free)(struct sctp_stream *stream); + + /* Enqueue a chunk */ + void (*enqueue)(struct sctp_outq *q, struct sctp_datamsg *msg); + /* Dequeue a chunk */ + struct sctp_chunk *(*dequeue)(struct sctp_outq *q); + /* Called only if the chunk fit the packet */ + void (*dequeue_done)(struct sctp_outq *q, struct sctp_chunk *chunk); + /* Sched all chunks already enqueued */ + void (*sched_all)(struct sctp_stream *steam); + /* Unched all chunks already enqueued */ + void (*unsched_all)(struct sctp_stream *steam); +}; + +int sctp_sched_set_sched(struct sctp_association *asoc, + enum sctp_sched_type sched); +int sctp_sched_get_sched(struct sctp_association *asoc); +int sctp_sched_set_value(struct sctp_association *asoc, __u16 sid, + __u16 value, gfp_t gfp); +int sctp_sched_get_value(struct sctp_association *asoc, __u16 sid, + __u16 *value); +void sctp_sched_dequeue_done(struct sctp_outq *q, struct sctp_chunk *ch); + +void sctp_sched_dequeue_common(struct sctp_outq *q, struct sctp_chunk *ch); +int sctp_sched_init_sid(struct sctp_stream *stream, __u16 sid, gfp_t gfp); +struct sctp_sched_ops *sctp_sched_ops_from_stream(struct sctp_stream *stream); + +#endif /* __sctp_stream_sched_h__ */ diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index c48f7999fe9b..3c22a30fd71b 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -84,7 +84,6 @@ struct sctp_ulpq; struct sctp_ep_common; struct crypto_shash; struct sctp_stream; -struct sctp_stream_out; #include @@ -531,8 +530,12 @@ struct sctp_chunk { /* How many times this chunk have been sent, for prsctp RTX policy */ int sent_count; - /* This is our link to the per-transport transmitted list. */ - struct list_head transmitted_list; + union { + /* This is our link to the per-transport transmitted list. */ + struct list_head transmitted_list; + /* List in specific stream outq */ + struct list_head stream_list; + }; /* This field is used by chunks that hold fragmented data. 
* For the first fragment this is the list that holds the rest of @@ -1019,6 +1022,9 @@ struct sctp_outq { /* Data pending that has never been transmitted. */ struct list_head out_chunk_list; + /* Stream scheduler being used */ + struct sctp_sched_ops *sched; + unsigned int out_qlen; /* Total length of queued data chunks. */ /* Error of send failed, may used in SCTP_SEND_FAILED event. */ @@ -1325,6 +1331,7 @@ struct sctp_inithdr_host { struct sctp_stream_out_ext { __u64 abandoned_unsent[SCTP_PR_INDEX(MAX) + 1]; __u64 abandoned_sent[SCTP_PR_INDEX(MAX) + 1]; + struct list_head outq; /* chunks enqueued by this stream */ }; struct sctp_stream_out { @@ -1342,6 +1349,8 @@ struct sctp_stream { struct sctp_stream_in *in; __u16 outcnt; __u16 incnt; + /* Current stream being sent, if any */ + struct sctp_stream_out *out_curr; }; #define SCTP_STREAM_CLOSED 0x00 diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index 6217ff8500a1..4487e7625ddb 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -1088,4 +1088,10 @@ struct sctp_add_streams { uint16_t sas_outstrms; }; +/* SCTP Stream schedulers */ +enum sctp_sched_type { + SCTP_SS_FCFS, + SCTP_SS_MAX = SCTP_SS_FCFS +}; + #endif /* _UAPI_SCTP_H */ diff --git a/net/sctp/Makefile b/net/sctp/Makefile index 70f1b570bab9..0f6e6d1d69fd 100644 --- a/net/sctp/Makefile +++ b/net/sctp/Makefile @@ -12,7 +12,7 @@ sctp-y := sm_statetable.o sm_statefuns.o sm_sideeffect.o \ inqueue.o outqueue.o ulpqueue.o \ tsnmap.o bind_addr.o socket.o primitive.o \ output.o input.o debug.o stream.o auth.o \ - offload.o + offload.o stream_sched.o sctp_probe-y := probe.o diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index 746b07b7937d..4db012aa25f7 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -50,6 +50,7 @@ #include #include +#include /* Declare internal functions here. */ static int sctp_acked(struct sctp_sackhdr *sack, __u32 tsn); @@ -72,32 +73,38 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp); /* Add data to the front of the queue. */ static inline void sctp_outq_head_data(struct sctp_outq *q, - struct sctp_chunk *ch) + struct sctp_chunk *ch) { + struct sctp_stream_out_ext *oute; + __u16 stream; + list_add(&ch->list, &q->out_chunk_list); q->out_qlen += ch->skb->len; + + stream = sctp_chunk_stream_no(ch); + oute = q->asoc->stream.out[stream].ext; + list_add(&ch->stream_list, &oute->outq); } /* Take data from the front of the queue. */ static inline struct sctp_chunk *sctp_outq_dequeue_data(struct sctp_outq *q) { - struct sctp_chunk *ch = NULL; - - if (!list_empty(&q->out_chunk_list)) { - struct list_head *entry = q->out_chunk_list.next; - - ch = list_entry(entry, struct sctp_chunk, list); - list_del_init(entry); - q->out_qlen -= ch->skb->len; - } - return ch; + return q->sched->dequeue(q); } + /* Add data chunk to the end of the queue. */ static inline void sctp_outq_tail_data(struct sctp_outq *q, struct sctp_chunk *ch) { + struct sctp_stream_out_ext *oute; + __u16 stream; + list_add_tail(&ch->list, &q->out_chunk_list); q->out_qlen += ch->skb->len; + + stream = sctp_chunk_stream_no(ch); + oute = q->asoc->stream.out[stream].ext; + list_add_tail(&ch->stream_list, &oute->outq); } /* @@ -207,6 +214,7 @@ void sctp_outq_init(struct sctp_association *asoc, struct sctp_outq *q) INIT_LIST_HEAD(&q->retransmit); INIT_LIST_HEAD(&q->sacked); INIT_LIST_HEAD(&q->abandoned); + sctp_sched_set_sched(asoc, SCTP_SS_FCFS); } /* Free the outqueue structure and any related pending chunks. 
@@ -258,6 +266,7 @@ static void __sctp_outq_teardown(struct sctp_outq *q) /* Throw away any leftover data chunks. */ while ((chunk = sctp_outq_dequeue_data(q)) != NULL) { + sctp_sched_dequeue_done(q, chunk); /* Mark as send failure. */ sctp_chunk_fail(chunk, q->error); @@ -391,13 +400,14 @@ static int sctp_prsctp_prune_unsent(struct sctp_association *asoc, struct sctp_outq *q = &asoc->outqueue; struct sctp_chunk *chk, *temp; + q->sched->unsched_all(&asoc->stream); + list_for_each_entry_safe(chk, temp, &q->out_chunk_list, list) { if (!SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags) || chk->sinfo.sinfo_timetolive <= sinfo->sinfo_timetolive) continue; - list_del_init(&chk->list); - q->out_qlen -= chk->skb->len; + sctp_sched_dequeue_common(q, chk); asoc->sent_cnt_removable--; asoc->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++; if (chk->sinfo.sinfo_stream < asoc->stream.outcnt) { @@ -415,6 +425,8 @@ static int sctp_prsctp_prune_unsent(struct sctp_association *asoc, break; } + q->sched->sched_all(&asoc->stream); + return msg_len; } @@ -1033,22 +1045,9 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp) while ((chunk = sctp_outq_dequeue_data(q)) != NULL) { __u32 sid = ntohs(chunk->subh.data_hdr->stream); - /* RFC 2960 6.5 Every DATA chunk MUST carry a valid - * stream identifier. - */ - if (chunk->sinfo.sinfo_stream >= asoc->stream.outcnt) { - - /* Mark as failed send. */ - sctp_chunk_fail(chunk, SCTP_ERROR_INV_STRM); - if (asoc->peer.prsctp_capable && - SCTP_PR_PRIO_ENABLED(chunk->sinfo.sinfo_flags)) - asoc->sent_cnt_removable--; - sctp_chunk_free(chunk); - continue; - } - /* Has this chunk expired? */ if (sctp_chunk_abandoned(chunk)) { + sctp_sched_dequeue_done(q, chunk); sctp_chunk_fail(chunk, 0); sctp_chunk_free(chunk); continue; @@ -1070,6 +1069,7 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp) new_transport = asoc->peer.active_path; if (new_transport->state == SCTP_UNCONFIRMED) { WARN_ONCE(1, "Attempt to send packet on unconfirmed path."); + sctp_sched_dequeue_done(q, chunk); sctp_chunk_fail(chunk, 0); sctp_chunk_free(chunk); continue; @@ -1133,6 +1133,11 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp) else asoc->stats.oodchunks++; + /* Only now it's safe to consider this + * chunk as sent, sched-wise. + */ + sctp_sched_dequeue_done(q, chunk); + break; default: diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index e6a2974e020e..402bfbb888cd 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -50,6 +50,7 @@ #include #include #include +#include static int sctp_cmd_interpreter(enum sctp_event event_type, union sctp_subtype subtype, @@ -1089,6 +1090,8 @@ static void sctp_cmd_send_msg(struct sctp_association *asoc, list_for_each_entry(chunk, &msg->chunks, frag_list) sctp_outq_tail(&asoc->outqueue, chunk, gfp); + + asoc->outqueue.sched->enqueue(&asoc->outqueue, msg); } diff --git a/net/sctp/stream.c b/net/sctp/stream.c index 055ca25bbc91..5ea33a2c453b 100644 --- a/net/sctp/stream.c +++ b/net/sctp/stream.c @@ -32,8 +32,61 @@ * Xin Long */ +#include #include #include +#include + +/* Migrates chunks from stream queues to new stream queues if needed, + * but not across associations. Also, removes those chunks to streams + * higher than the new max. 
+ */ +static void sctp_stream_outq_migrate(struct sctp_stream *stream, + struct sctp_stream *new, __u16 outcnt) +{ + struct sctp_association *asoc; + struct sctp_chunk *ch, *temp; + struct sctp_outq *outq; + int i; + + asoc = container_of(stream, struct sctp_association, stream); + outq = &asoc->outqueue; + + list_for_each_entry_safe(ch, temp, &outq->out_chunk_list, list) { + __u16 sid = sctp_chunk_stream_no(ch); + + if (sid < outcnt) + continue; + + sctp_sched_dequeue_common(outq, ch); + /* No need to call dequeue_done here because + * the chunks are not scheduled by now. + */ + + /* Mark as failed send. */ + sctp_chunk_fail(ch, SCTP_ERROR_INV_STRM); + if (asoc->peer.prsctp_capable && + SCTP_PR_PRIO_ENABLED(ch->sinfo.sinfo_flags)) + asoc->sent_cnt_removable--; + + sctp_chunk_free(ch); + } + + if (new) { + /* Here we actually move the old ext stuff into the new + * buffer, because we want to keep it. Then + * sctp_stream_update will swap ->out pointers. + */ + for (i = 0; i < outcnt; i++) { + kfree(new->out[i].ext); + new->out[i].ext = stream->out[i].ext; + stream->out[i].ext = NULL; + } + } + + for (i = outcnt; i < stream->outcnt; i++) + kfree(stream->out[i].ext); +} static int sctp_stream_alloc_out(struct sctp_stream *stream, __u16 outcnt, gfp_t gfp) @@ -87,7 +140,8 @@ static int sctp_stream_alloc_in(struct sctp_stream *stream, __u16 incnt, int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt, gfp_t gfp) { - int i; + struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); + int i, ret = 0; gfp |= __GFP_NOWARN; @@ -97,6 +151,11 @@ int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt, if (outcnt == stream->outcnt) goto in; + /* Filter out chunks queued on streams that won't exist anymore */ + sched->unsched_all(stream); + sctp_stream_outq_migrate(stream, NULL, outcnt); + sched->sched_all(stream); + i = sctp_stream_alloc_out(stream, outcnt, gfp); if (i) return i; @@ -105,20 +164,27 @@ int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt, for (i = 0; i < stream->outcnt; i++) stream->out[i].state = SCTP_STREAM_OPEN; + sched->init(stream); + in: if (!incnt) - return 0; + goto out; i = sctp_stream_alloc_in(stream, incnt, gfp); if (i) { - kfree(stream->out); - stream->out = NULL; - return -ENOMEM; + ret = -ENOMEM; + goto free; } stream->incnt = incnt; + goto out; - return 0; +free: + sched->free(stream); + kfree(stream->out); + stream->out = NULL; +out: + return ret; } int sctp_stream_init_ext(struct sctp_stream *stream, __u16 sid) @@ -130,13 +196,15 @@ int sctp_stream_init_ext(struct sctp_stream *stream, __u16 sid) return -ENOMEM; stream->out[sid].ext = soute; - return 0; + return sctp_sched_init_sid(stream, sid, GFP_KERNEL); } void sctp_stream_free(struct sctp_stream *stream) { + struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); int i; + sched->free(stream); for (i = 0; i < stream->outcnt; i++) kfree(stream->out[i].ext); kfree(stream->out); @@ -156,6 +224,10 @@ void sctp_stream_clear(struct sctp_stream *stream) void sctp_stream_update(struct sctp_stream *stream, struct sctp_stream *new) { + struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); + + sched->unsched_all(stream); + sctp_stream_outq_migrate(stream, new, new->outcnt); sctp_stream_free(stream); stream->out = new->out; @@ -163,6 +235,8 @@ void sctp_stream_update(struct sctp_stream *stream, struct sctp_stream *new) stream->outcnt = new->outcnt; stream->incnt = new->incnt; + sched->sched_all(stream); + new->out = NULL; new->in = 
NULL; } diff --git a/net/sctp/stream_sched.c b/net/sctp/stream_sched.c new file mode 100644 index 000000000000..40a9a9de2b98 --- /dev/null +++ b/net/sctp/stream_sched.c @@ -0,0 +1,270 @@ +/* SCTP kernel implementation + * (C) Copyright Red Hat Inc. 2017 + * + * This file is part of the SCTP kernel implementation + * + * These functions manipulate sctp stream queue/scheduling. + * + * This SCTP implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This SCTP implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, see + * . + * + * Please send any bug reports or fixes you make to the + * email addresched(es): + * lksctp developers + * + * Written or modified by: + * Marcelo Ricardo Leitner + */ + +#include +#include +#include +#include + +/* First Come First Serve (a.k.a. FIFO) + * RFC DRAFT ndata Section 3.1 + */ +static int sctp_sched_fcfs_set(struct sctp_stream *stream, __u16 sid, + __u16 value, gfp_t gfp) +{ + return 0; +} + +static int sctp_sched_fcfs_get(struct sctp_stream *stream, __u16 sid, + __u16 *value) +{ + *value = 0; + return 0; +} + +static int sctp_sched_fcfs_init(struct sctp_stream *stream) +{ + return 0; +} + +static int sctp_sched_fcfs_init_sid(struct sctp_stream *stream, __u16 sid, + gfp_t gfp) +{ + return 0; +} + +static void sctp_sched_fcfs_free(struct sctp_stream *stream) +{ +} + +static void sctp_sched_fcfs_enqueue(struct sctp_outq *q, + struct sctp_datamsg *msg) +{ +} + +static struct sctp_chunk *sctp_sched_fcfs_dequeue(struct sctp_outq *q) +{ + struct sctp_stream *stream = &q->asoc->stream; + struct sctp_chunk *ch = NULL; + struct list_head *entry; + + if (list_empty(&q->out_chunk_list)) + goto out; + + if (stream->out_curr) { + ch = list_entry(stream->out_curr->ext->outq.next, + struct sctp_chunk, stream_list); + } else { + entry = q->out_chunk_list.next; + ch = list_entry(entry, struct sctp_chunk, list); + } + + sctp_sched_dequeue_common(q, ch); + +out: + return ch; +} + +static void sctp_sched_fcfs_dequeue_done(struct sctp_outq *q, + struct sctp_chunk *chunk) +{ +} + +static void sctp_sched_fcfs_sched_all(struct sctp_stream *stream) +{ +} + +static void sctp_sched_fcfs_unsched_all(struct sctp_stream *stream) +{ +} + +static struct sctp_sched_ops sctp_sched_fcfs = { + .set = sctp_sched_fcfs_set, + .get = sctp_sched_fcfs_get, + .init = sctp_sched_fcfs_init, + .init_sid = sctp_sched_fcfs_init_sid, + .free = sctp_sched_fcfs_free, + .enqueue = sctp_sched_fcfs_enqueue, + .dequeue = sctp_sched_fcfs_dequeue, + .dequeue_done = sctp_sched_fcfs_dequeue_done, + .sched_all = sctp_sched_fcfs_sched_all, + .unsched_all = sctp_sched_fcfs_unsched_all, +}; + +/* API to other parts of the stack */ + +struct sctp_sched_ops *sctp_sched_ops[] = { + &sctp_sched_fcfs, +}; + +int sctp_sched_set_sched(struct sctp_association *asoc, + enum sctp_sched_type sched) +{ + struct sctp_sched_ops *n = sctp_sched_ops[sched]; + struct sctp_sched_ops *old = asoc->outqueue.sched; + struct sctp_datamsg *msg = NULL; + struct sctp_chunk *ch; + int i, ret = 0; + + if (old 
== n) + return ret; + + if (sched > SCTP_SS_MAX) + return -EINVAL; + + if (old) { + old->free(&asoc->stream); + + /* Give the next scheduler a clean slate. */ + for (i = 0; i < asoc->stream.outcnt; i++) { + void *p = asoc->stream.out[i].ext; + + if (!p) + continue; + + p += offsetofend(struct sctp_stream_out_ext, outq); + memset(p, 0, sizeof(struct sctp_stream_out_ext) - + offsetofend(struct sctp_stream_out_ext, outq)); + } + } + + asoc->outqueue.sched = n; + n->init(&asoc->stream); + for (i = 0; i < asoc->stream.outcnt; i++) { + if (!asoc->stream.out[i].ext) + continue; + + ret = n->init_sid(&asoc->stream, i, GFP_KERNEL); + if (ret) + goto err; + } + + /* We have to requeue all chunks already queued. */ + list_for_each_entry(ch, &asoc->outqueue.out_chunk_list, list) { + if (ch->msg == msg) + continue; + msg = ch->msg; + n->enqueue(&asoc->outqueue, msg); + } + + return ret; + +err: + n->free(&asoc->stream); + asoc->outqueue.sched = &sctp_sched_fcfs; /* Always safe */ + + return ret; +} + +int sctp_sched_get_sched(struct sctp_association *asoc) +{ + int i; + + for (i = 0; i <= SCTP_SS_MAX; i++) + if (asoc->outqueue.sched == sctp_sched_ops[i]) + return i; + + return 0; +} + +int sctp_sched_set_value(struct sctp_association *asoc, __u16 sid, + __u16 value, gfp_t gfp) +{ + if (sid >= asoc->stream.outcnt) + return -EINVAL; + + if (!asoc->stream.out[sid].ext) { + int ret; + + ret = sctp_stream_init_ext(&asoc->stream, sid); + if (ret) + return ret; + } + + return asoc->outqueue.sched->set(&asoc->stream, sid, value, gfp); +} + +int sctp_sched_get_value(struct sctp_association *asoc, __u16 sid, + __u16 *value) +{ + if (sid >= asoc->stream.outcnt) + return -EINVAL; + + if (!asoc->stream.out[sid].ext) + return 0; + + return asoc->outqueue.sched->get(&asoc->stream, sid, value); +} + +void sctp_sched_dequeue_done(struct sctp_outq *q, struct sctp_chunk *ch) +{ + if (!list_is_last(&ch->frag_list, &ch->msg->chunks)) { + struct sctp_stream_out *sout; + __u16 sid; + + /* datamsg is not finish, so save it as current one, + * in case application switch scheduler or a higher + * priority stream comes in. + */ + sid = sctp_chunk_stream_no(ch); + sout = &q->asoc->stream.out[sid]; + q->asoc->stream.out_curr = sout; + return; + } + + q->asoc->stream.out_curr = NULL; + q->sched->dequeue_done(q, ch); +} + +/* Auxiliary functions for the schedulers */ +void sctp_sched_dequeue_common(struct sctp_outq *q, struct sctp_chunk *ch) +{ + list_del_init(&ch->list); + list_del_init(&ch->stream_list); + q->out_qlen -= ch->skb->len; +} + +int sctp_sched_init_sid(struct sctp_stream *stream, __u16 sid, gfp_t gfp) +{ + struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); + + INIT_LIST_HEAD(&stream->out[sid].ext->outq); + return sched->init_sid(stream, sid, gfp); +} + +struct sctp_sched_ops *sctp_sched_ops_from_stream(struct sctp_stream *stream) +{ + struct sctp_association *asoc; + + asoc = container_of(stream, struct sctp_association, stream); + + return asoc->outqueue.sched; +} -- cgit v1.2.3 From 13aa8770fe42d246c6f3a8eb814b85bccb428011 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Tue, 3 Oct 2017 19:20:14 -0300 Subject: sctp: add sockopt to get/set stream scheduler As defined per RFC Draft ndata Section 4.3.2, named as SCTP_STREAM_SCHEDULER. See-also: https://tools.ietf.org/html/draft-ietf-tsvwg-sctp-ndata-13 Tested-by: Xin Long Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. 
Miller --- include/uapi/linux/sctp.h | 1 + net/sctp/socket.c | 75 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 76 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index 4487e7625ddb..0050f10087d2 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -122,6 +122,7 @@ typedef __s32 sctp_assoc_t; #define SCTP_RESET_ASSOC 120 #define SCTP_ADD_STREAMS 121 #define SCTP_SOCKOPT_PEELOFF_FLAGS 122 +#define SCTP_STREAM_SCHEDULER 123 /* PR-SCTP policies */ #define SCTP_PR_SCTP_NONE 0x0000 diff --git a/net/sctp/socket.c b/net/sctp/socket.c index d207734326b0..ae35dbf2810f 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -79,6 +79,7 @@ #include #include #include +#include <net/sctp/stream_sched.h> /* Forward declarations for internal helper functions. */ static int sctp_writeable(struct sock *sk); @@ -3914,6 +3915,36 @@ out: return retval; } +static int sctp_setsockopt_scheduler(struct sock *sk, + char __user *optval, + unsigned int optlen) +{ + struct sctp_association *asoc; + struct sctp_assoc_value params; + int retval = -EINVAL; + + if (optlen < sizeof(params)) + goto out; + + optlen = sizeof(params); + if (copy_from_user(&params, optval, optlen)) { + retval = -EFAULT; + goto out; + } + + if (params.assoc_value > SCTP_SS_MAX) + goto out; + + asoc = sctp_id2assoc(sk, params.assoc_id); + if (!asoc) + goto out; + + retval = sctp_sched_set_sched(asoc, params.assoc_value); + +out: + return retval; +} + /* API 6.2 setsockopt(), getsockopt() * * Applications use setsockopt() and getsockopt() to set or retrieve @@ -4095,6 +4126,9 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname, case SCTP_ADD_STREAMS: retval = sctp_setsockopt_add_streams(sk, optval, optlen); break; + case SCTP_STREAM_SCHEDULER: + retval = sctp_setsockopt_scheduler(sk, optval, optlen); + break; default: retval = -ENOPROTOOPT; break; @@ -6793,6 +6827,43 @@ out: return retval; } +static int sctp_getsockopt_scheduler(struct sock *sk, int len, + char __user *optval, + int __user *optlen) +{ + struct sctp_assoc_value params; + struct sctp_association *asoc; + int retval = -EFAULT; + + if (len < sizeof(params)) { + retval = -EINVAL; + goto out; + } + + len = sizeof(params); + if (copy_from_user(&params, optval, len)) + goto out; + + asoc = sctp_id2assoc(sk, params.assoc_id); + if (!asoc) { + retval = -EINVAL; + goto out; + } + + params.assoc_value = sctp_sched_get_sched(asoc); + + if (put_user(len, optlen)) + goto out; + + if (copy_to_user(optval, &params, len)) + goto out; + + retval = 0; + +out: + return retval; +} + static int sctp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { @@ -6975,6 +7046,10 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname, retval = sctp_getsockopt_enable_strreset(sk, len, optval, optlen); break; + case SCTP_STREAM_SCHEDULER: + retval = sctp_getsockopt_scheduler(sk, len, optval, + optlen); + break; default: retval = -ENOPROTOOPT; break; -- cgit v1.2.3 From 0ccdf3c7fdeda511b10def19505178a9d2d3fccd Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Tue, 3 Oct 2017 19:20:15 -0300 Subject: sctp: add sockopt to get/set stream scheduler parameters As defined per RFC Draft ndata Section 4.3.3, named as SCTP_STREAM_SCHEDULER_VALUE. See-also: https://tools.ietf.org/html/draft-ietf-tsvwg-sctp-ndata-13 Tested-by: Xin Long Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S.
Miller --- include/uapi/linux/sctp.h | 7 +++++ net/sctp/socket.c | 77 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 84 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index 0050f10087d2..00ac417d2c4f 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -123,6 +123,7 @@ typedef __s32 sctp_assoc_t; #define SCTP_ADD_STREAMS 121 #define SCTP_SOCKOPT_PEELOFF_FLAGS 122 #define SCTP_STREAM_SCHEDULER 123 +#define SCTP_STREAM_SCHEDULER_VALUE 124 /* PR-SCTP policies */ #define SCTP_PR_SCTP_NONE 0x0000 @@ -815,6 +816,12 @@ struct sctp_assoc_value { uint32_t assoc_value; }; +struct sctp_stream_value { + sctp_assoc_t assoc_id; + uint16_t stream_id; + uint16_t stream_value; +}; + /* * 7.2.2 Peer Address Information * diff --git a/net/sctp/socket.c b/net/sctp/socket.c index ae35dbf2810f..88c28421ec15 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -3945,6 +3945,34 @@ out: return retval; } +static int sctp_setsockopt_scheduler_value(struct sock *sk, + char __user *optval, + unsigned int optlen) +{ + struct sctp_association *asoc; + struct sctp_stream_value params; + int retval = -EINVAL; + + if (optlen < sizeof(params)) + goto out; + + optlen = sizeof(params); + if (copy_from_user(&params, optval, optlen)) { + retval = -EFAULT; + goto out; + } + + asoc = sctp_id2assoc(sk, params.assoc_id); + if (!asoc) + goto out; + + retval = sctp_sched_set_value(asoc, params.stream_id, + params.stream_value, GFP_KERNEL); + +out: + return retval; +} + /* API 6.2 setsockopt(), getsockopt() * * Applications use setsockopt() and getsockopt() to set or retrieve @@ -4129,6 +4157,9 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname, case SCTP_STREAM_SCHEDULER: retval = sctp_setsockopt_scheduler(sk, optval, optlen); break; + case SCTP_STREAM_SCHEDULER_VALUE: + retval = sctp_setsockopt_scheduler_value(sk, optval, optlen); + break; default: retval = -ENOPROTOOPT; break; @@ -6864,6 +6895,48 @@ out: return retval; } +static int sctp_getsockopt_scheduler_value(struct sock *sk, int len, + char __user *optval, + int __user *optlen) +{ + struct sctp_stream_value params; + struct sctp_association *asoc; + int retval = -EFAULT; + + if (len < sizeof(params)) { + retval = -EINVAL; + goto out; + } + + len = sizeof(params); + if (copy_from_user(&params, optval, len)) + goto out; + + asoc = sctp_id2assoc(sk, params.assoc_id); + if (!asoc) { + retval = -EINVAL; + goto out; + } + + retval = sctp_sched_get_value(asoc, params.stream_id, + &params.stream_value); + if (retval) + goto out; + + if (put_user(len, optlen)) { + retval = -EFAULT; + goto out; + } + + if (copy_to_user(optval, &params, len)) { + retval = -EFAULT; + goto out; + } + +out: + return retval; +} + static int sctp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { @@ -7050,6 +7123,10 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname, retval = sctp_getsockopt_scheduler(sk, len, optval, optlen); break; + case SCTP_STREAM_SCHEDULER_VALUE: + retval = sctp_getsockopt_scheduler_value(sk, len, optval, + optlen); + break; default: retval = -ENOPROTOOPT; break; -- cgit v1.2.3 From 637784ade221a3c8a7ecd0f583eddd95d6276b9a Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Tue, 3 Oct 2017 19:20:16 -0300 Subject: sctp: introduce priority based stream scheduler This patch introduces RFC Draft ndata section 3.4 Priority Based Scheduler (SCTP_SS_PRIO).
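As a rough usage sketch, an application could drive the two sockopts added by the previous patches as below. The helper name and the connected one-to-one socket fd are assumptions for illustration; the constants and structs are the ones this series adds to the uapi. In this implementation the kernel keeps the priority list sorted and dequeues from its head, so numerically lower values are served first and 0 (the default set by init_sid) is the most urgent:

#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/sctp.h>	/* assumes a uapi exposing SCTP_STREAM_SCHEDULER* */

/* Hypothetical helper: switch an association to the priority
 * scheduler and demote stream 1 below the others.
 */
static int use_prio_sched(int fd, sctp_assoc_t assoc_id)
{
	struct sctp_assoc_value av = {
		.assoc_id    = assoc_id,
		.assoc_value = SCTP_SS_PRIO,
	};
	struct sctp_stream_value sv = {
		.assoc_id     = assoc_id,
		.stream_id    = 1,
		.stream_value = 10,	/* yields to streams left at 0 */
	};

	if (setsockopt(fd, IPPROTO_SCTP, SCTP_STREAM_SCHEDULER,
		       &av, sizeof(av)) < 0)
		return -1;

	return setsockopt(fd, IPPROTO_SCTP, SCTP_STREAM_SCHEDULER_VALUE,
			  &sv, sizeof(sv));
}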
It works by having a struct sctp_stream_priority for each priority configured. This struct is then enlisted on a queue ordered per priority if, and only if, there is a stream with data queued, so that dequeueing is very straightforward: either finish current datamsg or simply dequeue from the highest priority queued, which is the next stream pointed, and that's it. If there are multiple streams assigned with the same priority and with data queued, it will do round robin amongst them while respecting datamsgs boundaries (when not using idata chunks), to be reasonably fair. We intentionally don't maintain a list of priorities nor a list of all streams with the same priority to save memory. The first would mean at least 2 other pointers per priority (which, for 1000 priorities, that can mean 16kB) and the second would also mean 2 other pointers but per stream. As SCTP supports up to 65535 streams on a given asoc, that's 1MB. This impacts when giving a priority to some stream, as we have to find out if the new priority is already being used and if we can free the old one, and also when tearing down. The new fields in struct sctp_stream_out_ext and sctp_stream are added under a union because that memory is to be shared with other schedulers. See-also: https://tools.ietf.org/html/draft-ietf-tsvwg-sctp-ndata-13 Tested-by: Xin Long Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 24 +++ include/uapi/linux/sctp.h | 3 +- net/sctp/Makefile | 2 +- net/sctp/stream_sched.c | 3 + net/sctp/stream_sched_prio.c | 347 +++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 377 insertions(+), 2 deletions(-) create mode 100644 net/sctp/stream_sched_prio.c (limited to 'include') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 3c22a30fd71b..40eb8d66a37c 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -1328,10 +1328,27 @@ struct sctp_inithdr_host { __u32 initial_tsn; }; +struct sctp_stream_priorities { + /* List of priorities scheduled */ + struct list_head prio_sched; + /* List of streams scheduled */ + struct list_head active; + /* The next stream stream in line */ + struct sctp_stream_out_ext *next; + __u16 prio; +}; + struct sctp_stream_out_ext { __u64 abandoned_unsent[SCTP_PR_INDEX(MAX) + 1]; __u64 abandoned_sent[SCTP_PR_INDEX(MAX) + 1]; struct list_head outq; /* chunks enqueued by this stream */ + union { + struct { + /* Scheduled streams list */ + struct list_head prio_list; + struct sctp_stream_priorities *prio_head; + }; + }; }; struct sctp_stream_out { @@ -1351,6 +1368,13 @@ struct sctp_stream { __u16 incnt; /* Current stream being sent, if any */ struct sctp_stream_out *out_curr; + union { + /* Fields used by priority scheduler */ + struct { + /* List of priorities scheduled */ + struct list_head prio_list; + }; + }; }; #define SCTP_STREAM_CLOSED 0x00 diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index 00ac417d2c4f..850fa8b29d7e 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -1099,7 +1099,8 @@ struct sctp_add_streams { /* SCTP Stream schedulers */ enum sctp_sched_type { SCTP_SS_FCFS, - SCTP_SS_MAX = SCTP_SS_FCFS + SCTP_SS_PRIO, + SCTP_SS_MAX = SCTP_SS_PRIO }; #endif /* _UAPI_SCTP_H */ diff --git a/net/sctp/Makefile b/net/sctp/Makefile index 0f6e6d1d69fd..647c9cfd4e95 100644 --- a/net/sctp/Makefile +++ b/net/sctp/Makefile @@ -12,7 +12,7 @@ sctp-y := sm_statetable.o sm_statefuns.o sm_sideeffect.o \ inqueue.o outqueue.o ulpqueue.o \ tsnmap.o 
bind_addr.o socket.o primitive.o \ output.o input.o debug.o stream.o auth.o \ - offload.o stream_sched.o + offload.o stream_sched.o stream_sched_prio.o sctp_probe-y := probe.o diff --git a/net/sctp/stream_sched.c b/net/sctp/stream_sched.c index 40a9a9de2b98..115ddb765169 100644 --- a/net/sctp/stream_sched.c +++ b/net/sctp/stream_sched.c @@ -121,8 +121,11 @@ static struct sctp_sched_ops sctp_sched_fcfs = { /* API to other parts of the stack */ +extern struct sctp_sched_ops sctp_sched_prio; + struct sctp_sched_ops *sctp_sched_ops[] = { &sctp_sched_fcfs, + &sctp_sched_prio, }; int sctp_sched_set_sched(struct sctp_association *asoc, diff --git a/net/sctp/stream_sched_prio.c b/net/sctp/stream_sched_prio.c new file mode 100644 index 000000000000..384dbf3c8760 --- /dev/null +++ b/net/sctp/stream_sched_prio.c @@ -0,0 +1,347 @@ +/* SCTP kernel implementation + * (C) Copyright Red Hat Inc. 2017 + * + * This file is part of the SCTP kernel implementation + * + * These functions manipulate sctp stream queue/scheduling. + * + * This SCTP implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This SCTP implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, see + * . + * + * Please send any bug reports or fixes you make to the + * email addresched(es): + * lksctp developers + * + * Written or modified by: + * Marcelo Ricardo Leitner + */ + +#include +#include +#include +#include + +/* Priority handling + * RFC DRAFT ndata section 3.4 + */ + +static void sctp_sched_prio_unsched_all(struct sctp_stream *stream); + +static struct sctp_stream_priorities *sctp_sched_prio_new_head( + struct sctp_stream *stream, int prio, gfp_t gfp) +{ + struct sctp_stream_priorities *p; + + p = kmalloc(sizeof(*p), gfp); + if (!p) + return NULL; + + INIT_LIST_HEAD(&p->prio_sched); + INIT_LIST_HEAD(&p->active); + p->next = NULL; + p->prio = prio; + + return p; +} + +static struct sctp_stream_priorities *sctp_sched_prio_get_head( + struct sctp_stream *stream, int prio, gfp_t gfp) +{ + struct sctp_stream_priorities *p; + int i; + + /* Look into scheduled priorities first, as they are sorted and + * we can find it fast IF it's scheduled. + */ + list_for_each_entry(p, &stream->prio_list, prio_sched) { + if (p->prio == prio) + return p; + if (p->prio > prio) + break; + } + + /* No luck. So we search on all streams now. */ + for (i = 0; i < stream->outcnt; i++) { + if (!stream->out[i].ext) + continue; + + p = stream->out[i].ext->prio_head; + if (!p) + /* Means all other streams won't be initialized + * as well. + */ + break; + if (p->prio == prio) + return p; + } + + /* If not even there, allocate a new one. 
*/ + return sctp_sched_prio_new_head(stream, prio, gfp); +} + +static void sctp_sched_prio_next_stream(struct sctp_stream_priorities *p) +{ + struct list_head *pos; + + pos = p->next->prio_list.next; + if (pos == &p->active) + pos = pos->next; + p->next = list_entry(pos, struct sctp_stream_out_ext, prio_list); +} + +static bool sctp_sched_prio_unsched(struct sctp_stream_out_ext *soute) +{ + bool scheduled = false; + + if (!list_empty(&soute->prio_list)) { + struct sctp_stream_priorities *prio_head = soute->prio_head; + + /* Scheduled */ + scheduled = true; + + if (prio_head->next == soute) + /* Try to move to the next stream */ + sctp_sched_prio_next_stream(prio_head); + + list_del_init(&soute->prio_list); + + /* Also unsched the priority if this was the last stream */ + if (list_empty(&prio_head->active)) { + list_del_init(&prio_head->prio_sched); + /* If there is no stream left, clear next */ + prio_head->next = NULL; + } + } + + return scheduled; +} + +static void sctp_sched_prio_sched(struct sctp_stream *stream, + struct sctp_stream_out_ext *soute) +{ + struct sctp_stream_priorities *prio, *prio_head; + + prio_head = soute->prio_head; + + /* Nothing to do if already scheduled */ + if (!list_empty(&soute->prio_list)) + return; + + /* Schedule the stream. If there is a next, we schedule the new + * one before it, so it's the last in round robin order. + * If there isn't, we also have to schedule the priority. + */ + if (prio_head->next) { + list_add(&soute->prio_list, prio_head->next->prio_list.prev); + return; + } + + list_add(&soute->prio_list, &prio_head->active); + prio_head->next = soute; + + list_for_each_entry(prio, &stream->prio_list, prio_sched) { + if (prio->prio > prio_head->prio) { + list_add(&prio_head->prio_sched, prio->prio_sched.prev); + return; + } + } + + list_add_tail(&prio_head->prio_sched, &stream->prio_list); +} + +static int sctp_sched_prio_set(struct sctp_stream *stream, __u16 sid, + __u16 prio, gfp_t gfp) +{ + struct sctp_stream_out *sout = &stream->out[sid]; + struct sctp_stream_out_ext *soute = sout->ext; + struct sctp_stream_priorities *prio_head, *old; + bool reschedule = false; + int i; + + prio_head = sctp_sched_prio_get_head(stream, prio, gfp); + if (!prio_head) + return -ENOMEM; + + reschedule = sctp_sched_prio_unsched(soute); + old = soute->prio_head; + soute->prio_head = prio_head; + if (reschedule) + sctp_sched_prio_sched(stream, soute); + + if (!old) + /* Happens when we set the priority for the first time */ + return 0; + + for (i = 0; i < stream->outcnt; i++) { + soute = stream->out[i].ext; + if (soute && soute->prio_head == old) + /* It's still in use, nothing else to do here. */ + return 0; + } + + /* No hits, we are good to free it. */ + kfree(old); + + return 0; +} + +static int sctp_sched_prio_get(struct sctp_stream *stream, __u16 sid, + __u16 *value) +{ + *value = stream->out[sid].ext->prio_head->prio; + return 0; +} + +static int sctp_sched_prio_init(struct sctp_stream *stream) +{ + INIT_LIST_HEAD(&stream->prio_list); + + return 0; +} + +static int sctp_sched_prio_init_sid(struct sctp_stream *stream, __u16 sid, + gfp_t gfp) +{ + INIT_LIST_HEAD(&stream->out[sid].ext->prio_list); + return sctp_sched_prio_set(stream, sid, 0, gfp); +} + +static void sctp_sched_prio_free(struct sctp_stream *stream) +{ + struct sctp_stream_priorities *prio, *n; + LIST_HEAD(list); + int i; + + /* As we don't keep a list of priorities, to avoid multiple + * frees we have to do it in 3 steps: + * 1. unsched everyone, so the lists are free to use in 2. + * 2. 
build the list of the priorities + * 3. free the list + */ + sctp_sched_prio_unsched_all(stream); + for (i = 0; i < stream->outcnt; i++) { + if (!stream->out[i].ext) + continue; + prio = stream->out[i].ext->prio_head; + if (prio && list_empty(&prio->prio_sched)) + list_add(&prio->prio_sched, &list); + } + list_for_each_entry_safe(prio, n, &list, prio_sched) { + list_del_init(&prio->prio_sched); + kfree(prio); + } +} + +static void sctp_sched_prio_enqueue(struct sctp_outq *q, + struct sctp_datamsg *msg) +{ + struct sctp_stream *stream; + struct sctp_chunk *ch; + __u16 sid; + + ch = list_first_entry(&msg->chunks, struct sctp_chunk, frag_list); + sid = sctp_chunk_stream_no(ch); + stream = &q->asoc->stream; + sctp_sched_prio_sched(stream, stream->out[sid].ext); +} + +static struct sctp_chunk *sctp_sched_prio_dequeue(struct sctp_outq *q) +{ + struct sctp_stream *stream = &q->asoc->stream; + struct sctp_stream_priorities *prio; + struct sctp_stream_out_ext *soute; + struct sctp_chunk *ch = NULL; + + /* Bail out quickly if queue is empty */ + if (list_empty(&q->out_chunk_list)) + goto out; + + /* Find which chunk is next. It's easy, it's either the current + * one or the first chunk on the next active stream. + */ + if (stream->out_curr) { + soute = stream->out_curr->ext; + } else { + prio = list_entry(stream->prio_list.next, + struct sctp_stream_priorities, prio_sched); + soute = prio->next; + } + ch = list_entry(soute->outq.next, struct sctp_chunk, stream_list); + sctp_sched_dequeue_common(q, ch); + +out: + return ch; +} + +static void sctp_sched_prio_dequeue_done(struct sctp_outq *q, + struct sctp_chunk *ch) +{ + struct sctp_stream_priorities *prio; + struct sctp_stream_out_ext *soute; + __u16 sid; + + /* Last chunk on that msg, move to the next stream on + * this priority. + */ + sid = sctp_chunk_stream_no(ch); + soute = q->asoc->stream.out[sid].ext; + prio = soute->prio_head; + + sctp_sched_prio_next_stream(prio); + + if (list_empty(&soute->outq)) + sctp_sched_prio_unsched(soute); +} + +static void sctp_sched_prio_sched_all(struct sctp_stream *stream) +{ + struct sctp_association *asoc; + struct sctp_stream_out *sout; + struct sctp_chunk *ch; + + asoc = container_of(stream, struct sctp_association, stream); + list_for_each_entry(ch, &asoc->outqueue.out_chunk_list, list) { + __u16 sid; + + sid = sctp_chunk_stream_no(ch); + sout = &stream->out[sid]; + if (sout->ext) + sctp_sched_prio_sched(stream, sout->ext); + } +} + +static void sctp_sched_prio_unsched_all(struct sctp_stream *stream) +{ + struct sctp_stream_priorities *p, *tmp; + struct sctp_stream_out_ext *soute, *souttmp; + + list_for_each_entry_safe(p, tmp, &stream->prio_list, prio_sched) + list_for_each_entry_safe(soute, souttmp, &p->active, prio_list) + sctp_sched_prio_unsched(soute); +} + +struct sctp_sched_ops sctp_sched_prio = { + .set = sctp_sched_prio_set, + .get = sctp_sched_prio_get, + .init = sctp_sched_prio_init, + .init_sid = sctp_sched_prio_init_sid, + .free = sctp_sched_prio_free, + .enqueue = sctp_sched_prio_enqueue, + .dequeue = sctp_sched_prio_dequeue, + .dequeue_done = sctp_sched_prio_dequeue_done, + .sched_all = sctp_sched_prio_sched_all, + .unsched_all = sctp_sched_prio_unsched_all, +}; -- cgit v1.2.3 From ac1ed8b82cd60ba8e7d84103ac1414b8c577c485 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Tue, 3 Oct 2017 19:20:17 -0300 Subject: sctp: introduce round robin stream scheduler This patch introduces RFC Draft ndata section 3.2 Round Robin Scheduler (SCTP_SS_RR).
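For symmetry with the priority example earlier, a sketch of opting in from userspace under the same assumptions (hypothetical helper, connected socket fd); round robin takes no per-stream value, since its .set/.get hooks below are no-ops:

static int use_rr_sched(int fd, sctp_assoc_t assoc_id)
{
	struct sctp_assoc_value av = {
		.assoc_id    = assoc_id,
		.assoc_value = SCTP_SS_RR,
	};

	/* One call is the whole setup; streams are then served
	 * in round robin order as they get data queued.
	 */
	return setsockopt(fd, IPPROTO_SCTP, SCTP_STREAM_SCHEDULER,
			  &av, sizeof(av));
}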
Works by maintaining a list of enqueued streams and tracking the last one used to send data. When the datamsg is done, it switches to the next stream. See-also: https://tools.ietf.org/html/draft-ietf-tsvwg-sctp-ndata-13 Tested-by: Xin Long Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 11 +++ include/uapi/linux/sctp.h | 3 +- net/sctp/Makefile | 3 +- net/sctp/stream_sched.c | 2 + net/sctp/stream_sched_rr.c | 201 +++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 218 insertions(+), 2 deletions(-) create mode 100644 net/sctp/stream_sched_rr.c (limited to 'include') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 40eb8d66a37c..16f949eef52f 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -1348,6 +1348,10 @@ struct sctp_stream_out_ext { struct list_head prio_list; struct sctp_stream_priorities *prio_head; }; + /* Fields used by RR scheduler */ + struct { + struct list_head rr_list; + }; }; }; @@ -1374,6 +1378,13 @@ struct sctp_stream { /* List of priorities scheduled */ struct list_head prio_list; }; + /* Fields used by RR scheduler */ + struct { + /* List of streams scheduled */ + struct list_head rr_list; + /* The next stream stream in line */ + struct sctp_stream_out_ext *rr_next; + }; }; }; diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index 850fa8b29d7e..6cd7d416ca40 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -1100,7 +1100,8 @@ struct sctp_add_streams { enum sctp_sched_type { SCTP_SS_FCFS, SCTP_SS_PRIO, - SCTP_SS_MAX = SCTP_SS_PRIO + SCTP_SS_RR, + SCTP_SS_MAX = SCTP_SS_RR }; #endif /* _UAPI_SCTP_H */ diff --git a/net/sctp/Makefile b/net/sctp/Makefile index 647c9cfd4e95..bf90c5397719 100644 --- a/net/sctp/Makefile +++ b/net/sctp/Makefile @@ -12,7 +12,8 @@ sctp-y := sm_statetable.o sm_statefuns.o sm_sideeffect.o \ inqueue.o outqueue.o ulpqueue.o \ tsnmap.o bind_addr.o socket.o primitive.o \ output.o input.o debug.o stream.o auth.o \ - offload.o stream_sched.o stream_sched_prio.o + offload.o stream_sched.o stream_sched_prio.o \ + stream_sched_rr.o sctp_probe-y := probe.o diff --git a/net/sctp/stream_sched.c b/net/sctp/stream_sched.c index 115ddb765169..03513a9fa110 100644 --- a/net/sctp/stream_sched.c +++ b/net/sctp/stream_sched.c @@ -122,10 +122,12 @@ static struct sctp_sched_ops sctp_sched_fcfs = { /* API to other parts of the stack */ extern struct sctp_sched_ops sctp_sched_prio; +extern struct sctp_sched_ops sctp_sched_rr; struct sctp_sched_ops *sctp_sched_ops[] = { &sctp_sched_fcfs, &sctp_sched_prio, + &sctp_sched_rr, }; int sctp_sched_set_sched(struct sctp_association *asoc, diff --git a/net/sctp/stream_sched_rr.c b/net/sctp/stream_sched_rr.c new file mode 100644 index 000000000000..7612a438c5b9 --- /dev/null +++ b/net/sctp/stream_sched_rr.c @@ -0,0 +1,201 @@ +/* SCTP kernel implementation + * (C) Copyright Red Hat Inc. 2017 + * + * This file is part of the SCTP kernel implementation + * + * These functions manipulate sctp stream queue/scheduling. + * + * This SCTP implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. 
+ * + * This SCTP implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, see + * . + * + * Please send any bug reports or fixes you make to the + * email addresched(es): + * lksctp developers + * + * Written or modified by: + * Marcelo Ricardo Leitner + */ + +#include +#include +#include +#include + +/* Priority handling + * RFC DRAFT ndata section 3.2 + */ +static void sctp_sched_rr_unsched_all(struct sctp_stream *stream); + +static void sctp_sched_rr_next_stream(struct sctp_stream *stream) +{ + struct list_head *pos; + + pos = stream->rr_next->rr_list.next; + if (pos == &stream->rr_list) + pos = pos->next; + stream->rr_next = list_entry(pos, struct sctp_stream_out_ext, rr_list); +} + +static void sctp_sched_rr_unsched(struct sctp_stream *stream, + struct sctp_stream_out_ext *soute) +{ + if (stream->rr_next == soute) + /* Try to move to the next stream */ + sctp_sched_rr_next_stream(stream); + + list_del_init(&soute->rr_list); + + /* If we have no other stream queued, clear next */ + if (list_empty(&stream->rr_list)) + stream->rr_next = NULL; +} + +static void sctp_sched_rr_sched(struct sctp_stream *stream, + struct sctp_stream_out_ext *soute) +{ + if (!list_empty(&soute->rr_list)) + /* Already scheduled. */ + return; + + /* Schedule the stream */ + list_add_tail(&soute->rr_list, &stream->rr_list); + + if (!stream->rr_next) + stream->rr_next = soute; +} + +static int sctp_sched_rr_set(struct sctp_stream *stream, __u16 sid, + __u16 prio, gfp_t gfp) +{ + return 0; +} + +static int sctp_sched_rr_get(struct sctp_stream *stream, __u16 sid, + __u16 *value) +{ + return 0; +} + +static int sctp_sched_rr_init(struct sctp_stream *stream) +{ + INIT_LIST_HEAD(&stream->rr_list); + stream->rr_next = NULL; + + return 0; +} + +static int sctp_sched_rr_init_sid(struct sctp_stream *stream, __u16 sid, + gfp_t gfp) +{ + INIT_LIST_HEAD(&stream->out[sid].ext->rr_list); + + return 0; +} + +static void sctp_sched_rr_free(struct sctp_stream *stream) +{ + sctp_sched_rr_unsched_all(stream); +} + +static void sctp_sched_rr_enqueue(struct sctp_outq *q, + struct sctp_datamsg *msg) +{ + struct sctp_stream *stream; + struct sctp_chunk *ch; + __u16 sid; + + ch = list_first_entry(&msg->chunks, struct sctp_chunk, frag_list); + sid = sctp_chunk_stream_no(ch); + stream = &q->asoc->stream; + sctp_sched_rr_sched(stream, stream->out[sid].ext); +} + +static struct sctp_chunk *sctp_sched_rr_dequeue(struct sctp_outq *q) +{ + struct sctp_stream *stream = &q->asoc->stream; + struct sctp_stream_out_ext *soute; + struct sctp_chunk *ch = NULL; + + /* Bail out quickly if queue is empty */ + if (list_empty(&q->out_chunk_list)) + goto out; + + /* Find which chunk is next */ + if (stream->out_curr) + soute = stream->out_curr->ext; + else + soute = stream->rr_next; + ch = list_entry(soute->outq.next, struct sctp_chunk, stream_list); + + sctp_sched_dequeue_common(q, ch); + +out: + return ch; +} + +static void sctp_sched_rr_dequeue_done(struct sctp_outq *q, + struct sctp_chunk *ch) +{ + struct sctp_stream_out_ext *soute; + __u16 sid; + + /* Last chunk on that msg, move to the next stream */ + sid = sctp_chunk_stream_no(ch); + soute = q->asoc->stream.out[sid].ext; + + 
sctp_sched_rr_next_stream(&q->asoc->stream); + + if (list_empty(&soute->outq)) + sctp_sched_rr_unsched(&q->asoc->stream, soute); +} + +static void sctp_sched_rr_sched_all(struct sctp_stream *stream) +{ + struct sctp_association *asoc; + struct sctp_stream_out_ext *soute; + struct sctp_chunk *ch; + + asoc = container_of(stream, struct sctp_association, stream); + list_for_each_entry(ch, &asoc->outqueue.out_chunk_list, list) { + __u16 sid; + + sid = sctp_chunk_stream_no(ch); + soute = stream->out[sid].ext; + if (soute) + sctp_sched_rr_sched(stream, soute); + } +} + +static void sctp_sched_rr_unsched_all(struct sctp_stream *stream) +{ + struct sctp_stream_out_ext *soute, *tmp; + + list_for_each_entry_safe(soute, tmp, &stream->rr_list, rr_list) + sctp_sched_rr_unsched(stream, soute); +} + +struct sctp_sched_ops sctp_sched_rr = { + .set = sctp_sched_rr_set, + .get = sctp_sched_rr_get, + .init = sctp_sched_rr_init, + .init_sid = sctp_sched_rr_init_sid, + .free = sctp_sched_rr_free, + .enqueue = sctp_sched_rr_enqueue, + .dequeue = sctp_sched_rr_dequeue, + .dequeue_done = sctp_sched_rr_dequeue_done, + .sched_all = sctp_sched_rr_sched_all, + .unsched_all = sctp_sched_rr_unsched_all, +}; -- cgit v1.2.3 From e774d96b7d2c3489bfb5bbdc2b65ed41cd68d3d5 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 4 Oct 2017 15:55:29 +0200 Subject: rtnetlink: remove slave_validate callback no users in the tree. Signed-off-by: Florian Westphal Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/rtnetlink.h | 3 --- net/core/rtnetlink.c | 6 ------ 2 files changed, 9 deletions(-) (limited to 'include') diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index 21837ca68ecc..6520993ff449 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -93,9 +93,6 @@ struct rtnl_link_ops { int slave_maxtype; const struct nla_policy *slave_policy; - int (*slave_validate)(struct nlattr *tb[], - struct nlattr *data[], - struct netlink_ext_ack *extack); int (*slave_changelink)(struct net_device *dev, struct net_device *slave_dev, struct nlattr *tb[], diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 3961f87cdc76..b63c5759641f 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2631,12 +2631,6 @@ replay: return err; slave_data = slave_attr; } - if (m_ops->slave_validate) { - err = m_ops->slave_validate(tb, slave_data, - extack); - if (err < 0) - return err; - } } if (dev) { -- cgit v1.2.3 From 5c45121dc39026ab2139910e57cf933fd57d30f2 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 4 Oct 2017 15:58:49 +0200 Subject: rtnetlink: remove __rtnl_af_unregister switch the only caller to rtnl_af_unregister. Signed-off-by: Florian Westphal Signed-off-by: David S. 
Miller --- include/net/rtnetlink.h | 2 -- net/core/rtnetlink.c | 14 +------------- net/ipv6/addrconf.c | 4 ++-- 3 files changed, 3 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index 6520993ff449..e3ca8e2e3103 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -151,8 +151,6 @@ struct rtnl_af_ops { size_t (*get_stats_af_size)(const struct net_device *dev); }; -void __rtnl_af_unregister(struct rtnl_af_ops *ops); - void rtnl_af_register(struct rtnl_af_ops *ops); void rtnl_af_unregister(struct rtnl_af_ops *ops); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index b63c5759641f..3fb1ca33cba4 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -475,18 +475,6 @@ void rtnl_af_register(struct rtnl_af_ops *ops) } EXPORT_SYMBOL_GPL(rtnl_af_register); -/** - * __rtnl_af_unregister - Unregister rtnl_af_ops from rtnetlink. - * @ops: struct rtnl_af_ops * to unregister - * - * The caller must hold the rtnl_mutex. - */ -void __rtnl_af_unregister(struct rtnl_af_ops *ops) -{ - list_del(&ops->list); -} -EXPORT_SYMBOL_GPL(__rtnl_af_unregister); - /** * rtnl_af_unregister - Unregister rtnl_af_ops from rtnetlink. * @ops: struct rtnl_af_ops * to unregister @@ -494,7 +482,7 @@ EXPORT_SYMBOL_GPL(__rtnl_af_unregister); void rtnl_af_unregister(struct rtnl_af_ops *ops) { rtnl_lock(); - __rtnl_af_unregister(ops); + list_del(&ops->list); rtnl_unlock(); } EXPORT_SYMBOL_GPL(rtnl_af_unregister); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index f553f72d0bee..837418ff2d4b 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -6618,9 +6618,9 @@ void addrconf_cleanup(void) unregister_pernet_subsys(&addrconf_ops); ipv6_addr_label_cleanup(); - rtnl_lock(); + rtnl_af_unregister(&inet6_ops); - __rtnl_af_unregister(&inet6_ops); + rtnl_lock(); /* clean dev list */ for_each_netdev(&init_net, dev) { -- cgit v1.2.3 From 324bda9e6c5add86ba2e1066476481c48132aca0 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 2 Oct 2017 22:50:21 -0700 Subject: bpf: multi program support for cgroup+bpf introduce BPF_F_ALLOW_MULTI flag that can be used to attach multiple bpf programs to a cgroup. The difference between three possible flags for BPF_PROG_ATTACH command: - NONE(default): No further bpf programs allowed in the subtree. - BPF_F_ALLOW_OVERRIDE: If a sub-cgroup installs some bpf program, the program in this cgroup yields to sub-cgroup program. - BPF_F_ALLOW_MULTI: If a sub-cgroup installs some bpf program, that cgroup program gets run in addition to the program in this cgroup. NONE and BPF_F_ALLOW_OVERRIDE existed before. This patch doesn't change their behavior. It only clarifies the semantics in relation to new flag. Only one program is allowed to be attached to a cgroup with NONE or BPF_F_ALLOW_OVERRIDE flag. Multiple programs are allowed to be attached to a cgroup with BPF_F_ALLOW_MULTI flag. They are executed in FIFO order (those that were attached first, run first) The programs of sub-cgroup are executed first, then programs of this cgroup and then programs of parent cgroup. All eligible programs are executed regardless of return code from earlier programs. To allow efficient execution of multiple programs attached to a cgroup and to avoid penalizing cgroups without any programs attached introduce 'struct bpf_prog_array' which is RCU protected array of pointers to bpf programs. 
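A minimal userspace sketch of the new flag (illustrative only, not part of this patch: error handling is omitted and cgroup_fd, prog_fd1 and prog_fd2 are hypothetical, assumed-valid file descriptors obtained elsewhere):

	union bpf_attr attr = {};

	attr.target_fd	   = cgroup_fd;
	attr.attach_bpf_fd = prog_fd1;
	attr.attach_type   = BPF_CGROUP_INET_EGRESS;
	attr.attach_flags  = BPF_F_ALLOW_MULTI;
	syscall(__NR_bpf, BPF_PROG_ATTACH, &attr, sizeof(attr));

	/* attaches alongside prog_fd1 and, per the FIFO rule above,
	 * runs after it
	 */
	attr.attach_bpf_fd = prog_fd2;
	syscall(__NR_bpf, BPF_PROG_ATTACH, &attr, sizeof(attr));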
Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Acked-by: Martin KaFai Lau for cgroup bits Acked-by: Tejun Heo Signed-off-by: David S. Miller --- include/linux/bpf-cgroup.h | 46 +++-- include/linux/bpf.h | 32 ++++ include/linux/filter.h | 2 +- include/uapi/linux/bpf.h | 42 +++- kernel/bpf/cgroup.c | 467 ++++++++++++++++++++++++++++++++------------- kernel/bpf/core.c | 31 +++ kernel/bpf/syscall.c | 37 ++-- kernel/cgroup/cgroup.c | 28 ++- 8 files changed, 516 insertions(+), 169 deletions(-) (limited to 'include') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index d41d40ac3efd..102e56fbb6de 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -14,27 +14,42 @@ struct bpf_sock_ops_kern; extern struct static_key_false cgroup_bpf_enabled_key; #define cgroup_bpf_enabled static_branch_unlikely(&cgroup_bpf_enabled_key) +struct bpf_prog_list { + struct list_head node; + struct bpf_prog *prog; +}; + +struct bpf_prog_array; + struct cgroup_bpf { - /* - * Store two sets of bpf_prog pointers, one for programs that are - * pinned directly to this cgroup, and one for those that are effective - * when this cgroup is accessed. + /* array of effective progs in this cgroup */ + struct bpf_prog_array __rcu *effective[MAX_BPF_ATTACH_TYPE]; + + /* attached progs to this cgroup and attach flags + * when flags == 0 or BPF_F_ALLOW_OVERRIDE the progs list will + * have either zero or one element + * when BPF_F_ALLOW_MULTI the list can have up to BPF_CGROUP_MAX_PROGS */ - struct bpf_prog *prog[MAX_BPF_ATTACH_TYPE]; - struct bpf_prog __rcu *effective[MAX_BPF_ATTACH_TYPE]; - bool disallow_override[MAX_BPF_ATTACH_TYPE]; + struct list_head progs[MAX_BPF_ATTACH_TYPE]; + u32 flags[MAX_BPF_ATTACH_TYPE]; + + /* temp storage for effective prog array used by prog_attach/detach */ + struct bpf_prog_array __rcu *inactive; }; void cgroup_bpf_put(struct cgroup *cgrp); -void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent); +int cgroup_bpf_inherit(struct cgroup *cgrp); -int __cgroup_bpf_update(struct cgroup *cgrp, struct cgroup *parent, - struct bpf_prog *prog, enum bpf_attach_type type, - bool overridable); +int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, + enum bpf_attach_type type, u32 flags); +int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, + enum bpf_attach_type type, u32 flags); -/* Wrapper for __cgroup_bpf_update() protected by cgroup_mutex */ -int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog, - enum bpf_attach_type type, bool overridable); +/* Wrapper for __cgroup_bpf_*() protected by cgroup_mutex */ +int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, + enum bpf_attach_type type, u32 flags); +int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, + enum bpf_attach_type type, u32 flags); int __cgroup_bpf_run_filter_skb(struct sock *sk, struct sk_buff *skb, @@ -96,8 +111,7 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, struct cgroup_bpf {}; static inline void cgroup_bpf_put(struct cgroup *cgrp) {} -static inline void cgroup_bpf_inherit(struct cgroup *cgrp, - struct cgroup *parent) {} +static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; } #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; }) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 252f4bc9eb25..a6964b75f070 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -241,6 +241,38 @@ int bpf_prog_test_run_xdp(struct 
bpf_prog *prog, const union bpf_attr *kattr, int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); +/* An array of programs to be executed under rcu_read_lock(). + * + * Typical usage: + * ret = BPF_PROG_RUN_ARRAY(&bpf_prog_array, ctx, BPF_PROG_RUN); + * + * The structure returned by bpf_prog_array_alloc() should be populated + * with program pointers and the last pointer must be NULL. + * The user has to keep refcnt on the program and make sure the program + * is removed from the array before bpf_prog_put(). + * The 'struct bpf_prog_array *' should only be replaced with xchg() + * since other cpus are walking the array of pointers in parallel. + */ +struct bpf_prog_array { + struct rcu_head rcu; + struct bpf_prog *progs[0]; +}; + +struct bpf_prog_array __rcu *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags); +void bpf_prog_array_free(struct bpf_prog_array __rcu *progs); + +#define BPF_PROG_RUN_ARRAY(array, ctx, func) \ + ({ \ + struct bpf_prog **_prog; \ + u32 _ret = 1; \ + rcu_read_lock(); \ + _prog = rcu_dereference(array)->progs; \ + for (; *_prog; _prog++) \ + _ret &= func(*_prog, ctx); \ + rcu_read_unlock(); \ + _ret; \ + }) + #ifdef CONFIG_BPF_SYSCALL DECLARE_PER_CPU(int, bpf_prog_active); diff --git a/include/linux/filter.h b/include/linux/filter.h index 911d454af107..2d2db394b0ca 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -481,7 +481,7 @@ struct sk_filter { struct bpf_prog *prog; }; -#define BPF_PROG_RUN(filter, ctx) (*filter->bpf_func)(ctx, filter->insnsi) +#define BPF_PROG_RUN(filter, ctx) (*(filter)->bpf_func)(ctx, (filter)->insnsi) #define BPF_SKB_CB_LEN QDISC_CB_PRIV_LEN diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6d2137b4cf38..762f74bc6c47 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -143,11 +143,47 @@ enum bpf_attach_type { #define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE -/* If BPF_F_ALLOW_OVERRIDE flag is used in BPF_PROG_ATTACH command - * to the given target_fd cgroup the descendent cgroup will be able to - * override effective bpf program that was inherited from this cgroup +/* cgroup-bpf attach flags used in BPF_PROG_ATTACH command + * + * NONE(default): No further bpf programs allowed in the subtree. + * + * BPF_F_ALLOW_OVERRIDE: If a sub-cgroup installs some bpf program, + * the program in this cgroup yields to sub-cgroup program. + * + * BPF_F_ALLOW_MULTI: If a sub-cgroup installs some bpf program, + * that cgroup program gets run in addition to the program in this cgroup. + * + * Only one program is allowed to be attached to a cgroup with + * NONE or BPF_F_ALLOW_OVERRIDE flag. + * Attaching another program on top of NONE or BPF_F_ALLOW_OVERRIDE will + * release the old program and attach the new one. Attach flags have to match. + * + * Multiple programs are allowed to be attached to a cgroup with + * BPF_F_ALLOW_MULTI flag. They are executed in FIFO order + * (those that were attached first, run first). + * The programs of sub-cgroup are executed first, then programs of + * this cgroup and then programs of parent cgroup. + * When a child program makes a decision (like picking TCP CA or sock bind), + * the parent program has a chance to override it. + * + * A cgroup with MULTI or OVERRIDE flag allows any attach flags in sub-cgroups. + * A cgroup with NONE doesn't allow any programs in sub-cgroups. 
+ * Ex1: + * cgrp1 (MULTI progs A, B) -> + * cgrp2 (OVERRIDE prog C) -> + * cgrp3 (MULTI prog D) -> + * cgrp4 (OVERRIDE prog E) -> + * cgrp5 (NONE prog F) + * The event in cgrp5 triggers execution of F,D,A,B in that order. + * If prog F is detached, the execution is E,D,A,B. + * If prog F and D are detached, the execution is E,A,B. + * If prog F, E and D are detached, the execution is C,A,B. + * + * All eligible programs are executed regardless of return code from + * earlier programs. */ #define BPF_F_ALLOW_OVERRIDE (1U << 0) +#define BPF_F_ALLOW_MULTI (1U << 1) /* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the * verifier will perform strict alignment checking as if the kernel diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 546113430049..6b7500bbdb53 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -27,129 +27,361 @@ void cgroup_bpf_put(struct cgroup *cgrp) { unsigned int type; - for (type = 0; type < ARRAY_SIZE(cgrp->bpf.prog); type++) { - struct bpf_prog *prog = cgrp->bpf.prog[type]; - - if (prog) { - bpf_prog_put(prog); + for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) { + struct list_head *progs = &cgrp->bpf.progs[type]; + struct bpf_prog_list *pl, *tmp; + + list_for_each_entry_safe(pl, tmp, progs, node) { + list_del(&pl->node); + bpf_prog_put(pl->prog); + kfree(pl); static_branch_dec(&cgroup_bpf_enabled_key); } + bpf_prog_array_free(cgrp->bpf.effective[type]); + } +} + +/* count number of elements in the list. + * it's slow but the list cannot be long + */ +static u32 prog_list_length(struct list_head *head) +{ + struct bpf_prog_list *pl; + u32 cnt = 0; + + list_for_each_entry(pl, head, node) { + if (!pl->prog) + continue; + cnt++; } + return cnt; +} + +/* if parent has a non-overridable prog attached, + * disallow attaching new programs to the descendant cgroup. + * if parent has overridable or multi-prog, allow attaching + */ +static bool hierarchy_allows_attach(struct cgroup *cgrp, + enum bpf_attach_type type, + u32 new_flags) +{ + struct cgroup *p; + + p = cgroup_parent(cgrp); + if (!p) + return true; + do { + u32 flags = p->bpf.flags[type]; + u32 cnt; + + if (flags & BPF_F_ALLOW_MULTI) + return true; + cnt = prog_list_length(&p->bpf.progs[type]); + WARN_ON_ONCE(cnt > 1); + if (cnt == 1) + return !!(flags & BPF_F_ALLOW_OVERRIDE); + p = cgroup_parent(p); + } while (p); + return true; +} + +/* compute a chain of effective programs for a given cgroup: + * start from the list of programs in this cgroup and add + * all parent programs. 
+ * Note that a parent's F_ALLOW_OVERRIDE-type program yields + * to programs in this cgroup + */ +static int compute_effective_progs(struct cgroup *cgrp, + enum bpf_attach_type type, + struct bpf_prog_array __rcu **array) +{ + struct bpf_prog_array __rcu *progs; + struct bpf_prog_list *pl; + struct cgroup *p = cgrp; + int cnt = 0; + + /* count number of effective programs by walking parents */ + do { + if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI)) + cnt += prog_list_length(&p->bpf.progs[type]); + p = cgroup_parent(p); + } while (p); + + progs = bpf_prog_array_alloc(cnt, GFP_KERNEL); + if (!progs) + return -ENOMEM; + + /* populate the array with effective progs */ + cnt = 0; + p = cgrp; + do { + if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI)) + list_for_each_entry(pl, + &p->bpf.progs[type], node) { + if (!pl->prog) + continue; + rcu_dereference_protected(progs, 1)-> + progs[cnt++] = pl->prog; + } + p = cgroup_parent(p); + } while (p); + + *array = progs; + return 0; +} + +static void activate_effective_progs(struct cgroup *cgrp, + enum bpf_attach_type type, + struct bpf_prog_array __rcu *array) +{ + struct bpf_prog_array __rcu *old_array; + + old_array = xchg(&cgrp->bpf.effective[type], array); + /* free prog array after grace period, since __cgroup_bpf_run_*() + * might be still walking the array + */ + bpf_prog_array_free(old_array); } /** * cgroup_bpf_inherit() - inherit effective programs from parent * @cgrp: the cgroup to modify - * @parent: the parent to inherit from */ -void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent) +int cgroup_bpf_inherit(struct cgroup *cgrp) { - unsigned int type; +/* Has to use a macro instead of a const int, since the compiler thinks + * that the array below is variable length + */ +#define NR ARRAY_SIZE(cgrp->bpf.effective) + struct bpf_prog_array __rcu *arrays[NR] = {}; + int i; - for (type = 0; type < ARRAY_SIZE(cgrp->bpf.effective); type++) { - struct bpf_prog *e; + for (i = 0; i < NR; i++) + INIT_LIST_HEAD(&cgrp->bpf.progs[i]); - e = rcu_dereference_protected(parent->bpf.effective[type], - lockdep_is_held(&cgroup_mutex)); - rcu_assign_pointer(cgrp->bpf.effective[type], e); - cgrp->bpf.disallow_override[type] = parent->bpf.disallow_override[type]; - } + for (i = 0; i < NR; i++) + if (compute_effective_progs(cgrp, i, &arrays[i])) + goto cleanup; + + for (i = 0; i < NR; i++) + activate_effective_progs(cgrp, i, arrays[i]); + + return 0; +cleanup: + for (i = 0; i < NR; i++) + bpf_prog_array_free(arrays[i]); + return -ENOMEM; } +#define BPF_CGROUP_MAX_PROGS 64 + /** - * __cgroup_bpf_update() - Update the pinned program of a cgroup, and + * __cgroup_bpf_attach() - Attach the program to a cgroup, and * propagate the change to descendants * @cgrp: The cgroup which descendants to traverse - * @parent: The parent of @cgrp, or %NULL if @cgrp is the root - * @prog: A new program to pin - * @type: Type of pinning operation (ingress/egress) - * - * Each cgroup has a set of two pointers for bpf programs; one for eBPF - * programs it owns, and which is effective for execution. - * - * If @prog is not %NULL, this function attaches a new program to the cgroup - * and releases the one that is currently attached, if any. @prog is then made - * the effective program of type @type in that cgroup. - * - * If @prog is %NULL, the currently attached program of type @type is released, - * and the effective program of the parent cgroup (if any) is inherited to - * @cgrp. 
- * - * Then, the descendants of @cgrp are walked and the effective program for - * each of them is set to the effective program of @cgrp unless the - * descendant has its own program attached, in which case the subbranch is - * skipped. This ensures that delegated subcgroups with own programs are left - * untouched. + * @prog: A program to attach + * @type: Type of attach operation * * Must be called with cgroup_mutex held. */ -int __cgroup_bpf_update(struct cgroup *cgrp, struct cgroup *parent, - struct bpf_prog *prog, enum bpf_attach_type type, - bool new_overridable) +int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, + enum bpf_attach_type type, u32 flags) { - struct bpf_prog *old_prog, *effective = NULL; - struct cgroup_subsys_state *pos; - bool overridable = true; - - if (parent) { - overridable = !parent->bpf.disallow_override[type]; - effective = rcu_dereference_protected(parent->bpf.effective[type], - lockdep_is_held(&cgroup_mutex)); - } - - if (prog && effective && !overridable) - /* if parent has non-overridable prog attached, disallow - * attaching new programs to descendent cgroup - */ + struct list_head *progs = &cgrp->bpf.progs[type]; + struct bpf_prog *old_prog = NULL; + struct cgroup_subsys_state *css; + struct bpf_prog_list *pl; + bool pl_was_allocated; + u32 old_flags; + int err; + + if ((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) + /* invalid combination */ + return -EINVAL; + + if (!hierarchy_allows_attach(cgrp, type, flags)) return -EPERM; - if (prog && effective && overridable != new_overridable) - /* if parent has overridable prog attached, only - * allow overridable programs in descendent cgroup + if (!list_empty(progs) && cgrp->bpf.flags[type] != flags) + /* Disallow attaching non-overridable on top + * of existing overridable in this cgroup. 
+ * Disallow attaching multi-prog if overridable or none */ return -EPERM; - old_prog = cgrp->bpf.prog[type]; - - if (prog) { - overridable = new_overridable; - effective = prog; - if (old_prog && - cgrp->bpf.disallow_override[type] == new_overridable) - /* disallow attaching non-overridable on top - * of existing overridable in this cgroup - * and vice versa - */ - return -EPERM; + if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS) + return -E2BIG; + + if (flags & BPF_F_ALLOW_MULTI) { + list_for_each_entry(pl, progs, node) + if (pl->prog == prog) + /* disallow attaching the same prog twice */ + return -EINVAL; + + pl = kmalloc(sizeof(*pl), GFP_KERNEL); + if (!pl) + return -ENOMEM; + pl_was_allocated = true; + pl->prog = prog; + list_add_tail(&pl->node, progs); + } else { + if (list_empty(progs)) { + pl = kmalloc(sizeof(*pl), GFP_KERNEL); + if (!pl) + return -ENOMEM; + pl_was_allocated = true; + list_add_tail(&pl->node, progs); + } else { + pl = list_first_entry(progs, typeof(*pl), node); + old_prog = pl->prog; + pl_was_allocated = false; + } + pl->prog = prog; } - if (!prog && !old_prog) - /* report error when trying to detach and nothing is attached */ - return -ENOENT; + old_flags = cgrp->bpf.flags[type]; + cgrp->bpf.flags[type] = flags; - cgrp->bpf.prog[type] = prog; + /* allocate and recompute effective prog arrays */ + css_for_each_descendant_pre(css, &cgrp->self) { + struct cgroup *desc = container_of(css, struct cgroup, self); - css_for_each_descendant_pre(pos, &cgrp->self) { - struct cgroup *desc = container_of(pos, struct cgroup, self); - - /* skip the subtree if the descendant has its own program */ - if (desc->bpf.prog[type] && desc != cgrp) { - pos = css_rightmost_descendant(pos); - } else { - rcu_assign_pointer(desc->bpf.effective[type], - effective); - desc->bpf.disallow_override[type] = !overridable; - } + err = compute_effective_progs(desc, type, &desc->bpf.inactive); + if (err) + goto cleanup; } - if (prog) - static_branch_inc(&cgroup_bpf_enabled_key); + /* all allocations were successful. Activate all prog arrays */ + css_for_each_descendant_pre(css, &cgrp->self) { + struct cgroup *desc = container_of(css, struct cgroup, self); + activate_effective_progs(desc, type, desc->bpf.inactive); + desc->bpf.inactive = NULL; + } + + static_branch_inc(&cgroup_bpf_enabled_key); if (old_prog) { bpf_prog_put(old_prog); static_branch_dec(&cgroup_bpf_enabled_key); } return 0; + +cleanup: + /* oom while computing effective. Free all computed effective arrays + * since they were not activated + */ + css_for_each_descendant_pre(css, &cgrp->self) { + struct cgroup *desc = container_of(css, struct cgroup, self); + + bpf_prog_array_free(desc->bpf.inactive); + desc->bpf.inactive = NULL; + } + + /* and cleanup the prog list */ + pl->prog = old_prog; + if (pl_was_allocated) { + list_del(&pl->node); + kfree(pl); + } + return err; +} + +/** + * __cgroup_bpf_detach() - Detach the program from a cgroup, and + * propagate the change to descendants + * @cgrp: The cgroup which descendants to traverse + * @prog: A program to detach or NULL + * @type: Type of detach operation + * + * Must be called with cgroup_mutex held. 
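+ * + * Note, derived from the code below: with BPF_F_ALLOW_MULTI a valid + * @prog must be passed so the matching list entry can be found, while + * NONE/OVERRIDE cgroups keep accepting a NULL @prog for backward + * compatibility.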
+ */ +int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, + enum bpf_attach_type type, u32 unused_flags) +{ + struct list_head *progs = &cgrp->bpf.progs[type]; + u32 flags = cgrp->bpf.flags[type]; + struct bpf_prog *old_prog = NULL; + struct cgroup_subsys_state *css; + struct bpf_prog_list *pl; + int err; + + if (flags & BPF_F_ALLOW_MULTI) { + if (!prog) + /* to detach MULTI prog the user has to specify valid FD + * of the program to be detached + */ + return -EINVAL; + } else { + if (list_empty(progs)) + /* report error when trying to detach and nothing is attached */ + return -ENOENT; + } + + if (flags & BPF_F_ALLOW_MULTI) { + /* find the prog and detach it */ + list_for_each_entry(pl, progs, node) { + if (pl->prog != prog) + continue; + old_prog = prog; + /* mark it deleted, so it's ignored while + * recomputing effective + */ + pl->prog = NULL; + break; + } + if (!old_prog) + return -ENOENT; + } else { + /* to maintain backward compatibility NONE and OVERRIDE cgroups + * allow detaching with invalid FD (prog==NULL) + */ + pl = list_first_entry(progs, typeof(*pl), node); + old_prog = pl->prog; + pl->prog = NULL; + } + + /* allocate and recompute effective prog arrays */ + css_for_each_descendant_pre(css, &cgrp->self) { + struct cgroup *desc = container_of(css, struct cgroup, self); + + err = compute_effective_progs(desc, type, &desc->bpf.inactive); + if (err) + goto cleanup; + } + + /* all allocations were successful. Activate all prog arrays */ + css_for_each_descendant_pre(css, &cgrp->self) { + struct cgroup *desc = container_of(css, struct cgroup, self); + + activate_effective_progs(desc, type, desc->bpf.inactive); + desc->bpf.inactive = NULL; + } + + /* now can actually delete it from this cgroup list */ + list_del(&pl->node); + kfree(pl); + if (list_empty(progs)) + /* last program was detached, reset flags to zero */ + cgrp->bpf.flags[type] = 0; + + bpf_prog_put(old_prog); + static_branch_dec(&cgroup_bpf_enabled_key); + return 0; + +cleanup: + /* oom while computing effective. Free all computed effective arrays + * since they were not activated + */ + css_for_each_descendant_pre(css, &cgrp->self) { + struct cgroup *desc = container_of(css, struct cgroup, self); + + bpf_prog_array_free(desc->bpf.inactive); + desc->bpf.inactive = NULL; + } + + /* and restore back old_prog */ + pl->prog = old_prog; + return err; } /** @@ -171,36 +403,26 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, struct sk_buff *skb, enum bpf_attach_type type) { - struct bpf_prog *prog; + unsigned int offset = skb->data - skb_network_header(skb); + struct sock *save_sk; struct cgroup *cgrp; - int ret = 0; + int ret; if (!sk || !sk_fullsock(sk)) return 0; - if (sk->sk_family != AF_INET && - sk->sk_family != AF_INET6) + if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6) return 0; cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - - rcu_read_lock(); - - prog = rcu_dereference(cgrp->bpf.effective[type]); - if (prog) { - unsigned int offset = skb->data - skb_network_header(skb); - struct sock *save_sk = skb->sk; - - skb->sk = sk; - __skb_push(skb, offset); - ret = bpf_prog_run_save_cb(prog, skb) == 1 ? 0 : -EPERM; - __skb_pull(skb, offset); - skb->sk = save_sk; - } - - rcu_read_unlock(); - - return ret; + save_sk = skb->sk; + skb->sk = sk; + __skb_push(skb, offset); + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb, + bpf_prog_run_save_cb); + __skb_pull(skb, offset); + skb->sk = save_sk; + return ret == 1 ? 
0 : -EPERM; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb); @@ -221,19 +443,10 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk, enum bpf_attach_type type) { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - struct bpf_prog *prog; - int ret = 0; - - - rcu_read_lock(); - - prog = rcu_dereference(cgrp->bpf.effective[type]); - if (prog) - ret = BPF_PROG_RUN(prog, sk) == 1 ? 0 : -EPERM; + int ret; - rcu_read_unlock(); - - return ret; + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sk, BPF_PROG_RUN); + return ret == 1 ? 0 : -EPERM; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); @@ -258,18 +471,10 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, enum bpf_attach_type type) { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - struct bpf_prog *prog; - int ret = 0; - - - rcu_read_lock(); - - prog = rcu_dereference(cgrp->bpf.effective[type]); - if (prog) - ret = BPF_PROG_RUN(prog, sock_ops) == 1 ? 0 : -EPERM; - - rcu_read_unlock(); + int ret; - return ret; + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sock_ops, + BPF_PROG_RUN); + return ret == 1 ? 0 : -EPERM; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 917cc04a0a94..6b49e1991ae7 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1381,6 +1381,37 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) } EXPORT_SYMBOL_GPL(bpf_prog_select_runtime); +/* To avoid allocating an empty bpf_prog_array for cgroups that + * don't have a bpf program attached, use one global 'empty_prog_array'. + * It will not be modified by the caller of bpf_prog_array_alloc() + * (since the caller requested prog_cnt == 0); that pointer should + * still be 'freed' by bpf_prog_array_free(). + */ +static struct { + struct bpf_prog_array hdr; + struct bpf_prog *null_prog; +} empty_prog_array = { + .null_prog = NULL, +}; + +struct bpf_prog_array __rcu *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags) +{ + if (prog_cnt) + return kzalloc(sizeof(struct bpf_prog_array) + + sizeof(struct bpf_prog *) * (prog_cnt + 1), + flags); + + return &empty_prog_array.hdr; +} + +void bpf_prog_array_free(struct bpf_prog_array __rcu *progs) +{ + if (!progs || + progs == (struct bpf_prog_array __rcu *)&empty_prog_array.hdr) + return; + kfree_rcu(progs, rcu); +} + static void bpf_prog_free_deferred(struct work_struct *work) { struct bpf_prog_aux *aux; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b927da66f653..51bee695d32c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1168,6 +1168,9 @@ static int sockmap_get_from_fd(const union bpf_attr *attr, bool attach) return 0; } +#define BPF_F_ATTACH_MASK \ + (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI) + static int bpf_prog_attach(const union bpf_attr *attr) { enum bpf_prog_type ptype; @@ -1181,7 +1184,7 @@ static int bpf_prog_attach(const union bpf_attr *attr) if (CHECK_ATTR(BPF_PROG_ATTACH)) return -EINVAL; - if (attr->attach_flags & ~BPF_F_ALLOW_OVERRIDE) + if (attr->attach_flags & ~BPF_F_ATTACH_MASK) return -EINVAL; switch (attr->attach_type) { @@ -1212,8 +1215,8 @@ static int bpf_prog_attach(const union bpf_attr *attr) return PTR_ERR(cgrp); } - ret = cgroup_bpf_update(cgrp, prog, attr->attach_type, - attr->attach_flags & BPF_F_ALLOW_OVERRIDE); + ret = cgroup_bpf_attach(cgrp, prog, attr->attach_type, + attr->attach_flags); if (ret) bpf_prog_put(prog); cgroup_put(cgrp); @@ -1225,6 +1228,8 @@ static int bpf_prog_attach(const union bpf_attr *attr) static int bpf_prog_detach(const union bpf_attr *attr) { + enum bpf_prog_type 
ptype; + struct bpf_prog *prog; struct cgroup *cgrp; int ret; @@ -1237,23 +1242,33 @@ static int bpf_prog_detach(const union bpf_attr *attr) switch (attr->attach_type) { case BPF_CGROUP_INET_INGRESS: case BPF_CGROUP_INET_EGRESS: + ptype = BPF_PROG_TYPE_CGROUP_SKB; + break; case BPF_CGROUP_INET_SOCK_CREATE: + ptype = BPF_PROG_TYPE_CGROUP_SOCK; + break; case BPF_CGROUP_SOCK_OPS: - cgrp = cgroup_get_from_fd(attr->target_fd); - if (IS_ERR(cgrp)) - return PTR_ERR(cgrp); - - ret = cgroup_bpf_update(cgrp, NULL, attr->attach_type, false); - cgroup_put(cgrp); + ptype = BPF_PROG_TYPE_SOCK_OPS; break; case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: - ret = sockmap_get_from_fd(attr, false); - break; + return sockmap_get_from_fd(attr, false); default: return -EINVAL; } + cgrp = cgroup_get_from_fd(attr->target_fd); + if (IS_ERR(cgrp)) + return PTR_ERR(cgrp); + + prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); + if (IS_ERR(prog)) + prog = NULL; + + ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, 0); + if (prog) + bpf_prog_put(prog); + cgroup_put(cgrp); return ret; } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index d6551cd45238..57eb866ae78d 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1896,6 +1896,9 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags) if (ret) goto destroy_root; + ret = cgroup_bpf_inherit(root_cgrp); + WARN_ON_ONCE(ret); + trace_cgroup_setup_root(root); /* @@ -4713,6 +4716,9 @@ static struct cgroup *cgroup_create(struct cgroup *parent) cgrp->self.parent = &parent->self; cgrp->root = root; cgrp->level = level; + ret = cgroup_bpf_inherit(cgrp); + if (ret) + goto out_idr_free; for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) { cgrp->ancestor_ids[tcgrp->level] = tcgrp->id; @@ -4747,13 +4753,12 @@ static struct cgroup *cgroup_create(struct cgroup *parent) if (!cgroup_on_dfl(cgrp)) cgrp->subtree_control = cgroup_control(cgrp); - if (parent) - cgroup_bpf_inherit(cgrp, parent); - cgroup_propagate_control(cgrp); return cgrp; +out_idr_free: + cgroup_idr_remove(&root->cgroup_idr, cgrp->id); out_cancel_ref: percpu_ref_exit(&cgrp->self.refcnt); out_free_cgrp: @@ -5736,14 +5741,23 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd) #endif /* CONFIG_SOCK_CGROUP_DATA */ #ifdef CONFIG_CGROUP_BPF -int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog, - enum bpf_attach_type type, bool overridable) +int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, + enum bpf_attach_type type, u32 flags) +{ + int ret; + + mutex_lock(&cgroup_mutex); + ret = __cgroup_bpf_attach(cgrp, prog, type, flags); + mutex_unlock(&cgroup_mutex); + return ret; +} +int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, + enum bpf_attach_type type, u32 flags) { - struct cgroup *parent = cgroup_parent(cgrp); int ret; mutex_lock(&cgroup_mutex); - ret = __cgroup_bpf_update(cgrp, parent, prog, type, overridable); + ret = __cgroup_bpf_detach(cgrp, prog, type, flags); mutex_unlock(&cgroup_mutex); return ret; } -- cgit v1.2.3 From 468e2f64d220fe2dc11caa2bcb9b3a1e50fc7321 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 2 Oct 2017 22:50:22 -0700 Subject: bpf: introduce BPF_PROG_QUERY command introduce BPF_PROG_QUERY command to retrieve a set of either attached programs to given cgroup or a set of effective programs that will execute for events within a cgroup Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Acked-by: Martin KaFai Lau for cgroup bits Acked-by: Tejun Heo Signed-off-by: 
David S. Miller --- include/linux/bpf-cgroup.h | 4 ++++ include/linux/bpf.h | 3 +++ include/uapi/linux/bpf.h | 13 +++++++++++++ kernel/bpf/cgroup.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/core.c | 38 ++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 34 ++++++++++++++++++++++++++++++++++ kernel/cgroup/cgroup.c | 10 ++++++++++ 7 files changed, 148 insertions(+) (limited to 'include') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 102e56fbb6de..359b6f5d3d90 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -44,12 +44,16 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, enum bpf_attach_type type, u32 flags); int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, enum bpf_attach_type type, u32 flags); +int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, + union bpf_attr __user *uattr); /* Wrapper for __cgroup_bpf_*() protected by cgroup_mutex */ int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, enum bpf_attach_type type, u32 flags); int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, enum bpf_attach_type type, u32 flags); +int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, + union bpf_attr __user *uattr); int __cgroup_bpf_run_filter_skb(struct sock *sk, struct sk_buff *skb, diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a6964b75f070..a67daea731ab 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -260,6 +260,9 @@ struct bpf_prog_array { struct bpf_prog_array __rcu *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags); void bpf_prog_array_free(struct bpf_prog_array __rcu *progs); +int bpf_prog_array_length(struct bpf_prog_array __rcu *progs); +int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, + __u32 __user *prog_ids, u32 cnt); #define BPF_PROG_RUN_ARRAY(array, ctx, func) \ ({ \ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 762f74bc6c47..cb2b9f95160a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -92,6 +92,7 @@ enum bpf_cmd { BPF_PROG_GET_FD_BY_ID, BPF_MAP_GET_FD_BY_ID, BPF_OBJ_GET_INFO_BY_FD, + BPF_PROG_QUERY, }; enum bpf_map_type { @@ -211,6 +212,9 @@ enum bpf_attach_type { /* Specify numa node during map creation */ #define BPF_F_NUMA_NODE (1U << 2) +/* flags for BPF_PROG_QUERY */ +#define BPF_F_QUERY_EFFECTIVE (1U << 0) + #define BPF_OBJ_NAME_LEN 16U union bpf_attr { @@ -289,6 +293,15 @@ union bpf_attr { __u32 info_len; __aligned_u64 info; } info; + + struct { /* anonymous struct used by BPF_PROG_QUERY command */ + __u32 target_fd; /* container object to query */ + __u32 attach_type; + __u32 query_flags; + __u32 attach_flags; + __aligned_u64 prog_ids; + __u32 prog_cnt; + } query; } __attribute__((aligned(8))); /* BPF helper function descriptions: diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 6b7500bbdb53..e88abc0865d5 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -384,6 +384,52 @@ cleanup: return err; } +/* Must be called with cgroup_mutex held to avoid races. 
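+ * + * Illustrative userspace sketch of the new command via the bpf(2) + * syscall (hypothetical fds and sizes, error handling omitted): + * + * union bpf_attr attr = {}; + * __u32 ids[64]; + * + * attr.query.target_fd = cgroup_fd; + * attr.query.attach_type = BPF_CGROUP_INET_EGRESS; + * attr.query.prog_ids = (__u64)(unsigned long)ids; + * attr.query.prog_cnt = 64; + * syscall(__NR_bpf, BPF_PROG_QUERY, &attr, sizeof(attr));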
*/ +int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); + enum bpf_attach_type type = attr->query.attach_type; + struct list_head *progs = &cgrp->bpf.progs[type]; + u32 flags = cgrp->bpf.flags[type]; + int cnt, ret = 0, i; + + if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) + cnt = bpf_prog_array_length(cgrp->bpf.effective[type]); + else + cnt = prog_list_length(progs); + + if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) + return -EFAULT; + if (copy_to_user(&uattr->query.prog_cnt, &cnt, sizeof(cnt))) + return -EFAULT; + if (attr->query.prog_cnt == 0 || !prog_ids || !cnt) + /* return early if user requested only program count + flags */ + return 0; + if (attr->query.prog_cnt < cnt) { + cnt = attr->query.prog_cnt; + ret = -ENOSPC; + } + + if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) { + return bpf_prog_array_copy_to_user(cgrp->bpf.effective[type], + prog_ids, cnt); + } else { + struct bpf_prog_list *pl; + u32 id; + + i = 0; + list_for_each_entry(pl, progs, node) { + id = pl->prog->aux->id; + if (copy_to_user(prog_ids + i, &id, sizeof(id))) + return -EFAULT; + if (++i == cnt) + break; + } + } + return ret; +} + /** * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering * @sk: The socket sending or receiving traffic diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 6b49e1991ae7..eba966c09053 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1412,6 +1412,44 @@ void bpf_prog_array_free(struct bpf_prog_array __rcu *progs) kfree_rcu(progs, rcu); } +int bpf_prog_array_length(struct bpf_prog_array __rcu *progs) +{ + struct bpf_prog **prog; + u32 cnt = 0; + + rcu_read_lock(); + prog = rcu_dereference(progs)->progs; + for (; *prog; prog++) + cnt++; + rcu_read_unlock(); + return cnt; +} + +int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, + __u32 __user *prog_ids, u32 cnt) +{ + struct bpf_prog **prog; + u32 i = 0, id; + + rcu_read_lock(); + prog = rcu_dereference(progs)->progs; + for (; *prog; prog++) { + id = (*prog)->aux->id; + if (copy_to_user(prog_ids + i, &id, sizeof(id))) { + rcu_read_unlock(); + return -EFAULT; + } + if (++i == cnt) { + prog++; + break; + } + } + rcu_read_unlock(); + if (*prog) + return -ENOSPC; + return 0; +} + static void bpf_prog_free_deferred(struct work_struct *work) { struct bpf_prog_aux *aux; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 51bee695d32c..0048cb24ba7b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1272,6 +1272,37 @@ static int bpf_prog_detach(const union bpf_attr *attr) return ret; } +#define BPF_PROG_QUERY_LAST_FIELD query.prog_cnt + +static int bpf_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + struct cgroup *cgrp; + int ret; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (CHECK_ATTR(BPF_PROG_QUERY)) + return -EINVAL; + if (attr->query.query_flags & ~BPF_F_QUERY_EFFECTIVE) + return -EINVAL; + + switch (attr->query.attach_type) { + case BPF_CGROUP_INET_INGRESS: + case BPF_CGROUP_INET_EGRESS: + case BPF_CGROUP_INET_SOCK_CREATE: + case BPF_CGROUP_SOCK_OPS: + break; + default: + return -EINVAL; + } + cgrp = cgroup_get_from_fd(attr->query.target_fd); + if (IS_ERR(cgrp)) + return PTR_ERR(cgrp); + ret = cgroup_bpf_query(cgrp, attr, uattr); + cgroup_put(cgrp); + return ret; +} #endif /* CONFIG_CGROUP_BPF */ #define BPF_PROG_TEST_RUN_LAST_FIELD test.duration @@ -1568,6 +1599,9 @@ 
SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_PROG_DETACH: err = bpf_prog_detach(&attr); break; + case BPF_PROG_QUERY: + err = bpf_prog_query(&attr, uattr); + break; #endif case BPF_PROG_TEST_RUN: err = bpf_prog_test_run(&attr, uattr); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 57eb866ae78d..269512b94a94 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5761,4 +5761,14 @@ int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, mutex_unlock(&cgroup_mutex); return ret; } +int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + int ret; + + mutex_lock(&cgroup_mutex); + ret = __cgroup_bpf_query(cgrp, attr, uattr); + mutex_unlock(&cgroup_mutex); + return ret; +} #endif /* CONFIG_CGROUP_BPF */ -- cgit v1.2.3 From 6621dd29eb9b5e6774ec7a9a75161352fdea47fc Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Tue, 3 Oct 2017 13:53:23 +0200 Subject: dev: advertise the new nsid when the netns iface changes x-netns interfaces are bound to two netns: the link netns and the upper netns. Usually, this kind of interface is created in the link netns and then moved to the upper netns. At the end, the interface is visible only in the upper netns. The link nsid is advertised via netlink in the upper netns, thus the user always knows where the link part is. There is no such mechanism in the link netns. When the interface is moved to another netns, the user cannot "follow" it. This patch adds a new netlink attribute which helps the user follow an interface that moves to another netns. When the interface is unregistered, the new nsid is advertised. If the interface is an x-netns interface (i.e. rtnl_link_ops->get_link_net is defined), the nsid is allocated if needed. CC: Jason A. Donenfeld Signed-off-by: Nicolas Dichtel Signed-off-by: David S. 
Miller --- include/linux/rtnetlink.h | 4 +++- include/uapi/linux/if_link.h | 1 + net/core/dev.c | 11 ++++++++--- net/core/rtnetlink.c | 31 ++++++++++++++++++++++--------- 4 files changed, 34 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index dea59c8eec54..1251638e60d3 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -17,9 +17,11 @@ extern int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, long expires, u32 error); void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change, gfp_t flags); +void rtmsg_ifinfo_newnet(int type, struct net_device *dev, unsigned int change, + gfp_t flags, int *new_nsid); struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, unsigned change, u32 event, - gfp_t flags); + gfp_t flags, int *new_nsid); void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index ea87bd708ee9..cd580fc0e58f 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -158,6 +158,7 @@ enum { IFLA_PAD, IFLA_XDP, IFLA_EVENT, + IFLA_NEW_NETNSID, __IFLA_MAX }; diff --git a/net/core/dev.c b/net/core/dev.c index 454f05441546..bffc75429184 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -145,6 +145,7 @@ #include #include #include +#include #include "net-sysfs.h" @@ -7204,7 +7205,7 @@ static void rollback_registered_many(struct list_head *head) if (!dev->rtnl_link_ops || dev->rtnl_link_state == RTNL_LINK_INITIALIZED) skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0, - GFP_KERNEL); + GFP_KERNEL, NULL); /* * Flush the unicast and multicast chains @@ -8291,7 +8292,7 @@ EXPORT_SYMBOL(unregister_netdev); int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat) { - int err; + int err, new_nsid; ASSERT_RTNL(); @@ -8347,7 +8348,11 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char call_netdevice_notifiers(NETDEV_UNREGISTER, dev); rcu_barrier(); call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); - rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL); + if (dev->rtnl_link_ops && dev->rtnl_link_ops->get_link_net) + new_nsid = peernet2id_alloc(dev_net(dev), net); + else + new_nsid = peernet2id(dev_net(dev), net); + rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid); /* * Flush the unicast and multicast chains diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 3fb1ca33cba4..1ee98b1369d5 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -915,6 +915,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(IFNAMSIZ) /* IFLA_PHYS_PORT_NAME */ + rtnl_xdp_size() /* IFLA_XDP */ + nla_total_size(4) /* IFLA_EVENT */ + + nla_total_size(4) /* IFLA_NEW_NETNSID */ + nla_total_size(1); /* IFLA_PROTO_DOWN */ } @@ -1384,7 +1385,7 @@ static int rtnl_fill_link_netnsid(struct sk_buff *skb, static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, int type, u32 pid, u32 seq, u32 change, unsigned int flags, u32 ext_filter_mask, - u32 event) + u32 event, int *new_nsid) { struct ifinfomsg *ifm; struct nlmsghdr *nlh; @@ -1472,6 +1473,10 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, if (rtnl_fill_link_netnsid(skb, dev)) goto nla_put_failure; + if (new_nsid && + nla_put_s32(skb, IFLA_NEW_NETNSID, *new_nsid) < 0) + goto nla_put_failure; + if (!(af_spec = nla_nest_start(skb, 
IFLA_AF_SPEC))) goto nla_put_failure; @@ -1701,7 +1706,7 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, 0, flags, - ext_filter_mask, 0); + ext_filter_mask, 0, NULL); if (err < 0) { if (likely(skb->len)) @@ -2808,7 +2813,7 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh, return -ENOBUFS; err = rtnl_fill_ifinfo(nskb, dev, RTM_NEWLINK, NETLINK_CB(skb).portid, - nlh->nlmsg_seq, 0, 0, ext_filter_mask, 0); + nlh->nlmsg_seq, 0, 0, ext_filter_mask, 0, NULL); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_size */ WARN_ON(err == -EMSGSIZE); @@ -2893,7 +2898,7 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, unsigned int change, - u32 event, gfp_t flags) + u32 event, gfp_t flags, int *new_nsid) { struct net *net = dev_net(dev); struct sk_buff *skb; @@ -2904,7 +2909,8 @@ struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, if (skb == NULL) goto errout; - err = rtnl_fill_ifinfo(skb, dev, type, 0, 0, change, 0, 0, event); + err = rtnl_fill_ifinfo(skb, dev, type, 0, 0, change, 0, 0, event, + new_nsid); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); @@ -2927,14 +2933,14 @@ void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags) static void rtmsg_ifinfo_event(int type, struct net_device *dev, unsigned int change, u32 event, - gfp_t flags) + gfp_t flags, int *new_nsid) { struct sk_buff *skb; if (dev->reg_state != NETREG_REGISTERED) return; - skb = rtmsg_ifinfo_build_skb(type, dev, change, event, flags); + skb = rtmsg_ifinfo_build_skb(type, dev, change, event, flags, new_nsid); if (skb) rtmsg_ifinfo_send(skb, dev, flags); } @@ -2942,10 +2948,17 @@ static void rtmsg_ifinfo_event(int type, struct net_device *dev, void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, gfp_t flags) { - rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags); + rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags, NULL); } EXPORT_SYMBOL(rtmsg_ifinfo); +void rtmsg_ifinfo_newnet(int type, struct net_device *dev, unsigned int change, + gfp_t flags, int *new_nsid) +{ + rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags, + new_nsid); +} + static int nlmsg_populate_fdb_fill(struct sk_buff *skb, struct net_device *dev, u8 *addr, u16 vid, u32 pid, u32 seq, @@ -4321,7 +4334,7 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi case NETDEV_RESEND_IGMP: case NETDEV_CHANGEINFODATA: rtmsg_ifinfo_event(RTM_NEWLINK, dev, 0, rtnl_get_event(event), - GFP_KERNEL); + GFP_KERNEL, NULL); break; default: break; -- cgit v1.2.3 From 51d0c04795a4b5d9a188336884887a9d394a94b0 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 4 Oct 2017 17:48:45 -0700 Subject: net: Add extack to netdev_notifier_info Add netlink_ext_ack to netdev_notifier_info to allow notifier handlers to return errors to userspace. Clean up the initialization in dev.c such that extack is easily added in subsequent patches where relevant. Specifically, remove the init call in call_netdevice_notifiers_info and have callers initialize on stack when info is declared. Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 10 +++++- net/core/dev.c | 79 ++++++++++++++++++++++++++++------------------- 2 files changed, 56 insertions(+), 33 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d04424cfffba..05fcaba4b0d9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2309,7 +2309,8 @@ int register_netdevice_notifier(struct notifier_block *nb); int unregister_netdevice_notifier(struct notifier_block *nb); struct netdev_notifier_info { - struct net_device *dev; + struct net_device *dev; + struct netlink_ext_ack *extack; }; struct netdev_notifier_change_info { @@ -2334,6 +2335,7 @@ static inline void netdev_notifier_info_init(struct netdev_notifier_info *info, struct net_device *dev) { info->dev = dev; + info->extack = NULL; } static inline struct net_device * @@ -2342,6 +2344,12 @@ netdev_notifier_info_to_dev(const struct netdev_notifier_info *info) return info->dev; } +static inline struct netlink_ext_ack * +netdev_notifier_info_to_extack(const struct netdev_notifier_info *info) +{ + return info->extack; +} + int call_netdevice_notifiers(unsigned long val, struct net_device *dev); diff --git a/net/core/dev.c b/net/core/dev.c index bffc75429184..e27a6bc0ac4d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -163,7 +163,6 @@ static struct list_head offload_base __read_mostly; static int netif_rx_internal(struct sk_buff *skb); static int call_netdevice_notifiers_info(unsigned long val, - struct net_device *dev, struct netdev_notifier_info *info); static struct napi_struct *napi_by_id(unsigned int napi_id); @@ -1339,10 +1338,11 @@ EXPORT_SYMBOL(netdev_features_change); void netdev_state_change(struct net_device *dev) { if (dev->flags & IFF_UP) { - struct netdev_notifier_change_info change_info; + struct netdev_notifier_change_info change_info = { + .info.dev = dev, + }; - change_info.flags_changed = 0; - call_netdevice_notifiers_info(NETDEV_CHANGE, dev, + call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info); rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL); } @@ -1563,9 +1563,10 @@ EXPORT_SYMBOL(dev_disable_lro); static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val, struct net_device *dev) { - struct netdev_notifier_info info; + struct netdev_notifier_info info = { + .dev = dev, + }; - netdev_notifier_info_init(&info, dev); return nb->notifier_call(nb, val, &info); } @@ -1690,11 +1691,9 @@ EXPORT_SYMBOL(unregister_netdevice_notifier); */ static int call_netdevice_notifiers_info(unsigned long val, - struct net_device *dev, struct netdev_notifier_info *info) { ASSERT_RTNL(); - netdev_notifier_info_init(info, dev); return raw_notifier_call_chain(&netdev_chain, val, info); } @@ -1709,9 +1708,11 @@ static int call_netdevice_notifiers_info(unsigned long val, int call_netdevice_notifiers(unsigned long val, struct net_device *dev) { - struct netdev_notifier_info info; + struct netdev_notifier_info info = { + .dev = dev, + }; - return call_netdevice_notifiers_info(val, dev, &info); + return call_netdevice_notifiers_info(val, &info); } EXPORT_SYMBOL(call_netdevice_notifiers); @@ -6278,7 +6279,15 @@ static int __netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, bool master, void *upper_priv, void *upper_info) { - struct netdev_notifier_changeupper_info changeupper_info; + struct netdev_notifier_changeupper_info changeupper_info = { + .info = { + .dev = dev, + }, + .upper_dev = upper_dev, + .master = master, + .linking = true, + .upper_info = 
upper_info, + }; int ret = 0; ASSERT_RTNL(); @@ -6296,12 +6305,7 @@ static int __netdev_upper_dev_link(struct net_device *dev, if (master && netdev_master_upper_dev_get(dev)) return -EBUSY; - changeupper_info.upper_dev = upper_dev; - changeupper_info.master = master; - changeupper_info.linking = true; - changeupper_info.upper_info = upper_info; - - ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev, + ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, &changeupper_info.info); ret = notifier_to_errno(ret); if (ret) @@ -6312,7 +6316,7 @@ static int __netdev_upper_dev_link(struct net_device *dev, if (ret) return ret; - ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev, + ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, &changeupper_info.info); ret = notifier_to_errno(ret); if (ret) @@ -6376,20 +6380,24 @@ EXPORT_SYMBOL(netdev_master_upper_dev_link); void netdev_upper_dev_unlink(struct net_device *dev, struct net_device *upper_dev) { - struct netdev_notifier_changeupper_info changeupper_info; + struct netdev_notifier_changeupper_info changeupper_info = { + .info = { + .dev = dev, + }, + .upper_dev = upper_dev, + .linking = false, + }; ASSERT_RTNL(); - changeupper_info.upper_dev = upper_dev; changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev; - changeupper_info.linking = false; - call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev, + call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, &changeupper_info.info); __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); - call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev, + call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, &changeupper_info.info); } EXPORT_SYMBOL(netdev_upper_dev_unlink); @@ -6405,11 +6413,13 @@ EXPORT_SYMBOL(netdev_upper_dev_unlink); void netdev_bonding_info_change(struct net_device *dev, struct netdev_bonding_info *bonding_info) { - struct netdev_notifier_bonding_info info; + struct netdev_notifier_bonding_info info = { + .info.dev = dev, + }; memcpy(&info.bonding_info, bonding_info, sizeof(struct netdev_bonding_info)); - call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev, + call_netdevice_notifiers_info(NETDEV_BONDING_INFO, &info.info); } EXPORT_SYMBOL(netdev_bonding_info_change); @@ -6535,11 +6545,13 @@ EXPORT_SYMBOL(dev_get_nest_level); void netdev_lower_state_changed(struct net_device *lower_dev, void *lower_state_info) { - struct netdev_notifier_changelowerstate_info changelowerstate_info; + struct netdev_notifier_changelowerstate_info changelowerstate_info = { + .info.dev = lower_dev, + }; ASSERT_RTNL(); changelowerstate_info.lower_state_info = lower_state_info; - call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, lower_dev, + call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, &changelowerstate_info.info); } EXPORT_SYMBOL(netdev_lower_state_changed); @@ -6830,11 +6842,14 @@ void __dev_notify_flags(struct net_device *dev, unsigned int old_flags, if (dev->flags & IFF_UP && (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) { - struct netdev_notifier_change_info change_info; - - change_info.flags_changed = changes; - call_netdevice_notifiers_info(NETDEV_CHANGE, dev, - &change_info.info); + struct netdev_notifier_change_info change_info = { + .info = { + .dev = dev, + }, + .flags_changed = changes, + }; + + call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info); } } -- cgit v1.2.3 From 33eaf2a6eb48ebf00374aaaf4b1b43f9950dcbe4 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 4 Oct 2017 17:48:46 -0700 Subject: net: Add 
extack to ndo_add_slave Pass extack to do_set_master and down to ndo_add_slave Signed-off-by: David Ahern Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 5 +++-- drivers/net/bonding/bond_options.c | 2 +- drivers/net/team/team.c | 3 ++- drivers/net/vrf.c | 3 ++- include/linux/netdevice.h | 3 ++- include/net/bonding.h | 3 ++- net/batman-adv/soft-interface.c | 3 ++- net/bridge/br_device.c | 3 ++- net/core/rtnetlink.c | 10 ++++++---- 9 files changed, 22 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index b19dc033fb36..78feb94a36db 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1328,7 +1328,8 @@ void bond_lower_state_changed(struct slave *slave) } /* enslave device to bond device */ -int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev) +int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev, + struct netlink_ext_ack *extack) { struct bonding *bond = netdev_priv(bond_dev); const struct net_device_ops *slave_ops = slave_dev->netdev_ops; @@ -3492,7 +3493,7 @@ static int bond_do_ioctl(struct net_device *bond_dev, struct ifreq *ifr, int cmd switch (cmd) { case BOND_ENSLAVE_OLD: case SIOCBONDENSLAVE: - res = bond_enslave(bond_dev, slave_dev); + res = bond_enslave(bond_dev, slave_dev, NULL); break; case BOND_RELEASE_OLD: case SIOCBONDRELEASE: diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c index 5931aa2fe997..8a9b085c2a98 100644 --- a/drivers/net/bonding/bond_options.c +++ b/drivers/net/bonding/bond_options.c @@ -1383,7 +1383,7 @@ static int bond_option_slaves_set(struct bonding *bond, switch (command[0]) { case '+': netdev_dbg(bond->dev, "Adding slave %s\n", dev->name); - ret = bond_enslave(bond->dev, dev); + ret = bond_enslave(bond->dev, dev, NULL); break; case '-': diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index ae53e899259f..4359d45aa131 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -1914,7 +1914,8 @@ static int team_netpoll_setup(struct net_device *dev, } #endif -static int team_add_slave(struct net_device *dev, struct net_device *port_dev) +static int team_add_slave(struct net_device *dev, struct net_device *port_dev, + struct netlink_ext_ack *extack) { struct team *team = netdev_priv(dev); int err; diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index cc18b7b11612..4a082ef53533 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -788,7 +788,8 @@ err: return ret; } -static int vrf_add_slave(struct net_device *dev, struct net_device *port_dev) +static int vrf_add_slave(struct net_device *dev, struct net_device *port_dev, + struct netlink_ext_ack *extack) { if (netif_is_l3_master(port_dev) || netif_is_l3_slave(port_dev)) return -EINVAL; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 05fcaba4b0d9..368a5064a487 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1246,7 +1246,8 @@ struct net_device_ops { u32 flow_id); #endif int (*ndo_add_slave)(struct net_device *dev, - struct net_device *slave_dev); + struct net_device *slave_dev, + struct netlink_ext_ack *extack); int (*ndo_del_slave)(struct net_device *dev, struct net_device *slave_dev); netdev_features_t (*ndo_fix_features)(struct net_device *dev, diff --git a/include/net/bonding.h b/include/net/bonding.h index b2e68657a216..2860cc66c2bb 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -596,7 +596,8 @@ 
void bond_destroy_sysfs(struct bond_net *net); void bond_prepare_sysfs_group(struct bonding *bond); int bond_sysfs_slave_add(struct slave *slave); void bond_sysfs_slave_del(struct slave *slave); -int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev); +int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev, + struct netlink_ext_ack *extack); int bond_release(struct net_device *bond_dev, struct net_device *slave_dev); u32 bond_xmit_hash(struct bonding *bond, struct sk_buff *skb); int bond_set_carrier(struct bonding *bond); diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index c2c986746d0b..e7d5fbb6ad53 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -867,7 +867,8 @@ free_bat_counters: * Return: 0 if successful or error otherwise. */ static int batadv_softif_slave_add(struct net_device *dev, - struct net_device *slave_dev) + struct net_device *slave_dev, + struct netlink_ext_ack *extack) { struct batadv_hard_iface *hard_iface; struct net *net = dev_net(dev); diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index f6b6a92f1c48..cb0131d70ab1 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -320,7 +320,8 @@ void br_netpoll_disable(struct net_bridge_port *p) #endif -static int br_add_slave(struct net_device *dev, struct net_device *slave_dev) +static int br_add_slave(struct net_device *dev, struct net_device *slave_dev, + struct netlink_ext_ack *extack) { struct net_bridge *br = netdev_priv(dev); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 1ee98b1369d5..c5ee429bcce9 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1957,7 +1957,8 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb) return err; } -static int do_set_master(struct net_device *dev, int ifindex) +static int do_set_master(struct net_device *dev, int ifindex, + struct netlink_ext_ack *extack) { struct net_device *upper_dev = netdev_master_upper_dev_get(dev); const struct net_device_ops *ops; @@ -1982,7 +1983,7 @@ static int do_set_master(struct net_device *dev, int ifindex) return -EINVAL; ops = upper_dev->netdev_ops; if (ops->ndo_add_slave) { - err = ops->ndo_add_slave(upper_dev, dev); + err = ops->ndo_add_slave(upper_dev, dev, extack); if (err) return err; } else { @@ -2115,7 +2116,7 @@ static int do_setlink(const struct sk_buff *skb, } if (tb[IFLA_MASTER]) { - err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER])); + err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), extack); if (err) goto errout; status |= DO_SETLINK_MODIFIED; @@ -2753,7 +2754,8 @@ replay: goto out_unregister; } if (tb[IFLA_MASTER]) { - err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER])); + err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), + extack); if (err) goto out_unregister; } -- cgit v1.2.3 From 42ab19ee90292993370a30ad242599d75a3b749e Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 4 Oct 2017 17:48:47 -0700 Subject: net: Add extack to upper device linking Add extack arg to netdev_upper_dev_link and netdev_master_upper_dev_link Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- drivers/net/bonding/bond_main.c | 7 ++++--- drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c | 2 +- drivers/net/hyperv/netvsc_drv.c | 2 +- drivers/net/ipvlan/ipvlan_main.c | 2 +- drivers/net/macsec.c | 2 +- drivers/net/macvlan.c | 7 ++++--- drivers/net/macvtap.c | 2 +- drivers/net/team/team.c | 2 +- drivers/net/usb/qmi_wwan.c | 2 +- drivers/net/vrf.c | 7 ++++--- include/linux/if_macvlan.h | 3 ++- include/linux/netdevice.h | 6 ++++-- net/8021q/vlan.c | 6 +++--- net/8021q/vlan.h | 2 +- net/8021q/vlan_netlink.c | 2 +- net/batman-adv/hard-interface.c | 2 +- net/bridge/br_if.c | 2 +- net/core/dev.c | 15 ++++++++++----- net/openvswitch/vport-netdev.c | 3 ++- 19 files changed, 44 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 78feb94a36db..bc92307c2082 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1217,14 +1217,15 @@ static enum netdev_lag_tx_type bond_lag_tx_type(struct bonding *bond) } } -static int bond_master_upper_dev_link(struct bonding *bond, struct slave *slave) +static int bond_master_upper_dev_link(struct bonding *bond, struct slave *slave, + struct netlink_ext_ack *extack) { struct netdev_lag_upper_info lag_upper_info; int err; lag_upper_info.tx_type = bond_lag_tx_type(bond); err = netdev_master_upper_dev_link(slave->dev, bond->dev, slave, - &lag_upper_info); + &lag_upper_info, extack); if (err) return err; rtmsg_ifinfo(RTM_NEWLINK, slave->dev, IFF_SLAVE, GFP_KERNEL); @@ -1710,7 +1711,7 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev, goto err_detach; } - res = bond_master_upper_dev_link(bond, new_slave); + res = bond_master_upper_dev_link(bond, new_slave, extack); if (res) { netdev_dbg(bond_dev, "Error %d calling bond_master_upper_dev_link\n", res); goto err_unregister; diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c index 98f22551eb45..1af326a60cbb 100644 --- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c @@ -178,7 +178,7 @@ static int rmnet_newlink(struct net *src_net, struct net_device *dev, if (err) goto err1; - err = netdev_master_upper_dev_link(dev, real_dev, NULL, NULL); + err = netdev_master_upper_dev_link(dev, real_dev, NULL, NULL, extack); if (err) goto err2; diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index f300ae61c6c6..dfb986421ec6 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -1748,7 +1748,7 @@ static int netvsc_vf_join(struct net_device *vf_netdev, goto rx_handler_failed; } - ret = netdev_upper_dev_link(vf_netdev, ndev); + ret = netdev_upper_dev_link(vf_netdev, ndev, NULL); if (ret != 0) { netdev_err(vf_netdev, "can not set master device %s (err = %d)\n", diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index c74893c1e620..57c3856bab05 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -584,7 +584,7 @@ int ipvlan_link_new(struct net *src_net, struct net_device *dev, if (err < 0) goto remove_ida; - err = netdev_upper_dev_link(phy_dev, dev); + err = netdev_upper_dev_link(phy_dev, dev, extack); if (err) { goto unregister_netdev; } diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c index 98e4deaa3a6a..ccbe4eaffe4d 100644 --- a/drivers/net/macsec.c +++ b/drivers/net/macsec.c @@ -3244,7 +3244,7 @@ static int 
macsec_newlink(struct net *net, struct net_device *dev, &macsec_netdev_addr_lock_key, macsec_get_nest_level(dev)); - err = netdev_upper_dev_link(real_dev, dev); + err = netdev_upper_dev_link(real_dev, dev, extack); if (err < 0) goto unregister; diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index 1ffe77e95d46..858bd66511a2 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -1344,7 +1344,8 @@ static int macvlan_changelink_sources(struct macvlan_dev *vlan, u32 mode, } int macvlan_common_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[]) + struct nlattr *tb[], struct nlattr *data[], + struct netlink_ext_ack *extack) { struct macvlan_dev *vlan = netdev_priv(dev); struct macvlan_port *port; @@ -1433,7 +1434,7 @@ int macvlan_common_newlink(struct net *src_net, struct net_device *dev, goto destroy_macvlan_port; dev->priv_flags |= IFF_MACVLAN; - err = netdev_upper_dev_link(lowerdev, dev); + err = netdev_upper_dev_link(lowerdev, dev, extack); if (err) goto unregister_netdev; @@ -1456,7 +1457,7 @@ static int macvlan_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { - return macvlan_common_newlink(src_net, dev, tb, data); + return macvlan_common_newlink(src_net, dev, tb, data, extack); } void macvlan_dellink(struct net_device *dev, struct list_head *head) diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c index c2d0ea2fb019..f62aea2fcfa9 100644 --- a/drivers/net/macvtap.c +++ b/drivers/net/macvtap.c @@ -105,7 +105,7 @@ static int macvtap_newlink(struct net *src_net, struct net_device *dev, /* Don't put anything that may fail after macvlan_common_newlink * because we can't undo what it does. */ - err = macvlan_common_newlink(src_net, dev, tb, data); + err = macvlan_common_newlink(src_net, dev, tb, data, extack); if (err) { netdev_rx_handler_unregister(dev); return err; diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index 4359d45aa131..a468439969df 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -1112,7 +1112,7 @@ static int team_upper_dev_link(struct team *team, struct team_port *port) lag_upper_info.tx_type = team->mode->lag_tx_type; err = netdev_master_upper_dev_link(port->dev, team->dev, NULL, - &lag_upper_info); + &lag_upper_info, NULL); if (err) return err; port->dev->priv_flags |= IFF_TEAM_PORT; diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index 8c3733608271..db7279d5b250 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -221,7 +221,7 @@ static int qmimux_register_device(struct net_device *real_dev, u8 mux_id) /* Account for reference in struct qmimux_priv_priv */ dev_hold(real_dev); - err = netdev_upper_dev_link(real_dev, new_dev); + err = netdev_upper_dev_link(real_dev, new_dev, NULL); if (err) goto out_unregister_netdev; diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 4a082ef53533..77d0655a0250 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -764,7 +764,8 @@ static void cycle_netdev(struct net_device *dev) } } -static int do_vrf_add_slave(struct net_device *dev, struct net_device *port_dev) +static int do_vrf_add_slave(struct net_device *dev, struct net_device *port_dev, + struct netlink_ext_ack *extack) { int ret; @@ -775,7 +776,7 @@ static int do_vrf_add_slave(struct net_device *dev, struct net_device *port_dev) return -EOPNOTSUPP; port_dev->priv_flags |= IFF_L3MDEV_SLAVE; - ret = netdev_master_upper_dev_link(port_dev, dev, 
NULL, NULL); + ret = netdev_master_upper_dev_link(port_dev, dev, NULL, NULL, extack); if (ret < 0) goto err; @@ -794,7 +795,7 @@ static int vrf_add_slave(struct net_device *dev, struct net_device *port_dev, if (netif_is_l3_master(port_dev) || netif_is_l3_slave(port_dev)) return -EINVAL; - return do_vrf_add_slave(dev, port_dev); + return do_vrf_add_slave(dev, port_dev, extack); } /* inverse of do_vrf_add_slave */ diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h index c9ec1343d187..10e319f41fb1 100644 --- a/include/linux/if_macvlan.h +++ b/include/linux/if_macvlan.h @@ -72,7 +72,8 @@ static inline void macvlan_count_rx(const struct macvlan_dev *vlan, extern void macvlan_common_setup(struct net_device *dev); extern int macvlan_common_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[]); + struct nlattr *tb[], struct nlattr *data[], + struct netlink_ext_ack *extack); extern void macvlan_count_rx(const struct macvlan_dev *vlan, unsigned int len, bool success, diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 368a5064a487..31bb3010c69b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3919,10 +3919,12 @@ void *netdev_adjacent_get_private(struct list_head *adj_list); void *netdev_lower_get_first_private_rcu(struct net_device *dev); struct net_device *netdev_master_upper_dev_get(struct net_device *dev); struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev); -int netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev); +int netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, + struct netlink_ext_ack *extack); int netdev_master_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, - void *upper_priv, void *upper_info); + void *upper_priv, void *upper_info, + struct netlink_ext_ack *extack); void netdev_upper_dev_unlink(struct net_device *dev, struct net_device *upper_dev); void netdev_adjacent_rename_links(struct net_device *dev, char *oldname); diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 9649579b5b9f..71c3e045505b 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -138,7 +138,7 @@ int vlan_check_real_dev(struct net_device *real_dev, return 0; } -int register_vlan_dev(struct net_device *dev) +int register_vlan_dev(struct net_device *dev, struct netlink_ext_ack *extack) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct net_device *real_dev = vlan->real_dev; @@ -174,7 +174,7 @@ int register_vlan_dev(struct net_device *dev) if (err < 0) goto out_uninit_mvrp; - err = netdev_upper_dev_link(real_dev, dev); + err = netdev_upper_dev_link(real_dev, dev, extack); if (err) goto out_unregister_netdev; @@ -270,7 +270,7 @@ static int register_vlan_device(struct net_device *real_dev, u16 vlan_id) vlan->flags = VLAN_FLAG_REORDER_HDR; new_dev->rtnl_link_ops = &vlan_link_ops; - err = register_vlan_dev(new_dev); + err = register_vlan_dev(new_dev, NULL); if (err < 0) goto out_free_newdev; diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h index df8bd65dd370..94f8eed9f9b3 100644 --- a/net/8021q/vlan.h +++ b/net/8021q/vlan.h @@ -107,7 +107,7 @@ void vlan_dev_get_realdev_name(const struct net_device *dev, char *result); int vlan_check_real_dev(struct net_device *real_dev, __be16 protocol, u16 vlan_id); void vlan_setup(struct net_device *dev); -int register_vlan_dev(struct net_device *dev); +int register_vlan_dev(struct net_device *dev, struct netlink_ext_ack *extack); void unregister_vlan_dev(struct net_device 
*dev, struct list_head *head); bool vlan_dev_inherit_address(struct net_device *dev, struct net_device *real_dev); diff --git a/net/8021q/vlan_netlink.c b/net/8021q/vlan_netlink.c index 5e831de3103e..6e7c5a6a7930 100644 --- a/net/8021q/vlan_netlink.c +++ b/net/8021q/vlan_netlink.c @@ -160,7 +160,7 @@ static int vlan_newlink(struct net *src_net, struct net_device *dev, if (err < 0) return err; - return register_vlan_dev(dev); + return register_vlan_dev(dev, extack); } static inline size_t vlan_qos_map_size(unsigned int n) diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index e348f76ea8c1..f7b413b9297e 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -738,7 +738,7 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, bat_priv = netdev_priv(hard_iface->soft_iface); ret = netdev_master_upper_dev_link(hard_iface->net_dev, - soft_iface, NULL, NULL); + soft_iface, NULL, NULL, NULL); if (ret) goto err_dev; diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index f3aef22931ab..0a3fd727048d 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -540,7 +540,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) dev->priv_flags |= IFF_BRIDGE_PORT; - err = netdev_master_upper_dev_link(dev, br->dev, NULL, NULL); + err = netdev_master_upper_dev_link(dev, br->dev, NULL, NULL, NULL); if (err) goto err5; diff --git a/net/core/dev.c b/net/core/dev.c index e27a6bc0ac4d..fcddccb6be41 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6277,11 +6277,13 @@ static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev, static int __netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, bool master, - void *upper_priv, void *upper_info) + void *upper_priv, void *upper_info, + struct netlink_ext_ack *extack) { struct netdev_notifier_changeupper_info changeupper_info = { .info = { .dev = dev, + .extack = extack, }, .upper_dev = upper_dev, .master = master, @@ -6341,9 +6343,11 @@ rollback: * returns zero. 
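
To make the new parameter concrete: a minimal sketch of a driver-side ndo_add_slave using extack (foo_add_slave and its bridge check are illustrative inventions, not code from these patches):

static int foo_add_slave(struct net_device *dev,
			 struct net_device *slave_dev,
			 struct netlink_ext_ack *extack)
{
	if (netif_is_bridge_master(slave_dev)) {
		NL_SET_ERR_MSG(extack, "Enslaving a bridge is not supported");
		return -EOPNOTSUPP;
	}

	/* Pass extack down so NETDEV_PRECHANGEUPPER listeners can also
	 * fill in a message when they veto the link.
	 */
	return netdev_master_upper_dev_link(slave_dev, dev, NULL, NULL,
					    extack);
}

With this plumbing in place, a failed "ip link set DEV master MASTER" can report the specific reason to the caller instead of a bare errno.
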
*/ int netdev_upper_dev_link(struct net_device *dev, - struct net_device *upper_dev) + struct net_device *upper_dev, + struct netlink_ext_ack *extack) { - return __netdev_upper_dev_link(dev, upper_dev, false, NULL, NULL); + return __netdev_upper_dev_link(dev, upper_dev, false, + NULL, NULL, extack); } EXPORT_SYMBOL(netdev_upper_dev_link); @@ -6362,10 +6366,11 @@ EXPORT_SYMBOL(netdev_upper_dev_link); */ int netdev_master_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, - void *upper_priv, void *upper_info) + void *upper_priv, void *upper_info, + struct netlink_ext_ack *extack) { return __netdev_upper_dev_link(dev, upper_dev, true, - upper_priv, upper_info); + upper_priv, upper_info, extack); } EXPORT_SYMBOL(netdev_master_upper_dev_link); diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c index 0389398fa4ab..2e5e7a41d8ef 100644 --- a/net/openvswitch/vport-netdev.c +++ b/net/openvswitch/vport-netdev.c @@ -108,7 +108,8 @@ struct vport *ovs_netdev_link(struct vport *vport, const char *name) rtnl_lock(); err = netdev_master_upper_dev_link(vport->dev, - get_dpdev(vport->dp), NULL, NULL); + get_dpdev(vport->dp), + NULL, NULL, NULL); if (err) goto error_unlock; -- cgit v1.2.3 From 44f209807ee87a5eddf6c0432f3fb63cec27bad8 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 5 Oct 2017 16:46:50 -0400 Subject: VSOCK: export socket tables for sock_diag interface The socket table symbols need to be exported from vsock.ko so that the vsock_diag.ko module will be able to traverse sockets. Signed-off-by: Stefan Hajnoczi Signed-off-by: David S. Miller --- include/net/af_vsock.h | 5 +++++ net/vmw_vsock/af_vsock.c | 10 ++++++---- 2 files changed, 11 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index f9fb566e75cf..30cba806e344 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -27,6 +27,11 @@ #define LAST_RESERVED_PORT 1023 +#define VSOCK_HASH_SIZE 251 +extern struct list_head vsock_bind_table[VSOCK_HASH_SIZE + 1]; +extern struct list_head vsock_connected_table[VSOCK_HASH_SIZE]; +extern spinlock_t vsock_table_lock; + #define vsock_sk(__sk) ((struct vsock_sock *)__sk) #define sk_vsock(__vsk) (&(__vsk)->sk) diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index dfc8c51e4d74..9afe4da8c67d 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -153,7 +153,6 @@ EXPORT_SYMBOL_GPL(vm_sockets_get_local_cid); * vsock_bind_table[VSOCK_HASH_SIZE] is for unbound sockets. The hash function * mods with VSOCK_HASH_SIZE to ensure this. */ -#define VSOCK_HASH_SIZE 251 #define MAX_PORT_RETRIES 24 #define VSOCK_HASH(addr) ((addr)->svm_port % VSOCK_HASH_SIZE) @@ -168,9 +167,12 @@ EXPORT_SYMBOL_GPL(vm_sockets_get_local_cid); #define vsock_connected_sockets_vsk(vsk) \ vsock_connected_sockets(&(vsk)->remote_addr, &(vsk)->local_addr) -static struct list_head vsock_bind_table[VSOCK_HASH_SIZE + 1]; -static struct list_head vsock_connected_table[VSOCK_HASH_SIZE]; -static DEFINE_SPINLOCK(vsock_table_lock); +struct list_head vsock_bind_table[VSOCK_HASH_SIZE + 1]; +EXPORT_SYMBOL_GPL(vsock_bind_table); +struct list_head vsock_connected_table[VSOCK_HASH_SIZE]; +EXPORT_SYMBOL_GPL(vsock_connected_table); +DEFINE_SPINLOCK(vsock_table_lock); +EXPORT_SYMBOL_GPL(vsock_table_lock); /* Autobind this socket to the local address if necessary. 
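
A short sketch of the traversal these exports make possible from a separate module such as vsock_diag.ko (example_count_bound is an invented name; the real consumer is vsock_diag_dump(), added later in this series):

static unsigned int example_count_bound(void)
{
	struct vsock_sock *vsk;
	unsigned int bucket, n = 0;

	spin_lock_bh(&vsock_table_lock);
	for (bucket = 0; bucket < ARRAY_SIZE(vsock_bind_table); bucket++)
		list_for_each_entry(vsk, &vsock_bind_table[bucket],
				    bound_table)
			n++;	/* a diag module would emit a netlink record here */
	spin_unlock_bh(&vsock_table_lock);

	return n;
}
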
*/ static int vsock_auto_bind(struct vsock_sock *vsk) -- cgit v1.2.3 From bf359b8127719535f88494adb3c2b73c06667dcd Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 5 Oct 2017 16:46:51 -0400 Subject: VSOCK: move __vsock_in_bound/connected_table() to af_vsock.h The vsock_diag.ko module will need to check socket table membership. Signed-off-by: Stefan Hajnoczi Signed-off-by: David S. Miller --- include/net/af_vsock.h | 12 ++++++++++++ net/vmw_vsock/af_vsock.c | 10 ---------- 2 files changed, 12 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index 30cba806e344..3dd217718a2f 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -180,6 +180,18 @@ const struct vsock_transport *vsock_core_get_transport(void); /**** UTILS ****/ +/* vsock_table_lock must be held */ +static inline bool __vsock_in_bound_table(struct vsock_sock *vsk) +{ + return !list_empty(&vsk->bound_table); +} + +/* vsock_table_lock must be held */ +static inline bool __vsock_in_connected_table(struct vsock_sock *vsk) +{ + return !list_empty(&vsk->connected_table); +} + void vsock_release_pending(struct sock *pending); void vsock_add_pending(struct sock *listener, struct sock *pending); void vsock_remove_pending(struct sock *listener, struct sock *pending); diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 9afe4da8c67d..9b179a0081b3 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -250,16 +250,6 @@ static struct sock *__vsock_find_connected_socket(struct sockaddr_vm *src, return NULL; } -static bool __vsock_in_bound_table(struct vsock_sock *vsk) -{ - return !list_empty(&vsk->bound_table); -} - -static bool __vsock_in_connected_table(struct vsock_sock *vsk) -{ - return !list_empty(&vsk->connected_table); -} - static void vsock_insert_unbound(struct vsock_sock *vsk) { spin_lock_bh(&vsock_table_lock); -- cgit v1.2.3 From 3b4477d2dcf2709d0be89e2a8dced3d0f4a017f2 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 5 Oct 2017 16:46:52 -0400 Subject: VSOCK: use TCP state constants for sk_state There are two state fields: socket->state and sock->sk_state. The socket->state field uses SS_UNCONNECTED, SS_CONNECTED, etc while the sock->sk_state typically uses values that match TCP state constants (TCP_CLOSE, TCP_ESTABLISHED). AF_VSOCK does not follow this convention and instead uses SS_* constants for both fields. The sk_state field will be exposed to userspace through the vsock_diag interface for ss(8), netstat(8), and other programs. This patch switches sk_state to TCP state constants so that the meaning of this field is consistent with other address families. Not just AF_INET and AF_INET6 use the TCP constants, AF_UNIX and others do too. The following mapping was used to convert the code: SS_FREE -> TCP_CLOSE SS_UNCONNECTED -> TCP_CLOSE SS_CONNECTING -> TCP_SYN_SENT SS_CONNECTED -> TCP_ESTABLISHED SS_DISCONNECTING -> TCP_CLOSING VSOCK_SS_LISTEN -> TCP_LISTEN In __vsock_create() the sk_state initialization was dropped because sock_init_data() already initializes sk_state to TCP_CLOSE. Signed-off-by: Stefan Hajnoczi Signed-off-by: David S. 
Miller --- include/net/af_vsock.h | 3 -- net/vmw_vsock/af_vsock.c | 46 ++++++++++++++++------------ net/vmw_vsock/hyperv_transport.c | 12 ++++---- net/vmw_vsock/virtio_transport.c | 2 +- net/vmw_vsock/virtio_transport_common.c | 22 ++++++------- net/vmw_vsock/vmci_transport.c | 34 ++++++++++---------- net/vmw_vsock/vmci_transport_notify.c | 2 +- net/vmw_vsock/vmci_transport_notify_qstate.c | 2 +- 8 files changed, 64 insertions(+), 59 deletions(-) (limited to 'include') diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index 3dd217718a2f..9324ac2d9ff2 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -22,9 +22,6 @@ #include "vsock_addr.h" -/* vsock-specific sock->sk_state constants */ -#define VSOCK_SS_LISTEN 255 - #define LAST_RESERVED_PORT 1023 #define VSOCK_HASH_SIZE 251 diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 9b179a0081b3..98359c19522f 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -36,7 +36,7 @@ * not support simultaneous connects (two "client" sockets connecting). * * - "Server" sockets are referred to as listener sockets throughout this - * implementation because they are in the VSOCK_SS_LISTEN state. When a + * implementation because they are in the TCP_LISTEN state. When a * connection request is received (the second kind of socket mentioned above), * we create a new socket and refer to it as a pending socket. These pending * sockets are placed on the pending connection list of the listener socket. @@ -82,6 +82,15 @@ * argument, we must ensure the reference count is increased to ensure the * socket isn't freed before the function is run; the deferred function will * then drop the reference. + * + * - sk->sk_state uses the TCP state constants because they are widely used by + * other address families and exposed to userspace tools like ss(8): + * + * TCP_CLOSE - unconnected + * TCP_SYN_SENT - connecting + * TCP_ESTABLISHED - connected + * TCP_CLOSING - disconnecting + * TCP_LISTEN - listening */ #include @@ -477,7 +486,7 @@ void vsock_pending_work(struct work_struct *work) if (vsock_in_connected_table(vsk)) vsock_remove_connected(vsk); - sk->sk_state = SS_FREE; + sk->sk_state = TCP_CLOSE; out: release_sock(sk); @@ -617,7 +626,6 @@ struct sock *__vsock_create(struct net *net, sk->sk_destruct = vsock_sk_destruct; sk->sk_backlog_rcv = vsock_queue_rcv_skb; - sk->sk_state = 0; sock_reset_flag(sk, SOCK_DONE); INIT_LIST_HEAD(&vsk->bound_table); @@ -891,7 +899,7 @@ static unsigned int vsock_poll(struct file *file, struct socket *sock, /* Listening sockets that have connections in their accept * queue can be read. */ - if (sk->sk_state == VSOCK_SS_LISTEN + if (sk->sk_state == TCP_LISTEN && !vsock_is_accept_queue_empty(sk)) mask |= POLLIN | POLLRDNORM; @@ -920,7 +928,7 @@ static unsigned int vsock_poll(struct file *file, struct socket *sock, } /* Connected sockets that can produce data can be written. */ - if (sk->sk_state == SS_CONNECTED) { + if (sk->sk_state == TCP_ESTABLISHED) { if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { bool space_avail_now = false; int ret = transport->notify_poll_out( @@ -942,7 +950,7 @@ static unsigned int vsock_poll(struct file *file, struct socket *sock, * POLLOUT|POLLWRNORM when peer is closed and nothing to read, * but local send is not shutdown. 
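
Purely as an illustration of the conversion rule (this helper is not part of the patch; VSOCK_SS_LISTEN is the old vsock-private constant the patch removes), the commit-message mapping expressed in C:

static int vsock_old_to_tcp_state(int old_state)
{
	switch (old_state) {
	case SS_FREE:
	case SS_UNCONNECTED:
		return TCP_CLOSE;
	case SS_CONNECTING:
		return TCP_SYN_SENT;
	case SS_CONNECTED:
		return TCP_ESTABLISHED;
	case SS_DISCONNECTING:
		return TCP_CLOSING;
	case VSOCK_SS_LISTEN:
		return TCP_LISTEN;
	default:
		return TCP_CLOSE;
	}
}
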
*/ - if (sk->sk_state == SS_UNCONNECTED) { + if (sk->sk_state == TCP_CLOSE) { if (!(sk->sk_shutdown & SEND_SHUTDOWN)) mask |= POLLOUT | POLLWRNORM; @@ -1112,9 +1120,9 @@ static void vsock_connect_timeout(struct work_struct *work) sk = sk_vsock(vsk); lock_sock(sk); - if (sk->sk_state == SS_CONNECTING && + if (sk->sk_state == TCP_SYN_SENT && (sk->sk_shutdown != SHUTDOWN_MASK)) { - sk->sk_state = SS_UNCONNECTED; + sk->sk_state = TCP_CLOSE; sk->sk_err = ETIMEDOUT; sk->sk_error_report(sk); cancel = 1; @@ -1160,7 +1168,7 @@ static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr, err = -EALREADY; break; default: - if ((sk->sk_state == VSOCK_SS_LISTEN) || + if ((sk->sk_state == TCP_LISTEN) || vsock_addr_cast(addr, addr_len, &remote_addr) != 0) { err = -EINVAL; goto out; @@ -1183,7 +1191,7 @@ static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr, if (err) goto out; - sk->sk_state = SS_CONNECTING; + sk->sk_state = TCP_SYN_SENT; err = transport->connect(vsk); if (err < 0) @@ -1203,7 +1211,7 @@ static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr, timeout = vsk->connect_timeout; prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); - while (sk->sk_state != SS_CONNECTED && sk->sk_err == 0) { + while (sk->sk_state != TCP_ESTABLISHED && sk->sk_err == 0) { if (flags & O_NONBLOCK) { /* If we're not going to block, we schedule a timeout * function to generate a timeout on the connection @@ -1226,13 +1234,13 @@ static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr, if (signal_pending(current)) { err = sock_intr_errno(timeout); - sk->sk_state = SS_UNCONNECTED; + sk->sk_state = TCP_CLOSE; sock->state = SS_UNCONNECTED; vsock_transport_cancel_pkt(vsk); goto out_wait; } else if (timeout == 0) { err = -ETIMEDOUT; - sk->sk_state = SS_UNCONNECTED; + sk->sk_state = TCP_CLOSE; sock->state = SS_UNCONNECTED; vsock_transport_cancel_pkt(vsk); goto out_wait; @@ -1243,7 +1251,7 @@ static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr, if (sk->sk_err) { err = -sk->sk_err; - sk->sk_state = SS_UNCONNECTED; + sk->sk_state = TCP_CLOSE; sock->state = SS_UNCONNECTED; } else { err = 0; @@ -1276,7 +1284,7 @@ static int vsock_accept(struct socket *sock, struct socket *newsock, int flags, goto out; } - if (listener->sk_state != VSOCK_SS_LISTEN) { + if (listener->sk_state != TCP_LISTEN) { err = -EINVAL; goto out; } @@ -1366,7 +1374,7 @@ static int vsock_listen(struct socket *sock, int backlog) } sk->sk_max_ack_backlog = backlog; - sk->sk_state = VSOCK_SS_LISTEN; + sk->sk_state = TCP_LISTEN; err = 0; @@ -1546,7 +1554,7 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, /* Callers should not provide a destination with stream sockets. */ if (msg->msg_namelen) { - err = sk->sk_state == SS_CONNECTED ? -EISCONN : -EOPNOTSUPP; + err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP; goto out; } @@ -1557,7 +1565,7 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, goto out; } - if (sk->sk_state != SS_CONNECTED || + if (sk->sk_state != TCP_ESTABLISHED || !vsock_addr_bound(&vsk->local_addr)) { err = -ENOTCONN; goto out; @@ -1681,7 +1689,7 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, lock_sock(sk); - if (sk->sk_state != SS_CONNECTED) { + if (sk->sk_state != TCP_ESTABLISHED) { /* Recvmsg is supposed to return 0 if a peer performs an * orderly shutdown. 
Differentiate between that case and when a * peer has not connected or a local shutdown occured with the diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c index 14ed5a344cdf..bbac023e70d1 100644 --- a/net/vmw_vsock/hyperv_transport.c +++ b/net/vmw_vsock/hyperv_transport.c @@ -310,7 +310,7 @@ static void hvs_close_connection(struct vmbus_channel *chan) struct sock *sk = get_per_channel_state(chan); struct vsock_sock *vsk = vsock_sk(sk); - sk->sk_state = SS_UNCONNECTED; + sk->sk_state = TCP_CLOSE; sock_set_flag(sk, SOCK_DONE); vsk->peer_shutdown |= SEND_SHUTDOWN | RCV_SHUTDOWN; @@ -344,8 +344,8 @@ static void hvs_open_connection(struct vmbus_channel *chan) if (!sk) return; - if ((conn_from_host && sk->sk_state != VSOCK_SS_LISTEN) || - (!conn_from_host && sk->sk_state != SS_CONNECTING)) + if ((conn_from_host && sk->sk_state != TCP_LISTEN) || + (!conn_from_host && sk->sk_state != TCP_SYN_SENT)) goto out; if (conn_from_host) { @@ -357,7 +357,7 @@ static void hvs_open_connection(struct vmbus_channel *chan) if (!new) goto out; - new->sk_state = SS_CONNECTING; + new->sk_state = TCP_SYN_SENT; vnew = vsock_sk(new); hvs_new = vnew->trans; hvs_new->chan = chan; @@ -384,7 +384,7 @@ static void hvs_open_connection(struct vmbus_channel *chan) vmbus_set_chn_rescind_callback(chan, hvs_close_connection); if (conn_from_host) { - new->sk_state = SS_CONNECTED; + new->sk_state = TCP_ESTABLISHED; sk->sk_ack_backlog++; hvs_addr_init(&vnew->local_addr, if_type); @@ -399,7 +399,7 @@ static void hvs_open_connection(struct vmbus_channel *chan) vsock_enqueue_accept(sk, new); release_sock(sk); } else { - sk->sk_state = SS_CONNECTED; + sk->sk_state = TCP_ESTABLISHED; sk->sk_socket->state = SS_CONNECTED; vsock_insert_connected(vsock_sk(sk)); diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index 403d86e80162..8e03bd3f3668 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -414,7 +414,7 @@ static void virtio_vsock_event_fill(struct virtio_vsock *vsock) static void virtio_vsock_reset_sock(struct sock *sk) { lock_sock(sk); - sk->sk_state = SS_UNCONNECTED; + sk->sk_state = TCP_CLOSE; sk->sk_err = ECONNRESET; sk->sk_error_report(sk); release_sock(sk); diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index edba7ab97563..3ae3a33da70b 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -708,7 +708,7 @@ static void virtio_transport_do_close(struct vsock_sock *vsk, sock_set_flag(sk, SOCK_DONE); vsk->peer_shutdown = SHUTDOWN_MASK; if (vsock_stream_has_data(vsk) <= 0) - sk->sk_state = SS_DISCONNECTING; + sk->sk_state = TCP_CLOSING; sk->sk_state_change(sk); if (vsk->close_work_scheduled && @@ -748,8 +748,8 @@ static bool virtio_transport_close(struct vsock_sock *vsk) { struct sock *sk = &vsk->sk; - if (!(sk->sk_state == SS_CONNECTED || - sk->sk_state == SS_DISCONNECTING)) + if (!(sk->sk_state == TCP_ESTABLISHED || + sk->sk_state == TCP_CLOSING)) return true; /* Already received SHUTDOWN from peer, reply with RST */ @@ -801,7 +801,7 @@ virtio_transport_recv_connecting(struct sock *sk, switch (le16_to_cpu(pkt->hdr.op)) { case VIRTIO_VSOCK_OP_RESPONSE: - sk->sk_state = SS_CONNECTED; + sk->sk_state = TCP_ESTABLISHED; sk->sk_socket->state = SS_CONNECTED; vsock_insert_connected(vsk); sk->sk_state_change(sk); @@ -821,7 +821,7 @@ virtio_transport_recv_connecting(struct sock *sk, destroy: virtio_transport_reset(vsk, pkt); - sk->sk_state = 
SS_UNCONNECTED; + sk->sk_state = TCP_CLOSE; sk->sk_err = skerr; sk->sk_error_report(sk); return err; @@ -857,7 +857,7 @@ virtio_transport_recv_connected(struct sock *sk, vsk->peer_shutdown |= SEND_SHUTDOWN; if (vsk->peer_shutdown == SHUTDOWN_MASK && vsock_stream_has_data(vsk) <= 0) - sk->sk_state = SS_DISCONNECTING; + sk->sk_state = TCP_CLOSING; if (le32_to_cpu(pkt->hdr.flags)) sk->sk_state_change(sk); break; @@ -928,7 +928,7 @@ virtio_transport_recv_listen(struct sock *sk, struct virtio_vsock_pkt *pkt) lock_sock_nested(child, SINGLE_DEPTH_NESTING); - child->sk_state = SS_CONNECTED; + child->sk_state = TCP_ESTABLISHED; vchild = vsock_sk(child); vsock_addr_init(&vchild->local_addr, le64_to_cpu(pkt->hdr.dst_cid), @@ -1016,18 +1016,18 @@ void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt) sk->sk_write_space(sk); switch (sk->sk_state) { - case VSOCK_SS_LISTEN: + case TCP_LISTEN: virtio_transport_recv_listen(sk, pkt); virtio_transport_free_pkt(pkt); break; - case SS_CONNECTING: + case TCP_SYN_SENT: virtio_transport_recv_connecting(sk, pkt); virtio_transport_free_pkt(pkt); break; - case SS_CONNECTED: + case TCP_ESTABLISHED: virtio_transport_recv_connected(sk, pkt); break; - case SS_DISCONNECTING: + case TCP_CLOSING: virtio_transport_recv_disconnecting(sk, pkt); virtio_transport_free_pkt(pkt); break; diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c index 0206155bff53..391775e3575c 100644 --- a/net/vmw_vsock/vmci_transport.c +++ b/net/vmw_vsock/vmci_transport.c @@ -742,7 +742,7 @@ static int vmci_transport_recv_stream_cb(void *data, struct vmci_datagram *dg) /* The local context ID may be out of date, update it. */ vsk->local_addr.svm_cid = dst.svm_cid; - if (sk->sk_state == SS_CONNECTED) + if (sk->sk_state == TCP_ESTABLISHED) vmci_trans(vsk)->notify_ops->handle_notify_pkt( sk, pkt, true, &dst, &src, &bh_process_pkt); @@ -800,7 +800,9 @@ static void vmci_transport_handle_detach(struct sock *sk) * left in our consume queue. */ if (vsock_stream_has_data(vsk) <= 0) { - if (sk->sk_state == SS_CONNECTING) { + sk->sk_state = TCP_CLOSE; + + if (sk->sk_state == TCP_SYN_SENT) { /* The peer may detach from a queue pair while * we are still in the connecting state, i.e., * if the peer VM is killed after attaching to @@ -809,12 +811,10 @@ static void vmci_transport_handle_detach(struct sock *sk) * event like a reset. */ - sk->sk_state = SS_UNCONNECTED; sk->sk_err = ECONNRESET; sk->sk_error_report(sk); return; } - sk->sk_state = SS_UNCONNECTED; } sk->sk_state_change(sk); } @@ -882,17 +882,17 @@ static void vmci_transport_recv_pkt_work(struct work_struct *work) vsock_sk(sk)->local_addr.svm_cid = pkt->dg.dst.context; switch (sk->sk_state) { - case VSOCK_SS_LISTEN: + case TCP_LISTEN: vmci_transport_recv_listen(sk, pkt); break; - case SS_CONNECTING: + case TCP_SYN_SENT: /* Processing of pending connections for servers goes through * the listening socket, so see vmci_transport_recv_listen() * for that path. 
*/ vmci_transport_recv_connecting_client(sk, pkt); break; - case SS_CONNECTED: + case TCP_ESTABLISHED: vmci_transport_recv_connected(sk, pkt); break; default: @@ -941,7 +941,7 @@ static int vmci_transport_recv_listen(struct sock *sk, vsock_sk(pending)->local_addr.svm_cid = pkt->dg.dst.context; switch (pending->sk_state) { - case SS_CONNECTING: + case TCP_SYN_SENT: err = vmci_transport_recv_connecting_server(sk, pending, pkt); @@ -1071,7 +1071,7 @@ static int vmci_transport_recv_listen(struct sock *sk, vsock_add_pending(sk, pending); sk->sk_ack_backlog++; - pending->sk_state = SS_CONNECTING; + pending->sk_state = TCP_SYN_SENT; vmci_trans(vpending)->produce_size = vmci_trans(vpending)->consume_size = qp_size; vmci_trans(vpending)->queue_pair_size = qp_size; @@ -1196,11 +1196,11 @@ vmci_transport_recv_connecting_server(struct sock *listener, * the socket will be valid until it is removed from the queue. * * If we fail sending the attach below, we remove the socket from the - * connected list and move the socket to SS_UNCONNECTED before + * connected list and move the socket to TCP_CLOSE before * releasing the lock, so a pending slow path processing of an incoming * packet will not see the socket in the connected state in that case. */ - pending->sk_state = SS_CONNECTED; + pending->sk_state = TCP_ESTABLISHED; vsock_insert_connected(vpending); @@ -1231,7 +1231,7 @@ vmci_transport_recv_connecting_server(struct sock *listener, destroy: pending->sk_err = skerr; - pending->sk_state = SS_UNCONNECTED; + pending->sk_state = TCP_CLOSE; /* As long as we drop our reference, all necessary cleanup will handle * when the cleanup function drops its reference and our destruct * implementation is called. Note that since the listen handler will @@ -1269,7 +1269,7 @@ vmci_transport_recv_connecting_client(struct sock *sk, * accounting (it can already be found since it's in the bound * table). */ - sk->sk_state = SS_CONNECTED; + sk->sk_state = TCP_ESTABLISHED; sk->sk_socket->state = SS_CONNECTED; vsock_insert_connected(vsk); sk->sk_state_change(sk); @@ -1337,7 +1337,7 @@ vmci_transport_recv_connecting_client(struct sock *sk, destroy: vmci_transport_send_reset(sk, pkt); - sk->sk_state = SS_UNCONNECTED; + sk->sk_state = TCP_CLOSE; sk->sk_err = skerr; sk->sk_error_report(sk); return err; @@ -1525,7 +1525,7 @@ static int vmci_transport_recv_connected(struct sock *sk, sock_set_flag(sk, SOCK_DONE); vsk->peer_shutdown = SHUTDOWN_MASK; if (vsock_stream_has_data(vsk) <= 0) - sk->sk_state = SS_DISCONNECTING; + sk->sk_state = TCP_CLOSING; sk->sk_state_change(sk); break; @@ -1789,7 +1789,7 @@ static int vmci_transport_connect(struct vsock_sock *vsk) err = vmci_transport_send_conn_request( sk, vmci_trans(vsk)->queue_pair_size); if (err < 0) { - sk->sk_state = SS_UNCONNECTED; + sk->sk_state = TCP_CLOSE; return err; } } else { @@ -1799,7 +1799,7 @@ static int vmci_transport_connect(struct vsock_sock *vsk) sk, vmci_trans(vsk)->queue_pair_size, supported_proto_versions); if (err < 0) { - sk->sk_state = SS_UNCONNECTED; + sk->sk_state = TCP_CLOSE; return err; } diff --git a/net/vmw_vsock/vmci_transport_notify.c b/net/vmw_vsock/vmci_transport_notify.c index 1406db4d97d1..41fb427f150a 100644 --- a/net/vmw_vsock/vmci_transport_notify.c +++ b/net/vmw_vsock/vmci_transport_notify.c @@ -355,7 +355,7 @@ vmci_transport_notify_pkt_poll_in(struct sock *sk, * queue. Ask for notifications when there is something to * read. 
*/ - if (sk->sk_state == SS_CONNECTED) { + if (sk->sk_state == TCP_ESTABLISHED) { if (!send_waiting_read(sk, 1)) return -1; diff --git a/net/vmw_vsock/vmci_transport_notify_qstate.c b/net/vmw_vsock/vmci_transport_notify_qstate.c index f3a0afc46208..0cc84f2bb05e 100644 --- a/net/vmw_vsock/vmci_transport_notify_qstate.c +++ b/net/vmw_vsock/vmci_transport_notify_qstate.c @@ -176,7 +176,7 @@ vmci_transport_notify_pkt_poll_in(struct sock *sk, * queue. Ask for notifications when there is something to * read. */ - if (sk->sk_state == SS_CONNECTED) + if (sk->sk_state == TCP_ESTABLISHED) vsock_block_update_write_window(sk); *data_ready_now = false; } -- cgit v1.2.3 From 413a4317aca7d6367d57a5971b0c461f03851207 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 5 Oct 2017 16:46:53 -0400 Subject: VSOCK: add sock_diag interface This patch adds the sock_diag interface for querying sockets from userspace. Tools like ss(8) and netstat(8) can use this interface to list open sockets. The userspace ABI is defined in and includes netlink request and response structs. The request can query sockets based on their sk_state (e.g. listening sockets only) and the response contains socket information fields including the local/remote addresses, inode number, etc. This patch does not dump VMCI pending sockets because I have only tested the virtio transport, which does not use pending sockets. Support can be added later by extending vsock_diag_dump() if needed by VMCI users. Signed-off-by: Stefan Hajnoczi Signed-off-by: David S. Miller --- MAINTAINERS | 2 + include/uapi/linux/vm_sockets_diag.h | 33 +++++++ net/vmw_vsock/Kconfig | 10 ++ net/vmw_vsock/Makefile | 3 + net/vmw_vsock/diag.c | 186 +++++++++++++++++++++++++++++++++++ 5 files changed, 234 insertions(+) create mode 100644 include/uapi/linux/vm_sockets_diag.h create mode 100644 net/vmw_vsock/diag.c (limited to 'include') diff --git a/MAINTAINERS b/MAINTAINERS index d0cbb3d7a0ca..0fd9121953bb 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14286,6 +14286,8 @@ S: Maintained F: include/linux/virtio_vsock.h F: include/uapi/linux/virtio_vsock.h F: include/uapi/linux/vsockmon.h +F: include/uapi/linux/vm_sockets_diag.h +F: net/vmw_vsock/diag.c F: net/vmw_vsock/af_vsock_tap.c F: net/vmw_vsock/virtio_transport_common.c F: net/vmw_vsock/virtio_transport.c diff --git a/include/uapi/linux/vm_sockets_diag.h b/include/uapi/linux/vm_sockets_diag.h new file mode 100644 index 000000000000..14cd7dc5a187 --- /dev/null +++ b/include/uapi/linux/vm_sockets_diag.h @@ -0,0 +1,33 @@ +/* AF_VSOCK sock_diag(7) interface for querying open sockets */ + +#ifndef _UAPI__VM_SOCKETS_DIAG_H__ +#define _UAPI__VM_SOCKETS_DIAG_H__ + +#include + +/* Request */ +struct vsock_diag_req { + __u8 sdiag_family; /* must be AF_VSOCK */ + __u8 sdiag_protocol; /* must be 0 */ + __u16 pad; /* must be 0 */ + __u32 vdiag_states; /* query bitmap (e.g. 1 << TCP_LISTEN) */ + __u32 vdiag_ino; /* must be 0 (reserved) */ + __u32 vdiag_show; /* must be 0 (reserved) */ + __u32 vdiag_cookie[2]; +}; + +/* Response */ +struct vsock_diag_msg { + __u8 vdiag_family; /* AF_VSOCK */ + __u8 vdiag_type; /* SOCK_STREAM or SOCK_DGRAM */ + __u8 vdiag_state; /* sk_state (e.g. 
TCP_LISTEN) */ + __u8 vdiag_shutdown; /* local RCV_SHUTDOWN | SEND_SHUTDOWN */ + __u32 vdiag_src_cid; + __u32 vdiag_src_port; + __u32 vdiag_dst_cid; + __u32 vdiag_dst_port; + __u32 vdiag_ino; + __u32 vdiag_cookie[2]; +}; + +#endif /* _UAPI__VM_SOCKETS_DIAG_H__ */ diff --git a/net/vmw_vsock/Kconfig b/net/vmw_vsock/Kconfig index a24369d175fd..970f96489fe7 100644 --- a/net/vmw_vsock/Kconfig +++ b/net/vmw_vsock/Kconfig @@ -15,6 +15,16 @@ config VSOCKETS To compile this driver as a module, choose M here: the module will be called vsock. If unsure, say N. +config VSOCKETS_DIAG + tristate "Virtual Sockets monitoring interface" + depends on VSOCKETS + default y + help + Support for PF_VSOCK sockets monitoring interface used by the ss tool. + If unsure, say Y. + + Enable this module so userspace applications can query open sockets. + config VMWARE_VMCI_VSOCKETS tristate "VMware VMCI transport for Virtual Sockets" depends on VSOCKETS && VMWARE_VMCI diff --git a/net/vmw_vsock/Makefile b/net/vmw_vsock/Makefile index e63d574234a9..64afc06805da 100644 --- a/net/vmw_vsock/Makefile +++ b/net/vmw_vsock/Makefile @@ -1,4 +1,5 @@ obj-$(CONFIG_VSOCKETS) += vsock.o +obj-$(CONFIG_VSOCKETS_DIAG) += vsock_diag.o obj-$(CONFIG_VMWARE_VMCI_VSOCKETS) += vmw_vsock_vmci_transport.o obj-$(CONFIG_VIRTIO_VSOCKETS) += vmw_vsock_virtio_transport.o obj-$(CONFIG_VIRTIO_VSOCKETS_COMMON) += vmw_vsock_virtio_transport_common.o @@ -6,6 +7,8 @@ obj-$(CONFIG_HYPERV_VSOCKETS) += hv_sock.o vsock-y += af_vsock.o af_vsock_tap.o vsock_addr.o +vsock_diag-y += diag.o + vmw_vsock_vmci_transport-y += vmci_transport.o vmci_transport_notify.o \ vmci_transport_notify_qstate.o diff --git a/net/vmw_vsock/diag.c b/net/vmw_vsock/diag.c new file mode 100644 index 000000000000..31b567652250 --- /dev/null +++ b/net/vmw_vsock/diag.c @@ -0,0 +1,186 @@ +/* + * vsock sock_diag(7) module + * + * Copyright (C) 2017 Red Hat, Inc. + * Author: Stefan Hajnoczi + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation version 2 and no later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#include +#include +#include +#include + +static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, + u32 portid, u32 seq, u32 flags) +{ + struct vsock_sock *vsk = vsock_sk(sk); + struct vsock_diag_msg *rep; + struct nlmsghdr *nlh; + + nlh = nlmsg_put(skb, portid, seq, SOCK_DIAG_BY_FAMILY, sizeof(*rep), + flags); + if (!nlh) + return -EMSGSIZE; + + rep = nlmsg_data(nlh); + rep->vdiag_family = AF_VSOCK; + + /* Lock order dictates that sk_lock is acquired before + * vsock_table_lock, so we cannot lock here. Simply don't take + * sk_lock; sk is guaranteed to stay alive since vsock_table_lock is + * held. 
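
On the userspace side, a minimal sketch of issuing the dump request this module answers (assuming the new header is installed as <linux/vm_sockets_diag.h>; fd is an AF_NETLINK/NETLINK_SOCK_DIAG socket, AF_VSOCK is address family 40, and error handling is elided):

#include <linux/netlink.h>
#include <linux/sock_diag.h>
#include <linux/vm_sockets_diag.h>
#include <string.h>
#include <sys/socket.h>

#ifndef AF_VSOCK
#define AF_VSOCK 40
#endif

static int vsock_diag_send_dump_request(int fd)
{
	struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK };
	struct {
		struct nlmsghdr nlh;
		struct vsock_diag_req req;
	} msg;

	memset(&msg, 0, sizeof(msg));
	msg.nlh.nlmsg_len = sizeof(msg);
	msg.nlh.nlmsg_type = SOCK_DIAG_BY_FAMILY;
	msg.nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
	msg.req.sdiag_family = AF_VSOCK;
	msg.req.vdiag_states = ~(__u32)0;	/* all states; 1 << TCP_LISTEN for listeners only */

	/* Replies arrive as NLM_F_MULTI parts carrying struct vsock_diag_msg;
	 * a recv()/NLMSG_OK() loop would parse them.
	 */
	return sendto(fd, &msg, sizeof(msg), 0,
		      (struct sockaddr *)&nladdr, sizeof(nladdr));
}
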
+ */ + rep->vdiag_type = sk->sk_type; + rep->vdiag_state = sk->sk_state; + rep->vdiag_shutdown = sk->sk_shutdown; + rep->vdiag_src_cid = vsk->local_addr.svm_cid; + rep->vdiag_src_port = vsk->local_addr.svm_port; + rep->vdiag_dst_cid = vsk->remote_addr.svm_cid; + rep->vdiag_dst_port = vsk->remote_addr.svm_port; + rep->vdiag_ino = sock_i_ino(sk); + + sock_diag_save_cookie(sk, rep->vdiag_cookie); + + return 0; +} + +static int vsock_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct vsock_diag_req *req; + struct vsock_sock *vsk; + unsigned int bucket; + unsigned int last_i; + unsigned int table; + struct net *net; + unsigned int i; + + req = nlmsg_data(cb->nlh); + net = sock_net(skb->sk); + + /* State saved between calls: */ + table = cb->args[0]; + bucket = cb->args[1]; + i = last_i = cb->args[2]; + + /* TODO VMCI pending sockets? */ + + spin_lock_bh(&vsock_table_lock); + + /* Bind table (locally created sockets) */ + if (table == 0) { + while (bucket < ARRAY_SIZE(vsock_bind_table)) { + struct list_head *head = &vsock_bind_table[bucket]; + + i = 0; + list_for_each_entry(vsk, head, bound_table) { + struct sock *sk = sk_vsock(vsk); + + if (!net_eq(sock_net(sk), net)) + continue; + if (i < last_i) + goto next_bind; + if (!(req->vdiag_states & (1 << sk->sk_state))) + goto next_bind; + if (sk_diag_fill(sk, skb, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI) < 0) + goto done; +next_bind: + i++; + } + last_i = 0; + bucket++; + } + + table++; + bucket = 0; + } + + /* Connected table (accepted connections) */ + while (bucket < ARRAY_SIZE(vsock_connected_table)) { + struct list_head *head = &vsock_connected_table[bucket]; + + i = 0; + list_for_each_entry(vsk, head, connected_table) { + struct sock *sk = sk_vsock(vsk); + + /* Skip sockets we've already seen above */ + if (__vsock_in_bound_table(vsk)) + continue; + + if (!net_eq(sock_net(sk), net)) + continue; + if (i < last_i) + goto next_connected; + if (!(req->vdiag_states & (1 << sk->sk_state))) + goto next_connected; + if (sk_diag_fill(sk, skb, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI) < 0) + goto done; +next_connected: + i++; + } + last_i = 0; + bucket++; + } + +done: + spin_unlock_bh(&vsock_table_lock); + + cb->args[0] = table; + cb->args[1] = bucket; + cb->args[2] = i; + + return skb->len; +} + +static int vsock_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) +{ + int hdrlen = sizeof(struct vsock_diag_req); + struct net *net = sock_net(skb->sk); + + if (nlmsg_len(h) < hdrlen) + return -EINVAL; + + if (h->nlmsg_flags & NLM_F_DUMP) { + struct netlink_dump_control c = { + .dump = vsock_diag_dump, + }; + return netlink_dump_start(net->diag_nlsk, skb, h, &c); + } + + return -EOPNOTSUPP; +} + +static const struct sock_diag_handler vsock_diag_handler = { + .family = AF_VSOCK, + .dump = vsock_diag_handler_dump, +}; + +static int __init vsock_diag_init(void) +{ + return sock_diag_register(&vsock_diag_handler); +} + +static void __exit vsock_diag_exit(void) +{ + sock_diag_unregister(&vsock_diag_handler); +} + +module_init(vsock_diag_init); +module_exit(vsock_diag_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, + 40 /* AF_VSOCK */); -- cgit v1.2.3 From 27204aaa9dc67b833b77179fdac556288ec3a4bf Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Wed, 4 Oct 2017 10:03:44 -0700 Subject: tcp: uniform the set up of sockets after successful connection Currently in the TCP code, the initialization sequence for cached metrics, congestion control, 
BPF, etc., after successful connection is very inconsistent. This
introduces inconsistent behavior and is prone to bugs. The current call
sequence is as follows:

(1) for active case (tcp_finish_connect() case):
    tcp_mtup_init(sk);
    icsk->icsk_af_ops->rebuild_header(sk);
    tcp_init_metrics(sk);
    tcp_call_bpf(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB);
    tcp_init_congestion_control(sk);
    tcp_init_buffer_space(sk);

(2) for passive case (tcp_rcv_state_process() TCP_SYN_RECV case):
    icsk->icsk_af_ops->rebuild_header(sk);
    tcp_call_bpf(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB);
    tcp_init_congestion_control(sk);
    tcp_mtup_init(sk);
    tcp_init_buffer_space(sk);
    tcp_init_metrics(sk);

(3) for TFO passive case (tcp_fastopen_create_child()):
    inet_csk(child)->icsk_af_ops->rebuild_header(child);
    tcp_init_congestion_control(child);
    tcp_mtup_init(child);
    tcp_init_metrics(child);
    tcp_call_bpf(child, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB);
    tcp_init_buffer_space(child);

This commit unifies the above functions to use the following sequence:

    tcp_mtup_init(sk);
    icsk->icsk_af_ops->rebuild_header(sk);
    tcp_init_metrics(sk);
    tcp_call_bpf(sk, BPF_SOCK_OPS_ACTIVE/PASSIVE_ESTABLISHED_CB);
    tcp_init_congestion_control(sk);
    tcp_init_buffer_space(sk);

This sequence is the same as the (1) active case. We pick this sequence
because this order correctly allows BPF to override the settings,
including the congestion control module and initial cwnd, etc. from the
route, and then allows the CC module to see those settings.

Suggested-by: Neal Cardwell
Tested-by: Neal Cardwell
Signed-off-by: Wei Wang
Acked-by: Neal Cardwell
Acked-by: Yuchung Cheng
Acked-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 include/net/tcp.h       |  1 +
 net/ipv4/tcp.c          | 12 ++++++++++++
 net/ipv4/tcp_fastopen.c |  7 +------
 net/ipv4/tcp_input.c    | 21 +++------------------
 4 files changed, 17 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 7a3a8af56fd6..426c2e986016 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -416,6 +416,7 @@ bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst);
 void tcp_disable_fack(struct tcp_sock *tp);
 void tcp_close(struct sock *sk, long timeout);
 void tcp_init_sock(struct sock *sk);
+void tcp_init_transfer(struct sock *sk, int bpf_op);
 unsigned int tcp_poll(struct file *file, struct socket *sock,
 		      struct poll_table_struct *wait);
 int tcp_getsockopt(struct sock *sk, int level, int optname,
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 23225c98d287..c115e37ca608 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -456,6 +456,18 @@ void tcp_init_sock(struct sock *sk)
 }
 EXPORT_SYMBOL(tcp_init_sock);
 
+void tcp_init_transfer(struct sock *sk, int bpf_op)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+
+	tcp_mtup_init(sk);
+	icsk->icsk_af_ops->rebuild_header(sk);
+	tcp_init_metrics(sk);
+	tcp_call_bpf(sk, bpf_op);
+	tcp_init_congestion_control(sk);
+	tcp_init_buffer_space(sk);
+}
+
 static void tcp_tx_timestamp(struct sock *sk, u16 tsflags, struct sk_buff *skb)
 {
 	if (tsflags && skb) {
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index de470e7e586f..29fff14d5a53 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -236,12 +236,7 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk,
 	refcount_set(&req->rsk_refcnt, 2);
 
 	/* Now finish processing the fastopen child socket. */
-	inet_csk(child)->icsk_af_ops->rebuild_header(child);
-	tcp_init_congestion_control(child);
-	tcp_mtup_init(child);
-	tcp_init_metrics(child);
-	tcp_call_bpf(child, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB);
-	tcp_init_buffer_space(child);
+	tcp_init_transfer(child, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB);
 
 	tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index db9bb46b5776..bd3a35f5dbf2 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5513,20 +5513,13 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
 		security_inet_conn_established(sk, skb);
 	}
 
-	/* Make sure socket is routed, for correct metrics. */
-	icsk->icsk_af_ops->rebuild_header(sk);
-
-	tcp_init_metrics(sk);
-	tcp_call_bpf(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB);
-	tcp_init_congestion_control(sk);
+	tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB);
 
 	/* Prevent spurious tcp_cwnd_restart() on first data
 	 * packet.
 	 */
 	tp->lsndtime = tcp_jiffies32;
 
-	tcp_init_buffer_space(sk);
-
 	if (sock_flag(sk, SOCK_KEEPOPEN))
 		inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));
 
@@ -5693,7 +5686,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 		if (tcp_is_sack(tp) && sysctl_tcp_fack)
 			tcp_enable_fack(tp);
 
-		tcp_mtup_init(sk);
 		tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
 		tcp_initialize_rcv_mss(sk);
 
@@ -5920,14 +5912,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
 			inet_csk(sk)->icsk_retransmits = 0;
 			reqsk_fastopen_remove(sk, req, false);
 		} else {
-			/* Make sure socket is routed, for correct metrics. */
-			icsk->icsk_af_ops->rebuild_header(sk);
-			tcp_call_bpf(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB);
-			tcp_init_congestion_control(sk);
-
-			tcp_mtup_init(sk);
+			tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB);
 			tp->copied_seq = tp->rcv_nxt;
-			tcp_init_buffer_space(sk);
 		}
 		smp_mb();
 		tcp_set_state(sk, TCP_ESTABLISHED);
@@ -5957,8 +5943,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
 		 * are sent out.
 		 */
 		tcp_rearm_rto(sk);
-	} else
-		tcp_init_metrics(sk);
+	}
 
 	if (!inet_csk(sk)->icsk_ca_ops->cong_control)
 		tcp_update_pacing_rate(sk);
-- cgit v1.2.3

From e2080072ed2d98a55ae69d95dea60ff7a17cddd5 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Wed, 4 Oct 2017 12:59:58 -0700
Subject: tcp: new list for sent but unacked skbs for RACK recovery

This patch adds a new queue (list) that tracks the sent but not yet
acked or SACKed skbs for a TCP connection. The list is chronologically
ordered by skb->skb_mstamp (the head is the oldest sent skb).

This list will be used to optimize TCP RACK recovery, which checks an
skb's timestamp to judge if it has been lost and needs to be
retransmitted. Since the TCP write queue is ordered by sequence instead
of sent time, RACK has to scan over the write queue to catch all
eligible packets to detect lost retransmission, and iterate through
SACKed skbs repeatedly.

Special care is needed for rare events:

1. TCP repair fakes skb transmission so the send queue needs to be
   adjusted.

2. SACK reneging would require re-inserting SACKed skbs into the send
   queue. For now I believe it's not worth the complexity to make RACK
   work perfectly on SACK reneging, so we do nothing here.

3. Fast Open: currently for non-TFO, send-queue correctly queues the
   pure SYN packet. For TFO which queues a pure SYN and then a data
   packet, send-queue only queues the data packet but not the pure SYN
   due to the structure of TFO code. This is okay because the SYN
   receiver would never respond with a SACK on a missing SYN (i.e. SYN
   is never fast-retransmitted by SACK/RACK).

In order to not grow sk_buff, we use a union for the new list and the
_skb_refdst/destructor fields. This is a bit complicated because we
need to make sure _skb_refdst and destructor are properly zeroed before
skb is cloned/copied at transmit, and before being freed.

Signed-off-by: Eric Dumazet
Signed-off-by: Yuchung Cheng
Signed-off-by: Neal Cardwell
Signed-off-by: David S. Miller
---
 include/linux/skbuff.h   | 11 +++++++++--
 include/linux/tcp.h      |  1 +
 include/net/tcp.h        | 24 +++++++++++++++++++++++-
 net/ipv4/tcp.c           |  2 ++
 net/ipv4/tcp_input.c     |  9 +++++++--
 net/ipv4/tcp_minisocks.c |  1 +
 net/ipv4/tcp_output.c    | 42 +++++++++++++++++++++++++++++++-----------
 7 files changed, 74 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index ada821466e88..01a985937867 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -617,6 +617,7 @@ typedef unsigned char *sk_buff_data_t;
 *	@nf_trace: netfilter packet trace flag
 *	@protocol: Packet protocol from driver
 *	@destructor: Destruct function
+*	@tcp_tsorted_anchor: list structure for TCP (tp->tsorted_sent_queue)
 *	@_nfct: Associated connection, if any (with nfctinfo bits)
 *	@nf_bridge: Saved data about a bridged frame - see br_netfilter.c
 *	@skb_iif: ifindex of device we arrived on
@@ -686,8 +687,14 @@ struct sk_buff {
 	 */
 	char			cb[48] __aligned(8);
 
-	unsigned long		_skb_refdst;
-	void			(*destructor)(struct sk_buff *skb);
+	union {
+		struct {
+			unsigned long	_skb_refdst;
+			void		(*destructor)(struct sk_buff *skb);
+		};
+		struct list_head	tcp_tsorted_anchor;
+	};
+
 #ifdef CONFIG_XFRM
 	struct	sec_path	*sp;
 #endif
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 4aa40ef02d32..1d2c44e09e31 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -191,6 +191,7 @@ struct tcp_sock {
 	u32	tsoffset;	/* timestamp offset */
 
 	struct list_head tsq_node; /* anchor in tsq_tasklet.head list */
+	struct list_head tsorted_sent_queue; /* time-sorted sent but un-SACKed skbs */
 
 	u32	snd_wl1;	/* Sequence for window update */
 	u32	snd_wnd;	/* The window we expect to receive */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 426c2e986016..3b16f353b539 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1589,14 +1589,34 @@ enum tcp_chrono {
 void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type);
 void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type);
 
+/* This helper is needed, because skb->tcp_tsorted_anchor uses
+ * the same memory storage than skb->destructor/_skb_refdst
+ */
+static inline void tcp_skb_tsorted_anchor_cleanup(struct sk_buff *skb)
+{
+	skb->destructor = NULL;
+	skb->_skb_refdst = 0UL;
+}
+
+#define tcp_skb_tsorted_save(skb) {		\
+	unsigned long _save = skb->_skb_refdst;	\
+	skb->_skb_refdst = 0UL;
+
+#define tcp_skb_tsorted_restore(skb)		\
+	skb->_skb_refdst = _save;		\
+}
+
 /* write queue abstraction */
 static inline void tcp_write_queue_purge(struct sock *sk)
 {
 	struct sk_buff *skb;
 
 	tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
-	while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL)
+	while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
+		tcp_skb_tsorted_anchor_cleanup(skb);
 		sk_wmem_free_skb(sk, skb);
+	}
+	INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue);
 	sk_mem_reclaim(sk);
 	tcp_clear_all_retrans_hints(tcp_sk(sk));
 }
@@ -1711,6 +1731,8 @@ static inline void tcp_insert_write_queue_before(struct sk_buff *new,
 static inline void tcp_unlink_write_queue(struct sk_buff *skb, struct sock *sk)
 {
+
list_del(&skb->tcp_tsorted_anchor); + tcp_skb_tsorted_anchor_cleanup(skb); __skb_unlink(skb, &sk->sk_write_queue); } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index c115e37ca608..8cf742fd4f99 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -415,6 +415,7 @@ void tcp_init_sock(struct sock *sk) tp->out_of_order_queue = RB_ROOT; tcp_init_xmit_timers(sk); INIT_LIST_HEAD(&tp->tsq_node); + INIT_LIST_HEAD(&tp->tsorted_sent_queue); icsk->icsk_rto = TCP_TIMEOUT_INIT; tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); @@ -881,6 +882,7 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp, * available to the caller, no more, no less. */ skb->reserved_tailroom = skb->end - skb->tail - size; + INIT_LIST_HEAD(&skb->tcp_tsorted_anchor); return skb; } __kfree_skb(skb); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index c5b8d61846c2..fb0d7ed84b94 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1593,6 +1593,8 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, tcp_skb_pcount(skb), skb->skb_mstamp); tcp_rate_skb_delivered(sk, skb, state->rate); + if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) + list_del_init(&skb->tcp_tsorted_anchor); if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp))) @@ -3054,8 +3056,11 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, shinfo = skb_shinfo(skb); if (!before(shinfo->tskey, prior_snd_una) && - before(shinfo->tskey, tcp_sk(sk)->snd_una)) - __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK); + before(shinfo->tskey, tcp_sk(sk)->snd_una)) { + tcp_skb_tsorted_save(skb) { + __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK); + } tcp_skb_tsorted_restore(skb); + } } /* Remove acknowledged frames from the retransmission queue. If our packet diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 188a6f31356d..2341b9f857b6 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -446,6 +446,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1; INIT_LIST_HEAD(&newtp->tsq_node); + INIT_LIST_HEAD(&newtp->tsorted_sent_queue); tcp_init_wl(newtp, treq->rcv_isn); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 0bc9e46a5369..8162e2880178 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -971,6 +971,12 @@ static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb) HRTIMER_MODE_ABS_PINNED); } +static void tcp_update_skb_after_send(struct tcp_sock *tp, struct sk_buff *skb) +{ + skb->skb_mstamp = tp->tcp_mstamp; + list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); +} + /* This routine actually transmits TCP packets queued in by * tcp_do_sendmsg(). This is used by both the initial * transmission and possible later retransmissions. 
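The tcp_skb_tsorted_save()/tcp_skb_tsorted_restore() macros introduced above deliberately carry unbalanced braces, so they only compile when used as a matched pair around a scope. Because tcp_tsorted_anchor shares its storage with skb->_skb_refdst, the raw word is stashed in a local and zeroed for the duration of the scope, so a clone or copy taken inside it does not inherit the list pointers as a bogus dst reference. A minimal sketch of the calling pattern (the caller and GFP flag here are illustrative, mirroring the tcp_transmit_skb() hunk that follows):

	struct sk_buff *clone;

	tcp_skb_tsorted_save(skb) {
		/* skb->_skb_refdst reads as 0 inside this scope */
		clone = skb_clone(skb, GFP_ATOMIC);
	} tcp_skb_tsorted_restore(skb);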
@@ -1003,10 +1009,14 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq - tp->snd_una; oskb = skb; - if (unlikely(skb_cloned(skb))) - skb = pskb_copy(skb, gfp_mask); - else - skb = skb_clone(skb, gfp_mask); + + tcp_skb_tsorted_save(oskb) { + if (unlikely(skb_cloned(oskb))) + skb = pskb_copy(oskb, gfp_mask); + else + skb = skb_clone(oskb, gfp_mask); + } tcp_skb_tsorted_restore(oskb); + if (unlikely(!skb)) return -ENOBUFS; } @@ -1127,7 +1137,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, err = net_xmit_eval(err); } if (!err && oskb) { - oskb->skb_mstamp = tp->tcp_mstamp; + tcp_update_skb_after_send(tp, oskb); tcp_rate_skb_sent(sk, oskb); } return err; @@ -1328,6 +1338,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, /* Link BUFF into the send queue. */ __skb_header_release(buff); tcp_insert_write_queue_after(skb, buff, sk); + list_add(&buff->tcp_tsorted_anchor, &skb->tcp_tsorted_anchor); return 0; } @@ -2260,7 +2271,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) { /* "skb_mstamp" is used as a start point for the retransmit timer */ - skb->skb_mstamp = tp->tcp_mstamp; + tcp_update_skb_after_send(tp, skb); goto repair; /* Skip network transmission */ } @@ -2838,11 +2849,14 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) skb_headroom(skb) >= 0xFFFF)) { struct sk_buff *nskb; - nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC); - err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) : - -ENOBUFS; + tcp_skb_tsorted_save(skb) { + nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC); + err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) : + -ENOBUFS; + } tcp_skb_tsorted_restore(skb); + if (!err) - skb->skb_mstamp = tp->tcp_mstamp; + tcp_update_skb_after_send(tp, skb); } else { err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); } @@ -3023,6 +3037,7 @@ coalesce: goto coalesce; return; } + INIT_LIST_HEAD(&skb->tcp_tsorted_anchor); skb_reserve(skb, MAX_TCP_HEADER); sk_forced_mem_schedule(sk, skb->truesize); /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */ @@ -3078,9 +3093,14 @@ int tcp_send_synack(struct sock *sk) } if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) { if (skb_cloned(skb)) { - struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC); + struct sk_buff *nskb; + + tcp_skb_tsorted_save(skb) { + nskb = skb_copy(skb, GFP_ATOMIC); + } tcp_skb_tsorted_restore(skb); if (!nskb) return -ENOMEM; + INIT_LIST_HEAD(&nskb->tcp_tsorted_anchor); tcp_unlink_write_queue(skb, sk); __skb_header_release(nskb); __tcp_add_write_queue_head(sk, nskb); -- cgit v1.2.3 From 4e64b1ed15e25b8dcc2819c6d43dab72eb0bea26 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 5 Oct 2017 23:46:14 -0700 Subject: net/ipv6: Convert icmpv6_push_pending_frames to void commit cc71b7b07119 ("net/ipv6: remove unused err variable on icmpv6_push_pending_frames") exposed that the icmpv6_push_pending_frames return value is not used. Remove the now-unnecessary int err declarations and uses. Miscellanea: o Remove unnecessary goto and out: labels o Realign arguments Signed-off-by: Joe Perches Signed-off-by: David S. 
Miller --- include/net/ipv6.h | 4 ++-- net/ipv6/icmp.c | 43 ++++++++++++++++++------------------------- net/ipv6/ping.c | 5 ++--- 3 files changed, 22 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 6eac5cf8f1e6..3cda3b521c36 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -300,8 +300,8 @@ static inline void fl6_sock_release(struct ip6_flowlabel *fl) void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info); -int icmpv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, - struct icmp6hdr *thdr, int len); +void icmpv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, + struct icmp6hdr *thdr, int len); int ip6_ra_control(struct sock *sk, int sel); diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index aeb49b4d8c7d..4e52d52a6752 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -250,15 +250,15 @@ static bool opt_unrec(struct sk_buff *skb, __u32 offset) return (*op & 0xC0) == 0x80; } -int icmpv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, - struct icmp6hdr *thdr, int len) +void icmpv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, + struct icmp6hdr *thdr, int len) { struct sk_buff *skb; struct icmp6hdr *icmp6h; skb = skb_peek(&sk->sk_write_queue); if (!skb) - goto out; + return; icmp6h = icmp6_hdr(skb); memcpy(icmp6h, thdr, sizeof(struct icmp6hdr)); @@ -286,8 +286,6 @@ int icmpv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, tmp_csum); } ip6_push_pending_frames(sk); -out: - return 0; } struct icmpv6_msg { @@ -437,7 +435,6 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, int iif = 0; int addr_type = 0; int len; - int err = 0; u32 mark = IP6_REPLY_MARK(net, skb->mark); if ((u8 *)hdr < skb->head || @@ -574,17 +571,16 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, rcu_read_lock(); idev = __in6_dev_get(skb->dev); - err = ip6_append_data(sk, icmpv6_getfrag, &msg, - len + sizeof(struct icmp6hdr), - sizeof(struct icmp6hdr), - &ipc6, &fl6, (struct rt6_info *)dst, - MSG_DONTWAIT, &sockc_unused); - if (err) { + if (ip6_append_data(sk, icmpv6_getfrag, &msg, + len + sizeof(struct icmp6hdr), + sizeof(struct icmp6hdr), + &ipc6, &fl6, (struct rt6_info *)dst, + MSG_DONTWAIT, &sockc_unused)) { ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTERRORS); ip6_flush_pending_frames(sk); } else { - err = icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr, - len + sizeof(struct icmp6hdr)); + icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr, + len + sizeof(struct icmp6hdr)); } rcu_read_unlock(); out_dst_release: @@ -681,7 +677,6 @@ static void icmpv6_echo_reply(struct sk_buff *skb) struct icmpv6_msg msg; struct dst_entry *dst; struct ipcm6_cookie ipc6; - int err = 0; u32 mark = IP6_REPLY_MARK(net, skb->mark); struct sockcm_cookie sockc_unused = {0}; @@ -718,8 +713,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb) else if (!fl6.flowi6_oif) fl6.flowi6_oif = np->ucast_oif; - err = ip6_dst_lookup(net, sk, &dst, &fl6); - if (err) + if (ip6_dst_lookup(net, sk, &dst, &fl6)) goto out; dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), sk, 0); if (IS_ERR(dst)) @@ -736,17 +730,16 @@ static void icmpv6_echo_reply(struct sk_buff *skb) ipc6.dontfrag = np->dontfrag; ipc6.opt = NULL; - err = ip6_append_data(sk, icmpv6_getfrag, &msg, skb->len + sizeof(struct icmp6hdr), - sizeof(struct icmp6hdr), &ipc6, &fl6, - (struct rt6_info *)dst, MSG_DONTWAIT, - &sockc_unused); - - if (err) { + if (ip6_append_data(sk, icmpv6_getfrag, &msg, + skb->len + sizeof(struct 
icmp6hdr), + sizeof(struct icmp6hdr), &ipc6, &fl6, + (struct rt6_info *)dst, MSG_DONTWAIT, + &sockc_unused)) { __ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTERRORS); ip6_flush_pending_frames(sk); } else { - err = icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr, - skb->len + sizeof(struct icmp6hdr)); + icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr, + skb->len + sizeof(struct icmp6hdr)); } dst_release(dst); out: diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index ac826dd338ff..d12c55dad7d1 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -154,9 +154,8 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) ICMP6_MIB_OUTERRORS); ip6_flush_pending_frames(sk); } else { - err = icmpv6_push_pending_frames(sk, &fl6, - (struct icmp6hdr *) &pfh.icmph, - len); + icmpv6_push_pending_frames(sk, &fl6, + (struct icmp6hdr *)&pfh.icmph, len); } release_sock(sk); -- cgit v1.2.3 From 18a4c0eab2623cc95be98a1e6af1ad18e7695977 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 5 Oct 2017 22:21:21 -0700 Subject: net: add rb_to_skb() and other rb tree helpers Generalize the private netem_rb_to_skb() helper. The TCP rtx queue will soon be converted to an rb-tree, so we will need skb_rbtree_walk() helpers. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skbuff.h | 18 ++++++++++++++++++ net/ipv4/tcp_fastopen.c | 8 +++----- net/ipv4/tcp_input.c | 33 ++++++++++++--------------------- net/sched/sch_netem.c | 14 ++++---------- 4 files changed, 37 insertions(+), 36 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 01a985937867..03634ec2f918 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3158,6 +3158,12 @@ static inline int __skb_grow_rcsum(struct sk_buff *skb, unsigned int len) return __skb_grow(skb, len); } +#define rb_to_skb(rb) rb_entry_safe(rb, struct sk_buff, rbnode) +#define skb_rb_first(root) rb_to_skb(rb_first(root)) +#define skb_rb_last(root) rb_to_skb(rb_last(root)) +#define skb_rb_next(skb) rb_to_skb(rb_next(&(skb)->rbnode)) +#define skb_rb_prev(skb) rb_to_skb(rb_prev(&(skb)->rbnode)) + #define skb_queue_walk(queue, skb) \ for (skb = (queue)->next; \ skb != (struct sk_buff *)(queue); \ @@ -3172,6 +3178,18 @@ static inline int __skb_grow_rcsum(struct sk_buff *skb, unsigned int len) for (; skb != (struct sk_buff *)(queue); \ skb = skb->next) +#define skb_rbtree_walk(skb, root) \ + for (skb = skb_rb_first(root); skb != NULL; \ + skb = skb_rb_next(skb)) + +#define skb_rbtree_walk_from(skb) \ + for (; skb != NULL; \ + skb = skb_rb_next(skb)) + +#define skb_rbtree_walk_from_safe(skb, tmp) \ + for (; tmp = skb ? 
skb_rb_next(skb) : NULL, (skb != NULL); \ + skb = tmp) + #define skb_queue_walk_from_safe(queue, skb, tmp) \ for (tmp = skb->next; \ skb != (struct sk_buff *)(queue); \ diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 29fff14d5a53..7ee4aadcdd71 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -465,17 +465,15 @@ bool tcp_fastopen_active_should_disable(struct sock *sk) void tcp_fastopen_active_disable_ofo_check(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - struct rb_node *p; - struct sk_buff *skb; struct dst_entry *dst; + struct sk_buff *skb; if (!tp->syn_fastopen) return; if (!tp->data_segs_in) { - p = rb_first(&tp->out_of_order_queue); - if (p && !rb_next(p)) { - skb = rb_entry(p, struct sk_buff, rbnode); + skb = skb_rb_first(&tp->out_of_order_queue); + if (skb && !skb_rb_next(skb)) { if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) { tcp_fastopen_active_disable(sk); return; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index fb0d7ed84b94..90afe4143596 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4335,7 +4335,7 @@ static void tcp_ofo_queue(struct sock *sk) p = rb_first(&tp->out_of_order_queue); while (p) { - skb = rb_entry(p, struct sk_buff, rbnode); + skb = rb_to_skb(p); if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) break; @@ -4399,7 +4399,7 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb, static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); - struct rb_node **p, *q, *parent; + struct rb_node **p, *parent; struct sk_buff *skb1; u32 seq, end_seq; bool fragstolen; @@ -4458,7 +4458,7 @@ coalesce_done: parent = NULL; while (*p) { parent = *p; - skb1 = rb_entry(parent, struct sk_buff, rbnode); + skb1 = rb_to_skb(parent); if (before(seq, TCP_SKB_CB(skb1)->seq)) { p = &parent->rb_left; continue; @@ -4503,9 +4503,7 @@ insert: merge_right: /* Remove other segments covered by skb. */ - while ((q = rb_next(&skb->rbnode)) != NULL) { - skb1 = rb_entry(q, struct sk_buff, rbnode); - + while ((skb1 = skb_rb_next(skb)) != NULL) { if (!after(end_seq, TCP_SKB_CB(skb1)->seq)) break; if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) { @@ -4520,7 +4518,7 @@ merge_right: tcp_drop(sk, skb1); } /* If there is no skb after us, we are the last_skb ! */ - if (!q) + if (!skb1) tp->ooo_last_skb = skb; add_sack: @@ -4706,7 +4704,7 @@ static struct sk_buff *tcp_skb_next(struct sk_buff *skb, struct sk_buff_head *li if (list) return !skb_queue_is_last(list, skb) ? skb->next : NULL; - return rb_entry_safe(rb_next(&skb->rbnode), struct sk_buff, rbnode); + return skb_rb_next(skb); } static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb, @@ -4735,7 +4733,7 @@ static void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb) while (*p) { parent = *p; - skb1 = rb_entry(parent, struct sk_buff, rbnode); + skb1 = rb_to_skb(parent); if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq)) p = &parent->rb_left; else @@ -4854,26 +4852,19 @@ static void tcp_collapse_ofo_queue(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb, *head; - struct rb_node *p; u32 start, end; - p = rb_first(&tp->out_of_order_queue); - skb = rb_entry_safe(p, struct sk_buff, rbnode); + skb = skb_rb_first(&tp->out_of_order_queue); new_range: if (!skb) { - p = rb_last(&tp->out_of_order_queue); - /* Note: This is possible p is NULL here. We do not - * use rb_entry_safe(), as ooo_last_skb is valid only - * if rbtree is not empty. 
- */ - tp->ooo_last_skb = rb_entry(p, struct sk_buff, rbnode); + tp->ooo_last_skb = skb_rb_last(&tp->out_of_order_queue); return; } start = TCP_SKB_CB(skb)->seq; end = TCP_SKB_CB(skb)->end_seq; for (head = skb;;) { - skb = tcp_skb_next(skb, NULL); + skb = skb_rb_next(skb); /* Range is terminated when we see a gap or when * we are at the queue end. @@ -4916,14 +4907,14 @@ static bool tcp_prune_ofo_queue(struct sock *sk) do { prev = rb_prev(node); rb_erase(node, &tp->out_of_order_queue); - tcp_drop(sk, rb_entry(node, struct sk_buff, rbnode)); + tcp_drop(sk, rb_to_skb(node)); sk_mem_reclaim(sk); if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && !tcp_under_memory_pressure(sk)) break; node = prev; } while (node); - tp->ooo_last_skb = rb_entry(prev, struct sk_buff, rbnode); + tp->ooo_last_skb = rb_to_skb(prev); /* Reset SACK state. A conforming SACK implementation will * do the same at a timeout based retransmit. When a connection diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 5a4f10080290..db0228a65e8c 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -148,12 +148,6 @@ struct netem_skb_cb { psched_time_t time_to_send; }; - -static struct sk_buff *netem_rb_to_skb(struct rb_node *rb) -{ - return rb_entry(rb, struct sk_buff, rbnode); -} - static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb) { /* we assume we can use skb next/prev/tstamp as storage for rb_node */ @@ -364,7 +358,7 @@ static void tfifo_reset(struct Qdisc *sch) struct rb_node *p = rb_first(&q->t_root); while (p) { - struct sk_buff *skb = netem_rb_to_skb(p); + struct sk_buff *skb = rb_to_skb(p); p = rb_next(p); rb_erase(&skb->rbnode, &q->t_root); @@ -382,7 +376,7 @@ static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch) struct sk_buff *skb; parent = *p; - skb = netem_rb_to_skb(parent); + skb = rb_to_skb(parent); if (tnext >= netem_skb_cb(skb)->time_to_send) p = &parent->rb_right; else @@ -538,7 +532,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff *t_skb; struct netem_skb_cb *t_last; - t_skb = netem_rb_to_skb(rb_last(&q->t_root)); + t_skb = skb_rb_last(&q->t_root); t_last = netem_skb_cb(t_skb); if (!last || t_last->time_to_send > last->time_to_send) { @@ -617,7 +611,7 @@ deliver: if (p) { psched_time_t time_to_send; - skb = netem_rb_to_skb(p); + skb = rb_to_skb(p); /* if more time remaining? */ time_to_send = netem_skb_cb(skb)->time_to_send; -- cgit v1.2.3 From ac3f09ba3e496bd7cc780ead05b1d1bb5f33aedb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 5 Oct 2017 22:21:22 -0700 Subject: tcp: uninline tcp_write_queue_purge() Since the upcoming rtx rbtree will add some extra code, it is time to not inline this fat function anymore. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/net/tcp.h | 15 +-------------- net/ipv4/tcp.c | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 3b16f353b539..744559b72784 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1606,20 +1606,7 @@ static inline void tcp_skb_tsorted_anchor_cleanup(struct sk_buff *skb) skb->_skb_refdst = _save; \ } -/* write queue abstraction */ -static inline void tcp_write_queue_purge(struct sock *sk) -{ - struct sk_buff *skb; - - tcp_chrono_stop(sk, TCP_CHRONO_BUSY); - while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) { - tcp_skb_tsorted_anchor_cleanup(skb); - sk_wmem_free_skb(sk, skb); - } - INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue); - sk_mem_reclaim(sk); - tcp_clear_all_retrans_hints(tcp_sk(sk)); -} +void tcp_write_queue_purge(struct sock *sk); static inline struct sk_buff *tcp_write_queue_head(const struct sock *sk) { diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 8cf742fd4f99..f8ebae62f834 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2318,6 +2318,20 @@ static inline bool tcp_need_reset(int state) TCPF_FIN_WAIT2 | TCPF_SYN_RECV); } +void tcp_write_queue_purge(struct sock *sk) +{ + struct sk_buff *skb; + + tcp_chrono_stop(sk, TCP_CHRONO_BUSY); + while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) { + tcp_skb_tsorted_anchor_cleanup(skb); + sk_wmem_free_skb(sk, skb); + } + INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue); + sk_mem_reclaim(sk); + tcp_clear_all_retrans_hints(tcp_sk(sk)); +} + int tcp_disconnect(struct sock *sk, int flags) { struct inet_sock *inet = inet_sk(sk); -- cgit v1.2.3 From 75c119afe14f74b4dd967d75ed9f57ab6c0ef045 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 5 Oct 2017 22:21:27 -0700 Subject: tcp: implement rb-tree based retransmit queue Using a linear list to store all skbs in the write queue has been okay for quite a while: O(N) is not too bad when N < 500. Things get messy when N is on the order of 100,000: modern TCP stacks want 10Gbit+ of throughput even with 200 ms RTT flows. 40 ns per cache line miss means a full scan can use 4 ms, blowing away CPU caches. SACK processing can often use various hints to avoid parsing the whole retransmit queue. But with high packet losses and/or high reordering, hints no longer work. The sender has to process thousands of unfriendly SACKs, accumulating a huge socket backlog, burning a CPU and massively dropping packets. Using an rb-tree for the retransmit queue has been avoided for years because it added complexity and overhead, but now is the time to be more resistant and say no to quadratic behavior. 1) The RTX queue is no longer part of the write queue: already-sent skbs are stored in one rb-tree. 2) Since reaching the head of the write queue no longer needs sk->sk_send_head, we added a union of sk_send_head and tcp_rtx_queue. Tested: On the receiver: netem on ingress: delay 150ms 200us loss 1 GRO disabled to force stress and SACK storms. for f in `seq 1 10` do ./netperf -H lpaa6 -l30 -- -K bbr -o THROUGHPUT|tail -1 done | awk '{print $0} {sum += $0} END {printf "%7u\n",sum}' Before patch: 323.87 351.48 339.59 338.62 306.72 204.07 304.93 291.88 202.47 176.88 2840 After patch: 1700.83 2207.98 2070.17 1544.26 2114.76 2124.89 1693.14 1080.91 2216.82 1299.94 18053 Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/net/sock.h | 7 ++- include/net/tcp.h | 89 ++++++++++++++++---------------- net/ipv4/tcp.c | 41 +++++++++++---- net/ipv4/tcp_input.c | 133 +++++++++++++++++++++++++----------------------- net/ipv4/tcp_ipv4.c | 2 +- net/ipv4/tcp_output.c | 137 +++++++++++++++++++++++++++----------------------- net/ipv4/tcp_timer.c | 24 +++++---- 7 files changed, 245 insertions(+), 188 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index a6b9a8d1a6df..4827094f1db4 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -60,7 +60,7 @@ #include #include #include - +#include #include #include #include @@ -397,7 +397,10 @@ struct sock { int sk_wmem_queued; refcount_t sk_wmem_alloc; unsigned long sk_tsq_flags; - struct sk_buff *sk_send_head; + union { + struct sk_buff *sk_send_head; + struct rb_root tcp_rtx_queue; + }; struct sk_buff_head sk_write_queue; __s32 sk_peek_off; int sk_write_pending; diff --git a/include/net/tcp.h b/include/net/tcp.h index 744559b72784..5a95e5886b55 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -551,7 +551,13 @@ void tcp_xmit_retransmit_queue(struct sock *); void tcp_simple_retransmit(struct sock *); void tcp_enter_recovery(struct sock *sk, bool ece_ack); int tcp_trim_head(struct sock *, struct sk_buff *, u32); -int tcp_fragment(struct sock *, struct sk_buff *, u32, unsigned int, gfp_t); +enum tcp_queue { + TCP_FRAG_IN_WRITE_QUEUE, + TCP_FRAG_IN_RTX_QUEUE, +}; +int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, + struct sk_buff *skb, u32 len, + unsigned int mss_now, gfp_t gfp); void tcp_send_probe0(struct sock *); void tcp_send_partial(struct sock *); @@ -1608,6 +1614,11 @@ static inline void tcp_skb_tsorted_anchor_cleanup(struct sk_buff *skb) void tcp_write_queue_purge(struct sock *sk); +static inline struct sk_buff *tcp_rtx_queue_head(const struct sock *sk) +{ + return skb_rb_first(&sk->tcp_rtx_queue); +} + static inline struct sk_buff *tcp_write_queue_head(const struct sock *sk) { return skb_peek(&sk->sk_write_queue); @@ -1630,18 +1641,12 @@ static inline struct sk_buff *tcp_write_queue_prev(const struct sock *sk, return skb_queue_prev(&sk->sk_write_queue, skb); } -#define tcp_for_write_queue(skb, sk) \ - skb_queue_walk(&(sk)->sk_write_queue, skb) - -#define tcp_for_write_queue_from(skb, sk) \ - skb_queue_walk_from(&(sk)->sk_write_queue, skb) - #define tcp_for_write_queue_from_safe(skb, tmp, sk) \ skb_queue_walk_from_safe(&(sk)->sk_write_queue, skb, tmp) static inline struct sk_buff *tcp_send_head(const struct sock *sk) { - return sk->sk_send_head; + return skb_peek(&sk->sk_write_queue); } static inline bool tcp_skb_is_last(const struct sock *sk, @@ -1650,29 +1655,30 @@ static inline bool tcp_skb_is_last(const struct sock *sk, return skb_queue_is_last(&sk->sk_write_queue, skb); } -static inline void tcp_advance_send_head(struct sock *sk, const struct sk_buff *skb) +static inline bool tcp_write_queue_empty(const struct sock *sk) { - if (tcp_skb_is_last(sk, skb)) - sk->sk_send_head = NULL; - else - sk->sk_send_head = tcp_write_queue_next(sk, skb); + return skb_queue_empty(&sk->sk_write_queue); +} + +static inline bool tcp_rtx_queue_empty(const struct sock *sk) +{ + return RB_EMPTY_ROOT(&sk->tcp_rtx_queue); +} + +static inline bool tcp_rtx_and_write_queues_empty(const struct sock *sk) +{ + return tcp_rtx_queue_empty(sk) && tcp_write_queue_empty(sk); } static inline void tcp_check_send_head(struct sock *sk, struct sk_buff *skb_unlinked) { - if (sk->sk_send_head == skb_unlinked) { - sk->sk_send_head = 
NULL; + if (tcp_write_queue_empty(sk)) tcp_chrono_stop(sk, TCP_CHRONO_BUSY); - } + if (tcp_sk(sk)->highest_sack == skb_unlinked) tcp_sk(sk)->highest_sack = NULL; } -static inline void tcp_init_send_head(struct sock *sk) -{ - sk->sk_send_head = NULL; -} - static inline void __tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb) { __skb_queue_tail(&sk->sk_write_queue, skb); @@ -1683,8 +1689,7 @@ static inline void tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb __tcp_add_write_queue_tail(sk, skb); /* Queue it, remembering where we must start sending. */ - if (sk->sk_send_head == NULL) { - sk->sk_send_head = skb; + if (sk->sk_write_queue.next == skb) { tcp_chrono_start(sk, TCP_CHRONO_BUSY); if (tcp_sk(sk)->highest_sack == NULL) @@ -1697,35 +1702,32 @@ static inline void __tcp_add_write_queue_head(struct sock *sk, struct sk_buff *s __skb_queue_head(&sk->sk_write_queue, skb); } -/* Insert buff after skb on the write queue of sk. */ -static inline void tcp_insert_write_queue_after(struct sk_buff *skb, - struct sk_buff *buff, - struct sock *sk) -{ - __skb_queue_after(&sk->sk_write_queue, skb, buff); -} - /* Insert new before skb on the write queue of sk. */ static inline void tcp_insert_write_queue_before(struct sk_buff *new, struct sk_buff *skb, struct sock *sk) { __skb_queue_before(&sk->sk_write_queue, skb, new); - - if (sk->sk_send_head == skb) - sk->sk_send_head = new; } static inline void tcp_unlink_write_queue(struct sk_buff *skb, struct sock *sk) { - list_del(&skb->tcp_tsorted_anchor); - tcp_skb_tsorted_anchor_cleanup(skb); __skb_unlink(skb, &sk->sk_write_queue); } -static inline bool tcp_write_queue_empty(struct sock *sk) +void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb); + +static inline void tcp_rtx_queue_unlink(struct sk_buff *skb, struct sock *sk) { - return skb_queue_empty(&sk->sk_write_queue); + tcp_skb_tsorted_anchor_cleanup(skb); + rb_erase(&skb->rbnode, &sk->tcp_rtx_queue); +} + +static inline void tcp_rtx_queue_unlink_and_free(struct sk_buff *skb, struct sock *sk) +{ + list_del(&skb->tcp_tsorted_anchor); + tcp_rtx_queue_unlink(skb, sk); + sk_wmem_free_skb(sk, skb); } static inline void tcp_push_pending_frames(struct sock *sk) @@ -1754,8 +1756,9 @@ static inline u32 tcp_highest_sack_seq(struct tcp_sock *tp) static inline void tcp_advance_highest_sack(struct sock *sk, struct sk_buff *skb) { - tcp_sk(sk)->highest_sack = tcp_skb_is_last(sk, skb) ? NULL : - tcp_write_queue_next(sk, skb); + struct sk_buff *next = skb_rb_next(skb); + + tcp_sk(sk)->highest_sack = next ?: tcp_send_head(sk); } static inline struct sk_buff *tcp_highest_sack(struct sock *sk) @@ -1765,7 +1768,9 @@ static inline struct sk_buff *tcp_highest_sack(struct sock *sk) static inline void tcp_highest_sack_reset(struct sock *sk) { - tcp_sk(sk)->highest_sack = tcp_write_queue_head(sk); + struct sk_buff *skb = tcp_rtx_queue_head(sk); + + tcp_sk(sk)->highest_sack = skb ?: tcp_send_head(sk); } /* Called when old skb is about to be deleted (to be combined with new skb) */ @@ -1935,7 +1940,7 @@ extern void tcp_rack_reo_timeout(struct sock *sk); /* At how many usecs into the future should the RTO fire? 
*/ static inline s64 tcp_rto_delta_us(const struct sock *sk) { - const struct sk_buff *skb = tcp_write_queue_head(sk); + const struct sk_buff *skb = tcp_rtx_queue_head(sk); u32 rto = inet_csk(sk)->icsk_rto; u64 rto_time_stamp_us = skb->skb_mstamp + jiffies_to_usecs(rto); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index b8d379c80936..3b34850d361f 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -413,6 +413,7 @@ void tcp_init_sock(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); tp->out_of_order_queue = RB_ROOT; + sk->tcp_rtx_queue = RB_ROOT; tcp_init_xmit_timers(sk); INIT_LIST_HEAD(&tp->tsq_node); INIT_LIST_HEAD(&tp->tsorted_sent_queue); @@ -701,10 +702,9 @@ static void tcp_push(struct sock *sk, int flags, int mss_now, struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - if (!tcp_send_head(sk)) - return; - skb = tcp_write_queue_tail(sk); + if (!skb) + return; if (!(flags & MSG_MORE) || forced_push(tp)) tcp_mark_push(tp, skb); @@ -964,14 +964,14 @@ ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset, int copy, i; bool can_coalesce; - if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0 || + if (!skb || (copy = size_goal - skb->len) <= 0 || !tcp_skb_can_collapse_to(skb)) { new_segment: if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, - skb_queue_empty(&sk->sk_write_queue)); + tcp_rtx_and_write_queues_empty(sk)); if (!skb) goto wait_for_memory; @@ -1199,7 +1199,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) goto out_err; } - skb = tcp_send_head(sk) ? tcp_write_queue_tail(sk) : NULL; + skb = tcp_write_queue_tail(sk); uarg = sock_zerocopy_realloc(sk, size, skb_zcopy(skb)); if (!uarg) { err = -ENOBUFS; @@ -1275,7 +1275,7 @@ restart: int max = size_goal; skb = tcp_write_queue_tail(sk); - if (tcp_send_head(sk)) { + if (skb) { if (skb->ip_summed == CHECKSUM_NONE) max = mss_now; copy = max - skb->len; @@ -1295,7 +1295,7 @@ new_segment: process_backlog = false; goto restart; } - first_skb = skb_queue_empty(&sk->sk_write_queue); + first_skb = tcp_rtx_and_write_queues_empty(sk); skb = sk_stream_alloc_skb(sk, select_size(sk, sg, first_skb), sk->sk_allocation, @@ -1521,6 +1521,13 @@ static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len) /* XXX -- need to support SO_PEEK_OFF */ + skb_rbtree_walk(skb, &sk->tcp_rtx_queue) { + err = skb_copy_datagram_msg(skb, 0, msg, skb->len); + if (err) + return err; + copied += skb->len; + } + skb_queue_walk(&sk->sk_write_queue, skb) { err = skb_copy_datagram_msg(skb, 0, msg, skb->len); if (err) @@ -2320,6 +2327,22 @@ static inline bool tcp_need_reset(int state) TCPF_FIN_WAIT2 | TCPF_SYN_RECV); } +static void tcp_rtx_queue_purge(struct sock *sk) +{ + struct rb_node *p = rb_first(&sk->tcp_rtx_queue); + + while (p) { + struct sk_buff *skb = rb_to_skb(p); + + p = rb_next(p); + /* Since we are deleting whole queue, no need to + * list_del(&skb->tcp_tsorted_anchor) + */ + tcp_rtx_queue_unlink(skb, sk); + sk_wmem_free_skb(sk, skb); + } +} + void tcp_write_queue_purge(struct sock *sk) { struct sk_buff *skb; @@ -2329,6 +2352,7 @@ void tcp_write_queue_purge(struct sock *sk) tcp_skb_tsorted_anchor_cleanup(skb); sk_wmem_free_skb(sk, skb); } + tcp_rtx_queue_purge(sk); INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue); sk_mem_reclaim(sk); tcp_clear_all_retrans_hints(tcp_sk(sk)); @@ -2392,7 +2416,6 @@ int tcp_disconnect(struct sock *sk, int flags) * issue in __tcp_select_window() */ icsk->icsk_ack.rcv_mss = TCP_MIN_MSS; - tcp_init_send_head(sk); 
memset(&tp->rx_opt, 0, sizeof(tp->rx_opt)); __sk_dst_reset(sk); dst_release(sk->sk_rx_dst); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 72c4732ae2da..d0682ce2a5d6 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1142,6 +1142,7 @@ struct tcp_sacktag_state { u64 last_sackt; struct rate_sample *rate; int flag; + unsigned int mss_now; }; /* Check if skb is fully within the SACK block. In presence of GSO skbs, @@ -1191,7 +1192,8 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, if (pkt_len >= skb->len && !in_sack) return 0; - err = tcp_fragment(sk, skb, pkt_len, mss, GFP_ATOMIC); + err = tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, + pkt_len, mss, GFP_ATOMIC); if (err < 0) return err; } @@ -1363,8 +1365,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, if (unlikely(TCP_SKB_CB(prev)->tx.delivered_mstamp)) TCP_SKB_CB(prev)->tx.delivered_mstamp = 0; - tcp_unlink_write_queue(skb, sk); - sk_wmem_free_skb(sk, skb); + tcp_rtx_queue_unlink_and_free(skb, sk); NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKMERGED); @@ -1414,9 +1415,9 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, goto fallback; /* Can only happen with delayed DSACK + discard craziness */ - if (unlikely(skb == tcp_write_queue_head(sk))) + prev = skb_rb_prev(skb); + if (!prev) goto fallback; - prev = tcp_write_queue_prev(sk, skb); if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) goto fallback; @@ -1501,12 +1502,11 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, /* Hole filled allows collapsing with the next as well, this is very * useful when hole on every nth skb pattern happens */ - if (prev == tcp_write_queue_tail(sk)) + skb = skb_rb_next(prev); + if (!skb) goto out; - skb = tcp_write_queue_next(sk, prev); if (!skb_can_shift(skb) || - (skb == tcp_send_head(sk)) || ((TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) || (mss != tcp_skb_seglen(skb))) goto out; @@ -1539,13 +1539,10 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *tmp; - tcp_for_write_queue_from(skb, sk) { + skb_rbtree_walk_from(skb) { int in_sack = 0; bool dup_sack = dup_sack_in; - if (skb == tcp_send_head(sk)) - break; - /* queue is in-order => we can short-circuit the walk early */ if (!before(TCP_SKB_CB(skb)->seq, end_seq)) break; @@ -1607,23 +1604,44 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, return skb; } -/* Avoid all extra work that is being done by sacktag while walking in - * a normal way - */ +static struct sk_buff *tcp_sacktag_bsearch(struct sock *sk, + struct tcp_sacktag_state *state, + u32 seq) +{ + struct rb_node *parent, **p = &sk->tcp_rtx_queue.rb_node; + struct sk_buff *skb; + int unack_bytes; + + while (*p) { + parent = *p; + skb = rb_to_skb(parent); + if (before(seq, TCP_SKB_CB(skb)->seq)) { + p = &parent->rb_left; + continue; + } + if (!before(seq, TCP_SKB_CB(skb)->end_seq)) { + p = &parent->rb_right; + continue; + } + + state->fack_count = 0; + unack_bytes = TCP_SKB_CB(skb)->seq - tcp_sk(sk)->snd_una; + if (state->mss_now && unack_bytes > 0) + state->fack_count = unack_bytes / state->mss_now; + + return skb; + } + return NULL; +} + static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk, struct tcp_sacktag_state *state, u32 skip_to_seq) { - tcp_for_write_queue_from(skb, sk) { - if (skb == tcp_send_head(sk)) - break; - - if 
(after(TCP_SKB_CB(skb)->end_seq, skip_to_seq)) - break; + if (skb && after(TCP_SKB_CB(skb)->seq, skip_to_seq)) + return skb; - state->fack_count += tcp_skb_pcount(skb); - } - return skb; + return tcp_sacktag_bsearch(sk, state, skip_to_seq); } static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb, @@ -1745,8 +1763,9 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, } } - skb = tcp_write_queue_head(sk); + state->mss_now = tcp_current_mss(sk); state->fack_count = 0; + skb = NULL; i = 0; if (!tp->sacked_out) { @@ -1970,7 +1989,7 @@ void tcp_enter_loss(struct sock *sk) if (tcp_is_reno(tp)) tcp_reset_reno_sack(tp); - skb = tcp_write_queue_head(sk); + skb = tcp_rtx_queue_head(sk); is_reneg = skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED); if (is_reneg) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING); @@ -1979,10 +1998,7 @@ void tcp_enter_loss(struct sock *sk) } tcp_clear_all_retrans_hints(tp); - tcp_for_write_queue(skb, sk) { - if (skb == tcp_send_head(sk)) - break; - + skb_rbtree_walk_from(skb) { mark_lost = (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) || is_reneg); if (mark_lost) @@ -2215,13 +2231,11 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) return; cnt = tp->lost_cnt_hint; } else { - skb = tcp_write_queue_head(sk); + skb = tcp_rtx_queue_head(sk); cnt = 0; } - tcp_for_write_queue_from(skb, sk) { - if (skb == tcp_send_head(sk)) - break; + skb_rbtree_walk_from(skb) { /* TODO: do this better */ /* this is not the most efficient way to do this... */ tp->lost_skb_hint = skb; @@ -2245,7 +2259,8 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) /* If needed, chop off the prefix to mark as lost. */ lost = (packets - oldcnt) * mss; if (lost < skb->len && - tcp_fragment(sk, skb, lost, mss, GFP_ATOMIC) < 0) + tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, + lost, mss, GFP_ATOMIC) < 0) break; cnt = packets; } @@ -2329,7 +2344,7 @@ static bool tcp_any_retrans_done(const struct sock *sk) if (tp->retrans_out) return true; - skb = tcp_write_queue_head(sk); + skb = tcp_rtx_queue_head(sk); if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS)) return true; @@ -2370,9 +2385,7 @@ static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss) if (unmark_loss) { struct sk_buff *skb; - tcp_for_write_queue(skb, sk) { - if (skb == tcp_send_head(sk)) - break; + skb_rbtree_walk(skb, &sk->tcp_rtx_queue) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; } tp->lost_out = 0; @@ -2617,9 +2630,7 @@ void tcp_simple_retransmit(struct sock *sk) unsigned int mss = tcp_current_mss(sk); u32 prior_lost = tp->lost_out; - tcp_for_write_queue(skb, sk) { - if (skb == tcp_send_head(sk)) - break; + skb_rbtree_walk(skb, &sk->tcp_rtx_queue) { if (tcp_skb_seglen(skb) > mss && !(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) { if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { @@ -2713,7 +2724,7 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack, * is updated in tcp_ack()). Otherwise fall back to * the conventional recovery. 
*/ - if (tcp_send_head(sk) && + if (!tcp_write_queue_empty(sk) && after(tcp_wnd_end(tp), tp->snd_nxt)) { *rexmit = REXMIT_NEW; return; @@ -3077,11 +3088,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, struct tcp_sock *tp = tcp_sk(sk); u32 prior_sacked = tp->sacked_out; u32 reord = tp->packets_out; + struct sk_buff *skb, *next; bool fully_acked = true; long sack_rtt_us = -1L; long seq_rtt_us = -1L; long ca_rtt_us = -1L; - struct sk_buff *skb; u32 pkts_acked = 0; u32 last_in_flight = 0; bool rtt_update; @@ -3089,7 +3100,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, first_ackt = 0; - while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) { + for (skb = skb_rb_first(&sk->tcp_rtx_queue); skb; skb = next) { struct tcp_skb_cb *scb = TCP_SKB_CB(skb); u8 sacked = scb->sacked; u32 acked_pcount; @@ -3107,8 +3118,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, break; fully_acked = false; } else { - /* Speedup tcp_unlink_write_queue() and next loop */ - prefetchw(skb->next); acked_pcount = tcp_skb_pcount(skb); } @@ -3160,12 +3169,12 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, if (!fully_acked) break; - tcp_unlink_write_queue(skb, sk); - sk_wmem_free_skb(sk, skb); + next = skb_rb_next(skb); if (unlikely(skb == tp->retransmit_skb_hint)) tp->retransmit_skb_hint = NULL; if (unlikely(skb == tp->lost_skb_hint)) tp->lost_skb_hint = NULL; + tcp_rtx_queue_unlink_and_free(skb, sk); } if (!skb) @@ -3257,12 +3266,14 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, static void tcp_ack_probe(struct sock *sk) { - const struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); + struct sk_buff *head = tcp_send_head(sk); + const struct tcp_sock *tp = tcp_sk(sk); /* Was it a usable window open? */ - - if (!after(TCP_SKB_CB(tcp_send_head(sk))->end_seq, tcp_wnd_end(tp))) { + if (!head) + return; + if (!after(TCP_SKB_CB(head)->end_seq, tcp_wnd_end(tp))) { icsk->icsk_backoff = 0; inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0); /* Socket must be waked up by subsequent tcp_data_snd_check(). @@ -3382,7 +3393,7 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 tp->pred_flags = 0; tcp_fast_path_check(sk); - if (tcp_send_head(sk)) + if (!tcp_write_queue_empty(sk)) tcp_slow_start_after_idle_check(sk); if (nwin > tp->max_window) { @@ -3567,8 +3578,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) sack_state.first_sackt = 0; sack_state.rate = &rs; - /* We very likely will need to access write queue head. */ - prefetchw(sk->sk_write_queue.next); + /* We very likely will need to access rtx queue. */ + prefetch(sk->tcp_rtx_queue.rb_node); /* If the ack is older than previous acks * then we can probably ignore it. @@ -3682,8 +3693,7 @@ no_queue: * being used to time the probes, and is probably far higher than * it needs to be for normal retransmission. 
*/ - if (tcp_send_head(sk)) - tcp_ack_probe(sk); + tcp_ack_probe(sk); if (tp->tlp_high_seq) tcp_process_tlp_ack(sk, ack, flag); @@ -4726,7 +4736,7 @@ static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb, } /* Insert skb into rb tree, ordered by TCP_SKB_CB(skb)->seq */ -static void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb) +void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb) { struct rb_node **p = &root->rb_node; struct rb_node *parent = NULL; @@ -5530,7 +5540,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, struct tcp_fastopen_cookie *cookie) { struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *data = tp->syn_data ? tcp_write_queue_head(sk) : NULL; + struct sk_buff *data = tp->syn_data ? tcp_rtx_queue_head(sk) : NULL; u16 mss = tp->rx_opt.mss_clamp, try_exp = 0; bool syn_drop = false; @@ -5565,9 +5575,8 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp); if (data) { /* Retransmit unacked data in SYN */ - tcp_for_write_queue_from(data, sk) { - if (data == tcp_send_head(sk) || - __tcp_retransmit_skb(sk, data, 1)) + skb_rbtree_walk_from(data) { + if (__tcp_retransmit_skb(sk, data, 1)) break; } tcp_rearm_rto(sk); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index c7460fd90884..5418ecf03b78 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -480,7 +480,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) TCP_TIMEOUT_INIT; icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX); - skb = tcp_write_queue_head(sk); + skb = tcp_rtx_queue_head(sk); BUG_ON(!skb); tcp_mstamp_refresh(tp); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 8162e2880178..696b0a168f16 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -66,15 +66,17 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, int push_one, gfp_t gfp); /* Account for new data that has been sent to the network. */ -static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb) +static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); unsigned int prior_packets = tp->packets_out; - tcp_advance_send_head(sk, skb); tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; + __skb_unlink(skb, &sk->sk_write_queue); + tcp_rbtree_insert(&sk->tcp_rtx_queue, skb); + tp->packets_out += tcp_skb_pcount(skb); if (!prior_packets || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) tcp_rearm_rto(sk); @@ -1249,12 +1251,25 @@ static void tcp_skb_fragment_eor(struct sk_buff *skb, struct sk_buff *skb2) TCP_SKB_CB(skb)->eor = 0; } +/* Insert buff after skb on the write or rtx queue of sk. */ +static void tcp_insert_write_queue_after(struct sk_buff *skb, + struct sk_buff *buff, + struct sock *sk, + enum tcp_queue tcp_queue) +{ + if (tcp_queue == TCP_FRAG_IN_WRITE_QUEUE) + __skb_queue_after(&sk->sk_write_queue, skb, buff); + else + tcp_rbtree_insert(&sk->tcp_rtx_queue, buff); +} + /* Function to create two new TCP segments. Shrinks the given segment * to the specified size and appends a new segment with the rest of the * packet to the list. This won't be called frequently, I hope. * Remember, these are still headerless SKBs at this point. 
*/ -int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, +int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, + struct sk_buff *skb, u32 len, unsigned int mss_now, gfp_t gfp) { struct tcp_sock *tp = tcp_sk(sk); @@ -1337,7 +1352,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, /* Link BUFF into the send queue. */ __skb_header_release(buff); - tcp_insert_write_queue_after(skb, buff, sk); + tcp_insert_write_queue_after(skb, buff, sk, tcp_queue); list_add(&buff->tcp_tsorted_anchor, &skb->tcp_tsorted_anchor); return 0; @@ -1625,10 +1640,10 @@ static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited) * is caused by insufficient sender buffer: * 1) just sent some data (see tcp_write_xmit) * 2) not cwnd limited (this else condition) - * 3) no more data to send (null tcp_send_head ) + * 3) no more data to send (tcp_write_queue_empty()) * 4) application is hitting buffer limit (SOCK_NOSPACE) */ - if (!tcp_send_head(sk) && sk->sk_socket && + if (tcp_write_queue_empty(sk) && sk->sk_socket && test_bit(SOCK_NOSPACE, &sk->sk_socket->flags) && (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) tcp_chrono_start(sk, TCP_CHRONO_SNDBUF_LIMITED); @@ -1824,7 +1839,8 @@ static bool tcp_snd_wnd_test(const struct tcp_sock *tp, * know that all the data is in scatter-gather pages, and that the * packet has never been sent out before (and thus is not cloned). */ -static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, +static int tso_fragment(struct sock *sk, enum tcp_queue tcp_queue, + struct sk_buff *skb, unsigned int len, unsigned int mss_now, gfp_t gfp) { struct sk_buff *buff; @@ -1833,7 +1849,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, /* All of a TSO frame must be composed of paged data. */ if (skb->len != skb->data_len) - return tcp_fragment(sk, skb, len, mss_now, gfp); + return tcp_fragment(sk, tcp_queue, skb, len, mss_now, gfp); buff = sk_stream_alloc_skb(sk, 0, gfp, true); if (unlikely(!buff)) @@ -1869,7 +1885,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, /* Link BUFF into the send queue. */ __skb_header_release(buff); - tcp_insert_write_queue_after(skb, buff, sk); + tcp_insert_write_queue_after(skb, buff, sk, tcp_queue); return 0; } @@ -1939,8 +1955,10 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, goto send_now; } - head = tcp_write_queue_head(sk); - + /* TODO : use tsorted_sent_queue ? */ + head = tcp_rtx_queue_head(sk); + if (!head) + goto send_now; age = tcp_stamp_us_delta(tp->tcp_mstamp, head->skb_mstamp); /* If next ACK is likely to come too late (half srtt), do not defer */ if (age < (tp->srtt_us >> 4)) @@ -2158,13 +2176,12 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb, limit <<= factor; if (refcount_read(&sk->sk_wmem_alloc) > limit) { - /* Always send the 1st or 2nd skb in write queue. + /* Always send skb if rtx queue is empty. * No need to wait for TX completion to call us back, * after softirq/tasklet schedule. * This helps when TX completions are delayed too much. */ - if (skb == sk->sk_write_queue.next || - skb->prev == sk->sk_write_queue.next) + if (tcp_rtx_queue_empty(sk)) return false; set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags); @@ -2215,7 +2232,7 @@ void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type) * it's the "most interesting" or current chrono we are * tracking and starts busy chrono if we have pending data. 
*/ - if (tcp_write_queue_empty(sk)) + if (tcp_rtx_and_write_queues_empty(sk)) tcp_chrono_set(tp, TCP_CHRONO_UNSPEC); else if (type == tp->chrono_type) tcp_chrono_set(tp, TCP_CHRONO_BUSY); @@ -2310,7 +2327,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, nonagle); if (skb->len > limit && - unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) + unlikely(tso_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE, + skb, limit, mss_now, gfp))) break; if (test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags)) @@ -2350,7 +2368,7 @@ repair: tcp_cwnd_validate(sk, is_cwnd_limited); return false; } - return !tp->packets_out && tcp_send_head(sk); + return !tp->packets_out && !tcp_write_queue_empty(sk); } bool tcp_schedule_loss_probe(struct sock *sk) @@ -2374,7 +2392,7 @@ bool tcp_schedule_loss_probe(struct sock *sk) return false; if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) && - tcp_send_head(sk)) + !tcp_write_queue_empty(sk)) return false; /* Probe timeout is 2*rtt. Add minimum RTO to account @@ -2427,18 +2445,14 @@ void tcp_send_loss_probe(struct sock *sk) int mss = tcp_current_mss(sk); skb = tcp_send_head(sk); - if (skb) { - if (tcp_snd_wnd_test(tp, skb, mss)) { - pcount = tp->packets_out; - tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC); - if (tp->packets_out > pcount) - goto probe_sent; - goto rearm_timer; - } - skb = tcp_write_queue_prev(sk, skb); - } else { - skb = tcp_write_queue_tail(sk); + if (skb && tcp_snd_wnd_test(tp, skb, mss)) { + pcount = tp->packets_out; + tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC); + if (tp->packets_out > pcount) + goto probe_sent; + goto rearm_timer; } + skb = skb_rb_last(&sk->tcp_rtx_queue); /* At most one outstanding TLP retransmission. */ if (tp->tlp_high_seq) @@ -2456,10 +2470,11 @@ void tcp_send_loss_probe(struct sock *sk) goto rearm_timer; if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) { - if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss, + if (unlikely(tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, + (pcount - 1) * mss, mss, GFP_ATOMIC))) goto rearm_timer; - skb = tcp_write_queue_next(sk, skb); + skb = skb_rb_next(skb); } if (WARN_ON(!skb || !tcp_skb_pcount(skb))) @@ -2659,7 +2674,7 @@ void tcp_skb_collapse_tstamp(struct sk_buff *skb, static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *next_skb = tcp_write_queue_next(sk, skb); + struct sk_buff *next_skb = skb_rb_next(skb); int skb_size, next_skb_size; skb_size = skb->len; @@ -2676,8 +2691,6 @@ static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) } tcp_highest_sack_combine(sk, next_skb, skb); - tcp_unlink_write_queue(next_skb, sk); - if (next_skb->ip_summed == CHECKSUM_PARTIAL) skb->ip_summed = CHECKSUM_PARTIAL; @@ -2705,7 +2718,7 @@ static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) tcp_skb_collapse_tstamp(skb, next_skb); - sk_wmem_free_skb(sk, next_skb); + tcp_rtx_queue_unlink_and_free(next_skb, sk); return true; } @@ -2716,8 +2729,6 @@ static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb) return false; if (skb_cloned(skb)) return false; - if (skb == tcp_send_head(sk)) - return false; /* Some heuristics for collapsing over SACK'd could be invented */ if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) return false; @@ -2740,7 +2751,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to, if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) return; - tcp_for_write_queue_from_safe(skb, tmp, sk) { + 
skb_rbtree_walk_from_safe(skb, tmp) { if (!tcp_can_collapse(sk, skb)) break; @@ -2815,7 +2826,8 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) len = cur_mss * segs; if (skb->len > len) { - if (tcp_fragment(sk, skb, len, cur_mss, GFP_ATOMIC)) + if (tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, len, + cur_mss, GFP_ATOMIC)) return -ENOMEM; /* We'll try again later. */ } else { if (skb_unclone(skb, GFP_ATOMIC)) @@ -2906,29 +2918,24 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) void tcp_xmit_retransmit_queue(struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); + struct sk_buff *skb, *rtx_head = NULL, *hole = NULL; struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb; - struct sk_buff *hole = NULL; u32 max_segs; int mib_idx; if (!tp->packets_out) return; - if (tp->retransmit_skb_hint) { - skb = tp->retransmit_skb_hint; - } else { - skb = tcp_write_queue_head(sk); + skb = tp->retransmit_skb_hint; + if (!skb) { + rtx_head = tcp_rtx_queue_head(sk); + skb = rtx_head; } - max_segs = tcp_tso_segs(sk, tcp_current_mss(sk)); - tcp_for_write_queue_from(skb, sk) { + skb_rbtree_walk_from(skb) { __u8 sacked; int segs; - if (skb == tcp_send_head(sk)) - break; - if (tcp_pacing_check(sk)) break; @@ -2973,7 +2980,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk) if (tcp_in_cwnd_reduction(sk)) tp->prr_out += tcp_skb_pcount(skb); - if (skb == tcp_write_queue_head(sk) && + if (skb == rtx_head && icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT) inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, @@ -3015,12 +3022,15 @@ void tcp_send_fin(struct sock *sk) * Note: in the latter case, FIN packet will be sent after a timeout, * as TCP stack thinks it has already been transmitted. */ - if (tskb && (tcp_send_head(sk) || tcp_under_memory_pressure(sk))) { + if (!tskb && tcp_under_memory_pressure(sk)) + tskb = skb_rb_last(&sk->tcp_rtx_queue); + + if (tskb) { coalesce: TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN; TCP_SKB_CB(tskb)->end_seq++; tp->write_seq++; - if (!tcp_send_head(sk)) { + if (tcp_write_queue_empty(sk)) { /* This means tskb was already sent. * Pretend we included the FIN on previous transmit. 
* We need to set tp->snd_nxt to the value it would have @@ -3086,9 +3096,9 @@ int tcp_send_synack(struct sock *sk) { struct sk_buff *skb; - skb = tcp_write_queue_head(sk); + skb = tcp_rtx_queue_head(sk); if (!skb || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { - pr_debug("%s: wrong queue state\n", __func__); + pr_err("%s: wrong queue state\n", __func__); return -EFAULT; } if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) { @@ -3101,10 +3111,9 @@ int tcp_send_synack(struct sock *sk) if (!nskb) return -ENOMEM; INIT_LIST_HEAD(&nskb->tcp_tsorted_anchor); - tcp_unlink_write_queue(skb, sk); + tcp_rtx_queue_unlink_and_free(skb, sk); __skb_header_release(nskb); - __tcp_add_write_queue_head(sk, nskb); - sk_wmem_free_skb(sk, skb); + tcp_rbtree_insert(&sk->tcp_rtx_queue, nskb); sk->sk_wmem_queued += nskb->truesize; sk_mem_charge(sk, nskb->truesize); skb = nskb; @@ -3327,7 +3336,6 @@ static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb) tcb->end_seq += skb->len; __skb_header_release(skb); - __tcp_add_write_queue_tail(sk, skb); sk->sk_wmem_queued += skb->truesize; sk_mem_charge(sk, skb->truesize); tp->write_seq = tcb->end_seq; @@ -3405,12 +3413,13 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) TCP_SKB_CB(syn_data)->tcp_flags = TCPHDR_ACK | TCPHDR_PSH; if (!err) { tp->syn_data = (fo->copied > 0); + tcp_rbtree_insert(&sk->tcp_rtx_queue, syn_data); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT); goto done; } - /* data was not sent, this is our new send_head */ - sk->sk_send_head = syn_data; + /* data was not sent, put it in write_queue */ + __skb_queue_tail(&sk->sk_write_queue, syn_data); tp->packets_out -= tcp_skb_pcount(syn_data); fallback: @@ -3453,6 +3462,7 @@ int tcp_connect(struct sock *sk) tp->retrans_stamp = tcp_time_stamp(tp); tcp_connect_queue_skb(sk, buff); tcp_ecn_send_syn(sk, buff); + tcp_rbtree_insert(&sk->tcp_rtx_queue, buff); /* Send off SYN; include data in Fast Open. */ err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) : @@ -3647,7 +3657,8 @@ int tcp_write_wakeup(struct sock *sk, int mib) skb->len > mss) { seg_size = min(seg_size, mss); TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; - if (tcp_fragment(sk, skb, seg_size, mss, GFP_ATOMIC)) + if (tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE, + skb, seg_size, mss, GFP_ATOMIC)) return -1; } else if (!tcp_skb_pcount(skb)) tcp_set_skb_tso_segs(skb, mss); @@ -3677,7 +3688,7 @@ void tcp_send_probe0(struct sock *sk) err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE); - if (tp->packets_out || !tcp_send_head(sk)) { + if (tp->packets_out || tcp_write_queue_empty(sk)) { /* Cancel probe timer, if it is not required. 
*/ icsk->icsk_probes_out = 0; icsk->icsk_backoff = 0; diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 655dd8d7f064..7014cc00c74c 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -156,8 +156,13 @@ static bool retransmits_timed_out(struct sock *sk, return false; start_ts = tcp_sk(sk)->retrans_stamp; - if (unlikely(!start_ts)) - start_ts = tcp_skb_timestamp(tcp_write_queue_head(sk)); + if (unlikely(!start_ts)) { + struct sk_buff *head = tcp_rtx_queue_head(sk); + + if (!head) + return false; + start_ts = tcp_skb_timestamp(head); + } if (likely(timeout == 0)) { linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base); @@ -304,11 +309,12 @@ static void tcp_delack_timer(unsigned long data) static void tcp_probe_timer(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); + struct sk_buff *skb = tcp_send_head(sk); struct tcp_sock *tp = tcp_sk(sk); int max_probes; u32 start_ts; - if (tp->packets_out || !tcp_send_head(sk)) { + if (tp->packets_out || !skb) { icsk->icsk_probes_out = 0; return; } @@ -321,9 +327,9 @@ static void tcp_probe_timer(struct sock *sk) * corresponding system limit. We also implement similar policy when * we use RTO to probe window in tcp_retransmit_timer(). */ - start_ts = tcp_skb_timestamp(tcp_send_head(sk)); + start_ts = tcp_skb_timestamp(skb); if (!start_ts) - tcp_send_head(sk)->skb_mstamp = tp->tcp_mstamp; + skb->skb_mstamp = tp->tcp_mstamp; else if (icsk->icsk_user_timeout && (s32)(tcp_time_stamp(tp) - start_ts) > jiffies_to_msecs(icsk->icsk_user_timeout)) @@ -408,7 +414,7 @@ void tcp_retransmit_timer(struct sock *sk) if (!tp->packets_out) goto out; - WARN_ON(tcp_write_queue_empty(sk)); + WARN_ON(tcp_rtx_queue_empty(sk)); tp->tlp_high_seq = 0; @@ -441,7 +447,7 @@ void tcp_retransmit_timer(struct sock *sk) goto out; } tcp_enter_loss(sk); - tcp_retransmit_skb(sk, tcp_write_queue_head(sk), 1); + tcp_retransmit_skb(sk, tcp_rtx_queue_head(sk), 1); __sk_dst_reset(sk); goto out_reset_timer; } @@ -473,7 +479,7 @@ void tcp_retransmit_timer(struct sock *sk) tcp_enter_loss(sk); - if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk), 1) > 0) { + if (tcp_retransmit_skb(sk, tcp_rtx_queue_head(sk), 1) > 0) { /* Retransmission failed because of local congestion, * do not backoff. */ @@ -647,7 +653,7 @@ static void tcp_keepalive_timer (unsigned long data) elapsed = keepalive_time_when(tp); /* It is alive without keepalive 8) */ - if (tp->packets_out || tcp_send_head(sk)) + if (tp->packets_out || !tcp_write_queue_empty(sk)) goto resched; elapsed = keepalive_time_elapsed(tp); -- cgit v1.2.3 From 180ca444b985c42948fa26abd278e616b5ce7eb2 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 6 Oct 2017 12:05:56 -0700 Subject: ipv6: introduce a new function fib6_update_sernum() This function takes a route as input and tries to update the sernum in the fib6_node this route is associated with. It will be used in a later commit when adding a cached route into the exception table under that route. Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Signed-off-by: Eric Dumazet Signed-off-by: David S.
Miller --- include/net/ip6_fib.h | 2 ++ net/ipv6/ip6_fib.c | 14 ++++++++++++++ 2 files changed, 16 insertions(+) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index d060d711a624..152b7b14a5a5 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -358,6 +358,8 @@ void __net_exit fib6_notifier_exit(struct net *net); unsigned int fib6_tables_seq_read(struct net *net); int fib6_tables_dump(struct net *net, struct notifier_block *nb); +void fib6_update_sernum(struct rt6_info *rt); + #ifdef CONFIG_IPV6_MULTIPLE_TABLES int fib6_rules_init(void); void fib6_rules_cleanup(void); diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index e5308d7cbd75..0ba4fbb2f855 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -110,6 +110,20 @@ enum { FIB6_NO_SERNUM_CHANGE = 0, }; +void fib6_update_sernum(struct rt6_info *rt) +{ + struct fib6_table *table = rt->rt6i_table; + struct net *net = dev_net(rt->dst.dev); + struct fib6_node *fn; + + write_lock_bh(&table->tb6_lock); + fn = rcu_dereference_protected(rt->rt6i_node, + lockdep_is_held(&table->tb6_lock)); + if (fn) + fn->fn_sernum = fib6_new_sernum(net); + write_unlock_bh(&table->tb6_lock); +} + /* * Auxiliary address test functions for the radix tree. * -- cgit v1.2.3 From 35732d01fe311ec13c4e42936878b782b8e7ea85 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 6 Oct 2017 12:05:57 -0700 Subject: ipv6: introduce a hash table to store dst cache Add a hash table into struct rt6_info in order to store dst caches created by pmtu discovery and ip redirect in ipv6 routing code. APIs to add dst cache, delete dst cache, find dst cache and update dst cache in the hash table are implemented and will be used in later commits. This is preparation work to move all cache routes into the exception table instead of getting inserted into the fib6 tree. Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Signed-off-by: Eric Dumazet Signed-off-by: David S.
Miller --- include/net/ip6_fib.h | 19 +++ include/net/ip6_route.h | 3 + net/ipv6/route.c | 341 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 363 insertions(+) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 152b7b14a5a5..c4864c1e8f13 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -98,6 +98,22 @@ struct rt6key { struct fib6_table; +struct rt6_exception_bucket { + struct hlist_head chain; + int depth; +}; + +struct rt6_exception { + struct hlist_node hlist; + struct rt6_info *rt6i; + unsigned long stamp; + struct rcu_head rcu; +}; + +#define FIB6_EXCEPTION_BUCKET_SIZE_SHIFT 10 +#define FIB6_EXCEPTION_BUCKET_SIZE (1 << FIB6_EXCEPTION_BUCKET_SIZE_SHIFT) +#define FIB6_MAX_DEPTH 5 + struct rt6_info { struct dst_entry dst; @@ -134,12 +150,15 @@ struct rt6_info { struct inet6_dev *rt6i_idev; struct rt6_info * __percpu *rt6i_pcpu; + struct rt6_exception_bucket __rcu *rt6i_exception_bucket; u32 rt6i_metric; u32 rt6i_pmtu; /* more non-fragment space at head required */ unsigned short rt6i_nfheader_len; u8 rt6i_protocol; + u8 exception_bucket_flushed:1, + unused:7; }; static inline struct inet6_dev *ip6_dst_idev(struct dst_entry *dst) diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index ee96f402cb75..3315605f34c9 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -95,6 +95,9 @@ int ip6_route_add(struct fib6_config *cfg, struct netlink_ext_ack *extack); int ip6_ins_rt(struct rt6_info *); int ip6_del_rt(struct rt6_info *); +void rt6_flush_exceptions(struct rt6_info *rt); +int rt6_remove_exception_rt(struct rt6_info *rt); + static inline int ip6_route_get_saddr(struct net *net, struct rt6_info *rt, const struct in6_addr *daddr, unsigned int prefs, diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 26cc9f483b6d..dc5e70975966 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include #include @@ -104,6 +105,9 @@ static int rt6_fill_node(struct net *net, struct in6_addr *dst, struct in6_addr *src, int iif, int type, u32 portid, u32 seq, unsigned int flags); +static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt, + struct in6_addr *daddr, + struct in6_addr *saddr); #ifdef CONFIG_IPV6_ROUTE_INFO static struct rt6_info *rt6_add_route_info(struct net *net, @@ -392,6 +396,7 @@ EXPORT_SYMBOL(ip6_dst_alloc); static void ip6_dst_destroy(struct dst_entry *dst) { struct rt6_info *rt = (struct rt6_info *)dst; + struct rt6_exception_bucket *bucket; struct dst_entry *from = dst->from; struct inet6_dev *idev; @@ -404,6 +409,11 @@ static void ip6_dst_destroy(struct dst_entry *dst) rt->rt6i_idev = NULL; in6_dev_put(idev); } + bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1); + if (bucket) { + rt->rt6i_exception_bucket = NULL; + kfree(bucket); + } dst->from = NULL; dst_release(from); @@ -1091,6 +1101,337 @@ static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt) return pcpu_rt; } +/* exception hash table implementation + */ +static DEFINE_SPINLOCK(rt6_exception_lock); + +/* Remove rt6_ex from hash table and free the memory + * Caller must hold rt6_exception_lock + */ +static void rt6_remove_exception(struct rt6_exception_bucket *bucket, + struct rt6_exception *rt6_ex) +{ + if (!bucket || !rt6_ex) + return; + rt6_ex->rt6i->rt6i_node = NULL; + hlist_del_rcu(&rt6_ex->hlist); + rt6_release(rt6_ex->rt6i); + kfree_rcu(rt6_ex, rcu); + WARN_ON_ONCE(!bucket->depth); + bucket->depth--; +} + +/* Remove oldest rt6_ex in 
bucket and free the memory + * Caller must hold rt6_exception_lock + */ +static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket) +{ + struct rt6_exception *rt6_ex, *oldest = NULL; + + if (!bucket) + return; + + hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) { + if (!oldest || time_before(rt6_ex->stamp, oldest->stamp)) + oldest = rt6_ex; + } + rt6_remove_exception(bucket, oldest); +} + +static u32 rt6_exception_hash(const struct in6_addr *dst, + const struct in6_addr *src) +{ + static u32 seed __read_mostly; + u32 val; + + net_get_random_once(&seed, sizeof(seed)); + val = jhash(dst, sizeof(*dst), seed); + +#ifdef CONFIG_IPV6_SUBTREES + if (src) + val = jhash(src, sizeof(*src), val); +#endif + return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT); +} + +/* Helper function to find the cached rt in the hash table + * and update bucket pointer to point to the bucket for this + * (daddr, saddr) pair + * Caller must hold rt6_exception_lock + */ +static struct rt6_exception * +__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket, + const struct in6_addr *daddr, + const struct in6_addr *saddr) +{ + struct rt6_exception *rt6_ex; + u32 hval; + + if (!(*bucket) || !daddr) + return NULL; + + hval = rt6_exception_hash(daddr, saddr); + *bucket += hval; + + hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) { + struct rt6_info *rt6 = rt6_ex->rt6i; + bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr); + +#ifdef CONFIG_IPV6_SUBTREES + if (matched && saddr) + matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr); +#endif + if (matched) + return rt6_ex; + } + return NULL; +} + +/* Helper function to find the cached rt in the hash table + * and update bucket pointer to point to the bucket for this + * (daddr, saddr) pair + * Caller must hold rcu_read_lock() + */ +static struct rt6_exception * +__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket, + const struct in6_addr *daddr, + const struct in6_addr *saddr) +{ + struct rt6_exception *rt6_ex; + u32 hval; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + if (!(*bucket) || !daddr) + return NULL; + + hval = rt6_exception_hash(daddr, saddr); + *bucket += hval; + + hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) { + struct rt6_info *rt6 = rt6_ex->rt6i; + bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr); + +#ifdef CONFIG_IPV6_SUBTREES + if (matched && saddr) + matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr); +#endif + if (matched) + return rt6_ex; + } + return NULL; +} + +static int rt6_insert_exception(struct rt6_info *nrt, + struct rt6_info *ort) +{ + struct rt6_exception_bucket *bucket; + struct in6_addr *src_key = NULL; + struct rt6_exception *rt6_ex; + int err = 0; + + /* ort can't be a cache or pcpu route */ + if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU)) + ort = (struct rt6_info *)ort->dst.from; + WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU)); + + spin_lock_bh(&rt6_exception_lock); + + if (ort->exception_bucket_flushed) { + err = -EINVAL; + goto out; + } + + bucket = rcu_dereference_protected(ort->rt6i_exception_bucket, + lockdep_is_held(&rt6_exception_lock)); + if (!bucket) { + bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket), + GFP_ATOMIC); + if (!bucket) { + err = -ENOMEM; + goto out; + } + rcu_assign_pointer(ort->rt6i_exception_bucket, bucket); + } + +#ifdef CONFIG_IPV6_SUBTREES + /* rt6i_src.plen != 0 indicates ort is in subtree + * and exception table is indexed by a hash of + * both rt6i_dst and rt6i_src. 
+ * Otherwise, the exception table is indexed by + * a hash of only rt6i_dst. + */ + if (ort->rt6i_src.plen) + src_key = &nrt->rt6i_src.addr; +#endif + rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr, + src_key); + if (rt6_ex) + rt6_remove_exception(bucket, rt6_ex); + + rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC); + if (!rt6_ex) { + err = -ENOMEM; + goto out; + } + rt6_ex->rt6i = nrt; + rt6_ex->stamp = jiffies; + atomic_inc(&nrt->rt6i_ref); + nrt->rt6i_node = ort->rt6i_node; + hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain); + bucket->depth++; + + if (bucket->depth > FIB6_MAX_DEPTH) + rt6_exception_remove_oldest(bucket); + +out: + spin_unlock_bh(&rt6_exception_lock); + + /* Update fn->fn_sernum to invalidate all cached dst */ + if (!err) + fib6_update_sernum(ort); + + return err; +} + +void rt6_flush_exceptions(struct rt6_info *rt) +{ + struct rt6_exception_bucket *bucket; + struct rt6_exception *rt6_ex; + struct hlist_node *tmp; + int i; + + spin_lock_bh(&rt6_exception_lock); + /* Prevent rt6_insert_exception() to recreate the bucket list */ + rt->exception_bucket_flushed = 1; + + bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, + lockdep_is_held(&rt6_exception_lock)); + if (!bucket) + goto out; + + for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { + hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist) + rt6_remove_exception(bucket, rt6_ex); + WARN_ON_ONCE(bucket->depth); + bucket++; + } + +out: + spin_unlock_bh(&rt6_exception_lock); +} + +/* Find cached rt in the hash table inside passed in rt + * Caller has to hold rcu_read_lock() + */ +static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt, + struct in6_addr *daddr, + struct in6_addr *saddr) +{ + struct rt6_exception_bucket *bucket; + struct in6_addr *src_key = NULL; + struct rt6_exception *rt6_ex; + struct rt6_info *res = NULL; + + bucket = rcu_dereference(rt->rt6i_exception_bucket); + +#ifdef CONFIG_IPV6_SUBTREES + /* rt6i_src.plen != 0 indicates rt is in subtree + * and exception table is indexed by a hash of + * both rt6i_dst and rt6i_src. + * Otherwise, the exception table is indexed by + * a hash of only rt6i_dst. + */ + if (rt->rt6i_src.plen) + src_key = saddr; +#endif + rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key); + + if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i)) + res = rt6_ex->rt6i; + + return res; +} + +/* Remove the passed in cached rt from the hash table that contains it */ +int rt6_remove_exception_rt(struct rt6_info *rt) +{ + struct rt6_info *from = (struct rt6_info *)rt->dst.from; + struct rt6_exception_bucket *bucket; + struct in6_addr *src_key = NULL; + struct rt6_exception *rt6_ex; + int err; + + if (!from || + !(rt->rt6i_flags | RTF_CACHE)) + return -EINVAL; + + if (!rcu_access_pointer(from->rt6i_exception_bucket)) + return -ENOENT; + + spin_lock_bh(&rt6_exception_lock); + bucket = rcu_dereference_protected(from->rt6i_exception_bucket, + lockdep_is_held(&rt6_exception_lock)); +#ifdef CONFIG_IPV6_SUBTREES + /* rt6i_src.plen != 0 indicates 'from' is in subtree + * and exception table is indexed by a hash of + * both rt6i_dst and rt6i_src. + * Otherwise, the exception table is indexed by + * a hash of only rt6i_dst. 
+ */ + if (from->rt6i_src.plen) + src_key = &rt->rt6i_src.addr; +#endif + rt6_ex = __rt6_find_exception_spinlock(&bucket, + &rt->rt6i_dst.addr, + src_key); + if (rt6_ex) { + rt6_remove_exception(bucket, rt6_ex); + err = 0; + } else { + err = -ENOENT; + } + + spin_unlock_bh(&rt6_exception_lock); + return err; +} + +/* Find rt6_ex which contains the passed in rt cache and + * refresh its stamp + */ +static void rt6_update_exception_stamp_rt(struct rt6_info *rt) +{ + struct rt6_info *from = (struct rt6_info *)rt->dst.from; + struct rt6_exception_bucket *bucket; + struct in6_addr *src_key = NULL; + struct rt6_exception *rt6_ex; + + if (!from || + !(rt->rt6i_flags | RTF_CACHE)) + return; + + rcu_read_lock(); + bucket = rcu_dereference(from->rt6i_exception_bucket); + +#ifdef CONFIG_IPV6_SUBTREES + /* rt6i_src.plen != 0 indicates 'from' is in subtree + * and exception table is indexed by a hash of + * both rt6i_dst and rt6i_src. + * Otherwise, the exception table is indexed by + * a hash of only rt6i_dst. + */ + if (from->rt6i_src.plen) + src_key = &rt->rt6i_src.addr; +#endif + rt6_ex = __rt6_find_exception_rcu(&bucket, + &rt->rt6i_dst.addr, + src_key); + if (rt6_ex) + rt6_ex->stamp = jiffies; + + rcu_read_unlock(); +} + struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif, struct flowi6 *fl6, int flags) { -- cgit v1.2.3 From c757faa8bfa26a0dd24b41ff783e0da042156887 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 6 Oct 2017 12:06:01 -0700 Subject: ipv6: prepare fib6_age() for exception table If all dst cache entries are stored in the exception table under the main route, we have to go through them during fib6_age() when doing garbage collection. Introduce a new function rt6_age_exceptions() which goes through all dst entries in the exception table and removes those entries that are expired. This function is called in fib6_age() so that all dst caches are also garbage collected. Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 13 +++++++++++ include/net/ip6_route.h | 2 ++ net/ipv6/ip6_fib.c | 26 ++++++++------------- net/ipv6/route.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 84 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index c4864c1e8f13..11a79ef87a28 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -29,6 +29,14 @@ #define FIB6_TABLE_HASHSZ 1 #endif +#define RT6_DEBUG 2 + +#if RT6_DEBUG >= 3 +#define RT6_TRACE(x...) pr_debug(x) +#else +#define RT6_TRACE(x...)
do { ; } while (0) +#endif + struct rt6_info; struct fib6_config { @@ -75,6 +83,11 @@ struct fib6_node { struct rcu_head rcu; }; +struct fib6_gc_args { + int timeout; + int more; +}; + #ifndef CONFIG_IPV6_SUBTREES #define FIB6_SUBTREE(fn) NULL #else diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 3315605f34c9..a0087fb9864b 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -97,6 +97,8 @@ int ip6_del_rt(struct rt6_info *); void rt6_flush_exceptions(struct rt6_info *rt); int rt6_remove_exception_rt(struct rt6_info *rt); +void rt6_age_exceptions(struct rt6_info *rt, struct fib6_gc_args *gc_args, + unsigned long now); static inline int ip6_route_get_saddr(struct net *net, struct rt6_info *rt, const struct in6_addr *daddr, diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 0ba4fbb2f855..3afbe50f2779 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -38,14 +38,6 @@ #include #include -#define RT6_DEBUG 2 - -#if RT6_DEBUG >= 3 -#define RT6_TRACE(x...) pr_debug(x) -#else -#define RT6_TRACE(x...) do { ; } while (0) -#endif - static struct kmem_cache *fib6_node_kmem __read_mostly; struct fib6_cleaner { @@ -1890,12 +1882,6 @@ static void fib6_flush_trees(struct net *net) * Garbage collection */ -struct fib6_gc_args -{ - int timeout; - int more; -}; - static int fib6_age(struct rt6_info *rt, void *arg) { struct fib6_gc_args *gc_args = arg; @@ -1904,9 +1890,6 @@ static int fib6_age(struct rt6_info *rt, void *arg) /* * check addrconf expiration here. * Routes are expired even if they are in use. - * - * Also age clones. Note, that clones are aged out - * only if they are not in use now. */ if (rt->rt6i_flags & RTF_EXPIRES && rt->dst.expires) { @@ -1915,6 +1898,9 @@ static int fib6_age(struct rt6_info *rt, void *arg) return -1; } gc_args->more++; + /* The following part will soon be removed when the exception + * table is hooked up to store all cached routes. + */ } else if (rt->rt6i_flags & RTF_CACHE) { if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) rt->dst.obsolete = DST_OBSOLETE_KILL; @@ -1940,6 +1926,12 @@ static int fib6_age(struct rt6_info *rt, void *arg) gc_args->more++; } + /* Also age clones in the exception table. + * Note, that clones are aged out + * only if they are not in use now. 
+ */ + rt6_age_exceptions(rt, gc_args, now); + return 0; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index e8e901589564..d2dd55f58b5d 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1528,6 +1528,66 @@ static void rt6_exceptions_clean_tohost(struct rt6_info *rt, spin_unlock_bh(&rt6_exception_lock); } +static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket, + struct rt6_exception *rt6_ex, + struct fib6_gc_args *gc_args, + unsigned long now) +{ + struct rt6_info *rt = rt6_ex->rt6i; + + if (atomic_read(&rt->dst.__refcnt) == 1 && + time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) { + RT6_TRACE("aging clone %p\n", rt); + rt6_remove_exception(bucket, rt6_ex); + return; + } else if (rt->rt6i_flags & RTF_GATEWAY) { + struct neighbour *neigh; + __u8 neigh_flags = 0; + + neigh = dst_neigh_lookup(&rt->dst, &rt->rt6i_gateway); + if (neigh) { + neigh_flags = neigh->flags; + neigh_release(neigh); + } + if (!(neigh_flags & NTF_ROUTER)) { + RT6_TRACE("purging route %p via non-router but gateway\n", + rt); + rt6_remove_exception(bucket, rt6_ex); + return; + } + } + gc_args->more++; +} + +void rt6_age_exceptions(struct rt6_info *rt, + struct fib6_gc_args *gc_args, + unsigned long now) +{ + struct rt6_exception_bucket *bucket; + struct rt6_exception *rt6_ex; + struct hlist_node *tmp; + int i; + + if (!rcu_access_pointer(rt->rt6i_exception_bucket)) + return; + + spin_lock_bh(&rt6_exception_lock); + bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, + lockdep_is_held(&rt6_exception_lock)); + + if (bucket) { + for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { + hlist_for_each_entry_safe(rt6_ex, tmp, + &bucket->chain, hlist) { + rt6_age_examine_exception(bucket, rt6_ex, + gc_args, now); + } + bucket++; + } + } + spin_unlock_bh(&rt6_exception_lock); +} + struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif, struct flowi6 *fl6, int flags) { -- cgit v1.2.3 From 38fbeeeeccdb38d0635398e8e344d245f6d8dc52 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 6 Oct 2017 12:06:02 -0700 Subject: ipv6: prepare fib6_locate() for exception table fib6_locate() is used to find the fib6_node according to the passed in prefix address key. It currently tries to find the fib6_node with the exact match of the passed in key. However, when we move cached routes into the exception table, fib6_locate() will fail to find the fib6_node for it as the cached routes will be stored in the exception table under the fib6_node with the longest prefix match of the cache's dst addr key. This commit adds a new parameter to let the caller specify if it needs exact match or longest prefix match. Right now, all callers still do an exact match when calling fib6_locate(). It will be changed in a later commit where the exception table is hooked up to store cached routes. Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Signed-off-by: Eric Dumazet Signed-off-by: David S.
Miller --- include/net/ip6_fib.h | 3 ++- net/ipv6/addrconf.c | 2 +- net/ipv6/ip6_fib.c | 30 +++++++++++++++++++++++------- net/ipv6/route.c | 5 +++-- 4 files changed, 29 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 11a79ef87a28..4497a1eb4d41 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -357,7 +357,8 @@ struct fib6_node *fib6_lookup(struct fib6_node *root, struct fib6_node *fib6_locate(struct fib6_node *root, const struct in6_addr *daddr, int dst_len, - const struct in6_addr *saddr, int src_len); + const struct in6_addr *saddr, int src_len, + bool exact_match); void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *arg), void *arg); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 837418ff2d4b..3ccaf52824c9 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2322,7 +2322,7 @@ static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, return NULL; read_lock_bh(&table->tb6_lock); - fn = fib6_locate(&table->tb6_root, pfx, plen, NULL, 0); + fn = fib6_locate(&table->tb6_root, pfx, plen, NULL, 0, true); if (!fn) goto out; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 3afbe50f2779..b3e4cf0962f8 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -1343,14 +1343,21 @@ struct fib6_node *fib6_lookup(struct fib6_node *root, const struct in6_addr *dad /* * Get node with specified destination prefix (and source prefix, * if subtrees are used) + * exact_match == true means we try to find fn with exact match of + * the passed in prefix addr + * exact_match == false means we try to find fn with longest prefix + * match of the passed in prefix addr. This is useful for finding fn + * for cached route as it will be stored in the exception table under + * the node with longest prefix length. 
*/ static struct fib6_node *fib6_locate_1(struct fib6_node *root, const struct in6_addr *addr, - int plen, int offset) + int plen, int offset, + bool exact_match) { - struct fib6_node *fn; + struct fib6_node *fn, *prev = NULL; for (fn = root; fn ; ) { struct rt6key *key = (struct rt6key *)((u8 *)fn->leaf + offset); @@ -1360,11 +1367,13 @@ static struct fib6_node *fib6_locate_1(struct fib6_node *root, */ if (plen < fn->fn_bit || !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) - return NULL; + goto out; if (plen == fn->fn_bit) return fn; + prev = fn; + /* * We have more bits to go */ @@ -1373,24 +1382,31 @@ static struct fib6_node *fib6_locate_1(struct fib6_node *root, else fn = fn->left; } - return NULL; +out: + if (exact_match) + return NULL; + else + return prev; } struct fib6_node *fib6_locate(struct fib6_node *root, const struct in6_addr *daddr, int dst_len, - const struct in6_addr *saddr, int src_len) + const struct in6_addr *saddr, int src_len, + bool exact_match) { struct fib6_node *fn; fn = fib6_locate_1(root, daddr, dst_len, - offsetof(struct rt6_info, rt6i_dst)); + offsetof(struct rt6_info, rt6i_dst), + exact_match); #ifdef CONFIG_IPV6_SUBTREES if (src_len) { WARN_ON(saddr == NULL); if (fn && fn->subtree) fn = fib6_locate_1(fn->subtree, saddr, src_len, - offsetof(struct rt6_info, rt6i_src)); + offsetof(struct rt6_info, rt6i_src), + exact_match); } #endif diff --git a/net/ipv6/route.c b/net/ipv6/route.c index d2dd55f58b5d..855b4ceec349 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2800,7 +2800,8 @@ static int ip6_route_del(struct fib6_config *cfg, fn = fib6_locate(&table->tb6_root, &cfg->fc_dst, cfg->fc_dst_len, - &cfg->fc_src, cfg->fc_src_len); + &cfg->fc_src, cfg->fc_src_len, + true); if (fn) { for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { @@ -3009,7 +3010,7 @@ static struct rt6_info *rt6_get_route_info(struct net *net, return NULL; read_lock_bh(&table->tb6_lock); - fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0); + fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true); if (!fn) goto out; -- cgit v1.2.3 From 2b760fcf5cfb34e8610df56d83745b2b74ae1379 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 6 Oct 2017 12:06:03 -0700 Subject: ipv6: hook up exception table to store dst cache This commit makes use of the exception hash table implementation to store dst caches created by pmtu discovery and ip redirect into the hash table under the rt6_info and no longer inserts these routes into the fib6 tree. This makes the fib6 tree contain only statically configured routes, which can now be protected by rcu instead of a rw lock. With this change, in the route lookup related functions, after finding the rt6_info with the longest prefix, we also need to search the exception table before doing backtracking. In the route delete function, if the route being deleted is not a dst cache, deletion of this route also needs to flush the whole hash table under it. If it is a dst cache, then only delete the cached dst in the hash table. Note: for the fib6_walk_continue() function, w->root now always points to a root node considering that fib6_prune_clones() is removed from the code. So we add a WARN_ON() msg to make sure w->root always points to a root node and also remove the update of w->root in fib6_repair_tree(). This is a prerequisite for a later patch because we don't need to make w->root rcu protected when replacing rwlock with RCU. Also, we remove all prune related variables as they are no longer used.
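As a rough illustration of the lookup order this commit introduces, here is a minimal user-space C sketch: take the longest-prefix-match result from the tree first, then consult the exception hash, and only fall back to the tree result if no cached dst is found. The struct names, the toy hash function and the helpers below are simplified stand-ins invented for illustration, not the kernel types or APIs:

#include <stdio.h>
#include <stdint.h>

#define BUCKETS 8 /* the kernel uses 1 << FIB6_EXCEPTION_BUCKET_SIZE_SHIFT */

struct route {
	uint32_t daddr;     /* stand-in for the rt6i_dst key */
	struct route *next; /* stand-in for the hlist chain */
};

struct node {
	struct route *leaf;                /* longest-prefix-match result */
	struct route *exceptions[BUCKETS]; /* stand-in for rt6i_exception_bucket */
};

static unsigned int bucket_of(uint32_t daddr)
{
	return (daddr * 2654435761u) % BUCKETS; /* toy hash, not jhash */
}

/* Models rt6_find_cached_rt(): scan one bucket for an exact match. */
static struct route *exception_lookup(struct node *n, uint32_t daddr)
{
	struct route *r;

	for (r = n->exceptions[bucket_of(daddr)]; r; r = r->next)
		if (r->daddr == daddr)
			return r;
	return NULL;
}

/* A cached dst (pmtu/redirect) wins over the generic tree result. */
static struct route *lookup(struct node *n, uint32_t daddr)
{
	struct route *cached = exception_lookup(n, daddr);

	return cached ? cached : n->leaf;
}

int main(void)
{
	struct route leaf = { .daddr = 0 };
	struct route cached = { .daddr = 42 };
	struct node n = { .leaf = &leaf };

	n.exceptions[bucket_of(42)] = &cached;
	printf("lookup(42) uses %s\n", lookup(&n, 42) == &cached ? "cached dst" : "leaf");
	printf("lookup(7) uses %s\n", lookup(&n, 7) == &leaf ? "leaf" : "cached dst");
	return 0;
}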
Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 1 - net/ipv6/addrconf.c | 1 - net/ipv6/ip6_fib.c | 95 ++++++++------------------------------------ net/ipv6/route.c | 108 +++++++++++++++++++++++++------------------------- 4 files changed, 72 insertions(+), 133 deletions(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 4497a1eb4d41..d0b7283073e3 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -280,7 +280,6 @@ struct fib6_walker { struct fib6_node *root, *node; struct rt6_info *leaf; enum fib6_walk_state state; - bool prune; unsigned int skip; unsigned int count; int (*func)(struct fib6_walker *); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 3ccaf52824c9..873afafddfc4 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2326,7 +2326,6 @@ static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, if (!fn) goto out; - noflags |= RTF_CACHE; for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { if (rt->dst.dev->ifindex != dev->ifindex) continue; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index b3e4cf0962f8..9c8e704e6af7 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -54,7 +54,6 @@ struct fib6_cleaner { #define FWS_INIT FWS_L #endif -static void fib6_prune_clones(struct net *net, struct fib6_node *fn); static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn); static struct fib6_node *fib6_repair_tree(struct net *net, struct fib6_node *fn); static int fib6_walk(struct net *net, struct fib6_walker *w); @@ -1101,6 +1100,8 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, if (WARN_ON_ONCE(!atomic_read(&rt->dst.__refcnt))) return -EINVAL; + if (WARN_ON_ONCE(rt->rt6i_flags & RTF_CACHE)) + return -EINVAL; if (info->nlh) { if (!(info->nlh->nlmsg_flags & NLM_F_CREATE)) @@ -1192,11 +1193,8 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, #endif err = fib6_add_rt2node(fn, rt, info, mxc); - if (!err) { + if (!err) fib6_start_gc(info->nl_net, rt); - if (!(rt->rt6i_flags & RTF_CACHE)) - fib6_prune_clones(info->nl_net, pn); - } out: if (err) { @@ -1511,19 +1509,12 @@ static struct fib6_node *fib6_repair_tree(struct net *net, read_lock(&net->ipv6.fib6_walker_lock); FOR_WALKERS(net, w) { if (!child) { - if (w->root == fn) { - w->root = w->node = NULL; - RT6_TRACE("W %p adjusted by delroot 1\n", w); - } else if (w->node == fn) { + if (w->node == fn) { RT6_TRACE("W %p adjusted by delnode 1, s=%d/%d\n", w, w->state, nstate); w->node = pn; w->state = nstate; } } else { - if (w->root == fn) { - w->root = child; - RT6_TRACE("W %p adjusted by delroot 2\n", w); - } if (w->node == fn) { w->node = child; if (children&2) { @@ -1557,12 +1548,17 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, RT6_TRACE("fib6_del_route\n"); + WARN_ON_ONCE(rt->rt6i_flags & RTF_CACHE); + /* Unlink it */ *rtp = rt->dst.rt6_next; rt->rt6i_node = NULL; net->ipv6.rt6_stats->fib_rt_entries--; net->ipv6.rt6_stats->fib_discarded_routes++; + /* Flush all cached dst in exception table */ + rt6_flush_exceptions(rt); + /* Reset round-robin state, if necessary */ if (fn->rr_ptr == rt) fn->rr_ptr = NULL; @@ -1625,18 +1621,9 @@ int fib6_del(struct rt6_info *rt, struct nl_info *info) WARN_ON(!(fn->fn_flags & RTN_RTINFO)); - if (!(rt->rt6i_flags & RTF_CACHE)) { - struct fib6_node *pn = fn; -#ifdef CONFIG_IPV6_SUBTREES - /* clones of this route might be in another subtree */ - 
if (rt->rt6i_src.plen) { - while (!(pn->fn_flags & RTN_ROOT)) - pn = pn->parent; - pn = pn->parent; - } -#endif - fib6_prune_clones(info->nl_net, pn); - } + /* remove cached dst from exception table */ + if (rt->rt6i_flags & RTF_CACHE) + return rt6_remove_exception_rt(rt); /* * Walk the leaf entries looking for ourself @@ -1679,16 +1666,14 @@ static int fib6_walk_continue(struct fib6_walker *w) { struct fib6_node *fn, *pn; + /* w->root should always be table->tb6_root */ + WARN_ON_ONCE(!(w->root->fn_flags & RTN_TL_ROOT)); + for (;;) { fn = w->node; if (!fn) return 0; - if (w->prune && fn != w->root && - fn->fn_flags & RTN_RTINFO && w->state < FWS_C) { - w->state = FWS_C; - w->leaf = fn->leaf; - } switch (w->state) { #ifdef CONFIG_IPV6_SUBTREES case FWS_S: @@ -1820,20 +1805,16 @@ static int fib6_clean_node(struct fib6_walker *w) * func is called on each route. * It may return -1 -> delete this route. * 0 -> continue walking - * - * prune==1 -> only immediate children of node (certainly, - * ignoring pure split nodes) will be scanned. */ static void fib6_clean_tree(struct net *net, struct fib6_node *root, int (*func)(struct rt6_info *, void *arg), - bool prune, int sernum, void *arg) + int sernum, void *arg) { struct fib6_cleaner c; c.w.root = root; c.w.func = fib6_clean_node; - c.w.prune = prune; c.w.count = 0; c.w.skip = 0; c.func = func; @@ -1858,7 +1839,7 @@ static void __fib6_clean_all(struct net *net, hlist_for_each_entry_rcu(table, head, tb6_hlist) { write_lock_bh(&table->tb6_lock); fib6_clean_tree(net, &table->tb6_root, - func, false, sernum, arg); + func, sernum, arg); write_unlock_bh(&table->tb6_lock); } } @@ -1871,22 +1852,6 @@ void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *), __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg); } -static int fib6_prune_clone(struct rt6_info *rt, void *arg) -{ - if (rt->rt6i_flags & RTF_CACHE) { - RT6_TRACE("pruning clone %p\n", rt); - return -1; - } - - return 0; -} - -static void fib6_prune_clones(struct net *net, struct fib6_node *fn) -{ - fib6_clean_tree(net, fn, fib6_prune_clone, true, - FIB6_NO_SERNUM_CHANGE, NULL); -} - static void fib6_flush_trees(struct net *net) { int new_sernum = fib6_new_sernum(net); @@ -1914,32 +1879,6 @@ static int fib6_age(struct rt6_info *rt, void *arg) return -1; } gc_args->more++; - /* The following part will soon be removed when the exception - * table is hooked up to store all cached routes. - */ - } else if (rt->rt6i_flags & RTF_CACHE) { - if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) - rt->dst.obsolete = DST_OBSOLETE_KILL; - if (atomic_read(&rt->dst.__refcnt) == 1 && - rt->dst.obsolete == DST_OBSOLETE_KILL) { - RT6_TRACE("aging clone %p\n", rt); - return -1; - } else if (rt->rt6i_flags & RTF_GATEWAY) { - struct neighbour *neigh; - __u8 neigh_flags = 0; - - neigh = dst_neigh_lookup(&rt->dst, &rt->rt6i_gateway); - if (neigh) { - neigh_flags = neigh->flags; - neigh_release(neigh); - } - if (!(neigh_flags & NTF_ROUTER)) { - RT6_TRACE("purging route %p via non-router but gateway\n", - rt); - return -1; - } - } - gc_args->more++; } /* Also age clones in the exception table. 
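One detail worth calling out in the exception aging shown earlier (rt6_age_examine_exception()): expiry tests such as time_after_eq(now, rt->dst.lastuse + gc_args->timeout) depend on wraparound-safe tick arithmetic. A small user-space sketch of that idea follows; a fixed 32-bit tick counter is assumed here, and tick_after_eq() merely models, but is not, the kernel's time_after_eq() macro:

#include <stdint.h>
#include <stdio.h>

/* Wraparound-safe "a is at or after b" for a free-running 32-bit tick
 * counter, modeled on the kernel's time_after_eq(). */
static int tick_after_eq(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) >= 0;
}

int main(void)
{
	uint32_t lastuse = 0xFFFFFFF0u; /* just before the counter wraps */
	uint32_t timeout = 0x20u;       /* expiry is due at 0x10, after the wrap */

	printf("now=0x10 expired: %d\n", tick_after_eq(0x10u, lastuse + timeout));
	printf("now=0x0F expired: %d\n", tick_after_eq(0x0Fu, lastuse + timeout));
	return 0;
}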
diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 855b4ceec349..65130dde276a 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -878,8 +878,8 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net, struct fib6_table *table, struct flowi6 *fl6, int flags) { + struct rt6_info *rt, *rt_cache; struct fib6_node *fn; - struct rt6_info *rt; read_lock_bh(&table->tb6_lock); fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); @@ -893,6 +893,11 @@ restart: if (fn) goto restart; } + /* Search through exception table */ + rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr); + if (rt_cache) + rt = rt_cache; + dst_use(&rt->dst, jiffies); read_unlock_bh(&table->tb6_lock); @@ -1592,7 +1597,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif, struct flowi6 *fl6, int flags) { struct fib6_node *fn, *saved_fn; - struct rt6_info *rt; + struct rt6_info *rt, *rt_cache; int strict = 0; strict |= flags & RT6_LOOKUP_F_IFACE; @@ -1624,6 +1629,10 @@ redo_rt6_select: } } + /*Search through exception table */ + rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr); + if (rt_cache) + rt = rt_cache; if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) { dst_use(&rt->dst, jiffies); @@ -1988,23 +1997,17 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, if (!rt6_cache_allowed_for_pmtu(rt6)) { rt6_do_update_pmtu(rt6, mtu); + /* update rt6_ex->stamp for cache */ + if (rt6->rt6i_flags & RTF_CACHE) + rt6_update_exception_stamp_rt(rt6); } else if (daddr) { struct rt6_info *nrt6; nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr); if (nrt6) { rt6_do_update_pmtu(nrt6, mtu); - - /* ip6_ins_rt(nrt6) will bump the - * rt6->rt6i_node->fn_sernum - * which will fail the next rt6_check() and - * invalidate the sk->sk_dst_cache. - */ - ip6_ins_rt(nrt6); - /* Release the reference taken in - * ip6_rt_cache_alloc() - */ - dst_release(&nrt6->dst); + if (rt6_insert_exception(nrt6, rt6)) + dst_release_immediate(&nrt6->dst); } } } @@ -2068,7 +2071,7 @@ static struct rt6_info *__ip6_route_redirect(struct net *net, int flags) { struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6; - struct rt6_info *rt; + struct rt6_info *rt, *rt_cache; struct fib6_node *fn; /* Get the "current" route for this destination and @@ -2093,8 +2096,23 @@ restart: continue; if (fl6->flowi6_oif != rt->dst.dev->ifindex) continue; - if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) + /* rt_cache's gateway might be different from its 'parent' + * in the case of an ip redirect. + * So we keep searching in the exception table if the gateway + * is different. 
+ */ + if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) { + rt_cache = rt6_find_cached_rt(rt, + &fl6->daddr, + &fl6->saddr); + if (rt_cache && + ipv6_addr_equal(&rdfl->gateway, + &rt_cache->rt6i_gateway)) { + rt = rt_cache; + break; + } continue; + } break; } @@ -2785,9 +2803,9 @@ out_put: static int ip6_route_del(struct fib6_config *cfg, struct netlink_ext_ack *extack) { + struct rt6_info *rt, *rt_cache; struct fib6_table *table; struct fib6_node *fn; - struct rt6_info *rt; int err = -ESRCH; table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table); @@ -2801,13 +2819,17 @@ static int ip6_route_del(struct fib6_config *cfg, fn = fib6_locate(&table->tb6_root, &cfg->fc_dst, cfg->fc_dst_len, &cfg->fc_src, cfg->fc_src_len, - true); + !(cfg->fc_flags & RTF_CACHE)); if (fn) { for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { - if ((rt->rt6i_flags & RTF_CACHE) && - !(cfg->fc_flags & RTF_CACHE)) - continue; + if (cfg->fc_flags & RTF_CACHE) { + rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst, + &cfg->fc_src); + if (!rt_cache) + continue; + rt = rt_cache; + } if (cfg->fc_ifindex && (!rt->dst.dev || rt->dst.dev->ifindex != cfg->fc_ifindex)) @@ -2933,8 +2955,14 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu nrt->rt6i_protocol = RTPROT_REDIRECT; nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key; - if (ip6_ins_rt(nrt)) - goto out_release; + /* No need to remove rt from the exception table if rt is + * a cached route because rt6_insert_exception() will + * takes care of it + */ + if (rt6_insert_exception(nrt, rt)) { + dst_release_immediate(&nrt->dst); + goto out; + } netevent.old = &rt->dst; netevent.new = &nrt->dst; @@ -2942,17 +2970,6 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu netevent.neigh = neigh; call_netevent_notifiers(NETEVENT_REDIRECT, &netevent); - if (rt->rt6i_flags & RTF_CACHE) { - rt = (struct rt6_info *) dst_clone(&rt->dst); - ip6_del_rt(rt); - } - -out_release: - /* Release the reference taken in - * ip6_rt_cache_alloc() - */ - dst_release(&nrt->dst); - out: neigh_release(neigh); } @@ -3344,12 +3361,8 @@ static int fib6_clean_tohost(struct rt6_info *rt, void *arg) { struct in6_addr *gateway = (struct in6_addr *)arg; - /* RTF_CACHE_GATEWAY case will be removed once the exception - * table is hooked up to store all cached routes. - */ - if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) || - ((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) && - ipv6_addr_equal(gateway, &rt->rt6i_gateway)) { + if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) && + ipv6_addr_equal(gateway, &rt->rt6i_gateway)) { return -1; } @@ -3438,20 +3451,9 @@ static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg) dst_metric_raw(&rt->dst, RTAX_MTU) && !dst_metric_locked(&rt->dst, RTAX_MTU)) { spin_lock_bh(&rt6_exception_lock); - /* This case will be removed once the exception table - * is hooked up. - */ - if (rt->rt6i_flags & RTF_CACHE) { - /* For RTF_CACHE with rt6i_pmtu == 0 - * (i.e. a redirected route), - * the metrics of its rt->dst.from has already - * been updated. 
- */ - if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu) - rt->rt6i_pmtu = arg->mtu; - } else if (dst_mtu(&rt->dst) >= arg->mtu || - (dst_mtu(&rt->dst) < arg->mtu && - dst_mtu(&rt->dst) == idev->cnf.mtu6)) { + if (dst_mtu(&rt->dst) >= arg->mtu || + (dst_mtu(&rt->dst) < arg->mtu && + dst_mtu(&rt->dst) == idev->cnf.mtu6)) { dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu); } rt6_exceptions_update_pmtu(rt, arg->mtu); -- cgit v1.2.3 From bbd63f06d114a52be33f6982fc89ca2768cdeb62 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 6 Oct 2017 12:06:07 -0700 Subject: ipv6: update fn_sernum after route is inserted to tree fib6_add() logic currently calls fib6_add_1() to figure out what node should be used for the newly added route and then calls fib6_add_rt2node() to insert the route into the node. And during the call of fib6_add_1(), fn_sernum is updated for all nodes that share the same prefix as the new route. This is not an issue in the current code because a reader thread will not be able to access the tree while a writer thread is inserting a new route into it. However, that is no longer the case once we transition to RCU. A reader thread could potentially see the new fn_sernum before the new route is inserted. As a result, the reader thread's route lookup will return a stale route with the new fn_sernum. In order to solve this issue, we remove all the updates of fn_sernum in fib6_add_1(), and instead introduce a new function that updates fn_sernum for all related nodes and call this function once the route is successfully inserted into the tree. Also, smp_wmb() is used after a route is successfully inserted into the fib tree and right before the update of fn->sernum. And smp_rmb() is used right after fn->sernum is accessed in rt6_get_cookie_safe(). This is to guarantee that when the reader thread sees the new fn->sernum, the new route is already inserted in the tree in memory. Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 2 ++ net/ipv6/ip6_fib.c | 39 +++++++++++++++++++++------------------ 2 files changed, 23 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index d0b7283073e3..6bf929b50951 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -220,6 +220,8 @@ static inline bool rt6_get_cookie_safe(const struct rt6_info *rt, if (fn) { *cookie = fn->fn_sernum; + /* pairs with smp_wmb() in fib6_update_sernum_upto_root() */ + smp_rmb(); status = true; } diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index eee392f7b1f6..f604b311cc3e 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -585,7 +585,7 @@ out: static struct fib6_node *fib6_add_1(struct fib6_node *root, struct in6_addr *addr, int plen, int offset, int allow_create, - int replace_required, int sernum, + int replace_required, struct netlink_ext_ack *extack) { struct fib6_node *fn, *in, *ln; @@ -631,8 +631,6 @@ static struct fib6_node *fib6_add_1(struct fib6_node *root, fn->leaf = NULL; } - fn->fn_sernum = sernum; - return fn; } @@ -641,7 +639,6 @@ static struct fib6_node *fib6_add_1(struct fib6_node *root, */ /* Try to walk down on tree. */ - fn->fn_sernum = sernum; dir = addr_bit_set(addr, fn->fn_bit); pn = fn; fn = dir ?
fn->right : fn->left; } while (fn); if (!allow_create) { @@ -677,7 +674,6 @@ static struct fib6_node *fib6_add_1(struct fib6_node *root, ln->fn_bit = plen; ln->parent = pn; - ln->fn_sernum = sernum; if (dir) pn->right = ln; @@ -737,8 +733,6 @@ insert_above: in->leaf = fn->leaf; atomic_inc(&in->leaf->rt6i_ref); - in->fn_sernum = sernum; - /* update parent pointer */ if (dir) pn->right = in; @@ -750,8 +744,6 @@ insert_above: ln->parent = in; fn->parent = in; - ln->fn_sernum = sernum; - if (addr_bit_set(addr, bit)) { in->right = ln; in->left = fn; @@ -776,8 +768,6 @@ insert_above: ln->parent = pn; - ln->fn_sernum = sernum; - if (dir) pn->right = ln; else @@ -1079,6 +1069,20 @@ void fib6_force_start_gc(struct net *net) jiffies + net->ipv6.sysctl.ip6_rt_gc_interval); } +static void fib6_update_sernum_upto_root(struct rt6_info *rt, + int sernum) +{ + struct fib6_node *fn = rcu_dereference_protected(rt->rt6i_node, + lockdep_is_held(&rt->rt6i_table->tb6_lock)); + + /* paired with smp_rmb() in rt6_get_cookie_safe() */ + smp_wmb(); + while (fn) { + fn->fn_sernum = sernum; + fn = fn->parent; + } +} + /* * Add routing information to the routing tree. */ @@ -1111,7 +1115,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, fn = fib6_add_1(root, &rt->rt6i_dst.addr, rt->rt6i_dst.plen, offsetof(struct rt6_info, rt6i_dst), allow_create, - replace_required, sernum, extack); + replace_required, extack); if (IS_ERR(fn)) { err = PTR_ERR(fn); fn = NULL; @@ -1145,15 +1149,13 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, sfn->leaf = info->nl_net->ipv6.ip6_null_entry; atomic_inc(&info->nl_net->ipv6.ip6_null_entry->rt6i_ref); sfn->fn_flags = RTN_ROOT; - sfn->fn_sernum = sernum; /* Now add the first leaf node to new subtree */ sn = fib6_add_1(sfn, &rt->rt6i_src.addr, rt->rt6i_src.plen, offsetof(struct rt6_info, rt6i_src), - allow_create, replace_required, sernum, - extack); + allow_create, replace_required, extack); if (IS_ERR(sn)) { /* If it is failed, discard just allocated @@ -1172,8 +1174,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, sn = fib6_add_1(fn->subtree, &rt->rt6i_src.addr, rt->rt6i_src.plen, offsetof(struct rt6_info, rt6i_src), - allow_create, replace_required, sernum, - extack); + allow_create, replace_required, extack); if (IS_ERR(sn)) { err = PTR_ERR(sn); @@ -1190,8 +1191,10 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, #endif err = fib6_add_rt2node(fn, rt, info, mxc); - if (!err) + if (!err) { + fib6_update_sernum_upto_root(rt, sernum); fib6_start_gc(info->nl_net, rt); + } out: if (err) { -- cgit v1.2.3 From 66f5d6ce53e665477d2a33e8f539d4fa4ca81c83 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 6 Oct 2017 12:06:10 -0700 Subject: ipv6: replace rwlock with rcu and spinlock in fib6_table With all the preceding preparation work, we are now ready to replace rwlock with rcu and spinlock in fib6_table. That means all fib6_node structures in fib6_table are now protected by rcu. And when freeing fib6_node, call_rcu() is used to wait for the rcu grace period before releasing the memory. When accessing fib6_node, corresponding rcu APIs need to be used. And all previous critical sections protected by the write lock will now be protected by the spin lock per table. All previous critical sections protected by the read lock will now be protected by rcu_read_lock(). A couple of things to note here: 1. As part of the work of replacing rwlock with rcu, the linked list of fn->leaf now has to be rcu protected as well.
So both fn->leaf and rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are used when manipulating them. 2. For fn->rr_ptr, first of all, it also needs to be rcu protected now and is tagged with __rcu and rcu APIs are used in corresponding places. Secondly, fn->rr_ptr is changed in rt6_select(), which runs on the reader side. This makes the issue a bit complicated. We think a valid solution for it is to let rt6_select() grab the tb6_lock if it decides to change it. As this is not the normal operation path and only happens when there is no valid neighbor cache for the route, we think the performance impact should be low. 3. fib6_walk_continue() has to be called with tb6_lock held even in the route dumping related functions, e.g. inet6_dump_fib(), fib6_tables_dump() and ipv6_route_seq_ops. It is because fib6_walk_continue() makes modifications to the walker structure, and so do fib6_repair_tree() and fib6_del_route(). In order to do proper syncing between them, we need to let fib6_walk_continue() hold the lock. We may be able to further improve the way we do the tree walk to get rid of the need for holding the spin lock. But not for now. 4. When fib6_del_route() removes a route from the tree, we no longer set rt->dst.rt6_next to NULL, so that a simultaneous reader can still traverse the list with rcu. However, rt->dst.rt6_next is only valid within this same rcu period. No one should access it later. 5. All atomic_inc(rt->rt6i_ref) operations are changed to be performed before we publish this route (either by linking it to fn->leaf or inserting it into the list pointed to by fn->leaf) just to be safe, because as soon as we publish the route, a reader thread will be able to access it. Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Signed-off-by: Eric Dumazet Signed-off-by: David S.
Miller --- include/net/dst.h | 2 +- include/net/ip6_fib.h | 24 ++- net/ipv6/addrconf.c | 11 +- net/ipv6/ip6_fib.c | 405 ++++++++++++++++++++++++++++++-------------------- net/ipv6/route.c | 121 ++++++++------- 5 files changed, 333 insertions(+), 230 deletions(-) (limited to 'include') diff --git a/include/net/dst.h b/include/net/dst.h index 06a6765da074..204c19e25456 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -101,7 +101,7 @@ struct dst_entry { union { struct dst_entry *next; struct rtable __rcu *rt_next; - struct rt6_info *rt6_next; + struct rt6_info __rcu *rt6_next; struct dn_route __rcu *dn_next; }; }; diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 6bf929b50951..0b438b9bcb10 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -68,18 +68,18 @@ struct fib6_config { }; struct fib6_node { - struct fib6_node *parent; - struct fib6_node *left; - struct fib6_node *right; + struct fib6_node __rcu *parent; + struct fib6_node __rcu *left; + struct fib6_node __rcu *right; #ifdef CONFIG_IPV6_SUBTREES - struct fib6_node *subtree; + struct fib6_node __rcu *subtree; #endif - struct rt6_info *leaf; + struct rt6_info __rcu *leaf; __u16 fn_bit; /* bit key */ __u16 fn_flags; int fn_sernum; - struct rt6_info *rr_ptr; + struct rt6_info __rcu *rr_ptr; struct rcu_head rcu; }; @@ -91,7 +91,7 @@ struct fib6_gc_args { #ifndef CONFIG_IPV6_SUBTREES #define FIB6_SUBTREE(fn) NULL #else -#define FIB6_SUBTREE(fn) ((fn)->subtree) +#define FIB6_SUBTREE(fn) (rcu_dereference_protected((fn)->subtree, 1)) #endif struct mx6_config { @@ -174,6 +174,14 @@ struct rt6_info { unused:7; }; +#define for_each_fib6_node_rt_rcu(fn) \ + for (rt = rcu_dereference((fn)->leaf); rt; \ + rt = rcu_dereference(rt->dst.rt6_next)) + +#define for_each_fib6_walker_rt(w) \ + for (rt = (w)->leaf; rt; \ + rt = rcu_dereference_protected(rt->dst.rt6_next, 1)) + static inline struct inet6_dev *ip6_dst_idev(struct dst_entry *dst) { return ((struct rt6_info *)dst)->rt6i_idev; @@ -310,7 +318,7 @@ struct rt6_statistics { struct fib6_table { struct hlist_node tb6_hlist; u32 tb6_id; - rwlock_t tb6_lock; + spinlock_t tb6_lock; struct fib6_node tb6_root; struct inet_peer_base tb6_peers; unsigned int flags; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index f86e931d555e..9854d93e45bb 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2321,12 +2321,12 @@ static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, if (!table) return NULL; - read_lock_bh(&table->tb6_lock); + rcu_read_lock(); fn = fib6_locate(&table->tb6_root, pfx, plen, NULL, 0, true); if (!fn) goto out; - for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { + for_each_fib6_node_rt_rcu(fn) { if (rt->dst.dev->ifindex != dev->ifindex) continue; if ((rt->rt6i_flags & flags) != flags) @@ -2338,7 +2338,7 @@ static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, break; } out: - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); return rt; } @@ -5898,10 +5898,9 @@ void addrconf_disable_policy_idev(struct inet6_dev *idev, int val) spin_lock(&ifa->lock); if (ifa->rt) { struct rt6_info *rt = ifa->rt; - struct fib6_table *table = rt->rt6i_table; int cpu; - read_lock(&table->tb6_lock); + rcu_read_lock(); addrconf_set_nopolicy(ifa->rt, val); if (rt->rt6i_pcpu) { for_each_possible_cpu(cpu) { @@ -5911,7 +5910,7 @@ void addrconf_disable_policy_idev(struct inet6_dev *idev, int val) addrconf_set_nopolicy(*rtp, val); } } - read_unlock(&table->tb6_lock); + rcu_read_unlock(); } spin_unlock(&ifa->lock); } diff 
--git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index cf6137e81408..3f95908b39c3 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -54,8 +54,12 @@ struct fib6_cleaner { #define FWS_INIT FWS_L #endif -static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn); -static struct fib6_node *fib6_repair_tree(struct net *net, struct fib6_node *fn); +static struct rt6_info *fib6_find_prefix(struct net *net, + struct fib6_table *table, + struct fib6_node *fn); +static struct fib6_node *fib6_repair_tree(struct net *net, + struct fib6_table *table, + struct fib6_node *fn); static int fib6_walk(struct net *net, struct fib6_walker *w); static int fib6_walk_continue(struct fib6_walker *w); @@ -107,12 +111,12 @@ void fib6_update_sernum(struct rt6_info *rt) struct net *net = dev_net(rt->dst.dev); struct fib6_node *fn; - write_lock_bh(&table->tb6_lock); + spin_lock_bh(&table->tb6_lock); fn = rcu_dereference_protected(rt->rt6i_node, lockdep_is_held(&table->tb6_lock)); if (fn) fn->fn_sernum = fib6_new_sernum(net); - write_unlock_bh(&table->tb6_lock); + spin_unlock_bh(&table->tb6_lock); } /* @@ -207,8 +211,7 @@ static void fib6_link_table(struct net *net, struct fib6_table *tb) * Initialize table lock at a single place to give lockdep a key, * tables aren't visible prior to being linked to the list. */ - rwlock_init(&tb->tb6_lock); - + spin_lock_init(&tb->tb6_lock); h = tb->tb6_id & (FIB6_TABLE_HASHSZ - 1); /* @@ -227,7 +230,8 @@ static struct fib6_table *fib6_alloc_table(struct net *net, u32 id) table = kzalloc(sizeof(*table), GFP_ATOMIC); if (table) { table->tb6_id = id; - table->tb6_root.leaf = net->ipv6.ip6_null_entry; + rcu_assign_pointer(table->tb6_root.leaf, + net->ipv6.ip6_null_entry); table->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&table->tb6_peers); } @@ -324,11 +328,8 @@ unsigned int fib6_tables_seq_read(struct net *net) struct hlist_head *head = &net->ipv6.fib_table_hash[h]; struct fib6_table *tb; - hlist_for_each_entry_rcu(tb, head, tb6_hlist) { - read_lock_bh(&tb->tb6_lock); + hlist_for_each_entry_rcu(tb, head, tb6_hlist) fib_seq += tb->fib_seq; - read_unlock_bh(&tb->tb6_lock); - } } rcu_read_unlock(); @@ -374,7 +375,7 @@ static int fib6_node_dump(struct fib6_walker *w) { struct rt6_info *rt; - for (rt = w->leaf; rt; rt = rt->dst.rt6_next) + for_each_fib6_walker_rt(w) fib6_rt_dump(rt, w->args); w->leaf = NULL; return 0; @@ -384,9 +385,9 @@ static void fib6_table_dump(struct net *net, struct fib6_table *tb, struct fib6_walker *w) { w->root = &tb->tb6_root; - read_lock_bh(&tb->tb6_lock); + spin_lock_bh(&tb->tb6_lock); fib6_walk(net, w); - read_unlock_bh(&tb->tb6_lock); + spin_unlock_bh(&tb->tb6_lock); } /* Called with rcu_read_lock() */ @@ -423,7 +424,7 @@ static int fib6_dump_node(struct fib6_walker *w) int res; struct rt6_info *rt; - for (rt = w->leaf; rt; rt = rt->dst.rt6_next) { + for_each_fib6_walker_rt(w) { res = rt6_dump_route(rt, w->args); if (res < 0) { /* Frame is full, suspend walking */ @@ -482,9 +483,9 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb, w->count = 0; w->skip = 0; - read_lock_bh(&table->tb6_lock); + spin_lock_bh(&table->tb6_lock); res = fib6_walk(net, w); - read_unlock_bh(&table->tb6_lock); + spin_unlock_bh(&table->tb6_lock); if (res > 0) { cb->args[4] = 1; cb->args[5] = w->root->fn_sernum; @@ -499,9 +500,9 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb, } else w->skip = 0; - read_lock_bh(&table->tb6_lock); + spin_lock_bh(&table->tb6_lock); res = 
fib6_walk_continue(w); - read_unlock_bh(&table->tb6_lock); + spin_unlock_bh(&table->tb6_lock); if (res <= 0) { fib6_walker_unlink(net, w); cb->args[4] = 0; @@ -582,11 +583,12 @@ out: * node. */ -static struct fib6_node *fib6_add_1(struct fib6_node *root, - struct in6_addr *addr, int plen, - int offset, int allow_create, - int replace_required, - struct netlink_ext_ack *extack) +static struct fib6_node *fib6_add_1(struct fib6_table *table, + struct fib6_node *root, + struct in6_addr *addr, int plen, + int offset, int allow_create, + int replace_required, + struct netlink_ext_ack *extack) { struct fib6_node *fn, *in, *ln; struct fib6_node *pn = NULL; @@ -601,7 +603,9 @@ static struct fib6_node *fib6_add_1(struct fib6_node *root, fn = root; do { - key = (struct rt6key *)((u8 *)fn->leaf + offset); + struct rt6_info *leaf = rcu_dereference_protected(fn->leaf, + lockdep_is_held(&table->tb6_lock)); + key = (struct rt6key *)((u8 *)leaf + offset); /* * Prefix match @@ -627,8 +631,8 @@ static struct fib6_node *fib6_add_1(struct fib6_node *root, if (plen == fn->fn_bit) { /* clean up an intermediate node */ if (!(fn->fn_flags & RTN_RTINFO)) { - rt6_release(fn->leaf); - fn->leaf = NULL; + RCU_INIT_POINTER(fn->leaf, NULL); + rt6_release(leaf); } return fn; @@ -641,7 +645,11 @@ static struct fib6_node *fib6_add_1(struct fib6_node *root, /* Try to walk down on tree. */ dir = addr_bit_set(addr, fn->fn_bit); pn = fn; - fn = dir ? fn->right : fn->left; + fn = dir ? + rcu_dereference_protected(fn->right, + lockdep_is_held(&table->tb6_lock)) : + rcu_dereference_protected(fn->left, + lockdep_is_held(&table->tb6_lock)); } while (fn); if (!allow_create) { @@ -672,13 +680,12 @@ static struct fib6_node *fib6_add_1(struct fib6_node *root, if (!ln) return ERR_PTR(-ENOMEM); ln->fn_bit = plen; - - ln->parent = pn; + RCU_INIT_POINTER(ln->parent, pn); if (dir) - pn->right = ln; + rcu_assign_pointer(pn->right, ln); else - pn->left = ln; + rcu_assign_pointer(pn->left, ln); return ln; @@ -692,7 +699,8 @@ insert_above: * and the current */ - pn = fn->parent; + pn = rcu_dereference_protected(fn->parent, + lockdep_is_held(&table->tb6_lock)); /* find 1st bit in difference between the 2 addrs. 
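The rcu_assign_pointer() conversions in this hunk are what make point 5 of the commit message work: a node is fully initialized, and references are taken, before the pointer to it becomes visible to readers. The following is a user-space model of that ordering built on C11 release/acquire atomics; the release store and acquire load merely stand in for rcu_assign_pointer()/rcu_dereference(), and the struct is an invented stand-in, so treat this as an illustrative sketch rather than the kernel implementation:

#include <stdatomic.h>
#include <stdio.h>

struct route {
	atomic_int ref;
	int metric;
};

static _Atomic(struct route *) leaf; /* models fn->leaf */

static void publish(struct route *rt)
{
	/* take the reference and finish initialization first... */
	atomic_fetch_add_explicit(&rt->ref, 1, memory_order_relaxed);
	rt->metric = 256;
	/* ...then the release store models rcu_assign_pointer(): all the
	 * writes above are visible to any reader that sees the pointer */
	atomic_store_explicit(&leaf, rt, memory_order_release);
}

static struct route *reader(void)
{
	/* the acquire load models rcu_dereference() on the read side */
	return atomic_load_explicit(&leaf, memory_order_acquire);
}

int main(void)
{
	static struct route rt;
	struct route *r;

	publish(&rt);
	r = reader();
	if (r)
		printf("ref=%d metric=%d\n",
		       atomic_load_explicit(&r->ref, memory_order_relaxed),
		       r->metric);
	return 0;
}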
@@ -729,27 +737,28 @@ insert_above: in->fn_bit = bit; - in->parent = pn; + RCU_INIT_POINTER(in->parent, pn); in->leaf = fn->leaf; - atomic_inc(&in->leaf->rt6i_ref); + atomic_inc(&rcu_dereference_protected(in->leaf, + lockdep_is_held(&table->tb6_lock))->rt6i_ref); /* update parent pointer */ if (dir) - pn->right = in; + rcu_assign_pointer(pn->right, in); else - pn->left = in; + rcu_assign_pointer(pn->left, in); ln->fn_bit = plen; - ln->parent = in; - fn->parent = in; + RCU_INIT_POINTER(ln->parent, in); + rcu_assign_pointer(fn->parent, in); if (addr_bit_set(addr, bit)) { - in->right = ln; - in->left = fn; + rcu_assign_pointer(in->right, ln); + rcu_assign_pointer(in->left, fn); } else { - in->left = ln; - in->right = fn; + rcu_assign_pointer(in->left, ln); + rcu_assign_pointer(in->right, fn); } } else { /* plen <= bit */ @@ -766,19 +775,19 @@ insert_above: ln->fn_bit = plen; - ln->parent = pn; - - if (dir) - pn->right = ln; - else - pn->left = ln; + RCU_INIT_POINTER(ln->parent, pn); if (addr_bit_set(&key->addr, plen)) - ln->right = fn; + RCU_INIT_POINTER(ln->right, fn); else - ln->left = fn; + RCU_INIT_POINTER(ln->left, fn); + + rcu_assign_pointer(fn->parent, ln); - fn->parent = ln; + if (dir) + rcu_assign_pointer(pn->right, ln); + else + rcu_assign_pointer(pn->left, ln); } return ln; } @@ -824,6 +833,8 @@ static int fib6_commit_metrics(struct dst_entry *dst, struct mx6_config *mxc) static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn, struct net *net) { + struct fib6_table *table = rt->rt6i_table; + if (atomic_read(&rt->rt6i_ref) != 1) { /* This route is used as dummy address holder in some split * nodes. It is not leaked, but it still holds other resources, @@ -832,12 +843,17 @@ static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn, * to still alive ones. 
*/ while (fn) { - if (!(fn->fn_flags & RTN_RTINFO) && fn->leaf == rt) { - fn->leaf = fib6_find_prefix(net, fn); - atomic_inc(&fn->leaf->rt6i_ref); + struct rt6_info *leaf = rcu_dereference_protected(fn->leaf, + lockdep_is_held(&table->tb6_lock)); + struct rt6_info *new_leaf; + if (!(fn->fn_flags & RTN_RTINFO) && leaf == rt) { + new_leaf = fib6_find_prefix(net, table, fn); + atomic_inc(&new_leaf->rt6i_ref); + rcu_assign_pointer(fn->leaf, new_leaf); rt6_release(rt); } - fn = fn->parent; + fn = rcu_dereference_protected(fn->parent, + lockdep_is_held(&table->tb6_lock)); } } } @@ -849,9 +865,11 @@ static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn, static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, struct nl_info *info, struct mx6_config *mxc) { + struct rt6_info *leaf = rcu_dereference_protected(fn->leaf, + lockdep_is_held(&rt->rt6i_table->tb6_lock)); struct rt6_info *iter = NULL; - struct rt6_info **ins; - struct rt6_info **fallback_ins = NULL; + struct rt6_info __rcu **ins; + struct rt6_info __rcu **fallback_ins = NULL; int replace = (info->nlh && (info->nlh->nlmsg_flags & NLM_F_REPLACE)); int add = (!info->nlh || @@ -866,7 +884,9 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, ins = &fn->leaf; - for (iter = fn->leaf; iter; iter = iter->dst.rt6_next) { + for (iter = leaf; iter; + iter = rcu_dereference_protected(iter->dst.rt6_next, + lockdep_is_held(&rt->rt6i_table->tb6_lock))) { /* * Search for duplicates */ @@ -928,7 +948,8 @@ next_iter: if (fallback_ins && !found) { /* No ECMP-able route found, replace first non-ECMP one */ ins = fallback_ins; - iter = *ins; + iter = rcu_dereference_protected(*ins, + lockdep_is_held(&rt->rt6i_table->tb6_lock)); found++; } @@ -942,7 +963,7 @@ next_iter: struct rt6_info *sibling, *temp_sibling; /* Find the first route that have the same metric */ - sibling = fn->leaf; + sibling = leaf; while (sibling) { if (sibling->rt6i_metric == rt->rt6i_metric && rt6_qualify_for_ecmp(sibling)) { @@ -950,7 +971,8 @@ next_iter: &sibling->rt6i_siblings); break; } - sibling = sibling->dst.rt6_next; + sibling = rcu_dereference_protected(sibling->dst.rt6_next, + lockdep_is_held(&rt->rt6i_table->tb6_lock)); } /* For each sibling in the list, increment the counter of * siblings. 
BUG() if counters does not match, list of siblings @@ -979,10 +1001,10 @@ add: if (err) return err; - rt->dst.rt6_next = iter; - *ins = rt; - rcu_assign_pointer(rt->rt6i_node, fn); + rcu_assign_pointer(rt->dst.rt6_next, iter); atomic_inc(&rt->rt6i_ref); + rcu_assign_pointer(rt->rt6i_node, fn); + rcu_assign_pointer(*ins, rt); call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_ADD, rt); if (!info->skip_notify) @@ -1008,10 +1030,10 @@ add: if (err) return err; - *ins = rt; + atomic_inc(&rt->rt6i_ref); rcu_assign_pointer(rt->rt6i_node, fn); rt->dst.rt6_next = iter->dst.rt6_next; - atomic_inc(&rt->rt6i_ref); + rcu_assign_pointer(*ins, rt); call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_REPLACE, rt); if (!info->skip_notify) @@ -1023,14 +1045,15 @@ add: nsiblings = iter->rt6i_nsiblings; iter->rt6i_node = NULL; fib6_purge_rt(iter, fn, info->nl_net); - if (fn->rr_ptr == iter) + if (rcu_access_pointer(fn->rr_ptr) == iter) fn->rr_ptr = NULL; rt6_release(iter); if (nsiblings) { /* Replacing an ECMP route, remove all siblings */ ins = &rt->dst.rt6_next; - iter = *ins; + iter = rcu_dereference_protected(*ins, + lockdep_is_held(&rt->rt6i_table->tb6_lock)); while (iter) { if (iter->rt6i_metric > rt->rt6i_metric) break; @@ -1038,14 +1061,15 @@ add: *ins = iter->dst.rt6_next; iter->rt6i_node = NULL; fib6_purge_rt(iter, fn, info->nl_net); - if (fn->rr_ptr == iter) + if (rcu_access_pointer(fn->rr_ptr) == iter) fn->rr_ptr = NULL; rt6_release(iter); nsiblings--; } else { ins = &iter->dst.rt6_next; } - iter = *ins; + iter = rcu_dereference_protected(*ins, + lockdep_is_held(&rt->rt6i_table->tb6_lock)); } WARN_ON(nsiblings != 0); } @@ -1079,7 +1103,8 @@ static void fib6_update_sernum_upto_root(struct rt6_info *rt, smp_wmb(); while (fn) { fn->fn_sernum = sernum; - fn = fn->parent; + fn = rcu_dereference_protected(fn->parent, + lockdep_is_held(&rt->rt6i_table->tb6_lock)); } } @@ -1087,12 +1112,14 @@ static void fib6_update_sernum_upto_root(struct rt6_info *rt, * Add routing information to the routing tree. 
* / * with source addr info in sub-trees + * Need to own table->tb6_lock */ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info, struct mx6_config *mxc, struct netlink_ext_ack *extack) { + struct fib6_table *table = rt->rt6i_table; struct fib6_node *fn, *pn = NULL; int err = -ENOMEM; int allow_create = 1; @@ -1113,7 +1140,8 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, if (!allow_create && !replace_required) pr_warn("RTM_NEWROUTE with no NLM_F_CREATE or NLM_F_REPLACE\n"); - fn = fib6_add_1(root, &rt->rt6i_dst.addr, rt->rt6i_dst.plen, + fn = fib6_add_1(table, root, + &rt->rt6i_dst.addr, rt->rt6i_dst.plen, offsetof(struct rt6_info, rt6i_dst), allow_create, replace_required, extack); if (IS_ERR(fn)) { @@ -1128,7 +1156,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, if (rt->rt6i_src.plen) { struct fib6_node *sn; - if (!fn->subtree) { + if (!rcu_access_pointer(fn->subtree)) { struct fib6_node *sfn; /* @@ -1146,13 +1174,14 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, if (!sfn) goto failure; - sfn->leaf = info->nl_net->ipv6.ip6_null_entry; atomic_inc(&info->nl_net->ipv6.ip6_null_entry->rt6i_ref); + rcu_assign_pointer(sfn->leaf, + info->nl_net->ipv6.ip6_null_entry); sfn->fn_flags = RTN_ROOT; /* Now add the first leaf node to new subtree */ - sn = fib6_add_1(sfn, &rt->rt6i_src.addr, + sn = fib6_add_1(table, sfn, &rt->rt6i_src.addr, rt->rt6i_src.plen, offsetof(struct rt6_info, rt6i_src), allow_create, replace_required, extack); @@ -1168,10 +1197,10 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, } /* Now link new subtree to main tree */ - sfn->parent = fn; - fn->subtree = sfn; + rcu_assign_pointer(sfn->parent, fn); + rcu_assign_pointer(fn->subtree, sfn); } else { - sn = fib6_add_1(fn->subtree, &rt->rt6i_src.addr, + sn = fib6_add_1(table, FIB6_SUBTREE(fn), &rt->rt6i_src.addr, rt->rt6i_src.plen, offsetof(struct rt6_info, rt6i_src), allow_create, replace_required, extack); @@ -1182,9 +1211,9 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, } } - if (!fn->leaf) { - fn->leaf = rt; + if (!rcu_access_pointer(fn->leaf)) { atomic_inc(&rt->rt6i_ref); + rcu_assign_pointer(fn->leaf, rt); } fn = sn; } @@ -1203,19 +1232,23 @@ out: * If fib6_add_1 has cleared the old leaf pointer in the * super-tree leaf node we have to find a new one for it. */ - if (pn != fn && pn->leaf == rt) { - pn->leaf = NULL; + struct rt6_info *pn_leaf = rcu_dereference_protected(pn->leaf, + lockdep_is_held(&table->tb6_lock)); + if (pn != fn && pn_leaf == rt) { + pn_leaf = NULL; + RCU_INIT_POINTER(pn->leaf, NULL); atomic_dec(&rt->rt6i_ref); } - if (pn != fn && !pn->leaf && !(pn->fn_flags & RTN_RTINFO)) { - pn->leaf = fib6_find_prefix(info->nl_net, pn); + if (pn != fn && !pn_leaf && !(pn->fn_flags & RTN_RTINFO)) { + pn_leaf = fib6_find_prefix(info->nl_net, table, pn); #if RT6_DEBUG >= 2 - if (!pn->leaf) { - WARN_ON(pn->leaf == NULL); - pn->leaf = info->nl_net->ipv6.ip6_null_entry; + if (!pn_leaf) { + WARN_ON(!pn_leaf); + pn_leaf = info->nl_net->ipv6.ip6_null_entry; } #endif - atomic_inc(&pn->leaf->rt6i_ref); + atomic_inc(&pn_leaf->rt6i_ref); + rcu_assign_pointer(pn->leaf, pn_leaf); } #endif goto failure; @@ -1230,7 +1263,7 @@ failure: * fn->leaf. 
*/ if (fn && !(fn->fn_flags & (RTN_RTINFO|RTN_ROOT))) - fib6_repair_tree(info->nl_net, fn); + fib6_repair_tree(info->nl_net, table, fn); /* Always release dst as dst->__refcnt is guaranteed * to be taken before entering this function */ @@ -1268,7 +1301,8 @@ static struct fib6_node *fib6_lookup_1(struct fib6_node *root, dir = addr_bit_set(args->addr, fn->fn_bit); - next = dir ? fn->right : fn->left; + next = dir ? rcu_dereference(fn->right) : + rcu_dereference(fn->left); if (next) { fn = next; @@ -1278,8 +1312,10 @@ static struct fib6_node *fib6_lookup_1(struct fib6_node *root, } while (fn) { - if (FIB6_SUBTREE(fn) || fn->fn_flags & RTN_RTINFO) { - struct rt6_info *leaf = fn->leaf; + struct fib6_node *subtree = FIB6_SUBTREE(fn); + + if (subtree || fn->fn_flags & RTN_RTINFO) { + struct rt6_info *leaf = rcu_dereference(fn->leaf); struct rt6key *key; if (!leaf) @@ -1289,10 +1325,9 @@ static struct fib6_node *fib6_lookup_1(struct fib6_node *root, if (ipv6_prefix_equal(&key->addr, args->addr, key->plen)) { #ifdef CONFIG_IPV6_SUBTREES - if (fn->subtree) { + if (subtree) { struct fib6_node *sfn; - sfn = fib6_lookup_1(fn->subtree, - args + 1); + sfn = fib6_lookup_1(subtree, args + 1); if (!sfn) goto backtrack; fn = sfn; @@ -1306,12 +1341,14 @@ backtrack: if (fn->fn_flags & RTN_ROOT) break; - fn = fn->parent; + fn = rcu_dereference(fn->parent); } return NULL; } +/* called with rcu_read_lock() held + */ struct fib6_node *fib6_lookup(struct fib6_node *root, const struct in6_addr *daddr, const struct in6_addr *saddr) { @@ -1359,7 +1396,7 @@ static struct fib6_node *fib6_locate_1(struct fib6_node *root, struct fib6_node *fn, *prev = NULL; for (fn = root; fn ; ) { - struct rt6_info *leaf = fn->leaf; + struct rt6_info *leaf = rcu_dereference(fn->leaf); struct rt6key *key; /* This node is being deleted */ @@ -1389,9 +1426,9 @@ next: * We have more bits to go */ if (addr_bit_set(addr, fn->fn_bit)) - fn = fn->right; + fn = rcu_dereference(fn->right); else - fn = fn->left; + fn = rcu_dereference(fn->left); } out: if (exact_match) @@ -1413,9 +1450,11 @@ struct fib6_node *fib6_locate(struct fib6_node *root, #ifdef CONFIG_IPV6_SUBTREES if (src_len) { + struct fib6_node *subtree = FIB6_SUBTREE(fn); + WARN_ON(saddr == NULL); - if (fn && fn->subtree) - fn = fib6_locate_1(fn->subtree, saddr, src_len, + if (fn && subtree) + fn = fib6_locate_1(subtree, saddr, src_len, offsetof(struct rt6_info, rt6i_src), exact_match); } @@ -1433,16 +1472,26 @@ struct fib6_node *fib6_locate(struct fib6_node *root, * */ -static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn) +static struct rt6_info *fib6_find_prefix(struct net *net, + struct fib6_table *table, + struct fib6_node *fn) { + struct fib6_node *child_left, *child_right; + if (fn->fn_flags & RTN_ROOT) return net->ipv6.ip6_null_entry; while (fn) { - if (fn->left) - return fn->left->leaf; - if (fn->right) - return fn->right->leaf; + child_left = rcu_dereference_protected(fn->left, + lockdep_is_held(&table->tb6_lock)); + child_right = rcu_dereference_protected(fn->right, + lockdep_is_held(&table->tb6_lock)); + if (child_left) + return rcu_dereference_protected(child_left->leaf, + lockdep_is_held(&table->tb6_lock)); + if (child_right) + return rcu_dereference_protected(child_right->leaf, + lockdep_is_held(&table->tb6_lock)); fn = FIB6_SUBTREE(fn); } @@ -1452,31 +1501,49 @@ static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn) /* * Called to trim the tree of intermediate nodes when possible. 
"fn" * is the node we want to try and remove. + * Need to own table->tb6_lock */ static struct fib6_node *fib6_repair_tree(struct net *net, - struct fib6_node *fn) + struct fib6_table *table, + struct fib6_node *fn) { int children; int nstate; - struct fib6_node *child, *pn; + struct fib6_node *child; struct fib6_walker *w; int iter = 0; for (;;) { + struct fib6_node *fn_r = rcu_dereference_protected(fn->right, + lockdep_is_held(&table->tb6_lock)); + struct fib6_node *fn_l = rcu_dereference_protected(fn->left, + lockdep_is_held(&table->tb6_lock)); + struct fib6_node *pn = rcu_dereference_protected(fn->parent, + lockdep_is_held(&table->tb6_lock)); + struct fib6_node *pn_r = rcu_dereference_protected(pn->right, + lockdep_is_held(&table->tb6_lock)); + struct fib6_node *pn_l = rcu_dereference_protected(pn->left, + lockdep_is_held(&table->tb6_lock)); + struct rt6_info *fn_leaf = rcu_dereference_protected(fn->leaf, + lockdep_is_held(&table->tb6_lock)); + struct rt6_info *pn_leaf = rcu_dereference_protected(pn->leaf, + lockdep_is_held(&table->tb6_lock)); + struct rt6_info *new_fn_leaf; + RT6_TRACE("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter); iter++; WARN_ON(fn->fn_flags & RTN_RTINFO); WARN_ON(fn->fn_flags & RTN_TL_ROOT); - WARN_ON(fn->leaf); + WARN_ON(fn_leaf); children = 0; child = NULL; - if (fn->right) - child = fn->right, children |= 1; - if (fn->left) - child = fn->left, children |= 2; + if (fn_r) + child = fn_r, children |= 1; + if (fn_l) + child = fn_l, children |= 2; if (children == 3 || FIB6_SUBTREE(fn) #ifdef CONFIG_IPV6_SUBTREES @@ -1484,36 +1551,36 @@ static struct fib6_node *fib6_repair_tree(struct net *net, || (children && fn->fn_flags & RTN_ROOT) #endif ) { - fn->leaf = fib6_find_prefix(net, fn); + new_fn_leaf = fib6_find_prefix(net, table, fn); #if RT6_DEBUG >= 2 - if (!fn->leaf) { - WARN_ON(!fn->leaf); - fn->leaf = net->ipv6.ip6_null_entry; + if (!new_fn_leaf) { + WARN_ON(!new_fn_leaf); + new_fn_leaf = net->ipv6.ip6_null_entry; } #endif - atomic_inc(&fn->leaf->rt6i_ref); - return fn->parent; + atomic_inc(&new_fn_leaf->rt6i_ref); + rcu_assign_pointer(fn->leaf, new_fn_leaf); + return pn; } - pn = fn->parent; #ifdef CONFIG_IPV6_SUBTREES if (FIB6_SUBTREE(pn) == fn) { WARN_ON(!(fn->fn_flags & RTN_ROOT)); - FIB6_SUBTREE(pn) = NULL; + RCU_INIT_POINTER(pn->subtree, NULL); nstate = FWS_L; } else { WARN_ON(fn->fn_flags & RTN_ROOT); #endif - if (pn->right == fn) - pn->right = child; - else if (pn->left == fn) - pn->left = child; + if (pn_r == fn) + rcu_assign_pointer(pn->right, child); + else if (pn_l == fn) + rcu_assign_pointer(pn->left, child); #if RT6_DEBUG >= 2 else WARN_ON(1); #endif if (child) - child->parent = pn; + rcu_assign_pointer(child->parent, pn); nstate = FWS_R; #ifdef CONFIG_IPV6_SUBTREES } @@ -1546,17 +1613,18 @@ static struct fib6_node *fib6_repair_tree(struct net *net, if (pn->fn_flags & RTN_RTINFO || FIB6_SUBTREE(pn)) return pn; - rt6_release(pn->leaf); - pn->leaf = NULL; + RCU_INIT_POINTER(pn->leaf, NULL); + rt6_release(pn_leaf); fn = pn; } } -static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, - struct nl_info *info) +static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, + struct rt6_info __rcu **rtp, struct nl_info *info) { struct fib6_walker *w; - struct rt6_info *rt = *rtp; + struct rt6_info *rt = rcu_dereference_protected(*rtp, + lockdep_is_held(&table->tb6_lock)); struct net *net = info->nl_net; RT6_TRACE("fib6_del_route\n"); @@ -1573,7 +1641,7 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info 
**rtp, rt6_flush_exceptions(rt); /* Reset round-robin state, if necessary */ - if (fn->rr_ptr == rt) + if (rcu_access_pointer(fn->rr_ptr) == rt) fn->rr_ptr = NULL; /* Remove this entry from other siblings */ @@ -1592,20 +1660,19 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, FOR_WALKERS(net, w) { if (w->state == FWS_C && w->leaf == rt) { RT6_TRACE("walker %p adjusted by delroute\n", w); - w->leaf = rt->dst.rt6_next; + w->leaf = rcu_dereference_protected(rt->dst.rt6_next, + lockdep_is_held(&table->tb6_lock)); if (!w->leaf) w->state = FWS_U; } } read_unlock(&net->ipv6.fib6_walker_lock); - rt->dst.rt6_next = NULL; - /* If it was last route, expunge its radix tree node */ - if (!fn->leaf) { + if (!rcu_access_pointer(fn->leaf)) { fn->fn_flags &= ~RTN_RTINFO; net->ipv6.rt6_stats->fib_route_nodes--; - fn = fib6_repair_tree(net, fn); + fn = fib6_repair_tree(net, table, fn); } fib6_purge_rt(rt, fn, net); @@ -1616,12 +1683,15 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, rt6_release(rt); } +/* Need to own table->tb6_lock */ int fib6_del(struct rt6_info *rt, struct nl_info *info) { struct fib6_node *fn = rcu_dereference_protected(rt->rt6i_node, lockdep_is_held(&rt->rt6i_table->tb6_lock)); + struct fib6_table *table = rt->rt6i_table; struct net *net = info->nl_net; - struct rt6_info **rtp; + struct rt6_info __rcu **rtp; + struct rt6_info __rcu **rtp_next; #if RT6_DEBUG >= 2 if (rt->dst.obsolete > 0) { @@ -1642,11 +1712,14 @@ int fib6_del(struct rt6_info *rt, struct nl_info *info) * Walk the leaf entries looking for ourself */ - for (rtp = &fn->leaf; *rtp; rtp = &(*rtp)->dst.rt6_next) { - if (*rtp == rt) { - fib6_del_route(fn, rtp, info); + for (rtp = &fn->leaf; *rtp; rtp = rtp_next) { + struct rt6_info *cur = rcu_dereference_protected(*rtp, + lockdep_is_held(&table->tb6_lock)); + if (rt == cur) { + fib6_del_route(table, fn, rtp, info); return 0; } + rtp_next = &cur->dst.rt6_next; } return -ENOENT; } @@ -1673,11 +1746,13 @@ int fib6_del(struct rt6_info *rt, struct nl_info *info) * 0 -> walk is complete. * >0 -> walk is incomplete (i.e. suspended) * <0 -> walk is terminated by an error. + * + * This function is called with tb6_lock held. 
*/ static int fib6_walk_continue(struct fib6_walker *w) { - struct fib6_node *fn, *pn; + struct fib6_node *fn, *pn, *left, *right; /* w->root should always be table->tb6_root */ WARN_ON_ONCE(!(w->root->fn_flags & RTN_TL_ROOT)); @@ -1697,20 +1772,22 @@ static int fib6_walk_continue(struct fib6_walker *w) w->state = FWS_L; #endif case FWS_L: - if (fn->left) { - w->node = fn->left; + left = rcu_dereference_protected(fn->left, 1); + if (left) { + w->node = left; w->state = FWS_INIT; continue; } w->state = FWS_R; case FWS_R: - if (fn->right) { - w->node = fn->right; + right = rcu_dereference_protected(fn->right, 1); + if (right) { + w->node = right; w->state = FWS_INIT; continue; } w->state = FWS_C; - w->leaf = fn->leaf; + w->leaf = rcu_dereference_protected(fn->leaf, 1); case FWS_C: if (w->leaf && fn->fn_flags & RTN_RTINFO) { int err; @@ -1732,7 +1809,9 @@ skip: case FWS_U: if (fn == w->root) return 0; - pn = fn->parent; + pn = rcu_dereference_protected(fn->parent, 1); + left = rcu_dereference_protected(pn->left, 1); + right = rcu_dereference_protected(pn->right, 1); w->node = pn; #ifdef CONFIG_IPV6_SUBTREES if (FIB6_SUBTREE(pn) == fn) { @@ -1741,13 +1820,13 @@ skip: continue; } #endif - if (pn->left == fn) { + if (left == fn) { w->state = FWS_R; continue; } - if (pn->right == fn) { + if (right == fn) { w->state = FWS_C; - w->leaf = w->node->leaf; + w->leaf = rcu_dereference_protected(w->node->leaf, 1); continue; } #if RT6_DEBUG >= 2 @@ -1790,7 +1869,7 @@ static int fib6_clean_node(struct fib6_walker *w) return 0; } - for (rt = w->leaf; rt; rt = rt->dst.rt6_next) { + for_each_fib6_walker_rt(w) { res = c->func(rt, c->arg); if (res < 0) { w->leaf = rt; @@ -1850,10 +1929,10 @@ static void __fib6_clean_all(struct net *net, for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { head = &net->ipv6.fib_table_hash[h]; hlist_for_each_entry_rcu(table, head, tb6_hlist) { - write_lock_bh(&table->tb6_lock); + spin_lock_bh(&table->tb6_lock); fib6_clean_tree(net, &table->tb6_root, func, sernum, arg); - write_unlock_bh(&table->tb6_lock); + spin_unlock_bh(&table->tb6_lock); } } rcu_read_unlock(); @@ -1967,7 +2046,8 @@ static int __net_init fib6_net_init(struct net *net) goto out_fib_table_hash; net->ipv6.fib6_main_tbl->tb6_id = RT6_TABLE_MAIN; - net->ipv6.fib6_main_tbl->tb6_root.leaf = net->ipv6.ip6_null_entry; + rcu_assign_pointer(net->ipv6.fib6_main_tbl->tb6_root.leaf, + net->ipv6.ip6_null_entry); net->ipv6.fib6_main_tbl->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&net->ipv6.fib6_main_tbl->tb6_peers); @@ -1978,7 +2058,8 @@ static int __net_init fib6_net_init(struct net *net) if (!net->ipv6.fib6_local_tbl) goto out_fib6_main_tbl; net->ipv6.fib6_local_tbl->tb6_id = RT6_TABLE_LOCAL; - net->ipv6.fib6_local_tbl->tb6_root.leaf = net->ipv6.ip6_null_entry; + rcu_assign_pointer(net->ipv6.fib6_local_tbl->tb6_root.leaf, + net->ipv6.ip6_null_entry); net->ipv6.fib6_local_tbl->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&net->ipv6.fib6_local_tbl->tb6_peers); @@ -2108,7 +2189,9 @@ static int ipv6_route_yield(struct fib6_walker *w) return 1; do { - iter->w.leaf = iter->w.leaf->dst.rt6_next; + iter->w.leaf = rcu_dereference_protected( + iter->w.leaf->dst.rt6_next, + lockdep_is_held(&iter->tbl->tb6_lock)); iter->skip--; if (!iter->skip && iter->w.leaf) return 1; @@ -2173,7 +2256,7 @@ static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos) if (!v) goto iter_table; - n = ((struct rt6_info *)v)->dst.rt6_next; + n = rcu_dereference(((struct rt6_info 
*)v)->dst.rt6_next); if (n) { ++*pos; return n; @@ -2181,9 +2264,9 @@ static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos) iter_table: ipv6_route_check_sernum(iter); - read_lock(&iter->tbl->tb6_lock); + spin_lock_bh(&iter->tbl->tb6_lock); r = fib6_walk_continue(&iter->w); - read_unlock(&iter->tbl->tb6_lock); + spin_unlock_bh(&iter->tbl->tb6_lock); if (r > 0) { if (v) ++*pos; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 24b80f43bbfb..cf44d0994b1e 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -488,7 +488,7 @@ static struct rt6_info *rt6_multipath_select(struct rt6_info *match, } /* - * Route lookup. Any table->tb6_lock is implied. + * Route lookup. rcu_read_lock() should be held. */ static inline struct rt6_info *rt6_device_match(struct net *net, @@ -503,7 +503,7 @@ static inline struct rt6_info *rt6_device_match(struct net *net, if (!oif && ipv6_addr_any(saddr)) goto out; - for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) { + for (sprt = rt; sprt; sprt = rcu_dereference(sprt->dst.rt6_next)) { struct net_device *dev = sprt->dst.dev; if (oif) { @@ -722,7 +722,7 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn, match = NULL; cont = NULL; - for (rt = rr_head; rt; rt = rt->dst.rt6_next) { + for (rt = rr_head; rt; rt = rcu_dereference(rt->dst.rt6_next)) { if (rt->rt6i_metric != metric) { cont = rt; break; @@ -731,7 +731,8 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn, match = find_match(rt, oif, strict, &mpri, match, do_rr); } - for (rt = leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) { + for (rt = leaf; rt && rt != rr_head; + rt = rcu_dereference(rt->dst.rt6_next)) { if (rt->rt6i_metric != metric) { cont = rt; break; @@ -743,7 +744,7 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn, if (match || !cont) return match; - for (rt = cont; rt; rt = rt->dst.rt6_next) + for (rt = cont; rt; rt = rcu_dereference(rt->dst.rt6_next)) match = find_match(rt, oif, strict, &mpri, match, do_rr); return match; @@ -752,7 +753,7 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn, static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn, int oif, int strict) { - struct rt6_info *leaf = fn->leaf; + struct rt6_info *leaf = rcu_dereference(fn->leaf); struct rt6_info *match, *rt0; bool do_rr = false; int key_plen; @@ -760,9 +761,9 @@ static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn, if (!leaf) return net->ipv6.ip6_null_entry; - rt0 = fn->rr_ptr; + rt0 = rcu_dereference(fn->rr_ptr); if (!rt0) - fn->rr_ptr = rt0 = leaf; + rt0 = leaf; /* Double check to make sure fn is not an intermediate node * and fn->leaf does not points to its child's leaf @@ -781,14 +782,19 @@ static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn, &do_rr); if (do_rr) { - struct rt6_info *next = rt0->dst.rt6_next; + struct rt6_info *next = rcu_dereference(rt0->dst.rt6_next); /* no entries matched; do round-robin */ if (!next || next->rt6i_metric != rt0->rt6i_metric) next = leaf; - if (next != rt0) - fn->rr_ptr = next; + if (next != rt0) { + spin_lock_bh(&leaf->rt6i_table->tb6_lock); + /* make sure next is not being deleted from the tree */ + if (next->rt6i_node) + rcu_assign_pointer(fn->rr_ptr, next); + spin_unlock_bh(&leaf->rt6i_table->tb6_lock); + } } return match ? 
match : net->ipv6.ip6_null_entry; @@ -878,13 +884,14 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, static struct fib6_node* fib6_backtrack(struct fib6_node *fn, struct in6_addr *saddr) { - struct fib6_node *pn; + struct fib6_node *pn, *sn; while (1) { if (fn->fn_flags & RTN_TL_ROOT) return NULL; - pn = fn->parent; - if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) - fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); + pn = rcu_dereference(fn->parent); + sn = FIB6_SUBTREE(pn); + if (sn && sn != fn) + fn = fib6_lookup(sn, NULL, saddr); else fn = pn; if (fn->fn_flags & RTN_RTINFO) @@ -916,13 +923,19 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net, struct rt6_info *rt, *rt_cache; struct fib6_node *fn; - read_lock_bh(&table->tb6_lock); + rcu_read_lock(); fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); restart: - rt = fn->leaf; - rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags); - if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0) - rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags); + rt = rcu_dereference(fn->leaf); + if (!rt) { + rt = net->ipv6.ip6_null_entry; + } else { + rt = rt6_device_match(net, rt, &fl6->saddr, + fl6->flowi6_oif, flags); + if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0) + rt = rt6_multipath_select(rt, fl6, + fl6->flowi6_oif, flags); + } if (rt == net->ipv6.ip6_null_entry) { fn = fib6_backtrack(fn, &fl6->saddr); if (fn) @@ -936,7 +949,7 @@ restart: if (ip6_hold_safe(net, &rt, true)) dst_use_noref(&rt->dst, jiffies); - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); trace_fib6_table_lookup(net, rt, table->tb6_id, fl6); @@ -990,9 +1003,9 @@ static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info, struct fib6_table *table; table = rt->rt6i_table; - write_lock_bh(&table->tb6_lock); + spin_lock_bh(&table->tb6_lock); err = fib6_add(&table->tb6_root, rt, info, mxc, extack); - write_unlock_bh(&table->tb6_lock); + spin_unlock_bh(&table->tb6_lock); return err; } @@ -1090,7 +1103,7 @@ static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt) return pcpu_rt; } -/* It should be called with read_lock_bh(&tb6_lock) acquired */ +/* It should be called with rcu_read_lock() acquired */ static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt) { struct rt6_info *pcpu_rt, **p; @@ -1632,7 +1645,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, if (net->ipv6.devconf_all->forwarding == 0) strict |= RT6_LOOKUP_F_REACHABLE; - read_lock_bh(&table->tb6_lock); + rcu_read_lock(); fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); saved_fn = fn; @@ -1662,7 +1675,7 @@ redo_rt6_select: rt = rt_cache; if (rt == net->ipv6.ip6_null_entry) { - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); dst_hold(&rt->dst); trace_fib6_table_lookup(net, rt, table->tb6_id, fl6); return rt; @@ -1671,7 +1684,7 @@ redo_rt6_select: dst_use_noref(&rt->dst, jiffies); rt6_dst_from_metrics_check(rt); } - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); trace_fib6_table_lookup(net, rt, table->tb6_id, fl6); return rt; } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) && @@ -1687,11 +1700,11 @@ redo_rt6_select: if (ip6_hold_safe(net, &rt, true)) { dst_use_noref(&rt->dst, jiffies); } else { - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); uncached_rt = rt; goto uncached_rt_out; } - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL); dst_release(&rt->dst); @@ -1719,7 +1732,7 @@ uncached_rt_out: pcpu_rt = 
rt6_get_pcpu_route(rt); if (pcpu_rt) { - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); } else { /* atomic_inc_not_zero() is needed when using rcu */ if (atomic_inc_not_zero(&rt->rt6i_ref)) { @@ -1730,12 +1743,12 @@ uncached_rt_out: * No dst_hold() on rt is needed because grabbing * rt->rt6i_ref makes sure rt can't be released. */ - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); pcpu_rt = rt6_make_pcpu_route(rt); rt6_release(rt); } else { /* rt is already removed from tree */ - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); pcpu_rt = net->ipv6.ip6_null_entry; dst_hold(&pcpu_rt->dst); } @@ -2131,10 +2144,10 @@ static struct rt6_info *__ip6_route_redirect(struct net *net, * routes. */ - read_lock_bh(&table->tb6_lock); + rcu_read_lock(); fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); restart: - for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { + for_each_fib6_node_rt_rcu(fn) { if (rt6_check_expired(rt)) continue; if (rt->dst.error) @@ -2179,7 +2192,7 @@ restart: out: ip6_hold_safe(net, &rt, true); - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); trace_fib6_table_lookup(net, rt, table->tb6_id, fl6); return rt; @@ -2778,9 +2791,9 @@ static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info) } table = rt->rt6i_table; - write_lock_bh(&table->tb6_lock); + spin_lock_bh(&table->tb6_lock); err = fib6_del(rt, info); - write_unlock_bh(&table->tb6_lock); + spin_unlock_bh(&table->tb6_lock); out: ip6_rt_put(rt); @@ -2806,7 +2819,7 @@ static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg) if (rt == net->ipv6.ip6_null_entry) goto out_put; table = rt->rt6i_table; - write_lock_bh(&table->tb6_lock); + spin_lock_bh(&table->tb6_lock); if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) { struct rt6_info *sibling, *next_sibling; @@ -2836,7 +2849,7 @@ static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg) err = fib6_del(rt, info); out_unlock: - write_unlock_bh(&table->tb6_lock); + spin_unlock_bh(&table->tb6_lock); out_put: ip6_rt_put(rt); @@ -2861,7 +2874,7 @@ static int ip6_route_del(struct fib6_config *cfg, return err; } - read_lock_bh(&table->tb6_lock); + rcu_read_lock(); fn = fib6_locate(&table->tb6_root, &cfg->fc_dst, cfg->fc_dst_len, @@ -2869,7 +2882,7 @@ static int ip6_route_del(struct fib6_config *cfg, !(cfg->fc_flags & RTF_CACHE)); if (fn) { - for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { + for_each_fib6_node_rt_rcu(fn) { if (cfg->fc_flags & RTF_CACHE) { rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst, &cfg->fc_src); @@ -2890,7 +2903,7 @@ static int ip6_route_del(struct fib6_config *cfg, continue; if (!dst_hold_safe(&rt->dst)) break; - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); /* if gateway was specified only delete the one hop */ if (cfg->fc_flags & RTF_GATEWAY) @@ -2899,7 +2912,7 @@ static int ip6_route_del(struct fib6_config *cfg, return __ip6_del_rt_siblings(rt, cfg); } } - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); return err; } @@ -3074,12 +3087,12 @@ static struct rt6_info *rt6_get_route_info(struct net *net, if (!table) return NULL; - read_lock_bh(&table->tb6_lock); + rcu_read_lock(); fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true); if (!fn) goto out; - for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { + for_each_fib6_node_rt_rcu(fn) { if (rt->dst.dev->ifindex != ifindex) continue; if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY)) @@ -3090,7 +3103,7 @@ static struct rt6_info *rt6_get_route_info(struct net *net, break; } 
out: - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); return rt; } @@ -3136,8 +3149,8 @@ struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_dev if (!table) return NULL; - read_lock_bh(&table->tb6_lock); - for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) { + rcu_read_lock(); + for_each_fib6_node_rt_rcu(&table->tb6_root) { if (dev == rt->dst.dev && ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && ipv6_addr_equal(&rt->rt6i_gateway, addr)) @@ -3145,7 +3158,7 @@ struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_dev } if (rt) ip6_hold_safe(NULL, &rt, false); - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); return rt; } @@ -3183,20 +3196,20 @@ static void __rt6_purge_dflt_routers(struct fib6_table *table) struct rt6_info *rt; restart: - read_lock_bh(&table->tb6_lock); - for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) { + rcu_read_lock(); + for_each_fib6_node_rt_rcu(&table->tb6_root) { if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) && (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) { if (dst_hold_safe(&rt->dst)) { - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); ip6_del_rt(rt); } else { - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); } goto restart; } } - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER; } -- cgit v1.2.3 From 81eb8447daae3b62247aa66bb17b82f8fef68249 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 6 Oct 2017 12:06:11 -0700 Subject: ipv6: take care of rt6_stats Currently, most of the rt6_stats are not hooked up correctly. As the last part of this patch series, hook up all existing rt6_stats and add one new stat fib_rt_uncache to indicate the number of routes in the uncached list. For details of the stats, please refer to the comments added in include/net/ip6_fib.h. Note: fib_rt_alloc and fib_rt_uncache are not guaranteed to be modified under a lock. So atomic_t is used for them. Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/net/ip6_fib.h | 15 +++++++++------ net/ipv6/ip6_fib.c | 42 ++++++++++++++++++++++++------------------ net/ipv6/route.c | 16 ++++++++++++++-- 3 files changed, 47 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 0b438b9bcb10..10c913816032 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -297,12 +297,15 @@ struct fib6_walker { }; struct rt6_statistics { - __u32 fib_nodes; - __u32 fib_route_nodes; - __u32 fib_rt_alloc; /* permanent routes */ - __u32 fib_rt_entries; /* rt entries in table */ - __u32 fib_rt_cache; /* cache routes */ - __u32 fib_discarded_routes; + __u32 fib_nodes; /* all fib6 nodes */ + __u32 fib_route_nodes; /* intermediate nodes */ + __u32 fib_rt_entries; /* rt entries in fib table */ + __u32 fib_rt_cache; /* cached rt entries in exception table */ + __u32 fib_discarded_routes; /* total number of routes delete */ + + /* The following stats are not protected by any lock */ + atomic_t fib_rt_alloc; /* total number of routes alloced */ + atomic_t fib_rt_uncache; /* rt entries in uncached list */ }; #define RTN_TL_ROOT 0x0001 diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 3f95908b39c3..52a29ba32928 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -149,18 +149,21 @@ static __be32 addr_bit_set(const void *token, int fn_bit) addr[fn_bit >> 5]; } -static struct fib6_node *node_alloc(void) +static struct fib6_node *node_alloc(struct net *net) { struct fib6_node *fn; fn = kmem_cache_zalloc(fib6_node_kmem, GFP_ATOMIC); + if (fn) + net->ipv6.rt6_stats->fib_nodes++; return fn; } -static void node_free_immediate(struct fib6_node *fn) +static void node_free_immediate(struct net *net, struct fib6_node *fn) { kmem_cache_free(fib6_node_kmem, fn); + net->ipv6.rt6_stats->fib_nodes--; } static void node_free_rcu(struct rcu_head *head) @@ -170,9 +173,10 @@ static void node_free_rcu(struct rcu_head *head) kmem_cache_free(fib6_node_kmem, fn); } -static void node_free(struct fib6_node *fn) +static void node_free(struct net *net, struct fib6_node *fn) { call_rcu(&fn->rcu, node_free_rcu); + net->ipv6.rt6_stats->fib_nodes--; } void rt6_free_pcpu(struct rt6_info *non_pcpu_rt) @@ -583,7 +587,8 @@ out: * node. */ -static struct fib6_node *fib6_add_1(struct fib6_table *table, +static struct fib6_node *fib6_add_1(struct net *net, + struct fib6_table *table, struct fib6_node *root, struct in6_addr *addr, int plen, int offset, int allow_create, @@ -675,7 +680,7 @@ static struct fib6_node *fib6_add_1(struct fib6_table *table, * Create new leaf node without children. 
*/ - ln = node_alloc(); + ln = node_alloc(net); if (!ln) return ERR_PTR(-ENOMEM); @@ -716,14 +721,14 @@ insert_above: * (new leaf node)[ln] (old node)[fn] */ if (plen > bit) { - in = node_alloc(); - ln = node_alloc(); + in = node_alloc(net); + ln = node_alloc(net); if (!in || !ln) { if (in) - node_free_immediate(in); + node_free_immediate(net, in); if (ln) - node_free_immediate(ln); + node_free_immediate(net, ln); return ERR_PTR(-ENOMEM); } @@ -768,7 +773,7 @@ insert_above: * (old node)[fn] NULL */ - ln = node_alloc(); + ln = node_alloc(net); if (!ln) return ERR_PTR(-ENOMEM); @@ -1065,6 +1070,7 @@ add: fn->rr_ptr = NULL; rt6_release(iter); nsiblings--; + info->nl_net->ipv6.rt6_stats->fib_rt_entries--; } else { ins = &iter->dst.rt6_next; } @@ -1140,7 +1146,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, if (!allow_create && !replace_required) pr_warn("RTM_NEWROUTE with no NLM_F_CREATE or NLM_F_REPLACE\n"); - fn = fib6_add_1(table, root, + fn = fib6_add_1(info->nl_net, table, root, &rt->rt6i_dst.addr, rt->rt6i_dst.plen, offsetof(struct rt6_info, rt6i_dst), allow_create, replace_required, extack); @@ -1170,7 +1176,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, */ /* Create subtree root node */ - sfn = node_alloc(); + sfn = node_alloc(info->nl_net); if (!sfn) goto failure; @@ -1181,8 +1187,8 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, /* Now add the first leaf node to new subtree */ - sn = fib6_add_1(table, sfn, &rt->rt6i_src.addr, - rt->rt6i_src.plen, + sn = fib6_add_1(info->nl_net, table, sfn, + &rt->rt6i_src.addr, rt->rt6i_src.plen, offsetof(struct rt6_info, rt6i_src), allow_create, replace_required, extack); @@ -1191,7 +1197,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, root, and then (in failure) stale node in main tree. 
*/ - node_free_immediate(sfn); + node_free_immediate(info->nl_net, sfn); err = PTR_ERR(sn); goto failure; } @@ -1200,8 +1206,8 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, rcu_assign_pointer(sfn->parent, fn); rcu_assign_pointer(fn->subtree, sfn); } else { - sn = fib6_add_1(table, FIB6_SUBTREE(fn), &rt->rt6i_src.addr, - rt->rt6i_src.plen, + sn = fib6_add_1(info->nl_net, table, FIB6_SUBTREE(fn), + &rt->rt6i_src.addr, rt->rt6i_src.plen, offsetof(struct rt6_info, rt6i_src), allow_create, replace_required, extack); @@ -1609,7 +1615,7 @@ static struct fib6_node *fib6_repair_tree(struct net *net, } read_unlock(&net->ipv6.fib6_walker_lock); - node_free(fn); + node_free(net, fn); if (pn->fn_flags & RTN_RTINFO || FIB6_SUBTREE(pn)) return pn; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index cf44d0994b1e..399d1bceec4a 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -143,9 +143,11 @@ static void rt6_uncached_list_del(struct rt6_info *rt) { if (!list_empty(&rt->rt6i_uncached)) { struct uncached_list *ul = rt->rt6i_uncached_list; + struct net *net = dev_net(rt->dst.dev); spin_lock_bh(&ul->lock); list_del(&rt->rt6i_uncached); + atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache); spin_unlock_bh(&ul->lock); } } @@ -359,8 +361,10 @@ static struct rt6_info *__ip6_dst_alloc(struct net *net, struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK, flags); - if (rt) + if (rt) { rt6_info_init(rt); + atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc); + } return rt; } @@ -1156,6 +1160,8 @@ static DEFINE_SPINLOCK(rt6_exception_lock); static void rt6_remove_exception(struct rt6_exception_bucket *bucket, struct rt6_exception *rt6_ex) { + struct net *net = dev_net(rt6_ex->rt6i->dst.dev); + if (!bucket || !rt6_ex) return; rt6_ex->rt6i->rt6i_node = NULL; @@ -1164,6 +1170,7 @@ static void rt6_remove_exception(struct rt6_exception_bucket *bucket, kfree_rcu(rt6_ex, rcu); WARN_ON_ONCE(!bucket->depth); bucket->depth--; + net->ipv6.rt6_stats->fib_rt_cache--; } /* Remove oldest rt6_ex in bucket and free the memory @@ -1270,6 +1277,7 @@ __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket, static int rt6_insert_exception(struct rt6_info *nrt, struct rt6_info *ort) { + struct net *net = dev_net(ort->dst.dev); struct rt6_exception_bucket *bucket; struct in6_addr *src_key = NULL; struct rt6_exception *rt6_ex; @@ -1339,6 +1347,7 @@ static int rt6_insert_exception(struct rt6_info *nrt, nrt->rt6i_node = ort->rt6i_node; hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain); bucket->depth++; + net->ipv6.rt6_stats->fib_rt_cache++; if (bucket->depth > FIB6_MAX_DEPTH) rt6_exception_remove_oldest(bucket); @@ -1714,6 +1723,7 @@ redo_rt6_select: * No need for another dst_hold() */ rt6_uncached_list_add(uncached_rt); + atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache); } else { uncached_rt = net->ipv6.ip6_null_entry; dst_hold(&uncached_rt->dst); @@ -1894,6 +1904,7 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori DST_OBSOLETE_NONE, 0); if (rt) { rt6_info_init(rt); + atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc); new = &rt->dst; new->__use = 1; @@ -2341,6 +2352,7 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev, * do proper release of the net_device */ rt6_uncached_list_add(rt); + atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache); dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0); @@ -4422,7 +4434,7 @@ static int rt6_stats_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n", 
net->ipv6.rt6_stats->fib_nodes, net->ipv6.rt6_stats->fib_route_nodes, - net->ipv6.rt6_stats->fib_rt_alloc, + atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc), net->ipv6.rt6_stats->fib_rt_entries, net->ipv6.rt6_stats->fib_rt_cache, dst_entries_get_slow(&net->ipv6.ip6_dst_ops), -- cgit v1.2.3 From bdc476413dcdb5c38a7dec90fb2bca327021273a Mon Sep 17 00:00:00 2001 From: Amine Kherbouche Date: Wed, 4 Oct 2017 19:35:57 +0200 Subject: ip_tunnel: add mpls over gre support This commit introduces the MPLSoGRE support (RFC 4023), using ip tunnel API by simply adding ipgre_tunnel_encap_(add|del)_mpls_ops() and the new tunnel type TUNNEL_ENCAP_MPLS. Signed-off-by: Amine Kherbouche Signed-off-by: David S. Miller --- include/uapi/linux/if_tunnel.h | 1 + net/mpls/af_mpls.c | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/if_tunnel.h b/include/uapi/linux/if_tunnel.h index 2e520883c054..a2f48c01365e 100644 --- a/include/uapi/linux/if_tunnel.h +++ b/include/uapi/linux/if_tunnel.h @@ -84,6 +84,7 @@ enum tunnel_encap_types { TUNNEL_ENCAP_NONE, TUNNEL_ENCAP_FOU, TUNNEL_ENCAP_GUE, + TUNNEL_ENCAP_MPLS, }; #define TUNNEL_ENCAP_FLAG_CSUM (1<<0) diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index c5b9ce41d66f..9745e8f69810 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #if IS_ENABLED(CONFIG_IPV6) #include @@ -39,6 +40,36 @@ static int one = 1; static int label_limit = (1 << 20) - 1; static int ttl_max = 255; +#if IS_ENABLED(CONFIG_NET_IP_TUNNEL) +size_t ipgre_mpls_encap_hlen(struct ip_tunnel_encap *e) +{ + return sizeof(struct mpls_shim_hdr); +} + +static const struct ip_tunnel_encap_ops mpls_iptun_ops = { + .encap_hlen = ipgre_mpls_encap_hlen, +}; + +static int ipgre_tunnel_encap_add_mpls_ops(void) +{ + return ip_tunnel_encap_add_ops(&mpls_iptun_ops, TUNNEL_ENCAP_MPLS); +} + +static void ipgre_tunnel_encap_del_mpls_ops(void) +{ + ip_tunnel_encap_del_ops(&mpls_iptun_ops, TUNNEL_ENCAP_MPLS); +} +#else +static int ipgre_tunnel_encap_add_mpls_ops(void) +{ + return 0; +} + +static void ipgre_tunnel_encap_del_mpls_ops(void) +{ +} +#endif + static void rtmsg_lfib(int event, u32 label, struct mpls_route *rt, struct nlmsghdr *nlh, struct net *net, u32 portid, unsigned int nlm_flags); @@ -2485,6 +2516,10 @@ static int __init mpls_init(void) 0); rtnl_register(PF_MPLS, RTM_GETNETCONF, mpls_netconf_get_devconf, mpls_netconf_dump_devconf, 0); + err = ipgre_tunnel_encap_add_mpls_ops(); + if (err) + pr_err("Can't add mpls over gre tunnel ops\n"); + err = 0; out: return err; @@ -2502,6 +2537,7 @@ static void __exit mpls_exit(void) dev_remove_pack(&mpls_packet_type); unregister_netdevice_notifier(&mpls_dev_notifier); unregister_pernet_subsys(&mpls_net_ops); + ipgre_tunnel_encap_del_mpls_ops(); } module_exit(mpls_exit); -- cgit v1.2.3 From 97562633bcbac4a07d605ae628d7655fa71caaf5 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 5 Oct 2017 09:19:19 -0700 Subject: bpf: perf event change needed for subsequent bpf helpers This patch does not impact existing functionalities. It contains the changes in perf event area needed for subsequent bpf_perf_event_read_value and bpf_perf_prog_read_value helpers. Signed-off-by: Yonghong Song Acked-by: Peter Zijlstra (Intel) Signed-off-by: David S. 
Miller --- include/linux/perf_event.h | 7 +++++-- kernel/bpf/arraymap.c | 2 +- kernel/events/core.c | 15 +++++++++++++-- kernel/trace/bpf_trace.c | 2 +- 4 files changed, 20 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 8e22f24ded6a..79b18a20cf5d 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -806,6 +806,7 @@ struct perf_output_handle { struct bpf_perf_event_data_kern { struct pt_regs *regs; struct perf_sample_data *data; + struct perf_event *event; }; #ifdef CONFIG_CGROUP_PERF @@ -884,7 +885,8 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, void *context); extern void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu); -int perf_event_read_local(struct perf_event *event, u64 *value); +int perf_event_read_local(struct perf_event *event, u64 *value, + u64 *enabled, u64 *running); extern u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running); @@ -1286,7 +1288,8 @@ static inline const struct perf_event_attr *perf_event_attrs(struct perf_event * { return ERR_PTR(-EINVAL); } -static inline int perf_event_read_local(struct perf_event *event, u64 *value) +static inline int perf_event_read_local(struct perf_event *event, u64 *value, + u64 *enabled, u64 *running) { return -EINVAL; } diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 98c0f00c3f5e..68d866628be0 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -492,7 +492,7 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map, ee = ERR_PTR(-EOPNOTSUPP); event = perf_file->private_data; - if (perf_event_read_local(event, &value) == -EOPNOTSUPP) + if (perf_event_read_local(event, &value, NULL, NULL) == -EOPNOTSUPP) goto err_out; ee = bpf_event_entry_gen(perf_file, map_file); diff --git a/kernel/events/core.c b/kernel/events/core.c index 6bc21e202ae4..902149f05381 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3684,10 +3684,12 @@ static inline u64 perf_event_count(struct perf_event *event) * will not be local and we cannot read them atomically * - must not have a pmu::count method */ -int perf_event_read_local(struct perf_event *event, u64 *value) +int perf_event_read_local(struct perf_event *event, u64 *value, + u64 *enabled, u64 *running) { unsigned long flags; int ret = 0; + u64 now; /* * Disabling interrupts avoids all counter scheduling (context @@ -3718,13 +3720,21 @@ int perf_event_read_local(struct perf_event *event, u64 *value) goto out; } + now = event->shadow_ctx_time + perf_clock(); + if (enabled) + *enabled = now - event->tstamp_enabled; /* * If the event is currently on this CPU, its either a per-task event, * or local to this CPU. Furthermore it means its ACTIVE (otherwise * oncpu == -1). 
*/ - if (event->oncpu == smp_processor_id()) + if (event->oncpu == smp_processor_id()) { event->pmu->read(event); + if (running) + *running = now - event->tstamp_running; + } else if (running) { + *running = event->total_time_running; + } *value = local64_read(&event->count); out: @@ -8072,6 +8082,7 @@ static void bpf_overflow_handler(struct perf_event *event, struct bpf_perf_event_data_kern ctx = { .data = data, .regs = regs, + .event = event, }; int ret = 0; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index dc498b605d5d..95888ae6c263 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -275,7 +275,7 @@ BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags) if (!ee) return -ENOENT; - err = perf_event_read_local(ee->event, &value); + err = perf_event_read_local(ee->event, &value, NULL, NULL); /* * this api is ugly since we miss [-22..-2] range of valid * counter values, but that's uapi -- cgit v1.2.3 From 908432ca84fc229e906ba164219e9ad0fe56f755 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 5 Oct 2017 09:19:20 -0700 Subject: bpf: add helper bpf_perf_event_read_value for perf event array map Hardware pmu counters are limited resources. When there are more pmu based perf events opened than available counters, kernel will multiplex these events so each event gets certain percentage (but not 100%) of the pmu time. When multiplexing happens, the number of samples or the counter value will not reflect the true count compared to the no-multiplexing case. This makes comparison between different runs difficult. Typically, the number of samples or counter value should be normalized before comparing to other experiments. The typical normalization is done like: normalized_num_samples = num_samples * time_enabled / time_running normalized_counter_value = counter_value * time_enabled / time_running where time_enabled is the time enabled for event and time_running is the time running for event since last normalization. This patch adds helper bpf_perf_event_read_value for kprobe based perf event array maps, to read the perf counter and the enabled/running time. The enabled/running time is accumulated since the perf event open. To derive the scaling factor between two bpf invocations, users can use cpu_id as the key (which is typical for the perf array usage model) to remember the previous value and do the calculation inside the bpf program. Signed-off-by: Yonghong Song Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller
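A usage sketch (not part of the patch): how a kprobe bpf program might apply the normalization above with the new helper. The PERF_EVENT_ARRAY map "counters" and the per-cpu array "prev_readings" are hypothetical names assumed to be declared elsewhere.

	struct bpf_perf_event_value val, *prev;
	u32 key = bpf_get_smp_processor_id();

	if (bpf_perf_event_read_value(&counters, BPF_F_CURRENT_CPU,
				      &val, sizeof(val)))
		return 0;
	prev = bpf_map_lookup_elem(&prev_readings, &key);
	if (prev && val.running > prev->running) {
		/* undo multiplexing: scale the raw counter delta by the
		 * enabled/running time accumulated since the previous
		 * invocation (illustrative only, assumes no overflow)
		 */
		u64 counter = val.counter - prev->counter;
		u64 enabled = val.enabled - prev->enabled;
		u64 running = val.running - prev->running;
		u64 normalized = counter * enabled / running;
		/* ... report or store the normalized value ... */
	}
	bpf_map_update_elem(&prev_readings, &key, &val, BPF_ANY);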
--- include/uapi/linux/bpf.h | 21 +++++++++++++++++++-- kernel/bpf/verifier.c | 4 +++- kernel/trace/bpf_trace.c | 45 +++++++++++++++++++++++++++++++++++++++++---- 3 files changed, 63 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6082faf5fd2a..7b57a212c7d7 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -641,6 +641,14 @@ union bpf_attr { * @xdp_md: pointer to xdp_md * @delta: An positive/negative integer to be added to xdp_md.data_meta * Return: 0 on success or negative on error + * + * int bpf_perf_event_read_value(map, flags, buf, buf_size) + * read perf event counter value and perf event enabled/running time + * @map: pointer to perf_event_array map + * @flags: index of event in the map or bitmask flags + * @buf: buf to fill + * @buf_size: size of the buf + * Return: 0 on success or negative error code */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -697,7 +705,8 @@ union bpf_attr { FN(redirect_map), \ FN(sk_redirect_map), \ FN(sock_map_update), \ - FN(xdp_adjust_meta), + FN(xdp_adjust_meta), \ + FN(perf_event_read_value), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -741,7 +750,9 @@ enum bpf_func_id { #define BPF_F_ZERO_CSUM_TX (1ULL << 1) #define BPF_F_DONT_FRAGMENT (1ULL << 2) -/* BPF_FUNC_perf_event_output and BPF_FUNC_perf_event_read flags. */ +/* BPF_FUNC_perf_event_output, BPF_FUNC_perf_event_read and + * BPF_FUNC_perf_event_read_value flags. + */ #define BPF_F_INDEX_MASK 0xffffffffULL #define BPF_F_CURRENT_CPU BPF_F_INDEX_MASK /* BPF_FUNC_perf_event_output for sk_buff input context. */ @@ -934,4 +945,10 @@ enum { #define TCP_BPF_IW 1001 /* Set TCP initial congestion window */ #define TCP_BPF_SNDCWND_CLAMP 1002 /* Set sndcwnd_clamp */ +struct bpf_perf_event_value { + __u64 counter; + __u64 enabled; + __u64 running; +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 52b022310f6a..590125e29161 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1552,7 +1552,8 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) break; case BPF_MAP_TYPE_PERF_EVENT_ARRAY: if (func_id != BPF_FUNC_perf_event_read && - func_id != BPF_FUNC_perf_event_output) + func_id != BPF_FUNC_perf_event_output && + func_id != BPF_FUNC_perf_event_read_value) goto error; break; case BPF_MAP_TYPE_STACK_TRACE: @@ -1595,6 +1596,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) break; case BPF_FUNC_perf_event_read: case BPF_FUNC_perf_event_output: + case BPF_FUNC_perf_event_read_value: if (map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY) goto error; break; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 95888ae6c263..0be86cc0130e 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -255,14 +255,14 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void) return &bpf_trace_printk_proto; } -BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags) +static __always_inline int +get_map_perf_counter(struct bpf_map *map, u64 flags, + u64 *value, u64 *enabled, u64 *running) { struct bpf_array *array = container_of(map, struct bpf_array, map); unsigned int cpu = smp_processor_id(); u64 index = flags & BPF_F_INDEX_MASK; struct bpf_event_entry *ee; - u64 value = 0; - int err; if (unlikely(flags & ~(BPF_F_INDEX_MASK))) return -EINVAL; @@ -275,7 +275,15 @@ 
BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags) if (!ee) return -ENOENT; - err = perf_event_read_local(ee->event, &value, NULL, NULL); + return perf_event_read_local(ee->event, value, enabled, running); +} + +BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags) +{ + u64 value = 0; + int err; + + err = get_map_perf_counter(map, flags, &value, NULL, NULL); /* * this api is ugly since we miss [-22..-2] range of valid * counter values, but that's uapi @@ -293,6 +301,33 @@ static const struct bpf_func_proto bpf_perf_event_read_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_4(bpf_perf_event_read_value, struct bpf_map *, map, u64, flags, + struct bpf_perf_event_value *, buf, u32, size) +{ + int err = -EINVAL; + + if (unlikely(size != sizeof(struct bpf_perf_event_value))) + goto clear; + err = get_map_perf_counter(map, flags, &buf->counter, &buf->enabled, + &buf->running); + if (unlikely(err)) + goto clear; + return 0; +clear: + memset(buf, 0, size); + return err; +} + +static const struct bpf_func_proto bpf_perf_event_read_value_proto = { + .func = bpf_perf_event_read_value, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_UNINIT_MEM, + .arg4_type = ARG_CONST_SIZE, +}; + static DEFINE_PER_CPU(struct perf_sample_data, bpf_sd); static __always_inline u64 @@ -499,6 +534,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func return &bpf_perf_event_output_proto; case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto; + case BPF_FUNC_perf_event_read_value: + return &bpf_perf_event_read_value_proto; default: return tracing_func_proto(func_id); } -- cgit v1.2.3 From 4bebdc7a85aa400c0222b5329861e4ad9252f1e5 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 5 Oct 2017 09:19:22 -0700 Subject: bpf: add helper bpf_perf_prog_read_value This patch adds helper bpf_perf_prog_read_value for perf event based bpf programs, to read the event counter and the enabled/running time. The enabled/running time is accumulated since the perf event open. The typical use case for a perf event based bpf program is to attach itself to a single event. In such cases, if it is desirable to get the scaling factor between two bpf invocations, users can save the time values in a map, and use the value from the map and the current value to calculate the scaling factor. Signed-off-by: Yonghong Song Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller
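A companion sketch (again not part of the patch) of a perf-event-attached program calling the new helper; SEC() is the usual section annotation from the bpf helper headers, and the program body is illustrative only:

	SEC("perf_event")
	int on_perf_event(struct bpf_perf_event_data *ctx)
	{
		struct bpf_perf_event_value val;

		if (bpf_perf_prog_read_value(ctx, &val, sizeof(val)))
			return 0;
		/* val.counter, val.enabled and val.running can be saved
		 * in a map and compared with the readings of a later
		 * invocation to compute the scaling factor
		 */
		return 0;
	}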
--- include/uapi/linux/bpf.h | 10 +++++++++- kernel/trace/bpf_trace.c | 28 ++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7b57a212c7d7..5bbbec17aa5a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -649,6 +649,13 @@ union bpf_attr { * @buf: buf to fill * @buf_size: size of the buf * Return: 0 on success or negative error code + * + * int bpf_perf_prog_read_value(ctx, buf, buf_size) + * read perf prog attached perf event counter and enabled/running time + * @ctx: pointer to ctx + * @buf: buf to fill + * @buf_size: size of the buf + * Return: 0 on success or negative error code */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -706,7 +713,8 @@ union bpf_attr { FN(sk_redirect_map), \ FN(sock_map_update), \ FN(xdp_adjust_meta), \ - FN(perf_event_read_value), + FN(perf_event_read_value), \ + FN(perf_prog_read_value), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 0be86cc0130e..04ea5314f2bc 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -613,6 +613,32 @@ static const struct bpf_func_proto bpf_get_stackid_proto_tp = { .arg3_type = ARG_ANYTHING, }; +BPF_CALL_3(bpf_perf_prog_read_value_tp, struct bpf_perf_event_data_kern *, ctx, + struct bpf_perf_event_value *, buf, u32, size) +{ + int err = -EINVAL; + + if (unlikely(size != sizeof(struct bpf_perf_event_value))) + goto clear; + err = perf_event_read_local(ctx->event, &buf->counter, &buf->enabled, + &buf->running); + if (unlikely(err)) + goto clear; + return 0; +clear: + memset(buf, 0, size); + return err; +} + +static const struct bpf_func_proto bpf_perf_prog_read_value_proto_tp = { + .func = bpf_perf_prog_read_value_tp, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE, +}; + static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id) { switch (func_id) { @@ -620,6 +646,8 @@ static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id) return &bpf_perf_event_output_proto_tp; case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto_tp; + case BPF_FUNC_perf_prog_read_value: + return &bpf_perf_prog_read_value_proto_tp; default: return tracing_func_proto(func_id); } -- cgit v1.2.3 From 64237470ddf97b63155fbd272c9e743e01d5f514 Mon Sep 17 00:00:00 2001 From: Lin Zhang Date: Fri, 6 Oct 2017 01:37:29 +0800 Subject: net: phonet: mark header_ops as const Signed-off-by: Lin Zhang Signed-off-by: David S.
Miller --- include/linux/if_phonet.h | 2 +- net/phonet/af_phonet.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/if_phonet.h b/include/linux/if_phonet.h index bbcdb0a767d8..a118ee4a8428 100644 --- a/include/linux/if_phonet.h +++ b/include/linux/if_phonet.h @@ -10,5 +10,5 @@ #include -extern struct header_ops phonet_header_ops; +extern const struct header_ops phonet_header_ops; #endif diff --git a/net/phonet/af_phonet.c b/net/phonet/af_phonet.c index f925753668a7..b12142e55d19 100644 --- a/net/phonet/af_phonet.c +++ b/net/phonet/af_phonet.c @@ -149,7 +149,7 @@ static int pn_header_parse(const struct sk_buff *skb, unsigned char *haddr) return 1; } -struct header_ops phonet_header_ops = { +const struct header_ops phonet_header_ops = { .create = pn_header_create, .parse = pn_header_parse, }; -- cgit v1.2.3 From 548ec114705bb8f0879a0da12abec17f17a7cc26 Mon Sep 17 00:00:00 2001 From: Lin Zhang Date: Fri, 6 Oct 2017 01:40:35 +0800 Subject: net: phonet: mark phonet_protocol as const The phonet_protocol structs don't need to be written by anyone and so can be marked as const. Signed-off-by: Lin Zhang Signed-off-by: David S. Miller --- include/net/phonet/phonet.h | 6 ++++-- net/phonet/af_phonet.c | 15 ++++++++------- net/phonet/datagram.c | 2 +- net/phonet/pep.c | 2 +- 4 files changed, 14 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/net/phonet/phonet.h b/include/net/phonet/phonet.h index 039cc29cb4a8..51e1a2a45d02 100644 --- a/include/net/phonet/phonet.h +++ b/include/net/phonet/phonet.h @@ -108,8 +108,10 @@ struct phonet_protocol { int sock_type; }; -int phonet_proto_register(unsigned int protocol, struct phonet_protocol *pp); -void phonet_proto_unregister(unsigned int protocol, struct phonet_protocol *pp); +int phonet_proto_register(unsigned int protocol, + const struct phonet_protocol *pp); +void phonet_proto_unregister(unsigned int protocol, + const struct phonet_protocol *pp); int phonet_sysctl_init(void); void phonet_sysctl_exit(void); diff --git a/net/phonet/af_phonet.c b/net/phonet/af_phonet.c index b12142e55d19..3b0ef691f5b1 100644 --- a/net/phonet/af_phonet.c +++ b/net/phonet/af_phonet.c @@ -35,11 +35,11 @@ #include /* Transport protocol registration */ -static struct phonet_protocol *proto_tab[PHONET_NPROTO] __read_mostly; +static const struct phonet_protocol *proto_tab[PHONET_NPROTO] __read_mostly; -static struct phonet_protocol *phonet_proto_get(unsigned int protocol) +static const struct phonet_protocol *phonet_proto_get(unsigned int protocol) { - struct phonet_protocol *pp; + const struct phonet_protocol *pp; if (protocol >= PHONET_NPROTO) return NULL; @@ -53,7 +53,7 @@ static struct phonet_protocol *phonet_proto_get(unsigned int protocol) return pp; } -static inline void phonet_proto_put(struct phonet_protocol *pp) +static inline void phonet_proto_put(const struct phonet_protocol *pp) { module_put(pp->prot->owner); } @@ -65,7 +65,7 @@ static int pn_socket_create(struct net *net, struct socket *sock, int protocol, { struct sock *sk; struct pn_sock *pn; - struct phonet_protocol *pnp; + const struct phonet_protocol *pnp; int err; if (!capable(CAP_SYS_ADMIN)) @@ -470,7 +470,7 @@ static struct packet_type phonet_packet_type __read_mostly = { static DEFINE_MUTEX(proto_tab_lock); int __init_or_module phonet_proto_register(unsigned int protocol, - struct phonet_protocol *pp) + const struct phonet_protocol *pp) { int err = 0; @@ -492,7 +492,8 @@ int __init_or_module phonet_proto_register(unsigned int 
protocol, } EXPORT_SYMBOL(phonet_proto_register); -void phonet_proto_unregister(unsigned int protocol, struct phonet_protocol *pp) +void phonet_proto_unregister(unsigned int protocol, + const struct phonet_protocol *pp) { mutex_lock(&proto_tab_lock); BUG_ON(proto_tab[protocol] != pp); diff --git a/net/phonet/datagram.c b/net/phonet/datagram.c index 5e710435ffa9..b44fb9018fb8 100644 --- a/net/phonet/datagram.c +++ b/net/phonet/datagram.c @@ -195,7 +195,7 @@ static struct proto pn_proto = { .name = "PHONET", }; -static struct phonet_protocol pn_dgram_proto = { +static const struct phonet_protocol pn_dgram_proto = { .ops = &phonet_dgram_ops, .prot = &pn_proto, .sock_type = SOCK_DGRAM, diff --git a/net/phonet/pep.c b/net/phonet/pep.c index e81537991ddf..9fc76b19cd3c 100644 --- a/net/phonet/pep.c +++ b/net/phonet/pep.c @@ -1351,7 +1351,7 @@ static struct proto pep_proto = { .name = "PNPIPE", }; -static struct phonet_protocol pep_pn_proto = { +static const struct phonet_protocol pep_pn_proto = { .ops = &phonet_stream_ops, .prot = &pep_proto, .sock_type = SOCK_SEQPACKET, -- cgit v1.2.3 From 067cae47771c864604969fd902efe10916e0d79c Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 5 Oct 2017 21:52:12 -0700 Subject: bpf: Use char in prog and map name Instead of u8, use char for prog and map name. It can avoid the userspace tool getting a compiler signedness warning. The bpf_prog_aux, bpf_map, bpf_attr, bpf_prog_info and bpf_map_info are changed. Signed-off-by: Martin KaFai Lau Cc: Jakub Kicinski Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 4 ++-- include/uapi/linux/bpf.h | 8 ++++---- tools/include/uapi/linux/bpf.h | 8 ++++---- 3 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a67daea731ab..bc7da2ddfcaf 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -56,7 +56,7 @@ struct bpf_map { struct work_struct work; atomic_t usercnt; struct bpf_map *inner_map_meta; - u8 name[BPF_OBJ_NAME_LEN]; + char name[BPF_OBJ_NAME_LEN]; }; /* function argument constraints */ @@ -189,7 +189,7 @@ struct bpf_prog_aux { struct bpf_prog *prog; struct user_struct *user; u64 load_time; /* ns since boottime */ - u8 name[BPF_OBJ_NAME_LEN]; + char name[BPF_OBJ_NAME_LEN]; union { struct work_struct work; struct rcu_head rcu; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 5bbbec17aa5a..6db9e1d679cd 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -230,7 +230,7 @@ union bpf_attr { __u32 numa_node; /* numa node (effective only if * BPF_F_NUMA_NODE is set).
*/ - __u8 map_name[BPF_OBJ_NAME_LEN]; + char map_name[BPF_OBJ_NAME_LEN]; }; struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ @@ -253,7 +253,7 @@ union bpf_attr { __aligned_u64 log_buf; /* user supplied buffer */ __u32 kern_version; /* checked when prog_type=kprobe */ __u32 prog_flags; - __u8 prog_name[BPF_OBJ_NAME_LEN]; + char prog_name[BPF_OBJ_NAME_LEN]; }; struct { /* anonymous struct used by BPF_OBJ_* commands */ @@ -888,7 +888,7 @@ struct bpf_prog_info { __u32 created_by_uid; __u32 nr_map_ids; __aligned_u64 map_ids; - __u8 name[BPF_OBJ_NAME_LEN]; + char name[BPF_OBJ_NAME_LEN]; } __attribute__((aligned(8))); struct bpf_map_info { @@ -898,7 +898,7 @@ struct bpf_map_info { __u32 value_size; __u32 max_entries; __u32 map_flags; - __u8 name[BPF_OBJ_NAME_LEN]; + char name[BPF_OBJ_NAME_LEN]; } __attribute__((aligned(8))); /* User bpf_sock_ops struct to access socket values and specify request ops diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 0894fd20b12b..fb4fb81ce5b0 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -230,7 +230,7 @@ union bpf_attr { __u32 numa_node; /* numa node (effective only if * BPF_F_NUMA_NODE is set). */ - __u8 map_name[BPF_OBJ_NAME_LEN]; + char map_name[BPF_OBJ_NAME_LEN]; }; struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ @@ -253,7 +253,7 @@ union bpf_attr { __aligned_u64 log_buf; /* user supplied buffer */ __u32 kern_version; /* checked when prog_type=kprobe */ __u32 prog_flags; - __u8 prog_name[BPF_OBJ_NAME_LEN]; + char prog_name[BPF_OBJ_NAME_LEN]; }; struct { /* anonymous struct used by BPF_OBJ_* commands */ @@ -871,7 +871,7 @@ struct bpf_prog_info { __u32 created_by_uid; __u32 nr_map_ids; __aligned_u64 map_ids; - __u8 name[BPF_OBJ_NAME_LEN]; + char name[BPF_OBJ_NAME_LEN]; } __attribute__((aligned(8))); struct bpf_map_info { @@ -881,7 +881,7 @@ struct bpf_map_info { __u32 value_size; __u32 max_entries; __u32 map_flags; - __u8 name[BPF_OBJ_NAME_LEN]; + char name[BPF_OBJ_NAME_LEN]; } __attribute__((aligned(8))); /* User bpf_sock_ops struct to access socket values and specify request ops -- cgit v1.2.3 From 821f1b21cabb46827ce39ddf82e2789680b5042a Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Fri, 6 Oct 2017 22:12:37 -0700 Subject: bridge: add new BR_NEIGH_SUPPRESS port flag to suppress arp and nd flood This patch adds a new bridge port flag BR_NEIGH_SUPPRESS to suppress arp and nd flood on bridge ports. It implements rfc7432, section 10 (https://tools.ietf.org/html/rfc7432#section-10) for ethernet VPN deployments. It is similar to the existing BR_PROXYARP* flags but has a few semantic differences to conform to the EVPN standard. Unlike the existing flags, this new flag suppresses flooding of all neigh discovery packets (arp and nd) to tunnel ports. It supports both vlan filtering and non-vlan filtering bridges. In the case of EVPN, it is mainly used to avoid flooding of arp and nd packets to tunnel ports like vxlan. This patch adds netlink and sysfs support to set this bridge port flag. Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller
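A minimal sketch (editor's illustration, not part of the commit) of flipping the new per-port flag from userspace through the sysfs attribute this patch adds (see the br_sysfs_if.c hunk below); the bridge name "br0" and port name "vxlan0" are placeholders:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        /* each bridge port exposes .../brif/<port>/neigh_suppress, "0" or "1" */
        const char *attr = "/sys/class/net/br0/brif/vxlan0/neigh_suppress";
        int fd = open(attr, O_WRONLY);

        if (fd < 0) {
                perror(attr);
                return 1;
        }
        if (write(fd, "1", 1) != 1)
                perror("write");
        close(fd);
        return 0;
}

The same bit can be set over netlink with the IFLA_BRPORT_NEIGH_SUPPRESS attribute added below.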
--- include/linux/if_bridge.h | 1 + include/uapi/linux/if_link.h | 1 + net/bridge/Makefile | 2 +- net/bridge/br_arp_nd_proxy.c | 32 ++++++++++++++++++++++++++++++++ net/bridge/br_forward.c | 2 +- net/bridge/br_if.c | 5 +++++ net/bridge/br_netlink.c | 10 +++++++++- net/bridge/br_private.h | 2 ++ net/bridge/br_sysfs_if.c | 2 ++ 9 files changed, 54 insertions(+), 3 deletions(-) create mode 100644 net/bridge/br_arp_nd_proxy.c (limited to 'include') diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index 3cd18ac0697f..316ee113a220 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -49,6 +49,7 @@ struct br_ip_list { #define BR_MULTICAST_TO_UNICAST BIT(12) #define BR_VLAN_TUNNEL BIT(13) #define BR_BCAST_FLOOD BIT(14) +#define BR_NEIGH_SUPPRESS BIT(15) #define BR_DEFAULT_AGEING_TIME (300 * HZ) diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index cd580fc0e58f..b037e0ab1975 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -327,6 +327,7 @@ enum { IFLA_BRPORT_VLAN_TUNNEL, IFLA_BRPORT_BCAST_FLOOD, IFLA_BRPORT_GROUP_FWD_MASK, + IFLA_BRPORT_NEIGH_SUPPRESS, __IFLA_BRPORT_MAX }; #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) diff --git a/net/bridge/Makefile b/net/bridge/Makefile index 40b1ede527ca..4aee55fdcc92 100644 --- a/net/bridge/Makefile +++ b/net/bridge/Makefile @@ -7,7 +7,7 @@ obj-$(CONFIG_BRIDGE) += bridge.o bridge-y := br.o br_device.o br_fdb.o br_forward.o br_if.o br_input.o \ br_ioctl.o br_stp.o br_stp_bpdu.o \ br_stp_if.o br_stp_timer.o br_netlink.o \ - br_netlink_tunnel.o + br_netlink_tunnel.o br_arp_nd_proxy.o bridge-$(CONFIG_SYSFS) += br_sysfs_if.o br_sysfs_br.o diff --git a/net/bridge/br_arp_nd_proxy.c b/net/bridge/br_arp_nd_proxy.c new file mode 100644 index 000000000000..f889ad5f0048 --- /dev/null +++ b/net/bridge/br_arp_nd_proxy.c @@ -0,0 +1,32 @@ +/* + * Handle bridge arp/nd proxy/suppress + * + * Copyright (C) 2017 Cumulus Networks + * Copyright (c) 2017 Roopa Prabhu + * + * Authors: + * Roopa Prabhu + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version.
+ */ + +#include +#include "br_private.h" + +void br_recalculate_neigh_suppress_enabled(struct net_bridge *br) +{ + struct net_bridge_port *p; + bool neigh_suppress = false; + + list_for_each_entry(p, &br->port_list, list) { + if (p->flags & BR_NEIGH_SUPPRESS) { + neigh_suppress = true; + break; + } + } + + br->neigh_suppress_enabled = neigh_suppress; +} diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 48fb17417fac..b4eed113d2ec 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -204,7 +204,7 @@ void br_flood(struct net_bridge *br, struct sk_buff *skb, /* Do not flood to ports that enable proxy ARP */ if (p->flags & BR_PROXYARP) continue; - if ((p->flags & BR_PROXYARP_WIFI) && + if ((p->flags & (BR_PROXYARP_WIFI | BR_NEIGH_SUPPRESS)) && BR_INPUT_SKB_CB(skb)->proxyarp_replied) continue; diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 59a74a414e20..ae38547bbf91 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -310,6 +310,8 @@ void br_dev_delete(struct net_device *dev, struct list_head *head) del_nbp(p); } + br_recalculate_neigh_suppress_enabled(br); + br_fdb_delete_by_port(br, NULL, 0, 1); cancel_delayed_work_sync(&br->gc_work); @@ -660,4 +662,7 @@ void br_port_flags_change(struct net_bridge_port *p, unsigned long mask) if (mask & BR_AUTO_MASK) nbp_update_port_count(br); + + if (mask & BR_NEIGH_SUPPRESS) + br_recalculate_neigh_suppress_enabled(br); } diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index dea88a255d26..f0e82682e071 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -138,6 +138,7 @@ static inline size_t br_port_info_size(void) + nla_total_size(1) /* IFLA_BRPORT_PROXYARP */ + nla_total_size(1) /* IFLA_BRPORT_PROXYARP_WIFI */ + nla_total_size(1) /* IFLA_BRPORT_VLAN_TUNNEL */ + + nla_total_size(1) /* IFLA_BRPORT_NEIGH_SUPPRESS */ + nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_ROOT_ID */ + nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_BRIDGE_ID */ + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_DESIGNATED_PORT */ @@ -210,7 +211,9 @@ static int br_port_fill_attrs(struct sk_buff *skb, nla_put_u8(skb, IFLA_BRPORT_CONFIG_PENDING, p->config_pending) || nla_put_u8(skb, IFLA_BRPORT_VLAN_TUNNEL, !!(p->flags & BR_VLAN_TUNNEL)) || - nla_put_u16(skb, IFLA_BRPORT_GROUP_FWD_MASK, p->group_fwd_mask)) + nla_put_u16(skb, IFLA_BRPORT_GROUP_FWD_MASK, p->group_fwd_mask) || + nla_put_u8(skb, IFLA_BRPORT_NEIGH_SUPPRESS, + !!(p->flags & BR_NEIGH_SUPPRESS))) return -EMSGSIZE; timerval = br_timer_value(&p->message_age_timer); @@ -785,6 +788,11 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[]) p->group_fwd_mask = fwd_mask; } + err = br_set_port_flag(p, tb, IFLA_BRPORT_NEIGH_SUPPRESS, + BR_NEIGH_SUPPRESS); + if (err) + return err; + br_port_flags_change(p, old_flags ^ p->flags); return 0; } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index ab4df24f7bba..00fa371b1fb2 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -404,6 +404,7 @@ struct net_bridge { #ifdef CONFIG_NET_SWITCHDEV int offload_fwd_mark; #endif + bool neigh_suppress_enabled; }; struct br_input_skb_cb { @@ -1139,4 +1140,5 @@ static inline void br_switchdev_frame_unmark(struct sk_buff *skb) } #endif /* CONFIG_NET_SWITCHDEV */ +void br_recalculate_neigh_suppress_enabled(struct net_bridge *br); #endif diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c index 9110d5e56085..0a1fa9ccd8b7 100644 --- a/net/bridge/br_sysfs_if.c +++ b/net/bridge/br_sysfs_if.c 
@@ -191,6 +191,7 @@ BRPORT_ATTR_FLAG(proxyarp, BR_PROXYARP); BRPORT_ATTR_FLAG(proxyarp_wifi, BR_PROXYARP_WIFI); BRPORT_ATTR_FLAG(multicast_flood, BR_MCAST_FLOOD); BRPORT_ATTR_FLAG(broadcast_flood, BR_BCAST_FLOOD); +BRPORT_ATTR_FLAG(neigh_suppress, BR_NEIGH_SUPPRESS); #ifdef CONFIG_BRIDGE_IGMP_SNOOPING static ssize_t show_multicast_router(struct net_bridge_port *p, char *buf) @@ -241,6 +242,7 @@ static const struct brport_attribute *brport_attrs[] = { &brport_attr_multicast_flood, &brport_attr_broadcast_flood, &brport_attr_group_fwd_mask, + &brport_attr_neigh_suppress, NULL }; -- cgit v1.2.3 From 77041420751fe6d4acf2103b245dcc2b4b7b8360 Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Mon, 9 Oct 2017 11:15:31 +0200 Subject: net: bridge: Notify on bridge device mrouter state changes Add the SWITCHDEV_ATTR_ID_BRIDGE_MROUTER switchdev notification type, used to indicate whether the bridge is or isn't an mrouter. Notify when the bridge changes its state, similarly to the already existing bridged port mrouter notifications. The notification uses the switchdev_attr.u.mrouter boolean flag to indicate the current bridge mrouter status. Thus, it only indicates whether the bridge is currently used as an mrouter or not, and does not indicate the exact mrouter state of the bridge (learning, permanent, etc.). Signed-off-by: Yotam Gigi Signed-off-by: Jiri Pirko Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/net/switchdev.h | 1 + net/bridge/br_multicast.c | 38 +++++++++++++++++++++++++++++++++++--- 2 files changed, 36 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/switchdev.h b/include/net/switchdev.h index d767b7991887..d756fbe46625 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -51,6 +51,7 @@ enum switchdev_attr_id { SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME, SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING, SWITCHDEV_ATTR_ID_BRIDGE_MC_DISABLED, + SWITCHDEV_ATTR_ID_BRIDGE_MROUTER, }; struct switchdev_attr { diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 8dc5c8d69bcd..bd50550dd4ca 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -859,8 +859,32 @@ out: spin_unlock(&br->multicast_lock); } +static void br_mc_router_state_change(struct net_bridge *p, + bool is_mc_router) +{ + struct switchdev_attr attr = { + .orig_dev = p->dev, + .id = SWITCHDEV_ATTR_ID_BRIDGE_MROUTER, + .flags = SWITCHDEV_F_DEFER, + .u.mrouter = is_mc_router, + }; + + switchdev_port_attr_set(p->dev, &attr); +} + static void br_multicast_local_router_expired(unsigned long data) { + struct net_bridge *br = (struct net_bridge *)data; + + spin_lock(&br->multicast_lock); + if (br->multicast_router == MDB_RTR_TYPE_DISABLED || + br->multicast_router == MDB_RTR_TYPE_PERM || + timer_pending(&br->multicast_router_timer)) + goto out; + + br_mc_router_state_change(br, false); +out: + spin_unlock(&br->multicast_lock); } static void br_multicast_querier_expired(struct net_bridge *br, @@ -1364,9 +1388,12 @@ static void br_multicast_mark_router(struct net_bridge *br, unsigned long now = jiffies; if (!port) { - if (br->multicast_router == MDB_RTR_TYPE_TEMP_QUERY) + if (br->multicast_router == MDB_RTR_TYPE_TEMP_QUERY) { + if (!timer_pending(&br->multicast_router_timer)) + br_mc_router_state_change(br, true); mod_timer(&br->multicast_router_timer, now + br->multicast_querier_interval); + } return; } @@ -1952,7 +1979,7 @@ void br_multicast_init(struct net_bridge *br) spin_lock_init(&br->multicast_lock); setup_timer(&br->multicast_router_timer,
- br_multicast_local_router_expired, 0); + br_multicast_local_router_expired, (unsigned long)br); setup_timer(&br->ip4_other_query.timer, br_ip4_multicast_querier_expired, (unsigned long)br); setup_timer(&br->ip4_own_query.timer, br_ip4_multicast_query_expired, @@ -2042,9 +2069,14 @@ int br_multicast_set_router(struct net_bridge *br, unsigned long val) switch (val) { case MDB_RTR_TYPE_DISABLED: case MDB_RTR_TYPE_PERM: + br_mc_router_state_change(br, val == MDB_RTR_TYPE_PERM); del_timer(&br->multicast_router_timer); - /* fall through */ + br->multicast_router = val; + err = 0; + break; case MDB_RTR_TYPE_TEMP_QUERY: + if (br->multicast_router != MDB_RTR_TYPE_TEMP_QUERY) + br_mc_router_state_change(br, false); br->multicast_router = val; err = 0; break; -- cgit v1.2.3 From 0912bda436388a02c72164b4b490b578e64c012e Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Mon, 9 Oct 2017 11:15:32 +0200 Subject: net: bridge: Export bridge multicast router state Add an access function that, given a bridge netdevice, returns whether the bridge device is currently an mrouter or not. The function uses the already existing br_multicast_is_router function to check that. This function is needed in order to allow ports that join an already existing bridge to know the current mrouter state of the bridge device. Together with the bridge device mrouter ports switchdev notifications, it is possible to have full offloading of the semantics of the bridge device mcast router state. Due to the fact that the bridge multicast router status can change in packet RX path, take the multicast_router bridge spinlock to protect the read. Signed-off-by: Yotam Gigi Reviewed-by: Nogah Frankel Reviewed-by: Nikolay Aleksandrov Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/if_bridge.h | 5 +++++ net/bridge/br_multicast.c | 12 ++++++++++++ 2 files changed, 17 insertions(+) (limited to 'include') diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index 316ee113a220..02639ebea2f0 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -64,6 +64,7 @@ int br_multicast_list_adjacent(struct net_device *dev, bool br_multicast_has_querier_anywhere(struct net_device *dev, int proto); bool br_multicast_has_querier_adjacent(struct net_device *dev, int proto); bool br_multicast_enabled(const struct net_device *dev); +bool br_multicast_router(const struct net_device *dev); #else static inline int br_multicast_list_adjacent(struct net_device *dev, struct list_head *br_ip_list) @@ -84,6 +85,10 @@ static inline bool br_multicast_enabled(const struct net_device *dev) { return false; } +static inline bool br_multicast_router(const struct net_device *dev) +{ + return false; +} #endif #if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_BRIDGE_VLAN_FILTERING) diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index bd50550dd4ca..7947e0436e18 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -2216,6 +2216,18 @@ bool br_multicast_enabled(const struct net_device *dev) } EXPORT_SYMBOL_GPL(br_multicast_enabled); +bool br_multicast_router(const struct net_device *dev) +{ + struct net_bridge *br = netdev_priv(dev); + bool is_router; + + spin_lock_bh(&br->multicast_lock); + is_router = br_multicast_is_router(br); + spin_unlock_bh(&br->multicast_lock); + return is_router; +} +EXPORT_SYMBOL_GPL(br_multicast_router); + int br_multicast_set_querier(struct net_bridge *br, unsigned long val) { unsigned long max_delay; -- cgit v1.2.3 From 
ed468ebee04ffba0231a8f50616bdb250752a891 Mon Sep 17 00:00:00 2001 From: Michal Kalderon Date: Mon, 9 Oct 2017 12:37:44 +0300 Subject: qed: Add ll2 ability of opening a secondary queue When more than one ll2 queue (that is not an OOO queue) is opened, the ll2 code does not have enough information to determine whether a given queue is the main one or not, so a new field is added to the acquire input data, exposing control over whether the queue is the main queue or a secondary queue. Signed-off-by: Michal Kalderon Signed-off-by: Ariel Elior Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_ll2.c | 7 ++++++- drivers/net/ethernet/qlogic/qed/qed_ll2.h | 1 + include/linux/qed/qed_ll2_if.h | 1 + 3 files changed, 8 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/net/ethernet/qlogic/qed/qed_ll2.c b/drivers/net/ethernet/qlogic/qed/qed_ll2.c index 75af40a7690a..3c695da890df 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_ll2.c +++ b/drivers/net/ethernet/qlogic/qed/qed_ll2.c @@ -894,7 +894,7 @@ static int qed_sp_ll2_rx_queue_start(struct qed_hwfn *p_hwfn, p_ramrod->drop_ttl0_flg = p_ll2_conn->input.rx_drop_ttl0_flg; p_ramrod->inner_vlan_removal_en = p_ll2_conn->input.rx_vlan_removal_en; p_ramrod->queue_id = p_ll2_conn->queue_id; - p_ramrod->main_func_queue = (conn_type == QED_LL2_TYPE_OOO) ? 0 : 1; + p_ramrod->main_func_queue = p_ll2_conn->main_func_queue ? 1 : 0; if ((IS_MF_DEFAULT(p_hwfn) || IS_MF_SI(p_hwfn)) && p_ramrod->main_func_queue && (conn_type != QED_LL2_TYPE_ROCE) && @@ -1265,6 +1265,11 @@ int qed_ll2_acquire_connection(void *cxt, struct qed_ll2_acquire_data *data) p_ll2_info->tx_dest = (data->input.tx_dest == QED_LL2_TX_DEST_NW) ? CORE_TX_DEST_NW : CORE_TX_DEST_LB; + if (data->input.conn_type == QED_LL2_TYPE_OOO || + data->input.secondary_queue) + p_ll2_info->main_func_queue = false; + else + p_ll2_info->main_func_queue = true; /* Correct maximum number of Tx BDs */ p_tx_max = &p_ll2_info->input.tx_max_bds_per_packet; diff --git a/drivers/net/ethernet/qlogic/qed/qed_ll2.h b/drivers/net/ethernet/qlogic/qed/qed_ll2.h index 9bdd08f15c79..f65817012e97 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_ll2.h +++ b/drivers/net/ethernet/qlogic/qed/qed_ll2.h @@ -121,6 +121,7 @@ struct qed_ll2_info { bool b_active; enum core_tx_dest tx_dest; u8 tx_stats_en; + bool main_func_queue; struct qed_ll2_rx_queue rx_queue; struct qed_ll2_tx_queue tx_queue; struct qed_ll2_cbs cbs; diff --git a/include/linux/qed/qed_ll2_if.h b/include/linux/qed/qed_ll2_if.h index 89fa0bbd54f3..d7cca590b743 100644 --- a/include/linux/qed/qed_ll2_if.h +++ b/include/linux/qed/qed_ll2_if.h @@ -171,6 +171,7 @@ struct qed_ll2_acquire_data_inputs { enum qed_ll2_tx_dest tx_dest; enum qed_ll2_error_handle ai_err_packet_too_big; enum qed_ll2_error_handle ai_err_no_buf; + bool secondary_queue; u8 gsi_enable; }; -- cgit v1.2.3 From 77caa792f5d8e4ecc88eb1cf4b9c478c07e0ec57 Mon Sep 17 00:00:00 2001 From: Michal Kalderon Date: Mon, 9 Oct 2017 12:37:45 +0300 Subject: qed: Add ll2 option for dropping a tx packet The option of sending a packet on the ll2 and dropping it exists in hardware but was not used until now, and thus was not exposed. The iWARP unaligned MPA flow requires this functionality for flushing the tx queue. Signed-off-by: Michal Kalderon Signed-off-by: Ariel Elior Signed-off-by: David S. Miller
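A rough sketch (editor's illustration, not part of the commit) of a caller flushing one DMA-mapped buffer by steering it to the destination added below; "cxt", "handle", "addr" and "len" are assumed to come from the caller's existing ll2 setup, and the prototype of qed_ll2_prepare_tx_packet() lives in the driver-private qed_ll2.h:

#include "qed_ll2.h"      /* driver-private header (assumption) */

/* send nothing on the wire: hardware consumes and drops the buffer */
static int my_ll2_drop_one(void *cxt, u8 handle, dma_addr_t addr, u16 len)
{
        struct qed_ll2_tx_pkt_info pkt;

        memset(&pkt, 0, sizeof(pkt));
        pkt.num_of_bds = 1;
        pkt.first_frag = addr;
        pkt.first_frag_len = len;
        pkt.tx_dest = QED_LL2_TX_DEST_DROP;     /* new destination type */

        return qed_ll2_prepare_tx_packet(cxt, handle, &pkt, true);
}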
--- drivers/net/ethernet/qlogic/qed/qed_ll2.c | 16 ++++++++++++++-- include/linux/qed/qed_ll2_if.h | 1 + 2 files changed, 15 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/qlogic/qed/qed_ll2.c b/drivers/net/ethernet/qlogic/qed/qed_ll2.c index 3c695da890df..ad67d36956e8 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_ll2.c +++ b/drivers/net/ethernet/qlogic/qed/qed_ll2.c @@ -1597,8 +1597,20 @@ qed_ll2_prepare_tx_packet_set_bd(struct qed_hwfn *p_hwfn, roce_flavor = (pkt->qed_roce_flavor == QED_LL2_ROCE) ? CORE_ROCE : CORE_RROCE; - tx_dest = (pkt->tx_dest == QED_LL2_TX_DEST_NW) ? CORE_TX_DEST_NW - : CORE_TX_DEST_LB; + switch (pkt->tx_dest) { + case QED_LL2_TX_DEST_NW: + tx_dest = CORE_TX_DEST_NW; + break; + case QED_LL2_TX_DEST_LB: + tx_dest = CORE_TX_DEST_LB; + break; + case QED_LL2_TX_DEST_DROP: + tx_dest = CORE_TX_DEST_DROP; + break; + default: + tx_dest = CORE_TX_DEST_LB; + break; + } start_bd = (struct core_tx_bd *)qed_chain_produce(p_tx_chain); start_bd->nw_vlan_or_lb_echo = cpu_to_le16(pkt->vlan); diff --git a/include/linux/qed/qed_ll2_if.h b/include/linux/qed/qed_ll2_if.h index d7cca590b743..95fdf02a3bbe 100644 --- a/include/linux/qed/qed_ll2_if.h +++ b/include/linux/qed/qed_ll2_if.h @@ -64,6 +64,7 @@ enum qed_ll2_roce_flavor_type { enum qed_ll2_tx_dest { QED_LL2_TX_DEST_NW, /* Light L2 TX Destination to the Network */ QED_LL2_TX_DEST_LB, /* Light L2 TX Destination to the Loopback */ + QED_LL2_TX_DEST_DROP, /* Light L2 Drop the TX packet */ QED_LL2_TX_DEST_MAX }; -- cgit v1.2.3 From 6f34a284f36399501fcc034dc4522a2d8d9fa6c9 Mon Sep 17 00:00:00 2001 From: Michal Kalderon Date: Mon, 9 Oct 2017 12:37:48 +0300 Subject: qed: Add LL2 slowpath handling For the iWARP unaligned MPA flow, a slowpath event for flushing an MPA connection that has entered an unaligned state is required. The flush ramrod is received on the ll2 queue, and a pre-registered callback function is called to handle the flush event. Signed-off-by: Michal Kalderon Signed-off-by: Ariel Elior Signed-off-by: David S. Miller
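A hedged sketch (editor's illustration, not part of the commit) of what pre-registering the callback might look like; the rx/tx handlers are assumed to already exist in the caller, and the filled-in block is handed over at acquire time through the same qed_ll2_cbs plumbing the diff below extends:

/* invoked when a CORE_RAMROD_RX_QUEUE_FLUSH completion arrives */
static void my_ll2_flush_done(void *cxt, u8 connection_handle,
                              u32 opaque_data_0, u32 opaque_data_1)
{
        /* the opaque data identifies which connection was flushed */
}

static const struct qed_ll2_cbs my_cbs = {
        .rx_comp_cb = my_rx_comp,            /* assumed existing handlers */
        .rx_release_cb = my_rx_release,
        .tx_comp_cb = my_tx_comp,
        .tx_release_cb = my_tx_release,
        .slowpath_cb = my_ll2_flush_done,    /* new in this patch */
};

Connections that never expect a flush ramrod can leave slowpath_cb NULL, in which case the handler below logs and returns -EINVAL as before.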
--- drivers/net/ethernet/qlogic/qed/qed_ll2.c | 40 +++++++++++++++++++++++++++++-- include/linux/qed/qed_ll2_if.h | 5 ++++ 2 files changed, 43 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/qlogic/qed/qed_ll2.c b/drivers/net/ethernet/qlogic/qed/qed_ll2.c index 8eb9645c880d..047f556ca62e 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_ll2.c +++ b/drivers/net/ethernet/qlogic/qed/qed_ll2.c @@ -422,6 +422,41 @@ static void qed_ll2_rxq_parse_reg(struct qed_hwfn *p_hwfn, data->u.placement_offset = p_cqe->rx_cqe_fp.placement_offset; } +static int +qed_ll2_handle_slowpath(struct qed_hwfn *p_hwfn, + struct qed_ll2_info *p_ll2_conn, + union core_rx_cqe_union *p_cqe, + unsigned long *p_lock_flags) +{ + struct qed_ll2_rx_queue *p_rx = &p_ll2_conn->rx_queue; + struct core_rx_slow_path_cqe *sp_cqe; + + sp_cqe = &p_cqe->rx_cqe_sp; + if (sp_cqe->ramrod_cmd_id != CORE_RAMROD_RX_QUEUE_FLUSH) { + DP_NOTICE(p_hwfn, + "LL2 - unexpected Rx CQE slowpath ramrod_cmd_id:%d\n", + sp_cqe->ramrod_cmd_id); + return -EINVAL; + } + + if (!p_ll2_conn->cbs.slowpath_cb) { + DP_NOTICE(p_hwfn, + "LL2 - received RX_QUEUE_FLUSH but no callback was provided\n"); + return -EINVAL; + } + + spin_unlock_irqrestore(&p_rx->lock, *p_lock_flags); + + p_ll2_conn->cbs.slowpath_cb(p_ll2_conn->cbs.cookie, + p_ll2_conn->my_id, + le32_to_cpu(sp_cqe->opaque_data.data[0]), + le32_to_cpu(sp_cqe->opaque_data.data[1])); + + spin_lock_irqsave(&p_rx->lock, *p_lock_flags); + + return 0; +} + static int qed_ll2_rxq_handle_completion(struct qed_hwfn *p_hwfn, struct qed_ll2_info *p_ll2_conn, @@ -495,8 +530,8 @@ static int qed_ll2_rxq_completion(struct qed_hwfn *p_hwfn, void *cookie) switch (cqe->rx_cqe_sp.type) { case CORE_RX_CQE_TYPE_SLOW_PATH: - DP_NOTICE(p_hwfn, "LL2 - unexpected Rx CQE slowpath\n"); - rc = -EINVAL; + rc = qed_ll2_handle_slowpath(p_hwfn, p_ll2_conn, + cqe, &flags); break; case CORE_RX_CQE_TYPE_GSI_OFFLOAD: case CORE_RX_CQE_TYPE_REGULAR: @@ -1214,6 +1249,7 @@ qed_ll2_set_cbs(struct qed_ll2_info *p_ll2_info, const struct qed_ll2_cbs *cbs) p_ll2_info->cbs.rx_release_cb = cbs->rx_release_cb; p_ll2_info->cbs.tx_comp_cb = cbs->tx_comp_cb; p_ll2_info->cbs.tx_release_cb = cbs->tx_release_cb; + p_ll2_info->cbs.slowpath_cb = cbs->slowpath_cb; p_ll2_info->cbs.cookie = cbs->cookie; return 0; diff --git a/include/linux/qed/qed_ll2_if.h b/include/linux/qed/qed_ll2_if.h index 95fdf02a3bbe..e755954d85fd 100644 --- a/include/linux/qed/qed_ll2_if.h +++ b/include/linux/qed/qed_ll2_if.h @@ -151,11 +151,16 @@ void (*qed_ll2_release_tx_packet_cb)(void *cxt, dma_addr_t first_frag_addr, bool b_last_fragment, bool b_last_packet); +typedef +void (*qed_ll2_slowpath_cb)(void *cxt, u8 connection_handle, + u32 opaque_data_0, u32 opaque_data_1); + struct qed_ll2_cbs { qed_ll2_complete_rx_packet_cb rx_comp_cb; qed_ll2_release_rx_packet_cb rx_release_cb; qed_ll2_complete_tx_packet_cb tx_comp_cb; qed_ll2_release_tx_packet_cb tx_release_cb; + qed_ll2_slowpath_cb slowpath_cb; void *cookie; }; -- cgit v1.2.3 From cf4c950b87ee2f547ad3abd3aca6ae3f3eb3443f Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 9 Oct 2017 14:30:52 -0700 Subject: once: switch to new jump label API Switch the DO_ONCE() macro from the deprecated jump label API to the new one. The new one is more readable, and for DO_ONCE() it also makes the generated code more icache-friendly: now the one-time initialization code is placed out-of-line at the jump target, rather than at the inline fallthrough case.
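For reference, a typical DO_ONCE() caller looks roughly like the sketch below (editor's illustration, not from the commit; it is essentially what get_random_once() in net code expands to). With this patch, init_secret() is the code that now lands out of line behind the static branch:

#include <linux/once.h>
#include <linux/random.h>
#include <linux/types.h>

static u32 secret;

static void init_secret(void)
{
        get_random_bytes(&secret, sizeof(secret));
}

static u32 hash_with_secret(u32 x)
{
        /* init_secret() runs at most once; after the static-branch flip,
         * subsequent calls fall straight through on the fast path */
        DO_ONCE(init_secret);
        return x ^ secret;
}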
Acked-by: Hannes Frederic Sowa Signed-off-by: Eric Biggers Signed-off-by: David S. Miller --- include/linux/once.h | 6 +++--- lib/once.c | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/once.h b/include/linux/once.h index 9c98aaa87cbc..724724918e8b 100644 --- a/include/linux/once.h +++ b/include/linux/once.h @@ -5,7 +5,7 @@ #include bool __do_once_start(bool *done, unsigned long *flags); -void __do_once_done(bool *done, struct static_key *once_key, +void __do_once_done(bool *done, struct static_key_true *once_key, unsigned long *flags); /* Call a function exactly once. The idea of DO_ONCE() is to perform @@ -38,8 +38,8 @@ void __do_once_done(bool *done, struct static_key *once_key, ({ \ bool ___ret = false; \ static bool ___done = false; \ - static struct static_key ___once_key = STATIC_KEY_INIT_TRUE; \ - if (static_key_true(&___once_key)) { \ + static DEFINE_STATIC_KEY_TRUE(___once_key); \ + if (static_branch_unlikely(&___once_key)) { \ unsigned long ___flags; \ ___ret = __do_once_start(&___done, &___flags); \ if (unlikely(___ret)) { \ diff --git a/lib/once.c b/lib/once.c index 05c8604627eb..831c5a6b0bb2 100644 --- a/lib/once.c +++ b/lib/once.c @@ -5,7 +5,7 @@ struct once_work { struct work_struct work; - struct static_key *key; + struct static_key_true *key; }; static void once_deferred(struct work_struct *w) @@ -14,11 +14,11 @@ static void once_deferred(struct work_struct *w) work = container_of(w, struct once_work, work); BUG_ON(!static_key_enabled(work->key)); - static_key_slow_dec(work->key); + static_branch_disable(work->key); kfree(work); } -static void once_disable_jump(struct static_key *key) +static void once_disable_jump(struct static_key_true *key) { struct once_work *w; @@ -51,7 +51,7 @@ bool __do_once_start(bool *done, unsigned long *flags) } EXPORT_SYMBOL(__do_once_start); -void __do_once_done(bool *done, struct static_key *once_key, +void __do_once_done(bool *done, struct static_key_true *once_key, unsigned long *flags) __releases(once_lock) { -- cgit v1.2.3 From ceaa001a170e43608854d5290a48064f57b565ed Mon Sep 17 00:00:00 2001 From: William Tu Date: Wed, 4 Oct 2017 17:03:12 -0700 Subject: openvswitch: Add erspan tunnel support. Add erspan netlink interface for OVS. Signed-off-by: William Tu Cc: Pravin B Shelar Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 1 + net/openvswitch/flow_netlink.c | 51 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 51 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 156ee4cab82e..efdbfbfd3ee2 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -359,6 +359,7 @@ enum ovs_tunnel_key_attr { OVS_TUNNEL_KEY_ATTR_IPV6_SRC, /* struct in6_addr src IPv6 address. */ OVS_TUNNEL_KEY_ATTR_IPV6_DST, /* struct in6_addr dst IPv6 address. */ OVS_TUNNEL_KEY_ATTR_PAD, + OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS, /* be32 ERSPAN index. */ __OVS_TUNNEL_KEY_ATTR_MAX }; diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index e8eb427ce6d1..fc0ca9a89b8e 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -48,6 +48,7 @@ #include #include #include +#include #include "flow_netlink.h" @@ -319,7 +320,8 @@ size_t ovs_tun_key_attr_size(void) * OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS and covered by it. 
*/ + nla_total_size(2) /* OVS_TUNNEL_KEY_ATTR_TP_SRC */ - + nla_total_size(2); /* OVS_TUNNEL_KEY_ATTR_TP_DST */ + + nla_total_size(2) /* OVS_TUNNEL_KEY_ATTR_TP_DST */ + + nla_total_size(4); /* OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS */ } size_t ovs_key_attr_size(void) @@ -371,6 +373,7 @@ static const struct ovs_len_tbl ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1] .next = ovs_vxlan_ext_key_lens }, [OVS_TUNNEL_KEY_ATTR_IPV6_SRC] = { .len = sizeof(struct in6_addr) }, [OVS_TUNNEL_KEY_ATTR_IPV6_DST] = { .len = sizeof(struct in6_addr) }, + [OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS] = { .len = sizeof(u32) }, }; /* The size of the argument for each %OVS_KEY_ATTR_* Netlink attribute. */ @@ -593,6 +596,33 @@ static int vxlan_tun_opt_from_nlattr(const struct nlattr *attr, return 0; } +static int erspan_tun_opt_from_nlattr(const struct nlattr *attr, + struct sw_flow_match *match, bool is_mask, + bool log) +{ + unsigned long opt_key_offset; + struct erspan_metadata opts; + + BUILD_BUG_ON(sizeof(opts) > sizeof(match->key->tun_opts)); + + memset(&opts, 0, sizeof(opts)); + opts.index = nla_get_be32(attr); + + /* Index has only 20-bit */ + if (ntohl(opts.index) & ~INDEX_MASK) { + OVS_NLERR(log, "ERSPAN index number %x too large.", + ntohl(opts.index)); + return -EINVAL; + } + + SW_FLOW_KEY_PUT(match, tun_opts_len, sizeof(opts), is_mask); + opt_key_offset = TUN_METADATA_OFFSET(sizeof(opts)); + SW_FLOW_KEY_MEMCPY_OFFSET(match, opt_key_offset, &opts, sizeof(opts), + is_mask); + + return 0; +} + static int ip_tun_from_nlattr(const struct nlattr *attr, struct sw_flow_match *match, bool is_mask, bool log) @@ -700,6 +730,19 @@ static int ip_tun_from_nlattr(const struct nlattr *attr, break; case OVS_TUNNEL_KEY_ATTR_PAD: break; + case OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS: + if (opts_type) { + OVS_NLERR(log, "Multiple metadata blocks provided"); + return -EINVAL; + } + + err = erspan_tun_opt_from_nlattr(a, match, is_mask, log); + if (err) + return err; + + tun_flags |= TUNNEL_ERSPAN_OPT; + opts_type = type; + break; default: OVS_NLERR(log, "Unknown IP tunnel attribute %d", type); @@ -824,6 +867,10 @@ static int __ip_tun_to_nlattr(struct sk_buff *skb, else if (output->tun_flags & TUNNEL_VXLAN_OPT && vxlan_opt_to_nlattr(skb, tun_opts, swkey_tun_opts_len)) return -EMSGSIZE; + else if (output->tun_flags & TUNNEL_ERSPAN_OPT && + nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS, + ((struct erspan_metadata *)tun_opts)->index)) + return -EMSGSIZE; } return 0; @@ -2195,6 +2242,8 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, break; case OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS: break; + case OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS: + break; } }; -- cgit v1.2.3 From e7bf8249e8f1bac64885eeccb55bcf6111901a81 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 9 Oct 2017 10:30:10 -0700 Subject: bpf: encapsulate verifier log state into a structure Put the loose log_* variables into a structure. This will make it simpler to remove the global verifier state in following patches. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- include/linux/bpf_verifier.h | 13 ++++++++++ kernel/bpf/verifier.c | 57 +++++++++++++++++++++++--------------------- 2 files changed, 43 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index b8d200f60a40..163541ba70d9 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -115,6 +115,19 @@ struct bpf_insn_aux_data { #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ +struct bpf_verifer_log { + u32 level; + char *kbuf; + char __user *ubuf; + u32 len_used; + u32 len_total; +}; + +static inline bool bpf_verifier_log_full(const struct bpf_verifer_log *log) +{ + return log->len_used >= log->len_total - 1; +} + struct bpf_verifier_env; struct bpf_ext_analyzer_ops { int (*insn_hook)(struct bpf_verifier_env *env, diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6352a88ca6d1..e53458b02249 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -156,8 +156,7 @@ struct bpf_call_arg_meta { /* verbose verifier prints what it's seeing * bpf_check() is called under lock, so no race to access these global vars */ -static u32 log_level, log_size, log_len; -static char *log_buf; +static struct bpf_verifer_log verifier_log; static DEFINE_MUTEX(bpf_verifier_lock); @@ -167,13 +166,15 @@ static DEFINE_MUTEX(bpf_verifier_lock); */ static __printf(1, 2) void verbose(const char *fmt, ...) { + struct bpf_verifer_log *log = &verifier_log; va_list args; - if (log_level == 0 || log_len >= log_size - 1) + if (!log->level || bpf_verifier_log_full(log)) return; va_start(args, fmt); - log_len += vscnprintf(log_buf + log_len, log_size - log_len, fmt, args); + log->len_used += vscnprintf(log->kbuf + log->len_used, + log->len_total - log->len_used, fmt, args); va_end(args); } @@ -886,7 +887,7 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, * need to try adding each of min_value and max_value to off * to make sure our theoretical access will be safe. 
*/ - if (log_level) + if (verifier_log.level) print_verifier_state(state); /* The minimum value is only important with signed * comparisons where we can't assume the floor of a @@ -2956,7 +2957,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, verbose("R%d pointer comparison prohibited\n", insn->dst_reg); return -EACCES; } - if (log_level) + if (verifier_log.level) print_verifier_state(this_branch); return 0; } @@ -3712,7 +3713,7 @@ static int do_check(struct bpf_verifier_env *env) return err; if (err == 1) { /* found equivalent state, can prune the search */ - if (log_level) { + if (verifier_log.level) { if (do_print_state) verbose("\nfrom %d to %d: safe\n", prev_insn_idx, insn_idx); @@ -3725,8 +3726,9 @@ static int do_check(struct bpf_verifier_env *env) if (need_resched()) cond_resched(); - if (log_level > 1 || (log_level && do_print_state)) { - if (log_level > 1) + if (verifier_log.level > 1 || + (verifier_log.level && do_print_state)) { + if (verifier_log.level > 1) verbose("%d:", insn_idx); else verbose("\nfrom %d to %d:", @@ -3735,7 +3737,7 @@ static int do_check(struct bpf_verifier_env *env) do_print_state = false; } - if (log_level) { + if (verifier_log.level) { verbose("%d: ", insn_idx); print_bpf_insn(env, insn); } @@ -4389,7 +4391,7 @@ static void free_states(struct bpf_verifier_env *env) int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) { - char __user *log_ubuf = NULL; + struct bpf_verifer_log *log = &verifier_log; struct bpf_verifier_env *env; int ret = -EINVAL; @@ -4414,23 +4416,23 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) /* user requested verbose verifier output * and supplied buffer to store the verification trace */ - log_level = attr->log_level; - log_ubuf = (char __user *) (unsigned long) attr->log_buf; - log_size = attr->log_size; - log_len = 0; + log->level = attr->log_level; + log->ubuf = (char __user *) (unsigned long) attr->log_buf; + log->len_total = attr->log_size; + log->len_used = 0; ret = -EINVAL; - /* log_* values have to be sane */ - if (log_size < 128 || log_size > UINT_MAX >> 8 || - log_level == 0 || log_ubuf == NULL) + /* log attributes have to be sane */ + if (log->len_total < 128 || log->len_total > UINT_MAX >> 8 || + !log->level || !log->ubuf) goto err_unlock; ret = -ENOMEM; - log_buf = vmalloc(log_size); - if (!log_buf) + log->kbuf = vmalloc(log->len_total); + if (!log->kbuf) goto err_unlock; } else { - log_level = 0; + log->level = 0; } env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT); @@ -4467,15 +4469,16 @@ skip_full_check: if (ret == 0) ret = fixup_bpf_calls(env); - if (log_level && log_len >= log_size - 1) { - BUG_ON(log_len >= log_size); + if (log->level && bpf_verifier_log_full(log)) { + BUG_ON(log->len_used >= log->len_total); /* verifier log exceeded user supplied buffer */ ret = -ENOSPC; /* fall through to return what was recorded */ } /* copy verifier log back to user space including trailing zero */ - if (log_level && copy_to_user(log_ubuf, log_buf, log_len + 1) != 0) { + if (log->level && copy_to_user(log->ubuf, log->kbuf, + log->len_used + 1) != 0) { ret = -EFAULT; goto free_log_buf; } @@ -4502,8 +4505,8 @@ skip_full_check: } free_log_buf: - if (log_level) - vfree(log_buf); + if (log->level) + vfree(log->kbuf); if (!env->prog->aux->used_maps) /* if we didn't copy map pointers into bpf_prog_info, release * them now. Otherwise free_bpf_prog_info() will release them. 
@@ -4540,7 +4543,7 @@ int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, /* grab the mutex to protect few globals used by verifier */ mutex_lock(&bpf_verifier_lock); - log_level = 0; + verifier_log.level = 0; env->strict_alignment = false; if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) -- cgit v1.2.3 From 61bd5218eef349fcacc4976a251bc83a4748b4af Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 9 Oct 2017 10:30:11 -0700 Subject: bpf: move global verifier log into verifier environment The biggest piece of global state protected by the verifier lock is the verifier_log. Move that log to struct bpf_verifier_env. struct bpf_verifier_env has to be passed now to all invocations of verbose(). Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/bpf_verifier.h | 2 + kernel/bpf/verifier.c | 491 +++++++++++++++++++++++-------------------- 2 files changed, 261 insertions(+), 232 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 163541ba70d9..5ddb9a626a51 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -152,6 +152,8 @@ struct bpf_verifier_env { bool allow_ptr_leaks; bool seen_direct_write; struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */ + + struct bpf_verifer_log log; }; int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e53458b02249..a352f93cd4b2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -153,20 +153,16 @@ struct bpf_call_arg_meta { int access_size; }; -/* verbose verifier prints what it's seeing - * bpf_check() is called under lock, so no race to access these global vars - */ -static struct bpf_verifer_log verifier_log; - static DEFINE_MUTEX(bpf_verifier_lock); /* log_level controls verbosity level of eBPF verifier. * verbose() is used to dump the verification trace to the log, so the user * can figure out what's wrong with the program */ -static __printf(1, 2) void verbose(const char *fmt, ...) +static __printf(2, 3) void verbose(struct bpf_verifier_env *env, + const char *fmt, ...) 
{ - struct bpf_verifer_log *log = &verifier_log; + struct bpf_verifer_log *log = &env->log; va_list args; if (!log->level || bpf_verifier_log_full(log)) @@ -214,7 +210,8 @@ static const char *func_id_name(int id) return "unknown"; } -static void print_verifier_state(struct bpf_verifier_state *state) +static void print_verifier_state(struct bpf_verifier_env *env, + struct bpf_verifier_state *state) { struct bpf_reg_state *reg; enum bpf_reg_type t; @@ -225,21 +222,21 @@ static void print_verifier_state(struct bpf_verifier_state *state) t = reg->type; if (t == NOT_INIT) continue; - verbose(" R%d=%s", i, reg_type_str[t]); + verbose(env, " R%d=%s", i, reg_type_str[t]); if ((t == SCALAR_VALUE || t == PTR_TO_STACK) && tnum_is_const(reg->var_off)) { /* reg->off should be 0 for SCALAR_VALUE */ - verbose("%lld", reg->var_off.value + reg->off); + verbose(env, "%lld", reg->var_off.value + reg->off); } else { - verbose("(id=%d", reg->id); + verbose(env, "(id=%d", reg->id); if (t != SCALAR_VALUE) - verbose(",off=%d", reg->off); + verbose(env, ",off=%d", reg->off); if (type_is_pkt_pointer(t)) - verbose(",r=%d", reg->range); + verbose(env, ",r=%d", reg->range); else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE || t == PTR_TO_MAP_VALUE_OR_NULL) - verbose(",ks=%d,vs=%d", + verbose(env, ",ks=%d,vs=%d", reg->map_ptr->key_size, reg->map_ptr->value_size); if (tnum_is_const(reg->var_off)) { @@ -247,38 +244,38 @@ static void print_verifier_state(struct bpf_verifier_state *state) * could be a pointer whose offset is too big * for reg->off */ - verbose(",imm=%llx", reg->var_off.value); + verbose(env, ",imm=%llx", reg->var_off.value); } else { if (reg->smin_value != reg->umin_value && reg->smin_value != S64_MIN) - verbose(",smin_value=%lld", + verbose(env, ",smin_value=%lld", (long long)reg->smin_value); if (reg->smax_value != reg->umax_value && reg->smax_value != S64_MAX) - verbose(",smax_value=%lld", + verbose(env, ",smax_value=%lld", (long long)reg->smax_value); if (reg->umin_value != 0) - verbose(",umin_value=%llu", + verbose(env, ",umin_value=%llu", (unsigned long long)reg->umin_value); if (reg->umax_value != U64_MAX) - verbose(",umax_value=%llu", + verbose(env, ",umax_value=%llu", (unsigned long long)reg->umax_value); if (!tnum_is_unknown(reg->var_off)) { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(",var_off=%s", tn_buf); + verbose(env, ",var_off=%s", tn_buf); } } - verbose(")"); + verbose(env, ")"); } } for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { if (state->stack_slot_type[i] == STACK_SPILL) - verbose(" fp%d=%s", -MAX_BPF_STACK + i, + verbose(env, " fp%d=%s", -MAX_BPF_STACK + i, reg_type_str[state->spilled_regs[i / BPF_REG_SIZE].type]); } - verbose("\n"); + verbose(env, "\n"); } static const char *const bpf_class_string[] = { @@ -333,15 +330,15 @@ static const char *const bpf_jmp_string[16] = { [BPF_EXIT >> 4] = "exit", }; -static void print_bpf_end_insn(const struct bpf_verifier_env *env, +static void print_bpf_end_insn(struct bpf_verifier_env *env, const struct bpf_insn *insn) { - verbose("(%02x) r%d = %s%d r%d\n", insn->code, insn->dst_reg, + verbose(env, "(%02x) r%d = %s%d r%d\n", insn->code, insn->dst_reg, BPF_SRC(insn->code) == BPF_TO_BE ? 
"be" : "le", insn->imm, insn->dst_reg); } -static void print_bpf_insn(const struct bpf_verifier_env *env, +static void print_bpf_insn(struct bpf_verifier_env *env, const struct bpf_insn *insn) { u8 class = BPF_CLASS(insn->code); @@ -349,23 +346,23 @@ static void print_bpf_insn(const struct bpf_verifier_env *env, if (class == BPF_ALU || class == BPF_ALU64) { if (BPF_OP(insn->code) == BPF_END) { if (class == BPF_ALU64) - verbose("BUG_alu64_%02x\n", insn->code); + verbose(env, "BUG_alu64_%02x\n", insn->code); else print_bpf_end_insn(env, insn); } else if (BPF_OP(insn->code) == BPF_NEG) { - verbose("(%02x) r%d = %s-r%d\n", + verbose(env, "(%02x) r%d = %s-r%d\n", insn->code, insn->dst_reg, class == BPF_ALU ? "(u32) " : "", insn->dst_reg); } else if (BPF_SRC(insn->code) == BPF_X) { - verbose("(%02x) %sr%d %s %sr%d\n", + verbose(env, "(%02x) %sr%d %s %sr%d\n", insn->code, class == BPF_ALU ? "(u32) " : "", insn->dst_reg, bpf_alu_string[BPF_OP(insn->code) >> 4], class == BPF_ALU ? "(u32) " : "", insn->src_reg); } else { - verbose("(%02x) %sr%d %s %s%d\n", + verbose(env, "(%02x) %sr%d %s %s%d\n", insn->code, class == BPF_ALU ? "(u32) " : "", insn->dst_reg, bpf_alu_string[BPF_OP(insn->code) >> 4], @@ -374,46 +371,46 @@ static void print_bpf_insn(const struct bpf_verifier_env *env, } } else if (class == BPF_STX) { if (BPF_MODE(insn->code) == BPF_MEM) - verbose("(%02x) *(%s *)(r%d %+d) = r%d\n", + verbose(env, "(%02x) *(%s *)(r%d %+d) = r%d\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->src_reg); else if (BPF_MODE(insn->code) == BPF_XADD) - verbose("(%02x) lock *(%s *)(r%d %+d) += r%d\n", + verbose(env, "(%02x) lock *(%s *)(r%d %+d) += r%d\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->src_reg); else - verbose("BUG_%02x\n", insn->code); + verbose(env, "BUG_%02x\n", insn->code); } else if (class == BPF_ST) { if (BPF_MODE(insn->code) != BPF_MEM) { - verbose("BUG_st_%02x\n", insn->code); + verbose(env, "BUG_st_%02x\n", insn->code); return; } - verbose("(%02x) *(%s *)(r%d %+d) = %d\n", + verbose(env, "(%02x) *(%s *)(r%d %+d) = %d\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->imm); } else if (class == BPF_LDX) { if (BPF_MODE(insn->code) != BPF_MEM) { - verbose("BUG_ldx_%02x\n", insn->code); + verbose(env, "BUG_ldx_%02x\n", insn->code); return; } - verbose("(%02x) r%d = *(%s *)(r%d %+d)\n", + verbose(env, "(%02x) r%d = *(%s *)(r%d %+d)\n", insn->code, insn->dst_reg, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->src_reg, insn->off); } else if (class == BPF_LD) { if (BPF_MODE(insn->code) == BPF_ABS) { - verbose("(%02x) r0 = *(%s *)skb[%d]\n", + verbose(env, "(%02x) r0 = *(%s *)skb[%d]\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->imm); } else if (BPF_MODE(insn->code) == BPF_IND) { - verbose("(%02x) r0 = *(%s *)skb[r%d + %d]\n", + verbose(env, "(%02x) r0 = *(%s *)skb[r%d + %d]\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->src_reg, insn->imm); @@ -428,36 +425,37 @@ static void print_bpf_insn(const struct bpf_verifier_env *env, if (map_ptr && !env->allow_ptr_leaks) imm = 0; - verbose("(%02x) r%d = 0x%llx\n", insn->code, + verbose(env, "(%02x) r%d = 0x%llx\n", insn->code, insn->dst_reg, (unsigned long long)imm); } else { - verbose("BUG_ld_%02x\n", insn->code); + verbose(env, "BUG_ld_%02x\n", insn->code); return; } } else if (class == BPF_JMP) { u8 opcode = BPF_OP(insn->code); if (opcode == BPF_CALL) { - verbose("(%02x) call %s#%d\n", 
insn->code, + verbose(env, "(%02x) call %s#%d\n", insn->code, func_id_name(insn->imm), insn->imm); } else if (insn->code == (BPF_JMP | BPF_JA)) { - verbose("(%02x) goto pc%+d\n", + verbose(env, "(%02x) goto pc%+d\n", insn->code, insn->off); } else if (insn->code == (BPF_JMP | BPF_EXIT)) { - verbose("(%02x) exit\n", insn->code); + verbose(env, "(%02x) exit\n", insn->code); } else if (BPF_SRC(insn->code) == BPF_X) { - verbose("(%02x) if r%d %s r%d goto pc%+d\n", + verbose(env, "(%02x) if r%d %s r%d goto pc%+d\n", insn->code, insn->dst_reg, bpf_jmp_string[BPF_OP(insn->code) >> 4], insn->src_reg, insn->off); } else { - verbose("(%02x) if r%d %s 0x%x goto pc%+d\n", + verbose(env, "(%02x) if r%d %s 0x%x goto pc%+d\n", insn->code, insn->dst_reg, bpf_jmp_string[BPF_OP(insn->code) >> 4], insn->imm, insn->off); } } else { - verbose("(%02x) %s\n", insn->code, bpf_class_string[class]); + verbose(env, "(%02x) %s\n", + insn->code, bpf_class_string[class]); } } @@ -496,7 +494,7 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, env->head = elem; env->stack_size++; if (env->stack_size > BPF_COMPLEXITY_LIMIT_STACK) { - verbose("BPF program is too complex\n"); + verbose(env, "BPF program is too complex\n"); goto err; } return &elem->st; @@ -534,10 +532,11 @@ static void __mark_reg_known_zero(struct bpf_reg_state *reg) __mark_reg_known(reg, 0); } -static void mark_reg_known_zero(struct bpf_reg_state *regs, u32 regno) +static void mark_reg_known_zero(struct bpf_verifier_env *env, + struct bpf_reg_state *regs, u32 regno) { if (WARN_ON(regno >= MAX_BPF_REG)) { - verbose("mark_reg_known_zero(regs, %u)\n", regno); + verbose(env, "mark_reg_known_zero(regs, %u)\n", regno); /* Something bad happened, let's kill all regs */ for (regno = 0; regno < MAX_BPF_REG; regno++) __mark_reg_not_init(regs + regno); @@ -647,10 +646,11 @@ static void __mark_reg_unknown(struct bpf_reg_state *reg) __mark_reg_unbounded(reg); } -static void mark_reg_unknown(struct bpf_reg_state *regs, u32 regno) +static void mark_reg_unknown(struct bpf_verifier_env *env, + struct bpf_reg_state *regs, u32 regno) { if (WARN_ON(regno >= MAX_BPF_REG)) { - verbose("mark_reg_unknown(regs, %u)\n", regno); + verbose(env, "mark_reg_unknown(regs, %u)\n", regno); /* Something bad happened, let's kill all regs */ for (regno = 0; regno < MAX_BPF_REG; regno++) __mark_reg_not_init(regs + regno); @@ -665,10 +665,11 @@ static void __mark_reg_not_init(struct bpf_reg_state *reg) reg->type = NOT_INIT; } -static void mark_reg_not_init(struct bpf_reg_state *regs, u32 regno) +static void mark_reg_not_init(struct bpf_verifier_env *env, + struct bpf_reg_state *regs, u32 regno) { if (WARN_ON(regno >= MAX_BPF_REG)) { - verbose("mark_reg_not_init(regs, %u)\n", regno); + verbose(env, "mark_reg_not_init(regs, %u)\n", regno); /* Something bad happened, let's kill all regs */ for (regno = 0; regno < MAX_BPF_REG; regno++) __mark_reg_not_init(regs + regno); @@ -677,22 +678,23 @@ static void mark_reg_not_init(struct bpf_reg_state *regs, u32 regno) __mark_reg_not_init(regs + regno); } -static void init_reg_state(struct bpf_reg_state *regs) +static void init_reg_state(struct bpf_verifier_env *env, + struct bpf_reg_state *regs) { int i; for (i = 0; i < MAX_BPF_REG; i++) { - mark_reg_not_init(regs, i); + mark_reg_not_init(env, regs, i); regs[i].live = REG_LIVE_NONE; } /* frame pointer */ regs[BPF_REG_FP].type = PTR_TO_STACK; - mark_reg_known_zero(regs, BPF_REG_FP); + mark_reg_known_zero(env, regs, BPF_REG_FP); /* 1st arg to a function */ regs[BPF_REG_1].type = 
PTR_TO_CTX; - mark_reg_known_zero(regs, BPF_REG_1); + mark_reg_known_zero(env, regs, BPF_REG_1); } enum reg_arg_type { @@ -726,26 +728,26 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, struct bpf_reg_state *regs = env->cur_state.regs; if (regno >= MAX_BPF_REG) { - verbose("R%d is invalid\n", regno); + verbose(env, "R%d is invalid\n", regno); return -EINVAL; } if (t == SRC_OP) { /* check whether register used as source operand can be read */ if (regs[regno].type == NOT_INIT) { - verbose("R%d !read_ok\n", regno); + verbose(env, "R%d !read_ok\n", regno); return -EACCES; } mark_reg_read(&env->cur_state, regno); } else { /* check whether register used as dest operand can be written to */ if (regno == BPF_REG_FP) { - verbose("frame pointer is read only\n"); + verbose(env, "frame pointer is read only\n"); return -EACCES; } regs[regno].live |= REG_LIVE_WRITTEN; if (t == DST_OP) - mark_reg_unknown(regs, regno); + mark_reg_unknown(env, regs, regno); } return 0; } @@ -770,7 +772,8 @@ static bool is_spillable_regtype(enum bpf_reg_type type) /* check_stack_read/write functions track spill/fill of registers, * stack boundary and alignment are checked in check_mem_access() */ -static int check_stack_write(struct bpf_verifier_state *state, int off, +static int check_stack_write(struct bpf_verifier_env *env, + struct bpf_verifier_state *state, int off, int size, int value_regno) { int i, spi = (MAX_BPF_STACK + off) / BPF_REG_SIZE; @@ -783,7 +786,7 @@ static int check_stack_write(struct bpf_verifier_state *state, int off, /* register containing pointer is being spilled into stack */ if (size != BPF_REG_SIZE) { - verbose("invalid size of register spill\n"); + verbose(env, "invalid size of register spill\n"); return -EACCES; } @@ -818,7 +821,8 @@ static void mark_stack_slot_read(const struct bpf_verifier_state *state, int slo } } -static int check_stack_read(struct bpf_verifier_state *state, int off, int size, +static int check_stack_read(struct bpf_verifier_env *env, + struct bpf_verifier_state *state, int off, int size, int value_regno) { u8 *slot_type; @@ -828,12 +832,12 @@ static int check_stack_read(struct bpf_verifier_state *state, int off, int size, if (slot_type[0] == STACK_SPILL) { if (size != BPF_REG_SIZE) { - verbose("invalid size of register spill\n"); + verbose(env, "invalid size of register spill\n"); return -EACCES; } for (i = 1; i < BPF_REG_SIZE; i++) { if (slot_type[i] != STACK_SPILL) { - verbose("corrupted spill memory\n"); + verbose(env, "corrupted spill memory\n"); return -EACCES; } } @@ -849,14 +853,14 @@ static int check_stack_read(struct bpf_verifier_state *state, int off, int size, } else { for (i = 0; i < size; i++) { if (slot_type[i] != STACK_MISC) { - verbose("invalid read from stack off %d+%d size %d\n", + verbose(env, "invalid read from stack off %d+%d size %d\n", off, i, size); return -EACCES; } } if (value_regno >= 0) /* have read misc data from the stack */ - mark_reg_unknown(state->regs, value_regno); + mark_reg_unknown(env, state->regs, value_regno); return 0; } } @@ -868,7 +872,7 @@ static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off, struct bpf_map *map = env->cur_state.regs[regno].map_ptr; if (off < 0 || size <= 0 || off + size > map->value_size) { - verbose("invalid access to map value, value_size=%d off=%d size=%d\n", + verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n", map->value_size, off, size); return -EACCES; } @@ -887,8 +891,8 @@ static int check_map_access(struct bpf_verifier_env *env, u32 
regno, * need to try adding each of min_value and max_value to off * to make sure our theoretical access will be safe. */ - if (verifier_log.level) - print_verifier_state(state); + if (env->log.level) + print_verifier_state(env, state); /* The minimum value is only important with signed * comparisons where we can't assume the floor of a * value is 0. If we are using signed variables for our * index'es then we will have a set floor within our range. */ if (reg->smin_value < 0) { - verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", + verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", regno); return -EACCES; } err = __check_map_access(env, regno, reg->smin_value + off, size); if (err) { - verbose("R%d min value is outside of the array range\n", regno); + verbose(env, "R%d min value is outside of the array range\n", + regno); return err; } @@ -911,13 +916,14 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, * If reg->umax_value + off could overflow, treat that as unbounded too. */ if (reg->umax_value >= BPF_MAX_VAR_OFF) { - verbose("R%d unbounded memory access, make sure to bounds check any array access into a map\n", + verbose(env, "R%d unbounded memory access, make sure to bounds check any array access into a map\n", regno); return -EACCES; } err = __check_map_access(env, regno, reg->umax_value + off, size); if (err) - verbose("R%d max value is outside of the array range\n", regno); + verbose(env, "R%d max value is outside of the array range\n", + regno); return err; } @@ -956,7 +962,7 @@ static int __check_packet_access(struct bpf_verifier_env *env, u32 regno, struct bpf_reg_state *reg = &regs[regno]; if (off < 0 || size <= 0 || (u64)off + size > reg->range) { - verbose("invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n", + verbose(env, "invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n", off, size, regno, reg->id, reg->off, reg->range); return -EACCES; } @@ -979,13 +985,13 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, * detail to prove they're safe.
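* (A negative offset could land in front of the packet data, and * reg->range only bounds how far past the pointer a read is safe.)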
*/ if (reg->smin_value < 0) { - verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", + verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", regno); return -EACCES; } err = __check_packet_access(env, regno, off, size); if (err) { - verbose("R%d offset is outside of the packet\n", regno); + verbose(env, "R%d offset is outside of the packet\n", regno); return err; } return err; @@ -1021,7 +1027,7 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, return 0; } - verbose("invalid bpf_context access off=%d size=%d\n", off, size); + verbose(env, "invalid bpf_context access off=%d size=%d\n", off, size); return -EACCES; } @@ -1039,7 +1045,8 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno) return __is_pointer_value(env->allow_ptr_leaks, &env->cur_state.regs[regno]); } -static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg, +static int check_pkt_ptr_alignment(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, int off, int size, bool strict) { struct tnum reg_off; @@ -1064,7 +1071,8 @@ static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg, char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose("misaligned packet access off %d+%s+%d+%d size %d\n", + verbose(env, + "misaligned packet access off %d+%s+%d+%d size %d\n", ip_align, tn_buf, reg->off, off, size); return -EACCES; } @@ -1072,7 +1080,8 @@ static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg, return 0; } -static int check_generic_ptr_alignment(const struct bpf_reg_state *reg, +static int check_generic_ptr_alignment(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, const char *pointer_desc, int off, int size, bool strict) { @@ -1087,7 +1096,7 @@ static int check_generic_ptr_alignment(const struct bpf_reg_state *reg, char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose("misaligned %saccess off %s+%d+%d size %d\n", + verbose(env, "misaligned %saccess off %s+%d+%d size %d\n", pointer_desc, tn_buf, reg->off, off, size); return -EACCES; } @@ -1108,7 +1117,7 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, /* Special case, because of NET_IP_ALIGN. Given metadata sits * right in front, treat it the very same way. 
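* (NET_IP_ALIGN is the small pad a driver leaves before the link-layer * header so the IP header lands on a 4-byte boundary; it is typically 2, * and 0 on architectures where unaligned access is cheap.)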
*/ - return check_pkt_ptr_alignment(reg, off, size, strict); + return check_pkt_ptr_alignment(env, reg, off, size, strict); case PTR_TO_MAP_VALUE: pointer_desc = "value "; break; @@ -1121,7 +1130,8 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, default: break; } - return check_generic_ptr_alignment(reg, pointer_desc, off, size, strict); + return check_generic_ptr_alignment(env, reg, pointer_desc, off, size, + strict); } /* check whether memory at (regno + off) is accessible for t = (read | write) @@ -1153,20 +1163,20 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (reg->type == PTR_TO_MAP_VALUE) { if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { - verbose("R%d leaks addr into map\n", value_regno); + verbose(env, "R%d leaks addr into map\n", value_regno); return -EACCES; } err = check_map_access(env, regno, off, size); if (!err && t == BPF_READ && value_regno >= 0) - mark_reg_unknown(state->regs, value_regno); + mark_reg_unknown(env, state->regs, value_regno); } else if (reg->type == PTR_TO_CTX) { enum bpf_reg_type reg_type = SCALAR_VALUE; if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { - verbose("R%d leaks addr into ctx\n", value_regno); + verbose(env, "R%d leaks addr into ctx\n", value_regno); return -EACCES; } /* ctx accesses must be at a fixed offset, so that we can @@ -1176,7 +1186,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose("variable ctx access var_off=%s off=%d size=%d", + verbose(env, + "variable ctx access var_off=%s off=%d size=%d", tn_buf, off, size); return -EACCES; } @@ -1188,9 +1199,10 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn * case, we know the offset is zero. 
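* That is why the pointer branch below may seed the register with * mark_reg_known_zero() and then simply reset its id, off and range.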
*/ if (reg_type == SCALAR_VALUE) - mark_reg_unknown(state->regs, value_regno); + mark_reg_unknown(env, state->regs, value_regno); else - mark_reg_known_zero(state->regs, value_regno); + mark_reg_known_zero(env, state->regs, + value_regno); state->regs[value_regno].id = 0; state->regs[value_regno].off = 0; state->regs[value_regno].range = 0; @@ -1206,13 +1218,14 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose("variable stack access var_off=%s off=%d size=%d", + verbose(env, "variable stack access var_off=%s off=%d size=%d", tn_buf, off, size); return -EACCES; } off += reg->var_off.value; if (off >= 0 || off < -MAX_BPF_STACK) { - verbose("invalid stack off=%d size=%d\n", off, size); + verbose(env, "invalid stack off=%d size=%d\n", off, + size); return -EACCES; } @@ -1223,29 +1236,32 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (!env->allow_ptr_leaks && state->stack_slot_type[MAX_BPF_STACK + off] == STACK_SPILL && size != BPF_REG_SIZE) { - verbose("attempt to corrupt spilled pointer on stack\n"); + verbose(env, "attempt to corrupt spilled pointer on stack\n"); return -EACCES; } - err = check_stack_write(state, off, size, value_regno); + err = check_stack_write(env, state, off, size, + value_regno); } else { - err = check_stack_read(state, off, size, value_regno); + err = check_stack_read(env, state, off, size, + value_regno); } } else if (reg_is_pkt_pointer(reg)) { if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) { - verbose("cannot write into packet\n"); + verbose(env, "cannot write into packet\n"); return -EACCES; } if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { - verbose("R%d leaks addr into packet\n", value_regno); + verbose(env, "R%d leaks addr into packet\n", + value_regno); return -EACCES; } err = check_packet_access(env, regno, off, size); if (!err && t == BPF_READ && value_regno >= 0) - mark_reg_unknown(state->regs, value_regno); + mark_reg_unknown(env, state->regs, value_regno); } else { - verbose("R%d invalid mem access '%s'\n", - regno, reg_type_str[reg->type]); + verbose(env, "R%d invalid mem access '%s'\n", regno, + reg_type_str[reg->type]); return -EACCES; } @@ -1265,7 +1281,7 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) || insn->imm != 0) { - verbose("BPF_XADD uses reserved fields\n"); + verbose(env, "BPF_XADD uses reserved fields\n"); return -EINVAL; } @@ -1280,7 +1296,7 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins return err; if (is_pointer_value(env, insn->src_reg)) { - verbose("R%d leaks addr into mem\n", insn->src_reg); + verbose(env, "R%d leaks addr into mem\n", insn->src_reg); return -EACCES; } @@ -1321,7 +1337,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, register_is_null(regs[regno])) return 0; - verbose("R%d type=%s expected=%s\n", regno, + verbose(env, "R%d type=%s expected=%s\n", regno, reg_type_str[regs[regno].type], reg_type_str[PTR_TO_STACK]); return -EACCES; @@ -1332,13 +1348,13 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), regs[regno].var_off); - verbose("invalid variable stack read R%d var_off=%s\n", + verbose(env, "invalid variable stack read R%d var_off=%s\n", regno, tn_buf); } off = regs[regno].off + 
regs[regno].var_off.value; if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 || access_size <= 0) { - verbose("invalid stack type R%d off=%d access_size=%d\n", + verbose(env, "invalid stack type R%d off=%d access_size=%d\n", regno, off, access_size); return -EACCES; } @@ -1354,7 +1370,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, for (i = 0; i < access_size; i++) { if (state->stack_slot_type[MAX_BPF_STACK + off + i] != STACK_MISC) { - verbose("invalid indirect read from stack off %d+%d size %d\n", + verbose(env, "invalid indirect read from stack off %d+%d size %d\n", off, i, access_size); return -EACCES; } @@ -1397,7 +1413,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, if (arg_type == ARG_ANYTHING) { if (is_pointer_value(env, regno)) { - verbose("R%d leaks addr into helper function\n", regno); + verbose(env, "R%d leaks addr into helper function\n", + regno); return -EACCES; } return 0; @@ -1405,7 +1422,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, if (type_is_pkt_pointer(type) && !may_access_direct_pkt_data(env, meta, BPF_READ)) { - verbose("helper access to the packet is not allowed\n"); + verbose(env, "helper access to the packet is not allowed\n"); return -EACCES; } @@ -1443,7 +1460,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, goto err_type; meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM; } else { - verbose("unsupported arg_type %d\n", arg_type); + verbose(env, "unsupported arg_type %d\n", arg_type); return -EFAULT; } @@ -1461,7 +1478,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, * we have to check map_key here. Otherwise it means * that kernel subsystem misconfigured verifier */ - verbose("invalid map_ptr to access map->key\n"); + verbose(env, "invalid map_ptr to access map->key\n"); return -EACCES; } if (type_is_pkt_pointer(type)) @@ -1477,7 +1494,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, */ if (!meta->map_ptr) { /* kernel subsystem misconfigured verifier */ - verbose("invalid map_ptr to access map->value\n"); + verbose(env, "invalid map_ptr to access map->value\n"); return -EACCES; } if (type_is_pkt_pointer(type)) @@ -1497,7 +1514,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, */ if (regno == 0) { /* kernel subsystem misconfigured verifier */ - verbose("ARG_CONST_SIZE cannot be first argument\n"); + verbose(env, + "ARG_CONST_SIZE cannot be first argument\n"); return -EACCES; } @@ -1514,7 +1532,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, meta = NULL; if (reg->smin_value < 0) { - verbose("R%d min value is negative, either use unsigned or 'var &= const'\n", + verbose(env, "R%d min value is negative, either use unsigned or 'var &= const'\n", regno); return -EACCES; } @@ -1528,7 +1546,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, } if (reg->umax_value >= BPF_MAX_VAR_SIZ) { - verbose("R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n", + verbose(env, "R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n", regno); return -EACCES; } @@ -1539,12 +1557,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, return err; err_type: - verbose("R%d type=%s expected=%s\n", regno, + verbose(env, "R%d type=%s expected=%s\n", regno, reg_type_str[type], reg_type_str[expected_type]); return -EACCES; } -static int check_map_func_compatibility(struct bpf_map *map, int func_id) +static int 
check_map_func_compatibility(struct bpf_verifier_env *env, + struct bpf_map *map, int func_id) { if (!map) return 0; @@ -1632,7 +1651,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) return 0; error: - verbose("cannot pass map_type %d into func %s#%d\n", + verbose(env, "cannot pass map_type %d into func %s#%d\n", map->map_type, func_id_name(func_id), func_id); return -EINVAL; } @@ -1666,7 +1685,7 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) for (i = 0; i < MAX_BPF_REG; i++) if (reg_is_pkt_pointer_any(&regs[i])) - mark_reg_unknown(regs, i); + mark_reg_unknown(env, regs, i); for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { if (state->stack_slot_type[i] != STACK_SPILL) @@ -1688,7 +1707,8 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) /* find function prototype */ if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) { - verbose("invalid func %s#%d\n", func_id_name(func_id), func_id); + verbose(env, "invalid func %s#%d\n", func_id_name(func_id), + func_id); return -EINVAL; } @@ -1696,13 +1716,14 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) fn = env->prog->aux->ops->get_func_proto(func_id); if (!fn) { - verbose("unknown func %s#%d\n", func_id_name(func_id), func_id); + verbose(env, "unknown func %s#%d\n", func_id_name(func_id), + func_id); return -EINVAL; } /* eBPF programs must be GPL compatible to use GPL-ed functions */ if (!env->prog->gpl_compatible && fn->gpl_only) { - verbose("cannot call GPL only function from proprietary program\n"); + verbose(env, "cannot call GPL only function from proprietary program\n"); return -EINVAL; } @@ -1716,7 +1737,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) */ err = check_raw_mode(fn); if (err) { - verbose("kernel subsystem misconfigured func %s#%d\n", + verbose(env, "kernel subsystem misconfigured func %s#%d\n", func_id_name(func_id), func_id); return err; } @@ -1749,14 +1770,14 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) /* reset caller saved regs */ for (i = 0; i < CALLER_SAVED_REGS; i++) { - mark_reg_not_init(regs, caller_saved[i]); + mark_reg_not_init(env, regs, caller_saved[i]); check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); } /* update return register (already marked as written above) */ if (fn->ret_type == RET_INTEGER) { /* sets type to SCALAR_VALUE */ - mark_reg_unknown(regs, BPF_REG_0); + mark_reg_unknown(env, regs, BPF_REG_0); } else if (fn->ret_type == RET_VOID) { regs[BPF_REG_0].type = NOT_INIT; } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) { @@ -1764,14 +1785,15 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; /* There is no offset yet applied, variable or fixed */ - mark_reg_known_zero(regs, BPF_REG_0); + mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].off = 0; /* remember map_ptr, so that check_map_access() * can check 'value_size' boundary of memory access * to map element returned from bpf_map_lookup_elem() */ if (meta.map_ptr == NULL) { - verbose("kernel subsystem misconfigured verifier\n"); + verbose(env, + "kernel subsystem misconfigured verifier\n"); return -EINVAL; } regs[BPF_REG_0].map_ptr = meta.map_ptr; @@ -1782,12 +1804,12 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) else if (insn_aux->map_ptr != meta.map_ptr) insn_aux->map_ptr = BPF_MAP_PTR_POISON; } else { - verbose("unknown return type %d
of func %s#%d\n", + verbose(env, "unknown return type %d of func %s#%d\n", fn->ret_type, func_id_name(func_id), func_id); return -EINVAL; } - err = check_map_func_compatibility(meta.map_ptr, func_id); + err = check_map_func_compatibility(env, meta.map_ptr, func_id); if (err) return err; @@ -1846,39 +1868,42 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, dst_reg = &regs[dst]; if (WARN_ON_ONCE(known && (smin_val != smax_val))) { - print_verifier_state(&env->cur_state); - verbose("verifier internal error: known but bad sbounds\n"); + print_verifier_state(env, &env->cur_state); + verbose(env, + "verifier internal error: known but bad sbounds\n"); return -EINVAL; } if (WARN_ON_ONCE(known && (umin_val != umax_val))) { - print_verifier_state(&env->cur_state); - verbose("verifier internal error: known but bad ubounds\n"); + print_verifier_state(env, &env->cur_state); + verbose(env, + "verifier internal error: known but bad ubounds\n"); return -EINVAL; } if (BPF_CLASS(insn->code) != BPF_ALU64) { /* 32-bit ALU ops on pointers produce (meaningless) scalars */ if (!env->allow_ptr_leaks) - verbose("R%d 32-bit pointer arithmetic prohibited\n", + verbose(env, + "R%d 32-bit pointer arithmetic prohibited\n", dst); return -EACCES; } if (ptr_reg->type == PTR_TO_MAP_VALUE_OR_NULL) { if (!env->allow_ptr_leaks) - verbose("R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n", + verbose(env, "R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n", dst); return -EACCES; } if (ptr_reg->type == CONST_PTR_TO_MAP) { if (!env->allow_ptr_leaks) - verbose("R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n", + verbose(env, "R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n", dst); return -EACCES; } if (ptr_reg->type == PTR_TO_PACKET_END) { if (!env->allow_ptr_leaks) - verbose("R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n", + verbose(env, "R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n", dst); return -EACCES; } @@ -1943,7 +1968,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, if (dst_reg == off_reg) { /* scalar -= pointer. Creates an unknown scalar */ if (!env->allow_ptr_leaks) - verbose("R%d tried to subtract pointer from scalar\n", + verbose(env, "R%d tried to subtract pointer from scalar\n", dst); return -EACCES; } @@ -1953,7 +1978,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, */ if (ptr_reg->type == PTR_TO_STACK) { if (!env->allow_ptr_leaks) - verbose("R%d subtraction from stack pointer prohibited\n", + verbose(env, "R%d subtraction from stack pointer prohibited\n", dst); return -EACCES; } @@ -2008,13 +2033,13 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, * ptr &= ~3 which would reduce min_value by 3.) */ if (!env->allow_ptr_leaks) - verbose("R%d bitwise operator %s on pointer prohibited\n", + verbose(env, "R%d bitwise operator %s on pointer prohibited\n", dst, bpf_alu_string[opcode >> 4]); return -EACCES; default: /* other operators (e.g. MUL,LSH) produce non-pointer results */ if (!env->allow_ptr_leaks) - verbose("R%d pointer arithmetic with %s operator prohibited\n", + verbose(env, "R%d pointer arithmetic with %s operator prohibited\n", dst, bpf_alu_string[opcode >> 4]); return -EACCES; } @@ -2180,7 +2205,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, /* Shifts greater than 63 are undefined. This includes * shifts by a negative number.
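* C leaves such shifts unspecified and JITs on different architectures * disagree on the result, so any bounds derived from them would be bogus.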
*/ - mark_reg_unknown(regs, insn->dst_reg); + mark_reg_unknown(env, regs, insn->dst_reg); break; } /* We lose all sign bit information (except what we can pick @@ -2208,7 +2233,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, /* Shifts greater than 63 are undefined. This includes * shifts by a negative number. */ - mark_reg_unknown(regs, insn->dst_reg); + mark_reg_unknown(env, regs, insn->dst_reg); break; } /* BPF_RSH is an unsigned shift, so make the appropriate casts */ @@ -2236,7 +2261,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, __update_reg_bounds(dst_reg); break; default: - mark_reg_unknown(regs, insn->dst_reg); + mark_reg_unknown(env, regs, insn->dst_reg); break; } @@ -2268,12 +2293,12 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, * an arbitrary scalar. */ if (!env->allow_ptr_leaks) { - verbose("R%d pointer %s pointer prohibited\n", + verbose(env, "R%d pointer %s pointer prohibited\n", insn->dst_reg, bpf_alu_string[opcode >> 4]); return -EACCES; } - mark_reg_unknown(regs, insn->dst_reg); + mark_reg_unknown(env, regs, insn->dst_reg); return 0; } else { /* scalar += pointer @@ -2325,13 +2350,13 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, /* Got here implies adding two SCALAR_VALUEs */ if (WARN_ON_ONCE(ptr_reg)) { - print_verifier_state(&env->cur_state); - verbose("verifier internal error: unexpected ptr_reg\n"); + print_verifier_state(env, &env->cur_state); + verbose(env, "verifier internal error: unexpected ptr_reg\n"); return -EINVAL; } if (WARN_ON(!src_reg)) { - print_verifier_state(&env->cur_state); - verbose("verifier internal error: no src_reg\n"); + print_verifier_state(env, &env->cur_state); + verbose(env, "verifier internal error: no src_reg\n"); return -EINVAL; } return adjust_scalar_min_max_vals(env, insn, dst_reg, *src_reg); @@ -2349,14 +2374,14 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) if (BPF_SRC(insn->code) != 0 || insn->src_reg != BPF_REG_0 || insn->off != 0 || insn->imm != 0) { - verbose("BPF_NEG uses reserved fields\n"); + verbose(env, "BPF_NEG uses reserved fields\n"); return -EINVAL; } } else { if (insn->src_reg != BPF_REG_0 || insn->off != 0 || (insn->imm != 16 && insn->imm != 32 && insn->imm != 64) || BPF_CLASS(insn->code) == BPF_ALU64) { - verbose("BPF_END uses reserved fields\n"); + verbose(env, "BPF_END uses reserved fields\n"); return -EINVAL; } } @@ -2367,7 +2392,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return err; if (is_pointer_value(env, insn->dst_reg)) { - verbose("R%d pointer arithmetic prohibited\n", + verbose(env, "R%d pointer arithmetic prohibited\n", insn->dst_reg); return -EACCES; } @@ -2381,7 +2406,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) if (BPF_SRC(insn->code) == BPF_X) { if (insn->imm != 0 || insn->off != 0) { - verbose("BPF_MOV uses reserved fields\n"); + verbose(env, "BPF_MOV uses reserved fields\n"); return -EINVAL; } @@ -2391,7 +2416,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return err; } else { if (insn->src_reg != BPF_REG_0 || insn->off != 0) { - verbose("BPF_MOV uses reserved fields\n"); + verbose(env, "BPF_MOV uses reserved fields\n"); return -EINVAL; } } @@ -2411,11 +2436,12 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } else { /* R1 = (u32) R2 */ if (is_pointer_value(env, insn->src_reg)) { - verbose("R%d partial copy of pointer\n", + verbose(env, 
+ "R%d partial copy of pointer\n", insn->src_reg); return -EACCES; } - mark_reg_unknown(regs, insn->dst_reg); + mark_reg_unknown(env, regs, insn->dst_reg); /* high 32 bits are known zero. */ regs[insn->dst_reg].var_off = tnum_cast( regs[insn->dst_reg].var_off, 4); @@ -2430,14 +2456,14 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } } else if (opcode > BPF_END) { - verbose("invalid BPF_ALU opcode %x\n", opcode); + verbose(env, "invalid BPF_ALU opcode %x\n", opcode); return -EINVAL; } else { /* all other ALU ops: and, sub, xor, add, ... */ if (BPF_SRC(insn->code) == BPF_X) { if (insn->imm != 0 || insn->off != 0) { - verbose("BPF_ALU uses reserved fields\n"); + verbose(env, "BPF_ALU uses reserved fields\n"); return -EINVAL; } /* check src1 operand */ @@ -2446,7 +2472,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return err; } else { if (insn->src_reg != BPF_REG_0 || insn->off != 0) { - verbose("BPF_ALU uses reserved fields\n"); + verbose(env, "BPF_ALU uses reserved fields\n"); return -EINVAL; } } @@ -2458,7 +2484,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) if ((opcode == BPF_MOD || opcode == BPF_DIV) && BPF_SRC(insn->code) == BPF_K && insn->imm == 0) { - verbose("div by zero\n"); + verbose(env, "div by zero\n"); return -EINVAL; } @@ -2467,7 +2493,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32; if (insn->imm < 0 || insn->imm >= size) { - verbose("invalid shift %d\n", insn->imm); + verbose(env, "invalid shift %d\n", insn->imm); return -EINVAL; } } @@ -2820,13 +2846,13 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, int err; if (opcode > BPF_JSLE) { - verbose("invalid BPF_JMP opcode %x\n", opcode); + verbose(env, "invalid BPF_JMP opcode %x\n", opcode); return -EINVAL; } if (BPF_SRC(insn->code) == BPF_X) { if (insn->imm != 0) { - verbose("BPF_JMP uses reserved fields\n"); + verbose(env, "BPF_JMP uses reserved fields\n"); return -EINVAL; } @@ -2836,13 +2862,13 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, return err; if (is_pointer_value(env, insn->src_reg)) { - verbose("R%d pointer comparison prohibited\n", + verbose(env, "R%d pointer comparison prohibited\n", insn->src_reg); return -EACCES; } } else { if (insn->src_reg != BPF_REG_0) { - verbose("BPF_JMP uses reserved fields\n"); + verbose(env, "BPF_JMP uses reserved fields\n"); return -EINVAL; } } @@ -2954,11 +2980,12 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, find_good_pkt_pointers(this_branch, &regs[insn->src_reg], PTR_TO_PACKET_META); } else if (is_pointer_value(env, insn->dst_reg)) { - verbose("R%d pointer comparison prohibited\n", insn->dst_reg); + verbose(env, "R%d pointer comparison prohibited\n", + insn->dst_reg); return -EACCES; } - if (verifier_log.level) - print_verifier_state(this_branch); + if (env->log.level) + print_verifier_state(env, this_branch); return 0; } @@ -2977,11 +3004,11 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) int err; if (BPF_SIZE(insn->code) != BPF_DW) { - verbose("invalid BPF_LD_IMM insn\n"); + verbose(env, "invalid BPF_LD_IMM insn\n"); return -EINVAL; } if (insn->off != 0) { - verbose("BPF_LD_IMM64 uses reserved fields\n"); + verbose(env, "BPF_LD_IMM64 uses reserved fields\n"); return -EINVAL; } @@ -3039,14 +3066,14 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) int i, err; if
(!may_access_skb(env->prog->type)) { - verbose("BPF_LD_[ABS|IND] instructions not allowed for this program type\n"); + verbose(env, "BPF_LD_[ABS|IND] instructions not allowed for this program type\n"); return -EINVAL; } if (insn->dst_reg != BPF_REG_0 || insn->off != 0 || BPF_SIZE(insn->code) == BPF_DW || (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) { - verbose("BPF_LD_[ABS|IND] uses reserved fields\n"); + verbose(env, "BPF_LD_[ABS|IND] uses reserved fields\n"); return -EINVAL; } @@ -3056,7 +3083,8 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) return err; if (regs[BPF_REG_6].type != PTR_TO_CTX) { - verbose("at the time of BPF_LD_ABS|IND R6 != pointer to skb\n"); + verbose(env, + "at the time of BPF_LD_ABS|IND R6 != pointer to skb\n"); return -EINVAL; } @@ -3069,7 +3097,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) /* reset caller saved regs to unreadable */ for (i = 0; i < CALLER_SAVED_REGS; i++) { - mark_reg_not_init(regs, caller_saved[i]); + mark_reg_not_init(env, regs, caller_saved[i]); check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); } @@ -3077,7 +3105,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) * the value fetched from the packet. * Already marked as written above. */ - mark_reg_unknown(regs, BPF_REG_0); + mark_reg_unknown(env, regs, BPF_REG_0); return 0; } @@ -3097,22 +3125,22 @@ static int check_return_code(struct bpf_verifier_env *env) reg = &env->cur_state.regs[BPF_REG_0]; if (reg->type != SCALAR_VALUE) { - verbose("At program exit the register R0 is not a known value (%s)\n", + verbose(env, "At program exit the register R0 is not a known value (%s)\n", reg_type_str[reg->type]); return -EINVAL; } if (!tnum_in(range, reg->var_off)) { - verbose("At program exit the register R0 "); + verbose(env, "At program exit the register R0 "); if (!tnum_is_unknown(reg->var_off)) { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose("has value %s", tn_buf); + verbose(env, "has value %s", tn_buf); } else { - verbose("has unknown scalar value"); + verbose(env, "has unknown scalar value"); } - verbose(" should have been 0 or 1\n"); + verbose(env, " should have been 0 or 1\n"); return -EINVAL; } return 0; @@ -3178,7 +3206,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) return 0; if (w < 0 || w >= env->prog->len) { - verbose("jump out of range from insn %d to %d\n", t, w); + verbose(env, "jump out of range from insn %d to %d\n", t, w); return -EINVAL; } @@ -3195,13 +3223,13 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) insn_stack[cur_stack++] = w; return 1; } else if ((insn_state[w] & 0xF0) == DISCOVERED) { - verbose("back-edge from insn %d to %d\n", t, w); + verbose(env, "back-edge from insn %d to %d\n", t, w); return -EINVAL; } else if (insn_state[w] == EXPLORED) { /* forward- or cross-edge */ insn_state[t] = DISCOVERED | e; } else { - verbose("insn state internal bug\n"); + verbose(env, "insn state internal bug\n"); return -EFAULT; } return 0; @@ -3295,7 +3323,7 @@ peek_stack: mark_explored: insn_state[t] = EXPLORED; if (cur_stack-- <= 0) { - verbose("pop stack internal bug\n"); + verbose(env, "pop stack internal bug\n"); ret = -EFAULT; goto err_free; } @@ -3304,7 +3332,7 @@ mark_explored: check_state: for (i = 0; i < insn_cnt; i++) { if (insn_state[i] != EXPLORED) { - verbose("unreachable insn %d\n", i); + verbose(env, "unreachable insn %d\n", i); ret = -EINVAL; goto err_free; } @@ -3685,7 +3713,7 @@ 
static int do_check(struct bpf_verifier_env *env) int insn_processed = 0; bool do_print_state = false; - init_reg_state(regs); + init_reg_state(env, regs); state->parent = NULL; insn_idx = 0; for (;;) { @@ -3694,7 +3722,7 @@ static int do_check(struct bpf_verifier_env *env) int err; if (insn_idx >= insn_cnt) { - verbose("invalid insn idx %d insn_cnt %d\n", + verbose(env, "invalid insn idx %d insn_cnt %d\n", insn_idx, insn_cnt); return -EFAULT; } @@ -3703,7 +3731,8 @@ static int do_check(struct bpf_verifier_env *env) class = BPF_CLASS(insn->code); if (++insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) { - verbose("BPF program is too large. Processed %d insn\n", + verbose(env, + "BPF program is too large. Processed %d insn\n", insn_processed); return -E2BIG; } @@ -3713,12 +3742,12 @@ static int do_check(struct bpf_verifier_env *env) return err; if (err == 1) { /* found equivalent state, can prune the search */ - if (verifier_log.level) { + if (env->log.level) { if (do_print_state) - verbose("\nfrom %d to %d: safe\n", + verbose(env, "\nfrom %d to %d: safe\n", prev_insn_idx, insn_idx); else - verbose("%d: safe\n", insn_idx); + verbose(env, "%d: safe\n", insn_idx); } goto process_bpf_exit; } @@ -3726,19 +3755,18 @@ static int do_check(struct bpf_verifier_env *env) if (need_resched()) cond_resched(); - if (verifier_log.level > 1 || - (verifier_log.level && do_print_state)) { - if (verifier_log.level > 1) - verbose("%d:", insn_idx); + if (env->log.level > 1 || (env->log.level && do_print_state)) { + if (env->log.level > 1) + verbose(env, "%d:", insn_idx); else - verbose("\nfrom %d to %d:", + verbose(env, "\nfrom %d to %d:", prev_insn_idx, insn_idx); - print_verifier_state(&env->cur_state); + print_verifier_state(env, &env->cur_state); do_print_state = false; } - if (verifier_log.level) { - verbose("%d: ", insn_idx); + if (env->log.level) { + verbose(env, "%d: ", insn_idx); print_bpf_insn(env, insn); } @@ -3795,7 +3823,7 @@ static int do_check(struct bpf_verifier_env *env) * src_reg == stack|map in some other branch. * Reject it. 
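* (ctx accesses are later rewritten by convert_ctx_accesses() into * offsets that are only valid for that pointer type, so one insn * cannot be patched for two different pointer types at once.)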
*/ - verbose("same insn cannot be used with different pointers\n"); + verbose(env, "same insn cannot be used with different pointers\n"); return -EINVAL; } @@ -3835,14 +3863,14 @@ static int do_check(struct bpf_verifier_env *env) } else if (dst_reg_type != *prev_dst_type && (dst_reg_type == PTR_TO_CTX || *prev_dst_type == PTR_TO_CTX)) { - verbose("same insn cannot be used with different pointers\n"); + verbose(env, "same insn cannot be used with different pointers\n"); return -EINVAL; } } else if (class == BPF_ST) { if (BPF_MODE(insn->code) != BPF_MEM || insn->src_reg != BPF_REG_0) { - verbose("BPF_ST uses reserved fields\n"); + verbose(env, "BPF_ST uses reserved fields\n"); return -EINVAL; } /* check src operand */ @@ -3865,7 +3893,7 @@ static int do_check(struct bpf_verifier_env *env) insn->off != 0 || insn->src_reg != BPF_REG_0 || insn->dst_reg != BPF_REG_0) { - verbose("BPF_CALL uses reserved fields\n"); + verbose(env, "BPF_CALL uses reserved fields\n"); return -EINVAL; } @@ -3878,7 +3906,7 @@ static int do_check(struct bpf_verifier_env *env) insn->imm != 0 || insn->src_reg != BPF_REG_0 || insn->dst_reg != BPF_REG_0) { - verbose("BPF_JA uses reserved fields\n"); + verbose(env, "BPF_JA uses reserved fields\n"); return -EINVAL; } @@ -3890,7 +3918,7 @@ static int do_check(struct bpf_verifier_env *env) insn->imm != 0 || insn->src_reg != BPF_REG_0 || insn->dst_reg != BPF_REG_0) { - verbose("BPF_EXIT uses reserved fields\n"); + verbose(env, "BPF_EXIT uses reserved fields\n"); return -EINVAL; } @@ -3905,7 +3933,7 @@ static int do_check(struct bpf_verifier_env *env) return err; if (is_pointer_value(env, BPF_REG_0)) { - verbose("R0 leaks addr as return value\n"); + verbose(env, "R0 leaks addr as return value\n"); return -EACCES; } @@ -3940,19 +3968,19 @@ process_bpf_exit: insn_idx++; } else { - verbose("invalid BPF_LD mode\n"); + verbose(env, "invalid BPF_LD mode\n"); return -EINVAL; } } else { - verbose("unknown insn class %d\n", class); + verbose(env, "unknown insn class %d\n", class); return -EINVAL; } insn_idx++; } - verbose("processed %d insns, stack depth %d\n", - insn_processed, env->prog->aux->stack_depth); + verbose(env, "processed %d insns, stack depth %d\n", insn_processed, + env->prog->aux->stack_depth); return 0; } @@ -3964,7 +3992,8 @@ static int check_map_prealloc(struct bpf_map *map) !(map->map_flags & BPF_F_NO_PREALLOC); } -static int check_map_prog_compatibility(struct bpf_map *map, +static int check_map_prog_compatibility(struct bpf_verifier_env *env, + struct bpf_map *map, struct bpf_prog *prog) { @@ -3975,12 +4004,12 @@ static int check_map_prog_compatibility(struct bpf_map *map, */ if (prog->type == BPF_PROG_TYPE_PERF_EVENT) { if (!check_map_prealloc(map)) { - verbose("perf_event programs can only use preallocated hash map\n"); + verbose(env, "perf_event programs can only use preallocated hash map\n"); return -EINVAL; } if (map->inner_map_meta && !check_map_prealloc(map->inner_map_meta)) { - verbose("perf_event programs can only use preallocated inner hash map\n"); + verbose(env, "perf_event programs can only use preallocated inner hash map\n"); return -EINVAL; } } @@ -4003,14 +4032,14 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) for (i = 0; i < insn_cnt; i++, insn++) { if (BPF_CLASS(insn->code) == BPF_LDX && (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0)) { - verbose("BPF_LDX uses reserved fields\n"); + verbose(env, "BPF_LDX uses reserved fields\n"); return -EINVAL; } if (BPF_CLASS(insn->code) == BPF_STX && ((BPF_MODE(insn->code) != BPF_MEM 
&& BPF_MODE(insn->code) != BPF_XADD) || insn->imm != 0)) { - verbose("BPF_STX uses reserved fields\n"); + verbose(env, "BPF_STX uses reserved fields\n"); return -EINVAL; } @@ -4021,7 +4050,7 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) if (i == insn_cnt - 1 || insn[1].code != 0 || insn[1].dst_reg != 0 || insn[1].src_reg != 0 || insn[1].off != 0) { - verbose("invalid bpf_ld_imm64 insn\n"); + verbose(env, "invalid bpf_ld_imm64 insn\n"); return -EINVAL; } @@ -4030,19 +4059,20 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) goto next_insn; if (insn->src_reg != BPF_PSEUDO_MAP_FD) { - verbose("unrecognized bpf_ld_imm64 insn\n"); + verbose(env, + "unrecognized bpf_ld_imm64 insn\n"); return -EINVAL; } f = fdget(insn->imm); map = __bpf_map_get(f); if (IS_ERR(map)) { - verbose("fd %d is not pointing to valid bpf_map\n", + verbose(env, "fd %d is not pointing to valid bpf_map\n", insn->imm); return PTR_ERR(map); } - err = check_map_prog_compatibility(map, env->prog); + err = check_map_prog_compatibility(env, map, env->prog); if (err) { fdput(f); return err; @@ -4164,7 +4194,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) cnt = ops->gen_prologue(insn_buf, env->seen_direct_write, env->prog); if (cnt >= ARRAY_SIZE(insn_buf)) { - verbose("bpf verifier is misconfigured\n"); + verbose(env, "bpf verifier is misconfigured\n"); return -EINVAL; } else if (cnt) { new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt); @@ -4212,7 +4242,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) u8 size_code; if (type == BPF_WRITE) { - verbose("bpf verifier narrow ctx access misconfigured\n"); + verbose(env, "bpf verifier narrow ctx access misconfigured\n"); return -EINVAL; } @@ -4231,7 +4261,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) &target_size); if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf) || (ctx_field_size && !target_size)) { - verbose("bpf verifier is misconfigured\n"); + verbose(env, "bpf verifier is misconfigured\n"); return -EINVAL; } @@ -4313,7 +4343,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) cnt = map_ptr->ops->map_gen_lookup(map_ptr, insn_buf); if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { - verbose("bpf verifier is misconfigured\n"); + verbose(env, "bpf verifier is misconfigured\n"); return -EINVAL; } @@ -4357,7 +4387,8 @@ patch_call_imm: * programs to call them, must be real in-kernel functions */ if (!fn->func) { - verbose("kernel subsystem misconfigured func %s#%d\n", + verbose(env, + "kernel subsystem misconfigured func %s#%d\n", func_id_name(insn->imm), insn->imm); return -EFAULT; } @@ -4391,8 +4422,8 @@ static void free_states(struct bpf_verifier_env *env) int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) { - struct bpf_verifer_log *log = &verifier_log; struct bpf_verifier_env *env; + struct bpf_verifer_log *log; int ret = -EINVAL; /* 'struct bpf_verifier_env' can be global, but since it's not small, @@ -4401,6 +4432,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL); if (!env) return -ENOMEM; + log = &env->log; env->insn_aux_data = vzalloc(sizeof(struct bpf_insn_aux_data) * (*prog)->len); @@ -4419,7 +4451,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) log->level = attr->log_level; log->ubuf = (char __user *) (unsigned long) attr->log_buf; log->len_total = attr->log_size; - log->len_used = 0; ret = -EINVAL; /* log attributes have to be sane */ @@ -4431,8 +4462,6 @@ int 
bpf_check(struct bpf_prog **prog, union bpf_attr *attr) log->kbuf = vmalloc(log->len_total); if (!log->kbuf) goto err_unlock; - } else { - log->level = 0; } env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT); @@ -4543,8 +4572,6 @@ int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, /* grab the mutex to protect few globals used by verifier */ mutex_lock(&bpf_verifier_lock); - verifier_log.level = 0; - env->strict_alignment = false; if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) env->strict_alignment = true; -- cgit v1.2.3 From d66f2b91f95b56e31772b9faa0d036cd2e53cb02 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 9 Oct 2017 10:30:14 -0700 Subject: bpf: don't rely on the verifier lock for metadata_dst allocation bpf_skb_set_tunnel_*() functions require allocation of per-cpu metadata_dst. The allocation happens upon verification of the first program using those helpers. In preparation for removing the verifier lock, use cmpxchg() to make sure we only allocate the metadata_dsts once. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/net/dst_metadata.h | 1 + net/core/dst.c | 16 ++++++++++++++++ net/core/filter.c | 16 +++++++++------- 3 files changed, 26 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index 9fba2ebf6dda..87a0bb8d449f 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -87,6 +87,7 @@ static inline int skb_metadata_dst_cmp(const struct sk_buff *skb_a, void metadata_dst_free(struct metadata_dst *); struct metadata_dst *metadata_dst_alloc(u8 optslen, enum metadata_type type, gfp_t flags); +void metadata_dst_free_percpu(struct metadata_dst __percpu *md_dst); struct metadata_dst __percpu * metadata_dst_alloc_percpu(u8 optslen, enum metadata_type type, gfp_t flags); diff --git a/net/core/dst.c b/net/core/dst.c index a6c47da7d0f8..8b2eafac984d 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -322,3 +322,19 @@ metadata_dst_alloc_percpu(u8 optslen, enum metadata_type type, gfp_t flags) return md_dst; } EXPORT_SYMBOL_GPL(metadata_dst_alloc_percpu); + +void metadata_dst_free_percpu(struct metadata_dst __percpu *md_dst) +{ + int cpu; + +#ifdef CONFIG_DST_CACHE + for_each_possible_cpu(cpu) { + struct metadata_dst *one_md_dst = per_cpu_ptr(md_dst, cpu); + + if (one_md_dst->type == METADATA_IP_TUNNEL) + dst_cache_destroy(&one_md_dst->u.tun_info.dst_cache); + } +#endif + free_percpu(md_dst); +} +EXPORT_SYMBOL_GPL(metadata_dst_free_percpu); diff --git a/net/core/filter.c b/net/core/filter.c index b7e8caa1e790..140fa9f9c0f4 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include #include @@ -2987,14 +2988,15 @@ static const struct bpf_func_proto * bpf_get_skb_set_tunnel_proto(enum bpf_func_id which) { if (!md_dst) { - /* Race is not possible, since it's called from verifier - * that is holding verifier mutex. 
- */ - md_dst = metadata_dst_alloc_percpu(IP_TUNNEL_OPTS_MAX, - METADATA_IP_TUNNEL, - GFP_KERNEL); - if (!md_dst) + struct metadata_dst __percpu *tmp; + + tmp = metadata_dst_alloc_percpu(IP_TUNNEL_OPTS_MAX, + METADATA_IP_TUNNEL, + GFP_KERNEL); + if (!tmp) return NULL; + if (cmpxchg(&md_dst, NULL, tmp)) + metadata_dst_free_percpu(tmp); } switch (which) { -- cgit v1.2.3 From a2a7d5701052542cd2260e7659b12443e0a74733 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 9 Oct 2017 10:30:15 -0700 Subject: bpf: write back the verifier log buffer as it gets filled Verifier log buffer can be quite large (up to 16MB currently). As Eric Dumazet points out if we allow multiple verification requests to proceed simultaneously, malicious user may use the verifier as a way of allocating large amounts of unswappable memory to OOM the host. Switch to a strategy of allocating a smaller buffer (1024B) and writing it out into the user buffer after every print. While at it remove the old BUG_ON(). This is in preparation of the global verifier lock removal. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/bpf_verifier.h | 4 +++- kernel/bpf/verifier.c | 41 +++++++++++++++++++---------------------- 2 files changed, 22 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 5ddb9a626a51..f00ef751c1c5 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -115,9 +115,11 @@ struct bpf_insn_aux_data { #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ +#define BPF_VERIFIER_TMP_LOG_SIZE 1024 + struct bpf_verifer_log { u32 level; - char *kbuf; + char kbuf[BPF_VERIFIER_TMP_LOG_SIZE]; char __user *ubuf; u32 len_used; u32 len_total; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 274c6582ec39..2cdbcc4f8f6b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -165,15 +165,26 @@ static __printf(2, 3) void verbose(struct bpf_verifier_env *env, const char *fmt, ...) 
{ struct bpf_verifer_log *log = &env->log; + unsigned int n; va_list args; - if (!log->level || bpf_verifier_log_full(log)) + if (!log->level || !log->ubuf || bpf_verifier_log_full(log)) return; va_start(args, fmt); - log->len_used += vscnprintf(log->kbuf + log->len_used, - log->len_total - log->len_used, fmt, args); + n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args); va_end(args); + + WARN_ONCE(n >= BPF_VERIFIER_TMP_LOG_SIZE - 1, + "verifier log line truncated - local buffer too short\n"); + + n = min(log->len_total - log->len_used - 1, n); + log->kbuf[n] = '\0'; + + if (!copy_to_user(log->ubuf + log->len_used, log->kbuf, n + 1)) + log->len_used += n; + else + log->ubuf = NULL; } static bool type_is_pkt_pointer(enum bpf_reg_type type) @@ -4263,11 +4274,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) if (log->len_total < 128 || log->len_total > UINT_MAX >> 8 || !log->level || !log->ubuf) goto err_unlock; - - ret = -ENOMEM; - log->kbuf = vmalloc(log->len_total); - if (!log->kbuf) - goto err_unlock; } env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT); @@ -4304,18 +4310,11 @@ skip_full_check: if (ret == 0) ret = fixup_bpf_calls(env); - if (log->level && bpf_verifier_log_full(log)) { - BUG_ON(log->len_used >= log->len_total); - /* verifier log exceeded user supplied buffer */ + if (log->level && bpf_verifier_log_full(log)) ret = -ENOSPC; - /* fall through to return what was recorded */ - } - - /* copy verifier log back to user space including trailing zero */ - if (log->level && copy_to_user(log->ubuf, log->kbuf, - log->len_used + 1) != 0) { + if (log->level && !log->ubuf) { ret = -EFAULT; - goto free_log_buf; + goto err_release_maps; } if (ret == 0 && env->used_map_cnt) { @@ -4326,7 +4325,7 @@ skip_full_check: if (!env->prog->aux->used_maps) { ret = -ENOMEM; - goto free_log_buf; + goto err_release_maps; } memcpy(env->prog->aux->used_maps, env->used_maps, @@ -4339,9 +4338,7 @@ skip_full_check: convert_pseudo_ld_imm64(env); } -free_log_buf: - if (log->level) - vfree(log->kbuf); +err_release_maps: if (!env->prog->aux->used_maps) /* if we didn't copy map pointers into bpf_prog_info, release * them now. Otherwise free_bpf_prog_info() will release them. -- cgit v1.2.3 From b8226962b1c49c784aeddb9d2fafbf53dfdc2190 Mon Sep 17 00:00:00 2001 From: Eric Garver Date: Tue, 10 Oct 2017 16:54:44 -0400 Subject: openvswitch: add ct_clear action This adds a ct_clear action for clearing conntrack state. ct_clear is currently implemented in OVS userspace, but is not backed by an action in the kernel datapath. This is useful for flows that may modify a packet tuple after a ct lookup has already occurred. Signed-off-by: Eric Garver Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 2 ++ net/openvswitch/actions.c | 4 ++++ net/openvswitch/conntrack.c | 11 +++++++++++ net/openvswitch/conntrack.h | 7 +++++++ net/openvswitch/flow_netlink.c | 5 +++++ 5 files changed, 29 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index efdbfbfd3ee2..0cd6f8833147 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -807,6 +807,7 @@ struct ovs_action_push_eth { * packet. * @OVS_ACTION_ATTR_POP_ETH: Pop the outermost Ethernet header off the * packet. + * @OVS_ACTION_ATTR_CT_CLEAR: Clear conntrack state from the packet. * * Only a single header can be set with a single %OVS_ACTION_ATTR_SET. 
Not all * fields within a header are modifiable, e.g. the IPv4 protocol and fragment @@ -836,6 +837,7 @@ enum ovs_action_attr { OVS_ACTION_ATTR_TRUNC, /* u32 struct ovs_action_trunc. */ OVS_ACTION_ATTR_PUSH_ETH, /* struct ovs_action_push_eth. */ OVS_ACTION_ATTR_POP_ETH, /* No argument. */ + OVS_ACTION_ATTR_CT_CLEAR, /* No argument. */ __OVS_ACTION_ATTR_MAX, /* Nothing past this will be accepted * from userspace. */ diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index a54a556fcdb5..a551232daf61 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -1203,6 +1203,10 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, return err == -EINPROGRESS ? 0 : err; break; + case OVS_ACTION_ATTR_CT_CLEAR: + err = ovs_ct_clear(skb, key); + break; + case OVS_ACTION_ATTR_PUSH_ETH: err = push_eth(skb, key, nla_data(a)); break; diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index d558e882ca0c..fe861e2f0deb 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -1129,6 +1129,17 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb, return err; } +int ovs_ct_clear(struct sk_buff *skb, struct sw_flow_key *key) +{ + if (skb_nfct(skb)) { + nf_conntrack_put(skb_nfct(skb)); + nf_ct_set(skb, NULL, IP_CT_UNTRACKED); + ovs_ct_fill_key(skb, key); + } + + return 0; +} + static int ovs_ct_add_helper(struct ovs_conntrack_info *info, const char *name, const struct sw_flow_key *key, bool log) { diff --git a/net/openvswitch/conntrack.h b/net/openvswitch/conntrack.h index bc7efd1867ab..399dfdd2c4f9 100644 --- a/net/openvswitch/conntrack.h +++ b/net/openvswitch/conntrack.h @@ -30,6 +30,7 @@ int ovs_ct_action_to_attr(const struct ovs_conntrack_info *, struct sk_buff *); int ovs_ct_execute(struct net *, struct sk_buff *, struct sw_flow_key *, const struct ovs_conntrack_info *); +int ovs_ct_clear(struct sk_buff *skb, struct sw_flow_key *key); void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key); int ovs_ct_put_key(const struct sw_flow_key *swkey, @@ -73,6 +74,12 @@ static inline int ovs_ct_execute(struct net *net, struct sk_buff *skb, return -ENOTSUPP; } +static inline int ovs_ct_clear(struct sk_buff *skb, + struct sw_flow_key *key) +{ + return -ENOTSUPP; +} + static inline void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key) { diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index fc0ca9a89b8e..dc0d79092e74 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -76,6 +76,7 @@ static bool actions_may_change_flow(const struct nlattr *actions) break; case OVS_ACTION_ATTR_CT: + case OVS_ACTION_ATTR_CT_CLEAR: case OVS_ACTION_ATTR_HASH: case OVS_ACTION_ATTR_POP_ETH: case OVS_ACTION_ATTR_POP_MPLS: @@ -2528,6 +2529,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, [OVS_ACTION_ATTR_SAMPLE] = (u32)-1, [OVS_ACTION_ATTR_HASH] = sizeof(struct ovs_action_hash), [OVS_ACTION_ATTR_CT] = (u32)-1, + [OVS_ACTION_ATTR_CT_CLEAR] = 0, [OVS_ACTION_ATTR_TRUNC] = sizeof(struct ovs_action_trunc), [OVS_ACTION_ATTR_PUSH_ETH] = sizeof(struct ovs_action_push_eth), [OVS_ACTION_ATTR_POP_ETH] = 0, @@ -2669,6 +2671,9 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, skip_copy = true; break; + case OVS_ACTION_ATTR_CT_CLEAR: + break; + case OVS_ACTION_ATTR_PUSH_ETH: /* Disallow pushing an Ethernet header if one * is already present */ -- cgit v1.2.3 From 8c418b5b15747eda05d086e80fa0a767982fbf37 
Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 6 Oct 2017 11:53:32 +0200 Subject: fq: support filtering a given tin MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add to the FQ API a way to filter a given tin, in order to remove frames that fulfil certain criteria according to a filter function. This will be used by mac80211 to remove frames belonging to an AP VLAN interface that's being removed. Signed-off-by: Johannes Berg Acked-by: Toke Høiland-Jørgensen Signed-off-by: Johannes Berg --- include/net/fq.h | 7 +++++ include/net/fq_impl.h | 72 ++++++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 69 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/net/fq.h b/include/net/fq.h index 6d8521a30c5c..ac944a686840 100644 --- a/include/net/fq.h +++ b/include/net/fq.h @@ -90,6 +90,13 @@ typedef void fq_skb_free_t(struct fq *, struct fq_flow *, struct sk_buff *); +/* Return %true to filter (drop) the frame. */ +typedef bool fq_skb_filter_t(struct fq *, + struct fq_tin *, + struct fq_flow *, + struct sk_buff *, + void *); + typedef struct fq_flow *fq_flow_get_default_t(struct fq *, struct fq_tin *, int idx, diff --git a/include/net/fq_impl.h b/include/net/fq_impl.h index 4e6131cd3f43..8b237e4afee6 100644 --- a/include/net/fq_impl.h +++ b/include/net/fq_impl.h @@ -12,24 +12,22 @@ /* functions that are embedded into includer */ -static struct sk_buff *fq_flow_dequeue(struct fq *fq, - struct fq_flow *flow) +static void fq_adjust_removal(struct fq *fq, + struct fq_flow *flow, + struct sk_buff *skb) { struct fq_tin *tin = flow->tin; - struct fq_flow *i; - struct sk_buff *skb; - - lockdep_assert_held(&fq->lock); - - skb = __skb_dequeue(&flow->queue); - if (!skb) - return NULL; tin->backlog_bytes -= skb->len; tin->backlog_packets--; flow->backlog -= skb->len; fq->backlog--; fq->memory_usage -= skb->truesize; +} + +static void fq_rejigger_backlog(struct fq *fq, struct fq_flow *flow) +{ + struct fq_flow *i; if (flow->backlog == 0) { list_del_init(&flow->backlogchain); @@ -43,6 +41,21 @@ static struct sk_buff *fq_flow_dequeue(struct fq *fq, list_move_tail(&flow->backlogchain, &i->backlogchain); } +} + +static struct sk_buff *fq_flow_dequeue(struct fq *fq, + struct fq_flow *flow) +{ + struct sk_buff *skb; + + lockdep_assert_held(&fq->lock); + + skb = __skb_dequeue(&flow->queue); + if (!skb) + return NULL; + + fq_adjust_removal(fq, flow, skb); + fq_rejigger_backlog(fq, flow); return skb; } @@ -188,6 +201,45 @@ static void fq_tin_enqueue(struct fq *fq, } } +static void fq_flow_filter(struct fq *fq, + struct fq_flow *flow, + fq_skb_filter_t filter_func, + void *filter_data, + fq_skb_free_t free_func) +{ + struct fq_tin *tin = flow->tin; + struct sk_buff *skb, *tmp; + + lockdep_assert_held(&fq->lock); + + skb_queue_walk_safe(&flow->queue, skb, tmp) { + if (!filter_func(fq, tin, flow, skb, filter_data)) + continue; + + __skb_unlink(skb, &flow->queue); + fq_adjust_removal(fq, flow, skb); + free_func(fq, tin, flow, skb); + } + + fq_rejigger_backlog(fq, flow); +} + +static void fq_tin_filter(struct fq *fq, + struct fq_tin *tin, + fq_skb_filter_t filter_func, + void *filter_data, + fq_skb_free_t free_func) +{ + struct fq_flow *flow; + + lockdep_assert_held(&fq->lock); + + list_for_each_entry(flow, &tin->new_flows, flowchain) + fq_flow_filter(fq, flow, filter_func, filter_data, free_func); + list_for_each_entry(flow, &tin->old_flows, flowchain) + fq_flow_filter(fq, flow, filter_func, filter_data, free_func); +} + static void 
fq_flow_reset(struct fq *fq, struct fq_flow *flow, fq_skb_free_t free_func) -- cgit v1.2.3 From 1ea4ff3e9f0b8d53e680a2bb9e8e644bf03aeb4d Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 13 Sep 2017 16:07:22 +0200 Subject: cfg80211: support reloading regulatory database If the regulatory database is loaded, and then updated, it may be necessary to reload it. Add an nl80211 command to do this. Note that this just reloads the database, it doesn't re-apply the rules from it immediately. Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 4 +++ net/wireless/nl80211.c | 11 ++++++ net/wireless/reg.c | 80 +++++++++++++++++++++++++++++++++----------- net/wireless/reg.h | 6 ++++ 4 files changed, 81 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 95832ce03a44..f882fe1f9709 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -990,6 +990,8 @@ * &NL80211_CMD_CONNECT or &NL80211_CMD_ROAM. If the 4 way handshake failed * &NL80211_CMD_DISCONNECT should be indicated instead. * + * @NL80211_CMD_RELOAD_REGDB: Request that the regdb firmware file is reloaded. + * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use */ @@ -1194,6 +1196,8 @@ enum nl80211_commands { NL80211_CMD_PORT_AUTHORIZED, + NL80211_CMD_RELOAD_REGDB, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 5129342151e6..67a03f2885a4 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -5678,6 +5678,11 @@ static int nl80211_req_set_reg(struct sk_buff *skb, struct genl_info *info) } } +static int nl80211_reload_regdb(struct sk_buff *skb, struct genl_info *info) +{ + return reg_reload_regdb(); +} + static int nl80211_get_mesh_config(struct sk_buff *skb, struct genl_info *info) { @@ -12708,6 +12713,12 @@ static const struct genl_ops nl80211_ops[] = { .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, }, + { + .cmd = NL80211_CMD_RELOAD_REGDB, + .doit = nl80211_reload_regdb, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + }, { .cmd = NL80211_CMD_GET_MESH_CONFIG, .doit = nl80211_get_mesh_config, diff --git a/net/wireless/reg.c b/net/wireless/reg.c index e9aeb05aaf3e..180addda52af 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -781,6 +781,8 @@ static int query_regdb(const char *alpha2) const struct fwdb_header *hdr = regdb; const struct fwdb_country *country; + ASSERT_RTNL(); + if (IS_ERR(regdb)) return PTR_ERR(regdb); @@ -796,41 +798,47 @@ static int query_regdb(const char *alpha2) static void regdb_fw_cb(const struct firmware *fw, void *context) { + int set_error = 0; + bool restore = true; void *db; if (!fw) { pr_info("failed to load regulatory.db\n"); - regdb = ERR_PTR(-ENODATA); - goto restore; - } - - if (!valid_regdb(fw->data, fw->size)) { + set_error = -ENODATA; + } else if (!valid_regdb(fw->data, fw->size)) { pr_info("loaded regulatory.db is malformed\n"); - release_firmware(fw); - regdb = ERR_PTR(-EINVAL); - goto restore; + set_error = -EINVAL; } - db = kmemdup(fw->data, fw->size, GFP_KERNEL); - release_firmware(fw); + rtnl_lock(); + if (WARN_ON(regdb && !IS_ERR(regdb))) { + /* just restore and free new db */ + } else if (set_error) { + regdb = ERR_PTR(set_error); + } else if (fw) { + db = kmemdup(fw->data, fw->size, GFP_KERNEL); + if (db) { + regdb = db; + restore = context && query_regdb(context); + } else { + restore = true; + } + } - if (!db) - 
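/* Summary of the reworked regdb_fw_cb() control flow: the firmware blob
 * is validated before RTNL is taken; under RTNL the callback then either
 * leaves an already-installed database alone (the WARN case), records
 * the error pointer, or installs the kmemdup() copy and re-runs the
 * query for the pending alpha2. When nothing could be installed or the
 * query fails, the previous regulatory settings are restored, and the
 * request context and firmware are released unconditionally at the end.
 */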
goto restore; - regdb = db; + if (restore) + restore_regulatory_settings(true); - if (query_regdb(context)) - goto restore; - goto free; - restore: - rtnl_lock(); - restore_regulatory_settings(true); rtnl_unlock(); - free: + kfree(context); + + release_firmware(fw); } static int query_regdb_file(const char *alpha2) { + ASSERT_RTNL(); + if (regdb) return query_regdb(alpha2); @@ -843,6 +851,38 @@ static int query_regdb_file(const char *alpha2) (void *)alpha2, regdb_fw_cb); } +int reg_reload_regdb(void) +{ + const struct firmware *fw; + void *db; + int err; + + err = request_firmware(&fw, "regulatory.db", ®_pdev->dev); + if (err) + return err; + + if (!valid_regdb(fw->data, fw->size)) { + err = -ENODATA; + goto out; + } + + db = kmemdup(fw->data, fw->size, GFP_KERNEL); + if (!db) { + err = -ENOMEM; + goto out; + } + + rtnl_lock(); + if (!IS_ERR_OR_NULL(regdb)) + kfree(regdb); + regdb = db; + rtnl_unlock(); + + out: + release_firmware(fw); + return err; +} + static bool reg_query_database(struct regulatory_request *request) { /* query internal regulatory database (if it exists) */ diff --git a/net/wireless/reg.h b/net/wireless/reg.h index ca7fedf2e7a1..9529c522611a 100644 --- a/net/wireless/reg.h +++ b/net/wireless/reg.h @@ -179,4 +179,10 @@ void regulatory_propagate_dfs_state(struct wiphy *wiphy, * @wiphy2 - wiphy it's dfs_region to be checked against that of wiphy1 */ bool reg_dfs_domain_same(struct wiphy *wiphy1, struct wiphy *wiphy2); + +/** + * reg_reload_regdb - reload the regulatory.db firmware file + */ +int reg_reload_regdb(void); + #endif /* __NET_WIRELESS_REG_H */ -- cgit v1.2.3 From 4a269818a7eb8577d32d8b2879099c689ddbd856 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 11 Oct 2017 13:27:29 -0700 Subject: tcp: fix tcp_unlink_write_queue() Yury reported crash with this signature : [ 554.034021] [] 0xffff80003ccd5a58 [ 554.034156] [] skb_release_all+0x14/0x30 [ 554.034288] [] __kfree_skb+0x14/0x28 [ 554.034409] [] tcp_sendmsg_locked+0x4dc/0xcc8 [ 554.034541] [] tcp_sendmsg+0x34/0x58 [ 554.034659] [] inet_sendmsg+0x2c/0xf8 [ 554.034783] [] sock_sendmsg+0x18/0x30 [ 554.034928] [] SyS_sendto+0x84/0xf8 Problem is that skb->destructor contains garbage, and this is because I accidentally removed tcp_skb_tsorted_anchor_cleanup() from tcp_unlink_write_queue() This would trigger with a write(fd, , len) attempt, and we will add to packetdrill this capability to avoid future regressions. Fixes: 75c119afe14f ("tcp: implement rb-tree based retransmit queue") Reported-by: Yury Norov Tested-by: Yury Norov Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 5a95e5886b55..15163454174b 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1712,6 +1712,7 @@ static inline void tcp_insert_write_queue_before(struct sk_buff *new, static inline void tcp_unlink_write_queue(struct sk_buff *skb, struct sock *sk) { + tcp_skb_tsorted_anchor_cleanup(skb); __skb_unlink(skb, &sk->sk_write_queue); } -- cgit v1.2.3 From 28978713c51b0a70acf748f76f9d6d2d20dcf980 Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Tue, 10 Oct 2017 23:45:18 -0700 Subject: net: qrtr: Move constants to header file The constants are used by both the name server and clients, so clarify their value and move them to the uapi header. Signed-off-by: Bjorn Andersson Signed-off-by: David S. 
Miller --- include/uapi/linux/qrtr.h | 3 +++ net/qrtr/qrtr.c | 2 -- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/qrtr.h b/include/uapi/linux/qrtr.h index 9d76c566f66e..63e8803e4d90 100644 --- a/include/uapi/linux/qrtr.h +++ b/include/uapi/linux/qrtr.h @@ -4,6 +4,9 @@ #include #include +#define QRTR_NODE_BCAST 0xffffffffu +#define QRTR_PORT_CTRL 0xfffffffeu + struct sockaddr_qrtr { __kernel_sa_family_t sq_family; __u32 sq_node; diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c index 7e4b49a8349e..15981abc042c 100644 --- a/net/qrtr/qrtr.c +++ b/net/qrtr/qrtr.c @@ -61,8 +61,6 @@ struct qrtr_hdr { } __packed; #define QRTR_HDR_SIZE sizeof(struct qrtr_hdr) -#define QRTR_NODE_BCAST ((unsigned int)-1) -#define QRTR_PORT_CTRL ((unsigned int)-2) struct qrtr_sock { /* WARNING: sk must be the first member */ -- cgit v1.2.3 From da7653f0faabbe45eb2d3fd6e4b400fe003e81ae Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Tue, 10 Oct 2017 23:45:19 -0700 Subject: net: qrtr: Add control packet definition to uapi The QMUX protocol specification defines structure of the special control packet messages being sent between handlers of the control port. Add these to the uapi header, as this structure and the associated types are shared between the kernel and all userspace handlers of control messages. Signed-off-by: Bjorn Andersson Signed-off-by: David S. Miller --- include/uapi/linux/qrtr.h | 32 ++++++++++++++++++++++++++++++++ net/qrtr/qrtr.c | 12 ------------ 2 files changed, 32 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/qrtr.h b/include/uapi/linux/qrtr.h index 63e8803e4d90..179af64846e0 100644 --- a/include/uapi/linux/qrtr.h +++ b/include/uapi/linux/qrtr.h @@ -13,4 +13,36 @@ struct sockaddr_qrtr { __u32 sq_port; }; +enum qrtr_pkt_type { + QRTR_TYPE_DATA = 1, + QRTR_TYPE_HELLO = 2, + QRTR_TYPE_BYE = 3, + QRTR_TYPE_NEW_SERVER = 4, + QRTR_TYPE_DEL_SERVER = 5, + QRTR_TYPE_DEL_CLIENT = 6, + QRTR_TYPE_RESUME_TX = 7, + QRTR_TYPE_EXIT = 8, + QRTR_TYPE_PING = 9, + QRTR_TYPE_NEW_LOOKUP = 10, + QRTR_TYPE_DEL_LOOKUP = 11, +}; + +struct qrtr_ctrl_pkt { + __le32 cmd; + + union { + struct { + __le32 service; + __le32 instance; + __le32 node; + __le32 port; + } server; + + struct { + __le32 node; + __le32 port; + } client; + }; +} __packed; + #endif /* _LINUX_QRTR_H */ diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c index 15981abc042c..d85ca7170b8f 100644 --- a/net/qrtr/qrtr.c +++ b/net/qrtr/qrtr.c @@ -26,18 +26,6 @@ #define QRTR_MIN_EPH_SOCKET 0x4000 #define QRTR_MAX_EPH_SOCKET 0x7fff -enum qrtr_pkt_type { - QRTR_TYPE_DATA = 1, - QRTR_TYPE_HELLO = 2, - QRTR_TYPE_BYE = 3, - QRTR_TYPE_NEW_SERVER = 4, - QRTR_TYPE_DEL_SERVER = 5, - QRTR_TYPE_DEL_CLIENT = 6, - QRTR_TYPE_RESUME_TX = 7, - QRTR_TYPE_EXIT = 8, - QRTR_TYPE_PING = 9, -}; - /** * struct qrtr_hdr - (I|R)PCrouter packet header * @version: protocol version -- cgit v1.2.3 From 843e79d05addd8eb06992cd6dfafc7b9d53f2bc8 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 11 Oct 2017 09:41:07 +0200 Subject: net: sched: make tc_action_ops->get_dev return dev and avoid passing net Return dev directly, NULL if not possible. That is enough. Makes no sense to pass struct net * to get_dev op, as there is only one net possible, the one the action was created in. So just store it in mirred priv and use directly. Rename the mirred op callback function. Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/net/act_api.h | 3 +-- include/net/tc_act/tc_mirred.h | 1 + net/sched/act_mirred.c | 13 +++++-------- net/sched/cls_api.c | 6 ++---- 4 files changed, 9 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/net/act_api.h b/include/net/act_api.h index b944e0eb93be..900168a9901e 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -93,8 +93,7 @@ struct tc_action_ops { int (*walk)(struct net *, struct sk_buff *, struct netlink_callback *, int, const struct tc_action_ops *); void (*stats_update)(struct tc_action *, u64, u32, u64); - int (*get_dev)(const struct tc_action *a, struct net *net, - struct net_device **mirred_dev); + struct net_device *(*get_dev)(const struct tc_action *a); }; struct tc_action_net { diff --git a/include/net/tc_act/tc_mirred.h b/include/net/tc_act/tc_mirred.h index 604bc31e23ab..21a656569840 100644 --- a/include/net/tc_act/tc_mirred.h +++ b/include/net/tc_act/tc_mirred.h @@ -10,6 +10,7 @@ struct tcf_mirred { int tcfm_ifindex; bool tcfm_mac_header_xmit; struct net_device __rcu *tcfm_dev; + struct net *net; struct list_head tcfm_list; }; #define to_mirred(a) ((struct tcf_mirred *)a) diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 416627c66f08..8b3e59388480 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -140,6 +140,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, m->tcfm_eaction = parm->eaction; if (dev != NULL) { m->tcfm_ifindex = parm->ifindex; + m->net = net; if (ret != ACT_P_CREATED) dev_put(rcu_dereference_protected(m->tcfm_dev, 1)); dev_hold(dev); @@ -313,15 +314,11 @@ static struct notifier_block mirred_device_notifier = { .notifier_call = mirred_device_event, }; -static int tcf_mirred_device(const struct tc_action *a, struct net *net, - struct net_device **mirred_dev) +static struct net_device *tcf_mirred_get_dev(const struct tc_action *a) { - int ifindex = tcf_mirred_ifindex(a); + struct tcf_mirred *m = to_mirred(a); - *mirred_dev = __dev_get_by_index(net, ifindex); - if (!*mirred_dev) - return -EINVAL; - return 0; + return __dev_get_by_index(m->net, m->tcfm_ifindex); } static struct tc_action_ops act_mirred_ops = { @@ -336,7 +333,7 @@ static struct tc_action_ops act_mirred_ops = { .walk = tcf_mirred_walker, .lookup = tcf_mirred_search, .size = sizeof(struct tcf_mirred), - .get_dev = tcf_mirred_device, + .get_dev = tcf_mirred_get_dev, }; static __net_init int mirred_init_net(struct net *net) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 0b2219adf520..450873b0c4b9 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1016,10 +1016,8 @@ int tcf_exts_get_dev(struct net_device *dev, struct tcf_exts *exts, tcf_exts_to_list(exts, &actions); list_for_each_entry(a, &actions, list) { - if (a->ops->get_dev) { - a->ops->get_dev(a, dev_net(dev), hw_dev); - break; - } + if (a->ops->get_dev) + *hw_dev = a->ops->get_dev(a); } if (*hw_dev) return 0; -- cgit v1.2.3 From b3f55bdda8df55a563005e00b1b71212d8546541 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 11 Oct 2017 09:41:08 +0200 Subject: net: sched: introduce per-egress action device callbacks Introduce infrastructure that allows drivers to register callbacks that are called whenever tc would offload inserted rule and specified device acts as tc action egress device. Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/net/act_api.h | 34 ++++++++ include/net/pkt_cls.h | 2 + net/sched/act_api.c | 220 ++++++++++++++++++++++++++++++++++++++++++++++++++ net/sched/cls_api.c | 30 +++++++ 4 files changed, 286 insertions(+) (limited to 'include') diff --git a/include/net/act_api.h b/include/net/act_api.h index 900168a9901e..f5e8c9048fb0 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -174,4 +174,38 @@ static inline void tcf_action_stats_update(struct tc_action *a, u64 bytes, #endif } +typedef int tc_setup_cb_t(enum tc_setup_type type, + void *type_data, void *cb_priv); + +#ifdef CONFIG_NET_CLS_ACT +int tc_setup_cb_egdev_register(const struct net_device *dev, + tc_setup_cb_t *cb, void *cb_priv); +void tc_setup_cb_egdev_unregister(const struct net_device *dev, + tc_setup_cb_t *cb, void *cb_priv); +int tc_setup_cb_egdev_call(const struct net_device *dev, + enum tc_setup_type type, void *type_data, + bool err_stop); +#else +static inline +int tc_setup_cb_egdev_register(const struct net_device *dev, + tc_setup_cb_t *cb, void *cb_priv) +{ + return 0; +} + +static inline +void tc_setup_cb_egdev_unregister(const struct net_device *dev, + tc_setup_cb_t *cb, void *cb_priv) +{ +} + +static inline +int tc_setup_cb_egdev_call(const struct net_device *dev, + enum tc_setup_type type, void *type_data, + bool err_stop) +{ + return 0; +} +#endif + #endif diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index e80edd8879ef..6f8149c82571 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -206,6 +206,8 @@ int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts); int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts); int tcf_exts_get_dev(struct net_device *dev, struct tcf_exts *exts, struct net_device **hw_dev); +int tcf_exts_egdev_cb_call(struct tcf_exts *exts, enum tc_setup_type type, + void *type_data, bool err_stop); /** * struct tcf_pkt_info - packet information diff --git a/net/sched/act_api.c b/net/sched/act_api.c index da6fa82c98a8..ac97db92ab68 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -21,6 +21,8 @@ #include #include #include +#include +#include #include #include #include @@ -1249,8 +1251,226 @@ out_module_put: return skb->len; } +struct tcf_action_net { + struct rhashtable egdev_ht; +}; + +static unsigned int tcf_action_net_id; + +struct tcf_action_egdev_cb { + struct list_head list; + tc_setup_cb_t *cb; + void *cb_priv; +}; + +struct tcf_action_egdev { + struct rhash_head ht_node; + const struct net_device *dev; + unsigned int refcnt; + struct list_head cb_list; +}; + +static const struct rhashtable_params tcf_action_egdev_ht_params = { + .key_offset = offsetof(struct tcf_action_egdev, dev), + .head_offset = offsetof(struct tcf_action_egdev, ht_node), + .key_len = sizeof(const struct net_device *), +}; + +static struct tcf_action_egdev * +tcf_action_egdev_lookup(const struct net_device *dev) +{ + struct net *net = dev_net(dev); + struct tcf_action_net *tan = net_generic(net, tcf_action_net_id); + + return rhashtable_lookup_fast(&tan->egdev_ht, &dev, + tcf_action_egdev_ht_params); +} + +static struct tcf_action_egdev * +tcf_action_egdev_get(const struct net_device *dev) +{ + struct tcf_action_egdev *egdev; + struct tcf_action_net *tan; + + egdev = tcf_action_egdev_lookup(dev); + if (egdev) + goto inc_ref; + + egdev = kzalloc(sizeof(*egdev), GFP_KERNEL); + if (!egdev) + return NULL; + INIT_LIST_HEAD(&egdev->cb_list); + tan = net_generic(dev_net(dev), tcf_action_net_id); + rhashtable_insert_fast(&tan->egdev_ht, 
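/* Registration and unregistration both run under RTNL (see the exported
 * tc_setup_cb_egdev_register()/unregister() wrappers below), so the
 * refcnt here needs no extra locking; the entry is removed from the
 * hashtable and freed once the last callback for the device goes away.
 */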
&egdev->ht_node, + tcf_action_egdev_ht_params); + +inc_ref: + egdev->refcnt++; + return egdev; +} + +static void tcf_action_egdev_put(struct tcf_action_egdev *egdev) +{ + struct tcf_action_net *tan; + + if (--egdev->refcnt) + return; + tan = net_generic(dev_net(egdev->dev), tcf_action_net_id); + rhashtable_remove_fast(&tan->egdev_ht, &egdev->ht_node, + tcf_action_egdev_ht_params); + kfree(egdev); +} + +static struct tcf_action_egdev_cb * +tcf_action_egdev_cb_lookup(struct tcf_action_egdev *egdev, + tc_setup_cb_t *cb, void *cb_priv) +{ + struct tcf_action_egdev_cb *egdev_cb; + + list_for_each_entry(egdev_cb, &egdev->cb_list, list) + if (egdev_cb->cb == cb && egdev_cb->cb_priv == cb_priv) + return egdev_cb; + return NULL; +} + +static int tcf_action_egdev_cb_call(struct tcf_action_egdev *egdev, + enum tc_setup_type type, + void *type_data, bool err_stop) +{ + struct tcf_action_egdev_cb *egdev_cb; + int ok_count = 0; + int err; + + list_for_each_entry(egdev_cb, &egdev->cb_list, list) { + err = egdev_cb->cb(type, type_data, egdev_cb->cb_priv); + if (err) { + if (err_stop) + return err; + } else { + ok_count++; + } + } + return ok_count; +} + +static int tcf_action_egdev_cb_add(struct tcf_action_egdev *egdev, + tc_setup_cb_t *cb, void *cb_priv) +{ + struct tcf_action_egdev_cb *egdev_cb; + + egdev_cb = tcf_action_egdev_cb_lookup(egdev, cb, cb_priv); + if (WARN_ON(egdev_cb)) + return -EEXIST; + egdev_cb = kzalloc(sizeof(*egdev_cb), GFP_KERNEL); + if (!egdev_cb) + return -ENOMEM; + egdev_cb->cb = cb; + egdev_cb->cb_priv = cb_priv; + list_add(&egdev_cb->list, &egdev->cb_list); + return 0; +} + +static void tcf_action_egdev_cb_del(struct tcf_action_egdev *egdev, + tc_setup_cb_t *cb, void *cb_priv) +{ + struct tcf_action_egdev_cb *egdev_cb; + + egdev_cb = tcf_action_egdev_cb_lookup(egdev, cb, cb_priv); + if (WARN_ON(!egdev_cb)) + return; + list_del(&egdev_cb->list); + kfree(egdev_cb); +} + +static int __tc_setup_cb_egdev_register(const struct net_device *dev, + tc_setup_cb_t *cb, void *cb_priv) +{ + struct tcf_action_egdev *egdev = tcf_action_egdev_get(dev); + int err; + + if (!egdev) + return -ENOMEM; + err = tcf_action_egdev_cb_add(egdev, cb, cb_priv); + if (err) + goto err_cb_add; + return 0; + +err_cb_add: + tcf_action_egdev_put(egdev); + return err; +} +int tc_setup_cb_egdev_register(const struct net_device *dev, + tc_setup_cb_t *cb, void *cb_priv) +{ + int err; + + rtnl_lock(); + err = __tc_setup_cb_egdev_register(dev, cb, cb_priv); + rtnl_unlock(); + return err; +} +EXPORT_SYMBOL_GPL(tc_setup_cb_egdev_register); + +static void __tc_setup_cb_egdev_unregister(const struct net_device *dev, + tc_setup_cb_t *cb, void *cb_priv) +{ + struct tcf_action_egdev *egdev = tcf_action_egdev_lookup(dev); + + if (WARN_ON(!egdev)) + return; + tcf_action_egdev_cb_del(egdev, cb, cb_priv); + tcf_action_egdev_put(egdev); +} +void tc_setup_cb_egdev_unregister(const struct net_device *dev, + tc_setup_cb_t *cb, void *cb_priv) +{ + rtnl_lock(); + __tc_setup_cb_egdev_unregister(dev, cb, cb_priv); + rtnl_unlock(); +} +EXPORT_SYMBOL_GPL(tc_setup_cb_egdev_unregister); + +int tc_setup_cb_egdev_call(const struct net_device *dev, + enum tc_setup_type type, void *type_data, + bool err_stop) +{ + struct tcf_action_egdev *egdev = tcf_action_egdev_lookup(dev); + + if (!egdev) + return 0; + return tcf_action_egdev_cb_call(egdev, type, type_data, err_stop); +} +EXPORT_SYMBOL_GPL(tc_setup_cb_egdev_call); + +static __net_init int tcf_action_net_init(struct net *net) +{ + struct tcf_action_net *tan = net_generic(net, 
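/* A hedged usage sketch (driver and function names invented; the mlx5
 * representor conversion further down is the first real user): a driver
 * that can offload rules on behalf of an egress device registers a
 * callback and later receives TC_SETUP_* requests through
 * tc_setup_cb_egdev_call():
 *
 *	static int foo_setup_tc_cb(enum tc_setup_type type,
 *				   void *type_data, void *cb_priv)
 *	{
 *		struct foo_priv *priv = cb_priv;
 *
 *		if (type != TC_SETUP_CLSFLOWER)
 *			return -EOPNOTSUPP;
 *		return foo_offload_flower(priv, type_data);
 *	}
 *
 *	err = tc_setup_cb_egdev_register(netdev, foo_setup_tc_cb, priv);
 *	...
 *	tc_setup_cb_egdev_unregister(netdev, foo_setup_tc_cb, priv);
 */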
tcf_action_net_id); + + return rhashtable_init(&tan->egdev_ht, &tcf_action_egdev_ht_params); +} + +static void __net_exit tcf_action_net_exit(struct net *net) +{ + struct tcf_action_net *tan = net_generic(net, tcf_action_net_id); + + rhashtable_destroy(&tan->egdev_ht); +} + +static struct pernet_operations tcf_action_net_ops = { + .init = tcf_action_net_init, + .exit = tcf_action_net_exit, + .id = &tcf_action_net_id, + .size = sizeof(struct tcf_action_net), +}; + static int __init tc_action_init(void) { + int err; + + err = register_pernet_subsys(&tcf_action_net_ops); + if (err) + return err; + rtnl_register(PF_UNSPEC, RTM_NEWACTION, tc_ctl_action, NULL, 0); rtnl_register(PF_UNSPEC, RTM_DELACTION, tc_ctl_action, NULL, 0); rtnl_register(PF_UNSPEC, RTM_GETACTION, tc_ctl_action, tc_dump_action, diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 450873b0c4b9..99f9432f63cf 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1026,6 +1026,36 @@ int tcf_exts_get_dev(struct net_device *dev, struct tcf_exts *exts, } EXPORT_SYMBOL(tcf_exts_get_dev); +int tcf_exts_egdev_cb_call(struct tcf_exts *exts, enum tc_setup_type type, + void *type_data, bool err_stop) +{ + int ok_count = 0; +#ifdef CONFIG_NET_CLS_ACT + const struct tc_action *a; + struct net_device *dev; + LIST_HEAD(actions); + int ret; + + if (!tcf_exts_has_actions(exts)) + return 0; + + tcf_exts_to_list(exts, &actions); + list_for_each_entry(a, &actions, list) { + if (!a->ops->get_dev) + continue; + dev = a->ops->get_dev(a); + if (!dev || !tc_can_offload(dev)) + continue; + ret = tc_setup_cb_egdev_call(dev, type, type_data, err_stop); + if (ret < 0) + return ret; + ok_count += ret; + } +#endif + return ok_count; +} +EXPORT_SYMBOL(tcf_exts_egdev_cb_call); + static int __init tc_filter_init(void) { rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_ctl_tfilter, NULL, 0); -- cgit v1.2.3 From 717503b9cf57c0bb7ea4d3a9f5699c9a04adf988 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 11 Oct 2017 09:41:09 +0200 Subject: net: sched: convert cls_flower->egress_dev users to tc_setup_cb_egdev infra The only user of cls_flower->egress_dev is mlx5. So do the conversion there, along with the code originating the call in the cls_flower function fl_hw_replace_filter, to the newly introduced egress device callback infrastructure. Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 3 ++ drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 4 +- drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 31 +++++++---- include/net/pkt_cls.h | 5 +- net/sched/cls_api.c | 13 +++-- net/sched/cls_flower.c | 63 ++++++++++++----------- 6 files changed, 73 insertions(+), 46 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index cc13d3dbd366..5ec6d3e8dc89 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -1081,6 +1081,9 @@ int mlx5e_ethtool_get_ts_info(struct mlx5e_priv *priv, int mlx5e_ethtool_flash_device(struct mlx5e_priv *priv, struct ethtool_flash *flash); +int mlx5e_setup_tc(struct net_device *dev, enum tc_setup_type type, + void *type_data); + /* mlx5e generic netdev management API */ struct net_device* mlx5e_create_netdev(struct mlx5_core_dev *mdev, const struct mlx5e_profile *profile, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index cc11bbbd0309..2a32102e7648 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -3108,8 +3108,8 @@ static int mlx5e_setup_tc_cls_flower(struct net_device *dev, } #endif -static int mlx5e_setup_tc(struct net_device *dev, enum tc_setup_type type, - void *type_data) +int mlx5e_setup_tc(struct net_device *dev, enum tc_setup_type type, + void *type_data) { switch (type) { #ifdef CONFIG_MLX5_ESWITCH diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index 45e03c427faf..765fc74fbb1b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -667,14 +668,6 @@ mlx5e_rep_setup_tc_cls_flower(struct net_device *dev, cls_flower->common.chain_index) return -EOPNOTSUPP; - if (cls_flower->egress_dev) { - struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; - - dev = mlx5_eswitch_get_uplink_netdev(esw); - return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSFLOWER, - cls_flower); - } - switch (cls_flower->command) { case TC_CLSFLOWER_REPLACE: return mlx5e_configure_flower(priv, cls_flower); @@ -698,6 +691,14 @@ static int mlx5e_rep_setup_tc(struct net_device *dev, enum tc_setup_type type, } } +static int mlx5e_rep_setup_tc_cb(enum tc_setup_type type, void *type_data, + void *cb_priv) +{ + struct net_device *dev = cb_priv; + + return mlx5e_setup_tc(dev, type, type_data); +} + bool mlx5e_is_uplink_rep(struct mlx5e_priv *priv) { struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; @@ -1017,15 +1018,24 @@ mlx5e_vport_rep_load(struct mlx5_eswitch *esw, struct mlx5_eswitch_rep *rep) goto err_detach_netdev; } + err = tc_setup_cb_egdev_register(netdev, mlx5e_rep_setup_tc_cb, + mlx5_eswitch_get_uplink_netdev(esw)); + if (err) + goto err_neigh_cleanup; + err = register_netdev(netdev); if (err) { pr_warn("Failed to register representor netdev for vport %d\n", rep->vport); - goto err_neigh_cleanup; + goto err_egdev_cleanup; } return 0; +err_egdev_cleanup: + tc_setup_cb_egdev_unregister(netdev, mlx5e_rep_setup_tc_cb, + mlx5_eswitch_get_uplink_netdev(esw)); + err_neigh_cleanup: mlx5e_rep_neigh_cleanup(rpriv); @@ -1047,7 +1057,8 @@ mlx5e_vport_rep_unload(struct mlx5_eswitch *esw, struct mlx5_eswitch_rep *rep) void *ppriv = priv->ppriv; 
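/* Teardown mirrors setup in reverse order: the representor netdev is
 * unregistered first, then the egdev callback (whose cb_priv is the
 * uplink netdev) is removed, and only then do the neigh cleanup and
 * netdev detach/destroy run.
 */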
unregister_netdev(rep->netdev); - + tc_setup_cb_egdev_unregister(netdev, mlx5e_rep_setup_tc_cb, + mlx5_eswitch_get_uplink_netdev(esw)); mlx5e_rep_neigh_cleanup(rpriv); mlx5e_detach_netdev(priv); mlx5e_destroy_netdev(priv); diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 6f8149c82571..c0bdf5cad727 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -206,8 +206,6 @@ int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts); int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts); int tcf_exts_get_dev(struct net_device *dev, struct tcf_exts *exts, struct net_device **hw_dev); -int tcf_exts_egdev_cb_call(struct tcf_exts *exts, enum tc_setup_type type, - void *type_data, bool err_stop); /** * struct tcf_pkt_info - packet information @@ -407,6 +405,9 @@ tcf_match_indev(struct sk_buff *skb, int ifindex) } #endif /* CONFIG_NET_CLS_IND */ +int tc_setup_cb_call(struct tcf_exts *exts, enum tc_setup_type type, + void *type_data, bool err_stop); + struct tc_cls_common_offload { u32 chain_index; __be16 protocol; diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 99f9432f63cf..51994a202585 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1026,8 +1026,9 @@ int tcf_exts_get_dev(struct net_device *dev, struct tcf_exts *exts, } EXPORT_SYMBOL(tcf_exts_get_dev); -int tcf_exts_egdev_cb_call(struct tcf_exts *exts, enum tc_setup_type type, - void *type_data, bool err_stop) +static int tc_exts_setup_cb_egdev_call(struct tcf_exts *exts, + enum tc_setup_type type, + void *type_data, bool err_stop) { int ok_count = 0; #ifdef CONFIG_NET_CLS_ACT @@ -1054,7 +1055,13 @@ int tcf_exts_egdev_cb_call(struct tcf_exts *exts, enum tc_setup_type type, #endif return ok_count; } -EXPORT_SYMBOL(tcf_exts_egdev_cb_call); + +int tc_setup_cb_call(struct tcf_exts *exts, enum tc_setup_type type, + void *type_data, bool err_stop) +{ + return tc_exts_setup_cb_egdev_call(exts, type, type_data, err_stop); +} +EXPORT_SYMBOL(tc_setup_cb_call); static int __init tc_filter_init(void) { diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index db831ac708f6..5b7bb968d1d4 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -88,7 +88,6 @@ struct cls_fl_filter { u32 handle; u32 flags; struct rcu_head rcu; - struct net_device *hw_dev; }; static unsigned short int fl_mask_range(const struct fl_flow_mask *mask) @@ -201,16 +200,17 @@ static void fl_destroy_filter(struct rcu_head *head) static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f) { struct tc_cls_flower_offload cls_flower = {}; - struct net_device *dev = f->hw_dev; - - if (!tc_can_offload(dev)) - return; + struct net_device *dev = tp->q->dev_queue->dev; tc_cls_common_offload_init(&cls_flower.common, tp); cls_flower.command = TC_CLSFLOWER_DESTROY; cls_flower.cookie = (unsigned long) f; - dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSFLOWER, &cls_flower); + if (tc_can_offload(dev)) + dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSFLOWER, + &cls_flower); + tc_setup_cb_call(&f->exts, TC_SETUP_CLSFLOWER, + &cls_flower, false); } static int fl_hw_replace_filter(struct tcf_proto *tp, @@ -220,20 +220,9 @@ static int fl_hw_replace_filter(struct tcf_proto *tp, { struct net_device *dev = tp->q->dev_queue->dev; struct tc_cls_flower_offload cls_flower = {}; + bool skip_sw = tc_skip_sw(f->flags); int err; - if (!tc_can_offload(dev)) { - if (tcf_exts_get_dev(dev, &f->exts, &f->hw_dev) || - (f->hw_dev && !tc_can_offload(f->hw_dev))) { - f->hw_dev = dev; - return tc_skip_sw(f->flags) ? 
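/* The rewritten replace path first offers the rule to the classifier's
 * own device via ndo_setup_tc() when that device can offload, then to
 * any registered egress-device callbacks via tc_setup_cb_call();
 * TCA_CLS_FLAGS_IN_HW is set if either path accepts the rule, and a
 * skip_sw rule fails with -EINVAL when neither did.
 */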
-EINVAL : 0; } dev = f->hw_dev; cls_flower.egress_dev = true; } else { f->hw_dev = dev; } - tc_cls_common_offload_init(&cls_flower.common, tp); cls_flower.command = TC_CLSFLOWER_REPLACE; cls_flower.cookie = (unsigned long) f; @@ -242,31 +231,47 @@ static int fl_hw_replace_filter(struct tcf_proto *tp, cls_flower.key = &f->mkey; cls_flower.exts = &f->exts; - err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSFLOWER, - &cls_flower); - if (!err) - f->flags |= TCA_CLS_FLAGS_IN_HW; + if (tc_can_offload(dev)) { + err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSFLOWER, + &cls_flower); + if (err) { + if (skip_sw) + return err; + } else { + f->flags |= TCA_CLS_FLAGS_IN_HW; + } + } - if (tc_skip_sw(f->flags)) + err = tc_setup_cb_call(&f->exts, TC_SETUP_CLSFLOWER, + &cls_flower, skip_sw); + if (err < 0) { + fl_hw_destroy_filter(tp, f); return err; + } else if (err > 0) { + f->flags |= TCA_CLS_FLAGS_IN_HW; + } + + if (skip_sw && !(f->flags & TCA_CLS_FLAGS_IN_HW)) + return -EINVAL; + + return 0; } static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f) { struct tc_cls_flower_offload cls_flower = {}; - struct net_device *dev = f->hw_dev; - - if (!tc_can_offload(dev)) - return; + struct net_device *dev = tp->q->dev_queue->dev; tc_cls_common_offload_init(&cls_flower.common, tp); cls_flower.command = TC_CLSFLOWER_STATS; cls_flower.cookie = (unsigned long) f; cls_flower.exts = &f->exts; - dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSFLOWER, - &cls_flower); + if (tc_can_offload(dev)) + dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSFLOWER, + &cls_flower); + tc_setup_cb_call(&f->exts, TC_SETUP_CLSFLOWER, + &cls_flower, false); } static void __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f) -- cgit v1.2.3 From 7578d7b45ed870b13a8ace57e32feaed623c2a94 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 11 Oct 2017 09:41:10 +0200 Subject: net: sched: remove unused tcf_exts_get_dev helper and cls_flower->egress_dev The helper and the struct field are no longer used by any code, so remove them. Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/net/pkt_cls.h | 3 --- net/sched/cls_api.c | 22 ---------------------- 2 files changed, 25 deletions(-) (limited to 'include') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index c0bdf5cad727..f5263743076b 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -204,8 +204,6 @@ void tcf_exts_destroy(struct tcf_exts *exts); void tcf_exts_change(struct tcf_exts *dst, struct tcf_exts *src); int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts); int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts); -int tcf_exts_get_dev(struct net_device *dev, struct tcf_exts *exts, - struct net_device **hw_dev); /** * struct tcf_pkt_info - packet information @@ -517,7 +515,6 @@ struct tc_cls_flower_offload { struct fl_flow_key *mask; struct fl_flow_key *key; struct tcf_exts *exts; - bool egress_dev; }; enum tc_matchall_command { diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 51994a202585..2977b8a90851 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1004,28 +1004,6 @@ int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts) } EXPORT_SYMBOL(tcf_exts_dump_stats); -int tcf_exts_get_dev(struct net_device *dev, struct tcf_exts *exts, - struct net_device **hw_dev) -{ -#ifdef CONFIG_NET_CLS_ACT - const struct tc_action *a; - LIST_HEAD(actions); - - if (!tcf_exts_has_actions(exts)) - return -EINVAL; - - tcf_exts_to_list(exts, &actions); - list_for_each_entry(a, &actions, list) { - if (a->ops->get_dev) - *hw_dev = a->ops->get_dev(a); - } - if (*hw_dev) - return 0; -#endif - return -EOPNOTSUPP; -} -EXPORT_SYMBOL(tcf_exts_get_dev); - static int tc_exts_setup_cb_egdev_call(struct tcf_exts *exts, enum tc_setup_type type, void *type_data, bool err_stop) -- cgit v1.2.3 From 437d2762ba07f0fc639d5a09acb323fe4106a61f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 11 Oct 2017 20:45:40 -0700 Subject: tcp: remove obsolete helpers Remove three inline helpers that are no longer needed. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 15163454174b..3b3b9b968e2d 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1629,18 +1629,6 @@ static inline struct sk_buff *tcp_write_queue_tail(const struct sock *sk) return skb_peek_tail(&sk->sk_write_queue); } -static inline struct sk_buff *tcp_write_queue_next(const struct sock *sk, - const struct sk_buff *skb) -{ - return skb_queue_next(&sk->sk_write_queue, skb); -} - -static inline struct sk_buff *tcp_write_queue_prev(const struct sock *sk, - const struct sk_buff *skb) -{ - return skb_queue_prev(&sk->sk_write_queue, skb); -} - #define tcp_for_write_queue_from_safe(skb, tmp, sk) \ skb_queue_walk_from_safe(&(sk)->sk_write_queue, skb, tmp) @@ -1697,11 +1685,6 @@ static inline void tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb } } -static inline void __tcp_add_write_queue_head(struct sock *sk, struct sk_buff *skb) -{ - __skb_queue_head(&sk->sk_write_queue, skb); -} - /* Insert new before skb on the write queue of sk. 
*/ static inline void tcp_insert_write_queue_before(struct sk_buff *new, struct sk_buff *skb, -- cgit v1.2.3 From 60724d4bae14cd295b27b1610cad9a2720eb0860 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 11 Oct 2017 10:57:48 -0700 Subject: net: dsa: Add support for DSA specific notifiers In preparation for communicating a given DSA network device's port number and switch index, create a specialized DSA notifier and two events: DSA_PORT_REGISTER and DSA_PORT_UNREGISTER that communicate: the slave network device (slave_dev), port number and switch number in the tree. This will be later used for network device drivers like bcmsysport which needs to cooperate with its DSA network devices to set-up queue mapping and scheduling. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 45 +++++++++++++++++++++++++++++++++++++++++++++ net/dsa/dsa.c | 23 +++++++++++++++++++++++ net/dsa/slave.c | 13 +++++++++++++ 3 files changed, 81 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 10dceccd9ce8..40a709a0754d 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -471,4 +471,49 @@ static inline int dsa_switch_resume(struct dsa_switch *ds) } #endif /* CONFIG_PM_SLEEP */ +enum dsa_notifier_type { + DSA_PORT_REGISTER, + DSA_PORT_UNREGISTER, +}; + +struct dsa_notifier_info { + struct net_device *dev; +}; + +struct dsa_notifier_register_info { + struct dsa_notifier_info info; /* must be first */ + struct net_device *master; + unsigned int port_number; + unsigned int switch_number; +}; + +static inline struct net_device * +dsa_notifier_info_to_dev(const struct dsa_notifier_info *info) +{ + return info->dev; +} + +#if IS_ENABLED(CONFIG_NET_DSA) +int register_dsa_notifier(struct notifier_block *nb); +int unregister_dsa_notifier(struct notifier_block *nb); +int call_dsa_notifiers(unsigned long val, struct net_device *dev, + struct dsa_notifier_info *info); +#else +static inline int register_dsa_notifier(struct notifier_block *nb) +{ + return 0; +} + +static inline int unregister_dsa_notifier(struct notifier_block *nb) +{ + return 0; +} + +static inline int call_dsa_notifiers(unsigned long val, struct net_device *dev, + struct dsa_notifier_info *info) +{ + return NOTIFY_DONE; +} +#endif + #endif diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 51ca2a524a27..832c659ff993 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -261,6 +262,28 @@ bool dsa_schedule_work(struct work_struct *work) return queue_work(dsa_owq, work); } +static ATOMIC_NOTIFIER_HEAD(dsa_notif_chain); + +int register_dsa_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_register(&dsa_notif_chain, nb); +} +EXPORT_SYMBOL_GPL(register_dsa_notifier); + +int unregister_dsa_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_unregister(&dsa_notif_chain, nb); +} +EXPORT_SYMBOL_GPL(unregister_dsa_notifier); + +int call_dsa_notifiers(unsigned long val, struct net_device *dev, + struct dsa_notifier_info *info) +{ + info->dev = dev; + return atomic_notifier_call_chain(&dsa_notif_chain, val, info); +} +EXPORT_SYMBOL_GPL(call_dsa_notifiers); + static int __init dsa_init_module(void) { int rc; diff --git a/net/dsa/slave.c b/net/dsa/slave.c index fb2954ff198c..45f4ea845c07 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1116,6 +1116,7 @@ int dsa_slave_resume(struct net_device *slave_dev) int dsa_slave_create(struct dsa_port *port, const char *name) { 
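/* A hedged sketch of a consumer of these events (handler and helper
 * names invented; the changelog names bcmsysport as the intended user):
 *
 *	static int foo_dsa_notifier(struct notifier_block *nb,
 *				    unsigned long event, void *ptr)
 *	{
 *		struct dsa_notifier_register_info *info = ptr;
 *
 *		if (event != DSA_PORT_REGISTER)
 *			return NOTIFY_DONE;
 *		foo_map_queue(info->master, info->port_number,
 *			      info->switch_number);
 *		return NOTIFY_OK;
 *	}
 *
 * registered once at probe time with register_dsa_notifier().
 */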
+ struct dsa_notifier_register_info rinfo = { }; struct dsa_switch *ds = port->ds; struct net_device *master; struct net_device *slave_dev; @@ -1177,6 +1178,12 @@ int dsa_slave_create(struct dsa_port *port, const char *name) goto out_free; } + rinfo.info.dev = slave_dev; + rinfo.master = master; + rinfo.port_number = p->dp->index; + rinfo.switch_number = p->dp->ds->index; + call_dsa_notifiers(DSA_PORT_REGISTER, slave_dev, &rinfo.info); + ret = register_netdev(slave_dev); if (ret) { netdev_err(master, "error %d registering interface %s\n", @@ -1200,6 +1207,7 @@ out_free: void dsa_slave_destroy(struct net_device *slave_dev) { struct dsa_slave_priv *p = netdev_priv(slave_dev); + struct dsa_notifier_register_info rinfo = { }; struct device_node *port_dn; port_dn = p->dp->dn; @@ -1211,6 +1219,11 @@ void dsa_slave_destroy(struct net_device *slave_dev) if (of_phy_is_fixed_link(port_dn)) of_phy_deregister_fixed_link(port_dn); } + rinfo.info.dev = slave_dev; + rinfo.master = p->dp->cpu_dp->netdev; + rinfo.port_number = p->dp->index; + rinfo.switch_number = p->dp->ds->index; + call_dsa_notifiers(DSA_PORT_UNREGISTER, slave_dev, &rinfo.info); unregister_netdev(slave_dev); free_percpu(p->stats64); free_netdev(slave_dev); -- cgit v1.2.3 From 0a5f14ce67a6e093e651d3cd75e6ac281123d93a Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 11 Oct 2017 10:57:49 -0700 Subject: net: dsa: tag_brcm: Indicate to master netdevice port + queue We need to tell the DSA master network device doing the actual transmission what the desired switch port and queue number is for it to resolve that to the internal transmit queue it is mapped to. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 5 +++++ net/dsa/tag_brcm.c | 6 ++++++ 2 files changed, 11 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 40a709a0754d..ce1d622734d7 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -516,4 +516,9 @@ static inline int call_dsa_notifiers(unsigned long val, struct net_device *dev, } #endif +/* Broadcom tag specific helpers to insert and extract queue/port number */ +#define BRCM_TAG_SET_PORT_QUEUE(p, q) ((p) << 8 | q) +#define BRCM_TAG_GET_PORT(v) ((v) >> 8) +#define BRCM_TAG_GET_QUEUE(v) ((v) & 0xff) + #endif diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c index 8e4bdb9d9ae3..cc4f472fbd77 100644 --- a/net/dsa/tag_brcm.c +++ b/net/dsa/tag_brcm.c @@ -86,6 +86,12 @@ static struct sk_buff *brcm_tag_xmit(struct sk_buff *skb, struct net_device *dev brcm_tag[2] = BRCM_IG_DSTMAP2_MASK; brcm_tag[3] = (1 << p->dp->index) & BRCM_IG_DSTMAP1_MASK; + /* Now tell the master network device about the desired output queue + * as well + */ + skb_set_queue_mapping(skb, BRCM_TAG_SET_PORT_QUEUE(p->dp->index, + queue)); + return skb; } -- cgit v1.2.3 From ad2d116c5242875bba27522682ec5ba7f0df75f0 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 11 Oct 2017 11:14:49 -0700 Subject: sched: tc_mirred: Remove whitespaces This file contains unnecessary whitespaces as newlines, remove them, found by looking at what struct tc_mirred looks like. Signed-off-by: Florian Fainelli Signed-off-by: David S. 
Miller --- include/uapi/linux/tc_act/tc_mirred.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/tc_act/tc_mirred.h b/include/uapi/linux/tc_act/tc_mirred.h index 3d7a2b352a62..69038c29e8a9 100644 --- a/include/uapi/linux/tc_act/tc_mirred.h +++ b/include/uapi/linux/tc_act/tc_mirred.h @@ -9,13 +9,13 @@ #define TCA_EGRESS_MIRROR 2 /* mirror packet to EGRESS */ #define TCA_INGRESS_REDIR 3 /* packet redirect to INGRESS*/ #define TCA_INGRESS_MIRROR 4 /* mirror packet to INGRESS */ - + struct tc_mirred { tc_gen; int eaction; /* one of IN/EGRESS_MIRROR/REDIR */ __u32 ifindex; /* ifindex of egress port */ }; - + enum { TCA_MIRRED_UNSPEC, TCA_MIRRED_TM, @@ -24,5 +24,5 @@ enum { __TCA_MIRRED_MAX }; #define TCA_MIRRED_MAX (__TCA_MIRRED_MAX - 1) - + #endif -- cgit v1.2.3 From 8f04748016f3b583e675e0f649d42cfc10812a8b Mon Sep 17 00:00:00 2001 From: Roman Mashak Date: Wed, 11 Oct 2017 10:50:29 -0400 Subject: net sched actions: change IFE modules alias names Make style of module alias name consistent with other subsystems in kernel, for example net devices. Fixes: 084e2f6566d2 ("Support to encoding decoding skb mark on IFE action") Fixes: 200e10f46936 ("Support to encoding decoding skb prio on IFE action") Fixes: 408fbc22ef1e ("net sched ife action: Introduce skb tcindex metadata encap decap") Signed-off-by: Roman Mashak Signed-off-by: David S. Miller --- include/net/tc_act/tc_ife.h | 2 +- net/sched/act_ife.c | 2 +- net/sched/act_meta_mark.c | 2 +- net/sched/act_meta_skbprio.c | 2 +- net/sched/act_meta_skbtcindex.c | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/tc_act/tc_ife.h b/include/net/tc_act/tc_ife.h index 30ba459ddd34..104578f16062 100644 --- a/include/net/tc_act/tc_ife.h +++ b/include/net/tc_act/tc_ife.h @@ -40,7 +40,7 @@ struct tcf_meta_ops { struct module *owner; }; -#define MODULE_ALIAS_IFE_META(metan) MODULE_ALIAS("ifemeta" __stringify_1(metan)) +#define MODULE_ALIAS_IFE_META(metan) MODULE_ALIAS("ife-meta-" metan) int ife_get_meta_u32(struct sk_buff *skb, struct tcf_meta_info *mi); int ife_get_meta_u16(struct sk_buff *skb, struct tcf_meta_info *mi); diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 8ccd35825b6b..791aeee11c7e 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -263,7 +263,7 @@ static int load_metaops_and_vet(struct tcf_ife_info *ife, u32 metaid, if (exists) spin_unlock_bh(&ife->tcf_lock); rtnl_unlock(); - request_module("ifemeta%u", metaid); + request_module("ife-meta-%u", metaid); rtnl_lock(); if (exists) spin_lock_bh(&ife->tcf_lock); diff --git a/net/sched/act_meta_mark.c b/net/sched/act_meta_mark.c index 82892170ce4f..1e3f10e5da99 100644 --- a/net/sched/act_meta_mark.c +++ b/net/sched/act_meta_mark.c @@ -76,4 +76,4 @@ module_exit(ifemark_cleanup_module); MODULE_AUTHOR("Jamal Hadi Salim(2015)"); MODULE_DESCRIPTION("Inter-FE skb mark metadata module"); MODULE_LICENSE("GPL"); -MODULE_ALIAS_IFE_META(IFE_META_SKBMARK); +MODULE_ALIAS_IFE_META("skbmark"); diff --git a/net/sched/act_meta_skbprio.c b/net/sched/act_meta_skbprio.c index 26bf4d86030b..4033f9fc4d4a 100644 --- a/net/sched/act_meta_skbprio.c +++ b/net/sched/act_meta_skbprio.c @@ -73,4 +73,4 @@ module_exit(ifeprio_cleanup_module); MODULE_AUTHOR("Jamal Hadi Salim(2015)"); MODULE_DESCRIPTION("Inter-FE skb prio metadata action"); MODULE_LICENSE("GPL"); -MODULE_ALIAS_IFE_META(IFE_META_PRIO); +MODULE_ALIAS_IFE_META("skbprio"); diff --git a/net/sched/act_meta_skbtcindex.c 
b/net/sched/act_meta_skbtcindex.c index 3b35774ce890..2ea1f26c9e96 100644 --- a/net/sched/act_meta_skbtcindex.c +++ b/net/sched/act_meta_skbtcindex.c @@ -76,4 +76,4 @@ module_exit(ifetc_index_cleanup_module); MODULE_AUTHOR("Jamal Hadi Salim(2016)"); MODULE_DESCRIPTION("Inter-FE skb tc_index metadata module"); MODULE_LICENSE("GPL"); -MODULE_ALIAS_IFE_META(IFE_META_SKBTCINDEX); +MODULE_ALIAS_IFE_META("tcindex"); -- cgit v1.2.3 From aa9fd9a325d51fa0b11153b03b8fefff569fa955 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Wed, 11 Oct 2017 17:16:08 -0400 Subject: sched: act: ife: update parameters via rcu handling This patch changes the parameter updating to use RCU so that it is no longer protected by the spinlock. This reduces the time that the spinlock is held. Signed-off-by: Alexander Aring Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/net/tc_act/tc_ife.h | 10 ++++-- net/sched/act_ife.c | 87 ++++++++++++++++++++++++++++++--------------- 2 files changed, 67 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/include/net/tc_act/tc_ife.h b/include/net/tc_act/tc_ife.h index 104578f16062..c7fb99c3f76c 100644 --- a/include/net/tc_act/tc_ife.h +++ b/include/net/tc_act/tc_ife.h @@ -6,12 +6,18 @@ #include #include -struct tcf_ife_info { - struct tc_action common; +struct tcf_ife_params { u8 eth_dst[ETH_ALEN]; u8 eth_src[ETH_ALEN]; u16 eth_type; u16 flags; + + struct rcu_head rcu; +}; + +struct tcf_ife_info { + struct tc_action common; + struct tcf_ife_params __rcu *params; /* list of metaids allowed */ struct list_head metalist; }; diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index f59d78918cf9..252ee7d8c731 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -406,10 +406,14 @@ static void _tcf_ife_cleanup(struct tc_action *a, int bind) static void tcf_ife_cleanup(struct tc_action *a, int bind) { struct tcf_ife_info *ife = to_ife(a); + struct tcf_ife_params *p; spin_lock_bh(&ife->tcf_lock); _tcf_ife_cleanup(a, bind); spin_unlock_bh(&ife->tcf_lock); + + p = rcu_dereference_protected(ife->params, 1); + kfree_rcu(p, rcu); } /* under ife->tcf_lock for existing action */ @@ -446,6 +450,7 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, struct tc_action_net *tn = net_generic(net, ife_net_id); struct nlattr *tb[TCA_IFE_MAX + 1]; struct nlattr *tb2[IFE_META_MAX + 1]; + struct tcf_ife_params *p, *p_old; struct tcf_ife_info *ife; u16 ife_type = ETH_P_IFE; struct tc_ife *parm; @@ -471,24 +476,34 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, if (parm->flags & ~IFE_ENCODE) return -EINVAL; + p = kzalloc(sizeof(*p), GFP_KERNEL); + if (!p) + return -ENOMEM; + exists = tcf_idr_check(tn, parm->index, a, bind); - if (exists && bind) + if (exists && bind) { + kfree(p); return 0; + } if (!exists) { ret = tcf_idr_create(tn, parm->index, est, a, &act_ife_ops, bind, true); - if (ret) + if (ret) { + kfree(p); return ret; + } ret = ACT_P_CREATED; } else { tcf_idr_release(*a, bind); - if (!ovr) + if (!ovr) { + kfree(p); return -EEXIST; + } } ife = to_ife(*a); - ife->flags = parm->flags; + p->flags = parm->flags; if (parm->flags & IFE_ENCODE) { if (tb[TCA_IFE_TYPE]) @@ -499,24 +514,25 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, saddr = nla_data(tb[TCA_IFE_SMAC]); } - if (exists) - spin_lock_bh(&ife->tcf_lock); ife->tcf_action = parm->action; if (parm->flags & IFE_ENCODE) { if (daddr) - ether_addr_copy(ife->eth_dst, daddr); + ether_addr_copy(p->eth_dst, daddr); else - eth_zero_addr(ife->eth_dst); + 
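/* All encode-time parameters now live in the heap-allocated
 * tcf_ife_params copy @p built here; it is published further down with
 * rcu_assign_pointer() once fully initialized, and the old copy (if
 * any) is freed with kfree_rcu() so datapath readers never observe a
 * torn update.
 */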
eth_zero_addr(p->eth_dst); if (saddr) - ether_addr_copy(ife->eth_src, saddr); + ether_addr_copy(p->eth_src, saddr); else - eth_zero_addr(ife->eth_src); + eth_zero_addr(p->eth_src); - ife->eth_type = ife_type; + p->eth_type = ife_type; } + if (exists) + spin_lock_bh(&ife->tcf_lock); + if (ret == ACT_P_CREATED) INIT_LIST_HEAD(&ife->metalist); @@ -532,6 +548,7 @@ metadata_parse_err: if (exists) spin_unlock_bh(&ife->tcf_lock); + kfree(p); return err; } @@ -552,6 +569,7 @@ metadata_parse_err: if (exists) spin_unlock_bh(&ife->tcf_lock); + kfree(p); return err; } } @@ -559,6 +577,11 @@ metadata_parse_err: if (exists) spin_unlock_bh(&ife->tcf_lock); + p_old = rtnl_dereference(ife->params); + rcu_assign_pointer(ife->params, p); + if (p_old) + kfree_rcu(p_old, rcu); + if (ret == ACT_P_CREATED) tcf_idr_insert(tn, *a); @@ -570,12 +593,13 @@ static int tcf_ife_dump(struct sk_buff *skb, struct tc_action *a, int bind, { unsigned char *b = skb_tail_pointer(skb); struct tcf_ife_info *ife = to_ife(a); + struct tcf_ife_params *p = rtnl_dereference(ife->params); struct tc_ife opt = { .index = ife->tcf_index, .refcnt = ife->tcf_refcnt - ref, .bindcnt = ife->tcf_bindcnt - bind, .action = ife->tcf_action, - .flags = ife->flags, + .flags = p->flags, }; struct tcf_t t; @@ -586,17 +610,17 @@ static int tcf_ife_dump(struct sk_buff *skb, struct tc_action *a, int bind, if (nla_put_64bit(skb, TCA_IFE_TM, sizeof(t), &t, TCA_IFE_PAD)) goto nla_put_failure; - if (!is_zero_ether_addr(ife->eth_dst)) { - if (nla_put(skb, TCA_IFE_DMAC, ETH_ALEN, ife->eth_dst)) + if (!is_zero_ether_addr(p->eth_dst)) { + if (nla_put(skb, TCA_IFE_DMAC, ETH_ALEN, p->eth_dst)) goto nla_put_failure; } - if (!is_zero_ether_addr(ife->eth_src)) { - if (nla_put(skb, TCA_IFE_SMAC, ETH_ALEN, ife->eth_src)) + if (!is_zero_ether_addr(p->eth_src)) { + if (nla_put(skb, TCA_IFE_SMAC, ETH_ALEN, p->eth_src)) goto nla_put_failure; } - if (nla_put(skb, TCA_IFE_TYPE, 2, &ife->eth_type)) + if (nla_put(skb, TCA_IFE_TYPE, 2, &p->eth_type)) goto nla_put_failure; if (dump_metalist(skb, ife)) { @@ -698,7 +722,7 @@ static int ife_get_sz(struct sk_buff *skb, struct tcf_ife_info *ife) } static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a, - struct tcf_result *res) + struct tcf_result *res, struct tcf_ife_params *p) { struct tcf_ife_info *ife = to_ife(a); int action = ife->tcf_action; @@ -762,19 +786,18 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a, } skboff += err; } + spin_unlock(&ife->tcf_lock); oethh = (struct ethhdr *)skb->data; - if (!is_zero_ether_addr(ife->eth_src)) - ether_addr_copy(oethh->h_source, ife->eth_src); - if (!is_zero_ether_addr(ife->eth_dst)) - ether_addr_copy(oethh->h_dest, ife->eth_dst); - oethh->h_proto = htons(ife->eth_type); + if (!is_zero_ether_addr(p->eth_src)) + ether_addr_copy(oethh->h_source, p->eth_src); + if (!is_zero_ether_addr(p->eth_dst)) + ether_addr_copy(oethh->h_dest, p->eth_dst); + oethh->h_proto = htons(p->eth_type); if (skb_at_tc_ingress(skb)) skb_pull(skb, skb->dev->hard_header_len); - spin_unlock(&ife->tcf_lock); - return action; } @@ -782,9 +805,17 @@ static int tcf_ife_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_ife_info *ife = to_ife(a); - - if (ife->flags & IFE_ENCODE) - return tcf_ife_encode(skb, a, res); + struct tcf_ife_params *p; + int ret; + + rcu_read_lock(); + p = rcu_dereference(ife->params); + if (p->flags & IFE_ENCODE) { + ret = tcf_ife_encode(skb, a, res, p); + rcu_read_unlock(); + return ret; + } + rcu_read_unlock(); 
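/* Only the encode path dereferences the parameter block, so the
 * rcu_read_lock()ed snapshot taken above covers tcf_ife_encode();
 * the decode path below runs without it.
 */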
return tcf_ife_decode(skb, a, res); } -- cgit v1.2.3 From 2355a6546a053b1c16ebefd6ce1f0cccc00e1da5 Mon Sep 17 00:00:00 2001 From: Rafał Miłecki Date: Thu, 12 Oct 2017 10:21:25 +0200 Subject: net: phy: broadcom: support new device flag for setting master mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some of Broadcom's PHYs run by default in slave mode with Automatic Slave/Master configuration disabled. This stops them from working properly with some devices. So far it has been verified for BCM54210E and BCM50212E, which don't work well with Intel's I217-LM and I218-LM: http://ark.intel.com/products/60019/Intel-Ethernet-Connection-I217-LM http://ark.intel.com/products/71307/Intel-Ethernet-Connection-I218-LM I was told there is massive ping loss. This commit adds support for a new flag which can be set by an ethernet driver to fix up the PHY setup. Signed-off-by: Rafał Miłecki Signed-off-by: David S. Miller --- drivers/net/phy/broadcom.c | 6 ++++++ include/linux/brcmphy.h | 1 + 2 files changed, 7 insertions(+) (limited to 'include') diff --git a/drivers/net/phy/broadcom.c b/drivers/net/phy/broadcom.c index 1e9ad30a35c8..d7ed69deabfb 100644 --- a/drivers/net/phy/broadcom.c +++ b/drivers/net/phy/broadcom.c @@ -43,6 +43,12 @@ static int bcm54210e_config_init(struct phy_device *phydev) val &= ~BCM54810_SHD_CLK_CTL_GTXCLK_EN; bcm_phy_write_shadow(phydev, BCM54810_SHD_CLK_CTL, val); + if (phydev->dev_flags & PHY_BRCM_EN_MASTER_MODE) { + val = phy_read(phydev, MII_CTRL1000); + val |= CTL1000_AS_MASTER | CTL1000_ENABLE_MASTER; + phy_write(phydev, MII_CTRL1000, val); + } + return 0; } diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index abcda9b458ab..9ac9e3e3d1e5 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -63,6 +63,7 @@ #define PHY_BRCM_EXT_IBND_TX_ENABLE 0x00002000 #define PHY_BRCM_CLEAR_RGMII_MODE 0x00004000 #define PHY_BRCM_DIS_TXCRXC_NOENRGY 0x00008000 +#define PHY_BRCM_EN_MASTER_MODE 0x00010000 /* Broadcom BCM7xxx specific workarounds */ #define PHY_BRCM_7XXX_REV(x) (((x) >> 8) & 0xff) -- cgit v1.2.3 From 75da2163dbb6af9f2dce1d80056d11d290dd19a5 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Fri, 13 Oct 2017 11:04:23 +0200 Subject: tipc: introduce communication groups As a preparation for introducing flow control for multicast and datagram messaging we need a more strictly defined framework than we have now. A socket must be able to keep track of exactly how many and which other sockets it is allowed to communicate with at any moment, and keep the necessary state for those. We therefore introduce a new concept we have named Communication Group. Sockets can join a group via a new setsockopt() call TIPC_GROUP_JOIN. The call takes four parameters: 'type' serves as the group identifier, 'instance' serves as a logical member identifier, and 'scope' indicates the visibility of the group (node/cluster/zone). Finally, 'flags' makes it possible to set certain properties for the member. For now, there is only one flag, indicating if the creator of the socket wants to receive a copy of broadcast or multicast messages it is sending via the socket, and if it wants to be eligible as a destination for its own anycasts. A group is closed, i.e., sockets which have not joined a group will not be able to send messages to or receive messages from members of the group, and vice versa. Any member of a group can send multicast ('group broadcast') messages to all group members, optionally including itself, using the primitive send(). 
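(A hedged illustration, not part of this patch: joining a group from userspace could look like the minimal sketch below; the values are invented, error handling is omitted, and the setsockopt level is SOL_TIPC.)

	struct tipc_group_req mreq = {
		.type     = 42,			/* group identifier */
		.instance = 1,			/* member identifier */
		.scope    = TIPC_CLUSTER_SCOPE,
		.flags    = TIPC_GROUP_LOOPBACK,
	};
	int sd = socket(AF_TIPC, SOCK_RDM, 0);

	setsockopt(sd, SOL_TIPC, TIPC_GROUP_JOIN, &mreq, sizeof(mreq));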
The messages are received via the recvmsg() primitive. A socket can only be member of one group at a time. Signed-off-by: Jon Maloy Acked-by: Ying Xue Signed-off-by: David S. Miller --- include/uapi/linux/tipc.h | 14 ++ net/tipc/Makefile | 2 +- net/tipc/group.c | 404 ++++++++++++++++++++++++++++++++++++++++++++++ net/tipc/group.h | 64 ++++++++ net/tipc/link.c | 3 +- net/tipc/msg.h | 50 +++++- net/tipc/name_table.c | 44 +++-- net/tipc/name_table.h | 3 + net/tipc/node.h | 3 +- net/tipc/socket.c | 209 ++++++++++++++++++++---- 10 files changed, 748 insertions(+), 48 deletions(-) create mode 100644 net/tipc/group.c create mode 100644 net/tipc/group.h (limited to 'include') diff --git a/include/uapi/linux/tipc.h b/include/uapi/linux/tipc.h index 5351b08c897a..5f7b2c4a09ab 100644 --- a/include/uapi/linux/tipc.h +++ b/include/uapi/linux/tipc.h @@ -231,6 +231,20 @@ struct sockaddr_tipc { #define TIPC_SOCK_RECVQ_DEPTH 132 /* Default: none (read only) */ #define TIPC_MCAST_BROADCAST 133 /* Default: TIPC selects. No arg */ #define TIPC_MCAST_REPLICAST 134 /* Default: TIPC selects. No arg */ +#define TIPC_GROUP_JOIN 135 /* Takes struct tipc_group_req* */ +#define TIPC_GROUP_LEAVE 136 /* No argument */ + +/* + * Flag values + */ +#define TIPC_GROUP_LOOPBACK 0x1 /* Receive copy of sent msg when match */ + +struct tipc_group_req { + __u32 type; /* group id */ + __u32 instance; /* member id */ + __u32 scope; /* zone/cluster/node */ + __u32 flags; +}; /* * Maximum sizes of TIPC bearer-related names (including terminating NULL) diff --git a/net/tipc/Makefile b/net/tipc/Makefile index 31b9f9c52974..a3af73ec0b78 100644 --- a/net/tipc/Makefile +++ b/net/tipc/Makefile @@ -8,7 +8,7 @@ tipc-y += addr.o bcast.o bearer.o \ core.o link.o discover.o msg.o \ name_distr.o subscr.o monitor.o name_table.o net.o \ netlink.o netlink_compat.o node.o socket.o eth_media.o \ - server.o socket.o + server.o socket.o group.o tipc-$(CONFIG_TIPC_MEDIA_UDP) += udp_media.o tipc-$(CONFIG_TIPC_MEDIA_IB) += ib_media.o diff --git a/net/tipc/group.c b/net/tipc/group.c new file mode 100644 index 000000000000..3f0e1ce1e3b9 --- /dev/null +++ b/net/tipc/group.c @@ -0,0 +1,404 @@ +/* + * net/tipc/group.c: TIPC group messaging code + * + * Copyright (c) 2017, Ericsson AB + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "core.h" +#include "addr.h" +#include "group.h" +#include "bcast.h" +#include "server.h" +#include "msg.h" +#include "socket.h" +#include "node.h" +#include "name_table.h" +#include "subscr.h" + +#define ADV_UNIT (((MAX_MSG_SIZE + MAX_H_SIZE) / FLOWCTL_BLK_SZ) + 1) +#define ADV_IDLE ADV_UNIT + +enum mbr_state { + MBR_QUARANTINED, + MBR_DISCOVERED, + MBR_JOINING, + MBR_PUBLISHED, + MBR_JOINED, + MBR_LEAVING +}; + +struct tipc_member { + struct rb_node tree_node; + struct list_head list; + u32 node; + u32 port; + enum mbr_state state; + u16 bc_rcv_nxt; +}; + +struct tipc_group { + struct rb_root members; + struct tipc_nlist dests; + struct net *net; + int subid; + u32 type; + u32 instance; + u32 domain; + u32 scope; + u32 portid; + u16 member_cnt; + u16 bc_snd_nxt; + bool loopback; +}; + +static void tipc_group_proto_xmit(struct tipc_group *grp, struct tipc_member *m, + int mtyp, struct sk_buff_head *xmitq); + +u16 tipc_group_bc_snd_nxt(struct tipc_group *grp) +{ + return grp->bc_snd_nxt; +} + +static bool tipc_group_is_receiver(struct tipc_member *m) +{ + return m && m->state >= MBR_JOINED; +} + +int tipc_group_size(struct tipc_group *grp) +{ + return grp->member_cnt; +} + +struct tipc_group *tipc_group_create(struct net *net, u32 portid, + struct tipc_group_req *mreq) +{ + struct tipc_group *grp; + u32 type = mreq->type; + + grp = kzalloc(sizeof(*grp), GFP_ATOMIC); + if (!grp) + return NULL; + tipc_nlist_init(&grp->dests, tipc_own_addr(net)); + grp->members = RB_ROOT; + grp->net = net; + grp->portid = portid; + grp->domain = addr_domain(net, mreq->scope); + grp->type = type; + grp->instance = mreq->instance; + grp->scope = mreq->scope; + grp->loopback = mreq->flags & TIPC_GROUP_LOOPBACK; + if (tipc_topsrv_kern_subscr(net, portid, type, 0, ~0, &grp->subid)) + return grp; + kfree(grp); + return NULL; +} + +void tipc_group_delete(struct net *net, struct tipc_group *grp) +{ + struct rb_root *tree = &grp->members; + struct tipc_member *m, *tmp; + struct sk_buff_head xmitq; + + __skb_queue_head_init(&xmitq); + + rbtree_postorder_for_each_entry_safe(m, tmp, tree, tree_node) { + tipc_group_proto_xmit(grp, m, GRP_LEAVE_MSG, &xmitq); + list_del(&m->list); + kfree(m); + } + tipc_node_distr_xmit(net, &xmitq); + tipc_nlist_purge(&grp->dests); + tipc_topsrv_kern_unsubscr(net, grp->subid); + kfree(grp); +} + +struct tipc_member *tipc_group_find_member(struct tipc_group *grp, + u32 node, u32 port) +{ + struct rb_node *n = grp->members.rb_node; + u64 nkey, key = (u64)node << 32 | port; + struct tipc_member *m; + + while (n) { + m = container_of(n, struct tipc_member, tree_node); + nkey = (u64)m->node << 32 | m->port; + if (key < nkey) + n = n->rb_left; + else if (key > nkey) + n = n->rb_right; + else + return m; + } + return NULL; +} + +static struct tipc_member *tipc_group_find_node(struct tipc_group *grp, + u32 node) +{ + struct tipc_member *m; + struct rb_node *n; + + for (n = rb_first(&grp->members); n; n = rb_next(n)) { + m = container_of(n, struct 
tipc_member, tree_node); + if (m->node == node) + return m; + } + return NULL; +} + +static void tipc_group_add_to_tree(struct tipc_group *grp, + struct tipc_member *m) +{ + u64 nkey, key = (u64)m->node << 32 | m->port; + struct rb_node **n, *parent = NULL; + struct tipc_member *tmp; + + n = &grp->members.rb_node; + while (*n) { + tmp = container_of(*n, struct tipc_member, tree_node); + parent = *n; + tmp = container_of(parent, struct tipc_member, tree_node); + nkey = (u64)tmp->node << 32 | tmp->port; + if (key < nkey) + n = &(*n)->rb_left; + else if (key > nkey) + n = &(*n)->rb_right; + else + return; + } + rb_link_node(&m->tree_node, parent, n); + rb_insert_color(&m->tree_node, &grp->members); +} + +static struct tipc_member *tipc_group_create_member(struct tipc_group *grp, + u32 node, u32 port, + int state) +{ + struct tipc_member *m; + + m = kzalloc(sizeof(*m), GFP_ATOMIC); + if (!m) + return NULL; + INIT_LIST_HEAD(&m->list); + m->node = node; + m->port = port; + grp->member_cnt++; + tipc_group_add_to_tree(grp, m); + tipc_nlist_add(&grp->dests, m->node); + m->state = state; + return m; +} + +void tipc_group_add_member(struct tipc_group *grp, u32 node, u32 port) +{ + tipc_group_create_member(grp, node, port, MBR_DISCOVERED); +} + +static void tipc_group_delete_member(struct tipc_group *grp, + struct tipc_member *m) +{ + rb_erase(&m->tree_node, &grp->members); + grp->member_cnt--; + list_del_init(&m->list); + + /* If last member on a node, remove node from dest list */ + if (!tipc_group_find_node(grp, m->node)) + tipc_nlist_del(&grp->dests, m->node); + + kfree(m); +} + +struct tipc_nlist *tipc_group_dests(struct tipc_group *grp) +{ + return &grp->dests; +} + +void tipc_group_self(struct tipc_group *grp, struct tipc_name_seq *seq, + int *scope) +{ + seq->type = grp->type; + seq->lower = grp->instance; + seq->upper = grp->instance; + *scope = grp->scope; +} + +void tipc_group_update_bc_members(struct tipc_group *grp) +{ + grp->bc_snd_nxt++; +} + +/* tipc_group_filter_msg() - determine if we should accept arriving message + */ +void tipc_group_filter_msg(struct tipc_group *grp, struct sk_buff_head *inputq, + struct sk_buff_head *xmitq) +{ + struct sk_buff *skb = __skb_dequeue(inputq); + struct tipc_member *m; + struct tipc_msg *hdr; + u32 node, port; + int mtyp; + + if (!skb) + return; + + hdr = buf_msg(skb); + mtyp = msg_type(hdr); + node = msg_orignode(hdr); + port = msg_origport(hdr); + + if (!msg_in_group(hdr)) + goto drop; + + m = tipc_group_find_member(grp, node, port); + if (!tipc_group_is_receiver(m)) + goto drop; + + __skb_queue_tail(inputq, skb); + + m->bc_rcv_nxt = msg_grp_bc_seqno(hdr) + 1; + return; +drop: + kfree_skb(skb); +} + +static void tipc_group_proto_xmit(struct tipc_group *grp, struct tipc_member *m, + int mtyp, struct sk_buff_head *xmitq) +{ + struct tipc_msg *hdr; + struct sk_buff *skb; + + skb = tipc_msg_create(GROUP_PROTOCOL, mtyp, INT_H_SIZE, 0, + m->node, tipc_own_addr(grp->net), + m->port, grp->portid, 0); + if (!skb) + return; + + hdr = buf_msg(skb); + if (mtyp == GRP_JOIN_MSG) + msg_set_grp_bc_syncpt(hdr, grp->bc_snd_nxt); + __skb_queue_tail(xmitq, skb); +} + +void tipc_group_proto_rcv(struct tipc_group *grp, struct tipc_msg *hdr, + struct sk_buff_head *xmitq) +{ + u32 node = msg_orignode(hdr); + u32 port = msg_origport(hdr); + struct tipc_member *m; + + if (!grp) + return; + + m = tipc_group_find_member(grp, node, port); + + switch (msg_type(hdr)) { + case GRP_JOIN_MSG: + if (!m) + m = tipc_group_create_member(grp, node, port, + MBR_QUARANTINED); + if (!m) + 
return; + m->bc_rcv_nxt = msg_grp_bc_syncpt(hdr); + + /* Wait until PUBLISH event is received */ + if (m->state == MBR_DISCOVERED) + m->state = MBR_JOINING; + else if (m->state == MBR_PUBLISHED) + m->state = MBR_JOINED; + return; + case GRP_LEAVE_MSG: + if (!m) + return; + + /* Wait until WITHDRAW event is received */ + if (m->state != MBR_LEAVING) { + m->state = MBR_LEAVING; + return; + } + /* Otherwise deliver already received WITHDRAW event */ + tipc_group_delete_member(grp, m); + return; + default: + pr_warn("Received unknown GROUP_PROTO message\n"); + } +} + +/* tipc_group_member_evt() - receive and handle a member up/down event + */ +void tipc_group_member_evt(struct tipc_group *grp, + struct sk_buff *skb, + struct sk_buff_head *xmitq) +{ + struct tipc_msg *hdr = buf_msg(skb); + struct tipc_event *evt = (void *)msg_data(hdr); + u32 node = evt->port.node; + u32 port = evt->port.ref; + struct tipc_member *m; + struct net *net; + u32 self; + + if (!grp) + goto drop; + + net = grp->net; + self = tipc_own_addr(net); + if (!grp->loopback && node == self && port == grp->portid) + goto drop; + + m = tipc_group_find_member(grp, node, port); + + if (evt->event == TIPC_PUBLISHED) { + if (!m) + m = tipc_group_create_member(grp, node, port, + MBR_DISCOVERED); + if (!m) + goto drop; + + /* Wait if JOIN message not yet received */ + if (m->state == MBR_DISCOVERED) + m->state = MBR_PUBLISHED; + else + m->state = MBR_JOINED; + tipc_group_proto_xmit(grp, m, GRP_JOIN_MSG, xmitq); + } else if (evt->event == TIPC_WITHDRAWN) { + if (!m) + goto drop; + + /* Keep back event if more messages might be expected */ + if (m->state != MBR_LEAVING && tipc_node_is_up(net, node)) + m->state = MBR_LEAVING; + else + tipc_group_delete_member(grp, m); + } +drop: + kfree_skb(skb); +} diff --git a/net/tipc/group.h b/net/tipc/group.h new file mode 100644 index 000000000000..9bdf4479fc03 --- /dev/null +++ b/net/tipc/group.h @@ -0,0 +1,64 @@ +/* + * net/tipc/group.h: Include file for TIPC group unicast/multicast functions + * + * Copyright (c) 2017, Ericsson AB + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _TIPC_GROUP_H +#define _TIPC_GROUP_H + +#include "core.h" + +struct tipc_group; +struct tipc_member; +struct tipc_msg; + +struct tipc_group *tipc_group_create(struct net *net, u32 portid, + struct tipc_group_req *mreq); +void tipc_group_delete(struct net *net, struct tipc_group *grp); +void tipc_group_add_member(struct tipc_group *grp, u32 node, u32 port); +struct tipc_nlist *tipc_group_dests(struct tipc_group *grp); +void tipc_group_self(struct tipc_group *grp, struct tipc_name_seq *seq, + int *scope); +void tipc_group_filter_msg(struct tipc_group *grp, + struct sk_buff_head *inputq, + struct sk_buff_head *xmitq); +void tipc_group_member_evt(struct tipc_group *grp, + struct sk_buff *skb, + struct sk_buff_head *xmitq); +void tipc_group_proto_rcv(struct tipc_group *grp, + struct tipc_msg *hdr, + struct sk_buff_head *xmitq); +void tipc_group_update_bc_members(struct tipc_group *grp); +u16 tipc_group_bc_snd_nxt(struct tipc_group *grp); +int tipc_group_size(struct tipc_group *grp); +#endif diff --git a/net/tipc/link.c b/net/tipc/link.c index ac0144f532aa..bd25bff63925 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -1046,11 +1046,12 @@ static bool tipc_data_input(struct tipc_link *l, struct sk_buff *skb, case TIPC_MEDIUM_IMPORTANCE: case TIPC_HIGH_IMPORTANCE: case TIPC_CRITICAL_IMPORTANCE: - if (unlikely(msg_type(hdr) == TIPC_MCAST_MSG)) { + if (unlikely(msg_mcast(hdr))) { skb_queue_tail(l->bc_rcvlink->inputq, skb); return true; } case CONN_MANAGER: + case GROUP_PROTOCOL: skb_queue_tail(inputq, skb); return true; case NAME_DISTRIBUTOR: diff --git a/net/tipc/msg.h b/net/tipc/msg.h index be3e38aa9dd2..dad400935405 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -1,7 +1,7 @@ /* * net/tipc/msg.h: Include file for TIPC message header routines * - * Copyright (c) 2000-2007, 2014-2015 Ericsson AB + * Copyright (c) 2000-2007, 2014-2017 Ericsson AB * Copyright (c) 2005-2008, 2010-2011, Wind River Systems * All rights reserved. 
* @@ -61,10 +61,11 @@ struct plist; /* * Payload message types */ -#define TIPC_CONN_MSG 0 -#define TIPC_MCAST_MSG 1 -#define TIPC_NAMED_MSG 2 -#define TIPC_DIRECT_MSG 3 +#define TIPC_CONN_MSG 0 +#define TIPC_MCAST_MSG 1 +#define TIPC_NAMED_MSG 2 +#define TIPC_DIRECT_MSG 3 +#define TIPC_GRP_BCAST_MSG 4 /* * Internal message users @@ -73,6 +74,7 @@ struct plist; #define MSG_BUNDLER 6 #define LINK_PROTOCOL 7 #define CONN_MANAGER 8 +#define GROUP_PROTOCOL 9 #define TUNNEL_PROTOCOL 10 #define NAME_DISTRIBUTOR 11 #define MSG_FRAGMENTER 12 @@ -87,6 +89,7 @@ struct plist; #define BASIC_H_SIZE 32 /* Basic payload message */ #define NAMED_H_SIZE 40 /* Named payload message */ #define MCAST_H_SIZE 44 /* Multicast payload message */ +#define GROUP_H_SIZE 44 /* Group payload message */ #define INT_H_SIZE 40 /* Internal messages */ #define MIN_H_SIZE 24 /* Smallest legal TIPC header size */ #define MAX_H_SIZE 60 /* Largest possible TIPC header size */ @@ -252,6 +255,11 @@ static inline void msg_set_type(struct tipc_msg *m, u32 n) msg_set_bits(m, 1, 29, 0x7, n); } +static inline int msg_in_group(struct tipc_msg *m) +{ + return (msg_type(m) == TIPC_GRP_BCAST_MSG); +} + static inline u32 msg_named(struct tipc_msg *m) { return msg_type(m) == TIPC_NAMED_MSG; @@ -259,7 +267,9 @@ static inline u32 msg_named(struct tipc_msg *m) static inline u32 msg_mcast(struct tipc_msg *m) { - return msg_type(m) == TIPC_MCAST_MSG; + int mtyp = msg_type(m); + + return ((mtyp == TIPC_MCAST_MSG) || (mtyp == TIPC_GRP_BCAST_MSG)); } static inline u32 msg_connected(struct tipc_msg *m) @@ -514,6 +524,12 @@ static inline void msg_set_nameupper(struct tipc_msg *m, u32 n) #define DSC_REQ_MSG 0 #define DSC_RESP_MSG 1 +/* + * Group protocol message types + */ +#define GRP_JOIN_MSG 0 +#define GRP_LEAVE_MSG 1 + /* * Word 1 */ @@ -795,6 +811,28 @@ static inline void msg_set_link_tolerance(struct tipc_msg *m, u32 n) msg_set_bits(m, 9, 0, 0xffff, n); } +static inline u16 msg_grp_bc_syncpt(struct tipc_msg *m) +{ + return msg_bits(m, 9, 16, 0xffff); +} + +static inline void msg_set_grp_bc_syncpt(struct tipc_msg *m, u16 n) +{ + msg_set_bits(m, 9, 16, 0xffff, n); +} + +/* Word 10 + */ +static inline u16 msg_grp_bc_seqno(struct tipc_msg *m) +{ + return msg_bits(m, 10, 16, 0xffff); +} + +static inline void msg_set_grp_bc_seqno(struct tipc_msg *m, u32 n) +{ + msg_set_bits(m, 10, 16, 0xffff, n); +} + static inline bool msg_peer_link_is_up(struct tipc_msg *m) { if (likely(msg_user(m) != LINK_PROTOCOL)) diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index 76bd2777baaf..114d72bab827 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -43,6 +43,7 @@ #include "bcast.h" #include "addr.h" #include "node.h" +#include "group.h" #include #define TIPC_NAMETBL_SIZE 1024 /* must be a power of 2 */ @@ -596,18 +597,6 @@ not_found: return ref; } -/** - * tipc_nametbl_mc_translate - find multicast destinations - * - * Creates list of all local ports that overlap the given multicast address; - * also determines if any off-node ports overlap. - * - * Note: Publications with a scope narrower than 'limit' are ignored. - * (i.e. 
local node-scope publications mustn't receive messages arriving - * from another node, even if the multcast link brought it here) - * - * Returns non-zero if any off-node ports overlap - */ int tipc_nametbl_mc_translate(struct net *net, u32 type, u32 lower, u32 upper, u32 limit, struct list_head *dports) { @@ -679,6 +668,37 @@ exit: rcu_read_unlock(); } +/* tipc_nametbl_build_group - build list of communication group members + */ +void tipc_nametbl_build_group(struct net *net, struct tipc_group *grp, + u32 type, u32 domain) +{ + struct sub_seq *sseq, *stop; + struct name_info *info; + struct publication *p; + struct name_seq *seq; + + rcu_read_lock(); + seq = nametbl_find_seq(net, type); + if (!seq) + goto exit; + + spin_lock_bh(&seq->lock); + sseq = seq->sseqs; + stop = seq->sseqs + seq->first_free; + for (; sseq != stop; sseq++) { + info = sseq->info; + list_for_each_entry(p, &info->zone_list, zone_list) { + if (!tipc_in_scope(domain, p->node)) + continue; + tipc_group_add_member(grp, p->node, p->ref); + } + } + spin_unlock_bh(&seq->lock); +exit: + rcu_read_unlock(); +} + /* * tipc_nametbl_publish - add name publication to network name tables */ diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h index d121175a92b5..97646b17a4a2 100644 --- a/net/tipc/name_table.h +++ b/net/tipc/name_table.h @@ -40,6 +40,7 @@ struct tipc_subscription; struct tipc_plist; struct tipc_nlist; +struct tipc_group; /* * TIPC name types reserved for internal TIPC use (both current and planned) @@ -101,6 +102,8 @@ int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb); u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance, u32 *node); int tipc_nametbl_mc_translate(struct net *net, u32 type, u32 lower, u32 upper, u32 limit, struct list_head *dports); +void tipc_nametbl_build_group(struct net *net, struct tipc_group *grp, + u32 type, u32 domain); void tipc_nametbl_lookup_dst_nodes(struct net *net, u32 type, u32 lower, u32 upper, u32 domain, struct tipc_nlist *nodes); diff --git a/net/tipc/node.h b/net/tipc/node.h index df2f2197c4ad..acd58d23a70e 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -48,7 +48,8 @@ enum { TIPC_BCAST_SYNCH = (1 << 1), TIPC_BCAST_STATE_NACK = (1 << 2), TIPC_BLOCK_FLOWCTL = (1 << 3), - TIPC_BCAST_RCAST = (1 << 4) + TIPC_BCAST_RCAST = (1 << 4), + TIPC_MCAST_GROUPS = (1 << 5) }; #define TIPC_NODE_CAPABILITIES (TIPC_BCAST_SYNCH | \ diff --git a/net/tipc/socket.c b/net/tipc/socket.c index daf7c4df4531..64bbf9d03629 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -1,7 +1,7 @@ /* * net/tipc/socket.c: TIPC socket API * - * Copyright (c) 2001-2007, 2012-2016, Ericsson AB + * Copyright (c) 2001-2007, 2012-2017, Ericsson AB * Copyright (c) 2004-2008, 2010-2013, Wind River Systems * All rights reserved. 
* @@ -45,6 +45,7 @@ #include "socket.h" #include "bcast.h" #include "netlink.h" +#include "group.h" #define CONN_TIMEOUT_DEFAULT 8000 /* default connect timeout = 8s */ #define CONN_PROBING_INTERVAL msecs_to_jiffies(3600000) /* [ms] => 1 h */ @@ -78,7 +79,7 @@ enum { * @conn_timeout: the time we can wait for an unresponded setup request * @dupl_rcvcnt: number of bytes counted twice, in both backlog and rcv queue * @cong_link_cnt: number of congested links - * @sent_unacked: # messages sent by socket, and not yet acked by peer + * @snt_unacked: # messages sent by socket, and not yet acked by peer * @rcv_unacked: # messages read by user, but not yet acked back to peer * @peer: 'connected' peer for dgram/rdm * @node: hash table node @@ -109,6 +110,7 @@ struct tipc_sock { struct rhash_head node; struct tipc_mc_method mc_method; struct rcu_head rcu; + struct tipc_group *group; }; static int tipc_sk_backlog_rcv(struct sock *sk, struct sk_buff *skb); @@ -123,6 +125,7 @@ static int tipc_sk_publish(struct tipc_sock *tsk, uint scope, struct tipc_name_seq const *seq); static int tipc_sk_withdraw(struct tipc_sock *tsk, uint scope, struct tipc_name_seq const *seq); +static int tipc_sk_leave(struct tipc_sock *tsk); static struct tipc_sock *tipc_sk_lookup(struct net *net, u32 portid); static int tipc_sk_insert(struct tipc_sock *tsk); static void tipc_sk_remove(struct tipc_sock *tsk); @@ -559,6 +562,7 @@ static int tipc_release(struct socket *sock) __tipc_shutdown(sock, TIPC_ERR_NO_PORT); sk->sk_shutdown = SHUTDOWN_MASK; + tipc_sk_leave(tsk); tipc_sk_withdraw(tsk, 0, NULL); sk_stop_timer(sk, &sk->sk_timer); tipc_sk_remove(tsk); @@ -601,7 +605,10 @@ static int tipc_bind(struct socket *sock, struct sockaddr *uaddr, res = tipc_sk_withdraw(tsk, 0, NULL); goto exit; } - + if (tsk->group) { + res = -EACCES; + goto exit; + } if (uaddr_len < sizeof(struct sockaddr_tipc)) { res = -EINVAL; goto exit; @@ -698,6 +705,7 @@ static unsigned int tipc_poll(struct file *file, struct socket *sock, { struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); + struct tipc_group *grp = tsk->group; u32 mask = 0; sock_poll_wait(file, sk_sleep(sk), wait); @@ -718,8 +726,9 @@ static unsigned int tipc_poll(struct file *file, struct socket *sock, mask |= (POLLIN | POLLRDNORM); break; case TIPC_OPEN: - if (!tsk->cong_link_cnt) - mask |= POLLOUT; + if (!grp || tipc_group_size(grp)) + if (!tsk->cong_link_cnt) + mask |= POLLOUT; if (tipc_sk_type_connectionless(sk) && (!skb_queue_empty(&sk->sk_receive_queue))) mask |= (POLLIN | POLLRDNORM); @@ -757,6 +766,9 @@ static int tipc_sendmcast(struct socket *sock, struct tipc_name_seq *seq, struct tipc_nlist dsts; int rc; + if (tsk->group) + return -EACCES; + /* Block or return if any destination link is congested */ rc = tipc_wait_for_cond(sock, &timeout, !tsk->cong_link_cnt); if (unlikely(rc)) @@ -793,6 +805,64 @@ static int tipc_sendmcast(struct socket *sock, struct tipc_name_seq *seq, return rc ? 
rc : dlen; } +/** + * tipc_send_group_bcast - send message to all members in communication group + * @sk: socket structure + * @m: message to send + * @dlen: total length of message data + * @timeout: timeout to wait for wakeup + * + * Called from function tipc_sendmsg(), which has done all sanity checks + * Returns the number of bytes sent on success, or errno + */ +static int tipc_send_group_bcast(struct socket *sock, struct msghdr *m, + int dlen, long timeout) +{ + struct sock *sk = sock->sk; + struct net *net = sock_net(sk); + struct tipc_sock *tsk = tipc_sk(sk); + struct tipc_group *grp = tsk->group; + struct tipc_nlist *dsts = tipc_group_dests(grp); + struct tipc_mc_method *method = &tsk->mc_method; + struct tipc_msg *hdr = &tsk->phdr; + int mtu = tipc_bcast_get_mtu(net); + struct sk_buff_head pkts; + int rc = -EHOSTUNREACH; + + if (!dsts->local && !dsts->remote) + return -EHOSTUNREACH; + + /* Block or return if any destination link is congested */ + rc = tipc_wait_for_cond(sock, &timeout, !tsk->cong_link_cnt); + if (unlikely(rc)) + return rc; + + /* Complete message header */ + msg_set_type(hdr, TIPC_GRP_BCAST_MSG); + msg_set_hdr_sz(hdr, MCAST_H_SIZE); + msg_set_destport(hdr, 0); + msg_set_destnode(hdr, 0); + msg_set_nameinst(hdr, 0); + msg_set_grp_bc_seqno(hdr, tipc_group_bc_snd_nxt(grp)); + + /* Build message as chain of buffers */ + skb_queue_head_init(&pkts); + rc = tipc_msg_build(hdr, m, 0, dlen, mtu, &pkts); + if (unlikely(rc != dlen)) + return rc; + + /* Send message */ + rc = tipc_mcast_xmit(net, &pkts, method, dsts, + &tsk->cong_link_cnt); + if (unlikely(rc)) + return rc; + + /* Update broadcast sequence number */ + tipc_group_update_bc_members(tsk->group); + + return dlen; +} + /** * tipc_sk_mcast_rcv - Deliver multicast messages to all destination sockets * @arrvq: queue with arriving messages, to be cloned after destination lookup @@ -803,13 +873,15 @@ static int tipc_sendmcast(struct socket *sock, struct tipc_name_seq *seq, void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, struct sk_buff_head *inputq) { - struct tipc_msg *msg; - struct list_head dports; - u32 portid; u32 scope = TIPC_CLUSTER_SCOPE; - struct sk_buff_head tmpq; - uint hsz; + u32 self = tipc_own_addr(net); struct sk_buff *skb, *_skb; + u32 lower = 0, upper = ~0; + struct sk_buff_head tmpq; + u32 portid, oport, onode; + struct list_head dports; + struct tipc_msg *msg; + int hsz; __skb_queue_head_init(&tmpq); INIT_LIST_HEAD(&dports); @@ -818,14 +890,18 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, for (; skb; skb = tipc_skb_peek(arrvq, &inputq->lock)) { msg = buf_msg(skb); hsz = skb_headroom(skb) + msg_hdr_sz(msg); - - if (in_own_node(net, msg_orignode(msg))) + oport = msg_origport(msg); + onode = msg_orignode(msg); + if (onode == self) scope = TIPC_NODE_SCOPE; /* Create destination port list and message clones: */ - tipc_nametbl_mc_translate(net, - msg_nametype(msg), msg_namelower(msg), - msg_nameupper(msg), scope, &dports); + if (!msg_in_group(msg)) { + lower = msg_namelower(msg); + upper = msg_nameupper(msg); + } + tipc_nametbl_mc_translate(net, msg_nametype(msg), lower, upper, + scope, &dports); while (tipc_dest_pop(&dports, NULL, &portid)) { _skb = __pskb_copy(skb, hsz, GFP_ATOMIC); if (_skb) { @@ -895,10 +971,6 @@ exit: kfree_skb(skb); } -static void tipc_sk_top_evt(struct tipc_sock *tsk, struct tipc_event *evt) -{ -} - /** * tipc_sendmsg - send message in connectionless manner * @sock: socket structure @@ -934,6 +1006,7 @@ static int __tipc_sendmsg(struct 
socket *sock, struct msghdr *m, size_t dlen) long timeout = sock_sndtimeo(sk, m->msg_flags & MSG_DONTWAIT); struct list_head *clinks = &tsk->cong_links; bool syn = !tipc_sk_type_connectionless(sk); + struct tipc_group *grp = tsk->group; struct tipc_msg *hdr = &tsk->phdr; struct tipc_name_seq *seq; struct sk_buff_head pkts; @@ -944,6 +1017,9 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) if (unlikely(dlen > TIPC_MAX_USER_MSG_SIZE)) return -EMSGSIZE; + if (unlikely(grp)) + return tipc_send_group_bcast(sock, m, dlen, timeout); + if (unlikely(!dest)) { dest = &tsk->peer; if (!syn || dest->family != AF_TIPC) @@ -1543,6 +1619,7 @@ static void tipc_sk_proto_rcv(struct sock *sk, struct sk_buff *skb = __skb_dequeue(inputq); struct tipc_sock *tsk = tipc_sk(sk); struct tipc_msg *hdr = buf_msg(skb); + struct tipc_group *grp = tsk->group; switch (msg_user(hdr)) { case CONN_MANAGER: @@ -1553,8 +1630,12 @@ static void tipc_sk_proto_rcv(struct sock *sk, tsk->cong_link_cnt--; sk->sk_write_space(sk); break; + case GROUP_PROTOCOL: + tipc_group_proto_rcv(grp, hdr, xmitq); + break; case TOP_SRV: - tipc_sk_top_evt(tsk, (void *)msg_data(hdr)); + tipc_group_member_evt(tsk->group, skb, xmitq); + skb = NULL; break; default: break; @@ -1699,6 +1780,7 @@ static void tipc_sk_filter_rcv(struct sock *sk, struct sk_buff *skb, { bool sk_conn = !tipc_sk_type_connectionless(sk); struct tipc_sock *tsk = tipc_sk(sk); + struct tipc_group *grp = tsk->group; struct tipc_msg *hdr = buf_msg(skb); struct net *net = sock_net(sk); struct sk_buff_head inputq; @@ -1710,15 +1792,19 @@ static void tipc_sk_filter_rcv(struct sock *sk, struct sk_buff *skb, if (unlikely(!msg_isdata(hdr))) tipc_sk_proto_rcv(sk, &inputq, xmitq); - else if (unlikely(msg_type(hdr) > TIPC_DIRECT_MSG)) + else if (unlikely(msg_type(hdr) > TIPC_GRP_BCAST_MSG)) return kfree_skb(skb); + if (unlikely(grp)) + tipc_group_filter_msg(grp, &inputq, xmitq); + /* Validate and add to receive buffer if there is space */ while ((skb = __skb_dequeue(&inputq))) { hdr = buf_msg(skb); limit = rcvbuf_limit(sk, skb); if ((sk_conn && !tipc_sk_filter_connect(tsk, skb)) || - (!sk_conn && msg_connected(hdr))) + (!sk_conn && msg_connected(hdr)) || + (!grp && msg_in_group(hdr))) err = TIPC_ERR_NO_PORT; else if (sk_rmem_alloc_get(sk) + skb->truesize >= limit) err = TIPC_ERR_OVERLOAD; @@ -1837,7 +1923,6 @@ void tipc_sk_rcv(struct net *net, struct sk_buff_head *inputq) sock_put(sk); continue; } - /* No destination socket => dequeue skb if still there */ skb = tipc_skb_dequeue(inputq, dport); if (!skb) @@ -1905,6 +1990,11 @@ static int tipc_connect(struct socket *sock, struct sockaddr *dest, lock_sock(sk); + if (tsk->group) { + res = -EINVAL; + goto exit; + } + if (dst->family == AF_UNSPEC) { memset(&tsk->peer, 0, sizeof(struct sockaddr_tipc)); if (!tipc_sk_type_connectionless(sk)) @@ -2341,6 +2431,52 @@ void tipc_sk_rht_destroy(struct net *net) rhashtable_destroy(&tn->sk_rht); } +static int tipc_sk_join(struct tipc_sock *tsk, struct tipc_group_req *mreq) +{ + struct net *net = sock_net(&tsk->sk); + u32 domain = addr_domain(net, mreq->scope); + struct tipc_group *grp = tsk->group; + struct tipc_msg *hdr = &tsk->phdr; + struct tipc_name_seq seq; + int rc; + + if (mreq->type < TIPC_RESERVED_TYPES) + return -EACCES; + if (grp) + return -EACCES; + grp = tipc_group_create(net, tsk->portid, mreq); + if (!grp) + return -ENOMEM; + tsk->group = grp; + msg_set_lookup_scope(hdr, mreq->scope); + msg_set_nametype(hdr, mreq->type); + msg_set_dest_droppable(hdr, true); + 
seq.type = mreq->type; + seq.lower = mreq->instance; + seq.upper = seq.lower; + tipc_nametbl_build_group(net, grp, mreq->type, domain); + rc = tipc_sk_publish(tsk, mreq->scope, &seq); + if (rc) + tipc_group_delete(net, grp); + return rc; +} + +static int tipc_sk_leave(struct tipc_sock *tsk) +{ + struct net *net = sock_net(&tsk->sk); + struct tipc_group *grp = tsk->group; + struct tipc_name_seq seq; + int scope; + + if (!grp) + return -EINVAL; + tipc_group_self(grp, &seq, &scope); + tipc_group_delete(net, grp); + tsk->group = NULL; + tipc_sk_withdraw(tsk, scope, &seq); + return 0; +} + /** * tipc_setsockopt - set socket option * @sock: socket structure @@ -2359,6 +2495,7 @@ static int tipc_setsockopt(struct socket *sock, int lvl, int opt, { struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); + struct tipc_group_req mreq; u32 value = 0; int res = 0; @@ -2374,9 +2511,14 @@ static int tipc_setsockopt(struct socket *sock, int lvl, int opt, case TIPC_CONN_TIMEOUT: if (ol < sizeof(value)) return -EINVAL; - res = get_user(value, (u32 __user *)ov); - if (res) - return res; + if (get_user(value, (u32 __user *)ov)) + return -EFAULT; + break; + case TIPC_GROUP_JOIN: + if (ol < sizeof(mreq)) + return -EINVAL; + if (copy_from_user(&mreq, ov, sizeof(mreq))) + return -EFAULT; break; default: if (ov || ol) @@ -2409,6 +2551,12 @@ static int tipc_setsockopt(struct socket *sock, int lvl, int opt, tsk->mc_method.rcast = true; tsk->mc_method.mandatory = true; break; + case TIPC_GROUP_JOIN: + res = tipc_sk_join(tsk, &mreq); + break; + case TIPC_GROUP_LEAVE: + res = tipc_sk_leave(tsk); + break; default: res = -EINVAL; } @@ -2436,7 +2584,8 @@ static int tipc_getsockopt(struct socket *sock, int lvl, int opt, { struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); - int len; + struct tipc_name_seq seq; + int len, scope; u32 value; int res; @@ -2470,6 +2619,12 @@ static int tipc_getsockopt(struct socket *sock, int lvl, int opt, case TIPC_SOCK_RECVQ_DEPTH: value = skb_queue_len(&sk->sk_receive_queue); break; + case TIPC_GROUP_JOIN: + seq.type = 0; + if (tsk->group) + tipc_group_self(tsk->group, &seq, &scope); + value = seq.type; + break; default: res = -EINVAL; } -- cgit v1.2.3 From ae236fb208a6fbbd2e7a6913385e8fb78ac807f8 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Fri, 13 Oct 2017 11:04:25 +0200 Subject: tipc: receive group membership events via member socket Like with any other service, group members' availability can be subscribed for by connecting to the topology server. However, because the events arrive via a different socket than the member socket, there is a real risk that membership events may arrive out of sync with the actual JOIN/LEAVE action. I.e., it is possible to receive the first messages from a new member before the corresponding JOIN event arrives, just as it is possible to receive the last messages from a leaving member after the LEAVE event has already been received. Since each member socket is internally also subscribing for membership events, we now fix this problem by passing those events on to the user via the member socket. We leverage the already present member synchronization protocol to guarantee correct message/event order. An event is delivered to the user as an empty message where the two source addresses identify the new/lost member. Furthermore, we set the MSG_OOB bit in the message flags to mark it as an event. If the event is an indication about a member loss we also set the MSG_EOR bit, so it can be distinguished from a member addition event.
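A hypothetical sketch of consuming these events on the member socket follows, assuming the group was joined with the TIPC_GROUP_MEMBER_EVTS flag this patch adds below; the printouts are illustrative:

#include <stdio.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <linux/tipc.h>

static void handle_one_message(int sd)
{
	/* Per the commit message, two source addresses identify the member */
	struct sockaddr_tipc member[2];
	char buf[1024];
	struct iovec iov = { buf, sizeof(buf) };
	struct msghdr msg = {
		.msg_name = member,
		.msg_namelen = sizeof(member),
		.msg_iov = &iov,
		.msg_iovlen = 1,
	};

	if (recvmsg(sd, &msg, 0) < 0)
		return;

	if (!(msg.msg_flags & MSG_OOB))
		return;	/* ordinary group data message, not an event */

	/* Event messages are empty; MSG_EOR marks a member loss */
	if (msg.msg_flags & MSG_EOR)
		printf("member lost\n");
	else
		printf("member joined\n");
}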
Signed-off-by: Jon Maloy Acked-by: Ying Xue Signed-off-by: David S. Miller --- include/uapi/linux/tipc.h | 1 + net/tipc/group.c | 60 +++++++++++++++++++++++++++++++++++++---------- net/tipc/group.h | 2 ++ net/tipc/msg.h | 22 +++++++++++++++-- net/tipc/socket.c | 49 ++++++++++++++++++++++++-------------- 5 files changed, 101 insertions(+), 33 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/tipc.h b/include/uapi/linux/tipc.h index 5f7b2c4a09ab..ef41c11a7f38 100644 --- a/include/uapi/linux/tipc.h +++ b/include/uapi/linux/tipc.h @@ -238,6 +238,7 @@ struct sockaddr_tipc { * Flag values */ #define TIPC_GROUP_LOOPBACK 0x1 /* Receive copy of sent msg when match */ +#define TIPC_GROUP_MEMBER_EVTS 0x2 /* Receive membership events in socket */ struct tipc_group_req { __u32 type; /* group id */ diff --git a/net/tipc/group.c b/net/tipc/group.c index beb214a3420c..1bfa9348b26d 100644 --- a/net/tipc/group.c +++ b/net/tipc/group.c @@ -59,6 +59,7 @@ enum mbr_state { struct tipc_member { struct rb_node tree_node; struct list_head list; + struct sk_buff *event_msg; u32 node; u32 port; u32 instance; @@ -79,6 +80,7 @@ struct tipc_group { u16 member_cnt; u16 bc_snd_nxt; bool loopback; + bool events; }; static void tipc_group_proto_xmit(struct tipc_group *grp, struct tipc_member *m, @@ -117,6 +119,7 @@ struct tipc_group *tipc_group_create(struct net *net, u32 portid, grp->instance = mreq->instance; grp->scope = mreq->scope; grp->loopback = mreq->flags & TIPC_GROUP_LOOPBACK; + grp->events = mreq->flags & TIPC_GROUP_MEMBER_EVTS; if (tipc_topsrv_kern_subscr(net, portid, type, 0, ~0, &grp->subid)) return grp; kfree(grp); @@ -279,6 +282,13 @@ void tipc_group_filter_msg(struct tipc_group *grp, struct sk_buff_head *inputq, if (!msg_in_group(hdr)) goto drop; + if (mtyp == TIPC_GRP_MEMBER_EVT) { + if (!grp->events) + goto drop; + __skb_queue_tail(inputq, skb); + return; + } + m = tipc_group_find_member(grp, node, port); if (!tipc_group_is_receiver(m)) goto drop; @@ -311,6 +321,7 @@ static void tipc_group_proto_xmit(struct tipc_group *grp, struct tipc_member *m, } void tipc_group_proto_rcv(struct tipc_group *grp, struct tipc_msg *hdr, + struct sk_buff_head *inputq, struct sk_buff_head *xmitq) { u32 node = msg_orignode(hdr); @@ -332,10 +343,12 @@ void tipc_group_proto_rcv(struct tipc_group *grp, struct tipc_msg *hdr, m->bc_rcv_nxt = msg_grp_bc_syncpt(hdr); /* Wait until PUBLISH event is received */ - if (m->state == MBR_DISCOVERED) + if (m->state == MBR_DISCOVERED) { m->state = MBR_JOINING; - else if (m->state == MBR_PUBLISHED) + } else if (m->state == MBR_PUBLISHED) { m->state = MBR_JOINED; + __skb_queue_tail(inputq, m->event_msg); + } return; case GRP_LEAVE_MSG: if (!m) @@ -347,6 +360,7 @@ void tipc_group_proto_rcv(struct tipc_group *grp, struct tipc_msg *hdr, return; } /* Otherwise deliver already received WITHDRAW event */ + __skb_queue_tail(inputq, m->event_msg); tipc_group_delete_member(grp, m); return; default: @@ -354,16 +368,17 @@ void tipc_group_proto_rcv(struct tipc_group *grp, struct tipc_msg *hdr, } } -/* tipc_group_member_evt() - receive and handle a member up/down event - */ void tipc_group_member_evt(struct tipc_group *grp, struct sk_buff *skb, + struct sk_buff_head *inputq, struct sk_buff_head *xmitq) { struct tipc_msg *hdr = buf_msg(skb); struct tipc_event *evt = (void *)msg_data(hdr); + u32 instance = evt->found_lower; u32 node = evt->port.node; u32 port = evt->port.ref; + int event = evt->event; struct tipc_member *m; struct net *net; u32 self; @@ -376,32 +391,51 @@ void 
tipc_group_member_evt(struct tipc_group *grp, if (!grp->loopback && node == self && port == grp->portid) goto drop; + /* Convert message before delivery to user */ + msg_set_hdr_sz(hdr, GROUP_H_SIZE); + msg_set_user(hdr, TIPC_CRITICAL_IMPORTANCE); + msg_set_type(hdr, TIPC_GRP_MEMBER_EVT); + msg_set_origport(hdr, port); + msg_set_orignode(hdr, node); + msg_set_nametype(hdr, grp->type); + msg_set_grp_evt(hdr, event); + m = tipc_group_find_member(grp, node, port); - if (evt->event == TIPC_PUBLISHED) { + if (event == TIPC_PUBLISHED) { if (!m) m = tipc_group_create_member(grp, node, port, MBR_DISCOVERED); if (!m) goto drop; - /* Wait if JOIN message not yet received */ - if (m->state == MBR_DISCOVERED) + /* Hold back event if JOIN message not yet received */ + if (m->state == MBR_DISCOVERED) { + m->event_msg = skb; m->state = MBR_PUBLISHED; - else + } else { + __skb_queue_tail(inputq, skb); m->state = MBR_JOINED; - m->instance = evt->found_lower; + } + m->instance = instance; + TIPC_SKB_CB(skb)->orig_member = m->instance; tipc_group_proto_xmit(grp, m, GRP_JOIN_MSG, xmitq); - } else if (evt->event == TIPC_WITHDRAWN) { + } else if (event == TIPC_WITHDRAWN) { if (!m) goto drop; - /* Keep back event if more messages might be expected */ - if (m->state != MBR_LEAVING && tipc_node_is_up(net, node)) + TIPC_SKB_CB(skb)->orig_member = m->instance; + + /* Hold back event if more messages might be expected */ + if (m->state != MBR_LEAVING && tipc_node_is_up(net, node)) { + m->event_msg = skb; m->state = MBR_LEAVING; - else + } else { + __skb_queue_tail(inputq, skb); tipc_group_delete_member(grp, m); + } } + return; drop: kfree_skb(skb); } diff --git a/net/tipc/group.h b/net/tipc/group.h index 9bdf4479fc03..5d3f10d28967 100644 --- a/net/tipc/group.h +++ b/net/tipc/group.h @@ -54,9 +54,11 @@ void tipc_group_filter_msg(struct tipc_group *grp, struct sk_buff_head *xmitq); void tipc_group_member_evt(struct tipc_group *grp, struct sk_buff *skb, + struct sk_buff_head *inputq, struct sk_buff_head *xmitq); void tipc_group_proto_rcv(struct tipc_group *grp, struct tipc_msg *hdr, + struct sk_buff_head *inputq, struct sk_buff_head *xmitq); void tipc_group_update_bc_members(struct tipc_group *grp); u16 tipc_group_bc_snd_nxt(struct tipc_group *grp); diff --git a/net/tipc/msg.h b/net/tipc/msg.h index e438716d2372..1b527b154e46 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -65,7 +65,8 @@ struct plist; #define TIPC_MCAST_MSG 1 #define TIPC_NAMED_MSG 2 #define TIPC_DIRECT_MSG 3 -#define TIPC_GRP_BCAST_MSG 4 +#define TIPC_GRP_MEMBER_EVT 4 +#define TIPC_GRP_BCAST_MSG 5 /* * Internal message users @@ -258,7 +259,14 @@ static inline void msg_set_type(struct tipc_msg *m, u32 n) static inline int msg_in_group(struct tipc_msg *m) { - return (msg_type(m) == TIPC_GRP_BCAST_MSG); + int mtyp = msg_type(m); + + return (mtyp == TIPC_GRP_BCAST_MSG) || (mtyp == TIPC_GRP_MEMBER_EVT); +} + +static inline bool msg_is_grp_evt(struct tipc_msg *m) +{ + return msg_type(m) == TIPC_GRP_MEMBER_EVT; } static inline u32 msg_named(struct tipc_msg *m) @@ -824,6 +832,16 @@ static inline void msg_set_grp_bc_syncpt(struct tipc_msg *m, u16 n) /* Word 10 */ +static inline u16 msg_grp_evt(struct tipc_msg *m) +{ + return msg_bits(m, 10, 0, 0x3); +} + +static inline void msg_set_grp_evt(struct tipc_msg *m, int n) +{ + msg_set_bits(m, 10, 0, 0x3, n); +} + static inline u16 msg_grp_bc_seqno(struct tipc_msg *m) { return msg_bits(m, 10, 16, 0xffff); diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 25ecf1201527..0a2eac309177 100644 --- 
a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -709,41 +709,43 @@ static unsigned int tipc_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; + struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); struct tipc_sock *tsk = tipc_sk(sk); struct tipc_group *grp = tsk->group; - u32 mask = 0; + u32 revents = 0; sock_poll_wait(file, sk_sleep(sk), wait); if (sk->sk_shutdown & RCV_SHUTDOWN) - mask |= POLLRDHUP | POLLIN | POLLRDNORM; + revents |= POLLRDHUP | POLLIN | POLLRDNORM; if (sk->sk_shutdown == SHUTDOWN_MASK) - mask |= POLLHUP; + revents |= POLLHUP; switch (sk->sk_state) { case TIPC_ESTABLISHED: if (!tsk->cong_link_cnt && !tsk_conn_cong(tsk)) - mask |= POLLOUT; + revents |= POLLOUT; /* fall thru' */ case TIPC_LISTEN: case TIPC_CONNECTING: - if (!skb_queue_empty(&sk->sk_receive_queue)) - mask |= (POLLIN | POLLRDNORM); + if (skb) + revents |= POLLIN | POLLRDNORM; break; case TIPC_OPEN: if (!grp || tipc_group_size(grp)) if (!tsk->cong_link_cnt) - mask |= POLLOUT; - if (tipc_sk_type_connectionless(sk) && - (!skb_queue_empty(&sk->sk_receive_queue))) - mask |= (POLLIN | POLLRDNORM); + revents |= POLLOUT; + if (!tipc_sk_type_connectionless(sk)) + break; + if (!skb) + break; + revents |= POLLIN | POLLRDNORM; break; case TIPC_DISCONNECTING: - mask = (POLLIN | POLLRDNORM | POLLHUP); + revents = POLLIN | POLLRDNORM | POLLHUP; break; } - - return mask; + return revents; } /** @@ -1415,11 +1417,12 @@ static int tipc_recvmsg(struct socket *sock, struct msghdr *m, size_t buflen, int flags) { struct sock *sk = sock->sk; - struct tipc_sock *tsk = tipc_sk(sk); - struct sk_buff *skb; - struct tipc_msg *hdr; bool connected = !tipc_sk_type_connectionless(sk); + struct tipc_sock *tsk = tipc_sk(sk); int rc, err, hlen, dlen, copy; + struct tipc_msg *hdr; + struct sk_buff *skb; + bool grp_evt; long timeout; /* Catch invalid receive requests */ @@ -1443,6 +1446,7 @@ static int tipc_recvmsg(struct socket *sock, struct msghdr *m, dlen = msg_data_sz(hdr); hlen = msg_hdr_sz(hdr); err = msg_errcode(hdr); + grp_evt = msg_is_grp_evt(hdr); if (likely(dlen || err)) break; tsk_advance_rx_queue(sk); @@ -1469,11 +1473,20 @@ static int tipc_recvmsg(struct socket *sock, struct msghdr *m, if (unlikely(rc)) goto exit; + /* Mark message as group event if applicable */ + if (unlikely(grp_evt)) { + if (msg_grp_evt(hdr) == TIPC_WITHDRAWN) + m->msg_flags |= MSG_EOR; + m->msg_flags |= MSG_OOB; + copy = 0; + } + /* Caption of data or error code/rejected data was successful */ if (unlikely(flags & MSG_PEEK)) goto exit; tsk_advance_rx_queue(sk); + if (likely(!connected)) goto exit; @@ -1648,10 +1661,10 @@ static void tipc_sk_proto_rcv(struct sock *sk, sk->sk_write_space(sk); break; case GROUP_PROTOCOL: - tipc_group_proto_rcv(grp, hdr, xmitq); + tipc_group_proto_rcv(grp, hdr, inputq, xmitq); break; case TOP_SRV: - tipc_group_member_evt(tsk->group, skb, xmitq); + tipc_group_member_evt(tsk->group, skb, inputq, xmitq); skb = NULL; break; default: -- cgit v1.2.3 From 4e8b86c062695454df0b76f3fee4fab8dc4bb716 Mon Sep 17 00:00:00 2001 From: Amritha Nambiar Date: Thu, 7 Sep 2017 04:00:06 -0700 Subject: mqprio: Introduce new hardware offload mode and shaper in mqprio The offload types currently supported in mqprio are 0 (no offload) and 1 (offload only TCs) by setting these values for the 'hw' option. If offloads are supported by setting the 'hw' option to 1, the default offload mode is 'dcb' where only the TC values are offloaded to the device. 
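As an illustration of the driver side, a hypothetical sketch based on the structures this patch introduces below (the my_hw_*() helpers are made-up placeholders for device-specific programming) shows how an ndo_setup_tc() handler might dispatch on the new mode:

static int my_setup_tc(struct net_device *dev, enum tc_setup_type type,
		       void *type_data)
{
	struct tc_mqprio_qopt_offload *mqprio = type_data;
	int i;

	if (type != TC_SETUP_MQPRIO)
		return -EOPNOTSUPP;

	switch (mqprio->mode) {
	case TC_MQPRIO_MODE_DCB:
		/* legacy behaviour: only the TC values are offloaded */
		return my_hw_set_num_tcs(dev, mqprio->qopt.num_tc);
	case TC_MQPRIO_MODE_CHANNEL:
		/* full queue configuration, plus optional rate limits */
		if (mqprio->shaper == TC_MQPRIO_SHAPER_BW_RATE &&
		    (mqprio->flags & TC_MQPRIO_F_MAX_RATE)) {
			for (i = 0; i < mqprio->qopt.num_tc; i++)
				my_hw_set_tc_max_rate(dev, i,
						      mqprio->max_rate[i]);
		}
		return my_hw_setup_channels(dev, &mqprio->qopt);
	default:
		return -EOPNOTSUPP;
	}
}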
This patch introduces a new hardware offload mode called 'channel' with 'hw' set to 1 in mqprio which makes full use of the mqprio options, the TCs, the queue configurations and the QoS parameters for the TCs. This is achieved through a new netlink attribute for the 'mode' option which takes values such as 'dcb' (default) and 'channel'. The 'channel' mode also supports QoS attributes for traffic class such as minimum and maximum values for bandwidth rate limits. This patch enables configuring additional HW shaper attributes associated with a traffic class. Currently the shaper for bandwidth rate limiting is supported which takes options such as minimum and maximum bandwidth rates and are offloaded to the hardware in the 'channel' mode. The min and max limits for bandwidth rates are provided by the user along with the TCs and the queue configurations when creating the mqprio qdisc. The interface can be extended to support new HW shapers in future through the 'shaper' attribute. Introduces a new data structure 'tc_mqprio_qopt_offload' for offloading mqprio queue options and use this to be shared between the kernel and device driver. This contains a copy of the existing data structure for mqprio queue options. This new data structure can be extended when adding new attributes for traffic class such as mode, shaper, shaper parameters (bandwidth rate limits). The existing data structure for mqprio queue options will be shared between the kernel and userspace. Example: queues 4@0 4@4 hw 1 mode channel shaper bw_rlimit\ min_rate 1Gbit 2Gbit max_rate 4Gbit 5Gbit To dump the bandwidth rates: qdisc mqprio 804a: root tc 2 map 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 queues:(0:3) (4:7) mode:channel shaper:bw_rlimit min_rate:1Gbit 2Gbit max_rate:4Gbit 5Gbit Signed-off-by: Amritha Nambiar Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- include/net/pkt_cls.h | 9 ++ include/uapi/linux/pkt_sched.h | 32 +++++++ net/sched/sch_mqprio.c | 183 +++++++++++++++++++++++++++++++++++++++-- 3 files changed, 215 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index f5263743076b..60d39789e4f0 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -546,6 +546,15 @@ struct tc_cls_bpf_offload { u32 gen_flags; }; +struct tc_mqprio_qopt_offload { + /* struct tc_mqprio_qopt must always be the first element */ + struct tc_mqprio_qopt qopt; + u16 mode; + u16 shaper; + u32 flags; + u64 min_rate[TC_QOPT_MAX_QUEUE]; + u64 max_rate[TC_QOPT_MAX_QUEUE]; +}; /* This structure holds cookie structure that is passed from user * to the kernel for actions and classifiers diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 099bf5528fed..e95b5c9b9fad 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -625,6 +625,22 @@ enum { #define TC_MQPRIO_HW_OFFLOAD_MAX (__TC_MQPRIO_HW_OFFLOAD_MAX - 1) +enum { + TC_MQPRIO_MODE_DCB, + TC_MQPRIO_MODE_CHANNEL, + __TC_MQPRIO_MODE_MAX +}; + +#define __TC_MQPRIO_MODE_MAX (__TC_MQPRIO_MODE_MAX - 1) + +enum { + TC_MQPRIO_SHAPER_DCB, + TC_MQPRIO_SHAPER_BW_RATE, /* Add new shapers below */ + __TC_MQPRIO_SHAPER_MAX +}; + +#define __TC_MQPRIO_SHAPER_MAX (__TC_MQPRIO_SHAPER_MAX - 1) + struct tc_mqprio_qopt { __u8 num_tc; __u8 prio_tc_map[TC_QOPT_BITMASK + 1]; @@ -633,6 +649,22 @@ struct tc_mqprio_qopt { __u16 offset[TC_QOPT_MAX_QUEUE]; }; +#define TC_MQPRIO_F_MODE 0x1 +#define TC_MQPRIO_F_SHAPER 0x2 +#define TC_MQPRIO_F_MIN_RATE 0x4 +#define TC_MQPRIO_F_MAX_RATE 0x8 + +enum { + 
TCA_MQPRIO_UNSPEC, + TCA_MQPRIO_MODE, + TCA_MQPRIO_SHAPER, + TCA_MQPRIO_MIN_RATE64, + TCA_MQPRIO_MAX_RATE64, + __TCA_MQPRIO_MAX, +}; + +#define TCA_MQPRIO_MAX (__TCA_MQPRIO_MAX - 1) + /* SFB */ enum { diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index 6bcdfe6e7b63..f1ae9be83934 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -18,10 +18,16 @@ #include #include #include +#include struct mqprio_sched { struct Qdisc **qdiscs; + u16 mode; + u16 shaper; int hw_offload; + u32 flags; + u64 min_rate[TC_QOPT_MAX_QUEUE]; + u64 max_rate[TC_QOPT_MAX_QUEUE]; }; static void mqprio_destroy(struct Qdisc *sch) @@ -39,9 +45,17 @@ static void mqprio_destroy(struct Qdisc *sch) } if (priv->hw_offload && dev->netdev_ops->ndo_setup_tc) { - struct tc_mqprio_qopt mqprio = {}; + struct tc_mqprio_qopt_offload mqprio = { { 0 } }; - dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_MQPRIO, &mqprio); + switch (priv->mode) { + case TC_MQPRIO_MODE_DCB: + case TC_MQPRIO_MODE_CHANNEL: + dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_MQPRIO, + &mqprio); + break; + default: + return; + } } else { netdev_set_num_tc(dev, 0); } @@ -97,6 +111,26 @@ static int mqprio_parse_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt) return 0; } +static const struct nla_policy mqprio_policy[TCA_MQPRIO_MAX + 1] = { + [TCA_MQPRIO_MODE] = { .len = sizeof(u16) }, + [TCA_MQPRIO_SHAPER] = { .len = sizeof(u16) }, + [TCA_MQPRIO_MIN_RATE64] = { .type = NLA_NESTED }, + [TCA_MQPRIO_MAX_RATE64] = { .type = NLA_NESTED }, +}; + +static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla, + const struct nla_policy *policy, int len) +{ + int nested_len = nla_len(nla) - NLA_ALIGN(len); + + if (nested_len >= nla_attr_size(0)) + return nla_parse(tb, maxtype, nla_data(nla) + NLA_ALIGN(len), + nested_len, policy, NULL); + + memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1)); + return 0; +} + static int mqprio_init(struct Qdisc *sch, struct nlattr *opt) { struct net_device *dev = qdisc_dev(sch); @@ -105,6 +139,10 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt) struct Qdisc *qdisc; int i, err = -EOPNOTSUPP; struct tc_mqprio_qopt *qopt = NULL; + struct nlattr *tb[TCA_MQPRIO_MAX + 1]; + struct nlattr *attr; + int rem; + int len = nla_len(opt) - NLA_ALIGN(sizeof(*qopt)); BUILD_BUG_ON(TC_MAX_QUEUE != TC_QOPT_MAX_QUEUE); BUILD_BUG_ON(TC_BITMASK != TC_QOPT_BITMASK); @@ -122,6 +160,58 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt) if (mqprio_parse_opt(dev, qopt)) return -EINVAL; + if (len > 0) { + err = parse_attr(tb, TCA_MQPRIO_MAX, opt, mqprio_policy, + sizeof(*qopt)); + if (err < 0) + return err; + + if (!qopt->hw) + return -EINVAL; + + if (tb[TCA_MQPRIO_MODE]) { + priv->flags |= TC_MQPRIO_F_MODE; + priv->mode = *(u16 *)nla_data(tb[TCA_MQPRIO_MODE]); + } + + if (tb[TCA_MQPRIO_SHAPER]) { + priv->flags |= TC_MQPRIO_F_SHAPER; + priv->shaper = *(u16 *)nla_data(tb[TCA_MQPRIO_SHAPER]); + } + + if (tb[TCA_MQPRIO_MIN_RATE64]) { + if (priv->shaper != TC_MQPRIO_SHAPER_BW_RATE) + return -EINVAL; + i = 0; + nla_for_each_nested(attr, tb[TCA_MQPRIO_MIN_RATE64], + rem) { + if (nla_type(attr) != TCA_MQPRIO_MIN_RATE64) + return -EINVAL; + if (i >= qopt->num_tc) + break; + priv->min_rate[i] = *(u64 *)nla_data(attr); + i++; + } + priv->flags |= TC_MQPRIO_F_MIN_RATE; + } + + if (tb[TCA_MQPRIO_MAX_RATE64]) { + if (priv->shaper != TC_MQPRIO_SHAPER_BW_RATE) + return -EINVAL; + i = 0; + nla_for_each_nested(attr, tb[TCA_MQPRIO_MAX_RATE64], + rem) { + if (nla_type(attr) != TCA_MQPRIO_MAX_RATE64) + return 
-EINVAL; + if (i >= qopt->num_tc) + break; + priv->max_rate[i] = *(u64 *)nla_data(attr); + i++; + } + priv->flags |= TC_MQPRIO_F_MAX_RATE; + } + } + /* pre-allocate qdisc, attachment can't fail */ priv->qdiscs = kcalloc(dev->num_tx_queues, sizeof(priv->qdiscs[0]), GFP_KERNEL); @@ -146,14 +236,36 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt) * supplied and verified mapping */ if (qopt->hw) { - struct tc_mqprio_qopt mqprio = *qopt; + struct tc_mqprio_qopt_offload mqprio = {.qopt = *qopt}; - err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_MQPRIO, + switch (priv->mode) { + case TC_MQPRIO_MODE_DCB: + if (priv->shaper != TC_MQPRIO_SHAPER_DCB) + return -EINVAL; + break; + case TC_MQPRIO_MODE_CHANNEL: + mqprio.flags = priv->flags; + if (priv->flags & TC_MQPRIO_F_MODE) + mqprio.mode = priv->mode; + if (priv->flags & TC_MQPRIO_F_SHAPER) + mqprio.shaper = priv->shaper; + if (priv->flags & TC_MQPRIO_F_MIN_RATE) + for (i = 0; i < mqprio.qopt.num_tc; i++) + mqprio.min_rate[i] = priv->min_rate[i]; + if (priv->flags & TC_MQPRIO_F_MAX_RATE) + for (i = 0; i < mqprio.qopt.num_tc; i++) + mqprio.max_rate[i] = priv->max_rate[i]; + break; + default: + return -EINVAL; + } + err = dev->netdev_ops->ndo_setup_tc(dev, + TC_SETUP_MQPRIO, &mqprio); if (err) return err; - priv->hw_offload = mqprio.hw; + priv->hw_offload = mqprio.qopt.hw; } else { netdev_set_num_tc(dev, qopt->num_tc); for (i = 0; i < qopt->num_tc; i++) @@ -223,11 +335,51 @@ static int mqprio_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new, return 0; } +static int dump_rates(struct mqprio_sched *priv, + struct tc_mqprio_qopt *opt, struct sk_buff *skb) +{ + struct nlattr *nest; + int i; + + if (priv->flags & TC_MQPRIO_F_MIN_RATE) { + nest = nla_nest_start(skb, TCA_MQPRIO_MIN_RATE64); + if (!nest) + goto nla_put_failure; + + for (i = 0; i < opt->num_tc; i++) { + if (nla_put(skb, TCA_MQPRIO_MIN_RATE64, + sizeof(priv->min_rate[i]), + &priv->min_rate[i])) + goto nla_put_failure; + } + nla_nest_end(skb, nest); + } + + if (priv->flags & TC_MQPRIO_F_MAX_RATE) { + nest = nla_nest_start(skb, TCA_MQPRIO_MAX_RATE64); + if (!nest) + goto nla_put_failure; + + for (i = 0; i < opt->num_tc; i++) { + if (nla_put(skb, TCA_MQPRIO_MAX_RATE64, + sizeof(priv->max_rate[i]), + &priv->max_rate[i])) + goto nla_put_failure; + } + nla_nest_end(skb, nest); + } + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -1; +} + static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb) { struct net_device *dev = qdisc_dev(sch); struct mqprio_sched *priv = qdisc_priv(sch); - unsigned char *b = skb_tail_pointer(skb); + struct nlattr *nla = (struct nlattr *)skb_tail_pointer(skb); struct tc_mqprio_qopt opt = { 0 }; struct Qdisc *qdisc; unsigned int i; @@ -258,12 +410,25 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb) opt.offset[i] = dev->tc_to_txq[i].offset; } - if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) + if (nla_put(skb, TCA_OPTIONS, NLA_ALIGN(sizeof(opt)), &opt)) + goto nla_put_failure; + + if ((priv->flags & TC_MQPRIO_F_MODE) && + nla_put_u16(skb, TCA_MQPRIO_MODE, priv->mode)) + goto nla_put_failure; + + if ((priv->flags & TC_MQPRIO_F_SHAPER) && + nla_put_u16(skb, TCA_MQPRIO_SHAPER, priv->shaper)) + goto nla_put_failure; + + if ((priv->flags & TC_MQPRIO_F_MIN_RATE || + priv->flags & TC_MQPRIO_F_MAX_RATE) && + (dump_rates(priv, &opt, skb) != 0)) goto nla_put_failure; - return skb->len; + return nla_nest_end(skb, nla); nla_put_failure: - nlmsg_trim(skb, b); + nlmsg_trim(skb, nla); return -1; } -- cgit v1.2.3 From 
17a9422de78c3a59b490b400f555635c477f1476 Mon Sep 17 00:00:00 2001 From: Alan Brady Date: Wed, 11 Oct 2017 14:49:43 -0700 Subject: i40e/i40evf: don't trust VF to reset itself When using 'ethtool -L' on a VF to change number of requested queues from PF, we shouldn't trust the VF to reset itself after making the request. Doing it that way opens the door for a potentially malicious VF to do nasty things to the PF which should never be the case. This makes it such that after VF makes a successful request, PF will then reset the VF to institute required changes. Only if the request fails will PF send a message back to VF letting it know the request was unsuccessful. Testing-hints: There should be no real functional changes. This is simply hardening against a potentially malicious VF. Signed-off-by: Alan Brady Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 9 +++++++-- drivers/net/ethernet/intel/i40evf/i40evf_virtchnl.c | 7 +++---- include/linux/avf/virtchnl.h | 4 ++-- 3 files changed, 12 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index ce0981e2f605..f8a794b72462 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -2045,8 +2045,9 @@ error_param: * @msglen: msg length * * VFs get a default number of queues but can use this message to request a - * different number. Will respond with either the number requested or the - * maximum we can support. + * different number. If the request is successful, PF will reset the VF and + * return 0. If unsuccessful, PF will send message informing VF of number of + * available queues and return result of sending VF a message. 
**/ static int i40e_vc_request_queues_msg(struct i40e_vf *vf, u8 *msg, int msglen) { @@ -2077,7 +2078,11 @@ static int i40e_vc_request_queues_msg(struct i40e_vf *vf, u8 *msg, int msglen) pf->queues_left); vfres->num_queue_pairs = pf->queues_left + cur_pairs; } else { + /* successful request */ vf->num_req_queues = req_pairs; + i40e_vc_notify_vf_reset(vf); + i40e_reset_vf(vf, false); + return 0; } return i40e_vc_send_msg_to_vf(vf, VIRTCHNL_OP_REQUEST_QUEUES, 0, diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_virtchnl.c b/drivers/net/ethernet/intel/i40evf/i40evf_virtchnl.c index 2bb81c39d85f..46c8b8a3907c 100644 --- a/drivers/net/ethernet/intel/i40evf/i40evf_virtchnl.c +++ b/drivers/net/ethernet/intel/i40evf/i40evf_virtchnl.c @@ -407,6 +407,7 @@ int i40evf_request_queues(struct i40evf_adapter *adapter, int num) vfres.num_queue_pairs = num; adapter->current_op = VIRTCHNL_OP_REQUEST_QUEUES; + adapter->flags |= I40EVF_FLAG_REINIT_ITR_NEEDED; return i40evf_send_pf_msg(adapter, VIRTCHNL_OP_REQUEST_QUEUES, (u8 *)&vfres, sizeof(vfres)); } @@ -1098,15 +1099,13 @@ void i40evf_virtchnl_completion(struct i40evf_adapter *adapter, case VIRTCHNL_OP_REQUEST_QUEUES: { struct virtchnl_vf_res_request *vfres = (struct virtchnl_vf_res_request *)msg; - if (vfres->num_queue_pairs == adapter->num_req_queues) { - adapter->flags |= I40EVF_FLAG_REINIT_ITR_NEEDED; - i40evf_schedule_reset(adapter); - } else { + if (vfres->num_queue_pairs != adapter->num_req_queues) { dev_info(&adapter->pdev->dev, "Requested %d queues, PF can support %d\n", adapter->num_req_queues, vfres->num_queue_pairs); adapter->num_req_queues = 0; + adapter->flags &= ~I40EVF_FLAG_REINIT_ITR_NEEDED; } } break; diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h index 60e5d90cb18a..3ce61342fa31 100644 --- a/include/linux/avf/virtchnl.h +++ b/include/linux/avf/virtchnl.h @@ -333,8 +333,8 @@ struct virtchnl_vsi_queue_config_info { * additional queues must be negotiated. This is a best effort request as it * is possible the PF does not have enough queues left to support the request. * If the PF cannot support the number requested it will respond with the - * maximum number it is able to support; otherwise it will respond with the - * number requested. + * maximum number it is able to support. If the request is successful, PF will + * then reset the VF to institute required changes. */ /* VF resource request */ -- cgit v1.2.3 From 7c39afb394c79e72c3795b4a42d55155b34ee073 Mon Sep 17 00:00:00 2001 From: Feras Daoud Date: Tue, 15 Aug 2017 13:46:04 +0300 Subject: net/mlx5: PTP code migration to driver core section PTP code is moved to core section of mlx5 driver in order to share it between ethernet and infiniband. 
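The clock object being moved is built on the kernel's generic cyclecounter/timecounter pair, visible in the en.h hunk below. As a generic sketch of that pattern (not actual mlx5 code; my_read_hw_counter() and the mult/shift values are made-up placeholders):

#include <linux/clocksource.h>
#include <linux/ktime.h>
#include <linux/timecounter.h>

static u64 my_read_hw_counter(const struct cyclecounter *cc)
{
	return 0;	/* would read the device's free-running counter */
}

static struct cyclecounter my_cc = {
	.read = my_read_hw_counter,
	.mask = CLOCKSOURCE_MASK(64),
	.mult = 1 << 10,	/* device-specific scaling in reality */
	.shift = 10,
};
static struct timecounter my_tc;

static void my_clock_init(void)
{
	/* anchor the free-running counter to wall-clock time once */
	timecounter_init(&my_tc, &my_cc, ktime_get_real_ns());
}

static u64 my_cycles_to_ns(u64 raw_cycles)
{
	/* extend a raw counter reading into a full 64-bit ns timestamp,
	 * accounting for counter wrap since the last update
	 */
	return timecounter_cyc2time(&my_tc, raw_cycles);
}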
This movement involves the following changes: - Change mlx5e_ prefix to be mlx5_ - Add clock structs to Core - Add clock object to mlx5_core_dev - Call Init/Uninit clock from core init/cleanup - Rename mlx5e_tstamp to be mlx5_clock Signed-off-by: Feras Daoud Signed-off-by: Eitan Rabin Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 39 +- .../net/ethernet/mellanox/mlx5/core/en_ethtool.c | 7 +- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 95 +++- drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 17 +- drivers/net/ethernet/mellanox/mlx5/core/en_tx.c | 6 +- drivers/net/ethernet/mellanox/mlx5/core/eq.c | 3 +- .../net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c | 3 +- .../net/ethernet/mellanox/mlx5/core/lib/clock.c | 548 +++++++++------------ .../net/ethernet/mellanox/mlx5/core/lib/clock.h | 51 ++ drivers/net/ethernet/mellanox/mlx5/core/main.c | 4 + .../net/ethernet/mellanox/mlx5/core/mlx5_core.h | 1 + include/linux/mlx5/driver.h | 24 + 12 files changed, 416 insertions(+), 382 deletions(-) create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/clock.h (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index cc13d3dbd366..2059122eb089 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -267,28 +267,6 @@ struct mlx5e_dcbx { }; #endif -#define MAX_PIN_NUM 8 -struct mlx5e_pps { - u8 pin_caps[MAX_PIN_NUM]; - struct work_struct out_work; - u64 start[MAX_PIN_NUM]; - u8 enabled; -}; - -struct mlx5e_tstamp { - rwlock_t lock; - struct cyclecounter cycles; - struct timecounter clock; - struct hwtstamp_config hwtstamp_config; - u32 nominal_c_mult; - unsigned long overflow_period; - struct delayed_work overflow_work; - struct mlx5_core_dev *mdev; - struct ptp_clock *ptp; - struct ptp_clock_info ptp_info; - struct mlx5e_pps pps_info; -}; - enum { MLX5E_RQ_STATE_ENABLED, MLX5E_RQ_STATE_AM, @@ -375,9 +353,10 @@ struct mlx5e_txqsq { u8 min_inline_mode; u16 edge; struct device *pdev; - struct mlx5e_tstamp *tstamp; __be32 mkey_be; unsigned long state; + struct hwtstamp_config *tstamp; + struct mlx5_clock *clock; /* control path */ struct mlx5_wq_ctrl wq_ctrl; @@ -543,10 +522,11 @@ struct mlx5e_rq { struct mlx5e_channel *channel; struct device *pdev; struct net_device *netdev; - struct mlx5e_tstamp *tstamp; struct mlx5e_rq_stats stats; struct mlx5e_cq cq; struct mlx5e_page_cache page_cache; + struct hwtstamp_config *tstamp; + struct mlx5_clock *clock; mlx5e_fp_handle_rx_cqe handle_rx_cqe; mlx5e_fp_post_rx_wqes post_wqes; @@ -588,7 +568,7 @@ struct mlx5e_channel { /* control */ struct mlx5e_priv *priv; struct mlx5_core_dev *mdev; - struct mlx5e_tstamp *tstamp; + struct hwtstamp_config *tstamp; int ix; }; @@ -789,7 +769,7 @@ struct mlx5e_priv { struct mlx5_core_dev *mdev; struct net_device *netdev; struct mlx5e_stats stats; - struct mlx5e_tstamp tstamp; + struct hwtstamp_config tstamp; u16 q_counter; #ifdef CONFIG_MLX5_CORE_EN_DCB struct mlx5e_dcbx dcbx; @@ -873,12 +853,6 @@ void mlx5e_ethtool_init_steering(struct mlx5e_priv *priv); void mlx5e_ethtool_cleanup_steering(struct mlx5e_priv *priv); void mlx5e_set_rx_mode_work(struct work_struct *work); -void mlx5e_fill_hwstamp(struct mlx5e_tstamp *clock, u64 timestamp, - struct skb_shared_hwtstamps *hwts); -void mlx5e_timestamp_init(struct mlx5e_priv *priv); -void mlx5e_timestamp_cleanup(struct mlx5e_priv *priv); -void mlx5e_pps_event_handler(struct mlx5e_priv *priv, - struct ptp_clock_event 
*event); int mlx5e_hwstamp_set(struct mlx5e_priv *priv, struct ifreq *ifr); int mlx5e_hwstamp_get(struct mlx5e_priv *priv, struct ifreq *ifr); int mlx5e_modify_rx_cqe_compression_locked(struct mlx5e_priv *priv, bool val); @@ -889,6 +863,7 @@ int mlx5e_vlan_rx_kill_vid(struct net_device *dev, __always_unused __be16 proto, u16 vid); void mlx5e_enable_vlan_filter(struct mlx5e_priv *priv); void mlx5e_disable_vlan_filter(struct mlx5e_priv *priv); +void mlx5e_timestamp_set(struct mlx5e_priv *priv); struct mlx5e_redirect_rqt_param { bool is_rss; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index d12e9fc0d76b..81a112e40fe3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -1417,14 +1417,15 @@ static int mlx5e_set_pauseparam(struct net_device *netdev, int mlx5e_ethtool_get_ts_info(struct mlx5e_priv *priv, struct ethtool_ts_info *info) { + struct mlx5_core_dev *mdev = priv->mdev; int ret; ret = ethtool_op_get_ts_info(priv->netdev, info); if (ret) return ret; - info->phc_index = priv->tstamp.ptp ? - ptp_clock_index(priv->tstamp.ptp) : -1; + info->phc_index = mdev->clock.ptp ? + ptp_clock_index(mdev->clock.ptp) : -1; if (!MLX5_CAP_GEN(priv->mdev, device_frequency_khz)) return 0; @@ -1754,7 +1755,7 @@ static int set_pflag_rx_cqe_compress(struct net_device *netdev, if (!MLX5_CAP_GEN(mdev, cqe_compression)) return -EOPNOTSUPP; - if (enable && priv->tstamp.hwtstamp_config.rx_filter != HWTSTAMP_FILTER_NONE) { + if (enable && priv->tstamp.rx_filter != HWTSTAMP_FILTER_NONE) { netdev_err(netdev, "Can't enable cqe compression while timestamping is enabled.\n"); return -EINVAL; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index cc11bbbd0309..6df00dd9745a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -373,8 +373,6 @@ static void mlx5e_async_event(struct mlx5_core_dev *mdev, void *vpriv, enum mlx5_dev_event event, unsigned long param) { struct mlx5e_priv *priv = vpriv; - struct ptp_clock_event ptp_event; - struct mlx5_eqe *eqe = NULL; if (!test_bit(MLX5E_STATE_ASYNC_EVENTS_ENABLED, &priv->state)) return; @@ -384,14 +382,6 @@ static void mlx5e_async_event(struct mlx5_core_dev *mdev, void *vpriv, case MLX5_DEV_EVENT_PORT_DOWN: queue_work(priv->wq, &priv->update_carrier_work); break; - case MLX5_DEV_EVENT_PPS: - eqe = (struct mlx5_eqe *)param; - ptp_event.index = eqe->data.pps.pin; - ptp_event.timestamp = - timecounter_cyc2time(&priv->tstamp.clock, - be64_to_cpu(eqe->data.pps.time_stamp)); - mlx5e_pps_event_handler(vpriv, &ptp_event); - break; default: break; } @@ -585,6 +575,7 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c, rq->pdev = c->pdev; rq->netdev = c->netdev; rq->tstamp = c->tstamp; + rq->clock = &mdev->clock; rq->channel = c; rq->ix = c->ix; rq->mdev = mdev; @@ -1123,6 +1114,7 @@ static int mlx5e_alloc_txqsq(struct mlx5e_channel *c, sq->pdev = c->pdev; sq->tstamp = c->tstamp; + sq->clock = &mdev->clock; sq->mkey_be = c->mkey_be; sq->channel = c; sq->txq_ix = txq_ix; @@ -2678,6 +2670,12 @@ void mlx5e_switch_priv_channels(struct mlx5e_priv *priv, netif_carrier_on(netdev); } +void mlx5e_timestamp_set(struct mlx5e_priv *priv) +{ + priv->tstamp.tx_type = HWTSTAMP_TX_OFF; + priv->tstamp.rx_filter = HWTSTAMP_FILTER_NONE; +} + int mlx5e_open_locked(struct net_device *netdev) { struct mlx5e_priv *priv = 
netdev_priv(netdev); @@ -2693,7 +2691,7 @@ int mlx5e_open_locked(struct net_device *netdev) mlx5e_activate_priv_channels(priv); if (priv->profile->update_carrier) priv->profile->update_carrier(priv); - mlx5e_timestamp_init(priv); + mlx5e_timestamp_set(priv); if (priv->profile->update_stats) queue_delayed_work(priv->wq, &priv->update_stats_work, 0); @@ -2731,7 +2729,6 @@ int mlx5e_close_locked(struct net_device *netdev) clear_bit(MLX5E_STATE_OPENED, &priv->state); - mlx5e_timestamp_cleanup(priv); netif_carrier_off(priv->netdev); mlx5e_deactivate_priv_channels(priv); mlx5e_close_channels(&priv->channels); @@ -3403,6 +3400,80 @@ out: return err; } +int mlx5e_hwstamp_set(struct mlx5e_priv *priv, struct ifreq *ifr) +{ + struct hwtstamp_config config; + int err; + + if (!MLX5_CAP_GEN(priv->mdev, device_frequency_khz)) + return -EOPNOTSUPP; + + if (copy_from_user(&config, ifr->ifr_data, sizeof(config))) + return -EFAULT; + + /* TX HW timestamp */ + switch (config.tx_type) { + case HWTSTAMP_TX_OFF: + case HWTSTAMP_TX_ON: + break; + default: + return -ERANGE; + } + + mutex_lock(&priv->state_lock); + /* RX HW timestamp */ + switch (config.rx_filter) { + case HWTSTAMP_FILTER_NONE: + /* Reset CQE compression to Admin default */ + mlx5e_modify_rx_cqe_compression_locked(priv, priv->channels.params.rx_cqe_compress_def); + break; + case HWTSTAMP_FILTER_ALL: + case HWTSTAMP_FILTER_SOME: + case HWTSTAMP_FILTER_PTP_V1_L4_EVENT: + case HWTSTAMP_FILTER_PTP_V1_L4_SYNC: + case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ: + case HWTSTAMP_FILTER_PTP_V2_L4_EVENT: + case HWTSTAMP_FILTER_PTP_V2_L4_SYNC: + case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ: + case HWTSTAMP_FILTER_PTP_V2_L2_EVENT: + case HWTSTAMP_FILTER_PTP_V2_L2_SYNC: + case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ: + case HWTSTAMP_FILTER_PTP_V2_EVENT: + case HWTSTAMP_FILTER_PTP_V2_SYNC: + case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ: + case HWTSTAMP_FILTER_NTP_ALL: + /* Disable CQE compression */ + netdev_warn(priv->netdev, "Disabling cqe compression"); + err = mlx5e_modify_rx_cqe_compression_locked(priv, false); + if (err) { + netdev_err(priv->netdev, "Failed disabling cqe compression err=%d\n", err); + mutex_unlock(&priv->state_lock); + return err; + } + config.rx_filter = HWTSTAMP_FILTER_ALL; + break; + default: + mutex_unlock(&priv->state_lock); + return -ERANGE; + } + + memcpy(&priv->tstamp, &config, sizeof(config)); + mutex_unlock(&priv->state_lock); + + return copy_to_user(ifr->ifr_data, &config, + sizeof(config)) ? -EFAULT : 0; +} + +int mlx5e_hwstamp_get(struct mlx5e_priv *priv, struct ifreq *ifr) +{ + struct hwtstamp_config *cfg = &priv->tstamp; + + if (!MLX5_CAP_GEN(priv->mdev, device_frequency_khz)) + return -EOPNOTSUPP; + + return copy_to_user(ifr->ifr_data, cfg, sizeof(*cfg)) ? 
-EFAULT : 0; +} + static int mlx5e_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { struct mlx5e_priv *priv = netdev_priv(dev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index 15a1687483cc..7e3bfe62ef6e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -42,10 +42,11 @@ #include "en_rep.h" #include "ipoib/ipoib.h" #include "en_accel/ipsec_rxtx.h" +#include "lib/clock.h" -static inline bool mlx5e_rx_hw_stamp(struct mlx5e_tstamp *tstamp) +static inline bool mlx5e_rx_hw_stamp(struct hwtstamp_config *config) { - return tstamp->hwtstamp_config.rx_filter == HWTSTAMP_FILTER_ALL; + return config->rx_filter == HWTSTAMP_FILTER_ALL; } static inline void mlx5e_read_cqe_slot(struct mlx5e_cq *cq, u32 cqcc, @@ -661,7 +662,6 @@ static inline void mlx5e_build_rx_skb(struct mlx5_cqe64 *cqe, struct sk_buff *skb) { struct net_device *netdev = rq->netdev; - struct mlx5e_tstamp *tstamp = rq->tstamp; int lro_num_seg; lro_num_seg = be32_to_cpu(cqe->srqn) >> 24; @@ -676,8 +676,9 @@ static inline void mlx5e_build_rx_skb(struct mlx5_cqe64 *cqe, rq->stats.lro_bytes += cqe_bcnt; } - if (unlikely(mlx5e_rx_hw_stamp(tstamp))) - mlx5e_fill_hwstamp(tstamp, get_cqe_ts(cqe), skb_hwtstamps(skb)); + if (unlikely(mlx5e_rx_hw_stamp(rq->tstamp))) + skb_hwtstamps(skb)->hwtstamp = + mlx5_timecounter_cyc2time(rq->clock, get_cqe_ts(cqe)); skb_record_rx_queue(skb, rq->ix); @@ -1163,7 +1164,6 @@ static inline void mlx5i_complete_rx_cqe(struct mlx5e_rq *rq, struct sk_buff *skb) { struct net_device *netdev = rq->netdev; - struct mlx5e_tstamp *tstamp = rq->tstamp; char *pseudo_header; u8 *dgid; u8 g; @@ -1188,8 +1188,9 @@ static inline void mlx5i_complete_rx_cqe(struct mlx5e_rq *rq, skb->ip_summed = CHECKSUM_COMPLETE; skb->csum = csum_unfold((__force __sum16)cqe->check_sum); - if (unlikely(mlx5e_rx_hw_stamp(tstamp))) - mlx5e_fill_hwstamp(tstamp, get_cqe_ts(cqe), skb_hwtstamps(skb)); + if (unlikely(mlx5e_rx_hw_stamp(rq->tstamp))) + skb_hwtstamps(skb)->hwtstamp = + mlx5_timecounter_cyc2time(rq->clock, get_cqe_ts(cqe)); skb_record_rx_queue(skb, rq->ix); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c index 1d6925d4369a..a7c208a1ad83 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c @@ -35,6 +35,7 @@ #include "en.h" #include "ipoib/ipoib.h" #include "en_accel/ipsec_rxtx.h" +#include "lib/clock.h" #define MLX5E_SQ_NOPS_ROOM MLX5_SEND_WQE_MAX_WQEBBS #define MLX5E_SQ_STOP_ROOM (MLX5_SEND_WQE_MAX_WQEBBS +\ @@ -452,8 +453,9 @@ bool mlx5e_poll_tx_cq(struct mlx5e_cq *cq, int napi_budget) SKBTX_HW_TSTAMP)) { struct skb_shared_hwtstamps hwts = {}; - mlx5e_fill_hwstamp(sq->tstamp, - get_cqe_ts(cqe), &hwts); + hwts.hwtstamp = + mlx5_timecounter_cyc2time(sq->clock, + get_cqe_ts(cqe)); skb_tstamp_tx(skb, &hwts); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c index fc606bfd1d6e..60771865c99c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c @@ -491,8 +491,7 @@ static irqreturn_t mlx5_eq_int(int irq, void *eq_ptr) break; case MLX5_EVENT_TYPE_PPS_EVENT: - if (dev->event) - dev->event(dev, MLX5_DEV_EVENT_PPS, (unsigned long)eqe); + mlx5_pps_event(dev, eqe); break; case MLX5_EVENT_TYPE_FPGA_ERROR: diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c 
b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c index 145e392ab849..14dfb577691b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c @@ -404,7 +404,7 @@ static int mlx5i_open(struct net_device *netdev) mlx5e_refresh_tirs(priv, false); mlx5e_activate_priv_channels(priv); - mlx5e_timestamp_init(priv); + mlx5e_timestamp_set(priv); mutex_unlock(&priv->state_lock); return 0; @@ -429,7 +429,6 @@ static int mlx5i_close(struct net_device *netdev) clear_bit(MLX5E_STATE_OPENED, &priv->state); - mlx5e_timestamp_cleanup(priv); netif_carrier_off(priv->netdev); mlx5e_deactivate_priv_channels(priv); mlx5e_close_channels(&priv->channels); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c index 84dd63e74041..fa8aed62b231 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c @@ -34,250 +34,164 @@ #include "en.h" enum { - MLX5E_CYCLES_SHIFT = 23 + MLX5_CYCLES_SHIFT = 23 }; enum { - MLX5E_PIN_MODE_IN = 0x0, - MLX5E_PIN_MODE_OUT = 0x1, + MLX5_PIN_MODE_IN = 0x0, + MLX5_PIN_MODE_OUT = 0x1, }; enum { - MLX5E_OUT_PATTERN_PULSE = 0x0, - MLX5E_OUT_PATTERN_PERIODIC = 0x1, + MLX5_OUT_PATTERN_PULSE = 0x0, + MLX5_OUT_PATTERN_PERIODIC = 0x1, }; enum { - MLX5E_EVENT_MODE_DISABLE = 0x0, - MLX5E_EVENT_MODE_REPETETIVE = 0x1, - MLX5E_EVENT_MODE_ONCE_TILL_ARM = 0x2, + MLX5_EVENT_MODE_DISABLE = 0x0, + MLX5_EVENT_MODE_REPETETIVE = 0x1, + MLX5_EVENT_MODE_ONCE_TILL_ARM = 0x2, }; enum { - MLX5E_MTPPS_FS_ENABLE = BIT(0x0), - MLX5E_MTPPS_FS_PATTERN = BIT(0x2), - MLX5E_MTPPS_FS_PIN_MODE = BIT(0x3), - MLX5E_MTPPS_FS_TIME_STAMP = BIT(0x4), - MLX5E_MTPPS_FS_OUT_PULSE_DURATION = BIT(0x5), - MLX5E_MTPPS_FS_ENH_OUT_PER_ADJ = BIT(0x7), + MLX5_MTPPS_FS_ENABLE = BIT(0x0), + MLX5_MTPPS_FS_PATTERN = BIT(0x2), + MLX5_MTPPS_FS_PIN_MODE = BIT(0x3), + MLX5_MTPPS_FS_TIME_STAMP = BIT(0x4), + MLX5_MTPPS_FS_OUT_PULSE_DURATION = BIT(0x5), + MLX5_MTPPS_FS_ENH_OUT_PER_ADJ = BIT(0x7), }; -void mlx5e_fill_hwstamp(struct mlx5e_tstamp *tstamp, u64 timestamp, - struct skb_shared_hwtstamps *hwts) +static u64 read_internal_timer(const struct cyclecounter *cc) { - u64 nsec; + struct mlx5_clock *clock = container_of(cc, struct mlx5_clock, cycles); + struct mlx5_core_dev *mdev = container_of(clock, struct mlx5_core_dev, + clock); - read_lock(&tstamp->lock); - nsec = timecounter_cyc2time(&tstamp->clock, timestamp); - read_unlock(&tstamp->lock); - - hwts->hwtstamp = ns_to_ktime(nsec); -} - -static u64 mlx5e_read_internal_timer(const struct cyclecounter *cc) -{ - struct mlx5e_tstamp *tstamp = container_of(cc, struct mlx5e_tstamp, - cycles); - - return mlx5_read_internal_timer(tstamp->mdev) & cc->mask; + return mlx5_read_internal_timer(mdev) & cc->mask; } -static void mlx5e_pps_out(struct work_struct *work) +static void mlx5_pps_out(struct work_struct *work) { - struct mlx5e_pps *pps_info = container_of(work, struct mlx5e_pps, - out_work); - struct mlx5e_tstamp *tstamp = container_of(pps_info, struct mlx5e_tstamp, - pps_info); + struct mlx5_pps *pps_info = container_of(work, struct mlx5_pps, + out_work); + struct mlx5_clock *clock = container_of(pps_info, struct mlx5_clock, + pps_info); + struct mlx5_core_dev *mdev = container_of(clock, struct mlx5_core_dev, + clock); u32 in[MLX5_ST_SZ_DW(mtpps_reg)] = {0}; unsigned long flags; int i; - for (i = 0; i < tstamp->ptp_info.n_pins; i++) { + for (i = 0; i < clock->ptp_info.n_pins; i++) { u64 tstart; - 
write_lock_irqsave(&tstamp->lock, flags); - tstart = tstamp->pps_info.start[i]; - tstamp->pps_info.start[i] = 0; - write_unlock_irqrestore(&tstamp->lock, flags); + write_lock_irqsave(&clock->lock, flags); + tstart = clock->pps_info.start[i]; + clock->pps_info.start[i] = 0; + write_unlock_irqrestore(&clock->lock, flags); if (!tstart) continue; MLX5_SET(mtpps_reg, in, pin, i); MLX5_SET64(mtpps_reg, in, time_stamp, tstart); - MLX5_SET(mtpps_reg, in, field_select, MLX5E_MTPPS_FS_TIME_STAMP); - mlx5_set_mtpps(tstamp->mdev, in, sizeof(in)); + MLX5_SET(mtpps_reg, in, field_select, MLX5_MTPPS_FS_TIME_STAMP); + mlx5_set_mtpps(mdev, in, sizeof(in)); } } -static void mlx5e_timestamp_overflow(struct work_struct *work) +static void mlx5_timestamp_overflow(struct work_struct *work) { struct delayed_work *dwork = to_delayed_work(work); - struct mlx5e_tstamp *tstamp = container_of(dwork, struct mlx5e_tstamp, - overflow_work); - struct mlx5e_priv *priv = container_of(tstamp, struct mlx5e_priv, tstamp); + struct mlx5_clock *clock = container_of(dwork, struct mlx5_clock, + overflow_work); unsigned long flags; - write_lock_irqsave(&tstamp->lock, flags); - timecounter_read(&tstamp->clock); - write_unlock_irqrestore(&tstamp->lock, flags); - queue_delayed_work(priv->wq, &tstamp->overflow_work, - msecs_to_jiffies(tstamp->overflow_period * 1000)); -} - -int mlx5e_hwstamp_set(struct mlx5e_priv *priv, struct ifreq *ifr) -{ - struct hwtstamp_config config; - int err; - - if (!MLX5_CAP_GEN(priv->mdev, device_frequency_khz)) - return -EOPNOTSUPP; - - if (copy_from_user(&config, ifr->ifr_data, sizeof(config))) - return -EFAULT; - - /* TX HW timestamp */ - switch (config.tx_type) { - case HWTSTAMP_TX_OFF: - case HWTSTAMP_TX_ON: - break; - default: - return -ERANGE; - } - - mutex_lock(&priv->state_lock); - /* RX HW timestamp */ - switch (config.rx_filter) { - case HWTSTAMP_FILTER_NONE: - /* Reset CQE compression to Admin default */ - mlx5e_modify_rx_cqe_compression_locked(priv, priv->channels.params.rx_cqe_compress_def); - break; - case HWTSTAMP_FILTER_ALL: - case HWTSTAMP_FILTER_SOME: - case HWTSTAMP_FILTER_PTP_V1_L4_EVENT: - case HWTSTAMP_FILTER_PTP_V1_L4_SYNC: - case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ: - case HWTSTAMP_FILTER_PTP_V2_L4_EVENT: - case HWTSTAMP_FILTER_PTP_V2_L4_SYNC: - case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ: - case HWTSTAMP_FILTER_PTP_V2_L2_EVENT: - case HWTSTAMP_FILTER_PTP_V2_L2_SYNC: - case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ: - case HWTSTAMP_FILTER_PTP_V2_EVENT: - case HWTSTAMP_FILTER_PTP_V2_SYNC: - case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ: - case HWTSTAMP_FILTER_NTP_ALL: - /* Disable CQE compression */ - netdev_warn(priv->netdev, "Disabling cqe compression"); - err = mlx5e_modify_rx_cqe_compression_locked(priv, false); - if (err) { - netdev_err(priv->netdev, "Failed disabling cqe compression err=%d\n", err); - mutex_unlock(&priv->state_lock); - return err; - } - config.rx_filter = HWTSTAMP_FILTER_ALL; - break; - default: - mutex_unlock(&priv->state_lock); - return -ERANGE; - } - - memcpy(&priv->tstamp.hwtstamp_config, &config, sizeof(config)); - mutex_unlock(&priv->state_lock); - - return copy_to_user(ifr->ifr_data, &config, - sizeof(config)) ? -EFAULT : 0; -} - -int mlx5e_hwstamp_get(struct mlx5e_priv *priv, struct ifreq *ifr) -{ - struct hwtstamp_config *cfg = &priv->tstamp.hwtstamp_config; - - if (!MLX5_CAP_GEN(priv->mdev, device_frequency_khz)) - return -EOPNOTSUPP; - - return copy_to_user(ifr->ifr_data, cfg, sizeof(*cfg)) ? 
-EFAULT : 0; + write_lock_irqsave(&clock->lock, flags); + timecounter_read(&clock->tc); + write_unlock_irqrestore(&clock->lock, flags); + schedule_delayed_work(&clock->overflow_work, clock->overflow_period); } -static int mlx5e_ptp_settime(struct ptp_clock_info *ptp, - const struct timespec64 *ts) +static int mlx5_ptp_settime(struct ptp_clock_info *ptp, + const struct timespec64 *ts) { - struct mlx5e_tstamp *tstamp = container_of(ptp, struct mlx5e_tstamp, - ptp_info); + struct mlx5_clock *clock = container_of(ptp, struct mlx5_clock, + ptp_info); u64 ns = timespec64_to_ns(ts); unsigned long flags; - write_lock_irqsave(&tstamp->lock, flags); - timecounter_init(&tstamp->clock, &tstamp->cycles, ns); - write_unlock_irqrestore(&tstamp->lock, flags); + write_lock_irqsave(&clock->lock, flags); + timecounter_init(&clock->tc, &clock->cycles, ns); + write_unlock_irqrestore(&clock->lock, flags); return 0; } -static int mlx5e_ptp_gettime(struct ptp_clock_info *ptp, - struct timespec64 *ts) +static int mlx5_ptp_gettime(struct ptp_clock_info *ptp, struct timespec64 *ts) { - struct mlx5e_tstamp *tstamp = container_of(ptp, struct mlx5e_tstamp, - ptp_info); + struct mlx5_clock *clock = container_of(ptp, struct mlx5_clock, + ptp_info); u64 ns; unsigned long flags; - write_lock_irqsave(&tstamp->lock, flags); - ns = timecounter_read(&tstamp->clock); - write_unlock_irqrestore(&tstamp->lock, flags); + write_lock_irqsave(&clock->lock, flags); + ns = timecounter_read(&clock->tc); + write_unlock_irqrestore(&clock->lock, flags); *ts = ns_to_timespec64(ns); return 0; } -static int mlx5e_ptp_adjtime(struct ptp_clock_info *ptp, s64 delta) +static int mlx5_ptp_adjtime(struct ptp_clock_info *ptp, s64 delta) { - struct mlx5e_tstamp *tstamp = container_of(ptp, struct mlx5e_tstamp, - ptp_info); + struct mlx5_clock *clock = container_of(ptp, struct mlx5_clock, + ptp_info); unsigned long flags; - write_lock_irqsave(&tstamp->lock, flags); - timecounter_adjtime(&tstamp->clock, delta); - write_unlock_irqrestore(&tstamp->lock, flags); + write_lock_irqsave(&clock->lock, flags); + timecounter_adjtime(&clock->tc, delta); + write_unlock_irqrestore(&clock->lock, flags); return 0; } -static int mlx5e_ptp_adjfreq(struct ptp_clock_info *ptp, s32 delta) +static int mlx5_ptp_adjfreq(struct ptp_clock_info *ptp, s32 delta) { u64 adj; u32 diff; unsigned long flags; int neg_adj = 0; - struct mlx5e_tstamp *tstamp = container_of(ptp, struct mlx5e_tstamp, - ptp_info); + struct mlx5_clock *clock = container_of(ptp, struct mlx5_clock, + ptp_info); if (delta < 0) { neg_adj = 1; delta = -delta; } - adj = tstamp->nominal_c_mult; + adj = clock->nominal_c_mult; adj *= delta; diff = div_u64(adj, 1000000000ULL); - write_lock_irqsave(&tstamp->lock, flags); - timecounter_read(&tstamp->clock); - tstamp->cycles.mult = neg_adj ? tstamp->nominal_c_mult - diff : - tstamp->nominal_c_mult + diff; - write_unlock_irqrestore(&tstamp->lock, flags); + write_lock_irqsave(&clock->lock, flags); + timecounter_read(&clock->tc); + clock->cycles.mult = neg_adj ? 
clock->nominal_c_mult - diff : + clock->nominal_c_mult + diff; + write_unlock_irqrestore(&clock->lock, flags); return 0; } -static int mlx5e_extts_configure(struct ptp_clock_info *ptp, - struct ptp_clock_request *rq, - int on) +static int mlx5_extts_configure(struct ptp_clock_info *ptp, + struct ptp_clock_request *rq, + int on) { - struct mlx5e_tstamp *tstamp = - container_of(ptp, struct mlx5e_tstamp, ptp_info); - struct mlx5e_priv *priv = - container_of(tstamp, struct mlx5e_priv, tstamp); + struct mlx5_clock *clock = + container_of(ptp, struct mlx5_clock, ptp_info); + struct mlx5_core_dev *mdev = + container_of(clock, struct mlx5_core_dev, clock); u32 in[MLX5_ST_SZ_DW(mtpps_reg)] = {0}; u32 field_select = 0; u8 pin_mode = 0; @@ -285,24 +199,24 @@ static int mlx5e_extts_configure(struct ptp_clock_info *ptp, int pin = -1; int err = 0; - if (!MLX5_PPS_CAP(priv->mdev)) + if (!MLX5_PPS_CAP(mdev)) return -EOPNOTSUPP; - if (rq->extts.index >= tstamp->ptp_info.n_pins) + if (rq->extts.index >= clock->ptp_info.n_pins) return -EINVAL; if (on) { - pin = ptp_find_pin(tstamp->ptp, PTP_PF_EXTTS, rq->extts.index); + pin = ptp_find_pin(clock->ptp, PTP_PF_EXTTS, rq->extts.index); if (pin < 0) return -EBUSY; - pin_mode = MLX5E_PIN_MODE_IN; + pin_mode = MLX5_PIN_MODE_IN; pattern = !!(rq->extts.flags & PTP_FALLING_EDGE); - field_select = MLX5E_MTPPS_FS_PIN_MODE | - MLX5E_MTPPS_FS_PATTERN | - MLX5E_MTPPS_FS_ENABLE; + field_select = MLX5_MTPPS_FS_PIN_MODE | + MLX5_MTPPS_FS_PATTERN | + MLX5_MTPPS_FS_ENABLE; } else { pin = rq->extts.index; - field_select = MLX5E_MTPPS_FS_ENABLE; + field_select = MLX5_MTPPS_FS_ENABLE; } MLX5_SET(mtpps_reg, in, pin, pin); @@ -311,22 +225,22 @@ static int mlx5e_extts_configure(struct ptp_clock_info *ptp, MLX5_SET(mtpps_reg, in, enable, on); MLX5_SET(mtpps_reg, in, field_select, field_select); - err = mlx5_set_mtpps(priv->mdev, in, sizeof(in)); + err = mlx5_set_mtpps(mdev, in, sizeof(in)); if (err) return err; - return mlx5_set_mtppse(priv->mdev, pin, 0, - MLX5E_EVENT_MODE_REPETETIVE & on); + return mlx5_set_mtppse(mdev, pin, 0, + MLX5_EVENT_MODE_REPETETIVE & on); } -static int mlx5e_perout_configure(struct ptp_clock_info *ptp, - struct ptp_clock_request *rq, - int on) +static int mlx5_perout_configure(struct ptp_clock_info *ptp, + struct ptp_clock_request *rq, + int on) { - struct mlx5e_tstamp *tstamp = - container_of(ptp, struct mlx5e_tstamp, ptp_info); - struct mlx5e_priv *priv = - container_of(tstamp, struct mlx5e_priv, tstamp); + struct mlx5_clock *clock = + container_of(ptp, struct mlx5_clock, ptp_info); + struct mlx5_core_dev *mdev = + container_of(clock, struct mlx5_core_dev, clock); u32 in[MLX5_ST_SZ_DW(mtpps_reg)] = {0}; u64 nsec_now, nsec_delta, time_stamp = 0; u64 cycles_now, cycles_delta; @@ -339,20 +253,20 @@ static int mlx5e_perout_configure(struct ptp_clock_info *ptp, int err = 0; s64 ns; - if (!MLX5_PPS_CAP(priv->mdev)) + if (!MLX5_PPS_CAP(mdev)) return -EOPNOTSUPP; - if (rq->perout.index >= tstamp->ptp_info.n_pins) + if (rq->perout.index >= clock->ptp_info.n_pins) return -EINVAL; if (on) { - pin = ptp_find_pin(tstamp->ptp, PTP_PF_PEROUT, + pin = ptp_find_pin(clock->ptp, PTP_PF_PEROUT, rq->perout.index); if (pin < 0) return -EBUSY; - pin_mode = MLX5E_PIN_MODE_OUT; - pattern = MLX5E_OUT_PATTERN_PERIODIC; + pin_mode = MLX5_PIN_MODE_OUT; + pattern = MLX5_OUT_PATTERN_PERIODIC; ts.tv_sec = rq->perout.period.sec; ts.tv_nsec = rq->perout.period.nsec; ns = timespec64_to_ns(&ts); @@ -363,21 +277,21 @@ static int mlx5e_perout_configure(struct ptp_clock_info *ptp, ts.tv_sec = 
rq->perout.start.sec; ts.tv_nsec = rq->perout.start.nsec; ns = timespec64_to_ns(&ts); - cycles_now = mlx5_read_internal_timer(tstamp->mdev); - write_lock_irqsave(&tstamp->lock, flags); - nsec_now = timecounter_cyc2time(&tstamp->clock, cycles_now); + cycles_now = mlx5_read_internal_timer(mdev); + write_lock_irqsave(&clock->lock, flags); + nsec_now = timecounter_cyc2time(&clock->tc, cycles_now); nsec_delta = ns - nsec_now; - cycles_delta = div64_u64(nsec_delta << tstamp->cycles.shift, - tstamp->cycles.mult); - write_unlock_irqrestore(&tstamp->lock, flags); + cycles_delta = div64_u64(nsec_delta << clock->cycles.shift, + clock->cycles.mult); + write_unlock_irqrestore(&clock->lock, flags); time_stamp = cycles_now + cycles_delta; - field_select = MLX5E_MTPPS_FS_PIN_MODE | - MLX5E_MTPPS_FS_PATTERN | - MLX5E_MTPPS_FS_ENABLE | - MLX5E_MTPPS_FS_TIME_STAMP; + field_select = MLX5_MTPPS_FS_PIN_MODE | + MLX5_MTPPS_FS_PATTERN | + MLX5_MTPPS_FS_ENABLE | + MLX5_MTPPS_FS_TIME_STAMP; } else { pin = rq->perout.index; - field_select = MLX5E_MTPPS_FS_ENABLE; + field_select = MLX5_MTPPS_FS_ENABLE; } MLX5_SET(mtpps_reg, in, pin, pin); @@ -387,233 +301,225 @@ static int mlx5e_perout_configure(struct ptp_clock_info *ptp, MLX5_SET64(mtpps_reg, in, time_stamp, time_stamp); MLX5_SET(mtpps_reg, in, field_select, field_select); - err = mlx5_set_mtpps(priv->mdev, in, sizeof(in)); + err = mlx5_set_mtpps(mdev, in, sizeof(in)); if (err) return err; - return mlx5_set_mtppse(priv->mdev, pin, 0, - MLX5E_EVENT_MODE_REPETETIVE & on); + return mlx5_set_mtppse(mdev, pin, 0, + MLX5_EVENT_MODE_REPETETIVE & on); } -static int mlx5e_pps_configure(struct ptp_clock_info *ptp, - struct ptp_clock_request *rq, - int on) +static int mlx5_pps_configure(struct ptp_clock_info *ptp, + struct ptp_clock_request *rq, + int on) { - struct mlx5e_tstamp *tstamp = - container_of(ptp, struct mlx5e_tstamp, ptp_info); + struct mlx5_clock *clock = + container_of(ptp, struct mlx5_clock, ptp_info); - tstamp->pps_info.enabled = !!on; + clock->pps_info.enabled = !!on; return 0; } -static int mlx5e_ptp_enable(struct ptp_clock_info *ptp, - struct ptp_clock_request *rq, - int on) +static int mlx5_ptp_enable(struct ptp_clock_info *ptp, + struct ptp_clock_request *rq, + int on) { switch (rq->type) { case PTP_CLK_REQ_EXTTS: - return mlx5e_extts_configure(ptp, rq, on); + return mlx5_extts_configure(ptp, rq, on); case PTP_CLK_REQ_PEROUT: - return mlx5e_perout_configure(ptp, rq, on); + return mlx5_perout_configure(ptp, rq, on); case PTP_CLK_REQ_PPS: - return mlx5e_pps_configure(ptp, rq, on); + return mlx5_pps_configure(ptp, rq, on); default: return -EOPNOTSUPP; } return 0; } -static int mlx5e_ptp_verify(struct ptp_clock_info *ptp, unsigned int pin, - enum ptp_pin_function func, unsigned int chan) +static int mlx5_ptp_verify(struct ptp_clock_info *ptp, unsigned int pin, + enum ptp_pin_function func, unsigned int chan) { return (func == PTP_PF_PHYSYNC) ? 
-EOPNOTSUPP : 0; } -static const struct ptp_clock_info mlx5e_ptp_clock_info = { +static const struct ptp_clock_info mlx5_ptp_clock_info = { .owner = THIS_MODULE, + .name = "mlx5_p2p", .max_adj = 100000000, .n_alarm = 0, .n_ext_ts = 0, .n_per_out = 0, .n_pins = 0, .pps = 0, - .adjfreq = mlx5e_ptp_adjfreq, - .adjtime = mlx5e_ptp_adjtime, - .gettime64 = mlx5e_ptp_gettime, - .settime64 = mlx5e_ptp_settime, + .adjfreq = mlx5_ptp_adjfreq, + .adjtime = mlx5_ptp_adjtime, + .gettime64 = mlx5_ptp_gettime, + .settime64 = mlx5_ptp_settime, .enable = NULL, .verify = NULL, }; -static void mlx5e_timestamp_init_config(struct mlx5e_tstamp *tstamp) -{ - tstamp->hwtstamp_config.tx_type = HWTSTAMP_TX_OFF; - tstamp->hwtstamp_config.rx_filter = HWTSTAMP_FILTER_NONE; -} - -static int mlx5e_init_pin_config(struct mlx5e_tstamp *tstamp) +static int mlx5_init_pin_config(struct mlx5_clock *clock) { int i; - tstamp->ptp_info.pin_config = - kzalloc(sizeof(*tstamp->ptp_info.pin_config) * - tstamp->ptp_info.n_pins, GFP_KERNEL); - if (!tstamp->ptp_info.pin_config) + clock->ptp_info.pin_config = + kzalloc(sizeof(*clock->ptp_info.pin_config) * + clock->ptp_info.n_pins, GFP_KERNEL); + if (!clock->ptp_info.pin_config) return -ENOMEM; - tstamp->ptp_info.enable = mlx5e_ptp_enable; - tstamp->ptp_info.verify = mlx5e_ptp_verify; - tstamp->ptp_info.pps = 1; + clock->ptp_info.enable = mlx5_ptp_enable; + clock->ptp_info.verify = mlx5_ptp_verify; + clock->ptp_info.pps = 1; - for (i = 0; i < tstamp->ptp_info.n_pins; i++) { - snprintf(tstamp->ptp_info.pin_config[i].name, - sizeof(tstamp->ptp_info.pin_config[i].name), + for (i = 0; i < clock->ptp_info.n_pins; i++) { + snprintf(clock->ptp_info.pin_config[i].name, + sizeof(clock->ptp_info.pin_config[i].name), "mlx5_pps%d", i); - tstamp->ptp_info.pin_config[i].index = i; - tstamp->ptp_info.pin_config[i].func = PTP_PF_NONE; - tstamp->ptp_info.pin_config[i].chan = i; + clock->ptp_info.pin_config[i].index = i; + clock->ptp_info.pin_config[i].func = PTP_PF_NONE; + clock->ptp_info.pin_config[i].chan = i; } return 0; } -static void mlx5e_get_pps_caps(struct mlx5e_priv *priv, - struct mlx5e_tstamp *tstamp) +static void mlx5_get_pps_caps(struct mlx5_core_dev *mdev) { + struct mlx5_clock *clock = &mdev->clock; u32 out[MLX5_ST_SZ_DW(mtpps_reg)] = {0}; - mlx5_query_mtpps(priv->mdev, out, sizeof(out)); - - tstamp->ptp_info.n_pins = MLX5_GET(mtpps_reg, out, - cap_number_of_pps_pins); - tstamp->ptp_info.n_ext_ts = MLX5_GET(mtpps_reg, out, - cap_max_num_of_pps_in_pins); - tstamp->ptp_info.n_per_out = MLX5_GET(mtpps_reg, out, - cap_max_num_of_pps_out_pins); - - tstamp->pps_info.pin_caps[0] = MLX5_GET(mtpps_reg, out, cap_pin_0_mode); - tstamp->pps_info.pin_caps[1] = MLX5_GET(mtpps_reg, out, cap_pin_1_mode); - tstamp->pps_info.pin_caps[2] = MLX5_GET(mtpps_reg, out, cap_pin_2_mode); - tstamp->pps_info.pin_caps[3] = MLX5_GET(mtpps_reg, out, cap_pin_3_mode); - tstamp->pps_info.pin_caps[4] = MLX5_GET(mtpps_reg, out, cap_pin_4_mode); - tstamp->pps_info.pin_caps[5] = MLX5_GET(mtpps_reg, out, cap_pin_5_mode); - tstamp->pps_info.pin_caps[6] = MLX5_GET(mtpps_reg, out, cap_pin_6_mode); - tstamp->pps_info.pin_caps[7] = MLX5_GET(mtpps_reg, out, cap_pin_7_mode); + mlx5_query_mtpps(mdev, out, sizeof(out)); + + clock->ptp_info.n_pins = MLX5_GET(mtpps_reg, out, + cap_number_of_pps_pins); + clock->ptp_info.n_ext_ts = MLX5_GET(mtpps_reg, out, + cap_max_num_of_pps_in_pins); + clock->ptp_info.n_per_out = MLX5_GET(mtpps_reg, out, + cap_max_num_of_pps_out_pins); + + clock->pps_info.pin_caps[0] = MLX5_GET(mtpps_reg, out, 
cap_pin_0_mode); + clock->pps_info.pin_caps[1] = MLX5_GET(mtpps_reg, out, cap_pin_1_mode); + clock->pps_info.pin_caps[2] = MLX5_GET(mtpps_reg, out, cap_pin_2_mode); + clock->pps_info.pin_caps[3] = MLX5_GET(mtpps_reg, out, cap_pin_3_mode); + clock->pps_info.pin_caps[4] = MLX5_GET(mtpps_reg, out, cap_pin_4_mode); + clock->pps_info.pin_caps[5] = MLX5_GET(mtpps_reg, out, cap_pin_5_mode); + clock->pps_info.pin_caps[6] = MLX5_GET(mtpps_reg, out, cap_pin_6_mode); + clock->pps_info.pin_caps[7] = MLX5_GET(mtpps_reg, out, cap_pin_7_mode); } -void mlx5e_pps_event_handler(struct mlx5e_priv *priv, - struct ptp_clock_event *event) +void mlx5_pps_event(struct mlx5_core_dev *mdev, + struct mlx5_eqe *eqe) { - struct net_device *netdev = priv->netdev; - struct mlx5e_tstamp *tstamp = &priv->tstamp; + struct mlx5_clock *clock = &mdev->clock; + struct ptp_clock_event ptp_event; struct timespec64 ts; u64 nsec_now, nsec_delta; u64 cycles_now, cycles_delta; - int pin = event->index; + int pin = eqe->data.pps.pin; s64 ns; unsigned long flags; - switch (tstamp->ptp_info.pin_config[pin].func) { + switch (clock->ptp_info.pin_config[pin].func) { case PTP_PF_EXTTS: - if (tstamp->pps_info.enabled) { - event->type = PTP_CLOCK_PPSUSR; - event->pps_times.ts_real = ns_to_timespec64(event->timestamp); + if (clock->pps_info.enabled) { + ptp_event.type = PTP_CLOCK_PPSUSR; + ptp_event.pps_times.ts_real = ns_to_timespec64(eqe->data.pps.time_stamp); } else { - event->type = PTP_CLOCK_EXTTS; + ptp_event.type = PTP_CLOCK_EXTTS; } - ptp_clock_event(tstamp->ptp, event); + ptp_clock_event(clock->ptp, &ptp_event); break; case PTP_PF_PEROUT: - mlx5e_ptp_gettime(&tstamp->ptp_info, &ts); - cycles_now = mlx5_read_internal_timer(tstamp->mdev); + mlx5_ptp_gettime(&clock->ptp_info, &ts); + cycles_now = mlx5_read_internal_timer(mdev); ts.tv_sec += 1; ts.tv_nsec = 0; ns = timespec64_to_ns(&ts); - write_lock_irqsave(&tstamp->lock, flags); - nsec_now = timecounter_cyc2time(&tstamp->clock, cycles_now); + write_lock_irqsave(&clock->lock, flags); + nsec_now = timecounter_cyc2time(&clock->tc, cycles_now); nsec_delta = ns - nsec_now; - cycles_delta = div64_u64(nsec_delta << tstamp->cycles.shift, - tstamp->cycles.mult); - tstamp->pps_info.start[pin] = cycles_now + cycles_delta; - queue_work(priv->wq, &tstamp->pps_info.out_work); - write_unlock_irqrestore(&tstamp->lock, flags); + cycles_delta = div64_u64(nsec_delta << clock->cycles.shift, + clock->cycles.mult); + clock->pps_info.start[pin] = cycles_now + cycles_delta; + schedule_work(&clock->pps_info.out_work); + write_unlock_irqrestore(&clock->lock, flags); break; default: - netdev_err(netdev, "%s: Unhandled event\n", __func__); + mlx5_core_err(mdev, " Unhandled event\n"); } } -void mlx5e_timestamp_init(struct mlx5e_priv *priv) +void mlx5_init_clock(struct mlx5_core_dev *mdev) { - struct mlx5e_tstamp *tstamp = &priv->tstamp; + struct mlx5_clock *clock = &mdev->clock; u64 ns; u64 frac = 0; u32 dev_freq; - mlx5e_timestamp_init_config(tstamp); - dev_freq = MLX5_CAP_GEN(priv->mdev, device_frequency_khz); + dev_freq = MLX5_CAP_GEN(mdev, device_frequency_khz); if (!dev_freq) { - mlx5_core_warn(priv->mdev, "invalid device_frequency_khz, aborting HW clock init\n"); + mlx5_core_warn(mdev, "invalid device_frequency_khz, aborting HW clock init\n"); return; } - rwlock_init(&tstamp->lock); - tstamp->cycles.read = mlx5e_read_internal_timer; - tstamp->cycles.shift = MLX5E_CYCLES_SHIFT; - tstamp->cycles.mult = clocksource_khz2mult(dev_freq, - tstamp->cycles.shift); - tstamp->nominal_c_mult = tstamp->cycles.mult; - 
tstamp->cycles.mask = CLOCKSOURCE_MASK(41); - tstamp->mdev = priv->mdev; - - timecounter_init(&tstamp->clock, &tstamp->cycles, + rwlock_init(&clock->lock); + clock->cycles.read = read_internal_timer; + clock->cycles.shift = MLX5_CYCLES_SHIFT; + clock->cycles.mult = clocksource_khz2mult(dev_freq, + clock->cycles.shift); + clock->nominal_c_mult = clock->cycles.mult; + clock->cycles.mask = CLOCKSOURCE_MASK(41); + + timecounter_init(&clock->tc, &clock->cycles, ktime_to_ns(ktime_get_real())); /* Calculate period in seconds to call the overflow watchdog - to make * sure counter is checked at least once every wrap around. */ - ns = cyclecounter_cyc2ns(&tstamp->cycles, tstamp->cycles.mask, + ns = cyclecounter_cyc2ns(&clock->cycles, clock->cycles.mask, frac, &frac); do_div(ns, NSEC_PER_SEC / 2 / HZ); - tstamp->overflow_period = ns; + clock->overflow_period = ns; - INIT_WORK(&tstamp->pps_info.out_work, mlx5e_pps_out); - INIT_DELAYED_WORK(&tstamp->overflow_work, mlx5e_timestamp_overflow); - if (tstamp->overflow_period) - queue_delayed_work(priv->wq, &tstamp->overflow_work, 0); + INIT_WORK(&clock->pps_info.out_work, mlx5_pps_out); + INIT_DELAYED_WORK(&clock->overflow_work, mlx5_timestamp_overflow); + if (clock->overflow_period) + schedule_delayed_work(&clock->overflow_work, 0); else - mlx5_core_warn(priv->mdev, "invalid overflow period, overflow_work is not scheduled\n"); + mlx5_core_warn(mdev, "invalid overflow period, overflow_work is not scheduled\n"); /* Configure the PHC */ - tstamp->ptp_info = mlx5e_ptp_clock_info; - snprintf(tstamp->ptp_info.name, 16, "mlx5 ptp"); + clock->ptp_info = mlx5_ptp_clock_info; /* Initialize 1PPS data structures */ - if (MLX5_PPS_CAP(priv->mdev)) - mlx5e_get_pps_caps(priv, tstamp); - if (tstamp->ptp_info.n_pins) - mlx5e_init_pin_config(tstamp); - - tstamp->ptp = ptp_clock_register(&tstamp->ptp_info, - &priv->mdev->pdev->dev); - if (IS_ERR(tstamp->ptp)) { - mlx5_core_warn(priv->mdev, "ptp_clock_register failed %ld\n", - PTR_ERR(tstamp->ptp)); - tstamp->ptp = NULL; + if (MLX5_PPS_CAP(mdev)) + mlx5_get_pps_caps(mdev); + if (clock->ptp_info.n_pins) + mlx5_init_pin_config(clock); + + clock->ptp = ptp_clock_register(&clock->ptp_info, + &mdev->pdev->dev); + if (IS_ERR(clock->ptp)) { + mlx5_core_warn(mdev, "ptp_clock_register failed %ld\n", + PTR_ERR(clock->ptp)); + clock->ptp = NULL; } } -void mlx5e_timestamp_cleanup(struct mlx5e_priv *priv) +void mlx5_cleanup_clock(struct mlx5_core_dev *mdev) { - struct mlx5e_tstamp *tstamp = &priv->tstamp; + struct mlx5_clock *clock = &mdev->clock; - if (!MLX5_CAP_GEN(priv->mdev, device_frequency_khz)) + if (!MLX5_CAP_GEN(mdev, device_frequency_khz)) return; - if (priv->tstamp.ptp) { - ptp_clock_unregister(priv->tstamp.ptp); - priv->tstamp.ptp = NULL; + if (clock->ptp) { + ptp_clock_unregister(clock->ptp); + clock->ptp = NULL; } - cancel_work_sync(&tstamp->pps_info.out_work); - cancel_delayed_work_sync(&tstamp->overflow_work); - kfree(tstamp->ptp_info.pin_config); + cancel_work_sync(&clock->pps_info.out_work); + cancel_delayed_work_sync(&clock->overflow_work); + kfree(clock->ptp_info.pin_config); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.h new file mode 100644 index 000000000000..a8eecedd46c2 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2017, Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __LIB_CLOCK_H__ +#define __LIB_CLOCK_H__ + +void mlx5_init_clock(struct mlx5_core_dev *mdev); +void mlx5_cleanup_clock(struct mlx5_core_dev *mdev); + +static inline ktime_t mlx5_timecounter_cyc2time(struct mlx5_clock *clock, + u64 timestamp) +{ + u64 nsec; + + read_lock(&clock->lock); + nsec = timecounter_cyc2time(&clock->tc, timestamp); + read_unlock(&clock->lock); + + return ns_to_ktime(nsec); +} + +#endif diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 0d2c8dcd6eae..ecbe9fad22d8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -59,6 +59,7 @@ #include "lib/mlx5.h" #include "fpga/core.h" #include "accel/ipsec.h" +#include "lib/clock.h" MODULE_AUTHOR("Eli Cohen "); MODULE_DESCRIPTION("Mellanox Connect-IB, ConnectX-4 core driver"); @@ -889,6 +890,8 @@ static int mlx5_init_once(struct mlx5_core_dev *dev, struct mlx5_priv *priv) mlx5_init_reserved_gids(dev); + mlx5_init_clock(dev); + err = mlx5_init_rl_table(dev); if (err) { dev_err(&pdev->dev, "Failed to init rate limiting\n"); @@ -949,6 +952,7 @@ static void mlx5_cleanup_once(struct mlx5_core_dev *dev) mlx5_eswitch_cleanup(dev->priv.eswitch); mlx5_mpfs_cleanup(dev); mlx5_cleanup_rl_table(dev); + mlx5_cleanup_clock(dev); mlx5_cleanup_reserved_gids(dev); mlx5_cleanup_mkey_table(dev); mlx5_cleanup_srq_table(dev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h index b7c2900b75f9..8f00de2fe283 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h @@ -93,6 +93,7 @@ void mlx5_core_event(struct mlx5_core_dev *dev, enum mlx5_dev_event event, unsigned long param); void mlx5_core_page_fault(struct mlx5_core_dev *dev, struct mlx5_pagefault *pfault); +void mlx5_pps_event(struct mlx5_core_dev *dev, struct mlx5_eqe *eqe); void mlx5_port_module_event(struct mlx5_core_dev *dev, struct mlx5_eqe *eqe); void mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force); void mlx5_disable_device(struct mlx5_core_dev *dev); diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 401c8972cc3a..08c77b7e59cb 100644 --- 
a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -49,6 +49,8 @@ #include #include #include +#include <linux/timecounter.h> +#include <linux/ptp_clock_kernel.h> enum { MLX5_BOARD_ID_LEN = 64, @@ -760,6 +762,27 @@ struct mlx5_rsvd_gids { struct ida ida; }; +#define MAX_PIN_NUM 8 +struct mlx5_pps { + u8 pin_caps[MAX_PIN_NUM]; + struct work_struct out_work; + u64 start[MAX_PIN_NUM]; + u8 enabled; +}; + +struct mlx5_clock { + rwlock_t lock; + struct cyclecounter cycles; + struct timecounter tc; + struct hwtstamp_config hwtstamp_config; + u32 nominal_c_mult; + unsigned long overflow_period; + struct delayed_work overflow_work; + struct ptp_clock *ptp; + struct ptp_clock_info ptp_info; + struct mlx5_pps pps_info; +}; + struct mlx5_core_dev { struct pci_dev *pdev; /* sync pci state */ @@ -800,6 +823,7 @@ struct mlx5_core_dev { #ifdef CONFIG_RFS_ACCEL struct cpu_rmap *rmap; #endif + struct mlx5_clock clock; }; struct mlx5_db { -- cgit v1.2.3 From 841f4f24053acad69240c6ab7427a1d24bc29491 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 13 Oct 2017 14:18:09 -0400 Subject: net: dsa: remove .set_addr Now that there is no user for the .set_addr function, remove it from DSA. If a switch supports this feature (like mv88e6xxx), the implementation can be done in the driver setup. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- include/net/dsa.h | 1 - net/dsa/dsa2.c | 6 ------ net/dsa/legacy.c | 6 ------ 3 files changed, 13 deletions(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index ce1d622734d7..2746741f74cf 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -291,7 +291,6 @@ struct dsa_switch_ops { enum dsa_tag_protocol (*get_tag_protocol)(struct dsa_switch *ds); int (*setup)(struct dsa_switch *ds); - int (*set_addr)(struct dsa_switch *ds, u8 *addr); u32 (*get_phy_flags)(struct dsa_switch *ds, int port); /* diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 54ed054777bd..6ac9e11d385c 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -336,12 +336,6 @@ static int dsa_ds_apply(struct dsa_switch_tree *dst, struct dsa_switch *ds) if (err) return err; - if (ds->ops->set_addr) { - err = ds->ops->set_addr(ds, dst->cpu_dp->netdev->dev_addr); - if (err < 0) - return err; - } - if (!ds->slave_mii_bus && ds->ops->phy_read) { ds->slave_mii_bus = devm_mdiobus_alloc(ds->dev); if (!ds->slave_mii_bus) diff --git a/net/dsa/legacy.c b/net/dsa/legacy.c index 19ff6e0a21dc..b0fefbffe082 100644 --- a/net/dsa/legacy.c +++ b/net/dsa/legacy.c @@ -172,12 +172,6 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, if (ret) return ret; - if (ops->set_addr) { - ret = ops->set_addr(ds, master->dev_addr); - if (ret < 0) - return ret; - } - if (!ds->slave_mii_bus && ops->phy_read) { ds->slave_mii_bus = devm_mdiobus_alloc(ds->dev); if (!ds->slave_mii_bus) -- cgit v1.2.3 From e086101b150ae8e99e54ab26101ef3835fa9f48d Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Fri, 13 Oct 2017 13:03:16 -0700 Subject: tcp: add a tracepoint for tcp retransmission We need a real-time notification for tcp retransmission for monitoring. Of course we could use ftrace to dynamically instrument this kernel function too; however, we can't retrieve the connection information at the same time. For example, perf-tools [1] reads /proc/net/tcp for socket details, which is slow when we have a lot of connections. Therefore, this patch adds a tracepoint for __tcp_retransmit_skb() and exposes src/dst IP addresses and ports of the connection. This also makes it easier to integrate into perf.
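As a rough, standalone illustration of the v4-mapped layout the tracepoint uses (described in the next paragraph), here is a userspace sketch; set_v4mapped() is a hypothetical stand-in for the kernel's ipv6_addr_set_v4mapped(), and nothing below is part of the patch itself:

/* Build the v4-mapped IPv6 form ::ffff:a.b.c.d of an IPv4 address,
 * so one 16-byte field can carry either address family.
 */
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <arpa/inet.h>

static void set_v4mapped(uint32_t v4_be, unsigned char v6[16])
{
	memset(v6, 0, 10);            /* 80 zero bits          */
	v6[10] = 0xff;                /* then 16 one bits      */
	v6[11] = 0xff;
	memcpy(&v6[12], &v4_be, 4);   /* then the IPv4 address */
}

int main(void)
{
	unsigned char v6[16];
	char buf[INET6_ADDRSTRLEN];

	set_v4mapped(inet_addr("192.0.2.1"), v6);
	inet_ntop(AF_INET6, v6, buf, sizeof(buf));
	printf("%s\n", buf);          /* prints ::ffff:192.0.2.1 */
	return 0;
}

With that layout a single 16-byte field can report either address family, which is what lets the TP_printk below expose both %pI4 and %pI6 views of the same connection.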
Note, I expose both IPv4 and IPv6 addresses at the same time: for an IPv4 socket, the v4-mapped address is used as the IPv6 address; for an IPv6 socket, LOOPBACK4_IPV6 is already filled in by the kernel. Also, add sk and skb pointers as they are useful for BPF. 1. https://github.com/brendangregg/perf-tools/blob/master/net/tcpretrans Cc: Eric Dumazet Cc: Alexei Starovoitov Cc: Hannes Frederic Sowa Cc: Brendan Gregg Cc: Neal Cardwell Signed-off-by: Cong Wang Acked-by: Alexei Starovoitov Acked-by: Brendan Gregg Signed-off-by: David S. Miller --- include/trace/events/tcp.h | 68 ++++++++++++++++++++++++++++++++++++++++++++++ net/core/net-traces.c | 1 + net/ipv4/tcp_output.c | 3 ++ 3 files changed, 72 insertions(+) create mode 100644 include/trace/events/tcp.h (limited to 'include') diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h new file mode 100644 index 000000000000..3d1cbd072b7e --- /dev/null +++ b/include/trace/events/tcp.h @@ -0,0 +1,68 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM tcp + +#if !defined(_TRACE_TCP_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_TCP_H + +#include <linux/ipv6.h> +#include <linux/tcp.h> +#include <linux/tracepoint.h> +#include <net/ipv6.h> + +TRACE_EVENT(tcp_retransmit_skb, + + TP_PROTO(struct sock *sk, struct sk_buff *skb), + + TP_ARGS(sk, skb), + + TP_STRUCT__entry( + __field(void *, skbaddr) + __field(void *, skaddr) + __field(__u16, sport) + __field(__u16, dport) + __array(__u8, saddr, 4) + __array(__u8, daddr, 4) + __array(__u8, saddr_v6, 16) + __array(__u8, daddr_v6, 16) + ), + + TP_fast_assign( + struct ipv6_pinfo *np = inet6_sk(sk); + struct inet_sock *inet = inet_sk(sk); + struct in6_addr *pin6; + __be32 *p32; + + __entry->skbaddr = skb; + __entry->skaddr = sk; + + __entry->sport = ntohs(inet->inet_sport); + __entry->dport = ntohs(inet->inet_dport); + + p32 = (__be32 *) __entry->saddr; + *p32 = inet->inet_saddr; + + p32 = (__be32 *) __entry->daddr; + *p32 = inet->inet_daddr; + + if (np) { + pin6 = (struct in6_addr *)__entry->saddr_v6; + *pin6 = np->saddr; + pin6 = (struct in6_addr *)__entry->daddr_v6; + *pin6 = *(np->daddr_cache); + } else { + pin6 = (struct in6_addr *)__entry->saddr_v6; + ipv6_addr_set_v4mapped(inet->inet_saddr, pin6); + pin6 = (struct in6_addr *)__entry->daddr_v6; + ipv6_addr_set_v4mapped(inet->inet_daddr, pin6); + } + ), + + TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6 daddrv6=%pI6", + __entry->sport, __entry->dport, __entry->saddr, __entry->daddr, + __entry->saddr_v6, __entry->daddr_v6) +); + +#endif /* _TRACE_TCP_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/net/core/net-traces.c b/net/core/net-traces.c index 1132820c8e62..f4e4fa2db505 100644 --- a/net/core/net-traces.c +++ b/net/core/net-traces.c @@ -31,6 +31,7 @@ #include #include #include +#include <trace/events/tcp.h> #include #include #if IS_ENABLED(CONFIG_IPV6) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 696b0a168f16..6c74f2a39778 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -42,6 +42,8 @@ #include #include +#include <trace/events/tcp.h> + /* People can turn this off for buggy TCP's found in printers etc.
*/ int sysctl_tcp_retrans_collapse __read_mostly = 1; @@ -2875,6 +2877,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) if (likely(!err)) { TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS; + trace_tcp_retransmit_skb(sk, skb); } else if (err != -EBUSY) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL); } -- cgit v1.2.3 From 32302902ff093891d8e64439cbb8ceae83e21ef8 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Thu, 12 Oct 2017 11:38:45 -0700 Subject: mqprio: Reserve last 32 classid values for HW traffic classes and misc IDs This patch makes a slight tweak to mqprio in order to bring the classid values used back in line with what is used for mq. The general idea is to reserve values :ffe0 - :ffef to identify hardware traffic classes normally reported via dev->num_tc. By doing this we can maintain a consistent behavior with mq for classid where :1 - :ffdf will represent a physical qdisc mapped onto a Tx queue represented by classid - 1, and the traffic classes will be mapped onto a known subset of classid values reserved for our virtual qdiscs. Note I reserved the range from :fff0 - :ffff since this way we might be able to reuse these classid values with clsact and ingress which would mean that for mq, mqprio, ingress, and clsact we should be able to maintain a similar classid layout. Signed-off-by: Alexander Duyck Tested-by: Jesus Sanchez-Palencia Signed-off-by: David S. Miller --- include/uapi/linux/pkt_sched.h | 1 + net/sched/sch_mqprio.c | 79 ++++++++++++++++++++++++------------------ 2 files changed, 47 insertions(+), 33 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index e95b5c9b9fad..e7cc3d3c7421 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -74,6 +74,7 @@ struct tc_estimator { #define TC_H_INGRESS (0xFFFFFFF1U) #define TC_H_CLSACT TC_H_INGRESS +#define TC_H_MIN_PRIORITY 0xFFE0U #define TC_H_MIN_INGRESS 0xFFF2U #define TC_H_MIN_EGRESS 0xFFF3U diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index f1ae9be83934..cae91b4b08a6 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -153,6 +153,10 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt) if (!netif_is_multiqueue(dev)) return -EOPNOTSUPP; + /* make certain can allocate enough classids to handle queues */ + if (dev->num_tx_queues >= TC_H_MIN_PRIORITY) + return -ENOMEM; + if (!opt || nla_len(opt) < sizeof(*qopt)) return -EINVAL; @@ -305,7 +309,7 @@ static struct netdev_queue *mqprio_queue_get(struct Qdisc *sch, unsigned long cl) { struct net_device *dev = qdisc_dev(sch); - unsigned long ntx = cl - 1 - netdev_get_num_tc(dev); + unsigned long ntx = cl - 1; if (ntx >= dev->num_tx_queues) return NULL; @@ -447,38 +451,35 @@ static unsigned long mqprio_find(struct Qdisc *sch, u32 classid) struct net_device *dev = qdisc_dev(sch); unsigned int ntx = TC_H_MIN(classid); - if (ntx > dev->num_tx_queues + netdev_get_num_tc(dev)) - return 0; - return ntx; + /* There are essentially two regions here that have valid classid + * values. The first region will have a classid value of 1 through + * num_tx_queues. All of these are backed by actual Qdiscs. + */ + if (ntx < TC_H_MIN_PRIORITY) + return (ntx <= dev->num_tx_queues) ? ntx : 0; + + /* The second region represents the hardware traffic classes. 
These + * are represented by classid values of TC_H_MIN_PRIORITY through + * TC_H_MIN_PRIORITY + netdev_get_num_tc - 1 + */ + return ((ntx - TC_H_MIN_PRIORITY) < netdev_get_num_tc(dev)) ? ntx : 0; } static int mqprio_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, struct tcmsg *tcm) { - struct net_device *dev = qdisc_dev(sch); + if (cl < TC_H_MIN_PRIORITY) { + struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl); + struct net_device *dev = qdisc_dev(sch); + int tc = netdev_txq_to_tc(dev, cl - 1); - if (cl <= netdev_get_num_tc(dev)) { + tcm->tcm_parent = (tc < 0) ? 0 : + TC_H_MAKE(TC_H_MAJ(sch->handle), + TC_H_MIN(tc + TC_H_MIN_PRIORITY)); + tcm->tcm_info = dev_queue->qdisc_sleeping->handle; + } else { tcm->tcm_parent = TC_H_ROOT; tcm->tcm_info = 0; - } else { - int i; - struct netdev_queue *dev_queue; - - dev_queue = mqprio_queue_get(sch, cl); - tcm->tcm_parent = 0; - for (i = 0; i < netdev_get_num_tc(dev); i++) { - struct netdev_tc_txq tc = dev->tc_to_txq[i]; - int q_idx = cl - netdev_get_num_tc(dev); - - if (q_idx > tc.offset && - q_idx <= tc.offset + tc.count) { - tcm->tcm_parent = - TC_H_MAKE(TC_H_MAJ(sch->handle), - TC_H_MIN(i + 1)); - break; - } - } - tcm->tcm_info = dev_queue->qdisc_sleeping->handle; } tcm->tcm_handle |= TC_H_MIN(cl); return 0; @@ -489,15 +490,14 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, __releases(d->lock) __acquires(d->lock) { - struct net_device *dev = qdisc_dev(sch); - - if (cl <= netdev_get_num_tc(dev)) { + if (cl >= TC_H_MIN_PRIORITY) { int i; __u32 qlen = 0; struct Qdisc *qdisc; struct gnet_stats_queue qstats = {0}; struct gnet_stats_basic_packed bstats = {0}; - struct netdev_tc_txq tc = dev->tc_to_txq[cl - 1]; + struct net_device *dev = qdisc_dev(sch); + struct netdev_tc_txq tc = dev->tc_to_txq[cl & TC_BITMASK]; /* Drop lock here it will be reclaimed before touching * statistics this is required because the d->lock we @@ -550,12 +550,25 @@ static void mqprio_walk(struct Qdisc *sch, struct qdisc_walker *arg) /* Walk hierarchy with a virtual class per tc */ arg->count = arg->skip; - for (ntx = arg->skip; - ntx < dev->num_tx_queues + netdev_get_num_tc(dev); - ntx++) { + for (ntx = arg->skip; ntx < netdev_get_num_tc(dev); ntx++) { + if (arg->fn(sch, ntx + TC_H_MIN_PRIORITY, arg) < 0) { + arg->stop = 1; + return; + } + arg->count++; + } + + /* Pad the values and skip over unused traffic classes */ + if (ntx < TC_MAX_QUEUE) { + arg->count = TC_MAX_QUEUE; + ntx = TC_MAX_QUEUE; + } + + /* Reset offset, sort out remaining per-queue qdiscs */ + for (ntx -= TC_MAX_QUEUE; ntx < dev->num_tx_queues; ntx++) { if (arg->fn(sch, ntx + 1, arg) < 0) { arg->stop = 1; - break; + return; } arg->count++; } -- cgit v1.2.3 From 69d78ef25c7b0058674145500efb12255738ba8a Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 13 Oct 2017 14:00:57 +0200 Subject: net: sched: store Qdisc pointer in struct block Prepare for removal of tp->q and store Qdisc pointer in the block structure. Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/net/pkt_cls.h | 4 ++-- include/net/sch_generic.h | 1 + net/sched/cls_api.c | 3 ++- net/sched/sch_atm.c | 4 ++-- net/sched/sch_cbq.c | 2 +- net/sched/sch_drr.c | 2 +- net/sched/sch_dsmark.c | 2 +- net/sched/sch_fq_codel.c | 2 +- net/sched/sch_hfsc.c | 4 ++-- net/sched/sch_htb.c | 4 ++-- net/sched/sch_ingress.c | 6 +++--- net/sched/sch_multiq.c | 2 +- net/sched/sch_prio.c | 2 +- net/sched/sch_qfq.c | 2 +- net/sched/sch_sfb.c | 2 +- net/sched/sch_sfq.c | 2 +- 16 files changed, 23 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 60d39789e4f0..e6c9e1e4d711 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -22,7 +22,7 @@ struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index, bool create); void tcf_chain_put(struct tcf_chain *chain); int tcf_block_get(struct tcf_block **p_block, - struct tcf_proto __rcu **p_filter_chain); + struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q); void tcf_block_put(struct tcf_block *block); int tcf_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res, bool compat_mode); @@ -30,7 +30,7 @@ int tcf_classify(struct sk_buff *skb, const struct tcf_proto *tp, #else static inline int tcf_block_get(struct tcf_block **p_block, - struct tcf_proto __rcu **p_filter_chain) + struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q) { return 0; } diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 684d8ed27eaa..df4032ca1b7f 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -270,6 +270,7 @@ struct tcf_chain { struct tcf_block { struct list_head chain_list; + struct Qdisc *q; }; static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 2977b8a90851..f7d3f1f539b7 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -241,7 +241,7 @@ tcf_chain_filter_chain_ptr_set(struct tcf_chain *chain, } int tcf_block_get(struct tcf_block **p_block, - struct tcf_proto __rcu **p_filter_chain) + struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q) { struct tcf_block *block = kzalloc(sizeof(*block), GFP_KERNEL); struct tcf_chain *chain; @@ -257,6 +257,7 @@ int tcf_block_get(struct tcf_block **p_block, goto err_chain_create; } tcf_chain_filter_chain_ptr_set(chain, p_filter_chain); + block->q = q; *p_block = block; return 0; diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index c5fcdf1a58a0..2dbd249c0b2f 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -281,7 +281,7 @@ static int atm_tc_change(struct Qdisc *sch, u32 classid, u32 parent, goto err_out; } - error = tcf_block_get(&flow->block, &flow->filter_list); + error = tcf_block_get(&flow->block, &flow->filter_list, sch); if (error) { kfree(flow); goto err_out; @@ -546,7 +546,7 @@ static int atm_tc_init(struct Qdisc *sch, struct nlattr *opt) p->link.q = &noop_qdisc; pr_debug("atm_tc_init: link (%p) qdisc %p\n", &p->link, p->link.q); - err = tcf_block_get(&p->link.block, &p->link.filter_list); + err = tcf_block_get(&p->link.block, &p->link.filter_list, sch); if (err) return err; diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index dcef97fa8047..c3b92d62190e 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -1566,7 +1566,7 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t if (cl == NULL) goto failure; - err = tcf_block_get(&cl->block, &cl->filter_list); + err = tcf_block_get(&cl->block, 
&cl->filter_list, sch); if (err) { kfree(cl); return err; diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index 2d0e8d4bdc29..753dc7a77b60 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -412,7 +412,7 @@ static int drr_init_qdisc(struct Qdisc *sch, struct nlattr *opt) struct drr_sched *q = qdisc_priv(sch); int err; - err = tcf_block_get(&q->block, &q->filter_list); + err = tcf_block_get(&q->block, &q->filter_list, sch); if (err) return err; err = qdisc_class_hash_init(&q->clhash); diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index 2836c80c7aa5..fb4fb71c68cf 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -344,7 +344,7 @@ static int dsmark_init(struct Qdisc *sch, struct nlattr *opt) if (!opt) goto errout; - err = tcf_block_get(&p->block, &p->filter_list); + err = tcf_block_get(&p->block, &p->filter_list, sch); if (err) return err; diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index de3b57ceca7b..3c40edeff1e8 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -481,7 +481,7 @@ static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt) return err; } - err = tcf_block_get(&q->block, &q->filter_list); + err = tcf_block_get(&q->block, &q->filter_list, sch); if (err) return err; diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 3f88b75488b0..a692184bd333 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -1033,7 +1033,7 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (cl == NULL) return -ENOBUFS; - err = tcf_block_get(&cl->block, &cl->filter_list); + err = tcf_block_get(&cl->block, &cl->filter_list, sch); if (err) { kfree(cl); return err; @@ -1405,7 +1405,7 @@ hfsc_init_qdisc(struct Qdisc *sch, struct nlattr *opt) return err; q->eligible = RB_ROOT; - err = tcf_block_get(&q->root.block, &q->root.filter_list); + err = tcf_block_get(&q->root.block, &q->root.filter_list, sch); if (err) return err; diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index c6d7ae81b41f..57be73c0e1d2 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -1030,7 +1030,7 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt) if (!opt) return -EINVAL; - err = tcf_block_get(&q->block, &q->filter_list); + err = tcf_block_get(&q->block, &q->filter_list, sch); if (err) return err; @@ -1393,7 +1393,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, if (!cl) goto failure; - err = tcf_block_get(&cl->block, &cl->filter_list); + err = tcf_block_get(&cl->block, &cl->filter_list, sch); if (err) { kfree(cl); goto failure; diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c index 44de4ee51ce9..9ccc1b89b0d9 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -59,7 +59,7 @@ static int ingress_init(struct Qdisc *sch, struct nlattr *opt) struct net_device *dev = qdisc_dev(sch); int err; - err = tcf_block_get(&q->block, &dev->ingress_cl_list); + err = tcf_block_get(&q->block, &dev->ingress_cl_list, sch); if (err) return err; @@ -153,11 +153,11 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt) struct net_device *dev = qdisc_dev(sch); int err; - err = tcf_block_get(&q->ingress_block, &dev->ingress_cl_list); + err = tcf_block_get(&q->ingress_block, &dev->ingress_cl_list, sch); if (err) return err; - err = tcf_block_get(&q->egress_block, &dev->egress_cl_list); + err = tcf_block_get(&q->egress_block, &dev->egress_cl_list, sch); if (err) return err; diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index 
ff4fc3e0facd..31e0a284eeff 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -245,7 +245,7 @@ static int multiq_init(struct Qdisc *sch, struct nlattr *opt) if (opt == NULL) return -EINVAL; - err = tcf_block_get(&q->block, &q->filter_list); + err = tcf_block_get(&q->block, &q->filter_list, sch); if (err) return err; diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index 2dd6c68ae91e..95fad348c8d7 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -212,7 +212,7 @@ static int prio_init(struct Qdisc *sch, struct nlattr *opt) if (!opt) return -EINVAL; - err = tcf_block_get(&q->block, &q->filter_list); + err = tcf_block_get(&q->block, &q->filter_list, sch); if (err) return err; diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index 6ddfd4991108..8694c7b6d2b1 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -1419,7 +1419,7 @@ static int qfq_init_qdisc(struct Qdisc *sch, struct nlattr *opt) int i, j, err; u32 max_cl_shift, maxbudg_shift, max_classes; - err = tcf_block_get(&q->block, &q->filter_list); + err = tcf_block_get(&q->block, &q->filter_list, sch); if (err) return err; diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index cc39e170b4aa..487d375f5a06 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -553,7 +553,7 @@ static int sfb_init(struct Qdisc *sch, struct nlattr *opt) struct sfb_sched_data *q = qdisc_priv(sch); int err; - err = tcf_block_get(&q->block, &q->filter_list); + err = tcf_block_get(&q->block, &q->filter_list, sch); if (err) return err; diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 74ea863b8240..123a53af2506 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -725,7 +725,7 @@ static int sfq_init(struct Qdisc *sch, struct nlattr *opt) setup_deferrable_timer(&q->perturb_timer, sfq_perturbation, (unsigned long)sch); - err = tcf_block_get(&q->block, &q->filter_list); + err = tcf_block_get(&q->block, &q->filter_list, sch); if (err) return err; -- cgit v1.2.3 From 855319becbcffec6988a4e781a861b69a71c5b58 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 13 Oct 2017 14:00:58 +0200 Subject: net: sched: store net pointer in block and introduce qdisc_net helper Store the net pointer in the block structure. Along the way, introduce the qdisc_net helper, which makes it easy to obtain the net pointer for a qdisc instance. Signed-off-by: Jiri Pirko Signed-off-by: David S.
Miller --- include/net/pkt_sched.h | 7 +++++++ include/net/sch_generic.h | 1 + net/sched/cls_api.c | 1 + 3 files changed, 9 insertions(+) (limited to 'include') diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index 259bc191ba59..2d234af15f3e 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -4,7 +4,9 @@ #include #include #include +#include #include +#include #include #define DEFAULT_TX_QUEUE_LEN 1000 @@ -146,4 +148,9 @@ static inline bool is_classid_clsact_egress(u32 classid) TC_H_MIN(classid) == TC_H_MIN(TC_H_MIN_EGRESS); } +static inline struct net *qdisc_net(struct Qdisc *q) +{ + return dev_net(q->dev_queue->dev); +} + #endif diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index df4032ca1b7f..9b2cb91dc0d9 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -270,6 +270,7 @@ struct tcf_chain { struct tcf_block { struct list_head chain_list; + struct net *net; struct Qdisc *q; }; diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index f7d3f1f539b7..856003caa3bb 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -257,6 +257,7 @@ int tcf_block_get(struct tcf_block **p_block, goto err_chain_create; } tcf_chain_filter_chain_ptr_set(chain, p_filter_chain); + block->net = qdisc_net(q); block->q = q; *p_block = block; return 0; -- cgit v1.2.3 From 44186460c85a0121562db7cfef132d63c869118f Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 13 Oct 2017 14:00:59 +0200 Subject: net: sched: introduce tcf_block_q and tcf_block_dev helpers These helpers allow getting the q and netdev pointers for a given block easily. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index e6c9e1e4d711..7bed674ba29a 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -24,6 +24,17 @@ void tcf_chain_put(struct tcf_chain *chain); int tcf_block_get(struct tcf_block **p_block, struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q); void tcf_block_put(struct tcf_block *block); + +static inline struct Qdisc *tcf_block_q(struct tcf_block *block) +{ + return block->q; +} + +static inline struct net_device *tcf_block_dev(struct tcf_block *block) +{ + return tcf_block_q(block)->dev_queue->dev; +} + int tcf_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res, bool compat_mode); @@ -39,6 +50,16 @@ static inline int tcf_block_get(struct tcf_block **p_block, struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q) { return 0; } static inline void tcf_block_put(struct tcf_block *block) { } +static inline struct Qdisc *tcf_block_q(struct tcf_block *block) +{ + return NULL; +} + +static inline struct net_device *tcf_block_dev(struct tcf_block *block) +{ + return NULL; +} + static inline int tcf_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res, bool compat_mode) { -- cgit v1.2.3 From 34e3759cf86a3e75463e34c1bb9691777406a175 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 13 Oct 2017 14:01:00 +0200 Subject: net: sched: teach tcf_bind/unbind_filter to use block->q Whenever the block->q is set, it can be used instead of tp->q as it contains the same value. When it is not set, which can't happen now but it might happen with the follow-up shared blocks introduction, the class is not set in the result. That would lead to a class lookup instead of direct class pointer use for classful qdiscs. However, it is not planned to support classful qdiscs sharing filter blocks, so that may never happen.
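To make the described guard concrete, here is a minimal standalone sketch (compilable userspace C; the structs are simplified stand-ins for the kernel types, and printf stands in for the real bind_tcf callback):

    #include <stdio.h>

    /* Simplified stand-ins for struct Qdisc, tcf_block, tcf_chain and
     * tcf_proto; only the pointers the bind path walks are kept.
     */
    struct Qdisc { const char *name; };
    struct tcf_block { struct Qdisc *q; };
    struct tcf_chain { struct tcf_block *block; };
    struct tcf_proto { struct tcf_chain *chain; };

    static void bind_filter(struct tcf_proto *tp)
    {
    	struct Qdisc *q = tp->chain->block->q;

    	/* q is not set for shared blocks; setting the class is then
    	 * unsupported, so bail out as the patch above does.
    	 */
    	if (!q)
    		return;
    	printf("bound filter on qdisc %s\n", q->name);
    }

    int main(void)
    {
    	struct Qdisc sch = { "htb" };
    	struct tcf_block owned = { &sch }, shared = { NULL };
    	struct tcf_chain c1 = { &owned }, c2 = { &shared };
    	struct tcf_proto tp1 = { &c1 }, tp2 = { &c2 };

    	bind_filter(&tp1);	/* prints */
    	bind_filter(&tp2);	/* silently skipped */
    	return 0;
    }
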
Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 7bed674ba29a..49a143e0fe65 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -74,36 +74,43 @@ __cls_set_class(unsigned long *clp, unsigned long cl) } static inline unsigned long -cls_set_class(struct tcf_proto *tp, unsigned long *clp, - unsigned long cl) +cls_set_class(struct Qdisc *q, unsigned long *clp, unsigned long cl) { unsigned long old_cl; - - tcf_tree_lock(tp); + + sch_tree_lock(q); old_cl = __cls_set_class(clp, cl); - tcf_tree_unlock(tp); - + sch_tree_unlock(q); return old_cl; } static inline void tcf_bind_filter(struct tcf_proto *tp, struct tcf_result *r, unsigned long base) { + struct Qdisc *q = tp->chain->block->q; unsigned long cl; - cl = tp->q->ops->cl_ops->bind_tcf(tp->q, base, r->classid); - cl = cls_set_class(tp, &r->class, cl); + /* Check q as it is not set for shared blocks. In that case, + * setting class is not supported. + */ + if (!q) + return; + cl = q->ops->cl_ops->bind_tcf(q, base, r->classid); + cl = cls_set_class(q, &r->class, cl); if (cl) - tp->q->ops->cl_ops->unbind_tcf(tp->q, cl); + q->ops->cl_ops->unbind_tcf(q, cl); } static inline void tcf_unbind_filter(struct tcf_proto *tp, struct tcf_result *r) { + struct Qdisc *q = tp->chain->block->q; unsigned long cl; + if (!q) + return; if ((cl = __cls_set_class(&r->class, 0)) != 0) - tp->q->ops->cl_ops->unbind_tcf(tp->q, cl); + q->ops->cl_ops->unbind_tcf(q, cl); } struct tcf_exts { -- cgit v1.2.3 From 74e3be6021d22df2ffcb691eae1affeb2bd0128e Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 13 Oct 2017 14:01:04 +0200 Subject: net: sched: use tcf_block_q helper to get q pointer for sch_tree_lock Use tcf_block_q helper to get q pointer to be used for direct call of sch_tree_lock/unlock instead of tcf_tree_lock/unlock. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/sch_generic.h | 3 --- net/sched/sch_api.c | 6 ++++-- 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 9b2cb91dc0d9..0aea9e23e97a 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -359,9 +359,6 @@ static inline void sch_tree_unlock(const struct Qdisc *q) spin_unlock_bh(qdisc_root_sleeping_lock(q)); } -#define tcf_tree_lock(tp) sch_tree_lock((tp)->q) -#define tcf_tree_unlock(tp) sch_tree_unlock((tp)->q) - extern struct Qdisc noop_qdisc; extern struct Qdisc_ops noop_qdisc_ops; extern struct Qdisc_ops pfifo_fast_ops; diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index aa82116ed10c..a9ac912f1d67 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1661,9 +1661,11 @@ static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg) struct tcf_bind_args *a = (void *)arg; if (tp->ops->bind_class) { - tcf_tree_lock(tp); + struct Qdisc *q = tcf_block_q(tp->chain->block); + + sch_tree_lock(q); tp->ops->bind_class(n, a->classid, a->cl); - tcf_tree_unlock(tp); + sch_tree_unlock(q); } return 0; } -- cgit v1.2.3 From 0da4af00b2ed3dbe46788623a696c4169447eadc Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 13 Oct 2017 15:08:07 -0700 Subject: ipv6: only update __use and lastusetime once per jiffy at most In order to not dirty the cacheline too often, we try to only update dst->__use and dst->lastusetime at most once per jiffy. 
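The heart of the change is a timestamp comparison in front of the two writes. A minimal userspace sketch of this rate-limiting pattern, with a plain counter standing in for jiffies and a stub struct standing in for dst_entry:

    #include <stdio.h>

    struct dst_entry {
    	unsigned int __use;
    	unsigned long lastuse;
    };

    static void dst_use_noref(struct dst_entry *dst, unsigned long time)
    {
    	/* Only dirty the cacheline when the clock has moved on */
    	if (time != dst->lastuse) {
    		dst->__use++;
    		dst->lastuse = time;
    	}
    }

    int main(void)
    {
    	struct dst_entry dst = { 0, 0 };
    	unsigned long jiffies = 1;

    	for (int i = 0; i < 1000; i++)
    		dst_use_noref(&dst, jiffies);	/* same jiffy: one update */
    	jiffies++;
    	dst_use_noref(&dst, jiffies);		/* new jiffy: counts again */

    	printf("__use=%u lastuse=%lu\n", dst.__use, dst.lastuse);
    	/* prints __use=2 lastuse=2 */
    	return 0;
    }
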
As dst->lastusetime is only used by the ipv6 garbage collector, this should provide good enough time resolution. And __use is only used in ipv6_route_seq_show() to show how many times a dst has been used; as __use is not atomic_t right now, it does not show the precise number of usage times anyway. So we think it should be OK to only update it at most once per jiffy. According to my latest syn flood test on a machine with an Intel Xeon 6th gen processor and 2 10G mlx NICs bonded together, each with 8 rx queues on 2 NUMA nodes: with this patch, the packet processing rate increases from ~3.49Mpps to ~3.75Mpps, a 7% increase. Note: dst_use() is being renamed to dst_hold_and_use() to better specify the purpose of the function. Signed-off-by: Wei Wang Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/dst.h | 15 ++++++++------- net/decnet/dn_route.c | 8 ++++---- 2 files changed, 12 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/net/dst.h b/include/net/dst.h index 204c19e25456..5047e8053d6c 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -255,17 +255,18 @@ static inline void dst_hold(struct dst_entry *dst) WARN_ON(atomic_inc_not_zero(&dst->__refcnt) == 0); } -static inline void dst_use(struct dst_entry *dst, unsigned long time) +static inline void dst_use_noref(struct dst_entry *dst, unsigned long time) { - dst_hold(dst); - dst->__use++; - dst->lastuse = time; + if (time != dst->lastuse) { + dst->__use++; + dst->lastuse = time; + } } -static inline void dst_use_noref(struct dst_entry *dst, unsigned long time) +static inline void dst_hold_and_use(struct dst_entry *dst, unsigned long time) { - dst->__use++; - dst->lastuse = time; + dst_hold(dst); + dst_use_noref(dst, time); } static inline struct dst_entry *dst_clone(struct dst_entry *dst) diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index 0bd3afd01dd2..bff5ab88cdbb 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -338,7 +338,7 @@ static int dn_insert_route(struct dn_route *rt, unsigned int hash, struct dn_rou dn_rt_hash_table[hash].chain); rcu_assign_pointer(dn_rt_hash_table[hash].chain, rth); - dst_use(&rth->dst, now); + dst_hold_and_use(&rth->dst, now); spin_unlock_bh(&dn_rt_hash_table[hash].lock); dst_release_immediate(&rt->dst); @@ -351,7 +351,7 @@ static int dn_insert_route(struct dn_route *rt, unsigned int hash, struct dn_rou rcu_assign_pointer(rt->dst.dn_next, dn_rt_hash_table[hash].chain); rcu_assign_pointer(dn_rt_hash_table[hash].chain, rt); - dst_use(&rt->dst, now); + dst_hold_and_use(&rt->dst, now); spin_unlock_bh(&dn_rt_hash_table[hash].lock); *rp = rt; return 0; @@ -1258,7 +1258,7 @@ static int __dn_route_output_key(struct dst_entry **pprt, const struct flowidn * (flp->flowidn_mark == rt->fld.flowidn_mark) && dn_is_output_route(rt) && (rt->fld.flowidn_oif == flp->flowidn_oif)) { - dst_use(&rt->dst, jiffies); + dst_hold_and_use(&rt->dst, jiffies); rcu_read_unlock_bh(); *pprt = &rt->dst; return 0; @@ -1535,7 +1535,7 @@ static int dn_route_input(struct sk_buff *skb) (rt->fld.flowidn_oif == 0) && (rt->fld.flowidn_mark == skb->mark) && (rt->fld.flowidn_iif == cb->iif)) { - dst_use(&rt->dst, jiffies); + dst_hold_and_use(&rt->dst, jiffies); rcu_read_unlock(); skb_dst_set(skb, (struct dst_entry *)rt); return 0; -- cgit v1.2.3 From 9185a610f8f7f1b4e4d28c9de27d1969cf58e0f1 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 12 Oct 2017 18:40:02 -0400 Subject: tracing: bpf: Hide bpf trace events when they are not used All the trace events defined
in include/trace/events/bpf.h are only used when CONFIG_BPF_SYSCALL is defined. But this file gets included by include/linux/bpf_trace.h which is included by the networking code with CREATE_TRACE_POINTS defined. If a trace event is created but not used it still has data structures and functions created for its use, even though nothing is using them. To not waste space, do not define the BPF trace events in bpf.h unless CONFIG_BPF_SYSCALL is defined. Signed-off-by: Steven Rostedt (VMware) Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/trace/events/bpf.h | 5 ++++- kernel/bpf/core.c | 3 +++ 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/trace/events/bpf.h b/include/trace/events/bpf.h index 52c8425d144b..1fb58faa4a44 100644 --- a/include/trace/events/bpf.h +++ b/include/trace/events/bpf.h @@ -4,6 +4,9 @@ #if !defined(_TRACE_BPF_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_BPF_H +/* These are only used within the BPF_SYSCALL code */ +#ifdef CONFIG_BPF_SYSCALL + #include #include #include @@ -345,7 +348,7 @@ TRACE_EVENT(bpf_map_next_key, __print_hex(__get_dynamic_array(nxt), __entry->key_len), __entry->key_trunc ? " ..." : "") ); - +#endif /* CONFIG_BPF_SYSCALL */ #endif /* _TRACE_BPF_H */ #include diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 248961af2421..8e7c8bf2b687 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1580,5 +1580,8 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to, EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception); +/* These are only used within the BPF_SYSCALL code */ +#ifdef CONFIG_BPF_SYSCALL EXPORT_TRACEPOINT_SYMBOL_GPL(bpf_prog_get_type); EXPORT_TRACEPOINT_SYMBOL_GPL(bpf_prog_put_rcu); +#endif -- cgit v1.2.3 From 5a6cd6de76ae78b651e7c36eba8b1da465d65f06 Mon Sep 17 00:00:00 2001 From: Alan Brady Date: Thu, 5 Oct 2017 14:53:40 -0700 Subject: ethtool: add ethtool_intersect_link_masks This function provides a way to intersect two link masks together to find the common ground between them. For example in i40e, the driver first generates link masks for what is supported by the PHY type. The driver then gets the link masks for what the NVM supports. The resulting intersection between them yields what can truly be supported. Signed-off-by: Alan Brady Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- include/linux/ethtool.h | 10 ++++++++++ net/core/ethtool.c | 16 ++++++++++++++++ 2 files changed, 26 insertions(+) (limited to 'include') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 4587a4c36923..c77fa3529e15 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -163,6 +163,16 @@ extern int __ethtool_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *link_ksettings); +/** + * ethtool_intersect_link_masks - Given two link masks, AND them together + * @dst: first mask and where result is stored + * @src: second mask to intersect with + * + * Given two link mode masks, AND them together and save the result in dst. 
+ */ +void ethtool_intersect_link_masks(struct ethtool_link_ksettings *dst, + struct ethtool_link_ksettings *src); + void ethtool_convert_legacy_u32_to_link_mode(unsigned long *dst, u32 legacy_u32); diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 3228411ada0f..0c406306792a 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -403,6 +403,22 @@ static int __ethtool_set_flags(struct net_device *dev, u32 data) return 0; } +/* Given two link masks, AND them together and save the result in dst. */ +void ethtool_intersect_link_masks(struct ethtool_link_ksettings *dst, + struct ethtool_link_ksettings *src) +{ + unsigned int size = BITS_TO_LONGS(__ETHTOOL_LINK_MODE_MASK_NBITS); + unsigned int idx = 0; + + for (; idx < size; idx++) { + dst->link_modes.supported[idx] &= + src->link_modes.supported[idx]; + dst->link_modes.advertising[idx] &= + src->link_modes.advertising[idx]; + } +} +EXPORT_SYMBOL(ethtool_intersect_link_masks); + void ethtool_convert_legacy_u32_to_link_mode(unsigned long *dst, u32 legacy_u32) { -- cgit v1.2.3 From a68f4a27f55f1d54e35c270aff89383da4b1b656 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 18 Oct 2017 11:36:39 +0100 Subject: rxrpc: Support service upgrade from a kernel service Provide support for a kernel service to make use of the service upgrade facility. This involves: (1) Pass an upgrade request flag to rxrpc_kernel_begin_call(). (2) Make rxrpc_kernel_recv_data() return the call's current service ID so that the caller can detect service upgrade and see what the service was upgraded to. Signed-off-by: David Howells --- Documentation/networking/rxrpc.txt | 17 +++++++++++++++-- fs/afs/internal.h | 1 + fs/afs/rxrpc.c | 11 +++++++---- include/net/af_rxrpc.h | 5 +++-- net/rxrpc/af_rxrpc.c | 5 ++++- net/rxrpc/recvmsg.c | 5 ++++- 6 files changed, 34 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/Documentation/networking/rxrpc.txt b/Documentation/networking/rxrpc.txt index 810620153a44..9fb61a6bc7cf 100644 --- a/Documentation/networking/rxrpc.txt +++ b/Documentation/networking/rxrpc.txt @@ -782,7 +782,9 @@ The kernel interface functions are as follows: struct key *key, unsigned long user_call_ID, s64 tx_total_len, - gfp_t gfp); + gfp_t gfp, + rxrpc_notify_rx_t notify_rx, + bool upgrade); This allocates the infrastructure to make a new RxRPC call and assigns call and connection numbers. The call will be made on the UDP port that @@ -803,6 +805,13 @@ The kernel interface functions are as follows: allows the kernel to encrypt directly to the packet buffers, thereby saving a copy. The value may not be less than -1. + notify_rx is a pointer to a function to be called when events such as + incoming data packets or remote aborts happen. + + upgrade should be set to true if a client operation should request that + the server upgrade the service to a better one. The resultant service ID + is returned by rxrpc_kernel_recv_data(). + If this function is successful, an opaque reference to the RxRPC call is returned. The caller now holds a reference on this and it must be properly ended. @@ -850,7 +859,8 @@ The kernel interface functions are as follows: size_t size, size_t *_offset, bool want_more, - u32 *_abort) + u32 *_abort, + u16 *_service) This is used to receive data from either the reply part of a client call or the request part of a service call. 
buf and size specify how much @@ -873,6 +883,9 @@ The kernel interface functions are as follows: If a remote ABORT is detected, the abort code received will be stored in *_abort and ECONNABORTED will be returned. + The service ID that the call ended up with is returned into *_service. + This can be used to see if a call got a service upgrade. + (*) Abort a call. void rxrpc_kernel_abort_call(struct socket *sock, diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 82e16556afea..3f03f7888302 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -100,6 +100,7 @@ struct afs_call { bool send_pages; /* T if data from mapping should be sent */ bool need_attention; /* T if RxRPC poked us */ bool async; /* T if asynchronous */ + bool upgrade; /* T to request service upgrade */ u16 service_id; /* RxRPC service ID to call */ __be16 port; /* target UDP port */ u32 operation_ID; /* operation ID for an incoming call */ diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 0bf191f0dbaf..172a4f9747ac 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -387,7 +387,8 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, tx_total_len, gfp, (async ? afs_wake_up_async_call : - afs_wake_up_call_waiter)); + afs_wake_up_call_waiter), + call->upgrade); call->key = NULL; if (IS_ERR(rxcall)) { ret = PTR_ERR(rxcall); @@ -443,7 +444,7 @@ error_do_abort: abort_code = 0; offset = 0; rxrpc_kernel_recv_data(afs_socket, rxcall, NULL, 0, &offset, - false, &abort_code); + false, &abort_code, &call->service_id); ret = call->type->abort_to_error(abort_code); } error_kill_call: @@ -471,7 +472,8 @@ static void afs_deliver_to_call(struct afs_call *call) size_t offset = 0; ret = rxrpc_kernel_recv_data(afs_socket, call->rxcall, NULL, 0, &offset, false, - &call->abort_code); + &call->abort_code, + &call->service_id); trace_afs_recv_data(call, 0, offset, false, ret); if (ret == -EINPROGRESS || ret == -EAGAIN) @@ -851,7 +853,8 @@ int afs_extract_data(struct afs_call *call, void *buf, size_t count, ret = rxrpc_kernel_recv_data(afs_socket, call->rxcall, buf, count, &call->offset, - want_more, &call->abort_code); + want_more, &call->abort_code, + &call->service_id); trace_afs_recv_data(call, count, call->offset, want_more, ret); if (ret == 0 || ret == -EAGAIN) return ret; diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h index 3ac79150291f..820dd365a08e 100644 --- a/include/net/af_rxrpc.h +++ b/include/net/af_rxrpc.h @@ -49,12 +49,13 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *, unsigned long, s64, gfp_t, - rxrpc_notify_rx_t); + rxrpc_notify_rx_t, + bool); int rxrpc_kernel_send_data(struct socket *, struct rxrpc_call *, struct msghdr *, size_t, rxrpc_notify_end_tx_t); int rxrpc_kernel_recv_data(struct socket *, struct rxrpc_call *, - void *, size_t, size_t *, bool, u32 *); + void *, size_t, size_t *, bool, u32 *, u16 *); bool rxrpc_kernel_abort_call(struct socket *, struct rxrpc_call *, u32, int, const char *); void rxrpc_kernel_end_call(struct socket *, struct rxrpc_call *); diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index fb17552fd292..481f7dc90ba2 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -265,6 +265,7 @@ static int rxrpc_listen(struct socket *sock, int backlog) * @tx_total_len: Total length of data to transmit during the call (or -1) * @gfp: The allocation constraints * @notify_rx: Where to send notifications instead of socket queue + * @upgrade: Request service upgrade for call * * Allow a kernel service to begin a call on the nominated 
socket. This just * sets up all the internal tracking structures and allocates connection and @@ -279,7 +280,8 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, unsigned long user_call_ID, s64 tx_total_len, gfp_t gfp, - rxrpc_notify_rx_t notify_rx) + rxrpc_notify_rx_t notify_rx, + bool upgrade) { struct rxrpc_conn_parameters cp; struct rxrpc_call *call; @@ -304,6 +306,7 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, cp.key = key; cp.security_level = 0; cp.exclusive = false; + cp.upgrade = upgrade; cp.service_id = srx->srx_service; call = rxrpc_new_client_call(rx, &cp, srx, user_call_ID, tx_total_len, gfp); diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index bdece21f313d..e4937b3f3685 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -607,6 +607,7 @@ wait_error: * @_offset: The running offset into the buffer. * @want_more: True if more data is expected to be read * @_abort: Where the abort code is stored if -ECONNABORTED is returned + * @_service: Where to store the actual service ID (may be upgraded) * * Allow a kernel service to receive data and pick up information about the * state of a call. Returns 0 if got what was asked for and there's more @@ -624,7 +625,7 @@ wait_error: */ int rxrpc_kernel_recv_data(struct socket *sock, struct rxrpc_call *call, void *buf, size_t size, size_t *_offset, - bool want_more, u32 *_abort) + bool want_more, u32 *_abort, u16 *_service) { struct iov_iter iter; struct kvec iov; @@ -680,6 +681,8 @@ int rxrpc_kernel_recv_data(struct socket *sock, struct rxrpc_call *call, read_phase_complete: ret = 1; out: + if (_service) + *_service = call->service_id; mutex_unlock(&call->user_mutex); _leave(" = %d [%zu,%d]", ret, *_offset, *_abort); return ret; -- cgit v1.2.3 From f4d15fb6f99af9b99f688bd87579137be44f85ee Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 18 Oct 2017 11:07:31 +0100 Subject: rxrpc: Provide functions for allowing cleaner handling of signals Provide a couple of functions to allow cleaner handling of signals in a kernel service. They are: (1) rxrpc_kernel_get_rtt() This allows the kernel service to find out the RTT time for a call, so as to better judge how large a timeout to employ. Note, though, that whilst this returns a value in nanoseconds, the timeouts can only actually be in jiffies. (2) rxrpc_kernel_check_life() This returns a number that is updated when ACKs are received from the peer (notably including PING RESPONSE ACKs which we can elicit by sending PING ACKs to see if the call still exists on the server). The caller should compare the numbers of two calls to see if the call is still alive. These can be used to provide an extending timeout rather than returning immediately in the case that a signal occurs that would otherwise abort an RPC operation. The timeout would be extended if the server is still responsive and the call is still apparently alive on the server. For most operations this isn't that necessary - but for FS.StoreData it is: OpenAFS writes the data to storage as it comes in without making a backup, so if we immediately abort it when partially complete on a CTRL+C, say, we have no idea of the state of the file after the abort. 
Signed-off-by: David Howells --- Documentation/networking/rxrpc.txt | 24 ++++++++++++++++++++++++ include/net/af_rxrpc.h | 2 ++ net/rxrpc/af_rxrpc.c | 19 +++++++++++++++++++ net/rxrpc/peer_object.c | 13 +++++++++++++ 4 files changed, 58 insertions(+) (limited to 'include') diff --git a/Documentation/networking/rxrpc.txt b/Documentation/networking/rxrpc.txt index 9fb61a6bc7cf..1fb5c553aedd 100644 --- a/Documentation/networking/rxrpc.txt +++ b/Documentation/networking/rxrpc.txt @@ -1033,6 +1033,30 @@ The kernel interface functions are as follows: It returns 0 if the call was requeued and an error otherwise. + (*) Get call RTT. + + u64 rxrpc_kernel_get_rtt(struct socket *sock, struct rxrpc_call *call); + + Get the RTT time to the peer in use by a call. The value returned is in + nanoseconds. + + (*) Check call still alive. + + u32 rxrpc_kernel_check_life(struct socket *sock, + struct rxrpc_call *call); + + This returns a number that is updated when ACKs are received from the peer + (notably including PING RESPONSE ACKs which we can elicit by sending PING + ACKs to see if the call still exists on the server). The caller should + compare the numbers of two calls to see if the call is still alive after + waiting for a suitable interval. + + This allows the caller to work out if the server is still contactable and + if the call is still alive on the server whilst waiting for the server to + process a client operation. + + This function may transmit a PING ACK. + ======================= CONFIGURABLE PARAMETERS diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h index 820dd365a08e..2b3a6eec4570 100644 --- a/include/net/af_rxrpc.h +++ b/include/net/af_rxrpc.h @@ -61,6 +61,7 @@ bool rxrpc_kernel_abort_call(struct socket *, struct rxrpc_call *, void rxrpc_kernel_end_call(struct socket *, struct rxrpc_call *); void rxrpc_kernel_get_peer(struct socket *, struct rxrpc_call *, struct sockaddr_rxrpc *); +u64 rxrpc_kernel_get_rtt(struct socket *, struct rxrpc_call *); int rxrpc_kernel_charge_accept(struct socket *, rxrpc_notify_rx_t, rxrpc_user_attach_call_t, unsigned long, gfp_t); void rxrpc_kernel_set_tx_length(struct socket *, struct rxrpc_call *, s64); @@ -68,5 +69,6 @@ int rxrpc_kernel_retry_call(struct socket *, struct rxrpc_call *, struct sockaddr_rxrpc *, struct key *); int rxrpc_kernel_check_call(struct socket *, struct rxrpc_call *, enum rxrpc_call_completion *, u32 *); +u32 rxrpc_kernel_check_life(struct socket *, struct rxrpc_call *); #endif /* _NET_RXRPC_H */ diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 481f7dc90ba2..73c980e26581 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -339,6 +339,25 @@ void rxrpc_kernel_end_call(struct socket *sock, struct rxrpc_call *call) } EXPORT_SYMBOL(rxrpc_kernel_end_call); +/** + * rxrpc_kernel_check_life - Check to see whether a call is still alive + * @sock: The socket the call is on + * @call: The call to check + * + * Allow a kernel service to find out whether a call is still alive - ie. we're + * getting ACKs from the server. Returns a number representing the life state + * which can be compared to that returned by a previous call. + * + * If this is a client call, ping ACKs will be sent to the server to find out + * whether it's still responsive and whether the call is still alive on the + * server. 
+ */ +u32 rxrpc_kernel_check_life(struct socket *sock, struct rxrpc_call *call) +{ + return call->acks_latest; +} +EXPORT_SYMBOL(rxrpc_kernel_check_life); + /** * rxrpc_kernel_check_call - Check a call's state * @sock: The socket the call is on diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index 5787f97f5330..d02a99f37f5f 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -411,3 +411,16 @@ void rxrpc_kernel_get_peer(struct socket *sock, struct rxrpc_call *call, *_srx = call->peer->srx; } EXPORT_SYMBOL(rxrpc_kernel_get_peer); + +/** + * rxrpc_kernel_get_rtt - Get a call's peer RTT + * @sock: The socket on which the call is in progress. + * @call: The call to query + * + * Get the call's peer RTT. + */ +u64 rxrpc_kernel_get_rtt(struct socket *sock, struct rxrpc_call *call) +{ + return call->peer->rtt; +} +EXPORT_SYMBOL(rxrpc_kernel_get_rtt); -- cgit v1.2.3 From 6710e1126934d8b4372b4d2f9ae1646cd3f151bf Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Mon, 16 Oct 2017 12:19:28 +0200 Subject: bpf: introduce new bpf cpu map type BPF_MAP_TYPE_CPUMAP The 'cpumap' is primarily used as a backend map for XDP BPF helper call bpf_redirect_map() and XDP_REDIRECT action, like 'devmap'. This patch implements the main part of the map. It is not connected to the XDP redirect system yet, and no SKB allocations are done yet. The main concern in this patch is to ensure the datapath can run without any locking. This adds complexity to the setup and tear-down procedure, whose assumptions are carefully documented in the code comments. V2: - make sure array isn't larger than NR_CPUS - make sure each CPU added is a valid possible CPU V3: fix nitpicks from Jakub Kicinski V5: - Restrict map allocation to root / CAP_SYS_ADMIN - WARN_ON_ONCE if queue is not empty on tear-down - Return -EPERM on memlock limit instead of -ENOMEM - Error code in __cpu_map_entry_alloc() also handles ptr_ring_cleanup() - Moved cpu_map_enqueue() to next patch V6: all noticed by Daniel Borkmann - Fix err return code in cpu_map_alloc() introduced in V5 - Move cpu_possible() check after max_entries boundary check - Forbid usage initially in check_map_func_compatibility() V7: - Fix alloc error path spotted by Daniel Borkmann - Did stress test adding+removing CPUs from the map concurrently - Fixed refcnt issue on cpu_map_entry, kthread started too soon - Make sure packets are flushed during tear-down, involved use of rcu_barrier() and kthread_run only exits after queue is empty - Fix alloc error path in __cpu_map_entry_alloc() for ptr_ring V8: - Nitpicking comments and grammar fixes by Edward Cree - Fix missing semi-colon introduced in V7 due to rebasing - Move struct bpf_cpu_map_entry members cpu+map_id to tracepoint patch Signed-off-by: Jesper Dangaard Brouer Acked-by: Alexei Starovoitov Signed-off-by: David S.
Miller --- include/linux/bpf_types.h | 1 + include/uapi/linux/bpf.h | 1 + kernel/bpf/Makefile | 1 + kernel/bpf/cpumap.c | 560 +++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 8 +- kernel/bpf/verifier.c | 5 + tools/include/uapi/linux/bpf.h | 1 + 7 files changed, 576 insertions(+), 1 deletion(-) create mode 100644 kernel/bpf/cpumap.c (limited to 'include') diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 6f1a567667b8..814c1081a4a9 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -41,4 +41,5 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops) #ifdef CONFIG_STREAM_PARSER BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops) #endif +BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops) #endif diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6db9e1d679cd..4303fb6c3817 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -112,6 +112,7 @@ enum bpf_map_type { BPF_MAP_TYPE_HASH_OF_MAPS, BPF_MAP_TYPE_DEVMAP, BPF_MAP_TYPE_SOCKMAP, + BPF_MAP_TYPE_CPUMAP, }; enum bpf_prog_type { diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 53fb09f92e3f..e597daae6120 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -5,6 +5,7 @@ obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list obj-$(CONFIG_BPF_SYSCALL) += disasm.o ifeq ($(CONFIG_NET),y) obj-$(CONFIG_BPF_SYSCALL) += devmap.o +obj-$(CONFIG_BPF_SYSCALL) += cpumap.o ifeq ($(CONFIG_STREAM_PARSER),y) obj-$(CONFIG_BPF_SYSCALL) += sockmap.o endif diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c new file mode 100644 index 000000000000..e1e25ddba038 --- /dev/null +++ b/kernel/bpf/cpumap.c @@ -0,0 +1,560 @@ +/* bpf/cpumap.c + * + * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc. + * Released under terms in GPL version 2. See COPYING. + */ + +/* The 'cpumap' is primarily used as a backend map for XDP BPF helper + * call bpf_redirect_map() and XDP_REDIRECT action, like 'devmap'. + * + * Unlike devmap which redirects XDP frames out another NIC device, + * this map type redirects raw XDP frames to another CPU. The remote + * CPU will do SKB-allocation and call the normal network stack. + * + * This is a scalability and isolation mechanism, that allow + * separating the early driver network XDP layer, from the rest of the + * netstack, and assigning dedicated CPUs for this stage. This + * basically allows for 10G wirespeed pre-filtering via bpf. + */ +#include +#include +#include + +#include +#include +#include +#include + +/* General idea: XDP packets getting XDP redirected to another CPU, + * will maximum be stored/queued for one driver ->poll() call. It is + * guaranteed that setting flush bit and flush operation happen on + * same CPU. Thus, cpu_map_flush operation can deduct via this_cpu_ptr() + * which queue in bpf_cpu_map_entry contains packets. 
+ */ + +#define CPU_MAP_BULK_SIZE 8 /* 8 == one cacheline on 64-bit archs */ +struct xdp_bulk_queue { + void *q[CPU_MAP_BULK_SIZE]; + unsigned int count; +}; + +/* Struct for every remote "destination" CPU in map */ +struct bpf_cpu_map_entry { + u32 qsize; /* Queue size placeholder for map lookup */ + + /* XDP can run multiple RX-ring queues, need __percpu enqueue store */ + struct xdp_bulk_queue __percpu *bulkq; + + /* Queue with potential multi-producers, and single-consumer kthread */ + struct ptr_ring *queue; + struct task_struct *kthread; + struct work_struct kthread_stop_wq; + + atomic_t refcnt; /* Control when this struct can be free'ed */ + struct rcu_head rcu; +}; + +struct bpf_cpu_map { + struct bpf_map map; + /* Below members specific for map type */ + struct bpf_cpu_map_entry **cpu_map; + unsigned long __percpu *flush_needed; +}; + +static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, + struct xdp_bulk_queue *bq); + +static u64 cpu_map_bitmap_size(const union bpf_attr *attr) +{ + return BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long); +} + +static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) +{ + struct bpf_cpu_map *cmap; + int err = -ENOMEM; + u64 cost; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + /* check sanity of attributes */ + if (attr->max_entries == 0 || attr->key_size != 4 || + attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE) + return ERR_PTR(-EINVAL); + + cmap = kzalloc(sizeof(*cmap), GFP_USER); + if (!cmap) + return ERR_PTR(-ENOMEM); + + /* mandatory map attributes */ + cmap->map.map_type = attr->map_type; + cmap->map.key_size = attr->key_size; + cmap->map.value_size = attr->value_size; + cmap->map.max_entries = attr->max_entries; + cmap->map.map_flags = attr->map_flags; + cmap->map.numa_node = bpf_map_attr_numa_node(attr); + + /* Pre-limit array size based on NR_CPUS, not final CPU check */ + if (cmap->map.max_entries > NR_CPUS) { + err = -E2BIG; + goto free_cmap; + } + + /* make sure page count doesn't overflow */ + cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *); + cost += cpu_map_bitmap_size(attr) * num_possible_cpus(); + if (cost >= U32_MAX - PAGE_SIZE) + goto free_cmap; + cmap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + + /* Notice returns -EPERM on if map size is larger than memlock limit */ + ret = bpf_map_precharge_memlock(cmap->map.pages); + if (ret) { + err = ret; + goto free_cmap; + } + + /* A per cpu bitfield with a bit per possible CPU in map */ + cmap->flush_needed = __alloc_percpu(cpu_map_bitmap_size(attr), + __alignof__(unsigned long)); + if (!cmap->flush_needed) + goto free_cmap; + + /* Alloc array for possible remote "destination" CPUs */ + cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries * + sizeof(struct bpf_cpu_map_entry *), + cmap->map.numa_node); + if (!cmap->cpu_map) + goto free_percpu; + + return &cmap->map; +free_percpu: + free_percpu(cmap->flush_needed); +free_cmap: + kfree(cmap); + return ERR_PTR(err); +} + +void __cpu_map_queue_destructor(void *ptr) +{ + /* The tear-down procedure should have made sure that queue is + * empty. See __cpu_map_entry_replace() and work-queue + * invoked cpu_map_kthread_stop(). Catch any broken behaviour + * gracefully and warn once. 
+ */ + if (WARN_ON_ONCE(ptr)) + page_frag_free(ptr); +} + +static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) +{ + if (atomic_dec_and_test(&rcpu->refcnt)) { + /* The queue should be empty at this point */ + ptr_ring_cleanup(rcpu->queue, __cpu_map_queue_destructor); + kfree(rcpu->queue); + kfree(rcpu); + } +} + +static void get_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) +{ + atomic_inc(&rcpu->refcnt); +} + +/* called from workqueue, to workaround syscall using preempt_disable */ +static void cpu_map_kthread_stop(struct work_struct *work) +{ + struct bpf_cpu_map_entry *rcpu; + + rcpu = container_of(work, struct bpf_cpu_map_entry, kthread_stop_wq); + + /* Wait for flush in __cpu_map_entry_free(), via full RCU barrier, + * as it waits until all in-flight call_rcu() callbacks complete. + */ + rcu_barrier(); + + /* kthread_stop will wake_up_process and wait for it to complete */ + kthread_stop(rcpu->kthread); +} + +static int cpu_map_kthread_run(void *data) +{ + struct bpf_cpu_map_entry *rcpu = data; + + set_current_state(TASK_INTERRUPTIBLE); + + /* When kthread gives stop order, then rcpu have been disconnected + * from map, thus no new packets can enter. Remaining in-flight + * per CPU stored packets are flushed to this queue. Wait honoring + * kthread_stop signal until queue is empty. + */ + while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) { + struct xdp_pkt *xdp_pkt; + + schedule(); + /* Do work */ + while ((xdp_pkt = ptr_ring_consume(rcpu->queue))) { + /* For now just "refcnt-free" */ + page_frag_free(xdp_pkt); + } + __set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); + + put_cpu_map_entry(rcpu); + return 0; +} + +struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, int map_id) +{ + gfp_t gfp = GFP_ATOMIC|__GFP_NOWARN; + struct bpf_cpu_map_entry *rcpu; + int numa, err; + + /* Have map->numa_node, but choose node of redirect target CPU */ + numa = cpu_to_node(cpu); + + rcpu = kzalloc_node(sizeof(*rcpu), gfp, numa); + if (!rcpu) + return NULL; + + /* Alloc percpu bulkq */ + rcpu->bulkq = __alloc_percpu_gfp(sizeof(*rcpu->bulkq), + sizeof(void *), gfp); + if (!rcpu->bulkq) + goto free_rcu; + + /* Alloc queue */ + rcpu->queue = kzalloc_node(sizeof(*rcpu->queue), gfp, numa); + if (!rcpu->queue) + goto free_bulkq; + + err = ptr_ring_init(rcpu->queue, qsize, gfp); + if (err) + goto free_queue; + + rcpu->qsize = qsize; + + /* Setup kthread */ + rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa, + "cpumap/%d/map:%d", cpu, map_id); + if (IS_ERR(rcpu->kthread)) + goto free_ptr_ring; + + get_cpu_map_entry(rcpu); /* 1-refcnt for being in cmap->cpu_map[] */ + get_cpu_map_entry(rcpu); /* 1-refcnt for kthread */ + + /* Make sure kthread runs on a single CPU */ + kthread_bind(rcpu->kthread, cpu); + wake_up_process(rcpu->kthread); + + return rcpu; + +free_ptr_ring: + ptr_ring_cleanup(rcpu->queue, NULL); +free_queue: + kfree(rcpu->queue); +free_bulkq: + free_percpu(rcpu->bulkq); +free_rcu: + kfree(rcpu); + return NULL; +} + +void __cpu_map_entry_free(struct rcu_head *rcu) +{ + struct bpf_cpu_map_entry *rcpu; + int cpu; + + /* This cpu_map_entry have been disconnected from map and one + * RCU graze-period have elapsed. Thus, XDP cannot queue any + * new packets and cannot change/set flush_needed that can + * find this entry. 
+ */ + rcpu = container_of(rcu, struct bpf_cpu_map_entry, rcu); + + /* Flush remaining packets in percpu bulkq */ + for_each_online_cpu(cpu) { + struct xdp_bulk_queue *bq = per_cpu_ptr(rcpu->bulkq, cpu); + + /* No concurrent bq_enqueue can run at this point */ + bq_flush_to_queue(rcpu, bq); + } + free_percpu(rcpu->bulkq); + /* Cannot kthread_stop() here, last put free rcpu resources */ + put_cpu_map_entry(rcpu); +} + +/* After xchg pointer to bpf_cpu_map_entry, use the call_rcu() to + * ensure any driver rcu critical sections have completed, but this + * does not guarantee a flush has happened yet. Because driver side + * rcu_read_lock/unlock only protects the running XDP program. The + * atomic xchg and NULL-ptr check in __cpu_map_flush() makes sure a + * pending flush op doesn't fail. + * + * The bpf_cpu_map_entry is still used by the kthread, and there can + * still be pending packets (in queue and percpu bulkq). A refcnt + * makes sure to last user (kthread_stop vs. call_rcu) free memory + * resources. + * + * The rcu callback __cpu_map_entry_free flush remaining packets in + * percpu bulkq to queue. Due to caller map_delete_elem() disable + * preemption, cannot call kthread_stop() to make sure queue is empty. + * Instead a work_queue is started for stopping kthread, + * cpu_map_kthread_stop, which waits for an RCU graze period before + * stopping kthread, emptying the queue. + */ +void __cpu_map_entry_replace(struct bpf_cpu_map *cmap, + u32 key_cpu, struct bpf_cpu_map_entry *rcpu) +{ + struct bpf_cpu_map_entry *old_rcpu; + + old_rcpu = xchg(&cmap->cpu_map[key_cpu], rcpu); + if (old_rcpu) { + call_rcu(&old_rcpu->rcu, __cpu_map_entry_free); + INIT_WORK(&old_rcpu->kthread_stop_wq, cpu_map_kthread_stop); + schedule_work(&old_rcpu->kthread_stop_wq); + } +} + +int cpu_map_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); + u32 key_cpu = *(u32 *)key; + + if (key_cpu >= map->max_entries) + return -EINVAL; + + /* notice caller map_delete_elem() use preempt_disable() */ + __cpu_map_entry_replace(cmap, key_cpu, NULL); + return 0; +} + +int cpu_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); + struct bpf_cpu_map_entry *rcpu; + + /* Array index key correspond to CPU number */ + u32 key_cpu = *(u32 *)key; + /* Value is the queue size */ + u32 qsize = *(u32 *)value; + + if (unlikely(map_flags > BPF_EXIST)) + return -EINVAL; + if (unlikely(key_cpu >= cmap->map.max_entries)) + return -E2BIG; + if (unlikely(map_flags == BPF_NOEXIST)) + return -EEXIST; + if (unlikely(qsize > 16384)) /* sanity limit on qsize */ + return -EOVERFLOW; + + /* Make sure CPU is a valid possible cpu */ + if (!cpu_possible(key_cpu)) + return -ENODEV; + + if (qsize == 0) { + rcpu = NULL; /* Same as deleting */ + } else { + /* Updating qsize cause re-allocation of bpf_cpu_map_entry */ + rcpu = __cpu_map_entry_alloc(qsize, key_cpu, map->id); + if (!rcpu) + return -ENOMEM; + } + rcu_read_lock(); + __cpu_map_entry_replace(cmap, key_cpu, rcpu); + rcu_read_unlock(); + return 0; +} + +void cpu_map_free(struct bpf_map *map) +{ + struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); + int cpu; + u32 i; + + /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, + * so the bpf programs (can be more than one that used this map) were + * disconnected from events. Wait for outstanding critical sections in + * these programs to complete. 
The rcu critical section only guarantees + * no further "XDP/bpf-side" reads against bpf_cpu_map->cpu_map. + * It does __not__ ensure pending flush operations (if any) are + * complete. + */ + synchronize_rcu(); + + /* To ensure all pending flush operations have completed wait for flush + * bitmap to indicate all flush_needed bits to be zero on _all_ cpus. + * Because the above synchronize_rcu() ensures the map is disconnected + * from the program we can assume no new bits will be set. + */ + for_each_online_cpu(cpu) { + unsigned long *bitmap = per_cpu_ptr(cmap->flush_needed, cpu); + + while (!bitmap_empty(bitmap, cmap->map.max_entries)) + cond_resched(); + } + + /* For cpu_map the remote CPUs can still be using the entries + * (struct bpf_cpu_map_entry). + */ + for (i = 0; i < cmap->map.max_entries; i++) { + struct bpf_cpu_map_entry *rcpu; + + rcpu = READ_ONCE(cmap->cpu_map[i]); + if (!rcpu) + continue; + + /* bq flush and cleanup happens after RCU graze-period */ + __cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */ + } + free_percpu(cmap->flush_needed); + bpf_map_area_free(cmap->cpu_map); + kfree(cmap); +} + +struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key) +{ + struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); + struct bpf_cpu_map_entry *rcpu; + + if (key >= map->max_entries) + return NULL; + + rcpu = READ_ONCE(cmap->cpu_map[key]); + return rcpu; +} + +static void *cpu_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_cpu_map_entry *rcpu = + __cpu_map_lookup_elem(map, *(u32 *)key); + + return rcpu ? &rcpu->qsize : NULL; +} + +static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); + u32 index = key ? *(u32 *)key : U32_MAX; + u32 *next = next_key; + + if (index >= cmap->map.max_entries) { + *next = 0; + return 0; + } + + if (index == cmap->map.max_entries - 1) + return -ENOENT; + *next = index + 1; + return 0; +} + +const struct bpf_map_ops cpu_map_ops = { + .map_alloc = cpu_map_alloc, + .map_free = cpu_map_free, + .map_delete_elem = cpu_map_delete_elem, + .map_update_elem = cpu_map_update_elem, + .map_lookup_elem = cpu_map_lookup_elem, + .map_get_next_key = cpu_map_get_next_key, +}; + +static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, + struct xdp_bulk_queue *bq) +{ + struct ptr_ring *q; + int i; + + if (unlikely(!bq->count)) + return 0; + + q = rcpu->queue; + spin_lock(&q->producer_lock); + + for (i = 0; i < bq->count; i++) { + void *xdp_pkt = bq->q[i]; + int err; + + err = __ptr_ring_produce(q, xdp_pkt); + if (err) { + /* Free xdp_pkt */ + page_frag_free(xdp_pkt); + } + } + bq->count = 0; + spin_unlock(&q->producer_lock); + + return 0; +} + +/* Notice: Will change in later patch */ +struct xdp_pkt { + void *data; + u16 len; + u16 headroom; +}; + +/* Runs under RCU-read-side, plus in softirq under NAPI protection. + * Thus, safe percpu variable access. + */ +int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt) +{ + struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq); + + if (unlikely(bq->count == CPU_MAP_BULK_SIZE)) + bq_flush_to_queue(rcpu, bq); + + /* Notice, xdp_buff/page MUST be queued here, long enough for + * driver to code invoking us to finished, due to driver + * (e.g. ixgbe) recycle tricks based on page-refcnt. + * + * Thus, incoming xdp_pkt is always queued here (else we race + * with another CPU on page-refcnt and remaining driver code). 
+ * Queue time is very short, as driver will invoke flush + * operation, when completing napi->poll call. + */ + bq->q[bq->count++] = xdp_pkt; + return 0; +} + +void __cpu_map_insert_ctx(struct bpf_map *map, u32 bit) +{ + struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); + unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed); + + __set_bit(bit, bitmap); +} + +void __cpu_map_flush(struct bpf_map *map) +{ + struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); + unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed); + u32 bit; + + /* The napi->poll softirq makes sure __cpu_map_insert_ctx() + * and __cpu_map_flush() happen on same CPU. Thus, the percpu + * bitmap indicate which percpu bulkq have packets. + */ + for_each_set_bit(bit, bitmap, map->max_entries) { + struct bpf_cpu_map_entry *rcpu = READ_ONCE(cmap->cpu_map[bit]); + struct xdp_bulk_queue *bq; + + /* This is possible if entry is removed by user space + * between xdp redirect and flush op. + */ + if (unlikely(!rcpu)) + continue; + + __clear_bit(bit, bitmap); + + /* Flush all frames in bulkq to real queue */ + bq = this_cpu_ptr(rcpu->bulkq); + bq_flush_to_queue(rcpu, bq); + + /* If already running, costs spin_lock_irqsave + smb_mb */ + wake_up_process(rcpu->kthread); + } +} diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index d124e702e040..54fba06942f5 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -592,6 +592,12 @@ static int map_update_elem(union bpf_attr *attr) if (copy_from_user(value, uvalue, value_size) != 0) goto free_value; + /* Need to create a kthread, thus must support schedule */ + if (map->map_type == BPF_MAP_TYPE_CPUMAP) { + err = map->ops->map_update_elem(map, key, value, attr->flags); + goto out; + } + /* must increment bpf_prog_active to avoid kprobe+bpf triggering from * inside bpf map update or delete otherwise deadlocks are possible */ @@ -622,7 +628,7 @@ static int map_update_elem(union bpf_attr *attr) } __this_cpu_dec(bpf_prog_active); preempt_enable(); - +out: if (!err) trace_bpf_map_update_elem(map, ufd, key, value); free_value: diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9755279d94cb..cefa64be9a2f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1444,6 +1444,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (func_id != BPF_FUNC_redirect_map) goto error; break; + /* Restrict bpf side of cpumap, open when use-cases appear */ + case BPF_MAP_TYPE_CPUMAP: + if (func_id != BPF_FUNC_redirect_map) + goto error; + break; case BPF_MAP_TYPE_ARRAY_OF_MAPS: case BPF_MAP_TYPE_HASH_OF_MAPS: if (func_id != BPF_FUNC_map_lookup_elem) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index fb4fb81ce5b0..fa93033dc521 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -112,6 +112,7 @@ enum bpf_map_type { BPF_MAP_TYPE_HASH_OF_MAPS, BPF_MAP_TYPE_DEVMAP, BPF_MAP_TYPE_SOCKMAP, + BPF_MAP_TYPE_CPUMAP, }; enum bpf_prog_type { -- cgit v1.2.3 From 9c270af37bb62e708e3e4415d653ce73e713df02 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Mon, 16 Oct 2017 12:19:34 +0200 Subject: bpf: XDP_REDIRECT enable use of cpumap This patch connects cpumap to the xdp_do_redirect_map infrastructure. Still no SKB allocation are done yet. The XDP frames are transferred to the other CPU, but they are simply refcnt decremented on the remote CPU. This served as a good benchmark for measuring the overhead of remote refcnt decrement. 
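Once wired up this way, a BPF program selects the map with bpf_redirect_map() just as it would a devmap. A sketch in the style of the kernel's samples/bpf directory, assuming that directory's bpf_helpers.h wrappers; the section and map names are illustrative, and user space must first write a queue size into slot 0:

    #include <uapi/linux/bpf.h>
    #include "bpf_helpers.h"

    /* One slot per destination CPU; the value is the qsize handed to
     * the remote CPU's kthread, as described in the previous patch.
     */
    struct bpf_map_def SEC("maps") cpu_map = {
    	.type		= BPF_MAP_TYPE_CPUMAP,
    	.key_size	= sizeof(__u32),
    	.value_size	= sizeof(__u32),
    	.max_entries	= 64,
    };

    SEC("xdp")
    int xdp_redirect_to_cpu(struct xdp_md *ctx)
    {
    	__u32 key = 0;	/* CPU slot chosen at setup time */

    	/* The frame is dropped if the slot turns out to be empty */
    	return bpf_redirect_map(&cpu_map, key, 0);
    }

    char _license[] SEC("license") = "GPL";
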
If the driver's page recycle cache is not efficient, this exposes a bottleneck in the page allocator. A shout-out to MST's ptr_ring, which is the secret behind it being so efficient at transferring memory pointers between CPUs without constantly bouncing cache-lines between CPUs. V3: Handle !CONFIG_BPF_SYSCALL pointed out by kbuild test robot. V4: Make Generic-XDP aware of cpumap type, but don't allow redirect yet, as the implementation requires a separate upstream discussion. V5: - Fix a maybe-uninitialized warning pointed out by kbuild test robot. - Restrict bpf-prog side access to cpumap, open when use-cases appear - Implement cpu_map_enqueue() as a simpler void pointer enqueue V6: - Allow cpumap type for usage in helper bpf_redirect_map, general bpf-prog side restriction moved to earlier patch. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/linux/bpf.h | 31 +++++++++- include/trace/events/xdp.h | 10 +++- kernel/bpf/cpumap.c | 22 ++++++- kernel/bpf/verifier.c | 3 +- net/core/filter.c | 140 +++++++++++++++++++++++++++++++++++---------- 5 files changed, 172 insertions(+), 34 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4373125de1f3..6d4dd844828a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -355,6 +355,13 @@ struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key); void __dev_map_insert_ctx(struct bpf_map *map, u32 index); void __dev_map_flush(struct bpf_map *map); +struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key); +void __cpu_map_insert_ctx(struct bpf_map *map, u32 index); +void __cpu_map_flush(struct bpf_map *map); +struct xdp_buff; +int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, + struct net_device *dev_rx); + /* Return map's numa specified by userspace */ static inline int bpf_map_attr_numa_node(const union bpf_attr *attr) { @@ -362,7 +369,7 @@ static inline int bpf_map_attr_numa_node(const union bpf_attr *attr) attr->numa_node : NUMA_NO_NODE; } -#else +#else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) { return ERR_PTR(-EOPNOTSUPP); } @@ -425,6 +432,28 @@ static inline void __dev_map_insert_ctx(struct bpf_map *map, u32 index) { } static inline void __dev_map_flush(struct bpf_map *map) { } + +static inline +struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key) +{ + return NULL; +} + +static inline void __cpu_map_insert_ctx(struct bpf_map *map, u32 index) +{ +} + +static inline void __cpu_map_flush(struct bpf_map *map) +{ +} + +struct xdp_buff; +static inline int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, + struct xdp_buff *xdp, + struct net_device *dev_rx) +{ + return 0; +} #endif /* CONFIG_BPF_SYSCALL */ #if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_BPF_SYSCALL) diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index 4e16c43fba10..eb2ece96c1a2 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -136,12 +136,18 @@ DEFINE_EVENT_PRINT(xdp_redirect_template, xdp_redirect_map_err, __entry->map_id, __entry->map_index) ); +#define devmap_ifindex(fwd, map) \ + (!fwd ? 0 : \ + (!map ? 0 : \ + ((map->map_type == BPF_MAP_TYPE_DEVMAP) ? \ + ((struct net_device *)fwd)->ifindex : 0))) + #define _trace_xdp_redirect_map(dev, xdp, fwd, map, idx) \ - trace_xdp_redirect_map(dev, xdp, fwd ?
fwd->ifindex : 0, \ + trace_xdp_redirect_map(dev, xdp, devmap_ifindex(fwd, map), \ 0, map, idx) #define _trace_xdp_redirect_map_err(dev, xdp, fwd, map, idx, err) \ - trace_xdp_redirect_map_err(dev, xdp, fwd ? fwd->ifindex : 0, \ + trace_xdp_redirect_map_err(dev, xdp, devmap_ifindex(fwd, map), \ err, map, idx) #endif /* _TRACE_XDP_H */ diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index e1e25ddba038..768da6a2c265 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -500,7 +500,7 @@ struct xdp_pkt { /* Runs under RCU-read-side, plus in softirq under NAPI protection. * Thus, safe percpu variable access. */ -int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt) +static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt) { struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq); @@ -520,6 +520,26 @@ int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt) return 0; } +int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, + struct net_device *dev_rx) +{ + struct xdp_pkt *xdp_pkt; + int headroom; + + /* For now this is just used as a void pointer to data_hard_start. + * Followup patch will generalize this. + */ + xdp_pkt = xdp->data_hard_start; + + /* Fake writing into xdp_pkt->data to measure overhead */ + headroom = xdp->data - xdp->data_hard_start; + if (headroom < sizeof(*xdp_pkt)) + xdp_pkt->data = xdp->data; + + bq_enqueue(rcpu, xdp_pkt); + return 0; +} + void __cpu_map_insert_ctx(struct bpf_map *map, u32 bit) { struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index cefa64be9a2f..e4d5136725a2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1486,7 +1486,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, goto error; break; case BPF_FUNC_redirect_map: - if (map->map_type != BPF_MAP_TYPE_DEVMAP) + if (map->map_type != BPF_MAP_TYPE_DEVMAP && + map->map_type != BPF_MAP_TYPE_CPUMAP) goto error; break; case BPF_FUNC_sk_redirect_map: diff --git a/net/core/filter.c b/net/core/filter.c index 140fa9f9c0f4..4d88e0665c41 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2526,10 +2526,36 @@ static int __bpf_tx_xdp(struct net_device *dev, err = dev->netdev_ops->ndo_xdp_xmit(dev, xdp); if (err) return err; - if (map) + dev->netdev_ops->ndo_xdp_flush(dev); + return 0; +} + +static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, + struct bpf_map *map, + struct xdp_buff *xdp, + u32 index) +{ + int err; + + if (map->map_type == BPF_MAP_TYPE_DEVMAP) { + struct net_device *dev = fwd; + + if (!dev->netdev_ops->ndo_xdp_xmit) + return -EOPNOTSUPP; + + err = dev->netdev_ops->ndo_xdp_xmit(dev, xdp); + if (err) + return err; __dev_map_insert_ctx(map, index); - else - dev->netdev_ops->ndo_xdp_flush(dev); + + } else if (map->map_type == BPF_MAP_TYPE_CPUMAP) { + struct bpf_cpu_map_entry *rcpu = fwd; + + err = cpu_map_enqueue(rcpu, xdp, dev_rx); + if (err) + return err; + __cpu_map_insert_ctx(map, index); + } return 0; } @@ -2539,11 +2565,33 @@ void xdp_do_flush_map(void) struct bpf_map *map = ri->map_to_flush; ri->map_to_flush = NULL; - if (map) - __dev_map_flush(map); + if (map) { + switch (map->map_type) { + case BPF_MAP_TYPE_DEVMAP: + __dev_map_flush(map); + break; + case BPF_MAP_TYPE_CPUMAP: + __cpu_map_flush(map); + break; + default: + break; + } + } } EXPORT_SYMBOL_GPL(xdp_do_flush_map); +static void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index) +{ + switch (map->map_type) { + 
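+	/* What the stored pointer is depends on the map type:
+	 * a net_device for devmap, a bpf_cpu_map_entry for cpumap.
+	 */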
case BPF_MAP_TYPE_DEVMAP: + return __dev_map_lookup_elem(map, index); + case BPF_MAP_TYPE_CPUMAP: + return __cpu_map_lookup_elem(map, index); + default: + return NULL; + } +} + static inline bool xdp_map_invalid(const struct bpf_prog *xdp_prog, unsigned long aux) { @@ -2556,8 +2604,8 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, struct redirect_info *ri = this_cpu_ptr(&redirect_info); unsigned long map_owner = ri->map_owner; struct bpf_map *map = ri->map; - struct net_device *fwd = NULL; u32 index = ri->ifindex; + void *fwd = NULL; int err; ri->ifindex = 0; @@ -2570,7 +2618,7 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, goto err; } - fwd = __dev_map_lookup_elem(map, index); + fwd = __xdp_map_lookup_elem(map, index); if (!fwd) { err = -EINVAL; goto err; @@ -2578,7 +2626,7 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, if (ri->map_to_flush && ri->map_to_flush != map) xdp_do_flush_map(); - err = __bpf_tx_xdp(fwd, map, xdp, index); + err = __bpf_tx_xdp_map(dev, fwd, map, xdp, index); if (unlikely(err)) goto err; @@ -2620,54 +2668,88 @@ err: } EXPORT_SYMBOL_GPL(xdp_do_redirect); -int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, - struct bpf_prog *xdp_prog) +static int __xdp_generic_ok_fwd_dev(struct sk_buff *skb, struct net_device *fwd) +{ + unsigned int len; + + if (unlikely(!(fwd->flags & IFF_UP))) + return -ENETDOWN; + + len = fwd->mtu + fwd->hard_header_len + VLAN_HLEN; + if (skb->len > len) + return -EMSGSIZE; + + return 0; +} + +int xdp_do_generic_redirect_map(struct net_device *dev, struct sk_buff *skb, + struct bpf_prog *xdp_prog) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); unsigned long map_owner = ri->map_owner; struct bpf_map *map = ri->map; struct net_device *fwd = NULL; u32 index = ri->ifindex; - unsigned int len; int err = 0; ri->ifindex = 0; ri->map = NULL; ri->map_owner = 0; - if (map) { - if (unlikely(xdp_map_invalid(xdp_prog, map_owner))) { - err = -EFAULT; - map = NULL; - goto err; - } - fwd = __dev_map_lookup_elem(map, index); - } else { - fwd = dev_get_by_index_rcu(dev_net(dev), index); + if (unlikely(xdp_map_invalid(xdp_prog, map_owner))) { + err = -EFAULT; + map = NULL; + goto err; } + fwd = __xdp_map_lookup_elem(map, index); if (unlikely(!fwd)) { err = -EINVAL; goto err; } - if (unlikely(!(fwd->flags & IFF_UP))) { - err = -ENETDOWN; + if (map->map_type == BPF_MAP_TYPE_DEVMAP) { + if (unlikely((err = __xdp_generic_ok_fwd_dev(skb, fwd)))) + goto err; + skb->dev = fwd; + } else { + /* TODO: Handle BPF_MAP_TYPE_CPUMAP */ + err = -EBADRQC; goto err; } - len = fwd->mtu + fwd->hard_header_len + VLAN_HLEN; - if (skb->len > len) { - err = -EMSGSIZE; + _trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index); + return 0; +err: + _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err); + return err; +} + +int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, + struct bpf_prog *xdp_prog) +{ + struct redirect_info *ri = this_cpu_ptr(&redirect_info); + u32 index = ri->ifindex; + struct net_device *fwd; + int err = 0; + + if (ri->map) + return xdp_do_generic_redirect_map(dev, skb, xdp_prog); + + ri->ifindex = 0; + fwd = dev_get_by_index_rcu(dev_net(dev), index); + if (unlikely(!fwd)) { + err = -EINVAL; goto err; } + if (unlikely((err = __xdp_generic_ok_fwd_dev(skb, fwd)))) + goto err; + skb->dev = fwd; - map ? 
_trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index) - : _trace_xdp_redirect(dev, xdp_prog, index); + _trace_xdp_redirect(dev, xdp_prog, index); return 0; err: - map ? _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err) - : _trace_xdp_redirect_err(dev, xdp_prog, index, err); + _trace_xdp_redirect_err(dev, xdp_prog, index, err); return err; } EXPORT_SYMBOL_GPL(xdp_do_generic_redirect); -- cgit v1.2.3 From 1c601d829ab0d7ac3ac44853f83db2206afe67fc Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Mon, 16 Oct 2017 12:19:39 +0200 Subject: bpf: cpumap xdp_buff to skb conversion and allocation This patch makes cpumap functional, by adding SKB allocation and invoking the network stack on the dequeuing CPU. For constructing the SKB on the remote CPU, the xdp_buff is converted into a struct xdp_pkt, and it is mapped into the top headroom of the packet, to avoid allocating separate mem. For now, struct xdp_pkt is just a cpumap internal data structure, with info carried between enqueue and dequeue. If a driver doesn't have enough headroom, the frame is simply dropped, with return code -EOVERFLOW. This will be picked up by the xdp tracepoint infrastructure, to allow users to catch this. V2: take into account xdp->data_meta V4: - Drop busypoll tricks, keeping it more simple. - Skip RPS and Generic-XDP-recursive-reinjection, suggested by Alexei V5: correct RCU read protection around __netif_receive_skb_core. V6: Setting TASK_RUNNING vs TASK_INTERRUPTIBLE based on talk with Rik van Riel Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + kernel/bpf/cpumap.c | 152 +++++++++++++++++++++++++++++++++++++++------- net/core/dev.c | 27 ++++++++ 3 files changed, 158 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 31bb3010c69b..bf014afcb914 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3260,6 +3260,7 @@ int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb); int netif_rx(struct sk_buff *skb); int netif_rx_ni(struct sk_buff *skb); int netif_receive_skb(struct sk_buff *skb); +int netif_receive_skb_core(struct sk_buff *skb); gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb); void napi_gro_flush(struct napi_struct *napi, bool flush_old); struct sk_buff *napi_get_frags(struct napi_struct *napi); diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 768da6a2c265..ee7adf4352dd 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -25,6 +25,9 @@ #include #include +#include /* netif_receive_skb_core */ +#include /* eth_type_trans */ + /* General idea: XDP packets getting XDP redirected to another CPU, * will maximum be stored/queued for one driver ->poll() call. It is * guaranteed that setting flush bit and flush operation happen on @@ -179,6 +182,92 @@ static void cpu_map_kthread_stop(struct work_struct *work) kthread_stop(rcpu->kthread); } +/* For now, xdp_pkt is a cpumap internal data structure, with info + * carried between enqueue to dequeue. It is mapped into the top + * headroom of the packet, to avoid allocating separate mem.
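+ * If a frame's headroom cannot hold the struct, the frame is dropped
+ * and cpu_map_enqueue() returns -EOVERFLOW.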
+ */ +struct xdp_pkt { + void *data; + u16 len; + u16 headroom; + u16 metasize; + struct net_device *dev_rx; +}; + +/* Convert xdp_buff to xdp_pkt */ +static struct xdp_pkt *convert_to_xdp_pkt(struct xdp_buff *xdp) +{ + struct xdp_pkt *xdp_pkt; + int metasize; + int headroom; + + /* Assure headroom is available for storing info */ + headroom = xdp->data - xdp->data_hard_start; + metasize = xdp->data - xdp->data_meta; + metasize = metasize > 0 ? metasize : 0; + if ((headroom - metasize) < sizeof(*xdp_pkt)) + return NULL; + + /* Store info in top of packet */ + xdp_pkt = xdp->data_hard_start; + + xdp_pkt->data = xdp->data; + xdp_pkt->len = xdp->data_end - xdp->data; + xdp_pkt->headroom = headroom - sizeof(*xdp_pkt); + xdp_pkt->metasize = metasize; + + return xdp_pkt; +} + +struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, + struct xdp_pkt *xdp_pkt) +{ + unsigned int frame_size; + void *pkt_data_start; + struct sk_buff *skb; + + /* build_skb need to place skb_shared_info after SKB end, and + * also want to know the memory "truesize". Thus, need to + * know the memory frame size backing xdp_buff. + * + * XDP was designed to have PAGE_SIZE frames, but this + * assumption is not longer true with ixgbe and i40e. It + * would be preferred to set frame_size to 2048 or 4096 + * depending on the driver. + * frame_size = 2048; + * frame_len = frame_size - sizeof(*xdp_pkt); + * + * Instead, with info avail, skb_shared_info in placed after + * packet len. This, unfortunately fakes the truesize. + * Another disadvantage of this approach, the skb_shared_info + * is not at a fixed memory location, with mixed length + * packets, which is bad for cache-line hotness. + */ + frame_size = SKB_DATA_ALIGN(xdp_pkt->len) + xdp_pkt->headroom + + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + + pkt_data_start = xdp_pkt->data - xdp_pkt->headroom; + skb = build_skb(pkt_data_start, frame_size); + if (!skb) + return NULL; + + skb_reserve(skb, xdp_pkt->headroom); + __skb_put(skb, xdp_pkt->len); + if (xdp_pkt->metasize) + skb_metadata_set(skb, xdp_pkt->metasize); + + /* Essential SKB info: protocol and skb->dev */ + skb->protocol = eth_type_trans(skb, xdp_pkt->dev_rx); + + /* Optional SKB info, currently missing: + * - HW checksum info (skb->ip_summed) + * - HW RX hash (skb_set_hash) + * - RX ring dev queue index (skb_record_rx_queue) + */ + + return skb; +} + static int cpu_map_kthread_run(void *data) { struct bpf_cpu_map_entry *rcpu = data; @@ -191,15 +280,45 @@ static int cpu_map_kthread_run(void *data) * kthread_stop signal until queue is empty. */ while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) { + unsigned int processed = 0, drops = 0; struct xdp_pkt *xdp_pkt; - schedule(); - /* Do work */ - while ((xdp_pkt = ptr_ring_consume(rcpu->queue))) { - /* For now just "refcnt-free" */ - page_frag_free(xdp_pkt); + /* Release CPU reschedule checks */ + if (__ptr_ring_empty(rcpu->queue)) { + __set_current_state(TASK_INTERRUPTIBLE); + schedule(); + } else { + cond_resched(); + } + __set_current_state(TASK_RUNNING); + + /* Process packets in rcpu->queue */ + local_bh_disable(); + /* + * The bpf_cpu_map_entry is single consumer, with this + * kthread CPU pinned. Lockless access to ptr_ring + * consume side valid as no-resize allowed of queue. 
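+ * Producers (bq_flush_to_queue() on the RX CPUs) still serialize on
+ * the ring's producer_lock; only this consume side is lockless.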
+ */ + while ((xdp_pkt = __ptr_ring_consume(rcpu->queue))) { + struct sk_buff *skb; + int ret; + + skb = cpu_map_build_skb(rcpu, xdp_pkt); + if (!skb) { + page_frag_free(xdp_pkt); + continue; + } + + /* Inject into network stack */ + ret = netif_receive_skb_core(skb); + if (ret == NET_RX_DROP) + drops++; + + /* Limit BH-disable period */ + if (++processed == 8) + break; } - __set_current_state(TASK_INTERRUPTIBLE); + local_bh_enable(); /* resched point, may call do_softirq() */ } __set_current_state(TASK_RUNNING); @@ -490,13 +609,6 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, return 0; } -/* Notice: Will change in later patch */ -struct xdp_pkt { - void *data; - u16 len; - u16 headroom; -}; - /* Runs under RCU-read-side, plus in softirq under NAPI protection. * Thus, safe percpu variable access. */ @@ -524,17 +636,13 @@ int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, struct net_device *dev_rx) { struct xdp_pkt *xdp_pkt; - int headroom; - /* For now this is just used as a void pointer to data_hard_start. - * Followup patch will generalize this. - */ - xdp_pkt = xdp->data_hard_start; + xdp_pkt = convert_to_xdp_pkt(xdp); + if (!xdp_pkt) + return -EOVERFLOW; - /* Fake writing into xdp_pkt->data to measure overhead */ - headroom = xdp->data - xdp->data_hard_start; - if (headroom < sizeof(*xdp_pkt)) - xdp_pkt->data = xdp->data; + /* Info needed when constructing SKB on remote CPU */ + xdp_pkt->dev_rx = dev_rx; bq_enqueue(rcpu, xdp_pkt); return 0; diff --git a/net/core/dev.c b/net/core/dev.c index d2b20e73080e..cf5894f0e6eb 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4492,6 +4492,33 @@ out: return ret; } +/** + * netif_receive_skb_core - special purpose version of netif_receive_skb + * @skb: buffer to process + * + * More direct receive version of netif_receive_skb(). It should + * only be used by callers that have a need to skip RPS and Generic XDP. + * Caller must also take care of handling if (page_is_)pfmemalloc. + * + * This function may only be called from softirq context and interrupts + * should be enabled. + * + * Return values (usually ignored): + * NET_RX_SUCCESS: no congestion + * NET_RX_DROP: packet was dropped + */ +int netif_receive_skb_core(struct sk_buff *skb) +{ + int ret; + + rcu_read_lock(); + ret = __netif_receive_skb_core(skb, false); + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL(netif_receive_skb_core); + static int __netif_receive_skb(struct sk_buff *skb) { int ret; -- cgit v1.2.3 From f9419f7bd7a5318b636a941a0214c5cdfa6f6530 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Mon, 16 Oct 2017 12:19:44 +0200 Subject: bpf: cpumap add tracepoints This adds two tracepoint to the cpumap. One for the enqueue side trace_xdp_cpumap_enqueue() and one for the kthread dequeue side trace_xdp_cpumap_kthread(). To mitigate the tracepoint overhead, these are invoked during the enqueue/dequeue bulking phases, thus amortizing the cost. The obvious use-cases are for debugging and monitoring. The non-intuitive use-case is using these as a feedback loop to know the system load. One can imagine auto-scaling by reducing, adding or activating more worker CPUs on demand. V4: tracepoint remove time_limit info, instead add sched info V8: intro struct bpf_cpu_map_entry members cpu+map_id in this patch Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. 
Miller --- include/trace/events/xdp.h | 70 ++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/cpumap.c | 24 ++++++++++++---- 2 files changed, 89 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index eb2ece96c1a2..0c8dec61987e 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -150,6 +150,76 @@ DEFINE_EVENT_PRINT(xdp_redirect_template, xdp_redirect_map_err, trace_xdp_redirect_map_err(dev, xdp, devmap_ifindex(fwd, map), \ err, map, idx) +TRACE_EVENT(xdp_cpumap_kthread, + + TP_PROTO(int map_id, unsigned int processed, unsigned int drops, + int sched), + + TP_ARGS(map_id, processed, drops, sched), + + TP_STRUCT__entry( + __field(int, map_id) + __field(u32, act) + __field(int, cpu) + __field(unsigned int, drops) + __field(unsigned int, processed) + __field(int, sched) + ), + + TP_fast_assign( + __entry->map_id = map_id; + __entry->act = XDP_REDIRECT; + __entry->cpu = smp_processor_id(); + __entry->drops = drops; + __entry->processed = processed; + __entry->sched = sched; + ), + + TP_printk("kthread" + " cpu=%d map_id=%d action=%s" + " processed=%u drops=%u" + " sched=%d", + __entry->cpu, __entry->map_id, + __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), + __entry->processed, __entry->drops, + __entry->sched) +); + +TRACE_EVENT(xdp_cpumap_enqueue, + + TP_PROTO(int map_id, unsigned int processed, unsigned int drops, + int to_cpu), + + TP_ARGS(map_id, processed, drops, to_cpu), + + TP_STRUCT__entry( + __field(int, map_id) + __field(u32, act) + __field(int, cpu) + __field(unsigned int, drops) + __field(unsigned int, processed) + __field(int, to_cpu) + ), + + TP_fast_assign( + __entry->map_id = map_id; + __entry->act = XDP_REDIRECT; + __entry->cpu = smp_processor_id(); + __entry->drops = drops; + __entry->processed = processed; + __entry->to_cpu = to_cpu; + ), + + TP_printk("enqueue" + " cpu=%d map_id=%d action=%s" + " processed=%u drops=%u" + " to_cpu=%d", + __entry->cpu, __entry->map_id, + __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), + __entry->processed, __entry->drops, + __entry->to_cpu) +); + #endif /* _TRACE_XDP_H */ #include diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index ee7adf4352dd..b4358d84ddf1 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -24,6 +24,7 @@ #include #include #include +#include #include /* netif_receive_skb_core */ #include /* eth_type_trans */ @@ -43,6 +44,8 @@ struct xdp_bulk_queue { /* Struct for every remote "destination" CPU in map */ struct bpf_cpu_map_entry { + u32 cpu; /* kthread CPU and map index */ + int map_id; /* Back reference to map */ u32 qsize; /* Queue size placeholder for map lookup */ /* XDP can run multiple RX-ring queues, need __percpu enqueue store */ @@ -280,15 +283,16 @@ static int cpu_map_kthread_run(void *data) * kthread_stop signal until queue is empty. 
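 * Each bulk below now ends by firing the xdp_cpumap_kthread
 * tracepoint with the processed/drops/sched counts.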
*/ while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) { - unsigned int processed = 0, drops = 0; + unsigned int processed = 0, drops = 0, sched = 0; struct xdp_pkt *xdp_pkt; /* Release CPU reschedule checks */ if (__ptr_ring_empty(rcpu->queue)) { __set_current_state(TASK_INTERRUPTIBLE); schedule(); + sched = 1; } else { - cond_resched(); + sched = cond_resched(); } __set_current_state(TASK_RUNNING); @@ -318,6 +322,9 @@ static int cpu_map_kthread_run(void *data) if (++processed == 8) break; } + /* Feedback loop via tracepoint */ + trace_xdp_cpumap_kthread(rcpu->map_id, processed, drops, sched); + local_bh_enable(); /* resched point, may call do_softirq() */ } __set_current_state(TASK_RUNNING); @@ -354,7 +361,9 @@ struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, int map_id) if (err) goto free_queue; - rcpu->qsize = qsize; + rcpu->cpu = cpu; + rcpu->map_id = map_id; + rcpu->qsize = qsize; /* Setup kthread */ rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa, @@ -584,6 +593,8 @@ const struct bpf_map_ops cpu_map_ops = { static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, struct xdp_bulk_queue *bq) { + unsigned int processed = 0, drops = 0; + const int to_cpu = rcpu->cpu; struct ptr_ring *q; int i; @@ -599,13 +610,16 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, err = __ptr_ring_produce(q, xdp_pkt); if (err) { - /* Free xdp_pkt */ - page_frag_free(xdp_pkt); + drops++; + page_frag_free(xdp_pkt); /* Free xdp_pkt */ } + processed++; } bq->count = 0; spin_unlock(&q->producer_lock); + /* Feedback loop via tracepoints */ + trace_xdp_cpumap_enqueue(rcpu->map_id, processed, drops, to_cpu); return 0; } -- cgit v1.2.3 From f8b8b1cd5aadd221742b45eb0ee3c8a80abf036a Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Mon, 16 Oct 2017 11:12:18 -0400 Subject: net: dsa: split dsa_port's netdev member The dsa_port structure has a "netdev" member, which can be used for either the master device, or the slave device, depending on its type. It is true that today, CPU port are not exposed to userspace, thus the port's netdev member can be used to point to its master interface. But it is still slightly confusing, so split it into more explicit "master" and "slave" members inside an anonymous union. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- drivers/net/dsa/bcm_sf2.c | 6 +++--- drivers/net/dsa/mt7530.c | 2 +- drivers/net/dsa/mv88e6xxx/chip.c | 2 +- include/net/dsa.h | 9 ++++++++- net/dsa/dsa.c | 6 +++--- net/dsa/dsa2.c | 20 ++++++++++---------- net/dsa/dsa_priv.h | 4 ++-- net/dsa/legacy.c | 14 +++++++------- net/dsa/slave.c | 6 +++--- 9 files changed, 38 insertions(+), 31 deletions(-) (limited to 'include') diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c index 32025b990437..b43c063b9634 100644 --- a/drivers/net/dsa/bcm_sf2.c +++ b/drivers/net/dsa/bcm_sf2.c @@ -601,7 +601,7 @@ static void bcm_sf2_sw_fixed_link_update(struct dsa_switch *ds, int port, * state machine and make it go in PHY_FORCING state instead. 
*/ if (!status->link) - netif_carrier_off(ds->ports[port].netdev); + netif_carrier_off(ds->ports[port].slave); status->duplex = 1; } else { status->link = 1; @@ -690,7 +690,7 @@ static int bcm_sf2_sw_resume(struct dsa_switch *ds) static void bcm_sf2_sw_get_wol(struct dsa_switch *ds, int port, struct ethtool_wolinfo *wol) { - struct net_device *p = ds->ports[port].cpu_dp->netdev; + struct net_device *p = ds->ports[port].cpu_dp->master; struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds); struct ethtool_wolinfo pwol; @@ -713,7 +713,7 @@ static void bcm_sf2_sw_get_wol(struct dsa_switch *ds, int port, static int bcm_sf2_sw_set_wol(struct dsa_switch *ds, int port, struct ethtool_wolinfo *wol) { - struct net_device *p = ds->ports[port].cpu_dp->netdev; + struct net_device *p = ds->ports[port].cpu_dp->master; struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds); s8 cpu_port = ds->ports[port].cpu_dp->index; struct ethtool_wolinfo pwol; diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index 034241696ce2..fea2e665d0cb 100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -933,7 +933,7 @@ mt7530_setup(struct dsa_switch *ds) * controller also is the container for two GMACs nodes representing * as two netdev instances. */ - dn = ds->ports[MT7530_CPU_PORT].netdev->dev.of_node->parent; + dn = ds->ports[MT7530_CPU_PORT].master->dev.of_node->parent; priv->ethernet = syscon_node_to_regmap(dn); if (IS_ERR(priv->ethernet)) return PTR_ERR(priv->ethernet); diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index 76cf383dcf90..88b47fa09b41 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -1137,7 +1137,7 @@ static int mv88e6xxx_port_check_hw_vlan(struct dsa_switch *ds, int port, if (dsa_is_dsa_port(ds, i) || dsa_is_cpu_port(ds, i)) continue; - if (!ds->ports[port].netdev) + if (!ds->ports[port].slave) continue; if (vlan.member[i] == diff --git a/include/net/dsa.h b/include/net/dsa.h index 2746741f74cf..6ed1a17ed1bd 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -164,6 +164,14 @@ struct dsa_mall_tc_entry { struct dsa_port { + /* A CPU port is physically connected to a master device. + * A user port exposed to userspace has a slave device. 
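+ * Only one of the two pointers is valid for a given port, which is
+ * what makes the anonymous union below safe.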
+ */ + union { + struct net_device *master; + struct net_device *slave; + }; + /* CPU port tagging operations used by master or slave devices */ const struct dsa_device_ops *tag_ops; @@ -176,7 +184,6 @@ struct dsa_port { unsigned int index; const char *name; struct dsa_port *cpu_dp; - struct net_device *netdev; struct device_node *dn; unsigned int ageing_time; u8 stp_state; diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 832c659ff993..a3abf7a7b9a2 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -201,7 +201,7 @@ static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev, #ifdef CONFIG_PM_SLEEP static bool dsa_is_port_initialized(struct dsa_switch *ds, int p) { - return ds->enabled_port_mask & (1 << p) && ds->ports[p].netdev; + return ds->enabled_port_mask & (1 << p) && ds->ports[p].slave; } int dsa_switch_suspend(struct dsa_switch *ds) @@ -213,7 +213,7 @@ int dsa_switch_suspend(struct dsa_switch *ds) if (!dsa_is_port_initialized(ds, i)) continue; - ret = dsa_slave_suspend(ds->ports[i].netdev); + ret = dsa_slave_suspend(ds->ports[i].slave); if (ret) return ret; } @@ -240,7 +240,7 @@ int dsa_switch_resume(struct dsa_switch *ds) if (!dsa_is_port_initialized(ds, i)) continue; - ret = dsa_slave_resume(ds->ports[i].netdev); + ret = dsa_slave_resume(ds->ports[i].slave); if (ret) return ret; } diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 6ac9e11d385c..9e8b8aab049d 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -279,7 +279,7 @@ static int dsa_user_port_apply(struct dsa_port *port) if (err) { dev_warn(ds->dev, "Failed to create slave %d: %d\n", port->index, err); - port->netdev = NULL; + port->slave = NULL; return err; } @@ -289,7 +289,7 @@ static int dsa_user_port_apply(struct dsa_port *port) if (err) return err; - devlink_port_type_eth_set(&port->devlink_port, port->netdev); + devlink_port_type_eth_set(&port->devlink_port, port->slave); return 0; } @@ -297,9 +297,9 @@ static int dsa_user_port_apply(struct dsa_port *port) static void dsa_user_port_unapply(struct dsa_port *port) { devlink_port_unregister(&port->devlink_port); - if (port->netdev) { - dsa_slave_destroy(port->netdev); - port->netdev = NULL; + if (port->slave) { + dsa_slave_destroy(port->slave); + port->slave = NULL; port->ds->enabled_port_mask &= ~(1 << port->index); } } @@ -432,9 +432,9 @@ static int dsa_dst_apply(struct dsa_switch_tree *dst) * sent to the tag format's receive function. 
*/ wmb(); - dst->cpu_dp->netdev->dsa_ptr = dst->cpu_dp; + dst->cpu_dp->master->dsa_ptr = dst->cpu_dp; - err = dsa_master_ethtool_setup(dst->cpu_dp->netdev); + err = dsa_master_ethtool_setup(dst->cpu_dp->master); if (err) return err; @@ -451,9 +451,9 @@ static void dsa_dst_unapply(struct dsa_switch_tree *dst) if (!dst->applied) return; - dsa_master_ethtool_restore(dst->cpu_dp->netdev); + dsa_master_ethtool_restore(dst->cpu_dp->master); - dst->cpu_dp->netdev->dsa_ptr = NULL; + dst->cpu_dp->master->dsa_ptr = NULL; /* If we used a tagging format that doesn't have an ethertype * field, make sure that all packets from this point get sent @@ -499,7 +499,7 @@ static int dsa_cpu_parse(struct dsa_port *port, u32 index, if (!dst->cpu_dp) { dst->cpu_dp = port; - dst->cpu_dp->netdev = ethernet_dev; + dst->cpu_dp->master = ethernet_dev; } /* Initialize cpu_port_mask now for drv->setup() diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 623c22b75e81..1e9914062d0b 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -130,7 +130,7 @@ static inline struct net_device *dsa_master_find_slave(struct net_device *dev, if (port < 0 || port >= ds->num_ports) return NULL; - return ds->ports[port].netdev; + return ds->ports[port].slave; } /* port.c */ @@ -181,7 +181,7 @@ dsa_slave_to_master(const struct net_device *dev) { struct dsa_port *dp = dsa_slave_to_port(dev); - return dp->cpu_dp->netdev; + return dp->cpu_dp->master; } /* switch.c */ diff --git a/net/dsa/legacy.c b/net/dsa/legacy.c index cc28c6f792a3..b6c88fd33d4f 100644 --- a/net/dsa/legacy.c +++ b/net/dsa/legacy.c @@ -120,7 +120,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, return -EINVAL; } dst->cpu_dp = &ds->ports[i]; - dst->cpu_dp->netdev = master; + dst->cpu_dp->master = master; ds->cpu_port_mask |= 1 << i; } else if (!strcmp(name, "dsa")) { ds->dsa_port_mask |= 1 << i; @@ -261,10 +261,10 @@ static void dsa_switch_destroy(struct dsa_switch *ds) if (!(ds->enabled_port_mask & (1 << port))) continue; - if (!ds->ports[port].netdev) + if (!ds->ports[port].slave) continue; - dsa_slave_destroy(ds->ports[port].netdev); + dsa_slave_destroy(ds->ports[port].slave); } /* Disable configuration of the CPU and DSA ports */ @@ -601,7 +601,7 @@ static int dsa_setup_dst(struct dsa_switch_tree *dst, struct net_device *dev, wmb(); dev->dsa_ptr = dst->cpu_dp; - return dsa_master_ethtool_setup(dst->cpu_dp->netdev); + return dsa_master_ethtool_setup(dst->cpu_dp->master); } static int dsa_probe(struct platform_device *pdev) @@ -666,9 +666,9 @@ static void dsa_remove_dst(struct dsa_switch_tree *dst) { int i; - dsa_master_ethtool_restore(dst->cpu_dp->netdev); + dsa_master_ethtool_restore(dst->cpu_dp->master); - dst->cpu_dp->netdev->dsa_ptr = NULL; + dst->cpu_dp->master->dsa_ptr = NULL; /* If we used a tagging format that doesn't have an ethertype * field, make sure that all packets from this point get sent @@ -683,7 +683,7 @@ static void dsa_remove_dst(struct dsa_switch_tree *dst) dsa_switch_destroy(ds); } - dev_put(dst->cpu_dp->netdev); + dev_put(dst->cpu_dp->master); } static int dsa_remove(struct platform_device *pdev) diff --git a/net/dsa/slave.c b/net/dsa/slave.c index b72e07503a40..6906de0f0050 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1123,7 +1123,7 @@ static void dsa_slave_notify(struct net_device *dev, unsigned long val) int dsa_slave_create(struct dsa_port *port, const char *name) { struct dsa_port *cpu_dp = port->cpu_dp; - struct net_device *master = cpu_dp->netdev; + struct net_device *master = cpu_dp->master; struct 
dsa_switch *ds = port->ds; struct net_device *slave_dev; struct dsa_slave_priv *p; @@ -1170,7 +1170,7 @@ int dsa_slave_create(struct dsa_port *port, const char *name) p->old_link = -1; p->old_duplex = -1; - port->netdev = slave_dev; + port->slave = slave_dev; netif_carrier_off(slave_dev); @@ -1198,7 +1198,7 @@ out_phy: out_free: free_percpu(p->stats64); free_netdev(slave_dev); - port->netdev = NULL; + port->slave = NULL; return ret; } -- cgit v1.2.3 From c8652c83bc84ac8db44060ced0036de7628aa5e5 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Mon, 16 Oct 2017 11:12:19 -0400 Subject: net: dsa: add dsa_to_port helper The dsa_port structure is part of DSA core data and must only be updated by the latter. It is OK and sometimes necessary for the DSA drivers to access this data, but this has to be read-only. For that purpose, add a dsa_to_port() helper which returns a const pointer to a dsa_port structure which must be used by DSA drivers from now on instead of digging into ds->ports[] themselves. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- drivers/net/dsa/b53/b53_common.c | 4 ++-- drivers/net/dsa/lan9303-core.c | 2 +- drivers/net/dsa/mt7530.c | 4 ++-- drivers/net/dsa/mv88e6060.c | 2 +- drivers/net/dsa/mv88e6xxx/chip.c | 8 ++++---- drivers/net/dsa/qca8k.c | 4 ++-- include/net/dsa.h | 5 +++++ 7 files changed, 17 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index d4ce092def83..b48cf0487b43 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -1354,7 +1354,7 @@ int b53_br_join(struct dsa_switch *ds, int port, struct net_device *br) b53_read16(dev, B53_PVLAN_PAGE, B53_PVLAN_PORT_MASK(port), &pvlan); b53_for_each_port(dev, i) { - if (ds->ports[i].bridge_dev != br) + if (dsa_to_port(ds, i)->bridge_dev != br) continue; /* Add this local port to the remote port VLAN control @@ -1390,7 +1390,7 @@ void b53_br_leave(struct dsa_switch *ds, int port, struct net_device *br) b53_for_each_port(dev, i) { /* Don't touch the remaining ports */ - if (ds->ports[i].bridge_dev != br) + if (dsa_to_port(ds, i)->bridge_dev != br) continue; b53_read16(dev, B53_PVLAN_PAGE, B53_PVLAN_PORT_MASK(i), &reg); diff --git a/drivers/net/dsa/lan9303-core.c b/drivers/net/dsa/lan9303-core.c index fecfe1fe67ea..09a748327fc6 100644 --- a/drivers/net/dsa/lan9303-core.c +++ b/drivers/net/dsa/lan9303-core.c @@ -863,7 +863,7 @@ static int lan9303_port_bridge_join(struct dsa_switch *ds, int port, struct lan9303 *chip = ds->priv; dev_dbg(chip->dev, "%s(port %d)\n", __func__, port); - if (ds->ports[1].bridge_dev == ds->ports[2].bridge_dev) { + if (dsa_to_port(ds, 1)->bridge_dev == dsa_to_port(ds, 2)->bridge_dev) { lan9303_bridge_ports(chip); chip->is_bridged = true; /* unleash stp_state_set() */ } diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index fea2e665d0cb..21431be2831e 100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -782,7 +782,7 @@ mt7530_port_bridge_join(struct dsa_switch *ds, int port, * and not being setup until the port becomes enabled. */ if (ds->enabled_port_mask & BIT(i) && i != port) { - if (ds->ports[i].bridge_dev != bridge) + if (dsa_to_port(ds, i)->bridge_dev != bridge) continue; if (priv->ports[i].enable) mt7530_set(priv, MT7530_PCR_P(i), @@ -819,7 +819,7 @@ mt7530_port_bridge_leave(struct dsa_switch *ds, int port, * is kept and not being setup until the port becomes enabled.
*/ if (ds->enabled_port_mask & BIT(i) && i != port) { - if (ds->ports[i].bridge_dev != bridge) + if (dsa_to_port(ds, i)->bridge_dev != bridge) continue; if (priv->ports[i].enable) mt7530_clear(priv, MT7530_PCR_P(i), diff --git a/drivers/net/dsa/mv88e6060.c b/drivers/net/dsa/mv88e6060.c index 6173be889d95..f78b9e13be1c 100644 --- a/drivers/net/dsa/mv88e6060.c +++ b/drivers/net/dsa/mv88e6060.c @@ -177,7 +177,7 @@ static int mv88e6060_setup_port(struct dsa_switch *ds, int p) ((p & 0xf) << PORT_VLAN_MAP_DBNUM_SHIFT) | (dsa_is_cpu_port(ds, p) ? ds->enabled_port_mask : - BIT(ds->ports[p].cpu_dp->index))); + BIT(dsa_to_port(ds, p)->cpu_dp->index))); /* Port Association Vector: when learning source addresses * of packets, add the address to the address database using diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index 88b47fa09b41..677d6902807e 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -851,7 +851,7 @@ static u16 mv88e6xxx_port_vlan(struct mv88e6xxx_chip *chip, int dev, int port) for (i = 0; i < mv88e6xxx_num_ports(chip); ++i) if (dsa_is_cpu_port(chip->ds, i) || dsa_is_dsa_port(chip->ds, i) || - (br && chip->ds->ports[i].bridge_dev == br)) + (br && dsa_to_port(chip->ds, i)->bridge_dev == br)) pvlan |= BIT(i); return pvlan; @@ -1144,16 +1144,16 @@ static int mv88e6xxx_port_check_hw_vlan(struct dsa_switch *ds, int port, MV88E6XXX_G1_VTU_DATA_MEMBER_TAG_NON_MEMBER) continue; - if (ds->ports[i].bridge_dev == + if (dsa_to_port(ds, i)->bridge_dev == ds->ports[port].bridge_dev) break; /* same bridge, check next VLAN */ - if (!ds->ports[i].bridge_dev) + if (!dsa_to_port(ds, i)->bridge_dev) continue; dev_err(ds->dev, "p%d: hw VLAN %d already used by %s\n", port, vlan.vid, - netdev_name(ds->ports[i].bridge_dev)); + netdev_name(dsa_to_port(ds, i)->bridge_dev)); err = -EOPNOTSUPP; goto unlock; } diff --git a/drivers/net/dsa/qca8k.c b/drivers/net/dsa/qca8k.c index 82f09711ac1a..d1b0b1fb632f 100644 --- a/drivers/net/dsa/qca8k.c +++ b/drivers/net/dsa/qca8k.c @@ -700,7 +700,7 @@ qca8k_port_bridge_join(struct dsa_switch *ds, int port, struct net_device *br) int i; for (i = 1; i < QCA8K_NUM_PORTS; i++) { - if (ds->ports[i].bridge_dev != br) + if (dsa_to_port(ds, i)->bridge_dev != br) continue; /* Add this port to the portvlan mask of the other ports * in the bridge @@ -725,7 +725,7 @@ qca8k_port_bridge_leave(struct dsa_switch *ds, int port, struct net_device *br) int i; for (i = 1; i < QCA8K_NUM_PORTS; i++) { - if (ds->ports[i].bridge_dev != br) + if (dsa_to_port(ds, i)->bridge_dev != br) continue; /* Remove this port to the portvlan mask of the other ports * in the bridge diff --git a/include/net/dsa.h b/include/net/dsa.h index 6ed1a17ed1bd..38961ef91d3d 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -269,6 +269,11 @@ static inline bool dsa_is_normal_port(struct dsa_switch *ds, int p) return !dsa_is_cpu_port(ds, p) && !dsa_is_dsa_port(ds, p); } +static inline const struct dsa_port *dsa_to_port(struct dsa_switch *ds, int p) +{ + return &ds->ports[p]; +} + static inline u8 dsa_upstream_port(struct dsa_switch *ds) { struct dsa_switch_tree *dst = ds->dst; -- cgit v1.2.3 From eb4ddaf474285a4c6986f4a1c3205bdb0bed2da9 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 16 Oct 2017 17:28:45 -0700 Subject: net/decnet: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer 
explicitly. Cc: "David S. Miller" Cc: Johannes Berg Cc: David Ahern Cc: linux-decnet-user@lists.sourceforge.net Cc: netdev@vger.kernel.org Signed-off-by: Kees Cook Signed-off-by: David S. Miller --- include/net/dn.h | 7 ------- include/net/dn_nsp.h | 1 - net/decnet/af_decnet.c | 4 ---- net/decnet/dn_dev.c | 12 +++++------- net/decnet/dn_nsp_out.c | 11 ----------- 5 files changed, 5 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/include/net/dn.h b/include/net/dn.h index 913b73d239f5..4394f7d5cfe8 100644 --- a/include/net/dn.h +++ b/include/net/dn.h @@ -122,13 +122,6 @@ struct dn_scp /* Session Control Port */ unsigned long keepalive; void (*keepalive_fxn)(struct sock *sk); - /* - * This stuff is for the fast timer for delayed acks - */ - struct timer_list delack_timer; - int delack_pending; - void (*delack_fxn)(struct sock *sk); - }; static inline struct dn_scp *DN_SK(struct sock *sk) diff --git a/include/net/dn_nsp.h b/include/net/dn_nsp.h index 3a3e33d18456..413a15e5339c 100644 --- a/include/net/dn_nsp.h +++ b/include/net/dn_nsp.h @@ -17,7 +17,6 @@ void dn_nsp_send_data_ack(struct sock *sk); void dn_nsp_send_oth_ack(struct sock *sk); -void dn_nsp_delayed_ack(struct sock *sk); void dn_send_conn_ack(struct sock *sk); void dn_send_conn_conf(struct sock *sk, gfp_t gfp); void dn_nsp_send_disc(struct sock *sk, unsigned char type, diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index 73a0399dc7a2..d4c9a8bbad3e 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -533,10 +533,6 @@ static struct sock *dn_alloc_sock(struct net *net, struct socket *sock, gfp_t gf scp->keepalive = 10 * HZ; scp->keepalive_fxn = dn_keepalive; - init_timer(&scp->delack_timer); - scp->delack_pending = 0; - scp->delack_fxn = dn_nsp_delayed_ack; - dn_start_slow_timer(sk); out: return sk; diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c index 4d339de56862..92dbaa3f1eae 100644 --- a/net/decnet/dn_dev.c +++ b/net/decnet/dn_dev.c @@ -1038,14 +1038,14 @@ static void dn_eth_down(struct net_device *dev) static void dn_dev_set_timer(struct net_device *dev); -static void dn_dev_timer_func(unsigned long arg) +static void dn_dev_timer_func(struct timer_list *t) { - struct net_device *dev = (struct net_device *)arg; - struct dn_dev *dn_db; + struct dn_dev *dn_db = from_timer(dn_db, t, timer); + struct net_device *dev; struct dn_ifaddr *ifa; rcu_read_lock(); - dn_db = rcu_dereference(dev->dn_ptr); + dev = dn_db->dev; if (dn_db->t3 <= dn_db->parms.t2) { if (dn_db->parms.timer3) { for (ifa = rcu_dereference(dn_db->ifa_list); @@ -1070,8 +1070,6 @@ static void dn_dev_set_timer(struct net_device *dev) if (dn_db->parms.t2 > dn_db->parms.t3) dn_db->parms.t2 = dn_db->parms.t3; - dn_db->timer.data = (unsigned long)dev; - dn_db->timer.function = dn_dev_timer_func; dn_db->timer.expires = jiffies + (dn_db->parms.t2 * HZ); add_timer(&dn_db->timer); @@ -1100,7 +1098,7 @@ static struct dn_dev *dn_dev_create(struct net_device *dev, int *err) rcu_assign_pointer(dev->dn_ptr, dn_db); dn_db->dev = dev; - init_timer(&dn_db->timer); + timer_setup(&dn_db->timer, dn_dev_timer_func, 0); dn_db->uptime = jiffies; diff --git a/net/decnet/dn_nsp_out.c b/net/decnet/dn_nsp_out.c index 66f035e476ea..e50a4adfcf7e 100644 --- a/net/decnet/dn_nsp_out.c +++ b/net/decnet/dn_nsp_out.c @@ -491,17 +491,6 @@ void dn_send_conn_ack (struct sock *sk) dn_nsp_send(skb); } -void dn_nsp_delayed_ack(struct sock *sk) -{ - struct dn_scp *scp = DN_SK(sk); - - if (scp->ackxmt_oth != scp->numoth_rcv) - dn_nsp_send_oth_ack(sk); - - if 
(scp->ackxmt_dat != scp->numdat_rcv) - dn_nsp_send_data_ack(sk); -} - static int dn_nsp_retrans_conn_conf(struct sock *sk) { struct dn_scp *scp = DN_SK(sk); -- cgit v1.2.3 From 59f379f9046a9e0532ffd19b44e3c32fe79ec51b Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 16 Oct 2017 17:29:19 -0700 Subject: inet/connection_sock: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Cc: "David S. Miller" Cc: Gerrit Renker Cc: Alexey Kuznetsov Cc: Hideaki YOSHIFUJI Cc: netdev@vger.kernel.org Cc: dccp@vger.kernel.org Signed-off-by: Kees Cook Signed-off-by: David S. Miller --- include/net/inet_connection_sock.h | 6 +++--- net/dccp/timer.c | 18 ++++++++++-------- net/ipv4/inet_connection_sock.c | 21 +++++++++------------ net/ipv4/tcp_timer.c | 18 +++++++++++------- 4 files changed, 33 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 13e4c89a8231..0358745ea059 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -169,9 +169,9 @@ enum inet_csk_ack_state_t { }; void inet_csk_init_xmit_timers(struct sock *sk, - void (*retransmit_handler)(unsigned long), - void (*delack_handler)(unsigned long), - void (*keepalive_handler)(unsigned long)); + void (*retransmit_handler)(struct timer_list *), + void (*delack_handler)(struct timer_list *), + void (*keepalive_handler)(struct timer_list *)); void inet_csk_clear_xmit_timers(struct sock *sk); static inline void inet_csk_schedule_ack(struct sock *sk) diff --git a/net/dccp/timer.c b/net/dccp/timer.c index 3a2c34027758..1e35526bf436 100644 --- a/net/dccp/timer.c +++ b/net/dccp/timer.c @@ -125,10 +125,11 @@ static void dccp_retransmit_timer(struct sock *sk) __sk_dst_reset(sk); } -static void dccp_write_timer(unsigned long data) +static void dccp_write_timer(struct timer_list *t) { - struct sock *sk = (struct sock *)data; - struct inet_connection_sock *icsk = inet_csk(sk); + struct inet_connection_sock *icsk = + from_timer(icsk, t, icsk_retransmit_timer); + struct sock *sk = &icsk->icsk_inet.sk; int event = 0; bh_lock_sock(sk); @@ -161,19 +162,20 @@ out: sock_put(sk); } -static void dccp_keepalive_timer(unsigned long data) +static void dccp_keepalive_timer(struct timer_list *t) { - struct sock *sk = (struct sock *)data; + struct sock *sk = from_timer(sk, t, sk_timer); pr_err("dccp should not use a keepalive timer !\n"); sock_put(sk); } /* This is the same as tcp_delack_timer, sans prequeue & mem_reclaim stuff */ -static void dccp_delack_timer(unsigned long data) +static void dccp_delack_timer(struct timer_list *t) { - struct sock *sk = (struct sock *)data; - struct inet_connection_sock *icsk = inet_csk(sk); + struct inet_connection_sock *icsk = + from_timer(icsk, t, icsk_delack_timer); + struct sock *sk = &icsk->icsk_inet.sk; bh_lock_sock(sk); if (sock_owned_by_user(sk)) { diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 8a91ebbf0c01..5c965ecc96a0 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -494,17 +494,15 @@ EXPORT_SYMBOL(inet_csk_accept); * to optimize. 
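 * The conversion pattern, sketched here with hypothetical names
 * (from_timer() is a container_of() wrapper keyed on the name of the
 * embedded timer_list member):
 *
 *	static void my_handler(struct timer_list *t)
 *	{
 *		struct my_obj *obj = from_timer(obj, t, my_timer);
 *		...
 *	}
 *
 *	timer_setup(&obj->my_timer, my_handler, 0);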
*/ void inet_csk_init_xmit_timers(struct sock *sk, - void (*retransmit_handler)(unsigned long), - void (*delack_handler)(unsigned long), - void (*keepalive_handler)(unsigned long)) + void (*retransmit_handler)(struct timer_list *t), + void (*delack_handler)(struct timer_list *t), + void (*keepalive_handler)(struct timer_list *t)) { struct inet_connection_sock *icsk = inet_csk(sk); - setup_timer(&icsk->icsk_retransmit_timer, retransmit_handler, - (unsigned long)sk); - setup_timer(&icsk->icsk_delack_timer, delack_handler, - (unsigned long)sk); - setup_timer(&sk->sk_timer, keepalive_handler, (unsigned long)sk); + timer_setup(&icsk->icsk_retransmit_timer, retransmit_handler, 0); + timer_setup(&icsk->icsk_delack_timer, delack_handler, 0); + timer_setup(&sk->sk_timer, keepalive_handler, 0); icsk->icsk_pending = icsk->icsk_ack.pending = 0; } EXPORT_SYMBOL(inet_csk_init_xmit_timers); @@ -676,9 +674,9 @@ void inet_csk_reqsk_queue_drop_and_put(struct sock *sk, struct request_sock *req } EXPORT_SYMBOL(inet_csk_reqsk_queue_drop_and_put); -static void reqsk_timer_handler(unsigned long data) +static void reqsk_timer_handler(struct timer_list *t) { - struct request_sock *req = (struct request_sock *)data; + struct request_sock *req = from_timer(req, t, rsk_timer); struct sock *sk_listener = req->rsk_listener; struct net *net = sock_net(sk_listener); struct inet_connection_sock *icsk = inet_csk(sk_listener); @@ -749,8 +747,7 @@ static void reqsk_queue_hash_req(struct request_sock *req, req->num_timeout = 0; req->sk = NULL; - setup_pinned_timer(&req->rsk_timer, reqsk_timer_handler, - (unsigned long)req); + timer_setup(&req->rsk_timer, reqsk_timer_handler, TIMER_PINNED); mod_timer(&req->rsk_timer, jiffies + timeout); inet_ehash_insert(req_to_sk(req), NULL); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 7014cc00c74c..804a8d34ce86 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -288,15 +288,17 @@ out: * * Returns: Nothing (void) */ -static void tcp_delack_timer(unsigned long data) +static void tcp_delack_timer(struct timer_list *t) { - struct sock *sk = (struct sock *)data; + struct inet_connection_sock *icsk = + from_timer(icsk, t, icsk_delack_timer); + struct sock *sk = &icsk->icsk_inet.sk; bh_lock_sock(sk); if (!sock_owned_by_user(sk)) { tcp_delack_timer_handler(sk); } else { - inet_csk(sk)->icsk_ack.blocked = 1; + icsk->icsk_ack.blocked = 1; __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED); /* deleguate our work to tcp_release_cb() */ if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &sk->sk_tsq_flags)) @@ -576,9 +578,11 @@ out: sk_mem_reclaim(sk); } -static void tcp_write_timer(unsigned long data) +static void tcp_write_timer(struct timer_list *t) { - struct sock *sk = (struct sock *)data; + struct inet_connection_sock *icsk = + from_timer(icsk, t, icsk_retransmit_timer); + struct sock *sk = &icsk->icsk_inet.sk; bh_lock_sock(sk); if (!sock_owned_by_user(sk)) { @@ -613,9 +617,9 @@ void tcp_set_keepalive(struct sock *sk, int val) EXPORT_SYMBOL_GPL(tcp_set_keepalive); -static void tcp_keepalive_timer (unsigned long data) +static void tcp_keepalive_timer (struct timer_list *t) { - struct sock *sk = (struct sock *) data; + struct sock *sk = from_timer(sk, t, sk_timer); struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); u32 elapsed; -- cgit v1.2.3 From 78802011fbe34331bdef6f2dfb1634011f0e4c32 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 16 Oct 2017 17:29:20 -0700 Subject: inet: frags: Convert timers to use timer_setup() In 
preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Cc: Alexander Aring Cc: Stefan Schmidt Cc: "David S. Miller" Cc: Alexey Kuznetsov Cc: Hideaki YOSHIFUJI Cc: Pablo Neira Ayuso Cc: Jozsef Kadlecsik Cc: Florian Westphal Cc: linux-wpan@vger.kernel.org Cc: netdev@vger.kernel.org Cc: netfilter-devel@vger.kernel.org Cc: coreteam@netfilter.org Signed-off-by: Kees Cook Acked-by: Stefan Schmidt # for ieee802154 Signed-off-by: David S. Miller --- include/net/inet_frag.h | 2 +- net/ieee802154/6lowpan/reassembly.c | 5 +++-- net/ipv4/inet_fragment.c | 4 ++-- net/ipv4/ip_fragment.c | 5 +++-- net/ipv6/netfilter/nf_conntrack_reasm.c | 5 +++-- net/ipv6/reassembly.c | 5 +++-- 6 files changed, 15 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index fc59e0775e00..c695807ca707 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -95,7 +95,7 @@ struct inet_frags { void (*constructor)(struct inet_frag_queue *q, const void *arg); void (*destructor)(struct inet_frag_queue *); - void (*frag_expire)(unsigned long data); + void (*frag_expire)(struct timer_list *t); struct kmem_cache *frags_cachep; const char *frags_cache_name; }; diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c index f85b08baff16..85bf86ad6b18 100644 --- a/net/ieee802154/6lowpan/reassembly.c +++ b/net/ieee802154/6lowpan/reassembly.c @@ -80,12 +80,13 @@ static void lowpan_frag_init(struct inet_frag_queue *q, const void *a) fq->daddr = *arg->dst; } -static void lowpan_frag_expire(unsigned long data) +static void lowpan_frag_expire(struct timer_list *t) { + struct inet_frag_queue *frag = from_timer(frag, t, timer); struct frag_queue *fq; struct net *net; - fq = container_of((struct inet_frag_queue *)data, struct frag_queue, q); + fq = container_of(frag, struct frag_queue, q); net = container_of(fq->q.net, struct net, ieee802154_lowpan.frags); spin_lock(&fq->q.lock); diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index af74d0433453..7f3ef5c287a1 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -147,7 +147,7 @@ inet_evict_bucket(struct inet_frags *f, struct inet_frag_bucket *hb) spin_unlock(&hb->chain_lock); hlist_for_each_entry_safe(fq, n, &expired, list_evictor) - f->frag_expire((unsigned long) fq); + f->frag_expire(&fq->timer); return evicted; } @@ -366,7 +366,7 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, f->constructor(q, arg); add_frag_mem_limit(nf, f->qsize); - setup_timer(&q->timer, f->frag_expire, (unsigned long)q); + timer_setup(&q->timer, f->frag_expire, 0); spin_lock_init(&q->lock); refcount_set(&q->refcnt, 1); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 46408c220d9d..9215654a401f 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -190,12 +190,13 @@ static bool frag_expire_skip_icmp(u32 user) /* * Oops, a fragment queue timed out. Kill it and send an ICMP reply. 
*/ -static void ip_expire(unsigned long arg) +static void ip_expire(struct timer_list *t) { + struct inet_frag_queue *frag = from_timer(frag, t, timer); struct ipq *qp; struct net *net; - qp = container_of((struct inet_frag_queue *) arg, struct ipq, q); + qp = container_of(frag, struct ipq, q); net = container_of(qp->q.net, struct net, ipv4.frags); rcu_read_lock(); diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index b263bf3a19f7..977d8900cfd1 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -169,12 +169,13 @@ static unsigned int nf_hashfn(const struct inet_frag_queue *q) return nf_hash_frag(nq->id, &nq->saddr, &nq->daddr); } -static void nf_ct_frag6_expire(unsigned long data) +static void nf_ct_frag6_expire(struct timer_list *t) { + struct inet_frag_queue *frag = from_timer(frag, t, timer); struct frag_queue *fq; struct net *net; - fq = container_of((struct inet_frag_queue *)data, struct frag_queue, q); + fq = container_of(frag, struct frag_queue, q); net = container_of(fq->q.net, struct net, nf_frag.frags); ip6_expire_frag_queue(net, fq, &nf_frags); diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 846012eae526..afbc000ad4f2 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -170,12 +170,13 @@ out: } EXPORT_SYMBOL(ip6_expire_frag_queue); -static void ip6_frag_expire(unsigned long data) +static void ip6_frag_expire(struct timer_list *t) { + struct inet_frag_queue *frag = from_timer(frag, t, timer); struct frag_queue *fq; struct net *net; - fq = container_of((struct inet_frag_queue *)data, struct frag_queue, q); + fq = container_of(frag, struct frag_queue, q); net = container_of(fq->q.net, struct net, ipv6.frags); ip6_expire_frag_queue(net, fq, &ip6_frags); -- cgit v1.2.3 From fb6ff75e18937a20dbec1eb47b5f893f38eabae4 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 16 Oct 2017 14:24:02 -0700 Subject: tcp: Use pI6c in tcp tracepoint The compact form for IPv6 addresses is more user friendly than the full version. For example: compact: 2001:db8:1::1 full: 2001:0db8:0001:0000:0000:0000:0000:0001 Update the tcp tracepoint to show the compact form. Signed-off-by: David Ahern Acked-by: Cong Wang Acked-by: Alexei Starovoitov Signed-off-by: David S.
Miller --- include/trace/events/tcp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index 3d1cbd072b7e..1ffab6d96e94 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -57,7 +57,7 @@ TRACE_EVENT(tcp_retransmit_skb, } ), - TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6 daddrv6=%pI6", + TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c", __entry->sport, __entry->dport, __entry->saddr, __entry->daddr, __entry->saddr_v6, __entry->daddr_v6) ); -- cgit v1.2.3 From 386fd5da401dc6c4b0ab6a54d333609876b699fe Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 16 Oct 2017 15:32:07 -0700 Subject: tcp: Check daddr_cache before use in tracepoint Running perf in one window to capture tcp_retransmit_skb tracepoint: $ perf record -e tcp:tcp_retransmit_skb -a And causing a retransmission on an active TCP session (e.g., dropping packets in the receiver, changing MTU on the interface to 500 and back to 1500) triggers a panic: [ 58.543144] BUG: unable to handle kernel NULL pointer dereference at 0000000000000008 [ 58.545300] IP: perf_trace_tcp_retransmit_skb+0xd0/0x145 [ 58.546770] PGD 0 P4D 0 [ 58.547472] Oops: 0000 [#1] SMP [ 58.548328] Modules linked in: vrf [ 58.549262] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.14.0-rc4+ #26 [ 58.551004] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.7.5-20140531_083030-gandalf 04/01/2014 [ 58.554560] task: ffffffff81a0e540 task.stack: ffffffff81a00000 [ 58.555817] RIP: 0010:perf_trace_tcp_retransmit_skb+0xd0/0x145 [ 58.557137] RSP: 0018:ffff88003fc03d68 EFLAGS: 00010282 [ 58.558292] RAX: 0000000000000000 RBX: ffffe8ffffc0ec80 RCX: ffff880038543098 [ 58.559850] RDX: 0400000000000000 RSI: ffff88003fc03d70 RDI: ffff88003fc14b68 [ 58.561099] RBP: ffff88003fc03da8 R08: 0000000000000000 R09: ffffea0000d3224a [ 58.562005] R10: ffff88003fc03db8 R11: 0000000000000010 R12: ffff8800385428c0 [ 58.562930] R13: ffffe8ffffc0e478 R14: ffffffff81a93a40 R15: ffff88003d4f0c00 [ 58.563845] FS: 0000000000000000(0000) GS:ffff88003fc00000(0000) knlGS:0000000000000000 [ 58.564873] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 58.565613] CR2: 0000000000000008 CR3: 000000003d68f004 CR4: 00000000000606f0 [ 58.566538] Call Trace: [ 58.566865] [ 58.567140] __tcp_retransmit_skb+0x4ab/0x4c6 [ 58.567704] ? tcp_set_ca_state+0x22/0x3f [ 58.568231] tcp_retransmit_skb+0x14/0xa3 [ 58.568754] tcp_retransmit_timer+0x472/0x5e3 [ 58.569324] ? tcp_write_timer_handler+0x1e9/0x1e9 [ 58.569946] tcp_write_timer_handler+0x95/0x1e9 [ 58.570548] tcp_write_timer+0x2a/0x58 Check that daddr_cache is non-NULL before de-referencing. Fixes: e086101b150a ("tcp: add a tracepoint for tcp retransmission") Signed-off-by: David Ahern Acked-by: Cong Wang Signed-off-by: David S. Miller --- include/trace/events/tcp.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index 1ffab6d96e94..f51c130f1e0f 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -27,7 +27,6 @@ TRACE_EVENT(tcp_retransmit_skb, ), TP_fast_assign( - struct ipv6_pinfo *np = inet6_sk(sk); struct inet_sock *inet = inet_sk(sk); struct in6_addr *pin6; __be32 *p32; @@ -44,11 +43,12 @@ TRACE_EVENT(tcp_retransmit_skb, p32 = (__be32 *) __entry->daddr; *p32 = inet->inet_daddr; - if (np) { + /* IPv6 socket ? 
*/ + if (inet6_sk(sk)) { pin6 = (struct in6_addr *)__entry->saddr_v6; - *pin6 = np->saddr; + *pin6 = sk->sk_v6_rcv_saddr; pin6 = (struct in6_addr *)__entry->daddr_v6; - *pin6 = *(np->daddr_cache); + *pin6 = sk->sk_v6_daddr; } else { pin6 = (struct in6_addr *)__entry->saddr_v6; ipv6_addr_set_v4mapped(inet->inet_saddr, pin6); -- cgit v1.2.3 From 7de16e3a35578f4f5accc6f5f23970310483d0a2 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 16 Oct 2017 16:40:53 -0700 Subject: bpf: split verifier and program ops struct bpf_verifier_ops contains both verifier ops and operations used later during program's lifetime (test_run). Split the runtime ops into a different structure. BPF_PROG_TYPE() will now append ## _prog_ops or ## _verifier_ops to the names. Signed-off-by: Jakub Kicinski Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 15 ++++++++++----- include/linux/bpf_types.h | 28 ++++++++++++++-------------- kernel/bpf/syscall.c | 16 +++++++++++++--- kernel/bpf/verifier.c | 12 ++++++------ kernel/trace/bpf_trace.c | 15 ++++++++++++--- net/core/filter.c | 45 ++++++++++++++++++++++++++++++++++++--------- 6 files changed, 91 insertions(+), 40 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6d4dd844828a..e1fba5504ca5 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -157,6 +157,11 @@ bpf_ctx_record_field_size(struct bpf_insn_access_aux *aux, u32 size) aux->ctx_field_size = size; } +struct bpf_prog_ops { + int (*test_run)(struct bpf_prog *prog, const union bpf_attr *kattr, + union bpf_attr __user *uattr); +}; + struct bpf_verifier_ops { /* return eBPF function prototype for verification */ const struct bpf_func_proto *(*get_func_proto)(enum bpf_func_id func_id); @@ -172,8 +177,6 @@ struct bpf_verifier_ops { const struct bpf_insn *src, struct bpf_insn *dst, struct bpf_prog *prog, u32 *target_size); - int (*test_run)(struct bpf_prog *prog, const union bpf_attr *kattr, - union bpf_attr __user *uattr); }; struct bpf_prog_aux { @@ -184,7 +187,8 @@ struct bpf_prog_aux { u32 id; struct latch_tree_node ksym_tnode; struct list_head ksym_lnode; - const struct bpf_verifier_ops *ops; + const struct bpf_prog_ops *ops; + const struct bpf_verifier_ops *vops; struct bpf_map **used_maps; struct bpf_prog *prog; struct user_struct *user; @@ -279,8 +283,9 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, #ifdef CONFIG_BPF_SYSCALL DECLARE_PER_CPU(int, bpf_prog_active); -#define BPF_PROG_TYPE(_id, _ops) \ - extern const struct bpf_verifier_ops _ops; +#define BPF_PROG_TYPE(_id, _name) \ + extern const struct bpf_prog_ops _name ## _prog_ops; \ + extern const struct bpf_verifier_ops _name ## _verifier_ops; #define BPF_MAP_TYPE(_id, _ops) \ extern const struct bpf_map_ops _ops; #include diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 814c1081a4a9..36418ad43245 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -1,22 +1,22 @@ /* internal file - do not include directly */ #ifdef CONFIG_NET -BPF_PROG_TYPE(BPF_PROG_TYPE_SOCKET_FILTER, sk_filter_prog_ops) -BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED_CLS, tc_cls_act_prog_ops) -BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED_ACT, tc_cls_act_prog_ops) -BPF_PROG_TYPE(BPF_PROG_TYPE_XDP, xdp_prog_ops) -BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SKB, cg_skb_prog_ops) -BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK, cg_sock_prog_ops) -BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_IN, lwt_inout_prog_ops) -BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, 
lwt_inout_prog_ops) -BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_XMIT, lwt_xmit_prog_ops) -BPF_PROG_TYPE(BPF_PROG_TYPE_SOCK_OPS, sock_ops_prog_ops) -BPF_PROG_TYPE(BPF_PROG_TYPE_SK_SKB, sk_skb_prog_ops) +BPF_PROG_TYPE(BPF_PROG_TYPE_SOCKET_FILTER, sk_filter) +BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED_CLS, tc_cls_act) +BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED_ACT, tc_cls_act) +BPF_PROG_TYPE(BPF_PROG_TYPE_XDP, xdp) +BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SKB, cg_skb) +BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK, cg_sock) +BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_IN, lwt_inout) +BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_inout) +BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_XMIT, lwt_xmit) +BPF_PROG_TYPE(BPF_PROG_TYPE_SOCK_OPS, sock_ops) +BPF_PROG_TYPE(BPF_PROG_TYPE_SK_SKB, sk_skb) #endif #ifdef CONFIG_BPF_EVENTS -BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe_prog_ops) -BPF_PROG_TYPE(BPF_PROG_TYPE_TRACEPOINT, tracepoint_prog_ops) -BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event_prog_ops) +BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe) +BPF_PROG_TYPE(BPF_PROG_TYPE_TRACEPOINT, tracepoint) +BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event) #endif BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 54fba06942f5..444902b5a30d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -739,9 +739,18 @@ err_put: return err; } -static const struct bpf_verifier_ops * const bpf_prog_types[] = { -#define BPF_PROG_TYPE(_id, _ops) \ - [_id] = &_ops, +static const struct bpf_prog_ops * const bpf_prog_types[] = { +#define BPF_PROG_TYPE(_id, _name) \ + [_id] = & _name ## _prog_ops, +#define BPF_MAP_TYPE(_id, _ops) +#include +#undef BPF_PROG_TYPE +#undef BPF_MAP_TYPE +}; + +static const struct bpf_verifier_ops * const bpf_verifier_ops[] = { +#define BPF_PROG_TYPE(_id, _name) \ + [_id] = & _name ## _verifier_ops, #define BPF_MAP_TYPE(_id, _ops) #include #undef BPF_PROG_TYPE @@ -754,6 +763,7 @@ static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) return -EINVAL; prog->aux->ops = bpf_prog_types[type]; + prog->aux->vops = bpf_verifier_ops[type]; prog->type = type; return 0; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e4d5136725a2..38e24d69fc95 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -856,8 +856,8 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, *reg_type = info.reg_type; return 0; } - } else if (env->prog->aux->ops->is_valid_access && - env->prog->aux->ops->is_valid_access(off, size, t, &info)) { + } else if (env->prog->aux->vops->is_valid_access && + env->prog->aux->vops->is_valid_access(off, size, t, &info)) { /* A non zero info.ctx_field_size indicates that this field is a * candidate for later verifier transformation to load the whole * field and then apply a mask when accessed with a narrower @@ -1565,8 +1565,8 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) return -EINVAL; } - if (env->prog->aux->ops->get_func_proto) - fn = env->prog->aux->ops->get_func_proto(func_id); + if (env->prog->aux->vops->get_func_proto) + fn = env->prog->aux->vops->get_func_proto(func_id); if (!fn) { verbose(env, "unknown func %s#%d\n", func_id_name(func_id), @@ -4035,7 +4035,7 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of */ static int convert_ctx_accesses(struct bpf_verifier_env *env) { - const struct bpf_verifier_ops *ops = env->prog->aux->ops; + const struct bpf_verifier_ops *ops = env->prog->aux->vops; int i, cnt, size, ctx_field_size, delta = 0; const 
int insn_cnt = env->prog->len; struct bpf_insn insn_buf[16], *insn; @@ -4236,7 +4236,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) insn = new_prog->insnsi + i + delta; } patch_call_imm: - fn = prog->aux->ops->get_func_proto(insn->imm); + fn = prog->aux->vops->get_func_proto(insn->imm); /* all functions that have prototype and verifier allowed * programs to call them, must be real in-kernel functions */ diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 04ea5314f2bc..3126da2f468a 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -561,11 +561,14 @@ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type return true; } -const struct bpf_verifier_ops kprobe_prog_ops = { +const struct bpf_verifier_ops kprobe_verifier_ops = { .get_func_proto = kprobe_prog_func_proto, .is_valid_access = kprobe_prog_is_valid_access, }; +const struct bpf_prog_ops kprobe_prog_ops = { +}; + BPF_CALL_5(bpf_perf_event_output_tp, void *, tp_buff, struct bpf_map *, map, u64, flags, void *, data, u64, size) { @@ -667,11 +670,14 @@ static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type return true; } -const struct bpf_verifier_ops tracepoint_prog_ops = { +const struct bpf_verifier_ops tracepoint_verifier_ops = { .get_func_proto = tp_prog_func_proto, .is_valid_access = tp_prog_is_valid_access, }; +const struct bpf_prog_ops tracepoint_prog_ops = { +}; + static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { @@ -727,8 +733,11 @@ static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, return insn - insn_buf; } -const struct bpf_verifier_ops perf_event_prog_ops = { +const struct bpf_verifier_ops perf_event_verifier_ops = { .get_func_proto = tp_prog_func_proto, .is_valid_access = pe_prog_is_valid_access, .convert_ctx_access = pe_prog_convert_ctx_access, }; + +const struct bpf_prog_ops perf_event_prog_ops = { +}; diff --git a/net/core/filter.c b/net/core/filter.c index 4d88e0665c41..1dd3034f846f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4395,68 +4395,95 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, return insn - insn_buf; } -const struct bpf_verifier_ops sk_filter_prog_ops = { +const struct bpf_verifier_ops sk_filter_verifier_ops = { .get_func_proto = sk_filter_func_proto, .is_valid_access = sk_filter_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, }; -const struct bpf_verifier_ops tc_cls_act_prog_ops = { +const struct bpf_prog_ops sk_filter_prog_ops = { +}; + +const struct bpf_verifier_ops tc_cls_act_verifier_ops = { .get_func_proto = tc_cls_act_func_proto, .is_valid_access = tc_cls_act_is_valid_access, .convert_ctx_access = tc_cls_act_convert_ctx_access, .gen_prologue = tc_cls_act_prologue, +}; + +const struct bpf_prog_ops tc_cls_act_prog_ops = { .test_run = bpf_prog_test_run_skb, }; -const struct bpf_verifier_ops xdp_prog_ops = { +const struct bpf_verifier_ops xdp_verifier_ops = { .get_func_proto = xdp_func_proto, .is_valid_access = xdp_is_valid_access, .convert_ctx_access = xdp_convert_ctx_access, +}; + +const struct bpf_prog_ops xdp_prog_ops = { .test_run = bpf_prog_test_run_xdp, }; -const struct bpf_verifier_ops cg_skb_prog_ops = { +const struct bpf_verifier_ops cg_skb_verifier_ops = { .get_func_proto = sk_filter_func_proto, .is_valid_access = sk_filter_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, +}; + +const struct bpf_prog_ops cg_skb_prog_ops = { .test_run = 
bpf_prog_test_run_skb, }; -const struct bpf_verifier_ops lwt_inout_prog_ops = { +const struct bpf_verifier_ops lwt_inout_verifier_ops = { .get_func_proto = lwt_inout_func_proto, .is_valid_access = lwt_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, +}; + +const struct bpf_prog_ops lwt_inout_prog_ops = { .test_run = bpf_prog_test_run_skb, }; -const struct bpf_verifier_ops lwt_xmit_prog_ops = { +const struct bpf_verifier_ops lwt_xmit_verifier_ops = { .get_func_proto = lwt_xmit_func_proto, .is_valid_access = lwt_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, .gen_prologue = tc_cls_act_prologue, +}; + +const struct bpf_prog_ops lwt_xmit_prog_ops = { .test_run = bpf_prog_test_run_skb, }; -const struct bpf_verifier_ops cg_sock_prog_ops = { +const struct bpf_verifier_ops cg_sock_verifier_ops = { .get_func_proto = sock_filter_func_proto, .is_valid_access = sock_filter_is_valid_access, .convert_ctx_access = sock_filter_convert_ctx_access, }; -const struct bpf_verifier_ops sock_ops_prog_ops = { +const struct bpf_prog_ops cg_sock_prog_ops = { +}; + +const struct bpf_verifier_ops sock_ops_verifier_ops = { .get_func_proto = sock_ops_func_proto, .is_valid_access = sock_ops_is_valid_access, .convert_ctx_access = sock_ops_convert_ctx_access, }; -const struct bpf_verifier_ops sk_skb_prog_ops = { +const struct bpf_prog_ops sock_ops_prog_ops = { +}; + +const struct bpf_verifier_ops sk_skb_verifier_ops = { .get_func_proto = sk_skb_func_proto, .is_valid_access = sk_skb_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, .gen_prologue = sk_skb_prologue, }; +const struct bpf_prog_ops sk_skb_prog_ops = { +}; + int sk_detach_filter(struct sock *sk) { int ret = -ENOENT; -- cgit v1.2.3 From 00176a34d9e27ab1e77db75fe13abc005cffe0ca Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 16 Oct 2017 16:40:54 -0700 Subject: bpf: remove the verifier ops from program structure Since the verifier ops don't have to be associated with the program for its entire lifetime we can move it to verifier's struct bpf_verifier_env. Signed-off-by: Jakub Kicinski Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- include/linux/bpf.h | 1 - include/linux/bpf_verifier.h | 1 + kernel/bpf/syscall.c | 10 ---------- kernel/bpf/verifier.c | 23 +++++++++++++++++------ 4 files changed, 18 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e1fba5504ca5..cf91977e8719 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -188,7 +188,6 @@ struct bpf_prog_aux { struct latch_tree_node ksym_tnode; struct list_head ksym_lnode; const struct bpf_prog_ops *ops; - const struct bpf_verifier_ops *vops; struct bpf_map **used_maps; struct bpf_prog *prog; struct user_struct *user; diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index f00ef751c1c5..feeaea93d959 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -141,6 +141,7 @@ struct bpf_ext_analyzer_ops { */ struct bpf_verifier_env { struct bpf_prog *prog; /* eBPF program being verified */ + const struct bpf_verifier_ops *ops; struct bpf_verifier_stack_elem *head; /* stack of verifier states to be processed */ int stack_size; /* number of states to be processed */ bool strict_alignment; /* perform strict pointer alignment checks */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 444902b5a30d..0e893cac6795 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -748,22 +748,12 @@ static const struct bpf_prog_ops * const bpf_prog_types[] = { #undef BPF_MAP_TYPE }; -static const struct bpf_verifier_ops * const bpf_verifier_ops[] = { -#define BPF_PROG_TYPE(_id, _name) \ - [_id] = & _name ## _verifier_ops, -#define BPF_MAP_TYPE(_id, _ops) -#include -#undef BPF_PROG_TYPE -#undef BPF_MAP_TYPE -}; - static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) { if (type >= ARRAY_SIZE(bpf_prog_types) || !bpf_prog_types[type]) return -EINVAL; prog->aux->ops = bpf_prog_types[type]; - prog->aux->vops = bpf_verifier_ops[type]; prog->type = type; return 0; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 38e24d69fc95..3b6e2c550e96 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -23,6 +23,15 @@ #include "disasm.h" +static const struct bpf_verifier_ops * const bpf_verifier_ops[] = { +#define BPF_PROG_TYPE(_id, _name) \ + [_id] = & _name ## _verifier_ops, +#define BPF_MAP_TYPE(_id, _ops) +#include +#undef BPF_PROG_TYPE +#undef BPF_MAP_TYPE +}; + /* bpf_check() is a static code analyzer that walks eBPF program * instruction by instruction and updates register/stack state. * All paths of conditional branches are analyzed until 'bpf_exit' insn. 
@@ -856,8 +865,8 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, *reg_type = info.reg_type; return 0; } - } else if (env->prog->aux->vops->is_valid_access && - env->prog->aux->vops->is_valid_access(off, size, t, &info)) { + } else if (env->ops->is_valid_access && + env->ops->is_valid_access(off, size, t, &info)) { /* A non zero info.ctx_field_size indicates that this field is a * candidate for later verifier transformation to load the whole * field and then apply a mask when accessed with a narrower @@ -1565,8 +1574,8 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) return -EINVAL; } - if (env->prog->aux->vops->get_func_proto) - fn = env->prog->aux->vops->get_func_proto(func_id); + if (env->ops->get_func_proto) + fn = env->ops->get_func_proto(func_id); if (!fn) { verbose(env, "unknown func %s#%d\n", func_id_name(func_id), @@ -4035,7 +4044,7 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of */ static int convert_ctx_accesses(struct bpf_verifier_env *env) { - const struct bpf_verifier_ops *ops = env->prog->aux->vops; + const struct bpf_verifier_ops *ops = env->ops; int i, cnt, size, ctx_field_size, delta = 0; const int insn_cnt = env->prog->len; struct bpf_insn insn_buf[16], *insn; @@ -4236,7 +4245,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) insn = new_prog->insnsi + i + delta; } patch_call_imm: - fn = prog->aux->vops->get_func_proto(insn->imm); + fn = env->ops->get_func_proto(insn->imm); /* all functions that have prototype and verifier allowed * programs to call them, must be real in-kernel functions */ @@ -4294,6 +4303,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) if (!env->insn_aux_data) goto err_free_env; env->prog = *prog; + env->ops = bpf_verifier_ops[env->prog->type]; /* grab the mutex to protect few globals used by verifier */ mutex_lock(&bpf_verifier_lock); @@ -4406,6 +4416,7 @@ int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, if (!env->insn_aux_data) goto err_free_env; env->prog = prog; + env->ops = bpf_verifier_ops[env->prog->type]; env->analyzer_ops = ops; env->analyzer_priv = priv; -- cgit v1.2.3 From 4f9218aaf8a463f76cac40aa08d859d065f8cc9e Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 16 Oct 2017 16:40:55 -0700 Subject: bpf: move knowledge about post-translation offsets out of verifier Use the fact that verifier ops are now separate from program ops to define a separate set of callbacks for verification of already translated programs. Since we expect the analyzer ops to be defined only for a small subset of all program types initialize their array by hand (don't use linux/bpf_types.h). Signed-off-by: Jakub Kicinski Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- include/linux/bpf.h | 3 +++ kernel/bpf/verifier.c | 55 +++++++++++++++------------------------------------ net/core/filter.c | 40 +++++++++++++++++++++++++++++++++++++ 3 files changed, 59 insertions(+), 39 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index cf91977e8719..d67ccdc0099f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -291,6 +291,9 @@ DECLARE_PER_CPU(int, bpf_prog_active); #undef BPF_PROG_TYPE #undef BPF_MAP_TYPE +extern const struct bpf_verifier_ops tc_cls_act_analyzer_ops; +extern const struct bpf_verifier_ops xdp_analyzer_ops; + struct bpf_prog *bpf_prog_get(u32 ufd); struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type); struct bpf_prog * __must_check bpf_prog_add(struct bpf_prog *prog, int i); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3b6e2c550e96..545b8c45a578 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -822,36 +822,6 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, return err; } -static bool analyzer_is_valid_access(struct bpf_verifier_env *env, int off, - struct bpf_insn_access_aux *info) -{ - switch (env->prog->type) { - case BPF_PROG_TYPE_XDP: - switch (off) { - case offsetof(struct xdp_buff, data): - info->reg_type = PTR_TO_PACKET; - return true; - case offsetof(struct xdp_buff, data_end): - info->reg_type = PTR_TO_PACKET_END; - return true; - } - return false; - case BPF_PROG_TYPE_SCHED_CLS: - switch (off) { - case offsetof(struct sk_buff, data): - info->reg_type = PTR_TO_PACKET; - return true; - case offsetof(struct sk_buff, cb) + - offsetof(struct bpf_skb_data_end, data_end): - info->reg_type = PTR_TO_PACKET_END; - return true; - } - return false; - default: - return false; - } -} - /* check access to 'struct bpf_context' fields. Supports fixed offsets only */ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size, enum bpf_access_type t, enum bpf_reg_type *reg_type) @@ -860,13 +830,8 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, .reg_type = *reg_type, }; - if (env->analyzer_ops) { - if (analyzer_is_valid_access(env, off, &info)) { - *reg_type = info.reg_type; - return 0; - } - } else if (env->ops->is_valid_access && - env->ops->is_valid_access(off, size, t, &info)) { + if (env->ops->is_valid_access && + env->ops->is_valid_access(off, size, t, &info)) { /* A non zero info.ctx_field_size indicates that this field is a * candidate for later verifier transformation to load the whole * field and then apply a mask when accessed with a narrower @@ -874,9 +839,12 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, * will only allow for whole field access and rejects any other * type of narrower access. 
*/ - env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size; *reg_type = info.reg_type; + if (env->analyzer_ops) + return 0; + + env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size; /* remember the offset of last byte accessed in ctx */ if (env->prog->aux->max_ctx_offset < off + size) env->prog->aux->max_ctx_offset = off + size; @@ -4400,12 +4368,21 @@ err_free_env: return ret; } +static const struct bpf_verifier_ops * const bpf_analyzer_ops[] = { + [BPF_PROG_TYPE_XDP] = &xdp_analyzer_ops, + [BPF_PROG_TYPE_SCHED_CLS] = &tc_cls_act_analyzer_ops, +}; + int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, void *priv) { struct bpf_verifier_env *env; int ret; + if (prog->type >= ARRAY_SIZE(bpf_analyzer_ops) || + !bpf_analyzer_ops[prog->type]) + return -EOPNOTSUPP; + env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL); if (!env) return -ENOMEM; @@ -4416,7 +4393,7 @@ int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, if (!env->insn_aux_data) goto err_free_env; env->prog = prog; - env->ops = bpf_verifier_ops[env->prog->type]; + env->ops = bpf_analyzer_ops[env->prog->type]; env->analyzer_ops = ops; env->analyzer_priv = priv; diff --git a/net/core/filter.c b/net/core/filter.c index 1dd3034f846f..7373a08fbef7 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3732,6 +3732,23 @@ static bool tc_cls_act_is_valid_access(int off, int size, return bpf_skb_is_valid_access(off, size, type, info); } +static bool +tc_cls_act_is_valid_access_analyzer(int off, int size, + enum bpf_access_type type, + struct bpf_insn_access_aux *info) +{ + switch (off) { + case offsetof(struct sk_buff, data): + info->reg_type = PTR_TO_PACKET; + return true; + case offsetof(struct sk_buff, cb) + + offsetof(struct bpf_skb_data_end, data_end): + info->reg_type = PTR_TO_PACKET_END; + return true; + } + return false; +} + static bool __is_valid_xdp_access(int off, int size) { if (off < 0 || off >= sizeof(struct xdp_md)) @@ -3766,6 +3783,21 @@ static bool xdp_is_valid_access(int off, int size, return __is_valid_xdp_access(off, size); } +static bool xdp_is_valid_access_analyzer(int off, int size, + enum bpf_access_type type, + struct bpf_insn_access_aux *info) +{ + switch (off) { + case offsetof(struct xdp_buff, data): + info->reg_type = PTR_TO_PACKET; + return true; + case offsetof(struct xdp_buff, data_end): + info->reg_type = PTR_TO_PACKET_END; + return true; + } + return false; +} + void bpf_warn_invalid_xdp_action(u32 act) { const u32 act_max = XDP_REDIRECT; @@ -4411,6 +4443,10 @@ const struct bpf_verifier_ops tc_cls_act_verifier_ops = { .gen_prologue = tc_cls_act_prologue, }; +const struct bpf_verifier_ops tc_cls_act_analyzer_ops = { + .is_valid_access = tc_cls_act_is_valid_access_analyzer, +}; + const struct bpf_prog_ops tc_cls_act_prog_ops = { .test_run = bpf_prog_test_run_skb, }; @@ -4421,6 +4457,10 @@ const struct bpf_verifier_ops xdp_verifier_ops = { .convert_ctx_access = xdp_convert_ctx_access, }; +const struct bpf_verifier_ops xdp_analyzer_ops = { + .is_valid_access = xdp_is_valid_access_analyzer, +}; + const struct bpf_prog_ops xdp_prog_ops = { .test_run = bpf_prog_test_run_xdp, }; -- cgit v1.2.3 From 7a0947e755084b918e33242fd558e55cb443408e Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 17 Oct 2017 17:16:52 -0700 Subject: dql: make dql_init return void dql_init always returned 0, and the only place that uses it in network core code didn't care about the return value anyway. 
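As context for the API being touched, a minimal, hypothetical driver-side sketch of the usual dql pattern after this change; dql_queued() and dql_avail() are assumed to come from the same header and are not part of this diff:

#include <linux/dynamic_queue_limits.h>

static struct dql txq_dql;

static void txq_init(void)
{
	dql_init(&txq_dql, HZ);		/* now returns void: nothing to check */
}

static bool txq_enqueue(unsigned int bytes)
{
	if (dql_avail(&txq_dql) < 0)
		return false;		/* byte limit reached: caller stops the queue */
	dql_queued(&txq_dql, bytes);
	return true;
}

static void txq_tx_completed(unsigned int bytes)
{
	dql_completed(&txq_dql, bytes);	/* completions recompute the limit */
}
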
Signed-off-by: Stephen Hemminger Acked-by: Hiroaki SHIMODA Signed-off-by: David S. Miller --- include/linux/dynamic_queue_limits.h | 2 +- lib/dynamic_queue_limits.c | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/dynamic_queue_limits.h b/include/linux/dynamic_queue_limits.h index a4be70398ce1..f69f98541953 100644 --- a/include/linux/dynamic_queue_limits.h +++ b/include/linux/dynamic_queue_limits.h @@ -98,7 +98,7 @@ void dql_completed(struct dql *dql, unsigned int count); void dql_reset(struct dql *dql); /* Initialize dql state */ -int dql_init(struct dql *dql, unsigned hold_time); +void dql_init(struct dql *dql, unsigned int hold_time); #endif /* _KERNEL_ */ diff --git a/lib/dynamic_queue_limits.c b/lib/dynamic_queue_limits.c index f346715e2255..dbe61c4c2a97 100644 --- a/lib/dynamic_queue_limits.c +++ b/lib/dynamic_queue_limits.c @@ -127,12 +127,11 @@ void dql_reset(struct dql *dql) } EXPORT_SYMBOL(dql_reset); -int dql_init(struct dql *dql, unsigned hold_time) +void dql_init(struct dql *dql, unsigned int hold_time) { dql->max_limit = DQL_MAX_LIMIT; dql->min_limit = 0; dql->slack_hold_time = hold_time; dql_reset(dql); - return 0; } EXPORT_SYMBOL(dql_init); -- cgit v1.2.3 From 890056783c60ad9d0789774af2bc10fe4f27dd9d Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 18 Oct 2017 08:17:29 -0700 Subject: tcp: Remove use of inet6_sk and add IPv6 checks to tracepoint 386fd5da401d ("tcp: Check daddr_cache before use in tracepoint") was the second version of the tracepoint fixup patch. This patch is the delta between v2 and v3. Specifically, remove the use of inet6_sk and check sk_family as requested by Eric and add IS_ENABLED(CONFIG_IPV6) around the use of sk_v6_rcv_saddr and sk_v6_daddr as done in sock_common (noted by Cong). Signed-off-by: David Ahern Reviewed-by: Eric Dumazet Tested-by: Song Liu Signed-off-by: David S. Miller --- include/trace/events/tcp.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index f51c130f1e0f..c3220d914475 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -43,13 +43,15 @@ TRACE_EVENT(tcp_retransmit_skb, p32 = (__be32 *) __entry->daddr; *p32 = inet->inet_daddr; - /* IPv6 socket ? */ - if (inet6_sk(sk)) { +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6) { pin6 = (struct in6_addr *)__entry->saddr_v6; *pin6 = sk->sk_v6_rcv_saddr; pin6 = (struct in6_addr *)__entry->daddr_v6; *pin6 = sk->sk_v6_daddr; - } else { + } else +#endif + { pin6 = (struct in6_addr *)__entry->saddr_v6; ipv6_addr_set_v4mapped(inet->inet_saddr, pin6); pin6 = (struct in6_addr *)__entry->daddr_v6; -- cgit v1.2.3 From de95e04791a03de5cb681980a3880db6919e3b4a Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 18 Oct 2017 09:56:54 -0700 Subject: net: Add extack to validator_info structs used for address notifier Add extack to in_validator_info and in6_validator_info. Update the one user of each, ipvlan, to return an error message for failures. Only manual configuration of an address is plumbed in the IPv6 code path. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. 
Miller --- drivers/net/ipvlan/ipvlan_main.c | 10 ++++++++-- include/linux/inetdevice.h | 1 + include/net/addrconf.h | 1 + net/ipv4/devinet.c | 8 +++++--- net/ipv6/addrconf.c | 22 ++++++++++++---------- 5 files changed, 27 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index 6842739b6679..f0ab55df57f1 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -847,8 +847,11 @@ static int ipvlan_addr6_validator_event(struct notifier_block *unused, switch (event) { case NETDEV_UP: - if (ipvlan_addr_busy(ipvlan->port, &i6vi->i6vi_addr, true)) + if (ipvlan_addr_busy(ipvlan->port, &i6vi->i6vi_addr, true)) { + NL_SET_ERR_MSG(i6vi->extack, + "Address already assigned to an ipvlan device"); return notifier_from_errno(-EADDRINUSE); + } break; } @@ -917,8 +920,11 @@ static int ipvlan_addr4_validator_event(struct notifier_block *unused, switch (event) { case NETDEV_UP: - if (ipvlan_addr_busy(ipvlan->port, &ivi->ivi_addr, false)) + if (ipvlan_addr_busy(ipvlan->port, &ivi->ivi_addr, false)) { + NL_SET_ERR_MSG(ivi->extack, + "Address already assigned to an ipvlan device"); return notifier_from_errno(-EADDRINUSE); + } break; } diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h index 751d051f0bc7..681dff30940b 100644 --- a/include/linux/inetdevice.h +++ b/include/linux/inetdevice.h @@ -154,6 +154,7 @@ struct in_ifaddr { struct in_validator_info { __be32 ivi_addr; struct in_device *ivi_dev; + struct netlink_ext_ack *extack; }; int register_inetaddr_notifier(struct notifier_block *nb); diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 87981cd63180..b8b16437c6d5 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -55,6 +55,7 @@ struct prefix_info { struct in6_validator_info { struct in6_addr i6vi_addr; struct inet6_dev *i6vi_dev; + struct netlink_ext_ack *extack; }; #define IN6_ADDR_HSIZE_SHIFT 4 diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index e1e2ec0525e6..a4573bccd6da 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -444,7 +444,7 @@ static void check_lifetime(struct work_struct *work); static DECLARE_DELAYED_WORK(check_lifetime_work, check_lifetime); static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, - u32 portid) + u32 portid, struct netlink_ext_ack *extack) { struct in_device *in_dev = ifa->ifa_dev; struct in_ifaddr *ifa1, **ifap, **last_primary; @@ -489,6 +489,7 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, */ ivi.ivi_addr = ifa->ifa_address; ivi.ivi_dev = ifa->ifa_dev; + ivi.extack = extack; ret = blocking_notifier_call_chain(&inetaddr_validator_chain, NETDEV_UP, &ivi); ret = notifier_to_errno(ret); @@ -521,7 +522,7 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, static int inet_insert_ifa(struct in_ifaddr *ifa) { - return __inet_insert_ifa(ifa, NULL, 0); + return __inet_insert_ifa(ifa, NULL, 0, NULL); } static int inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa) @@ -902,7 +903,8 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, return ret; } } - return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid); + return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid, + extack); } else { inet_free_ifa(ifa); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index dd9c0c435f71..93f9c0a61911 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -987,7 +987,7 @@ static struct inet6_ifaddr * 
ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, const struct in6_addr *peer_addr, int pfxlen, int scope, u32 flags, u32 valid_lft, u32 prefered_lft, - bool can_block) + bool can_block, struct netlink_ext_ack *extack) { gfp_t gfp_flags = can_block ? GFP_KERNEL : GFP_ATOMIC; struct net *net = dev_net(idev->dev); @@ -1019,6 +1019,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, struct in6_validator_info i6vi = { .i6vi_addr = *addr, .i6vi_dev = idev, + .extack = extack, }; err = inet6addr_validator_notifier_call_chain(NETDEV_UP, &i6vi); @@ -1356,7 +1357,7 @@ retry: ift = ipv6_add_addr(idev, &addr, NULL, tmp_plen, ipv6_addr_scope(&addr), addr_flags, - tmp_valid_lft, tmp_prefered_lft, true); + tmp_valid_lft, tmp_prefered_lft, true, NULL); if (IS_ERR(ift)) { in6_ifa_put(ifp); in6_dev_put(idev); @@ -2040,7 +2041,7 @@ void addrconf_dad_failure(struct inet6_ifaddr *ifp) ifp2 = ipv6_add_addr(idev, &new_addr, NULL, pfxlen, scope, flags, valid_lft, - preferred_lft, false); + preferred_lft, false, NULL); if (IS_ERR(ifp2)) goto lock_errdad; @@ -2498,7 +2499,7 @@ int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev, pinfo->prefix_len, addr_type&IPV6_ADDR_SCOPE_MASK, addr_flags, valid_lft, - prefered_lft, false); + prefered_lft, false, NULL); if (IS_ERR_OR_NULL(ifp)) return -1; @@ -2808,7 +2809,8 @@ static int inet6_addr_add(struct net *net, int ifindex, const struct in6_addr *pfx, const struct in6_addr *peer_pfx, unsigned int plen, __u32 ifa_flags, - __u32 prefered_lft, __u32 valid_lft) + __u32 prefered_lft, __u32 valid_lft, + struct netlink_ext_ack *extack) { struct inet6_ifaddr *ifp; struct inet6_dev *idev; @@ -2867,7 +2869,7 @@ static int inet6_addr_add(struct net *net, int ifindex, } ifp = ipv6_add_addr(idev, pfx, peer_pfx, plen, scope, ifa_flags, - valid_lft, prefered_lft, true); + valid_lft, prefered_lft, true, extack); if (!IS_ERR(ifp)) { if (!(ifa_flags & IFA_F_NOPREFIXROUTE)) { @@ -2952,7 +2954,7 @@ int addrconf_add_ifaddr(struct net *net, void __user *arg) rtnl_lock(); err = inet6_addr_add(net, ireq.ifr6_ifindex, &ireq.ifr6_addr, NULL, ireq.ifr6_prefixlen, IFA_F_PERMANENT, - INFINITY_LIFE_TIME, INFINITY_LIFE_TIME); + INFINITY_LIFE_TIME, INFINITY_LIFE_TIME, NULL); rtnl_unlock(); return err; } @@ -2983,7 +2985,7 @@ static void add_addr(struct inet6_dev *idev, const struct in6_addr *addr, ifp = ipv6_add_addr(idev, addr, NULL, plen, scope, IFA_F_PERMANENT, INFINITY_LIFE_TIME, INFINITY_LIFE_TIME, - true); + true, NULL); if (!IS_ERR(ifp)) { spin_lock_bh(&ifp->lock); ifp->flags &= ~IFA_F_TENTATIVE; @@ -3083,7 +3085,7 @@ void addrconf_add_linklocal(struct inet6_dev *idev, #endif ifp = ipv6_add_addr(idev, addr, NULL, 64, IFA_LINK, addr_flags, - INFINITY_LIFE_TIME, INFINITY_LIFE_TIME, true); + INFINITY_LIFE_TIME, INFINITY_LIFE_TIME, true, NULL); if (!IS_ERR(ifp)) { addrconf_prefix_route(&ifp->addr, ifp->prefix_len, idev->dev, 0, 0); addrconf_dad_start(ifp); @@ -4586,7 +4588,7 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, */ return inet6_addr_add(net, ifm->ifa_index, pfx, peer_pfx, ifm->ifa_prefixlen, ifa_flags, - preferred_lft, valid_lft); + preferred_lft, valid_lft, extack); } if (nlh->nlmsg_flags & NLM_F_EXCL || -- cgit v1.2.3 From 1fba70e5b6bed53496ba1f1f16127f5be01b5fb6 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 18 Oct 2017 11:22:51 -0700 Subject: tcp: socket option to set TCP fast open key New socket option TCP_FASTOPEN_KEY to allow different keys per listener. 
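For illustration, a hedged userspace sketch of installing such a key on a listening socket (not part of this patch; the key bytes are placeholders, and the kernel code below rejects any length other than 16):

#include <string.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

static int set_listener_tfo_key(int lfd)
{
	unsigned char key[16];	/* must be exactly TCP_FASTOPEN_KEY_LENGTH */
	int qlen = 128;

	memset(key, 0xa5, sizeof(key));	/* placeholder secret */
	if (setsockopt(lfd, IPPROTO_TCP, TCP_FASTOPEN_KEY, key, sizeof(key)) < 0)
		return -1;
	/* Fast Open itself is still enabled the usual way */
	return setsockopt(lfd, IPPROTO_TCP, TCP_FASTOPEN, &qlen, sizeof(qlen));
}
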
The listener by default uses the global key until the socket option is set. The key is 16 bytes of binary data. This option has no effect on regular non-listener TCP sockets. Signed-off-by: Yuchung Cheng Reviewed-by: Eric Dumazet Reviewed-by: Christoph Paasch Signed-off-by: David S. Miller --- include/net/request_sock.h | 2 ++ include/net/tcp.h | 5 +++-- include/uapi/linux/tcp.h | 1 + net/ipv4/sysctl_net_ipv4.c | 3 ++- net/ipv4/tcp.c | 33 +++++++++++++++++++++++++++ net/ipv4/tcp_fastopen.c | 56 +++++++++++++++++++++++++++++++++------------- net/ipv4/tcp_ipv4.c | 1 + 7 files changed, 82 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/net/request_sock.h b/include/net/request_sock.h index 23e22054aa60..347015515a7d 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -150,6 +150,8 @@ struct fastopen_queue { spinlock_t lock; int qlen; /* # of pending (TCP_SYN_RECV) reqs */ int max_qlen; /* != 0 iff TFO is currently enabled */ + + struct tcp_fastopen_context __rcu *ctx; /* cipher context for cookie */ }; /** struct request_sock_queue - queue of request_socks diff --git a/include/net/tcp.h b/include/net/tcp.h index 3b3b9b968e2d..1efe8365cb28 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1555,9 +1555,10 @@ struct tcp_fastopen_request { int copied; /* queued in tcp_connect() */ }; void tcp_free_fastopen_req(struct tcp_sock *tp); - +void tcp_fastopen_destroy_cipher(struct sock *sk); void tcp_fastopen_ctx_destroy(struct net *net); -int tcp_fastopen_reset_cipher(struct net *net, void *key, unsigned int len); +int tcp_fastopen_reset_cipher(struct net *net, struct sock *sk, + void *key, unsigned int len); void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb); struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, struct request_sock *req, diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 15c25eccab2b..69c7493e42f8 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -119,6 +119,7 @@ enum { #define TCP_FASTOPEN_CONNECT 30 /* Attempt FastOpen with connect */ #define TCP_ULP 31 /* Attach a ULP to a TCP connection */ #define TCP_MD5SIG_EXT 32 /* TCP MD5 Signature with extensions */ +#define TCP_FASTOPEN_KEY 33 /* Set the key for Fast Open (cookie) */ struct tcp_repair_opt { __u32 opt_code; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index cac8dd309f39..81d218346cf7 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -284,7 +284,8 @@ static int proc_tcp_fastopen_key(struct ctl_table *table, int write, ret = -EINVAL; goto bad_key; } - tcp_fastopen_reset_cipher(net, user_key, TCP_FASTOPEN_KEY_LENGTH); + tcp_fastopen_reset_cipher(net, NULL, user_key, + TCP_FASTOPEN_KEY_LENGTH); } bad_key: diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 3b34850d361f..8b1fa4dd4538 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2571,6 +2571,17 @@ static int do_tcp_setsockopt(struct sock *sk, int level, release_sock(sk); return err; } + case TCP_FASTOPEN_KEY: { + __u8 key[TCP_FASTOPEN_KEY_LENGTH]; + + if (optlen != sizeof(key)) + return -EINVAL; + + if (copy_from_user(key, optval, optlen)) + return -EFAULT; + + return tcp_fastopen_reset_cipher(net, sk, key, sizeof(key)); + } default: /* fallthru */ break; @@ -3157,6 +3168,28 @@ static int do_tcp_getsockopt(struct sock *sk, int level, return -EFAULT; return 0; + case TCP_FASTOPEN_KEY: { + __u8 key[TCP_FASTOPEN_KEY_LENGTH]; + struct tcp_fastopen_context *ctx; + + if (get_user(len, optlen)) + return 
-EFAULT; + + rcu_read_lock(); + ctx = rcu_dereference(icsk->icsk_accept_queue.fastopenq.ctx); + if (ctx) + memcpy(key, ctx->key, sizeof(key)); + else + len = 0; + rcu_read_unlock(); + + len = min_t(unsigned int, len, sizeof(key)); + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, key, len)) + return -EFAULT; + return 0; + } case TCP_THIN_LINEAR_TIMEOUTS: val = tp->thin_lto; break; diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 7ee4aadcdd71..21075ce19cb6 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -29,7 +29,7 @@ void tcp_fastopen_init_key_once(struct net *net) * for a valid cookie, so this is an acceptable risk. */ get_random_bytes(key, sizeof(key)); - tcp_fastopen_reset_cipher(net, key, sizeof(key)); + tcp_fastopen_reset_cipher(net, NULL, key, sizeof(key)); } static void tcp_fastopen_ctx_free(struct rcu_head *head) @@ -40,6 +40,16 @@ static void tcp_fastopen_ctx_free(struct rcu_head *head) kfree(ctx); } +void tcp_fastopen_destroy_cipher(struct sock *sk) +{ + struct tcp_fastopen_context *ctx; + + ctx = rcu_dereference_protected( + inet_csk(sk)->icsk_accept_queue.fastopenq.ctx, 1); + if (ctx) + call_rcu(&ctx->rcu, tcp_fastopen_ctx_free); +} + void tcp_fastopen_ctx_destroy(struct net *net) { struct tcp_fastopen_context *ctxt; @@ -55,10 +65,12 @@ void tcp_fastopen_ctx_destroy(struct net *net) call_rcu(&ctxt->rcu, tcp_fastopen_ctx_free); } -int tcp_fastopen_reset_cipher(struct net *net, void *key, unsigned int len) +int tcp_fastopen_reset_cipher(struct net *net, struct sock *sk, + void *key, unsigned int len) { - int err; struct tcp_fastopen_context *ctx, *octx; + struct fastopen_queue *q; + int err; ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) @@ -79,27 +91,39 @@ error: kfree(ctx); } memcpy(ctx->key, key, len); - spin_lock(&net->ipv4.tcp_fastopen_ctx_lock); - octx = rcu_dereference_protected(net->ipv4.tcp_fastopen_ctx, - lockdep_is_held(&net->ipv4.tcp_fastopen_ctx_lock)); - rcu_assign_pointer(net->ipv4.tcp_fastopen_ctx, ctx); - spin_unlock(&net->ipv4.tcp_fastopen_ctx_lock); + if (sk) { + q = &inet_csk(sk)->icsk_accept_queue.fastopenq; + spin_lock_bh(&q->lock); + octx = rcu_dereference_protected(q->ctx, + lockdep_is_held(&q->lock)); + rcu_assign_pointer(q->ctx, ctx); + spin_unlock_bh(&q->lock); + } else { + spin_lock(&net->ipv4.tcp_fastopen_ctx_lock); + octx = rcu_dereference_protected(net->ipv4.tcp_fastopen_ctx, + lockdep_is_held(&net->ipv4.tcp_fastopen_ctx_lock)); + rcu_assign_pointer(net->ipv4.tcp_fastopen_ctx, ctx); + spin_unlock(&net->ipv4.tcp_fastopen_ctx_lock); + } if (octx) call_rcu(&octx->rcu, tcp_fastopen_ctx_free); return err; } -static bool __tcp_fastopen_cookie_gen(struct net *net, - const void *path, +static bool __tcp_fastopen_cookie_gen(struct sock *sk, const void *path, struct tcp_fastopen_cookie *foc) { struct tcp_fastopen_context *ctx; bool ok = false; rcu_read_lock(); - ctx = rcu_dereference(net->ipv4.tcp_fastopen_ctx); + + ctx = rcu_dereference(inet_csk(sk)->icsk_accept_queue.fastopenq.ctx); + if (!ctx) + ctx = rcu_dereference(sock_net(sk)->ipv4.tcp_fastopen_ctx); + if (ctx) { crypto_cipher_encrypt_one(ctx->tfm, foc->val, path); foc->len = TCP_FASTOPEN_COOKIE_SIZE; @@ -115,7 +139,7 @@ static bool __tcp_fastopen_cookie_gen(struct net *net, * * XXX (TFO) - refactor when TCP_FASTOPEN_COOKIE_SIZE != AES_BLOCK_SIZE. 
*/ -static bool tcp_fastopen_cookie_gen(struct net *net, +static bool tcp_fastopen_cookie_gen(struct sock *sk, struct request_sock *req, struct sk_buff *syn, struct tcp_fastopen_cookie *foc) @@ -124,7 +148,7 @@ static bool tcp_fastopen_cookie_gen(struct net *net, const struct iphdr *iph = ip_hdr(syn); __be32 path[4] = { iph->saddr, iph->daddr, 0, 0 }; - return __tcp_fastopen_cookie_gen(net, path, foc); + return __tcp_fastopen_cookie_gen(sk, path, foc); } #if IS_ENABLED(CONFIG_IPV6) @@ -132,13 +156,13 @@ static bool tcp_fastopen_cookie_gen(struct net *net, const struct ipv6hdr *ip6h = ipv6_hdr(syn); struct tcp_fastopen_cookie tmp; - if (__tcp_fastopen_cookie_gen(net, &ip6h->saddr, &tmp)) { + if (__tcp_fastopen_cookie_gen(sk, &ip6h->saddr, &tmp)) { struct in6_addr *buf = &tmp.addr; int i; for (i = 0; i < 4; i++) buf->s6_addr32[i] ^= ip6h->daddr.s6_addr32[i]; - return __tcp_fastopen_cookie_gen(net, buf, foc); + return __tcp_fastopen_cookie_gen(sk, buf, foc); } } #endif @@ -313,7 +337,7 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, goto fastopen; if (foc->len >= 0 && /* Client presents or requests a cookie */ - tcp_fastopen_cookie_gen(sock_net(sk), req, skb, &valid_foc) && + tcp_fastopen_cookie_gen(sk, req, skb, &valid_foc) && foc->len == TCP_FASTOPEN_COOKIE_SIZE && foc->len == valid_foc.len && !memcmp(foc->val, valid_foc.val, foc->len)) { diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index ecee4ddb24c5..28ca4e177047 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1893,6 +1893,7 @@ void tcp_v4_destroy_sock(struct sock *sk) /* If socket is aborted during connect operation */ tcp_free_fastopen_req(tp); + tcp_fastopen_destroy_cipher(sk); tcp_saved_syn_free(tp); sk_sockets_allocated_dec(sk); -- cgit v1.2.3 From 6e71b04a82248ccf13a94b85cbc674a9fefe53f5 Mon Sep 17 00:00:00 2001 From: Chenbo Feng Date: Wed, 18 Oct 2017 13:00:22 -0700 Subject: bpf: Add file mode configuration into bpf maps Introduce map read/write flags to the eBPF syscalls that return a map fd. The flags are used to set up the file mode when constructing a new file descriptor for bpf maps. To preserve backward compatibility, f_flags is set to O_RDWR if the flag passed by the syscall is 0; otherwise it must be O_RDONLY or O_WRONLY. When userspace wants to modify or read the map content, the kernel checks the file mode to see whether the operation is allowed. Signed-off-by: Chenbo Feng Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- include/linux/bpf.h | 8 +++-- include/uapi/linux/bpf.h | 6 ++++ kernel/bpf/arraymap.c | 6 +++- kernel/bpf/devmap.c | 5 ++- kernel/bpf/hashtab.c | 5 +-- kernel/bpf/inode.c | 15 ++++++--- kernel/bpf/lpm_trie.c | 3 +- kernel/bpf/sockmap.c | 5 ++- kernel/bpf/stackmap.c | 5 ++- kernel/bpf/syscall.c | 88 ++++++++++++++++++++++++++++++++++++++++++------ net/netfilter/xt_bpf.c | 2 +- 11 files changed, 122 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d67ccdc0099f..3e5508f2fa87 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -315,11 +315,11 @@ void bpf_map_area_free(void *base); extern int sysctl_unprivileged_bpf_disabled; -int bpf_map_new_fd(struct bpf_map *map); +int bpf_map_new_fd(struct bpf_map *map, int flags); int bpf_prog_new_fd(struct bpf_prog *prog); int bpf_obj_pin_user(u32 ufd, const char __user *pathname); -int bpf_obj_get_user(const char __user *pathname); +int bpf_obj_get_user(const char __user *pathname, int flags); int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value); int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value); @@ -338,6 +338,8 @@ int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file, void *key, void *value, u64 map_flags); int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value); +int bpf_get_file_flag(int flags); + /* memcpy that is used with 8-byte aligned pointers, power-of-8 size and * forced to use 'long' read/writes to try to atomically copy long counters. * Best-effort only. No barriers here, since it _will_ race with concurrent @@ -421,7 +423,7 @@ static inline void __bpf_prog_uncharge(struct user_struct *user, u32 pages) { } -static inline int bpf_obj_get_user(const char __user *pathname) +static inline int bpf_obj_get_user(const char __user *pathname, int flags) { return -EOPNOTSUPP; } diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4303fb6c3817..d83f95ea6a1b 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -218,6 +218,10 @@ enum bpf_attach_type { #define BPF_OBJ_NAME_LEN 16U +/* Flags for accessing BPF object */ +#define BPF_F_RDONLY (1U << 3) +#define BPF_F_WRONLY (1U << 4) + union bpf_attr { struct { /* anonymous struct used by BPF_MAP_CREATE command */ __u32 map_type; /* one of enum bpf_map_type */ @@ -260,6 +264,7 @@ union bpf_attr { struct { /* anonymous struct used by BPF_OBJ_* commands */ __aligned_u64 pathname; __u32 bpf_fd; + __u32 file_flags; }; struct { /* anonymous struct used by BPF_PROG_ATTACH/DETACH commands */ @@ -287,6 +292,7 @@ union bpf_attr { __u32 map_id; }; __u32 next_id; + __u32 open_flags; }; struct { /* anonymous struct used by BPF_OBJ_GET_INFO_BY_FD */ diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 68d866628be0..988c04c91e10 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -19,6 +19,9 @@ #include "map_in_map.h" +#define ARRAY_CREATE_FLAG_MASK \ + (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) + static void bpf_array_free_percpu(struct bpf_array *array) { int i; @@ -56,7 +59,8 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || - attr->value_size == 0 || attr->map_flags & ~BPF_F_NUMA_NODE || + attr->value_size == 0 || + attr->map_flags & ~ARRAY_CREATE_FLAG_MASK || (percpu && numa_node != NUMA_NO_NODE)) return ERR_PTR(-EINVAL); diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 
e093d9a2c4dd..e5d3de7cff2e 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -50,6 +50,9 @@ #include #include +#define DEV_CREATE_FLAG_MASK \ + (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) + struct bpf_dtab_netdev { struct net_device *dev; struct bpf_dtab *dtab; @@ -80,7 +83,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || - attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE) + attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); dtab = kzalloc(sizeof(*dtab), GFP_USER); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 431126f31ea3..919955236e63 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -18,8 +18,9 @@ #include "bpf_lru_list.h" #include "map_in_map.h" -#define HTAB_CREATE_FLAG_MASK \ - (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE) +#define HTAB_CREATE_FLAG_MASK \ + (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \ + BPF_F_RDONLY | BPF_F_WRONLY) struct bucket { struct hlist_nulls_head head; diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index be1dde967208..01aaef1a77c5 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -295,7 +295,7 @@ out: } static void *bpf_obj_do_get(const struct filename *pathname, - enum bpf_type *type) + enum bpf_type *type, int flags) { struct inode *inode; struct path path; @@ -307,7 +307,7 @@ static void *bpf_obj_do_get(const struct filename *pathname, return ERR_PTR(ret); inode = d_backing_inode(path.dentry); - ret = inode_permission(inode, MAY_WRITE); + ret = inode_permission(inode, ACC_MODE(flags)); if (ret) goto out; @@ -326,18 +326,23 @@ out: return ERR_PTR(ret); } -int bpf_obj_get_user(const char __user *pathname) +int bpf_obj_get_user(const char __user *pathname, int flags) { enum bpf_type type = BPF_TYPE_UNSPEC; struct filename *pname; int ret = -ENOENT; + int f_flags; void *raw; + f_flags = bpf_get_file_flag(flags); + if (f_flags < 0) + return f_flags; + pname = getname(pathname); if (IS_ERR(pname)) return PTR_ERR(pname); - raw = bpf_obj_do_get(pname, &type); + raw = bpf_obj_do_get(pname, &type, f_flags); if (IS_ERR(raw)) { ret = PTR_ERR(raw); goto out; @@ -346,7 +351,7 @@ int bpf_obj_get_user(const char __user *pathname) if (type == BPF_TYPE_PROG) ret = bpf_prog_new_fd(raw); else if (type == BPF_TYPE_MAP) - ret = bpf_map_new_fd(raw); + ret = bpf_map_new_fd(raw, f_flags); else goto out; diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 34d8a690ea05..885e45479680 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -495,7 +495,8 @@ out: #define LPM_KEY_SIZE_MAX LPM_KEY_SIZE(LPM_DATA_SIZE_MAX) #define LPM_KEY_SIZE_MIN LPM_KEY_SIZE(LPM_DATA_SIZE_MIN) -#define LPM_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_NUMA_NODE) +#define LPM_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_NUMA_NODE | \ + BPF_F_RDONLY | BPF_F_WRONLY) static struct bpf_map *trie_alloc(union bpf_attr *attr) { diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index a298d6666698..86ec846f2d5e 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -40,6 +40,9 @@ #include #include +#define SOCK_CREATE_FLAG_MASK \ + (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) + struct bpf_stab { struct bpf_map map; struct sock **sock_map; @@ -489,7 +492,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || - attr->value_size != 4 || 
attr->map_flags & ~BPF_F_NUMA_NODE) + attr->value_size != 4 || attr->map_flags & ~SOCK_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); if (attr->value_size > KMALLOC_MAX_SIZE) diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 135be433e9a0..a15bc636cc98 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -11,6 +11,9 @@ #include #include "percpu_freelist.h" +#define STACK_CREATE_FLAG_MASK \ + (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) + struct stack_map_bucket { struct pcpu_freelist_node fnode; u32 hash; @@ -60,7 +63,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) if (!capable(CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); - if (attr->map_flags & ~BPF_F_NUMA_NODE) + if (attr->map_flags & ~STACK_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); /* check sanity of attributes */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0e893cac6795..676a06e6b322 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -34,6 +34,8 @@ #define IS_FD_HASH(map) ((map)->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) #define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_HASH(map)) +#define BPF_OBJ_FLAG_MASK (BPF_F_RDONLY | BPF_F_WRONLY) + DEFINE_PER_CPU(int, bpf_prog_active); static DEFINE_IDR(prog_idr); static DEFINE_SPINLOCK(prog_idr_lock); @@ -294,17 +296,48 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) } #endif +static ssize_t bpf_dummy_read(struct file *filp, char __user *buf, size_t siz, + loff_t *ppos) +{ + /* We need this handler such that alloc_file() enables + * f_mode with FMODE_CAN_READ. + */ + return -EINVAL; +} + +static ssize_t bpf_dummy_write(struct file *filp, const char __user *buf, + size_t siz, loff_t *ppos) +{ + /* We need this handler such that alloc_file() enables + * f_mode with FMODE_CAN_WRITE. + */ + return -EINVAL; +} + static const struct file_operations bpf_map_fops = { #ifdef CONFIG_PROC_FS .show_fdinfo = bpf_map_show_fdinfo, #endif .release = bpf_map_release, + .read = bpf_dummy_read, + .write = bpf_dummy_write, }; -int bpf_map_new_fd(struct bpf_map *map) +int bpf_map_new_fd(struct bpf_map *map, int flags) { return anon_inode_getfd("bpf-map", &bpf_map_fops, map, - O_RDWR | O_CLOEXEC); + flags | O_CLOEXEC); +} + +int bpf_get_file_flag(int flags) +{ + if ((flags & BPF_F_RDONLY) && (flags & BPF_F_WRONLY)) + return -EINVAL; + if (flags & BPF_F_RDONLY) + return O_RDONLY; + if (flags & BPF_F_WRONLY) + return O_WRONLY; + return O_RDWR; } /* helper macro to check that unused fields 'union bpf_attr' are zero */ @@ -344,12 +377,17 @@ static int map_create(union bpf_attr *attr) { int numa_node = bpf_map_attr_numa_node(attr); struct bpf_map *map; + int f_flags; int err; err = CHECK_ATTR(BPF_MAP_CREATE); if (err) return -EINVAL; + f_flags = bpf_get_file_flag(attr->map_flags); + if (f_flags < 0) + return f_flags; + if (numa_node != NUMA_NO_NODE && ((unsigned int)numa_node >= nr_node_ids || !node_online(numa_node))) @@ -375,7 +413,7 @@ static int map_create(union bpf_attr *attr) if (err) goto free_map; - err = bpf_map_new_fd(map); + err = bpf_map_new_fd(map, f_flags); if (err < 0) { /* failed to allocate fd. 
* bpf_map_put() is needed because the above @@ -490,6 +528,11 @@ static int map_lookup_elem(union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); + if (!(f.file->f_mode & FMODE_CAN_READ)) { + err = -EPERM; + goto err_put; + } + key = memdup_user(ukey, map->key_size); if (IS_ERR(key)) { err = PTR_ERR(key); @@ -570,6 +613,11 @@ static int map_update_elem(union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); + if (!(f.file->f_mode & FMODE_CAN_WRITE)) { + err = -EPERM; + goto err_put; + } + key = memdup_user(ukey, map->key_size); if (IS_ERR(key)) { err = PTR_ERR(key); @@ -659,6 +707,11 @@ static int map_delete_elem(union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); + if (!(f.file->f_mode & FMODE_CAN_WRITE)) { + err = -EPERM; + goto err_put; + } + key = memdup_user(ukey, map->key_size); if (IS_ERR(key)) { err = PTR_ERR(key); @@ -702,6 +755,11 @@ static int map_get_next_key(union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); + if (!(f.file->f_mode & FMODE_CAN_READ)) { + err = -EPERM; + goto err_put; + } + if (ukey) { key = memdup_user(ukey, map->key_size); if (IS_ERR(key)) { @@ -908,6 +966,8 @@ static const struct file_operations bpf_prog_fops = { .show_fdinfo = bpf_prog_show_fdinfo, #endif .release = bpf_prog_release, + .read = bpf_dummy_read, + .write = bpf_dummy_write, }; int bpf_prog_new_fd(struct bpf_prog *prog) @@ -1117,11 +1177,11 @@ free_prog_nouncharge: return err; } -#define BPF_OBJ_LAST_FIELD bpf_fd +#define BPF_OBJ_LAST_FIELD file_flags static int bpf_obj_pin(const union bpf_attr *attr) { - if (CHECK_ATTR(BPF_OBJ)) + if (CHECK_ATTR(BPF_OBJ) || attr->file_flags != 0) return -EINVAL; return bpf_obj_pin_user(attr->bpf_fd, u64_to_user_ptr(attr->pathname)); @@ -1129,10 +1189,12 @@ static int bpf_obj_pin(const union bpf_attr *attr) static int bpf_obj_get(const union bpf_attr *attr) { - if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0) + if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0 || + attr->file_flags & ~BPF_OBJ_FLAG_MASK) return -EINVAL; - return bpf_obj_get_user(u64_to_user_ptr(attr->pathname)); + return bpf_obj_get_user(u64_to_user_ptr(attr->pathname), + attr->file_flags); } #ifdef CONFIG_CGROUP_BPF @@ -1392,20 +1454,26 @@ static int bpf_prog_get_fd_by_id(const union bpf_attr *attr) return fd; } -#define BPF_MAP_GET_FD_BY_ID_LAST_FIELD map_id +#define BPF_MAP_GET_FD_BY_ID_LAST_FIELD open_flags static int bpf_map_get_fd_by_id(const union bpf_attr *attr) { struct bpf_map *map; u32 id = attr->map_id; + int f_flags; int fd; - if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID)) + if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID) || + attr->open_flags & ~BPF_OBJ_FLAG_MASK) return -EINVAL; if (!capable(CAP_SYS_ADMIN)) return -EPERM; + f_flags = bpf_get_file_flag(attr->open_flags); + if (f_flags < 0) + return f_flags; + spin_lock_bh(&map_idr_lock); map = idr_find(&map_idr, id); if (map) @@ -1417,7 +1485,7 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); - fd = bpf_map_new_fd(map); + fd = bpf_map_new_fd(map, f_flags); if (fd < 0) bpf_map_put(map); diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c index 29123934887b..041da0d9c06f 100644 --- a/net/netfilter/xt_bpf.c +++ b/net/netfilter/xt_bpf.c @@ -56,7 +56,7 @@ static int __bpf_mt_check_path(const char *path, struct bpf_prog **ret) int retval, fd; set_fs(KERNEL_DS); - fd = bpf_obj_get_user(path); + fd = bpf_obj_get_user(path, 0); set_fs(oldfs); if (fd < 0) return fd; -- cgit v1.2.3 From afdb09c720b62b8090584c11151d856df330e57d Mon Sep 17 00:00:00 2001 From: Chenbo Feng Date: Wed, 18 
Oct 2017 13:00:24 -0700 Subject: security: bpf: Add LSM hooks for bpf object related syscall Introduce several LSM hooks for the syscalls that allow userspace to access eBPF objects such as eBPF programs and eBPF maps. The security check is aimed at enforcing per-object security protection for eBPF objects, so that only processes with the right privileges can read/write to a specific map or use a specific eBPF program. Besides that, a general security hook is added before the multiplexer of the bpf syscall to check the cmd and the attributes used for the command. The actual security module can decide which commands need to be checked and how they should be checked. Signed-off-by: Chenbo Feng Acked-by: James Morris Signed-off-by: David S. Miller --- include/linux/bpf.h | 6 ++++++ include/linux/lsm_hooks.h | 54 +++++++++++++++++++++++++++++++++++++++++++++++ include/linux/security.h | 45 +++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 34 +++++++++++++++++++++++++++-- security/security.c | 32 ++++++++++++++++++++++++++++ 5 files changed, 169 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 3e5508f2fa87..84c192da3e0b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -57,6 +57,9 @@ struct bpf_map { atomic_t usercnt; struct bpf_map *inner_map_meta; char name[BPF_OBJ_NAME_LEN]; +#ifdef CONFIG_SECURITY + void *security; +#endif }; /* function argument constraints */ @@ -193,6 +196,9 @@ struct bpf_prog_aux { struct user_struct *user; u64 load_time; /* ns since boottime */ char name[BPF_OBJ_NAME_LEN]; +#ifdef CONFIG_SECURITY + void *security; +#endif union { struct work_struct work; struct rcu_head rcu; diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index c9258124e417..7161d8e7ee79 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -1351,6 +1351,40 @@ * @inode we wish to get the security context of. * @ctx is a pointer in which to place the allocated security context. * @ctxlen points to the place to put the length of @ctx. + * + * Security hooks for using the eBPF maps and programs functionalities through + * eBPF syscalls. + * + * @bpf: + * Do an initial check for all bpf syscalls after the attribute is copied + * into the kernel. The actual security module can implement its own + * rules to check the specific cmd it needs. + * + * @bpf_map: + * Do a check when the kernel generates and returns a file descriptor for + * eBPF maps. + * + * @map: bpf map that we want to access + * @mask: the access flags + * + * @bpf_prog: + * Do a check when the kernel generates and returns a file descriptor for + * eBPF programs. + * + * @prog: bpf prog that userspace wants to use. + * + * @bpf_map_alloc_security: + * Initialize the security field inside bpf map. + * + * @bpf_map_free_security: + * Clean up the security information stored inside bpf map. + * + * @bpf_prog_alloc_security: + * Initialize the security field inside bpf program. + * + * @bpf_prog_free_security: + * Clean up the security information stored inside bpf prog.
+ * */ union security_list_options { int (*binder_set_context_mgr)(struct task_struct *mgr); @@ -1682,6 +1716,17 @@ union security_list_options { struct audit_context *actx); void (*audit_rule_free)(void *lsmrule); #endif /* CONFIG_AUDIT */ + +#ifdef CONFIG_BPF_SYSCALL + int (*bpf)(int cmd, union bpf_attr *attr, + unsigned int size); + int (*bpf_map)(struct bpf_map *map, fmode_t fmode); + int (*bpf_prog)(struct bpf_prog *prog); + int (*bpf_map_alloc_security)(struct bpf_map *map); + void (*bpf_map_free_security)(struct bpf_map *map); + int (*bpf_prog_alloc_security)(struct bpf_prog_aux *aux); + void (*bpf_prog_free_security)(struct bpf_prog_aux *aux); +#endif /* CONFIG_BPF_SYSCALL */ }; struct security_hook_heads { @@ -1901,6 +1946,15 @@ struct security_hook_heads { struct list_head audit_rule_match; struct list_head audit_rule_free; #endif /* CONFIG_AUDIT */ +#ifdef CONFIG_BPF_SYSCALL + struct list_head bpf; + struct list_head bpf_map; + struct list_head bpf_prog; + struct list_head bpf_map_alloc_security; + struct list_head bpf_map_free_security; + struct list_head bpf_prog_alloc_security; + struct list_head bpf_prog_free_security; +#endif /* CONFIG_BPF_SYSCALL */ } __randomize_layout; /* diff --git a/include/linux/security.h b/include/linux/security.h index ce6265960d6c..18800b0911e5 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -31,6 +31,7 @@ #include #include #include +#include struct linux_binprm; struct cred; @@ -1730,6 +1731,50 @@ static inline void securityfs_remove(struct dentry *dentry) #endif +#ifdef CONFIG_BPF_SYSCALL +#ifdef CONFIG_SECURITY +extern int security_bpf(int cmd, union bpf_attr *attr, unsigned int size); +extern int security_bpf_map(struct bpf_map *map, fmode_t fmode); +extern int security_bpf_prog(struct bpf_prog *prog); +extern int security_bpf_map_alloc(struct bpf_map *map); +extern void security_bpf_map_free(struct bpf_map *map); +extern int security_bpf_prog_alloc(struct bpf_prog_aux *aux); +extern void security_bpf_prog_free(struct bpf_prog_aux *aux); +#else +static inline int security_bpf(int cmd, union bpf_attr *attr, + unsigned int size) +{ + return 0; +} + +static inline int security_bpf_map(struct bpf_map *map, fmode_t fmode) +{ + return 0; +} + +static inline int security_bpf_prog(struct bpf_prog *prog) +{ + return 0; +} + +static inline int security_bpf_map_alloc(struct bpf_map *map) +{ + return 0; +} + +static inline void security_bpf_map_free(struct bpf_map *map) +{ } + +static inline int security_bpf_prog_alloc(struct bpf_prog_aux *aux) +{ + return 0; +} + +static inline void security_bpf_prog_free(struct bpf_prog_aux *aux) +{ } +#endif /* CONFIG_SECURITY */ +#endif /* CONFIG_BPF_SYSCALL */ + #ifdef CONFIG_SECURITY static inline char *alloc_secdata(void) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 676a06e6b322..5cb56d06b48d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -212,6 +212,7 @@ static void bpf_map_free_deferred(struct work_struct *work) struct bpf_map *map = container_of(work, struct bpf_map, work); bpf_map_uncharge_memlock(map); + security_bpf_map_free(map); /* implementation dependent freeing */ map->ops->map_free(map); } @@ -325,6 +326,12 @@ static const struct file_operations bpf_map_fops = { int bpf_map_new_fd(struct bpf_map *map, int flags) { + int ret; + + ret = security_bpf_map(map, OPEN_FMODE(flags)); + if (ret < 0) + return ret; + return anon_inode_getfd("bpf-map", &bpf_map_fops, map, flags | O_CLOEXEC); } @@ -405,10 +412,14 @@ static int map_create(union bpf_attr 
*attr) atomic_set(&map->refcnt, 1); atomic_set(&map->usercnt, 1); - err = bpf_map_charge_memlock(map); + err = security_bpf_map_alloc(map); if (err) goto free_map_nouncharge; + err = bpf_map_charge_memlock(map); + if (err) + goto free_map_sec; + err = bpf_map_alloc_id(map); if (err) goto free_map; @@ -430,6 +441,8 @@ static int map_create(union bpf_attr *attr) free_map: bpf_map_uncharge_memlock(map); +free_map_sec: + security_bpf_map_free(map); free_map_nouncharge: map->ops->map_free(map); return err; @@ -914,6 +927,7 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu) free_used_maps(aux); bpf_prog_uncharge_memlock(aux->prog); + security_bpf_prog_free(aux); bpf_prog_free(aux->prog); } @@ -972,6 +986,12 @@ static const struct file_operations bpf_prog_fops = { int bpf_prog_new_fd(struct bpf_prog *prog) { + int ret; + + ret = security_bpf_prog(prog); + if (ret < 0) + return ret; + return anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC); } @@ -1111,10 +1131,14 @@ static int bpf_prog_load(union bpf_attr *attr) if (!prog) return -ENOMEM; - err = bpf_prog_charge_memlock(prog); + err = security_bpf_prog_alloc(prog->aux); if (err) goto free_prog_nouncharge; + err = bpf_prog_charge_memlock(prog); + if (err) + goto free_prog_sec; + prog->len = attr->insn_cnt; err = -EFAULT; @@ -1172,6 +1196,8 @@ free_used_maps: free_used_maps(prog->aux); free_prog: bpf_prog_uncharge_memlock(prog); +free_prog_sec: + security_bpf_prog_free(prog->aux); free_prog_nouncharge: bpf_prog_free(prog); return err; @@ -1640,6 +1666,10 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz if (copy_from_user(&attr, uattr, size) != 0) return -EFAULT; + err = security_bpf(cmd, &attr, size); + if (err < 0) + return err; + switch (cmd) { case BPF_MAP_CREATE: err = map_create(&attr); diff --git a/security/security.c b/security/security.c index 4bf0f571b4ef..1cd8526cb0b7 100644 --- a/security/security.c +++ b/security/security.c @@ -12,6 +12,7 @@ * (at your option) any later version. */ +#include #include #include #include @@ -1703,3 +1704,34 @@ int security_audit_rule_match(u32 secid, u32 field, u32 op, void *lsmrule, actx); } #endif /* CONFIG_AUDIT */ + +#ifdef CONFIG_BPF_SYSCALL +int security_bpf(int cmd, union bpf_attr *attr, unsigned int size) +{ + return call_int_hook(bpf, 0, cmd, attr, size); +} +int security_bpf_map(struct bpf_map *map, fmode_t fmode) +{ + return call_int_hook(bpf_map, 0, map, fmode); +} +int security_bpf_prog(struct bpf_prog *prog) +{ + return call_int_hook(bpf_prog, 0, prog); +} +int security_bpf_map_alloc(struct bpf_map *map) +{ + return call_int_hook(bpf_map_alloc_security, 0, map); +} +int security_bpf_prog_alloc(struct bpf_prog_aux *aux) +{ + return call_int_hook(bpf_prog_alloc_security, 0, aux); +} +void security_bpf_map_free(struct bpf_map *map) +{ + call_void_hook(bpf_map_free_security, map); +} +void security_bpf_prog_free(struct bpf_prog_aux *aux) +{ + call_void_hook(bpf_prog_free_security, aux); +} +#endif /* CONFIG_BPF_SYSCALL */ -- cgit v1.2.3 From f66e448cfda021b0bcd884f26709796fe19c7cc1 Mon Sep 17 00:00:00 2001 From: Chenbo Feng Date: Wed, 18 Oct 2017 13:00:26 -0700 Subject: selinux: bpf: Add additional check for bpf object file receive Introduce a bpf object related check when sending and receiving files through a unix domain socket as well as binder. It checks if the receiving process has privilege to read/write the bpf map or use the bpf program.
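To make the guarded transfer concrete (this example is not part of the patch): a BPF object fd crosses a process boundary as an ordinary SCM_RIGHTS control message on an AF_UNIX socket, so a minimal userspace sketch of the sending side might look as follows, where send_bpf_fd() and its arguments are illustrative names only.

#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

/* Hand a BPF object fd (map or prog) to a peer process; on the
 * receiving side this is the path where selinux_file_receive() can
 * now apply the bpf-specific checks introduced by this patch.
 */
static int send_bpf_fd(int unix_sock, int bpf_fd)
{
	char dummy = '*';
	struct iovec iov = { .iov_base = &dummy, .iov_len = 1 };
	union {
		struct cmsghdr hdr;
		char buf[CMSG_SPACE(sizeof(int))];
	} u;
	struct msghdr msg;
	struct cmsghdr *cmsg;

	memset(&u, 0, sizeof(u));
	memset(&msg, 0, sizeof(msg));
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = u.buf;
	msg.msg_controllen = sizeof(u.buf);

	cmsg = CMSG_FIRSTHDR(&msg);
	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_RIGHTS;
	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
	memcpy(CMSG_DATA(cmsg), &bpf_fd, sizeof(int));

	/* One byte of real payload must accompany the control message. */
	return sendmsg(unix_sock, &msg, 0) == 1 ? 0 : -1;
}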
This check is necessary because the bpf maps and programs are using an anonymous inode as their shared inode, so the normal way of checking files and sockets when they are passed between processes cannot work properly on eBPF objects. This check only works when BPF_SYSCALL is configured. Signed-off-by: Chenbo Feng Acked-by: Stephen Smalley Reviewed-by: James Morris Signed-off-by: David S. Miller --- include/linux/bpf.h | 3 +++ kernel/bpf/syscall.c | 4 ++-- security/selinux/hooks.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 54 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 84c192da3e0b..1e334b248ff6 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -288,6 +288,9 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, #ifdef CONFIG_BPF_SYSCALL DECLARE_PER_CPU(int, bpf_prog_active); +extern const struct file_operations bpf_map_fops; +extern const struct file_operations bpf_prog_fops; + #define BPF_PROG_TYPE(_id, _name) \ extern const struct bpf_prog_ops _name ## _prog_ops; \ extern const struct bpf_verifier_ops _name ## _verifier_ops; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 5cb56d06b48d..323be2473c4b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -315,7 +315,7 @@ static ssize_t bpf_dummy_write(struct file *filp, const char __user *buf, return -EINVAL; } -static const struct file_operations bpf_map_fops = { +const struct file_operations bpf_map_fops = { #ifdef CONFIG_PROC_FS .show_fdinfo = bpf_map_show_fdinfo, #endif @@ -975,7 +975,7 @@ static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp) } #endif -static const struct file_operations bpf_prog_fops = { +const struct file_operations bpf_prog_fops = { #ifdef CONFIG_PROC_FS .show_fdinfo = bpf_prog_show_fdinfo, #endif diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 12cf7de8cbed..2e3a627fc0b1 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -1815,6 +1815,10 @@ static inline int file_path_has_perm(const struct cred *cred, return inode_has_perm(cred, file_inode(file), av, &ad); } +#ifdef CONFIG_BPF_SYSCALL +static int bpf_fd_pass(struct file *file, u32 sid); +#endif + /* Check whether a task can use an open file descriptor to access an inode in a given way. Check access to the descriptor itself, and then use dentry_has_perm to @@ -1845,6 +1849,12 @@ static int file_has_perm(const struct cred *cred, goto out; } +#ifdef CONFIG_BPF_SYSCALL + rc = bpf_fd_pass(file, cred_sid(cred)); + if (rc) + return rc; +#endif + /* av is zero if only checking access to the descriptor. */ rc = 0; if (av) @@ -2165,6 +2175,12 @@ static int selinux_binder_transfer_file(struct task_struct *from, return rc; } +#ifdef CONFIG_BPF_SYSCALL + rc = bpf_fd_pass(file, sid); + if (rc) + return rc; +#endif + if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; @@ -6288,6 +6304,39 @@ static u32 bpf_map_fmode_to_av(fmode_t fmode) return av; } +/* This function will check the file passed through a unix socket or binder to see + * if it is a bpf related object, and apply corresponding checks on the bpf + * object based on the type. The bpf maps and programs, unlike other files and + * sockets, are using a shared anonymous inode inside the kernel as their inode.
+ * So checking that inode cannot identify if the process has privilege to + * access the bpf object, and that's why we have to add this additional check in + * selinux_file_receive and selinux_binder_transfer_files. + */ +static int bpf_fd_pass(struct file *file, u32 sid) +{ + struct bpf_security_struct *bpfsec; + struct bpf_prog *prog; + struct bpf_map *map; + int ret; + + if (file->f_op == &bpf_map_fops) { + map = file->private_data; + bpfsec = map->security; + ret = avc_has_perm(sid, bpfsec->sid, SECCLASS_BPF, + bpf_map_fmode_to_av(file->f_mode), NULL); + if (ret) + return ret; + } else if (file->f_op == &bpf_prog_fops) { + prog = file->private_data; + bpfsec = prog->aux->security; + ret = avc_has_perm(sid, bpfsec->sid, SECCLASS_BPF, + BPF__PROG_RUN, NULL); + if (ret) + return ret; + } + return 0; +} + static int selinux_bpf_map(struct bpf_map *map, fmode_t fmode) { u32 sid = current_sid(); -- cgit v1.2.3 From b65f164d37cf6d4aac59b0e13c2e5c4cfe293fd2 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 19 Oct 2017 09:31:43 +0200 Subject: ipv6: let trace_fib6_table_lookup() dereference the fib table The perf traces for the ipv6 routing code show a relevant cost around trace_fib6_table_lookup(), even when no trace is enabled. This is due to the fib6_table dereferencing currently performed by the caller. Let the tracing code pay this overhead instead, passing the table pointer to the trace helper. This gives a small but measurable performance improvement under UDP flood. Signed-off-by: Paolo Abeni Acked-by: Steven Rostedt (VMware) Acked-by: David Ahern Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- include/trace/events/fib6.h | 6 +++--- net/ipv6/route.c | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/trace/events/fib6.h b/include/trace/events/fib6.h index d60096cddb2a..b34bed17abc7 100644 --- a/include/trace/events/fib6.h +++ b/include/trace/events/fib6.h @@ -12,9 +12,9 @@ TRACE_EVENT(fib6_table_lookup, TP_PROTO(const struct net *net, const struct rt6_info *rt, - u32 tb_id, const struct flowi6 *flp), + struct fib6_table *table, const struct flowi6 *flp), - TP_ARGS(net, rt, tb_id, flp), + TP_ARGS(net, rt, table, flp), TP_STRUCT__entry( __field( u32, tb_id ) @@ -34,7 +34,7 @@ TRACE_EVENT(fib6_table_lookup, TP_fast_assign( struct in6_addr *in6; - __entry->tb_id = tb_id; + __entry->tb_id = table->tb6_id; __entry->oif = flp->flowi6_oif; __entry->iif = flp->flowi6_iif; __entry->tos = ip6_tclass(flp->flowlabel); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 074fac966018..46c59a53c53f 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -945,7 +945,7 @@ restart: rcu_read_unlock(); - trace_fib6_table_lookup(net, rt, table->tb6_id, fl6); + trace_fib6_table_lookup(net, rt, table, fl6); return rt; @@ -1682,7 +1682,7 @@ redo_rt6_select: if (rt == net->ipv6.ip6_null_entry) { rcu_read_unlock(); dst_hold(&rt->dst); - trace_fib6_table_lookup(net, rt, table->tb6_id, fl6); + trace_fib6_table_lookup(net, rt, table, fl6); return rt; } else if (rt->rt6i_flags & RTF_CACHE) { if (ip6_hold_safe(net, &rt, true)) { @@ -1690,7 +1690,7 @@ redo_rt6_select: rt6_dst_from_metrics_check(rt); } rcu_read_unlock(); - trace_fib6_table_lookup(net, rt, table->tb6_id, fl6); + trace_fib6_table_lookup(net, rt, table, fl6); return rt; } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) && !(rt->rt6i_flags & RTF_GATEWAY))) { @@ -1726,7 +1726,7 @@ redo_rt6_select: } uncached_rt_out: - trace_fib6_table_lookup(net, uncached_rt, table->tb6_id, fl6);
+ trace_fib6_table_lookup(net, uncached_rt, table, fl6); return uncached_rt; } else { @@ -1754,7 +1754,7 @@ uncached_rt_out: } local_bh_enable(); rcu_read_unlock(); - trace_fib6_table_lookup(net, pcpu_rt, table->tb6_id, fl6); + trace_fib6_table_lookup(net, pcpu_rt, table, fl6); return pcpu_rt; } } @@ -2195,7 +2195,7 @@ out: rcu_read_unlock(); - trace_fib6_table_lookup(net, rt, table->tb6_id, fl6); + trace_fib6_table_lookup(net, rt, table, fl6); return rt; }; -- cgit v1.2.3 From 8c4083b30e56fc71b0e94c26374b32d95d5ea461 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 19 Oct 2017 15:50:29 +0200 Subject: net: sched: add block bind/unbind notif. and extended block_get/put Introduce a new type of ndo_setup_tc message to propagate binding/unbinding of a block to the driver. Call this ndo whenever a qdisc gets/puts a block. Alongside this, the binder type needs to be propagated from the qdisc code down to the notifier, so introduce extended variants of block_get/put in order to pass this info. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + include/net/pkt_cls.h | 40 +++++++++++++++++++++++++++++++++ net/sched/cls_api.c | 56 ++++++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 94 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index bf014afcb914..4de5b08ee0fb 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -775,6 +775,7 @@ enum tc_setup_type { TC_SETUP_CLSFLOWER, TC_SETUP_CLSMATCHALL, TC_SETUP_CLSBPF, + TC_SETUP_BLOCK, }; /* These structures hold the attributes of xdp state that are being passed diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 49a143e0fe65..41bc7d774047 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -17,13 +17,27 @@ struct tcf_walker { int register_tcf_proto_ops(struct tcf_proto_ops *ops); int unregister_tcf_proto_ops(struct tcf_proto_ops *ops); +enum tcf_block_binder_type { + TCF_BLOCK_BINDER_TYPE_UNSPEC, +}; + +struct tcf_block_ext_info { + enum tcf_block_binder_type binder_type; +}; + #ifdef CONFIG_NET_CLS struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index, bool create); void tcf_chain_put(struct tcf_chain *chain); int tcf_block_get(struct tcf_block **p_block, struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q); +int tcf_block_get_ext(struct tcf_block **p_block, + struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q, + struct tcf_block_ext_info *ei); void tcf_block_put(struct tcf_block *block); +void tcf_block_put_ext(struct tcf_block *block, + struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q, + struct tcf_block_ext_info *ei); static inline struct Qdisc *tcf_block_q(struct tcf_block *block) { @@ -46,10 +60,25 @@ int tcf_block_get(struct tcf_block **p_block, return 0; } +static inline +int tcf_block_get_ext(struct tcf_block **p_block, + struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q, + struct tcf_block_ext_info *ei) +{ + return 0; +} + static inline void tcf_block_put(struct tcf_block *block) { } +static inline +void tcf_block_put_ext(struct tcf_block *block, + struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q, + struct tcf_block_ext_info *ei) +{ +} + static inline struct Qdisc *tcf_block_q(struct tcf_block *block) { return NULL; @@ -434,6 +463,17 @@ tcf_match_indev(struct sk_buff *skb, int ifindex) int tc_setup_cb_call(struct tcf_exts *exts, enum tc_setup_type type, void *type_data, bool err_stop); +enum tc_block_command { + TC_BLOCK_BIND,
+ TC_BLOCK_UNBIND, +}; + +struct tc_block_offload { + enum tc_block_command command; + enum tcf_block_binder_type binder_type; + struct tcf_block *block; +}; + struct tc_cls_common_offload { u32 chain_index; __be16 protocol; diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 2e8e87fd9d97..92dce26d10e3 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -240,8 +240,36 @@ tcf_chain_filter_chain_ptr_set(struct tcf_chain *chain, chain->p_filter_chain = p_filter_chain; } -int tcf_block_get(struct tcf_block **p_block, - struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q) +static void tcf_block_offload_cmd(struct tcf_block *block, struct Qdisc *q, + struct tcf_block_ext_info *ei, + enum tc_block_command command) +{ + struct net_device *dev = q->dev_queue->dev; + struct tc_block_offload bo = {}; + + if (!tc_can_offload(dev)) + return; + bo.command = command; + bo.binder_type = ei->binder_type; + bo.block = block; + dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo); +} + +static void tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q, + struct tcf_block_ext_info *ei) +{ + tcf_block_offload_cmd(block, q, ei, TC_BLOCK_BIND); +} + +static void tcf_block_offload_unbind(struct tcf_block *block, struct Qdisc *q, + struct tcf_block_ext_info *ei) +{ + tcf_block_offload_cmd(block, q, ei, TC_BLOCK_UNBIND); +} + +int tcf_block_get_ext(struct tcf_block **p_block, + struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q, + struct tcf_block_ext_info *ei) { struct tcf_block *block = kzalloc(sizeof(*block), GFP_KERNEL); struct tcf_chain *chain; @@ -259,6 +287,7 @@ int tcf_block_get(struct tcf_block **p_block, tcf_chain_filter_chain_ptr_set(chain, p_filter_chain); block->net = qdisc_net(q); block->q = q; + tcf_block_offload_bind(block, q, ei); *p_block = block; return 0; @@ -266,15 +295,28 @@ err_chain_create: kfree(block); return err; } +EXPORT_SYMBOL(tcf_block_get_ext); + +int tcf_block_get(struct tcf_block **p_block, + struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q) +{ + struct tcf_block_ext_info ei = {0, }; + + return tcf_block_get_ext(p_block, p_filter_chain, q, &ei); +} EXPORT_SYMBOL(tcf_block_get); -void tcf_block_put(struct tcf_block *block) +void tcf_block_put_ext(struct tcf_block *block, + struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q, + struct tcf_block_ext_info *ei) { struct tcf_chain *chain, *tmp; if (!block) return; + tcf_block_offload_unbind(block, q, ei); + /* XXX: Standalone actions are not allowed to jump to any chain, and * bound actions should be all removed after flushing. However, * filters are destroyed in RCU callbacks, we have to hold the chains @@ -302,6 +344,14 @@ void tcf_block_put(struct tcf_block *block) tcf_chain_put(chain); kfree(block); } +EXPORT_SYMBOL(tcf_block_put_ext); + +void tcf_block_put(struct tcf_block *block) +{ + struct tcf_block_ext_info ei = {0, }; + + tcf_block_put_ext(block, NULL, block->q, &ei); +} EXPORT_SYMBOL(tcf_block_put); /* Main classifier routine: scans classifier chain attached -- cgit v1.2.3 From 6e40cf2d4dee9dc22ff398041ce876bef8172dea Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 19 Oct 2017 15:50:30 +0200 Subject: net: sched: use extended variants of block_get/put in ingress and clsact qdiscs Use the previously introduced extended variants of the block get and put functions. This allows specifying binder types specific to clsact ingress/egress, which is useful for drivers to distinguish who actually got the block. Signed-off-by: Jiri Pirko Signed-off-by: David S.
Miller --- include/net/pkt_cls.h | 2 ++ net/sched/sch_ingress.c | 36 +++++++++++++++++++++++++++++------- 2 files changed, 31 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 41bc7d774047..5c50af8f7183 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -19,6 +19,8 @@ int unregister_tcf_proto_ops(struct tcf_proto_ops *ops); enum tcf_block_binder_type { TCF_BLOCK_BINDER_TYPE_UNSPEC, + TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS, + TCF_BLOCK_BINDER_TYPE_CLSACT_EGRESS, }; struct tcf_block_ext_info { diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c index 9ccc1b89b0d9..b599db26d34b 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -20,6 +20,7 @@ struct ingress_sched_data { struct tcf_block *block; + struct tcf_block_ext_info block_info; }; static struct Qdisc *ingress_leaf(struct Qdisc *sch, unsigned long arg) @@ -59,7 +60,10 @@ static int ingress_init(struct Qdisc *sch, struct nlattr *opt) struct net_device *dev = qdisc_dev(sch); int err; - err = tcf_block_get(&q->block, &dev->ingress_cl_list, sch); + q->block_info.binder_type = TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS; + + err = tcf_block_get_ext(&q->block, &dev->ingress_cl_list, + sch, &q->block_info); if (err) return err; @@ -72,8 +76,10 @@ static int ingress_init(struct Qdisc *sch, struct nlattr *opt) static void ingress_destroy(struct Qdisc *sch) { struct ingress_sched_data *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); - tcf_block_put(q->block); + tcf_block_put_ext(q->block, &dev->ingress_cl_list, + sch, &q->block_info); net_dec_ingress_queue(); } @@ -114,6 +120,8 @@ static struct Qdisc_ops ingress_qdisc_ops __read_mostly = { struct clsact_sched_data { struct tcf_block *ingress_block; struct tcf_block *egress_block; + struct tcf_block_ext_info ingress_block_info; + struct tcf_block_ext_info egress_block_info; }; static unsigned long clsact_find(struct Qdisc *sch, u32 classid) @@ -153,13 +161,19 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt) struct net_device *dev = qdisc_dev(sch); int err; - err = tcf_block_get(&q->ingress_block, &dev->ingress_cl_list, sch); + q->ingress_block_info.binder_type = TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS; + + err = tcf_block_get_ext(&q->ingress_block, &dev->ingress_cl_list, + sch, &q->ingress_block_info); if (err) return err; - err = tcf_block_get(&q->egress_block, &dev->egress_cl_list, sch); + q->egress_block_info.binder_type = TCF_BLOCK_BINDER_TYPE_CLSACT_EGRESS; + + err = tcf_block_get_ext(&q->egress_block, &dev->egress_cl_list, + sch, &q->egress_block_info); if (err) - return err; + goto err_egress_block_get; net_inc_ingress_queue(); net_inc_egress_queue(); @@ -167,14 +181,22 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt) sch->flags |= TCQ_F_CPUSTATS; return 0; + +err_egress_block_get: + tcf_block_put_ext(q->ingress_block, &dev->ingress_cl_list, + sch, &q->ingress_block_info); + return err; } static void clsact_destroy(struct Qdisc *sch) { struct clsact_sched_data *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); - tcf_block_put(q->egress_block); - tcf_block_put(q->ingress_block); + tcf_block_put_ext(q->egress_block, &dev->egress_cl_list, + sch, &q->egress_block_info); + tcf_block_put_ext(q->ingress_block, &dev->ingress_cl_list, + sch, &q->ingress_block_info); net_dec_ingress_queue(); net_dec_egress_queue(); -- cgit v1.2.3 From acb674428c3d57bccbe3f4a1a7a009f6d73e9f41 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 19 Oct 2017 
15:50:31 +0200 Subject: net: sched: introduce per-block callbacks Introduce infrastructure that allows drivers to register callbacks that are called whenever tc would offload inserted rule for a specific block. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 81 +++++++++++++++++++++++++++++++++++ include/net/sch_generic.h | 1 + net/sched/cls_api.c | 105 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 187 insertions(+) (limited to 'include') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 5c50af8f7183..4bc6b1cc245d 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -27,6 +27,8 @@ struct tcf_block_ext_info { enum tcf_block_binder_type binder_type; }; +struct tcf_block_cb; + #ifdef CONFIG_NET_CLS struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index, bool create); @@ -51,6 +53,21 @@ static inline struct net_device *tcf_block_dev(struct tcf_block *block) return tcf_block_q(block)->dev_queue->dev; } +void *tcf_block_cb_priv(struct tcf_block_cb *block_cb); +struct tcf_block_cb *tcf_block_cb_lookup(struct tcf_block *block, + tc_setup_cb_t *cb, void *cb_ident); +void tcf_block_cb_incref(struct tcf_block_cb *block_cb); +unsigned int tcf_block_cb_decref(struct tcf_block_cb *block_cb); +struct tcf_block_cb *__tcf_block_cb_register(struct tcf_block *block, + tc_setup_cb_t *cb, void *cb_ident, + void *cb_priv); +int tcf_block_cb_register(struct tcf_block *block, + tc_setup_cb_t *cb, void *cb_ident, + void *cb_priv); +void __tcf_block_cb_unregister(struct tcf_block_cb *block_cb); +void tcf_block_cb_unregister(struct tcf_block *block, + tc_setup_cb_t *cb, void *cb_ident); + int tcf_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res, bool compat_mode); @@ -91,6 +108,70 @@ static inline struct net_device *tcf_block_dev(struct tcf_block *block) return NULL; } +static inline +int tc_setup_cb_block_register(struct tcf_block *block, tc_setup_cb_t *cb, + void *cb_priv) +{ + return 0; +} + +static inline +void tc_setup_cb_block_unregister(struct tcf_block *block, tc_setup_cb_t *cb, + void *cb_priv) +{ +} + +static inline +void *tcf_block_cb_priv(struct tcf_block_cb *block_cb) +{ + return NULL; +} + +static inline +struct tcf_block_cb *tcf_block_cb_lookup(struct tcf_block *block, + tc_setup_cb_t *cb, void *cb_ident) +{ + return NULL; +} + +static inline +void tcf_block_cb_incref(struct tcf_block_cb *block_cb) +{ +} + +static inline +unsigned int tcf_block_cb_decref(struct tcf_block_cb *block_cb) +{ + return 0; +} + +static inline +struct tcf_block_cb *__tcf_block_cb_register(struct tcf_block *block, + tc_setup_cb_t *cb, void *cb_ident, + void *cb_priv) +{ + return NULL; +} + +static inline +int tcf_block_cb_register(struct tcf_block *block, + tc_setup_cb_t *cb, void *cb_ident, + void *cb_priv) +{ + return 0; +} + +static inline +void __tcf_block_cb_unregister(struct tcf_block_cb *block_cb) +{ +} + +static inline +void tcf_block_cb_unregister(struct tcf_block *block, + tc_setup_cb_t *cb, void *cb_ident) +{ +} + static inline int tcf_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res, bool compat_mode) { diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 0aea9e23e97a..031dffd5836c 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -272,6 +272,7 @@ struct tcf_block { struct list_head chain_list; struct net *net; struct Qdisc *q; + struct list_head cb_list; }; static inline void qdisc_cb_private_validate(const 
struct sk_buff *skb, int sz) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 92dce26d10e3..b16c79c47be5 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -278,6 +278,8 @@ int tcf_block_get_ext(struct tcf_block **p_block, if (!block) return -ENOMEM; INIT_LIST_HEAD(&block->chain_list); + INIT_LIST_HEAD(&block->cb_list); + /* Create chain 0 by default, it has to be always present. */ chain = tcf_chain_create(block, 0); if (!chain) { @@ -354,6 +356,109 @@ void tcf_block_put(struct tcf_block *block) } EXPORT_SYMBOL(tcf_block_put); +struct tcf_block_cb { + struct list_head list; + tc_setup_cb_t *cb; + void *cb_ident; + void *cb_priv; + unsigned int refcnt; +}; + +void *tcf_block_cb_priv(struct tcf_block_cb *block_cb) +{ + return block_cb->cb_priv; +} +EXPORT_SYMBOL(tcf_block_cb_priv); + +struct tcf_block_cb *tcf_block_cb_lookup(struct tcf_block *block, + tc_setup_cb_t *cb, void *cb_ident) +{ struct tcf_block_cb *block_cb; + + list_for_each_entry(block_cb, &block->cb_list, list) + if (block_cb->cb == cb && block_cb->cb_ident == cb_ident) + return block_cb; + return NULL; +} +EXPORT_SYMBOL(tcf_block_cb_lookup); + +void tcf_block_cb_incref(struct tcf_block_cb *block_cb) +{ + block_cb->refcnt++; +} +EXPORT_SYMBOL(tcf_block_cb_incref); + +unsigned int tcf_block_cb_decref(struct tcf_block_cb *block_cb) +{ + return --block_cb->refcnt; +} +EXPORT_SYMBOL(tcf_block_cb_decref); + +struct tcf_block_cb *__tcf_block_cb_register(struct tcf_block *block, + tc_setup_cb_t *cb, void *cb_ident, + void *cb_priv) +{ + struct tcf_block_cb *block_cb; + + block_cb = kzalloc(sizeof(*block_cb), GFP_KERNEL); + if (!block_cb) + return NULL; + block_cb->cb = cb; + block_cb->cb_ident = cb_ident; + block_cb->cb_priv = cb_priv; + list_add(&block_cb->list, &block->cb_list); + return block_cb; +} +EXPORT_SYMBOL(__tcf_block_cb_register); + +int tcf_block_cb_register(struct tcf_block *block, + tc_setup_cb_t *cb, void *cb_ident, + void *cb_priv) +{ + struct tcf_block_cb *block_cb; + + block_cb = __tcf_block_cb_register(block, cb, cb_ident, cb_priv); + return block_cb ? 0 : -ENOMEM; +} +EXPORT_SYMBOL(tcf_block_cb_register); + +void __tcf_block_cb_unregister(struct tcf_block_cb *block_cb) +{ + list_del(&block_cb->list); + kfree(block_cb); +} +EXPORT_SYMBOL(__tcf_block_cb_unregister); + +void tcf_block_cb_unregister(struct tcf_block *block, + tc_setup_cb_t *cb, void *cb_ident) +{ + struct tcf_block_cb *block_cb; + + block_cb = tcf_block_cb_lookup(block, cb, cb_ident); + if (!block_cb) + return; + __tcf_block_cb_unregister(block_cb); +} +EXPORT_SYMBOL(tcf_block_cb_unregister); + +static int tcf_block_cb_call(struct tcf_block *block, enum tc_setup_type type, + void *type_data, bool err_stop) +{ + struct tcf_block_cb *block_cb; + int ok_count = 0; + int err; + + list_for_each_entry(block_cb, &block->cb_list, list) { + err = block_cb->cb(type, type_data, block_cb->cb_priv); + if (err) { + if (err_stop) + return err; + } else { + ok_count++; + } + } + return ok_count; +} + /* Main classifier routine: scans classifier chain attached * to this qdisc, (optionally) tests for protocol and asks * specific classifiers. -- cgit v1.2.3 From 208c0f4b5237f1d6611b2c679a8022d6901577d6 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 19 Oct 2017 15:50:32 +0200 Subject: net: sched: use tc_setup_cb_call to call per-block callbacks Extend the tc_setup_cb_call entrypoint function originally used only for action egress devices callbacks to call per-block callbacks as well. Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/net/pkt_cls.h | 4 ++-- net/sched/cls_api.c | 21 ++++++++++++++++++--- net/sched/cls_flower.c | 9 ++++++--- 3 files changed, 26 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 4bc6b1cc245d..fcca5a9d9880 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -543,8 +543,8 @@ tcf_match_indev(struct sk_buff *skb, int ifindex) } #endif /* CONFIG_NET_CLS_IND */ -int tc_setup_cb_call(struct tcf_exts *exts, enum tc_setup_type type, - void *type_data, bool err_stop); +int tc_setup_cb_call(struct tcf_block *block, struct tcf_exts *exts, + enum tc_setup_type type, void *type_data, bool err_stop); enum tc_block_command { TC_BLOCK_BIND, diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index b16c79c47be5..cdfdc24b89cf 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1206,10 +1206,25 @@ static int tc_exts_setup_cb_egdev_call(struct tcf_exts *exts, return ok_count; } -int tc_setup_cb_call(struct tcf_exts *exts, enum tc_setup_type type, - void *type_data, bool err_stop) +int tc_setup_cb_call(struct tcf_block *block, struct tcf_exts *exts, + enum tc_setup_type type, void *type_data, bool err_stop) { - return tc_exts_setup_cb_egdev_call(exts, type, type_data, err_stop); + int ok_count; + int ret; + + ret = tcf_block_cb_call(block, type, type_data, err_stop); + if (ret < 0) + return ret; + ok_count = ret; + + if (!exts) + return ok_count; + ret = tc_exts_setup_cb_egdev_call(exts, type, type_data, err_stop); + if (ret < 0) + return ret; + ok_count += ret; + + return ok_count; } EXPORT_SYMBOL(tc_setup_cb_call); diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 5b7bb968d1d4..76b4e0a1c92f 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -201,6 +201,7 @@ static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f) { struct tc_cls_flower_offload cls_flower = {}; struct net_device *dev = tp->q->dev_queue->dev; + struct tcf_block *block = tp->chain->block; tc_cls_common_offload_init(&cls_flower.common, tp); cls_flower.command = TC_CLSFLOWER_DESTROY; @@ -209,7 +210,7 @@ static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f) if (tc_can_offload(dev)) dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSFLOWER, &cls_flower); - tc_setup_cb_call(&f->exts, TC_SETUP_CLSFLOWER, + tc_setup_cb_call(block, &f->exts, TC_SETUP_CLSFLOWER, &cls_flower, false); } @@ -220,6 +221,7 @@ static int fl_hw_replace_filter(struct tcf_proto *tp, { struct net_device *dev = tp->q->dev_queue->dev; struct tc_cls_flower_offload cls_flower = {}; + struct tcf_block *block = tp->chain->block; bool skip_sw = tc_skip_sw(f->flags); int err; @@ -242,7 +244,7 @@ static int fl_hw_replace_filter(struct tcf_proto *tp, } } - err = tc_setup_cb_call(&f->exts, TC_SETUP_CLSFLOWER, + err = tc_setup_cb_call(block, &f->exts, TC_SETUP_CLSFLOWER, &cls_flower, skip_sw); if (err < 0) { fl_hw_destroy_filter(tp, f); @@ -261,6 +263,7 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f) { struct tc_cls_flower_offload cls_flower = {}; struct net_device *dev = tp->q->dev_queue->dev; + struct tcf_block *block = tp->chain->block; tc_cls_common_offload_init(&cls_flower.common, tp); cls_flower.command = TC_CLSFLOWER_STATS; @@ -270,7 +273,7 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f) if (tc_can_offload(dev)) dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSFLOWER, &cls_flower); - tc_setup_cb_call(&f->exts, TC_SETUP_CLSFLOWER, + 
tc_setup_cb_call(block, &f->exts, TC_SETUP_CLSFLOWER, &cls_flower, false); } -- cgit v1.2.3 From d58d31a118690b578897749feda48416ac10ca43 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 19 Oct 2017 15:50:47 +0200 Subject: net: sched: remove unused classid field from tc_cls_common_offload It is no longer used by the drivers, so remove it. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index fcca5a9d9880..04caa246e747 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -561,7 +561,6 @@ struct tc_cls_common_offload { u32 chain_index; __be16 protocol; u32 prio; - u32 classid; }; static inline void @@ -571,7 +570,6 @@ tc_cls_common_offload_init(struct tc_cls_common_offload *cls_common, cls_common->chain_index = tp->chain->index; cls_common->protocol = tp->protocol; cls_common->prio = tp->prio; - cls_common->classid = tp->classid; } struct tc_cls_u32_knode { -- cgit v1.2.3 From fa71212e91811ac67014ad19d4fc3b3c3446ccf7 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 19 Oct 2017 15:50:48 +0200 Subject: net: sched: remove unused is_classid_clsact_ingress/egress helpers These helpers are no longer in use by drivers, so remove them. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_sched.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'include') diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index 2d234af15f3e..b8ecafce4ba1 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -135,19 +135,6 @@ static inline unsigned int psched_mtu(const struct net_device *dev) return dev->mtu + dev->hard_header_len; } -static inline bool is_classid_clsact_ingress(u32 classid) -{ - /* This also returns true for ingress qdisc */ - return TC_H_MAJ(classid) == TC_H_MAJ(TC_H_CLSACT) && - TC_H_MIN(classid) != TC_H_MIN(TC_H_MIN_EGRESS); -} - -static inline bool is_classid_clsact_egress(u32 classid) -{ - return TC_H_MAJ(classid) == TC_H_MAJ(TC_H_CLSACT) && - TC_H_MIN(classid) == TC_H_MIN(TC_H_MIN_EGRESS); -} - static inline struct net *qdisc_net(struct Qdisc *q) { return dev_net(q->dev_queue->dev); -- cgit v1.2.3 From ff61b5e3f041c2f1aa8d7c700af3007889973889 Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Fri, 20 Oct 2017 10:23:37 +0300 Subject: drivers, net, mlx4: convert mlx4_cq.refcount from atomic_t to refcount_t atomic_t variables are currently used to implement reference counters with the following properties: - counter is initialized to 1 using atomic_set() - a resource is freed upon counter reaching zero - once counter reaches zero, its further increments aren't allowed - counter schema uses basic atomic operations (set, inc, inc_not_zero, dec_and_test, etc.) Such atomic variables should be converted to a newly provided refcount_t type and API that prevents accidental counter overflows and underflows. This is important since overflows and underflows can lead to use-after-free situation and be exploitable. The variable mlx4_cq.refcount is used as pure reference counter. Convert it to refcount_t and fix up the operations. Suggested-by: Kees Cook Reviewed-by: David Windsor Reviewed-by: Hans Liljestrand Signed-off-by: Elena Reshetova Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlx4/cq.c | 8 ++++---- include/linux/mlx4/device.h | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx4/cq.c b/drivers/net/ethernet/mellanox/mlx4/cq.c index 72eb50cd5ecd..d8e9a323122e 100644 --- a/drivers/net/ethernet/mellanox/mlx4/cq.c +++ b/drivers/net/ethernet/mellanox/mlx4/cq.c @@ -69,7 +69,7 @@ void mlx4_cq_tasklet_cb(unsigned long data) list_for_each_entry_safe(mcq, temp, &ctx->process_list, tasklet_ctx.list) { list_del_init(&mcq->tasklet_ctx.list); mcq->tasklet_ctx.comp(mcq); - if (atomic_dec_and_test(&mcq->refcount)) + if (refcount_dec_and_test(&mcq->refcount)) complete(&mcq->free); if (time_after(jiffies, end)) break; @@ -92,7 +92,7 @@ static void mlx4_add_cq_to_tasklet(struct mlx4_cq *cq) * still arrive. */ if (list_empty_careful(&cq->tasklet_ctx.list)) { - atomic_inc(&cq->refcount); + refcount_inc(&cq->refcount); kick = list_empty(&tasklet_ctx->list); list_add_tail(&cq->tasklet_ctx.list, &tasklet_ctx->list); if (kick) @@ -344,7 +344,7 @@ int mlx4_cq_alloc(struct mlx4_dev *dev, int nent, cq->cons_index = 0; cq->arm_sn = 1; cq->uar = uar; - atomic_set(&cq->refcount, 1); + refcount_set(&cq->refcount, 1); init_completion(&cq->free); cq->comp = mlx4_add_cq_to_tasklet; cq->tasklet_ctx.priv = @@ -386,7 +386,7 @@ void mlx4_cq_free(struct mlx4_dev *dev, struct mlx4_cq *cq) priv->eq_table.eq[MLX4_EQ_ASYNC].irq) synchronize_irq(priv->eq_table.eq[MLX4_EQ_ASYNC].irq); - if (atomic_dec_and_test(&cq->refcount)) + if (refcount_dec_and_test(&cq->refcount)) complete(&cq->free); wait_for_completion(&cq->free); diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index b0a57e043fa3..daac2e3a1a58 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -40,7 +40,7 @@ #include #include -#include +#include #include @@ -751,7 +751,7 @@ struct mlx4_cq { int cqn; unsigned vector; - atomic_t refcount; + refcount_t refcount; struct completion free; struct { struct list_head list; -- cgit v1.2.3 From 0068895ff845c38e9e2b65c002c53c623379e436 Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Fri, 20 Oct 2017 10:23:38 +0300 Subject: drivers, net, mlx4: convert mlx4_qp.refcount from atomic_t to refcount_t atomic_t variables are currently used to implement reference counters with the following properties: - counter is initialized to 1 using atomic_set() - a resource is freed upon counter reaching zero - once counter reaches zero, its further increments aren't allowed - counter schema uses basic atomic operations (set, inc, inc_not_zero, dec_and_test, etc.) Such atomic variables should be converted to a newly provided refcount_t type and API that prevents accidental counter overflows and underflows. This is important since overflows and underflows can lead to use-after-free situation and be exploitable. The variable mlx4_qp.refcount is used as pure reference counter. Convert it to refcount_t and fix up the operations. Suggested-by: Kees Cook Reviewed-by: David Windsor Reviewed-by: Hans Liljestrand Signed-off-by: Elena Reshetova Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlx4/qp.c | 8 ++++---- include/linux/mlx4/device.h | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx4/qp.c b/drivers/net/ethernet/mellanox/mlx4/qp.c index 203320923340..769598f7b6c8 100644 --- a/drivers/net/ethernet/mellanox/mlx4/qp.c +++ b/drivers/net/ethernet/mellanox/mlx4/qp.c @@ -55,7 +55,7 @@ void mlx4_qp_event(struct mlx4_dev *dev, u32 qpn, int event_type) qp = __mlx4_qp_lookup(dev, qpn); if (qp) - atomic_inc(&qp->refcount); + refcount_inc(&qp->refcount); spin_unlock(&qp_table->lock); @@ -66,7 +66,7 @@ void mlx4_qp_event(struct mlx4_dev *dev, u32 qpn, int event_type) qp->event(qp, event_type); - if (atomic_dec_and_test(&qp->refcount)) + if (refcount_dec_and_test(&qp->refcount)) complete(&qp->free); } @@ -420,7 +420,7 @@ int mlx4_qp_alloc(struct mlx4_dev *dev, int qpn, struct mlx4_qp *qp) if (err) goto err_icm; - atomic_set(&qp->refcount, 1); + refcount_set(&qp->refcount, 1); init_completion(&qp->free); return 0; @@ -520,7 +520,7 @@ EXPORT_SYMBOL_GPL(mlx4_qp_remove); void mlx4_qp_free(struct mlx4_dev *dev, struct mlx4_qp *qp) { - if (atomic_dec_and_test(&qp->refcount)) + if (refcount_dec_and_test(&qp->refcount)) complete(&qp->free); wait_for_completion(&qp->free); diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index daac2e3a1a58..b8e19c4d6caa 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -768,7 +768,7 @@ struct mlx4_qp { int qpn; - atomic_t refcount; + refcount_t refcount; struct completion free; u8 usage; }; -- cgit v1.2.3 From 17ac99b2b8d08ed40f4525491d6eff330329a6d2 Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Fri, 20 Oct 2017 10:23:39 +0300 Subject: drivers, net, mlx4: convert mlx4_srq.refcount from atomic_t to refcount_t atomic_t variables are currently used to implement reference counters with the following properties: - counter is initialized to 1 using atomic_set() - a resource is freed upon counter reaching zero - once counter reaches zero, its further increments aren't allowed - counter schema uses basic atomic operations (set, inc, inc_not_zero, dec_and_test, etc.) Such atomic variables should be converted to a newly provided refcount_t type and API that prevents accidental counter overflows and underflows. This is important since overflows and underflows can lead to use-after-free situation and be exploitable. The variable mlx4_srq.refcount is used as pure reference counter. Convert it to refcount_t and fix up the operations. Suggested-by: Kees Cook Reviewed-by: David Windsor Reviewed-by: Hans Liljestrand Signed-off-by: Elena Reshetova Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlx4/srq.c | 8 ++++---- include/linux/mlx4/device.h | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx4/srq.c b/drivers/net/ethernet/mellanox/mlx4/srq.c index bedf52126824..cbe4d9746ddf 100644 --- a/drivers/net/ethernet/mellanox/mlx4/srq.c +++ b/drivers/net/ethernet/mellanox/mlx4/srq.c @@ -49,7 +49,7 @@ void mlx4_srq_event(struct mlx4_dev *dev, u32 srqn, int event_type) srq = radix_tree_lookup(&srq_table->tree, srqn & (dev->caps.num_srqs - 1)); rcu_read_unlock(); if (srq) - atomic_inc(&srq->refcount); + refcount_inc(&srq->refcount); else { mlx4_warn(dev, "Async event for bogus SRQ %08x\n", srqn); return; @@ -57,7 +57,7 @@ void mlx4_srq_event(struct mlx4_dev *dev, u32 srqn, int event_type) srq->event(srq, event_type); - if (atomic_dec_and_test(&srq->refcount)) + if (refcount_dec_and_test(&srq->refcount)) complete(&srq->free); } @@ -203,7 +203,7 @@ int mlx4_srq_alloc(struct mlx4_dev *dev, u32 pdn, u32 cqn, u16 xrcd, if (err) goto err_radix; - atomic_set(&srq->refcount, 1); + refcount_set(&srq->refcount, 1); init_completion(&srq->free); return 0; @@ -232,7 +232,7 @@ void mlx4_srq_free(struct mlx4_dev *dev, struct mlx4_srq *srq) radix_tree_delete(&srq_table->tree, srq->srqn); spin_unlock_irq(&srq_table->lock); - if (atomic_dec_and_test(&srq->refcount)) + if (refcount_dec_and_test(&srq->refcount)) complete(&srq->free); wait_for_completion(&srq->free); diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index b8e19c4d6caa..a9b5fed8f7c6 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -781,7 +781,7 @@ struct mlx4_srq { int max_gs; int wqe_shift; - atomic_t refcount; + refcount_t refcount; struct completion free; }; -- cgit v1.2.3 From a4b51a9f83c6d359ff8fc0c66009283b6fdeeaf8 Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Fri, 20 Oct 2017 10:23:40 +0300 Subject: drivers, net, mlx5: convert mlx5_cq.refcount from atomic_t to refcount_t atomic_t variables are currently used to implement reference counters with the following properties: - counter is initialized to 1 using atomic_set() - a resource is freed upon counter reaching zero - once counter reaches zero, its further increments aren't allowed - counter schema uses basic atomic operations (set, inc, inc_not_zero, dec_and_test, etc.) Such atomic variables should be converted to a newly provided refcount_t type and API that prevents accidental counter overflows and underflows. This is important since overflows and underflows can lead to use-after-free situation and be exploitable. The variable mlx5_cq.refcount is used as pure reference counter. Convert it to refcount_t and fix up the operations. Suggested-by: Kees Cook Reviewed-by: David Windsor Reviewed-by: Hans Liljestrand Signed-off-by: Elena Reshetova Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlx5/core/cq.c | 16 ++++++++-------- include/linux/mlx5/cq.h | 4 ++-- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cq.c b/drivers/net/ethernet/mellanox/mlx5/core/cq.c index 336d4738b807..1016e05c7ec7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cq.c @@ -58,7 +58,7 @@ void mlx5_cq_tasklet_cb(unsigned long data) tasklet_ctx.list) { list_del_init(&mcq->tasklet_ctx.list); mcq->tasklet_ctx.comp(mcq); - if (atomic_dec_and_test(&mcq->refcount)) + if (refcount_dec_and_test(&mcq->refcount)) complete(&mcq->free); if (time_after(jiffies, end)) break; @@ -80,7 +80,7 @@ static void mlx5_add_cq_to_tasklet(struct mlx5_core_cq *cq) * still arrive. */ if (list_empty_careful(&cq->tasklet_ctx.list)) { - atomic_inc(&cq->refcount); + refcount_inc(&cq->refcount); list_add_tail(&cq->tasklet_ctx.list, &tasklet_ctx->list); } spin_unlock_irqrestore(&tasklet_ctx->lock, flags); @@ -94,7 +94,7 @@ void mlx5_cq_completion(struct mlx5_core_dev *dev, u32 cqn) spin_lock(&table->lock); cq = radix_tree_lookup(&table->tree, cqn); if (likely(cq)) - atomic_inc(&cq->refcount); + refcount_inc(&cq->refcount); spin_unlock(&table->lock); if (!cq) { @@ -106,7 +106,7 @@ void mlx5_cq_completion(struct mlx5_core_dev *dev, u32 cqn) cq->comp(cq); - if (atomic_dec_and_test(&cq->refcount)) + if (refcount_dec_and_test(&cq->refcount)) complete(&cq->free); } @@ -119,7 +119,7 @@ void mlx5_cq_event(struct mlx5_core_dev *dev, u32 cqn, int event_type) cq = radix_tree_lookup(&table->tree, cqn); if (cq) - atomic_inc(&cq->refcount); + refcount_inc(&cq->refcount); spin_unlock(&table->lock); @@ -130,7 +130,7 @@ void mlx5_cq_event(struct mlx5_core_dev *dev, u32 cqn, int event_type) cq->event(cq, event_type); - if (atomic_dec_and_test(&cq->refcount)) + if (refcount_dec_and_test(&cq->refcount)) complete(&cq->free); } @@ -159,7 +159,7 @@ int mlx5_core_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, cq->cqn = MLX5_GET(create_cq_out, out, cqn); cq->cons_index = 0; cq->arm_sn = 0; - atomic_set(&cq->refcount, 1); + refcount_set(&cq->refcount, 1); init_completion(&cq->free); if (!cq->comp) cq->comp = mlx5_add_cq_to_tasklet; @@ -222,7 +222,7 @@ int mlx5_core_destroy_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq) synchronize_irq(cq->irqn); mlx5_debug_cq_remove(dev, cq); - if (atomic_dec_and_test(&cq->refcount)) + if (refcount_dec_and_test(&cq->refcount)) complete(&cq->free); wait_for_completion(&cq->free); diff --git a/include/linux/mlx5/cq.h b/include/linux/mlx5/cq.h index 95898847c7d4..6a57ec2f1ef7 100644 --- a/include/linux/mlx5/cq.h +++ b/include/linux/mlx5/cq.h @@ -35,7 +35,7 @@ #include #include - +#include struct mlx5_core_cq { u32 cqn; @@ -43,7 +43,7 @@ struct mlx5_core_cq { __be32 *set_ci_db; __be32 *arm_db; struct mlx5_uars_page *uar; - atomic_t refcount; + refcount_t refcount; struct completion free; unsigned vector; unsigned int irqn; -- cgit v1.2.3 From e65f7ee39b4d7604a78b03ed35d723e1001fc241 Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Fri, 20 Oct 2017 10:23:49 +0300 Subject: drivers, connector: convert cn_callback_entry.refcnt from atomic_t to refcount_t atomic_t variables are currently used to implement reference counters with the following properties: - counter is initialized to 1 using atomic_set() - a resource is freed upon counter reaching zero - once counter reaches zero, its further increments aren't allowed - counter schema uses 
basic atomic operations (set, inc, inc_not_zero, dec_and_test, etc.) Such atomic variables should be converted to a newly provided refcount_t type and API that prevents accidental counter overflows and underflows. This is important since overflows and underflows can lead to use-after-free situation and be exploitable. The variable cn_callback_entry.refcnt is used as pure reference counter. Convert it to refcount_t and fix up the operations. Suggested-by: Kees Cook Reviewed-by: David Windsor Reviewed-by: Hans Liljestrand Signed-off-by: Elena Reshetova Signed-off-by: David S. Miller --- drivers/connector/cn_queue.c | 4 ++-- drivers/connector/connector.c | 2 +- include/linux/connector.h | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/connector/cn_queue.c b/drivers/connector/cn_queue.c index 1f8bf054d11c..9c54fdf7acea 100644 --- a/drivers/connector/cn_queue.c +++ b/drivers/connector/cn_queue.c @@ -45,7 +45,7 @@ cn_queue_alloc_callback_entry(struct cn_queue_dev *dev, const char *name, return NULL; } - atomic_set(&cbq->refcnt, 1); + refcount_set(&cbq->refcnt, 1); atomic_inc(&dev->refcnt); cbq->pdev = dev; @@ -58,7 +58,7 @@ cn_queue_alloc_callback_entry(struct cn_queue_dev *dev, const char *name, void cn_queue_release_callback(struct cn_callback_entry *cbq) { - if (!atomic_dec_and_test(&cbq->refcnt)) + if (!refcount_dec_and_test(&cbq->refcnt)) return; atomic_dec(&cbq->pdev->refcnt); diff --git a/drivers/connector/connector.c b/drivers/connector/connector.c index 25693b045371..8615594bd065 100644 --- a/drivers/connector/connector.c +++ b/drivers/connector/connector.c @@ -157,7 +157,7 @@ static int cn_call_callback(struct sk_buff *skb) spin_lock_bh(&dev->cbdev->queue_lock); list_for_each_entry(i, &dev->cbdev->queue_list, callback_entry) { if (cn_cb_equal(&i->id.id, &msg->id)) { - atomic_inc(&i->refcnt); + refcount_inc(&i->refcnt); cbq = i; break; } diff --git a/include/linux/connector.h b/include/linux/connector.h index f8fe8637d771..032102b19645 100644 --- a/include/linux/connector.h +++ b/include/linux/connector.h @@ -22,7 +22,7 @@ #define __CONNECTOR_H -#include +#include #include #include @@ -49,7 +49,7 @@ struct cn_callback_id { struct cn_callback_entry { struct list_head callback_entry; - atomic_t refcnt; + refcount_t refcnt; struct cn_queue_dev *pdev; struct cn_callback_id id; -- cgit v1.2.3 From e6546ef6d86d0fc38e0e84ccae80e641f3fc0087 Mon Sep 17 00:00:00 2001 From: Lawrence Brakmo Date: Fri, 20 Oct 2017 11:05:39 -0700 Subject: bpf: add support for BPF_SOCK_OPS_BASE_RTT A congestion control algorithm can make a call to the BPF socket_ops program to request the base RTT. The base RTT can be congestion control dependent and is meant to represent a congestion threshold such that RTTs above it indicate congestion. This is especially useful for flows within a DC where the base RTT is easy to obtain. Being provided a base RTT solves a basic problem in RTT based congestion avoidance algorithms (such as Vegas, NV and BBR). Although it is easy to get the base RTT when the network is not congested, it is very difficult to do when it is very congested. Newer connections get an inflated value of the base RTT leading to unfairness (newer flows with a larger base RTT get more bandwidth). As a result, RTT based congestion avoidance algorithms tend to update their base RTTs to improve fairness. In very congested networks this can lead to base RTT inflation, reducing the ability of these RTT based congestion control algorithms to prevent congestion.
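To make the callback concrete, here is a minimal sketch of a socket_ops program servicing it. This is an illustration only: the program name is made up, the 80us value mirrors the experiment described below, and the sketch assumes the usual sock_ops convention of returning the value through skops->reply with a program return of 1 (-1 meaning "no value available"):

	#include <uapi/linux/bpf.h>
	#include "bpf_helpers.h"

	SEC("sockops")
	int basertt_prog(struct bpf_sock_ops *skops)
	{
		int rv = -1;	/* default: no opinion, keep the algorithm's own estimate */

		switch (skops->op) {
		case BPF_SOCK_OPS_BASE_RTT:
			/* Intra-DC flows: report an 80us congestion threshold
			 * instead of letting the algorithm guess one under load.
			 */
			rv = 80;
			break;
		}
		skops->reply = rv;
		return 1;
	}

	char _license[] SEC("license") = "GPL";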
Note that in my experiments with TCP-NV, the base RTT provided can be much larger than the actual hardware RTT. For example, experimenting with hosts within a rack where the hardware RTT is 16-20us, I've used base RTTs up to 150us. The effect of using a larger base RTT is that the congestion avoidance algorithm will allow more queueing. When there are only a few flows the main effect is larger measured RTTs and RPC latencies due to the increased queueing. When there are a lot of flows, a larger base RTT can lead to more congestion and more packet drops. For this case, where the hardware RTT is 20us, a base RTT of 80us produces good results. This patch only introduces BPF_SOCK_OPS_BASE_RTT; a later patch in this set adds support for using it in TCP-NV. Further study and testing are needed before support can be added to other delay based congestion avoidance algorithms. Signed-off-by: Lawrence Brakmo Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d83f95ea6a1b..1aca744c220f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -955,6 +955,13 @@ enum { BPF_SOCK_OPS_NEEDS_ECN, /* If connection's congestion control * needs ECN */ + BPF_SOCK_OPS_BASE_RTT, /* Get base RTT. The correct value is + * based on the path and may be + * dependent on the congestion control + * algorithm. In general it indicates + * a congestion threshold. RTTs above + * this indicate congestion + */ }; #define TCP_BPF_IW 1001 /* Set TCP initial congestion window */ -- cgit v1.2.3 From cd86d1fd21025fdd6daf23d1288da405e7ad0ec6 Mon Sep 17 00:00:00 2001 From: Lawrence Brakmo Date: Fri, 20 Oct 2017 11:05:40 -0700 Subject: bpf: Adding helper function bpf_getsockopt Adding support for helper function bpf_getsockopt to socket_ops BPF programs. This patch only supports TCP_CONGESTION. Signed-off-by: Vlad Vysotsky Acked-by: Lawrence Brakmo Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 19 ++++++++++--- net/core/filter.c | 46 ++++++++++++++++++++++++++++++- tools/testing/selftests/bpf/bpf_helpers.h | 3 ++ 3 files changed, 63 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 1aca744c220f..f650346aaa1a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -613,12 +613,22 @@ union bpf_attr { * int bpf_setsockopt(bpf_socket, level, optname, optval, optlen) * Calls setsockopt. Not all opts are available, only those with * integer optvals plus TCP_CONGESTION. - * Supported levels: SOL_SOCKET and IPROTO_TCP + * Supported levels: SOL_SOCKET and IPPROTO_TCP * @bpf_socket: pointer to bpf_socket - * @level: SOL_SOCKET or IPROTO_TCP + * @level: SOL_SOCKET or IPPROTO_TCP * @optname: option name * @optval: pointer to option value - * @optlen: length of optval in byes + * @optlen: length of optval in bytes + * Return: 0 or negative error + * + * int bpf_getsockopt(bpf_socket, level, optname, optval, optlen) + * Calls getsockopt. Not all opts are available.
+ * Supported levels: IPPROTO_TCP + * @bpf_socket: pointer to bpf_socket + * @level: IPPROTO_TCP + * @optname: option name + * @optval: pointer to option value + * @optlen: length of optval in bytes * Return: 0 or negative error * * int bpf_skb_adjust_room(skb, len_diff, mode, flags) @@ -721,7 +731,8 @@ union bpf_attr { FN(sock_map_update), \ FN(xdp_adjust_meta), \ FN(perf_event_read_value), \ - FN(perf_prog_read_value), + FN(perf_prog_read_value), \ + FN(getsockopt), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/net/core/filter.c b/net/core/filter.c index 09e011f20291..ccf62f44140a 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3273,7 +3273,7 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, static const struct bpf_func_proto bpf_setsockopt_proto = { .func = bpf_setsockopt, - .gpl_only = true, + .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, @@ -3282,6 +3282,48 @@ static const struct bpf_func_proto bpf_setsockopt_proto = { .arg5_type = ARG_CONST_SIZE, }; +BPF_CALL_5(bpf_getsockopt, struct bpf_sock_ops_kern *, bpf_sock, + int, level, int, optname, char *, optval, int, optlen) +{ + struct sock *sk = bpf_sock->sk; + int ret = 0; + + if (!sk_fullsock(sk)) + goto err_clear; + +#ifdef CONFIG_INET + if (level == SOL_TCP && sk->sk_prot->getsockopt == tcp_getsockopt) { + if (optname == TCP_CONGESTION) { + struct inet_connection_sock *icsk = inet_csk(sk); + + if (!icsk->icsk_ca_ops || optlen <= 1) + goto err_clear; + strncpy(optval, icsk->icsk_ca_ops->name, optlen); + optval[optlen - 1] = 0; + } else { + goto err_clear; + } + } else { + goto err_clear; + } + return ret; +#endif +err_clear: + memset(optval, 0, optlen); + return -EINVAL; +} + +static const struct bpf_func_proto bpf_getsockopt_proto = { + .func = bpf_getsockopt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_UNINIT_MEM, + .arg5_type = ARG_CONST_SIZE, +}; + static const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id) { @@ -3460,6 +3502,8 @@ static const struct bpf_func_proto * switch (func_id) { case BPF_FUNC_setsockopt: return &bpf_setsockopt_proto; + case BPF_FUNC_getsockopt: + return &bpf_getsockopt_proto; case BPF_FUNC_sock_map_update: return &bpf_sock_map_update_proto; default: diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h index e25dbf6038cf..609514f74482 100644 --- a/tools/testing/selftests/bpf/bpf_helpers.h +++ b/tools/testing/selftests/bpf/bpf_helpers.h @@ -67,6 +67,9 @@ static int (*bpf_xdp_adjust_meta)(void *ctx, int offset) = static int (*bpf_setsockopt)(void *ctx, int level, int optname, void *optval, int optlen) = (void *) BPF_FUNC_setsockopt; +static int (*bpf_getsockopt)(void *ctx, int level, int optname, void *optval, + int optlen) = + (void *) BPF_FUNC_getsockopt; static int (*bpf_sk_redirect_map)(void *map, int key, int flags) = (void *) BPF_FUNC_sk_redirect_map; static int (*bpf_sock_map_update)(void *map, void *key, void *value, -- cgit v1.2.3 From 40b16b9be5773a314948656c96adf7bf7cfdbd0b Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sat, 21 Oct 2017 11:45:46 +0200 Subject: batman-adv: use inline kernel-doc for uapi constants The enums of constants for netlink tends to become rather large over time. 
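This patch moves to inline kernel-doc; a condensed sketch of that style on a hypothetical enum (the diff below applies the same pattern, member for member, to the real batman-adv enums):

	/**
	 * enum foo_flags - example flags (hypothetical, not part of this patch)
	 */
	enum foo_flags {
		/**
		 * @FOO_A: first flag, with room for a longer,
		 * multi-paragraph description when needed
		 */
		FOO_A = (1 << 0),

		/**
		 * @FOO_B: second flag
		 */
		FOO_B = (1 << 1),
	};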
Documenting them is easier when the kernel-doc is actually next to constant and not in a different block above the enum. Also inline kernel-doc allows multi-paragraph description. This could be required to better document the netlink command types and the expected return values. Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- include/uapi/linux/batman_adv.h | 369 +++++++++++++++++++++++++++++++--------- 1 file changed, 290 insertions(+), 79 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h index a83ddb7b63db..efd641c8a5d6 100644 --- a/include/uapi/linux/batman_adv.h +++ b/include/uapi/linux/batman_adv.h @@ -24,20 +24,6 @@ /** * enum batadv_tt_client_flags - TT client specific flags - * @BATADV_TT_CLIENT_DEL: the client has to be deleted from the table - * @BATADV_TT_CLIENT_ROAM: the client roamed to/from another node and the new - * update telling its new real location has not been received/sent yet - * @BATADV_TT_CLIENT_WIFI: this client is connected through a wifi interface. - * This information is used by the "AP Isolation" feature - * @BATADV_TT_CLIENT_ISOLA: this client is considered "isolated". This - * information is used by the Extended Isolation feature - * @BATADV_TT_CLIENT_NOPURGE: this client should never be removed from the table - * @BATADV_TT_CLIENT_NEW: this client has been added to the local table but has - * not been announced yet - * @BATADV_TT_CLIENT_PENDING: this client is marked for removal but it is kept - * in the table for one more originator interval for consistency purposes - * @BATADV_TT_CLIENT_TEMP: this global client has been detected to be part of - * the network but no nnode has already announced it * * Bits from 0 to 7 are called _remote flags_ because they are sent on the wire. * Bits from 8 to 15 are called _local flags_ because they are used for local @@ -48,160 +34,385 @@ * in the TT CRC computation. */ enum batadv_tt_client_flags { + /** + * @BATADV_TT_CLIENT_DEL: the client has to be deleted from the table + */ BATADV_TT_CLIENT_DEL = (1 << 0), + + /** + * @BATADV_TT_CLIENT_ROAM: the client roamed to/from another node and + * the new update telling its new real location has not been + * received/sent yet + */ BATADV_TT_CLIENT_ROAM = (1 << 1), + + /** + * @BATADV_TT_CLIENT_WIFI: this client is connected through a wifi + * interface. This information is used by the "AP Isolation" feature + */ BATADV_TT_CLIENT_WIFI = (1 << 4), + + /** + * @BATADV_TT_CLIENT_ISOLA: this client is considered "isolated". 
This + * information is used by the Extended Isolation feature + */ BATADV_TT_CLIENT_ISOLA = (1 << 5), + + /** + * @BATADV_TT_CLIENT_NOPURGE: this client should never be removed from + * the table + */ BATADV_TT_CLIENT_NOPURGE = (1 << 8), + + /** + * @BATADV_TT_CLIENT_NEW: this client has been added to the local table + * but has not been announced yet + */ BATADV_TT_CLIENT_NEW = (1 << 9), + + /** + * @BATADV_TT_CLIENT_PENDING: this client is marked for removal but it + * is kept in the table for one more originator interval for consistency + * purposes + */ BATADV_TT_CLIENT_PENDING = (1 << 10), + + /** + * @BATADV_TT_CLIENT_TEMP: this global client has been detected to be + * part of the network but no nnode has already announced it + */ BATADV_TT_CLIENT_TEMP = (1 << 11), }; /** * enum batadv_nl_attrs - batman-adv netlink attributes - * - * @BATADV_ATTR_UNSPEC: unspecified attribute to catch errors - * @BATADV_ATTR_VERSION: batman-adv version string - * @BATADV_ATTR_ALGO_NAME: name of routing algorithm - * @BATADV_ATTR_MESH_IFINDEX: index of the batman-adv interface - * @BATADV_ATTR_MESH_IFNAME: name of the batman-adv interface - * @BATADV_ATTR_MESH_ADDRESS: mac address of the batman-adv interface - * @BATADV_ATTR_HARD_IFINDEX: index of the non-batman-adv interface - * @BATADV_ATTR_HARD_IFNAME: name of the non-batman-adv interface - * @BATADV_ATTR_HARD_ADDRESS: mac address of the non-batman-adv interface - * @BATADV_ATTR_ORIG_ADDRESS: originator mac address - * @BATADV_ATTR_TPMETER_RESULT: result of run (see batadv_tp_meter_status) - * @BATADV_ATTR_TPMETER_TEST_TIME: time (msec) the run took - * @BATADV_ATTR_TPMETER_BYTES: amount of acked bytes during run - * @BATADV_ATTR_TPMETER_COOKIE: session cookie to match tp_meter session - * @BATADV_ATTR_PAD: attribute used for padding for 64-bit alignment - * @BATADV_ATTR_ACTIVE: Flag indicating if the hard interface is active - * @BATADV_ATTR_TT_ADDRESS: Client MAC address - * @BATADV_ATTR_TT_TTVN: Translation table version - * @BATADV_ATTR_TT_LAST_TTVN: Previous translation table version - * @BATADV_ATTR_TT_CRC32: CRC32 over translation table - * @BATADV_ATTR_TT_VID: VLAN ID - * @BATADV_ATTR_TT_FLAGS: Translation table client flags - * @BATADV_ATTR_FLAG_BEST: Flags indicating entry is the best - * @BATADV_ATTR_LAST_SEEN_MSECS: Time in milliseconds since last seen - * @BATADV_ATTR_NEIGH_ADDRESS: Neighbour MAC address - * @BATADV_ATTR_TQ: TQ to neighbour - * @BATADV_ATTR_THROUGHPUT: Estimated throughput to Neighbour - * @BATADV_ATTR_BANDWIDTH_UP: Reported uplink bandwidth - * @BATADV_ATTR_BANDWIDTH_DOWN: Reported downlink bandwidth - * @BATADV_ATTR_ROUTER: Gateway router MAC address - * @BATADV_ATTR_BLA_OWN: Flag indicating own originator - * @BATADV_ATTR_BLA_ADDRESS: Bridge loop avoidance claim MAC address - * @BATADV_ATTR_BLA_VID: BLA VLAN ID - * @BATADV_ATTR_BLA_BACKBONE: BLA gateway originator MAC address - * @BATADV_ATTR_BLA_CRC: BLA CRC - * @__BATADV_ATTR_AFTER_LAST: internal use - * @NUM_BATADV_ATTR: total number of batadv_nl_attrs available - * @BATADV_ATTR_MAX: highest attribute number currently defined */ enum batadv_nl_attrs { + /** + * @BATADV_ATTR_UNSPEC: unspecified attribute to catch errors + */ BATADV_ATTR_UNSPEC, + + /** + * @BATADV_ATTR_VERSION: batman-adv version string + */ BATADV_ATTR_VERSION, + + /** + * @BATADV_ATTR_ALGO_NAME: name of routing algorithm + */ BATADV_ATTR_ALGO_NAME, + + /** + * @BATADV_ATTR_MESH_IFINDEX: index of the batman-adv interface + */ BATADV_ATTR_MESH_IFINDEX, + + /** + * @BATADV_ATTR_MESH_IFNAME: name 
of the batman-adv interface + */ BATADV_ATTR_MESH_IFNAME, + + /** + * @BATADV_ATTR_MESH_ADDRESS: mac address of the batman-adv interface + */ BATADV_ATTR_MESH_ADDRESS, + + /** + * @BATADV_ATTR_HARD_IFINDEX: index of the non-batman-adv interface + */ BATADV_ATTR_HARD_IFINDEX, + + /** + * @BATADV_ATTR_HARD_IFNAME: name of the non-batman-adv interface + */ BATADV_ATTR_HARD_IFNAME, + + /** + * @BATADV_ATTR_HARD_ADDRESS: mac address of the non-batman-adv + * interface + */ BATADV_ATTR_HARD_ADDRESS, + + /** + * @BATADV_ATTR_ORIG_ADDRESS: originator mac address + */ BATADV_ATTR_ORIG_ADDRESS, + + /** + * @BATADV_ATTR_TPMETER_RESULT: result of run (see + * batadv_tp_meter_status) + */ BATADV_ATTR_TPMETER_RESULT, + + /** + * @BATADV_ATTR_TPMETER_TEST_TIME: time (msec) the run took + */ BATADV_ATTR_TPMETER_TEST_TIME, + + /** + * @BATADV_ATTR_TPMETER_BYTES: amount of acked bytes during run + */ BATADV_ATTR_TPMETER_BYTES, + + /** + * @BATADV_ATTR_TPMETER_COOKIE: session cookie to match tp_meter session + */ BATADV_ATTR_TPMETER_COOKIE, + + /** + * @BATADV_ATTR_PAD: attribute used for padding for 64-bit alignment + */ BATADV_ATTR_PAD, + + /** + * @BATADV_ATTR_ACTIVE: Flag indicating if the hard interface is active + */ BATADV_ATTR_ACTIVE, + + /** + * @BATADV_ATTR_TT_ADDRESS: Client MAC address + */ BATADV_ATTR_TT_ADDRESS, + + /** + * @BATADV_ATTR_TT_TTVN: Translation table version + */ BATADV_ATTR_TT_TTVN, + + /** + * @BATADV_ATTR_TT_LAST_TTVN: Previous translation table version + */ BATADV_ATTR_TT_LAST_TTVN, + + /** + * @BATADV_ATTR_TT_CRC32: CRC32 over translation table + */ BATADV_ATTR_TT_CRC32, + + /** + * @BATADV_ATTR_TT_VID: VLAN ID + */ BATADV_ATTR_TT_VID, + + /** + * @BATADV_ATTR_TT_FLAGS: Translation table client flags + */ BATADV_ATTR_TT_FLAGS, + + /** + * @BATADV_ATTR_FLAG_BEST: Flags indicating entry is the best + */ BATADV_ATTR_FLAG_BEST, + + /** + * @BATADV_ATTR_LAST_SEEN_MSECS: Time in milliseconds since last seen + */ BATADV_ATTR_LAST_SEEN_MSECS, + + /** + * @BATADV_ATTR_NEIGH_ADDRESS: Neighbour MAC address + */ BATADV_ATTR_NEIGH_ADDRESS, + + /** + * @BATADV_ATTR_TQ: TQ to neighbour + */ BATADV_ATTR_TQ, + + /** + * @BATADV_ATTR_THROUGHPUT: Estimated throughput to Neighbour + */ BATADV_ATTR_THROUGHPUT, + + /** + * @BATADV_ATTR_BANDWIDTH_UP: Reported uplink bandwidth + */ BATADV_ATTR_BANDWIDTH_UP, + + /** + * @BATADV_ATTR_BANDWIDTH_DOWN: Reported downlink bandwidth + */ BATADV_ATTR_BANDWIDTH_DOWN, + + /** + * @BATADV_ATTR_ROUTER: Gateway router MAC address + */ BATADV_ATTR_ROUTER, + + /** + * @BATADV_ATTR_BLA_OWN: Flag indicating own originator + */ BATADV_ATTR_BLA_OWN, + + /** + * @BATADV_ATTR_BLA_ADDRESS: Bridge loop avoidance claim MAC address + */ BATADV_ATTR_BLA_ADDRESS, + + /** + * @BATADV_ATTR_BLA_VID: BLA VLAN ID + */ BATADV_ATTR_BLA_VID, + + /** + * @BATADV_ATTR_BLA_BACKBONE: BLA gateway originator MAC address + */ BATADV_ATTR_BLA_BACKBONE, + + /** + * @BATADV_ATTR_BLA_CRC: BLA CRC + */ BATADV_ATTR_BLA_CRC, + /* add attributes above here, update the policy in netlink.c */ + + /** + * @__BATADV_ATTR_AFTER_LAST: internal use + */ __BATADV_ATTR_AFTER_LAST, + + /** + * @NUM_BATADV_ATTR: total number of batadv_nl_attrs available + */ NUM_BATADV_ATTR = __BATADV_ATTR_AFTER_LAST, + + /** + * @BATADV_ATTR_MAX: highest attribute number currently defined + */ BATADV_ATTR_MAX = __BATADV_ATTR_AFTER_LAST - 1 }; /** * enum batadv_nl_commands - supported batman-adv netlink commands - * - * @BATADV_CMD_UNSPEC: unspecified command to catch errors - * @BATADV_CMD_GET_MESH_INFO: Query basic 
information about batman-adv device - * @BATADV_CMD_TP_METER: Start a tp meter session - * @BATADV_CMD_TP_METER_CANCEL: Cancel a tp meter session - * @BATADV_CMD_GET_ROUTING_ALGOS: Query the list of routing algorithms. - * @BATADV_CMD_GET_HARDIFS: Query list of hard interfaces - * @BATADV_CMD_GET_TRANSTABLE_LOCAL: Query list of local translations - * @BATADV_CMD_GET_TRANSTABLE_GLOBAL Query list of global translations - * @BATADV_CMD_GET_ORIGINATORS: Query list of originators - * @BATADV_CMD_GET_NEIGHBORS: Query list of neighbours - * @BATADV_CMD_GET_GATEWAYS: Query list of gateways - * @BATADV_CMD_GET_BLA_CLAIM: Query list of bridge loop avoidance claims - * @BATADV_CMD_GET_BLA_BACKBONE: Query list of bridge loop avoidance backbones - * @__BATADV_CMD_AFTER_LAST: internal use - * @BATADV_CMD_MAX: highest used command number */ enum batadv_nl_commands { + /** + * @BATADV_CMD_UNSPEC: unspecified command to catch errors + */ BATADV_CMD_UNSPEC, + + /** + * @BATADV_CMD_GET_MESH_INFO: Query basic information about batman-adv + * device + */ BATADV_CMD_GET_MESH_INFO, + + /** + * @BATADV_CMD_TP_METER: Start a tp meter session + */ BATADV_CMD_TP_METER, + + /** + * @BATADV_CMD_TP_METER_CANCEL: Cancel a tp meter session + */ BATADV_CMD_TP_METER_CANCEL, + + /** + * @BATADV_CMD_GET_ROUTING_ALGOS: Query the list of routing algorithms. + */ BATADV_CMD_GET_ROUTING_ALGOS, + + /** + * @BATADV_CMD_GET_HARDIFS: Query list of hard interfaces + */ BATADV_CMD_GET_HARDIFS, + + /** + * @BATADV_CMD_GET_TRANSTABLE_LOCAL: Query list of local translations + */ BATADV_CMD_GET_TRANSTABLE_LOCAL, + + /** + * @BATADV_CMD_GET_TRANSTABLE_GLOBAL: Query list of global translations + */ BATADV_CMD_GET_TRANSTABLE_GLOBAL, + + /** + * @BATADV_CMD_GET_ORIGINATORS: Query list of originators + */ BATADV_CMD_GET_ORIGINATORS, + + /** + * @BATADV_CMD_GET_NEIGHBORS: Query list of neighbours + */ BATADV_CMD_GET_NEIGHBORS, + + /** + * @BATADV_CMD_GET_GATEWAYS: Query list of gateways + */ BATADV_CMD_GET_GATEWAYS, + + /** + * @BATADV_CMD_GET_BLA_CLAIM: Query list of bridge loop avoidance claims + */ BATADV_CMD_GET_BLA_CLAIM, + + /** + * @BATADV_CMD_GET_BLA_BACKBONE: Query list of bridge loop avoidance + * backbones + */ BATADV_CMD_GET_BLA_BACKBONE, + /* add new commands above here */ + + /** + * @__BATADV_CMD_AFTER_LAST: internal use + */ __BATADV_CMD_AFTER_LAST, + + /** + * @BATADV_CMD_MAX: highest used command number + */ BATADV_CMD_MAX = __BATADV_CMD_AFTER_LAST - 1 }; /** * enum batadv_tp_meter_reason - reason of a tp meter test run stop - * @BATADV_TP_REASON_COMPLETE: sender finished tp run - * @BATADV_TP_REASON_CANCEL: sender was stopped during run - * @BATADV_TP_REASON_DST_UNREACHABLE: receiver could not be reached or didn't - * answer - * @BATADV_TP_REASON_RESEND_LIMIT: (unused) sender retry reached limit - * @BATADV_TP_REASON_ALREADY_ONGOING: test to or from the same node already - * ongoing - * @BATADV_TP_REASON_MEMORY_ERROR: test was stopped due to low memory - * @BATADV_TP_REASON_CANT_SEND: failed to send via outgoing interface - * @BATADV_TP_REASON_TOO_MANY: too many ongoing sessions */ enum batadv_tp_meter_reason { + /** + * @BATADV_TP_REASON_COMPLETE: sender finished tp run + */ BATADV_TP_REASON_COMPLETE = 3, + + /** + * @BATADV_TP_REASON_CANCEL: sender was stopped during run + */ BATADV_TP_REASON_CANCEL = 4, + /* error status >= 128 */ + + /** + * @BATADV_TP_REASON_DST_UNREACHABLE: receiver could not be reached or + * didn't answer + */ BATADV_TP_REASON_DST_UNREACHABLE = 128, + + /** + * @BATADV_TP_REASON_RESEND_LIMIT: (unused) 
sender retry reached limit + */ BATADV_TP_REASON_RESEND_LIMIT = 129, + + /** + * @BATADV_TP_REASON_ALREADY_ONGOING: test to or from the same node + * already ongoing + */ BATADV_TP_REASON_ALREADY_ONGOING = 130, + + /** + * @BATADV_TP_REASON_MEMORY_ERROR: test was stopped due to low memory + */ BATADV_TP_REASON_MEMORY_ERROR = 131, + + /** + * @BATADV_TP_REASON_CANT_SEND: failed to send via outgoing interface + */ BATADV_TP_REASON_CANT_SEND = 132, + + /** + * @BATADV_TP_REASON_TOO_MANY: too many ongoing sessions + */ BATADV_TP_REASON_TOO_MANY = 133, }; -- cgit v1.2.3 From f6e37b25413cf636369668652e9752ee77c7d9f7 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 23 Oct 2017 09:20:22 -0700 Subject: tcp: add trace event class tcp_event_sk_skb Introduce event class tcp_event_sk_skb for tcp tracepoints that have arguments sk and skb. Existing tracepoint trace_tcp_retransmit_skb() falls into this class. This patch rewrites the definition of trace_tcp_retransmit_skb() with tcp_event_sk_skb. Signed-off-by: Song Liu Signed-off-by: David S. Miller --- include/trace/events/tcp.h | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index c3220d914475..14b0a7083f1d 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -9,7 +9,13 @@ #include #include -TRACE_EVENT(tcp_retransmit_skb, +/* + * tcp event with arguments sk and skb + * + * Note: this class requires a valid sk pointer; while skb pointer could + * be NULL. + */ +DECLARE_EVENT_CLASS(tcp_event_sk_skb, TP_PROTO(struct sock *sk, struct sk_buff *skb), @@ -64,6 +70,13 @@ TRACE_EVENT(tcp_retransmit_skb, __entry->saddr_v6, __entry->daddr_v6) ); +DEFINE_EVENT(tcp_event_sk_skb, tcp_retransmit_skb, + + TP_PROTO(struct sock *sk, struct sk_buff *skb), + + TP_ARGS(sk, skb) +); + #endif /* _TRACE_TCP_H */ /* This part must be outside protection */ -- cgit v1.2.3 From 7344e29f285a94b965075599731811c352f3ab40 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 23 Oct 2017 09:20:23 -0700 Subject: tcp: mark trace event arguments sk and skb as const Some functions that we plan to add trace points require const sk and/or skb. So we mark these fields as const in the tracepoint. Signed-off-by: Song Liu Signed-off-by: David S. 
Miller --- include/trace/events/tcp.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index 14b0a7083f1d..2b6fe72c6781 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -17,13 +17,13 @@ */ DECLARE_EVENT_CLASS(tcp_event_sk_skb, - TP_PROTO(struct sock *sk, struct sk_buff *skb), + TP_PROTO(const struct sock *sk, const struct sk_buff *skb), TP_ARGS(sk, skb), TP_STRUCT__entry( - __field(void *, skbaddr) - __field(void *, skaddr) + __field(const void *, skbaddr) + __field(const void *, skaddr) __field(__u16, sport) __field(__u16, dport) __array(__u8, saddr, 4) @@ -72,7 +72,7 @@ DECLARE_EVENT_CLASS(tcp_event_sk_skb, DEFINE_EVENT(tcp_event_sk_skb, tcp_retransmit_skb, - TP_PROTO(struct sock *sk, struct sk_buff *skb), + TP_PROTO(const struct sock *sk, const struct sk_buff *skb), TP_ARGS(sk, skb) ); -- cgit v1.2.3 From c24b14c46bb88d844275de5c4024c8745ae89d42 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 23 Oct 2017 09:20:24 -0700 Subject: tcp: add tracepoint trace_tcp_send_reset New tracepoint trace_tcp_send_reset is added and called from tcp_v4_send_reset(), tcp_v6_send_reset() and tcp_send_active_reset(). Signed-off-by: Song Liu Signed-off-by: David S. Miller --- include/trace/events/tcp.h | 11 +++++++++++ net/core/net-traces.c | 2 ++ net/ipv4/tcp_ipv4.c | 6 +++++- net/ipv4/tcp_output.c | 5 +++++ net/ipv6/tcp_ipv6.c | 10 ++++++++-- 5 files changed, 31 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index 2b6fe72c6781..3e57e1ae1c6b 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -77,6 +77,17 @@ DEFINE_EVENT(tcp_event_sk_skb, tcp_retransmit_skb, TP_ARGS(sk, skb) ); +/* + * skb of trace_tcp_send_reset is the skb that caused RST. In case of + * active reset, skb should be NULL + */ +DEFINE_EVENT(tcp_event_sk_skb, tcp_send_reset, + + TP_PROTO(const struct sock *sk, const struct sk_buff *skb), + + TP_ARGS(sk, skb) +); + #endif /* _TRACE_TCP_H */ /* This part must be outside protection */ diff --git a/net/core/net-traces.c b/net/core/net-traces.c index f4e4fa2db505..8dcd9b0be04a 100644 --- a/net/core/net-traces.c +++ b/net/core/net-traces.c @@ -49,3 +49,5 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_update); EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb); EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll); + +EXPORT_TRACEPOINT_SYMBOL_GPL(tcp_send_reset); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index e22439f05e46..eb3f3b8e1e4b 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -85,6 +85,8 @@ #include #include +#include + #ifdef CONFIG_TCP_MD5SIG static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, __be32 daddr, __be32 saddr, const struct tcphdr *th); @@ -701,8 +703,10 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) * routing might fail in this case. No choice here, if we choose to force * input interface, we will misroute in case of asymmetric route. */ - if (sk) + if (sk) { arg.bound_dev_if = sk->sk_bound_dev_if; + trace_tcp_send_reset(sk, skb); + } BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) != offsetof(struct inet_timewait_sock, tw_bound_dev_if)); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 988733f289c8..1f01f4c9c738 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3084,6 +3084,11 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) /* Send it off. 
*/ if (tcp_transmit_skb(sk, skb, 0, priority)) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED); + + /* skb of trace_tcp_send_reset() keeps the skb that caused RST, + * skb here is different to the troublesome skb, so use NULL + */ + trace_tcp_send_reset(sk, NULL); } /* Send a crossed SYN-ACK during socket establishment. diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index ae83615b7f6d..0e2529958b52 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -69,6 +69,8 @@ #include #include +#include + static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb); static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, struct request_sock *req); @@ -890,7 +892,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) int genhash; struct sock *sk1 = NULL; #endif - int oif; + int oif = 0; if (th->rst) return; @@ -939,7 +941,11 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) ack_seq = ntohl(th->seq) + th->syn + th->fin + skb->len - (th->doff << 2); - oif = sk ? sk->sk_bound_dev_if : 0; + if (sk) { + oif = sk->sk_bound_dev_if; + trace_tcp_send_reset(sk, skb); + } + tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, key, 1, 0, 0); #ifdef CONFIG_TCP_MD5SIG -- cgit v1.2.3 From 5941521c05d69cf3f2b1293eefd21207e083b70f Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 23 Oct 2017 09:20:25 -0700 Subject: tcp: add tracepoint trace_tcp_receive_reset New tracepoint trace_tcp_receive_reset is added and called from tcp_reset(). This tracepoint is defined with a new class tcp_event_sk. Signed-off-by: Song Liu Signed-off-by: David S. Miller --- include/trace/events/tcp.h | 66 ++++++++++++++++++++++++++++++++++++++++++++++ net/ipv4/tcp_input.c | 3 +++ 2 files changed, 69 insertions(+) (limited to 'include') diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index 3e57e1ae1c6b..c83c71187719 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -88,6 +88,72 @@ DEFINE_EVENT(tcp_event_sk_skb, tcp_send_reset, TP_ARGS(sk, skb) ); +/* + * tcp event with arguments sk + * + * Note: this class requires a valid sk pointer.
+ */ +DECLARE_EVENT_CLASS(tcp_event_sk, + + TP_PROTO(const struct sock *sk), + + TP_ARGS(sk), + + TP_STRUCT__entry( + __field(const void *, skaddr) + __field(__u16, sport) + __field(__u16, dport) + __array(__u8, saddr, 4) + __array(__u8, daddr, 4) + __array(__u8, saddr_v6, 16) + __array(__u8, daddr_v6, 16) + ), + + TP_fast_assign( + struct inet_sock *inet = inet_sk(sk); + struct in6_addr *pin6; + __be32 *p32; + + __entry->skaddr = sk; + + __entry->sport = ntohs(inet->inet_sport); + __entry->dport = ntohs(inet->inet_dport); + + p32 = (__be32 *) __entry->saddr; + *p32 = inet->inet_saddr; + + p32 = (__be32 *) __entry->daddr; + *p32 = inet->inet_daddr; + +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6) { + pin6 = (struct in6_addr *)__entry->saddr_v6; + *pin6 = sk->sk_v6_rcv_saddr; + pin6 = (struct in6_addr *)__entry->daddr_v6; + *pin6 = sk->sk_v6_daddr; + } else +#endif + { + pin6 = (struct in6_addr *)__entry->saddr_v6; + ipv6_addr_set_v4mapped(inet->inet_saddr, pin6); + pin6 = (struct in6_addr *)__entry->daddr_v6; + ipv6_addr_set_v4mapped(inet->inet_daddr, pin6); + } + ), + + TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c", + __entry->sport, __entry->dport, + __entry->saddr, __entry->daddr, + __entry->saddr_v6, __entry->daddr_v6) +); + +DEFINE_EVENT(tcp_event_sk, tcp_receive_reset, + + TP_PROTO(const struct sock *sk), + + TP_ARGS(sk) +); + #endif /* _TRACE_TCP_H */ /* This part must be outside protection */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index ab3f12898245..c5e64d4b5839 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -75,6 +75,7 @@ #include #include #include +#include int sysctl_tcp_fack __read_mostly; int sysctl_tcp_max_reordering __read_mostly = 300; @@ -4010,6 +4011,8 @@ static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq) /* When we get a reset we do this. */ void tcp_reset(struct sock *sk) { + trace_tcp_receive_reset(sk); + /* We want the right error as BSD sees it (and indeed as we do). */ switch (sk->sk_state) { case TCP_SYN_SENT: -- cgit v1.2.3 From e1a4aa50f47303ebb3ca0cfd01687884551ce03d Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 23 Oct 2017 09:20:26 -0700 Subject: tcp: add tracepoint trace_tcp_destroy_sock This patch adds trace event trace_tcp_destroy_sock. Signed-off-by: Song Liu Signed-off-by: David S. Miller --- include/trace/events/tcp.h | 7 +++++++ net/ipv4/tcp_ipv4.c | 2 ++ 2 files changed, 9 insertions(+) (limited to 'include') diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index c83c71187719..1724c12c25cf 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -154,6 +154,13 @@ DEFINE_EVENT(tcp_event_sk, tcp_receive_reset, TP_ARGS(sk) ); +DEFINE_EVENT(tcp_event_sk, tcp_destroy_sock, + + TP_PROTO(const struct sock *sk), + + TP_ARGS(sk) +); + #endif /* _TRACE_TCP_H */ /* This part must be outside protection */ diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index eb3f3b8e1e4b..23a8100af5ad 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1869,6 +1869,8 @@ void tcp_v4_destroy_sock(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); + trace_tcp_destroy_sock(sk); + tcp_clear_xmit_timers(sk); tcp_cleanup_congestion_control(sk); -- cgit v1.2.3 From e8fce23946b7e7eadf25ad78d8207c22903dfe27 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 23 Oct 2017 09:20:27 -0700 Subject: tcp: add tracepoint trace_tcp_set_state() This patch adds tracepoint trace_tcp_set_state. 
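As an aside on how such a tracepoint can be consumed from kernel code, a hedged sketch follows (the probe, its message and the init function are hypothetical, and a loadable module would additionally need the tracepoint exported; only the register_trace_tcp_set_state() calling convention, a probe taking a leading data pointer before the TP_PROTO arguments, comes from the tracepoint machinery):

	#include <linux/init.h>
	#include <linux/printk.h>
	#include <trace/events/tcp.h>

	static void probe_tcp_set_state(void *data, const struct sock *sk,
					const int oldstate, const int newstate)
	{
		/* Runs on every TCP state transition once registered. */
		pr_debug("sk %p: state %d -> %d\n", sk, oldstate, newstate);
	}

	static int __init tcp_state_probe_init(void)
	{
		/* Attach the probe; returns 0 on success. */
		return register_trace_tcp_set_state(probe_tcp_set_state, NULL);
	}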
Besides usual fields (s/d ports, IP addresses), old and new state of the socket is also printed with TP_printk, with __print_symbolic(). Signed-off-by: Song Liu Signed-off-by: David S. Miller --- include/trace/events/tcp.h | 76 ++++++++++++++++++++++++++++++++++++++++++++++ net/ipv4/tcp.c | 4 +++ 2 files changed, 80 insertions(+) (limited to 'include') diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index 1724c12c25cf..03699ba71623 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -9,6 +9,22 @@ #include #include +#define tcp_state_name(state) { state, #state } +#define show_tcp_state_name(val) \ + __print_symbolic(val, \ + tcp_state_name(TCP_ESTABLISHED), \ + tcp_state_name(TCP_SYN_SENT), \ + tcp_state_name(TCP_SYN_RECV), \ + tcp_state_name(TCP_FIN_WAIT1), \ + tcp_state_name(TCP_FIN_WAIT2), \ + tcp_state_name(TCP_TIME_WAIT), \ + tcp_state_name(TCP_CLOSE), \ + tcp_state_name(TCP_CLOSE_WAIT), \ + tcp_state_name(TCP_LAST_ACK), \ + tcp_state_name(TCP_LISTEN), \ + tcp_state_name(TCP_CLOSING), \ + tcp_state_name(TCP_NEW_SYN_RECV)) + /* * tcp event with arguments sk and skb * @@ -161,6 +177,66 @@ DEFINE_EVENT(tcp_event_sk, tcp_destroy_sock, TP_ARGS(sk) ); +TRACE_EVENT(tcp_set_state, + + TP_PROTO(const struct sock *sk, const int oldstate, const int newstate), + + TP_ARGS(sk, oldstate, newstate), + + TP_STRUCT__entry( + __field(const void *, skaddr) + __field(int, oldstate) + __field(int, newstate) + __field(__u16, sport) + __field(__u16, dport) + __array(__u8, saddr, 4) + __array(__u8, daddr, 4) + __array(__u8, saddr_v6, 16) + __array(__u8, daddr_v6, 16) + ), + + TP_fast_assign( + struct inet_sock *inet = inet_sk(sk); + struct in6_addr *pin6; + __be32 *p32; + + __entry->skaddr = sk; + __entry->oldstate = oldstate; + __entry->newstate = newstate; + + __entry->sport = ntohs(inet->inet_sport); + __entry->dport = ntohs(inet->inet_dport); + + p32 = (__be32 *) __entry->saddr; + *p32 = inet->inet_saddr; + + p32 = (__be32 *) __entry->daddr; + *p32 = inet->inet_daddr; + +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6) { + pin6 = (struct in6_addr *)__entry->saddr_v6; + *pin6 = sk->sk_v6_rcv_saddr; + pin6 = (struct in6_addr *)__entry->daddr_v6; + *pin6 = sk->sk_v6_daddr; + } else +#endif + { + pin6 = (struct in6_addr *)__entry->saddr_v6; + ipv6_addr_set_v4mapped(inet->inet_saddr, pin6); + pin6 = (struct in6_addr *)__entry->daddr_v6; + ipv6_addr_set_v4mapped(inet->inet_daddr, pin6); + } + ), + + TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c oldstate=%s newstate=%s", + __entry->sport, __entry->dport, + __entry->saddr, __entry->daddr, + __entry->saddr_v6, __entry->daddr_v6, + show_tcp_state_name(__entry->oldstate), + show_tcp_state_name(__entry->newstate)) +); + #endif /* _TRACE_TCP_H */ /* This part must be outside protection */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 8b1fa4dd4538..be07e9b6dbdd 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -282,6 +282,8 @@ #include #include +#include + int sysctl_tcp_min_tso_segs __read_mostly = 2; int sysctl_tcp_autocorking __read_mostly = 1; @@ -2040,6 +2042,8 @@ void tcp_set_state(struct sock *sk, int state) { int oldstate = sk->sk_state; + trace_tcp_set_state(sk, oldstate, state); + switch (state) { case TCP_ESTABLISHED: if (oldstate != TCP_ESTABLISHED) -- cgit v1.2.3 From 3f27fb23219e75343b094366f2358bff34300493 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 23 Oct 2017 16:17:47 -0700 Subject: ipv6: addrconf: add per netns perturbation in 
inet6_addr_hash() Bring IPv6 on par with IPv4: - Use net_hash_mix() to spread addresses a bit more. - Use a 256-slot hash table instead of 16 slots Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/addrconf.h | 2 +- net/ipv6/addrconf.c | 16 +++++++++------- 2 files changed, 10 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/net/addrconf.h b/include/net/addrconf.h index b8b16437c6d5..15b5ffd7253d 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -58,7 +58,7 @@ struct in6_validator_info { struct netlink_ext_ack *extack; }; -#define IN6_ADDR_HSIZE_SHIFT 4 +#define IN6_ADDR_HSIZE_SHIFT 8 #define IN6_ADDR_HSIZE (1 << IN6_ADDR_HSIZE_SHIFT) int addrconf_init(void); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index c1a5028f394c..d70d98122053 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -950,9 +950,11 @@ ipv6_link_dev_addr(struct inet6_dev *idev, struct inet6_ifaddr *ifp) list_add_tail_rcu(&ifp->if_list, p); } -static u32 inet6_addr_hash(const struct in6_addr *addr) +static u32 inet6_addr_hash(const struct net *net, const struct in6_addr *addr) { - return hash_32(ipv6_addr_hash(addr), IN6_ADDR_HSIZE_SHIFT); + u32 val = ipv6_addr_hash(addr) ^ net_hash_mix(net); + + return hash_32(val, IN6_ADDR_HSIZE_SHIFT); } static bool ipv6_chk_same_addr(struct net *net, const struct in6_addr *addr, @@ -973,7 +975,7 @@ static bool ipv6_chk_same_addr(struct net *net, const struct in6_addr *addr, static int ipv6_add_addr_hash(struct net_device *dev, struct inet6_ifaddr *ifa) { - unsigned int hash = inet6_addr_hash(&ifa->addr); + unsigned int hash = inet6_addr_hash(dev_net(dev), &ifa->addr); int err = 0; spin_lock(&addrconf_hash_lock); @@ -1838,8 +1840,8 @@ int ipv6_chk_addr_and_flags(struct net *net, const struct in6_addr *addr, const struct net_device *dev, int strict, u32 banned_flags) { + unsigned int hash = inet6_addr_hash(net, addr); struct inet6_ifaddr *ifp; - unsigned int hash = inet6_addr_hash(addr); u32 ifp_flags; rcu_read_lock_bh(); @@ -1917,8 +1919,8 @@ EXPORT_SYMBOL(ipv6_chk_prefix); struct inet6_ifaddr *ipv6_get_ifaddr(struct net *net, const struct in6_addr *addr, struct net_device *dev, int strict) { + unsigned int hash = inet6_addr_hash(net, addr); struct inet6_ifaddr *ifp, *result = NULL; - unsigned int hash = inet6_addr_hash(addr); rcu_read_lock_bh(); hlist_for_each_entry_rcu_bh(ifp, &inet6_addr_lst[hash], addr_lst) { @@ -4242,9 +4244,9 @@ void if6_proc_exit(void) /* Check if address is a home address configured on any interface. */ int ipv6_chk_home_addr(struct net *net, const struct in6_addr *addr) { - int ret = 0; + unsigned int hash = inet6_addr_hash(net, addr); struct inet6_ifaddr *ifp = NULL; - unsigned int hash = inet6_addr_hash(addr); + int ret = 0; rcu_read_lock_bh(); hlist_for_each_entry_rcu_bh(ifp, &inet6_addr_lst[hash], addr_lst) { -- cgit v1.2.3 From b6f4f8484d88b69f700907200a9a9ec73806355f Mon Sep 17 00:00:00 2001 From: Tim Hansen Date: Mon, 23 Oct 2017 15:35:58 -0400 Subject: net/sock: Update sk rcu iterator macro. Mark the hlist node in the sk RCU iterator as protected by RCU. hlist_next_rcu accomplishes this and silences the warnings sparse throws. Found with make C=1 net/ipv4/udp.o on linux-next tag next-20171009. Signed-off-by: Tim Hansen Signed-off-by: David S.
Miller --- include/net/sock.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 4827094f1db4..6f1be9726e02 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -737,10 +737,10 @@ static inline void sk_add_bind_node(struct sock *sk, * */ #define sk_for_each_entry_offset_rcu(tpos, pos, head, offset) \ - for (pos = rcu_dereference((head)->first); \ + for (pos = rcu_dereference(hlist_first_rcu(head)); \ pos != NULL && \ ({ tpos = (typeof(*tpos) *)((void *)pos - offset); 1;}); \ - pos = rcu_dereference(pos->next)) + pos = rcu_dereference(hlist_next_rcu(pos))) static inline struct user_namespace *sk_user_ns(struct sock *sk) { -- cgit v1.2.3 From 71c02379c762cb616c00fd5c4ed253fbf6bbe11b Mon Sep 17 00:00:00 2001 From: Christoph Paasch Date: Mon, 23 Oct 2017 13:22:23 -0700 Subject: tcp: Configure TFO without cookie per socket and/or per route We already allow enabling TFO without a cookie by using the fastopen-sysctl and setting it to TFO_SERVER_COOKIE_NOT_REQD (or TFO_CLIENT_NO_COOKIE). This is safe to do in certain environments where we know that there isn't a malicious host (e.g., data-centers) or when the application protocol already provides an authentication mechanism in the first flight of data. A server however might be providing multiple services or talking to both sides (public Internet and data-center). So, this server would want to enable cookie-less TFO for certain services and/or for connections that go to the data-center. This patch exposes a socket-option and a per-route attribute to enable such fine-grained configurations. Signed-off-by: Christoph Paasch Reviewed-by: Yuchung Cheng Signed-off-by: David S. Miller --- include/linux/tcp.h | 3 ++- include/net/tcp.h | 3 ++- include/uapi/linux/rtnetlink.h | 2 ++ include/uapi/linux/tcp.h | 1 + net/ipv4/tcp.c | 12 ++++++++++++ net/ipv4/tcp_fastopen.c | 20 +++++++++++++++++--- net/ipv4/tcp_input.c | 2 +- 7 files changed, 37 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 1d2c44e09e31..173a7c2f9636 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -215,7 +215,8 @@ struct tcp_sock { u8 chrono_type:2, /* current chronograph type */ rate_app_limited:1, /* rate_{delivered,interval_us} limited? */ fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */ - unused:4; + fastopen_no_cookie:1, /* Allow send/recv SYN+data without a cookie */ + unused:3; u8 nonagle : 4,/* Disable Nagle algorithm?
*/ thin_lto : 1,/* Use linear timeouts for thin streams */ unused1 : 1, diff --git a/include/net/tcp.h b/include/net/tcp.h index 2c13484704cb..2392f74074e7 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1567,7 +1567,8 @@ int tcp_fastopen_reset_cipher(struct net *net, struct sock *sk, void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb); struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, struct request_sock *req, - struct tcp_fastopen_cookie *foc); + struct tcp_fastopen_cookie *foc, + const struct dst_entry *dst); void tcp_fastopen_init_key_once(struct net *net); bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss, struct tcp_fastopen_cookie *cookie); diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index dab7dad9e01a..fe6679268901 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -430,6 +430,8 @@ enum { #define RTAX_QUICKACK RTAX_QUICKACK RTAX_CC_ALGO, #define RTAX_CC_ALGO RTAX_CC_ALGO + RTAX_FASTOPEN_NO_COOKIE, +#define RTAX_FASTOPEN_NO_COOKIE RTAX_FASTOPEN_NO_COOKIE __RTAX_MAX }; diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 69c7493e42f8..d67e1d40c6d6 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -120,6 +120,7 @@ enum { #define TCP_ULP 31 /* Attach a ULP to a TCP connection */ #define TCP_MD5SIG_EXT 32 /* TCP MD5 Signature with extensions */ #define TCP_FASTOPEN_KEY 33 /* Set the key for Fast Open (cookie) */ +#define TCP_FASTOPEN_NO_COOKIE 34 /* Enable TFO without a TFO cookie */ struct tcp_repair_opt { __u32 opt_code; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index be07e9b6dbdd..8f36277e82e9 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2836,6 +2836,14 @@ static int do_tcp_setsockopt(struct sock *sk, int level, err = -EOPNOTSUPP; } break; + case TCP_FASTOPEN_NO_COOKIE: + if (val > 1 || val < 0) + err = -EINVAL; + else if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) + err = -EINVAL; + else + tp->fastopen_no_cookie = val; + break; case TCP_TIMESTAMP: if (!tp->repair) err = -EPERM; @@ -3256,6 +3264,10 @@ static int do_tcp_getsockopt(struct sock *sk, int level, val = tp->fastopen_connect; break; + case TCP_FASTOPEN_NO_COOKIE: + val = tp->fastopen_no_cookie; + break; + case TCP_TIMESTAMP: val = tcp_time_stamp_raw() + tp->tsoffset; break; diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 21075ce19cb6..e0a4b56644aa 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -310,13 +310,23 @@ static bool tcp_fastopen_queue_check(struct sock *sk) return true; } +static bool tcp_fastopen_no_cookie(const struct sock *sk, + const struct dst_entry *dst, + int flag) +{ + return (sock_net(sk)->ipv4.sysctl_tcp_fastopen & flag) || + tcp_sk(sk)->fastopen_no_cookie || + (dst && dst_metric(dst, RTAX_FASTOPEN_NO_COOKIE)); +} + /* Returns true if we should perform Fast Open on the SYN. The cookie (foc) * may be updated and return the client in the SYN-ACK later. E.g., Fast Open * cookie request (foc->len == 0). 
*/ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, struct request_sock *req, - struct tcp_fastopen_cookie *foc) + struct tcp_fastopen_cookie *foc, + const struct dst_entry *dst) { bool syn_data = TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1; int tcp_fastopen = sock_net(sk)->ipv4.sysctl_tcp_fastopen; @@ -333,7 +343,8 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, return NULL; } - if (syn_data && (tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD)) + if (syn_data && + tcp_fastopen_no_cookie(sk, dst, TFO_SERVER_COOKIE_NOT_REQD)) goto fastopen; if (foc->len >= 0 && /* Client presents or requests a cookie */ @@ -370,6 +381,7 @@ bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss, struct tcp_fastopen_cookie *cookie) { unsigned long last_syn_loss = 0; + const struct dst_entry *dst; int syn_loss = 0; tcp_fastopen_cache_get(sk, mss, cookie, &syn_loss, &last_syn_loss); @@ -387,7 +399,9 @@ bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss, return false; } - if (sock_net(sk)->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE) { + dst = __sk_dst_get(sk); + + if (tcp_fastopen_no_cookie(sk, dst, TFO_CLIENT_NO_COOKIE)) { cookie->len = -1; return true; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index c5e64d4b5839..893286db4623 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6332,7 +6332,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, tcp_openreq_init_rwin(req, sk, dst); if (!want_cookie) { tcp_reqsk_record_syn(sk, req, skb); - fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc); + fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst); } if (fastopen_sk) { af_ops->send_synack(fastopen_sk, dst, &fl, req, -- cgit v1.2.3 From c4f3db15958277c03d1c324894255ea3ecbf86e1 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 11 Oct 2017 10:47:40 +0200 Subject: netfilter: conntrack: add and use nf_l4proto_log_invalid We currently pass down the l4 protocol to the conntrack ->packet() function, but the only user of this is the debug info decision. Same information can be derived from struct nf_conn. As a first step, add and use a new log function for this, similar to nf_ct_helper_log(). Add __cold annotation -- invalid packets should be infrequent so gcc can consider all call paths that lead to such a function as unlikely. 
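To illustrate the intended call pattern (a sketch; the "(len %u)" argument is illustrative, the per-protocol conversions in this patch pass a plain message string), a typical error path changes from

	if (LOG_INVALID(net, IPPROTO_UDP))
		nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
			      "nf_ct_udp: short packet ");
	return -NF_ACCEPT;

to a single call, with the sysctl check and the protocol prefix handled inside the helper:

	nf_l4proto_log_invalid(skb, net, pf, IPPROTO_UDP,
			       "short packet (len %u)", skb->len);
	return -NF_ACCEPT;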
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_l4proto.h | 10 +++++++ net/ipv4/netfilter/nf_conntrack_proto_icmp.c | 18 ++++++------ net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c | 14 +++++---- net/netfilter/nf_conntrack_proto.c | 24 ++++++++++++++++ net/netfilter/nf_conntrack_proto_dccp.c | 3 +- net/netfilter/nf_conntrack_proto_sctp.c | 3 +- net/netfilter/nf_conntrack_proto_tcp.c | 22 +++++++------- net/netfilter/nf_conntrack_proto_udp.c | 40 ++++++++++++-------------- 8 files changed, 82 insertions(+), 52 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h index 738a0307a96b..6d79a061d360 100644 --- a/include/net/netfilter/nf_conntrack_l4proto.h +++ b/include/net/netfilter/nf_conntrack_l4proto.h @@ -152,8 +152,18 @@ extern const struct nla_policy nf_ct_port_nla_policy[]; #define LOG_INVALID(net, proto) \ ((net)->ct.sysctl_log_invalid == (proto) || \ (net)->ct.sysctl_log_invalid == IPPROTO_RAW) + +__printf(5, 6) __cold +void nf_l4proto_log_invalid(const struct sk_buff *skb, + struct net *net, + u16 pf, u8 protonum, + const char *fmt, ...); #else static inline int LOG_INVALID(struct net *net, int proto) { return 0; } + +static inline __printf(5, 6) __cold +void nf_l4proto_log_invalid(const struct sk_buff *skb, struct net *net, + u16 pf, u8 protonum, const char *fmt, ...) {} #endif /* CONFIG_SYSCTL */ #endif /*_NF_CONNTRACK_PROTOCOL_H*/ diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index a046c298413a..7281a7b77a0e 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -165,6 +165,12 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, return NF_ACCEPT; } +static void icmp_error_log(const struct sk_buff *skb, struct net *net, + u8 pf, const char *msg) +{ + nf_l4proto_log_invalid(skb, net, pf, IPPROTO_ICMP, "%s", msg); +} + /* Small and modified version of icmp_rcv */ static int icmp_error(struct net *net, struct nf_conn *tmpl, @@ -177,18 +183,14 @@ icmp_error(struct net *net, struct nf_conn *tmpl, /* Not enough header? */ icmph = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_ih), &_ih); if (icmph == NULL) { - if (LOG_INVALID(net, IPPROTO_ICMP)) - nf_log_packet(net, PF_INET, 0, skb, NULL, NULL, - NULL, "nf_ct_icmp: short packet "); + icmp_error_log(skb, net, pf, "short packet"); return -NF_ACCEPT; } /* See ip_conntrack_proto_tcp.c */ if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && nf_ip_checksum(skb, hooknum, dataoff, 0)) { - if (LOG_INVALID(net, IPPROTO_ICMP)) - nf_log_packet(net, PF_INET, 0, skb, NULL, NULL, NULL, - "nf_ct_icmp: bad HW ICMP checksum "); + icmp_error_log(skb, net, pf, "bad hw icmp checksum"); return -NF_ACCEPT; } @@ -199,9 +201,7 @@ icmp_error(struct net *net, struct nf_conn *tmpl, * discarded. 
*/ if (icmph->type > NR_ICMP_TYPES) { - if (LOG_INVALID(net, IPPROTO_ICMP)) - nf_log_packet(net, PF_INET, 0, skb, NULL, NULL, NULL, - "nf_ct_icmp: invalid ICMP type "); + icmp_error_log(skb, net, pf, "invalid icmp type"); return -NF_ACCEPT; } diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c index a9e1fd1a8536..0f227ca4a5a2 100644 --- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c @@ -176,6 +176,12 @@ icmpv6_error_message(struct net *net, struct nf_conn *tmpl, return NF_ACCEPT; } +static void icmpv6_error_log(const struct sk_buff *skb, struct net *net, + u8 pf, const char *msg) +{ + nf_l4proto_log_invalid(skb, net, pf, IPPROTO_ICMPV6, "%s", msg); +} + static int icmpv6_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, unsigned int dataoff, @@ -187,17 +193,13 @@ icmpv6_error(struct net *net, struct nf_conn *tmpl, icmp6h = skb_header_pointer(skb, dataoff, sizeof(_ih), &_ih); if (icmp6h == NULL) { - if (LOG_INVALID(net, IPPROTO_ICMPV6)) - nf_log_packet(net, PF_INET6, 0, skb, NULL, NULL, NULL, - "nf_ct_icmpv6: short packet "); + icmpv6_error_log(skb, net, pf, "short packet"); return -NF_ACCEPT; } if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && nf_ip6_checksum(skb, hooknum, dataoff, IPPROTO_ICMPV6)) { - if (LOG_INVALID(net, IPPROTO_ICMPV6)) - nf_log_packet(net, PF_INET6, 0, skb, NULL, NULL, NULL, - "nf_ct_icmpv6: ICMPv6 checksum failed "); + icmpv6_error_log(skb, net, pf, "ICMPv6 checksum failed"); return -NF_ACCEPT; } diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index b3e489c859ec..bcd3ee270d75 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -27,6 +27,7 @@ #include #include #include +#include static struct nf_conntrack_l4proto __rcu **nf_ct_protos[NFPROTO_NUMPROTO] __read_mostly; struct nf_conntrack_l3proto __rcu *nf_ct_l3protos[NFPROTO_NUMPROTO] __read_mostly; @@ -63,6 +64,29 @@ nf_ct_unregister_sysctl(struct ctl_table_header **header, *header = NULL; *table = NULL; } + +__printf(5, 6) +void nf_l4proto_log_invalid(const struct sk_buff *skb, + struct net *net, + u16 pf, u8 protonum, + const char *fmt, ...) 
+{ + struct va_format vaf; + va_list args; + + if (net->ct.sysctl_log_invalid != protonum || + net->ct.sysctl_log_invalid != IPPROTO_RAW) + return; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, + "nf_ct_proto_%d: %pV ", protonum, &vaf); + va_end(args); +} +EXPORT_SYMBOL_GPL(nf_l4proto_log_invalid); #endif const struct nf_conntrack_l4proto * diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index 0f5a4d79f6b8..ef501c7edb96 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -604,8 +604,7 @@ static int dccp_error(struct net *net, struct nf_conn *tmpl, return NF_ACCEPT; out_invalid: - if (LOG_INVALID(net, IPPROTO_DCCP)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "%s", msg); + nf_l4proto_log_invalid(skb, net, pf, IPPROTO_DCCP, "%s", msg); return -NF_ACCEPT; } diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index 6303a88af12b..aa630c561361 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -522,8 +522,7 @@ static int sctp_error(struct net *net, struct nf_conn *tpl, struct sk_buff *skb, } return NF_ACCEPT; out_invalid: - if (LOG_INVALID(net, IPPROTO_SCTP)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "%s", logmsg); + nf_l4proto_log_invalid(skb, net, pf, IPPROTO_SCTP, "%s", logmsg); return -NF_ACCEPT; } diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index cba1c6ffe51a..14198b2a2e2c 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -738,6 +738,12 @@ static const u8 tcp_valid_flags[(TCPHDR_FIN|TCPHDR_SYN|TCPHDR_RST|TCPHDR_ACK| [TCPHDR_ACK|TCPHDR_URG] = 1, }; +static void tcp_error_log(const struct sk_buff *skb, struct net *net, + u8 pf, const char *msg) +{ + nf_l4proto_log_invalid(skb, net, pf, IPPROTO_TCP, "%s", msg); +} + /* Protect conntrack agaist broken packets. Code taken from ipt_unclean.c. */ static int tcp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, @@ -753,17 +759,13 @@ static int tcp_error(struct net *net, struct nf_conn *tmpl, /* Smaller that minimal TCP header? */ th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph); if (th == NULL) { - if (LOG_INVALID(net, IPPROTO_TCP)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, - "nf_ct_tcp: short packet "); + tcp_error_log(skb, net, pf, "short packet"); return -NF_ACCEPT; } /* Not whole TCP header or malformed packet */ if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) { - if (LOG_INVALID(net, IPPROTO_TCP)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, - "nf_ct_tcp: truncated/malformed packet "); + tcp_error_log(skb, net, pf, "truncated packet"); return -NF_ACCEPT; } @@ -774,18 +776,14 @@ static int tcp_error(struct net *net, struct nf_conn *tmpl, /* FIXME: Source route IP option packets --RR */ if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && nf_checksum(skb, hooknum, dataoff, IPPROTO_TCP, pf)) { - if (LOG_INVALID(net, IPPROTO_TCP)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, - "nf_ct_tcp: bad TCP checksum "); + tcp_error_log(skb, net, pf, "bad checksum"); return -NF_ACCEPT; } /* Check TCP flags. 
*/ tcpflags = (tcp_flag_byte(th) & ~(TCPHDR_ECE|TCPHDR_CWR|TCPHDR_PSH)); if (!tcp_valid_flags[tcpflags]) { - if (LOG_INVALID(net, IPPROTO_TCP)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, - "nf_ct_tcp: invalid TCP flag combination "); + tcp_error_log(skb, net, pf, "invalid tcp flag combination"); return -NF_ACCEPT; } diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index 8af734cd1a94..fc20cf430251 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -99,6 +99,12 @@ static bool udp_new(struct nf_conn *ct, const struct sk_buff *skb, } #ifdef CONFIG_NF_CT_PROTO_UDPLITE +static void udplite_error_log(const struct sk_buff *skb, struct net *net, + u8 pf, const char *msg) +{ + nf_l4proto_log_invalid(skb, net, pf, IPPROTO_UDPLITE, "%s", msg); +} + static int udplite_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, unsigned int dataoff, @@ -112,9 +118,7 @@ static int udplite_error(struct net *net, struct nf_conn *tmpl, /* Header is too small? */ hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); if (!hdr) { - if (LOG_INVALID(net, IPPROTO_UDPLITE)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, - "nf_ct_udplite: short packet "); + udplite_error_log(skb, net, pf, "short packet"); return -NF_ACCEPT; } @@ -122,17 +126,13 @@ static int udplite_error(struct net *net, struct nf_conn *tmpl, if (cscov == 0) { cscov = udplen; } else if (cscov < sizeof(*hdr) || cscov > udplen) { - if (LOG_INVALID(net, IPPROTO_UDPLITE)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, - "nf_ct_udplite: invalid checksum coverage "); + udplite_error_log(skb, net, pf, "invalid checksum coverage"); return -NF_ACCEPT; } /* UDPLITE mandates checksums */ if (!hdr->check) { - if (LOG_INVALID(net, IPPROTO_UDPLITE)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, - "nf_ct_udplite: checksum missing "); + udplite_error_log(skb, net, pf, "checksum missing"); return -NF_ACCEPT; } @@ -140,9 +140,7 @@ static int udplite_error(struct net *net, struct nf_conn *tmpl, if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && nf_checksum_partial(skb, hooknum, dataoff, cscov, IPPROTO_UDP, pf)) { - if (LOG_INVALID(net, IPPROTO_UDPLITE)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, - "nf_ct_udplite: bad UDPLite checksum "); + udplite_error_log(skb, net, pf, "bad checksum"); return -NF_ACCEPT; } @@ -150,6 +148,12 @@ static int udplite_error(struct net *net, struct nf_conn *tmpl, } #endif +static void udp_error_log(const struct sk_buff *skb, struct net *net, + u8 pf, const char *msg) +{ + nf_l4proto_log_invalid(skb, net, pf, IPPROTO_UDP, "%s", msg); +} + static int udp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, unsigned int dataoff, u_int8_t pf, @@ -162,17 +166,13 @@ static int udp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, /* Header is too small? 
*/ hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); if (hdr == NULL) { - if (LOG_INVALID(net, IPPROTO_UDP)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, - "nf_ct_udp: short packet "); + udp_error_log(skb, net, pf, "short packet"); return -NF_ACCEPT; } /* Truncated/malformed packets */ if (ntohs(hdr->len) > udplen || ntohs(hdr->len) < sizeof(*hdr)) { - if (LOG_INVALID(net, IPPROTO_UDP)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, - "nf_ct_udp: truncated/malformed packet "); + udp_error_log(skb, net, pf, "truncated/malformed packet"); return -NF_ACCEPT; } @@ -186,9 +186,7 @@ static int udp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, * FIXME: Source route IP option packets --RR */ if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && nf_checksum(skb, hooknum, dataoff, IPPROTO_UDP, pf)) { - if (LOG_INVALID(net, IPPROTO_UDP)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, - "nf_ct_udp: bad UDP checksum "); + udp_error_log(skb, net, pf, "bad checksum"); return -NF_ACCEPT; } -- cgit v1.2.3 From 3d0b527bc9dc0e8c4428eb1a98d4cd27bd1114c7 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 11 Oct 2017 10:47:41 +0200 Subject: netfilter: conntrack: add and use nf_ct_l4proto_log_invalid We currently pass down the l4 protocol to the conntrack ->packet() function, but the only user of this is the debug info decision. Same information can be derived from struct nf_conn. Add a wrapper for the previous patch that extracts the information from nf_conn and passes it to nf_l4proto_log_invalid(). Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_l4proto.h | 14 ++++++++------ net/netfilter/nf_conntrack_proto.c | 23 +++++++++++++++++++++++ net/netfilter/nf_conntrack_proto_dccp.c | 17 +++++------------ net/netfilter/nf_conntrack_proto_tcp.c | 25 +++++++++---------------- 4 files changed, 45 insertions(+), 34 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h index 6d79a061d360..5d51255b5bfb 100644 --- a/include/net/netfilter/nf_conntrack_l4proto.h +++ b/include/net/netfilter/nf_conntrack_l4proto.h @@ -149,21 +149,23 @@ int nf_ct_port_nlattr_tuple_size(void); extern const struct nla_policy nf_ct_port_nla_policy[]; #ifdef CONFIG_SYSCTL -#define LOG_INVALID(net, proto) \ - ((net)->ct.sysctl_log_invalid == (proto) || \ - (net)->ct.sysctl_log_invalid == IPPROTO_RAW) - +__printf(3, 4) __cold +void nf_ct_l4proto_log_invalid(const struct sk_buff *skb, + const struct nf_conn *ct, + const char *fmt, ...); __printf(5, 6) __cold void nf_l4proto_log_invalid(const struct sk_buff *skb, struct net *net, u16 pf, u8 protonum, const char *fmt, ...); #else -static inline int LOG_INVALID(struct net *net, int proto) { return 0; } - static inline __printf(5, 6) __cold void nf_l4proto_log_invalid(const struct sk_buff *skb, struct net *net, u16 pf, u8 protonum, const char *fmt, ...) {} +static inline __printf(3, 4) __cold +void nf_ct_l4proto_log_invalid(const struct sk_buff *skb, + const struct nf_conn *ct, + const char *fmt, ...)
{ } #endif /* CONFIG_SYSCTL */ #endif /*_NF_CONNTRACK_PROTOCOL_H*/ diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index bcd3ee270d75..83f739e9dc08 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -87,6 +87,29 @@ void nf_l4proto_log_invalid(const struct sk_buff *skb, va_end(args); } EXPORT_SYMBOL_GPL(nf_l4proto_log_invalid); + +__printf(3, 4) +void nf_ct_l4proto_log_invalid(const struct sk_buff *skb, + const struct nf_conn *ct, + const char *fmt, ...) +{ + struct va_format vaf; + struct net *net; + va_list args; + + net = nf_ct_net(ct); + if (likely(net->ct.sysctl_log_invalid == 0)) + return; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + + nf_l4proto_log_invalid(skb, net, nf_ct_l3num(ct), + nf_ct_protonum(ct), "%pV", &vaf); + va_end(args); +} +EXPORT_SYMBOL_GPL(nf_ct_l4proto_log_invalid); #endif const struct nf_conntrack_l4proto * diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index ef501c7edb96..49e0abcdc6f4 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -428,13 +428,13 @@ static bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb, default: dn = dccp_pernet(net); if (dn->dccp_loose == 0) { - msg = "nf_ct_dccp: not picking up existing connection "; + msg = "not picking up existing connection "; goto out_invalid; } case CT_DCCP_REQUEST: break; case CT_DCCP_INVALID: - msg = "nf_ct_dccp: invalid state transition "; + msg = "invalid state transition "; goto out_invalid; } @@ -447,9 +447,7 @@ static bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb, return true; out_invalid: - if (LOG_INVALID(net, IPPROTO_DCCP)) - nf_log_packet(net, nf_ct_l3num(ct), 0, skb, NULL, NULL, - NULL, "%s", msg); + nf_ct_l4proto_log_invalid(skb, ct, "%s", msg); return false; } @@ -472,7 +470,6 @@ static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb, u_int8_t pf, unsigned int *timeouts) { - struct net *net = nf_ct_net(ct); enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); struct dccp_hdr _dh, *dh; u_int8_t type, old_state, new_state; @@ -534,15 +531,11 @@ static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb, ct->proto.dccp.last_pkt = type; spin_unlock_bh(&ct->lock); - if (LOG_INVALID(net, IPPROTO_DCCP)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, - "nf_ct_dccp: invalid packet ignored "); + nf_ct_l4proto_log_invalid(skb, ct, "%s", "invalid packet"); return NF_ACCEPT; case CT_DCCP_INVALID: spin_unlock_bh(&ct->lock); - if (LOG_INVALID(net, IPPROTO_DCCP)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, - "nf_ct_dccp: invalid state transition "); + nf_ct_l4proto_log_invalid(skb, ct, "%s", "invalid state transition"); return -NF_ACCEPT; } diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 14198b2a2e2c..dced574f6006 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -702,9 +702,9 @@ static bool tcp_in_window(const struct nf_conn *ct, if (sender->flags & IP_CT_TCP_FLAG_BE_LIBERAL || tn->tcp_be_liberal) res = true; - if (!res && LOG_INVALID(net, IPPROTO_TCP)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, - "nf_ct_tcp: %s ", + if (!res) { + nf_ct_l4proto_log_invalid(skb, ct, + "%s", before(seq, sender->td_maxend + 1) ? in_recv_win ? before(sack, receiver->td_end + 1) ? 
@@ -713,6 +713,7 @@ static bool tcp_in_window(const struct nf_conn *ct, : "ACK is over the upper bound (ACKed data not seen yet)" : "SEQ is under the lower bound (already ACKed data retransmitted)" : "SEQ is over the upper bound (over the window of the receiver)"); + } } pr_debug("tcp_in_window: res=%u sender end=%u maxend=%u maxwin=%u " @@ -937,10 +938,8 @@ static int tcp_packet(struct nf_conn *ct, IP_CT_EXP_CHALLENGE_ACK; } spin_unlock_bh(&ct->lock); - if (LOG_INVALID(net, IPPROTO_TCP)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, - "nf_ct_tcp: invalid packet ignored in " - "state %s ", tcp_conntrack_names[old_state]); + nf_ct_l4proto_log_invalid(skb, ct, "invalid packet ignored in " + "state %s ", tcp_conntrack_names[old_state]); return NF_ACCEPT; case TCP_CONNTRACK_MAX: /* Special case for SYN proxy: when the SYN to the server or @@ -962,9 +961,7 @@ static int tcp_packet(struct nf_conn *ct, pr_debug("nf_ct_tcp: Invalid dir=%i index=%u ostate=%u\n", dir, get_conntrack_index(th), old_state); spin_unlock_bh(&ct->lock); - if (LOG_INVALID(net, IPPROTO_TCP)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, - "nf_ct_tcp: invalid state "); + nf_ct_l4proto_log_invalid(skb, ct, "invalid state"); return -NF_ACCEPT; case TCP_CONNTRACK_TIME_WAIT: /* RFC5961 compliance cause stack to send "challenge-ACK" @@ -979,9 +976,7 @@ static int tcp_packet(struct nf_conn *ct, /* Detected RFC5961 challenge ACK */ ct->proto.tcp.last_flags &= ~IP_CT_EXP_CHALLENGE_ACK; spin_unlock_bh(&ct->lock); - if (LOG_INVALID(net, IPPROTO_TCP)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, - "nf_ct_tcp: challenge-ACK ignored "); + nf_ct_l4proto_log_invalid(skb, ct, "challenge-ack ignored"); return NF_ACCEPT; /* Don't change state */ } break; @@ -991,9 +986,7 @@ static int tcp_packet(struct nf_conn *ct, && before(ntohl(th->seq), ct->proto.tcp.seen[!dir].td_maxack)) { /* Invalid RST */ spin_unlock_bh(&ct->lock); - if (LOG_INVALID(net, IPPROTO_TCP)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, - NULL, "nf_ct_tcp: invalid RST "); + nf_ct_l4proto_log_invalid(skb, ct, "invalid rst"); return -NF_ACCEPT; } if (index == TCP_RST_SET -- cgit v1.2.3 From eb6fad5a4a328b85d3faa8b301b522e3f316b49d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 11 Oct 2017 10:47:42 +0200 Subject: netfilter: conntrack: remove pf argument from l4 packet functions not needed/used anymore. 
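For context, a minimal sketch of what a ->packet() handler looks like after this change (illustrative only, not taken from an in-tree tracker): should a handler ever need the address family again, it can derive it from the conntrack entry via nf_ct_l3num(), exactly as nf_ct_l4proto_log_invalid() already does internally:

	static int example_packet(struct nf_conn *ct,
				  const struct sk_buff *skb,
				  unsigned int dataoff,
				  enum ip_conntrack_info ctinfo,
				  unsigned int *timeouts)
	{
		/* the old u_int8_t pf parameter is gone; recover the
		 * family from the entry itself when it is needed
		 */
		u16 l3num = nf_ct_l3num(ct);
		u8 protonum = nf_ct_protonum(ct);

		pr_debug("example: l3num=%u protonum=%u\n", l3num, protonum);
		nf_ct_refresh_acct(ct, ctinfo, skb, *timeouts);
		return NF_ACCEPT;
	}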
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_l4proto.h | 1 - net/ipv4/netfilter/nf_conntrack_proto_icmp.c | 1 - net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c | 1 - net/netfilter/nf_conntrack_core.c | 2 +- net/netfilter/nf_conntrack_proto_dccp.c | 1 - net/netfilter/nf_conntrack_proto_generic.c | 1 - net/netfilter/nf_conntrack_proto_gre.c | 1 - net/netfilter/nf_conntrack_proto_sctp.c | 1 - net/netfilter/nf_conntrack_proto_tcp.c | 6 ++---- net/netfilter/nf_conntrack_proto_udp.c | 1 - 10 files changed, 3 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h index 5d51255b5bfb..e06518874144 100644 --- a/include/net/netfilter/nf_conntrack_l4proto.h +++ b/include/net/netfilter/nf_conntrack_l4proto.h @@ -42,7 +42,6 @@ struct nf_conntrack_l4proto { const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, - u_int8_t pf, unsigned int *timeouts); /* Called when a new connection for this protocol found; diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index 7281a7b77a0e..8969420cecc3 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -81,7 +81,6 @@ static int icmp_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, - u_int8_t pf, unsigned int *timeout) { /* Do not immediately delete the connection after the first diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c index 0f227ca4a5a2..dca921df28e1 100644 --- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c @@ -94,7 +94,6 @@ static int icmpv6_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, - u_int8_t pf, unsigned int *timeout) { /* Do not immediately delete the connection after the first diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 01130392b7c0..28e675150853 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1419,7 +1419,7 @@ repeat: /* Decide what timeout policy we want to apply to this flow. 
*/ timeouts = nf_ct_timeout_lookup(net, ct, l4proto); - ret = l4proto->packet(ct, skb, dataoff, ctinfo, pf, timeouts); + ret = l4proto->packet(ct, skb, dataoff, ctinfo, timeouts); if (ret <= 0) { /* Invalid: inverse of the return code tells * the netfilter core what to do */ diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index 49e0abcdc6f4..2a446f4a554c 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -467,7 +467,6 @@ static unsigned int *dccp_get_timeouts(struct net *net) static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, - u_int8_t pf, unsigned int *timeouts) { enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c index 9cd40700842e..1f86ddf6649a 100644 --- a/net/netfilter/nf_conntrack_proto_generic.c +++ b/net/netfilter/nf_conntrack_proto_generic.c @@ -60,7 +60,6 @@ static int generic_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, - u_int8_t pf, unsigned int *timeout) { nf_ct_refresh_acct(ct, ctinfo, skb, *timeout); diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c index 09a90484c27d..a2503005d80b 100644 --- a/net/netfilter/nf_conntrack_proto_gre.c +++ b/net/netfilter/nf_conntrack_proto_gre.c @@ -244,7 +244,6 @@ static int gre_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, - u_int8_t pf, unsigned int *timeouts) { /* If we've seen traffic both ways, this is a GRE connection. diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index aa630c561361..80faf04ddf15 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -306,7 +306,6 @@ static int sctp_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, - u_int8_t pf, unsigned int *timeouts) { enum sctp_conntrack new_state, old_state; diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index dced574f6006..8f283294d70f 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -493,8 +493,7 @@ static bool tcp_in_window(const struct nf_conn *ct, unsigned int index, const struct sk_buff *skb, unsigned int dataoff, - const struct tcphdr *tcph, - u_int8_t pf) + const struct tcphdr *tcph) { struct net *net = nf_ct_net(ct); struct nf_tcp_net *tn = tcp_pernet(net); @@ -801,7 +800,6 @@ static int tcp_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, - u_int8_t pf, unsigned int *timeouts) { struct net *net = nf_ct_net(ct); @@ -1013,7 +1011,7 @@ static int tcp_packet(struct nf_conn *ct, } if (!tcp_in_window(ct, &ct->proto.tcp, dir, index, - skb, dataoff, th, pf)) { + skb, dataoff, th)) { spin_unlock_bh(&ct->lock); return -NF_ACCEPT; } diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index fc20cf430251..3a5f727103af 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -73,7 +73,6 @@ static int udp_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, - u_int8_t pf, unsigned int *timeouts) { /* If we've seen traffic 
both ways, this is some kind of UDP -- cgit v1.2.3 From 28efb0046512e8a13ed9f9bdf0d68d10bbfbe9cf Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 12 Oct 2017 09:38:30 +0200 Subject: netfilter: conntrack: make l3proto trackers const previous patches removed all writes to them. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/ipv4/nf_conntrack_ipv4.h | 2 +- include/net/netfilter/ipv6/nf_conntrack_ipv6.h | 2 +- net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 2 +- net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h index 919e4e8af327..5534ecca7a5d 100644 --- a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h +++ b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h @@ -10,7 +10,7 @@ #define _NF_CONNTRACK_IPV4_H -extern struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4; +extern const struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4; extern struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4; extern struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4; diff --git a/include/net/netfilter/ipv6/nf_conntrack_ipv6.h b/include/net/netfilter/ipv6/nf_conntrack_ipv6.h index eaea968f8657..30dc57980866 100644 --- a/include/net/netfilter/ipv6/nf_conntrack_ipv6.h +++ b/include/net/netfilter/ipv6/nf_conntrack_ipv6.h @@ -1,7 +1,7 @@ #ifndef _NF_CONNTRACK_IPV6_H #define _NF_CONNTRACK_IPV6_H -extern struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6; +extern const struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6; extern struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6; extern struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6; diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index fe374da4bc13..89af9d88ca21 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -344,7 +344,7 @@ static void ipv4_hooks_unregister(struct net *net) mutex_unlock(&register_ipv4_hooks); } -struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = { +const struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 = { .l3proto = PF_INET, .pkt_to_tuple = ipv4_pkt_to_tuple, .invert_tuple = ipv4_invert_tuple, diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c index fe01dc953c56..3b80a38f62b8 100644 --- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c @@ -339,7 +339,7 @@ static void ipv6_hooks_unregister(struct net *net) mutex_unlock(&register_ipv6_hooks); } -struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 __read_mostly = { +const struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 = { .l3proto = PF_INET6, .pkt_to_tuple = ipv6_pkt_to_tuple, .invert_tuple = ipv6_invert_tuple, -- cgit v1.2.3 From 908d140a87a794bf89717ceae54aba5ce86c52e4 Mon Sep 17 00:00:00 2001 From: Shmulik Ladkani Date: Sat, 21 Oct 2017 00:25:15 +0300 Subject: ip6_tunnel: Allow rcv/xmit even if remote address is a local address Currently, ip6_tnl_xmit_ctl drops tunneled packets if the remote address (outer v6 destination) is one of the host's locally configured addresses. Same applies to ip6_tnl_rcv_ctl: it drops packets if the remote address (outer v6 source) is a local address.
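In code terms, both control functions boil down to a check of the following shape (a simplified sketch of the existing logic, shown for illustration only; ipv6_chk_addr() returns non-zero when the address is configured on this host):

	/* outer remote address owned by this host: refuse the tunnel */
	if (ipv6_chk_addr(net, raddr, NULL, 0))
		return 0;	/* rcv path drops; xmit path warns of a routing loop */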
This prevents using ipxip6 (and ip6_gre) tunnels whose local/remote endpoints are on the same host; OTOH v4 tunnels (ipip or gre) allow such configurations. An example where this proves useful is a system where entities are identified by their unique v6 addresses, and use tunnels to encapsulate traffic between them. The limitation prevents placing several entities on the same host. Introduce IP6_TNL_F_ALLOW_LOCAL_REMOTE, which allows bypassing this restriction. Signed-off-by: Shmulik Ladkani Signed-off-by: David S. Miller --- include/uapi/linux/ip6_tunnel.h | 2 ++ net/ipv6/ip6_tunnel.c | 6 ++++-- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/ip6_tunnel.h b/include/uapi/linux/ip6_tunnel.h index 425926c467d7..ffebbe365478 100644 --- a/include/uapi/linux/ip6_tunnel.h +++ b/include/uapi/linux/ip6_tunnel.h @@ -20,6 +20,8 @@ #define IP6_TNL_F_RCV_DSCP_COPY 0x10 /* copy fwmark from inner packet */ #define IP6_TNL_F_USE_ORIG_FWMARK 0x20 +/* allow remote endpoint on the local node */ +#define IP6_TNL_F_ALLOW_LOCAL_REMOTE 0x40 struct ip6_tnl_parm { char name[IFNAMSIZ]; /* name of tunnel device */ diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 4212879ff35e..439d65f7e094 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -770,7 +770,8 @@ int ip6_tnl_rcv_ctl(struct ip6_tnl *t, if ((ipv6_addr_is_multicast(laddr) || likely(ipv6_chk_addr(net, laddr, ldev, 0))) && - likely(!ipv6_chk_addr(net, raddr, NULL, 0))) + ((p->flags & IP6_TNL_F_ALLOW_LOCAL_REMOTE) || + likely(!ipv6_chk_addr(net, raddr, NULL, 0)))) ret = 1; } return ret; @@ -1000,7 +1001,8 @@ int ip6_tnl_xmit_ctl(struct ip6_tnl *t, if (unlikely(!ipv6_chk_addr(net, laddr, ldev, 0))) pr_warn("%s xmit: Local address not yet configured!\n", p->name); - else if (!ipv6_addr_is_multicast(raddr) && + else if (!(p->flags & IP6_TNL_F_ALLOW_LOCAL_REMOTE) && + !ipv6_addr_is_multicast(raddr) && unlikely(ipv6_chk_addr(net, raddr, NULL, 0))) pr_warn("%s xmit: Routing loop! Remote address found on this node!\n", p->name); -- cgit v1.2.3 From e87c6bc3852b981e71c757be20771546ce9f76f3 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 23 Oct 2017 23:53:08 -0700 Subject: bpf: permit multiple bpf attachments for a single perf event This patch enables multiple bpf attachments for a single kprobe/uprobe/tracepoint trace event. Each trace_event keeps a list of attached perf events. When an event happens, all attached bpf programs will be executed based on the order of attachment. A global bpf_event_mutex lock is introduced to protect prog_array attaching and detaching. An alternative would be to introduce a mutex lock in every trace_event_call structure, but that takes a lot of extra memory. So a global bpf_event_mutex lock is a good compromise. The bpf prog detachment involves allocation of memory. If the allocation fails, a dummy do-nothing program will replace the to-be-detached program in place. Signed-off-by: Yonghong Song Acked-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Signed-off-by: David S.
Miller --- include/linux/bpf.h | 30 +++++++++++++--- include/linux/trace_events.h | 43 ++++++++++++++++++++--- include/trace/perf.h | 6 ++-- kernel/bpf/core.c | 81 ++++++++++++++++++++++++++++++++++++++++++ kernel/events/core.c | 26 +++++--------- kernel/trace/bpf_trace.c | 82 ++++++++++++++++++++++++++++++++++++++++--- kernel/trace/trace_kprobe.c | 6 ++-- kernel/trace/trace_syscalls.c | 34 ++++++++++-------- kernel/trace/trace_uprobe.c | 3 +- 9 files changed, 255 insertions(+), 56 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 1e334b248ff6..172be7faf7ba 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -273,18 +273,38 @@ int bpf_prog_array_length(struct bpf_prog_array __rcu *progs); int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, __u32 __user *prog_ids, u32 cnt); -#define BPF_PROG_RUN_ARRAY(array, ctx, func) \ +void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *progs, + struct bpf_prog *old_prog); +int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, + struct bpf_prog *exclude_prog, + struct bpf_prog *include_prog, + struct bpf_prog_array **new_array); + +#define __BPF_PROG_RUN_ARRAY(array, ctx, func, check_non_null) \ ({ \ - struct bpf_prog **_prog; \ + struct bpf_prog **_prog, *__prog; \ + struct bpf_prog_array *_array; \ u32 _ret = 1; \ rcu_read_lock(); \ - _prog = rcu_dereference(array)->progs; \ - for (; *_prog; _prog++) \ - _ret &= func(*_prog, ctx); \ + _array = rcu_dereference(array); \ + if (unlikely(check_non_null && !_array))\ + goto _out; \ + _prog = _array->progs; \ + while ((__prog = READ_ONCE(*_prog))) { \ + _ret &= func(__prog, ctx); \ + _prog++; \ + } \ +_out: \ rcu_read_unlock(); \ _ret; \ }) +#define BPF_PROG_RUN_ARRAY(array, ctx, func) \ + __BPF_PROG_RUN_ARRAY(array, ctx, func, false) + +#define BPF_PROG_RUN_ARRAY_CHECK(array, ctx, func) \ + __BPF_PROG_RUN_ARRAY(array, ctx, func, true) + #ifdef CONFIG_BPF_SYSCALL DECLARE_PER_CPU(int, bpf_prog_active); diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 2e0f22298fe9..fc6aeca945db 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -271,14 +271,37 @@ struct trace_event_call { #ifdef CONFIG_PERF_EVENTS int perf_refcount; struct hlist_head __percpu *perf_events; - struct bpf_prog *prog; - struct perf_event *bpf_prog_owner; + struct bpf_prog_array __rcu *prog_array; int (*perf_perm)(struct trace_event_call *, struct perf_event *); #endif }; +#ifdef CONFIG_PERF_EVENTS +static inline bool bpf_prog_array_valid(struct trace_event_call *call) +{ + /* + * This inline function checks whether call->prog_array + * is valid or not. The function is called in various places, + * outside rcu_read_lock/unlock, as a heuristic to speed up execution. + * + * If this function returns true, and later call->prog_array + * becomes NULL inside rcu_read_lock/unlock region, + * we bail out then. If this function returns false, + * there is a risk that we might miss a few events if the checking + * were delayed until inside rcu_read_lock/unlock region and + * call->prog_array happened to become non-NULL then. + * + * Here, READ_ONCE() is used instead of rcu_access_pointer(). + * rcu_access_pointer() requires the actual definition of + * "struct bpf_prog_array" while READ_ONCE() only needs + * a declaration of the same type.
+ */ + return !!READ_ONCE(call->prog_array); +} +#endif + static inline const char * trace_event_name(struct trace_event_call *call) { @@ -435,12 +458,23 @@ trace_trigger_soft_disabled(struct trace_event_file *file) } #ifdef CONFIG_BPF_EVENTS -unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx); +unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx); +int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog); +void perf_event_detach_bpf_prog(struct perf_event *event); #else -static inline unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx) +static inline unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) { return 1; } + +static inline int +perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog) +{ + return -EOPNOTSUPP; +} + +static inline void perf_event_detach_bpf_prog(struct perf_event *event) { } + #endif enum { @@ -511,6 +545,7 @@ perf_trace_buf_submit(void *raw_data, int size, int rctx, u16 type, { perf_tp_event(type, count, raw_data, size, regs, head, rctx, task, event); } + #endif #endif /* _LINUX_TRACE_EVENT_H */ diff --git a/include/trace/perf.h b/include/trace/perf.h index 04fe68bbe767..14f127b6acf5 100644 --- a/include/trace/perf.h +++ b/include/trace/perf.h @@ -34,7 +34,6 @@ perf_trace_##call(void *__data, proto) \ struct trace_event_call *event_call = __data; \ struct trace_event_data_offsets_##call __maybe_unused __data_offsets;\ struct trace_event_raw_##call *entry; \ - struct bpf_prog *prog = event_call->prog; \ struct pt_regs *__regs; \ u64 __count = 1; \ struct task_struct *__task = NULL; \ @@ -46,8 +45,9 @@ perf_trace_##call(void *__data, proto) \ __data_size = trace_event_get_offsets_##call(&__data_offsets, args); \ \ head = this_cpu_ptr(event_call->perf_events); \ - if (!prog && __builtin_constant_p(!__task) && !__task && \ - hlist_empty(head)) \ + if (!bpf_prog_array_valid(event_call) && \ + __builtin_constant_p(!__task) && !__task && \ + hlist_empty(head)) \ return; \ \ __entry_size = ALIGN(__data_size + sizeof(*entry) + sizeof(u32),\ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 8e7c8bf2b687..7fe448799d76 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1394,6 +1394,20 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) } EXPORT_SYMBOL_GPL(bpf_prog_select_runtime); +static unsigned int __bpf_prog_ret1(const void *ctx, + const struct bpf_insn *insn) +{ + return 1; +} + +static struct bpf_prog_dummy { + struct bpf_prog prog; +} dummy_bpf_prog = { + .prog = { + .bpf_func = __bpf_prog_ret1, + }, +}; + /* to avoid allocating empty bpf_prog_array for cgroups that * don't have bpf program attached use one global 'empty_prog_array' * It will not be modified the caller of bpf_prog_array_alloc() @@ -1463,6 +1477,73 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, return 0; } +void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *progs, + struct bpf_prog *old_prog) +{ + struct bpf_prog **prog = progs->progs; + + for (; *prog; prog++) + if (*prog == old_prog) { + WRITE_ONCE(*prog, &dummy_bpf_prog.prog); + break; + } +} + +int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, + struct bpf_prog *exclude_prog, + struct bpf_prog *include_prog, + struct bpf_prog_array **new_array) +{ + int new_prog_cnt, carry_prog_cnt = 0; + struct bpf_prog **existing_prog; + struct bpf_prog_array *array; + int new_prog_idx = 0; + + /* Figure out how many existing progs we need to carry over to + * the new array. 
+ */ + if (old_array) { + existing_prog = old_array->progs; + for (; *existing_prog; existing_prog++) { + if (*existing_prog != exclude_prog && + *existing_prog != &dummy_bpf_prog.prog) + carry_prog_cnt++; + if (*existing_prog == include_prog) + return -EEXIST; + } + } + + /* How many progs (not NULL) will be in the new array? */ + new_prog_cnt = carry_prog_cnt; + if (include_prog) + new_prog_cnt += 1; + + /* Do we have any prog (not NULL) in the new array? */ + if (!new_prog_cnt) { + *new_array = NULL; + return 0; + } + + /* +1 as the end of prog_array is marked with NULL */ + array = bpf_prog_array_alloc(new_prog_cnt + 1, GFP_KERNEL); + if (!array) + return -ENOMEM; + + /* Fill in the new prog array */ + if (carry_prog_cnt) { + existing_prog = old_array->progs; + for (; *existing_prog; existing_prog++) + if (*existing_prog != exclude_prog && + *existing_prog != &dummy_bpf_prog.prog) + array->progs[new_prog_idx++] = *existing_prog; + } + if (include_prog) + array->progs[new_prog_idx++] = include_prog; + array->progs[new_prog_idx] = NULL; + *new_array = array; + return 0; +} + static void bpf_prog_free_deferred(struct work_struct *work) { struct bpf_prog_aux *aux; diff --git a/kernel/events/core.c b/kernel/events/core.c index 9f78a6825bbe..9660ee65fbef 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7954,11 +7954,9 @@ void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx, struct pt_regs *regs, struct hlist_head *head, struct task_struct *task) { - struct bpf_prog *prog = call->prog; - - if (prog) { + if (bpf_prog_array_valid(call)) { *(struct pt_regs **)raw_data = regs; - if (!trace_call_bpf(prog, raw_data) || hlist_empty(head)) { + if (!trace_call_bpf(call, raw_data) || hlist_empty(head)) { perf_swevent_put_recursion_context(rctx); return; } @@ -8147,13 +8145,11 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) { bool is_kprobe, is_tracepoint, is_syscall_tp; struct bpf_prog *prog; + int ret; if (event->attr.type != PERF_TYPE_TRACEPOINT) return perf_event_set_bpf_handler(event, prog_fd); - if (event->tp_event->prog) - return -EEXIST; - is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE; is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT; is_syscall_tp = is_syscall_trace_event(event->tp_event); @@ -8181,26 +8177,20 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) return -EACCES; } } - event->tp_event->prog = prog; - event->tp_event->bpf_prog_owner = event; - return 0; + ret = perf_event_attach_bpf_prog(event, prog); + if (ret) + bpf_prog_put(prog); + return ret; } static void perf_event_free_bpf_prog(struct perf_event *event) { - struct bpf_prog *prog; - if (event->attr.type != PERF_TYPE_TRACEPOINT) { perf_event_free_bpf_handler(event); return; } - - prog = event->tp_event->prog; - if (prog && event->tp_event->bpf_prog_owner == event) { - event->tp_event->prog = NULL; - bpf_prog_put(prog); - } + perf_event_detach_bpf_prog(event); } #else diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 3126da2f468a..b65011d320e3 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -17,7 +17,7 @@ /** * trace_call_bpf - invoke BPF program - * @prog: BPF program + * @call: tracepoint event * @ctx: opaque context pointer * * kprobe handlers execute BPF programs via this helper. 
@@ -29,7 +29,7 @@ * 1 - store kprobe event into ring buffer * Other values are reserved and currently alias to 1 */ -unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx) +unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) { unsigned int ret; @@ -49,9 +49,22 @@ unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx) goto out; } - rcu_read_lock(); - ret = BPF_PROG_RUN(prog, ctx); - rcu_read_unlock(); + /* + * Instead of moving rcu_read_lock/rcu_dereference/rcu_read_unlock + * to all call sites, we did a bpf_prog_array_valid() there to check + * whether call->prog_array is empty or not, which is + * a heuristic to speed up execution. + * + * If the prog_array fetched by bpf_prog_array_valid() was + * non-NULL, we go into trace_call_bpf() and do the actual + * proper rcu_dereference() under RCU lock. + * If it turns out that prog_array is NULL, we bail out. + * In the opposite case, if the fetched pointer was NULL, we skip + * the prog_array, at the risk of missing events if it was updated + * between this check and the rcu_dereference(); that is an + * accepted risk. + */ + ret = BPF_PROG_RUN_ARRAY_CHECK(call->prog_array, ctx, BPF_PROG_RUN); out: __this_cpu_dec(bpf_prog_active); @@ -741,3 +754,62 @@ const struct bpf_verifier_ops perf_event_verifier_ops = { const struct bpf_prog_ops perf_event_prog_ops = { }; + +static DEFINE_MUTEX(bpf_event_mutex); + +int perf_event_attach_bpf_prog(struct perf_event *event, + struct bpf_prog *prog) +{ + struct bpf_prog_array __rcu *old_array; + struct bpf_prog_array *new_array; + int ret = -EEXIST; + + mutex_lock(&bpf_event_mutex); + + if (event->prog) + goto out; + + old_array = rcu_dereference_protected(event->tp_event->prog_array, + lockdep_is_held(&bpf_event_mutex)); + ret = bpf_prog_array_copy(old_array, NULL, prog, &new_array); + if (ret < 0) + goto out; + + /* set the new array to event->tp_event and set event->prog */ + event->prog = prog; + rcu_assign_pointer(event->tp_event->prog_array, new_array); + bpf_prog_array_free(old_array); + +out: + mutex_unlock(&bpf_event_mutex); + return ret; +} + +void perf_event_detach_bpf_prog(struct perf_event *event) +{ + struct bpf_prog_array __rcu *old_array; + struct bpf_prog_array *new_array; + int ret; + + mutex_lock(&bpf_event_mutex); + + if (!event->prog) + goto out; + + old_array = rcu_dereference_protected(event->tp_event->prog_array, + lockdep_is_held(&bpf_event_mutex)); + + ret = bpf_prog_array_copy(old_array, event->prog, NULL, &new_array); + if (ret < 0) { + bpf_prog_array_delete_safe(old_array, event->prog); + } else { + rcu_assign_pointer(event->tp_event->prog_array, new_array); + bpf_prog_array_free(old_array); + } + + bpf_prog_put(event->prog); + event->prog = NULL; + +out: + mutex_unlock(&bpf_event_mutex); +} diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 8a907e12b6b9..abf92e478cfb 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1174,13 +1174,12 @@ static void kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) { struct trace_event_call *call = &tk->tp.call; - struct bpf_prog *prog = call->prog; struct kprobe_trace_entry_head *entry; struct hlist_head *head; int size, __size, dsize; int rctx; - if (prog && !trace_call_bpf(prog, regs)) + if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs)) return; head = this_cpu_ptr(call->perf_events); @@ -1210,13 +1209,12 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, struct pt_regs *regs) {
struct trace_event_call *call = &tk->tp.call; - struct bpf_prog *prog = call->prog; struct kretprobe_trace_entry_head *entry; struct hlist_head *head; int size, __size, dsize; int rctx; - if (prog && !trace_call_bpf(prog, regs)) + if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs)) return; head = this_cpu_ptr(call->perf_events); diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 696afe72d3b1..71a6af34d7a9 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -559,9 +559,10 @@ static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls); static int sys_perf_refcount_enter; static int sys_perf_refcount_exit; -static int perf_call_bpf_enter(struct bpf_prog *prog, struct pt_regs *regs, - struct syscall_metadata *sys_data, - struct syscall_trace_enter *rec) { +static int perf_call_bpf_enter(struct trace_event_call *call, struct pt_regs *regs, + struct syscall_metadata *sys_data, + struct syscall_trace_enter *rec) +{ struct syscall_tp_t { unsigned long long regs; unsigned long syscall_nr; @@ -573,7 +574,7 @@ static int perf_call_bpf_enter(struct bpf_prog *prog, struct pt_regs *regs, param.syscall_nr = rec->nr; for (i = 0; i < sys_data->nb_args; i++) param.args[i] = rec->args[i]; - return trace_call_bpf(prog, &param); + return trace_call_bpf(call, &param); } static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) @@ -581,7 +582,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) struct syscall_metadata *sys_data; struct syscall_trace_enter *rec; struct hlist_head *head; - struct bpf_prog *prog; + bool valid_prog_array; int syscall_nr; int rctx; int size; @@ -596,9 +597,9 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) if (!sys_data) return; - prog = READ_ONCE(sys_data->enter_event->prog); head = this_cpu_ptr(sys_data->enter_event->perf_events); - if (!prog && hlist_empty(head)) + valid_prog_array = bpf_prog_array_valid(sys_data->enter_event); + if (!valid_prog_array && hlist_empty(head)) return; /* get the size after alignment with the u32 buffer size field */ @@ -614,7 +615,8 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) syscall_get_arguments(current, regs, 0, sys_data->nb_args, (unsigned long *)&rec->args); - if ((prog && !perf_call_bpf_enter(prog, regs, sys_data, rec)) || + if ((valid_prog_array && + !perf_call_bpf_enter(sys_data->enter_event, regs, sys_data, rec)) || hlist_empty(head)) { perf_swevent_put_recursion_context(rctx); return; @@ -659,8 +661,9 @@ static void perf_sysenter_disable(struct trace_event_call *call) mutex_unlock(&syscall_trace_lock); } -static int perf_call_bpf_exit(struct bpf_prog *prog, struct pt_regs *regs, - struct syscall_trace_exit *rec) { +static int perf_call_bpf_exit(struct trace_event_call *call, struct pt_regs *regs, + struct syscall_trace_exit *rec) +{ struct syscall_tp_t { unsigned long long regs; unsigned long syscall_nr; @@ -670,7 +673,7 @@ static int perf_call_bpf_exit(struct bpf_prog *prog, struct pt_regs *regs, *(struct pt_regs **)&param = regs; param.syscall_nr = rec->nr; param.ret = rec->ret; - return trace_call_bpf(prog, &param); + return trace_call_bpf(call, &param); } static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) @@ -678,7 +681,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) struct syscall_metadata *sys_data; struct syscall_trace_exit *rec; struct hlist_head *head; - struct bpf_prog *prog; + bool valid_prog_array; int
syscall_nr; int rctx; int size; @@ -693,9 +696,9 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) if (!sys_data) return; - prog = READ_ONCE(sys_data->exit_event->prog); head = this_cpu_ptr(sys_data->exit_event->perf_events); - if (!prog && hlist_empty(head)) + valid_prog_array = bpf_prog_array_valid(sys_data->exit_event); + if (!valid_prog_array && hlist_empty(head)) return; /* We can probably do that at build time */ @@ -709,7 +712,8 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) rec->nr = syscall_nr; rec->ret = syscall_get_return_value(current, regs); - if ((prog && !perf_call_bpf_exit(prog, regs, rec)) || + if ((valid_prog_array && + !perf_call_bpf_exit(sys_data->exit_event, regs, rec)) || hlist_empty(head)) { perf_swevent_put_recursion_context(rctx); return; diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 4525e0271a53..153c0e411461 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1113,13 +1113,12 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, { struct trace_event_call *call = &tu->tp.call; struct uprobe_trace_entry_head *entry; - struct bpf_prog *prog = call->prog; struct hlist_head *head; void *data; int size, esize; int rctx; - if (prog && !trace_call_bpf(prog, regs)) + if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs)) return; esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); -- cgit v1.2.3 From ef5201c83d1400570a3b6f004ad7a23d71934411 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 24 Oct 2017 13:54:20 +0800 Subject: bonding: remove rtmsg_ifinfo called after bond_lower_state_changed After the patch 'rtnetlink: bring NETDEV_CHANGELOWERSTATE event process back to rtnetlink_event', bond_lower_state_changed would generate a NETDEV_CHANGELOWERSTATE event, which would send a notification to userspace in rtnetlink_event. There's no need to call rtmsg_ifinfo to send the notification any more. So this patch removes it from these places after bond_lower_state_changed. Besides, after this, rtmsg_ifinfo no longer needs to be exported. Signed-off-by: Xin Long Signed-off-by: David S.
Miller --- include/net/bonding.h | 4 ---- net/core/rtnetlink.c | 1 - 2 files changed, 5 deletions(-) (limited to 'include') diff --git a/include/net/bonding.h b/include/net/bonding.h index 2860cc66c2bb..f801fc940b29 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -330,7 +330,6 @@ static inline void bond_set_active_slave(struct slave *slave) slave->backup = 0; bond_queue_slave_event(slave); bond_lower_state_changed(slave); - rtmsg_ifinfo(RTM_NEWLINK, slave->dev, 0, GFP_ATOMIC); } } @@ -340,7 +339,6 @@ static inline void bond_set_backup_slave(struct slave *slave) slave->backup = 1; bond_queue_slave_event(slave); bond_lower_state_changed(slave); - rtmsg_ifinfo(RTM_NEWLINK, slave->dev, 0, GFP_ATOMIC); } } @@ -353,7 +351,6 @@ static inline void bond_set_slave_state(struct slave *slave, slave->backup = slave_state; if (notify) { bond_lower_state_changed(slave); - rtmsg_ifinfo(RTM_NEWLINK, slave->dev, 0, GFP_ATOMIC); bond_queue_slave_event(slave); slave->should_notify = 0; } else { @@ -385,7 +382,6 @@ static inline void bond_slave_state_notify(struct bonding *bond) bond_for_each_slave(bond, tmp, iter) { if (tmp->should_notify) { bond_lower_state_changed(tmp); - rtmsg_ifinfo(RTM_NEWLINK, tmp->dev, 0, GFP_ATOMIC); tmp->should_notify = 0; } } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 854a848842ea..de24d394c69e 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2989,7 +2989,6 @@ void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, { rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags, NULL); } -EXPORT_SYMBOL(rtmsg_ifinfo); void rtmsg_ifinfo_newnet(int type, struct net_device *dev, unsigned int change, gfp_t flags, int *new_nsid) -- cgit v1.2.3 From 9c3b57518363577d4e2ea1964ef4fa03e100beaa Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 24 Oct 2017 01:45:31 -0700 Subject: net: sctp: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Cc: Vlad Yasevich Cc: Neil Horman Cc: "David S. Miller" Cc: linux-sctp@vger.kernel.org Cc: netdev@vger.kernel.org Signed-off-by: Kees Cook Signed-off-by: David S. 
Miller --- include/net/sctp/sm.h | 10 +++--- net/sctp/associola.c | 3 +- net/sctp/protocol.c | 7 ++-- net/sctp/sm_sideeffect.c | 85 +++++++++++++++++++++++++++++------------------- net/sctp/transport.c | 13 +++----- 5 files changed, 66 insertions(+), 52 deletions(-) (limited to 'include') diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h index 2db3d3a9ce1d..13cc4963e905 100644 --- a/include/net/sctp/sm.h +++ b/include/net/sctp/sm.h @@ -72,7 +72,7 @@ typedef enum sctp_disposition (sctp_state_fn_t) ( const union sctp_subtype type, void *arg, struct sctp_cmd_seq *commands); -typedef void (sctp_timer_event_t) (unsigned long); +typedef void (sctp_timer_event_t) (struct timer_list *); struct sctp_sm_table_entry { sctp_state_fn_t *fn; const char *name; @@ -314,10 +314,10 @@ int sctp_do_sm(struct net *net, enum sctp_event event_type, void *event_arg, gfp_t gfp); /* 2nd level prototypes */ -void sctp_generate_t3_rtx_event(unsigned long peer); -void sctp_generate_heartbeat_event(unsigned long peer); -void sctp_generate_reconf_event(unsigned long peer); -void sctp_generate_proto_unreach_event(unsigned long peer); +void sctp_generate_t3_rtx_event(struct timer_list *t); +void sctp_generate_heartbeat_event(struct timer_list *t); +void sctp_generate_reconf_event(struct timer_list *t); +void sctp_generate_proto_unreach_event(struct timer_list *t); void sctp_ootb_pkt_free(struct sctp_packet *packet); diff --git a/net/sctp/associola.c b/net/sctp/associola.c index dfb9651e818b..69394f4d6091 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -149,8 +149,7 @@ static struct sctp_association *sctp_association_init( /* Initializes the timers */ for (i = SCTP_EVENT_TIMEOUT_NONE; i < SCTP_NUM_TIMEOUT_TYPES; ++i) - setup_timer(&asoc->timers[i], sctp_timer_events[i], - (unsigned long)asoc); + timer_setup(&asoc->timers[i], sctp_timer_events[i], 0); /* Pull default initialization values from the sock options. * Note: This assumes that the values have already been diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index fcd80feb293f..f5172c21349b 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -622,9 +622,9 @@ static void sctp_v4_ecn_capable(struct sock *sk) INET_ECN_xmit(sk); } -static void sctp_addr_wq_timeout_handler(unsigned long arg) +static void sctp_addr_wq_timeout_handler(struct timer_list *t) { - struct net *net = (struct net *)arg; + struct net *net = from_timer(net, t, sctp.addr_wq_timer); struct sctp_sockaddr_entry *addrw, *temp; struct sctp_sock *sp; @@ -1304,8 +1304,7 @@ static int __net_init sctp_defaults_init(struct net *net) INIT_LIST_HEAD(&net->sctp.auto_asconf_splist); spin_lock_init(&net->sctp.addr_wq_lock); net->sctp.addr_wq_timer.expires = 0; - setup_timer(&net->sctp.addr_wq_timer, sctp_addr_wq_timeout_handler, - (unsigned long)net); + timer_setup(&net->sctp.addr_wq_timer, sctp_addr_wq_timeout_handler, 0); return 0; diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index 402bfbb888cd..1c2699b424af 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -243,9 +243,10 @@ nomem: /* When the T3-RTX timer expires, it calls this function to create the * relevant state machine event. 
*/ -void sctp_generate_t3_rtx_event(unsigned long peer) +void sctp_generate_t3_rtx_event(struct timer_list *t) { - struct sctp_transport *transport = (struct sctp_transport *) peer; + struct sctp_transport *transport = + from_timer(transport, t, T3_rtx_timer); struct sctp_association *asoc = transport->asoc; struct sock *sk = asoc->base.sk; struct net *net = sock_net(sk); @@ -319,50 +320,63 @@ out_unlock: sctp_association_put(asoc); } -static void sctp_generate_t1_cookie_event(unsigned long data) +static void sctp_generate_t1_cookie_event(struct timer_list *t) { - struct sctp_association *asoc = (struct sctp_association *) data; + struct sctp_association *asoc = + from_timer(asoc, t, timers[SCTP_EVENT_TIMEOUT_T1_COOKIE]); + sctp_generate_timeout_event(asoc, SCTP_EVENT_TIMEOUT_T1_COOKIE); } -static void sctp_generate_t1_init_event(unsigned long data) +static void sctp_generate_t1_init_event(struct timer_list *t) { - struct sctp_association *asoc = (struct sctp_association *) data; + struct sctp_association *asoc = + from_timer(asoc, t, timers[SCTP_EVENT_TIMEOUT_T1_INIT]); + sctp_generate_timeout_event(asoc, SCTP_EVENT_TIMEOUT_T1_INIT); } -static void sctp_generate_t2_shutdown_event(unsigned long data) +static void sctp_generate_t2_shutdown_event(struct timer_list *t) { - struct sctp_association *asoc = (struct sctp_association *) data; + struct sctp_association *asoc = + from_timer(asoc, t, timers[SCTP_EVENT_TIMEOUT_T2_SHUTDOWN]); + sctp_generate_timeout_event(asoc, SCTP_EVENT_TIMEOUT_T2_SHUTDOWN); } -static void sctp_generate_t4_rto_event(unsigned long data) +static void sctp_generate_t4_rto_event(struct timer_list *t) { - struct sctp_association *asoc = (struct sctp_association *) data; + struct sctp_association *asoc = + from_timer(asoc, t, timers[SCTP_EVENT_TIMEOUT_T4_RTO]); + sctp_generate_timeout_event(asoc, SCTP_EVENT_TIMEOUT_T4_RTO); } -static void sctp_generate_t5_shutdown_guard_event(unsigned long data) +static void sctp_generate_t5_shutdown_guard_event(struct timer_list *t) { - struct sctp_association *asoc = (struct sctp_association *)data; + struct sctp_association *asoc = + from_timer(asoc, t, + timers[SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD]); + sctp_generate_timeout_event(asoc, SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD); } /* sctp_generate_t5_shutdown_guard_event() */ -static void sctp_generate_autoclose_event(unsigned long data) +static void sctp_generate_autoclose_event(struct timer_list *t) { - struct sctp_association *asoc = (struct sctp_association *) data; + struct sctp_association *asoc = + from_timer(asoc, t, timers[SCTP_EVENT_TIMEOUT_AUTOCLOSE]); + sctp_generate_timeout_event(asoc, SCTP_EVENT_TIMEOUT_AUTOCLOSE); } /* Generate a heart beat event. If the sock is busy, reschedule. Make * sure that the transport is still valid. */ -void sctp_generate_heartbeat_event(unsigned long data) +void sctp_generate_heartbeat_event(struct timer_list *t) { - struct sctp_transport *transport = (struct sctp_transport *) data; + struct sctp_transport *transport = from_timer(transport, t, hb_timer); struct sctp_association *asoc = transport->asoc; struct sock *sk = asoc->base.sk; struct net *net = sock_net(sk); @@ -405,9 +419,10 @@ out_unlock: /* Handle the timeout of the ICMP protocol unreachable timer. Trigger * the correct state machine transition that will close the association. 
*/ -void sctp_generate_proto_unreach_event(unsigned long data) +void sctp_generate_proto_unreach_event(struct timer_list *t) { - struct sctp_transport *transport = (struct sctp_transport *)data; + struct sctp_transport *transport = + from_timer(transport, t, proto_unreach_timer); struct sctp_association *asoc = transport->asoc; struct sock *sk = asoc->base.sk; struct net *net = sock_net(sk); @@ -439,9 +454,10 @@ out_unlock: } /* Handle the timeout of the RE-CONFIG timer. */ -void sctp_generate_reconf_event(unsigned long data) +void sctp_generate_reconf_event(struct timer_list *t) { - struct sctp_transport *transport = (struct sctp_transport *)data; + struct sctp_transport *transport = + from_timer(transport, t, reconf_timer); struct sctp_association *asoc = transport->asoc; struct sock *sk = asoc->base.sk; struct net *net = sock_net(sk); @@ -471,24 +487,27 @@ out_unlock: } /* Inject a SACK Timeout event into the state machine. */ -static void sctp_generate_sack_event(unsigned long data) +static void sctp_generate_sack_event(struct timer_list *t) { - struct sctp_association *asoc = (struct sctp_association *)data; + struct sctp_association *asoc = + from_timer(asoc, t, timers[SCTP_EVENT_TIMEOUT_SACK]); + sctp_generate_timeout_event(asoc, SCTP_EVENT_TIMEOUT_SACK); } sctp_timer_event_t *sctp_timer_events[SCTP_NUM_TIMEOUT_TYPES] = { - NULL, - sctp_generate_t1_cookie_event, - sctp_generate_t1_init_event, - sctp_generate_t2_shutdown_event, - NULL, - sctp_generate_t4_rto_event, - sctp_generate_t5_shutdown_guard_event, - NULL, - NULL, - sctp_generate_sack_event, - sctp_generate_autoclose_event, + [SCTP_EVENT_TIMEOUT_NONE] = NULL, + [SCTP_EVENT_TIMEOUT_T1_COOKIE] = sctp_generate_t1_cookie_event, + [SCTP_EVENT_TIMEOUT_T1_INIT] = sctp_generate_t1_init_event, + [SCTP_EVENT_TIMEOUT_T2_SHUTDOWN] = sctp_generate_t2_shutdown_event, + [SCTP_EVENT_TIMEOUT_T3_RTX] = NULL, + [SCTP_EVENT_TIMEOUT_T4_RTO] = sctp_generate_t4_rto_event, + [SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD] = + sctp_generate_t5_shutdown_guard_event, + [SCTP_EVENT_TIMEOUT_HEARTBEAT] = NULL, + [SCTP_EVENT_TIMEOUT_RECONF] = NULL, + [SCTP_EVENT_TIMEOUT_SACK] = sctp_generate_sack_event, + [SCTP_EVENT_TIMEOUT_AUTOCLOSE] = sctp_generate_autoclose_event, }; diff --git a/net/sctp/transport.c b/net/sctp/transport.c index 2d9bd3776bc8..1e5a22430cf5 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -87,14 +87,11 @@ static struct sctp_transport *sctp_transport_init(struct net *net, INIT_LIST_HEAD(&peer->send_ready); INIT_LIST_HEAD(&peer->transports); - setup_timer(&peer->T3_rtx_timer, sctp_generate_t3_rtx_event, - (unsigned long)peer); - setup_timer(&peer->hb_timer, sctp_generate_heartbeat_event, - (unsigned long)peer); - setup_timer(&peer->reconf_timer, sctp_generate_reconf_event, - (unsigned long)peer); - setup_timer(&peer->proto_unreach_timer, - sctp_generate_proto_unreach_event, (unsigned long)peer); + timer_setup(&peer->T3_rtx_timer, sctp_generate_t3_rtx_event, 0); + timer_setup(&peer->hb_timer, sctp_generate_heartbeat_event, 0); + timer_setup(&peer->reconf_timer, sctp_generate_reconf_event, 0); + timer_setup(&peer->proto_unreach_timer, + sctp_generate_proto_unreach_event, 0); /* Initialize the 64-bit random nonce sent with heartbeat. 
*/ get_random_bytes(&peer->hb_nonce, sizeof(peer->hb_nonce)); -- cgit v1.2.3 From fc8bcaa05160528d56432e4612f522e3ceafc513 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 24 Oct 2017 01:45:48 -0700 Subject: net: LLC: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Cc: "David S. Miller" Cc: Eric Dumazet Cc: Hans Liljestrand Cc: "Paul E. McKenney" Cc: "Reshetova, Elena" Cc: netdev@vger.kernel.org Signed-off-by: Kees Cook Signed-off-by: David S. Miller --- include/net/llc_c_ac.h | 8 ++++---- net/llc/llc_c_ac.c | 27 +++++++++++++++++---------- net/llc/llc_conn.c | 12 ++++-------- 3 files changed, 25 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/include/net/llc_c_ac.h b/include/net/llc_c_ac.h index f3be818e73c1..e766300b3e99 100644 --- a/include/net/llc_c_ac.h +++ b/include/net/llc_c_ac.h @@ -171,10 +171,10 @@ int llc_conn_ac_rst_sendack_flag(struct sock *sk, struct sk_buff *skb); int llc_conn_ac_send_i_rsp_as_ack(struct sock *sk, struct sk_buff *skb); int llc_conn_ac_send_i_as_ack(struct sock *sk, struct sk_buff *skb); -void llc_conn_busy_tmr_cb(unsigned long timeout_data); -void llc_conn_pf_cycle_tmr_cb(unsigned long timeout_data); -void llc_conn_ack_tmr_cb(unsigned long timeout_data); -void llc_conn_rej_tmr_cb(unsigned long timeout_data); +void llc_conn_busy_tmr_cb(struct timer_list *t); +void llc_conn_pf_cycle_tmr_cb(struct timer_list *t); +void llc_conn_ack_tmr_cb(struct timer_list *t); +void llc_conn_rej_tmr_cb(struct timer_list *t); void llc_conn_set_p_flag(struct sock *sk, u8 value); #endif /* LLC_C_AC_H */ diff --git a/net/llc/llc_c_ac.c b/net/llc/llc_c_ac.c index ea225bd2672c..f59648018060 100644 --- a/net/llc/llc_c_ac.c +++ b/net/llc/llc_c_ac.c @@ -1318,9 +1318,8 @@ static int llc_conn_ac_inc_vs_by_1(struct sock *sk, struct sk_buff *skb) return 0; } -static void llc_conn_tmr_common_cb(unsigned long timeout_data, u8 type) +static void llc_conn_tmr_common_cb(struct sock *sk, u8 type) { - struct sock *sk = (struct sock *)timeout_data; struct sk_buff *skb = alloc_skb(0, GFP_ATOMIC); bh_lock_sock(sk); @@ -1334,24 +1333,32 @@ static void llc_conn_tmr_common_cb(unsigned long timeout_data, u8 type) bh_unlock_sock(sk); } -void llc_conn_pf_cycle_tmr_cb(unsigned long timeout_data) +void llc_conn_pf_cycle_tmr_cb(struct timer_list *t) { - llc_conn_tmr_common_cb(timeout_data, LLC_CONN_EV_TYPE_P_TMR); + struct llc_sock *llc = from_timer(llc, t, pf_cycle_timer.timer); + + llc_conn_tmr_common_cb(&llc->sk, LLC_CONN_EV_TYPE_P_TMR); } -void llc_conn_busy_tmr_cb(unsigned long timeout_data) +void llc_conn_busy_tmr_cb(struct timer_list *t) { - llc_conn_tmr_common_cb(timeout_data, LLC_CONN_EV_TYPE_BUSY_TMR); + struct llc_sock *llc = from_timer(llc, t, busy_state_timer.timer); + + llc_conn_tmr_common_cb(&llc->sk, LLC_CONN_EV_TYPE_BUSY_TMR); } -void llc_conn_ack_tmr_cb(unsigned long timeout_data) +void llc_conn_ack_tmr_cb(struct timer_list *t) { - llc_conn_tmr_common_cb(timeout_data, LLC_CONN_EV_TYPE_ACK_TMR); + struct llc_sock *llc = from_timer(llc, t, ack_timer.timer); + + llc_conn_tmr_common_cb(&llc->sk, LLC_CONN_EV_TYPE_ACK_TMR); } -void llc_conn_rej_tmr_cb(unsigned long timeout_data) +void llc_conn_rej_tmr_cb(struct timer_list *t) { - llc_conn_tmr_common_cb(timeout_data, LLC_CONN_EV_TYPE_REJ_TMR); + struct llc_sock *llc = from_timer(llc, t, rej_sent_timer.timer); + + llc_conn_tmr_common_cb(&llc->sk, 
LLC_CONN_EV_TYPE_REJ_TMR); } int llc_conn_ac_rst_vs(struct sock *sk, struct sk_buff *skb) diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c index 5e91b47f0d2a..9177dbb16dce 100644 --- a/net/llc/llc_conn.c +++ b/net/llc/llc_conn.c @@ -902,20 +902,16 @@ static void llc_sk_init(struct sock *sk) llc->inc_cntr = llc->dec_cntr = 2; llc->dec_step = llc->connect_step = 1; - setup_timer(&llc->ack_timer.timer, llc_conn_ack_tmr_cb, - (unsigned long)sk); + timer_setup(&llc->ack_timer.timer, llc_conn_ack_tmr_cb, 0); llc->ack_timer.expire = sysctl_llc2_ack_timeout; - setup_timer(&llc->pf_cycle_timer.timer, llc_conn_pf_cycle_tmr_cb, - (unsigned long)sk); + timer_setup(&llc->pf_cycle_timer.timer, llc_conn_pf_cycle_tmr_cb, 0); llc->pf_cycle_timer.expire = sysctl_llc2_p_timeout; - setup_timer(&llc->rej_sent_timer.timer, llc_conn_rej_tmr_cb, - (unsigned long)sk); + timer_setup(&llc->rej_sent_timer.timer, llc_conn_rej_tmr_cb, 0); llc->rej_sent_timer.expire = sysctl_llc2_rej_timeout; - setup_timer(&llc->busy_state_timer.timer, llc_conn_busy_tmr_cb, - (unsigned long)sk); + timer_setup(&llc->busy_state_timer.timer, llc_conn_busy_tmr_cb, 0); llc->busy_state_timer.expire = sysctl_llc2_busy_timeout; llc->n2 = 2; /* max retransmit */ -- cgit v1.2.3 From 88ca59d1aaf28c25b47a9f933090e480ba6dc92a Mon Sep 17 00:00:00 2001 From: Girish Moodalbail Date: Wed, 25 Oct 2017 12:26:43 -0700 Subject: macvlan: remove unused fields in struct macvlan_dev commit 635b8c8ecdd2 ("tap: Renaming tap related APIs, data structures, macros") captured all the tap related fields into a new struct tap_dev. However, it failed to remove those fields from struct macvlan_dev. Those fields are currently unused and must be removed. While there I moved the comment for MAX_TAP_QUEUES to the right place. Fixes: 635b8c8ecdd27142 (tap: Renaming tap related APIs, data structures, macros) Signed-off-by: Girish Moodalbail Signed-off-by: David S. Miller --- include/linux/if_macvlan.h | 15 --------------- include/linux/if_tap.h | 4 ++++ 2 files changed, 4 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h index 10e319f41fb1..e13b369df02b 100644 --- a/include/linux/if_macvlan.h +++ b/include/linux/if_macvlan.h @@ -10,13 +10,6 @@ #include struct macvlan_port; -struct macvtap_queue; - -/* - * Maximum times a macvtap device can be opened. This can be used to - * configure the number of receive queue, e.g. for multiqueue virtio. - */ -#define MAX_TAP_QUEUES 256 #define MACVLAN_MC_FILTER_BITS 8 #define MACVLAN_MC_FILTER_SZ (1 << MACVLAN_MC_FILTER_BITS) @@ -35,14 +28,6 @@ struct macvlan_dev { netdev_features_t set_features; enum macvlan_mode mode; u16 flags; - /* This array tracks active taps. */ - struct tap_queue __rcu *taps[MAX_TAP_QUEUES]; - /* This list tracks all taps (both enabled and disabled) */ - struct list_head queue_list; - int numvtaps; - int numqueues; - netdev_features_t tap_features; - int minor; int nest_level; #ifdef CONFIG_NET_POLL_CONTROLLER struct netpoll *netpoll; diff --git a/include/linux/if_tap.h b/include/linux/if_tap.h index 4837157da0dc..d1b5173ad8f0 100644 --- a/include/linux/if_tap.h +++ b/include/linux/if_tap.h @@ -22,6 +22,10 @@ static inline struct skb_array *tap_get_skb_array(struct file *f) #include #include +/* + * Maximum times a tap device can be opened. This can be used to + * configure the number of receive queue, e.g. for multiqueue virtio. 
+ */ #define MAX_TAP_QUEUES 256 struct tap_queue; -- cgit v1.2.3 From 60e2a7780793bae0debc275a9ccd57f7da0cf195 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Wed, 25 Oct 2017 11:01:45 +0200 Subject: tcp: TCP experimental option for SMC The SMC protocol [1] relies on the use of a new TCP experimental option [2, 3]. With this option, SMC capabilities are exchanged between peers during the TCP three way handshake. This patch adds support for this experimental option to TCP. References: [1] SMC-R Informational RFC: http://www.rfc-editor.org/info/rfc7609 [2] Shared Use of TCP Experimental Options RFC 6994: https://tools.ietf.org/rfc/rfc6994.txt [3] IANA ExID SMCR: http://www.iana.org/assignments/tcp-parameters/tcp-parameters.xhtml#tcp-exids Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- include/linux/tcp.h | 9 +++++-- include/net/inet_sock.h | 3 ++- include/net/tcp.h | 7 ++++++ net/ipv4/tcp.c | 6 +++++ net/ipv4/tcp_input.c | 35 +++++++++++++++++++++++++++ net/ipv4/tcp_minisocks.c | 19 +++++++++++++++ net/ipv4/tcp_output.c | 63 +++++++++++++++++++++++++++++++++++++++++++++--- 7 files changed, 136 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 173a7c2f9636..8c431385b272 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -98,7 +98,8 @@ struct tcp_options_received { tstamp_ok : 1, /* TIMESTAMP seen on SYN packet */ dsack : 1, /* D-SACK is scheduled */ wscale_ok : 1, /* Wscale seen on SYN packet */ - sack_ok : 4, /* SACK seen on SYN packet */ + sack_ok : 3, /* SACK seen on SYN packet */ + smc_ok : 1, /* SMC seen on SYN packet */ snd_wscale : 4, /* Window scaling received from sender */ rcv_wscale : 4; /* Window scaling to send to receiver */ u8 num_sacks; /* Number of SACK blocks */ @@ -110,6 +111,9 @@ static inline void tcp_clear_options(struct tcp_options_received *rx_opt) { rx_opt->tstamp_ok = rx_opt->sack_ok = 0; rx_opt->wscale_ok = rx_opt->snd_wscale = 0; +#if IS_ENABLED(CONFIG_SMC) + rx_opt->smc_ok = 0; +#endif } /* This is the max number of SACKS that we'll generate and process. It's safe @@ -229,7 +233,8 @@ struct tcp_sock { syn_fastopen_ch:1, /* Active TFO re-enabling probe */ syn_data_acked:1,/* data in SYN is acked by SYN-ACK */ save_syn:1, /* Save headers of SYN packet */ - is_cwnd_limited:1;/* forward progress limited by snd_cwnd? */ + is_cwnd_limited:1,/* forward progress limited by snd_cwnd? */ + syn_smc:1; /* SYN includes SMC */ u32 tlp_high_seq; /* snd_nxt at the time of TLP retransmit. */ /* RTT measurement */ diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 425752f768d2..c49938d1481a 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -92,7 +92,8 @@ struct inet_request_sock { wscale_ok : 1, ecn_ok : 1, acked : 1, - no_srccheck: 1; + no_srccheck: 1, + smc_ok : 1; kmemcheck_bitfield_end(flags); u32 ir_mark; union { diff --git a/include/net/tcp.h b/include/net/tcp.h index 2392f74074e7..285bc82dea41 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -191,6 +191,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); * experimental options. See draft-ietf-tcpm-experimental-options-00.txt */ #define TCPOPT_FASTOPEN_MAGIC 0xF989 +#define TCPOPT_SMC_MAGIC 0xE2D4C3D9 /* * TCP option lengths @@ -203,6 +204,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); #define TCPOLEN_MD5SIG 18 #define TCPOLEN_FASTOPEN_BASE 2 #define TCPOLEN_EXP_FASTOPEN_BASE 4 +#define TCPOLEN_EXP_SMC_BASE 6 /* But this is what stacks really send out. 
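*/

The new option rides the shared TCP experimental option from RFC 6994: kind 254 (TCPOPT_EXP), a length byte, and a four-byte ExID. As a hedged illustration of the bytes smc_options_write() further down in this patch ends up emitting on a SYN (the array name is invented; the values come straight from this patch):

	static const unsigned char smc_syn_opt[8] = {
		0x01,			/* TCPOPT_NOP (padding)              */
		0x01,			/* TCPOPT_NOP (padding)              */
		0xfe,			/* TCPOPT_EXP: experimental kind 254 */
		0x06,			/* TCPOLEN_EXP_SMC_BASE: kind+len+ExID */
		0xe2, 0xd4, 0xc3, 0xd9,	/* TCPOPT_SMC_MAGIC: ExID "SMCR" in EBCDIC */
	};

The two leading NOPs pad the option out to the 8 bytes accounted for by TCPOLEN_EXP_SMC_BASE_ALIGNED, and the receive side (smc_parse_options(), also below) accepts it by matching the same magic with get_unaligned_be32().

/* But this is what stacks really send out.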
*/ #define TCPOLEN_TSTAMP_ALIGNED 12 @@ -213,6 +215,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); #define TCPOLEN_SACK_PERBLOCK 8 #define TCPOLEN_MD5SIG_ALIGNED 20 #define TCPOLEN_MSS_ALIGNED 4 +#define TCPOLEN_EXP_SMC_BASE_ALIGNED 8 /* Flags in tp->nonagle */ #define TCP_NAGLE_OFF 1 /* Nagle's algo is disabled */ @@ -2108,4 +2111,8 @@ static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk) { return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN) == 1); } + +#if IS_ENABLED(CONFIG_SMC) +extern struct static_key_false tcp_have_smc; +#endif #endif /* _TCP_H */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 8f36277e82e9..f6e1c00e300e 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -270,6 +270,7 @@ #include #include #include +#include #include #include @@ -302,6 +303,11 @@ EXPORT_SYMBOL(sysctl_tcp_wmem); atomic_long_t tcp_memory_allocated; /* Current allocated memory. */ EXPORT_SYMBOL(tcp_memory_allocated); +#if IS_ENABLED(CONFIG_SMC) +DEFINE_STATIC_KEY_FALSE(tcp_have_smc); +EXPORT_SYMBOL(tcp_have_smc); +#endif + /* * Current number of TCP sockets. */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 893286db4623..337f6011528a 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -76,6 +76,8 @@ #include #include #include +#include +#include int sysctl_tcp_fack __read_mostly; int sysctl_tcp_max_reordering __read_mostly = 300; @@ -3737,6 +3739,21 @@ static void tcp_parse_fastopen_option(int len, const unsigned char *cookie, foc->exp = exp_opt; } +static void smc_parse_options(const struct tcphdr *th, + struct tcp_options_received *opt_rx, + const unsigned char *ptr, + int opsize) +{ +#if IS_ENABLED(CONFIG_SMC) + if (static_branch_unlikely(&tcp_have_smc)) { + if (th->syn && !(opsize & 1) && + opsize >= TCPOLEN_EXP_SMC_BASE && + get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC) + opt_rx->smc_ok = 1; + } +#endif +} + /* Look for tcp options. Normally only called on SYN and SYNACK packets. * But, this can also be called on packets in the established flow when * the fast version below fails. @@ -3844,6 +3861,9 @@ void tcp_parse_options(const struct net *net, tcp_parse_fastopen_option(opsize - TCPOLEN_EXP_FASTOPEN_BASE, ptr + 2, th->syn, foc, true); + else + smc_parse_options(th, opt_rx, ptr, + opsize); break; } @@ -5598,6 +5618,16 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, return false; } +static void smc_check_reset_syn(struct tcp_sock *tp) +{ +#if IS_ENABLED(CONFIG_SMC) + if (static_branch_unlikely(&tcp_have_smc)) { + if (tp->syn_smc && !tp->rx_opt.smc_ok) + tp->syn_smc = 0; + } +#endif +} + static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th) { @@ -5704,6 +5734,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, * is initialized. 
*/ tp->copied_seq = tp->rcv_nxt; + smc_check_reset_syn(tp); + smp_mb(); tcp_finish_connect(sk, skb); @@ -6157,6 +6189,9 @@ static void tcp_openreq_init(struct request_sock *req, ireq->ir_rmt_port = tcp_hdr(skb)->source; ireq->ir_num = ntohs(tcp_hdr(skb)->dest); ireq->ir_mark = inet_request_mark(sk, skb); +#if IS_ENABLED(CONFIG_SMC) + ireq->smc_ok = rx_opt->smc_ok; +#endif } struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops, diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index a952357054f4..056009f1c14f 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -416,6 +417,21 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) } EXPORT_SYMBOL_GPL(tcp_ca_openreq_child); +static void smc_check_reset_syn_req(struct tcp_sock *oldtp, + struct request_sock *req, + struct tcp_sock *newtp) +{ +#if IS_ENABLED(CONFIG_SMC) + struct inet_request_sock *ireq; + + if (static_branch_unlikely(&tcp_have_smc)) { + ireq = inet_rsk(req); + if (oldtp->syn_smc && !ireq->smc_ok) + newtp->syn_smc = 0; + } +#endif +} + /* This is not only more efficient than what we used to do, it eliminates * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM * @@ -433,6 +449,9 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, struct tcp_request_sock *treq = tcp_rsk(req); struct inet_connection_sock *newicsk = inet_csk(newsk); struct tcp_sock *newtp = tcp_sk(newsk); + struct tcp_sock *oldtp = tcp_sk(sk); + + smc_check_reset_syn_req(oldtp, req, newtp); /* Now setup tcp_sock */ newtp->pred_flags = 0; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 1f01f4c9c738..c8fc512e0bbb 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -41,6 +41,7 @@ #include #include #include +#include #include @@ -422,6 +423,22 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp) #define OPTION_MD5 (1 << 2) #define OPTION_WSCALE (1 << 3) #define OPTION_FAST_OPEN_COOKIE (1 << 8) +#define OPTION_SMC (1 << 9) + +static void smc_options_write(__be32 *ptr, u16 *options) +{ +#if IS_ENABLED(CONFIG_SMC) + if (static_branch_unlikely(&tcp_have_smc)) { + if (unlikely(OPTION_SMC & *options)) { + *ptr++ = htonl((TCPOPT_NOP << 24) | + (TCPOPT_NOP << 16) | + (TCPOPT_EXP << 8) | + (TCPOLEN_EXP_SMC_BASE)); + *ptr++ = htonl(TCPOPT_SMC_MAGIC); + } + } +#endif +} struct tcp_out_options { u16 options; /* bit field of OPTION_* */ @@ -540,6 +557,41 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, } ptr += (len + 3) >> 2; } + + smc_options_write(ptr, &options); +} + +static void smc_set_option(const struct tcp_sock *tp, + struct tcp_out_options *opts, + unsigned int *remaining) +{ +#if IS_ENABLED(CONFIG_SMC) + if (static_branch_unlikely(&tcp_have_smc)) { + if (tp->syn_smc) { + if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) { + opts->options |= OPTION_SMC; + *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED; + } + } + } +#endif +} + +static void smc_set_option_cond(const struct tcp_sock *tp, + const struct inet_request_sock *ireq, + struct tcp_out_options *opts, + unsigned int *remaining) +{ +#if IS_ENABLED(CONFIG_SMC) + if (static_branch_unlikely(&tcp_have_smc)) { + if (tp->syn_smc && ireq->smc_ok) { + if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) { + opts->options |= OPTION_SMC; + *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED; + } + } + } +#endif } /* Compute TCP options for SYN packets. 
This is not the final @@ -607,11 +659,14 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, } } + smc_set_option(tp, opts, &remaining); + return MAX_TCP_OPTION_SPACE - remaining; } /* Set up TCP options for SYN-ACKs. */ -static unsigned int tcp_synack_options(struct request_sock *req, +static unsigned int tcp_synack_options(const struct sock *sk, + struct request_sock *req, unsigned int mss, struct sk_buff *skb, struct tcp_out_options *opts, const struct tcp_md5sig_key *md5, @@ -667,6 +722,8 @@ static unsigned int tcp_synack_options(struct request_sock *req, } } + smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining); + return MAX_TCP_OPTION_SPACE - remaining; } @@ -3195,8 +3252,8 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req)); #endif skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4); - tcp_header_size = tcp_synack_options(req, mss, skb, &opts, md5, foc) + - sizeof(*th); + tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, md5, + foc) + sizeof(*th); skb_push(skb, tcp_header_size); skb_reset_transport_header(skb); -- cgit v1.2.3 From 32d18ab1d44166cbb1dcaf8b359183636734ddb1 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 24 Oct 2017 12:41:01 +0200 Subject: net: updating dst lastusage is an unlikely event. Since commit 0da4af00b2ed ("ipv6: only update __use and lastusetime once per jiffy at most"), updating the dst lastuse field is an unlikely action: it happens at most once per jiffy, out of potentially millions of calls per second. Mark explicitly the code as such, and let the compiler generate better code. Note: gcc 7.2 and several older versions do actually generate different - better - code when the unlikely() hint is in place, avoid jump in the fast path and keeping better code locality. Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- include/net/dst.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/dst.h b/include/net/dst.h index 5047e8053d6c..2f53ecc2c296 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -257,7 +257,7 @@ static inline void dst_hold(struct dst_entry *dst) static inline void dst_use_noref(struct dst_entry *dst, unsigned long time) { - if (time != dst->lastuse) { + if (unlikely(time != dst->lastuse)) { dst->__use++; dst->lastuse = time; } -- cgit v1.2.3 From 032cfd66afcc2dd2c7be89c71b020fcb15bcc37d Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 25 Oct 2017 03:53:59 -0700 Subject: drivers/net: wan/sdla: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Cc: Allen Pais Cc: "David S. Miller" Cc: Tobias Klauser Cc: netdev@vger.kernel.org Signed-off-by: Kees Cook Signed-off-by: David S. 
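Miller

Taken together with the SCTP and LLC patches above, this follows one mechanical conversion recipe, sketched below with a made-up struct foo and callback names (not taken from any of these drivers):

	#include <linux/timer.h>
	#include <linux/jiffies.h>

	struct foo {
		struct timer_list poll_timer;
		int polls;
	};

	/* Old scheme: the object is smuggled through an unsigned long. */
	static void foo_poll_old(unsigned long data)
	{
		struct foo *f = (struct foo *)data;

		f->polls++;
	}

	/* New scheme: the callback receives the timer itself and recovers
	 * the containing object with from_timer(), a container_of()
	 * wrapper keyed on the timer's field name. */
	static void foo_poll(struct timer_list *t)
	{
		struct foo *f = from_timer(f, t, poll_timer);

		f->polls++;
	}

	static void foo_start(struct foo *f)
	{
		/* was: setup_timer(&f->poll_timer, foo_poll_old, (unsigned long)f); */
		timer_setup(&f->poll_timer, foo_poll, 0);
		mod_timer(&f->poll_timer, jiffies + HZ);
	}

The cast goes away, but the timer must now be embedded in the object the callback wants, which is why the hunk below adds a dev back-pointer to struct frad_local: the callback lands on the frad_local and needs a way back to the net_device it used to be handed directly.

Signed-off-by: David S.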
Miller --- drivers/net/wan/sdla.c | 12 +++++------- include/linux/if_frad.h | 1 + 2 files changed, 6 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/net/wan/sdla.c b/drivers/net/wan/sdla.c index 0cc48902dbb9..57ed259c8208 100644 --- a/drivers/net/wan/sdla.c +++ b/drivers/net/wan/sdla.c @@ -927,13 +927,10 @@ static irqreturn_t sdla_isr(int dummy, void *dev_id) return IRQ_HANDLED; } -static void sdla_poll(unsigned long device) +static void sdla_poll(struct timer_list *t) { - struct net_device *dev; - struct frad_local *flp; - - dev = (struct net_device *) device; - flp = netdev_priv(dev); + struct frad_local *flp = from_timer(flp, t, timer); + struct net_device *dev = flp->dev; if (sdla_byte(dev, SDLA_502_RCV_BUF)) sdla_receive(dev); @@ -1616,8 +1613,9 @@ static void setup_sdla(struct net_device *dev) flp->assoc = sdla_assoc; flp->deassoc = sdla_deassoc; flp->dlci_conf = sdla_dlci_conf; + flp->dev = dev; - setup_timer(&flp->timer, sdla_poll, (unsigned long)dev); + timer_setup(&flp->timer, sdla_poll, 0); flp->timer.expires = 1; } diff --git a/include/linux/if_frad.h b/include/linux/if_frad.h index 46df7e565d6f..82a1b4e93570 100644 --- a/include/linux/if_frad.h +++ b/include/linux/if_frad.h @@ -83,6 +83,7 @@ struct frad_local /* fields that are used by the Sangoma SDLA cards */ struct timer_list timer; + struct net_device *dev; int type; /* adapter type */ int state; /* state of the S502/8 control latch */ int buffer; /* current buffer for S508 firmware */ -- cgit v1.2.3 From 2ae21cf527da0e5cf9d7ee14bd5b0909bb9d1a75 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Oct 2017 21:54:56 -0700 Subject: tcp: Namespace-ify sysctl_tcp_early_retrans Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - net/ipv4/sysctl_net_ipv4.c | 18 +++++++++--------- net/ipv4/tcp_input.c | 1 - net/ipv4/tcp_ipv4.c | 1 + net/ipv4/tcp_output.c | 4 +++- 6 files changed, 14 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 2c4222a5d102..a7f39e3ea666 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -128,6 +128,7 @@ struct netns_ipv4 { int sysctl_tcp_sack; int sysctl_tcp_window_scaling; int sysctl_tcp_timestamps; + int sysctl_tcp_early_retrans; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index 285bc82dea41..a12b71d4118b 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -265,7 +265,6 @@ extern int sysctl_tcp_workaround_signed_windows; extern int sysctl_tcp_slow_start_after_idle; extern int sysctl_tcp_thin_linear_timeouts; extern int sysctl_tcp_thin_dupack; -extern int sysctl_tcp_early_retrans; extern int sysctl_tcp_recovery; #define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */ diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 81d218346cf7..f0f650f020af 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -634,15 +634,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, - { - .procname = "tcp_early_retrans", - .data = &sysctl_tcp_early_retrans, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &four, - }, { .procname = "tcp_min_tso_segs", .data = &sysctl_tcp_min_tso_segs, @@ -1145,6 +1136,15 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, 
.proc_handler = proc_dointvec }, + { + .procname = "tcp_early_retrans", + .data = &init_net.ipv4.sysctl_tcp_early_retrans, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &four, + }, { } }; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 337f6011528a..7656b1e6d504 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -95,7 +95,6 @@ int sysctl_tcp_max_orphans __read_mostly = NR_FILE; int sysctl_tcp_frto __read_mostly = 2; int sysctl_tcp_min_rtt_wlen __read_mostly = 300; int sysctl_tcp_moderate_rcvbuf __read_mostly = 1; -int sysctl_tcp_early_retrans __read_mostly = 3; int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2; #define FLAG_DATA 0x01 /* Incoming frame contained data. */ diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 23a8100af5ad..7ab313f6768e 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2484,6 +2484,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_sack = 1; net->ipv4.sysctl_tcp_window_scaling = 1; net->ipv4.sysctl_tcp_timestamps = 1; + net->ipv4.sysctl_tcp_early_retrans = 3; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index c8fc512e0bbb..21713836d46a 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2435,6 +2435,7 @@ bool tcp_schedule_loss_probe(struct sock *sk) struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); u32 timeout, rto_delta_us; + int early_retrans; /* Don't do any loss probe on a Fast Open connection before 3WHS * finishes. @@ -2442,10 +2443,11 @@ bool tcp_schedule_loss_probe(struct sock *sk) if (tp->fastopen_rsk) return false; + early_retrans = sock_net(sk)->ipv4.sysctl_tcp_early_retrans; /* Schedule a loss probe in 2*RTT for SACK capable connections * in Open state, that are either limited by cwnd or application. */ - if ((sysctl_tcp_early_retrans != 3 && sysctl_tcp_early_retrans != 4) || + if ((early_retrans != 3 && early_retrans != 4) || !tp->packets_out || !tcp_is_sack(tp) || icsk->icsk_ca_state != TCP_CA_Open) return false; -- cgit v1.2.3 From e20223f1962831d1b1c416d59d259879d0639d68 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Oct 2017 21:54:57 -0700 Subject: tcp: Namespace-ify sysctl_tcp_recovery Signed-off-by: Eric Dumazet Signed-off-by: David S. 
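Miller

This is the first of a long run of patches applying the same mechanical recipe, condensed here into a sketch (the tcp_rack_detection_enabled() helper is invented for illustration):

	/* 1. The knob moves from a global int into per-netns state
	 *    (include/net/netns/ipv4.h): */
	struct netns_ipv4 {
		/* ... */
		int sysctl_tcp_recovery;
	};

	/* 2. The old static initializer becomes a per-namespace default
	 *    (net/ipv4/tcp_ipv4.c): */
	static int __net_init tcp_sk_init(struct net *net)
	{
		net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
		/* ... */
		return 0;
	}

	/* 3. Readers resolve the value through the socket's namespace
	 *    instead of dereferencing a global: */
	static bool tcp_rack_detection_enabled(const struct sock *sk)
	{
		return sock_net(sk)->ipv4.sysctl_tcp_recovery &
		       TCP_RACK_LOSS_DETECTION;
	}

The matching ctl_table entry moves from ipv4_table[] to ipv4_net_table[] and points at the init_net.ipv4 field, so every network namespace gets its own view under /proc/sys/net/ipv4/.

Signed-off-by: David S.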
Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 2 +- net/ipv4/sysctl_net_ipv4.c | 14 +++++++------- net/ipv4/tcp_input.c | 2 +- net/ipv4/tcp_ipv4.c | 1 + net/ipv4/tcp_recovery.c | 2 -- 6 files changed, 11 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index a7f39e3ea666..d6ed718075d4 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -129,6 +129,7 @@ struct netns_ipv4 { int sysctl_tcp_window_scaling; int sysctl_tcp_timestamps; int sysctl_tcp_early_retrans; + int sysctl_tcp_recovery; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index a12b71d4118b..c7f51534fc44 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -265,7 +265,7 @@ extern int sysctl_tcp_workaround_signed_windows; extern int sysctl_tcp_slow_start_after_idle; extern int sysctl_tcp_thin_linear_timeouts; extern int sysctl_tcp_thin_dupack; -extern int sysctl_tcp_recovery; + #define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */ extern int sysctl_tcp_limit_output_bytes; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index f0f650f020af..78019adcae87 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -449,13 +449,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, - { - .procname = "tcp_recovery", - .data = &sysctl_tcp_recovery, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, { .procname = "tcp_max_reordering", .data = &sysctl_tcp_max_reordering, @@ -1145,6 +1138,13 @@ static struct ctl_table ipv4_net_table[] = { .extra1 = &zero, .extra2 = &four, }, + { + .procname = "tcp_recovery", + .data = &init_net.ipv4.sysctl_tcp_recovery, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { } }; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 7656b1e6d504..5b2272dbf6a9 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2788,7 +2788,7 @@ static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag) struct tcp_sock *tp = tcp_sk(sk); /* Use RACK to detect loss */ - if (sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION) { + if (sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION) { u32 prior_retrans = tp->retrans_out; tcp_rack_mark_lost(sk); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 7ab313f6768e..517ff1948a71 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2485,6 +2485,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_window_scaling = 1; net->ipv4.sysctl_tcp_timestamps = 1; net->ipv4.sysctl_tcp_early_retrans = 3; + net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index cda6074a429a..d3603a9e24ea 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -1,8 +1,6 @@ #include #include -int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOSS_DETECTION; - static void tcp_rack_mark_skb_lost(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); -- cgit v1.2.3 From 2c04ac8ae0b61e0780a30b7069a11bb202b21f26 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Oct 2017 21:54:58 -0700 Subject: tcp: Namespace-ify sysctl_tcp_thin_linear_timeouts Note that sysctl_tcp_thin_dupack was not 
used, I deleted it. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 2 -- net/ipv4/sysctl_net_ipv4.c | 14 +++++++------- net/ipv4/tcp_timer.c | 4 +--- 4 files changed, 9 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index d6ed718075d4..2a9f37b39c45 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -130,6 +130,7 @@ struct netns_ipv4 { int sysctl_tcp_timestamps; int sysctl_tcp_early_retrans; int sysctl_tcp_recovery; + int sysctl_tcp_thin_linear_timeouts; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index c7f51534fc44..063a7a48b7fe 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -263,8 +263,6 @@ extern int sysctl_tcp_moderate_rcvbuf; extern int sysctl_tcp_tso_win_divisor; extern int sysctl_tcp_workaround_signed_windows; extern int sysctl_tcp_slow_start_after_idle; -extern int sysctl_tcp_thin_linear_timeouts; -extern int sysctl_tcp_thin_dupack; #define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */ diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 78019adcae87..12003214f4d8 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -620,13 +620,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_allowed_congestion_control, }, - { - .procname = "tcp_thin_linear_timeouts", - .data = &sysctl_tcp_thin_linear_timeouts, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, { .procname = "tcp_min_tso_segs", .data = &sysctl_tcp_min_tso_segs, @@ -1145,6 +1138,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "tcp_thin_linear_timeouts", + .data = &init_net.ipv4.sysctl_tcp_thin_linear_timeouts, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { } }; diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 804a8d34ce86..035a1ef1f2d8 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -22,8 +22,6 @@ #include #include -int sysctl_tcp_thin_linear_timeouts __read_mostly; - /** * tcp_write_err() - close socket and save error info * @sk: The socket the error has appeared on. @@ -522,7 +520,7 @@ out_reset_timer: * linear-timeout retransmissions into a black hole */ if (sk->sk_state == TCP_ESTABLISHED && - (tp->thin_lto || sysctl_tcp_thin_linear_timeouts) && + (tp->thin_lto || net->ipv4.sysctl_tcp_thin_linear_timeouts) && tcp_stream_is_thin(tp) && icsk->icsk_retransmits <= TCP_THIN_LINEAR_RETRIES) { icsk->icsk_backoff = 0; -- cgit v1.2.3 From b510f0d23a47c3d1f074fe583e7867dc4918fe02 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Oct 2017 21:54:59 -0700 Subject: tcp: Namespace-ify sysctl_tcp_slow_start_after_idle Signed-off-by: Eric Dumazet Signed-off-by: David S. 
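Miller

For context, the knob being namespaced here gates the RFC 2861 idle-restart logic: once a connection has been idle for more than an RTO, the congestion window is decayed before new data is sent. Roughly, as a simplified sketch of tcp_cwnd_restart() reconstructed from memory, using TCP_INIT_CWND in place of the dst-aware initial-window lookup:

	static void cwnd_restart_sketch(struct sock *sk, s32 delta)
	{
		struct tcp_sock *tp = tcp_sk(sk);
		u32 restart_cwnd = TCP_INIT_CWND;
		u32 cwnd = tp->snd_cwnd;

		/* Halve cwnd once per elapsed RTO of idle time, but never
		 * go below the restart window. */
		while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd)
			cwnd >>= 1;
		tp->snd_cwnd = max(cwnd, restart_cwnd);
		tp->snd_cwnd_stamp = tcp_jiffies32;
	}

Setting the sysctl to 0 skips this decay entirely, a decision that tcp_slow_start_after_idle_check() in the hunk below now makes per namespace.

Signed-off-by: David S.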
Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 3 +-- net/ipv4/sysctl_net_ipv4.c | 14 +++++++------- net/ipv4/tcp_ipv4.c | 1 + net/ipv4/tcp_output.c | 5 +---- 5 files changed, 11 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 2a9f37b39c45..8662692686b3 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -131,6 +131,7 @@ struct netns_ipv4 { int sysctl_tcp_early_retrans; int sysctl_tcp_recovery; int sysctl_tcp_thin_linear_timeouts; + int sysctl_tcp_slow_start_after_idle; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index 063a7a48b7fe..cc2ab522eb5c 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -262,7 +262,6 @@ extern int sysctl_tcp_nometrics_save; extern int sysctl_tcp_moderate_rcvbuf; extern int sysctl_tcp_tso_win_divisor; extern int sysctl_tcp_workaround_signed_windows; -extern int sysctl_tcp_slow_start_after_idle; #define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */ @@ -1308,7 +1307,7 @@ static inline void tcp_slow_start_after_idle_check(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); s32 delta; - if (!sysctl_tcp_slow_start_after_idle || tp->packets_out || + if (!sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle || tp->packets_out || ca_ops->cong_control) return; delta = tcp_jiffies32 - tp->lsndtime; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 12003214f4d8..40d69af8b363 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -571,13 +571,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, - { - .procname = "tcp_slow_start_after_idle", - .data = &sysctl_tcp_slow_start_after_idle, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, #ifdef CONFIG_NETLABEL { .procname = "cipso_cache_enable", @@ -1145,6 +1138,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "tcp_slow_start_after_idle", + .data = &init_net.ipv4.sysctl_tcp_slow_start_after_idle, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { } }; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 517ff1948a71..cea63a4b5965 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2486,6 +2486,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_timestamps = 1; net->ipv4.sysctl_tcp_early_retrans = 3; net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION; + net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */ net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 21713836d46a..bdc288a06f94 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -62,9 +62,6 @@ int sysctl_tcp_limit_output_bytes __read_mostly = 262144; */ int sysctl_tcp_tso_win_divisor __read_mostly = 3; -/* By default, RFC2861 behavior. 
*/ -int sysctl_tcp_slow_start_after_idle __read_mostly = 1; - static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, int push_one, gfp_t gfp); @@ -1690,7 +1687,7 @@ static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited) if (tp->packets_out > tp->snd_cwnd_used) tp->snd_cwnd_used = tp->packets_out; - if (sysctl_tcp_slow_start_after_idle && + if (sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle && (s32)(tcp_jiffies32 - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto && !ca_ops->cong_control) tcp_cwnd_application_limited(sk); -- cgit v1.2.3 From e0a1e5b519236dc1662ff25e42560dd1be9e3776 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Oct 2017 21:55:00 -0700 Subject: tcp: Namespace-ify sysctl_tcp_retrans_collapse Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - net/ipv4/sysctl_net_ipv4.c | 14 +++++++------- net/ipv4/tcp_ipv4.c | 2 +- net/ipv4/tcp_output.c | 5 +---- 5 files changed, 10 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 8662692686b3..b28c172b10e4 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -132,6 +132,7 @@ struct netns_ipv4 { int sysctl_tcp_recovery; int sysctl_tcp_thin_linear_timeouts; int sysctl_tcp_slow_start_after_idle; + int sysctl_tcp_retrans_collapse; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index cc2ab522eb5c..33cc86355b8f 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -243,7 +243,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); /* sysctl variables for tcp */ -extern int sysctl_tcp_retrans_collapse; extern int sysctl_tcp_stdurg; extern int sysctl_tcp_rfc1337; extern int sysctl_tcp_abort_on_overflow; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 40d69af8b363..533b92ad39dd 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -386,13 +386,6 @@ static int proc_tcp_available_ulp(struct ctl_table *ctl, } static struct ctl_table ipv4_table[] = { - { - .procname = "tcp_retrans_collapse", - .data = &sysctl_tcp_retrans_collapse, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, { .procname = "tcp_max_orphans", .data = &sysctl_tcp_max_orphans, @@ -1145,6 +1138,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "tcp_retrans_collapse", + .data = &init_net.ipv4.sysctl_tcp_retrans_collapse, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { } }; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index cea63a4b5965..2bc6ba2059d3 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2487,7 +2487,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_early_retrans = 3; net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION; net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. 
*/ - + net->ipv4.sysctl_tcp_retrans_collapse = 1; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index bdc288a06f94..55a0aa4b96df 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -45,9 +45,6 @@ #include -/* People can turn this off for buggy TCP's found in printers etc. */ -int sysctl_tcp_retrans_collapse __read_mostly = 1; - /* People can turn this on to work with those rare, broken TCPs that * interpret the window field as a signed quantity. */ @@ -2804,7 +2801,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to, struct sk_buff *skb = to, *tmp; bool first = true; - if (!sysctl_tcp_retrans_collapse) + if (!sock_net(sk)->ipv4.sysctl_tcp_retrans_collapse) return; if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) return; -- cgit v1.2.3 From 3f4c7c6f6a9053493ce7dd8a0f17ed8eafc53893 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Oct 2017 21:55:01 -0700 Subject: tcp: Namespace-ify sysctl_tcp_stdurg Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - net/ipv4/sysctl_net_ipv4.c | 14 +++++++------- net/ipv4/tcp_input.c | 3 +-- 4 files changed, 9 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index b28c172b10e4..ffa2cf3dc747 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -133,6 +133,7 @@ struct netns_ipv4 { int sysctl_tcp_thin_linear_timeouts; int sysctl_tcp_slow_start_after_idle; int sysctl_tcp_retrans_collapse; + int sysctl_tcp_stdurg; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index 33cc86355b8f..cf3fac7008d7 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -243,7 +243,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); /* sysctl variables for tcp */ -extern int sysctl_tcp_stdurg; extern int sysctl_tcp_rfc1337; extern int sysctl_tcp_abort_on_overflow; extern int sysctl_tcp_max_orphans; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 533b92ad39dd..a34bb75815c1 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -400,13 +400,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, - { - .procname = "tcp_stdurg", - .data = &sysctl_tcp_stdurg, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, { .procname = "tcp_rfc1337", .data = &sysctl_tcp_rfc1337, @@ -1145,6 +1138,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "tcp_stdurg", + .data = &init_net.ipv4.sysctl_tcp_stdurg, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { } }; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 5b2272dbf6a9..14b06963c102 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -89,7 +89,6 @@ EXPORT_SYMBOL(sysctl_tcp_adv_win_scale); /* rfc5961 challenge ack rate limiting */ int sysctl_tcp_challenge_ack_limit = 1000; -int sysctl_tcp_stdurg __read_mostly; int sysctl_tcp_rfc1337 __read_mostly; int sysctl_tcp_max_orphans __read_mostly = NR_FILE; int sysctl_tcp_frto __read_mostly = 2; @@ -5123,7 +5122,7 @@ static void tcp_check_urg(struct sock *sk, const struct tcphdr *th) struct tcp_sock *tp = 
tcp_sk(sk); u32 ptr = ntohs(th->urg_ptr); - if (ptr && !sysctl_tcp_stdurg) + if (ptr && !sock_net(sk)->ipv4.sysctl_tcp_stdurg) ptr--; ptr += ntohl(th->seq); -- cgit v1.2.3 From 625357aa175c688d219da43c8cfaa2e1629e0e1a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Oct 2017 21:55:02 -0700 Subject: tcp: Namespace-ify sysctl_tcp_rfc1337 Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - net/ipv4/sysctl_net_ipv4.c | 14 +++++++------- net/ipv4/tcp_input.c | 1 - net/ipv4/tcp_minisocks.c | 2 +- 5 files changed, 9 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index ffa2cf3dc747..968edce38eb5 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -134,6 +134,7 @@ struct netns_ipv4 { int sysctl_tcp_slow_start_after_idle; int sysctl_tcp_retrans_collapse; int sysctl_tcp_stdurg; + int sysctl_tcp_rfc1337; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index cf3fac7008d7..2aea2b3373b3 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -243,7 +243,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); /* sysctl variables for tcp */ -extern int sysctl_tcp_rfc1337; extern int sysctl_tcp_abort_on_overflow; extern int sysctl_tcp_max_orphans; extern int sysctl_tcp_fack; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index a34bb75815c1..832e554235df 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -400,13 +400,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, - { - .procname = "tcp_rfc1337", - .data = &sysctl_tcp_rfc1337, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, { .procname = "inet_peer_threshold", .data = &inet_peer_threshold, @@ -1145,6 +1138,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "tcp_rfc1337", + .data = &init_net.ipv4.sysctl_tcp_rfc1337, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { } }; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 14b06963c102..64fde81b0eb7 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -89,7 +89,6 @@ EXPORT_SYMBOL(sysctl_tcp_adv_win_scale); /* rfc5961 challenge ack rate limiting */ int sysctl_tcp_challenge_ack_limit = 1000; -int sysctl_tcp_rfc1337 __read_mostly; int sysctl_tcp_max_orphans __read_mostly = NR_FILE; int sysctl_tcp_frto __read_mostly = 2; int sysctl_tcp_min_rtt_wlen __read_mostly = 300; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 056009f1c14f..11836667763c 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -181,7 +181,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, * Oh well... nobody has a sufficient solution to this * protocol bug yet. */ - if (sysctl_tcp_rfc1337 == 0) { + if (twsk_net(tw)->ipv4.sysctl_tcp_rfc1337 == 0) { kill: inet_twsk_deschedule_put(tw); return TCP_TW_SUCCESS; -- cgit v1.2.3 From 65c9410cf55ecf32da1b720f563365d565d6289a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Oct 2017 21:55:03 -0700 Subject: tcp: Namespace-ify sysctl_tcp_abort_on_overflow Signed-off-by: Eric Dumazet Signed-off-by: David S. 
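Miller

For context: with the default tcp_rfc1337 = 0, a RST segment arriving in TIME-WAIT "assassinates" the timewait socket immediately (the kill path in the hunk below); setting the now per-namespace knob to 1 makes the stack ignore such RSTs and sit out the full TIME-WAIT period, which is the mitigation RFC 1337 recommends against TIME-WAIT assassination hazards.

Signed-off-by: David S.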
Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - net/ipv4/sysctl_net_ipv4.c | 14 +++++++------- net/ipv4/tcp_minisocks.c | 4 +--- 4 files changed, 9 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 968edce38eb5..3875fdf6b186 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -135,6 +135,7 @@ struct netns_ipv4 { int sysctl_tcp_retrans_collapse; int sysctl_tcp_stdurg; int sysctl_tcp_rfc1337; + int sysctl_tcp_abort_on_overflow; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index 2aea2b3373b3..7331281a2292 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -243,7 +243,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); /* sysctl variables for tcp */ -extern int sysctl_tcp_abort_on_overflow; extern int sysctl_tcp_max_orphans; extern int sysctl_tcp_fack; extern int sysctl_tcp_reordering; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 832e554235df..ffd1fd769bba 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -393,13 +393,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, - { - .procname = "tcp_abort_on_overflow", - .data = &sysctl_tcp_abort_on_overflow, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, { .procname = "inet_peer_threshold", .data = &inet_peer_threshold, @@ -1145,6 +1138,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "tcp_abort_on_overflow", + .data = &init_net.ipv4.sysctl_tcp_abort_on_overflow, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { } }; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 11836667763c..3674d63170b2 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -29,8 +29,6 @@ #include #include -int sysctl_tcp_abort_on_overflow __read_mostly; - static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) { if (seq == s_win) @@ -783,7 +781,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, return inet_csk_complete_hashdance(sk, child, req, own_req); listen_overflow: - if (!sysctl_tcp_abort_on_overflow) { + if (!sock_net(sk)->ipv4.sysctl_tcp_abort_on_overflow) { inet_rsk(req)->acked = 1; return NULL; } -- cgit v1.2.3 From 0bc65a28ae2aeb14aab7f4a930e0d8cf4cad9dc4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Oct 2017 21:55:04 -0700 Subject: tcp: Namespace-ify sysctl_tcp_fack Signed-off-by: Eric Dumazet Signed-off-by: David S. 
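Miller

The behavioral difference is worth spelling out: with the default of 0, a connection completing its handshake against a full accept queue has its final ACK silently ignored (the inet_rsk(req)->acked = 1 path in the hunk below), so the client retransmits and may get through once the listener drains the queue; with the knob set to 1, the stack instead answers with a RST and aborts the connection immediately.

Signed-off-by: David S.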
Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - net/ipv4/sysctl_net_ipv4.c | 14 +++++++------- net/ipv4/tcp.c | 2 +- net/ipv4/tcp_input.c | 3 +-- net/ipv4/tcp_minisocks.c | 2 +- 6 files changed, 11 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 3875fdf6b186..f0e792beeea9 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -136,6 +136,7 @@ struct netns_ipv4 { int sysctl_tcp_stdurg; int sysctl_tcp_rfc1337; int sysctl_tcp_abort_on_overflow; + int sysctl_tcp_fack; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index 7331281a2292..e7b15e9f6e28 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -244,7 +244,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); /* sysctl variables for tcp */ extern int sysctl_tcp_max_orphans; -extern int sysctl_tcp_fack; extern int sysctl_tcp_reordering; extern int sysctl_tcp_max_reordering; extern int sysctl_tcp_dsack; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index ffd1fd769bba..1f23be13ce7b 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -414,13 +414,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { - .procname = "tcp_fack", - .data = &sysctl_tcp_fack, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, { .procname = "tcp_max_reordering", .data = &sysctl_tcp_max_reordering, @@ -1145,6 +1138,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "tcp_fack", + .data = &init_net.ipv4.sysctl_tcp_fack, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { } }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f6e1c00e300e..c7c983f0f817 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2517,7 +2517,7 @@ static int tcp_repair_options_est(struct sock *sk, return -EINVAL; tp->rx_opt.sack_ok |= TCP_SACK_SEEN; - if (sysctl_tcp_fack) + if (sock_net(sk)->ipv4.sysctl_tcp_fack) tcp_enable_fack(tp); break; case TCPOPT_TIMESTAMP: diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 64fde81b0eb7..c5b94460793f 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -79,7 +79,6 @@ #include #include -int sysctl_tcp_fack __read_mostly; int sysctl_tcp_max_reordering __read_mostly = 300; int sysctl_tcp_dsack __read_mostly = 1; int sysctl_tcp_app_win __read_mostly = 31; @@ -5720,7 +5719,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tp->tcp_header_len = sizeof(struct tcphdr); } - if (tcp_is_sack(tp) && sysctl_tcp_fack) + if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_fack) tcp_enable_fack(tp); tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 3674d63170b2..3270ab8416ce 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -510,7 +510,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->rx_opt.tstamp_ok = ireq->tstamp_ok; if ((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) { - if (sysctl_tcp_fack) + if (sock_net(sk)->ipv4.sysctl_tcp_fack) tcp_enable_fack(newtp); } newtp->window_clamp = req->rsk_window_clamp; -- cgit v1.2.3 From 773d4bb96ceca6829ae9928f6b002b93e2e62cdc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Oct 2017 21:55:05 -0700 Subject: tcp: remove stale 
sysctl_tcp_reordering This extern is no longer used. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index e7b15e9f6e28..fc134ba74c7d 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -244,7 +244,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); /* sysctl variables for tcp */ extern int sysctl_tcp_max_orphans; -extern int sysctl_tcp_reordering; extern int sysctl_tcp_max_reordering; extern int sysctl_tcp_dsack; extern long sysctl_tcp_mem[3]; -- cgit v1.2.3 From c6e218035913e14952b04ceecf1a543205106fdb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Oct 2017 21:55:06 -0700 Subject: tcp: Namespace-ify sysctl_tcp_max_reordering Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - net/ipv4/sysctl_net_ipv4.c | 14 +++++++------- net/ipv4/tcp_input.c | 3 +-- net/ipv4/tcp_ipv4.c | 2 ++ 5 files changed, 11 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index f0e792beeea9..3f6844665a2f 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -137,6 +137,7 @@ struct netns_ipv4 { int sysctl_tcp_rfc1337; int sysctl_tcp_abort_on_overflow; int sysctl_tcp_fack; + int sysctl_tcp_max_reordering; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index fc134ba74c7d..8cd286226a1e 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -244,7 +244,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); /* sysctl variables for tcp */ extern int sysctl_tcp_max_orphans; -extern int sysctl_tcp_max_reordering; extern int sysctl_tcp_dsack; extern long sysctl_tcp_mem[3]; extern int sysctl_tcp_wmem[3]; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 1f23be13ce7b..18cd228a2069 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -414,13 +414,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { - .procname = "tcp_max_reordering", - .data = &sysctl_tcp_max_reordering, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, { .procname = "tcp_dsack", .data = &sysctl_tcp_dsack, @@ -1145,6 +1138,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "tcp_max_reordering", + .data = &init_net.ipv4.sysctl_tcp_max_reordering, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { } }; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index c5b94460793f..c118657f06ee 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -79,7 +79,6 @@ #include #include -int sysctl_tcp_max_reordering __read_mostly = 300; int sysctl_tcp_dsack __read_mostly = 1; int sysctl_tcp_app_win __read_mostly = 31; int sysctl_tcp_adv_win_scale __read_mostly = 1; @@ -889,7 +888,7 @@ static void tcp_update_reordering(struct sock *sk, const int metric, return; if (metric > tp->reordering) { - tp->reordering = min(sysctl_tcp_max_reordering, metric); + tp->reordering = min(sock_net(sk)->ipv4.sysctl_tcp_max_reordering, metric); #if FASTRETRANS_DEBUG > 1 pr_debug("Disorder%d %d %u f%u s%u rr%d\n", diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 2bc6ba2059d3..c379a242abb3 100644 --- a/net/ipv4/tcp_ipv4.c 
+++ b/net/ipv4/tcp_ipv4.c @@ -2488,6 +2488,8 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION; net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */ net->ipv4.sysctl_tcp_retrans_collapse = 1; + net->ipv4.sysctl_tcp_max_reordering = 300; + net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60; -- cgit v1.2.3 From 6496f6bde0c323fba5e8c5b5cbf3a7bf28dad7ed Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Oct 2017 21:55:07 -0700 Subject: tcp: Namespace-ify sysctl_tcp_dsack Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - net/ipv4/sysctl_net_ipv4.c | 14 +++++++------- net/ipv4/tcp_input.c | 5 ++--- net/ipv4/tcp_ipv4.c | 1 + 5 files changed, 11 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 3f6844665a2f..956957a77db9 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -138,6 +138,7 @@ struct netns_ipv4 { int sysctl_tcp_abort_on_overflow; int sysctl_tcp_fack; int sysctl_tcp_max_reordering; + int sysctl_tcp_dsack; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index 8cd286226a1e..8b2ae3e8d79f 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -244,7 +244,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); /* sysctl variables for tcp */ extern int sysctl_tcp_max_orphans; -extern int sysctl_tcp_dsack; extern long sysctl_tcp_mem[3]; extern int sysctl_tcp_wmem[3]; extern int sysctl_tcp_rmem[3]; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 18cd228a2069..7652a9c2a65d 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -414,13 +414,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { - .procname = "tcp_dsack", - .data = &sysctl_tcp_dsack, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, { .procname = "tcp_mem", .maxlen = sizeof(sysctl_tcp_mem), @@ -1145,6 +1138,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "tcp_dsack", + .data = &init_net.ipv4.sysctl_tcp_dsack, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { } }; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index c118657f06ee..fd77037ac800 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -79,7 +79,6 @@ #include #include -int sysctl_tcp_dsack __read_mostly = 1; int sysctl_tcp_app_win __read_mostly = 31; int sysctl_tcp_adv_win_scale __read_mostly = 1; EXPORT_SYMBOL(sysctl_tcp_adv_win_scale); @@ -4150,7 +4149,7 @@ static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq) { struct tcp_sock *tp = tcp_sk(sk); - if (tcp_is_sack(tp) && sysctl_tcp_dsack) { + if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) { int mib_idx; if (before(seq, tp->rcv_nxt)) @@ -4185,7 +4184,7 @@ static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb) NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST); tcp_enter_quickack_mode(sk); - if (tcp_is_sack(tp) && sysctl_tcp_dsack) { + if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) { u32 end_seq = TCP_SKB_CB(skb)->end_seq; if 
(after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index c379a242abb3..d9d4d191e8f3 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2489,6 +2489,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */ net->ipv4.sysctl_tcp_retrans_collapse = 1; net->ipv4.sysctl_tcp_max_reordering = 300; + net->ipv4.sysctl_tcp_dsack = 1; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); -- cgit v1.2.3 From 0c12654ac6d9004b9538b2a969b2b59e9a5ed831 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Oct 2017 21:55:08 -0700 Subject: tcp: Namespace-ify sysctl_tcp_app_win Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - net/ipv4/sysctl_net_ipv4.c | 14 +++++++------- net/ipv4/tcp_input.c | 8 ++++---- net/ipv4/tcp_ipv4.c | 1 + 5 files changed, 13 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 956957a77db9..63f91d52cbc0 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -139,6 +139,7 @@ struct netns_ipv4 { int sysctl_tcp_fack; int sysctl_tcp_max_reordering; int sysctl_tcp_dsack; + int sysctl_tcp_app_win; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index 8b2ae3e8d79f..7aa3d65062a1 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -247,7 +247,6 @@ extern int sysctl_tcp_max_orphans; extern long sysctl_tcp_mem[3]; extern int sysctl_tcp_wmem[3]; extern int sysctl_tcp_rmem[3]; -extern int sysctl_tcp_app_win; extern int sysctl_tcp_adv_win_scale; extern int sysctl_tcp_frto; extern int sysctl_tcp_nometrics_save; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 7652a9c2a65d..e057788834a9 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -437,13 +437,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = &one, }, - { - .procname = "tcp_app_win", - .data = &sysctl_tcp_app_win, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, { .procname = "tcp_adv_win_scale", .data = &sysctl_tcp_adv_win_scale, @@ -1145,6 +1138,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "tcp_app_win", + .data = &init_net.ipv4.sysctl_tcp_app_win, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { } }; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index fd77037ac800..6af4b58ac6d5 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -79,7 +79,6 @@ #include #include -int sysctl_tcp_app_win __read_mostly = 31; int sysctl_tcp_adv_win_scale __read_mostly = 1; EXPORT_SYMBOL(sysctl_tcp_adv_win_scale); @@ -428,6 +427,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk) */ void tcp_init_buffer_space(struct sock *sk) { + int tcp_app_win = sock_net(sk)->ipv4.sysctl_tcp_app_win; struct tcp_sock *tp = tcp_sk(sk); int maxwin; @@ -446,14 +446,14 @@ void tcp_init_buffer_space(struct sock *sk) if (tp->window_clamp >= maxwin) { tp->window_clamp = maxwin; - if (sysctl_tcp_app_win && maxwin > 4 * tp->advmss) + if (tcp_app_win && maxwin > 4 * tp->advmss) tp->window_clamp = max(maxwin - - (maxwin >> sysctl_tcp_app_win), + (maxwin >> tcp_app_win), 4 * tp->advmss); } /* 
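(editor's aside, not part of this hunk: with the default tcp_app_win of 31, maxwin >> 31 is zero for any realistic window, so the branch above leaves window_clamp at essentially maxwin and only the one-segment reservation below has an effect; a smaller setting such as 2 would instead hold back a quarter of the window, maxwin >> 2, for application-side buffering) */ /*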
Force reservation of one segment. */ - if (sysctl_tcp_app_win && + if (tcp_app_win && tp->window_clamp > 2 * tp->advmss && tp->window_clamp + tp->advmss > maxwin) tp->window_clamp = max(2 * tp->advmss, maxwin - tp->advmss); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index d9d4d191e8f3..189664ebd28e 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2490,6 +2490,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_retrans_collapse = 1; net->ipv4.sysctl_tcp_max_reordering = 300; net->ipv4.sysctl_tcp_dsack = 1; + net->ipv4.sysctl_tcp_app_win = 31; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); -- cgit v1.2.3 From 94f0893e0c27219f4a726932618505aab6795973 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Oct 2017 21:55:09 -0700 Subject: tcp: Namespace-ify sysctl_tcp_adv_win_scale Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 9 ++++----- net/ipv4/sysctl_net_ipv4.c | 18 +++++++++--------- net/ipv4/tcp_input.c | 13 +++++-------- net/ipv4/tcp_ipv4.c | 1 + 5 files changed, 20 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 63f91d52cbc0..9dbb07d4eff4 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -140,6 +140,7 @@ struct netns_ipv4 { int sysctl_tcp_max_reordering; int sysctl_tcp_dsack; int sysctl_tcp_app_win; + int sysctl_tcp_adv_win_scale; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index 7aa3d65062a1..0dc27cd24899 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -247,7 +247,6 @@ extern int sysctl_tcp_max_orphans; extern long sysctl_tcp_mem[3]; extern int sysctl_tcp_wmem[3]; extern int sysctl_tcp_rmem[3]; -extern int sysctl_tcp_adv_win_scale; extern int sysctl_tcp_frto; extern int sysctl_tcp_nometrics_save; extern int sysctl_tcp_moderate_rcvbuf; @@ -1311,9 +1310,9 @@ void tcp_select_initial_window(int __space, __u32 mss, __u32 *rcv_wnd, __u32 *window_clamp, int wscale_ok, __u8 *rcv_wscale, __u32 init_rcv_wnd); -static inline int tcp_win_from_space(int space) +static inline int tcp_win_from_space(const struct sock *sk, int space) { - int tcp_adv_win_scale = sysctl_tcp_adv_win_scale; + int tcp_adv_win_scale = sock_net(sk)->ipv4.sysctl_tcp_adv_win_scale; return tcp_adv_win_scale <= 0 ? 
(space>>(-tcp_adv_win_scale)) : @@ -1323,13 +1322,13 @@ static inline int tcp_win_from_space(int space) /* Note: caller must be prepared to deal with negative returns */ static inline int tcp_space(const struct sock *sk) { - return tcp_win_from_space(sk->sk_rcvbuf - + return tcp_win_from_space(sk, sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc)); } static inline int tcp_full_space(const struct sock *sk) { - return tcp_win_from_space(sk->sk_rcvbuf); + return tcp_win_from_space(sk, sk->sk_rcvbuf); } extern void tcp_openreq_init_rwin(struct request_sock *req, diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index e057788834a9..a95123e1e7da 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -437,15 +437,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = &one, }, - { - .procname = "tcp_adv_win_scale", - .data = &sysctl_tcp_adv_win_scale, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &tcp_adv_win_scale_min, - .extra2 = &tcp_adv_win_scale_max, - }, { .procname = "tcp_frto", .data = &sysctl_tcp_frto, @@ -1145,6 +1136,15 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "tcp_adv_win_scale", + .data = &init_net.ipv4.sysctl_tcp_adv_win_scale, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &tcp_adv_win_scale_min, + .extra2 = &tcp_adv_win_scale_max, + }, { } }; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 6af4b58ac6d5..8ee2c84b0bc6 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -79,9 +79,6 @@ #include #include -int sysctl_tcp_adv_win_scale __read_mostly = 1; -EXPORT_SYMBOL(sysctl_tcp_adv_win_scale); - /* rfc5961 challenge ack rate limiting */ int sysctl_tcp_challenge_ack_limit = 1000; @@ -363,8 +360,8 @@ static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); /* Optimize this! */ - int truesize = tcp_win_from_space(skb->truesize) >> 1; - int window = tcp_win_from_space(sysctl_tcp_rmem[2]) >> 1; + int truesize = tcp_win_from_space(sk, skb->truesize) >> 1; + int window = tcp_win_from_space(sk, sysctl_tcp_rmem[2]) >> 1; while (tp->rcv_ssthresh <= window) { if (truesize <= skb->len) @@ -389,7 +386,7 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb) /* Check #2. Increase window, if skb with such overhead * will fit to rcvbuf in future. */ - if (tcp_win_from_space(skb->truesize) <= skb->len) + if (tcp_win_from_space(sk, skb->truesize) <= skb->len) incr = 2 * tp->advmss; else incr = __tcp_grow_window(sk, skb); @@ -630,7 +627,7 @@ void tcp_rcv_space_adjust(struct sock *sk) } rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER); - while (tcp_win_from_space(rcvmem) < tp->advmss) + while (tcp_win_from_space(sk, rcvmem) < tp->advmss) rcvmem += 128; rcvbuf = min(rcvwin / tp->advmss * rcvmem, sysctl_tcp_rmem[2]); @@ -4809,7 +4806,7 @@ restart: * overlaps to the next one. 
*/ if (!(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) && - (tcp_win_from_space(skb->truesize) > skb->len || + (tcp_win_from_space(sk, skb->truesize) > skb->len || before(TCP_SKB_CB(skb)->seq, start))) { end_of_skbs = false; break; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 189664ebd28e..1fe30fb99308 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2491,6 +2491,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_max_reordering = 300; net->ipv4.sysctl_tcp_dsack = 1; net->ipv4.sysctl_tcp_app_win = 31; + net->ipv4.sysctl_tcp_adv_win_scale = 1; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); -- cgit v1.2.3 From af9b69a7a6ca6b817e8d6f416e7aa5b2a5bf1d91 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Oct 2017 21:55:10 -0700 Subject: tcp: Namespace-ify sysctl_tcp_frto Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - net/ipv4/sysctl_net_ipv4.c | 14 +++++++------- net/ipv4/tcp_input.c | 3 +-- net/ipv4/tcp_ipv4.c | 1 + 5 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 9dbb07d4eff4..f4622e28db3a 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -141,6 +141,7 @@ struct netns_ipv4 { int sysctl_tcp_dsack; int sysctl_tcp_app_win; int sysctl_tcp_adv_win_scale; + int sysctl_tcp_frto; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index 0dc27cd24899..18f047501f53 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -247,7 +247,6 @@ extern int sysctl_tcp_max_orphans; extern long sysctl_tcp_mem[3]; extern int sysctl_tcp_wmem[3]; extern int sysctl_tcp_rmem[3]; -extern int sysctl_tcp_frto; extern int sysctl_tcp_nometrics_save; extern int sysctl_tcp_moderate_rcvbuf; extern int sysctl_tcp_tso_win_divisor; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index a95123e1e7da..f1bcb9b7e082 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -437,13 +437,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = &one, }, - { - .procname = "tcp_frto", - .data = &sysctl_tcp_frto, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, { .procname = "tcp_min_rtt_wlen", .data = &sysctl_tcp_min_rtt_wlen, @@ -1145,6 +1138,13 @@ static struct ctl_table ipv4_net_table[] = { .extra1 = &tcp_adv_win_scale_min, .extra2 = &tcp_adv_win_scale_max, }, + { + .procname = "tcp_frto", + .data = &init_net.ipv4.sysctl_tcp_frto, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { } }; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8ee2c84b0bc6..90d76f1c8f96 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -83,7 +83,6 @@ int sysctl_tcp_challenge_ack_limit = 1000; int sysctl_tcp_max_orphans __read_mostly = NR_FILE; -int sysctl_tcp_frto __read_mostly = 2; int sysctl_tcp_min_rtt_wlen __read_mostly = 300; int sysctl_tcp_moderate_rcvbuf __read_mostly = 1; int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2; @@ -2026,7 +2025,7 @@ void tcp_enter_loss(struct sock *sk) * falsely raise the receive window, which results in repeated * timeouts and stop-and-go behavior. 
*/ - tp->frto = sysctl_tcp_frto && + tp->frto = net->ipv4.sysctl_tcp_frto && (new_recovery || icsk->icsk_retransmits) && !inet_csk(sk)->icsk_mtup.probe_size; } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 1fe30fb99308..49757c758211 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2492,6 +2492,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_dsack = 1; net->ipv4.sysctl_tcp_app_win = 31; net->ipv4.sysctl_tcp_adv_win_scale = 1; + net->ipv4.sysctl_tcp_frto = 2; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); -- cgit v1.2.3 From 035226b964c820f65e201cdf123705a8f1d7c670 Mon Sep 17 00:00:00 2001 From: Gianluca Borello Date: Thu, 26 Oct 2017 01:47:42 +0000 Subject: bpf: remove tail_call and get_stackid helper declarations from bpf.h commit afdb09c720b6 ("security: bpf: Add LSM hooks for bpf object related syscall") included linux/bpf.h in linux/security.h. As a result, bpf programs including bpf_helpers.h and some other header that also ends up pulling in security.h, such as several examples under samples/bpf, fail to compile because bpf_tail_call and bpf_get_stackid are now "redefined as different kind of symbol". From bpf.h: u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5); u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); Whereas in bpf_helpers.h they are: static void (*bpf_tail_call)(void *ctx, void *map, int index); static int (*bpf_get_stackid)(void *ctx, void *map, int flags); Fix this by removing the unused declaration of bpf_tail_call and moving the declaration of bpf_get_stackid into bpf_trace.c, which is the only place where it's needed. Signed-off-by: Gianluca Borello Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 3 --- kernel/trace/bpf_trace.c | 2 ++ 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 172be7faf7ba..520aeebe0d93 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -231,9 +231,6 @@ struct bpf_event_entry { struct rcu_head rcu; }; -u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5); -u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); - bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp); int bpf_prog_calc_tag(struct bpf_prog *fp); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index b65011d320e3..136aa6bb0422 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -15,6 +15,8 @@ #include #include "trace.h" +u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); + /** * trace_call_bpf - invoke BPF program * @call: tracepoint event -- cgit v1.2.3 From 356c3e9afac0cc19c3d3b0cbc67106ce8efa0743 Mon Sep 17 00:00:00 2001 From: Egil Hjelmeland Date: Thu, 26 Oct 2017 11:00:48 +0200 Subject: net: dsa: lan9303: Move struct lan9303 to include/linux/dsa/lan9303.h The next patch requires net/dsa/tag_lan9303.c to access struct lan9303. Therefore move struct lan9303 definitions from drivers/net/dsa/lan9303.h to new file include/linux/dsa/lan9303.h. Signed-off-by: Egil Hjelmeland Reviewed-by: Andrew Lunn Signed-off-by: David S. 
Miller --- MAINTAINERS | 1 + drivers/net/dsa/lan9303.h | 34 +--------------------------------- include/linux/dsa/lan9303.h | 36 ++++++++++++++++++++++++++++++++++++ 3 files changed, 38 insertions(+), 33 deletions(-) create mode 100644 include/linux/dsa/lan9303.h (limited to 'include') diff --git a/MAINTAINERS b/MAINTAINERS index e3a7ca9d2783..c9ee7abf4627 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9415,6 +9415,7 @@ M: Florian Fainelli S: Maintained F: net/dsa/ F: include/net/dsa.h +F: include/linux/dsa/ F: drivers/net/dsa/ NETWORKING [GENERAL] diff --git a/drivers/net/dsa/lan9303.h b/drivers/net/dsa/lan9303.h index d807b1be35f2..b868e5040830 100644 --- a/drivers/net/dsa/lan9303.h +++ b/drivers/net/dsa/lan9303.h @@ -2,39 +2,7 @@ #include #include -struct lan9303; - -struct lan9303_phy_ops { - /* PHY 1 and 2 access*/ - int (*phy_read)(struct lan9303 *chip, int port, int regnum); - int (*phy_write)(struct lan9303 *chip, int port, - int regnum, u16 val); -}; - -#define LAN9303_NUM_ALR_RECORDS 512 -struct lan9303_alr_cache_entry { - u8 mac_addr[ETH_ALEN]; - u8 port_map; /* Bitmap of ports. Zero if unused entry */ - u8 stp_override; /* non zero if set ALR_DAT1_AGE_OVERRID */ -}; - -struct lan9303 { - struct device *dev; - struct regmap *regmap; - struct regmap_irq_chip_data *irq_data; - struct gpio_desc *reset_gpio; - u32 reset_duration; /* in [ms] */ - bool phy_addr_sel_strap; - struct dsa_switch *ds; - struct mutex indirect_mutex; /* protect indexed register access */ - const struct lan9303_phy_ops *ops; - bool is_bridged; /* true if port 1 and 2 are bridged */ - u32 swe_port_state; /* remember SWE_PORT_STATE while not bridged */ - /* LAN9303 do not offer reading specific ALR entry. Cache all - * static entries in a flat table - **/ - struct lan9303_alr_cache_entry alr_cache[LAN9303_NUM_ALR_RECORDS]; -}; +#include extern const struct regmap_access_table lan9303_register_set; extern const struct lan9303_phy_ops lan9303_indirect_phy_ops; diff --git a/include/linux/dsa/lan9303.h b/include/linux/dsa/lan9303.h new file mode 100644 index 000000000000..05d8d136baab --- /dev/null +++ b/include/linux/dsa/lan9303.h @@ -0,0 +1,36 @@ +/* Included by drivers/net/dsa/lan9303.h and net/dsa/tag_lan9303.c */ +#include + +struct lan9303; + +struct lan9303_phy_ops { + /* PHY 1 and 2 access*/ + int (*phy_read)(struct lan9303 *chip, int port, int regnum); + int (*phy_write)(struct lan9303 *chip, int port, + int regnum, u16 val); +}; + +#define LAN9303_NUM_ALR_RECORDS 512 +struct lan9303_alr_cache_entry { + u8 mac_addr[ETH_ALEN]; + u8 port_map; /* Bitmap of ports. Zero if unused entry */ + u8 stp_override; /* non zero if set ALR_DAT1_AGE_OVERRID */ +}; + +struct lan9303 { + struct device *dev; + struct regmap *regmap; + struct regmap_irq_chip_data *irq_data; + struct gpio_desc *reset_gpio; + u32 reset_duration; /* in [ms] */ + bool phy_addr_sel_strap; + struct dsa_switch *ds; + struct mutex indirect_mutex; /* protect indexed register access */ + const struct lan9303_phy_ops *ops; + bool is_bridged; /* true if port 1 and 2 are bridged */ + u32 swe_port_state; /* remember SWE_PORT_STATE while not bridged */ + /* LAN9303 do not offer reading specific ALR entry. 
Cache all + * static entries in a flat table + **/ + struct lan9303_alr_cache_entry alr_cache[LAN9303_NUM_ALR_RECORDS]; +}; -- cgit v1.2.3 From bff7b688d5b10a8cb8cecefdea5e255408b78f2f Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Thu, 26 Oct 2017 11:22:51 -0400 Subject: net: dsa: add dsa_is_unused_port helper As the comment above the chunk states, the b53 driver attempts to disable the unused ports. But using ds->enabled_port_mask is misleading, because this mask reports in fact the user ports. To avoid confusion and fix this, this patch introduces an explicit dsa_is_unused_port helper which ensures the corresponding bit is not masked in any of the switch port masks. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/dsa/b53/b53_common.c | 2 +- include/net/dsa.h | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index b48cf0487b43..c74a50112551 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -873,7 +873,7 @@ static int b53_setup(struct dsa_switch *ds) for (port = 0; port < dev->num_ports; port++) { if (dsa_is_cpu_port(ds, port)) b53_enable_cpu_port(dev, port); - else if (!(BIT(port) & ds->enabled_port_mask)) + else if (dsa_is_unused_port(ds, port)) b53_disable_port(ds, port, NULL); } diff --git a/include/net/dsa.h b/include/net/dsa.h index 38961ef91d3d..6b1bc1c8f7e2 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -254,6 +254,13 @@ struct dsa_switch { struct dsa_port ports[]; }; +static inline bool dsa_is_unused_port(struct dsa_switch *ds, int p) +{ + u32 m = ds->enabled_port_mask | ds->dsa_port_mask | ds->cpu_port_mask; + + return !(m & BIT(p)); +} + static inline bool dsa_is_cpu_port(struct dsa_switch *ds, int p) { return !!(ds->cpu_port_mask & (1 << p)); -- cgit v1.2.3 From deb8ee0b51204273c120a3b3848efbb5695ad658 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Thu, 26 Oct 2017 11:22:53 -0400 Subject: net: dsa: fix dsa_is_normal_port helper In order to know if a port is of type user, dsa_is_normal_port checks that the given port is not of type DSA nor CPU. This is not enough because a port can be unused. Without the previous fix, this caused the unused mv88e6xxx ports to be configured in normal mode. The ds->enabled_port_mask reports the user ports, so check this instead. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 6b1bc1c8f7e2..4ad432ad2d40 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -273,7 +273,7 @@ static inline bool dsa_is_dsa_port(struct dsa_switch *ds, int p) static inline bool dsa_is_normal_port(struct dsa_switch *ds, int p) { - return !dsa_is_cpu_port(ds, p) && !dsa_is_dsa_port(ds, p); + return !!(ds->enabled_port_mask & BIT(p)); } static inline const struct dsa_port *dsa_to_port(struct dsa_switch *ds, int p) -- cgit v1.2.3 From 2b3e9891cb607f7c7d5a4b11fb5a6e775e7f3ef4 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Thu, 26 Oct 2017 11:22:54 -0400 Subject: net: dsa: rename dsa_is_normal_port helper This patch renames dsa_is_normal_port to dsa_is_user_port because "user" is the correct term in the DSA terminology, not "normal". Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. 
Miller --- drivers/net/dsa/mv88e6xxx/chip.c | 2 +- include/net/dsa.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index 2d8cf66e8f74..09a66d4d9492 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -1676,7 +1676,7 @@ static int mv88e6xxx_setup_port_mode(struct mv88e6xxx_chip *chip, int port) if (dsa_is_dsa_port(chip->ds, port)) return mv88e6xxx_set_port_mode_dsa(chip, port); - if (dsa_is_normal_port(chip->ds, port)) + if (dsa_is_user_port(chip->ds, port)) return mv88e6xxx_set_port_mode_normal(chip, port); /* Setup CPU port mode depending on its supported tag format */ diff --git a/include/net/dsa.h b/include/net/dsa.h index 4ad432ad2d40..49701d958663 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -271,7 +271,7 @@ static inline bool dsa_is_dsa_port(struct dsa_switch *ds, int p) return !!((ds->dsa_port_mask) & (1 << p)); } -static inline bool dsa_is_normal_port(struct dsa_switch *ds, int p) +static inline bool dsa_is_user_port(struct dsa_switch *ds, int p) { return !!(ds->enabled_port_mask & BIT(p)); } -- cgit v1.2.3 From 02bc6e546e858b209c3ebe380a13a73b333b1b3f Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Thu, 26 Oct 2017 11:22:56 -0400 Subject: net: dsa: introduce dsa_user_ports helper Introduce a dsa_user_ports() helper to return the ds->enabled_port_mask mask which is more explicit. This will also minimize diffs when touching this internal mask. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- drivers/net/dsa/mt7530.c | 2 +- drivers/net/dsa/mv88e6060.c | 5 ++--- drivers/net/dsa/qca8k.c | 3 +-- include/net/dsa.h | 5 +++++ net/dsa/dsa2.c | 2 +- net/dsa/legacy.c | 2 +- 6 files changed, 11 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index d1d4eea6a875..627c039f12ca 100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -688,7 +688,7 @@ mt7530_cpu_port_enable(struct mt7530_priv *priv, * the switch */ mt7530_write(priv, MT7530_PCR_P(port), - PCR_MATRIX(priv->ds->enabled_port_mask)); + PCR_MATRIX(dsa_user_ports(priv->ds))); return 0; } diff --git a/drivers/net/dsa/mv88e6060.c b/drivers/net/dsa/mv88e6060.c index f78b9e13be1c..45768e3c5bc5 100644 --- a/drivers/net/dsa/mv88e6060.c +++ b/drivers/net/dsa/mv88e6060.c @@ -175,9 +175,8 @@ static int mv88e6060_setup_port(struct dsa_switch *ds, int p) */ REG_WRITE(addr, PORT_VLAN_MAP, ((p & 0xf) << PORT_VLAN_MAP_DBNUM_SHIFT) | - (dsa_is_cpu_port(ds, p) ? - ds->enabled_port_mask : - BIT(dsa_to_port(ds, p)->cpu_dp->index))); + (dsa_is_cpu_port(ds, p) ? 
dsa_user_ports(ds) : + BIT(dsa_to_port(ds, p)->cpu_dp->index))); /* Port Association Vector: when learning source addresses * of packets, add the address to the address database using diff --git a/drivers/net/dsa/qca8k.c b/drivers/net/dsa/qca8k.c index 37125858fe80..cf72e274275f 100644 --- a/drivers/net/dsa/qca8k.c +++ b/drivers/net/dsa/qca8k.c @@ -551,8 +551,7 @@ qca8k_setup(struct dsa_switch *ds) /* CPU port gets connected to all user ports of the switch */ if (dsa_is_cpu_port(ds, i)) { qca8k_rmw(priv, QCA8K_PORT_LOOKUP_CTRL(QCA8K_CPU_PORT), - QCA8K_PORT_LOOKUP_MEMBER, - ds->enabled_port_mask); + QCA8K_PORT_LOOKUP_MEMBER, dsa_user_ports(ds)); } /* Individual user ports get connected to CPU port only */ diff --git a/include/net/dsa.h b/include/net/dsa.h index 49701d958663..dc7728062396 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -276,6 +276,11 @@ static inline bool dsa_is_user_port(struct dsa_switch *ds, int p) return !!(ds->enabled_port_mask & BIT(p)); } +static inline u32 dsa_user_ports(struct dsa_switch *ds) +{ + return ds->enabled_port_mask; +} + static inline const struct dsa_port *dsa_to_port(struct dsa_switch *ds, int p) { return &ds->ports[p]; } diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 62485a57dbfc..d43c59c91058 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -312,7 +312,7 @@ static int dsa_ds_apply(struct dsa_switch_tree *dst, struct dsa_switch *ds) * the slave MDIO bus driver rely on these values for probing PHY * devices or not */ - ds->phys_mii_mask = ds->enabled_port_mask; + ds->phys_mii_mask |= dsa_user_ports(ds); /* Add the switch to devlink before calling setup, so that setup can * add dpipe tables diff --git a/net/dsa/legacy.c b/net/dsa/legacy.c index 0b79c6171d0d..fa543c4a6061 100644 --- a/net/dsa/legacy.c +++ b/net/dsa/legacy.c @@ -136,7 +136,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, /* Make the built-in MII bus mask match the number of ports, * switch drivers can override this later */ - ds->phys_mii_mask = ds->enabled_port_mask; + ds->phys_mii_mask |= dsa_user_ports(ds); /* * If the CPU connects to this switch, set the switch tree -- cgit v1.2.3 From 057cad2c59d73b0c4a6638546f3099d6fb444094 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Thu, 26 Oct 2017 11:22:57 -0400 Subject: net: dsa: define port types Introduce an enumerated type for ports, which is far more explicit for identifying a port type than digging into switch port masks. A port can be of type CPU, DSA, user, or unused by default. This is static information, parsed at setup time, which cannot be changed at runtime. Signed-off-by: Vivien Didelot Signed-off-by: David S. 
Miller --- include/net/dsa.h | 7 +++++++ net/dsa/dsa2.c | 3 +++ net/dsa/legacy.c | 6 ++++++ 3 files changed, 16 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index dc7728062396..8da20c4a6552 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -180,6 +180,13 @@ struct dsa_port { struct sk_buff *(*rcv)(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt); + enum { + DSA_PORT_TYPE_UNUSED = 0, + DSA_PORT_TYPE_CPU, + DSA_PORT_TYPE_DSA, + DSA_PORT_TYPE_USER, + } type; + struct dsa_switch *ds; unsigned int index; const char *name; diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index d43c59c91058..dd6f35b92937 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -185,6 +185,7 @@ static int dsa_ds_complete(struct dsa_switch_tree *dst, struct dsa_switch *ds) return err; ds->dsa_port_mask |= BIT(index); + port->type = DSA_PORT_TYPE_DSA; } return 0; @@ -504,6 +505,7 @@ static int dsa_cpu_parse(struct dsa_port *port, u32 index, * net/dsa/dsa.c::dsa_switch_setup_one does. */ ds->cpu_port_mask |= BIT(index); + port->type = DSA_PORT_TYPE_CPU; tag_protocol = ds->ops->get_tag_protocol(ds); tag_ops = dsa_resolve_tag_protocol(tag_protocol); @@ -543,6 +545,7 @@ static int dsa_ds_parse(struct dsa_switch_tree *dst, struct dsa_switch *ds) * net/dsa/dsa.c::dsa_switch_setup_one does. */ ds->enabled_port_mask |= BIT(index); + port->type = DSA_PORT_TYPE_USER; } } diff --git a/net/dsa/legacy.c b/net/dsa/legacy.c index fa543c4a6061..9fd5b3adab1e 100644 --- a/net/dsa/legacy.c +++ b/net/dsa/legacy.c @@ -101,6 +101,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct dsa_chip_data *cd = ds->cd; bool valid_name_found = false; int index = ds->index; + struct dsa_port *dp; int i, ret; /* @@ -109,6 +110,8 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, for (i = 0; i < ds->num_ports; i++) { char *name; + dp = &ds->ports[i]; + name = cd->port_names[i]; if (name == NULL) continue; @@ -122,10 +125,13 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, dst->cpu_dp = &ds->ports[i]; dst->cpu_dp->master = master; ds->cpu_port_mask |= 1 << i; + dp->type = DSA_PORT_TYPE_CPU; } else if (!strcmp(name, "dsa")) { ds->dsa_port_mask |= 1 << i; + dp->type = DSA_PORT_TYPE_DSA; } else { ds->enabled_port_mask |= 1 << i; + dp->type = DSA_PORT_TYPE_USER; } valid_name_found = true; } -- cgit v1.2.3 From c38c5a66506e4e8223fd03e950b1bde99190701e Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Thu, 26 Oct 2017 11:22:58 -0400 Subject: net: dsa: use new port type in helpers Now that DSA exposes an enumerated type for the ports, we can use them directly instead of checking bitmaps, which is more consistent. Signed-off-by: Vivien Didelot Signed-off-by: David S. 
Miller --- include/net/dsa.h | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 8da20c4a6552..07dfbd7f4fd5 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -261,36 +261,41 @@ struct dsa_switch { struct dsa_port ports[]; }; -static inline bool dsa_is_unused_port(struct dsa_switch *ds, int p) +static inline const struct dsa_port *dsa_to_port(struct dsa_switch *ds, int p) { - u32 m = ds->enabled_port_mask | ds->dsa_port_mask | ds->cpu_port_mask; + return &ds->ports[p]; +} - return !(m & BIT(p)); +static inline bool dsa_is_unused_port(struct dsa_switch *ds, int p) +{ + return dsa_to_port(ds, p)->type == DSA_PORT_TYPE_UNUSED; } static inline bool dsa_is_cpu_port(struct dsa_switch *ds, int p) { - return !!(ds->cpu_port_mask & (1 << p)); + return dsa_to_port(ds, p)->type == DSA_PORT_TYPE_CPU; } static inline bool dsa_is_dsa_port(struct dsa_switch *ds, int p) { - return !!((ds->dsa_port_mask) & (1 << p)); + return dsa_to_port(ds, p)->type == DSA_PORT_TYPE_DSA; } static inline bool dsa_is_user_port(struct dsa_switch *ds, int p) { - return !!(ds->enabled_port_mask & BIT(p)); + return dsa_to_port(ds, p)->type == DSA_PORT_TYPE_USER; } static inline u32 dsa_user_ports(struct dsa_switch *ds) { - return ds->enabled_port_mask; -} + u32 mask = 0; + int p; -static inline const struct dsa_port *dsa_to_port(struct dsa_switch *ds, int p) -{ - return &ds->ports[p]; + for (p = 0; p < ds->num_ports; p++) + if (dsa_is_user_port(ds, p)) + mask |= BIT(p); + + return mask; } static inline u8 dsa_upstream_port(struct dsa_switch *ds) -- cgit v1.2.3 From 5749f0f3772b9d98f37e3a92539f49fafaa64eca Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Thu, 26 Oct 2017 11:22:59 -0400 Subject: net: dsa: remove port masks Now that DSA core provides port types, there is no need to keep this information at the switch level. This is a static information that is part of a DSA core dsa_port structure. Remove them. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- include/net/dsa.h | 3 --- net/dsa/dsa2.c | 11 ----------- net/dsa/legacy.c | 3 --- 3 files changed, 17 deletions(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 07dfbd7f4fd5..50e276dc4c01 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -240,9 +240,6 @@ struct dsa_switch { /* * Slave mii_bus and devices for the individual ports. */ - u32 dsa_port_mask; - u32 cpu_port_mask; - u32 enabled_port_mask; u32 phys_mii_mask; struct mii_bus *slave_mii_bus; diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index dd6f35b92937..ec58654a71cd 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -184,7 +184,6 @@ static int dsa_ds_complete(struct dsa_switch_tree *dst, struct dsa_switch *ds) if (err != 0) return err; - ds->dsa_port_mask |= BIT(index); port->type = DSA_PORT_TYPE_DSA; } @@ -500,11 +499,6 @@ static int dsa_cpu_parse(struct dsa_port *port, u32 index, dst->cpu_dp->master = ethernet_dev; } - /* Initialize cpu_port_mask now for drv->setup() - * to have access to a correct value, just like what - * net/dsa/dsa.c::dsa_switch_setup_one does. 
- */ - ds->cpu_port_mask |= BIT(index); port->type = DSA_PORT_TYPE_CPU; tag_protocol = ds->ops->get_tag_protocol(ds); @@ -540,11 +534,6 @@ static int dsa_ds_parse(struct dsa_switch_tree *dst, struct dsa_switch *ds) if (err) return err; } else { - /* Initialize enabled_port_mask now for drv->setup() - * to have access to a correct value, just like what - * net/dsa/dsa.c::dsa_switch_setup_one does. - */ - ds->enabled_port_mask |= BIT(index); port->type = DSA_PORT_TYPE_USER; } diff --git a/net/dsa/legacy.c b/net/dsa/legacy.c index 9fd5b3adab1e..93e1b116ef83 100644 --- a/net/dsa/legacy.c +++ b/net/dsa/legacy.c @@ -124,13 +124,10 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, } dst->cpu_dp = &ds->ports[i]; dst->cpu_dp->master = master; - ds->cpu_port_mask |= 1 << i; dp->type = DSA_PORT_TYPE_CPU; } else if (!strcmp(name, "dsa")) { - ds->dsa_port_mask |= 1 << i; dp->type = DSA_PORT_TYPE_DSA; } else { - ds->enabled_port_mask |= 1 << i; dp->type = DSA_PORT_TYPE_USER; } valid_name_found = true; -- cgit v1.2.3 From 585d763af09cc21daf48ecc873604ccdb70f6014 Mon Sep 17 00:00:00 2001 From: Vinicius Costa Gomes Date: Mon, 16 Oct 2017 18:01:26 -0700 Subject: net/sched: Introduce Credit Based Shaper (CBS) qdisc This queueing discipline implements the shaper algorithm defined by the 802.1Q-2014 Section 8.6.8.2 and detailed in Annex L. Its primary usage is to apply some bandwidth reservation to user defined traffic classes, which are mapped to different queues via the mqprio qdisc. Only a simple software implementation is added for now. Signed-off-by: Vinicius Costa Gomes Signed-off-by: Jesus Sanchez-Palencia Tested-by: Henrik Austad Signed-off-by: Jeff Kirsher --- include/uapi/linux/pkt_sched.h | 19 +++ net/sched/Kconfig | 11 ++ net/sched/Makefile | 1 + net/sched/sch_cbs.c | 293 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 324 insertions(+) create mode 100644 net/sched/sch_cbs.c (limited to 'include') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index e7cc3d3c7421..0e88cc262ca0 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -904,4 +904,23 @@ struct tc_pie_xstats { __u32 maxq; /* maximum queue size */ __u32 ecn_mark; /* packets marked with ecn*/ }; + +/* CBS */ +struct tc_cbs_qopt { + __u8 offload; + __u8 _pad[3]; + __s32 hicredit; + __s32 locredit; + __s32 idleslope; + __s32 sendslope; +}; + +enum { + TCA_CBS_UNSPEC, + TCA_CBS_PARMS, + __TCA_CBS_MAX, +}; + +#define TCA_CBS_MAX (__TCA_CBS_MAX - 1) + #endif diff --git a/net/sched/Kconfig b/net/sched/Kconfig index e70ed26485a2..c03d86a7775e 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -172,6 +172,17 @@ config NET_SCH_TBF To compile this code as a module, choose M here: the module will be called sch_tbf. +config NET_SCH_CBS + tristate "Credit Based Shaper (CBS)" + ---help--- + Say Y here if you want to use the Credit Based Shaper (CBS) packet + scheduling algorithm. + + See the top of net/sched/sch_cbs.c for more details. + + To compile this code as a module, choose M here: the + module will be called sch_cbs. 
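(Usage sketch, not part of this patch: the shaper is meant to sit under mqprio, one cbs instance per hardware queue. Assuming the matching iproute2 front end, reserving 20 Mbit/s of a 1 Gbit/s link could look like

tc qdisc replace dev eth0 parent 100:1 cbs idleslope 20000 sendslope -980000 hicredit 30 locredit -1470 offload 0

where idleslope/sendslope are in kilobits per second and hicredit/locredit in bytes, matching struct tc_cbs_qopt above. The numbers are illustrative only, derived from the Annex L equations for 1500 byte frames.)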
+ config NET_SCH_GRED tristate "Generic Random Early Detection (GRED)" ---help--- diff --git a/net/sched/Makefile b/net/sched/Makefile index 7b915d226de7..80c8f92d162d 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -52,6 +52,7 @@ obj-$(CONFIG_NET_SCH_FQ_CODEL) += sch_fq_codel.o obj-$(CONFIG_NET_SCH_FQ) += sch_fq.o obj-$(CONFIG_NET_SCH_HHF) += sch_hhf.o obj-$(CONFIG_NET_SCH_PIE) += sch_pie.o +obj-$(CONFIG_NET_SCH_CBS) += sch_cbs.o obj-$(CONFIG_NET_CLS_U32) += cls_u32.o obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c new file mode 100644 index 000000000000..0e85133c5653 --- /dev/null +++ b/net/sched/sch_cbs.c @@ -0,0 +1,293 @@ +/* + * net/sched/sch_cbs.c Credit Based Shaper + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Vinicius Costa Gomes + * + */ + +/* Credit Based Shaper (CBS) + * ========================= + * + * This is a simple rate-limiting shaper aimed at TSN applications on + * systems with known traffic workloads. + * + * Its algorithm is defined by the IEEE 802.1Q-2014 Specification, + * Section 8.6.8.2, and explained in more detail in the Annex L of the + * same specification. + * + * There are four tunables to be considered: + * + * 'idleslope': Idleslope is the rate of credits that is + * accumulated (in kilobits per second) when there is at least + * one packet waiting for transmission. Packets are transmitted + * when the current value of credits is equal or greater than + * zero. When there is no packet to be transmitted the amount of + * credits is set to zero. This is the main tunable of the CBS + * algorithm. + * + * 'sendslope': + * Sendslope is the rate of credits that is depleted (it should be a + * negative number of kilobits per second) when a transmission is + * ocurring. It can be calculated as follows, (IEEE 802.1Q-2014 Section + * 8.6.8.2 item g): + * + * sendslope = idleslope - port_transmit_rate + * + * 'hicredit': Hicredit defines the maximum amount of credits (in + * bytes) that can be accumulated. Hicredit depends on the + * characteristics of interfering traffic, + * 'max_interference_size' is the maximum size of any burst of + * traffic that can delay the transmission of a frame that is + * available for transmission for this traffic class, (IEEE + * 802.1Q-2014 Annex L, Equation L-3): + * + * hicredit = max_interference_size * (idleslope / port_transmit_rate) + * + * 'locredit': Locredit is the minimum amount of credits that can + * be reached. 
It is a function of the traffic flowing through + * this qdisc (IEEE 802.1Q-2014 Annex L, Equation L-2): + * + * locredit = max_frame_size * (sendslope / port_transmit_rate) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define BYTES_PER_KBIT (1000LL / 8) + +struct cbs_sched_data { + s64 port_rate; /* in bytes/s */ + s64 last; /* timestamp in ns */ + s64 credits; /* in bytes */ + s32 locredit; /* in bytes */ + s32 hicredit; /* in bytes */ + s64 sendslope; /* in bytes/s */ + s64 idleslope; /* in bytes/s */ + struct qdisc_watchdog watchdog; + int (*enqueue)(struct sk_buff *skb, struct Qdisc *sch); + struct sk_buff *(*dequeue)(struct Qdisc *sch); +}; + +static int cbs_enqueue_soft(struct sk_buff *skb, struct Qdisc *sch) +{ + struct cbs_sched_data *q = qdisc_priv(sch); + + if (sch->q.qlen == 0 && q->credits > 0) { + /* We need to stop accumulating credits when there's + * no enqueued packets and q->credits is positive. + */ + q->credits = 0; + q->last = ktime_get_ns(); + } + + return qdisc_enqueue_tail(skb, sch); +} + +static int cbs_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) +{ + struct cbs_sched_data *q = qdisc_priv(sch); + + return q->enqueue(skb, sch); +} + +/* timediff is in ns, slope is in bytes/s */ +static s64 timediff_to_credits(s64 timediff, s64 slope) +{ + return div64_s64(timediff * slope, NSEC_PER_SEC); +} + +static s64 delay_from_credits(s64 credits, s64 slope) +{ + if (unlikely(slope == 0)) + return S64_MAX; + + return div64_s64(-credits * NSEC_PER_SEC, slope); +} + +static s64 credits_from_len(unsigned int len, s64 slope, s64 port_rate) +{ + if (unlikely(port_rate == 0)) + return S64_MAX; + + return div64_s64(len * slope, port_rate); +} + +static struct sk_buff *cbs_dequeue_soft(struct Qdisc *sch) +{ + struct cbs_sched_data *q = qdisc_priv(sch); + s64 now = ktime_get_ns(); + struct sk_buff *skb; + s64 credits; + int len; + + if (q->credits < 0) { + credits = timediff_to_credits(now - q->last, q->idleslope); + + credits = q->credits + credits; + q->credits = min_t(s64, credits, q->hicredit); + + if (q->credits < 0) { + s64 delay; + + delay = delay_from_credits(q->credits, q->idleslope); + qdisc_watchdog_schedule_ns(&q->watchdog, now + delay); + + q->last = now; + + return NULL; + } + } + + skb = qdisc_dequeue_head(sch); + if (!skb) + return NULL; + + len = qdisc_pkt_len(skb); + + /* As sendslope is a negative number, this will decrease the + * amount of q->credits. 
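+ * (Worked example, with assumed and purely illustrative numbers: an
+ * idleslope of 20000 kbit/s on a 1000000 kbit/s port gives sendslope =
+ * 20000 - 1000000 = -980000 kbit/s, so dequeueing a 1500 byte frame
+ * changes the credit count by 1500 * -980000 / 1000000 = -1470 bytes.)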
+ */ + credits = credits_from_len(len, q->sendslope, q->port_rate); + credits += q->credits; + + q->credits = max_t(s64, credits, q->locredit); + q->last = now; + + return skb; +} + +static struct sk_buff *cbs_dequeue(struct Qdisc *sch) +{ + struct cbs_sched_data *q = qdisc_priv(sch); + + return q->dequeue(sch); +} + +static const struct nla_policy cbs_policy[TCA_CBS_MAX + 1] = { + [TCA_CBS_PARMS] = { .len = sizeof(struct tc_cbs_qopt) }, +}; + +static int cbs_change(struct Qdisc *sch, struct nlattr *opt) +{ + struct cbs_sched_data *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + struct nlattr *tb[TCA_CBS_MAX + 1]; + struct ethtool_link_ksettings ecmd; + struct tc_cbs_qopt *qopt; + s64 link_speed; + int err; + + err = nla_parse_nested(tb, TCA_CBS_MAX, opt, cbs_policy, NULL); + if (err < 0) + return err; + + if (!tb[TCA_CBS_PARMS]) + return -EINVAL; + + qopt = nla_data(tb[TCA_CBS_PARMS]); + + if (qopt->offload) + return -EOPNOTSUPP; + + if (!__ethtool_get_link_ksettings(dev, &ecmd)) + link_speed = ecmd.base.speed; + else + link_speed = SPEED_1000; + + q->port_rate = link_speed * 1000 * BYTES_PER_KBIT; + + q->enqueue = cbs_enqueue_soft; + q->dequeue = cbs_dequeue_soft; + + q->hicredit = qopt->hicredit; + q->locredit = qopt->locredit; + q->idleslope = qopt->idleslope * BYTES_PER_KBIT; + q->sendslope = qopt->sendslope * BYTES_PER_KBIT; + + return 0; +} + +static int cbs_init(struct Qdisc *sch, struct nlattr *opt) +{ + struct cbs_sched_data *q = qdisc_priv(sch); + + if (!opt) + return -EINVAL; + + qdisc_watchdog_init(&q->watchdog, sch); + + return cbs_change(sch, opt); +} + +static void cbs_destroy(struct Qdisc *sch) +{ + struct cbs_sched_data *q = qdisc_priv(sch); + + qdisc_watchdog_cancel(&q->watchdog); +} + +static int cbs_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct cbs_sched_data *q = qdisc_priv(sch); + struct tc_cbs_qopt opt = { }; + struct nlattr *nest; + + nest = nla_nest_start(skb, TCA_OPTIONS); + if (!nest) + goto nla_put_failure; + + opt.hicredit = q->hicredit; + opt.locredit = q->locredit; + opt.sendslope = div64_s64(q->sendslope, BYTES_PER_KBIT); + opt.idleslope = div64_s64(q->idleslope, BYTES_PER_KBIT); + opt.offload = 0; + + if (nla_put(skb, TCA_CBS_PARMS, sizeof(opt), &opt)) + goto nla_put_failure; + + return nla_nest_end(skb, nest); + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -1; +} + +static struct Qdisc_ops cbs_qdisc_ops __read_mostly = { + .id = "cbs", + .priv_size = sizeof(struct cbs_sched_data), + .enqueue = cbs_enqueue, + .dequeue = cbs_dequeue, + .peek = qdisc_peek_dequeued, + .init = cbs_init, + .reset = qdisc_reset_queue, + .destroy = cbs_destroy, + .change = cbs_change, + .dump = cbs_dump, + .owner = THIS_MODULE, +}; + +static int __init cbs_module_init(void) +{ + return register_qdisc(&cbs_qdisc_ops); +} + +static void __exit cbs_module_exit(void) +{ + unregister_qdisc(&cbs_qdisc_ops); +} +module_init(cbs_module_init) +module_exit(cbs_module_exit) +MODULE_LICENSE("GPL"); -- cgit v1.2.3 From 3d0bd028ffb4a4915cb64cfa0d2cee1578cc0321 Mon Sep 17 00:00:00 2001 From: Vinicius Costa Gomes Date: Mon, 16 Oct 2017 18:01:27 -0700 Subject: net/sched: Add support for HW offloading for CBS This adds support for offloading the CBS algorithm to the controller, if supported. Drivers wanting to support CBS offload must implement the .ndo_setup_tc callback and handle the TC_SETUP_CBS (introduced here) type. 
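For illustration only, a driver-side handler for the new type might look roughly as follows; the foo_* hardware helpers are hypothetical and no in-tree implementation is implied:

static int foo_setup_tc(struct net_device *dev, enum tc_setup_type type,
			void *type_data)
{
	struct tc_cbs_qopt_offload *cbs = type_data;

	switch (type) {
	case TC_SETUP_CBS:
		if (!cbs->enable)
			/* Restore default, unshaped behaviour on this queue */
			return foo_hw_cbs_disable(dev, cbs->queue);
		/* Slopes arrive in kbit/s and credits in bytes, exactly as
		 * carried in struct tc_cbs_qopt_offload below.
		 */
		return foo_hw_cbs_enable(dev, cbs->queue, cbs->idleslope,
					 cbs->sendslope, cbs->hicredit,
					 cbs->locredit);
	default:
		return -EOPNOTSUPP;
	}
}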
Signed-off-by: Vinicius Costa Gomes Tested-by: Henrik Austad Signed-off-by: Jeff Kirsher --- include/linux/netdevice.h | 1 + include/net/pkt_sched.h | 9 ++++ net/sched/sch_cbs.c | 104 ++++++++++++++++++++++++++++++++++++++++------ 3 files changed, 102 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 6c7960c8338a..5e02f79b2110 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -776,6 +776,7 @@ enum tc_setup_type { TC_SETUP_CLSMATCHALL, TC_SETUP_CLSBPF, TC_SETUP_BLOCK, + TC_SETUP_CBS, }; /* These structures hold the attributes of xdp state that are being passed diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index b8ecafce4ba1..02f2db26e30c 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -140,4 +140,13 @@ static inline struct net *qdisc_net(struct Qdisc *q) return dev_net(q->dev_queue->dev); } +struct tc_cbs_qopt_offload { + u8 enable; + s32 queue; + s32 hicredit; + s32 locredit; + s32 idleslope; + s32 sendslope; +}; + #endif diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c index 0e85133c5653..bdb533b7fb8c 100644 --- a/net/sched/sch_cbs.c +++ b/net/sched/sch_cbs.c @@ -68,6 +68,8 @@ #define BYTES_PER_KBIT (1000LL / 8) struct cbs_sched_data { + bool offload; + int queue; s64 port_rate; /* in bytes/s */ s64 last; /* timestamp in ns */ s64 credits; /* in bytes */ @@ -80,6 +82,11 @@ struct cbs_sched_data { struct sk_buff *(*dequeue)(struct Qdisc *sch); }; +static int cbs_enqueue_offload(struct sk_buff *skb, struct Qdisc *sch) +{ + return qdisc_enqueue_tail(skb, sch); +} + static int cbs_enqueue_soft(struct sk_buff *skb, struct Qdisc *sch) { struct cbs_sched_data *q = qdisc_priv(sch); @@ -169,6 +176,11 @@ static struct sk_buff *cbs_dequeue_soft(struct Qdisc *sch) return skb; } +static struct sk_buff *cbs_dequeue_offload(struct Qdisc *sch) +{ + return qdisc_dequeue_head(sch); +} + static struct sk_buff *cbs_dequeue(struct Qdisc *sch) { struct cbs_sched_data *q = qdisc_priv(sch); @@ -180,14 +192,66 @@ static const struct nla_policy cbs_policy[TCA_CBS_MAX + 1] = { [TCA_CBS_PARMS] = { .len = sizeof(struct tc_cbs_qopt) }, }; +static void cbs_disable_offload(struct net_device *dev, + struct cbs_sched_data *q) +{ + struct tc_cbs_qopt_offload cbs = { }; + const struct net_device_ops *ops; + int err; + + if (!q->offload) + return; + + q->enqueue = cbs_enqueue_soft; + q->dequeue = cbs_dequeue_soft; + + ops = dev->netdev_ops; + if (!ops->ndo_setup_tc) + return; + + cbs.queue = q->queue; + cbs.enable = 0; + + err = ops->ndo_setup_tc(dev, TC_SETUP_CBS, &cbs); + if (err < 0) + pr_warn("Couldn't disable CBS offload for queue %d\n", + cbs.queue); +} + +static int cbs_enable_offload(struct net_device *dev, struct cbs_sched_data *q, + const struct tc_cbs_qopt *opt) +{ + const struct net_device_ops *ops = dev->netdev_ops; + struct tc_cbs_qopt_offload cbs = { }; + int err; + + if (!ops->ndo_setup_tc) + return -EOPNOTSUPP; + + cbs.queue = q->queue; + + cbs.enable = 1; + cbs.hicredit = opt->hicredit; + cbs.locredit = opt->locredit; + cbs.idleslope = opt->idleslope; + cbs.sendslope = opt->sendslope; + + err = ops->ndo_setup_tc(dev, TC_SETUP_CBS, &cbs); + if (err < 0) + return err; + + q->enqueue = cbs_enqueue_offload; + q->dequeue = cbs_dequeue_offload; + + return 0; +} + static int cbs_change(struct Qdisc *sch, struct nlattr *opt) { struct cbs_sched_data *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct nlattr *tb[TCA_CBS_MAX + 1]; - struct 
ethtool_link_ksettings ecmd; struct tc_cbs_qopt *qopt; - s64 link_speed; int err; err = nla_parse_nested(tb, TCA_CBS_MAX, opt, cbs_policy, NULL); @@ -199,23 +263,30 @@ static int cbs_change(struct Qdisc *sch, struct nlattr *opt) qopt = nla_data(tb[TCA_CBS_PARMS]); - if (qopt->offload) - return -EOPNOTSUPP; + if (!qopt->offload) { + struct ethtool_link_ksettings ecmd; + s64 link_speed; - if (!__ethtool_get_link_ksettings(dev, &ecmd)) - link_speed = ecmd.base.speed; - else - link_speed = SPEED_1000; + if (!__ethtool_get_link_ksettings(dev, &ecmd)) + link_speed = ecmd.base.speed; + else + link_speed = SPEED_1000; - q->port_rate = link_speed * 1000 * BYTES_PER_KBIT; + q->port_rate = link_speed * 1000 * BYTES_PER_KBIT; - q->enqueue = cbs_enqueue_soft; - q->dequeue = cbs_dequeue_soft; + cbs_disable_offload(dev, q); + } else { + err = cbs_enable_offload(dev, q, qopt); + if (err < 0) + return err; + } + /* Everything went OK, save the parameters used. */ q->hicredit = qopt->hicredit; q->locredit = qopt->locredit; q->idleslope = qopt->idleslope * BYTES_PER_KBIT; q->sendslope = qopt->sendslope * BYTES_PER_KBIT; + q->offload = qopt->offload; return 0; } @@ -223,10 +294,16 @@ static int cbs_change(struct Qdisc *sch, struct nlattr *opt) static int cbs_init(struct Qdisc *sch, struct nlattr *opt) { struct cbs_sched_data *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); if (!opt) return -EINVAL; + q->queue = sch->dev_queue - netdev_get_tx_queue(dev, 0); + + q->enqueue = cbs_enqueue_soft; + q->dequeue = cbs_dequeue_soft; + qdisc_watchdog_init(&q->watchdog, sch); return cbs_change(sch, opt); @@ -235,8 +312,11 @@ static int cbs_init(struct Qdisc *sch, struct nlattr *opt) static void cbs_destroy(struct Qdisc *sch) { struct cbs_sched_data *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); qdisc_watchdog_cancel(&q->watchdog); + + cbs_disable_offload(dev, q); } static int cbs_dump(struct Qdisc *sch, struct sk_buff *skb) @@ -253,7 +333,7 @@ static int cbs_dump(struct Qdisc *sch, struct sk_buff *skb) opt.locredit = q->locredit; opt.sendslope = div64_s64(q->sendslope, BYTES_PER_KBIT); opt.idleslope = div64_s64(q->idleslope, BYTES_PER_KBIT); - opt.offload = 0; + opt.offload = q->offload; if (nla_put(skb, TCA_CBS_PARMS, sizeof(opt), &opt)) goto nla_put_failure; -- cgit v1.2.3 From ec36e416f06f6a8659281053fdc46ce484ad2211 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Oct 2017 07:47:21 -0700 Subject: tcp: Namespace-ify sysctl_tcp_nometrics_save Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - net/ipv4/sysctl_net_ipv4.c | 14 +++++++------- net/ipv4/tcp_metrics.c | 4 +--- 4 files changed, 9 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index f4622e28db3a..9606e2ea1f14 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -142,6 +142,7 @@ struct netns_ipv4 { int sysctl_tcp_app_win; int sysctl_tcp_adv_win_scale; int sysctl_tcp_frto; + int sysctl_tcp_nometrics_save; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index 18f047501f53..6ab7fa4154b2 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -247,7 +247,6 @@ extern int sysctl_tcp_max_orphans; extern long sysctl_tcp_mem[3]; extern int sysctl_tcp_wmem[3]; extern int sysctl_tcp_rmem[3]; -extern int sysctl_tcp_nometrics_save; extern int sysctl_tcp_moderate_rcvbuf; extern int sysctl_tcp_tso_win_divisor; extern int sysctl_tcp_workaround_signed_windows; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index f1bcb9b7e082..b742a5e26a9d 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -451,13 +451,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, - { - .procname = "tcp_no_metrics_save", - .data = &sysctl_tcp_nometrics_save, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, { .procname = "tcp_moderate_rcvbuf", .data = &sysctl_tcp_moderate_rcvbuf, @@ -1145,6 +1138,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "tcp_no_metrics_save", + .data = &init_net.ipv4.sysctl_tcp_nometrics_save, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { } }; diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 0ab78abc811b..0507b56b6d4b 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -20,8 +20,6 @@ #include #include -int sysctl_tcp_nometrics_save __read_mostly; - static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *saddr, const struct inetpeer_addr *daddr, struct net *net, unsigned int hash); @@ -330,7 +328,7 @@ void tcp_update_metrics(struct sock *sk) int m; sk_dst_confirm(sk); - if (sysctl_tcp_nometrics_save || !dst) + if (net->ipv4.sysctl_tcp_nometrics_save || !dst) return; rcu_read_lock(); -- cgit v1.2.3 From 4540c0cf98b8892a642d2453eec20ae3eb5696fb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Oct 2017 07:47:22 -0700 Subject: tcp: Namespace-ify sysctl_tcp_moderate_rcvbuf Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - net/ipv4/sysctl_net_ipv4.c | 14 +++++++------- net/ipv4/tcp_input.c | 5 ++--- net/ipv4/tcp_ipv4.c | 1 + 5 files changed, 11 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 9606e2ea1f14..4458a54fe3f4 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -143,6 +143,7 @@ struct netns_ipv4 { int sysctl_tcp_adv_win_scale; int sysctl_tcp_frto; int sysctl_tcp_nometrics_save; + int sysctl_tcp_moderate_rcvbuf; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index 6ab7fa4154b2..f954e74578ff 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -247,7 +247,6 @@ extern int sysctl_tcp_max_orphans; extern long sysctl_tcp_mem[3]; extern int sysctl_tcp_wmem[3]; extern int sysctl_tcp_rmem[3]; -extern int sysctl_tcp_moderate_rcvbuf; extern int sysctl_tcp_tso_win_divisor; extern int sysctl_tcp_workaround_signed_windows; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index b742a5e26a9d..2ebe87fd1169 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -451,13 +451,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, - { - .procname = "tcp_moderate_rcvbuf", - .data = &sysctl_tcp_moderate_rcvbuf, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, { .procname = "tcp_tso_win_divisor", .data = &sysctl_tcp_tso_win_divisor, @@ -1145,6 +1138,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "tcp_moderate_rcvbuf", + .data = &init_net.ipv4.sysctl_tcp_moderate_rcvbuf, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { } }; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 90d76f1c8f96..ce481325115f 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -84,7 +84,6 @@ int sysctl_tcp_challenge_ack_limit = 1000; int sysctl_tcp_max_orphans __read_mostly = NR_FILE; int sysctl_tcp_min_rtt_wlen __read_mostly = 300; -int sysctl_tcp_moderate_rcvbuf __read_mostly = 1; int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2; #define FLAG_DATA 0x01 /* Incoming frame contained data. 
*/ @@ -411,7 +410,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk) /* Dynamic Right Sizing (DRS) has 2 to 3 RTT latency * Allow enough cushion so that sender is not limited by our window */ - if (sysctl_tcp_moderate_rcvbuf) + if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) rcvmem <<= 2; if (sk->sk_rcvbuf < rcvmem) @@ -602,7 +601,7 @@ void tcp_rcv_space_adjust(struct sock *sk) * */ - if (sysctl_tcp_moderate_rcvbuf && + if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { int rcvwin, rcvmem, rcvbuf; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 49757c758211..27f376b90913 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2493,6 +2493,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_app_win = 31; net->ipv4.sysctl_tcp_adv_win_scale = 1; net->ipv4.sysctl_tcp_frto = 2; + net->ipv4.sysctl_tcp_moderate_rcvbuf = 1; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); -- cgit v1.2.3 From d06a99045837d3f4d5431793c4c390b0daf2a08d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Oct 2017 07:47:23 -0700 Subject: tcp: Namespace-ify sysctl_tcp_tso_win_divisor Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - net/ipv4/sysctl_net_ipv4.c | 14 +++++++------- net/ipv4/tcp_ipv4.c | 5 +++++ net/ipv4/tcp_output.c | 8 +------- 5 files changed, 14 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 4458a54fe3f4..60bccda046db 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -144,6 +144,7 @@ struct netns_ipv4 { int sysctl_tcp_frto; int sysctl_tcp_nometrics_save; int sysctl_tcp_moderate_rcvbuf; + int sysctl_tcp_tso_win_divisor; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index f954e74578ff..ed0828dc82f1 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -247,7 +247,6 @@ extern int sysctl_tcp_max_orphans; extern long sysctl_tcp_mem[3]; extern int sysctl_tcp_wmem[3]; extern int sysctl_tcp_rmem[3]; -extern int sysctl_tcp_tso_win_divisor; extern int sysctl_tcp_workaround_signed_windows; #define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */ diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 2ebe87fd1169..a053cacb8290 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -451,13 +451,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, - { - .procname = "tcp_tso_win_divisor", - .data = &sysctl_tcp_tso_win_divisor, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, { .procname = "tcp_congestion_control", .mode = 0644, @@ -1145,6 +1138,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "tcp_tso_win_divisor", + .data = &init_net.ipv4.sysctl_tcp_tso_win_divisor, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { } }; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 27f376b90913..284ff16148df 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2494,6 +2494,11 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_adv_win_scale = 1; net->ipv4.sysctl_tcp_frto = 2; net->ipv4.sysctl_tcp_moderate_rcvbuf = 1; + /* This limits the percentage of 
the congestion window which we + * will allow a single TSO frame to consume. Building TSO frames + * which are too large can cause TCP streams to be bursty. + */ + net->ipv4.sysctl_tcp_tso_win_divisor = 3; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 55a0aa4b96df..60df3ab52166 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -53,12 +53,6 @@ int sysctl_tcp_workaround_signed_windows __read_mostly = 0; /* Default TSQ limit of four TSO segments */ int sysctl_tcp_limit_output_bytes __read_mostly = 262144; -/* This limits the percentage of the congestion window which we - * will allow a single TSO frame to consume. Building TSO frames - * which are too large can cause TCP streams to be bursty. - */ -int sysctl_tcp_tso_win_divisor __read_mostly = 3; - static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, int push_one, gfp_t gfp); @@ -1988,7 +1982,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len)) goto send_now; - win_divisor = ACCESS_ONCE(sysctl_tcp_tso_win_divisor); + win_divisor = ACCESS_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tso_win_divisor); if (win_divisor) { u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache); -- cgit v1.2.3 From ceef9ab6be7234f9e49f79769e0da88d1dccfcc7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Oct 2017 07:47:24 -0700 Subject: tcp: Namespace-ify sysctl_tcp_workaround_signed_windows Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 4 ++-- net/ipv4/syncookies.c | 2 +- net/ipv4/sysctl_net_ipv4.c | 14 +++++++------- net/ipv4/tcp_minisocks.c | 2 +- net/ipv4/tcp_output.c | 14 +++++--------- net/ipv6/syncookies.c | 2 +- 7 files changed, 18 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 60bccda046db..e74c7c1b0d18 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -145,6 +145,7 @@ struct netns_ipv4 { int sysctl_tcp_nometrics_save; int sysctl_tcp_moderate_rcvbuf; int sysctl_tcp_tso_win_divisor; + int sysctl_tcp_workaround_signed_windows; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index ed0828dc82f1..e338e16178dd 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -247,7 +247,6 @@ extern int sysctl_tcp_max_orphans; extern long sysctl_tcp_mem[3]; extern int sysctl_tcp_wmem[3]; extern int sysctl_tcp_rmem[3]; -extern int sysctl_tcp_workaround_signed_windows; #define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */ @@ -1302,7 +1301,8 @@ static inline void tcp_slow_start_after_idle_check(struct sock *sk) } /* Determine a window scaling and initial window to offer. */ -void tcp_select_initial_window(int __space, __u32 mss, __u32 *rcv_wnd, +void tcp_select_initial_window(const struct sock *sk, int __space, + __u32 mss, __u32 *rcv_wnd, __u32 *window_clamp, int wscale_ok, __u8 *rcv_wscale, __u32 init_rcv_wnd); diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 77cf32a80952..fda37f2862c9 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -385,7 +385,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) /* Try to redo what tcp_v4_send_synack did. */ req->rsk_window_clamp = tp->window_clamp ? 
:dst_metric(&rt->dst, RTAX_WINDOW); - tcp_select_initial_window(tcp_full_space(sk), req->mss, + tcp_select_initial_window(sk, tcp_full_space(sk), req->mss, &req->rsk_rcv_wnd, &req->rsk_window_clamp, ireq->wscale_ok, &rcv_wscale, dst_metric(&rt->dst, RTAX_INITRWND)); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index a053cacb8290..3ae9012a4979 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -457,13 +457,6 @@ static struct ctl_table ipv4_table[] = { .maxlen = TCP_CA_NAME_MAX, .proc_handler = proc_tcp_congestion_control, }, - { - .procname = "tcp_workaround_signed_windows", - .data = &sysctl_tcp_workaround_signed_windows, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, { .procname = "tcp_limit_output_bytes", .data = &sysctl_tcp_limit_output_bytes, @@ -1145,6 +1138,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "tcp_workaround_signed_windows", + .data = &init_net.ipv4.sysctl_tcp_workaround_signed_windows, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { } }; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 3270ab8416ce..3c65c1a3f944 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -369,7 +369,7 @@ void tcp_openreq_init_rwin(struct request_sock *req, full_space = rcv_wnd * mss; /* tcp_full_space because it is guaranteed to be the first packet */ - tcp_select_initial_window(full_space, + tcp_select_initial_window(sk_listener, full_space, mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), &req->rsk_rcv_wnd, &req->rsk_window_clamp, diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 60df3ab52166..5bbed67c27e9 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -45,11 +45,6 @@ #include -/* People can turn this on to work with those rare, broken TCPs that - * interpret the window field as a signed quantity. - */ -int sysctl_tcp_workaround_signed_windows __read_mostly = 0; - /* Default TSQ limit of four TSO segments */ int sysctl_tcp_limit_output_bytes __read_mostly = 262144; @@ -196,7 +191,7 @@ u32 tcp_default_init_rwnd(u32 mss) * be a multiple of mss if possible. We assume here that mss >= 1. * This MUST be enforced by all callers. */ -void tcp_select_initial_window(int __space, __u32 mss, +void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss, __u32 *rcv_wnd, __u32 *window_clamp, int wscale_ok, __u8 *rcv_wscale, __u32 init_rcv_wnd) @@ -220,7 +215,7 @@ void tcp_select_initial_window(int __space, __u32 mss, * which we interpret as a sign the remote TCP is not * misinterpreting the window field as a signed quantity. */ - if (sysctl_tcp_workaround_signed_windows) + if (sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows) (*rcv_wnd) = min(space, MAX_TCP_WINDOW); else (*rcv_wnd) = space; @@ -280,7 +275,8 @@ static u16 tcp_select_window(struct sock *sk) /* Make sure we do not exceed the maximum possible * scaled window. 
*/ - if (!tp->rx_opt.rcv_wscale && sysctl_tcp_workaround_signed_windows) + if (!tp->rx_opt.rcv_wscale && + sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows) new_win = min(new_win, MAX_TCP_WINDOW); else new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale)); @@ -3349,7 +3345,7 @@ static void tcp_connect_init(struct sock *sk) if (rcv_wnd == 0) rcv_wnd = dst_metric(dst, RTAX_INITRWND); - tcp_select_initial_window(tcp_full_space(sk), + tcp_select_initial_window(sk, tcp_full_space(sk), tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), &tp->rcv_wnd, &tp->window_clamp, diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 4e7817abc0b9..e7a3a6b6cf56 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -244,7 +244,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) } req->rsk_window_clamp = tp->window_clamp ? :dst_metric(dst, RTAX_WINDOW); - tcp_select_initial_window(tcp_full_space(sk), req->mss, + tcp_select_initial_window(sk, tcp_full_space(sk), req->mss, &req->rsk_rcv_wnd, &req->rsk_window_clamp, ireq->wscale_ok, &rcv_wscale, dst_metric(dst, RTAX_INITRWND)); -- cgit v1.2.3 From 9184d8bb448a3d2c2d9f90f1e2f5de625292e769 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Oct 2017 07:47:25 -0700 Subject: tcp: Namespace-ify sysctl_tcp_limit_output_bytes Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - net/ipv4/sysctl_net_ipv4.c | 14 +++++++------- net/ipv4/tcp_ipv4.c | 2 ++ net/ipv4/tcp_output.c | 6 ++---- 5 files changed, 12 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index e74c7c1b0d18..e98f473bab13 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -146,6 +146,7 @@ struct netns_ipv4 { int sysctl_tcp_moderate_rcvbuf; int sysctl_tcp_tso_win_divisor; int sysctl_tcp_workaround_signed_windows; + int sysctl_tcp_limit_output_bytes; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index e338e16178dd..33f9d30a6905 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -250,7 +250,6 @@ extern int sysctl_tcp_rmem[3]; #define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */ -extern int sysctl_tcp_limit_output_bytes; extern int sysctl_tcp_challenge_ack_limit; extern int sysctl_tcp_min_tso_segs; extern int sysctl_tcp_min_rtt_wlen; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 3ae9012a4979..6caf5c40730f 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -457,13 +457,6 @@ static struct ctl_table ipv4_table[] = { .maxlen = TCP_CA_NAME_MAX, .proc_handler = proc_tcp_congestion_control, }, - { - .procname = "tcp_limit_output_bytes", - .data = &sysctl_tcp_limit_output_bytes, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, { .procname = "tcp_challenge_ack_limit", .data = &sysctl_tcp_challenge_ack_limit, @@ -1145,6 +1138,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "tcp_limit_output_bytes", + .data = &init_net.ipv4.sysctl_tcp_limit_output_bytes, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { } }; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 284ff16148df..713b80261e4f 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2499,6 +2499,8 
@@ static int __net_init tcp_sk_init(struct net *net) * which are too large can cause TCP streams to be bursty. */ net->ipv4.sysctl_tcp_tso_win_divisor = 3; + /* Default TSQ limit of four TSO segments */ + net->ipv4.sysctl_tcp_limit_output_bytes = 262144; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 5bbed67c27e9..f018892c6a98 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -45,9 +45,6 @@ #include -/* Default TSQ limit of four TSO segments */ -int sysctl_tcp_limit_output_bytes __read_mostly = 262144; - static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, int push_one, gfp_t gfp); @@ -2215,7 +2212,8 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb, unsigned int limit; limit = max(2 * skb->truesize, sk->sk_pacing_rate >> 10); - limit = min_t(u32, limit, sysctl_tcp_limit_output_bytes); + limit = min_t(u32, limit, + sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes); limit <<= factor; if (refcount_read(&sk->sk_wmem_alloc) > limit) { -- cgit v1.2.3 From b530b68148301d73775cd27cc136ce4dd5738ae8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Oct 2017 07:47:26 -0700 Subject: tcp: Namespace-ify sysctl_tcp_challenge_ack_limit Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - net/ipv4/sysctl_net_ipv4.c | 14 +++++++------- net/ipv4/tcp_input.c | 14 ++++++-------- net/ipv4/tcp_ipv4.c | 2 ++ 5 files changed, 16 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index e98f473bab13..e9895d40868e 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -147,6 +147,7 @@ struct netns_ipv4 { int sysctl_tcp_tso_win_divisor; int sysctl_tcp_workaround_signed_windows; int sysctl_tcp_limit_output_bytes; + int sysctl_tcp_challenge_ack_limit; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index 33f9d30a6905..afc23596e9aa 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -250,7 +250,6 @@ extern int sysctl_tcp_rmem[3]; #define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */ -extern int sysctl_tcp_challenge_ack_limit; extern int sysctl_tcp_min_tso_segs; extern int sysctl_tcp_min_rtt_wlen; extern int sysctl_tcp_autocorking; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 6caf5c40730f..e28b3b7a7bbc 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -457,13 +457,6 @@ static struct ctl_table ipv4_table[] = { .maxlen = TCP_CA_NAME_MAX, .proc_handler = proc_tcp_congestion_control, }, - { - .procname = "tcp_challenge_ack_limit", - .data = &sysctl_tcp_challenge_ack_limit, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, #ifdef CONFIG_NETLABEL { .procname = "cipso_cache_enable", @@ -1145,6 +1138,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "tcp_challenge_ack_limit", + .data = &init_net.ipv4.sysctl_tcp_challenge_ack_limit, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { } }; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index ce481325115f..928048a4e2c5 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -79,9 +79,6 @@ #include #include -/* rfc5961 
challenge ack rate limiting */ -int sysctl_tcp_challenge_ack_limit = 1000; - int sysctl_tcp_max_orphans __read_mostly = NR_FILE; int sysctl_tcp_min_rtt_wlen __read_mostly = 300; int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2; @@ -3443,10 +3440,11 @@ static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb) static u32 challenge_timestamp; static unsigned int challenge_count; struct tcp_sock *tp = tcp_sk(sk); + struct net *net = sock_net(sk); u32 count, now; /* First check our per-socket dupack rate limit. */ - if (__tcp_oow_rate_limited(sock_net(sk), + if (__tcp_oow_rate_limited(net, LINUX_MIB_TCPACKSKIPPEDCHALLENGE, &tp->last_oow_ack_time)) return; @@ -3454,16 +3452,16 @@ static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb) /* Then check host-wide RFC 5961 rate limit. */ now = jiffies / HZ; if (now != challenge_timestamp) { - u32 half = (sysctl_tcp_challenge_ack_limit + 1) >> 1; + u32 ack_limit = net->ipv4.sysctl_tcp_challenge_ack_limit; + u32 half = (ack_limit + 1) >> 1; challenge_timestamp = now; - WRITE_ONCE(challenge_count, half + - prandom_u32_max(sysctl_tcp_challenge_ack_limit)); + WRITE_ONCE(challenge_count, half + prandom_u32_max(ack_limit)); } count = READ_ONCE(challenge_count); if (count > 0) { WRITE_ONCE(challenge_count, count - 1); - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPCHALLENGEACK); + NET_INC_STATS(net, LINUX_MIB_TCPCHALLENGEACK); tcp_send_ack(sk); } } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 713b80261e4f..50ab3a3eced3 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2501,6 +2501,8 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_tso_win_divisor = 3; /* Default TSQ limit of four TSO segments */ net->ipv4.sysctl_tcp_limit_output_bytes = 262144; + /* rfc5961 challenge ack rate limiting */ + net->ipv4.sysctl_tcp_challenge_ack_limit = 1000; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); -- cgit v1.2.3 From 26e9596e5b8f11025b57b12e7265df649129ab00 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Oct 2017 07:47:27 -0700 Subject: tcp: Namespace-ify sysctl_tcp_min_tso_segs Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - net/ipv4/sysctl_net_ipv4.c | 18 +++++++++--------- net/ipv4/tcp.c | 2 -- net/ipv4/tcp_ipv4.c | 1 + net/ipv4/tcp_output.c | 3 ++- 6 files changed, 13 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index e9895d40868e..a2da3e19a977 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -148,6 +148,7 @@ struct netns_ipv4 { int sysctl_tcp_workaround_signed_windows; int sysctl_tcp_limit_output_bytes; int sysctl_tcp_challenge_ack_limit; + int sysctl_tcp_min_tso_segs; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index afc23596e9aa..0735303a6575 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -250,7 +250,6 @@ extern int sysctl_tcp_rmem[3]; #define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */ -extern int sysctl_tcp_min_tso_segs; extern int sysctl_tcp_min_rtt_wlen; extern int sysctl_tcp_autocorking; extern int sysctl_tcp_invalid_ratelimit; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index e28b3b7a7bbc..00b4aea3705b 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -499,15 +499,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_allowed_congestion_control, }, - { - .procname = "tcp_min_tso_segs", - .data = &sysctl_tcp_min_tso_segs, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &one, - .extra2 = &gso_max_segs, - }, { .procname = "tcp_pacing_ss_ratio", .data = &sysctl_tcp_pacing_ss_ratio, @@ -1145,6 +1136,15 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "tcp_min_tso_segs", + .data = &init_net.ipv4.sysctl_tcp_min_tso_segs, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, + .extra2 = &gso_max_segs, + }, { } }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index c7c983f0f817..a01c97708d83 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -285,8 +285,6 @@ #include -int sysctl_tcp_min_tso_segs __read_mostly = 2; - int sysctl_tcp_autocorking __read_mostly = 1; struct percpu_counter tcp_orphan_count; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 50ab3a3eced3..6192f26145d3 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2503,6 +2503,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_limit_output_bytes = 262144; /* rfc5961 challenge ack rate limiting */ net->ipv4.sysctl_tcp_challenge_ack_limit = 1000; + net->ipv4.sysctl_tcp_min_tso_segs = 2; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index f018892c6a98..aab6e7145013 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1758,7 +1758,8 @@ static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now) u32 tso_segs = ca_ops->tso_segs_goal ? ca_ops->tso_segs_goal(sk) : 0; return tso_segs ? 
: - tcp_tso_autosize(sk, mss_now, sysctl_tcp_min_tso_segs); + tcp_tso_autosize(sk, mss_now, + sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); } /* Returns the portion of skb which can be sent right away */ -- cgit v1.2.3 From bd239704295c66196e6b77c5717ec4aec076ddd5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Oct 2017 07:47:28 -0700 Subject: tcp: Namespace-ify sysctl_tcp_min_rtt_wlen Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - net/ipv4/sysctl_net_ipv4.c | 14 +++++++------- net/ipv4/tcp_input.c | 3 +-- net/ipv4/tcp_ipv4.c | 1 + 5 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index a2da3e19a977..1a66af8a0d32 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -149,6 +149,7 @@ struct netns_ipv4 { int sysctl_tcp_limit_output_bytes; int sysctl_tcp_challenge_ack_limit; int sysctl_tcp_min_tso_segs; + int sysctl_tcp_min_rtt_wlen; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index 0735303a6575..56f50c9a3e6a 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -250,7 +250,6 @@ extern int sysctl_tcp_rmem[3]; #define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */ -extern int sysctl_tcp_min_rtt_wlen; extern int sysctl_tcp_autocorking; extern int sysctl_tcp_invalid_ratelimit; extern int sysctl_tcp_pacing_ss_ratio; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 00b4aea3705b..029692d2e4ae 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -437,13 +437,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = &one, }, - { - .procname = "tcp_min_rtt_wlen", - .data = &sysctl_tcp_min_rtt_wlen, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, { .procname = "tcp_low_latency", .data = &sysctl_tcp_low_latency, @@ -1145,6 +1138,13 @@ static struct ctl_table ipv4_net_table[] = { .extra1 = &one, .extra2 = &gso_max_segs, }, + { + .procname = "tcp_min_rtt_wlen", + .data = &init_net.ipv4.sysctl_tcp_min_rtt_wlen, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { } }; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 928048a4e2c5..da1ef666d1f9 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -80,7 +80,6 @@ #include int sysctl_tcp_max_orphans __read_mostly = NR_FILE; -int sysctl_tcp_min_rtt_wlen __read_mostly = 300; int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2; #define FLAG_DATA 0x01 /* Incoming frame contained data. */ @@ -2915,8 +2914,8 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us) { + u32 wlen = sock_net(sk)->ipv4.sysctl_tcp_min_rtt_wlen * HZ; struct tcp_sock *tp = tcp_sk(sk); - u32 wlen = sysctl_tcp_min_rtt_wlen * HZ; minmax_running_min(&tp->rtt_min, wlen, tcp_jiffies32, rtt_us ? 
: jiffies_to_usecs(1)); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 6192f26145d3..ced35af5737a 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2504,6 +2504,7 @@ static int __net_init tcp_sk_init(struct net *net) /* rfc5961 challenge ack rate limiting */ net->ipv4.sysctl_tcp_challenge_ack_limit = 1000; net->ipv4.sysctl_tcp_min_tso_segs = 2; + net->ipv4.sysctl_tcp_min_rtt_wlen = 300; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); -- cgit v1.2.3 From 790f00e19f65673c3c169dfc137c09a9236847d5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Oct 2017 07:47:29 -0700 Subject: tcp: Namespace-ify sysctl_tcp_autocorking Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - net/ipv4/sysctl_net_ipv4.c | 18 +++++++++--------- net/ipv4/tcp.c | 4 +--- net/ipv4/tcp_ipv4.c | 1 + 5 files changed, 12 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 1a66af8a0d32..537830882149 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -150,6 +150,7 @@ struct netns_ipv4 { int sysctl_tcp_challenge_ack_limit; int sysctl_tcp_min_tso_segs; int sysctl_tcp_min_rtt_wlen; + int sysctl_tcp_autocorking; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index 56f50c9a3e6a..0268f1025d9d 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -250,7 +250,6 @@ extern int sysctl_tcp_rmem[3]; #define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */ -extern int sysctl_tcp_autocorking; extern int sysctl_tcp_invalid_ratelimit; extern int sysctl_tcp_pacing_ss_ratio; extern int sysctl_tcp_pacing_ca_ratio; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 029692d2e4ae..43a18a317053 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -510,15 +510,6 @@ static struct ctl_table ipv4_table[] = { .extra1 = &zero, .extra2 = &thousand, }, - { - .procname = "tcp_autocorking", - .data = &sysctl_tcp_autocorking, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, - }, { .procname = "tcp_invalid_ratelimit", .data = &sysctl_tcp_invalid_ratelimit, @@ -1145,6 +1136,15 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "tcp_autocorking", + .data = &init_net.ipv4.sysctl_tcp_autocorking, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, { } }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index a01c97708d83..a7a0f316eb86 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -285,8 +285,6 @@ #include -int sysctl_tcp_autocorking __read_mostly = 1; - struct percpu_counter tcp_orphan_count; EXPORT_SYMBOL_GPL(tcp_orphan_count); @@ -697,7 +695,7 @@ static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb, int size_goal) { return skb->len < size_goal && - sysctl_tcp_autocorking && + sock_net(sk)->ipv4.sysctl_tcp_autocorking && skb != tcp_write_queue_head(sk) && refcount_read(&sk->sk_wmem_alloc) > skb->truesize; } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index ced35af5737a..351e3497c8f3 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2505,6 +2505,7 @@ static int __net_init tcp_sk_init(struct net *net) 
net->ipv4.sysctl_tcp_challenge_ack_limit = 1000; net->ipv4.sysctl_tcp_min_tso_segs = 2; net->ipv4.sysctl_tcp_min_rtt_wlen = 300; + net->ipv4.sysctl_tcp_autocorking = 1; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); -- cgit v1.2.3 From 4170ba6b589ced82da56c7e4f71cc84b2be036d6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Oct 2017 07:47:30 -0700 Subject: tcp: Namespace-ify sysctl_tcp_invalid_ratelimit Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - net/ipv4/sysctl_net_ipv4.c | 14 +++++++------- net/ipv4/tcp_input.c | 3 +-- net/ipv4/tcp_ipv4.c | 1 + 5 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 537830882149..e52c2124b32e 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -151,6 +151,7 @@ struct netns_ipv4 { int sysctl_tcp_min_tso_segs; int sysctl_tcp_min_rtt_wlen; int sysctl_tcp_autocorking; + int sysctl_tcp_invalid_ratelimit; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index 0268f1025d9d..5869a822ecb1 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -250,7 +250,6 @@ extern int sysctl_tcp_rmem[3]; #define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */ -extern int sysctl_tcp_invalid_ratelimit; extern int sysctl_tcp_pacing_ss_ratio; extern int sysctl_tcp_pacing_ca_ratio; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 43a18a317053..6a9349c27f00 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -510,13 +510,6 @@ static struct ctl_table ipv4_table[] = { .extra1 = &zero, .extra2 = &thousand, }, - { - .procname = "tcp_invalid_ratelimit", - .data = &sysctl_tcp_invalid_ratelimit, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_ms_jiffies, - }, { .procname = "tcp_available_ulp", .maxlen = TCP_ULP_BUF_MAX, @@ -1145,6 +1138,13 @@ static struct ctl_table ipv4_net_table[] = { .extra1 = &zero, .extra2 = &one, }, + { + .procname = "tcp_invalid_ratelimit", + .data = &init_net.ipv4.sysctl_tcp_invalid_ratelimit, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_ms_jiffies, + }, { } }; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index da1ef666d1f9..db4d458d0205 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -80,7 +80,6 @@ #include int sysctl_tcp_max_orphans __read_mostly = NR_FILE; -int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2; #define FLAG_DATA 0x01 /* Incoming frame contained data. */ #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ @@ -3403,7 +3402,7 @@ static bool __tcp_oow_rate_limited(struct net *net, int mib_idx, if (*last_oow_ack_time) { s32 elapsed = (s32)(tcp_jiffies32 - *last_oow_ack_time); - if (0 <= elapsed && elapsed < sysctl_tcp_invalid_ratelimit) { + if (0 <= elapsed && elapsed < net->ipv4.sysctl_tcp_invalid_ratelimit) { NET_INC_STATS(net, mib_idx); return true; /* rate-limited: don't send yet! 
*/ } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 351e3497c8f3..6617aae18ba2 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2506,6 +2506,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_min_tso_segs = 2; net->ipv4.sysctl_tcp_min_rtt_wlen = 300; net->ipv4.sysctl_tcp_autocorking = 1; + net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); -- cgit v1.2.3 From 23a7102a2d1068508fa2a0ce593a0df7f8fdc0ac Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Oct 2017 07:47:31 -0700 Subject: tcp: Namespace-ify sysctl_tcp_pacing_ss_ratio Also remove an obsolete comment about TCP pacing. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - net/ipv4/sysctl_net_ipv4.c | 18 +++++++++--------- net/ipv4/tcp_input.c | 9 +-------- net/ipv4/tcp_ipv4.c | 1 + 5 files changed, 12 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index e52c2124b32e..eb2dcf1cbe61 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -152,6 +152,7 @@ struct netns_ipv4 { int sysctl_tcp_min_rtt_wlen; int sysctl_tcp_autocorking; int sysctl_tcp_invalid_ratelimit; + int sysctl_tcp_pacing_ss_ratio; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index 5869a822ecb1..2a5f8261ca03 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -250,7 +250,6 @@ extern int sysctl_tcp_rmem[3]; #define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */ -extern int sysctl_tcp_pacing_ss_ratio; extern int sysctl_tcp_pacing_ca_ratio; extern atomic_long_t tcp_memory_allocated; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 6a9349c27f00..7f0dba852d47 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -492,15 +492,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_allowed_congestion_control, }, - { - .procname = "tcp_pacing_ss_ratio", - .data = &sysctl_tcp_pacing_ss_ratio, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &thousand, - }, { .procname = "tcp_pacing_ca_ratio", .data = &sysctl_tcp_pacing_ca_ratio, @@ -1145,6 +1136,15 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec_ms_jiffies, }, + { + .procname = "tcp_pacing_ss_ratio", + .data = &init_net.ipv4.sysctl_tcp_pacing_ss_ratio, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &thousand, + }, { } }; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index db4d458d0205..29539d39e61a 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -767,13 +767,6 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us) tp->srtt_us = max(1U, srtt); } -/* Set the sk_pacing_rate to allow proper sizing of TSO packets. - * Note: TCP stack does not yet implement pacing. 
- * FQ packet scheduler can be used to implement cheap but effective - * TCP pacing, to smooth the burst on large writes when packets - * in flight is significantly lower than cwnd (or rwin) - */ -int sysctl_tcp_pacing_ss_ratio __read_mostly = 200; int sysctl_tcp_pacing_ca_ratio __read_mostly = 120; static void tcp_update_pacing_rate(struct sock *sk) @@ -793,7 +786,7 @@ static void tcp_update_pacing_rate(struct sock *sk) * end of slow start and should slow down. */ if (tp->snd_cwnd < tp->snd_ssthresh / 2) - rate *= sysctl_tcp_pacing_ss_ratio; + rate *= sock_net(sk)->ipv4.sysctl_tcp_pacing_ss_ratio; else rate *= sysctl_tcp_pacing_ca_ratio; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 6617aae18ba2..1d8fc663af51 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2507,6 +2507,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_min_rtt_wlen = 300; net->ipv4.sysctl_tcp_autocorking = 1; net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2; + net->ipv4.sysctl_tcp_pacing_ss_ratio = 200; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); -- cgit v1.2.3 From c26e91f8b9b8e1fd252e07c1f60e50220cd7ebab Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Oct 2017 07:47:32 -0700 Subject: tcp: Namespace-ify sysctl_tcp_pacing_ca_ratio Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 2 -- net/ipv4/sysctl_net_ipv4.c | 18 +++++++++--------- net/ipv4/tcp_input.c | 4 +--- net/ipv4/tcp_ipv4.c | 1 + 5 files changed, 12 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index eb2dcf1cbe61..141ba82b5efb 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -153,6 +153,7 @@ struct netns_ipv4 { int sysctl_tcp_autocorking; int sysctl_tcp_invalid_ratelimit; int sysctl_tcp_pacing_ss_ratio; + int sysctl_tcp_pacing_ca_ratio; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index 2a5f8261ca03..092d606fcc16 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -250,8 +250,6 @@ extern int sysctl_tcp_rmem[3]; #define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */ -extern int sysctl_tcp_pacing_ca_ratio; - extern atomic_long_t tcp_memory_allocated; extern struct percpu_counter tcp_sockets_allocated; extern unsigned long tcp_memory_pressure; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 7f0dba852d47..4602af6d5358 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -492,15 +492,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_allowed_congestion_control, }, - { - .procname = "tcp_pacing_ca_ratio", - .data = &sysctl_tcp_pacing_ca_ratio, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &thousand, - }, { .procname = "tcp_available_ulp", .maxlen = TCP_ULP_BUF_MAX, @@ -1145,6 +1136,15 @@ static struct ctl_table ipv4_net_table[] = { .extra1 = &zero, .extra2 = &thousand, }, + { + .procname = "tcp_pacing_ca_ratio", + .data = &init_net.ipv4.sysctl_tcp_pacing_ca_ratio, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &thousand, + }, { } }; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 29539d39e61a..21c358c0cf2e 100644 --- a/net/ipv4/tcp_input.c +++ 
b/net/ipv4/tcp_input.c @@ -767,8 +767,6 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us) tp->srtt_us = max(1U, srtt); } -int sysctl_tcp_pacing_ca_ratio __read_mostly = 120; - static void tcp_update_pacing_rate(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); @@ -788,7 +786,7 @@ static void tcp_update_pacing_rate(struct sock *sk) if (tp->snd_cwnd < tp->snd_ssthresh / 2) rate *= sock_net(sk)->ipv4.sysctl_tcp_pacing_ss_ratio; else - rate *= sysctl_tcp_pacing_ca_ratio; + rate *= sock_net(sk)->ipv4.sysctl_tcp_pacing_ca_ratio; rate *= max(tp->snd_cwnd, tp->packets_out); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 1d8fc663af51..7c1dae6493c3 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2508,6 +2508,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_autocorking = 1; net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2; net->ipv4.sysctl_tcp_pacing_ss_ratio = 200; + net->ipv4.sysctl_tcp_pacing_ca_ratio = 120; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); -- cgit v1.2.3 From 5b52a4c3acf5f4b4854d1c3ddc8be8770330a79c Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 27 Oct 2017 10:01:39 -0700 Subject: tcp: remove unnecessary include Two extra #include directives are not necessary in tcp.h. Remove them. Fixes: 40304b2a1567 ("bpf: BPF support for sock_ops") Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/net/tcp.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 092d606fcc16..aa1cc90fdc02 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -45,9 +45,6 @@ #include #include - -#include -#include #include extern struct inet_hashinfo tcp_hashinfo; -- cgit v1.2.3
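The run of Namespace-ify commits above is one mechanical recipe applied knob by knob: add an int field to struct netns_ipv4, move the ctl_table entry from the global ipv4_table to the per-netns ipv4_net_table, seed the default in tcp_sk_init(), and switch every reader to sock_net(sk)->ipv4.sysctl_*. Before the series each of these knobs was a single global variable; afterwards every network namespace carries its own copy. A minimal userspace sketch of that isolation (an illustration, not part of the series: it assumes a kernel carrying these patches, needs CAP_NET_ADMIN, and the expected value 200 is the tcp_pacing_ss_ratio default seeded above):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

/* Read an integer sysctl through /proc; returns -1 on error. */
static int read_knob(const char *path)
{
	FILE *f = fopen(path, "r");
	int val = -1;

	if (!f)
		return -1;
	if (fscanf(f, "%d", &val) != 1)
		val = -1;
	fclose(f);
	return val;
}

int main(void)
{
	const char *knob = "/proc/sys/net/ipv4/tcp_pacing_ss_ratio";

	/* Current namespace: whatever the admin tuned it to. */
	printf("current netns: %d\n", read_knob(knob));

	/* Fresh namespace: tcp_sk_init() runs again, so the knob comes
	 * back at its default (200), independent of the parent netns.
	 */
	if (unshare(CLONE_NEWNET) != 0) {
		perror("unshare(CLONE_NEWNET)");
		return 1;
	}
	printf("fresh netns:   %d\n", read_knob(knob));
	return 0;
}

The same experiment works for any knob moved in this series; before the conversion the second read would not have shown an independent per-namespace value, since the entry lived in the global table.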
From 2ea2352ede9d97585164a7e19224955f4e4ca8db Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 27 Oct 2017 17:30:12 -0700 Subject: ipv6: prevent user from adding cached routes Cached routes should only be created by the system when receiving a PMTU discovery or ICMP redirect message. Users should not be allowed to create cached routes. Furthermore, after the patch series to move cached routes into the exception table, user-added cached routes will trigger the following warning in fib6_add(): WARNING: CPU: 0 PID: 2985 at net/ipv6/ip6_fib.c:1137 fib6_add+0x20d9/0x2c10 net/ipv6/ip6_fib.c:1137 Kernel panic - not syncing: panic_on_warn set ... CPU: 0 PID: 2985 Comm: syzkaller320388 Not tainted 4.14.0-rc3+ #74 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:16 [inline] dump_stack+0x194/0x257 lib/dump_stack.c:52 panic+0x1e4/0x417 kernel/panic.c:181 __warn+0x1c4/0x1d9 kernel/panic.c:542 report_bug+0x211/0x2d0 lib/bug.c:183 fixup_bug+0x40/0x90 arch/x86/kernel/traps.c:178 do_trap_no_signal arch/x86/kernel/traps.c:212 [inline] do_trap+0x260/0x390 arch/x86/kernel/traps.c:261 do_error_trap+0x120/0x390 arch/x86/kernel/traps.c:298 do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:311 invalid_op+0x18/0x20 arch/x86/entry/entry_64.S:905 RIP: 0010:fib6_add+0x20d9/0x2c10 net/ipv6/ip6_fib.c:1137 RSP: 0018:ffff8801cf09f6a0 EFLAGS: 00010297 RAX: ffff8801ce45e340 RBX: 1ffff10039e13eec RCX: ffff8801d749c814 RDX: 0000000000000000 RSI: ffff8801d749c700 RDI: ffff8801d749c780 RBP: ffff8801cf09fa08 R08: 0000000000000000 R09: ffff8801cf09f360 R10: ffff8801cf09f2d8 R11: 1ffff10039c8befb R12: 0000000000000001 R13: dffffc0000000000 R14: ffff8801d749c700 R15: ffffffff860655c0 __ip6_ins_rt+0x6c/0x90 net/ipv6/route.c:1011 ip6_route_add+0x148/0x1a0 net/ipv6/route.c:2782 ipv6_route_ioctl+0x4d5/0x690 net/ipv6/route.c:3291 inet6_ioctl+0xef/0x1e0 net/ipv6/af_inet6.c:521 sock_do_ioctl+0x65/0xb0 net/socket.c:961 sock_ioctl+0x2c2/0x440 net/socket.c:1058 vfs_ioctl fs/ioctl.c:45 [inline] do_vfs_ioctl+0x1b1/0x1530 fs/ioctl.c:685 SYSC_ioctl fs/ioctl.c:700 [inline] SyS_ioctl+0x8f/0xc0 fs/ioctl.c:691 entry_SYSCALL_64_fastpath+0x1f/0xbe So we fix this by failing the attempt to add cached routes from userspace and returning an EINVAL error. Fixes: 2b760fcf5cfb ("ipv6: hook up exception table to store dst cache") Signed-off-by: Wei Wang Signed-off-by: Eric Dumazet Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- include/uapi/linux/ipv6_route.h | 2 +- net/ipv6/route.c | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/ipv6_route.h b/include/uapi/linux/ipv6_route.h index d496c02e14bc..c15d8054905c 100644 --- a/include/uapi/linux/ipv6_route.h +++ b/include/uapi/linux/ipv6_route.h @@ -28,7 +28,7 @@ #define RTF_ROUTEINFO 0x00800000 /* route information - RA */ -#define RTF_CACHE 0x01000000 /* cache entry */ +#define RTF_CACHE 0x01000000 /* read-only: can not be set by user */ #define RTF_FLOW 0x02000000 /* flow significant route */ #define RTF_POLICY 0x04000000 /* policy route */ diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 605e5dc1c010..70d9659fc1e9 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2478,6 +2478,12 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, goto out; } + /* RTF_CACHE is an internal flag; can not be set by userspace */ + if (cfg->fc_flags & RTF_CACHE) { + NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE"); + goto out; + } + if (cfg->fc_dst_len > 128) { NL_SET_ERR_MSG(extack, "Invalid prefix length"); goto out; -- cgit v1.2.3
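Seen from userspace, the effect of the check above is that a route-add request carrying RTF_CACHE now fails cleanly with EINVAL instead of tripping the fib6_add() warning. A small sketch of the rejected path, using the same ioctl interface the syzkaller reproducer went through (illustrative only: the prefix is arbitrary, and without CAP_NET_ADMIN the call fails earlier with EPERM):

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <linux/ipv6_route.h>	/* struct in6_rtmsg, RTF_CACHE */
#include <linux/sockios.h>	/* SIOCADDRT */

int main(void)
{
	struct in6_rtmsg rt;
	int fd = socket(AF_INET6, SOCK_DGRAM, 0);

	if (fd < 0) {
		perror("socket");
		return 1;
	}

	memset(&rt, 0, sizeof(rt));
	inet_pton(AF_INET6, "2001:db8::", &rt.rtmsg_dst);	/* arbitrary test prefix */
	rt.rtmsg_dst_len = 64;
	rt.rtmsg_metric = 1;
	rt.rtmsg_flags = RTF_CACHE;	/* internal-only flag, now rejected */

	/* Reaches ipv6_route_ioctl() -> ip6_route_add(); with the check
	 * above, ip6_route_info_create() bails out before touching the FIB.
	 */
	if (ioctl(fd, SIOCADDRT, &rt) < 0)
		printf("SIOCADDRT: %s (EINVAL expected)\n", strerror(errno));
	return 0;
}

The netlink path (RTM_NEWROUTE) hits the same check, and there the NL_SET_ERR_MSG text is reported back to the caller through the extended-ack mechanism.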
From a190d04db93710ae166749055b6985397c6d13f5 Mon Sep 17 00:00:00 2001 From: Mahesh Bandewar Date: Thu, 26 Oct 2017 15:09:21 -0700 Subject: ipvlan: introduce 'private' attribute for all existing modes. IPvlan has always operated in bridge mode. However there are scenarios where each slave should be able to talk through the master device but not necessarily across each other. Think of an environment where each namespace is a private and independent customer. In this scenario the machine hosting these namespaces should neither reveal who its neighbors are, nor should the individual namespaces be able to talk to a neighbor over a short-circuited network path. This patch implements a mode very similar to macvlan's 'private' mode, where individual slaves can send and receive traffic through the master device but cannot talk among themselves. Signed-off-by: Mahesh Bandewar Signed-off-by: David S. Miller --- Documentation/networking/ipvlan.txt | 30 ++++++++++++++++++++++++++--- drivers/net/ipvlan/ipvlan.h | 16 ++++++++++++++++ drivers/net/ipvlan/ipvlan_core.c | 15 ++++++++++++--- drivers/net/ipvlan/ipvlan_main.c | 38 +++++++++++++++++++++++++++++++++++-- include/uapi/linux/if_link.h | 3 +++ 5 files changed, 94 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/Documentation/networking/ipvlan.txt b/Documentation/networking/ipvlan.txt index 1fe42a874aae..bfa91c77a4c9 100644 --- a/Documentation/networking/ipvlan.txt +++ b/Documentation/networking/ipvlan.txt @@ -22,9 +22,19 @@ The driver can be built into the kernel (CONFIG_IPVLAN=y) or as a module There are no module parameters for this driver and it can be configured using IProute2/ip utility. - ip link add link name type ipvlan mode { l2 | l3 | l3s } + ip link add link name type ipvlan [ mode MODE ] [ FLAGS ] + where + MODE: l3 (default) | l3s | l2 + FLAGS: bridge (default) | private - e.g. ip link add link eth0 name ipvl0 type ipvlan mode l2 + e.g. + (a) Following will create IPvlan link with eth0 as master in + L3 bridge mode + bash# ip link add link eth0 name ipvl0 type ipvlan + (b) This command will create IPvlan link in L2 bridge mode. + bash# ip link add link eth0 name ipvl0 type ipvlan mode l2 bridge + (c) This command will create an IPvlan device in L2 private mode. + bash# ip link add link eth0 name ipvlan type ipvlan mode l2 private 4. Operating modes: @@ -54,7 +64,21 @@ works in this mode and hence it is L3-symmetric (L3s). This will have slightly less performance but that shouldn't matter since you are choosing this mode over plain-L3 mode to make conn-tracking work. -5. What to choose (macvlan vs. ipvlan)? +5. Mode flags: + At this time following mode flags are available +5.1 bridge: + This is the default option. To configure the IPvlan port in this mode, +user can choose to either add this option on the command-line or don't specify +anything. This is the traditional mode where slaves can cross-talk among +themseleves apart from talking through the master device. + +5.2 private: + If this option is added to the command-line, the port is set in private +mode. i.e. port wont allow cross communication between slaves. + + +6. What to choose (macvlan vs. ipvlan)? These two devices are very similar in many regards and the specific use case could very well define which device to choose.
if one of the following situations defines your use case then you can choose to use ipvlan - diff --git a/drivers/net/ipvlan/ipvlan.h b/drivers/net/ipvlan/ipvlan.h index ba8173a0b62e..9941851bcc13 100644 --- a/drivers/net/ipvlan/ipvlan.h +++ b/drivers/net/ipvlan/ipvlan.h @@ -96,6 +96,7 @@ struct ipvl_port { struct hlist_head hlhead[IPVLAN_HASH_SIZE]; struct list_head ipvlans; u16 mode; + u16 flags; u16 dev_id_start; struct work_struct wq; struct sk_buff_head backlog; @@ -123,6 +124,21 @@ static inline struct ipvl_port *ipvlan_port_get_rtnl(const struct net_device *d) return rtnl_dereference(d->rx_handler_data); } +static inline bool ipvlan_is_private(const struct ipvl_port *port) +{ + return !!(port->flags & IPVLAN_F_PRIVATE); +} + +static inline void ipvlan_mark_private(struct ipvl_port *port) +{ + port->flags |= IPVLAN_F_PRIVATE; +} + +static inline void ipvlan_clear_private(struct ipvl_port *port) +{ + port->flags &= ~IPVLAN_F_PRIVATE; +} + void ipvlan_init_secret(void); unsigned int ipvlan_mac_hash(const unsigned char *addr); rx_handler_result_t ipvlan_handle_frame(struct sk_buff **pskb); diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c index 1f3295e274d0..72fd56de9c00 100644 --- a/drivers/net/ipvlan/ipvlan_core.c +++ b/drivers/net/ipvlan/ipvlan_core.c @@ -515,9 +515,13 @@ static int ipvlan_xmit_mode_l3(struct sk_buff *skb, struct net_device *dev) goto out; addr = ipvlan_addr_lookup(ipvlan->port, lyr3h, addr_type, true); - if (addr) + if (addr) { + if (ipvlan_is_private(ipvlan->port)) { + consume_skb(skb); + return NET_XMIT_DROP; + } return ipvlan_rcv_frame(addr, &skb, true); - + } out: ipvlan_skb_crossing_ns(skb, ipvlan->phy_dev); return ipvlan_process_outbound(skb); @@ -535,8 +539,13 @@ static int ipvlan_xmit_mode_l2(struct sk_buff *skb, struct net_device *dev) lyr3h = ipvlan_get_L3_hdr(skb, &addr_type); if (lyr3h) { addr = ipvlan_addr_lookup(ipvlan->port, lyr3h, addr_type, true); - if (addr) + if (addr) { + if (ipvlan_is_private(ipvlan->port)) { + consume_skb(skb); + return NET_XMIT_DROP; + } return ipvlan_rcv_frame(addr, &skb, true); + } } skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index f0ab55df57f1..4368afb1934c 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -462,11 +462,24 @@ static int ipvlan_nl_changelink(struct net_device *dev, struct ipvl_port *port = ipvlan_port_get_rtnl(ipvlan->phy_dev); int err = 0; - if (data && data[IFLA_IPVLAN_MODE]) { + if (!data) + return 0; + + if (data[IFLA_IPVLAN_MODE]) { u16 nmode = nla_get_u16(data[IFLA_IPVLAN_MODE]); err = ipvlan_set_port_mode(port, nmode); } + + if (!err && data[IFLA_IPVLAN_FLAGS]) { + u16 flags = nla_get_u16(data[IFLA_IPVLAN_FLAGS]); + + if (flags & IPVLAN_F_PRIVATE) + ipvlan_mark_private(port); + else + ipvlan_clear_private(port); + } + return err; } @@ -474,18 +487,30 @@ static size_t ipvlan_nl_getsize(const struct net_device *dev) { return (0 + nla_total_size(2) /* IFLA_IPVLAN_MODE */ + + nla_total_size(2) /* IFLA_IPVLAN_FLAGS */ ); } static int ipvlan_nl_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { - if (data && data[IFLA_IPVLAN_MODE]) { + if (!data) + return 0; + + if (data[IFLA_IPVLAN_MODE]) { u16 mode = nla_get_u16(data[IFLA_IPVLAN_MODE]); if (mode < IPVLAN_MODE_L2 || mode >= IPVLAN_MODE_MAX) return -EINVAL; } + if (data[IFLA_IPVLAN_FLAGS]) { + u16 flags = nla_get_u16(data[IFLA_IPVLAN_FLAGS]); + + /* Only one bit is 
used at this moment. */ + if (flags & ~IPVLAN_F_PRIVATE) + return -EINVAL; + } + return 0; } @@ -502,6 +527,8 @@ static int ipvlan_nl_fillinfo(struct sk_buff *skb, ret = -EMSGSIZE; if (nla_put_u16(skb, IFLA_IPVLAN_MODE, port->mode)) goto err; + if (nla_put_u16(skb, IFLA_IPVLAN_FLAGS, port->flags)) + goto err; return 0; @@ -549,6 +576,12 @@ int ipvlan_link_new(struct net *src_net, struct net_device *dev, ipvlan_adjust_mtu(ipvlan, phy_dev); INIT_LIST_HEAD(&ipvlan->addrs); + /* Flags are per port and latest update overrides. User has + * to be consistent in setting it just like the mode attribute. + */ + if (data && data[IFLA_IPVLAN_FLAGS]) + ipvlan->port->flags = nla_get_u16(data[IFLA_IPVLAN_FLAGS]); + /* If the port-id base is at the MAX value, then wrap it around and * begin from 0x1 again. This may be due to a busy system where lots * of slaves are getting created and deleted. @@ -644,6 +677,7 @@ EXPORT_SYMBOL_GPL(ipvlan_link_setup); static const struct nla_policy ipvlan_nl_policy[IFLA_IPVLAN_MAX + 1] = { [IFLA_IPVLAN_MODE] = { .type = NLA_U16 }, + [IFLA_IPVLAN_FLAGS] = { .type = NLA_U16 }, }; static struct rtnl_link_ops ipvlan_link_ops = { diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index b037e0ab1975..052e32cd584c 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -465,6 +465,7 @@ enum macsec_validation_type { enum { IFLA_IPVLAN_UNSPEC, IFLA_IPVLAN_MODE, + IFLA_IPVLAN_FLAGS, __IFLA_IPVLAN_MAX }; @@ -477,6 +478,8 @@ enum ipvlan_mode { IPVLAN_MODE_MAX }; +#define IPVLAN_F_PRIVATE 0x01 + /* VXLAN section */ enum { IFLA_VXLAN_UNSPEC, -- cgit v1.2.3 From fe89aa6b250c1011ccf425fbb7998e96bd54263f Mon Sep 17 00:00:00 2001 From: Mahesh Bandewar Date: Thu, 26 Oct 2017 15:09:25 -0700 Subject: ipvlan: implement VEPA mode This is very similar to the Macvlan VEPA mode, with one difference. IPvlan uses the mac-address of the lower device, so VEPA mode has ICMP-redirect implications for packets destined for immediate neighbors sharing the same master, since those packets will carry the same source and destination mac. The external switch/router will send a redirect message. Having said that, this is a useful debugging tool since IPvlan will not switch packets between its slaves and relies completely on the external entity, as intended in 802.1Qbg. Signed-off-by: Mahesh Bandewar Signed-off-by: David S. Miller --- Documentation/networking/ipvlan.txt | 12 +++++++++++- drivers/net/ipvlan/ipvlan.h | 15 +++++++++++++++ drivers/net/ipvlan/ipvlan_core.c | 17 ++++++++++------- drivers/net/ipvlan/ipvlan_main.c | 13 +++++++++++-- include/uapi/linux/if_link.h | 1 + 5 files changed, 48 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/Documentation/networking/ipvlan.txt b/Documentation/networking/ipvlan.txt index bfa91c77a4c9..812ef003e0a8 100644 --- a/Documentation/networking/ipvlan.txt +++ b/Documentation/networking/ipvlan.txt @@ -25,7 +25,7 @@ using IProute2/ip utility. ip link add link name type ipvlan [ mode MODE ] [ FLAGS ] where MODE: l3 (default) | l3s | l2 - FLAGS: bridge (default) | private + FLAGS: bridge (default) | private | vepa e.g. (a) Following will create IPvlan link with eth0 as master in @@ -35,6 +35,8 @@ using IProute2/ip utility. bash# ip link add link eth0 name ipvl0 type ipvlan mode l2 bridge (c) This command will create an IPvlan device in L2 private mode. bash# ip link add link eth0 name ipvlan type ipvlan mode l2 private + (d) This command will create an IPvlan device in L2 vepa mode. 
+ bash# ip link add link eth0 name ipvlan type ipvlan mode l2 vepa 4. Operating modes: @@ -77,6 +79,14 @@ themseleves apart from talking through the master device. If this option is added to the command-line, the port is set in private mode. i.e. port wont allow cross communication between slaves. +5.3 vepa: + If this is added to the command-line, the port is set in VEPA mode. +i.e. port will offload switching functionality to the external entity as +described in 802.1Qbg +Note: VEPA mode in IPvlan has limitations. IPvlan uses the mac-address of the +master-device, so the packets which are emitted in this mode for the adjacent +neighbor will have source and destination mac same. This will make the switch / +router send the redirect message. 6. What to choose (macvlan vs. ipvlan)? These two devices are very similar in many regards and the specific use diff --git a/drivers/net/ipvlan/ipvlan.h b/drivers/net/ipvlan/ipvlan.h index 9941851bcc13..5166575a164d 100644 --- a/drivers/net/ipvlan/ipvlan.h +++ b/drivers/net/ipvlan/ipvlan.h @@ -139,6 +139,21 @@ static inline void ipvlan_clear_private(struct ipvl_port *port) port->flags &= ~IPVLAN_F_PRIVATE; } +static inline bool ipvlan_is_vepa(const struct ipvl_port *port) +{ + return !!(port->flags & IPVLAN_F_VEPA); +} + +static inline void ipvlan_mark_vepa(struct ipvl_port *port) +{ + port->flags |= IPVLAN_F_VEPA; +} + +static inline void ipvlan_clear_vepa(struct ipvl_port *port) +{ + port->flags &= ~IPVLAN_F_VEPA; +} + void ipvlan_init_secret(void); unsigned int ipvlan_mac_hash(const unsigned char *addr); rx_handler_result_t ipvlan_handle_frame(struct sk_buff **pskb); diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c index 72fd56de9c00..034ae4c57196 100644 --- a/drivers/net/ipvlan/ipvlan_core.c +++ b/drivers/net/ipvlan/ipvlan_core.c @@ -514,13 +514,15 @@ static int ipvlan_xmit_mode_l3(struct sk_buff *skb, struct net_device *dev) if (!lyr3h) goto out; - addr = ipvlan_addr_lookup(ipvlan->port, lyr3h, addr_type, true); - if (addr) { - if (ipvlan_is_private(ipvlan->port)) { - consume_skb(skb); - return NET_XMIT_DROP; + if (!ipvlan_is_vepa(ipvlan->port)) { + addr = ipvlan_addr_lookup(ipvlan->port, lyr3h, addr_type, true); + if (addr) { + if (ipvlan_is_private(ipvlan->port)) { + consume_skb(skb); + return NET_XMIT_DROP; + } + return ipvlan_rcv_frame(addr, &skb, true); } - return ipvlan_rcv_frame(addr, &skb, true); } out: ipvlan_skb_crossing_ns(skb, ipvlan->phy_dev); @@ -535,7 +537,8 @@ static int ipvlan_xmit_mode_l2(struct sk_buff *skb, struct net_device *dev) void *lyr3h; int addr_type; - if (ether_addr_equal(eth->h_dest, eth->h_source)) { + if (!ipvlan_is_vepa(ipvlan->port) && + ether_addr_equal(eth->h_dest, eth->h_source)) { lyr3h = ipvlan_get_L3_hdr(skb, &addr_type); if (lyr3h) { addr = ipvlan_addr_lookup(ipvlan->port, lyr3h, addr_type, true); diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index 4368afb1934c..a266aa435d4d 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -478,6 +478,11 @@ static int ipvlan_nl_changelink(struct net_device *dev, ipvlan_mark_private(port); else ipvlan_clear_private(port); + + if (flags & IPVLAN_F_VEPA) + ipvlan_mark_vepa(port); + else + ipvlan_clear_vepa(port); } return err; @@ -506,8 +511,12 @@ static int ipvlan_nl_validate(struct nlattr *tb[], struct nlattr *data[], if (data[IFLA_IPVLAN_FLAGS]) { u16 flags = nla_get_u16(data[IFLA_IPVLAN_FLAGS]); - /* Only one bit is used at this moment. 
*/ - if (flags & ~IPVLAN_F_PRIVATE) + /* Only two bits are used at this moment. */ + if (flags & ~(IPVLAN_F_PRIVATE | IPVLAN_F_VEPA)) + return -EINVAL; + /* Also both flags can't be active at the same time. */ + if ((flags & (IPVLAN_F_PRIVATE | IPVLAN_F_VEPA)) == + (IPVLAN_F_PRIVATE | IPVLAN_F_VEPA)) return -EINVAL; } diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 052e32cd584c..81f26473d728 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -479,6 +479,7 @@ enum ipvlan_mode { }; #define IPVLAN_F_PRIVATE 0x01 +#define IPVLAN_F_VEPA 0x02 /* VXLAN section */ enum { -- cgit v1.2.3 From 1f01d8be0e6a04bd682a55f6d50c14c1679e7571 Mon Sep 17 00:00:00 2001 From: Konrad Zapałowicz Date: Tue, 17 Oct 2017 15:53:49 +0200 Subject: Bluetooth: increase timeout for le auto connections MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch increases the connection timeout for LE connections that are triggered by the advertising report to 4 seconds. It has been observed that devices equipped with wifi+bt combo SoC fail to create a connection with BLE devices due to their coexistence issues. Increasing this timeout gives them enough time to complete the connection with success. Signed-off-by: Konrad Zapałowicz Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index fe98f0a5bef0..1668211297a9 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -273,7 +273,7 @@ enum { #define HCI_AUTO_OFF_TIMEOUT msecs_to_jiffies(2000) /* 2 seconds */ #define HCI_POWER_OFF_TIMEOUT msecs_to_jiffies(5000) /* 5 seconds */ #define HCI_LE_CONN_TIMEOUT msecs_to_jiffies(20000) /* 20 seconds */ -#define HCI_LE_AUTOCONN_TIMEOUT msecs_to_jiffies(2000) /* 2 seconds */ +#define HCI_LE_AUTOCONN_TIMEOUT msecs_to_jiffies(4000) /* 4 seconds */ /* HCI data types */ #define HCI_COMMAND_PKT 0x01 -- cgit v1.2.3 From 2064ee332e4c1b7495cf68b84355c213d8fe71fd Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Mon, 30 Oct 2017 10:42:59 +0100 Subject: Bluetooth: Use bt_dev_err and bt_dev_info when possible In case of using BT_ERR and BT_INFO, convert to bt_dev_err and bt_dev_info when possible. This allows for controller specific reporting. 
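The conversion itself is mechanical: every BT_ERR()/BT_INFO() whose format began with "%s", hdev->name becomes a bt_dev_err()/bt_dev_info() that takes the hci_dev directly. A sketch of the pattern follows; the real macro bodies live in include/net/bluetooth/bluetooth.h, and the shape shown here is inferred from the call sites in this patch rather than copied from the header:

/* Device-prefixed logging helpers, roughly: */
#define bt_dev_info(hdev, fmt, ...) \
	BT_INFO("%s: " fmt, (hdev)->name, ##__VA_ARGS__)
#define bt_dev_err(hdev, fmt, ...) \
	BT_ERR("%s: " fmt, (hdev)->name, ##__VA_ARGS__)

/* Before: */
BT_ERR("%s: BCM: Patch command %04x failed (%d)", hdev->name, opcode, err);
/* After: */
bt_dev_err(hdev, "BCM: Patch command %04x failed (%d)", opcode, err);

Besides trimming boilerplate at every call site, funnelling each message through a helper that knows the device gives one place to hang controller-specific reporting, which is the point of the change.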
Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- drivers/bluetooth/bpa10x.c | 15 ++-- drivers/bluetooth/bt3c_cs.c | 2 +- drivers/bluetooth/btbcm.c | 103 ++++++++++++------------- drivers/bluetooth/btintel.c | 42 +++++------ drivers/bluetooth/btqca.c | 6 +- drivers/bluetooth/btrtl.c | 21 +++--- drivers/bluetooth/btusb.c | 155 ++++++++++++++++++-------------------- drivers/bluetooth/hci_ath.c | 2 +- drivers/bluetooth/hci_h4.c | 2 +- drivers/bluetooth/hci_qca.c | 12 +-- drivers/bluetooth/hci_serdev.c | 9 +-- include/net/bluetooth/bluetooth.h | 3 + net/bluetooth/a2mp.c | 2 +- net/bluetooth/amp.c | 4 +- net/bluetooth/hci_conn.c | 4 +- net/bluetooth/hci_core.c | 35 +++++---- net/bluetooth/hci_event.c | 46 +++++------ net/bluetooth/hci_request.c | 16 ++-- net/bluetooth/hci_sysfs.c | 2 +- net/bluetooth/mgmt.c | 56 +++++++------- net/bluetooth/smp.c | 22 +++--- 21 files changed, 279 insertions(+), 280 deletions(-) (limited to 'include') diff --git a/drivers/bluetooth/bpa10x.c b/drivers/bluetooth/bpa10x.c index 48d10cb5c9a1..7971bfbd4321 100644 --- a/drivers/bluetooth/bpa10x.c +++ b/drivers/bluetooth/bpa10x.c @@ -117,7 +117,7 @@ static void bpa10x_rx_complete(struct urb *urb) bpa10x_recv_pkts, ARRAY_SIZE(bpa10x_recv_pkts)); if (IS_ERR(data->rx_skb[idx])) { - BT_ERR("%s corrupted event packet", hdev->name); + bt_dev_err(hdev, "corrupted event packet"); hdev->stat.err_rx++; data->rx_skb[idx] = NULL; } @@ -127,8 +127,7 @@ static void bpa10x_rx_complete(struct urb *urb) err = usb_submit_urb(urb, GFP_ATOMIC); if (err < 0) { - BT_ERR("%s urb %p failed to resubmit (%d)", - hdev->name, urb, -err); + bt_dev_err(hdev, "urb %p failed to resubmit (%d)", urb, -err); usb_unanchor_urb(urb); } } @@ -164,8 +163,7 @@ static inline int bpa10x_submit_intr_urb(struct hci_dev *hdev) err = usb_submit_urb(urb, GFP_KERNEL); if (err < 0) { - BT_ERR("%s urb %p submission failed (%d)", - hdev->name, urb, -err); + bt_dev_err(hdev, "urb %p submission failed (%d)", urb, -err); usb_unanchor_urb(urb); } @@ -205,8 +203,7 @@ static inline int bpa10x_submit_bulk_urb(struct hci_dev *hdev) err = usb_submit_urb(urb, GFP_KERNEL); if (err < 0) { - BT_ERR("%s urb %p submission failed (%d)", - hdev->name, urb, -err); + bt_dev_err(hdev, "urb %p submission failed (%d)", urb, -err); usb_unanchor_urb(urb); } @@ -272,7 +269,7 @@ static int bpa10x_setup(struct hci_dev *hdev) if (IS_ERR(skb)) return PTR_ERR(skb); - BT_INFO("%s: %s", hdev->name, (char *)(skb->data + 1)); + bt_dev_info(hdev, "%s", (char *)(skb->data + 1)); hci_set_fw_info(hdev, "%s", skb->data + 1); @@ -348,7 +345,7 @@ static int bpa10x_send_frame(struct hci_dev *hdev, struct sk_buff *skb) err = usb_submit_urb(urb, GFP_ATOMIC); if (err < 0) { - BT_ERR("%s urb %p submission failed", hdev->name, urb); + bt_dev_err(hdev, "urb %p submission failed", urb); kfree(urb->setup_packet); usb_unanchor_urb(urb); } diff --git a/drivers/bluetooth/bt3c_cs.c b/drivers/bluetooth/bt3c_cs.c index 194788739a83..25b0cf952b91 100644 --- a/drivers/bluetooth/bt3c_cs.c +++ b/drivers/bluetooth/bt3c_cs.c @@ -355,7 +355,7 @@ static irqreturn_t bt3c_interrupt(int irq, void *dev_inst) } else if ((stat & 0xff) != 0xff) { if (stat & 0x0020) { int status = bt3c_read(iobase, 0x7002) & 0x10; - BT_INFO("%s: Antenna %s", info->hdev->name, + bt_dev_info(info->hdev, "Antenna %s", status ? 
"out" : "in"); } if (stat & 0x0001) diff --git a/drivers/bluetooth/btbcm.c b/drivers/bluetooth/btbcm.c index ae1fa390f508..afa4cb3b16e3 100644 --- a/drivers/bluetooth/btbcm.c +++ b/drivers/bluetooth/btbcm.c @@ -45,13 +45,12 @@ int btbcm_check_bdaddr(struct hci_dev *hdev) HCI_INIT_TIMEOUT); if (IS_ERR(skb)) { int err = PTR_ERR(skb); - BT_ERR("%s: BCM: Reading device address failed (%d)", - hdev->name, err); + bt_dev_err(hdev, "BCM: Reading device address failed (%d)", err); return err; } if (skb->len != sizeof(*bda)) { - BT_ERR("%s: BCM: Device address length mismatch", hdev->name); + bt_dev_err(hdev, "BCM: Device address length mismatch"); kfree_skb(skb); return -EIO; } @@ -74,8 +73,8 @@ int btbcm_check_bdaddr(struct hci_dev *hdev) if (!bacmp(&bda->bdaddr, BDADDR_BCM20702A0) || !bacmp(&bda->bdaddr, BDADDR_BCM4324B3) || !bacmp(&bda->bdaddr, BDADDR_BCM4330B1)) { - BT_INFO("%s: BCM: Using default device address (%pMR)", - hdev->name, &bda->bdaddr); + bt_dev_info(hdev, "BCM: Using default device address (%pMR)", + &bda->bdaddr); set_bit(HCI_QUIRK_INVALID_BDADDR, &hdev->quirks); } @@ -93,8 +92,7 @@ int btbcm_set_bdaddr(struct hci_dev *hdev, const bdaddr_t *bdaddr) skb = __hci_cmd_sync(hdev, 0xfc01, 6, bdaddr, HCI_INIT_TIMEOUT); if (IS_ERR(skb)) { err = PTR_ERR(skb); - BT_ERR("%s: BCM: Change address command failed (%d)", - hdev->name, err); + bt_dev_err(hdev, "BCM: Change address command failed (%d)", err); return err; } kfree_skb(skb); @@ -116,8 +114,8 @@ int btbcm_patchram(struct hci_dev *hdev, const struct firmware *fw) skb = __hci_cmd_sync(hdev, 0xfc2e, 0, NULL, HCI_INIT_TIMEOUT); if (IS_ERR(skb)) { err = PTR_ERR(skb); - BT_ERR("%s: BCM: Download Minidrv command failed (%d)", - hdev->name, err); + bt_dev_err(hdev, "BCM: Download Minidrv command failed (%d)", + err); goto done; } kfree_skb(skb); @@ -136,7 +134,7 @@ int btbcm_patchram(struct hci_dev *hdev, const struct firmware *fw) fw_size -= sizeof(*cmd); if (fw_size < cmd->plen) { - BT_ERR("%s: BCM: Patch is corrupted", hdev->name); + bt_dev_err(hdev, "BCM: Patch is corrupted"); err = -EINVAL; goto done; } @@ -151,8 +149,8 @@ int btbcm_patchram(struct hci_dev *hdev, const struct firmware *fw) HCI_INIT_TIMEOUT); if (IS_ERR(skb)) { err = PTR_ERR(skb); - BT_ERR("%s: BCM: Patch command %04x failed (%d)", - hdev->name, opcode, err); + bt_dev_err(hdev, "BCM: Patch command %04x failed (%d)", + opcode, err); goto done; } kfree_skb(skb); @@ -173,7 +171,7 @@ static int btbcm_reset(struct hci_dev *hdev) skb = __hci_cmd_sync(hdev, HCI_OP_RESET, 0, NULL, HCI_INIT_TIMEOUT); if (IS_ERR(skb)) { int err = PTR_ERR(skb); - BT_ERR("%s: BCM: Reset failed (%d)", hdev->name, err); + bt_dev_err(hdev, "BCM: Reset failed (%d)", err); return err; } kfree_skb(skb); @@ -191,13 +189,13 @@ static struct sk_buff *btbcm_read_local_name(struct hci_dev *hdev) skb = __hci_cmd_sync(hdev, HCI_OP_READ_LOCAL_NAME, 0, NULL, HCI_INIT_TIMEOUT); if (IS_ERR(skb)) { - BT_ERR("%s: BCM: Reading local name failed (%ld)", - hdev->name, PTR_ERR(skb)); + bt_dev_err(hdev, "BCM: Reading local name failed (%ld)", + PTR_ERR(skb)); return skb; } if (skb->len != sizeof(struct hci_rp_read_local_name)) { - BT_ERR("%s: BCM: Local name length mismatch", hdev->name); + bt_dev_err(hdev, "BCM: Local name length mismatch"); kfree_skb(skb); return ERR_PTR(-EIO); } @@ -212,13 +210,13 @@ static struct sk_buff *btbcm_read_local_version(struct hci_dev *hdev) skb = __hci_cmd_sync(hdev, HCI_OP_READ_LOCAL_VERSION, 0, NULL, HCI_INIT_TIMEOUT); if (IS_ERR(skb)) { - BT_ERR("%s: BCM: Reading local version info failed 
(%ld)", - hdev->name, PTR_ERR(skb)); + bt_dev_err(hdev, "BCM: Reading local version info failed (%ld)", + PTR_ERR(skb)); return skb; } if (skb->len != sizeof(struct hci_rp_read_local_version)) { - BT_ERR("%s: BCM: Local version length mismatch", hdev->name); + bt_dev_err(hdev, "BCM: Local version length mismatch"); kfree_skb(skb); return ERR_PTR(-EIO); } @@ -232,13 +230,13 @@ static struct sk_buff *btbcm_read_verbose_config(struct hci_dev *hdev) skb = __hci_cmd_sync(hdev, 0xfc79, 0, NULL, HCI_INIT_TIMEOUT); if (IS_ERR(skb)) { - BT_ERR("%s: BCM: Read verbose config info failed (%ld)", - hdev->name, PTR_ERR(skb)); + bt_dev_err(hdev, "BCM: Read verbose config info failed (%ld)", + PTR_ERR(skb)); return skb; } if (skb->len != 7) { - BT_ERR("%s: BCM: Verbose config length mismatch", hdev->name); + bt_dev_err(hdev, "BCM: Verbose config length mismatch"); kfree_skb(skb); return ERR_PTR(-EIO); } @@ -252,14 +250,13 @@ static struct sk_buff *btbcm_read_controller_features(struct hci_dev *hdev) skb = __hci_cmd_sync(hdev, 0xfc6e, 0, NULL, HCI_INIT_TIMEOUT); if (IS_ERR(skb)) { - BT_ERR("%s: BCM: Read controller features failed (%ld)", - hdev->name, PTR_ERR(skb)); + bt_dev_err(hdev, "BCM: Read controller features failed (%ld)", + PTR_ERR(skb)); return skb; } if (skb->len != 9) { - BT_ERR("%s: BCM: Controller features length mismatch", - hdev->name); + bt_dev_err(hdev, "BCM: Controller features length mismatch"); kfree_skb(skb); return ERR_PTR(-EIO); } @@ -273,13 +270,13 @@ static struct sk_buff *btbcm_read_usb_product(struct hci_dev *hdev) skb = __hci_cmd_sync(hdev, 0xfc5a, 0, NULL, HCI_INIT_TIMEOUT); if (IS_ERR(skb)) { - BT_ERR("%s: BCM: Read USB product info failed (%ld)", - hdev->name, PTR_ERR(skb)); + bt_dev_err(hdev, "BCM: Read USB product info failed (%ld)", + PTR_ERR(skb)); return skb; } if (skb->len != 5) { - BT_ERR("%s: BCM: USB product length mismatch", hdev->name); + bt_dev_err(hdev, "BCM: USB product length mismatch"); kfree_skb(skb); return ERR_PTR(-EIO); } @@ -296,7 +293,7 @@ static int btbcm_read_info(struct hci_dev *hdev) if (IS_ERR(skb)) return PTR_ERR(skb); - BT_INFO("%s: BCM: chip id %u", hdev->name, skb->data[1]); + bt_dev_info(hdev, "BCM: chip id %u", skb->data[1]); kfree_skb(skb); /* Read Controller Features */ @@ -304,7 +301,7 @@ static int btbcm_read_info(struct hci_dev *hdev) if (IS_ERR(skb)) return PTR_ERR(skb); - BT_INFO("%s: BCM: features 0x%2.2x", hdev->name, skb->data[1]); + bt_dev_info(hdev, "BCM: features 0x%2.2x", skb->data[1]); kfree_skb(skb); /* Read Local Name */ @@ -312,7 +309,7 @@ static int btbcm_read_info(struct hci_dev *hdev) if (IS_ERR(skb)) return PTR_ERR(skb); - BT_INFO("%s: %s", hdev->name, (char *)(skb->data + 1)); + bt_dev_info(hdev, "%s", (char *)(skb->data + 1)); kfree_skb(skb); return 0; @@ -378,9 +375,9 @@ int btbcm_initialize(struct hci_dev *hdev, char *fw_name, size_t len) return 0; } - BT_INFO("%s: %s (%3.3u.%3.3u.%3.3u) build %4.4u", hdev->name, - hw_name ? : "BCM", (subver & 0xe000) >> 13, - (subver & 0x1f00) >> 8, (subver & 0x00ff), rev & 0x0fff); + bt_dev_info(hdev, "%s (%3.3u.%3.3u.%3.3u) build %4.4u", + hw_name ? 
: "BCM", (subver & 0xe000) >> 13, + (subver & 0x1f00) >> 8, (subver & 0x00ff), rev & 0x0fff); return 0; } @@ -408,9 +405,9 @@ int btbcm_finalize(struct hci_dev *hdev) subver = le16_to_cpu(ver->lmp_subver); kfree_skb(skb); - BT_INFO("%s: BCM (%3.3u.%3.3u.%3.3u) build %4.4u", hdev->name, - (subver & 0xe000) >> 13, (subver & 0x1f00) >> 8, - (subver & 0x00ff), rev & 0x0fff); + bt_dev_info(hdev, "BCM (%3.3u.%3.3u.%3.3u) build %4.4u", + (subver & 0xe000) >> 13, (subver & 0x1f00) >> 8, + (subver & 0x00ff), rev & 0x0fff); btbcm_check_bdaddr(hdev); @@ -505,13 +502,13 @@ int btbcm_setup_patchram(struct hci_dev *hdev) return 0; } - BT_INFO("%s: %s (%3.3u.%3.3u.%3.3u) build %4.4u", hdev->name, - hw_name ? : "BCM", (subver & 0xe000) >> 13, - (subver & 0x1f00) >> 8, (subver & 0x00ff), rev & 0x0fff); + bt_dev_info(hdev, "%s (%3.3u.%3.3u.%3.3u) build %4.4u", + hw_name ? : "BCM", (subver & 0xe000) >> 13, + (subver & 0x1f00) >> 8, (subver & 0x00ff), rev & 0x0fff); err = request_firmware(&fw, fw_name, &hdev->dev); if (err < 0) { - BT_INFO("%s: BCM: Patch %s not found", hdev->name, fw_name); + bt_dev_info(hdev, "BCM: Patch %s not found", fw_name); goto done; } @@ -534,16 +531,16 @@ int btbcm_setup_patchram(struct hci_dev *hdev) subver = le16_to_cpu(ver->lmp_subver); kfree_skb(skb); - BT_INFO("%s: %s (%3.3u.%3.3u.%3.3u) build %4.4u", hdev->name, - hw_name ? : "BCM", (subver & 0xe000) >> 13, - (subver & 0x1f00) >> 8, (subver & 0x00ff), rev & 0x0fff); + bt_dev_info(hdev, "%s (%3.3u.%3.3u.%3.3u) build %4.4u", + hw_name ? : "BCM", (subver & 0xe000) >> 13, + (subver & 0x1f00) >> 8, (subver & 0x00ff), rev & 0x0fff); /* Read Local Name */ skb = btbcm_read_local_name(hdev); if (IS_ERR(skb)) return PTR_ERR(skb); - BT_INFO("%s: %s", hdev->name, (char *)(skb->data + 1)); + bt_dev_info(hdev, "%s", (char *)(skb->data + 1)); kfree_skb(skb); done: @@ -568,31 +565,31 @@ int btbcm_setup_apple(struct hci_dev *hdev) /* Read Verbose Config Version Info */ skb = btbcm_read_verbose_config(hdev); if (!IS_ERR(skb)) { - BT_INFO("%s: BCM: chip id %u build %4.4u", hdev->name, - skb->data[1], get_unaligned_le16(skb->data + 5)); + bt_dev_info(hdev, "BCM: chip id %u build %4.4u", + skb->data[1], get_unaligned_le16(skb->data + 5)); kfree_skb(skb); } /* Read USB Product Info */ skb = btbcm_read_usb_product(hdev); if (!IS_ERR(skb)) { - BT_INFO("%s: BCM: product %4.4x:%4.4x", hdev->name, - get_unaligned_le16(skb->data + 1), - get_unaligned_le16(skb->data + 3)); + bt_dev_info(hdev, "BCM: product %4.4x:%4.4x", + get_unaligned_le16(skb->data + 1), + get_unaligned_le16(skb->data + 3)); kfree_skb(skb); } /* Read Controller Features */ skb = btbcm_read_controller_features(hdev); if (!IS_ERR(skb)) { - BT_INFO("%s: BCM: features 0x%2.2x", hdev->name, skb->data[1]); + bt_dev_info(hdev, "BCM: features 0x%2.2x", skb->data[1]); kfree_skb(skb); } /* Read Local Name */ skb = btbcm_read_local_name(hdev); if (!IS_ERR(skb)) { - BT_INFO("%s: %s", hdev->name, (char *)(skb->data + 1)); + bt_dev_info(hdev, "%s", (char *)(skb->data + 1)); kfree_skb(skb); } diff --git a/drivers/bluetooth/btintel.c b/drivers/bluetooth/btintel.c index d32e109bd5cb..4459555c9d88 100644 --- a/drivers/bluetooth/btintel.c +++ b/drivers/bluetooth/btintel.c @@ -43,13 +43,13 @@ int btintel_check_bdaddr(struct hci_dev *hdev) HCI_INIT_TIMEOUT); if (IS_ERR(skb)) { int err = PTR_ERR(skb); - BT_ERR("%s: Reading Intel device address failed (%d)", - hdev->name, err); + bt_dev_err(hdev, "Reading Intel device address failed (%d)", + err); return err; } if (skb->len != sizeof(*bda)) { - BT_ERR("%s: 
Intel device address length mismatch", hdev->name); + bt_dev_err(hdev, "Intel device address length mismatch"); kfree_skb(skb); return -EIO; } @@ -62,8 +62,8 @@ int btintel_check_bdaddr(struct hci_dev *hdev) * and that in turn can cause problems with Bluetooth operation. */ if (!bacmp(&bda->bdaddr, BDADDR_INTEL)) { - BT_ERR("%s: Found Intel default device address (%pMR)", - hdev->name, &bda->bdaddr); + bt_dev_err(hdev, "Found Intel default device address (%pMR)", + &bda->bdaddr); set_bit(HCI_QUIRK_INVALID_BDADDR, &hdev->quirks); } @@ -123,8 +123,8 @@ int btintel_set_bdaddr(struct hci_dev *hdev, const bdaddr_t *bdaddr) skb = __hci_cmd_sync(hdev, 0xfc31, 6, bdaddr, HCI_INIT_TIMEOUT); if (IS_ERR(skb)) { err = PTR_ERR(skb); - BT_ERR("%s: Changing Intel device address failed (%d)", - hdev->name, err); + bt_dev_err(hdev, "Changing Intel device address failed (%d)", + err); return err; } kfree_skb(skb); @@ -154,8 +154,8 @@ int btintel_set_diag(struct hci_dev *hdev, bool enable) err = PTR_ERR(skb); if (err == -ENODATA) goto done; - BT_ERR("%s: Changing Intel diagnostic mode failed (%d)", - hdev->name, err); + bt_dev_err(hdev, "Changing Intel diagnostic mode failed (%d)", + err); return err; } kfree_skb(skb); @@ -189,30 +189,30 @@ void btintel_hw_error(struct hci_dev *hdev, u8 code) struct sk_buff *skb; u8 type = 0x00; - BT_ERR("%s: Hardware error 0x%2.2x", hdev->name, code); + bt_dev_err(hdev, "Hardware error 0x%2.2x", code); skb = __hci_cmd_sync(hdev, HCI_OP_RESET, 0, NULL, HCI_INIT_TIMEOUT); if (IS_ERR(skb)) { - BT_ERR("%s: Reset after hardware error failed (%ld)", - hdev->name, PTR_ERR(skb)); + bt_dev_err(hdev, "Reset after hardware error failed (%ld)", + PTR_ERR(skb)); return; } kfree_skb(skb); skb = __hci_cmd_sync(hdev, 0xfc22, 1, &type, HCI_INIT_TIMEOUT); if (IS_ERR(skb)) { - BT_ERR("%s: Retrieving Intel exception info failed (%ld)", - hdev->name, PTR_ERR(skb)); + bt_dev_err(hdev, "Retrieving Intel exception info failed (%ld)", + PTR_ERR(skb)); return; } if (skb->len != 13) { - BT_ERR("%s: Exception info size mismatch", hdev->name); + bt_dev_err(hdev, "Exception info size mismatch"); kfree_skb(skb); return; } - BT_ERR("%s: Exception info %s", hdev->name, (char *)(skb->data + 1)); + bt_dev_err(hdev, "Exception info %s", (char *)(skb->data + 1)); kfree_skb(skb); } @@ -233,9 +233,10 @@ void btintel_version_info(struct hci_dev *hdev, struct intel_version *ver) return; } - BT_INFO("%s: %s revision %u.%u build %u week %u %u", hdev->name, - variant, ver->fw_revision >> 4, ver->fw_revision & 0x0f, - ver->fw_build_num, ver->fw_build_ww, 2000 + ver->fw_build_yy); + bt_dev_info(hdev, "%s revision %u.%u build %u week %u %u", + variant, ver->fw_revision >> 4, ver->fw_revision & 0x0f, + ver->fw_build_num, ver->fw_build_ww, + 2000 + ver->fw_build_yy); } EXPORT_SYMBOL_GPL(btintel_version_info); @@ -321,8 +322,7 @@ int btintel_set_event_mask(struct hci_dev *hdev, bool debug) skb = __hci_cmd_sync(hdev, 0xfc52, 8, mask, HCI_INIT_TIMEOUT); if (IS_ERR(skb)) { err = PTR_ERR(skb); - BT_ERR("%s: Setting Intel event mask failed (%d)", - hdev->name, err); + bt_dev_err(hdev, "Setting Intel event mask failed (%d)", err); return err; } kfree_skb(skb); diff --git a/drivers/bluetooth/btqca.c b/drivers/bluetooth/btqca.c index 0bbdfcef2aa8..2793d4180d2f 100644 --- a/drivers/bluetooth/btqca.c +++ b/drivers/bluetooth/btqca.c @@ -287,7 +287,7 @@ static int rome_download_firmware(struct hci_dev *hdev, const struct firmware *fw; int ret; - BT_INFO("%s: ROME Downloading %s", hdev->name, config->fwname); + bt_dev_info(hdev, "ROME 
Downloading %s", config->fwname); ret = request_firmware(&fw, config->fwname, &hdev->dev); if (ret) { @@ -351,7 +351,7 @@ int qca_uart_setup_rome(struct hci_dev *hdev, uint8_t baudrate) return err; } - BT_INFO("%s: ROME controller version 0x%08x", hdev->name, rome_ver); + bt_dev_info(hdev, "ROME controller version 0x%08x", rome_ver); /* Download rampatch file */ config.type = TLV_TYPE_PATCH; @@ -380,7 +380,7 @@ int qca_uart_setup_rome(struct hci_dev *hdev, uint8_t baudrate) return err; } - BT_INFO("%s: ROME setup on UART is completed", hdev->name); + bt_dev_info(hdev, "ROME setup on UART is completed"); return 0; } diff --git a/drivers/bluetooth/btrtl.c b/drivers/bluetooth/btrtl.c index d9a99b4302ea..6e2ad748abba 100644 --- a/drivers/bluetooth/btrtl.c +++ b/drivers/bluetooth/btrtl.c @@ -55,8 +55,8 @@ static int rtl_read_rom_version(struct hci_dev *hdev, u8 *version) } rom_version = (struct rtl_rom_version_evt *)skb->data; - BT_INFO("%s: rom_version status=%x version=%x", - hdev->name, rom_version->status, rom_version->version); + bt_dev_info(hdev, "rom_version status=%x version=%x", + rom_version->status, rom_version->version); *version = rom_version->version; @@ -273,7 +273,7 @@ static int rtl_load_config(struct hci_dev *hdev, const char *name, u8 **buff) const struct firmware *fw; int ret; - BT_INFO("%s: rtl: loading %s", hdev->name, name); + bt_dev_info(hdev, "rtl: loading %s", name); ret = request_firmware(&fw, name, &hdev->dev); if (ret < 0) return ret; @@ -292,7 +292,7 @@ static int btrtl_setup_rtl8723a(struct hci_dev *hdev) const struct firmware *fw; int ret; - BT_INFO("%s: rtl: loading rtl_bt/rtl8723a_fw.bin", hdev->name); + bt_dev_info(hdev, "rtl: loading rtl_bt/rtl8723a_fw.bin"); ret = request_firmware(&fw, "rtl_bt/rtl8723a_fw.bin", &hdev->dev); if (ret < 0) { BT_ERR("%s: Failed to load rtl_bt/rtl8723a_fw.bin", hdev->name); @@ -363,7 +363,7 @@ static int btrtl_setup_rtl8723b(struct hci_dev *hdev, u16 lmp_subver, } else cfg_sz = 0; - BT_INFO("%s: rtl: loading %s", hdev->name, fw_name); + bt_dev_info(hdev, "rtl: loading %s", fw_name); ret = request_firmware(&fw, fw_name, &hdev->dev); if (ret < 0) { BT_ERR("%s: Failed to load %s", hdev->name, fw_name); @@ -390,7 +390,7 @@ static int btrtl_setup_rtl8723b(struct hci_dev *hdev, u16 lmp_subver, fw_data = tbuff; } - BT_INFO("cfg_sz %d, total size %d", cfg_sz, ret); + bt_dev_info(hdev, "cfg_sz %d, total size %d", cfg_sz, ret); ret = rtl_download_firmware(hdev, fw_data, ret); @@ -436,9 +436,10 @@ int btrtl_setup_realtek(struct hci_dev *hdev) return -PTR_ERR(skb); resp = (struct hci_rp_read_local_version *)skb->data; - BT_INFO("%s: rtl: examining hci_ver=%02x hci_rev=%04x lmp_ver=%02x " - "lmp_subver=%04x", hdev->name, resp->hci_ver, resp->hci_rev, - resp->lmp_ver, resp->lmp_subver); + bt_dev_info(hdev, "rtl: examining hci_ver=%02x hci_rev=%04x " + "lmp_ver=%02x lmp_subver=%04x", + resp->hci_ver, resp->hci_rev, + resp->lmp_ver, resp->lmp_subver); lmp_subver = le16_to_cpu(resp->lmp_subver); kfree_skb(skb); @@ -466,7 +467,7 @@ int btrtl_setup_realtek(struct hci_dev *hdev) return btrtl_setup_rtl8723b(hdev, lmp_subver, "rtl_bt/rtl8822b_fw.bin"); default: - BT_INFO("rtl: assuming no firmware upload needed."); + bt_dev_info(hdev, "rtl: assuming no firmware upload needed"); return 0; } } diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c index b8eb39436ef2..8701140e11f9 100644 --- a/drivers/bluetooth/btusb.c +++ b/drivers/bluetooth/btusb.c @@ -643,7 +643,7 @@ static void btusb_intr_complete(struct urb *urb) if 
(btusb_recv_intr(data, urb->transfer_buffer, urb->actual_length) < 0) { - BT_ERR("%s corrupted event packet", hdev->name); + bt_dev_err(hdev, "corrupted event packet"); hdev->stat.err_rx++; } } else if (urb->status == -ENOENT) { @@ -663,8 +663,8 @@ static void btusb_intr_complete(struct urb *urb) * -ENODEV: device got disconnected */ if (err != -EPERM && err != -ENODEV) - BT_ERR("%s urb %p failed to resubmit (%d)", - hdev->name, urb, -err); + bt_dev_err(hdev, "urb %p failed to resubmit (%d)", + urb, -err); usb_unanchor_urb(urb); } } @@ -706,8 +706,8 @@ static int btusb_submit_intr_urb(struct hci_dev *hdev, gfp_t mem_flags) err = usb_submit_urb(urb, mem_flags); if (err < 0) { if (err != -EPERM && err != -ENODEV) - BT_ERR("%s urb %p submission failed (%d)", - hdev->name, urb, -err); + bt_dev_err(hdev, "urb %p submission failed (%d)", + urb, -err); usb_unanchor_urb(urb); } @@ -733,7 +733,7 @@ static void btusb_bulk_complete(struct urb *urb) if (data->recv_bulk(data, urb->transfer_buffer, urb->actual_length) < 0) { - BT_ERR("%s corrupted ACL packet", hdev->name); + bt_dev_err(hdev, "corrupted ACL packet"); hdev->stat.err_rx++; } } else if (urb->status == -ENOENT) { @@ -753,8 +753,8 @@ static void btusb_bulk_complete(struct urb *urb) * -ENODEV: device got disconnected */ if (err != -EPERM && err != -ENODEV) - BT_ERR("%s urb %p failed to resubmit (%d)", - hdev->name, urb, -err); + bt_dev_err(hdev, "urb %p failed to resubmit (%d)", + urb, -err); usb_unanchor_urb(urb); } } @@ -795,8 +795,8 @@ static int btusb_submit_bulk_urb(struct hci_dev *hdev, gfp_t mem_flags) err = usb_submit_urb(urb, mem_flags); if (err < 0) { if (err != -EPERM && err != -ENODEV) - BT_ERR("%s urb %p submission failed (%d)", - hdev->name, urb, -err); + bt_dev_err(hdev, "urb %p submission failed (%d)", + urb, -err); usb_unanchor_urb(urb); } @@ -829,7 +829,7 @@ static void btusb_isoc_complete(struct urb *urb) if (btusb_recv_isoc(data, urb->transfer_buffer + offset, length) < 0) { - BT_ERR("%s corrupted SCO packet", hdev->name); + bt_dev_err(hdev, "corrupted SCO packet"); hdev->stat.err_rx++; } } @@ -849,8 +849,8 @@ static void btusb_isoc_complete(struct urb *urb) * -ENODEV: device got disconnected */ if (err != -EPERM && err != -ENODEV) - BT_ERR("%s urb %p failed to resubmit (%d)", - hdev->name, urb, -err); + bt_dev_err(hdev, "urb %p failed to resubmit (%d)", + urb, -err); usb_unanchor_urb(urb); } } @@ -917,8 +917,8 @@ static int btusb_submit_isoc_urb(struct hci_dev *hdev, gfp_t mem_flags) err = usb_submit_urb(urb, mem_flags); if (err < 0) { if (err != -EPERM && err != -ENODEV) - BT_ERR("%s urb %p submission failed (%d)", - hdev->name, urb, -err); + bt_dev_err(hdev, "urb %p submission failed (%d)", + urb, -err); usb_unanchor_urb(urb); } @@ -962,8 +962,8 @@ static void btusb_diag_complete(struct urb *urb) * -ENODEV: device got disconnected */ if (err != -EPERM && err != -ENODEV) - BT_ERR("%s urb %p failed to resubmit (%d)", - hdev->name, urb, -err); + bt_dev_err(hdev, "urb %p failed to resubmit (%d)", + urb, -err); usb_unanchor_urb(urb); } } @@ -1004,8 +1004,8 @@ static int btusb_submit_diag_urb(struct hci_dev *hdev, gfp_t mem_flags) err = usb_submit_urb(urb, mem_flags); if (err < 0) { if (err != -EPERM && err != -ENODEV) - BT_ERR("%s urb %p submission failed (%d)", - hdev->name, urb, -err); + bt_dev_err(hdev, "urb %p submission failed (%d)", + urb, -err); usb_unanchor_urb(urb); } @@ -1266,8 +1266,8 @@ static int submit_tx_urb(struct hci_dev *hdev, struct urb *urb) err = usb_submit_urb(urb, GFP_KERNEL); if (err < 0) { if (err != 
-EPERM && err != -ENODEV) - BT_ERR("%s urb %p submission failed (%d)", - hdev->name, urb, -err); + bt_dev_err(hdev, "urb %p submission failed (%d)", + urb, -err); kfree(urb->setup_packet); usb_unanchor_urb(urb); } else { @@ -1362,7 +1362,7 @@ static inline int __set_isoc_interface(struct hci_dev *hdev, int altsetting) err = usb_set_interface(data->udev, 1, altsetting); if (err < 0) { - BT_ERR("%s setting interface failed (%d)", hdev->name, -err); + bt_dev_err(hdev, "setting interface failed (%d)", -err); return err; } @@ -1386,7 +1386,7 @@ static inline int __set_isoc_interface(struct hci_dev *hdev, int altsetting) } if (!data->isoc_tx_ep || !data->isoc_rx_ep) { - BT_ERR("%s invalid SCO descriptors", hdev->name); + bt_dev_err(hdev, "invalid SCO descriptors"); return -ENODEV; } @@ -1481,7 +1481,7 @@ static int btusb_setup_bcm92035(struct hci_dev *hdev) skb = __hci_cmd_sync(hdev, 0xfc3b, 1, &val, HCI_INIT_TIMEOUT); if (IS_ERR(skb)) - BT_ERR("BCM92035 command failed (%ld)", -PTR_ERR(skb)); + bt_dev_err(hdev, "BCM92035 command failed (%ld)", PTR_ERR(skb)); else kfree_skb(skb); @@ -1499,12 +1499,12 @@ static int btusb_setup_csr(struct hci_dev *hdev) HCI_INIT_TIMEOUT); if (IS_ERR(skb)) { int err = PTR_ERR(skb); - BT_ERR("%s: CSR: Local version failed (%d)", hdev->name, err); + bt_dev_err(hdev, "CSR: Local version failed (%d)", err); return err; } if (skb->len != sizeof(struct hci_rp_read_local_version)) { - BT_ERR("%s: CSR: Local version length mismatch", hdev->name); + bt_dev_err(hdev, "CSR: Local version length mismatch"); kfree_skb(skb); return -EIO; } @@ -1566,7 +1566,7 @@ static const struct firmware *btusb_setup_intel_get_fw(struct hci_dev *hdev, } } - BT_INFO("%s: Intel Bluetooth firmware file: %s", hdev->name, fwname); + bt_dev_info(hdev, "Intel Bluetooth firmware file: %s", fwname); return fw; } @@ -1722,18 +1722,18 @@ static int btusb_setup_intel(struct hci_dev *hdev) if (err) return err; - BT_INFO("%s: read Intel version: %02x%02x%02x%02x%02x%02x%02x%02x%02x", - hdev->name, ver.hw_platform, ver.hw_variant, ver.hw_revision, - ver.fw_variant, ver.fw_revision, ver.fw_build_num, - ver.fw_build_ww, ver.fw_build_yy, ver.fw_patch_num); + bt_dev_info(hdev, "read Intel version: %02x%02x%02x%02x%02x%02x%02x%02x%02x", + ver.hw_platform, ver.hw_variant, ver.hw_revision, + ver.fw_variant, ver.fw_revision, ver.fw_build_num, + ver.fw_build_ww, ver.fw_build_yy, ver.fw_patch_num); /* fw_patch_num indicates the version of patch the device currently * have. If there is no patch data in the device, it is always 0x00. * So, if it is other than 0x00, no need to patch the device again. */ if (ver.fw_patch_num) { - BT_INFO("%s: Intel device is already patched. patch num: %02x", - hdev->name, ver.fw_patch_num); + bt_dev_info(hdev, "Intel device is already patched.
" + "patch num: %02x", ver.fw_patch_num); goto complete; } @@ -1801,8 +1801,7 @@ static int btusb_setup_intel(struct hci_dev *hdev) if (err) return err; - BT_INFO("%s: Intel Bluetooth firmware patch completed and activated", - hdev->name); + bt_dev_info(hdev, "Intel firmware patch completed and activated"); goto complete; @@ -1812,7 +1811,7 @@ exit_mfg_disable: if (err) return err; - BT_INFO("%s: Intel Bluetooth firmware patch completed", hdev->name); + bt_dev_info(hdev, "Intel firmware patch completed"); goto complete; @@ -1826,8 +1825,7 @@ exit_mfg_deactivate: if (err) return err; - BT_INFO("%s: Intel Bluetooth firmware patch completed and deactivated", - hdev->name); + bt_dev_info(hdev, "Intel firmware patch completed and deactivated"); complete: /* Set the event mask for Intel specific vendor events. This enables @@ -2108,24 +2106,24 @@ static int btusb_setup_intel_new(struct hci_dev *hdev) params = (struct intel_boot_params *)skb->data; - BT_INFO("%s: Device revision is %u", hdev->name, - le16_to_cpu(params->dev_revid)); + bt_dev_info(hdev, "Device revision is %u", + le16_to_cpu(params->dev_revid)); - BT_INFO("%s: Secure boot is %s", hdev->name, - params->secure_boot ? "enabled" : "disabled"); + bt_dev_info(hdev, "Secure boot is %s", + params->secure_boot ? "enabled" : "disabled"); - BT_INFO("%s: OTP lock is %s", hdev->name, - params->otp_lock ? "enabled" : "disabled"); + bt_dev_info(hdev, "OTP lock is %s", + params->otp_lock ? "enabled" : "disabled"); - BT_INFO("%s: API lock is %s", hdev->name, - params->api_lock ? "enabled" : "disabled"); + bt_dev_info(hdev, "API lock is %s", + params->api_lock ? "enabled" : "disabled"); - BT_INFO("%s: Debug lock is %s", hdev->name, - params->debug_lock ? "enabled" : "disabled"); + bt_dev_info(hdev, "Debug lock is %s", + params->debug_lock ? "enabled" : "disabled"); - BT_INFO("%s: Minimum firmware build %u week %u %u", hdev->name, - params->min_fw_build_nn, params->min_fw_build_cw, - 2000 + params->min_fw_build_yy); + bt_dev_info(hdev, "Minimum firmware build %u week %u %u", + params->min_fw_build_nn, params->min_fw_build_cw, + 2000 + params->min_fw_build_yy); /* It is required that every single firmware fragment is acknowledged * with a command complete event. If the boot parameters indicate @@ -2142,7 +2140,7 @@ static int btusb_setup_intel_new(struct hci_dev *hdev) * also be no valid address for the operational firmware. */ if (!bacmp(&params->otp_bdaddr, BDADDR_ANY)) { - BT_INFO("%s: No device address configured", hdev->name); + bt_dev_info(hdev, "No device address configured"); set_bit(HCI_QUIRK_INVALID_BDADDR, &hdev->quirks); } @@ -2169,7 +2167,7 @@ static int btusb_setup_intel_new(struct hci_dev *hdev) return err; } - BT_INFO("%s: Found device firmware: %s", hdev->name, fwname); + bt_dev_info(hdev, "Found device firmware: %s", fwname); /* Save the DDC file name for later use to apply once the firmware * downloading is done.
@@ -2250,7 +2248,7 @@ static int btusb_setup_intel_new(struct hci_dev *hdev) set_bit(BTUSB_FIRMWARE_LOADED, &data->flags); - BT_INFO("%s: Waiting for firmware download to complete", hdev->name); + bt_dev_info(hdev, "Waiting for firmware download to complete"); /* Before switching the device into operational mode and with that * booting the loaded firmware, wait for the bootloader notification @@ -2287,7 +2285,7 @@ static int btusb_setup_intel_new(struct hci_dev *hdev) delta = ktime_sub(rettime, calltime); duration = (unsigned long long) ktime_to_ns(delta) >> 10; - BT_INFO("%s: Firmware loaded in %llu usecs", hdev->name, duration); + bt_dev_info(hdev, "Firmware loaded in %llu usecs", duration); done: release_firmware(fw); @@ -2313,7 +2311,7 @@ done: * 1 second. However if that happens, then just fail the setup * since something went wrong. */ - BT_INFO("%s: Waiting for device to boot", hdev->name); + bt_dev_info(hdev, "Waiting for device to boot"); err = wait_on_bit_timeout(&data->flags, BTUSB_BOOTING, TASK_INTERRUPTIBLE, @@ -2333,7 +2331,7 @@ done: delta = ktime_sub(rettime, calltime); duration = (unsigned long long) ktime_to_ns(delta) >> 10; - BT_INFO("%s: Device booted in %llu usecs", hdev->name, duration); + bt_dev_info(hdev, "Device booted in %llu usecs", duration); clear_bit(BTUSB_BOOTLOADER, &data->flags); @@ -2436,8 +2434,8 @@ static int btusb_set_bdaddr_marvell(struct hci_dev *hdev, skb = __hci_cmd_sync(hdev, 0xfc22, sizeof(buf), buf, HCI_INIT_TIMEOUT); if (IS_ERR(skb)) { ret = PTR_ERR(skb); - BT_ERR("%s: changing Marvell device address failed (%ld)", - hdev->name, ret); + bt_dev_err(hdev, "changing Marvell device address failed (%ld)", + ret); return ret; } kfree_skb(skb); @@ -2461,8 +2459,7 @@ static int btusb_set_bdaddr_ath3012(struct hci_dev *hdev, skb = __hci_cmd_sync(hdev, 0xfc0b, sizeof(buf), buf, HCI_INIT_TIMEOUT); if (IS_ERR(skb)) { ret = PTR_ERR(skb); - BT_ERR("%s: Change address command failed (%ld)", - hdev->name, ret); + bt_dev_err(hdev, "Change address command failed (%ld)", ret); return ret; } kfree_skb(skb); @@ -2528,7 +2525,7 @@ static int btusb_qca_send_vendor_req(struct hci_dev *hdev, u8 request, err = usb_control_msg(udev, pipe, request, USB_TYPE_VENDOR | USB_DIR_IN, 0, 0, buf, size, USB_CTRL_SET_TIMEOUT); if (err < 0) { - BT_ERR("%s: Failed to access otp area (%d)", hdev->name, err); + bt_dev_err(hdev, "Failed to access otp area (%d)", err); goto done; } @@ -2568,7 +2565,7 @@ static int btusb_setup_qca_download_fw(struct hci_dev *hdev, err = usb_control_msg(udev, pipe, QCA_DFU_DOWNLOAD, USB_TYPE_VENDOR, 0, 0, buf, size, USB_CTRL_SET_TIMEOUT); if (err < 0) { - BT_ERR("%s: Failed to send headers (%d)", hdev->name, err); + bt_dev_err(hdev, "Failed to send headers (%d)", err); goto done; } @@ -2584,13 +2581,13 @@ static int btusb_setup_qca_download_fw(struct hci_dev *hdev, err = usb_bulk_msg(udev, pipe, buf, size, &len, QCA_DFU_TIMEOUT); if (err < 0) { - BT_ERR("%s: Failed to send body at %zd of %zd (%d)", - hdev->name, sent, firmware->size, err); + bt_dev_err(hdev, "Failed to send body at %zd of %zd (%d)", + sent, firmware->size, err); break; } if (size != len) { - BT_ERR("%s: Failed to get bulk buffer", hdev->name); + bt_dev_err(hdev, "Failed to get bulk buffer"); err = -EILSEQ; break; } @@ -2622,24 +2619,23 @@ static int btusb_setup_qca_load_rampatch(struct hci_dev *hdev, err = request_firmware(&fw, fwname, &hdev->dev); if (err) { - BT_ERR("%s: failed to request rampatch file: %s (%d)", - hdev->name, fwname, err); + bt_dev_err(hdev, "failed to request rampatch 
file: %s (%d)", + fwname, err); return err; } - BT_INFO("%s: using rampatch file: %s", hdev->name, fwname); + bt_dev_info(hdev, "using rampatch file: %s", fwname); rver = (struct qca_rampatch_version *)(fw->data + info->ver_offset); rver_rom = le16_to_cpu(rver->rom_version); rver_patch = le16_to_cpu(rver->patch_version); - BT_INFO("%s: QCA: patch rome 0x%x build 0x%x, firmware rome 0x%x " - "build 0x%x", hdev->name, rver_rom, rver_patch, ver_rom, - ver_patch); + bt_dev_info(hdev, "QCA: patch rome 0x%x build 0x%x, " + "firmware rome 0x%x build 0x%x", + rver_rom, rver_patch, ver_rom, ver_patch); if (rver_rom != ver_rom || rver_patch <= ver_patch) { - BT_ERR("%s: rampatch file version did not match with firmware", - hdev->name); + bt_dev_err(hdev, "rampatch file version did not match with firmware"); err = -EINVAL; goto done; } @@ -2665,12 +2661,12 @@ static int btusb_setup_qca_load_nvm(struct hci_dev *hdev, err = request_firmware(&fw, fwname, &hdev->dev); if (err) { - BT_ERR("%s: failed to request NVM file: %s (%d)", - hdev->name, fwname, err); + bt_dev_err(hdev, "failed to request NVM file: %s (%d)", + fwname, err); return err; } - BT_INFO("%s: using NVM file: %s", hdev->name, fwname); + bt_dev_info(hdev, "using NVM file: %s", fwname); err = btusb_setup_qca_download_fw(hdev, fw, info->nvm_hdr); @@ -2698,8 +2694,7 @@ static int btusb_setup_qca(struct hci_dev *hdev) info = &qca_devices_table[i]; } if (!info) { - BT_ERR("%s: don't support firmware rome 0x%x", hdev->name, - ver_rom); + bt_dev_err(hdev, "don't support firmware rome 0x%x", ver_rom); return -ENODEV; } @@ -2753,7 +2748,7 @@ static inline int __set_diag_interface(struct hci_dev *hdev) } if (!data->diag_tx_ep || !data->diag_rx_ep) { - BT_ERR("%s invalid diagnostic descriptors", hdev->name); + bt_dev_err(hdev, "invalid diagnostic descriptors"); return -ENODEV; } diff --git a/drivers/bluetooth/hci_ath.c b/drivers/bluetooth/hci_ath.c index 5f17ec339fb8..14ae7ee88acb 100644 --- a/drivers/bluetooth/hci_ath.c +++ b/drivers/bluetooth/hci_ath.c @@ -206,7 +206,7 @@ static int ath_recv(struct hci_uart *hu, const void *data, int count) ath_recv_pkts, ARRAY_SIZE(ath_recv_pkts)); if (IS_ERR(ath->rx_skb)) { int err = PTR_ERR(ath->rx_skb); - BT_ERR("%s: Frame reassembly failed (%d)", hu->hdev->name, err); + bt_dev_err(hu->hdev, "Frame reassembly failed (%d)", err); ath->rx_skb = NULL; return err; } diff --git a/drivers/bluetooth/hci_h4.c b/drivers/bluetooth/hci_h4.c index 3b82a87224a9..fb97a3bf069b 100644 --- a/drivers/bluetooth/hci_h4.c +++ b/drivers/bluetooth/hci_h4.c @@ -132,7 +132,7 @@ static int h4_recv(struct hci_uart *hu, const void *data, int count) h4_recv_pkts, ARRAY_SIZE(h4_recv_pkts)); if (IS_ERR(h4->rx_skb)) { int err = PTR_ERR(h4->rx_skb); - BT_ERR("%s: Frame reassembly failed (%d)", hu->hdev->name, err); + bt_dev_err(hu->hdev, "Frame reassembly failed (%d)", err); h4->rx_skb = NULL; return err; } diff --git a/drivers/bluetooth/hci_qca.c b/drivers/bluetooth/hci_qca.c index 4a949bb60394..bbd7db7384e6 100644 --- a/drivers/bluetooth/hci_qca.c +++ b/drivers/bluetooth/hci_qca.c @@ -800,7 +800,7 @@ static int qca_recv(struct hci_uart *hu, const void *data, int count) qca_recv_pkts, ARRAY_SIZE(qca_recv_pkts)); if (IS_ERR(qca->rx_skb)) { int err = PTR_ERR(qca->rx_skb); - BT_ERR("%s: Frame reassembly failed (%d)", hu->hdev->name, err); + bt_dev_err(hu->hdev, "Frame reassembly failed (%d)", err); qca->rx_skb = NULL; return err; } @@ -863,7 +863,7 @@ static int qca_set_baudrate(struct hci_dev *hdev, uint8_t baudrate) skb = 
bt_skb_alloc(sizeof(cmd), GFP_ATOMIC); if (!skb) { - BT_ERR("Failed to allocate memory for baudrate packet"); + bt_dev_err(hdev, "Failed to allocate baudrate packet"); return -ENOMEM; } @@ -892,7 +892,7 @@ static int qca_setup(struct hci_uart *hu) unsigned int speed, qca_baudrate = QCA_BAUDRATE_115200; int ret; - BT_INFO("%s: ROME setup", hdev->name); + bt_dev_info(hdev, "ROME setup"); /* Patch downloading has to be done without IBS mode */ clear_bit(STATE_IN_BAND_SLEEP_ENABLED, &qca->flags); @@ -917,11 +917,11 @@ static int qca_setup(struct hci_uart *hu) if (speed) { qca_baudrate = qca_get_baudrate_value(speed); - BT_INFO("%s: Set UART speed to %d", hdev->name, speed); + bt_dev_info(hdev, "Set UART speed to %d", speed); ret = qca_set_baudrate(hdev, qca_baudrate); if (ret) { - BT_ERR("%s: Failed to change the baud rate (%d)", - hdev->name, ret); + bt_dev_err(hdev, "Failed to change the baud rate (%d)", + ret); return ret; } hci_uart_set_baudrate(hu, speed); diff --git a/drivers/bluetooth/hci_serdev.c b/drivers/bluetooth/hci_serdev.c index b725ac4f7ff6..71664b22ec9d 100644 --- a/drivers/bluetooth/hci_serdev.c +++ b/drivers/bluetooth/hci_serdev.c @@ -185,7 +185,7 @@ static int hci_uart_setup(struct hci_dev *hdev) if (hu->proto->set_baudrate && speed) { err = hu->proto->set_baudrate(hu, speed); if (err) - BT_ERR("%s: failed to set baudrate", hdev->name); + bt_dev_err(hdev, "Failed to set baudrate"); else serdev_device_set_baudrate(hu->serdev, speed); } @@ -199,14 +199,13 @@ static int hci_uart_setup(struct hci_dev *hdev) skb = __hci_cmd_sync(hdev, HCI_OP_READ_LOCAL_VERSION, 0, NULL, HCI_INIT_TIMEOUT); if (IS_ERR(skb)) { - BT_ERR("%s: Reading local version information failed (%ld)", - hdev->name, PTR_ERR(skb)); + bt_dev_err(hdev, "Reading local version info failed (%ld)", + PTR_ERR(skb)); return 0; } if (skb->len != sizeof(*ver)) { - BT_ERR("%s: Event length mismatch for version information", - hdev->name); + bt_dev_err(hdev, "Event length mismatch for version info"); } kfree_skb(skb); diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index 020142bb9735..e89cff0c4c23 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -147,6 +147,9 @@ void bt_err_ratelimited(const char *fmt, ...); #define bt_dev_dbg(hdev, fmt, ...) \ BT_DBG("%s: " fmt, (hdev)->name, ##__VA_ARGS__) +#define bt_dev_err_ratelimited(hdev, fmt, ...) 
\ + BT_ERR_RATELIMITED("%s: " fmt, (hdev)->name, ##__VA_ARGS__) + /* Connection and socket states */ enum { BT_CONNECTED = 1, /* Equal to TCP_ESTABLISHED to make net code happy */ diff --git a/net/bluetooth/a2mp.c b/net/bluetooth/a2mp.c index aad994edd3bb..51c2cf2d8923 100644 --- a/net/bluetooth/a2mp.c +++ b/net/bluetooth/a2mp.c @@ -573,7 +573,7 @@ static int a2mp_discphyslink_req(struct amp_mgr *mgr, struct sk_buff *skb, hcon = hci_conn_hash_lookup_ba(hdev, AMP_LINK, &mgr->l2cap_conn->hcon->dst); if (!hcon) { - BT_ERR("No phys link exist"); + bt_dev_err(hdev, "no phys link exist"); rsp.status = A2MP_STATUS_NO_PHYSICAL_LINK_EXISTS; goto clean; } diff --git a/net/bluetooth/amp.c b/net/bluetooth/amp.c index ebcab5bbadd7..78bec8df8525 100644 --- a/net/bluetooth/amp.c +++ b/net/bluetooth/amp.c @@ -187,7 +187,7 @@ int phylink_gen_key(struct hci_conn *conn, u8 *data, u8 *len, u8 *type) /* Legacy key */ if (conn->key_type < 3) { - BT_ERR("Legacy key type %d", conn->key_type); + bt_dev_err(hdev, "legacy key type %d", conn->key_type); return -EACCES; } @@ -207,7 +207,7 @@ int phylink_gen_key(struct hci_conn *conn, u8 *data, u8 *len, u8 *type) /* Derive Generic AMP Link Key (gamp) */ err = hmac_sha256(keybuf, HCI_AMP_LINK_KEY_SIZE, "gamp", 4, gamp_key); if (err) { - BT_ERR("Could not derive Generic AMP Key: err %d", err); + bt_dev_err(hdev, "could not derive Generic AMP Key: err %d", err); return err; } diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 746adcb62259..a9682534c377 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -729,8 +729,8 @@ static void create_le_conn_complete(struct hci_dev *hdev, u8 status, u16 opcode) goto done; } - BT_ERR("HCI request failed to create LE connection: status 0x%2.2x", - status); + bt_dev_err(hdev, "request failed to create LE connection: " + "status 0x%2.2x", status); if (!conn) goto done; diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 6bc679cd3481..40d260f2bea5 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -267,7 +267,7 @@ static int hci_init1_req(struct hci_request *req, unsigned long opt) amp_init1(req); break; default: - BT_ERR("Unknown device type %d", hdev->dev_type); + bt_dev_err(hdev, "Unknown device type %d", hdev->dev_type); break; } @@ -2150,8 +2150,7 @@ static void hci_error_reset(struct work_struct *work) if (hdev->hw_error) hdev->hw_error(hdev, hdev->hw_error_code); else - BT_ERR("%s hardware error 0x%2.2x", hdev->name, - hdev->hw_error_code); + bt_dev_err(hdev, "hardware error 0x%2.2x", hdev->hw_error_code); if (hci_dev_do_close(hdev)) return; @@ -2524,9 +2523,9 @@ static void hci_cmd_timeout(struct work_struct *work) struct hci_command_hdr *sent = (void *) hdev->sent_cmd->data; u16 opcode = __le16_to_cpu(sent->opcode); - BT_ERR("%s command 0x%4.4x tx timeout", hdev->name, opcode); + bt_dev_err(hdev, "command 0x%4.4x tx timeout", opcode); } else { - BT_ERR("%s command tx timeout", hdev->name); + bt_dev_err(hdev, "command tx timeout"); } atomic_set(&hdev->cmd_cnt, 1); @@ -2858,7 +2857,7 @@ struct hci_conn_params *hci_conn_params_add(struct hci_dev *hdev, params = kzalloc(sizeof(*params), GFP_KERNEL); if (!params) { - BT_ERR("Out of memory"); + bt_dev_err(hdev, "out of memory"); return NULL; } @@ -3393,7 +3392,7 @@ static void hci_send_frame(struct hci_dev *hdev, struct sk_buff *skb) err = hdev->send(hdev, skb); if (err < 0) { - BT_ERR("%s sending frame failed (%d)", hdev->name, err); + bt_dev_err(hdev, "sending frame failed (%d)", err); kfree_skb(skb); } } 
@@ -3408,7 +3407,7 @@ int hci_send_cmd(struct hci_dev *hdev, __u16 opcode, __u32 plen, skb = hci_prepare_cmd(hdev, opcode, plen, param); if (!skb) { - BT_ERR("%s no memory for command", hdev->name); + bt_dev_err(hdev, "no memory for command"); return -ENOMEM; } @@ -3493,7 +3492,7 @@ static void hci_queue_acl(struct hci_chan *chan, struct sk_buff_head *queue, hci_add_acl_hdr(skb, chan->handle, flags); break; default: - BT_ERR("%s unknown dev_type %d", hdev->name, hdev->dev_type); + bt_dev_err(hdev, "unknown dev_type %d", hdev->dev_type); return; } @@ -3618,7 +3617,7 @@ static struct hci_conn *hci_low_sent(struct hci_dev *hdev, __u8 type, break; default: cnt = 0; - BT_ERR("Unknown link type"); + bt_dev_err(hdev, "unknown link type %d", conn->type); } q = cnt / num; @@ -3635,15 +3634,15 @@ static void hci_link_tx_to(struct hci_dev *hdev, __u8 type) struct hci_conn_hash *h = &hdev->conn_hash; struct hci_conn *c; - BT_ERR("%s link tx timeout", hdev->name); + bt_dev_err(hdev, "link tx timeout"); rcu_read_lock(); /* Kill stalled connections */ list_for_each_entry_rcu(c, &h->list, list) { if (c->type == type && c->sent) { - BT_ERR("%s killing stalled connection %pMR", - hdev->name, &c->dst); + bt_dev_err(hdev, "killing stalled connection %pMR", + &c->dst); hci_disconnect(c, HCI_ERROR_REMOTE_USER_TERM); } } @@ -3724,7 +3723,7 @@ static struct hci_chan *hci_chan_sent(struct hci_dev *hdev, __u8 type, break; default: cnt = 0; - BT_ERR("Unknown link type"); + bt_dev_err(hdev, "unknown link type %d", chan->conn->type); } q = cnt / num; @@ -4066,8 +4065,8 @@ static void hci_acldata_packet(struct hci_dev *hdev, struct sk_buff *skb) l2cap_recv_acldata(conn, skb, flags); return; } else { - BT_ERR("%s ACL packet for unknown connection handle %d", - hdev->name, handle); + bt_dev_err(hdev, "ACL packet for unknown connection handle %d", + handle); } kfree_skb(skb); @@ -4097,8 +4096,8 @@ static void hci_scodata_packet(struct hci_dev *hdev, struct sk_buff *skb) sco_recv_scodata(conn, skb); return; } else { - BT_ERR("%s SCO packet for unknown connection handle %d", - hdev->name, handle); + bt_dev_err(hdev, "SCO packet for unknown connection handle %d", + handle); } kfree_skb(skb); diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 0b4dba08a14e..cd3bbb766c24 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -1188,7 +1188,8 @@ static void hci_cc_le_set_scan_enable(struct hci_dev *hdev, break; default: - BT_ERR("Used reserved LE_Scan_Enable param %d", cp->enable); + bt_dev_err(hdev, "use of reserved LE_Scan_Enable param %d", + cp->enable); break; } @@ -1485,7 +1486,7 @@ static void hci_cs_create_conn(struct hci_dev *hdev, __u8 status) conn = hci_conn_add(hdev, ACL_LINK, &cp->bdaddr, HCI_ROLE_MASTER); if (!conn) - BT_ERR("No memory for new connection"); + bt_dev_err(hdev, "no memory for new connection"); } } @@ -2269,7 +2270,7 @@ static void hci_conn_request_evt(struct hci_dev *hdev, struct sk_buff *skb) conn = hci_conn_add(hdev, ev->link_type, &ev->bdaddr, HCI_ROLE_SLAVE); if (!conn) { - BT_ERR("No memory for new connection"); + bt_dev_err(hdev, "no memory for new connection"); hci_dev_unlock(hdev); return; } @@ -2431,7 +2432,7 @@ static void hci_auth_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) if (!hci_conn_ssp_enabled(conn) && test_bit(HCI_CONN_REAUTH_PEND, &conn->flags)) { - BT_INFO("re-auth of legacy device is not possible."); + bt_dev_info(hdev, "re-auth of legacy device is not possible."); } else { set_bit(HCI_CONN_AUTH, &conn->flags); conn->sec_level = 
conn->pending_sec_level; @@ -2535,8 +2536,7 @@ static void read_enc_key_size_complete(struct hci_dev *hdev, u8 status, BT_DBG("%s status 0x%02x", hdev->name, status); if (!skb || skb->len < sizeof(*rp)) { - BT_ERR("%s invalid HCI Read Encryption Key Size response", - hdev->name); + bt_dev_err(hdev, "invalid read key size response"); return; } @@ -2554,8 +2554,8 @@ static void read_enc_key_size_complete(struct hci_dev *hdev, u8 status, * supported. */ if (rp->status) { - BT_ERR("%s failed to read key size for handle %u", hdev->name, - handle); + bt_dev_err(hdev, "failed to read key size for handle %u", + handle); conn->enc_key_size = HCI_LINK_KEY_SIZE; } else { conn->enc_key_size = rp->key_size; @@ -2664,7 +2664,7 @@ static void hci_encrypt_change_evt(struct hci_dev *hdev, struct sk_buff *skb) hci_req_add(&req, HCI_OP_READ_ENC_KEY_SIZE, sizeof(cp), &cp); if (hci_req_run_skb(&req, read_enc_key_size_complete)) { - BT_ERR("Sending HCI Read Encryption Key Size failed"); + bt_dev_err(hdev, "sending read key size failed"); conn->enc_key_size = HCI_LINK_KEY_SIZE; goto notify; } @@ -3197,7 +3197,7 @@ static void hci_num_comp_pkts_evt(struct hci_dev *hdev, struct sk_buff *skb) int i; if (hdev->flow_ctl_mode != HCI_FLOW_CTL_MODE_PACKET_BASED) { - BT_ERR("Wrong event for mode %d", hdev->flow_ctl_mode); + bt_dev_err(hdev, "wrong event for mode %d", hdev->flow_ctl_mode); return; } @@ -3249,7 +3249,8 @@ static void hci_num_comp_pkts_evt(struct hci_dev *hdev, struct sk_buff *skb) break; default: - BT_ERR("Unknown type %d conn %p", conn->type, conn); + bt_dev_err(hdev, "unknown type %d conn %p", + conn->type, conn); break; } } @@ -3271,7 +3272,7 @@ static struct hci_conn *__hci_conn_lookup_handle(struct hci_dev *hdev, return chan->conn; break; default: - BT_ERR("%s unknown dev_type %d", hdev->name, hdev->dev_type); + bt_dev_err(hdev, "unknown dev_type %d", hdev->dev_type); break; } @@ -3284,7 +3285,7 @@ static void hci_num_comp_blocks_evt(struct hci_dev *hdev, struct sk_buff *skb) int i; if (hdev->flow_ctl_mode != HCI_FLOW_CTL_MODE_BLOCK_BASED) { - BT_ERR("Wrong event for mode %d", hdev->flow_ctl_mode); + bt_dev_err(hdev, "wrong event for mode %d", hdev->flow_ctl_mode); return; } @@ -3320,7 +3321,8 @@ static void hci_num_comp_blocks_evt(struct hci_dev *hdev, struct sk_buff *skb) break; default: - BT_ERR("Unknown type %d conn %p", conn->type, conn); + bt_dev_err(hdev, "unknown type %d conn %p", + conn->type, conn); break; } } @@ -4479,7 +4481,7 @@ static void hci_le_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) if (!conn) { conn = hci_conn_add(hdev, LE_LINK, &ev->bdaddr, ev->role); if (!conn) { - BT_ERR("No memory for new connection"); + bt_dev_err(hdev, "no memory for new connection"); goto unlock; } @@ -4749,8 +4751,8 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr, case LE_ADV_SCAN_RSP: break; default: - BT_ERR_RATELIMITED("Unknown advertising packet type: 0x%02x", - type); + bt_dev_err_ratelimited(hdev, "unknown advertising packet " + "type: 0x%02x", type); return; } @@ -4769,8 +4771,7 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr, /* Adjust for actual length */ if (len != real_len) { - BT_ERR_RATELIMITED("%s advertising data length corrected", - hdev->name); + bt_dev_err_ratelimited(hdev, "advertising data len corrected"); len = real_len; } @@ -5192,7 +5193,7 @@ static bool hci_get_cmd_complete(struct hci_dev *hdev, u16 opcode, return false; if (skb->len < sizeof(*hdr)) { - BT_ERR("Too short HCI event"); + 
bt_dev_err(hdev, "too short HCI event"); return false; } @@ -5206,12 +5207,13 @@ static bool hci_get_cmd_complete(struct hci_dev *hdev, u16 opcode, } if (hdr->evt != HCI_EV_CMD_COMPLETE) { - BT_DBG("Last event is not cmd complete (0x%2.2x)", hdr->evt); + bt_dev_err(hdev, "last event is not cmd complete (0x%2.2x)", + hdr->evt); return false; } if (skb->len < sizeof(*ev)) { - BT_ERR("Too short cmd_complete event"); + bt_dev_err(hdev, "too short cmd_complete event"); return false; } diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index 7f28d17dc792..abc0f3224dd1 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -336,8 +336,8 @@ void hci_req_add_ev(struct hci_request *req, u16 opcode, u32 plen, skb = hci_prepare_cmd(hdev, opcode, plen, param); if (!skb) { - BT_ERR("%s no memory for command (opcode 0x%4.4x)", - hdev->name, opcode); + bt_dev_err(hdev, "no memory for command (opcode 0x%4.4x)", + opcode); req->err = -ENOMEM; return; } @@ -1426,7 +1426,7 @@ int hci_update_random_address(struct hci_request *req, bool require_privacy, err = smp_generate_rpa(hdev, hdev->irk, &hdev->rpa); if (err < 0) { - BT_ERR("%s failed to generate new RPA", hdev->name); + bt_dev_err(hdev, "failed to generate new RPA"); return err; } @@ -1788,7 +1788,7 @@ int hci_abort_conn(struct hci_conn *conn, u8 reason) err = hci_req_run(&req, abort_conn_complete); if (err && err != -ENODATA) { - BT_ERR("Failed to run HCI request: err %d", err); + bt_dev_err(conn->hdev, "failed to run HCI request: err %d", err); return err; } @@ -1872,7 +1872,8 @@ static void le_scan_disable_work(struct work_struct *work) hci_req_sync(hdev, le_scan_disable, 0, HCI_CMD_TIMEOUT, &status); if (status) { - BT_ERR("Failed to disable LE scan: status 0x%02x", status); + bt_dev_err(hdev, "failed to disable LE scan: status 0x%02x", + status); return; } @@ -1903,7 +1904,7 @@ static void le_scan_disable_work(struct work_struct *work) hci_req_sync(hdev, bredr_inquiry, DISCOV_INTERLEAVED_INQUIRY_LEN, HCI_CMD_TIMEOUT, &status); if (status) { - BT_ERR("Inquiry failed: status 0x%02x", status); + bt_dev_err(hdev, "inquiry failed: status 0x%02x", status); goto discov_stopped; } @@ -1945,7 +1946,8 @@ static void le_scan_restart_work(struct work_struct *work) hci_req_sync(hdev, le_scan_restart, 0, HCI_CMD_TIMEOUT, &status); if (status) { - BT_ERR("Failed to restart LE scan: status %d", status); + bt_dev_err(hdev, "failed to restart LE scan: status %d", + status); return; } diff --git a/net/bluetooth/hci_sysfs.c b/net/bluetooth/hci_sysfs.c index aa300f3a0d51..34aaa2340ac8 100644 --- a/net/bluetooth/hci_sysfs.c +++ b/net/bluetooth/hci_sysfs.c @@ -50,7 +50,7 @@ void hci_conn_add_sysfs(struct hci_conn *conn) dev_set_name(&conn->dev, "%s:%d", hdev->name, conn->handle); if (device_add(&conn->dev) < 0) { - BT_ERR("Failed to register connection device"); + bt_dev_err(hdev, "failed to register connection device"); return; } diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 07a3cc29f426..6e9fc86d8daf 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -2159,8 +2159,8 @@ static int load_link_keys(struct sock *sk, struct hci_dev *hdev, void *data, key_count = __le16_to_cpu(cp->key_count); if (key_count > max_key_count) { - BT_ERR("load_link_keys: too big key_count value %u", - key_count); + bt_dev_err(hdev, "load_link_keys: too big key_count value %u", + key_count); return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_LINK_KEYS, MGMT_STATUS_INVALID_PARAMS); } @@ -2168,8 +2168,8 @@ static int 
load_link_keys(struct sock *sk, struct hci_dev *hdev, void *data, expected_len = sizeof(*cp) + key_count * sizeof(struct mgmt_link_key_info); if (expected_len != len) { - BT_ERR("load_link_keys: expected %u bytes, got %u bytes", - expected_len, len); + bt_dev_err(hdev, "load_link_keys: expected %u bytes, got %u bytes", + expected_len, len); return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_LINK_KEYS, MGMT_STATUS_INVALID_PARAMS); } @@ -2561,7 +2561,7 @@ static int pin_code_reply(struct sock *sk, struct hci_dev *hdev, void *data, memcpy(&ncp.addr, &cp->addr, sizeof(ncp.addr)); - BT_ERR("PIN code is not 16 bytes long"); + bt_dev_err(hdev, "PIN code is not 16 bytes long"); err = send_pin_code_neg_reply(sk, hdev, &ncp); if (err >= 0) @@ -3391,7 +3391,8 @@ static int add_remote_oob_data(struct sock *sk, struct hci_dev *hdev, MGMT_OP_ADD_REMOTE_OOB_DATA, status, &cp->addr, sizeof(cp->addr)); } else { - BT_ERR("add_remote_oob_data: invalid length of %u bytes", len); + bt_dev_err(hdev, "add_remote_oob_data: invalid len of %u bytes", + len); err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_REMOTE_OOB_DATA, MGMT_STATUS_INVALID_PARAMS); } @@ -3604,8 +3605,8 @@ static int start_service_discovery(struct sock *sk, struct hci_dev *hdev, uuid_count = __le16_to_cpu(cp->uuid_count); if (uuid_count > max_uuid_count) { - BT_ERR("service_discovery: too big uuid_count value %u", - uuid_count); + bt_dev_err(hdev, "service_discovery: too big uuid_count value %u", + uuid_count); err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_START_SERVICE_DISCOVERY, MGMT_STATUS_INVALID_PARAMS, &cp->type, @@ -3615,8 +3616,8 @@ static int start_service_discovery(struct sock *sk, struct hci_dev *hdev, expected_len = sizeof(*cp) + uuid_count * 16; if (expected_len != len) { - BT_ERR("service_discovery: expected %u bytes, got %u bytes", - expected_len, len); + bt_dev_err(hdev, "service_discovery: expected %u bytes, got %u bytes", + expected_len, len); err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_START_SERVICE_DISCOVERY, MGMT_STATUS_INVALID_PARAMS, &cp->type, @@ -3943,7 +3944,7 @@ static void set_advertising_complete(struct hci_dev *hdev, u8 status, err = hci_req_run(&req, enable_advertising_instance); if (err) - BT_ERR("Failed to re-configure advertising"); + bt_dev_err(hdev, "failed to re-configure advertising"); unlock: hci_dev_unlock(hdev); @@ -4664,15 +4665,16 @@ static int load_irks(struct sock *sk, struct hci_dev *hdev, void *cp_data, irk_count = __le16_to_cpu(cp->irk_count); if (irk_count > max_irk_count) { - BT_ERR("load_irks: too big irk_count value %u", irk_count); + bt_dev_err(hdev, "load_irks: too big irk_count value %u", + irk_count); return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_IRKS, MGMT_STATUS_INVALID_PARAMS); } expected_len = sizeof(*cp) + irk_count * sizeof(struct mgmt_irk_info); if (expected_len != len) { - BT_ERR("load_irks: expected %u bytes, got %u bytes", - expected_len, len); + bt_dev_err(hdev, "load_irks: expected %u bytes, got %u bytes", + expected_len, len); return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_IRKS, MGMT_STATUS_INVALID_PARAMS); } @@ -4745,7 +4747,8 @@ static int load_long_term_keys(struct sock *sk, struct hci_dev *hdev, key_count = __le16_to_cpu(cp->key_count); if (key_count > max_key_count) { - BT_ERR("load_ltks: too big key_count value %u", key_count); + bt_dev_err(hdev, "load_ltks: too big key_count value %u", + key_count); return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_LONG_TERM_KEYS, MGMT_STATUS_INVALID_PARAMS); } @@ -4753,8 +4756,8 @@ static int load_long_term_keys(struct sock *sk, 
struct hci_dev *hdev, expected_len = sizeof(*cp) + key_count * sizeof(struct mgmt_ltk_info); if (expected_len != len) { - BT_ERR("load_keys: expected %u bytes, got %u bytes", - expected_len, len); + bt_dev_err(hdev, "load_keys: expected %u bytes, got %u bytes", + expected_len, len); return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_LONG_TERM_KEYS, MGMT_STATUS_INVALID_PARAMS); } @@ -4873,14 +4876,15 @@ static void conn_info_refresh_complete(struct hci_dev *hdev, u8 hci_status, } if (!cp) { - BT_ERR("invalid sent_cmd in conn_info response"); + bt_dev_err(hdev, "invalid sent_cmd in conn_info response"); goto unlock; } handle = __le16_to_cpu(cp->handle); conn = hci_conn_hash_lookup_handle(hdev, handle); if (!conn) { - BT_ERR("unknown handle (%d) in conn_info response", handle); + bt_dev_err(hdev, "unknown handle (%d) in conn_info response", + handle); goto unlock; } @@ -5477,8 +5481,8 @@ static int load_conn_param(struct sock *sk, struct hci_dev *hdev, void *data, param_count = __le16_to_cpu(cp->param_count); if (param_count > max_param_count) { - BT_ERR("load_conn_param: too big param_count value %u", - param_count); + bt_dev_err(hdev, "load_conn_param: too big param_count value %u", + param_count); return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_CONN_PARAM, MGMT_STATUS_INVALID_PARAMS); } @@ -5486,8 +5490,8 @@ static int load_conn_param(struct sock *sk, struct hci_dev *hdev, void *data, expected_len = sizeof(*cp) + param_count * sizeof(struct mgmt_conn_param); if (expected_len != len) { - BT_ERR("load_conn_param: expected %u bytes, got %u bytes", - expected_len, len); + bt_dev_err(hdev, "load_conn_param: expected %u bytes, got %u bytes", + expected_len, len); return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_CONN_PARAM, MGMT_STATUS_INVALID_PARAMS); } @@ -5512,7 +5516,7 @@ static int load_conn_param(struct sock *sk, struct hci_dev *hdev, void *data, } else if (param->addr.type == BDADDR_LE_RANDOM) { addr_type = ADDR_LE_DEV_RANDOM; } else { - BT_ERR("Ignoring invalid connection parameters"); + bt_dev_err(hdev, "ignoring invalid connection parameters"); continue; } @@ -5525,14 +5529,14 @@ static int load_conn_param(struct sock *sk, struct hci_dev *hdev, void *data, min, max, latency, timeout); if (hci_check_conn_params(min, max, latency, timeout) < 0) { - BT_ERR("Ignoring invalid connection parameters"); + bt_dev_err(hdev, "ignoring invalid connection parameters"); continue; } hci_param = hci_conn_params_add(hdev, &param->addr.bdaddr, addr_type); if (!hci_param) { - BT_ERR("Failed to add connection parameters"); + bt_dev_err(hdev, "failed to add connection parameters"); continue; } diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index d41449b9e9d6..01117ae84f1d 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -996,7 +996,8 @@ static u8 smp_random(struct smp_chan *smp) return SMP_UNSPECIFIED; if (crypto_memneq(smp->pcnf, confirm, sizeof(smp->pcnf))) { - BT_ERR("Pairing failed (confirmation values mismatch)"); + bt_dev_err(hcon->hdev, "pairing failed " + "(confirmation values mismatch)"); return SMP_CONFIRM_FAILED; } @@ -1210,7 +1211,7 @@ static void sc_generate_ltk(struct smp_chan *smp) key = hci_find_link_key(hdev, &hcon->dst); if (!key) { - BT_ERR("%s No Link Key found to generate LTK", hdev->name); + bt_dev_err(hdev, "no Link Key found to generate LTK"); return; } @@ -2067,11 +2068,11 @@ static int fixup_sc_false_positive(struct smp_chan *smp) return SMP_UNSPECIFIED; if (hci_dev_test_flag(hdev, HCI_SC_ONLY)) { - BT_ERR("Refusing SMP SC -> legacy fallback in SC-only mode"); +
bt_dev_err(hdev, "refusing legacy fallback in SC-only mode"); return SMP_UNSPECIFIED; } - BT_ERR("Trying to fall back to legacy SMP"); + bt_dev_err(hdev, "trying to fall back to legacy SMP"); req = (void *) &smp->preq[1]; rsp = (void *) &smp->prsp[1]; @@ -2082,7 +2083,7 @@ static int fixup_sc_false_positive(struct smp_chan *smp) auth = req->auth_req & AUTH_REQ_MASK(hdev); if (tk_request(conn, 0, auth, rsp->io_capability, req->io_capability)) { - BT_ERR("Failed to fall back to legacy SMP"); + bt_dev_err(hdev, "failed to fall back to legacy SMP"); return SMP_UNSPECIFIED; } @@ -2355,7 +2356,7 @@ int smp_conn_security(struct hci_conn *hcon, __u8 sec_level) chan = conn->smp; if (!chan) { - BT_ERR("SMP security requested but not available"); + bt_dev_err(hcon->hdev, "security requested but not available"); return 1; } @@ -2548,7 +2549,7 @@ static int smp_cmd_ident_addr_info(struct l2cap_conn *conn, */ if (!bacmp(&info->bdaddr, BDADDR_ANY) || !hci_is_identity_address(&info->bdaddr, info->addr_type)) { - BT_ERR("Ignoring IRK with no identity address"); + bt_dev_err(hcon->hdev, "ignoring IRK with no identity address"); goto distribute; } @@ -2953,8 +2954,8 @@ done: return err; drop: - BT_ERR("%s unexpected SMP command 0x%02x from %pMR", hcon->hdev->name, - code, &hcon->dst); + bt_dev_err(hcon->hdev, "unexpected SMP command 0x%02x from %pMR", + code, &hcon->dst); kfree_skb(skb); return 0; } @@ -3021,8 +3022,7 @@ static void bredr_pairing(struct l2cap_chan *chan) smp = smp_chan_create(conn); if (!smp) { - BT_ERR("%s unable to create SMP context for BR/EDR", - hdev->name); + bt_dev_err(hdev, "unable to create SMP context for BR/EDR"); return; } -- cgit v1.2.3 From 384c181e3780ddc45e70483e29d84495b484730d Mon Sep 17 00:00:00 2001 From: Amritha Nambiar Date: Fri, 27 Oct 2017 02:35:34 -0700 Subject: net: sched: Identify hardware traffic classes using classid This patch offloads the classid to hardware and uses the classid reserved in the range :ffe0 - :ffef to identify hardware traffic classes reported via dev->num_tc. The tcf_result structure contains the class ID of the class to which the packet belongs; this ID is offloaded to hardware via the flower filter. A new helper function is introduced to represent HW traffic classes 0 through 15 using the reserved classid values :ffe0 - :ffef. Signed-off-by: Amritha Nambiar Acked-by: Shannon Nelson Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher
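As an illustrative sketch (hypothetical code, not part of this patch): a driver's flower-offload handler could consume the new classid field together with the tc_classid_to_hwtc() helper added below; the driver context and function name here are assumptions.

/* Hypothetical driver-side use of the new classid offload. A classid
 * outside the reserved :ffe0 - :ffef range, or one naming a traffic
 * class the device does not report via num_tc, yields -EINVAL.
 */
static int example_flower_replace(struct net_device *dev,
				  struct tc_cls_flower_offload *cls)
{
	int hwtc = tc_classid_to_hwtc(dev, cls->classid);

	if (hwtc < 0)
		return -EINVAL;

	/* ... program the device to steer matching flows to 'hwtc' ... */
	return 0;
}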
--- include/net/pkt_cls.h | 1 + include/net/sch_generic.h | 7 +++++++ net/sched/cls_flower.c | 2 ++ 3 files changed, 10 insertions(+) (limited to 'include') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index bf73e1675519..37c5ef766655 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -666,6 +666,7 @@ struct tc_cls_flower_offload { struct fl_flow_key *mask; struct fl_flow_key *key; struct tcf_exts *exts; + u32 classid; }; enum tc_matchall_command { diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 07c179dab478..c23e938f5b19 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -411,6 +411,13 @@ qdisc_class_find(const struct Qdisc_class_hash *hash, u32 id) return NULL; } +static inline int tc_classid_to_hwtc(struct net_device *dev, u32 classid) +{ + u32 hwtc = TC_H_MIN(classid) - TC_H_MIN_PRIORITY; + + return (hwtc < netdev_get_num_tc(dev)) ? hwtc : -EINVAL; +} + int qdisc_class_hash_init(struct Qdisc_class_hash *); void qdisc_class_hash_insert(struct Qdisc_class_hash *, struct Qdisc_class_common *); diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 35cb6d684e44..c99fa9e5be46 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -241,6 +241,7 @@ static int fl_hw_replace_filter(struct tcf_proto *tp, cls_flower.mask = mask; cls_flower.key = &f->mkey; cls_flower.exts = &f->exts; + cls_flower.classid = f->res.classid; err = tc_setup_cb_call(block, &f->exts, TC_SETUP_CLSFLOWER, &cls_flower, skip_sw); @@ -266,6 +267,7 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f) cls_flower.command = TC_CLSFLOWER_STATS; cls_flower.cookie = (unsigned long) f; cls_flower.exts = &f->exts; + cls_flower.classid = f->res.classid; tc_setup_cb_call(block, &f->exts, TC_SETUP_CLSFLOWER, &cls_flower, false); -- cgit v1.2.3 From 638f5b90d46016372a8e3e0a434f199cc5e12b8c Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 31 Oct 2017 18:16:05 -0700 Subject: bpf: reduce verifier memory consumption The verifier got progressively smarter over time and the size of its internal state grew as well. Time to reduce the memory consumption. Before: sizeof(struct bpf_verifier_state) = 6520 After: sizeof(struct bpf_verifier_state) = 896 It's done by observing that the majority of BPF programs use little to no stack, whereas the verifier kept all 512 stack slots ready always. Instead, dynamically reallocate struct verifier state when stack access is detected. The runtime difference before vs. after is within the noise. The number of processed instructions stays the same. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- drivers/net/ethernet/netronome/nfp/bpf/verifier.c | 8 +- include/linux/bpf_verifier.h | 16 +- kernel/bpf/verifier.c | 437 ++++++++++++++-------- 3 files changed, 305 insertions(+), 156 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c index 3d3dcac1c942..a8c7615546a9 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c @@ -76,9 +76,9 @@ nfp_bpf_goto_meta(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, static int nfp_bpf_check_exit(struct nfp_prog *nfp_prog, - const struct bpf_verifier_env *env) + struct bpf_verifier_env *env) { - const struct bpf_reg_state *reg0 = &env->cur_state.regs[0]; + const struct bpf_reg_state *reg0 = cur_regs(env) + BPF_REG_0; u64 imm; if (nfp_prog->act == NN_ACT_XDP) @@ -144,9 +144,9 @@ nfp_bpf_check_stack_access(struct nfp_prog *nfp_prog, static int nfp_bpf_check_ptr(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, - const struct bpf_verifier_env *env, u8 reg_no) + struct bpf_verifier_env *env, u8 reg_no) { - const struct bpf_reg_state *reg = &env->cur_state.regs[reg_no]; + const struct bpf_reg_state *reg = cur_regs(env) + reg_no; int err; if (reg->type != PTR_TO_CTX && diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index feeaea93d959..3b0976aaac75 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -88,14 +88,19 @@ enum bpf_stack_slot_type { #define BPF_REG_SIZE 8 /* size of eBPF register in bytes */ +struct bpf_stack_state { + struct bpf_reg_state spilled_ptr; + u8 slot_type[BPF_REG_SIZE]; +}; + /* state of the program: * type of all registers and stack info */ struct bpf_verifier_state { struct
bpf_reg_state regs[MAX_BPF_REG]; - u8 stack_slot_type[MAX_BPF_STACK]; - struct bpf_reg_state spilled_regs[MAX_BPF_STACK / BPF_REG_SIZE]; struct bpf_verifier_state *parent; + int allocated_stack; + struct bpf_stack_state *stack; }; /* linked list of verifier states used to prune search */ @@ -145,7 +150,7 @@ struct bpf_verifier_env { struct bpf_verifier_stack_elem *head; /* stack of verifier states to be processed */ int stack_size; /* number of states to be processed */ bool strict_alignment; /* perform strict pointer alignment checks */ - struct bpf_verifier_state cur_state; /* current verifier state */ + struct bpf_verifier_state *cur_state; /* current verifier state */ struct bpf_verifier_state_list **explored_states; /* search pruning optimization */ const struct bpf_ext_analyzer_ops *analyzer_ops; /* external analyzer ops */ void *analyzer_priv; /* pointer to external analyzer's private data */ @@ -159,6 +164,11 @@ struct bpf_verifier_env { struct bpf_verifer_log log; }; +static inline struct bpf_reg_state *cur_regs(struct bpf_verifier_env *env) +{ + return env->cur_state->regs; +} + int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, void *priv); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d906775e12c1..5f26f7ad124f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -276,43 +276,132 @@ static void print_verifier_state(struct bpf_verifier_env *env, verbose(env, ")"); } } - for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { - if (state->stack_slot_type[i] == STACK_SPILL) - verbose(env, " fp%d=%s", -MAX_BPF_STACK + i, - reg_type_str[state->spilled_regs[i / BPF_REG_SIZE].type]); + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { + if (state->stack[i].slot_type[0] == STACK_SPILL) + verbose(env, " fp%d=%s", + -MAX_BPF_STACK + i * BPF_REG_SIZE, + reg_type_str[state->stack[i].spilled_ptr.type]); } verbose(env, "\n"); } -static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx) +static int copy_stack_state(struct bpf_verifier_state *dst, + const struct bpf_verifier_state *src) { - struct bpf_verifier_stack_elem *elem; - int insn_idx; + if (!src->stack) + return 0; + if (WARN_ON_ONCE(dst->allocated_stack < src->allocated_stack)) { + /* internal bug, make state invalid to reject the program */ + memset(dst, 0, sizeof(*dst)); + return -EFAULT; + } + memcpy(dst->stack, src->stack, + sizeof(*src->stack) * (src->allocated_stack / BPF_REG_SIZE)); + return 0; +} + +/* do_check() starts with zero-sized stack in struct bpf_verifier_state to + * make it consume minimal amount of memory. check_stack_write() access from + * the program calls into realloc_verifier_state() to grow the stack size. + * Note there is a non-zero 'parent' pointer inside bpf_verifier_state + * which this function copies over. 
It points to previous bpf_verifier_state + * which is never reallocated + */ +static int realloc_verifier_state(struct bpf_verifier_state *state, int size, + bool copy_old) +{ + u32 old_size = state->allocated_stack; + struct bpf_stack_state *new_stack; + int slot = size / BPF_REG_SIZE; + + if (size <= old_size || !size) { + if (copy_old) + return 0; + state->allocated_stack = slot * BPF_REG_SIZE; + if (!size && old_size) { + kfree(state->stack); + state->stack = NULL; + } + return 0; + } + new_stack = kmalloc_array(slot, sizeof(struct bpf_stack_state), + GFP_KERNEL); + if (!new_stack) + return -ENOMEM; + if (copy_old) { + if (state->stack) + memcpy(new_stack, state->stack, + sizeof(*new_stack) * (old_size / BPF_REG_SIZE)); + memset(new_stack + old_size / BPF_REG_SIZE, 0, + sizeof(*new_stack) * (size - old_size) / BPF_REG_SIZE); + } + state->allocated_stack = slot * BPF_REG_SIZE; + kfree(state->stack); + state->stack = new_stack; + return 0; +} + +static void free_verifier_state(struct bpf_verifier_state *state) +{ + kfree(state->stack); + kfree(state); +} + +/* copy verifier state from src to dst growing dst stack space + * when necessary to accommodate larger src stack + */ +static int copy_verifier_state(struct bpf_verifier_state *dst, + const struct bpf_verifier_state *src) +{ + int err; + + err = realloc_verifier_state(dst, src->allocated_stack, false); + if (err) + return err; + memcpy(dst, src, offsetof(struct bpf_verifier_state, allocated_stack)); + return copy_stack_state(dst, src); +} + +static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx, + int *insn_idx) +{ + struct bpf_verifier_state *cur = env->cur_state; + struct bpf_verifier_stack_elem *elem, *head = env->head; + int err; if (env->head == NULL) - return -1; + return -ENOENT; - memcpy(&env->cur_state, &env->head->st, sizeof(env->cur_state)); - insn_idx = env->head->insn_idx; + if (cur) { + err = copy_verifier_state(cur, &head->st); + if (err) + return err; + } + if (insn_idx) + *insn_idx = head->insn_idx; if (prev_insn_idx) - *prev_insn_idx = env->head->prev_insn_idx; - elem = env->head->next; - kfree(env->head); + *prev_insn_idx = head->prev_insn_idx; + elem = head->next; + kfree(head); env->head = elem; env->stack_size--; - return insn_idx; + return 0; } static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx) { + struct bpf_verifier_state *cur = env->cur_state; struct bpf_verifier_stack_elem *elem; + int err; - elem = kmalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL); + elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL); if (!elem) goto err; - memcpy(&elem->st, &env->cur_state, sizeof(env->cur_state)); + err = copy_verifier_state(&elem->st, cur); + if (err) + return NULL; elem->insn_idx = insn_idx; elem->prev_insn_idx = prev_insn_idx; elem->next = env->head; @@ -325,7 +414,7 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, return &elem->st; err: /* pop all elements and return */ - while (pop_stack(env, NULL) >= 0); + while (!pop_stack(env, NULL, NULL)); return NULL; } @@ -550,7 +639,7 @@ static void mark_reg_read(const struct bpf_verifier_state *state, u32 regno) static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, enum reg_arg_type t) { - struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *regs = env->cur_state->regs; if (regno >= MAX_BPF_REG) { verbose(env, "R%d is invalid\n", regno); @@ -563,7 +652,7 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 
regno, verbose(env, "R%d !read_ok\n", regno); return -EACCES; } - mark_reg_read(&env->cur_state, regno); + mark_reg_read(env->cur_state, regno); } else { /* check whether register used as dest operand can be written to */ if (regno == BPF_REG_FP) { @@ -601,10 +690,21 @@ static int check_stack_write(struct bpf_verifier_env *env, struct bpf_verifier_state *state, int off, int size, int value_regno) { - int i, spi = (MAX_BPF_STACK + off) / BPF_REG_SIZE; + int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err; + + err = realloc_verifier_state(state, round_up(slot + 1, BPF_REG_SIZE), + true); + if (err) + return err; /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0, * so it's aligned access and [off, off + size) are within stack limits */ + if (!env->allow_ptr_leaks && + state->stack[spi].slot_type[0] == STACK_SPILL && + size != BPF_REG_SIZE) { + verbose(env, "attempt to corrupt spilled pointer on stack\n"); + return -EACCES; + } if (value_regno >= 0 && is_spillable_regtype(state->regs[value_regno].type)) { @@ -616,17 +716,18 @@ static int check_stack_write(struct bpf_verifier_env *env, } /* save register state */ - state->spilled_regs[spi] = state->regs[value_regno]; - state->spilled_regs[spi].live |= REG_LIVE_WRITTEN; + state->stack[spi].spilled_ptr = state->regs[value_regno]; + state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; for (i = 0; i < BPF_REG_SIZE; i++) - state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_SPILL; + state->stack[spi].slot_type[i] = STACK_SPILL; } else { /* regular write of data into stack */ - state->spilled_regs[spi] = (struct bpf_reg_state) {}; + state->stack[spi].spilled_ptr = (struct bpf_reg_state) {}; for (i = 0; i < size; i++) - state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_MISC; + state->stack[spi].slot_type[(slot - i) % BPF_REG_SIZE] = + STACK_MISC; } return 0; } @@ -637,10 +738,10 @@ static void mark_stack_slot_read(const struct bpf_verifier_state *state, int slo while (parent) { /* if read wasn't screened by an earlier write ... */ - if (state->spilled_regs[slot].live & REG_LIVE_WRITTEN) + if (state->stack[slot].spilled_ptr.live & REG_LIVE_WRITTEN) break; /* ... 
then we depend on parent's value */ - parent->spilled_regs[slot].live |= REG_LIVE_READ; + parent->stack[slot].spilled_ptr.live |= REG_LIVE_READ; state = parent; parent = state->parent; } @@ -650,34 +751,37 @@ static int check_stack_read(struct bpf_verifier_env *env, struct bpf_verifier_state *state, int off, int size, int value_regno) { - u8 *slot_type; - int i, spi; + int i, slot = -off - 1, spi = slot / BPF_REG_SIZE; + u8 *stype; - slot_type = &state->stack_slot_type[MAX_BPF_STACK + off]; + if (state->allocated_stack <= slot) { + verbose(env, "invalid read from stack off %d+0 size %d\n", + off, size); + return -EACCES; + } + stype = state->stack[spi].slot_type; - if (slot_type[0] == STACK_SPILL) { + if (stype[0] == STACK_SPILL) { if (size != BPF_REG_SIZE) { verbose(env, "invalid size of register spill\n"); return -EACCES; } for (i = 1; i < BPF_REG_SIZE; i++) { - if (slot_type[i] != STACK_SPILL) { + if (stype[(slot - i) % BPF_REG_SIZE] != STACK_SPILL) { verbose(env, "corrupted spill memory\n"); return -EACCES; } } - spi = (MAX_BPF_STACK + off) / BPF_REG_SIZE; - if (value_regno >= 0) { /* restore register state from stack */ - state->regs[value_regno] = state->spilled_regs[spi]; + state->regs[value_regno] = state->stack[spi].spilled_ptr; mark_stack_slot_read(state, spi); } return 0; } else { for (i = 0; i < size; i++) { - if (slot_type[i] != STACK_MISC) { + if (stype[(slot - i) % BPF_REG_SIZE] != STACK_MISC) { verbose(env, "invalid read from stack off %d+%d size %d\n", off, i, size); return -EACCES; @@ -694,7 +798,8 @@ static int check_stack_read(struct bpf_verifier_env *env, static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off, int size) { - struct bpf_map *map = env->cur_state.regs[regno].map_ptr; + struct bpf_reg_state *regs = cur_regs(env); + struct bpf_map *map = regs[regno].map_ptr; if (off < 0 || size <= 0 || off + size > map->value_size) { verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n", @@ -706,9 +811,9 @@ static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off, /* check read/write into a map element with possible variable offset */ static int check_map_access(struct bpf_verifier_env *env, u32 regno, - int off, int size) + int off, int size) { - struct bpf_verifier_state *state = &env->cur_state; + struct bpf_verifier_state *state = env->cur_state; struct bpf_reg_state *reg = &state->regs[regno]; int err; @@ -783,7 +888,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, static int __check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, int size) { - struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *regs = cur_regs(env); struct bpf_reg_state *reg = &regs[regno]; if (off < 0 || size <= 0 || (u64)off + size > reg->range) { @@ -797,7 +902,7 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, int size) { - struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *regs = cur_regs(env); struct bpf_reg_state *reg = &regs[regno]; int err; @@ -866,7 +971,7 @@ static bool __is_pointer_value(bool allow_ptr_leaks, static bool is_pointer_value(struct bpf_verifier_env *env, int regno) { - return __is_pointer_value(env->allow_ptr_leaks, &env->cur_state.regs[regno]); + return __is_pointer_value(env->allow_ptr_leaks, cur_regs(env) + regno); } static int check_pkt_ptr_alignment(struct bpf_verifier_env *env, @@ -968,8 +1073,9 @@ static int
check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn int bpf_size, enum bpf_access_type t, int value_regno) { - struct bpf_verifier_state *state = &env->cur_state; - struct bpf_reg_state *reg = &state->regs[regno]; + struct bpf_verifier_state *state = env->cur_state; + struct bpf_reg_state *regs = cur_regs(env); + struct bpf_reg_state *reg = regs + regno; int size, err = 0; size = bpf_size_to_bytes(bpf_size); @@ -993,7 +1099,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn err = check_map_access(env, regno, off, size); if (!err && t == BPF_READ && value_regno >= 0) - mark_reg_unknown(env, state->regs, value_regno); + mark_reg_unknown(env, regs, value_regno); } else if (reg->type == PTR_TO_CTX) { enum bpf_reg_type reg_type = SCALAR_VALUE; @@ -1028,14 +1134,14 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn * case, we know the offset is zero. */ if (reg_type == SCALAR_VALUE) - mark_reg_unknown(env, state->regs, value_regno); + mark_reg_unknown(env, regs, value_regno); else - mark_reg_known_zero(env, state->regs, + mark_reg_known_zero(env, regs, value_regno); - state->regs[value_regno].id = 0; - state->regs[value_regno].off = 0; - state->regs[value_regno].range = 0; - state->regs[value_regno].type = reg_type; + regs[value_regno].id = 0; + regs[value_regno].off = 0; + regs[value_regno].range = 0; + regs[value_regno].type = reg_type; } } else if (reg->type == PTR_TO_STACK) { @@ -1061,19 +1167,12 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (env->prog->aux->stack_depth < -off) env->prog->aux->stack_depth = -off; - if (t == BPF_WRITE) { - if (!env->allow_ptr_leaks && - state->stack_slot_type[MAX_BPF_STACK + off] == STACK_SPILL && - size != BPF_REG_SIZE) { - verbose(env, "attempt to corrupt spilled pointer on stack\n"); - return -EACCES; - } + if (t == BPF_WRITE) err = check_stack_write(env, state, off, size, value_regno); - } else { + else err = check_stack_read(env, state, off, size, value_regno); - } } else if (reg_is_pkt_pointer(reg)) { if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) { verbose(env, "cannot write into packet\n"); @@ -1087,7 +1186,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } err = check_packet_access(env, regno, off, size); if (!err && t == BPF_READ && value_regno >= 0) - mark_reg_unknown(env, state->regs, value_regno); + mark_reg_unknown(env, regs, value_regno); } else { verbose(env, "R%d invalid mem access '%s'\n", regno, reg_type_str[reg->type]); @@ -1095,11 +1194,11 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } if (!err && size < BPF_REG_SIZE && value_regno >= 0 && t == BPF_READ && - state->regs[value_regno].type == SCALAR_VALUE) { + regs[value_regno].type == SCALAR_VALUE) { /* b/h/w load zero-extends, mark upper bits as known 0 */ - state->regs[value_regno].var_off = tnum_cast( - state->regs[value_regno].var_off, size); - __update_reg_bounds(&state->regs[value_regno]); + regs[value_regno].var_off = + tnum_cast(regs[value_regno].var_off, size); + __update_reg_bounds(&regs[value_regno]); } return err; } @@ -1156,9 +1255,9 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, int access_size, bool zero_size_allowed, struct bpf_call_arg_meta *meta) { - struct bpf_verifier_state *state = &env->cur_state; + struct bpf_verifier_state *state = env->cur_state; struct bpf_reg_state *regs = state->regs; - int off, i; + int off, i, slot,
spi; if (regs[regno].type != PTR_TO_STACK) { /* Allow zero-byte read from NULL, regardless of pointer type */ @@ -1198,7 +1297,11 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, } for (i = 0; i < access_size; i++) { - if (state->stack_slot_type[MAX_BPF_STACK + off + i] != STACK_MISC) { + slot = -(off + i) - 1; + spi = slot / BPF_REG_SIZE; + if (state->allocated_stack <= slot || + state->stack[spi].slot_type[slot % BPF_REG_SIZE] != + STACK_MISC) { verbose(env, "invalid indirect read from stack off %d+%d size %d\n", off, i, access_size); return -EACCES; @@ -1211,7 +1314,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, int access_size, bool zero_size_allowed, struct bpf_call_arg_meta *meta) { - struct bpf_reg_state *regs = env->cur_state.regs, *reg = &regs[regno]; + struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno]; switch (reg->type) { case PTR_TO_PACKET: @@ -1229,7 +1332,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta) { - struct bpf_reg_state *regs = env->cur_state.regs, *reg = &regs[regno]; + struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno]; enum bpf_reg_type expected_type, type = reg->type; int err = 0; @@ -1514,7 +1617,7 @@ static int check_raw_mode(const struct bpf_func_proto *fn) */ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) { - struct bpf_verifier_state *state = &env->cur_state; + struct bpf_verifier_state *state = env->cur_state; struct bpf_reg_state *regs = state->regs, *reg; int i; @@ -1522,10 +1625,10 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) if (reg_is_pkt_pointer_any(&regs[i])) mark_reg_unknown(env, regs, i); - for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { - if (state->stack_slot_type[i] != STACK_SPILL) + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { + if (state->stack[i].slot_type[0] != STACK_SPILL) continue; - reg = &state->spilled_regs[i / BPF_REG_SIZE]; + reg = &state->stack[i].spilled_ptr; if (reg_is_pkt_pointer_any(reg)) __mark_reg_unknown(reg); } @@ -1533,9 +1636,8 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) { - struct bpf_verifier_state *state = &env->cur_state; const struct bpf_func_proto *fn = NULL; - struct bpf_reg_state *regs = state->regs; + struct bpf_reg_state *regs; struct bpf_call_arg_meta meta; bool changes_data; int i, err; @@ -1603,6 +1705,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) return err; } + regs = cur_regs(env); /* reset caller saved regs */ for (i = 0; i < CALLER_SAVED_REGS; i++) { mark_reg_not_init(env, regs, caller_saved[i]); @@ -1691,7 +1794,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, const struct bpf_reg_state *ptr_reg, const struct bpf_reg_state *off_reg) { - struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg; + struct bpf_reg_state *regs = cur_regs(env), *dst_reg; bool known = tnum_is_const(off_reg->var_off); s64 smin_val = off_reg->smin_value, smax_val = off_reg->smax_value, smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value; @@ -1703,13 +1806,13 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, dst_reg = &regs[dst]; if (WARN_ON_ONCE(known && (smin_val != smax_val))) { - print_verifier_state(env, &env->cur_state); + print_verifier_state(env, env->cur_state); verbose(env, "verifier internal error: known but bad
sbounds\n"); return -EINVAL; } if (WARN_ON_ONCE(known && (umin_val != umax_val))) { - print_verifier_state(env, &env->cur_state); + print_verifier_state(env, env->cur_state); verbose(env, "verifier internal error: known but bad ubounds\n"); return -EINVAL; @@ -1890,7 +1993,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, struct bpf_reg_state *dst_reg, struct bpf_reg_state src_reg) { - struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *regs = cur_regs(env); u8 opcode = BPF_OP(insn->code); bool src_known, dst_known; s64 smin_val, smax_val; @@ -2111,7 +2214,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, struct bpf_insn *insn) { - struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg, *src_reg; + struct bpf_reg_state *regs = cur_regs(env), *dst_reg, *src_reg; struct bpf_reg_state *ptr_reg = NULL, off_reg = {0}; u8 opcode = BPF_OP(insn->code); int rc; @@ -2185,12 +2288,12 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, /* Got here implies adding two SCALAR_VALUEs */ if (WARN_ON_ONCE(ptr_reg)) { - print_verifier_state(env, &env->cur_state); + print_verifier_state(env, env->cur_state); verbose(env, "verifier internal error: unexpected ptr_reg\n"); return -EINVAL; } if (WARN_ON(!src_reg)) { - print_verifier_state(env, &env->cur_state); + print_verifier_state(env, env->cur_state); verbose(env, "verifier internal error: no src_reg\n"); return -EINVAL; } @@ -2200,7 +2303,7 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, /* check validity of 32-bit and 64-bit arithmetic operations */ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) { - struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *regs = cur_regs(env); u8 opcode = BPF_OP(insn->code); int err; @@ -2421,10 +2524,10 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state, /* keep the maximum range already checked */ regs[i].range = max(regs[i].range, new_range); - for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { - if (state->stack_slot_type[i] != STACK_SPILL) + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { + if (state->stack[i].slot_type[0] != STACK_SPILL) continue; - reg = &state->spilled_regs[i / BPF_REG_SIZE]; + reg = &state->stack[i].spilled_ptr; if (reg->type == type && reg->id == dst_reg->id) reg->range = max_t(u16, reg->range, new_range); } @@ -2674,17 +2777,17 @@ static void mark_map_regs(struct bpf_verifier_state *state, u32 regno, for (i = 0; i < MAX_BPF_REG; i++) mark_map_reg(regs, i, id, is_null); - for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { - if (state->stack_slot_type[i] != STACK_SPILL) + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { + if (state->stack[i].slot_type[0] != STACK_SPILL) continue; - mark_map_reg(state->spilled_regs, i / BPF_REG_SIZE, id, is_null); + mark_map_reg(&state->stack[i].spilled_ptr, 0, id, is_null); } } static int check_cond_jmp_op(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx) { - struct bpf_verifier_state *other_branch, *this_branch = &env->cur_state; + struct bpf_verifier_state *other_branch, *this_branch = env->cur_state; struct bpf_reg_state *regs = this_branch->regs, *dst_reg; u8 opcode = BPF_OP(insn->code); int err; @@ -2876,7 +2979,7 @@ static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn) /* verify BPF_LD_IMM64 instruction */ static int check_ld_imm(struct bpf_verifier_env *env, struct 
bpf_insn *insn) { - struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *regs = cur_regs(env); int err; if (BPF_SIZE(insn->code) != BPF_DW) { @@ -2937,7 +3040,7 @@ static bool may_access_skb(enum bpf_prog_type type) */ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) { - struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *regs = cur_regs(env); u8 mode = BPF_MODE(insn->code); int i, err; @@ -2999,7 +3102,7 @@ static int check_return_code(struct bpf_verifier_env *env) return 0; } - reg = &env->cur_state.regs[BPF_REG_0]; + reg = cur_regs(env) + BPF_REG_0; if (reg->type != SCALAR_VALUE) { verbose(env, "At program exit the register R0 is not a known value (%s)\n", reg_type_str[reg->type]); @@ -3363,6 +3466,57 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, return false; } +static bool stacksafe(struct bpf_verifier_state *old, + struct bpf_verifier_state *cur, + struct idpair *idmap) +{ + int i, spi; + + /* if explored stack has more populated slots than current stack + * such stacks are not equivalent + */ + if (old->allocated_stack > cur->allocated_stack) + return false; + + /* walk slots of the explored stack and ignore any additional + * slots in the current stack, since explored(safe) state + * didn't use them + */ + for (i = 0; i < old->allocated_stack; i++) { + spi = i / BPF_REG_SIZE; + + if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID) + continue; + if (old->stack[spi].slot_type[i % BPF_REG_SIZE] != + cur->stack[spi].slot_type[i % BPF_REG_SIZE]) + /* Ex: old explored (safe) state has STACK_SPILL in + * this stack slot, but current has has STACK_MISC -> + * this verifier states are not equivalent, + * return false to continue verification of this path + */ + return false; + if (i % BPF_REG_SIZE) + continue; + if (old->stack[spi].slot_type[0] != STACK_SPILL) + continue; + if (!regsafe(&old->stack[spi].spilled_ptr, + &cur->stack[spi].spilled_ptr, + idmap)) + /* when explored and current stack slot are both storing + * spilled registers, check that stored pointers types + * are the same as well. + * Ex: explored safe path could have stored + * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -8} + * but current path has stored: + * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -16} + * such verifier states are not equivalent. + * return false to continue verification of this path + */ + return false; + } + return true; +} + /* compare two verifier states * * all states stored in state_list are known to be valid, since @@ -3407,37 +3561,8 @@ static bool states_equal(struct bpf_verifier_env *env, goto out_free; } - for (i = 0; i < MAX_BPF_STACK; i++) { - if (old->stack_slot_type[i] == STACK_INVALID) - continue; - if (old->stack_slot_type[i] != cur->stack_slot_type[i]) - /* Ex: old explored (safe) state has STACK_SPILL in - * this stack slot, but current has has STACK_MISC -> - * this verifier states are not equivalent, - * return false to continue verification of this path - */ - goto out_free; - if (i % BPF_REG_SIZE) - continue; - if (old->stack_slot_type[i] != STACK_SPILL) - continue; - if (!regsafe(&old->spilled_regs[i / BPF_REG_SIZE], - &cur->spilled_regs[i / BPF_REG_SIZE], - idmap)) - /* when explored and current stack slot are both storing - * spilled registers, check that stored pointers types - * are the same as well. 
- * Ex: explored safe path could have stored - * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -8} - * but current path has stored: - * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -16} - * such verifier states are not equivalent. - * return false to continue verification of this path - */ - goto out_free; - else - continue; - } + if (!stacksafe(old, cur, idmap)) + goto out_free; ret = true; out_free: kfree(idmap); @@ -3473,17 +3598,19 @@ static bool do_propagate_liveness(const struct bpf_verifier_state *state, } } /* ... and stack slots */ - for (i = 0; i < MAX_BPF_STACK / BPF_REG_SIZE; i++) { - if (parent->stack_slot_type[i * BPF_REG_SIZE] != STACK_SPILL) + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE && + i < parent->allocated_stack / BPF_REG_SIZE; i++) { + if (parent->stack[i].slot_type[0] != STACK_SPILL) continue; - if (state->stack_slot_type[i * BPF_REG_SIZE] != STACK_SPILL) + if (state->stack[i].slot_type[0] != STACK_SPILL) continue; - if (parent->spilled_regs[i].live & REG_LIVE_READ) + if (parent->stack[i].spilled_ptr.live & REG_LIVE_READ) continue; - if (writes && (state->spilled_regs[i].live & REG_LIVE_WRITTEN)) + if (writes && + (state->stack[i].spilled_ptr.live & REG_LIVE_WRITTEN)) continue; - if (state->spilled_regs[i].live & REG_LIVE_READ) { - parent->spilled_regs[i].live |= REG_LIVE_READ; + if (state->stack[i].spilled_ptr.live & REG_LIVE_READ) { + parent->stack[i].spilled_ptr.live |= REG_LIVE_READ; touched = true; } } @@ -3513,6 +3640,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) { struct bpf_verifier_state_list *new_sl; struct bpf_verifier_state_list *sl; + struct bpf_verifier_state *cur = env->cur_state; int i; sl = env->explored_states[insn_idx]; @@ -3523,7 +3651,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) return 0; while (sl != STATE_LIST_MARK) { - if (states_equal(env, &sl->state, &env->cur_state)) { + if (states_equal(env, &sl->state, cur)) { /* reached equivalent register/stack state, * prune the search. * Registers read by the continuation are read by us. @@ -3534,7 +3662,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) * they'll be immediately forgotten as we're pruning * this state and will pop a new one. */ - propagate_liveness(&sl->state, &env->cur_state); + propagate_liveness(&sl->state, cur); return 1; } sl = sl->next; @@ -3546,16 +3674,16 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) * it will be rejected. Since there are no loops, we won't be * seeing this 'insn_idx' instruction again on the way to bpf_exit */ - new_sl = kmalloc(sizeof(struct bpf_verifier_state_list), GFP_USER); + new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL); if (!new_sl) return -ENOMEM; /* add new state to the head of linked list */ - memcpy(&new_sl->state, &env->cur_state, sizeof(env->cur_state)); + copy_verifier_state(&new_sl->state, cur); new_sl->next = env->explored_states[insn_idx]; env->explored_states[insn_idx] = new_sl; /* connect new state to parentage chain */ - env->cur_state.parent = &new_sl->state; + cur->parent = &new_sl->state; /* clear write marks in current state: the writes we did are not writes * our child did, so they don't screen off its reads from us. * (There are no read marks in current state, because reads always mark @@ -3563,10 +3691,10 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) * explored_states can get read marks.) 
*/ for (i = 0; i < BPF_REG_FP; i++) - env->cur_state.regs[i].live = REG_LIVE_NONE; - for (i = 0; i < MAX_BPF_STACK / BPF_REG_SIZE; i++) - if (env->cur_state.stack_slot_type[i * BPF_REG_SIZE] == STACK_SPILL) - env->cur_state.spilled_regs[i].live = REG_LIVE_NONE; + cur->regs[i].live = REG_LIVE_NONE; + for (i = 0; i < cur->allocated_stack / BPF_REG_SIZE; i++) + if (cur->stack[i].slot_type[0] == STACK_SPILL) + cur->stack[i].spilled_ptr.live = REG_LIVE_NONE; return 0; } @@ -3581,15 +3709,19 @@ static int ext_analyzer_insn_hook(struct bpf_verifier_env *env, static int do_check(struct bpf_verifier_env *env) { - struct bpf_verifier_state *state = &env->cur_state; + struct bpf_verifier_state *state; struct bpf_insn *insns = env->prog->insnsi; - struct bpf_reg_state *regs = state->regs; + struct bpf_reg_state *regs; int insn_cnt = env->prog->len; int insn_idx, prev_insn_idx = 0; int insn_processed = 0; bool do_print_state = false; - init_reg_state(env, regs); + state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL); + if (!state) + return -ENOMEM; + env->cur_state = state; + init_reg_state(env, state->regs); state->parent = NULL; insn_idx = 0; for (;;) { @@ -3637,7 +3769,7 @@ static int do_check(struct bpf_verifier_env *env) else verbose(env, "\nfrom %d to %d:", prev_insn_idx, insn_idx); - print_verifier_state(env, &env->cur_state); + print_verifier_state(env, state); do_print_state = false; } @@ -3651,6 +3783,7 @@ static int do_check(struct bpf_verifier_env *env) if (err) return err; + regs = cur_regs(env); if (class == BPF_ALU || class == BPF_ALU64) { err = check_alu_op(env, insn); if (err) @@ -3818,8 +3951,10 @@ static int do_check(struct bpf_verifier_env *env) if (err) return err; process_bpf_exit: - insn_idx = pop_stack(env, &prev_insn_idx); - if (insn_idx < 0) { + err = pop_stack(env, &prev_insn_idx, &insn_idx); + if (err < 0) { + if (err != -ENOENT) + return err; break; } else { do_print_state = true; @@ -4359,9 +4494,11 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) env->allow_ptr_leaks = capable(CAP_SYS_ADMIN); ret = do_check(env); + free_verifier_state(env->cur_state); + env->cur_state = NULL; skip_full_check: - while (pop_stack(env, NULL) >= 0); + while (!pop_stack(env, NULL, NULL)); free_states(env); if (ret == 0) @@ -4464,9 +4601,11 @@ int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, env->allow_ptr_leaks = capable(CAP_SYS_ADMIN); ret = do_check(env); + free_verifier_state(env->cur_state); + env->cur_state = NULL; skip_full_check: - while (pop_stack(env, NULL) >= 0); + while (!pop_stack(env, NULL, NULL)); free_states(env); mutex_unlock(&bpf_verifier_lock); -- cgit v1.2.3 From 6c31e5a91fde2e718e59c8a627c56451f88be54c Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 27 Oct 2017 17:37:13 -0700 Subject: net: Add extack to fib_notifier_info Add extack to fib_notifier_info and plumb it through the stack to call_fib_rule_notifiers, call_fib_entry_notifiers and call_fib6_entry_notifiers. This allows notifier handlers to return messages to the user. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S.
Miller --- include/net/fib_notifier.h | 1 + net/core/fib_rules.c | 9 ++++++--- net/ipv4/fib_trie.c | 13 ++++++++----- net/ipv6/ip6_fib.c | 15 +++++++++------ 4 files changed, 24 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/net/fib_notifier.h b/include/net/fib_notifier.h index 54cd6b839d2f..c91ec732afd6 100644 --- a/include/net/fib_notifier.h +++ b/include/net/fib_notifier.h @@ -9,6 +9,7 @@ struct fib_notifier_info { struct net *net; int family; + struct netlink_ext_ack *extack; }; enum fib_event_type { diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 9a6d97c1d810..fafd0a41e3f7 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -314,10 +314,12 @@ static int call_fib_rule_notifier(struct notifier_block *nb, struct net *net, static int call_fib_rule_notifiers(struct net *net, enum fib_event_type event_type, struct fib_rule *rule, - struct fib_rules_ops *ops) + struct fib_rules_ops *ops, + struct netlink_ext_ack *extack) { struct fib_rule_notifier_info info = { .info.family = ops->family, + .info.extack = extack, .rule = rule, }; @@ -609,7 +611,7 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, if (rule->tun_id) ip_tunnel_need_metadata(); - call_fib_rule_notifiers(net, FIB_EVENT_RULE_ADD, rule, ops); + call_fib_rule_notifiers(net, FIB_EVENT_RULE_ADD, rule, ops, extack); notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).portid); flush_route_cache(ops); rules_ops_put(ops); @@ -749,7 +751,8 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, } } - call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL, rule, ops); + call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL, rule, ops, + NULL); notify_rule_change(RTM_DELRULE, rule, ops, nlh, NETLINK_CB(skb).portid); fib_rule_put(rule); diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index aaa1ba09afaa..5ddc4aefff12 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -102,9 +102,11 @@ static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net, static int call_fib_entry_notifiers(struct net *net, enum fib_event_type event_type, u32 dst, - int dst_len, struct fib_alias *fa) + int dst_len, struct fib_alias *fa, + struct netlink_ext_ack *extack) { struct fib_entry_notifier_info info = { + .info.extack = extack, .dst = dst, .dst_len = dst_len, .fi = fa->fa_info, @@ -1214,7 +1216,7 @@ int fib_table_insert(struct net *net, struct fib_table *tb, new_fa->fa_default = -1; call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, - key, plen, new_fa); + key, plen, new_fa, extack); rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, &cfg->fc_nlinfo, nlflags); @@ -1269,7 +1271,7 @@ int fib_table_insert(struct net *net, struct fib_table *tb, tb->tb_num_default++; rt_cache_flush(cfg->fc_nlinfo.nl_net); - call_fib_entry_notifiers(net, event, key, plen, new_fa); + call_fib_entry_notifiers(net, event, key, plen, new_fa, extack); rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, new_fa->tb_id, &cfg->fc_nlinfo, nlflags); succeeded: @@ -1569,7 +1571,7 @@ int fib_table_delete(struct net *net, struct fib_table *tb, return -ESRCH; call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, key, plen, - fa_to_delete); + fa_to_delete, extack); rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id, &cfg->fc_nlinfo, 0); @@ -1886,7 +1888,8 @@ int fib_table_flush(struct net *net, struct fib_table *tb) call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, n->key, - KEYLENGTH - fa->fa_slen, fa); + KEYLENGTH - fa->fa_slen, fa, + NULL); 
hlist_del_rcu(&fa->fa_list); fib_release_info(fa->fa_info); alias_free_mem_rcu(fa); diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 1ada9672d198..2e2804f5823e 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -353,9 +353,11 @@ static int call_fib6_entry_notifier(struct notifier_block *nb, struct net *net, static int call_fib6_entry_notifiers(struct net *net, enum fib_event_type event_type, - struct rt6_info *rt) + struct rt6_info *rt, + struct netlink_ext_ack *extack) { struct fib6_entry_notifier_info info = { + .info.extack = extack, .rt = rt, }; @@ -868,7 +870,8 @@ static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn, */ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, - struct nl_info *info, struct mx6_config *mxc) + struct nl_info *info, struct mx6_config *mxc, + struct netlink_ext_ack *extack) { struct rt6_info *leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&rt->rt6i_table->tb6_lock)); @@ -1011,7 +1014,7 @@ add: rcu_assign_pointer(rt->rt6i_node, fn); rcu_assign_pointer(*ins, rt); call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_ADD, - rt); + rt, extack); if (!info->skip_notify) inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); info->nl_net->ipv6.rt6_stats->fib_rt_entries++; @@ -1040,7 +1043,7 @@ add: rt->dst.rt6_next = iter->dst.rt6_next; rcu_assign_pointer(*ins, rt); call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_REPLACE, - rt, extack); if (!info->skip_notify) inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE); if (!(fn->fn_flags & RTN_RTINFO)) { @@ -1225,7 +1228,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, } #endif - err = fib6_add_rt2node(fn, rt, info, mxc); + err = fib6_add_rt2node(fn, rt, info, mxc, extack); if (!err) { fib6_update_sernum_upto_root(rt, sernum); fib6_start_gc(info->nl_net, rt); @@ -1686,7 +1689,7 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, fib6_purge_rt(rt, fn, net); - call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt); + call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt, NULL); if (!info->skip_notify) inet6_rt_notify(RTM_DELROUTE, rt, info, 0); rt6_release(rt); -- cgit v1.2.3 From da13c59b9936dfedcf9f2203bd29fbf83ad672bf Mon Sep 17 00:00:00 2001 From: Vishwanath Pai Date: Mon, 30 Oct 2017 19:38:52 -0400 Subject: net: display hw address of source machine during ipv6 DAD failure This patch updates the error messages displayed in the kernel log to include the hardware address of the source machine that caused IPv6 duplicate address detection failures. Examples: a) When we receive a NA packet from another machine advertising our address: ICMPv6: NA: 34:ab:cd:56:11:e8 advertised our address 2001:db8:: on eth0! b) When we detect DAD failure during address assignment to an interface: IPv6: eth0: IPv6 duplicate address 2001:db8:: used by 34:ab:cd:56:11:e8 detected! v2: Changed %pI6 to %pI6c in ndisc_recv_na() Changed the v6 address in the commit message to 2001:db8:: Suggested-by: Igor Lubashev Signed-off-by: Vishwanath Pai Acked-by: David Ahern Signed-off-by: David S. Miller
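For background (not part of the patch): %pM and %pI6c are existing kernel printk extensions that format a 6-byte MAC address and a compressed (RFC 5952) IPv6 address, which is what produces the messages shown above; a minimal hypothetical sketch:

/* Hypothetical example of the printk specifiers this patch relies on */
#include <linux/if_ether.h>
#include <linux/in6.h>
#include <linux/printk.h>

static void example_dad_log(void)
{
	u8 mac[ETH_ALEN] = { 0x34, 0xab, 0xcd, 0x56, 0x11, 0xe8 };

	/* prints: NA: 34:ab:cd:56:11:e8 advertised our address ::1 */
	pr_info("NA: %pM advertised our address %pI6c\n",
		mac, &in6addr_loopback);
}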
--- include/net/addrconf.h | 2 +- net/ipv6/addrconf.c | 6 +++--- net/ipv6/ndisc.c | 9 +++++---- 3 files changed, 9 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 15b5ffd7253d..2a616ea53956 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -208,7 +208,7 @@ void ipv6_mc_remap(struct inet6_dev *idev); void ipv6_mc_init_dev(struct inet6_dev *idev); void ipv6_mc_destroy_dev(struct inet6_dev *idev); int ipv6_mc_check_mld(struct sk_buff *skb, struct sk_buff **skb_trimmed); -void addrconf_dad_failure(struct inet6_ifaddr *ifp); +void addrconf_dad_failure(struct sk_buff *skb, struct inet6_ifaddr *ifp); bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group, const struct in6_addr *src_addr); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 5a8a10229a07..cfa374c8b54c 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1987,7 +1987,7 @@ static int addrconf_dad_end(struct inet6_ifaddr *ifp) return err; } -void addrconf_dad_failure(struct inet6_ifaddr *ifp) +void addrconf_dad_failure(struct sk_buff *skb, struct inet6_ifaddr *ifp) { struct inet6_dev *idev = ifp->idev; struct net *net = dev_net(ifp->idev->dev); @@ -1997,8 +1997,8 @@ void addrconf_dad_failure(struct inet6_ifaddr *ifp) return; } - net_info_ratelimited("%s: IPv6 duplicate address %pI6c detected!\n", - ifp->idev->dev->name, &ifp->addr); + net_info_ratelimited("%s: IPv6 duplicate address %pI6c used by %pM detected!\n", + ifp->idev->dev->name, &ifp->addr, eth_hdr(skb)->h_source); spin_lock_bh(&ifp->lock); diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 266a530414d7..f9c3ffe04382 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -46,6 +46,7 @@ #endif #include +#include #include #include #include @@ -822,7 +823,7 @@ have_ifp: * who is doing DAD * so fail our DAD process */ - addrconf_dad_failure(ifp); + addrconf_dad_failure(skb, ifp); return; } else { /* @@ -975,7 +976,7 @@ static void ndisc_recv_na(struct sk_buff *skb) if (ifp) { if (skb->pkt_type != PACKET_LOOPBACK && (ifp->flags & IFA_F_TENTATIVE)) { - addrconf_dad_failure(ifp); + addrconf_dad_failure(skb, ifp); return; } /* What should we make now? The advertisement @@ -989,8 +990,8 @@ static void ndisc_recv_na(struct sk_buff *skb) */ if (skb->pkt_type != PACKET_LOOPBACK) ND_PRINTK(1, warn, - "NA: someone advertises our address %pI6 on %s!\n", - &ifp->addr, ifp->idev->dev->name); + "NA: %pM advertised our address %pI6c on %s!\n", + eth_hdr(skb)->h_source, &ifp->addr, ifp->idev->dev->name); in6_ifa_put(ifp); return; } -- cgit v1.2.3 From e9292f2c03851ef81bef38579a0ee9c42140e586 Mon Sep 17 00:00:00 2001 From: Egil Hjelmeland Date: Tue, 31 Oct 2017 15:48:01 +0100 Subject: net: dsa: lan9303: Add STP ALR entry on port 0 STP BPDUs arriving on user ports must be sent to the CPU port only, for processing by the SW bridge. Add an ALR entry with STP state override to fix that. Signed-off-by: Egil Hjelmeland Reviewed-by: Andrew Lunn Signed-off-by: David S.
Miller --- drivers/net/dsa/lan9303-core.c | 2 ++ include/linux/dsa/lan9303.h | 2 ++ net/dsa/tag_lan9303.c | 1 - 3 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/net/dsa/lan9303-core.c b/drivers/net/dsa/lan9303-core.c index 4c412bd52319..c4afc8f1a66d 100644 --- a/drivers/net/dsa/lan9303-core.c +++ b/drivers/net/dsa/lan9303-core.c @@ -773,6 +773,7 @@ static int lan9303_separate_ports(struct lan9303 *chip) { int ret; + lan9303_alr_del_port(chip, eth_stp_addr, 0); ret = lan9303_write_switch_reg(chip, LAN9303_SWE_PORT_MIRROR, LAN9303_SWE_PORT_MIRROR_SNIFFER_PORT0 | LAN9303_SWE_PORT_MIRROR_MIRRORED_PORT1 | @@ -797,6 +798,7 @@ static void lan9303_bridge_ports(struct lan9303 *chip) lan9303_write_switch_reg(chip, LAN9303_SWE_PORT_STATE, chip->swe_port_state); + lan9303_alr_add_port(chip, eth_stp_addr, 0, true); } static int lan9303_handle_reset(struct lan9303 *chip) diff --git a/include/linux/dsa/lan9303.h b/include/linux/dsa/lan9303.h index 05d8d136baab..b2110e69630f 100644 --- a/include/linux/dsa/lan9303.h +++ b/include/linux/dsa/lan9303.h @@ -34,3 +34,5 @@ struct lan9303 { **/ struct lan9303_alr_cache_entry alr_cache[LAN9303_NUM_ALR_RECORDS]; }; + +#define eth_stp_addr eth_reserved_addr_base diff --git a/net/dsa/tag_lan9303.c b/net/dsa/tag_lan9303.c index 537ca991fafe..18f45cd9f625 100644 --- a/net/dsa/tag_lan9303.c +++ b/net/dsa/tag_lan9303.c @@ -42,7 +42,6 @@ #define LAN9303_TAG_LEN 4 # define LAN9303_TAG_TX_USE_ALR BIT(3) # define LAN9303_TAG_TX_STP_OVERRIDE BIT(4) -#define eth_stp_addr eth_reserved_addr_base /* Decide whether to transmit using ALR lookup, or transmit directly to * port using tag. ALR learning is performed only when using ALR lookup. -- cgit v1.2.3 From 0b5a89caee5c9958c18cd933c7f8891e35b21781 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 1 Nov 2017 11:47:38 +0100 Subject: net: sched: remove unused tc_should_offload helper tc_should_offload is no longer used, remove it. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 37c5ef766655..108dcdd96421 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -623,13 +623,6 @@ static inline bool tc_skip_hw(u32 flags) return (flags & TCA_CLS_FLAGS_SKIP_HW) ? true : false; } -static inline bool tc_should_offload(const struct net_device *dev, u32 flags) -{ - if (tc_skip_hw(flags)) - return false; - return tc_can_offload(dev); -} - static inline bool tc_skip_sw(u32 flags) { return (flags & TCA_CLS_FLAGS_SKIP_SW) ? true : false; -- cgit v1.2.3 From 70b5aee46782208c14d93b715e9f62f7fec844f1 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 1 Nov 2017 11:47:41 +0100 Subject: net: sched: remove ndo_setup_tc check from tc_can_offload Since tc_can_offload is always called from block callback or egdev callback, no need to check if ndo_setup_tc exists. Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/net/pkt_cls.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 108dcdd96421..d15c40c7bde7 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -611,11 +611,7 @@ struct tc_cls_u32_offload { static inline bool tc_can_offload(const struct net_device *dev) { - if (!(dev->features & NETIF_F_HW_TC)) - return false; - if (!dev->netdev_ops->ndo_setup_tc) - return false; - return true; + return dev->features & NETIF_F_HW_TC; } static inline bool tc_skip_hw(u32 flags) -- cgit v1.2.3 From 1495dc9f0a711a54f8fec849ce7f3a8f585a11e5 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 1 Nov 2017 11:48:00 -0700 Subject: security: bpf: replace include of linux/bpf.h with forward declarations Touching linux/bpf.h makes us rebuild a surprisingly large portion of the kernel. Remove the unnecessary dependency from security.h, it only needs forward declarations. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/security.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/security.h b/include/linux/security.h index 18800b0911e5..73f1ef625d40 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -31,7 +31,6 @@ #include #include #include -#include struct linux_binprm; struct cred; @@ -1732,6 +1731,10 @@ static inline void securityfs_remove(struct dentry *dentry) #endif #ifdef CONFIG_BPF_SYSCALL +union bpf_attr; +struct bpf_map; +struct bpf_prog; +struct bpf_prog_aux; #ifdef CONFIG_SECURITY extern int security_bpf(int cmd, union bpf_attr *attr, unsigned int size); extern int security_bpf_map(struct bpf_map *map, fmode_t fmode); -- cgit v1.2.3 From 47d3d7ac656a1ffb9d0f0d3c845663ed6fd7e78d Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 30 Oct 2017 14:16:00 -0700 Subject: ipv6: Implement limits on Hop-by-Hop and Destination options RFC 8200 (IPv6) defines Hop-by-Hop options and Destination options extension headers. Both of these carry a list of TLVs which is only limited by the maximum length of the extension header (2048 bytes). By the spec a host must process all the TLVs in these options; however, these could be used for a fairly obvious denial of service attack. I think this could in fact be a significant DOS vector on the Internet, one mitigating factor might be that many FWs drop all packets with EH (and obviously this is only IPv6) so an Internet wide attack might not be so effective (yet!). By my calculation, the worst-case packet with TLVs in a standard 1500 byte MTU packet that would be processed by the stack contains 1282 individual TLVs (including pad TLVs) or 724 two byte TLVs. I wrote a quick test program that floods a whole bunch of these packets to a host and sure enough there is substantial time spent in ip6_parse_tlv. These packets contain nothing but unknown TLVs (that are ignored), TLV padding, and a bogus UDP header with zero payload length. 25.38% [kernel] [k] __fib6_clean_all 21.63% [kernel] [k] ip6_parse_tlv 4.21% [kernel] [k] __local_bh_enable_ip 2.18% [kernel] [k] ip6_pol_route.isra.39 1.98% [kernel] [k] fib6_walk_continue 1.88% [kernel] [k] _raw_write_lock_bh 1.65% [kernel] [k] dst_release This patch adds configurable limits to Destination and Hop-by-Hop options. There are three limits that may be set: - Limit the number of options in a Hop-by-Hop or Destination options extension header. - Limit the byte length of a Hop-by-Hop or Destination options extension header. - Disallow unrecognized options in a Hop-by-Hop or Destination options extension header. The limits are set in corresponding sysctls: ipv6.sysctl.max_dst_opts_cnt ipv6.sysctl.max_hbh_opts_cnt ipv6.sysctl.max_dst_opts_len ipv6.sysctl.max_hbh_opts_len If a max_*_opts_cnt is less than zero then unknown TLVs are disallowed. The number of known TLVs that are allowed is the absolute value of this number. If a limit is exceeded when processing an extension header the packet is dropped. Default values are set to 8 for options counts, and set to INT_MAX for maximum length. Note the choice to limit options to 8 is an arbitrary guess (roughly based on the fact that the stack supports three HBH options and just one destination option). These limits have been proposed in draft-ietf-6man-rfc6434-bis. Tested (by Martin Lau): I tested with one thread (i.e. one raw_udp process). I changed the net.ipv6.max_dst_(opts|hbh)_number between 8 and 2048. With the sysctls set to 2048, the softirq% is pegged at 100%. With 8, the softirq% is almost unnoticeable in mpstat. v2: - Code and documentation cleanup. - Change references of RFC2460 to be RFC8200. - Add reference to RFC6434-bis where the limits will be part of the standard. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller
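As an illustrative sketch (hypothetical helper, not part of the patch): the sign convention on the new max_*_opts_cnt sysctls can be condensed as follows; ip6_parse_tlv() in the diff below implements the same logic inline.

/* Hypothetical condensation of the counting rule: a negative limit
 * disallows unknown TLVs and permits abs(limit) known ones; a
 * non-negative limit simply caps the number of non-padding TLVs.
 * Exceeding a limit means the packet is silently dropped.
 */
static bool tlv_within_limit(int limit, int tlv_count, bool tlv_known)
{
	bool disallow_unknowns = limit < 0;

	if (disallow_unknowns) {
		if (!tlv_known)
			return false;	/* unknown TLV: drop packet */
		limit = -limit;
	}
	return tlv_count <= limit;	/* over the cap: drop packet */
}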
--- Documentation/networking/ip-sysctl.txt | 24 ++++++++++++ include/net/ipv6.h | 40 ++++++++++++++++++++ include/net/netns/ipv6.h | 4 ++ net/ipv6/af_inet6.c | 4 ++ net/ipv6/exthdrs.c | 67 ++++++++++++++++++++++++++++------ net/ipv6/sysctl_net_ipv6.c | 32 ++++++++++++++ 6 files changed, 159 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 77f4de59dc9c..e6661b205f72 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -1385,6 +1385,30 @@ mld_qrv - INTEGER Default: 2 (as specified by RFC3810 9.1) Minimum: 1 (as specified by RFC6636 4.5) +max_dst_opts_cnt - INTEGER + Maximum number of non-padding TLVs allowed in a Destination + options extension header. If this value is less than zero + then unknown options are disallowed and the number of known + TLVs allowed is the absolute value of this number. + Default: 8 + +max_hbh_opts_cnt - INTEGER + Maximum number of non-padding TLVs allowed in a Hop-by-Hop + options extension header. If this value is less than zero + then unknown options are disallowed and the number of known + TLVs allowed is the absolute value of this number. + Default: 8 + +max dst_opts_len - INTEGER + Maximum length allowed for a Destination options extension + header. + Default: INT_MAX (unlimited) + +max hbh_opts_len - INTEGER + Maximum length allowed for a Hop-by-Hop options extension + header. + Default: INT_MAX (unlimited) + IPv6 Fragmentation: ip6frag_high_thresh - INTEGER diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 3cda3b521c36..fb6d67012de6 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -51,6 +51,46 @@ #define IPV6_DEFAULT_HOPLIMIT 64 #define IPV6_DEFAULT_MCASTHOPS 1 +/* Limits on Hop-by-Hop and Destination options. + * + * Per RFC8200 there is no limit on the maximum number or lengths of options in + * Hop-by-Hop or Destination options other then the packet must fit in an MTU.
+ * We allow configurable limits in order to mitigate potential denial of + * service attacks. + * + * There are three limits that may be set: + * - Limit the number of options in a Hop-by-Hop or Destination options + * extension header + * - Limit the byte length of a Hop-by-Hop or Destination options extension + * header + * - Disallow unknown options + * + * The limits are expressed in corresponding sysctls: + * + * ipv6.sysctl.max_dst_opts_cnt + * ipv6.sysctl.max_hbh_opts_cnt + * ipv6.sysctl.max_dst_opts_len + * ipv6.sysctl.max_hbh_opts_len + * + * max_*_opts_cnt is the number of TLVs that are allowed for Destination + * options or Hop-by-Hop options. If the number is less than zero then unknown + * TLVs are disallowed and the number of known options that are allowed is the + * absolute value. Setting the value to INT_MAX indicates no limit. + * + * max_*_opts_len is the length limit in bytes of a Destination or + * Hop-by-Hop options extension header. Setting the value to INT_MAX + * indicates no length limit. + * + * If a limit is exceeded when processing an extension header the packet is + * silently discarded. + */ + +/* Default limits for Hop-by-Hop and Destination options */ +#define IP6_DEFAULT_MAX_DST_OPTS_CNT 8 +#define IP6_DEFAULT_MAX_HBH_OPTS_CNT 8 +#define IP6_DEFAULT_MAX_DST_OPTS_LEN INT_MAX /* No limit */ +#define IP6_DEFAULT_MAX_HBH_OPTS_LEN INT_MAX /* No limit */ + /* * Addr type * diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 2ea1ed341ef8..600ba1c1befc 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -37,6 +37,10 @@ struct netns_sysctl_ipv6 { int idgen_delay; int flowlabel_state_ranges; int flowlabel_reflect; + int max_dst_opts_cnt; + int max_hbh_opts_cnt; + int max_dst_opts_len; + int max_hbh_opts_len; }; struct netns_ipv6 { diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index fe5262fd6aa5..c26f71234b9c 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -810,6 +810,10 @@ static int __net_init inet6_net_init(struct net *net) net->ipv6.sysctl.idgen_retries = 3; net->ipv6.sysctl.idgen_delay = 1 * HZ; net->ipv6.sysctl.flowlabel_state_ranges = 0; + net->ipv6.sysctl.max_dst_opts_cnt = IP6_DEFAULT_MAX_DST_OPTS_CNT; + net->ipv6.sysctl.max_hbh_opts_cnt = IP6_DEFAULT_MAX_HBH_OPTS_CNT; + net->ipv6.sysctl.max_dst_opts_len = IP6_DEFAULT_MAX_DST_OPTS_LEN; + net->ipv6.sysctl.max_hbh_opts_len = IP6_DEFAULT_MAX_HBH_OPTS_LEN; atomic_set(&net->ipv6.fib6_sernum, 1); err = ipv6_init_mibs(net); diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 9f918a770f87..83bd75713535 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -74,8 +74,20 @@ struct tlvtype_proc { /* An unknown option is detected, decide what to do */ -static bool ip6_tlvopt_unknown(struct sk_buff *skb, int optoff) +static bool ip6_tlvopt_unknown(struct sk_buff *skb, int optoff, + bool disallow_unknowns) { + if (disallow_unknowns) { + /* If unknown TLVs are disallowed by configuration + * then always silently drop packet. Note this also + * means no ICMP parameter problem is sent which + * could be a good property to mitigate a reflection DOS + * attack. 
+ */ + + goto drop; + } + switch ((skb_network_header(skb)[optoff] & 0xC0) >> 6) { case 0: /* ignore */ return true; @@ -95,20 +107,30 @@ static bool ip6_tlvopt_unknown(struct sk_buff *skb, int optoff) return false; } +drop: kfree_skb(skb); return false; } /* Parse tlv encoded option header (hop-by-hop or destination) */ -static bool ip6_parse_tlv(const struct tlvtype_proc *procs, struct sk_buff *skb) +static bool ip6_parse_tlv(const struct tlvtype_proc *procs, + struct sk_buff *skb, + int max_count) { - const struct tlvtype_proc *curr; + int len = (skb_transport_header(skb)[1] + 1) << 3; const unsigned char *nh = skb_network_header(skb); int off = skb_network_header_len(skb); - int len = (skb_transport_header(skb)[1] + 1) << 3; + const struct tlvtype_proc *curr; + bool disallow_unknowns = false; + int tlv_count = 0; int padlen = 0; + if (unlikely(max_count < 0)) { + disallow_unknowns = true; + max_count = -max_count; + } + if (skb_transport_offset(skb) + len > skb_headlen(skb)) goto bad; @@ -149,6 +171,11 @@ static bool ip6_parse_tlv(const struct tlvtype_proc *procs, struct sk_buff *skb) default: /* Other TLV code so scan list */ if (optlen > len) goto bad; + + tlv_count++; + if (tlv_count > max_count) + goto bad; + for (curr = procs; curr->type >= 0; curr++) { if (curr->type == nh[off]) { /* type specific length/alignment @@ -159,10 +186,10 @@ static bool ip6_parse_tlv(const struct tlvtype_proc *procs, struct sk_buff *skb) break; } } - if (curr->type < 0) { - if (ip6_tlvopt_unknown(skb, off) == 0) - return false; - } + if (curr->type < 0 && + !ip6_tlvopt_unknown(skb, off, disallow_unknowns)) + return false; + padlen = 0; break; } @@ -258,23 +285,31 @@ static int ipv6_destopt_rcv(struct sk_buff *skb) __u16 dstbuf; #endif struct dst_entry *dst = skb_dst(skb); + struct net *net = dev_net(skb->dev); + int extlen; if (!pskb_may_pull(skb, skb_transport_offset(skb) + 8) || !pskb_may_pull(skb, (skb_transport_offset(skb) + ((skb_transport_header(skb)[1] + 1) << 3)))) { __IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS); +fail_and_free: kfree_skb(skb); return -1; } + extlen = (skb_transport_header(skb)[1] + 1) << 3; + if (extlen > net->ipv6.sysctl.max_dst_opts_len) + goto fail_and_free; + opt->lastopt = opt->dst1 = skb_network_header_len(skb); #if IS_ENABLED(CONFIG_IPV6_MIP6) dstbuf = opt->dst1; #endif - if (ip6_parse_tlv(tlvprocdestopt_lst, skb)) { - skb->transport_header += (skb_transport_header(skb)[1] + 1) << 3; + if (ip6_parse_tlv(tlvprocdestopt_lst, skb, + init_net.ipv6.sysctl.max_dst_opts_cnt)) { + skb->transport_header += extlen; opt = IP6CB(skb); #if IS_ENABLED(CONFIG_IPV6_MIP6) opt->nhoff = dstbuf; @@ -803,6 +838,8 @@ static const struct tlvtype_proc tlvprochopopt_lst[] = { int ipv6_parse_hopopts(struct sk_buff *skb) { struct inet6_skb_parm *opt = IP6CB(skb); + struct net *net = dev_net(skb->dev); + int extlen; /* * skb_network_header(skb) is equal to skb->data, and @@ -813,13 +850,19 @@ int ipv6_parse_hopopts(struct sk_buff *skb) if (!pskb_may_pull(skb, sizeof(struct ipv6hdr) + 8) || !pskb_may_pull(skb, (sizeof(struct ipv6hdr) + ((skb_transport_header(skb)[1] + 1) << 3)))) { +fail_and_free: kfree_skb(skb); return -1; } + extlen = (skb_transport_header(skb)[1] + 1) << 3; + if (extlen > net->ipv6.sysctl.max_hbh_opts_len) + goto fail_and_free; + opt->flags |= IP6SKB_HOPBYHOP; - if (ip6_parse_tlv(tlvprochopopt_lst, skb)) { - skb->transport_header += (skb_transport_header(skb)[1] + 1) << 3; + if (ip6_parse_tlv(tlvprochopopt_lst, skb, + 
init_net.ipv6.sysctl.max_hbh_opts_cnt)) { + skb->transport_header += extlen; opt = IP6CB(skb); opt->nhoff = sizeof(struct ipv6hdr); return 1; diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index 6fbf8ae5e52c..4a2f0fd870bc 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -97,6 +97,34 @@ static struct ctl_table ipv6_table_template[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "max_dst_opts_number", + .data = &init_net.ipv6.sysctl.max_dst_opts_cnt, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "max_hbh_opts_number", + .data = &init_net.ipv6.sysctl.max_hbh_opts_cnt, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "max_dst_opts_length", + .data = &init_net.ipv6.sysctl.max_dst_opts_len, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "max_hbh_length", + .data = &init_net.ipv6.sysctl.max_hbh_opts_len, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { } }; @@ -157,6 +185,10 @@ static int __net_init ipv6_sysctl_net_init(struct net *net) ipv6_table[7].data = &net->ipv6.sysctl.flowlabel_state_ranges; ipv6_table[8].data = &net->ipv6.sysctl.ip_nonlocal_bind; ipv6_table[9].data = &net->ipv6.sysctl.flowlabel_reflect; + ipv6_table[10].data = &net->ipv6.sysctl.max_dst_opts_cnt; + ipv6_table[11].data = &net->ipv6.sysctl.max_hbh_opts_cnt; + ipv6_table[12].data = &net->ipv6.sysctl.max_dst_opts_len; + ipv6_table[13].data = &net->ipv6.sysctl.max_hbh_opts_len; ipv6_route_table = ipv6_route_sysctl_init(net); if (!ipv6_route_table) -- cgit v1.2.3 From cf34ce3da1e41579296364509266c7dac573822a Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 30 Oct 2017 14:41:35 -0700 Subject: tcp: add tracepoint trace_tcp_retransmit_synack() This tracepoint can be used to trace synack retransmits. It maintains pointer to struct request_sock. We cannot simply reuse trace_tcp_retransmit_skb() here, because the sk here is the LISTEN socket. The IP addresses and ports should be extracted from struct request_sock. Note that, like many other tracepoints, this patch uses IS_ENABLED in TP_fast_assign macro, which triggers sparse warning like: ./include/trace/events/tcp.h:274:1: error: directive in argument list ./include/trace/events/tcp.h:281:1: error: directive in argument list However, there is no good solution to avoid these warnings. To the best of our knowledge, these warnings are harmless. Signed-off-by: Song Liu Acked-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Signed-off-by: David S. 
Miller --- include/trace/events/tcp.h | 56 ++++++++++++++++++++++++++++++++++++++++++++++ net/ipv4/tcp_output.c | 1 + 2 files changed, 57 insertions(+) (limited to 'include') diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index 03699ba71623..07cccca6cbf1 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -237,6 +237,62 @@ TRACE_EVENT(tcp_set_state, show_tcp_state_name(__entry->newstate)) ); +TRACE_EVENT(tcp_retransmit_synack, + + TP_PROTO(const struct sock *sk, const struct request_sock *req), + + TP_ARGS(sk, req), + + TP_STRUCT__entry( + __field(const void *, skaddr) + __field(const void *, req) + __field(__u16, sport) + __field(__u16, dport) + __array(__u8, saddr, 4) + __array(__u8, daddr, 4) + __array(__u8, saddr_v6, 16) + __array(__u8, daddr_v6, 16) + ), + + TP_fast_assign( + struct inet_request_sock *ireq = inet_rsk(req); + struct in6_addr *pin6; + __be32 *p32; + + __entry->skaddr = sk; + __entry->req = req; + + __entry->sport = ireq->ir_num; + __entry->dport = ntohs(ireq->ir_rmt_port); + + p32 = (__be32 *) __entry->saddr; + *p32 = ireq->ir_loc_addr; + + p32 = (__be32 *) __entry->daddr; + *p32 = ireq->ir_rmt_addr; + +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6) { + pin6 = (struct in6_addr *)__entry->saddr_v6; + *pin6 = ireq->ir_v6_loc_addr; + pin6 = (struct in6_addr *)__entry->daddr_v6; + *pin6 = ireq->ir_v6_rmt_addr; + } else +#endif + { + pin6 = (struct in6_addr *)__entry->saddr_v6; + ipv6_addr_set_v4mapped(ireq->ir_loc_addr, pin6); + pin6 = (struct in6_addr *)__entry->daddr_v6; + ipv6_addr_set_v4mapped(ireq->ir_rmt_addr, pin6); + } + ), + + TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c", + __entry->sport, __entry->dport, + __entry->saddr, __entry->daddr, + __entry->saddr_v6, __entry->daddr_v6) +); + #endif /* _TRACE_TCP_H */ /* This part must be outside protection */ diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index a85e8a282d17..06a0c89ffe40 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3782,6 +3782,7 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); if (unlikely(tcp_passive_fastopen(sk))) tcp_sk(sk)->total_retrans++; + trace_tcp_retransmit_synack(sk, req); } return res; } -- cgit v1.2.3 From 054287295b1132c8742ea55f8e3af9cbd630c932 Mon Sep 17 00:00:00 2001 From: Egil Hjelmeland Date: Thu, 2 Nov 2017 10:36:48 +0100 Subject: net: Define eth_stp_addr in linux/etherdevice.h The lan9303 driver defines eth_stp_addr as a synonym to eth_reserved_addr_base to get the STP ethernet address 01:80:c2:00:00:00. eth_reserved_addr_base is also used to define the start of Bridge Reserved ethernet address range, which happen to be the STP address. br_dev_setup refer to eth_reserved_addr_base as a definition of STP address. Clean up by: - Move the eth_stp_addr definition to linux/etherdevice.h - Use eth_stp_addr instead of eth_reserved_addr_base in br_dev_setup. Signed-off-by: Egil Hjelmeland Reviewed-by: Andrew Lunn Signed-off-by: David S. 
Miller --- include/linux/dsa/lan9303.h | 2 -- include/linux/etherdevice.h | 1 + net/bridge/br_device.c | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/dsa/lan9303.h b/include/linux/dsa/lan9303.h index b2110e69630f..05d8d136baab 100644 --- a/include/linux/dsa/lan9303.h +++ b/include/linux/dsa/lan9303.h @@ -34,5 +34,3 @@ struct lan9303 { **/ struct lan9303_alr_cache_entry alr_cache[LAN9303_NUM_ALR_RECORDS]; }; - -#define eth_stp_addr eth_reserved_addr_base diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index 2d9f80848d4b..263dbcad22fc 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -66,6 +66,7 @@ int eth_gro_complete(struct sk_buff *skb, int nhoff); /* Reserved Ethernet Addresses per IEEE 802.1Q */ static const u8 eth_reserved_addr_base[ETH_ALEN] __aligned(2) = { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x00 }; +#define eth_stp_addr eth_reserved_addr_base /** * is_link_local_ether_addr - Determine if given Ethernet address is link-local diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 28bb22186fa0..af5b8c87f590 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -421,7 +421,7 @@ void br_dev_setup(struct net_device *dev) br->bridge_id.prio[0] = 0x80; br->bridge_id.prio[1] = 0x00; - ether_addr_copy(br->group_addr, eth_reserved_addr_base); + ether_addr_copy(br->group_addr, eth_stp_addr); br->stp_enabled = BR_NO_STP; br->group_fwd_mask = BR_GROUPFWD_DEFAULT; -- cgit v1.2.3 From 3ae6ec08292f01c6782d1a80be0b2cc675e0ecfc Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 2 Nov 2017 17:14:05 +0100 Subject: ipv4: Send a netevent whenever multipath hash policy is changed Devices performing IPv4 forwarding need to update their multipath hash policy whenever it is changed. Inform these devices by generating a netevent. Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Signed-off-by: Jiri Pirko Acked-by: David Ahern Signed-off-by: David S. 
Miller --- include/net/netevent.h | 1 + net/ipv4/sysctl_net_ipv4.c | 20 +++++++++++++++++++- 2 files changed, 20 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netevent.h b/include/net/netevent.h index f440df172b56..e3f0e8f2f6e8 100644 --- a/include/net/netevent.h +++ b/include/net/netevent.h @@ -25,6 +25,7 @@ enum netevent_notif_type { NETEVENT_NEIGH_UPDATE = 1, /* arg is struct neighbour ptr */ NETEVENT_REDIRECT, /* arg is struct netevent_redirect ptr */ NETEVENT_DELAY_PROBE_TIME_UPDATE, /* arg is struct neigh_parms ptr */ + NETEVENT_MULTIPATH_HASH_UPDATE, /* arg is struct net ptr */ }; int register_netevent_notifier(struct notifier_block *nb); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 4602af6d5358..8dcc2b185fcc 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -25,6 +25,7 @@ #include #include #include +#include static int zero; static int one = 1; @@ -385,6 +386,23 @@ static int proc_tcp_available_ulp(struct ctl_table *ctl, return ret; } +#ifdef CONFIG_IP_ROUTE_MULTIPATH +static int proc_fib_multipath_hash_policy(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + struct net *net = container_of(table->data, struct net, + ipv4.sysctl_fib_multipath_hash_policy); + int ret; + + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (write && ret == 0) + call_netevent_notifiers(NETEVENT_MULTIPATH_HASH_UPDATE, net); + + return ret; +} +#endif + static struct ctl_table ipv4_table[] = { { .procname = "tcp_max_orphans", @@ -907,7 +925,7 @@ static struct ctl_table ipv4_net_table[] = { .data = &init_net.ipv4.sysctl_fib_multipath_hash_policy, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_fib_multipath_hash_policy, .extra1 = &zero, .extra2 = &one, }, -- cgit v1.2.3 From c7eb7d7230509ec862d4144f7a831f995bc5d028 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 3 Nov 2017 11:46:24 +0100 Subject: net: sched: introduce chain_head_change callback Add a callback that is to be called whenever head of the chain changes. Also provide a callback for the default case when the caller gets a block using non-extended getter. Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/net/pkt_cls.h | 14 ++++++------ include/net/sch_generic.h | 5 ++++- net/sched/cls_api.c | 54 ++++++++++++++++++++++++++++------------------- net/sched/sch_ingress.c | 36 +++++++++++++++++-------------- 4 files changed, 62 insertions(+), 47 deletions(-) (limited to 'include') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index d15c40c7bde7..505d4b71975f 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -26,6 +26,8 @@ enum tcf_block_binder_type { struct tcf_block_ext_info { enum tcf_block_binder_type binder_type; + tcf_chain_head_change_t *chain_head_change; + void *chain_head_change_priv; }; struct tcf_block_cb; @@ -37,12 +39,10 @@ struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index, void tcf_chain_put(struct tcf_chain *chain); int tcf_block_get(struct tcf_block **p_block, struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q); -int tcf_block_get_ext(struct tcf_block **p_block, - struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q, +int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q, struct tcf_block_ext_info *ei); void tcf_block_put(struct tcf_block *block); -void tcf_block_put_ext(struct tcf_block *block, - struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q, +void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q, struct tcf_block_ext_info *ei); static inline struct Qdisc *tcf_block_q(struct tcf_block *block) @@ -82,8 +82,7 @@ int tcf_block_get(struct tcf_block **p_block, } static inline -int tcf_block_get_ext(struct tcf_block **p_block, - struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q, +int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q, struct tcf_block_ext_info *ei) { return 0; @@ -94,8 +93,7 @@ static inline void tcf_block_put(struct tcf_block *block) } static inline -void tcf_block_put_ext(struct tcf_block *block, - struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q, +void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q, struct tcf_block_ext_info *ei) { } diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index c23e938f5b19..f230269e0bfb 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -260,9 +260,12 @@ struct qdisc_skb_cb { unsigned char data[QDISC_CB_PRIV_LEN]; }; +typedef void tcf_chain_head_change_t(struct tcf_proto *tp_head, void *priv); + struct tcf_chain { struct tcf_proto __rcu *filter_chain; - struct tcf_proto __rcu **p_filter_chain; + tcf_chain_head_change_t *chain_head_change; + void *chain_head_change_priv; struct list_head list; struct tcf_block *block; u32 index; /* chain index */ diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 8d1885abee83..206e19f4fc01 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -195,12 +195,19 @@ static struct tcf_chain *tcf_chain_create(struct tcf_block *block, return chain; } +static void tcf_chain_head_change(struct tcf_chain *chain, + struct tcf_proto *tp_head) +{ + if (chain->chain_head_change) + chain->chain_head_change(tp_head, + chain->chain_head_change_priv); +} + static void tcf_chain_flush(struct tcf_chain *chain) { struct tcf_proto *tp; - if (chain->p_filter_chain) - RCU_INIT_POINTER(*chain->p_filter_chain, NULL); + tcf_chain_head_change(chain, NULL); while ((tp = rtnl_dereference(chain->filter_chain)) != NULL) { RCU_INIT_POINTER(chain->filter_chain, tp->next); tcf_chain_put(chain); @@ -242,13 +249,6 @@ void tcf_chain_put(struct tcf_chain *chain) } EXPORT_SYMBOL(tcf_chain_put); -static void 
-tcf_chain_filter_chain_ptr_set(struct tcf_chain *chain, - struct tcf_proto __rcu **p_filter_chain) -{ - chain->p_filter_chain = p_filter_chain; -} - static void tcf_block_offload_cmd(struct tcf_block *block, struct Qdisc *q, struct tcf_block_ext_info *ei, enum tc_block_command command) @@ -276,8 +276,7 @@ static void tcf_block_offload_unbind(struct tcf_block *block, struct Qdisc *q, tcf_block_offload_cmd(block, q, ei, TC_BLOCK_UNBIND); } -int tcf_block_get_ext(struct tcf_block **p_block, - struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q, +int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q, struct tcf_block_ext_info *ei) { struct tcf_block *block = kzalloc(sizeof(*block), GFP_KERNEL); @@ -295,7 +294,9 @@ int tcf_block_get_ext(struct tcf_block **p_block, err = -ENOMEM; goto err_chain_create; } - tcf_chain_filter_chain_ptr_set(chain, p_filter_chain); + WARN_ON(!ei->chain_head_change); + chain->chain_head_change = ei->chain_head_change; + chain->chain_head_change_priv = ei->chain_head_change_priv; block->net = qdisc_net(q); block->q = q; tcf_block_offload_bind(block, q, ei); @@ -308,12 +309,23 @@ err_chain_create: } EXPORT_SYMBOL(tcf_block_get_ext); +static void tcf_chain_head_change_dflt(struct tcf_proto *tp_head, void *priv) +{ + struct tcf_proto __rcu **p_filter_chain = priv; + + rcu_assign_pointer(*p_filter_chain, tp_head); +} + int tcf_block_get(struct tcf_block **p_block, struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q) { - struct tcf_block_ext_info ei = {0, }; + struct tcf_block_ext_info ei = { + .chain_head_change = tcf_chain_head_change_dflt, + .chain_head_change_priv = p_filter_chain, + }; - return tcf_block_get_ext(p_block, p_filter_chain, q, &ei); + WARN_ON(!p_filter_chain); + return tcf_block_get_ext(p_block, q, &ei); } EXPORT_SYMBOL(tcf_block_get); @@ -334,8 +346,7 @@ static void tcf_block_put_final(struct work_struct *work) * actions should be all removed after flushing. However, filters are now * destroyed in tc filter workqueue with RTNL lock, they can not race here. 
*/ -void tcf_block_put_ext(struct tcf_block *block, - struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q, +void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q, struct tcf_block_ext_info *ei) { struct tcf_chain *chain, *tmp; @@ -361,7 +372,7 @@ void tcf_block_put(struct tcf_block *block) if (!block) return; - tcf_block_put_ext(block, NULL, block->q, &ei); + tcf_block_put_ext(block, block->q, &ei); } EXPORT_SYMBOL(tcf_block_put); @@ -537,9 +548,8 @@ static void tcf_chain_tp_insert(struct tcf_chain *chain, struct tcf_chain_info *chain_info, struct tcf_proto *tp) { - if (chain->p_filter_chain && - *chain_info->pprev == chain->filter_chain) - rcu_assign_pointer(*chain->p_filter_chain, tp); + if (*chain_info->pprev == chain->filter_chain) + tcf_chain_head_change(chain, tp); RCU_INIT_POINTER(tp->next, tcf_chain_tp_prev(chain_info)); rcu_assign_pointer(*chain_info->pprev, tp); tcf_chain_hold(chain); @@ -551,8 +561,8 @@ static void tcf_chain_tp_remove(struct tcf_chain *chain, { struct tcf_proto *next = rtnl_dereference(chain_info->next); - if (chain->p_filter_chain && tp == chain->filter_chain) - RCU_INIT_POINTER(*chain->p_filter_chain, next); + if (tp == chain->filter_chain) + tcf_chain_head_change(chain, next); RCU_INIT_POINTER(*chain_info->pprev, next); tcf_chain_put(chain); } diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c index b599db26d34b..811845815b8c 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -54,6 +54,13 @@ static struct tcf_block *ingress_tcf_block(struct Qdisc *sch, unsigned long cl) return q->block; } +static void clsact_chain_head_change(struct tcf_proto *tp_head, void *priv) +{ + struct tcf_proto __rcu **p_filter_chain = priv; + + rcu_assign_pointer(*p_filter_chain, tp_head); +} + static int ingress_init(struct Qdisc *sch, struct nlattr *opt) { struct ingress_sched_data *q = qdisc_priv(sch); @@ -61,9 +68,10 @@ static int ingress_init(struct Qdisc *sch, struct nlattr *opt) int err; q->block_info.binder_type = TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS; + q->block_info.chain_head_change = clsact_chain_head_change; + q->block_info.chain_head_change_priv = &dev->ingress_cl_list; - err = tcf_block_get_ext(&q->block, &dev->ingress_cl_list, - sch, &q->block_info); + err = tcf_block_get_ext(&q->block, sch, &q->block_info); if (err) return err; @@ -76,10 +84,8 @@ static int ingress_init(struct Qdisc *sch, struct nlattr *opt) static void ingress_destroy(struct Qdisc *sch) { struct ingress_sched_data *q = qdisc_priv(sch); - struct net_device *dev = qdisc_dev(sch); - tcf_block_put_ext(q->block, &dev->ingress_cl_list, - sch, &q->block_info); + tcf_block_put_ext(q->block, sch, &q->block_info); net_dec_ingress_queue(); } @@ -162,16 +168,18 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt) int err; q->ingress_block_info.binder_type = TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS; + q->ingress_block_info.chain_head_change = clsact_chain_head_change; + q->ingress_block_info.chain_head_change_priv = &dev->ingress_cl_list; - err = tcf_block_get_ext(&q->ingress_block, &dev->ingress_cl_list, - sch, &q->ingress_block_info); + err = tcf_block_get_ext(&q->ingress_block, sch, &q->ingress_block_info); if (err) return err; q->egress_block_info.binder_type = TCF_BLOCK_BINDER_TYPE_CLSACT_EGRESS; + q->egress_block_info.chain_head_change = clsact_chain_head_change; + q->egress_block_info.chain_head_change_priv = &dev->egress_cl_list; - err = tcf_block_get_ext(&q->egress_block, &dev->egress_cl_list, - sch, &q->egress_block_info); + err = 
tcf_block_get_ext(&q->egress_block, sch, &q->egress_block_info); if (err) goto err_egress_block_get; @@ -183,20 +191,16 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt) return 0; err_egress_block_get: - tcf_block_put_ext(q->ingress_block, &dev->ingress_cl_list, - sch, &q->ingress_block_info); + tcf_block_put_ext(q->ingress_block, sch, &q->ingress_block_info); return err; } static void clsact_destroy(struct Qdisc *sch) { struct clsact_sched_data *q = qdisc_priv(sch); - struct net_device *dev = qdisc_dev(sch); - tcf_block_put_ext(q->egress_block, &dev->egress_cl_list, - sch, &q->egress_block_info); - tcf_block_put_ext(q->ingress_block, &dev->ingress_cl_list, - sch, &q->ingress_block_info); + tcf_block_put_ext(q->egress_block, sch, &q->egress_block_info); + tcf_block_put_ext(q->ingress_block, sch, &q->ingress_block_info); net_dec_ingress_queue(); net_dec_egress_queue(); -- cgit v1.2.3 From 46209401f8f6116bd0b2c2d14a63958e83ffca0b Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 3 Nov 2017 11:46:25 +0100 Subject: net: core: introduce mini_Qdisc and eliminate usage of tp->q for clsact fastpath In sch_handle_egress and sch_handle_ingress tp->q is used only in order to update stats. So stats and filter list are the only things that are needed in clsact qdisc fastpath processing. Introduce new mini_Qdisc struct to hold those items. Also, introduce a helper to swap the mini_Qdisc structures in case filter list head changes. This removes need for tp->q usage without added overhead. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/netdevice.h | 9 ++++++--- include/net/sch_generic.h | 32 ++++++++++++++++++++++++++++++++ net/core/dev.c | 21 +++++++++++---------- net/sched/sch_generic.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ net/sched/sch_ingress.c | 19 ++++++++++++++----- 5 files changed, 109 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5e02f79b2110..7de7656550c2 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1559,6 +1559,8 @@ enum netdev_priv_flags { * * @rx_handler: handler for received packets * @rx_handler_data: XXX: need comments on this one + * @miniq_ingress: ingress/clsact qdisc specific data for + * ingress processing * @ingress_queue: XXX: need comments on this one * @broadcast: hw bcast address * @@ -1576,7 +1578,8 @@ enum netdev_priv_flags { * @tx_global_lock: XXX: need comments on this one * * @xps_maps: XXX: need comments on this one - * + * @miniq_egress: clsact qdisc specific data for + * egress processing * @watchdog_timeo: Represents the timeout that is used by * the watchdog (see dev_watchdog()) * @watchdog_timer: List of timers @@ -1795,7 +1798,7 @@ struct net_device { void __rcu *rx_handler_data; #ifdef CONFIG_NET_CLS_ACT - struct tcf_proto __rcu *ingress_cl_list; + struct mini_Qdisc __rcu *miniq_ingress; #endif struct netdev_queue __rcu *ingress_queue; #ifdef CONFIG_NETFILTER_INGRESS @@ -1826,7 +1829,7 @@ struct net_device { struct xps_dev_maps __rcu *xps_maps; #endif #ifdef CONFIG_NET_CLS_ACT - struct tcf_proto __rcu *egress_cl_list; + struct mini_Qdisc __rcu *miniq_egress; #endif /* These may be needed for future network-power-down code. 
*/ diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index f230269e0bfb..c64e62c9450a 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -904,4 +904,36 @@ static inline void psched_ratecfg_getrate(struct tc_ratespec *res, res->linklayer = (r->linklayer & TC_LINKLAYER_MASK); } +/* Mini Qdisc serves for specific needs of ingress/clsact Qdisc. + * The fast path only needs to access filter list and to update stats + */ +struct mini_Qdisc { + struct tcf_proto *filter_list; + struct gnet_stats_basic_cpu __percpu *cpu_bstats; + struct gnet_stats_queue __percpu *cpu_qstats; + struct rcu_head rcu; +}; + +static inline void mini_qdisc_bstats_cpu_update(struct mini_Qdisc *miniq, + const struct sk_buff *skb) +{ + bstats_cpu_update(this_cpu_ptr(miniq->cpu_bstats), skb); +} + +static inline void mini_qdisc_qstats_cpu_drop(struct mini_Qdisc *miniq) +{ + this_cpu_inc(miniq->cpu_qstats->drops); +} + +struct mini_Qdisc_pair { + struct mini_Qdisc miniq1; + struct mini_Qdisc miniq2; + struct mini_Qdisc __rcu **p_miniq; +}; + +void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp, + struct tcf_proto *tp_head); +void mini_qdisc_pair_init(struct mini_Qdisc_pair *miniqp, struct Qdisc *qdisc, + struct mini_Qdisc __rcu **p_miniq); + #endif diff --git a/net/core/dev.c b/net/core/dev.c index 24ac9083bc13..1423cf4d695c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3274,22 +3274,22 @@ EXPORT_SYMBOL(dev_loopback_xmit); static struct sk_buff * sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) { - struct tcf_proto *cl = rcu_dereference_bh(dev->egress_cl_list); + struct mini_Qdisc *miniq = rcu_dereference_bh(dev->miniq_egress); struct tcf_result cl_res; - if (!cl) + if (!miniq) return skb; /* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */ - qdisc_bstats_cpu_update(cl->q, skb); + mini_qdisc_bstats_cpu_update(miniq, skb); - switch (tcf_classify(skb, cl, &cl_res, false)) { + switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) { case TC_ACT_OK: case TC_ACT_RECLASSIFY: skb->tc_index = TC_H_MIN(cl_res.classid); break; case TC_ACT_SHOT: - qdisc_qstats_cpu_drop(cl->q); + mini_qdisc_qstats_cpu_drop(miniq); *ret = NET_XMIT_DROP; kfree_skb(skb); return NULL; @@ -4189,7 +4189,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, struct net_device *orig_dev) { #ifdef CONFIG_NET_CLS_ACT - struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list); + struct mini_Qdisc *miniq = rcu_dereference_bh(skb->dev->miniq_ingress); struct tcf_result cl_res; /* If there's at least one ingress present somewhere (so @@ -4197,8 +4197,9 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, * that are not configured with an ingress qdisc will bail * out here. 
*/ - if (!cl) + if (!miniq) return skb; + if (*pt_prev) { *ret = deliver_skb(skb, *pt_prev, orig_dev); *pt_prev = NULL; @@ -4206,15 +4207,15 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, qdisc_skb_cb(skb)->pkt_len = skb->len; skb->tc_at_ingress = 1; - qdisc_bstats_cpu_update(cl->q, skb); + mini_qdisc_bstats_cpu_update(miniq, skb); - switch (tcf_classify(skb, cl, &cl_res, false)) { + switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) { case TC_ACT_OK: case TC_ACT_RECLASSIFY: skb->tc_index = TC_H_MIN(cl_res.classid); break; case TC_ACT_SHOT: - qdisc_qstats_cpu_drop(cl->q); + mini_qdisc_qstats_cpu_drop(miniq); kfree_skb(skb); return NULL; case TC_ACT_STOLEN: diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index aa74aa42b5d7..3839cbbdc32b 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -1024,3 +1024,49 @@ void psched_ratecfg_precompute(struct psched_ratecfg *r, } } EXPORT_SYMBOL(psched_ratecfg_precompute); + +static void mini_qdisc_rcu_func(struct rcu_head *head) +{ +} + +void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp, + struct tcf_proto *tp_head) +{ + struct mini_Qdisc *miniq_old = rtnl_dereference(*miniqp->p_miniq); + struct mini_Qdisc *miniq; + + if (!tp_head) { + RCU_INIT_POINTER(*miniqp->p_miniq, NULL); + return; + } + + miniq = !miniq_old || miniq_old == &miniqp->miniq2 ? + &miniqp->miniq1 : &miniqp->miniq2; + + /* We need to make sure that readers won't see the miniq + * we are about to modify. So wait until previous call_rcu_bh callback + * is done. + */ + rcu_barrier_bh(); + miniq->filter_list = tp_head; + rcu_assign_pointer(*miniqp->p_miniq, miniq); + + if (miniq_old) + /* This is counterpart of the rcu barrier above. We need to + * block potential new user of miniq_old until all readers + * are not seeing it. 
+ */ + call_rcu_bh(&miniq_old->rcu, mini_qdisc_rcu_func); +} +EXPORT_SYMBOL(mini_qdisc_pair_swap); + +void mini_qdisc_pair_init(struct mini_Qdisc_pair *miniqp, struct Qdisc *qdisc, + struct mini_Qdisc __rcu **p_miniq) +{ + miniqp->miniq1.cpu_bstats = qdisc->cpu_bstats; + miniqp->miniq1.cpu_qstats = qdisc->cpu_qstats; + miniqp->miniq2.cpu_bstats = qdisc->cpu_bstats; + miniqp->miniq2.cpu_qstats = qdisc->cpu_qstats; + miniqp->p_miniq = p_miniq; +} +EXPORT_SYMBOL(mini_qdisc_pair_init); diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c index 811845815b8c..5ecc38f35d47 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -21,6 +21,7 @@ struct ingress_sched_data { struct tcf_block *block; struct tcf_block_ext_info block_info; + struct mini_Qdisc_pair miniqp; }; static struct Qdisc *ingress_leaf(struct Qdisc *sch, unsigned long arg) @@ -56,9 +57,9 @@ static struct tcf_block *ingress_tcf_block(struct Qdisc *sch, unsigned long cl) static void clsact_chain_head_change(struct tcf_proto *tp_head, void *priv) { - struct tcf_proto __rcu **p_filter_chain = priv; + struct mini_Qdisc_pair *miniqp = priv; - rcu_assign_pointer(*p_filter_chain, tp_head); + mini_qdisc_pair_swap(miniqp, tp_head); } static int ingress_init(struct Qdisc *sch, struct nlattr *opt) @@ -67,9 +68,11 @@ static int ingress_init(struct Qdisc *sch, struct nlattr *opt) struct net_device *dev = qdisc_dev(sch); int err; + mini_qdisc_pair_init(&q->miniqp, sch, &dev->miniq_ingress); + q->block_info.binder_type = TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS; q->block_info.chain_head_change = clsact_chain_head_change; - q->block_info.chain_head_change_priv = &dev->ingress_cl_list; + q->block_info.chain_head_change_priv = &q->miniqp; err = tcf_block_get_ext(&q->block, sch, &q->block_info); if (err) @@ -128,6 +131,8 @@ struct clsact_sched_data { struct tcf_block *egress_block; struct tcf_block_ext_info ingress_block_info; struct tcf_block_ext_info egress_block_info; + struct mini_Qdisc_pair miniqp_ingress; + struct mini_Qdisc_pair miniqp_egress; }; static unsigned long clsact_find(struct Qdisc *sch, u32 classid) @@ -167,17 +172,21 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt) struct net_device *dev = qdisc_dev(sch); int err; + mini_qdisc_pair_init(&q->miniqp_ingress, sch, &dev->miniq_ingress); + q->ingress_block_info.binder_type = TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS; q->ingress_block_info.chain_head_change = clsact_chain_head_change; - q->ingress_block_info.chain_head_change_priv = &dev->ingress_cl_list; + q->ingress_block_info.chain_head_change_priv = &q->miniqp_ingress; err = tcf_block_get_ext(&q->ingress_block, sch, &q->ingress_block_info); if (err) return err; + mini_qdisc_pair_init(&q->miniqp_egress, sch, &dev->miniq_egress); + q->egress_block_info.binder_type = TCF_BLOCK_BINDER_TYPE_CLSACT_EGRESS; q->egress_block_info.chain_head_change = clsact_chain_head_change; - q->egress_block_info.chain_head_change_priv = &dev->egress_cl_list; + q->egress_block_info.chain_head_change_priv = &q->miniqp_egress; err = tcf_block_get_ext(&q->egress_block, sch, &q->egress_block_info); if (err) -- cgit v1.2.3 From 27c565ae9d554fa1c00c799754cff43476c8d3b5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 4 Nov 2017 08:53:27 -0700 Subject: ipv6: remove IN6_ADDR_HSIZE from addrconf.h IN6_ADDR_HSIZE is private to addrconf.c, move it here to avoid confusion. Signed-off-by: Eric Dumazet Acked-by: David Ahern Signed-off-by: David S. 
Miller --- include/net/addrconf.h | 3 --- net/ipv6/addrconf.c | 2 ++ 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 3357332ea375..b623b65a79d1 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -59,9 +59,6 @@ struct in6_validator_info { struct netlink_ext_ack *extack; }; -#define IN6_ADDR_HSIZE_SHIFT 8 -#define IN6_ADDR_HSIZE (1 << IN6_ADDR_HSIZE_SHIFT) - int addrconf_init(void); void addrconf_cleanup(void); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 69b8cdb43aa2..66d8c3d912fd 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -157,6 +157,8 @@ static int ipv6_generate_stable_address(struct in6_addr *addr, u8 dad_count, const struct inet6_dev *idev); +#define IN6_ADDR_HSIZE_SHIFT 8 +#define IN6_ADDR_HSIZE (1 << IN6_ADDR_HSIZE_SHIFT) /* * Configured unicast address hash table */ -- cgit v1.2.3 From ee20598194500e82c477cf13e52b58e569446ed0 Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Tue, 18 Jul 2017 15:42:15 -0500 Subject: net/dcb: Add dscp to priority selector type IEEE specification P802.1Qcd/D2.1 defines priority selector 5. This APP TLV selector defines DSCP to priority map. This patch defines such DSCP selector. Signed-off-by: Huy Nguyen Reviewed-by: Parav Pandit Reviewed-by: Or Gerlitz Signed-off-by: Saeed Mahameed --- include/uapi/linux/dcbnl.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/dcbnl.h b/include/uapi/linux/dcbnl.h index b6170a6af7c2..2c0c6453c3f4 100644 --- a/include/uapi/linux/dcbnl.h +++ b/include/uapi/linux/dcbnl.h @@ -206,6 +206,7 @@ struct cee_pfc { #define IEEE_8021QAZ_APP_SEL_STREAM 2 #define IEEE_8021QAZ_APP_SEL_DGRAM 3 #define IEEE_8021QAZ_APP_SEL_ANY 4 +#define IEEE_8021QAZ_APP_SEL_DSCP 5 /* This structure contains the IEEE 802.1Qaz APP managed object. This * object is also used for the CEE std as well. -- cgit v1.2.3 From c02762eb20cb57ec5b7c037b056c37d5838c803f Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Tue, 18 Jul 2017 16:03:17 -0500 Subject: net/mlx5: QCAM register firmware command support The QCAM register provides capability bit for all the QoS registers using ACCESS_REG command. 
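Once mlx5_query_hca_caps() has cached the QCAM output in dev->caps.qcam, later code can gate QoS features on individual bits. A minimal, hypothetical sketch of such a check using the MLX5_CAP_QCAM_FEATURE() helper added below; the probe context around it is assumed, not part of this patch:

/* Hypothetical caller: only proceed if the device exposes the QCAM
 * register at all, then test one of its feature bits. */
if (MLX5_CAP_GEN(mdev, qcam_reg) &&
    MLX5_CAP_QCAM_FEATURE(mdev, qpts_trust_both)) {
	/* the port can be configured to trust both PCP and DSCP */
}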
Signed-off-by: Huy Nguyen Reviewed-by: Parav Pandit Reviewed-by: Or Gerlitz Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/fw.c | 10 ++++++ .../net/ethernet/mellanox/mlx5/core/mlx5_core.h | 2 ++ drivers/net/ethernet/mellanox/mlx5/core/port.c | 12 +++++++ include/linux/mlx5/device.h | 14 ++++++++ include/linux/mlx5/driver.h | 2 ++ include/linux/mlx5/mlx5_ifc.h | 40 +++++++++++++++++++++- 6 files changed, 79 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw.c b/drivers/net/ethernet/mellanox/mlx5/core/fw.c index 2c71557d1cee..5ef1b56b6a96 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fw.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fw.c @@ -106,6 +106,13 @@ static int mlx5_get_mcam_reg(struct mlx5_core_dev *dev) MLX5_MCAM_REGS_FIRST_128); } +static int mlx5_get_qcam_reg(struct mlx5_core_dev *dev) +{ + return mlx5_query_qcam_reg(dev, dev->caps.qcam, + MLX5_QCAM_FEATURE_ENHANCED_FEATURES, + MLX5_QCAM_REGS_FIRST_128); +} + int mlx5_query_hca_caps(struct mlx5_core_dev *dev) { int err; @@ -182,6 +189,9 @@ int mlx5_query_hca_caps(struct mlx5_core_dev *dev) if (MLX5_CAP_GEN(dev, mcam_reg)) mlx5_get_mcam_reg(dev); + if (MLX5_CAP_GEN(dev, qcam_reg)) + mlx5_get_qcam_reg(dev); + return 0; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h index 8f00de2fe283..ff4a0b889a6f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h @@ -122,6 +122,8 @@ int mlx5_query_pcam_reg(struct mlx5_core_dev *dev, u32 *pcam, u8 feature_group, u8 access_reg_group); int mlx5_query_mcam_reg(struct mlx5_core_dev *dev, u32 *mcap, u8 feature_group, u8 access_reg_group); +int mlx5_query_qcam_reg(struct mlx5_core_dev *mdev, u32 *qcam, + u8 feature_group, u8 access_reg_group); void mlx5_lag_add(struct mlx5_core_dev *dev, struct net_device *netdev); void mlx5_lag_remove(struct mlx5_core_dev *dev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c index e07061f565d6..b6553be841f9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/port.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c @@ -98,6 +98,18 @@ int mlx5_query_mcam_reg(struct mlx5_core_dev *dev, u32 *mcam, u8 feature_group, return mlx5_core_access_reg(dev, in, sz, mcam, sz, MLX5_REG_MCAM, 0, 0); } +int mlx5_query_qcam_reg(struct mlx5_core_dev *mdev, u32 *qcam, + u8 feature_group, u8 access_reg_group) +{ + u32 in[MLX5_ST_SZ_DW(qcam_reg)] = {}; + int sz = MLX5_ST_SZ_BYTES(qcam_reg); + + MLX5_SET(qcam_reg, in, feature_group, feature_group); + MLX5_SET(qcam_reg, in, access_reg_group, access_reg_group); + + return mlx5_core_access_reg(mdev, in, sz, qcam, sz, MLX5_REG_QCAM, 0, 0); +} + struct mlx5_reg_pcap { u8 rsvd0; u8 port_num; diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index e32dbc4934db..6d79b3f79458 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1000,6 +1000,14 @@ enum mlx5_mcam_feature_groups { MLX5_MCAM_FEATURE_ENHANCED_FEATURES = 0x0, }; +enum mlx5_qcam_reg_groups { + MLX5_QCAM_REGS_FIRST_128 = 0x0, +}; + +enum mlx5_qcam_feature_groups { + MLX5_QCAM_FEATURE_ENHANCED_FEATURES = 0x0, +}; + /* GET Dev Caps macros */ #define MLX5_CAP_GEN(mdev, cap) \ MLX5_GET(cmd_hca_cap, mdev->caps.hca_cur[MLX5_CAP_GENERAL], cap) @@ -1108,6 +1116,12 @@ enum mlx5_mcam_feature_groups { #define MLX5_CAP_MCAM_FEATURE(mdev, fld) \ 
MLX5_GET(mcam_reg, (mdev)->caps.mcam, mng_feature_cap_mask.enhanced_features.fld) +#define MLX5_CAP_QCAM_REG(mdev, fld) \ + MLX5_GET(qcam_reg, (mdev)->caps.qcam, qos_access_reg_cap_mask.reg_cap.fld) + +#define MLX5_CAP_QCAM_FEATURE(mdev, fld) \ + MLX5_GET(qcam_reg, (mdev)->caps.qcam, qos_feature_cap_mask.feature_cap.fld) + #define MLX5_CAP_FPGA(mdev, cap) \ MLX5_GET(fpga_cap, (mdev)->caps.fpga, cap) diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 08c77b7e59cb..ed5be52282ea 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -109,6 +109,7 @@ enum { enum { MLX5_REG_QETCR = 0x4005, MLX5_REG_QTCT = 0x400a, + MLX5_REG_QCAM = 0x4019, MLX5_REG_DCBX_PARAM = 0x4020, MLX5_REG_DCBX_APP = 0x4021, MLX5_REG_FPGA_CAP = 0x4022, @@ -798,6 +799,7 @@ struct mlx5_core_dev { u32 pcam[MLX5_ST_SZ_DW(pcam_reg)]; u32 mcam[MLX5_ST_SZ_DW(mcam_reg)]; u32 fpga[MLX5_ST_SZ_DW(fpga_cap)]; + u32 qcam[MLX5_ST_SZ_DW(qcam_reg)]; } caps; phys_addr_t iseg_base; struct mlx5_init_seg __iomem *iseg; diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 69772347f866..f127c5b310c5 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -838,7 +838,8 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 cc_modify_allowed[0x1]; u8 start_pad[0x1]; u8 cache_line_128byte[0x1]; - u8 reserved_at_165[0xb]; + u8 reserved_at_165[0xa]; + u8 qcam_reg[0x1]; u8 gid_table_size[0x10]; u8 out_of_seq_cnt[0x1]; @@ -7890,6 +7891,43 @@ struct mlx5_ifc_mcam_reg_bits { u8 reserved_at_1c0[0x80]; }; +struct mlx5_ifc_qcam_access_reg_cap_mask { + u8 qcam_access_reg_cap_mask_127_to_20[0x6C]; + u8 qpdpm[0x1]; + u8 qcam_access_reg_cap_mask_18_to_4[0x0F]; + u8 qdpm[0x1]; + u8 qpts[0x1]; + u8 qcap[0x1]; + u8 qcam_access_reg_cap_mask_0[0x1]; +}; + +struct mlx5_ifc_qcam_qos_feature_cap_mask { + u8 qcam_qos_feature_cap_mask_127_to_1[0x7F]; + u8 qpts_trust_both[0x1]; +}; + +struct mlx5_ifc_qcam_reg_bits { + u8 reserved_at_0[0x8]; + u8 feature_group[0x8]; + u8 reserved_at_10[0x8]; + u8 access_reg_group[0x8]; + u8 reserved_at_20[0x20]; + + union { + struct mlx5_ifc_qcam_access_reg_cap_mask reg_cap; + u8 reserved_at_0[0x80]; + } qos_access_reg_cap_mask; + + u8 reserved_at_c0[0x80]; + + union { + struct mlx5_ifc_qcam_qos_feature_cap_mask feature_cap; + u8 reserved_at_0[0x80]; + } qos_feature_cap_mask; + + u8 reserved_at_1c0[0x80]; +}; + struct mlx5_ifc_pcap_reg_bits { u8 reserved_at_0[0x8]; u8 local_port[0x8]; -- cgit v1.2.3 From 71c70eb21c33c60433b95e72a59d40bb128db649 Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Wed, 2 Aug 2017 21:36:23 -0500 Subject: net/mlx5: Add MLX5_SET16 and MLX5_GET16 Add MLX5_SET16 and MLX5_GET16 for 16bit structure field in firmware command. 
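As a sanity check of the bit arithmetic, here is a small standalone model (a simplification for illustration, not the kernel macros themselves) of what MLX5_SET16()/MLX5_GET16() do for the 4-bit prio field of qpdpm_dscp_reg introduced later in this series. The field sits at bit offset 12 from the MSB of a big-endian 16-bit word, so __mlx5_16_bit_off() works out to 16 - 4 - 12 = 0, i.e. the low nibble:

/* Userspace model of the 16-bit accessors for one field: prio is 4 bits
 * wide at bit offset 12 from the MSB, i.e. the low nibble of the word. */
#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>	/* htons/ntohs stand in for cpu_to_be16/be16_to_cpu */

#define FLD_SZ		4			/* __mlx5_bit_sz(qpdpm_dscp_reg, prio) */
#define FLD_OFF		(16 - FLD_SZ - 12)	/* __mlx5_16_bit_off() */
#define FLD_MASK	((uint16_t)((1u << FLD_SZ) - 1))

static void set16(uint16_t *p, uint16_t v)
{
	*p = htons((uint16_t)((ntohs(*p) & ~(FLD_MASK << FLD_OFF)) |
			      ((v & FLD_MASK) << FLD_OFF)));
}

static uint16_t get16(const uint16_t *p)
{
	return (ntohs(*p) >> FLD_OFF) & FLD_MASK;
}

int main(void)
{
	uint16_t reg = 0;	/* one qpdpm_dscp_reg entry, big endian */

	set16(&reg, 5);		/* map this DSCP entry to priority 5 */
	printf("prio = %u, raw = 0x%04x\n", get16(&reg), ntohs(reg));
	return 0;
}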
Signed-off-by: Huy Nguyen Reviewed-by: Parav Pandit Reviewed-by: Eli Cohen Signed-off-by: Saeed Mahameed --- include/linux/mlx5/device.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 6d79b3f79458..409ffb14298a 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -49,11 +49,15 @@ #define __mlx5_nullp(typ) ((struct mlx5_ifc_##typ##_bits *)0) #define __mlx5_bit_sz(typ, fld) sizeof(__mlx5_nullp(typ)->fld) #define __mlx5_bit_off(typ, fld) (offsetof(struct mlx5_ifc_##typ##_bits, fld)) +#define __mlx5_16_off(typ, fld) (__mlx5_bit_off(typ, fld) / 16) #define __mlx5_dw_off(typ, fld) (__mlx5_bit_off(typ, fld) / 32) #define __mlx5_64_off(typ, fld) (__mlx5_bit_off(typ, fld) / 64) +#define __mlx5_16_bit_off(typ, fld) (16 - __mlx5_bit_sz(typ, fld) - (__mlx5_bit_off(typ, fld) & 0xf)) #define __mlx5_dw_bit_off(typ, fld) (32 - __mlx5_bit_sz(typ, fld) - (__mlx5_bit_off(typ, fld) & 0x1f)) #define __mlx5_mask(typ, fld) ((u32)((1ull << __mlx5_bit_sz(typ, fld)) - 1)) #define __mlx5_dw_mask(typ, fld) (__mlx5_mask(typ, fld) << __mlx5_dw_bit_off(typ, fld)) +#define __mlx5_mask16(typ, fld) ((u16)((1ull << __mlx5_bit_sz(typ, fld)) - 1)) +#define __mlx5_16_mask(typ, fld) (__mlx5_mask16(typ, fld) << __mlx5_16_bit_off(typ, fld)) #define __mlx5_st_sz_bits(typ) sizeof(struct mlx5_ifc_##typ##_bits) #define MLX5_FLD_SZ_BYTES(typ, fld) (__mlx5_bit_sz(typ, fld) / 8) @@ -116,6 +120,19 @@ __mlx5_mask(typ, fld)) ___t; \ }) +#define MLX5_GET16(typ, p, fld) ((be16_to_cpu(*((__be16 *)(p) +\ +__mlx5_16_off(typ, fld))) >> __mlx5_16_bit_off(typ, fld)) & \ +__mlx5_mask16(typ, fld)) + +#define MLX5_SET16(typ, p, fld, v) do { \ + u16 _v = v; \ + BUILD_BUG_ON(__mlx5_st_sz_bits(typ) % 16); \ + *((__be16 *)(p) + __mlx5_16_off(typ, fld)) = \ + cpu_to_be16((be16_to_cpu(*((__be16 *)(p) + __mlx5_16_off(typ, fld))) & \ + (~__mlx5_16_mask(typ, fld))) | (((_v) & __mlx5_mask16(typ, fld)) \ + << __mlx5_16_bit_off(typ, fld))); \ +} while (0) + /* Big endian getters */ #define MLX5_GET64_BE(typ, p, fld) (*((__be64 *)(p) +\ __mlx5_64_off(typ, fld))) -- cgit v1.2.3 From 415a64aa8dc6b4fc478609c549ca652d95a12f13 Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Tue, 18 Jul 2017 16:08:46 -0500 Subject: net/mlx5: QPTS and QPDPM register firmware command support The QPTS register allows changing the priority trust state between pcp and dscp. Add support to get/set trust state from device. When the port is in pcp/dscp trust state, a packet is routed by hardware to the matching priority based on its pcp/dscp value respectively. The QPDPM register allows changing the dscp to priority mapping. Add support to get/set dscp to priority mapping from device. Note that to change a dscp mapping, the "e" bit of this dscp structure must be set in the QPDPM firmware command. 
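A hedged sketch of how a caller might drive the two new helpers; the mdev handle and the calling context (presumably the mlx5e DCB code) are assumptions here, not part of this patch:

/* Sketch only: put the port into DSCP trust state, then map DSCP 46
 * (EF) to priority 5 with the helpers this patch exports. */
int err;

err = mlx5_set_trust_state(mdev, MLX5_QPTS_TRUST_DSCP);
if (err)
	return err;

/* mlx5_set_dscp2prio() performs the read-modify-write of QPDPM and
 * sets the "e" bit only for the entry being changed. */
err = mlx5_set_dscp2prio(mdev, 46, 5);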
Signed-off-by: Huy Nguyen Reviewed-by: Parav Pandit Reviewed-by: Or Gerlitz Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/port.c | 99 ++++++++++++++++++++++++++ include/linux/mlx5/driver.h | 7 ++ include/linux/mlx5/mlx5_ifc.h | 20 ++++++ include/linux/mlx5/port.h | 5 ++ 4 files changed, 131 insertions(+) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c index b6553be841f9..c37d00cd472a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/port.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c @@ -971,3 +971,102 @@ int mlx5_set_mtppse(struct mlx5_core_dev *mdev, u8 pin, u8 arm, u8 mode) return mlx5_core_access_reg(mdev, in, sizeof(in), out, sizeof(out), MLX5_REG_MTPPSE, 0, 1); } + +int mlx5_set_trust_state(struct mlx5_core_dev *mdev, u8 trust_state) +{ + u32 out[MLX5_ST_SZ_DW(qpts_reg)] = {}; + u32 in[MLX5_ST_SZ_DW(qpts_reg)] = {}; + int err; + + MLX5_SET(qpts_reg, in, local_port, 1); + MLX5_SET(qpts_reg, in, trust_state, trust_state); + + err = mlx5_core_access_reg(mdev, in, sizeof(in), out, + sizeof(out), MLX5_REG_QPTS, 0, 1); + return err; +} + +int mlx5_query_trust_state(struct mlx5_core_dev *mdev, u8 *trust_state) +{ + u32 out[MLX5_ST_SZ_DW(qpts_reg)] = {}; + u32 in[MLX5_ST_SZ_DW(qpts_reg)] = {}; + int err; + + MLX5_SET(qpts_reg, in, local_port, 1); + + err = mlx5_core_access_reg(mdev, in, sizeof(in), out, + sizeof(out), MLX5_REG_QPTS, 0, 0); + if (!err) + *trust_state = MLX5_GET(qpts_reg, out, trust_state); + + return err; +} + +int mlx5_set_dscp2prio(struct mlx5_core_dev *mdev, u8 dscp, u8 prio) +{ + int sz = MLX5_ST_SZ_BYTES(qpdpm_reg); + void *qpdpm_dscp; + void *out; + void *in; + int err; + + in = kzalloc(sz, GFP_KERNEL); + out = kzalloc(sz, GFP_KERNEL); + if (!in || !out) { + err = -ENOMEM; + goto out; + } + + MLX5_SET(qpdpm_reg, in, local_port, 1); + err = mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_QPDPM, 0, 0); + if (err) + goto out; + + memcpy(in, out, sz); + MLX5_SET(qpdpm_reg, in, local_port, 1); + + /* Update the corresponding dscp entry */ + qpdpm_dscp = MLX5_ADDR_OF(qpdpm_reg, in, dscp[dscp]); + MLX5_SET16(qpdpm_dscp_reg, qpdpm_dscp, prio, prio); + MLX5_SET16(qpdpm_dscp_reg, qpdpm_dscp, e, 1); + err = mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_QPDPM, 0, 1); + +out: + kfree(in); + kfree(out); + return err; +} + +/* dscp2prio[i]: priority that dscp i mapped to */ +#define MLX5E_SUPPORTED_DSCP 64 +int mlx5_query_dscp2prio(struct mlx5_core_dev *mdev, u8 *dscp2prio) +{ + int sz = MLX5_ST_SZ_BYTES(qpdpm_reg); + void *qpdpm_dscp; + void *out; + void *in; + int err; + int i; + + in = kzalloc(sz, GFP_KERNEL); + out = kzalloc(sz, GFP_KERNEL); + if (!in || !out) { + err = -ENOMEM; + goto out; + } + + MLX5_SET(qpdpm_reg, in, local_port, 1); + err = mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_QPDPM, 0, 0); + if (err) + goto out; + + for (i = 0; i < (MLX5E_SUPPORTED_DSCP); i++) { + qpdpm_dscp = MLX5_ADDR_OF(qpdpm_reg, out, dscp[i]); + dscp2prio[i] = MLX5_GET16(qpdpm_dscp_reg, qpdpm_dscp, prio); + } + +out: + kfree(in); + kfree(out); + return err; +} diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index ed5be52282ea..a886b51511ab 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -107,8 +107,10 @@ enum { }; enum { + MLX5_REG_QPTS = 0x4002, MLX5_REG_QETCR = 0x4005, MLX5_REG_QTCT = 0x400a, + MLX5_REG_QPDPM = 0x4013, MLX5_REG_QCAM = 0x4019, MLX5_REG_DCBX_PARAM = 0x4020, MLX5_REG_DCBX_APP = 0x4021, 
@@ -142,6 +144,11 @@ enum { MLX5_REG_MCAM = 0x907f, }; +enum mlx5_qpts_trust_state { + MLX5_QPTS_TRUST_PCP = 1, + MLX5_QPTS_TRUST_DSCP = 2, +}; + enum mlx5_dcbx_oper_mode { MLX5E_DCBX_PARAM_VER_OPER_HOST = 0x0, MLX5E_DCBX_PARAM_VER_OPER_AUTO = 0x3, diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index f127c5b310c5..3e5363f760dd 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -8578,6 +8578,26 @@ struct mlx5_ifc_qetc_reg_bits { struct mlx5_ifc_ets_global_config_reg_bits global_configuration; }; +struct mlx5_ifc_qpdpm_dscp_reg_bits { + u8 e[0x1]; + u8 reserved_at_01[0x0b]; + u8 prio[0x04]; +}; + +struct mlx5_ifc_qpdpm_reg_bits { + u8 reserved_at_0[0x8]; + u8 local_port[0x8]; + u8 reserved_at_10[0x10]; + struct mlx5_ifc_qpdpm_dscp_reg_bits dscp[64]; +}; + +struct mlx5_ifc_qpts_reg_bits { + u8 reserved_at_0[0x8]; + u8 local_port[0x8]; + u8 reserved_at_10[0x2d]; + u8 trust_state[0x3]; +}; + struct mlx5_ifc_qtct_reg_bits { u8 reserved_at_0[0x8]; u8 port_number[0x8]; diff --git a/include/linux/mlx5/port.h b/include/linux/mlx5/port.h index c59af8ab753a..035f0d4dc9fe 100644 --- a/include/linux/mlx5/port.h +++ b/include/linux/mlx5/port.h @@ -179,4 +179,9 @@ int mlx5_query_module_eeprom(struct mlx5_core_dev *dev, int mlx5_query_port_dcbx_param(struct mlx5_core_dev *mdev, u32 *out); int mlx5_set_port_dcbx_param(struct mlx5_core_dev *mdev, u32 *in); + +int mlx5_set_trust_state(struct mlx5_core_dev *mdev, u8 trust_state); +int mlx5_query_trust_state(struct mlx5_core_dev *mdev, u8 *trust_state); +int mlx5_set_dscp2prio(struct mlx5_core_dev *mdev, u8 dscp, u8 prio); +int mlx5_query_dscp2prio(struct mlx5_core_dev *mdev, u8 *dscp2prio); #endif /* __MLX5_PORT_H__ */ -- cgit v1.2.3 From 9354d452034273a50a4fd703bea31e5d6b1fc20b Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Thu, 2 Nov 2017 17:04:37 -0200 Subject: openvswitch: reliable interface identification in port dumps This patch allows reliable identification of netdevice interfaces connected to openvswitch bridges. In particular, user space queries the netdev interfaces belonging to the ports for statistics, up/down state, etc. Datapath dump needs to provide enough information for the user space to be able to do that. Currently, only interface names are returned. This is not sufficient, as openvswitch allows its ports to be in different name spaces and the interface name is valid only in its name space. What is needed, and generally used in other netlink APIs, is the pair ifindex+netnsid. The solution is the addition of the ifindex+netnsid pair (or only ifindex if in the same name space) to the vport get/dump operation. On the request side, ideally the ifindex+netnsid pair could be used to get/set/del the corresponding vport. This is not implemented by this patch and can be added later if needed. Signed-off-by: Jiri Benc Signed-off-by: David S. 
Miller --- include/uapi/linux/openvswitch.h | 2 ++ net/openvswitch/datapath.c | 47 +++++++++++++++++++++++++++++----------- net/openvswitch/datapath.h | 4 ++-- net/openvswitch/dp_notify.c | 4 ++-- 4 files changed, 40 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index ffe397daad49..501e4c4e2a03 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -258,6 +258,8 @@ enum ovs_vport_attr { /* receiving upcalls */ OVS_VPORT_ATTR_STATS, /* struct ovs_vport_stats */ OVS_VPORT_ATTR_PAD, + OVS_VPORT_ATTR_IFINDEX, + OVS_VPORT_ATTR_NETNSID, __OVS_VPORT_ATTR_MAX }; diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index c3aec6227c91..4d38ac044cee 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -1848,7 +1848,8 @@ static struct genl_family dp_datapath_genl_family __ro_after_init = { /* Called with ovs_mutex or RCU read lock. */ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb, - u32 portid, u32 seq, u32 flags, u8 cmd) + struct net *net, u32 portid, u32 seq, + u32 flags, u8 cmd) { struct ovs_header *ovs_header; struct ovs_vport_stats vport_stats; @@ -1864,9 +1865,17 @@ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb, if (nla_put_u32(skb, OVS_VPORT_ATTR_PORT_NO, vport->port_no) || nla_put_u32(skb, OVS_VPORT_ATTR_TYPE, vport->ops->type) || nla_put_string(skb, OVS_VPORT_ATTR_NAME, - ovs_vport_name(vport))) + ovs_vport_name(vport)) || + nla_put_u32(skb, OVS_VPORT_ATTR_IFINDEX, vport->dev->ifindex)) goto nla_put_failure; + if (!net_eq(net, dev_net(vport->dev))) { + int id = peernet2id_alloc(net, dev_net(vport->dev)); + + if (nla_put_s32(skb, OVS_VPORT_ATTR_NETNSID, id)) + goto nla_put_failure; + } + ovs_vport_get_stats(vport, &vport_stats); if (nla_put_64bit(skb, OVS_VPORT_ATTR_STATS, sizeof(struct ovs_vport_stats), &vport_stats, @@ -1896,8 +1905,8 @@ static struct sk_buff *ovs_vport_cmd_alloc_info(void) } /* Called with ovs_mutex, only via ovs_dp_notify_wq(). */ -struct sk_buff *ovs_vport_cmd_build_info(struct vport *vport, u32 portid, - u32 seq, u8 cmd) +struct sk_buff *ovs_vport_cmd_build_info(struct vport *vport, struct net *net, + u32 portid, u32 seq, u8 cmd) { struct sk_buff *skb; int retval; @@ -1906,7 +1915,7 @@ struct sk_buff *ovs_vport_cmd_build_info(struct vport *vport, u32 portid, if (!skb) return ERR_PTR(-ENOMEM); - retval = ovs_vport_cmd_fill_info(vport, skb, portid, seq, 0, cmd); + retval = ovs_vport_cmd_fill_info(vport, skb, net, portid, seq, 0, cmd); BUG_ON(retval < 0); return skb; @@ -1920,6 +1929,8 @@ static struct vport *lookup_vport(struct net *net, struct datapath *dp; struct vport *vport; + if (a[OVS_VPORT_ATTR_IFINDEX]) + return ERR_PTR(-EOPNOTSUPP); if (a[OVS_VPORT_ATTR_NAME]) { vport = ovs_vport_locate(net, nla_data(a[OVS_VPORT_ATTR_NAME])); if (!vport) @@ -1944,6 +1955,7 @@ static struct vport *lookup_vport(struct net *net, return vport; } else return ERR_PTR(-EINVAL); + } /* Called with ovs_mutex */ @@ -1983,6 +1995,8 @@ static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info) if (!a[OVS_VPORT_ATTR_NAME] || !a[OVS_VPORT_ATTR_TYPE] || !a[OVS_VPORT_ATTR_UPCALL_PID]) return -EINVAL; + if (a[OVS_VPORT_ATTR_IFINDEX]) + return -EOPNOTSUPP; port_no = a[OVS_VPORT_ATTR_PORT_NO] ? 
nla_get_u32(a[OVS_VPORT_ATTR_PORT_NO]) : 0; @@ -2032,8 +2046,9 @@ restart: goto exit_unlock_free; } - err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid, - info->snd_seq, 0, OVS_VPORT_CMD_NEW); + err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info), + info->snd_portid, info->snd_seq, 0, + OVS_VPORT_CMD_NEW); if (netdev_get_fwd_headroom(vport->dev) > dp->max_headroom) update_headroom(dp); @@ -2090,8 +2105,9 @@ static int ovs_vport_cmd_set(struct sk_buff *skb, struct genl_info *info) goto exit_unlock_free; } - err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid, - info->snd_seq, 0, OVS_VPORT_CMD_NEW); + err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info), + info->snd_portid, info->snd_seq, 0, + OVS_VPORT_CMD_NEW); BUG_ON(err < 0); ovs_unlock(); @@ -2128,8 +2144,9 @@ static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info) goto exit_unlock_free; } - err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid, - info->snd_seq, 0, OVS_VPORT_CMD_DEL); + err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info), + info->snd_portid, info->snd_seq, 0, + OVS_VPORT_CMD_DEL); BUG_ON(err < 0); /* the vport deletion may trigger dp headroom update */ @@ -2169,8 +2186,9 @@ static int ovs_vport_cmd_get(struct sk_buff *skb, struct genl_info *info) err = PTR_ERR(vport); if (IS_ERR(vport)) goto exit_unlock_free; - err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid, - info->snd_seq, 0, OVS_VPORT_CMD_NEW); + err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info), + info->snd_portid, info->snd_seq, 0, + OVS_VPORT_CMD_NEW); BUG_ON(err < 0); rcu_read_unlock(); @@ -2202,6 +2220,7 @@ static int ovs_vport_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node) { if (j >= skip && ovs_vport_cmd_fill_info(vport, skb, + sock_net(skb->sk), NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, @@ -2228,6 +2247,8 @@ static const struct nla_policy vport_policy[OVS_VPORT_ATTR_MAX + 1] = { [OVS_VPORT_ATTR_TYPE] = { .type = NLA_U32 }, [OVS_VPORT_ATTR_UPCALL_PID] = { .type = NLA_U32 }, [OVS_VPORT_ATTR_OPTIONS] = { .type = NLA_NESTED }, + [OVS_VPORT_ATTR_IFINDEX] = { .type = NLA_U32 }, + [OVS_VPORT_ATTR_NETNSID] = { .type = NLA_S32 }, }; static const struct genl_ops dp_vport_genl_ops[] = { diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index 480600649d0b..4a104ef9e12c 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -200,8 +200,8 @@ int ovs_dp_upcall(struct datapath *, struct sk_buff *, uint32_t cutlen); const char *ovs_dp_name(const struct datapath *dp); -struct sk_buff *ovs_vport_cmd_build_info(struct vport *, u32 pid, u32 seq, - u8 cmd); +struct sk_buff *ovs_vport_cmd_build_info(struct vport *vport, struct net *net, + u32 portid, u32 seq, u8 cmd); int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb, const struct sw_flow_actions *, struct sw_flow_key *); diff --git a/net/openvswitch/dp_notify.c b/net/openvswitch/dp_notify.c index 653d073bae45..f3ee2f2825c0 100644 --- a/net/openvswitch/dp_notify.c +++ b/net/openvswitch/dp_notify.c @@ -30,8 +30,8 @@ static void dp_detach_port_notify(struct vport *vport) struct datapath *dp; dp = vport->dp; - notify = ovs_vport_cmd_build_info(vport, 0, 0, - OVS_VPORT_CMD_DEL); + notify = ovs_vport_cmd_build_info(vport, ovs_dp_get_net(dp), + 0, 0, OVS_VPORT_CMD_DEL); ovs_dp_detach_port(vport); if (IS_ERR(notify)) { genl_set_err(&dp_vport_genl_family, ovs_dp_get_net(dp), 0, -- cgit 
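Taken out of the diff, the dump-side pattern this patch adds to ovs_vport_cmd_fill_info() is worth seeing in isolation. The following is a minimal sketch, not code from the patch: fill_port_ids() is a hypothetical helper, while nla_put_u32(), nla_put_s32(), net_eq(), dev_net() and peernet2id_alloc() are the existing kernel primitives it builds on.

#include <linux/openvswitch.h>
#include <net/net_namespace.h>
#include <net/netlink.h>

/* Always emit the ifindex; qualify it with a netnsid only when the
 * port's device lives in a different name space than the requester.
 */
static int fill_port_ids(struct sk_buff *skb, struct net *net,
			 struct net_device *dev)
{
	/* An ifindex alone is ambiguous across name spaces... */
	if (nla_put_u32(skb, OVS_VPORT_ATTR_IFINDEX, dev->ifindex))
		return -EMSGSIZE;

	/* ...so pair it with the requester-relative id of the device's
	 * name space; peernet2id_alloc() allocates the id on first use.
	 */
	if (!net_eq(net, dev_net(dev))) {
		int id = peernet2id_alloc(net, dev_net(dev));

		if (nla_put_s32(skb, OVS_VPORT_ATTR_NETNSID, id))
			return -EMSGSIZE;
	}
	return 0;
}

Omitting OVS_VPORT_ATTR_NETNSID when both ends share a name space keeps the common single-namespace dump identical to what older consumers already parse.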
v1.2.3 From 79e1ad148c844f5c8b9d76b36b26e3886dca95ae Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Thu, 2 Nov 2017 17:04:38 -0200 Subject: rtnetlink: use netnsid to query interface Currently, when an application gets netnsid from the kernel (for example as the result of RTM_GETLINK call on one end of the veth pair), it's not very useful. There's no reliable way to get to the netns fd from the netnsid, nor does any kernel API accept netnsid. Extend the RTM_GETLINK call to also accept netnsid. It will operate on the netns with the given netnsid in such case. Of course, the calling process needs to have enough capabilities in the target name space; for now, require CAP_NET_ADMIN. This can be relaxed in the future. To signal to the calling process that the kernel understood the new IFLA_IF_NETNSID attribute in the query, it will include it in the response. This is needed to detect older kernels, as they will just ignore IFLA_IF_NETNSID and query in the current name space. This patch implements IFLA_IF_NETNSID only for get and dump. For set operations, this can be extended later. Signed-off-by: Jiri Benc Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 1 + net/core/rtnetlink.c | 103 +++++++++++++++++++++++++++++++++++-------- 2 files changed, 86 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index b3cf5639ac8f..19fc02660e0c 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -160,6 +160,7 @@ enum { IFLA_XDP, IFLA_EVENT, IFLA_NEW_NETNSID, + IFLA_IF_NETNSID, __IFLA_MAX }; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index de24d394c69e..8a8c51937edf 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -921,7 +921,8 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(4) /* IFLA_EVENT */ + nla_total_size(4) /* IFLA_NEW_NETNSID */ + nla_total_size(1); /* IFLA_PROTO_DOWN */ - + + nla_total_size(4) /* IFLA_IF_NETNSID */ + + 0; } static int rtnl_vf_ports_fill(struct sk_buff *skb, struct net_device *dev) @@ -1370,13 +1371,14 @@ static noinline_for_stack int nla_put_ifalias(struct sk_buff *skb, } static int rtnl_fill_link_netnsid(struct sk_buff *skb, - const struct net_device *dev) + const struct net_device *dev, + struct net *src_net) { if (dev->rtnl_link_ops && dev->rtnl_link_ops->get_link_net) { struct net *link_net = dev->rtnl_link_ops->get_link_net(dev); if (!net_eq(dev_net(dev), link_net)) { - int id = peernet2id_alloc(dev_net(dev), link_net); + int id = peernet2id_alloc(src_net, link_net); if (nla_put_s32(skb, IFLA_LINK_NETNSID, id)) return -EMSGSIZE; @@ -1427,10 +1429,11 @@ static int rtnl_fill_link_af(struct sk_buff *skb, return 0; } -static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, +static int rtnl_fill_ifinfo(struct sk_buff *skb, + struct net_device *dev, struct net *src_net, int type, u32 pid, u32 seq, u32 change, unsigned int flags, u32 ext_filter_mask, - u32 event, int *new_nsid) + u32 event, int *new_nsid, int tgt_netnsid) { struct ifinfomsg *ifm; struct nlmsghdr *nlh; @@ -1448,6 +1451,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, ifm->ifi_flags = dev_get_flags(dev); ifm->ifi_change = change; + if (tgt_netnsid >= 0 && nla_put_s32(skb, IFLA_IF_NETNSID, tgt_netnsid)) + goto nla_put_failure; + if (nla_put_string(skb, IFLA_IFNAME, dev->name) || nla_put_u32(skb, IFLA_TXQLEN, dev->tx_queue_len) || nla_put_u8(skb, IFLA_OPERSTATE, @@ -1513,7 +1519,7 @@ static int
rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, goto nla_put_failure; } - if (rtnl_fill_link_netnsid(skb, dev)) + if (rtnl_fill_link_netnsid(skb, dev, src_net)) goto nla_put_failure; if (new_nsid && @@ -1571,6 +1577,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_XDP] = { .type = NLA_NESTED }, [IFLA_EVENT] = { .type = NLA_U32 }, [IFLA_GROUP] = { .type = NLA_U32 }, + [IFLA_IF_NETNSID] = { .type = NLA_S32 }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { @@ -1674,9 +1681,28 @@ static bool link_dump_filtered(struct net_device *dev, return false; } +static struct net *get_target_net(struct sk_buff *skb, int netnsid) +{ + struct net *net; + + net = get_net_ns_by_id(sock_net(skb->sk), netnsid); + if (!net) + return ERR_PTR(-EINVAL); + + /* For now, the caller is required to have CAP_NET_ADMIN in + * the user namespace owning the target net ns. + */ + if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) { + put_net(net); + return ERR_PTR(-EACCES); + } + return net; +} + static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); + struct net *tgt_net = net; int h, s_h; int idx = 0, s_idx; struct net_device *dev; @@ -1686,6 +1712,7 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) const struct rtnl_link_ops *kind_ops = NULL; unsigned int flags = NLM_F_MULTI; int master_idx = 0; + int netnsid = -1; int err; int hdrlen; @@ -1704,6 +1731,15 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) if (nlmsg_parse(cb->nlh, hdrlen, tb, IFLA_MAX, ifla_policy, NULL) >= 0) { + if (tb[IFLA_IF_NETNSID]) { + netnsid = nla_get_s32(tb[IFLA_IF_NETNSID]); + tgt_net = get_target_net(skb, netnsid); + if (IS_ERR(tgt_net)) { + tgt_net = net; + netnsid = -1; + } + } + if (tb[IFLA_EXT_MASK]) ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]); @@ -1719,17 +1755,19 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { idx = 0; - head = &net->dev_index_head[h]; + head = &tgt_net->dev_index_head[h]; hlist_for_each_entry(dev, head, index_hlist) { if (link_dump_filtered(dev, master_idx, kind_ops)) goto cont; if (idx < s_idx) goto cont; - err = rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK, + err = rtnl_fill_ifinfo(skb, dev, net, + RTM_NEWLINK, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, 0, flags, - ext_filter_mask, 0, NULL); + ext_filter_mask, 0, NULL, + netnsid); if (err < 0) { if (likely(skb->len)) @@ -1748,6 +1786,8 @@ out_err: cb->args[0] = h; cb->seq = net->dev_base_seq; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); + if (netnsid >= 0) + put_net(tgt_net); return err; } @@ -2360,6 +2400,9 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) goto errout; + if (tb[IFLA_IF_NETNSID]) + return -EOPNOTSUPP; + if (tb[IFLA_IFNAME]) nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); else @@ -2454,6 +2497,9 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) return err; + if (tb[IFLA_IF_NETNSID]) + return -EOPNOTSUPP; + if (tb[IFLA_IFNAME]) nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); @@ -2585,6 +2631,9 @@ replay: if (err < 0) return err; + if (tb[IFLA_IF_NETNSID]) + return -EOPNOTSUPP; + if (tb[IFLA_IFNAME]) nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); else @@ -2818,11 +2867,13 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = 
sock_net(skb->sk); + struct net *tgt_net = net; struct ifinfomsg *ifm; char ifname[IFNAMSIZ]; struct nlattr *tb[IFLA_MAX+1]; struct net_device *dev = NULL; struct sk_buff *nskb; + int netnsid = -1; int err; u32 ext_filter_mask = 0; @@ -2830,35 +2881,50 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) return err; + if (tb[IFLA_IF_NETNSID]) { + netnsid = nla_get_s32(tb[IFLA_IF_NETNSID]); + tgt_net = get_target_net(skb, netnsid); + if (IS_ERR(tgt_net)) + return PTR_ERR(tgt_net); + } + if (tb[IFLA_IFNAME]) nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); if (tb[IFLA_EXT_MASK]) ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]); + err = -EINVAL; ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) - dev = __dev_get_by_index(net, ifm->ifi_index); + dev = __dev_get_by_index(tgt_net, ifm->ifi_index); else if (tb[IFLA_IFNAME]) - dev = __dev_get_by_name(net, ifname); + dev = __dev_get_by_name(tgt_net, ifname); else - return -EINVAL; + goto out; + err = -ENODEV; if (dev == NULL) - return -ENODEV; + goto out; + err = -ENOBUFS; nskb = nlmsg_new(if_nlmsg_size(dev, ext_filter_mask), GFP_KERNEL); if (nskb == NULL) - return -ENOBUFS; + goto out; - err = rtnl_fill_ifinfo(nskb, dev, RTM_NEWLINK, NETLINK_CB(skb).portid, - nlh->nlmsg_seq, 0, 0, ext_filter_mask, 0, NULL); + err = rtnl_fill_ifinfo(nskb, dev, net, + RTM_NEWLINK, NETLINK_CB(skb).portid, + nlh->nlmsg_seq, 0, 0, ext_filter_mask, + 0, NULL, netnsid); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_size */ WARN_ON(err == -EMSGSIZE); kfree_skb(nskb); } else err = rtnl_unicast(nskb, net, NETLINK_CB(skb).portid); +out: + if (netnsid >= 0) + put_net(tgt_net); return err; } @@ -2948,8 +3014,9 @@ struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, if (skb == NULL) goto errout; - err = rtnl_fill_ifinfo(skb, dev, type, 0, 0, change, 0, 0, event, - new_nsid); + err = rtnl_fill_ifinfo(skb, dev, dev_net(dev), + type, 0, 0, change, 0, 0, event, + new_nsid, -1); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); -- cgit v1.2.3 From f4e63525ee35f9c02e9f51f90571718363e9a9a9 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 3 Nov 2017 13:56:16 -0700 Subject: net: bpf: rename ndo_xdp to ndo_bpf ndo_xdp is a control path callback for setting up XDP in the driver. We can reuse it for other forms of communication between the eBPF stack and the drivers. Rename the callback and associated structures and definitions. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Reviewed-by: Quentin Monnet Signed-off-by: David S. 
Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 2 +- drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c | 2 +- drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.h | 2 +- drivers/net/ethernet/cavium/thunder/nicvf_main.c | 4 +-- drivers/net/ethernet/intel/i40e/i40e_main.c | 6 ++-- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 4 +-- drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 6 ++-- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 4 +-- .../net/ethernet/netronome/nfp/nfp_net_common.c | 4 +-- drivers/net/ethernet/qlogic/qede/qede.h | 2 +- drivers/net/ethernet/qlogic/qede/qede_filter.c | 2 +- drivers/net/ethernet/qlogic/qede/qede_main.c | 4 +-- drivers/net/tun.c | 4 +-- drivers/net/virtio_net.c | 4 +-- include/linux/netdevice.h | 23 ++++++++------- net/core/dev.c | 34 +++++++++++----------- net/core/rtnetlink.c | 4 +-- 17 files changed, 56 insertions(+), 55 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 4e3d569bf32e..96416f5d97f3 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -7775,7 +7775,7 @@ static const struct net_device_ops bnxt_netdev_ops = { #endif .ndo_udp_tunnel_add = bnxt_udp_tunnel_add, .ndo_udp_tunnel_del = bnxt_udp_tunnel_del, - .ndo_xdp = bnxt_xdp, + .ndo_bpf = bnxt_xdp, .ndo_bridge_getlink = bnxt_bridge_getlink, .ndo_bridge_setlink = bnxt_bridge_setlink, .ndo_get_phys_port_name = bnxt_get_phys_port_name diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c index 06ce63c00821..261e5847557a 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c @@ -208,7 +208,7 @@ static int bnxt_xdp_set(struct bnxt *bp, struct bpf_prog *prog) return 0; } -int bnxt_xdp(struct net_device *dev, struct netdev_xdp *xdp) +int bnxt_xdp(struct net_device *dev, struct netdev_bpf *xdp) { struct bnxt *bp = netdev_priv(dev); int rc; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.h index 12a5ad66b564..414b748038ca 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.h @@ -16,6 +16,6 @@ void bnxt_tx_int_xdp(struct bnxt *bp, struct bnxt_napi *bnapi, int nr_pkts); bool bnxt_rx_xdp(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, u16 cons, struct page *page, u8 **data_ptr, unsigned int *len, u8 *event); -int bnxt_xdp(struct net_device *dev, struct netdev_xdp *xdp); +int bnxt_xdp(struct net_device *dev, struct netdev_bpf *xdp); #endif diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c index 71989e180289..a063c36c4c58 100644 --- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c +++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c @@ -1741,7 +1741,7 @@ static int nicvf_xdp_setup(struct nicvf *nic, struct bpf_prog *prog) return 0; } -static int nicvf_xdp(struct net_device *netdev, struct netdev_xdp *xdp) +static int nicvf_xdp(struct net_device *netdev, struct netdev_bpf *xdp) { struct nicvf *nic = netdev_priv(netdev); @@ -1774,7 +1774,7 @@ static const struct net_device_ops nicvf_netdev_ops = { .ndo_tx_timeout = nicvf_tx_timeout, .ndo_fix_features = nicvf_fix_features, .ndo_set_features = nicvf_set_features, - .ndo_xdp = nicvf_xdp, + .ndo_bpf = nicvf_xdp, }; static int nicvf_probe(struct pci_dev *pdev, const struct pci_device_id *ent) diff --git 
a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index dfecaeda0654..05b94d87a6c3 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -11648,12 +11648,12 @@ static int i40e_xdp_setup(struct i40e_vsi *vsi, } /** - * i40e_xdp - implements ndo_xdp for i40e + * i40e_xdp - implements ndo_bpf for i40e * @dev: netdevice * @xdp: XDP command **/ static int i40e_xdp(struct net_device *dev, - struct netdev_xdp *xdp) + struct netdev_bpf *xdp) { struct i40e_netdev_priv *np = netdev_priv(dev); struct i40e_vsi *vsi = np->vsi; @@ -11705,7 +11705,7 @@ static const struct net_device_ops i40e_netdev_ops = { .ndo_features_check = i40e_features_check, .ndo_bridge_getlink = i40e_ndo_bridge_getlink, .ndo_bridge_setlink = i40e_ndo_bridge_setlink, - .ndo_xdp = i40e_xdp, + .ndo_bpf = i40e_xdp, }; /** diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 507977994a03..e5dcb25be398 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -10004,7 +10004,7 @@ static int ixgbe_xdp_setup(struct net_device *dev, struct bpf_prog *prog) return 0; } -static int ixgbe_xdp(struct net_device *dev, struct netdev_xdp *xdp) +static int ixgbe_xdp(struct net_device *dev, struct netdev_bpf *xdp) { struct ixgbe_adapter *adapter = netdev_priv(dev); @@ -10113,7 +10113,7 @@ static const struct net_device_ops ixgbe_netdev_ops = { .ndo_udp_tunnel_add = ixgbe_add_udp_tunnel_port, .ndo_udp_tunnel_del = ixgbe_del_udp_tunnel_port, .ndo_features_check = ixgbe_features_check, - .ndo_xdp = ixgbe_xdp, + .ndo_bpf = ixgbe_xdp, .ndo_xdp_xmit = ixgbe_xdp_xmit, .ndo_xdp_flush = ixgbe_xdp_flush, }; diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index d611df2f274d..736a6ccaf05e 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -2916,7 +2916,7 @@ static u32 mlx4_xdp_query(struct net_device *dev) return prog_id; } -static int mlx4_xdp(struct net_device *dev, struct netdev_xdp *xdp) +static int mlx4_xdp(struct net_device *dev, struct netdev_bpf *xdp) { switch (xdp->command) { case XDP_SETUP_PROG: @@ -2958,7 +2958,7 @@ static const struct net_device_ops mlx4_netdev_ops = { .ndo_udp_tunnel_del = mlx4_en_del_vxlan_port, .ndo_features_check = mlx4_en_features_check, .ndo_set_tx_maxrate = mlx4_en_set_tx_maxrate, - .ndo_xdp = mlx4_xdp, + .ndo_bpf = mlx4_xdp, }; static const struct net_device_ops mlx4_netdev_ops_master = { @@ -2995,7 +2995,7 @@ static const struct net_device_ops mlx4_netdev_ops_master = { .ndo_udp_tunnel_del = mlx4_en_del_vxlan_port, .ndo_features_check = mlx4_en_features_check, .ndo_set_tx_maxrate = mlx4_en_set_tx_maxrate, - .ndo_xdp = mlx4_xdp, + .ndo_bpf = mlx4_xdp, }; struct mlx4_en_bond { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 28ae00b3eb88..3b7b7bb84eb0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -3831,7 +3831,7 @@ static u32 mlx5e_xdp_query(struct net_device *dev) return prog_id; } -static int mlx5e_xdp(struct net_device *dev, struct netdev_xdp *xdp) +static int mlx5e_xdp(struct net_device *dev, struct netdev_bpf *xdp) { switch (xdp->command) { case XDP_SETUP_PROG: @@ -3883,7 +3883,7 @@ static const struct net_device_ops mlx5e_netdev_ops = { 
.ndo_rx_flow_steer = mlx5e_rx_flow_steer, #endif .ndo_tx_timeout = mlx5e_tx_timeout, - .ndo_xdp = mlx5e_xdp, + .ndo_bpf = mlx5e_xdp, #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = mlx5e_netpoll, #endif diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index 185a3dd35a3f..f6c6ad4e8a59 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -3378,7 +3378,7 @@ nfp_net_xdp_setup(struct nfp_net *nn, struct bpf_prog *prog, u32 flags, return 0; } -static int nfp_net_xdp(struct net_device *netdev, struct netdev_xdp *xdp) +static int nfp_net_xdp(struct net_device *netdev, struct netdev_bpf *xdp) { struct nfp_net *nn = netdev_priv(netdev); @@ -3441,7 +3441,7 @@ const struct net_device_ops nfp_net_netdev_ops = { .ndo_get_phys_port_name = nfp_port_get_phys_port_name, .ndo_udp_tunnel_add = nfp_net_add_vxlan_port, .ndo_udp_tunnel_del = nfp_net_del_vxlan_port, - .ndo_xdp = nfp_net_xdp, + .ndo_bpf = nfp_net_xdp, }; /** diff --git a/drivers/net/ethernet/qlogic/qede/qede.h b/drivers/net/ethernet/qlogic/qede/qede.h index adb700512baa..a3a70ade411f 100644 --- a/drivers/net/ethernet/qlogic/qede/qede.h +++ b/drivers/net/ethernet/qlogic/qede/qede.h @@ -503,7 +503,7 @@ void qede_fill_rss_params(struct qede_dev *edev, void qede_udp_tunnel_add(struct net_device *dev, struct udp_tunnel_info *ti); void qede_udp_tunnel_del(struct net_device *dev, struct udp_tunnel_info *ti); -int qede_xdp(struct net_device *dev, struct netdev_xdp *xdp); +int qede_xdp(struct net_device *dev, struct netdev_bpf *xdp); #ifdef CONFIG_DCB void qede_set_dcbnl_ops(struct net_device *ndev); diff --git a/drivers/net/ethernet/qlogic/qede/qede_filter.c b/drivers/net/ethernet/qlogic/qede/qede_filter.c index f79e36e4060a..c1a0708a7d7c 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_filter.c +++ b/drivers/net/ethernet/qlogic/qede/qede_filter.c @@ -1065,7 +1065,7 @@ static int qede_xdp_set(struct qede_dev *edev, struct bpf_prog *prog) return 0; } -int qede_xdp(struct net_device *dev, struct netdev_xdp *xdp) +int qede_xdp(struct net_device *dev, struct netdev_bpf *xdp) { struct qede_dev *edev = netdev_priv(dev); diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c index e5ee9f274a71..8f9b3eb82137 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_main.c +++ b/drivers/net/ethernet/qlogic/qede/qede_main.c @@ -556,7 +556,7 @@ static const struct net_device_ops qede_netdev_ops = { .ndo_udp_tunnel_add = qede_udp_tunnel_add, .ndo_udp_tunnel_del = qede_udp_tunnel_del, .ndo_features_check = qede_features_check, - .ndo_xdp = qede_xdp, + .ndo_bpf = qede_xdp, #ifdef CONFIG_RFS_ACCEL .ndo_rx_flow_steer = qede_rx_flow_steer, #endif @@ -594,7 +594,7 @@ static const struct net_device_ops qede_netdev_vf_xdp_ops = { .ndo_udp_tunnel_add = qede_udp_tunnel_add, .ndo_udp_tunnel_del = qede_udp_tunnel_del, .ndo_features_check = qede_features_check, - .ndo_xdp = qede_xdp, + .ndo_bpf = qede_xdp, }; /* ------------------------------------------------------------------------- diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 8125956f62a1..1a326b697221 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1141,7 +1141,7 @@ static u32 tun_xdp_query(struct net_device *dev) return 0; } -static int tun_xdp(struct net_device *dev, struct netdev_xdp *xdp) +static int tun_xdp(struct net_device *dev, struct netdev_bpf *xdp) { switch (xdp->command) { case XDP_SETUP_PROG: @@ 
-1185,7 +1185,7 @@ static const struct net_device_ops tap_netdev_ops = { .ndo_features_check = passthru_features_check, .ndo_set_rx_headroom = tun_set_headroom, .ndo_get_stats64 = tun_net_get_stats64, - .ndo_xdp = tun_xdp, + .ndo_bpf = tun_xdp, }; static void tun_flow_init(struct tun_struct *tun) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index fc059f193e7d..edf984406ba0 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -2088,7 +2088,7 @@ static u32 virtnet_xdp_query(struct net_device *dev) return 0; } -static int virtnet_xdp(struct net_device *dev, struct netdev_xdp *xdp) +static int virtnet_xdp(struct net_device *dev, struct netdev_bpf *xdp) { switch (xdp->command) { case XDP_SETUP_PROG: @@ -2115,7 +2115,7 @@ static const struct net_device_ops virtnet_netdev = { #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = virtnet_netpoll, #endif - .ndo_xdp = virtnet_xdp, + .ndo_bpf = virtnet_xdp, .ndo_xdp_xmit = virtnet_xdp_xmit, .ndo_xdp_flush = virtnet_xdp_flush, .ndo_features_check = passthru_features_check, diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7de7656550c2..9af9feaaeb64 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -779,10 +779,10 @@ enum tc_setup_type { TC_SETUP_CBS, }; -/* These structures hold the attributes of xdp state that are being passed - * to the netdevice through the xdp op. +/* These structures hold the attributes of bpf state that are being passed + * to the netdevice through the bpf op. */ -enum xdp_netdev_command { +enum bpf_netdev_command { /* Set or clear a bpf program used in the earliest stages of packet * rx. The prog will have been loaded as BPF_PROG_TYPE_XDP. The callee * is responsible for calling bpf_prog_put on any old progs that are @@ -801,8 +801,8 @@ enum xdp_netdev_command { struct netlink_ext_ack; -struct netdev_xdp { - enum xdp_netdev_command command; +struct netdev_bpf { + enum bpf_netdev_command command; union { /* XDP_SETUP_PROG */ struct { @@ -1124,9 +1124,10 @@ struct dev_ifalias { * appropriate rx headroom value allows avoiding skb head copy on * forward. Setting a negative value resets the rx headroom to the * default value. - * int (*ndo_xdp)(struct net_device *dev, struct netdev_xdp *xdp); + * int (*ndo_bpf)(struct net_device *dev, struct netdev_bpf *bpf); * This function is used to set or query state related to XDP on the - * netdevice. See definition of enum xdp_netdev_command for details. + * netdevice and manage BPF offload. See definition of + * enum bpf_netdev_command for details. * int (*ndo_xdp_xmit)(struct net_device *dev, struct xdp_buff *xdp); * This function is used to submit a XDP packet for transmit on a * netdevice. 
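To see the renamed hook from a driver's perspective, a minimal .ndo_bpf implementation is sketched below. It is hedged: foo_priv, foo_xdp_setup() and foo_netdev_ops are invented for illustration, while struct netdev_bpf and the XDP_SETUP_PROG/XDP_QUERY_PROG commands are the real interface documented above.

#include <linux/filter.h>
#include <linux/netdevice.h>

struct foo_priv {
	struct bpf_prog *xdp_prog;	/* currently installed program */
};

static int foo_xdp_setup(struct foo_priv *priv, struct bpf_prog *prog)
{
	/* A real driver reconfigures its rings here and releases the old
	 * program with bpf_prog_put(); this sketch only swaps pointers.
	 */
	priv->xdp_prog = prog;
	return 0;
}

static int foo_bpf(struct net_device *dev, struct netdev_bpf *bpf)
{
	struct foo_priv *priv = netdev_priv(dev);

	switch (bpf->command) {
	case XDP_SETUP_PROG:
		return foo_xdp_setup(priv, bpf->prog);
	case XDP_QUERY_PROG:
		bpf->prog_attached = !!priv->xdp_prog;
		bpf->prog_id = priv->xdp_prog ? priv->xdp_prog->aux->id : 0;
		return 0;
	default:
		return -EINVAL;
	}
}

static const struct net_device_ops foo_netdev_ops = {
	.ndo_bpf	= foo_bpf,	/* was .ndo_xdp before this rename */
};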
@@ -1315,8 +1316,8 @@ struct net_device_ops { struct sk_buff *skb); void (*ndo_set_rx_headroom)(struct net_device *dev, int needed_headroom); - int (*ndo_xdp)(struct net_device *dev, - struct netdev_xdp *xdp); + int (*ndo_bpf)(struct net_device *dev, + struct netdev_bpf *bpf); int (*ndo_xdp_xmit)(struct net_device *dev, struct xdp_buff *xdp); void (*ndo_xdp_flush)(struct net_device *dev); @@ -3311,10 +3312,10 @@ struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *d struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq, int *ret); -typedef int (*xdp_op_t)(struct net_device *dev, struct netdev_xdp *xdp); +typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf); int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, int fd, u32 flags); -u8 __dev_xdp_attached(struct net_device *dev, xdp_op_t xdp_op, u32 *prog_id); +u8 __dev_xdp_attached(struct net_device *dev, bpf_op_t xdp_op, u32 *prog_id); int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb); int dev_forward_skb(struct net_device *dev, struct sk_buff *skb); diff --git a/net/core/dev.c b/net/core/dev.c index 1423cf4d695c..10cde58d3275 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4545,7 +4545,7 @@ static int __netif_receive_skb(struct sk_buff *skb) return ret; } -static int generic_xdp_install(struct net_device *dev, struct netdev_xdp *xdp) +static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) { struct bpf_prog *old = rtnl_dereference(dev->xdp_prog); struct bpf_prog *new = xdp->prog; @@ -7090,26 +7090,26 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down) } EXPORT_SYMBOL(dev_change_proto_down); -u8 __dev_xdp_attached(struct net_device *dev, xdp_op_t xdp_op, u32 *prog_id) +u8 __dev_xdp_attached(struct net_device *dev, bpf_op_t bpf_op, u32 *prog_id) { - struct netdev_xdp xdp; + struct netdev_bpf xdp; memset(&xdp, 0, sizeof(xdp)); xdp.command = XDP_QUERY_PROG; /* Query must always succeed. 
*/ - WARN_ON(xdp_op(dev, &xdp) < 0); + WARN_ON(bpf_op(dev, &xdp) < 0); if (prog_id) *prog_id = xdp.prog_id; return xdp.prog_attached; } -static int dev_xdp_install(struct net_device *dev, xdp_op_t xdp_op, +static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op, struct netlink_ext_ack *extack, u32 flags, struct bpf_prog *prog) { - struct netdev_xdp xdp; + struct netdev_bpf xdp; memset(&xdp, 0, sizeof(xdp)); if (flags & XDP_FLAGS_HW_MODE) @@ -7120,7 +7120,7 @@ static int dev_xdp_install(struct net_device *dev, xdp_op_t xdp_op, xdp.flags = flags; xdp.prog = prog; - return xdp_op(dev, &xdp); + return bpf_op(dev, &xdp); } /** @@ -7137,24 +7137,24 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, { const struct net_device_ops *ops = dev->netdev_ops; struct bpf_prog *prog = NULL; - xdp_op_t xdp_op, xdp_chk; + bpf_op_t bpf_op, bpf_chk; int err; ASSERT_RTNL(); - xdp_op = xdp_chk = ops->ndo_xdp; - if (!xdp_op && (flags & (XDP_FLAGS_DRV_MODE | XDP_FLAGS_HW_MODE))) + bpf_op = bpf_chk = ops->ndo_bpf; + if (!bpf_op && (flags & (XDP_FLAGS_DRV_MODE | XDP_FLAGS_HW_MODE))) return -EOPNOTSUPP; - if (!xdp_op || (flags & XDP_FLAGS_SKB_MODE)) - xdp_op = generic_xdp_install; - if (xdp_op == xdp_chk) - xdp_chk = generic_xdp_install; + if (!bpf_op || (flags & XDP_FLAGS_SKB_MODE)) + bpf_op = generic_xdp_install; + if (bpf_op == bpf_chk) + bpf_chk = generic_xdp_install; if (fd >= 0) { - if (xdp_chk && __dev_xdp_attached(dev, xdp_chk, NULL)) + if (bpf_chk && __dev_xdp_attached(dev, bpf_chk, NULL)) return -EEXIST; if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && - __dev_xdp_attached(dev, xdp_op, NULL)) + __dev_xdp_attached(dev, bpf_op, NULL)) return -EBUSY; prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP); @@ -7162,7 +7162,7 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, return PTR_ERR(prog); } - err = dev_xdp_install(dev, xdp_op, extack, flags, prog); + err = dev_xdp_install(dev, bpf_op, extack, flags, prog); if (err < 0 && prog) bpf_prog_put(prog); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 8a8c51937edf..dc5ad84ac096 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1270,10 +1270,10 @@ static u8 rtnl_xdp_attached_mode(struct net_device *dev, u32 *prog_id) *prog_id = generic_xdp_prog->aux->id; return XDP_ATTACHED_SKB; } - if (!ops->ndo_xdp) + if (!ops->ndo_bpf) return XDP_ATTACHED_NONE; - return __dev_xdp_attached(dev, ops->ndo_xdp, prog_id); + return __dev_xdp_attached(dev, ops->ndo_bpf, prog_id); } static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev) -- cgit v1.2.3 From ab3f0063c48c26c927851b6767824e35a716d878 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 3 Nov 2017 13:56:17 -0700 Subject: bpf: offload: add infrastructure for loading programs for a specific netdev The fact that we don't know which device the program is going to be used on is quite limiting in current eBPF infrastructure. We have to reverse or limit the changes which kernel makes to the loaded bytecode if we want it to be offloaded to a networking device. We also have to invent new APIs for debugging and troubleshooting support. Make it possible to load programs for a specific netdev. This helps us to bring the debug information closer to the core eBPF infrastructure (e.g. we will be able to reuse the verifier log in device JIT). It allows device JITs to perform translation on the original bytecode.
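From user space, the device binding is requested at load time by filling the attribute this patch adds to union bpf_attr. The sketch below is a hedged illustration, assuming a 64-bit build and CAP_SYS_ADMIN; load_dev_bound_prog() is a hypothetical helper and the two-instruction program merely returns XDP_PASS.

#include <linux/bpf.h>
#include <net/if.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int load_dev_bound_prog(const char *ifname)
{
	struct bpf_insn insns[] = {
		{ .code = BPF_ALU64 | BPF_MOV | BPF_K,
		  .dst_reg = BPF_REG_0, .imm = 2 /* XDP_PASS */ },
		{ .code = BPF_JMP | BPF_EXIT },
	};
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.prog_type = BPF_PROG_TYPE_XDP;
	attr.insns = (__u64)(unsigned long)insns;
	attr.insn_cnt = sizeof(insns) / sizeof(insns[0]);
	attr.license = (__u64)(unsigned long)"GPL";
	/* The field added by this patch: bind the program to one netdev. */
	attr.prog_target_ifindex = if_nametoindex(ifname);

	return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
}

The returned fd behaves like any other program fd, subject to the attachment restrictions described next.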
__bpf_prog_get() when called to get a reference for an attachment point will now refuse to give it if program has a device assigned. Following patches will add a version of that function which passes the expected netdev in. @type argument in __bpf_prog_get() is renamed to attach_type to make it clearer that it's only set on attachment. All calls to ndo_bpf are protected by rtnl, only verifier callbacks are not. We need a wait queue to make sure netdev doesn't get destroyed while verifier is still running and calling its driver. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Reviewed-by: Quentin Monnet Signed-off-by: David S. Miller --- include/linux/bpf.h | 36 +++++++++ include/linux/bpf_verifier.h | 10 +++ include/linux/netdevice.h | 14 ++++ include/uapi/linux/bpf.h | 1 + kernel/bpf/Makefile | 1 + kernel/bpf/core.c | 10 ++- kernel/bpf/offload.c | 182 +++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 17 +++- kernel/bpf/verifier.c | 15 +++- 9 files changed, 278 insertions(+), 8 deletions(-) create mode 100644 kernel/bpf/offload.c (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 520aeebe0d93..e45d43f9ec92 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -15,6 +15,7 @@ #include #include #include +#include struct perf_event; struct bpf_prog; @@ -182,6 +183,16 @@ struct bpf_verifier_ops { struct bpf_prog *prog, u32 *target_size); }; +struct bpf_dev_offload { + struct bpf_prog *prog; + struct net_device *netdev; + void *dev_priv; + struct list_head offloads; + bool dev_state; + bool verifier_running; + wait_queue_head_t verifier_done; +}; + struct bpf_prog_aux { atomic_t refcnt; u32 used_map_cnt; @@ -199,6 +210,7 @@ struct bpf_prog_aux { #ifdef CONFIG_SECURITY void *security; #endif + struct bpf_dev_offload *offload; union { struct work_struct work; struct rcu_head rcu; @@ -317,6 +329,7 @@ extern const struct file_operations bpf_prog_fops; #undef BPF_PROG_TYPE #undef BPF_MAP_TYPE +extern const struct bpf_prog_ops bpf_offload_prog_ops; extern const struct bpf_verifier_ops tc_cls_act_analyzer_ops; extern const struct bpf_verifier_ops xdp_analyzer_ops; @@ -491,6 +504,29 @@ static inline int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, } #endif /* CONFIG_BPF_SYSCALL */ +int bpf_prog_offload_compile(struct bpf_prog *prog); +void bpf_prog_offload_destroy(struct bpf_prog *prog); + +#if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL) +int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr); + +static inline bool bpf_prog_is_dev_bound(struct bpf_prog_aux *aux) +{ + return aux->offload; +} +#else +static inline int bpf_prog_offload_init(struct bpf_prog *prog, + union bpf_attr *attr) +{ + return -EOPNOTSUPP; +} + +static inline bool bpf_prog_is_dev_bound(struct bpf_prog_aux *aux) +{ + return false; +} +#endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */ + #if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_BPF_SYSCALL) struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key); int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type); diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 3b0976aaac75..e45011dbc02d 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -153,6 +153,7 @@ struct bpf_verifier_env { struct bpf_verifier_state *cur_state; /* current verifier state */ struct bpf_verifier_state_list **explored_states; /* search pruning optimization */ const struct bpf_ext_analyzer_ops *analyzer_ops; /* external analyzer ops */ + const struct 
bpf_ext_analyzer_ops *dev_ops; /* device analyzer ops */ void *analyzer_priv; /* pointer to external analyzer's private data */ struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ u32 used_map_cnt; /* number of used maps */ @@ -169,6 +170,15 @@ static inline struct bpf_reg_state *cur_regs(struct bpf_verifier_env *env) return env->cur_state->regs; } +#if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL) +int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env); +#else +int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env) +{ + return -EOPNOTSUPP; +} +#endif + int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, void *priv); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9af9feaaeb64..fda527ccb263 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -797,8 +797,13 @@ enum bpf_netdev_command { * is equivalent to XDP_ATTACHED_DRV. */ XDP_QUERY_PROG, + /* BPF program for offload callbacks, invoked at program load time. */ + BPF_OFFLOAD_VERIFIER_PREP, + BPF_OFFLOAD_TRANSLATE, + BPF_OFFLOAD_DESTROY, }; +struct bpf_ext_analyzer_ops; struct netlink_ext_ack; struct netdev_bpf { @@ -815,6 +820,15 @@ struct netdev_bpf { u8 prog_attached; u32 prog_id; }; + /* BPF_OFFLOAD_VERIFIER_PREP */ + struct { + struct bpf_prog *prog; + const struct bpf_ext_analyzer_ops *ops; /* callee set */ + } verifier; + /* BPF_OFFLOAD_TRANSLATE, BPF_OFFLOAD_DESTROY */ + struct { + struct bpf_prog *prog; + } offload; }; }; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a9820677c2ff..80d191a93fb0 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -260,6 +260,7 @@ union bpf_attr { __u32 kern_version; /* checked when prog_type=kprobe */ __u32 prog_flags; char prog_name[BPF_OBJ_NAME_LEN]; + __u32 prog_target_ifindex; /* ifindex of netdev to prep for */ }; struct { /* anonymous struct used by BPF_OBJ_* commands */ diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 16e95c8e749e..e691da0b3bab 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -7,6 +7,7 @@ obj-$(CONFIG_BPF_SYSCALL) += disasm.o ifeq ($(CONFIG_NET),y) obj-$(CONFIG_BPF_SYSCALL) += devmap.o obj-$(CONFIG_BPF_SYSCALL) += cpumap.o +obj-$(CONFIG_BPF_SYSCALL) += offload.o ifeq ($(CONFIG_STREAM_PARSER),y) obj-$(CONFIG_BPF_SYSCALL) += sockmap.o endif diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 7fe448799d76..8a6c37762330 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1380,7 +1380,13 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) * valid program, which in this case would simply not * be JITed, but falls back to the interpreter. 
*/ - fp = bpf_int_jit_compile(fp); + if (!bpf_prog_is_dev_bound(fp->aux)) { + fp = bpf_int_jit_compile(fp); + } else { + *err = bpf_prog_offload_compile(fp); + if (*err) + return fp; + } bpf_prog_lock_ro(fp); /* The tail call compatibility check can only be done at @@ -1549,6 +1555,8 @@ static void bpf_prog_free_deferred(struct work_struct *work) struct bpf_prog_aux *aux; aux = container_of(work, struct bpf_prog_aux, work); + if (bpf_prog_is_dev_bound(aux)) + bpf_prog_offload_destroy(aux->prog); bpf_jit_free(aux->prog); } diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c new file mode 100644 index 000000000000..5553e0e2f8b1 --- /dev/null +++ b/kernel/bpf/offload.c @@ -0,0 +1,182 @@ +#include +#include +#include +#include +#include +#include +#include + +/* protected by RTNL */ +static LIST_HEAD(bpf_prog_offload_devs); + +int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) +{ + struct net *net = current->nsproxy->net_ns; + struct bpf_dev_offload *offload; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (attr->prog_flags) + return -EINVAL; + + offload = kzalloc(sizeof(*offload), GFP_USER); + if (!offload) + return -ENOMEM; + + offload->prog = prog; + init_waitqueue_head(&offload->verifier_done); + + rtnl_lock(); + offload->netdev = __dev_get_by_index(net, attr->prog_target_ifindex); + if (!offload->netdev) { + rtnl_unlock(); + kfree(offload); + return -EINVAL; + } + + prog->aux->offload = offload; + list_add_tail(&offload->offloads, &bpf_prog_offload_devs); + rtnl_unlock(); + + return 0; +} + +static int __bpf_offload_ndo(struct bpf_prog *prog, enum bpf_netdev_command cmd, + struct netdev_bpf *data) +{ + struct net_device *netdev = prog->aux->offload->netdev; + + ASSERT_RTNL(); + + if (!netdev) + return -ENODEV; + if (!netdev->netdev_ops->ndo_bpf) + return -EOPNOTSUPP; + + data->command = cmd; + + return netdev->netdev_ops->ndo_bpf(netdev, data); +} + +int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env) +{ + struct netdev_bpf data = {}; + int err; + + data.verifier.prog = env->prog; + + rtnl_lock(); + err = __bpf_offload_ndo(env->prog, BPF_OFFLOAD_VERIFIER_PREP, &data); + if (err) + goto exit_unlock; + + env->dev_ops = data.verifier.ops; + + env->prog->aux->offload->dev_state = true; + env->prog->aux->offload->verifier_running = true; +exit_unlock: + rtnl_unlock(); + return err; +} + +static void __bpf_prog_offload_destroy(struct bpf_prog *prog) +{ + struct bpf_dev_offload *offload = prog->aux->offload; + struct netdev_bpf data = {}; + + data.offload.prog = prog; + + if (offload->verifier_running) + wait_event(offload->verifier_done, !offload->verifier_running); + + if (offload->dev_state) + WARN_ON(__bpf_offload_ndo(prog, BPF_OFFLOAD_DESTROY, &data)); + + offload->dev_state = false; + list_del_init(&offload->offloads); + offload->netdev = NULL; +} + +void bpf_prog_offload_destroy(struct bpf_prog *prog) +{ + struct bpf_dev_offload *offload = prog->aux->offload; + + offload->verifier_running = false; + wake_up(&offload->verifier_done); + + rtnl_lock(); + __bpf_prog_offload_destroy(prog); + rtnl_unlock(); + + kfree(offload); +} + +static int bpf_prog_offload_translate(struct bpf_prog *prog) +{ + struct bpf_dev_offload *offload = prog->aux->offload; + struct netdev_bpf data = {}; + int ret; + + data.offload.prog = prog; + + offload->verifier_running = false; + wake_up(&offload->verifier_done); + + rtnl_lock(); + ret = __bpf_offload_ndo(prog, BPF_OFFLOAD_TRANSLATE, &data); + rtnl_unlock(); + + return ret; +} + +static unsigned int 
bpf_prog_warn_on_exec(const void *ctx, + const struct bpf_insn *insn) +{ + WARN(1, "attempt to execute device eBPF program on the host!"); + return 0; +} + +int bpf_prog_offload_compile(struct bpf_prog *prog) +{ + prog->bpf_func = bpf_prog_warn_on_exec; + + return bpf_prog_offload_translate(prog); +} + +const struct bpf_prog_ops bpf_offload_prog_ops = { +}; + +static int bpf_offload_notification(struct notifier_block *notifier, + ulong event, void *ptr) +{ + struct net_device *netdev = netdev_notifier_info_to_dev(ptr); + struct bpf_dev_offload *offload, *tmp; + + ASSERT_RTNL(); + + switch (event) { + case NETDEV_UNREGISTER: + list_for_each_entry_safe(offload, tmp, &bpf_prog_offload_devs, + offloads) { + if (offload->netdev == netdev) + __bpf_prog_offload_destroy(offload->prog); + } + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block bpf_offload_notifier = { + .notifier_call = bpf_offload_notification, +}; + +static int __init bpf_offload_init(void) +{ + register_netdevice_notifier(&bpf_offload_notifier); + return 0; +} + +subsys_initcall(bpf_offload_init); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 323be2473c4b..1574b9f0f24e 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -824,7 +824,10 @@ static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) if (type >= ARRAY_SIZE(bpf_prog_types) || !bpf_prog_types[type]) return -EINVAL; - prog->aux->ops = bpf_prog_types[type]; + if (!bpf_prog_is_dev_bound(prog->aux)) + prog->aux->ops = bpf_prog_types[type]; + else + prog->aux->ops = &bpf_offload_prog_ops; prog->type = type; return 0; } @@ -1054,7 +1057,7 @@ struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog) } EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero); -static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type) +static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type) { struct fd f = fdget(ufd); struct bpf_prog *prog; @@ -1062,7 +1065,7 @@ static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type) prog = ____bpf_prog_get(f); if (IS_ERR(prog)) return prog; - if (type && prog->type != *type) { + if (attach_type && (prog->type != *attach_type || prog->aux->offload)) { prog = ERR_PTR(-EINVAL); goto out; } @@ -1089,7 +1092,7 @@ struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type) EXPORT_SYMBOL_GPL(bpf_prog_get_type); /* last field in 'union bpf_attr' used by this command */ -#define BPF_PROG_LOAD_LAST_FIELD prog_name +#define BPF_PROG_LOAD_LAST_FIELD prog_target_ifindex static int bpf_prog_load(union bpf_attr *attr) { @@ -1152,6 +1155,12 @@ static int bpf_prog_load(union bpf_attr *attr) atomic_set(&prog->aux->refcnt, 1); prog->gpl_compatible = is_gpl ? 
1 : 0; + if (attr->prog_target_ifindex) { + err = bpf_prog_offload_init(prog, attr); + if (err) + goto free_prog; + } + /* find program type: socket_filter vs tracing_filter */ err = find_prog_type(type, prog); if (err < 0) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 04357ad5a812..51aabb32ad67 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3736,10 +3736,13 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) static int ext_analyzer_insn_hook(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx) { - if (!env->analyzer_ops || !env->analyzer_ops->insn_hook) - return 0; + if (env->analyzer_ops && env->analyzer_ops->insn_hook) + return env->analyzer_ops->insn_hook(env, insn_idx, + prev_insn_idx); + if (env->dev_ops && env->dev_ops->insn_hook) + return env->dev_ops->insn_hook(env, insn_idx, prev_insn_idx); - return env->analyzer_ops->insn_hook(env, insn_idx, prev_insn_idx); + return 0; } static int do_check(struct bpf_verifier_env *env) @@ -4516,6 +4519,12 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) env->strict_alignment = true; + if (env->prog->aux->offload) { + ret = bpf_prog_offload_verifier_prep(env); + if (ret) + goto err_unlock; + } + ret = replace_map_fd_with_map_ptr(env); if (ret < 0) goto skip_full_check; -- cgit v1.2.3 From bd601b6ada11fdfb9e277f24ad2eb54bc599156b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 3 Nov 2017 13:56:18 -0700 Subject: bpf: report offload info to user space Extend struct bpf_prog_info to contain information about program being bound to a device. Since the netdev may get destroyed while program still exists we need a flag to indicate the program is loaded for a device, even if the device is gone. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Reviewed-by: Quentin Monnet Signed-off-by: David S. 
Miller --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 6 ++++++ kernel/bpf/offload.c | 12 ++++++++++++ kernel/bpf/syscall.c | 5 +++++ 4 files changed, 24 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e45d43f9ec92..98bacd0fa5cc 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -506,6 +506,7 @@ static inline int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, int bpf_prog_offload_compile(struct bpf_prog *prog); void bpf_prog_offload_destroy(struct bpf_prog *prog); +u32 bpf_prog_offload_ifindex(struct bpf_prog *prog); #if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL) int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 80d191a93fb0..4455dd195201 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -895,6 +895,10 @@ enum sk_action { #define BPF_TAG_SIZE 8 +enum bpf_prog_status { + BPF_PROG_STATUS_DEV_BOUND = (1 << 0), +}; + struct bpf_prog_info { __u32 type; __u32 id; @@ -908,6 +912,8 @@ struct bpf_prog_info { __u32 nr_map_ids; __aligned_u64 map_ids; char name[BPF_OBJ_NAME_LEN]; + __u32 ifindex; + __u32 status; } __attribute__((aligned(8))); struct bpf_map_info { diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 5553e0e2f8b1..2816feb38be1 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -144,6 +144,18 @@ int bpf_prog_offload_compile(struct bpf_prog *prog) return bpf_prog_offload_translate(prog); } +u32 bpf_prog_offload_ifindex(struct bpf_prog *prog) +{ + struct bpf_dev_offload *offload = prog->aux->offload; + u32 ifindex; + + rtnl_lock(); + ifindex = offload->netdev ? offload->netdev->ifindex : 0; + rtnl_unlock(); + + return ifindex; +} + const struct bpf_prog_ops bpf_offload_prog_ops = { }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 1574b9f0f24e..3217c20ea91b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1592,6 +1592,11 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, return -EFAULT; } + if (bpf_prog_is_dev_bound(prog->aux)) { + info.status |= BPF_PROG_STATUS_DEV_BOUND; + info.ifindex = bpf_prog_offload_ifindex(prog); + } + done: if (copy_to_user(uinfo, &info, info_len) || put_user(info_len, &uattr->info.info_len)) -- cgit v1.2.3 From 248f346ffe9508dee0039db4ac839cb31ba3bdec Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 3 Nov 2017 13:56:20 -0700 Subject: xdp: allow attaching programs loaded for specific device Pass the netdev pointer to bpf_prog_get_type(). This way BPF code can decide whether the device matches what the code was loaded/translated for. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Reviewed-by: Quentin Monnet Signed-off-by: David S. 
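Putting the last two patches together from user space: after loading with prog_target_ifindex, an application can confirm the binding that the attach path will enforce. This is a hedged sketch; prog_bound_to() is a hypothetical helper, while the syscall command and the status/ifindex fields of struct bpf_prog_info are the ones added above.

#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int prog_bound_to(int prog_fd, __u32 ifindex)
{
	struct bpf_prog_info info;
	union bpf_attr attr;

	memset(&info, 0, sizeof(info));
	memset(&attr, 0, sizeof(attr));
	attr.info.bpf_fd = prog_fd;
	attr.info.info_len = sizeof(info);
	attr.info.info = (__u64)(unsigned long)&info;

	if (syscall(__NR_bpf, BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr)))
		return 0;

	/* ifindex may read 0 if the bound netdev has since been
	 * unregistered; the DEV_BOUND flag survives regardless.
	 */
	return (info.status & BPF_PROG_STATUS_DEV_BOUND) &&
	       info.ifindex == ifindex;
}

A dev-bound fd presented for XDP attach on any other device is rejected, which is precisely the check bpf_prog_get_type_dev() introduces.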
Miller --- include/linux/bpf.h | 10 ++++++++++ kernel/bpf/syscall.c | 33 +++++++++++++++++++++++++++++---- net/core/dev.c | 6 +++++- 3 files changed, 44 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 98bacd0fa5cc..c397934f91dd 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -335,6 +335,8 @@ extern const struct bpf_verifier_ops xdp_analyzer_ops; struct bpf_prog *bpf_prog_get(u32 ufd); struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type); +struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, + struct net_device *netdev); struct bpf_prog * __must_check bpf_prog_add(struct bpf_prog *prog, int i); void bpf_prog_sub(struct bpf_prog *prog, int i); struct bpf_prog * __must_check bpf_prog_inc(struct bpf_prog *prog); @@ -428,6 +430,14 @@ static inline struct bpf_prog *bpf_prog_get_type(u32 ufd, { return ERR_PTR(-EOPNOTSUPP); } + +static inline struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, + enum bpf_prog_type type, + struct net_device *netdev) +{ + return ERR_PTR(-EOPNOTSUPP); +} + static inline struct bpf_prog * __must_check bpf_prog_add(struct bpf_prog *prog, int i) { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 3217c20ea91b..68f9123acd39 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1057,7 +1057,22 @@ struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog) } EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero); -static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type) +static bool bpf_prog_can_attach(struct bpf_prog *prog, + enum bpf_prog_type *attach_type, + struct net_device *netdev) +{ + struct bpf_dev_offload *offload = prog->aux->offload; + + if (prog->type != *attach_type) + return false; + if (offload && offload->netdev != netdev) + return false; + + return true; +} + +static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type, + struct net_device *netdev) { struct fd f = fdget(ufd); struct bpf_prog *prog; @@ -1065,7 +1080,7 @@ static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type) prog = ____bpf_prog_get(f); if (IS_ERR(prog)) return prog; - if (attach_type && (prog->type != *attach_type || prog->aux->offload)) { + if (attach_type && !bpf_prog_can_attach(prog, attach_type, netdev)) { prog = ERR_PTR(-EINVAL); goto out; } @@ -1078,12 +1093,12 @@ out: struct bpf_prog *bpf_prog_get(u32 ufd) { - return __bpf_prog_get(ufd, NULL); + return __bpf_prog_get(ufd, NULL, NULL); } struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type) { - struct bpf_prog *prog = __bpf_prog_get(ufd, &type); + struct bpf_prog *prog = __bpf_prog_get(ufd, &type, NULL); if (!IS_ERR(prog)) trace_bpf_prog_get_type(prog); @@ -1091,6 +1106,16 @@ struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type) } EXPORT_SYMBOL_GPL(bpf_prog_get_type); +struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, + struct net_device *netdev) +{ + struct bpf_prog *prog = __bpf_prog_get(ufd, &type, netdev); + + if (!IS_ERR(prog)) + trace_bpf_prog_get_type(prog); + return prog; +} + /* last field in 'union bpf_attr' used by this command */ #define BPF_PROG_LOAD_LAST_FIELD prog_target_ifindex diff --git a/net/core/dev.c b/net/core/dev.c index 10cde58d3275..30b5fe32c525 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7157,7 +7157,11 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, __dev_xdp_attached(dev, bpf_op, NULL)) return -EBUSY; - prog = 
bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP); + if (bpf_op == ops->ndo_bpf) + prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP, + dev); + else + prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP); if (IS_ERR(prog)) return PTR_ERR(prog); } -- cgit v1.2.3 From b37a530613104aa3f592376c67a462823298759c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 3 Nov 2017 13:56:30 -0700 Subject: bpf: remove old offload/analyzer Thanks to the ability to load a program for a specific device, running verifier twice is no longer needed. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: David S. Miller --- include/linux/bpf_verifier.h | 5 --- kernel/bpf/verifier.c | 75 -------------------------------------------- net/core/filter.c | 42 ------------------------- 3 files changed, 122 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index e45011dbc02d..07b96aaca256 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -152,9 +152,7 @@ struct bpf_verifier_env { bool strict_alignment; /* perform strict pointer alignment checks */ struct bpf_verifier_state *cur_state; /* current verifier state */ struct bpf_verifier_state_list **explored_states; /* search pruning optimization */ - const struct bpf_ext_analyzer_ops *analyzer_ops; /* external analyzer ops */ const struct bpf_ext_analyzer_ops *dev_ops; /* device analyzer ops */ - void *analyzer_priv; /* pointer to external analyzer's private data */ struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ u32 used_map_cnt; /* number of used maps */ u32 id_gen; /* used to generate unique reg IDs */ @@ -179,7 +177,4 @@ int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env) } #endif -int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, - void *priv); - #endif /* _LINUX_BPF_VERIFIER_H */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 51aabb32ad67..add845fe788a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -949,9 +949,6 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, */ *reg_type = info.reg_type; - if (env->analyzer_ops) - return 0; - env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size; /* remember the offset of last byte accessed in ctx */ if (env->prog->aux->max_ctx_offset < off + size) @@ -3736,9 +3733,6 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) static int ext_analyzer_insn_hook(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx) { - if (env->analyzer_ops && env->analyzer_ops->insn_hook) - return env->analyzer_ops->insn_hook(env, insn_idx, - prev_insn_idx); if (env->dev_ops && env->dev_ops->insn_hook) return env->dev_ops->insn_hook(env, insn_idx, prev_insn_idx); @@ -4601,72 +4595,3 @@ err_free_env: kfree(env); return ret; } - -static const struct bpf_verifier_ops * const bpf_analyzer_ops[] = { -#ifdef CONFIG_NET - [BPF_PROG_TYPE_XDP] = &xdp_analyzer_ops, - [BPF_PROG_TYPE_SCHED_CLS] = &tc_cls_act_analyzer_ops, -#endif -}; - -int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, - void *priv) -{ - struct bpf_verifier_env *env; - int ret; - - if (prog->type >= ARRAY_SIZE(bpf_analyzer_ops) || - !bpf_analyzer_ops[prog->type]) - return -EOPNOTSUPP; - - env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL); - if (!env) - return -ENOMEM; - - env->insn_aux_data = vzalloc(sizeof(struct bpf_insn_aux_data) * - prog->len); - ret = -ENOMEM; - if 
(!env->insn_aux_data) - goto err_free_env; - env->prog = prog; - env->ops = bpf_analyzer_ops[env->prog->type]; - env->analyzer_ops = ops; - env->analyzer_priv = priv; - - /* grab the mutex to protect few globals used by verifier */ - mutex_lock(&bpf_verifier_lock); - - env->strict_alignment = false; - if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) - env->strict_alignment = true; - - env->explored_states = kcalloc(env->prog->len, - sizeof(struct bpf_verifier_state_list *), - GFP_KERNEL); - ret = -ENOMEM; - if (!env->explored_states) - goto skip_full_check; - - ret = check_cfg(env); - if (ret < 0) - goto skip_full_check; - - env->allow_ptr_leaks = capable(CAP_SYS_ADMIN); - - ret = do_check(env); - if (env->cur_state) { - free_verifier_state(env->cur_state, true); - env->cur_state = NULL; - } - -skip_full_check: - while (!pop_stack(env, NULL, NULL)); - free_states(env); - - mutex_unlock(&bpf_verifier_lock); - vfree(env->insn_aux_data); -err_free_env: - kfree(env); - return ret; -} -EXPORT_SYMBOL_GPL(bpf_analyzer); diff --git a/net/core/filter.c b/net/core/filter.c index a0112168d6f9..1afa17935954 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3777,25 +3777,6 @@ static bool tc_cls_act_is_valid_access(int off, int size, return bpf_skb_is_valid_access(off, size, type, info); } -static bool -tc_cls_act_is_valid_access_analyzer(int off, int size, - enum bpf_access_type type, - struct bpf_insn_access_aux *info) -{ - switch (off) { - case offsetof(struct sk_buff, len): - return true; - case offsetof(struct sk_buff, data): - info->reg_type = PTR_TO_PACKET; - return true; - case offsetof(struct sk_buff, cb) + - offsetof(struct bpf_skb_data_end, data_end): - info->reg_type = PTR_TO_PACKET_END; - return true; - } - return false; -} - static bool __is_valid_xdp_access(int off, int size) { if (off < 0 || off >= sizeof(struct xdp_md)) @@ -3830,21 +3811,6 @@ static bool xdp_is_valid_access(int off, int size, return __is_valid_xdp_access(off, size); } -static bool xdp_is_valid_access_analyzer(int off, int size, - enum bpf_access_type type, - struct bpf_insn_access_aux *info) -{ - switch (off) { - case offsetof(struct xdp_buff, data): - info->reg_type = PTR_TO_PACKET; - return true; - case offsetof(struct xdp_buff, data_end): - info->reg_type = PTR_TO_PACKET_END; - return true; - } - return false; -} - void bpf_warn_invalid_xdp_action(u32 act) { const u32 act_max = XDP_REDIRECT; @@ -4516,10 +4482,6 @@ const struct bpf_verifier_ops tc_cls_act_verifier_ops = { .gen_prologue = tc_cls_act_prologue, }; -const struct bpf_verifier_ops tc_cls_act_analyzer_ops = { - .is_valid_access = tc_cls_act_is_valid_access_analyzer, -}; - const struct bpf_prog_ops tc_cls_act_prog_ops = { .test_run = bpf_prog_test_run_skb, }; @@ -4530,10 +4492,6 @@ const struct bpf_verifier_ops xdp_verifier_ops = { .convert_ctx_access = xdp_convert_ctx_access, }; -const struct bpf_verifier_ops xdp_analyzer_ops = { - .is_valid_access = xdp_is_valid_access_analyzer, -}; - const struct bpf_prog_ops xdp_prog_ops = { .test_run = bpf_prog_test_run_xdp, }; -- cgit v1.2.3 From 99feaafcdb566e8f032e7acc2a303713ad6bf196 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 3 Nov 2017 19:05:20 -0400 Subject: net: dsa: make switch index unsigned Define the DSA switch index as an unsigned int, because it will never be less than 0. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. 
Miller --- include/net/dsa.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include')
diff --git a/include/net/dsa.h b/include/net/dsa.h index 50e276dc4c01..fa1c21ab8092 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -209,7 +209,7 @@ struct dsa_switch { * Parent switch tree, and switch index. */ struct dsa_switch_tree *dst; - int index; + unsigned int index; /* Listener for switch fabric events */ struct notifier_block nb; -- cgit v1.2.3
From 49463b7f2da1a115404b02c5533bc2c2125833a3 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 3 Nov 2017 19:05:21 -0400 Subject: net: dsa: make tree index unsigned Similarly to a DSA switch and port, rename the tree index from "tree" to "index" and make it an unsigned int because it isn't supposed to be less than 0. u32 is an OF-specific type used to retrieve the value and does not need to be propagated up to the tree index. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 2 +- net/dsa/dsa2.c | 14 +++++++------- net/dsa/slave.c | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include')
diff --git a/include/net/dsa.h b/include/net/dsa.h index fa1c21ab8092..e54332968417 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -116,7 +116,7 @@ struct dsa_switch_tree { struct raw_notifier_head nh; /* Tree identifier */ - u32 tree; + unsigned int index; /* Number of switches attached to this tree */ struct kref refcount;
diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 797d1156b4e6..8b68dc2f5707 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -27,12 +27,12 @@ static DEFINE_MUTEX(dsa2_mutex); static const struct devlink_ops dsa_devlink_ops = { }; -static struct dsa_switch_tree *dsa_get_dst(u32 tree) +static struct dsa_switch_tree *dsa_get_dst(unsigned int index) { struct dsa_switch_tree *dst; list_for_each_entry(dst, &dsa_switch_trees, list) - if (dst->tree == tree) { + if (dst->index == index) { kref_get(&dst->refcount); return dst; } @@ -53,14 +53,14 @@ static void dsa_put_dst(struct dsa_switch_tree *dst) kref_put(&dst->refcount, dsa_free_dst); } -static struct dsa_switch_tree *dsa_add_dst(u32 tree) +static struct dsa_switch_tree *dsa_add_dst(unsigned int index) { struct dsa_switch_tree *dst; dst = kzalloc(sizeof(*dst), GFP_KERNEL); if (!dst) return NULL; - dst->tree = tree; + dst->index = index; INIT_LIST_HEAD(&dst->list); list_add_tail(&dsa_switch_trees, &dst->list); kref_init(&dst->refcount); @@ -454,7 +454,7 @@ static void dsa_dst_unapply(struct dsa_switch_tree *dst) dst->cpu_dp = NULL; - pr_info("DSA: tree %d unapplied\n", dst->tree); + pr_info("DSA: tree %d unapplied\n", dst->index); dst->applied = false; } @@ -504,7 +504,7 @@ static int dsa_ds_parse(struct dsa_switch_tree *dst, struct dsa_switch *ds) } - pr_info("DSA: switch %d %d parsed\n", dst->tree, ds->index); + pr_info("DSA: switch %d %d parsed\n", dst->index, ds->index); return 0; } @@ -549,7 +549,7 @@ static int dsa_dst_parse(struct dsa_switch_tree *dst) } } - pr_info("DSA: tree %d parsed\n", dst->tree); + pr_info("DSA: tree %d parsed\n", dst->index); return 0; }
diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 9b75d0ac4092..814ced75a0cc 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -55,7 +55,7 @@ void dsa_slave_mii_bus_init(struct dsa_switch *ds) ds->slave_mii_bus->read = dsa_slave_phy_read; ds->slave_mii_bus->write = dsa_slave_phy_write; snprintf(ds->slave_mii_bus->id, MII_BUS_ID_SIZE, "dsa-%d.%d", - ds->dst->tree, ds->index); + ds->dst->index,
ds->index); ds->slave_mii_bus->parent = ds->dev; ds->slave_mii_bus->phy_mask = ~ds->phys_mii_mask; } -- cgit v1.2.3
From 1f2556916d974cfb62b6af51660186b5f58bd869 Mon Sep 17 00:00:00 2001 From: Priyaranjan Jha Date: Fri, 3 Nov 2017 16:38:48 -0700 Subject: tcp: higher throughput under reordering with adaptive RACK reordering wnd Currently TCP RACK loss detection does not work well if packets are being reordered beyond its static reordering window (min_rtt/4). Under such reordering it may falsely trigger loss recoveries and reduce TCP throughput significantly. This patch improves that by increasing and reducing the reordering window based on DSACK, which is now supported in major TCP implementations. It makes RACK's reo_wnd adaptive based on DSACK and no. of recoveries. - If DSACK is received, increment reo_wnd by min_rtt/4 (upper bounded by srtt), since there is a possibility that the spurious retransmission was due to a reordering delay longer than reo_wnd. - Persist the current reo_wnd value for TCP_RACK_RECOVERY_THRESH (16) no. of successful recoveries (accounts for full DSACK-based loss recovery undo). After that, reset it to default (min_rtt/4). - At max, reo_wnd is incremented only once per rtt, so that the new DSACK we are reacting to is due to the spurious retx (approx) sent after the reo_wnd was last updated. - reo_wnd is tracked in terms of steps (of min_rtt/4), rather than absolute value to account for change in rtt. In our internal testing, we observed a significant increase in throughput, in scenarios where reordering exceeds min_rtt/4 (previous static value). Signed-off-by: Priyaranjan Jha Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 1 + include/linux/tcp.h | 9 +++++-- include/net/tcp.h | 2 ++ net/ipv4/tcp.c | 1 + net/ipv4/tcp_input.c | 7 +++++ net/ipv4/tcp_minisocks.c | 4 +++ net/ipv4/tcp_recovery.c | 48 ++++++++++++++++++++++++++++++++-- 7 files changed, 68 insertions(+), 4 deletions(-) (limited to 'include')
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index e6661b205f72..54410a1d4065 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -454,6 +454,7 @@ tcp_recovery - INTEGER RACK: 0x1 enables the RACK loss detection for fast detection of lost retransmissions and tail drops. + RACK: 0x2 makes RACK's reordering window static (min_rtt/4). Default: 0x1
diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 8c431385b272..22f40c96a15b 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -210,8 +210,13 @@ struct tcp_sock { u64 mstamp; /* (Re)sent time of the skb */ u32 rtt_us; /* Associated RTT */ u32 end_seq; /* Ending TCP sequence of the skb */ - u8 advanced; /* mstamp advanced since last lost marking */ - u8 reord; /* reordering detected */ + u32 last_delivered; /* tp->delivered at last reo_wnd adj */ + u8 reo_wnd_steps; /* Allowed reordering window */ +#define TCP_RACK_RECOVERY_THRESH 16 + u8 reo_wnd_persist:5, /* No.
of recovery since last adj */ + dsack_seen:1, /* Whether DSACK seen after last adj */ + advanced:1, /* mstamp advanced since last lost marking */ + reord:1; /* reordering detected */ } rack; u16 advmss; /* Advertised MSS */ u32 chrono_start; /* Start time in jiffies of a TCP chrono */ diff --git a/include/net/tcp.h b/include/net/tcp.h index c2bf2a822b10..babfd4da1515 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -246,6 +246,7 @@ extern int sysctl_tcp_wmem[3]; extern int sysctl_tcp_rmem[3]; #define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */ +#define TCP_RACK_STATIC_REO_WND 0x2 /* Use static RACK reo wnd */ extern atomic_long_t tcp_memory_allocated; extern struct percpu_counter tcp_sockets_allocated; @@ -1901,6 +1902,7 @@ extern void tcp_rack_mark_lost(struct sock *sk); extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq, u64 xmit_time); extern void tcp_rack_reo_timeout(struct sock *sk); +extern void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs); /* At how many usecs into the future should the RTO fire? */ static inline s64 tcp_rto_delta_us(const struct sock *sk) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index a7a0f316eb86..c4cb19ed4628 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -447,6 +447,7 @@ void tcp_init_sock(struct sock *sk) tcp_assign_congestion_control(sk); tp->tsoffset = 0; + tp->rack.reo_wnd_steps = 1; sk->sk_state = TCP_CLOSE; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8393b405ea98..0ada8bfc2ebd 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -856,6 +856,7 @@ void tcp_disable_fack(struct tcp_sock *tp) static void tcp_dsack_seen(struct tcp_sock *tp) { tp->rx_opt.sack_ok |= TCP_DSACK_SEEN; + tp->rack.dsack_seen = 1; } static void tcp_update_reordering(struct sock *sk, const int metric, @@ -2408,6 +2409,8 @@ static bool tcp_try_undo_recovery(struct sock *sk) mib_idx = LINUX_MIB_TCPFULLUNDO; NET_INC_STATS(sock_net(sk), mib_idx); + } else if (tp->rack.reo_wnd_persist) { + tp->rack.reo_wnd_persist--; } if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) { /* Hold old state until something *above* high_seq @@ -2427,6 +2430,8 @@ static bool tcp_try_undo_dsack(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); if (tp->undo_marker && !tp->undo_retrans) { + tp->rack.reo_wnd_persist = min(TCP_RACK_RECOVERY_THRESH, + tp->rack.reo_wnd_persist + 1); DBGUNDO(sk, "D-SACK"); tcp_undo_cwnd_reduction(sk, false); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKUNDO); @@ -3644,6 +3649,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked, &sack_state); + tcp_rack_update_reo_wnd(sk, &rs); + if (tp->tlp_high_seq) tcp_process_tlp_ack(sk, ack, flag); /* If needed, reset TLP/RTO timer; RACK may later override this. 
*/ diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 3c65c1a3f944..4bb86580decd 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -551,6 +551,10 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->syn_data_acked = 0; newtp->rack.mstamp = 0; newtp->rack.advanced = 0; + newtp->rack.reo_wnd_steps = 1; + newtp->rack.last_delivered = 0; + newtp->rack.reo_wnd_persist = 0; + newtp->rack.dsack_seen = 0; __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS); }
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index ac3e9c6d3a3d..d3ea89020c69 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -44,6 +44,7 @@ static bool tcp_rack_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2) static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout) { struct tcp_sock *tp = tcp_sk(sk); + u32 min_rtt = tcp_min_rtt(tp); struct sk_buff *skb, *n; u32 reo_wnd; @@ -54,8 +55,10 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout) * to queuing or delayed ACKs. */ reo_wnd = 1000; - if ((tp->rack.reord || !tp->lost_out) && tcp_min_rtt(tp) != ~0U) - reo_wnd = max(tcp_min_rtt(tp) >> 2, reo_wnd); + if ((tp->rack.reord || !tp->lost_out) && min_rtt != ~0U) { + reo_wnd = max((min_rtt >> 2) * tp->rack.reo_wnd_steps, reo_wnd); + reo_wnd = min(reo_wnd, tp->srtt_us >> 3); + } list_for_each_entry_safe(skb, n, &tp->tsorted_sent_queue, tcp_tsorted_anchor) { @@ -160,3 +163,44 @@ void tcp_rack_reo_timeout(struct sock *sk) if (inet_csk(sk)->icsk_pending != ICSK_TIME_RETRANS) tcp_rearm_rto(sk); } + +/* Updates the RACK's reo_wnd based on DSACK and no. of recoveries. + * + * If DSACK is received, increment reo_wnd by min_rtt/4 (upper bounded + * by srtt), since there is possibility that spurious retransmission was + * due to reordering delay longer than reo_wnd. + * + * Persist the current reo_wnd value for TCP_RACK_RECOVERY_THRESH (16) + * no. of successful recoveries (accounts for full DSACK-based loss + * recovery undo). After that, reset it to default (min_rtt/4). + * + * At max, reo_wnd is incremented only once per rtt. So that the new + * DSACK on which we are reacting, is due to the spurious retx (approx) + * after the reo_wnd has been updated last time. + * + * reo_wnd is tracked in terms of steps (of min_rtt/4), rather than + * absolute value to account for change in rtt. + */ +void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_STATIC_REO_WND || + !rs->prior_delivered) + return; + + /* Disregard DSACK if a rtt has not passed since we adjusted reo_wnd */ + if (before(rs->prior_delivered, tp->rack.last_delivered)) + tp->rack.dsack_seen = 0; + + /* Adjust the reo_wnd if update is pending */ + if (tp->rack.dsack_seen) { + tp->rack.reo_wnd_steps = min_t(u32, 0xFF, + tp->rack.reo_wnd_steps + 1); + tp->rack.dsack_seen = 0; + tp->rack.last_delivered = tp->delivered; + tp->rack.reo_wnd_persist = TCP_RACK_RECOVERY_THRESH; + } else if (!tp->rack.reo_wnd_persist) { + tp->rack.reo_wnd_steps = 1; + } +} -- cgit v1.2.3
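As background for the RACK patch above, the window computation can be restated outside the kernel as a minimal sketch; this is not part of the series, the function name, plain types and microsecond units are illustrative, and the guard conditions (reordering seen, min_rtt valid) are elided:

static unsigned int rack_reo_wnd_us(unsigned int min_rtt,
                                    unsigned int srtt,
                                    unsigned int steps)
{
        unsigned int reo_wnd = 1000;            /* 1 ms floor for delayed ACKs */
        unsigned int grown = (min_rtt / 4) * steps;

        if (grown > reo_wnd)                    /* grow in DSACK-driven steps */
                reo_wnd = grown;
        if (reo_wnd > srtt)                     /* never beyond the smoothed RTT */
                reo_wnd = srtt;
        return reo_wnd;
}

Each DSACK (counted at most once per round trip) bumps steps by one; steps falls back to 1 once reo_wnd_persist, set to 16 on each DSACK, has been consumed by successful recoveries without another DSACK.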
From ecf8fecb7828648cba0e42de7464a7e600c93459 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Sun, 5 Nov 2017 08:15:31 -0500 Subject: device_cgroup: prepare code for bpf-based device controller This is a non-functional change to prepare the device cgroup code for adding an eBPF-based controller for cgroups v2. The patch performs the following changes: 1) __devcgroup_inode_permission() and devcgroup_inode_mknod() are moved to device_cgroup.h and converted into static inlines. 2) __devcgroup_check_permission() is exported. 3) A devcgroup_check_permission() wrapper is introduced, to be used by both the existing and the new bpf-based implementations. Signed-off-by: Roman Gushchin Acked-by: Tejun Heo Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/device_cgroup.h | 61 ++++++++++++++++++++++++++++++++++++++++--- security/device_cgroup.c | 47 ++------------------------------- 2 files changed, 59 insertions(+), 49 deletions(-) (limited to 'include')
diff --git a/include/linux/device_cgroup.h b/include/linux/device_cgroup.h index cdbc344a92e4..2d93d7ecd479 100644 --- a/include/linux/device_cgroup.h +++ b/include/linux/device_cgroup.h @@ -1,17 +1,70 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include +#define DEVCG_ACC_MKNOD 1 +#define DEVCG_ACC_READ 2 +#define DEVCG_ACC_WRITE 4 +#define DEVCG_ACC_MASK (DEVCG_ACC_MKNOD | DEVCG_ACC_READ | DEVCG_ACC_WRITE) + +#define DEVCG_DEV_BLOCK 1 +#define DEVCG_DEV_CHAR 2 +#define DEVCG_DEV_ALL 4 /* this represents all devices */ + +#ifdef CONFIG_CGROUP_DEVICE +extern int __devcgroup_check_permission(short type, u32 major, u32 minor, + short access); +#else +static inline int __devcgroup_check_permission(short type, u32 major, u32 minor, + short access) +{ return 0; } +#endif + #ifdef CONFIG_CGROUP_DEVICE -extern int __devcgroup_inode_permission(struct inode *inode, int mask); -extern int devcgroup_inode_mknod(int mode, dev_t dev); +static inline int devcgroup_check_permission(short type, u32 major, u32 minor, + short access) +{ + return __devcgroup_check_permission(type, major, minor, access); +} + static inline int devcgroup_inode_permission(struct inode *inode, int mask) { + short type, access = 0; + if (likely(!inode->i_rdev)) return 0; - if (!S_ISBLK(inode->i_mode) && !S_ISCHR(inode->i_mode)) + + if (S_ISBLK(inode->i_mode)) + type = DEVCG_DEV_BLOCK; + else if (S_ISCHR(inode->i_mode)) + type = DEVCG_DEV_CHAR; + else return 0; - return __devcgroup_inode_permission(inode, mask); + + if (mask & MAY_WRITE) + access |= DEVCG_ACC_WRITE; + if (mask & MAY_READ) + access |= DEVCG_ACC_READ; + + return devcgroup_check_permission(type, imajor(inode), iminor(inode), + access); } + +static inline int devcgroup_inode_mknod(int mode, dev_t dev) +{ + short type; + + if (!S_ISBLK(mode) && !S_ISCHR(mode)) + return 0; + + if (S_ISBLK(mode)) + type = DEVCG_DEV_BLOCK; + else + type = DEVCG_DEV_CHAR; + + return devcgroup_check_permission(type, MAJOR(dev), MINOR(dev), + DEVCG_ACC_MKNOD); +} + #else static inline int devcgroup_inode_permission(struct inode *inode, int mask) { return 0; }
diff --git a/security/device_cgroup.c b/security/device_cgroup.c index 968c21557ba7..c65b39bafdfe 100644 --- a/security/device_cgroup.c +++ b/security/device_cgroup.c @@ -15,15 +15,6 @@ #include #include -#define DEVCG_ACC_MKNOD 1 -#define DEVCG_ACC_READ 2 -#define DEVCG_ACC_WRITE 4 -#define DEVCG_ACC_MASK (DEVCG_ACC_MKNOD | DEVCG_ACC_READ | DEVCG_ACC_WRITE) - -#define DEVCG_DEV_BLOCK 1 -#define DEVCG_DEV_CHAR 2 -#define DEVCG_DEV_ALL 4 /* this represents all devices */ - static DEFINE_MUTEX(devcgroup_mutex); enum devcg_behavior { @@ -810,8 +801,8 @@ struct cgroup_subsys devices_cgrp_subsys = { * * returns 0 on success, -EPERM case the operation is not permitted */ -static int __devcgroup_check_permission(short type, u32 major, u32 minor, - short access) +int __devcgroup_check_permission(short type, u32 major, u32 minor, + short access) { struct dev_cgroup *dev_cgroup; bool rc; @@ -833,37 +824,3 @@ static int __devcgroup_check_permission(short type, u32 major, u32 minor, return 0; } - -int __devcgroup_inode_permission(struct inode *inode, int mask) -{ - short type, access = 0; - - if (S_ISBLK(inode->i_mode)) - type = DEVCG_DEV_BLOCK; - if (S_ISCHR(inode->i_mode)) - type = DEVCG_DEV_CHAR; - if (mask & MAY_WRITE) - access |= DEVCG_ACC_WRITE; - if (mask & MAY_READ) - access |= DEVCG_ACC_READ; - - return __devcgroup_check_permission(type, imajor(inode), iminor(inode), - access); -} - -int devcgroup_inode_mknod(int mode, dev_t dev) -{ - short type; - - if (!S_ISBLK(mode) && !S_ISCHR(mode)) - return 0; - - if (S_ISBLK(mode)) - type = DEVCG_DEV_BLOCK; - else - type = DEVCG_DEV_CHAR; - - return __devcgroup_check_permission(type, MAJOR(dev), MINOR(dev), - DEVCG_ACC_MKNOD); - -} -- cgit v1.2.3
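To make the refactored entry points above concrete, a minimal in-kernel caller-side sketch; the function and the example device numbers are illustrative only, while the wrapper and the DEVCG_* constants come from the patch:

/* Hypothetical sketch: how a mknod check flows through the new static
 * inline wrapper after this patch. Character device 1:3 is /dev/null.
 */
#include <linux/device_cgroup.h>
#include <linux/stat.h>
#include <linux/kdev_t.h>

static int example_mknod_check(void)
{
        dev_t dev = MKDEV(1, 3);

        /* Expands to devcgroup_check_permission(DEVCG_DEV_CHAR, 1, 3,
         * DEVCG_ACC_MKNOD) and returns 0 or -EPERM.
         */
        return devcgroup_inode_mknod(S_IFCHR | 0666, dev);
}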
From ebc614f687369f9df99828572b1d85a7c2de3d92 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Sun, 5 Nov 2017 08:15:32 -0500 Subject: bpf, cgroup: implement eBPF-based device controller for cgroup v2 Cgroup v2 lacks the device controller provided by cgroup v1. This patch adds a new eBPF program type which, in combination with the previously added ability to attach multiple eBPF programs to a cgroup, provides similar functionality, but with some additional flexibility. This patch introduces a BPF_PROG_TYPE_CGROUP_DEVICE program type. A program takes major and minor device numbers, device type (block/character) and access type (mknod/read/write) as parameters and returns an integer which defines whether the operation should be allowed or terminated with -EPERM. Signed-off-by: Roman Gushchin Acked-by: Alexei Starovoitov Acked-by: Tejun Heo Cc: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/bpf-cgroup.h | 15 ++++++++++ include/linux/bpf_types.h | 3 ++ include/linux/device_cgroup.h | 8 ++++- include/uapi/linux/bpf.h | 15 ++++++++++ kernel/bpf/cgroup.c | 67 ++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 7 +++++ kernel/bpf/verifier.c | 1 + tools/include/uapi/linux/bpf.h | 15 ++++++++++ 8 files changed, 130 insertions(+), 1 deletion(-) (limited to 'include')
diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 87a7db9feb38..a7f16e0f8d68 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -67,6 +67,9 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, struct bpf_sock_ops_kern *sock_ops, enum bpf_attach_type type); +int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, + short access, enum bpf_attach_type type); + /* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled.
*/ #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb) \ ({ \ @@ -112,6 +115,17 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, } \ __ret; \ }) + +#define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type, major, minor, access) \ +({ \ + int __ret = 0; \ + if (cgroup_bpf_enabled) \ + __ret = __cgroup_bpf_check_dev_permission(type, major, minor, \ + access, \ + BPF_CGROUP_DEVICE); \ + \ + __ret; \ +}) #else struct cgroup_bpf {}; @@ -122,6 +136,7 @@ static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; } #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; }) #endif /* CONFIG_CGROUP_BPF */ diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 53c5b9ad7220..978c1d9c9383 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -19,6 +19,9 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe) BPF_PROG_TYPE(BPF_PROG_TYPE_TRACEPOINT, tracepoint) BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event) #endif +#ifdef CONFIG_CGROUP_BPF +BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev) +#endif BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops) diff --git a/include/linux/device_cgroup.h b/include/linux/device_cgroup.h index 2d93d7ecd479..8557efe096dc 100644 --- a/include/linux/device_cgroup.h +++ b/include/linux/device_cgroup.h @@ -1,5 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include +#include #define DEVCG_ACC_MKNOD 1 #define DEVCG_ACC_READ 2 @@ -19,10 +20,15 @@ static inline int __devcgroup_check_permission(short type, u32 major, u32 minor, { return 0; } #endif -#ifdef CONFIG_CGROUP_DEVICE +#if defined(CONFIG_CGROUP_DEVICE) || defined(CONFIG_CGROUP_BPF) static inline int devcgroup_check_permission(short type, u32 major, u32 minor, short access) { + int rc = BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type, major, minor, access); + + if (rc) + return -EPERM; + return __devcgroup_check_permission(type, major, minor, access); } diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4455dd195201..e880ae6434ee 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -132,6 +132,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_LWT_XMIT, BPF_PROG_TYPE_SOCK_OPS, BPF_PROG_TYPE_SK_SKB, + BPF_PROG_TYPE_CGROUP_DEVICE, }; enum bpf_attach_type { @@ -141,6 +142,7 @@ enum bpf_attach_type { BPF_CGROUP_SOCK_OPS, BPF_SK_SKB_STREAM_PARSER, BPF_SK_SKB_STREAM_VERDICT, + BPF_CGROUP_DEVICE, __MAX_BPF_ATTACH_TYPE }; @@ -991,4 +993,17 @@ struct bpf_perf_event_value { __u64 running; }; +#define BPF_DEVCG_ACC_MKNOD (1ULL << 0) +#define BPF_DEVCG_ACC_READ (1ULL << 1) +#define BPF_DEVCG_ACC_WRITE (1ULL << 2) + +#define BPF_DEVCG_DEV_BLOCK (1ULL << 0) +#define BPF_DEVCG_DEV_CHAR (1ULL << 1) + +struct bpf_cgroup_dev_ctx { + __u32 access_type; /* (access << 16) | type */ + __u32 major; + __u32 minor; +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 3db5a17fcfe8..b789ab78d28f 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -522,3 +522,70 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, return ret == 1 ? 
0 : -EPERM; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops); + +int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, + short access, enum bpf_attach_type type) +{ + struct cgroup *cgrp; + struct bpf_cgroup_dev_ctx ctx = { + .access_type = (access << 16) | dev_type, + .major = major, + .minor = minor, + }; + int allow = 1; + + rcu_read_lock(); + cgrp = task_dfl_cgroup(current); + allow = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, + BPF_PROG_RUN); + rcu_read_unlock(); + + return !allow; +} +EXPORT_SYMBOL(__cgroup_bpf_check_dev_permission); + +static const struct bpf_func_proto * +cgroup_dev_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_map_lookup_elem: + return &bpf_map_lookup_elem_proto; + case BPF_FUNC_map_update_elem: + return &bpf_map_update_elem_proto; + case BPF_FUNC_map_delete_elem: + return &bpf_map_delete_elem_proto; + case BPF_FUNC_get_current_uid_gid: + return &bpf_get_current_uid_gid_proto; + case BPF_FUNC_trace_printk: + if (capable(CAP_SYS_ADMIN)) + return bpf_get_trace_printk_proto(); + default: + return NULL; + } +} + +static bool cgroup_dev_is_valid_access(int off, int size, + enum bpf_access_type type, + struct bpf_insn_access_aux *info) +{ + if (type == BPF_WRITE) + return false; + + if (off < 0 || off + size > sizeof(struct bpf_cgroup_dev_ctx)) + return false; + /* The verifier guarantees that size > 0. */ + if (off % size != 0) + return false; + if (size != sizeof(__u32)) + return false; + + return true; +} + +const struct bpf_prog_ops cg_dev_prog_ops = { +}; + +const struct bpf_verifier_ops cg_dev_verifier_ops = { + .get_func_proto = cgroup_dev_func_proto, + .is_valid_access = cgroup_dev_is_valid_access, +}; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 416d70cdfc76..09badc37e864 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1326,6 +1326,9 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_CGROUP_SOCK_OPS: ptype = BPF_PROG_TYPE_SOCK_OPS; break; + case BPF_CGROUP_DEVICE: + ptype = BPF_PROG_TYPE_CGROUP_DEVICE; + break; case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: return sockmap_get_from_fd(attr, true); @@ -1378,6 +1381,9 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_CGROUP_SOCK_OPS: ptype = BPF_PROG_TYPE_SOCK_OPS; break; + case BPF_CGROUP_DEVICE: + ptype = BPF_PROG_TYPE_CGROUP_DEVICE; + break; case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: return sockmap_get_from_fd(attr, false); @@ -1420,6 +1426,7 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_INET_EGRESS: case BPF_CGROUP_INET_SOCK_CREATE: case BPF_CGROUP_SOCK_OPS: + case BPF_CGROUP_DEVICE: break; default: return -EINVAL; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index add845fe788a..4a942e2e753d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3124,6 +3124,7 @@ static int check_return_code(struct bpf_verifier_env *env) case BPF_PROG_TYPE_CGROUP_SKB: case BPF_PROG_TYPE_CGROUP_SOCK: case BPF_PROG_TYPE_SOCK_OPS: + case BPF_PROG_TYPE_CGROUP_DEVICE: break; default: return 0; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index e92f62cf933a..b280f37cd057 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -131,6 +131,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_LWT_XMIT, BPF_PROG_TYPE_SOCK_OPS, BPF_PROG_TYPE_SK_SKB, + BPF_PROG_TYPE_CGROUP_DEVICE, }; enum bpf_attach_type { @@ -140,6 +141,7 @@ enum bpf_attach_type { BPF_CGROUP_SOCK_OPS, 
BPF_SK_SKB_STREAM_PARSER, BPF_SK_SKB_STREAM_VERDICT, + BPF_CGROUP_DEVICE, __MAX_BPF_ATTACH_TYPE }; @@ -990,4 +992,17 @@ struct bpf_perf_event_value { __u64 running; }; +#define BPF_DEVCG_ACC_MKNOD (1ULL << 0) +#define BPF_DEVCG_ACC_READ (1ULL << 1) +#define BPF_DEVCG_ACC_WRITE (1ULL << 2) + +#define BPF_DEVCG_DEV_BLOCK (1ULL << 0) +#define BPF_DEVCG_DEV_CHAR (1ULL << 1) + +struct bpf_cgroup_dev_ctx { + __u32 access_type; /* (access << 16) | type */ + __u32 major; + __u32 minor; +}; + #endif /* _UAPI__LINUX_BPF_H__ */ -- cgit v1.2.3
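A minimal sketch of a program for the BPF_PROG_TYPE_CGROUP_DEVICE type introduced above; the SEC() helper and section name follow common BPF loader conventions rather than anything mandated by the patch, while the context layout and the return semantics (1 allows the access, 0 rejects it with -EPERM) come from the patch itself:

#include <linux/bpf.h>

#define SEC(name) __attribute__((section(name), used))

SEC("cgroup/dev")
int allow_null_only(struct bpf_cgroup_dev_ctx *ctx)
{
        /* access_type packs (access << 16) | device type */
        __u32 type = ctx->access_type & 0xffff;
        __u32 access = ctx->access_type >> 16;

        /* Permit any access to character device 1:3 (/dev/null). */
        if (type == BPF_DEVCG_DEV_CHAR &&
            ctx->major == 1 && ctx->minor == 3)
                return 1;

        (void)access;   /* could be matched against BPF_DEVCG_ACC_* bits */
        return 0;
}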
From 5caaed151a68ae36aca2981cc245f5960a0a7603 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 2 Nov 2017 19:41:09 +0100 Subject: netfilter: conntrack: don't cache nlattr_tuple_size result in nla_size We currently call ->nlattr_tuple_size() once at register time and cache the result in l4proto->nla_size. nla_size is the only member that is written to; avoiding this would allow making the l4proto trackers const. We can use ->nlattr_tuple_size() at run time, and cache the result in the individual trackers instead. This is an intermediate step; the next patch removes the nlattr_size() callback and computes the size at compile time, then removes nla_size. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_l4proto.h | 4 ++-- net/ipv4/netfilter/nf_conntrack_proto_icmp.c | 9 +++++++-- net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c | 9 +++++++-- net/netfilter/nf_conntrack_core.c | 9 +++++++-- net/netfilter/nf_conntrack_netlink.c | 10 +++++++--- net/netfilter/nf_conntrack_proto.c | 2 -- net/netfilter/nf_conntrack_proto_tcp.c | 9 +++++++-- 7 files changed, 37 insertions(+), 15 deletions(-) (limited to 'include')
diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h index e06518874144..46e786ffcf2f 100644 --- a/include/net/netfilter/nf_conntrack_l4proto.h +++ b/include/net/netfilter/nf_conntrack_l4proto.h @@ -74,7 +74,7 @@ struct nf_conntrack_l4proto { int (*tuple_to_nlattr)(struct sk_buff *skb, const struct nf_conntrack_tuple *t); /* Calculate tuple nlattr size */ - int (*nlattr_tuple_size)(void); + unsigned int (*nlattr_tuple_size)(void); int (*nlattr_to_tuple)(struct nlattr *tb[], struct nf_conntrack_tuple *t); const struct nla_policy *nla_policy; @@ -144,7 +144,7 @@ int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple); int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[], struct nf_conntrack_tuple *t); -int nf_ct_port_nlattr_tuple_size(void); +unsigned int nf_ct_port_nlattr_tuple_size(void); extern const struct nla_policy nf_ct_port_nla_policy[]; #ifdef CONFIG_SYSCTL
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index 8969420cecc3..1849fedd9b81 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -258,9 +258,14 @@ static int icmp_nlattr_to_tuple(struct nlattr *tb[], return 0; } -static int icmp_nlattr_tuple_size(void) +static unsigned int icmp_nlattr_tuple_size(void) { - return nla_policy_len(icmp_nla_policy, CTA_PROTO_MAX + 1); + static unsigned int size __read_mostly; + + if (!size) + size = nla_policy_len(icmp_nla_policy, CTA_PROTO_MAX + 1); + + return size; } #endif
diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c index dca921df28e1..3ac0d826afc4 100644 --- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c @@ -259,9 +259,14 @@ static int icmpv6_nlattr_to_tuple(struct nlattr *tb[], return 0; } -static int icmpv6_nlattr_tuple_size(void) +static unsigned int icmpv6_nlattr_tuple_size(void) { - return nla_policy_len(icmpv6_nla_policy, CTA_PROTO_MAX + 1); + static unsigned int size __read_mostly; + + if (!size) + size = nla_policy_len(icmpv6_nla_policy, CTA_PROTO_MAX + 1); + + return size; } #endif
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 28e675150853..0e516947c16f 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1563,9 +1563,14 @@ int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[], } EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_to_tuple); -int nf_ct_port_nlattr_tuple_size(void) +unsigned int nf_ct_port_nlattr_tuple_size(void) { - return nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1); + static unsigned int size __read_mostly; + + if (!size) + size = nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1); + + return size; } EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_tuple_size); #endif
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index de4053d84364..6e0adfefb9ed 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -533,11 +533,11 @@ nla_put_failure: return -1; } -static inline size_t ctnetlink_proto_size(const struct nf_conn *ct) +static size_t ctnetlink_proto_size(const struct nf_conn *ct) { const struct nf_conntrack_l3proto *l3proto; const struct nf_conntrack_l4proto *l4proto; - size_t len; + size_t len, len4 = 0; l3proto = __nf_ct_l3proto_find(nf_ct_l3num(ct)); len = l3proto->nla_size; @@ -545,8 +545,12 @@ static inline size_t ctnetlink_proto_size(const struct nf_conn *ct) l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); len += l4proto->nla_size; + if (l4proto->nlattr_tuple_size) { + len4 = l4proto->nlattr_tuple_size(); + len4 *= 3u; /* ORIG, REPLY, MASTER */ + } - return len; + return len + len4; } static inline size_t ctnetlink_acct_size(const struct nf_conn *ct)
diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index 83f739e9dc08..3b06ff3f2dee 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -398,8 +398,6 @@ int nf_ct_l4proto_register_one(struct nf_conntrack_l4proto *l4proto) l4proto->nla_size = 0; if (l4proto->nlattr_size) l4proto->nla_size += l4proto->nlattr_size(); - if (l4proto->nlattr_tuple_size) - l4proto->nla_size += 3 * l4proto->nlattr_tuple_size(); rcu_assign_pointer(nf_ct_protos[l4proto->l3proto][l4proto->l4proto], l4proto);
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 8f283294d70f..b12fc07111d0 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -1277,9 +1277,14 @@ static int tcp_nlattr_size(void) + nla_policy_len(tcp_nla_policy, CTA_PROTOINFO_TCP_MAX + 1); } -static int tcp_nlattr_tuple_size(void) +static unsigned int tcp_nlattr_tuple_size(void) { - return nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1); + static unsigned int size __read_mostly; + + if (!size) + size = nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1); + + return size; } #endif -- cgit v1.2.3
From ba0e4d9917b43dfa746cbbcb4477da59aae73bd6 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 9 Oct 2017 19:52:28 +0200 Subject: netfilter: nf_tables: get set elements via netlink This patch adds a new get operation to look up
specific elements in a set via netlink interface. You can also use it to check if an interval already exists. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 5 ++ net/netfilter/nf_tables_api.c | 184 ++++++++++++++++++++++++++------------ net/netfilter/nft_set_bitmap.c | 18 ++++ net/netfilter/nft_set_hash.c | 39 ++++++++ net/netfilter/nft_set_rbtree.c | 73 +++++++++++++++ 5 files changed, 264 insertions(+), 55 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 0f5b12a4ad09..d011e56cc7a9 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -311,6 +311,7 @@ struct nft_expr; * @flush: deactivate element in the next generation * @remove: remove element from set * @walk: iterate over all set elemeennts + * @get: get set elements * @privsize: function to return size of set private data * @init: initialize private data of new set instance * @destroy: destroy private data of set instance @@ -350,6 +351,10 @@ struct nft_set_ops { void (*walk)(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_iter *iter); + void * (*get)(const struct net *net, + const struct nft_set *set, + const struct nft_set_elem *elem, + unsigned int flags); unsigned int (*privsize)(const struct nlattr * const nla[], const struct nft_set_desc *desc); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 3b4a0739ee39..1d66be0d8ef7 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3586,45 +3586,6 @@ static int nf_tables_dump_set_done(struct netlink_callback *cb) return 0; } -static int nf_tables_getsetelem(struct net *net, struct sock *nlsk, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack) -{ - u8 genmask = nft_genmask_cur(net); - const struct nft_set *set; - struct nft_ctx ctx; - int err; - - err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, genmask); - if (err < 0) - return err; - - set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET], - genmask); - if (IS_ERR(set)) - return PTR_ERR(set); - - if (nlh->nlmsg_flags & NLM_F_DUMP) { - struct netlink_dump_control c = { - .dump = nf_tables_dump_set, - .done = nf_tables_dump_set_done, - }; - struct nft_set_dump_ctx *dump_ctx; - - dump_ctx = kmalloc(sizeof(*dump_ctx), GFP_KERNEL); - if (!dump_ctx) - return -ENOMEM; - - dump_ctx->set = set; - dump_ctx->ctx = ctx; - - c.data = dump_ctx; - return netlink_dump_start(nlsk, skb, nlh, &c); - } - return -EOPNOTSUPP; -} - static int nf_tables_fill_setelem_info(struct sk_buff *skb, const struct nft_ctx *ctx, u32 seq, u32 portid, int event, u16 flags, @@ -3670,6 +3631,135 @@ nla_put_failure: return -1; } +static int nft_setelem_parse_flags(const struct nft_set *set, + const struct nlattr *attr, u32 *flags) +{ + if (attr == NULL) + return 0; + + *flags = ntohl(nla_get_be32(attr)); + if (*flags & ~NFT_SET_ELEM_INTERVAL_END) + return -EINVAL; + if (!(set->flags & NFT_SET_INTERVAL) && + *flags & NFT_SET_ELEM_INTERVAL_END) + return -EINVAL; + + return 0; +} + +static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set, + const struct nlattr *attr) +{ + struct nlattr *nla[NFTA_SET_ELEM_MAX + 1]; + const struct nft_set_ext *ext; + struct nft_data_desc desc; + struct nft_set_elem elem; + struct sk_buff *skb; + uint32_t flags = 0; + void *priv; + int err; + + err = nla_parse_nested(nla, NFTA_SET_ELEM_MAX, attr, + nft_set_elem_policy, NULL); + if 
(err < 0) + return err; + + if (!nla[NFTA_SET_ELEM_KEY]) + return -EINVAL; + + err = nft_setelem_parse_flags(set, nla[NFTA_SET_ELEM_FLAGS], &flags); + if (err < 0) + return err; + + err = nft_data_init(ctx, &elem.key.val, sizeof(elem.key), &desc, + nla[NFTA_SET_ELEM_KEY]); + if (err < 0) + return err; + + err = -EINVAL; + if (desc.type != NFT_DATA_VALUE || desc.len != set->klen) + return err; + + priv = set->ops->get(ctx->net, set, &elem, flags); + if (IS_ERR(priv)) + return PTR_ERR(priv); + + elem.priv = priv; + ext = nft_set_elem_ext(set, &elem); + + err = -ENOMEM; + skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (skb == NULL) + goto err1; + + err = nf_tables_fill_setelem_info(skb, ctx, ctx->seq, ctx->portid, + NFT_MSG_NEWSETELEM, 0, set, &elem); + if (err < 0) + goto err2; + + err = nfnetlink_unicast(skb, ctx->net, ctx->portid, MSG_DONTWAIT); + /* This avoids a loop in nfnetlink. */ + if (err < 0) + goto err1; + + return 0; +err2: + kfree_skb(skb); +err1: + /* this avoids a loop in nfnetlink. */ + return err == -EAGAIN ? -ENOBUFS : err; +} + +static int nf_tables_getsetelem(struct net *net, struct sock *nlsk, + struct sk_buff *skb, const struct nlmsghdr *nlh, + const struct nlattr * const nla[], + struct netlink_ext_ack *extack) +{ + u8 genmask = nft_genmask_cur(net); + struct nft_set *set; + struct nlattr *attr; + struct nft_ctx ctx; + int rem, err = 0; + + err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, genmask); + if (err < 0) + return err; + + set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET], + genmask); + if (IS_ERR(set)) + return PTR_ERR(set); + + if (nlh->nlmsg_flags & NLM_F_DUMP) { + struct netlink_dump_control c = { + .dump = nf_tables_dump_set, + .done = nf_tables_dump_set_done, + }; + struct nft_set_dump_ctx *dump_ctx; + + dump_ctx = kmalloc(sizeof(*dump_ctx), GFP_KERNEL); + if (!dump_ctx) + return -ENOMEM; + + dump_ctx->set = set; + dump_ctx->ctx = ctx; + + c.data = dump_ctx; + return netlink_dump_start(nlsk, skb, nlh, &c); + } + + if (!nla[NFTA_SET_ELEM_LIST_ELEMENTS]) + return -EINVAL; + + nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) { + err = nft_get_set_elem(&ctx, set, attr); + if (err < 0) + break; + } + + return err; +} + static void nf_tables_setelem_notify(const struct nft_ctx *ctx, const struct nft_set *set, const struct nft_set_elem *elem, @@ -3770,22 +3860,6 @@ static void nf_tables_set_elem_destroy(const struct nft_set *set, void *elem) kfree(elem); } -static int nft_setelem_parse_flags(const struct nft_set *set, - const struct nlattr *attr, u32 *flags) -{ - if (attr == NULL) - return 0; - - *flags = ntohl(nla_get_be32(attr)); - if (*flags & ~NFT_SET_ELEM_INTERVAL_END) - return -EINVAL; - if (!(set->flags & NFT_SET_INTERVAL) && - *flags & NFT_SET_ELEM_INTERVAL_END) - return -EINVAL; - - return 0; -} - static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, const struct nlattr *attr, u32 nlmsg_flags) { diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c index 734989c40579..45fb2752fb63 100644 --- a/net/netfilter/nft_set_bitmap.c +++ b/net/netfilter/nft_set_bitmap.c @@ -106,6 +106,23 @@ nft_bitmap_elem_find(const struct nft_set *set, struct nft_bitmap_elem *this, return NULL; } +static void *nft_bitmap_get(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem, unsigned int flags) +{ + const struct nft_bitmap *priv = nft_set_priv(set); + u8 genmask = nft_genmask_cur(net); + struct nft_bitmap_elem *be; + + list_for_each_entry_rcu(be, &priv->list, 
head) { + if (memcmp(nft_set_ext_key(&be->ext), elem->key.val.data, set->klen) || + !nft_set_elem_active(&be->ext, genmask)) + continue; + + return be; + } + return ERR_PTR(-ENOENT); +} + static int nft_bitmap_insert(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, struct nft_set_ext **ext) @@ -294,6 +311,7 @@ static struct nft_set_ops nft_bitmap_ops __read_mostly = { .activate = nft_bitmap_activate, .lookup = nft_bitmap_lookup, .walk = nft_bitmap_walk, + .get = nft_bitmap_get, }; static struct nft_set_type nft_bitmap_type __read_mostly = { diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 650677f1e539..c68a7e0fcf1e 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -95,6 +95,24 @@ static bool nft_rhash_lookup(const struct net *net, const struct nft_set *set, return !!he; } +static void *nft_rhash_get(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem, unsigned int flags) +{ + struct nft_rhash *priv = nft_set_priv(set); + struct nft_rhash_elem *he; + struct nft_rhash_cmp_arg arg = { + .genmask = nft_genmask_cur(net), + .set = set, + .key = elem->key.val.data, + }; + + he = rhashtable_lookup_fast(&priv->ht, &arg, nft_rhash_params); + if (he != NULL) + return he; + + return ERR_PTR(-ENOENT); +} + static bool nft_rhash_update(struct nft_set *set, const u32 *key, void *(*new)(struct nft_set *, const struct nft_expr *, @@ -409,6 +427,24 @@ static bool nft_hash_lookup(const struct net *net, const struct nft_set *set, return false; } +static void *nft_hash_get(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem, unsigned int flags) +{ + struct nft_hash *priv = nft_set_priv(set); + u8 genmask = nft_genmask_cur(net); + struct nft_hash_elem *he; + u32 hash; + + hash = jhash(elem->key.val.data, set->klen, priv->seed); + hash = reciprocal_scale(hash, priv->buckets); + hlist_for_each_entry_rcu(he, &priv->table[hash], node) { + if (!memcmp(nft_set_ext_key(&he->ext), elem->key.val.data, set->klen) && + nft_set_elem_active(&he->ext, genmask)) + return he; + } + return ERR_PTR(-ENOENT); +} + /* nft_hash_select_ops() makes sure key size can be either 2 or 4 bytes . 
*/ static inline u32 nft_hash_key(const u32 *key, u32 klen) { @@ -600,6 +636,7 @@ static struct nft_set_ops nft_rhash_ops __read_mostly = { .lookup = nft_rhash_lookup, .update = nft_rhash_update, .walk = nft_rhash_walk, + .get = nft_rhash_get, .features = NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT, }; @@ -617,6 +654,7 @@ static struct nft_set_ops nft_hash_ops __read_mostly = { .remove = nft_hash_remove, .lookup = nft_hash_lookup, .walk = nft_hash_walk, + .get = nft_hash_get, .features = NFT_SET_MAP | NFT_SET_OBJECT, }; @@ -634,6 +672,7 @@ static struct nft_set_ops nft_hash_fast_ops __read_mostly = { .remove = nft_hash_remove, .lookup = nft_hash_lookup_fast, .walk = nft_hash_walk, + .get = nft_hash_get, .features = NFT_SET_MAP | NFT_SET_OBJECT, }; diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index d83a4ec5900d..e6f08bc5f359 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -113,6 +113,78 @@ static bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set, return ret; } +static bool __nft_rbtree_get(const struct net *net, const struct nft_set *set, + const u32 *key, struct nft_rbtree_elem **elem, + unsigned int seq, unsigned int flags, u8 genmask) +{ + struct nft_rbtree_elem *rbe, *interval = NULL; + struct nft_rbtree *priv = nft_set_priv(set); + const struct rb_node *parent; + const void *this; + int d; + + parent = rcu_dereference_raw(priv->root.rb_node); + while (parent != NULL) { + if (read_seqcount_retry(&priv->count, seq)) + return false; + + rbe = rb_entry(parent, struct nft_rbtree_elem, node); + + this = nft_set_ext_key(&rbe->ext); + d = memcmp(this, key, set->klen); + if (d < 0) { + parent = rcu_dereference_raw(parent->rb_left); + interval = rbe; + } else if (d > 0) { + parent = rcu_dereference_raw(parent->rb_right); + } else { + if (!nft_set_elem_active(&rbe->ext, genmask)) + parent = rcu_dereference_raw(parent->rb_left); + + if (!nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) || + (*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END) == + (flags & NFT_SET_ELEM_INTERVAL_END)) { + *elem = rbe; + return true; + } + return false; + } + } + + if (set->flags & NFT_SET_INTERVAL && interval != NULL && + nft_set_elem_active(&interval->ext, genmask) && + !nft_rbtree_interval_end(interval)) { + *elem = interval; + return true; + } + + return false; +} + +static void *nft_rbtree_get(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem, unsigned int flags) +{ + struct nft_rbtree *priv = nft_set_priv(set); + unsigned int seq = read_seqcount_begin(&priv->count); + struct nft_rbtree_elem *rbe = ERR_PTR(-ENOENT); + const u32 *key = (const u32 *)&elem->key.val; + u8 genmask = nft_genmask_cur(net); + bool ret; + + ret = __nft_rbtree_get(net, set, key, &rbe, seq, flags, genmask); + if (ret || !read_seqcount_retry(&priv->count, seq)) + return rbe; + + read_lock_bh(&priv->lock); + seq = read_seqcount_begin(&priv->count); + ret = __nft_rbtree_get(net, set, key, &rbe, seq, flags, genmask); + if (!ret) + rbe = ERR_PTR(-ENOENT); + read_unlock_bh(&priv->lock); + + return rbe; +} + static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, struct nft_rbtree_elem *new, struct nft_set_ext **ext) @@ -336,6 +408,7 @@ static struct nft_set_ops nft_rbtree_ops __read_mostly = { .activate = nft_rbtree_activate, .lookup = nft_rbtree_lookup, .walk = nft_rbtree_walk, + .get = nft_rbtree_get, .features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT, }; -- cgit v1.2.3 From 
3928ee6485a316c8abde7e24c7f82033a1c8d3ae Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Thu, 2 Nov 2017 00:49:18 +0100 Subject: net: phy: leds: Add support for "link" trigger Currently, we create a LED trigger for any link speed known to a PHY. These triggers only fire when their exact link speed has been negotiated (they aren't cumulative, that is, they don't fire for "their or any higher" link speed). What we are missing, however, is a trigger which will fire on any link speed known to the PHY. Such a trigger can then be used to implement a poor man's substitute for the "link" LED on boards that lack it. Let's add it. Signed-off-by: Maciej S. Szmigiero Signed-off-by: David S. Miller --- drivers/net/phy/Kconfig | 7 +++++-- drivers/net/phy/phy_led_triggers.c | 43 +++++++++++++++++++++++++++++++++++--- include/linux/phy.h | 2 ++ 3 files changed, 47 insertions(+), 5 deletions(-) (limited to 'include')
diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig index 8125412c8814..bdfbabb86ee0 100644 --- a/drivers/net/phy/Kconfig +++ b/drivers/net/phy/Kconfig @@ -191,11 +191,14 @@ config LED_TRIGGER_PHY Adds support for a set of LED trigger events per-PHY. Link state change will trigger the events, for consumption by an LED class driver. There are triggers for each link speed currently - supported by the phy, and are of the form: + supported by the PHY and also a one common "link" trigger as a + logical-or of all the link speed ones. + All these triggers are named according to the following pattern: :: Where speed is in the form: - Mbps or Gbps + Mbps OR Gbps OR link + for any speed known to the PHY. comment "MII PHY device drivers"
diff --git a/drivers/net/phy/phy_led_triggers.c b/drivers/net/phy/phy_led_triggers.c index c736f29b3b2a..39ecad25b201 100644 --- a/drivers/net/phy/phy_led_triggers.c +++ b/drivers/net/phy/phy_led_triggers.c @@ -31,6 +31,7 @@ static void phy_led_trigger_no_link(struct phy_device *phy) { if (phy->last_triggered) { led_trigger_event(&phy->last_triggered->trigger, LED_OFF); + led_trigger_event(&phy->led_link_trigger->trigger, LED_OFF); phy->last_triggered = NULL; } } @@ -54,6 +55,10 @@ void phy_led_trigger_change_speed(struct phy_device *phy) } if (plt != phy->last_triggered) { + if (!phy->last_triggered) + led_trigger_event(&phy->led_link_trigger->trigger, + LED_FULL); + led_trigger_event(&phy->last_triggered->trigger, LED_OFF); led_trigger_event(&plt->trigger, LED_FULL); phy->last_triggered = plt; @@ -61,6 +66,13 @@ void phy_led_trigger_change_speed(struct phy_device *phy) } EXPORT_SYMBOL_GPL(phy_led_trigger_change_speed); +static void phy_led_trigger_format_name(struct phy_device *phy, char *buf, + size_t size, char *suffix) +{ + snprintf(buf, size, PHY_ID_FMT ":%s", + phy->mdio.bus->id, phy->mdio.addr, suffix); +} + static int phy_led_trigger_register(struct phy_device *phy, struct phy_led_trigger *plt, unsigned int speed) @@ -77,8 +89,8 @@ static int phy_led_trigger_register(struct phy_device *phy, snprintf(name_suffix, sizeof(name_suffix), "%dGbps", DIV_ROUND_CLOSEST(speed, 1000)); - snprintf(plt->name, sizeof(plt->name), PHY_ID_FMT ":%s", - phy->mdio.bus->id, phy->mdio.addr, name_suffix); + phy_led_trigger_format_name(phy, plt->name, sizeof(plt->name), + name_suffix); plt->trigger.name = plt->name; return led_trigger_register(&plt->trigger); @@ -99,13 +111,30 @@ int phy_led_triggers_register(struct phy_device *phy) if (!phy->phy_num_led_triggers) return 0; + phy->led_link_trigger = devm_kzalloc(&phy->mdio.dev, + sizeof(*phy->led_link_trigger), + GFP_KERNEL); + if (!phy->led_link_trigger) { + err = -ENOMEM; + goto out_clear; + } + + phy_led_trigger_format_name(phy, phy->led_link_trigger->name, + sizeof(phy->led_link_trigger->name), + "link"); + phy->led_link_trigger->trigger.name = phy->led_link_trigger->name; + + err = led_trigger_register(&phy->led_link_trigger->trigger); + if (err) + goto out_free_link; + phy->phy_led_triggers = devm_kzalloc(&phy->mdio.dev, sizeof(struct phy_led_trigger) * phy->phy_num_led_triggers, GFP_KERNEL); if (!phy->phy_led_triggers) { err = -ENOMEM; - goto out_clear; + goto out_unreg_link; } for (i = 0; i < phy->phy_num_led_triggers; i++) { @@ -123,6 +152,11 @@ out_unreg: while (i--) phy_led_trigger_unregister(&phy->phy_led_triggers[i]); devm_kfree(&phy->mdio.dev, phy->phy_led_triggers); +out_unreg_link: + phy_led_trigger_unregister(phy->led_link_trigger); +out_free_link: + devm_kfree(&phy->mdio.dev, phy->led_link_trigger); + phy->led_link_trigger = NULL; out_clear: phy->phy_num_led_triggers = 0; return err; @@ -135,5 +169,8 @@ void phy_led_triggers_unregister(struct phy_device *phy) for (i = 0; i < phy->phy_num_led_triggers; i++) phy_led_trigger_unregister(&phy->phy_led_triggers[i]); + + if (phy->led_link_trigger) + phy_led_trigger_unregister(phy->led_link_trigger); } EXPORT_SYMBOL_GPL(phy_led_triggers_unregister);
diff --git a/include/linux/phy.h b/include/linux/phy.h index d78cd01ea513..dc82a07cb4fd 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -451,6 +451,8 @@ struct phy_device { struct phy_led_trigger *phy_led_triggers; unsigned int phy_num_led_triggers; struct phy_led_trigger *last_triggered; + + struct phy_led_trigger *led_link_trigger; #endif /* -- cgit v1.2.3
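The next two patches rework ILA's checksum-neutral translation. As background, a self-contained sketch of the underlying arithmetic: TCP/UDP checksums are one's-complement sums over the pseudo-header, so rewriting the 64-bit locator can be compensated by folding the locator difference into one 16-bit identifier word. This is illustrative only; the names are hypothetical and the equality holds modulo the two one's-complement encodings of zero:

#include <stdint.h>

/* Fold a 32-bit accumulator to 16 bits, one's-complement style. */
static uint16_t csum_fold16(uint32_t sum)
{
        while (sum >> 16)
                sum = (sum & 0xffff) + (sum >> 16);
        return (uint16_t)sum;
}

/* addr[0..3] hold the locator, addr[4..7] the identifier. Rewrite the
 * locator and compensate in addr[7] so the one's-complement sum of the
 * whole address (what the pseudo-header checksum sees) is unchanged.
 */
static void rewrite_locator_neutral(uint16_t addr[8],
                                    const uint16_t new_loc[4])
{
        uint32_t adj = addr[7];
        int i;

        for (i = 0; i < 4; i++) {
                adj += addr[i];                 /* add old locator word */
                adj += 0xffff - new_loc[i];     /* subtract new locator word */
                addr[i] = new_loc[i];
        }
        addr[7] = csum_fold16(adj);
}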
From 84287bb3285634b60c55c00a1d5ed843b44fde92 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Sun, 5 Nov 2017 15:58:23 -0800 Subject: ila: add checksum neutral map auto Add checksum neutral auto that performs checksum neutral mapping without using the C-bit. This is enabled by configuration of a mapping. The checksum neutral function has been split into ila_csum_do_neutral_fmt and ila_csum_do_neutral_nofmt. The former handles the C-bit and includes it in the adjustment value. The latter sets the adjustment value based on the locator diff only. Added configuration for checksum neutral map auto in ila_lwt and ila_xlat. Signed-off-by: Tom Herbert Signed-off-by: David S.
Miller --- include/uapi/linux/ila.h | 1 + net/ipv6/ila/ila_common.c | 65 ++++++++++++++++++++++++++++------------------- net/ipv6/ila/ila_lwt.c | 29 +++++++++++---------- net/ipv6/ila/ila_xlat.c | 10 +++++--- 4 files changed, 61 insertions(+), 44 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/ila.h b/include/uapi/linux/ila.h index f54853288f99..0744881dcef3 100644 --- a/include/uapi/linux/ila.h +++ b/include/uapi/linux/ila.h @@ -41,6 +41,7 @@ enum { ILA_CSUM_ADJUST_TRANSPORT, ILA_CSUM_NEUTRAL_MAP, ILA_CSUM_NO_ACTION, + ILA_CSUM_NEUTRAL_MAP_AUTO, }; #endif /* _UAPI_LINUX_ILA_H */ diff --git a/net/ipv6/ila/ila_common.c b/net/ipv6/ila/ila_common.c index f1d9248d8b86..8c88ecf29b93 100644 --- a/net/ipv6/ila/ila_common.c +++ b/net/ipv6/ila/ila_common.c @@ -37,8 +37,8 @@ static __wsum get_csum_diff(struct ipv6hdr *ip6h, struct ila_params *p) return get_csum_diff_iaddr(ila_a2i(&ip6h->daddr), p); } -static void ila_csum_do_neutral(struct ila_addr *iaddr, - struct ila_params *p) +static void ila_csum_do_neutral_fmt(struct ila_addr *iaddr, + struct ila_params *p) { __sum16 *adjust = (__force __sum16 *)&iaddr->ident.v16[3]; __wsum diff, fval; @@ -60,13 +60,23 @@ static void ila_csum_do_neutral(struct ila_addr *iaddr, iaddr->ident.csum_neutral ^= 1; } -static void ila_csum_adjust_transport(struct sk_buff *skb, +static void ila_csum_do_neutral_nofmt(struct ila_addr *iaddr, struct ila_params *p) { + __sum16 *adjust = (__force __sum16 *)&iaddr->ident.v16[3]; __wsum diff; - struct ipv6hdr *ip6h = ipv6_hdr(skb); - struct ila_addr *iaddr = ila_a2i(&ip6h->daddr); + + diff = get_csum_diff_iaddr(iaddr, p); + + *adjust = ~csum_fold(csum_add(diff, csum_unfold(*adjust))); +} + +static void ila_csum_adjust_transport(struct sk_buff *skb, + struct ila_params *p) +{ size_t nhoff = sizeof(struct ipv6hdr); + struct ipv6hdr *ip6h = ipv6_hdr(skb); + __wsum diff; switch (ip6h->nexthdr) { case NEXTHDR_TCP: @@ -105,36 +115,39 @@ static void ila_csum_adjust_transport(struct sk_buff *skb, } break; } - - /* Now change destination address */ - iaddr->loc = p->locator; } void ila_update_ipv6_locator(struct sk_buff *skb, struct ila_params *p, - bool set_csum_neutral) + bool sir2ila) { struct ipv6hdr *ip6h = ipv6_hdr(skb); struct ila_addr *iaddr = ila_a2i(&ip6h->daddr); - /* First deal with the transport checksum */ - if (ila_csum_neutral_set(iaddr->ident)) { - /* C-bit is set in the locator indicating that this - * is a locator being translated to a SIR address. - * Perform (receiver) checksum-neutral translation. - */ - if (!set_csum_neutral) - ila_csum_do_neutral(iaddr, p); - } else { - switch (p->csum_mode) { - case ILA_CSUM_ADJUST_TRANSPORT: - ila_csum_adjust_transport(skb, p); - break; - case ILA_CSUM_NEUTRAL_MAP: - ila_csum_do_neutral(iaddr, p); - break; - case ILA_CSUM_NO_ACTION: + switch (p->csum_mode) { + case ILA_CSUM_ADJUST_TRANSPORT: + ila_csum_adjust_transport(skb, p); + break; + case ILA_CSUM_NEUTRAL_MAP: + if (sir2ila) { + if (WARN_ON(ila_csum_neutral_set(iaddr->ident))) { + /* Checksum flag should never be + * set in a formatted SIR address. + */ + break; + } + } else if (!ila_csum_neutral_set(iaddr->ident)) { + /* ILA to SIR translation and C-bit isn't + * set so we're good. 
+ */ break; } + ila_csum_do_neutral_fmt(iaddr, p); + break; + case ILA_CSUM_NEUTRAL_MAP_AUTO: + ila_csum_do_neutral_nofmt(iaddr, p); + break; + case ILA_CSUM_NO_ACTION: + break; } /* Now change destination address */ diff --git a/net/ipv6/ila/ila_lwt.c b/net/ipv6/ila/ila_lwt.c index 696281b4bca2..104af07d83a6 100644 --- a/net/ipv6/ila/ila_lwt.c +++ b/net/ipv6/ila/ila_lwt.c @@ -127,6 +127,7 @@ static int ila_build_state(struct nlattr *nla, struct lwtunnel_state *newts; const struct fib6_config *cfg6 = cfg; struct ila_addr *iaddr; + u8 csum_mode = ILA_CSUM_NO_ACTION; int ret; if (family != AF_INET6) @@ -139,15 +140,6 @@ static int ila_build_state(struct nlattr *nla, return -EINVAL; } - iaddr = (struct ila_addr *)&cfg6->fc_dst; - - if (!ila_addr_is_ila(iaddr) || ila_csum_neutral_set(iaddr->ident)) { - /* Don't allow translation for a non-ILA address or checksum - * neutral flag to be set. - */ - return -EINVAL; - } - ret = nla_parse_nested(tb, ILA_ATTR_MAX, nla, ila_nl_policy, extack); if (ret < 0) return ret; @@ -155,6 +147,19 @@ static int ila_build_state(struct nlattr *nla, if (!tb[ILA_ATTR_LOCATOR]) return -EINVAL; + iaddr = (struct ila_addr *)&cfg6->fc_dst; + + if (tb[ILA_ATTR_CSUM_MODE]) + csum_mode = nla_get_u8(tb[ILA_ATTR_CSUM_MODE]); + + if (csum_mode == ILA_CSUM_NEUTRAL_MAP && + ila_csum_neutral_set(iaddr->ident)) { + /* Don't allow translation if checksum neutral bit is + * configured and it's set in the SIR address. + */ + return -EINVAL; + } + newts = lwtunnel_state_alloc(sizeof(*ilwt)); if (!newts) return -ENOMEM; @@ -168,17 +173,13 @@ static int ila_build_state(struct nlattr *nla, p = ila_params_lwtunnel(newts); + p->csum_mode = csum_mode; p->locator.v64 = (__force __be64)nla_get_u64(tb[ILA_ATTR_LOCATOR]); /* Precompute checksum difference for translation since we * know both the old locator and the new one. 
*/ p->locator_match = iaddr->loc; - p->csum_diff = compute_csum_diff8( - (__be32 *)&p->locator_match, (__be32 *)&p->locator); - - if (tb[ILA_ATTR_CSUM_MODE]) - p->csum_mode = nla_get_u8(tb[ILA_ATTR_CSUM_MODE]); ila_init_saved_csum(p); diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c index 3123b9de91b5..213259629e66 100644 --- a/net/ipv6/ila/ila_xlat.c +++ b/net/ipv6/ila/ila_xlat.c @@ -138,6 +138,8 @@ static int parse_nl_config(struct genl_info *info, if (info->attrs[ILA_ATTR_CSUM_MODE]) xp->ip.csum_mode = nla_get_u8(info->attrs[ILA_ATTR_CSUM_MODE]); + else + xp->ip.csum_mode = ILA_CSUM_NO_ACTION; if (info->attrs[ILA_ATTR_IFINDEX]) xp->ifindex = nla_get_s32(info->attrs[ILA_ATTR_IFINDEX]); @@ -198,7 +200,7 @@ static void ila_free_cb(void *ptr, void *arg) } } -static int ila_xlat_addr(struct sk_buff *skb, bool set_csum_neutral); +static int ila_xlat_addr(struct sk_buff *skb, bool sir2ila); static unsigned int ila_nf_input(void *priv, @@ -396,7 +398,7 @@ static int ila_fill_info(struct ila_map *ila, struct sk_buff *msg) (__force u64)ila->xp.ip.locator_match.v64, ILA_ATTR_PAD) || nla_put_s32(msg, ILA_ATTR_IFINDEX, ila->xp.ifindex) || - nla_put_u32(msg, ILA_ATTR_CSUM_MODE, ila->xp.ip.csum_mode)) + nla_put_u8(msg, ILA_ATTR_CSUM_MODE, ila->xp.ip.csum_mode)) return -1; return 0; @@ -607,7 +609,7 @@ static struct pernet_operations ila_net_ops = { .size = sizeof(struct ila_net), }; -static int ila_xlat_addr(struct sk_buff *skb, bool set_csum_neutral) +static int ila_xlat_addr(struct sk_buff *skb, bool sir2ila) { struct ila_map *ila; struct ipv6hdr *ip6h = ipv6_hdr(skb); @@ -626,7 +628,7 @@ static int ila_xlat_addr(struct sk_buff *skb, bool set_csum_neutral) ila = ila_lookup_wildcards(iaddr, skb->dev->ifindex, ilan); if (ila) - ila_update_ipv6_locator(skb, &ila->xp.ip, set_csum_neutral); + ila_update_ipv6_locator(skb, &ila->xp.ip, sir2ila); rcu_read_unlock(); -- cgit v1.2.3 From 70d5aef48a421a68bd9d1bf8f8267af406681580 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Sun, 5 Nov 2017 15:58:24 -0800 Subject: ila: allow configuration of identifier type Allow the identifier type to be explicitly configured for a mapping. This can either be one of the identifier types specified in the ILA draft or the value ILA_ATYPE_USE_FORMAT, which means the identifier type is inferred from the identifier type field. If a value other than ILA_ATYPE_USE_FORMAT is set for a mapping then it is assumed that the identifier type field is not present in an identifier. Signed-off-by: Tom Herbert Signed-off-by: David S.
Miller --- include/uapi/linux/ila.h | 13 ++++++++++++ net/ipv6/ila/ila.h | 12 +----------- net/ipv6/ila/ila_lwt.c | 51 +++++++++++++++++++++++++++++++++++++++++------- net/ipv6/ila/ila_xlat.c | 18 ++++++++++++----- 4 files changed, 71 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/ila.h b/include/uapi/linux/ila.h index 0744881dcef3..8353c78a7781 100644 --- a/include/uapi/linux/ila.h +++ b/include/uapi/linux/ila.h @@ -17,6 +17,7 @@ enum { ILA_ATTR_DIR, /* u32 */ ILA_ATTR_PAD, ILA_ATTR_CSUM_MODE, /* u8 */ + ILA_ATTR_IDENT_TYPE, /* u8 */ __ILA_ATTR_MAX, }; @@ -44,4 +45,16 @@ enum { ILA_CSUM_NEUTRAL_MAP_AUTO, }; +enum { + ILA_ATYPE_IID = 0, + ILA_ATYPE_LUID, + ILA_ATYPE_VIRT_V4, + ILA_ATYPE_VIRT_UNI_V6, + ILA_ATYPE_VIRT_MULTI_V6, + ILA_ATYPE_NONLOCAL_ADDR, + ILA_ATYPE_RSVD_1, + ILA_ATYPE_RSVD_2, + + ILA_ATYPE_USE_FORMAT = 32, /* Get type from type field in identifier */ +}; #endif /* _UAPI_LINUX_ILA_H */ diff --git a/net/ipv6/ila/ila.h b/net/ipv6/ila/ila.h index e0170f62bc39..3c7a11b62334 100644 --- a/net/ipv6/ila/ila.h +++ b/net/ipv6/ila/ila.h @@ -55,17 +55,6 @@ struct ila_identifier { }; }; -enum { - ILA_ATYPE_IID = 0, - ILA_ATYPE_LUID, - ILA_ATYPE_VIRT_V4, - ILA_ATYPE_VIRT_UNI_V6, - ILA_ATYPE_VIRT_MULTI_V6, - ILA_ATYPE_RSVD_1, - ILA_ATYPE_RSVD_2, - ILA_ATYPE_RSVD_3, -}; - #define CSUM_NEUTRAL_FLAG htonl(0x10000000) struct ila_addr { @@ -93,6 +82,7 @@ struct ila_params { struct ila_locator locator_match; __wsum csum_diff; u8 csum_mode; + u8 ident_type; }; static inline __wsum compute_csum_diff8(const __be32 *from, const __be32 *to) diff --git a/net/ipv6/ila/ila_lwt.c b/net/ipv6/ila/ila_lwt.c index 104af07d83a6..4b97d573f223 100644 --- a/net/ipv6/ila/ila_lwt.c +++ b/net/ipv6/ila/ila_lwt.c @@ -114,6 +114,7 @@ drop: static const struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = { [ILA_ATTR_LOCATOR] = { .type = NLA_U64, }, [ILA_ATTR_CSUM_MODE] = { .type = NLA_U8, }, + [ILA_ATTR_IDENT_TYPE] = { .type = NLA_U8, }, }; static int ila_build_state(struct nlattr *nla, @@ -127,19 +128,14 @@ static int ila_build_state(struct nlattr *nla, struct lwtunnel_state *newts; const struct fib6_config *cfg6 = cfg; struct ila_addr *iaddr; + u8 ident_type = ILA_ATYPE_USE_FORMAT; u8 csum_mode = ILA_CSUM_NO_ACTION; + u8 eff_ident_type; int ret; if (family != AF_INET6) return -EINVAL; - if (cfg6->fc_dst_len < 8 * sizeof(struct ila_locator) + 3) { - /* Need to have full locator and at least type field - * included in destination - */ - return -EINVAL; - } - ret = nla_parse_nested(tb, ILA_ATTR_MAX, nla, ila_nl_policy, extack); if (ret < 0) return ret; @@ -149,6 +145,41 @@ static int ila_build_state(struct nlattr *nla, iaddr = (struct ila_addr *)&cfg6->fc_dst; + if (tb[ILA_ATTR_IDENT_TYPE]) + ident_type = nla_get_u8(tb[ILA_ATTR_IDENT_TYPE]); + + if (ident_type == ILA_ATYPE_USE_FORMAT) { + /* Infer identifier type from type field in formatted + * identifier. + */ + + if (cfg6->fc_dst_len < 8 * sizeof(struct ila_locator) + 3) { + /* Need to have full locator and at least type field + * included in destination + */ + return -EINVAL; + } + + eff_ident_type = iaddr->ident.type; + } else { + eff_ident_type = ident_type; + } + + switch (eff_ident_type) { + case ILA_ATYPE_IID: + /* Don't allow ILA for IID type */ + return -EINVAL; + case ILA_ATYPE_LUID: + break; + case ILA_ATYPE_VIRT_V4: + case ILA_ATYPE_VIRT_UNI_V6: + case ILA_ATYPE_VIRT_MULTI_V6: + case ILA_ATYPE_NONLOCAL_ADDR: + /* These ILA formats are not supported yet. 
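As an editorial aside (a sketch, not the patch's code), the validation above compresses to: take the type from the configuration unless it says ILA_ATYPE_USE_FORMAT, in which case read it out of the formatted identifier; only LUID identifiers are translatable so far. A hypothetical helper showing the same logic:

/* Derive the effective identifier type and check it is supported. */
static int ila_eff_ident_type(u8 configured, const struct ila_addr *iaddr)
{
	u8 type = (configured == ILA_ATYPE_USE_FORMAT) ?
		  iaddr->ident.type : configured;

	return (type == ILA_ATYPE_LUID) ? 0 : -EINVAL;
}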
*/ + default: + return -EINVAL; + } + if (tb[ILA_ATTR_CSUM_MODE]) csum_mode = nla_get_u8(tb[ILA_ATTR_CSUM_MODE]); @@ -174,6 +205,7 @@ static int ila_build_state(struct nlattr *nla, p = ila_params_lwtunnel(newts); p->csum_mode = csum_mode; + p->ident_type = ident_type; p->locator.v64 = (__force __be64)nla_get_u64(tb[ILA_ATTR_LOCATOR]); /* Precompute checksum difference for translation since we @@ -208,9 +240,13 @@ static int ila_fill_encap_info(struct sk_buff *skb, if (nla_put_u64_64bit(skb, ILA_ATTR_LOCATOR, (__force u64)p->locator.v64, ILA_ATTR_PAD)) goto nla_put_failure; + if (nla_put_u8(skb, ILA_ATTR_CSUM_MODE, (__force u8)p->csum_mode)) goto nla_put_failure; + if (nla_put_u8(skb, ILA_ATTR_IDENT_TYPE, (__force u8)p->ident_type)) + goto nla_put_failure; + return 0; nla_put_failure: @@ -221,6 +257,7 @@ static int ila_encap_nlsize(struct lwtunnel_state *lwtstate) { return nla_total_size_64bit(sizeof(u64)) + /* ILA_ATTR_LOCATOR */ nla_total_size(sizeof(u8)) + /* ILA_ATTR_CSUM_MODE */ + nla_total_size(sizeof(u8)) + /* ILA_ATTR_IDENT_TYPE */ 0; } diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c index 213259629e66..6eb5e68f112a 100644 --- a/net/ipv6/ila/ila_xlat.c +++ b/net/ipv6/ila/ila_xlat.c @@ -121,6 +121,7 @@ static const struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = { [ILA_ATTR_LOCATOR_MATCH] = { .type = NLA_U64, }, [ILA_ATTR_IFINDEX] = { .type = NLA_U32, }, [ILA_ATTR_CSUM_MODE] = { .type = NLA_U8, }, + [ILA_ATTR_IDENT_TYPE] = { .type = NLA_U8, }, }; static int parse_nl_config(struct genl_info *info, @@ -141,6 +142,12 @@ static int parse_nl_config(struct genl_info *info, else xp->ip.csum_mode = ILA_CSUM_NO_ACTION; + if (info->attrs[ILA_ATTR_IDENT_TYPE]) + xp->ip.ident_type = nla_get_u8( + info->attrs[ILA_ATTR_IDENT_TYPE]); + else + xp->ip.ident_type = ILA_ATYPE_USE_FORMAT; + if (info->attrs[ILA_ATTR_IFINDEX]) xp->ifindex = nla_get_s32(info->attrs[ILA_ATTR_IFINDEX]); @@ -398,7 +405,8 @@ static int ila_fill_info(struct ila_map *ila, struct sk_buff *msg) (__force u64)ila->xp.ip.locator_match.v64, ILA_ATTR_PAD) || nla_put_s32(msg, ILA_ATTR_IFINDEX, ila->xp.ifindex) || - nla_put_u8(msg, ILA_ATTR_CSUM_MODE, ila->xp.ip.csum_mode) + nla_put_u8(msg, ILA_ATTR_CSUM_MODE, ila->xp.ip.csum_mode) || + nla_put_u8(msg, ILA_ATTR_IDENT_TYPE, ila->xp.ip.ident_type)) return -1; return 0; @@ -619,10 +627,10 @@ static int ila_xlat_addr(struct sk_buff *skb, bool sir2ila) /* Assumes skb contains a valid IPv6 header that is pulled */ - if (!ila_addr_is_ila(iaddr)) { - /* Type indicates this is not an ILA address */ - return 0; - } + /* No check here that ILA type in the mapping matches what is in the + * address. We assume that whatever sender gave us can be translated. + * The checksum mode however is relevant. + */ rcu_read_lock(); -- cgit v1.2.3 From fddb231ebe647749782a9ebf11106a81f7168ba7 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Sun, 5 Nov 2017 15:58:25 -0800 Subject: ila: Add a hook type for LWT routes In LWT tunnels both an input and an output route method are defined. If both of these are executed in the same path then double translation happens and the result is not correct. This patch adds a new attribute that indicates the hook type. Two values are defined, for route output and route input. ILA translation is only done for the one that is set. The default is to enable ILA on route output. Signed-off-by: Tom Herbert Signed-off-by: David S.
Miller --- include/uapi/linux/ila.h | 7 +++++++ net/ipv6/ila/ila_lwt.c | 39 ++++++++++++++++++++++++++++++++++++--- 2 files changed, 43 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/ila.h b/include/uapi/linux/ila.h index 8353c78a7781..483b77af4eb8 100644 --- a/include/uapi/linux/ila.h +++ b/include/uapi/linux/ila.h @@ -18,6 +18,7 @@ enum { ILA_ATTR_PAD, ILA_ATTR_CSUM_MODE, /* u8 */ ILA_ATTR_IDENT_TYPE, /* u8 */ + ILA_ATTR_HOOK_TYPE, /* u8 */ __ILA_ATTR_MAX, }; @@ -57,4 +58,10 @@ enum { ILA_ATYPE_USE_FORMAT = 32, /* Get type from type field in identifier */ }; + +enum { + ILA_HOOK_ROUTE_OUTPUT, + ILA_HOOK_ROUTE_INPUT, +}; + #endif /* _UAPI_LINUX_ILA_H */ diff --git a/net/ipv6/ila/ila_lwt.c b/net/ipv6/ila/ila_lwt.c index 4b97d573f223..3d56a2fb6f86 100644 --- a/net/ipv6/ila/ila_lwt.c +++ b/net/ipv6/ila/ila_lwt.c @@ -20,6 +20,7 @@ struct ila_lwt { struct ila_params p; struct dst_cache dst_cache; u32 connected : 1; + u32 lwt_output : 1; }; static inline struct ila_lwt *ila_lwt_lwtunnel( @@ -45,8 +46,10 @@ static int ila_output(struct net *net, struct sock *sk, struct sk_buff *skb) if (skb->protocol != htons(ETH_P_IPV6)) goto drop; - ila_update_ipv6_locator(skb, ila_params_lwtunnel(orig_dst->lwtstate), - true); + if (ilwt->lwt_output) + ila_update_ipv6_locator(skb, + ila_params_lwtunnel(orig_dst->lwtstate), + true); if (rt->rt6i_flags & (RTF_GATEWAY | RTF_CACHE)) { /* Already have a next hop address in route, no need for @@ -98,11 +101,15 @@ drop: static int ila_input(struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); + struct ila_lwt *ilwt = ila_lwt_lwtunnel(dst->lwtstate); if (skb->protocol != htons(ETH_P_IPV6)) goto drop; - ila_update_ipv6_locator(skb, ila_params_lwtunnel(dst->lwtstate), false); + if (!ilwt->lwt_output) + ila_update_ipv6_locator(skb, + ila_params_lwtunnel(dst->lwtstate), + false); return dst->lwtstate->orig_input(skb); @@ -115,6 +122,7 @@ static const struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = { [ILA_ATTR_LOCATOR] = { .type = NLA_U64, }, [ILA_ATTR_CSUM_MODE] = { .type = NLA_U8, }, [ILA_ATTR_IDENT_TYPE] = { .type = NLA_U8, }, + [ILA_ATTR_HOOK_TYPE] = { .type = NLA_U8, }, }; static int ila_build_state(struct nlattr *nla, @@ -129,7 +137,9 @@ static int ila_build_state(struct nlattr *nla, const struct fib6_config *cfg6 = cfg; struct ila_addr *iaddr; u8 ident_type = ILA_ATYPE_USE_FORMAT; + u8 hook_type = ILA_HOOK_ROUTE_OUTPUT; u8 csum_mode = ILA_CSUM_NO_ACTION; + bool lwt_output = true; u8 eff_ident_type; int ret; @@ -180,6 +190,20 @@ static int ila_build_state(struct nlattr *nla, return -EINVAL; } + if (tb[ILA_ATTR_HOOK_TYPE]) + hook_type = nla_get_u8(tb[ILA_ATTR_HOOK_TYPE]); + + switch (hook_type) { + case ILA_HOOK_ROUTE_OUTPUT: + lwt_output = true; + break; + case ILA_HOOK_ROUTE_INPUT: + lwt_output = false; + break; + default: + return -EINVAL; + } + if (tb[ILA_ATTR_CSUM_MODE]) csum_mode = nla_get_u8(tb[ILA_ATTR_CSUM_MODE]); @@ -202,6 +226,8 @@ static int ila_build_state(struct nlattr *nla, return ret; } + ilwt->lwt_output = !!lwt_output; + p = ila_params_lwtunnel(newts); p->csum_mode = csum_mode; @@ -236,6 +262,7 @@ static int ila_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwtstate) { struct ila_params *p = ila_params_lwtunnel(lwtstate); + struct ila_lwt *ilwt = ila_lwt_lwtunnel(lwtstate); if (nla_put_u64_64bit(skb, ILA_ATTR_LOCATOR, (__force u64)p->locator.v64, ILA_ATTR_PAD)) @@ -247,6 +274,11 @@ static int ila_fill_encap_info(struct sk_buff *skb, if (nla_put_u8(skb, ILA_ATTR_IDENT_TYPE, (__force 
u8)p->ident_type)) goto nla_put_failure; + if (nla_put_u8(skb, ILA_ATTR_HOOK_TYPE, + ilwt->lwt_output ? ILA_HOOK_ROUTE_OUTPUT : + ILA_HOOK_ROUTE_INPUT)) + goto nla_put_failure; + return 0; nla_put_failure: @@ -258,6 +290,7 @@ static int ila_encap_nlsize(struct lwtunnel_state *lwtstate) return nla_total_size_64bit(sizeof(u64)) + /* ILA_ATTR_LOCATOR */ nla_total_size(sizeof(u8)) + /* ILA_ATTR_CSUM_MODE */ nla_total_size(sizeof(u8)) + /* ILA_ATTR_IDENT_TYPE */ + nla_total_size(sizeof(u8)) + /* ILA_ATTR_HOOK_TYPE */ 0; } -- cgit v1.2.3 From 602f3baf22188aad24b9a58be3209ab774b97d74 Mon Sep 17 00:00:00 2001 From: Nogah Frankel Date: Mon, 6 Nov 2017 07:23:41 +0100 Subject: net_sch: red: Add offload ability to RED qdisc Add the ability to offload RED qdisc by using ndo_setup_tc. There are four commands for RED offloading: * TC_RED_REPLACE: handles set and change. * TC_RED_DESTROY: handles qdisc destroy. * TC_RED_STATS: updates the qdisc's counters (passed by reference). * TC_RED_XSTATS: returns RED xstats. Whether RED is offloaded is determined every time the dump action is called, because a change of this qdisc's parent could change its offload state without requiring any RED function to be called. Signed-off-by: Nogah Frankel Signed-off-by: Jiri Pirko Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + include/net/pkt_cls.h | 30 ++++++++++++++++ include/uapi/linux/pkt_sched.h | 1 + net/sched/sch_red.c | 79 ++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 111 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index fda527ccb263..71968a2ca9f3 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -777,6 +777,7 @@ enum tc_setup_type { TC_SETUP_CLSBPF, TC_SETUP_BLOCK, TC_SETUP_CBS, + TC_SETUP_QDISC_RED, }; /* These structures hold the attributes of bpf state that are being passed diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 98fef3221227..03c208d3c922 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -703,4 +703,34 @@ struct tc_cookie { u8 *data; u32 len; }; + +enum tc_red_command { + TC_RED_REPLACE, + TC_RED_DESTROY, + TC_RED_STATS, + TC_RED_XSTATS, +}; + +struct tc_red_qopt_offload_params { + u32 min; + u32 max; + u32 probability; + bool is_ecn; +}; +struct tc_red_qopt_offload_stats { + struct gnet_stats_basic_packed *bstats; + struct gnet_stats_queue *qstats; +}; + +struct tc_red_qopt_offload { + enum tc_red_command command; + u32 handle; + u32 parent; + union { + struct tc_red_qopt_offload_params set; + struct tc_red_qopt_offload_stats stats; + struct red_stats *xstats; + }; +}; + #endif diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 5002562868cc..6a2c5ea7e9c4 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -256,6 +256,7 @@ struct tc_red_qopt { #define TC_RED_ECN 1 #define TC_RED_HARDDROP 2 #define TC_RED_ADAPTATIVE 4 +#define TC_RED_OFFLOADED 8 }; struct tc_red_xstats { diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index fdfdb56aaae2..007dd8ef8aac 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -148,11 +149,37 @@ static void red_reset(struct Qdisc *sch) red_restart(&q->vars); } +static int red_offload(struct Qdisc *sch, bool enable) +{ + struct red_sched_data *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + struct tc_red_qopt_offload opt = { + .handle = 
sch->handle, + .parent = sch->parent, + }; + + if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) + return -EOPNOTSUPP; + + if (enable) { + opt.command = TC_RED_REPLACE; + opt.set.min = q->parms.qth_min >> q->parms.Wlog; + opt.set.max = q->parms.qth_max >> q->parms.Wlog; + opt.set.probability = q->parms.max_P; + opt.set.is_ecn = red_use_ecn(q); + } else { + opt.command = TC_RED_DESTROY; + } + + return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_RED, &opt); +} + static void red_destroy(struct Qdisc *sch) { struct red_sched_data *q = qdisc_priv(sch); del_timer_sync(&q->adapt_timer); + red_offload(sch, false); qdisc_destroy(q->qdisc); } @@ -219,6 +246,7 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt) red_start_of_idle_period(&q->vars); sch_tree_unlock(sch); + red_offload(sch, true); return 0; } @@ -244,6 +272,33 @@ static int red_init(struct Qdisc *sch, struct nlattr *opt) return red_change(sch, opt); } +static int red_dump_offload(struct Qdisc *sch, struct tc_red_qopt *opt) +{ + struct net_device *dev = qdisc_dev(sch); + struct tc_red_qopt_offload hw_stats = { + .handle = sch->handle, + .parent = sch->parent, + .command = TC_RED_STATS, + .stats.bstats = &sch->bstats, + .stats.qstats = &sch->qstats, + }; + int err; + + opt->flags &= ~TC_RED_OFFLOADED; + if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) + return 0; + + err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_RED, + &hw_stats); + if (err == -EOPNOTSUPP) + return 0; + + if (!err) + opt->flags |= TC_RED_OFFLOADED; + + return err; +} + static int red_dump(struct Qdisc *sch, struct sk_buff *skb) { struct red_sched_data *q = qdisc_priv(sch); @@ -257,8 +312,13 @@ static int red_dump(struct Qdisc *sch, struct sk_buff *skb) .Plog = q->parms.Plog, .Scell_log = q->parms.Scell_log, }; + int err; sch->qstats.backlog = q->qdisc->qstats.backlog; + err = red_dump_offload(sch, &opt); + if (err) + goto nla_put_failure; + opts = nla_nest_start(skb, TCA_OPTIONS); if (opts == NULL) goto nla_put_failure; @@ -275,6 +335,7 @@ nla_put_failure: static int red_dump_stats(struct Qdisc *sch, struct gnet_dump *d) { struct red_sched_data *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); struct tc_red_xstats st = { .early = q->stats.prob_drop + q->stats.forced_drop, .pdrop = q->stats.pdrop, @@ -282,6 +343,24 @@ static int red_dump_stats(struct Qdisc *sch, struct gnet_dump *d) .marked = q->stats.prob_mark + q->stats.forced_mark, }; + if (tc_can_offload(dev) && dev->netdev_ops->ndo_setup_tc) { + struct red_stats hw_stats = {0}; + struct tc_red_qopt_offload hw_stats_request = { + .handle = sch->handle, + .parent = sch->parent, + .command = TC_RED_XSTATS, + .xstats = &hw_stats, + }; + if (!dev->netdev_ops->ndo_setup_tc(dev, + TC_SETUP_QDISC_RED, + &hw_stats_request)) { + st.early += hw_stats.prob_drop + hw_stats.forced_drop; + st.pdrop += hw_stats.pdrop; + st.other += hw_stats.other; + st.marked += hw_stats.prob_mark + hw_stats.forced_mark; + } + } + return gnet_stats_copy_app(d, &st, sizeof(st)); } -- cgit v1.2.3 From 575ed7d39e2fbe602a3894bc766a8cb49af83bd3 Mon Sep 17 00:00:00 2001 From: Nogah Frankel Date: Mon, 6 Nov 2017 07:23:42 +0100 Subject: net_sch: mqprio: Change TC_SETUP_MQPRIO to TC_SETUP_QDISC_MQPRIO Change TC_SETUP_MQPRIO to TC_SETUP_QDISC_MQPRIO to match the new convention. Signed-off-by: Nogah Frankel Signed-off-by: Jiri Pirko Reviewed-by: Simon Horman Signed-off-by: David S. 
Miller --- drivers/net/ethernet/amd/xgbe/xgbe-drv.c | 2 +- drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c | 2 +- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 2 +- drivers/net/ethernet/freescale/dpaa/dpaa_eth.c | 2 +- drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c | 2 +- drivers/net/ethernet/intel/fm10k/fm10k_netdev.c | 2 +- drivers/net/ethernet/intel/i40e/i40e_main.c | 2 +- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 2 +- drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 2 +- drivers/net/ethernet/sfc/falcon/tx.c | 2 +- drivers/net/ethernet/sfc/tx.c | 2 +- drivers/net/ethernet/ti/netcp_core.c | 2 +- include/linux/netdevice.h | 2 +- net/sched/sch_mqprio.c | 5 +++-- 15 files changed, 17 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c index 3d53153ce751..a74a8fbad53a 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c @@ -2206,7 +2206,7 @@ static int xgbe_setup_tc(struct net_device *netdev, enum tc_setup_type type, struct tc_mqprio_qopt *mqprio = type_data; u8 tc; - if (type != TC_SETUP_MQPRIO) + if (type != TC_SETUP_QDISC_MQPRIO) return -EOPNOTSUPP; mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS; diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c index 1216c1f1e052..4c739d5355d2 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c @@ -4289,7 +4289,7 @@ int __bnx2x_setup_tc(struct net_device *dev, enum tc_setup_type type, { struct tc_mqprio_qopt *mqprio = type_data; - if (type != TC_SETUP_MQPRIO) + if (type != TC_SETUP_QDISC_MQPRIO) return -EOPNOTSUPP; mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 96416f5d97f3..e5472e5ae7b2 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -7388,7 +7388,7 @@ static int bnxt_setup_tc(struct net_device *dev, enum tc_setup_type type, switch (type) { case TC_SETUP_BLOCK: return bnxt_setup_tc_block(dev, type_data); - case TC_SETUP_MQPRIO: { + case TC_SETUP_QDISC_MQPRIO: { struct tc_mqprio_qopt *mqprio = type_data; mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS; diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c index ebc55b6a6349..784dbf5a3e12 100644 --- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c +++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c @@ -351,7 +351,7 @@ static int dpaa_setup_tc(struct net_device *net_dev, enum tc_setup_type type, u8 num_tc; int i; - if (type != TC_SETUP_MQPRIO) + if (type != TC_SETUP_QDISC_MQPRIO) return -EOPNOTSUPP; mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c index 2a0af11c9b59..59415090ff0f 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c @@ -1252,7 +1252,7 @@ out: static int hns3_nic_setup_tc(struct net_device *dev, enum tc_setup_type type, void *type_data) { - if (type != TC_SETUP_MQPRIO) + if (type != TC_SETUP_QDISC_MQPRIO) return -EOPNOTSUPP; return hns3_setup_tc(dev, type_data); diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c b/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c index 
81e4425f0529..adc62fb38c49 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c +++ b/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c @@ -1389,7 +1389,7 @@ static int __fm10k_setup_tc(struct net_device *dev, enum tc_setup_type type, { struct tc_mqprio_qopt *mqprio = type_data; - if (type != TC_SETUP_MQPRIO) + if (type != TC_SETUP_QDISC_MQPRIO) return -EOPNOTSUPP; mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS; diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 05b94d87a6c3..17e6f64299cf 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -7550,7 +7550,7 @@ static int __i40e_setup_tc(struct net_device *netdev, enum tc_setup_type type, void *type_data) { switch (type) { - case TC_SETUP_MQPRIO: + case TC_SETUP_QDISC_MQPRIO: return i40e_setup_tc(netdev, type_data); case TC_SETUP_BLOCK: return i40e_setup_tc_block(netdev, type_data); diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index e5dcb25be398..6eaca8366ac8 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -9431,7 +9431,7 @@ static int __ixgbe_setup_tc(struct net_device *dev, enum tc_setup_type type, switch (type) { case TC_SETUP_BLOCK: return ixgbe_setup_tc_block(dev, type_data); - case TC_SETUP_MQPRIO: + case TC_SETUP_QDISC_MQPRIO: return ixgbe_setup_tc_mqprio(dev, type_data); default: return -EOPNOTSUPP; diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index 736a6ccaf05e..99051a294fa6 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -135,7 +135,7 @@ static int __mlx4_en_setup_tc(struct net_device *dev, enum tc_setup_type type, { struct tc_mqprio_qopt *mqprio = type_data; - if (type != TC_SETUP_MQPRIO) + if (type != TC_SETUP_QDISC_MQPRIO) return -EOPNOTSUPP; if (mqprio->num_tc && mqprio->num_tc != MLX4_EN_NUM_UP_HIGH) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index f877f2f5f2a5..5d5d2e50e4bf 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -3146,7 +3146,7 @@ int mlx5e_setup_tc(struct net_device *dev, enum tc_setup_type type, case TC_SETUP_BLOCK: return mlx5e_setup_tc_block(dev, type_data); #endif - case TC_SETUP_MQPRIO: + case TC_SETUP_QDISC_MQPRIO: return mlx5e_setup_tc_mqprio(dev, type_data); default: return -EOPNOTSUPP; diff --git a/drivers/net/ethernet/sfc/falcon/tx.c b/drivers/net/ethernet/sfc/falcon/tx.c index 6a75f4140a4b..1b978d69e702 100644 --- a/drivers/net/ethernet/sfc/falcon/tx.c +++ b/drivers/net/ethernet/sfc/falcon/tx.c @@ -435,7 +435,7 @@ int ef4_setup_tc(struct net_device *net_dev, enum tc_setup_type type, unsigned tc, num_tc; int rc; - if (type != TC_SETUP_MQPRIO) + if (type != TC_SETUP_QDISC_MQPRIO) return -EOPNOTSUPP; num_tc = mqprio->num_tc; diff --git a/drivers/net/ethernet/sfc/tx.c b/drivers/net/ethernet/sfc/tx.c index 32bf1fecf864..ea27b8a7f465 100644 --- a/drivers/net/ethernet/sfc/tx.c +++ b/drivers/net/ethernet/sfc/tx.c @@ -663,7 +663,7 @@ int efx_setup_tc(struct net_device *net_dev, enum tc_setup_type type, unsigned tc, num_tc; int rc; - if (type != TC_SETUP_MQPRIO) + if (type != TC_SETUP_QDISC_MQPRIO) return -EOPNOTSUPP; num_tc = mqprio->num_tc; diff --git a/drivers/net/ethernet/ti/netcp_core.c 
b/drivers/net/ethernet/ti/netcp_core.c index 437d36289786..15e2e3031d36 100644 --- a/drivers/net/ethernet/ti/netcp_core.c +++ b/drivers/net/ethernet/ti/netcp_core.c @@ -1887,7 +1887,7 @@ static int netcp_setup_tc(struct net_device *dev, enum tc_setup_type type, /* setup tc must be called under rtnl lock */ ASSERT_RTNL(); - if (type != TC_SETUP_MQPRIO) + if (type != TC_SETUP_QDISC_MQPRIO) return -EOPNOTSUPP; mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 71968a2ca9f3..703885aed856 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -770,7 +770,7 @@ typedef u16 (*select_queue_fallback_t)(struct net_device *dev, struct sk_buff *skb); enum tc_setup_type { - TC_SETUP_MQPRIO, + TC_SETUP_QDISC_MQPRIO, TC_SETUP_CLSU32, TC_SETUP_CLSFLOWER, TC_SETUP_CLSMATCHALL, diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index 4d5ed45123f0..b85885a9d8a1 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -50,7 +50,8 @@ static void mqprio_destroy(struct Qdisc *sch) switch (priv->mode) { case TC_MQPRIO_MODE_DCB: case TC_MQPRIO_MODE_CHANNEL: - dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_MQPRIO, + dev->netdev_ops->ndo_setup_tc(dev, + TC_SETUP_QDISC_MQPRIO, &mqprio); break; default: @@ -265,7 +266,7 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt) return -EINVAL; } err = dev->netdev_ops->ndo_setup_tc(dev, - TC_SETUP_MQPRIO, + TC_SETUP_QDISC_MQPRIO, &mqprio); if (err) return err; -- cgit v1.2.3 From 8521db4c7e155d12fb280686c0552e47f77e9110 Mon Sep 17 00:00:00 2001 From: Nogah Frankel Date: Mon, 6 Nov 2017 07:23:43 +0100 Subject: net_sch: cbs: Change TC_SETUP_CBS to TC_SETUP_QDISC_CBS Change TC_SETUP_CBS to TC_SETUP_QDISC_CBS to match the new convention. Signed-off-by: Nogah Frankel Signed-off-by: Jiri Pirko Reviewed-by: Simon Horman Acked-by: Vinicius Costa Gomes Signed-off-by: David S. 
Miller --- drivers/net/ethernet/intel/igb/igb_main.c | 2 +- include/linux/netdevice.h | 2 +- net/sched/sch_cbs.c | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index e22bce7cdacd..43cf39527660 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -2488,7 +2488,7 @@ static int igb_setup_tc(struct net_device *dev, enum tc_setup_type type, struct igb_adapter *adapter = netdev_priv(dev); switch (type) { - case TC_SETUP_CBS: + case TC_SETUP_QDISC_CBS: return igb_offload_cbs(adapter, type_data); default: diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 703885aed856..30f0f2928808 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -776,7 +776,7 @@ enum tc_setup_type { TC_SETUP_CLSMATCHALL, TC_SETUP_CLSBPF, TC_SETUP_BLOCK, - TC_SETUP_CBS, + TC_SETUP_QDISC_CBS, TC_SETUP_QDISC_RED, }; diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c index bdb533b7fb8c..7a72980c1509 100644 --- a/net/sched/sch_cbs.c +++ b/net/sched/sch_cbs.c @@ -212,7 +212,7 @@ static void cbs_disable_offload(struct net_device *dev, cbs.queue = q->queue; cbs.enable = 0; - err = ops->ndo_setup_tc(dev, TC_SETUP_CBS, &cbs); + err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_CBS, &cbs); if (err < 0) pr_warn("Couldn't disable CBS offload for queue %d\n", cbs.queue); @@ -236,7 +236,7 @@ static int cbs_enable_offload(struct net_device *dev, struct cbs_sched_data *q, cbs.idleslope = opt->idleslope; cbs.sendslope = opt->sendslope; - err = ops->ndo_setup_tc(dev, TC_SETUP_CBS, &cbs); + err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_CBS, &cbs); if (err < 0) return err; -- cgit v1.2.3 From 620a5c860b774a81ce3f193eefb52bf4d128cca5 Mon Sep 17 00:00:00 2001 From: Egil Hjelmeland Date: Mon, 6 Nov 2017 12:42:01 +0100 Subject: net: dsa: lan9303: Correct register names in comments Two comments refer to registers, but lack the LAN9303_ prefix. Fix that. Signed-off-by: Egil Hjelmeland Reviewed-by: Vivien Didelot Signed-off-by: David S. Miller --- include/linux/dsa/lan9303.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/dsa/lan9303.h b/include/linux/dsa/lan9303.h index 05d8d136baab..f48a85c377de 100644 --- a/include/linux/dsa/lan9303.h +++ b/include/linux/dsa/lan9303.h @@ -13,8 +13,8 @@ struct lan9303_phy_ops { #define LAN9303_NUM_ALR_RECORDS 512 struct lan9303_alr_cache_entry { u8 mac_addr[ETH_ALEN]; - u8 port_map; /* Bitmap of ports. Zero if unused entry */ - u8 stp_override; /* non zero if set ALR_DAT1_AGE_OVERRID */ + u8 port_map; /* Bitmap of ports. Zero if unused entry */ + u8 stp_override; /* non zero if set LAN9303_ALR_DAT1_AGE_OVERRID */ }; struct lan9303 { @@ -28,7 +28,9 @@ struct lan9303 { struct mutex indirect_mutex; /* protect indexed register access */ const struct lan9303_phy_ops *ops; bool is_bridged; /* true if port 1 and 2 are bridged */ - u32 swe_port_state; /* remember SWE_PORT_STATE while not bridged */ + + /* remember LAN9303_SWE_PORT_STATE while not bridged */ + u32 swe_port_state; /* LAN9303 do not offer reading specific ALR entry. 
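For illustration (editor's sketch, not from the driver): with a flat cache like the one above, a lookup is a linear scan keyed on the MAC address, with port_map == 0 marking an unused slot. The helper name is hypothetical:

#include <linux/etherdevice.h>

/* Find a cached ALR entry by MAC address, or NULL if absent. */
static struct lan9303_alr_cache_entry *
lan9303_alr_cache_find(struct lan9303_alr_cache_entry *cache, const u8 *mac)
{
	int i;

	for (i = 0; i < LAN9303_NUM_ALR_RECORDS; i++)
		if (cache[i].port_map &&
		    ether_addr_equal(cache[i].mac_addr, mac))
			return &cache[i];
	return NULL;
}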
Cache all * static entries in a flat table **/ -- cgit v1.2.3 From 96c623e51f1c40bf524decc48c6fac7ce5dd41f7 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 6 Nov 2017 14:26:10 +0100 Subject: of: add of_property_read_variable_* dummy helpers Commit a67e9472da42 ("of: Add array read functions with min/max size limits") added a new interface for reading variable-length arrays from DT properties. One user was added in dsa recently and this causes a build error because that code can be built with CONFIG_OF disabled: net/dsa/dsa2.c: In function 'dsa_switch_parse_member_of': net/dsa/dsa2.c:678:7: error: implicit declaration of function 'of_property_read_variable_u32_array'; did you mean 'of_property_read_u32_array'? [-Werror=implicit-function-declaration] This adds dummy functions for of_property_read_variable_u32_array() and a few others that had been missing here. I decided to move of_property_read_string() and of_property_read_string_helper() in the process to make it easier to compare the two sets of function prototypes to make sure they match. Fixes: 975e6e32215e ("net: dsa: rework switch parsing") Signed-off-by: Arnd Bergmann Acked-by: Rob Herring Reviewed-by: Vivien Didelot Signed-off-by: David S. Miller --- include/linux/of.h | 62 +++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 50 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/of.h b/include/linux/of.h index b240ed69dc96..b32d418d011a 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -675,12 +675,6 @@ static inline int of_property_count_elems_of_size(const struct device_node *np, return -ENOSYS; } -static inline int of_property_read_u32_index(const struct device_node *np, - const char *propname, u32 index, u32 *out_value) -{ - return -ENOSYS; -} - static inline int of_property_read_u8_array(const struct device_node *np, const char *propname, u8 *out_values, size_t sz) { @@ -707,16 +701,14 @@ static inline int of_property_read_u64_array(const struct device_node *np, return -ENOSYS; } -static inline int of_property_read_string(const struct device_node *np, - const char *propname, - const char **out_string) +static inline int of_property_read_u32_index(const struct device_node *np, + const char *propname, u32 index, u32 *out_value) { return -ENOSYS; } -static inline int of_property_read_string_helper(const struct device_node *np, - const char *propname, - const char **out_strs, size_t sz, int index) +static inline int of_property_read_u64_index(const struct device_node *np, + const char *propname, u32 index, u64 *out_value) { return -ENOSYS; } @@ -744,12 +736,51 @@ static inline int of_n_size_cells(struct device_node *np) return 0; } +static inline int of_property_read_variable_u8_array(const struct device_node *np, + const char *propname, u8 *out_values, + size_t sz_min, size_t sz_max) +{ + return -ENOSYS; +} + +static inline int of_property_read_variable_u16_array(const struct device_node *np, + const char *propname, u16 *out_values, + size_t sz_min, size_t sz_max) +{ + return -ENOSYS; +} + +static inline int of_property_read_variable_u32_array(const struct device_node *np, + const char *propname, + u32 *out_values, + size_t sz_min, + size_t sz_max) +{ + return -ENOSYS; +} + static inline int of_property_read_u64(const struct device_node *np, const char *propname, u64 *out_value) { return -ENOSYS; } +static inline int of_property_read_variable_u64_array(const struct device_node *np, + const char *propname, + u64 *out_values, + size_t sz_min, + size_t sz_max) +{ + 
return -ENOSYS; +} + +static inline int of_property_read_string(const struct device_node *np, + const char *propname, + const char **out_string) +{ + return -ENOSYS; +} + static inline int of_property_match_string(const struct device_node *np, const char *propname, const char *string) @@ -757,6 +788,13 @@ static inline int of_property_match_string(const struct device_node *np, return -ENOSYS; } +static inline int of_property_read_string_helper(const struct device_node *np, + const char *propname, + const char **out_strs, size_t sz, int index) +{ + return -ENOSYS; +} + static inline struct device_node *of_parse_phandle(const struct device_node *np, const char *phandle_name, int index) -- cgit v1.2.3 From b2d0f5d5dc53532e6f07bc546a476a55ebdfe0f3 Mon Sep 17 00:00:00 2001 From: Yi Yang Date: Tue, 7 Nov 2017 21:07:02 +0800 Subject: openvswitch: enable NSH support v16->v17 - Fixed disputed check code: keep them in nsh_push and nsh_pop but also add them in __ovs_nla_copy_actions v15->v16 - Add csum recalculation for nsh_push, nsh_pop and set_nsh pointed out by Pravin - Move nsh key into the union with ipv4 and ipv6 and add check for nsh key in match_validate pointed out by Pravin - Add nsh check in validate_set and __ovs_nla_copy_actions v14->v15 - Check size in nsh_hdr_from_nlattr - Fixed four small issues pointed out by Jiri and Eric v13->v14 - Rename skb_push_nsh to nsh_push per Dave's comment - Rename skb_pop_nsh to nsh_pop per Dave's comment v12->v13 - Fix NSH header length check in set_nsh v11->v12 - Fix missing changes that old comments pointed out - Fix new comments for v11 v10->v11 - Fix the three remaining disputed comments for v9 that were not fixed in v10. v9->v10 - Change struct ovs_key_nsh to struct ovs_nsh_key_base base; __be32 context[NSH_MD1_CONTEXT_SIZE]; - Fix new comments for v9 v8->v9 - Fix build error reported by daily Intel build because nsh module isn't selected by openvswitch v7->v8 - Rework nested value and mask for OVS_KEY_ATTR_NSH - Change pop_nsh to adapt to nsh kernel module - Fix many issues per comments from Jiri Benc v6->v7 - Remove NSH GSO patches in v6 because Jiri Benc reworked it as another patch series and they have been merged. - Change it to adapt to nsh kernel module added by NSH GSO patch series v5->v6 - Fix the remaining comments for v4. - Add NSH GSO support for VxLAN-gpe + NSH and Eth + NSH. v4->v5 - Fix many comments by Jiri Benc and Eric Garver for v4. v3->v4 - Add new NSH match field ttl - Update NSH header to the latest format which will be final format and won't change per its author's confirmation. - Fix comments for v3. v2->v3 - Change OVS_KEY_ATTR_NSH to nested key to handle length-fixed attributes and length-variable attributes more flexibly. - Remove struct ovs_action_push_nsh completely - Add code to handle nested attribute for SET_MASKED - Change PUSH_NSH to use the nested OVS_KEY_ATTR_NSH to transfer NSH header data. - Fix comments and coding style issues by Jiri and Eric v1->v2 - Change encap_nsh and decap_nsh to push_nsh and pop_nsh - Dynamically allocate struct ovs_action_push_nsh for length-variable metadata. OVS master and the 2.8 branch have merged the NSH userspace patch series; this patch enables NSH support in the kernel data path so that OVS can support NSH in compat mode by porting this. Signed-off-by: Yi Yang Acked-by: Jiri Benc Acked-by: Eric Garver Acked-by: Pravin Shelar Signed-off-by: David S. 
Miller --- include/net/nsh.h | 3 + include/uapi/linux/openvswitch.h | 29 ++++ net/nsh/nsh.c | 60 +++++++ net/openvswitch/Kconfig | 1 + net/openvswitch/actions.c | 116 +++++++++++++ net/openvswitch/flow.c | 51 ++++++ net/openvswitch/flow.h | 7 + net/openvswitch/flow_netlink.c | 343 ++++++++++++++++++++++++++++++++++++++- net/openvswitch/flow_netlink.h | 5 + 9 files changed, 613 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/nsh.h b/include/net/nsh.h index a1eaea20be96..350b1ad11c7f 100644 --- a/include/net/nsh.h +++ b/include/net/nsh.h @@ -304,4 +304,7 @@ static inline void nsh_set_flags_ttl_len(struct nshhdr *nsh, u8 flags, NSH_FLAGS_MASK | NSH_TTL_MASK | NSH_LEN_MASK); } +int nsh_push(struct sk_buff *skb, const struct nshhdr *pushed_nh); +int nsh_pop(struct sk_buff *skb); + #endif /* __NET_NSH_H */ diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 501e4c4e2a03..ec75a685f1dd 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -336,6 +336,7 @@ enum ovs_key_attr { OVS_KEY_ATTR_CT_LABELS, /* 16-octet connection tracking label */ OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4, /* struct ovs_key_ct_tuple_ipv4 */ OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6, /* struct ovs_key_ct_tuple_ipv6 */ + OVS_KEY_ATTR_NSH, /* Nested set of ovs_nsh_key_* */ #ifdef __KERNEL__ OVS_KEY_ATTR_TUNNEL_INFO, /* struct ip_tunnel_info */ @@ -495,6 +496,30 @@ struct ovs_key_ct_tuple_ipv6 { __u8 ipv6_proto; }; +enum ovs_nsh_key_attr { + OVS_NSH_KEY_ATTR_UNSPEC, + OVS_NSH_KEY_ATTR_BASE, /* struct ovs_nsh_key_base. */ + OVS_NSH_KEY_ATTR_MD1, /* struct ovs_nsh_key_md1. */ + OVS_NSH_KEY_ATTR_MD2, /* variable-length octets for MD type 2. */ + __OVS_NSH_KEY_ATTR_MAX +}; + +#define OVS_NSH_KEY_ATTR_MAX (__OVS_NSH_KEY_ATTR_MAX - 1) + +struct ovs_nsh_key_base { + __u8 flags; + __u8 ttl; + __u8 mdtype; + __u8 np; + __be32 path_hdr; +}; + +#define NSH_MD1_CONTEXT_SIZE 4 + +struct ovs_nsh_key_md1 { + __be32 context[NSH_MD1_CONTEXT_SIZE]; +}; + /** * enum ovs_flow_attr - attributes for %OVS_FLOW_* commands. * @OVS_FLOW_ATTR_KEY: Nested %OVS_KEY_ATTR_* attributes specifying the flow @@ -811,6 +836,8 @@ struct ovs_action_push_eth { * @OVS_ACTION_ATTR_POP_ETH: Pop the outermost Ethernet header off the * packet. * @OVS_ACTION_ATTR_CT_CLEAR: Clear conntrack state from the packet. + * @OVS_ACTION_ATTR_PUSH_NSH: push NSH header to the packet. + * @OVS_ACTION_ATTR_POP_NSH: pop the outermost NSH header off the packet. * * Only a single header can be set with a single %OVS_ACTION_ATTR_SET. Not all * fields within a header are modifiable, e.g. the IPv4 protocol and fragment @@ -841,6 +868,8 @@ enum ovs_action_attr { OVS_ACTION_ATTR_PUSH_ETH, /* struct ovs_action_push_eth. */ OVS_ACTION_ATTR_POP_ETH, /* No argument. */ OVS_ACTION_ATTR_CT_CLEAR, /* No argument. */ + OVS_ACTION_ATTR_PUSH_NSH, /* Nested OVS_NSH_KEY_ATTR_*. */ + OVS_ACTION_ATTR_POP_NSH, /* No argument. */ __OVS_ACTION_ATTR_MAX, /* Nothing past this will be accepted * from userspace. 
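An editorial sketch of how a user of these uapi structures might compose ovs_nsh_key_base.path_hdr, assuming the standard NSH layout of a 24-bit service path identifier followed by an 8-bit service index; the helper name is hypothetical and not part of the patch:

#include <linux/types.h>
#include <linux/byteorder/generic.h>

/* Pack SPI (low 24 bits used) and SI into the big-endian path header. */
static inline __be32 ovs_nsh_path_hdr(u32 spi, u8 si)
{
	return cpu_to_be32((spi << 8) | si);
}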
*/ diff --git a/net/nsh/nsh.c b/net/nsh/nsh.c index 58fb827439a8..d7da99a0b0b8 100644 --- a/net/nsh/nsh.c +++ b/net/nsh/nsh.c @@ -14,6 +14,66 @@ #include #include +int nsh_push(struct sk_buff *skb, const struct nshhdr *pushed_nh) +{ + struct nshhdr *nh; + size_t length = nsh_hdr_len(pushed_nh); + u8 next_proto; + + if (skb->mac_len) { + next_proto = TUN_P_ETHERNET; + } else { + next_proto = tun_p_from_eth_p(skb->protocol); + if (!next_proto) + return -EAFNOSUPPORT; + } + + /* Add the NSH header */ + if (skb_cow_head(skb, length) < 0) + return -ENOMEM; + + skb_push(skb, length); + nh = (struct nshhdr *)(skb->data); + memcpy(nh, pushed_nh, length); + nh->np = next_proto; + skb_postpush_rcsum(skb, nh, length); + + skb->protocol = htons(ETH_P_NSH); + skb_reset_mac_header(skb); + skb_reset_network_header(skb); + skb_reset_mac_len(skb); + + return 0; +} +EXPORT_SYMBOL_GPL(nsh_push); + +int nsh_pop(struct sk_buff *skb) +{ + struct nshhdr *nh; + size_t length; + __be16 inner_proto; + + if (!pskb_may_pull(skb, NSH_BASE_HDR_LEN)) + return -ENOMEM; + nh = (struct nshhdr *)(skb->data); + length = nsh_hdr_len(nh); + inner_proto = tun_p_to_eth_p(nh->np); + if (!pskb_may_pull(skb, length)) + return -ENOMEM; + + if (!inner_proto) + return -EAFNOSUPPORT; + + skb_pull_rcsum(skb, length); + skb_reset_mac_header(skb); + skb_reset_network_header(skb); + skb_reset_mac_len(skb); + skb->protocol = inner_proto; + + return 0; +} +EXPORT_SYMBOL_GPL(nsh_pop); + static struct sk_buff *nsh_gso_segment(struct sk_buff *skb, netdev_features_t features) { diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig index ce947292ae77..2650205cdaf9 100644 --- a/net/openvswitch/Kconfig +++ b/net/openvswitch/Kconfig @@ -14,6 +14,7 @@ config OPENVSWITCH select MPLS select NET_MPLS_GSO select DST_CACHE + select NET_NSH ---help--- Open vSwitch is a multilayer Ethernet switch targeted at virtualized environments. 
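For illustration (editor's addition, not part of the patch): a minimal caller of the two helpers exported above. The surrounding function is hypothetical; note that the caller builds the full nshhdr (flags, TTL, length, path) while nsh_push() fills in the next-protocol field from the skb itself:

#include <net/nsh.h>

/* Encapsulate an skb in NSH, then immediately decapsulate it again. */
static int example_nsh_roundtrip(struct sk_buff *skb,
				 const struct nshhdr *nh)
{
	int err = nsh_push(skb, nh);

	if (err)
		return err;
	return nsh_pop(skb);
}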
In addition to supporting a variety of features diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index a551232daf61..9a6a6d51e421 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -43,6 +43,7 @@ #include "flow.h" #include "conntrack.h" #include "vport.h" +#include "flow_netlink.h" struct deferred_action { struct sk_buff *skb; @@ -380,6 +381,38 @@ static int push_eth(struct sk_buff *skb, struct sw_flow_key *key, return 0; } +static int push_nsh(struct sk_buff *skb, struct sw_flow_key *key, + const struct nshhdr *nh) +{ + int err; + + err = nsh_push(skb, nh); + if (err) + return err; + + /* safe right before invalidate_flow_key */ + key->mac_proto = MAC_PROTO_NONE; + invalidate_flow_key(key); + return 0; +} + +static int pop_nsh(struct sk_buff *skb, struct sw_flow_key *key) +{ + int err; + + err = nsh_pop(skb); + if (err) + return err; + + /* safe right before invalidate_flow_key */ + if (skb->protocol == htons(ETH_P_TEB)) + key->mac_proto = MAC_PROTO_ETHERNET; + else + key->mac_proto = MAC_PROTO_NONE; + invalidate_flow_key(key); + return 0; +} + static void update_ip_l4_checksum(struct sk_buff *skb, struct iphdr *nh, __be32 addr, __be32 new_addr) { @@ -602,6 +635,69 @@ static int set_ipv6(struct sk_buff *skb, struct sw_flow_key *flow_key, return 0; } +static int set_nsh(struct sk_buff *skb, struct sw_flow_key *flow_key, + const struct nlattr *a) +{ + struct nshhdr *nh; + size_t length; + int err; + u8 flags; + u8 ttl; + int i; + + struct ovs_key_nsh key; + struct ovs_key_nsh mask; + + err = nsh_key_from_nlattr(a, &key, &mask); + if (err) + return err; + + /* Make sure the NSH base header is there */ + if (!pskb_may_pull(skb, skb_network_offset(skb) + NSH_BASE_HDR_LEN)) + return -ENOMEM; + + nh = nsh_hdr(skb); + length = nsh_hdr_len(nh); + + /* Make sure the whole NSH header is there */ + err = skb_ensure_writable(skb, skb_network_offset(skb) + + length); + if (unlikely(err)) + return err; + + nh = nsh_hdr(skb); + skb_postpull_rcsum(skb, nh, length); + flags = nsh_get_flags(nh); + flags = OVS_MASKED(flags, key.base.flags, mask.base.flags); + flow_key->nsh.base.flags = flags; + ttl = nsh_get_ttl(nh); + ttl = OVS_MASKED(ttl, key.base.ttl, mask.base.ttl); + flow_key->nsh.base.ttl = ttl; + nsh_set_flags_and_ttl(nh, flags, ttl); + nh->path_hdr = OVS_MASKED(nh->path_hdr, key.base.path_hdr, + mask.base.path_hdr); + flow_key->nsh.base.path_hdr = nh->path_hdr; + switch (nh->mdtype) { + case NSH_M_TYPE1: + for (i = 0; i < NSH_MD1_CONTEXT_SIZE; i++) { + nh->md1.context[i] = + OVS_MASKED(nh->md1.context[i], key.context[i], + mask.context[i]); + } + memcpy(flow_key->nsh.context, nh->md1.context, + sizeof(nh->md1.context)); + break; + case NSH_M_TYPE2: + memset(flow_key->nsh.context, 0, + sizeof(flow_key->nsh.context)); + break; + default: + return -EINVAL; + } + skb_postpush_rcsum(skb, nh, length); + return 0; +} + /* Must follow skb_ensure_writable() since that can move the skb data. 
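A brief editorial note on the masked-set pattern used by set_nsh() above: the mask selects which bits of the new key take effect, and every other bit keeps its old value. A standalone sketch of the semantics (not the kernel macro itself):

/* Masked-set for a single byte: mask bits come from the new key,
 * the rest are preserved from the old value.
 */
static inline u8 masked_set_u8(u8 old, u8 key, u8 mask)
{
	return (old & ~mask) | (key & mask);
}

This is why set_nsh() can update flags, TTL and path header independently from a single nested attribute carrying both key and mask.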
*/ static void set_tp_port(struct sk_buff *skb, __be16 *port, __be16 new_port, __sum16 *check) @@ -1024,6 +1120,10 @@ static int execute_masked_set_action(struct sk_buff *skb, get_mask(a, struct ovs_key_ethernet *)); break; + case OVS_KEY_ATTR_NSH: + err = set_nsh(skb, flow_key, a); + break; + case OVS_KEY_ATTR_IPV4: err = set_ipv4(skb, flow_key, nla_data(a), get_mask(a, struct ovs_key_ipv4 *)); @@ -1214,6 +1314,22 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, case OVS_ACTION_ATTR_POP_ETH: err = pop_eth(skb, key); break; + + case OVS_ACTION_ATTR_PUSH_NSH: { + u8 buffer[NSH_HDR_MAX_LEN]; + struct nshhdr *nh = (struct nshhdr *)buffer; + + err = nsh_hdr_from_nlattr(nla_data(a), nh, + NSH_HDR_MAX_LEN); + if (unlikely(err)) + break; + err = push_nsh(skb, key, nh); + break; + } + + case OVS_ACTION_ATTR_POP_NSH: + err = pop_nsh(skb, key); + break; } if (unlikely(err)) { diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 8c94cef25a72..864ddb1e3642 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -46,6 +46,7 @@ #include #include #include +#include #include "conntrack.h" #include "datapath.h" @@ -490,6 +491,52 @@ invalid: return 0; } +static int parse_nsh(struct sk_buff *skb, struct sw_flow_key *key) +{ + struct nshhdr *nh; + unsigned int nh_ofs = skb_network_offset(skb); + u8 version, length; + int err; + + err = check_header(skb, nh_ofs + NSH_BASE_HDR_LEN); + if (unlikely(err)) + return err; + + nh = nsh_hdr(skb); + version = nsh_get_ver(nh); + length = nsh_hdr_len(nh); + + if (version != 0) + return -EINVAL; + + err = check_header(skb, nh_ofs + length); + if (unlikely(err)) + return err; + + nh = nsh_hdr(skb); + key->nsh.base.flags = nsh_get_flags(nh); + key->nsh.base.ttl = nsh_get_ttl(nh); + key->nsh.base.mdtype = nh->mdtype; + key->nsh.base.np = nh->np; + key->nsh.base.path_hdr = nh->path_hdr; + switch (key->nsh.base.mdtype) { + case NSH_M_TYPE1: + if (length != NSH_M_TYPE1_LEN) + return -EINVAL; + memcpy(key->nsh.context, nh->md1.context, + sizeof(nh->md1)); + break; + case NSH_M_TYPE2: + memset(key->nsh.context, 0, + sizeof(nh->md1)); + break; + default: + return -EINVAL; + } + + return 0; +} + /** * key_extract - extracts a flow key from an Ethernet frame. * @skb: sk_buff that contains the frame, with skb->data pointing to the @@ -735,6 +782,10 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) memset(&key->tp, 0, sizeof(key->tp)); } } + } else if (key->eth.type == htons(ETH_P_NSH)) { + error = parse_nsh(skb, key); + if (error) + return error; } return 0; } diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index 1875bba4f865..c670dd24b8b7 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -35,6 +35,7 @@ #include #include #include +#include struct sk_buff; @@ -66,6 +67,11 @@ struct vlan_head { (offsetof(struct sw_flow_key, recirc_id) + \ FIELD_SIZEOF(struct sw_flow_key, recirc_id)) +struct ovs_key_nsh { + struct ovs_nsh_key_base base; + __be32 context[NSH_MD1_CONTEXT_SIZE]; +}; + struct sw_flow_key { u8 tun_opts[IP_TUNNEL_OPTS_MAX]; u8 tun_opts_len; @@ -143,6 +149,7 @@ struct sw_flow_key { } nd; }; } ipv6; + struct ovs_key_nsh nsh; /* network service header */ }; struct { /* Connection tracking fields not packed above. 
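As an editorial illustration (hypothetical helper, assuming the flow.h definitions above): what parse_nsh() ends up storing in the flow key for an MD type 1 header; the concrete values are for demonstration only:

#include <linux/string.h>

/* Populate an MD type 1 NSH flow key; context is zeroed here but would
 * carry the four 32-bit context words copied from the header.
 */
static void example_fill_nsh_key(struct ovs_key_nsh *key, __be32 path_hdr)
{
	key->base.flags = 0;
	key->base.ttl = 63;
	key->base.mdtype = NSH_M_TYPE1;
	key->base.np = TUN_P_ETHERNET;
	key->base.path_hdr = path_hdr;
	memset(key->context, 0, sizeof(key->context));
}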
*/ diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index dc0d79092e74..4201f9293af3 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -48,6 +48,7 @@ #include #include #include +#include #include #include "flow_netlink.h" @@ -80,9 +81,11 @@ static bool actions_may_change_flow(const struct nlattr *actions) case OVS_ACTION_ATTR_HASH: case OVS_ACTION_ATTR_POP_ETH: case OVS_ACTION_ATTR_POP_MPLS: + case OVS_ACTION_ATTR_POP_NSH: case OVS_ACTION_ATTR_POP_VLAN: case OVS_ACTION_ATTR_PUSH_ETH: case OVS_ACTION_ATTR_PUSH_MPLS: + case OVS_ACTION_ATTR_PUSH_NSH: case OVS_ACTION_ATTR_PUSH_VLAN: case OVS_ACTION_ATTR_SAMPLE: case OVS_ACTION_ATTR_SET: @@ -175,7 +178,8 @@ static bool match_validate(const struct sw_flow_match *match, | (1 << OVS_KEY_ATTR_ICMPV6) | (1 << OVS_KEY_ATTR_ARP) | (1 << OVS_KEY_ATTR_ND) - | (1 << OVS_KEY_ATTR_MPLS)); + | (1 << OVS_KEY_ATTR_MPLS) + | (1 << OVS_KEY_ATTR_NSH)); /* Always allowed mask fields. */ mask_allowed |= ((1 << OVS_KEY_ATTR_TUNNEL) @@ -284,6 +288,14 @@ static bool match_validate(const struct sw_flow_match *match, } } + if (match->key->eth.type == htons(ETH_P_NSH)) { + key_expected |= 1 << OVS_KEY_ATTR_NSH; + if (match->mask && + match->mask->key.eth.type == htons(0xffff)) { + mask_allowed |= 1 << OVS_KEY_ATTR_NSH; + } + } + if ((key_attrs & key_expected) != key_expected) { /* Key attributes check failed. */ OVS_NLERR(log, "Missing key (keys=%llx, expected=%llx)", @@ -325,12 +337,25 @@ size_t ovs_tun_key_attr_size(void) + nla_total_size(4); /* OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS */ } +size_t ovs_nsh_key_attr_size(void) +{ + /* Whenever adding new OVS_NSH_KEY_ FIELDS, we should consider + * updating this function. + */ + return nla_total_size(NSH_BASE_HDR_LEN) /* OVS_NSH_KEY_ATTR_BASE */ + /* OVS_NSH_KEY_ATTR_MD1 and OVS_NSH_KEY_ATTR_MD2 are + * mutually exclusive, so the bigger one can cover + * the small one. + */ + + nla_total_size(NSH_CTX_HDRS_MAX_LEN); +} + size_t ovs_key_attr_size(void) { /* Whenever adding new OVS_KEY_ FIELDS, we should consider * updating this function. */ - BUILD_BUG_ON(OVS_KEY_ATTR_TUNNEL_INFO != 28); + BUILD_BUG_ON(OVS_KEY_ATTR_TUNNEL_INFO != 29); return nla_total_size(4) /* OVS_KEY_ATTR_PRIORITY */ + nla_total_size(0) /* OVS_KEY_ATTR_TUNNEL */ @@ -344,6 +369,8 @@ size_t ovs_key_attr_size(void) + nla_total_size(4) /* OVS_KEY_ATTR_CT_MARK */ + nla_total_size(16) /* OVS_KEY_ATTR_CT_LABELS */ + nla_total_size(40) /* OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6 */ + + nla_total_size(0) /* OVS_KEY_ATTR_NSH */ + + ovs_nsh_key_attr_size() + nla_total_size(12) /* OVS_KEY_ATTR_ETHERNET */ + nla_total_size(2) /* OVS_KEY_ATTR_ETHERTYPE */ + nla_total_size(4) /* OVS_KEY_ATTR_VLAN */ @@ -377,6 +404,13 @@ static const struct ovs_len_tbl ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1] [OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS] = { .len = sizeof(u32) }, }; +static const struct ovs_len_tbl +ovs_nsh_key_attr_lens[OVS_NSH_KEY_ATTR_MAX + 1] = { + [OVS_NSH_KEY_ATTR_BASE] = { .len = sizeof(struct ovs_nsh_key_base) }, + [OVS_NSH_KEY_ATTR_MD1] = { .len = sizeof(struct ovs_nsh_key_md1) }, + [OVS_NSH_KEY_ATTR_MD2] = { .len = OVS_ATTR_VARIABLE }, +}; + /* The size of the argument for each %OVS_KEY_ATTR_* Netlink attribute. 
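An editorial note on the size helpers above: nla_total_size() charges one attribute's cost, i.e. the 4-byte netlink attribute header plus the payload, padded to a 4-byte boundary; that is why the worst-case nested OVS_KEY_ATTR_NSH is the base struct plus only the larger of the two mutually exclusive metadata attributes. A sketch (hypothetical function) showing the arithmetic it hides:

#include <net/netlink.h>

/* nla_total_size(payload) == NLA_ALIGN(NLA_HDRLEN + payload). */
static bool example_attr_cost_matches(int payload)
{
	return nla_total_size(payload) == NLA_ALIGN(NLA_HDRLEN + payload);
}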
*/ static const struct ovs_len_tbl ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = { [OVS_KEY_ATTR_ENCAP] = { .len = OVS_ATTR_NESTED }, @@ -409,6 +443,8 @@ static const struct ovs_len_tbl ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = { .len = sizeof(struct ovs_key_ct_tuple_ipv4) }, [OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6] = { .len = sizeof(struct ovs_key_ct_tuple_ipv6) }, + [OVS_KEY_ATTR_NSH] = { .len = OVS_ATTR_NESTED, + .next = ovs_nsh_key_attr_lens, }, }; static bool check_attr_len(unsigned int attr_len, unsigned int expected_len) @@ -1227,6 +1263,221 @@ static int metadata_from_nlattrs(struct net *net, struct sw_flow_match *match, return 0; } +int nsh_hdr_from_nlattr(const struct nlattr *attr, + struct nshhdr *nh, size_t size) +{ + struct nlattr *a; + int rem; + u8 flags = 0; + u8 ttl = 0; + int mdlen = 0; + + /* validate_nsh has check this, so we needn't do duplicate check here + */ + if (size < NSH_BASE_HDR_LEN) + return -ENOBUFS; + + nla_for_each_nested(a, attr, rem) { + int type = nla_type(a); + + switch (type) { + case OVS_NSH_KEY_ATTR_BASE: { + const struct ovs_nsh_key_base *base = nla_data(a); + + flags = base->flags; + ttl = base->ttl; + nh->np = base->np; + nh->mdtype = base->mdtype; + nh->path_hdr = base->path_hdr; + break; + } + case OVS_NSH_KEY_ATTR_MD1: + mdlen = nla_len(a); + if (mdlen > size - NSH_BASE_HDR_LEN) + return -ENOBUFS; + memcpy(&nh->md1, nla_data(a), mdlen); + break; + + case OVS_NSH_KEY_ATTR_MD2: + mdlen = nla_len(a); + if (mdlen > size - NSH_BASE_HDR_LEN) + return -ENOBUFS; + memcpy(&nh->md2, nla_data(a), mdlen); + break; + + default: + return -EINVAL; + } + } + + /* nsh header length = NSH_BASE_HDR_LEN + mdlen */ + nh->ver_flags_ttl_len = 0; + nsh_set_flags_ttl_len(nh, flags, ttl, NSH_BASE_HDR_LEN + mdlen); + + return 0; +} + +int nsh_key_from_nlattr(const struct nlattr *attr, + struct ovs_key_nsh *nsh, struct ovs_key_nsh *nsh_mask) +{ + struct nlattr *a; + int rem; + + /* validate_nsh has check this, so we needn't do duplicate check here + */ + nla_for_each_nested(a, attr, rem) { + int type = nla_type(a); + + switch (type) { + case OVS_NSH_KEY_ATTR_BASE: { + const struct ovs_nsh_key_base *base = nla_data(a); + const struct ovs_nsh_key_base *base_mask = base + 1; + + nsh->base = *base; + nsh_mask->base = *base_mask; + break; + } + case OVS_NSH_KEY_ATTR_MD1: { + const struct ovs_nsh_key_md1 *md1 = nla_data(a); + const struct ovs_nsh_key_md1 *md1_mask = md1 + 1; + + memcpy(nsh->context, md1->context, sizeof(*md1)); + memcpy(nsh_mask->context, md1_mask->context, + sizeof(*md1_mask)); + break; + } + case OVS_NSH_KEY_ATTR_MD2: + /* Not supported yet */ + return -ENOTSUPP; + default: + return -EINVAL; + } + } + + return 0; +} + +static int nsh_key_put_from_nlattr(const struct nlattr *attr, + struct sw_flow_match *match, bool is_mask, + bool is_push_nsh, bool log) +{ + struct nlattr *a; + int rem; + bool has_base = false; + bool has_md1 = false; + bool has_md2 = false; + u8 mdtype = 0; + int mdlen = 0; + + if (WARN_ON(is_push_nsh && is_mask)) + return -EINVAL; + + nla_for_each_nested(a, attr, rem) { + int type = nla_type(a); + int i; + + if (type > OVS_NSH_KEY_ATTR_MAX) { + OVS_NLERR(log, "nsh attr %d is out of range max %d", + type, OVS_NSH_KEY_ATTR_MAX); + return -EINVAL; + } + + if (!check_attr_len(nla_len(a), + ovs_nsh_key_attr_lens[type].len)) { + OVS_NLERR( + log, + "nsh attr %d has unexpected len %d expected %d", + type, + nla_len(a), + ovs_nsh_key_attr_lens[type].len + ); + return -EINVAL; + } + + switch (type) { + case OVS_NSH_KEY_ATTR_BASE: { + const struct ovs_nsh_key_base *base 
= nla_data(a); + + has_base = true; + mdtype = base->mdtype; + SW_FLOW_KEY_PUT(match, nsh.base.flags, + base->flags, is_mask); + SW_FLOW_KEY_PUT(match, nsh.base.ttl, + base->ttl, is_mask); + SW_FLOW_KEY_PUT(match, nsh.base.mdtype, + base->mdtype, is_mask); + SW_FLOW_KEY_PUT(match, nsh.base.np, + base->np, is_mask); + SW_FLOW_KEY_PUT(match, nsh.base.path_hdr, + base->path_hdr, is_mask); + break; + } + case OVS_NSH_KEY_ATTR_MD1: { + const struct ovs_nsh_key_md1 *md1 = nla_data(a); + + has_md1 = true; + for (i = 0; i < NSH_MD1_CONTEXT_SIZE; i++) + SW_FLOW_KEY_PUT(match, nsh.context[i], + md1->context[i], is_mask); + break; + } + case OVS_NSH_KEY_ATTR_MD2: + if (!is_push_nsh) /* Not supported MD type 2 yet */ + return -ENOTSUPP; + + has_md2 = true; + mdlen = nla_len(a); + if (mdlen > NSH_CTX_HDRS_MAX_LEN || mdlen <= 0) { + OVS_NLERR( + log, + "Invalid MD length %d for MD type %d", + mdlen, + mdtype + ); + return -EINVAL; + } + break; + default: + OVS_NLERR(log, "Unknown nsh attribute %d", + type); + return -EINVAL; + } + } + + if (rem > 0) { + OVS_NLERR(log, "nsh attribute has %d unknown bytes.", rem); + return -EINVAL; + } + + if (has_md1 && has_md2) { + OVS_NLERR( + 1, + "invalid nsh attribute: md1 and md2 are exclusive." + ); + return -EINVAL; + } + + if (!is_mask) { + if ((has_md1 && mdtype != NSH_M_TYPE1) || + (has_md2 && mdtype != NSH_M_TYPE2)) { + OVS_NLERR(1, "nsh attribute has unmatched MD type %d.", + mdtype); + return -EINVAL; + } + + if (is_push_nsh && + (!has_base || (!has_md1 && !has_md2))) { + OVS_NLERR( + 1, + "push_nsh: missing base or metadata attributes" + ); + return -EINVAL; + } + } + + return 0; +} + static int ovs_key_from_nlattrs(struct net *net, struct sw_flow_match *match, u64 attrs, const struct nlattr **a, bool is_mask, bool log) @@ -1354,6 +1605,13 @@ static int ovs_key_from_nlattrs(struct net *net, struct sw_flow_match *match, attrs &= ~(1 << OVS_KEY_ATTR_ARP); } + if (attrs & (1 << OVS_KEY_ATTR_NSH)) { + if (nsh_key_put_from_nlattr(a[OVS_KEY_ATTR_NSH], match, + is_mask, false, log) < 0) + return -EINVAL; + attrs &= ~(1 << OVS_KEY_ATTR_NSH); + } + if (attrs & (1 << OVS_KEY_ATTR_MPLS)) { const struct ovs_key_mpls *mpls_key; @@ -1670,6 +1928,34 @@ static int ovs_nla_put_vlan(struct sk_buff *skb, const struct vlan_head *vh, return 0; } +static int nsh_key_to_nlattr(const struct ovs_key_nsh *nsh, bool is_mask, + struct sk_buff *skb) +{ + struct nlattr *start; + + start = nla_nest_start(skb, OVS_KEY_ATTR_NSH); + if (!start) + return -EMSGSIZE; + + if (nla_put(skb, OVS_NSH_KEY_ATTR_BASE, sizeof(nsh->base), &nsh->base)) + goto nla_put_failure; + + if (is_mask || nsh->base.mdtype == NSH_M_TYPE1) { + if (nla_put(skb, OVS_NSH_KEY_ATTR_MD1, + sizeof(nsh->context), nsh->context)) + goto nla_put_failure; + } + + /* Don't support MD type 2 yet */ + + nla_nest_end(skb, start); + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + static int __ovs_nla_put_key(const struct sw_flow_key *swkey, const struct sw_flow_key *output, bool is_mask, struct sk_buff *skb) @@ -1798,6 +2084,9 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey, ipv6_key->ipv6_tclass = output->ip.tos; ipv6_key->ipv6_hlimit = output->ip.ttl; ipv6_key->ipv6_frag = output->ip.frag; + } else if (swkey->eth.type == htons(ETH_P_NSH)) { + if (nsh_key_to_nlattr(&output->nsh, is_mask, skb)) + goto nla_put_failure; } else if (swkey->eth.type == htons(ETH_P_ARP) || swkey->eth.type == htons(ETH_P_RARP)) { struct ovs_key_arp *arp_key; @@ -2292,6 +2581,19 @@ static int validate_and_copy_set_tun(const struct 
nlattr *attr, return err; } +static bool validate_nsh(const struct nlattr *attr, bool is_mask, + bool is_push_nsh, bool log) +{ + struct sw_flow_match match; + struct sw_flow_key key; + int ret = 0; + + ovs_match_init(&match, &key, true, NULL); + ret = nsh_key_put_from_nlattr(attr, &match, is_mask, + is_push_nsh, log); + return !ret; +} + /* Return false if there are any non-masked bits set. * Mask follows data immediately, before any netlink padding. */ @@ -2434,6 +2736,13 @@ static int validate_set(const struct nlattr *a, break; + case OVS_KEY_ATTR_NSH: + if (eth_type != htons(ETH_P_NSH)) + return -EINVAL; + if (!validate_nsh(nla_data(a), masked, false, log)) + return -EINVAL; + break; + default: return -EINVAL; } @@ -2533,6 +2842,8 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, [OVS_ACTION_ATTR_TRUNC] = sizeof(struct ovs_action_trunc), [OVS_ACTION_ATTR_PUSH_ETH] = sizeof(struct ovs_action_push_eth), [OVS_ACTION_ATTR_POP_ETH] = 0, + [OVS_ACTION_ATTR_PUSH_NSH] = (u32)-1, + [OVS_ACTION_ATTR_POP_NSH] = 0, }; const struct ovs_action_push_vlan *vlan; int type = nla_type(a); @@ -2690,6 +3001,34 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, mac_proto = MAC_PROTO_ETHERNET; break; + case OVS_ACTION_ATTR_PUSH_NSH: + if (mac_proto != MAC_PROTO_ETHERNET) { + u8 next_proto; + + next_proto = tun_p_from_eth_p(eth_type); + if (!next_proto) + return -EINVAL; + } + mac_proto = MAC_PROTO_NONE; + if (!validate_nsh(nla_data(a), false, true, true)) + return -EINVAL; + break; + + case OVS_ACTION_ATTR_POP_NSH: { + __be16 inner_proto; + + if (eth_type != htons(ETH_P_NSH)) + return -EINVAL; + inner_proto = tun_p_to_eth_p(key->nsh.base.np); + if (!inner_proto) + return -EINVAL; + if (key->nsh.base.np == TUN_P_ETHERNET) + mac_proto = MAC_PROTO_ETHERNET; + else + mac_proto = MAC_PROTO_NONE; + break; + } + default: OVS_NLERR(log, "Unknown Action type %d", type); return -EINVAL; diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h index 929c665ac3aa..6657606b2b47 100644 --- a/net/openvswitch/flow_netlink.h +++ b/net/openvswitch/flow_netlink.h @@ -79,4 +79,9 @@ int ovs_nla_put_actions(const struct nlattr *attr, void ovs_nla_free_flow_actions(struct sw_flow_actions *); void ovs_nla_free_flow_actions_rcu(struct sw_flow_actions *); +int nsh_key_from_nlattr(const struct nlattr *attr, struct ovs_key_nsh *nsh, + struct ovs_key_nsh *nsh_mask); +int nsh_hdr_from_nlattr(const struct nlattr *attr, struct nshhdr *nh, + size_t size); + #endif /* flow_netlink.h */ -- cgit v1.2.3 From 24a9332a58b7f41a0d36c35a2c6897242bffdbc0 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Mon, 6 Nov 2017 16:11:43 -0500 Subject: net: dsa: constify cpu_dp member of dsa_port A DSA port has a dedicated CPU port assigned to it, stored in the cpu_dp member. It is not meant to be modified by a port, thus make it const. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. 
Miller
---
 include/net/dsa.h | 2 +-
 net/dsa/slave.c   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)
(limited to 'include')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index e54332968417..2a8613b5a23d 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -190,7 +190,7 @@ struct dsa_port {
 	struct dsa_switch *ds;
 	unsigned int index;
 	const char *name;
-	struct dsa_port *cpu_dp;
+	const struct dsa_port *cpu_dp;
 	struct device_node *dn;
 	unsigned int ageing_time;
 	u8 stp_state;
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 814ced75a0cc..cc7fe47dd4bf 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -1147,7 +1147,7 @@ static void dsa_slave_notify(struct net_device *dev, unsigned long val)

 int dsa_slave_create(struct dsa_port *port)
 {
-	struct dsa_port *cpu_dp = port->cpu_dp;
+	const struct dsa_port *cpu_dp = port->cpu_dp;
 	struct net_device *master = cpu_dp->master;
 	struct dsa_switch *ds = port->ds;
 	const char *name = port->name;
-- cgit v1.2.3

From ec15dd4269d0cbf947c9a2dfdcf08a917098fab1 Mon Sep 17 00:00:00 2001
From: Vivien Didelot
Date: Mon, 6 Nov 2017 16:11:46 -0500
Subject: net: dsa: setup and teardown tree

This commit provides better scope for the DSA tree setup and teardown
functions. It renames the "applied" bool to "setup" and prints a message
when the tree is set up, as is already done during teardown. At the same
time, it checks dst->setup in dsa_tree_setup(), where the flag is set to
true.

Signed-off-by: Vivien Didelot
Signed-off-by: David S. Miller
---
 include/net/dsa.h |  2 +-
 net/dsa/dsa2.c    | 32 ++++++++++++++++++--------------
 2 files changed, 19 insertions(+), 15 deletions(-)
(limited to 'include')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 2a8613b5a23d..6c239257309b 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -122,7 +122,7 @@ struct dsa_switch_tree {
 	struct kref refcount;

 	/* Has this tree been applied to the hardware? */
-	bool applied;
+	bool setup;

 	/*
 	 * Configuration data for the platform device that owns
diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index c9b50339fcac..1a8df0a177b5 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -486,12 +486,18 @@ static void dsa_tree_teardown_master(struct dsa_switch_tree *dst)
 		return dsa_master_teardown(master);
 }

-static int dsa_dst_apply(struct dsa_switch_tree *dst)
+static int dsa_tree_setup(struct dsa_switch_tree *dst)
 {
 	struct dsa_switch *ds;
 	u32 index;
 	int err;

+	if (dst->setup) {
+		pr_err("DSA: tree %d already setup!
Disjoint trees?\n", + dst->index); + return -EEXIST; + } + err = dsa_tree_setup_default_cpu(dst); if (err) return err; @@ -510,17 +516,19 @@ static int dsa_dst_apply(struct dsa_switch_tree *dst) if (err) return err; - dst->applied = true; + dst->setup = true; + + pr_info("DSA: tree %d setup\n", dst->index); return 0; } -static void dsa_dst_unapply(struct dsa_switch_tree *dst) +static void dsa_tree_teardown(struct dsa_switch_tree *dst) { struct dsa_switch *ds; u32 index; - if (!dst->applied) + if (!dst->setup) return; dsa_tree_teardown_master(dst); @@ -535,8 +543,9 @@ static void dsa_dst_unapply(struct dsa_switch_tree *dst) dsa_tree_teardown_default_cpu(dst); - pr_info("DSA: tree %d unapplied\n", dst->index); - dst->applied = false; + pr_info("DSA: tree %d torn down\n", dst->index); + + dst->setup = false; } static void dsa_tree_remove_switch(struct dsa_switch_tree *dst, @@ -794,14 +803,9 @@ static int _dsa_register_switch(struct dsa_switch *ds) if (err == 1) return 0; - if (dst->applied) { - pr_info("DSA: Disjoint trees?\n"); - return -EINVAL; - } - - err = dsa_dst_apply(dst); + err = dsa_tree_setup(dst); if (err) { - dsa_dst_unapply(dst); + dsa_tree_teardown(dst); goto out_del_dst; } @@ -852,7 +856,7 @@ static void _dsa_unregister_switch(struct dsa_switch *ds) struct dsa_switch_tree *dst = ds->dst; unsigned int index = ds->index; - dsa_dst_unapply(dst); + dsa_tree_teardown(dst); dsa_tree_remove_switch(dst, index); } -- cgit v1.2.3 From 375ef2b1f0d0b43b0d36ffdd521637ff59b0c13c Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Sun, 17 Sep 2017 13:43:58 +0300 Subject: net: Introduce netdev_*_once functions Extend the net device error logging with netdev_*_once macros. netdev_*_once are the equivalents of the dev_*_once macros which are useful for messages that should only be logged once. Also add netdev_WARN_ONCE, which is the "once" extension for the already existing netdev_WARN macro. Signed-off-by: Gal Pressman Signed-off-by: Saeed Mahameed --- include/linux/netdevice.h | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 30f0f2928808..79518ede3170 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4336,6 +4336,31 @@ void netdev_notice(const struct net_device *dev, const char *format, ...); __printf(2, 3) void netdev_info(const struct net_device *dev, const char *format, ...); +#define netdev_level_once(level, dev, fmt, ...) \ +do { \ + static bool __print_once __read_mostly; \ + \ + if (!__print_once) { \ + __print_once = true; \ + netdev_printk(level, dev, fmt, ##__VA_ARGS__); \ + } \ +} while (0) + +#define netdev_emerg_once(dev, fmt, ...) \ + netdev_level_once(KERN_EMERG, dev, fmt, ##__VA_ARGS__) +#define netdev_alert_once(dev, fmt, ...) \ + netdev_level_once(KERN_ALERT, dev, fmt, ##__VA_ARGS__) +#define netdev_crit_once(dev, fmt, ...) \ + netdev_level_once(KERN_CRIT, dev, fmt, ##__VA_ARGS__) +#define netdev_err_once(dev, fmt, ...) \ + netdev_level_once(KERN_ERR, dev, fmt, ##__VA_ARGS__) +#define netdev_warn_once(dev, fmt, ...) \ + netdev_level_once(KERN_WARNING, dev, fmt, ##__VA_ARGS__) +#define netdev_notice_once(dev, fmt, ...) \ + netdev_level_once(KERN_NOTICE, dev, fmt, ##__VA_ARGS__) +#define netdev_info_once(dev, fmt, ...) 
\
+	netdev_level_once(KERN_INFO, dev, fmt, ##__VA_ARGS__)
+
 #define MODULE_ALIAS_NETDEV(device) \
 	MODULE_ALIAS("netdev-" device)
@@ -4376,6 +4401,10 @@ do {								\
 	WARN(1, "netdevice: %s%s\n" format, netdev_name(dev),	\
 	     netdev_reg_state(dev), ##args)

+#define netdev_WARN_ONCE(dev, format, args...)			\
+	WARN_ONCE(1, "netdevice: %s%s\n" format, netdev_name(dev),	\
+		  netdev_reg_state(dev), ##args)
+
 /* netif printk helpers, similar to netdev_printk */

 #define netif_printk(priv, type, level, dev, fmt, args...)	\
-- cgit v1.2.3

From 4382c7b92a1db397874ca62c73aa8b023af6dba8 Mon Sep 17 00:00:00 2001
From: Gal Pressman
Date: Sun, 10 Sep 2017 13:22:51 +0300
Subject: net/mlx5e: Add 802.1ad VLAN insertion support

Report VLAN insertion support for S-tagged packets and add support by
choosing the correct VLAN type in the WQE.

Signed-off-by: Gal Pressman
Reviewed-by: Tariq Toukan
Signed-off-by: Saeed Mahameed
---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 1 +
 drivers/net/ethernet/mellanox/mlx5/core/en_tx.c   | 2 ++
 include/linux/mlx5/qp.h                           | 1 +
 3 files changed, 4 insertions(+)
(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 59b8a2d62b8d..c408b7efa42e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -4194,6 +4194,7 @@ static void mlx5e_build_nic_netdev(struct net_device *netdev)
 	netdev->hw_features |= NETIF_F_HW_VLAN_CTAG_TX;
 	netdev->hw_features |= NETIF_F_HW_VLAN_CTAG_RX;
 	netdev->hw_features |= NETIF_F_HW_VLAN_CTAG_FILTER;
+	netdev->hw_features |= NETIF_F_HW_VLAN_STAG_TX;

 	if (mlx5e_vxlan_allowed(mdev) || MLX5_CAP_ETH(mdev, tunnel_stateless_gre)) {
 		netdev->hw_features |= NETIF_F_GSO_PARTIAL;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
index de651de35c9b..c62305b214cc 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
@@ -369,6 +369,8 @@ static netdev_tx_t mlx5e_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb,
 		ds_cnt += DIV_ROUND_UP(ihs - sizeof(eseg->inline_hdr.start), MLX5_SEND_WQE_DS);
 	} else if (skb_vlan_tag_present(skb)) {
 		eseg->insert.type = cpu_to_be16(MLX5_ETH_WQE_INSERT_VLAN);
+		if (skb->vlan_proto == cpu_to_be16(ETH_P_8021AD))
+			eseg->insert.type |= cpu_to_be16(MLX5_ETH_WQE_SVLAN);
 		eseg->insert.vlan_tci = cpu_to_be16(skb_vlan_tag_get(skb));
 	}

diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h
index 66d19b611fe4..62af7512dabb 100644
--- a/include/linux/mlx5/qp.h
+++ b/include/linux/mlx5/qp.h
@@ -221,6 +221,7 @@ enum {
 };

 enum {
+	MLX5_ETH_WQE_SVLAN        = 1 << 0,
 	MLX5_ETH_WQE_INSERT_VLAN  = 1 << 15,
 };

-- cgit v1.2.3

From 4d63adfe12dd9cb61ed8badb4d798955399048c2 Mon Sep 17 00:00:00 2001
From: Mark Greer
Date: Thu, 15 Jun 2017 20:34:22 -0700
Subject: NFC: Add NFC_CMD_DEACTIVATE_TARGET support

Once an NFC target (i.e., a tag) is found, it remains active until
there is a failure reading or writing it (often caused by the target
moving out of range). While the target is active, the NFC adapter and
antenna must remain powered. This wastes power when the target remains
in range but the client application no longer cares whether it is there
or not.

To mitigate this, add a new netlink command that allows userspace to
deactivate an active target. When issued, this command will cause the
NFC subsystem to act as though the target was moved out of range.
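For illustration only (this sketch is not part of the patch): a userspace
client could issue the new command over generic netlink roughly as follows,
assuming libnl-3. The device and target indices are placeholders, and error
handling is omitted for brevity.

/* Hypothetical libnl-3 sketch: deactivate target 1 on nfc device 0. */
#include <linux/nfc.h>
#include <netlink/netlink.h>
#include <netlink/genl/genl.h>
#include <netlink/genl/ctrl.h>

int main(void)
{
	struct nl_sock *sk = nl_socket_alloc();
	struct nl_msg *msg;
	int family;

	genl_connect(sk);
	family = genl_ctrl_resolve(sk, NFC_GENL_NAME);	/* "nfc" */

	msg = nlmsg_alloc();
	genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, family, 0, 0,
		    NFC_CMD_DEACTIVATE_TARGET, NFC_GENL_VERSION);
	nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, 0);	/* placeholder */
	nla_put_u32(msg, NFC_ATTR_TARGET_INDEX, 1);	/* placeholder */

	nl_send_auto(sk, msg);
	nlmsg_free(msg);
	nl_socket_free(sk);
	return 0;
}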
Once the command has been executed, the client application can power off the NFC adapter to reduce power consumption. Signed-off-by: Mark Greer Signed-off-by: Samuel Ortiz --- include/uapi/linux/nfc.h | 2 ++ net/nfc/netlink.c | 29 +++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/nfc.h b/include/uapi/linux/nfc.h index 399f39ff8048..f6e3c8c9c744 100644 --- a/include/uapi/linux/nfc.h +++ b/include/uapi/linux/nfc.h @@ -89,6 +89,7 @@ * @NFC_CMD_ACTIVATE_TARGET: Request NFC controller to reactivate target. * @NFC_CMD_VENDOR: Vendor specific command, to be implemented directly * from the driver in order to support hardware specific operations. + * @NFC_CMD_DEACTIVATE_TARGET: Request NFC controller to deactivate target. */ enum nfc_commands { NFC_CMD_UNSPEC, @@ -121,6 +122,7 @@ enum nfc_commands { NFC_CMD_SE_IO, NFC_CMD_ACTIVATE_TARGET, NFC_CMD_VENDOR, + NFC_CMD_DEACTIVATE_TARGET, /* private: internal use only */ __NFC_CMD_AFTER_LAST }; diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index b251fb936a27..f6359c277212 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -928,6 +928,30 @@ static int nfc_genl_activate_target(struct sk_buff *skb, struct genl_info *info) return rc; } +static int nfc_genl_deactivate_target(struct sk_buff *skb, + struct genl_info *info) +{ + struct nfc_dev *dev; + u32 device_idx, target_idx; + int rc; + + if (!info->attrs[NFC_ATTR_DEVICE_INDEX]) + return -EINVAL; + + device_idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); + + dev = nfc_get_device(device_idx); + if (!dev) + return -ENODEV; + + target_idx = nla_get_u32(info->attrs[NFC_ATTR_TARGET_INDEX]); + + rc = nfc_deactivate_target(dev, target_idx, NFC_TARGET_MODE_SLEEP); + + nfc_put_device(dev); + return rc; +} + static int nfc_genl_dep_link_up(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; @@ -1751,6 +1775,11 @@ static const struct genl_ops nfc_genl_ops[] = { .doit = nfc_genl_vendor_cmd, .policy = nfc_genl_policy, }, + { + .cmd = NFC_CMD_DEACTIVATE_TARGET, + .doit = nfc_genl_deactivate_target, + .policy = nfc_genl_policy, + }, }; static struct genl_family nfc_genl_family __ro_after_init = { -- cgit v1.2.3 From 47d5b6db2afa766d7af85db684d0b5f092e4fc46 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Thu, 9 Nov 2017 23:10:59 +0100 Subject: net: bridge: Add/del switchdev object on host join/leave When the host joins or leaves a multicast group, use switchdev to add an object to the hardware to forward traffic for the group to the host. Signed-off-by: Andrew Lunn Acked-by: Nikolay Aleksandrov Signed-off-by: David S. 
Miller
---
 include/net/switchdev.h   |  1 +
 net/bridge/br_mdb.c       | 43 +++++++++++++++++++++++++++++++++++++++++++
 net/switchdev/switchdev.c |  2 ++
 3 files changed, 46 insertions(+)
(limited to 'include')

diff --git a/include/net/switchdev.h b/include/net/switchdev.h
index d756fbe46625..39bc855d7fee 100644
--- a/include/net/switchdev.h
+++ b/include/net/switchdev.h
@@ -76,6 +76,7 @@ enum switchdev_obj_id {
 	SWITCHDEV_OBJ_ID_UNDEFINED,
 	SWITCHDEV_OBJ_ID_PORT_VLAN,
 	SWITCHDEV_OBJ_ID_PORT_MDB,
+	SWITCHDEV_OBJ_ID_HOST_MDB,
 };

 struct switchdev_obj {
diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index 702408d2a93c..b0f4c734900b 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -292,6 +292,46 @@ err:
 	kfree(priv);
 }

+static void br_mdb_switchdev_host_port(struct net_device *dev,
+				       struct net_device *lower_dev,
+				       struct br_mdb_entry *entry, int type)
+{
+	struct switchdev_obj_port_mdb mdb = {
+		.obj = {
+			.id = SWITCHDEV_OBJ_ID_HOST_MDB,
+			.flags = SWITCHDEV_F_DEFER,
+		},
+		.vid = entry->vid,
+	};
+
+	if (entry->addr.proto == htons(ETH_P_IP))
+		ip_eth_mc_map(entry->addr.u.ip4, mdb.addr);
+#if IS_ENABLED(CONFIG_IPV6)
+	else
+		ipv6_eth_mc_map(&entry->addr.u.ip6, mdb.addr);
+#endif
+
+	mdb.obj.orig_dev = dev;
+	switch (type) {
+	case RTM_NEWMDB:
+		switchdev_port_obj_add(lower_dev, &mdb.obj);
+		break;
+	case RTM_DELMDB:
+		switchdev_port_obj_del(lower_dev, &mdb.obj);
+		break;
+	}
+}
+
+static void br_mdb_switchdev_host(struct net_device *dev,
+				  struct br_mdb_entry *entry, int type)
+{
+	struct net_device *lower_dev;
+	struct list_head *iter;
+
+	netdev_for_each_lower_dev(dev, lower_dev, iter)
+		br_mdb_switchdev_host_port(dev, lower_dev, entry, type);
+}
+
 static void __br_mdb_notify(struct net_device *dev, struct net_bridge_port *p,
 			    struct br_mdb_entry *entry, int type)
 {
@@ -331,6 +371,9 @@ static void __br_mdb_notify(struct net_device *dev, struct net_bridge_port *p,
 		switchdev_port_obj_del(port_dev, &mdb.obj);
 	}

+	if (!p)
+		br_mdb_switchdev_host(dev, entry, type);
+
 	skb = nlmsg_new(rtnl_mdb_nlmsg_size(), GFP_ATOMIC);
 	if (!skb)
 		goto errout;
diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c
index 0531b41d1f2d..74b9d916a58b 100644
--- a/net/switchdev/switchdev.c
+++ b/net/switchdev/switchdev.c
@@ -345,6 +345,8 @@ static size_t switchdev_obj_size(const struct switchdev_obj *obj)
 		return sizeof(struct switchdev_obj_port_vlan);
 	case SWITCHDEV_OBJ_ID_PORT_MDB:
 		return sizeof(struct switchdev_obj_port_mdb);
+	case SWITCHDEV_OBJ_ID_HOST_MDB:
+		return sizeof(struct switchdev_obj_port_mdb);
 	default:
 		BUG();
 	}
-- cgit v1.2.3

From a3dcaf17ee54f1d01d22cc2b22cab0b4f60d78cf Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Tue, 7 Nov 2017 00:29:27 -0800
Subject: net: allow per netns sysctl_rmem and sysctl_wmem for protos

As we want to gradually implement per-netns sysctl_rmem and sysctl_wmem
on a per-protocol basis, add two new fields in struct proto, and two new
helpers: sk_get_wmem0() and sk_get_rmem0().

The first user will be TCP. Then UDP and SCTP can be easily converted,
while DECNET probably won't get this support.

Signed-off-by: Eric Dumazet
Signed-off-by: David S.
Miller --- include/net/sock.h | 22 ++++++++++++++++++++++ include/trace/events/sock.h | 2 +- net/core/sock.c | 10 ++++++---- 3 files changed, 29 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 6f1be9726e02..688a823dccc3 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1101,8 +1101,12 @@ struct proto { */ unsigned long *memory_pressure; long *sysctl_mem; + int *sysctl_wmem; int *sysctl_rmem; + u32 sysctl_wmem_offset; + u32 sysctl_rmem_offset; + int max_header; bool no_autobind; @@ -2390,4 +2394,22 @@ extern int sysctl_optmem_max; extern __u32 sysctl_wmem_default; extern __u32 sysctl_rmem_default; +static inline int sk_get_wmem0(const struct sock *sk, const struct proto *proto) +{ + /* Does this proto have per netns sysctl_wmem ? */ + if (proto->sysctl_wmem_offset) + return *(int *)((void *)sock_net(sk) + proto->sysctl_wmem_offset); + + return *proto->sysctl_wmem; +} + +static inline int sk_get_rmem0(const struct sock *sk, const struct proto *proto) +{ + /* Does this proto have per netns sysctl_rmem ? */ + if (proto->sysctl_rmem_offset) + return *(int *)((void *)sock_net(sk) + proto->sysctl_rmem_offset); + + return *proto->sysctl_rmem; +} + #endif /* _SOCK_H */ diff --git a/include/trace/events/sock.h b/include/trace/events/sock.h index 6d31c0520ef3..ec4dade24466 100644 --- a/include/trace/events/sock.h +++ b/include/trace/events/sock.h @@ -48,7 +48,7 @@ TRACE_EVENT(sock_exceed_buf_limit, strncpy(__entry->name, prot->name, 32); __entry->sysctl_mem = prot->sysctl_mem; __entry->allocated = allocated; - __entry->sysctl_rmem = prot->sysctl_rmem[0]; + __entry->sysctl_rmem = sk_get_rmem0(sk, prot); __entry->rmem_alloc = atomic_read(&sk->sk_rmem_alloc); ), diff --git a/net/core/sock.c b/net/core/sock.c index 759400053110..c59bcf90d905 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2346,16 +2346,18 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind) /* guarantee minimum buffer size under pressure */ if (kind == SK_MEM_RECV) { - if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0]) + if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot)) return 1; } else { /* SK_MEM_SEND */ + int wmem0 = sk_get_wmem0(sk, prot); + if (sk->sk_type == SOCK_STREAM) { - if (sk->sk_wmem_queued < prot->sysctl_wmem[0]) + if (sk->sk_wmem_queued < wmem0) return 1; - } else if (refcount_read(&sk->sk_wmem_alloc) < - prot->sysctl_wmem[0]) + } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) { return 1; + } } if (sk_has_memory_pressure(sk)) { -- cgit v1.2.3 From 356d1833b638bd465672aefeb71def3ab93fc17d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 7 Nov 2017 00:29:28 -0800 Subject: tcp: Namespace-ify sysctl_tcp_rmem and sysctl_tcp_wmem Note that when a new netns is created, it inherits its sysctl_tcp_rmem and sysctl_tcp_wmem from initial netns. This change is needed so that we can refine TCP rcvbuf autotuning, to take RTT into consideration. Signed-off-by: Eric Dumazet Cc: Wei Wang Signed-off-by: David S. 
Miller --- include/net/netns/ipv4.h | 2 ++ include/net/tcp.h | 2 -- net/ipv4/sysctl_net_ipv4.c | 32 ++++++++++++++++---------------- net/ipv4/tcp.c | 21 ++++++++------------- net/ipv4/tcp_input.c | 14 ++++++++------ net/ipv4/tcp_ipv4.c | 13 ++++++++++--- net/ipv4/tcp_output.c | 2 +- net/ipv6/tcp_ipv6.c | 4 ++-- 8 files changed, 47 insertions(+), 43 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 379550f8124a..5e12975fc658 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -155,6 +155,8 @@ struct netns_ipv4 { int sysctl_tcp_invalid_ratelimit; int sysctl_tcp_pacing_ss_ratio; int sysctl_tcp_pacing_ca_ratio; + int sysctl_tcp_wmem[3]; + int sysctl_tcp_rmem[3]; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index babfd4da1515..2f2c69ad31b2 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -242,8 +242,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); /* sysctl variables for tcp */ extern int sysctl_tcp_max_orphans; extern long sysctl_tcp_mem[3]; -extern int sysctl_tcp_wmem[3]; -extern int sysctl_tcp_rmem[3]; #define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */ #define TCP_RACK_STATIC_REO_WND 0x2 /* Use static RACK reo wnd */ diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index a82b44038308..ef0ff3357a44 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -440,22 +440,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, - { - .procname = "tcp_wmem", - .data = &sysctl_tcp_wmem, - .maxlen = sizeof(sysctl_tcp_wmem), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &one, - }, - { - .procname = "tcp_rmem", - .data = &sysctl_tcp_rmem, - .maxlen = sizeof(sysctl_tcp_rmem), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &one, - }, { .procname = "tcp_low_latency", .data = &sysctl_tcp_low_latency, @@ -1164,6 +1148,22 @@ static struct ctl_table ipv4_net_table[] = { .extra1 = &zero, .extra2 = &thousand, }, + { + .procname = "tcp_wmem", + .data = &init_net.ipv4.sysctl_tcp_wmem, + .maxlen = sizeof(init_net.ipv4.sysctl_tcp_wmem), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, + }, + { + .procname = "tcp_rmem", + .data = &init_net.ipv4.sysctl_tcp_rmem, + .maxlen = sizeof(init_net.ipv4.sysctl_tcp_rmem), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, + }, { } }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index c4cb19ed4628..bc71a27d5ad9 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -289,12 +289,7 @@ struct percpu_counter tcp_orphan_count; EXPORT_SYMBOL_GPL(tcp_orphan_count); long sysctl_tcp_mem[3] __read_mostly; -int sysctl_tcp_wmem[3] __read_mostly; -int sysctl_tcp_rmem[3] __read_mostly; - EXPORT_SYMBOL(sysctl_tcp_mem); -EXPORT_SYMBOL(sysctl_tcp_rmem); -EXPORT_SYMBOL(sysctl_tcp_wmem); atomic_long_t tcp_memory_allocated; /* Current allocated memory. 
*/ EXPORT_SYMBOL(tcp_memory_allocated); @@ -456,8 +451,8 @@ void tcp_init_sock(struct sock *sk) icsk->icsk_sync_mss = tcp_sync_mss; - sk->sk_sndbuf = sysctl_tcp_wmem[1]; - sk->sk_rcvbuf = sysctl_tcp_rmem[1]; + sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[1]; + sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1]; sk_sockets_allocated_inc(sk); } @@ -3636,13 +3631,13 @@ void __init tcp_init(void) max_wshare = min(4UL*1024*1024, limit); max_rshare = min(6UL*1024*1024, limit); - sysctl_tcp_wmem[0] = SK_MEM_QUANTUM; - sysctl_tcp_wmem[1] = 16*1024; - sysctl_tcp_wmem[2] = max(64*1024, max_wshare); + init_net.ipv4.sysctl_tcp_wmem[0] = SK_MEM_QUANTUM; + init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024; + init_net.ipv4.sysctl_tcp_wmem[2] = max(64*1024, max_wshare); - sysctl_tcp_rmem[0] = SK_MEM_QUANTUM; - sysctl_tcp_rmem[1] = 87380; - sysctl_tcp_rmem[2] = max(87380, max_rshare); + init_net.ipv4.sysctl_tcp_rmem[0] = SK_MEM_QUANTUM; + init_net.ipv4.sysctl_tcp_rmem[1] = 87380; + init_net.ipv4.sysctl_tcp_rmem[2] = max(87380, max_rshare); pr_info("Hash tables configured (established %u bind %u)\n", tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b54ee09cbcf7..9ceaa1fdc3ab 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -320,7 +320,7 @@ static void tcp_sndbuf_expand(struct sock *sk) sndmem *= nr_segs * per_mss; if (sk->sk_sndbuf < sndmem) - sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]); + sk->sk_sndbuf = min(sndmem, sock_net(sk)->ipv4.sysctl_tcp_wmem[2]); } /* 2. Tuning advertised window (window_clamp, rcv_ssthresh) @@ -354,7 +354,7 @@ static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb) struct tcp_sock *tp = tcp_sk(sk); /* Optimize this! */ int truesize = tcp_win_from_space(sk, skb->truesize) >> 1; - int window = tcp_win_from_space(sk, sysctl_tcp_rmem[2]) >> 1; + int window = tcp_win_from_space(sk, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1; while (tp->rcv_ssthresh <= window) { if (truesize <= skb->len) @@ -409,7 +409,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk) rcvmem <<= 2; if (sk->sk_rcvbuf < rcvmem) - sk->sk_rcvbuf = min(rcvmem, sysctl_tcp_rmem[2]); + sk->sk_rcvbuf = min(rcvmem, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); } /* 4. Try to fixup all. 
It is made immediately after connection enters @@ -457,15 +457,16 @@ static void tcp_clamp_window(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); + struct net *net = sock_net(sk); icsk->icsk_ack.quick = 0; - if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] && + if (sk->sk_rcvbuf < net->ipv4.sysctl_tcp_rmem[2] && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && !tcp_under_memory_pressure(sk) && sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) { sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), - sysctl_tcp_rmem[2]); + net->ipv4.sysctl_tcp_rmem[2]); } if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss); @@ -623,7 +624,8 @@ void tcp_rcv_space_adjust(struct sock *sk) while (tcp_win_from_space(sk, rcvmem) < tp->advmss) rcvmem += 128; - rcvbuf = min(rcvwin / tp->advmss * rcvmem, sysctl_tcp_rmem[2]); + rcvbuf = min(rcvwin / tp->advmss * rcvmem, + sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); if (rcvbuf > sk->sk_rcvbuf) { sk->sk_rcvbuf = rcvbuf; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 0162c577bb9c..1eac84b8044e 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2409,8 +2409,8 @@ struct proto tcp_prot = { .memory_allocated = &tcp_memory_allocated, .memory_pressure = &tcp_memory_pressure, .sysctl_mem = sysctl_tcp_mem, - .sysctl_wmem = sysctl_tcp_wmem, - .sysctl_rmem = sysctl_tcp_rmem, + .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), + .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), .max_header = MAX_TCP_HEADER, .obj_size = sizeof(struct tcp_sock), .slab_flags = SLAB_TYPESAFE_BY_RCU, @@ -2509,7 +2509,14 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2; net->ipv4.sysctl_tcp_pacing_ss_ratio = 200; net->ipv4.sysctl_tcp_pacing_ca_ratio = 120; - + if (net != &init_net) { + memcpy(net->ipv4.sysctl_tcp_rmem, + init_net.ipv4.sysctl_tcp_rmem, + sizeof(init_net.ipv4.sysctl_tcp_rmem)); + memcpy(net->ipv4.sysctl_tcp_wmem, + init_net.ipv4.sysctl_tcp_wmem, + sizeof(init_net.ipv4.sysctl_tcp_wmem)); + } net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index a9d917e4dad5..9b98d35aa0d8 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -220,7 +220,7 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss, (*rcv_wscale) = 0; if (wscale_ok) { /* Set window scaling on max possible window */ - space = max_t(u32, space, sysctl_tcp_rmem[2]); + space = max_t(u32, space, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); space = max_t(u32, space, sysctl_rmem_max); space = min_t(u32, space, *window_clamp); while (space > U16_MAX && (*rcv_wscale) < TCP_MAX_WSCALE) { diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 0e2529958b52..6bb98c93edfe 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1940,8 +1940,8 @@ struct proto tcpv6_prot = { .memory_pressure = &tcp_memory_pressure, .orphan_count = &tcp_orphan_count, .sysctl_mem = sysctl_tcp_mem, - .sysctl_wmem = sysctl_tcp_wmem, - .sysctl_rmem = sysctl_tcp_rmem, + .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), + .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), .max_header = MAX_TCP_HEADER, .obj_size = sizeof(struct tcp6_sock), .slab_flags = SLAB_TYPESAFE_BY_RCU, -- cgit v1.2.3 From 
4c5b9d9642c859f7369338fc42c0f62f4151bef3 Mon Sep 17 00:00:00 2001 From: Manish Kurup Date: Tue, 7 Nov 2017 15:49:05 -0500 Subject: act_vlan: VLAN action rewrite to use RCU lock/unlock and update Using a spinlock in the VLAN action causes performance issues when the VLAN action is used on multiple cores. Rewrote the VLAN action to use RCU read locking for reads and updates instead. All functions now use an RCU dereferenced pointer to access the VLAN action context. Modified helper functions used by other modules, to use the RCU as opposed to directly accessing the structure. Acked-by: Jamal Hadi Salim Acked-by: Jiri Pirko Signed-off-by: Manish Kurup Signed-off-by: David S. Miller --- include/net/tc_act/tc_vlan.h | 46 +++++++++++++++++++++------ net/sched/act_vlan.c | 75 ++++++++++++++++++++++++++++++-------------- 2 files changed, 88 insertions(+), 33 deletions(-) (limited to 'include') diff --git a/include/net/tc_act/tc_vlan.h b/include/net/tc_act/tc_vlan.h index c2090df944ff..22ae260d6869 100644 --- a/include/net/tc_act/tc_vlan.h +++ b/include/net/tc_act/tc_vlan.h @@ -13,12 +13,17 @@ #include #include +struct tcf_vlan_params { + int tcfv_action; + u16 tcfv_push_vid; + __be16 tcfv_push_proto; + u8 tcfv_push_prio; + struct rcu_head rcu; +}; + struct tcf_vlan { struct tc_action common; - int tcfv_action; - u16 tcfv_push_vid; - __be16 tcfv_push_proto; - u8 tcfv_push_prio; + struct tcf_vlan_params __rcu *vlan_p; }; #define to_vlan(a) ((struct tcf_vlan *)a) @@ -33,22 +38,45 @@ static inline bool is_tcf_vlan(const struct tc_action *a) static inline u32 tcf_vlan_action(const struct tc_action *a) { - return to_vlan(a)->tcfv_action; + u32 tcfv_action; + + rcu_read_lock(); + tcfv_action = rcu_dereference(to_vlan(a)->vlan_p)->tcfv_action; + rcu_read_unlock(); + + return tcfv_action; } static inline u16 tcf_vlan_push_vid(const struct tc_action *a) { - return to_vlan(a)->tcfv_push_vid; + u16 tcfv_push_vid; + + rcu_read_lock(); + tcfv_push_vid = rcu_dereference(to_vlan(a)->vlan_p)->tcfv_push_vid; + rcu_read_unlock(); + + return tcfv_push_vid; } static inline __be16 tcf_vlan_push_proto(const struct tc_action *a) { - return to_vlan(a)->tcfv_push_proto; + __be16 tcfv_push_proto; + + rcu_read_lock(); + tcfv_push_proto = rcu_dereference(to_vlan(a)->vlan_p)->tcfv_push_proto; + rcu_read_unlock(); + + return tcfv_push_proto; } static inline u8 tcf_vlan_push_prio(const struct tc_action *a) { - return to_vlan(a)->tcfv_push_prio; -} + u8 tcfv_push_prio; + rcu_read_lock(); + tcfv_push_prio = rcu_dereference(to_vlan(a)->vlan_p)->tcfv_push_prio; + rcu_read_unlock(); + + return tcfv_push_prio; +} #endif /* __NET_TC_VLAN_H */ diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index b093badc1450..97f717a13ad5 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -26,6 +26,7 @@ static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_vlan *v = to_vlan(a); + struct tcf_vlan_params *p; int action; int err; u16 tci; @@ -33,24 +34,27 @@ static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a, tcf_lastuse_update(&v->tcf_tm); bstats_cpu_update(this_cpu_ptr(v->common.cpu_bstats), skb); - spin_lock(&v->tcf_lock); - action = v->tcf_action; - /* Ensure 'data' points at mac_header prior calling vlan manipulating * functions. 
*/ if (skb_at_tc_ingress(skb)) skb_push_rcsum(skb, skb->mac_len); - switch (v->tcfv_action) { + rcu_read_lock(); + + action = READ_ONCE(v->tcf_action); + + p = rcu_dereference(v->vlan_p); + + switch (p->tcfv_action) { case TCA_VLAN_ACT_POP: err = skb_vlan_pop(skb); if (err) goto drop; break; case TCA_VLAN_ACT_PUSH: - err = skb_vlan_push(skb, v->tcfv_push_proto, v->tcfv_push_vid | - (v->tcfv_push_prio << VLAN_PRIO_SHIFT)); + err = skb_vlan_push(skb, p->tcfv_push_proto, p->tcfv_push_vid | + (p->tcfv_push_prio << VLAN_PRIO_SHIFT)); if (err) goto drop; break; @@ -69,14 +73,14 @@ static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a, goto drop; } /* replace the vid */ - tci = (tci & ~VLAN_VID_MASK) | v->tcfv_push_vid; + tci = (tci & ~VLAN_VID_MASK) | p->tcfv_push_vid; /* replace prio bits, if tcfv_push_prio specified */ - if (v->tcfv_push_prio) { + if (p->tcfv_push_prio) { tci &= ~VLAN_PRIO_MASK; - tci |= v->tcfv_push_prio << VLAN_PRIO_SHIFT; + tci |= p->tcfv_push_prio << VLAN_PRIO_SHIFT; } /* put updated tci as hwaccel tag */ - __vlan_hwaccel_put_tag(skb, v->tcfv_push_proto, tci); + __vlan_hwaccel_put_tag(skb, p->tcfv_push_proto, tci); break; default: BUG(); @@ -89,10 +93,10 @@ drop: qstats_drop_inc(this_cpu_ptr(v->common.cpu_qstats)); unlock: + rcu_read_unlock(); if (skb_at_tc_ingress(skb)) skb_pull_rcsum(skb, skb->mac_len); - spin_unlock(&v->tcf_lock); return action; } @@ -109,6 +113,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, { struct tc_action_net *tn = net_generic(net, vlan_net_id); struct nlattr *tb[TCA_VLAN_MAX + 1]; + struct tcf_vlan_params *p, *p_old; struct tc_vlan *parm; struct tcf_vlan *v; int action; @@ -187,46 +192,67 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, v = to_vlan(*a); - spin_lock_bh(&v->tcf_lock); - - v->tcfv_action = action; - v->tcfv_push_vid = push_vid; - v->tcfv_push_prio = push_prio; - v->tcfv_push_proto = push_proto; + ASSERT_RTNL(); + p = kzalloc(sizeof(*p), GFP_KERNEL); + if (!p) { + if (ovr) + tcf_idr_release(*a, bind); + return -ENOMEM; + } v->tcf_action = parm->action; - spin_unlock_bh(&v->tcf_lock); + p_old = rtnl_dereference(v->vlan_p); + + p->tcfv_action = action; + p->tcfv_push_vid = push_vid; + p->tcfv_push_prio = push_prio; + p->tcfv_push_proto = push_proto; + + rcu_assign_pointer(v->vlan_p, p); + + if (p_old) + kfree_rcu(p_old, rcu); if (ret == ACT_P_CREATED) tcf_idr_insert(tn, *a); return ret; } +static void tcf_vlan_cleanup(struct tc_action *a, int bind) +{ + struct tcf_vlan *v = to_vlan(a); + struct tcf_vlan_params *p; + + p = rcu_dereference_protected(v->vlan_p, 1); + kfree_rcu(p, rcu); +} + static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); struct tcf_vlan *v = to_vlan(a); + struct tcf_vlan_params *p = rtnl_dereference(v->vlan_p); struct tc_vlan opt = { .index = v->tcf_index, .refcnt = v->tcf_refcnt - ref, .bindcnt = v->tcf_bindcnt - bind, .action = v->tcf_action, - .v_action = v->tcfv_action, + .v_action = p->tcfv_action, }; struct tcf_t t; if (nla_put(skb, TCA_VLAN_PARMS, sizeof(opt), &opt)) goto nla_put_failure; - if ((v->tcfv_action == TCA_VLAN_ACT_PUSH || - v->tcfv_action == TCA_VLAN_ACT_MODIFY) && - (nla_put_u16(skb, TCA_VLAN_PUSH_VLAN_ID, v->tcfv_push_vid) || + if ((p->tcfv_action == TCA_VLAN_ACT_PUSH || + p->tcfv_action == TCA_VLAN_ACT_MODIFY) && + (nla_put_u16(skb, TCA_VLAN_PUSH_VLAN_ID, p->tcfv_push_vid) || nla_put_be16(skb, TCA_VLAN_PUSH_VLAN_PROTOCOL, - v->tcfv_push_proto) || + p->tcfv_push_proto) || 
(nla_put_u8(skb, TCA_VLAN_PUSH_VLAN_PRIORITY,
-			 v->tcfv_push_prio))))
+			 p->tcfv_push_prio))))
 		goto nla_put_failure;

 	tcf_tm_dump(&t, &v->tcf_tm);
@@ -262,6 +288,7 @@ static struct tc_action_ops act_vlan_ops = {
 	.act		=	tcf_vlan,
 	.dump		=	tcf_vlan_dump,
 	.init		=	tcf_vlan_init,
+	.cleanup	=	tcf_vlan_cleanup,
 	.walk		=	tcf_vlan_walker,
 	.lookup		=	tcf_vlan_search,
 	.size		=	sizeof(struct tcf_vlan),
-- cgit v1.2.3

From 54985120a1c461b74f9510e5d730971f2a2383b1 Mon Sep 17 00:00:00 2001
From: Girish Moodalbail
Date: Tue, 7 Nov 2017 11:32:11 -0800
Subject: net: fix incorrect comment with regard to VLAN packet handling

The commit bcc6d4790361 ("net: vlan: make non-hw-accel rx path similar
to hw-accel") unified the accel and non-accel paths for VLAN RX. With
that fix we no longer register any packet_type handler for VLANs, so
fix the incorrect comment.

Signed-off-by: Girish Moodalbail
Signed-off-by: David S. Miller
---
 include/linux/netdevice.h | 8 --------
 1 file changed, 8 deletions(-)
(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 79518ede3170..6b274bfe489f 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4479,15 +4479,7 @@ do {								\
  *	Why 16. Because with 16 the only overlap we get on a hash of the
  *	low nibble of the protocol value is RARP/SNAP/X.25.
  *
- *	NOTE:  That is no longer true with the addition of VLAN tags.  Not
- *		sure which should go first, but I bet it won't make much
- *		difference if we are running VLANs.  The good news is that
- *		this protocol won't be in the list unless compiled in, so
- *		the average user (w/out VLANs) will not be adversely affected.
- *			--BLG
- *
  *		0800	IP
- *		8100    802.1Q VLAN
  *		0001	802.3
  *		0002	AX.25
  *		0004	802.2
-- cgit v1.2.3

From dd0bb688eaa241b5655d396d45366cba9225aed9 Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Tue, 7 Nov 2017 15:28:42 -0500
Subject: bpf: add a bpf_override_function helper

Error injection is sloppy and very ad-hoc. BPF could fill this niche
perfectly with its kprobe functionality. We can make sure errors are
only triggered in the specific call chains and situations that we care
about. Accomplish this with the bpf_override_function helper. This will
modify the probed caller's return value to the specified value and set
the PC to an override function that simply returns, bypassing the
originally probed function. This gives us a nice clean way to implement
systematic error injection for all of our code paths.

Acked-by: Alexei Starovoitov
Signed-off-by: Josef Bacik
Acked-by: Daniel Borkmann
Signed-off-by: David S.
Miller --- arch/Kconfig | 3 +++ arch/x86/Kconfig | 1 + arch/x86/include/asm/kprobes.h | 4 ++++ arch/x86/include/asm/ptrace.h | 5 +++++ arch/x86/kernel/kprobes/ftrace.c | 14 ++++++++++++++ include/linux/filter.h | 3 ++- include/linux/trace_events.h | 1 + include/uapi/linux/bpf.h | 7 ++++++- kernel/bpf/core.c | 3 +++ kernel/bpf/verifier.c | 2 ++ kernel/events/core.c | 7 +++++++ kernel/trace/Kconfig | 11 +++++++++++ kernel/trace/bpf_trace.c | 35 +++++++++++++++++++++++++++++++++++ kernel/trace/trace_kprobe.c | 40 +++++++++++++++++++++++++++++++++------- kernel/trace/trace_probe.h | 6 ++++++ 15 files changed, 133 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/arch/Kconfig b/arch/Kconfig index 057370a0ac4e..6e8520f09bc1 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -196,6 +196,9 @@ config HAVE_OPTPROBES config HAVE_KPROBES_ON_FTRACE bool +config HAVE_KPROBE_OVERRIDE + bool + config HAVE_NMI bool diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 2fdb23313dd5..51458c1a0b4a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -153,6 +153,7 @@ config X86 select HAVE_KERNEL_XZ select HAVE_KPROBES select HAVE_KPROBES_ON_FTRACE + select HAVE_KPROBE_OVERRIDE select HAVE_KRETPROBES select HAVE_KVM select HAVE_LIVEPATCH if X86_64 diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h index 6cf65437b5e5..c6c3b1f4306a 100644 --- a/arch/x86/include/asm/kprobes.h +++ b/arch/x86/include/asm/kprobes.h @@ -67,6 +67,10 @@ extern const int kretprobe_blacklist_size; void arch_remove_kprobe(struct kprobe *p); asmlinkage void kretprobe_trampoline(void); +#ifdef CONFIG_KPROBES_ON_FTRACE +extern void arch_ftrace_kprobe_override_function(struct pt_regs *regs); +#endif + /* Architecture specific copy of original instruction*/ struct arch_specific_insn { /* copy of the original instruction */ diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index c0e3c45cf6ab..2370bb0149cc 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -109,6 +109,11 @@ static inline unsigned long regs_return_value(struct pt_regs *regs) return regs->ax; } +static inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc) +{ + regs->ax = rc; +} + /* * user_mode(regs) determines whether a register set came from user * mode. On x86_32, this is true if V8086 mode was enabled OR if the diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c index 041f7b6dfa0f..3c455bf490cb 100644 --- a/arch/x86/kernel/kprobes/ftrace.c +++ b/arch/x86/kernel/kprobes/ftrace.c @@ -97,3 +97,17 @@ int arch_prepare_kprobe_ftrace(struct kprobe *p) p->ainsn.boostable = false; return 0; } + +asmlinkage void override_func(void); +asm( + ".type override_func, @function\n" + "override_func:\n" + " ret\n" + ".size override_func, .-override_func\n" +); + +void arch_ftrace_kprobe_override_function(struct pt_regs *regs) +{ + regs->ip = (unsigned long)&override_func; +} +NOKPROBE_SYMBOL(arch_ftrace_kprobe_override_function); diff --git a/include/linux/filter.h b/include/linux/filter.h index 0cd02ff4ae30..eaec066f99e8 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -459,7 +459,8 @@ struct bpf_prog { locked:1, /* Program image locked? */ gpl_compatible:1, /* Is filter GPL compatible? */ cb_access:1, /* Is control block accessed? */ - dst_needed:1; /* Do we need dst entry? */ + dst_needed:1, /* Do we need dst entry? */ + kprobe_override:1; /* Do we override a kprobe? 
*/ kmemcheck_bitfield_end(meta); enum bpf_prog_type type; /* Type of BPF program */ u32 len; /* Number of filter blocks */ diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 84014ecfa67f..17e5e820a84c 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -523,6 +523,7 @@ do { \ struct perf_event; DECLARE_PER_CPU(struct pt_regs, perf_trace_regs); +DECLARE_PER_CPU(int, bpf_kprobe_override); extern int perf_trace_init(struct perf_event *event); extern void perf_trace_destroy(struct perf_event *event); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e880ae6434ee..adb66f78b674 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -677,6 +677,10 @@ union bpf_attr { * @buf: buf to fill * @buf_size: size of the buf * Return : 0 on success or negative error code + * + * int bpf_override_return(pt_regs, rc) + * @pt_regs: pointer to struct pt_regs + * @rc: the return value to set */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -736,7 +740,8 @@ union bpf_attr { FN(xdp_adjust_meta), \ FN(perf_event_read_value), \ FN(perf_prog_read_value), \ - FN(getsockopt), + FN(getsockopt), \ + FN(override_return), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 8a6c37762330..271daad31f37 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1326,6 +1326,9 @@ EVAL4(PROG_NAME_LIST, 416, 448, 480, 512) bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp) { + if (fp->kprobe_override) + return false; + if (!array->owner_prog_type) { /* There's no owner yet where we could check for * compatibility. diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4a942e2e753d..bc464b8ec91e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4357,6 +4357,8 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) prog->dst_needed = 1; if (insn->imm == BPF_FUNC_get_prandom_u32) bpf_user_rnd_init_once(); + if (insn->imm == BPF_FUNC_override_return) + prog->kprobe_override = 1; if (insn->imm == BPF_FUNC_tail_call) { /* If we tail call into other programs, we * cannot make any assumptions since they can diff --git a/kernel/events/core.c b/kernel/events/core.c index 42d24bd64ea4..ac240d31b5bf 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8171,6 +8171,13 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) return -EINVAL; } + /* Kprobe override only works for kprobes, not uprobes. */ + if (prog->kprobe_override && + !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) { + bpf_prog_put(prog); + return -EINVAL; + } + if (is_tracepoint || is_syscall_tp) { int off = trace_event_get_offsets(event->tp_event); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 434c840e2d82..9dc0deeaad2b 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -518,6 +518,17 @@ config FUNCTION_PROFILER If in doubt, say N. +config BPF_KPROBE_OVERRIDE + bool "Enable BPF programs to override a kprobed function" + depends on BPF_EVENTS + depends on KPROBES_ON_FTRACE + depends on HAVE_KPROBE_OVERRIDE + depends on DYNAMIC_FTRACE_WITH_REGS + default n + help + Allows BPF to override the execution of a probed function and + set a different return value. This is used for error injection. 
+ config FTRACE_MCOUNT_RECORD def_bool y depends on DYNAMIC_FTRACE diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 506efe6e8ed9..1865b0d4cdeb 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -13,6 +13,10 @@ #include #include #include +#include +#include + +#include "trace_probe.h" #include "trace.h" u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); @@ -76,6 +80,29 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) } EXPORT_SYMBOL_GPL(trace_call_bpf); +#ifdef CONFIG_BPF_KPROBE_OVERRIDE +BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc) +{ + __this_cpu_write(bpf_kprobe_override, 1); + regs_set_return_value(regs, rc); + arch_ftrace_kprobe_override_function(regs); + return 0; +} +#else +BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc) +{ + return -EINVAL; +} +#endif + +static const struct bpf_func_proto bpf_override_return_proto = { + .func = bpf_override_return, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr) { int ret; @@ -551,6 +578,10 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func return &bpf_get_stackid_proto; case BPF_FUNC_perf_event_read_value: return &bpf_perf_event_read_value_proto; + case BPF_FUNC_override_return: + pr_warn_ratelimited("%s[%d] is installing a program with bpf_override_return helper that may cause unexpected behavior!", + current->comm, task_pid_nr(current)); + return &bpf_override_return_proto; default: return tracing_func_proto(func_id); } @@ -766,6 +797,10 @@ int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog_array *new_array; int ret = -EEXIST; + /* Kprobe override only works for ftrace based kprobes. 
*/ + if (prog->kprobe_override && !trace_kprobe_ftrace(event->tp_event)) + return -EINVAL; + mutex_lock(&bpf_event_mutex); if (event->prog) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index abf92e478cfb..8e3c9ec1faf7 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -42,6 +42,7 @@ struct trace_kprobe { (offsetof(struct trace_kprobe, tp.args) + \ (sizeof(struct probe_arg) * (n))) +DEFINE_PER_CPU(int, bpf_kprobe_override); static nokprobe_inline bool trace_kprobe_is_return(struct trace_kprobe *tk) { @@ -87,6 +88,12 @@ static nokprobe_inline unsigned long trace_kprobe_nhit(struct trace_kprobe *tk) return nhit; } +int trace_kprobe_ftrace(struct trace_event_call *call) +{ + struct trace_kprobe *tk = (struct trace_kprobe *)call->data; + return kprobe_ftrace(&tk->rp.kp); +} + static int register_kprobe_event(struct trace_kprobe *tk); static int unregister_kprobe_event(struct trace_kprobe *tk); @@ -1170,7 +1177,7 @@ static int kretprobe_event_define_fields(struct trace_event_call *event_call) #ifdef CONFIG_PERF_EVENTS /* Kprobe profile handler */ -static void +static int kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) { struct trace_event_call *call = &tk->tp.call; @@ -1179,12 +1186,29 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) int size, __size, dsize; int rctx; - if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs)) - return; + if (bpf_prog_array_valid(call)) { + int ret; + + ret = trace_call_bpf(call, regs); + + /* + * We need to check and see if we modified the pc of the + * pt_regs, and if so clear the kprobe and return 1 so that we + * don't do the instruction skipping. Also reset our state so + * we are clean the next pass through. + */ + if (__this_cpu_read(bpf_kprobe_override)) { + __this_cpu_write(bpf_kprobe_override, 0); + reset_current_kprobe(); + return 1; + } + if (!ret) + return 0; + } head = this_cpu_ptr(call->perf_events); if (hlist_empty(head)) - return; + return 0; dsize = __get_data_size(&tk->tp, regs); __size = sizeof(*entry) + tk->tp.size + dsize; @@ -1193,13 +1217,14 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) entry = perf_trace_buf_alloc(size, NULL, &rctx); if (!entry) - return; + return 0; entry->ip = (unsigned long)tk->rp.kp.addr; memset(&entry[1], 0, dsize); store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, head, NULL, NULL); + return 0; } NOKPROBE_SYMBOL(kprobe_perf_func); @@ -1275,6 +1300,7 @@ static int kprobe_register(struct trace_event_call *event, static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) { struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp); + int ret = 0; raw_cpu_inc(*tk->nhit); @@ -1282,9 +1308,9 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) kprobe_trace_func(tk, regs); #ifdef CONFIG_PERF_EVENTS if (tk->tp.flags & TP_FLAG_PROFILE) - kprobe_perf_func(tk, regs); + ret = kprobe_perf_func(tk, regs); #endif - return 0; /* We don't tweek kernel, so just return 0 */ + return ret; } NOKPROBE_SYMBOL(kprobe_dispatcher); diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 903273c93e61..adbb3f7d1fb5 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -253,6 +253,7 @@ struct symbol_cache; unsigned long update_symbol_cache(struct symbol_cache *sc); void free_symbol_cache(struct symbol_cache *sc); struct symbol_cache *alloc_symbol_cache(const 
char *sym, long offset); +int trace_kprobe_ftrace(struct trace_event_call *call); #else /* uprobes do not support symbol fetch methods */ #define fetch_symbol_u8 NULL @@ -278,6 +279,11 @@ alloc_symbol_cache(const char *sym, long offset) { return NULL; } + +static inline int trace_kprobe_ftrace(struct trace_event_call *call) +{ + return 0; +} #endif /* CONFIG_KPROBE_EVENTS */ struct probe_arg { -- cgit v1.2.3 From 2210d6b2f287d738eddf6b75f432126ce05450f8 Mon Sep 17 00:00:00 2001 From: Maciej Żenczykowski Date: Tue, 7 Nov 2017 21:52:09 -0800 Subject: net: ipv6: sysctl to specify IPv6 ND traffic class MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a per-device sysctl to specify the default traffic class to use for kernel originated IPv6 Neighbour Discovery packets. Currently this includes: - Router Solicitation (ICMPv6 type 133) ndisc_send_rs() -> ndisc_send_skb() -> ip6_nd_hdr() - Neighbour Solicitation (ICMPv6 type 135) ndisc_send_ns() -> ndisc_send_skb() -> ip6_nd_hdr() - Neighbour Advertisement (ICMPv6 type 136) ndisc_send_na() -> ndisc_send_skb() -> ip6_nd_hdr() - Redirect (ICMPv6 type 137) ndisc_send_redirect() -> ndisc_send_skb() -> ip6_nd_hdr() and if the kernel ever gets around to generating RA's, it would presumably also include: - Router Advertisement (ICMPv6 type 134) (radvd daemon could pick up on the kernel setting and use it) Interface drivers may examine the Traffic Class value and translate the DiffServ Code Point into a link-layer appropriate traffic prioritization scheme. An example of mapping IETF DSCP values to IEEE 802.11 User Priority values can be found here: https://tools.ietf.org/html/draft-ietf-tsvwg-ieee-802-11 The expected primary use case is to properly prioritize ND over wifi. Testing: jzem22:~# cat /proc/sys/net/ipv6/conf/eth0/ndisc_tclass 0 jzem22:~# echo -1 > /proc/sys/net/ipv6/conf/eth0/ndisc_tclass -bash: echo: write error: Invalid argument jzem22:~# echo 256 > /proc/sys/net/ipv6/conf/eth0/ndisc_tclass -bash: echo: write error: Invalid argument jzem22:~# echo 0 > /proc/sys/net/ipv6/conf/eth0/ndisc_tclass jzem22:~# echo 255 > /proc/sys/net/ipv6/conf/eth0/ndisc_tclass jzem22:~# cat /proc/sys/net/ipv6/conf/eth0/ndisc_tclass 255 jzem22:~# echo 34 > /proc/sys/net/ipv6/conf/eth0/ndisc_tclass jzem22:~# cat /proc/sys/net/ipv6/conf/eth0/ndisc_tclass 34 jzem22:~# echo $[0xDC] > /proc/sys/net/ipv6/conf/eth0/ndisc_tclass jzem22:~# tcpdump -v -i eth0 icmp6 and src host jzem22.pgc and dst host fe80::1 tcpdump: listening on eth0, link-type EN10MB (Ethernet), capture size 262144 bytes IP6 (class 0xdc, hlim 255, next-header ICMPv6 (58) payload length: 24) jzem22.pgc > fe80::1: [icmp6 sum ok] ICMP6, neighbor advertisement, length 24, tgt is jzem22.pgc, Flags [solicited] (based on original change written by Erik Kline, with minor changes) v2: fix 'suspicious rcu_dereference_check() usage' by explicitly grabbing the rcu_read_lock. Cc: Lorenzo Colitti Signed-off-by: Erik Kline Signed-off-by: Maciej Żenczykowski Signed-off-by: David S. 
Miller --- Documentation/networking/ip-sysctl.txt | 9 +++++++++ include/linux/ipv6.h | 1 + include/uapi/linux/ipv6.h | 1 + net/ipv6/addrconf.c | 11 +++++++++++ net/ipv6/ndisc.c | 9 ++++++++- 5 files changed, 30 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 54410a1d4065..d8676dda7fa6 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -1732,6 +1732,15 @@ ndisc_notify - BOOLEAN 1 - Generate unsolicited neighbour advertisements when device is brought up or hardware address changes. +ndisc_tclass - INTEGER + The IPv6 Traffic Class to use by default when sending IPv6 Neighbor + Discovery (Router Solicitation, Router Advertisement, Neighbor + Solicitation, Neighbor Advertisement, Redirect) messages. + These 8 bits can be interpreted as 6 high order bits holding the DSCP + value and 2 low order bits representing ECN (which you probably want + to leave cleared). + 0 - (default) + mldv1_unsolicited_report_interval - INTEGER The interval in milliseconds in which the next unsolicited MLDv1 report retransmit will take place. diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index ea04ca024f0d..cb18c6290ca8 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -73,6 +73,7 @@ struct ipv6_devconf { __u32 enhanced_dad; __u32 addr_gen_mode; __s32 disable_policy; + __s32 ndisc_tclass; struct ctl_table_header *sysctl_header; }; diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h index b22a9c4e1b12..9c0f4a92bcff 100644 --- a/include/uapi/linux/ipv6.h +++ b/include/uapi/linux/ipv6.h @@ -186,6 +186,7 @@ enum { DEVCONF_ADDR_GEN_MODE, DEVCONF_DISABLE_POLICY, DEVCONF_ACCEPT_RA_RT_INFO_MIN_PLEN, + DEVCONF_NDISC_TCLASS, DEVCONF_MAX }; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 6233e06fa35c..a6dffd65eb9d 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -5059,6 +5059,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_ENHANCED_DAD] = cnf->enhanced_dad; array[DEVCONF_ADDR_GEN_MODE] = cnf->addr_gen_mode; array[DEVCONF_DISABLE_POLICY] = cnf->disable_policy; + array[DEVCONF_NDISC_TCLASS] = cnf->ndisc_tclass; } static inline size_t inet6_ifla6_size(void) @@ -5986,6 +5987,7 @@ int addrconf_sysctl_disable_policy(struct ctl_table *ctl, int write, } static int minus_one = -1; +static const int zero = 0; static const int one = 1; static const int two_five_five = 255; @@ -6356,6 +6358,15 @@ static const struct ctl_table addrconf_sysctl[] = { .mode = 0644, .proc_handler = addrconf_sysctl_disable_policy, }, + { + .procname = "ndisc_tclass", + .data = &ipv6_devconf.ndisc_tclass, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = (void *)&zero, + .extra2 = (void *)&two_five_five, + }, { /* sentinel */ } diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index f9c3ffe04382..b3cea200c85e 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -427,12 +427,19 @@ static void ip6_nd_hdr(struct sk_buff *skb, int hop_limit, int len) { struct ipv6hdr *hdr; + struct inet6_dev *idev; + unsigned tclass; + + rcu_read_lock(); + idev = __in6_dev_get(skb->dev); + tclass = idev ? 
idev->cnf.ndisc_tclass : 0; + rcu_read_unlock(); skb_push(skb, sizeof(*hdr)); skb_reset_network_header(skb); hdr = ipv6_hdr(skb); - ip6_flow_hdr(hdr, 0, 0); + ip6_flow_hdr(hdr, tclass, 0); hdr->payload_len = htons(len); hdr->nexthdr = IPPROTO_ICMPV6; -- cgit v1.2.3 From 8d6e79d3ce13e34957de87f7584cbf1bcde74c57 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Wed, 8 Nov 2017 09:59:26 +0100 Subject: tipc: improve link resiliency when rps is activated Currently, the TIPC RPS dissector is based only on the incoming packets' source node address, hence steering all traffic from a node to the same core. We have seen that this makes the links vulnerable to starvation and unnecessary resets when we turn down the link tolerance to very low values. To reduce the risk of this happening, we exempt probe and probe reply packets from the convergence to one core per source node. Instead, we do the opposite: we try to spread those packets across as many cores as possible by randomizing the flow selector key. To make such packets identifiable to the dissector, we add a new 'is_keepalive' bit to word 0 of the LINK_PROTOCOL header. This bit is set both for PROBE and PROBE_REPLY messages, and only for those. It should be noted that these packets are not part of any flow anyway, and only constitute a minuscule fraction of all packets sent across a link. Hence, there is no risk that this will affect overall performance. Acked-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- include/net/flow_dissector.h | 12 ++++----- include/net/tipc.h | 62 ++++++++++++++++++++++++++++++++++++++++++++ net/core/flow_dissector.c | 30 ++++++++++----------- net/tipc/link.c | 26 +++++++++++-------- net/tipc/msg.h | 10 +++++++ 5 files changed, 108 insertions(+), 32 deletions(-) create mode 100644 include/net/tipc.h (limited to 'include') diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index 22aba321282d..9a074776f70b 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -84,11 +84,11 @@ struct flow_dissector_key_ipv6_addrs { }; /** - * struct flow_dissector_key_tipc_addrs: - * @srcnode: source node address + * struct flow_dissector_key_tipc: + * @key: source node address combined with selector */ -struct flow_dissector_key_tipc_addrs { - __be32 srcnode; +struct flow_dissector_key_tipc { + __be32 key; }; /** @@ -100,7 +100,7 @@ struct flow_dissector_key_addrs { union { struct flow_dissector_key_ipv4_addrs v4addrs; struct flow_dissector_key_ipv6_addrs v6addrs; - struct flow_dissector_key_tipc_addrs tipcaddrs; + struct flow_dissector_key_tipc tipckey; }; }; @@ -192,7 +192,7 @@ enum flow_dissector_key_id { FLOW_DISSECTOR_KEY_PORTS, /* struct flow_dissector_key_ports */ FLOW_DISSECTOR_KEY_ICMP, /* struct flow_dissector_key_icmp */ FLOW_DISSECTOR_KEY_ETH_ADDRS, /* struct flow_dissector_key_eth_addrs */ - FLOW_DISSECTOR_KEY_TIPC_ADDRS, /* struct flow_dissector_key_tipc_addrs */ + FLOW_DISSECTOR_KEY_TIPC, /* struct flow_dissector_key_tipc */ FLOW_DISSECTOR_KEY_ARP, /* struct flow_dissector_key_arp */ FLOW_DISSECTOR_KEY_VLAN, /* struct flow_dissector_key_flow_vlan */ FLOW_DISSECTOR_KEY_FLOW_LABEL, /* struct flow_dissector_key_flow_tags */ diff --git a/include/net/tipc.h b/include/net/tipc.h new file mode 100644 index 000000000000..07670ec022a7 --- /dev/null +++ b/include/net/tipc.h @@ -0,0 +1,62 @@ +/* + * include/net/tipc.h: Include file for TIPC message header routines + * + * Copyright (c) 2017 Ericsson AB + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _TIPC_HDR_H +#define _TIPC_HDR_H + +#include + +#define KEEPALIVE_MSG_MASK 0x0e080000 /* LINK_PROTOCOL + MSG_IS_KEEPALIVE */ + +struct tipc_basic_hdr { + __be32 w[4]; +}; + +static inline u32 tipc_hdr_rps_key(struct tipc_basic_hdr *hdr) +{ + u32 w0 = ntohl(hdr->w[0]); + bool keepalive_msg = (w0 & KEEPALIVE_MSG_MASK) == KEEPALIVE_MSG_MASK; + int key; + + /* Return source node identity as key */ + if (likely(!keepalive_msg)) + return hdr->w[3]; + + /* Spread PROBE/PROBE_REPLY messages across the cores */ + get_random_bytes(&key, sizeof(key)); + return key; +} + +#endif diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 1f5caafb4492..15ce30063765 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -772,23 +773,22 @@ proto_again: break; } case htons(ETH_P_TIPC): { - struct { - __be32 pre[3]; - __be32 srcnode; - } *hdr, _hdr; - hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); + struct tipc_basic_hdr *hdr, _hdr; + + hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), + data, hlen, &_hdr); if (!hdr) { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } if (dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_TIPC_ADDRS)) { + FLOW_DISSECTOR_KEY_TIPC)) { key_addrs = skb_flow_dissector_target(flow_dissector, - FLOW_DISSECTOR_KEY_TIPC_ADDRS, + FLOW_DISSECTOR_KEY_TIPC, target_container); - key_addrs->tipcaddrs.srcnode = hdr->srcnode; - key_control->addr_type = FLOW_DISSECTOR_KEY_TIPC_ADDRS; + key_addrs->tipckey.key = tipc_hdr_rps_key(hdr); + key_control->addr_type = FLOW_DISSECTOR_KEY_TIPC; } fdret = FLOW_DISSECT_RET_OUT_GOOD; break; @@ -1024,8 +1024,8 @@ static inline size_t flow_keys_hash_length(const struct flow_keys *flow) case FLOW_DISSECTOR_KEY_IPV6_ADDRS: diff -= 
sizeof(flow->addrs.v6addrs); break; - case FLOW_DISSECTOR_KEY_TIPC_ADDRS: - diff -= sizeof(flow->addrs.tipcaddrs); + case FLOW_DISSECTOR_KEY_TIPC: + diff -= sizeof(flow->addrs.tipckey); break; } return (sizeof(*flow) - diff) / sizeof(u32); @@ -1039,8 +1039,8 @@ __be32 flow_get_u32_src(const struct flow_keys *flow) case FLOW_DISSECTOR_KEY_IPV6_ADDRS: return (__force __be32)ipv6_addr_hash( &flow->addrs.v6addrs.src); - case FLOW_DISSECTOR_KEY_TIPC_ADDRS: - return flow->addrs.tipcaddrs.srcnode; + case FLOW_DISSECTOR_KEY_TIPC: + return flow->addrs.tipckey.key; default: return 0; } @@ -1321,8 +1321,8 @@ static const struct flow_dissector_key flow_keys_dissector_keys[] = { .offset = offsetof(struct flow_keys, addrs.v6addrs), }, { - .key_id = FLOW_DISSECTOR_KEY_TIPC_ADDRS, - .offset = offsetof(struct flow_keys, addrs.tipcaddrs), + .key_id = FLOW_DISSECTOR_KEY_TIPC, + .offset = offsetof(struct flow_keys, addrs.tipckey), }, { .key_id = FLOW_DISSECTOR_KEY_PORTS, diff --git a/net/tipc/link.c b/net/tipc/link.c index 870b9b8f877a..6bce0b1117bd 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -239,7 +239,8 @@ static int link_is_up(struct tipc_link *l) static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, struct sk_buff_head *xmitq); static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, - u16 rcvgap, int tolerance, int priority, + bool probe_reply, u16 rcvgap, + int tolerance, int priority, struct sk_buff_head *xmitq); static void link_print(struct tipc_link *l, const char *str); static int tipc_link_build_nack_msg(struct tipc_link *l, @@ -773,7 +774,7 @@ int tipc_link_timeout(struct tipc_link *l, struct sk_buff_head *xmitq) } if (state || probe || setup) - tipc_link_build_proto_msg(l, mtyp, probe, 0, 0, 0, xmitq); + tipc_link_build_proto_msg(l, mtyp, probe, 0, 0, 0, 0, xmitq); return rc; } @@ -1174,7 +1175,7 @@ int tipc_link_build_state_msg(struct tipc_link *l, struct sk_buff_head *xmitq) /* Unicast ACK */ l->rcv_unacked = 0; l->stats.sent_acks++; - tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, 0, 0, xmitq); + tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, 0, 0, 0, xmitq); return 0; } @@ -1188,7 +1189,7 @@ void tipc_link_build_reset_msg(struct tipc_link *l, struct sk_buff_head *xmitq) if (l->state == LINK_ESTABLISHING) mtyp = ACTIVATE_MSG; - tipc_link_build_proto_msg(l, mtyp, 0, 0, 0, 0, xmitq); + tipc_link_build_proto_msg(l, mtyp, 0, 0, 0, 0, 0, xmitq); /* Inform peer that this endpoint is going down if applicable */ skb = skb_peek_tail(xmitq); @@ -1215,7 +1216,7 @@ static int tipc_link_build_nack_msg(struct tipc_link *l, } if ((skb_queue_len(&l->deferdq) == 1) || !(def_cnt % TIPC_NACK_INTV)) - tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, 0, 0, xmitq); + tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, 0, 0, 0, xmitq); return 0; } @@ -1289,7 +1290,8 @@ drop: } static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, - u16 rcvgap, int tolerance, int priority, + bool probe_reply, u16 rcvgap, + int tolerance, int priority, struct sk_buff_head *xmitq) { struct tipc_link *bcl = l->bc_rcvlink; @@ -1337,6 +1339,7 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, msg_set_seq_gap(hdr, rcvgap); msg_set_bc_gap(hdr, link_bc_rcv_gap(bcl)); msg_set_probe(hdr, probe); + msg_set_is_keepalive(hdr, probe || probe_reply); tipc_mon_prep(l->net, data, &dlen, mstate, l->bearer_id); msg_set_size(hdr, INT_H_SIZE + dlen); skb_trim(skb, INT_H_SIZE + dlen); @@ -1442,6 +1445,7 @@ static int tipc_link_proto_rcv(struct tipc_link 
*l, struct sk_buff *skb, u16 rcv_nxt = l->rcv_nxt; u16 dlen = msg_data_sz(hdr); int mtyp = msg_type(hdr); + bool reply = msg_probe(hdr); void *data; char *if_name; int rc = 0; @@ -1528,9 +1532,9 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, /* Send NACK if peer has sent pkts we haven't received yet */ if (more(peers_snd_nxt, rcv_nxt) && !tipc_link_is_synching(l)) rcvgap = peers_snd_nxt - l->rcv_nxt; - if (rcvgap || (msg_probe(hdr))) - tipc_link_build_proto_msg(l, STATE_MSG, 0, rcvgap, - 0, 0, xmitq); + if (rcvgap || reply) + tipc_link_build_proto_msg(l, STATE_MSG, 0, reply, + rcvgap, 0, 0, xmitq); tipc_link_release_pkts(l, ack); /* If NACK, retransmit will now start at right position */ @@ -2122,14 +2126,14 @@ void tipc_link_set_tolerance(struct tipc_link *l, u32 tol, struct sk_buff_head *xmitq) { l->tolerance = tol; - tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, tol, 0, xmitq); + tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, 0, tol, 0, xmitq); } void tipc_link_set_prio(struct tipc_link *l, u32 prio, struct sk_buff_head *xmitq) { l->priority = prio; - tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, 0, prio, xmitq); + tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, 0, 0, prio, xmitq); } void tipc_link_set_abort_limit(struct tipc_link *l, u32 limit) diff --git a/net/tipc/msg.h b/net/tipc/msg.h index cedf811317fb..bf8f57ccc70c 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -226,6 +226,16 @@ static inline void msg_set_dest_droppable(struct tipc_msg *m, u32 d) msg_set_bits(m, 0, 19, 1, d); } +static inline int msg_is_keepalive(struct tipc_msg *m) +{ + return msg_bits(m, 0, 19, 1); +} + +static inline void msg_set_is_keepalive(struct tipc_msg *m, u32 d) +{ + msg_set_bits(m, 0, 19, 1, d); +} + static inline int msg_src_droppable(struct tipc_msg *m) { return msg_bits(m, 0, 18, 1); -- cgit v1.2.3 From f3edacbd697f94a743fff1a3d26910ab99948ba7 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sat, 11 Nov 2017 18:24:55 +0900 Subject: bpf: Revert bpf_override_return() helper changes. NACK'd by x86 maintainer. Signed-off-by: David S. 
Miller --- arch/Kconfig | 3 --- arch/x86/Kconfig | 1 - arch/x86/include/asm/kprobes.h | 4 ---- arch/x86/include/asm/ptrace.h | 5 ---- arch/x86/kernel/kprobes/ftrace.c | 14 ----------- include/linux/filter.h | 3 +-- include/linux/trace_events.h | 1 - include/uapi/linux/bpf.h | 7 +----- kernel/bpf/core.c | 3 --- kernel/bpf/verifier.c | 2 -- kernel/events/core.c | 7 ------ kernel/trace/Kconfig | 11 --------- kernel/trace/bpf_trace.c | 35 --------------------------- kernel/trace/trace_kprobe.c | 40 ++++++------------------------- kernel/trace/trace_probe.h | 6 ----- samples/bpf/Makefile | 4 ---- samples/bpf/test_override_return.sh | 15 ------------ samples/bpf/tracex7_kern.c | 16 ------------- samples/bpf/tracex7_user.c | 28 ---------------------- tools/include/uapi/linux/bpf.h | 7 +----- tools/testing/selftests/bpf/bpf_helpers.h | 3 +-- 21 files changed, 11 insertions(+), 204 deletions(-) delete mode 100755 samples/bpf/test_override_return.sh delete mode 100644 samples/bpf/tracex7_kern.c delete mode 100644 samples/bpf/tracex7_user.c (limited to 'include') diff --git a/arch/Kconfig b/arch/Kconfig index 6e8520f09bc1..057370a0ac4e 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -196,9 +196,6 @@ config HAVE_OPTPROBES config HAVE_KPROBES_ON_FTRACE bool -config HAVE_KPROBE_OVERRIDE - bool - config HAVE_NMI bool diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 51458c1a0b4a..2fdb23313dd5 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -153,7 +153,6 @@ config X86 select HAVE_KERNEL_XZ select HAVE_KPROBES select HAVE_KPROBES_ON_FTRACE - select HAVE_KPROBE_OVERRIDE select HAVE_KRETPROBES select HAVE_KVM select HAVE_LIVEPATCH if X86_64 diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h index c6c3b1f4306a..6cf65437b5e5 100644 --- a/arch/x86/include/asm/kprobes.h +++ b/arch/x86/include/asm/kprobes.h @@ -67,10 +67,6 @@ extern const int kretprobe_blacklist_size; void arch_remove_kprobe(struct kprobe *p); asmlinkage void kretprobe_trampoline(void); -#ifdef CONFIG_KPROBES_ON_FTRACE -extern void arch_ftrace_kprobe_override_function(struct pt_regs *regs); -#endif - /* Architecture specific copy of original instruction*/ struct arch_specific_insn { /* copy of the original instruction */ diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 2370bb0149cc..c0e3c45cf6ab 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -109,11 +109,6 @@ static inline unsigned long regs_return_value(struct pt_regs *regs) return regs->ax; } -static inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc) -{ - regs->ax = rc; -} - /* * user_mode(regs) determines whether a register set came from user * mode. 
On x86_32, this is true if V8086 mode was enabled OR if the diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c index 3c455bf490cb..041f7b6dfa0f 100644 --- a/arch/x86/kernel/kprobes/ftrace.c +++ b/arch/x86/kernel/kprobes/ftrace.c @@ -97,17 +97,3 @@ int arch_prepare_kprobe_ftrace(struct kprobe *p) p->ainsn.boostable = false; return 0; } - -asmlinkage void override_func(void); -asm( - ".type override_func, @function\n" - "override_func:\n" - " ret\n" - ".size override_func, .-override_func\n" -); - -void arch_ftrace_kprobe_override_function(struct pt_regs *regs) -{ - regs->ip = (unsigned long)&override_func; -} -NOKPROBE_SYMBOL(arch_ftrace_kprobe_override_function); diff --git a/include/linux/filter.h b/include/linux/filter.h index eaec066f99e8..0cd02ff4ae30 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -459,8 +459,7 @@ struct bpf_prog { locked:1, /* Program image locked? */ gpl_compatible:1, /* Is filter GPL compatible? */ cb_access:1, /* Is control block accessed? */ - dst_needed:1, /* Do we need dst entry? */ - kprobe_override:1; /* Do we override a kprobe? */ + dst_needed:1; /* Do we need dst entry? */ kmemcheck_bitfield_end(meta); enum bpf_prog_type type; /* Type of BPF program */ u32 len; /* Number of filter blocks */ diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 17e5e820a84c..84014ecfa67f 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -523,7 +523,6 @@ do { \ struct perf_event; DECLARE_PER_CPU(struct pt_regs, perf_trace_regs); -DECLARE_PER_CPU(int, bpf_kprobe_override); extern int perf_trace_init(struct perf_event *event); extern void perf_trace_destroy(struct perf_event *event); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index adb66f78b674..e880ae6434ee 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -677,10 +677,6 @@ union bpf_attr { * @buf: buf to fill * @buf_size: size of the buf * Return : 0 on success or negative error code - * - * int bpf_override_return(pt_regs, rc) - * @pt_regs: pointer to struct pt_regs - * @rc: the return value to set */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -740,8 +736,7 @@ union bpf_attr { FN(xdp_adjust_meta), \ FN(perf_event_read_value), \ FN(perf_prog_read_value), \ - FN(getsockopt), \ - FN(override_return), + FN(getsockopt), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 271daad31f37..8a6c37762330 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1326,9 +1326,6 @@ EVAL4(PROG_NAME_LIST, 416, 448, 480, 512) bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp) { - if (fp->kprobe_override) - return false; - if (!array->owner_prog_type) { /* There's no owner yet where we could check for * compatibility. 
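The core.c hunk above restores bpf_prog_array_compatible() to its pre-override form, where the only remaining check is the array ownership rule. A minimal stand-alone sketch of that rule, with hypothetical toy_* types standing in for the real struct bpf_array and struct bpf_prog (the first program attached to a prog array fixes the array's owner type and JIT state; every later program must match):

/* Simplified sketch, not kernel code: toy_* types are hypothetical. */
struct toy_array {
	int owner_prog_type;	/* 0 means "no owner yet" */
	int owner_jited;
};

struct toy_prog {
	int type;
	int jited;
};

static int toy_prog_array_compatible(struct toy_array *array, const struct toy_prog *fp)
{
	if (!array->owner_prog_type) {
		/* There's no owner yet: adopt this program's type and JIT state. */
		array->owner_prog_type = fp->type;
		array->owner_jited = fp->jited;
		return 1;
	}
	/* Later programs must match the owner exactly. */
	return array->owner_prog_type == fp->type &&
	       array->owner_jited == fp->jited;
}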
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index bc464b8ec91e..4a942e2e753d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4357,8 +4357,6 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) prog->dst_needed = 1; if (insn->imm == BPF_FUNC_get_prandom_u32) bpf_user_rnd_init_once(); - if (insn->imm == BPF_FUNC_override_return) - prog->kprobe_override = 1; if (insn->imm == BPF_FUNC_tail_call) { /* If we tail call into other programs, we * cannot make any assumptions since they can diff --git a/kernel/events/core.c b/kernel/events/core.c index ac240d31b5bf..42d24bd64ea4 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8171,13 +8171,6 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) return -EINVAL; } - /* Kprobe override only works for kprobes, not uprobes. */ - if (prog->kprobe_override && - !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) { - bpf_prog_put(prog); - return -EINVAL; - } - if (is_tracepoint || is_syscall_tp) { int off = trace_event_get_offsets(event->tp_event); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 9dc0deeaad2b..434c840e2d82 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -518,17 +518,6 @@ config FUNCTION_PROFILER If in doubt, say N. -config BPF_KPROBE_OVERRIDE - bool "Enable BPF programs to override a kprobed function" - depends on BPF_EVENTS - depends on KPROBES_ON_FTRACE - depends on HAVE_KPROBE_OVERRIDE - depends on DYNAMIC_FTRACE_WITH_REGS - default n - help - Allows BPF to override the execution of a probed function and - set a different return value. This is used for error injection. - config FTRACE_MCOUNT_RECORD def_bool y depends on DYNAMIC_FTRACE diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 1865b0d4cdeb..506efe6e8ed9 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -13,10 +13,6 @@ #include #include #include -#include -#include - -#include "trace_probe.h" #include "trace.h" u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); @@ -80,29 +76,6 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) } EXPORT_SYMBOL_GPL(trace_call_bpf); -#ifdef CONFIG_BPF_KPROBE_OVERRIDE -BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc) -{ - __this_cpu_write(bpf_kprobe_override, 1); - regs_set_return_value(regs, rc); - arch_ftrace_kprobe_override_function(regs); - return 0; -} -#else -BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc) -{ - return -EINVAL; -} -#endif - -static const struct bpf_func_proto bpf_override_return_proto = { - .func = bpf_override_return, - .gpl_only = true, - .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_ANYTHING, -}; - BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr) { int ret; @@ -578,10 +551,6 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func return &bpf_get_stackid_proto; case BPF_FUNC_perf_event_read_value: return &bpf_perf_event_read_value_proto; - case BPF_FUNC_override_return: - pr_warn_ratelimited("%s[%d] is installing a program with bpf_override_return helper that may cause unexpected behavior!", - current->comm, task_pid_nr(current)); - return &bpf_override_return_proto; default: return tracing_func_proto(func_id); } @@ -797,10 +766,6 @@ int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog_array *new_array; int ret = -EEXIST; - /* Kprobe override only works for ftrace based kprobes. 
*/ - if (prog->kprobe_override && !trace_kprobe_ftrace(event->tp_event)) - return -EINVAL; - mutex_lock(&bpf_event_mutex); if (event->prog) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 8e3c9ec1faf7..abf92e478cfb 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -42,7 +42,6 @@ struct trace_kprobe { (offsetof(struct trace_kprobe, tp.args) + \ (sizeof(struct probe_arg) * (n))) -DEFINE_PER_CPU(int, bpf_kprobe_override); static nokprobe_inline bool trace_kprobe_is_return(struct trace_kprobe *tk) { @@ -88,12 +87,6 @@ static nokprobe_inline unsigned long trace_kprobe_nhit(struct trace_kprobe *tk) return nhit; } -int trace_kprobe_ftrace(struct trace_event_call *call) -{ - struct trace_kprobe *tk = (struct trace_kprobe *)call->data; - return kprobe_ftrace(&tk->rp.kp); -} - static int register_kprobe_event(struct trace_kprobe *tk); static int unregister_kprobe_event(struct trace_kprobe *tk); @@ -1177,7 +1170,7 @@ static int kretprobe_event_define_fields(struct trace_event_call *event_call) #ifdef CONFIG_PERF_EVENTS /* Kprobe profile handler */ -static int +static void kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) { struct trace_event_call *call = &tk->tp.call; @@ -1186,29 +1179,12 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) int size, __size, dsize; int rctx; - if (bpf_prog_array_valid(call)) { - int ret; - - ret = trace_call_bpf(call, regs); - - /* - * We need to check and see if we modified the pc of the - * pt_regs, and if so clear the kprobe and return 1 so that we - * don't do the instruction skipping. Also reset our state so - * we are clean the next pass through. - */ - if (__this_cpu_read(bpf_kprobe_override)) { - __this_cpu_write(bpf_kprobe_override, 0); - reset_current_kprobe(); - return 1; - } - if (!ret) - return 0; - } + if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs)) + return; head = this_cpu_ptr(call->perf_events); if (hlist_empty(head)) - return 0; + return; dsize = __get_data_size(&tk->tp, regs); __size = sizeof(*entry) + tk->tp.size + dsize; @@ -1217,14 +1193,13 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) entry = perf_trace_buf_alloc(size, NULL, &rctx); if (!entry) - return 0; + return; entry->ip = (unsigned long)tk->rp.kp.addr; memset(&entry[1], 0, dsize); store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, head, NULL, NULL); - return 0; } NOKPROBE_SYMBOL(kprobe_perf_func); @@ -1300,7 +1275,6 @@ static int kprobe_register(struct trace_event_call *event, static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) { struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp); - int ret = 0; raw_cpu_inc(*tk->nhit); @@ -1308,9 +1282,9 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) kprobe_trace_func(tk, regs); #ifdef CONFIG_PERF_EVENTS if (tk->tp.flags & TP_FLAG_PROFILE) - ret = kprobe_perf_func(tk, regs); + kprobe_perf_func(tk, regs); #endif - return ret; + return 0; /* We don't tweek kernel, so just return 0 */ } NOKPROBE_SYMBOL(kprobe_dispatcher); diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index adbb3f7d1fb5..903273c93e61 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -253,7 +253,6 @@ struct symbol_cache; unsigned long update_symbol_cache(struct symbol_cache *sc); void free_symbol_cache(struct symbol_cache *sc); struct symbol_cache *alloc_symbol_cache(const 
char *sym, long offset); -int trace_kprobe_ftrace(struct trace_event_call *call); #else /* uprobes do not support symbol fetch methods */ #define fetch_symbol_u8 NULL @@ -279,11 +278,6 @@ alloc_symbol_cache(const char *sym, long offset) { return NULL; } - -static inline int trace_kprobe_ftrace(struct trace_event_call *call) -{ - return 0; -} #endif /* CONFIG_KPROBE_EVENTS */ struct probe_arg { diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 87db0f9a4c15..3b4945c1eab0 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -15,7 +15,6 @@ hostprogs-y += tracex3 hostprogs-y += tracex4 hostprogs-y += tracex5 hostprogs-y += tracex6 -hostprogs-y += tracex7 hostprogs-y += test_probe_write_user hostprogs-y += trace_output hostprogs-y += lathist @@ -62,7 +61,6 @@ tracex3-objs := bpf_load.o $(LIBBPF) tracex3_user.o tracex4-objs := bpf_load.o $(LIBBPF) tracex4_user.o tracex5-objs := bpf_load.o $(LIBBPF) tracex5_user.o tracex6-objs := bpf_load.o $(LIBBPF) tracex6_user.o -tracex7-objs := bpf_load.o $(LIBBPF) tracex7_user.o load_sock_ops-objs := bpf_load.o $(LIBBPF) load_sock_ops.o test_probe_write_user-objs := bpf_load.o $(LIBBPF) test_probe_write_user_user.o trace_output-objs := bpf_load.o $(LIBBPF) trace_output_user.o @@ -106,7 +104,6 @@ always += tracex3_kern.o always += tracex4_kern.o always += tracex5_kern.o always += tracex6_kern.o -always += tracex7_kern.o always += sock_flags_kern.o always += test_probe_write_user_kern.o always += trace_output_kern.o @@ -161,7 +158,6 @@ HOSTLOADLIBES_tracex3 += -lelf HOSTLOADLIBES_tracex4 += -lelf -lrt HOSTLOADLIBES_tracex5 += -lelf HOSTLOADLIBES_tracex6 += -lelf -HOSTLOADLIBES_tracex7 += -lelf HOSTLOADLIBES_test_cgrp2_sock2 += -lelf HOSTLOADLIBES_load_sock_ops += -lelf HOSTLOADLIBES_test_probe_write_user += -lelf diff --git a/samples/bpf/test_override_return.sh b/samples/bpf/test_override_return.sh deleted file mode 100755 index e68b9ee6814b..000000000000 --- a/samples/bpf/test_override_return.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash - -rm -f testfile.img -dd if=/dev/zero of=testfile.img bs=1M seek=1000 count=1 -DEVICE=$(losetup --show -f testfile.img) -mkfs.btrfs -f $DEVICE -mkdir tmpmnt -./tracex7 $DEVICE -if [ $? -eq 0 ] -then - echo "SUCCESS!" -else - echo "FAILED!" -fi -losetup -d $DEVICE diff --git a/samples/bpf/tracex7_kern.c b/samples/bpf/tracex7_kern.c deleted file mode 100644 index 1ab308a43e0f..000000000000 --- a/samples/bpf/tracex7_kern.c +++ /dev/null @@ -1,16 +0,0 @@ -#include -#include -#include -#include "bpf_helpers.h" - -SEC("kprobe/open_ctree") -int bpf_prog1(struct pt_regs *ctx) -{ - unsigned long rc = -12; - - bpf_override_return(ctx, rc); - return 0; -} - -char _license[] SEC("license") = "GPL"; -u32 _version SEC("version") = LINUX_VERSION_CODE; diff --git a/samples/bpf/tracex7_user.c b/samples/bpf/tracex7_user.c deleted file mode 100644 index 8a52ac492e8b..000000000000 --- a/samples/bpf/tracex7_user.c +++ /dev/null @@ -1,28 +0,0 @@ -#define _GNU_SOURCE - -#include -#include -#include -#include "libbpf.h" -#include "bpf_load.h" - -int main(int argc, char **argv) -{ - FILE *f; - char filename[256]; - char command[256]; - int ret; - - snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); - - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); - return 1; - } - - snprintf(command, 256, "mount %s tmpmnt/", argv[1]); - f = popen(command, "r"); - ret = pclose(f); - - return ret ? 
0 : 1; -} diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index adb66f78b674..e880ae6434ee 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -677,10 +677,6 @@ union bpf_attr { * @buf: buf to fill * @buf_size: size of the buf * Return : 0 on success or negative error code - * - * int bpf_override_return(pt_regs, rc) - * @pt_regs: pointer to struct pt_regs - * @rc: the return value to set */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -740,8 +736,7 @@ union bpf_attr { FN(xdp_adjust_meta), \ FN(perf_event_read_value), \ FN(perf_prog_read_value), \ - FN(getsockopt), \ - FN(override_return), + FN(getsockopt), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h index 33cb00e46c49..fd9a17fa8a8b 100644 --- a/tools/testing/selftests/bpf/bpf_helpers.h +++ b/tools/testing/selftests/bpf/bpf_helpers.h @@ -82,8 +82,7 @@ static int (*bpf_perf_event_read_value)(void *map, unsigned long long flags, static int (*bpf_perf_prog_read_value)(void *ctx, void *buf, unsigned int buf_size) = (void *) BPF_FUNC_perf_prog_read_value; -static int (*bpf_override_return)(void *ctx, unsigned long rc) = - (void *) BPF_FUNC_override_return; + /* llvm builtin functions that eBPF C program may use to * emit BPF_LD_ABS and BPF_LD_IND instructions -- cgit v1.2.3 From 713bafea92920103cd3d361657406cf04d0e22dd Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 8 Nov 2017 13:01:26 -0800 Subject: tcp: retire FACK loss detection FACK loss detection has been disabled by default, and its successor RACK subsumes FACK and handles reordering better. This patch removes FACK to simplify TCP loss recovery. Signed-off-by: Yuchung Cheng Reviewed-by: Eric Dumazet Reviewed-by: Neal Cardwell Reviewed-by: Soheil Hassas Yeganeh Reviewed-by: Priyaranjan Jha Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 3 +- include/linux/tcp.h | 1 - include/net/tcp.h | 14 +-------- include/uapi/linux/snmp.h | 1 - net/ipv4/proc.c | 1 - net/ipv4/tcp.c | 2 -- net/ipv4/tcp_input.c | 53 +++++----------------------------- net/ipv4/tcp_metrics.c | 4 +-- net/ipv4/tcp_minisocks.c | 5 +--- net/ipv4/tcp_output.c | 5 +--- 10 files changed, 12 insertions(+), 77 deletions(-) (limited to 'include') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index d8676dda7fa6..46c7e1085efc 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -289,8 +289,7 @@ tcp_ecn_fallback - BOOLEAN Default: 1 (fallback enabled) tcp_fack - BOOLEAN - Enable FACK congestion avoidance and fast retransmission. - The value is not used, if tcp_sack is not enabled. + This is a legacy option, it has no effect anymore. 
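With tcp_fack reduced to a legacy knob, the loss-detection heuristics in the tcp_input.c hunks below collapse to the SACK and Reno cases only. A minimal stand-alone sketch of the resulting dup-ACK accounting, using a hypothetical trimmed-down mini_tp in place of the real tcp_sock:

/* Hypothetical stand-alone sketch: after FACK removal the dup-ACK
 * estimate is simply sacked_out + 1, whether sacked_out counts real
 * SACKed segments or Reno-emulated duplicate ACKs.
 */
struct mini_tp {
	unsigned int sacked_out;	/* SACKed (or emulated dupack) segments */
	unsigned int reordering;	/* reordering estimate, in packets */
};

static unsigned int dupack_heuristics(const struct mini_tp *tp)
{
	return tp->sacked_out + 1;
}

static int time_to_recover(const struct mini_tp *tp)
{
	/* Classic dupthresh trigger; the other criteria that the real
	 * tcp_time_to_recover() consults are omitted here.
	 */
	return dupack_heuristics(tp) > tp->reordering;
}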
tcp_fin_timeout - INTEGER The length of time an orphaned (no longer referenced by any diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 22f40c96a15b..9574936fe041 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -85,7 +85,6 @@ struct tcp_sack_block { /*These are used to set the sack_ok field in struct tcp_options_received */ #define TCP_SACK_SEEN (1 << 0) /*1 = peer is SACK capable, */ -#define TCP_FACK_ENABLED (1 << 1) /*1 = FACK is enabled locally*/ #define TCP_DSACK_SEEN (1 << 2) /*1 = DSACK was received from peer*/ struct tcp_options_received { diff --git a/include/net/tcp.h b/include/net/tcp.h index 2f2c69ad31b2..ed71511e67a6 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -384,7 +384,6 @@ void tcp_update_metrics(struct sock *sk); void tcp_init_metrics(struct sock *sk); void tcp_metrics_init(void); bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst); -void tcp_disable_fack(struct tcp_sock *tp); void tcp_close(struct sock *sk, long timeout); void tcp_init_sock(struct sock *sk); void tcp_init_transfer(struct sock *sk, int bpf_op); @@ -776,7 +775,7 @@ struct tcp_skb_cb { }; __u8 tcp_flags; /* TCP header flags. (tcp[13]) */ - __u8 sacked; /* State flags for SACK/FACK. */ + __u8 sacked; /* State flags for SACK. */ #define TCPCB_SACKED_ACKED 0x01 /* SKB ACK'd by a SACK block */ #define TCPCB_SACKED_RETRANS 0x02 /* SKB retransmitted */ #define TCPCB_LOST 0x04 /* SKB is lost */ @@ -1066,7 +1065,6 @@ void tcp_rate_check_app_limited(struct sock *sk); * * tcp_is_sack - SACK enabled * tcp_is_reno - No SACK - * tcp_is_fack - FACK enabled, implies SACK enabled */ static inline int tcp_is_sack(const struct tcp_sock *tp) { @@ -1078,16 +1076,6 @@ static inline bool tcp_is_reno(const struct tcp_sock *tp) return !tcp_is_sack(tp); } -static inline bool tcp_is_fack(const struct tcp_sock *tp) -{ - return tp->rx_opt.sack_ok & TCP_FACK_ENABLED; -} - -static inline void tcp_enable_fack(struct tcp_sock *tp) -{ - tp->rx_opt.sack_ok |= TCP_FACK_ENABLED; -} - static inline unsigned int tcp_left_out(const struct tcp_sock *tp) { return tp->sacked_out + tp->lost_out; diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index 0d941cdd8e8c..33a70ece462f 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -191,7 +191,6 @@ enum LINUX_MIB_TCPRENORECOVERY, /* TCPRenoRecovery */ LINUX_MIB_TCPSACKRECOVERY, /* TCPSackRecovery */ LINUX_MIB_TCPSACKRENEGING, /* TCPSACKReneging */ - LINUX_MIB_TCPFACKREORDER, /* TCPFACKReorder */ LINUX_MIB_TCPSACKREORDER, /* TCPSACKReorder */ LINUX_MIB_TCPRENOREORDER, /* TCPRenoReorder */ LINUX_MIB_TCPTSREORDER, /* TCPTSReorder */ diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 127153f1ed8a..9f37c4727861 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -212,7 +212,6 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPRenoRecovery", LINUX_MIB_TCPRENORECOVERY), SNMP_MIB_ITEM("TCPSackRecovery", LINUX_MIB_TCPSACKRECOVERY), SNMP_MIB_ITEM("TCPSACKReneging", LINUX_MIB_TCPSACKRENEGING), - SNMP_MIB_ITEM("TCPFACKReorder", LINUX_MIB_TCPFACKREORDER), SNMP_MIB_ITEM("TCPSACKReorder", LINUX_MIB_TCPSACKREORDER), SNMP_MIB_ITEM("TCPRenoReorder", LINUX_MIB_TCPRENOREORDER), SNMP_MIB_ITEM("TCPTSReorder", LINUX_MIB_TCPTSREORDER), diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index bc71a27d5ad9..337555076043 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2509,8 +2509,6 @@ static int tcp_repair_options_est(struct sock *sk, return -EINVAL; tp->rx_opt.sack_ok |= TCP_SACK_SEEN; - if 
(sock_net(sk)->ipv4.sysctl_tcp_fack) - tcp_enable_fack(tp); break; case TCPOPT_TIMESTAMP: if (opt.opt_val != 0) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 9ceaa1fdc3ab..487e181cff86 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -842,18 +842,6 @@ __u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst) return min_t(__u32, cwnd, tp->snd_cwnd_clamp); } -/* - * Packet counting of FACK is based on in-order assumptions, therefore TCP - * disables it when reordering is detected - */ -void tcp_disable_fack(struct tcp_sock *tp) -{ - /* RFC3517 uses different metric in lost marker => reset on change */ - if (tcp_is_fack(tp)) - tp->lost_skb_hint = NULL; - tp->rx_opt.sack_ok &= ~TCP_FACK_ENABLED; -} - /* Take a notice that peer is sending D-SACKs */ static void tcp_dsack_seen(struct tcp_sock *tp) { @@ -881,7 +869,6 @@ static void tcp_update_reordering(struct sock *sk, const int metric, tp->sacked_out, tp->undo_marker ? tp->undo_retrans : 0); #endif - tcp_disable_fack(tp); } tp->rack.reord = 1; @@ -891,8 +878,6 @@ static void tcp_update_reordering(struct sock *sk, const int metric, mib_idx = LINUX_MIB_TCPTSREORDER; else if (tcp_is_reno(tp)) mib_idx = LINUX_MIB_TCPRENOREORDER; - else if (tcp_is_fack(tp)) - mib_idx = LINUX_MIB_TCPFACKREORDER; else mib_idx = LINUX_MIB_TCPSACKREORDER; @@ -970,7 +955,6 @@ void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb) * 3. Loss detection event of two flavors: * A. Scoreboard estimator decided the packet is lost. * A'. Reno "three dupacks" marks head of queue lost. - * A''. Its FACK modification, head until snd.fack is lost. * B. SACK arrives sacking SND.NXT at the moment, when the * segment was retransmitted. * 4. D-SACK added new rule: D-SACK changes any tag to S. @@ -1248,7 +1232,7 @@ static u8 tcp_sacktag_one(struct sock *sk, fack_count += pcount; /* Lost marker hint past SACKed? Tweak RFC3517 cnt */ - if (!tcp_is_fack(tp) && tp->lost_skb_hint && + if (tp->lost_skb_hint && before(start_seq, TCP_SKB_CB(tp->lost_skb_hint)->seq)) tp->lost_cnt_hint += pcount; @@ -2051,10 +2035,6 @@ static inline int tcp_fackets_out(const struct tcp_sock *tp) * counter when SACK is enabled (without SACK, sacked_out is used for * that purpose). * - * Instead, with FACK TCP uses fackets_out that includes both SACKed - * segments up to the highest received SACK block so far and holes in - * between them. - * * With reordering, holes may still be in flight, so RFC3517 recovery * uses pure sacked_out (total number of SACKed segments) even though * it violates the RFC that uses duplicate ACKs, often these are equal @@ -2064,10 +2044,10 @@ static inline int tcp_fackets_out(const struct tcp_sock *tp) */ static inline int tcp_dupack_heuristics(const struct tcp_sock *tp) { - return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1; + return tp->sacked_out + 1; } -/* Linux NewReno/SACK/FACK/ECN state machine. +/* Linux NewReno/SACK/ECN state machine. * -------------------------------------- * * "Open" Normal state, no dubious events, fast path. @@ -2132,16 +2112,6 @@ static inline int tcp_dupack_heuristics(const struct tcp_sock *tp) * dynamically measured and adjusted. This is implemented in * tcp_rack_mark_lost. * - * FACK (Disabled by default. Subsumbed by RACK): - * It is the simplest heuristics. As soon as we decided - * that something is lost, we decide that _all_ not SACKed - * packets until the most forward SACK are lost. I.e. - * lost_out = fackets_out - sacked_out and left_out = fackets_out. 
- * It is absolutely correct estimate, if network does not reorder - * packets. And it loses any connection to reality when reordering - * takes place. We use FACK by default until reordering - * is suspected on the path to this destination. - * * If the receiver does not support SACK: * * NewReno (RFC6582): in Recovery we assume that one segment @@ -2190,7 +2160,7 @@ static bool tcp_time_to_recover(struct sock *sk, int flag) } /* Detect loss in event "A" above by marking head of queue up as lost. - * For FACK or non-SACK(Reno) senders, the first "packets" number of segments + * For non-SACK(Reno) senders, the first "packets" number of segments * are considered lost. For RFC3517 SACK, a segment is considered lost if it * has at least tp->reordering SACKed seqments above it; "packets" refers to * the maximum SACKed segments to pass before reaching this limit. @@ -2226,12 +2196,12 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) break; oldcnt = cnt; - if (tcp_is_fack(tp) || tcp_is_reno(tp) || + if (tcp_is_reno(tp) || (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) cnt += tcp_skb_pcount(skb); if (cnt > packets) { - if ((tcp_is_sack(tp) && !tcp_is_fack(tp)) || + if (tcp_is_sack(tp) || (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) || (oldcnt >= packets)) break; @@ -2262,11 +2232,6 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit) if (tcp_is_reno(tp)) { tcp_mark_head_lost(sk, 1, 1); - } else if (tcp_is_fack(tp)) { - int lost = tp->fackets_out - tp->reordering; - if (lost <= 0) - lost = 1; - tcp_mark_head_lost(sk, lost, 0); } else { int sacked_upto = tp->sacked_out - tp->reordering; if (sacked_upto >= 0) @@ -3199,8 +3164,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, if (reord < prior_fackets && reord <= tp->fackets_out) tcp_update_reordering(sk, tp->fackets_out - reord, 0); - delta = tcp_is_fack(tp) ? 
pkts_acked : - prior_sacked - tp->sacked_out; + delta = prior_sacked - tp->sacked_out; tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta); } @@ -5708,9 +5672,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tp->tcp_header_len = sizeof(struct tcphdr); } - if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_fack) - tcp_enable_fack(tp); - tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); tcp_initialize_rcv_mss(sk); diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 9d5ddebfd831..7097f92d16e5 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -470,10 +470,8 @@ void tcp_init_metrics(struct sock *sk) tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; } val = tcp_metric_get(tm, TCP_METRIC_REORDERING); - if (val && tp->reordering != val) { - tcp_disable_fack(tp); + if (val && tp->reordering != val) tp->reordering = val; - } crtt = tcp_metric_get(tm, TCP_METRIC_RTT); rcu_read_unlock(); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 4bb86580decd..326c9282bf94 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -509,10 +509,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, keepalive_time_when(newtp)); newtp->rx_opt.tstamp_ok = ireq->tstamp_ok; - if ((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) { - if (sock_net(sk)->ipv4.sysctl_tcp_fack) - tcp_enable_fack(newtp); - } + newtp->rx_opt.sack_ok = ireq->sack_ok; newtp->window_clamp = req->rsk_window_clamp; newtp->rcv_ssthresh = req->rsk_rcv_wnd; newtp->rcv_wnd = req->rsk_rcv_wnd; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 9b98d35aa0d8..094c429b4401 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1257,7 +1257,7 @@ static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int de if (tp->lost_skb_hint && before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->lost_skb_hint)->seq) && - (tcp_is_fack(tp) || (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))) + (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) tp->lost_cnt_hint -= decr; tcp_verify_left_out(tp); @@ -2961,9 +2961,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) * retransmitted data is acknowledged. It tries to continue * resending the rest of the retransmit queue, until either * we've sent it all or the congestion window limit is reached. - If doing SACK, the first ACK which comes back for a timeout - based retransmit packet might feed us FACK information again. - If so, we use it to avoid unnecessarily retransmissions. */ void tcp_xmit_retransmit_queue(struct sock *sk) { -- cgit v1.2.3 From 737ff314563ca27f044f9a3a041e9d42491ef7ce Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 8 Nov 2017 13:01:27 -0800 Subject: tcp: use sequence distance to detect reordering Replace the reordering distance measurement in packet units with a sequence-based approach. Previously it tracked the number of "packets" toward the forward ACK (i.e. the highest sacked sequence) in a state variable "fackets_out". Precisely measuring the reordering degree in packet distance has little benefit, as the degree constantly changes by factors like path, load, and congestion window. It is also complicated and prone to arcane bugs. This patch replaces it with a sequence-based approach that is much simpler. Signed-off-by: Yuchung Cheng Reviewed-by: Eric Dumazet Reviewed-by: Neal Cardwell Reviewed-by: Soheil Hassas Yeganeh Reviewed-by: Priyaranjan Jha Signed-off-by: David S. 
Miller --- include/linux/tcp.h | 1 - net/ipv4/tcp.c | 1 - net/ipv4/tcp_input.c | 155 +++++++++++++++++++++-------------------------- net/ipv4/tcp_minisocks.c | 1 - net/ipv4/tcp_output.c | 17 ------ 5 files changed, 68 insertions(+), 107 deletions(-) (limited to 'include') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 9574936fe041..df5d97a85e1a 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -293,7 +293,6 @@ struct tcp_sock { u32 pushed_seq; /* Last pushed seq, required to talk to windows */ u32 lost_out; /* Lost packets */ u32 sacked_out; /* SACK'd packets */ - u32 fackets_out; /* FACK'd packets */ struct hrtimer pacing_timer; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 337555076043..bf97317e6c97 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2977,7 +2977,6 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_lost = tp->lost_out; info->tcpi_retrans = tp->retrans_out; - info->tcpi_fackets = tp->fackets_out; now = tcp_jiffies32; info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 487e181cff86..94d729be42a9 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -849,39 +849,39 @@ static void tcp_dsack_seen(struct tcp_sock *tp) tp->rack.dsack_seen = 1; } -static void tcp_update_reordering(struct sock *sk, const int metric, - const int ts) +/* It's reordering when higher sequence was delivered (i.e. sacked) before + * some lower never-retransmitted sequence ("low_seq"). The maximum reordering + * distance is approximated in full-mss packet distance ("reordering"). + */ +static void tcp_check_sack_reordering(struct sock *sk, const u32 low_seq, + const int ts) { struct tcp_sock *tp = tcp_sk(sk); - int mib_idx; + const u32 mss = tp->mss_cache; + u32 fack, metric; - if (WARN_ON_ONCE(metric < 0)) + fack = tcp_highest_sack_seq(tp); + if (!before(low_seq, fack)) return; - if (metric > tp->reordering) { - tp->reordering = min(sock_net(sk)->ipv4.sysctl_tcp_max_reordering, metric); - + metric = fack - low_seq; + if ((metric > tp->reordering * mss) && mss) { #if FASTRETRANS_DEBUG > 1 pr_debug("Disorder%d %d %u f%u s%u rr%d\n", tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state, tp->reordering, - tp->fackets_out, + 0, tp->sacked_out, tp->undo_marker ? tp->undo_retrans : 0); #endif + tp->reordering = min_t(u32, (metric + mss - 1) / mss, + sock_net(sk)->ipv4.sysctl_tcp_max_reordering); } tp->rack.reord = 1; - /* This exciting event is worth to be remembered. 8) */ - if (ts) - mib_idx = LINUX_MIB_TCPTSREORDER; - else if (tcp_is_reno(tp)) - mib_idx = LINUX_MIB_TCPRENOREORDER; - else - mib_idx = LINUX_MIB_TCPSACKREORDER; - - NET_INC_STATS(sock_net(sk), mib_idx); + NET_INC_STATS(sock_net(sk), + ts ? LINUX_MIB_TCPTSREORDER : LINUX_MIB_TCPSACKREORDER); } /* This must be called before lost_out is incremented */ @@ -1097,8 +1097,7 @@ static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb, } struct tcp_sacktag_state { - int reord; - int fack_count; + u32 reord; /* Timestamps for earliest and latest never-retransmitted segment * that was SACKed. RTO needs the earliest RTT to stay conservative, * but congestion control should still get an accurate delay signal. @@ -1174,15 +1173,15 @@ static u8 tcp_sacktag_one(struct sock *sk, u64 xmit_time) { struct tcp_sock *tp = tcp_sk(sk); - int fack_count = state->fack_count; /* Account D-SACK for retransmitted packet. 
*/ if (dup_sack && (sacked & TCPCB_RETRANS)) { if (tp->undo_marker && tp->undo_retrans > 0 && after(end_seq, tp->undo_marker)) tp->undo_retrans--; - if (sacked & TCPCB_SACKED_ACKED) - state->reord = min(fack_count, state->reord); + if ((sacked & TCPCB_SACKED_ACKED) && + before(start_seq, state->reord)) + state->reord = start_seq; } /* Nothing to do; acked frame is about to be dropped (was ACKed). */ @@ -1208,9 +1207,10 @@ static u8 tcp_sacktag_one(struct sock *sk, * which was in hole. It is reordering. */ if (before(start_seq, - tcp_highest_sack_seq(tp))) - state->reord = min(fack_count, - state->reord); + tcp_highest_sack_seq(tp)) && + before(start_seq, state->reord)) + state->reord = start_seq; + if (!after(end_seq, tp->high_seq)) state->flag |= FLAG_ORIG_SACK_ACKED; if (state->first_sackt == 0) @@ -1229,15 +1229,10 @@ static u8 tcp_sacktag_one(struct sock *sk, tp->sacked_out += pcount; tp->delivered += pcount; /* Out-of-order packets delivered */ - fack_count += pcount; - /* Lost marker hint past SACKed? Tweak RFC3517 cnt */ if (tp->lost_skb_hint && before(start_seq, TCP_SKB_CB(tp->lost_skb_hint)->seq)) tp->lost_cnt_hint += pcount; - - if (fack_count > tp->fackets_out) - tp->fackets_out = fack_count; } /* D-SACK. We can detect redundant retransmission in S|R and plain R @@ -1484,7 +1479,6 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, } out: - state->fack_count += pcount; return prev; noop: @@ -1563,8 +1557,6 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, tcp_highest_sack_seq(tp))) tcp_advance_highest_sack(sk, skb); } - - state->fack_count += tcp_skb_pcount(skb); } return skb; } @@ -1575,7 +1567,6 @@ static struct sk_buff *tcp_sacktag_bsearch(struct sock *sk, { struct rb_node *parent, **p = &sk->tcp_rtx_queue.rb_node; struct sk_buff *skb; - int unack_bytes; while (*p) { parent = *p; @@ -1588,12 +1579,6 @@ static struct sk_buff *tcp_sacktag_bsearch(struct sock *sk, p = &parent->rb_right; continue; } - - state->fack_count = 0; - unack_bytes = TCP_SKB_CB(skb)->seq - tcp_sk(sk)->snd_una; - if (state->mss_now && unack_bytes > 0) - state->fack_count = unack_bytes / state->mss_now; - return skb; } return NULL; @@ -1651,13 +1636,10 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, int first_sack_index; state->flag = 0; - state->reord = tp->packets_out; + state->reord = tp->snd_nxt; - if (!tp->sacked_out) { - if (WARN_ON(tp->fackets_out)) - tp->fackets_out = 0; + if (!tp->sacked_out) tcp_highest_sack_reset(sk); - } found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire, num_sacks, prior_snd_una); @@ -1729,7 +1711,6 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, } state->mss_now = tcp_current_mss(sk); - state->fack_count = 0; skb = NULL; i = 0; @@ -1787,7 +1768,6 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, skb = tcp_highest_sack(sk); if (!skb) break; - state->fack_count = tp->fackets_out; cache++; goto walk; } @@ -1802,7 +1782,6 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, skb = tcp_highest_sack(sk); if (!skb) break; - state->fack_count = tp->fackets_out; } skb = tcp_sacktag_skip(skb, sk, state, start_seq); @@ -1822,9 +1801,8 @@ advance_sp: for (j = 0; j < used_sacks; j++) tp->recv_sack_cache[i++] = sp[j]; - if ((state->reord < tp->fackets_out) && - ((inet_csk(sk)->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker)) - tcp_update_reordering(sk, tp->fackets_out - state->reord, 0); + if (inet_csk(sk)->icsk_ca_state != 
TCP_CA_Loss || tp->undo_marker) + tcp_check_sack_reordering(sk, state->reord, 0); tcp_verify_left_out(tp); out: @@ -1862,8 +1840,13 @@ static bool tcp_limit_reno_sacked(struct tcp_sock *tp) static void tcp_check_reno_reordering(struct sock *sk, const int addend) { struct tcp_sock *tp = tcp_sk(sk); - if (tcp_limit_reno_sacked(tp)) - tcp_update_reordering(sk, tp->packets_out + addend, 0); + + if (!tcp_limit_reno_sacked(tp)) + return; + + tp->reordering = min_t(u32, tp->packets_out + addend, + sock_net(sk)->ipv4.sysctl_tcp_max_reordering); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRENOREORDER); } /* Emulate SACKs for SACKless connection: account for a new dupack. */ @@ -1909,7 +1892,6 @@ void tcp_clear_retrans(struct tcp_sock *tp) tp->lost_out = 0; tp->undo_marker = 0; tp->undo_retrans = -1; - tp->fackets_out = 0; tp->sacked_out = 0; } @@ -1959,7 +1941,6 @@ void tcp_enter_loss(struct sock *sk) if (is_reneg) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING); tp->sacked_out = 0; - tp->fackets_out = 0; } tcp_clear_all_retrans_hints(tp); @@ -2026,11 +2007,6 @@ static bool tcp_check_sack_reneging(struct sock *sk, int flag) return false; } -static inline int tcp_fackets_out(const struct tcp_sock *tp) -{ - return tcp_is_reno(tp) ? tp->sacked_out + 1 : tp->fackets_out; -} - /* Heurestics to calculate number of duplicate ACKs. There's no dupACKs * counter when SACK is enabled (without SACK, sacked_out is used for * that purpose). @@ -2701,15 +2677,15 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack, } /* Undo during fast recovery after partial ACK. */ -static bool tcp_try_undo_partial(struct sock *sk, const int acked) +static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una) { struct tcp_sock *tp = tcp_sk(sk); if (tp->undo_marker && tcp_packet_delayed(tp)) { /* Plain luck! Hole if filled with delayed - * packet, rather than with a retransmit. + * packet, rather than with a retransmit. Check reordering. */ - tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1); + tcp_check_sack_reordering(sk, prior_snd_una, 1); /* We are getting evidence that the reordering degree is higher * than we realized. If there are no retransmits out then we @@ -2745,6 +2721,14 @@ static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag) } } +static bool tcp_force_fast_retransmit(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + return after(tcp_highest_sack_seq(tp), + tp->snd_una + tp->reordering * tp->mss_cache); +} + /* Process an event, which can update packets-in-flight not trivially. * Main goal of this function is to calculate new estimate for left_out, * taking into account both packets sitting in receiver's buffer and @@ -2757,19 +2741,17 @@ static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag) * It does _not_ decide what to send, it is made in function * tcp_xmit_retransmit_queue(). */ -static void tcp_fastretrans_alert(struct sock *sk, const int acked, +static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, bool is_dupack, int *ack_flag, int *rexmit) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); int fast_rexmit = 0, flag = *ack_flag; bool do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) && - (tcp_fackets_out(tp) > tp->reordering)); + tcp_force_fast_retransmit(sk)); if (!tp->packets_out && tp->sacked_out) tp->sacked_out = 0; - if (!tp->sacked_out && tp->fackets_out) - tp->fackets_out = 0; /* Now state machine starts. * A. 
ECE, hence prohibit cwnd undoing, the reduction is required. */ @@ -2816,11 +2798,11 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, if (tcp_is_reno(tp) && is_dupack) tcp_add_reno_sack(sk); } else { - if (tcp_try_undo_partial(sk, acked)) + if (tcp_try_undo_partial(sk, prior_snd_una)) return; /* Partial ACK arrived. Force fast retransmit. */ do_lost = tcp_is_reno(tp) || - tcp_fackets_out(tp) > tp->reordering; + tcp_force_fast_retransmit(sk); } if (tcp_try_undo_dsack(sk)) { tcp_try_keep_open(sk); @@ -3030,15 +3012,15 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, * is before the ack sequence we can discard it as it's confirmed to have * arrived at the other end. */ -static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, - u32 prior_snd_una, int *acked, +static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, + u32 prior_snd_una, struct tcp_sacktag_state *sack) { const struct inet_connection_sock *icsk = inet_csk(sk); u64 first_ackt, last_ackt; struct tcp_sock *tp = tcp_sk(sk); u32 prior_sacked = tp->sacked_out; - u32 reord = tp->packets_out; + u32 reord = tp->snd_nxt; /* lowest acked un-retx un-sacked seq */ struct sk_buff *skb, *next; bool fully_acked = true; long sack_rtt_us = -1L; @@ -3053,6 +3035,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, for (skb = skb_rb_first(&sk->tcp_rtx_queue); skb; skb = next) { struct tcp_skb_cb *scb = TCP_SKB_CB(skb); + const u32 start_seq = scb->seq; u8 sacked = scb->sacked; u32 acked_pcount; @@ -3083,7 +3066,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, first_ackt = last_ackt; last_in_flight = TCP_SKB_CB(skb)->tx.in_flight; - reord = min(pkts_acked, reord); + if (before(start_seq, reord)) + reord = start_seq; if (!after(scb->end_seq, tp->high_seq)) flag |= FLAG_ORIG_SACK_ACKED; } @@ -3161,15 +3145,12 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, int delta; /* Non-retransmitted hole got filled? 
That's reordering */ - if (reord < prior_fackets && reord <= tp->fackets_out) - tcp_update_reordering(sk, tp->fackets_out - reord, 0); + if (before(reord, prior_fack)) + tcp_check_sack_reordering(sk, reord, 0); delta = prior_sacked - tp->sacked_out; tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta); } - - tp->fackets_out -= min(pkts_acked, tp->fackets_out); - } else if (skb && rtt_update && sack_rtt_us >= 0 && sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp)) { /* Do not re-arm RTO if the sack RTT is measured from data sent @@ -3210,7 +3191,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, } } #endif - *acked = pkts_acked; return flag; } @@ -3519,12 +3499,11 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) u32 ack_seq = TCP_SKB_CB(skb)->seq; u32 ack = TCP_SKB_CB(skb)->ack_seq; bool is_dupack = false; - u32 prior_fackets; int prior_packets = tp->packets_out; u32 delivered = tp->delivered; u32 lost = tp->lost; - int acked = 0; /* Number of packets newly acked */ int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */ + u32 prior_fack; sack_state.first_sackt = 0; sack_state.rate = &rs; @@ -3556,7 +3535,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) icsk->icsk_retransmits = 0; } - prior_fackets = tp->fackets_out; + prior_fack = tcp_highest_sack_seq(tp); rs.prior_in_flight = tcp_packets_in_flight(tp); /* ts_recent update must be made after we are sure that the packet @@ -3612,8 +3591,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) goto no_queue; /* See if we can take anything off of the retransmit queue. */ - flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked, - &sack_state); + flag |= tcp_clean_rtx_queue(sk, prior_fack, prior_snd_una, &sack_state); tcp_rack_update_reo_wnd(sk, &rs); @@ -3625,7 +3603,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (tcp_ack_is_dubious(sk, flag)) { is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); - tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); + tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag, + &rexmit); } if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) @@ -3641,7 +3620,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) no_queue: /* If data was DSACKed, see if we can undo a cwnd reduction. */ if (flag & FLAG_DSACKING_ACK) - tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); + tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag, + &rexmit); /* If this ack opens up a zero window, clear backoff. It was * being used to time the probes, and is probably far higher than * it needs to be for normal retransmission. 
@@ -3663,7 +3643,8 @@ old_ack: if (TCP_SKB_CB(skb)->sacked) { flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, &sack_state); - tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); + tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag, + &rexmit); tcp_xmit_recovery(sk, rexmit); } diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 326c9282bf94..e36eff0403f4 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -475,7 +475,6 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->packets_out = 0; newtp->retrans_out = 0; newtp->sacked_out = 0; - newtp->fackets_out = 0; newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH; newtp->tlp_high_seq = 0; newtp->lsndtime = tcp_jiffies32; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 094c429b4401..0256f7a41041 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1218,21 +1218,6 @@ static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now) } } -/* When a modification to fackets out becomes necessary, we need to check - * skb is counted to fackets_out or not. - */ -static void tcp_adjust_fackets_out(struct sock *sk, const struct sk_buff *skb, - int decr) -{ - struct tcp_sock *tp = tcp_sk(sk); - - if (!tp->sacked_out || tcp_is_reno(tp)) - return; - - if (after(tcp_highest_sack_seq(tp), TCP_SKB_CB(skb)->seq)) - tp->fackets_out -= decr; -} - /* Pcount in the middle of the write queue got changed, we need to do various * tweaks to fix counters */ @@ -1253,8 +1238,6 @@ static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int de if (tcp_is_reno(tp) && decr > 0) tp->sacked_out -= min_t(u32, tp->sacked_out, decr); - tcp_adjust_fackets_out(sk, skb, decr); - if (tp->lost_skb_hint && before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->lost_skb_hint)->seq) && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) -- cgit v1.2.3 From 39b175211053c7a6a4d794c42e225994f1c069c2 Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Fri, 10 Nov 2017 14:03:51 -0800 Subject: net: Remove unused skb_shared_info member ip6_frag_id was only used by UFO, which has been removed. ipv6_proxy_select_ident() only existed to set ip6_frag_id and has no in-tree callers. Signed-off-by: Mat Martineau Signed-off-by: David S. 
Miller --- include/linux/skbuff.h | 1 - include/net/ipv6.h | 1 - net/ipv6/output_core.c | 31 ------------------------------- 3 files changed, 33 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 57d712671081..54fe91183a8e 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -500,7 +500,6 @@ struct skb_shared_info { struct skb_shared_hwtstamps hwtstamps; unsigned int gso_type; u32 tskey; - __be32 ip6_frag_id; /* * Warning : all fields before dataref are cleared in __alloc_skb() diff --git a/include/net/ipv6.h b/include/net/ipv6.h index fb6d67012de6..ec14f0d5a3a1 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -767,7 +767,6 @@ static inline int ipv6_addr_diff(const struct in6_addr *a1, const struct in6_add __be32 ipv6_select_ident(struct net *net, const struct in6_addr *daddr, const struct in6_addr *saddr); -void ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb); int ip6_dst_hoplimit(struct dst_entry *dst); diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c index a338bbc33cf3..4a7e5ffa5108 100644 --- a/net/ipv6/output_core.c +++ b/net/ipv6/output_core.c @@ -31,37 +31,6 @@ static u32 __ipv6_select_ident(struct net *net, u32 hashrnd, return id; } -/* This function exists only for tap drivers that must support broken - * clients requesting UFO without specifying an IPv6 fragment ID. - * - * This is similar to ipv6_select_ident() but we use an independent hash - * seed to limit information leakage. - * - * The network header must be set before calling this. - */ -void ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb) -{ - static u32 ip6_proxy_idents_hashrnd __read_mostly; - struct in6_addr buf[2]; - struct in6_addr *addrs; - u32 id; - - addrs = skb_header_pointer(skb, - skb_network_offset(skb) + - offsetof(struct ipv6hdr, saddr), - sizeof(buf), buf); - if (!addrs) - return; - - net_get_random_once(&ip6_proxy_idents_hashrnd, - sizeof(ip6_proxy_idents_hashrnd)); - - id = __ipv6_select_ident(net, ip6_proxy_idents_hashrnd, - &addrs[1], &addrs[0]); - skb_shinfo(skb)->ip6_frag_id = htonl(id); -} -EXPORT_SYMBOL_GPL(ipv6_proxy_select_ident); - __be32 ipv6_select_ident(struct net *net, const struct in6_addr *daddr, const struct in6_addr *saddr) -- cgit v1.2.3 From 99803171ef04037092bf5eb29ae801e8b4d49a75 Mon Sep 17 00:00:00 2001 From: Dave Taht Date: Wed, 8 Nov 2017 15:12:27 -0800 Subject: netem: add uapi to express delay and jitter in nanoseconds netem userspace has long relied on a horrible /proc/net/psched hack to translate the current notion of "ticks" to nanoseconds. Expressing latency and jitter instead, in well defined nanoseconds, increases the dynamic range of emulated delays and jitter in netem. It will also ease a transition where reducing a tick to nsec equivalence would constrain the max delay in prior versions of netem to only 4.3 seconds. Signed-off-by: Dave Taht Suggested-by: Eric Dumazet Reviewed-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/uapi/linux/pkt_sched.h | 2 ++ net/sched/sch_netem.c | 14 ++++++++++++++ 2 files changed, 16 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 6a2c5ea7e9c4..8fe6d1842bee 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -537,6 +537,8 @@ enum { TCA_NETEM_ECN, TCA_NETEM_RATE64, TCA_NETEM_PAD, + TCA_NETEM_LATENCY64, + TCA_NETEM_JITTER64, __TCA_NETEM_MAX, }; diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index e64e0e0d94ff..47d6decba0ea 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -819,6 +819,8 @@ static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = { [TCA_NETEM_LOSS] = { .type = NLA_NESTED }, [TCA_NETEM_ECN] = { .type = NLA_U32 }, [TCA_NETEM_RATE64] = { .type = NLA_U64 }, + [TCA_NETEM_LATENCY64] = { .type = NLA_S64 }, + [TCA_NETEM_JITTER64] = { .type = NLA_S64 }, }; static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla, @@ -916,6 +918,12 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt) q->rate = max_t(u64, q->rate, nla_get_u64(tb[TCA_NETEM_RATE64])); + if (tb[TCA_NETEM_LATENCY64]) + q->latency = nla_get_s64(tb[TCA_NETEM_LATENCY64]); + + if (tb[TCA_NETEM_JITTER64]) + q->jitter = nla_get_s64(tb[TCA_NETEM_JITTER64]); + if (tb[TCA_NETEM_ECN]) q->ecn = nla_get_u32(tb[TCA_NETEM_ECN]); @@ -1020,6 +1028,12 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb) if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt)) goto nla_put_failure; + if (nla_put(skb, TCA_NETEM_LATENCY64, sizeof(q->latency), &q->latency)) + goto nla_put_failure; + + if (nla_put(skb, TCA_NETEM_JITTER64, sizeof(q->jitter), &q->jitter)) + goto nla_put_failure; + cor.delay_corr = q->delay_cor.rho; cor.loss_corr = q->loss_cor.rho; cor.dup_corr = q->dup_cor.rho; -- cgit v1.2.3 From 836af83b54e3e285c4a0cc06c24aeb737d3e0e18 Mon Sep 17 00:00:00 2001 From: Dave Taht Date: Wed, 8 Nov 2017 15:12:28 -0800 Subject: netem: support delivering packets in delayed time slots Slotting is a crude approximation of the behaviors of shared media such as cable, wifi, and LTE, which gather up a bunch of packets within a varying delay window and deliver them, relative to that, nearly all at once. It works within the existing loss, duplication, jitter and delay parameters of netem. Some amount of inherent latency must be specified, regardless. The new "slot" parameter specifies a minimum and maximum delay between transmission attempts. The "bytes" and "packets" parameters can be used to limit the amount of information transferred per slot. Examples of use: tc qdisc add dev eth0 root netem delay 200us \ slot 800us 10ms bytes 64k packets 42 A more correct example, using stacked netem instances and a packet limit to emulate a tail drop wifi queue with slots and variable packet delivery, with a 200Mbit isochronous underlying rate, and 20ms path delay: tc qdisc add dev eth0 root handle 1: netem delay 20ms rate 200mbit \ limit 10000 tc qdisc add dev eth0 parent 1:1 handle 10:1 netem delay 200us \ slot 800us 10ms bytes 64k packets 42 limit 512 Signed-off-by: Dave Taht Signed-off-by: David S. 
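Miller

[ Editorial aside: the slot boundary computation in get_slot_next() below draws the next delay from [min_delay, max_delay) with a fixed-point trick: multiply a 32-bit random value by the span and keep the upper 32 bits, avoiding a division or modulo. The same idea in isolation, as a standalone sketch: ]

#include <stdint.h>

/* Pick the next slot delay in [min_delay, max_delay): rnd / 2^32 is a
 * fraction in [0, 1), so scaling the span by it needs only a multiply
 * and a shift.
 */
static uint64_t pick_slot_delay(uint64_t min_delay, uint64_t max_delay,
				uint32_t rnd)
{
	return min_delay + (((uint64_t)rnd * (max_delay - min_delay)) >> 32);
}

Signed-off-by: David S.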
Miller --- include/uapi/linux/pkt_sched.h | 8 +++++ net/sched/sch_netem.c | 74 ++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 79 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 8fe6d1842bee..af3cc2f4e1ad 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -539,6 +539,7 @@ enum { TCA_NETEM_PAD, TCA_NETEM_LATENCY64, TCA_NETEM_JITTER64, + TCA_NETEM_SLOT, __TCA_NETEM_MAX, }; @@ -576,6 +577,13 @@ struct tc_netem_rate { __s32 cell_overhead; }; +struct tc_netem_slot { + __s64 min_delay; /* nsec */ + __s64 max_delay; + __s32 max_packets; + __s32 max_bytes; +}; + enum { NETEM_LOSS_UNSPEC, NETEM_LOSS_GI, /* General Intuitive - 4 state model */ diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 47d6decba0ea..b686e755fda9 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -135,6 +135,13 @@ struct netem_sched_data { u32 a5; /* p23 used only in 4-states */ } clg; + struct tc_netem_slot slot_config; + struct slotstate { + u64 slot_next; + s32 packets_left; + s32 bytes_left; + } slot; + }; /* Time stamp put into socket buffer control block @@ -591,6 +598,20 @@ finish_segs: return NET_XMIT_SUCCESS; } +/* Delay the next round with a new future slot with a + * correct number of bytes and packets. + */ + +static void get_slot_next(struct netem_sched_data *q, u64 now) +{ + q->slot.slot_next = now + q->slot_config.min_delay + + (prandom_u32() * + (q->slot_config.max_delay - + q->slot_config.min_delay) >> 32); + q->slot.packets_left = q->slot_config.max_packets; + q->slot.bytes_left = q->slot_config.max_bytes; +} + static struct sk_buff *netem_dequeue(struct Qdisc *sch) { struct netem_sched_data *q = qdisc_priv(sch); @@ -608,14 +629,17 @@ deliver: p = rb_first(&q->t_root); if (p) { u64 time_to_send; + u64 now = ktime_get_ns(); skb = rb_to_skb(p); /* if more time remaining? */ time_to_send = netem_skb_cb(skb)->time_to_send; - if (time_to_send <= ktime_get_ns()) { - rb_erase(p, &q->t_root); + if (q->slot.slot_next && q->slot.slot_next < time_to_send) + get_slot_next(q, now); + if (time_to_send <= now && q->slot.slot_next <= now) { + rb_erase(p, &q->t_root); sch->q.qlen--; qdisc_qstats_backlog_dec(sch, skb); skb->next = NULL; @@ -634,6 +658,14 @@ deliver: skb->tstamp = 0; #endif + if (q->slot.slot_next) { + q->slot.packets_left--; + q->slot.bytes_left -= qdisc_pkt_len(skb); + if (q->slot.packets_left <= 0 || + q->slot.bytes_left <= 0) + get_slot_next(q, now); + } + if (q->qdisc) { unsigned int pkt_len = qdisc_pkt_len(skb); struct sk_buff *to_free = NULL; @@ -657,7 +689,10 @@ deliver: if (skb) goto deliver; } - qdisc_watchdog_schedule_ns(&q->watchdog, time_to_send); + + qdisc_watchdog_schedule_ns(&q->watchdog, + max(time_to_send, + q->slot.slot_next)); } if (q->qdisc) { @@ -688,6 +723,7 @@ static void dist_free(struct disttable *d) * Distribution data is a variable size payload containing * signed 16 bit values. 
*/ + static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr) { struct netem_sched_data *q = qdisc_priv(sch); @@ -718,6 +754,23 @@ static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr) return 0; } +static void get_slot(struct netem_sched_data *q, const struct nlattr *attr) +{ + const struct tc_netem_slot *c = nla_data(attr); + + q->slot_config = *c; + if (q->slot_config.max_packets == 0) + q->slot_config.max_packets = INT_MAX; + if (q->slot_config.max_bytes == 0) + q->slot_config.max_bytes = INT_MAX; + q->slot.packets_left = q->slot_config.max_packets; + q->slot.bytes_left = q->slot_config.max_bytes; + if (q->slot_config.min_delay | q->slot_config.max_delay) + q->slot.slot_next = ktime_get_ns(); + else + q->slot.slot_next = 0; +} + static void get_correlation(struct netem_sched_data *q, const struct nlattr *attr) { const struct tc_netem_corr *c = nla_data(attr); @@ -821,6 +874,7 @@ static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = { [TCA_NETEM_RATE64] = { .type = NLA_U64 }, [TCA_NETEM_LATENCY64] = { .type = NLA_S64 }, [TCA_NETEM_JITTER64] = { .type = NLA_S64 }, + [TCA_NETEM_SLOT] = { .len = sizeof(struct tc_netem_slot) }, }; static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla, @@ -927,6 +981,9 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt) if (tb[TCA_NETEM_ECN]) q->ecn = nla_get_u32(tb[TCA_NETEM_ECN]); + if (tb[TCA_NETEM_SLOT]) + get_slot(q, tb[TCA_NETEM_SLOT]); + return ret; } @@ -1016,6 +1073,7 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb) struct tc_netem_reorder reorder; struct tc_netem_corrupt corrupt; struct tc_netem_rate rate; + struct tc_netem_slot slot; qopt.latency = min_t(psched_tdiff_t, PSCHED_NS2TICKS(q->latency), UINT_MAX); @@ -1070,6 +1128,16 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb) if (dump_loss_model(q, skb) != 0) goto nla_put_failure; + if (q->slot_config.min_delay | q->slot_config.max_delay) { + slot = q->slot_config; + if (slot.max_packets == INT_MAX) + slot.max_packets = 0; + if (slot.max_bytes == INT_MAX) + slot.max_bytes = 0; + if (nla_put(skb, TCA_NETEM_SLOT, sizeof(slot), &slot)) + goto nla_put_failure; + } + return nla_nest_end(skb, nla); nla_put_failure: -- cgit v1.2.3 From 5ed4e3eb021762fee584ce65620bc822131c7aa0 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 10 Nov 2017 15:22:52 -0800 Subject: net: dsa: Pass a port to get_tag_protocol() A number of drivers want to check whether the configured CPU port is a possible configuration for enabling tagging, pass down the CPU port number so they verify that. Signed-off-by: Florian Fainelli Reviewed-by: Vivien Didelot Reviewed-by: Andrew Lunn Signed-off-by: David S. 
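Miller

[ Editorial aside: with the port number now available, a driver can validate the proposed CPU port right in its get_tag_protocol() hook instead of scanning every port, as the b53 conversion below does. A minimal sketch for a hypothetical driver; the foo_ prefix and the port numbers are made up. ]

#include <net/dsa.h>

/* Hypothetical driver hook: only ports 5 and 8 of this imaginary
 * switch can carry a Broadcom tag, so refuse anything else and fall
 * back to untagged operation.
 */
static enum dsa_tag_protocol foo_get_tag_protocol(struct dsa_switch *ds,
						  int port)
{
	if (port != 5 && port != 8) {
		dev_warn(ds->dev, "port %d cannot be a tagging CPU port\n",
			 port);
		return DSA_TAG_PROTO_NONE;
	}
	return DSA_TAG_PROTO_BRCM;
}

Signed-off-by: David S.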
Miller --- drivers/net/dsa/b53/b53_common.c | 32 +++++++++++++------------------- drivers/net/dsa/bcm_sf2.c | 3 ++- drivers/net/dsa/dsa_loop.c | 3 ++- drivers/net/dsa/lan9303-core.c | 3 ++- drivers/net/dsa/microchip/ksz_common.c | 3 ++- drivers/net/dsa/mt7530.c | 4 ++-- drivers/net/dsa/mv88e6060.c | 3 ++- drivers/net/dsa/mv88e6xxx/chip.c | 3 ++- drivers/net/dsa/qca8k.c | 2 +- include/net/dsa.h | 3 ++- net/dsa/dsa2.c | 2 +- net/dsa/legacy.c | 2 +- 12 files changed, 32 insertions(+), 31 deletions(-) (limited to 'include') diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index 44a9a03bff55..f72aeb9ed12a 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -541,7 +541,8 @@ EXPORT_SYMBOL(b53_disable_port); void b53_brcm_hdr_setup(struct dsa_switch *ds, int port) { - bool tag_en = !!(ds->ops->get_tag_protocol(ds) == DSA_TAG_PROTO_BRCM); + bool tag_en = !!(ds->ops->get_tag_protocol(ds, port) == + DSA_TAG_PROTO_BRCM); struct b53_device *dev = ds->priv; u8 hdr_ctl, val; u16 reg; @@ -1478,38 +1479,31 @@ void b53_br_fast_age(struct dsa_switch *ds, int port) } EXPORT_SYMBOL(b53_br_fast_age); -static bool b53_can_enable_brcm_tags(struct dsa_switch *ds) +static bool b53_can_enable_brcm_tags(struct dsa_switch *ds, int port) { - unsigned int brcm_tag_mask; - unsigned int i; - /* Broadcom switches will accept enabling Broadcom tags on the * following ports: 5, 7 and 8, any other port is not supported */ - brcm_tag_mask = BIT(B53_CPU_PORT_25) | BIT(7) | BIT(B53_CPU_PORT); - - for (i = 0; i < ds->num_ports; i++) { - if (dsa_is_cpu_port(ds, i)) { - if (!(BIT(i) & brcm_tag_mask)) { - dev_warn(ds->dev, - "Port %d is not Broadcom tag capable\n", - i); - return false; - } - } + switch (port) { + case B53_CPU_PORT_25: + case 7: + case B53_CPU_PORT: + return true; } - return true; + dev_warn(ds->dev, "Port %d is not Broadcom tag capable\n", port); + return false; } -static enum dsa_tag_protocol b53_get_tag_protocol(struct dsa_switch *ds) +static enum dsa_tag_protocol b53_get_tag_protocol(struct dsa_switch *ds, + int port) { struct b53_device *dev = ds->priv; /* Older models support a different tag format that we do not * support in net/dsa/tag_brcm.c yet. 
*/ - if (is5325(dev) || is5365(dev) || !b53_can_enable_brcm_tags(ds)) + if (is5325(dev) || is5365(dev) || !b53_can_enable_brcm_tags(ds, port)) return DSA_TAG_PROTO_NONE; else return DSA_TAG_PROTO_BRCM; diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c index 2d6867f4008c..93faa1fed6f2 100644 --- a/drivers/net/dsa/bcm_sf2.c +++ b/drivers/net/dsa/bcm_sf2.c @@ -35,7 +35,8 @@ #include "b53/b53_priv.h" #include "b53/b53_regs.h" -static enum dsa_tag_protocol bcm_sf2_sw_get_tag_protocol(struct dsa_switch *ds) +static enum dsa_tag_protocol bcm_sf2_sw_get_tag_protocol(struct dsa_switch *ds, + int port) { return DSA_TAG_PROTO_BRCM; } diff --git a/drivers/net/dsa/dsa_loop.c b/drivers/net/dsa/dsa_loop.c index 3a3f4f7ba364..bb71d3d6f65b 100644 --- a/drivers/net/dsa/dsa_loop.c +++ b/drivers/net/dsa/dsa_loop.c @@ -64,7 +64,8 @@ struct dsa_loop_priv { static struct phy_device *phydevs[PHY_MAX_ADDR]; -static enum dsa_tag_protocol dsa_loop_get_protocol(struct dsa_switch *ds) +static enum dsa_tag_protocol dsa_loop_get_protocol(struct dsa_switch *ds, + int port) { dev_dbg(ds->dev, "%s\n", __func__); diff --git a/drivers/net/dsa/lan9303-core.c b/drivers/net/dsa/lan9303-core.c index a2610085e7ba..fdfdb0edfe62 100644 --- a/drivers/net/dsa/lan9303-core.c +++ b/drivers/net/dsa/lan9303-core.c @@ -894,7 +894,8 @@ static int lan9303_check_device(struct lan9303 *chip) /* ---------------------------- DSA -----------------------------------*/ -static enum dsa_tag_protocol lan9303_get_tag_protocol(struct dsa_switch *ds) +static enum dsa_tag_protocol lan9303_get_tag_protocol(struct dsa_switch *ds, + int port) { return DSA_TAG_PROTO_LAN9303; } diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c index 56cd6d365352..b5be93a1e0df 100644 --- a/drivers/net/dsa/microchip/ksz_common.c +++ b/drivers/net/dsa/microchip/ksz_common.c @@ -394,7 +394,8 @@ static int ksz_setup(struct dsa_switch *ds) return 0; } -static enum dsa_tag_protocol ksz_get_tag_protocol(struct dsa_switch *ds) +static enum dsa_tag_protocol ksz_get_tag_protocol(struct dsa_switch *ds, + int port) { return DSA_TAG_PROTO_KSZ; } diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index 627c039f12ca..2820d69810b3 100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -907,11 +907,11 @@ err: } static enum dsa_tag_protocol -mtk_get_tag_protocol(struct dsa_switch *ds) +mtk_get_tag_protocol(struct dsa_switch *ds, int port) { struct mt7530_priv *priv = ds->priv; - if (!dsa_is_cpu_port(ds, MT7530_CPU_PORT)) { + if (port != MT7530_CPU_PORT) { dev_warn(priv->dev, "port not matched with tagging CPU port\n"); return DSA_TAG_PROTO_NONE; diff --git a/drivers/net/dsa/mv88e6060.c b/drivers/net/dsa/mv88e6060.c index 45768e3c5bc5..65f10fec25b3 100644 --- a/drivers/net/dsa/mv88e6060.c +++ b/drivers/net/dsa/mv88e6060.c @@ -70,7 +70,8 @@ static const char *mv88e6060_get_name(struct mii_bus *bus, int sw_addr) return NULL; } -static enum dsa_tag_protocol mv88e6060_get_tag_protocol(struct dsa_switch *ds) +static enum dsa_tag_protocol mv88e6060_get_tag_protocol(struct dsa_switch *ds, + int port) { return DSA_TAG_PROTO_TRAILER; } diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index d6c3a22c8789..8171055fde7a 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -3731,7 +3731,8 @@ static int mv88e6xxx_smi_init(struct mv88e6xxx_chip *chip, return 0; } -static enum dsa_tag_protocol mv88e6xxx_get_tag_protocol(struct dsa_switch *ds) +static enum 
dsa_tag_protocol mv88e6xxx_get_tag_protocol(struct dsa_switch *ds, + int port) { struct mv88e6xxx_chip *chip = ds->priv; diff --git a/drivers/net/dsa/qca8k.c b/drivers/net/dsa/qca8k.c index cf72e274275f..9df22ebee822 100644 --- a/drivers/net/dsa/qca8k.c +++ b/drivers/net/dsa/qca8k.c @@ -823,7 +823,7 @@ qca8k_port_fdb_dump(struct dsa_switch *ds, int port, } static enum dsa_tag_protocol -qca8k_get_tag_protocol(struct dsa_switch *ds) +qca8k_get_tag_protocol(struct dsa_switch *ds, int port) { return DSA_TAG_PROTO_QCA; } diff --git a/include/net/dsa.h b/include/net/dsa.h index 6c239257309b..68e232fd4b0f 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -321,7 +321,8 @@ struct dsa_switch_ops { struct device *host_dev, int sw_addr, void **priv); - enum dsa_tag_protocol (*get_tag_protocol)(struct dsa_switch *ds); + enum dsa_tag_protocol (*get_tag_protocol)(struct dsa_switch *ds, + int port); int (*setup)(struct dsa_switch *ds); u32 (*get_phy_flags)(struct dsa_switch *ds, int port); diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index fd54a8e17986..44e3fb7dec8c 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -539,7 +539,7 @@ static int dsa_port_parse_cpu(struct dsa_port *dp, struct net_device *master) const struct dsa_device_ops *tag_ops; enum dsa_tag_protocol tag_protocol; - tag_protocol = ds->ops->get_tag_protocol(ds); + tag_protocol = ds->ops->get_tag_protocol(ds, dp->index); tag_ops = dsa_resolve_tag_protocol(tag_protocol); if (IS_ERR(tag_ops)) { dev_warn(ds->dev, "No tagger for this switch\n"); diff --git a/net/dsa/legacy.c b/net/dsa/legacy.c index 4863e3e398b6..84611d7fcfa2 100644 --- a/net/dsa/legacy.c +++ b/net/dsa/legacy.c @@ -151,7 +151,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, const struct dsa_device_ops *tag_ops; enum dsa_tag_protocol tag_protocol; - tag_protocol = ops->get_tag_protocol(ds); + tag_protocol = ops->get_tag_protocol(ds, dst->cpu_dp->index); tag_ops = dsa_resolve_tag_protocol(tag_protocol); if (IS_ERR(tag_ops)) return PTR_ERR(tag_ops); -- cgit v1.2.3 From b74b70c44986dee87881fbed3d912e02c5dcf78c Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 10 Nov 2017 15:22:54 -0800 Subject: net: dsa: Support prepended Broadcom tag Add a new type, DSA_TAG_PROTO_BRCM_PREPEND, which adds support for the 4-byte Broadcom tag that we already support, but in a format where it is prepended to the packet instead of located between the MAC SA and the EtherType (DSA_TAG_PROTO_BRCM).
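[ Editorial aside: the only difference between the two flavors is where brcm_tag_xmit_ll() inserts the 4 bytes, as the diff below shows (offset 2 * ETH_ALEN versus offset 0). A sketch of the two on-wire layouts: ]

/* DSA_TAG_PROTO_BRCM - tag sits after the MAC source address:
 *
 *   | MAC DA | MAC SA | 4-byte tag | EtherType | payload |
 *
 * DSA_TAG_PROTO_BRCM_PREPEND - tag sits before the Ethernet header,
 * at the very start of the frame:
 *
 *   | 4-byte tag | MAC DA | MAC SA | EtherType | payload |
 */

Signed-off-by: Florian Fainelli Reviewed-by: Andrew Lunn Signed-off-by: David S.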
Miller --- include/net/dsa.h | 1 + net/dsa/Kconfig | 3 +++ net/dsa/Makefile | 1 + net/dsa/dsa.c | 3 +++ net/dsa/dsa_priv.h | 1 + net/dsa/tag_brcm.c | 39 ++++++++++++++++++++++++++++++++------- 6 files changed, 41 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 68e232fd4b0f..2a05738570d8 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -29,6 +29,7 @@ struct fixed_phy_status; enum dsa_tag_protocol { DSA_TAG_PROTO_NONE = 0, DSA_TAG_PROTO_BRCM, + DSA_TAG_PROTO_BRCM_PREPEND, DSA_TAG_PROTO_DSA, DSA_TAG_PROTO_EDSA, DSA_TAG_PROTO_KSZ, diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index cc5f8f971689..2fed892094bc 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -19,6 +19,9 @@ if NET_DSA config NET_DSA_TAG_BRCM bool +config NET_DSA_TAG_BRCM_PREPEND + bool + config NET_DSA_TAG_DSA bool diff --git a/net/dsa/Makefile b/net/dsa/Makefile index e9a4a0f33e86..0e13c1f95d13 100644 --- a/net/dsa/Makefile +++ b/net/dsa/Makefile @@ -5,6 +5,7 @@ dsa_core-y += dsa.o dsa2.o legacy.o master.o port.o slave.o switch.o # tagging formats dsa_core-$(CONFIG_NET_DSA_TAG_BRCM) += tag_brcm.o +dsa_core-$(CONFIG_NET_DSA_TAG_BRCM_PREPEND) += tag_brcm.o dsa_core-$(CONFIG_NET_DSA_TAG_DSA) += tag_dsa.o dsa_core-$(CONFIG_NET_DSA_TAG_EDSA) += tag_edsa.o dsa_core-$(CONFIG_NET_DSA_TAG_KSZ) += tag_ksz.o diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index b8f2d9f7c3ed..6a9d0f50fbee 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -44,6 +44,9 @@ const struct dsa_device_ops *dsa_device_ops[DSA_TAG_LAST] = { #ifdef CONFIG_NET_DSA_TAG_BRCM [DSA_TAG_PROTO_BRCM] = &brcm_netdev_ops, #endif +#ifdef CONFIG_NET_DSA_TAG_BRCM_PREPEND + [DSA_TAG_PROTO_BRCM_PREPEND] = &brcm_prepend_netdev_ops, +#endif #ifdef CONFIG_NET_DSA_TAG_DSA [DSA_TAG_PROTO_DSA] = &dsa_netdev_ops, #endif diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 507e1ce4d4d2..7d036696e8c4 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -191,6 +191,7 @@ void dsa_switch_unregister_notifier(struct dsa_switch *ds); /* tag_brcm.c */ extern const struct dsa_device_ops brcm_netdev_ops; +extern const struct dsa_device_ops brcm_prepend_netdev_ops; /* tag_dsa.c */ extern const struct dsa_device_ops dsa_netdev_ops; diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c index 771409a1e65c..e6e0b7b6025c 100644 --- a/net/dsa/tag_brcm.c +++ b/net/dsa/tag_brcm.c @@ -96,13 +96,6 @@ static struct sk_buff *brcm_tag_xmit_ll(struct sk_buff *skb, return skb; } -static struct sk_buff *brcm_tag_xmit(struct sk_buff *skb, - struct net_device *dev) -{ - /* Build the tag after the MAC Source Address */ - return brcm_tag_xmit_ll(skb, dev, 2 * ETH_ALEN); -} - static struct sk_buff *brcm_tag_rcv_ll(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, @@ -139,6 +132,15 @@ static struct sk_buff *brcm_tag_rcv_ll(struct sk_buff *skb, return skb; } +#ifdef CONFIG_NET_DSA_TAG_BRCM +static struct sk_buff *brcm_tag_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + /* Build the tag after the MAC Source Address */ + return brcm_tag_xmit_ll(skb, dev, 2 * ETH_ALEN); +} + + static struct sk_buff *brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) { @@ -161,3 +163,26 @@ const struct dsa_device_ops brcm_netdev_ops = { .xmit = brcm_tag_xmit, .rcv = brcm_tag_rcv, }; +#endif + +#ifdef CONFIG_NET_DSA_TAG_BRCM_PREPEND +static struct sk_buff *brcm_tag_xmit_prepend(struct sk_buff *skb, + struct net_device *dev) +{ + /* tag is prepended to the packet */ + return brcm_tag_xmit_ll(skb, dev, 0); 
+} + +static struct sk_buff *brcm_tag_rcv_prepend(struct sk_buff *skb, + struct net_device *dev, + struct packet_type *pt) +{ + /* tag is prepended to the packet */ + return brcm_tag_rcv_ll(skb, dev, pt, ETH_HLEN); +} + +const struct dsa_device_ops brcm_prepend_netdev_ops = { + .xmit = brcm_tag_xmit_prepend, + .rcv = brcm_tag_rcv_prepend, +}; +#endif -- cgit v1.2.3 From 5794040647de4011598a6d005fdad95d24fd385b Mon Sep 17 00:00:00 2001 From: Andy Zhou Date: Fri, 10 Nov 2017 12:09:40 -0800 Subject: openvswitch: Add meter netlink definitions Meter has its own netlink family. Define netlink messages and attributes for communicating with the user space programs. Signed-off-by: Andy Zhou Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 51 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index ec75a685f1dd..d60b9a4cf3d1 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -883,4 +883,55 @@ enum ovs_action_attr { #define OVS_ACTION_ATTR_MAX (__OVS_ACTION_ATTR_MAX - 1) +/* Meters. */ +#define OVS_METER_FAMILY "ovs_meter" +#define OVS_METER_MCGROUP "ovs_meter" +#define OVS_METER_VERSION 0x1 + +enum ovs_meter_cmd { + OVS_METER_CMD_UNSPEC, + OVS_METER_CMD_FEATURES, /* Get features supported by the datapath. */ + OVS_METER_CMD_SET, /* Add or modify a meter. */ + OVS_METER_CMD_DEL, /* Delete a meter. */ + OVS_METER_CMD_GET /* Get meter stats. */ +}; + +enum ovs_meter_attr { + OVS_METER_ATTR_UNSPEC, + OVS_METER_ATTR_ID, /* u32 meter ID within datapath. */ + OVS_METER_ATTR_KBPS, /* No argument. If set, units in kilobits + * per second. Otherwise, units in + * packets per second. + */ + OVS_METER_ATTR_STATS, /* struct ovs_flow_stats for the meter. */ + OVS_METER_ATTR_BANDS, /* Nested attributes for meter bands. */ + OVS_METER_ATTR_USED, /* u64 msecs last used in monotonic time. */ + OVS_METER_ATTR_CLEAR, /* Flag to clear stats, used. */ + OVS_METER_ATTR_MAX_METERS, /* u32 number of meters supported. */ + OVS_METER_ATTR_MAX_BANDS, /* u32 max number of bands per meter. */ + OVS_METER_ATTR_PAD, + __OVS_METER_ATTR_MAX +}; + +#define OVS_METER_ATTR_MAX (__OVS_METER_ATTR_MAX - 1) + +enum ovs_band_attr { + OVS_BAND_ATTR_UNSPEC, + OVS_BAND_ATTR_TYPE, /* u32 OVS_METER_BAND_TYPE_* constant. */ + OVS_BAND_ATTR_RATE, /* u32 band rate in meter units (see above). */ + OVS_BAND_ATTR_BURST, /* u32 burst size in meter units. */ + OVS_BAND_ATTR_STATS, /* struct ovs_flow_stats for the band. */ + __OVS_BAND_ATTR_MAX +}; + +#define OVS_BAND_ATTR_MAX (__OVS_BAND_ATTR_MAX - 1) + +enum ovs_meter_band_type { + OVS_METER_BAND_TYPE_UNSPEC, + OVS_METER_BAND_TYPE_DROP, /* Drop exceeding packets. */ + __OVS_METER_BAND_TYPE_MAX +}; + +#define OVS_METER_BAND_TYPE_MAX (__OVS_METER_BAND_TYPE_MAX - 1) + #endif /* _LINUX_OPENVSWITCH_H */ -- cgit v1.2.3 From cd8a6c33693c1b89d2737ffdbf9611564e9ac907 Mon Sep 17 00:00:00 2001 From: Andy Zhou Date: Fri, 10 Nov 2017 12:09:43 -0800 Subject: openvswitch: Add meter action support Implements OVS kernel meter action support. Signed-off-by: Andy Zhou Signed-off-by: David S. 
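Miller

[ Editorial aside: the only band type defined so far is OVS_METER_BAND_TYPE_DROP, which behaves like a token bucket: OVS_BAND_ATTR_RATE fills the bucket, OVS_BAND_ATTR_BURST bounds its depth, and a packet that finds insufficient credit is dropped. A conceptual sketch of that check, not the kernel implementation: ]

#include <stdbool.h>
#include <stdint.h>

struct band {
	uint64_t tokens;	/* current credit, in meter units */
	uint64_t burst;		/* bucket depth */
	uint64_t rate;		/* refill per second */
};

/* Refill by elapsed time, cap at the burst size, then charge the
 * packet; returning false is where a DROP band discards it.
 */
static bool band_admits(struct band *b, uint64_t cost, uint64_t elapsed_ns)
{
	b->tokens += b->rate * elapsed_ns / 1000000000ULL;
	if (b->tokens > b->burst)
		b->tokens = b->burst;
	if (b->tokens < cost)
		return false;
	b->tokens -= cost;
	return true;
}

Signed-off-by: David S.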
Miller --- include/uapi/linux/openvswitch.h | 3 +++ net/openvswitch/actions.c | 6 ++++++ net/openvswitch/datapath.h | 1 + net/openvswitch/flow_netlink.c | 6 ++++++ 4 files changed, 16 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index d60b9a4cf3d1..4265d7f9e1f2 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -838,6 +838,8 @@ struct ovs_action_push_eth { * @OVS_ACTION_ATTR_CT_CLEAR: Clear conntrack state from the packet. * @OVS_ACTION_ATTR_PUSH_NSH: push NSH header to the packet. * @OVS_ACTION_ATTR_POP_NSH: pop the outermost NSH header off the packet. + * @OVS_ACTION_ATTR_METER: Run packet through a meter, which may drop the + * packet, or modify the packet (e.g., change the DSCP field). * * Only a single header can be set with a single %OVS_ACTION_ATTR_SET. Not all * fields within a header are modifiable, e.g. the IPv4 protocol and fragment @@ -870,6 +872,7 @@ enum ovs_action_attr { OVS_ACTION_ATTR_CT_CLEAR, /* No argument. */ OVS_ACTION_ATTR_PUSH_NSH, /* Nested OVS_NSH_KEY_ATTR_*. */ OVS_ACTION_ATTR_POP_NSH, /* No argument. */ + OVS_ACTION_ATTR_METER, /* u32 meter ID. */ __OVS_ACTION_ATTR_MAX, /* Nothing past this will be accepted * from userspace. */ diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 9a6a6d51e421..30a5df27116e 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -1330,6 +1330,12 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, case OVS_ACTION_ATTR_POP_NSH: err = pop_nsh(skb, key); break; + + case OVS_ACTION_ATTR_METER: + if (ovs_meter_execute(dp, skb, key, nla_get_u32(a))) { + consume_skb(skb); + return 0; + } } if (unlikely(err)) { diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index 5d2997b42460..523d65526766 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -30,6 +30,7 @@ #include "conntrack.h" #include "flow.h" #include "flow_table.h" +#include "meter.h" #include "vport-internal_dev.h" #define DP_MAX_PORTS USHRT_MAX diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 4201f9293af3..bb4dae198c78 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -90,6 +90,7 @@ static bool actions_may_change_flow(const struct nlattr *actions) case OVS_ACTION_ATTR_SAMPLE: case OVS_ACTION_ATTR_SET: case OVS_ACTION_ATTR_SET_MASKED: + case OVS_ACTION_ATTR_METER: default: return true; } @@ -2844,6 +2845,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, [OVS_ACTION_ATTR_POP_ETH] = 0, [OVS_ACTION_ATTR_PUSH_NSH] = (u32)-1, [OVS_ACTION_ATTR_POP_NSH] = 0, + [OVS_ACTION_ATTR_METER] = sizeof(u32), }; const struct ovs_action_push_vlan *vlan; int type = nla_type(a); @@ -3029,6 +3031,10 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, break; } + case OVS_ACTION_ATTR_METER: + /* Non-existent meters are simply ignored. */ + break; + default: OVS_NLERR(log, "Unknown Action type %d", type); return -EINVAL; -- cgit v1.2.3 From 3a9b76fd0db9f0d426533f96a68a62a58753a51e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 11 Nov 2017 15:54:12 -0800 Subject: tcp: allow drivers to tweak TSQ logic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I had many reports that TSQ logic breaks wifi aggregation. Current logic is to allow up to 1 ms of bytes to be queued into qdisc and drivers queues. 
But Wifi aggregation needs a bigger budget to allow bigger rates to be discovered by various TCP congestion control algorithms. This patch adds an extra socket field, allowing wifi drivers to select another log scale to derive TCP Small Queue credit from current pacing rate. Initial value is 10, meaning that this patch does not change current behavior. We expect wifi drivers to set this field to smaller values (tests have been done with values from 6 to 9). They would have to use the following template: if (skb->sk && skb->sk->sk_pacing_shift != MY_PACING_SHIFT) skb->sk->sk_pacing_shift = MY_PACING_SHIFT; Ref: https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1670041 Signed-off-by: Eric Dumazet Cc: Johannes Berg Cc: Toke Høiland-Jørgensen Cc: Kir Kolyshkin Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- include/net/sock.h | 2 ++ net/core/sock.c | 1 + net/ipv4/tcp_output.c | 4 ++-- 3 files changed, 5 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 688a823dccc3..f8715c5af37d 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -267,6 +267,7 @@ struct sock_common { * @sk_gso_type: GSO type (e.g. %SKB_GSO_TCPV4) * @sk_gso_max_size: Maximum GSO segment size to build * @sk_gso_max_segs: Maximum number of GSO segments + * @sk_pacing_shift: scaling factor for TCP Small Queues * @sk_lingertime: %SO_LINGER l_linger setting * @sk_backlog: always used with the per-socket spinlock held * @sk_callback_lock: used with the callbacks in the end of this struct @@ -451,6 +452,7 @@ struct sock { kmemcheck_bitfield_end(flags); u16 sk_gso_max_segs; + u8 sk_pacing_shift; unsigned long sk_lingertime; struct proto *sk_prot_creator; rwlock_t sk_callback_lock; diff --git a/net/core/sock.c b/net/core/sock.c index 57bbd6040eb6..13719af7b4e3 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2746,6 +2746,7 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_max_pacing_rate = ~0U; sk->sk_pacing_rate = ~0U; + sk->sk_pacing_shift = 10; sk->sk_incoming_cpu = -1; /* * Before updating sk_refcnt, we must commit prior changes to memory diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 0256f7a41041..76dbe884f246 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1720,7 +1720,7 @@ u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, { u32 bytes, segs; - bytes = min(sk->sk_pacing_rate >> 10, + bytes = min(sk->sk_pacing_rate >> sk->sk_pacing_shift, sk->sk_gso_max_size - 1 - MAX_TCP_HEADER); /* Goal is to send at least one packet per ms, @@ -2198,7 +2198,7 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb, { unsigned int limit; - limit = max(2 * skb->truesize, sk->sk_pacing_rate >> 10); + limit = max(2 * skb->truesize, sk->sk_pacing_rate >> sk->sk_pacing_shift); limit = min_t(u32, limit, sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes); limit <<= factor; -- cgit v1.2.3 From 6d88207fcfddc002afe3e2e4a455e5201089d5d9 Mon Sep 17 00:00:00 2001 From: Ilya Lesokhin Date: Mon, 13 Nov 2017 10:22:45 +0200 Subject: tls: Add function to update the TLS socket configuration The tx configuration is now stored in ctx->tx_conf, and sk->sk_prot is updated through a function. This will simplify things when we add rx and support for different possible tx and rx cross configurations.
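[ Editorial aside: the "cross configurations" the message anticipates suggest the flat proto array will eventually be indexed by both directions. A hypothetical sketch of that end state, assuming a future rx_conf field that does not exist at this point in the series: ]

enum {
	TLS_BASE,
	TLS_SW,
	TLS_NUM_CONFIG,
};

/* One proto per (tx, rx) configuration pair; changing either side of
 * the configuration remains a single pointer assignment.
 */
static struct proto tls_protos[TLS_NUM_CONFIG][TLS_NUM_CONFIG];

static void update_sk_prot(struct sock *sk, struct tls_context *ctx)
{
	sk->sk_prot = &tls_protos[ctx->tx_conf][ctx->rx_conf];
}

Signed-off-by: Ilya Lesokhin Signed-off-by: David S.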
Miller --- include/net/tls.h | 2 ++ net/tls/tls_main.c | 46 ++++++++++++++++++++++++++++++++-------------- 2 files changed, 34 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/net/tls.h b/include/net/tls.h index b89d397dd62f..f058a6e08eaa 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -83,6 +83,8 @@ struct tls_context { void *priv_ctx; + u8 tx_conf:2; + u16 prepend_size; u16 tag_size; u16 overhead_size; diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 60aff60e30ad..de6a1416bc41 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -45,8 +45,18 @@ MODULE_AUTHOR("Mellanox Technologies"); MODULE_DESCRIPTION("Transport Layer Security Support"); MODULE_LICENSE("Dual BSD/GPL"); -static struct proto tls_base_prot; -static struct proto tls_sw_prot; +enum { + TLS_BASE_TX, + TLS_SW_TX, + TLS_NUM_CONFIG, +}; + +static struct proto tls_prots[TLS_NUM_CONFIG]; + +static inline void update_sk_prot(struct sock *sk, struct tls_context *ctx) +{ + sk->sk_prot = &tls_prots[ctx->tx_conf]; +} int wait_on_pending_writer(struct sock *sk, long *timeo) { @@ -340,8 +350,8 @@ static int do_tls_setsockopt_tx(struct sock *sk, char __user *optval, { struct tls_crypto_info *crypto_info, tmp_crypto_info; struct tls_context *ctx = tls_get_ctx(sk); - struct proto *prot = NULL; int rc = 0; + int tx_conf; if (!optval || (optlen < sizeof(*crypto_info))) { rc = -EINVAL; @@ -396,11 +406,12 @@ static int do_tls_setsockopt_tx(struct sock *sk, char __user *optval, /* currently SW is default, we will have ethtool in future */ rc = tls_set_sw_offload(sk, ctx); - prot = &tls_sw_prot; + tx_conf = TLS_SW_TX; if (rc) goto err_crypto_info; - sk->sk_prot = prot; + ctx->tx_conf = tx_conf; + update_sk_prot(sk, ctx); goto out; err_crypto_info: @@ -453,7 +464,9 @@ static int tls_init(struct sock *sk) icsk->icsk_ulp_data = ctx; ctx->setsockopt = sk->sk_prot->setsockopt; ctx->getsockopt = sk->sk_prot->getsockopt; - sk->sk_prot = &tls_base_prot; + + ctx->tx_conf = TLS_BASE_TX; + update_sk_prot(sk, ctx); out: return rc; } @@ -464,16 +477,21 @@ static struct tcp_ulp_ops tcp_tls_ulp_ops __read_mostly = { .init = tls_init, }; +static void build_protos(struct proto *prot, struct proto *base) +{ + prot[TLS_BASE_TX] = *base; + prot[TLS_BASE_TX].setsockopt = tls_setsockopt; + prot[TLS_BASE_TX].getsockopt = tls_getsockopt; + + prot[TLS_SW_TX] = prot[TLS_BASE_TX]; + prot[TLS_SW_TX].close = tls_sk_proto_close; + prot[TLS_SW_TX].sendmsg = tls_sw_sendmsg; + prot[TLS_SW_TX].sendpage = tls_sw_sendpage; +} + static int __init tls_register(void) { - tls_base_prot = tcp_prot; - tls_base_prot.setsockopt = tls_setsockopt; - tls_base_prot.getsockopt = tls_getsockopt; - - tls_sw_prot = tls_base_prot; - tls_sw_prot.sendmsg = tls_sw_sendmsg; - tls_sw_prot.sendpage = tls_sw_sendpage; - tls_sw_prot.close = tls_sk_proto_close; + build_protos(tls_prots, &tcp_prot); tcp_register_ulp(&tcp_tls_ulp_ops); -- cgit v1.2.3 From ff45d820a2df163957ad8ab459b6eb6976144c18 Mon Sep 17 00:00:00 2001 From: Ilya Lesokhin Date: Mon, 13 Nov 2017 10:22:46 +0200 Subject: tls: Fix TLS ulp context leak, when TLS_TX setsockopt is not used. Previously the TLS ulp context would leak if we attached a TLS ulp to a socket but did not use the TLS_TX setsockopt, or did use it but it failed. This patch solves the issue by overriding prot[TLS_BASE_TX].close and fixing tls_sk_proto_close to work properly when its called with ctx->tx_conf == TLS_BASE_TX. 
This patch also removes ctx->free_resources as we can use ctx->tx_conf to obtain the relevant information. Fixes: 3c4d7559159b ('tls: kernel TLS support') Signed-off-by: Ilya Lesokhin Signed-off-by: David S. Miller --- include/net/tls.h | 2 +- net/tls/tls_main.c | 22 ++++++++++++++-------- net/tls/tls_sw.c | 4 ++-- 3 files changed, 17 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/net/tls.h b/include/net/tls.h index f058a6e08eaa..7cb58a6b8fd0 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -99,7 +99,6 @@ struct tls_context { u16 pending_open_record_frags; int (*push_pending_record)(struct sock *sk, int flags); - void (*free_resources)(struct sock *sk); void (*sk_write_space)(struct sock *sk); void (*sk_proto_close)(struct sock *sk, long timeout); @@ -124,6 +123,7 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); int tls_sw_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags); void tls_sw_close(struct sock *sk, long timeout); +void tls_sw_free_tx_resources(struct sock *sk); void tls_sk_destruct(struct sock *sk, struct tls_context *ctx); void tls_icsk_clean_acked(struct sock *sk); diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index de6a1416bc41..13427ee7c582 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -226,6 +226,12 @@ static void tls_sk_proto_close(struct sock *sk, long timeout) void (*sk_proto_close)(struct sock *sk, long timeout); lock_sock(sk); + sk_proto_close = ctx->sk_proto_close; + + if (ctx->tx_conf == TLS_BASE_TX) { + kfree(ctx); + goto skip_tx_cleanup; + } if (!tls_complete_pending_work(sk, ctx, 0, &timeo)) tls_handle_open_record(sk, 0); @@ -242,13 +248,14 @@ static void tls_sk_proto_close(struct sock *sk, long timeout) sg++; } } - ctx->free_resources(sk); + kfree(ctx->rec_seq); kfree(ctx->iv); - sk_proto_close = ctx->sk_proto_close; - kfree(ctx); + if (ctx->tx_conf == TLS_SW_TX) + tls_sw_free_tx_resources(sk); +skip_tx_cleanup: release_sock(sk); sk_proto_close(sk, timeout); } @@ -402,8 +409,6 @@ static int do_tls_setsockopt_tx(struct sock *sk, char __user *optval, ctx->sk_write_space = sk->sk_write_space; sk->sk_write_space = tls_write_space; - ctx->sk_proto_close = sk->sk_prot->close; - /* currently SW is default, we will have ethtool in future */ rc = tls_set_sw_offload(sk, ctx); tx_conf = TLS_SW_TX; @@ -464,6 +469,7 @@ static int tls_init(struct sock *sk) icsk->icsk_ulp_data = ctx; ctx->setsockopt = sk->sk_prot->setsockopt; ctx->getsockopt = sk->sk_prot->getsockopt; + ctx->sk_proto_close = sk->sk_prot->close; ctx->tx_conf = TLS_BASE_TX; update_sk_prot(sk, ctx); @@ -480,11 +486,11 @@ static struct tcp_ulp_ops tcp_tls_ulp_ops __read_mostly = { static void build_protos(struct proto *prot, struct proto *base) { prot[TLS_BASE_TX] = *base; - prot[TLS_BASE_TX].setsockopt = tls_setsockopt; - prot[TLS_BASE_TX].getsockopt = tls_getsockopt; + prot[TLS_BASE_TX].setsockopt = tls_setsockopt; + prot[TLS_BASE_TX].getsockopt = tls_getsockopt; + prot[TLS_BASE_TX].close = tls_sk_proto_close; prot[TLS_SW_TX] = prot[TLS_BASE_TX]; - prot[TLS_SW_TX].close = tls_sk_proto_close; prot[TLS_SW_TX].sendmsg = tls_sw_sendmsg; prot[TLS_SW_TX].sendpage = tls_sw_sendpage; } diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index f00383a37622..fcd92a9c2d06 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -639,7 +639,7 @@ sendpage_end: return ret; } -static void tls_sw_free_resources(struct sock *sk) +void tls_sw_free_tx_resources(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); 
struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); @@ -650,6 +650,7 @@ static void tls_sw_free_resources(struct sock *sk) tls_free_both_sg(sk); kfree(ctx); + kfree(tls_ctx); } int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx) @@ -679,7 +680,6 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx) } ctx->priv_ctx = (struct tls_offload_context *)sw_ctx; - ctx->free_resources = tls_sw_free_resources; crypto_info = &ctx->crypto_send; switch (crypto_info->cipher_type) { -- cgit v1.2.3 From 213ef6e7c9c063c482d77f12cc438872628d48ec Mon Sep 17 00:00:00 2001 From: Ilya Lesokhin Date: Mon, 13 Nov 2017 10:22:47 +0200 Subject: tls: Move tls_make_aad to header to allow sharing move tls_make_aad as it is going to be reused by the device offload code and rx path. Remove unused recv parameter. Signed-off-by: Ilya Lesokhin Signed-off-by: David S. Miller --- include/net/tls.h | 15 +++++++++++++++ net/tls/tls_sw.c | 18 +----------------- 2 files changed, 16 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/net/tls.h b/include/net/tls.h index 7cb58a6b8fd0..70becd0a9299 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -214,6 +214,21 @@ static inline void tls_fill_prepend(struct tls_context *ctx, ctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv_size); } +static inline void tls_make_aad(char *buf, + size_t size, + char *record_sequence, + int record_sequence_size, + unsigned char record_type) +{ + memcpy(buf, record_sequence, record_sequence_size); + + buf[8] = record_type; + buf[9] = TLS_1_2_VERSION_MAJOR; + buf[10] = TLS_1_2_VERSION_MINOR; + buf[11] = size >> 8; + buf[12] = size & 0xFF; +} + static inline struct tls_context *tls_get_ctx(const struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index fcd92a9c2d06..73d19210dd49 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -39,22 +39,6 @@ #include -static inline void tls_make_aad(int recv, - char *buf, - size_t size, - char *record_sequence, - int record_sequence_size, - unsigned char record_type) -{ - memcpy(buf, record_sequence, record_sequence_size); - - buf[8] = record_type; - buf[9] = TLS_1_2_VERSION_MAJOR; - buf[10] = TLS_1_2_VERSION_MINOR; - buf[11] = size >> 8; - buf[12] = size & 0xFF; -} - static void trim_sg(struct sock *sk, struct scatterlist *sg, int *sg_num_elem, unsigned int *sg_size, int target_size) { @@ -249,7 +233,7 @@ static int tls_push_record(struct sock *sk, int flags, sg_mark_end(ctx->sg_plaintext_data + ctx->sg_plaintext_num_elem - 1); sg_mark_end(ctx->sg_encrypted_data + ctx->sg_encrypted_num_elem - 1); - tls_make_aad(0, ctx->aad_space, ctx->sg_plaintext_size, + tls_make_aad(ctx->aad_space, ctx->sg_plaintext_size, tls_ctx->rec_seq, tls_ctx->rec_seq_size, record_type); -- cgit v1.2.3 From 096d1dd0f03211fb42d6c2457f248827604b7f0e Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 13 Nov 2017 16:19:46 +0100 Subject: netlink: remove unused NETLINK SKB flags These flags are unused, remove them to be less confusing. Signed-off-by: Johannes Berg Signed-off-by: David S. 
Miller --- include/linux/netlink.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 6ddb4a5da371..49b4257ce1ea 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -17,9 +17,6 @@ static inline struct nlmsghdr *nlmsg_hdr(const struct sk_buff *skb) } enum netlink_skb_flags { - NETLINK_SKB_MMAPED = 0x1, /* Packet data is mmaped */ - NETLINK_SKB_TX = 0x2, /* Packet was sent by userspace */ - NETLINK_SKB_DELIVERED = 0x4, /* Packet was delivered */ NETLINK_SKB_DST = 0x8, /* Dst set in sendto or sendmsg */ }; -- cgit v1.2.3 From 0eef304bc9f7d079a1165e8cd2f24b078e9e1f2a Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Mon, 13 Nov 2017 03:37:06 +0300 Subject: uapi: fix linux/rxrpc.h userspace compilation errors Consistently use types provided by <linux/types.h> to fix the following linux/rxrpc.h userspace compilation errors: /usr/include/linux/rxrpc.h:24:2: error: unknown type name 'u16' u16 srx_service; /* service desired */ /usr/include/linux/rxrpc.h:25:2: error: unknown type name 'u16' u16 transport_type; /* type of transport socket (SOCK_DGRAM) */ /usr/include/linux/rxrpc.h:26:2: error: unknown type name 'u16' u16 transport_len; /* length of transport address */ Use __kernel_sa_family_t instead of sa_family_t the same way as uapi/linux/in.h does, to fix the following linux/rxrpc.h userspace compilation errors: /usr/include/linux/rxrpc.h:23:2: error: unknown type name 'sa_family_t' sa_family_t srx_family; /* address family */ /usr/include/linux/rxrpc.h:28:3: error: unknown type name 'sa_family_t' sa_family_t family; /* transport address family */ Fixes: 727f8914477e ("rxrpc: Expose UAPI definitions to userspace") Cc: # v4.14 Signed-off-by: Dmitry V. Levin Signed-off-by: David S. Miller --- include/uapi/linux/rxrpc.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/rxrpc.h b/include/uapi/linux/rxrpc.h index 9656aad8f8f7..9d4afea308a4 100644 --- a/include/uapi/linux/rxrpc.h +++ b/include/uapi/linux/rxrpc.h @@ -20,12 +20,12 @@ * RxRPC socket address */ struct sockaddr_rxrpc { - sa_family_t srx_family; /* address family */ - u16 srx_service; /* service desired */ - u16 transport_type; /* type of transport socket (SOCK_DGRAM) */ - u16 transport_len; /* length of transport address */ + __kernel_sa_family_t srx_family; /* address family */ + __u16 srx_service; /* service desired */ + __u16 transport_type; /* type of transport socket (SOCK_DGRAM) */ + __u16 transport_len; /* length of transport address */ union { - sa_family_t family; /* transport address family */ + __kernel_sa_family_t family; /* transport address family */ struct sockaddr_in sin; /* IPv4 transport address */ struct sockaddr_in6 sin6; /* IPv6 transport address */ } transport; -- cgit v1.2.3 From b9f3eb499d84f8d4adcb2f9212ec655700b28228 Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Tue, 14 Nov 2017 06:30:11 +0300 Subject: uapi: fix linux/tls.h userspace compilation error Move inclusion of a private kernel header (<net/tcp.h>) from uapi/linux/tls.h to its only user - net/tls.h, to fix the following linux/tls.h userspace compilation error: /usr/include/linux/tls.h:41:21: fatal error: net/tcp.h: No such file or directory Since up to this point uapi/linux/tls.h was totally unusable for userspace, clean up this header file further by moving other redundant includes to net/tls.h. Fixes: 3c4d7559159b ("tls: kernel TLS support") Cc: # v4.13+
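[ Editorial aside: with the private includes gone, a plain userspace program can finally consume linux/tls.h. A minimal sketch of a kTLS transmit setup follows; key material would come from a real TLS handshake, and error handling is trimmed. ]

#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <linux/tls.h>

#ifndef TCP_ULP
#define TCP_ULP 31	/* not yet in all libc headers */
#endif

/* Attach the "tls" ULP, then hand the kernel the AES-GCM-128 transmit
 * crypto state via the TLS_TX socket option.
 */
static int enable_ktls_tx(int fd)
{
	struct tls12_crypto_info_aes_gcm_128 ci;

	memset(&ci, 0, sizeof(ci));
	ci.info.version = TLS_1_2_VERSION;
	ci.info.cipher_type = TLS_CIPHER_AES_GCM_128;
	/* fill ci.key, ci.iv, ci.salt and ci.rec_seq from the handshake */

	if (setsockopt(fd, SOL_TCP, TCP_ULP, "tls", sizeof("tls")) < 0)
		return -1;
	return setsockopt(fd, SOL_TCP, TLS_TX, &ci, sizeof(ci));
}

Signed-off-by: Dmitry V. Levin Signed-off-by: David S.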
Miller --- include/net/tls.h | 4 ++++ include/uapi/linux/tls.h | 4 ---- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/tls.h b/include/net/tls.h index 70becd0a9299..936cfc5cab7d 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -35,6 +35,10 @@ #define _TLS_OFFLOAD_H #include +#include +#include +#include +#include #include diff --git a/include/uapi/linux/tls.h b/include/uapi/linux/tls.h index d5e0682ab837..293b2cdad88d 100644 --- a/include/uapi/linux/tls.h +++ b/include/uapi/linux/tls.h @@ -35,10 +35,6 @@ #define _UAPI_LINUX_TLS_H #include -#include -#include -#include -#include /* TLS socket options */ #define TLS_TX 1 /* Set transmit parameters */ -- cgit v1.2.3 From 6670e152447732ba90626f36dfc015a13fbf150e Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 14 Nov 2017 08:25:49 -0800 Subject: tcp: Namespace-ify sysctl_tcp_default_congestion_control Make the default TCP congestion control a per-namespace value. This changes the default congestion control to a pointer to congestion ops (rather than implicitly the first element of the available list). The congestion control setting of new namespaces is inherited from the current setting of the root namespace. Signed-off-by: Stephen Hemminger Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 6 ++-- net/ipv4/fib_semantics.c | 4 +-- net/ipv4/sysctl_net_ipv4.c | 19 +++++++----- net/ipv4/tcp_cong.c | 76 ++++++++++++++++++++++------------------------ net/ipv4/tcp_ipv4.c | 9 ++++++ net/ipv6/route.c | 3 +- 7 files changed, 64 insertions(+), 54 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 5e12975fc658..44668c29701a 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -160,6 +160,7 @@ struct netns_ipv4 { struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; + const struct tcp_congestion_ops __rcu *tcp_congestion_control; struct tcp_fastopen_context __rcu *tcp_fastopen_ctx; spinlock_t tcp_fastopen_ctx_lock; unsigned int sysctl_tcp_fastopen_blackhole_timeout; diff --git a/include/net/tcp.h b/include/net/tcp.h index ed71511e67a6..35cc7d0d3d47 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1002,8 +1002,8 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *type); void tcp_assign_congestion_control(struct sock *sk); void tcp_init_congestion_control(struct sock *sk); void tcp_cleanup_congestion_control(struct sock *sk); -int tcp_set_default_congestion_control(const char *name); -void tcp_get_default_congestion_control(char *name); +int tcp_set_default_congestion_control(struct net *net, const char *name); +void tcp_get_default_congestion_control(struct net *net, char *name); void tcp_get_available_congestion_control(char *buf, size_t len); void tcp_get_allowed_congestion_control(char *buf, size_t len); int tcp_set_allowed_congestion_control(char *allowed); @@ -1017,7 +1017,7 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked); extern struct tcp_congestion_ops tcp_reno; struct tcp_congestion_ops *tcp_ca_find_key(u32 key); -u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca); +u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca); #ifdef CONFIG_INET char *tcp_ca_get_name_by_key(u32 key, char *buffer); #else diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 589caaa90613..f04d944f8abe 100644 --- a/net/ipv4/fib_semantics.c
From 6670e152447732ba90626f36dfc015a13fbf150e Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 14 Nov 2017 08:25:49 -0800 Subject: tcp: Namespace-ify sysctl_tcp_default_congestion_control Make the default TCP congestion control a per-namespace value. This changes the default congestion control to a pointer to congestion ops (rather than implicitly the first element of the available list). The congestion control setting of new namespaces is inherited from the current setting of the root namespace. Signed-off-by: Stephen Hemminger Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 6 ++-- net/ipv4/fib_semantics.c | 4 +-- net/ipv4/sysctl_net_ipv4.c | 19 +++++++----- net/ipv4/tcp_cong.c | 76 ++++++++++++++++++++++------------------------ net/ipv4/tcp_ipv4.c | 9 ++++++ net/ipv6/route.c | 3 +- 7 files changed, 64 insertions(+), 54 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 5e12975fc658..44668c29701a 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -160,6 +160,7 @@ struct netns_ipv4 { struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; + const struct tcp_congestion_ops __rcu *tcp_congestion_control; struct tcp_fastopen_context __rcu *tcp_fastopen_ctx; spinlock_t tcp_fastopen_ctx_lock; unsigned int sysctl_tcp_fastopen_blackhole_timeout; diff --git a/include/net/tcp.h b/include/net/tcp.h index ed71511e67a6..35cc7d0d3d47 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1002,8 +1002,8 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *type); void tcp_assign_congestion_control(struct sock *sk); void tcp_init_congestion_control(struct sock *sk); void tcp_cleanup_congestion_control(struct sock *sk); -int tcp_set_default_congestion_control(const char *name); -void tcp_get_default_congestion_control(char *name); +int tcp_set_default_congestion_control(struct net *net, const char *name); +void tcp_get_default_congestion_control(struct net *net, char *name); void tcp_get_available_congestion_control(char *buf, size_t len); void tcp_get_allowed_congestion_control(char *buf, size_t len); int tcp_set_allowed_congestion_control(char *allowed); @@ -1017,7 +1017,7 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked); extern struct tcp_congestion_ops tcp_reno; struct tcp_congestion_ops *tcp_ca_find_key(u32 key); -u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca); +u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca); #ifdef CONFIG_INET char *tcp_ca_get_name_by_key(u32 key, char *buffer); #else diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 589caaa90613..f04d944f8abe 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -710,7 +710,7 @@ bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi) bool ecn_ca = false; nla_strlcpy(tmp, nla, sizeof(tmp)); - val = tcp_ca_get_key_by_name(tmp, &ecn_ca); + val = tcp_ca_get_key_by_name(fi->fib_net, tmp, &ecn_ca); } else { val = nla_get_u32(nla); } @@ -1030,7 +1030,7 @@ fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg) char tmp[TCP_CA_NAME_MAX]; nla_strlcpy(tmp, nla, sizeof(tmp)); - val = tcp_ca_get_key_by_name(tmp, &ecn_ca); + val = tcp_ca_get_key_by_name(fi->fib_net, tmp, &ecn_ca); if (val == TCP_CA_UNSPEC) return -EINVAL; } else { diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index ef0ff3357a44..93e172118a94 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -201,6 +201,8 @@ static int ipv4_ping_group_range(struct ctl_table *table, int write, static int proc_tcp_congestion_control(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { + struct net *net = container_of(ctl->data, struct net, + ipv4.tcp_congestion_control); char val[TCP_CA_NAME_MAX]; struct ctl_table tbl = { .data = val, @@ -208,11 +210,11 @@ static int proc_tcp_congestion_control(struct ctl_table *ctl, int write, }; int ret; - tcp_get_default_congestion_control(val); + tcp_get_default_congestion_control(net, val); ret = proc_dostring(&tbl, write, buffer, lenp, ppos); if (write && ret == 0) - ret = tcp_set_default_congestion_control(val); + ret = tcp_set_default_congestion_control(net, val); return ret; } @@ -447,12 +449,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, - { - .procname = "tcp_congestion_control", - .mode = 0644, - .maxlen = TCP_CA_NAME_MAX, - .proc_handler = proc_tcp_congestion_control, - }, #ifdef CONFIG_NETLABEL { .procname = "cipso_cache_enable", @@ -763,6 +759,13 @@ static struct ctl_table ipv4_net_table[] = { .extra1 = &one }, #endif + { + .procname = "tcp_congestion_control", + .data = &init_net.ipv4.tcp_congestion_control, + .mode = 0644, + .maxlen = TCP_CA_NAME_MAX, + .proc_handler = proc_tcp_congestion_control, + }, { .procname = "tcp_keepalive_time", .data = &init_net.ipv4.sysctl_tcp_keepalive_time, diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 2f26124fd160..bc6c02f16243 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -33,9 +33,11 @@ static struct tcp_congestion_ops *tcp_ca_find(const char *name) } /* Must be called with rcu lock held */ -static const struct tcp_congestion_ops *__tcp_ca_find_autoload(const char *name) +static struct tcp_congestion_ops *tcp_ca_find_autoload(struct net *net, + const char *name) { - const struct tcp_congestion_ops *ca = tcp_ca_find(name); + struct tcp_congestion_ops *ca = tcp_ca_find(name); + #ifdef CONFIG_MODULES if (!ca && capable(CAP_NET_ADMIN)) { rcu_read_unlock(); @@ -115,7 +117,7 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca) } EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control); -u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca) +u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca) { const struct tcp_congestion_ops *ca; u32 key = TCP_CA_UNSPEC; @@ -123,7 +125,7 @@ u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca) might_sleep(); rcu_read_lock(); - ca = __tcp_ca_find_autoload(name); + ca = tcp_ca_find_autoload(net, name); if (ca) { key = ca->key; *ecn_ca = ca->flags & TCP_CONG_NEEDS_ECN; @@ -153,23 +155,18 @@ EXPORT_SYMBOL_GPL(tcp_ca_get_name_by_key); /* Assign
choice of congestion control. */ void tcp_assign_congestion_control(struct sock *sk) { + struct net *net = sock_net(sk); struct inet_connection_sock *icsk = inet_csk(sk); - struct tcp_congestion_ops *ca; + const struct tcp_congestion_ops *ca; rcu_read_lock(); - list_for_each_entry_rcu(ca, &tcp_cong_list, list) { - if (likely(try_module_get(ca->owner))) { - icsk->icsk_ca_ops = ca; - goto out; - } - /* Fallback to next available. The last really - * guaranteed fallback is Reno from this list. - */ - } -out: + ca = rcu_dereference(net->ipv4.tcp_congestion_control); + if (unlikely(!try_module_get(ca->owner))) + ca = &tcp_reno; + icsk->icsk_ca_ops = ca; rcu_read_unlock(); - memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); + memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); if (ca->flags & TCP_CONG_NEEDS_ECN) INET_ECN_xmit(sk); else @@ -214,29 +211,27 @@ void tcp_cleanup_congestion_control(struct sock *sk) } /* Used by sysctl to change default congestion control */ -int tcp_set_default_congestion_control(const char *name) +int tcp_set_default_congestion_control(struct net *net, const char *name) { struct tcp_congestion_ops *ca; - int ret = -ENOENT; - - spin_lock(&tcp_cong_list_lock); - ca = tcp_ca_find(name); -#ifdef CONFIG_MODULES - if (!ca && capable(CAP_NET_ADMIN)) { - spin_unlock(&tcp_cong_list_lock); + const struct tcp_congestion_ops *prev; + int ret; - request_module("tcp_%s", name); - spin_lock(&tcp_cong_list_lock); - ca = tcp_ca_find(name); - } -#endif + rcu_read_lock(); + ca = tcp_ca_find_autoload(net, name); + if (!ca) { + ret = -ENOENT; + } else if (!try_module_get(ca->owner)) { + ret = -EBUSY; + } else { + prev = xchg(&net->ipv4.tcp_congestion_control, ca); + if (prev) + module_put(prev->owner); - if (ca) { - ca->flags |= TCP_CONG_NON_RESTRICTED; /* default is always allowed */ - list_move(&ca->list, &tcp_cong_list); + ca->flags |= TCP_CONG_NON_RESTRICTED; ret = 0; } - spin_unlock(&tcp_cong_list_lock); + rcu_read_unlock(); return ret; } @@ -244,7 +239,8 @@ int tcp_set_default_congestion_control(const char *name) /* Set default value from kernel configuration at bootup */ static int __init tcp_congestion_default(void) { - return tcp_set_default_congestion_control(CONFIG_DEFAULT_TCP_CONG); + return tcp_set_default_congestion_control(&init_net, + CONFIG_DEFAULT_TCP_CONG); } late_initcall(tcp_congestion_default); @@ -264,14 +260,12 @@ void tcp_get_available_congestion_control(char *buf, size_t maxlen) } /* Get current default congestion control */ -void tcp_get_default_congestion_control(char *name) +void tcp_get_default_congestion_control(struct net *net, char *name) { - struct tcp_congestion_ops *ca; - /* We will always have reno... 
*/ - BUG_ON(list_empty(&tcp_cong_list)); + const struct tcp_congestion_ops *ca; rcu_read_lock(); - ca = list_entry(tcp_cong_list.next, struct tcp_congestion_ops, list); + ca = rcu_dereference(net->ipv4.tcp_congestion_control); strncpy(name, ca->name, TCP_CA_NAME_MAX); rcu_read_unlock(); } @@ -351,12 +345,14 @@ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, bool reinit) if (!load) ca = tcp_ca_find(name); else - ca = __tcp_ca_find_autoload(name); + ca = tcp_ca_find_autoload(sock_net(sk), name); + /* No change asking for existing value */ if (ca == icsk->icsk_ca_ops) { icsk->icsk_ca_setsockopt = 1; goto out; } + if (!ca) { err = -ENOENT; } else if (!load) { diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 1eac84b8044e..c6bc0c4d19c6 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2430,6 +2430,8 @@ static void __net_exit tcp_sk_exit(struct net *net) { int cpu; + module_put(net->ipv4.tcp_congestion_control->owner); + for_each_possible_cpu(cpu) inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu)); free_percpu(net->ipv4.tcp_sk); @@ -2522,6 +2524,13 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60; atomic_set(&net->ipv4.tfo_active_disable_times, 0); + /* Reno is always built in */ + if (!net_eq(net, &init_net) && + try_module_get(init_net.ipv4.tcp_congestion_control->owner)) + net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control; + else + net->ipv4.tcp_congestion_control = &tcp_reno; + return 0; fail: tcp_sk_exit(net); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 70d9659fc1e9..05eb7bc36156 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2378,6 +2378,7 @@ out: static int ip6_convert_metrics(struct mx6_config *mxc, const struct fib6_config *cfg) { + struct net *net = cfg->fc_nlinfo.nl_net; bool ecn_ca = false; struct nlattr *nla; int remaining; @@ -2403,7 +2404,7 @@ static int ip6_convert_metrics(struct mx6_config *mxc, char tmp[TCP_CA_NAME_MAX]; nla_strlcpy(tmp, nla, sizeof(tmp)); - val = tcp_ca_get_key_by_name(tmp, &ecn_ca); + val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca); if (val == TCP_CA_UNSPEC) goto err; } else { -- cgit v1.2.3
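Illustration (not from the patch above): the namespace inheritance it describes can be observed from userspace, since each network namespace now exposes its own tcp_congestion_control sysctl. A sketch; it assumes CAP_SYS_ADMIN for unshare(CLONE_NEWNET) and the standard /proc/sys location.

/* netns_cc_check.c - sketch: show that a new network namespace inherits
 * the root namespace's default congestion control. Needs CAP_SYS_ADMIN.
 */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

static void show_default_cc(const char *label)
{
	char buf[32] = "";	/* TCP_CA_NAME_MAX is 16 in the kernel */
	FILE *f = fopen("/proc/sys/net/ipv4/tcp_congestion_control", "r");

	if (f) {
		if (fgets(buf, sizeof(buf), f))
			printf("%s: %s", label, buf);
		fclose(f);
	}
}

int main(void)
{
	show_default_cc("root netns");

	if (unshare(CLONE_NEWNET) == 0)
		show_default_cc("new netns");	/* same value, inherited */
	else
		perror("unshare(CLONE_NEWNET)");
	return 0;
}

Run as root this typically prints the same name twice (e.g. "cubic"), confirming that a new namespace starts from the root namespace's current default rather than from the compile-time one.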
From 50895b9de1d3e0258e015e8e55128d835d9a9f19 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 14 Nov 2017 21:02:19 -0800 Subject: tcp: highest_sack fix syzbot easily found a regression added in our latest patches [1]. No longer set tp->highest_sack to the head of the send queue, since this is not logical and is error-prone. Only SACK processing should maintain the pointer to an skb from the rtx queue. We might in the future only remember the sequence instead of a pointer to the skb, since the rb-tree should allow a fast lookup. [1] BUG: KASAN: use-after-free in tcp_highest_sack_seq include/net/tcp.h:1706 [inline] BUG: KASAN: use-after-free in tcp_ack+0x42bb/0x4fd0 net/ipv4/tcp_input.c:3537 Read of size 4 at addr ffff8801c154faa8 by task syz-executor4/12860 CPU: 0 PID: 12860 Comm: syz-executor4 Not tainted 4.14.0-next-20171113+ #41 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:17 [inline] dump_stack+0x194/0x257 lib/dump_stack.c:53 print_address_description+0x73/0x250 mm/kasan/report.c:252 kasan_report_error mm/kasan/report.c:351 [inline] kasan_report+0x25b/0x340 mm/kasan/report.c:409 __asan_report_load4_noabort+0x14/0x20 mm/kasan/report.c:429 tcp_highest_sack_seq include/net/tcp.h:1706 [inline] tcp_ack+0x42bb/0x4fd0 net/ipv4/tcp_input.c:3537 tcp_rcv_established+0x672/0x18a0 net/ipv4/tcp_input.c:5439 tcp_v4_do_rcv+0x2ab/0x7d0 net/ipv4/tcp_ipv4.c:1468 sk_backlog_rcv include/net/sock.h:909 [inline] __release_sock+0x124/0x360 net/core/sock.c:2264 release_sock+0xa4/0x2a0 net/core/sock.c:2778 tcp_sendmsg+0x3a/0x50 net/ipv4/tcp.c:1462 inet_sendmsg+0x11f/0x5e0 net/ipv4/af_inet.c:763 sock_sendmsg_nosec net/socket.c:632 [inline] sock_sendmsg+0xca/0x110 net/socket.c:642 ___sys_sendmsg+0x75b/0x8a0 net/socket.c:2048 __sys_sendmsg+0xe5/0x210 net/socket.c:2082 SYSC_sendmsg net/socket.c:2093 [inline] SyS_sendmsg+0x2d/0x50 net/socket.c:2089 entry_SYSCALL_64_fastpath+0x1f/0x96 RIP: 0033:0x452879 RSP: 002b:00007fc9761bfbe8 EFLAGS: 00000212 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 0000000000758020 RCX: 0000000000452879 RDX: 0000000000000000 RSI: 0000000020917fc8 RDI: 0000000000000015 RBP: 0000000000000086 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000212 R12: 00000000006ee3a0 R13: 00000000ffffffff R14: 00007fc9761c06d4 R15: 0000000000000000 Allocated by task 12860: save_stack+0x43/0xd0 mm/kasan/kasan.c:447 set_track mm/kasan/kasan.c:459 [inline] kasan_kmalloc+0xad/0xe0 mm/kasan/kasan.c:551 kasan_slab_alloc+0x12/0x20 mm/kasan/kasan.c:489 kmem_cache_alloc_node+0x144/0x760 mm/slab.c:3638 __alloc_skb+0xf1/0x780 net/core/skbuff.c:193 alloc_skb_fclone include/linux/skbuff.h:1023 [inline] sk_stream_alloc_skb+0x11d/0x900 net/ipv4/tcp.c:870 tcp_sendmsg_locked+0x1341/0x3b80 net/ipv4/tcp.c:1299 tcp_sendmsg+0x2f/0x50 net/ipv4/tcp.c:1461 inet_sendmsg+0x11f/0x5e0 net/ipv4/af_inet.c:763 sock_sendmsg_nosec net/socket.c:632 [inline] sock_sendmsg+0xca/0x110 net/socket.c:642 SYSC_sendto+0x358/0x5a0 net/socket.c:1749 SyS_sendto+0x40/0x50 net/socket.c:1717 entry_SYSCALL_64_fastpath+0x1f/0x96 Freed by task 12860: save_stack+0x43/0xd0 mm/kasan/kasan.c:447 set_track mm/kasan/kasan.c:459 [inline] kasan_slab_free+0x71/0xc0 mm/kasan/kasan.c:524 __cache_free mm/slab.c:3492 [inline] kmem_cache_free+0x77/0x280 mm/slab.c:3750 kfree_skbmem+0xdd/0x1d0 net/core/skbuff.c:603 __kfree_skb+0x1d/0x20 net/core/skbuff.c:642 sk_wmem_free_skb include/net/sock.h:1419 [inline] tcp_rtx_queue_unlink_and_free include/net/tcp.h:1682 [inline] tcp_clean_rtx_queue net/ipv4/tcp_input.c:3111 [inline] tcp_ack+0x1b17/0x4fd0 net/ipv4/tcp_input.c:3593 tcp_rcv_established+0x672/0x18a0 net/ipv4/tcp_input.c:5439 tcp_v4_do_rcv+0x2ab/0x7d0 net/ipv4/tcp_ipv4.c:1468 sk_backlog_rcv include/net/sock.h:909 [inline] __release_sock+0x124/0x360 net/core/sock.c:2264 release_sock+0xa4/0x2a0 net/core/sock.c:2778 tcp_sendmsg+0x3a/0x50 net/ipv4/tcp.c:1462 inet_sendmsg+0x11f/0x5e0 net/ipv4/af_inet.c:763 sock_sendmsg_nosec net/socket.c:632 [inline] sock_sendmsg+0xca/0x110
net/socket.c:642 ___sys_sendmsg+0x75b/0x8a0 net/socket.c:2048 __sys_sendmsg+0xe5/0x210 net/socket.c:2082 SYSC_sendmsg net/socket.c:2093 [inline] SyS_sendmsg+0x2d/0x50 net/socket.c:2089 entry_SYSCALL_64_fastpath+0x1f/0x96 The buggy address belongs to the object at ffff8801c154fa80 which belongs to the cache skbuff_fclone_cache of size 456 The buggy address is located 40 bytes inside of 456-byte region [ffff8801c154fa80, ffff8801c154fc48) The buggy address belongs to the page: page:ffffea00070553c0 count:1 mapcount:0 mapping:ffff8801c154f080 index:0x0 flags: 0x2fffc0000000100(slab) raw: 02fffc0000000100 ffff8801c154f080 0000000000000000 0000000100000006 raw: ffffea00070a5a20 ffffea0006a18360 ffff8801d9ca0500 0000000000000000 page dumped because: kasan: bad access detected Fixes: 737ff314563c ("tcp: use sequence distance to detect reordering") Signed-off-by: Eric Dumazet Cc: Yuchung Cheng Reported-by: syzbot Signed-off-by: David S. Miller --- include/net/tcp.h | 17 +++-------------- net/ipv4/tcp_input.c | 2 +- 2 files changed, 4 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 35cc7d0d3d47..85ea578195d4 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1630,9 +1630,6 @@ static inline void tcp_check_send_head(struct sock *sk, struct sk_buff *skb_unlinked) { if (tcp_write_queue_empty(sk)) tcp_chrono_stop(sk, TCP_CHRONO_BUSY); - - if (tcp_sk(sk)->highest_sack == skb_unlinked) - tcp_sk(sk)->highest_sack = NULL; } static inline void __tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb) @@ -1645,12 +1642,8 @@ static inline void tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb) __tcp_add_write_queue_tail(sk, skb); /* Queue it, remembering where we must start sending. */ - if (sk->sk_write_queue.next == skb) { + if (sk->sk_write_queue.next == skb) tcp_chrono_start(sk, TCP_CHRONO_BUSY); - - if (tcp_sk(sk)->highest_sack == NULL) - tcp_sk(sk)->highest_sack = skb; - } } /* Insert new before skb on the write queue of sk. */ @@ -1708,9 +1701,7 @@ static inline u32 tcp_highest_sack_seq(struct tcp_sock *tp) static inline void tcp_advance_highest_sack(struct sock *sk, struct sk_buff *skb) { - struct sk_buff *next = skb_rb_next(skb); - - tcp_sk(sk)->highest_sack = next ?: tcp_send_head(sk); + tcp_sk(sk)->highest_sack = skb_rb_next(skb); } static inline struct sk_buff *tcp_highest_sack(struct sock *sk) @@ -1720,9 +1711,7 @@ static inline struct sk_buff *tcp_highest_sack(struct sock *sk) static inline void tcp_highest_sack_reset(struct sock *sk) { - struct sk_buff *skb = tcp_rtx_queue_head(sk); - - tcp_sk(sk)->highest_sack = skb ?: tcp_send_head(sk); + tcp_sk(sk)->highest_sack = tcp_rtx_queue_head(sk); } /* Called when old skb is about to be deleted and replaced by new skb */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index c3447c5512fd..f0b572fe959a 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3534,7 +3534,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) icsk->icsk_retransmits = 0; } - prior_fack = tcp_highest_sack_seq(tp); + prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una; rs.prior_in_flight = tcp_packets_in_flight(tp); /* ts_recent update must be made after we are sure that the packet -- cgit v1.2.3
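The commit message above floats replacing the cached highest_sack pointer with a sequence number plus an rb-tree lookup. A hypothetical sketch of such a lookup (not part of this patch), written for kernel context and assuming the rb_to_skb()/before() helpers and the sk->tcp_rtx_queue rb-root introduced by the retransmit-queue rework:

/* Hypothetical helper: re-find the rtx-queue skb covering @seq in
 * O(log N), instead of caching an skb pointer that can go stale.
 */
static struct sk_buff *tcp_rtx_lookup_seq(struct sock *sk, u32 seq)
{
	struct rb_node *node = sk->tcp_rtx_queue.rb_node;

	while (node) {
		struct sk_buff *skb = rb_to_skb(node);

		if (before(seq, TCP_SKB_CB(skb)->seq))
			node = node->rb_left;
		else if (!before(seq, TCP_SKB_CB(skb)->end_seq))
			node = node->rb_right;
		else
			return skb;	/* seq falls in [seq, end_seq) */
	}
	return NULL;	/* already ACKed and freed, or never sent */
}

Remembering only the sequence would make the stale-pointer class of bugs impossible by construction, at the price of one tree descent per lookup.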