From 24ea591d2201c3257d666466e8fac50a6cf3c52f Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 6 Jul 2015 05:18:03 -0700
Subject: net: sched: extend percpu stats helpers

qdisc_bstats_update_cpu() and other helpers were added to support
percpu stats for qdisc.

We want to add percpu stats for tc action, so this patch add common
helpers.

qdisc_bstats_update_cpu() is renamed to qdisc_bstats_cpu_update()
qdisc_qstats_drop_cpu() is renamed to qdisc_qstats_cpu_drop()

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h | 31 +++++++++++++++++++++----------
 1 file changed, 21 insertions(+), 10 deletions(-)

(limited to 'include/net')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 2738f6f87908..2eab08c38e32 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -513,17 +513,20 @@ static inline void bstats_update(struct gnet_stats_basic_packed *bstats,
 	bstats->packets += skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1;
 }
 
-static inline void qdisc_bstats_update_cpu(struct Qdisc *sch,
-					   const struct sk_buff *skb)
+static inline void bstats_cpu_update(struct gnet_stats_basic_cpu *bstats,
+				     const struct sk_buff *skb)
 {
-	struct gnet_stats_basic_cpu *bstats =
-				this_cpu_ptr(sch->cpu_bstats);
-
 	u64_stats_update_begin(&bstats->syncp);
 	bstats_update(&bstats->bstats, skb);
 	u64_stats_update_end(&bstats->syncp);
 }
 
+static inline void qdisc_bstats_cpu_update(struct Qdisc *sch,
+					   const struct sk_buff *skb)
+{
+	bstats_cpu_update(this_cpu_ptr(sch->cpu_bstats), skb);
+}
+
 static inline void qdisc_bstats_update(struct Qdisc *sch,
 				       const struct sk_buff *skb)
 {
@@ -547,16 +550,24 @@ static inline void __qdisc_qstats_drop(struct Qdisc *sch, int count)
 	sch->qstats.drops += count;
 }
 
-static inline void qdisc_qstats_drop(struct Qdisc *sch)
+static inline void qstats_drop_inc(struct gnet_stats_queue *qstats)
 {
-	sch->qstats.drops++;
+	qstats->drops++;
 }
 
-static inline void qdisc_qstats_drop_cpu(struct Qdisc *sch)
+static inline void qstats_overlimit_inc(struct gnet_stats_queue *qstats)
 {
-	struct gnet_stats_queue *qstats = this_cpu_ptr(sch->cpu_qstats);
+	qstats->overlimits++;
+}
 
-	qstats->drops++;
+static inline void qdisc_qstats_drop(struct Qdisc *sch)
+{
+	qstats_drop_inc(&sch->qstats);
+}
+
+static inline void qdisc_qstats_cpu_drop(struct Qdisc *sch)
+{
+	qstats_drop_inc(this_cpu_ptr(sch->cpu_qstats));
 }
 
 static inline void qdisc_qstats_overlimit(struct Qdisc *sch)
-- 
cgit v1.2.3


From 519c818e8fb646eef1e8bfedd18519bec47bc9a9 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 6 Jul 2015 05:18:04 -0700
Subject: net: sched: add percpu stats to actions

Reuse existing percpu infrastructure John Fastabend added for qdisc.

This patch adds a new cpustats parameter to tcf_hash_create() and all
actions pass false, meaning this patch should have no effect yet.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/act_api.h    |  4 +++-
 net/sched/act_api.c      | 44 ++++++++++++++++++++++++++++++++++----------
 net/sched/act_bpf.c      |  2 +-
 net/sched/act_connmark.c |  3 ++-
 net/sched/act_csum.c     |  3 ++-
 net/sched/act_gact.c     |  3 ++-
 net/sched/act_ipt.c      |  2 +-
 net/sched/act_mirred.c   |  3 ++-
 net/sched/act_nat.c      |  3 ++-
 net/sched/act_pedit.c    |  3 ++-
 net/sched/act_simple.c   |  3 ++-
 net/sched/act_skbedit.c  |  3 ++-
 net/sched/act_vlan.c     |  3 ++-
 13 files changed, 57 insertions(+), 22 deletions(-)

(limited to 'include/net')

diff --git a/include/net/act_api.h b/include/net/act_api.h
index 3ee4c92afd1b..db2063ffd181 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -21,6 +21,8 @@ struct tcf_common {
 	struct gnet_stats_rate_est64	tcfc_rate_est;
 	spinlock_t			tcfc_lock;
 	struct rcu_head			tcfc_rcu;
+	struct gnet_stats_basic_cpu __percpu *cpu_bstats;
+	struct gnet_stats_queue __percpu *cpu_qstats;
 };
 #define tcf_head	common.tcfc_head
 #define tcf_index	common.tcfc_index
@@ -103,7 +105,7 @@ int tcf_hash_release(struct tc_action *a, int bind);
 u32 tcf_hash_new_index(struct tcf_hashinfo *hinfo);
 int tcf_hash_check(u32 index, struct tc_action *a, int bind);
 int tcf_hash_create(u32 index, struct nlattr *est, struct tc_action *a,
-		    int size, int bind);
+		    int size, int bind, bool cpustats);
 void tcf_hash_cleanup(struct tc_action *a, struct nlattr *est);
 void tcf_hash_insert(struct tc_action *a);
 
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index af427a3dbcba..074a32f466f8 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -27,6 +27,15 @@
 #include <net/act_api.h>
 #include <net/netlink.h>
 
+static void free_tcf(struct rcu_head *head)
+{
+	struct tcf_common *p = container_of(head, struct tcf_common, tcfc_rcu);
+
+	free_percpu(p->cpu_bstats);
+	free_percpu(p->cpu_qstats);
+	kfree(p);
+}
+
 void tcf_hash_destroy(struct tc_action *a)
 {
 	struct tcf_common *p = a->priv;
@@ -41,7 +50,7 @@ void tcf_hash_destroy(struct tc_action *a)
 	 * gen_estimator est_timer() might access p->tcfc_lock
 	 * or bstats, wait a RCU grace period before freeing p
 	 */
-	kfree_rcu(p, tcfc_rcu);
+	call_rcu(&p->tcfc_rcu, free_tcf);
 }
 EXPORT_SYMBOL(tcf_hash_destroy);
 
@@ -230,15 +239,16 @@ void tcf_hash_cleanup(struct tc_action *a, struct nlattr *est)
 	if (est)
 		gen_kill_estimator(&pc->tcfc_bstats,
 				   &pc->tcfc_rate_est);
-	kfree_rcu(pc, tcfc_rcu);
+	call_rcu(&pc->tcfc_rcu, free_tcf);
 }
 EXPORT_SYMBOL(tcf_hash_cleanup);
 
 int tcf_hash_create(u32 index, struct nlattr *est, struct tc_action *a,
-		    int size, int bind)
+		    int size, int bind, bool cpustats)
 {
 	struct tcf_hashinfo *hinfo = a->ops->hinfo;
 	struct tcf_common *p = kzalloc(size, GFP_KERNEL);
+	int err = -ENOMEM;
 
 	if (unlikely(!p))
 		return -ENOMEM;
@@ -246,18 +256,32 @@ int tcf_hash_create(u32 index, struct nlattr *est, struct tc_action *a,
 	if (bind)
 		p->tcfc_bindcnt = 1;
 
+	if (cpustats) {
+		p->cpu_bstats = netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu);
+		if (!p->cpu_bstats) {
+err1:
+			kfree(p);
+			return err;
+		}
+		p->cpu_qstats = alloc_percpu(struct gnet_stats_queue);
+		if (!p->cpu_qstats) {
+err2:
+			free_percpu(p->cpu_bstats);
+			goto err1;
+		}
+	}
 	spin_lock_init(&p->tcfc_lock);
 	INIT_HLIST_NODE(&p->tcfc_head);
 	p->tcfc_index = index ? index : tcf_hash_new_index(hinfo);
 	p->tcfc_tm.install = jiffies;
 	p->tcfc_tm.lastuse = jiffies;
 	if (est) {
-		int err = gen_new_estimator(&p->tcfc_bstats, NULL,
-					    &p->tcfc_rate_est,
-					    &p->tcfc_lock, est);
+		err = gen_new_estimator(&p->tcfc_bstats, p->cpu_bstats,
+					&p->tcfc_rate_est,
+					&p->tcfc_lock, est);
 		if (err) {
-			kfree(p);
-			return err;
+			free_percpu(p->cpu_qstats);
+			goto err2;
 		}
 	}
 
@@ -615,10 +639,10 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a,
 	if (err < 0)
 		goto errout;
 
-	if (gnet_stats_copy_basic(&d, NULL, &p->tcfc_bstats) < 0 ||
+	if (gnet_stats_copy_basic(&d, p->cpu_bstats, &p->tcfc_bstats) < 0 ||
 	    gnet_stats_copy_rate_est(&d, &p->tcfc_bstats,
 				     &p->tcfc_rate_est) < 0 ||
-	    gnet_stats_copy_queue(&d, NULL,
+	    gnet_stats_copy_queue(&d, p->cpu_qstats,
 				  &p->tcfc_qstats,
 				  p->tcfc_qstats.qlen) < 0)
 		goto errout;
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index 1d56903fd4c7..99aa271633e9 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -281,7 +281,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla,
 
 	if (!tcf_hash_check(parm->index, act, bind)) {
 		ret = tcf_hash_create(parm->index, est, act,
-				      sizeof(*prog), bind);
+				      sizeof(*prog), bind, false);
 		if (ret < 0)
 			goto destroy_fp;
 
diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c
index 295d14bd6c67..f2b540220ad0 100644
--- a/net/sched/act_connmark.c
+++ b/net/sched/act_connmark.c
@@ -108,7 +108,8 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla,
 	parm = nla_data(tb[TCA_CONNMARK_PARMS]);
 
 	if (!tcf_hash_check(parm->index, a, bind)) {
-		ret = tcf_hash_create(parm->index, est, a, sizeof(*ci), bind);
+		ret = tcf_hash_create(parm->index, est, a, sizeof(*ci),
+				      bind, false);
 		if (ret)
 			return ret;
 
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index 4cd5cf1aedf8..b07c535ba8e7 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -62,7 +62,8 @@ static int tcf_csum_init(struct net *n, struct nlattr *nla, struct nlattr *est,
 	parm = nla_data(tb[TCA_CSUM_PARMS]);
 
 	if (!tcf_hash_check(parm->index, a, bind)) {
-		ret = tcf_hash_create(parm->index, est, a, sizeof(*p), bind);
+		ret = tcf_hash_create(parm->index, est, a, sizeof(*p),
+				      bind, false);
 		if (ret)
 			return ret;
 		ret = ACT_P_CREATED;
diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index 7fffc2272701..a4f8af29ee30 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -85,7 +85,8 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla,
 #endif
 
 	if (!tcf_hash_check(parm->index, a, bind)) {
-		ret = tcf_hash_create(parm->index, est, a, sizeof(*gact), bind);
+		ret = tcf_hash_create(parm->index, est, a, sizeof(*gact),
+				      bind, false);
 		if (ret)
 			return ret;
 		ret = ACT_P_CREATED;
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index cbc8dd7dd48a..99c9cc1c7af9 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -114,7 +114,7 @@ static int tcf_ipt_init(struct net *net, struct nlattr *nla, struct nlattr *est,
 		index = nla_get_u32(tb[TCA_IPT_INDEX]);
 
 	if (!tcf_hash_check(index, a, bind) ) {
-		ret = tcf_hash_create(index, est, a, sizeof(*ipt), bind);
+		ret = tcf_hash_create(index, est, a, sizeof(*ipt), bind, false);
 		if (ret)
 			return ret;
 		ret = ACT_P_CREATED;
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index a42a3b257226..002cd6c83dc6 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -93,7 +93,8 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
 	if (!tcf_hash_check(parm->index, a, bind)) {
 		if (dev == NULL)
 			return -EINVAL;
-		ret = tcf_hash_create(parm->index, est, a, sizeof(*m), bind);
+		ret = tcf_hash_create(parm->index, est, a, sizeof(*m),
+				      bind, false);
 		if (ret)
 			return ret;
 		ret = ACT_P_CREATED;
diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c
index 270a030d5fd0..5be0b3c1c5b0 100644
--- a/net/sched/act_nat.c
+++ b/net/sched/act_nat.c
@@ -55,7 +55,8 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est,
 	parm = nla_data(tb[TCA_NAT_PARMS]);
 
 	if (!tcf_hash_check(parm->index, a, bind)) {
-		ret = tcf_hash_create(parm->index, est, a, sizeof(*p), bind);
+		ret = tcf_hash_create(parm->index, est, a, sizeof(*p),
+				      bind, false);
 		if (ret)
 			return ret;
 		ret = ACT_P_CREATED;
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index 17e6d6669c7f..ce8676ad892f 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -57,7 +57,8 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
 	if (!tcf_hash_check(parm->index, a, bind)) {
 		if (!parm->nkeys)
 			return -EINVAL;
-		ret = tcf_hash_create(parm->index, est, a, sizeof(*p), bind);
+		ret = tcf_hash_create(parm->index, est, a, sizeof(*p),
+				      bind, false);
 		if (ret)
 			return ret;
 		p = to_pedit(a);
diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c
index 6a8d9488613a..d6b708d6afdf 100644
--- a/net/sched/act_simple.c
+++ b/net/sched/act_simple.c
@@ -103,7 +103,8 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla,
 	defdata = nla_data(tb[TCA_DEF_DATA]);
 
 	if (!tcf_hash_check(parm->index, a, bind)) {
-		ret = tcf_hash_create(parm->index, est, a, sizeof(*d), bind);
+		ret = tcf_hash_create(parm->index, est, a, sizeof(*d),
+				      bind, false);
 		if (ret)
 			return ret;
 
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
index fcfeeaf838be..6751b5f8c046 100644
--- a/net/sched/act_skbedit.c
+++ b/net/sched/act_skbedit.c
@@ -99,7 +99,8 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
 	parm = nla_data(tb[TCA_SKBEDIT_PARMS]);
 
 	if (!tcf_hash_check(parm->index, a, bind)) {
-		ret = tcf_hash_create(parm->index, est, a, sizeof(*d), bind);
+		ret = tcf_hash_create(parm->index, est, a, sizeof(*d),
+				      bind, false);
 		if (ret)
 			return ret;
 
diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index d735ecf0b1a7..796785e0bf96 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -116,7 +116,8 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
 	action = parm->v_action;
 
 	if (!tcf_hash_check(parm->index, a, bind)) {
-		ret = tcf_hash_create(parm->index, est, a, sizeof(*v), bind);
+		ret = tcf_hash_create(parm->index, est, a, sizeof(*v),
+				      bind, false);
 		if (ret)
 			return ret;
 
-- 
cgit v1.2.3


From cc6510a9504fd3c03d76bd68d99653148342eecc Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 6 Jul 2015 05:18:06 -0700
Subject: net_sched: act_gact: use a separate packet counters for gact_determ()

Second step for gact RCU operation :

We want to get rid of the spinlock protecting gact operations.
Stats (packets/bytes) will soon be per cpu.

gact_determ() would not work without a central packet counter,
so lets add it for this mode.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tc_act/tc_gact.h | 7 ++++---
 net/sched/act_gact.c         | 4 +++-
 2 files changed, 7 insertions(+), 4 deletions(-)

(limited to 'include/net')

diff --git a/include/net/tc_act/tc_gact.h b/include/net/tc_act/tc_gact.h
index 9fc9b578908a..592a6bc02b0b 100644
--- a/include/net/tc_act/tc_gact.h
+++ b/include/net/tc_act/tc_gact.h
@@ -6,9 +6,10 @@
 struct tcf_gact {
 	struct tcf_common	common;
 #ifdef CONFIG_GACT_PROB
-        u16			tcfg_ptype;
-        u16			tcfg_pval;
-        int			tcfg_paction;
+	u16			tcfg_ptype;
+	u16			tcfg_pval;
+	int			tcfg_paction;
+	atomic_t		packets;
 #endif
 };
 #define to_gact(a) \
diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index 22a3a61aa090..2f9bec584b3f 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -36,8 +36,10 @@ static int gact_net_rand(struct tcf_gact *gact)
 
 static int gact_determ(struct tcf_gact *gact)
 {
+	u32 pack = atomic_inc_return(&gact->packets);
+
 	smp_rmb(); /* coupled with smp_wmb() in tcf_gact_init() */
-	if (gact->tcf_bstats.packets % gact->tcfg_pval)
+	if (pack % gact->tcfg_pval)
 		return gact->tcf_action;
 	return gact->tcfg_paction;
 }
-- 
cgit v1.2.3


From 56e5d1ca183d8616fab377d7d466c244b4dbb3b9 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 6 Jul 2015 05:18:08 -0700
Subject: net_sched: act_gact: remove spinlock in fast path

Final step for gact RCU operation :

1) Use percpu stats
2) update lastuse only every clock tick to avoid false sharing
3) Remove spinlock acquisition, as it is no longer needed.

Since this is the last contended lock in packet RX when tc gact is used,
this gives impressive gain.

My host with 8 RX queues was handling 5 Mpps before the patch,
and more than 11 Mpps after patch.

Tested:

On receiver :

dev=eth0
tc qdisc del dev $dev ingress 2>/dev/null
tc qdisc add dev $dev ingress
tc filter del dev $dev root pref 10 2>/dev/null
tc filter del dev $dev pref 10 2>/dev/null
tc filter add dev $dev est 1sec 4sec parent ffff: protocol ip prio 1 \
	u32 match ip src 7.0.0.0/8 flowid 1:15 action drop

Sender sends packets flood from 7/8 network

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Alexei Starovoitov <ast@plumgrid.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/act_api.h | 11 +++++++++++
 net/sched/act_gact.c  | 17 +++++++----------
 2 files changed, 18 insertions(+), 10 deletions(-)

(limited to 'include/net')

diff --git a/include/net/act_api.h b/include/net/act_api.h
index db2063ffd181..8d2a707a9e87 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -70,6 +70,17 @@ static inline void tcf_hashinfo_destroy(struct tcf_hashinfo *hf)
 	kfree(hf->htab);
 }
 
+/* Update lastuse only if needed, to avoid dirtying a cache line.
+ * We use a temp variable to avoid fetching jiffies twice.
+ */
+static inline void tcf_lastuse_update(struct tcf_t *tm)
+{
+	unsigned long now = jiffies;
+
+	if (tm->lastuse != now)
+		tm->lastuse = now;
+}
+
 #ifdef CONFIG_NET_CLS_ACT
 
 #define ACT_P_CREATED 1
diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index e4eb88d3d8dc..5c1b05170736 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -90,7 +90,7 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla,
 
 	if (!tcf_hash_check(parm->index, a, bind)) {
 		ret = tcf_hash_create(parm->index, est, a, sizeof(*gact),
-				      bind, false);
+				      bind, true);
 		if (ret)
 			return ret;
 		ret = ACT_P_CREATED;
@@ -104,7 +104,7 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla,
 
 	gact = to_gact(a);
 
-	spin_lock_bh(&gact->tcf_lock);
+	ASSERT_RTNL();
 	gact->tcf_action = parm->action;
 #ifdef CONFIG_GACT_PROB
 	if (p_parm) {
@@ -117,7 +117,6 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla,
 		gact->tcfg_ptype   = p_parm->ptype;
 	}
 #endif
-	spin_unlock_bh(&gact->tcf_lock);
 	if (ret == ACT_P_CREATED)
 		tcf_hash_insert(a);
 	return ret;
@@ -127,9 +126,8 @@ static int tcf_gact(struct sk_buff *skb, const struct tc_action *a,
 		    struct tcf_result *res)
 {
 	struct tcf_gact *gact = a->priv;
-	int action = gact->tcf_action;
+	int action = READ_ONCE(gact->tcf_action);
 
-	spin_lock(&gact->tcf_lock);
 #ifdef CONFIG_GACT_PROB
 	{
 	u32 ptype = READ_ONCE(gact->tcfg_ptype);
@@ -138,12 +136,11 @@ static int tcf_gact(struct sk_buff *skb, const struct tc_action *a,
 		action = gact_rand[ptype](gact);
 	}
 #endif
-	gact->tcf_bstats.bytes += qdisc_pkt_len(skb);
-	gact->tcf_bstats.packets++;
+	bstats_cpu_update(this_cpu_ptr(gact->common.cpu_bstats), skb);
 	if (action == TC_ACT_SHOT)
-		gact->tcf_qstats.drops++;
-	gact->tcf_tm.lastuse = jiffies;
-	spin_unlock(&gact->tcf_lock);
+		qstats_drop_inc(this_cpu_ptr(gact->common.cpu_qstats));
+
+	tcf_lastuse_update(&gact->tcf_tm);
 
 	return action;
 }
-- 
cgit v1.2.3


From 2ee22a90c7afac265bb6f7abea610b938195e2b8 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 6 Jul 2015 05:18:09 -0700
Subject: net_sched: act_mirred: remove spinlock in fast path

Like act_gact, act_mirred can be lockless in packet processing

1) Use percpu stats
2) update lastuse only every clock tick to avoid false sharing
3) use rcu to protect tcfm_dev
4) Remove spinlock usage, as it is no longer needed.

Next step : add multi queue capability to ifb device

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Cc: John Fastabend <john.fastabend@gmail.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Acked-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tc_act/tc_mirred.h |  2 +-
 net/sched/act_mirred.c         | 57 ++++++++++++++++++++++--------------------
 2 files changed, 31 insertions(+), 28 deletions(-)

(limited to 'include/net')

diff --git a/include/net/tc_act/tc_mirred.h b/include/net/tc_act/tc_mirred.h
index 4dd77a1c106b..dae96bae1c19 100644
--- a/include/net/tc_act/tc_mirred.h
+++ b/include/net/tc_act/tc_mirred.h
@@ -8,7 +8,7 @@ struct tcf_mirred {
 	int			tcfm_eaction;
 	int			tcfm_ifindex;
 	int			tcfm_ok_push;
-	struct net_device	*tcfm_dev;
+	struct net_device __rcu	*tcfm_dev;
 	struct list_head	tcfm_list;
 };
 #define to_mirred(a) \
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 002cd6c83dc6..19cd8904efa0 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -35,9 +35,11 @@ static LIST_HEAD(mirred_list);
 static void tcf_mirred_release(struct tc_action *a, int bind)
 {
 	struct tcf_mirred *m = to_mirred(a);
+	struct net_device *dev = rcu_dereference_protected(m->tcfm_dev, 1);
+
 	list_del(&m->tcfm_list);
-	if (m->tcfm_dev)
-		dev_put(m->tcfm_dev);
+	if (dev)
+		dev_put(dev);
 }
 
 static const struct nla_policy mirred_policy[TCA_MIRRED_MAX + 1] = {
@@ -94,7 +96,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
 		if (dev == NULL)
 			return -EINVAL;
 		ret = tcf_hash_create(parm->index, est, a, sizeof(*m),
-				      bind, false);
+				      bind, true);
 		if (ret)
 			return ret;
 		ret = ACT_P_CREATED;
@@ -106,18 +108,18 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
 	}
 	m = to_mirred(a);
 
-	spin_lock_bh(&m->tcf_lock);
+	ASSERT_RTNL();
 	m->tcf_action = parm->action;
 	m->tcfm_eaction = parm->eaction;
 	if (dev != NULL) {
 		m->tcfm_ifindex = parm->ifindex;
 		if (ret != ACT_P_CREATED)
-			dev_put(m->tcfm_dev);
+			dev_put(rcu_dereference_protected(m->tcfm_dev, 1));
 		dev_hold(dev);
-		m->tcfm_dev = dev;
+		rcu_assign_pointer(m->tcfm_dev, dev);
 		m->tcfm_ok_push = ok_push;
 	}
-	spin_unlock_bh(&m->tcf_lock);
+
 	if (ret == ACT_P_CREATED) {
 		list_add(&m->tcfm_list, &mirred_list);
 		tcf_hash_insert(a);
@@ -132,20 +134,22 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a,
 	struct tcf_mirred *m = a->priv;
 	struct net_device *dev;
 	struct sk_buff *skb2;
+	int retval, err;
 	u32 at;
-	int retval, err = 1;
 
-	spin_lock(&m->tcf_lock);
-	m->tcf_tm.lastuse = jiffies;
-	bstats_update(&m->tcf_bstats, skb);
+	tcf_lastuse_update(&m->tcf_tm);
+
+	bstats_cpu_update(this_cpu_ptr(m->common.cpu_bstats), skb);
 
-	dev = m->tcfm_dev;
-	if (!dev) {
-		printk_once(KERN_NOTICE "tc mirred: target device is gone\n");
+	rcu_read_lock();
+	retval = READ_ONCE(m->tcf_action);
+	dev = rcu_dereference(m->tcfm_dev);
+	if (unlikely(!dev)) {
+		pr_notice_once("tc mirred: target device is gone\n");
 		goto out;
 	}
 
-	if (!(dev->flags & IFF_UP)) {
+	if (unlikely(!(dev->flags & IFF_UP))) {
 		net_notice_ratelimited("tc mirred to Houston: device %s is down\n",
 				       dev->name);
 		goto out;
@@ -153,7 +157,7 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a,
 
 	at = G_TC_AT(skb->tc_verd);
 	skb2 = skb_clone(skb, GFP_ATOMIC);
-	if (skb2 == NULL)
+	if (!skb2)
 		goto out;
 
 	if (!(at & AT_EGRESS)) {
@@ -169,16 +173,13 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a,
 	skb2->dev = dev;
 	err = dev_queue_xmit(skb2);
 
-out:
 	if (err) {
-		m->tcf_qstats.overlimits++;
+out:
+		qstats_overlimit_inc(this_cpu_ptr(m->common.cpu_qstats));
 		if (m->tcfm_eaction != TCA_EGRESS_MIRROR)
 			retval = TC_ACT_SHOT;
-		else
-			retval = m->tcf_action;
-	} else
-		retval = m->tcf_action;
-	spin_unlock(&m->tcf_lock);
+	}
+	rcu_read_unlock();
 
 	return retval;
 }
@@ -217,14 +218,16 @@ static int mirred_device_event(struct notifier_block *unused,
 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 	struct tcf_mirred *m;
 
+	ASSERT_RTNL();
 	if (event == NETDEV_UNREGISTER)
 		list_for_each_entry(m, &mirred_list, tcfm_list) {
-			spin_lock_bh(&m->tcf_lock);
-			if (m->tcfm_dev == dev) {
+			if (rcu_access_pointer(m->tcfm_dev) == dev) {
 				dev_put(dev);
-				m->tcfm_dev = NULL;
+				/* Note : no rcu grace period necessary, as
+				 * net_device are already rcu protected.
+				 */
+				RCU_INIT_POINTER(m->tcfm_dev, NULL);
 			}
-			spin_unlock_bh(&m->tcf_lock);
 		}
 
 	return NOTIFY_DONE;
-- 
cgit v1.2.3


From 071d5080e33d6f24139e4213c2d9f97a2c21b602 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Thu, 9 Jul 2015 13:16:29 -0700
Subject: tcp: add tcp_in_slow_start helper

Add a helper to test the slow start condition in various congestion
control modules and other places. This is to prepare a slight improvement
in policy as to exactly when to slow start.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Nandita Dukkipati <nanditad@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h        | 7 ++++++-
 net/ipv4/tcp_bic.c       | 2 +-
 net/ipv4/tcp_cong.c      | 2 +-
 net/ipv4/tcp_cubic.c     | 4 ++--
 net/ipv4/tcp_highspeed.c | 2 +-
 net/ipv4/tcp_htcp.c      | 2 +-
 net/ipv4/tcp_illinois.c  | 2 +-
 net/ipv4/tcp_metrics.c   | 2 +-
 net/ipv4/tcp_scalable.c  | 2 +-
 net/ipv4/tcp_vegas.c     | 6 +++---
 net/ipv4/tcp_veno.c      | 2 +-
 11 files changed, 19 insertions(+), 14 deletions(-)

(limited to 'include/net')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 950cfecaad3c..dba22fc1b065 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -989,6 +989,11 @@ static inline unsigned int tcp_packets_in_flight(const struct tcp_sock *tp)
 
 #define TCP_INFINITE_SSTHRESH	0x7fffffff
 
+static inline bool tcp_in_slow_start(const struct tcp_sock *tp)
+{
+	return tp->snd_cwnd <= tp->snd_ssthresh;
+}
+
 static inline bool tcp_in_initial_slowstart(const struct tcp_sock *tp)
 {
 	return tp->snd_ssthresh >= TCP_INFINITE_SSTHRESH;
@@ -1065,7 +1070,7 @@ static inline bool tcp_is_cwnd_limited(const struct sock *sk)
 	const struct tcp_sock *tp = tcp_sk(sk);
 
 	/* If in slow start, ensure cwnd grows to twice what was ACKed. */
-	if (tp->snd_cwnd <= tp->snd_ssthresh)
+	if (tcp_in_slow_start(tp))
 		return tp->snd_cwnd < 2 * tp->max_packets_out;
 
 	return tp->is_cwnd_limited;
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index c037644eafb7..fd1405d37c14 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -146,7 +146,7 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
 	if (!tcp_is_cwnd_limited(sk))
 		return;
 
-	if (tp->snd_cwnd <= tp->snd_ssthresh)
+	if (tcp_in_slow_start(tp))
 		tcp_slow_start(tp, acked);
 	else {
 		bictcp_update(ca, tp->snd_cwnd);
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 84be008c945c..654729a8cb23 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -413,7 +413,7 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
 		return;
 
 	/* In "safe" area, increase. */
-	if (tp->snd_cwnd <= tp->snd_ssthresh) {
+	if (tcp_in_slow_start(tp)) {
 		acked = tcp_slow_start(tp, acked);
 		if (!acked)
 			return;
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index 06d3d665a9fd..28011fb1f4a2 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -320,7 +320,7 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
 	if (!tcp_is_cwnd_limited(sk))
 		return;
 
-	if (tp->snd_cwnd <= tp->snd_ssthresh) {
+	if (tcp_in_slow_start(tp)) {
 		if (hystart && after(ack, ca->end_seq))
 			bictcp_hystart_reset(sk);
 		acked = tcp_slow_start(tp, acked);
@@ -439,7 +439,7 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us)
 		ca->delay_min = delay;
 
 	/* hystart triggers when cwnd is larger than some threshold */
-	if (hystart && tp->snd_cwnd <= tp->snd_ssthresh &&
+	if (hystart && tcp_in_slow_start(tp) &&
 	    tp->snd_cwnd >= hystart_low_window)
 		hystart_update(sk, delay);
 }
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c
index 882c08aae2f5..db7842495a64 100644
--- a/net/ipv4/tcp_highspeed.c
+++ b/net/ipv4/tcp_highspeed.c
@@ -116,7 +116,7 @@ static void hstcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
 	if (!tcp_is_cwnd_limited(sk))
 		return;
 
-	if (tp->snd_cwnd <= tp->snd_ssthresh)
+	if (tcp_in_slow_start(tp))
 		tcp_slow_start(tp, acked);
 	else {
 		/* Update AIMD parameters.
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
index 58469fff6c18..82f0d9ed60f5 100644
--- a/net/ipv4/tcp_htcp.c
+++ b/net/ipv4/tcp_htcp.c
@@ -236,7 +236,7 @@ static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
 	if (!tcp_is_cwnd_limited(sk))
 		return;
 
-	if (tp->snd_cwnd <= tp->snd_ssthresh)
+	if (tcp_in_slow_start(tp))
 		tcp_slow_start(tp, acked);
 	else {
 		/* In dangerous area, increase slowly.
diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c
index f71002e4db0b..2ab9bbb6faff 100644
--- a/net/ipv4/tcp_illinois.c
+++ b/net/ipv4/tcp_illinois.c
@@ -268,7 +268,7 @@ static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 acked)
 		return;
 
 	/* In slow start */
-	if (tp->snd_cwnd <= tp->snd_ssthresh)
+	if (tcp_in_slow_start(tp))
 		tcp_slow_start(tp, acked);
 
 	else {
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index a51d63a43e33..b3d64f61d922 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -461,7 +461,7 @@ void tcp_update_metrics(struct sock *sk)
 				tcp_metric_set(tm, TCP_METRIC_CWND,
 					       tp->snd_cwnd);
 		}
-	} else if (tp->snd_cwnd > tp->snd_ssthresh &&
+	} else if (!tcp_in_slow_start(tp) &&
 		   icsk->icsk_ca_state == TCP_CA_Open) {
 		/* Cong. avoidance phase, cwnd is reliable. */
 		if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH))
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
index 333bcb2415ff..bf5ea9e9bbc1 100644
--- a/net/ipv4/tcp_scalable.c
+++ b/net/ipv4/tcp_scalable.c
@@ -22,7 +22,7 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked)
 	if (!tcp_is_cwnd_limited(sk))
 		return;
 
-	if (tp->snd_cwnd <= tp->snd_ssthresh)
+	if (tcp_in_slow_start(tp))
 		tcp_slow_start(tp, acked);
 	else
 		tcp_cong_avoid_ai(tp, min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT),
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index a6cea1d5e20d..13951c4087d4 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -225,7 +225,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked)
 			 */
 			diff = tp->snd_cwnd * (rtt-vegas->baseRTT) / vegas->baseRTT;
 
-			if (diff > gamma && tp->snd_cwnd <= tp->snd_ssthresh) {
+			if (diff > gamma && tcp_in_slow_start(tp)) {
 				/* Going too fast. Time to slow down
 				 * and switch to congestion avoidance.
 				 */
@@ -240,7 +240,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked)
 				tp->snd_cwnd = min(tp->snd_cwnd, (u32)target_cwnd+1);
 				tp->snd_ssthresh = tcp_vegas_ssthresh(tp);
 
-			} else if (tp->snd_cwnd <= tp->snd_ssthresh) {
+			} else if (tcp_in_slow_start(tp)) {
 				/* Slow start.  */
 				tcp_slow_start(tp, acked);
 			} else {
@@ -281,7 +281,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked)
 		vegas->minRTT = 0x7fffffff;
 	}
 	/* Use normal slow start */
-	else if (tp->snd_cwnd <= tp->snd_ssthresh)
+	else if (tcp_in_slow_start(tp))
 		tcp_slow_start(tp, acked);
 }
 
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
index 112151eeee45..0d094b995cd9 100644
--- a/net/ipv4/tcp_veno.c
+++ b/net/ipv4/tcp_veno.c
@@ -150,7 +150,7 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
 
 		veno->diff = (tp->snd_cwnd << V_PARAM_SHIFT) - target_cwnd;
 
-		if (tp->snd_cwnd <= tp->snd_ssthresh) {
+		if (tcp_in_slow_start(tp)) {
 			/* Slow start.  */
 			tcp_slow_start(tp, acked);
 		} else {
-- 
cgit v1.2.3


From 76174004a0f19785a328f40388e87e982bbf69b9 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Thu, 9 Jul 2015 13:16:30 -0700
Subject: tcp: do not slow start when cwnd equals ssthresh

In the original design slow start is only used to raise cwnd
when cwnd is stricly below ssthresh. It makes little sense
to slow start when cwnd == ssthresh: especially
when hystart has set ssthresh in the initial ramp, or after
recovery when cwnd resets to ssthresh. Not doing so will
also help reduce the buffer bloat slightly.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Nandita Dukkipati <nanditad@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h    | 2 +-
 net/ipv4/tcp_cdg.c   | 2 +-
 net/ipv4/tcp_cong.c  | 4 +---
 net/ipv4/tcp_hybla.c | 2 +-
 4 files changed, 4 insertions(+), 6 deletions(-)

(limited to 'include/net')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index dba22fc1b065..364426a2be5a 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -991,7 +991,7 @@ static inline unsigned int tcp_packets_in_flight(const struct tcp_sock *tp)
 
 static inline bool tcp_in_slow_start(const struct tcp_sock *tp)
 {
-	return tp->snd_cwnd <= tp->snd_ssthresh;
+	return tp->snd_cwnd < tp->snd_ssthresh;
 }
 
 static inline bool tcp_in_initial_slowstart(const struct tcp_sock *tp)
diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp_cdg.c
index 8c6fd3d5e40f..167b6a3e1b98 100644
--- a/net/ipv4/tcp_cdg.c
+++ b/net/ipv4/tcp_cdg.c
@@ -264,7 +264,7 @@ static void tcp_cdg_cong_avoid(struct sock *sk, u32 ack, u32 acked)
 	u32 prior_snd_cwnd;
 	u32 incr;
 
-	if (tp->snd_cwnd < tp->snd_ssthresh && hystart_detect)
+	if (tcp_in_slow_start(tp) && hystart_detect)
 		tcp_cdg_hystart_update(sk);
 
 	if (after(ack, ca->rtt_seq) && ca->rtt.v64) {
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 654729a8cb23..a2ed23c595cf 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -365,10 +365,8 @@ int tcp_set_congestion_control(struct sock *sk, const char *name)
  */
 u32 tcp_slow_start(struct tcp_sock *tp, u32 acked)
 {
-	u32 cwnd = tp->snd_cwnd + acked;
+	u32 cwnd = min(tp->snd_cwnd + acked, tp->snd_ssthresh);
 
-	if (cwnd > tp->snd_ssthresh)
-		cwnd = tp->snd_ssthresh + 1;
 	acked -= cwnd - tp->snd_cwnd;
 	tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp);
 
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c
index f963b274f2b0..083831e359df 100644
--- a/net/ipv4/tcp_hybla.c
+++ b/net/ipv4/tcp_hybla.c
@@ -112,7 +112,7 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked)
 
 	rho_fractions = ca->rho_3ls - (ca->rho << 3);
 
-	if (tp->snd_cwnd < tp->snd_ssthresh) {
+	if (tcp_in_slow_start(tp)) {
 		/*
 		 * slow start
 		 *      INC = 2^RHO - 1
-- 
cgit v1.2.3


From 3fd2f1b9d91284afb957ab9899a83279d0e09f29 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 8 Jul 2015 14:28:28 -0700
Subject: inet: remove BUG_ON() in twsk_destructor()

Kernel will crash the same if one of the pointer is NULL anyway.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/timewait_sock.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/net')

diff --git a/include/net/timewait_sock.h b/include/net/timewait_sock.h
index 68f0ecad6c6e..1a47946f95ba 100644
--- a/include/net/timewait_sock.h
+++ b/include/net/timewait_sock.h
@@ -33,9 +33,6 @@ static inline int twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 
 static inline void twsk_destructor(struct sock *sk)
 {
-	BUG_ON(sk == NULL);
-	BUG_ON(sk->sk_prot == NULL);
-	BUG_ON(sk->sk_prot->twsk_prot == NULL);
 	if (sk->sk_prot->twsk_prot->twsk_destructor != NULL)
 		sk->sk_prot->twsk_prot->twsk_destructor(sk);
 }
-- 
cgit v1.2.3


From fc01538f9fb75572c969ca9988176ffc2a8741d6 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 8 Jul 2015 14:28:29 -0700
Subject: inet: simplify timewait refcounting

timewait sockets have a complex refcounting logic.
Once we realize it should be similar to established and
syn_recv sockets, we can use sk_nulls_del_node_init_rcu()
and remove inet_twsk_unhash()

In particular, deferred inet_twsk_put() added in commit
13475a30b66cd ("tcp: connect() race with timewait reuse")
looks unecessary : When removing a timewait socket from
ehash or bhash, caller must own a reference on the socket
anyway.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_hashtables.h    |  4 ++--
 include/net/inet_timewait_sock.h |  6 ++----
 net/ipv4/inet_hashtables.c       | 31 ++++++++++-------------------
 net/ipv4/inet_timewait_sock.c    | 42 ++++++----------------------------------
 net/ipv6/inet6_hashtables.c      |  6 +-----
 5 files changed, 21 insertions(+), 68 deletions(-)

(limited to 'include/net')

diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index b73c88a19dd4..b07d126694a7 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -205,8 +205,8 @@ void inet_put_port(struct sock *sk);
 
 void inet_hashinfo_init(struct inet_hashinfo *h);
 
-int __inet_hash_nolisten(struct sock *sk, struct inet_timewait_sock *tw);
-int __inet_hash(struct sock *sk, struct inet_timewait_sock *tw);
+void __inet_hash_nolisten(struct sock *sk, struct sock *osk);
+void __inet_hash(struct sock *sk, struct sock *osk);
 void inet_hash(struct sock *sk);
 void inet_unhash(struct sock *sk);
 
diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index 360c4802288d..96f52a4711c8 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -100,10 +100,8 @@ static inline struct inet_timewait_sock *inet_twsk(const struct sock *sk)
 void inet_twsk_free(struct inet_timewait_sock *tw);
 void inet_twsk_put(struct inet_timewait_sock *tw);
 
-int inet_twsk_unhash(struct inet_timewait_sock *tw);
-
-int inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
-			  struct inet_hashinfo *hashinfo);
+void inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
+			   struct inet_hashinfo *hashinfo);
 
 struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk,
 					   struct inet_timewait_death_row *dr,
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 5f9b063bbe8a..e58840330da7 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -343,7 +343,6 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
 	struct sock *sk2;
 	const struct hlist_nulls_node *node;
 	struct inet_timewait_sock *tw = NULL;
-	int twrefcnt = 0;
 
 	spin_lock(lock);
 
@@ -371,12 +370,10 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
 	WARN_ON(!sk_unhashed(sk));
 	__sk_nulls_add_node_rcu(sk, &head->chain);
 	if (tw) {
-		twrefcnt = inet_twsk_unhash(tw);
+		sk_nulls_del_node_init_rcu((struct sock *)tw);
 		NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED);
 	}
 	spin_unlock(lock);
-	if (twrefcnt)
-		inet_twsk_put(tw);
 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 
 	if (twp) {
@@ -384,7 +381,6 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
 	} else if (tw) {
 		/* Silly. Should hash-dance instead... */
 		inet_twsk_deschedule(tw);
-
 		inet_twsk_put(tw);
 	}
 	return 0;
@@ -403,13 +399,12 @@ static u32 inet_sk_port_offset(const struct sock *sk)
 					  inet->inet_dport);
 }
 
-int __inet_hash_nolisten(struct sock *sk, struct inet_timewait_sock *tw)
+void __inet_hash_nolisten(struct sock *sk, struct sock *osk)
 {
 	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
 	struct hlist_nulls_head *list;
 	struct inet_ehash_bucket *head;
 	spinlock_t *lock;
-	int twrefcnt = 0;
 
 	WARN_ON(!sk_unhashed(sk));
 
@@ -420,23 +415,22 @@ int __inet_hash_nolisten(struct sock *sk, struct inet_timewait_sock *tw)
 
 	spin_lock(lock);
 	__sk_nulls_add_node_rcu(sk, list);
-	if (tw) {
-		WARN_ON(sk->sk_hash != tw->tw_hash);
-		twrefcnt = inet_twsk_unhash(tw);
+	if (osk) {
+		WARN_ON(sk->sk_hash != osk->sk_hash);
+		sk_nulls_del_node_init_rcu(osk);
 	}
 	spin_unlock(lock);
 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
-	return twrefcnt;
 }
 EXPORT_SYMBOL_GPL(__inet_hash_nolisten);
 
-int __inet_hash(struct sock *sk, struct inet_timewait_sock *tw)
+void __inet_hash(struct sock *sk, struct sock *osk)
 {
 	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
 	struct inet_listen_hashbucket *ilb;
 
 	if (sk->sk_state != TCP_LISTEN)
-		return __inet_hash_nolisten(sk, tw);
+		return __inet_hash_nolisten(sk, osk);
 
 	WARN_ON(!sk_unhashed(sk));
 	ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
@@ -445,7 +439,6 @@ int __inet_hash(struct sock *sk, struct inet_timewait_sock *tw)
 	__sk_nulls_add_node_rcu(sk, &ilb->head);
 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 	spin_unlock(&ilb->lock);
-	return 0;
 }
 EXPORT_SYMBOL(__inet_hash);
 
@@ -492,7 +485,6 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
 	struct inet_bind_bucket *tb;
 	int ret;
 	struct net *net = sock_net(sk);
-	int twrefcnt = 1;
 
 	if (!snum) {
 		int i, remaining, low, high, port;
@@ -560,18 +552,15 @@ ok:
 		inet_bind_hash(sk, tb, port);
 		if (sk_unhashed(sk)) {
 			inet_sk(sk)->inet_sport = htons(port);
-			twrefcnt += __inet_hash_nolisten(sk, tw);
+			__inet_hash_nolisten(sk, (struct sock *)tw);
 		}
 		if (tw)
-			twrefcnt += inet_twsk_bind_unhash(tw, hinfo);
+			inet_twsk_bind_unhash(tw, hinfo);
 		spin_unlock(&head->lock);
 
 		if (tw) {
 			inet_twsk_deschedule(tw);
-			while (twrefcnt) {
-				twrefcnt--;
-				inet_twsk_put(tw);
-			}
+			inet_twsk_put(tw);
 		}
 
 		ret = 0;
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 2ffbd16b79e0..92cd4d50404e 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -17,28 +17,6 @@
 #include <net/ip.h>
 
 
-/**
- *	inet_twsk_unhash - unhash a timewait socket from established hash
- *	@tw: timewait socket
- *
- *	unhash a timewait socket from established hash, if hashed.
- *	ehash lock must be held by caller.
- *	Returns 1 if caller should call inet_twsk_put() after lock release.
- */
-int inet_twsk_unhash(struct inet_timewait_sock *tw)
-{
-	if (hlist_nulls_unhashed(&tw->tw_node))
-		return 0;
-
-	hlist_nulls_del_rcu(&tw->tw_node);
-	sk_nulls_node_init(&tw->tw_node);
-	/*
-	 * We cannot call inet_twsk_put() ourself under lock,
-	 * caller must call it for us.
-	 */
-	return 1;
-}
-
 /**
  *	inet_twsk_bind_unhash - unhash a timewait socket from bind hash
  *	@tw: timewait socket
@@ -48,35 +26,29 @@ int inet_twsk_unhash(struct inet_timewait_sock *tw)
  *	bind hash lock must be held by caller.
  *	Returns 1 if caller should call inet_twsk_put() after lock release.
  */
-int inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
+void inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
 			  struct inet_hashinfo *hashinfo)
 {
 	struct inet_bind_bucket *tb = tw->tw_tb;
 
 	if (!tb)
-		return 0;
+		return;
 
 	__hlist_del(&tw->tw_bind_node);
 	tw->tw_tb = NULL;
 	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
-	/*
-	 * We cannot call inet_twsk_put() ourself under lock,
-	 * caller must call it for us.
-	 */
-	return 1;
+	__sock_put((struct sock *)tw);
 }
 
 /* Must be called with locally disabled BHs. */
 static void inet_twsk_kill(struct inet_timewait_sock *tw)
 {
 	struct inet_hashinfo *hashinfo = tw->tw_dr->hashinfo;
-	struct inet_bind_hashbucket *bhead;
-	int refcnt;
-	/* Unlink from established hashes. */
 	spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash);
+	struct inet_bind_hashbucket *bhead;
 
 	spin_lock(lock);
-	refcnt = inet_twsk_unhash(tw);
+	sk_nulls_del_node_init_rcu((struct sock *)tw);
 	spin_unlock(lock);
 
 	/* Disassociate with bind bucket. */
@@ -84,11 +56,9 @@ static void inet_twsk_kill(struct inet_timewait_sock *tw)
 			hashinfo->bhash_size)];
 
 	spin_lock(&bhead->lock);
-	refcnt += inet_twsk_bind_unhash(tw, hashinfo);
+	inet_twsk_bind_unhash(tw, hashinfo);
 	spin_unlock(&bhead->lock);
 
-	BUG_ON(refcnt >= atomic_read(&tw->tw_refcnt));
-	atomic_sub(refcnt, &tw->tw_refcnt);
 	atomic_dec(&tw->tw_dr->tw_count);
 	inet_twsk_put(tw);
 }
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index b4fd96de97e6..a237398aa2b4 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -207,7 +207,6 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row,
 	struct sock *sk2;
 	const struct hlist_nulls_node *node;
 	struct inet_timewait_sock *tw = NULL;
-	int twrefcnt = 0;
 
 	spin_lock(lock);
 
@@ -234,12 +233,10 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row,
 	WARN_ON(!sk_unhashed(sk));
 	__sk_nulls_add_node_rcu(sk, &head->chain);
 	if (tw) {
-		twrefcnt = inet_twsk_unhash(tw);
+		sk_nulls_del_node_init_rcu((struct sock *)tw);
 		NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED);
 	}
 	spin_unlock(lock);
-	if (twrefcnt)
-		inet_twsk_put(tw);
 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 
 	if (twp) {
@@ -247,7 +244,6 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row,
 	} else if (tw) {
 		/* Silly. Should hash-dance instead... */
 		inet_twsk_deschedule(tw);
-
 		inet_twsk_put(tw);
 	}
 	return 0;
-- 
cgit v1.2.3


From dbe7faa4045ea83a37b691b12bb02a8f86c2d2e9 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 8 Jul 2015 14:28:30 -0700
Subject: inet: inet_twsk_deschedule factorization

inet_twsk_deschedule() calls are followed by inet_twsk_put().

Only particular case is in inet_twsk_purge() but there is no point
to defer the inet_twsk_put() after re-enabling BH.

Lets rename inet_twsk_deschedule() to inet_twsk_deschedule_put()
and move the inet_twsk_put() inside.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_timewait_sock.h |  2 +-
 net/ipv4/inet_hashtables.c       |  9 +++------
 net/ipv4/inet_timewait_sock.c    | 13 ++++++++-----
 net/ipv4/tcp_ipv4.c              |  3 +--
 net/ipv4/tcp_minisocks.c         |  6 ++----
 net/ipv6/inet6_hashtables.c      |  3 +--
 net/ipv6/tcp_ipv6.c              |  3 +--
 net/netfilter/xt_TPROXY.c        |  6 ++----
 8 files changed, 19 insertions(+), 26 deletions(-)

(limited to 'include/net')

diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index 96f52a4711c8..879d6e5a973b 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -111,7 +111,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
 			   struct inet_hashinfo *hashinfo);
 
 void inet_twsk_schedule(struct inet_timewait_sock *tw, const int timeo);
-void inet_twsk_deschedule(struct inet_timewait_sock *tw);
+void inet_twsk_deschedule_put(struct inet_timewait_sock *tw);
 
 void inet_twsk_purge(struct inet_hashinfo *hashinfo,
 		     struct inet_timewait_death_row *twdr, int family);
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index e58840330da7..f8b3701a6c3c 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -380,8 +380,7 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
 		*twp = tw;
 	} else if (tw) {
 		/* Silly. Should hash-dance instead... */
-		inet_twsk_deschedule(tw);
-		inet_twsk_put(tw);
+		inet_twsk_deschedule_put(tw);
 	}
 	return 0;
 
@@ -558,10 +557,8 @@ ok:
 			inet_twsk_bind_unhash(tw, hinfo);
 		spin_unlock(&head->lock);
 
-		if (tw) {
-			inet_twsk_deschedule(tw);
-			inet_twsk_put(tw);
-		}
+		if (tw)
+			inet_twsk_deschedule_put(tw);
 
 		ret = 0;
 		goto out;
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 92cd4d50404e..ae22cc24fbe8 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -205,13 +205,17 @@ EXPORT_SYMBOL_GPL(inet_twsk_alloc);
  * tcp_input.c to verify this.
  */
 
-/* This is for handling early-kills of TIME_WAIT sockets. */
-void inet_twsk_deschedule(struct inet_timewait_sock *tw)
+/* This is for handling early-kills of TIME_WAIT sockets.
+ * Warning : consume reference.
+ * Caller should not access tw anymore.
+ */
+void inet_twsk_deschedule_put(struct inet_timewait_sock *tw)
 {
 	if (del_timer_sync(&tw->tw_timer))
 		inet_twsk_kill(tw);
+	inet_twsk_put(tw);
 }
-EXPORT_SYMBOL(inet_twsk_deschedule);
+EXPORT_SYMBOL(inet_twsk_deschedule_put);
 
 void inet_twsk_schedule(struct inet_timewait_sock *tw, const int timeo)
 {
@@ -281,9 +285,8 @@ restart:
 
 			rcu_read_unlock();
 			local_bh_disable();
-			inet_twsk_deschedule(tw);
+			inet_twsk_deschedule_put(tw);
 			local_bh_enable();
-			inet_twsk_put(tw);
 			goto restart_rcu;
 		}
 		/* If the nulls value we got at the end of this lookup is
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index d7d4c2b79cf2..486ba96ae91a 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1683,8 +1683,7 @@ do_time_wait:
 							iph->daddr, th->dest,
 							inet_iif(skb));
 		if (sk2) {
-			inet_twsk_deschedule(inet_twsk(sk));
-			inet_twsk_put(inet_twsk(sk));
+			inet_twsk_deschedule_put(inet_twsk(sk));
 			sk = sk2;
 			goto process;
 		}
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 4bc00cb79e60..6d8795b066ac 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -147,8 +147,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
 		if (!th->fin ||
 		    TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) {
 kill_with_rst:
-			inet_twsk_deschedule(tw);
-			inet_twsk_put(tw);
+			inet_twsk_deschedule_put(tw);
 			return TCP_TW_RST;
 		}
 
@@ -198,8 +197,7 @@ kill_with_rst:
 			 */
 			if (sysctl_tcp_rfc1337 == 0) {
 kill:
-				inet_twsk_deschedule(tw);
-				inet_twsk_put(tw);
+				inet_twsk_deschedule_put(tw);
 				return TCP_TW_SUCCESS;
 			}
 		}
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index a237398aa2b4..6ac8dad0138a 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -243,8 +243,7 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row,
 		*twp = tw;
 	} else if (tw) {
 		/* Silly. Should hash-dance instead... */
-		inet_twsk_deschedule(tw);
-		inet_twsk_put(tw);
+		inet_twsk_deschedule_put(tw);
 	}
 	return 0;
 
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 6748c4277aff..d540846a1a79 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1481,8 +1481,7 @@ do_time_wait:
 					    ntohs(th->dest), tcp_v6_iif(skb));
 		if (sk2) {
 			struct inet_timewait_sock *tw = inet_twsk(sk);
-			inet_twsk_deschedule(tw);
-			inet_twsk_put(tw);
+			inet_twsk_deschedule_put(tw);
 			sk = sk2;
 			tcp_v6_restore_cb(skb);
 			goto process;
diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c
index cca96cec1b68..d0c96c5ae29a 100644
--- a/net/netfilter/xt_TPROXY.c
+++ b/net/netfilter/xt_TPROXY.c
@@ -272,8 +272,7 @@ tproxy_handle_time_wait4(struct sk_buff *skb, __be32 laddr, __be16 lport,
 					    hp->source, lport ? lport : hp->dest,
 					    skb->dev, NFT_LOOKUP_LISTENER);
 		if (sk2) {
-			inet_twsk_deschedule(inet_twsk(sk));
-			inet_twsk_put(inet_twsk(sk));
+			inet_twsk_deschedule_put(inet_twsk(sk));
 			sk = sk2;
 		}
 	}
@@ -437,8 +436,7 @@ tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff,
 					    tgi->lport ? tgi->lport : hp->dest,
 					    skb->dev, NFT_LOOKUP_LISTENER);
 		if (sk2) {
-			inet_twsk_deschedule(inet_twsk(sk));
-			inet_twsk_put(inet_twsk(sk));
+			inet_twsk_deschedule_put(inet_twsk(sk));
 			sk = sk2;
 		}
 	}
-- 
cgit v1.2.3


From 35a256fee52c7c207796302681fa95189c85b408 Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Wed, 8 Jul 2015 16:58:22 -0700
Subject: ipv6: Nonlocal bind

Add support to allow non-local binds similar to how this was done for IPv4.
Non-local binds are very useful in emulating the Internet in a box, etc.

This add the ip_nonlocal_bind sysctl under ipv6.

Testing:

Set up nonlocal binding and receive routing on a host, e.g.:

ip -6 rule add from ::/0 iif eth0 lookup 200
ip -6 route add local 2001:0:0:1::/64 dev lo proto kernel scope host table 200
sysctl -w net.ipv6.ip_nonlocal_bind=1

Set up routing to 2001:0:0:1::/64 on peer to go to first host

ping6 -I 2001:0:0:1::1 peer-address -- to verify

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt | 5 +++++
 include/net/netns/ipv6.h               | 1 +
 net/ipv4/ping.c                        | 3 ++-
 net/ipv6/af_inet6.c                    | 3 ++-
 net/ipv6/raw.c                         | 3 ++-
 net/ipv6/sysctl_net_ipv6.c             | 8 ++++++++
 6 files changed, 20 insertions(+), 3 deletions(-)

(limited to 'include/net')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 5fae7704daab..f63aeefd2c24 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1435,6 +1435,11 @@ mtu - INTEGER
 	Default Maximum Transfer Unit
 	Default: 1280 (IPv6 required minimum)
 
+ip_nonlocal_bind - BOOLEAN
+	If set, allows processes to bind() to non-local IPv6 addresses,
+	which can be quite useful - but may break some applications.
+	Default: 0
+
 router_probe_interval - INTEGER
 	Minimum interval (in seconds) between Router Probing described
 	in RFC4191.
diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h
index 8d93544a2d2b..c0368db6df54 100644
--- a/include/net/netns/ipv6.h
+++ b/include/net/netns/ipv6.h
@@ -31,6 +31,7 @@ struct netns_sysctl_ipv6 {
 	int auto_flowlabels;
 	int icmpv6_time;
 	int anycast_src_echo_reply;
+	int ip_nonlocal_bind;
 	int fwmark_reflect;
 	int idgen_retries;
 	int idgen_delay;
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 05ff44b758df..e89094ab5ddb 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -363,7 +363,8 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk,
 						    scoped);
 		rcu_read_unlock();
 
-		if (!(isk->freebind || isk->transparent || has_addr ||
+		if (!(net->ipv6.sysctl.ip_nonlocal_bind ||
+		      isk->freebind || isk->transparent || has_addr ||
 		      addr_type == IPV6_ADDR_ANY))
 			return -EADDRNOTAVAIL;
 
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 39e670a91596..7bc92ea4ae8f 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -342,7 +342,8 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 			 */
 			v4addr = LOOPBACK4_IPV6;
 			if (!(addr_type & IPV6_ADDR_MULTICAST))	{
-				if (!(inet->freebind || inet->transparent) &&
+				if (!net->ipv6.sysctl.ip_nonlocal_bind &&
+				    !(inet->freebind || inet->transparent) &&
 				    !ipv6_chk_addr(net, &addr->sin6_addr,
 						   dev, 0)) {
 					err = -EADDRNOTAVAIL;
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index ca4700cb26c4..fdbada1569a3 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -295,7 +295,8 @@ static int rawv6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 		 * unspecified and mapped address have a v4 equivalent.
 		 */
 		v4addr = LOOPBACK4_IPV6;
-		if (!(addr_type & IPV6_ADDR_MULTICAST))	{
+		if (!(addr_type & IPV6_ADDR_MULTICAST) &&
+		    !sock_net(sk)->ipv6.sysctl.ip_nonlocal_bind) {
 			err = -EADDRNOTAVAIL;
 			if (!ipv6_chk_addr(sock_net(sk), &addr->sin6_addr,
 					   dev, 0)) {
diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c
index 4e705add4f18..db48aebd9c47 100644
--- a/net/ipv6/sysctl_net_ipv6.c
+++ b/net/ipv6/sysctl_net_ipv6.c
@@ -75,6 +75,13 @@ static struct ctl_table ipv6_table_template[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
+	{
+		.procname	= "ip_nonlocal_bind",
+		.data		= &init_net.ipv6.sysctl.ip_nonlocal_bind,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
 	{ }
 };
 
@@ -117,6 +124,7 @@ static int __net_init ipv6_sysctl_net_init(struct net *net)
 	ipv6_table[5].data = &net->ipv6.sysctl.idgen_retries;
 	ipv6_table[6].data = &net->ipv6.sysctl.idgen_delay;
 	ipv6_table[7].data = &net->ipv6.sysctl.flowlabel_state_ranges;
+	ipv6_table[8].data = &net->ipv6.sysctl.ip_nonlocal_bind;
 
 	ipv6_route_table = ipv6_route_sysctl_init(net);
 	if (!ipv6_route_table)
-- 
cgit v1.2.3


From 085db2c04557d31db61541f361bd8b4de92c9939 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 10 Jul 2015 18:15:06 -0500
Subject: netfilter: Per network namespace netfilter hooks.

- Add a new set of functions for registering and unregistering per
  network namespace hooks.

- Modify the old global namespace hook functions to use the per
  network namespace hooks in their implementation, so their remains a
  single list that needs to be walked for any hook (this is important
  for keeping the hook priority working and for keeping the code
  walking the hooks simple).

- Only allow registering the per netdevice hooks in the network
  namespace where the network device lives.

- Dynamically allocate the structures in the per network namespace
  hook list in nf_register_net_hook, and unregister them in
  nf_unregister_net_hook.

  Dynamic allocate is required somewhere as the number of network
  namespaces are not fixed so we might as well allocate them in the
  registration function.

  The chain of registered hooks on any list is expected to be small so
  the cost of walking that list to find the entry we are unregistering
  should also be small.

  Performing the management of the dynamically allocated list entries
  in the registration and unregistration functions keeps the complexity
  from spreading.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/netfilter.h     |  14 +++-
 include/net/netns/netfilter.h |   1 +
 net/netfilter/core.c          | 182 +++++++++++++++++++++++++++++++++++++-----
 3 files changed, 173 insertions(+), 24 deletions(-)

(limited to 'include/net')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 60e89348a91d..9bbd110ec81b 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -11,6 +11,8 @@
 #include <linux/list.h>
 #include <linux/static_key.h>
 #include <linux/netfilter_defs.h>
+#include <linux/netdevice.h>
+#include <net/net_namespace.h>
 
 #ifdef CONFIG_NETFILTER
 static inline int NF_DROP_GETERR(int verdict)
@@ -118,6 +120,13 @@ struct nf_sockopt_ops {
 };
 
 /* Function to register/unregister hook points. */
+int nf_register_net_hook(struct net *net, const struct nf_hook_ops *ops);
+void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *ops);
+int nf_register_net_hooks(struct net *net, const struct nf_hook_ops *reg,
+			  unsigned int n);
+void nf_unregister_net_hooks(struct net *net, const struct nf_hook_ops *reg,
+			     unsigned int n);
+
 int nf_register_hook(struct nf_hook_ops *reg);
 void nf_unregister_hook(struct nf_hook_ops *reg);
 int nf_register_hooks(struct nf_hook_ops *reg, unsigned int n);
@@ -128,8 +137,6 @@ void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n);
 int nf_register_sockopt(struct nf_sockopt_ops *reg);
 void nf_unregister_sockopt(struct nf_sockopt_ops *reg);
 
-extern struct list_head nf_hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS];
-
 #ifdef HAVE_JUMP_LABEL
 extern struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS];
 
@@ -167,7 +174,8 @@ static inline int nf_hook_thresh(u_int8_t pf, unsigned int hook,
 				 int (*okfn)(struct sock *, struct sk_buff *),
 				 int thresh)
 {
-	struct list_head *nf_hook_list = &nf_hooks[pf][hook];
+	struct net *net = dev_net(indev ? indev : outdev);
+	struct list_head *nf_hook_list = &net->nf.hooks[pf][hook];
 
 	if (nf_hook_list_active(nf_hook_list, pf, hook)) {
 		struct nf_hook_state state;
diff --git a/include/net/netns/netfilter.h b/include/net/netns/netfilter.h
index 532e4ba64f49..38aa4983e2a9 100644
--- a/include/net/netns/netfilter.h
+++ b/include/net/netns/netfilter.h
@@ -14,5 +14,6 @@ struct netns_nf {
 #ifdef CONFIG_SYSCTL
 	struct ctl_table_header *nf_log_dir_header;
 #endif
+	struct list_head hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS];
 };
 #endif
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index fa4d3c111d3f..56ead1a1711c 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -52,9 +52,6 @@ void nf_unregister_afinfo(const struct nf_afinfo *afinfo)
 }
 EXPORT_SYMBOL_GPL(nf_unregister_afinfo);
 
-struct list_head nf_hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS] __read_mostly;
-EXPORT_SYMBOL(nf_hooks);
-
 #ifdef HAVE_JUMP_LABEL
 struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS];
 EXPORT_SYMBOL(nf_hooks_needed);
@@ -62,27 +59,40 @@ EXPORT_SYMBOL(nf_hooks_needed);
 
 static DEFINE_MUTEX(nf_hook_mutex);
 
-static struct list_head *find_nf_hook_list(const struct nf_hook_ops *reg)
+static struct list_head *find_nf_hook_list(struct net *net,
+					   const struct nf_hook_ops *reg)
 {
 	struct list_head *nf_hook_list = NULL;
 
 	if (reg->pf != NFPROTO_NETDEV)
-		nf_hook_list = &nf_hooks[reg->pf][reg->hooknum];
+		nf_hook_list = &net->nf.hooks[reg->pf][reg->hooknum];
 	else if (reg->hooknum == NF_NETDEV_INGRESS) {
 #ifdef CONFIG_NETFILTER_INGRESS
-		if (reg->dev)
+		if (reg->dev && dev_net(reg->dev) == net)
 			nf_hook_list = &reg->dev->nf_hooks_ingress;
 #endif
 	}
 	return nf_hook_list;
 }
 
-int nf_register_hook(struct nf_hook_ops *reg)
+int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg)
 {
 	struct list_head *nf_hook_list;
-	struct nf_hook_ops *elem;
+	struct nf_hook_ops *elem, *new;
+
+	new = kzalloc(sizeof(*new), GFP_KERNEL);
+	if (!new)
+		return -ENOMEM;
 
-	nf_hook_list = find_nf_hook_list(reg);
+	new->hook     = reg->hook;
+	new->dev      = reg->dev;
+	new->owner    = reg->owner;
+	new->priv     = reg->priv;
+	new->pf       = reg->pf;
+	new->hooknum  = reg->hooknum;
+	new->priority = reg->priority;
+
+	nf_hook_list = find_nf_hook_list(net, reg);
 	if (!nf_hook_list)
 		return -ENOENT;
 
@@ -91,7 +101,7 @@ int nf_register_hook(struct nf_hook_ops *reg)
 		if (reg->priority < elem->priority)
 			break;
 	}
-	list_add_rcu(&reg->list, elem->list.prev);
+	list_add_rcu(&new->list, elem->list.prev);
 	mutex_unlock(&nf_hook_mutex);
 #ifdef CONFIG_NETFILTER_INGRESS
 	if (reg->pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS)
@@ -102,13 +112,35 @@ int nf_register_hook(struct nf_hook_ops *reg)
 #endif
 	return 0;
 }
-EXPORT_SYMBOL(nf_register_hook);
+EXPORT_SYMBOL(nf_register_net_hook);
 
-void nf_unregister_hook(struct nf_hook_ops *reg)
+void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg)
 {
+	struct list_head *nf_hook_list;
+	struct nf_hook_ops *elem;
+
+	nf_hook_list = find_nf_hook_list(net, reg);
+	if (!nf_hook_list)
+		return;
+
 	mutex_lock(&nf_hook_mutex);
-	list_del_rcu(&reg->list);
+	list_for_each_entry(elem, nf_hook_list, list) {
+		if ((reg->hook     == elem->hook) &&
+		    (reg->dev      == elem->dev) &&
+		    (reg->owner    == elem->owner) &&
+		    (reg->priv     == elem->priv) &&
+		    (reg->pf       == elem->pf) &&
+		    (reg->hooknum  == elem->hooknum) &&
+		    (reg->priority == elem->priority)) {
+			list_del_rcu(&elem->list);
+			break;
+		}
+	}
 	mutex_unlock(&nf_hook_mutex);
+	if (&elem->list == nf_hook_list) {
+		WARN(1, "nf_unregister_net_hook: hook not found!\n");
+		return;
+	}
 #ifdef CONFIG_NETFILTER_INGRESS
 	if (reg->pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS)
 		net_dec_ingress_queue();
@@ -117,7 +149,77 @@ void nf_unregister_hook(struct nf_hook_ops *reg)
 	static_key_slow_dec(&nf_hooks_needed[reg->pf][reg->hooknum]);
 #endif
 	synchronize_net();
-	nf_queue_nf_hook_drop(reg);
+	nf_queue_nf_hook_drop(elem);
+	kfree(elem);
+}
+EXPORT_SYMBOL(nf_unregister_net_hook);
+
+int nf_register_net_hooks(struct net *net, const struct nf_hook_ops *reg,
+			  unsigned int n)
+{
+	unsigned int i;
+	int err = 0;
+
+	for (i = 0; i < n; i++) {
+		err = nf_register_net_hook(net, &reg[i]);
+		if (err)
+			goto err;
+	}
+	return err;
+
+err:
+	if (i > 0)
+		nf_unregister_net_hooks(net, reg, i);
+	return err;
+}
+EXPORT_SYMBOL(nf_register_net_hooks);
+
+void nf_unregister_net_hooks(struct net *net, const struct nf_hook_ops *reg,
+			     unsigned int n)
+{
+	while (n-- > 0)
+		nf_unregister_net_hook(net, &reg[n]);
+}
+EXPORT_SYMBOL(nf_unregister_net_hooks);
+
+static LIST_HEAD(nf_hook_list);
+
+int nf_register_hook(struct nf_hook_ops *reg)
+{
+	struct net *net, *last;
+	int ret;
+
+	rtnl_lock();
+	for_each_net(net) {
+		ret = nf_register_net_hook(net, reg);
+		if (ret && ret != -ENOENT)
+			goto rollback;
+	}
+	list_add_tail(&reg->list, &nf_hook_list);
+	rtnl_unlock();
+
+	return 0;
+rollback:
+	last = net;
+	for_each_net(net) {
+		if (net == last)
+			break;
+		nf_unregister_net_hook(net, reg);
+	}
+	rtnl_unlock();
+	return ret;
+}
+EXPORT_SYMBOL(nf_register_hook);
+
+void nf_unregister_hook(struct nf_hook_ops *reg)
+{
+	struct net *net;
+
+	rtnl_lock();
+	list_del(&reg->list);
+	for_each_net(net)
+		nf_unregister_net_hook(net, reg);
+	rtnl_unlock();
 }
 EXPORT_SYMBOL(nf_unregister_hook);
 
@@ -294,8 +396,46 @@ void (*nf_nat_decode_session_hook)(struct sk_buff *, struct flowi *);
 EXPORT_SYMBOL(nf_nat_decode_session_hook);
 #endif
 
+static int nf_register_hook_list(struct net *net)
+{
+	struct nf_hook_ops *elem;
+	int ret;
+
+	rtnl_lock();
+	list_for_each_entry(elem, &nf_hook_list, list) {
+		ret = nf_register_net_hook(net, elem);
+		if (ret && ret != -ENOENT)
+			goto out_undo;
+	}
+	rtnl_unlock();
+	return 0;
+
+out_undo:
+	list_for_each_entry_continue_reverse(elem, &nf_hook_list, list)
+		nf_unregister_net_hook(net, elem);
+	rtnl_unlock();
+	return ret;
+}
+
+static void nf_unregister_hook_list(struct net *net)
+{
+	struct nf_hook_ops *elem;
+
+	rtnl_lock();
+	list_for_each_entry(elem, &nf_hook_list, list)
+		nf_unregister_net_hook(net, elem);
+	rtnl_unlock();
+}
+
 static int __net_init netfilter_net_init(struct net *net)
 {
+	int i, h, ret;
+
+	for (i = 0; i < ARRAY_SIZE(net->nf.hooks); i++) {
+		for (h = 0; h < NF_MAX_HOOKS; h++)
+			INIT_LIST_HEAD(&net->nf.hooks[i][h]);
+	}
+
 #ifdef CONFIG_PROC_FS
 	net->nf.proc_netfilter = proc_net_mkdir(net, "netfilter",
 						net->proc_net);
@@ -306,11 +446,16 @@ static int __net_init netfilter_net_init(struct net *net)
 		return -ENOMEM;
 	}
 #endif
-	return 0;
+	ret = nf_register_hook_list(net);
+	if (ret)
+		remove_proc_entry("netfilter", net->proc_net);
+
+	return ret;
 }
 
 static void __net_exit netfilter_net_exit(struct net *net)
 {
+	nf_unregister_hook_list(net);
 	remove_proc_entry("netfilter", net->proc_net);
 }
 
@@ -321,12 +466,7 @@ static struct pernet_operations netfilter_net_ops = {
 
 int __init netfilter_init(void)
 {
-	int i, h, ret;
-
-	for (i = 0; i < ARRAY_SIZE(nf_hooks); i++) {
-		for (h = 0; h < NF_MAX_HOOKS; h++)
-			INIT_LIST_HEAD(&nf_hooks[i][h]);
-	}
+	int ret;
 
 	ret = register_pernet_subsys(&netfilter_net_ops);
 	if (ret < 0)
-- 
cgit v1.2.3


From 5c48f1201744233d4f235c7dd916d5196ed20716 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 17 Jun 2015 09:58:06 +0200
Subject: mac80211: remove exposing 'mfp' to drivers

There's no driver using this, so remove it.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 2 --
 net/mac80211/cfg.c     | 1 -
 net/mac80211/mlme.c    | 6 +-----
 3 files changed, 1 insertion(+), 8 deletions(-)

(limited to 'include/net')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 6b1077c2a63f..43dbddfa06c0 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -1675,7 +1675,6 @@ struct ieee80211_sta_rates {
  * @tdls: indicates whether the STA is a TDLS peer
  * @tdls_initiator: indicates the STA is an initiator of the TDLS link. Only
  *	valid if the STA is a TDLS peer in the first place.
- * @mfp: indicates whether the STA uses management frame protection or not.
  * @txq: per-TID data TX queues (if driver uses the TXQ abstraction)
  */
 struct ieee80211_sta {
@@ -1693,7 +1692,6 @@ struct ieee80211_sta {
 	struct ieee80211_sta_rates __rcu *rates;
 	bool tdls;
 	bool tdls_initiator;
-	bool mfp;
 
 	struct ieee80211_txq *txq[IEEE80211_NUM_TIDS];
 
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index bf7023f6c327..5fc7788e2ff2 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -1076,7 +1076,6 @@ static int sta_apply_parameters(struct ieee80211_local *local,
 	}
 
 	if (mask & BIT(NL80211_STA_FLAG_MFP)) {
-		sta->sta.mfp = !!(set & BIT(NL80211_STA_FLAG_MFP));
 		if (set & BIT(NL80211_STA_FLAG_MFP))
 			set_sta_flag(sta, WLAN_STA_MFP);
 		else
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 9b2cc278ac2a..ae5d6c48272d 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -3034,12 +3034,8 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata,
 
 	rate_control_rate_init(sta);
 
-	if (ifmgd->flags & IEEE80211_STA_MFP_ENABLED) {
+	if (ifmgd->flags & IEEE80211_STA_MFP_ENABLED)
 		set_sta_flag(sta, WLAN_STA_MFP);
-		sta->sta.mfp = true;
-	} else {
-		sta->sta.mfp = false;
-	}
 
 	sta->sta.wme = elems.wmm_param && local->hw.queues >= IEEE80211_NUM_ACS;
 
-- 
cgit v1.2.3


From af9f9b22beee70aae58651cdbb9d6375e6e51797 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Thu, 11 Jun 2015 16:02:32 +0200
Subject: mac80211: don't store napi struct

When introducing multiple RX queues, a single NAPI struct will not
be sufficient. Instead of trying to store multiple, simply change
the API to have the NAPI struct passed to the RX function. This of
course means that drivers using rx_irqsafe() cannot use NAPI, but
that seems a reasonable trade-off, particularly since only two of
all drivers are currently using it at all.

While at it, we can now remove the IEEE80211_RX_REORDER_TIMER flag
again since this code path cannot have a napi struct anyway.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/iwlwifi/dvm/dev.h  |  2 ++
 drivers/net/wireless/iwlwifi/dvm/main.c |  3 ++-
 drivers/net/wireless/iwlwifi/dvm/rx.c   |  2 +-
 drivers/net/wireless/iwlwifi/mvm/mvm.h  |  1 +
 drivers/net/wireless/iwlwifi/mvm/ops.c  |  3 ++-
 drivers/net/wireless/iwlwifi/mvm/rx.c   |  2 +-
 include/net/mac80211.h                  | 37 +++++++++++++++++++++------------
 net/mac80211/ieee80211_i.h              |  6 +-----
 net/mac80211/main.c                     | 12 -----------
 net/mac80211/rx.c                       | 18 +++++++++-------
 10 files changed, 44 insertions(+), 42 deletions(-)

(limited to 'include/net')

diff --git a/drivers/net/wireless/iwlwifi/dvm/dev.h b/drivers/net/wireless/iwlwifi/dvm/dev.h
index 3811878ab9cd..074977ede343 100644
--- a/drivers/net/wireless/iwlwifi/dvm/dev.h
+++ b/drivers/net/wireless/iwlwifi/dvm/dev.h
@@ -669,6 +669,8 @@ struct iwl_priv {
 	/* ieee device used by generic ieee processing code */
 	struct ieee80211_hw *hw;
 
+	struct napi_struct *napi;
+
 	struct list_head calib_results;
 
 	struct workqueue_struct *workqueue;
diff --git a/drivers/net/wireless/iwlwifi/dvm/main.c b/drivers/net/wireless/iwlwifi/dvm/main.c
index 234e30f498b2..644819563cf0 100644
--- a/drivers/net/wireless/iwlwifi/dvm/main.c
+++ b/drivers/net/wireless/iwlwifi/dvm/main.c
@@ -2037,7 +2037,8 @@ static void iwl_napi_add(struct iwl_op_mode *op_mode,
 {
 	struct iwl_priv *priv = IWL_OP_MODE_GET_DVM(op_mode);
 
-	ieee80211_napi_add(priv->hw, napi, napi_dev, poll, weight);
+	netif_napi_add(napi_dev, napi, poll, weight);
+	priv->napi = napi;
 }
 
 static const struct iwl_op_mode_ops iwl_dvm_ops = {
diff --git a/drivers/net/wireless/iwlwifi/dvm/rx.c b/drivers/net/wireless/iwlwifi/dvm/rx.c
index debec963c610..5a91f5d6b1dc 100644
--- a/drivers/net/wireless/iwlwifi/dvm/rx.c
+++ b/drivers/net/wireless/iwlwifi/dvm/rx.c
@@ -786,7 +786,7 @@ static void iwlagn_pass_packet_to_mac80211(struct iwl_priv *priv,
 
 	memcpy(IEEE80211_SKB_RXCB(skb), stats, sizeof(*stats));
 
-	ieee80211_rx(priv->hw, skb);
+	ieee80211_rx_napi(priv->hw, skb, priv->napi);
 }
 
 static u32 iwlagn_translate_rx_status(struct iwl_priv *priv, u32 decrypt_in)
diff --git a/drivers/net/wireless/iwlwifi/mvm/mvm.h b/drivers/net/wireless/iwlwifi/mvm/mvm.h
index 2d4bad5fe825..605f57a2c6be 100644
--- a/drivers/net/wireless/iwlwifi/mvm/mvm.h
+++ b/drivers/net/wireless/iwlwifi/mvm/mvm.h
@@ -559,6 +559,7 @@ struct iwl_mvm {
 	const struct iwl_cfg *cfg;
 	struct iwl_phy_db *phy_db;
 	struct ieee80211_hw *hw;
+	struct napi_struct *napi;
 
 	/* for protecting access to iwl_mvm */
 	struct mutex mutex;
diff --git a/drivers/net/wireless/iwlwifi/mvm/ops.c b/drivers/net/wireless/iwlwifi/mvm/ops.c
index e4fa50075ffd..3967df63e0f3 100644
--- a/drivers/net/wireless/iwlwifi/mvm/ops.c
+++ b/drivers/net/wireless/iwlwifi/mvm/ops.c
@@ -1316,7 +1316,8 @@ static void iwl_mvm_napi_add(struct iwl_op_mode *op_mode,
 {
 	struct iwl_mvm *mvm = IWL_OP_MODE_GET_MVM(op_mode);
 
-	ieee80211_napi_add(mvm->hw, napi, napi_dev, poll, weight);
+	netif_napi_add(napi_dev, napi, poll, weight);
+	mvm->napi = napi;
 }
 
 static const struct iwl_op_mode_ops iwl_mvm_ops = {
diff --git a/drivers/net/wireless/iwlwifi/mvm/rx.c b/drivers/net/wireless/iwlwifi/mvm/rx.c
index 8f1d93b7a13a..9ff0b4321df3 100644
--- a/drivers/net/wireless/iwlwifi/mvm/rx.c
+++ b/drivers/net/wireless/iwlwifi/mvm/rx.c
@@ -129,7 +129,7 @@ static void iwl_mvm_pass_packet_to_mac80211(struct iwl_mvm *mvm,
 				fraglen, rxb->truesize);
 	}
 
-	ieee80211_rx(mvm->hw, skb);
+	ieee80211_rx_napi(mvm->hw, skb, mvm->napi);
 }
 
 /*
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 43dbddfa06c0..ff68b8c4ab35 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -3694,20 +3694,28 @@ void ieee80211_free_hw(struct ieee80211_hw *hw);
 void ieee80211_restart_hw(struct ieee80211_hw *hw);
 
 /**
- * ieee80211_napi_add - initialize mac80211 NAPI context
- * @hw: the hardware to initialize the NAPI context on
- * @napi: the NAPI context to initialize
- * @napi_dev: dummy NAPI netdevice, here to not waste the space if the
- *	driver doesn't use NAPI
- * @poll: poll function
- * @weight: default weight
+ * ieee80211_rx_napi - receive frame from NAPI context
  *
- * See also netif_napi_add().
+ * Use this function to hand received frames to mac80211. The receive
+ * buffer in @skb must start with an IEEE 802.11 header. In case of a
+ * paged @skb is used, the driver is recommended to put the ieee80211
+ * header of the frame on the linear part of the @skb to avoid memory
+ * allocation and/or memcpy by the stack.
+ *
+ * This function may not be called in IRQ context. Calls to this function
+ * for a single hardware must be synchronized against each other. Calls to
+ * this function, ieee80211_rx_ni() and ieee80211_rx_irqsafe() may not be
+ * mixed for a single hardware. Must not run concurrently with
+ * ieee80211_tx_status() or ieee80211_tx_status_ni().
+ *
+ * This function must be called with BHs disabled.
+ *
+ * @hw: the hardware this frame came in on
+ * @skb: the buffer to receive, owned by mac80211 after this call
+ * @napi: the NAPI context
  */
-void ieee80211_napi_add(struct ieee80211_hw *hw, struct napi_struct *napi,
-			struct net_device *napi_dev,
-			int (*poll)(struct napi_struct *, int),
-			int weight);
+void ieee80211_rx_napi(struct ieee80211_hw *hw, struct sk_buff *skb,
+		       struct napi_struct *napi);
 
 /**
  * ieee80211_rx - receive frame
@@ -3729,7 +3737,10 @@ void ieee80211_napi_add(struct ieee80211_hw *hw, struct napi_struct *napi,
  * @hw: the hardware this frame came in on
  * @skb: the buffer to receive, owned by mac80211 after this call
  */
-void ieee80211_rx(struct ieee80211_hw *hw, struct sk_buff *skb);
+static inline void ieee80211_rx(struct ieee80211_hw *hw, struct sk_buff *skb)
+{
+	ieee80211_rx_napi(hw, skb, NULL);
+}
 
 /**
  * ieee80211_rx_irqsafe - receive frame
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 361bb3ca335c..7d75f93bac7d 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -202,8 +202,6 @@ enum ieee80211_packet_rx_flags {
  * @IEEE80211_RX_CMNTR: received on cooked monitor already
  * @IEEE80211_RX_BEACON_REPORTED: This frame was already reported
  *	to cfg80211_report_obss_beacon().
- * @IEEE80211_RX_REORDER_TIMER: this frame is released by the
- *	reorder buffer timeout timer, not the normal RX path
  *
  * These flags are used across handling multiple interfaces
  * for a single frame.
@@ -211,10 +209,10 @@ enum ieee80211_packet_rx_flags {
 enum ieee80211_rx_flags {
 	IEEE80211_RX_CMNTR		= BIT(0),
 	IEEE80211_RX_BEACON_REPORTED	= BIT(1),
-	IEEE80211_RX_REORDER_TIMER	= BIT(2),
 };
 
 struct ieee80211_rx_data {
+	struct napi_struct *napi;
 	struct sk_buff *skb;
 	struct ieee80211_local *local;
 	struct ieee80211_sub_if_data *sdata;
@@ -1347,8 +1345,6 @@ struct ieee80211_local {
 
 	struct ieee80211_sub_if_data __rcu *p2p_sdata;
 
-	struct napi_struct *napi;
-
 	/* virtual monitor interface */
 	struct ieee80211_sub_if_data __rcu *monitor_sdata;
 	struct cfg80211_chan_def monitor_chandef;
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 3c63468b4dfb..dba0a86dee18 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -1132,18 +1132,6 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
 }
 EXPORT_SYMBOL(ieee80211_register_hw);
 
-void ieee80211_napi_add(struct ieee80211_hw *hw, struct napi_struct *napi,
-			struct net_device *napi_dev,
-			int (*poll)(struct napi_struct *, int),
-			int weight)
-{
-	struct ieee80211_local *local = hw_to_local(hw);
-
-	netif_napi_add(napi_dev, napi, poll, weight);
-	local->napi = napi;
-}
-EXPORT_SYMBOL_GPL(ieee80211_napi_add);
-
 void ieee80211_unregister_hw(struct ieee80211_hw *hw)
 {
 	struct ieee80211_local *local = hw_to_local(hw);
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index dd6bb2a54d45..817bf22dad5a 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -2148,9 +2148,8 @@ ieee80211_deliver_skb(struct ieee80211_rx_data *rx)
 		/* deliver to local stack */
 		skb->protocol = eth_type_trans(skb, dev);
 		memset(skb->cb, 0, sizeof(skb->cb));
-		if (!(rx->flags & IEEE80211_RX_REORDER_TIMER) &&
-		    rx->local->napi)
-			napi_gro_receive(rx->local->napi, skb);
+		if (rx->napi)
+			napi_gro_receive(rx->napi, skb);
 		else
 			netif_receive_skb(skb);
 	}
@@ -3256,7 +3255,7 @@ void ieee80211_release_reorder_timeout(struct sta_info *sta, int tid)
 		/* This is OK -- must be QoS data frame */
 		.security_idx = tid,
 		.seqno_idx = tid,
-		.flags = IEEE80211_RX_REORDER_TIMER,
+		.napi = NULL, /* must be NULL to not have races */
 	};
 	struct tid_ampdu_rx *tid_agg_rx;
 
@@ -3433,7 +3432,8 @@ static bool ieee80211_prepare_and_rx_handle(struct ieee80211_rx_data *rx,
  * be called with rcu_read_lock protection.
  */
 static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw,
-					 struct sk_buff *skb)
+					 struct sk_buff *skb,
+					 struct napi_struct *napi)
 {
 	struct ieee80211_local *local = hw_to_local(hw);
 	struct ieee80211_sub_if_data *sdata;
@@ -3449,6 +3449,7 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw,
 	memset(&rx, 0, sizeof(rx));
 	rx.skb = skb;
 	rx.local = local;
+	rx.napi = napi;
 
 	if (ieee80211_is_data(fc) || ieee80211_is_mgmt(fc))
 		I802_DEBUG_INC(local->dot11ReceivedFragmentCount);
@@ -3550,7 +3551,8 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw,
  * This is the receive path handler. It is called by a low level driver when an
  * 802.11 MPDU is received from the hardware.
  */
-void ieee80211_rx(struct ieee80211_hw *hw, struct sk_buff *skb)
+void ieee80211_rx_napi(struct ieee80211_hw *hw, struct sk_buff *skb,
+		       struct napi_struct *napi)
 {
 	struct ieee80211_local *local = hw_to_local(hw);
 	struct ieee80211_rate *rate = NULL;
@@ -3649,7 +3651,7 @@ void ieee80211_rx(struct ieee80211_hw *hw, struct sk_buff *skb)
 	ieee80211_tpt_led_trig_rx(local,
 			((struct ieee80211_hdr *)skb->data)->frame_control,
 			skb->len);
-	__ieee80211_rx_handle_packet(hw, skb);
+	__ieee80211_rx_handle_packet(hw, skb, napi);
 
 	rcu_read_unlock();
 
@@ -3657,7 +3659,7 @@ void ieee80211_rx(struct ieee80211_hw *hw, struct sk_buff *skb)
  drop:
 	kfree_skb(skb);
 }
-EXPORT_SYMBOL(ieee80211_rx);
+EXPORT_SYMBOL(ieee80211_rx_napi);
 
 /* This is a version of the rx handler that can be called from hard irq
  * context. Post the skb on the queue and schedule the tasklet */
-- 
cgit v1.2.3


From 0c028b5fd1bd10d5777756e571c6c1971f04062b Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 12 Jun 2015 14:33:54 +0200
Subject: mac80211: remove zero-length A-MPDU subframe reporting

As there's no driver using this capability and reporting zero-length
A-MPDU subframes for radiotap monitoring, remove the capability to
free up two RX flags.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 6 +-----
 net/mac80211/rx.c      | 7 +------
 2 files changed, 2 insertions(+), 11 deletions(-)

(limited to 'include/net')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index ff68b8c4ab35..7417fee18185 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -997,9 +997,6 @@ ieee80211_tx_info_clear_status(struct ieee80211_tx_info *info)
  * @RX_FLAG_AMPDU_DETAILS: A-MPDU details are known, in particular the reference
  *	number (@ampdu_reference) must be populated and be a distinct number for
  *	each A-MPDU
- * @RX_FLAG_AMPDU_REPORT_ZEROLEN: driver reports 0-length subframes
- * @RX_FLAG_AMPDU_IS_ZEROLEN: This is a zero-length subframe, for
- *	monitoring purposes only
  * @RX_FLAG_AMPDU_LAST_KNOWN: last subframe is known, should be set on all
  *	subframes of a single A-MPDU
  * @RX_FLAG_AMPDU_IS_LAST: this subframe is the last subframe of the A-MPDU
@@ -1039,8 +1036,7 @@ enum mac80211_rx_flags {
 	RX_FLAG_NO_SIGNAL_VAL		= BIT(12),
 	RX_FLAG_HT_GF			= BIT(13),
 	RX_FLAG_AMPDU_DETAILS		= BIT(14),
-	RX_FLAG_AMPDU_REPORT_ZEROLEN	= BIT(15),
-	RX_FLAG_AMPDU_IS_ZEROLEN	= BIT(16),
+	/* bits 15/16 free */
 	RX_FLAG_AMPDU_LAST_KNOWN	= BIT(17),
 	RX_FLAG_AMPDU_IS_LAST		= BIT(18),
 	RX_FLAG_AMPDU_DELIM_CRC_ERROR	= BIT(19),
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 817bf22dad5a..9d95cb8e8e95 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -122,8 +122,7 @@ static inline bool should_drop_frame(struct sk_buff *skb, int present_fcs_len,
 	hdr = (void *)(skb->data + rtap_vendor_space);
 
 	if (status->flag & (RX_FLAG_FAILED_FCS_CRC |
-			    RX_FLAG_FAILED_PLCP_CRC |
-			    RX_FLAG_AMPDU_IS_ZEROLEN))
+			    RX_FLAG_FAILED_PLCP_CRC))
 		return true;
 
 	if (unlikely(skb->len < 16 + present_fcs_len + rtap_vendor_space))
@@ -391,10 +390,6 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local,
 			cpu_to_le32(1 << IEEE80211_RADIOTAP_AMPDU_STATUS);
 		put_unaligned_le32(status->ampdu_reference, pos);
 		pos += 4;
-		if (status->flag & RX_FLAG_AMPDU_REPORT_ZEROLEN)
-			flags |= IEEE80211_RADIOTAP_AMPDU_REPORT_ZEROLEN;
-		if (status->flag & RX_FLAG_AMPDU_IS_ZEROLEN)
-			flags |= IEEE80211_RADIOTAP_AMPDU_IS_ZEROLEN;
 		if (status->flag & RX_FLAG_AMPDU_LAST_KNOWN)
 			flags |= IEEE80211_RADIOTAP_AMPDU_LAST_KNOWN;
 		if (status->flag & RX_FLAG_AMPDU_IS_LAST)
-- 
cgit v1.2.3


From 981d94a80174e4f33bd5015fb49051bfc2eb00d2 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 12 Jun 2015 14:39:02 +0200
Subject: mac80211: support device/driver PN check for CCMP/GCMP

When there are multiple RX queues, the PN checks in mac80211 cannot be
used since packets might be processed out of order on different CPUs.

Allow the driver to report that the PN has been checked, drivers that
will use multi-queue RX will have to set this flag.

For now, the flag is only valid when the frame has been decrypted, in
theory that restriction doesn't have to be there, but in practice the
hardware will have decrypted the frame already.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h |  7 ++++-
 net/mac80211/wpa.c     | 83 +++++++++++++++++++++++++++-----------------------
 2 files changed, 51 insertions(+), 39 deletions(-)

(limited to 'include/net')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 7417fee18185..4d3d2686f278 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -973,6 +973,10 @@ ieee80211_tx_info_clear_status(struct ieee80211_tx_info *info)
  * @RX_FLAG_IV_STRIPPED: The IV/ICV are stripped from this frame.
  *	If this flag is set, the stack cannot do any replay detection
  *	hence the driver or hardware will have to do that.
+ * @RX_FLAG_PN_VALIDATED: Currently only valid for CCMP/GCMP frames, this
+ *	flag indicates that the PN was verified for replay protection.
+ *	Note that this flag is also currently only supported when a frame
+ *	is also decrypted (ie. @RX_FLAG_DECRYPTED must be set)
  * @RX_FLAG_FAILED_FCS_CRC: Set this flag if the FCS check failed on
  *	the frame.
  * @RX_FLAG_FAILED_PLCP_CRC: Set this flag if the PCLP check failed on
@@ -1036,7 +1040,8 @@ enum mac80211_rx_flags {
 	RX_FLAG_NO_SIGNAL_VAL		= BIT(12),
 	RX_FLAG_HT_GF			= BIT(13),
 	RX_FLAG_AMPDU_DETAILS		= BIT(14),
-	/* bits 15/16 free */
+	RX_FLAG_PN_VALIDATED		= BIT(15),
+	/* bit 16 free */
 	RX_FLAG_AMPDU_LAST_KNOWN	= BIT(17),
 	RX_FLAG_AMPDU_IS_LAST		= BIT(18),
 	RX_FLAG_AMPDU_DELIM_CRC_ERROR	= BIT(19),
diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c
index 943f7606527e..feb547dc8643 100644
--- a/net/mac80211/wpa.c
+++ b/net/mac80211/wpa.c
@@ -516,31 +516,34 @@ ieee80211_crypto_ccmp_decrypt(struct ieee80211_rx_data *rx,
 			return RX_DROP_UNUSABLE;
 	}
 
-	ccmp_hdr2pn(pn, skb->data + hdrlen);
+	if (!(status->flag & RX_FLAG_PN_VALIDATED)) {
+		ccmp_hdr2pn(pn, skb->data + hdrlen);
 
-	queue = rx->security_idx;
+		queue = rx->security_idx;
 
-	if (memcmp(pn, key->u.ccmp.rx_pn[queue], IEEE80211_CCMP_PN_LEN) <= 0) {
-		key->u.ccmp.replays++;
-		return RX_DROP_UNUSABLE;
-	}
+		if (memcmp(pn, key->u.ccmp.rx_pn[queue],
+			   IEEE80211_CCMP_PN_LEN) <= 0) {
+			key->u.ccmp.replays++;
+			return RX_DROP_UNUSABLE;
+		}
 
-	if (!(status->flag & RX_FLAG_DECRYPTED)) {
-		u8 aad[2 * AES_BLOCK_SIZE];
-		u8 b_0[AES_BLOCK_SIZE];
-		/* hardware didn't decrypt/verify MIC */
-		ccmp_special_blocks(skb, pn, b_0, aad);
+		if (!(status->flag & RX_FLAG_DECRYPTED)) {
+			u8 aad[2 * AES_BLOCK_SIZE];
+			u8 b_0[AES_BLOCK_SIZE];
+			/* hardware didn't decrypt/verify MIC */
+			ccmp_special_blocks(skb, pn, b_0, aad);
+
+			if (ieee80211_aes_ccm_decrypt(
+				    key->u.ccmp.tfm, b_0, aad,
+				    skb->data + hdrlen + IEEE80211_CCMP_HDR_LEN,
+				    data_len,
+				    skb->data + skb->len - mic_len, mic_len))
+				return RX_DROP_UNUSABLE;
+		}
 
-		if (ieee80211_aes_ccm_decrypt(
-			    key->u.ccmp.tfm, b_0, aad,
-			    skb->data + hdrlen + IEEE80211_CCMP_HDR_LEN,
-			    data_len,
-			    skb->data + skb->len - mic_len, mic_len))
-			return RX_DROP_UNUSABLE;
+		memcpy(key->u.ccmp.rx_pn[queue], pn, IEEE80211_CCMP_PN_LEN);
 	}
 
-	memcpy(key->u.ccmp.rx_pn[queue], pn, IEEE80211_CCMP_PN_LEN);
-
 	/* Remove CCMP header and MIC */
 	if (pskb_trim(skb, skb->len - mic_len))
 		return RX_DROP_UNUSABLE;
@@ -739,31 +742,35 @@ ieee80211_crypto_gcmp_decrypt(struct ieee80211_rx_data *rx)
 			return RX_DROP_UNUSABLE;
 	}
 
-	gcmp_hdr2pn(pn, skb->data + hdrlen);
+	if (!(status->flag & RX_FLAG_PN_VALIDATED)) {
+		gcmp_hdr2pn(pn, skb->data + hdrlen);
 
-	queue = rx->security_idx;
+		queue = rx->security_idx;
 
-	if (memcmp(pn, key->u.gcmp.rx_pn[queue], IEEE80211_GCMP_PN_LEN) <= 0) {
-		key->u.gcmp.replays++;
-		return RX_DROP_UNUSABLE;
-	}
+		if (memcmp(pn, key->u.gcmp.rx_pn[queue],
+			   IEEE80211_GCMP_PN_LEN) <= 0) {
+			key->u.gcmp.replays++;
+			return RX_DROP_UNUSABLE;
+		}
 
-	if (!(status->flag & RX_FLAG_DECRYPTED)) {
-		u8 aad[2 * AES_BLOCK_SIZE];
-		u8 j_0[AES_BLOCK_SIZE];
-		/* hardware didn't decrypt/verify MIC */
-		gcmp_special_blocks(skb, pn, j_0, aad);
+		if (!(status->flag & RX_FLAG_DECRYPTED)) {
+			u8 aad[2 * AES_BLOCK_SIZE];
+			u8 j_0[AES_BLOCK_SIZE];
+			/* hardware didn't decrypt/verify MIC */
+			gcmp_special_blocks(skb, pn, j_0, aad);
+
+			if (ieee80211_aes_gcm_decrypt(
+				    key->u.gcmp.tfm, j_0, aad,
+				    skb->data + hdrlen + IEEE80211_GCMP_HDR_LEN,
+				    data_len,
+				    skb->data + skb->len -
+				    IEEE80211_GCMP_MIC_LEN))
+				return RX_DROP_UNUSABLE;
+		}
 
-		if (ieee80211_aes_gcm_decrypt(
-			    key->u.gcmp.tfm, j_0, aad,
-			    skb->data + hdrlen + IEEE80211_GCMP_HDR_LEN,
-			    data_len,
-			    skb->data + skb->len - IEEE80211_GCMP_MIC_LEN))
-			return RX_DROP_UNUSABLE;
+		memcpy(key->u.gcmp.rx_pn[queue], pn, IEEE80211_GCMP_PN_LEN);
 	}
 
-	memcpy(key->u.gcmp.rx_pn[queue], pn, IEEE80211_GCMP_PN_LEN);
-
 	/* Remove GCMP header and MIC */
 	if (pskb_trim(skb, skb->len - IEEE80211_GCMP_MIC_LEN))
 		return RX_DROP_UNUSABLE;
-- 
cgit v1.2.3


From b98fb44ffceeac717789e8f2fb3497e6b8c5c65b Mon Sep 17 00:00:00 2001
From: Arik Nemtsov <arik@wizery.com>
Date: Wed, 10 Jun 2015 20:42:59 +0300
Subject: mac80211: define TDLS wider BW support bits

Allow a device to specify support for the TDLS wider-bandwidth feature.
Indicate this support during TDLS setup in the ext-capab IE and set an
appropriate station flag when our TDLS peer supports it.
This feature gives TDLS peers the ability to use a wider channel than
the base width of the BSS. For instance VHT capable TDLS peers connected
on a 20MHz channel can extend the channel to 80MHz, if regulatory
considerations allow it.

Do not cap the bandwidth of such stations by the current BSS channel width
in mac80211.

Signed-off-by: Arik Nemtsov <arikx.nemtsov@intel.com>
Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h  |  4 ++++
 net/mac80211/cfg.c      |  6 ++++++
 net/mac80211/debugfs.c  |  1 +
 net/mac80211/sta_info.h |  3 +++
 net/mac80211/tdls.c     | 18 +++++++++++++-----
 net/mac80211/vht.c      |  8 ++++++--
 6 files changed, 33 insertions(+), 7 deletions(-)

(limited to 'include/net')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 4d3d2686f278..8f61a230c482 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -1887,6 +1887,9 @@ struct ieee80211_txq {
  * @IEEE80211_HW_SINGLE_SCAN_ON_ALL_BANDS: The HW supports scanning on all bands
  *	in one command, mac80211 doesn't have to run separate scans per band.
  *
+ * @IEEE80211_HW_TDLS_WIDER_BW: The device/driver supports wider bandwidth
+ *	than then BSS bandwidth for a TDLS link on the base channel.
+ *
  * @NUM_IEEE80211_HW_FLAGS: number of hardware flags, used for sizing arrays
  */
 enum ieee80211_hw_flags {
@@ -1919,6 +1922,7 @@ enum ieee80211_hw_flags {
 	IEEE80211_HW_CHANCTX_STA_CSA,
 	IEEE80211_HW_SUPPORTS_CLONED_SKBS,
 	IEEE80211_HW_SINGLE_SCAN_ON_ALL_BANDS,
+	IEEE80211_HW_TDLS_WIDER_BW,
 
 	/* keep last, obviously */
 	NUM_IEEE80211_HW_FLAGS
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index b145942a7624..a32575bf0546 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -1155,6 +1155,12 @@ static int sta_apply_parameters(struct ieee80211_local *local,
 	    params->ext_capab[3] & WLAN_EXT_CAPA4_TDLS_CHAN_SWITCH)
 		set_sta_flag(sta, WLAN_STA_TDLS_CHAN_SWITCH);
 
+	if (test_sta_flag(sta, WLAN_STA_TDLS_PEER) &&
+	    ieee80211_hw_check(&local->hw, TDLS_WIDER_BW) &&
+	    params->ext_capab_len >= 8 &&
+	    params->ext_capab[7] & WLAN_EXT_CAPA8_TDLS_WIDE_BW_ENABLED)
+		set_sta_flag(sta, WLAN_STA_TDLS_WIDER_BW);
+
 	if (params->sta_modify_mask & STATION_PARAM_APPLY_UAPSD) {
 		sta->sta.uapsd_queues = params->uapsd_queues;
 		sta->sta.max_sp = params->max_sp;
diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c
index 2c79d777f0e4..ced6bf3be8d6 100644
--- a/net/mac80211/debugfs.c
+++ b/net/mac80211/debugfs.c
@@ -122,6 +122,7 @@ static const char *hw_flag_names[NUM_IEEE80211_HW_FLAGS + 1] = {
 	FLAG(CHANCTX_STA_CSA),
 	FLAG(SUPPORTS_CLONED_SKBS),
 	FLAG(SINGLE_SCAN_ON_ALL_BANDS),
+	FLAG(TDLS_WIDER_BW),
 
 	/* keep last for the build bug below */
 	(void *)0x1
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index 9e568927c080..b9c1aaaa01ff 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -53,6 +53,8 @@
  * @WLAN_STA_TDLS_CHAN_SWITCH: This TDLS peer supports TDLS channel-switching
  * @WLAN_STA_TDLS_OFF_CHANNEL: The local STA is currently off-channel with this
  *	TDLS peer
+ * @WLAN_STA_TDLS_WIDER_BW: This TDLS peer supports working on a wider bw on
+ *	the BSS base channel.
  * @WLAN_STA_UAPSD: Station requested unscheduled SP while driver was
  *	keeping station in power-save mode, reply when the driver
  *	unblocks the station.
@@ -84,6 +86,7 @@ enum ieee80211_sta_info_flags {
 	WLAN_STA_TDLS_INITIATOR,
 	WLAN_STA_TDLS_CHAN_SWITCH,
 	WLAN_STA_TDLS_OFF_CHANNEL,
+	WLAN_STA_TDLS_WIDER_BW,
 	WLAN_STA_UAPSD,
 	WLAN_STA_SP,
 	WLAN_STA_4ADDR_EVENT,
diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c
index ad31b2dab4f5..fec1b336d03c 100644
--- a/net/mac80211/tdls.c
+++ b/net/mac80211/tdls.c
@@ -35,20 +35,28 @@ void ieee80211_tdls_peer_del_work(struct work_struct *wk)
 	mutex_unlock(&local->mtx);
 }
 
-static void ieee80211_tdls_add_ext_capab(struct ieee80211_local *local,
+static void ieee80211_tdls_add_ext_capab(struct ieee80211_sub_if_data *sdata,
 					 struct sk_buff *skb)
 {
-	u8 *pos = (void *)skb_put(skb, 7);
+	struct ieee80211_local *local = sdata->local;
 	bool chan_switch = local->hw.wiphy->features &
 			   NL80211_FEATURE_TDLS_CHANNEL_SWITCH;
+	bool wider_band = ieee80211_hw_check(&local->hw, TDLS_WIDER_BW);
+	enum ieee80211_band band = ieee80211_get_sdata_band(sdata);
+	struct ieee80211_supported_band *sband = local->hw.wiphy->bands[band];
+	bool vht = sband && sband->vht_cap.vht_supported;
+	u8 *pos = (void *)skb_put(skb, 10);
 
 	*pos++ = WLAN_EID_EXT_CAPABILITY;
-	*pos++ = 5; /* len */
+	*pos++ = 8; /* len */
 	*pos++ = 0x0;
 	*pos++ = 0x0;
 	*pos++ = 0x0;
 	*pos++ = chan_switch ? WLAN_EXT_CAPA4_TDLS_CHAN_SWITCH : 0;
 	*pos++ = WLAN_EXT_CAPA5_TDLS_ENABLED;
+	*pos++ = 0;
+	*pos++ = 0;
+	*pos++ = (vht && wider_band) ? WLAN_EXT_CAPA8_TDLS_WIDE_BW_ENABLED : 0;
 }
 
 static u8
@@ -320,7 +328,7 @@ ieee80211_tdls_add_setup_start_ies(struct ieee80211_sub_if_data *sdata,
 		offset = noffset;
 	}
 
-	ieee80211_tdls_add_ext_capab(local, skb);
+	ieee80211_tdls_add_ext_capab(sdata, skb);
 
 	/* add the QoS element if we support it */
 	if (local->hw.queues >= IEEE80211_NUM_ACS &&
@@ -784,7 +792,7 @@ ieee80211_tdls_build_mgmt_packet_data(struct ieee80211_sub_if_data *sdata,
 			       max(sizeof(struct ieee80211_mgmt),
 				   sizeof(struct ieee80211_tdls_data)) +
 			       50 + /* supported rates */
-			       7 + /* ext capab */
+			       10 + /* ext capab */
 			       26 + /* max(WMM-info, WMM-param) */
 			       2 + max(sizeof(struct ieee80211_ht_cap),
 				       sizeof(struct ieee80211_ht_operation)) +
diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c
index 80694d55db74..f05808d0d80f 100644
--- a/net/mac80211/vht.c
+++ b/net/mac80211/vht.c
@@ -308,11 +308,15 @@ enum ieee80211_sta_rx_bandwidth ieee80211_sta_cur_vht_bw(struct sta_info *sta)
 {
 	struct ieee80211_sub_if_data *sdata = sta->sdata;
 	enum ieee80211_sta_rx_bandwidth bw;
+	enum nl80211_chan_width bss_width = sdata->vif.bss_conf.chandef.width;
 
-	bw = ieee80211_chan_width_to_rx_bw(sdata->vif.bss_conf.chandef.width);
-	bw = min(bw, ieee80211_sta_cap_rx_bw(sta));
+	bw = ieee80211_sta_cap_rx_bw(sta);
 	bw = min(bw, sta->cur_max_bandwidth);
 
+	/* do not cap the BW of TDLS WIDER_BW peers by the bss */
+	if (!test_sta_flag(sta, WLAN_STA_TDLS_WIDER_BW))
+		bw = min(bw, ieee80211_chan_width_to_rx_bw(bss_width));
+
 	return bw;
 }
 
-- 
cgit v1.2.3


From f9a060f4b2003eb7350762e60dfc576447e44bad Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 12 Jun 2015 14:55:34 +0200
Subject: mac80211: add pointer for driver use to key

Some drivers may need to store data per key, for example for PN
validation. Allow this by adding a pointer to the struct that
the driver can assign.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/net')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 8f61a230c482..484cc14fb947 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -1492,8 +1492,10 @@ enum ieee80211_key_flags {
  * 	- Temporal Authenticator Rx MIC Key (64 bits)
  * @icv_len: The ICV length for this key type
  * @iv_len: The IV length for this key type
+ * @drv_priv: pointer for driver use
  */
 struct ieee80211_key_conf {
+	void *drv_priv;
 	atomic64_t tx_pn;
 	u32 cipher;
 	u8 icv_len;
-- 
cgit v1.2.3


From 33d8783c58427683b533664f67f8c4378ed64495 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Tue, 23 Jun 2015 17:47:05 +0200
Subject: cfg80211: allow mgmt_frame_register callback to sleep

This callback is currently not allowed to sleep, which makes it more
difficult to implement proper driver methods in mac80211 than it has
to be. Instead of doing asynchronous work here in mac80211, make it
possible for the callback to sleep by doing some asynchronous work
in cfg80211. This also enables improvements to other drivers, like
ath6kl, that would like to sleep in this callback.

While at it, also fix the code to call the driver on the implicit
unregistration when an interface is removed, and do that also when
a P2P-Device wdev is destroyed (otherwise we leak the structs.)

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h  |  3 +-
 net/wireless/core.c     |  5 ++++
 net/wireless/core.h     |  5 ++++
 net/wireless/mlme.c     | 75 +++++++++++++++++++++++++++++++++++++++----------
 net/wireless/rdev-ops.h |  2 ++
 5 files changed, 73 insertions(+), 17 deletions(-)

(limited to 'include/net')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index a741678f24a2..9a529c48f6ca 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -2369,8 +2369,7 @@ struct cfg80211_qos_map {
  *	method returns 0.)
  *
  * @mgmt_frame_register: Notify driver that a management frame type was
- *	registered. Note that this callback may not sleep, and cannot run
- *	concurrently with itself.
+ *	registered. The callback is allowed to sleep.
  *
  * @set_antenna: Set antenna configuration (tx_ant, rx_ant) on the device.
  *	Parameters are bitmaps of allowed antennas to use for TX/RX. Drivers may
diff --git a/net/wireless/core.c b/net/wireless/core.c
index 2a0bbd22854b..3893409dee95 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -407,6 +407,9 @@ use_default_name:
 	INIT_LIST_HEAD(&rdev->bss_list);
 	INIT_WORK(&rdev->scan_done_wk, __cfg80211_scan_done);
 	INIT_WORK(&rdev->sched_scan_results_wk, __cfg80211_sched_scan_results);
+	INIT_LIST_HEAD(&rdev->mlme_unreg);
+	spin_lock_init(&rdev->mlme_unreg_lock);
+	INIT_WORK(&rdev->mlme_unreg_wk, cfg80211_mlme_unreg_wk);
 	INIT_DELAYED_WORK(&rdev->dfs_update_channels_wk,
 			  cfg80211_dfs_channels_update_work);
 #ifdef CONFIG_CFG80211_WEXT
@@ -802,6 +805,7 @@ void wiphy_unregister(struct wiphy *wiphy)
 	cancel_delayed_work_sync(&rdev->dfs_update_channels_wk);
 	flush_work(&rdev->destroy_work);
 	flush_work(&rdev->sched_scan_stop_wk);
+	flush_work(&rdev->mlme_unreg_wk);
 
 #ifdef CONFIG_PM
 	if (rdev->wiphy.wowlan_config && rdev->ops->set_wakeup)
@@ -855,6 +859,7 @@ void cfg80211_unregister_wdev(struct wireless_dev *wdev)
 
 	switch (wdev->iftype) {
 	case NL80211_IFTYPE_P2P_DEVICE:
+		cfg80211_mlme_purge_registrations(wdev);
 		cfg80211_stop_p2p_device(rdev, wdev);
 		break;
 	default:
diff --git a/net/wireless/core.h b/net/wireless/core.h
index 311eef26bf88..b9d5bc8c148d 100644
--- a/net/wireless/core.h
+++ b/net/wireless/core.h
@@ -59,6 +59,10 @@ struct cfg80211_registered_device {
 	struct list_head beacon_registrations;
 	spinlock_t beacon_registrations_lock;
 
+	struct list_head mlme_unreg;
+	spinlock_t mlme_unreg_lock;
+	struct work_struct mlme_unreg_wk;
+
 	/* protected by RTNL only */
 	int num_running_ifaces;
 	int num_running_monitor_ifaces;
@@ -348,6 +352,7 @@ void cfg80211_mlme_down(struct cfg80211_registered_device *rdev,
 int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_pid,
 				u16 frame_type, const u8 *match_data,
 				int match_len);
+void cfg80211_mlme_unreg_wk(struct work_struct *wk);
 void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlpid);
 void cfg80211_mlme_purge_registrations(struct wireless_dev *wdev);
 int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev,
diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c
index 7aae329e2b4e..fb44fa3bf4ef 100644
--- a/net/wireless/mlme.c
+++ b/net/wireless/mlme.c
@@ -2,6 +2,7 @@
  * cfg80211 MLME SAP interface
  *
  * Copyright (c) 2009, Jouni Malinen <j@w1.fi>
+ * Copyright (c) 2015		Intel Deutschland GmbH
  */
 
 #include <linux/kernel.h>
@@ -389,6 +390,7 @@ void cfg80211_mlme_down(struct cfg80211_registered_device *rdev,
 
 struct cfg80211_mgmt_registration {
 	struct list_head list;
+	struct wireless_dev *wdev;
 
 	u32 nlportid;
 
@@ -399,6 +401,46 @@ struct cfg80211_mgmt_registration {
 	u8 match[];
 };
 
+static void
+cfg80211_process_mlme_unregistrations(struct cfg80211_registered_device *rdev)
+{
+	struct cfg80211_mgmt_registration *reg;
+
+	ASSERT_RTNL();
+
+	spin_lock_bh(&rdev->mlme_unreg_lock);
+	while ((reg = list_first_entry_or_null(&rdev->mlme_unreg,
+					       struct cfg80211_mgmt_registration,
+					       list))) {
+		list_del(&reg->list);
+		spin_unlock_bh(&rdev->mlme_unreg_lock);
+
+		if (rdev->ops->mgmt_frame_register) {
+			u16 frame_type = le16_to_cpu(reg->frame_type);
+
+			rdev_mgmt_frame_register(rdev, reg->wdev,
+						 frame_type, false);
+		}
+
+		kfree(reg);
+
+		spin_lock_bh(&rdev->mlme_unreg_lock);
+	}
+	spin_unlock_bh(&rdev->mlme_unreg_lock);
+}
+
+void cfg80211_mlme_unreg_wk(struct work_struct *wk)
+{
+	struct cfg80211_registered_device *rdev;
+
+	rdev = container_of(wk, struct cfg80211_registered_device,
+			    mlme_unreg_wk);
+
+	rtnl_lock();
+	cfg80211_process_mlme_unregistrations(rdev);
+	rtnl_unlock();
+}
+
 int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_portid,
 				u16 frame_type, const u8 *match_data,
 				int match_len)
@@ -449,11 +491,18 @@ int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_portid,
 	nreg->match_len = match_len;
 	nreg->nlportid = snd_portid;
 	nreg->frame_type = cpu_to_le16(frame_type);
+	nreg->wdev = wdev;
 	list_add(&nreg->list, &wdev->mgmt_registrations);
+	spin_unlock_bh(&wdev->mgmt_registrations_lock);
+
+	/* process all unregistrations to avoid driver confusion */
+	cfg80211_process_mlme_unregistrations(rdev);
 
 	if (rdev->ops->mgmt_frame_register)
 		rdev_mgmt_frame_register(rdev, wdev, frame_type, true);
 
+	return 0;
+
  out:
 	spin_unlock_bh(&wdev->mgmt_registrations_lock);
 
@@ -472,15 +521,12 @@ void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlportid)
 		if (reg->nlportid != nlportid)
 			continue;
 
-		if (rdev->ops->mgmt_frame_register) {
-			u16 frame_type = le16_to_cpu(reg->frame_type);
-
-			rdev_mgmt_frame_register(rdev, wdev,
-						 frame_type, false);
-		}
-
 		list_del(&reg->list);
-		kfree(reg);
+		spin_lock(&rdev->mlme_unreg_lock);
+		list_add_tail(&reg->list, &rdev->mlme_unreg);
+		spin_unlock(&rdev->mlme_unreg_lock);
+
+		schedule_work(&rdev->mlme_unreg_wk);
 	}
 
 	spin_unlock_bh(&wdev->mgmt_registrations_lock);
@@ -496,16 +542,15 @@ void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlportid)
 
 void cfg80211_mlme_purge_registrations(struct wireless_dev *wdev)
 {
-	struct cfg80211_mgmt_registration *reg, *tmp;
+	struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
 
 	spin_lock_bh(&wdev->mgmt_registrations_lock);
-
-	list_for_each_entry_safe(reg, tmp, &wdev->mgmt_registrations, list) {
-		list_del(&reg->list);
-		kfree(reg);
-	}
-
+	spin_lock(&rdev->mlme_unreg_lock);
+	list_splice_tail_init(&wdev->mgmt_registrations, &rdev->mlme_unreg);
+	spin_unlock(&rdev->mlme_unreg_lock);
 	spin_unlock_bh(&wdev->mgmt_registrations_lock);
+
+	cfg80211_process_mlme_unregistrations(rdev);
 }
 
 int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev,
diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h
index c6e83a7468c0..c23516d0f807 100644
--- a/net/wireless/rdev-ops.h
+++ b/net/wireless/rdev-ops.h
@@ -733,6 +733,8 @@ static inline void
 rdev_mgmt_frame_register(struct cfg80211_registered_device *rdev,
 			 struct wireless_dev *wdev, u16 frame_type, bool reg)
 {
+	might_sleep();
+
 	trace_rdev_mgmt_frame_register(&rdev->wiphy, wdev , frame_type, reg);
 	rdev->ops->mgmt_frame_register(&rdev->wiphy, wdev , frame_type, reg);
 	trace_rdev_return_void(&rdev->wiphy);
-- 
cgit v1.2.3


From b87a173e25d6bf5c26f13d329cdddf57dbd4061a Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 15 Jul 2015 14:21:41 +0200
Subject: cls_cgroup: factor out classid retrieval

Split out retrieving the cgroups net_cls classid retrieval into its
own function, so that it can be reused later on from other parts of
the traffic control subsystem. If there's no skb->sk, then the small
helper returns 0 as well, which in cls_cgroup terms means 'could not
classify'.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/cls_cgroup.h | 29 +++++++++++++++++++++++++++++
 net/sched/cls_cgroup.c   | 23 ++---------------------
 2 files changed, 31 insertions(+), 21 deletions(-)

(limited to 'include/net')

diff --git a/include/net/cls_cgroup.h b/include/net/cls_cgroup.h
index c15d39456e14..ccd6d8bffa4d 100644
--- a/include/net/cls_cgroup.h
+++ b/include/net/cls_cgroup.h
@@ -49,9 +49,38 @@ static inline void sock_update_classid(struct sock *sk)
 	if (classid != sk->sk_classid)
 		sk->sk_classid = classid;
 }
+
+static inline u32 task_get_classid(const struct sk_buff *skb)
+{
+	u32 classid = task_cls_state(current)->classid;
+
+	/* Due to the nature of the classifier it is required to ignore all
+	 * packets originating from softirq context as accessing `current'
+	 * would lead to false results.
+	 *
+	 * This test assumes that all callers of dev_queue_xmit() explicitly
+	 * disable bh. Knowing this, it is possible to detect softirq based
+	 * calls by looking at the number of nested bh disable calls because
+	 * softirqs always disables bh.
+	 */
+	if (in_serving_softirq()) {
+		/* If there is an sk_classid we'll use that. */
+		if (!skb->sk)
+			return 0;
+
+		classid = skb->sk->sk_classid;
+	}
+
+	return classid;
+}
 #else /* !CONFIG_CGROUP_NET_CLASSID */
 static inline void sock_update_classid(struct sock *sk)
 {
 }
+
+static inline u32 task_get_classid(const struct sk_buff *skb)
+{
+	return 0;
+}
 #endif /* CONFIG_CGROUP_NET_CLASSID */
 #endif  /* _NET_CLS_CGROUP_H */
diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c
index ea611b216412..4c85bd3a750c 100644
--- a/net/sched/cls_cgroup.c
+++ b/net/sched/cls_cgroup.c
@@ -30,35 +30,16 @@ static int cls_cgroup_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 			       struct tcf_result *res)
 {
 	struct cls_cgroup_head *head = rcu_dereference_bh(tp->root);
-	u32 classid;
-
-	classid = task_cls_state(current)->classid;
-
-	/*
-	 * Due to the nature of the classifier it is required to ignore all
-	 * packets originating from softirq context as accessing `current'
-	 * would lead to false results.
-	 *
-	 * This test assumes that all callers of dev_queue_xmit() explicitely
-	 * disable bh. Knowing this, it is possible to detect softirq based
-	 * calls by looking at the number of nested bh disable calls because
-	 * softirqs always disables bh.
-	 */
-	if (in_serving_softirq()) {
-		/* If there is an sk_classid we'll use that. */
-		if (!skb->sk)
-			return -1;
-		classid = skb->sk->sk_classid;
-	}
+	u32 classid = task_get_classid(skb);
 
 	if (!classid)
 		return -1;
-
 	if (!tcf_em_tree_match(skb, &head->ematches, NULL))
 		return -1;
 
 	res->classid = classid;
 	res->class = 0;
+
 	return tcf_exts_exec(skb, &head->exts, res);
 }
 
-- 
cgit v1.2.3


From 1a3b2ec93d4277b121979321b4024b438cb09504 Mon Sep 17 00:00:00 2001
From: Scott Feldman <sfeldma@gmail.com>
Date: Sat, 18 Jul 2015 18:24:50 -0700
Subject: switchdev: add offload_fwd_mark generator helper

skb->offload_fwd_mark and dev->offload_fwd_mark are 32-bit and should be
unique for device and may even be unique for a sub-set of ports within
device, so add switchdev helper function to generate unique marks based on
port's switch ID and group_ifindex.  group_ifindex would typically be the
container dev's ifindex, such as the bridge's ifindex.

The generator uses a global hash table to store offload_fwd_marks hashed by
{switch ID, group_ifindex} key.

Signed-off-by: Scott Feldman <sfeldma@gmail.com>
Acked-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/switchdev.h   |   9 ++++
 net/switchdev/switchdev.c | 103 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 112 insertions(+)

(limited to 'include/net')

diff --git a/include/net/switchdev.h b/include/net/switchdev.h
index d5671f118bfc..89da8934519b 100644
--- a/include/net/switchdev.h
+++ b/include/net/switchdev.h
@@ -157,6 +157,9 @@ int switchdev_port_fdb_del(struct ndmsg *ndm, struct nlattr *tb[],
 int switchdev_port_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
 			    struct net_device *dev,
 			    struct net_device *filter_dev, int idx);
+void switchdev_port_fwd_mark_set(struct net_device *dev,
+				 struct net_device *group_dev,
+				 bool joining);
 
 #else
 
@@ -271,6 +274,12 @@ static inline int switchdev_port_fdb_dump(struct sk_buff *skb,
 	return -EOPNOTSUPP;
 }
 
+static inline void switchdev_port_fwd_mark_set(struct net_device *dev,
+					       struct net_device *group_dev,
+					       bool joining)
+{
+}
+
 #endif
 
 #endif /* _LINUX_SWITCHDEV_H_ */
diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c
index 4e5bba50ccff..33bafa2e703e 100644
--- a/net/switchdev/switchdev.c
+++ b/net/switchdev/switchdev.c
@@ -1039,3 +1039,106 @@ void switchdev_fib_ipv4_abort(struct fib_info *fi)
 	fi->fib_net->ipv4.fib_offload_disabled = true;
 }
 EXPORT_SYMBOL_GPL(switchdev_fib_ipv4_abort);
+
+static bool switchdev_port_same_parent_id(struct net_device *a,
+					  struct net_device *b)
+{
+	struct switchdev_attr a_attr = {
+		.id = SWITCHDEV_ATTR_PORT_PARENT_ID,
+		.flags = SWITCHDEV_F_NO_RECURSE,
+	};
+	struct switchdev_attr b_attr = {
+		.id = SWITCHDEV_ATTR_PORT_PARENT_ID,
+		.flags = SWITCHDEV_F_NO_RECURSE,
+	};
+
+	if (switchdev_port_attr_get(a, &a_attr) ||
+	    switchdev_port_attr_get(b, &b_attr))
+		return false;
+
+	return netdev_phys_item_id_same(&a_attr.u.ppid, &b_attr.u.ppid);
+}
+
+static u32 switchdev_port_fwd_mark_get(struct net_device *dev,
+				       struct net_device *group_dev)
+{
+	struct net_device *lower_dev;
+	struct list_head *iter;
+
+	netdev_for_each_lower_dev(group_dev, lower_dev, iter) {
+		if (lower_dev == dev)
+			continue;
+		if (switchdev_port_same_parent_id(dev, lower_dev))
+			return lower_dev->offload_fwd_mark;
+		return switchdev_port_fwd_mark_get(dev, lower_dev);
+	}
+
+	return dev->ifindex;
+}
+
+static void switchdev_port_fwd_mark_reset(struct net_device *group_dev,
+					  u32 old_mark, u32 *reset_mark)
+{
+	struct net_device *lower_dev;
+	struct list_head *iter;
+
+	netdev_for_each_lower_dev(group_dev, lower_dev, iter) {
+		if (lower_dev->offload_fwd_mark == old_mark) {
+			if (!*reset_mark)
+				*reset_mark = lower_dev->ifindex;
+			lower_dev->offload_fwd_mark = *reset_mark;
+		}
+		switchdev_port_fwd_mark_reset(lower_dev, old_mark, reset_mark);
+	}
+}
+
+/**
+ *	switchdev_port_fwd_mark_set - Set port offload forwarding mark
+ *
+ *	@dev: port device
+ *	@group_dev: containing device
+ *	@joining: true if dev is joining group; false if leaving group
+ *
+ *	An ungrouped port's offload mark is just its ifindex.  A grouped
+ *	port's (member of a bridge, for example) offload mark is the ifindex
+ *	of one of the ports in the group with the same parent (switch) ID.
+ *	Ports on the same device in the same group will have the same mark.
+ *
+ *	Example:
+ *
+ *		br0		ifindex=9
+ *		  sw1p1		ifindex=2	mark=2
+ *		  sw1p2		ifindex=3	mark=2
+ *		  sw2p1		ifindex=4	mark=5
+ *		  sw2p2		ifindex=5	mark=5
+ *
+ *	If sw2p2 leaves the bridge, we'll have:
+ *
+ *		br0		ifindex=9
+ *		  sw1p1		ifindex=2	mark=2
+ *		  sw1p2		ifindex=3	mark=2
+ *		  sw2p1		ifindex=4	mark=4
+ *		sw2p2		ifindex=5	mark=5
+ */
+void switchdev_port_fwd_mark_set(struct net_device *dev,
+				 struct net_device *group_dev,
+				 bool joining)
+{
+	u32 mark = dev->ifindex;
+	u32 reset_mark = 0;
+
+	if (group_dev && joining) {
+		mark = switchdev_port_fwd_mark_get(dev, group_dev);
+	} else if (group_dev && !joining) {
+		if (dev->offload_fwd_mark == mark)
+			/* Ohoh, this port was the mark reference port,
+			 * but it's leaving the group, so reset the
+			 * mark for the remaining ports in the group.
+			 */
+			switchdev_port_fwd_mark_reset(group_dev, mark,
+						      &reset_mark);
+	}
+
+	dev->offload_fwd_mark = mark;
+}
+EXPORT_SYMBOL_GPL(switchdev_port_fwd_mark_set);
-- 
cgit v1.2.3


From 499a24256862714539e902c0499b67da2bb3ab72 Mon Sep 17 00:00:00 2001
From: Roopa Prabhu <roopa@cumulusnetworks.com>
Date: Tue, 21 Jul 2015 10:43:46 +0200
Subject: lwtunnel: infrastructure for handling light weight tunnels like mpls

Provides infrastructure to parse/dump/store encap information for
light weight tunnels like mpls. Encap information for such tunnels
is associated with fib routes.

This infrastructure is based on previous suggestions from
Eric Biederman to follow the xfrm infrastructure.

Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/lwtunnel.h      |   6 ++
 include/net/lwtunnel.h        | 132 +++++++++++++++++++++++++++++++
 include/uapi/linux/lwtunnel.h |  15 ++++
 net/Kconfig                   |   7 ++
 net/core/Makefile             |   1 +
 net/core/lwtunnel.c           | 179 ++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 340 insertions(+)
 create mode 100644 include/linux/lwtunnel.h
 create mode 100644 include/net/lwtunnel.h
 create mode 100644 include/uapi/linux/lwtunnel.h
 create mode 100644 net/core/lwtunnel.c

(limited to 'include/net')

diff --git a/include/linux/lwtunnel.h b/include/linux/lwtunnel.h
new file mode 100644
index 000000000000..97f32f8b4ae1
--- /dev/null
+++ b/include/linux/lwtunnel.h
@@ -0,0 +1,6 @@
+#ifndef _LINUX_LWTUNNEL_H_
+#define _LINUX_LWTUNNEL_H_
+
+#include <uapi/linux/lwtunnel.h>
+
+#endif /* _LINUX_LWTUNNEL_H_ */
diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h
new file mode 100644
index 000000000000..df24b3611ff4
--- /dev/null
+++ b/include/net/lwtunnel.h
@@ -0,0 +1,132 @@
+#ifndef __NET_LWTUNNEL_H
+#define __NET_LWTUNNEL_H 1
+
+#include <linux/lwtunnel.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/types.h>
+#include <net/route.h>
+
+#define LWTUNNEL_HASH_BITS   7
+#define LWTUNNEL_HASH_SIZE   (1 << LWTUNNEL_HASH_BITS)
+
+/* lw tunnel state flags */
+#define LWTUNNEL_STATE_OUTPUT_REDIRECT 0x1
+
+struct lwtunnel_state {
+	__u16		type;
+	__u16		flags;
+	atomic_t	refcnt;
+	int             len;
+	__u8            data[0];
+};
+
+struct lwtunnel_encap_ops {
+	int (*build_state)(struct net_device *dev, struct nlattr *encap,
+			   struct lwtunnel_state **ts);
+	int (*output)(struct sock *sk, struct sk_buff *skb);
+	int (*fill_encap)(struct sk_buff *skb,
+			  struct lwtunnel_state *lwtstate);
+	int (*get_encap_size)(struct lwtunnel_state *lwtstate);
+	int (*cmp_encap)(struct lwtunnel_state *a, struct lwtunnel_state *b);
+};
+
+extern const struct lwtunnel_encap_ops __rcu *
+		lwtun_encaps[LWTUNNEL_ENCAP_MAX+1];
+
+#ifdef CONFIG_LWTUNNEL
+static inline void lwtunnel_state_get(struct lwtunnel_state *lws)
+{
+	atomic_inc(&lws->refcnt);
+}
+
+static inline void lwtunnel_state_put(struct lwtunnel_state *lws)
+{
+	if (!lws)
+		return;
+
+	if (atomic_dec_and_test(&lws->refcnt))
+		kfree(lws);
+}
+
+static inline bool lwtunnel_output_redirect(struct lwtunnel_state *lwtstate)
+{
+	if (lwtstate && (lwtstate->flags & LWTUNNEL_STATE_OUTPUT_REDIRECT))
+		return true;
+
+	return false;
+}
+
+int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *op,
+			   unsigned int num);
+int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *op,
+			   unsigned int num);
+int lwtunnel_build_state(struct net_device *dev, u16 encap_type,
+			 struct nlattr *encap,
+			 struct lwtunnel_state **lws);
+int lwtunnel_fill_encap(struct sk_buff *skb,
+			struct lwtunnel_state *lwtstate);
+int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate);
+struct lwtunnel_state *lwtunnel_state_alloc(int hdr_len);
+int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b);
+
+#else
+
+static inline void lwtunnel_state_get(struct lwtunnel_state *lws)
+{
+}
+
+static inline void lwtunnel_state_put(struct lwtunnel_state *lws)
+{
+}
+
+static inline bool lwtunnel_output_redirect(struct lwtunnel_state *lwtstate)
+{
+	return false;
+}
+
+static inline int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *op,
+					 unsigned int num)
+{
+	return -EOPNOTSUPP;
+
+}
+
+static inline int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *op,
+					 unsigned int num)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int lwtunnel_build_state(struct net_device *dev, u16 encap_type,
+				       struct nlattr *encap,
+				       struct lwtunnel_state **lws)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int lwtunnel_fill_encap(struct sk_buff *skb,
+				      struct lwtunnel_state *lwtstate)
+{
+	return 0;
+}
+
+static inline int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate)
+{
+	return 0;
+}
+
+static inline struct lwtunnel_state *lwtunnel_state_alloc(int hdr_len)
+{
+	return NULL;
+}
+
+static inline int lwtunnel_cmp_encap(struct lwtunnel_state *a,
+				     struct lwtunnel_state *b)
+{
+	return 0;
+}
+
+#endif
+
+#endif /* __NET_LWTUNNEL_H */
diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h
new file mode 100644
index 000000000000..aa611d931a31
--- /dev/null
+++ b/include/uapi/linux/lwtunnel.h
@@ -0,0 +1,15 @@
+#ifndef _UAPI_LWTUNNEL_H_
+#define _UAPI_LWTUNNEL_H_
+
+#include <linux/types.h>
+
+enum lwtunnel_encap_types {
+	LWTUNNEL_ENCAP_NONE,
+	LWTUNNEL_ENCAP_MPLS,
+	__LWTUNNEL_ENCAP_MAX,
+};
+
+#define LWTUNNEL_ENCAP_MAX (__LWTUNNEL_ENCAP_MAX - 1)
+
+
+#endif /* _UAPI_LWTUNNEL_H_ */
diff --git a/net/Kconfig b/net/Kconfig
index 57a7c5af3175..7021c1bf44d6 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -374,6 +374,13 @@ source "net/caif/Kconfig"
 source "net/ceph/Kconfig"
 source "net/nfc/Kconfig"
 
+config LWTUNNEL
+	bool "Network light weight tunnels"
+	---help---
+	  This feature provides an infrastructure to support light weight
+	  tunnels like mpls. There is no netdevice associated with a light
+	  weight tunnel endpoint. Tunnel encapsulation parameters are stored
+	  with light weight tunnel state associated with fib routes.
 
 endif   # if NET
 
diff --git a/net/core/Makefile b/net/core/Makefile
index fec0856dd6c0..086b01fbe1bd 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -23,3 +23,4 @@ obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o
 obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o
 obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o
 obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o
+obj-$(CONFIG_LWTUNNEL) += lwtunnel.o
diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c
new file mode 100644
index 000000000000..d7ae3a235b4b
--- /dev/null
+++ b/net/core/lwtunnel.c
@@ -0,0 +1,179 @@
+/*
+ * lwtunnel	Infrastructure for light weight tunnels like mpls
+ *
+ * Authors:	Roopa Prabhu, <roopa@cumulusnetworks.com>
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/capability.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/lwtunnel.h>
+#include <linux/in.h>
+#include <linux/init.h>
+#include <linux/err.h>
+
+#include <net/lwtunnel.h>
+#include <net/rtnetlink.h>
+
+struct lwtunnel_state *lwtunnel_state_alloc(int encap_len)
+{
+	struct lwtunnel_state *lws;
+
+	lws = kzalloc(sizeof(*lws) + encap_len, GFP_ATOMIC);
+
+	return lws;
+}
+EXPORT_SYMBOL(lwtunnel_state_alloc);
+
+const struct lwtunnel_encap_ops __rcu *
+		lwtun_encaps[LWTUNNEL_ENCAP_MAX + 1] __read_mostly;
+
+int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *ops,
+			   unsigned int num)
+{
+	if (num > LWTUNNEL_ENCAP_MAX)
+		return -ERANGE;
+
+	return !cmpxchg((const struct lwtunnel_encap_ops **)
+			&lwtun_encaps[num],
+			NULL, ops) ? 0 : -1;
+}
+EXPORT_SYMBOL(lwtunnel_encap_add_ops);
+
+int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *ops,
+			   unsigned int encap_type)
+{
+	int ret;
+
+	if (encap_type == LWTUNNEL_ENCAP_NONE ||
+	    encap_type > LWTUNNEL_ENCAP_MAX)
+		return -ERANGE;
+
+	ret = (cmpxchg((const struct lwtunnel_encap_ops **)
+		       &lwtun_encaps[encap_type],
+		       ops, NULL) == ops) ? 0 : -1;
+
+	synchronize_net();
+
+	return ret;
+}
+EXPORT_SYMBOL(lwtunnel_encap_del_ops);
+
+int lwtunnel_build_state(struct net_device *dev, u16 encap_type,
+			 struct nlattr *encap, struct lwtunnel_state **lws)
+{
+	const struct lwtunnel_encap_ops *ops;
+	int ret = -EINVAL;
+
+	if (encap_type == LWTUNNEL_ENCAP_NONE ||
+	    encap_type > LWTUNNEL_ENCAP_MAX)
+		return ret;
+
+	ret = -EOPNOTSUPP;
+	rcu_read_lock();
+	ops = rcu_dereference(lwtun_encaps[encap_type]);
+	if (likely(ops && ops->build_state))
+		ret = ops->build_state(dev, encap, lws);
+	rcu_read_unlock();
+
+	return ret;
+}
+EXPORT_SYMBOL(lwtunnel_build_state);
+
+int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate)
+{
+	const struct lwtunnel_encap_ops *ops;
+	struct nlattr *nest;
+	int ret = -EINVAL;
+
+	if (!lwtstate)
+		return 0;
+
+	if (lwtstate->type == LWTUNNEL_ENCAP_NONE ||
+	    lwtstate->type > LWTUNNEL_ENCAP_MAX)
+		return 0;
+
+	ret = -EOPNOTSUPP;
+	nest = nla_nest_start(skb, RTA_ENCAP);
+	rcu_read_lock();
+	ops = rcu_dereference(lwtun_encaps[lwtstate->type]);
+	if (likely(ops && ops->fill_encap))
+		ret = ops->fill_encap(skb, lwtstate);
+	rcu_read_unlock();
+
+	if (ret)
+		goto nla_put_failure;
+	nla_nest_end(skb, nest);
+	ret = nla_put_u16(skb, RTA_ENCAP_TYPE, lwtstate->type);
+	if (ret)
+		goto nla_put_failure;
+
+	return 0;
+
+nla_put_failure:
+	nla_nest_cancel(skb, nest);
+
+	return (ret == -EOPNOTSUPP ? 0 : ret);
+}
+EXPORT_SYMBOL(lwtunnel_fill_encap);
+
+int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate)
+{
+	const struct lwtunnel_encap_ops *ops;
+	int ret = 0;
+
+	if (!lwtstate)
+		return 0;
+
+	if (lwtstate->type == LWTUNNEL_ENCAP_NONE ||
+	    lwtstate->type > LWTUNNEL_ENCAP_MAX)
+		return 0;
+
+	rcu_read_lock();
+	ops = rcu_dereference(lwtun_encaps[lwtstate->type]);
+	if (likely(ops && ops->get_encap_size))
+		ret = nla_total_size(ops->get_encap_size(lwtstate));
+	rcu_read_unlock();
+
+	return ret;
+}
+EXPORT_SYMBOL(lwtunnel_get_encap_size);
+
+int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b)
+{
+	const struct lwtunnel_encap_ops *ops;
+	int ret = 0;
+
+	if (!a && !b)
+		return 0;
+
+	if (!a || !b)
+		return 1;
+
+	if (a->type != b->type)
+		return 1;
+
+	if (a->type == LWTUNNEL_ENCAP_NONE ||
+	    a->type > LWTUNNEL_ENCAP_MAX)
+		return 0;
+
+	rcu_read_lock();
+	ops = rcu_dereference(lwtun_encaps[a->type]);
+	if (likely(ops && ops->cmp_encap))
+		ret = ops->cmp_encap(a, b);
+	rcu_read_unlock();
+
+	return ret;
+}
+EXPORT_SYMBOL(lwtunnel_cmp_encap);
-- 
cgit v1.2.3


From 571e722676fe386bb66f72a75b64a6ebf535c077 Mon Sep 17 00:00:00 2001
From: Roopa Prabhu <roopa@cumulusnetworks.com>
Date: Tue, 21 Jul 2015 10:43:47 +0200
Subject: ipv4: support for fib route lwtunnel encap attributes

This patch adds support in ipv4 fib functions to parse user
provided encap attributes and attach encap state data to fib_nh
and rtable.

Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h     |  5 ++-
 include/net/route.h      |  1 +
 net/ipv4/fib_frontend.c  |  8 ++++
 net/ipv4/fib_semantics.c | 96 +++++++++++++++++++++++++++++++++++++++++++++++-
 net/ipv4/route.c         | 16 +++++++-
 5 files changed, 122 insertions(+), 4 deletions(-)

(limited to 'include/net')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 49c142bdf01e..5e0196084f1e 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -44,7 +44,9 @@ struct fib_config {
 	u32			fc_flow;
 	u32			fc_nlflags;
 	struct nl_info		fc_nlinfo;
- };
+	struct nlattr		*fc_encap;
+	u16			fc_encap_type;
+};
 
 struct fib_info;
 struct rtable;
@@ -89,6 +91,7 @@ struct fib_nh {
 	struct rtable __rcu * __percpu *nh_pcpu_rth_output;
 	struct rtable __rcu	*nh_rth_input;
 	struct fnhe_hash_bucket	__rcu *nh_exceptions;
+	struct lwtunnel_state	*nh_lwtstate;
 };
 
 /*
diff --git a/include/net/route.h b/include/net/route.h
index fe22d03afb6a..2d45f419477f 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -66,6 +66,7 @@ struct rtable {
 
 	struct list_head	rt_uncached;
 	struct uncached_list	*rt_uncached_list;
+	struct lwtunnel_state   *rt_lwtstate;
 };
 
 static inline bool rt_is_input_route(const struct rtable *rt)
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 6bbc54940eb4..9b2019cc3586 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -591,6 +591,8 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
 	[RTA_METRICS]		= { .type = NLA_NESTED },
 	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
 	[RTA_FLOW]		= { .type = NLA_U32 },
+	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
+	[RTA_ENCAP]		= { .type = NLA_NESTED },
 };
 
 static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
@@ -656,6 +658,12 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
 		case RTA_TABLE:
 			cfg->fc_table = nla_get_u32(attr);
 			break;
+		case RTA_ENCAP:
+			cfg->fc_encap = attr;
+			break;
+		case RTA_ENCAP_TYPE:
+			cfg->fc_encap_type = nla_get_u16(attr);
+			break;
 		}
 	}
 
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index c7358ea4ae93..6754c64b2fe0 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -42,6 +42,7 @@
 #include <net/ip_fib.h>
 #include <net/netlink.h>
 #include <net/nexthop.h>
+#include <net/lwtunnel.h>
 
 #include "fib_lookup.h"
 
@@ -208,6 +209,7 @@ static void free_fib_info_rcu(struct rcu_head *head)
 	change_nexthops(fi) {
 		if (nexthop_nh->nh_dev)
 			dev_put(nexthop_nh->nh_dev);
+		lwtunnel_state_put(nexthop_nh->nh_lwtstate);
 		free_nh_exceptions(nexthop_nh);
 		rt_fibinfo_free_cpus(nexthop_nh->nh_pcpu_rth_output);
 		rt_fibinfo_free(&nexthop_nh->nh_rth_input);
@@ -266,6 +268,7 @@ static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
 #ifdef CONFIG_IP_ROUTE_CLASSID
 		    nh->nh_tclassid != onh->nh_tclassid ||
 #endif
+		    lwtunnel_cmp_encap(nh->nh_lwtstate, onh->nh_lwtstate) ||
 		    ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_COMPARE_MASK))
 			return -1;
 		onh++;
@@ -366,6 +369,7 @@ static inline size_t fib_nlmsg_size(struct fib_info *fi)
 	payload += nla_total_size((RTAX_MAX * nla_total_size(4)));
 
 	if (fi->fib_nhs) {
+		size_t nh_encapsize = 0;
 		/* Also handles the special case fib_nhs == 1 */
 
 		/* each nexthop is packed in an attribute */
@@ -374,8 +378,21 @@ static inline size_t fib_nlmsg_size(struct fib_info *fi)
 		/* may contain flow and gateway attribute */
 		nhsize += 2 * nla_total_size(4);
 
+		/* grab encap info */
+		for_nexthops(fi) {
+			if (nh->nh_lwtstate) {
+				/* RTA_ENCAP_TYPE */
+				nh_encapsize += lwtunnel_get_encap_size(
+						nh->nh_lwtstate);
+				/* RTA_ENCAP */
+				nh_encapsize +=  nla_total_size(2);
+			}
+		} endfor_nexthops(fi);
+
 		/* all nexthops are packed in a nested attribute */
-		payload += nla_total_size(fi->fib_nhs * nhsize);
+		payload += nla_total_size((fi->fib_nhs * nhsize) +
+					  nh_encapsize);
+
 	}
 
 	return payload;
@@ -452,6 +469,9 @@ static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining)
 static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
 		       int remaining, struct fib_config *cfg)
 {
+	struct net *net = cfg->fc_nlinfo.nl_net;
+	int ret;
+
 	change_nexthops(fi) {
 		int attrlen;
 
@@ -475,18 +495,66 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
 			if (nexthop_nh->nh_tclassid)
 				fi->fib_net->ipv4.fib_num_tclassid_users++;
 #endif
+			nla = nla_find(attrs, attrlen, RTA_ENCAP);
+			if (nla) {
+				struct lwtunnel_state *lwtstate;
+				struct net_device *dev = NULL;
+				struct nlattr *nla_entype;
+
+				nla_entype = nla_find(attrs, attrlen,
+						      RTA_ENCAP_TYPE);
+				if (!nla_entype)
+					goto err_inval;
+				if (cfg->fc_oif)
+					dev = __dev_get_by_index(net, cfg->fc_oif);
+				ret = lwtunnel_build_state(dev, nla_get_u16(
+							   nla_entype),
+							   nla, &lwtstate);
+				if (ret)
+					goto errout;
+				lwtunnel_state_get(lwtstate);
+				nexthop_nh->nh_lwtstate = lwtstate;
+			}
 		}
 
 		rtnh = rtnh_next(rtnh, &remaining);
 	} endfor_nexthops(fi);
 
 	return 0;
+
+err_inval:
+	ret = -EINVAL;
+
+errout:
+	return ret;
 }
 
 #endif
 
+int fib_encap_match(struct net *net, u16 encap_type,
+		    struct nlattr *encap,
+		    int oif, const struct fib_nh *nh)
+{
+	struct lwtunnel_state *lwtstate;
+	struct net_device *dev = NULL;
+	int ret;
+
+	if (encap_type == LWTUNNEL_ENCAP_NONE)
+		return 0;
+
+	if (oif)
+		dev = __dev_get_by_index(net, oif);
+	ret = lwtunnel_build_state(dev, encap_type,
+				   encap, &lwtstate);
+	if (!ret)
+		return lwtunnel_cmp_encap(lwtstate, nh->nh_lwtstate);
+
+	return 0;
+}
+
 int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
 {
+	struct net *net = cfg->fc_nlinfo.nl_net;
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 	struct rtnexthop *rtnh;
 	int remaining;
@@ -496,6 +564,12 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
 		return 1;
 
 	if (cfg->fc_oif || cfg->fc_gw) {
+		if (cfg->fc_encap) {
+			if (fib_encap_match(net, cfg->fc_encap_type,
+					    cfg->fc_encap, cfg->fc_oif,
+					    fi->fib_nh))
+			    return 1;
+		}
 		if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) &&
 		    (!cfg->fc_gw  || cfg->fc_gw == fi->fib_nh->nh_gw))
 			return 0;
@@ -882,6 +956,22 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
 	} else {
 		struct fib_nh *nh = fi->fib_nh;
 
+		if (cfg->fc_encap) {
+			struct lwtunnel_state *lwtstate;
+			struct net_device *dev = NULL;
+
+			if (cfg->fc_encap_type == LWTUNNEL_ENCAP_NONE)
+				goto err_inval;
+			if (cfg->fc_oif)
+				dev = __dev_get_by_index(net, cfg->fc_oif);
+			err = lwtunnel_build_state(dev, cfg->fc_encap_type,
+						   cfg->fc_encap, &lwtstate);
+			if (err)
+				goto failure;
+
+			lwtunnel_state_get(lwtstate);
+			nh->nh_lwtstate = lwtstate;
+		}
 		nh->nh_oif = cfg->fc_oif;
 		nh->nh_gw = cfg->fc_gw;
 		nh->nh_flags = cfg->fc_flags;
@@ -1055,6 +1145,8 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
 		    nla_put_u32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid))
 			goto nla_put_failure;
 #endif
+		if (fi->fib_nh->nh_lwtstate)
+			lwtunnel_fill_encap(skb, fi->fib_nh->nh_lwtstate);
 	}
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 	if (fi->fib_nhs > 1) {
@@ -1090,6 +1182,8 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
 			    nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid))
 				goto nla_put_failure;
 #endif
+			if (nh->nh_lwtstate)
+				lwtunnel_fill_encap(skb, nh->nh_lwtstate);
 			/* length of rtnetlink header + attributes */
 			rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh;
 		} endfor_nexthops(fi);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 04c83de4f79e..226570ba1ced 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -102,6 +102,7 @@
 #include <net/tcp.h>
 #include <net/icmp.h>
 #include <net/xfrm.h>
+#include <net/lwtunnel.h>
 #include <net/netevent.h>
 #include <net/rtnetlink.h>
 #ifdef CONFIG_SYSCTL
@@ -1355,6 +1356,7 @@ static void ipv4_dst_destroy(struct dst_entry *dst)
 		list_del(&rt->rt_uncached);
 		spin_unlock_bh(&ul->lock);
 	}
+	lwtunnel_state_put(rt->rt_lwtstate);
 }
 
 void rt_flush_dev(struct net_device *dev)
@@ -1403,6 +1405,12 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
 #ifdef CONFIG_IP_ROUTE_CLASSID
 		rt->dst.tclassid = nh->nh_tclassid;
 #endif
+		if (nh->nh_lwtstate) {
+			lwtunnel_state_get(nh->nh_lwtstate);
+			rt->rt_lwtstate = nh->nh_lwtstate;
+		} else {
+			rt->rt_lwtstate = NULL;
+		}
 		if (unlikely(fnhe))
 			cached = rt_bind_exception(rt, fnhe, daddr);
 		else if (!(rt->dst.flags & DST_NOCACHE))
@@ -1488,6 +1496,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	rth->rt_gateway	= 0;
 	rth->rt_uses_gateway = 0;
 	INIT_LIST_HEAD(&rth->rt_uncached);
+	rth->rt_lwtstate = NULL;
 	if (our) {
 		rth->dst.input= ip_local_deliver;
 		rth->rt_flags |= RTCF_LOCAL;
@@ -1617,6 +1626,7 @@ static int __mkroute_input(struct sk_buff *skb,
 	rth->rt_gateway	= 0;
 	rth->rt_uses_gateway = 0;
 	INIT_LIST_HEAD(&rth->rt_uncached);
+	rth->rt_lwtstate = NULL;
 	RT_CACHE_STAT_INC(in_slow_tot);
 
 	rth->dst.input = ip_forward;
@@ -1791,6 +1801,8 @@ local_input:
 	rth->rt_gateway	= 0;
 	rth->rt_uses_gateway = 0;
 	INIT_LIST_HEAD(&rth->rt_uncached);
+	rth->rt_lwtstate = NULL;
+
 	RT_CACHE_STAT_INC(in_slow_tot);
 	if (res.type == RTN_UNREACHABLE) {
 		rth->dst.input= ip_error;
@@ -1980,7 +1992,7 @@ add:
 	rth->rt_gateway = 0;
 	rth->rt_uses_gateway = 0;
 	INIT_LIST_HEAD(&rth->rt_uncached);
-
+	rth->rt_lwtstate = NULL;
 	RT_CACHE_STAT_INC(out_slow_tot);
 
 	if (flags & RTCF_LOCAL)
@@ -2260,7 +2272,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
 		rt->rt_uses_gateway = ort->rt_uses_gateway;
 
 		INIT_LIST_HEAD(&rt->rt_uncached);
-
+		rt->rt_lwtstate = NULL;
 		dst_free(new);
 	}
 
-- 
cgit v1.2.3


From 19e42e45150672124b6a4341e2bc7982d247f0ac Mon Sep 17 00:00:00 2001
From: Roopa Prabhu <roopa@cumulusnetworks.com>
Date: Tue, 21 Jul 2015 10:43:48 +0200
Subject: ipv6: support for fib route lwtunnel encap attributes

This patch adds support in ipv6 fib functions to parse Netlink
RTA encap attributes and attach encap state data to rt6_info.

Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h |  3 +++
 net/ipv6/ip6_fib.c    |  2 ++
 net/ipv6/route.c      | 33 ++++++++++++++++++++++++++++++---
 3 files changed, 35 insertions(+), 3 deletions(-)

(limited to 'include/net')

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 3b76849c190f..276328e3daa6 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -51,6 +51,8 @@ struct fib6_config {
 	struct nlattr	*fc_mp;
 
 	struct nl_info	fc_nlinfo;
+	struct nlattr	*fc_encap;
+	u16		fc_encap_type;
 };
 
 struct fib6_node {
@@ -131,6 +133,7 @@ struct rt6_info {
 	/* more non-fragment space at head required */
 	unsigned short			rt6i_nfheader_len;
 	u8				rt6i_protocol;
+	struct lwtunnel_state		*rt6i_lwtstate;
 };
 
 static inline struct inet6_dev *ip6_dst_idev(struct dst_entry *dst)
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 55d19861ab20..d715f2e0c4e7 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -32,6 +32,7 @@
 #include <net/ipv6.h>
 #include <net/ndisc.h>
 #include <net/addrconf.h>
+#include <net/lwtunnel.h>
 
 #include <net/ip6_fib.h>
 #include <net/ip6_route.h>
@@ -177,6 +178,7 @@ static void rt6_free_pcpu(struct rt6_info *non_pcpu_rt)
 static void rt6_release(struct rt6_info *rt)
 {
 	if (atomic_dec_and_test(&rt->rt6i_ref)) {
+		lwtunnel_state_put(rt->rt6i_lwtstate);
 		rt6_free_pcpu(rt);
 		dst_free(&rt->dst);
 	}
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 6090969937f8..b3431b79dfb1 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -58,6 +58,7 @@
 #include <net/netevent.h>
 #include <net/netlink.h>
 #include <net/nexthop.h>
+#include <net/lwtunnel.h>
 
 #include <asm/uaccess.h>
 
@@ -1770,6 +1771,17 @@ int ip6_route_add(struct fib6_config *cfg)
 
 	rt->dst.output = ip6_output;
 
+	if (cfg->fc_encap) {
+		struct lwtunnel_state *lwtstate;
+
+		err = lwtunnel_build_state(dev, cfg->fc_encap_type,
+					   cfg->fc_encap, &lwtstate);
+		if (err)
+			goto out;
+		lwtunnel_state_get(lwtstate);
+		rt->rt6i_lwtstate = lwtstate;
+	}
+
 	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
 	rt->rt6i_dst.plen = cfg->fc_dst_len;
 	if (rt->rt6i_dst.plen == 128)
@@ -2595,6 +2607,8 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
 	[RTA_METRICS]           = { .type = NLA_NESTED },
 	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
 	[RTA_PREF]              = { .type = NLA_U8 },
+	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
+	[RTA_ENCAP]		= { .type = NLA_NESTED },
 };
 
 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
@@ -2689,6 +2703,12 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
 		cfg->fc_flags |= RTF_PREF(pref);
 	}
 
+	if (tb[RTA_ENCAP])
+		cfg->fc_encap = tb[RTA_ENCAP];
+
+	if (tb[RTA_ENCAP_TYPE])
+		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
+
 	err = 0;
 errout:
 	return err;
@@ -2721,6 +2741,10 @@ beginning:
 				r_cfg.fc_gateway = nla_get_in6_addr(nla);
 				r_cfg.fc_flags |= RTF_GATEWAY;
 			}
+			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
+			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
+			if (nla)
+				r_cfg.fc_encap_type = nla_get_u16(nla);
 		}
 		err = add ? ip6_route_add(&r_cfg) : ip6_route_del(&r_cfg);
 		if (err) {
@@ -2783,7 +2807,7 @@ static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
 		return ip6_route_add(&cfg);
 }
 
-static inline size_t rt6_nlmsg_size(void)
+static inline size_t rt6_nlmsg_size(struct rt6_info *rt)
 {
 	return NLMSG_ALIGN(sizeof(struct rtmsg))
 	       + nla_total_size(16) /* RTA_SRC */
@@ -2797,7 +2821,8 @@ static inline size_t rt6_nlmsg_size(void)
 	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
 	       + nla_total_size(sizeof(struct rta_cacheinfo))
 	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
-	       + nla_total_size(1); /* RTA_PREF */
+	       + nla_total_size(1) /* RTA_PREF */
+	       + lwtunnel_get_encap_size(rt->rt6i_lwtstate);
 }
 
 static int rt6_fill_node(struct net *net,
@@ -2945,6 +2970,8 @@ static int rt6_fill_node(struct net *net,
 	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
 		goto nla_put_failure;
 
+	lwtunnel_fill_encap(skb, rt->rt6i_lwtstate);
+
 	nlmsg_end(skb, nlh);
 	return 0;
 
@@ -3071,7 +3098,7 @@ void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
 	err = -ENOBUFS;
 	seq = info->nlh ? info->nlh->nlmsg_seq : 0;
 
-	skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
+	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
 	if (!skb)
 		goto errout;
 
-- 
cgit v1.2.3


From ffce41962ef64b8e685e5b621caf24bf381addd9 Mon Sep 17 00:00:00 2001
From: Roopa Prabhu <roopa@cumulusnetworks.com>
Date: Tue, 21 Jul 2015 10:43:49 +0200
Subject: lwtunnel: support dst output redirect function

This patch introduces lwtunnel_output function to call corresponding
lwtunnels output function to xmit the packet.

It adds two variants lwtunnel_output and lwtunnel_output6 for ipv4 and
ipv6 respectively today. But this is subject to change when lwtstate will
reside in dst or dst_metadata (as per upstream discussions).

Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/lwtunnel.h | 12 +++++++++++
 net/core/lwtunnel.c    | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 68 insertions(+)

(limited to 'include/net')

diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h
index df24b3611ff4..918e03c1dafa 100644
--- a/include/net/lwtunnel.h
+++ b/include/net/lwtunnel.h
@@ -69,6 +69,8 @@ int lwtunnel_fill_encap(struct sk_buff *skb,
 int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate);
 struct lwtunnel_state *lwtunnel_state_alloc(int hdr_len);
 int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b);
+int lwtunnel_output(struct sock *sk, struct sk_buff *skb);
+int lwtunnel_output6(struct sock *sk, struct sk_buff *skb);
 
 #else
 
@@ -127,6 +129,16 @@ static inline int lwtunnel_cmp_encap(struct lwtunnel_state *a,
 	return 0;
 }
 
+static inline int lwtunnel_output(struct sock *sk, struct sk_buff *skb)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int lwtunnel_output6(struct sock *sk, struct sk_buff *skb)
+{
+	return -EOPNOTSUPP;
+}
+
 #endif
 
 #endif /* __NET_LWTUNNEL_H */
diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c
index d7ae3a235b4b..bb58826c708d 100644
--- a/net/core/lwtunnel.c
+++ b/net/core/lwtunnel.c
@@ -25,6 +25,7 @@
 
 #include <net/lwtunnel.h>
 #include <net/rtnetlink.h>
+#include <net/ip6_fib.h>
 
 struct lwtunnel_state *lwtunnel_state_alloc(int encap_len)
 {
@@ -177,3 +178,58 @@ int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b)
 	return ret;
 }
 EXPORT_SYMBOL(lwtunnel_cmp_encap);
+
+int __lwtunnel_output(struct sock *sk, struct sk_buff *skb,
+		      struct lwtunnel_state *lwtstate)
+{
+	const struct lwtunnel_encap_ops *ops;
+	int ret = -EINVAL;
+
+	if (!lwtstate)
+		goto drop;
+
+	if (lwtstate->type == LWTUNNEL_ENCAP_NONE ||
+	    lwtstate->type > LWTUNNEL_ENCAP_MAX)
+		return 0;
+
+	ret = -EOPNOTSUPP;
+	rcu_read_lock();
+	ops = rcu_dereference(lwtun_encaps[lwtstate->type]);
+	if (likely(ops && ops->output))
+		ret = ops->output(sk, skb);
+	rcu_read_unlock();
+
+	if (ret == -EOPNOTSUPP)
+		goto drop;
+
+	return ret;
+
+drop:
+	kfree(skb);
+
+	return ret;
+}
+
+int lwtunnel_output6(struct sock *sk, struct sk_buff *skb)
+{
+	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
+	struct lwtunnel_state *lwtstate = NULL;
+
+	if (rt)
+		lwtstate = rt->rt6i_lwtstate;
+
+	return __lwtunnel_output(sk, skb, lwtstate);
+}
+EXPORT_SYMBOL(lwtunnel_output6);
+
+int lwtunnel_output(struct sock *sk, struct sk_buff *skb)
+{
+	struct rtable *rt = (struct rtable *)skb_dst(skb);
+	struct lwtunnel_state *lwtstate = NULL;
+
+	if (rt)
+		lwtstate = rt->rt_lwtstate;
+
+	return __lwtunnel_output(sk, skb, lwtstate);
+}
+EXPORT_SYMBOL(lwtunnel_output);
-- 
cgit v1.2.3


From e3e4712ec0961ed586a8db340bd994c4ad7f5dba Mon Sep 17 00:00:00 2001
From: Roopa Prabhu <roopa@cumulusnetworks.com>
Date: Tue, 21 Jul 2015 10:43:53 +0200
Subject: mpls: ip tunnel support

This implementation uses lwtunnel infrastructure to register
hooks for mpls tunnel encaps.

It picks cues from iptunnel_encaps infrastructure and previous
mpls iptunnel RFC patches from Eric W. Biederman and Robert Shearman

Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mpls_iptunnel.h      |   6 +
 include/net/mpls_iptunnel.h        |  29 +++++
 include/uapi/linux/mpls_iptunnel.h |  28 +++++
 net/mpls/Kconfig                   |   8 +-
 net/mpls/Makefile                  |   1 +
 net/mpls/mpls_iptunnel.c           | 233 +++++++++++++++++++++++++++++++++++++
 6 files changed, 304 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/mpls_iptunnel.h
 create mode 100644 include/net/mpls_iptunnel.h
 create mode 100644 include/uapi/linux/mpls_iptunnel.h
 create mode 100644 net/mpls/mpls_iptunnel.c

(limited to 'include/net')

diff --git a/include/linux/mpls_iptunnel.h b/include/linux/mpls_iptunnel.h
new file mode 100644
index 000000000000..ef29eb2d6dfd
--- /dev/null
+++ b/include/linux/mpls_iptunnel.h
@@ -0,0 +1,6 @@
+#ifndef _LINUX_MPLS_IPTUNNEL_H
+#define _LINUX_MPLS_IPTUNNEL_H
+
+#include <uapi/linux/mpls_iptunnel.h>
+
+#endif  /* _LINUX_MPLS_IPTUNNEL_H */
diff --git a/include/net/mpls_iptunnel.h b/include/net/mpls_iptunnel.h
new file mode 100644
index 000000000000..4757997f76ed
--- /dev/null
+++ b/include/net/mpls_iptunnel.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2015 Cumulus Networks, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#ifndef _NET_MPLS_IPTUNNEL_H
+#define _NET_MPLS_IPTUNNEL_H 1
+
+#define MAX_NEW_LABELS 2
+
+struct mpls_iptunnel_encap {
+	u32	label[MAX_NEW_LABELS];
+	u32	labels;
+};
+
+static inline struct mpls_iptunnel_encap *mpls_lwtunnel_encap(struct lwtunnel_state *lwtstate)
+{
+	return (struct mpls_iptunnel_encap *)lwtstate->data;
+}
+
+#endif
diff --git a/include/uapi/linux/mpls_iptunnel.h b/include/uapi/linux/mpls_iptunnel.h
new file mode 100644
index 000000000000..d80a0498f77e
--- /dev/null
+++ b/include/uapi/linux/mpls_iptunnel.h
@@ -0,0 +1,28 @@
+/*
+ *	mpls tunnel api
+ *
+ *	Authors:
+ *		Roopa Prabhu <roopa@cumulusnetworks.com>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _UAPI_LINUX_MPLS_IPTUNNEL_H
+#define _UAPI_LINUX_MPLS_IPTUNNEL_H
+
+/* MPLS tunnel attributes
+ * [RTA_ENCAP] = {
+ *     [MPLS_IPTUNNEL_DST]
+ * }
+ */
+enum {
+	MPLS_IPTUNNEL_UNSPEC,
+	MPLS_IPTUNNEL_DST,
+	__MPLS_IPTUNNEL_MAX,
+};
+#define MPLS_IPTUNNEL_MAX (__MPLS_IPTUNNEL_MAX - 1)
+
+#endif /* _UAPI_LINUX_MPLS_IPTUNNEL_H */
diff --git a/net/mpls/Kconfig b/net/mpls/Kconfig
index 17bde799c854..5c467ef97311 100644
--- a/net/mpls/Kconfig
+++ b/net/mpls/Kconfig
@@ -24,7 +24,13 @@ config NET_MPLS_GSO
 
 config MPLS_ROUTING
 	tristate "MPLS: routing support"
-	help
+	---help---
 	 Add support for forwarding of mpls packets.
 
+config MPLS_IPTUNNEL
+	tristate "MPLS: IP over MPLS tunnel support"
+	depends on LWTUNNEL && MPLS_ROUTING
+	---help---
+	 mpls ip tunnel support.
+
 endif # MPLS
diff --git a/net/mpls/Makefile b/net/mpls/Makefile
index 65bbe68c72e6..9ca923625016 100644
--- a/net/mpls/Makefile
+++ b/net/mpls/Makefile
@@ -3,5 +3,6 @@
 #
 obj-$(CONFIG_NET_MPLS_GSO) += mpls_gso.o
 obj-$(CONFIG_MPLS_ROUTING) += mpls_router.o
+obj-$(CONFIG_MPLS_IPTUNNEL) += mpls_iptunnel.o
 
 mpls_router-y := af_mpls.o
diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c
new file mode 100644
index 000000000000..eea096f21ba5
--- /dev/null
+++ b/net/mpls/mpls_iptunnel.c
@@ -0,0 +1,233 @@
+/*
+ * mpls tunnels	An implementation mpls tunnels using the light weight tunnel
+ *		infrastructure
+ *
+ * Authors:	Roopa Prabhu, <roopa@cumulusnetworks.com>
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ */
+#include <linux/types.h>
+#include <linux/skbuff.h>
+#include <linux/net.h>
+#include <linux/module.h>
+#include <linux/mpls.h>
+#include <linux/vmalloc.h>
+#include <net/ip.h>
+#include <net/dst.h>
+#include <net/lwtunnel.h>
+#include <net/netevent.h>
+#include <net/netns/generic.h>
+#include <net/ip6_fib.h>
+#include <net/route.h>
+#include <net/mpls_iptunnel.h>
+#include <linux/mpls_iptunnel.h>
+#include "internal.h"
+
+static const struct nla_policy mpls_iptunnel_policy[MPLS_IPTUNNEL_MAX + 1] = {
+	[MPLS_IPTUNNEL_DST]	= { .type = NLA_U32 },
+};
+
+static unsigned int mpls_encap_size(struct mpls_iptunnel_encap *en)
+{
+	/* The size of the layer 2.5 labels to be added for this route */
+	return en->labels * sizeof(struct mpls_shim_hdr);
+}
+
+int mpls_output(struct sock *sk, struct sk_buff *skb)
+{
+	struct mpls_iptunnel_encap *tun_encap_info;
+	struct mpls_shim_hdr *hdr;
+	struct net_device *out_dev;
+	unsigned int hh_len;
+	unsigned int new_header_size;
+	unsigned int mtu;
+	struct dst_entry *dst = skb_dst(skb);
+	struct rtable *rt = NULL;
+	struct rt6_info *rt6 = NULL;
+	struct lwtunnel_state *lwtstate = NULL;
+	int err = 0;
+	bool bos;
+	int i;
+	unsigned int ttl;
+
+	/* Obtain the ttl */
+	if (skb->protocol == htons(ETH_P_IP)) {
+		ttl = ip_hdr(skb)->ttl;
+		rt = (struct rtable *)dst;
+		lwtstate = rt->rt_lwtstate;
+	} else if (skb->protocol == htons(ETH_P_IPV6)) {
+		ttl = ipv6_hdr(skb)->hop_limit;
+		rt6 = (struct rt6_info *)dst;
+		lwtstate = rt6->rt6i_lwtstate;
+	} else {
+		goto drop;
+	}
+
+	skb_orphan(skb);
+
+	/* Find the output device */
+	out_dev = rcu_dereference(dst->dev);
+	if (!mpls_output_possible(out_dev) ||
+	    !lwtstate || skb_warn_if_lro(skb))
+		goto drop;
+
+	skb_forward_csum(skb);
+
+	tun_encap_info = mpls_lwtunnel_encap(lwtstate);
+
+	/* Verify the destination can hold the packet */
+	new_header_size = mpls_encap_size(tun_encap_info);
+	mtu = mpls_dev_mtu(out_dev);
+	if (mpls_pkt_too_big(skb, mtu - new_header_size))
+		goto drop;
+
+	hh_len = LL_RESERVED_SPACE(out_dev);
+	if (!out_dev->header_ops)
+		hh_len = 0;
+
+	/* Ensure there is enough space for the headers in the skb */
+	if (skb_cow(skb, hh_len + new_header_size))
+		goto drop;
+
+	skb_push(skb, new_header_size);
+	skb_reset_network_header(skb);
+
+	skb->dev = out_dev;
+	skb->protocol = htons(ETH_P_MPLS_UC);
+
+	/* Push the new labels */
+	hdr = mpls_hdr(skb);
+	bos = true;
+	for (i = tun_encap_info->labels - 1; i >= 0; i--) {
+		hdr[i] = mpls_entry_encode(tun_encap_info->label[i],
+					   ttl, 0, bos);
+		bos = false;
+	}
+
+	if (rt)
+		err = neigh_xmit(NEIGH_ARP_TABLE, out_dev, &rt->rt_gateway,
+				 skb);
+	else if (rt6)
+		err = neigh_xmit(NEIGH_ND_TABLE, out_dev, &rt6->rt6i_gateway,
+				 skb);
+	if (err)
+		net_dbg_ratelimited("%s: packet transmission failed: %d\n",
+				    __func__, err);
+
+	return 0;
+
+drop:
+	kfree_skb(skb);
+	return -EINVAL;
+}
+
+static int mpls_build_state(struct net_device *dev, struct nlattr *nla,
+			    struct lwtunnel_state **ts)
+{
+	struct mpls_iptunnel_encap *tun_encap_info;
+	struct nlattr *tb[MPLS_IPTUNNEL_MAX + 1];
+	struct lwtunnel_state *newts;
+	int tun_encap_info_len;
+	int ret;
+
+	ret = nla_parse_nested(tb, MPLS_IPTUNNEL_MAX, nla,
+			       mpls_iptunnel_policy);
+	if (ret < 0)
+		return ret;
+
+	if (!tb[MPLS_IPTUNNEL_DST])
+		return -EINVAL;
+
+	tun_encap_info_len = sizeof(*tun_encap_info);
+
+	newts = lwtunnel_state_alloc(tun_encap_info_len);
+	if (!newts)
+		return -ENOMEM;
+
+	newts->len = tun_encap_info_len;
+	tun_encap_info = mpls_lwtunnel_encap(newts);
+	ret = nla_get_labels(tb[MPLS_IPTUNNEL_DST], MAX_NEW_LABELS,
+			     &tun_encap_info->labels, tun_encap_info->label);
+	if (ret)
+		goto errout;
+	newts->type = LWTUNNEL_ENCAP_MPLS;
+	newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT;
+
+	*ts = newts;
+
+	return 0;
+
+errout:
+	kfree(newts);
+	*ts = NULL;
+
+	return ret;
+}
+
+static int mpls_fill_encap_info(struct sk_buff *skb,
+				struct lwtunnel_state *lwtstate)
+{
+	struct mpls_iptunnel_encap *tun_encap_info;
+	
+	tun_encap_info = mpls_lwtunnel_encap(lwtstate);
+
+	if (nla_put_labels(skb, MPLS_IPTUNNEL_DST, tun_encap_info->labels,
+			   tun_encap_info->label))
+		goto nla_put_failure;
+
+	return 0;
+
+nla_put_failure:
+	return -EMSGSIZE;
+}
+
+static int mpls_encap_nlsize(struct lwtunnel_state *lwtstate)
+{
+	struct mpls_iptunnel_encap *tun_encap_info;
+
+	tun_encap_info = mpls_lwtunnel_encap(lwtstate);
+
+	return nla_total_size(tun_encap_info->labels * 4);
+}
+
+static int mpls_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)
+{
+	struct mpls_iptunnel_encap *a_hdr = mpls_lwtunnel_encap(a);
+	struct mpls_iptunnel_encap *b_hdr = mpls_lwtunnel_encap(b);
+	int l;
+
+	if (a_hdr->labels != b_hdr->labels)
+		return 1;
+
+	for (l = 0; l < MAX_NEW_LABELS; l++)
+		if (a_hdr->label[l] != b_hdr->label[l])
+			return 1;
+	return 0;
+}
+
+static const struct lwtunnel_encap_ops mpls_iptun_ops = {
+	.build_state = mpls_build_state,
+	.output = mpls_output,
+	.fill_encap = mpls_fill_encap_info,
+	.get_encap_size = mpls_encap_nlsize,
+	.cmp_encap = mpls_encap_cmp,
+};
+
+static int __init mpls_iptunnel_init(void)
+{
+	return lwtunnel_encap_add_ops(&mpls_iptun_ops, LWTUNNEL_ENCAP_MPLS);
+}
+module_init(mpls_iptunnel_init);
+
+static void __exit mpls_iptunnel_exit(void)
+{
+	lwtunnel_encap_del_ops(&mpls_iptun_ops, LWTUNNEL_ENCAP_MPLS);
+}
+module_exit(mpls_iptunnel_exit);
+
+MODULE_DESCRIPTION("MultiProtocol Label Switching IP Tunnels");
+MODULE_LICENSE("GPL v2");
-- 
cgit v1.2.3


From 1d8fff907342d2339796dbd27ea47d0e76a6a2d0 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Tue, 21 Jul 2015 10:43:54 +0200
Subject: ip_tunnel: Make ovs_tunnel_info and ovs_key_ipv4_tunnel generic

Rename the tunnel metadata data structures currently internal to
OVS and make them generic for use by all IP tunnels.

Both structures are kernel internal and will stay that way. Their
members are exposed to user space through individual Netlink
attributes by OVS. It will therefore be possible to extend/modify
these structures without affecting user ABI.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_tunnels.h         | 63 +++++++++++++++++++++++++++++++++
 include/uapi/linux/openvswitch.h |  2 +-
 net/openvswitch/actions.c        |  2 +-
 net/openvswitch/datapath.h       |  5 +--
 net/openvswitch/flow.c           |  4 +--
 net/openvswitch/flow.h           | 76 ++--------------------------------------
 net/openvswitch/flow_netlink.c   | 16 ++++-----
 net/openvswitch/flow_netlink.h   |  2 +-
 net/openvswitch/vport-geneve.c   | 17 +++++----
 net/openvswitch/vport-gre.c      | 16 ++++-----
 net/openvswitch/vport-vxlan.c    | 18 +++++-----
 net/openvswitch/vport.c          | 30 ++++++++--------
 net/openvswitch/vport.h          | 12 +++----
 13 files changed, 128 insertions(+), 135 deletions(-)

(limited to 'include/net')

diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index d8214cb88bbc..6b9d559ce5f5 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -22,6 +22,28 @@
 /* Keep error state on tunnel for 30 sec */
 #define IPTUNNEL_ERR_TIMEO	(30*HZ)
 
+/* Used to memset ip_tunnel padding. */
+#define IP_TUNNEL_KEY_SIZE					\
+	(offsetof(struct ip_tunnel_key, tp_dst) +		\
+	 FIELD_SIZEOF(struct ip_tunnel_key, tp_dst))
+
+struct ip_tunnel_key {
+	__be64			tun_id;
+	__be32			ipv4_src;
+	__be32			ipv4_dst;
+	__be16			tun_flags;
+	__u8			ipv4_tos;
+	__u8			ipv4_ttl;
+	__be16			tp_src;
+	__be16			tp_dst;
+} __packed __aligned(4); /* Minimize padding. */
+
+struct ip_tunnel_info {
+	struct ip_tunnel_key	key;
+	const void		*options;
+	u8			options_len;
+};
+
 /* 6rd prefix/relay information */
 #ifdef CONFIG_IPV6_SIT_6RD
 struct ip_tunnel_6rd_parm {
@@ -136,6 +158,47 @@ int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *op,
 int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *op,
 			    unsigned int num);
 
+static inline void __ip_tunnel_info_init(struct ip_tunnel_info *tun_info,
+					 __be32 saddr, __be32 daddr,
+					 u8 tos, u8 ttl,
+					 __be16 tp_src, __be16 tp_dst,
+					 __be64 tun_id, __be16 tun_flags,
+					 const void *opts, u8 opts_len)
+{
+	tun_info->key.tun_id = tun_id;
+	tun_info->key.ipv4_src = saddr;
+	tun_info->key.ipv4_dst = daddr;
+	tun_info->key.ipv4_tos = tos;
+	tun_info->key.ipv4_ttl = ttl;
+	tun_info->key.tun_flags = tun_flags;
+
+	/* For the tunnel types on the top of IPsec, the tp_src and tp_dst of
+	 * the upper tunnel are used.
+	 * E.g: GRE over IPSEC, the tp_src and tp_port are zero.
+	 */
+	tun_info->key.tp_src = tp_src;
+	tun_info->key.tp_dst = tp_dst;
+
+	/* Clear struct padding. */
+	if (sizeof(tun_info->key) != IP_TUNNEL_KEY_SIZE)
+		memset((unsigned char *)&tun_info->key + IP_TUNNEL_KEY_SIZE,
+		       0, sizeof(tun_info->key) - IP_TUNNEL_KEY_SIZE);
+
+	tun_info->options = opts;
+	tun_info->options_len = opts_len;
+}
+
+static inline void ip_tunnel_info_init(struct ip_tunnel_info *tun_info,
+				       const struct iphdr *iph,
+				       __be16 tp_src, __be16 tp_dst,
+				       __be64 tun_id, __be16 tun_flags,
+				       const void *opts, u8 opts_len)
+{
+	__ip_tunnel_info_init(tun_info, iph->saddr, iph->daddr,
+			      iph->tos, iph->ttl, tp_src, tp_dst,
+			      tun_id, tun_flags, opts, opts_len);
+}
+
 #ifdef CONFIG_INET
 
 int ip_tunnel_init(struct net_device *dev);
diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index 1dab77601c21..d6b885460187 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -321,7 +321,7 @@ enum ovs_key_attr {
 				 * the accepted length of the array. */
 
 #ifdef __KERNEL__
-	OVS_KEY_ATTR_TUNNEL_INFO,  /* struct ovs_tunnel_info */
+	OVS_KEY_ATTR_TUNNEL_INFO,  /* struct ip_tunnel_info */
 #endif
 	__OVS_KEY_ATTR_MAX
 };
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 8a8c0b8b4f63..27c1687cfd92 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -611,7 +611,7 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb,
 			    struct sw_flow_key *key, const struct nlattr *attr,
 			    const struct nlattr *actions, int actions_len)
 {
-	struct ovs_tunnel_info info;
+	struct ip_tunnel_info info;
 	struct dp_upcall_info upcall;
 	const struct nlattr *a;
 	int rem;
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index cd691e935e08..6b28c5cedb23 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -25,6 +25,7 @@
 #include <linux/netdevice.h>
 #include <linux/skbuff.h>
 #include <linux/u64_stats_sync.h>
+#include <net/ip_tunnels.h>
 
 #include "flow.h"
 #include "flow_table.h"
@@ -98,7 +99,7 @@ struct datapath {
  * when a packet is received by OVS.
  */
 struct ovs_skb_cb {
-	struct ovs_tunnel_info  *egress_tun_info;
+	struct ip_tunnel_info  *egress_tun_info;
 	struct vport		*input_vport;
 };
 #define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb)
@@ -114,7 +115,7 @@ struct ovs_skb_cb {
  * @egress_tun_info: If nonnull, becomes %OVS_PACKET_ATTR_EGRESS_TUN_KEY.
  */
 struct dp_upcall_info {
-	const struct ovs_tunnel_info *egress_tun_info;
+	const struct ip_tunnel_info *egress_tun_info;
 	const struct nlattr *userdata;
 	const struct nlattr *actions;
 	int actions_len;
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index bc7b0aba994a..8db22ef73626 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -682,12 +682,12 @@ int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key)
 	return key_extract(skb, key);
 }
 
-int ovs_flow_key_extract(const struct ovs_tunnel_info *tun_info,
+int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info,
 			 struct sk_buff *skb, struct sw_flow_key *key)
 {
 	/* Extract metadata from packet. */
 	if (tun_info) {
-		memcpy(&key->tun_key, &tun_info->tunnel, sizeof(key->tun_key));
+		memcpy(&key->tun_key, &tun_info->key, sizeof(key->tun_key));
 
 		if (tun_info->options) {
 			BUILD_BUG_ON((1 << (sizeof(tun_info->options_len) *
diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
index a076e445ccc2..cadc6c5c3545 100644
--- a/net/openvswitch/flow.h
+++ b/net/openvswitch/flow.h
@@ -32,31 +32,10 @@
 #include <linux/time.h>
 #include <linux/flex_array.h>
 #include <net/inet_ecn.h>
+#include <net/ip_tunnels.h>
 
 struct sk_buff;
 
-/* Used to memset ovs_key_ipv4_tunnel padding. */
-#define OVS_TUNNEL_KEY_SIZE					\
-	(offsetof(struct ovs_key_ipv4_tunnel, tp_dst) +		\
-	 FIELD_SIZEOF(struct ovs_key_ipv4_tunnel, tp_dst))
-
-struct ovs_key_ipv4_tunnel {
-	__be64 tun_id;
-	__be32 ipv4_src;
-	__be32 ipv4_dst;
-	__be16 tun_flags;
-	u8   ipv4_tos;
-	u8   ipv4_ttl;
-	__be16 tp_src;
-	__be16 tp_dst;
-} __packed __aligned(4); /* Minimize padding. */
-
-struct ovs_tunnel_info {
-	struct ovs_key_ipv4_tunnel tunnel;
-	const void *options;
-	u8 options_len;
-};
-
 /* Store options at the end of the array if they are less than the
  * maximum size. This allows us to get the benefits of variable length
  * matching for small options.
@@ -66,55 +45,6 @@ struct ovs_tunnel_info {
 #define TUN_METADATA_OPTS(flow_key, opt_len) \
 	((void *)((flow_key)->tun_opts + TUN_METADATA_OFFSET(opt_len)))
 
-static inline void __ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info,
-					    __be32 saddr, __be32 daddr,
-					    u8 tos, u8 ttl,
-					    __be16 tp_src,
-					    __be16 tp_dst,
-					    __be64 tun_id,
-					    __be16 tun_flags,
-					    const void *opts,
-					    u8 opts_len)
-{
-	tun_info->tunnel.tun_id = tun_id;
-	tun_info->tunnel.ipv4_src = saddr;
-	tun_info->tunnel.ipv4_dst = daddr;
-	tun_info->tunnel.ipv4_tos = tos;
-	tun_info->tunnel.ipv4_ttl = ttl;
-	tun_info->tunnel.tun_flags = tun_flags;
-
-	/* For the tunnel types on the top of IPsec, the tp_src and tp_dst of
-	 * the upper tunnel are used.
-	 * E.g: GRE over IPSEC, the tp_src and tp_port are zero.
-	 */
-	tun_info->tunnel.tp_src = tp_src;
-	tun_info->tunnel.tp_dst = tp_dst;
-
-	/* Clear struct padding. */
-	if (sizeof(tun_info->tunnel) != OVS_TUNNEL_KEY_SIZE)
-		memset((unsigned char *)&tun_info->tunnel + OVS_TUNNEL_KEY_SIZE,
-		       0, sizeof(tun_info->tunnel) - OVS_TUNNEL_KEY_SIZE);
-
-	tun_info->options = opts;
-	tun_info->options_len = opts_len;
-}
-
-static inline void ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info,
-					  const struct iphdr *iph,
-					  __be16 tp_src,
-					  __be16 tp_dst,
-					  __be64 tun_id,
-					  __be16 tun_flags,
-					  const void *opts,
-					  u8 opts_len)
-{
-	__ovs_flow_tun_info_init(tun_info, iph->saddr, iph->daddr,
-				 iph->tos, iph->ttl,
-				 tp_src, tp_dst,
-				 tun_id, tun_flags,
-				 opts, opts_len);
-}
-
 #define OVS_SW_FLOW_KEY_METADATA_SIZE			\
 	(offsetof(struct sw_flow_key, recirc_id) +	\
 	FIELD_SIZEOF(struct sw_flow_key, recirc_id))
@@ -122,7 +52,7 @@ static inline void ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info,
 struct sw_flow_key {
 	u8 tun_opts[255];
 	u8 tun_opts_len;
-	struct ovs_key_ipv4_tunnel tun_key;  /* Encapsulating tunnel key. */
+	struct ip_tunnel_key tun_key;	/* Encapsulating tunnel key. */
 	struct {
 		u32	priority;	/* Packet QoS priority. */
 		u32	skb_mark;	/* SKB mark. */
@@ -273,7 +203,7 @@ void ovs_flow_stats_clear(struct sw_flow *);
 u64 ovs_flow_used_time(unsigned long flow_jiffies);
 
 int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key);
-int ovs_flow_key_extract(const struct ovs_tunnel_info *tun_info,
+int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info,
 			 struct sk_buff *skb,
 			 struct sw_flow_key *key);
 /* Extract key from packet coming from userspace. */
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index 624e41c4267f..ecfa530d3461 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -641,7 +641,7 @@ static int vxlan_opt_to_nlattr(struct sk_buff *skb,
 }
 
 static int __ipv4_tun_to_nlattr(struct sk_buff *skb,
-				const struct ovs_key_ipv4_tunnel *output,
+				const struct ip_tunnel_key *output,
 				const void *tun_opts, int swkey_tun_opts_len)
 {
 	if (output->tun_flags & TUNNEL_KEY &&
@@ -689,7 +689,7 @@ static int __ipv4_tun_to_nlattr(struct sk_buff *skb,
 }
 
 static int ipv4_tun_to_nlattr(struct sk_buff *skb,
-			      const struct ovs_key_ipv4_tunnel *output,
+			      const struct ip_tunnel_key *output,
 			      const void *tun_opts, int swkey_tun_opts_len)
 {
 	struct nlattr *nla;
@@ -708,9 +708,9 @@ static int ipv4_tun_to_nlattr(struct sk_buff *skb,
 }
 
 int ovs_nla_put_egress_tunnel_key(struct sk_buff *skb,
-				  const struct ovs_tunnel_info *egress_tun_info)
+				  const struct ip_tunnel_info *egress_tun_info)
 {
-	return __ipv4_tun_to_nlattr(skb, &egress_tun_info->tunnel,
+	return __ipv4_tun_to_nlattr(skb, &egress_tun_info->key,
 				    egress_tun_info->options,
 				    egress_tun_info->options_len);
 }
@@ -1746,7 +1746,7 @@ static int validate_and_copy_set_tun(const struct nlattr *attr,
 {
 	struct sw_flow_match match;
 	struct sw_flow_key key;
-	struct ovs_tunnel_info *tun_info;
+	struct ip_tunnel_info *tun_info;
 	struct nlattr *a;
 	int err = 0, start, opts_type;
 
@@ -1777,7 +1777,7 @@ static int validate_and_copy_set_tun(const struct nlattr *attr,
 		return PTR_ERR(a);
 
 	tun_info = nla_data(a);
-	tun_info->tunnel = key.tun_key;
+	tun_info->key = key.tun_key;
 	tun_info->options_len = key.tun_opts_len;
 
 	if (tun_info->options_len) {
@@ -2227,13 +2227,13 @@ static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb)
 
 	switch (key_type) {
 	case OVS_KEY_ATTR_TUNNEL_INFO: {
-		struct ovs_tunnel_info *tun_info = nla_data(ovs_key);
+		struct ip_tunnel_info *tun_info = nla_data(ovs_key);
 
 		start = nla_nest_start(skb, OVS_ACTION_ATTR_SET);
 		if (!start)
 			return -EMSGSIZE;
 
-		err = ipv4_tun_to_nlattr(skb, &tun_info->tunnel,
+		err = ipv4_tun_to_nlattr(skb, &tun_info->key,
 					 tun_info->options_len ?
 						tun_info->options : NULL,
 					 tun_info->options_len);
diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h
index 5c3d75bff310..ec53eb6e632b 100644
--- a/net/openvswitch/flow_netlink.h
+++ b/net/openvswitch/flow_netlink.h
@@ -55,7 +55,7 @@ int ovs_nla_put_mask(const struct sw_flow *flow, struct sk_buff *skb);
 int ovs_nla_get_match(struct sw_flow_match *, const struct nlattr *key,
 		      const struct nlattr *mask, bool log);
 int ovs_nla_put_egress_tunnel_key(struct sk_buff *,
-				  const struct ovs_tunnel_info *);
+				  const struct ip_tunnel_info *);
 
 bool ovs_nla_get_ufid(struct sw_flow_id *, const struct nlattr *, bool log);
 int ovs_nla_get_identifier(struct sw_flow_id *sfid, const struct nlattr *ufid,
diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c
index 208c576bd1b6..1da3a14d1010 100644
--- a/net/openvswitch/vport-geneve.c
+++ b/net/openvswitch/vport-geneve.c
@@ -77,7 +77,7 @@ static void geneve_rcv(struct geneve_sock *gs, struct sk_buff *skb)
 	struct vport *vport = gs->rcv_data;
 	struct genevehdr *geneveh = geneve_hdr(skb);
 	int opts_len;
-	struct ovs_tunnel_info tun_info;
+	struct ip_tunnel_info tun_info;
 	__be64 key;
 	__be16 flags;
 
@@ -90,10 +90,9 @@ static void geneve_rcv(struct geneve_sock *gs, struct sk_buff *skb)
 
 	key = vni_to_tunnel_id(geneveh->vni);
 
-	ovs_flow_tun_info_init(&tun_info, ip_hdr(skb),
-			       udp_hdr(skb)->source, udp_hdr(skb)->dest,
-			       key, flags,
-			       geneveh->options, opts_len);
+	ip_tunnel_info_init(&tun_info, ip_hdr(skb),
+			    udp_hdr(skb)->source, udp_hdr(skb)->dest,
+			    key, flags, geneveh->options, opts_len);
 
 	ovs_vport_receive(vport, skb, &tun_info);
 }
@@ -165,8 +164,8 @@ error:
 
 static int geneve_tnl_send(struct vport *vport, struct sk_buff *skb)
 {
-	const struct ovs_key_ipv4_tunnel *tun_key;
-	struct ovs_tunnel_info *tun_info;
+	const struct ip_tunnel_key *tun_key;
+	struct ip_tunnel_info *tun_info;
 	struct net *net = ovs_dp_get_net(vport->dp);
 	struct geneve_port *geneve_port = geneve_vport(vport);
 	__be16 dport = inet_sk(geneve_port->gs->sock->sk)->inet_sport;
@@ -183,7 +182,7 @@ static int geneve_tnl_send(struct vport *vport, struct sk_buff *skb)
 		goto error;
 	}
 
-	tun_key = &tun_info->tunnel;
+	tun_key = &tun_info->key;
 	rt = ovs_tunnel_route_lookup(net, tun_key, skb->mark, &fl, IPPROTO_UDP);
 	if (IS_ERR(rt)) {
 		err = PTR_ERR(rt);
@@ -225,7 +224,7 @@ static const char *geneve_get_name(const struct vport *vport)
 }
 
 static int geneve_get_egress_tun_info(struct vport *vport, struct sk_buff *skb,
-				      struct ovs_tunnel_info *egress_tun_info)
+				      struct ip_tunnel_info *egress_tun_info)
 {
 	struct geneve_port *geneve_port = geneve_vport(vport);
 	struct net *net = ovs_dp_get_net(vport->dp);
diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c
index f17ac9642f4e..b87656c66aaf 100644
--- a/net/openvswitch/vport-gre.c
+++ b/net/openvswitch/vport-gre.c
@@ -67,9 +67,9 @@ static struct sk_buff *__build_header(struct sk_buff *skb,
 				      int tunnel_hlen)
 {
 	struct tnl_ptk_info tpi;
-	const struct ovs_key_ipv4_tunnel *tun_key;
+	const struct ip_tunnel_key *tun_key;
 
-	tun_key = &OVS_CB(skb)->egress_tun_info->tunnel;
+	tun_key = &OVS_CB(skb)->egress_tun_info->key;
 
 	skb = gre_handle_offloads(skb, !!(tun_key->tun_flags & TUNNEL_CSUM));
 	if (IS_ERR(skb))
@@ -97,7 +97,7 @@ static __be64 key_to_tunnel_id(__be32 key, __be32 seq)
 static int gre_rcv(struct sk_buff *skb,
 		   const struct tnl_ptk_info *tpi)
 {
-	struct ovs_tunnel_info tun_info;
+	struct ip_tunnel_info tun_info;
 	struct ovs_net *ovs_net;
 	struct vport *vport;
 	__be64 key;
@@ -108,8 +108,8 @@ static int gre_rcv(struct sk_buff *skb,
 		return PACKET_REJECT;
 
 	key = key_to_tunnel_id(tpi->key, tpi->seq);
-	ovs_flow_tun_info_init(&tun_info, ip_hdr(skb), 0, 0, key,
-			       filter_tnl_flags(tpi->flags), NULL, 0);
+	ip_tunnel_info_init(&tun_info, ip_hdr(skb), 0, 0, key,
+			    filter_tnl_flags(tpi->flags), NULL, 0);
 
 	ovs_vport_receive(vport, skb, &tun_info);
 	return PACKET_RCVD;
@@ -134,7 +134,7 @@ static int gre_err(struct sk_buff *skb, u32 info,
 static int gre_tnl_send(struct vport *vport, struct sk_buff *skb)
 {
 	struct net *net = ovs_dp_get_net(vport->dp);
-	const struct ovs_key_ipv4_tunnel *tun_key;
+	const struct ip_tunnel_key *tun_key;
 	struct flowi4 fl;
 	struct rtable *rt;
 	int min_headroom;
@@ -147,7 +147,7 @@ static int gre_tnl_send(struct vport *vport, struct sk_buff *skb)
 		goto err_free_skb;
 	}
 
-	tun_key = &OVS_CB(skb)->egress_tun_info->tunnel;
+	tun_key = &OVS_CB(skb)->egress_tun_info->key;
 	rt = ovs_tunnel_route_lookup(net, tun_key, skb->mark, &fl, IPPROTO_GRE);
 	if (IS_ERR(rt)) {
 		err = PTR_ERR(rt);
@@ -277,7 +277,7 @@ static void gre_tnl_destroy(struct vport *vport)
 }
 
 static int gre_get_egress_tun_info(struct vport *vport, struct sk_buff *skb,
-				   struct ovs_tunnel_info *egress_tun_info)
+				   struct ip_tunnel_info *egress_tun_info)
 {
 	return ovs_tunnel_get_egress_info(egress_tun_info,
 					  ovs_dp_get_net(vport->dp),
diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c
index 6d39766e7828..6f7986fabb70 100644
--- a/net/openvswitch/vport-vxlan.c
+++ b/net/openvswitch/vport-vxlan.c
@@ -64,7 +64,7 @@ static inline struct vxlan_port *vxlan_vport(const struct vport *vport)
 static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb,
 		      struct vxlan_metadata *md)
 {
-	struct ovs_tunnel_info tun_info;
+	struct ip_tunnel_info tun_info;
 	struct vxlan_port *vxlan_port;
 	struct vport *vport = vs->data;
 	struct iphdr *iph;
@@ -82,9 +82,9 @@ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb,
 	/* Save outer tunnel values */
 	iph = ip_hdr(skb);
 	key = cpu_to_be64(ntohl(md->vni) >> 8);
-	ovs_flow_tun_info_init(&tun_info, iph,
-			       udp_hdr(skb)->source, udp_hdr(skb)->dest,
-			       key, flags, &opts, sizeof(opts));
+	ip_tunnel_info_init(&tun_info, iph,
+			    udp_hdr(skb)->source, udp_hdr(skb)->dest,
+			    key, flags, &opts, sizeof(opts));
 
 	ovs_vport_receive(vport, skb, &tun_info);
 }
@@ -205,13 +205,13 @@ error:
 
 static int vxlan_ext_gbp(struct sk_buff *skb)
 {
-	const struct ovs_tunnel_info *tun_info;
+	const struct ip_tunnel_info *tun_info;
 	const struct ovs_vxlan_opts *opts;
 
 	tun_info = OVS_CB(skb)->egress_tun_info;
 	opts = tun_info->options;
 
-	if (tun_info->tunnel.tun_flags & TUNNEL_VXLAN_OPT &&
+	if (tun_info->key.tun_flags & TUNNEL_VXLAN_OPT &&
 	    tun_info->options_len >= sizeof(*opts))
 		return opts->gbp;
 	else
@@ -224,7 +224,7 @@ static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb)
 	struct vxlan_port *vxlan_port = vxlan_vport(vport);
 	struct sock *sk = vxlan_port->vs->sock->sk;
 	__be16 dst_port = inet_sk(sk)->inet_sport;
-	const struct ovs_key_ipv4_tunnel *tun_key;
+	const struct ip_tunnel_key *tun_key;
 	struct vxlan_metadata md = {0};
 	struct rtable *rt;
 	struct flowi4 fl;
@@ -238,7 +238,7 @@ static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb)
 		goto error;
 	}
 
-	tun_key = &OVS_CB(skb)->egress_tun_info->tunnel;
+	tun_key = &OVS_CB(skb)->egress_tun_info->key;
 	rt = ovs_tunnel_route_lookup(net, tun_key, skb->mark, &fl, IPPROTO_UDP);
 	if (IS_ERR(rt)) {
 		err = PTR_ERR(rt);
@@ -269,7 +269,7 @@ error:
 }
 
 static int vxlan_get_egress_tun_info(struct vport *vport, struct sk_buff *skb,
-				     struct ovs_tunnel_info *egress_tun_info)
+				     struct ip_tunnel_info *egress_tun_info)
 {
 	struct net *net = ovs_dp_get_net(vport->dp);
 	struct vxlan_port *vxlan_port = vxlan_vport(vport);
diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c
index 067a3fff1d2c..af23ba077836 100644
--- a/net/openvswitch/vport.c
+++ b/net/openvswitch/vport.c
@@ -469,7 +469,7 @@ u32 ovs_vport_find_upcall_portid(const struct vport *vport, struct sk_buff *skb)
  * skb->data should point to the Ethernet header.
  */
 void ovs_vport_receive(struct vport *vport, struct sk_buff *skb,
-		       const struct ovs_tunnel_info *tun_info)
+		       const struct ip_tunnel_info *tun_info)
 {
 	struct pcpu_sw_netstats *stats;
 	struct sw_flow_key key;
@@ -572,22 +572,22 @@ void ovs_vport_deferred_free(struct vport *vport)
 }
 EXPORT_SYMBOL_GPL(ovs_vport_deferred_free);
 
-int ovs_tunnel_get_egress_info(struct ovs_tunnel_info *egress_tun_info,
+int ovs_tunnel_get_egress_info(struct ip_tunnel_info *egress_tun_info,
 			       struct net *net,
-			       const struct ovs_tunnel_info *tun_info,
+			       const struct ip_tunnel_info *tun_info,
 			       u8 ipproto,
 			       u32 skb_mark,
 			       __be16 tp_src,
 			       __be16 tp_dst)
 {
-	const struct ovs_key_ipv4_tunnel *tun_key;
+	const struct ip_tunnel_key *tun_key;
 	struct rtable *rt;
 	struct flowi4 fl;
 
 	if (unlikely(!tun_info))
 		return -EINVAL;
 
-	tun_key = &tun_info->tunnel;
+	tun_key = &tun_info->key;
 
 	/* Route lookup to get srouce IP address.
 	 * The process may need to be changed if the corresponding process
@@ -602,22 +602,22 @@ int ovs_tunnel_get_egress_info(struct ovs_tunnel_info *egress_tun_info,
 	/* Generate egress_tun_info based on tun_info,
 	 * saddr, tp_src and tp_dst
 	 */
-	__ovs_flow_tun_info_init(egress_tun_info,
-				 fl.saddr, tun_key->ipv4_dst,
-				 tun_key->ipv4_tos,
-				 tun_key->ipv4_ttl,
-				 tp_src, tp_dst,
-				 tun_key->tun_id,
-				 tun_key->tun_flags,
-				 tun_info->options,
-				 tun_info->options_len);
+	__ip_tunnel_info_init(egress_tun_info,
+			      fl.saddr, tun_key->ipv4_dst,
+			      tun_key->ipv4_tos,
+			      tun_key->ipv4_ttl,
+			      tp_src, tp_dst,
+			      tun_key->tun_id,
+			      tun_key->tun_flags,
+			      tun_info->options,
+			      tun_info->options_len);
 
 	return 0;
 }
 EXPORT_SYMBOL_GPL(ovs_tunnel_get_egress_info);
 
 int ovs_vport_get_egress_tun_info(struct vport *vport, struct sk_buff *skb,
-				  struct ovs_tunnel_info *info)
+				  struct ip_tunnel_info *info)
 {
 	/* get_egress_tun_info() is only implemented on tunnel ports. */
 	if (unlikely(!vport->ops->get_egress_tun_info))
diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h
index bc85331a6c60..4750fb673a9f 100644
--- a/net/openvswitch/vport.h
+++ b/net/openvswitch/vport.h
@@ -58,15 +58,15 @@ u32 ovs_vport_find_upcall_portid(const struct vport *, struct sk_buff *);
 
 int ovs_vport_send(struct vport *, struct sk_buff *);
 
-int ovs_tunnel_get_egress_info(struct ovs_tunnel_info *egress_tun_info,
+int ovs_tunnel_get_egress_info(struct ip_tunnel_info *egress_tun_info,
 			       struct net *net,
-			       const struct ovs_tunnel_info *tun_info,
+			       const struct ip_tunnel_info *tun_info,
 			       u8 ipproto,
 			       u32 skb_mark,
 			       __be16 tp_src,
 			       __be16 tp_dst);
 int ovs_vport_get_egress_tun_info(struct vport *vport, struct sk_buff *skb,
-				  struct ovs_tunnel_info *info);
+				  struct ip_tunnel_info *info);
 
 /* The following definitions are for implementers of vport devices: */
 
@@ -176,7 +176,7 @@ struct vport_ops {
 
 	int (*send)(struct vport *, struct sk_buff *);
 	int (*get_egress_tun_info)(struct vport *, struct sk_buff *,
-				   struct ovs_tunnel_info *);
+				   struct ip_tunnel_info *);
 
 	struct module *owner;
 	struct list_head list;
@@ -226,7 +226,7 @@ static inline struct vport *vport_from_priv(void *priv)
 }
 
 void ovs_vport_receive(struct vport *, struct sk_buff *,
-		       const struct ovs_tunnel_info *);
+		       const struct ip_tunnel_info *);
 
 static inline void ovs_skb_postpush_rcsum(struct sk_buff *skb,
 				      const void *start, unsigned int len)
@@ -239,7 +239,7 @@ int ovs_vport_ops_register(struct vport_ops *ops);
 void ovs_vport_ops_unregister(struct vport_ops *ops);
 
 static inline struct rtable *ovs_tunnel_route_lookup(struct net *net,
-						     const struct ovs_key_ipv4_tunnel *key,
+						     const struct ip_tunnel_key *key,
 						     u32 mark,
 						     struct flowi4 *fl,
 						     u8 protocol)
-- 
cgit v1.2.3


From f38a9eb1f77b296ff07e000823884a0f64d67b2a Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Tue, 21 Jul 2015 10:43:56 +0200
Subject: dst: Metadata destinations

Introduces a new dst_metadata which enables to carry per packet metadata
between forwarding and processing elements via the skb->dst pointer.

The structure is set up to be a union. Thus, each separate type of
metadata requires its own dst instance. If demand arises to carry
multiple types of metadata concurrently, metadata dst entries can be
made stackable.

The metadata dst entry is refcnt'ed as expected for now but a non
reference counted use is possible if the reference is forced before
queueing the skb.

In order to allow allocating dsts with variable length, the existing
dst_alloc() is split into a dst_alloc() and dst_init() function. The
existing dst_init() function to initialize the subsystem is being
renamed to dst_subsys_init() to make it clear what is what.

The check before ip_route_input() is changed to ignore metadata dsts
and drop the dst inside the routing function thus allowing to interpret
metadata in a later commit.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dst.h          |  6 +++-
 include/net/dst_metadata.h | 32 ++++++++++++++++++
 net/core/dev.c             |  2 +-
 net/core/dst.c             | 84 ++++++++++++++++++++++++++++++++++++++--------
 net/ipv4/ip_input.c        |  3 +-
 net/ipv4/route.c           |  2 ++
 6 files changed, 112 insertions(+), 17 deletions(-)
 create mode 100644 include/net/dst_metadata.h

(limited to 'include/net')

diff --git a/include/net/dst.h b/include/net/dst.h
index 2bc73f8a00a9..2578811cef51 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -57,6 +57,7 @@ struct dst_entry {
 #define DST_FAKE_RTABLE		0x0040
 #define DST_XFRM_TUNNEL		0x0080
 #define DST_XFRM_QUEUE		0x0100
+#define DST_METADATA		0x0200
 
 	unsigned short		pending_confirm;
 
@@ -356,6 +357,9 @@ static inline int dst_discard(struct sk_buff *skb)
 }
 void *dst_alloc(struct dst_ops *ops, struct net_device *dev, int initial_ref,
 		int initial_obsolete, unsigned short flags);
+void dst_init(struct dst_entry *dst, struct dst_ops *ops,
+	      struct net_device *dev, int initial_ref, int initial_obsolete,
+	      unsigned short flags);
 void __dst_free(struct dst_entry *dst);
 struct dst_entry *dst_destroy(struct dst_entry *dst);
 
@@ -457,7 +461,7 @@ static inline struct dst_entry *dst_check(struct dst_entry *dst, u32 cookie)
 	return dst;
 }
 
-void dst_init(void);
+void dst_subsys_init(void);
 
 /* Flags for xfrm_lookup flags argument. */
 enum {
diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h
new file mode 100644
index 000000000000..4f7694f3c7d0
--- /dev/null
+++ b/include/net/dst_metadata.h
@@ -0,0 +1,32 @@
+#ifndef __NET_DST_METADATA_H
+#define __NET_DST_METADATA_H 1
+
+#include <linux/skbuff.h>
+#include <net/ip_tunnels.h>
+#include <net/dst.h>
+
+struct metadata_dst {
+	struct dst_entry		dst;
+	size_t				opts_len;
+};
+
+static inline struct metadata_dst *skb_metadata_dst(struct sk_buff *skb)
+{
+	struct metadata_dst *md_dst = (struct metadata_dst *) skb_dst(skb);
+
+	if (md_dst && md_dst->dst.flags & DST_METADATA)
+		return md_dst;
+
+	return NULL;
+}
+
+static inline bool skb_valid_dst(const struct sk_buff *skb)
+{
+	struct dst_entry *dst = skb_dst(skb);
+
+	return dst && !(dst->flags & DST_METADATA);
+}
+
+struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags);
+
+#endif /* __NET_DST_METADATA_H */
diff --git a/net/core/dev.c b/net/core/dev.c
index 2ee15afb412d..cb52cba30ae8 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -7669,7 +7669,7 @@ static int __init net_dev_init(void)
 	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
 
 	hotcpu_notifier(dev_cpu_callback, 0);
-	dst_init();
+	dst_subsys_init();
 	rc = 0;
 out:
 	return rc;
diff --git a/net/core/dst.c b/net/core/dst.c
index e956ce6d1378..917364f0d0be 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -22,6 +22,7 @@
 #include <linux/prefetch.h>
 
 #include <net/dst.h>
+#include <net/dst_metadata.h>
 
 /*
  * Theory of operations:
@@ -158,19 +159,10 @@ const u32 dst_default_metrics[RTAX_MAX + 1] = {
 	[RTAX_MAX] = 0xdeadbeef,
 };
 
-
-void *dst_alloc(struct dst_ops *ops, struct net_device *dev,
-		int initial_ref, int initial_obsolete, unsigned short flags)
+void dst_init(struct dst_entry *dst, struct dst_ops *ops,
+	      struct net_device *dev, int initial_ref, int initial_obsolete,
+	      unsigned short flags)
 {
-	struct dst_entry *dst;
-
-	if (ops->gc && dst_entries_get_fast(ops) > ops->gc_thresh) {
-		if (ops->gc(ops))
-			return NULL;
-	}
-	dst = kmem_cache_alloc(ops->kmem_cachep, GFP_ATOMIC);
-	if (!dst)
-		return NULL;
 	dst->child = NULL;
 	dst->dev = dev;
 	if (dev)
@@ -200,6 +192,25 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev,
 	dst->next = NULL;
 	if (!(flags & DST_NOCOUNT))
 		dst_entries_add(ops, 1);
+}
+EXPORT_SYMBOL(dst_init);
+
+void *dst_alloc(struct dst_ops *ops, struct net_device *dev,
+		int initial_ref, int initial_obsolete, unsigned short flags)
+{
+	struct dst_entry *dst;
+
+	if (ops->gc && dst_entries_get_fast(ops) > ops->gc_thresh) {
+		if (ops->gc(ops))
+			return NULL;
+	}
+
+	dst = kmem_cache_alloc(ops->kmem_cachep, GFP_ATOMIC);
+	if (!dst)
+		return NULL;
+
+	dst_init(dst, ops, dev, initial_ref, initial_obsolete, flags);
+
 	return dst;
 }
 EXPORT_SYMBOL(dst_alloc);
@@ -248,7 +259,11 @@ again:
 		dst->ops->destroy(dst);
 	if (dst->dev)
 		dev_put(dst->dev);
-	kmem_cache_free(dst->ops->kmem_cachep, dst);
+
+	if (dst->flags & DST_METADATA)
+		kfree(dst);
+	else
+		kmem_cache_free(dst->ops->kmem_cachep, dst);
 
 	dst = child;
 	if (dst) {
@@ -327,6 +342,47 @@ void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old)
 }
 EXPORT_SYMBOL(__dst_destroy_metrics_generic);
 
+static struct dst_ops md_dst_ops = {
+	.family =		AF_UNSPEC,
+};
+
+static int dst_md_discard_sk(struct sock *sk, struct sk_buff *skb)
+{
+	WARN_ONCE(1, "Attempting to call output on metadata dst\n");
+	kfree_skb(skb);
+	return 0;
+}
+
+static int dst_md_discard(struct sk_buff *skb)
+{
+	WARN_ONCE(1, "Attempting to call input on metadata dst\n");
+	kfree_skb(skb);
+	return 0;
+}
+
+struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags)
+{
+	struct metadata_dst *md_dst;
+	struct dst_entry *dst;
+
+	md_dst = kmalloc(sizeof(*md_dst) + optslen, flags);
+	if (!md_dst)
+		return ERR_PTR(-ENOMEM);
+
+	dst = &md_dst->dst;
+	dst_init(dst, &md_dst_ops, NULL, 1, DST_OBSOLETE_NONE,
+		 DST_METADATA | DST_NOCACHE | DST_NOCOUNT);
+
+	dst->input = dst_md_discard;
+	dst->output = dst_md_discard_sk;
+
+	memset(dst + 1, 0, sizeof(*md_dst) + optslen - sizeof(*dst));
+	md_dst->opts_len = optslen;
+
+	return md_dst;
+}
+EXPORT_SYMBOL_GPL(metadata_dst_alloc);
+
 /* Dirty hack. We did it in 2.2 (in __dst_free),
  * we have _very_ good reasons not to repeat
  * this mistake in 2.3, but we have no choice
@@ -391,7 +447,7 @@ static struct notifier_block dst_dev_notifier = {
 	.priority = -10, /* must be called after other network notifiers */
 };
 
-void __init dst_init(void)
+void __init dst_subsys_init(void)
 {
 	register_netdevice_notifier(&dst_dev_notifier);
 }
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 2db4c8773c1b..f4fc8a77aaa7 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -146,6 +146,7 @@
 #include <net/xfrm.h>
 #include <linux/mroute.h>
 #include <linux/netlink.h>
+#include <net/dst_metadata.h>
 
 /*
  *	Process Router Attention IP option (RFC 2113)
@@ -331,7 +332,7 @@ static int ip_rcv_finish(struct sock *sk, struct sk_buff *skb)
 	 *	Initialise the virtual path cache for the packet. It describes
 	 *	how the packet travels inside Linux networking.
 	 */
-	if (!skb_dst(skb)) {
+	if (!skb_valid_dst(skb)) {
 		int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
 					       iph->tos, skb->dev);
 		if (unlikely(err)) {
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index cd3157c464e6..4c8e84e75871 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1690,6 +1690,8 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	   by fib_lookup.
 	 */
 
+	skb_dst_drop(skb);
+
 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
 		goto martian_source;
 
-- 
cgit v1.2.3


From ee122c79d4227f6ec642157834b6a90fcffa4382 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Tue, 21 Jul 2015 10:43:58 +0200
Subject: vxlan: Flow based tunneling

Allows putting a VXLAN device into a new flow-based mode in which
skbs with a ip_tunnel_info dst metadata attached will be encapsulated
according to the instructions stored in there with the VXLAN device
defaults taken into consideration.

Similar on the receive side, if the VXLAN_F_COLLECT_METADATA flag is
set, the packet processing will populate a ip_tunnel_info struct for
each packet received and attach it to the skb using the new metadata
dst.  The metadata structure will contain the outer header and tunnel
header fields which have been stripped off. Layers further up in the
stack such as routing, tc or netfitler can later match on these fields
and perform forwarding. It is the responsibility of upper layers to
ensure that the flag is set if the metadata is needed. The flag limits
the additional cost of metadata collecting based on demand.

This prepares the VXLAN device to be steered by the routing and other
subsystems which allows to support encapsulation for a large number
of tunnel endpoints and tunnel ids through a single net_device which
improves the scalability.

It also allows for OVS to leverage this mode which in turn allows for
the removal of the OVS specific VXLAN code.

Because the skb is currently scrubed in vxlan_rcv(), the attachment of
the new dst metadata is postponed until after scrubing which requires
the temporary addition of a new member to vxlan_metadata. This member
is removed again in a later commit after the indirect VXLAN receive API
has been removed.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vxlan.c          | 149 ++++++++++++++++++++++++++++++++++++-------
 include/linux/skbuff.h       |   1 +
 include/net/dst_metadata.h   |  13 ++++
 include/net/ip_tunnels.h     |  14 ++++
 include/net/vxlan.h          |  10 ++-
 include/uapi/linux/if_link.h |   1 +
 6 files changed, 165 insertions(+), 23 deletions(-)

(limited to 'include/net')

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index ec86a11743fd..06c092b05a51 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -49,6 +49,7 @@
 #include <net/ip6_tunnel.h>
 #include <net/ip6_checksum.h>
 #endif
+#include <net/dst_metadata.h>
 
 #define VXLAN_VERSION	"0.1"
 
@@ -140,6 +141,11 @@ struct vxlan_dev {
 static u32 vxlan_salt __read_mostly;
 static struct workqueue_struct *vxlan_wq;
 
+static inline bool vxlan_collect_metadata(struct vxlan_sock *vs)
+{
+	return vs->flags & VXLAN_F_COLLECT_METADATA;
+}
+
 #if IS_ENABLED(CONFIG_IPV6)
 static inline
 bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b)
@@ -1164,10 +1170,13 @@ static struct vxlanhdr *vxlan_remcsum(struct sk_buff *skb, struct vxlanhdr *vh,
 /* Callback from net/ipv4/udp.c to receive packets */
 static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
 {
+	struct metadata_dst *tun_dst = NULL;
+	struct ip_tunnel_info *info;
 	struct vxlan_sock *vs;
 	struct vxlanhdr *vxh;
 	u32 flags, vni;
-	struct vxlan_metadata md = {0};
+	struct vxlan_metadata _md;
+	struct vxlan_metadata *md = &_md;
 
 	/* Need Vxlan and inner Ethernet header to be present */
 	if (!pskb_may_pull(skb, VXLAN_HLEN))
@@ -1202,6 +1211,33 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
 		vni &= VXLAN_VNI_MASK;
 	}
 
+	if (vxlan_collect_metadata(vs)) {
+		const struct iphdr *iph = ip_hdr(skb);
+
+		tun_dst = metadata_dst_alloc(sizeof(*md), GFP_ATOMIC);
+		if (!tun_dst)
+			goto drop;
+
+		info = &tun_dst->u.tun_info;
+		info->key.ipv4_src = iph->saddr;
+		info->key.ipv4_dst = iph->daddr;
+		info->key.ipv4_tos = iph->tos;
+		info->key.ipv4_ttl = iph->ttl;
+		info->key.tp_src = udp_hdr(skb)->source;
+		info->key.tp_dst = udp_hdr(skb)->dest;
+
+		info->mode = IP_TUNNEL_INFO_RX;
+		info->key.tun_flags = TUNNEL_KEY;
+		info->key.tun_id = cpu_to_be64(vni >> 8);
+		if (udp_hdr(skb)->check != 0)
+			info->key.tun_flags |= TUNNEL_CSUM;
+
+		md = ip_tunnel_info_opts(info, sizeof(*md));
+		md->tun_dst = tun_dst;
+	} else {
+		memset(md, 0, sizeof(*md));
+	}
+
 	/* For backwards compatibility, only allow reserved fields to be
 	 * used by VXLAN extensions if explicitly requested.
 	 */
@@ -1209,13 +1245,16 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
 		struct vxlanhdr_gbp *gbp;
 
 		gbp = (struct vxlanhdr_gbp *)vxh;
-		md.gbp = ntohs(gbp->policy_id);
+		md->gbp = ntohs(gbp->policy_id);
+
+		if (tun_dst)
+			info->key.tun_flags |= TUNNEL_VXLAN_OPT;
 
 		if (gbp->dont_learn)
-			md.gbp |= VXLAN_GBP_DONT_LEARN;
+			md->gbp |= VXLAN_GBP_DONT_LEARN;
 
 		if (gbp->policy_applied)
-			md.gbp |= VXLAN_GBP_POLICY_APPLIED;
+			md->gbp |= VXLAN_GBP_POLICY_APPLIED;
 
 		flags &= ~VXLAN_GBP_USED_BITS;
 	}
@@ -1233,8 +1272,8 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
 		goto bad_flags;
 	}
 
-	md.vni = vxh->vx_vni;
-	vs->rcv(vs, skb, &md);
+	md->vni = vxh->vx_vni;
+	vs->rcv(vs, skb, md);
 	return 0;
 
 drop:
@@ -1247,6 +1286,9 @@ bad_flags:
 		   ntohl(vxh->vx_flags), ntohl(vxh->vx_vni));
 
 error:
+	if (tun_dst)
+		dst_release((struct dst_entry *)tun_dst);
+
 	/* Return non vxlan pkt */
 	return 1;
 }
@@ -1263,7 +1305,12 @@ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb,
 	int err = 0;
 	union vxlan_addr *remote_ip;
 
-	vni = ntohl(md->vni) >> 8;
+	/* For flow based devices, map all packets to VNI 0 */
+	if (vs->flags & VXLAN_F_FLOW_BASED)
+		vni = 0;
+	else
+		vni = ntohl(md->vni) >> 8;
+
 	/* Is this VNI defined? */
 	vxlan = vxlan_vs_find_vni(vs, vni);
 	if (!vxlan)
@@ -1292,12 +1339,19 @@ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb,
 #endif
 	}
 
+	if (md->tun_dst) {
+		skb_dst_set(skb, (struct dst_entry *)md->tun_dst);
+		md->tun_dst = NULL;
+	}
+
 	if ((vxlan->flags & VXLAN_F_LEARN) &&
 	    vxlan_snoop(skb->dev, &saddr, eth_hdr(skb)->h_source))
 		goto drop;
 
 	skb_reset_network_header(skb);
-	skb->mark = md->gbp;
+	/* In flow-based mode, GBP is carried in dst_metadata */
+	if (!(vs->flags & VXLAN_F_FLOW_BASED))
+		skb->mark = md->gbp;
 
 	if (oip6)
 		err = IP6_ECN_decapsulate(oip6, skb);
@@ -1330,6 +1384,9 @@ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb,
 
 	return;
 drop:
+	if (md->tun_dst)
+		dst_release((struct dst_entry *)md->tun_dst);
+
 	/* Consume bad packet */
 	kfree_skb(skb);
 }
@@ -1878,22 +1935,40 @@ static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan,
 static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 			   struct vxlan_rdst *rdst, bool did_rsc)
 {
+	struct ip_tunnel_info *info = skb_tunnel_info(skb);
 	struct vxlan_dev *vxlan = netdev_priv(dev);
 	struct sock *sk = vxlan->vn_sock->sock->sk;
 	struct rtable *rt = NULL;
 	const struct iphdr *old_iph;
 	struct flowi4 fl4;
 	union vxlan_addr *dst;
-	struct vxlan_metadata md;
+	union vxlan_addr remote_ip;
+	struct vxlan_metadata _md;
+	struct vxlan_metadata *md = &_md;
 	__be16 src_port = 0, dst_port;
 	u32 vni;
 	__be16 df = 0;
 	__u8 tos, ttl;
 	int err;
+	u32 flags = vxlan->flags;
 
-	dst_port = rdst->remote_port ? rdst->remote_port : vxlan->dst_port;
-	vni = rdst->remote_vni;
-	dst = &rdst->remote_ip;
+	if (rdst) {
+		dst_port = rdst->remote_port ? rdst->remote_port : vxlan->dst_port;
+		vni = rdst->remote_vni;
+		dst = &rdst->remote_ip;
+	} else {
+		if (!info) {
+			WARN_ONCE(1, "%s: Missing encapsulation instructions\n",
+				  dev->name);
+			goto drop;
+		}
+
+		dst_port = info->key.tp_dst ? : vxlan->dst_port;
+		vni = be64_to_cpu(info->key.tun_id);
+		remote_ip.sin.sin_family = AF_INET;
+		remote_ip.sin.sin_addr.s_addr = info->key.ipv4_dst;
+		dst = &remote_ip;
+	}
 
 	if (vxlan_addr_any(dst)) {
 		if (did_rsc) {
@@ -1918,8 +1993,25 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 				     vxlan->port_max, true);
 
 	if (dst->sa.sa_family == AF_INET) {
+		if (info) {
+			if (info->key.tun_flags & TUNNEL_DONT_FRAGMENT)
+				df = htons(IP_DF);
+			if (info->key.tun_flags & TUNNEL_CSUM)
+				flags |= VXLAN_F_UDP_CSUM;
+			else
+				flags &= ~VXLAN_F_UDP_CSUM;
+
+			ttl = info->key.ipv4_ttl;
+			tos = info->key.ipv4_tos;
+
+			if (info->options_len)
+				md = ip_tunnel_info_opts(info, sizeof(*md));
+		} else {
+			md->gbp = skb->mark;
+		}
+
 		memset(&fl4, 0, sizeof(fl4));
-		fl4.flowi4_oif = rdst->remote_ifindex;
+		fl4.flowi4_oif = rdst ? rdst->remote_ifindex : 0;
 		fl4.flowi4_tos = RT_TOS(tos);
 		fl4.flowi4_mark = skb->mark;
 		fl4.flowi4_proto = IPPROTO_UDP;
@@ -1958,14 +2050,12 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 
 		tos = ip_tunnel_ecn_encap(tos, old_iph, skb);
 		ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);
-		md.vni = htonl(vni << 8);
-		md.gbp = skb->mark;
-
+		md->vni = htonl(vni << 8);
 		err = vxlan_xmit_skb(rt, sk, skb, fl4.saddr,
 				     dst->sin.sin_addr.s_addr, tos, ttl, df,
-				     src_port, dst_port, &md,
+				     src_port, dst_port, md,
 				     !net_eq(vxlan->net, dev_net(vxlan->dev)),
-				     vxlan->flags);
+				     flags);
 		if (err < 0) {
 			/* skb is already freed. */
 			skb = NULL;
@@ -1980,7 +2070,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 		u32 flags;
 
 		memset(&fl6, 0, sizeof(fl6));
-		fl6.flowi6_oif = rdst->remote_ifindex;
+		fl6.flowi6_oif = rdst ? rdst->remote_ifindex : 0;
 		fl6.daddr = dst->sin6.sin6_addr;
 		fl6.saddr = vxlan->saddr.sin6.sin6_addr;
 		fl6.flowi6_mark = skb->mark;
@@ -2018,11 +2108,11 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 		}
 
 		ttl = ttl ? : ip6_dst_hoplimit(ndst);
-		md.vni = htonl(vni << 8);
-		md.gbp = skb->mark;
+		md->vni = htonl(vni << 8);
+		md->gbp = skb->mark;
 
 		err = vxlan6_xmit_skb(ndst, sk, skb, dev, &fl6.saddr, &fl6.daddr,
-				      0, ttl, src_port, dst_port, &md,
+				      0, ttl, src_port, dst_port, md,
 				      !net_eq(vxlan->net, dev_net(vxlan->dev)),
 				      vxlan->flags);
 #endif
@@ -2051,6 +2141,7 @@ tx_free:
 static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct vxlan_dev *vxlan = netdev_priv(dev);
+	const struct ip_tunnel_info *info = skb_tunnel_info(skb);
 	struct ethhdr *eth;
 	bool did_rsc = false;
 	struct vxlan_rdst *rdst, *fdst = NULL;
@@ -2078,6 +2169,12 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
 #endif
 	}
 
+	if (vxlan->flags & VXLAN_F_FLOW_BASED &&
+	    info && info->mode == IP_TUNNEL_INFO_TX) {
+		vxlan_xmit_one(skb, dev, NULL, false);
+		return NETDEV_TX_OK;
+	}
+
 	f = vxlan_find_mac(vxlan, eth->h_dest);
 	did_rsc = false;
 
@@ -2405,6 +2502,7 @@ static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
 	[IFLA_VXLAN_RSC]	= { .type = NLA_U8 },
 	[IFLA_VXLAN_L2MISS]	= { .type = NLA_U8 },
 	[IFLA_VXLAN_L3MISS]	= { .type = NLA_U8 },
+	[IFLA_VXLAN_FLOWBASED]	= { .type = NLA_U8 },
 	[IFLA_VXLAN_PORT]	= { .type = NLA_U16 },
 	[IFLA_VXLAN_UDP_CSUM]	= { .type = NLA_U8 },
 	[IFLA_VXLAN_UDP_ZERO_CSUM6_TX]	= { .type = NLA_U8 },
@@ -2681,6 +2779,10 @@ static int vxlan_newlink(struct net *src_net, struct net_device *dev,
 	if (data[IFLA_VXLAN_LIMIT])
 		vxlan->addrmax = nla_get_u32(data[IFLA_VXLAN_LIMIT]);
 
+	if (data[IFLA_VXLAN_FLOWBASED] &&
+	    nla_get_u8(data[IFLA_VXLAN_FLOWBASED]))
+		vxlan->flags |= VXLAN_F_FLOW_BASED;
+
 	if (data[IFLA_VXLAN_PORT_RANGE]) {
 		const struct ifla_vxlan_port_range *p
 			= nla_data(data[IFLA_VXLAN_PORT_RANGE]);
@@ -2777,6 +2879,7 @@ static size_t vxlan_get_size(const struct net_device *dev)
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_RSC */
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_L2MISS */
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_L3MISS */
+		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_FLOWBASED */
 		nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_AGEING */
 		nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_LIMIT */
 		nla_total_size(sizeof(struct ifla_vxlan_port_range)) +
@@ -2843,6 +2946,8 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
 			!!(vxlan->flags & VXLAN_F_L2MISS)) ||
 	    nla_put_u8(skb, IFLA_VXLAN_L3MISS,
 			!!(vxlan->flags & VXLAN_F_L3MISS)) ||
+	    nla_put_u8(skb, IFLA_VXLAN_FLOWBASED,
+		       !!(vxlan->flags & VXLAN_F_FLOW_BASED)) ||
 	    nla_put_u32(skb, IFLA_VXLAN_AGEING, vxlan->age_interval) ||
 	    nla_put_u32(skb, IFLA_VXLAN_LIMIT, vxlan->addrmax) ||
 	    nla_put_be16(skb, IFLA_VXLAN_PORT, vxlan->dst_port) ||
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 6bd96fe9416a..648a2c241993 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3469,5 +3469,6 @@ static inline unsigned int skb_gso_network_seglen(const struct sk_buff *skb)
 			       skb_network_header(skb);
 	return hdr_len + skb_gso_transport_seglen(skb);
 }
+
 #endif	/* __KERNEL__ */
 #endif	/* _LINUX_SKBUFF_H */
diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h
index 4f7694f3c7d0..e843937fb30a 100644
--- a/include/net/dst_metadata.h
+++ b/include/net/dst_metadata.h
@@ -8,6 +8,9 @@
 struct metadata_dst {
 	struct dst_entry		dst;
 	size_t				opts_len;
+	union {
+		struct ip_tunnel_info	tun_info;
+	} u;
 };
 
 static inline struct metadata_dst *skb_metadata_dst(struct sk_buff *skb)
@@ -20,6 +23,16 @@ static inline struct metadata_dst *skb_metadata_dst(struct sk_buff *skb)
 	return NULL;
 }
 
+static inline struct ip_tunnel_info *skb_tunnel_info(struct sk_buff *skb)
+{
+	struct metadata_dst *md_dst = skb_metadata_dst(skb);
+
+	if (md_dst)
+		return &md_dst->u.tun_info;
+
+	return NULL;
+}
+
 static inline bool skb_valid_dst(const struct sk_buff *skb)
 {
 	struct dst_entry *dst = skb_dst(skb);
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 6b9d559ce5f5..d11530f1c1e2 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -38,10 +38,19 @@ struct ip_tunnel_key {
 	__be16			tp_dst;
 } __packed __aligned(4); /* Minimize padding. */
 
+/* Indicates whether the tunnel info structure represents receive
+ * or transmit tunnel parameters.
+ */
+enum {
+	IP_TUNNEL_INFO_RX,
+	IP_TUNNEL_INFO_TX,
+};
+
 struct ip_tunnel_info {
 	struct ip_tunnel_key	key;
 	const void		*options;
 	u8			options_len;
+	u8			mode;
 };
 
 /* 6rd prefix/relay information */
@@ -284,6 +293,11 @@ static inline void iptunnel_xmit_stats(int err,
 	}
 }
 
+static inline void *ip_tunnel_info_opts(struct ip_tunnel_info *info, size_t n)
+{
+	return info + 1;
+}
+
 #endif /* CONFIG_INET */
 
 #endif /* __NET_IP_TUNNELS_H */
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index 0082b5d33d7d..80a2da29e088 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -7,6 +7,7 @@
 #include <linux/skbuff.h>
 #include <linux/netdevice.h>
 #include <linux/udp.h>
+#include <net/dst_metadata.h>
 
 #define VNI_HASH_BITS	10
 #define VNI_HASH_SIZE	(1<<VNI_HASH_BITS)
@@ -97,6 +98,9 @@ struct vxlanhdr {
 struct vxlan_metadata {
 	__be32		vni;
 	u32		gbp;
+
+	/* Temporary until vxlan_rcv() API is gone */
+	struct metadata_dst *tun_dst;
 };
 
 struct vxlan_sock;
@@ -130,6 +134,8 @@ struct vxlan_sock {
 #define VXLAN_F_REMCSUM_RX		0x400
 #define VXLAN_F_GBP			0x800
 #define VXLAN_F_REMCSUM_NOPARTIAL	0x1000
+#define VXLAN_F_COLLECT_METADATA	0x2000
+#define VXLAN_F_FLOW_BASED		0x4000
 
 /* Flags that are used in the receive path. These flags must match in
  * order for a socket to be shareable
@@ -137,7 +143,9 @@ struct vxlan_sock {
 #define VXLAN_F_RCV_FLAGS		(VXLAN_F_GBP |			\
 					 VXLAN_F_UDP_ZERO_CSUM6_RX |	\
 					 VXLAN_F_REMCSUM_RX |		\
-					 VXLAN_F_REMCSUM_NOPARTIAL)
+					 VXLAN_F_REMCSUM_NOPARTIAL |	\
+					 VXLAN_F_COLLECT_METADATA |	\
+					 VXLAN_F_FLOW_BASED)
 
 struct vxlan_sock *vxlan_sock_add(struct net *net, __be16 port,
 				  vxlan_rcv_t *rcv, void *data,
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 24d68b797c59..9eeb5d9cf8f0 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -382,6 +382,7 @@ enum {
 	IFLA_VXLAN_REMCSUM_RX,
 	IFLA_VXLAN_GBP,
 	IFLA_VXLAN_REMCSUM_NOPARTIAL,
+	IFLA_VXLAN_FLOWBASED,
 	__IFLA_VXLAN_MAX
 };
 #define IFLA_VXLAN_MAX	(__IFLA_VXLAN_MAX - 1)
-- 
cgit v1.2.3


From 1b7179d3adff0ab71f85ee24d7de28ccb7734b89 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Tue, 21 Jul 2015 10:43:59 +0200
Subject: route: Extend flow representation with tunnel key

Add a new flowi_tunnel structure which is a subset of ip_tunnel_key to
allow routes to match on tunnel metadata. For now, the tunnel id is
added to flowi_tunnel which allows for routes to be bound to specific
virtual tunnels.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/flow.h      | 8 ++++++++
 net/ipv4/fib_frontend.c | 2 ++
 net/ipv4/route.c        | 8 ++++++++
 3 files changed, 18 insertions(+)

(limited to 'include/net')

diff --git a/include/net/flow.h b/include/net/flow.h
index 8109a159d1b3..3098ae33a178 100644
--- a/include/net/flow.h
+++ b/include/net/flow.h
@@ -19,6 +19,10 @@
 
 #define LOOPBACK_IFINDEX	1
 
+struct flowi_tunnel {
+	__be64			tun_id;
+};
+
 struct flowi_common {
 	int	flowic_oif;
 	int	flowic_iif;
@@ -30,6 +34,7 @@ struct flowi_common {
 #define FLOWI_FLAG_ANYSRC		0x01
 #define FLOWI_FLAG_KNOWN_NH		0x02
 	__u32	flowic_secid;
+	struct flowi_tunnel flowic_tun_key;
 };
 
 union flowi_uli {
@@ -66,6 +71,7 @@ struct flowi4 {
 #define flowi4_proto		__fl_common.flowic_proto
 #define flowi4_flags		__fl_common.flowic_flags
 #define flowi4_secid		__fl_common.flowic_secid
+#define flowi4_tun_key		__fl_common.flowic_tun_key
 
 	/* (saddr,daddr) must be grouped, same order as in IP header */
 	__be32			saddr;
@@ -95,6 +101,7 @@ static inline void flowi4_init_output(struct flowi4 *fl4, int oif,
 	fl4->flowi4_proto = proto;
 	fl4->flowi4_flags = flags;
 	fl4->flowi4_secid = 0;
+	fl4->flowi4_tun_key.tun_id = 0;
 	fl4->daddr = daddr;
 	fl4->saddr = saddr;
 	fl4->fl4_dport = dport;
@@ -165,6 +172,7 @@ struct flowi {
 #define flowi_proto	u.__fl_common.flowic_proto
 #define flowi_flags	u.__fl_common.flowic_flags
 #define flowi_secid	u.__fl_common.flowic_secid
+#define flowi_tun_key	u.__fl_common.flowic_tun_key
 } __attribute__((__aligned__(BITS_PER_LONG/8)));
 
 static inline struct flowi *flowi4_to_flowi(struct flowi4 *fl4)
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 9b2019cc3586..6b98de0d7949 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -280,6 +280,7 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb)
 		fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
 		fl4.flowi4_scope = scope;
 		fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0;
+		fl4.flowi4_tun_key.tun_id = 0;
 		if (!fib_lookup(net, &fl4, &res, 0))
 			return FIB_RES_PREFSRC(net, res);
 	} else {
@@ -313,6 +314,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
 	fl4.saddr = dst;
 	fl4.flowi4_tos = tos;
 	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
+	fl4.flowi4_tun_key.tun_id = 0;
 
 	no_addr = idev->ifa_list == NULL;
 
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 4c8e84e75871..91da18be0a71 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -91,6 +91,7 @@
 #include <linux/slab.h>
 #include <linux/jhash.h>
 #include <net/dst.h>
+#include <net/dst_metadata.h>
 #include <net/net_namespace.h>
 #include <net/protocol.h>
 #include <net/ip.h>
@@ -110,6 +111,7 @@
 #include <linux/kmemleak.h>
 #endif
 #include <net/secure_seq.h>
+#include <net/ip_tunnels.h>
 
 #define RT_FL_TOS(oldflp4) \
 	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
@@ -1673,6 +1675,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 {
 	struct fib_result res;
 	struct in_device *in_dev = __in_dev_get_rcu(dev);
+	struct ip_tunnel_info *tun_info;
 	struct flowi4	fl4;
 	unsigned int	flags = 0;
 	u32		itag = 0;
@@ -1690,6 +1693,11 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	   by fib_lookup.
 	 */
 
+	tun_info = skb_tunnel_info(skb);
+	if (tun_info && tun_info->mode == IP_TUNNEL_INFO_RX)
+		fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
+	else
+		fl4.flowi4_tun_key.tun_id = 0;
 	skb_dst_drop(skb);
 
 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
-- 
cgit v1.2.3


From 3093fbe7ff4bc7d1571fc217dade1cf80330a714 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Tue, 21 Jul 2015 10:44:00 +0200
Subject: route: Per route IP tunnel metadata via lightweight tunnel

This introduces a new IP tunnel lightweight tunnel type which allows
to specify IP tunnel instructions per route. Only IPv4 is supported
at this point.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vxlan.c            |  10 +++-
 include/net/dst_metadata.h     |  12 ++++-
 include/net/ip_tunnels.h       |   7 ++-
 include/uapi/linux/lwtunnel.h  |   1 +
 include/uapi/linux/rtnetlink.h |  15 ++++++
 net/ipv4/ip_tunnel_core.c      | 114 +++++++++++++++++++++++++++++++++++++++++
 net/ipv4/route.c               |   2 +-
 net/openvswitch/vport.h        |   1 +
 8 files changed, 157 insertions(+), 5 deletions(-)

(limited to 'include/net')

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 06c092b05a51..9486d7ec128c 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -1935,7 +1935,7 @@ static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan,
 static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 			   struct vxlan_rdst *rdst, bool did_rsc)
 {
-	struct ip_tunnel_info *info = skb_tunnel_info(skb);
+	struct ip_tunnel_info *info;
 	struct vxlan_dev *vxlan = netdev_priv(dev);
 	struct sock *sk = vxlan->vn_sock->sock->sk;
 	struct rtable *rt = NULL;
@@ -1952,6 +1952,9 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 	int err;
 	u32 flags = vxlan->flags;
 
+	/* FIXME: Support IPv6 */
+	info = skb_tunnel_info(skb, AF_INET);
+
 	if (rdst) {
 		dst_port = rdst->remote_port ? rdst->remote_port : vxlan->dst_port;
 		vni = rdst->remote_vni;
@@ -2141,12 +2144,15 @@ tx_free:
 static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct vxlan_dev *vxlan = netdev_priv(dev);
-	const struct ip_tunnel_info *info = skb_tunnel_info(skb);
+	const struct ip_tunnel_info *info;
 	struct ethhdr *eth;
 	bool did_rsc = false;
 	struct vxlan_rdst *rdst, *fdst = NULL;
 	struct vxlan_fdb *f;
 
+	/* FIXME: Support IPv6 */
+	info = skb_tunnel_info(skb, AF_INET);
+
 	skb_reset_mac_header(skb);
 	eth = eth_hdr(skb);
 
diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h
index e843937fb30a..7b0306894663 100644
--- a/include/net/dst_metadata.h
+++ b/include/net/dst_metadata.h
@@ -23,13 +23,23 @@ static inline struct metadata_dst *skb_metadata_dst(struct sk_buff *skb)
 	return NULL;
 }
 
-static inline struct ip_tunnel_info *skb_tunnel_info(struct sk_buff *skb)
+static inline struct ip_tunnel_info *skb_tunnel_info(struct sk_buff *skb,
+						     int family)
 {
 	struct metadata_dst *md_dst = skb_metadata_dst(skb);
+	struct rtable *rt;
 
 	if (md_dst)
 		return &md_dst->u.tun_info;
 
+	switch (family) {
+	case AF_INET:
+		rt = (struct rtable *)skb_dst(skb);
+		if (rt && rt->rt_lwtstate)
+			return lwt_tun_info(rt->rt_lwtstate);
+		break;
+	}
+
 	return NULL;
 }
 
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index d11530f1c1e2..0b7e18cfa0b4 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -9,9 +9,9 @@
 #include <net/dsfield.h>
 #include <net/gro_cells.h>
 #include <net/inet_ecn.h>
-#include <net/ip.h>
 #include <net/netns/generic.h>
 #include <net/rtnetlink.h>
+#include <net/lwtunnel.h>
 
 #if IS_ENABLED(CONFIG_IPV6)
 #include <net/ipv6.h>
@@ -298,6 +298,11 @@ static inline void *ip_tunnel_info_opts(struct ip_tunnel_info *info, size_t n)
 	return info + 1;
 }
 
+static inline struct ip_tunnel_info *lwt_tun_info(struct lwtunnel_state *lwtstate)
+{
+	return (struct ip_tunnel_info *)lwtstate->data;
+}
+
 #endif /* CONFIG_INET */
 
 #endif /* __NET_IP_TUNNELS_H */
diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h
index aa611d931a31..31377bbea3f8 100644
--- a/include/uapi/linux/lwtunnel.h
+++ b/include/uapi/linux/lwtunnel.h
@@ -6,6 +6,7 @@
 enum lwtunnel_encap_types {
 	LWTUNNEL_ENCAP_NONE,
 	LWTUNNEL_ENCAP_MPLS,
+	LWTUNNEL_ENCAP_IP,
 	__LWTUNNEL_ENCAP_MAX,
 };
 
diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index 0d3d3cc43356..47d24cb3fbc1 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -286,6 +286,21 @@ enum rt_class_t {
 
 /* Routing message attributes */
 
+enum ip_tunnel_t {
+	IP_TUN_UNSPEC,
+	IP_TUN_ID,
+	IP_TUN_DST,
+	IP_TUN_SRC,
+	IP_TUN_TTL,
+	IP_TUN_TOS,
+	IP_TUN_SPORT,
+	IP_TUN_DPORT,
+	IP_TUN_FLAGS,
+	__IP_TUN_MAX,
+};
+
+#define IP_TUN_MAX (__IP_TUN_MAX - 1)
+
 enum rtattr_type_t {
 	RTA_UNSPEC,
 	RTA_DST,
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 6a51a71a6c67..025b76e803fd 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -190,3 +190,117 @@ struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev,
 	return tot;
 }
 EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64);
+
+static const struct nla_policy ip_tun_policy[IP_TUN_MAX + 1] = {
+	[IP_TUN_ID]		= { .type = NLA_U64 },
+	[IP_TUN_DST]		= { .type = NLA_U32 },
+	[IP_TUN_SRC]		= { .type = NLA_U32 },
+	[IP_TUN_TTL]		= { .type = NLA_U8 },
+	[IP_TUN_TOS]		= { .type = NLA_U8 },
+	[IP_TUN_SPORT]		= { .type = NLA_U16 },
+	[IP_TUN_DPORT]		= { .type = NLA_U16 },
+	[IP_TUN_FLAGS]		= { .type = NLA_U16 },
+};
+
+static int ip_tun_build_state(struct net_device *dev, struct nlattr *attr,
+			      struct lwtunnel_state **ts)
+{
+	struct ip_tunnel_info *tun_info;
+	struct lwtunnel_state *new_state;
+	struct nlattr *tb[IP_TUN_MAX + 1];
+	int err;
+
+	err = nla_parse_nested(tb, IP_TUN_MAX, attr, ip_tun_policy);
+	if (err < 0)
+		return err;
+
+	new_state = lwtunnel_state_alloc(sizeof(*tun_info));
+	if (!new_state)
+		return -ENOMEM;
+
+	new_state->type = LWTUNNEL_ENCAP_IP;
+
+	tun_info = lwt_tun_info(new_state);
+
+	if (tb[IP_TUN_ID])
+		tun_info->key.tun_id = nla_get_u64(tb[IP_TUN_ID]);
+
+	if (tb[IP_TUN_DST])
+		tun_info->key.ipv4_dst = nla_get_be32(tb[IP_TUN_DST]);
+
+	if (tb[IP_TUN_SRC])
+		tun_info->key.ipv4_src = nla_get_be32(tb[IP_TUN_SRC]);
+
+	if (tb[IP_TUN_TTL])
+		tun_info->key.ipv4_ttl = nla_get_u8(tb[IP_TUN_TTL]);
+
+	if (tb[IP_TUN_TOS])
+		tun_info->key.ipv4_tos = nla_get_u8(tb[IP_TUN_TOS]);
+
+	if (tb[IP_TUN_SPORT])
+		tun_info->key.tp_src = nla_get_be16(tb[IP_TUN_SPORT]);
+
+	if (tb[IP_TUN_DPORT])
+		tun_info->key.tp_dst = nla_get_be16(tb[IP_TUN_DPORT]);
+
+	if (tb[IP_TUN_FLAGS])
+		tun_info->key.tun_flags = nla_get_u16(tb[IP_TUN_FLAGS]);
+
+	tun_info->mode = IP_TUNNEL_INFO_TX;
+	tun_info->options = NULL;
+	tun_info->options_len = 0;
+
+	*ts = new_state;
+
+	return 0;
+}
+
+static int ip_tun_fill_encap_info(struct sk_buff *skb,
+				  struct lwtunnel_state *lwtstate)
+{
+	struct ip_tunnel_info *tun_info = lwt_tun_info(lwtstate);
+
+	if (nla_put_u64(skb, IP_TUN_ID, tun_info->key.tun_id) ||
+	    nla_put_be32(skb, IP_TUN_DST, tun_info->key.ipv4_dst) ||
+	    nla_put_be32(skb, IP_TUN_SRC, tun_info->key.ipv4_src) ||
+	    nla_put_u8(skb, IP_TUN_TOS, tun_info->key.ipv4_tos) ||
+	    nla_put_u8(skb, IP_TUN_TTL, tun_info->key.ipv4_ttl) ||
+	    nla_put_u16(skb, IP_TUN_SPORT, tun_info->key.tp_src) ||
+	    nla_put_u16(skb, IP_TUN_DPORT, tun_info->key.tp_dst) ||
+	    nla_put_u16(skb, IP_TUN_FLAGS, tun_info->key.tun_flags))
+		return -ENOMEM;
+
+	return 0;
+}
+
+static int ip_tun_encap_nlsize(struct lwtunnel_state *lwtstate)
+{
+	return nla_total_size(8)	/* IP_TUN_ID */
+		+ nla_total_size(4)	/* IP_TUN_DST */
+		+ nla_total_size(4)	/* IP_TUN_SRC */
+		+ nla_total_size(1)	/* IP_TUN_TOS */
+		+ nla_total_size(1)	/* IP_TUN_TTL */
+		+ nla_total_size(2)	/* IP_TUN_SPORT */
+		+ nla_total_size(2)	/* IP_TUN_DPORT */
+		+ nla_total_size(2);	/* IP_TUN_FLAGS */
+}
+
+static const struct lwtunnel_encap_ops ip_tun_lwt_ops = {
+	.build_state = ip_tun_build_state,
+	.fill_encap = ip_tun_fill_encap_info,
+	.get_encap_size = ip_tun_encap_nlsize,
+};
+
+static int __init ip_tunnel_core_init(void)
+{
+	lwtunnel_encap_add_ops(&ip_tun_lwt_ops, LWTUNNEL_ENCAP_IP);
+
+	return 0;
+}
+module_init(ip_tunnel_core_init);
+
+static void __exit ip_tunnel_core_exit(void)
+{
+	lwtunnel_encap_del_ops(&ip_tun_lwt_ops, LWTUNNEL_ENCAP_IP);
+}
+module_exit(ip_tunnel_core_exit);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 91da18be0a71..519ec232818d 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1693,7 +1693,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	   by fib_lookup.
 	 */
 
-	tun_info = skb_tunnel_info(skb);
+	tun_info = skb_tunnel_info(skb, AF_INET);
 	if (tun_info && tun_info->mode == IP_TUNNEL_INFO_RX)
 		fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
 	else
diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h
index 4750fb673a9f..75d68248ba69 100644
--- a/net/openvswitch/vport.h
+++ b/net/openvswitch/vport.h
@@ -27,6 +27,7 @@
 #include <linux/skbuff.h>
 #include <linux/spinlock.h>
 #include <linux/u64_stats_sync.h>
+#include <net/route.h>
 
 #include "datapath.h"
 
-- 
cgit v1.2.3


From e7030878fc8448492b6e5cecd574043f63271298 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Tue, 21 Jul 2015 10:44:01 +0200
Subject: fib: Add fib rule match on tunnel id

This add the ability to select a routing table based on the tunnel
id which allows to maintain separate routing tables for each virtual
tunnel network.

ip rule add from all tunnel-id 100 lookup 100
ip rule add from all tunnel-id 200 lookup 200

A new static key controls the collection of metadata at tunnel level
upon demand.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vxlan.c            |  3 ++-
 include/net/fib_rules.h        |  1 +
 include/net/ip_tunnels.h       | 11 +++++++++++
 include/uapi/linux/fib_rules.h |  2 +-
 net/core/fib_rules.c           | 24 ++++++++++++++++++++++--
 net/ipv4/ip_tunnel_core.c      | 16 ++++++++++++++++
 6 files changed, 53 insertions(+), 4 deletions(-)

(limited to 'include/net')

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 9486d7ec128c..2587ac84f71a 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -143,7 +143,8 @@ static struct workqueue_struct *vxlan_wq;
 
 static inline bool vxlan_collect_metadata(struct vxlan_sock *vs)
 {
-	return vs->flags & VXLAN_F_COLLECT_METADATA;
+	return vs->flags & VXLAN_F_COLLECT_METADATA ||
+	       ip_tunnel_collect_metadata();
 }
 
 #if IS_ENABLED(CONFIG_IPV6)
diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h
index 903a55efbffe..4e8f804f4589 100644
--- a/include/net/fib_rules.h
+++ b/include/net/fib_rules.h
@@ -19,6 +19,7 @@ struct fib_rule {
 	u8			action;
 	/* 3 bytes hole, try to use */
 	u32			target;
+	__be64			tun_id;
 	struct fib_rule __rcu	*ctarget;
 	struct net		*fr_net;
 
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 0b7e18cfa0b4..0a5a7763eec2 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -303,6 +303,17 @@ static inline struct ip_tunnel_info *lwt_tun_info(struct lwtunnel_state *lwtstat
 	return (struct ip_tunnel_info *)lwtstate->data;
 }
 
+extern struct static_key ip_tunnel_metadata_cnt;
+
+/* Returns > 0 if metadata should be collected */
+static inline int ip_tunnel_collect_metadata(void)
+{
+	return static_key_false(&ip_tunnel_metadata_cnt);
+}
+
+void ip_tunnel_need_metadata(void);
+void ip_tunnel_unneed_metadata(void);
+
 #endif /* CONFIG_INET */
 
 #endif /* __NET_IP_TUNNELS_H */
diff --git a/include/uapi/linux/fib_rules.h b/include/uapi/linux/fib_rules.h
index 2b82d7e30974..96161b8202b5 100644
--- a/include/uapi/linux/fib_rules.h
+++ b/include/uapi/linux/fib_rules.h
@@ -43,7 +43,7 @@ enum {
 	FRA_UNUSED5,
 	FRA_FWMARK,	/* mark */
 	FRA_FLOW,	/* flow/class id */
-	FRA_UNUSED6,
+	FRA_TUN_ID,
 	FRA_SUPPRESS_IFGROUP,
 	FRA_SUPPRESS_PREFIXLEN,
 	FRA_TABLE,	/* Extended table id */
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 9a12668f7d62..ae8306e7c56f 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -16,6 +16,7 @@
 #include <net/net_namespace.h>
 #include <net/sock.h>
 #include <net/fib_rules.h>
+#include <net/ip_tunnels.h>
 
 int fib_default_rule_add(struct fib_rules_ops *ops,
 			 u32 pref, u32 table, u32 flags)
@@ -186,6 +187,9 @@ static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops,
 	if ((rule->mark ^ fl->flowi_mark) & rule->mark_mask)
 		goto out;
 
+	if (rule->tun_id && (rule->tun_id != fl->flowi_tun_key.tun_id))
+		goto out;
+
 	ret = ops->match(rule, fl, flags);
 out:
 	return (rule->flags & FIB_RULE_INVERT) ? !ret : ret;
@@ -330,6 +334,9 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh)
 	if (tb[FRA_FWMASK])
 		rule->mark_mask = nla_get_u32(tb[FRA_FWMASK]);
 
+	if (tb[FRA_TUN_ID])
+		rule->tun_id = nla_get_be64(tb[FRA_TUN_ID]);
+
 	rule->action = frh->action;
 	rule->flags = frh->flags;
 	rule->table = frh_get_table(frh, tb);
@@ -407,6 +414,9 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh)
 	if (unresolved)
 		ops->unresolved_rules++;
 
+	if (rule->tun_id)
+		ip_tunnel_need_metadata();
+
 	notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).portid);
 	flush_route_cache(ops);
 	rules_ops_put(ops);
@@ -473,6 +483,10 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh)
 		    (rule->mark_mask != nla_get_u32(tb[FRA_FWMASK])))
 			continue;
 
+		if (tb[FRA_TUN_ID] &&
+		    (rule->tun_id != nla_get_be64(tb[FRA_TUN_ID])))
+			continue;
+
 		if (!ops->compare(rule, frh, tb))
 			continue;
 
@@ -487,6 +501,9 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh)
 				goto errout;
 		}
 
+		if (rule->tun_id)
+			ip_tunnel_unneed_metadata();
+
 		list_del_rcu(&rule->list);
 
 		if (rule->action == FR_ACT_GOTO) {
@@ -535,7 +552,8 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops,
 			 + nla_total_size(4) /* FRA_SUPPRESS_PREFIXLEN */
 			 + nla_total_size(4) /* FRA_SUPPRESS_IFGROUP */
 			 + nla_total_size(4) /* FRA_FWMARK */
-			 + nla_total_size(4); /* FRA_FWMASK */
+			 + nla_total_size(4) /* FRA_FWMASK */
+			 + nla_total_size(8); /* FRA_TUN_ID */
 
 	if (ops->nlmsg_payload)
 		payload += ops->nlmsg_payload(rule);
@@ -591,7 +609,9 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule,
 	    ((rule->mark_mask || rule->mark) &&
 	     nla_put_u32(skb, FRA_FWMASK, rule->mark_mask)) ||
 	    (rule->target &&
-	     nla_put_u32(skb, FRA_GOTO, rule->target)))
+	     nla_put_u32(skb, FRA_GOTO, rule->target)) ||
+	    (rule->tun_id &&
+	     nla_put_be64(skb, FRA_TUN_ID, rule->tun_id)))
 		goto nla_put_failure;
 
 	if (rule->suppress_ifgroup != -1) {
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 025b76e803fd..630e6d5712e8 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -32,6 +32,7 @@
 #include <linux/etherdevice.h>
 #include <linux/if_ether.h>
 #include <linux/if_vlan.h>
+#include <linux/static_key.h>
 
 #include <net/ip.h>
 #include <net/icmp.h>
@@ -304,3 +305,18 @@ static void __exit ip_tunnel_core_exit(void)
 	lwtunnel_encap_del_ops(&ip_tun_lwt_ops, LWTUNNEL_ENCAP_IP);
 }
 module_exit(ip_tunnel_core_exit);
+
+struct static_key ip_tunnel_metadata_cnt = STATIC_KEY_INIT_FALSE;
+EXPORT_SYMBOL(ip_tunnel_metadata_cnt);
+
+void ip_tunnel_need_metadata(void)
+{
+	static_key_slow_inc(&ip_tunnel_metadata_cnt);
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_need_metadata);
+
+void ip_tunnel_unneed_metadata(void)
+{
+	static_key_slow_dec(&ip_tunnel_metadata_cnt);
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_unneed_metadata);
-- 
cgit v1.2.3


From 0dfbdf4102b9303d3ddf2177c0220098ff99f6de Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Tue, 21 Jul 2015 10:44:02 +0200
Subject: vxlan: Factor out device configuration

This factors out the device configuration out of the RTNL newlink
API which allows for in-kernel creation of VXLAN net_devices.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vxlan.c | 332 ++++++++++++++++++++++++++++------------------------
 include/net/vxlan.h |  59 ++++++++++
 2 files changed, 236 insertions(+), 155 deletions(-)

(limited to 'include/net')

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 2587ac84f71a..30e1f215af73 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -55,10 +55,6 @@
 
 #define PORT_HASH_BITS	8
 #define PORT_HASH_SIZE  (1<<PORT_HASH_BITS)
-#define VNI_HASH_BITS	10
-#define VNI_HASH_SIZE	(1<<VNI_HASH_BITS)
-#define FDB_HASH_BITS	8
-#define FDB_HASH_SIZE	(1<<FDB_HASH_BITS)
 #define FDB_AGE_DEFAULT 300 /* 5 min */
 #define FDB_AGE_INTERVAL (10 * HZ)	/* rescan interval */
 
@@ -75,6 +71,7 @@ module_param(log_ecn_error, bool, 0644);
 MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
 
 static int vxlan_net_id;
+static struct rtnl_link_ops vxlan_link_ops;
 
 static const u8 all_zeros_mac[ETH_ALEN];
 
@@ -85,21 +82,6 @@ struct vxlan_net {
 	spinlock_t	  sock_lock;
 };
 
-union vxlan_addr {
-	struct sockaddr_in sin;
-	struct sockaddr_in6 sin6;
-	struct sockaddr sa;
-};
-
-struct vxlan_rdst {
-	union vxlan_addr	 remote_ip;
-	__be16			 remote_port;
-	u32			 remote_vni;
-	u32			 remote_ifindex;
-	struct list_head	 list;
-	struct rcu_head		 rcu;
-};
-
 /* Forwarding table entry */
 struct vxlan_fdb {
 	struct hlist_node hlist;	/* linked list of entries */
@@ -112,31 +94,6 @@ struct vxlan_fdb {
 	u8		  flags;	/* see ndm_flags */
 };
 
-/* Pseudo network device */
-struct vxlan_dev {
-	struct hlist_node hlist;	/* vni hash table */
-	struct list_head  next;		/* vxlan's per namespace list */
-	struct vxlan_sock *vn_sock;	/* listening socket */
-	struct net_device *dev;
-	struct net	  *net;		/* netns for packet i/o */
-	struct vxlan_rdst default_dst;	/* default destination */
-	union vxlan_addr  saddr;	/* source address */
-	__be16		  dst_port;
-	__u16		  port_min;	/* source port range */
-	__u16		  port_max;
-	__u8		  tos;		/* TOS override */
-	__u8		  ttl;
-	u32		  flags;	/* VXLAN_F_* in vxlan.h */
-
-	unsigned long	  age_interval;
-	struct timer_list age_timer;
-	spinlock_t	  hash_lock;
-	unsigned int	  addrcnt;
-	unsigned int	  addrmax;
-
-	struct hlist_head fdb_head[FDB_HASH_SIZE];
-};
-
 /* salt for hash table */
 static u32 vxlan_salt __read_mostly;
 static struct workqueue_struct *vxlan_wq;
@@ -352,7 +309,7 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan,
 	if (send_ip && vxlan_nla_put_addr(skb, NDA_DST, &rdst->remote_ip))
 		goto nla_put_failure;
 
-	if (rdst->remote_port && rdst->remote_port != vxlan->dst_port &&
+	if (rdst->remote_port && rdst->remote_port != vxlan->cfg.dst_port &&
 	    nla_put_be16(skb, NDA_PORT, rdst->remote_port))
 		goto nla_put_failure;
 	if (rdst->remote_vni != vxlan->default_dst.remote_vni &&
@@ -756,7 +713,8 @@ static int vxlan_fdb_create(struct vxlan_dev *vxlan,
 		if (!(flags & NLM_F_CREATE))
 			return -ENOENT;
 
-		if (vxlan->addrmax && vxlan->addrcnt >= vxlan->addrmax)
+		if (vxlan->cfg.addrmax &&
+		    vxlan->addrcnt >= vxlan->cfg.addrmax)
 			return -ENOSPC;
 
 		/* Disallow replace to add a multicast entry */
@@ -842,7 +800,7 @@ static int vxlan_fdb_parse(struct nlattr *tb[], struct vxlan_dev *vxlan,
 			return -EINVAL;
 		*port = nla_get_be16(tb[NDA_PORT]);
 	} else {
-		*port = vxlan->dst_port;
+		*port = vxlan->cfg.dst_port;
 	}
 
 	if (tb[NDA_VNI]) {
@@ -1028,7 +986,7 @@ static bool vxlan_snoop(struct net_device *dev,
 			vxlan_fdb_create(vxlan, src_mac, src_ip,
 					 NUD_REACHABLE,
 					 NLM_F_EXCL|NLM_F_CREATE,
-					 vxlan->dst_port,
+					 vxlan->cfg.dst_port,
 					 vxlan->default_dst.remote_vni,
 					 0, NTF_SELF);
 		spin_unlock(&vxlan->hash_lock);
@@ -1957,7 +1915,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 	info = skb_tunnel_info(skb, AF_INET);
 
 	if (rdst) {
-		dst_port = rdst->remote_port ? rdst->remote_port : vxlan->dst_port;
+		dst_port = rdst->remote_port ? rdst->remote_port : vxlan->cfg.dst_port;
 		vni = rdst->remote_vni;
 		dst = &rdst->remote_ip;
 	} else {
@@ -1967,7 +1925,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 			goto drop;
 		}
 
-		dst_port = info->key.tp_dst ? : vxlan->dst_port;
+		dst_port = info->key.tp_dst ? : vxlan->cfg.dst_port;
 		vni = be64_to_cpu(info->key.tun_id);
 		remote_ip.sin.sin_family = AF_INET;
 		remote_ip.sin.sin_addr.s_addr = info->key.ipv4_dst;
@@ -1985,16 +1943,16 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 
 	old_iph = ip_hdr(skb);
 
-	ttl = vxlan->ttl;
+	ttl = vxlan->cfg.ttl;
 	if (!ttl && vxlan_addr_multicast(dst))
 		ttl = 1;
 
-	tos = vxlan->tos;
+	tos = vxlan->cfg.tos;
 	if (tos == 1)
 		tos = ip_tunnel_get_dsfield(old_iph, skb);
 
-	src_port = udp_flow_src_port(dev_net(dev), skb, vxlan->port_min,
-				     vxlan->port_max, true);
+	src_port = udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min,
+				     vxlan->cfg.port_max, true);
 
 	if (dst->sa.sa_family == AF_INET) {
 		if (info) {
@@ -2020,7 +1978,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 		fl4.flowi4_mark = skb->mark;
 		fl4.flowi4_proto = IPPROTO_UDP;
 		fl4.daddr = dst->sin.sin_addr.s_addr;
-		fl4.saddr = vxlan->saddr.sin.sin_addr.s_addr;
+		fl4.saddr = vxlan->cfg.saddr.sin.sin_addr.s_addr;
 
 		rt = ip_route_output_key(vxlan->net, &fl4);
 		if (IS_ERR(rt)) {
@@ -2076,7 +2034,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 		memset(&fl6, 0, sizeof(fl6));
 		fl6.flowi6_oif = rdst ? rdst->remote_ifindex : 0;
 		fl6.daddr = dst->sin6.sin6_addr;
-		fl6.saddr = vxlan->saddr.sin6.sin6_addr;
+		fl6.saddr = vxlan->cfg.saddr.sin6.sin6_addr;
 		fl6.flowi6_mark = skb->mark;
 		fl6.flowi6_proto = IPPROTO_UDP;
 
@@ -2247,7 +2205,7 @@ static void vxlan_cleanup(unsigned long arg)
 			if (f->state & NUD_PERMANENT)
 				continue;
 
-			timeout = f->used + vxlan->age_interval * HZ;
+			timeout = f->used + vxlan->cfg.age_interval * HZ;
 			if (time_before_eq(timeout, jiffies)) {
 				netdev_dbg(vxlan->dev,
 					   "garbage collect %pM\n",
@@ -2311,8 +2269,8 @@ static int vxlan_open(struct net_device *dev)
 	struct vxlan_sock *vs;
 	int ret = 0;
 
-	vs = vxlan_sock_add(vxlan->net, vxlan->dst_port, vxlan_rcv, NULL,
-			    false, vxlan->flags);
+	vs = vxlan_sock_add(vxlan->net, vxlan->cfg.dst_port, vxlan_rcv,
+			    NULL, vxlan->cfg.no_share, vxlan->flags);
 	if (IS_ERR(vs))
 		return PTR_ERR(vs);
 
@@ -2326,7 +2284,7 @@ static int vxlan_open(struct net_device *dev)
 		}
 	}
 
-	if (vxlan->age_interval)
+	if (vxlan->cfg.age_interval)
 		mod_timer(&vxlan->age_timer, jiffies + FDB_AGE_INTERVAL);
 
 	return ret;
@@ -2484,7 +2442,7 @@ static void vxlan_setup(struct net_device *dev)
 	vxlan->age_timer.function = vxlan_cleanup;
 	vxlan->age_timer.data = (unsigned long) vxlan;
 
-	vxlan->dst_port = htons(vxlan_port);
+	vxlan->cfg.dst_port = htons(vxlan_port);
 
 	vxlan->dev = dev;
 
@@ -2684,54 +2642,35 @@ struct vxlan_sock *vxlan_sock_add(struct net *net, __be16 port,
 }
 EXPORT_SYMBOL_GPL(vxlan_sock_add);
 
-static int vxlan_newlink(struct net *src_net, struct net_device *dev,
-			 struct nlattr *tb[], struct nlattr *data[])
+static int vxlan_dev_configure(struct net *src_net, struct net_device *dev,
+			       struct vxlan_config *conf)
 {
 	struct vxlan_net *vn = net_generic(src_net, vxlan_net_id);
 	struct vxlan_dev *vxlan = netdev_priv(dev);
 	struct vxlan_rdst *dst = &vxlan->default_dst;
-	__u32 vni;
 	int err;
 	bool use_ipv6 = false;
-
-	if (!data[IFLA_VXLAN_ID])
-		return -EINVAL;
+	__be16 default_port = vxlan->cfg.dst_port;
 
 	vxlan->net = src_net;
 
-	vni = nla_get_u32(data[IFLA_VXLAN_ID]);
-	dst->remote_vni = vni;
+	dst->remote_vni = conf->vni;
 
-	/* Unless IPv6 is explicitly requested, assume IPv4 */
-	dst->remote_ip.sa.sa_family = AF_INET;
-	if (data[IFLA_VXLAN_GROUP]) {
-		dst->remote_ip.sin.sin_addr.s_addr = nla_get_in_addr(data[IFLA_VXLAN_GROUP]);
-	} else if (data[IFLA_VXLAN_GROUP6]) {
-		if (!IS_ENABLED(CONFIG_IPV6))
-			return -EPFNOSUPPORT;
-
-		dst->remote_ip.sin6.sin6_addr = nla_get_in6_addr(data[IFLA_VXLAN_GROUP6]);
-		dst->remote_ip.sa.sa_family = AF_INET6;
-		use_ipv6 = true;
-	}
+	memcpy(&dst->remote_ip, &conf->remote_ip, sizeof(conf->remote_ip));
 
-	if (data[IFLA_VXLAN_LOCAL]) {
-		vxlan->saddr.sin.sin_addr.s_addr = nla_get_in_addr(data[IFLA_VXLAN_LOCAL]);
-		vxlan->saddr.sa.sa_family = AF_INET;
-	} else if (data[IFLA_VXLAN_LOCAL6]) {
-		if (!IS_ENABLED(CONFIG_IPV6))
-			return -EPFNOSUPPORT;
+	/* Unless IPv6 is explicitly requested, assume IPv4 */
+	if (!dst->remote_ip.sa.sa_family)
+		dst->remote_ip.sa.sa_family = AF_INET;
 
-		/* TODO: respect scope id */
-		vxlan->saddr.sin6.sin6_addr = nla_get_in6_addr(data[IFLA_VXLAN_LOCAL6]);
-		vxlan->saddr.sa.sa_family = AF_INET6;
+	if (dst->remote_ip.sa.sa_family == AF_INET6 ||
+	    vxlan->cfg.saddr.sa.sa_family == AF_INET6)
 		use_ipv6 = true;
-	}
 
-	if (data[IFLA_VXLAN_LINK] &&
-	    (dst->remote_ifindex = nla_get_u32(data[IFLA_VXLAN_LINK]))) {
+	if (conf->remote_ifindex) {
 		struct net_device *lowerdev
-			 = __dev_get_by_index(src_net, dst->remote_ifindex);
+			 = __dev_get_by_index(src_net, conf->remote_ifindex);
+
+		dst->remote_ifindex = conf->remote_ifindex;
 
 		if (!lowerdev) {
 			pr_info("ifindex %d does not exist\n", dst->remote_ifindex);
@@ -2749,7 +2688,7 @@ static int vxlan_newlink(struct net *src_net, struct net_device *dev,
 		}
 #endif
 
-		if (!tb[IFLA_MTU])
+		if (!conf->mtu)
 			dev->mtu = lowerdev->mtu - (use_ipv6 ? VXLAN6_HEADROOM : VXLAN_HEADROOM);
 
 		dev->needed_headroom = lowerdev->hard_header_len +
@@ -2757,105 +2696,188 @@ static int vxlan_newlink(struct net *src_net, struct net_device *dev,
 	} else if (use_ipv6)
 		vxlan->flags |= VXLAN_F_IPV6;
 
+	memcpy(&vxlan->cfg, conf, sizeof(*conf));
+	if (!vxlan->cfg.dst_port)
+		vxlan->cfg.dst_port = default_port;
+	vxlan->flags |= conf->flags;
+
+	if (!vxlan->cfg.age_interval)
+		vxlan->cfg.age_interval = FDB_AGE_DEFAULT;
+
+	if (vxlan_find_vni(src_net, conf->vni, use_ipv6 ? AF_INET6 : AF_INET,
+			   vxlan->cfg.dst_port, vxlan->flags))
+		return -EEXIST;
+
+	dev->ethtool_ops = &vxlan_ethtool_ops;
+
+	/* create an fdb entry for a valid default destination */
+	if (!vxlan_addr_any(&vxlan->default_dst.remote_ip)) {
+		err = vxlan_fdb_create(vxlan, all_zeros_mac,
+				       &vxlan->default_dst.remote_ip,
+				       NUD_REACHABLE|NUD_PERMANENT,
+				       NLM_F_EXCL|NLM_F_CREATE,
+				       vxlan->cfg.dst_port,
+				       vxlan->default_dst.remote_vni,
+				       vxlan->default_dst.remote_ifindex,
+				       NTF_SELF);
+		if (err)
+			return err;
+	}
+
+	err = register_netdevice(dev);
+	if (err) {
+		vxlan_fdb_delete_default(vxlan);
+		return err;
+	}
+
+	list_add(&vxlan->next, &vn->vxlan_list);
+
+	return 0;
+}
+
+struct net_device *vxlan_dev_create(struct net *net, const char *name,
+				    u8 name_assign_type, struct vxlan_config *conf)
+{
+	struct nlattr *tb[IFLA_MAX+1];
+	struct net_device *dev;
+	int err;
+
+	memset(&tb, 0, sizeof(tb));
+
+	dev = rtnl_create_link(net, name, name_assign_type,
+			       &vxlan_link_ops, tb);
+	if (IS_ERR(dev))
+		return dev;
+
+	err = vxlan_dev_configure(net, dev, conf);
+	if (err < 0) {
+		free_netdev(dev);
+		return ERR_PTR(err);
+	}
+
+	return dev;
+}
+EXPORT_SYMBOL_GPL(vxlan_dev_create);
+
+static int vxlan_newlink(struct net *src_net, struct net_device *dev,
+			 struct nlattr *tb[], struct nlattr *data[])
+{
+	struct vxlan_config conf;
+	int err;
+
+	if (!data[IFLA_VXLAN_ID])
+		return -EINVAL;
+
+	memset(&conf, 0, sizeof(conf));
+	conf.vni = nla_get_u32(data[IFLA_VXLAN_ID]);
+
+	if (data[IFLA_VXLAN_GROUP]) {
+		conf.remote_ip.sin.sin_addr.s_addr = nla_get_in_addr(data[IFLA_VXLAN_GROUP]);
+	} else if (data[IFLA_VXLAN_GROUP6]) {
+		if (!IS_ENABLED(CONFIG_IPV6))
+			return -EPFNOSUPPORT;
+
+		conf.remote_ip.sin6.sin6_addr = nla_get_in6_addr(data[IFLA_VXLAN_GROUP6]);
+		conf.remote_ip.sa.sa_family = AF_INET6;
+	}
+
+	if (data[IFLA_VXLAN_LOCAL]) {
+		conf.saddr.sin.sin_addr.s_addr = nla_get_in_addr(data[IFLA_VXLAN_LOCAL]);
+		conf.saddr.sa.sa_family = AF_INET;
+	} else if (data[IFLA_VXLAN_LOCAL6]) {
+		if (!IS_ENABLED(CONFIG_IPV6))
+			return -EPFNOSUPPORT;
+
+		/* TODO: respect scope id */
+		conf.saddr.sin6.sin6_addr = nla_get_in6_addr(data[IFLA_VXLAN_LOCAL6]);
+		conf.saddr.sa.sa_family = AF_INET6;
+	}
+
+	if (data[IFLA_VXLAN_LINK])
+		conf.remote_ifindex = nla_get_u32(data[IFLA_VXLAN_LINK]);
+
 	if (data[IFLA_VXLAN_TOS])
-		vxlan->tos  = nla_get_u8(data[IFLA_VXLAN_TOS]);
+		conf.tos  = nla_get_u8(data[IFLA_VXLAN_TOS]);
 
 	if (data[IFLA_VXLAN_TTL])
-		vxlan->ttl = nla_get_u8(data[IFLA_VXLAN_TTL]);
+		conf.ttl = nla_get_u8(data[IFLA_VXLAN_TTL]);
 
 	if (!data[IFLA_VXLAN_LEARNING] || nla_get_u8(data[IFLA_VXLAN_LEARNING]))
-		vxlan->flags |= VXLAN_F_LEARN;
+		conf.flags |= VXLAN_F_LEARN;
 
 	if (data[IFLA_VXLAN_AGEING])
-		vxlan->age_interval = nla_get_u32(data[IFLA_VXLAN_AGEING]);
-	else
-		vxlan->age_interval = FDB_AGE_DEFAULT;
+		conf.age_interval = nla_get_u32(data[IFLA_VXLAN_AGEING]);
 
 	if (data[IFLA_VXLAN_PROXY] && nla_get_u8(data[IFLA_VXLAN_PROXY]))
-		vxlan->flags |= VXLAN_F_PROXY;
+		conf.flags |= VXLAN_F_PROXY;
 
 	if (data[IFLA_VXLAN_RSC] && nla_get_u8(data[IFLA_VXLAN_RSC]))
-		vxlan->flags |= VXLAN_F_RSC;
+		conf.flags |= VXLAN_F_RSC;
 
 	if (data[IFLA_VXLAN_L2MISS] && nla_get_u8(data[IFLA_VXLAN_L2MISS]))
-		vxlan->flags |= VXLAN_F_L2MISS;
+		conf.flags |= VXLAN_F_L2MISS;
 
 	if (data[IFLA_VXLAN_L3MISS] && nla_get_u8(data[IFLA_VXLAN_L3MISS]))
-		vxlan->flags |= VXLAN_F_L3MISS;
+		conf.flags |= VXLAN_F_L3MISS;
 
 	if (data[IFLA_VXLAN_LIMIT])
-		vxlan->addrmax = nla_get_u32(data[IFLA_VXLAN_LIMIT]);
+		conf.addrmax = nla_get_u32(data[IFLA_VXLAN_LIMIT]);
 
 	if (data[IFLA_VXLAN_FLOWBASED] &&
 	    nla_get_u8(data[IFLA_VXLAN_FLOWBASED]))
-		vxlan->flags |= VXLAN_F_FLOW_BASED;
+		conf.flags |= VXLAN_F_FLOW_BASED;
 
 	if (data[IFLA_VXLAN_PORT_RANGE]) {
 		const struct ifla_vxlan_port_range *p
 			= nla_data(data[IFLA_VXLAN_PORT_RANGE]);
-		vxlan->port_min = ntohs(p->low);
-		vxlan->port_max = ntohs(p->high);
+		conf.port_min = ntohs(p->low);
+		conf.port_max = ntohs(p->high);
 	}
 
 	if (data[IFLA_VXLAN_PORT])
-		vxlan->dst_port = nla_get_be16(data[IFLA_VXLAN_PORT]);
+		conf.dst_port = nla_get_be16(data[IFLA_VXLAN_PORT]);
 
 	if (data[IFLA_VXLAN_UDP_CSUM] && nla_get_u8(data[IFLA_VXLAN_UDP_CSUM]))
-		vxlan->flags |= VXLAN_F_UDP_CSUM;
+		conf.flags |= VXLAN_F_UDP_CSUM;
 
 	if (data[IFLA_VXLAN_UDP_ZERO_CSUM6_TX] &&
 	    nla_get_u8(data[IFLA_VXLAN_UDP_ZERO_CSUM6_TX]))
-		vxlan->flags |= VXLAN_F_UDP_ZERO_CSUM6_TX;
+		conf.flags |= VXLAN_F_UDP_ZERO_CSUM6_TX;
 
 	if (data[IFLA_VXLAN_UDP_ZERO_CSUM6_RX] &&
 	    nla_get_u8(data[IFLA_VXLAN_UDP_ZERO_CSUM6_RX]))
-		vxlan->flags |= VXLAN_F_UDP_ZERO_CSUM6_RX;
+		conf.flags |= VXLAN_F_UDP_ZERO_CSUM6_RX;
 
 	if (data[IFLA_VXLAN_REMCSUM_TX] &&
 	    nla_get_u8(data[IFLA_VXLAN_REMCSUM_TX]))
-		vxlan->flags |= VXLAN_F_REMCSUM_TX;
+		conf.flags |= VXLAN_F_REMCSUM_TX;
 
 	if (data[IFLA_VXLAN_REMCSUM_RX] &&
 	    nla_get_u8(data[IFLA_VXLAN_REMCSUM_RX]))
-		vxlan->flags |= VXLAN_F_REMCSUM_RX;
+		conf.flags |= VXLAN_F_REMCSUM_RX;
 
 	if (data[IFLA_VXLAN_GBP])
-		vxlan->flags |= VXLAN_F_GBP;
+		conf.flags |= VXLAN_F_GBP;
 
 	if (data[IFLA_VXLAN_REMCSUM_NOPARTIAL])
-		vxlan->flags |= VXLAN_F_REMCSUM_NOPARTIAL;
+		conf.flags |= VXLAN_F_REMCSUM_NOPARTIAL;
 
-	if (vxlan_find_vni(src_net, vni, use_ipv6 ? AF_INET6 : AF_INET,
-			   vxlan->dst_port, vxlan->flags)) {
-		pr_info("duplicate VNI %u\n", vni);
-		return -EEXIST;
-	}
-
-	dev->ethtool_ops = &vxlan_ethtool_ops;
+	err = vxlan_dev_configure(src_net, dev, &conf);
+	switch (err) {
+	case -ENODEV:
+		pr_info("ifindex %d does not exist\n", conf.remote_ifindex);
+		break;
 
-	/* create an fdb entry for a valid default destination */
-	if (!vxlan_addr_any(&vxlan->default_dst.remote_ip)) {
-		err = vxlan_fdb_create(vxlan, all_zeros_mac,
-				       &vxlan->default_dst.remote_ip,
-				       NUD_REACHABLE|NUD_PERMANENT,
-				       NLM_F_EXCL|NLM_F_CREATE,
-				       vxlan->dst_port,
-				       vxlan->default_dst.remote_vni,
-				       vxlan->default_dst.remote_ifindex,
-				       NTF_SELF);
-		if (err)
-			return err;
-	}
+	case -EPERM:
+		pr_info("IPv6 is disabled via sysctl\n");
+		break;
 
-	err = register_netdevice(dev);
-	if (err) {
-		vxlan_fdb_delete_default(vxlan);
-		return err;
+	case -EEXIST:
+		pr_info("duplicate VNI %u\n", conf.vni);
+		break;
 	}
 
-	list_add(&vxlan->next, &vn->vxlan_list);
-
-	return 0;
+	return err;
 }
 
 static void vxlan_dellink(struct net_device *dev, struct list_head *head)
@@ -2904,8 +2926,8 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
 	const struct vxlan_dev *vxlan = netdev_priv(dev);
 	const struct vxlan_rdst *dst = &vxlan->default_dst;
 	struct ifla_vxlan_port_range ports = {
-		.low =  htons(vxlan->port_min),
-		.high = htons(vxlan->port_max),
+		.low =  htons(vxlan->cfg.port_min),
+		.high = htons(vxlan->cfg.port_max),
 	};
 
 	if (nla_put_u32(skb, IFLA_VXLAN_ID, dst->remote_vni))
@@ -2928,22 +2950,22 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
 	if (dst->remote_ifindex && nla_put_u32(skb, IFLA_VXLAN_LINK, dst->remote_ifindex))
 		goto nla_put_failure;
 
-	if (!vxlan_addr_any(&vxlan->saddr)) {
-		if (vxlan->saddr.sa.sa_family == AF_INET) {
+	if (!vxlan_addr_any(&vxlan->cfg.saddr)) {
+		if (vxlan->cfg.saddr.sa.sa_family == AF_INET) {
 			if (nla_put_in_addr(skb, IFLA_VXLAN_LOCAL,
-					    vxlan->saddr.sin.sin_addr.s_addr))
+					    vxlan->cfg.saddr.sin.sin_addr.s_addr))
 				goto nla_put_failure;
 #if IS_ENABLED(CONFIG_IPV6)
 		} else {
 			if (nla_put_in6_addr(skb, IFLA_VXLAN_LOCAL6,
-					     &vxlan->saddr.sin6.sin6_addr))
+					     &vxlan->cfg.saddr.sin6.sin6_addr))
 				goto nla_put_failure;
 #endif
 		}
 	}
 
-	if (nla_put_u8(skb, IFLA_VXLAN_TTL, vxlan->ttl) ||
-	    nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->tos) ||
+	if (nla_put_u8(skb, IFLA_VXLAN_TTL, vxlan->cfg.ttl) ||
+	    nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->cfg.tos) ||
 	    nla_put_u8(skb, IFLA_VXLAN_LEARNING,
 			!!(vxlan->flags & VXLAN_F_LEARN)) ||
 	    nla_put_u8(skb, IFLA_VXLAN_PROXY,
@@ -2955,9 +2977,9 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
 			!!(vxlan->flags & VXLAN_F_L3MISS)) ||
 	    nla_put_u8(skb, IFLA_VXLAN_FLOWBASED,
 		       !!(vxlan->flags & VXLAN_F_FLOW_BASED)) ||
-	    nla_put_u32(skb, IFLA_VXLAN_AGEING, vxlan->age_interval) ||
-	    nla_put_u32(skb, IFLA_VXLAN_LIMIT, vxlan->addrmax) ||
-	    nla_put_be16(skb, IFLA_VXLAN_PORT, vxlan->dst_port) ||
+	    nla_put_u32(skb, IFLA_VXLAN_AGEING, vxlan->cfg.age_interval) ||
+	    nla_put_u32(skb, IFLA_VXLAN_LIMIT, vxlan->cfg.addrmax) ||
+	    nla_put_be16(skb, IFLA_VXLAN_PORT, vxlan->cfg.dst_port) ||
 	    nla_put_u8(skb, IFLA_VXLAN_UDP_CSUM,
 			!!(vxlan->flags & VXLAN_F_UDP_CSUM)) ||
 	    nla_put_u8(skb, IFLA_VXLAN_UDP_ZERO_CSUM6_TX,
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index 80a2da29e088..19535f85eb2c 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -95,6 +95,11 @@ struct vxlanhdr {
 #define VXLAN_VNI_MASK  (VXLAN_VID_MASK << 8)
 #define VXLAN_HLEN (sizeof(struct udphdr) + sizeof(struct vxlanhdr))
 
+#define VNI_HASH_BITS	10
+#define VNI_HASH_SIZE	(1<<VNI_HASH_BITS)
+#define FDB_HASH_BITS	8
+#define FDB_HASH_SIZE	(1<<FDB_HASH_BITS)
+
 struct vxlan_metadata {
 	__be32		vni;
 	u32		gbp;
@@ -121,6 +126,57 @@ struct vxlan_sock {
 	u32		  flags;
 };
 
+union vxlan_addr {
+	struct sockaddr_in sin;
+	struct sockaddr_in6 sin6;
+	struct sockaddr sa;
+};
+
+struct vxlan_rdst {
+	union vxlan_addr	 remote_ip;
+	__be16			 remote_port;
+	u32			 remote_vni;
+	u32			 remote_ifindex;
+	struct list_head	 list;
+	struct rcu_head		 rcu;
+};
+
+struct vxlan_config {
+	union vxlan_addr	remote_ip;
+	union vxlan_addr	saddr;
+	u32			vni;
+	int			remote_ifindex;
+	int			mtu;
+	__be16			dst_port;
+	__u16			port_min;
+	__u16			port_max;
+	__u8			tos;
+	__u8			ttl;
+	u32			flags;
+	unsigned long		age_interval;
+	unsigned int		addrmax;
+	bool			no_share;
+};
+
+/* Pseudo network device */
+struct vxlan_dev {
+	struct hlist_node hlist;	/* vni hash table */
+	struct list_head  next;		/* vxlan's per namespace list */
+	struct vxlan_sock *vn_sock;	/* listening socket */
+	struct net_device *dev;
+	struct net	  *net;		/* netns for packet i/o */
+	struct vxlan_rdst default_dst;	/* default destination */
+	u32		  flags;	/* VXLAN_F_* in vxlan.h */
+
+	struct timer_list age_timer;
+	spinlock_t	  hash_lock;
+	unsigned int	  addrcnt;
+
+	struct vxlan_config	cfg;
+
+	struct hlist_head fdb_head[FDB_HASH_SIZE];
+};
+
 #define VXLAN_F_LEARN			0x01
 #define VXLAN_F_PROXY			0x02
 #define VXLAN_F_RSC			0x04
@@ -151,6 +207,9 @@ struct vxlan_sock *vxlan_sock_add(struct net *net, __be16 port,
 				  vxlan_rcv_t *rcv, void *data,
 				  bool no_share, u32 flags);
 
+struct net_device *vxlan_dev_create(struct net *net, const char *name,
+				    u8 name_assign_type, struct vxlan_config *conf);
+
 void vxlan_sock_release(struct vxlan_sock *vs);
 
 int vxlan_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb,
-- 
cgit v1.2.3


From 614732eaa12dd462c0ab274700bed14f36afea5e Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Tue, 21 Jul 2015 10:44:06 +0200
Subject: openvswitch: Use regular VXLAN net_device device

This gets rid of all OVS specific VXLAN code in the receive and
transmit path by using a VXLAN net_device to represent the vport.
Only a small shim layer remains which takes care of handling the
VXLAN specific OVS Netlink configuration.

Unexports vxlan_sock_add(), vxlan_sock_release(), vxlan_xmit_skb()
since they are no longer needed.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vxlan.c            | 242 +++++++++++++++----------------
 include/net/rtnetlink.h        |   1 +
 include/net/vxlan.h            |  24 +--
 net/core/rtnetlink.c           |  26 ++--
 net/openvswitch/Kconfig        |  12 --
 net/openvswitch/Makefile       |   1 -
 net/openvswitch/flow_netlink.c |   6 +-
 net/openvswitch/vport-netdev.c | 201 ++++++++++++++++++++++++-
 net/openvswitch/vport-vxlan.c  | 322 -----------------------------------------
 net/openvswitch/vport-vxlan.h  |  11 --
 10 files changed, 339 insertions(+), 507 deletions(-)
 delete mode 100644 net/openvswitch/vport-vxlan.c
 delete mode 100644 net/openvswitch/vport-vxlan.h

(limited to 'include/net')

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 30e1f215af73..e9feefb41f0b 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -75,6 +75,9 @@ static struct rtnl_link_ops vxlan_link_ops;
 
 static const u8 all_zeros_mac[ETH_ALEN];
 
+static struct vxlan_sock *vxlan_sock_add(struct net *net, __be16 port,
+					 bool no_share, u32 flags);
+
 /* per-network namespace private data for this module */
 struct vxlan_net {
 	struct list_head  vxlan_list;
@@ -1027,7 +1030,7 @@ static bool vxlan_group_used(struct vxlan_net *vn, struct vxlan_dev *dev)
 	return false;
 }
 
-void vxlan_sock_release(struct vxlan_sock *vs)
+static void vxlan_sock_release(struct vxlan_sock *vs)
 {
 	struct sock *sk = vs->sock->sk;
 	struct net *net = sock_net(sk);
@@ -1043,7 +1046,6 @@ void vxlan_sock_release(struct vxlan_sock *vs)
 
 	queue_work(vxlan_wq, &vs->del_work);
 }
-EXPORT_SYMBOL_GPL(vxlan_sock_release);
 
 /* Update multicast group membership when first VNI on
  * multicast address is brought up
@@ -1126,6 +1128,102 @@ static struct vxlanhdr *vxlan_remcsum(struct sk_buff *skb, struct vxlanhdr *vh,
 	return vh;
 }
 
+static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb,
+		      struct vxlan_metadata *md, u32 vni,
+		      struct metadata_dst *tun_dst)
+{
+	struct iphdr *oip = NULL;
+	struct ipv6hdr *oip6 = NULL;
+	struct vxlan_dev *vxlan;
+	struct pcpu_sw_netstats *stats;
+	union vxlan_addr saddr;
+	int err = 0;
+	union vxlan_addr *remote_ip;
+
+	/* For flow based devices, map all packets to VNI 0 */
+	if (vs->flags & VXLAN_F_FLOW_BASED)
+		vni = 0;
+
+	/* Is this VNI defined? */
+	vxlan = vxlan_vs_find_vni(vs, vni);
+	if (!vxlan)
+		goto drop;
+
+	remote_ip = &vxlan->default_dst.remote_ip;
+	skb_reset_mac_header(skb);
+	skb_scrub_packet(skb, !net_eq(vxlan->net, dev_net(vxlan->dev)));
+	skb->protocol = eth_type_trans(skb, vxlan->dev);
+	skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
+
+	/* Ignore packet loops (and multicast echo) */
+	if (ether_addr_equal(eth_hdr(skb)->h_source, vxlan->dev->dev_addr))
+		goto drop;
+
+	/* Re-examine inner Ethernet packet */
+	if (remote_ip->sa.sa_family == AF_INET) {
+		oip = ip_hdr(skb);
+		saddr.sin.sin_addr.s_addr = oip->saddr;
+		saddr.sa.sa_family = AF_INET;
+#if IS_ENABLED(CONFIG_IPV6)
+	} else {
+		oip6 = ipv6_hdr(skb);
+		saddr.sin6.sin6_addr = oip6->saddr;
+		saddr.sa.sa_family = AF_INET6;
+#endif
+	}
+
+	if (tun_dst) {
+		skb_dst_set(skb, (struct dst_entry *)tun_dst);
+		tun_dst = NULL;
+	}
+
+	if ((vxlan->flags & VXLAN_F_LEARN) &&
+	    vxlan_snoop(skb->dev, &saddr, eth_hdr(skb)->h_source))
+		goto drop;
+
+	skb_reset_network_header(skb);
+	/* In flow-based mode, GBP is carried in dst_metadata */
+	if (!(vs->flags & VXLAN_F_FLOW_BASED))
+		skb->mark = md->gbp;
+
+	if (oip6)
+		err = IP6_ECN_decapsulate(oip6, skb);
+	if (oip)
+		err = IP_ECN_decapsulate(oip, skb);
+
+	if (unlikely(err)) {
+		if (log_ecn_error) {
+			if (oip6)
+				net_info_ratelimited("non-ECT from %pI6\n",
+						     &oip6->saddr);
+			if (oip)
+				net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
+						     &oip->saddr, oip->tos);
+		}
+		if (err > 1) {
+			++vxlan->dev->stats.rx_frame_errors;
+			++vxlan->dev->stats.rx_errors;
+			goto drop;
+		}
+	}
+
+	stats = this_cpu_ptr(vxlan->dev->tstats);
+	u64_stats_update_begin(&stats->syncp);
+	stats->rx_packets++;
+	stats->rx_bytes += skb->len;
+	u64_stats_update_end(&stats->syncp);
+
+	netif_rx(skb);
+
+	return;
+drop:
+	if (tun_dst)
+		dst_release((struct dst_entry *)tun_dst);
+
+	/* Consume bad packet */
+	kfree_skb(skb);
+}
+
 /* Callback from net/ipv4/udp.c to receive packets */
 static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
 {
@@ -1192,7 +1290,6 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
 			info->key.tun_flags |= TUNNEL_CSUM;
 
 		md = ip_tunnel_info_opts(info, sizeof(*md));
-		md->tun_dst = tun_dst;
 	} else {
 		memset(md, 0, sizeof(*md));
 	}
@@ -1231,8 +1328,7 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
 		goto bad_flags;
 	}
 
-	md->vni = vxh->vx_vni;
-	vs->rcv(vs, skb, md);
+	vxlan_rcv(vs, skb, md, vni >> 8, tun_dst);
 	return 0;
 
 drop:
@@ -1252,104 +1348,6 @@ error:
 	return 1;
 }
 
-static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb,
-		      struct vxlan_metadata *md)
-{
-	struct iphdr *oip = NULL;
-	struct ipv6hdr *oip6 = NULL;
-	struct vxlan_dev *vxlan;
-	struct pcpu_sw_netstats *stats;
-	union vxlan_addr saddr;
-	__u32 vni;
-	int err = 0;
-	union vxlan_addr *remote_ip;
-
-	/* For flow based devices, map all packets to VNI 0 */
-	if (vs->flags & VXLAN_F_FLOW_BASED)
-		vni = 0;
-	else
-		vni = ntohl(md->vni) >> 8;
-
-	/* Is this VNI defined? */
-	vxlan = vxlan_vs_find_vni(vs, vni);
-	if (!vxlan)
-		goto drop;
-
-	remote_ip = &vxlan->default_dst.remote_ip;
-	skb_reset_mac_header(skb);
-	skb_scrub_packet(skb, !net_eq(vxlan->net, dev_net(vxlan->dev)));
-	skb->protocol = eth_type_trans(skb, vxlan->dev);
-	skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
-
-	/* Ignore packet loops (and multicast echo) */
-	if (ether_addr_equal(eth_hdr(skb)->h_source, vxlan->dev->dev_addr))
-		goto drop;
-
-	/* Re-examine inner Ethernet packet */
-	if (remote_ip->sa.sa_family == AF_INET) {
-		oip = ip_hdr(skb);
-		saddr.sin.sin_addr.s_addr = oip->saddr;
-		saddr.sa.sa_family = AF_INET;
-#if IS_ENABLED(CONFIG_IPV6)
-	} else {
-		oip6 = ipv6_hdr(skb);
-		saddr.sin6.sin6_addr = oip6->saddr;
-		saddr.sa.sa_family = AF_INET6;
-#endif
-	}
-
-	if (md->tun_dst) {
-		skb_dst_set(skb, (struct dst_entry *)md->tun_dst);
-		md->tun_dst = NULL;
-	}
-
-	if ((vxlan->flags & VXLAN_F_LEARN) &&
-	    vxlan_snoop(skb->dev, &saddr, eth_hdr(skb)->h_source))
-		goto drop;
-
-	skb_reset_network_header(skb);
-	/* In flow-based mode, GBP is carried in dst_metadata */
-	if (!(vs->flags & VXLAN_F_FLOW_BASED))
-		skb->mark = md->gbp;
-
-	if (oip6)
-		err = IP6_ECN_decapsulate(oip6, skb);
-	if (oip)
-		err = IP_ECN_decapsulate(oip, skb);
-
-	if (unlikely(err)) {
-		if (log_ecn_error) {
-			if (oip6)
-				net_info_ratelimited("non-ECT from %pI6\n",
-						     &oip6->saddr);
-			if (oip)
-				net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
-						     &oip->saddr, oip->tos);
-		}
-		if (err > 1) {
-			++vxlan->dev->stats.rx_frame_errors;
-			++vxlan->dev->stats.rx_errors;
-			goto drop;
-		}
-	}
-
-	stats = this_cpu_ptr(vxlan->dev->tstats);
-	u64_stats_update_begin(&stats->syncp);
-	stats->rx_packets++;
-	stats->rx_bytes += skb->len;
-	u64_stats_update_end(&stats->syncp);
-
-	netif_rx(skb);
-
-	return;
-drop:
-	if (md->tun_dst)
-		dst_release((struct dst_entry *)md->tun_dst);
-
-	/* Consume bad packet */
-	kfree_skb(skb);
-}
-
 static int arp_reduce(struct net_device *dev, struct sk_buff *skb)
 {
 	struct vxlan_dev *vxlan = netdev_priv(dev);
@@ -1688,7 +1686,7 @@ static int vxlan6_xmit_skb(struct dst_entry *dst, struct sock *sk,
 			   struct sk_buff *skb,
 			   struct net_device *dev, struct in6_addr *saddr,
 			   struct in6_addr *daddr, __u8 prio, __u8 ttl,
-			   __be16 src_port, __be16 dst_port,
+			   __be16 src_port, __be16 dst_port, __u32 vni,
 			   struct vxlan_metadata *md, bool xnet, u32 vxflags)
 {
 	struct vxlanhdr *vxh;
@@ -1738,7 +1736,7 @@ static int vxlan6_xmit_skb(struct dst_entry *dst, struct sock *sk,
 
 	vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh));
 	vxh->vx_flags = htonl(VXLAN_HF_VNI);
-	vxh->vx_vni = md->vni;
+	vxh->vx_vni = vni;
 
 	if (type & SKB_GSO_TUNNEL_REMCSUM) {
 		u32 data = (skb_checksum_start_offset(skb) - hdrlen) >>
@@ -1771,10 +1769,10 @@ err:
 }
 #endif
 
-int vxlan_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb,
-		   __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df,
-		   __be16 src_port, __be16 dst_port,
-		   struct vxlan_metadata *md, bool xnet, u32 vxflags)
+static int vxlan_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb,
+			  __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df,
+			  __be16 src_port, __be16 dst_port, __u32 vni,
+			  struct vxlan_metadata *md, bool xnet, u32 vxflags)
 {
 	struct vxlanhdr *vxh;
 	int min_headroom;
@@ -1817,7 +1815,7 @@ int vxlan_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb,
 
 	vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh));
 	vxh->vx_flags = htonl(VXLAN_HF_VNI);
-	vxh->vx_vni = md->vni;
+	vxh->vx_vni = vni;
 
 	if (type & SKB_GSO_TUNNEL_REMCSUM) {
 		u32 data = (skb_checksum_start_offset(skb) - hdrlen) >>
@@ -1844,7 +1842,6 @@ int vxlan_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb,
 				   ttl, df, src_port, dst_port, xnet,
 				   !(vxflags & VXLAN_F_UDP_CSUM));
 }
-EXPORT_SYMBOL_GPL(vxlan_xmit_skb);
 
 /* Bypass encapsulation if the destination is local */
 static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan,
@@ -2012,10 +2009,9 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 
 		tos = ip_tunnel_ecn_encap(tos, old_iph, skb);
 		ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);
-		md->vni = htonl(vni << 8);
 		err = vxlan_xmit_skb(rt, sk, skb, fl4.saddr,
 				     dst->sin.sin_addr.s_addr, tos, ttl, df,
-				     src_port, dst_port, md,
+				     src_port, dst_port, htonl(vni << 8), md,
 				     !net_eq(vxlan->net, dev_net(vxlan->dev)),
 				     flags);
 		if (err < 0) {
@@ -2070,11 +2066,10 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 		}
 
 		ttl = ttl ? : ip6_dst_hoplimit(ndst);
-		md->vni = htonl(vni << 8);
 		md->gbp = skb->mark;
 
 		err = vxlan6_xmit_skb(ndst, sk, skb, dev, &fl6.saddr, &fl6.daddr,
-				      0, ttl, src_port, dst_port, md,
+				      0, ttl, src_port, dst_port, htonl(vni << 8), md,
 				      !net_eq(vxlan->net, dev_net(vxlan->dev)),
 				      vxlan->flags);
 #endif
@@ -2269,8 +2264,8 @@ static int vxlan_open(struct net_device *dev)
 	struct vxlan_sock *vs;
 	int ret = 0;
 
-	vs = vxlan_sock_add(vxlan->net, vxlan->cfg.dst_port, vxlan_rcv,
-			    NULL, vxlan->cfg.no_share, vxlan->flags);
+	vs = vxlan_sock_add(vxlan->net, vxlan->cfg.dst_port,
+			    vxlan->cfg.no_share, vxlan->flags);
 	if (IS_ERR(vs))
 		return PTR_ERR(vs);
 
@@ -2563,7 +2558,6 @@ static struct socket *vxlan_create_sock(struct net *net, bool ipv6,
 
 /* Create new listen socket if needed */
 static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port,
-					      vxlan_rcv_t *rcv, void *data,
 					      u32 flags)
 {
 	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
@@ -2592,8 +2586,6 @@ static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port,
 
 	vs->sock = sock;
 	atomic_set(&vs->refcnt, 1);
-	vs->rcv = rcv;
-	vs->data = data;
 	vs->flags = (flags & VXLAN_F_RCV_FLAGS);
 
 	/* Initialize the vxlan udp offloads structure */
@@ -2617,9 +2609,8 @@ static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port,
 	return vs;
 }
 
-struct vxlan_sock *vxlan_sock_add(struct net *net, __be16 port,
-				  vxlan_rcv_t *rcv, void *data,
-				  bool no_share, u32 flags)
+static struct vxlan_sock *vxlan_sock_add(struct net *net, __be16 port,
+					 bool no_share, u32 flags)
 {
 	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
 	struct vxlan_sock *vs;
@@ -2629,7 +2620,7 @@ struct vxlan_sock *vxlan_sock_add(struct net *net, __be16 port,
 		spin_lock(&vn->sock_lock);
 		vs = vxlan_find_sock(net, ipv6 ? AF_INET6 : AF_INET, port,
 				     flags);
-		if (vs && vs->rcv == rcv) {
+		if (vs) {
 			if (!atomic_add_unless(&vs->refcnt, 1, 0))
 				vs = ERR_PTR(-EBUSY);
 			spin_unlock(&vn->sock_lock);
@@ -2638,9 +2629,8 @@ struct vxlan_sock *vxlan_sock_add(struct net *net, __be16 port,
 		spin_unlock(&vn->sock_lock);
 	}
 
-	return vxlan_socket_create(net, port, rcv, data, flags);
+	return vxlan_socket_create(net, port, flags);
 }
-EXPORT_SYMBOL_GPL(vxlan_sock_add);
 
 static int vxlan_dev_configure(struct net *src_net, struct net_device *dev,
 			       struct vxlan_config *conf)
diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h
index 343d922d15c2..18fdb98185ab 100644
--- a/include/net/rtnetlink.h
+++ b/include/net/rtnetlink.h
@@ -141,6 +141,7 @@ struct net_device *rtnl_create_link(struct net *net, const char *ifname,
 				    unsigned char name_assign_type,
 				    const struct rtnl_link_ops *ops,
 				    struct nlattr *tb[]);
+int rtnl_delete_link(struct net_device *dev);
 int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm);
 
 int rtnl_nla_parse_ifla(struct nlattr **tb, const struct nlattr *head, int len);
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index 19535f85eb2c..eb8d721cdb67 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -101,22 +101,12 @@ struct vxlanhdr {
 #define FDB_HASH_SIZE	(1<<FDB_HASH_BITS)
 
 struct vxlan_metadata {
-	__be32		vni;
 	u32		gbp;
-
-	/* Temporary until vxlan_rcv() API is gone */
-	struct metadata_dst *tun_dst;
 };
 
-struct vxlan_sock;
-typedef void (vxlan_rcv_t)(struct vxlan_sock *vh, struct sk_buff *skb,
-			   struct vxlan_metadata *md);
-
 /* per UDP socket information */
 struct vxlan_sock {
 	struct hlist_node hlist;
-	vxlan_rcv_t	 *rcv;
-	void		 *data;
 	struct work_struct del_work;
 	struct socket	 *sock;
 	struct rcu_head	  rcu;
@@ -203,19 +193,13 @@ struct vxlan_dev {
 					 VXLAN_F_COLLECT_METADATA |	\
 					 VXLAN_F_FLOW_BASED)
 
-struct vxlan_sock *vxlan_sock_add(struct net *net, __be16 port,
-				  vxlan_rcv_t *rcv, void *data,
-				  bool no_share, u32 flags);
-
 struct net_device *vxlan_dev_create(struct net *net, const char *name,
 				    u8 name_assign_type, struct vxlan_config *conf);
 
-void vxlan_sock_release(struct vxlan_sock *vs);
-
-int vxlan_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb,
-		   __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df,
-		   __be16 src_port, __be16 dst_port, struct vxlan_metadata *md,
-		   bool xnet, u32 vxflags);
+static inline __be16 vxlan_dev_dst_port(struct vxlan_dev *vxlan)
+{
+	return inet_sk(vxlan->vn_sock->sock->sk)->inet_sport;
+}
 
 static inline netdev_features_t vxlan_features_check(struct sk_buff *skb,
 						     netdev_features_t features)
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 03d61b54aac0..5fb4af20c6dd 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1960,16 +1960,30 @@ static int rtnl_group_dellink(const struct net *net, int group)
 	return 0;
 }
 
+int rtnl_delete_link(struct net_device *dev)
+{
+	const struct rtnl_link_ops *ops;
+	LIST_HEAD(list_kill);
+
+	ops = dev->rtnl_link_ops;
+	if (!ops || !ops->dellink)
+		return -EOPNOTSUPP;
+
+	ops->dellink(dev, &list_kill);
+	unregister_netdevice_many(&list_kill);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(rtnl_delete_link);
+
 static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh)
 {
 	struct net *net = sock_net(skb->sk);
-	const struct rtnl_link_ops *ops;
 	struct net_device *dev;
 	struct ifinfomsg *ifm;
 	char ifname[IFNAMSIZ];
 	struct nlattr *tb[IFLA_MAX+1];
 	int err;
-	LIST_HEAD(list_kill);
 
 	err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy);
 	if (err < 0)
@@ -1991,13 +2005,7 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh)
 	if (!dev)
 		return -ENODEV;
 
-	ops = dev->rtnl_link_ops;
-	if (!ops || !ops->dellink)
-		return -EOPNOTSUPP;
-
-	ops->dellink(dev, &list_kill);
-	unregister_netdevice_many(&list_kill);
-	return 0;
+	return rtnl_delete_link(dev);
 }
 
 int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm)
diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig
index 15840401a2ce..1119f46b80b4 100644
--- a/net/openvswitch/Kconfig
+++ b/net/openvswitch/Kconfig
@@ -44,18 +44,6 @@ config OPENVSWITCH_GRE
 
 	  If unsure, say Y.
 
-config OPENVSWITCH_VXLAN
-	tristate "Open vSwitch VXLAN tunneling support"
-	depends on OPENVSWITCH
-	depends on VXLAN
-	default OPENVSWITCH
-	---help---
-	  If you say Y here, then the Open vSwitch will be able create vxlan vport.
-
-	  Say N to exclude this support and reduce the binary size.
-
-	  If unsure, say Y.
-
 config OPENVSWITCH_GENEVE
 	tristate "Open vSwitch Geneve tunneling support"
 	depends on OPENVSWITCH
diff --git a/net/openvswitch/Makefile b/net/openvswitch/Makefile
index 91b9478413ef..38e0e149c55e 100644
--- a/net/openvswitch/Makefile
+++ b/net/openvswitch/Makefile
@@ -16,5 +16,4 @@ openvswitch-y := \
 	vport-netdev.o
 
 obj-$(CONFIG_OPENVSWITCH_GENEVE)+= vport-geneve.o
-obj-$(CONFIG_OPENVSWITCH_VXLAN)	+= vport-vxlan.o
 obj-$(CONFIG_OPENVSWITCH_GRE)	+= vport-gre.o
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index e7906dfb8814..a6eb77ab1a64 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -47,9 +47,9 @@
 #include <net/ipv6.h>
 #include <net/ndisc.h>
 #include <net/mpls.h>
+#include <net/vxlan.h>
 
 #include "flow_netlink.h"
-#include "vport-vxlan.h"
 
 struct ovs_len_tbl {
 	int len;
@@ -475,7 +475,7 @@ static int vxlan_tun_opt_from_nlattr(const struct nlattr *a,
 {
 	struct nlattr *tb[OVS_VXLAN_EXT_MAX+1];
 	unsigned long opt_key_offset;
-	struct ovs_vxlan_opts opts;
+	struct vxlan_metadata opts;
 	int err;
 
 	BUILD_BUG_ON(sizeof(opts) > sizeof(match->key->tun_opts));
@@ -626,7 +626,7 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr,
 static int vxlan_opt_to_nlattr(struct sk_buff *skb,
 			       const void *tun_opts, int swkey_tun_opts_len)
 {
-	const struct ovs_vxlan_opts *opts = tun_opts;
+	const struct vxlan_metadata *opts = tun_opts;
 	struct nlattr *nla;
 
 	nla = nla_nest_start(skb, OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS);
diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c
index e682bdc34a5c..68d0582fc001 100644
--- a/net/openvswitch/vport-netdev.c
+++ b/net/openvswitch/vport-netdev.c
@@ -27,9 +27,13 @@
 #include <linux/skbuff.h>
 #include <linux/openvswitch.h>
 
-#include <net/llc.h>
+#include <net/udp.h>
+#include <net/ip_tunnels.h>
+#include <net/rtnetlink.h>
+#include <net/vxlan.h>
 
 #include "datapath.h"
+#include "vport.h"
 #include "vport-internal_dev.h"
 #include "vport-netdev.h"
 
@@ -147,7 +151,8 @@ static void free_port_rcu(struct rcu_head *rcu)
 {
 	struct vport *vport = container_of(rcu, struct vport, rcu);
 
-	dev_put(vport->dev);
+	if (vport->dev)
+		dev_put(vport->dev);
 	ovs_vport_free(vport);
 }
 
@@ -221,12 +226,202 @@ static struct vport_ops ovs_netdev_vport_ops = {
 	.send		= netdev_send,
 };
 
+/* Compat code for old userspace. */
+#if IS_ENABLED(CONFIG_VXLAN)
+static struct vport_ops ovs_vxlan_netdev_vport_ops;
+
+static int vxlan_get_options(const struct vport *vport, struct sk_buff *skb)
+{
+	struct vxlan_dev *vxlan = netdev_priv(vport->dev);
+	__be16 dst_port = vxlan->cfg.dst_port;
+
+	if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, ntohs(dst_port)))
+		return -EMSGSIZE;
+
+	if (vxlan->flags & VXLAN_F_GBP) {
+		struct nlattr *exts;
+
+		exts = nla_nest_start(skb, OVS_TUNNEL_ATTR_EXTENSION);
+		if (!exts)
+			return -EMSGSIZE;
+
+		if (vxlan->flags & VXLAN_F_GBP &&
+		    nla_put_flag(skb, OVS_VXLAN_EXT_GBP))
+			return -EMSGSIZE;
+
+		nla_nest_end(skb, exts);
+	}
+
+	return 0;
+}
+
+static const struct nla_policy exts_policy[OVS_VXLAN_EXT_MAX + 1] = {
+	[OVS_VXLAN_EXT_GBP]	= { .type = NLA_FLAG, },
+};
+
+static int vxlan_configure_exts(struct vport *vport, struct nlattr *attr,
+				struct vxlan_config *conf)
+{
+	struct nlattr *exts[OVS_VXLAN_EXT_MAX + 1];
+	int err;
+
+	if (nla_len(attr) < sizeof(struct nlattr))
+		return -EINVAL;
+
+	err = nla_parse_nested(exts, OVS_VXLAN_EXT_MAX, attr, exts_policy);
+	if (err < 0)
+		return err;
+
+	if (exts[OVS_VXLAN_EXT_GBP])
+		conf->flags |= VXLAN_F_GBP;
+
+	return 0;
+}
+
+static struct vport *vxlan_tnl_create(const struct vport_parms *parms)
+{
+	struct net *net = ovs_dp_get_net(parms->dp);
+	struct nlattr *options = parms->options;
+	struct net_device *dev;
+	struct vport *vport;
+	struct nlattr *a;
+	int err;
+	struct vxlan_config conf = {
+		.no_share = true,
+		.flags = VXLAN_F_FLOW_BASED | VXLAN_F_COLLECT_METADATA,
+	};
+
+	if (!options) {
+		err = -EINVAL;
+		goto error;
+	}
+
+	a = nla_find_nested(options, OVS_TUNNEL_ATTR_DST_PORT);
+	if (a && nla_len(a) == sizeof(u16)) {
+		conf.dst_port = htons(nla_get_u16(a));
+	} else {
+		/* Require destination port from userspace. */
+		err = -EINVAL;
+		goto error;
+	}
+
+	vport = ovs_vport_alloc(0, &ovs_vxlan_netdev_vport_ops, parms);
+	if (IS_ERR(vport))
+		return vport;
+
+	a = nla_find_nested(options, OVS_TUNNEL_ATTR_EXTENSION);
+	if (a) {
+		err = vxlan_configure_exts(vport, a, &conf);
+		if (err) {
+			ovs_vport_free(vport);
+			goto error;
+		}
+	}
+
+	rtnl_lock();
+	dev = vxlan_dev_create(net, parms->name, NET_NAME_USER, &conf);
+	if (IS_ERR(dev)) {
+		rtnl_unlock();
+		ovs_vport_free(vport);
+		return ERR_CAST(dev);
+	}
+
+	dev_change_flags(dev, dev->flags | IFF_UP);
+	rtnl_unlock();
+	return vport;
+error:
+	return ERR_PTR(err);
+}
+
+static struct vport *vxlan_create(const struct vport_parms *parms)
+{
+	struct vport *vport;
+
+	vport = vxlan_tnl_create(parms);
+	if (IS_ERR(vport))
+		return vport;
+
+	return netdev_link(vport, parms->name);
+}
+
+static void vxlan_destroy(struct vport *vport)
+{
+	rtnl_lock();
+	if (vport->dev->priv_flags & IFF_OVS_DATAPATH)
+		ovs_netdev_detach_dev(vport);
+
+	/* Early release so we can unregister the device */
+	dev_put(vport->dev);
+	rtnl_delete_link(vport->dev);
+	vport->dev = NULL;
+	rtnl_unlock();
+
+	call_rcu(&vport->rcu, free_port_rcu);
+}
+
+static int vxlan_get_egress_tun_info(struct vport *vport, struct sk_buff *skb,
+				     struct ip_tunnel_info *egress_tun_info)
+{
+	struct vxlan_dev *vxlan = netdev_priv(vport->dev);
+	struct net *net = ovs_dp_get_net(vport->dp);
+	__be16 dst_port = vxlan_dev_dst_port(vxlan);
+	__be16 src_port;
+	int port_min;
+	int port_max;
+
+	inet_get_local_port_range(net, &port_min, &port_max);
+	src_port = udp_flow_src_port(net, skb, 0, 0, true);
+
+	return ovs_tunnel_get_egress_info(egress_tun_info, net,
+					  OVS_CB(skb)->egress_tun_info,
+					  IPPROTO_UDP, skb->mark,
+					  src_port, dst_port);
+}
+
+static struct vport_ops ovs_vxlan_netdev_vport_ops = {
+	.type		= OVS_VPORT_TYPE_VXLAN,
+	.create		= vxlan_create,
+	.destroy	= vxlan_destroy,
+	.get_options	= vxlan_get_options,
+	.send		= netdev_send,
+	.get_egress_tun_info	= vxlan_get_egress_tun_info,
+};
+
+static int vxlan_compat_init(void)
+{
+	return ovs_vport_ops_register(&ovs_vxlan_netdev_vport_ops);
+}
+
+static void vxlan_compat_exit(void)
+{
+	ovs_vport_ops_unregister(&ovs_vxlan_netdev_vport_ops);
+}
+#else
+static int vxlan_compat_init(void)
+{
+	return 0;
+}
+
+static void vxlan_compat_exit(void)
+{
+}
+#endif
+
 int __init ovs_netdev_init(void)
 {
-	return ovs_vport_ops_register(&ovs_netdev_vport_ops);
+	int err;
+
+	err = ovs_vport_ops_register(&ovs_netdev_vport_ops);
+	if (err)
+		return err;
+	err = vxlan_compat_init();
+	if (err)
+		vxlan_compat_exit();
+	return err;
 }
 
 void ovs_netdev_exit(void)
 {
 	ovs_vport_ops_unregister(&ovs_netdev_vport_ops);
+	vxlan_compat_exit();
 }
diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c
deleted file mode 100644
index 6f7986fabb70..000000000000
--- a/net/openvswitch/vport-vxlan.c
+++ /dev/null
@@ -1,322 +0,0 @@
-/*
- * Copyright (c) 2014 Nicira, Inc.
- * Copyright (c) 2013 Cisco Systems, Inc.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- * 02110-1301, USA
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/in.h>
-#include <linux/ip.h>
-#include <linux/net.h>
-#include <linux/rculist.h>
-#include <linux/udp.h>
-#include <linux/module.h>
-
-#include <net/icmp.h>
-#include <net/ip.h>
-#include <net/udp.h>
-#include <net/ip_tunnels.h>
-#include <net/rtnetlink.h>
-#include <net/route.h>
-#include <net/dsfield.h>
-#include <net/inet_ecn.h>
-#include <net/net_namespace.h>
-#include <net/netns/generic.h>
-#include <net/vxlan.h>
-
-#include "datapath.h"
-#include "vport.h"
-#include "vport-vxlan.h"
-
-/**
- * struct vxlan_port - Keeps track of open UDP ports
- * @vs: vxlan_sock created for the port.
- * @name: vport name.
- */
-struct vxlan_port {
-	struct vxlan_sock *vs;
-	char name[IFNAMSIZ];
-	u32 exts; /* VXLAN_F_* in <net/vxlan.h> */
-};
-
-static struct vport_ops ovs_vxlan_vport_ops;
-
-static inline struct vxlan_port *vxlan_vport(const struct vport *vport)
-{
-	return vport_priv(vport);
-}
-
-/* Called with rcu_read_lock and BH disabled. */
-static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb,
-		      struct vxlan_metadata *md)
-{
-	struct ip_tunnel_info tun_info;
-	struct vxlan_port *vxlan_port;
-	struct vport *vport = vs->data;
-	struct iphdr *iph;
-	struct ovs_vxlan_opts opts = {
-		.gbp = md->gbp,
-	};
-	__be64 key;
-	__be16 flags;
-
-	flags = TUNNEL_KEY | (udp_hdr(skb)->check != 0 ? TUNNEL_CSUM : 0);
-	vxlan_port = vxlan_vport(vport);
-	if (vxlan_port->exts & VXLAN_F_GBP && md->gbp)
-		flags |= TUNNEL_VXLAN_OPT;
-
-	/* Save outer tunnel values */
-	iph = ip_hdr(skb);
-	key = cpu_to_be64(ntohl(md->vni) >> 8);
-	ip_tunnel_info_init(&tun_info, iph,
-			    udp_hdr(skb)->source, udp_hdr(skb)->dest,
-			    key, flags, &opts, sizeof(opts));
-
-	ovs_vport_receive(vport, skb, &tun_info);
-}
-
-static int vxlan_get_options(const struct vport *vport, struct sk_buff *skb)
-{
-	struct vxlan_port *vxlan_port = vxlan_vport(vport);
-	__be16 dst_port = inet_sk(vxlan_port->vs->sock->sk)->inet_sport;
-
-	if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, ntohs(dst_port)))
-		return -EMSGSIZE;
-
-	if (vxlan_port->exts) {
-		struct nlattr *exts;
-
-		exts = nla_nest_start(skb, OVS_TUNNEL_ATTR_EXTENSION);
-		if (!exts)
-			return -EMSGSIZE;
-
-		if (vxlan_port->exts & VXLAN_F_GBP &&
-		    nla_put_flag(skb, OVS_VXLAN_EXT_GBP))
-			return -EMSGSIZE;
-
-		nla_nest_end(skb, exts);
-	}
-
-	return 0;
-}
-
-static void vxlan_tnl_destroy(struct vport *vport)
-{
-	struct vxlan_port *vxlan_port = vxlan_vport(vport);
-
-	vxlan_sock_release(vxlan_port->vs);
-
-	ovs_vport_deferred_free(vport);
-}
-
-static const struct nla_policy exts_policy[OVS_VXLAN_EXT_MAX+1] = {
-	[OVS_VXLAN_EXT_GBP]	= { .type = NLA_FLAG, },
-};
-
-static int vxlan_configure_exts(struct vport *vport, struct nlattr *attr)
-{
-	struct nlattr *exts[OVS_VXLAN_EXT_MAX+1];
-	struct vxlan_port *vxlan_port;
-	int err;
-
-	if (nla_len(attr) < sizeof(struct nlattr))
-		return -EINVAL;
-
-	err = nla_parse_nested(exts, OVS_VXLAN_EXT_MAX, attr, exts_policy);
-	if (err < 0)
-		return err;
-
-	vxlan_port = vxlan_vport(vport);
-
-	if (exts[OVS_VXLAN_EXT_GBP])
-		vxlan_port->exts |= VXLAN_F_GBP;
-
-	return 0;
-}
-
-static struct vport *vxlan_tnl_create(const struct vport_parms *parms)
-{
-	struct net *net = ovs_dp_get_net(parms->dp);
-	struct nlattr *options = parms->options;
-	struct vxlan_port *vxlan_port;
-	struct vxlan_sock *vs;
-	struct vport *vport;
-	struct nlattr *a;
-	u16 dst_port;
-	int err;
-
-	if (!options) {
-		err = -EINVAL;
-		goto error;
-	}
-	a = nla_find_nested(options, OVS_TUNNEL_ATTR_DST_PORT);
-	if (a && nla_len(a) == sizeof(u16)) {
-		dst_port = nla_get_u16(a);
-	} else {
-		/* Require destination port from userspace. */
-		err = -EINVAL;
-		goto error;
-	}
-
-	vport = ovs_vport_alloc(sizeof(struct vxlan_port),
-				&ovs_vxlan_vport_ops, parms);
-	if (IS_ERR(vport))
-		return vport;
-
-	vxlan_port = vxlan_vport(vport);
-	strncpy(vxlan_port->name, parms->name, IFNAMSIZ);
-
-	a = nla_find_nested(options, OVS_TUNNEL_ATTR_EXTENSION);
-	if (a) {
-		err = vxlan_configure_exts(vport, a);
-		if (err) {
-			ovs_vport_free(vport);
-			goto error;
-		}
-	}
-
-	vs = vxlan_sock_add(net, htons(dst_port), vxlan_rcv, vport, true,
-			    vxlan_port->exts);
-	if (IS_ERR(vs)) {
-		ovs_vport_free(vport);
-		return (void *)vs;
-	}
-	vxlan_port->vs = vs;
-
-	return vport;
-
-error:
-	return ERR_PTR(err);
-}
-
-static int vxlan_ext_gbp(struct sk_buff *skb)
-{
-	const struct ip_tunnel_info *tun_info;
-	const struct ovs_vxlan_opts *opts;
-
-	tun_info = OVS_CB(skb)->egress_tun_info;
-	opts = tun_info->options;
-
-	if (tun_info->key.tun_flags & TUNNEL_VXLAN_OPT &&
-	    tun_info->options_len >= sizeof(*opts))
-		return opts->gbp;
-	else
-		return 0;
-}
-
-static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb)
-{
-	struct net *net = ovs_dp_get_net(vport->dp);
-	struct vxlan_port *vxlan_port = vxlan_vport(vport);
-	struct sock *sk = vxlan_port->vs->sock->sk;
-	__be16 dst_port = inet_sk(sk)->inet_sport;
-	const struct ip_tunnel_key *tun_key;
-	struct vxlan_metadata md = {0};
-	struct rtable *rt;
-	struct flowi4 fl;
-	__be16 src_port;
-	__be16 df;
-	int err;
-	u32 vxflags;
-
-	if (unlikely(!OVS_CB(skb)->egress_tun_info)) {
-		err = -EINVAL;
-		goto error;
-	}
-
-	tun_key = &OVS_CB(skb)->egress_tun_info->key;
-	rt = ovs_tunnel_route_lookup(net, tun_key, skb->mark, &fl, IPPROTO_UDP);
-	if (IS_ERR(rt)) {
-		err = PTR_ERR(rt);
-		goto error;
-	}
-
-	df = tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ?
-		htons(IP_DF) : 0;
-
-	skb->ignore_df = 1;
-
-	src_port = udp_flow_src_port(net, skb, 0, 0, true);
-	md.vni = htonl(be64_to_cpu(tun_key->tun_id) << 8);
-	md.gbp = vxlan_ext_gbp(skb);
-	vxflags = vxlan_port->exts |
-		      (tun_key->tun_flags & TUNNEL_CSUM ? VXLAN_F_UDP_CSUM : 0);
-
-	err = vxlan_xmit_skb(rt, sk, skb, fl.saddr, tun_key->ipv4_dst,
-			     tun_key->ipv4_tos, tun_key->ipv4_ttl, df,
-			     src_port, dst_port,
-			     &md, false, vxflags);
-	if (err < 0)
-		ip_rt_put(rt);
-	return err;
-error:
-	kfree_skb(skb);
-	return err;
-}
-
-static int vxlan_get_egress_tun_info(struct vport *vport, struct sk_buff *skb,
-				     struct ip_tunnel_info *egress_tun_info)
-{
-	struct net *net = ovs_dp_get_net(vport->dp);
-	struct vxlan_port *vxlan_port = vxlan_vport(vport);
-	__be16 dst_port = inet_sk(vxlan_port->vs->sock->sk)->inet_sport;
-	__be16 src_port;
-	int port_min;
-	int port_max;
-
-	inet_get_local_port_range(net, &port_min, &port_max);
-	src_port = udp_flow_src_port(net, skb, 0, 0, true);
-
-	return ovs_tunnel_get_egress_info(egress_tun_info, net,
-					  OVS_CB(skb)->egress_tun_info,
-					  IPPROTO_UDP, skb->mark,
-					  src_port, dst_port);
-}
-
-static const char *vxlan_get_name(const struct vport *vport)
-{
-	struct vxlan_port *vxlan_port = vxlan_vport(vport);
-	return vxlan_port->name;
-}
-
-static struct vport_ops ovs_vxlan_vport_ops = {
-	.type		= OVS_VPORT_TYPE_VXLAN,
-	.create		= vxlan_tnl_create,
-	.destroy	= vxlan_tnl_destroy,
-	.get_name	= vxlan_get_name,
-	.get_options	= vxlan_get_options,
-	.send		= vxlan_tnl_send,
-	.get_egress_tun_info	= vxlan_get_egress_tun_info,
-	.owner		= THIS_MODULE,
-};
-
-static int __init ovs_vxlan_tnl_init(void)
-{
-	return ovs_vport_ops_register(&ovs_vxlan_vport_ops);
-}
-
-static void __exit ovs_vxlan_tnl_exit(void)
-{
-	ovs_vport_ops_unregister(&ovs_vxlan_vport_ops);
-}
-
-module_init(ovs_vxlan_tnl_init);
-module_exit(ovs_vxlan_tnl_exit);
-
-MODULE_DESCRIPTION("OVS: VXLAN switching port");
-MODULE_LICENSE("GPL");
-MODULE_ALIAS("vport-type-4");
diff --git a/net/openvswitch/vport-vxlan.h b/net/openvswitch/vport-vxlan.h
deleted file mode 100644
index 4b08233e73d5..000000000000
--- a/net/openvswitch/vport-vxlan.h
+++ /dev/null
@@ -1,11 +0,0 @@
-#ifndef VPORT_VXLAN_H
-#define VPORT_VXLAN_H 1
-
-#include <linux/kernel.h>
-#include <linux/types.h>
-
-struct ovs_vxlan_opts {
-	__u32 gbp;
-};
-
-#endif
-- 
cgit v1.2.3


From e181a5430491f038c198f0eacc3142d6e871c2da Mon Sep 17 00:00:00 2001
From: Mathias Krause <minipli@googlemail.com>
Date: Sun, 19 Jul 2015 22:21:13 +0200
Subject: net: #ifdefify sk_classid member of struct sock

The sk_classid member is only required when CONFIG_CGROUP_NET_CLASSID is
enabled. #ifdefify it to reduce the size of struct sock on 32 bit
systems, at least.

Signed-off-by: Mathias Krause <minipli@googlemail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h       | 2 ++
 net/netfilter/nft_meta.c | 4 ++++
 2 files changed, 6 insertions(+)

(limited to 'include/net')

diff --git a/include/net/sock.h b/include/net/sock.h
index 05a8c1aea251..4353ef70bf48 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -429,7 +429,9 @@ struct sock {
 	void			*sk_security;
 #endif
 	__u32			sk_mark;
+#ifdef CONFIG_CGROUP_NET_CLASSID
 	u32			sk_classid;
+#endif
 	struct cg_proto		*sk_cgrp;
 	void			(*sk_state_change)(struct sock *sk);
 	void			(*sk_data_ready)(struct sock *sk);
diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c
index 52561e1c31e2..cb2f13ebb5a6 100644
--- a/net/netfilter/nft_meta.c
+++ b/net/netfilter/nft_meta.c
@@ -166,11 +166,13 @@ void nft_meta_get_eval(const struct nft_expr *expr,
 			goto err;
 		*dest = out->group;
 		break;
+#ifdef CONFIG_CGROUP_NET_CLASSID
 	case NFT_META_CGROUP:
 		if (skb->sk == NULL || !sk_fullsock(skb->sk))
 			goto err;
 		*dest = skb->sk->sk_classid;
 		break;
+#endif
 	default:
 		WARN_ON(1);
 		goto err;
@@ -246,7 +248,9 @@ int nft_meta_get_init(const struct nft_ctx *ctx,
 	case NFT_META_CPU:
 	case NFT_META_IIFGROUP:
 	case NFT_META_OIFGROUP:
+#ifdef CONFIG_CGROUP_NET_CLASSID
 	case NFT_META_CGROUP:
+#endif
 		len = sizeof(u32);
 		break;
 	case NFT_META_IIFNAME:
-- 
cgit v1.2.3


From 052831879945be0d9fad2216b127147c565ec1b1 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Wed, 22 Jul 2015 14:43:58 +0200
Subject: ip_tunnel: Provide tunnel metadata API for CONFIG_INET=n

Account for the configuration FIB_RULES=y && INET=n as FIB_RULES can
be selected by IPV6 or DECNET without INET.

Fixes: e7030878fc84 ("fib: Add fib rule match on tunnel id")
Fixes: 3093fbe7ff4b ("route: Per route IP tunnel metadata via lightweight tunnel")
Reported-by: kbuild test robot <fengguang.wu@intel.com>
Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_tunnels.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'include/net')

diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 0a5a7763eec2..d975b3ebd6c7 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -314,6 +314,21 @@ static inline int ip_tunnel_collect_metadata(void)
 void ip_tunnel_need_metadata(void);
 void ip_tunnel_unneed_metadata(void);
 
+#else /* CONFIG_INET */
+
+static inline struct ip_tunnel_info *lwt_tun_info(struct lwtunnel_state *lwtstate)
+{
+	return NULL;
+}
+
+static inline void ip_tunnel_need_metadata(void)
+{
+}
+
+static inline void ip_tunnel_unneed_metadata(void)
+{
+}
+
 #endif /* CONFIG_INET */
 
 #endif /* __NET_IP_TUNNELS_H */
-- 
cgit v1.2.3


From 045a0fa0c5f5ea0f16c009f924ea579634afbba8 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Thu, 23 Jul 2015 10:08:44 +0200
Subject: ip_tunnel: Call ip_tunnel_core_init() from inet_init()

Convert the module_init() to a invocation from inet_init() since
ip_tunnel_core is part of the INET built-in.

Fixes: 3093fbe7ff4 ("route: Per route IP tunnel metadata via lightweight tunnel")
Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_tunnels.h  |  2 ++
 net/ipv4/af_inet.c        |  3 +++
 net/ipv4/ip_tunnel_core.c | 11 +----------
 3 files changed, 6 insertions(+), 10 deletions(-)

(limited to 'include/net')

diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index d975b3ebd6c7..47984415f5d1 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -311,6 +311,8 @@ static inline int ip_tunnel_collect_metadata(void)
 	return static_key_false(&ip_tunnel_metadata_cnt);
 }
 
+void __init ip_tunnel_core_init(void);
+
 void ip_tunnel_need_metadata(void);
 void ip_tunnel_unneed_metadata(void);
 
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 9532ee87151f..cc4e498a0ccf 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -112,6 +112,7 @@
 #include <net/raw.h>
 #include <net/icmp.h>
 #include <net/inet_common.h>
+#include <net/ip_tunnels.h>
 #include <net/xfrm.h>
 #include <net/net_namespace.h>
 #include <net/secure_seq.h>
@@ -1780,6 +1781,8 @@ static int __init inet_init(void)
 
 	dev_add_pack(&ip_packet_type);
 
+	ip_tunnel_core_init();
+
 	rc = 0;
 out:
 	return rc;
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 630e6d5712e8..5512f4e4ec1b 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -292,19 +292,10 @@ static const struct lwtunnel_encap_ops ip_tun_lwt_ops = {
 	.get_encap_size = ip_tun_encap_nlsize,
 };
 
-static int __init ip_tunnel_core_init(void)
+void __init ip_tunnel_core_init(void)
 {
 	lwtunnel_encap_add_ops(&ip_tun_lwt_ops, LWTUNNEL_ENCAP_IP);
-
-	return 0;
-}
-module_init(ip_tunnel_core_init);
-
-static void __exit ip_tunnel_core_exit(void)
-{
-	lwtunnel_encap_del_ops(&ip_tun_lwt_ops, LWTUNNEL_ENCAP_IP);
 }
-module_exit(ip_tunnel_core_exit);
 
 struct static_key ip_tunnel_metadata_cnt = STATIC_KEY_INIT_FALSE;
 EXPORT_SYMBOL(ip_tunnel_metadata_cnt);
-- 
cgit v1.2.3


From a6cb869b3b7c16fd7c3ee766dd9f9a4fdda7edf9 Mon Sep 17 00:00:00 2001
From: Varka Bhadram <varkabhadram@gmail.com>
Date: Wed, 24 Jun 2015 11:36:35 +0200
Subject: cfg802154: add PM hooks

This patch help to implement suspend/resume in mac802154, these
hooks will be run before the device is suspended and after it
resumes.

Signed-off-by: Varka Bhadram <varkab@cdac.in>
Signed-off-by: Alexander Aring <alex.aring@gmail.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/net/cfg802154.h   |  2 ++
 net/ieee802154/rdev-ops.h | 20 ++++++++++++++++++++
 net/ieee802154/sysfs.c    | 38 ++++++++++++++++++++++++++++++++++++++
 net/ieee802154/trace.h    | 22 ++++++++++++++++++++++
 4 files changed, 82 insertions(+)

(limited to 'include/net')

diff --git a/include/net/cfg802154.h b/include/net/cfg802154.h
index 290a9a69af07..382f94b59f2f 100644
--- a/include/net/cfg802154.h
+++ b/include/net/cfg802154.h
@@ -34,6 +34,8 @@ struct cfg802154_ops {
 							   int type);
 	void	(*del_virtual_intf_deprecated)(struct wpan_phy *wpan_phy,
 					       struct net_device *dev);
+	int	(*suspend)(struct wpan_phy *wpan_phy);
+	int	(*resume)(struct wpan_phy *wpan_phy);
 	int	(*add_virtual_intf)(struct wpan_phy *wpan_phy,
 				    const char *name,
 				    unsigned char name_assign_type,
diff --git a/net/ieee802154/rdev-ops.h b/net/ieee802154/rdev-ops.h
index b2155a123f6c..8d5960a37195 100644
--- a/net/ieee802154/rdev-ops.h
+++ b/net/ieee802154/rdev-ops.h
@@ -23,6 +23,26 @@ rdev_del_virtual_intf_deprecated(struct cfg802154_registered_device *rdev,
 	rdev->ops->del_virtual_intf_deprecated(&rdev->wpan_phy, dev);
 }
 
+static inline int
+rdev_suspend(struct cfg802154_registered_device *rdev)
+{
+	int ret;
+	trace_802154_rdev_suspend(&rdev->wpan_phy);
+	ret = rdev->ops->suspend(&rdev->wpan_phy);
+	trace_802154_rdev_return_int(&rdev->wpan_phy, ret);
+	return ret;
+}
+
+static inline int
+rdev_resume(struct cfg802154_registered_device *rdev)
+{
+	int ret;
+	trace_802154_rdev_resume(&rdev->wpan_phy);
+	ret = rdev->ops->resume(&rdev->wpan_phy);
+	trace_802154_rdev_return_int(&rdev->wpan_phy, ret);
+	return ret;
+}
+
 static inline int
 rdev_add_virtual_intf(struct cfg802154_registered_device *rdev, char *name,
 		      unsigned char name_assign_type,
diff --git a/net/ieee802154/sysfs.c b/net/ieee802154/sysfs.c
index 133b4280660c..bd88525b041e 100644
--- a/net/ieee802154/sysfs.c
+++ b/net/ieee802154/sysfs.c
@@ -14,11 +14,13 @@
  */
 
 #include <linux/device.h>
+#include <linux/rtnetlink.h>
 
 #include <net/cfg802154.h>
 
 #include "core.h"
 #include "sysfs.h"
+#include "rdev-ops.h"
 
 static inline struct cfg802154_registered_device *
 dev_to_rdev(struct device *dev)
@@ -62,10 +64,46 @@ static struct attribute *pmib_attrs[] = {
 };
 ATTRIBUTE_GROUPS(pmib);
 
+#ifdef CONFIG_PM_SLEEP
+static int wpan_phy_suspend(struct device *dev)
+{
+	struct cfg802154_registered_device *rdev = dev_to_rdev(dev);
+	int ret = 0;
+
+	if (rdev->ops->suspend) {
+		rtnl_lock();
+		ret = rdev_suspend(rdev);
+		rtnl_unlock();
+	}
+
+	return ret;
+}
+
+static int wpan_phy_resume(struct device *dev)
+{
+	struct cfg802154_registered_device *rdev = dev_to_rdev(dev);
+	int ret = 0;
+
+	if (rdev->ops->resume) {
+		rtnl_lock();
+		ret = rdev_resume(rdev);
+		rtnl_unlock();
+	}
+
+	return ret;
+}
+
+static SIMPLE_DEV_PM_OPS(wpan_phy_pm_ops, wpan_phy_suspend, wpan_phy_resume);
+#define WPAN_PHY_PM_OPS (&wpan_phy_pm_ops)
+#else
+#define WPAN_PHY_PM_OPS NULL
+#endif
+
 struct class wpan_phy_class = {
 	.name = "ieee802154",
 	.dev_release = wpan_phy_release,
 	.dev_groups = pmib_groups,
+	.pm = WPAN_PHY_PM_OPS,
 };
 
 int wpan_phy_sysfs_init(void)
diff --git a/net/ieee802154/trace.h b/net/ieee802154/trace.h
index 9b5f0eb36696..4399b7fbaa31 100644
--- a/net/ieee802154/trace.h
+++ b/net/ieee802154/trace.h
@@ -40,6 +40,28 @@
  *			rdev->ops traces		     *
  *************************************************************/
 
+DECLARE_EVENT_CLASS(wpan_phy_only_evt,
+	TP_PROTO(struct wpan_phy *wpan_phy),
+	TP_ARGS(wpan_phy),
+	TP_STRUCT__entry(
+		WPAN_PHY_ENTRY
+	),
+	TP_fast_assign(
+		WPAN_PHY_ASSIGN;
+	),
+	TP_printk(WPAN_PHY_PR_FMT, WPAN_PHY_PR_ARG)
+);
+
+DEFINE_EVENT(wpan_phy_only_evt, 802154_rdev_suspend,
+	TP_PROTO(struct wpan_phy *wpan_phy),
+	TP_ARGS(wpan_phy)
+);
+
+DEFINE_EVENT(wpan_phy_only_evt, 802154_rdev_resume,
+	TP_PROTO(struct wpan_phy *wpan_phy),
+	TP_ARGS(wpan_phy)
+);
+
 TRACE_EVENT(802154_rdev_add_virtual_intf,
 	TP_PROTO(struct wpan_phy *wpan_phy, char *name,
 		 enum nl802154_iftype type, __le64 extended_addr),
-- 
cgit v1.2.3


From 729a8989b3fa8ae7965c537dfccbd08512e84d3c Mon Sep 17 00:00:00 2001
From: Varka Bhadram <varkabhadram@gmail.com>
Date: Tue, 7 Jul 2015 10:50:42 +0530
Subject: mac802154: do not export ieee802154_rx()

Right now there are no other users for ieee802154_rx()
in kernel. So lets remove EXPORT_SYMBOL() for this.

Also it moves the function prototype from global header
file to local header file.

Signed-off-by: Varka Bhadram <varkabhadram@gmail.com>
Acked-by: Alexander Aring <alex.aring@gmail.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/net/mac802154.h      | 17 -----------------
 net/mac802154/ieee802154_i.h |  1 +
 net/mac802154/rx.c           |  1 -
 3 files changed, 1 insertion(+), 18 deletions(-)

(limited to 'include/net')

diff --git a/include/net/mac802154.h b/include/net/mac802154.h
index f534a46911dc..b7f99615224b 100644
--- a/include/net/mac802154.h
+++ b/include/net/mac802154.h
@@ -320,23 +320,6 @@ int ieee802154_register_hw(struct ieee802154_hw *hw);
  */
 void ieee802154_unregister_hw(struct ieee802154_hw *hw);
 
-/**
- * ieee802154_rx - receive frame
- *
- * Use this function to hand received frames to mac802154. The receive
- * buffer in @skb must start with an IEEE 802.15.4 header. In case of a
- * paged @skb is used, the driver is recommended to put the ieee802154
- * header of the frame on the linear part of the @skb to avoid memory
- * allocation and/or memcpy by the stack.
- *
- * This function may not be called in IRQ context. Calls to this function
- * for a single hardware must be synchronized against each other.
- *
- * @hw: the hardware this frame came in on
- * @skb: the buffer to receive, owned by mac802154 after this call
- */
-void ieee802154_rx(struct ieee802154_hw *hw, struct sk_buff *skb);
-
 /**
  * ieee802154_rx_irqsafe - receive frame
  *
diff --git a/net/mac802154/ieee802154_i.h b/net/mac802154/ieee802154_i.h
index 0054f39d499b..eb8502a6e719 100644
--- a/net/mac802154/ieee802154_i.h
+++ b/net/mac802154/ieee802154_i.h
@@ -124,6 +124,7 @@ ieee802154_sdata_running(struct ieee802154_sub_if_data *sdata)
 
 extern struct ieee802154_mlme_ops mac802154_mlme_wpan;
 
+void ieee802154_rx(struct ieee802154_hw *hw, struct sk_buff *skb);
 netdev_tx_t
 ieee802154_monitor_start_xmit(struct sk_buff *skb, struct net_device *dev);
 netdev_tx_t
diff --git a/net/mac802154/rx.c b/net/mac802154/rx.c
index 5a258c11ed3b..7791c9b8cb57 100644
--- a/net/mac802154/rx.c
+++ b/net/mac802154/rx.c
@@ -290,7 +290,6 @@ void ieee802154_rx(struct ieee802154_hw *hw, struct sk_buff *skb)
 drop:
 	kfree_skb(skb);
 }
-EXPORT_SYMBOL(ieee802154_rx);
 
 void
 ieee802154_rx_irqsafe(struct ieee802154_hw *hw, struct sk_buff *skb, u8 lqi)
-- 
cgit v1.2.3


From cb02a25583b59ce48267472cd092485d754964f9 Mon Sep 17 00:00:00 2001
From: Dean Jenkins <Dean_Jenkins@mentor.com>
Date: Tue, 23 Jun 2015 17:59:38 +0100
Subject: Bluetooth: __l2cap_wait_ack() use msecs_to_jiffies()

Use msecs_to_jiffies() instead of using HZ so that it
is easier to specify the time in milliseconds.

Also add a #define L2CAP_WAIT_ACK_POLL_PERIOD to specify the 200ms
polling period so that it is defined in a single place.

Signed-off-by: Dean Jenkins <Dean_Jenkins@mentor.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/net/bluetooth/l2cap.h | 1 +
 net/bluetooth/l2cap_sock.c    | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h
index 2239a3753092..3dcad4159b0b 100644
--- a/include/net/bluetooth/l2cap.h
+++ b/include/net/bluetooth/l2cap.h
@@ -55,6 +55,7 @@
 #define L2CAP_INFO_TIMEOUT		msecs_to_jiffies(4000)
 #define L2CAP_MOVE_TIMEOUT		msecs_to_jiffies(4000)
 #define L2CAP_MOVE_ERTX_TIMEOUT		msecs_to_jiffies(60000)
+#define L2CAP_WAIT_ACK_POLL_PERIOD	msecs_to_jiffies(200)
 
 #define L2CAP_A2MP_DEFAULT_MTU		670
 
diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c
index d915e4a96313..f0b052a75e8a 100644
--- a/net/bluetooth/l2cap_sock.c
+++ b/net/bluetooth/l2cap_sock.c
@@ -1058,7 +1058,7 @@ static int __l2cap_wait_ack(struct sock *sk, struct l2cap_chan *chan)
 {
 	DECLARE_WAITQUEUE(wait, current);
 	int err = 0;
-	int timeo = HZ/5;
+	int timeo = L2CAP_WAIT_ACK_POLL_PERIOD;
 
 	add_wait_queue(sk_sleep(sk), &wait);
 	set_current_state(TASK_INTERRUPTIBLE);
@@ -1066,7 +1066,7 @@ static int __l2cap_wait_ack(struct sock *sk, struct l2cap_chan *chan)
 		BT_DBG("Waiting for %d ACKs", chan->unacked_frames);
 
 		if (!timeo)
-			timeo = HZ/5;
+			timeo = L2CAP_WAIT_ACK_POLL_PERIOD;
 
 		if (signal_pending(current)) {
 			err = sock_intr_errno(timeo);
-- 
cgit v1.2.3


From e432c72c464d2deb6c66d1e2a5f548dc1f0ef4dc Mon Sep 17 00:00:00 2001
From: Dean Jenkins <Dean_Jenkins@mentor.com>
Date: Tue, 23 Jun 2015 17:59:39 +0100
Subject: Bluetooth: __l2cap_wait_ack() add defensive timeout

Add a timeout to prevent the do while loop running in an
infinite loop. This ensures that the channel will be
instructed to close within 10 seconds so prevents
l2cap_sock_shutdown() getting stuck forever.

Returns -ENOLINK when the timeout is reached. The channel
will be subequently closed and not all data will be ACK'ed.

Signed-off-by: Dean Jenkins <Dean_Jenkins@mentor.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/net/bluetooth/l2cap.h |  1 +
 net/bluetooth/l2cap_sock.c    | 11 ++++++++++-
 2 files changed, 11 insertions(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h
index 3dcad4159b0b..c98afc08cc26 100644
--- a/include/net/bluetooth/l2cap.h
+++ b/include/net/bluetooth/l2cap.h
@@ -56,6 +56,7 @@
 #define L2CAP_MOVE_TIMEOUT		msecs_to_jiffies(4000)
 #define L2CAP_MOVE_ERTX_TIMEOUT		msecs_to_jiffies(60000)
 #define L2CAP_WAIT_ACK_POLL_PERIOD	msecs_to_jiffies(200)
+#define L2CAP_WAIT_ACK_TIMEOUT		msecs_to_jiffies(10000)
 
 #define L2CAP_A2MP_DEFAULT_MTU		670
 
diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c
index f0b052a75e8a..586b3d580cfc 100644
--- a/net/bluetooth/l2cap_sock.c
+++ b/net/bluetooth/l2cap_sock.c
@@ -1059,11 +1059,15 @@ static int __l2cap_wait_ack(struct sock *sk, struct l2cap_chan *chan)
 	DECLARE_WAITQUEUE(wait, current);
 	int err = 0;
 	int timeo = L2CAP_WAIT_ACK_POLL_PERIOD;
+	/* Timeout to prevent infinite loop */
+	unsigned long timeout = jiffies + L2CAP_WAIT_ACK_TIMEOUT;
 
 	add_wait_queue(sk_sleep(sk), &wait);
 	set_current_state(TASK_INTERRUPTIBLE);
 	do {
-		BT_DBG("Waiting for %d ACKs", chan->unacked_frames);
+		BT_DBG("Waiting for %d ACKs, timeout %04d ms",
+		       chan->unacked_frames, time_after(jiffies, timeout) ? 0 :
+		       jiffies_to_msecs(timeout - jiffies));
 
 		if (!timeo)
 			timeo = L2CAP_WAIT_ACK_POLL_PERIOD;
@@ -1082,6 +1086,11 @@ static int __l2cap_wait_ack(struct sock *sk, struct l2cap_chan *chan)
 		if (err)
 			break;
 
+		if (time_after(jiffies, timeout)) {
+			err = -ENOLINK;
+			break;
+		}
+
 	} while (chan->unacked_frames > 0 &&
 		 chan->state == BT_CONNECTED);
 
-- 
cgit v1.2.3


From 8757825b128e71319150219b0745d3ecb87f34aa Mon Sep 17 00:00:00 2001
From: Seungyoun Ju <sy39.ju@samsung.com>
Date: Mon, 13 Jul 2015 17:28:13 +0900
Subject: Bluetooth: hci_check_conn_params() check proper range

Slave latency range has been changed in Core Spec. 4.2 by Erratum 5419
of ESR08_V1.0.0. And it should be applied to Core Spec. 4.0 and 4.1.

Before:
   connSlaveLatency <= ((connSupervisionTimeout / connIntervalMax) - 1)

After:
   connSlaveLatency <= ((connSupervisionTimeout / (connIntervalMax*2)) - 1)

This patch makes hci_check_conn_params() check the allowable slave
latency range using the changed way.

Signed-off-by: Seungyoun Ju <sy39.ju@samsung.com>
Signed-off-by: Johan Hedberg <johan.hedberg@intel.com>
---
 include/net/bluetooth/hci_core.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index 3bd618d3e55d..2a6b0919e23f 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -1297,7 +1297,7 @@ static inline int hci_check_conn_params(u16 min, u16 max, u16 latency,
 	if (max >= to_multiplier * 8)
 		return -EINVAL;
 
-	max_latency = (to_multiplier * 8 / max) - 1;
+	max_latency = (to_multiplier * 4 / max) - 1;
 	if (latency > 499 || latency > max_latency)
 		return -EINVAL;
 
-- 
cgit v1.2.3


From 5a6228a0b472062646434cd2536d109c102b606e Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Fri, 24 Jul 2015 12:28:36 +0200
Subject: lwtunnel: change prototype of lwtunnel_state_get()

It saves some lines and simplify a bit the code when the state is returning
by this function. It's also useful to handle a NULL entry.

To avoid too long lines, I've also renamed lwtunnel_state_get() and
lwtunnel_state_put() to lwtstate_get() and lwtstate_put().

CC: Thomas Graf <tgraf@suug.ch>
CC: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Acked-by: Thomas Graf <tgraf@suug.ch>
Acked-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/lwtunnel.h   | 16 +++++++++++-----
 net/ipv4/fib_semantics.c |  9 ++++-----
 net/ipv4/route.c         |  9 ++-------
 net/ipv6/ip6_fib.c       |  2 +-
 net/ipv6/route.c         |  8 ++------
 5 files changed, 20 insertions(+), 24 deletions(-)

(limited to 'include/net')

diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h
index 918e03c1dafa..b02039081b04 100644
--- a/include/net/lwtunnel.h
+++ b/include/net/lwtunnel.h
@@ -35,12 +35,16 @@ extern const struct lwtunnel_encap_ops __rcu *
 		lwtun_encaps[LWTUNNEL_ENCAP_MAX+1];
 
 #ifdef CONFIG_LWTUNNEL
-static inline void lwtunnel_state_get(struct lwtunnel_state *lws)
+static inline struct lwtunnel_state *
+lwtstate_get(struct lwtunnel_state *lws)
 {
-	atomic_inc(&lws->refcnt);
+	if (lws)
+		atomic_inc(&lws->refcnt);
+
+	return lws;
 }
 
-static inline void lwtunnel_state_put(struct lwtunnel_state *lws)
+static inline void lwtstate_put(struct lwtunnel_state *lws)
 {
 	if (!lws)
 		return;
@@ -74,11 +78,13 @@ int lwtunnel_output6(struct sock *sk, struct sk_buff *skb);
 
 #else
 
-static inline void lwtunnel_state_get(struct lwtunnel_state *lws)
+static inline struct lwtunnel_state *
+lwtstate_get(struct lwtunnel_state *lws)
 {
+	return lws;
 }
 
-static inline void lwtunnel_state_put(struct lwtunnel_state *lws)
+static inline void lwtstate_put(struct lwtunnel_state *lws)
 {
 }
 
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index d4c6732cfbfa..65e00399a9a6 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -209,7 +209,7 @@ static void free_fib_info_rcu(struct rcu_head *head)
 	change_nexthops(fi) {
 		if (nexthop_nh->nh_dev)
 			dev_put(nexthop_nh->nh_dev);
-		lwtunnel_state_put(nexthop_nh->nh_lwtstate);
+		lwtstate_put(nexthop_nh->nh_lwtstate);
 		free_nh_exceptions(nexthop_nh);
 		rt_fibinfo_free_cpus(nexthop_nh->nh_pcpu_rth_output);
 		rt_fibinfo_free(&nexthop_nh->nh_rth_input);
@@ -514,8 +514,8 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
 							   nla, &lwtstate);
 				if (ret)
 					goto errout;
-				lwtunnel_state_get(lwtstate);
-				nexthop_nh->nh_lwtstate = lwtstate;
+				nexthop_nh->nh_lwtstate =
+					lwtstate_get(lwtstate);
 			}
 		}
 
@@ -971,8 +971,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
 			if (err)
 				goto failure;
 
-			lwtunnel_state_get(lwtstate);
-			nh->nh_lwtstate = lwtstate;
+			nh->nh_lwtstate = lwtstate_get(lwtstate);
 		}
 		nh->nh_oif = cfg->fc_oif;
 		nh->nh_gw = cfg->fc_gw;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 519ec232818d..11096396ef4a 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1358,7 +1358,7 @@ static void ipv4_dst_destroy(struct dst_entry *dst)
 		list_del(&rt->rt_uncached);
 		spin_unlock_bh(&ul->lock);
 	}
-	lwtunnel_state_put(rt->rt_lwtstate);
+	lwtstate_put(rt->rt_lwtstate);
 }
 
 void rt_flush_dev(struct net_device *dev)
@@ -1407,12 +1407,7 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
 #ifdef CONFIG_IP_ROUTE_CLASSID
 		rt->dst.tclassid = nh->nh_tclassid;
 #endif
-		if (nh->nh_lwtstate) {
-			lwtunnel_state_get(nh->nh_lwtstate);
-			rt->rt_lwtstate = nh->nh_lwtstate;
-		} else {
-			rt->rt_lwtstate = NULL;
-		}
+		rt->rt_lwtstate = lwtstate_get(nh->nh_lwtstate);
 		if (unlikely(fnhe))
 			cached = rt_bind_exception(rt, fnhe, daddr);
 		else if (!(rt->dst.flags & DST_NOCACHE))
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index d715f2e0c4e7..5693b5eb8482 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -178,7 +178,7 @@ static void rt6_free_pcpu(struct rt6_info *non_pcpu_rt)
 static void rt6_release(struct rt6_info *rt)
 {
 	if (atomic_dec_and_test(&rt->rt6i_ref)) {
-		lwtunnel_state_put(rt->rt6i_lwtstate);
+		lwtstate_put(rt->rt6i_lwtstate);
 		rt6_free_pcpu(rt);
 		dst_free(&rt->dst);
 	}
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index fbe27fb6bd3f..c9b2b9fe83fc 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1778,8 +1778,7 @@ int ip6_route_add(struct fib6_config *cfg)
 					   cfg->fc_encap, &lwtstate);
 		if (err)
 			goto out;
-		lwtunnel_state_get(lwtstate);
-		rt->rt6i_lwtstate = lwtstate;
+		rt->rt6i_lwtstate = lwtstate_get(lwtstate);
 		if (lwtunnel_output_redirect(rt->rt6i_lwtstate))
 			rt->dst.output = lwtunnel_output6;
 	}
@@ -2161,10 +2160,7 @@ static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
 #endif
 	rt->rt6i_prefsrc = ort->rt6i_prefsrc;
 	rt->rt6i_table = ort->rt6i_table;
-	if (ort->rt6i_lwtstate) {
-		lwtunnel_state_get(ort->rt6i_lwtstate);
-		rt->rt6i_lwtstate = ort->rt6i_lwtstate;
-	}
+	rt->rt6i_lwtstate = lwtstate_get(ort->rt6i_lwtstate);
 }
 
 #ifdef CONFIG_IPV6_ROUTE_INFO
-- 
cgit v1.2.3


From 205845a34763432040496908c8f52f1f97e5ee62 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Fri, 24 Jul 2015 15:50:31 +0200
Subject: bonding: convert num_grat_arp to the new bonding option API

num_grat_arp wasn't converted to the new bonding option API, so do this
now and remove the specific sysfs store option in order to use the
standard one. num_grat_arp is the same as num_unsol_na so add it as an
alias with the same option settings. An important difference is the option
name which is matched in bond_sysfs_store_option().

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Acked-by: Veaceslav Falico <vfalico@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_options.c |  7 +++++++
 drivers/net/bonding/bond_sysfs.c   | 20 +++-----------------
 include/net/bond_options.h         |  1 +
 3 files changed, 11 insertions(+), 17 deletions(-)

(limited to 'include/net')

diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c
index e9c624d54dd4..6dda57e2e724 100644
--- a/drivers/net/bonding/bond_options.c
+++ b/drivers/net/bonding/bond_options.c
@@ -420,6 +420,13 @@ static const struct bond_option bond_opts[BOND_OPT_LAST] = {
 		.flags = BOND_OPTFLAG_IFDOWN,
 		.values = bond_ad_user_port_key_tbl,
 		.set = bond_option_ad_user_port_key_set,
+	},
+	[BOND_OPT_NUM_PEER_NOTIF_ALIAS] = {
+		.id = BOND_OPT_NUM_PEER_NOTIF_ALIAS,
+		.name = "num_grat_arp",
+		.desc = "Number of peer notifications to send on failover event",
+		.values = bond_num_peer_notif_tbl,
+		.set = bond_option_num_peer_notif_set
 	}
 };
 
diff --git a/drivers/net/bonding/bond_sysfs.c b/drivers/net/bonding/bond_sysfs.c
index 31835a4dab57..f4ae72086215 100644
--- a/drivers/net/bonding/bond_sysfs.c
+++ b/drivers/net/bonding/bond_sysfs.c
@@ -380,7 +380,7 @@ static ssize_t bonding_show_ad_select(struct device *d,
 static DEVICE_ATTR(ad_select, S_IRUGO | S_IWUSR,
 		   bonding_show_ad_select, bonding_sysfs_store_option);
 
-/* Show and set the number of peer notifications to send after a failover event. */
+/* Show the number of peer notifications to send after a failover event. */
 static ssize_t bonding_show_num_peer_notif(struct device *d,
 					   struct device_attribute *attr,
 					   char *buf)
@@ -388,24 +388,10 @@ static ssize_t bonding_show_num_peer_notif(struct device *d,
 	struct bonding *bond = to_bond(d);
 	return sprintf(buf, "%d\n", bond->params.num_peer_notif);
 }
-
-static ssize_t bonding_store_num_peer_notif(struct device *d,
-					    struct device_attribute *attr,
-					    const char *buf, size_t count)
-{
-	struct bonding *bond = to_bond(d);
-	int ret;
-
-	ret = bond_opt_tryset_rtnl(bond, BOND_OPT_NUM_PEER_NOTIF, (char *)buf);
-	if (!ret)
-		ret = count;
-
-	return ret;
-}
 static DEVICE_ATTR(num_grat_arp, S_IRUGO | S_IWUSR,
-		   bonding_show_num_peer_notif, bonding_store_num_peer_notif);
+		   bonding_show_num_peer_notif, bonding_sysfs_store_option);
 static DEVICE_ATTR(num_unsol_na, S_IRUGO | S_IWUSR,
-		   bonding_show_num_peer_notif, bonding_store_num_peer_notif);
+		   bonding_show_num_peer_notif, bonding_sysfs_store_option);
 
 /* Show the MII monitor interval. */
 static ssize_t bonding_show_miimon(struct device *d,
diff --git a/include/net/bond_options.h b/include/net/bond_options.h
index c28aca25320e..1797235cd590 100644
--- a/include/net/bond_options.h
+++ b/include/net/bond_options.h
@@ -66,6 +66,7 @@ enum {
 	BOND_OPT_AD_ACTOR_SYS_PRIO,
 	BOND_OPT_AD_ACTOR_SYSTEM,
 	BOND_OPT_AD_USER_PORT_KEY,
+	BOND_OPT_NUM_PEER_NOTIF_ALIAS,
 	BOND_OPT_LAST
 };
 
-- 
cgit v1.2.3


From 877d1f6291f8e391237e324be58479a3e3a7407c Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Tue, 28 Jul 2015 16:02:05 -0700
Subject: net: Set sk_txhash from a random number

This patch creates sk_set_txhash and eliminates protocol specific
inet_set_txhash and ip6_set_txhash. sk_set_txhash simply sets a
random number instead of performing flow dissection. sk_set_txash
is also allowed to be called multiple times for the same socket,
we'll need this when redoing the hash for negative routing advice.

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip.h    | 16 ----------------
 include/net/ipv6.h  | 19 -------------------
 include/net/sock.h  |  8 ++++++++
 net/ipv4/datagram.c |  2 +-
 net/ipv4/tcp_ipv4.c |  4 ++--
 net/ipv6/datagram.c |  2 +-
 net/ipv6/tcp_ipv6.c |  4 ++--
 7 files changed, 14 insertions(+), 41 deletions(-)

(limited to 'include/net')

diff --git a/include/net/ip.h b/include/net/ip.h
index d5fe9f2ab699..bee5f3582e38 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -370,22 +370,6 @@ static inline void iph_to_flow_copy_v4addrs(struct flow_keys *flow,
 	flow->control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
 }
 
-static inline void inet_set_txhash(struct sock *sk)
-{
-	struct inet_sock *inet = inet_sk(sk);
-	struct flow_keys keys;
-
-	memset(&keys, 0, sizeof(keys));
-
-	keys.addrs.v4addrs.src = inet->inet_saddr;
-	keys.addrs.v4addrs.dst = inet->inet_daddr;
-	keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
-	keys.ports.src = inet->inet_sport;
-	keys.ports.dst = inet->inet_dport;
-
-	sk->sk_txhash = flow_hash_from_keys(&keys);
-}
-
 static inline __wsum inet_gro_compute_pseudo(struct sk_buff *skb, int proto)
 {
 	const struct iphdr *iph = skb_gro_network_header(skb);
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 82dbdb092a5d..7c79798bcaab 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -707,25 +707,6 @@ static inline void iph_to_flow_copy_v6addrs(struct flow_keys *flow,
 }
 
 #if IS_ENABLED(CONFIG_IPV6)
-static inline void ip6_set_txhash(struct sock *sk)
-{
-	struct inet_sock *inet = inet_sk(sk);
-	struct ipv6_pinfo *np = inet6_sk(sk);
-	struct flow_keys keys;
-
-	memset(&keys, 0, sizeof(keys));
-
-	memcpy(&keys.addrs.v6addrs.src, &np->saddr,
-	       sizeof(keys.addrs.v6addrs.src));
-	memcpy(&keys.addrs.v6addrs.dst, &sk->sk_v6_daddr,
-	       sizeof(keys.addrs.v6addrs.dst));
-	keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
-	keys.ports.src = inet->inet_sport;
-	keys.ports.dst = inet->inet_dport;
-
-	sk->sk_txhash = flow_hash_from_keys(&keys);
-}
-
 static inline __be32 ip6_make_flowlabel(struct net *net, struct sk_buff *skb,
 					__be32 flowlabel, bool autolabel)
 {
diff --git a/include/net/sock.h b/include/net/sock.h
index 4353ef70bf48..fe735c4841f6 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1687,6 +1687,14 @@ static inline void sock_graft(struct sock *sk, struct socket *parent)
 kuid_t sock_i_uid(struct sock *sk);
 unsigned long sock_i_ino(struct sock *sk);
 
+static inline void sk_set_txhash(struct sock *sk)
+{
+	sk->sk_txhash = prandom_u32();
+
+	if (unlikely(!sk->sk_txhash))
+		sk->sk_txhash = 1;
+}
+
 static inline struct dst_entry *
 __sk_dst_get(struct sock *sk)
 {
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index 574fad9cca05..f915abff1350 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -74,7 +74,7 @@ int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len
 	inet->inet_daddr = fl4->daddr;
 	inet->inet_dport = usin->sin_port;
 	sk->sk_state = TCP_ESTABLISHED;
-	inet_set_txhash(sk);
+	sk_set_txhash(sk);
 	inet->inet_id = jiffies;
 
 	sk_dst_set(sk, &rt->dst);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 486ba96ae91a..d27eb549ced6 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -222,7 +222,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	if (err)
 		goto failure;
 
-	inet_set_txhash(sk);
+	sk_set_txhash(sk);
 
 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
 			       inet->inet_sport, inet->inet_dport, sk);
@@ -1277,7 +1277,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
 	newinet->rcv_tos      = ip_hdr(skb)->tos;
 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
-	inet_set_txhash(newsk);
+	sk_set_txhash(newsk);
 	if (inet_opt)
 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
 	newinet->inet_id = newtp->write_seq ^ jiffies;
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 2572a324b345..9aadd57808a5 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -199,7 +199,7 @@ ipv4_connected:
 		      NULL);
 
 	sk->sk_state = TCP_ESTABLISHED;
-	ip6_set_txhash(sk);
+	sk_set_txhash(sk);
 out:
 	fl6_sock_release(flowlabel);
 	return err;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index d540846a1a79..52dd0d9974d6 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -276,7 +276,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 	if (err)
 		goto late_failure;
 
-	ip6_set_txhash(sk);
+	sk_set_txhash(sk);
 
 	if (!tp->write_seq && likely(!tp->repair))
 		tp->write_seq = secure_tcpv6_sequence_number(np->saddr.s6_addr32,
@@ -1090,7 +1090,7 @@ static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 	newsk->sk_v6_rcv_saddr = ireq->ir_v6_loc_addr;
 	newsk->sk_bound_dev_if = ireq->ir_iif;
 
-	ip6_set_txhash(newsk);
+	sk_set_txhash(newsk);
 
 	/* Now IPv6 options...
 
-- 
cgit v1.2.3


From 265f94ff54d62503663d9c788ba1f082e448f8b8 Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Tue, 28 Jul 2015 16:02:06 -0700
Subject: net: Recompute sk_txhash on negative routing advice

When a connection is failing a transport protocol calls
dst_negative_advice to try to get a better route. This patch includes
changing the sk_txhash in that function. This provides a rudimentary
method to try to find a different path in the network since sk_txhash
affects ECMP on the local host and through the network (via flow labels
or UDP source port in encapsulation).

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/net')

diff --git a/include/net/sock.h b/include/net/sock.h
index fe735c4841f6..24aa75c5317a 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1695,6 +1695,12 @@ static inline void sk_set_txhash(struct sock *sk)
 		sk->sk_txhash = 1;
 }
 
+static inline void sk_rethink_txhash(struct sock *sk)
+{
+	if (sk->sk_txhash)
+		sk_set_txhash(sk);
+}
+
 static inline struct dst_entry *
 __sk_dst_get(struct sock *sk)
 {
@@ -1719,6 +1725,8 @@ static inline void dst_negative_advice(struct sock *sk)
 {
 	struct dst_entry *ndst, *dst = __sk_dst_get(sk);
 
+	sk_rethink_txhash(sk);
+
 	if (dst && dst->ops->negative_advice) {
 		ndst = dst->ops->negative_advice(dst);
 
-- 
cgit v1.2.3


From 92a99bf3bae7c1267db87bb3e3babda2c6dcc8a7 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Wed, 29 Jul 2015 09:45:40 +0200
Subject: lwtunnel: Make lwtun_encaps[] static

Any external user should use the registration API instead of
accessing this directly.

Cc: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: Thomas Graf <tgraf@suug.ch>
Acked-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/lwtunnel.h | 3 ---
 net/core/lwtunnel.c    | 2 +-
 2 files changed, 1 insertion(+), 4 deletions(-)

(limited to 'include/net')

diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h
index b02039081b04..33bd30963a95 100644
--- a/include/net/lwtunnel.h
+++ b/include/net/lwtunnel.h
@@ -31,9 +31,6 @@ struct lwtunnel_encap_ops {
 	int (*cmp_encap)(struct lwtunnel_state *a, struct lwtunnel_state *b);
 };
 
-extern const struct lwtunnel_encap_ops __rcu *
-		lwtun_encaps[LWTUNNEL_ENCAP_MAX+1];
-
 #ifdef CONFIG_LWTUNNEL
 static inline struct lwtunnel_state *
 lwtstate_get(struct lwtunnel_state *lws)
diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c
index 5f7fae70ef68..c240c895b319 100644
--- a/net/core/lwtunnel.c
+++ b/net/core/lwtunnel.c
@@ -37,7 +37,7 @@ struct lwtunnel_state *lwtunnel_state_alloc(int encap_len)
 }
 EXPORT_SYMBOL(lwtunnel_state_alloc);
 
-const struct lwtunnel_encap_ops __rcu *
+static const struct lwtunnel_encap_ops __rcu *
 		lwtun_encaps[LWTUNNEL_ENCAP_MAX + 1] __read_mostly;
 
 int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *ops,
-- 
cgit v1.2.3


From d3aa45ce6b94c65b83971257317867db13e5f492 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@plumgrid.com>
Date: Thu, 30 Jul 2015 15:36:57 -0700
Subject: bpf: add helpers to access tunnel metadata

Introduce helpers to let eBPF programs attached to TC manipulate tunnel metadata:
bpf_skb_[gs]et_tunnel_key(skb, key, size, flags)
skb: pointer to skb
key: pointer to 'struct bpf_tunnel_key'
size: size of 'struct bpf_tunnel_key'
flags: room for future extensions

First eBPF program that uses these helpers will allocate per_cpu
metadata_dst structures that will be used on TX.
On RX metadata_dst is allocated by tunnel driver.

Typical usage for TX:
struct bpf_tunnel_key tkey;
... populate tkey ...
bpf_skb_set_tunnel_key(skb, &tkey, sizeof(tkey), 0);
bpf_clone_redirect(skb, vxlan_dev_ifindex, 0);

RX:
struct bpf_tunnel_key tkey = {};
bpf_skb_get_tunnel_key(skb, &tkey, sizeof(tkey), 0);
... lookup or redirect based on tkey ...

'struct bpf_tunnel_key' will be extended in the future by adding
elements to the end and the 'size' argument will indicate which fields
are populated, thereby keeping backwards compatibility.
The 'flags' argument may be used as well when the 'size' is not enough or
to indicate completely different layout of bpf_tunnel_key.

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Acked-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dst_metadata.h |  1 +
 include/uapi/linux/bpf.h   | 17 ++++++++++
 net/core/dst.c             | 35 +++++++++++++++++----
 net/core/filter.c          | 77 ++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 124 insertions(+), 6 deletions(-)

(limited to 'include/net')

diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h
index 7b0306894663..075f523ff23f 100644
--- a/include/net/dst_metadata.h
+++ b/include/net/dst_metadata.h
@@ -51,5 +51,6 @@ static inline bool skb_valid_dst(const struct sk_buff *skb)
 }
 
 struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags);
+struct metadata_dst __percpu *metadata_dst_alloc_percpu(u8 optslen, gfp_t flags);
 
 #endif /* __NET_DST_METADATA_H */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 2f6c83d714e9..bc0d27d3fbdd 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -258,6 +258,18 @@ enum bpf_func_id {
 	BPF_FUNC_get_cgroup_classid,
 	BPF_FUNC_skb_vlan_push, /* bpf_skb_vlan_push(skb, vlan_proto, vlan_tci) */
 	BPF_FUNC_skb_vlan_pop,  /* bpf_skb_vlan_pop(skb) */
+
+	/**
+	 * bpf_skb_[gs]et_tunnel_key(skb, key, size, flags)
+	 * retrieve or populate tunnel metadata
+	 * @skb: pointer to skb
+	 * @key: pointer to 'struct bpf_tunnel_key'
+	 * @size: size of 'struct bpf_tunnel_key'
+	 * @flags: room for future extensions
+	 * Retrun: 0 on success
+	 */
+	BPF_FUNC_skb_get_tunnel_key,
+	BPF_FUNC_skb_set_tunnel_key,
 	__BPF_FUNC_MAX_ID,
 };
 
@@ -280,4 +292,9 @@ struct __sk_buff {
 	__u32 cb[5];
 };
 
+struct bpf_tunnel_key {
+	__u32 tunnel_id;
+	__u32 remote_ipv4;
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/net/core/dst.c b/net/core/dst.c
index 76a617f6d60a..f8694d1b8702 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -362,15 +362,10 @@ static int dst_md_discard(struct sk_buff *skb)
 	return 0;
 }
 
-struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags)
+static void __metadata_dst_init(struct metadata_dst *md_dst, u8 optslen)
 {
-	struct metadata_dst *md_dst;
 	struct dst_entry *dst;
 
-	md_dst = kmalloc(sizeof(*md_dst) + optslen, flags);
-	if (!md_dst)
-		return ERR_PTR(-ENOMEM);
-
 	dst = &md_dst->dst;
 	dst_init(dst, &md_dst_ops, NULL, 1, DST_OBSOLETE_NONE,
 		 DST_METADATA | DST_NOCACHE | DST_NOCOUNT);
@@ -380,11 +375,39 @@ struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags)
 
 	memset(dst + 1, 0, sizeof(*md_dst) + optslen - sizeof(*dst));
 	md_dst->opts_len = optslen;
+}
+
+struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags)
+{
+	struct metadata_dst *md_dst;
+
+	md_dst = kmalloc(sizeof(*md_dst) + optslen, flags);
+	if (!md_dst)
+		return NULL;
+
+	__metadata_dst_init(md_dst, optslen);
 
 	return md_dst;
 }
 EXPORT_SYMBOL_GPL(metadata_dst_alloc);
 
+struct metadata_dst __percpu *metadata_dst_alloc_percpu(u8 optslen, gfp_t flags)
+{
+	int cpu;
+	struct metadata_dst __percpu *md_dst;
+
+	md_dst = __alloc_percpu_gfp(sizeof(struct metadata_dst) + optslen,
+				    __alignof__(struct metadata_dst), flags);
+	if (!md_dst)
+		return NULL;
+
+	for_each_possible_cpu(cpu)
+		__metadata_dst_init(per_cpu_ptr(md_dst, cpu), optslen);
+
+	return md_dst;
+}
+EXPORT_SYMBOL_GPL(metadata_dst_alloc_percpu);
+
 /* Dirty hack. We did it in 2.2 (in __dst_free),
  * we have _very_ good reasons not to repeat
  * this mistake in 2.3, but we have no choice
diff --git a/net/core/filter.c b/net/core/filter.c
index 786722a9c6f2..1b72264ff2ee 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -48,6 +48,7 @@
 #include <linux/bpf.h>
 #include <net/sch_generic.h>
 #include <net/cls_cgroup.h>
+#include <net/dst_metadata.h>
 
 /**
  *	sk_filter - run a packet through a socket filter
@@ -1483,6 +1484,78 @@ bool bpf_helper_changes_skb_data(void *func)
 	return false;
 }
 
+static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
+{
+	struct sk_buff *skb = (struct sk_buff *) (long) r1;
+	struct bpf_tunnel_key *to = (struct bpf_tunnel_key *) (long) r2;
+	struct ip_tunnel_info *info = skb_tunnel_info(skb, AF_INET);
+
+	if (unlikely(size != sizeof(struct bpf_tunnel_key) || flags || !info))
+		return -EINVAL;
+
+	to->tunnel_id = be64_to_cpu(info->key.tun_id);
+	to->remote_ipv4 = be32_to_cpu(info->key.ipv4_src);
+
+	return 0;
+}
+
+const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = {
+	.func		= bpf_skb_get_tunnel_key,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_STACK,
+	.arg3_type	= ARG_CONST_STACK_SIZE,
+	.arg4_type	= ARG_ANYTHING,
+};
+
+static struct metadata_dst __percpu *md_dst;
+
+static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
+{
+	struct sk_buff *skb = (struct sk_buff *) (long) r1;
+	struct bpf_tunnel_key *from = (struct bpf_tunnel_key *) (long) r2;
+	struct metadata_dst *md = this_cpu_ptr(md_dst);
+	struct ip_tunnel_info *info;
+
+	if (unlikely(size != sizeof(struct bpf_tunnel_key) || flags))
+		return -EINVAL;
+
+	skb_dst_drop(skb);
+	dst_hold((struct dst_entry *) md);
+	skb_dst_set(skb, (struct dst_entry *) md);
+
+	info = &md->u.tun_info;
+	info->mode = IP_TUNNEL_INFO_TX;
+	info->key.tun_id = cpu_to_be64(from->tunnel_id);
+	info->key.ipv4_dst = cpu_to_be32(from->remote_ipv4);
+
+	return 0;
+}
+
+const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = {
+	.func		= bpf_skb_set_tunnel_key,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_STACK,
+	.arg3_type	= ARG_CONST_STACK_SIZE,
+	.arg4_type	= ARG_ANYTHING,
+};
+
+static const struct bpf_func_proto *bpf_get_skb_set_tunnel_key_proto(void)
+{
+	if (!md_dst) {
+		/* race is not possible, since it's called from
+		 * verifier that is holding verifier mutex
+		 */
+		md_dst = metadata_dst_alloc_percpu(0, GFP_KERNEL);
+		if (!md_dst)
+			return NULL;
+	}
+	return &bpf_skb_set_tunnel_key_proto;
+}
+
 static const struct bpf_func_proto *
 sk_filter_func_proto(enum bpf_func_id func_id)
 {
@@ -1526,6 +1599,10 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
 		return &bpf_skb_vlan_push_proto;
 	case BPF_FUNC_skb_vlan_pop:
 		return &bpf_skb_vlan_pop_proto;
+	case BPF_FUNC_skb_get_tunnel_key:
+		return &bpf_skb_get_tunnel_key_proto;
+	case BPF_FUNC_skb_set_tunnel_key:
+		return bpf_get_skb_set_tunnel_key_proto();
 	default:
 		return sk_filter_func_proto(func_id);
 	}
-- 
cgit v1.2.3


From 343d60aada5a358ca186d6e9e353230379c426d8 Mon Sep 17 00:00:00 2001
From: Roopa Prabhu <roopa@cumulusnetworks.com>
Date: Thu, 30 Jul 2015 13:34:53 -0700
Subject: ipv6: change ipv6_stub_impl.ipv6_dst_lookup to take net argument

This patch adds net argument to ipv6_stub_impl.ipv6_dst_lookup
for use cases where sk is not available (like mpls).
sk appears to be needed to get the namespace 'net' and is optional
otherwise. This patch series changes ipv6_stub_impl.ipv6_dst_lookup
to take net argument. sk remains optional.

All callers of ipv6_stub_impl.ipv6_dst_lookup have been modified
to pass net. I have modified them to use already available
'net' in the scope of the call. I can change them to
sock_net(sk) to avoid any unintended change in behaviour if sock
namespace is different. They dont seem to be from code inspection.

Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vxlan.c      |  2 +-
 include/net/addrconf.h   |  4 ++--
 include/net/ipv6.h       |  3 ++-
 net/ipv6/addrconf_core.c | 11 ++++++++++-
 net/ipv6/icmp.c          |  6 +++---
 net/ipv6/ip6_output.c    | 12 ++++++------
 net/tipc/udp_media.c     |  3 ++-
 7 files changed, 26 insertions(+), 15 deletions(-)

(limited to 'include/net')

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 81f0f24b2cfb..beed5d4025a3 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -2034,7 +2034,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 		fl6.flowi6_mark = skb->mark;
 		fl6.flowi6_proto = IPPROTO_UDP;
 
-		if (ipv6_stub->ipv6_dst_lookup(sk, &ndst, &fl6)) {
+		if (ipv6_stub->ipv6_dst_lookup(vxlan->net, sk, &ndst, &fl6)) {
 			netdev_dbg(dev, "no route to %pI6\n",
 				   &dst->sin6.sin6_addr);
 			dev->stats.tx_carrier_errors++;
diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index def59d3a34d5..0c3ac5acb85f 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -158,8 +158,8 @@ struct ipv6_stub {
 				 const struct in6_addr *addr);
 	int (*ipv6_sock_mc_drop)(struct sock *sk, int ifindex,
 				 const struct in6_addr *addr);
-	int (*ipv6_dst_lookup)(struct sock *sk, struct dst_entry **dst,
-				struct flowi6 *fl6);
+	int (*ipv6_dst_lookup)(struct net *net, struct sock *sk,
+			       struct dst_entry **dst, struct flowi6 *fl6);
 	void (*udpv6_encap_enable)(void);
 	void (*ndisc_send_na)(struct net_device *dev, struct neighbour *neigh,
 			      const struct in6_addr *daddr,
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 7c79798bcaab..eecdfc92f807 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -813,7 +813,8 @@ static inline struct sk_buff *ip6_finish_skb(struct sock *sk)
 			      &inet6_sk(sk)->cork);
 }
 
-int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6);
+int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
+		   struct flowi6 *fl6);
 struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
 				      const struct in6_addr *final_dst);
 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c
index ca09bf49ac68..bfa941fc1165 100644
--- a/net/ipv6/addrconf_core.c
+++ b/net/ipv6/addrconf_core.c
@@ -107,7 +107,16 @@ int inet6addr_notifier_call_chain(unsigned long val, void *v)
 }
 EXPORT_SYMBOL(inet6addr_notifier_call_chain);
 
-const struct ipv6_stub *ipv6_stub __read_mostly;
+static int eafnosupport_ipv6_dst_lookup(struct net *net, struct sock *u1,
+					struct dst_entry **u2,
+					struct flowi6 *u3)
+{
+	return -EAFNOSUPPORT;
+}
+
+const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) {
+	.ipv6_dst_lookup = eafnosupport_ipv6_dst_lookup,
+};
 EXPORT_SYMBOL_GPL(ipv6_stub);
 
 /* IPv6 Wildcard Address and Loopback Address defined by RFC2553 */
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 713d7434c911..6c2b2132c8d3 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -329,7 +329,7 @@ static struct dst_entry *icmpv6_route_lookup(struct net *net,
 	struct flowi6 fl2;
 	int err;
 
-	err = ip6_dst_lookup(sk, &dst, fl6);
+	err = ip6_dst_lookup(net, sk, &dst, fl6);
 	if (err)
 		return ERR_PTR(err);
 
@@ -361,7 +361,7 @@ static struct dst_entry *icmpv6_route_lookup(struct net *net,
 	if (err)
 		goto relookup_failed;
 
-	err = ip6_dst_lookup(sk, &dst2, &fl2);
+	err = ip6_dst_lookup(net, sk, &dst2, &fl2);
 	if (err)
 		goto relookup_failed;
 
@@ -591,7 +591,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
 	else if (!fl6.flowi6_oif)
 		fl6.flowi6_oif = np->ucast_oif;
 
-	err = ip6_dst_lookup(sk, &dst, &fl6);
+	err = ip6_dst_lookup(net, sk, &dst, &fl6);
 	if (err)
 		goto out;
 	dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), sk, 0);
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index c5fc85286ef6..92b7cf0dc1f9 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -881,10 +881,9 @@ out:
 	return dst;
 }
 
-static int ip6_dst_lookup_tail(struct sock *sk,
+static int ip6_dst_lookup_tail(struct net *net, struct sock *sk,
 			       struct dst_entry **dst, struct flowi6 *fl6)
 {
-	struct net *net = sock_net(sk);
 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
 	struct neighbour *n;
 	struct rt6_info *rt;
@@ -994,10 +993,11 @@ out_err_release:
  *
  *	It returns zero on success, or a standard errno code on error.
  */
-int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
+int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
+		   struct flowi6 *fl6)
 {
 	*dst = NULL;
-	return ip6_dst_lookup_tail(sk, dst, fl6);
+	return ip6_dst_lookup_tail(net, sk, dst, fl6);
 }
 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
 
@@ -1018,7 +1018,7 @@ struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
 	struct dst_entry *dst = NULL;
 	int err;
 
-	err = ip6_dst_lookup_tail(sk, &dst, fl6);
+	err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
 	if (err)
 		return ERR_PTR(err);
 	if (final_dst)
@@ -1052,7 +1052,7 @@ struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
 
 	dst = ip6_sk_dst_check(sk, dst, fl6);
 
-	err = ip6_dst_lookup_tail(sk, &dst, fl6);
+	err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
 	if (err)
 		return ERR_PTR(err);
 	if (final_dst)
diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c
index 66deebc66aa1..c170d3138953 100644
--- a/net/tipc/udp_media.c
+++ b/net/tipc/udp_media.c
@@ -194,7 +194,8 @@ static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb,
 			.saddr = src->ipv6,
 			.flowi6_proto = IPPROTO_UDP
 		};
-		err = ipv6_stub->ipv6_dst_lookup(ub->ubsock->sk, &ndst, &fl6);
+		err = ipv6_stub->ipv6_dst_lookup(net, ub->ubsock->sk, &ndst,
+						 &fl6);
 		if (err)
 			goto tx_error;
 		ttl = ip6_dst_hoplimit(ndst);
-- 
cgit v1.2.3


From 67800f9b1f4eb5bbefc32e3f5044097354bc85b3 Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Fri, 31 Jul 2015 16:52:11 -0700
Subject: ipv6: Call skb_get_hash_flowi6 to get skb->hash in ip6_make_flowlabel

We can't call skb_get_hash here since the packet is not complete to do
flow_dissector. Create hash based on flowi6 instead.

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ipv6.h    | 5 +++--
 net/ipv6/ip6_gre.c    | 5 +++--
 net/ipv6/ip6_output.c | 4 ++--
 net/ipv6/ip6_tunnel.c | 2 +-
 4 files changed, 9 insertions(+), 7 deletions(-)

(limited to 'include/net')

diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index eecdfc92f807..3e334b33ef3a 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -708,12 +708,13 @@ static inline void iph_to_flow_copy_v6addrs(struct flow_keys *flow,
 
 #if IS_ENABLED(CONFIG_IPV6)
 static inline __be32 ip6_make_flowlabel(struct net *net, struct sk_buff *skb,
-					__be32 flowlabel, bool autolabel)
+					__be32 flowlabel, bool autolabel,
+					struct flowi6 *fl6)
 {
 	if (!flowlabel && (autolabel || net->ipv6.sysctl.auto_flowlabels)) {
 		u32 hash;
 
-		hash = skb_get_hash(skb);
+		hash = skb_get_hash_flowi6(skb, fl6);
 
 		/* Since this is being sent on the wire obfuscate hash a bit
 		 * to minimize possbility that any useful information to an
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index a38d3ac0f18f..a7d1ca2337a9 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -728,7 +728,7 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb,
 	 */
 	ipv6h = ipv6_hdr(skb);
 	ip6_flow_hdr(ipv6h, INET_ECN_encapsulate(0, dsfield),
-		     ip6_make_flowlabel(net, skb, fl6->flowlabel, false));
+		     ip6_make_flowlabel(net, skb, fl6->flowlabel, false, fl6));
 	ipv6h->hop_limit = tunnel->parms.hop_limit;
 	ipv6h->nexthdr = proto;
 	ipv6h->saddr = fl6->saddr;
@@ -1182,7 +1182,8 @@ static int ip6gre_header(struct sk_buff *skb, struct net_device *dev,
 
 	ip6_flow_hdr(ipv6h, 0,
 		     ip6_make_flowlabel(dev_net(dev), skb,
-					t->fl.u.ip6.flowlabel, false));
+					t->fl.u.ip6.flowlabel, false,
+					&t->fl.u.ip6));
 	ipv6h->hop_limit = t->parms.hop_limit;
 	ipv6h->nexthdr = NEXTHDR_GRE;
 	ipv6h->saddr = t->parms.laddr;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 92b7cf0dc1f9..26ea47930740 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -207,7 +207,7 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
 		hlimit = ip6_dst_hoplimit(dst);
 
 	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
-						     np->autoflowlabel));
+						     np->autoflowlabel, fl6));
 
 	hdr->payload_len = htons(seg_len);
 	hdr->nexthdr = proto;
@@ -1649,7 +1649,7 @@ struct sk_buff *__ip6_make_skb(struct sock *sk,
 
 	ip6_flow_hdr(hdr, v6_cork->tclass,
 		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
-					np->autoflowlabel));
+					np->autoflowlabel, fl6));
 	hdr->hop_limit = v6_cork->hop_limit;
 	hdr->nexthdr = proto;
 	hdr->saddr = fl6->saddr;
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 2e67b660118b..54e694c4af0e 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -1095,7 +1095,7 @@ static int ip6_tnl_xmit2(struct sk_buff *skb,
 	skb_reset_network_header(skb);
 	ipv6h = ipv6_hdr(skb);
 	ip6_flow_hdr(ipv6h, INET_ECN_encapsulate(0, dsfield),
-		     ip6_make_flowlabel(net, skb, fl6->flowlabel, false));
+		     ip6_make_flowlabel(net, skb, fl6->flowlabel, false, fl6));
 	ipv6h->hop_limit = t->parms.hop_limit;
 	ipv6h->nexthdr = proto;
 	ipv6h->saddr = fl6->saddr;
-- 
cgit v1.2.3


From 42240901f7c438636715b9cb6ed93f4441ffc091 Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Fri, 31 Jul 2015 16:52:12 -0700
Subject: ipv6: Implement different admin modes for automatic flow labels

Change the meaning of net.ipv6.auto_flowlabels to provide a mode for
automatic flow labels generation. There are four modes:

0: flow labels are disabled
1: flow labels are enabled, sockets can opt-out
2: flow labels are allowed, sockets can opt-in
3: flow labels are enabled and enforced, no opt-out for sockets

np->autoflowlabel is initialized according to the sysctl value.

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt | 20 ++++++++----
 include/net/ipv6.h                     | 59 ++++++++++++++++++++++++++--------
 net/ipv6/af_inet6.c                    |  3 +-
 net/ipv6/ip6_gre.c                     |  4 +--
 net/ipv6/ip6_tunnel.c                  |  2 +-
 net/ipv6/sysctl_net_ipv6.c             |  7 +++-
 6 files changed, 70 insertions(+), 25 deletions(-)

(limited to 'include/net')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 00d26d919459..9ac3af3ab739 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1215,14 +1215,20 @@ flowlabel_consistency - BOOLEAN
 	FALSE: disabled
 	Default: TRUE
 
-auto_flowlabels - BOOLEAN
-	Automatically generate flow labels based based on a flow hash
-	of the packet. This allows intermediate devices, such as routers,
-	to idenfify packet flows for mechanisms like Equal Cost Multipath
+auto_flowlabels - INTEGER
+	Automatically generate flow labels based on a flow hash of the
+	packet. This allows intermediate devices, such as routers, to
+	identify packet flows for mechanisms like Equal Cost Multipath
 	Routing (see RFC 6438).
-	TRUE: enabled
-	FALSE: disabled
-	Default: false
+	0: automatic flow labels are completely disabled
+	1: automatic flow labels are enabled by default, they can be
+	   disabled on a per socket basis using the IPV6_AUTOFLOWLABEL
+	   socket option
+	2: automatic flow labels are allowed, they may be enabled on a
+	   per socket basis using the IPV6_AUTOFLOWLABEL socket option
+	3: automatic flow labels are enabled and enforced, they cannot
+	   be disabled by the socket option
+	Default: 0
 
 flowlabel_state_ranges - BOOLEAN
 	Split the flow label number space into two ranges. 0-0x7FFFF is
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 3e334b33ef3a..c02c1c03363a 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -707,36 +707,69 @@ static inline void iph_to_flow_copy_v6addrs(struct flow_keys *flow,
 }
 
 #if IS_ENABLED(CONFIG_IPV6)
+
+/* Sysctl settings for net ipv6.auto_flowlabels */
+#define IP6_AUTO_FLOW_LABEL_OFF		0
+#define IP6_AUTO_FLOW_LABEL_OPTOUT	1
+#define IP6_AUTO_FLOW_LABEL_OPTIN	2
+#define IP6_AUTO_FLOW_LABEL_FORCED	3
+
+#define IP6_AUTO_FLOW_LABEL_MAX		IP6_AUTO_FLOW_LABEL_FORCED
+
+#define IP6_DEFAULT_AUTO_FLOW_LABELS	IP6_AUTO_FLOW_LABEL_OFF
+
 static inline __be32 ip6_make_flowlabel(struct net *net, struct sk_buff *skb,
 					__be32 flowlabel, bool autolabel,
 					struct flowi6 *fl6)
 {
-	if (!flowlabel && (autolabel || net->ipv6.sysctl.auto_flowlabels)) {
-		u32 hash;
+	u32 hash;
 
-		hash = skb_get_hash_flowi6(skb, fl6);
+	if (flowlabel ||
+	    net->ipv6.sysctl.auto_flowlabels == IP6_AUTO_FLOW_LABEL_OFF ||
+	    (!autolabel &&
+	     net->ipv6.sysctl.auto_flowlabels != IP6_AUTO_FLOW_LABEL_FORCED))
+		return flowlabel;
 
-		/* Since this is being sent on the wire obfuscate hash a bit
-		 * to minimize possbility that any useful information to an
-		 * attacker is leaked. Only lower 20 bits are relevant.
-		 */
-		hash ^= hash >> 12;
+	hash = skb_get_hash_flowi6(skb, fl6);
 
-		flowlabel = (__force __be32)hash & IPV6_FLOWLABEL_MASK;
+	/* Since this is being sent on the wire obfuscate hash a bit
+	 * to minimize possbility that any useful information to an
+	 * attacker is leaked. Only lower 20 bits are relevant.
+	 */
+	rol32(hash, 16);
 
-		if (net->ipv6.sysctl.flowlabel_state_ranges)
-			flowlabel |= IPV6_FLOWLABEL_STATELESS_FLAG;
-	}
+	flowlabel = (__force __be32)hash & IPV6_FLOWLABEL_MASK;
+
+	if (net->ipv6.sysctl.flowlabel_state_ranges)
+		flowlabel |= IPV6_FLOWLABEL_STATELESS_FLAG;
 
 	return flowlabel;
 }
+
+static inline int ip6_default_np_autolabel(struct net *net)
+{
+	switch (net->ipv6.sysctl.auto_flowlabels) {
+	case IP6_AUTO_FLOW_LABEL_OFF:
+	case IP6_AUTO_FLOW_LABEL_OPTIN:
+	default:
+		return 0;
+	case IP6_AUTO_FLOW_LABEL_OPTOUT:
+	case IP6_AUTO_FLOW_LABEL_FORCED:
+		return 1;
+	}
+}
 #else
 static inline void ip6_set_txhash(struct sock *sk) { }
 static inline __be32 ip6_make_flowlabel(struct net *net, struct sk_buff *skb,
-					__be32 flowlabel, bool autolabel)
+					__be32 flowlabel, bool autolabel,
+					struct flowi6 *fl6)
 {
 	return flowlabel;
 }
+static inline int ip6_default_np_autolabel(struct net *net)
+{
+	return 0;
+}
 #endif
 
 
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 7bc92ea4ae8f..3f0ae3a7c0b1 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -197,6 +197,7 @@ lookup_protocol:
 	np->mcast_hops	= IPV6_DEFAULT_MCASTHOPS;
 	np->mc_loop	= 1;
 	np->pmtudisc	= IPV6_PMTUDISC_WANT;
+	np->autoflowlabel = ip6_default_np_autolabel(sock_net(sk));
 	sk->sk_ipv6only	= net->ipv6.sysctl.bindv6only;
 
 	/* Init the ipv4 part of the socket since we can have sockets
@@ -767,7 +768,7 @@ static int __net_init inet6_net_init(struct net *net)
 	net->ipv6.sysctl.bindv6only = 0;
 	net->ipv6.sysctl.icmpv6_time = 1*HZ;
 	net->ipv6.sysctl.flowlabel_consistency = 1;
-	net->ipv6.sysctl.auto_flowlabels = 0;
+	net->ipv6.sysctl.auto_flowlabels = IP6_DEFAULT_AUTO_FLOW_LABELS;
 	net->ipv6.sysctl.idgen_retries = 3;
 	net->ipv6.sysctl.idgen_delay = 1 * HZ;
 	net->ipv6.sysctl.flowlabel_state_ranges = 1;
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index a7d1ca2337a9..34f121812a14 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -728,7 +728,7 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb,
 	 */
 	ipv6h = ipv6_hdr(skb);
 	ip6_flow_hdr(ipv6h, INET_ECN_encapsulate(0, dsfield),
-		     ip6_make_flowlabel(net, skb, fl6->flowlabel, false, fl6));
+		     ip6_make_flowlabel(net, skb, fl6->flowlabel, true, fl6));
 	ipv6h->hop_limit = tunnel->parms.hop_limit;
 	ipv6h->nexthdr = proto;
 	ipv6h->saddr = fl6->saddr;
@@ -1182,7 +1182,7 @@ static int ip6gre_header(struct sk_buff *skb, struct net_device *dev,
 
 	ip6_flow_hdr(ipv6h, 0,
 		     ip6_make_flowlabel(dev_net(dev), skb,
-					t->fl.u.ip6.flowlabel, false,
+					t->fl.u.ip6.flowlabel, true,
 					&t->fl.u.ip6));
 	ipv6h->hop_limit = t->parms.hop_limit;
 	ipv6h->nexthdr = NEXTHDR_GRE;
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 54e694c4af0e..b0ab420612bc 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -1095,7 +1095,7 @@ static int ip6_tnl_xmit2(struct sk_buff *skb,
 	skb_reset_network_header(skb);
 	ipv6h = ipv6_hdr(skb);
 	ip6_flow_hdr(ipv6h, INET_ECN_encapsulate(0, dsfield),
-		     ip6_make_flowlabel(net, skb, fl6->flowlabel, false, fl6));
+		     ip6_make_flowlabel(net, skb, fl6->flowlabel, true, fl6));
 	ipv6h->hop_limit = t->parms.hop_limit;
 	ipv6h->nexthdr = proto;
 	ipv6h->saddr = fl6->saddr;
diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c
index db48aebd9c47..45243bbe5253 100644
--- a/net/ipv6/sysctl_net_ipv6.c
+++ b/net/ipv6/sysctl_net_ipv6.c
@@ -17,6 +17,9 @@
 #include <net/inet_frag.h>
 
 static int one = 1;
+static int auto_flowlabels_min;
+static int auto_flowlabels_max = IP6_AUTO_FLOW_LABEL_MAX;
+
 
 static struct ctl_table ipv6_table_template[] = {
 	{
@@ -45,7 +48,9 @@ static struct ctl_table ipv6_table_template[] = {
 		.data		= &init_net.ipv6.sysctl.auto_flowlabels,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &auto_flowlabels_min,
+		.extra2		= &auto_flowlabels_max
 	},
 	{
 		.procname	= "fwmark_reflect",
-- 
cgit v1.2.3


From b56774163f994efce3f5603f35aa4e677c3e725a Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Fri, 31 Jul 2015 16:52:14 -0700
Subject: ipv6: Enable auto flow labels by default

Initialize auto_flowlabels to one. This enables automatic flow labels,
individual socket may disable them using the IPV6_AUTOFLOWLABEL socket
option.

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt | 2 +-
 include/net/ipv6.h                     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 9ac3af3ab739..56db1efd7189 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1228,7 +1228,7 @@ auto_flowlabels - INTEGER
 	   per socket basis using the IPV6_AUTOFLOWLABEL socket option
 	3: automatic flow labels are enabled and enforced, they cannot
 	   be disabled by the socket option
-	Default: 0
+	Default: 1
 
 flowlabel_state_ranges - BOOLEAN
 	Split the flow label number space into two ranges. 0-0x7FFFF is
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index c02c1c03363a..711cca428cc8 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -716,7 +716,7 @@ static inline void iph_to_flow_copy_v6addrs(struct flow_keys *flow,
 
 #define IP6_AUTO_FLOW_LABEL_MAX		IP6_AUTO_FLOW_LABEL_FORCED
 
-#define IP6_DEFAULT_AUTO_FLOW_LABELS	IP6_AUTO_FLOW_LABEL_OFF
+#define IP6_DEFAULT_AUTO_FLOW_LABELS	IP6_AUTO_FLOW_LABEL_OPTOUT
 
 static inline __be32 ip6_make_flowlabel(struct net *net, struct sk_buff *skb,
 					__be32 flowlabel, bool autolabel,
-- 
cgit v1.2.3


From bbde9fc1824aab58bc78c084163007dd6c03fe5b Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Sun, 31 May 2015 17:54:44 +0200
Subject: netfilter: factor out packet duplication for IPv4/IPv6

Extracted from the xtables TEE target. This creates two new modules for IPv4
and IPv6 that are shared between the TEE target and the new nf_tables dup
expressions.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/ipv4/nf_dup_ipv4.h |   7 ++
 include/net/netfilter/ipv6/nf_dup_ipv6.h |   7 ++
 net/ipv4/netfilter/Kconfig               |   6 ++
 net/ipv4/netfilter/Makefile              |   2 +
 net/ipv4/netfilter/nf_dup_ipv4.c         | 120 +++++++++++++++++++++++
 net/ipv6/netfilter/Kconfig               |   6 ++
 net/ipv6/netfilter/Makefile              |   2 +
 net/ipv6/netfilter/nf_dup_ipv6.c         |  96 +++++++++++++++++++
 net/netfilter/Kconfig                    |   2 +
 net/netfilter/xt_TEE.c                   | 158 ++-----------------------------
 10 files changed, 254 insertions(+), 152 deletions(-)
 create mode 100644 include/net/netfilter/ipv4/nf_dup_ipv4.h
 create mode 100644 include/net/netfilter/ipv6/nf_dup_ipv6.h
 create mode 100644 net/ipv4/netfilter/nf_dup_ipv4.c
 create mode 100644 net/ipv6/netfilter/nf_dup_ipv6.c

(limited to 'include/net')

diff --git a/include/net/netfilter/ipv4/nf_dup_ipv4.h b/include/net/netfilter/ipv4/nf_dup_ipv4.h
new file mode 100644
index 000000000000..42008f10dfc4
--- /dev/null
+++ b/include/net/netfilter/ipv4/nf_dup_ipv4.h
@@ -0,0 +1,7 @@
+#ifndef _NF_DUP_IPV4_H_
+#define _NF_DUP_IPV4_H_
+
+void nf_dup_ipv4(struct sk_buff *skb, unsigned int hooknum,
+		 const struct in_addr *gw, int oif);
+
+#endif /* _NF_DUP_IPV4_H_ */
diff --git a/include/net/netfilter/ipv6/nf_dup_ipv6.h b/include/net/netfilter/ipv6/nf_dup_ipv6.h
new file mode 100644
index 000000000000..ed6bd66fa5a0
--- /dev/null
+++ b/include/net/netfilter/ipv6/nf_dup_ipv6.h
@@ -0,0 +1,7 @@
+#ifndef _NF_DUP_IPV6_H_
+#define _NF_DUP_IPV6_H_
+
+void nf_dup_ipv6(struct sk_buff *skb, unsigned int hooknum,
+		 const struct in6_addr *gw, int oif);
+
+#endif /* _NF_DUP_IPV6_H_ */
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 2199a5db25e6..0142ea259d7d 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -67,6 +67,12 @@ config NF_TABLES_ARP
 
 endif # NF_TABLES
 
+config NF_DUP_IPV4
+	tristate "Netfilter IPv4 packet duplication to alternate destination"
+	help
+	  This option enables the nf_dup_ipv4 core, which duplicates an IPv4
+	  packet to be rerouted to another destination.
+
 config NF_LOG_ARP
 	tristate "ARP packet logging"
 	default m if NETFILTER_ADVANCED=n
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 7fe6c703528f..9136ffc2d474 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -70,3 +70,5 @@ obj-$(CONFIG_IP_NF_ARP_MANGLE) += arpt_mangle.o
 
 # just filtering instance of ARP tables for now
 obj-$(CONFIG_IP_NF_ARPFILTER) += arptable_filter.o
+
+obj-$(CONFIG_NF_DUP_IPV4) += nf_dup_ipv4.o
diff --git a/net/ipv4/netfilter/nf_dup_ipv4.c b/net/ipv4/netfilter/nf_dup_ipv4.c
new file mode 100644
index 000000000000..eff85ab3f47d
--- /dev/null
+++ b/net/ipv4/netfilter/nf_dup_ipv4.c
@@ -0,0 +1,120 @@
+/*
+ * (C) 2007 by Sebastian Claßen <sebastian.classen@freenet.ag>
+ * (C) 2007-2010 by Jan Engelhardt <jengelh@medozas.de>
+ *
+ * Extracted from xt_TEE.c
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 or later, as
+ * published by the Free Software Foundation.
+ */
+#include <linux/ip.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/route.h>
+#include <linux/skbuff.h>
+#include <net/checksum.h>
+#include <net/icmp.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <net/netfilter/ipv4/nf_dup_ipv4.h>
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+#include <net/netfilter/nf_conntrack.h>
+#endif
+
+static struct net *pick_net(struct sk_buff *skb)
+{
+#ifdef CONFIG_NET_NS
+	const struct dst_entry *dst;
+
+	if (skb->dev != NULL)
+		return dev_net(skb->dev);
+	dst = skb_dst(skb);
+	if (dst != NULL && dst->dev != NULL)
+		return dev_net(dst->dev);
+#endif
+	return &init_net;
+}
+
+static bool nf_dup_ipv4_route(struct sk_buff *skb, const struct in_addr *gw,
+			      int oif)
+{
+	const struct iphdr *iph = ip_hdr(skb);
+	struct net *net = pick_net(skb);
+	struct rtable *rt;
+	struct flowi4 fl4;
+
+	memset(&fl4, 0, sizeof(fl4));
+	if (oif != -1)
+		fl4.flowi4_oif = oif;
+
+	fl4.daddr = gw->s_addr;
+	fl4.flowi4_tos = RT_TOS(iph->tos);
+	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
+	fl4.flowi4_flags = FLOWI_FLAG_KNOWN_NH;
+	rt = ip_route_output_key(net, &fl4);
+	if (IS_ERR(rt))
+		return false;
+
+	skb_dst_drop(skb);
+	skb_dst_set(skb, &rt->dst);
+	skb->dev      = rt->dst.dev;
+	skb->protocol = htons(ETH_P_IP);
+
+	return true;
+}
+
+void nf_dup_ipv4(struct sk_buff *skb, unsigned int hooknum,
+		 const struct in_addr *gw, int oif)
+{
+	struct iphdr *iph;
+
+	if (__this_cpu_read(nf_skb_duplicated))
+		return;
+	/*
+	 * Copy the skb, and route the copy. Will later return %XT_CONTINUE for
+	 * the original skb, which should continue on its way as if nothing has
+	 * happened. The copy should be independently delivered to the gateway.
+	 */
+	skb = pskb_copy(skb, GFP_ATOMIC);
+	if (skb == NULL)
+		return;
+
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+	/* Avoid counting cloned packets towards the original connection. */
+	nf_conntrack_put(skb->nfct);
+	skb->nfct     = &nf_ct_untracked_get()->ct_general;
+	skb->nfctinfo = IP_CT_NEW;
+	nf_conntrack_get(skb->nfct);
+#endif
+	/*
+	 * If we are in PREROUTING/INPUT, the checksum must be recalculated
+	 * since the length could have changed as a result of defragmentation.
+	 *
+	 * We also decrease the TTL to mitigate potential loops between two
+	 * hosts.
+	 *
+	 * Set %IP_DF so that the original source is notified of a potentially
+	 * decreased MTU on the clone route. IPv6 does this too.
+	 */
+	iph = ip_hdr(skb);
+	iph->frag_off |= htons(IP_DF);
+	if (hooknum == NF_INET_PRE_ROUTING ||
+	    hooknum == NF_INET_LOCAL_IN)
+		--iph->ttl;
+	ip_send_check(iph);
+
+	if (nf_dup_ipv4_route(skb, gw, oif)) {
+		__this_cpu_write(nf_skb_duplicated, true);
+		ip_local_out(skb);
+		__this_cpu_write(nf_skb_duplicated, false);
+	} else {
+		kfree_skb(skb);
+	}
+}
+EXPORT_SYMBOL_GPL(nf_dup_ipv4);
+
+MODULE_AUTHOR("Sebastian Claßen <sebastian.classen@freenet.ag>");
+MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>");
+MODULE_DESCRIPTION("nf_dup_ipv4: Duplicate IPv4 packet");
+MODULE_LICENSE("GPL");
diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
index b552cf0d6198..298daf30d857 100644
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -50,6 +50,12 @@ config NFT_REJECT_IPV6
 endif # NF_TABLES_IPV6
 endif # NF_TABLES
 
+config NF_DUP_IPV6
+	tristate "Netfilter IPv6 packet duplication to alternate destination"
+	help
+	  This option enables the nf_dup_ipv6 core, which duplicates an IPv6
+	  packet to be rerouted to another destination.
+
 config NF_REJECT_IPV6
 	tristate "IPv6 packet rejection"
 	default m if NETFILTER_ADVANCED=n
diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile
index c36e0a5490de..dc6c732f98ca 100644
--- a/net/ipv6/netfilter/Makefile
+++ b/net/ipv6/netfilter/Makefile
@@ -30,6 +30,8 @@ obj-$(CONFIG_NF_LOG_IPV6) += nf_log_ipv6.o
 # reject
 obj-$(CONFIG_NF_REJECT_IPV6) += nf_reject_ipv6.o
 
+obj-$(CONFIG_NF_DUP_IPV6) += nf_dup_ipv6.o
+
 # nf_tables
 obj-$(CONFIG_NF_TABLES_IPV6) += nf_tables_ipv6.o
 obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV6) += nft_chain_route_ipv6.o
diff --git a/net/ipv6/netfilter/nf_dup_ipv6.c b/net/ipv6/netfilter/nf_dup_ipv6.c
new file mode 100644
index 000000000000..399fdda18447
--- /dev/null
+++ b/net/ipv6/netfilter/nf_dup_ipv6.c
@@ -0,0 +1,96 @@
+/*
+ * (C) 2007 by Sebastian Claßen <sebastian.classen@freenet.ag>
+ * (C) 2007-2010 by Jan Engelhardt <jengelh@medozas.de>
+ *
+ * Extracted from xt_TEE.c
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 or later, as
+ * published by the Free Software Foundation.
+ */
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/skbuff.h>
+#include <net/ipv6.h>
+#include <net/ip6_route.h>
+#include <net/netfilter/ipv6/nf_dup_ipv6.h>
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+#include <net/netfilter/nf_conntrack.h>
+#endif
+
+static struct net *pick_net(struct sk_buff *skb)
+{
+#ifdef CONFIG_NET_NS
+	const struct dst_entry *dst;
+
+	if (skb->dev != NULL)
+		return dev_net(skb->dev);
+	dst = skb_dst(skb);
+	if (dst != NULL && dst->dev != NULL)
+		return dev_net(dst->dev);
+#endif
+	return &init_net;
+}
+
+static bool nf_dup_ipv6_route(struct sk_buff *skb, const struct in6_addr *gw,
+			      int oif)
+{
+	const struct ipv6hdr *iph = ipv6_hdr(skb);
+	struct net *net = pick_net(skb);
+	struct dst_entry *dst;
+	struct flowi6 fl6;
+
+	memset(&fl6, 0, sizeof(fl6));
+	if (oif != -1)
+		fl6.flowi6_oif = oif;
+
+	fl6.daddr = *gw;
+	fl6.flowlabel = ((iph->flow_lbl[0] & 0xF) << 16) |
+			 (iph->flow_lbl[1] << 8) | iph->flow_lbl[2];
+	dst = ip6_route_output(net, NULL, &fl6);
+	if (dst->error) {
+		dst_release(dst);
+		return false;
+	}
+	skb_dst_drop(skb);
+	skb_dst_set(skb, dst);
+	skb->dev      = dst->dev;
+	skb->protocol = htons(ETH_P_IPV6);
+
+	return true;
+}
+
+void nf_dup_ipv6(struct sk_buff *skb, unsigned int hooknum,
+		 const struct in6_addr *gw, int oif)
+{
+	if (__this_cpu_read(nf_skb_duplicated))
+		return;
+	skb = pskb_copy(skb, GFP_ATOMIC);
+	if (skb == NULL)
+		return;
+
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+	nf_conntrack_put(skb->nfct);
+	skb->nfct     = &nf_ct_untracked_get()->ct_general;
+	skb->nfctinfo = IP_CT_NEW;
+	nf_conntrack_get(skb->nfct);
+#endif
+	if (hooknum == NF_INET_PRE_ROUTING ||
+	    hooknum == NF_INET_LOCAL_IN) {
+		struct ipv6hdr *iph = ipv6_hdr(skb);
+		--iph->hop_limit;
+	}
+	if (nf_dup_ipv6_route(skb, gw, oif)) {
+		__this_cpu_write(nf_skb_duplicated, true);
+		ip6_local_out(skb);
+		__this_cpu_write(nf_skb_duplicated, false);
+	} else {
+		kfree_skb(skb);
+	}
+}
+EXPORT_SYMBOL_GPL(nf_dup_ipv6);
+
+MODULE_AUTHOR("Sebastian Claßen <sebastian.classen@freenet.ag>");
+MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>");
+MODULE_DESCRIPTION("nf_dup_ipv6: IPv6 packet duplication");
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 6eae69a698ed..3e1b4abf1897 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -867,6 +867,8 @@ config NETFILTER_XT_TARGET_TEE
 	depends on NETFILTER_ADVANCED
 	depends on IPV6 || IPV6=n
 	depends on !NF_CONNTRACK || NF_CONNTRACK
+	select NF_DUP_IPV4
+	select NF_DUP_IPV6 if IP6_NF_IPTABLES
 	---help---
 	This option adds a "TEE" target with which a packet can be cloned and
 	this clone be rerouted to another nexthop.
diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c
index 0ed9fb61d470..49fee6aa2c0a 100644
--- a/net/netfilter/xt_TEE.c
+++ b/net/netfilter/xt_TEE.c
@@ -10,23 +10,14 @@
  *	modify it under the terms of the GNU General Public License
  *	version 2 or later, as published by the Free Software Foundation.
  */
-#include <linux/ip.h>
 #include <linux/module.h>
-#include <linux/percpu.h>
-#include <linux/route.h>
 #include <linux/skbuff.h>
-#include <linux/notifier.h>
-#include <net/checksum.h>
-#include <net/icmp.h>
-#include <net/ip.h>
-#include <net/ipv6.h>
-#include <net/ip6_route.h>
-#include <net/route.h>
+#include <linux/route.h>
 #include <linux/netfilter/x_tables.h>
+#include <net/route.h>
+#include <net/netfilter/ipv4/nf_dup_ipv4.h>
+#include <net/netfilter/ipv6/nf_dup_ipv6.h>
 #include <linux/netfilter/xt_TEE.h>
-#if IS_ENABLED(CONFIG_NF_CONNTRACK)
-#include <net/netfilter/nf_conntrack.h>
-#endif
 
 struct xt_tee_priv {
 	struct notifier_block	notifier;
@@ -36,161 +27,24 @@ struct xt_tee_priv {
 
 static const union nf_inet_addr tee_zero_address;
 
-static struct net *pick_net(struct sk_buff *skb)
-{
-#ifdef CONFIG_NET_NS
-	const struct dst_entry *dst;
-
-	if (skb->dev != NULL)
-		return dev_net(skb->dev);
-	dst = skb_dst(skb);
-	if (dst != NULL && dst->dev != NULL)
-		return dev_net(dst->dev);
-#endif
-	return &init_net;
-}
-
-static bool
-tee_tg_route4(struct sk_buff *skb, const struct xt_tee_tginfo *info)
-{
-	const struct iphdr *iph = ip_hdr(skb);
-	struct net *net = pick_net(skb);
-	struct rtable *rt;
-	struct flowi4 fl4;
-
-	memset(&fl4, 0, sizeof(fl4));
-	if (info->priv) {
-		if (info->priv->oif == -1)
-			return false;
-		fl4.flowi4_oif = info->priv->oif;
-	}
-	fl4.daddr = info->gw.ip;
-	fl4.flowi4_tos = RT_TOS(iph->tos);
-	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
-	fl4.flowi4_flags = FLOWI_FLAG_KNOWN_NH;
-	rt = ip_route_output_key(net, &fl4);
-	if (IS_ERR(rt))
-		return false;
-
-	skb_dst_drop(skb);
-	skb_dst_set(skb, &rt->dst);
-	skb->dev      = rt->dst.dev;
-	skb->protocol = htons(ETH_P_IP);
-	return true;
-}
-
 static unsigned int
 tee_tg4(struct sk_buff *skb, const struct xt_action_param *par)
 {
 	const struct xt_tee_tginfo *info = par->targinfo;
-	struct iphdr *iph;
 
-	if (__this_cpu_read(nf_skb_duplicated))
-		return XT_CONTINUE;
-	/*
-	 * Copy the skb, and route the copy. Will later return %XT_CONTINUE for
-	 * the original skb, which should continue on its way as if nothing has
-	 * happened. The copy should be independently delivered to the TEE
-	 * --gateway.
-	 */
-	skb = pskb_copy(skb, GFP_ATOMIC);
-	if (skb == NULL)
-		return XT_CONTINUE;
-
-#if IS_ENABLED(CONFIG_NF_CONNTRACK)
-	/* Avoid counting cloned packets towards the original connection. */
-	nf_conntrack_put(skb->nfct);
-	skb->nfct     = &nf_ct_untracked_get()->ct_general;
-	skb->nfctinfo = IP_CT_NEW;
-	nf_conntrack_get(skb->nfct);
-#endif
-	/*
-	 * If we are in PREROUTING/INPUT, the checksum must be recalculated
-	 * since the length could have changed as a result of defragmentation.
-	 *
-	 * We also decrease the TTL to mitigate potential TEE loops
-	 * between two hosts.
-	 *
-	 * Set %IP_DF so that the original source is notified of a potentially
-	 * decreased MTU on the clone route. IPv6 does this too.
-	 */
-	iph = ip_hdr(skb);
-	iph->frag_off |= htons(IP_DF);
-	if (par->hooknum == NF_INET_PRE_ROUTING ||
-	    par->hooknum == NF_INET_LOCAL_IN)
-		--iph->ttl;
-	ip_send_check(iph);
+	nf_dup_ipv4(skb, par->hooknum, &info->gw.in, info->priv->oif);
 
-	if (tee_tg_route4(skb, info)) {
-		__this_cpu_write(nf_skb_duplicated, true);
-		ip_local_out(skb);
-		__this_cpu_write(nf_skb_duplicated, false);
-	} else {
-		kfree_skb(skb);
-	}
 	return XT_CONTINUE;
 }
 
 #if IS_ENABLED(CONFIG_IPV6)
-static bool
-tee_tg_route6(struct sk_buff *skb, const struct xt_tee_tginfo *info)
-{
-	const struct ipv6hdr *iph = ipv6_hdr(skb);
-	struct net *net = pick_net(skb);
-	struct dst_entry *dst;
-	struct flowi6 fl6;
-
-	memset(&fl6, 0, sizeof(fl6));
-	if (info->priv) {
-		if (info->priv->oif == -1)
-			return false;
-		fl6.flowi6_oif = info->priv->oif;
-	}
-	fl6.daddr = info->gw.in6;
-	fl6.flowlabel = ((iph->flow_lbl[0] & 0xF) << 16) |
-			   (iph->flow_lbl[1] << 8) | iph->flow_lbl[2];
-	fl6.flowi6_flags = FLOWI_FLAG_KNOWN_NH;
-	dst = ip6_route_output(net, NULL, &fl6);
-	if (dst->error) {
-		dst_release(dst);
-		return false;
-	}
-	skb_dst_drop(skb);
-	skb_dst_set(skb, dst);
-	skb->dev      = dst->dev;
-	skb->protocol = htons(ETH_P_IPV6);
-	return true;
-}
-
 static unsigned int
 tee_tg6(struct sk_buff *skb, const struct xt_action_param *par)
 {
 	const struct xt_tee_tginfo *info = par->targinfo;
 
-	if (__this_cpu_read(nf_skb_duplicated))
-		return XT_CONTINUE;
-	skb = pskb_copy(skb, GFP_ATOMIC);
-	if (skb == NULL)
-		return XT_CONTINUE;
+	nf_dup_ipv6(skb, par->hooknum, &info->gw.in6, info->priv->oif);
 
-#if IS_ENABLED(CONFIG_NF_CONNTRACK)
-	nf_conntrack_put(skb->nfct);
-	skb->nfct     = &nf_ct_untracked_get()->ct_general;
-	skb->nfctinfo = IP_CT_NEW;
-	nf_conntrack_get(skb->nfct);
-#endif
-	if (par->hooknum == NF_INET_PRE_ROUTING ||
-	    par->hooknum == NF_INET_LOCAL_IN) {
-		struct ipv6hdr *iph = ipv6_hdr(skb);
-		--iph->hop_limit;
-	}
-	if (tee_tg_route6(skb, info)) {
-		__this_cpu_write(nf_skb_duplicated, true);
-		ip6_local_out(skb);
-		__this_cpu_write(nf_skb_duplicated, false);
-	} else {
-		kfree_skb(skb);
-	}
 	return XT_CONTINUE;
 }
 #endif
-- 
cgit v1.2.3


From d877f07112f1e5a247c6b585c971a93895c9f738 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Sun, 31 May 2015 18:04:11 +0200
Subject: netfilter: nf_tables: add nft_dup expression

This new expression uses the nf_dup engine to clone packets to a given gateway.
Unlike xt_TEE, we use an index to indicate output interface which should be
fine at this stage.

Moreover, change to the preemtion-safe this_cpu_read(nf_skb_duplicated) from
nf_dup_ipv{4,6} to silence a lockdep splat.

Based on the original tee expression from Arturo Borrero Gonzalez, although
this patch has diverted quite a bit from this initial effort due to the
change to support maps.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nft_dup.h          |   9 +++
 include/uapi/linux/netfilter/nf_tables.h |  14 ++++
 net/ipv4/netfilter/Kconfig               |   6 ++
 net/ipv4/netfilter/Makefile              |   1 +
 net/ipv4/netfilter/nf_dup_ipv4.c         |   2 +-
 net/ipv4/netfilter/nft_dup_ipv4.c        | 110 +++++++++++++++++++++++++++++++
 net/ipv6/netfilter/Kconfig               |   6 ++
 net/ipv6/netfilter/Makefile              |   1 +
 net/ipv6/netfilter/nf_dup_ipv6.c         |   2 +-
 net/ipv6/netfilter/nft_dup_ipv6.c        | 108 ++++++++++++++++++++++++++++++
 10 files changed, 257 insertions(+), 2 deletions(-)
 create mode 100644 include/net/netfilter/nft_dup.h
 create mode 100644 net/ipv4/netfilter/nft_dup_ipv4.c
 create mode 100644 net/ipv6/netfilter/nft_dup_ipv6.c

(limited to 'include/net')

diff --git a/include/net/netfilter/nft_dup.h b/include/net/netfilter/nft_dup.h
new file mode 100644
index 000000000000..6b84cf6491a2
--- /dev/null
+++ b/include/net/netfilter/nft_dup.h
@@ -0,0 +1,9 @@
+#ifndef _NFT_DUP_H_
+#define _NFT_DUP_H_
+
+struct nft_dup_inet {
+	enum nft_registers	sreg_addr:8;
+	enum nft_registers	sreg_dev:8;
+};
+
+#endif /* _NFT_DUP_H_ */
diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index a99e6a997140..2ef35f2c9bda 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -935,6 +935,20 @@ enum nft_redir_attributes {
 };
 #define NFTA_REDIR_MAX		(__NFTA_REDIR_MAX - 1)
 
+/**
+ * enum nft_dup_attributes - nf_tables dup expression netlink attributes
+ *
+ * @NFTA_DUP_SREG_ADDR: source register of address (NLA_U32: nft_registers)
+ * @NFTA_DUP_SREG_DEV: source register of output interface (NLA_U32: nft_register)
+ */
+enum nft_dup_attributes {
+	NFTA_DUP_UNSPEC,
+	NFTA_DUP_SREG_ADDR,
+	NFTA_DUP_SREG_DEV,
+	__NFTA_DUP_MAX
+};
+#define NFTA_DUP_MAX		(__NFTA_DUP_MAX - 1)
+
 /**
  * enum nft_gen_attributes - nf_tables ruleset generation attributes
  *
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 0142ea259d7d..690d27d3f2f9 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -58,6 +58,12 @@ config NFT_REJECT_IPV4
 	default NFT_REJECT
 	tristate
 
+config NFT_DUP_IPV4
+	tristate "IPv4 nf_tables packet duplication support"
+	select NF_DUP_IPV4
+	help
+	  This module enables IPv4 packet duplication support for nf_tables.
+
 endif # NF_TABLES_IPV4
 
 config NF_TABLES_ARP
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 9136ffc2d474..87b073da14c9 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -41,6 +41,7 @@ obj-$(CONFIG_NFT_CHAIN_NAT_IPV4) += nft_chain_nat_ipv4.o
 obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o
 obj-$(CONFIG_NFT_MASQ_IPV4) += nft_masq_ipv4.o
 obj-$(CONFIG_NFT_REDIR_IPV4) += nft_redir_ipv4.o
+obj-$(CONFIG_NFT_DUP_IPV4) += nft_dup_ipv4.o
 obj-$(CONFIG_NF_TABLES_ARP) += nf_tables_arp.o
 
 # generic IP tables 
diff --git a/net/ipv4/netfilter/nf_dup_ipv4.c b/net/ipv4/netfilter/nf_dup_ipv4.c
index eff85ab3f47d..b5bb37564b0e 100644
--- a/net/ipv4/netfilter/nf_dup_ipv4.c
+++ b/net/ipv4/netfilter/nf_dup_ipv4.c
@@ -69,7 +69,7 @@ void nf_dup_ipv4(struct sk_buff *skb, unsigned int hooknum,
 {
 	struct iphdr *iph;
 
-	if (__this_cpu_read(nf_skb_duplicated))
+	if (this_cpu_read(nf_skb_duplicated))
 		return;
 	/*
 	 * Copy the skb, and route the copy. Will later return %XT_CONTINUE for
diff --git a/net/ipv4/netfilter/nft_dup_ipv4.c b/net/ipv4/netfilter/nft_dup_ipv4.c
new file mode 100644
index 000000000000..25419fbddcb6
--- /dev/null
+++ b/net/ipv4/netfilter/nft_dup_ipv4.c
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2015 Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/ipv4/nf_dup_ipv4.h>
+
+struct nft_dup_ipv4 {
+	enum nft_registers	sreg_addr:8;
+	enum nft_registers	sreg_dev:8;
+};
+
+static void nft_dup_ipv4_eval(const struct nft_expr *expr,
+			      struct nft_regs *regs,
+			      const struct nft_pktinfo *pkt)
+{
+	struct nft_dup_ipv4 *priv = nft_expr_priv(expr);
+	struct in_addr gw = {
+		.s_addr = regs->data[priv->sreg_addr],
+	};
+	int oif = regs->data[priv->sreg_dev];
+
+	nf_dup_ipv4(pkt->skb, pkt->ops->hooknum, &gw, oif);
+}
+
+static int nft_dup_ipv4_init(const struct nft_ctx *ctx,
+			     const struct nft_expr *expr,
+			     const struct nlattr * const tb[])
+{
+	struct nft_dup_ipv4 *priv = nft_expr_priv(expr);
+	int err;
+
+	if (tb[NFTA_DUP_SREG_ADDR] == NULL)
+		return -EINVAL;
+
+	priv->sreg_addr = nft_parse_register(tb[NFTA_DUP_SREG_ADDR]);
+	err = nft_validate_register_load(priv->sreg_addr, sizeof(struct in_addr));
+	if (err < 0)
+		return err;
+
+	if (tb[NFTA_DUP_SREG_DEV] != NULL) {
+		priv->sreg_dev = nft_parse_register(tb[NFTA_DUP_SREG_DEV]);
+		return nft_validate_register_load(priv->sreg_dev, sizeof(int));
+	}
+	return 0;
+}
+
+static int nft_dup_ipv4_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	struct nft_dup_ipv4 *priv = nft_expr_priv(expr);
+
+	if (nft_dump_register(skb, NFTA_DUP_SREG_ADDR, priv->sreg_addr) ||
+	    nft_dump_register(skb, NFTA_DUP_SREG_DEV, priv->sreg_dev))
+		goto nla_put_failure;
+
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static struct nft_expr_type nft_dup_ipv4_type;
+static const struct nft_expr_ops nft_dup_ipv4_ops = {
+	.type		= &nft_dup_ipv4_type,
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_dup_ipv4)),
+	.eval		= nft_dup_ipv4_eval,
+	.init		= nft_dup_ipv4_init,
+	.dump		= nft_dup_ipv4_dump,
+};
+
+static const struct nla_policy nft_dup_ipv4_policy[NFTA_DUP_MAX + 1] = {
+	[NFTA_DUP_SREG_ADDR]	= { .type = NLA_U32 },
+	[NFTA_DUP_SREG_DEV]	= { .type = NLA_U32 },
+};
+
+static struct nft_expr_type nft_dup_ipv4_type __read_mostly = {
+	.family		= NFPROTO_IPV4,
+	.name		= "dup",
+	.ops		= &nft_dup_ipv4_ops,
+	.policy		= nft_dup_ipv4_policy,
+	.maxattr	= NFTA_DUP_MAX,
+	.owner		= THIS_MODULE,
+};
+
+static int __init nft_dup_ipv4_module_init(void)
+{
+	return nft_register_expr(&nft_dup_ipv4_type);
+}
+
+static void __exit nft_dup_ipv4_module_exit(void)
+{
+	nft_unregister_expr(&nft_dup_ipv4_type);
+}
+
+module_init(nft_dup_ipv4_module_init);
+module_exit(nft_dup_ipv4_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "dup");
diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
index 298daf30d857..96833e4b3193 100644
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -47,6 +47,12 @@ config NFT_REJECT_IPV6
 	default NFT_REJECT
 	tristate
 
+config NFT_DUP_IPV6
+	tristate "IPv6 nf_tables packet duplication support"
+	select NF_DUP_IPV6
+	help
+	  This module enables IPv6 packet duplication support for nf_tables.
+
 endif # NF_TABLES_IPV6
 endif # NF_TABLES
 
diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile
index dc6c732f98ca..b4f7d0b4e2af 100644
--- a/net/ipv6/netfilter/Makefile
+++ b/net/ipv6/netfilter/Makefile
@@ -39,6 +39,7 @@ obj-$(CONFIG_NFT_CHAIN_NAT_IPV6) += nft_chain_nat_ipv6.o
 obj-$(CONFIG_NFT_REJECT_IPV6) += nft_reject_ipv6.o
 obj-$(CONFIG_NFT_MASQ_IPV6) += nft_masq_ipv6.o
 obj-$(CONFIG_NFT_REDIR_IPV6) += nft_redir_ipv6.o
+obj-$(CONFIG_NFT_DUP_IPV6) += nft_dup_ipv6.o
 
 # matches
 obj-$(CONFIG_IP6_NF_MATCH_AH) += ip6t_ah.o
diff --git a/net/ipv6/netfilter/nf_dup_ipv6.c b/net/ipv6/netfilter/nf_dup_ipv6.c
index 399fdda18447..d8ab654080b4 100644
--- a/net/ipv6/netfilter/nf_dup_ipv6.c
+++ b/net/ipv6/netfilter/nf_dup_ipv6.c
@@ -63,7 +63,7 @@ static bool nf_dup_ipv6_route(struct sk_buff *skb, const struct in6_addr *gw,
 void nf_dup_ipv6(struct sk_buff *skb, unsigned int hooknum,
 		 const struct in6_addr *gw, int oif)
 {
-	if (__this_cpu_read(nf_skb_duplicated))
+	if (this_cpu_read(nf_skb_duplicated))
 		return;
 	skb = pskb_copy(skb, GFP_ATOMIC);
 	if (skb == NULL)
diff --git a/net/ipv6/netfilter/nft_dup_ipv6.c b/net/ipv6/netfilter/nft_dup_ipv6.c
new file mode 100644
index 000000000000..0eaa4f65fdea
--- /dev/null
+++ b/net/ipv6/netfilter/nft_dup_ipv6.c
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2015 Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/ipv6/nf_dup_ipv6.h>
+
+struct nft_dup_ipv6 {
+	enum nft_registers	sreg_addr:8;
+	enum nft_registers	sreg_dev:8;
+};
+
+static void nft_dup_ipv6_eval(const struct nft_expr *expr,
+			      struct nft_regs *regs,
+			      const struct nft_pktinfo *pkt)
+{
+	struct nft_dup_ipv6 *priv = nft_expr_priv(expr);
+	struct in6_addr *gw = (struct in6_addr *)&regs->data[priv->sreg_addr];
+	int oif = regs->data[priv->sreg_dev];
+
+	nf_dup_ipv6(pkt->skb, pkt->ops->hooknum, gw, oif);
+}
+
+static int nft_dup_ipv6_init(const struct nft_ctx *ctx,
+			     const struct nft_expr *expr,
+			     const struct nlattr * const tb[])
+{
+	struct nft_dup_ipv6 *priv = nft_expr_priv(expr);
+	int err;
+
+	if (tb[NFTA_DUP_SREG_ADDR] == NULL)
+		return -EINVAL;
+
+	priv->sreg_addr = nft_parse_register(tb[NFTA_DUP_SREG_ADDR]);
+	err = nft_validate_register_load(priv->sreg_addr, sizeof(struct in6_addr));
+	if (err < 0)
+		return err;
+
+	if (tb[NFTA_DUP_SREG_DEV] != NULL) {
+		priv->sreg_dev = nft_parse_register(tb[NFTA_DUP_SREG_DEV]);
+		return nft_validate_register_load(priv->sreg_dev, sizeof(int));
+	}
+	return 0;
+}
+
+static int nft_dup_ipv6_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	struct nft_dup_ipv6 *priv = nft_expr_priv(expr);
+
+	if (nft_dump_register(skb, NFTA_DUP_SREG_ADDR, priv->sreg_addr) ||
+	    nft_dump_register(skb, NFTA_DUP_SREG_DEV, priv->sreg_dev))
+		goto nla_put_failure;
+
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static struct nft_expr_type nft_dup_ipv6_type;
+static const struct nft_expr_ops nft_dup_ipv6_ops = {
+	.type		= &nft_dup_ipv6_type,
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_dup_ipv6)),
+	.eval		= nft_dup_ipv6_eval,
+	.init		= nft_dup_ipv6_init,
+	.dump		= nft_dup_ipv6_dump,
+};
+
+static const struct nla_policy nft_dup_ipv6_policy[NFTA_DUP_MAX + 1] = {
+	[NFTA_DUP_SREG_ADDR]	= { .type = NLA_U32 },
+	[NFTA_DUP_SREG_DEV]	= { .type = NLA_U32 },
+};
+
+static struct nft_expr_type nft_dup_ipv6_type __read_mostly = {
+	.family		= NFPROTO_IPV6,
+	.name		= "dup",
+	.ops		= &nft_dup_ipv6_ops,
+	.policy		= nft_dup_ipv6_policy,
+	.maxattr	= NFTA_DUP_MAX,
+	.owner		= THIS_MODULE,
+};
+
+static int __init nft_dup_ipv6_module_init(void)
+{
+	return nft_register_expr(&nft_dup_ipv6_type);
+}
+
+static void __exit nft_dup_ipv6_module_exit(void)
+{
+	nft_unregister_expr(&nft_dup_ipv6_type);
+}
+
+module_init(nft_dup_ipv6_module_init);
+module_exit(nft_dup_ipv6_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_ALIAS_NFT_AF_EXPR(AF_INET6, "dup");
-- 
cgit v1.2.3


From 3499abb249bb5ed9d21031944bc3059ec4aa2909 Mon Sep 17 00:00:00 2001
From: Andreas Schultz <aschultz@tpip.net>
Date: Wed, 5 Aug 2015 17:51:45 +0200
Subject: netfilter: nfacct: per network namespace support

- Move the nfnl_acct_list into the network namespace, initialize
  and destroy it per namespace
- Keep track of refcnt on nfacct objects, the old logic does not
  longer work with a per namespace list
- Adjust xt_nfacct to pass the namespace when registring objects

Signed-off-by: Andreas Schultz <aschultz@tpip.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/nfnetlink_acct.h |  3 +-
 include/net/net_namespace.h              |  3 ++
 net/netfilter/nfnetlink_acct.c           | 71 ++++++++++++++++++++++----------
 net/netfilter/xt_nfacct.c                |  2 +-
 4 files changed, 56 insertions(+), 23 deletions(-)

(limited to 'include/net')

diff --git a/include/linux/netfilter/nfnetlink_acct.h b/include/linux/netfilter/nfnetlink_acct.h
index 6ec975748742..80ca889b164e 100644
--- a/include/linux/netfilter/nfnetlink_acct.h
+++ b/include/linux/netfilter/nfnetlink_acct.h
@@ -2,6 +2,7 @@
 #define _NFNL_ACCT_H_
 
 #include <uapi/linux/netfilter/nfnetlink_acct.h>
+#include <net/net_namespace.h>
 
 enum {
 	NFACCT_NO_QUOTA		= -1,
@@ -11,7 +12,7 @@ enum {
 
 struct nf_acct;
 
-struct nf_acct *nfnl_acct_find_get(const char *filter_name);
+struct nf_acct *nfnl_acct_find_get(struct net *net, const char *filter_name);
 void nfnl_acct_put(struct nf_acct *acct);
 void nfnl_acct_update(const struct sk_buff *skb, struct nf_acct *nfacct);
 extern int nfnl_acct_overquota(const struct sk_buff *skb,
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index e951453e0a23..2dcea635ecce 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -118,6 +118,9 @@ struct net {
 #endif
 	struct sock		*nfnl;
 	struct sock		*nfnl_stash;
+#if IS_ENABLED(CONFIG_NETFILTER_NETLINK_ACCT)
+	struct list_head        nfnl_acct_list;
+#endif
 #endif
 #ifdef CONFIG_WEXT_CORE
 	struct sk_buff_head	wext_nlevents;
diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c
index c18af2f63eef..fefbf5f0b28d 100644
--- a/net/netfilter/nfnetlink_acct.c
+++ b/net/netfilter/nfnetlink_acct.c
@@ -27,8 +27,6 @@ MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
 MODULE_DESCRIPTION("nfacct: Extended Netfilter accounting infrastructure");
 
-static LIST_HEAD(nfnl_acct_list);
-
 struct nf_acct {
 	atomic64_t		pkts;
 	atomic64_t		bytes;
@@ -53,6 +51,7 @@ nfnl_acct_new(struct sock *nfnl, struct sk_buff *skb,
 	     const struct nlmsghdr *nlh, const struct nlattr * const tb[])
 {
 	struct nf_acct *nfacct, *matching = NULL;
+	struct net *net = sock_net(nfnl);
 	char *acct_name;
 	unsigned int size = 0;
 	u32 flags = 0;
@@ -64,7 +63,7 @@ nfnl_acct_new(struct sock *nfnl, struct sk_buff *skb,
 	if (strlen(acct_name) == 0)
 		return -EINVAL;
 
-	list_for_each_entry(nfacct, &nfnl_acct_list, head) {
+	list_for_each_entry(nfacct, &net->nfnl_acct_list, head) {
 		if (strncmp(nfacct->name, acct_name, NFACCT_NAME_MAX) != 0)
 			continue;
 
@@ -124,7 +123,7 @@ nfnl_acct_new(struct sock *nfnl, struct sk_buff *skb,
 			     be64_to_cpu(nla_get_be64(tb[NFACCT_PKTS])));
 	}
 	atomic_set(&nfacct->refcnt, 1);
-	list_add_tail_rcu(&nfacct->head, &nfnl_acct_list);
+	list_add_tail_rcu(&nfacct->head, &net->nfnl_acct_list);
 	return 0;
 }
 
@@ -185,6 +184,7 @@ nla_put_failure:
 static int
 nfnl_acct_dump(struct sk_buff *skb, struct netlink_callback *cb)
 {
+	struct net *net = sock_net(skb->sk);
 	struct nf_acct *cur, *last;
 	const struct nfacct_filter *filter = cb->data;
 
@@ -196,7 +196,7 @@ nfnl_acct_dump(struct sk_buff *skb, struct netlink_callback *cb)
 		cb->args[1] = 0;
 
 	rcu_read_lock();
-	list_for_each_entry_rcu(cur, &nfnl_acct_list, head) {
+	list_for_each_entry_rcu(cur, &net->nfnl_acct_list, head) {
 		if (last) {
 			if (cur != last)
 				continue;
@@ -257,6 +257,7 @@ static int
 nfnl_acct_get(struct sock *nfnl, struct sk_buff *skb,
 	     const struct nlmsghdr *nlh, const struct nlattr * const tb[])
 {
+	struct net *net = sock_net(nfnl);
 	int ret = -ENOENT;
 	struct nf_acct *cur;
 	char *acct_name;
@@ -283,7 +284,7 @@ nfnl_acct_get(struct sock *nfnl, struct sk_buff *skb,
 		return -EINVAL;
 	acct_name = nla_data(tb[NFACCT_NAME]);
 
-	list_for_each_entry(cur, &nfnl_acct_list, head) {
+	list_for_each_entry(cur, &net->nfnl_acct_list, head) {
 		struct sk_buff *skb2;
 
 		if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX)!= 0)
@@ -336,19 +337,20 @@ static int
 nfnl_acct_del(struct sock *nfnl, struct sk_buff *skb,
 	     const struct nlmsghdr *nlh, const struct nlattr * const tb[])
 {
+	struct net *net = sock_net(nfnl);
 	char *acct_name;
 	struct nf_acct *cur;
 	int ret = -ENOENT;
 
 	if (!tb[NFACCT_NAME]) {
-		list_for_each_entry(cur, &nfnl_acct_list, head)
+		list_for_each_entry(cur, &net->nfnl_acct_list, head)
 			nfnl_acct_try_del(cur);
 
 		return 0;
 	}
 	acct_name = nla_data(tb[NFACCT_NAME]);
 
-	list_for_each_entry(cur, &nfnl_acct_list, head) {
+	list_for_each_entry(cur, &net->nfnl_acct_list, head) {
 		if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX) != 0)
 			continue;
 
@@ -394,12 +396,12 @@ static const struct nfnetlink_subsystem nfnl_acct_subsys = {
 
 MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_ACCT);
 
-struct nf_acct *nfnl_acct_find_get(const char *acct_name)
+struct nf_acct *nfnl_acct_find_get(struct net *net, const char *acct_name)
 {
 	struct nf_acct *cur, *acct = NULL;
 
 	rcu_read_lock();
-	list_for_each_entry_rcu(cur, &nfnl_acct_list, head) {
+	list_for_each_entry_rcu(cur, &net->nfnl_acct_list, head) {
 		if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX)!= 0)
 			continue;
 
@@ -422,7 +424,9 @@ EXPORT_SYMBOL_GPL(nfnl_acct_find_get);
 
 void nfnl_acct_put(struct nf_acct *acct)
 {
-	atomic_dec(&acct->refcnt);
+	if (atomic_dec_and_test(&acct->refcnt))
+		kfree_rcu(acct, rcu_head);
+
 	module_put(THIS_MODULE);
 }
 EXPORT_SYMBOL_GPL(nfnl_acct_put);
@@ -478,34 +482,59 @@ int nfnl_acct_overquota(const struct sk_buff *skb, struct nf_acct *nfacct)
 }
 EXPORT_SYMBOL_GPL(nfnl_acct_overquota);
 
+static int __net_init nfnl_acct_net_init(struct net *net)
+{
+	INIT_LIST_HEAD(&net->nfnl_acct_list);
+
+	return 0;
+}
+
+static void __net_exit nfnl_acct_net_exit(struct net *net)
+{
+	struct nf_acct *cur, *tmp;
+
+	list_for_each_entry_safe(cur, tmp, &net->nfnl_acct_list, head) {
+		list_del_rcu(&cur->head);
+
+		if (atomic_dec_and_test(&cur->refcnt))
+			kfree_rcu(cur, rcu_head);
+	}
+}
+
+static struct pernet_operations nfnl_acct_ops = {
+        .init   = nfnl_acct_net_init,
+        .exit   = nfnl_acct_net_exit,
+};
+
 static int __init nfnl_acct_init(void)
 {
 	int ret;
 
+	ret = register_pernet_subsys(&nfnl_acct_ops);
+	if (ret < 0) {
+		pr_err("nfnl_acct_init: failed to register pernet ops\n");
+		goto err_out;
+	}
+
 	pr_info("nfnl_acct: registering with nfnetlink.\n");
 	ret = nfnetlink_subsys_register(&nfnl_acct_subsys);
 	if (ret < 0) {
 		pr_err("nfnl_acct_init: cannot register with nfnetlink.\n");
-		goto err_out;
+		goto cleanup_pernet;
 	}
 	return 0;
+
+cleanup_pernet:
+	unregister_pernet_subsys(&nfnl_acct_ops);
 err_out:
 	return ret;
 }
 
 static void __exit nfnl_acct_exit(void)
 {
-	struct nf_acct *cur, *tmp;
-
 	pr_info("nfnl_acct: unregistering from nfnetlink.\n");
 	nfnetlink_subsys_unregister(&nfnl_acct_subsys);
-
-	list_for_each_entry_safe(cur, tmp, &nfnl_acct_list, head) {
-		list_del_rcu(&cur->head);
-		/* We are sure that our objects have no clients at this point,
-		 * it's safe to release them all without checking refcnt. */
-		kfree_rcu(cur, rcu_head);
-	}
+	unregister_pernet_subsys(&nfnl_acct_ops);
 }
 
 module_init(nfnl_acct_init);
diff --git a/net/netfilter/xt_nfacct.c b/net/netfilter/xt_nfacct.c
index 8c646ed9c921..3048a7e3a90a 100644
--- a/net/netfilter/xt_nfacct.c
+++ b/net/netfilter/xt_nfacct.c
@@ -37,7 +37,7 @@ nfacct_mt_checkentry(const struct xt_mtchk_param *par)
 	struct xt_nfacct_match_info *info = par->matchinfo;
 	struct nf_acct *nfacct;
 
-	nfacct = nfnl_acct_find_get(info->name);
+	nfacct = nfnl_acct_find_get(par->net, info->name);
 	if (nfacct == NULL) {
 		pr_info("xt_nfacct: accounting object with name `%s' "
 			"does not exists\n", info->name);
-- 
cgit v1.2.3


From da8b43c0e1dcea3bcac5f37ea59934ddaa137aed Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@plumgrid.com>
Date: Tue, 4 Aug 2015 22:51:07 -0700
Subject: vxlan: combine VXLAN_FLOWBASED into VXLAN_COLLECT_METADATA

IFLA_VXLAN_FLOWBASED is useless without IFLA_VXLAN_COLLECT_METADATA,
so combine them into single IFLA_VXLAN_COLLECT_METADATA flag.
'flowbased' doesn't convey real meaning of the vxlan tunnel mode.
This mode can be used by routing, tc+bpf and ovs.
Only ovs is strictly flow based, so 'collect metadata' is a better
name for this tunnel mode.

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Acked-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vxlan.c           | 17 ++++++-----------
 include/net/vxlan.h           |  4 +---
 include/uapi/linux/if_link.h  |  1 -
 net/openvswitch/vport-vxlan.c |  2 +-
 4 files changed, 8 insertions(+), 16 deletions(-)

(limited to 'include/net')

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index e90f7a484e1c..b6731fad19ba 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -1141,7 +1141,7 @@ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb,
 	union vxlan_addr *remote_ip;
 
 	/* For flow based devices, map all packets to VNI 0 */
-	if (vs->flags & VXLAN_F_FLOW_BASED)
+	if (vs->flags & VXLAN_F_COLLECT_METADATA)
 		vni = 0;
 
 	/* Is this VNI defined? */
@@ -1183,7 +1183,7 @@ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb,
 
 	skb_reset_network_header(skb);
 	/* In flow-based mode, GBP is carried in dst_metadata */
-	if (!(vs->flags & VXLAN_F_FLOW_BASED))
+	if (!(vs->flags & VXLAN_F_COLLECT_METADATA))
 		skb->mark = md->gbp;
 
 	if (oip6)
@@ -2129,7 +2129,7 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
 #endif
 	}
 
-	if (vxlan->flags & VXLAN_F_FLOW_BASED &&
+	if (vxlan->flags & VXLAN_F_COLLECT_METADATA &&
 	    info && info->mode == IP_TUNNEL_INFO_TX) {
 		vxlan_xmit_one(skb, dev, NULL, false);
 		return NETDEV_TX_OK;
@@ -2462,7 +2462,6 @@ static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
 	[IFLA_VXLAN_RSC]	= { .type = NLA_U8 },
 	[IFLA_VXLAN_L2MISS]	= { .type = NLA_U8 },
 	[IFLA_VXLAN_L3MISS]	= { .type = NLA_U8 },
-	[IFLA_VXLAN_FLOWBASED]	= { .type = NLA_U8 },
 	[IFLA_VXLAN_COLLECT_METADATA]	= { .type = NLA_U8 },
 	[IFLA_VXLAN_PORT]	= { .type = NLA_U16 },
 	[IFLA_VXLAN_UDP_CSUM]	= { .type = NLA_U8 },
@@ -2814,10 +2813,6 @@ static int vxlan_newlink(struct net *src_net, struct net_device *dev,
 	if (data[IFLA_VXLAN_LIMIT])
 		conf.addrmax = nla_get_u32(data[IFLA_VXLAN_LIMIT]);
 
-	if (data[IFLA_VXLAN_FLOWBASED] &&
-	    nla_get_u8(data[IFLA_VXLAN_FLOWBASED]))
-		conf.flags |= VXLAN_F_FLOW_BASED;
-
 	if (data[IFLA_VXLAN_COLLECT_METADATA] &&
 	    nla_get_u8(data[IFLA_VXLAN_COLLECT_METADATA]))
 		conf.flags |= VXLAN_F_COLLECT_METADATA;
@@ -2903,7 +2898,7 @@ static size_t vxlan_get_size(const struct net_device *dev)
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_RSC */
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_L2MISS */
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_L3MISS */
-		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_FLOWBASED */
+		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_COLLECT_METADATA */
 		nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_AGEING */
 		nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_LIMIT */
 		nla_total_size(sizeof(struct ifla_vxlan_port_range)) +
@@ -2970,8 +2965,8 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
 			!!(vxlan->flags & VXLAN_F_L2MISS)) ||
 	    nla_put_u8(skb, IFLA_VXLAN_L3MISS,
 			!!(vxlan->flags & VXLAN_F_L3MISS)) ||
-	    nla_put_u8(skb, IFLA_VXLAN_FLOWBASED,
-		       !!(vxlan->flags & VXLAN_F_FLOW_BASED)) ||
+	    nla_put_u8(skb, IFLA_VXLAN_COLLECT_METADATA,
+		       !!(vxlan->flags & VXLAN_F_COLLECT_METADATA)) ||
 	    nla_put_u32(skb, IFLA_VXLAN_AGEING, vxlan->cfg.age_interval) ||
 	    nla_put_u32(skb, IFLA_VXLAN_LIMIT, vxlan->cfg.addrmax) ||
 	    nla_put_be16(skb, IFLA_VXLAN_PORT, vxlan->cfg.dst_port) ||
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index eb8d721cdb67..e4534f1b2d8c 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -181,7 +181,6 @@ struct vxlan_dev {
 #define VXLAN_F_GBP			0x800
 #define VXLAN_F_REMCSUM_NOPARTIAL	0x1000
 #define VXLAN_F_COLLECT_METADATA	0x2000
-#define VXLAN_F_FLOW_BASED		0x4000
 
 /* Flags that are used in the receive path. These flags must match in
  * order for a socket to be shareable
@@ -190,8 +189,7 @@ struct vxlan_dev {
 					 VXLAN_F_UDP_ZERO_CSUM6_RX |	\
 					 VXLAN_F_REMCSUM_RX |		\
 					 VXLAN_F_REMCSUM_NOPARTIAL |	\
-					 VXLAN_F_COLLECT_METADATA |	\
-					 VXLAN_F_FLOW_BASED)
+					 VXLAN_F_COLLECT_METADATA)
 
 struct net_device *vxlan_dev_create(struct net *net, const char *name,
 				    u8 name_assign_type, struct vxlan_config *conf);
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index ea047480a1f0..f24ec99a2262 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -382,7 +382,6 @@ enum {
 	IFLA_VXLAN_REMCSUM_RX,
 	IFLA_VXLAN_GBP,
 	IFLA_VXLAN_REMCSUM_NOPARTIAL,
-	IFLA_VXLAN_FLOWBASED,
 	IFLA_VXLAN_COLLECT_METADATA,
 	__IFLA_VXLAN_MAX
 };
diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c
index 547173336cd3..c6e937e36f8b 100644
--- a/net/openvswitch/vport-vxlan.c
+++ b/net/openvswitch/vport-vxlan.c
@@ -90,7 +90,7 @@ static struct vport *vxlan_tnl_create(const struct vport_parms *parms)
 	int err;
 	struct vxlan_config conf = {
 		.no_share = true,
-		.flags = VXLAN_F_FLOW_BASED | VXLAN_F_COLLECT_METADATA,
+		.flags = VXLAN_F_COLLECT_METADATA,
 	};
 
 	if (!options) {
-- 
cgit v1.2.3


From 1525c386a1f01612c6f3f27241113d7fc8e6d72d Mon Sep 17 00:00:00 2001
From: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Date: Thu, 6 Aug 2015 01:44:02 -0400
Subject: net: switchdev: change fdb addr for a byte array

The address in the switchdev_obj_fdb structure is currently represented
as a pointer. Replacing it for a 6-byte array allows switchdev to carry
addresses directly read from hardware registers, not stored by the
switch chip driver (as in Rocker).

Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/rocker/rocker.c | 2 +-
 include/net/switchdev.h              | 2 +-
 net/bridge/br_fdb.c                  | 2 +-
 net/switchdev/switchdev.c            | 5 +++--
 4 files changed, 6 insertions(+), 5 deletions(-)

(limited to 'include/net')

diff --git a/drivers/net/ethernet/rocker/rocker.c b/drivers/net/ethernet/rocker/rocker.c
index b77e0e7307d4..80bb25c5a644 100644
--- a/drivers/net/ethernet/rocker/rocker.c
+++ b/drivers/net/ethernet/rocker/rocker.c
@@ -4543,7 +4543,7 @@ static int rocker_port_fdb_dump(const struct rocker_port *rocker_port,
 	hash_for_each_safe(rocker->fdb_tbl, bkt, tmp, found, entry) {
 		if (found->key.pport != rocker_port->pport)
 			continue;
-		fdb->addr = found->key.addr;
+		ether_addr_copy(fdb->addr, found->key.addr);
 		fdb->vid = rocker_port_vlan_to_vid(rocker_port,
 						   found->key.vlan_id);
 		err = obj->cb(rocker_port->dev, obj);
diff --git a/include/net/switchdev.h b/include/net/switchdev.h
index 89da8934519b..e90e1a0fa579 100644
--- a/include/net/switchdev.h
+++ b/include/net/switchdev.h
@@ -70,7 +70,7 @@ struct switchdev_obj {
 			u32 tb_id;
 		} ipv4_fib;
 		struct switchdev_obj_fdb {		/* PORT_FDB */
-			const unsigned char *addr;
+			u8 addr[ETH_ALEN];
 			u16 vid;
 		} fdb;
 	} u;
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index 9e9875da0a4f..5656b44bf3de 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -136,11 +136,11 @@ static void fdb_del_external_learn(struct net_bridge_fdb_entry *f)
 	struct switchdev_obj obj = {
 		.id = SWITCHDEV_OBJ_PORT_FDB,
 		.u.fdb = {
-			.addr = f->addr.addr,
 			.vid = f->vlan_id,
 		},
 	};
 
+	ether_addr_copy(obj.u.fdb.addr, f->addr.addr);
 	switchdev_port_obj_del(f->dst->dev, &obj);
 }
 
diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c
index 33bafa2e703e..9db87a34f866 100644
--- a/net/switchdev/switchdev.c
+++ b/net/switchdev/switchdev.c
@@ -15,6 +15,7 @@
 #include <linux/mutex.h>
 #include <linux/notifier.h>
 #include <linux/netdevice.h>
+#include <linux/etherdevice.h>
 #include <linux/if_bridge.h>
 #include <net/ip_fib.h>
 #include <net/switchdev.h>
@@ -742,11 +743,11 @@ int switchdev_port_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
 	struct switchdev_obj obj = {
 		.id = SWITCHDEV_OBJ_PORT_FDB,
 		.u.fdb = {
-			.addr = addr,
 			.vid = vid,
 		},
 	};
 
+	ether_addr_copy(obj.u.fdb.addr, addr);
 	return switchdev_port_obj_add(dev, &obj);
 }
 EXPORT_SYMBOL_GPL(switchdev_port_fdb_add);
@@ -769,11 +770,11 @@ int switchdev_port_fdb_del(struct ndmsg *ndm, struct nlattr *tb[],
 	struct switchdev_obj obj = {
 		.id = SWITCHDEV_OBJ_PORT_FDB,
 		.u.fdb = {
-			.addr = addr,
 			.vid = vid,
 		},
 	};
 
+	ether_addr_copy(obj.u.fdb.addr, addr);
 	return switchdev_port_obj_del(dev, &obj);
 }
 EXPORT_SYMBOL_GPL(switchdev_port_fdb_del);
-- 
cgit v1.2.3


From 890248261a18c7ae22923095dfadea2c0a2a304a Mon Sep 17 00:00:00 2001
From: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Date: Thu, 6 Aug 2015 01:44:03 -0400
Subject: net: switchdev: support static FDB addresses

This patch adds a is_static boolean to the switchdev_obj_fdb structure,
in order to set the ndm_state to either NUD_NOARP or NUD_REACHABLE.

Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/switchdev.h   | 1 +
 net/switchdev/switchdev.c | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/switchdev.h b/include/net/switchdev.h
index e90e1a0fa579..0e296b82aef3 100644
--- a/include/net/switchdev.h
+++ b/include/net/switchdev.h
@@ -72,6 +72,7 @@ struct switchdev_obj {
 		struct switchdev_obj_fdb {		/* PORT_FDB */
 			u8 addr[ETH_ALEN];
 			u16 vid;
+			bool is_static;
 		} fdb;
 	} u;
 };
diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c
index 9db87a34f866..e9d1cacc4060 100644
--- a/net/switchdev/switchdev.c
+++ b/net/switchdev/switchdev.c
@@ -811,7 +811,7 @@ static int switchdev_port_fdb_dump_cb(struct net_device *dev,
 	ndm->ndm_flags   = NTF_SELF;
 	ndm->ndm_type    = 0;
 	ndm->ndm_ifindex = dev->ifindex;
-	ndm->ndm_state   = NUD_REACHABLE;
+	ndm->ndm_state   = obj->u.fdb.is_static ? NUD_NOARP : NUD_REACHABLE;
 
 	if (nla_put(dump->skb, NDA_LLADDR, ETH_ALEN, obj->u.fdb.addr))
 		goto nla_put_failure;
-- 
cgit v1.2.3


From 55045ddded0f39d84c2ca019508973be8c595a78 Mon Sep 17 00:00:00 2001
From: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Date: Thu, 6 Aug 2015 01:44:04 -0400
Subject: net: dsa: add support for switchdev FDB objects

Remove the fdb_{add,del,getnext} function pointer in favor of new
port_fdb_{add,del,getnext}.

Implement the switchdev_port_obj_{add,del,dump} functions in DSA to
support the SWITCHDEV_OBJ_PORT_FDB objects.

Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/mv88e6171.c |   3 -
 drivers/net/dsa/mv88e6352.c |   3 -
 include/net/dsa.h           |  16 ++--
 net/dsa/slave.c             | 218 +++++++++++++++++++++++---------------------
 4 files changed, 126 insertions(+), 114 deletions(-)

(limited to 'include/net')

diff --git a/drivers/net/dsa/mv88e6171.c b/drivers/net/dsa/mv88e6171.c
index 1c7808495a9d..cfa21ed1f734 100644
--- a/drivers/net/dsa/mv88e6171.c
+++ b/drivers/net/dsa/mv88e6171.c
@@ -116,9 +116,6 @@ struct dsa_switch_driver mv88e6171_switch_driver = {
 	.port_join_bridge       = mv88e6xxx_join_bridge,
 	.port_leave_bridge      = mv88e6xxx_leave_bridge,
 	.port_stp_update        = mv88e6xxx_port_stp_update,
-	.fdb_add		= mv88e6xxx_port_fdb_add,
-	.fdb_del		= mv88e6xxx_port_fdb_del,
-	.fdb_getnext		= mv88e6xxx_port_fdb_getnext,
 };
 
 MODULE_ALIAS("platform:mv88e6171");
diff --git a/drivers/net/dsa/mv88e6352.c b/drivers/net/dsa/mv88e6352.c
index af210efecc55..eb4630fec6f1 100644
--- a/drivers/net/dsa/mv88e6352.c
+++ b/drivers/net/dsa/mv88e6352.c
@@ -341,9 +341,6 @@ struct dsa_switch_driver mv88e6352_switch_driver = {
 	.port_join_bridge	= mv88e6xxx_join_bridge,
 	.port_leave_bridge	= mv88e6xxx_leave_bridge,
 	.port_stp_update	= mv88e6xxx_port_stp_update,
-	.fdb_add		= mv88e6xxx_port_fdb_add,
-	.fdb_del		= mv88e6xxx_port_fdb_del,
-	.fdb_getnext		= mv88e6xxx_port_fdb_getnext,
 };
 
 MODULE_ALIAS("platform:mv88e6172");
diff --git a/include/net/dsa.h b/include/net/dsa.h
index fbca63ba8f73..091d35f77180 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -296,12 +296,16 @@ struct dsa_switch_driver {
 				     u32 br_port_mask);
 	int	(*port_stp_update)(struct dsa_switch *ds, int port,
 				   u8 state);
-	int	(*fdb_add)(struct dsa_switch *ds, int port,
-			   const unsigned char *addr, u16 vid);
-	int	(*fdb_del)(struct dsa_switch *ds, int port,
-			   const unsigned char *addr, u16 vid);
-	int	(*fdb_getnext)(struct dsa_switch *ds, int port,
-			       unsigned char *addr, bool *is_static);
+
+	/*
+	 * Forwarding database
+	 */
+	int	(*port_fdb_add)(struct dsa_switch *ds, int port, u16 vid,
+				const u8 addr[ETH_ALEN]);
+	int	(*port_fdb_del)(struct dsa_switch *ds, int port, u16 vid,
+				const u8 addr[ETH_ALEN]);
+	int	(*port_fdb_getnext)(struct dsa_switch *ds, int port, u16 *vid,
+				    u8 addr[ETH_ALEN], bool *is_static);
 };
 
 void register_switch_driver(struct dsa_switch_driver *type);
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 0010c690cc67..1dbdeaab2bb4 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -19,6 +19,7 @@
 #include <net/switchdev.h>
 #include <linux/if_bridge.h>
 #include <linux/netpoll.h>
+#include <linux/if_vlan.h>
 #include "dsa_priv.h"
 
 /* slave mii_bus handling ***************************************************/
@@ -200,105 +201,6 @@ out:
 	return 0;
 }
 
-static int dsa_slave_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
-			     struct net_device *dev,
-			     const unsigned char *addr, u16 vid, u16 nlm_flags)
-{
-	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_switch *ds = p->parent;
-	int ret = -EOPNOTSUPP;
-
-	if (ds->drv->fdb_add)
-		ret = ds->drv->fdb_add(ds, p->port, addr, vid);
-
-	return ret;
-}
-
-static int dsa_slave_fdb_del(struct ndmsg *ndm, struct nlattr *tb[],
-			     struct net_device *dev,
-			     const unsigned char *addr, u16 vid)
-{
-	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_switch *ds = p->parent;
-	int ret = -EOPNOTSUPP;
-
-	if (ds->drv->fdb_del)
-		ret = ds->drv->fdb_del(ds, p->port, addr, vid);
-
-	return ret;
-}
-
-static int dsa_slave_fill_info(struct net_device *dev, struct sk_buff *skb,
-			       const unsigned char *addr, u16 vid,
-			       bool is_static,
-			       u32 portid, u32 seq, int type,
-			       unsigned int flags)
-{
-	struct nlmsghdr *nlh;
-	struct ndmsg *ndm;
-
-	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags);
-	if (!nlh)
-		return -EMSGSIZE;
-
-	ndm = nlmsg_data(nlh);
-	ndm->ndm_family	 = AF_BRIDGE;
-	ndm->ndm_pad1    = 0;
-	ndm->ndm_pad2    = 0;
-	ndm->ndm_flags	 = NTF_EXT_LEARNED;
-	ndm->ndm_type	 = 0;
-	ndm->ndm_ifindex = dev->ifindex;
-	ndm->ndm_state   = is_static ? NUD_NOARP : NUD_REACHABLE;
-
-	if (nla_put(skb, NDA_LLADDR, ETH_ALEN, addr))
-		goto nla_put_failure;
-
-	if (vid && nla_put_u16(skb, NDA_VLAN, vid))
-		goto nla_put_failure;
-
-	nlmsg_end(skb, nlh);
-	return 0;
-
-nla_put_failure:
-	nlmsg_cancel(skb, nlh);
-	return -EMSGSIZE;
-}
-
-/* Dump information about entries, in response to GETNEIGH */
-static int dsa_slave_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
-			      struct net_device *dev,
-			      struct net_device *filter_dev, int idx)
-{
-	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_switch *ds = p->parent;
-	unsigned char addr[ETH_ALEN] = { 0 };
-	int ret;
-
-	if (!ds->drv->fdb_getnext)
-		return -EOPNOTSUPP;
-
-	for (; ; idx++) {
-		bool is_static;
-
-		ret = ds->drv->fdb_getnext(ds, p->port, addr, &is_static);
-		if (ret < 0)
-			break;
-
-		if (idx < cb->args[0])
-			continue;
-
-		ret = dsa_slave_fill_info(dev, skb, addr, 0,
-					  is_static,
-					  NETLINK_CB(cb->skb).portid,
-					  cb->nlh->nlmsg_seq,
-					  RTM_NEWNEIGH, NLM_F_MULTI);
-		if (ret < 0)
-			break;
-	}
-
-	return idx;
-}
-
 static int dsa_slave_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
@@ -364,6 +266,115 @@ static int dsa_slave_port_attr_set(struct net_device *dev,
 	return ret;
 }
 
+static int dsa_slave_port_fdb_add(struct net_device *dev,
+				  struct switchdev_obj *obj)
+{
+	struct switchdev_obj_fdb *fdb = &obj->u.fdb;
+	struct dsa_slave_priv *p = netdev_priv(dev);
+	struct dsa_switch *ds = p->parent;
+	int err;
+
+	if (obj->trans == SWITCHDEV_TRANS_PREPARE)
+		err = ds->drv->port_fdb_add ? 0 : -EOPNOTSUPP;
+	else if (obj->trans == SWITCHDEV_TRANS_COMMIT)
+		err = ds->drv->port_fdb_add(ds, p->port, fdb->vid, fdb->addr);
+	else
+		err = -EOPNOTSUPP;
+
+	return err;
+}
+
+static int dsa_slave_port_fdb_del(struct net_device *dev,
+				  struct switchdev_obj *obj)
+{
+	struct switchdev_obj_fdb *fdb = &obj->u.fdb;
+	struct dsa_slave_priv *p = netdev_priv(dev);
+	struct dsa_switch *ds = p->parent;
+
+	if (!ds->drv->port_fdb_del)
+		return -EOPNOTSUPP;
+
+	return ds->drv->port_fdb_del(ds, p->port, fdb->vid, fdb->addr);
+}
+
+static int dsa_slave_port_fdb_dump(struct net_device *dev,
+				   struct switchdev_obj *obj)
+{
+	struct switchdev_obj_fdb *fdb = &obj->u.fdb;
+	struct dsa_slave_priv *p = netdev_priv(dev);
+	struct dsa_switch *ds = p->parent;
+	int err;
+
+	if (!ds->drv->port_fdb_getnext)
+		return -EOPNOTSUPP;
+
+	memset(fdb, 0, sizeof(*fdb));
+
+	for (;;) {
+		err = ds->drv->port_fdb_getnext(ds, p->port, &fdb->vid,
+						fdb->addr, &fdb->is_static);
+		if (err)
+			break;
+
+		err = obj->cb(dev, obj);
+		if (err)
+			break;
+	}
+
+	return err == -ENOENT ? 0 : err;
+}
+
+static int dsa_slave_port_obj_add(struct net_device *dev,
+				  struct switchdev_obj *obj)
+{
+	int err;
+
+	switch (obj->id) {
+	case SWITCHDEV_OBJ_PORT_FDB:
+		err = dsa_slave_port_fdb_add(dev, obj);
+		break;
+	default:
+		err = -EOPNOTSUPP;
+		break;
+	}
+
+	return err;
+}
+
+static int dsa_slave_port_obj_del(struct net_device *dev,
+				  struct switchdev_obj *obj)
+{
+	int err;
+
+	switch (obj->id) {
+	case SWITCHDEV_OBJ_PORT_FDB:
+		err = dsa_slave_port_fdb_del(dev, obj);
+		break;
+	default:
+		err = -EOPNOTSUPP;
+		break;
+	}
+
+	return err;
+}
+
+static int dsa_slave_port_obj_dump(struct net_device *dev,
+				   struct switchdev_obj *obj)
+{
+	int err;
+
+	switch (obj->id) {
+	case SWITCHDEV_OBJ_PORT_FDB:
+		err = dsa_slave_port_fdb_dump(dev, obj);
+		break;
+	default:
+		err = -EOPNOTSUPP;
+		break;
+	}
+
+	return err;
+}
+
 static int dsa_slave_bridge_port_join(struct net_device *dev,
 				      struct net_device *br)
 {
@@ -765,9 +776,9 @@ static const struct net_device_ops dsa_slave_netdev_ops = {
 	.ndo_change_rx_flags	= dsa_slave_change_rx_flags,
 	.ndo_set_rx_mode	= dsa_slave_set_rx_mode,
 	.ndo_set_mac_address	= dsa_slave_set_mac_address,
-	.ndo_fdb_add		= dsa_slave_fdb_add,
-	.ndo_fdb_del		= dsa_slave_fdb_del,
-	.ndo_fdb_dump		= dsa_slave_fdb_dump,
+	.ndo_fdb_add		= switchdev_port_fdb_add,
+	.ndo_fdb_del		= switchdev_port_fdb_del,
+	.ndo_fdb_dump		= switchdev_port_fdb_dump,
 	.ndo_do_ioctl		= dsa_slave_ioctl,
 	.ndo_get_iflink		= dsa_slave_get_iflink,
 #ifdef CONFIG_NET_POLL_CONTROLLER
@@ -780,6 +791,9 @@ static const struct net_device_ops dsa_slave_netdev_ops = {
 static const struct switchdev_ops dsa_slave_switchdev_ops = {
 	.switchdev_port_attr_get	= dsa_slave_port_attr_get,
 	.switchdev_port_attr_set	= dsa_slave_port_attr_set,
+	.switchdev_port_obj_add		= dsa_slave_port_obj_add,
+	.switchdev_port_obj_del		= dsa_slave_port_obj_del,
+	.switchdev_port_obj_dump	= dsa_slave_port_obj_dump,
 };
 
 static void dsa_slave_adjust_link(struct net_device *dev)
-- 
cgit v1.2.3


From 51e0e5d8124ece158927a4c2288c0929d3b53aa3 Mon Sep 17 00:00:00 2001
From: Alexander Aring <alex.aring@gmail.com>
Date: Mon, 10 Aug 2015 21:15:53 +0200
Subject: ieee802154: 6lowpan: remove multiple lowpan per wpan support

We currently supports multiple lowpan interfaces per wpan interface. I
never saw any use case into such functionality. We drop this feature now
because it's much easier do deal with address changes inside the under
laying wpan interface.

This patch removes the multiple lowpan interface and adds a lowpan_dev
netdev pointer into the wpan_dev, if this pointer isn't null the wpan
interface belongs to the assigned lowpan interface.

Reviewed-by: Stefan Schmidt <stefan@osg.samsung.com>
Tested-by: Stefan Schmidt <stefan@osg.samsung.com>
Signed-off-by: Alexander Aring <alex.aring@gmail.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/net/cfg802154.h            |  3 ++
 net/ieee802154/6lowpan/6lowpan_i.h |  8 -----
 net/ieee802154/6lowpan/core.c      | 67 +++++++++++---------------------------
 net/ieee802154/6lowpan/rx.c        | 38 +++++----------------
 4 files changed, 30 insertions(+), 86 deletions(-)

(limited to 'include/net')

diff --git a/include/net/cfg802154.h b/include/net/cfg802154.h
index 382f94b59f2f..e53b6bfda976 100644
--- a/include/net/cfg802154.h
+++ b/include/net/cfg802154.h
@@ -173,6 +173,9 @@ struct wpan_dev {
 	struct list_head list;
 	struct net_device *netdev;
 
+	/* lowpan interface, set when the wpan_dev belongs to one lowpan_dev */
+	struct net_device *lowpan_dev;
+
 	u32 identifier;
 
 	/* MAC PIB */
diff --git a/net/ieee802154/6lowpan/6lowpan_i.h b/net/ieee802154/6lowpan/6lowpan_i.h
index e50f69da78eb..923b680adb61 100644
--- a/net/ieee802154/6lowpan/6lowpan_i.h
+++ b/net/ieee802154/6lowpan/6lowpan_i.h
@@ -37,15 +37,9 @@ static inline u32 ieee802154_addr_hash(const struct ieee802154_addr *a)
 	}
 }
 
-struct lowpan_dev_record {
-	struct net_device *ldev;
-	struct list_head list;
-};
-
 /* private device info */
 struct lowpan_dev_info {
 	struct net_device	*real_dev; /* real WPAN device ptr */
-	struct mutex		dev_list_mtx; /* mutex for list ops */
 	u16			fragment_tag;
 };
 
@@ -55,8 +49,6 @@ lowpan_dev_info *lowpan_dev_info(const struct net_device *dev)
 	return netdev_priv(dev);
 }
 
-extern struct list_head lowpan_devices;
-
 int lowpan_frag_rcv(struct sk_buff *skb, const u8 frag_type);
 void lowpan_net_frag_exit(void);
 int lowpan_net_frag_init(void);
diff --git a/net/ieee802154/6lowpan/core.c b/net/ieee802154/6lowpan/core.c
index f20a387a1011..a4edee8fdc79 100644
--- a/net/ieee802154/6lowpan/core.c
+++ b/net/ieee802154/6lowpan/core.c
@@ -52,9 +52,6 @@
 
 #include "6lowpan_i.h"
 
-LIST_HEAD(lowpan_devices);
-static int lowpan_open_count;
-
 static struct header_ops lowpan_header_ops = {
 	.create	= lowpan_header_create,
 };
@@ -114,7 +111,6 @@ static int lowpan_newlink(struct net *src_net, struct net_device *dev,
 			  struct nlattr *tb[], struct nlattr *data[])
 {
 	struct net_device *real_dev;
-	struct lowpan_dev_record *entry;
 	int ret;
 
 	ASSERT_RTNL();
@@ -133,31 +129,19 @@ static int lowpan_newlink(struct net *src_net, struct net_device *dev,
 		return -EINVAL;
 	}
 
-	lowpan_dev_info(dev)->real_dev = real_dev;
-	mutex_init(&lowpan_dev_info(dev)->dev_list_mtx);
-
-	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
-	if (!entry) {
+	if (real_dev->ieee802154_ptr->lowpan_dev) {
 		dev_put(real_dev);
-		lowpan_dev_info(dev)->real_dev = NULL;
-		return -ENOMEM;
+		return -EBUSY;
 	}
 
-	entry->ldev = dev;
-
+	lowpan_dev_info(dev)->real_dev = real_dev;
 	/* Set the lowpan hardware address to the wpan hardware address. */
 	memcpy(dev->dev_addr, real_dev->dev_addr, IEEE802154_ADDR_LEN);
 
-	mutex_lock(&lowpan_dev_info(dev)->dev_list_mtx);
-	INIT_LIST_HEAD(&entry->list);
-	list_add_tail(&entry->list, &lowpan_devices);
-	mutex_unlock(&lowpan_dev_info(dev)->dev_list_mtx);
-
 	ret = register_netdevice(dev);
 	if (ret >= 0) {
-		if (!lowpan_open_count)
-			lowpan_rx_init();
-		lowpan_open_count++;
+		real_dev->ieee802154_ptr->lowpan_dev = dev;
+		lowpan_rx_init();
 	}
 
 	return ret;
@@ -167,27 +151,12 @@ static void lowpan_dellink(struct net_device *dev, struct list_head *head)
 {
 	struct lowpan_dev_info *lowpan_dev = lowpan_dev_info(dev);
 	struct net_device *real_dev = lowpan_dev->real_dev;
-	struct lowpan_dev_record *entry, *tmp;
 
 	ASSERT_RTNL();
 
-	lowpan_open_count--;
-	if (!lowpan_open_count)
-		lowpan_rx_exit();
-
-	mutex_lock(&lowpan_dev_info(dev)->dev_list_mtx);
-	list_for_each_entry_safe(entry, tmp, &lowpan_devices, list) {
-		if (entry->ldev == dev) {
-			list_del(&entry->list);
-			kfree(entry);
-		}
-	}
-	mutex_unlock(&lowpan_dev_info(dev)->dev_list_mtx);
-
-	mutex_destroy(&lowpan_dev_info(dev)->dev_list_mtx);
-
-	unregister_netdevice_queue(dev, head);
-
+	lowpan_rx_exit();
+	real_dev->ieee802154_ptr->lowpan_dev = NULL;
+	unregister_netdevice(dev);
 	dev_put(real_dev);
 }
 
@@ -214,19 +183,21 @@ static int lowpan_device_event(struct notifier_block *unused,
 			       unsigned long event, void *ptr)
 {
 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
-	LIST_HEAD(del_list);
-	struct lowpan_dev_record *entry, *tmp;
 
 	if (dev->type != ARPHRD_IEEE802154)
 		goto out;
 
-	if (event == NETDEV_UNREGISTER) {
-		list_for_each_entry_safe(entry, tmp, &lowpan_devices, list) {
-			if (lowpan_dev_info(entry->ldev)->real_dev == dev)
-				lowpan_dellink(entry->ldev, &del_list);
-		}
-
-		unregister_netdevice_many(&del_list);
+	switch (event) {
+	case NETDEV_UNREGISTER:
+		/* Check if wpan interface is unregistered that we
+		 * also delete possible lowpan interfaces which belongs
+		 * to the wpan interface.
+		 */
+		if (dev->ieee802154_ptr && dev->ieee802154_ptr->lowpan_dev)
+			lowpan_dellink(dev->ieee802154_ptr->lowpan_dev, NULL);
+		break;
+	default:
+		break;
 	}
 
 out:
diff --git a/net/ieee802154/6lowpan/rx.c b/net/ieee802154/6lowpan/rx.c
index 4be1d289ab2d..d6f5e8ee6fda 100644
--- a/net/ieee802154/6lowpan/rx.c
+++ b/net/ieee802154/6lowpan/rx.c
@@ -15,36 +15,14 @@
 
 #include "6lowpan_i.h"
 
-static int lowpan_give_skb_to_devices(struct sk_buff *skb,
-				      struct net_device *dev)
+static int lowpan_give_skb_to_device(struct sk_buff *skb,
+				     struct net_device *dev)
 {
-	struct lowpan_dev_record *entry;
-	struct sk_buff *skb_cp;
-	int stat = NET_RX_SUCCESS;
-
+	skb->dev = dev->ieee802154_ptr->lowpan_dev;
 	skb->protocol = htons(ETH_P_IPV6);
 	skb->pkt_type = PACKET_HOST;
 
-	rcu_read_lock();
-	list_for_each_entry_rcu(entry, &lowpan_devices, list)
-		if (lowpan_dev_info(entry->ldev)->real_dev == skb->dev) {
-			skb_cp = skb_copy(skb, GFP_ATOMIC);
-			if (!skb_cp) {
-				kfree_skb(skb);
-				rcu_read_unlock();
-				return NET_RX_DROP;
-			}
-
-			skb_cp->dev = entry->ldev;
-			stat = netif_rx(skb_cp);
-			if (stat == NET_RX_DROP)
-				break;
-		}
-	rcu_read_unlock();
-
-	consume_skb(skb);
-
-	return stat;
+	return netif_rx(skb);
 }
 
 static int
@@ -109,7 +87,7 @@ static int lowpan_rcv(struct sk_buff *skb, struct net_device *dev,
 	if (skb->data[0] == LOWPAN_DISPATCH_IPV6) {
 		/* Pull off the 1-byte of 6lowpan header. */
 		skb_pull(skb, 1);
-		return lowpan_give_skb_to_devices(skb, NULL);
+		return lowpan_give_skb_to_device(skb, dev);
 	} else {
 		switch (skb->data[0] & 0xe0) {
 		case LOWPAN_DISPATCH_IPHC:	/* ipv6 datagram */
@@ -117,7 +95,7 @@ static int lowpan_rcv(struct sk_buff *skb, struct net_device *dev,
 			if (ret < 0)
 				goto drop_skb;
 
-			return lowpan_give_skb_to_devices(skb, NULL);
+			return lowpan_give_skb_to_device(skb, dev);
 		case LOWPAN_DISPATCH_FRAG1:	/* first fragment header */
 			ret = lowpan_frag_rcv(skb, LOWPAN_DISPATCH_FRAG1);
 			if (ret == 1) {
@@ -125,7 +103,7 @@ static int lowpan_rcv(struct sk_buff *skb, struct net_device *dev,
 				if (ret < 0)
 					goto drop_skb;
 
-				return lowpan_give_skb_to_devices(skb, NULL);
+				return lowpan_give_skb_to_device(skb, dev);
 			} else if (ret == -1) {
 				return NET_RX_DROP;
 			} else {
@@ -138,7 +116,7 @@ static int lowpan_rcv(struct sk_buff *skb, struct net_device *dev,
 				if (ret < 0)
 					goto drop_skb;
 
-				return lowpan_give_skb_to_devices(skb, NULL);
+				return lowpan_give_skb_to_device(skb, dev);
 			} else if (ret == -1) {
 				return NET_RX_DROP;
 			} else {
-- 
cgit v1.2.3


From c91208d819c814e7f418c7a083059cf533ad0396 Mon Sep 17 00:00:00 2001
From: Alexander Aring <alex.aring@gmail.com>
Date: Mon, 10 Aug 2015 21:15:58 +0200
Subject: ieee802154: add ack request default handling

This patch introduce a new mib entry which isn't part of 802.15.4 but
useful as default behaviour to set the ack request bit or not if we
don't know if the ack request bit should set. This is currently used for
stacks like IEEE 802.15.4 6LoWPAN.

Reviewed-by: Stefan Schmidt <stefan@osg.samsung.com>
Signed-off-by: Alexander Aring <alex.aring@gmail.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/net/cfg802154.h     |  5 +++++
 include/net/nl802154.h      |  4 ++++
 net/ieee802154/6lowpan/tx.c |  2 +-
 net/ieee802154/nl802154.c   | 33 +++++++++++++++++++++++++++++++++
 net/ieee802154/rdev-ops.h   | 13 +++++++++++++
 net/ieee802154/trace.h      | 19 +++++++++++++++++++
 net/mac802154/cfg.c         | 11 +++++++++++
 7 files changed, 86 insertions(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/cfg802154.h b/include/net/cfg802154.h
index e53b6bfda976..76b1ffaea863 100644
--- a/include/net/cfg802154.h
+++ b/include/net/cfg802154.h
@@ -63,6 +63,8 @@ struct cfg802154_ops {
 					 s8 max_frame_retries);
 	int	(*set_lbt_mode)(struct wpan_phy *wpan_phy,
 				struct wpan_dev *wpan_dev, bool mode);
+	int	(*set_ackreq_default)(struct wpan_phy *wpan_phy,
+				      struct wpan_dev *wpan_dev, bool ackreq);
 };
 
 static inline bool
@@ -196,6 +198,9 @@ struct wpan_dev {
 	bool lbt;
 
 	bool promiscuous_mode;
+
+	/* fallback for acknowledgment bit setting */
+	bool ackreq;
 };
 
 #define to_phy(_dev)	container_of(_dev, struct wpan_phy, dev)
diff --git a/include/net/nl802154.h b/include/net/nl802154.h
index b0ab530d28cd..cf2713d8b975 100644
--- a/include/net/nl802154.h
+++ b/include/net/nl802154.h
@@ -52,6 +52,8 @@ enum nl802154_commands {
 
 	NL802154_CMD_SET_LBT_MODE,
 
+	NL802154_CMD_SET_ACKREQ_DEFAULT,
+
 	/* add new commands above here */
 
 	/* used to define NL802154_CMD_MAX below */
@@ -104,6 +106,8 @@ enum nl802154_attrs {
 
 	NL802154_ATTR_SUPPORTED_COMMANDS,
 
+	NL802154_ATTR_ACKREQ_DEFAULT,
+
 	/* add attributes here, update the policy in nl802154.c */
 
 	__NL802154_ATTR_AFTER_LAST,
diff --git a/net/ieee802154/6lowpan/tx.c b/net/ieee802154/6lowpan/tx.c
index 2597abbf7f4b..1bf4a304b5c4 100644
--- a/net/ieee802154/6lowpan/tx.c
+++ b/net/ieee802154/6lowpan/tx.c
@@ -224,7 +224,7 @@ static int lowpan_header(struct sk_buff *skb, struct net_device *dev)
 	} else {
 		da.mode = IEEE802154_ADDR_LONG;
 		da.extended_addr = ieee802154_devaddr_from_raw(daddr);
-		cb->ackreq = wpan_dev->frame_retries >= 0;
+		cb->ackreq = wpan_dev->ackreq;
 	}
 
 	return dev_hard_header(skb, lowpan_dev_info(dev)->real_dev,
diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c
index 68f24016860c..1b00a14850cb 100644
--- a/net/ieee802154/nl802154.c
+++ b/net/ieee802154/nl802154.c
@@ -230,6 +230,8 @@ static const struct nla_policy nl802154_policy[NL802154_ATTR_MAX+1] = {
 	[NL802154_ATTR_WPAN_PHY_CAPS] = { .type = NLA_NESTED },
 
 	[NL802154_ATTR_SUPPORTED_COMMANDS] = { .type = NLA_NESTED },
+
+	[NL802154_ATTR_ACKREQ_DEFAULT] = { .type = NLA_U8 },
 };
 
 /* message building helper */
@@ -458,6 +460,7 @@ static int nl802154_send_wpan_phy(struct cfg802154_registered_device *rdev,
 	CMD(set_max_csma_backoffs, SET_MAX_CSMA_BACKOFFS);
 	CMD(set_max_frame_retries, SET_MAX_FRAME_RETRIES);
 	CMD(set_lbt_mode, SET_LBT_MODE);
+	CMD(set_ackreq_default, SET_ACKREQ_DEFAULT);
 
 	if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_TXPOWER)
 		CMD(set_tx_power, SET_TX_POWER);
@@ -656,6 +659,10 @@ nl802154_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flags,
 	if (nla_put_u8(msg, NL802154_ATTR_LBT_MODE, wpan_dev->lbt))
 		goto nla_put_failure;
 
+	/* ackreq default behaviour */
+	if (nla_put_u8(msg, NL802154_ATTR_ACKREQ_DEFAULT, wpan_dev->ackreq))
+		goto nla_put_failure;
+
 	genlmsg_end(msg, hdr);
 	return 0;
 
@@ -1042,6 +1049,24 @@ static int nl802154_set_lbt_mode(struct sk_buff *skb, struct genl_info *info)
 	return rdev_set_lbt_mode(rdev, wpan_dev, mode);
 }
 
+static int
+nl802154_set_ackreq_default(struct sk_buff *skb, struct genl_info *info)
+{
+	struct cfg802154_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
+	struct wpan_dev *wpan_dev = dev->ieee802154_ptr;
+	bool ackreq;
+
+	if (netif_running(dev))
+		return -EBUSY;
+
+	if (!info->attrs[NL802154_ATTR_ACKREQ_DEFAULT])
+		return -EINVAL;
+
+	ackreq = !!nla_get_u8(info->attrs[NL802154_ATTR_ACKREQ_DEFAULT]);
+	return rdev_set_ackreq_default(rdev, wpan_dev, ackreq);
+}
+
 #define NL802154_FLAG_NEED_WPAN_PHY	0x01
 #define NL802154_FLAG_NEED_NETDEV	0x02
 #define NL802154_FLAG_NEED_RTNL		0x04
@@ -1248,6 +1273,14 @@ static const struct genl_ops nl802154_ops[] = {
 		.internal_flags = NL802154_FLAG_NEED_NETDEV |
 				  NL802154_FLAG_NEED_RTNL,
 	},
+	{
+		.cmd = NL802154_CMD_SET_ACKREQ_DEFAULT,
+		.doit = nl802154_set_ackreq_default,
+		.policy = nl802154_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL802154_FLAG_NEED_NETDEV |
+				  NL802154_FLAG_NEED_RTNL,
+	},
 };
 
 /* initialisation/exit functions */
diff --git a/net/ieee802154/rdev-ops.h b/net/ieee802154/rdev-ops.h
index 8d5960a37195..03b357501cc5 100644
--- a/net/ieee802154/rdev-ops.h
+++ b/net/ieee802154/rdev-ops.h
@@ -195,4 +195,17 @@ rdev_set_lbt_mode(struct cfg802154_registered_device *rdev,
 	return ret;
 }
 
+static inline int
+rdev_set_ackreq_default(struct cfg802154_registered_device *rdev,
+			struct wpan_dev *wpan_dev, bool ackreq)
+{
+	int ret;
+
+	trace_802154_rdev_set_ackreq_default(&rdev->wpan_phy, wpan_dev,
+					     ackreq);
+	ret = rdev->ops->set_ackreq_default(&rdev->wpan_phy, wpan_dev, ackreq);
+	trace_802154_rdev_return_int(&rdev->wpan_phy, ret);
+	return ret;
+}
+
 #endif /* __CFG802154_RDEV_OPS */
diff --git a/net/ieee802154/trace.h b/net/ieee802154/trace.h
index 4399b7fbaa31..9a471e41ec73 100644
--- a/net/ieee802154/trace.h
+++ b/net/ieee802154/trace.h
@@ -275,6 +275,25 @@ TRACE_EVENT(802154_rdev_set_lbt_mode,
 		WPAN_DEV_PR_ARG, BOOL_TO_STR(__entry->mode))
 );
 
+TRACE_EVENT(802154_rdev_set_ackreq_default,
+	TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
+		 bool ackreq),
+	TP_ARGS(wpan_phy, wpan_dev, ackreq),
+	TP_STRUCT__entry(
+		WPAN_PHY_ENTRY
+		WPAN_DEV_ENTRY
+		__field(bool, ackreq)
+	),
+	TP_fast_assign(
+		WPAN_PHY_ASSIGN;
+		WPAN_DEV_ASSIGN;
+		__entry->ackreq = ackreq;
+	),
+	TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT
+		", ackreq default: %s", WPAN_PHY_PR_ARG,
+		WPAN_DEV_PR_ARG, BOOL_TO_STR(__entry->ackreq))
+);
+
 TRACE_EVENT(802154_rdev_return_int,
 	TP_PROTO(struct wpan_phy *wpan_phy, int ret),
 	TP_ARGS(wpan_phy, ret),
diff --git a/net/mac802154/cfg.c b/net/mac802154/cfg.c
index cecfcda09aac..c865ebb2ace2 100644
--- a/net/mac802154/cfg.c
+++ b/net/mac802154/cfg.c
@@ -256,6 +256,16 @@ ieee802154_set_lbt_mode(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
 	return 0;
 }
 
+static int
+ieee802154_set_ackreq_default(struct wpan_phy *wpan_phy,
+			      struct wpan_dev *wpan_dev, bool ackreq)
+{
+	ASSERT_RTNL();
+
+	wpan_dev->ackreq = ackreq;
+	return 0;
+}
+
 const struct cfg802154_ops mac802154_config_ops = {
 	.add_virtual_intf_deprecated = ieee802154_add_iface_deprecated,
 	.del_virtual_intf_deprecated = ieee802154_del_iface_deprecated,
@@ -273,4 +283,5 @@ const struct cfg802154_ops mac802154_config_ops = {
 	.set_max_csma_backoffs = ieee802154_set_max_csma_backoffs,
 	.set_max_frame_retries = ieee802154_set_max_frame_retries,
 	.set_lbt_mode = ieee802154_set_lbt_mode,
+	.set_ackreq_default = ieee802154_set_ackreq_default,
 };
-- 
cgit v1.2.3


From 158e92185075184ebc5f25bab61fdd598693e28d Mon Sep 17 00:00:00 2001
From: Jakub Pawlowski <jpawlowski@google.com>
Date: Fri, 7 Aug 2015 20:22:51 +0200
Subject: Bluetooth: preparation for new connect procedure

Currently, when trying to connect to already paired device that just
rotated its RPA MAC address, old address would be used and connection
would fail. In order to fix that, kernel must scan and receive
advertisement with fresh RPA before connecting.

This patch adds some fields to hci_conn_params, in preparation to new
connect procedure.

explicit_connect will be used to override any current auto_connect action,
and connect to device when ad is received.

HCI_AUTO_CONN_EXPLICIT was added to auto_connect enum. When this value
will be used, explicit connect is the only action, and params can be
removed after successful connection.

HCI_CONN_SCANNING is added to hci_conn flags. When it's set, connect is
scan phase. It gets cleared when advertisement is received, and
HCI_OP_LE_CREATE_CONN is sent.

Signed-off-by: Jakub Pawlowski <jpawlowski@google.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/net/bluetooth/hci_core.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/net')

diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index 2a6b0919e23f..c8d2b5a89d08 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -512,9 +512,11 @@ struct hci_conn_params {
 		HCI_AUTO_CONN_DIRECT,
 		HCI_AUTO_CONN_ALWAYS,
 		HCI_AUTO_CONN_LINK_LOSS,
+		HCI_AUTO_CONN_EXPLICIT,
 	} auto_connect;
 
 	struct hci_conn *conn;
+	bool explicit_connect;
 };
 
 extern struct list_head hci_dev_list;
@@ -639,6 +641,7 @@ enum {
 	HCI_CONN_DROP,
 	HCI_CONN_PARAM_REMOVAL_PEND,
 	HCI_CONN_NEW_LINK_KEY,
+	HCI_CONN_SCANNING,
 };
 
 static inline bool hci_conn_ssp_enabled(struct hci_conn *conn)
-- 
cgit v1.2.3


From e7d9ab731ac7babaf2e1b7b5e2280f5f555d263f Mon Sep 17 00:00:00 2001
From: Jakub Pawlowski <jpawlowski@google.com>
Date: Fri, 7 Aug 2015 20:22:52 +0200
Subject: Bluetooth: add hci_lookup_le_connect

This patch adds hci_lookup_le_connect method, that will be used to check
wether outgoing le connection attempt is in progress.

Signed-off-by: Jakub Pawlowski <jpawlowski@google.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/net/bluetooth/hci_core.h | 20 ++++++++++++++++++++
 net/bluetooth/hci_conn.c         |  5 ++---
 net/bluetooth/hci_event.c        |  4 ++--
 net/bluetooth/hci_request.c      |  6 ++----
 net/bluetooth/mgmt.c             |  2 +-
 5 files changed, 27 insertions(+), 10 deletions(-)

(limited to 'include/net')

diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index c8d2b5a89d08..f0a9fc1d06e0 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -811,6 +811,26 @@ static inline struct hci_conn *hci_conn_hash_lookup_state(struct hci_dev *hdev,
 	return NULL;
 }
 
+static inline struct hci_conn *hci_lookup_le_connect(struct hci_dev *hdev)
+{
+	struct hci_conn_hash *h = &hdev->conn_hash;
+	struct hci_conn  *c;
+
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(c, &h->list, list) {
+		if (c->type == LE_LINK && c->state == BT_CONNECT &&
+		    !test_bit(HCI_CONN_SCANNING, &c->flags)) {
+			rcu_read_unlock();
+			return c;
+		}
+	}
+
+	rcu_read_unlock();
+
+	return NULL;
+}
+
 int hci_disconnect(struct hci_conn *conn, __u8 reason);
 bool hci_setup_sync(struct hci_conn *conn, __u16 handle);
 void hci_sco_setup(struct hci_conn *conn, __u8 status);
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index 2c48bf0b5afb..0b4d919c8d96 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -645,7 +645,7 @@ static void create_le_conn_complete(struct hci_dev *hdev, u8 status, u16 opcode)
 
 	hci_dev_lock(hdev);
 
-	conn = hci_conn_hash_lookup_state(hdev, LE_LINK, BT_CONNECT);
+	conn = hci_lookup_le_connect(hdev);
 	if (!conn)
 		goto done;
 
@@ -759,8 +759,7 @@ struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst,
 	/* Since the controller supports only one LE connection attempt at a
 	 * time, we return -EBUSY if there is any connection attempt running.
 	 */
-	conn = hci_conn_hash_lookup_state(hdev, LE_LINK, BT_CONNECT);
-	if (conn)
+	if (hci_lookup_le_connect(hdev))
 		return ERR_PTR(-EBUSY);
 
 	/* When given an identity address with existing identity
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index 218d7dfc342f..128c5b70ee5e 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -1059,7 +1059,7 @@ static void hci_cc_le_set_adv_enable(struct hci_dev *hdev, struct sk_buff *skb)
 
 		hci_dev_set_flag(hdev, HCI_LE_ADV);
 
-		conn = hci_conn_hash_lookup_state(hdev, LE_LINK, BT_CONNECT);
+		conn = hci_lookup_le_connect(hdev);
 		if (conn)
 			queue_delayed_work(hdev->workqueue,
 					   &conn->le_conn_timeout,
@@ -4447,7 +4447,7 @@ static void hci_le_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
 	 */
 	hci_dev_clear_flag(hdev, HCI_LE_ADV);
 
-	conn = hci_conn_hash_lookup_state(hdev, LE_LINK, BT_CONNECT);
+	conn = hci_lookup_le_connect(hdev);
 	if (!conn) {
 		conn = hci_conn_add(hdev, LE_LINK, &ev->bdaddr, ev->role);
 		if (!conn) {
diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c
index d6025d6e6d59..b7369220c9ef 100644
--- a/net/bluetooth/hci_request.c
+++ b/net/bluetooth/hci_request.c
@@ -317,7 +317,7 @@ static void set_random_addr(struct hci_request *req, bdaddr_t *rpa)
 	 * address be updated at the next cycle.
 	 */
 	if (hci_dev_test_flag(hdev, HCI_LE_ADV) ||
-	    hci_conn_hash_lookup_state(hdev, LE_LINK, BT_CONNECT)) {
+	    hci_lookup_le_connect(hdev)) {
 		BT_DBG("Deferring random address update");
 		hci_dev_set_flag(hdev, HCI_RPA_EXPIRED);
 		return;
@@ -479,7 +479,6 @@ void hci_update_page_scan(struct hci_dev *hdev)
 void __hci_update_background_scan(struct hci_request *req)
 {
 	struct hci_dev *hdev = req->hdev;
-	struct hci_conn *conn;
 
 	if (!test_bit(HCI_UP, &hdev->flags) ||
 	    test_bit(HCI_INIT, &hdev->flags) ||
@@ -529,8 +528,7 @@ void __hci_update_background_scan(struct hci_request *req)
 		 * since some controllers are not able to scan and connect at
 		 * the same time.
 		 */
-		conn = hci_conn_hash_lookup_state(hdev, LE_LINK, BT_CONNECT);
-		if (conn)
+		if (hci_lookup_le_connect(hdev))
 			return;
 
 		/* If controller is currently scanning, we stop it to ensure we
diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
index 7ab191589541..35418bbe6b15 100644
--- a/net/bluetooth/mgmt.c
+++ b/net/bluetooth/mgmt.c
@@ -4210,7 +4210,7 @@ static bool trigger_le_scan(struct hci_request *req, u16 interval, u8 *status)
 		/* Don't let discovery abort an outgoing connection attempt
 		 * that's using directed advertising.
 		 */
-		if (hci_conn_hash_lookup_state(hdev, LE_LINK, BT_CONNECT)) {
+		if (hci_lookup_le_connect(hdev)) {
 			*status = MGMT_STATUS_REJECTED;
 			return false;
 		}
-- 
cgit v1.2.3


From f75113a26008980ca13834fb6573145523596776 Mon Sep 17 00:00:00 2001
From: Jakub Pawlowski <jpawlowski@google.com>
Date: Fri, 7 Aug 2015 20:22:53 +0200
Subject: Bluetooth: add hci_connect_le_scan

Currently, when trying to connect to already paired device that just
rotated its RPA MAC address, old address would be used and connection
would fail. In order to fix that, kernel must scan and receive
advertisement with fresh RPA before connecting.

This patch adds hci_connect_le_scan with dependencies, new method that
will be used to connect to remote LE devices. Instead of just sending
connect request, it adds a device to whitelist. Later patches will make
use of this whitelist to send conenct request when advertisement is
received, and properly handle timeouts.

Signed-off-by: Jakub Pawlowski <jpawlowski@google.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/net/bluetooth/hci_core.h |   6 ++
 net/bluetooth/hci_conn.c         | 174 +++++++++++++++++++++++++++++++++++++++
 net/bluetooth/hci_core.c         |  33 ++++++++
 3 files changed, 213 insertions(+)

(limited to 'include/net')

diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index f0a9fc1d06e0..9e1a59e01fa2 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -846,6 +846,9 @@ void hci_chan_del(struct hci_chan *chan);
 void hci_chan_list_flush(struct hci_conn *conn);
 struct hci_chan *hci_chan_lookup_handle(struct hci_dev *hdev, __u16 handle);
 
+struct hci_conn *hci_connect_le_scan(struct hci_dev *hdev, bdaddr_t *dst,
+				     u8 dst_type, u8 sec_level,
+				     u16 conn_timeout, u8 role);
 struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst,
 				u8 dst_type, u8 sec_level, u16 conn_timeout,
 				u8 role);
@@ -1011,6 +1014,9 @@ void hci_conn_params_clear_disabled(struct hci_dev *hdev);
 struct hci_conn_params *hci_pend_le_action_lookup(struct list_head *list,
 						  bdaddr_t *addr,
 						  u8 addr_type);
+struct hci_conn_params *hci_explicit_connect_lookup(struct hci_dev *hdev,
+						    bdaddr_t *addr,
+						    u8 addr_type);
 
 void hci_uuids_clear(struct hci_dev *hdev);
 
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index 0b4d919c8d96..534feb7956a3 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -64,6 +64,48 @@ static void hci_le_create_connection_cancel(struct hci_conn *conn)
 	hci_send_cmd(conn->hdev, HCI_OP_LE_CREATE_CONN_CANCEL, 0, NULL);
 }
 
+/* This function requires the caller holds hdev->lock */
+static void hci_connect_le_scan_cleanup(struct hci_conn *conn)
+{
+	struct hci_conn_params *params;
+	struct smp_irk *irk;
+	bdaddr_t *bdaddr;
+	u8 bdaddr_type;
+
+	bdaddr = &conn->dst;
+	bdaddr_type = conn->dst_type;
+
+	/* Check if we need to convert to identity address */
+	irk = hci_get_irk(conn->hdev, bdaddr, bdaddr_type);
+	if (irk) {
+		bdaddr = &irk->bdaddr;
+		bdaddr_type = irk->addr_type;
+	}
+
+	params = hci_explicit_connect_lookup(conn->hdev, bdaddr, bdaddr_type);
+	if (!params)
+		return;
+
+	/* The connection attempt was doing scan for new RPA, and is
+	 * in scan phase. If params are not associated with any other
+	 * autoconnect action, remove them completely. If they are, just unmark
+	 * them as waiting for connection, by clearing explicit_connect field.
+	 */
+	if (params->auto_connect == HCI_AUTO_CONN_EXPLICIT)
+		hci_conn_params_del(conn->hdev, bdaddr, bdaddr_type);
+	else
+		params->explicit_connect = false;
+}
+
+/* This function requires the caller holds hdev->lock */
+static void hci_connect_le_scan_remove(struct hci_conn *conn)
+{
+	hci_connect_le_scan_cleanup(conn);
+
+	hci_conn_hash_del(conn->hdev, conn);
+	hci_update_background_scan(conn->hdev);
+}
+
 static void hci_acl_create_connection(struct hci_conn *conn)
 {
 	struct hci_dev *hdev = conn->hdev;
@@ -858,6 +900,138 @@ done:
 	return conn;
 }
 
+static void hci_connect_le_scan_complete(struct hci_dev *hdev, u8 status,
+					 u16 opcode)
+{
+	struct hci_conn *conn;
+
+	if (!status)
+		return;
+
+	BT_ERR("Failed to add device to auto conn whitelist: status 0x%2.2x",
+	       status);
+
+	hci_dev_lock(hdev);
+
+	conn = hci_conn_hash_lookup_state(hdev, LE_LINK, BT_CONNECT);
+	if (conn)
+		hci_le_conn_failed(conn, status);
+
+	hci_dev_unlock(hdev);
+}
+
+static bool is_connected(struct hci_dev *hdev, bdaddr_t *addr, u8 type)
+{
+	struct hci_conn *conn;
+
+	conn = hci_conn_hash_lookup_ba(hdev, LE_LINK, addr);
+	if (!conn)
+		return false;
+
+	if (conn->dst_type != type)
+		return false;
+
+	if (conn->state != BT_CONNECTED)
+		return false;
+
+	return true;
+}
+
+/* This function requires the caller holds hdev->lock */
+static int hci_explicit_conn_params_set(struct hci_request *req,
+					bdaddr_t *addr, u8 addr_type)
+{
+	struct hci_dev *hdev = req->hdev;
+	struct hci_conn_params *params;
+
+	if (is_connected(hdev, addr, addr_type))
+		return -EISCONN;
+
+	params = hci_conn_params_add(hdev, addr, addr_type);
+	if (!params)
+		return -EIO;
+
+	/* If we created new params, or existing params were marked as disabled,
+	 * mark them to be used just once to connect.
+	 */
+	if (params->auto_connect == HCI_AUTO_CONN_DISABLED) {
+		params->auto_connect = HCI_AUTO_CONN_EXPLICIT;
+		list_del_init(&params->action);
+		list_add(&params->action, &hdev->pend_le_conns);
+	}
+
+	params->explicit_connect = true;
+	__hci_update_background_scan(req);
+
+	BT_DBG("addr %pMR (type %u) auto_connect %u", addr, addr_type,
+	       params->auto_connect);
+
+	return 0;
+}
+
+/* This function requires the caller holds hdev->lock */
+struct hci_conn *hci_connect_le_scan(struct hci_dev *hdev, bdaddr_t *dst,
+				     u8 dst_type, u8 sec_level,
+				     u16 conn_timeout, u8 role)
+{
+	struct hci_conn *conn;
+	struct hci_request req;
+	int err;
+
+	/* Let's make sure that le is enabled.*/
+	if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED)) {
+		if (lmp_le_capable(hdev))
+			return ERR_PTR(-ECONNREFUSED);
+
+		return ERR_PTR(-EOPNOTSUPP);
+	}
+
+	/* Some devices send ATT messages as soon as the physical link is
+	 * established. To be able to handle these ATT messages, the user-
+	 * space first establishes the connection and then starts the pairing
+	 * process.
+	 *
+	 * So if a hci_conn object already exists for the following connection
+	 * attempt, we simply update pending_sec_level and auth_type fields
+	 * and return the object found.
+	 */
+	conn = hci_conn_hash_lookup_ba(hdev, LE_LINK, dst);
+	if (conn) {
+		if (conn->pending_sec_level < sec_level)
+			conn->pending_sec_level = sec_level;
+		goto done;
+	}
+
+	BT_DBG("requesting refresh of dst_addr");
+
+	conn = hci_conn_add(hdev, LE_LINK, dst, role);
+	if (!conn)
+		return ERR_PTR(-ENOMEM);
+
+	hci_req_init(&req, hdev);
+
+	if (hci_explicit_conn_params_set(&req, dst, dst_type) < 0)
+		return ERR_PTR(-EBUSY);
+
+	conn->state = BT_CONNECT;
+	set_bit(HCI_CONN_SCANNING, &conn->flags);
+
+	err = hci_req_run(&req, hci_connect_le_scan_complete);
+	if (err && err != -ENODATA) {
+		hci_conn_del(conn);
+		return ERR_PTR(err);
+	}
+
+	conn->dst_type = dst_type;
+	conn->sec_level = BT_SECURITY_LOW;
+	conn->pending_sec_level = sec_level;
+	conn->conn_timeout = conn_timeout;
+
+done:
+	hci_conn_hold(conn);
+	return conn;
+}
+
 struct hci_conn *hci_connect_acl(struct hci_dev *hdev, bdaddr_t *dst,
 				 u8 sec_level, u8 auth_type)
 {
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index bc43b6490555..adcbc74c2432 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -2847,6 +2847,30 @@ struct hci_conn_params *hci_pend_le_action_lookup(struct list_head *list,
 	return NULL;
 }
 
+/* This function requires the caller holds hdev->lock */
+struct hci_conn_params *hci_explicit_connect_lookup(struct hci_dev *hdev,
+						    bdaddr_t *addr,
+						    u8 addr_type)
+{
+	struct hci_conn_params *param;
+
+	list_for_each_entry(param, &hdev->pend_le_conns, action) {
+		if (bacmp(&param->addr, addr) == 0 &&
+		    param->addr_type == addr_type &&
+		    param->explicit_connect)
+			return param;
+	}
+
+	list_for_each_entry(param, &hdev->pend_le_reports, action) {
+		if (bacmp(&param->addr, addr) == 0 &&
+		    param->addr_type == addr_type &&
+		    param->explicit_connect)
+			return param;
+	}
+
+	return NULL;
+}
+
 /* This function requires the caller holds hdev->lock */
 struct hci_conn_params *hci_conn_params_add(struct hci_dev *hdev,
 					    bdaddr_t *addr, u8 addr_type)
@@ -2916,6 +2940,15 @@ void hci_conn_params_clear_disabled(struct hci_dev *hdev)
 	list_for_each_entry_safe(params, tmp, &hdev->le_conn_params, list) {
 		if (params->auto_connect != HCI_AUTO_CONN_DISABLED)
 			continue;
+
+		/* If trying to estabilish one time connection to disabled
+		 * device, leave the params, but mark them as just once.
+		 */
+		if (params->explicit_connect) {
+			params->auto_connect = HCI_AUTO_CONN_EXPLICIT;
+			continue;
+		}
+
 		list_del(&params->list);
 		kfree(params);
 	}
-- 
cgit v1.2.3


From fb811395cd5a71b9e94a068f524a6f4a21b67bdb Mon Sep 17 00:00:00 2001
From: Rick Jones <rick.jones2@hp.com>
Date: Fri, 7 Aug 2015 11:10:37 -0700
Subject: net: add explicit logging and stat for neighbour table overflow

Add an explicit neighbour table overflow message (ratelimited) and
statistic to make diagnosing neighbour table overflows tractable in
the wild.

Diagnosing a neighbour table overflow can be quite difficult in the wild
because there is no explicit dmesg logged.  Callers to neighbour code
seem to use net_dbg_ratelimit when the neighbour call fails which means
the "base message" is not emitted and the callback suppressed messages
from the ratelimiting can end-up juxtaposed with unrelated messages.
Further, a forced garbage collection will increment a stat on each call
whether it was successful in freeing-up a table entry or not, so that
statistic is only a hint.  So, add a net_info_ratelimited message and
explicit statistic to the neighbour code.

Signed-off-by: Rick Jones <rick.jones2@hp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/neighbour.h        |  1 +
 include/uapi/linux/neighbour.h |  1 +
 net/core/neighbour.c           | 14 ++++++++++----
 3 files changed, 12 insertions(+), 4 deletions(-)

(limited to 'include/net')

diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index bd33e66f49aa..8b683841e574 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -125,6 +125,7 @@ struct neigh_statistics {
 	unsigned long forced_gc_runs;	/* number of forced GC runs */
 
 	unsigned long unres_discards;	/* number of unresolved drops */
+	unsigned long table_fulls;      /* times even gc couldn't help */
 };
 
 #define NEIGH_CACHE_STAT_INC(tbl, field) this_cpu_inc((tbl)->stats->field)
diff --git a/include/uapi/linux/neighbour.h b/include/uapi/linux/neighbour.h
index 2e35c61bbdd1..788655bfa0f3 100644
--- a/include/uapi/linux/neighbour.h
+++ b/include/uapi/linux/neighbour.h
@@ -106,6 +106,7 @@ struct ndt_stats {
 	__u64		ndts_rcv_probes_ucast;
 	__u64		ndts_periodic_gc_runs;
 	__u64		ndts_forced_gc_runs;
+	__u64		ndts_table_fulls;
 };
 
 enum {
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 84195dacb8b6..2b515ba7e94f 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -274,8 +274,12 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device
 	    (entries >= tbl->gc_thresh2 &&
 	     time_after(now, tbl->last_flush + 5 * HZ))) {
 		if (!neigh_forced_gc(tbl) &&
-		    entries >= tbl->gc_thresh3)
+		    entries >= tbl->gc_thresh3) {
+			net_info_ratelimited("%s: neighbor table overflow!\n",
+					     tbl->id);
+			NEIGH_CACHE_STAT_INC(tbl, table_fulls);
 			goto out_entries;
+		}
 	}
 
 	n = kzalloc(tbl->entry_size + dev->neigh_priv_len, GFP_ATOMIC);
@@ -1849,6 +1853,7 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl,
 			ndst.ndts_rcv_probes_ucast	+= st->rcv_probes_ucast;
 			ndst.ndts_periodic_gc_runs	+= st->periodic_gc_runs;
 			ndst.ndts_forced_gc_runs	+= st->forced_gc_runs;
+			ndst.ndts_table_fulls		+= st->table_fulls;
 		}
 
 		if (nla_put(skb, NDTA_STATS, sizeof(ndst), &ndst))
@@ -2717,12 +2722,12 @@ static int neigh_stat_seq_show(struct seq_file *seq, void *v)
 	struct neigh_statistics *st = v;
 
 	if (v == SEQ_START_TOKEN) {
-		seq_printf(seq, "entries  allocs destroys hash_grows  lookups hits  res_failed  rcv_probes_mcast rcv_probes_ucast  periodic_gc_runs forced_gc_runs unresolved_discards\n");
+		seq_printf(seq, "entries  allocs destroys hash_grows  lookups hits  res_failed  rcv_probes_mcast rcv_probes_ucast  periodic_gc_runs forced_gc_runs unresolved_discards table_fulls\n");
 		return 0;
 	}
 
 	seq_printf(seq, "%08x  %08lx %08lx %08lx  %08lx %08lx  %08lx  "
-			"%08lx %08lx  %08lx %08lx %08lx\n",
+			"%08lx %08lx  %08lx %08lx %08lx %08lx\n",
 		   atomic_read(&tbl->entries),
 
 		   st->allocs,
@@ -2739,7 +2744,8 @@ static int neigh_stat_seq_show(struct seq_file *seq, void *v)
 
 		   st->periodic_gc_runs,
 		   st->forced_gc_runs,
-		   st->unres_discards
+		   st->unres_discards,
+		   st->table_fulls
 		   );
 
 	return 0;
-- 
cgit v1.2.3


From 2e15ea390e6f4466655066d97e22ec66870a042c Mon Sep 17 00:00:00 2001
From: Pravin B Shelar <pshelar@nicira.com>
Date: Fri, 7 Aug 2015 23:51:42 -0700
Subject: ip_gre: Add support to collect tunnel metadata.

Following patch create new tunnel flag which enable
tunnel metadata collection on given device.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_tunnels.h       |   7 +-
 include/uapi/linux/if_tunnel.h |   1 +
 net/ipv4/ip_gre.c              | 195 +++++++++++++++++++++++++++++++++++++----
 net/ipv4/ip_tunnel.c           |  37 ++++++--
 net/ipv4/ipip.c                |   2 +-
 net/ipv6/sit.c                 |   2 +-
 6 files changed, 216 insertions(+), 28 deletions(-)

(limited to 'include/net')

diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 47984415f5d1..984dbfa15e13 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -82,6 +82,8 @@ struct ip_tunnel_dst {
 	__be32				 saddr;
 };
 
+struct metadata_dst;
+
 struct ip_tunnel {
 	struct ip_tunnel __rcu	*next;
 	struct hlist_node hash_node;
@@ -115,6 +117,7 @@ struct ip_tunnel {
 	unsigned int		prl_count;	/* # of entries in PRL */
 	int			ip_tnl_net_id;
 	struct gro_cells	gro_cells;
+	bool			collect_md;
 };
 
 #define TUNNEL_CSUM		__cpu_to_be16(0x01)
@@ -149,6 +152,7 @@ struct tnl_ptk_info {
 struct ip_tunnel_net {
 	struct net_device *fb_tunnel_dev;
 	struct hlist_head tunnels[IP_TNL_HASH_SIZE];
+	struct ip_tunnel __rcu *collect_md_tun;
 };
 
 struct ip_tunnel_encap_ops {
@@ -235,7 +239,8 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
 				   __be32 key);
 
 int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
-		  const struct tnl_ptk_info *tpi, bool log_ecn_error);
+		  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
+		  bool log_ecn_error);
 int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
 			 struct ip_tunnel_parm *p);
 int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
diff --git a/include/uapi/linux/if_tunnel.h b/include/uapi/linux/if_tunnel.h
index bd3cc11a431f..af4de90ba27d 100644
--- a/include/uapi/linux/if_tunnel.h
+++ b/include/uapi/linux/if_tunnel.h
@@ -112,6 +112,7 @@ enum {
 	IFLA_GRE_ENCAP_FLAGS,
 	IFLA_GRE_ENCAP_SPORT,
 	IFLA_GRE_ENCAP_DPORT,
+	IFLA_GRE_COLLECT_METADATA,
 	__IFLA_GRE_MAX,
 };
 
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 5fd706473c73..554a760c2cd0 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -25,6 +25,7 @@
 #include <linux/udp.h>
 #include <linux/if_arp.h>
 #include <linux/mroute.h>
+#include <linux/if_vlan.h>
 #include <linux/init.h>
 #include <linux/in6.h>
 #include <linux/inetdevice.h>
@@ -47,6 +48,7 @@
 #include <net/netns/generic.h>
 #include <net/rtnetlink.h>
 #include <net/gre.h>
+#include <net/dst_metadata.h>
 
 #if IS_ENABLED(CONFIG_IPV6)
 #include <net/ipv6.h>
@@ -200,9 +202,29 @@ static int ipgre_err(struct sk_buff *skb, u32 info,
 	return PACKET_RCVD;
 }
 
+static __be64 key_to_tunnel_id(__be32 key)
+{
+#ifdef __BIG_ENDIAN
+	return (__force __be64)((__force u32)key);
+#else
+	return (__force __be64)((__force u64)key << 32);
+#endif
+}
+
+/* Returns the least-significant 32 bits of a __be64. */
+static __be32 tunnel_id_to_key(__be64 x)
+{
+#ifdef __BIG_ENDIAN
+	return (__force __be32)x;
+#else
+	return (__force __be32)((__force u64)x >> 32);
+#endif
+}
+
 static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi)
 {
 	struct net *net = dev_net(skb->dev);
+	struct metadata_dst *tun_dst = NULL;
 	struct ip_tunnel_net *itn;
 	const struct iphdr *iph;
 	struct ip_tunnel *tunnel;
@@ -218,40 +240,162 @@ static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi)
 
 	if (tunnel) {
 		skb_pop_mac_header(skb);
-		ip_tunnel_rcv(tunnel, skb, tpi, log_ecn_error);
+		if (tunnel->collect_md) {
+			struct ip_tunnel_info *info;
+
+			tun_dst = metadata_dst_alloc(0, GFP_ATOMIC);
+			if (!tun_dst)
+				return PACKET_REJECT;
+
+			info = &tun_dst->u.tun_info;
+			info->key.ipv4_src = iph->saddr;
+			info->key.ipv4_dst = iph->daddr;
+			info->key.ipv4_tos = iph->tos;
+			info->key.ipv4_ttl = iph->ttl;
+
+			info->mode = IP_TUNNEL_INFO_RX;
+			info->key.tun_flags = tpi->flags &
+					      (TUNNEL_CSUM | TUNNEL_KEY);
+			info->key.tun_id = key_to_tunnel_id(tpi->key);
+
+			info->key.tp_src = 0;
+			info->key.tp_dst = 0;
+		}
+
+		ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
 		return PACKET_RCVD;
 	}
 	return PACKET_REJECT;
 }
 
+static void build_header(struct sk_buff *skb, int hdr_len, __be16 flags,
+			 __be16 proto, __be32 key, __be32 seq)
+{
+	struct gre_base_hdr *greh;
+
+	skb_push(skb, hdr_len);
+
+	skb_reset_transport_header(skb);
+	greh = (struct gre_base_hdr *)skb->data;
+	greh->flags = tnl_flags_to_gre_flags(flags);
+	greh->protocol = proto;
+
+	if (flags & (TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_SEQ)) {
+		__be32 *ptr = (__be32 *)(((u8 *)greh) + hdr_len - 4);
+
+		if (flags & TUNNEL_SEQ) {
+			*ptr = seq;
+			ptr--;
+		}
+		if (flags & TUNNEL_KEY) {
+			*ptr = key;
+			ptr--;
+		}
+		if (flags & TUNNEL_CSUM &&
+		    !(skb_shinfo(skb)->gso_type &
+		      (SKB_GSO_GRE | SKB_GSO_GRE_CSUM))) {
+			*ptr = 0;
+			*(__sum16 *)ptr = csum_fold(skb_checksum(skb, 0,
+								 skb->len, 0));
+		}
+	}
+}
+
 static void __gre_xmit(struct sk_buff *skb, struct net_device *dev,
 		       const struct iphdr *tnl_params,
 		       __be16 proto)
 {
 	struct ip_tunnel *tunnel = netdev_priv(dev);
-	struct tnl_ptk_info tpi;
 
-	tpi.flags = tunnel->parms.o_flags;
-	tpi.proto = proto;
-	tpi.key = tunnel->parms.o_key;
 	if (tunnel->parms.o_flags & TUNNEL_SEQ)
 		tunnel->o_seqno++;
-	tpi.seq = htonl(tunnel->o_seqno);
 
 	/* Push GRE header. */
-	gre_build_header(skb, &tpi, tunnel->tun_hlen);
-
-	skb_set_inner_protocol(skb, tpi.proto);
+	build_header(skb, tunnel->tun_hlen, tunnel->parms.o_flags,
+		     proto, tunnel->parms.o_key, htonl(tunnel->o_seqno));
 
+	skb_set_inner_protocol(skb, proto);
 	ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol);
 }
 
+static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct ip_tunnel_info *tun_info;
+	struct net *net = dev_net(dev);
+	const struct ip_tunnel_key *key;
+	struct flowi4 fl;
+	struct rtable *rt;
+	int min_headroom;
+	int tunnel_hlen;
+	__be16 df, flags;
+	int err;
+
+	tun_info = skb_tunnel_info(skb, AF_INET);
+	if (unlikely(!tun_info || tun_info->mode != IP_TUNNEL_INFO_TX))
+		goto err_free_skb;
+
+	key = &tun_info->key;
+	memset(&fl, 0, sizeof(fl));
+	fl.daddr = key->ipv4_dst;
+	fl.saddr = key->ipv4_src;
+	fl.flowi4_tos = RT_TOS(key->ipv4_tos);
+	fl.flowi4_mark = skb->mark;
+	fl.flowi4_proto = IPPROTO_GRE;
+
+	rt = ip_route_output_key(net, &fl);
+	if (IS_ERR(rt))
+		goto err_free_skb;
+
+	tunnel_hlen = ip_gre_calc_hlen(key->tun_flags);
+
+	min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len
+			+ tunnel_hlen + sizeof(struct iphdr);
+	if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) {
+		int head_delta = SKB_DATA_ALIGN(min_headroom -
+						skb_headroom(skb) +
+						16);
+		err = pskb_expand_head(skb, max_t(int, head_delta, 0),
+				       0, GFP_ATOMIC);
+		if (unlikely(err))
+			goto err_free_rt;
+	}
+
+	/* Push Tunnel header. */
+	skb = gre_handle_offloads(skb, !!(tun_info->key.tun_flags & TUNNEL_CSUM));
+	if (IS_ERR(skb)) {
+		skb = NULL;
+		goto err_free_rt;
+	}
+
+	flags = tun_info->key.tun_flags & (TUNNEL_CSUM | TUNNEL_KEY);
+	build_header(skb, tunnel_hlen, flags, htons(ETH_P_TEB),
+		     tunnel_id_to_key(tun_info->key.tun_id), 0);
+
+	df = key->tun_flags & TUNNEL_DONT_FRAGMENT ?  htons(IP_DF) : 0;
+	err = iptunnel_xmit(skb->sk, rt, skb, fl.saddr,
+			    key->ipv4_dst, IPPROTO_GRE,
+			    key->ipv4_tos, key->ipv4_ttl, df, false);
+	iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
+	return;
+
+err_free_rt:
+	ip_rt_put(rt);
+err_free_skb:
+	kfree_skb(skb);
+	dev->stats.tx_dropped++;
+}
+
 static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
 			      struct net_device *dev)
 {
 	struct ip_tunnel *tunnel = netdev_priv(dev);
 	const struct iphdr *tnl_params;
 
+	if (tunnel->collect_md) {
+		gre_fb_xmit(skb, dev);
+		return NETDEV_TX_OK;
+	}
+
 	if (dev->header_ops) {
 		/* Need space for new headers */
 		if (skb_cow_head(skb, dev->needed_headroom -
@@ -277,7 +421,6 @@ static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
 		goto out;
 
 	__gre_xmit(skb, dev, tnl_params, skb->protocol);
-
 	return NETDEV_TX_OK;
 
 free_skb:
@@ -292,6 +435,11 @@ static netdev_tx_t gre_tap_xmit(struct sk_buff *skb,
 {
 	struct ip_tunnel *tunnel = netdev_priv(dev);
 
+	if (tunnel->collect_md) {
+		gre_fb_xmit(skb, dev);
+		return NETDEV_TX_OK;
+	}
+
 	skb = gre_handle_offloads(skb, !!(tunnel->parms.o_flags&TUNNEL_CSUM));
 	if (IS_ERR(skb))
 		goto out;
@@ -300,7 +448,6 @@ static netdev_tx_t gre_tap_xmit(struct sk_buff *skb,
 		goto free_skb;
 
 	__gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_TEB));
-
 	return NETDEV_TX_OK;
 
 free_skb:
@@ -596,8 +743,10 @@ out:
 	return ipgre_tunnel_validate(tb, data);
 }
 
-static void ipgre_netlink_parms(struct nlattr *data[], struct nlattr *tb[],
-			       struct ip_tunnel_parm *parms)
+static void ipgre_netlink_parms(struct net_device *dev,
+				struct nlattr *data[],
+				struct nlattr *tb[],
+				struct ip_tunnel_parm *parms)
 {
 	memset(parms, 0, sizeof(*parms));
 
@@ -635,6 +784,12 @@ static void ipgre_netlink_parms(struct nlattr *data[], struct nlattr *tb[],
 
 	if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
 		parms->iph.frag_off = htons(IP_DF);
+
+	if (data[IFLA_GRE_COLLECT_METADATA]) {
+		struct ip_tunnel *t = netdev_priv(dev);
+
+		t->collect_md = true;
+	}
 }
 
 /* This function returns true when ENCAP attributes are present in the nl msg */
@@ -712,7 +867,7 @@ static int ipgre_newlink(struct net *src_net, struct net_device *dev,
 			return err;
 	}
 
-	ipgre_netlink_parms(data, tb, &p);
+	ipgre_netlink_parms(dev, data, tb, &p);
 	return ip_tunnel_newlink(dev, tb, &p);
 }
 
@@ -730,7 +885,7 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
 			return err;
 	}
 
-	ipgre_netlink_parms(data, tb, &p);
+	ipgre_netlink_parms(dev, data, tb, &p);
 	return ip_tunnel_changelink(dev, tb, &p);
 }
 
@@ -765,6 +920,8 @@ static size_t ipgre_get_size(const struct net_device *dev)
 		nla_total_size(2) +
 		/* IFLA_GRE_ENCAP_DPORT */
 		nla_total_size(2) +
+		/* IFLA_GRE_COLLECT_METADATA */
+		nla_total_size(0) +
 		0;
 }
 
@@ -796,6 +953,11 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
 			t->encap.flags))
 		goto nla_put_failure;
 
+	if (t->collect_md) {
+		if (nla_put_flag(skb, IFLA_GRE_COLLECT_METADATA))
+			goto nla_put_failure;
+	}
+
 	return 0;
 
 nla_put_failure:
@@ -817,6 +979,7 @@ static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
 	[IFLA_GRE_ENCAP_FLAGS]	= { .type = NLA_U16 },
 	[IFLA_GRE_ENCAP_SPORT]	= { .type = NLA_U16 },
 	[IFLA_GRE_ENCAP_DPORT]	= { .type = NLA_U16 },
+	[IFLA_GRE_COLLECT_METADATA]	= { .type = NLA_FLAG },
 };
 
 static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
@@ -851,7 +1014,7 @@ static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
 
 static int __net_init ipgre_tap_init_net(struct net *net)
 {
-	return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, NULL);
+	return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, "gretap0");
 }
 
 static void __net_exit ipgre_tap_exit_net(struct net *net)
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 626d9e56a6bd..cbb51f3fac06 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -230,10 +230,13 @@ skip_key_lookup:
 	if (cand)
 		return cand;
 
+	t = rcu_dereference(itn->collect_md_tun);
+	if (t)
+		return t;
+
 	if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
 		return netdev_priv(itn->fb_tunnel_dev);
 
-
 	return NULL;
 }
 EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
@@ -261,11 +264,15 @@ static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
 {
 	struct hlist_head *head = ip_bucket(itn, &t->parms);
 
+	if (t->collect_md)
+		rcu_assign_pointer(itn->collect_md_tun, t);
 	hlist_add_head_rcu(&t->hash_node, head);
 }
 
-static void ip_tunnel_del(struct ip_tunnel *t)
+static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
 {
+	if (t->collect_md)
+		rcu_assign_pointer(itn->collect_md_tun, NULL);
 	hlist_del_init_rcu(&t->hash_node);
 }
 
@@ -419,7 +426,8 @@ static struct ip_tunnel *ip_tunnel_create(struct net *net,
 }
 
 int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
-		  const struct tnl_ptk_info *tpi, bool log_ecn_error)
+		  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
+		  bool log_ecn_error)
 {
 	struct pcpu_sw_netstats *tstats;
 	const struct iphdr *iph = ip_hdr(skb);
@@ -478,6 +486,9 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
 		skb->dev = tunnel->dev;
 	}
 
+	if (tun_dst)
+		skb_dst_set(skb, (struct dst_entry *)tun_dst);
+
 	gro_cells_receive(&tunnel->gro_cells, skb);
 	return 0;
 
@@ -806,7 +817,7 @@ static void ip_tunnel_update(struct ip_tunnel_net *itn,
 			     struct ip_tunnel_parm *p,
 			     bool set_mtu)
 {
-	ip_tunnel_del(t);
+	ip_tunnel_del(itn, t);
 	t->parms.iph.saddr = p->iph.saddr;
 	t->parms.iph.daddr = p->iph.daddr;
 	t->parms.i_key = p->i_key;
@@ -967,7 +978,7 @@ void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
 	itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
 
 	if (itn->fb_tunnel_dev != dev) {
-		ip_tunnel_del(netdev_priv(dev));
+		ip_tunnel_del(itn, netdev_priv(dev));
 		unregister_netdevice_queue(dev, head);
 	}
 }
@@ -1072,8 +1083,13 @@ int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
 	nt = netdev_priv(dev);
 	itn = net_generic(net, nt->ip_tnl_net_id);
 
-	if (ip_tunnel_find(itn, p, dev->type))
-		return -EEXIST;
+	if (nt->collect_md) {
+		if (rtnl_dereference(itn->collect_md_tun))
+			return -EEXIST;
+	} else {
+		if (ip_tunnel_find(itn, p, dev->type))
+			return -EEXIST;
+	}
 
 	nt->net = net;
 	nt->parms = *p;
@@ -1089,7 +1105,6 @@ int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
 		dev->mtu = mtu;
 
 	ip_tunnel_add(itn, nt);
-
 out:
 	return err;
 }
@@ -1163,6 +1178,10 @@ int ip_tunnel_init(struct net_device *dev)
 	iph->version		= 4;
 	iph->ihl		= 5;
 
+	if (tunnel->collect_md) {
+		dev->features |= NETIF_F_NETNS_LOCAL;
+		netif_keep_dst(dev);
+	}
 	return 0;
 }
 EXPORT_SYMBOL_GPL(ip_tunnel_init);
@@ -1176,7 +1195,7 @@ void ip_tunnel_uninit(struct net_device *dev)
 	itn = net_generic(net, tunnel->ip_tnl_net_id);
 	/* fb_tunnel_dev will be unregisted in net-exit call. */
 	if (itn->fb_tunnel_dev != dev)
-		ip_tunnel_del(netdev_priv(dev));
+		ip_tunnel_del(itn, netdev_priv(dev));
 
 	ip_tunnel_dst_reset_all(tunnel);
 }
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 254238daf58b..f34c31defafe 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -198,7 +198,7 @@ static int ipip_rcv(struct sk_buff *skb)
 			goto drop;
 		if (iptunnel_pull_header(skb, 0, tpi.proto))
 			goto drop;
-		return ip_tunnel_rcv(tunnel, skb, &tpi, log_ecn_error);
+		return ip_tunnel_rcv(tunnel, skb, &tpi, NULL, log_ecn_error);
 	}
 
 	return -1;
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index ac35a28599be..94428fd85b2f 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -742,7 +742,7 @@ static int ipip_rcv(struct sk_buff *skb)
 			goto drop;
 		if (iptunnel_pull_header(skb, 0, tpi.proto))
 			goto drop;
-		return ip_tunnel_rcv(tunnel, skb, &tpi, log_ecn_error);
+		return ip_tunnel_rcv(tunnel, skb, &tpi, NULL, log_ecn_error);
 	}
 
 	return 1;
-- 
cgit v1.2.3


From b2acd1dc3949cd60c571844d495594f05f0351f4 Mon Sep 17 00:00:00 2001
From: Pravin B Shelar <pshelar@nicira.com>
Date: Fri, 7 Aug 2015 23:51:47 -0700
Subject: openvswitch: Use regular GRE net_device instead of vport

Using GRE tunnel meta data collection feature, we can implement
OVS GRE vport. This patch removes all of the OVS
specific GRE code and make OVS use a ip_gre net_device.
Minimal GRE vport is kept to handle compatibility with
current userspace application.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/gre.h           |  12 +--
 net/ipv4/gre_demux.c        |  34 -------
 net/ipv4/ip_gre.c           |  36 +++++++
 net/openvswitch/Kconfig     |   2 +-
 net/openvswitch/vport-gre.c | 237 ++++----------------------------------------
 5 files changed, 61 insertions(+), 260 deletions(-)

(limited to 'include/net')

diff --git a/include/net/gre.h b/include/net/gre.h
index b53182018743..e3e08459bf67 100644
--- a/include/net/gre.h
+++ b/include/net/gre.h
@@ -33,16 +33,8 @@ struct gre_cisco_protocol {
 int gre_cisco_register(struct gre_cisco_protocol *proto);
 int gre_cisco_unregister(struct gre_cisco_protocol *proto);
 
-void gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
-		      int hdr_len);
-
-static inline struct sk_buff *gre_handle_offloads(struct sk_buff *skb,
-						  bool csum)
-{
-	return iptunnel_handle_offloads(skb, csum,
-					csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE);
-}
-
+struct net_device *gretap_fb_dev_create(struct net *net, const char *name,
+				       u8 name_assign_type);
 
 static inline int ip_gre_calc_hlen(__be16 o_flags)
 {
diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c
index 4a7b5b2a1ce3..77562e0ac66b 100644
--- a/net/ipv4/gre_demux.c
+++ b/net/ipv4/gre_demux.c
@@ -61,40 +61,6 @@ int gre_del_protocol(const struct gre_protocol *proto, u8 version)
 }
 EXPORT_SYMBOL_GPL(gre_del_protocol);
 
-void gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
-		      int hdr_len)
-{
-	struct gre_base_hdr *greh;
-
-	skb_push(skb, hdr_len);
-
-	skb_reset_transport_header(skb);
-	greh = (struct gre_base_hdr *)skb->data;
-	greh->flags = tnl_flags_to_gre_flags(tpi->flags);
-	greh->protocol = tpi->proto;
-
-	if (tpi->flags&(TUNNEL_KEY|TUNNEL_CSUM|TUNNEL_SEQ)) {
-		__be32 *ptr = (__be32 *)(((u8 *)greh) + hdr_len - 4);
-
-		if (tpi->flags&TUNNEL_SEQ) {
-			*ptr = tpi->seq;
-			ptr--;
-		}
-		if (tpi->flags&TUNNEL_KEY) {
-			*ptr = tpi->key;
-			ptr--;
-		}
-		if (tpi->flags&TUNNEL_CSUM &&
-		    !(skb_shinfo(skb)->gso_type &
-		      (SKB_GSO_GRE|SKB_GSO_GRE_CSUM))) {
-			*ptr = 0;
-			*(__sum16 *)ptr = csum_fold(skb_checksum(skb, 0,
-								 skb->len, 0));
-		}
-	}
-}
-EXPORT_SYMBOL_GPL(gre_build_header);
-
 static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
 			    bool *csum_err)
 {
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 554a760c2cd0..49d140200d03 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -318,6 +318,13 @@ static void __gre_xmit(struct sk_buff *skb, struct net_device *dev,
 	ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol);
 }
 
+static struct sk_buff *gre_handle_offloads(struct sk_buff *skb,
+					   bool csum)
+{
+	return iptunnel_handle_offloads(skb, csum,
+					csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE);
+}
+
 static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct ip_tunnel_info *tun_info;
@@ -1012,6 +1019,35 @@ static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
 	.get_link_net	= ip_tunnel_get_link_net,
 };
 
+struct net_device *gretap_fb_dev_create(struct net *net, const char *name,
+					u8 name_assign_type)
+{
+	struct nlattr *tb[IFLA_MAX + 1];
+	struct net_device *dev;
+	struct ip_tunnel *t;
+	int err;
+
+	memset(&tb, 0, sizeof(tb));
+
+	dev = rtnl_create_link(net, name, name_assign_type,
+			       &ipgre_tap_ops, tb);
+	if (IS_ERR(dev))
+		return dev;
+
+	/* Configure flow based GRE device. */
+	t = netdev_priv(dev);
+	t->collect_md = true;
+
+	err = ipgre_newlink(net, dev, tb, NULL);
+	if (err < 0)
+		goto out;
+	return dev;
+out:
+	free_netdev(dev);
+	return ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(gretap_fb_dev_create);
+
 static int __net_init ipgre_tap_init_net(struct net *net)
 {
 	return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, "gretap0");
diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig
index 15840401a2ce..422dc0567de9 100644
--- a/net/openvswitch/Kconfig
+++ b/net/openvswitch/Kconfig
@@ -34,7 +34,7 @@ config OPENVSWITCH
 config OPENVSWITCH_GRE
 	tristate "Open vSwitch GRE tunneling support"
 	depends on OPENVSWITCH
-	depends on NET_IPGRE_DEMUX
+	depends on NET_IPGRE
 	default OPENVSWITCH
 	---help---
 	  If you say Y here, then the Open vSwitch will be able create GRE
diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c
index b87656c66aaf..871801d2ac23 100644
--- a/net/openvswitch/vport-gre.c
+++ b/net/openvswitch/vport-gre.c
@@ -45,235 +45,43 @@
 
 #include "datapath.h"
 #include "vport.h"
+#include "vport-netdev.h"
 
 static struct vport_ops ovs_gre_vport_ops;
 
-/* Returns the least-significant 32 bits of a __be64. */
-static __be32 be64_get_low32(__be64 x)
+static struct vport *gre_tnl_create(const struct vport_parms *parms)
 {
-#ifdef __BIG_ENDIAN
-	return (__force __be32)x;
-#else
-	return (__force __be32)((__force u64)x >> 32);
-#endif
-}
-
-static __be16 filter_tnl_flags(__be16 flags)
-{
-	return flags & (TUNNEL_CSUM | TUNNEL_KEY);
-}
-
-static struct sk_buff *__build_header(struct sk_buff *skb,
-				      int tunnel_hlen)
-{
-	struct tnl_ptk_info tpi;
-	const struct ip_tunnel_key *tun_key;
-
-	tun_key = &OVS_CB(skb)->egress_tun_info->key;
-
-	skb = gre_handle_offloads(skb, !!(tun_key->tun_flags & TUNNEL_CSUM));
-	if (IS_ERR(skb))
-		return skb;
-
-	tpi.flags = filter_tnl_flags(tun_key->tun_flags);
-	tpi.proto = htons(ETH_P_TEB);
-	tpi.key = be64_get_low32(tun_key->tun_id);
-	tpi.seq = 0;
-	gre_build_header(skb, &tpi, tunnel_hlen);
-
-	return skb;
-}
-
-static __be64 key_to_tunnel_id(__be32 key, __be32 seq)
-{
-#ifdef __BIG_ENDIAN
-	return (__force __be64)((__force u64)seq << 32 | (__force u32)key);
-#else
-	return (__force __be64)((__force u64)key << 32 | (__force u32)seq);
-#endif
-}
-
-/* Called with rcu_read_lock and BH disabled. */
-static int gre_rcv(struct sk_buff *skb,
-		   const struct tnl_ptk_info *tpi)
-{
-	struct ip_tunnel_info tun_info;
-	struct ovs_net *ovs_net;
-	struct vport *vport;
-	__be64 key;
-
-	ovs_net = net_generic(dev_net(skb->dev), ovs_net_id);
-	vport = rcu_dereference(ovs_net->vport_net.gre_vport);
-	if (unlikely(!vport))
-		return PACKET_REJECT;
-
-	key = key_to_tunnel_id(tpi->key, tpi->seq);
-	ip_tunnel_info_init(&tun_info, ip_hdr(skb), 0, 0, key,
-			    filter_tnl_flags(tpi->flags), NULL, 0);
-
-	ovs_vport_receive(vport, skb, &tun_info);
-	return PACKET_RCVD;
-}
-
-/* Called with rcu_read_lock and BH disabled. */
-static int gre_err(struct sk_buff *skb, u32 info,
-		   const struct tnl_ptk_info *tpi)
-{
-	struct ovs_net *ovs_net;
+	struct net *net = ovs_dp_get_net(parms->dp);
+	struct net_device *dev;
 	struct vport *vport;
 
-	ovs_net = net_generic(dev_net(skb->dev), ovs_net_id);
-	vport = rcu_dereference(ovs_net->vport_net.gre_vport);
-
-	if (unlikely(!vport))
-		return PACKET_REJECT;
-	else
-		return PACKET_RCVD;
-}
-
-static int gre_tnl_send(struct vport *vport, struct sk_buff *skb)
-{
-	struct net *net = ovs_dp_get_net(vport->dp);
-	const struct ip_tunnel_key *tun_key;
-	struct flowi4 fl;
-	struct rtable *rt;
-	int min_headroom;
-	int tunnel_hlen;
-	__be16 df;
-	int err;
-
-	if (unlikely(!OVS_CB(skb)->egress_tun_info)) {
-		err = -EINVAL;
-		goto err_free_skb;
-	}
-
-	tun_key = &OVS_CB(skb)->egress_tun_info->key;
-	rt = ovs_tunnel_route_lookup(net, tun_key, skb->mark, &fl, IPPROTO_GRE);
-	if (IS_ERR(rt)) {
-		err = PTR_ERR(rt);
-		goto err_free_skb;
-	}
-
-	tunnel_hlen = ip_gre_calc_hlen(tun_key->tun_flags);
-
-	min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len
-			+ tunnel_hlen + sizeof(struct iphdr)
-			+ (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0);
-	if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) {
-		int head_delta = SKB_DATA_ALIGN(min_headroom -
-						skb_headroom(skb) +
-						16);
-		err = pskb_expand_head(skb, max_t(int, head_delta, 0),
-					0, GFP_ATOMIC);
-		if (unlikely(err))
-			goto err_free_rt;
-	}
-
-	skb = vlan_hwaccel_push_inside(skb);
-	if (unlikely(!skb)) {
-		err = -ENOMEM;
-		goto err_free_rt;
-	}
-
-	/* Push Tunnel header. */
-	skb = __build_header(skb, tunnel_hlen);
-	if (IS_ERR(skb)) {
-		err = PTR_ERR(skb);
-		skb = NULL;
-		goto err_free_rt;
+	vport = ovs_vport_alloc(0, &ovs_gre_vport_ops, parms);
+	if (IS_ERR(vport))
+		return vport;
+
+	rtnl_lock();
+	dev = gretap_fb_dev_create(net, parms->name, NET_NAME_USER);
+	if (IS_ERR(dev)) {
+		rtnl_unlock();
+		ovs_vport_free(vport);
+		return ERR_CAST(dev);
 	}
 
-	df = tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ?
-		htons(IP_DF) : 0;
-
-	skb->ignore_df = 1;
-
-	return iptunnel_xmit(skb->sk, rt, skb, fl.saddr,
-			     tun_key->ipv4_dst, IPPROTO_GRE,
-			     tun_key->ipv4_tos, tun_key->ipv4_ttl, df, false);
-err_free_rt:
-	ip_rt_put(rt);
-err_free_skb:
-	kfree_skb(skb);
-	return err;
-}
-
-static struct gre_cisco_protocol gre_protocol = {
-	.handler        = gre_rcv,
-	.err_handler    = gre_err,
-	.priority       = 1,
-};
-
-static int gre_ports;
-static int gre_init(void)
-{
-	int err;
-
-	gre_ports++;
-	if (gre_ports > 1)
-		return 0;
-
-	err = gre_cisco_register(&gre_protocol);
-	if (err)
-		pr_warn("cannot register gre protocol handler\n");
-
-	return err;
-}
-
-static void gre_exit(void)
-{
-	gre_ports--;
-	if (gre_ports > 0)
-		return;
-
-	gre_cisco_unregister(&gre_protocol);
-}
+	dev_change_flags(dev, dev->flags | IFF_UP);
+	rtnl_unlock();
 
-static const char *gre_get_name(const struct vport *vport)
-{
-	return vport_priv(vport);
+	return vport;
 }
 
 static struct vport *gre_create(const struct vport_parms *parms)
 {
-	struct net *net = ovs_dp_get_net(parms->dp);
-	struct ovs_net *ovs_net;
 	struct vport *vport;
-	int err;
-
-	err = gre_init();
-	if (err)
-		return ERR_PTR(err);
-
-	ovs_net = net_generic(net, ovs_net_id);
-	if (ovsl_dereference(ovs_net->vport_net.gre_vport)) {
-		vport = ERR_PTR(-EEXIST);
-		goto error;
-	}
 
-	vport = ovs_vport_alloc(IFNAMSIZ, &ovs_gre_vport_ops, parms);
+	vport = gre_tnl_create(parms);
 	if (IS_ERR(vport))
-		goto error;
-
-	strncpy(vport_priv(vport), parms->name, IFNAMSIZ);
-	rcu_assign_pointer(ovs_net->vport_net.gre_vport, vport);
-	return vport;
-
-error:
-	gre_exit();
-	return vport;
-}
-
-static void gre_tnl_destroy(struct vport *vport)
-{
-	struct net *net = ovs_dp_get_net(vport->dp);
-	struct ovs_net *ovs_net;
-
-	ovs_net = net_generic(net, ovs_net_id);
+		return vport;
 
-	RCU_INIT_POINTER(ovs_net->vport_net.gre_vport, NULL);
-	ovs_vport_deferred_free(vport);
-	gre_exit();
+	return ovs_netdev_link(vport, parms->name);
 }
 
 static int gre_get_egress_tun_info(struct vport *vport, struct sk_buff *skb,
@@ -288,10 +96,9 @@ static int gre_get_egress_tun_info(struct vport *vport, struct sk_buff *skb,
 static struct vport_ops ovs_gre_vport_ops = {
 	.type		= OVS_VPORT_TYPE_GRE,
 	.create		= gre_create,
-	.destroy	= gre_tnl_destroy,
-	.get_name	= gre_get_name,
-	.send		= gre_tnl_send,
+	.send		= ovs_netdev_send,
 	.get_egress_tun_info	= gre_get_egress_tun_info,
+	.destroy	= ovs_netdev_tunnel_destroy,
 	.owner		= THIS_MODULE,
 };
 
-- 
cgit v1.2.3


From 9f57c67c379d88a10e8ad676426fee5ae7341b14 Mon Sep 17 00:00:00 2001
From: Pravin B Shelar <pshelar@nicira.com>
Date: Fri, 7 Aug 2015 23:51:52 -0700
Subject: gre: Remove support for sharing GRE protocol hook.

Support for sharing GREPROTO_CISCO port was added so that
OVS gre port and kernel GRE devices can co-exist. After
flow-based tunneling patches OVS GRE protocol processing
is completely moved to ip_gre module. so there is no need
for GRE protocol hook. Following patch consolidates
GRE protocol related functions into ip_gre module.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/gre.h    |  80 ++-----------------
 net/ipv4/gre_demux.c | 201 +----------------------------------------------
 net/ipv4/ip_gre.c    | 215 +++++++++++++++++++++++++++++++++++++++++++++++----
 3 files changed, 206 insertions(+), 290 deletions(-)

(limited to 'include/net')

diff --git a/include/net/gre.h b/include/net/gre.h
index e3e08459bf67..97eafdc47eea 100644
--- a/include/net/gre.h
+++ b/include/net/gre.h
@@ -4,6 +4,12 @@
 #include <linux/skbuff.h>
 #include <net/ip_tunnels.h>
 
+struct gre_base_hdr {
+	__be16 flags;
+	__be16 protocol;
+};
+#define GRE_HEADER_SECTION 4
+
 #define GREPROTO_CISCO		0
 #define GREPROTO_PPTP		1
 #define GREPROTO_MAX		2
@@ -14,83 +20,9 @@ struct gre_protocol {
 	void (*err_handler)(struct sk_buff *skb, u32 info);
 };
 
-struct gre_base_hdr {
-	__be16 flags;
-	__be16 protocol;
-};
-#define GRE_HEADER_SECTION 4
-
 int gre_add_protocol(const struct gre_protocol *proto, u8 version);
 int gre_del_protocol(const struct gre_protocol *proto, u8 version);
 
-struct gre_cisco_protocol {
-	int (*handler)(struct sk_buff *skb, const struct tnl_ptk_info *tpi);
-	int (*err_handler)(struct sk_buff *skb, u32 info,
-			   const struct tnl_ptk_info *tpi);
-	u8 priority;
-};
-
-int gre_cisco_register(struct gre_cisco_protocol *proto);
-int gre_cisco_unregister(struct gre_cisco_protocol *proto);
-
 struct net_device *gretap_fb_dev_create(struct net *net, const char *name,
 				       u8 name_assign_type);
-
-static inline int ip_gre_calc_hlen(__be16 o_flags)
-{
-	int addend = 4;
-
-	if (o_flags&TUNNEL_CSUM)
-		addend += 4;
-	if (o_flags&TUNNEL_KEY)
-		addend += 4;
-	if (o_flags&TUNNEL_SEQ)
-		addend += 4;
-	return addend;
-}
-
-static inline __be16 gre_flags_to_tnl_flags(__be16 flags)
-{
-	__be16 tflags = 0;
-
-	if (flags & GRE_CSUM)
-		tflags |= TUNNEL_CSUM;
-	if (flags & GRE_ROUTING)
-		tflags |= TUNNEL_ROUTING;
-	if (flags & GRE_KEY)
-		tflags |= TUNNEL_KEY;
-	if (flags & GRE_SEQ)
-		tflags |= TUNNEL_SEQ;
-	if (flags & GRE_STRICT)
-		tflags |= TUNNEL_STRICT;
-	if (flags & GRE_REC)
-		tflags |= TUNNEL_REC;
-	if (flags & GRE_VERSION)
-		tflags |= TUNNEL_VERSION;
-
-	return tflags;
-}
-
-static inline __be16 tnl_flags_to_gre_flags(__be16 tflags)
-{
-	__be16 flags = 0;
-
-	if (tflags & TUNNEL_CSUM)
-		flags |= GRE_CSUM;
-	if (tflags & TUNNEL_ROUTING)
-		flags |= GRE_ROUTING;
-	if (tflags & TUNNEL_KEY)
-		flags |= GRE_KEY;
-	if (tflags & TUNNEL_SEQ)
-		flags |= GRE_SEQ;
-	if (tflags & TUNNEL_STRICT)
-		flags |= GRE_STRICT;
-	if (tflags & TUNNEL_REC)
-		flags |= GRE_REC;
-	if (tflags & TUNNEL_VERSION)
-		flags |= GRE_VERSION;
-
-	return flags;
-}
-
 #endif
diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c
index 77562e0ac66b..d9c552a721fc 100644
--- a/net/ipv4/gre_demux.c
+++ b/net/ipv4/gre_demux.c
@@ -31,7 +31,6 @@
 #include <net/xfrm.h>
 
 static const struct gre_protocol __rcu *gre_proto[GREPROTO_MAX] __read_mostly;
-static struct gre_cisco_protocol __rcu *gre_cisco_proto_list[GRE_IP_PROTO_MAX];
 
 int gre_add_protocol(const struct gre_protocol *proto, u8 version)
 {
@@ -61,163 +60,6 @@ int gre_del_protocol(const struct gre_protocol *proto, u8 version)
 }
 EXPORT_SYMBOL_GPL(gre_del_protocol);
 
-static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
-			    bool *csum_err)
-{
-	const struct gre_base_hdr *greh;
-	__be32 *options;
-	int hdr_len;
-
-	if (unlikely(!pskb_may_pull(skb, sizeof(struct gre_base_hdr))))
-		return -EINVAL;
-
-	greh = (struct gre_base_hdr *)skb_transport_header(skb);
-	if (unlikely(greh->flags & (GRE_VERSION | GRE_ROUTING)))
-		return -EINVAL;
-
-	tpi->flags = gre_flags_to_tnl_flags(greh->flags);
-	hdr_len = ip_gre_calc_hlen(tpi->flags);
-
-	if (!pskb_may_pull(skb, hdr_len))
-		return -EINVAL;
-
-	greh = (struct gre_base_hdr *)skb_transport_header(skb);
-	tpi->proto = greh->protocol;
-
-	options = (__be32 *)(greh + 1);
-	if (greh->flags & GRE_CSUM) {
-		if (skb_checksum_simple_validate(skb)) {
-			*csum_err = true;
-			return -EINVAL;
-		}
-
-		skb_checksum_try_convert(skb, IPPROTO_GRE, 0,
-					 null_compute_pseudo);
-
-		options++;
-	}
-
-	if (greh->flags & GRE_KEY) {
-		tpi->key = *options;
-		options++;
-	} else
-		tpi->key = 0;
-
-	if (unlikely(greh->flags & GRE_SEQ)) {
-		tpi->seq = *options;
-		options++;
-	} else
-		tpi->seq = 0;
-
-	/* WCCP version 1 and 2 protocol decoding.
-	 * - Change protocol to IP
-	 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
-	 */
-	if (greh->flags == 0 && tpi->proto == htons(ETH_P_WCCP)) {
-		tpi->proto = htons(ETH_P_IP);
-		if ((*(u8 *)options & 0xF0) != 0x40) {
-			hdr_len += 4;
-			if (!pskb_may_pull(skb, hdr_len))
-				return -EINVAL;
-		}
-	}
-
-	return iptunnel_pull_header(skb, hdr_len, tpi->proto);
-}
-
-static int gre_cisco_rcv(struct sk_buff *skb)
-{
-	struct tnl_ptk_info tpi;
-	int i;
-	bool csum_err = false;
-
-#ifdef CONFIG_NET_IPGRE_BROADCAST
-	if (ipv4_is_multicast(ip_hdr(skb)->daddr)) {
-		/* Looped back packet, drop it! */
-		if (rt_is_output_route(skb_rtable(skb)))
-			goto drop;
-	}
-#endif
-
-	if (parse_gre_header(skb, &tpi, &csum_err) < 0)
-		goto drop;
-
-	rcu_read_lock();
-	for (i = 0; i < GRE_IP_PROTO_MAX; i++) {
-		struct gre_cisco_protocol *proto;
-		int ret;
-
-		proto = rcu_dereference(gre_cisco_proto_list[i]);
-		if (!proto)
-			continue;
-		ret = proto->handler(skb, &tpi);
-		if (ret == PACKET_RCVD) {
-			rcu_read_unlock();
-			return 0;
-		}
-	}
-	rcu_read_unlock();
-
-	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
-drop:
-	kfree_skb(skb);
-	return 0;
-}
-
-static void gre_cisco_err(struct sk_buff *skb, u32 info)
-{
-	/* All the routers (except for Linux) return only
-	 * 8 bytes of packet payload. It means, that precise relaying of
-	 * ICMP in the real Internet is absolutely infeasible.
-	 *
-	 * Moreover, Cisco "wise men" put GRE key to the third word
-	 * in GRE header. It makes impossible maintaining even soft
-	 * state for keyed
-	 * GRE tunnels with enabled checksum. Tell them "thank you".
-	 *
-	 * Well, I wonder, rfc1812 was written by Cisco employee,
-	 * what the hell these idiots break standards established
-	 * by themselves???
-	 */
-
-	const int type = icmp_hdr(skb)->type;
-	const int code = icmp_hdr(skb)->code;
-	struct tnl_ptk_info tpi;
-	bool csum_err = false;
-	int i;
-
-	if (parse_gre_header(skb, &tpi, &csum_err)) {
-		if (!csum_err)		/* ignore csum errors. */
-			return;
-	}
-
-	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
-		ipv4_update_pmtu(skb, dev_net(skb->dev), info,
-				skb->dev->ifindex, 0, IPPROTO_GRE, 0);
-		return;
-	}
-	if (type == ICMP_REDIRECT) {
-		ipv4_redirect(skb, dev_net(skb->dev), skb->dev->ifindex, 0,
-				IPPROTO_GRE, 0);
-		return;
-	}
-
-	rcu_read_lock();
-	for (i = 0; i < GRE_IP_PROTO_MAX; i++) {
-		struct gre_cisco_protocol *proto;
-
-		proto = rcu_dereference(gre_cisco_proto_list[i]);
-		if (!proto)
-			continue;
-
-		if (proto->err_handler(skb, info, &tpi) == PACKET_RCVD)
-			goto out;
-
-	}
-out:
-	rcu_read_unlock();
-}
-
 static int gre_rcv(struct sk_buff *skb)
 {
 	const struct gre_protocol *proto;
@@ -268,60 +110,19 @@ static const struct net_protocol net_gre_protocol = {
 	.netns_ok    = 1,
 };
 
-static const struct gre_protocol ipgre_protocol = {
-	.handler     = gre_cisco_rcv,
-	.err_handler = gre_cisco_err,
-};
-
-int gre_cisco_register(struct gre_cisco_protocol *newp)
-{
-	struct gre_cisco_protocol **proto = (struct gre_cisco_protocol **)
-					    &gre_cisco_proto_list[newp->priority];
-
-	return (cmpxchg(proto, NULL, newp) == NULL) ? 0 : -EBUSY;
-}
-EXPORT_SYMBOL_GPL(gre_cisco_register);
-
-int gre_cisco_unregister(struct gre_cisco_protocol *del_proto)
-{
-	struct gre_cisco_protocol **proto = (struct gre_cisco_protocol **)
-					    &gre_cisco_proto_list[del_proto->priority];
-	int ret;
-
-	ret = (cmpxchg(proto, del_proto, NULL) == del_proto) ? 0 : -EINVAL;
-
-	if (ret)
-		return ret;
-
-	synchronize_net();
-	return 0;
-}
-EXPORT_SYMBOL_GPL(gre_cisco_unregister);
-
 static int __init gre_init(void)
 {
 	pr_info("GRE over IPv4 demultiplexor driver\n");
 
 	if (inet_add_protocol(&net_gre_protocol, IPPROTO_GRE) < 0) {
 		pr_err("can't add protocol\n");
-		goto err;
+		return -EAGAIN;
 	}
-
-	if (gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0) {
-		pr_info("%s: can't add ipgre handler\n", __func__);
-		goto err_gre;
-	}
-
 	return 0;
-err_gre:
-	inet_del_protocol(&net_gre_protocol, IPPROTO_GRE);
-err:
-	return -EAGAIN;
 }
 
 static void __exit gre_exit(void)
 {
-	gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
 	inet_del_protocol(&net_gre_protocol, IPPROTO_GRE);
 }
 
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 49d140200d03..fb44d693796e 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -123,8 +123,127 @@ static int ipgre_tunnel_init(struct net_device *dev);
 static int ipgre_net_id __read_mostly;
 static int gre_tap_net_id __read_mostly;
 
-static int ipgre_err(struct sk_buff *skb, u32 info,
-		     const struct tnl_ptk_info *tpi)
+static int ip_gre_calc_hlen(__be16 o_flags)
+{
+	int addend = 4;
+
+	if (o_flags & TUNNEL_CSUM)
+		addend += 4;
+	if (o_flags & TUNNEL_KEY)
+		addend += 4;
+	if (o_flags & TUNNEL_SEQ)
+		addend += 4;
+	return addend;
+}
+
+static __be16 gre_flags_to_tnl_flags(__be16 flags)
+{
+	__be16 tflags = 0;
+
+	if (flags & GRE_CSUM)
+		tflags |= TUNNEL_CSUM;
+	if (flags & GRE_ROUTING)
+		tflags |= TUNNEL_ROUTING;
+	if (flags & GRE_KEY)
+		tflags |= TUNNEL_KEY;
+	if (flags & GRE_SEQ)
+		tflags |= TUNNEL_SEQ;
+	if (flags & GRE_STRICT)
+		tflags |= TUNNEL_STRICT;
+	if (flags & GRE_REC)
+		tflags |= TUNNEL_REC;
+	if (flags & GRE_VERSION)
+		tflags |= TUNNEL_VERSION;
+
+	return tflags;
+}
+
+static __be16 tnl_flags_to_gre_flags(__be16 tflags)
+{
+	__be16 flags = 0;
+
+	if (tflags & TUNNEL_CSUM)
+		flags |= GRE_CSUM;
+	if (tflags & TUNNEL_ROUTING)
+		flags |= GRE_ROUTING;
+	if (tflags & TUNNEL_KEY)
+		flags |= GRE_KEY;
+	if (tflags & TUNNEL_SEQ)
+		flags |= GRE_SEQ;
+	if (tflags & TUNNEL_STRICT)
+		flags |= GRE_STRICT;
+	if (tflags & TUNNEL_REC)
+		flags |= GRE_REC;
+	if (tflags & TUNNEL_VERSION)
+		flags |= GRE_VERSION;
+
+	return flags;
+}
+
+static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
+			    bool *csum_err)
+{
+	const struct gre_base_hdr *greh;
+	__be32 *options;
+	int hdr_len;
+
+	if (unlikely(!pskb_may_pull(skb, sizeof(struct gre_base_hdr))))
+		return -EINVAL;
+
+	greh = (struct gre_base_hdr *)skb_transport_header(skb);
+	if (unlikely(greh->flags & (GRE_VERSION | GRE_ROUTING)))
+		return -EINVAL;
+
+	tpi->flags = gre_flags_to_tnl_flags(greh->flags);
+	hdr_len = ip_gre_calc_hlen(tpi->flags);
+
+	if (!pskb_may_pull(skb, hdr_len))
+		return -EINVAL;
+
+	greh = (struct gre_base_hdr *)skb_transport_header(skb);
+	tpi->proto = greh->protocol;
+
+	options = (__be32 *)(greh + 1);
+	if (greh->flags & GRE_CSUM) {
+		if (skb_checksum_simple_validate(skb)) {
+			*csum_err = true;
+			return -EINVAL;
+		}
+
+		skb_checksum_try_convert(skb, IPPROTO_GRE, 0,
+					 null_compute_pseudo);
+		options++;
+	}
+
+	if (greh->flags & GRE_KEY) {
+		tpi->key = *options;
+		options++;
+	} else {
+		tpi->key = 0;
+	}
+	if (unlikely(greh->flags & GRE_SEQ)) {
+		tpi->seq = *options;
+		options++;
+	} else {
+		tpi->seq = 0;
+	}
+	/* WCCP version 1 and 2 protocol decoding.
+	 * - Change protocol to IP
+	 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
+	 */
+	if (greh->flags == 0 && tpi->proto == htons(ETH_P_WCCP)) {
+		tpi->proto = htons(ETH_P_IP);
+		if ((*(u8 *)options & 0xF0) != 0x40) {
+			hdr_len += 4;
+			if (!pskb_may_pull(skb, hdr_len))
+				return -EINVAL;
+		}
+	}
+	return iptunnel_pull_header(skb, hdr_len, tpi->proto);
+}
+
+static void ipgre_err(struct sk_buff *skb, u32 info,
+		      const struct tnl_ptk_info *tpi)
 {
 
 	/* All the routers (except for Linux) return only
@@ -150,14 +269,14 @@ static int ipgre_err(struct sk_buff *skb, u32 info,
 	switch (type) {
 	default:
 	case ICMP_PARAMETERPROB:
-		return PACKET_RCVD;
+		return;
 
 	case ICMP_DEST_UNREACH:
 		switch (code) {
 		case ICMP_SR_FAILED:
 		case ICMP_PORT_UNREACH:
 			/* Impossible event. */
-			return PACKET_RCVD;
+			return;
 		default:
 			/* All others are translated to HOST_UNREACH.
 			   rfc2003 contains "deep thoughts" about NET_UNREACH,
@@ -166,9 +285,10 @@ static int ipgre_err(struct sk_buff *skb, u32 info,
 			break;
 		}
 		break;
+
 	case ICMP_TIME_EXCEEDED:
 		if (code != ICMP_EXC_TTL)
-			return PACKET_RCVD;
+			return;
 		break;
 
 	case ICMP_REDIRECT:
@@ -185,21 +305,60 @@ static int ipgre_err(struct sk_buff *skb, u32 info,
 			     iph->daddr, iph->saddr, tpi->key);
 
 	if (!t)
-		return PACKET_REJECT;
+		return;
 
 	if (t->parms.iph.daddr == 0 ||
 	    ipv4_is_multicast(t->parms.iph.daddr))
-		return PACKET_RCVD;
+		return;
 
 	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
-		return PACKET_RCVD;
+		return;
 
 	if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
 		t->err_count++;
 	else
 		t->err_count = 1;
 	t->err_time = jiffies;
-	return PACKET_RCVD;
+}
+
+static void gre_err(struct sk_buff *skb, u32 info)
+{
+	/* All the routers (except for Linux) return only
+	 * 8 bytes of packet payload. It means, that precise relaying of
+	 * ICMP in the real Internet is absolutely infeasible.
+	 *
+	 * Moreover, Cisco "wise men" put GRE key to the third word
+	 * in GRE header. It makes impossible maintaining even soft
+	 * state for keyed
+	 * GRE tunnels with enabled checksum. Tell them "thank you".
+	 *
+	 * Well, I wonder, rfc1812 was written by Cisco employee,
+	 * what the hell these idiots break standards established
+	 * by themselves???
+	 */
+
+	const int type = icmp_hdr(skb)->type;
+	const int code = icmp_hdr(skb)->code;
+	struct tnl_ptk_info tpi;
+	bool csum_err = false;
+
+	if (parse_gre_header(skb, &tpi, &csum_err)) {
+		if (!csum_err)		/* ignore csum errors. */
+			return;
+	}
+
+	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
+		ipv4_update_pmtu(skb, dev_net(skb->dev), info,
+				 skb->dev->ifindex, 0, IPPROTO_GRE, 0);
+		return;
+	}
+	if (type == ICMP_REDIRECT) {
+		ipv4_redirect(skb, dev_net(skb->dev), skb->dev->ifindex, 0,
+			      IPPROTO_GRE, 0);
+		return;
+	}
+
+	ipgre_err(skb, info, &tpi);
 }
 
 static __be64 key_to_tunnel_id(__be32 key)
@@ -268,6 +427,31 @@ static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi)
 	return PACKET_REJECT;
 }
 
+static int gre_rcv(struct sk_buff *skb)
+{
+	struct tnl_ptk_info tpi;
+	bool csum_err = false;
+
+#ifdef CONFIG_NET_IPGRE_BROADCAST
+	if (ipv4_is_multicast(ip_hdr(skb)->daddr)) {
+		/* Looped back packet, drop it! */
+		if (rt_is_output_route(skb_rtable(skb)))
+			goto drop;
+	}
+#endif
+
+	if (parse_gre_header(skb, &tpi, &csum_err) < 0)
+		goto drop;
+
+	if (ipgre_rcv(skb, &tpi) == PACKET_RCVD)
+		return 0;
+
+	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+drop:
+	kfree_skb(skb);
+	return 0;
+}
+
 static void build_header(struct sk_buff *skb, int hdr_len, __be16 flags,
 			 __be16 proto, __be32 key, __be32 seq)
 {
@@ -684,10 +868,9 @@ static int ipgre_tunnel_init(struct net_device *dev)
 	return ip_tunnel_init(dev);
 }
 
-static struct gre_cisco_protocol ipgre_protocol = {
-	.handler        = ipgre_rcv,
-	.err_handler    = ipgre_err,
-	.priority       = 0,
+static const struct gre_protocol ipgre_protocol = {
+	.handler     = gre_rcv,
+	.err_handler = gre_err,
 };
 
 static int __net_init ipgre_init_net(struct net *net)
@@ -1080,7 +1263,7 @@ static int __init ipgre_init(void)
 	if (err < 0)
 		goto pnet_tap_faied;
 
-	err = gre_cisco_register(&ipgre_protocol);
+	err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
 	if (err < 0) {
 		pr_info("%s: can't add protocol\n", __func__);
 		goto add_proto_failed;
@@ -1099,7 +1282,7 @@ static int __init ipgre_init(void)
 tap_ops_failed:
 	rtnl_link_unregister(&ipgre_link_ops);
 rtnl_link_failed:
-	gre_cisco_unregister(&ipgre_protocol);
+	gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
 add_proto_failed:
 	unregister_pernet_device(&ipgre_tap_net_ops);
 pnet_tap_faied:
@@ -1111,7 +1294,7 @@ static void __exit ipgre_fini(void)
 {
 	rtnl_link_unregister(&ipgre_tap_ops);
 	rtnl_link_unregister(&ipgre_link_ops);
-	gre_cisco_unregister(&ipgre_protocol);
+	gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
 	unregister_pernet_device(&ipgre_tap_net_ops);
 	unregister_pernet_device(&ipgre_net_ops);
 }
-- 
cgit v1.2.3


From 308ac9143ee2208f54d061eca54a89da509b5d92 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 8 Aug 2015 21:40:01 +0200
Subject: netfilter: nf_conntrack: push zone object into functions

This patch replaces the zone id which is pushed down into functions
with the actual zone object. It's a bigger one-time change, but
needed for later on extending zones with a direction parameter, and
thus decoupling this additional information from all call-sites.

No functional changes in this patch.

The default zone becomes a global const object, namely nf_ct_zone_dflt
and will be returned directly in various cases, one being, when there's
f.e. no zoning support.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack.h           | 10 ++-
 include/net/netfilter/nf_conntrack_core.h      |  3 +-
 include/net/netfilter/nf_conntrack_expect.h    | 11 +++-
 include/net/netfilter/nf_conntrack_zones.h     | 33 +++++++---
 net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c |  2 +-
 net/ipv4/netfilter/nf_conntrack_proto_icmp.c   |  3 +-
 net/ipv4/netfilter/nf_defrag_ipv4.c            | 11 ++--
 net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c |  2 +-
 net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c |  3 +-
 net/ipv6/netfilter/nf_defrag_ipv6_hooks.c      | 12 ++--
 net/netfilter/ipvs/ip_vs_nfct.c                |  2 +-
 net/netfilter/nf_conntrack_core.c              | 75 ++++++++++++++---------
 net/netfilter/nf_conntrack_expect.c            | 21 ++++---
 net/netfilter/nf_conntrack_netlink.c           | 84 ++++++++++++++------------
 net/netfilter/nf_conntrack_pptp.c              |  3 +-
 net/netfilter/nf_conntrack_standalone.c        | 17 ++++--
 net/netfilter/nf_nat_core.c                    | 19 +++---
 net/netfilter/nf_synproxy_core.c               |  4 +-
 net/netfilter/xt_CT.c                          |  6 +-
 net/netfilter/xt_connlimit.c                   |  9 +--
 net/sched/act_connmark.c                       |  5 +-
 21 files changed, 203 insertions(+), 132 deletions(-)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index 37cd3911d5c5..f5e23c6dee8b 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -250,8 +250,12 @@ void nf_ct_untracked_status_or(unsigned long bits);
 void nf_ct_iterate_cleanup(struct net *net,
 			   int (*iter)(struct nf_conn *i, void *data),
 			   void *data, u32 portid, int report);
+
+struct nf_conntrack_zone;
+
 void nf_conntrack_free(struct nf_conn *ct);
-struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone,
+struct nf_conn *nf_conntrack_alloc(struct net *net,
+				   const struct nf_conntrack_zone *zone,
 				   const struct nf_conntrack_tuple *orig,
 				   const struct nf_conntrack_tuple *repl,
 				   gfp_t gfp);
@@ -291,7 +295,9 @@ extern unsigned int nf_conntrack_max;
 extern unsigned int nf_conntrack_hash_rnd;
 void init_nf_conntrack_hash_rnd(void);
 
-struct nf_conn *nf_ct_tmpl_alloc(struct net *net, u16 zone, gfp_t flags);
+struct nf_conn *nf_ct_tmpl_alloc(struct net *net,
+				 const struct nf_conntrack_zone *zone,
+				 gfp_t flags);
 
 #define NF_CT_STAT_INC(net, count)	  __this_cpu_inc((net)->ct.stat->count)
 #define NF_CT_STAT_INC_ATOMIC(net, count) this_cpu_inc((net)->ct.stat->count)
diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h
index f2f0fa3bb150..c03f9c42b3cd 100644
--- a/include/net/netfilter/nf_conntrack_core.h
+++ b/include/net/netfilter/nf_conntrack_core.h
@@ -52,7 +52,8 @@ bool nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
 
 /* Find a connection corresponding to a tuple. */
 struct nf_conntrack_tuple_hash *
-nf_conntrack_find_get(struct net *net, u16 zone,
+nf_conntrack_find_get(struct net *net,
+		      const struct nf_conntrack_zone *zone,
 		      const struct nf_conntrack_tuple *tuple);
 
 int __nf_conntrack_confirm(struct sk_buff *skb);
diff --git a/include/net/netfilter/nf_conntrack_expect.h b/include/net/netfilter/nf_conntrack_expect.h
index 3f3aecbc8632..dce56f09ac9a 100644
--- a/include/net/netfilter/nf_conntrack_expect.h
+++ b/include/net/netfilter/nf_conntrack_expect.h
@@ -4,7 +4,9 @@
 
 #ifndef _NF_CONNTRACK_EXPECT_H
 #define _NF_CONNTRACK_EXPECT_H
+
 #include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_zones.h>
 
 extern unsigned int nf_ct_expect_hsize;
 extern unsigned int nf_ct_expect_max;
@@ -76,15 +78,18 @@ int nf_conntrack_expect_init(void);
 void nf_conntrack_expect_fini(void);
 
 struct nf_conntrack_expect *
-__nf_ct_expect_find(struct net *net, u16 zone,
+__nf_ct_expect_find(struct net *net,
+		    const struct nf_conntrack_zone *zone,
 		    const struct nf_conntrack_tuple *tuple);
 
 struct nf_conntrack_expect *
-nf_ct_expect_find_get(struct net *net, u16 zone,
+nf_ct_expect_find_get(struct net *net,
+		      const struct nf_conntrack_zone *zone,
 		      const struct nf_conntrack_tuple *tuple);
 
 struct nf_conntrack_expect *
-nf_ct_find_expectation(struct net *net, u16 zone,
+nf_ct_find_expectation(struct net *net,
+		       const struct nf_conntrack_zone *zone,
 		       const struct nf_conntrack_tuple *tuple);
 
 void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp,
diff --git a/include/net/netfilter/nf_conntrack_zones.h b/include/net/netfilter/nf_conntrack_zones.h
index 034efe8d45a5..0788bb0f267d 100644
--- a/include/net/netfilter/nf_conntrack_zones.h
+++ b/include/net/netfilter/nf_conntrack_zones.h
@@ -1,25 +1,38 @@
 #ifndef _NF_CONNTRACK_ZONES_H
 #define _NF_CONNTRACK_ZONES_H
 
-#define NF_CT_DEFAULT_ZONE	0
-
-#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
-#include <net/netfilter/nf_conntrack_extend.h>
+#define NF_CT_DEFAULT_ZONE_ID	0
 
 struct nf_conntrack_zone {
 	u16	id;
 };
 
-static inline u16 nf_ct_zone(const struct nf_conn *ct)
+extern const struct nf_conntrack_zone nf_ct_zone_dflt;
+
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+#include <net/netfilter/nf_conntrack_extend.h>
+
+static inline const struct nf_conntrack_zone *
+nf_ct_zone(const struct nf_conn *ct)
 {
+	const struct nf_conntrack_zone *nf_ct_zone = NULL;
+
 #ifdef CONFIG_NF_CONNTRACK_ZONES
-	struct nf_conntrack_zone *nf_ct_zone;
 	nf_ct_zone = nf_ct_ext_find(ct, NF_CT_EXT_ZONE);
-	if (nf_ct_zone)
-		return nf_ct_zone->id;
 #endif
-	return NF_CT_DEFAULT_ZONE;
+	return nf_ct_zone ? nf_ct_zone : &nf_ct_zone_dflt;
 }
 
-#endif /* CONFIG_NF_CONNTRACK || CONFIG_NF_CONNTRACK_MODULE */
+static inline const struct nf_conntrack_zone *
+nf_ct_zone_tmpl(const struct nf_conn *tmpl)
+{
+	return tmpl ? nf_ct_zone(tmpl) : &nf_ct_zone_dflt;
+}
+
+static inline bool nf_ct_zone_equal(const struct nf_conn *a,
+				    const struct nf_conntrack_zone *b)
+{
+	return nf_ct_zone(a)->id == b->id;
+}
+#endif /* IS_ENABLED(CONFIG_NF_CONNTRACK) */
 #endif /* _NF_CONNTRACK_ZONES_H */
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index 30ad9554b5e9..8a2caaf3940b 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -280,7 +280,7 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len)
 		return -EINVAL;
 	}
 
-	h = nf_conntrack_find_get(sock_net(sk), NF_CT_DEFAULT_ZONE, &tuple);
+	h = nf_conntrack_find_get(sock_net(sk), &nf_ct_zone_dflt, &tuple);
 	if (h) {
 		struct sockaddr_in sin;
 		struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index 80d5554b9a88..8a2f41c2fe6f 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -134,9 +134,10 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
 	struct nf_conntrack_tuple innertuple, origtuple;
 	const struct nf_conntrack_l4proto *innerproto;
 	const struct nf_conntrack_tuple_hash *h;
-	u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE;
+	const struct nf_conntrack_zone *zone;
 
 	NF_CT_ASSERT(skb->nfct == NULL);
+	zone = nf_ct_zone_tmpl(tmpl);
 
 	/* Are they talking about one of our connections? */
 	if (!nf_ct_get_tuplepr(skb,
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
index b69e82bda215..20fe8e67c09b 100644
--- a/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -43,19 +43,18 @@ static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user)
 static enum ip_defrag_users nf_ct_defrag_user(unsigned int hooknum,
 					      struct sk_buff *skb)
 {
-	u16 zone = NF_CT_DEFAULT_ZONE;
-
+	u16 zone_id = NF_CT_DEFAULT_ZONE_ID;
 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
 	if (skb->nfct)
-		zone = nf_ct_zone((struct nf_conn *)skb->nfct);
+		zone_id = nf_ct_zone((struct nf_conn *)skb->nfct)->id;
 #endif
 	if (nf_bridge_in_prerouting(skb))
-		return IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone;
+		return IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone_id;
 
 	if (hooknum == NF_INET_PRE_ROUTING)
-		return IP_DEFRAG_CONNTRACK_IN + zone;
+		return IP_DEFRAG_CONNTRACK_IN + zone_id;
 	else
-		return IP_DEFRAG_CONNTRACK_OUT + zone;
+		return IP_DEFRAG_CONNTRACK_OUT + zone_id;
 }
 
 static unsigned int ipv4_conntrack_defrag(const struct nf_hook_ops *ops,
diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
index 4ba0c34c627b..7302900c321a 100644
--- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
@@ -251,7 +251,7 @@ ipv6_getorigdst(struct sock *sk, int optval, void __user *user, int *len)
 	if (*len < 0 || (unsigned int) *len < sizeof(sin6))
 		return -EINVAL;
 
-	h = nf_conntrack_find_get(sock_net(sk), NF_CT_DEFAULT_ZONE, &tuple);
+	h = nf_conntrack_find_get(sock_net(sk), &nf_ct_zone_dflt, &tuple);
 	if (!h) {
 		pr_debug("IP6T_SO_ORIGINAL_DST: Can't find %pI6c/%u-%pI6c/%u.\n",
 			 &tuple.src.u3.ip6, ntohs(tuple.src.u.tcp.port),
diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
index 90388d606483..202914151360 100644
--- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
@@ -150,7 +150,6 @@ icmpv6_error_message(struct net *net, struct nf_conn *tmpl,
 	struct nf_conntrack_tuple intuple, origtuple;
 	const struct nf_conntrack_tuple_hash *h;
 	const struct nf_conntrack_l4proto *inproto;
-	u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE;
 
 	NF_CT_ASSERT(skb->nfct == NULL);
 
@@ -177,7 +176,7 @@ icmpv6_error_message(struct net *net, struct nf_conn *tmpl,
 
 	*ctinfo = IP_CT_RELATED;
 
-	h = nf_conntrack_find_get(net, zone, &intuple);
+	h = nf_conntrack_find_get(net, nf_ct_zone_tmpl(tmpl), &intuple);
 	if (!h) {
 		pr_debug("icmpv6_error: no match\n");
 		return -NF_ACCEPT;
diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
index 267fb8d5876e..9d3de9b74856 100644
--- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
+++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
@@ -33,20 +33,18 @@
 static enum ip6_defrag_users nf_ct6_defrag_user(unsigned int hooknum,
 						struct sk_buff *skb)
 {
-	u16 zone = NF_CT_DEFAULT_ZONE;
-
+	u16 zone_id = NF_CT_DEFAULT_ZONE_ID;
 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
 	if (skb->nfct)
-		zone = nf_ct_zone((struct nf_conn *)skb->nfct);
+		zone_id = nf_ct_zone((struct nf_conn *)skb->nfct)->id;
 #endif
 	if (nf_bridge_in_prerouting(skb))
-		return IP6_DEFRAG_CONNTRACK_BRIDGE_IN + zone;
+		return IP6_DEFRAG_CONNTRACK_BRIDGE_IN + zone_id;
 
 	if (hooknum == NF_INET_PRE_ROUTING)
-		return IP6_DEFRAG_CONNTRACK_IN + zone;
+		return IP6_DEFRAG_CONNTRACK_IN + zone_id;
 	else
-		return IP6_DEFRAG_CONNTRACK_OUT + zone;
-
+		return IP6_DEFRAG_CONNTRACK_OUT + zone_id;
 }
 
 static unsigned int ipv6_defrag(const struct nf_hook_ops *ops,
diff --git a/net/netfilter/ipvs/ip_vs_nfct.c b/net/netfilter/ipvs/ip_vs_nfct.c
index 5882bbfd198c..136184572fc9 100644
--- a/net/netfilter/ipvs/ip_vs_nfct.c
+++ b/net/netfilter/ipvs/ip_vs_nfct.c
@@ -274,7 +274,7 @@ void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp)
 		" for conn " FMT_CONN "\n",
 		__func__, ARG_TUPLE(&tuple), ARG_CONN(cp));
 
-	h = nf_conntrack_find_get(ip_vs_conn_net(cp), NF_CT_DEFAULT_ZONE,
+	h = nf_conntrack_find_get(ip_vs_conn_net(cp), &nf_ct_zone_dflt,
 				  &tuple);
 	if (h) {
 		ct = nf_ct_tuplehash_to_ctrack(h);
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 651039ad1681..0bb26e84f849 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -126,7 +126,8 @@ EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked);
 unsigned int nf_conntrack_hash_rnd __read_mostly;
 EXPORT_SYMBOL_GPL(nf_conntrack_hash_rnd);
 
-static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple, u16 zone)
+static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple,
+			      const struct nf_conntrack_zone *zone)
 {
 	unsigned int n;
 
@@ -135,7 +136,7 @@ static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple, u16 zone)
 	 * three bytes manually.
 	 */
 	n = (sizeof(tuple->src) + sizeof(tuple->dst.u3)) / sizeof(u32);
-	return jhash2((u32 *)tuple, n, zone ^ nf_conntrack_hash_rnd ^
+	return jhash2((u32 *)tuple, n, zone->id ^ nf_conntrack_hash_rnd ^
 		      (((__force __u16)tuple->dst.u.all << 16) |
 		      tuple->dst.protonum));
 }
@@ -151,12 +152,14 @@ static u32 hash_bucket(u32 hash, const struct net *net)
 }
 
 static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple,
-				  u16 zone, unsigned int size)
+				  const struct nf_conntrack_zone *zone,
+				  unsigned int size)
 {
 	return __hash_bucket(hash_conntrack_raw(tuple, zone), size);
 }
 
-static inline u_int32_t hash_conntrack(const struct net *net, u16 zone,
+static inline u_int32_t hash_conntrack(const struct net *net,
+				       const struct nf_conntrack_zone *zone,
 				       const struct nf_conntrack_tuple *tuple)
 {
 	return __hash_conntrack(tuple, zone, net->ct.htable_size);
@@ -288,7 +291,9 @@ static void nf_ct_del_from_dying_or_unconfirmed_list(struct nf_conn *ct)
 }
 
 /* Released via destroy_conntrack() */
-struct nf_conn *nf_ct_tmpl_alloc(struct net *net, u16 zone, gfp_t flags)
+struct nf_conn *nf_ct_tmpl_alloc(struct net *net,
+				 const struct nf_conntrack_zone *zone,
+				 gfp_t flags)
 {
 	struct nf_conn *tmpl;
 
@@ -306,7 +311,7 @@ struct nf_conn *nf_ct_tmpl_alloc(struct net *net, u16 zone, gfp_t flags)
 		nf_ct_zone = nf_ct_ext_add(tmpl, NF_CT_EXT_ZONE, GFP_ATOMIC);
 		if (!nf_ct_zone)
 			goto out_free;
-		nf_ct_zone->id = zone;
+		nf_ct_zone->id = zone->id;
 	}
 #endif
 	atomic_set(&tmpl->ct_general.use, 0);
@@ -371,11 +376,12 @@ destroy_conntrack(struct nf_conntrack *nfct)
 
 static void nf_ct_delete_from_lists(struct nf_conn *ct)
 {
+	const struct nf_conntrack_zone *zone;
 	struct net *net = nf_ct_net(ct);
 	unsigned int hash, reply_hash;
-	u16 zone = nf_ct_zone(ct);
 	unsigned int sequence;
 
+	zone = nf_ct_zone(ct);
 	nf_ct_helper_destroy(ct);
 
 	local_bh_disable();
@@ -431,8 +437,8 @@ static void death_by_timeout(unsigned long ul_conntrack)
 
 static inline bool
 nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
-			const struct nf_conntrack_tuple *tuple,
-			u16 zone)
+		const struct nf_conntrack_tuple *tuple,
+		const struct nf_conntrack_zone *zone)
 {
 	struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
 
@@ -440,8 +446,8 @@ nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
 	 * so we need to check that the conntrack is confirmed
 	 */
 	return nf_ct_tuple_equal(tuple, &h->tuple) &&
-		nf_ct_zone(ct) == zone &&
-		nf_ct_is_confirmed(ct);
+	       nf_ct_zone_equal(ct, zone) &&
+	       nf_ct_is_confirmed(ct);
 }
 
 /*
@@ -450,7 +456,7 @@ nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
  *   and recheck nf_ct_tuple_equal(tuple, &h->tuple)
  */
 static struct nf_conntrack_tuple_hash *
-____nf_conntrack_find(struct net *net, u16 zone,
+____nf_conntrack_find(struct net *net, const struct nf_conntrack_zone *zone,
 		      const struct nf_conntrack_tuple *tuple, u32 hash)
 {
 	struct nf_conntrack_tuple_hash *h;
@@ -486,7 +492,7 @@ begin:
 
 /* Find a connection corresponding to a tuple. */
 static struct nf_conntrack_tuple_hash *
-__nf_conntrack_find_get(struct net *net, u16 zone,
+__nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
 			const struct nf_conntrack_tuple *tuple, u32 hash)
 {
 	struct nf_conntrack_tuple_hash *h;
@@ -513,7 +519,7 @@ begin:
 }
 
 struct nf_conntrack_tuple_hash *
-nf_conntrack_find_get(struct net *net, u16 zone,
+nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
 		      const struct nf_conntrack_tuple *tuple)
 {
 	return __nf_conntrack_find_get(net, zone, tuple,
@@ -536,11 +542,11 @@ static void __nf_conntrack_hash_insert(struct nf_conn *ct,
 int
 nf_conntrack_hash_check_insert(struct nf_conn *ct)
 {
+	const struct nf_conntrack_zone *zone;
 	struct net *net = nf_ct_net(ct);
 	unsigned int hash, reply_hash;
 	struct nf_conntrack_tuple_hash *h;
 	struct hlist_nulls_node *n;
-	u16 zone;
 	unsigned int sequence;
 
 	zone = nf_ct_zone(ct);
@@ -558,12 +564,12 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)
 	hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode)
 		if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
 				      &h->tuple) &&
-		    zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)))
+		    nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone))
 			goto out;
 	hlist_nulls_for_each_entry(h, n, &net->ct.hash[reply_hash], hnnode)
 		if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
 				      &h->tuple) &&
-		    zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)))
+		    nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone))
 			goto out;
 
 	add_timer(&ct->timeout);
@@ -588,6 +594,7 @@ EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert);
 int
 __nf_conntrack_confirm(struct sk_buff *skb)
 {
+	const struct nf_conntrack_zone *zone;
 	unsigned int hash, reply_hash;
 	struct nf_conntrack_tuple_hash *h;
 	struct nf_conn *ct;
@@ -596,7 +603,6 @@ __nf_conntrack_confirm(struct sk_buff *skb)
 	struct hlist_nulls_node *n;
 	enum ip_conntrack_info ctinfo;
 	struct net *net;
-	u16 zone;
 	unsigned int sequence;
 
 	ct = nf_ct_get(skb, &ctinfo);
@@ -649,12 +655,12 @@ __nf_conntrack_confirm(struct sk_buff *skb)
 	hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode)
 		if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
 				      &h->tuple) &&
-		    zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)))
+		    nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone))
 			goto out;
 	hlist_nulls_for_each_entry(h, n, &net->ct.hash[reply_hash], hnnode)
 		if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
 				      &h->tuple) &&
-		    zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)))
+		    nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone))
 			goto out;
 
 	/* Timer relative to confirmation time, not original
@@ -707,11 +713,14 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
 			 const struct nf_conn *ignored_conntrack)
 {
 	struct net *net = nf_ct_net(ignored_conntrack);
+	const struct nf_conntrack_zone *zone;
 	struct nf_conntrack_tuple_hash *h;
 	struct hlist_nulls_node *n;
 	struct nf_conn *ct;
-	u16 zone = nf_ct_zone(ignored_conntrack);
-	unsigned int hash = hash_conntrack(net, zone, tuple);
+	unsigned int hash;
+
+	zone = nf_ct_zone(ignored_conntrack);
+	hash = hash_conntrack(net, zone, tuple);
 
 	/* Disable BHs the entire time since we need to disable them at
 	 * least once for the stats anyway.
@@ -721,7 +730,7 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
 		ct = nf_ct_tuplehash_to_ctrack(h);
 		if (ct != ignored_conntrack &&
 		    nf_ct_tuple_equal(tuple, &h->tuple) &&
-		    nf_ct_zone(ct) == zone) {
+		    nf_ct_zone_equal(ct, zone)) {
 			NF_CT_STAT_INC(net, found);
 			rcu_read_unlock_bh();
 			return 1;
@@ -810,7 +819,8 @@ void init_nf_conntrack_hash_rnd(void)
 }
 
 static struct nf_conn *
-__nf_conntrack_alloc(struct net *net, u16 zone,
+__nf_conntrack_alloc(struct net *net,
+		     const struct nf_conntrack_zone *zone,
 		     const struct nf_conntrack_tuple *orig,
 		     const struct nf_conntrack_tuple *repl,
 		     gfp_t gfp, u32 hash)
@@ -864,7 +874,7 @@ __nf_conntrack_alloc(struct net *net, u16 zone,
 		nf_ct_zone = nf_ct_ext_add(ct, NF_CT_EXT_ZONE, GFP_ATOMIC);
 		if (!nf_ct_zone)
 			goto out_free;
-		nf_ct_zone->id = zone;
+		nf_ct_zone->id = zone->id;
 	}
 #endif
 	/* Because we use RCU lookups, we set ct_general.use to zero before
@@ -881,7 +891,8 @@ out_free:
 #endif
 }
 
-struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone,
+struct nf_conn *nf_conntrack_alloc(struct net *net,
+				   const struct nf_conntrack_zone *zone,
 				   const struct nf_conntrack_tuple *orig,
 				   const struct nf_conntrack_tuple *repl,
 				   gfp_t gfp)
@@ -923,7 +934,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
 	struct nf_conntrack_tuple repl_tuple;
 	struct nf_conntrack_ecache *ecache;
 	struct nf_conntrack_expect *exp = NULL;
-	u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE;
+	const struct nf_conntrack_zone *zone;
 	struct nf_conn_timeout *timeout_ext;
 	unsigned int *timeouts;
 
@@ -932,6 +943,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
 		return NULL;
 	}
 
+	zone = nf_ct_zone_tmpl(tmpl);
 	ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC,
 				  hash);
 	if (IS_ERR(ct))
@@ -1026,10 +1038,10 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
 		  int *set_reply,
 		  enum ip_conntrack_info *ctinfo)
 {
+	const struct nf_conntrack_zone *zone;
 	struct nf_conntrack_tuple tuple;
 	struct nf_conntrack_tuple_hash *h;
 	struct nf_conn *ct;
-	u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE;
 	u32 hash;
 
 	if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
@@ -1040,6 +1052,7 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
 	}
 
 	/* look for tuple match */
+	zone = nf_ct_zone_tmpl(tmpl);
 	hash = hash_conntrack_raw(&tuple, zone);
 	h = __nf_conntrack_find_get(net, zone, &tuple, hash);
 	if (!h) {
@@ -1290,6 +1303,12 @@ bool __nf_ct_kill_acct(struct nf_conn *ct,
 }
 EXPORT_SYMBOL_GPL(__nf_ct_kill_acct);
 
+/* Built-in default zone used e.g. by modules. */
+const struct nf_conntrack_zone nf_ct_zone_dflt = {
+	.id	= NF_CT_DEFAULT_ZONE_ID,
+};
+EXPORT_SYMBOL_GPL(nf_ct_zone_dflt);
+
 #ifdef CONFIG_NF_CONNTRACK_ZONES
 static struct nf_ct_ext_type nf_ct_zone_extend __read_mostly = {
 	.len	= sizeof(struct nf_conntrack_zone),
diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index b45a4223cb05..980db854c3c8 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -88,7 +88,8 @@ static unsigned int nf_ct_expect_dst_hash(const struct nf_conntrack_tuple *tuple
 }
 
 struct nf_conntrack_expect *
-__nf_ct_expect_find(struct net *net, u16 zone,
+__nf_ct_expect_find(struct net *net,
+		    const struct nf_conntrack_zone *zone,
 		    const struct nf_conntrack_tuple *tuple)
 {
 	struct nf_conntrack_expect *i;
@@ -100,7 +101,7 @@ __nf_ct_expect_find(struct net *net, u16 zone,
 	h = nf_ct_expect_dst_hash(tuple);
 	hlist_for_each_entry_rcu(i, &net->ct.expect_hash[h], hnode) {
 		if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) &&
-		    nf_ct_zone(i->master) == zone)
+		    nf_ct_zone_equal(i->master, zone))
 			return i;
 	}
 	return NULL;
@@ -109,7 +110,8 @@ EXPORT_SYMBOL_GPL(__nf_ct_expect_find);
 
 /* Just find a expectation corresponding to a tuple. */
 struct nf_conntrack_expect *
-nf_ct_expect_find_get(struct net *net, u16 zone,
+nf_ct_expect_find_get(struct net *net,
+		      const struct nf_conntrack_zone *zone,
 		      const struct nf_conntrack_tuple *tuple)
 {
 	struct nf_conntrack_expect *i;
@@ -127,7 +129,8 @@ EXPORT_SYMBOL_GPL(nf_ct_expect_find_get);
 /* If an expectation for this connection is found, it gets delete from
  * global list then returned. */
 struct nf_conntrack_expect *
-nf_ct_find_expectation(struct net *net, u16 zone,
+nf_ct_find_expectation(struct net *net,
+		       const struct nf_conntrack_zone *zone,
 		       const struct nf_conntrack_tuple *tuple)
 {
 	struct nf_conntrack_expect *i, *exp = NULL;
@@ -140,7 +143,7 @@ nf_ct_find_expectation(struct net *net, u16 zone,
 	hlist_for_each_entry(i, &net->ct.expect_hash[h], hnode) {
 		if (!(i->flags & NF_CT_EXPECT_INACTIVE) &&
 		    nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) &&
-		    nf_ct_zone(i->master) == zone) {
+		    nf_ct_zone_equal(i->master, zone)) {
 			exp = i;
 			break;
 		}
@@ -220,16 +223,16 @@ static inline int expect_clash(const struct nf_conntrack_expect *a,
 	}
 
 	return nf_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask) &&
-	       nf_ct_zone(a->master) == nf_ct_zone(b->master);
+	       nf_ct_zone_equal(a->master, nf_ct_zone(b->master));
 }
 
 static inline int expect_matches(const struct nf_conntrack_expect *a,
 				 const struct nf_conntrack_expect *b)
 {
 	return a->master == b->master && a->class == b->class &&
-		nf_ct_tuple_equal(&a->tuple, &b->tuple) &&
-		nf_ct_tuple_mask_equal(&a->mask, &b->mask) &&
-		nf_ct_zone(a->master) == nf_ct_zone(b->master);
+	       nf_ct_tuple_equal(&a->tuple, &b->tuple) &&
+	       nf_ct_tuple_mask_equal(&a->mask, &b->mask) &&
+	       nf_ct_zone_equal(a->master, nf_ct_zone(b->master));
 }
 
 /* Generally a bad idea to call this: could have matched already. */
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 6b8b0abbfab4..95f7f01e253d 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -458,6 +458,7 @@ static int
 ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
 		    struct nf_conn *ct)
 {
+	const struct nf_conntrack_zone *zone;
 	struct nlmsghdr *nlh;
 	struct nfgenmsg *nfmsg;
 	struct nlattr *nest_parms;
@@ -487,8 +488,9 @@ ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
 		goto nla_put_failure;
 	nla_nest_end(skb, nest_parms);
 
-	if (nf_ct_zone(ct) &&
-	    nla_put_be16(skb, CTA_ZONE, htons(nf_ct_zone(ct))))
+	zone = nf_ct_zone(ct);
+	if (zone->id != NF_CT_DEFAULT_ZONE_ID &&
+	    nla_put_be16(skb, CTA_ZONE, htons(zone->id)))
 		goto nla_put_failure;
 
 	if (ctnetlink_dump_status(skb, ct) < 0 ||
@@ -609,6 +611,7 @@ ctnetlink_nlmsg_size(const struct nf_conn *ct)
 static int
 ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
 {
+	const struct nf_conntrack_zone *zone;
 	struct net *net;
 	struct nlmsghdr *nlh;
 	struct nfgenmsg *nfmsg;
@@ -669,8 +672,9 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
 		goto nla_put_failure;
 	nla_nest_end(skb, nest_parms);
 
-	if (nf_ct_zone(ct) &&
-	    nla_put_be16(skb, CTA_ZONE, htons(nf_ct_zone(ct))))
+	zone = nf_ct_zone(ct);
+	if (zone->id != NF_CT_DEFAULT_ZONE_ID &&
+	    nla_put_be16(skb, CTA_ZONE, htons(zone->id)))
 		goto nla_put_failure;
 
 	if (ctnetlink_dump_id(skb, ct) < 0)
@@ -965,17 +969,18 @@ ctnetlink_parse_tuple(const struct nlattr * const cda[],
 }
 
 static int
-ctnetlink_parse_zone(const struct nlattr *attr, u16 *zone)
+ctnetlink_parse_zone(const struct nlattr *attr,
+		     struct nf_conntrack_zone *zone)
 {
-	if (attr)
+	zone->id = NF_CT_DEFAULT_ZONE_ID;
+
 #ifdef CONFIG_NF_CONNTRACK_ZONES
-		*zone = ntohs(nla_get_be16(attr));
+	if (attr)
+		zone->id = ntohs(nla_get_be16(attr));
 #else
+	if (attr)
 		return -EOPNOTSUPP;
 #endif
-	else
-		*zone = 0;
-
 	return 0;
 }
 
@@ -1058,7 +1063,7 @@ ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb,
 	struct nf_conn *ct;
 	struct nfgenmsg *nfmsg = nlmsg_data(nlh);
 	u_int8_t u3 = nfmsg->nfgen_family;
-	u16 zone;
+	struct nf_conntrack_zone zone;
 	int err;
 
 	err = ctnetlink_parse_zone(cda[CTA_ZONE], &zone);
@@ -1078,7 +1083,7 @@ ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb,
 	if (err < 0)
 		return err;
 
-	h = nf_conntrack_find_get(net, zone, &tuple);
+	h = nf_conntrack_find_get(net, &zone, &tuple);
 	if (!h)
 		return -ENOENT;
 
@@ -1112,7 +1117,7 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb,
 	struct sk_buff *skb2 = NULL;
 	struct nfgenmsg *nfmsg = nlmsg_data(nlh);
 	u_int8_t u3 = nfmsg->nfgen_family;
-	u16 zone;
+	struct nf_conntrack_zone zone;
 	int err;
 
 	if (nlh->nlmsg_flags & NLM_F_DUMP) {
@@ -1147,7 +1152,7 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb,
 	if (err < 0)
 		return err;
 
-	h = nf_conntrack_find_get(net, zone, &tuple);
+	h = nf_conntrack_find_get(net, &zone, &tuple);
 	if (!h)
 		return -ENOENT;
 
@@ -1645,7 +1650,8 @@ ctnetlink_change_conntrack(struct nf_conn *ct,
 }
 
 static struct nf_conn *
-ctnetlink_create_conntrack(struct net *net, u16 zone,
+ctnetlink_create_conntrack(struct net *net,
+			   const struct nf_conntrack_zone *zone,
 			   const struct nlattr * const cda[],
 			   struct nf_conntrack_tuple *otuple,
 			   struct nf_conntrack_tuple *rtuple,
@@ -1804,7 +1810,7 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb,
 	struct nfgenmsg *nfmsg = nlmsg_data(nlh);
 	struct nf_conn *ct;
 	u_int8_t u3 = nfmsg->nfgen_family;
-	u16 zone;
+	struct nf_conntrack_zone zone;
 	int err;
 
 	err = ctnetlink_parse_zone(cda[CTA_ZONE], &zone);
@@ -1824,9 +1830,9 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb,
 	}
 
 	if (cda[CTA_TUPLE_ORIG])
-		h = nf_conntrack_find_get(net, zone, &otuple);
+		h = nf_conntrack_find_get(net, &zone, &otuple);
 	else if (cda[CTA_TUPLE_REPLY])
-		h = nf_conntrack_find_get(net, zone, &rtuple);
+		h = nf_conntrack_find_get(net, &zone, &rtuple);
 
 	if (h == NULL) {
 		err = -ENOENT;
@@ -1836,7 +1842,7 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb,
 			if (!cda[CTA_TUPLE_ORIG] || !cda[CTA_TUPLE_REPLY])
 				return -EINVAL;
 
-			ct = ctnetlink_create_conntrack(net, zone, cda, &otuple,
+			ct = ctnetlink_create_conntrack(net, &zone, cda, &otuple,
 							&rtuple, u3);
 			if (IS_ERR(ct))
 				return PTR_ERR(ct);
@@ -2091,6 +2097,7 @@ ctnetlink_nfqueue_build_size(const struct nf_conn *ct)
 static int
 ctnetlink_nfqueue_build(struct sk_buff *skb, struct nf_conn *ct)
 {
+	const struct nf_conntrack_zone *zone;
 	struct nlattr *nest_parms;
 
 	rcu_read_lock();
@@ -2108,10 +2115,10 @@ ctnetlink_nfqueue_build(struct sk_buff *skb, struct nf_conn *ct)
 		goto nla_put_failure;
 	nla_nest_end(skb, nest_parms);
 
-	if (nf_ct_zone(ct)) {
-		if (nla_put_be16(skb, CTA_ZONE, htons(nf_ct_zone(ct))))
-			goto nla_put_failure;
-	}
+	zone = nf_ct_zone(ct);
+	if (zone->id != NF_CT_DEFAULT_ZONE_ID &&
+	    nla_put_be16(skb, CTA_ZONE, htons(zone->id)))
+		goto nla_put_failure;
 
 	if (ctnetlink_dump_id(skb, ct) < 0)
 		goto nla_put_failure;
@@ -2612,7 +2619,7 @@ static int ctnetlink_dump_exp_ct(struct sock *ctnl, struct sk_buff *skb,
 	struct nf_conntrack_tuple tuple;
 	struct nf_conntrack_tuple_hash *h;
 	struct nf_conn *ct;
-	u16 zone = 0;
+	struct nf_conntrack_zone zone;
 	struct netlink_dump_control c = {
 		.dump = ctnetlink_exp_ct_dump_table,
 		.done = ctnetlink_exp_done,
@@ -2622,13 +2629,11 @@ static int ctnetlink_dump_exp_ct(struct sock *ctnl, struct sk_buff *skb,
 	if (err < 0)
 		return err;
 
-	if (cda[CTA_EXPECT_ZONE]) {
-		err = ctnetlink_parse_zone(cda[CTA_EXPECT_ZONE], &zone);
-		if (err < 0)
-			return err;
-	}
+	err = ctnetlink_parse_zone(cda[CTA_EXPECT_ZONE], &zone);
+	if (err < 0)
+		return err;
 
-	h = nf_conntrack_find_get(net, zone, &tuple);
+	h = nf_conntrack_find_get(net, &zone, &tuple);
 	if (!h)
 		return -ENOENT;
 
@@ -2652,7 +2657,7 @@ ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb,
 	struct sk_buff *skb2;
 	struct nfgenmsg *nfmsg = nlmsg_data(nlh);
 	u_int8_t u3 = nfmsg->nfgen_family;
-	u16 zone;
+	struct nf_conntrack_zone zone;
 	int err;
 
 	if (nlh->nlmsg_flags & NLM_F_DUMP) {
@@ -2681,7 +2686,7 @@ ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb,
 	if (err < 0)
 		return err;
 
-	exp = nf_ct_expect_find_get(net, zone, &tuple);
+	exp = nf_ct_expect_find_get(net, &zone, &tuple);
 	if (!exp)
 		return -ENOENT;
 
@@ -2732,8 +2737,8 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb,
 	struct nfgenmsg *nfmsg = nlmsg_data(nlh);
 	struct hlist_node *next;
 	u_int8_t u3 = nfmsg->nfgen_family;
+	struct nf_conntrack_zone zone;
 	unsigned int i;
-	u16 zone;
 	int err;
 
 	if (cda[CTA_EXPECT_TUPLE]) {
@@ -2747,7 +2752,7 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb,
 			return err;
 
 		/* bump usage count to 2 */
-		exp = nf_ct_expect_find_get(net, zone, &tuple);
+		exp = nf_ct_expect_find_get(net, &zone, &tuple);
 		if (!exp)
 			return -ENOENT;
 
@@ -2937,7 +2942,8 @@ err_out:
 }
 
 static int
-ctnetlink_create_expect(struct net *net, u16 zone,
+ctnetlink_create_expect(struct net *net,
+			const struct nf_conntrack_zone *zone,
 			const struct nlattr * const cda[],
 			u_int8_t u3, u32 portid, int report)
 {
@@ -3011,7 +3017,7 @@ ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb,
 	struct nf_conntrack_expect *exp;
 	struct nfgenmsg *nfmsg = nlmsg_data(nlh);
 	u_int8_t u3 = nfmsg->nfgen_family;
-	u16 zone;
+	struct nf_conntrack_zone zone;
 	int err;
 
 	if (!cda[CTA_EXPECT_TUPLE]
@@ -3028,14 +3034,12 @@ ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb,
 		return err;
 
 	spin_lock_bh(&nf_conntrack_expect_lock);
-	exp = __nf_ct_expect_find(net, zone, &tuple);
-
+	exp = __nf_ct_expect_find(net, &zone, &tuple);
 	if (!exp) {
 		spin_unlock_bh(&nf_conntrack_expect_lock);
 		err = -ENOENT;
 		if (nlh->nlmsg_flags & NLM_F_CREATE) {
-			err = ctnetlink_create_expect(net, zone, cda,
-						      u3,
+			err = ctnetlink_create_expect(net, &zone, cda, u3,
 						      NETLINK_CB(skb).portid,
 						      nlmsg_report(nlh));
 		}
diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c
index 825c3e3f8305..5588c7ae1ac2 100644
--- a/net/netfilter/nf_conntrack_pptp.c
+++ b/net/netfilter/nf_conntrack_pptp.c
@@ -143,13 +143,14 @@ static int destroy_sibling_or_exp(struct net *net, struct nf_conn *ct,
 				  const struct nf_conntrack_tuple *t)
 {
 	const struct nf_conntrack_tuple_hash *h;
+	const struct nf_conntrack_zone *zone;
 	struct nf_conntrack_expect *exp;
 	struct nf_conn *sibling;
-	u16 zone = nf_ct_zone(ct);
 
 	pr_debug("trying to timeout ct or exp for tuple ");
 	nf_ct_dump_tuple(t);
 
+	zone = nf_ct_zone(ct);
 	h = nf_conntrack_find_get(net, zone, t);
 	if (h)  {
 		sibling = nf_ct_tuplehash_to_ctrack(h);
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index fc823fa5dcf5..28c8b2b982ec 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -140,6 +140,17 @@ static inline void ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
 }
 #endif
 
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+static void ct_show_zone(struct seq_file *s, const struct nf_conn *ct)
+{
+	seq_printf(s, "zone=%u ", nf_ct_zone(ct)->id);
+}
+#else
+static inline void ct_show_zone(struct seq_file *s, const struct nf_conn *ct)
+{
+}
+#endif
+
 #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
 static void ct_show_delta_time(struct seq_file *s, const struct nf_conn *ct)
 {
@@ -228,11 +239,7 @@ static int ct_seq_show(struct seq_file *s, void *v)
 #endif
 
 	ct_show_secctx(s, ct);
-
-#ifdef CONFIG_NF_CONNTRACK_ZONES
-	seq_printf(s, "zone=%u ", nf_ct_zone(ct));
-#endif
-
+	ct_show_zone(s, ct);
 	ct_show_delta_time(s, ct);
 
 	seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use));
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index 4e0b47831d43..65ebaf9fc4f9 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -118,14 +118,15 @@ EXPORT_SYMBOL(nf_xfrm_me_harder);
 
 /* We keep an extra hash for each conntrack, for fast searching. */
 static inline unsigned int
-hash_by_src(const struct net *net, u16 zone,
+hash_by_src(const struct net *net,
+	    const struct nf_conntrack_zone *zone,
 	    const struct nf_conntrack_tuple *tuple)
 {
 	unsigned int hash;
 
 	/* Original src, to ensure we map it consistently if poss. */
 	hash = jhash2((u32 *)&tuple->src, sizeof(tuple->src) / sizeof(u32),
-		      tuple->dst.protonum ^ zone ^ nf_conntrack_hash_rnd);
+		      tuple->dst.protonum ^ zone->id ^ nf_conntrack_hash_rnd);
 
 	return reciprocal_scale(hash, net->ct.nat_htable_size);
 }
@@ -185,7 +186,8 @@ same_src(const struct nf_conn *ct,
 
 /* Only called for SRC manip */
 static int
-find_appropriate_src(struct net *net, u16 zone,
+find_appropriate_src(struct net *net,
+		     const struct nf_conntrack_zone *zone,
 		     const struct nf_nat_l3proto *l3proto,
 		     const struct nf_nat_l4proto *l4proto,
 		     const struct nf_conntrack_tuple *tuple,
@@ -198,7 +200,7 @@ find_appropriate_src(struct net *net, u16 zone,
 
 	hlist_for_each_entry_rcu(nat, &net->ct.nat_bysource[h], bysource) {
 		ct = nat->ct;
-		if (same_src(ct, tuple) && nf_ct_zone(ct) == zone) {
+		if (same_src(ct, tuple) && nf_ct_zone_equal(ct, zone)) {
 			/* Copy source part from reply tuple. */
 			nf_ct_invert_tuplepr(result,
 				       &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
@@ -218,7 +220,8 @@ find_appropriate_src(struct net *net, u16 zone,
  * the ip with the lowest src-ip/dst-ip/proto usage.
  */
 static void
-find_best_ips_proto(u16 zone, struct nf_conntrack_tuple *tuple,
+find_best_ips_proto(const struct nf_conntrack_zone *zone,
+		    struct nf_conntrack_tuple *tuple,
 		    const struct nf_nat_range *range,
 		    const struct nf_conn *ct,
 		    enum nf_nat_manip_type maniptype)
@@ -258,7 +261,7 @@ find_best_ips_proto(u16 zone, struct nf_conntrack_tuple *tuple,
 	 */
 	j = jhash2((u32 *)&tuple->src.u3, sizeof(tuple->src.u3) / sizeof(u32),
 		   range->flags & NF_NAT_RANGE_PERSISTENT ?
-			0 : (__force u32)tuple->dst.u3.all[max] ^ zone);
+			0 : (__force u32)tuple->dst.u3.all[max] ^ zone->id);
 
 	full_range = false;
 	for (i = 0; i <= max; i++) {
@@ -297,10 +300,12 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
 		 struct nf_conn *ct,
 		 enum nf_nat_manip_type maniptype)
 {
+	const struct nf_conntrack_zone *zone;
 	const struct nf_nat_l3proto *l3proto;
 	const struct nf_nat_l4proto *l4proto;
 	struct net *net = nf_ct_net(ct);
-	u16 zone = nf_ct_zone(ct);
+
+	zone = nf_ct_zone(ct);
 
 	rcu_read_lock();
 	l3proto = __nf_nat_l3proto_find(orig_tuple->src.l3num);
diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c
index 71f1e9fdfa18..58b2e84dab27 100644
--- a/net/netfilter/nf_synproxy_core.c
+++ b/net/netfilter/nf_synproxy_core.c
@@ -17,10 +17,12 @@
 #include <linux/netfilter/x_tables.h>
 #include <linux/netfilter/xt_tcpudp.h>
 #include <linux/netfilter/xt_SYNPROXY.h>
+
 #include <net/netfilter/nf_conntrack.h>
 #include <net/netfilter/nf_conntrack_extend.h>
 #include <net/netfilter/nf_conntrack_seqadj.h>
 #include <net/netfilter/nf_conntrack_synproxy.h>
+#include <net/netfilter/nf_conntrack_zones.h>
 
 int synproxy_net_id;
 EXPORT_SYMBOL_GPL(synproxy_net_id);
@@ -352,7 +354,7 @@ static int __net_init synproxy_net_init(struct net *net)
 	struct nf_conn *ct;
 	int err = -ENOMEM;
 
-	ct = nf_ct_tmpl_alloc(net, 0, GFP_KERNEL);
+	ct = nf_ct_tmpl_alloc(net, &nf_ct_zone_dflt, GFP_KERNEL);
 	if (IS_ERR(ct)) {
 		err = PTR_ERR(ct);
 		goto err1;
diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c
index c6630030c912..29e2856063ff 100644
--- a/net/netfilter/xt_CT.c
+++ b/net/netfilter/xt_CT.c
@@ -184,6 +184,7 @@ out:
 static int xt_ct_tg_check(const struct xt_tgchk_param *par,
 			  struct xt_ct_target_info_v1 *info)
 {
+	struct nf_conntrack_zone zone;
 	struct nf_conn *ct;
 	int ret = -EOPNOTSUPP;
 
@@ -201,7 +202,10 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par,
 	if (ret < 0)
 		goto err1;
 
-	ct = nf_ct_tmpl_alloc(par->net, info->zone, GFP_KERNEL);
+	memset(&zone, 0, sizeof(zone));
+	zone.id = info->zone;
+
+	ct = nf_ct_tmpl_alloc(par->net, &zone, GFP_KERNEL);
 	ret = PTR_ERR(ct);
 	if (IS_ERR(ct))
 		goto err2;
diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c
index 29ba6218a820..075d89d94d28 100644
--- a/net/netfilter/xt_connlimit.c
+++ b/net/netfilter/xt_connlimit.c
@@ -134,7 +134,7 @@ static bool add_hlist(struct hlist_head *head,
 static unsigned int check_hlist(struct net *net,
 				struct hlist_head *head,
 				const struct nf_conntrack_tuple *tuple,
-				u16 zone,
+				const struct nf_conntrack_zone *zone,
 				bool *addit)
 {
 	const struct nf_conntrack_tuple_hash *found;
@@ -201,7 +201,7 @@ static unsigned int
 count_tree(struct net *net, struct rb_root *root,
 	   const struct nf_conntrack_tuple *tuple,
 	   const union nf_inet_addr *addr, const union nf_inet_addr *mask,
-	   u8 family, u16 zone)
+	   u8 family, const struct nf_conntrack_zone *zone)
 {
 	struct xt_connlimit_rb *gc_nodes[CONNLIMIT_GC_MAX_NODES];
 	struct rb_node **rbnode, *parent;
@@ -290,7 +290,8 @@ static int count_them(struct net *net,
 		      const struct nf_conntrack_tuple *tuple,
 		      const union nf_inet_addr *addr,
 		      const union nf_inet_addr *mask,
-		      u_int8_t family, u16 zone)
+		      u_int8_t family,
+		      const struct nf_conntrack_zone *zone)
 {
 	struct rb_root *root;
 	int count;
@@ -321,10 +322,10 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
 	union nf_inet_addr addr;
 	struct nf_conntrack_tuple tuple;
 	const struct nf_conntrack_tuple *tuple_ptr = &tuple;
+	const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt;
 	enum ip_conntrack_info ctinfo;
 	const struct nf_conn *ct;
 	unsigned int connections;
-	u16 zone = NF_CT_DEFAULT_ZONE;
 
 	ct = nf_ct_get(skb, &ctinfo);
 	if (ct != NULL) {
diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c
index f2b540220ad0..e67a1bdd0929 100644
--- a/net/sched/act_connmark.c
+++ b/net/sched/act_connmark.c
@@ -37,6 +37,7 @@ static int tcf_connmark(struct sk_buff *skb, const struct tc_action *a,
 	struct nf_conntrack_tuple tuple;
 	enum ip_conntrack_info ctinfo;
 	struct tcf_connmark_info *ca = a->priv;
+	struct nf_conntrack_zone zone;
 	struct nf_conn *c;
 	int proto;
 
@@ -70,7 +71,9 @@ static int tcf_connmark(struct sk_buff *skb, const struct tc_action *a,
 			       proto, &tuple))
 		goto out;
 
-	thash = nf_conntrack_find_get(dev_net(skb->dev), ca->zone, &tuple);
+	zone.id = ca->zone;
+
+	thash = nf_conntrack_find_get(dev_net(skb->dev), &zone, &tuple);
 	if (!thash)
 		goto out;
 
-- 
cgit v1.2.3


From 42a7b32b73d6bf22e4bdd7bf68746e2d71f4cd8d Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Mon, 10 Aug 2015 16:58:11 -0600
Subject: xfrm: Add oif to dst lookups

Rules can be installed that direct route lookups to specific tables based
on oif. Plumb the oif through the xfrm lookups so it gets set in the flow
struct and passed to the resolver routines.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h      |  7 +++++--
 net/ipv4/xfrm4_policy.c | 11 ++++++-----
 net/ipv6/xfrm6_policy.c |  7 ++++---
 net/xfrm/xfrm_policy.c  | 24 ++++++++++++++----------
 4 files changed, 29 insertions(+), 20 deletions(-)

(limited to 'include/net')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index f0ee97eec24d..312e3fee9ccf 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -285,10 +285,13 @@ struct xfrm_policy_afinfo {
 	unsigned short		family;
 	struct dst_ops		*dst_ops;
 	void			(*garbage_collect)(struct net *net);
-	struct dst_entry	*(*dst_lookup)(struct net *net, int tos,
+	struct dst_entry	*(*dst_lookup)(struct net *net,
+					       int tos, int oif,
 					       const xfrm_address_t *saddr,
 					       const xfrm_address_t *daddr);
-	int			(*get_saddr)(struct net *net, xfrm_address_t *saddr, xfrm_address_t *daddr);
+	int			(*get_saddr)(struct net *net, int oif,
+					     xfrm_address_t *saddr,
+					     xfrm_address_t *daddr);
 	void			(*decode_session)(struct sk_buff *skb,
 						  struct flowi *fl,
 						  int reverse);
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index bff69746e05f..55b3c0f4dde5 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -19,7 +19,7 @@
 static struct xfrm_policy_afinfo xfrm4_policy_afinfo;
 
 static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4,
-					    int tos,
+					    int tos, int oif,
 					    const xfrm_address_t *saddr,
 					    const xfrm_address_t *daddr)
 {
@@ -28,6 +28,7 @@ static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4,
 	memset(fl4, 0, sizeof(*fl4));
 	fl4->daddr = daddr->a4;
 	fl4->flowi4_tos = tos;
+	fl4->flowi4_oif = oif;
 	if (saddr)
 		fl4->saddr = saddr->a4;
 
@@ -38,22 +39,22 @@ static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4,
 	return ERR_CAST(rt);
 }
 
-static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos,
+static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos, int oif,
 					  const xfrm_address_t *saddr,
 					  const xfrm_address_t *daddr)
 {
 	struct flowi4 fl4;
 
-	return __xfrm4_dst_lookup(net, &fl4, tos, saddr, daddr);
+	return __xfrm4_dst_lookup(net, &fl4, tos, oif, saddr, daddr);
 }
 
-static int xfrm4_get_saddr(struct net *net,
+static int xfrm4_get_saddr(struct net *net, int oif,
 			   xfrm_address_t *saddr, xfrm_address_t *daddr)
 {
 	struct dst_entry *dst;
 	struct flowi4 fl4;
 
-	dst = __xfrm4_dst_lookup(net, &fl4, 0, NULL, daddr);
+	dst = __xfrm4_dst_lookup(net, &fl4, 0, oif, NULL, daddr);
 	if (IS_ERR(dst))
 		return -EHOSTUNREACH;
 
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index ed0583c1b9fc..a74013d3eceb 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -26,7 +26,7 @@
 
 static struct xfrm_policy_afinfo xfrm6_policy_afinfo;
 
-static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos,
+static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos, int oif,
 					  const xfrm_address_t *saddr,
 					  const xfrm_address_t *daddr)
 {
@@ -35,6 +35,7 @@ static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos,
 	int err;
 
 	memset(&fl6, 0, sizeof(fl6));
+	fl6.flowi6_oif = oif;
 	memcpy(&fl6.daddr, daddr, sizeof(fl6.daddr));
 	if (saddr)
 		memcpy(&fl6.saddr, saddr, sizeof(fl6.saddr));
@@ -50,13 +51,13 @@ static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos,
 	return dst;
 }
 
-static int xfrm6_get_saddr(struct net *net,
+static int xfrm6_get_saddr(struct net *net, int oif,
 			   xfrm_address_t *saddr, xfrm_address_t *daddr)
 {
 	struct dst_entry *dst;
 	struct net_device *dev;
 
-	dst = xfrm6_dst_lookup(net, 0, NULL, daddr);
+	dst = xfrm6_dst_lookup(net, 0, oif, NULL, daddr);
 	if (IS_ERR(dst))
 		return -EHOSTUNREACH;
 
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 18cead7645be..94af3d065785 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -115,7 +115,8 @@ static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo)
 	rcu_read_unlock();
 }
 
-static inline struct dst_entry *__xfrm_dst_lookup(struct net *net, int tos,
+static inline struct dst_entry *__xfrm_dst_lookup(struct net *net,
+						  int tos, int oif,
 						  const xfrm_address_t *saddr,
 						  const xfrm_address_t *daddr,
 						  int family)
@@ -127,14 +128,15 @@ static inline struct dst_entry *__xfrm_dst_lookup(struct net *net, int tos,
 	if (unlikely(afinfo == NULL))
 		return ERR_PTR(-EAFNOSUPPORT);
 
-	dst = afinfo->dst_lookup(net, tos, saddr, daddr);
+	dst = afinfo->dst_lookup(net, tos, oif, saddr, daddr);
 
 	xfrm_policy_put_afinfo(afinfo);
 
 	return dst;
 }
 
-static inline struct dst_entry *xfrm_dst_lookup(struct xfrm_state *x, int tos,
+static inline struct dst_entry *xfrm_dst_lookup(struct xfrm_state *x,
+						int tos, int oif,
 						xfrm_address_t *prev_saddr,
 						xfrm_address_t *prev_daddr,
 						int family)
@@ -153,7 +155,7 @@ static inline struct dst_entry *xfrm_dst_lookup(struct xfrm_state *x, int tos,
 		daddr = x->coaddr;
 	}
 
-	dst = __xfrm_dst_lookup(net, tos, saddr, daddr, family);
+	dst = __xfrm_dst_lookup(net, tos, oif, saddr, daddr, family);
 
 	if (!IS_ERR(dst)) {
 		if (prev_saddr != saddr)
@@ -1373,15 +1375,15 @@ int __xfrm_sk_clone_policy(struct sock *sk)
 }
 
 static int
-xfrm_get_saddr(struct net *net, xfrm_address_t *local, xfrm_address_t *remote,
-	       unsigned short family)
+xfrm_get_saddr(struct net *net, int oif, xfrm_address_t *local,
+	       xfrm_address_t *remote, unsigned short family)
 {
 	int err;
 	struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
 
 	if (unlikely(afinfo == NULL))
 		return -EINVAL;
-	err = afinfo->get_saddr(net, local, remote);
+	err = afinfo->get_saddr(net, oif, local, remote);
 	xfrm_policy_put_afinfo(afinfo);
 	return err;
 }
@@ -1410,7 +1412,9 @@ xfrm_tmpl_resolve_one(struct xfrm_policy *policy, const struct flowi *fl,
 			remote = &tmpl->id.daddr;
 			local = &tmpl->saddr;
 			if (xfrm_addr_any(local, tmpl->encap_family)) {
-				error = xfrm_get_saddr(net, &tmp, remote, tmpl->encap_family);
+				error = xfrm_get_saddr(net, fl->flowi_oif,
+						       &tmp, remote,
+						       tmpl->encap_family);
 				if (error)
 					goto fail;
 				local = &tmp;
@@ -1690,8 +1694,8 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy,
 
 		if (xfrm[i]->props.mode != XFRM_MODE_TRANSPORT) {
 			family = xfrm[i]->props.family;
-			dst = xfrm_dst_lookup(xfrm[i], tos, &saddr, &daddr,
-					      family);
+			dst = xfrm_dst_lookup(xfrm[i], tos, fl->flowi_oif,
+					      &saddr, &daddr, family);
 			err = PTR_ERR(dst);
 			if (IS_ERR(dst))
 				goto put_states;
-- 
cgit v1.2.3


From cdf0969763e020923abe28fddc605add572febc2 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 11 Aug 2015 12:00:37 -0700
Subject: Revert "Merge branch 'mv88e6xxx-switchdev-fdb'"

This reverts commit f1d5ca434413b20cd3f8c18ff2b634b7782149a5, reversing
changes made to 4933d85c5173832ebd261756522095837583c458.

I applied v2 instead of v3.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/mv88e6171.c          |   6 +-
 drivers/net/dsa/mv88e6352.c          |   6 +-
 drivers/net/dsa/mv88e6xxx.c          | 223 +++++++++++------------------------
 drivers/net/dsa/mv88e6xxx.h          |  31 ++---
 drivers/net/ethernet/rocker/rocker.c |   2 +-
 include/net/dsa.h                    |  16 +--
 include/net/switchdev.h              |   3 +-
 net/bridge/br_fdb.c                  |   2 +-
 net/dsa/slave.c                      | 218 ++++++++++++++++------------------
 net/switchdev/switchdev.c            |   7 +-
 10 files changed, 197 insertions(+), 317 deletions(-)

(limited to 'include/net')

diff --git a/drivers/net/dsa/mv88e6171.c b/drivers/net/dsa/mv88e6171.c
index 735f04cd83ee..1c7808495a9d 100644
--- a/drivers/net/dsa/mv88e6171.c
+++ b/drivers/net/dsa/mv88e6171.c
@@ -116,9 +116,9 @@ struct dsa_switch_driver mv88e6171_switch_driver = {
 	.port_join_bridge       = mv88e6xxx_join_bridge,
 	.port_leave_bridge      = mv88e6xxx_leave_bridge,
 	.port_stp_update        = mv88e6xxx_port_stp_update,
-	.port_fdb_add		= mv88e6xxx_port_fdb_add,
-	.port_fdb_del		= mv88e6xxx_port_fdb_del,
-	.port_fdb_getnext	= mv88e6xxx_port_fdb_getnext,
+	.fdb_add		= mv88e6xxx_port_fdb_add,
+	.fdb_del		= mv88e6xxx_port_fdb_del,
+	.fdb_getnext		= mv88e6xxx_port_fdb_getnext,
 };
 
 MODULE_ALIAS("platform:mv88e6171");
diff --git a/drivers/net/dsa/mv88e6352.c b/drivers/net/dsa/mv88e6352.c
index a18f7c83d4cb..7e935852e192 100644
--- a/drivers/net/dsa/mv88e6352.c
+++ b/drivers/net/dsa/mv88e6352.c
@@ -343,9 +343,9 @@ struct dsa_switch_driver mv88e6352_switch_driver = {
 	.port_join_bridge	= mv88e6xxx_join_bridge,
 	.port_leave_bridge	= mv88e6xxx_leave_bridge,
 	.port_stp_update	= mv88e6xxx_port_stp_update,
-	.port_fdb_add		= mv88e6xxx_port_fdb_add,
-	.port_fdb_del		= mv88e6xxx_port_fdb_del,
-	.port_fdb_getnext	= mv88e6xxx_port_fdb_getnext,
+	.fdb_add		= mv88e6xxx_port_fdb_add,
+	.fdb_del		= mv88e6xxx_port_fdb_del,
+	.fdb_getnext		= mv88e6xxx_port_fdb_getnext,
 };
 
 MODULE_ALIAS("platform:mv88e6172");
diff --git a/drivers/net/dsa/mv88e6xxx.c b/drivers/net/dsa/mv88e6xxx.c
index 9c6781de533b..109452056eff 100644
--- a/drivers/net/dsa/mv88e6xxx.c
+++ b/drivers/net/dsa/mv88e6xxx.c
@@ -964,7 +964,7 @@ static int _mv88e6xxx_atu_cmd(struct dsa_switch *ds, int fid, u16 cmd)
 {
 	int ret;
 
-	ret = _mv88e6xxx_reg_write(ds, REG_GLOBAL, GLOBAL_ATU_FID, fid);
+	ret = _mv88e6xxx_reg_write(ds, REG_GLOBAL, 0x01, fid);
 	if (ret < 0)
 		return ret;
 
@@ -1091,7 +1091,7 @@ int mv88e6xxx_join_bridge(struct dsa_switch *ds, int port, u32 br_port_mask)
 	ps->bridge_mask[fid] = br_port_mask;
 
 	if (fid != ps->fid[port]) {
-		clear_bit(ps->fid[port], ps->fid_bitmap);
+		ps->fid_mask |= 1 << ps->fid[port];
 		ps->fid[port] = fid;
 		ret = _mv88e6xxx_update_bridge_config(ds, fid);
 	}
@@ -1125,16 +1125,9 @@ int mv88e6xxx_leave_bridge(struct dsa_switch *ds, int port, u32 br_port_mask)
 
 	mutex_lock(&ps->smi_mutex);
 
-	newfid = find_next_zero_bit(ps->fid_bitmap, VLAN_N_VID, 1);
-	if (unlikely(newfid > ps->num_ports)) {
-		netdev_err(ds->ports[port], "all first %d FIDs are used\n",
-			   ps->num_ports);
-		ret = -ENOSPC;
-		goto unlock;
-	}
-
+	newfid = __ffs(ps->fid_mask);
 	ps->fid[port] = newfid;
-	set_bit(newfid, ps->fid_bitmap);
+	ps->fid_mask &= ~(1 << newfid);
 	ps->bridge_mask[fid] &= ~(1 << port);
 	ps->bridge_mask[newfid] = 1 << port;
 
@@ -1142,7 +1135,6 @@ int mv88e6xxx_leave_bridge(struct dsa_switch *ds, int port, u32 br_port_mask)
 	if (!ret)
 		ret = _mv88e6xxx_update_bridge_config(ds, newfid);
 
-unlock:
 	mutex_unlock(&ps->smi_mutex);
 
 	return ret;
@@ -1182,8 +1174,8 @@ int mv88e6xxx_port_stp_update(struct dsa_switch *ds, int port, u8 state)
 	return 0;
 }
 
-static int _mv88e6xxx_atu_mac_write(struct dsa_switch *ds,
-				    const u8 addr[ETH_ALEN])
+static int __mv88e6xxx_write_addr(struct dsa_switch *ds,
+				  const unsigned char *addr)
 {
 	int i, ret;
 
@@ -1198,7 +1190,7 @@ static int _mv88e6xxx_atu_mac_write(struct dsa_switch *ds,
 	return 0;
 }
 
-static int _mv88e6xxx_atu_mac_read(struct dsa_switch *ds, u8 addr[ETH_ALEN])
+static int __mv88e6xxx_read_addr(struct dsa_switch *ds, unsigned char *addr)
 {
 	int i, ret;
 
@@ -1214,190 +1206,109 @@ static int _mv88e6xxx_atu_mac_read(struct dsa_switch *ds, u8 addr[ETH_ALEN])
 	return 0;
 }
 
-static int _mv88e6xxx_atu_load(struct dsa_switch *ds,
-			       struct mv88e6xxx_atu_entry *entry)
-{
-	u16 reg = 0;
-	int ret;
-
-	ret = _mv88e6xxx_atu_wait(ds);
-	if (ret < 0)
-		return ret;
-
-	ret = _mv88e6xxx_atu_mac_write(ds, entry->mac);
-	if (ret < 0)
-		return ret;
-
-	if (entry->state != GLOBAL_ATU_DATA_STATE_UNUSED) {
-		unsigned int mask, shift;
-
-		if (entry->trunk) {
-			reg |= GLOBAL_ATU_DATA_TRUNK;
-			mask = GLOBAL_ATU_DATA_TRUNK_ID_MASK;
-			shift = GLOBAL_ATU_DATA_TRUNK_ID_SHIFT;
-		} else {
-			mask = GLOBAL_ATU_DATA_PORT_VECTOR_MASK;
-			shift = GLOBAL_ATU_DATA_PORT_VECTOR_SHIFT;
-		}
-
-		reg |= (entry->portv_trunkid << shift) & mask;
-	}
-
-	reg |= entry->state & GLOBAL_ATU_DATA_STATE_MASK;
-
-	ret = _mv88e6xxx_reg_write(ds, REG_GLOBAL, GLOBAL_ATU_DATA, reg);
-	if (ret < 0)
-		return ret;
-
-	return _mv88e6xxx_atu_cmd(ds, entry->fid, GLOBAL_ATU_OP_LOAD_DB);
-}
-
-static int _mv88e6xxx_atu_getnext(struct dsa_switch *ds, u16 fid,
-				  const u8 addr[ETH_ALEN],
-				  struct mv88e6xxx_atu_entry *entry)
+static int __mv88e6xxx_port_fdb_cmd(struct dsa_switch *ds, int port,
+				    const unsigned char *addr, int state)
 {
-	struct mv88e6xxx_atu_entry next = { 0 };
+	struct mv88e6xxx_priv_state *ps = ds_to_priv(ds);
+	u8 fid = ps->fid[port];
 	int ret;
 
-	next.fid = fid;
-
 	ret = _mv88e6xxx_atu_wait(ds);
 	if (ret < 0)
 		return ret;
 
-	ret = _mv88e6xxx_atu_mac_write(ds, addr);
+	ret = __mv88e6xxx_write_addr(ds, addr);
 	if (ret < 0)
 		return ret;
 
-	ret = _mv88e6xxx_atu_cmd(ds, fid, GLOBAL_ATU_OP_GET_NEXT_DB);
-	if (ret < 0)
-		return ret;
-
-	ret = _mv88e6xxx_atu_mac_read(ds, next.mac);
-	if (ret < 0)
-		return ret;
-
-	ret = _mv88e6xxx_reg_read(ds, REG_GLOBAL, GLOBAL_ATU_DATA);
-	if (ret < 0)
-		return ret;
-
-	next.state = ret & GLOBAL_ATU_DATA_STATE_MASK;
-	if (next.state != GLOBAL_ATU_DATA_STATE_UNUSED) {
-		unsigned int mask, shift;
-
-		if (ret & GLOBAL_ATU_DATA_TRUNK) {
-			next.trunk = true;
-			mask = GLOBAL_ATU_DATA_TRUNK_ID_MASK;
-			shift = GLOBAL_ATU_DATA_TRUNK_ID_SHIFT;
-		} else {
-			next.trunk = false;
-			mask = GLOBAL_ATU_DATA_PORT_VECTOR_MASK;
-			shift = GLOBAL_ATU_DATA_PORT_VECTOR_SHIFT;
-		}
-
-		next.portv_trunkid = (ret & mask) >> shift;
-	}
-
-	*entry = next;
-	return 0;
-}
-
-static int _mv88e6xxx_port_vid_to_fid(struct dsa_switch *ds, int port, u16 vid)
-{
-	struct mv88e6xxx_priv_state *ps = ds_to_priv(ds);
-
-	if (vid == 0)
-		return ps->fid[port];
-
-	return -ENOENT;
-}
-
-static int _mv88e6xxx_port_fdb_load(struct dsa_switch *ds, int port, u16 vid,
-				    const u8 addr[ETH_ALEN], u8 state)
-{
-	struct mv88e6xxx_atu_entry entry = { 0 };
-	int ret;
-
-	ret = _mv88e6xxx_port_vid_to_fid(ds, port, vid);
-	if (ret < 0)
+	ret = _mv88e6xxx_reg_write(ds, REG_GLOBAL, GLOBAL_ATU_DATA,
+				   (0x10 << port) | state);
+	if (ret)
 		return ret;
 
-	entry.fid = ret;
-	entry.state = state;
-	ether_addr_copy(entry.mac, addr);
-	if (state != GLOBAL_ATU_DATA_STATE_UNUSED) {
-		entry.trunk = false;
-		entry.portv_trunkid = BIT(port);
-	}
+	ret = _mv88e6xxx_atu_cmd(ds, fid, GLOBAL_ATU_OP_LOAD_DB);
 
-	return _mv88e6xxx_atu_load(ds, &entry);
+	return ret;
 }
 
-int mv88e6xxx_port_fdb_add(struct dsa_switch *ds, int port, u16 vid,
-			   const u8 addr[ETH_ALEN])
+int mv88e6xxx_port_fdb_add(struct dsa_switch *ds, int port,
+			   const unsigned char *addr, u16 vid)
 {
-	struct mv88e6xxx_priv_state *ps = ds_to_priv(ds);
-	u8 state = is_multicast_ether_addr(addr) ?
+	int state = is_multicast_ether_addr(addr) ?
 		GLOBAL_ATU_DATA_STATE_MC_STATIC :
 		GLOBAL_ATU_DATA_STATE_UC_STATIC;
+	struct mv88e6xxx_priv_state *ps = ds_to_priv(ds);
 	int ret;
 
 	mutex_lock(&ps->smi_mutex);
-	ret = _mv88e6xxx_port_fdb_load(ds, port, vid, addr, state);
+	ret = __mv88e6xxx_port_fdb_cmd(ds, port, addr, state);
 	mutex_unlock(&ps->smi_mutex);
 
 	return ret;
 }
 
-int mv88e6xxx_port_fdb_del(struct dsa_switch *ds, int port, u16 vid,
-			   const u8 addr[ETH_ALEN])
+int mv88e6xxx_port_fdb_del(struct dsa_switch *ds, int port,
+			   const unsigned char *addr, u16 vid)
 {
 	struct mv88e6xxx_priv_state *ps = ds_to_priv(ds);
-	u8 state = GLOBAL_ATU_DATA_STATE_UNUSED;
 	int ret;
 
 	mutex_lock(&ps->smi_mutex);
-	ret = _mv88e6xxx_port_fdb_load(ds, port, vid, addr, state);
+	ret = __mv88e6xxx_port_fdb_cmd(ds, port, addr,
+				       GLOBAL_ATU_DATA_STATE_UNUSED);
 	mutex_unlock(&ps->smi_mutex);
 
 	return ret;
 }
 
-int mv88e6xxx_port_fdb_getnext(struct dsa_switch *ds, int port, u16 *vid,
-			       u8 addr[ETH_ALEN], bool *is_static)
+static int __mv88e6xxx_port_getnext(struct dsa_switch *ds, int port,
+				    unsigned char *addr, bool *is_static)
 {
 	struct mv88e6xxx_priv_state *ps = ds_to_priv(ds);
-	struct mv88e6xxx_atu_entry next;
-	u16 fid;
-	int ret;
+	u8 fid = ps->fid[port];
+	int ret, state;
 
-	mutex_lock(&ps->smi_mutex);
+	ret = _mv88e6xxx_atu_wait(ds);
+	if (ret < 0)
+		return ret;
 
-	ret = _mv88e6xxx_port_vid_to_fid(ds, port, *vid);
+	ret = __mv88e6xxx_write_addr(ds, addr);
 	if (ret < 0)
-		goto unlock;
-	fid = ret;
+		return ret;
 
 	do {
-		if (is_broadcast_ether_addr(addr)) {
-			ret = -ENOENT;
-			goto unlock;
-		}
+		ret = _mv88e6xxx_atu_cmd(ds, fid,  GLOBAL_ATU_OP_GET_NEXT_DB);
+		if (ret < 0)
+			return ret;
 
-		ret = _mv88e6xxx_atu_getnext(ds, fid, addr, &next);
+		ret = _mv88e6xxx_reg_read(ds, REG_GLOBAL, GLOBAL_ATU_DATA);
 		if (ret < 0)
-			goto unlock;
+			return ret;
+		state = ret & GLOBAL_ATU_DATA_STATE_MASK;
+		if (state == GLOBAL_ATU_DATA_STATE_UNUSED)
+			return -ENOENT;
+	} while (!(((ret >> 4) & 0xff) & (1 << port)));
 
-		ether_addr_copy(addr, next.mac);
+	ret = __mv88e6xxx_read_addr(ds, addr);
+	if (ret < 0)
+		return ret;
 
-		if (next.state == GLOBAL_ATU_DATA_STATE_UNUSED)
-			continue;
-	} while (next.trunk || (next.portv_trunkid & BIT(port)) == 0);
+	*is_static = state == (is_multicast_ether_addr(addr) ?
+			       GLOBAL_ATU_DATA_STATE_MC_STATIC :
+			       GLOBAL_ATU_DATA_STATE_UC_STATIC);
+
+	return 0;
+}
+
+/* get next entry for port */
+int mv88e6xxx_port_fdb_getnext(struct dsa_switch *ds, int port,
+			       unsigned char *addr, bool *is_static)
+{
+	struct mv88e6xxx_priv_state *ps = ds_to_priv(ds);
+	int ret;
 
-	*is_static = next.state == (is_multicast_ether_addr(addr) ?
-				    GLOBAL_ATU_DATA_STATE_MC_STATIC :
-				    GLOBAL_ATU_DATA_STATE_UC_STATIC);
-unlock:
+	mutex_lock(&ps->smi_mutex);
+	ret = __mv88e6xxx_port_getnext(ds, port, addr, is_static);
 	mutex_unlock(&ps->smi_mutex);
 
 	return ret;
@@ -1641,9 +1552,9 @@ static int mv88e6xxx_setup_port(struct dsa_switch *ds, int port)
 	 * ports, and allow each of the 'real' ports to only talk to
 	 * the upstream port.
 	 */
-	fid = port + 1;
+	fid = __ffs(ps->fid_mask);
 	ps->fid[port] = fid;
-	set_bit(fid, ps->fid_bitmap);
+	ps->fid_mask &= ~(1 << fid);
 
 	if (!dsa_is_cpu_port(ds, port))
 		ps->bridge_mask[fid] = 1 << port;
@@ -1740,7 +1651,7 @@ static int mv88e6xxx_atu_show_db(struct seq_file *s, struct dsa_switch *ds,
 	unsigned char addr[6];
 	int ret, data, state;
 
-	ret = _mv88e6xxx_atu_mac_write(ds, bcast);
+	ret = __mv88e6xxx_write_addr(ds, bcast);
 	if (ret < 0)
 		return ret;
 
@@ -1755,7 +1666,7 @@ static int mv88e6xxx_atu_show_db(struct seq_file *s, struct dsa_switch *ds,
 		state = data & GLOBAL_ATU_DATA_STATE_MASK;
 		if (state == GLOBAL_ATU_DATA_STATE_UNUSED)
 			break;
-		ret = _mv88e6xxx_atu_mac_read(ds, addr);
+		ret = __mv88e6xxx_read_addr(ds, addr);
 		if (ret < 0)
 			return ret;
 		mv88e6xxx_atu_show_entry(s, dbnum, addr, data);
@@ -1942,6 +1853,8 @@ int mv88e6xxx_setup_common(struct dsa_switch *ds)
 
 	ps->id = REG_READ(REG_PORT(0), PORT_SWITCH_ID) & 0xfff0;
 
+	ps->fid_mask = (1 << DSA_MAX_PORTS) - 1;
+
 	INIT_WORK(&ps->bridge_work, mv88e6xxx_bridge_work);
 
 	name = kasprintf(GFP_KERNEL, "dsa%d", ds->index);
diff --git a/drivers/net/dsa/mv88e6xxx.h b/drivers/net/dsa/mv88e6xxx.h
index a94c0cbb3813..8b017d65b691 100644
--- a/drivers/net/dsa/mv88e6xxx.h
+++ b/drivers/net/dsa/mv88e6xxx.h
@@ -11,8 +11,6 @@
 #ifndef __MV88E6XXX_H
 #define __MV88E6XXX_H
 
-#include <linux/if_vlan.h>
-
 #ifndef UINT64_MAX
 #define UINT64_MAX		(u64)(~((u64)0))
 #endif
@@ -171,7 +169,6 @@
 #define GLOBAL_MAC_01		0x01
 #define GLOBAL_MAC_23		0x02
 #define GLOBAL_MAC_45		0x03
-#define GLOBAL_ATU_FID		0x01	/* 6097 6165 6351 6352 */
 #define GLOBAL_CONTROL		0x04
 #define GLOBAL_CONTROL_SW_RESET		BIT(15)
 #define GLOBAL_CONTROL_PPU_ENABLE	BIT(14)
@@ -206,8 +203,6 @@
 #define GLOBAL_ATU_OP_GET_CLR_VIOLATION	  ((7 << 12) | GLOBAL_ATU_OP_BUSY)
 #define GLOBAL_ATU_DATA		0x0c
 #define GLOBAL_ATU_DATA_TRUNK			BIT(15)
-#define GLOBAL_ATU_DATA_TRUNK_ID_MASK		0x00f0
-#define GLOBAL_ATU_DATA_TRUNK_ID_SHIFT		4
 #define GLOBAL_ATU_DATA_PORT_VECTOR_MASK	0x3ff0
 #define GLOBAL_ATU_DATA_PORT_VECTOR_SHIFT	4
 #define GLOBAL_ATU_DATA_STATE_MASK		0x0f
@@ -318,14 +313,6 @@
 #define GLOBAL2_QOS_WEIGHT	0x1c
 #define GLOBAL2_MISC		0x1d
 
-struct mv88e6xxx_atu_entry {
-	u16	fid;
-	u8	state;
-	bool	trunk;
-	u16	portv_trunkid;
-	u8	mac[ETH_ALEN];
-};
-
 struct mv88e6xxx_priv_state {
 	/* When using multi-chip addressing, this mutex protects
 	 * access to the indirect access registers.  (In single-chip
@@ -364,9 +351,9 @@ struct mv88e6xxx_priv_state {
 
 	/* hw bridging */
 
-	DECLARE_BITMAP(fid_bitmap, VLAN_N_VID);	/* FIDs 1 to 4095 available */
-	u16 fid[DSA_MAX_PORTS];			/* per (non-bridged) port FID */
-	u16 bridge_mask[DSA_MAX_PORTS];		/* br groups (indexed by FID) */
+	u32 fid_mask;
+	u8 fid[DSA_MAX_PORTS];
+	u16 bridge_mask[DSA_MAX_PORTS];
 
 	unsigned long port_state_update_mask;
 	u8 port_state[DSA_MAX_PORTS];
@@ -426,15 +413,15 @@ int mv88e6xxx_set_eee(struct dsa_switch *ds, int port,
 int mv88e6xxx_join_bridge(struct dsa_switch *ds, int port, u32 br_port_mask);
 int mv88e6xxx_leave_bridge(struct dsa_switch *ds, int port, u32 br_port_mask);
 int mv88e6xxx_port_stp_update(struct dsa_switch *ds, int port, u8 state);
+int mv88e6xxx_port_fdb_add(struct dsa_switch *ds, int port,
+			   const unsigned char *addr, u16 vid);
+int mv88e6xxx_port_fdb_del(struct dsa_switch *ds, int port,
+			   const unsigned char *addr, u16 vid);
+int mv88e6xxx_port_fdb_getnext(struct dsa_switch *ds, int port,
+			       unsigned char *addr, bool *is_static);
 int mv88e6xxx_phy_page_read(struct dsa_switch *ds, int port, int page, int reg);
 int mv88e6xxx_phy_page_write(struct dsa_switch *ds, int port, int page,
 			     int reg, int val);
-int mv88e6xxx_port_fdb_add(struct dsa_switch *ds, int port, u16 vid,
-			   const u8 addr[ETH_ALEN]);
-int mv88e6xxx_port_fdb_del(struct dsa_switch *ds, int port, u16 vid,
-			   const u8 addr[ETH_ALEN]);
-int mv88e6xxx_port_fdb_getnext(struct dsa_switch *ds, int port, u16 *vid,
-			       u8 addr[ETH_ALEN], bool *is_static);
 
 extern struct dsa_switch_driver mv88e6131_switch_driver;
 extern struct dsa_switch_driver mv88e6123_61_65_switch_driver;
diff --git a/drivers/net/ethernet/rocker/rocker.c b/drivers/net/ethernet/rocker/rocker.c
index 80bb25c5a644..b77e0e7307d4 100644
--- a/drivers/net/ethernet/rocker/rocker.c
+++ b/drivers/net/ethernet/rocker/rocker.c
@@ -4543,7 +4543,7 @@ static int rocker_port_fdb_dump(const struct rocker_port *rocker_port,
 	hash_for_each_safe(rocker->fdb_tbl, bkt, tmp, found, entry) {
 		if (found->key.pport != rocker_port->pport)
 			continue;
-		ether_addr_copy(fdb->addr, found->key.addr);
+		fdb->addr = found->key.addr;
 		fdb->vid = rocker_port_vlan_to_vid(rocker_port,
 						   found->key.vlan_id);
 		err = obj->cb(rocker_port->dev, obj);
diff --git a/include/net/dsa.h b/include/net/dsa.h
index 091d35f77180..fbca63ba8f73 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -296,16 +296,12 @@ struct dsa_switch_driver {
 				     u32 br_port_mask);
 	int	(*port_stp_update)(struct dsa_switch *ds, int port,
 				   u8 state);
-
-	/*
-	 * Forwarding database
-	 */
-	int	(*port_fdb_add)(struct dsa_switch *ds, int port, u16 vid,
-				const u8 addr[ETH_ALEN]);
-	int	(*port_fdb_del)(struct dsa_switch *ds, int port, u16 vid,
-				const u8 addr[ETH_ALEN]);
-	int	(*port_fdb_getnext)(struct dsa_switch *ds, int port, u16 *vid,
-				    u8 addr[ETH_ALEN], bool *is_static);
+	int	(*fdb_add)(struct dsa_switch *ds, int port,
+			   const unsigned char *addr, u16 vid);
+	int	(*fdb_del)(struct dsa_switch *ds, int port,
+			   const unsigned char *addr, u16 vid);
+	int	(*fdb_getnext)(struct dsa_switch *ds, int port,
+			       unsigned char *addr, bool *is_static);
 };
 
 void register_switch_driver(struct dsa_switch_driver *type);
diff --git a/include/net/switchdev.h b/include/net/switchdev.h
index 0e296b82aef3..89da8934519b 100644
--- a/include/net/switchdev.h
+++ b/include/net/switchdev.h
@@ -70,9 +70,8 @@ struct switchdev_obj {
 			u32 tb_id;
 		} ipv4_fib;
 		struct switchdev_obj_fdb {		/* PORT_FDB */
-			u8 addr[ETH_ALEN];
+			const unsigned char *addr;
 			u16 vid;
-			bool is_static;
 		} fdb;
 	} u;
 };
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index 5656b44bf3de..9e9875da0a4f 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -136,11 +136,11 @@ static void fdb_del_external_learn(struct net_bridge_fdb_entry *f)
 	struct switchdev_obj obj = {
 		.id = SWITCHDEV_OBJ_PORT_FDB,
 		.u.fdb = {
+			.addr = f->addr.addr,
 			.vid = f->vlan_id,
 		},
 	};
 
-	ether_addr_copy(obj.u.fdb.addr, f->addr.addr);
 	switchdev_port_obj_del(f->dst->dev, &obj);
 }
 
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 1dbdeaab2bb4..0010c690cc67 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -19,7 +19,6 @@
 #include <net/switchdev.h>
 #include <linux/if_bridge.h>
 #include <linux/netpoll.h>
-#include <linux/if_vlan.h>
 #include "dsa_priv.h"
 
 /* slave mii_bus handling ***************************************************/
@@ -201,6 +200,105 @@ out:
 	return 0;
 }
 
+static int dsa_slave_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
+			     struct net_device *dev,
+			     const unsigned char *addr, u16 vid, u16 nlm_flags)
+{
+	struct dsa_slave_priv *p = netdev_priv(dev);
+	struct dsa_switch *ds = p->parent;
+	int ret = -EOPNOTSUPP;
+
+	if (ds->drv->fdb_add)
+		ret = ds->drv->fdb_add(ds, p->port, addr, vid);
+
+	return ret;
+}
+
+static int dsa_slave_fdb_del(struct ndmsg *ndm, struct nlattr *tb[],
+			     struct net_device *dev,
+			     const unsigned char *addr, u16 vid)
+{
+	struct dsa_slave_priv *p = netdev_priv(dev);
+	struct dsa_switch *ds = p->parent;
+	int ret = -EOPNOTSUPP;
+
+	if (ds->drv->fdb_del)
+		ret = ds->drv->fdb_del(ds, p->port, addr, vid);
+
+	return ret;
+}
+
+static int dsa_slave_fill_info(struct net_device *dev, struct sk_buff *skb,
+			       const unsigned char *addr, u16 vid,
+			       bool is_static,
+			       u32 portid, u32 seq, int type,
+			       unsigned int flags)
+{
+	struct nlmsghdr *nlh;
+	struct ndmsg *ndm;
+
+	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags);
+	if (!nlh)
+		return -EMSGSIZE;
+
+	ndm = nlmsg_data(nlh);
+	ndm->ndm_family	 = AF_BRIDGE;
+	ndm->ndm_pad1    = 0;
+	ndm->ndm_pad2    = 0;
+	ndm->ndm_flags	 = NTF_EXT_LEARNED;
+	ndm->ndm_type	 = 0;
+	ndm->ndm_ifindex = dev->ifindex;
+	ndm->ndm_state   = is_static ? NUD_NOARP : NUD_REACHABLE;
+
+	if (nla_put(skb, NDA_LLADDR, ETH_ALEN, addr))
+		goto nla_put_failure;
+
+	if (vid && nla_put_u16(skb, NDA_VLAN, vid))
+		goto nla_put_failure;
+
+	nlmsg_end(skb, nlh);
+	return 0;
+
+nla_put_failure:
+	nlmsg_cancel(skb, nlh);
+	return -EMSGSIZE;
+}
+
+/* Dump information about entries, in response to GETNEIGH */
+static int dsa_slave_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
+			      struct net_device *dev,
+			      struct net_device *filter_dev, int idx)
+{
+	struct dsa_slave_priv *p = netdev_priv(dev);
+	struct dsa_switch *ds = p->parent;
+	unsigned char addr[ETH_ALEN] = { 0 };
+	int ret;
+
+	if (!ds->drv->fdb_getnext)
+		return -EOPNOTSUPP;
+
+	for (; ; idx++) {
+		bool is_static;
+
+		ret = ds->drv->fdb_getnext(ds, p->port, addr, &is_static);
+		if (ret < 0)
+			break;
+
+		if (idx < cb->args[0])
+			continue;
+
+		ret = dsa_slave_fill_info(dev, skb, addr, 0,
+					  is_static,
+					  NETLINK_CB(cb->skb).portid,
+					  cb->nlh->nlmsg_seq,
+					  RTM_NEWNEIGH, NLM_F_MULTI);
+		if (ret < 0)
+			break;
+	}
+
+	return idx;
+}
+
 static int dsa_slave_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
@@ -266,115 +364,6 @@ static int dsa_slave_port_attr_set(struct net_device *dev,
 	return ret;
 }
 
-static int dsa_slave_port_fdb_add(struct net_device *dev,
-				  struct switchdev_obj *obj)
-{
-	struct switchdev_obj_fdb *fdb = &obj->u.fdb;
-	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_switch *ds = p->parent;
-	int err;
-
-	if (obj->trans == SWITCHDEV_TRANS_PREPARE)
-		err = ds->drv->port_fdb_add ? 0 : -EOPNOTSUPP;
-	else if (obj->trans == SWITCHDEV_TRANS_COMMIT)
-		err = ds->drv->port_fdb_add(ds, p->port, fdb->vid, fdb->addr);
-	else
-		err = -EOPNOTSUPP;
-
-	return err;
-}
-
-static int dsa_slave_port_fdb_del(struct net_device *dev,
-				  struct switchdev_obj *obj)
-{
-	struct switchdev_obj_fdb *fdb = &obj->u.fdb;
-	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_switch *ds = p->parent;
-
-	if (!ds->drv->port_fdb_del)
-		return -EOPNOTSUPP;
-
-	return ds->drv->port_fdb_del(ds, p->port, fdb->vid, fdb->addr);
-}
-
-static int dsa_slave_port_fdb_dump(struct net_device *dev,
-				   struct switchdev_obj *obj)
-{
-	struct switchdev_obj_fdb *fdb = &obj->u.fdb;
-	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_switch *ds = p->parent;
-	int err;
-
-	if (!ds->drv->port_fdb_getnext)
-		return -EOPNOTSUPP;
-
-	memset(fdb, 0, sizeof(*fdb));
-
-	for (;;) {
-		err = ds->drv->port_fdb_getnext(ds, p->port, &fdb->vid,
-						fdb->addr, &fdb->is_static);
-		if (err)
-			break;
-
-		err = obj->cb(dev, obj);
-		if (err)
-			break;
-	}
-
-	return err == -ENOENT ? 0 : err;
-}
-
-static int dsa_slave_port_obj_add(struct net_device *dev,
-				  struct switchdev_obj *obj)
-{
-	int err;
-
-	switch (obj->id) {
-	case SWITCHDEV_OBJ_PORT_FDB:
-		err = dsa_slave_port_fdb_add(dev, obj);
-		break;
-	default:
-		err = -EOPNOTSUPP;
-		break;
-	}
-
-	return err;
-}
-
-static int dsa_slave_port_obj_del(struct net_device *dev,
-				  struct switchdev_obj *obj)
-{
-	int err;
-
-	switch (obj->id) {
-	case SWITCHDEV_OBJ_PORT_FDB:
-		err = dsa_slave_port_fdb_del(dev, obj);
-		break;
-	default:
-		err = -EOPNOTSUPP;
-		break;
-	}
-
-	return err;
-}
-
-static int dsa_slave_port_obj_dump(struct net_device *dev,
-				   struct switchdev_obj *obj)
-{
-	int err;
-
-	switch (obj->id) {
-	case SWITCHDEV_OBJ_PORT_FDB:
-		err = dsa_slave_port_fdb_dump(dev, obj);
-		break;
-	default:
-		err = -EOPNOTSUPP;
-		break;
-	}
-
-	return err;
-}
-
 static int dsa_slave_bridge_port_join(struct net_device *dev,
 				      struct net_device *br)
 {
@@ -776,9 +765,9 @@ static const struct net_device_ops dsa_slave_netdev_ops = {
 	.ndo_change_rx_flags	= dsa_slave_change_rx_flags,
 	.ndo_set_rx_mode	= dsa_slave_set_rx_mode,
 	.ndo_set_mac_address	= dsa_slave_set_mac_address,
-	.ndo_fdb_add		= switchdev_port_fdb_add,
-	.ndo_fdb_del		= switchdev_port_fdb_del,
-	.ndo_fdb_dump		= switchdev_port_fdb_dump,
+	.ndo_fdb_add		= dsa_slave_fdb_add,
+	.ndo_fdb_del		= dsa_slave_fdb_del,
+	.ndo_fdb_dump		= dsa_slave_fdb_dump,
 	.ndo_do_ioctl		= dsa_slave_ioctl,
 	.ndo_get_iflink		= dsa_slave_get_iflink,
 #ifdef CONFIG_NET_POLL_CONTROLLER
@@ -791,9 +780,6 @@ static const struct net_device_ops dsa_slave_netdev_ops = {
 static const struct switchdev_ops dsa_slave_switchdev_ops = {
 	.switchdev_port_attr_get	= dsa_slave_port_attr_get,
 	.switchdev_port_attr_set	= dsa_slave_port_attr_set,
-	.switchdev_port_obj_add		= dsa_slave_port_obj_add,
-	.switchdev_port_obj_del		= dsa_slave_port_obj_del,
-	.switchdev_port_obj_dump	= dsa_slave_port_obj_dump,
 };
 
 static void dsa_slave_adjust_link(struct net_device *dev)
diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c
index e9d1cacc4060..33bafa2e703e 100644
--- a/net/switchdev/switchdev.c
+++ b/net/switchdev/switchdev.c
@@ -15,7 +15,6 @@
 #include <linux/mutex.h>
 #include <linux/notifier.h>
 #include <linux/netdevice.h>
-#include <linux/etherdevice.h>
 #include <linux/if_bridge.h>
 #include <net/ip_fib.h>
 #include <net/switchdev.h>
@@ -743,11 +742,11 @@ int switchdev_port_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
 	struct switchdev_obj obj = {
 		.id = SWITCHDEV_OBJ_PORT_FDB,
 		.u.fdb = {
+			.addr = addr,
 			.vid = vid,
 		},
 	};
 
-	ether_addr_copy(obj.u.fdb.addr, addr);
 	return switchdev_port_obj_add(dev, &obj);
 }
 EXPORT_SYMBOL_GPL(switchdev_port_fdb_add);
@@ -770,11 +769,11 @@ int switchdev_port_fdb_del(struct ndmsg *ndm, struct nlattr *tb[],
 	struct switchdev_obj obj = {
 		.id = SWITCHDEV_OBJ_PORT_FDB,
 		.u.fdb = {
+			.addr = addr,
 			.vid = vid,
 		},
 	};
 
-	ether_addr_copy(obj.u.fdb.addr, addr);
 	return switchdev_port_obj_del(dev, &obj);
 }
 EXPORT_SYMBOL_GPL(switchdev_port_fdb_del);
@@ -811,7 +810,7 @@ static int switchdev_port_fdb_dump_cb(struct net_device *dev,
 	ndm->ndm_flags   = NTF_SELF;
 	ndm->ndm_type    = 0;
 	ndm->ndm_ifindex = dev->ifindex;
-	ndm->ndm_state   = obj->u.fdb.is_static ? NUD_NOARP : NUD_REACHABLE;
+	ndm->ndm_state   = NUD_REACHABLE;
 
 	if (nla_put(dump->skb, NDA_LLADDR, ETH_ALEN, obj->u.fdb.addr))
 		goto nla_put_failure;
-- 
cgit v1.2.3


From 2a778e1b58990e15de5cba4badec1fa7ecb87e80 Mon Sep 17 00:00:00 2001
From: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Date: Mon, 10 Aug 2015 09:09:49 -0400
Subject: net: dsa: change FDB routines prototypes

Change the prototype of port_getnext to include a vid parameter.

This is necessary to introduce the support for VLAN.

Also rename the fdb_{add,del,getnext} function pointers to
port_fdb_{add,del,getnext} since they are specific to a given port.

Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/mv88e6171.c |  6 +++---
 drivers/net/dsa/mv88e6352.c |  6 +++---
 drivers/net/dsa/mv88e6xxx.c |  2 +-
 drivers/net/dsa/mv88e6xxx.h |  2 +-
 include/net/dsa.h           | 17 +++++++++++------
 net/dsa/slave.c             | 16 +++++++++-------
 6 files changed, 28 insertions(+), 21 deletions(-)

(limited to 'include/net')

diff --git a/drivers/net/dsa/mv88e6171.c b/drivers/net/dsa/mv88e6171.c
index 1c7808495a9d..735f04cd83ee 100644
--- a/drivers/net/dsa/mv88e6171.c
+++ b/drivers/net/dsa/mv88e6171.c
@@ -116,9 +116,9 @@ struct dsa_switch_driver mv88e6171_switch_driver = {
 	.port_join_bridge       = mv88e6xxx_join_bridge,
 	.port_leave_bridge      = mv88e6xxx_leave_bridge,
 	.port_stp_update        = mv88e6xxx_port_stp_update,
-	.fdb_add		= mv88e6xxx_port_fdb_add,
-	.fdb_del		= mv88e6xxx_port_fdb_del,
-	.fdb_getnext		= mv88e6xxx_port_fdb_getnext,
+	.port_fdb_add		= mv88e6xxx_port_fdb_add,
+	.port_fdb_del		= mv88e6xxx_port_fdb_del,
+	.port_fdb_getnext	= mv88e6xxx_port_fdb_getnext,
 };
 
 MODULE_ALIAS("platform:mv88e6171");
diff --git a/drivers/net/dsa/mv88e6352.c b/drivers/net/dsa/mv88e6352.c
index 7e935852e192..a18f7c83d4cb 100644
--- a/drivers/net/dsa/mv88e6352.c
+++ b/drivers/net/dsa/mv88e6352.c
@@ -343,9 +343,9 @@ struct dsa_switch_driver mv88e6352_switch_driver = {
 	.port_join_bridge	= mv88e6xxx_join_bridge,
 	.port_leave_bridge	= mv88e6xxx_leave_bridge,
 	.port_stp_update	= mv88e6xxx_port_stp_update,
-	.fdb_add		= mv88e6xxx_port_fdb_add,
-	.fdb_del		= mv88e6xxx_port_fdb_del,
-	.fdb_getnext		= mv88e6xxx_port_fdb_getnext,
+	.port_fdb_add		= mv88e6xxx_port_fdb_add,
+	.port_fdb_del		= mv88e6xxx_port_fdb_del,
+	.port_fdb_getnext	= mv88e6xxx_port_fdb_getnext,
 };
 
 MODULE_ALIAS("platform:mv88e6172");
diff --git a/drivers/net/dsa/mv88e6xxx.c b/drivers/net/dsa/mv88e6xxx.c
index 0cc83785d194..d68e3fdd6c99 100644
--- a/drivers/net/dsa/mv88e6xxx.c
+++ b/drivers/net/dsa/mv88e6xxx.c
@@ -1310,7 +1310,7 @@ static int __mv88e6xxx_port_getnext(struct dsa_switch *ds, int port,
 
 /* get next entry for port */
 int mv88e6xxx_port_fdb_getnext(struct dsa_switch *ds, int port,
-			       unsigned char *addr, bool *is_static)
+			       unsigned char *addr, u16 *vid, bool *is_static)
 {
 	struct mv88e6xxx_priv_state *ps = ds_to_priv(ds);
 	int ret;
diff --git a/drivers/net/dsa/mv88e6xxx.h b/drivers/net/dsa/mv88e6xxx.h
index 200327b7ea7d..55a6190ce159 100644
--- a/drivers/net/dsa/mv88e6xxx.h
+++ b/drivers/net/dsa/mv88e6xxx.h
@@ -421,7 +421,7 @@ int mv88e6xxx_port_fdb_add(struct dsa_switch *ds, int port,
 int mv88e6xxx_port_fdb_del(struct dsa_switch *ds, int port,
 			   const unsigned char *addr, u16 vid);
 int mv88e6xxx_port_fdb_getnext(struct dsa_switch *ds, int port,
-			       unsigned char *addr, bool *is_static);
+			       unsigned char *addr, u16 *vid, bool *is_static);
 int mv88e6xxx_phy_page_read(struct dsa_switch *ds, int port, int page, int reg);
 int mv88e6xxx_phy_page_write(struct dsa_switch *ds, int port, int page,
 			     int reg, int val);
diff --git a/include/net/dsa.h b/include/net/dsa.h
index fbca63ba8f73..6356f437e911 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -296,12 +296,17 @@ struct dsa_switch_driver {
 				     u32 br_port_mask);
 	int	(*port_stp_update)(struct dsa_switch *ds, int port,
 				   u8 state);
-	int	(*fdb_add)(struct dsa_switch *ds, int port,
-			   const unsigned char *addr, u16 vid);
-	int	(*fdb_del)(struct dsa_switch *ds, int port,
-			   const unsigned char *addr, u16 vid);
-	int	(*fdb_getnext)(struct dsa_switch *ds, int port,
-			       unsigned char *addr, bool *is_static);
+
+	/*
+	 * Forwarding database
+	 */
+	int	(*port_fdb_add)(struct dsa_switch *ds, int port,
+				const unsigned char *addr, u16 vid);
+	int	(*port_fdb_del)(struct dsa_switch *ds, int port,
+				const unsigned char *addr, u16 vid);
+	int	(*port_fdb_getnext)(struct dsa_switch *ds, int port,
+				    unsigned char *addr, u16 *vid,
+				    bool *is_static);
 };
 
 void register_switch_driver(struct dsa_switch_driver *type);
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 0010c690cc67..3d341b694ecf 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -208,8 +208,8 @@ static int dsa_slave_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
 	struct dsa_switch *ds = p->parent;
 	int ret = -EOPNOTSUPP;
 
-	if (ds->drv->fdb_add)
-		ret = ds->drv->fdb_add(ds, p->port, addr, vid);
+	if (ds->drv->port_fdb_add)
+		ret = ds->drv->port_fdb_add(ds, p->port, addr, vid);
 
 	return ret;
 }
@@ -222,8 +222,8 @@ static int dsa_slave_fdb_del(struct ndmsg *ndm, struct nlattr *tb[],
 	struct dsa_switch *ds = p->parent;
 	int ret = -EOPNOTSUPP;
 
-	if (ds->drv->fdb_del)
-		ret = ds->drv->fdb_del(ds, p->port, addr, vid);
+	if (ds->drv->port_fdb_del)
+		ret = ds->drv->port_fdb_del(ds, p->port, addr, vid);
 
 	return ret;
 }
@@ -272,22 +272,24 @@ static int dsa_slave_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
 	struct dsa_slave_priv *p = netdev_priv(dev);
 	struct dsa_switch *ds = p->parent;
 	unsigned char addr[ETH_ALEN] = { 0 };
+	u16 vid = 0;
 	int ret;
 
-	if (!ds->drv->fdb_getnext)
+	if (!ds->drv->port_fdb_getnext)
 		return -EOPNOTSUPP;
 
 	for (; ; idx++) {
 		bool is_static;
 
-		ret = ds->drv->fdb_getnext(ds, p->port, addr, &is_static);
+		ret = ds->drv->port_fdb_getnext(ds, p->port, addr, &vid,
+						&is_static);
 		if (ret < 0)
 			break;
 
 		if (idx < cb->args[0])
 			continue;
 
-		ret = dsa_slave_fill_info(dev, skb, addr, 0,
+		ret = dsa_slave_fill_info(dev, skb, addr, vid,
 					  is_static,
 					  NETLINK_CB(cb->skb).portid,
 					  cb->nlh->nlmsg_seq,
-- 
cgit v1.2.3


From ce80e7bc57e25062c361de8fb6444129a63bac6d Mon Sep 17 00:00:00 2001
From: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Date: Mon, 10 Aug 2015 09:09:52 -0400
Subject: net: switchdev: support static FDB addresses

This patch adds an ndm_state member to the switchdev_obj_fdb structure,
in order to support static FDB addresses.

Set Rocker ndm_state to NUD_REACHABLE.

Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Acked-by: Scott Feldman <sfeldma@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/rocker/rocker.c | 1 +
 include/net/switchdev.h              | 1 +
 net/switchdev/switchdev.c            | 2 +-
 3 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/drivers/net/ethernet/rocker/rocker.c b/drivers/net/ethernet/rocker/rocker.c
index b77e0e7307d4..af050759eb44 100644
--- a/drivers/net/ethernet/rocker/rocker.c
+++ b/drivers/net/ethernet/rocker/rocker.c
@@ -4544,6 +4544,7 @@ static int rocker_port_fdb_dump(const struct rocker_port *rocker_port,
 		if (found->key.pport != rocker_port->pport)
 			continue;
 		fdb->addr = found->key.addr;
+		fdb->ndm_state = NUD_REACHABLE;
 		fdb->vid = rocker_port_vlan_to_vid(rocker_port,
 						   found->key.vlan_id);
 		err = obj->cb(rocker_port->dev, obj);
diff --git a/include/net/switchdev.h b/include/net/switchdev.h
index 89da8934519b..319baab3b48e 100644
--- a/include/net/switchdev.h
+++ b/include/net/switchdev.h
@@ -72,6 +72,7 @@ struct switchdev_obj {
 		struct switchdev_obj_fdb {		/* PORT_FDB */
 			const unsigned char *addr;
 			u16 vid;
+			u16 ndm_state;
 		} fdb;
 	} u;
 };
diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c
index 33bafa2e703e..16c1c43980a1 100644
--- a/net/switchdev/switchdev.c
+++ b/net/switchdev/switchdev.c
@@ -810,7 +810,7 @@ static int switchdev_port_fdb_dump_cb(struct net_device *dev,
 	ndm->ndm_flags   = NTF_SELF;
 	ndm->ndm_type    = 0;
 	ndm->ndm_ifindex = dev->ifindex;
-	ndm->ndm_state   = NUD_REACHABLE;
+	ndm->ndm_state   = obj->u.fdb.ndm_state;
 
 	if (nla_put(dump->skb, NDA_LLADDR, ETH_ALEN, obj->u.fdb.addr))
 		goto nla_put_failure;
-- 
cgit v1.2.3


From b72f6f51dc5abce94c1b5ee0186e9407ea0f919f Mon Sep 17 00:00:00 2001
From: Alexander Aring <alex.aring@gmail.com>
Date: Tue, 11 Aug 2015 21:44:08 +0200
Subject: 6lowpan: add generic 6lowpan netdev private data

This patch introduced the 6lowpan netdev private data struct. We name it
lowpan_priv and it's placed at the beginning of netdev private data. All
lowpan interfaces should allocate this room at first of netdev private
data. 6LoWPAN LL private data can be allocate by additional netdev private
data, e.g. dev->priv_size should be "sizeof(struct lowpan_priv) +
sizeof(LL_LOWPAN_PRIVATE_DATA)".

Signed-off-by: Alexander Aring <alex.aring@gmail.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/net/6lowpan.h              | 23 +++++++++++++++++++++++
 net/6lowpan/Makefile               |  2 +-
 net/6lowpan/core.c                 | 20 ++++++++++++++++++++
 net/bluetooth/6lowpan.c            |  9 ++++++---
 net/ieee802154/6lowpan/6lowpan_i.h |  3 ++-
 net/ieee802154/6lowpan/core.c      |  4 +++-
 6 files changed, 55 insertions(+), 6 deletions(-)
 create mode 100644 net/6lowpan/core.c

(limited to 'include/net')

diff --git a/include/net/6lowpan.h b/include/net/6lowpan.h
index dc03d77ad23b..a2f59ec98d24 100644
--- a/include/net/6lowpan.h
+++ b/include/net/6lowpan.h
@@ -197,6 +197,27 @@
 #define LOWPAN_NHC_UDP_CS_P_11	0xF3 /* source & dest = 0xF0B + 4bit inline */
 #define LOWPAN_NHC_UDP_CS_C	0x04 /* checksum elided */
 
+#define LOWPAN_PRIV_SIZE(llpriv_size)	\
+	(sizeof(struct lowpan_priv) + llpriv_size)
+
+enum lowpan_lltypes {
+	LOWPAN_LLTYPE_BTLE,
+	LOWPAN_LLTYPE_IEEE802154,
+};
+
+struct lowpan_priv {
+	enum lowpan_lltypes lltype;
+
+	/* must be last */
+	u8 priv[0] __aligned(sizeof(void *));
+};
+
+static inline
+struct lowpan_priv *lowpan_priv(const struct net_device *dev)
+{
+	return netdev_priv(dev);
+}
+
 #ifdef DEBUG
 /* print data in line */
 static inline void raw_dump_inline(const char *caller, char *msg,
@@ -372,6 +393,8 @@ lowpan_uncompress_size(const struct sk_buff *skb, u16 *dgram_offset)
 	return skb->len + uncomp_header - ret;
 }
 
+void lowpan_netdev_setup(struct net_device *dev, enum lowpan_lltypes lltype);
+
 int
 lowpan_header_decompress(struct sk_buff *skb, struct net_device *dev,
 			 const u8 *saddr, const u8 saddr_type,
diff --git a/net/6lowpan/Makefile b/net/6lowpan/Makefile
index eb8baa72adc8..c6ffc55ee0d7 100644
--- a/net/6lowpan/Makefile
+++ b/net/6lowpan/Makefile
@@ -1,6 +1,6 @@
 obj-$(CONFIG_6LOWPAN) += 6lowpan.o
 
-6lowpan-y := iphc.o nhc.o
+6lowpan-y := core.o iphc.o nhc.o
 
 #rfc6282 nhcs
 obj-$(CONFIG_6LOWPAN_NHC_DEST) += nhc_dest.o
diff --git a/net/6lowpan/core.c b/net/6lowpan/core.c
new file mode 100644
index 000000000000..ed0eec9b41a1
--- /dev/null
+++ b/net/6lowpan/core.c
@@ -0,0 +1,20 @@
+/* This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * Authors:
+ * (C) 2015 Pengutronix, Alexander Aring <aar@pengutronix.de>
+ */
+
+#include <net/6lowpan.h>
+
+void lowpan_netdev_setup(struct net_device *dev, enum lowpan_lltypes lltype)
+{
+	lowpan_priv(dev)->lltype = lltype;
+}
+EXPORT_SYMBOL(lowpan_netdev_setup);
diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c
index 24ed5b02cefc..131e79cde350 100644
--- a/net/bluetooth/6lowpan.c
+++ b/net/bluetooth/6lowpan.c
@@ -85,7 +85,7 @@ struct lowpan_dev {
 
 static inline struct lowpan_dev *lowpan_dev(const struct net_device *netdev)
 {
-	return netdev_priv(netdev);
+	return (struct lowpan_dev *)lowpan_priv(netdev)->priv;
 }
 
 static inline void peer_add(struct lowpan_dev *dev, struct lowpan_peer *peer)
@@ -848,8 +848,9 @@ static int setup_netdev(struct l2cap_chan *chan, struct lowpan_dev **dev)
 	struct net_device *netdev;
 	int err = 0;
 
-	netdev = alloc_netdev(sizeof(struct lowpan_dev), IFACE_NAME_TEMPLATE,
-			      NET_NAME_UNKNOWN, netdev_setup);
+	netdev = alloc_netdev(LOWPAN_PRIV_SIZE(sizeof(struct lowpan_dev)),
+			      IFACE_NAME_TEMPLATE, NET_NAME_UNKNOWN,
+			      netdev_setup);
 	if (!netdev)
 		return -ENOMEM;
 
@@ -869,6 +870,8 @@ static int setup_netdev(struct l2cap_chan *chan, struct lowpan_dev **dev)
 	list_add_rcu(&(*dev)->list, &bt_6lowpan_devices);
 	spin_unlock(&devices_lock);
 
+	lowpan_netdev_setup(netdev, LOWPAN_LLTYPE_BTLE);
+
 	err = register_netdev(netdev);
 	if (err < 0) {
 		BT_INFO("register_netdev failed %d", err);
diff --git a/net/ieee802154/6lowpan/6lowpan_i.h b/net/ieee802154/6lowpan/6lowpan_i.h
index 923b680adb61..ea339fa94c27 100644
--- a/net/ieee802154/6lowpan/6lowpan_i.h
+++ b/net/ieee802154/6lowpan/6lowpan_i.h
@@ -5,6 +5,7 @@
 
 #include <net/ieee802154_netdev.h>
 #include <net/inet_frag.h>
+#include <net/6lowpan.h>
 
 struct lowpan_create_arg {
 	u16 tag;
@@ -46,7 +47,7 @@ struct lowpan_dev_info {
 static inline struct
 lowpan_dev_info *lowpan_dev_info(const struct net_device *dev)
 {
-	return netdev_priv(dev);
+	return (struct lowpan_dev_info *)lowpan_priv(dev)->priv;
 }
 
 int lowpan_frag_rcv(struct sk_buff *skb, const u8 frag_type);
diff --git a/net/ieee802154/6lowpan/core.c b/net/ieee802154/6lowpan/core.c
index a4edee8fdc79..180e9f5f86c3 100644
--- a/net/ieee802154/6lowpan/core.c
+++ b/net/ieee802154/6lowpan/core.c
@@ -138,6 +138,8 @@ static int lowpan_newlink(struct net *src_net, struct net_device *dev,
 	/* Set the lowpan hardware address to the wpan hardware address. */
 	memcpy(dev->dev_addr, real_dev->dev_addr, IEEE802154_ADDR_LEN);
 
+	lowpan_netdev_setup(dev, LOWPAN_LLTYPE_IEEE802154);
+
 	ret = register_netdevice(dev);
 	if (ret >= 0) {
 		real_dev->ieee802154_ptr->lowpan_dev = dev;
@@ -162,7 +164,7 @@ static void lowpan_dellink(struct net_device *dev, struct list_head *head)
 
 static struct rtnl_link_ops lowpan_link_ops __read_mostly = {
 	.kind		= "lowpan",
-	.priv_size	= sizeof(struct lowpan_dev_info),
+	.priv_size	= LOWPAN_PRIV_SIZE(sizeof(struct lowpan_dev_info)),
 	.setup		= lowpan_setup,
 	.newlink	= lowpan_newlink,
 	.dellink	= lowpan_dellink,
-- 
cgit v1.2.3


From 4b58c37bb9d4282446f7a0194dbc44325787ac8c Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 8 Jul 2015 15:41:48 +0300
Subject: mac80211: remove ieee80211_aes_cmac_calculate_k1_k2()

The iwlwifi driver was the only driver that used this, but as
it turns out it never needed it, so we can remove it.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h  | 13 -------------
 net/mac80211/aes_cmac.c | 17 -----------------
 2 files changed, 30 deletions(-)

(limited to 'include/net')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 484cc14fb947..e3314e516681 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -4330,19 +4330,6 @@ void ieee80211_get_tkip_rx_p1k(struct ieee80211_key_conf *keyconf,
 void ieee80211_get_tkip_p2k(struct ieee80211_key_conf *keyconf,
 			    struct sk_buff *skb, u8 *p2k);
 
-/**
- * ieee80211_aes_cmac_calculate_k1_k2 - calculate the AES-CMAC sub keys
- *
- * This function computes the two AES-CMAC sub-keys, based on the
- * previously installed master key.
- *
- * @keyconf: the parameter passed with the set key
- * @k1: a buffer to be filled with the 1st sub-key
- * @k2: a buffer to be filled with the 2nd sub-key
- */
-void ieee80211_aes_cmac_calculate_k1_k2(struct ieee80211_key_conf *keyconf,
-					u8 *k1, u8 *k2);
-
 /**
  * ieee80211_get_key_tx_seq - get key TX sequence counter
  *
diff --git a/net/mac80211/aes_cmac.c b/net/mac80211/aes_cmac.c
index 4192806be3d3..bdf0790d89cc 100644
--- a/net/mac80211/aes_cmac.c
+++ b/net/mac80211/aes_cmac.c
@@ -145,20 +145,3 @@ void ieee80211_aes_cmac_key_free(struct crypto_cipher *tfm)
 {
 	crypto_free_cipher(tfm);
 }
-
-void ieee80211_aes_cmac_calculate_k1_k2(struct ieee80211_key_conf *keyconf,
-					u8 *k1, u8 *k2)
-{
-	u8 l[AES_BLOCK_SIZE] = {};
-	struct ieee80211_key *key =
-		container_of(keyconf, struct ieee80211_key, conf);
-
-	crypto_cipher_encrypt_one(key->u.aes_cmac.tfm, l, l);
-
-	memcpy(k1, l, AES_BLOCK_SIZE);
-	gf_mulx(k1);
-
-	memcpy(k2, k1, AES_BLOCK_SIZE);
-	gf_mulx(k2);
-}
-EXPORT_SYMBOL(ieee80211_aes_cmac_calculate_k1_k2);
-- 
cgit v1.2.3


From 111495361598205967f1be4e07d4726b0f762d60 Mon Sep 17 00:00:00 2001
From: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Date: Thu, 13 Aug 2015 12:52:17 -0400
Subject: net: dsa: add support for switchdev VLAN objects

Add new functions in DSA drivers to access hardware VLAN entries through
SWITCHDEV_OBJ_PORT_VLAN objects:

 - port_pvid_get() and vlan_getnext() to dump a VLAN
 - port_vlan_del() to exclude a port from a VLAN
 - port_pvid_set() and port_vlan_add() to join a port to a VLAN

The DSA infrastructure will ensure that each VLAN of the given range
does not already belong to another bridge. If it does, it will fallback
to software VLAN and won't program the hardware.

Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h |  11 ++++
 net/dsa/slave.c   | 158 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 169 insertions(+)

(limited to 'include/net')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 6356f437e911..bd9b76502458 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -297,6 +297,17 @@ struct dsa_switch_driver {
 	int	(*port_stp_update)(struct dsa_switch *ds, int port,
 				   u8 state);
 
+	/*
+	 * VLAN support
+	 */
+	int	(*port_pvid_get)(struct dsa_switch *ds, int port, u16 *pvid);
+	int	(*port_pvid_set)(struct dsa_switch *ds, int port, u16 pvid);
+	int	(*port_vlan_add)(struct dsa_switch *ds, int port, u16 vid,
+				 bool untagged);
+	int	(*port_vlan_del)(struct dsa_switch *ds, int port, u16 vid);
+	int	(*vlan_getnext)(struct dsa_switch *ds, u16 *vid,
+				unsigned long *ports, unsigned long *untagged);
+
 	/*
 	 * Forwarding database
 	 */
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index aa0266f7d0ce..373ff315030d 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -200,6 +200,152 @@ out:
 	return 0;
 }
 
+static int dsa_bridge_check_vlan_range(struct dsa_switch *ds,
+				       const struct net_device *bridge,
+				       u16 vid_begin, u16 vid_end)
+{
+	struct dsa_slave_priv *p;
+	struct net_device *dev, *vlan_br;
+	DECLARE_BITMAP(members, DSA_MAX_PORTS);
+	DECLARE_BITMAP(untagged, DSA_MAX_PORTS);
+	u16 vid;
+	int member, err;
+
+	if (!ds->drv->vlan_getnext || !vid_begin)
+		return -EOPNOTSUPP;
+
+	vid = vid_begin - 1;
+
+	do {
+		err = ds->drv->vlan_getnext(ds, &vid, members, untagged);
+		if (err)
+			break;
+
+		if (vid > vid_end)
+			break;
+
+		member = find_first_bit(members, DSA_MAX_PORTS);
+		if (member == DSA_MAX_PORTS)
+			continue;
+
+		dev = ds->ports[member];
+		p = netdev_priv(dev);
+		vlan_br = p->bridge_dev;
+		if (vlan_br == bridge)
+			continue;
+
+		netdev_dbg(vlan_br, "hardware VLAN %d already in use\n", vid);
+		return -EOPNOTSUPP;
+	} while (vid < vid_end);
+
+	return err == -ENOENT ? 0 : err;
+}
+
+static int dsa_slave_port_vlan_add(struct net_device *dev,
+				   struct switchdev_obj *obj)
+{
+	struct switchdev_obj_vlan *vlan = &obj->u.vlan;
+	struct dsa_slave_priv *p = netdev_priv(dev);
+	struct dsa_switch *ds = p->parent;
+	u16 vid;
+	int err;
+
+	switch (obj->trans) {
+	case SWITCHDEV_TRANS_PREPARE:
+		if (!ds->drv->port_vlan_add || !ds->drv->port_pvid_set)
+			return -EOPNOTSUPP;
+
+		/* If the requested port doesn't belong to the same bridge as
+		 * the VLAN members, fallback to software VLAN (hopefully).
+		 */
+		err = dsa_bridge_check_vlan_range(ds, p->bridge_dev,
+						  vlan->vid_begin,
+						  vlan->vid_end);
+		if (err)
+			return err;
+		break;
+	case SWITCHDEV_TRANS_COMMIT:
+		for (vid = vlan->vid_begin; vid <= vlan->vid_end; ++vid) {
+			err = ds->drv->port_vlan_add(ds, p->port, vid,
+						     vlan->flags &
+						     BRIDGE_VLAN_INFO_UNTAGGED);
+			if (!err && vlan->flags & BRIDGE_VLAN_INFO_PVID)
+				err = ds->drv->port_pvid_set(ds, p->port, vid);
+			if (err)
+				return err;
+		}
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	return 0;
+}
+
+static int dsa_slave_port_vlan_del(struct net_device *dev,
+				   struct switchdev_obj *obj)
+{
+	struct switchdev_obj_vlan *vlan = &obj->u.vlan;
+	struct dsa_slave_priv *p = netdev_priv(dev);
+	struct dsa_switch *ds = p->parent;
+	u16 vid;
+	int err;
+
+	if (!ds->drv->port_vlan_del)
+		return -EOPNOTSUPP;
+
+	for (vid = vlan->vid_begin; vid <= vlan->vid_end; ++vid) {
+		err = ds->drv->port_vlan_del(ds, p->port, vid);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+static int dsa_slave_port_vlan_dump(struct net_device *dev,
+				    struct switchdev_obj *obj)
+{
+	struct switchdev_obj_vlan *vlan = &obj->u.vlan;
+	struct dsa_slave_priv *p = netdev_priv(dev);
+	struct dsa_switch *ds = p->parent;
+	DECLARE_BITMAP(members, DSA_MAX_PORTS);
+	DECLARE_BITMAP(untagged, DSA_MAX_PORTS);
+	u16 pvid, vid = 0;
+	int err;
+
+	if (!ds->drv->vlan_getnext || !ds->drv->port_pvid_get)
+		return -EOPNOTSUPP;
+
+	err = ds->drv->port_pvid_get(ds, p->port, &pvid);
+	if (err)
+		return err;
+
+	for (;;) {
+		err = ds->drv->vlan_getnext(ds, &vid, members, untagged);
+		if (err)
+			break;
+
+		if (!test_bit(p->port, members))
+			continue;
+
+		memset(vlan, 0, sizeof(*vlan));
+		vlan->vid_begin = vlan->vid_end = vid;
+
+		if (vid == pvid)
+			vlan->flags |= BRIDGE_VLAN_INFO_PVID;
+
+		if (test_bit(p->port, untagged))
+			vlan->flags |= BRIDGE_VLAN_INFO_UNTAGGED;
+
+		err = obj->cb(dev, obj);
+		if (err)
+			break;
+	}
+
+	return err == -ENOENT ? 0 : err;
+}
+
 static int dsa_slave_port_fdb_add(struct net_device *dev,
 				  struct switchdev_obj *obj)
 {
@@ -341,6 +487,9 @@ static int dsa_slave_port_obj_add(struct net_device *dev,
 	case SWITCHDEV_OBJ_PORT_FDB:
 		err = dsa_slave_port_fdb_add(dev, obj);
 		break;
+	case SWITCHDEV_OBJ_PORT_VLAN:
+		err = dsa_slave_port_vlan_add(dev, obj);
+		break;
 	default:
 		err = -EOPNOTSUPP;
 		break;
@@ -358,6 +507,9 @@ static int dsa_slave_port_obj_del(struct net_device *dev,
 	case SWITCHDEV_OBJ_PORT_FDB:
 		err = dsa_slave_port_fdb_del(dev, obj);
 		break;
+	case SWITCHDEV_OBJ_PORT_VLAN:
+		err = dsa_slave_port_vlan_del(dev, obj);
+		break;
 	default:
 		err = -EOPNOTSUPP;
 		break;
@@ -375,6 +527,9 @@ static int dsa_slave_port_obj_dump(struct net_device *dev,
 	case SWITCHDEV_OBJ_PORT_FDB:
 		err = dsa_slave_port_fdb_dump(dev, obj);
 		break;
+	case SWITCHDEV_OBJ_PORT_VLAN:
+		err = dsa_slave_port_vlan_dump(dev, obj);
+		break;
 	default:
 		err = -EOPNOTSUPP;
 		break;
@@ -794,6 +949,9 @@ static const struct net_device_ops dsa_slave_netdev_ops = {
 	.ndo_netpoll_cleanup	= dsa_slave_netpoll_cleanup,
 	.ndo_poll_controller	= dsa_slave_poll_controller,
 #endif
+	.ndo_bridge_getlink	= switchdev_port_bridge_getlink,
+	.ndo_bridge_setlink	= switchdev_port_bridge_setlink,
+	.ndo_bridge_dellink	= switchdev_port_bridge_dellink,
 };
 
 static const struct switchdev_ops dsa_slave_switchdev_ops = {
-- 
cgit v1.2.3


From 4e3c89920cd3a6cfce22c6f537690747c26128dd Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Thu, 13 Aug 2015 14:59:00 -0600
Subject: net: Introduce VRF related flags and helpers

Add a VRF_MASTER flag for interfaces and helper functions for determining
if a device is a VRF_MASTER.

Add link attribute for passing VRF_TABLE id.

Add vrf_ptr to netdevice.

Add various macros for determining if a device is a VRF device, the index
of the master VRF device and table associated with VRF device.

Signed-off-by: Shrijeet Mukherjee <shm@cumulusnetworks.com>
Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h    |  20 +++++++
 include/net/vrf.h            | 139 +++++++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/if_link.h |   9 +++
 3 files changed, 168 insertions(+)
 create mode 100644 include/net/vrf.h

(limited to 'include/net')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 607b5f41f46f..f7a6ef2fae3a 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1289,6 +1289,7 @@ enum netdev_priv_flags {
 	IFF_XMIT_DST_RELEASE_PERM	= 1<<22,
 	IFF_IPVLAN_MASTER		= 1<<23,
 	IFF_IPVLAN_SLAVE		= 1<<24,
+	IFF_VRF_MASTER			= 1<<25,
 };
 
 #define IFF_802_1Q_VLAN			IFF_802_1Q_VLAN
@@ -1316,6 +1317,7 @@ enum netdev_priv_flags {
 #define IFF_XMIT_DST_RELEASE_PERM	IFF_XMIT_DST_RELEASE_PERM
 #define IFF_IPVLAN_MASTER		IFF_IPVLAN_MASTER
 #define IFF_IPVLAN_SLAVE		IFF_IPVLAN_SLAVE
+#define IFF_VRF_MASTER			IFF_VRF_MASTER
 
 /**
  *	struct net_device - The DEVICE structure.
@@ -1432,6 +1434,7 @@ enum netdev_priv_flags {
  *	@dn_ptr:	DECnet specific data
  *	@ip6_ptr:	IPv6 specific data
  *	@ax25_ptr:	AX.25 specific data
+ *	@vrf_ptr:	VRF specific data
  *	@ieee80211_ptr:	IEEE 802.11 specific data, assign before registering
  *
  *	@last_rx:	Time of last Rx
@@ -1650,6 +1653,7 @@ struct net_device {
 	struct dn_dev __rcu     *dn_ptr;
 	struct inet6_dev __rcu	*ip6_ptr;
 	void			*ax25_ptr;
+	struct net_vrf_dev __rcu *vrf_ptr;
 	struct wireless_dev	*ieee80211_ptr;
 	struct wpan_dev		*ieee802154_ptr;
 #if IS_ENABLED(CONFIG_MPLS_ROUTING)
@@ -3808,6 +3812,22 @@ static inline bool netif_supports_nofcs(struct net_device *dev)
 	return dev->priv_flags & IFF_SUPP_NOFCS;
 }
 
+static inline bool netif_is_vrf(const struct net_device *dev)
+{
+	return dev->priv_flags & IFF_VRF_MASTER;
+}
+
+static inline bool netif_index_is_vrf(struct net *net, int ifindex)
+{
+	struct net_device *dev = dev_get_by_index_rcu(net, ifindex);
+	bool rc = false;
+
+	if (dev)
+		rc = netif_is_vrf(dev);
+
+	return rc;
+}
+
 /* This device needs to keep skb dst for qdisc enqueue or ndo_start_xmit() */
 static inline void netif_keep_dst(struct net_device *dev)
 {
diff --git a/include/net/vrf.h b/include/net/vrf.h
new file mode 100644
index 000000000000..0484d29d4589
--- /dev/null
+++ b/include/net/vrf.h
@@ -0,0 +1,139 @@
+/*
+ * include/net/net_vrf.h - adds vrf dev structure definitions
+ * Copyright (c) 2015 Cumulus Networks
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#ifndef __LINUX_NET_VRF_H
+#define __LINUX_NET_VRF_H
+
+struct net_vrf_dev {
+	struct rcu_head		rcu;
+	int                     ifindex; /* ifindex of master dev */
+	u32                     tb_id;   /* table id for VRF */
+};
+
+struct slave {
+	struct list_head	list;
+	struct net_device	*dev;
+};
+
+struct slave_queue {
+	struct list_head	all_slaves;
+	int			num_slaves;
+};
+
+struct net_vrf {
+	struct slave_queue	queue;
+	struct rtable           *rth;
+	u32			tb_id;
+};
+
+
+#if IS_ENABLED(CONFIG_NET_VRF)
+/* called with rcu_read_lock() */
+static inline int vrf_master_ifindex_rcu(const struct net_device *dev)
+{
+	struct net_vrf_dev *vrf_ptr;
+	int ifindex = 0;
+
+	if (!dev)
+		return 0;
+
+	if (netif_is_vrf(dev))
+		ifindex = dev->ifindex;
+	else {
+		vrf_ptr = rcu_dereference(dev->vrf_ptr);
+		if (vrf_ptr)
+			ifindex = vrf_ptr->ifindex;
+	}
+
+	return ifindex;
+}
+
+/* called with rcu_read_lock */
+static inline int vrf_dev_table_rcu(const struct net_device *dev)
+{
+	int tb_id = 0;
+
+	if (dev) {
+		struct net_vrf_dev *vrf_ptr;
+
+		vrf_ptr = rcu_dereference(dev->vrf_ptr);
+		if (vrf_ptr)
+			tb_id = vrf_ptr->tb_id;
+	}
+	return tb_id;
+}
+
+static inline int vrf_dev_table(const struct net_device *dev)
+{
+	int tb_id;
+
+	rcu_read_lock();
+	tb_id = vrf_dev_table_rcu(dev);
+	rcu_read_unlock();
+
+	return tb_id;
+}
+
+/* called with rtnl */
+static inline int vrf_dev_table_rtnl(const struct net_device *dev)
+{
+	int tb_id = 0;
+
+	if (dev) {
+		struct net_vrf_dev *vrf_ptr;
+
+		vrf_ptr = rtnl_dereference(dev->vrf_ptr);
+		if (vrf_ptr)
+			tb_id = vrf_ptr->tb_id;
+	}
+	return tb_id;
+}
+
+/* caller has already checked netif_is_vrf(dev) */
+static inline struct rtable *vrf_dev_get_rth(const struct net_device *dev)
+{
+	struct rtable *rth = ERR_PTR(-ENETUNREACH);
+	struct net_vrf *vrf = netdev_priv(dev);
+
+	if (vrf) {
+		rth = vrf->rth;
+		atomic_inc(&rth->dst.__refcnt);
+	}
+	return rth;
+}
+
+#else
+static inline int vrf_master_ifindex_rcu(const struct net_device *dev)
+{
+	return 0;
+}
+
+static inline int vrf_dev_table_rcu(const struct net_device *dev)
+{
+	return 0;
+}
+
+static inline int vrf_dev_table(const struct net_device *dev)
+{
+	return 0;
+}
+
+static inline int vrf_dev_table_rtnl(const struct net_device *dev)
+{
+	return 0;
+}
+
+static inline struct rtable *vrf_dev_get_rth(const struct net_device *dev)
+{
+	return ERR_PTR(-ENETUNREACH);
+}
+#endif
+
+#endif /* __LINUX_NET_VRF_H */
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index d450be36add2..313c305fd1ad 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -341,6 +341,15 @@ enum macvlan_macaddr_mode {
 
 #define MACVLAN_FLAG_NOPROMISC	1
 
+/* VRF section */
+enum {
+	IFLA_VRF_UNSPEC,
+	IFLA_VRF_TABLE,
+	__IFLA_VRF_MAX
+};
+
+#define IFLA_VRF_MAX (__IFLA_VRF_MAX - 1)
+
 /* IPVLAN section */
 enum {
 	IFLA_IPVLAN_UNSPEC,
-- 
cgit v1.2.3


From 613d09b30f8b589d5a9b49775054c8865db95d1c Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Thu, 13 Aug 2015 14:59:02 -0600
Subject: net: Use VRF device index for lookups on TX

As with ingress use the index of VRF master device for route lookups on
egress. However, the oif should only be used to direct the lookups to a
specific table. Routes in the table are not based on the VRF device but
rather interfaces that are part of the VRF so do not consider the oif for
lookups within the table. The FLOWI_FLAG_VRFSRC is used to control this
latter part.

Signed-off-by: Shrijeet Mukherjee <shm@cumulusnetworks.com>
Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/flow.h  | 1 +
 include/net/route.h | 3 +++
 net/ipv4/fib_trie.c | 7 +++++--
 net/ipv4/icmp.c     | 4 ++++
 net/ipv4/route.c    | 5 +++++
 5 files changed, 18 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/flow.h b/include/net/flow.h
index 3098ae33a178..f305588fc162 100644
--- a/include/net/flow.h
+++ b/include/net/flow.h
@@ -33,6 +33,7 @@ struct flowi_common {
 	__u8	flowic_flags;
 #define FLOWI_FLAG_ANYSRC		0x01
 #define FLOWI_FLAG_KNOWN_NH		0x02
+#define FLOWI_FLAG_VRFSRC		0x04
 	__u32	flowic_secid;
 	struct flowi_tunnel flowic_tun_key;
 };
diff --git a/include/net/route.h b/include/net/route.h
index 2d45f419477f..94189d4bd899 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -251,6 +251,9 @@ static inline void ip_route_connect_init(struct flowi4 *fl4, __be32 dst, __be32
 	if (inet_sk(sk)->transparent)
 		flow_flags |= FLOWI_FLAG_ANYSRC;
 
+	if (netif_index_is_vrf(sock_net(sk), oif))
+		flow_flags |= FLOWI_FLAG_VRFSRC;
+
 	flowi4_init_output(fl4, oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE,
 			   protocol, flow_flags, dst, src, dport, sport);
 }
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 37c4bb89a708..1243c79cb5b0 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1423,8 +1423,11 @@ found:
 			    nh->nh_flags & RTNH_F_LINKDOWN &&
 			    !(fib_flags & FIB_LOOKUP_IGNORE_LINKSTATE))
 				continue;
-			if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
-				continue;
+			if (!(flp->flowi4_flags & FLOWI_FLAG_VRFSRC)) {
+				if (flp->flowi4_oif &&
+				    flp->flowi4_oif != nh->nh_oif)
+					continue;
+			}
 
 			if (!(fib_flags & FIB_LOOKUP_NOREF))
 				atomic_inc(&fi->fib_clntref);
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index c0556f1e4bf0..1164fc4ce3bc 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -96,6 +96,7 @@
 #include <net/xfrm.h>
 #include <net/inet_common.h>
 #include <net/ip_fib.h>
+#include <net/vrf.h>
 
 /*
  *	Build xmit assembly blocks
@@ -425,6 +426,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
 	fl4.flowi4_mark = mark;
 	fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
 	fl4.flowi4_proto = IPPROTO_ICMP;
+	fl4.flowi4_oif = vrf_master_ifindex_rcu(skb->dev) ? : skb->dev->ifindex;
 	security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
 	rt = ip_route_output_key(net, &fl4);
 	if (IS_ERR(rt))
@@ -458,6 +460,8 @@ static struct rtable *icmp_route_lookup(struct net *net,
 	fl4->flowi4_proto = IPPROTO_ICMP;
 	fl4->fl4_icmp_type = type;
 	fl4->fl4_icmp_code = code;
+	fl4->flowi4_oif = vrf_master_ifindex_rcu(skb_in->dev) ? : skb_in->dev->ifindex;
+
 	security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4));
 	rt = __ip_route_output_key(net, fl4);
 	if (IS_ERR(rt))
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index c26ff1f7067d..2c89d294b669 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2131,6 +2131,11 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
 				fl4->saddr = inet_select_addr(dev_out, 0,
 							      RT_SCOPE_HOST);
 		}
+		if (netif_is_vrf(dev_out) &&
+		    !(fl4->flowi4_flags & FLOWI_FLAG_VRFSRC)) {
+			rth = vrf_dev_get_rth(dev_out);
+			goto out;
+		}
 	}
 
 	if (!fl4->daddr) {
-- 
cgit v1.2.3


From 15be405eb2ea943ac5fa2aab7d0ba282e9ef1301 Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Thu, 13 Aug 2015 14:59:04 -0600
Subject: net: Add inet_addr lookup by table

Currently inet_addr_type and inet_dev_addr_type expect local addresses
to be in the local table. With the VRF device local routes for devices
associated with a VRF will be in the table associated with the VRF.
Provide an alternate inet_addr lookup to use a specific table rather
than defaulting to the local table.

Signed-off-by: Shrijeet Mukherjee <shm@cumulusnetworks.com>
Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/route.h     |  1 +
 net/ipv4/fib_frontend.c | 22 +++++++++++++++-------
 2 files changed, 16 insertions(+), 7 deletions(-)

(limited to 'include/net')

diff --git a/include/net/route.h b/include/net/route.h
index 94189d4bd899..6ba681f0b98d 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -189,6 +189,7 @@ void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk);
 void ip_rt_send_redirect(struct sk_buff *skb);
 
 unsigned int inet_addr_type(struct net *net, __be32 addr);
+unsigned int inet_addr_type_table(struct net *net, __be32 addr, int tb_id);
 unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
 				__be32 addr);
 void ip_rt_multicast_event(struct in_device *);
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index d8ced1d89f1b..b11321a8e58d 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -212,12 +212,12 @@ void fib_flush_external(struct net *net)
  */
 static inline unsigned int __inet_dev_addr_type(struct net *net,
 						const struct net_device *dev,
-						__be32 addr)
+						__be32 addr, int tb_id)
 {
 	struct flowi4		fl4 = { .daddr = addr };
 	struct fib_result	res;
 	unsigned int ret = RTN_BROADCAST;
-	struct fib_table *local_table;
+	struct fib_table *table;
 
 	if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr))
 		return RTN_BROADCAST;
@@ -226,10 +226,10 @@ static inline unsigned int __inet_dev_addr_type(struct net *net,
 
 	rcu_read_lock();
 
-	local_table = fib_get_table(net, RT_TABLE_LOCAL);
-	if (local_table) {
+	table = fib_get_table(net, tb_id);
+	if (table) {
 		ret = RTN_UNICAST;
-		if (!fib_table_lookup(local_table, &fl4, &res, FIB_LOOKUP_NOREF)) {
+		if (!fib_table_lookup(table, &fl4, &res, FIB_LOOKUP_NOREF)) {
 			if (!dev || dev == res.fi->fib_dev)
 				ret = res.type;
 		}
@@ -239,16 +239,24 @@ static inline unsigned int __inet_dev_addr_type(struct net *net,
 	return ret;
 }
 
+unsigned int inet_addr_type_table(struct net *net, __be32 addr, int tb_id)
+{
+	return __inet_dev_addr_type(net, NULL, addr, tb_id);
+}
+EXPORT_SYMBOL(inet_addr_type_table);
+
 unsigned int inet_addr_type(struct net *net, __be32 addr)
 {
-	return __inet_dev_addr_type(net, NULL, addr);
+	return __inet_dev_addr_type(net, NULL, addr, RT_TABLE_LOCAL);
 }
 EXPORT_SYMBOL(inet_addr_type);
 
 unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
 				__be32 addr)
 {
-	return __inet_dev_addr_type(net, dev, addr);
+	int rt_table = vrf_dev_table(dev) ? : RT_TABLE_LOCAL;
+
+	return __inet_dev_addr_type(net, dev, addr, rt_table);
 }
 EXPORT_SYMBOL(inet_dev_addr_type);
 
-- 
cgit v1.2.3


From 30bbaa19500559d7625c65632195413f639b3b97 Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Thu, 13 Aug 2015 14:59:05 -0600
Subject: net: Fix up inet_addr_type checks

Currently inet_addr_type and inet_dev_addr_type expect local addresses
to be in the local table. With the VRF device local routes for devices
associated with a VRF will be in the table associated with the VRF.
Provide an alternate inet_addr lookup to use a specific table rather
than defaulting to the local table.

inet_addr_type_dev_table keeps the same semantics as inet_addr_type but
if the passed in device is enslaved to a VRF then the table for that VRF
is used for the lookup.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/route.h      |  3 +++
 net/ipv4/af_inet.c       | 13 ++++++++++++-
 net/ipv4/arp.c           | 15 +++++++++------
 net/ipv4/fib_frontend.c  | 25 ++++++++++++++++++++++---
 net/ipv4/fib_semantics.c |  6 ++++--
 net/ipv4/icmp.c          |  5 +++--
 6 files changed, 53 insertions(+), 14 deletions(-)

(limited to 'include/net')

diff --git a/include/net/route.h b/include/net/route.h
index 6ba681f0b98d..6dda2c1bf8c6 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -192,6 +192,9 @@ unsigned int inet_addr_type(struct net *net, __be32 addr);
 unsigned int inet_addr_type_table(struct net *net, __be32 addr, int tb_id);
 unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
 				__be32 addr);
+unsigned int inet_addr_type_dev_table(struct net *net,
+				      const struct net_device *dev,
+				      __be32 addr);
 void ip_rt_multicast_event(struct in_device *);
 int ip_rt_ioctl(struct net *, unsigned int cmd, void __user *arg);
 void ip_rt_get_source(u8 *src, struct sk_buff *skb, struct rtable *rt);
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index cc4e498a0ccf..c8b855882fa5 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -119,6 +119,7 @@
 #ifdef CONFIG_IP_MROUTE
 #include <linux/mroute.h>
 #endif
+#include <net/vrf.h>
 
 
 /* The inetsw table contains everything that inet_create needs to
@@ -427,6 +428,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 	struct net *net = sock_net(sk);
 	unsigned short snum;
 	int chk_addr_ret;
+	int tb_id = RT_TABLE_LOCAL;
 	int err;
 
 	/* If the socket has its own bind function then use it. (RAW) */
@@ -448,7 +450,16 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 			goto out;
 	}
 
-	chk_addr_ret = inet_addr_type(net, addr->sin_addr.s_addr);
+	if (sk->sk_bound_dev_if) {
+		struct net_device *dev;
+
+		rcu_read_lock();
+		dev = dev_get_by_index_rcu(net, sk->sk_bound_dev_if);
+		if (dev)
+			tb_id = vrf_dev_table_rcu(dev) ? : tb_id;
+		rcu_read_unlock();
+	}
+	chk_addr_ret = inet_addr_type_table(net, addr->sin_addr.s_addr, tb_id);
 
 	/* Not specified by any standard per-se, however it breaks too
 	 * many applications when removed.  It is unfortunate since
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 34a308573f4b..30409b75e925 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -233,7 +233,7 @@ static int arp_constructor(struct neighbour *neigh)
 		return -EINVAL;
 	}
 
-	neigh->type = inet_addr_type(dev_net(dev), addr);
+	neigh->type = inet_addr_type_dev_table(dev_net(dev), dev, addr);
 
 	parms = in_dev->arp_parms;
 	__neigh_parms_put(neigh->parms);
@@ -343,7 +343,7 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
 	switch (IN_DEV_ARP_ANNOUNCE(in_dev)) {
 	default:
 	case 0:		/* By default announce any local IP */
-		if (skb && inet_addr_type(dev_net(dev),
+		if (skb && inet_addr_type_dev_table(dev_net(dev), dev,
 					  ip_hdr(skb)->saddr) == RTN_LOCAL)
 			saddr = ip_hdr(skb)->saddr;
 		break;
@@ -351,7 +351,8 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
 		if (!skb)
 			break;
 		saddr = ip_hdr(skb)->saddr;
-		if (inet_addr_type(dev_net(dev), saddr) == RTN_LOCAL) {
+		if (inet_addr_type_dev_table(dev_net(dev), dev,
+					     saddr) == RTN_LOCAL) {
 			/* saddr should be known to target */
 			if (inet_addr_onlink(in_dev, target, saddr))
 				break;
@@ -751,7 +752,7 @@ static int arp_process(struct sock *sk, struct sk_buff *skb)
 	/* Special case: IPv4 duplicate address detection packet (RFC2131) */
 	if (sip == 0) {
 		if (arp->ar_op == htons(ARPOP_REQUEST) &&
-		    inet_addr_type(net, tip) == RTN_LOCAL &&
+		    inet_addr_type_dev_table(net, dev, tip) == RTN_LOCAL &&
 		    !arp_ignore(in_dev, sip, tip))
 			arp_send(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha,
 				 dev->dev_addr, sha);
@@ -811,16 +812,18 @@ static int arp_process(struct sock *sk, struct sk_buff *skb)
 	n = __neigh_lookup(&arp_tbl, &sip, dev, 0);
 
 	if (IN_DEV_ARP_ACCEPT(in_dev)) {
+		unsigned int addr_type = inet_addr_type_dev_table(net, dev, sip);
+
 		/* Unsolicited ARP is not accepted by default.
 		   It is possible, that this option should be enabled for some
 		   devices (strip is candidate)
 		 */
 		is_garp = arp->ar_op == htons(ARPOP_REQUEST) && tip == sip &&
-			  inet_addr_type(net, sip) == RTN_UNICAST;
+			  addr_type == RTN_UNICAST;
 
 		if (!n &&
 		    ((arp->ar_op == htons(ARPOP_REPLY)  &&
-		      inet_addr_type(net, sip) == RTN_UNICAST) || is_garp))
+				addr_type == RTN_UNICAST) || is_garp))
 			n = __neigh_lookup(&arp_tbl, &sip, dev, 1);
 	}
 
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index b11321a8e58d..c55723ec4c3e 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -260,6 +260,19 @@ unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
 }
 EXPORT_SYMBOL(inet_dev_addr_type);
 
+/* inet_addr_type with dev == NULL but using the table from a dev
+ * if one is associated
+ */
+unsigned int inet_addr_type_dev_table(struct net *net,
+				      const struct net_device *dev,
+				      __be32 addr)
+{
+	int rt_table = vrf_dev_table(dev) ? : RT_TABLE_LOCAL;
+
+	return __inet_dev_addr_type(net, NULL, addr, rt_table);
+}
+EXPORT_SYMBOL(inet_addr_type_dev_table);
+
 __be32 fib_compute_spec_dst(struct sk_buff *skb)
 {
 	struct net_device *dev = skb->dev;
@@ -510,9 +523,12 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt,
 
 	addr = sk_extract_addr(&rt->rt_gateway);
 	if (rt->rt_gateway.sa_family == AF_INET && addr) {
+		unsigned int addr_type;
+
 		cfg->fc_gw = addr;
+		addr_type = inet_addr_type_table(net, addr, cfg->fc_table);
 		if (rt->rt_flags & RTF_GATEWAY &&
-		    inet_addr_type(net, addr) == RTN_UNICAST)
+		    addr_type == RTN_UNICAST)
 			cfg->fc_scope = RT_SCOPE_UNIVERSE;
 	}
 
@@ -984,11 +1000,14 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
 			fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim);
 	}
 	if (!(ok & LOCAL_OK)) {
+		unsigned int addr_type;
+
 		fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim);
 
 		/* Check, that this local address finally disappeared. */
-		if (gone &&
-		    inet_addr_type(dev_net(dev), ifa->ifa_local) != RTN_LOCAL) {
+		addr_type = inet_addr_type_dev_table(dev_net(dev), dev,
+						     ifa->ifa_local);
+		if (gone && addr_type != RTN_LOCAL) {
 			/* And the last, but not the least thing.
 			 * We must flush stray FIB entries.
 			 *
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 558e196bae0f..410ddb67221e 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -670,16 +670,18 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
 		struct fib_result res;
 
 		if (nh->nh_flags & RTNH_F_ONLINK) {
+			unsigned int addr_type;
 
 			if (cfg->fc_scope >= RT_SCOPE_LINK)
 				return -EINVAL;
-			if (inet_addr_type(net, nh->nh_gw) != RTN_UNICAST)
-				return -EINVAL;
 			dev = __dev_get_by_index(net, nh->nh_oif);
 			if (!dev)
 				return -ENODEV;
 			if (!(dev->flags & IFF_UP))
 				return -ENETDOWN;
+			addr_type = inet_addr_type_dev_table(net, dev, nh->nh_gw);
+			if (addr_type != RTN_UNICAST)
+				return -EINVAL;
 			if (!netif_carrier_ok(dev))
 				nh->nh_flags |= RTNH_F_LINKDOWN;
 			nh->nh_dev = dev;
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 1164fc4ce3bc..c6f1ce149ffb 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -484,7 +484,8 @@ static struct rtable *icmp_route_lookup(struct net *net,
 	if (err)
 		goto relookup_failed;
 
-	if (inet_addr_type(net, fl4_dec.saddr) == RTN_LOCAL) {
+	if (inet_addr_type_dev_table(net, skb_in->dev,
+				     fl4_dec.saddr) == RTN_LOCAL) {
 		rt2 = __ip_route_output_key(net, &fl4_dec);
 		if (IS_ERR(rt2))
 			err = PTR_ERR(rt2);
@@ -833,7 +834,7 @@ static bool icmp_unreach(struct sk_buff *skb)
 	 */
 
 	if (!net->ipv4.sysctl_icmp_ignore_bogus_error_responses &&
-	    inet_addr_type(net, iph->daddr) == RTN_BROADCAST) {
+	    inet_addr_type_dev_table(net, skb->dev, iph->daddr) == RTN_BROADCAST) {
 		net_warn_ratelimited("%pI4 sent an invalid ICMP type %u, code %u error to a broadcast: %pI4 on %s\n",
 				     &ip_hdr(skb)->saddr,
 				     icmph->type, icmph->code,
-- 
cgit v1.2.3


From dc028da54ed353edd44dca88b7eb19fd5126c354 Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Sun, 16 Aug 2015 17:13:27 -0600
Subject: inet: Move VRF table lookup to inlined function

Table lookup compiles out when VRF is not enabled.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/vrf.h  | 24 ++++++++++++++++++++++++
 net/ipv4/af_inet.c | 10 +---------
 2 files changed, 25 insertions(+), 9 deletions(-)

(limited to 'include/net')

diff --git a/include/net/vrf.h b/include/net/vrf.h
index 0484d29d4589..40e3793c7a05 100644
--- a/include/net/vrf.h
+++ b/include/net/vrf.h
@@ -81,6 +81,25 @@ static inline int vrf_dev_table(const struct net_device *dev)
 	return tb_id;
 }
 
+static inline int vrf_dev_table_ifindex(struct net *net, int ifindex)
+{
+	struct net_device *dev;
+	int tb_id = 0;
+
+	if (!ifindex)
+		return 0;
+
+	rcu_read_lock();
+
+	dev = dev_get_by_index_rcu(net, ifindex);
+	if (dev)
+		tb_id = vrf_dev_table_rcu(dev);
+
+	rcu_read_unlock();
+
+	return tb_id;
+}
+
 /* called with rtnl */
 static inline int vrf_dev_table_rtnl(const struct net_device *dev)
 {
@@ -125,6 +144,11 @@ static inline int vrf_dev_table(const struct net_device *dev)
 	return 0;
 }
 
+static inline int vrf_dev_table_ifindex(struct net *net, int ifindex)
+{
+	return 0;
+}
+
 static inline int vrf_dev_table_rtnl(const struct net_device *dev)
 {
 	return 0;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index c8b855882fa5..675e88cac2b4 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -450,15 +450,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 			goto out;
 	}
 
-	if (sk->sk_bound_dev_if) {
-		struct net_device *dev;
-
-		rcu_read_lock();
-		dev = dev_get_by_index_rcu(net, sk->sk_bound_dev_if);
-		if (dev)
-			tb_id = vrf_dev_table_rcu(dev) ? : tb_id;
-		rcu_read_unlock();
-	}
+	tb_id = vrf_dev_table_ifindex(net, sk->sk_bound_dev_if) ? : tb_id;
 	chk_addr_ret = inet_addr_type_table(net, addr->sin_addr.s_addr, tb_id);
 
 	/* Not specified by any standard per-se, however it breaks too
-- 
cgit v1.2.3


From deedb59039f111c41aa5a54ee384c8e7c08bc78a Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 14 Aug 2015 16:03:39 +0200
Subject: netfilter: nf_conntrack: add direction support for zones

This work adds a direction parameter to netfilter zones, so identity
separation can be performed only in original/reply or both directions
(default). This basically opens up the possibility of doing NAT with
conflicting IP address/port tuples from multiple, isolated tenants
on a host (e.g. from a netns) without requiring each tenant to NAT
twice resp. to use its own dedicated IP address to SNAT to, meaning
overlapping tuples can be made unique with the zone identifier in
original direction, where the NAT engine will then allocate a unique
tuple in the commonly shared default zone for the reply direction.
In some restricted, local DNAT cases, also port redirection could be
used for making the reply traffic unique w/o requiring SNAT.

The consensus we've reached and discussed at NFWS and since the initial
implementation [1] was to directly integrate the direction meta data
into the existing zones infrastructure, as opposed to the ct->mark
approach we proposed initially.

As we pass the nf_conntrack_zone object directly around, we don't have
to touch all call-sites, but only those, that contain equality checks
of zones. Thus, based on the current direction (original or reply),
we either return the actual id, or the default NF_CT_DEFAULT_ZONE_ID.
CT expectations are direction-agnostic entities when expectations are
being compared among themselves, so we can only use the identifier
in this case.

Note that zone identifiers can not be included into the hash mix
anymore as they don't contain a "stable" value that would be equal
for both directions at all times, f.e. if only zone->id would
unconditionally be xor'ed into the table slot hash, then replies won't
find the corresponding conntracking entry anymore.

If no particular direction is specified when configuring zones, the
behaviour is exactly as we expect currently (both directions).

Support has been added for the CT netlink interface as well as the
x_tables raw CT target, which both already offer existing interfaces
to user space for the configuration of zones.

Below a minimal, simplified collision example (script in [2]) with
netperf sessions:

  +--- tenant-1 ---+   mark := 1
  |    netperf     |--+
  +----------------+  |                CT zone := mark [ORIGINAL]
   [ip,sport] := X   +--------------+  +--- gateway ---+
                     | mark routing |--|     SNAT      |-- ... +
                     +--------------+  +---------------+       |
  +--- tenant-2 ---+  |                                     ~~~|~~~
  |    netperf     |--+                +-----------+           |
  +----------------+   mark := 2       | netserver |------ ... +
   [ip,sport] := X                     +-----------+
                                        [ip,port] := Y
On the gateway netns, example:

  iptables -t raw -A PREROUTING -j CT --zone mark --zone-dir ORIGINAL
  iptables -t nat -A POSTROUTING -o <dev> -j SNAT --to-source <ip> --random-fully

  iptables -t mangle -A PREROUTING -m conntrack --ctdir ORIGINAL -j CONNMARK --save-mark
  iptables -t mangle -A POSTROUTING -m conntrack --ctdir REPLY -j CONNMARK --restore-mark

conntrack dump from gateway netns:

  netperf -H 10.1.1.2 -t TCP_STREAM -l60 -p12865,5555 from each tenant netns

  tcp 6 431995 ESTABLISHED src=40.1.1.1 dst=10.1.1.2 sport=5555 dport=12865 zone-orig=1
                           src=10.1.1.2 dst=10.1.1.1 sport=12865 dport=1024
               [ASSURED] mark=1 secctx=system_u:object_r:unlabeled_t:s0 use=1

  tcp 6 431994 ESTABLISHED src=40.1.1.1 dst=10.1.1.2 sport=5555 dport=12865 zone-orig=2
                           src=10.1.1.2 dst=10.1.1.1 sport=12865 dport=5555
               [ASSURED] mark=2 secctx=system_u:object_r:unlabeled_t:s0 use=1

  tcp 6 299 ESTABLISHED src=40.1.1.1 dst=10.1.1.2 sport=39438 dport=33768 zone-orig=1
                        src=10.1.1.2 dst=10.1.1.1 sport=33768 dport=39438
               [ASSURED] mark=1 secctx=system_u:object_r:unlabeled_t:s0 use=1

  tcp 6 300 ESTABLISHED src=40.1.1.1 dst=10.1.1.2 sport=32889 dport=40206 zone-orig=2
                        src=10.1.1.2 dst=10.1.1.1 sport=40206 dport=32889
               [ASSURED] mark=2 secctx=system_u:object_r:unlabeled_t:s0 use=2

Taking this further, test script in [2] creates 200 tenants and runs
original-tuple colliding netperf sessions each. A conntrack -L dump in
the gateway netns also confirms 200 overlapping entries, all in ESTABLISHED
state as expected.

I also did run various other tests with some permutations of the script,
to mention some: SNAT in random/random-fully/persistent mode, no zones (no
overlaps), static zones (original, reply, both directions), etc.

  [1] http://thread.gmane.org/gmane.comp.security.firewalls.netfilter.devel/57412/
  [2] https://paste.fedoraproject.org/242835/65657871/

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_zones.h         |  31 +++-
 include/uapi/linux/netfilter/nfnetlink_conntrack.h |   1 +
 include/uapi/linux/netfilter/xt_CT.h               |   6 +-
 net/ipv4/netfilter/nf_defrag_ipv4.c                |   8 +-
 net/ipv6/netfilter/nf_defrag_ipv6_hooks.c          |   8 +-
 net/netfilter/nf_conntrack_core.c                  |  53 +++---
 net/netfilter/nf_conntrack_expect.c                |   8 +-
 net/netfilter/nf_conntrack_netlink.c               | 177 +++++++++++++++------
 net/netfilter/nf_conntrack_standalone.c            |  30 +++-
 net/netfilter/nf_nat_core.c                        |  13 +-
 net/netfilter/xt_CT.c                              |  17 +-
 net/sched/act_connmark.c                           |   1 +
 12 files changed, 259 insertions(+), 94 deletions(-)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_conntrack_zones.h b/include/net/netfilter/nf_conntrack_zones.h
index 0788bb0f267d..3942ddf0d4ff 100644
--- a/include/net/netfilter/nf_conntrack_zones.h
+++ b/include/net/netfilter/nf_conntrack_zones.h
@@ -1,10 +1,18 @@
 #ifndef _NF_CONNTRACK_ZONES_H
 #define _NF_CONNTRACK_ZONES_H
 
+#include <linux/netfilter/nf_conntrack_tuple_common.h>
+
 #define NF_CT_DEFAULT_ZONE_ID	0
 
+#define NF_CT_ZONE_DIR_ORIG	(1 << IP_CT_DIR_ORIGINAL)
+#define NF_CT_ZONE_DIR_REPL	(1 << IP_CT_DIR_REPLY)
+
+#define NF_CT_DEFAULT_ZONE_DIR	(NF_CT_ZONE_DIR_ORIG | NF_CT_ZONE_DIR_REPL)
+
 struct nf_conntrack_zone {
 	u16	id;
+	u16	dir;
 };
 
 extern const struct nf_conntrack_zone nf_ct_zone_dflt;
@@ -29,8 +37,29 @@ nf_ct_zone_tmpl(const struct nf_conn *tmpl)
 	return tmpl ? nf_ct_zone(tmpl) : &nf_ct_zone_dflt;
 }
 
+static inline bool nf_ct_zone_matches_dir(const struct nf_conntrack_zone *zone,
+					  enum ip_conntrack_dir dir)
+{
+	return zone->dir & (1 << dir);
+}
+
+static inline u16 nf_ct_zone_id(const struct nf_conntrack_zone *zone,
+				enum ip_conntrack_dir dir)
+{
+	return nf_ct_zone_matches_dir(zone, dir) ?
+	       zone->id : NF_CT_DEFAULT_ZONE_ID;
+}
+
 static inline bool nf_ct_zone_equal(const struct nf_conn *a,
-				    const struct nf_conntrack_zone *b)
+				    const struct nf_conntrack_zone *b,
+				    enum ip_conntrack_dir dir)
+{
+	return nf_ct_zone_id(nf_ct_zone(a), dir) ==
+	       nf_ct_zone_id(b, dir);
+}
+
+static inline bool nf_ct_zone_equal_any(const struct nf_conn *a,
+					const struct nf_conntrack_zone *b)
 {
 	return nf_ct_zone(a)->id == b->id;
 }
diff --git a/include/uapi/linux/netfilter/nfnetlink_conntrack.h b/include/uapi/linux/netfilter/nfnetlink_conntrack.h
index acad6c52a652..c1a4e1441a25 100644
--- a/include/uapi/linux/netfilter/nfnetlink_conntrack.h
+++ b/include/uapi/linux/netfilter/nfnetlink_conntrack.h
@@ -61,6 +61,7 @@ enum ctattr_tuple {
 	CTA_TUPLE_UNSPEC,
 	CTA_TUPLE_IP,
 	CTA_TUPLE_PROTO,
+	CTA_TUPLE_ZONE,
 	__CTA_TUPLE_MAX
 };
 #define CTA_TUPLE_MAX (__CTA_TUPLE_MAX - 1)
diff --git a/include/uapi/linux/netfilter/xt_CT.h b/include/uapi/linux/netfilter/xt_CT.h
index 5a688c1ca4d7..452005ff0e9e 100644
--- a/include/uapi/linux/netfilter/xt_CT.h
+++ b/include/uapi/linux/netfilter/xt_CT.h
@@ -6,7 +6,11 @@
 enum {
 	XT_CT_NOTRACK		= 1 << 0,
 	XT_CT_NOTRACK_ALIAS	= 1 << 1,
-	XT_CT_MASK		= XT_CT_NOTRACK | XT_CT_NOTRACK_ALIAS,
+	XT_CT_ZONE_DIR_ORIG	= 1 << 2,
+	XT_CT_ZONE_DIR_REPL	= 1 << 3,
+
+	XT_CT_MASK		= XT_CT_NOTRACK | XT_CT_NOTRACK_ALIAS |
+				  XT_CT_ZONE_DIR_ORIG | XT_CT_ZONE_DIR_REPL,
 };
 
 struct xt_ct_target_info {
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
index 20fe8e67c09b..9306ec4fab41 100644
--- a/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -45,8 +45,12 @@ static enum ip_defrag_users nf_ct_defrag_user(unsigned int hooknum,
 {
 	u16 zone_id = NF_CT_DEFAULT_ZONE_ID;
 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
-	if (skb->nfct)
-		zone_id = nf_ct_zone((struct nf_conn *)skb->nfct)->id;
+	if (skb->nfct) {
+		enum ip_conntrack_info ctinfo;
+		const struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+
+		zone_id = nf_ct_zone_id(nf_ct_zone(ct), CTINFO2DIR(ctinfo));
+	}
 #endif
 	if (nf_bridge_in_prerouting(skb))
 		return IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone_id;
diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
index 9d3de9b74856..6d9c0b3d5b8c 100644
--- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
+++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
@@ -35,8 +35,12 @@ static enum ip6_defrag_users nf_ct6_defrag_user(unsigned int hooknum,
 {
 	u16 zone_id = NF_CT_DEFAULT_ZONE_ID;
 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
-	if (skb->nfct)
-		zone_id = nf_ct_zone((struct nf_conn *)skb->nfct)->id;
+	if (skb->nfct) {
+		enum ip_conntrack_info ctinfo;
+		const struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+
+		zone_id = nf_ct_zone_id(nf_ct_zone(ct), CTINFO2DIR(ctinfo));
+	}
 #endif
 	if (nf_bridge_in_prerouting(skb))
 		return IP6_DEFRAG_CONNTRACK_BRIDGE_IN + zone_id;
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 0bb26e84f849..acc06222ce6a 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -126,8 +126,7 @@ EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked);
 unsigned int nf_conntrack_hash_rnd __read_mostly;
 EXPORT_SYMBOL_GPL(nf_conntrack_hash_rnd);
 
-static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple,
-			      const struct nf_conntrack_zone *zone)
+static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple)
 {
 	unsigned int n;
 
@@ -136,7 +135,7 @@ static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple,
 	 * three bytes manually.
 	 */
 	n = (sizeof(tuple->src) + sizeof(tuple->dst.u3)) / sizeof(u32);
-	return jhash2((u32 *)tuple, n, zone->id ^ nf_conntrack_hash_rnd ^
+	return jhash2((u32 *)tuple, n, nf_conntrack_hash_rnd ^
 		      (((__force __u16)tuple->dst.u.all << 16) |
 		      tuple->dst.protonum));
 }
@@ -152,17 +151,15 @@ static u32 hash_bucket(u32 hash, const struct net *net)
 }
 
 static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple,
-				  const struct nf_conntrack_zone *zone,
 				  unsigned int size)
 {
-	return __hash_bucket(hash_conntrack_raw(tuple, zone), size);
+	return __hash_bucket(hash_conntrack_raw(tuple), size);
 }
 
 static inline u_int32_t hash_conntrack(const struct net *net,
-				       const struct nf_conntrack_zone *zone,
 				       const struct nf_conntrack_tuple *tuple)
 {
-	return __hash_conntrack(tuple, zone, net->ct.htable_size);
+	return __hash_conntrack(tuple, net->ct.htable_size);
 }
 
 bool
@@ -312,6 +309,7 @@ struct nf_conn *nf_ct_tmpl_alloc(struct net *net,
 		if (!nf_ct_zone)
 			goto out_free;
 		nf_ct_zone->id = zone->id;
+		nf_ct_zone->dir = zone->dir;
 	}
 #endif
 	atomic_set(&tmpl->ct_general.use, 0);
@@ -376,20 +374,18 @@ destroy_conntrack(struct nf_conntrack *nfct)
 
 static void nf_ct_delete_from_lists(struct nf_conn *ct)
 {
-	const struct nf_conntrack_zone *zone;
 	struct net *net = nf_ct_net(ct);
 	unsigned int hash, reply_hash;
 	unsigned int sequence;
 
-	zone = nf_ct_zone(ct);
 	nf_ct_helper_destroy(ct);
 
 	local_bh_disable();
 	do {
 		sequence = read_seqcount_begin(&net->ct.generation);
-		hash = hash_conntrack(net, zone,
+		hash = hash_conntrack(net,
 				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
-		reply_hash = hash_conntrack(net, zone,
+		reply_hash = hash_conntrack(net,
 					   &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
 	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
 
@@ -446,7 +442,7 @@ nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
 	 * so we need to check that the conntrack is confirmed
 	 */
 	return nf_ct_tuple_equal(tuple, &h->tuple) &&
-	       nf_ct_zone_equal(ct, zone) &&
+	       nf_ct_zone_equal(ct, zone, NF_CT_DIRECTION(h)) &&
 	       nf_ct_is_confirmed(ct);
 }
 
@@ -523,7 +519,7 @@ nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
 		      const struct nf_conntrack_tuple *tuple)
 {
 	return __nf_conntrack_find_get(net, zone, tuple,
-				       hash_conntrack_raw(tuple, zone));
+				       hash_conntrack_raw(tuple));
 }
 EXPORT_SYMBOL_GPL(nf_conntrack_find_get);
 
@@ -554,9 +550,9 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)
 	local_bh_disable();
 	do {
 		sequence = read_seqcount_begin(&net->ct.generation);
-		hash = hash_conntrack(net, zone,
+		hash = hash_conntrack(net,
 				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
-		reply_hash = hash_conntrack(net, zone,
+		reply_hash = hash_conntrack(net,
 					   &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
 	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
 
@@ -564,12 +560,14 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)
 	hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode)
 		if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
 				      &h->tuple) &&
-		    nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone))
+		    nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone,
+				     NF_CT_DIRECTION(h)))
 			goto out;
 	hlist_nulls_for_each_entry(h, n, &net->ct.hash[reply_hash], hnnode)
 		if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
 				      &h->tuple) &&
-		    nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone))
+		    nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone,
+				     NF_CT_DIRECTION(h)))
 			goto out;
 
 	add_timer(&ct->timeout);
@@ -623,7 +621,7 @@ __nf_conntrack_confirm(struct sk_buff *skb)
 		/* reuse the hash saved before */
 		hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev;
 		hash = hash_bucket(hash, net);
-		reply_hash = hash_conntrack(net, zone,
+		reply_hash = hash_conntrack(net,
 					   &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
 
 	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
@@ -655,12 +653,14 @@ __nf_conntrack_confirm(struct sk_buff *skb)
 	hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode)
 		if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
 				      &h->tuple) &&
-		    nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone))
+		    nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone,
+				     NF_CT_DIRECTION(h)))
 			goto out;
 	hlist_nulls_for_each_entry(h, n, &net->ct.hash[reply_hash], hnnode)
 		if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
 				      &h->tuple) &&
-		    nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone))
+		    nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone,
+				     NF_CT_DIRECTION(h)))
 			goto out;
 
 	/* Timer relative to confirmation time, not original
@@ -720,7 +720,7 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
 	unsigned int hash;
 
 	zone = nf_ct_zone(ignored_conntrack);
-	hash = hash_conntrack(net, zone, tuple);
+	hash = hash_conntrack(net, tuple);
 
 	/* Disable BHs the entire time since we need to disable them at
 	 * least once for the stats anyway.
@@ -730,7 +730,7 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
 		ct = nf_ct_tuplehash_to_ctrack(h);
 		if (ct != ignored_conntrack &&
 		    nf_ct_tuple_equal(tuple, &h->tuple) &&
-		    nf_ct_zone_equal(ct, zone)) {
+		    nf_ct_zone_equal(ct, zone, NF_CT_DIRECTION(h))) {
 			NF_CT_STAT_INC(net, found);
 			rcu_read_unlock_bh();
 			return 1;
@@ -830,7 +830,7 @@ __nf_conntrack_alloc(struct net *net,
 	if (unlikely(!nf_conntrack_hash_rnd)) {
 		init_nf_conntrack_hash_rnd();
 		/* recompute the hash as nf_conntrack_hash_rnd is initialized */
-		hash = hash_conntrack_raw(orig, zone);
+		hash = hash_conntrack_raw(orig);
 	}
 
 	/* We don't want any race condition at early drop stage */
@@ -875,6 +875,7 @@ __nf_conntrack_alloc(struct net *net,
 		if (!nf_ct_zone)
 			goto out_free;
 		nf_ct_zone->id = zone->id;
+		nf_ct_zone->dir = zone->dir;
 	}
 #endif
 	/* Because we use RCU lookups, we set ct_general.use to zero before
@@ -1053,7 +1054,7 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
 
 	/* look for tuple match */
 	zone = nf_ct_zone_tmpl(tmpl);
-	hash = hash_conntrack_raw(&tuple, zone);
+	hash = hash_conntrack_raw(&tuple);
 	h = __nf_conntrack_find_get(net, zone, &tuple, hash);
 	if (!h) {
 		h = init_conntrack(net, tmpl, &tuple, l3proto, l4proto,
@@ -1306,6 +1307,7 @@ EXPORT_SYMBOL_GPL(__nf_ct_kill_acct);
 /* Built-in default zone used e.g. by modules. */
 const struct nf_conntrack_zone nf_ct_zone_dflt = {
 	.id	= NF_CT_DEFAULT_ZONE_ID,
+	.dir	= NF_CT_DEFAULT_ZONE_DIR,
 };
 EXPORT_SYMBOL_GPL(nf_ct_zone_dflt);
 
@@ -1617,8 +1619,7 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
 					struct nf_conntrack_tuple_hash, hnnode);
 			ct = nf_ct_tuplehash_to_ctrack(h);
 			hlist_nulls_del_rcu(&h->hnnode);
-			bucket = __hash_conntrack(&h->tuple, nf_ct_zone(ct),
-						  hashsize);
+			bucket = __hash_conntrack(&h->tuple, hashsize);
 			hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]);
 		}
 	}
diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index 980db854c3c8..acf5c7b3f378 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -101,7 +101,7 @@ __nf_ct_expect_find(struct net *net,
 	h = nf_ct_expect_dst_hash(tuple);
 	hlist_for_each_entry_rcu(i, &net->ct.expect_hash[h], hnode) {
 		if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) &&
-		    nf_ct_zone_equal(i->master, zone))
+		    nf_ct_zone_equal_any(i->master, zone))
 			return i;
 	}
 	return NULL;
@@ -143,7 +143,7 @@ nf_ct_find_expectation(struct net *net,
 	hlist_for_each_entry(i, &net->ct.expect_hash[h], hnode) {
 		if (!(i->flags & NF_CT_EXPECT_INACTIVE) &&
 		    nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) &&
-		    nf_ct_zone_equal(i->master, zone)) {
+		    nf_ct_zone_equal_any(i->master, zone)) {
 			exp = i;
 			break;
 		}
@@ -223,7 +223,7 @@ static inline int expect_clash(const struct nf_conntrack_expect *a,
 	}
 
 	return nf_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask) &&
-	       nf_ct_zone_equal(a->master, nf_ct_zone(b->master));
+	       nf_ct_zone_equal_any(a->master, nf_ct_zone(b->master));
 }
 
 static inline int expect_matches(const struct nf_conntrack_expect *a,
@@ -232,7 +232,7 @@ static inline int expect_matches(const struct nf_conntrack_expect *a,
 	return a->master == b->master && a->class == b->class &&
 	       nf_ct_tuple_equal(&a->tuple, &b->tuple) &&
 	       nf_ct_tuple_mask_equal(&a->mask, &b->mask) &&
-	       nf_ct_zone_equal(a->master, nf_ct_zone(b->master));
+	       nf_ct_zone_equal_any(a->master, nf_ct_zone(b->master));
 }
 
 /* Generally a bad idea to call this: could have matched already. */
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 95f7f01e253d..4eaf925bead4 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -127,6 +127,20 @@ ctnetlink_dump_tuples(struct sk_buff *skb,
 	return ret;
 }
 
+static inline int
+ctnetlink_dump_zone_id(struct sk_buff *skb, int attrtype,
+		       const struct nf_conntrack_zone *zone, int dir)
+{
+	if (zone->id == NF_CT_DEFAULT_ZONE_ID || zone->dir != dir)
+		return 0;
+	if (nla_put_be16(skb, attrtype, htons(zone->id)))
+		goto nla_put_failure;
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
 static inline int
 ctnetlink_dump_status(struct sk_buff *skb, const struct nf_conn *ct)
 {
@@ -474,11 +488,16 @@ ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
 	nfmsg->version      = NFNETLINK_V0;
 	nfmsg->res_id	    = 0;
 
+	zone = nf_ct_zone(ct);
+
 	nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG | NLA_F_NESTED);
 	if (!nest_parms)
 		goto nla_put_failure;
 	if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL)) < 0)
 		goto nla_put_failure;
+	if (ctnetlink_dump_zone_id(skb, CTA_TUPLE_ZONE, zone,
+				   NF_CT_ZONE_DIR_ORIG) < 0)
+		goto nla_put_failure;
 	nla_nest_end(skb, nest_parms);
 
 	nest_parms = nla_nest_start(skb, CTA_TUPLE_REPLY | NLA_F_NESTED);
@@ -486,11 +505,13 @@ ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
 		goto nla_put_failure;
 	if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_REPLY)) < 0)
 		goto nla_put_failure;
+	if (ctnetlink_dump_zone_id(skb, CTA_TUPLE_ZONE, zone,
+				   NF_CT_ZONE_DIR_REPL) < 0)
+		goto nla_put_failure;
 	nla_nest_end(skb, nest_parms);
 
-	zone = nf_ct_zone(ct);
-	if (zone->id != NF_CT_DEFAULT_ZONE_ID &&
-	    nla_put_be16(skb, CTA_ZONE, htons(zone->id)))
+	if (ctnetlink_dump_zone_id(skb, CTA_ZONE, zone,
+				   NF_CT_DEFAULT_ZONE_DIR) < 0)
 		goto nla_put_failure;
 
 	if (ctnetlink_dump_status(skb, ct) < 0 ||
@@ -600,7 +621,7 @@ ctnetlink_nlmsg_size(const struct nf_conn *ct)
 	       + nla_total_size(sizeof(u_int32_t)) /* CTA_MARK */
 #endif
 #ifdef CONFIG_NF_CONNTRACK_ZONES
-	       + nla_total_size(sizeof(u_int16_t)) /* CTA_ZONE */
+	       + nla_total_size(sizeof(u_int16_t)) /* CTA_ZONE|CTA_TUPLE_ZONE */
 #endif
 	       + ctnetlink_proto_size(ct)
 	       + ctnetlink_label_size(ct)
@@ -658,11 +679,16 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
 	nfmsg->res_id	= 0;
 
 	rcu_read_lock();
+	zone = nf_ct_zone(ct);
+
 	nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG | NLA_F_NESTED);
 	if (!nest_parms)
 		goto nla_put_failure;
 	if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL)) < 0)
 		goto nla_put_failure;
+	if (ctnetlink_dump_zone_id(skb, CTA_TUPLE_ZONE, zone,
+				   NF_CT_ZONE_DIR_ORIG) < 0)
+		goto nla_put_failure;
 	nla_nest_end(skb, nest_parms);
 
 	nest_parms = nla_nest_start(skb, CTA_TUPLE_REPLY | NLA_F_NESTED);
@@ -670,11 +696,13 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
 		goto nla_put_failure;
 	if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_REPLY)) < 0)
 		goto nla_put_failure;
+	if (ctnetlink_dump_zone_id(skb, CTA_TUPLE_ZONE, zone,
+				   NF_CT_ZONE_DIR_REPL) < 0)
+		goto nla_put_failure;
 	nla_nest_end(skb, nest_parms);
 
-	zone = nf_ct_zone(ct);
-	if (zone->id != NF_CT_DEFAULT_ZONE_ID &&
-	    nla_put_be16(skb, CTA_ZONE, htons(zone->id)))
+	if (ctnetlink_dump_zone_id(skb, CTA_ZONE, zone,
+				   NF_CT_DEFAULT_ZONE_DIR) < 0)
 		goto nla_put_failure;
 
 	if (ctnetlink_dump_id(skb, ct) < 0)
@@ -924,15 +952,55 @@ ctnetlink_parse_tuple_proto(struct nlattr *attr,
 	return ret;
 }
 
+static int
+ctnetlink_parse_zone(const struct nlattr *attr,
+		     struct nf_conntrack_zone *zone)
+{
+	zone->id  = NF_CT_DEFAULT_ZONE_ID;
+	zone->dir = NF_CT_DEFAULT_ZONE_DIR;
+
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+	if (attr)
+		zone->id = ntohs(nla_get_be16(attr));
+#else
+	if (attr)
+		return -EOPNOTSUPP;
+#endif
+	return 0;
+}
+
+static int
+ctnetlink_parse_tuple_zone(struct nlattr *attr, enum ctattr_type type,
+			   struct nf_conntrack_zone *zone)
+{
+	int ret;
+
+	if (zone->id != NF_CT_DEFAULT_ZONE_ID)
+		return -EINVAL;
+
+	ret = ctnetlink_parse_zone(attr, zone);
+	if (ret < 0)
+		return ret;
+
+	if (type == CTA_TUPLE_REPLY)
+		zone->dir = NF_CT_ZONE_DIR_REPL;
+	else
+		zone->dir = NF_CT_ZONE_DIR_ORIG;
+
+	return 0;
+}
+
 static const struct nla_policy tuple_nla_policy[CTA_TUPLE_MAX+1] = {
 	[CTA_TUPLE_IP]		= { .type = NLA_NESTED },
 	[CTA_TUPLE_PROTO]	= { .type = NLA_NESTED },
+	[CTA_TUPLE_ZONE]	= { .type = NLA_U16 },
 };
 
 static int
 ctnetlink_parse_tuple(const struct nlattr * const cda[],
 		      struct nf_conntrack_tuple *tuple,
-		      enum ctattr_type type, u_int8_t l3num)
+		      enum ctattr_type type, u_int8_t l3num,
+		      struct nf_conntrack_zone *zone)
 {
 	struct nlattr *tb[CTA_TUPLE_MAX+1];
 	int err;
@@ -959,6 +1027,16 @@ ctnetlink_parse_tuple(const struct nlattr * const cda[],
 	if (err < 0)
 		return err;
 
+	if (tb[CTA_TUPLE_ZONE]) {
+		if (!zone)
+			return -EINVAL;
+
+		err = ctnetlink_parse_tuple_zone(tb[CTA_TUPLE_ZONE],
+						 type, zone);
+		if (err < 0)
+			return err;
+	}
+
 	/* orig and expect tuples get DIR_ORIGINAL */
 	if (type == CTA_TUPLE_REPLY)
 		tuple->dst.dir = IP_CT_DIR_REPLY;
@@ -968,22 +1046,6 @@ ctnetlink_parse_tuple(const struct nlattr * const cda[],
 	return 0;
 }
 
-static int
-ctnetlink_parse_zone(const struct nlattr *attr,
-		     struct nf_conntrack_zone *zone)
-{
-	zone->id = NF_CT_DEFAULT_ZONE_ID;
-
-#ifdef CONFIG_NF_CONNTRACK_ZONES
-	if (attr)
-		zone->id = ntohs(nla_get_be16(attr));
-#else
-	if (attr)
-		return -EOPNOTSUPP;
-#endif
-	return 0;
-}
-
 static const struct nla_policy help_nla_policy[CTA_HELP_MAX+1] = {
 	[CTA_HELP_NAME]		= { .type = NLA_NUL_STRING,
 				    .len = NF_CT_HELPER_NAME_LEN - 1 },
@@ -1071,9 +1133,11 @@ ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb,
 		return err;
 
 	if (cda[CTA_TUPLE_ORIG])
-		err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG, u3);
+		err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG,
+					    u3, &zone);
 	else if (cda[CTA_TUPLE_REPLY])
-		err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY, u3);
+		err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY,
+					    u3, &zone);
 	else {
 		return ctnetlink_flush_conntrack(net, cda,
 						 NETLINK_CB(skb).portid,
@@ -1143,9 +1207,11 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb,
 		return err;
 
 	if (cda[CTA_TUPLE_ORIG])
-		err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG, u3);
+		err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG,
+					    u3, &zone);
 	else if (cda[CTA_TUPLE_REPLY])
-		err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY, u3);
+		err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY,
+					    u3, &zone);
 	else
 		return -EINVAL;
 
@@ -1767,7 +1833,8 @@ ctnetlink_create_conntrack(struct net *net,
 		struct nf_conntrack_tuple_hash *master_h;
 		struct nf_conn *master_ct;
 
-		err = ctnetlink_parse_tuple(cda, &master, CTA_TUPLE_MASTER, u3);
+		err = ctnetlink_parse_tuple(cda, &master, CTA_TUPLE_MASTER,
+					    u3, NULL);
 		if (err < 0)
 			goto err2;
 
@@ -1818,13 +1885,15 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb,
 		return err;
 
 	if (cda[CTA_TUPLE_ORIG]) {
-		err = ctnetlink_parse_tuple(cda, &otuple, CTA_TUPLE_ORIG, u3);
+		err = ctnetlink_parse_tuple(cda, &otuple, CTA_TUPLE_ORIG,
+					    u3, &zone);
 		if (err < 0)
 			return err;
 	}
 
 	if (cda[CTA_TUPLE_REPLY]) {
-		err = ctnetlink_parse_tuple(cda, &rtuple, CTA_TUPLE_REPLY, u3);
+		err = ctnetlink_parse_tuple(cda, &rtuple, CTA_TUPLE_REPLY,
+					    u3, &zone);
 		if (err < 0)
 			return err;
 	}
@@ -2088,7 +2157,7 @@ ctnetlink_nfqueue_build_size(const struct nf_conn *ct)
 	       + nla_total_size(sizeof(u_int32_t)) /* CTA_MARK */
 #endif
 #ifdef CONFIG_NF_CONNTRACK_ZONES
-	       + nla_total_size(sizeof(u_int16_t)) /* CTA_ZONE */
+	       + nla_total_size(sizeof(u_int16_t)) /* CTA_ZONE|CTA_TUPLE_ZONE */
 #endif
 	       + ctnetlink_proto_size(ct)
 	       ;
@@ -2101,11 +2170,16 @@ ctnetlink_nfqueue_build(struct sk_buff *skb, struct nf_conn *ct)
 	struct nlattr *nest_parms;
 
 	rcu_read_lock();
+	zone = nf_ct_zone(ct);
+
 	nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG | NLA_F_NESTED);
 	if (!nest_parms)
 		goto nla_put_failure;
 	if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL)) < 0)
 		goto nla_put_failure;
+	if (ctnetlink_dump_zone_id(skb, CTA_TUPLE_ZONE, zone,
+				   NF_CT_ZONE_DIR_ORIG) < 0)
+		goto nla_put_failure;
 	nla_nest_end(skb, nest_parms);
 
 	nest_parms = nla_nest_start(skb, CTA_TUPLE_REPLY | NLA_F_NESTED);
@@ -2113,11 +2187,13 @@ ctnetlink_nfqueue_build(struct sk_buff *skb, struct nf_conn *ct)
 		goto nla_put_failure;
 	if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_REPLY)) < 0)
 		goto nla_put_failure;
+	if (ctnetlink_dump_zone_id(skb, CTA_TUPLE_ZONE, zone,
+				   NF_CT_ZONE_DIR_REPL) < 0)
+		goto nla_put_failure;
 	nla_nest_end(skb, nest_parms);
 
-	zone = nf_ct_zone(ct);
-	if (zone->id != NF_CT_DEFAULT_ZONE_ID &&
-	    nla_put_be16(skb, CTA_ZONE, htons(zone->id)))
+	if (ctnetlink_dump_zone_id(skb, CTA_ZONE, zone,
+				   NF_CT_DEFAULT_ZONE_DIR) < 0)
 		goto nla_put_failure;
 
 	if (ctnetlink_dump_id(skb, ct) < 0)
@@ -2225,12 +2301,12 @@ static int ctnetlink_nfqueue_exp_parse(const struct nlattr * const *cda,
 	int err;
 
 	err = ctnetlink_parse_tuple(cda, tuple, CTA_EXPECT_TUPLE,
-				    nf_ct_l3num(ct));
+				    nf_ct_l3num(ct), NULL);
 	if (err < 0)
 		return err;
 
 	return ctnetlink_parse_tuple(cda, mask, CTA_EXPECT_MASK,
-				     nf_ct_l3num(ct));
+				     nf_ct_l3num(ct), NULL);
 }
 
 static int
@@ -2625,7 +2701,8 @@ static int ctnetlink_dump_exp_ct(struct sock *ctnl, struct sk_buff *skb,
 		.done = ctnetlink_exp_done,
 	};
 
-	err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER, u3);
+	err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER,
+				    u3, NULL);
 	if (err < 0)
 		return err;
 
@@ -2677,9 +2754,11 @@ ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb,
 		return err;
 
 	if (cda[CTA_EXPECT_TUPLE])
-		err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3);
+		err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE,
+					    u3, NULL);
 	else if (cda[CTA_EXPECT_MASTER])
-		err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER, u3);
+		err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER,
+					    u3, NULL);
 	else
 		return -EINVAL;
 
@@ -2747,7 +2826,8 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb,
 		if (err < 0)
 			return err;
 
-		err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3);
+		err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE,
+					    u3, NULL);
 		if (err < 0)
 			return err;
 
@@ -2854,7 +2934,8 @@ ctnetlink_parse_expect_nat(const struct nlattr *attr,
 		return -EINVAL;
 
 	err = ctnetlink_parse_tuple((const struct nlattr * const *)tb,
-					&nat_tuple, CTA_EXPECT_NAT_TUPLE, u3);
+				    &nat_tuple, CTA_EXPECT_NAT_TUPLE,
+				    u3, NULL);
 	if (err < 0)
 		return err;
 
@@ -2955,13 +3036,16 @@ ctnetlink_create_expect(struct net *net,
 	int err;
 
 	/* caller guarantees that those three CTA_EXPECT_* exist */
-	err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3);
+	err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE,
+				    u3, NULL);
 	if (err < 0)
 		return err;
-	err = ctnetlink_parse_tuple(cda, &mask, CTA_EXPECT_MASK, u3);
+	err = ctnetlink_parse_tuple(cda, &mask, CTA_EXPECT_MASK,
+				    u3, NULL);
 	if (err < 0)
 		return err;
-	err = ctnetlink_parse_tuple(cda, &master_tuple, CTA_EXPECT_MASTER, u3);
+	err = ctnetlink_parse_tuple(cda, &master_tuple, CTA_EXPECT_MASTER,
+				    u3, NULL);
 	if (err < 0)
 		return err;
 
@@ -3029,7 +3113,8 @@ ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb,
 	if (err < 0)
 		return err;
 
-	err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3);
+	err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE,
+				    u3, NULL);
 	if (err < 0)
 		return err;
 
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 28c8b2b982ec..1fb3cacc04e1 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -141,12 +141,30 @@ static inline void ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
 #endif
 
 #ifdef CONFIG_NF_CONNTRACK_ZONES
-static void ct_show_zone(struct seq_file *s, const struct nf_conn *ct)
+static void ct_show_zone(struct seq_file *s, const struct nf_conn *ct,
+			 int dir)
 {
-	seq_printf(s, "zone=%u ", nf_ct_zone(ct)->id);
+	const struct nf_conntrack_zone *zone = nf_ct_zone(ct);
+
+	if (zone->dir != dir)
+		return;
+	switch (zone->dir) {
+	case NF_CT_DEFAULT_ZONE_DIR:
+		seq_printf(s, "zone=%u ", zone->id);
+		break;
+	case NF_CT_ZONE_DIR_ORIG:
+		seq_printf(s, "zone-orig=%u ", zone->id);
+		break;
+	case NF_CT_ZONE_DIR_REPL:
+		seq_printf(s, "zone-reply=%u ", zone->id);
+		break;
+	default:
+		break;
+	}
 }
 #else
-static inline void ct_show_zone(struct seq_file *s, const struct nf_conn *ct)
+static inline void ct_show_zone(struct seq_file *s, const struct nf_conn *ct,
+				int dir)
 {
 }
 #endif
@@ -213,6 +231,8 @@ static int ct_seq_show(struct seq_file *s, void *v)
 	print_tuple(s, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
 		    l3proto, l4proto);
 
+	ct_show_zone(s, ct, NF_CT_ZONE_DIR_ORIG);
+
 	if (seq_has_overflowed(s))
 		goto release;
 
@@ -225,6 +245,8 @@ static int ct_seq_show(struct seq_file *s, void *v)
 	print_tuple(s, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
 		    l3proto, l4proto);
 
+	ct_show_zone(s, ct, NF_CT_ZONE_DIR_REPL);
+
 	if (seq_print_acct(s, ct, IP_CT_DIR_REPLY))
 		goto release;
 
@@ -239,7 +261,7 @@ static int ct_seq_show(struct seq_file *s, void *v)
 #endif
 
 	ct_show_secctx(s, ct);
-	ct_show_zone(s, ct);
+	ct_show_zone(s, ct, NF_CT_DEFAULT_ZONE_DIR);
 	ct_show_delta_time(s, ct);
 
 	seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use));
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index 65ebaf9fc4f9..5113dfd39df9 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -118,15 +118,13 @@ EXPORT_SYMBOL(nf_xfrm_me_harder);
 
 /* We keep an extra hash for each conntrack, for fast searching. */
 static inline unsigned int
-hash_by_src(const struct net *net,
-	    const struct nf_conntrack_zone *zone,
-	    const struct nf_conntrack_tuple *tuple)
+hash_by_src(const struct net *net, const struct nf_conntrack_tuple *tuple)
 {
 	unsigned int hash;
 
 	/* Original src, to ensure we map it consistently if poss. */
 	hash = jhash2((u32 *)&tuple->src, sizeof(tuple->src) / sizeof(u32),
-		      tuple->dst.protonum ^ zone->id ^ nf_conntrack_hash_rnd);
+		      tuple->dst.protonum ^ nf_conntrack_hash_rnd);
 
 	return reciprocal_scale(hash, net->ct.nat_htable_size);
 }
@@ -194,13 +192,14 @@ find_appropriate_src(struct net *net,
 		     struct nf_conntrack_tuple *result,
 		     const struct nf_nat_range *range)
 {
-	unsigned int h = hash_by_src(net, zone, tuple);
+	unsigned int h = hash_by_src(net, tuple);
 	const struct nf_conn_nat *nat;
 	const struct nf_conn *ct;
 
 	hlist_for_each_entry_rcu(nat, &net->ct.nat_bysource[h], bysource) {
 		ct = nat->ct;
-		if (same_src(ct, tuple) && nf_ct_zone_equal(ct, zone)) {
+		if (same_src(ct, tuple) &&
+		    nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) {
 			/* Copy source part from reply tuple. */
 			nf_ct_invert_tuplepr(result,
 				       &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
@@ -425,7 +424,7 @@ nf_nat_setup_info(struct nf_conn *ct,
 	if (maniptype == NF_NAT_MANIP_SRC) {
 		unsigned int srchash;
 
-		srchash = hash_by_src(net, nf_ct_zone(ct),
+		srchash = hash_by_src(net,
 				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
 		spin_lock_bh(&nf_nat_lock);
 		/* nf_conntrack_alter_reply might re-allocate extension aera */
diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c
index 29e2856063ff..536cb67928ad 100644
--- a/net/netfilter/xt_CT.c
+++ b/net/netfilter/xt_CT.c
@@ -181,6 +181,19 @@ out:
 #endif
 }
 
+static u16 xt_ct_flags_to_dir(const struct xt_ct_target_info_v1 *info)
+{
+	switch (info->flags & (XT_CT_ZONE_DIR_ORIG |
+			       XT_CT_ZONE_DIR_REPL)) {
+	case XT_CT_ZONE_DIR_ORIG:
+		return NF_CT_ZONE_DIR_ORIG;
+	case XT_CT_ZONE_DIR_REPL:
+		return NF_CT_ZONE_DIR_REPL;
+	default:
+		return NF_CT_DEFAULT_ZONE_DIR;
+	}
+}
+
 static int xt_ct_tg_check(const struct xt_tgchk_param *par,
 			  struct xt_ct_target_info_v1 *info)
 {
@@ -194,7 +207,8 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par,
 	}
 
 #ifndef CONFIG_NF_CONNTRACK_ZONES
-	if (info->zone)
+	if (info->zone || info->flags & (XT_CT_ZONE_DIR_ORIG |
+					 XT_CT_ZONE_DIR_REPL))
 		goto err1;
 #endif
 
@@ -204,6 +218,7 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par,
 
 	memset(&zone, 0, sizeof(zone));
 	zone.id = info->zone;
+	zone.dir = xt_ct_flags_to_dir(info);
 
 	ct = nf_ct_tmpl_alloc(par->net, &zone, GFP_KERNEL);
 	ret = PTR_ERR(ct);
diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c
index e67a1bdd0929..5019a47b9270 100644
--- a/net/sched/act_connmark.c
+++ b/net/sched/act_connmark.c
@@ -72,6 +72,7 @@ static int tcf_connmark(struct sk_buff *skb, const struct tc_action *a,
 		goto out;
 
 	zone.id = ca->zone;
+	zone.dir = NF_CT_DEFAULT_ZONE_DIR;
 
 	thash = nf_conntrack_find_get(dev_net(skb->dev), &zone, &tuple);
 	if (!thash)
-- 
cgit v1.2.3


From 5e8018fc61423e677398d4ad4d72df70b9788e77 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 14 Aug 2015 16:03:40 +0200
Subject: netfilter: nf_conntrack: add efficient mark to zone mapping

This work adds the possibility of deriving the zone id from the skb->mark
field in a scalable manner. This allows for having only a single template
serving hundreds/thousands of different zones, for example, instead of the
need to have one match for each zone as an extra CT jump target.

Note that we'd need to have this information attached to the template as at
the time when we're trying to lookup a possible ct object, we already need
to know zone information for a possible match when going into
__nf_conntrack_find_get(). This work provides a minimal implementation for
a possible mapping.

In order to not add/expose an extra ct->status bit, the zone structure has
been extended to carry a flag for deriving the mark.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_zones.h     | 45 +++++++++++++++++++++--
 include/uapi/linux/netfilter/xt_CT.h           |  4 ++-
 net/ipv4/netfilter/nf_conntrack_proto_icmp.c   |  3 +-
 net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c |  4 ++-
 net/netfilter/nf_conntrack_core.c              | 50 +++++++++-----------------
 net/netfilter/nf_conntrack_netlink.c           |  5 ++-
 net/netfilter/xt_CT.c                          |  5 ++-
 7 files changed, 72 insertions(+), 44 deletions(-)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_conntrack_zones.h b/include/net/netfilter/nf_conntrack_zones.h
index 3942ddf0d4ff..5316c7b3a374 100644
--- a/include/net/netfilter/nf_conntrack_zones.h
+++ b/include/net/netfilter/nf_conntrack_zones.h
@@ -10,9 +10,12 @@
 
 #define NF_CT_DEFAULT_ZONE_DIR	(NF_CT_ZONE_DIR_ORIG | NF_CT_ZONE_DIR_REPL)
 
+#define NF_CT_FLAG_MARK		1
+
 struct nf_conntrack_zone {
 	u16	id;
-	u16	dir;
+	u8	flags;
+	u8	dir;
 };
 
 extern const struct nf_conntrack_zone nf_ct_zone_dflt;
@@ -32,9 +35,45 @@ nf_ct_zone(const struct nf_conn *ct)
 }
 
 static inline const struct nf_conntrack_zone *
-nf_ct_zone_tmpl(const struct nf_conn *tmpl)
+nf_ct_zone_init(struct nf_conntrack_zone *zone, u16 id, u8 dir, u8 flags)
+{
+	zone->id = id;
+	zone->flags = flags;
+	zone->dir = dir;
+
+	return zone;
+}
+
+static inline const struct nf_conntrack_zone *
+nf_ct_zone_tmpl(const struct nf_conn *tmpl, const struct sk_buff *skb,
+		struct nf_conntrack_zone *tmp)
+{
+	const struct nf_conntrack_zone *zone;
+
+	if (!tmpl)
+		return &nf_ct_zone_dflt;
+
+	zone = nf_ct_zone(tmpl);
+	if (zone->flags & NF_CT_FLAG_MARK)
+		zone = nf_ct_zone_init(tmp, skb->mark, zone->dir, 0);
+
+	return zone;
+}
+
+static inline int nf_ct_zone_add(struct nf_conn *ct, gfp_t flags,
+				 const struct nf_conntrack_zone *info)
 {
-	return tmpl ? nf_ct_zone(tmpl) : &nf_ct_zone_dflt;
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+	struct nf_conntrack_zone *nf_ct_zone;
+
+	nf_ct_zone = nf_ct_ext_add(ct, NF_CT_EXT_ZONE, flags);
+	if (!nf_ct_zone)
+		return -ENOMEM;
+
+	nf_ct_zone_init(nf_ct_zone, info->id, info->dir,
+			info->flags);
+#endif
+	return 0;
 }
 
 static inline bool nf_ct_zone_matches_dir(const struct nf_conntrack_zone *zone,
diff --git a/include/uapi/linux/netfilter/xt_CT.h b/include/uapi/linux/netfilter/xt_CT.h
index 452005ff0e9e..9e520418b858 100644
--- a/include/uapi/linux/netfilter/xt_CT.h
+++ b/include/uapi/linux/netfilter/xt_CT.h
@@ -8,9 +8,11 @@ enum {
 	XT_CT_NOTRACK_ALIAS	= 1 << 1,
 	XT_CT_ZONE_DIR_ORIG	= 1 << 2,
 	XT_CT_ZONE_DIR_REPL	= 1 << 3,
+	XT_CT_ZONE_MARK		= 1 << 4,
 
 	XT_CT_MASK		= XT_CT_NOTRACK | XT_CT_NOTRACK_ALIAS |
-				  XT_CT_ZONE_DIR_ORIG | XT_CT_ZONE_DIR_REPL,
+				  XT_CT_ZONE_DIR_ORIG | XT_CT_ZONE_DIR_REPL |
+				  XT_CT_ZONE_MARK,
 };
 
 struct xt_ct_target_info {
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index 8a2f41c2fe6f..cdde3ec496e9 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -135,9 +135,10 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
 	const struct nf_conntrack_l4proto *innerproto;
 	const struct nf_conntrack_tuple_hash *h;
 	const struct nf_conntrack_zone *zone;
+	struct nf_conntrack_zone tmp;
 
 	NF_CT_ASSERT(skb->nfct == NULL);
-	zone = nf_ct_zone_tmpl(tmpl);
+	zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
 
 	/* Are they talking about one of our connections? */
 	if (!nf_ct_get_tuplepr(skb,
diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
index 202914151360..0e6fae103d33 100644
--- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
@@ -150,6 +150,7 @@ icmpv6_error_message(struct net *net, struct nf_conn *tmpl,
 	struct nf_conntrack_tuple intuple, origtuple;
 	const struct nf_conntrack_tuple_hash *h;
 	const struct nf_conntrack_l4proto *inproto;
+	struct nf_conntrack_zone tmp;
 
 	NF_CT_ASSERT(skb->nfct == NULL);
 
@@ -176,7 +177,8 @@ icmpv6_error_message(struct net *net, struct nf_conn *tmpl,
 
 	*ctinfo = IP_CT_RELATED;
 
-	h = nf_conntrack_find_get(net, nf_ct_zone_tmpl(tmpl), &intuple);
+	h = nf_conntrack_find_get(net, nf_ct_zone_tmpl(tmpl, skb, &tmp),
+				  &intuple);
 	if (!h) {
 		pr_debug("icmpv6_error: no match\n");
 		return -NF_ACCEPT;
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index acc06222ce6a..48521d62c672 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -301,25 +301,15 @@ struct nf_conn *nf_ct_tmpl_alloc(struct net *net,
 	tmpl->status = IPS_TEMPLATE;
 	write_pnet(&tmpl->ct_net, net);
 
-#ifdef CONFIG_NF_CONNTRACK_ZONES
-	if (zone) {
-		struct nf_conntrack_zone *nf_ct_zone;
-
-		nf_ct_zone = nf_ct_ext_add(tmpl, NF_CT_EXT_ZONE, GFP_ATOMIC);
-		if (!nf_ct_zone)
-			goto out_free;
-		nf_ct_zone->id = zone->id;
-		nf_ct_zone->dir = zone->dir;
-	}
-#endif
+	if (nf_ct_zone_add(tmpl, flags, zone) < 0)
+		goto out_free;
+
 	atomic_set(&tmpl->ct_general.use, 0);
 
 	return tmpl;
-#ifdef CONFIG_NF_CONNTRACK_ZONES
 out_free:
 	kfree(tmpl);
 	return NULL;
-#endif
 }
 EXPORT_SYMBOL_GPL(nf_ct_tmpl_alloc);
 
@@ -850,10 +840,9 @@ __nf_conntrack_alloc(struct net *net,
 	 * SLAB_DESTROY_BY_RCU.
 	 */
 	ct = kmem_cache_alloc(net->ct.nf_conntrack_cachep, gfp);
-	if (ct == NULL) {
-		atomic_dec(&net->ct.count);
-		return ERR_PTR(-ENOMEM);
-	}
+	if (ct == NULL)
+		goto out;
+
 	spin_lock_init(&ct->lock);
 	ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
 	ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL;
@@ -867,29 +856,20 @@ __nf_conntrack_alloc(struct net *net,
 	memset(&ct->__nfct_init_offset[0], 0,
 	       offsetof(struct nf_conn, proto) -
 	       offsetof(struct nf_conn, __nfct_init_offset[0]));
-#ifdef CONFIG_NF_CONNTRACK_ZONES
-	if (zone) {
-		struct nf_conntrack_zone *nf_ct_zone;
-
-		nf_ct_zone = nf_ct_ext_add(ct, NF_CT_EXT_ZONE, GFP_ATOMIC);
-		if (!nf_ct_zone)
-			goto out_free;
-		nf_ct_zone->id = zone->id;
-		nf_ct_zone->dir = zone->dir;
-	}
-#endif
+
+	if (zone && nf_ct_zone_add(ct, GFP_ATOMIC, zone) < 0)
+		goto out_free;
+
 	/* Because we use RCU lookups, we set ct_general.use to zero before
 	 * this is inserted in any list.
 	 */
 	atomic_set(&ct->ct_general.use, 0);
 	return ct;
-
-#ifdef CONFIG_NF_CONNTRACK_ZONES
 out_free:
-	atomic_dec(&net->ct.count);
 	kmem_cache_free(net->ct.nf_conntrack_cachep, ct);
+out:
+	atomic_dec(&net->ct.count);
 	return ERR_PTR(-ENOMEM);
-#endif
 }
 
 struct nf_conn *nf_conntrack_alloc(struct net *net,
@@ -937,6 +917,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
 	struct nf_conntrack_expect *exp = NULL;
 	const struct nf_conntrack_zone *zone;
 	struct nf_conn_timeout *timeout_ext;
+	struct nf_conntrack_zone tmp;
 	unsigned int *timeouts;
 
 	if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, l4proto)) {
@@ -944,7 +925,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
 		return NULL;
 	}
 
-	zone = nf_ct_zone_tmpl(tmpl);
+	zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
 	ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC,
 				  hash);
 	if (IS_ERR(ct))
@@ -1042,6 +1023,7 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
 	const struct nf_conntrack_zone *zone;
 	struct nf_conntrack_tuple tuple;
 	struct nf_conntrack_tuple_hash *h;
+	struct nf_conntrack_zone tmp;
 	struct nf_conn *ct;
 	u32 hash;
 
@@ -1053,7 +1035,7 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
 	}
 
 	/* look for tuple match */
-	zone = nf_ct_zone_tmpl(tmpl);
+	zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
 	hash = hash_conntrack_raw(&tuple);
 	h = __nf_conntrack_find_get(net, zone, &tuple, hash);
 	if (!h) {
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 4eaf925bead4..94a66541e0b7 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -956,9 +956,8 @@ static int
 ctnetlink_parse_zone(const struct nlattr *attr,
 		     struct nf_conntrack_zone *zone)
 {
-	zone->id  = NF_CT_DEFAULT_ZONE_ID;
-	zone->dir = NF_CT_DEFAULT_ZONE_DIR;
-
+	nf_ct_zone_init(zone, NF_CT_DEFAULT_ZONE_ID,
+			NF_CT_DEFAULT_ZONE_DIR, 0);
 #ifdef CONFIG_NF_CONNTRACK_ZONES
 	if (attr)
 		zone->id = ntohs(nla_get_be16(attr));
diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c
index 536cb67928ad..346509825a80 100644
--- a/net/netfilter/xt_CT.c
+++ b/net/netfilter/xt_CT.c
@@ -208,7 +208,8 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par,
 
 #ifndef CONFIG_NF_CONNTRACK_ZONES
 	if (info->zone || info->flags & (XT_CT_ZONE_DIR_ORIG |
-					 XT_CT_ZONE_DIR_REPL))
+					 XT_CT_ZONE_DIR_REPL |
+					 XT_CT_ZONE_MARK))
 		goto err1;
 #endif
 
@@ -219,6 +220,8 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par,
 	memset(&zone, 0, sizeof(zone));
 	zone.id = info->zone;
 	zone.dir = xt_ct_flags_to_dir(info);
+	if (info->flags & XT_CT_ZONE_MARK)
+		zone.flags |= NF_CT_FLAG_MARK;
 
 	ct = nf_ct_tmpl_alloc(par->net, &zone, GFP_KERNEL);
 	ret = PTR_ERR(ct);
-- 
cgit v1.2.3


From 2536862311d2276454ddef9dc36d6551a4b400fd Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Mon, 17 Aug 2015 13:42:24 -0700
Subject: lwt: Add support to redirect dst.input

This patch adds the capability to redirect dst input in the same way
that dst output is redirected by LWT.

Also, save the original dst.input and and dst.out when setting up
lwtunnel redirection. These can be called by the client as a pass-
through.

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/lwtunnel.h | 30 ++++++++++++++++++++++++++-
 net/core/lwtunnel.c    | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++
 net/ipv4/route.c       |  8 +++++++-
 net/ipv6/route.c       |  8 +++++++-
 4 files changed, 98 insertions(+), 3 deletions(-)

(limited to 'include/net')

diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h
index 33bd30963a95..e25b60eb262d 100644
--- a/include/net/lwtunnel.h
+++ b/include/net/lwtunnel.h
@@ -11,12 +11,15 @@
 #define LWTUNNEL_HASH_SIZE   (1 << LWTUNNEL_HASH_BITS)
 
 /* lw tunnel state flags */
-#define LWTUNNEL_STATE_OUTPUT_REDIRECT 0x1
+#define LWTUNNEL_STATE_OUTPUT_REDIRECT	BIT(0)
+#define LWTUNNEL_STATE_INPUT_REDIRECT	BIT(1)
 
 struct lwtunnel_state {
 	__u16		type;
 	__u16		flags;
 	atomic_t	refcnt;
+	int		(*orig_output)(struct sock *sk, struct sk_buff *skb);
+	int		(*orig_input)(struct sk_buff *);
 	int             len;
 	__u8            data[0];
 };
@@ -25,6 +28,7 @@ struct lwtunnel_encap_ops {
 	int (*build_state)(struct net_device *dev, struct nlattr *encap,
 			   struct lwtunnel_state **ts);
 	int (*output)(struct sock *sk, struct sk_buff *skb);
+	int (*input)(struct sk_buff *skb);
 	int (*fill_encap)(struct sk_buff *skb,
 			  struct lwtunnel_state *lwtstate);
 	int (*get_encap_size)(struct lwtunnel_state *lwtstate);
@@ -58,6 +62,13 @@ static inline bool lwtunnel_output_redirect(struct lwtunnel_state *lwtstate)
 	return false;
 }
 
+static inline bool lwtunnel_input_redirect(struct lwtunnel_state *lwtstate)
+{
+	if (lwtstate && (lwtstate->flags & LWTUNNEL_STATE_INPUT_REDIRECT))
+		return true;
+
+	return false;
+}
 int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *op,
 			   unsigned int num);
 int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *op,
@@ -72,6 +83,8 @@ struct lwtunnel_state *lwtunnel_state_alloc(int hdr_len);
 int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b);
 int lwtunnel_output(struct sock *sk, struct sk_buff *skb);
 int lwtunnel_output6(struct sock *sk, struct sk_buff *skb);
+int lwtunnel_input(struct sk_buff *skb);
+int lwtunnel_input6(struct sk_buff *skb);
 
 #else
 
@@ -90,6 +103,11 @@ static inline bool lwtunnel_output_redirect(struct lwtunnel_state *lwtstate)
 	return false;
 }
 
+static inline bool lwtunnel_input_redirect(struct lwtunnel_state *lwtstate)
+{
+	return false;
+}
+
 static inline int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *op,
 					 unsigned int num)
 {
@@ -142,6 +160,16 @@ static inline int lwtunnel_output6(struct sock *sk, struct sk_buff *skb)
 	return -EOPNOTSUPP;
 }
 
+static inline int lwtunnel_input(struct sk_buff *skb)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int lwtunnel_input6(struct sk_buff *skb)
+{
+	return -EOPNOTSUPP;
+}
+
 #endif
 
 #endif /* __NET_LWTUNNEL_H */
diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c
index 5d6d8e3d450a..3331585174d9 100644
--- a/net/core/lwtunnel.c
+++ b/net/core/lwtunnel.c
@@ -241,3 +241,58 @@ int lwtunnel_output(struct sock *sk, struct sk_buff *skb)
 	return __lwtunnel_output(sk, skb, lwtstate);
 }
 EXPORT_SYMBOL(lwtunnel_output);
+
+int __lwtunnel_input(struct sk_buff *skb,
+		     struct lwtunnel_state *lwtstate)
+{
+	const struct lwtunnel_encap_ops *ops;
+	int ret = -EINVAL;
+
+	if (!lwtstate)
+		goto drop;
+
+	if (lwtstate->type == LWTUNNEL_ENCAP_NONE ||
+	    lwtstate->type > LWTUNNEL_ENCAP_MAX)
+		return 0;
+
+	ret = -EOPNOTSUPP;
+	rcu_read_lock();
+	ops = rcu_dereference(lwtun_encaps[lwtstate->type]);
+	if (likely(ops && ops->input))
+		ret = ops->input(skb);
+	rcu_read_unlock();
+
+	if (ret == -EOPNOTSUPP)
+		goto drop;
+
+	return ret;
+
+drop:
+	kfree_skb(skb);
+
+	return ret;
+}
+
+int lwtunnel_input6(struct sk_buff *skb)
+{
+	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
+	struct lwtunnel_state *lwtstate = NULL;
+
+	if (rt)
+		lwtstate = rt->rt6i_lwtstate;
+
+	return __lwtunnel_input(skb, lwtstate);
+}
+EXPORT_SYMBOL(lwtunnel_input6);
+
+int lwtunnel_input(struct sk_buff *skb)
+{
+	struct rtable *rt = (struct rtable *)skb_dst(skb);
+	struct lwtunnel_state *lwtstate = NULL;
+
+	if (rt)
+		lwtstate = rt->rt_lwtstate;
+
+	return __lwtunnel_input(skb, lwtstate);
+}
+EXPORT_SYMBOL(lwtunnel_input);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 2c89d294b669..2403e85107f0 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1631,8 +1631,14 @@ static int __mkroute_input(struct sk_buff *skb,
 	rth->dst.output = ip_output;
 
 	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
-	if (lwtunnel_output_redirect(rth->rt_lwtstate))
+	if (lwtunnel_output_redirect(rth->rt_lwtstate)) {
+		rth->rt_lwtstate->orig_output = rth->dst.output;
 		rth->dst.output = lwtunnel_output;
+	}
+	if (lwtunnel_input_redirect(rth->rt_lwtstate)) {
+		rth->rt_lwtstate->orig_input = rth->dst.input;
+		rth->dst.input = lwtunnel_input;
+	}
 	skb_dst_set(skb, &rth->dst);
 out:
 	err = 0;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 1c0217e61357..c3733049715e 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1785,8 +1785,14 @@ int ip6_route_add(struct fib6_config *cfg)
 		if (err)
 			goto out;
 		rt->rt6i_lwtstate = lwtstate_get(lwtstate);
-		if (lwtunnel_output_redirect(rt->rt6i_lwtstate))
+		if (lwtunnel_output_redirect(rt->rt6i_lwtstate)) {
+			rt->rt6i_lwtstate->orig_output = rt->dst.output;
 			rt->dst.output = lwtunnel_output6;
+		}
+		if (lwtunnel_input_redirect(rt->rt6i_lwtstate)) {
+			rt->rt6i_lwtstate->orig_input = rt->dst.input;
+			rt->dst.input = lwtunnel_input6;
+		}
 	}
 
 	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
-- 
cgit v1.2.3


From 4b048d6d9d0b0b90e1e94f2393796bbf1fa8df4e Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Mon, 17 Aug 2015 13:42:25 -0700
Subject: net: Change pseudohdr argument of inet_proto_csum_replace* to be a
 bool

inet_proto_csum_replace4,2,16 take a pseudohdr argument which indicates
the checksum field carries a pseudo header. This argument should be a
boolean instead of an int.

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/checksum.h                   |  6 +++---
 net/core/filter.c                        |  2 +-
 net/core/utils.c                         |  4 ++--
 net/ipv4/netfilter/ipt_ECN.c             |  2 +-
 net/ipv4/netfilter/nf_nat_l3proto_ipv4.c |  4 ++--
 net/ipv4/netfilter/nf_nat_proto_icmp.c   |  2 +-
 net/ipv6/netfilter/nf_nat_l3proto_ipv6.c |  4 ++--
 net/ipv6/netfilter/nf_nat_proto_icmpv6.c |  2 +-
 net/netfilter/nf_conntrack_seqadj.c      |  9 +++++----
 net/netfilter/nf_nat_proto_dccp.c        |  2 +-
 net/netfilter/nf_nat_proto_tcp.c         |  2 +-
 net/netfilter/nf_nat_proto_udp.c         |  2 +-
 net/netfilter/nf_nat_proto_udplite.c     |  2 +-
 net/netfilter/nf_synproxy_core.c         |  2 +-
 net/netfilter/xt_TCPMSS.c                |  8 ++++----
 net/netfilter/xt_TCPOPTSTRIP.c           |  2 +-
 net/openvswitch/actions.c                | 12 ++++++------
 net/sched/act_nat.c                      |  7 ++++---
 18 files changed, 38 insertions(+), 36 deletions(-)

(limited to 'include/net')

diff --git a/include/net/checksum.h b/include/net/checksum.h
index 2d1d73cb773e..619f3445d57e 100644
--- a/include/net/checksum.h
+++ b/include/net/checksum.h
@@ -140,14 +140,14 @@ static inline void csum_replace2(__sum16 *sum, __be16 old, __be16 new)
 
 struct sk_buff;
 void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb,
-			      __be32 from, __be32 to, int pseudohdr);
+			      __be32 from, __be32 to, bool pseudohdr);
 void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb,
 			       const __be32 *from, const __be32 *to,
-			       int pseudohdr);
+			       bool pseudohdr);
 
 static inline void inet_proto_csum_replace2(__sum16 *sum, struct sk_buff *skb,
 					    __be16 from, __be16 to,
-					    int pseudohdr)
+					    bool pseudohdr)
 {
 	inet_proto_csum_replace4(sum, skb, (__force __be32)from,
 				 (__force __be32)to, pseudohdr);
diff --git a/net/core/filter.c b/net/core/filter.c
index f8184222465e..83f08cefeab7 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1349,7 +1349,7 @@ const struct bpf_func_proto bpf_l3_csum_replace_proto = {
 static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
 {
 	struct sk_buff *skb = (struct sk_buff *) (long) r1;
-	u32 is_pseudo = BPF_IS_PSEUDO_HEADER(flags);
+	bool is_pseudo = !!BPF_IS_PSEUDO_HEADER(flags);
 	int offset = (int) r2;
 	__sum16 sum, *ptr;
 
diff --git a/net/core/utils.c b/net/core/utils.c
index a7732a068043..cd7d202f340e 100644
--- a/net/core/utils.c
+++ b/net/core/utils.c
@@ -301,7 +301,7 @@ out:
 EXPORT_SYMBOL(in6_pton);
 
 void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb,
-			      __be32 from, __be32 to, int pseudohdr)
+			      __be32 from, __be32 to, bool pseudohdr)
 {
 	if (skb->ip_summed != CHECKSUM_PARTIAL) {
 		csum_replace4(sum, from, to);
@@ -318,7 +318,7 @@ EXPORT_SYMBOL(inet_proto_csum_replace4);
 
 void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb,
 			       const __be32 *from, const __be32 *to,
-			       int pseudohdr)
+			       bool pseudohdr)
 {
 	__be32 diff[] = {
 		~from[0], ~from[1], ~from[2], ~from[3],
diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c
index 4bf3dc49ad1e..270765236f5e 100644
--- a/net/ipv4/netfilter/ipt_ECN.c
+++ b/net/ipv4/netfilter/ipt_ECN.c
@@ -72,7 +72,7 @@ set_ect_tcp(struct sk_buff *skb, const struct ipt_ECN_info *einfo)
 		tcph->cwr = einfo->proto.tcp.cwr;
 
 	inet_proto_csum_replace2(&tcph->check, skb,
-				 oldval, ((__be16 *)tcph)[6], 0);
+				 oldval, ((__be16 *)tcph)[6], false);
 	return true;
 }
 
diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
index e59cc05c09e9..22f4579b0c2a 100644
--- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
@@ -120,7 +120,7 @@ static void nf_nat_ipv4_csum_update(struct sk_buff *skb,
 		oldip = iph->daddr;
 		newip = t->dst.u3.ip;
 	}
-	inet_proto_csum_replace4(check, skb, oldip, newip, 1);
+	inet_proto_csum_replace4(check, skb, oldip, newip, true);
 }
 
 static void nf_nat_ipv4_csum_recalc(struct sk_buff *skb,
@@ -151,7 +151,7 @@ static void nf_nat_ipv4_csum_recalc(struct sk_buff *skb,
 		}
 	} else
 		inet_proto_csum_replace2(check, skb,
-					 htons(oldlen), htons(datalen), 1);
+					 htons(oldlen), htons(datalen), true);
 }
 
 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
diff --git a/net/ipv4/netfilter/nf_nat_proto_icmp.c b/net/ipv4/netfilter/nf_nat_proto_icmp.c
index 4557b4ab8342..7b98baa13ede 100644
--- a/net/ipv4/netfilter/nf_nat_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_nat_proto_icmp.c
@@ -67,7 +67,7 @@ icmp_manip_pkt(struct sk_buff *skb,
 
 	hdr = (struct icmphdr *)(skb->data + hdroff);
 	inet_proto_csum_replace2(&hdr->checksum, skb,
-				 hdr->un.echo.id, tuple->src.u.icmp.id, 0);
+				 hdr->un.echo.id, tuple->src.u.icmp.id, false);
 	hdr->un.echo.id = tuple->src.u.icmp.id;
 	return true;
 }
diff --git a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
index e76900e0aa92..70fbaed49edb 100644
--- a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
+++ b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
@@ -124,7 +124,7 @@ static void nf_nat_ipv6_csum_update(struct sk_buff *skb,
 		newip = &t->dst.u3.in6;
 	}
 	inet_proto_csum_replace16(check, skb, oldip->s6_addr32,
-				  newip->s6_addr32, 1);
+				  newip->s6_addr32, true);
 }
 
 static void nf_nat_ipv6_csum_recalc(struct sk_buff *skb,
@@ -155,7 +155,7 @@ static void nf_nat_ipv6_csum_recalc(struct sk_buff *skb,
 		}
 	} else
 		inet_proto_csum_replace2(check, skb,
-					 htons(oldlen), htons(datalen), 1);
+					 htons(oldlen), htons(datalen), true);
 }
 
 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
diff --git a/net/ipv6/netfilter/nf_nat_proto_icmpv6.c b/net/ipv6/netfilter/nf_nat_proto_icmpv6.c
index 2205e8eeeacf..57593b00c5b4 100644
--- a/net/ipv6/netfilter/nf_nat_proto_icmpv6.c
+++ b/net/ipv6/netfilter/nf_nat_proto_icmpv6.c
@@ -73,7 +73,7 @@ icmpv6_manip_pkt(struct sk_buff *skb,
 	    hdr->icmp6_type == ICMPV6_ECHO_REPLY) {
 		inet_proto_csum_replace2(&hdr->icmp6_cksum, skb,
 					 hdr->icmp6_identifier,
-					 tuple->src.u.icmp.id, 0);
+					 tuple->src.u.icmp.id, false);
 		hdr->icmp6_identifier = tuple->src.u.icmp.id;
 	}
 	return true;
diff --git a/net/netfilter/nf_conntrack_seqadj.c b/net/netfilter/nf_conntrack_seqadj.c
index ce3e840c8704..dff0f0cc59e4 100644
--- a/net/netfilter/nf_conntrack_seqadj.c
+++ b/net/netfilter/nf_conntrack_seqadj.c
@@ -103,9 +103,9 @@ static void nf_ct_sack_block_adjust(struct sk_buff *skb,
 			 ntohl(sack->end_seq), ntohl(new_end_seq));
 
 		inet_proto_csum_replace4(&tcph->check, skb,
-					 sack->start_seq, new_start_seq, 0);
+					 sack->start_seq, new_start_seq, false);
 		inet_proto_csum_replace4(&tcph->check, skb,
-					 sack->end_seq, new_end_seq, 0);
+					 sack->end_seq, new_end_seq, false);
 		sack->start_seq = new_start_seq;
 		sack->end_seq = new_end_seq;
 		sackoff += sizeof(*sack);
@@ -193,8 +193,9 @@ int nf_ct_seq_adjust(struct sk_buff *skb,
 	newseq = htonl(ntohl(tcph->seq) + seqoff);
 	newack = htonl(ntohl(tcph->ack_seq) - ackoff);
 
-	inet_proto_csum_replace4(&tcph->check, skb, tcph->seq, newseq, 0);
-	inet_proto_csum_replace4(&tcph->check, skb, tcph->ack_seq, newack, 0);
+	inet_proto_csum_replace4(&tcph->check, skb, tcph->seq, newseq, false);
+	inet_proto_csum_replace4(&tcph->check, skb, tcph->ack_seq, newack,
+				 false);
 
 	pr_debug("Adjusting sequence number from %u->%u, ack from %u->%u\n",
 		 ntohl(tcph->seq), ntohl(newseq), ntohl(tcph->ack_seq),
diff --git a/net/netfilter/nf_nat_proto_dccp.c b/net/netfilter/nf_nat_proto_dccp.c
index b8067b53ff3a..15c47b246d0d 100644
--- a/net/netfilter/nf_nat_proto_dccp.c
+++ b/net/netfilter/nf_nat_proto_dccp.c
@@ -69,7 +69,7 @@ dccp_manip_pkt(struct sk_buff *skb,
 	l3proto->csum_update(skb, iphdroff, &hdr->dccph_checksum,
 			     tuple, maniptype);
 	inet_proto_csum_replace2(&hdr->dccph_checksum, skb, oldport, newport,
-				 0);
+				 false);
 	return true;
 }
 
diff --git a/net/netfilter/nf_nat_proto_tcp.c b/net/netfilter/nf_nat_proto_tcp.c
index 37f5505f4529..4f8820fc5148 100644
--- a/net/netfilter/nf_nat_proto_tcp.c
+++ b/net/netfilter/nf_nat_proto_tcp.c
@@ -70,7 +70,7 @@ tcp_manip_pkt(struct sk_buff *skb,
 		return true;
 
 	l3proto->csum_update(skb, iphdroff, &hdr->check, tuple, maniptype);
-	inet_proto_csum_replace2(&hdr->check, skb, oldport, newport, 0);
+	inet_proto_csum_replace2(&hdr->check, skb, oldport, newport, false);
 	return true;
 }
 
diff --git a/net/netfilter/nf_nat_proto_udp.c b/net/netfilter/nf_nat_proto_udp.c
index b0ede2f0d8bc..b1e627227b6e 100644
--- a/net/netfilter/nf_nat_proto_udp.c
+++ b/net/netfilter/nf_nat_proto_udp.c
@@ -57,7 +57,7 @@ udp_manip_pkt(struct sk_buff *skb,
 		l3proto->csum_update(skb, iphdroff, &hdr->check,
 				     tuple, maniptype);
 		inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport,
-					 0);
+					 false);
 		if (!hdr->check)
 			hdr->check = CSUM_MANGLED_0;
 	}
diff --git a/net/netfilter/nf_nat_proto_udplite.c b/net/netfilter/nf_nat_proto_udplite.c
index 368f14e01e75..58340c97bd83 100644
--- a/net/netfilter/nf_nat_proto_udplite.c
+++ b/net/netfilter/nf_nat_proto_udplite.c
@@ -56,7 +56,7 @@ udplite_manip_pkt(struct sk_buff *skb,
 	}
 
 	l3proto->csum_update(skb, iphdroff, &hdr->check, tuple, maniptype);
-	inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport, 0);
+	inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport, false);
 	if (!hdr->check)
 		hdr->check = CSUM_MANGLED_0;
 
diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c
index d7f168527903..14f8b43ec5a7 100644
--- a/net/netfilter/nf_synproxy_core.c
+++ b/net/netfilter/nf_synproxy_core.c
@@ -225,7 +225,7 @@ unsigned int synproxy_tstamp_adjust(struct sk_buff *skb,
 						     synproxy->tsoff);
 				}
 				inet_proto_csum_replace4(&th->check, skb,
-							 old, *ptr, 0);
+							 old, *ptr, false);
 				return 1;
 			}
 			optoff += op[1];
diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c
index 8c3190e2fc6a..8c02501a530f 100644
--- a/net/netfilter/xt_TCPMSS.c
+++ b/net/netfilter/xt_TCPMSS.c
@@ -144,7 +144,7 @@ tcpmss_mangle_packet(struct sk_buff *skb,
 
 			inet_proto_csum_replace2(&tcph->check, skb,
 						 htons(oldmss), htons(newmss),
-						 0);
+						 false);
 			return 0;
 		}
 	}
@@ -185,18 +185,18 @@ tcpmss_mangle_packet(struct sk_buff *skb,
 	memmove(opt + TCPOLEN_MSS, opt, len - sizeof(struct tcphdr));
 
 	inet_proto_csum_replace2(&tcph->check, skb,
-				 htons(len), htons(len + TCPOLEN_MSS), 1);
+				 htons(len), htons(len + TCPOLEN_MSS), true);
 	opt[0] = TCPOPT_MSS;
 	opt[1] = TCPOLEN_MSS;
 	opt[2] = (newmss & 0xff00) >> 8;
 	opt[3] = newmss & 0x00ff;
 
-	inet_proto_csum_replace4(&tcph->check, skb, 0, *((__be32 *)opt), 0);
+	inet_proto_csum_replace4(&tcph->check, skb, 0, *((__be32 *)opt), false);
 
 	oldval = ((__be16 *)tcph)[6];
 	tcph->doff += TCPOLEN_MSS/4;
 	inet_proto_csum_replace2(&tcph->check, skb,
-				 oldval, ((__be16 *)tcph)[6], 0);
+				 oldval, ((__be16 *)tcph)[6], false);
 	return TCPOLEN_MSS;
 }
 
diff --git a/net/netfilter/xt_TCPOPTSTRIP.c b/net/netfilter/xt_TCPOPTSTRIP.c
index 625fa1d636a0..eb92bffff11c 100644
--- a/net/netfilter/xt_TCPOPTSTRIP.c
+++ b/net/netfilter/xt_TCPOPTSTRIP.c
@@ -80,7 +80,7 @@ tcpoptstrip_mangle_packet(struct sk_buff *skb,
 				n <<= 8;
 			}
 			inet_proto_csum_replace2(&tcph->check, skb, htons(o),
-						 htons(n), 0);
+						 htons(n), false);
 		}
 		memset(opt + i, TCPOPT_NOP, optl);
 	}
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 14da52ddd327..4f4200717bef 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -284,14 +284,14 @@ static void update_ip_l4_checksum(struct sk_buff *skb, struct iphdr *nh,
 	if (nh->protocol == IPPROTO_TCP) {
 		if (likely(transport_len >= sizeof(struct tcphdr)))
 			inet_proto_csum_replace4(&tcp_hdr(skb)->check, skb,
-						 addr, new_addr, 1);
+						 addr, new_addr, true);
 	} else if (nh->protocol == IPPROTO_UDP) {
 		if (likely(transport_len >= sizeof(struct udphdr))) {
 			struct udphdr *uh = udp_hdr(skb);
 
 			if (uh->check || skb->ip_summed == CHECKSUM_PARTIAL) {
 				inet_proto_csum_replace4(&uh->check, skb,
-							 addr, new_addr, 1);
+							 addr, new_addr, true);
 				if (!uh->check)
 					uh->check = CSUM_MANGLED_0;
 			}
@@ -316,14 +316,14 @@ static void update_ipv6_checksum(struct sk_buff *skb, u8 l4_proto,
 	if (l4_proto == NEXTHDR_TCP) {
 		if (likely(transport_len >= sizeof(struct tcphdr)))
 			inet_proto_csum_replace16(&tcp_hdr(skb)->check, skb,
-						  addr, new_addr, 1);
+						  addr, new_addr, true);
 	} else if (l4_proto == NEXTHDR_UDP) {
 		if (likely(transport_len >= sizeof(struct udphdr))) {
 			struct udphdr *uh = udp_hdr(skb);
 
 			if (uh->check || skb->ip_summed == CHECKSUM_PARTIAL) {
 				inet_proto_csum_replace16(&uh->check, skb,
-							  addr, new_addr, 1);
+							  addr, new_addr, true);
 				if (!uh->check)
 					uh->check = CSUM_MANGLED_0;
 			}
@@ -331,7 +331,7 @@ static void update_ipv6_checksum(struct sk_buff *skb, u8 l4_proto,
 	} else if (l4_proto == NEXTHDR_ICMP) {
 		if (likely(transport_len >= sizeof(struct icmp6hdr)))
 			inet_proto_csum_replace16(&icmp6_hdr(skb)->icmp6_cksum,
-						  skb, addr, new_addr, 1);
+						  skb, addr, new_addr, true);
 	}
 }
 
@@ -498,7 +498,7 @@ static int set_ipv6(struct sk_buff *skb, struct sw_flow_key *flow_key,
 static void set_tp_port(struct sk_buff *skb, __be16 *port,
 			__be16 new_port, __sum16 *check)
 {
-	inet_proto_csum_replace2(check, skb, *port, new_port, 0);
+	inet_proto_csum_replace2(check, skb, *port, new_port, false);
 	*port = new_port;
 }
 
diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c
index 5be0b3c1c5b0..b7c4ead8b5a8 100644
--- a/net/sched/act_nat.c
+++ b/net/sched/act_nat.c
@@ -162,7 +162,8 @@ static int tcf_nat(struct sk_buff *skb, const struct tc_action *a,
 			goto drop;
 
 		tcph = (void *)(skb_network_header(skb) + ihl);
-		inet_proto_csum_replace4(&tcph->check, skb, addr, new_addr, 1);
+		inet_proto_csum_replace4(&tcph->check, skb, addr, new_addr,
+					 true);
 		break;
 	}
 	case IPPROTO_UDP:
@@ -178,7 +179,7 @@ static int tcf_nat(struct sk_buff *skb, const struct tc_action *a,
 		udph = (void *)(skb_network_header(skb) + ihl);
 		if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
 			inet_proto_csum_replace4(&udph->check, skb, addr,
-						 new_addr, 1);
+						 new_addr, true);
 			if (!udph->check)
 				udph->check = CSUM_MANGLED_0;
 		}
@@ -231,7 +232,7 @@ static int tcf_nat(struct sk_buff *skb, const struct tc_action *a,
 			iph->saddr = new_addr;
 
 		inet_proto_csum_replace4(&icmph->checksum, skb, addr, new_addr,
-					 0);
+					 false);
 		break;
 	}
 	default:
-- 
cgit v1.2.3


From abc5d1ff3e8f9b4a9d274818459b123e31981dc9 Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Mon, 17 Aug 2015 13:42:26 -0700
Subject: net: Add inet_proto_csum_replace_by_diff utility function

This function updates a checksum field value and skb->csum based on
a value which is the difference between the old and new checksum.

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/checksum.h |  2 ++
 net/core/utils.c       | 13 +++++++++++++
 2 files changed, 15 insertions(+)

(limited to 'include/net')

diff --git a/include/net/checksum.h b/include/net/checksum.h
index 619f3445d57e..9fcaedf994ee 100644
--- a/include/net/checksum.h
+++ b/include/net/checksum.h
@@ -144,6 +144,8 @@ void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb,
 void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb,
 			       const __be32 *from, const __be32 *to,
 			       bool pseudohdr);
+void inet_proto_csum_replace_by_diff(__sum16 *sum, struct sk_buff *skb,
+				     __wsum diff, bool pseudohdr);
 
 static inline void inet_proto_csum_replace2(__sum16 *sum, struct sk_buff *skb,
 					    __be16 from, __be16 to,
diff --git a/net/core/utils.c b/net/core/utils.c
index cd7d202f340e..3dffce953c39 100644
--- a/net/core/utils.c
+++ b/net/core/utils.c
@@ -336,6 +336,19 @@ void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb,
 }
 EXPORT_SYMBOL(inet_proto_csum_replace16);
 
+void inet_proto_csum_replace_by_diff(__sum16 *sum, struct sk_buff *skb,
+				     __wsum diff, bool pseudohdr)
+{
+	if (skb->ip_summed != CHECKSUM_PARTIAL) {
+		*sum = csum_fold(csum_add(diff, ~csum_unfold(*sum)));
+		if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr)
+			skb->csum = ~csum_add(diff, ~skb->csum);
+	} else if (pseudohdr) {
+		*sum = ~csum_fold(csum_add(diff, csum_unfold(*sum)));
+	}
+}
+EXPORT_SYMBOL(inet_proto_csum_replace_by_diff);
+
 struct __net_random_once_work {
 	struct work_struct work;
 	struct static_key *key;
-- 
cgit v1.2.3


From 60045cbfc0b291dae8dd5b929d67b87c5ea954d4 Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Mon, 17 Aug 2015 23:52:51 +0200
Subject: net: dsa: Add dsa_is_dsa_port() helper

Add an inline helper for determining is a port is a DSA port.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/mv88e6xxx.c | 5 ++---
 include/net/dsa.h           | 5 +++++
 2 files changed, 7 insertions(+), 3 deletions(-)

(limited to 'include/net')

diff --git a/drivers/net/dsa/mv88e6xxx.c b/drivers/net/dsa/mv88e6xxx.c
index 332f2c8090d0..486e9792fc05 100644
--- a/drivers/net/dsa/mv88e6xxx.c
+++ b/drivers/net/dsa/mv88e6xxx.c
@@ -1926,8 +1926,7 @@ static int mv88e6xxx_setup_port(struct dsa_switch *ds, int port)
 		 * full duplex.
 		 */
 		reg = _mv88e6xxx_reg_read(ds, REG_PORT(port), PORT_PCS_CTRL);
-		if (dsa_is_cpu_port(ds, port) ||
-		    ds->dsa_port_mask & (1 << port)) {
+		if (dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port)) {
 			reg |= PORT_PCS_CTRL_FORCE_LINK |
 				PORT_PCS_CTRL_LINK_UP |
 				PORT_PCS_CTRL_DUPLEX_FULL |
@@ -1992,7 +1991,7 @@ static int mv88e6xxx_setup_port(struct dsa_switch *ds, int port)
 	    mv88e6xxx_6165_family(ds) || mv88e6xxx_6097_family(ds) ||
 	    mv88e6xxx_6095_family(ds) || mv88e6xxx_6065_family(ds) ||
 	    mv88e6xxx_6320_family(ds)) {
-		if (ds->dsa_port_mask & (1 << port))
+		if (dsa_is_dsa_port(ds, port))
 			reg |= PORT_CONTROL_FRAME_MODE_DSA;
 		if (port == dsa_upstream_port(ds))
 			reg |= PORT_CONTROL_FORWARD_UNKNOWN |
diff --git a/include/net/dsa.h b/include/net/dsa.h
index bd9b76502458..b34d812bc5d0 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -171,6 +171,11 @@ static inline bool dsa_is_cpu_port(struct dsa_switch *ds, int p)
 	return !!(ds->index == ds->dst->cpu_switch && p == ds->dst->cpu_port);
 }
 
+static inline bool dsa_is_dsa_port(struct dsa_switch *ds, int p)
+{
+	return !!((ds->dsa_port_mask) & (1 << p));
+}
+
 static inline bool dsa_is_port_initialized(struct dsa_switch *ds, int p)
 {
 	return ds->phys_port_mask & (1 << p) && ds->ports[p];
-- 
cgit v1.2.3


From df383e6240ef222703648072dafd2a1ae21b0d2a Mon Sep 17 00:00:00 2001
From: Jiri Benc <jbenc@redhat.com>
Date: Tue, 18 Aug 2015 18:41:13 +0200
Subject: lwtunnel: fix memory leak

The built lwtunnel_state struct has to be freed after comparison.

Fixes: 571e722676fe3 ("ipv4: support for fib route lwtunnel encap attributes")
Signed-off-by: Jiri Benc <jbenc@redhat.com>
Acked-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/lwtunnel.h   |  7 ++++++-
 net/ipv4/fib_semantics.c | 10 ++++++----
 2 files changed, 12 insertions(+), 5 deletions(-)

(limited to 'include/net')

diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h
index e25b60eb262d..34fd8f70c2ca 100644
--- a/include/net/lwtunnel.h
+++ b/include/net/lwtunnel.h
@@ -36,6 +36,11 @@ struct lwtunnel_encap_ops {
 };
 
 #ifdef CONFIG_LWTUNNEL
+static inline void lwtstate_free(struct lwtunnel_state *lws)
+{
+	kfree(lws);
+}
+
 static inline struct lwtunnel_state *
 lwtstate_get(struct lwtunnel_state *lws)
 {
@@ -51,7 +56,7 @@ static inline void lwtstate_put(struct lwtunnel_state *lws)
 		return;
 
 	if (atomic_dec_and_test(&lws->refcnt))
-		kfree(lws);
+		lwtstate_free(lws);
 }
 
 static inline bool lwtunnel_output_redirect(struct lwtunnel_state *lwtstate)
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index c8025851dac7..d5253071f71f 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -539,7 +539,7 @@ int fib_encap_match(struct net *net, u16 encap_type,
 {
 	struct lwtunnel_state *lwtstate;
 	struct net_device *dev = NULL;
-	int ret;
+	int ret, result = 0;
 
 	if (encap_type == LWTUNNEL_ENCAP_NONE)
 		return 0;
@@ -548,10 +548,12 @@ int fib_encap_match(struct net *net, u16 encap_type,
 		dev = __dev_get_by_index(net, oif);
 	ret = lwtunnel_build_state(dev, encap_type,
 				   encap, &lwtstate);
-	if (!ret)
-		return lwtunnel_cmp_encap(lwtstate, nh->nh_lwtstate);
+	if (!ret) {
+		result = lwtunnel_cmp_encap(lwtstate, nh->nh_lwtstate);
+		lwtstate_free(lwtstate);
+	}
 
-	return 0;
+	return result;
 }
 
 int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
-- 
cgit v1.2.3


From db5dbec5ef2d4565bb8d42709802de66b06f9965 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Tue, 18 Aug 2015 20:28:02 +0300
Subject: vrf: drop unused num_slaves member

slave_queue has a num_slaves member which is unused, drop it.

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vrf.c | 2 --
 include/net/vrf.h | 1 -
 2 files changed, 3 deletions(-)

(limited to 'include/net')

diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
index cd4bc77f2e04..3d7da0c6f827 100644
--- a/drivers/net/vrf.c
+++ b/drivers/net/vrf.c
@@ -335,13 +335,11 @@ static struct slave *__vrf_find_slave_dev(struct slave_queue *queue,
 static void __vrf_remove_slave(struct slave_queue *queue, struct slave *slave)
 {
 	list_del(&slave->list);
-	queue->num_slaves--;
 }
 
 static void __vrf_insert_slave(struct slave_queue *queue, struct slave *slave)
 {
 	list_add(&slave->list, &queue->all_slaves);
-	queue->num_slaves++;
 }
 
 static int do_vrf_add_slave(struct net_device *dev, struct net_device *port_dev)
diff --git a/include/net/vrf.h b/include/net/vrf.h
index 40e3793c7a05..3bb4af462ed6 100644
--- a/include/net/vrf.h
+++ b/include/net/vrf.h
@@ -24,7 +24,6 @@ struct slave {
 
 struct slave_queue {
 	struct list_head	all_slaves;
-	int			num_slaves;
 };
 
 struct net_vrf {
-- 
cgit v1.2.3


From bf798657eb5ba57552096843c315f096fdf9b715 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 12 Aug 2015 17:41:00 +0200
Subject: netfilter: nf_tables: Use 32 bit addressing register from
 nft_type_to_reg()

nft_type_to_reg() needs to return the register in the new 32 bit addressing,
otherwise we hit EINVAL when using mappings.

Fixes: 49499c3 ("netfilter: nf_tables: switch registers to 32 bit addressing")
Reported-by: Andreas Schultz <aschultz@tpip.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 2a246680a6c3..aa8bee72c9d3 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -125,7 +125,7 @@ static inline enum nft_data_types nft_dreg_to_type(enum nft_registers reg)
 
 static inline enum nft_registers nft_type_to_reg(enum nft_data_types type)
 {
-	return type == NFT_DATA_VERDICT ? NFT_REG_VERDICT : NFT_REG_1;
+	return type == NFT_DATA_VERDICT ? NFT_REG_VERDICT : NFT_REG_1 * NFT_REG_SIZE / NFT_REG32_SIZE;
 }
 
 unsigned int nft_parse_register(const struct nlattr *attr);
-- 
cgit v1.2.3


From 18e1db67e93ed75d9dc0d34c8d783ccf10547c2b Mon Sep 17 00:00:00 2001
From: Bernhard Thaler <bernhard.thaler@wvnet.at>
Date: Thu, 13 Aug 2015 08:58:15 +0200
Subject: netfilter: bridge: fix IPv6 packets not being bridged with
 CONFIG_IPV6=n

230ac490f7fba introduced a dependency to CONFIG_IPV6 which breaks bridging
of IPv6 packets on a bridge with CONFIG_IPV6=n.

Sysctl entry /proc/sys/net/bridge/bridge-nf-call-ip6tables defaults to 1,
for this reason packets are handled by br_nf_pre_routing_ipv6(). When compiled
with CONFIG_IPV6=n this function returns NF_DROP but should return NF_ACCEPT
to let packets through.

Change CONFIG_IPV6=n br_nf_pre_routing_ipv6() return value to NF_ACCEPT.

Tested with a simple bridge with two interfaces and IPv6 packets trying
to pass from host on left side to host on right side of the bridge.

Fixes: 230ac490f7fba ("netfilter: bridge: split ipv6 code into separated file")
Signed-off-by: Bernhard Thaler <bernhard.thaler@wvnet.at>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/br_netfilter.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/netfilter/br_netfilter.h b/include/net/netfilter/br_netfilter.h
index bab824bde92c..d4c6b5f30acd 100644
--- a/include/net/netfilter/br_netfilter.h
+++ b/include/net/netfilter/br_netfilter.h
@@ -59,7 +59,7 @@ static inline unsigned int
 br_nf_pre_routing_ipv6(const struct nf_hook_ops *ops, struct sk_buff *skb,
 		       const struct nf_hook_state *state)
 {
-	return NF_DROP;
+	return NF_ACCEPT;
 }
 #endif
 
-- 
cgit v1.2.3


From 824e7383e92815cb591793c74cc836aa5165f7f8 Mon Sep 17 00:00:00 2001
From: Ying Xue <ying.xue@windriver.com>
Date: Wed, 19 Aug 2015 15:46:17 +0800
Subject: lwtunnel: Fix the sparse warnings in fib_encap_match
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When CONFIG_LWTUNNEL config is not enabled, the lwtstate_free() is not
declared in lwtunnel.h at all. However, even in this case, the function
is still referenced in fib_semantics.c so that there appears the
following sparse warnings:

net/ipv4/fib_semantics.c:553:17: error: undefined identifier 'lwtstate_free'
  CC      net/ipv4/fib_semantics.o
  net/ipv4/fib_semantics.c: In function ‘fib_encap_match’:
  net/ipv4/fib_semantics.c:553:3: error: implicit declaration of function ‘lwtstate_free’ [-Werror=implicit-function-declaration]
  cc1: some warnings being treated as errors
  make[1]: *** [net/ipv4/fib_semantics.o] Error 1
  make: *** [net/ipv4/fib_semantics.o] Error 2

To eliminate the error, we define an empty function for lwtstate_free()
in lwtunnel.h when CONFIG_LWTUNNEL is disabled.

Fixes: df383e6240ef ("lwtunnel: fix memory leak")
Cc: Jiri Benc <jbenc@redhat.com>
Reported-by: kbuild test robot <fengguang.wu@intel.com>
Signed-off-by: Ying Xue <ying.xue@windriver.com>
Acked-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/lwtunnel.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/net')

diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h
index 34fd8f70c2ca..cfee53916ba5 100644
--- a/include/net/lwtunnel.h
+++ b/include/net/lwtunnel.h
@@ -93,6 +93,10 @@ int lwtunnel_input6(struct sk_buff *skb);
 
 #else
 
+static inline void lwtstate_free(struct lwtunnel_state *lws)
+{
+}
+
 static inline struct lwtunnel_state *
 lwtstate_get(struct lwtunnel_state *lws)
 {
-- 
cgit v1.2.3


From 18041e31743d278b6323518d20a2ef656c3cc689 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Tue, 18 Aug 2015 21:40:16 +0300
Subject: vrf: vrf_master_ifindex_rcu is not always called with rcu read lock

While running net-next I hit this:
[  634.073119] ===============================
[  634.073150] [ INFO: suspicious RCU usage. ]
[  634.073182] 4.2.0-rc6+ #45 Not tainted
[  634.073213] -------------------------------
[  634.073244] include/net/vrf.h:38 suspicious rcu_dereference_check()
usage!
[  634.073274]
               other info that might help us debug this:

[  634.073307]
               rcu_scheduler_active = 1, debug_locks = 1
[  634.073338] 2 locks held by swapper/0/0:
[  634.073369]  #0:  (((&n->timer))){+.-...}, at: [<ffffffff8112bc35>]
call_timer_fn+0x5/0x480
[  634.073412]  #1:  (slock-AF_INET){+.-...}, at: [<ffffffff8174f0f5>]
icmp_send+0x155/0x5f0
[  634.073450]
               stack backtrace:
[  634.073483] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.2.0-rc6+ #45
[  634.073514] Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS
VirtualBox 12/01/2006
[  634.073545]  0000000000000000 0593ba8242d9ace4 ffff88002fc03b48
ffffffff81803f1b
[  634.073612]  0000000000000000 ffffffff81e12500 ffff88002fc03b78
ffffffff811003c5
[  634.073642]  0000000000000000 ffff88002ec4e600 ffffffff81f00f80
ffff88002fc03cf0
[  634.073669] Call Trace:
[  634.073694]  <IRQ>  [<ffffffff81803f1b>] dump_stack+0x4c/0x65
[  634.073728]  [<ffffffff811003c5>] lockdep_rcu_suspicious+0xc5/0x100
[  634.073763]  [<ffffffff8174eb56>] icmp_route_lookup+0x176/0x5c0
[  634.073793]  [<ffffffff8174f2fb>] ? icmp_send+0x35b/0x5f0
[  634.073818]  [<ffffffff8174f274>] ? icmp_send+0x2d4/0x5f0
[  634.073844]  [<ffffffff8174f3ce>] icmp_send+0x42e/0x5f0
[  634.073873]  [<ffffffff8170b662>] ipv4_link_failure+0x22/0xa0
[  634.073899]  [<ffffffff8174bdda>] arp_error_report+0x3a/0x80
[  634.073926]  [<ffffffff816d6100>] ? neigh_lookup+0x2c0/0x2c0
[  634.073952]  [<ffffffff816d396e>] neigh_invalidate+0x8e/0x110
[  634.073984]  [<ffffffff816d62ae>] neigh_timer_handler+0x1ae/0x290
[  634.074013]  [<ffffffff816d6100>] ? neigh_lookup+0x2c0/0x2c0
[  634.074013]  [<ffffffff8112bce3>] call_timer_fn+0xb3/0x480
[  634.074013]  [<ffffffff8112bc35>] ? call_timer_fn+0x5/0x480
[  634.074013]  [<ffffffff816d6100>] ? neigh_lookup+0x2c0/0x2c0
[  634.074013]  [<ffffffff8112c2bc>] run_timer_softirq+0x20c/0x430
[  634.074013]  [<ffffffff810af50e>] __do_softirq+0xde/0x630
[  634.074013]  [<ffffffff810afc97>] irq_exit+0x117/0x120
[  634.074013]  [<ffffffff81810976>] smp_apic_timer_interrupt+0x46/0x60
[  634.074013]  [<ffffffff8180e950>] apic_timer_interrupt+0x70/0x80
[  634.074013]  <EOI>  [<ffffffff8106b9d6>] ? native_safe_halt+0x6/0x10
[  634.074013]  [<ffffffff81101d8d>] ? trace_hardirqs_on+0xd/0x10
[  634.074013]  [<ffffffff81027d43>] default_idle+0x23/0x200
[  634.074013]  [<ffffffff8102852f>] arch_cpu_idle+0xf/0x20
[  634.074013]  [<ffffffff810f89ba>] default_idle_call+0x2a/0x40
[  634.074013]  [<ffffffff810f8dcc>] cpu_startup_entry+0x39c/0x4c0
[  634.074013]  [<ffffffff817f9cad>] rest_init+0x13d/0x150
[  634.074013]  [<ffffffff81f69038>] start_kernel+0x4a8/0x4c9
[  634.074013]  [<ffffffff81f68120>] ?
early_idt_handler_array+0x120/0x120
[  634.074013]  [<ffffffff81f68339>] x86_64_start_reservations+0x2a/0x2c
[  634.074013]  [<ffffffff81f68485>] x86_64_start_kernel+0x14a/0x16d

It would seem vrf_master_ifindex_rcu() can be called without RCU held in
other contexts as well so introduce a new helper which acquires rcu and
returns the ifindex.
Also add curly braces around both the "if" and "else" parts as per the
style guide.

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/vrf.h | 20 ++++++++++++++++++--
 net/ipv4/icmp.c   |  4 ++--
 2 files changed, 20 insertions(+), 4 deletions(-)

(limited to 'include/net')

diff --git a/include/net/vrf.h b/include/net/vrf.h
index 3bb4af462ed6..5bfb16237fd7 100644
--- a/include/net/vrf.h
+++ b/include/net/vrf.h
@@ -43,9 +43,9 @@ static inline int vrf_master_ifindex_rcu(const struct net_device *dev)
 	if (!dev)
 		return 0;
 
-	if (netif_is_vrf(dev))
+	if (netif_is_vrf(dev)) {
 		ifindex = dev->ifindex;
-	else {
+	} else {
 		vrf_ptr = rcu_dereference(dev->vrf_ptr);
 		if (vrf_ptr)
 			ifindex = vrf_ptr->ifindex;
@@ -54,6 +54,17 @@ static inline int vrf_master_ifindex_rcu(const struct net_device *dev)
 	return ifindex;
 }
 
+static inline int vrf_master_ifindex(const struct net_device *dev)
+{
+	int ifindex;
+
+	rcu_read_lock();
+	ifindex = vrf_master_ifindex_rcu(dev);
+	rcu_read_unlock();
+
+	return ifindex;
+}
+
 /* called with rcu_read_lock */
 static inline int vrf_dev_table_rcu(const struct net_device *dev)
 {
@@ -133,6 +144,11 @@ static inline int vrf_master_ifindex_rcu(const struct net_device *dev)
 	return 0;
 }
 
+static inline int vrf_master_ifindex(const struct net_device *dev)
+{
+	return 0;
+}
+
 static inline int vrf_dev_table_rcu(const struct net_device *dev)
 {
 	return 0;
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index c6f1ce149ffb..f16488efa1c8 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -426,7 +426,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
 	fl4.flowi4_mark = mark;
 	fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
 	fl4.flowi4_proto = IPPROTO_ICMP;
-	fl4.flowi4_oif = vrf_master_ifindex_rcu(skb->dev) ? : skb->dev->ifindex;
+	fl4.flowi4_oif = vrf_master_ifindex(skb->dev) ? : skb->dev->ifindex;
 	security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
 	rt = ip_route_output_key(net, &fl4);
 	if (IS_ERR(rt))
@@ -460,7 +460,7 @@ static struct rtable *icmp_route_lookup(struct net *net,
 	fl4->flowi4_proto = IPPROTO_ICMP;
 	fl4->fl4_icmp_type = type;
 	fl4->fl4_icmp_code = code;
-	fl4->flowi4_oif = vrf_master_ifindex_rcu(skb_in->dev) ? : skb_in->dev->ifindex;
+	fl4->flowi4_oif = vrf_master_ifindex(skb_in->dev) ? : skb_in->dev->ifindex;
 
 	security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4));
 	rt = __ip_route_output_key(net, fl4);
-- 
cgit v1.2.3


From fdf79bd48876812acf0de58ed7a8bc1b3a3c67d6 Mon Sep 17 00:00:00 2001
From: Robert Baldyga <r.baldyga@samsung.com>
Date: Thu, 20 Aug 2015 17:26:00 +0200
Subject: NFC: nci: Add post_setup handler

Some drivers require non-standard configuration after NCI_CORE_INIT
request, because they need to know ndev->manufact_specific_info or
ndev->manufact_id. This patch adds post_setup handler allowing to do
such custom configuration.

Signed-off-by: Robert Baldyga <r.baldyga@samsung.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 include/net/nfc/nci_core.h | 1 +
 net/nfc/nci/core.c         | 4 ++++
 2 files changed, 5 insertions(+)

(limited to 'include/net')

diff --git a/include/net/nfc/nci_core.h b/include/net/nfc/nci_core.h
index 01fc8c531115..1bdaa5f51107 100644
--- a/include/net/nfc/nci_core.h
+++ b/include/net/nfc/nci_core.h
@@ -79,6 +79,7 @@ struct nci_ops {
 	int   (*close)(struct nci_dev *ndev);
 	int   (*send)(struct nci_dev *ndev, struct sk_buff *skb);
 	int   (*setup)(struct nci_dev *ndev);
+	int   (*post_setup)(struct nci_dev *ndev);
 	int   (*fw_download)(struct nci_dev *ndev, const char *firmware_name);
 	__u32 (*get_rfprotocol)(struct nci_dev *ndev, __u8 rf_protocol);
 	int   (*discover_se)(struct nci_dev *ndev);
diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c
index 95af2d24d5be..d9045ec172e3 100644
--- a/net/nfc/nci/core.c
+++ b/net/nfc/nci/core.c
@@ -388,6 +388,10 @@ static int nci_open_device(struct nci_dev *ndev)
 				   msecs_to_jiffies(NCI_INIT_TIMEOUT));
 	}
 
+	if (ndev->ops->post_setup) {
+		rc = ndev->ops->post_setup(ndev);
+	}
+
 	if (!rc) {
 		rc = __nci_request(ndev, nci_init_complete_req, 0,
 				   msecs_to_jiffies(NCI_INIT_TIMEOUT));
-- 
cgit v1.2.3


From 025a0cb8380b7100d39fb426db9192b6c59595dc Mon Sep 17 00:00:00 2001
From: Robert Baldyga <r.baldyga@samsung.com>
Date: Thu, 20 Aug 2015 17:26:01 +0200
Subject: NFC: nci: export nci_core_reset and nci_core_init

Some drivers needs to have ability to reinit NCI core, for example
after updating firmware in setup() of post_setup() callback. This
patch makes nci_core_reset() and nci_core_init() functions public,
to make it possible.

Signed-off-by: Robert Baldyga <r.baldyga@samsung.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 include/net/nfc/nci_core.h |  2 ++
 net/nfc/nci/core.c         | 14 ++++++++++++++
 2 files changed, 16 insertions(+)

(limited to 'include/net')

diff --git a/include/net/nfc/nci_core.h b/include/net/nfc/nci_core.h
index 1bdaa5f51107..d0d0f1e53bb9 100644
--- a/include/net/nfc/nci_core.h
+++ b/include/net/nfc/nci_core.h
@@ -278,6 +278,8 @@ int nci_request(struct nci_dev *ndev,
 			    unsigned long opt),
 		unsigned long opt, __u32 timeout);
 int nci_prop_cmd(struct nci_dev *ndev, __u8 oid, size_t len, __u8 *payload);
+int nci_core_reset(struct nci_dev *ndev);
+int nci_core_init(struct nci_dev *ndev);
 
 int nci_recv_frame(struct nci_dev *ndev, struct sk_buff *skb);
 int nci_set_config(struct nci_dev *ndev, __u8 id, size_t len, __u8 *val);
diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c
index d9045ec172e3..943889b87a34 100644
--- a/net/nfc/nci/core.c
+++ b/net/nfc/nci/core.c
@@ -351,6 +351,20 @@ int nci_prop_cmd(struct nci_dev *ndev, __u8 oid, size_t len, __u8 *payload)
 }
 EXPORT_SYMBOL(nci_prop_cmd);
 
+int nci_core_reset(struct nci_dev *ndev)
+{
+	return __nci_request(ndev, nci_reset_req, 0,
+			     msecs_to_jiffies(NCI_RESET_TIMEOUT));
+}
+EXPORT_SYMBOL(nci_core_reset);
+
+int nci_core_init(struct nci_dev *ndev)
+{
+	return __nci_request(ndev, nci_init_req, 0,
+			     msecs_to_jiffies(NCI_INIT_TIMEOUT));
+}
+EXPORT_SYMBOL(nci_core_init);
+
 static int nci_open_device(struct nci_dev *ndev)
 {
 	int rc = 0;
-- 
cgit v1.2.3


From 29e76924cf087bc6a9114a9244828fd13ae959bb Mon Sep 17 00:00:00 2001
From: Christophe Ricard <christophe.ricard@gmail.com>
Date: Wed, 19 Aug 2015 21:26:43 +0200
Subject: nfc: netlink: Add capability to reply to vendor_cmd with data

A proprietary vendor command may send back useful data to the user
application.

For example, the field level applied on the NFC router antenna.

Still based on net/wireless/nl80211.c implementation,
add nfc_vendor_cmd_alloc_reply_skb and nfc_vendor_cmd_reply in
order to send back over netlink data generated by a proprietary
command.

Signed-off-by: Christophe Ricard <christophe-h.ricard@st.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 include/net/nfc/nfc.h | 41 ++++++++++++++++++++++++
 net/nfc/netlink.c     | 86 +++++++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 125 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/nfc/nfc.h b/include/net/nfc/nfc.h
index f9e58ae45f9c..30afc9a6718c 100644
--- a/include/net/nfc/nfc.h
+++ b/include/net/nfc/nfc.h
@@ -203,6 +203,7 @@ struct nfc_dev {
 	int n_vendor_cmds;
 
 	struct nfc_ops *ops;
+	struct genl_info *cur_cmd_info;
 };
 #define to_nfc_dev(_dev) container_of(_dev, struct nfc_dev, dev)
 
@@ -318,4 +319,44 @@ static inline int nfc_set_vendor_cmds(struct nfc_dev *dev,
 	return 0;
 }
 
+struct sk_buff *__nfc_alloc_vendor_cmd_reply_skb(struct nfc_dev *dev,
+						 enum nfc_attrs attr,
+						 u32 oui, u32 subcmd,
+						 int approxlen);
+int nfc_vendor_cmd_reply(struct sk_buff *skb);
+
+/**
+ * nfc_vendor_cmd_alloc_reply_skb - allocate vendor command reply
+ * @dev: nfc device
+ * @oui: vendor oui
+ * @approxlen: an upper bound of the length of the data that will
+ *      be put into the skb
+ *
+ * This function allocates and pre-fills an skb for a reply to
+ * a vendor command. Since it is intended for a reply, calling
+ * it outside of a vendor command's doit() operation is invalid.
+ *
+ * The returned skb is pre-filled with some identifying data in
+ * a way that any data that is put into the skb (with skb_put(),
+ * nla_put() or similar) will end up being within the
+ * %NFC_ATTR_VENDOR_DATA attribute, so all that needs to be done
+ * with the skb is adding data for the corresponding userspace tool
+ * which can then read that data out of the vendor data attribute.
+ * You must not modify the skb in any other way.
+ *
+ * When done, call nfc_vendor_cmd_reply() with the skb and return
+ * its error code as the result of the doit() operation.
+ *
+ * Return: An allocated and pre-filled skb. %NULL if any errors happen.
+ */
+static inline struct sk_buff *
+nfc_vendor_cmd_alloc_reply_skb(struct nfc_dev *dev,
+				u32 oui, u32 subcmd, int approxlen)
+{
+	return __nfc_alloc_vendor_cmd_reply_skb(dev,
+						NFC_ATTR_VENDOR_DATA,
+						oui,
+						subcmd, approxlen);
+}
+
 #endif /* __NET_NFC_H */
diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c
index 73d1ca7c546c..853172c27f68 100644
--- a/net/nfc/netlink.c
+++ b/net/nfc/netlink.c
@@ -63,6 +63,8 @@ static const struct nla_policy nfc_genl_policy[NFC_ATTR_MAX + 1] = {
 	[NFC_ATTR_FIRMWARE_NAME] = { .type = NLA_STRING,
 				     .len = NFC_FIRMWARE_NAME_MAXSIZE },
 	[NFC_ATTR_SE_APDU] = { .type = NLA_BINARY },
+	[NFC_ATTR_VENDOR_DATA] = { .type = NLA_BINARY },
+
 };
 
 static const struct nla_policy nfc_sdp_genl_policy[NFC_SDP_ATTR_MAX + 1] = {
@@ -1503,7 +1505,7 @@ static int nfc_genl_vendor_cmd(struct sk_buff *skb,
 	u32 dev_idx, vid, subcmd;
 	u8 *data;
 	size_t data_len;
-	int i;
+	int i, err;
 
 	if (!info->attrs[NFC_ATTR_DEVICE_INDEX] ||
 	    !info->attrs[NFC_ATTR_VENDOR_ID] ||
@@ -1534,12 +1536,92 @@ static int nfc_genl_vendor_cmd(struct sk_buff *skb,
 		if (cmd->vendor_id != vid || cmd->subcmd != subcmd)
 			continue;
 
-		return cmd->doit(dev, data, data_len);
+		dev->cur_cmd_info = info;
+		err = cmd->doit(dev, data, data_len);
+		dev->cur_cmd_info = NULL;
+		return err;
 	}
 
 	return -EOPNOTSUPP;
 }
 
+/* message building helper */
+static inline void *nfc_hdr_put(struct sk_buff *skb, u32 portid, u32 seq,
+				int flags, u8 cmd)
+{
+	/* since there is no private header just add the generic one */
+	return genlmsg_put(skb, portid, seq, &nfc_genl_family, flags, cmd);
+}
+
+static struct sk_buff *
+__nfc_alloc_vendor_cmd_skb(struct nfc_dev *dev, int approxlen,
+			   u32 portid, u32 seq,
+			   enum nfc_attrs attr,
+			   u32 oui, u32 subcmd, gfp_t gfp)
+{
+	struct sk_buff *skb;
+	void *hdr;
+
+	skb = nlmsg_new(approxlen + 100, gfp);
+	if (!skb)
+		return NULL;
+
+	hdr = nfc_hdr_put(skb, portid, seq, 0, NFC_CMD_VENDOR);
+	if (!hdr) {
+		kfree_skb(skb);
+		return NULL;
+	}
+
+	if (nla_put_u32(skb, NFC_ATTR_DEVICE_INDEX, dev->idx))
+		goto nla_put_failure;
+	if (nla_put_u32(skb, NFC_ATTR_VENDOR_ID, oui))
+		goto nla_put_failure;
+	if (nla_put_u32(skb, NFC_ATTR_VENDOR_SUBCMD, subcmd))
+		goto nla_put_failure;
+
+	((void **)skb->cb)[0] = dev;
+	((void **)skb->cb)[1] = hdr;
+
+	return skb;
+
+nla_put_failure:
+	kfree_skb(skb);
+	return NULL;
+}
+
+struct sk_buff *__nfc_alloc_vendor_cmd_reply_skb(struct nfc_dev *dev,
+						 enum nfc_attrs attr,
+						 u32 oui, u32 subcmd,
+						 int approxlen)
+{
+	if (WARN_ON(!dev->cur_cmd_info))
+		return NULL;
+
+	return __nfc_alloc_vendor_cmd_skb(dev, approxlen,
+					  dev->cur_cmd_info->snd_portid,
+					  dev->cur_cmd_info->snd_seq, attr,
+					  oui, subcmd, GFP_KERNEL);
+}
+EXPORT_SYMBOL(__nfc_alloc_vendor_cmd_reply_skb);
+
+int nfc_vendor_cmd_reply(struct sk_buff *skb)
+{
+	struct nfc_dev *dev = ((void **)skb->cb)[0];
+	void *hdr = ((void **)skb->cb)[1];
+
+	/* clear CB data for netlink core to own from now on */
+	memset(skb->cb, 0, sizeof(skb->cb));
+
+	if (WARN_ON(!dev->cur_cmd_info)) {
+		kfree_skb(skb);
+		return -EINVAL;
+	}
+
+	genlmsg_end(skb, hdr);
+	return genlmsg_reply(skb, dev->cur_cmd_info);
+}
+EXPORT_SYMBOL(nfc_vendor_cmd_reply);
+
 static const struct genl_ops nfc_genl_ops[] = {
 	{
 		.cmd = NFC_CMD_GET_DEVICE,
-- 
cgit v1.2.3


From ac1cf3990c99802eae3aa735b35c94a2131eb9fe Mon Sep 17 00:00:00 2001
From: Jiri Benc <jbenc@redhat.com>
Date: Thu, 20 Aug 2015 13:56:20 +0200
Subject: ip_tunnels: remove custom alignment and packing

The custom alignment of struct ip_tunnel_key is unnecessary. In struct
sw_flow_key, it starts at offset 256, in struct ip_tunnel_info it's the
first field.

The structure is also packed even without the __packed keyword.

Signed-off-by: Jiri Benc <jbenc@redhat.com>
Acked-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_tunnels.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 984dbfa15e13..81cf11c931e4 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -36,7 +36,7 @@ struct ip_tunnel_key {
 	__u8			ipv4_ttl;
 	__be16			tp_src;
 	__be16			tp_dst;
-} __packed __aligned(4); /* Minimize padding. */
+};
 
 /* Indicates whether the tunnel info structure represents receive
  * or transmit tunnel parameters.
-- 
cgit v1.2.3


From 6b8847c5a2bafbbf92f4b779f87165093457ea68 Mon Sep 17 00:00:00 2001
From: Jiri Benc <jbenc@redhat.com>
Date: Thu, 20 Aug 2015 13:56:21 +0200
Subject: ip_tunnels: use u8/u16/u32

The ip_tunnels.h include file uses mixture of __u16 and u16 (etc.) types.
Unify it to the non-underscore variants.

Signed-off-by: Jiri Benc <jbenc@redhat.com>
Acked-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_tunnels.h | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'include/net')

diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 81cf11c931e4..ca173f22f07f 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -32,8 +32,8 @@ struct ip_tunnel_key {
 	__be32			ipv4_src;
 	__be32			ipv4_dst;
 	__be16			tun_flags;
-	__u8			ipv4_tos;
-	__u8			ipv4_ttl;
+	u8			ipv4_tos;
+	u8			ipv4_ttl;
 	__be16			tp_src;
 	__be16			tp_dst;
 };
@@ -64,8 +64,8 @@ struct ip_tunnel_6rd_parm {
 #endif
 
 struct ip_tunnel_encap {
-	__u16			type;
-	__u16			flags;
+	u16			type;
+	u16			flags;
 	__be16			sport;
 	__be16			dport;
 };
@@ -95,8 +95,8 @@ struct ip_tunnel {
 					 * arrived */
 
 	/* These four fields used only by GRE */
-	__u32		i_seqno;	/* The last seen seqno	*/
-	__u32		o_seqno;	/* The last output seqno */
+	u32		i_seqno;	/* The last seen seqno	*/
+	u32		o_seqno;	/* The last output seqno */
 	int		tun_hlen;	/* Precalculated header length */
 	int		mlink;
 
@@ -273,8 +273,8 @@ static inline u8 ip_tunnel_ecn_encap(u8 tos, const struct iphdr *iph,
 
 int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto);
 int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
-		  __be32 src, __be32 dst, __u8 proto,
-		  __u8 tos, __u8 ttl, __be16 df, bool xnet);
+		  __be32 src, __be32 dst, u8 proto,
+		  u8 tos, u8 ttl, __be16 df, bool xnet);
 
 struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb, bool gre_csum,
 					 int gso_type_mask);
-- 
cgit v1.2.3


From 376534a3d17002d608985bd67c3b0880eacadd14 Mon Sep 17 00:00:00 2001
From: Jiri Benc <jbenc@redhat.com>
Date: Thu, 20 Aug 2015 13:56:22 +0200
Subject: ip_tunnels: use offsetofend

Signed-off-by: Jiri Benc <jbenc@redhat.com>
Acked-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_tunnels.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'include/net')

diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index ca173f22f07f..cc3b39e9010b 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -23,9 +23,7 @@
 #define IPTUNNEL_ERR_TIMEO	(30*HZ)
 
 /* Used to memset ip_tunnel padding. */
-#define IP_TUNNEL_KEY_SIZE					\
-	(offsetof(struct ip_tunnel_key, tp_dst) +		\
-	 FIELD_SIZEOF(struct ip_tunnel_key, tp_dst))
+#define IP_TUNNEL_KEY_SIZE	offsetofend(struct ip_tunnel_key, tp_dst)
 
 struct ip_tunnel_key {
 	__be64			tun_id;
-- 
cgit v1.2.3


From c1ea5d672aaff08da337dee735dbb548e3415585 Mon Sep 17 00:00:00 2001
From: Jiri Benc <jbenc@redhat.com>
Date: Thu, 20 Aug 2015 13:56:23 +0200
Subject: ip_tunnels: add IPv6 addresses to ip_tunnel_key

Add the IPv6 addresses as an union with IPv4 ones. When using IPv4, the
newly introduced padding after the IPv4 addresses needs to be zeroed out.

Signed-off-by: Jiri Benc <jbenc@redhat.com>
Acked-by: Thomas Graf <tgraf@suug.ch>
Acked-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vxlan.c            |  6 +++---
 include/net/ip_tunnels.h       | 24 ++++++++++++++++++++----
 net/core/filter.c              |  4 ++--
 net/ipv4/ip_gre.c              | 10 +++++-----
 net/ipv4/ip_tunnel_core.c      |  8 ++++----
 net/openvswitch/flow_netlink.c | 18 +++++++++---------
 net/openvswitch/flow_table.c   |  2 +-
 net/openvswitch/vport-geneve.c |  2 +-
 net/openvswitch/vport.c        |  2 +-
 net/openvswitch/vport.h        |  4 ++--
 10 files changed, 48 insertions(+), 32 deletions(-)

(limited to 'include/net')

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index ad51dac88d19..30a7abcf2c09 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -1276,8 +1276,8 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
 			goto drop;
 
 		info = &tun_dst->u.tun_info;
-		info->key.ipv4_src = iph->saddr;
-		info->key.ipv4_dst = iph->daddr;
+		info->key.u.ipv4.src = iph->saddr;
+		info->key.u.ipv4.dst = iph->daddr;
 		info->key.ipv4_tos = iph->tos;
 		info->key.ipv4_ttl = iph->ttl;
 		info->key.tp_src = udp_hdr(skb)->source;
@@ -1925,7 +1925,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 		dst_port = info->key.tp_dst ? : vxlan->cfg.dst_port;
 		vni = be64_to_cpu(info->key.tun_id);
 		remote_ip.sin.sin_family = AF_INET;
-		remote_ip.sin.sin_addr.s_addr = info->key.ipv4_dst;
+		remote_ip.sin.sin_addr.s_addr = info->key.u.ipv4.dst;
 		dst = &remote_ip;
 	}
 
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index cc3b39e9010b..6a51371dad00 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -25,10 +25,24 @@
 /* Used to memset ip_tunnel padding. */
 #define IP_TUNNEL_KEY_SIZE	offsetofend(struct ip_tunnel_key, tp_dst)
 
+/* Used to memset ipv4 address padding. */
+#define IP_TUNNEL_KEY_IPV4_PAD	offsetofend(struct ip_tunnel_key, u.ipv4.dst)
+#define IP_TUNNEL_KEY_IPV4_PAD_LEN				\
+	(FIELD_SIZEOF(struct ip_tunnel_key, u) -		\
+	 FIELD_SIZEOF(struct ip_tunnel_key, u.ipv4))
+
 struct ip_tunnel_key {
 	__be64			tun_id;
-	__be32			ipv4_src;
-	__be32			ipv4_dst;
+	union {
+		struct {
+			__be32	src;
+			__be32	dst;
+		} ipv4;
+		struct {
+			struct in6_addr src;
+			struct in6_addr dst;
+		} ipv6;
+	} u;
 	__be16			tun_flags;
 	u8			ipv4_tos;
 	u8			ipv4_ttl;
@@ -177,8 +191,10 @@ static inline void __ip_tunnel_info_init(struct ip_tunnel_info *tun_info,
 					 const void *opts, u8 opts_len)
 {
 	tun_info->key.tun_id = tun_id;
-	tun_info->key.ipv4_src = saddr;
-	tun_info->key.ipv4_dst = daddr;
+	tun_info->key.u.ipv4.src = saddr;
+	tun_info->key.u.ipv4.dst = daddr;
+	memset((unsigned char *)&tun_info->key + IP_TUNNEL_KEY_IPV4_PAD,
+	       0, IP_TUNNEL_KEY_IPV4_PAD_LEN);
 	tun_info->key.ipv4_tos = tos;
 	tun_info->key.ipv4_ttl = ttl;
 	tun_info->key.tun_flags = tun_flags;
diff --git a/net/core/filter.c b/net/core/filter.c
index 83f08cefeab7..379568562ffb 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1495,7 +1495,7 @@ static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
 		return -EINVAL;
 
 	to->tunnel_id = be64_to_cpu(info->key.tun_id);
-	to->remote_ipv4 = be32_to_cpu(info->key.ipv4_src);
+	to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src);
 
 	return 0;
 }
@@ -1529,7 +1529,7 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
 	info = &md->u.tun_info;
 	info->mode = IP_TUNNEL_INFO_TX;
 	info->key.tun_id = cpu_to_be64(from->tunnel_id);
-	info->key.ipv4_dst = cpu_to_be32(from->remote_ipv4);
+	info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4);
 
 	return 0;
 }
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index fb44d693796e..b7bb7d6aa7a8 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -407,8 +407,8 @@ static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi)
 				return PACKET_REJECT;
 
 			info = &tun_dst->u.tun_info;
-			info->key.ipv4_src = iph->saddr;
-			info->key.ipv4_dst = iph->daddr;
+			info->key.u.ipv4.src = iph->saddr;
+			info->key.u.ipv4.dst = iph->daddr;
 			info->key.ipv4_tos = iph->tos;
 			info->key.ipv4_ttl = iph->ttl;
 
@@ -527,8 +527,8 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev)
 
 	key = &tun_info->key;
 	memset(&fl, 0, sizeof(fl));
-	fl.daddr = key->ipv4_dst;
-	fl.saddr = key->ipv4_src;
+	fl.daddr = key->u.ipv4.dst;
+	fl.saddr = key->u.ipv4.src;
 	fl.flowi4_tos = RT_TOS(key->ipv4_tos);
 	fl.flowi4_mark = skb->mark;
 	fl.flowi4_proto = IPPROTO_GRE;
@@ -564,7 +564,7 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev)
 
 	df = key->tun_flags & TUNNEL_DONT_FRAGMENT ?  htons(IP_DF) : 0;
 	err = iptunnel_xmit(skb->sk, rt, skb, fl.saddr,
-			    key->ipv4_dst, IPPROTO_GRE,
+			    key->u.ipv4.dst, IPPROTO_GRE,
 			    key->ipv4_tos, key->ipv4_ttl, df, false);
 	iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
 	return;
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 1c2389d582a6..93907d71cda6 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -227,10 +227,10 @@ static int ip_tun_build_state(struct net_device *dev, struct nlattr *attr,
 		tun_info->key.tun_id = nla_get_u64(tb[LWTUNNEL_IP_ID]);
 
 	if (tb[LWTUNNEL_IP_DST])
-		tun_info->key.ipv4_dst = nla_get_be32(tb[LWTUNNEL_IP_DST]);
+		tun_info->key.u.ipv4.dst = nla_get_be32(tb[LWTUNNEL_IP_DST]);
 
 	if (tb[LWTUNNEL_IP_SRC])
-		tun_info->key.ipv4_src = nla_get_be32(tb[LWTUNNEL_IP_SRC]);
+		tun_info->key.u.ipv4.src = nla_get_be32(tb[LWTUNNEL_IP_SRC]);
 
 	if (tb[LWTUNNEL_IP_TTL])
 		tun_info->key.ipv4_ttl = nla_get_u8(tb[LWTUNNEL_IP_TTL]);
@@ -262,8 +262,8 @@ static int ip_tun_fill_encap_info(struct sk_buff *skb,
 	struct ip_tunnel_info *tun_info = lwt_tun_info(lwtstate);
 
 	if (nla_put_u64(skb, LWTUNNEL_IP_ID, tun_info->key.tun_id) ||
-	    nla_put_be32(skb, LWTUNNEL_IP_DST, tun_info->key.ipv4_dst) ||
-	    nla_put_be32(skb, LWTUNNEL_IP_SRC, tun_info->key.ipv4_src) ||
+	    nla_put_be32(skb, LWTUNNEL_IP_DST, tun_info->key.u.ipv4.dst) ||
+	    nla_put_be32(skb, LWTUNNEL_IP_SRC, tun_info->key.u.ipv4.src) ||
 	    nla_put_u8(skb, LWTUNNEL_IP_TOS, tun_info->key.ipv4_tos) ||
 	    nla_put_u8(skb, LWTUNNEL_IP_TTL, tun_info->key.ipv4_ttl) ||
 	    nla_put_u16(skb, LWTUNNEL_IP_SPORT, tun_info->key.tp_src) ||
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index a6eb77ab1a64..a7f866374817 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -534,11 +534,11 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr,
 			tun_flags |= TUNNEL_KEY;
 			break;
 		case OVS_TUNNEL_KEY_ATTR_IPV4_SRC:
-			SW_FLOW_KEY_PUT(match, tun_key.ipv4_src,
+			SW_FLOW_KEY_PUT(match, tun_key.u.ipv4.src,
 					nla_get_in_addr(a), is_mask);
 			break;
 		case OVS_TUNNEL_KEY_ATTR_IPV4_DST:
-			SW_FLOW_KEY_PUT(match, tun_key.ipv4_dst,
+			SW_FLOW_KEY_PUT(match, tun_key.u.ipv4.dst,
 					nla_get_in_addr(a), is_mask);
 			break;
 		case OVS_TUNNEL_KEY_ATTR_TOS:
@@ -609,7 +609,7 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr,
 	}
 
 	if (!is_mask) {
-		if (!match->key->tun_key.ipv4_dst) {
+		if (!match->key->tun_key.u.ipv4.dst) {
 			OVS_NLERR(log, "IPv4 tunnel dst address is zero");
 			return -EINVAL;
 		}
@@ -647,13 +647,13 @@ static int __ipv4_tun_to_nlattr(struct sk_buff *skb,
 	if (output->tun_flags & TUNNEL_KEY &&
 	    nla_put_be64(skb, OVS_TUNNEL_KEY_ATTR_ID, output->tun_id))
 		return -EMSGSIZE;
-	if (output->ipv4_src &&
+	if (output->u.ipv4.src &&
 	    nla_put_in_addr(skb, OVS_TUNNEL_KEY_ATTR_IPV4_SRC,
-			    output->ipv4_src))
+			    output->u.ipv4.src))
 		return -EMSGSIZE;
-	if (output->ipv4_dst &&
+	if (output->u.ipv4.dst &&
 	    nla_put_in_addr(skb, OVS_TUNNEL_KEY_ATTR_IPV4_DST,
-			    output->ipv4_dst))
+			    output->u.ipv4.dst))
 		return -EMSGSIZE;
 	if (output->ipv4_tos &&
 	    nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TOS, output->ipv4_tos))
@@ -1116,7 +1116,7 @@ int ovs_nla_get_match(struct sw_flow_match *match,
 			/* The userspace does not send tunnel attributes that
 			 * are 0, but we should not wildcard them nonetheless.
 			 */
-			if (match->key->tun_key.ipv4_dst)
+			if (match->key->tun_key.u.ipv4.dst)
 				SW_FLOW_KEY_MEMSET_FIELD(match, tun_key,
 							 0xff, true);
 
@@ -1287,7 +1287,7 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey,
 	if (nla_put_u32(skb, OVS_KEY_ATTR_PRIORITY, output->phy.priority))
 		goto nla_put_failure;
 
-	if ((swkey->tun_key.ipv4_dst || is_mask)) {
+	if ((swkey->tun_key.u.ipv4.dst || is_mask)) {
 		const void *opts = NULL;
 
 		if (output->tun_key.tun_flags & TUNNEL_OPTIONS_PRESENT)
diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c
index 3a9d1dde76ed..d22d8e948d0f 100644
--- a/net/openvswitch/flow_table.c
+++ b/net/openvswitch/flow_table.c
@@ -426,7 +426,7 @@ static u32 flow_hash(const struct sw_flow_key *key,
 
 static int flow_key_start(const struct sw_flow_key *key)
 {
-	if (key->tun_key.ipv4_dst)
+	if (key->tun_key.u.ipv4.dst)
 		return 0;
 	else
 		return rounddown(offsetof(struct sw_flow_key, phy),
diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c
index 1da3a14d1010..023813d05f88 100644
--- a/net/openvswitch/vport-geneve.c
+++ b/net/openvswitch/vport-geneve.c
@@ -203,7 +203,7 @@ static int geneve_tnl_send(struct vport *vport, struct sk_buff *skb)
 	}
 
 	err = geneve_xmit_skb(geneve_port->gs, rt, skb, fl.saddr,
-			      tun_key->ipv4_dst, tun_key->ipv4_tos,
+			      tun_key->u.ipv4.dst, tun_key->ipv4_tos,
 			      tun_key->ipv4_ttl, df, sport, dport,
 			      tun_key->tun_flags, vni, opts_len, opts,
 			      !!(tun_key->tun_flags & TUNNEL_CSUM), false);
diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c
index d14f59403c5e..a06adc72a58d 100644
--- a/net/openvswitch/vport.c
+++ b/net/openvswitch/vport.c
@@ -603,7 +603,7 @@ int ovs_tunnel_get_egress_info(struct ip_tunnel_info *egress_tun_info,
 	 * saddr, tp_src and tp_dst
 	 */
 	__ip_tunnel_info_init(egress_tun_info,
-			      fl.saddr, tun_key->ipv4_dst,
+			      fl.saddr, tun_key->u.ipv4.dst,
 			      tun_key->ipv4_tos,
 			      tun_key->ipv4_ttl,
 			      tp_src, tp_dst,
diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h
index 1a689c28b5a6..43d8f5a835cb 100644
--- a/net/openvswitch/vport.h
+++ b/net/openvswitch/vport.h
@@ -254,8 +254,8 @@ static inline struct rtable *ovs_tunnel_route_lookup(struct net *net,
 	struct rtable *rt;
 
 	memset(fl, 0, sizeof(*fl));
-	fl->daddr = key->ipv4_dst;
-	fl->saddr = key->ipv4_src;
+	fl->daddr = key->u.ipv4.dst;
+	fl->saddr = key->u.ipv4.src;
 	fl->flowi4_tos = RT_TOS(key->ipv4_tos);
 	fl->flowi4_mark = mark;
 	fl->flowi4_proto = protocol;
-- 
cgit v1.2.3


From 7c383fb2254c44e096427470da6a36380169b548 Mon Sep 17 00:00:00 2001
From: Jiri Benc <jbenc@redhat.com>
Date: Thu, 20 Aug 2015 13:56:24 +0200
Subject: ip_tunnels: use tos and ttl fields also for IPv6

Rename the ipv4_tos and ipv4_ttl fields to just 'tos' and 'ttl', as they'll
be used with IPv6 tunnels, too.

Signed-off-by: Jiri Benc <jbenc@redhat.com>
Acked-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vxlan.c            |  8 ++++----
 include/net/ip_tunnels.h       |  8 ++++----
 net/ipv4/ip_gre.c              |  8 ++++----
 net/ipv4/ip_tunnel_core.c      |  8 ++++----
 net/openvswitch/flow_netlink.c | 10 +++++-----
 net/openvswitch/vport-geneve.c |  4 ++--
 net/openvswitch/vport.c        |  4 ++--
 net/openvswitch/vport.h        |  2 +-
 8 files changed, 26 insertions(+), 26 deletions(-)

(limited to 'include/net')

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 30a7abcf2c09..ebeb3def06c5 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -1278,8 +1278,8 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
 		info = &tun_dst->u.tun_info;
 		info->key.u.ipv4.src = iph->saddr;
 		info->key.u.ipv4.dst = iph->daddr;
-		info->key.ipv4_tos = iph->tos;
-		info->key.ipv4_ttl = iph->ttl;
+		info->key.tos = iph->tos;
+		info->key.ttl = iph->ttl;
 		info->key.tp_src = udp_hdr(skb)->source;
 		info->key.tp_dst = udp_hdr(skb)->dest;
 
@@ -1960,8 +1960,8 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 			else
 				flags &= ~VXLAN_F_UDP_CSUM;
 
-			ttl = info->key.ipv4_ttl;
-			tos = info->key.ipv4_tos;
+			ttl = info->key.ttl;
+			tos = info->key.tos;
 
 			if (info->options_len)
 				md = ip_tunnel_info_opts(info, sizeof(*md));
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 6a51371dad00..224e4ecec91b 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -44,8 +44,8 @@ struct ip_tunnel_key {
 		} ipv6;
 	} u;
 	__be16			tun_flags;
-	u8			ipv4_tos;
-	u8			ipv4_ttl;
+	u8			tos;		/* TOS for IPv4, TC for IPv6 */
+	u8			ttl;		/* TTL for IPv4, HL for IPv6 */
 	__be16			tp_src;
 	__be16			tp_dst;
 };
@@ -195,8 +195,8 @@ static inline void __ip_tunnel_info_init(struct ip_tunnel_info *tun_info,
 	tun_info->key.u.ipv4.dst = daddr;
 	memset((unsigned char *)&tun_info->key + IP_TUNNEL_KEY_IPV4_PAD,
 	       0, IP_TUNNEL_KEY_IPV4_PAD_LEN);
-	tun_info->key.ipv4_tos = tos;
-	tun_info->key.ipv4_ttl = ttl;
+	tun_info->key.tos = tos;
+	tun_info->key.ttl = ttl;
 	tun_info->key.tun_flags = tun_flags;
 
 	/* For the tunnel types on the top of IPsec, the tp_src and tp_dst of
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index b7bb7d6aa7a8..5193618b2600 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -409,8 +409,8 @@ static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi)
 			info = &tun_dst->u.tun_info;
 			info->key.u.ipv4.src = iph->saddr;
 			info->key.u.ipv4.dst = iph->daddr;
-			info->key.ipv4_tos = iph->tos;
-			info->key.ipv4_ttl = iph->ttl;
+			info->key.tos = iph->tos;
+			info->key.ttl = iph->ttl;
 
 			info->mode = IP_TUNNEL_INFO_RX;
 			info->key.tun_flags = tpi->flags &
@@ -529,7 +529,7 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev)
 	memset(&fl, 0, sizeof(fl));
 	fl.daddr = key->u.ipv4.dst;
 	fl.saddr = key->u.ipv4.src;
-	fl.flowi4_tos = RT_TOS(key->ipv4_tos);
+	fl.flowi4_tos = RT_TOS(key->tos);
 	fl.flowi4_mark = skb->mark;
 	fl.flowi4_proto = IPPROTO_GRE;
 
@@ -565,7 +565,7 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev)
 	df = key->tun_flags & TUNNEL_DONT_FRAGMENT ?  htons(IP_DF) : 0;
 	err = iptunnel_xmit(skb->sk, rt, skb, fl.saddr,
 			    key->u.ipv4.dst, IPPROTO_GRE,
-			    key->ipv4_tos, key->ipv4_ttl, df, false);
+			    key->tos, key->ttl, df, false);
 	iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
 	return;
 
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 93907d71cda6..f0514e39e57c 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -233,10 +233,10 @@ static int ip_tun_build_state(struct net_device *dev, struct nlattr *attr,
 		tun_info->key.u.ipv4.src = nla_get_be32(tb[LWTUNNEL_IP_SRC]);
 
 	if (tb[LWTUNNEL_IP_TTL])
-		tun_info->key.ipv4_ttl = nla_get_u8(tb[LWTUNNEL_IP_TTL]);
+		tun_info->key.ttl = nla_get_u8(tb[LWTUNNEL_IP_TTL]);
 
 	if (tb[LWTUNNEL_IP_TOS])
-		tun_info->key.ipv4_tos = nla_get_u8(tb[LWTUNNEL_IP_TOS]);
+		tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP_TOS]);
 
 	if (tb[LWTUNNEL_IP_SPORT])
 		tun_info->key.tp_src = nla_get_be16(tb[LWTUNNEL_IP_SPORT]);
@@ -264,8 +264,8 @@ static int ip_tun_fill_encap_info(struct sk_buff *skb,
 	if (nla_put_u64(skb, LWTUNNEL_IP_ID, tun_info->key.tun_id) ||
 	    nla_put_be32(skb, LWTUNNEL_IP_DST, tun_info->key.u.ipv4.dst) ||
 	    nla_put_be32(skb, LWTUNNEL_IP_SRC, tun_info->key.u.ipv4.src) ||
-	    nla_put_u8(skb, LWTUNNEL_IP_TOS, tun_info->key.ipv4_tos) ||
-	    nla_put_u8(skb, LWTUNNEL_IP_TTL, tun_info->key.ipv4_ttl) ||
+	    nla_put_u8(skb, LWTUNNEL_IP_TOS, tun_info->key.tos) ||
+	    nla_put_u8(skb, LWTUNNEL_IP_TTL, tun_info->key.ttl) ||
 	    nla_put_u16(skb, LWTUNNEL_IP_SPORT, tun_info->key.tp_src) ||
 	    nla_put_u16(skb, LWTUNNEL_IP_DPORT, tun_info->key.tp_dst) ||
 	    nla_put_u16(skb, LWTUNNEL_IP_FLAGS, tun_info->key.tun_flags))
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index a7f866374817..4e7a3f7facc2 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -542,11 +542,11 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr,
 					nla_get_in_addr(a), is_mask);
 			break;
 		case OVS_TUNNEL_KEY_ATTR_TOS:
-			SW_FLOW_KEY_PUT(match, tun_key.ipv4_tos,
+			SW_FLOW_KEY_PUT(match, tun_key.tos,
 					nla_get_u8(a), is_mask);
 			break;
 		case OVS_TUNNEL_KEY_ATTR_TTL:
-			SW_FLOW_KEY_PUT(match, tun_key.ipv4_ttl,
+			SW_FLOW_KEY_PUT(match, tun_key.ttl,
 					nla_get_u8(a), is_mask);
 			ttl = true;
 			break;
@@ -655,10 +655,10 @@ static int __ipv4_tun_to_nlattr(struct sk_buff *skb,
 	    nla_put_in_addr(skb, OVS_TUNNEL_KEY_ATTR_IPV4_DST,
 			    output->u.ipv4.dst))
 		return -EMSGSIZE;
-	if (output->ipv4_tos &&
-	    nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TOS, output->ipv4_tos))
+	if (output->tos &&
+	    nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TOS, output->tos))
 		return -EMSGSIZE;
-	if (nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TTL, output->ipv4_ttl))
+	if (nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TTL, output->ttl))
 		return -EMSGSIZE;
 	if ((output->tun_flags & TUNNEL_DONT_FRAGMENT) &&
 	    nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT))
diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c
index 023813d05f88..d01bd6360970 100644
--- a/net/openvswitch/vport-geneve.c
+++ b/net/openvswitch/vport-geneve.c
@@ -203,8 +203,8 @@ static int geneve_tnl_send(struct vport *vport, struct sk_buff *skb)
 	}
 
 	err = geneve_xmit_skb(geneve_port->gs, rt, skb, fl.saddr,
-			      tun_key->u.ipv4.dst, tun_key->ipv4_tos,
-			      tun_key->ipv4_ttl, df, sport, dport,
+			      tun_key->u.ipv4.dst, tun_key->tos,
+			      tun_key->ttl, df, sport, dport,
 			      tun_key->tun_flags, vni, opts_len, opts,
 			      !!(tun_key->tun_flags & TUNNEL_CSUM), false);
 	if (err < 0)
diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c
index a06adc72a58d..d73e5a16e7ca 100644
--- a/net/openvswitch/vport.c
+++ b/net/openvswitch/vport.c
@@ -604,8 +604,8 @@ int ovs_tunnel_get_egress_info(struct ip_tunnel_info *egress_tun_info,
 	 */
 	__ip_tunnel_info_init(egress_tun_info,
 			      fl.saddr, tun_key->u.ipv4.dst,
-			      tun_key->ipv4_tos,
-			      tun_key->ipv4_ttl,
+			      tun_key->tos,
+			      tun_key->ttl,
 			      tp_src, tp_dst,
 			      tun_key->tun_id,
 			      tun_key->tun_flags,
diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h
index 43d8f5a835cb..b88b3ee86f07 100644
--- a/net/openvswitch/vport.h
+++ b/net/openvswitch/vport.h
@@ -256,7 +256,7 @@ static inline struct rtable *ovs_tunnel_route_lookup(struct net *net,
 	memset(fl, 0, sizeof(*fl));
 	fl->daddr = key->u.ipv4.dst;
 	fl->saddr = key->u.ipv4.src;
-	fl->flowi4_tos = RT_TOS(key->ipv4_tos);
+	fl->flowi4_tos = RT_TOS(key->tos);
 	fl->flowi4_mark = mark;
 	fl->flowi4_proto = protocol;
 
-- 
cgit v1.2.3


From 61adedf3e3f1d3f032c5a6a299978d91eff6d555 Mon Sep 17 00:00:00 2001
From: Jiri Benc <jbenc@redhat.com>
Date: Thu, 20 Aug 2015 13:56:25 +0200
Subject: route: move lwtunnel state to dst_entry

Currently, the lwtunnel state resides in per-protocol data. This is
a problem if we encapsulate ipv6 traffic in an ipv4 tunnel (or vice versa).
The xmit function of the tunnel does not know whether the packet has been
routed to it by ipv4 or ipv6, yet it needs the lwtstate data. Moving the
lwtstate data to dst_entry makes such inter-protocol tunneling possible.

As a bonus, this brings a nice diffstat.

Signed-off-by: Jiri Benc <jbenc@redhat.com>
Acked-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Acked-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vrf.c              |  1 -
 drivers/net/vxlan.c            |  4 +--
 include/net/dst.h              |  3 +-
 include/net/dst_metadata.h     | 15 +++------
 include/net/ip6_fib.h          |  1 -
 include/net/lwtunnel.h         | 12 --------
 include/net/route.h            |  1 -
 net/core/dst.c                 |  3 ++
 net/core/filter.c              |  2 +-
 net/core/lwtunnel.c            | 70 ++++++------------------------------------
 net/ipv4/ip_gre.c              |  2 +-
 net/ipv4/route.c               | 20 +++++-------
 net/ipv6/ila.c                 | 14 +++------
 net/ipv6/ip6_fib.c             |  1 -
 net/ipv6/route.c               | 20 ++++++------
 net/mpls/mpls_iptunnel.c       |  7 ++---
 net/openvswitch/vport-netdev.c |  2 +-
 17 files changed, 48 insertions(+), 130 deletions(-)

(limited to 'include/net')

diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
index dbeffe789185..b3d9c5546c79 100644
--- a/drivers/net/vrf.c
+++ b/drivers/net/vrf.c
@@ -295,7 +295,6 @@ static struct rtable *vrf_rtable_create(struct net_device *dev)
 		rth->rt_uses_gateway = 0;
 		INIT_LIST_HEAD(&rth->rt_uncached);
 		rth->rt_uncached_list = NULL;
-		rth->rt_lwtstate = NULL;
 	}
 
 	return rth;
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index ebeb3def06c5..93613ffd8d7e 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -1909,7 +1909,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 	u32 flags = vxlan->flags;
 
 	/* FIXME: Support IPv6 */
-	info = skb_tunnel_info(skb, AF_INET);
+	info = skb_tunnel_info(skb);
 
 	if (rdst) {
 		dst_port = rdst->remote_port ? rdst->remote_port : vxlan->cfg.dst_port;
@@ -2105,7 +2105,7 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
 	struct vxlan_fdb *f;
 
 	/* FIXME: Support IPv6 */
-	info = skb_tunnel_info(skb, AF_INET);
+	info = skb_tunnel_info(skb);
 
 	skb_reset_mac_header(skb);
 	eth = eth_hdr(skb);
diff --git a/include/net/dst.h b/include/net/dst.h
index 2578811cef51..0a9a723f6c19 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -44,6 +44,7 @@ struct dst_entry {
 #else
 	void			*__pad1;
 #endif
+	struct lwtunnel_state   *lwtstate;
 	int			(*input)(struct sk_buff *);
 	int			(*output)(struct sock *sk, struct sk_buff *skb);
 
@@ -89,7 +90,7 @@ struct dst_entry {
 	 * (L1_CACHE_SIZE would be too much)
 	 */
 #ifdef CONFIG_64BIT
-	long			__pad_to_align_refcnt[2];
+	long			__pad_to_align_refcnt[1];
 #endif
 	/*
 	 * __refcnt wants to be on a different cache line from
diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h
index 075f523ff23f..2cb52d562272 100644
--- a/include/net/dst_metadata.h
+++ b/include/net/dst_metadata.h
@@ -23,22 +23,17 @@ static inline struct metadata_dst *skb_metadata_dst(struct sk_buff *skb)
 	return NULL;
 }
 
-static inline struct ip_tunnel_info *skb_tunnel_info(struct sk_buff *skb,
-						     int family)
+static inline struct ip_tunnel_info *skb_tunnel_info(struct sk_buff *skb)
 {
 	struct metadata_dst *md_dst = skb_metadata_dst(skb);
-	struct rtable *rt;
+	struct dst_entry *dst;
 
 	if (md_dst)
 		return &md_dst->u.tun_info;
 
-	switch (family) {
-	case AF_INET:
-		rt = (struct rtable *)skb_dst(skb);
-		if (rt && rt->rt_lwtstate)
-			return lwt_tun_info(rt->rt_lwtstate);
-		break;
-	}
+	dst = skb_dst(skb);
+	if (dst && dst->lwtstate)
+		return lwt_tun_info(dst->lwtstate);
 
 	return NULL;
 }
diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 276328e3daa6..063d30474cf6 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -133,7 +133,6 @@ struct rt6_info {
 	/* more non-fragment space at head required */
 	unsigned short			rt6i_nfheader_len;
 	u8				rt6i_protocol;
-	struct lwtunnel_state		*rt6i_lwtstate;
 };
 
 static inline struct inet6_dev *ip6_dst_idev(struct dst_entry *dst)
diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h
index cfee53916ba5..843489884448 100644
--- a/include/net/lwtunnel.h
+++ b/include/net/lwtunnel.h
@@ -87,9 +87,7 @@ int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate);
 struct lwtunnel_state *lwtunnel_state_alloc(int hdr_len);
 int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b);
 int lwtunnel_output(struct sock *sk, struct sk_buff *skb);
-int lwtunnel_output6(struct sock *sk, struct sk_buff *skb);
 int lwtunnel_input(struct sk_buff *skb);
-int lwtunnel_input6(struct sk_buff *skb);
 
 #else
 
@@ -164,21 +162,11 @@ static inline int lwtunnel_output(struct sock *sk, struct sk_buff *skb)
 	return -EOPNOTSUPP;
 }
 
-static inline int lwtunnel_output6(struct sock *sk, struct sk_buff *skb)
-{
-	return -EOPNOTSUPP;
-}
-
 static inline int lwtunnel_input(struct sk_buff *skb)
 {
 	return -EOPNOTSUPP;
 }
 
-static inline int lwtunnel_input6(struct sk_buff *skb)
-{
-	return -EOPNOTSUPP;
-}
-
 #endif
 
 #endif /* __NET_LWTUNNEL_H */
diff --git a/include/net/route.h b/include/net/route.h
index 6dda2c1bf8c6..395d79bb556c 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -66,7 +66,6 @@ struct rtable {
 
 	struct list_head	rt_uncached;
 	struct uncached_list	*rt_uncached_list;
-	struct lwtunnel_state   *rt_lwtstate;
 };
 
 static inline bool rt_is_input_route(const struct rtable *rt)
diff --git a/net/core/dst.c b/net/core/dst.c
index f8694d1b8702..50dcdbb0ee46 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -20,6 +20,7 @@
 #include <net/net_namespace.h>
 #include <linux/sched.h>
 #include <linux/prefetch.h>
+#include <net/lwtunnel.h>
 
 #include <net/dst.h>
 #include <net/dst_metadata.h>
@@ -184,6 +185,7 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops,
 #ifdef CONFIG_IP_ROUTE_CLASSID
 	dst->tclassid = 0;
 #endif
+	dst->lwtstate = NULL;
 	atomic_set(&dst->__refcnt, initial_ref);
 	dst->__use = 0;
 	dst->lastuse = jiffies;
@@ -264,6 +266,7 @@ again:
 		kfree(dst);
 	else
 		kmem_cache_free(dst->ops->kmem_cachep, dst);
+	lwtstate_put(dst->lwtstate);
 
 	dst = child;
 	if (dst) {
diff --git a/net/core/filter.c b/net/core/filter.c
index 379568562ffb..b4adc961413f 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1489,7 +1489,7 @@ static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
 {
 	struct sk_buff *skb = (struct sk_buff *) (long) r1;
 	struct bpf_tunnel_key *to = (struct bpf_tunnel_key *) (long) r2;
-	struct ip_tunnel_info *info = skb_tunnel_info(skb, AF_INET);
+	struct ip_tunnel_info *info = skb_tunnel_info(skb);
 
 	if (unlikely(size != sizeof(struct bpf_tunnel_key) || flags || !info))
 		return -EINVAL;
diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c
index 3331585174d9..e924c2e08554 100644
--- a/net/core/lwtunnel.c
+++ b/net/core/lwtunnel.c
@@ -179,14 +179,16 @@ int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b)
 }
 EXPORT_SYMBOL(lwtunnel_cmp_encap);
 
-int __lwtunnel_output(struct sock *sk, struct sk_buff *skb,
-		      struct lwtunnel_state *lwtstate)
+int lwtunnel_output(struct sock *sk, struct sk_buff *skb)
 {
+	struct dst_entry *dst = skb_dst(skb);
 	const struct lwtunnel_encap_ops *ops;
+	struct lwtunnel_state *lwtstate;
 	int ret = -EINVAL;
 
-	if (!lwtstate)
+	if (!dst)
 		goto drop;
+	lwtstate = dst->lwtstate;
 
 	if (lwtstate->type == LWTUNNEL_ENCAP_NONE ||
 	    lwtstate->type > LWTUNNEL_ENCAP_MAX)
@@ -209,47 +211,18 @@ drop:
 
 	return ret;
 }
-
-int lwtunnel_output6(struct sock *sk, struct sk_buff *skb)
-{
-	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
-	struct lwtunnel_state *lwtstate = NULL;
-
-	if (rt) {
-		lwtstate = rt->rt6i_lwtstate;
-		skb->dev = rt->dst.dev;
-	}
-
-	skb->protocol = htons(ETH_P_IPV6);
-
-	return __lwtunnel_output(sk, skb, lwtstate);
-}
-EXPORT_SYMBOL(lwtunnel_output6);
-
-int lwtunnel_output(struct sock *sk, struct sk_buff *skb)
-{
-	struct rtable *rt = (struct rtable *)skb_dst(skb);
-	struct lwtunnel_state *lwtstate = NULL;
-
-	if (rt) {
-		lwtstate = rt->rt_lwtstate;
-		skb->dev = rt->dst.dev;
-	}
-
-	skb->protocol = htons(ETH_P_IP);
-
-	return __lwtunnel_output(sk, skb, lwtstate);
-}
 EXPORT_SYMBOL(lwtunnel_output);
 
-int __lwtunnel_input(struct sk_buff *skb,
-		     struct lwtunnel_state *lwtstate)
+int lwtunnel_input(struct sk_buff *skb)
 {
+	struct dst_entry *dst = skb_dst(skb);
 	const struct lwtunnel_encap_ops *ops;
+	struct lwtunnel_state *lwtstate;
 	int ret = -EINVAL;
 
-	if (!lwtstate)
+	if (!dst)
 		goto drop;
+	lwtstate = dst->lwtstate;
 
 	if (lwtstate->type == LWTUNNEL_ENCAP_NONE ||
 	    lwtstate->type > LWTUNNEL_ENCAP_MAX)
@@ -272,27 +245,4 @@ drop:
 
 	return ret;
 }
-
-int lwtunnel_input6(struct sk_buff *skb)
-{
-	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
-	struct lwtunnel_state *lwtstate = NULL;
-
-	if (rt)
-		lwtstate = rt->rt6i_lwtstate;
-
-	return __lwtunnel_input(skb, lwtstate);
-}
-EXPORT_SYMBOL(lwtunnel_input6);
-
-int lwtunnel_input(struct sk_buff *skb)
-{
-	struct rtable *rt = (struct rtable *)skb_dst(skb);
-	struct lwtunnel_state *lwtstate = NULL;
-
-	if (rt)
-		lwtstate = rt->rt_lwtstate;
-
-	return __lwtunnel_input(skb, lwtstate);
-}
 EXPORT_SYMBOL(lwtunnel_input);
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 5193618b2600..1bf328182697 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -521,7 +521,7 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev)
 	__be16 df, flags;
 	int err;
 
-	tun_info = skb_tunnel_info(skb, AF_INET);
+	tun_info = skb_tunnel_info(skb);
 	if (unlikely(!tun_info || tun_info->mode != IP_TUNNEL_INFO_TX))
 		goto err_free_skb;
 
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 2403e85107f0..f3087aaa6dd8 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1359,7 +1359,6 @@ static void ipv4_dst_destroy(struct dst_entry *dst)
 		list_del(&rt->rt_uncached);
 		spin_unlock_bh(&ul->lock);
 	}
-	lwtstate_put(rt->rt_lwtstate);
 }
 
 void rt_flush_dev(struct net_device *dev)
@@ -1408,7 +1407,7 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
 #ifdef CONFIG_IP_ROUTE_CLASSID
 		rt->dst.tclassid = nh->nh_tclassid;
 #endif
-		rt->rt_lwtstate = lwtstate_get(nh->nh_lwtstate);
+		rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
 		if (unlikely(fnhe))
 			cached = rt_bind_exception(rt, fnhe, daddr);
 		else if (!(rt->dst.flags & DST_NOCACHE))
@@ -1494,7 +1493,6 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	rth->rt_gateway	= 0;
 	rth->rt_uses_gateway = 0;
 	INIT_LIST_HEAD(&rth->rt_uncached);
-	rth->rt_lwtstate = NULL;
 	if (our) {
 		rth->dst.input= ip_local_deliver;
 		rth->rt_flags |= RTCF_LOCAL;
@@ -1624,19 +1622,18 @@ static int __mkroute_input(struct sk_buff *skb,
 	rth->rt_gateway	= 0;
 	rth->rt_uses_gateway = 0;
 	INIT_LIST_HEAD(&rth->rt_uncached);
-	rth->rt_lwtstate = NULL;
 	RT_CACHE_STAT_INC(in_slow_tot);
 
 	rth->dst.input = ip_forward;
 	rth->dst.output = ip_output;
 
 	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
-	if (lwtunnel_output_redirect(rth->rt_lwtstate)) {
-		rth->rt_lwtstate->orig_output = rth->dst.output;
+	if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
+		rth->dst.lwtstate->orig_output = rth->dst.output;
 		rth->dst.output = lwtunnel_output;
 	}
-	if (lwtunnel_input_redirect(rth->rt_lwtstate)) {
-		rth->rt_lwtstate->orig_input = rth->dst.input;
+	if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
+		rth->dst.lwtstate->orig_input = rth->dst.input;
 		rth->dst.input = lwtunnel_input;
 	}
 	skb_dst_set(skb, &rth->dst);
@@ -1695,7 +1692,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	   by fib_lookup.
 	 */
 
-	tun_info = skb_tunnel_info(skb, AF_INET);
+	tun_info = skb_tunnel_info(skb);
 	if (tun_info && tun_info->mode == IP_TUNNEL_INFO_RX)
 		fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
 	else
@@ -1815,7 +1812,6 @@ local_input:
 	rth->rt_gateway	= 0;
 	rth->rt_uses_gateway = 0;
 	INIT_LIST_HEAD(&rth->rt_uncached);
-	rth->rt_lwtstate = NULL;
 
 	RT_CACHE_STAT_INC(in_slow_tot);
 	if (res.type == RTN_UNREACHABLE) {
@@ -2006,7 +2002,6 @@ add:
 	rth->rt_gateway = 0;
 	rth->rt_uses_gateway = 0;
 	INIT_LIST_HEAD(&rth->rt_uncached);
-	rth->rt_lwtstate = NULL;
 	RT_CACHE_STAT_INC(out_slow_tot);
 
 	if (flags & RTCF_LOCAL)
@@ -2029,7 +2024,7 @@ add:
 	}
 
 	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
-	if (lwtunnel_output_redirect(rth->rt_lwtstate))
+	if (lwtunnel_output_redirect(rth->dst.lwtstate))
 		rth->dst.output = lwtunnel_output;
 
 	return rth;
@@ -2293,7 +2288,6 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
 		rt->rt_uses_gateway = ort->rt_uses_gateway;
 
 		INIT_LIST_HEAD(&rt->rt_uncached);
-		rt->rt_lwtstate = NULL;
 		dst_free(new);
 	}
 
diff --git a/net/ipv6/ila.c b/net/ipv6/ila.c
index 2540ab4b76d1..f011c3d5ca40 100644
--- a/net/ipv6/ila.c
+++ b/net/ipv6/ila.c
@@ -89,16 +89,13 @@ static void update_ipv6_locator(struct sk_buff *skb, struct ila_params *p)
 static int ila_output(struct sock *sk, struct sk_buff *skb)
 {
 	struct dst_entry *dst = skb_dst(skb);
-	struct rt6_info *rt6 = NULL;
 
 	if (skb->protocol != htons(ETH_P_IPV6))
 		goto drop;
 
-	rt6 = (struct rt6_info *)dst;
+	update_ipv6_locator(skb, ila_params_lwtunnel(dst->lwtstate));
 
-	update_ipv6_locator(skb, ila_params_lwtunnel(rt6->rt6i_lwtstate));
-
-	return rt6->rt6i_lwtstate->orig_output(sk, skb);
+	return dst->lwtstate->orig_output(sk, skb);
 
 drop:
 	kfree_skb(skb);
@@ -108,16 +105,13 @@ drop:
 static int ila_input(struct sk_buff *skb)
 {
 	struct dst_entry *dst = skb_dst(skb);
-	struct rt6_info *rt6 = NULL;
 
 	if (skb->protocol != htons(ETH_P_IPV6))
 		goto drop;
 
-	rt6 = (struct rt6_info *)dst;
-
-	update_ipv6_locator(skb, ila_params_lwtunnel(rt6->rt6i_lwtstate));
+	update_ipv6_locator(skb, ila_params_lwtunnel(dst->lwtstate));
 
-	return rt6->rt6i_lwtstate->orig_input(skb);
+	return dst->lwtstate->orig_input(skb);
 
 drop:
 	kfree_skb(skb);
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 5693b5eb8482..865e777ae20c 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -178,7 +178,6 @@ static void rt6_free_pcpu(struct rt6_info *non_pcpu_rt)
 static void rt6_release(struct rt6_info *rt)
 {
 	if (atomic_dec_and_test(&rt->rt6i_ref)) {
-		lwtstate_put(rt->rt6i_lwtstate);
 		rt6_free_pcpu(rt);
 		dst_free(&rt->dst);
 	}
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index c3733049715e..e6bbcdee7707 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1784,14 +1784,14 @@ int ip6_route_add(struct fib6_config *cfg)
 					   cfg->fc_encap, &lwtstate);
 		if (err)
 			goto out;
-		rt->rt6i_lwtstate = lwtstate_get(lwtstate);
-		if (lwtunnel_output_redirect(rt->rt6i_lwtstate)) {
-			rt->rt6i_lwtstate->orig_output = rt->dst.output;
-			rt->dst.output = lwtunnel_output6;
+		rt->dst.lwtstate = lwtstate_get(lwtstate);
+		if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
+			rt->dst.lwtstate->orig_output = rt->dst.output;
+			rt->dst.output = lwtunnel_output;
 		}
-		if (lwtunnel_input_redirect(rt->rt6i_lwtstate)) {
-			rt->rt6i_lwtstate->orig_input = rt->dst.input;
-			rt->dst.input = lwtunnel_input6;
+		if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
+			rt->dst.lwtstate->orig_input = rt->dst.input;
+			rt->dst.input = lwtunnel_input;
 		}
 	}
 
@@ -2174,7 +2174,7 @@ static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
 #endif
 	rt->rt6i_prefsrc = ort->rt6i_prefsrc;
 	rt->rt6i_table = ort->rt6i_table;
-	rt->rt6i_lwtstate = lwtstate_get(ort->rt6i_lwtstate);
+	rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
 }
 
 #ifdef CONFIG_IPV6_ROUTE_INFO
@@ -2838,7 +2838,7 @@ static inline size_t rt6_nlmsg_size(struct rt6_info *rt)
 	       + nla_total_size(sizeof(struct rta_cacheinfo))
 	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
 	       + nla_total_size(1) /* RTA_PREF */
-	       + lwtunnel_get_encap_size(rt->rt6i_lwtstate);
+	       + lwtunnel_get_encap_size(rt->dst.lwtstate);
 }
 
 static int rt6_fill_node(struct net *net,
@@ -2991,7 +2991,7 @@ static int rt6_fill_node(struct net *net,
 	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
 		goto nla_put_failure;
 
-	lwtunnel_fill_encap(skb, rt->rt6i_lwtstate);
+	lwtunnel_fill_encap(skb, rt->dst.lwtstate);
 
 	nlmsg_end(skb, nlh);
 	return 0;
diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c
index 276f8c992218..3da5ca3ba563 100644
--- a/net/mpls/mpls_iptunnel.c
+++ b/net/mpls/mpls_iptunnel.c
@@ -48,7 +48,6 @@ int mpls_output(struct sock *sk, struct sk_buff *skb)
 	struct dst_entry *dst = skb_dst(skb);
 	struct rtable *rt = NULL;
 	struct rt6_info *rt6 = NULL;
-	struct lwtunnel_state *lwtstate = NULL;
 	int err = 0;
 	bool bos;
 	int i;
@@ -58,11 +57,9 @@ int mpls_output(struct sock *sk, struct sk_buff *skb)
 	if (skb->protocol == htons(ETH_P_IP)) {
 		ttl = ip_hdr(skb)->ttl;
 		rt = (struct rtable *)dst;
-		lwtstate = rt->rt_lwtstate;
 	} else if (skb->protocol == htons(ETH_P_IPV6)) {
 		ttl = ipv6_hdr(skb)->hop_limit;
 		rt6 = (struct rt6_info *)dst;
-		lwtstate = rt6->rt6i_lwtstate;
 	} else {
 		goto drop;
 	}
@@ -72,12 +69,12 @@ int mpls_output(struct sock *sk, struct sk_buff *skb)
 	/* Find the output device */
 	out_dev = dst->dev;
 	if (!mpls_output_possible(out_dev) ||
-	    !lwtstate || skb_warn_if_lro(skb))
+	    !dst->lwtstate || skb_warn_if_lro(skb))
 		goto drop;
 
 	skb_forward_csum(skb);
 
-	tun_encap_info = mpls_lwtunnel_encap(lwtstate);
+	tun_encap_info = mpls_lwtunnel_encap(dst->lwtstate);
 
 	/* Verify the destination can hold the packet */
 	new_header_size = mpls_encap_size(tun_encap_info);
diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c
index 4b70aaa4a746..a75011505039 100644
--- a/net/openvswitch/vport-netdev.c
+++ b/net/openvswitch/vport-netdev.c
@@ -57,7 +57,7 @@ static void netdev_port_receive(struct vport *vport, struct sk_buff *skb)
 	skb_push(skb, ETH_HLEN);
 	ovs_skb_postpush_rcsum(skb, skb->data, ETH_HLEN);
 
-	ovs_vport_receive(vport, skb, skb_tunnel_info(skb, AF_INET));
+	ovs_vport_receive(vport, skb, skb_tunnel_info(skb));
 	return;
 
 error:
-- 
cgit v1.2.3


From ab450605b35caa768ca33e86db9403229bf42be4 Mon Sep 17 00:00:00 2001
From: Jiri Benc <jbenc@redhat.com>
Date: Thu, 20 Aug 2015 13:56:27 +0200
Subject: ipv6: ndisc: inherit metadata dst when creating ndisc requests

If output device wants to see the dst, inherit the dst of the original skb
in the ndisc request.

This is an IPv6 counterpart of commit 0accfc268f4d ("arp: Inherit metadata
dst when creating ARP requests").

Signed-off-by: Jiri Benc <jbenc@redhat.com>
Acked-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ndisc.h |  3 ++-
 net/ipv6/addrconf.c |  2 +-
 net/ipv6/ndisc.c    | 10 +++++++---
 net/ipv6/route.c    |  2 +-
 4 files changed, 11 insertions(+), 6 deletions(-)

(limited to 'include/net')

diff --git a/include/net/ndisc.h b/include/net/ndisc.h
index b3a7751251b4..aba5695fadb0 100644
--- a/include/net/ndisc.h
+++ b/include/net/ndisc.h
@@ -182,7 +182,8 @@ int ndisc_rcv(struct sk_buff *skb);
 
 void ndisc_send_ns(struct net_device *dev, struct neighbour *neigh,
 		   const struct in6_addr *solicit,
-		   const struct in6_addr *daddr, const struct in6_addr *saddr);
+		   const struct in6_addr *daddr, const struct in6_addr *saddr,
+		   struct sk_buff *oskb);
 
 void ndisc_send_rs(struct net_device *dev,
 		   const struct in6_addr *saddr, const struct in6_addr *daddr);
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 59242399b0b5..0f08d3b9e238 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -3656,7 +3656,7 @@ static void addrconf_dad_work(struct work_struct *w)
 
 	/* send a neighbour solicitation for our addr */
 	addrconf_addr_solict_mult(&ifp->addr, &mcaddr);
-	ndisc_send_ns(ifp->idev->dev, NULL, &ifp->addr, &mcaddr, &in6addr_any);
+	ndisc_send_ns(ifp->idev->dev, NULL, &ifp->addr, &mcaddr, &in6addr_any, NULL);
 out:
 	in6_ifa_put(ifp);
 	rtnl_unlock();
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index b3054611f88a..13d3c2beb93e 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -553,7 +553,8 @@ static void ndisc_send_unsol_na(struct net_device *dev)
 
 void ndisc_send_ns(struct net_device *dev, struct neighbour *neigh,
 		   const struct in6_addr *solicit,
-		   const struct in6_addr *daddr, const struct in6_addr *saddr)
+		   const struct in6_addr *daddr, const struct in6_addr *saddr,
+		   struct sk_buff *oskb)
 {
 	struct sk_buff *skb;
 	struct in6_addr addr_buf;
@@ -589,6 +590,9 @@ void ndisc_send_ns(struct net_device *dev, struct neighbour *neigh,
 		ndisc_fill_addr_option(skb, ND_OPT_SOURCE_LL_ADDR,
 				       dev->dev_addr);
 
+	if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE) && oskb)
+		skb_dst_copy(skb, oskb);
+
 	ndisc_send_skb(skb, daddr, saddr);
 }
 
@@ -675,12 +679,12 @@ static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb)
 				  "%s: trying to ucast probe in NUD_INVALID: %pI6\n",
 				  __func__, target);
 		}
-		ndisc_send_ns(dev, neigh, target, target, saddr);
+		ndisc_send_ns(dev, neigh, target, target, saddr, skb);
 	} else if ((probes -= NEIGH_VAR(neigh->parms, APP_PROBES)) < 0) {
 		neigh_app_ns(neigh);
 	} else {
 		addrconf_addr_solict_mult(target, &mcaddr);
-		ndisc_send_ns(dev, NULL, target, &mcaddr, saddr);
+		ndisc_send_ns(dev, NULL, target, &mcaddr, saddr, skb);
 	}
 }
 
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 0947ad0b3de8..c4f3b9fcca9d 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -538,7 +538,7 @@ static void rt6_probe_deferred(struct work_struct *w)
 		container_of(w, struct __rt6_probe_work, work);
 
 	addrconf_addr_solict_mult(&work->target, &mcaddr);
-	ndisc_send_ns(work->dev, NULL, &work->target, &mcaddr, NULL);
+	ndisc_send_ns(work->dev, NULL, &work->target, &mcaddr, NULL, NULL);
 	dev_put(work->dev);
 	kfree(work);
 }
-- 
cgit v1.2.3


From 705cc62f6728c5a23e3c82465aa94e652e0b50e4 Mon Sep 17 00:00:00 2001
From: Jiri Benc <jbenc@redhat.com>
Date: Thu, 20 Aug 2015 13:56:28 +0200
Subject: vxlan: provide access function for vxlan socket address family

Signed-off-by: Jiri Benc <jbenc@redhat.com>
Acked-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vxlan.c | 8 ++++----
 include/net/vxlan.h | 5 +++++
 2 files changed, 9 insertions(+), 4 deletions(-)

(limited to 'include/net')

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 93613ffd8d7e..070149f77072 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -236,7 +236,7 @@ static struct vxlan_sock *vxlan_find_sock(struct net *net, sa_family_t family,
 
 	hlist_for_each_entry_rcu(vs, vs_head(net, port), hlist) {
 		if (inet_sk(vs->sock->sk)->inet_sport == port &&
-		    inet_sk(vs->sock->sk)->sk.sk_family == family &&
+		    vxlan_get_sk_family(vs) == family &&
 		    vs->flags == flags)
 			return vs;
 	}
@@ -625,7 +625,7 @@ static void vxlan_notify_add_rx_port(struct vxlan_sock *vs)
 	struct net_device *dev;
 	struct sock *sk = vs->sock->sk;
 	struct net *net = sock_net(sk);
-	sa_family_t sa_family = sk->sk_family;
+	sa_family_t sa_family = vxlan_get_sk_family(vs);
 	__be16 port = inet_sk(sk)->inet_sport;
 	int err;
 
@@ -650,7 +650,7 @@ static void vxlan_notify_del_rx_port(struct vxlan_sock *vs)
 	struct net_device *dev;
 	struct sock *sk = vs->sock->sk;
 	struct net *net = sock_net(sk);
-	sa_family_t sa_family = sk->sk_family;
+	sa_family_t sa_family = vxlan_get_sk_family(vs);
 	__be16 port = inet_sk(sk)->inet_sport;
 
 	rcu_read_lock();
@@ -2390,7 +2390,7 @@ void vxlan_get_rx_port(struct net_device *dev)
 	for (i = 0; i < PORT_HASH_SIZE; ++i) {
 		hlist_for_each_entry_rcu(vs, &vn->sock_list[i], hlist) {
 			port = inet_sk(vs->sock->sk)->inet_sport;
-			sa_family = vs->sock->sk->sk_family;
+			sa_family = vxlan_get_sk_family(vs);
 			dev->netdev_ops->ndo_add_vxlan_port(dev, sa_family,
 							    port);
 		}
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index e4534f1b2d8c..43677e6b9c43 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -241,3 +241,8 @@ static inline void vxlan_get_rx_port(struct net_device *netdev)
 }
 #endif
 #endif
+
+static inline unsigned short vxlan_get_sk_family(struct vxlan_sock *vs)
+{
+	return vs->sock->sk->sk_family;
+}
-- 
cgit v1.2.3


From 904af04d30f303d96902584206457128c3051d8d Mon Sep 17 00:00:00 2001
From: Jiri Benc <jbenc@redhat.com>
Date: Thu, 20 Aug 2015 13:56:31 +0200
Subject: ipv6: route: extend flow representation with tunnel key

Use flowi_tunnel in flowi6 similarly to what is done with IPv4.
This complements commit 1b7179d3adff ("route: Extend flow representation
with tunnel key").

Signed-off-by: Jiri Benc <jbenc@redhat.com>
Acked-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/flow.h | 1 +
 net/ipv6/route.c   | 6 ++++++
 2 files changed, 7 insertions(+)

(limited to 'include/net')

diff --git a/include/net/flow.h b/include/net/flow.h
index f305588fc162..9e0297c4c11d 100644
--- a/include/net/flow.h
+++ b/include/net/flow.h
@@ -130,6 +130,7 @@ struct flowi6 {
 #define flowi6_proto		__fl_common.flowic_proto
 #define flowi6_flags		__fl_common.flowic_flags
 #define flowi6_secid		__fl_common.flowic_secid
+#define flowi6_tun_key		__fl_common.flowic_tun_key
 	struct in6_addr		daddr;
 	struct in6_addr		saddr;
 	__be32			flowlabel;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index c4f3b9fcca9d..6c0fe4c7ce8d 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -54,11 +54,13 @@
 #include <net/tcp.h>
 #include <linux/rtnetlink.h>
 #include <net/dst.h>
+#include <net/dst_metadata.h>
 #include <net/xfrm.h>
 #include <net/netevent.h>
 #include <net/netlink.h>
 #include <net/nexthop.h>
 #include <net/lwtunnel.h>
+#include <net/ip_tunnels.h>
 
 #include <asm/uaccess.h>
 
@@ -1131,6 +1133,7 @@ void ip6_route_input(struct sk_buff *skb)
 	const struct ipv6hdr *iph = ipv6_hdr(skb);
 	struct net *net = dev_net(skb->dev);
 	int flags = RT6_LOOKUP_F_HAS_SADDR;
+	struct ip_tunnel_info *tun_info;
 	struct flowi6 fl6 = {
 		.flowi6_iif = skb->dev->ifindex,
 		.daddr = iph->daddr,
@@ -1140,6 +1143,9 @@ void ip6_route_input(struct sk_buff *skb)
 		.flowi6_proto = iph->nexthdr,
 	};
 
+	tun_info = skb_tunnel_info(skb);
+	if (tun_info && tun_info->mode == IP_TUNNEL_INFO_RX)
+		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
 	skb_dst_drop(skb);
 	skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
 }
-- 
cgit v1.2.3


From e4ff67513096e6e196ca58043fce04d0f87babbe Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Sun, 26 Jul 2015 15:03:27 +0300
Subject: ipvs: add sync_maxlen parameter for the sync daemon

Allow setups with large MTU to send large sync packets by
adding sync_maxlen parameter. The default value is now based
on MTU but no more than 1500 for compatibility reasons.

To avoid problems if MTU changes allow fragmentation by
sending packets with DF=0. Problem reported by Dan Carpenter.

Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 include/net/ip_vs.h             |  19 +++---
 include/uapi/linux/ip_vs.h      |   1 +
 net/netfilter/ipvs/ip_vs_ctl.c  |  53 ++++++++++------
 net/netfilter/ipvs/ip_vs_sync.c | 137 ++++++++++++++++++----------------------
 4 files changed, 108 insertions(+), 102 deletions(-)

(limited to 'include/net')

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 4e3731ee4eac..2fdc13caf712 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -846,6 +846,13 @@ struct ipvs_master_sync_state {
 /* How much time to keep dests in trash */
 #define IP_VS_DEST_TRASH_PERIOD		(120 * HZ)
 
+struct ipvs_sync_daemon_cfg {
+	int			syncid;
+	u16			sync_maxlen;
+	/* multicast interface name */
+	char			mcast_ifn[IP_VS_IFNAME_MAXLEN];
+};
+
 /* IPVS in network namespace */
 struct netns_ipvs {
 	int			gen;		/* Generation */
@@ -961,15 +968,10 @@ struct netns_ipvs {
 	spinlock_t		sync_buff_lock;
 	struct task_struct	**backup_threads;
 	int			threads_mask;
-	int			send_mesg_maxlen;
-	int			recv_mesg_maxlen;
 	volatile int		sync_state;
-	volatile int		master_syncid;
-	volatile int		backup_syncid;
 	struct mutex		sync_mutex;
-	/* multicast interface name */
-	char			master_mcast_ifn[IP_VS_IFNAME_MAXLEN];
-	char			backup_mcast_ifn[IP_VS_IFNAME_MAXLEN];
+	struct ipvs_sync_daemon_cfg	mcfg;	/* Master Configuration */
+	struct ipvs_sync_daemon_cfg	bcfg;	/* Backup Configuration */
 	/* net name space ptr */
 	struct net		*net;            /* Needed by timer routines */
 	/* Number of heterogeneous destinations, needed becaus heterogeneous
@@ -1408,7 +1410,8 @@ static inline void ip_vs_dest_put_and_free(struct ip_vs_dest *dest)
 /* IPVS sync daemon data and function prototypes
  * (from ip_vs_sync.c)
  */
-int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid);
+int start_sync_thread(struct net *net, struct ipvs_sync_daemon_cfg *cfg,
+		      int state);
 int stop_sync_thread(struct net *net, int state);
 void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp, int pkts);
 
diff --git a/include/uapi/linux/ip_vs.h b/include/uapi/linux/ip_vs.h
index 3199243f2028..68377d8c8870 100644
--- a/include/uapi/linux/ip_vs.h
+++ b/include/uapi/linux/ip_vs.h
@@ -406,6 +406,7 @@ enum {
 	IPVS_DAEMON_ATTR_STATE,		/* sync daemon state (master/backup) */
 	IPVS_DAEMON_ATTR_MCAST_IFN,	/* multicast interface name */
 	IPVS_DAEMON_ATTR_SYNC_ID,	/* SyncID we belong to */
+	IPVS_DAEMON_ATTR_SYNC_MAXLEN,	/* UDP Payload Size */
 	__IPVS_DAEMON_ATTR_MAX,
 };
 
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index af0b69e411b7..96f7bbfd5e1d 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -2336,10 +2336,15 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
 		struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
 
 		if (cmd == IP_VS_SO_SET_STARTDAEMON) {
+			struct ipvs_sync_daemon_cfg cfg;
+
+			memset(&cfg, 0, sizeof(cfg));
+			strlcpy(cfg.mcast_ifn, dm->mcast_ifn,
+				sizeof(cfg.mcast_ifn));
+			cfg.syncid = dm->syncid;
 			rtnl_lock();
 			mutex_lock(&ipvs->sync_mutex);
-			ret = start_sync_thread(net, dm->state, dm->mcast_ifn,
-						dm->syncid);
+			ret = start_sync_thread(net, &cfg, dm->state);
 			mutex_unlock(&ipvs->sync_mutex);
 			rtnl_unlock();
 		} else {
@@ -2650,15 +2655,15 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
 		mutex_lock(&ipvs->sync_mutex);
 		if (ipvs->sync_state & IP_VS_STATE_MASTER) {
 			d[0].state = IP_VS_STATE_MASTER;
-			strlcpy(d[0].mcast_ifn, ipvs->master_mcast_ifn,
+			strlcpy(d[0].mcast_ifn, ipvs->mcfg.mcast_ifn,
 				sizeof(d[0].mcast_ifn));
-			d[0].syncid = ipvs->master_syncid;
+			d[0].syncid = ipvs->mcfg.syncid;
 		}
 		if (ipvs->sync_state & IP_VS_STATE_BACKUP) {
 			d[1].state = IP_VS_STATE_BACKUP;
-			strlcpy(d[1].mcast_ifn, ipvs->backup_mcast_ifn,
+			strlcpy(d[1].mcast_ifn, ipvs->bcfg.mcast_ifn,
 				sizeof(d[1].mcast_ifn));
-			d[1].syncid = ipvs->backup_syncid;
+			d[1].syncid = ipvs->bcfg.syncid;
 		}
 		if (copy_to_user(user, &d, sizeof(d)) != 0)
 			ret = -EFAULT;
@@ -2813,6 +2818,7 @@ static const struct nla_policy ip_vs_daemon_policy[IPVS_DAEMON_ATTR_MAX + 1] = {
 	[IPVS_DAEMON_ATTR_MCAST_IFN]	= { .type = NLA_NUL_STRING,
 					    .len = IP_VS_IFNAME_MAXLEN },
 	[IPVS_DAEMON_ATTR_SYNC_ID]	= { .type = NLA_U32 },
+	[IPVS_DAEMON_ATTR_SYNC_MAXLEN]	= { .type = NLA_U16 },
 };
 
 /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_SERVICE */
@@ -3271,7 +3277,7 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
 }
 
 static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __u32 state,
-				  const char *mcast_ifn, __u32 syncid)
+				  struct ipvs_sync_daemon_cfg *c)
 {
 	struct nlattr *nl_daemon;
 
@@ -3280,8 +3286,9 @@ static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __u32 state,
 		return -EMSGSIZE;
 
 	if (nla_put_u32(skb, IPVS_DAEMON_ATTR_STATE, state) ||
-	    nla_put_string(skb, IPVS_DAEMON_ATTR_MCAST_IFN, mcast_ifn) ||
-	    nla_put_u32(skb, IPVS_DAEMON_ATTR_SYNC_ID, syncid))
+	    nla_put_string(skb, IPVS_DAEMON_ATTR_MCAST_IFN, c->mcast_ifn) ||
+	    nla_put_u32(skb, IPVS_DAEMON_ATTR_SYNC_ID, c->syncid) ||
+	    nla_put_u16(skb, IPVS_DAEMON_ATTR_SYNC_MAXLEN, c->sync_maxlen))
 		goto nla_put_failure;
 	nla_nest_end(skb, nl_daemon);
 
@@ -3293,7 +3300,7 @@ nla_put_failure:
 }
 
 static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __u32 state,
-				  const char *mcast_ifn, __u32 syncid,
+				  struct ipvs_sync_daemon_cfg *c,
 				  struct netlink_callback *cb)
 {
 	void *hdr;
@@ -3303,7 +3310,7 @@ static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __u32 state,
 	if (!hdr)
 		return -EMSGSIZE;
 
-	if (ip_vs_genl_fill_daemon(skb, state, mcast_ifn, syncid))
+	if (ip_vs_genl_fill_daemon(skb, state, c))
 		goto nla_put_failure;
 
 	genlmsg_end(skb, hdr);
@@ -3323,8 +3330,7 @@ static int ip_vs_genl_dump_daemons(struct sk_buff *skb,
 	mutex_lock(&ipvs->sync_mutex);
 	if ((ipvs->sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) {
 		if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER,
-					   ipvs->master_mcast_ifn,
-					   ipvs->master_syncid, cb) < 0)
+					   &ipvs->mcfg, cb) < 0)
 			goto nla_put_failure;
 
 		cb->args[0] = 1;
@@ -3332,8 +3338,7 @@ static int ip_vs_genl_dump_daemons(struct sk_buff *skb,
 
 	if ((ipvs->sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) {
 		if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP,
-					   ipvs->backup_mcast_ifn,
-					   ipvs->backup_syncid, cb) < 0)
+					   &ipvs->bcfg, cb) < 0)
 			goto nla_put_failure;
 
 		cb->args[1] = 1;
@@ -3348,25 +3353,33 @@ nla_put_failure:
 static int ip_vs_genl_new_daemon(struct net *net, struct nlattr **attrs)
 {
 	struct netns_ipvs *ipvs = net_ipvs(net);
+	struct ipvs_sync_daemon_cfg c;
+	struct nlattr *a;
 	int ret;
 
+	memset(&c, 0, sizeof(c));
 	if (!(attrs[IPVS_DAEMON_ATTR_STATE] &&
 	      attrs[IPVS_DAEMON_ATTR_MCAST_IFN] &&
 	      attrs[IPVS_DAEMON_ATTR_SYNC_ID]))
 		return -EINVAL;
+	strlcpy(c.mcast_ifn, nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]),
+		sizeof(c.mcast_ifn));
+	c.syncid = nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID]);
+
+	a = attrs[IPVS_DAEMON_ATTR_SYNC_MAXLEN];
+	if (a)
+		c.sync_maxlen = nla_get_u16(a);
 
 	/* The synchronization protocol is incompatible with mixed family
 	 * services
 	 */
-	if (net_ipvs(net)->mixed_address_family_dests > 0)
+	if (ipvs->mixed_address_family_dests > 0)
 		return -EINVAL;
 
 	rtnl_lock();
 	mutex_lock(&ipvs->sync_mutex);
-	ret = start_sync_thread(net,
-				nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]),
-				nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]),
-				nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID]));
+	ret = start_sync_thread(net, &c,
+				nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]));
 	mutex_unlock(&ipvs->sync_mutex);
 	rtnl_unlock();
 	return ret;
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index 6bc6dca9bca8..e68a43421479 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -320,26 +320,28 @@ sb_dequeue(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms)
  * Create a new sync buffer for Version 1 proto.
  */
 static inline struct ip_vs_sync_buff *
-ip_vs_sync_buff_create(struct netns_ipvs *ipvs)
+ip_vs_sync_buff_create(struct netns_ipvs *ipvs, unsigned int len)
 {
 	struct ip_vs_sync_buff *sb;
 
 	if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC)))
 		return NULL;
 
-	sb->mesg = kmalloc(ipvs->send_mesg_maxlen, GFP_ATOMIC);
+	len = max_t(unsigned int, len + sizeof(struct ip_vs_sync_mesg),
+		    ipvs->mcfg.sync_maxlen);
+	sb->mesg = kmalloc(len, GFP_ATOMIC);
 	if (!sb->mesg) {
 		kfree(sb);
 		return NULL;
 	}
 	sb->mesg->reserved = 0;  /* old nr_conns i.e. must be zero now */
 	sb->mesg->version = SYNC_PROTO_VER;
-	sb->mesg->syncid = ipvs->master_syncid;
+	sb->mesg->syncid = ipvs->mcfg.syncid;
 	sb->mesg->size = htons(sizeof(struct ip_vs_sync_mesg));
 	sb->mesg->nr_conns = 0;
 	sb->mesg->spare = 0;
 	sb->head = (unsigned char *)sb->mesg + sizeof(struct ip_vs_sync_mesg);
-	sb->end = (unsigned char *)sb->mesg + ipvs->send_mesg_maxlen;
+	sb->end = (unsigned char *)sb->mesg + len;
 
 	sb->firstuse = jiffies;
 	return sb;
@@ -402,7 +404,7 @@ select_master_thread_id(struct netns_ipvs *ipvs, struct ip_vs_conn *cp)
  * Create a new sync buffer for Version 0 proto.
  */
 static inline struct ip_vs_sync_buff *
-ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs)
+ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs, unsigned int len)
 {
 	struct ip_vs_sync_buff *sb;
 	struct ip_vs_sync_mesg_v0 *mesg;
@@ -410,17 +412,19 @@ ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs)
 	if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC)))
 		return NULL;
 
-	sb->mesg = kmalloc(ipvs->send_mesg_maxlen, GFP_ATOMIC);
+	len = max_t(unsigned int, len + sizeof(struct ip_vs_sync_mesg_v0),
+		    ipvs->mcfg.sync_maxlen);
+	sb->mesg = kmalloc(len, GFP_ATOMIC);
 	if (!sb->mesg) {
 		kfree(sb);
 		return NULL;
 	}
 	mesg = (struct ip_vs_sync_mesg_v0 *)sb->mesg;
 	mesg->nr_conns = 0;
-	mesg->syncid = ipvs->master_syncid;
+	mesg->syncid = ipvs->mcfg.syncid;
 	mesg->size = htons(sizeof(struct ip_vs_sync_mesg_v0));
 	sb->head = (unsigned char *)mesg + sizeof(struct ip_vs_sync_mesg_v0);
-	sb->end = (unsigned char *)mesg + ipvs->send_mesg_maxlen;
+	sb->end = (unsigned char *)mesg + len;
 	sb->firstuse = jiffies;
 	return sb;
 }
@@ -533,7 +537,7 @@ static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp,
 	struct ip_vs_sync_buff *buff;
 	struct ipvs_master_sync_state *ms;
 	int id;
-	int len;
+	unsigned int len;
 
 	if (unlikely(cp->af != AF_INET))
 		return;
@@ -553,17 +557,19 @@ static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp,
 	id = select_master_thread_id(ipvs, cp);
 	ms = &ipvs->ms[id];
 	buff = ms->sync_buff;
+	len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE :
+		SIMPLE_CONN_SIZE;
 	if (buff) {
 		m = (struct ip_vs_sync_mesg_v0 *) buff->mesg;
 		/* Send buffer if it is for v1 */
-		if (!m->nr_conns) {
+		if (buff->head + len > buff->end || !m->nr_conns) {
 			sb_queue_tail(ipvs, ms);
 			ms->sync_buff = NULL;
 			buff = NULL;
 		}
 	}
 	if (!buff) {
-		buff = ip_vs_sync_buff_create_v0(ipvs);
+		buff = ip_vs_sync_buff_create_v0(ipvs, len);
 		if (!buff) {
 			spin_unlock_bh(&ipvs->sync_buff_lock);
 			pr_err("ip_vs_sync_buff_create failed.\n");
@@ -572,8 +578,6 @@ static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp,
 		ms->sync_buff = buff;
 	}
 
-	len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE :
-		SIMPLE_CONN_SIZE;
 	m = (struct ip_vs_sync_mesg_v0 *) buff->mesg;
 	s = (struct ip_vs_sync_conn_v0 *) buff->head;
 
@@ -597,12 +601,6 @@ static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp,
 	m->nr_conns++;
 	m->size = htons(ntohs(m->size) + len);
 	buff->head += len;
-
-	/* check if there is a space for next one */
-	if (buff->head + FULL_CONN_SIZE > buff->end) {
-		sb_queue_tail(ipvs, ms);
-		ms->sync_buff = NULL;
-	}
 	spin_unlock_bh(&ipvs->sync_buff_lock);
 
 	/* synchronize its controller if it has */
@@ -694,7 +692,7 @@ sloop:
 	}
 
 	if (!buff) {
-		buff = ip_vs_sync_buff_create(ipvs);
+		buff = ip_vs_sync_buff_create(ipvs, len);
 		if (!buff) {
 			spin_unlock_bh(&ipvs->sync_buff_lock);
 			pr_err("ip_vs_sync_buff_create failed.\n");
@@ -1219,7 +1217,7 @@ static void ip_vs_process_message(struct net *net, __u8 *buffer,
 		return;
 	}
 	/* SyncID sanity check */
-	if (ipvs->backup_syncid != 0 && m2->syncid != ipvs->backup_syncid) {
+	if (ipvs->bcfg.syncid != 0 && m2->syncid != ipvs->bcfg.syncid) {
 		IP_VS_DBG(7, "BACKUP, Ignoring syncid = %d\n", m2->syncid);
 		return;
 	}
@@ -1319,6 +1317,17 @@ static void set_mcast_ttl(struct sock *sk, u_char ttl)
 	release_sock(sk);
 }
 
+/* Control fragmentation of messages */
+static void set_mcast_pmtudisc(struct sock *sk, int val)
+{
+	struct inet_sock *inet = inet_sk(sk);
+
+	/* setsockopt(sock, SOL_IP, IP_MTU_DISCOVER, &val, sizeof(val)); */
+	lock_sock(sk);
+	inet->pmtudisc = val;
+	release_sock(sk);
+}
+
 /*
  *      Specifiy default interface for outgoing multicasts
  */
@@ -1344,43 +1353,6 @@ static int set_mcast_if(struct sock *sk, char *ifname)
 }
 
 
-/*
- *	Set the maximum length of sync message according to the
- *	specified interface's MTU.
- */
-static int set_sync_mesg_maxlen(struct net *net, int sync_state)
-{
-	struct netns_ipvs *ipvs = net_ipvs(net);
-	struct net_device *dev;
-	int num;
-
-	if (sync_state == IP_VS_STATE_MASTER) {
-		dev = __dev_get_by_name(net, ipvs->master_mcast_ifn);
-		if (!dev)
-			return -ENODEV;
-
-		num = (dev->mtu - sizeof(struct iphdr) -
-		       sizeof(struct udphdr) -
-		       SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE;
-		ipvs->send_mesg_maxlen = SYNC_MESG_HEADER_LEN +
-			SIMPLE_CONN_SIZE * min(num, MAX_CONNS_PER_SYNCBUFF);
-		IP_VS_DBG(7, "setting the maximum length of sync sending "
-			  "message %d.\n", ipvs->send_mesg_maxlen);
-	} else if (sync_state == IP_VS_STATE_BACKUP) {
-		dev = __dev_get_by_name(net, ipvs->backup_mcast_ifn);
-		if (!dev)
-			return -ENODEV;
-
-		ipvs->recv_mesg_maxlen = dev->mtu -
-			sizeof(struct iphdr) - sizeof(struct udphdr);
-		IP_VS_DBG(7, "setting the maximum length of sync receiving "
-			  "message %d.\n", ipvs->recv_mesg_maxlen);
-	}
-
-	return 0;
-}
-
-
 /*
  *      Join a multicast group.
  *      the group is specified by a class D multicast address 224.0.0.0/8
@@ -1461,7 +1433,7 @@ static struct socket *make_send_sock(struct net *net, int id)
 		pr_err("Error during creation of socket; terminating\n");
 		return ERR_PTR(result);
 	}
-	result = set_mcast_if(sock->sk, ipvs->master_mcast_ifn);
+	result = set_mcast_if(sock->sk, ipvs->mcfg.mcast_ifn);
 	if (result < 0) {
 		pr_err("Error setting outbound mcast interface\n");
 		goto error;
@@ -1469,11 +1441,13 @@ static struct socket *make_send_sock(struct net *net, int id)
 
 	set_mcast_loop(sock->sk, 0);
 	set_mcast_ttl(sock->sk, 1);
+	/* Allow fragmentation if MTU changes */
+	set_mcast_pmtudisc(sock->sk, IP_PMTUDISC_DONT);
 	result = sysctl_sync_sock_size(ipvs);
 	if (result > 0)
 		set_sock_size(sock->sk, 1, result);
 
-	result = bind_mcastif_addr(sock, ipvs->master_mcast_ifn);
+	result = bind_mcastif_addr(sock, ipvs->mcfg.mcast_ifn);
 	if (result < 0) {
 		pr_err("Error binding address of the mcast interface\n");
 		goto error;
@@ -1531,7 +1505,7 @@ static struct socket *make_receive_sock(struct net *net, int id)
 	/* join the multicast group */
 	result = join_mcast_group(sock->sk,
 			(struct in_addr *) &mcast_addr.sin_addr,
-			ipvs->backup_mcast_ifn);
+			ipvs->bcfg.mcast_ifn);
 	if (result < 0) {
 		pr_err("Error joining to the multicast group\n");
 		goto error;
@@ -1639,7 +1613,7 @@ static int sync_thread_master(void *data)
 
 	pr_info("sync thread started: state = MASTER, mcast_ifn = %s, "
 		"syncid = %d, id = %d\n",
-		ipvs->master_mcast_ifn, ipvs->master_syncid, tinfo->id);
+		ipvs->mcfg.mcast_ifn, ipvs->mcfg.syncid, tinfo->id);
 
 	for (;;) {
 		sb = next_sync_buff(ipvs, ms);
@@ -1693,7 +1667,7 @@ static int sync_thread_backup(void *data)
 
 	pr_info("sync thread started: state = BACKUP, mcast_ifn = %s, "
 		"syncid = %d, id = %d\n",
-		ipvs->backup_mcast_ifn, ipvs->backup_syncid, tinfo->id);
+		ipvs->bcfg.mcast_ifn, ipvs->bcfg.syncid, tinfo->id);
 
 	while (!kthread_should_stop()) {
 		wait_event_interruptible(*sk_sleep(tinfo->sock->sk),
@@ -1703,7 +1677,7 @@ static int sync_thread_backup(void *data)
 		/* do we have data now? */
 		while (!skb_queue_empty(&(tinfo->sock->sk->sk_receive_queue))) {
 			len = ip_vs_receive(tinfo->sock, tinfo->buf,
-					ipvs->recv_mesg_maxlen);
+					ipvs->bcfg.sync_maxlen);
 			if (len <= 0) {
 				if (len != -EAGAIN)
 					pr_err("receiving message error\n");
@@ -1723,16 +1697,19 @@ static int sync_thread_backup(void *data)
 }
 
 
-int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid)
+int start_sync_thread(struct net *net, struct ipvs_sync_daemon_cfg *c,
+		      int state)
 {
 	struct ip_vs_sync_thread_data *tinfo;
 	struct task_struct **array = NULL, *task;
 	struct socket *sock;
 	struct netns_ipvs *ipvs = net_ipvs(net);
+	struct net_device *dev;
 	char *name;
 	int (*threadfn)(void *data);
-	int id, count;
+	int id, count, hlen;
 	int result = -ENOMEM;
+	u16 mtu, min_mtu;
 
 	IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current));
 	IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n",
@@ -1744,22 +1721,35 @@ int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid)
 	} else
 		count = ipvs->threads_mask + 1;
 
+	dev = __dev_get_by_name(net, c->mcast_ifn);
+	if (!dev) {
+		pr_err("Unknown mcast interface: %s\n", c->mcast_ifn);
+		return -ENODEV;
+	}
+	hlen = sizeof(struct iphdr) + sizeof(struct udphdr);
+	mtu = (state == IP_VS_STATE_BACKUP) ?
+		  clamp(dev->mtu, 1500U, 65535U) : 1500U;
+	min_mtu = (state == IP_VS_STATE_BACKUP) ? 1024 : 1;
+
+	if (c->sync_maxlen)
+		c->sync_maxlen = clamp_t(unsigned int,
+					 c->sync_maxlen, min_mtu,
+					 65535 - hlen);
+	else
+		c->sync_maxlen = mtu - hlen;
+
 	if (state == IP_VS_STATE_MASTER) {
 		if (ipvs->ms)
 			return -EEXIST;
 
-		strlcpy(ipvs->master_mcast_ifn, mcast_ifn,
-			sizeof(ipvs->master_mcast_ifn));
-		ipvs->master_syncid = syncid;
+		ipvs->mcfg = *c;
 		name = "ipvs-m:%d:%d";
 		threadfn = sync_thread_master;
 	} else if (state == IP_VS_STATE_BACKUP) {
 		if (ipvs->backup_threads)
 			return -EEXIST;
 
-		strlcpy(ipvs->backup_mcast_ifn, mcast_ifn,
-			sizeof(ipvs->backup_mcast_ifn));
-		ipvs->backup_syncid = syncid;
+		ipvs->bcfg = *c;
 		name = "ipvs-b:%d:%d";
 		threadfn = sync_thread_backup;
 	} else {
@@ -1787,7 +1777,6 @@ int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid)
 		if (!array)
 			goto out;
 	}
-	set_sync_mesg_maxlen(net, state);
 
 	tinfo = NULL;
 	for (id = 0; id < count; id++) {
@@ -1805,7 +1794,7 @@ int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid)
 		tinfo->net = net;
 		tinfo->sock = sock;
 		if (state == IP_VS_STATE_BACKUP) {
-			tinfo->buf = kmalloc(ipvs->recv_mesg_maxlen,
+			tinfo->buf = kmalloc(ipvs->bcfg.sync_maxlen,
 					     GFP_KERNEL);
 			if (!tinfo->buf)
 				goto outtinfo;
-- 
cgit v1.2.3


From d33288172e72c4729e8b9f2243fb40601afabc8f Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Sun, 26 Jul 2015 15:03:28 +0300
Subject: ipvs: add more mcast parameters for the sync daemon

- mcast_group: configure the multicast address, now IPv6
is supported too

- mcast_port: configure the multicast port

- mcast_ttl: configure the multicast TTL/HOP_LIMIT

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 include/net/ip_vs.h             |   4 ++
 include/uapi/linux/ip_vs.h      |   4 ++
 net/netfilter/ipvs/ip_vs_ctl.c  |  50 ++++++++++++++-
 net/netfilter/ipvs/ip_vs_sync.c | 138 +++++++++++++++++++++++++++++++++-------
 4 files changed, 172 insertions(+), 24 deletions(-)

(limited to 'include/net')

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 2fdc13caf712..9b9ca87a4210 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -847,8 +847,12 @@ struct ipvs_master_sync_state {
 #define IP_VS_DEST_TRASH_PERIOD		(120 * HZ)
 
 struct ipvs_sync_daemon_cfg {
+	union nf_inet_addr	mcast_group;
 	int			syncid;
 	u16			sync_maxlen;
+	u16			mcast_port;
+	u8			mcast_af;
+	u8			mcast_ttl;
 	/* multicast interface name */
 	char			mcast_ifn[IP_VS_IFNAME_MAXLEN];
 };
diff --git a/include/uapi/linux/ip_vs.h b/include/uapi/linux/ip_vs.h
index 68377d8c8870..391395c06c7e 100644
--- a/include/uapi/linux/ip_vs.h
+++ b/include/uapi/linux/ip_vs.h
@@ -407,6 +407,10 @@ enum {
 	IPVS_DAEMON_ATTR_MCAST_IFN,	/* multicast interface name */
 	IPVS_DAEMON_ATTR_SYNC_ID,	/* SyncID we belong to */
 	IPVS_DAEMON_ATTR_SYNC_MAXLEN,	/* UDP Payload Size */
+	IPVS_DAEMON_ATTR_MCAST_GROUP,	/* IPv4 Multicast Address */
+	IPVS_DAEMON_ATTR_MCAST_GROUP6,	/* IPv6 Multicast Address */
+	IPVS_DAEMON_ATTR_MCAST_PORT,	/* Multicast Port (base) */
+	IPVS_DAEMON_ATTR_MCAST_TTL,	/* Multicast TTL */
 	__IPVS_DAEMON_ATTR_MAX,
 };
 
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 96f7bbfd5e1d..1a23e91d50d8 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -2819,6 +2819,10 @@ static const struct nla_policy ip_vs_daemon_policy[IPVS_DAEMON_ATTR_MAX + 1] = {
 					    .len = IP_VS_IFNAME_MAXLEN },
 	[IPVS_DAEMON_ATTR_SYNC_ID]	= { .type = NLA_U32 },
 	[IPVS_DAEMON_ATTR_SYNC_MAXLEN]	= { .type = NLA_U16 },
+	[IPVS_DAEMON_ATTR_MCAST_GROUP]	= { .type = NLA_U32 },
+	[IPVS_DAEMON_ATTR_MCAST_GROUP6]	= { .len = sizeof(struct in6_addr) },
+	[IPVS_DAEMON_ATTR_MCAST_PORT]	= { .type = NLA_U16 },
+	[IPVS_DAEMON_ATTR_MCAST_TTL]	= { .type = NLA_U8 },
 };
 
 /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_SERVICE */
@@ -3288,8 +3292,21 @@ static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __u32 state,
 	if (nla_put_u32(skb, IPVS_DAEMON_ATTR_STATE, state) ||
 	    nla_put_string(skb, IPVS_DAEMON_ATTR_MCAST_IFN, c->mcast_ifn) ||
 	    nla_put_u32(skb, IPVS_DAEMON_ATTR_SYNC_ID, c->syncid) ||
-	    nla_put_u16(skb, IPVS_DAEMON_ATTR_SYNC_MAXLEN, c->sync_maxlen))
+	    nla_put_u16(skb, IPVS_DAEMON_ATTR_SYNC_MAXLEN, c->sync_maxlen) ||
+	    nla_put_u16(skb, IPVS_DAEMON_ATTR_MCAST_PORT, c->mcast_port) ||
+	    nla_put_u8(skb, IPVS_DAEMON_ATTR_MCAST_TTL, c->mcast_ttl))
 		goto nla_put_failure;
+#ifdef CONFIG_IP_VS_IPV6
+	if (c->mcast_af == AF_INET6) {
+		if (nla_put_in6_addr(skb, IPVS_DAEMON_ATTR_MCAST_GROUP6,
+				     &c->mcast_group.in6))
+			goto nla_put_failure;
+	} else
+#endif
+		if (c->mcast_af == AF_INET &&
+		    nla_put_in_addr(skb, IPVS_DAEMON_ATTR_MCAST_GROUP,
+				    c->mcast_group.ip))
+			goto nla_put_failure;
 	nla_nest_end(skb, nl_daemon);
 
 	return 0;
@@ -3370,6 +3387,37 @@ static int ip_vs_genl_new_daemon(struct net *net, struct nlattr **attrs)
 	if (a)
 		c.sync_maxlen = nla_get_u16(a);
 
+	a = attrs[IPVS_DAEMON_ATTR_MCAST_GROUP];
+	if (a) {
+		c.mcast_af = AF_INET;
+		c.mcast_group.ip = nla_get_in_addr(a);
+		if (!ipv4_is_multicast(c.mcast_group.ip))
+			return -EINVAL;
+	} else {
+		a = attrs[IPVS_DAEMON_ATTR_MCAST_GROUP6];
+		if (a) {
+#ifdef CONFIG_IP_VS_IPV6
+			int addr_type;
+
+			c.mcast_af = AF_INET6;
+			c.mcast_group.in6 = nla_get_in6_addr(a);
+			addr_type = ipv6_addr_type(&c.mcast_group.in6);
+			if (!(addr_type & IPV6_ADDR_MULTICAST))
+				return -EINVAL;
+#else
+			return -EAFNOSUPPORT;
+#endif
+		}
+	}
+
+	a = attrs[IPVS_DAEMON_ATTR_MCAST_PORT];
+	if (a)
+		c.mcast_port = nla_get_u16(a);
+
+	a = attrs[IPVS_DAEMON_ATTR_MCAST_TTL];
+	if (a)
+		c.mcast_ttl = nla_get_u8(a);
+
 	/* The synchronization protocol is incompatible with mixed family
 	 * services
 	 */
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index e68a43421479..43f140950075 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -262,6 +262,11 @@ struct ip_vs_sync_mesg {
 	/* ip_vs_sync_conn entries start here */
 };
 
+union ipvs_sockaddr {
+	struct sockaddr_in	in;
+	struct sockaddr_in6	in6;
+};
+
 struct ip_vs_sync_buff {
 	struct list_head        list;
 	unsigned long           firstuse;
@@ -1301,6 +1306,14 @@ static void set_mcast_loop(struct sock *sk, u_char loop)
 	/* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */
 	lock_sock(sk);
 	inet->mc_loop = loop ? 1 : 0;
+#ifdef CONFIG_IP_VS_IPV6
+	if (sk->sk_family == AF_INET6) {
+		struct ipv6_pinfo *np = inet6_sk(sk);
+
+		/* IPV6_MULTICAST_LOOP */
+		np->mc_loop = loop ? 1 : 0;
+	}
+#endif
 	release_sock(sk);
 }
 
@@ -1314,6 +1327,14 @@ static void set_mcast_ttl(struct sock *sk, u_char ttl)
 	/* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */
 	lock_sock(sk);
 	inet->mc_ttl = ttl;
+#ifdef CONFIG_IP_VS_IPV6
+	if (sk->sk_family == AF_INET6) {
+		struct ipv6_pinfo *np = inet6_sk(sk);
+
+		/* IPV6_MULTICAST_HOPS */
+		np->mcast_hops = ttl;
+	}
+#endif
 	release_sock(sk);
 }
 
@@ -1325,6 +1346,14 @@ static void set_mcast_pmtudisc(struct sock *sk, int val)
 	/* setsockopt(sock, SOL_IP, IP_MTU_DISCOVER, &val, sizeof(val)); */
 	lock_sock(sk);
 	inet->pmtudisc = val;
+#ifdef CONFIG_IP_VS_IPV6
+	if (sk->sk_family == AF_INET6) {
+		struct ipv6_pinfo *np = inet6_sk(sk);
+
+		/* IPV6_MTU_DISCOVER */
+		np->pmtudisc = val;
+	}
+#endif
 	release_sock(sk);
 }
 
@@ -1347,6 +1376,14 @@ static int set_mcast_if(struct sock *sk, char *ifname)
 	lock_sock(sk);
 	inet->mc_index = dev->ifindex;
 	/*  inet->mc_addr  = 0; */
+#ifdef CONFIG_IP_VS_IPV6
+	if (sk->sk_family == AF_INET6) {
+		struct ipv6_pinfo *np = inet6_sk(sk);
+
+		/* IPV6_MULTICAST_IF */
+		np->mcast_oif = dev->ifindex;
+	}
+#endif
 	release_sock(sk);
 
 	return 0;
@@ -1384,6 +1421,27 @@ join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname)
 	return ret;
 }
 
+#ifdef CONFIG_IP_VS_IPV6
+static int join_mcast_group6(struct sock *sk, struct in6_addr *addr,
+			     char *ifname)
+{
+	struct net *net = sock_net(sk);
+	struct net_device *dev;
+	int ret;
+
+	dev = __dev_get_by_name(net, ifname);
+	if (!dev)
+		return -ENODEV;
+	if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
+		return -EINVAL;
+
+	lock_sock(sk);
+	ret = ipv6_sock_mc_join(sk, dev->ifindex, addr);
+	release_sock(sk);
+
+	return ret;
+}
+#endif
 
 static int bind_mcastif_addr(struct socket *sock, char *ifname)
 {
@@ -1412,6 +1470,26 @@ static int bind_mcastif_addr(struct socket *sock, char *ifname)
 	return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin));
 }
 
+static void get_mcast_sockaddr(union ipvs_sockaddr *sa, int *salen,
+			       struct ipvs_sync_daemon_cfg *c, int id)
+{
+	if (AF_INET6 == c->mcast_af) {
+		sa->in6 = (struct sockaddr_in6) {
+			.sin6_family = AF_INET6,
+			.sin6_port = htons(c->mcast_port + id),
+		};
+		sa->in6.sin6_addr = c->mcast_group.in6;
+		*salen = sizeof(sa->in6);
+	} else {
+		sa->in = (struct sockaddr_in) {
+			.sin_family = AF_INET,
+			.sin_port = htons(c->mcast_port + id),
+		};
+		sa->in.sin_addr = c->mcast_group.in;
+		*salen = sizeof(sa->in);
+	}
+}
+
 /*
  *      Set up sending multicast socket over UDP
  */
@@ -1419,16 +1497,13 @@ static struct socket *make_send_sock(struct net *net, int id)
 {
 	struct netns_ipvs *ipvs = net_ipvs(net);
 	/* multicast addr */
-	struct sockaddr_in mcast_addr = {
-		.sin_family		= AF_INET,
-		.sin_port		= cpu_to_be16(IP_VS_SYNC_PORT + id),
-		.sin_addr.s_addr	= cpu_to_be32(IP_VS_SYNC_GROUP),
-	};
+	union ipvs_sockaddr mcast_addr;
 	struct socket *sock;
-	int result;
+	int result, salen;
 
 	/* First create a socket */
-	result = sock_create_kern(net, PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
+	result = sock_create_kern(net, ipvs->mcfg.mcast_af, SOCK_DGRAM,
+				  IPPROTO_UDP, &sock);
 	if (result < 0) {
 		pr_err("Error during creation of socket; terminating\n");
 		return ERR_PTR(result);
@@ -1440,21 +1515,25 @@ static struct socket *make_send_sock(struct net *net, int id)
 	}
 
 	set_mcast_loop(sock->sk, 0);
-	set_mcast_ttl(sock->sk, 1);
+	set_mcast_ttl(sock->sk, ipvs->mcfg.mcast_ttl);
 	/* Allow fragmentation if MTU changes */
 	set_mcast_pmtudisc(sock->sk, IP_PMTUDISC_DONT);
 	result = sysctl_sync_sock_size(ipvs);
 	if (result > 0)
 		set_sock_size(sock->sk, 1, result);
 
-	result = bind_mcastif_addr(sock, ipvs->mcfg.mcast_ifn);
+	if (AF_INET == ipvs->mcfg.mcast_af)
+		result = bind_mcastif_addr(sock, ipvs->mcfg.mcast_ifn);
+	else
+		result = 0;
 	if (result < 0) {
 		pr_err("Error binding address of the mcast interface\n");
 		goto error;
 	}
 
+	get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->mcfg, id);
 	result = sock->ops->connect(sock, (struct sockaddr *) &mcast_addr,
-			sizeof(struct sockaddr), 0);
+				    salen, 0);
 	if (result < 0) {
 		pr_err("Error connecting to the multicast addr\n");
 		goto error;
@@ -1475,16 +1554,13 @@ static struct socket *make_receive_sock(struct net *net, int id)
 {
 	struct netns_ipvs *ipvs = net_ipvs(net);
 	/* multicast addr */
-	struct sockaddr_in mcast_addr = {
-		.sin_family		= AF_INET,
-		.sin_port		= cpu_to_be16(IP_VS_SYNC_PORT + id),
-		.sin_addr.s_addr	= cpu_to_be32(IP_VS_SYNC_GROUP),
-	};
+	union ipvs_sockaddr mcast_addr;
 	struct socket *sock;
-	int result;
+	int result, salen;
 
 	/* First create a socket */
-	result = sock_create_kern(net, PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
+	result = sock_create_kern(net, ipvs->bcfg.mcast_af, SOCK_DGRAM,
+				  IPPROTO_UDP, &sock);
 	if (result < 0) {
 		pr_err("Error during creation of socket; terminating\n");
 		return ERR_PTR(result);
@@ -1495,17 +1571,22 @@ static struct socket *make_receive_sock(struct net *net, int id)
 	if (result > 0)
 		set_sock_size(sock->sk, 0, result);
 
-	result = sock->ops->bind(sock, (struct sockaddr *) &mcast_addr,
-			sizeof(struct sockaddr));
+	get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->bcfg, id);
+	result = sock->ops->bind(sock, (struct sockaddr *)&mcast_addr, salen);
 	if (result < 0) {
 		pr_err("Error binding to the multicast addr\n");
 		goto error;
 	}
 
 	/* join the multicast group */
-	result = join_mcast_group(sock->sk,
-			(struct in_addr *) &mcast_addr.sin_addr,
-			ipvs->bcfg.mcast_ifn);
+#ifdef CONFIG_IP_VS_IPV6
+	if (ipvs->bcfg.mcast_af == AF_INET6)
+		result = join_mcast_group6(sock->sk, &mcast_addr.in6.sin6_addr,
+					   ipvs->bcfg.mcast_ifn);
+	else
+#endif
+		result = join_mcast_group(sock->sk, &mcast_addr.in.sin_addr,
+					  ipvs->bcfg.mcast_ifn);
 	if (result < 0) {
 		pr_err("Error joining to the multicast group\n");
 		goto error;
@@ -1721,12 +1802,23 @@ int start_sync_thread(struct net *net, struct ipvs_sync_daemon_cfg *c,
 	} else
 		count = ipvs->threads_mask + 1;
 
+	if (c->mcast_af == AF_UNSPEC) {
+		c->mcast_af = AF_INET;
+		c->mcast_group.ip = cpu_to_be32(IP_VS_SYNC_GROUP);
+	}
+	if (!c->mcast_port)
+		c->mcast_port = IP_VS_SYNC_PORT;
+	if (!c->mcast_ttl)
+		c->mcast_ttl = 1;
+
 	dev = __dev_get_by_name(net, c->mcast_ifn);
 	if (!dev) {
 		pr_err("Unknown mcast interface: %s\n", c->mcast_ifn);
 		return -ENODEV;
 	}
-	hlen = sizeof(struct iphdr) + sizeof(struct udphdr);
+	hlen = (AF_INET6 == c->mcast_af) ?
+	       sizeof(struct ipv6hdr) + sizeof(struct udphdr) :
+	       sizeof(struct iphdr) + sizeof(struct udphdr);
 	mtu = (state == IP_VS_STATE_BACKUP) ?
 		  clamp(dev->mtu, 1500U, 65535U) : 1500U;
 	min_mtu = (state == IP_VS_STATE_BACKUP) ? 1024 : 1;
-- 
cgit v1.2.3


From 58ce31cca1ffe057f4744c3f671e3e84606d3d4a Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Wed, 19 Aug 2015 17:07:33 -0700
Subject: vxlan: GRO support at tunnel layer

Add calls to gro_cells infrastructure to do GRO when receiving on a tunnel.

Testing:

Ran 200 netperf TCP_STREAM instance

  - With fix (GRO enabled on VXLAN interface)

    Verify GRO is happening.

    9084 MBps tput
    3.44% CPU utilization

  - Without fix (GRO disabled on VXLAN interface)

    Verified no GRO is happening.

    9084 MBps tput
    5.54% CPU utilization

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vxlan.c | 9 +++++++--
 include/net/vxlan.h | 1 +
 2 files changed, 8 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 64fcd2402562..61b457b9ec00 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -1208,7 +1208,7 @@ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb,
 	stats->rx_bytes += skb->len;
 	u64_stats_update_end(&stats->syncp);
 
-	netif_rx(skb);
+	gro_cells_receive(&vxlan->gro_cells, skb);
 
 	return;
 drop:
@@ -2446,6 +2446,8 @@ static void vxlan_setup(struct net_device *dev)
 
 	vxlan->dev = dev;
 
+	gro_cells_init(&vxlan->gro_cells, dev);
+
 	for (h = 0; h < FDB_HASH_SIZE; ++h)
 		INIT_HLIST_HEAD(&vxlan->fdb_head[h]);
 }
@@ -2885,6 +2887,7 @@ static void vxlan_dellink(struct net_device *dev, struct list_head *head)
 		hlist_del_rcu(&vxlan->hlist);
 	spin_unlock(&vn->sock_lock);
 
+	gro_cells_destroy(&vxlan->gro_cells);
 	list_del(&vxlan->next);
 	unregister_netdevice_queue(dev, head);
 }
@@ -3093,8 +3096,10 @@ static void __net_exit vxlan_exit_net(struct net *net)
 		/* If vxlan->dev is in the same netns, it has already been added
 		 * to the list by the previous loop.
 		 */
-		if (!net_eq(dev_net(vxlan->dev), net))
+		if (!net_eq(dev_net(vxlan->dev), net)) {
+			gro_cells_destroy(&vxlan->gro_cells);
 			unregister_netdevice_queue(vxlan->dev, &list);
+		}
 	}
 
 	unregister_netdevice_many(&list);
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index 43677e6b9c43..6b3234599a2c 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -161,6 +161,7 @@ struct vxlan_dev {
 	struct timer_list age_timer;
 	spinlock_t	  hash_lock;
 	unsigned int	  addrcnt;
+	struct gro_cells  gro_cells;
 
 	struct vxlan_config	cfg;
 
-- 
cgit v1.2.3


From 751a587ac9f9a8bf314590fbac32d9e418060c5a Mon Sep 17 00:00:00 2001
From: Jiri Benc <jbenc@redhat.com>
Date: Fri, 21 Aug 2015 12:41:14 +0200
Subject: route: fix breakage after moving lwtunnel state

__recnt and related fields need to be in its own cacheline for performance
reasons. Commit 61adedf3e3f1 ("route: move lwtunnel state to dst_entry")
broke that on 32bit archs, causing BUILD_BUG_ON in dst_hold to be triggered.

This patch fixes the breakage by moving the lwtunnel state to the end of
dst_entry on 32bit archs. Unfortunately, this makes it share the cacheline
with __refcnt and may affect performance, thus further patches may be
needed.

Reported-by: kbuild test robot <fengguang.wu@intel.com>
Fixes: 61adedf3e3f1 ("route: move lwtunnel state to dst_entry")
Signed-off-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dst.h | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/dst.h b/include/net/dst.h
index 0a9a723f6c19..ef8f1d43a203 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -44,7 +44,6 @@ struct dst_entry {
 #else
 	void			*__pad1;
 #endif
-	struct lwtunnel_state   *lwtstate;
 	int			(*input)(struct sk_buff *);
 	int			(*output)(struct sock *sk, struct sk_buff *skb);
 
@@ -85,11 +84,12 @@ struct dst_entry {
 	__u32			__pad2;
 #endif
 
+#ifdef CONFIG_64BIT
+	struct lwtunnel_state   *lwtstate;
 	/*
 	 * Align __refcnt to a 64 bytes alignment
 	 * (L1_CACHE_SIZE would be too much)
 	 */
-#ifdef CONFIG_64BIT
 	long			__pad_to_align_refcnt[1];
 #endif
 	/*
@@ -99,6 +99,9 @@ struct dst_entry {
 	atomic_t		__refcnt;	/* client references	*/
 	int			__use;
 	unsigned long		lastuse;
+#ifndef CONFIG_64BIT
+	struct lwtunnel_state   *lwtstate;
+#endif
 	union {
 		struct dst_entry	*next;
 		struct rtable __rcu	*rt_next;
-- 
cgit v1.2.3


From 127eb7cd3c210afead788991a30950a9e36759ea Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Mon, 24 Aug 2015 09:45:41 -0700
Subject: lwt: Add cfg argument to build_state

Add cfg and family arguments to lwt build state functions. cfg is a void
pointer and will either be a pointer to a fib_config or fib6_config
structure. The family parameter indicates which one (either AF_INET
or AF_INET6).

LWT encpasulation implementation may use the fib configuration to build
the LWT state.

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/lwtunnel.h    |  3 +++
 net/core/lwtunnel.c       |  5 +++--
 net/ipv4/fib_semantics.c  | 17 ++++++++++-------
 net/ipv4/ip_tunnel_core.c |  2 ++
 net/ipv6/ila.c            |  1 +
 net/ipv6/route.c          |  3 ++-
 net/mpls/mpls_iptunnel.c  |  1 +
 7 files changed, 22 insertions(+), 10 deletions(-)

(limited to 'include/net')

diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h
index 843489884448..fce0e35e74d0 100644
--- a/include/net/lwtunnel.h
+++ b/include/net/lwtunnel.h
@@ -26,6 +26,7 @@ struct lwtunnel_state {
 
 struct lwtunnel_encap_ops {
 	int (*build_state)(struct net_device *dev, struct nlattr *encap,
+			   unsigned int family, const void *cfg,
 			   struct lwtunnel_state **ts);
 	int (*output)(struct sock *sk, struct sk_buff *skb);
 	int (*input)(struct sk_buff *skb);
@@ -80,6 +81,7 @@ int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *op,
 			   unsigned int num);
 int lwtunnel_build_state(struct net_device *dev, u16 encap_type,
 			 struct nlattr *encap,
+			 unsigned int family, const void *cfg,
 			 struct lwtunnel_state **lws);
 int lwtunnel_fill_encap(struct sk_buff *skb,
 			struct lwtunnel_state *lwtstate);
@@ -130,6 +132,7 @@ static inline int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *op,
 
 static inline int lwtunnel_build_state(struct net_device *dev, u16 encap_type,
 				       struct nlattr *encap,
+				       unsigned int family, const void *cfg,
 				       struct lwtunnel_state **lws)
 {
 	return -EOPNOTSUPP;
diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c
index e924c2e08554..dfb1a9ca0835 100644
--- a/net/core/lwtunnel.c
+++ b/net/core/lwtunnel.c
@@ -72,7 +72,8 @@ int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *ops,
 EXPORT_SYMBOL(lwtunnel_encap_del_ops);
 
 int lwtunnel_build_state(struct net_device *dev, u16 encap_type,
-			 struct nlattr *encap, struct lwtunnel_state **lws)
+			 struct nlattr *encap, unsigned int family,
+			 const void *cfg, struct lwtunnel_state **lws)
 {
 	const struct lwtunnel_encap_ops *ops;
 	int ret = -EINVAL;
@@ -85,7 +86,7 @@ int lwtunnel_build_state(struct net_device *dev, u16 encap_type,
 	rcu_read_lock();
 	ops = rcu_dereference(lwtun_encaps[encap_type]);
 	if (likely(ops && ops->build_state))
-		ret = ops->build_state(dev, encap, lws);
+		ret = ops->build_state(dev, encap, family, cfg, lws);
 	rcu_read_unlock();
 
 	return ret;
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 01f1c7dcd329..1b2d01170a4d 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -511,7 +511,8 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
 					dev = __dev_get_by_index(net, cfg->fc_oif);
 				ret = lwtunnel_build_state(dev, nla_get_u16(
 							   nla_entype),
-							   nla, &lwtstate);
+							   nla,  AF_INET, cfg,
+							   &lwtstate);
 				if (ret)
 					goto errout;
 				nexthop_nh->nh_lwtstate =
@@ -535,7 +536,8 @@ errout:
 
 static int fib_encap_match(struct net *net, u16 encap_type,
 			   struct nlattr *encap,
-			   int oif, const struct fib_nh *nh)
+			   int oif, const struct fib_nh *nh,
+			   const struct fib_config *cfg)
 {
 	struct lwtunnel_state *lwtstate;
 	struct net_device *dev = NULL;
@@ -546,8 +548,8 @@ static int fib_encap_match(struct net *net, u16 encap_type,
 
 	if (oif)
 		dev = __dev_get_by_index(net, oif);
-	ret = lwtunnel_build_state(dev, encap_type,
-				   encap, &lwtstate);
+	ret = lwtunnel_build_state(dev, encap_type, encap,
+				   AF_INET, cfg, &lwtstate);
 	if (!ret) {
 		result = lwtunnel_cmp_encap(lwtstate, nh->nh_lwtstate);
 		lwtstate_free(lwtstate);
@@ -571,7 +573,7 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
 		if (cfg->fc_encap) {
 			if (fib_encap_match(net, cfg->fc_encap_type,
 					    cfg->fc_encap, cfg->fc_oif,
-					    fi->fib_nh))
+					    fi->fib_nh, cfg))
 			    return 1;
 		}
 		if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) &&
@@ -663,7 +665,7 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
 static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
 			struct fib_nh *nh)
 {
-	int err;
+	int err = 0;
 	struct net *net;
 	struct net_device *dev;
 
@@ -1005,7 +1007,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
 			if (cfg->fc_oif)
 				dev = __dev_get_by_index(net, cfg->fc_oif);
 			err = lwtunnel_build_state(dev, cfg->fc_encap_type,
-						   cfg->fc_encap, &lwtstate);
+						   cfg->fc_encap, AF_INET, cfg,
+						   &lwtstate);
 			if (err)
 				goto failure;
 
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 289b6c26ce37..934f2ac8ad61 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -204,6 +204,7 @@ static const struct nla_policy ip_tun_policy[LWTUNNEL_IP_MAX + 1] = {
 };
 
 static int ip_tun_build_state(struct net_device *dev, struct nlattr *attr,
+			      unsigned int family, const void *cfg,
 			      struct lwtunnel_state **ts)
 {
 	struct ip_tunnel_info *tun_info;
@@ -311,6 +312,7 @@ static const struct nla_policy ip6_tun_policy[LWTUNNEL_IP6_MAX + 1] = {
 };
 
 static int ip6_tun_build_state(struct net_device *dev, struct nlattr *attr,
+			       unsigned int family, const void *cfg,
 			       struct lwtunnel_state **ts)
 {
 	struct ip_tunnel_info *tun_info;
diff --git a/net/ipv6/ila.c b/net/ipv6/ila.c
index f011c3d5ca40..ffe4dcad6088 100644
--- a/net/ipv6/ila.c
+++ b/net/ipv6/ila.c
@@ -123,6 +123,7 @@ static struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = {
 };
 
 static int ila_build_state(struct net_device *dev, struct nlattr *nla,
+			   unsigned int family, const void *cfg,
 			   struct lwtunnel_state **ts)
 {
 	struct ila_params *p;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index e476f01add87..df3e353a012d 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1819,7 +1819,8 @@ int ip6_route_add(struct fib6_config *cfg)
 		struct lwtunnel_state *lwtstate;
 
 		err = lwtunnel_build_state(dev, cfg->fc_encap_type,
-					   cfg->fc_encap, &lwtstate);
+					   cfg->fc_encap, AF_INET6, cfg,
+					   &lwtstate);
 		if (err)
 			goto out;
 		rt->dst.lwtstate = lwtstate_get(lwtstate);
diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c
index 3da5ca3ba563..21e70bc9af98 100644
--- a/net/mpls/mpls_iptunnel.c
+++ b/net/mpls/mpls_iptunnel.c
@@ -123,6 +123,7 @@ drop:
 }
 
 static int mpls_build_state(struct net_device *dev, struct nlattr *nla,
+			    unsigned int family, const void *cfg,
 			    struct lwtunnel_state **ts)
 {
 	struct mpls_iptunnel_encap *tun_encap_info;
-- 
cgit v1.2.3


From 6f021c62d64f38092bc2a0c5fe7b81d5e5b21a00 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 21 Aug 2015 12:30:00 -0700
Subject: tcp: fix slow start after idle vs TSO/GSO

slow start after idle might reduce cwnd, but we perform this
after first packet was cooked and sent.

With TSO/GSO, it means that we might send a full TSO packet
even if cwnd should have been reduced to IW10.

Moving the SSAI check in skb_entail() makes sense, because
we slightly reduce number of times this check is done,
especially for large send() and TCP Small queue callbacks from
softirq context.

As Neal pointed out, we also need to perform the check
if/when receive window opens.

Tested:

Following packetdrill test demonstrates the problem
// Test of slow start after idle

`sysctl -q net.ipv4.tcp_slow_start_after_idle=1`

0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+0    setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+0    bind(3, ..., ...) = 0
+0    listen(3, 1) = 0

+0    < S 0:0(0) win 65535 <mss 1000,sackOK,nop,nop,nop,wscale 7>
+0    > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 6>
+.100 < . 1:1(0) ack 1 win 511
+0    accept(3, ..., ...) = 4
+0    setsockopt(4, SOL_SOCKET, SO_SNDBUF, [200000], 4) = 0

+0    write(4, ..., 26000) = 26000
+0    > . 1:5001(5000) ack 1
+0    > . 5001:10001(5000) ack 1
+0    %{ assert tcpi_snd_cwnd == 10 }%

+.100 < . 1:1(0) ack 10001 win 511
+0    %{ assert tcpi_snd_cwnd == 20, tcpi_snd_cwnd }%
+0    > . 10001:20001(10000) ack 1
+0    > P. 20001:26001(6000) ack 1

+.100 < . 1:1(0) ack 26001 win 511
+0    %{ assert tcpi_snd_cwnd == 36, tcpi_snd_cwnd }%

+4 write(4, ..., 20000) = 20000
// If slow start after idle works properly, we should send 5 MSS here (cwnd/2)
+0    > . 26001:31001(5000) ack 1
+0    %{ assert tcpi_snd_cwnd == 10, tcpi_snd_cwnd }%
+0    > . 31001:36001(5000) ack 1

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h     | 13 +++++++++++++
 net/ipv4/tcp.c        |  2 ++
 net/ipv4/tcp_input.c  |  3 +++
 net/ipv4/tcp_output.c | 12 ++++--------
 4 files changed, 22 insertions(+), 8 deletions(-)

(limited to 'include/net')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 364426a2be5a..309801f7eb82 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1165,6 +1165,19 @@ static inline void tcp_sack_reset(struct tcp_options_received *rx_opt)
 }
 
 u32 tcp_default_init_rwnd(u32 mss);
+void tcp_cwnd_restart(struct sock *sk, s32 delta);
+
+static inline void tcp_slow_start_after_idle_check(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	s32 delta;
+
+	if (!sysctl_tcp_slow_start_after_idle || tp->packets_out)
+		return;
+	delta = tcp_time_stamp - tp->lsndtime;
+	if (delta > inet_csk(sk)->icsk_rto)
+		tcp_cwnd_restart(sk, delta);
+}
 
 /* Determine a window scaling and initial window to offer. */
 void tcp_select_initial_window(int __space, __u32 mss, __u32 *rcv_wnd,
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 45534a5ab430..b8b8fa184f75 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -627,6 +627,8 @@ static void skb_entail(struct sock *sk, struct sk_buff *skb)
 	sk_mem_charge(sk, skb->truesize);
 	if (tp->nonagle & TCP_NAGLE_PUSH)
 		tp->nonagle &= ~TCP_NAGLE_PUSH;
+
+	tcp_slow_start_after_idle_check(sk);
 }
 
 static inline void tcp_mark_urg(struct tcp_sock *tp, int flags)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 4e4d6bcd0ca9..0abca2841de2 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3332,6 +3332,9 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32
 			tp->pred_flags = 0;
 			tcp_fast_path_check(sk);
 
+			if (tcp_send_head(sk))
+				tcp_slow_start_after_idle_check(sk);
+
 			if (nwin > tp->max_window) {
 				tp->max_window = nwin;
 				tcp_sync_mss(sk, inet_csk(sk)->icsk_pmtu_cookie);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 444ab5beecbd..1188e4fcf23b 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -137,12 +137,12 @@ static __u16 tcp_advertise_mss(struct sock *sk)
 }
 
 /* RFC2861. Reset CWND after idle period longer RTO to "restart window".
- * This is the first part of cwnd validation mechanism. */
-static void tcp_cwnd_restart(struct sock *sk, const struct dst_entry *dst)
+ * This is the first part of cwnd validation mechanism.
+ */
+void tcp_cwnd_restart(struct sock *sk, s32 delta)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	s32 delta = tcp_time_stamp - tp->lsndtime;
-	u32 restart_cwnd = tcp_init_cwnd(tp, dst);
+	u32 restart_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk));
 	u32 cwnd = tp->snd_cwnd;
 
 	tcp_ca_event(sk, CA_EVENT_CWND_RESTART);
@@ -164,10 +164,6 @@ static void tcp_event_data_sent(struct tcp_sock *tp,
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	const u32 now = tcp_time_stamp;
 
-	if (sysctl_tcp_slow_start_after_idle &&
-	    (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto))
-		tcp_cwnd_restart(sk, __sk_dst_get(sk));
-
 	tp->lsndtime = now;
 
 	/* If it is a reply for ato after last received
-- 
cgit v1.2.3


From 43e122b014c955a33220fabbd09c4b5e4f422c3c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 21 Aug 2015 17:38:02 -0700
Subject: tcp: refine pacing rate determination

When TCP pacing was added back in linux-3.12, we chose
to apply a fixed ratio of 200 % against current rate,
to allow probing for optimal throughput even during
slow start phase, where cwnd can be doubled every other gRTT.

At Google, we found it was better applying a different ratio
while in Congestion Avoidance phase.
This ratio was set to 120 %.

We've used the normal tcp_in_slow_start() helper for a while,
then tuned the condition to select the conservative ratio
as soon as cwnd >= ssthresh/2 :

- After cwnd reduction, it is safer to ramp up more slowly,
  as we approach optimal cwnd.
- Initial ramp up (ssthresh == INFINITY) still allows doubling
  cwnd every other RTT.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt | 15 +++++++++++++++
 include/net/tcp.h                      |  2 ++
 net/ipv4/sysctl_net_ipv4.c             | 19 +++++++++++++++++++
 net/ipv4/tcp_input.c                   | 18 +++++++++++++++++-
 4 files changed, 53 insertions(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 46e88ed7f41d..ac77a13d2ea2 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -586,6 +586,21 @@ tcp_min_tso_segs - INTEGER
 	if available window is too small.
 	Default: 2
 
+tcp_pacing_ss_ratio - INTEGER
+	sk->sk_pacing_rate is set by TCP stack using a ratio applied
+	to current rate. (current_rate = cwnd * mss / srtt)
+	If TCP is in slow start, tcp_pacing_ss_ratio is applied
+	to let TCP probe for bigger speeds, assuming cwnd can be
+	doubled every other RTT.
+	Default: 200
+
+tcp_pacing_ca_ratio - INTEGER
+	sk->sk_pacing_rate is set by TCP stack using a ratio applied
+	to current rate. (current_rate = cwnd * mss / srtt)
+	If TCP is in congestion avoidance phase, tcp_pacing_ca_ratio
+	is applied to conservatively probe for bigger throughput.
+	Default: 120
+
 tcp_tso_win_divisor - INTEGER
 	This allows control over what percentage of the congestion window
 	can be consumed by a single TSO frame.
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 309801f7eb82..4a7b03947a38 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -281,6 +281,8 @@ extern unsigned int sysctl_tcp_notsent_lowat;
 extern int sysctl_tcp_min_tso_segs;
 extern int sysctl_tcp_autocorking;
 extern int sysctl_tcp_invalid_ratelimit;
+extern int sysctl_tcp_pacing_ss_ratio;
+extern int sysctl_tcp_pacing_ca_ratio;
 
 extern atomic_long_t tcp_memory_allocated;
 extern struct percpu_counter tcp_sockets_allocated;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 0330ab2e2b63..879bdc5c95b1 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -29,6 +29,7 @@
 static int zero;
 static int one = 1;
 static int four = 4;
+static int thousand = 1000;
 static int gso_max_segs = GSO_MAX_SEGS;
 static int tcp_retr1_max = 255;
 static int ip_local_port_range_min[] = { 1, 1 };
@@ -711,6 +712,24 @@ static struct ctl_table ipv4_table[] = {
 		.extra1		= &one,
 		.extra2		= &gso_max_segs,
 	},
+	{
+		.procname	= "tcp_pacing_ss_ratio",
+		.data		= &sysctl_tcp_pacing_ss_ratio,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &thousand,
+	},
+	{
+		.procname	= "tcp_pacing_ca_ratio",
+		.data		= &sysctl_tcp_pacing_ca_ratio,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &thousand,
+	},
 	{
 		.procname	= "tcp_autocorking",
 		.data		= &sysctl_tcp_autocorking,
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 0abca2841de2..dc08e2352665 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -753,13 +753,29 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us)
  * TCP pacing, to smooth the burst on large writes when packets
  * in flight is significantly lower than cwnd (or rwin)
  */
+int sysctl_tcp_pacing_ss_ratio __read_mostly = 200;
+int sysctl_tcp_pacing_ca_ratio __read_mostly = 120;
+
 static void tcp_update_pacing_rate(struct sock *sk)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 	u64 rate;
 
 	/* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */
-	rate = (u64)tp->mss_cache * 2 * (USEC_PER_SEC << 3);
+	rate = (u64)tp->mss_cache * ((USEC_PER_SEC / 100) << 3);
+
+	/* current rate is (cwnd * mss) / srtt
+	 * In Slow Start [1], set sk_pacing_rate to 200 % the current rate.
+	 * In Congestion Avoidance phase, set it to 120 % the current rate.
+	 *
+	 * [1] : Normal Slow Start condition is (tp->snd_cwnd < tp->snd_ssthresh)
+	 *	 If snd_cwnd >= (tp->snd_ssthresh / 2), we are approaching
+	 *	 end of slow start and should slow down.
+	 */
+	if (tp->snd_cwnd < tp->snd_ssthresh / 2)
+		rate *= sysctl_tcp_pacing_ss_ratio;
+	else
+		rate *= sysctl_tcp_pacing_ca_ratio;
 
 	rate *= max(tp->snd_cwnd, tp->packets_out);
 
-- 
cgit v1.2.3


From 2c0027cd54cc3ed856e87d9aeddb6ef00f5f17f4 Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Sun, 23 Aug 2015 08:21:22 -0600
Subject: inetpeer: remove dead code

Remove various inlined functions not referenced in the kernel.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inetpeer.h | 67 --------------------------------------------------
 1 file changed, 67 deletions(-)

(limited to 'include/net')

diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h
index d5332ddcea3f..002f0bd27001 100644
--- a/include/net/inetpeer.h
+++ b/include/net/inetpeer.h
@@ -65,71 +65,12 @@ struct inet_peer_base {
 	int			total;
 };
 
-#define INETPEER_BASE_BIT	0x1UL
-
-static inline struct inet_peer *inetpeer_ptr(unsigned long val)
-{
-	BUG_ON(val & INETPEER_BASE_BIT);
-	return (struct inet_peer *) val;
-}
-
-static inline struct inet_peer_base *inetpeer_base_ptr(unsigned long val)
-{
-	if (!(val & INETPEER_BASE_BIT))
-		return NULL;
-	val &= ~INETPEER_BASE_BIT;
-	return (struct inet_peer_base *) val;
-}
-
-static inline bool inetpeer_ptr_is_peer(unsigned long val)
-{
-	return !(val & INETPEER_BASE_BIT);
-}
-
-static inline void __inetpeer_ptr_set_peer(unsigned long *val, struct inet_peer *peer)
-{
-	/* This implicitly clears INETPEER_BASE_BIT */
-	*val = (unsigned long) peer;
-}
-
-static inline bool inetpeer_ptr_set_peer(unsigned long *ptr, struct inet_peer *peer)
-{
-	unsigned long val = (unsigned long) peer;
-	unsigned long orig = *ptr;
-
-	if (!(orig & INETPEER_BASE_BIT) ||
-	    cmpxchg(ptr, orig, val) != orig)
-		return false;
-	return true;
-}
-
-static inline void inetpeer_init_ptr(unsigned long *ptr, struct inet_peer_base *base)
-{
-	*ptr = (unsigned long) base | INETPEER_BASE_BIT;
-}
-
-static inline void inetpeer_transfer_peer(unsigned long *to, unsigned long *from)
-{
-	unsigned long val = *from;
-
-	*to = val;
-	if (inetpeer_ptr_is_peer(val)) {
-		struct inet_peer *peer = inetpeer_ptr(val);
-		atomic_inc(&peer->refcnt);
-	}
-}
-
 void inet_peer_base_init(struct inet_peer_base *);
 
 void inet_initpeers(void) __init;
 
 #define INETPEER_METRICS_NEW	(~(u32) 0)
 
-static inline bool inet_metrics_new(const struct inet_peer *p)
-{
-	return p->metrics[RTAX_LOCK-1] == INETPEER_METRICS_NEW;
-}
-
 /* can be called with or without local BH being disabled */
 struct inet_peer *inet_getpeer(struct inet_peer_base *base,
 			       const struct inetpeer_addr *daddr,
@@ -163,12 +104,4 @@ bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout);
 
 void inetpeer_invalidate_tree(struct inet_peer_base *);
 
-/*
- * temporary check to make sure we dont access rid, tcp_ts,
- * tcp_ts_stamp if no refcount is taken on inet_peer
- */
-static inline void inet_peer_refcheck(const struct inet_peer *p)
-{
-	WARN_ON_ONCE(atomic_read(&p->refcnt) <= 0);
-}
 #endif /* _NET_INETPEER_H */
-- 
cgit v1.2.3


From 48e92c44bd73a8bc213560058e6b18e45929526e Mon Sep 17 00:00:00 2001
From: Jiri Benc <jbenc@redhat.com>
Date: Tue, 25 Aug 2015 18:36:50 +0200
Subject: vxlan: fix multiple inclusion of vxlan.h

The vxlan_get_sk_family inline function was added after the last #endif,
making multiple inclusion of net/vxlan.h fail. Move it to the proper place.

Reported-by: Mark Rustad <mark.d.rustad@intel.com>
Fixes: 705cc62f6728c ("vxlan: provide access function for vxlan socket address family")
Signed-off-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/vxlan.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index 6b3234599a2c..480a319b4c92 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -241,9 +241,10 @@ static inline void vxlan_get_rx_port(struct net_device *netdev)
 {
 }
 #endif
-#endif
 
 static inline unsigned short vxlan_get_sk_family(struct vxlan_sock *vs)
 {
 	return vs->sock->sk->sk_family;
 }
+
+#endif
-- 
cgit v1.2.3


From 3c645621b79828be7a46fb2694eb423b343b4bbe Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@plumgrid.com>
Date: Tue, 25 Aug 2015 20:06:31 -0700
Subject: net_sched: make tcf_hash_destroy() static

tcf_hash_destroy() used once. Make it static.

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/act_api.h | 1 -
 net/sched/act_api.c   | 3 +--
 2 files changed, 1 insertion(+), 3 deletions(-)

(limited to 'include/net')

diff --git a/include/net/act_api.h b/include/net/act_api.h
index 4519c81304bd..9d446f136607 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -111,7 +111,6 @@ struct tc_action_ops {
 };
 
 int tcf_hash_search(struct tc_action *a, u32 index);
-void tcf_hash_destroy(struct tc_action *a);
 u32 tcf_hash_new_index(struct tcf_hashinfo *hinfo);
 int tcf_hash_check(u32 index, struct tc_action *a, int bind);
 int tcf_hash_create(u32 index, struct nlattr *est, struct tc_action *a,
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index b087087ccfa9..06e7c4a37245 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -36,7 +36,7 @@ static void free_tcf(struct rcu_head *head)
 	kfree(p);
 }
 
-void tcf_hash_destroy(struct tc_action *a)
+static void tcf_hash_destroy(struct tc_action *a)
 {
 	struct tcf_common *p = a->priv;
 	struct tcf_hashinfo *hinfo = a->ops->hinfo;
@@ -52,7 +52,6 @@ void tcf_hash_destroy(struct tc_action *a)
 	 */
 	call_rcu(&p->tcfc_rcu, free_tcf);
 }
-EXPORT_SYMBOL(tcf_hash_destroy);
 
 int __tcf_hash_release(struct tc_action *a, bool bind, bool strict)
 {
-- 
cgit v1.2.3


From cff82457c5584f6a96d2b85d1a88b81ba304a330 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@plumgrid.com>
Date: Tue, 25 Aug 2015 20:06:35 -0700
Subject: net_sched: act_bpf: remove spinlock in fast path

Similar to act_gact/act_mirred, act_bpf can be lockless in packet processing
with extra care taken to free bpf programs after rcu grace period.
Replacement of existing act_bpf (very rare) is done with synchronize_rcu()
and final destruction is done from tc_action_ops->cleanup() callback that is
called from tcf_exts_destroy()->tcf_action_destroy()->__tcf_hash_release() when
bind and refcnt reach zero which is only possible when classifier is destroyed.
Previous two patches fixed the last two classifiers (tcindex and rsvp) to
call tcf_exts_destroy() from rcu callback.

Similar to gact/mirred there is a race between prog->filter and
prog->tcf_action. Meaning that the program being replaced may use
previous default action if it happened to return TC_ACT_UNSPEC.
act_mirred race betwen tcf_action and tcfm_dev is similar.
In all cases the race is harmless.
Long term we may want to improve the situation by replacing the whole
tc_action->priv as single pointer instead of updating inner fields one by one.

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tc_act/tc_bpf.h |  2 +-
 net/sched/act_bpf.c         | 36 +++++++++++++++++++-----------------
 2 files changed, 20 insertions(+), 18 deletions(-)

(limited to 'include/net')

diff --git a/include/net/tc_act/tc_bpf.h b/include/net/tc_act/tc_bpf.h
index a152e9858b2c..958d69cfb19c 100644
--- a/include/net/tc_act/tc_bpf.h
+++ b/include/net/tc_act/tc_bpf.h
@@ -15,7 +15,7 @@
 
 struct tcf_bpf {
 	struct tcf_common	common;
-	struct bpf_prog		*filter;
+	struct bpf_prog __rcu	*filter;
 	union {
 		u32		bpf_fd;
 		u16		bpf_num_ops;
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index 458cf647e698..559bfa011bda 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -37,25 +37,24 @@ static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act,
 		   struct tcf_result *res)
 {
 	struct tcf_bpf *prog = act->priv;
+	struct bpf_prog *filter;
 	int action, filter_res;
 	bool at_ingress = G_TC_AT(skb->tc_verd) & AT_INGRESS;
 
 	if (unlikely(!skb_mac_header_was_set(skb)))
 		return TC_ACT_UNSPEC;
 
-	spin_lock(&prog->tcf_lock);
-
-	prog->tcf_tm.lastuse = jiffies;
-	bstats_update(&prog->tcf_bstats, skb);
+	tcf_lastuse_update(&prog->tcf_tm);
+	bstats_cpu_update(this_cpu_ptr(prog->common.cpu_bstats), skb);
 
-	/* Needed here for accessing maps. */
 	rcu_read_lock();
+	filter = rcu_dereference(prog->filter);
 	if (at_ingress) {
 		__skb_push(skb, skb->mac_len);
-		filter_res = BPF_PROG_RUN(prog->filter, skb);
+		filter_res = BPF_PROG_RUN(filter, skb);
 		__skb_pull(skb, skb->mac_len);
 	} else {
-		filter_res = BPF_PROG_RUN(prog->filter, skb);
+		filter_res = BPF_PROG_RUN(filter, skb);
 	}
 	rcu_read_unlock();
 
@@ -77,7 +76,7 @@ static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act,
 		break;
 	case TC_ACT_SHOT:
 		action = filter_res;
-		prog->tcf_qstats.drops++;
+		qstats_drop_inc(this_cpu_ptr(prog->common.cpu_qstats));
 		break;
 	case TC_ACT_UNSPEC:
 		action = prog->tcf_action;
@@ -87,7 +86,6 @@ static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act,
 		break;
 	}
 
-	spin_unlock(&prog->tcf_lock);
 	return action;
 }
 
@@ -263,7 +261,10 @@ static void tcf_bpf_prog_fill_cfg(const struct tcf_bpf *prog,
 				  struct tcf_bpf_cfg *cfg)
 {
 	cfg->is_ebpf = tcf_bpf_is_ebpf(prog);
-	cfg->filter = prog->filter;
+	/* updates to prog->filter are prevented, since it's called either
+	 * with rtnl lock or during final cleanup in rcu callback
+	 */
+	cfg->filter = rcu_dereference_protected(prog->filter, 1);
 
 	cfg->bpf_ops = prog->bpf_ops;
 	cfg->bpf_name = prog->bpf_name;
@@ -294,7 +295,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla,
 
 	if (!tcf_hash_check(parm->index, act, bind)) {
 		ret = tcf_hash_create(parm->index, est, act,
-				      sizeof(*prog), bind, false);
+				      sizeof(*prog), bind, true);
 		if (ret < 0)
 			return ret;
 
@@ -325,7 +326,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla,
 		goto out;
 
 	prog = to_bpf(act);
-	spin_lock_bh(&prog->tcf_lock);
+	ASSERT_RTNL();
 
 	if (res != ACT_P_CREATED)
 		tcf_bpf_prog_fill_cfg(prog, &old);
@@ -339,14 +340,15 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla,
 		prog->bpf_fd = cfg.bpf_fd;
 
 	prog->tcf_action = parm->action;
-	prog->filter = cfg.filter;
-
-	spin_unlock_bh(&prog->tcf_lock);
+	rcu_assign_pointer(prog->filter, cfg.filter);
 
-	if (res == ACT_P_CREATED)
+	if (res == ACT_P_CREATED) {
 		tcf_hash_insert(act);
-	else
+	} else {
+		/* make sure the program being replaced is no longer executing */
+		synchronize_rcu();
 		tcf_bpf_cfg_cleanup(&old);
+	}
 
 	return res;
 out:
-- 
cgit v1.2.3


From e79e259588a414589a016edc428ee8dd308f81ad Mon Sep 17 00:00:00 2001
From: Joe Stringer <joestringer@nicira.com>
Date: Wed, 26 Aug 2015 11:31:47 -0700
Subject: dst: Add __skb_dst_copy() variation

This variation on skb_dst_copy() doesn't require two skbs.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dst.h | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/dst.h b/include/net/dst.h
index ef8f1d43a203..4c4801645371 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -289,13 +289,18 @@ static inline void skb_dst_drop(struct sk_buff *skb)
 	}
 }
 
-static inline void skb_dst_copy(struct sk_buff *nskb, const struct sk_buff *oskb)
+static inline void __skb_dst_copy(struct sk_buff *nskb, unsigned long refdst)
 {
-	nskb->_skb_refdst = oskb->_skb_refdst;
+	nskb->_skb_refdst = refdst;
 	if (!(nskb->_skb_refdst & SKB_DST_NOREF))
 		dst_clone(skb_dst(nskb));
 }
 
+static inline void skb_dst_copy(struct sk_buff *nskb, const struct sk_buff *oskb)
+{
+	__skb_dst_copy(nskb, oskb->_skb_refdst);
+}
+
 /**
  * skb_dst_force - makes sure skb dst is refcounted
  * @skb: buffer
-- 
cgit v1.2.3


From 86ca02e77408bb58ba596c1a411ec7f631733690 Mon Sep 17 00:00:00 2001
From: Joe Stringer <joestringer@nicira.com>
Date: Wed, 26 Aug 2015 11:31:51 -0700
Subject: netfilter: connlabels: Export setting connlabel length

Add functions to change connlabel length into nf_conntrack_labels.c so
they may be reused by other modules like OVS and nftables without
needing to jump through xt_match_check() hoops.

Suggested-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Joe Stringer <joestringer@nicira.com>
Acked-by: Florian Westphal <fw@strlen.de>
Acked-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netfilter/nf_conntrack_labels.h |  4 ++++
 net/netfilter/nf_conntrack_labels.c         | 32 +++++++++++++++++++++++++++++
 net/netfilter/xt_connlabel.c                | 16 ++++-----------
 3 files changed, 40 insertions(+), 12 deletions(-)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_conntrack_labels.h b/include/net/netfilter/nf_conntrack_labels.h
index dec6336bf850..7e2b1d025f50 100644
--- a/include/net/netfilter/nf_conntrack_labels.h
+++ b/include/net/netfilter/nf_conntrack_labels.h
@@ -54,7 +54,11 @@ int nf_connlabels_replace(struct nf_conn *ct,
 #ifdef CONFIG_NF_CONNTRACK_LABELS
 int nf_conntrack_labels_init(void);
 void nf_conntrack_labels_fini(void);
+int nf_connlabels_get(struct net *net, unsigned int n_bits);
+void nf_connlabels_put(struct net *net);
 #else
 static inline int nf_conntrack_labels_init(void) { return 0; }
 static inline void nf_conntrack_labels_fini(void) {}
+static inline int nf_connlabels_get(struct net *net, unsigned int n_bits) { return 0; }
+static inline void nf_connlabels_put(struct net *net) {}
 #endif
diff --git a/net/netfilter/nf_conntrack_labels.c b/net/netfilter/nf_conntrack_labels.c
index daa7c1383bec..3ce5c314ea4b 100644
--- a/net/netfilter/nf_conntrack_labels.c
+++ b/net/netfilter/nf_conntrack_labels.c
@@ -14,6 +14,8 @@
 #include <net/netfilter/nf_conntrack_ecache.h>
 #include <net/netfilter/nf_conntrack_labels.h>
 
+static spinlock_t nf_connlabels_lock;
+
 static unsigned int label_bits(const struct nf_conn_labels *l)
 {
 	unsigned int longs = l->words;
@@ -89,6 +91,35 @@ int nf_connlabels_replace(struct nf_conn *ct,
 }
 EXPORT_SYMBOL_GPL(nf_connlabels_replace);
 
+int nf_connlabels_get(struct net *net, unsigned int n_bits)
+{
+	size_t words;
+
+	if (n_bits > (NF_CT_LABELS_MAX_SIZE * BITS_PER_BYTE))
+		return -ERANGE;
+
+	words = BITS_TO_LONGS(n_bits);
+
+	spin_lock(&nf_connlabels_lock);
+	net->ct.labels_used++;
+	if (words > net->ct.label_words)
+		net->ct.label_words = words;
+	spin_unlock(&nf_connlabels_lock);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nf_connlabels_get);
+
+void nf_connlabels_put(struct net *net)
+{
+	spin_lock(&nf_connlabels_lock);
+	net->ct.labels_used--;
+	if (net->ct.labels_used == 0)
+		net->ct.label_words = 0;
+	spin_unlock(&nf_connlabels_lock);
+}
+EXPORT_SYMBOL_GPL(nf_connlabels_put);
+
 static struct nf_ct_ext_type labels_extend __read_mostly = {
 	.len    = sizeof(struct nf_conn_labels),
 	.align  = __alignof__(struct nf_conn_labels),
@@ -97,6 +128,7 @@ static struct nf_ct_ext_type labels_extend __read_mostly = {
 
 int nf_conntrack_labels_init(void)
 {
+	spin_lock_init(&nf_connlabels_lock);
 	return nf_ct_extend_register(&labels_extend);
 }
 
diff --git a/net/netfilter/xt_connlabel.c b/net/netfilter/xt_connlabel.c
index 9f8719df2001..bb9cbeb18868 100644
--- a/net/netfilter/xt_connlabel.c
+++ b/net/netfilter/xt_connlabel.c
@@ -42,10 +42,6 @@ static int connlabel_mt_check(const struct xt_mtchk_param *par)
 			    XT_CONNLABEL_OP_SET;
 	struct xt_connlabel_mtinfo *info = par->matchinfo;
 	int ret;
-	size_t words;
-
-	if (info->bit > XT_CONNLABEL_MAXBIT)
-		return -ERANGE;
 
 	if (info->options & ~options) {
 		pr_err("Unknown options in mask %x\n", info->options);
@@ -59,19 +55,15 @@ static int connlabel_mt_check(const struct xt_mtchk_param *par)
 		return ret;
 	}
 
-	par->net->ct.labels_used++;
-	words = BITS_TO_LONGS(info->bit+1);
-	if (words > par->net->ct.label_words)
-		par->net->ct.label_words = words;
-
+	ret = nf_connlabels_get(par->net, info->bit + 1);
+	if (ret < 0)
+		nf_ct_l3proto_module_put(par->family);
 	return ret;
 }
 
 static void connlabel_mt_destroy(const struct xt_mtdtor_param *par)
 {
-	par->net->ct.labels_used--;
-	if (par->net->ct.labels_used == 0)
-		par->net->ct.label_words = 0;
+	nf_connlabels_put(par->net);
 	nf_ct_l3proto_module_put(par->family);
 }
 
-- 
cgit v1.2.3


From 3b3ae880266d148bf73a573a766bc9b78c08d805 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 26 Aug 2015 23:00:06 +0200
Subject: net: sched: consolidate tc_classify{,_compat}

For classifiers getting invoked via tc_classify(), we always need an
extra function call into tc_classify_compat(), as both are being
exported as symbols and tc_classify() itself doesn't do much except
handling of reclassifications when tp->classify() returned with
TC_ACT_RECLASSIFY.

CBQ and ATM are the only qdiscs that directly call into tc_classify_compat(),
all others use tc_classify(). When tc actions are being configured
out in the kernel, tc_classify() effectively does nothing besides
delegating.

We could spare this layer and consolidate both functions. pktgen on
single CPU constantly pushing skbs directly into the netif_receive_skb()
path with a dummy classifier on ingress qdisc attached, improves
slightly from 22.3Mpps to 23.1Mpps.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/pkt_sched.h  |  4 +---
 net/core/dev.c           |  2 +-
 net/sched/sch_api.c      | 55 ++++++++++++++++++++++--------------------------
 net/sched/sch_atm.c      |  2 +-
 net/sched/sch_cbq.c      |  2 +-
 net/sched/sch_choke.c    |  2 +-
 net/sched/sch_drr.c      |  2 +-
 net/sched/sch_dsmark.c   |  2 +-
 net/sched/sch_fq_codel.c |  2 +-
 net/sched/sch_hfsc.c     |  2 +-
 net/sched/sch_htb.c      |  2 +-
 net/sched/sch_multiq.c   |  2 +-
 net/sched/sch_prio.c     |  2 +-
 net/sched/sch_qfq.c      |  2 +-
 net/sched/sch_sfb.c      |  2 +-
 net/sched/sch_sfq.c      |  2 +-
 16 files changed, 40 insertions(+), 47 deletions(-)

(limited to 'include/net')

diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h
index 2342bf12cb78..401038d2f9b8 100644
--- a/include/net/pkt_sched.h
+++ b/include/net/pkt_sched.h
@@ -110,10 +110,8 @@ static inline void qdisc_run(struct Qdisc *q)
 		__qdisc_run(q);
 }
 
-int tc_classify_compat(struct sk_buff *skb, const struct tcf_proto *tp,
-		       struct tcf_result *res);
 int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp,
-		struct tcf_result *res);
+		struct tcf_result *res, bool compat_mode);
 
 static inline __be16 tc_skb_protocol(const struct sk_buff *skb)
 {
diff --git a/net/core/dev.c b/net/core/dev.c
index b1f3f4844e60..7bb24f1879b8 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3657,7 +3657,7 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb,
 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
 	qdisc_bstats_cpu_update(cl->q, skb);
 
-	switch (tc_classify(skb, cl, &cl_res)) {
+	switch (tc_classify(skb, cl, &cl_res, false)) {
 	case TC_ACT_OK:
 	case TC_ACT_RECLASSIFY:
 		skb->tc_index = TC_H_MIN(cl_res.classid);
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index f06aa01d60fd..59c227f26b56 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1806,51 +1806,46 @@ done:
  * to this qdisc, (optionally) tests for protocol and asks
  * specific classifiers.
  */
-int tc_classify_compat(struct sk_buff *skb, const struct tcf_proto *tp,
-		       struct tcf_result *res)
+int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp,
+		struct tcf_result *res, bool compat_mode)
 {
 	__be16 protocol = tc_skb_protocol(skb);
-	int err;
+#ifdef CONFIG_NET_CLS_ACT
+	const struct tcf_proto *old_tp = tp;
+	int limit = 0;
 
+reclassify:
+#endif
 	for (; tp; tp = rcu_dereference_bh(tp->next)) {
+		int err;
+
 		if (tp->protocol != protocol &&
 		    tp->protocol != htons(ETH_P_ALL))
 			continue;
-		err = tp->classify(skb, tp, res);
 
+		err = tp->classify(skb, tp, res);
+#ifdef CONFIG_NET_CLS_ACT
+		if (unlikely(err == TC_ACT_RECLASSIFY &&
+			     !compat_mode))
+			goto reset;
+#endif
 		if (err >= 0)
 			return err;
 	}
-	return -1;
-}
-EXPORT_SYMBOL(tc_classify_compat);
 
-int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp,
-		struct tcf_result *res)
-{
-	int err = 0;
-#ifdef CONFIG_NET_CLS_ACT
-	const struct tcf_proto *otp = tp;
-	int limit = 0;
-reclassify:
-#endif
-
-	err = tc_classify_compat(skb, tp, res);
+	return -1;
 #ifdef CONFIG_NET_CLS_ACT
-	if (err == TC_ACT_RECLASSIFY) {
-		tp = otp;
-
-		if (unlikely(limit++ >= MAX_REC_LOOP)) {
-			net_notice_ratelimited("%s: packet reclassify loop rule prio %u protocol %02x\n",
-					       tp->q->ops->id,
-					       tp->prio & 0xffff,
-					       ntohs(tp->protocol));
-			return TC_ACT_SHOT;
-		}
-		goto reclassify;
+reset:
+	if (unlikely(limit++ >= MAX_REC_LOOP)) {
+		net_notice_ratelimited("%s: reclassify loop, rule prio %u, "
+				       "protocol %02x\n", tp->q->ops->id,
+				       tp->prio & 0xffff, ntohs(tp->protocol));
+		return TC_ACT_SHOT;
 	}
+
+	tp = old_tp;
+	goto reclassify;
 #endif
-	return err;
 }
 EXPORT_SYMBOL(tc_classify);
 
diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c
index e3e2cc5fd068..1911af3ca7c0 100644
--- a/net/sched/sch_atm.c
+++ b/net/sched/sch_atm.c
@@ -375,7 +375,7 @@ static int atm_tc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 		list_for_each_entry(flow, &p->flows, list) {
 			fl = rcu_dereference_bh(flow->filter_list);
 			if (fl) {
-				result = tc_classify_compat(skb, fl, &res);
+				result = tc_classify(skb, fl, &res, true);
 				if (result < 0)
 					continue;
 				flow = (struct atm_flow_data *)res.class;
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index beeb75f80fdb..c538d9e4a8f6 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -240,7 +240,7 @@ cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
 		/*
 		 * Step 2+n. Apply classifier.
 		 */
-		result = tc_classify_compat(skb, fl, &res);
+		result = tc_classify(skb, fl, &res, true);
 		if (!fl || result < 0)
 			goto fallback;
 
diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c
index 6a783afe4960..665bde07916b 100644
--- a/net/sched/sch_choke.c
+++ b/net/sched/sch_choke.c
@@ -201,7 +201,7 @@ static bool choke_classify(struct sk_buff *skb,
 	int result;
 
 	fl = rcu_dereference_bh(q->filter_list);
-	result = tc_classify(skb, fl, &res);
+	result = tc_classify(skb, fl, &res, false);
 	if (result >= 0) {
 #ifdef CONFIG_NET_CLS_ACT
 		switch (result) {
diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c
index 338706092c27..f26bdea875c1 100644
--- a/net/sched/sch_drr.c
+++ b/net/sched/sch_drr.c
@@ -331,7 +331,7 @@ static struct drr_class *drr_classify(struct sk_buff *skb, struct Qdisc *sch,
 
 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 	fl = rcu_dereference_bh(q->filter_list);
-	result = tc_classify(skb, fl, &res);
+	result = tc_classify(skb, fl, &res, false);
 	if (result >= 0) {
 #ifdef CONFIG_NET_CLS_ACT
 		switch (result) {
diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c
index 66700a6116aa..c4d45fd8c551 100644
--- a/net/sched/sch_dsmark.c
+++ b/net/sched/sch_dsmark.c
@@ -230,7 +230,7 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	else {
 		struct tcf_result res;
 		struct tcf_proto *fl = rcu_dereference_bh(p->filter_list);
-		int result = tc_classify(skb, fl, &res);
+		int result = tc_classify(skb, fl, &res, false);
 
 		pr_debug("result %d class 0x%04x\n", result, res.classid);
 
diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
index a9ba030435a2..4c834e93dafb 100644
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -92,7 +92,7 @@ static unsigned int fq_codel_classify(struct sk_buff *skb, struct Qdisc *sch,
 		return fq_codel_hash(q, skb) + 1;
 
 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
-	result = tc_classify(skb, filter, &res);
+	result = tc_classify(skb, filter, &res, false);
 	if (result >= 0) {
 #ifdef CONFIG_NET_CLS_ACT
 		switch (result) {
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index e6c7416d0332..b7ebe2c87586 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -1165,7 +1165,7 @@ hfsc_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 	head = &q->root;
 	tcf = rcu_dereference_bh(q->root.filter_list);
-	while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) {
+	while (tcf && (result = tc_classify(skb, tcf, &res, false)) >= 0) {
 #ifdef CONFIG_NET_CLS_ACT
 		switch (result) {
 		case TC_ACT_QUEUED:
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index cf4b0f865d1b..15ccd7f8fb2a 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -229,7 +229,7 @@ static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,
 	}
 
 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
-	while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) {
+	while (tcf && (result = tc_classify(skb, tcf, &res, false)) >= 0) {
 #ifdef CONFIG_NET_CLS_ACT
 		switch (result) {
 		case TC_ACT_QUEUED:
diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c
index 42dd218871e0..4e904ca0af9d 100644
--- a/net/sched/sch_multiq.c
+++ b/net/sched/sch_multiq.c
@@ -46,7 +46,7 @@ multiq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
 	int err;
 
 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
-	err = tc_classify(skb, fl, &res);
+	err = tc_classify(skb, fl, &res, false);
 #ifdef CONFIG_NET_CLS_ACT
 	switch (err) {
 	case TC_ACT_STOLEN:
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index 8e5cd34aaa74..ba6487f2741f 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -42,7 +42,7 @@ prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 	if (TC_H_MAJ(skb->priority) != sch->handle) {
 		fl = rcu_dereference_bh(q->filter_list);
-		err = tc_classify(skb, fl, &res);
+		err = tc_classify(skb, fl, &res, false);
 #ifdef CONFIG_NET_CLS_ACT
 		switch (err) {
 		case TC_ACT_STOLEN:
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index ffaeea63d473..3dc3a6e56052 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -717,7 +717,7 @@ static struct qfq_class *qfq_classify(struct sk_buff *skb, struct Qdisc *sch,
 
 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 	fl = rcu_dereference_bh(q->filter_list);
-	result = tc_classify(skb, fl, &res);
+	result = tc_classify(skb, fl, &res, false);
 	if (result >= 0) {
 #ifdef CONFIG_NET_CLS_ACT
 		switch (result) {
diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c
index dcdff5c769a1..5bbb6332ec57 100644
--- a/net/sched/sch_sfb.c
+++ b/net/sched/sch_sfb.c
@@ -258,7 +258,7 @@ static bool sfb_classify(struct sk_buff *skb, struct tcf_proto *fl,
 	struct tcf_result res;
 	int result;
 
-	result = tc_classify(skb, fl, &res);
+	result = tc_classify(skb, fl, &res, false);
 	if (result >= 0) {
 #ifdef CONFIG_NET_CLS_ACT
 		switch (result) {
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index 52f75a5473e1..3abab534eb5c 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -179,7 +179,7 @@ static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
 		return sfq_hash(q, skb) + 1;
 
 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
-	result = tc_classify(skb, fl, &res);
+	result = tc_classify(skb, fl, &res, false);
 	if (result >= 0) {
 #ifdef CONFIG_NET_CLS_ACT
 		switch (result) {
-- 
cgit v1.2.3


From c29a70d2cadfea443c027d23481f820530b70057 Mon Sep 17 00:00:00 2001
From: Pravin B Shelar <pshelar@nicira.com>
Date: Wed, 26 Aug 2015 23:46:50 -0700
Subject: tunnel: introduce udp_tun_rx_dst()

Introduce function udp_tun_rx_dst() to initialize tunnel dst on
receive path.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Reviewed-by: Jesse Gross <jesse@nicira.com>
Acked-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vxlan.c        | 29 +++-------------------
 include/net/dst_metadata.h | 61 ++++++++++++++++++++++++++++++++++++++++++++++
 include/net/udp_tunnel.h   |  4 +++
 net/ipv4/ip_gre.c          | 21 ++++------------
 net/ipv4/udp_tunnel.c      | 25 ++++++++++++++++++-
 5 files changed, 97 insertions(+), 43 deletions(-)

(limited to 'include/net')

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 61b457b9ec00..5b4cf66e632e 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -1264,36 +1264,13 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
 	}
 
 	if (vxlan_collect_metadata(vs)) {
-		tun_dst = metadata_dst_alloc(sizeof(*md), GFP_ATOMIC);
+		tun_dst = udp_tun_rx_dst(skb, vxlan_get_sk_family(vs), TUNNEL_KEY,
+					 cpu_to_be64(vni >> 8), sizeof(*md));
+
 		if (!tun_dst)
 			goto drop;
 
 		info = &tun_dst->u.tun_info;
-		if (vxlan_get_sk_family(vs) == AF_INET) {
-			const struct iphdr *iph = ip_hdr(skb);
-
-			info->key.u.ipv4.src = iph->saddr;
-			info->key.u.ipv4.dst = iph->daddr;
-			info->key.tos = iph->tos;
-			info->key.ttl = iph->ttl;
-		} else {
-			const struct ipv6hdr *ip6h = ipv6_hdr(skb);
-
-			info->key.u.ipv6.src = ip6h->saddr;
-			info->key.u.ipv6.dst = ip6h->daddr;
-			info->key.tos = ipv6_get_dsfield(ip6h);
-			info->key.ttl = ip6h->hop_limit;
-		}
-
-		info->key.tp_src = udp_hdr(skb)->source;
-		info->key.tp_dst = udp_hdr(skb)->dest;
-
-		info->mode = IP_TUNNEL_INFO_RX;
-		info->key.tun_flags = TUNNEL_KEY;
-		info->key.tun_id = cpu_to_be64(vni >> 8);
-		if (udp_hdr(skb)->check != 0)
-			info->key.tun_flags |= TUNNEL_CSUM;
-
 		md = ip_tunnel_info_opts(info, sizeof(*md));
 	} else {
 		memset(md, 0, sizeof(*md));
diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h
index 2cb52d562272..60c03326c087 100644
--- a/include/net/dst_metadata.h
+++ b/include/net/dst_metadata.h
@@ -48,4 +48,65 @@ static inline bool skb_valid_dst(const struct sk_buff *skb)
 struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags);
 struct metadata_dst __percpu *metadata_dst_alloc_percpu(u8 optslen, gfp_t flags);
 
+static inline struct metadata_dst *tun_rx_dst(__be16 flags,
+					      __be64 tunnel_id, int md_size)
+{
+	struct metadata_dst *tun_dst;
+	struct ip_tunnel_info *info;
+
+	tun_dst = metadata_dst_alloc(md_size, GFP_ATOMIC);
+	if (!tun_dst)
+		return NULL;
+
+	info = &tun_dst->u.tun_info;
+	info->mode = IP_TUNNEL_INFO_RX;
+	info->key.tun_flags = flags;
+	info->key.tun_id = tunnel_id;
+	info->key.tp_src = 0;
+	info->key.tp_dst = 0;
+	return tun_dst;
+}
+
+static inline struct metadata_dst *ip_tun_rx_dst(struct sk_buff *skb,
+						 __be16 flags,
+						 __be64 tunnel_id,
+						 int md_size)
+{
+	const struct iphdr *iph = ip_hdr(skb);
+	struct metadata_dst *tun_dst;
+	struct ip_tunnel_info *info;
+
+	tun_dst = tun_rx_dst(flags, tunnel_id, md_size);
+	if (!tun_dst)
+		return NULL;
+
+	info = &tun_dst->u.tun_info;
+	info->key.u.ipv4.src = iph->saddr;
+	info->key.u.ipv4.dst = iph->daddr;
+	info->key.tos = iph->tos;
+	info->key.ttl = iph->ttl;
+	return tun_dst;
+}
+
+static inline struct metadata_dst *ipv6_tun_rx_dst(struct sk_buff *skb,
+						 __be16 flags,
+						 __be64 tunnel_id,
+						 int md_size)
+{
+	const struct ipv6hdr *ip6h = ipv6_hdr(skb);
+	struct metadata_dst *tun_dst;
+	struct ip_tunnel_info *info;
+
+	tun_dst = tun_rx_dst(flags, tunnel_id, md_size);
+	if (!tun_dst)
+		return NULL;
+
+	info = &tun_dst->u.tun_info;
+	info->key.u.ipv6.src = ip6h->saddr;
+	info->key.u.ipv6.dst = ip6h->daddr;
+	info->key.tos = ipv6_get_dsfield(ip6h);
+	info->key.ttl = ip6h->hop_limit;
+	return tun_dst;
+}
+
 #endif /* __NET_DST_METADATA_H */
diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h
index c491c1221606..35041d0fc21e 100644
--- a/include/net/udp_tunnel.h
+++ b/include/net/udp_tunnel.h
@@ -93,6 +93,10 @@ int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk,
 
 void udp_tunnel_sock_release(struct socket *sock);
 
+struct metadata_dst *udp_tun_rx_dst(struct sk_buff *skb, unsigned short family,
+				    __be16 flags, __be64 tunnel_id,
+				    int md_size);
+
 static inline struct sk_buff *udp_tunnel_handle_offloads(struct sk_buff *skb,
 							 bool udp_csum)
 {
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 1bf328182697..faf1cde6f8da 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -400,25 +400,14 @@ static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi)
 	if (tunnel) {
 		skb_pop_mac_header(skb);
 		if (tunnel->collect_md) {
-			struct ip_tunnel_info *info;
+			__be16 flags;
+			__be64 tun_id;
 
-			tun_dst = metadata_dst_alloc(0, GFP_ATOMIC);
+			flags = tpi->flags & (TUNNEL_CSUM | TUNNEL_KEY);
+			tun_id = key_to_tunnel_id(tpi->key);
+			tun_dst = ip_tun_rx_dst(skb, flags, tun_id, 0);
 			if (!tun_dst)
 				return PACKET_REJECT;
-
-			info = &tun_dst->u.tun_info;
-			info->key.u.ipv4.src = iph->saddr;
-			info->key.u.ipv4.dst = iph->daddr;
-			info->key.tos = iph->tos;
-			info->key.ttl = iph->ttl;
-
-			info->mode = IP_TUNNEL_INFO_RX;
-			info->key.tun_flags = tpi->flags &
-					      (TUNNEL_CSUM | TUNNEL_KEY);
-			info->key.tun_id = key_to_tunnel_id(tpi->key);
-
-			info->key.tp_src = 0;
-			info->key.tp_dst = 0;
 		}
 
 		ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c
index 933ea903f7b8..aba428626b52 100644
--- a/net/ipv4/udp_tunnel.c
+++ b/net/ipv4/udp_tunnel.c
@@ -4,9 +4,10 @@
 #include <linux/udp.h>
 #include <linux/types.h>
 #include <linux/kernel.h>
+#include <net/dst_metadata.h>
+#include <net/net_namespace.h>
 #include <net/udp.h>
 #include <net/udp_tunnel.h>
-#include <net/net_namespace.h>
 
 int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg,
 		     struct socket **sockp)
@@ -103,4 +104,26 @@ void udp_tunnel_sock_release(struct socket *sock)
 }
 EXPORT_SYMBOL_GPL(udp_tunnel_sock_release);
 
+struct metadata_dst *udp_tun_rx_dst(struct sk_buff *skb,  unsigned short family,
+				    __be16 flags, __be64 tunnel_id, int md_size)
+{
+	struct metadata_dst *tun_dst;
+	struct ip_tunnel_info *info;
+
+	if (family == AF_INET)
+		tun_dst = ip_tun_rx_dst(skb, flags, tunnel_id, md_size);
+	else
+		tun_dst = ipv6_tun_rx_dst(skb, flags, tunnel_id, md_size);
+	if (!tun_dst)
+		return NULL;
+
+	info = &tun_dst->u.tun_info;
+	info->key.tp_src = udp_hdr(skb)->source;
+	info->key.tp_dst = udp_hdr(skb)->dest;
+	if (udp_hdr(skb)->check)
+		info->key.tun_flags |= TUNNEL_CSUM;
+	return tun_dst;
+}
+EXPORT_SYMBOL_GPL(udp_tun_rx_dst);
+
 MODULE_LICENSE("GPL");
-- 
cgit v1.2.3


From e305ac6cf5a1e1386aedce7ef9cb773635d5845c Mon Sep 17 00:00:00 2001
From: Pravin B Shelar <pshelar@nicira.com>
Date: Wed, 26 Aug 2015 23:46:52 -0700
Subject: geneve: Add support to collect tunnel metadata.

Following patch create new tunnel flag which enable
tunnel metadata collection on given device. These devices
can be used by tunnel metadata based routing or by OVS.
Geneve Consolidation patch get rid of collect_md_tun to
simplify tunnel lookup further.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Reviewed-by: Jesse Gross <jesse@nicira.com>
Acked-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/geneve.c         | 356 ++++++++++++++++++++++++++++++++-----------
 include/net/geneve.h         |   3 +
 include/uapi/linux/if_link.h |   1 +
 3 files changed, 275 insertions(+), 85 deletions(-)

(limited to 'include/net')

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index 0a6d9741d956..d05150cc25d4 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -15,6 +15,7 @@
 #include <linux/netdevice.h>
 #include <linux/etherdevice.h>
 #include <linux/hash.h>
+#include <net/dst_metadata.h>
 #include <net/rtnetlink.h>
 #include <net/geneve.h>
 
@@ -36,6 +37,7 @@ MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
 struct geneve_net {
 	struct list_head  geneve_list;
 	struct hlist_head vni_list[VNI_HASH_SIZE];
+	struct geneve_dev __rcu *collect_md_tun;
 };
 
 /* Pseudo network device */
@@ -50,6 +52,7 @@ struct geneve_dev {
 	struct sockaddr_in remote;	/* IPv4 address for link partner */
 	struct list_head   next;	/* geneve's per namespace list */
 	__be16		   dst_port;
+	bool		   collect_md;
 };
 
 static int geneve_net_id;
@@ -62,48 +65,95 @@ static inline __u32 geneve_net_vni_hash(u8 vni[3])
 	return hash_32(vnid, VNI_HASH_BITS);
 }
 
-/* geneve receive/decap routine */
-static void geneve_rx(struct geneve_sock *gs, struct sk_buff *skb)
+static __be64 vni_to_tunnel_id(const __u8 *vni)
+{
+#ifdef __BIG_ENDIAN
+	return (vni[0] << 16) | (vni[1] << 8) | vni[2];
+#else
+	return (__force __be64)(((__force u64)vni[0] << 40) |
+				((__force u64)vni[1] << 48) |
+				((__force u64)vni[2] << 56));
+#endif
+}
+
+static struct geneve_dev *geneve_lookup(struct geneve_net *gn,
+					struct geneve_sock *gs,
+					struct iphdr *iph,
+					struct genevehdr *gnvh)
 {
 	struct inet_sock *sk = inet_sk(gs->sock->sk);
-	struct genevehdr *gnvh = geneve_hdr(skb);
-	struct geneve_dev *dummy, *geneve = NULL;
-	struct geneve_net *gn;
-	struct iphdr *iph = NULL;
-	struct pcpu_sw_netstats *stats;
 	struct hlist_head *vni_list_head;
-	int err = 0;
+	struct geneve_dev *geneve;
 	__u32 hash;
 
-	iph = ip_hdr(skb); /* Still outer IP header... */
-
-	gn = gs->rcv_data;
+	geneve = rcu_dereference(gn->collect_md_tun);
+	if (geneve)
+		return geneve;
 
 	/* Find the device for this VNI */
 	hash = geneve_net_vni_hash(gnvh->vni);
 	vni_list_head = &gn->vni_list[hash];
-	hlist_for_each_entry_rcu(dummy, vni_list_head, hlist) {
-		if (!memcmp(gnvh->vni, dummy->vni, sizeof(dummy->vni)) &&
-		    iph->saddr == dummy->remote.sin_addr.s_addr &&
-		    sk->inet_sport == dummy->dst_port) {
-			geneve = dummy;
-			break;
+	hlist_for_each_entry_rcu(geneve, vni_list_head, hlist) {
+		if (!memcmp(gnvh->vni, geneve->vni, sizeof(geneve->vni)) &&
+		    iph->saddr == geneve->remote.sin_addr.s_addr &&
+		    sk->inet_sport == geneve->dst_port) {
+			return geneve;
 		}
 	}
+	return NULL;
+}
+
+/* geneve receive/decap routine */
+static void geneve_rx(struct geneve_sock *gs, struct sk_buff *skb)
+{
+	struct genevehdr *gnvh = geneve_hdr(skb);
+	struct metadata_dst *tun_dst = NULL;
+	struct geneve_dev *geneve = NULL;
+	struct pcpu_sw_netstats *stats;
+	struct geneve_net *gn;
+	struct iphdr *iph;
+	int err;
+
+	iph = ip_hdr(skb); /* Still outer IP header... */
+	gn = gs->rcv_data;
+	geneve = geneve_lookup(gn, gs, iph, gnvh);
 	if (!geneve)
 		goto drop;
 
-	/* Drop packets w/ critical options,
-	 * since we don't support any...
-	 */
-	if (gnvh->critical)
-		goto drop;
+	if (ip_tunnel_collect_metadata() || geneve->collect_md) {
+		__be16 flags;
+		void *opts;
+
+		flags = TUNNEL_KEY | TUNNEL_GENEVE_OPT |
+			(gnvh->oam ? TUNNEL_OAM : 0) |
+			(gnvh->critical ? TUNNEL_CRIT_OPT : 0);
+
+		tun_dst = udp_tun_rx_dst(skb, AF_INET, flags,
+					 vni_to_tunnel_id(gnvh->vni),
+					 gnvh->opt_len * 4);
+		if (!tun_dst)
+			goto drop;
+
+		/* Update tunnel dst according to Geneve options. */
+		opts = ip_tunnel_info_opts(&tun_dst->u.tun_info,
+					   gnvh->opt_len * 4);
+		memcpy(opts, gnvh->options, gnvh->opt_len * 4);
+	} else {
+		/* Drop packets w/ critical options,
+		 * since we don't support any...
+		 */
+		if (gnvh->critical)
+			goto drop;
+	}
 
 	skb_reset_mac_header(skb);
 	skb_scrub_packet(skb, !net_eq(geneve->net, dev_net(geneve->dev)));
 	skb->protocol = eth_type_trans(skb, geneve->dev);
 	skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
 
+	if (tun_dst)
+		skb_dst_set(skb, &tun_dst->dst);
+
 	/* Ignore packet loops (and multicast echo) */
 	if (ether_addr_equal(eth_hdr(skb)->h_source, geneve->dev->dev_addr))
 		goto drop;
@@ -131,7 +181,6 @@ static void geneve_rx(struct geneve_sock *gs, struct sk_buff *skb)
 	u64_stats_update_end(&stats->syncp);
 
 	netif_rx(skb);
-
 	return;
 drop:
 	/* Consume bad packet */
@@ -144,7 +193,6 @@ static int geneve_init(struct net_device *dev)
 	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
 	if (!dev->tstats)
 		return -ENOMEM;
-
 	return 0;
 }
 
@@ -180,69 +228,137 @@ static int geneve_stop(struct net_device *dev)
 	return 0;
 }
 
+static struct rtable *geneve_get_rt(struct sk_buff *skb,
+				    struct net_device *dev,
+				    struct flowi4 *fl4,
+				    struct ip_tunnel_info *info)
+{
+	struct geneve_dev *geneve = netdev_priv(dev);
+	struct rtable *rt = NULL;
+	__u8 tos;
+
+	memset(fl4, 0, sizeof(*fl4));
+	fl4->flowi4_mark = skb->mark;
+	fl4->flowi4_proto = IPPROTO_UDP;
+
+	if (info) {
+		fl4->daddr = info->key.u.ipv4.dst;
+		fl4->saddr = info->key.u.ipv4.src;
+		fl4->flowi4_tos = RT_TOS(info->key.tos);
+	} else {
+		tos = geneve->tos;
+		if (tos == 1) {
+			const struct iphdr *iip = ip_hdr(skb);
+
+			tos = ip_tunnel_get_dsfield(iip, skb);
+		}
+
+		fl4->flowi4_tos = RT_TOS(tos);
+		fl4->daddr = geneve->remote.sin_addr.s_addr;
+	}
+
+	rt = ip_route_output_key(geneve->net, fl4);
+	if (IS_ERR(rt)) {
+		netdev_dbg(dev, "no route to %pI4\n", &fl4->daddr);
+		dev->stats.tx_carrier_errors++;
+		return rt;
+	}
+	if (rt->dst.dev == dev) { /* is this necessary? */
+		netdev_dbg(dev, "circular route to %pI4\n", &fl4->daddr);
+		dev->stats.collisions++;
+		ip_rt_put(rt);
+		return ERR_PTR(-EINVAL);
+	}
+
+	return rt;
+}
+
+/* Convert 64 bit tunnel ID to 24 bit VNI. */
+static void tunnel_id_to_vni(__be64 tun_id, __u8 *vni)
+{
+#ifdef __BIG_ENDIAN
+	vni[0] = (__force __u8)(tun_id >> 16);
+	vni[1] = (__force __u8)(tun_id >> 8);
+	vni[2] = (__force __u8)tun_id;
+#else
+	vni[0] = (__force __u8)((__force u64)tun_id >> 40);
+	vni[1] = (__force __u8)((__force u64)tun_id >> 48);
+	vni[2] = (__force __u8)((__force u64)tun_id >> 56);
+#endif
+}
+
 static netdev_tx_t geneve_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct geneve_dev *geneve = netdev_priv(dev);
 	struct geneve_sock *gs = geneve->sock;
+	struct ip_tunnel_info *info = NULL;
 	struct rtable *rt = NULL;
 	const struct iphdr *iip; /* interior IP header */
 	struct flowi4 fl4;
-	int err;
-	__be16 sport;
 	__u8 tos, ttl;
+	__be16 sport;
+	bool xnet;
+	int err;
 
-	iip = ip_hdr(skb);
-
-	skb_reset_mac_header(skb);
-
-	/* TODO: port min/max limits should be configurable */
-	sport = udp_flow_src_port(dev_net(dev), skb, 0, 0, true);
-
-	tos = geneve->tos;
-	if (tos == 1)
-		tos = ip_tunnel_get_dsfield(iip, skb);
+	sport = udp_flow_src_port(geneve->net, skb, 1, USHRT_MAX, true);
 
-	memset(&fl4, 0, sizeof(fl4));
-	fl4.flowi4_tos = RT_TOS(tos);
-	fl4.daddr = geneve->remote.sin_addr.s_addr;
-	fl4.flowi4_mark = skb->mark;
-	fl4.flowi4_proto = IPPROTO_UDP;
+	if (geneve->collect_md) {
+		info = skb_tunnel_info(skb);
+		if (unlikely(info && info->mode != IP_TUNNEL_INFO_TX)) {
+			netdev_dbg(dev, "no tunnel metadata\n");
+			goto tx_error;
+		}
+	}
 
-	rt = ip_route_output_key(geneve->net, &fl4);
+	rt = geneve_get_rt(skb, dev, &fl4, info);
 	if (IS_ERR(rt)) {
 		netdev_dbg(dev, "no route to %pI4\n", &fl4.daddr);
 		dev->stats.tx_carrier_errors++;
 		goto tx_error;
 	}
-	if (rt->dst.dev == dev) { /* is this necessary? */
-		netdev_dbg(dev, "circular route to %pI4\n", &fl4.daddr);
-		dev->stats.collisions++;
-		goto rt_tx_error;
+	skb_reset_mac_header(skb);
+	xnet = !net_eq(geneve->net, dev_net(geneve->dev));
+
+	if (info) {
+		const struct ip_tunnel_key *key = &info->key;
+		bool udp_csum;
+		u8 *opts = NULL;
+		u8 vni[3];
+		__be16 df;
+
+		tunnel_id_to_vni(key->tun_id, vni);
+		df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;
+		udp_csum = !!(key->tun_flags & TUNNEL_CSUM);
+
+		if (key->tun_flags & TUNNEL_GENEVE_OPT)
+			opts = ip_tunnel_info_opts(info, info->options_len);
+
+		err = geneve_xmit_skb(gs, rt, skb, fl4.saddr, fl4.daddr,
+				      key->tos, key->ttl, df,
+				      sport, geneve->dst_port,
+				      key->tun_flags, vni,
+				      info->options_len, opts, udp_csum, xnet);
+	} else {
+		iip = ip_hdr(skb);
+		tos = ip_tunnel_ecn_encap(fl4.flowi4_tos, iip, skb);
+
+		ttl = geneve->ttl;
+		if (!ttl && IN_MULTICAST(ntohl(fl4.daddr)))
+			ttl = 1;
+
+		ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);
+
+		/* no need to handle local destination and encap bypass...yet... */
+		err = geneve_xmit_skb(gs, rt, skb, fl4.saddr, fl4.daddr, tos,
+				      ttl, 0, sport, geneve->dst_port, 0,
+				      geneve->vni, 0, NULL, false, xnet);
 	}
-
-	tos = ip_tunnel_ecn_encap(tos, iip, skb);
-
-	ttl = geneve->ttl;
-	if (!ttl && IN_MULTICAST(ntohl(fl4.daddr)))
-		ttl = 1;
-
-	ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);
-
-	/* no need to handle local destination and encap bypass...yet... */
-
-	err = geneve_xmit_skb(gs, rt, skb, fl4.saddr, fl4.daddr,
-			      tos, ttl, 0, sport, geneve->dst_port, 0,
-	                      geneve->vni, 0, NULL, false,
-	                      !net_eq(geneve->net, dev_net(geneve->dev)));
 	if (err < 0)
 		ip_rt_put(rt);
 
 	iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
-
 	return NETDEV_TX_OK;
 
-rt_tx_error:
-	ip_rt_put(rt);
 tx_error:
 	dev->stats.tx_errors++;
 	dev_kfree_skb(skb);
@@ -312,6 +428,7 @@ static const struct nla_policy geneve_policy[IFLA_GENEVE_MAX + 1] = {
 	[IFLA_GENEVE_TTL]		= { .type = NLA_U8 },
 	[IFLA_GENEVE_TOS]		= { .type = NLA_U8 },
 	[IFLA_GENEVE_PORT]		= { .type = NLA_U16 },
+	[IFLA_GENEVE_COLLECT_METADATA]	= { .type = NLA_FLAG },
 };
 
 static int geneve_validate(struct nlattr *tb[], struct nlattr *data[])
@@ -337,71 +454,112 @@ static int geneve_validate(struct nlattr *tb[], struct nlattr *data[])
 	return 0;
 }
 
-static int geneve_newlink(struct net *net, struct net_device *dev,
-			 struct nlattr *tb[], struct nlattr *data[])
+static int geneve_configure(struct net *net, struct net_device *dev,
+			    __be32 rem_addr, __u32 vni, __u8 ttl, __u8 tos,
+			    __u16 dst_port, bool metadata)
 {
 	struct geneve_net *gn = net_generic(net, geneve_net_id);
 	struct geneve_dev *dummy, *geneve = netdev_priv(dev);
 	struct hlist_head *vni_list_head;
 	struct sockaddr_in remote;	/* IPv4 address for link partner */
-	__u32 vni, hash;
-	__be16 dst_port;
+	__u32 hash;
 	int err;
 
-	if (!data[IFLA_GENEVE_ID] || !data[IFLA_GENEVE_REMOTE])
-		return -EINVAL;
+	if (metadata) {
+		if (rtnl_dereference(gn->collect_md_tun))
+			return -EEXIST;
+		if (!list_empty(&gn->geneve_list))
+			return -EPERM;
+	} else {
+		if (rtnl_dereference(gn->collect_md_tun))
+			return -EPERM;
+	}
 
 	geneve->net = net;
 	geneve->dev = dev;
 
-	vni = nla_get_u32(data[IFLA_GENEVE_ID]);
 	geneve->vni[0] = (vni & 0x00ff0000) >> 16;
 	geneve->vni[1] = (vni & 0x0000ff00) >> 8;
 	geneve->vni[2] =  vni & 0x000000ff;
 
-	geneve->remote.sin_addr.s_addr =
-		nla_get_in_addr(data[IFLA_GENEVE_REMOTE]);
+	geneve->remote.sin_addr.s_addr = rem_addr;
 	if (IN_MULTICAST(ntohl(geneve->remote.sin_addr.s_addr)))
 		return -EINVAL;
 
-	if (data[IFLA_GENEVE_PORT])
-		dst_port = htons(nla_get_u16(data[IFLA_GENEVE_PORT]));
-	else
-		dst_port = htons(GENEVE_UDP_PORT);
-
 	remote = geneve->remote;
+	if (metadata) {
+		if (rem_addr || vni || tos || ttl)
+			return -EINVAL;
+	}
+
 	hash = geneve_net_vni_hash(geneve->vni);
 	vni_list_head = &gn->vni_list[hash];
 	hlist_for_each_entry_rcu(dummy, vni_list_head, hlist) {
 		if (!memcmp(geneve->vni, dummy->vni, sizeof(dummy->vni)) &&
 		    !memcmp(&remote, &dummy->remote, sizeof(dummy->remote)) &&
-		    dst_port == dummy->dst_port) {
+		    htons(dst_port) == dummy->dst_port) {
 			return -EBUSY;
 		}
 	}
 
+	geneve->ttl = ttl;
+	geneve->tos = tos;
+	geneve->dst_port = htons(dst_port);
+	geneve->collect_md = metadata;
+
 	err = register_netdevice(dev);
 	if (err)
 		return err;
 
+	list_add(&geneve->next, &gn->geneve_list);
+	hlist_add_head_rcu(&geneve->hlist, &gn->vni_list[hash]);
+
+	if (geneve->collect_md)
+		rcu_assign_pointer(gn->collect_md_tun, geneve);
+	return 0;
+}
+
+static int geneve_newlink(struct net *net, struct net_device *dev,
+			  struct nlattr *tb[], struct nlattr *data[])
+{
+	__u16 dst_port = GENEVE_UDP_PORT;
+	__u8 ttl = 0, tos = 0;
+	bool metadata = false;
+	__be32 rem_addr;
+	__u32 vni;
+
+	if (!data[IFLA_GENEVE_ID] || !data[IFLA_GENEVE_REMOTE])
+		return -EINVAL;
+
+	vni = nla_get_u32(data[IFLA_GENEVE_ID]);
+	rem_addr = nla_get_in_addr(data[IFLA_GENEVE_REMOTE]);
+
 	if (data[IFLA_GENEVE_TTL])
-		geneve->ttl = nla_get_u8(data[IFLA_GENEVE_TTL]);
+		ttl = nla_get_u8(data[IFLA_GENEVE_TTL]);
 
 	if (data[IFLA_GENEVE_TOS])
-		geneve->tos = nla_get_u8(data[IFLA_GENEVE_TOS]);
+		tos = nla_get_u8(data[IFLA_GENEVE_TOS]);
 
-	geneve->dst_port = dst_port;
-	list_add(&geneve->next, &gn->geneve_list);
+	if (data[IFLA_GENEVE_PORT])
+		dst_port = nla_get_u16(data[IFLA_GENEVE_PORT]);
 
-	hlist_add_head_rcu(&geneve->hlist, &gn->vni_list[hash]);
+	if (data[IFLA_GENEVE_COLLECT_METADATA])
+		metadata = true;
 
-	return 0;
+	return geneve_configure(net, dev, rem_addr, vni,
+				ttl, tos, dst_port, metadata);
 }
 
 static void geneve_dellink(struct net_device *dev, struct list_head *head)
 {
 	struct geneve_dev *geneve = netdev_priv(dev);
 
+	if (geneve->collect_md) {
+		struct geneve_net *gn = net_generic(geneve->net, geneve_net_id);
+
+		rcu_assign_pointer(gn->collect_md_tun, NULL);
+	}
+
 	if (!hlist_unhashed(&geneve->hlist))
 		hlist_del_rcu(&geneve->hlist);
 
@@ -416,6 +574,7 @@ static size_t geneve_get_size(const struct net_device *dev)
 		nla_total_size(sizeof(__u8)) +  /* IFLA_GENEVE_TTL */
 		nla_total_size(sizeof(__u8)) +  /* IFLA_GENEVE_TOS */
 		nla_total_size(sizeof(__u16)) +  /* IFLA_GENEVE_PORT */
+		nla_total_size(0) +	 /* IFLA_GENEVE_COLLECT_METADATA */
 		0;
 }
 
@@ -439,6 +598,11 @@ static int geneve_fill_info(struct sk_buff *skb, const struct net_device *dev)
 	if (nla_put_u16(skb, IFLA_GENEVE_PORT, ntohs(geneve->dst_port)))
 		goto nla_put_failure;
 
+	if (geneve->collect_md) {
+		if (nla_put_flag(skb, IFLA_GENEVE_COLLECT_METADATA))
+			goto nla_put_failure;
+	}
+
 	return 0;
 
 nla_put_failure:
@@ -458,6 +622,28 @@ static struct rtnl_link_ops geneve_link_ops __read_mostly = {
 	.fill_info	= geneve_fill_info,
 };
 
+struct net_device *geneve_dev_create_fb(struct net *net, const char *name,
+					u8 name_assign_type, u16 dst_port)
+{
+	struct nlattr *tb[IFLA_MAX + 1];
+	struct net_device *dev;
+	int err;
+
+	memset(tb, 0, sizeof(tb));
+	dev = rtnl_create_link(net, name, name_assign_type,
+			       &geneve_link_ops, tb);
+	if (IS_ERR(dev))
+		return dev;
+
+	err = geneve_configure(net, dev, 0, 0, 0, 0, dst_port, true);
+	if (err) {
+		free_netdev(dev);
+		return ERR_PTR(err);
+	}
+	return dev;
+}
+EXPORT_SYMBOL_GPL(geneve_dev_create_fb);
+
 static __net_init int geneve_init_net(struct net *net)
 {
 	struct geneve_net *gn = net_generic(net, geneve_net_id);
diff --git a/include/net/geneve.h b/include/net/geneve.h
index 2a0543a1899d..4245e1d23b9b 100644
--- a/include/net/geneve.h
+++ b/include/net/geneve.h
@@ -96,6 +96,9 @@ int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt,
 		    __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port,
 		    __be16 tun_flags, u8 vni[3], u8 opt_len, u8 *opt,
 		    bool csum, bool xnet);
+
+struct net_device *geneve_dev_create_fb(struct net *net, const char *name,
+					u8 name_assign_type, u16 dst_port);
 #endif /*ifdef CONFIG_INET */
 
 #endif /*ifdef__NET_GENEVE_H */
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 9d73c31896d0..3a5f263cfc2f 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -411,6 +411,7 @@ enum {
 	IFLA_GENEVE_TTL,
 	IFLA_GENEVE_TOS,
 	IFLA_GENEVE_PORT,	/* destination port */
+	IFLA_GENEVE_COLLECT_METADATA,
 	__IFLA_GENEVE_MAX
 };
 #define IFLA_GENEVE_MAX	(__IFLA_GENEVE_MAX - 1)
-- 
cgit v1.2.3


From 371bd1061d29562e6423435073623add8c475ee2 Mon Sep 17 00:00:00 2001
From: Pravin B Shelar <pshelar@nicira.com>
Date: Wed, 26 Aug 2015 23:46:54 -0700
Subject: geneve: Consolidate Geneve functionality in single module.

geneve_core module handles send and receive functionality.
This way OVS could use the Geneve API. Now with use of
tunnel meatadata mode OVS can directly use Geneve netdevice.
So there is no need for separate module for Geneve. Following
patch consolidates Geneve protocol processing in single module.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Reviewed-by: Jesse Gross <jesse@nicira.com>
Acked-by: John W. Linville <linville@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/Kconfig    |   4 +-
 drivers/net/geneve.c   | 507 ++++++++++++++++++++++++++++++++++++++++---------
 include/net/geneve.h   |  34 ----
 net/ipv4/Kconfig       |  14 --
 net/ipv4/Makefile      |   1 -
 net/ipv4/geneve_core.c | 447 -------------------------------------------
 6 files changed, 421 insertions(+), 586 deletions(-)
 delete mode 100644 net/ipv4/geneve_core.c

(limited to 'include/net')

diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 770483b31d62..d18eb607bee6 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -180,8 +180,8 @@ config VXLAN
 	  will be called vxlan.
 
 config GENEVE
-       tristate "Generic Network Virtualization Encapsulation netdev"
-       depends on INET && GENEVE_CORE
+       tristate "Generic Network Virtualization Encapsulation"
+       depends on INET && NET_UDP_TUNNEL
        select NET_IP_TUNNEL
        ---help---
 	  This allows one to create geneve virtual interfaces that provide
diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index d05150cc25d4..90d4d433f1c9 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -18,6 +18,7 @@
 #include <net/dst_metadata.h>
 #include <net/rtnetlink.h>
 #include <net/geneve.h>
+#include <net/protocol.h>
 
 #define GENEVE_NETDEV_VER	"0.6"
 
@@ -33,13 +34,18 @@ static bool log_ecn_error = true;
 module_param(log_ecn_error, bool, 0644);
 MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
 
+#define GENEVE_VER 0
+#define GENEVE_BASE_HLEN (sizeof(struct udphdr) + sizeof(struct genevehdr))
+
 /* per-network namespace private data for this module */
 struct geneve_net {
-	struct list_head  geneve_list;
-	struct hlist_head vni_list[VNI_HASH_SIZE];
-	struct geneve_dev __rcu *collect_md_tun;
+	struct list_head	geneve_list;
+	struct hlist_head	vni_list[VNI_HASH_SIZE];
+	struct list_head	sock_list;
 };
 
+static int geneve_net_id;
+
 /* Pseudo network device */
 struct geneve_dev {
 	struct hlist_node  hlist;	/* vni hash table */
@@ -55,7 +61,15 @@ struct geneve_dev {
 	bool		   collect_md;
 };
 
-static int geneve_net_id;
+struct geneve_sock {
+	bool			collect_md;
+	struct geneve_net	*gn;
+	struct list_head	list;
+	struct socket		*sock;
+	struct rcu_head		rcu;
+	int			refcnt;
+	struct udp_offload	udp_offloads;
+};
 
 static inline __u32 geneve_net_vni_hash(u8 vni[3])
 {
@@ -76,51 +90,62 @@ static __be64 vni_to_tunnel_id(const __u8 *vni)
 #endif
 }
 
-static struct geneve_dev *geneve_lookup(struct geneve_net *gn,
-					struct geneve_sock *gs,
-					struct iphdr *iph,
-					struct genevehdr *gnvh)
+static struct geneve_dev *geneve_lookup(struct geneve_net *gn, __be16 port,
+					__be32 addr, u8 vni[])
 {
-	struct inet_sock *sk = inet_sk(gs->sock->sk);
 	struct hlist_head *vni_list_head;
 	struct geneve_dev *geneve;
 	__u32 hash;
 
-	geneve = rcu_dereference(gn->collect_md_tun);
-	if (geneve)
-		return geneve;
-
 	/* Find the device for this VNI */
-	hash = geneve_net_vni_hash(gnvh->vni);
+	hash = geneve_net_vni_hash(vni);
 	vni_list_head = &gn->vni_list[hash];
 	hlist_for_each_entry_rcu(geneve, vni_list_head, hlist) {
-		if (!memcmp(gnvh->vni, geneve->vni, sizeof(geneve->vni)) &&
-		    iph->saddr == geneve->remote.sin_addr.s_addr &&
-		    sk->inet_sport == geneve->dst_port) {
+		if (!memcmp(vni, geneve->vni, sizeof(geneve->vni)) &&
+		    addr == geneve->remote.sin_addr.s_addr &&
+		    port == geneve->dst_port) {
 			return geneve;
 		}
 	}
 	return NULL;
 }
 
+static inline struct genevehdr *geneve_hdr(const struct sk_buff *skb)
+{
+	return (struct genevehdr *)(udp_hdr(skb) + 1);
+}
+
 /* geneve receive/decap routine */
 static void geneve_rx(struct geneve_sock *gs, struct sk_buff *skb)
 {
+	struct inet_sock *sk = inet_sk(gs->sock->sk);
 	struct genevehdr *gnvh = geneve_hdr(skb);
+	struct geneve_net *gn = gs->gn;
 	struct metadata_dst *tun_dst = NULL;
 	struct geneve_dev *geneve = NULL;
 	struct pcpu_sw_netstats *stats;
-	struct geneve_net *gn;
 	struct iphdr *iph;
+	u8 *vni;
+	__be32 addr;
 	int err;
 
 	iph = ip_hdr(skb); /* Still outer IP header... */
-	gn = gs->rcv_data;
-	geneve = geneve_lookup(gn, gs, iph, gnvh);
+
+	if (gs->collect_md) {
+		static u8 zero_vni[3];
+
+		vni = zero_vni;
+		addr = 0;
+	} else {
+		vni = gnvh->vni;
+		addr = iph->saddr;
+	}
+
+	geneve = geneve_lookup(gn, sk->inet_sport, addr, vni);
 	if (!geneve)
 		goto drop;
 
-	if (ip_tunnel_collect_metadata() || geneve->collect_md) {
+	if (ip_tunnel_collect_metadata() || gs->collect_md) {
 		__be16 flags;
 		void *opts;
 
@@ -201,31 +226,326 @@ static void geneve_uninit(struct net_device *dev)
 	free_percpu(dev->tstats);
 }
 
+/* Callback from net/ipv4/udp.c to receive packets */
+static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
+{
+	struct genevehdr *geneveh;
+	struct geneve_sock *gs;
+	int opts_len;
+
+	/* Need Geneve and inner Ethernet header to be present */
+	if (unlikely(!pskb_may_pull(skb, GENEVE_BASE_HLEN)))
+		goto error;
+
+	/* Return packets with reserved bits set */
+	geneveh = geneve_hdr(skb);
+	if (unlikely(geneveh->ver != GENEVE_VER))
+		goto error;
+
+	if (unlikely(geneveh->proto_type != htons(ETH_P_TEB)))
+		goto error;
+
+	opts_len = geneveh->opt_len * 4;
+	if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len,
+				 htons(ETH_P_TEB)))
+		goto drop;
+
+	gs = rcu_dereference_sk_user_data(sk);
+	if (!gs)
+		goto drop;
+
+	geneve_rx(gs, skb);
+	return 0;
+
+drop:
+	/* Consume bad packet */
+	kfree_skb(skb);
+	return 0;
+
+error:
+	/* Let the UDP layer deal with the skb */
+	return 1;
+}
+
+static struct socket *geneve_create_sock(struct net *net, bool ipv6,
+					 __be16 port)
+{
+	struct socket *sock;
+	struct udp_port_cfg udp_conf;
+	int err;
+
+	memset(&udp_conf, 0, sizeof(udp_conf));
+
+	if (ipv6) {
+		udp_conf.family = AF_INET6;
+	} else {
+		udp_conf.family = AF_INET;
+		udp_conf.local_ip.s_addr = htonl(INADDR_ANY);
+	}
+
+	udp_conf.local_udp_port = port;
+
+	/* Open UDP socket */
+	err = udp_sock_create(net, &udp_conf, &sock);
+	if (err < 0)
+		return ERR_PTR(err);
+
+	return sock;
+}
+
+static void geneve_notify_add_rx_port(struct geneve_sock *gs)
+{
+	struct sock *sk = gs->sock->sk;
+	sa_family_t sa_family = sk->sk_family;
+	int err;
+
+	if (sa_family == AF_INET) {
+		err = udp_add_offload(&gs->udp_offloads);
+		if (err)
+			pr_warn("geneve: udp_add_offload failed with status %d\n",
+				err);
+	}
+}
+
+static int geneve_hlen(struct genevehdr *gh)
+{
+	return sizeof(*gh) + gh->opt_len * 4;
+}
+
+static struct sk_buff **geneve_gro_receive(struct sk_buff **head,
+					   struct sk_buff *skb,
+					   struct udp_offload *uoff)
+{
+	struct sk_buff *p, **pp = NULL;
+	struct genevehdr *gh, *gh2;
+	unsigned int hlen, gh_len, off_gnv;
+	const struct packet_offload *ptype;
+	__be16 type;
+	int flush = 1;
+
+	off_gnv = skb_gro_offset(skb);
+	hlen = off_gnv + sizeof(*gh);
+	gh = skb_gro_header_fast(skb, off_gnv);
+	if (skb_gro_header_hard(skb, hlen)) {
+		gh = skb_gro_header_slow(skb, hlen, off_gnv);
+		if (unlikely(!gh))
+			goto out;
+	}
+
+	if (gh->ver != GENEVE_VER || gh->oam)
+		goto out;
+	gh_len = geneve_hlen(gh);
+
+	hlen = off_gnv + gh_len;
+	if (skb_gro_header_hard(skb, hlen)) {
+		gh = skb_gro_header_slow(skb, hlen, off_gnv);
+		if (unlikely(!gh))
+			goto out;
+	}
+
+	flush = 0;
+
+	for (p = *head; p; p = p->next) {
+		if (!NAPI_GRO_CB(p)->same_flow)
+			continue;
+
+		gh2 = (struct genevehdr *)(p->data + off_gnv);
+		if (gh->opt_len != gh2->opt_len ||
+		    memcmp(gh, gh2, gh_len)) {
+			NAPI_GRO_CB(p)->same_flow = 0;
+			continue;
+		}
+	}
+
+	type = gh->proto_type;
+
+	rcu_read_lock();
+	ptype = gro_find_receive_by_type(type);
+	if (!ptype) {
+		flush = 1;
+		goto out_unlock;
+	}
+
+	skb_gro_pull(skb, gh_len);
+	skb_gro_postpull_rcsum(skb, gh, gh_len);
+	pp = ptype->callbacks.gro_receive(head, skb);
+
+out_unlock:
+	rcu_read_unlock();
+out:
+	NAPI_GRO_CB(skb)->flush |= flush;
+
+	return pp;
+}
+
+static int geneve_gro_complete(struct sk_buff *skb, int nhoff,
+			       struct udp_offload *uoff)
+{
+	struct genevehdr *gh;
+	struct packet_offload *ptype;
+	__be16 type;
+	int gh_len;
+	int err = -ENOSYS;
+
+	udp_tunnel_gro_complete(skb, nhoff);
+
+	gh = (struct genevehdr *)(skb->data + nhoff);
+	gh_len = geneve_hlen(gh);
+	type = gh->proto_type;
+
+	rcu_read_lock();
+	ptype = gro_find_complete_by_type(type);
+	if (ptype)
+		err = ptype->callbacks.gro_complete(skb, nhoff + gh_len);
+
+	rcu_read_unlock();
+	return err;
+}
+
+/* Create new listen socket if needed */
+static struct geneve_sock *geneve_socket_create(struct net *net, __be16 port,
+						bool ipv6)
+{
+	struct geneve_net *gn = net_generic(net, geneve_net_id);
+	struct geneve_sock *gs;
+	struct socket *sock;
+	struct udp_tunnel_sock_cfg tunnel_cfg;
+
+	gs = kzalloc(sizeof(*gs), GFP_KERNEL);
+	if (!gs)
+		return ERR_PTR(-ENOMEM);
+
+	sock = geneve_create_sock(net, ipv6, port);
+	if (IS_ERR(sock)) {
+		kfree(gs);
+		return ERR_CAST(sock);
+	}
+
+	gs->sock = sock;
+	gs->refcnt = 1;
+	gs->gn = gn;
+
+	/* Initialize the geneve udp offloads structure */
+	gs->udp_offloads.port = port;
+	gs->udp_offloads.callbacks.gro_receive  = geneve_gro_receive;
+	gs->udp_offloads.callbacks.gro_complete = geneve_gro_complete;
+	geneve_notify_add_rx_port(gs);
+
+	/* Mark socket as an encapsulation socket */
+	tunnel_cfg.sk_user_data = gs;
+	tunnel_cfg.encap_type = 1;
+	tunnel_cfg.encap_rcv = geneve_udp_encap_recv;
+	tunnel_cfg.encap_destroy = NULL;
+	setup_udp_tunnel_sock(net, sock, &tunnel_cfg);
+
+	list_add(&gs->list, &gn->sock_list);
+	return gs;
+}
+
+static void geneve_notify_del_rx_port(struct geneve_sock *gs)
+{
+	struct sock *sk = gs->sock->sk;
+	sa_family_t sa_family = sk->sk_family;
+
+	if (sa_family == AF_INET)
+		udp_del_offload(&gs->udp_offloads);
+}
+
+static void geneve_sock_release(struct geneve_sock *gs)
+{
+	if (--gs->refcnt)
+		return;
+
+	list_del(&gs->list);
+	geneve_notify_del_rx_port(gs);
+	udp_tunnel_sock_release(gs->sock);
+	kfree_rcu(gs, rcu);
+}
+
+static struct geneve_sock *geneve_find_sock(struct geneve_net *gn,
+					    __be16 dst_port)
+{
+	struct geneve_sock *gs;
+
+	list_for_each_entry(gs, &gn->sock_list, list) {
+		if (inet_sk(gs->sock->sk)->inet_sport == dst_port &&
+		    inet_sk(gs->sock->sk)->sk.sk_family == AF_INET) {
+			return gs;
+		}
+	}
+	return NULL;
+}
+
 static int geneve_open(struct net_device *dev)
 {
 	struct geneve_dev *geneve = netdev_priv(dev);
 	struct net *net = geneve->net;
-	struct geneve_net *gn = net_generic(geneve->net, geneve_net_id);
+	struct geneve_net *gn = net_generic(net, geneve_net_id);
 	struct geneve_sock *gs;
 
-	gs = geneve_sock_add(net, geneve->dst_port, geneve_rx, gn,
-	                     false, false);
+	gs = geneve_find_sock(gn, geneve->dst_port);
+	if (gs) {
+		gs->refcnt++;
+		goto out;
+	}
+
+	gs = geneve_socket_create(net, geneve->dst_port, false);
 	if (IS_ERR(gs))
 		return PTR_ERR(gs);
 
+out:
+	gs->collect_md = geneve->collect_md;
 	geneve->sock = gs;
-
 	return 0;
 }
 
 static int geneve_stop(struct net_device *dev)
 {
 	struct geneve_dev *geneve = netdev_priv(dev);
-	struct geneve_sock *gs = geneve->sock;
 
-	geneve_sock_release(gs);
+	geneve_sock_release(geneve->sock);
+	return 0;
+}
+
+static int geneve_build_skb(struct rtable *rt, struct sk_buff *skb,
+			    __be16 tun_flags, u8 vni[3], u8 opt_len, u8 *opt,
+			    bool csum)
+{
+	struct genevehdr *gnvh;
+	int min_headroom;
+	int err;
 
+	min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len
+			+ GENEVE_BASE_HLEN + opt_len + sizeof(struct iphdr);
+	err = skb_cow_head(skb, min_headroom);
+	if (unlikely(err)) {
+		kfree_skb(skb);
+		goto free_rt;
+	}
+
+	skb = udp_tunnel_handle_offloads(skb, csum);
+	if (IS_ERR(skb)) {
+		err = PTR_ERR(skb);
+		goto free_rt;
+	}
+
+	gnvh = (struct genevehdr *)__skb_push(skb, sizeof(*gnvh) + opt_len);
+	gnvh->ver = GENEVE_VER;
+	gnvh->opt_len = opt_len / 4;
+	gnvh->oam = !!(tun_flags & TUNNEL_OAM);
+	gnvh->critical = !!(tun_flags & TUNNEL_CRIT_OPT);
+	gnvh->rsvd1 = 0;
+	memcpy(gnvh->vni, vni, 3);
+	gnvh->proto_type = htons(ETH_P_TEB);
+	gnvh->rsvd2 = 0;
+	memcpy(gnvh->options, opt, opt_len);
+
+	skb_set_inner_protocol(skb, htons(ETH_P_TEB));
 	return 0;
+
+free_rt:
+	ip_rt_put(rt);
+	return err;
 }
 
 static struct rtable *geneve_get_rt(struct sk_buff *skb,
@@ -269,7 +589,6 @@ static struct rtable *geneve_get_rt(struct sk_buff *skb,
 		ip_rt_put(rt);
 		return ERR_PTR(-EINVAL);
 	}
-
 	return rt;
 }
 
@@ -293,15 +612,13 @@ static netdev_tx_t geneve_xmit(struct sk_buff *skb, struct net_device *dev)
 	struct geneve_sock *gs = geneve->sock;
 	struct ip_tunnel_info *info = NULL;
 	struct rtable *rt = NULL;
-	const struct iphdr *iip; /* interior IP header */
 	struct flowi4 fl4;
 	__u8 tos, ttl;
 	__be16 sport;
-	bool xnet;
+	bool udp_csum;
+	__be16 df;
 	int err;
 
-	sport = udp_flow_src_port(geneve->net, skb, 1, USHRT_MAX, true);
-
 	if (geneve->collect_md) {
 		info = skb_tunnel_info(skb);
 		if (unlikely(info && info->mode != IP_TUNNEL_INFO_TX)) {
@@ -316,52 +633,57 @@ static netdev_tx_t geneve_xmit(struct sk_buff *skb, struct net_device *dev)
 		dev->stats.tx_carrier_errors++;
 		goto tx_error;
 	}
+
+	sport = udp_flow_src_port(geneve->net, skb, 1, USHRT_MAX, true);
 	skb_reset_mac_header(skb);
-	xnet = !net_eq(geneve->net, dev_net(geneve->dev));
 
 	if (info) {
 		const struct ip_tunnel_key *key = &info->key;
-		bool udp_csum;
 		u8 *opts = NULL;
 		u8 vni[3];
-		__be16 df;
 
 		tunnel_id_to_vni(key->tun_id, vni);
-		df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;
-		udp_csum = !!(key->tun_flags & TUNNEL_CSUM);
-
 		if (key->tun_flags & TUNNEL_GENEVE_OPT)
 			opts = ip_tunnel_info_opts(info, info->options_len);
 
-		err = geneve_xmit_skb(gs, rt, skb, fl4.saddr, fl4.daddr,
-				      key->tos, key->ttl, df,
-				      sport, geneve->dst_port,
-				      key->tun_flags, vni,
-				      info->options_len, opts, udp_csum, xnet);
+		udp_csum = !!(key->tun_flags & TUNNEL_CSUM);
+		err = geneve_build_skb(rt, skb, key->tun_flags, vni,
+				       info->options_len, opts, udp_csum);
+		if (unlikely(err))
+			goto err;
+
+		tos = key->tos;
+		ttl = key->ttl;
+		df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;
 	} else {
+		const struct iphdr *iip; /* interior IP header */
+
+		udp_csum = false;
+		err = geneve_build_skb(rt, skb, 0, geneve->vni,
+				       0, NULL, udp_csum);
+		if (unlikely(err))
+			goto err;
+
 		iip = ip_hdr(skb);
 		tos = ip_tunnel_ecn_encap(fl4.flowi4_tos, iip, skb);
-
 		ttl = geneve->ttl;
 		if (!ttl && IN_MULTICAST(ntohl(fl4.daddr)))
 			ttl = 1;
-
 		ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);
-
-		/* no need to handle local destination and encap bypass...yet... */
-		err = geneve_xmit_skb(gs, rt, skb, fl4.saddr, fl4.daddr, tos,
-				      ttl, 0, sport, geneve->dst_port, 0,
-				      geneve->vni, 0, NULL, false, xnet);
+		df = 0;
 	}
-	if (err < 0)
-		ip_rt_put(rt);
+	err = udp_tunnel_xmit_skb(rt, gs->sock->sk, skb, fl4.saddr, fl4.daddr,
+				  tos, ttl, df, sport, geneve->dst_port,
+				  !net_eq(geneve->net, dev_net(geneve->dev)),
+				  !udp_csum);
 
 	iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
 	return NETDEV_TX_OK;
 
 tx_error:
-	dev->stats.tx_errors++;
 	dev_kfree_skb(skb);
+err:
+	dev->stats.tx_errors++;
 	return NETDEV_TX_OK;
 }
 
@@ -454,25 +776,44 @@ static int geneve_validate(struct nlattr *tb[], struct nlattr *data[])
 	return 0;
 }
 
+static struct geneve_dev *geneve_find_dev(struct geneve_net *gn,
+					  __be16 dst_port,
+					  __be32 rem_addr,
+					  u8 vni[],
+					  bool *tun_on_same_port,
+					  bool *tun_collect_md)
+{
+	struct geneve_dev *geneve, *t;
+
+	*tun_on_same_port = false;
+	*tun_collect_md = false;
+	t = NULL;
+	list_for_each_entry(geneve, &gn->geneve_list, next) {
+		if (geneve->dst_port == dst_port) {
+			*tun_collect_md = geneve->collect_md;
+			*tun_on_same_port = true;
+		}
+		if (!memcmp(vni, geneve->vni, sizeof(geneve->vni)) &&
+		    rem_addr == geneve->remote.sin_addr.s_addr &&
+		    dst_port == geneve->dst_port)
+			t = geneve;
+	}
+	return t;
+}
+
 static int geneve_configure(struct net *net, struct net_device *dev,
 			    __be32 rem_addr, __u32 vni, __u8 ttl, __u8 tos,
 			    __u16 dst_port, bool metadata)
 {
 	struct geneve_net *gn = net_generic(net, geneve_net_id);
-	struct geneve_dev *dummy, *geneve = netdev_priv(dev);
-	struct hlist_head *vni_list_head;
-	struct sockaddr_in remote;	/* IPv4 address for link partner */
+	struct geneve_dev *t, *geneve = netdev_priv(dev);
+	bool tun_collect_md, tun_on_same_port;
 	__u32 hash;
 	int err;
 
 	if (metadata) {
-		if (rtnl_dereference(gn->collect_md_tun))
-			return -EEXIST;
-		if (!list_empty(&gn->geneve_list))
-			return -EPERM;
-	} else {
-		if (rtnl_dereference(gn->collect_md_tun))
-			return -EPERM;
+		if (rem_addr || vni || tos || ttl)
+			return -EINVAL;
 	}
 
 	geneve->net = net;
@@ -486,36 +827,31 @@ static int geneve_configure(struct net *net, struct net_device *dev,
 	if (IN_MULTICAST(ntohl(geneve->remote.sin_addr.s_addr)))
 		return -EINVAL;
 
-	remote = geneve->remote;
-	if (metadata) {
-		if (rem_addr || vni || tos || ttl)
-			return -EINVAL;
-	}
-
-	hash = geneve_net_vni_hash(geneve->vni);
-	vni_list_head = &gn->vni_list[hash];
-	hlist_for_each_entry_rcu(dummy, vni_list_head, hlist) {
-		if (!memcmp(geneve->vni, dummy->vni, sizeof(dummy->vni)) &&
-		    !memcmp(&remote, &dummy->remote, sizeof(dummy->remote)) &&
-		    htons(dst_port) == dummy->dst_port) {
-			return -EBUSY;
-		}
-	}
-
 	geneve->ttl = ttl;
 	geneve->tos = tos;
 	geneve->dst_port = htons(dst_port);
 	geneve->collect_md = metadata;
 
+	t = geneve_find_dev(gn, htons(dst_port), rem_addr, geneve->vni,
+			    &tun_on_same_port, &tun_collect_md);
+	if (t)
+		return -EBUSY;
+
+	if (metadata) {
+		if (tun_on_same_port)
+			return -EPERM;
+	} else {
+		if (tun_collect_md)
+			return -EPERM;
+	}
+
 	err = register_netdevice(dev);
 	if (err)
 		return err;
 
 	list_add(&geneve->next, &gn->geneve_list);
+	hash = geneve_net_vni_hash(geneve->vni);
 	hlist_add_head_rcu(&geneve->hlist, &gn->vni_list[hash]);
-
-	if (geneve->collect_md)
-		rcu_assign_pointer(gn->collect_md_tun, geneve);
 	return 0;
 }
 
@@ -554,12 +890,6 @@ static void geneve_dellink(struct net_device *dev, struct list_head *head)
 {
 	struct geneve_dev *geneve = netdev_priv(dev);
 
-	if (geneve->collect_md) {
-		struct geneve_net *gn = net_generic(geneve->net, geneve_net_id);
-
-		rcu_assign_pointer(gn->collect_md_tun, NULL);
-	}
-
 	if (!hlist_unhashed(&geneve->hlist))
 		hlist_del_rcu(&geneve->hlist);
 
@@ -651,6 +981,7 @@ static __net_init int geneve_init_net(struct net *net)
 
 	INIT_LIST_HEAD(&gn->geneve_list);
 
+	INIT_LIST_HEAD(&gn->sock_list);
 	for (h = 0; h < VNI_HASH_SIZE; ++h)
 		INIT_HLIST_HEAD(&gn->vni_list[h]);
 
diff --git a/include/net/geneve.h b/include/net/geneve.h
index 4245e1d23b9b..3106ed6eae0d 100644
--- a/include/net/geneve.h
+++ b/include/net/geneve.h
@@ -62,41 +62,7 @@ struct genevehdr {
 	struct geneve_opt options[];
 };
 
-static inline struct genevehdr *geneve_hdr(const struct sk_buff *skb)
-{
-	return (struct genevehdr *)(udp_hdr(skb) + 1);
-}
-
 #ifdef CONFIG_INET
-struct geneve_sock;
-
-typedef void (geneve_rcv_t)(struct geneve_sock *gs, struct sk_buff *skb);
-
-struct geneve_sock {
-	struct list_head	list;
-	geneve_rcv_t		*rcv;
-	void			*rcv_data;
-	struct socket		*sock;
-	struct rcu_head		rcu;
-	int			refcnt;
-	struct udp_offload	udp_offloads;
-};
-
-#define GENEVE_VER 0
-#define GENEVE_BASE_HLEN (sizeof(struct udphdr) + sizeof(struct genevehdr))
-
-struct geneve_sock *geneve_sock_add(struct net *net, __be16 port,
-				    geneve_rcv_t *rcv, void *data,
-				    bool no_share, bool ipv6);
-
-void geneve_sock_release(struct geneve_sock *vs);
-
-int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt,
-		    struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos,
-		    __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port,
-		    __be16 tun_flags, u8 vni[3], u8 opt_len, u8 *opt,
-		    bool csum, bool xnet);
-
 struct net_device *geneve_dev_create_fb(struct net *net, const char *name,
 					u8 name_assign_type, u16 dst_port);
 #endif /*ifdef CONFIG_INET */
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 6fb3c90ad726..416dfa004cfb 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -331,20 +331,6 @@ config NET_FOU_IP_TUNNELS
 	  When this option is enabled IP tunnels can be configured to use
 	  FOU or GUE encapsulation.
 
-config GENEVE_CORE
-	tristate "Generic Network Virtualization Encapsulation library"
-	depends on INET
-	select NET_UDP_TUNNEL
-	---help---
-	This allows one to create Geneve virtual interfaces that provide
-	Layer 2 Networks over Layer 3 Networks. Geneve is often used
-	to tunnel virtual network infrastructure in virtualized environments.
-	For more information see:
-	  http://tools.ietf.org/html/draft-gross-geneve-01
-
-	  To compile this driver as a module, choose M here: the module
-
-
 config INET_AH
 	tristate "IP: AH transformation"
 	select XFRM_ALGO
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index efc43f300b8c..89aacb630a53 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -57,7 +57,6 @@ obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o
 obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o
 obj-$(CONFIG_MEMCG_KMEM) += tcp_memcontrol.o
 obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
-obj-$(CONFIG_GENEVE_CORE) += geneve_core.o
 
 obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
 		      xfrm4_output.o xfrm4_protocol.o
diff --git a/net/ipv4/geneve_core.c b/net/ipv4/geneve_core.c
deleted file mode 100644
index 311a4ba6950a..000000000000
--- a/net/ipv4/geneve_core.c
+++ /dev/null
@@ -1,447 +0,0 @@
-/*
- * Geneve: Generic Network Virtualization Encapsulation
- *
- * Copyright (c) 2014 Nicira, Inc.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/module.h>
-#include <linux/errno.h>
-#include <linux/slab.h>
-#include <linux/skbuff.h>
-#include <linux/list.h>
-#include <linux/netdevice.h>
-#include <linux/in.h>
-#include <linux/ip.h>
-#include <linux/udp.h>
-#include <linux/igmp.h>
-#include <linux/etherdevice.h>
-#include <linux/if_ether.h>
-#include <linux/if_vlan.h>
-#include <linux/ethtool.h>
-#include <linux/mutex.h>
-#include <net/arp.h>
-#include <net/ndisc.h>
-#include <net/ip.h>
-#include <net/ip_tunnels.h>
-#include <net/icmp.h>
-#include <net/udp.h>
-#include <net/rtnetlink.h>
-#include <net/route.h>
-#include <net/dsfield.h>
-#include <net/inet_ecn.h>
-#include <net/net_namespace.h>
-#include <net/netns/generic.h>
-#include <net/geneve.h>
-#include <net/protocol.h>
-#include <net/udp_tunnel.h>
-#if IS_ENABLED(CONFIG_IPV6)
-#include <net/ipv6.h>
-#include <net/addrconf.h>
-#include <net/ip6_tunnel.h>
-#include <net/ip6_checksum.h>
-#endif
-
-/* Protects sock_list and refcounts. */
-static DEFINE_MUTEX(geneve_mutex);
-
-/* per-network namespace private data for this module */
-struct geneve_net {
-	struct list_head	sock_list;
-};
-
-static int geneve_net_id;
-
-static struct geneve_sock *geneve_find_sock(struct net *net,
-					    sa_family_t family, __be16 port)
-{
-	struct geneve_net *gn = net_generic(net, geneve_net_id);
-	struct geneve_sock *gs;
-
-	list_for_each_entry(gs, &gn->sock_list, list) {
-		if (inet_sk(gs->sock->sk)->inet_sport == port &&
-		    inet_sk(gs->sock->sk)->sk.sk_family == family)
-			return gs;
-	}
-
-	return NULL;
-}
-
-static void geneve_build_header(struct genevehdr *geneveh,
-				__be16 tun_flags, u8 vni[3],
-				u8 options_len, u8 *options)
-{
-	geneveh->ver = GENEVE_VER;
-	geneveh->opt_len = options_len / 4;
-	geneveh->oam = !!(tun_flags & TUNNEL_OAM);
-	geneveh->critical = !!(tun_flags & TUNNEL_CRIT_OPT);
-	geneveh->rsvd1 = 0;
-	memcpy(geneveh->vni, vni, 3);
-	geneveh->proto_type = htons(ETH_P_TEB);
-	geneveh->rsvd2 = 0;
-
-	memcpy(geneveh->options, options, options_len);
-}
-
-/* Transmit a fully formatted Geneve frame.
- *
- * When calling this function. The skb->data should point
- * to the geneve header which is fully formed.
- *
- * This function will add other UDP tunnel headers.
- */
-int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt,
-		    struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos,
-		    __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port,
-		    __be16 tun_flags, u8 vni[3], u8 opt_len, u8 *opt,
-		    bool csum, bool xnet)
-{
-	struct genevehdr *gnvh;
-	int min_headroom;
-	int err;
-
-	min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len
-			+ GENEVE_BASE_HLEN + opt_len + sizeof(struct iphdr)
-			+ (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0);
-
-	err = skb_cow_head(skb, min_headroom);
-	if (unlikely(err)) {
-		kfree_skb(skb);
-		return err;
-	}
-
-	skb = vlan_hwaccel_push_inside(skb);
-	if (unlikely(!skb))
-		return -ENOMEM;
-
-	skb = udp_tunnel_handle_offloads(skb, csum);
-	if (IS_ERR(skb))
-		return PTR_ERR(skb);
-
-	gnvh = (struct genevehdr *)__skb_push(skb, sizeof(*gnvh) + opt_len);
-	geneve_build_header(gnvh, tun_flags, vni, opt_len, opt);
-
-	skb_set_inner_protocol(skb, htons(ETH_P_TEB));
-
-	return udp_tunnel_xmit_skb(rt, gs->sock->sk, skb, src, dst,
-				   tos, ttl, df, src_port, dst_port, xnet,
-				   !csum);
-}
-EXPORT_SYMBOL_GPL(geneve_xmit_skb);
-
-static int geneve_hlen(struct genevehdr *gh)
-{
-	return sizeof(*gh) + gh->opt_len * 4;
-}
-
-static struct sk_buff **geneve_gro_receive(struct sk_buff **head,
-					   struct sk_buff *skb,
-					   struct udp_offload *uoff)
-{
-	struct sk_buff *p, **pp = NULL;
-	struct genevehdr *gh, *gh2;
-	unsigned int hlen, gh_len, off_gnv;
-	const struct packet_offload *ptype;
-	__be16 type;
-	int flush = 1;
-
-	off_gnv = skb_gro_offset(skb);
-	hlen = off_gnv + sizeof(*gh);
-	gh = skb_gro_header_fast(skb, off_gnv);
-	if (skb_gro_header_hard(skb, hlen)) {
-		gh = skb_gro_header_slow(skb, hlen, off_gnv);
-		if (unlikely(!gh))
-			goto out;
-	}
-
-	if (gh->ver != GENEVE_VER || gh->oam)
-		goto out;
-	gh_len = geneve_hlen(gh);
-
-	hlen = off_gnv + gh_len;
-	if (skb_gro_header_hard(skb, hlen)) {
-		gh = skb_gro_header_slow(skb, hlen, off_gnv);
-		if (unlikely(!gh))
-			goto out;
-	}
-
-	flush = 0;
-
-	for (p = *head; p; p = p->next) {
-		if (!NAPI_GRO_CB(p)->same_flow)
-			continue;
-
-		gh2 = (struct genevehdr *)(p->data + off_gnv);
-		if (gh->opt_len != gh2->opt_len ||
-		    memcmp(gh, gh2, gh_len)) {
-			NAPI_GRO_CB(p)->same_flow = 0;
-			continue;
-		}
-	}
-
-	type = gh->proto_type;
-
-	rcu_read_lock();
-	ptype = gro_find_receive_by_type(type);
-	if (!ptype) {
-		flush = 1;
-		goto out_unlock;
-	}
-
-	skb_gro_pull(skb, gh_len);
-	skb_gro_postpull_rcsum(skb, gh, gh_len);
-	pp = ptype->callbacks.gro_receive(head, skb);
-
-out_unlock:
-	rcu_read_unlock();
-out:
-	NAPI_GRO_CB(skb)->flush |= flush;
-
-	return pp;
-}
-
-static int geneve_gro_complete(struct sk_buff *skb, int nhoff,
-			       struct udp_offload *uoff)
-{
-	struct genevehdr *gh;
-	struct packet_offload *ptype;
-	__be16 type;
-	int gh_len;
-	int err = -ENOSYS;
-
-	udp_tunnel_gro_complete(skb, nhoff);
-
-	gh = (struct genevehdr *)(skb->data + nhoff);
-	gh_len = geneve_hlen(gh);
-	type = gh->proto_type;
-
-	rcu_read_lock();
-	ptype = gro_find_complete_by_type(type);
-	if (ptype)
-		err = ptype->callbacks.gro_complete(skb, nhoff + gh_len);
-
-	rcu_read_unlock();
-	return err;
-}
-
-static void geneve_notify_add_rx_port(struct geneve_sock *gs)
-{
-	struct sock *sk = gs->sock->sk;
-	sa_family_t sa_family = sk->sk_family;
-	int err;
-
-	if (sa_family == AF_INET) {
-		err = udp_add_offload(&gs->udp_offloads);
-		if (err)
-			pr_warn("geneve: udp_add_offload failed with status %d\n",
-				err);
-	}
-}
-
-static void geneve_notify_del_rx_port(struct geneve_sock *gs)
-{
-	struct sock *sk = gs->sock->sk;
-	sa_family_t sa_family = sk->sk_family;
-
-	if (sa_family == AF_INET)
-		udp_del_offload(&gs->udp_offloads);
-}
-
-/* Callback from net/ipv4/udp.c to receive packets */
-static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
-{
-	struct genevehdr *geneveh;
-	struct geneve_sock *gs;
-	int opts_len;
-
-	/* Need Geneve and inner Ethernet header to be present */
-	if (unlikely(!pskb_may_pull(skb, GENEVE_BASE_HLEN)))
-		goto error;
-
-	/* Return packets with reserved bits set */
-	geneveh = geneve_hdr(skb);
-
-	if (unlikely(geneveh->ver != GENEVE_VER))
-		goto error;
-
-	if (unlikely(geneveh->proto_type != htons(ETH_P_TEB)))
-		goto error;
-
-	opts_len = geneveh->opt_len * 4;
-	if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len,
-				 htons(ETH_P_TEB)))
-		goto drop;
-
-	gs = rcu_dereference_sk_user_data(sk);
-	if (!gs)
-		goto drop;
-
-	gs->rcv(gs, skb);
-	return 0;
-
-drop:
-	/* Consume bad packet */
-	kfree_skb(skb);
-	return 0;
-
-error:
-	/* Let the UDP layer deal with the skb */
-	return 1;
-}
-
-static struct socket *geneve_create_sock(struct net *net, bool ipv6,
-					 __be16 port)
-{
-	struct socket *sock;
-	struct udp_port_cfg udp_conf;
-	int err;
-
-	memset(&udp_conf, 0, sizeof(udp_conf));
-
-	if (ipv6) {
-		udp_conf.family = AF_INET6;
-	} else {
-		udp_conf.family = AF_INET;
-		udp_conf.local_ip.s_addr = htonl(INADDR_ANY);
-	}
-
-	udp_conf.local_udp_port = port;
-
-	/* Open UDP socket */
-	err = udp_sock_create(net, &udp_conf, &sock);
-	if (err < 0)
-		return ERR_PTR(err);
-
-	return sock;
-}
-
-/* Create new listen socket if needed */
-static struct geneve_sock *geneve_socket_create(struct net *net, __be16 port,
-						geneve_rcv_t *rcv, void *data,
-						bool ipv6)
-{
-	struct geneve_net *gn = net_generic(net, geneve_net_id);
-	struct geneve_sock *gs;
-	struct socket *sock;
-	struct udp_tunnel_sock_cfg tunnel_cfg;
-
-	gs = kzalloc(sizeof(*gs), GFP_KERNEL);
-	if (!gs)
-		return ERR_PTR(-ENOMEM);
-
-	sock = geneve_create_sock(net, ipv6, port);
-	if (IS_ERR(sock)) {
-		kfree(gs);
-		return ERR_CAST(sock);
-	}
-
-	gs->sock = sock;
-	gs->refcnt = 1;
-	gs->rcv = rcv;
-	gs->rcv_data = data;
-
-	/* Initialize the geneve udp offloads structure */
-	gs->udp_offloads.port = port;
-	gs->udp_offloads.callbacks.gro_receive  = geneve_gro_receive;
-	gs->udp_offloads.callbacks.gro_complete = geneve_gro_complete;
-	geneve_notify_add_rx_port(gs);
-
-	/* Mark socket as an encapsulation socket */
-	tunnel_cfg.sk_user_data = gs;
-	tunnel_cfg.encap_type = 1;
-	tunnel_cfg.encap_rcv = geneve_udp_encap_recv;
-	tunnel_cfg.encap_destroy = NULL;
-	setup_udp_tunnel_sock(net, sock, &tunnel_cfg);
-
-	list_add(&gs->list, &gn->sock_list);
-
-	return gs;
-}
-
-struct geneve_sock *geneve_sock_add(struct net *net, __be16 port,
-				    geneve_rcv_t *rcv, void *data,
-				    bool no_share, bool ipv6)
-{
-	struct geneve_sock *gs;
-
-	mutex_lock(&geneve_mutex);
-
-	gs = geneve_find_sock(net, ipv6 ? AF_INET6 : AF_INET, port);
-	if (gs) {
-		if (!no_share && gs->rcv == rcv)
-			gs->refcnt++;
-		else
-			gs = ERR_PTR(-EBUSY);
-	} else {
-		gs = geneve_socket_create(net, port, rcv, data, ipv6);
-	}
-
-	mutex_unlock(&geneve_mutex);
-
-	return gs;
-}
-EXPORT_SYMBOL_GPL(geneve_sock_add);
-
-void geneve_sock_release(struct geneve_sock *gs)
-{
-	mutex_lock(&geneve_mutex);
-
-	if (--gs->refcnt)
-		goto unlock;
-
-	list_del(&gs->list);
-	geneve_notify_del_rx_port(gs);
-	udp_tunnel_sock_release(gs->sock);
-	kfree_rcu(gs, rcu);
-
-unlock:
-	mutex_unlock(&geneve_mutex);
-}
-EXPORT_SYMBOL_GPL(geneve_sock_release);
-
-static __net_init int geneve_init_net(struct net *net)
-{
-	struct geneve_net *gn = net_generic(net, geneve_net_id);
-
-	INIT_LIST_HEAD(&gn->sock_list);
-
-	return 0;
-}
-
-static struct pernet_operations geneve_net_ops = {
-	.init = geneve_init_net,
-	.id   = &geneve_net_id,
-	.size = sizeof(struct geneve_net),
-};
-
-static int __init geneve_init_module(void)
-{
-	int rc;
-
-	rc = register_pernet_subsys(&geneve_net_ops);
-	if (rc)
-		return rc;
-
-	pr_info("Geneve core logic\n");
-
-	return 0;
-}
-module_init(geneve_init_module);
-
-static void __exit geneve_cleanup_module(void)
-{
-	unregister_pernet_subsys(&geneve_net_ops);
-}
-module_exit(geneve_cleanup_module);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jesse Gross <jesse@nicira.com>");
-MODULE_DESCRIPTION("Driver library for GENEVE encapsulated traffic");
-- 
cgit v1.2.3


From d66d6c3152e8d5a6db42a56bf7ae1c6cae87ba48 Mon Sep 17 00:00:00 2001
From: Phil Sutter <phil@nwl.cc>
Date: Thu, 27 Aug 2015 21:21:38 +0200
Subject: net: sched: register noqueue qdisc

This way users can attach noqueue just like any other qdisc using tc
without having to mess with tx_queue_len first.

Signed-off-by: Phil Sutter <phil@nwl.cc>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h |  1 +
 net/sched/sch_api.c       |  1 +
 net/sched/sch_generic.c   | 12 +++++++++++-
 3 files changed, 13 insertions(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 2eab08c38e32..444faa89a55f 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -340,6 +340,7 @@ extern struct Qdisc noop_qdisc;
 extern struct Qdisc_ops noop_qdisc_ops;
 extern struct Qdisc_ops pfifo_fast_ops;
 extern struct Qdisc_ops mq_qdisc_ops;
+extern struct Qdisc_ops noqueue_qdisc_ops;
 extern const struct Qdisc_ops *default_qdisc_ops;
 
 struct Qdisc_class_common {
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 59c227f26b56..a3c70a18a764 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1942,6 +1942,7 @@ static int __init pktsched_init(void)
 	register_qdisc(&bfifo_qdisc_ops);
 	register_qdisc(&pfifo_head_drop_qdisc_ops);
 	register_qdisc(&mq_qdisc_ops);
+	register_qdisc(&noqueue_qdisc_ops);
 
 	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, NULL);
 	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, NULL);
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index f501b7409320..d5c7c0d88786 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -416,9 +416,19 @@ struct Qdisc noop_qdisc = {
 };
 EXPORT_SYMBOL(noop_qdisc);
 
-static struct Qdisc_ops noqueue_qdisc_ops __read_mostly = {
+static int noqueue_init(struct Qdisc *qdisc, struct nlattr *opt)
+{
+	/* register_qdisc() assigns a default of noop_enqueue if unset,
+	 * but __dev_queue_xmit() treats noqueue only as such
+	 * if this is NULL - so clear it here. */
+	qdisc->enqueue = NULL;
+	return 0;
+}
+
+struct Qdisc_ops noqueue_qdisc_ops __read_mostly = {
 	.id		=	"noqueue",
 	.priv_size	=	0,
+	.init		=	noqueue_init,
 	.enqueue	=	noop_enqueue,
 	.dequeue	=	noop_dequeue,
 	.peek		=	noop_dequeue,
-- 
cgit v1.2.3


From 72afa352d6a3d4da7783b5ddee02b94be49e051a Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Thu, 27 Aug 2015 16:06:59 -0700
Subject: net: Introduce ipv4_addr_hash and use it for tcp metrics

Refactors a common line into helper function.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip.h       |  5 +++++
 net/ipv4/tcp_metrics.c | 12 ++++++------
 2 files changed, 11 insertions(+), 6 deletions(-)

(limited to 'include/net')

diff --git a/include/net/ip.h b/include/net/ip.h
index bee5f3582e38..7b9e1c782aa3 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -458,6 +458,11 @@ static __inline__ void inet_reset_saddr(struct sock *sk)
 
 #endif
 
+static inline unsigned int ipv4_addr_hash(__be32 ip)
+{
+	return (__force unsigned int) ip;
+}
+
 bool ip_call_ra_chain(struct sk_buff *skb);
 
 /*
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index b3d64f61d922..3a4289268f97 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -249,7 +249,7 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req,
 	case AF_INET:
 		saddr.addr.a4 = inet_rsk(req)->ir_loc_addr;
 		daddr.addr.a4 = inet_rsk(req)->ir_rmt_addr;
-		hash = (__force unsigned int) daddr.addr.a4;
+		hash = ipv4_addr_hash(inet_rsk(req)->ir_rmt_addr);
 		break;
 #if IS_ENABLED(CONFIG_IPV6)
 	case AF_INET6:
@@ -289,7 +289,7 @@ static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock
 		saddr.addr.a4 = tw->tw_rcv_saddr;
 		daddr.family = AF_INET;
 		daddr.addr.a4 = tw->tw_daddr;
-		hash = (__force unsigned int) daddr.addr.a4;
+		hash = ipv4_addr_hash(tw->tw_daddr);
 	}
 #if IS_ENABLED(CONFIG_IPV6)
 	else if (tw->tw_family == AF_INET6) {
@@ -298,7 +298,7 @@ static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock
 			saddr.addr.a4 = tw->tw_rcv_saddr;
 			daddr.family = AF_INET;
 			daddr.addr.a4 = tw->tw_daddr;
-			hash = (__force unsigned int) daddr.addr.a4;
+			hash = ipv4_addr_hash(tw->tw_daddr);
 		} else {
 			saddr.family = AF_INET6;
 			saddr.addr.in6 = tw->tw_v6_rcv_saddr;
@@ -339,7 +339,7 @@ static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk,
 		saddr.addr.a4 = inet_sk(sk)->inet_saddr;
 		daddr.family = AF_INET;
 		daddr.addr.a4 = inet_sk(sk)->inet_daddr;
-		hash = (__force unsigned int) daddr.addr.a4;
+		hash = ipv4_addr_hash(inet_sk(sk)->inet_daddr);
 	}
 #if IS_ENABLED(CONFIG_IPV6)
 	else if (sk->sk_family == AF_INET6) {
@@ -348,7 +348,7 @@ static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk,
 			saddr.addr.a4 = inet_sk(sk)->inet_saddr;
 			daddr.family = AF_INET;
 			daddr.addr.a4 = inet_sk(sk)->inet_daddr;
-			hash = (__force unsigned int) daddr.addr.a4;
+			hash = ipv4_addr_hash(inet_sk(sk)->inet_daddr);
 		} else {
 			saddr.family = AF_INET6;
 			saddr.addr.in6 = sk->sk_v6_rcv_saddr;
@@ -959,7 +959,7 @@ static int __parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr,
 		addr->family = AF_INET;
 		addr->addr.a4 = nla_get_in_addr(a);
 		if (hash)
-			*hash = (__force unsigned int) addr->addr.a4;
+			*hash = ipv4_addr_hash(addr->addr.a4);
 		return 0;
 	}
 	a = info->attrs[v6];
-- 
cgit v1.2.3


From 3abef286cf2f138de353fb0b54453621de961043 Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Thu, 27 Aug 2015 16:07:00 -0700
Subject: net: Add set,get helpers for inetpeer addresses

Use inetpeer set,get helpers in tcp_metrics rather than peeking into
the inetpeer_addr struct.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inetpeer.h | 23 ++++++++++++++++++
 net/ipv4/tcp_metrics.c | 65 +++++++++++++++++++++-----------------------------
 2 files changed, 50 insertions(+), 38 deletions(-)

(limited to 'include/net')

diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h
index 002f0bd27001..f75b9e7036a2 100644
--- a/include/net/inetpeer.h
+++ b/include/net/inetpeer.h
@@ -71,6 +71,29 @@ void inet_initpeers(void) __init;
 
 #define INETPEER_METRICS_NEW	(~(u32) 0)
 
+static inline void inetpeer_set_addr_v4(struct inetpeer_addr *iaddr, __be32 ip)
+{
+	iaddr->addr.a4 = ip;
+	iaddr->family = AF_INET;
+}
+
+static inline __be32 inetpeer_get_addr_v4(struct inetpeer_addr *iaddr)
+{
+	return iaddr->addr.a4;
+}
+
+static inline void inetpeer_set_addr_v6(struct inetpeer_addr *iaddr,
+					struct in6_addr *in6)
+{
+	iaddr->addr.in6 = *in6;
+	iaddr->family = AF_INET6;
+}
+
+static inline struct in6_addr *inetpeer_get_addr_v6(struct inetpeer_addr *iaddr)
+{
+	return &iaddr->addr.in6;
+}
+
 /* can be called with or without local BH being disabled */
 struct inet_peer *inet_getpeer(struct inet_peer_base *base,
 			       const struct inetpeer_addr *daddr,
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 3a4289268f97..4ef4dd4bf38c 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -247,14 +247,14 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req,
 	daddr.family = req->rsk_ops->family;
 	switch (daddr.family) {
 	case AF_INET:
-		saddr.addr.a4 = inet_rsk(req)->ir_loc_addr;
-		daddr.addr.a4 = inet_rsk(req)->ir_rmt_addr;
+		inetpeer_set_addr_v4(&saddr, inet_rsk(req)->ir_loc_addr);
+		inetpeer_set_addr_v4(&daddr, inet_rsk(req)->ir_rmt_addr);
 		hash = ipv4_addr_hash(inet_rsk(req)->ir_rmt_addr);
 		break;
 #if IS_ENABLED(CONFIG_IPV6)
 	case AF_INET6:
-		saddr.addr.in6 = inet_rsk(req)->ir_v6_loc_addr;
-		daddr.addr.in6 = inet_rsk(req)->ir_v6_rmt_addr;
+		inetpeer_set_addr_v6(&saddr, &inet_rsk(req)->ir_v6_loc_addr);
+		inetpeer_set_addr_v6(&daddr, &inet_rsk(req)->ir_v6_rmt_addr);
 		hash = ipv6_addr_hash(&inet_rsk(req)->ir_v6_rmt_addr);
 		break;
 #endif
@@ -285,25 +285,19 @@ static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock
 	struct net *net;
 
 	if (tw->tw_family == AF_INET) {
-		saddr.family = AF_INET;
-		saddr.addr.a4 = tw->tw_rcv_saddr;
-		daddr.family = AF_INET;
-		daddr.addr.a4 = tw->tw_daddr;
+		inetpeer_set_addr_v4(&saddr, tw->tw_rcv_saddr);
+		inetpeer_set_addr_v4(&daddr, tw->tw_daddr);
 		hash = ipv4_addr_hash(tw->tw_daddr);
 	}
 #if IS_ENABLED(CONFIG_IPV6)
 	else if (tw->tw_family == AF_INET6) {
 		if (ipv6_addr_v4mapped(&tw->tw_v6_daddr)) {
-			saddr.family = AF_INET;
-			saddr.addr.a4 = tw->tw_rcv_saddr;
-			daddr.family = AF_INET;
-			daddr.addr.a4 = tw->tw_daddr;
+			inetpeer_set_addr_v4(&saddr, tw->tw_rcv_saddr);
+			inetpeer_set_addr_v4(&daddr, tw->tw_daddr);
 			hash = ipv4_addr_hash(tw->tw_daddr);
 		} else {
-			saddr.family = AF_INET6;
-			saddr.addr.in6 = tw->tw_v6_rcv_saddr;
-			daddr.family = AF_INET6;
-			daddr.addr.in6 = tw->tw_v6_daddr;
+			inetpeer_set_addr_v6(&saddr, &tw->tw_v6_rcv_saddr);
+			inetpeer_set_addr_v6(&daddr, &tw->tw_v6_daddr);
 			hash = ipv6_addr_hash(&tw->tw_v6_daddr);
 		}
 	}
@@ -335,25 +329,19 @@ static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk,
 	struct net *net;
 
 	if (sk->sk_family == AF_INET) {
-		saddr.family = AF_INET;
-		saddr.addr.a4 = inet_sk(sk)->inet_saddr;
-		daddr.family = AF_INET;
-		daddr.addr.a4 = inet_sk(sk)->inet_daddr;
+		inetpeer_set_addr_v4(&saddr, inet_sk(sk)->inet_saddr);
+		inetpeer_set_addr_v4(&daddr, inet_sk(sk)->inet_daddr);
 		hash = ipv4_addr_hash(inet_sk(sk)->inet_daddr);
 	}
 #if IS_ENABLED(CONFIG_IPV6)
 	else if (sk->sk_family == AF_INET6) {
 		if (ipv6_addr_v4mapped(&sk->sk_v6_daddr)) {
-			saddr.family = AF_INET;
-			saddr.addr.a4 = inet_sk(sk)->inet_saddr;
-			daddr.family = AF_INET;
-			daddr.addr.a4 = inet_sk(sk)->inet_daddr;
+			inetpeer_set_addr_v4(&saddr, inet_sk(sk)->inet_saddr);
+			inetpeer_set_addr_v4(&daddr, inet_sk(sk)->inet_daddr);
 			hash = ipv4_addr_hash(inet_sk(sk)->inet_daddr);
 		} else {
-			saddr.family = AF_INET6;
-			saddr.addr.in6 = sk->sk_v6_rcv_saddr;
-			daddr.family = AF_INET6;
-			daddr.addr.in6 = sk->sk_v6_daddr;
+			inetpeer_set_addr_v6(&saddr, &sk->sk_v6_rcv_saddr);
+			inetpeer_set_addr_v6(&daddr, &sk->sk_v6_daddr);
 			hash = ipv6_addr_hash(&sk->sk_v6_daddr);
 		}
 	}
@@ -796,18 +784,18 @@ static int tcp_metrics_fill_info(struct sk_buff *msg,
 	switch (tm->tcpm_daddr.family) {
 	case AF_INET:
 		if (nla_put_in_addr(msg, TCP_METRICS_ATTR_ADDR_IPV4,
-				    tm->tcpm_daddr.addr.a4) < 0)
+				    inetpeer_get_addr_v4(&tm->tcpm_daddr)) < 0)
 			goto nla_put_failure;
 		if (nla_put_in_addr(msg, TCP_METRICS_ATTR_SADDR_IPV4,
-				    tm->tcpm_saddr.addr.a4) < 0)
+				    inetpeer_get_addr_v4(&tm->tcpm_saddr)) < 0)
 			goto nla_put_failure;
 		break;
 	case AF_INET6:
 		if (nla_put_in6_addr(msg, TCP_METRICS_ATTR_ADDR_IPV6,
-				     &tm->tcpm_daddr.addr.in6) < 0)
+				     inetpeer_get_addr_v6(&tm->tcpm_daddr)) < 0)
 			goto nla_put_failure;
 		if (nla_put_in6_addr(msg, TCP_METRICS_ATTR_SADDR_IPV6,
-				     &tm->tcpm_saddr.addr.in6) < 0)
+				     inetpeer_get_addr_v6(&tm->tcpm_saddr)) < 0)
 			goto nla_put_failure;
 		break;
 	default:
@@ -956,20 +944,21 @@ static int __parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr,
 
 	a = info->attrs[v4];
 	if (a) {
-		addr->family = AF_INET;
-		addr->addr.a4 = nla_get_in_addr(a);
+		inetpeer_set_addr_v4(addr, nla_get_in_addr(a));
 		if (hash)
-			*hash = ipv4_addr_hash(addr->addr.a4);
+			*hash = ipv4_addr_hash(inetpeer_get_addr_v4(addr));
 		return 0;
 	}
 	a = info->attrs[v6];
 	if (a) {
+		struct in6_addr in6;
+
 		if (nla_len(a) != sizeof(struct in6_addr))
 			return -EINVAL;
-		addr->family = AF_INET6;
-		addr->addr.in6 = nla_get_in6_addr(a);
+		in6 = nla_get_in6_addr(a);
+		inetpeer_set_addr_v6(addr, &in6);
 		if (hash)
-			*hash = ipv6_addr_hash(&addr->addr.in6);
+			*hash = ipv6_addr_hash(inetpeer_get_addr_v6(addr));
 		return 0;
 	}
 	return optional ? 1 : -EAFNOSUPPORT;
-- 
cgit v1.2.3


From d39d14ffa24cca9f0e44aa4a63315f4c44c56a93 Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Thu, 27 Aug 2015 16:07:01 -0700
Subject: net: Add helper function to compare inetpeer addresses

tcp_metrics and inetpeer both have functions to compare inetpeer
addresses. Consolidate into 1 version.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inetpeer.h | 16 ++++++++++++++++
 net/ipv4/inetpeer.c    | 20 ++------------------
 net/ipv4/tcp_metrics.c |  6 +-----
 3 files changed, 19 insertions(+), 23 deletions(-)

(limited to 'include/net')

diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h
index f75b9e7036a2..9d9b3446731d 100644
--- a/include/net/inetpeer.h
+++ b/include/net/inetpeer.h
@@ -121,6 +121,22 @@ static inline struct inet_peer *inet_getpeer_v6(struct inet_peer_base *base,
 	return inet_getpeer(base, &daddr, create);
 }
 
+static inline int inetpeer_addr_cmp(const struct inetpeer_addr *a,
+				    const struct inetpeer_addr *b)
+{
+	int i, n = (a->family == AF_INET ? 1 : 4);
+
+	for (i = 0; i < n; i++) {
+		if (a->addr.a6[i] == b->addr.a6[i])
+			continue;
+		if ((__force u32)a->addr.a6[i] < (__force u32)b->addr.a6[i])
+			return -1;
+		return 1;
+	}
+
+	return 0;
+}
+
 /* can be called from BH context or outside */
 void inet_putpeer(struct inet_peer *p);
 bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout);
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index 241afd743d2c..86fa45809540 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -157,22 +157,6 @@ void __init inet_initpeers(void)
 	INIT_DEFERRABLE_WORK(&gc_work, inetpeer_gc_worker);
 }
 
-static int addr_compare(const struct inetpeer_addr *a,
-			const struct inetpeer_addr *b)
-{
-	int i, n = (a->family == AF_INET ? 1 : 4);
-
-	for (i = 0; i < n; i++) {
-		if (a->addr.a6[i] == b->addr.a6[i])
-			continue;
-		if ((__force u32)a->addr.a6[i] < (__force u32)b->addr.a6[i])
-			return -1;
-		return 1;
-	}
-
-	return 0;
-}
-
 #define rcu_deref_locked(X, BASE)				\
 	rcu_dereference_protected(X, lockdep_is_held(&(BASE)->lock.lock))
 
@@ -188,7 +172,7 @@ static int addr_compare(const struct inetpeer_addr *a,
 	*stackptr++ = &_base->root;				\
 	for (u = rcu_deref_locked(_base->root, _base);		\
 	     u != peer_avl_empty;) {				\
-		int cmp = addr_compare(_daddr, &u->daddr);	\
+		int cmp = inetpeer_addr_cmp(_daddr, &u->daddr);	\
 		if (cmp == 0)					\
 			break;					\
 		if (cmp == -1)					\
@@ -215,7 +199,7 @@ static struct inet_peer *lookup_rcu(const struct inetpeer_addr *daddr,
 	int count = 0;
 
 	while (u != peer_avl_empty) {
-		int cmp = addr_compare(daddr, &u->daddr);
+		int cmp = inetpeer_addr_cmp(daddr, &u->daddr);
 		if (cmp == 0) {
 			/* Before taking a reference, check if this entry was
 			 * deleted (refcnt=-1)
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 4ef4dd4bf38c..c8cbc2b4b792 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -81,11 +81,7 @@ static void tcp_metric_set(struct tcp_metrics_block *tm,
 static bool addr_same(const struct inetpeer_addr *a,
 		      const struct inetpeer_addr *b)
 {
-	if (a->family != b->family)
-		return false;
-	if (a->family == AF_INET)
-		return a->addr.a4 == b->addr.a4;
-	return ipv6_addr_equal(&a->addr.in6, &b->addr.in6);
+	return inetpeer_addr_cmp(a, b) == 0;
 }
 
 struct tcpm_hash_bucket {
-- 
cgit v1.2.3


From 5345c2e12d41f815c1009c9dee72f3d5fcfd4282 Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Thu, 27 Aug 2015 16:07:02 -0700
Subject: net: Refactor inetpeer address struct

Move the inetpeer_addr_base union to inetpeer_addr and drop
inetpeer_addr_base.

Both the a6 and in6_addr overlays are not needed; drop the __be32 version
and rename in6 to a6 for consistency with ipv4. Add a new u32 array to
the union which removes the need for the typecast in the compare function
and the use of a consistent arg for both ipv4 and ipv6 addresses which
makes the compare function more readable.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inetpeer.h | 35 +++++++++++++++++++----------------
 1 file changed, 19 insertions(+), 16 deletions(-)

(limited to 'include/net')

diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h
index 9d9b3446731d..e34f98aa93b1 100644
--- a/include/net/inetpeer.h
+++ b/include/net/inetpeer.h
@@ -15,16 +15,14 @@
 #include <net/ipv6.h>
 #include <linux/atomic.h>
 
-struct inetpeer_addr_base {
+#define INETPEER_MAXKEYSZ   (sizeof(struct in6_addr) / sizeof(u32))
+
+struct inetpeer_addr {
 	union {
 		__be32			a4;
-		__be32			a6[4];
-		struct in6_addr		in6;
+		struct in6_addr		a6;
+		u32			key[INETPEER_MAXKEYSZ];
 	};
-};
-
-struct inetpeer_addr {
-	struct inetpeer_addr_base	addr;
 	__u16				family;
 };
 
@@ -73,25 +71,25 @@ void inet_initpeers(void) __init;
 
 static inline void inetpeer_set_addr_v4(struct inetpeer_addr *iaddr, __be32 ip)
 {
-	iaddr->addr.a4 = ip;
+	iaddr->a4 = ip;
 	iaddr->family = AF_INET;
 }
 
 static inline __be32 inetpeer_get_addr_v4(struct inetpeer_addr *iaddr)
 {
-	return iaddr->addr.a4;
+	return iaddr->a4;
 }
 
 static inline void inetpeer_set_addr_v6(struct inetpeer_addr *iaddr,
 					struct in6_addr *in6)
 {
-	iaddr->addr.in6 = *in6;
+	iaddr->a6 = *in6;
 	iaddr->family = AF_INET6;
 }
 
 static inline struct in6_addr *inetpeer_get_addr_v6(struct inetpeer_addr *iaddr)
 {
-	return &iaddr->addr.in6;
+	return &iaddr->a6;
 }
 
 /* can be called with or without local BH being disabled */
@@ -105,7 +103,7 @@ static inline struct inet_peer *inet_getpeer_v4(struct inet_peer_base *base,
 {
 	struct inetpeer_addr daddr;
 
-	daddr.addr.a4 = v4daddr;
+	daddr.a4 = v4daddr;
 	daddr.family = AF_INET;
 	return inet_getpeer(base, &daddr, create);
 }
@@ -116,7 +114,7 @@ static inline struct inet_peer *inet_getpeer_v6(struct inet_peer_base *base,
 {
 	struct inetpeer_addr daddr;
 
-	daddr.addr.in6 = *v6daddr;
+	daddr.a6 = *v6daddr;
 	daddr.family = AF_INET6;
 	return inet_getpeer(base, &daddr, create);
 }
@@ -124,12 +122,17 @@ static inline struct inet_peer *inet_getpeer_v6(struct inet_peer_base *base,
 static inline int inetpeer_addr_cmp(const struct inetpeer_addr *a,
 				    const struct inetpeer_addr *b)
 {
-	int i, n = (a->family == AF_INET ? 1 : 4);
+	int i, n;
+
+	if (a->family == AF_INET)
+		n = sizeof(a->a4) / sizeof(u32);
+	else
+		n = sizeof(a->a6) / sizeof(u32);
 
 	for (i = 0; i < n; i++) {
-		if (a->addr.a6[i] == b->addr.a6[i])
+		if (a->key[i] == b->key[i])
 			continue;
-		if ((__force u32)a->addr.a6[i] < (__force u32)b->addr.a6[i])
+		if (a->key[i] < b->key[i])
 			return -1;
 		return 1;
 	}
-- 
cgit v1.2.3


From 192132b9a034d87566294be0fba5f8f75c2cf16b Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Thu, 27 Aug 2015 16:07:03 -0700
Subject: net: Add support for VRFs to inetpeer cache

inetpeer caches based on address only, so duplicate IP addresses within
a namespace return the same cached entry. Enhance the ipv4 address key
to contain both the IPv4 address and VRF device index.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inetpeer.h | 17 ++++++++++++-----
 net/ipv4/icmp.c        |  3 ++-
 net/ipv4/ip_fragment.c |  3 ++-
 net/ipv4/route.c       |  7 +++++--
 4 files changed, 21 insertions(+), 9 deletions(-)

(limited to 'include/net')

diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h
index e34f98aa93b1..4a6009d4486b 100644
--- a/include/net/inetpeer.h
+++ b/include/net/inetpeer.h
@@ -15,11 +15,17 @@
 #include <net/ipv6.h>
 #include <linux/atomic.h>
 
+/* IPv4 address key for cache lookups */
+struct ipv4_addr_key {
+	__be32	addr;
+	int	vif;
+};
+
 #define INETPEER_MAXKEYSZ   (sizeof(struct in6_addr) / sizeof(u32))
 
 struct inetpeer_addr {
 	union {
-		__be32			a4;
+		struct ipv4_addr_key	a4;
 		struct in6_addr		a6;
 		u32			key[INETPEER_MAXKEYSZ];
 	};
@@ -71,13 +77,13 @@ void inet_initpeers(void) __init;
 
 static inline void inetpeer_set_addr_v4(struct inetpeer_addr *iaddr, __be32 ip)
 {
-	iaddr->a4 = ip;
+	iaddr->a4.addr = ip;
 	iaddr->family = AF_INET;
 }
 
 static inline __be32 inetpeer_get_addr_v4(struct inetpeer_addr *iaddr)
 {
-	return iaddr->a4;
+	return iaddr->a4.addr;
 }
 
 static inline void inetpeer_set_addr_v6(struct inetpeer_addr *iaddr,
@@ -99,11 +105,12 @@ struct inet_peer *inet_getpeer(struct inet_peer_base *base,
 
 static inline struct inet_peer *inet_getpeer_v4(struct inet_peer_base *base,
 						__be32 v4daddr,
-						int create)
+						int vif, int create)
 {
 	struct inetpeer_addr daddr;
 
-	daddr.a4 = v4daddr;
+	daddr.a4.addr = v4daddr;
+	daddr.a4.vif = vif;
 	daddr.family = AF_INET;
 	return inet_getpeer(base, &daddr, create);
 }
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index f16488efa1c8..79fe05befcae 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -309,9 +309,10 @@ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
 
 	rc = false;
 	if (icmp_global_allow()) {
+		int vif = vrf_master_ifindex(dst->dev);
 		struct inet_peer *peer;
 
-		peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, 1);
+		peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, vif, 1);
 		rc = inet_peer_xrlim_allow(peer,
 					   net->ipv4.sysctl_icmp_ratelimit);
 		if (peer)
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 15762e758861..fa7f15305f9a 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -151,7 +151,8 @@ static void ip4_frag_init(struct inet_frag_queue *q, const void *a)
 	qp->vif = arg->vif;
 	qp->user = arg->user;
 	qp->peer = sysctl_ipfrag_max_dist ?
-		inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, 1) : NULL;
+		inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, arg->vif, 1) :
+		NULL;
 }
 
 static void ip4_frag_free(struct inet_frag_queue *q)
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index f3087aaa6dd8..6b91879e9cbe 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -838,6 +838,7 @@ void ip_rt_send_redirect(struct sk_buff *skb)
 	struct inet_peer *peer;
 	struct net *net;
 	int log_martians;
+	int vif;
 
 	rcu_read_lock();
 	in_dev = __in_dev_get_rcu(rt->dst.dev);
@@ -846,10 +847,11 @@ void ip_rt_send_redirect(struct sk_buff *skb)
 		return;
 	}
 	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
+	vif = vrf_master_ifindex_rcu(rt->dst.dev);
 	rcu_read_unlock();
 
 	net = dev_net(rt->dst.dev);
-	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
+	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
 	if (!peer) {
 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
 			  rt_nexthop(rt, ip_hdr(skb)->daddr));
@@ -938,7 +940,8 @@ static int ip_error(struct sk_buff *skb)
 		break;
 	}
 
-	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
+	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
+			       vrf_master_ifindex(skb->dev), 1);
 
 	send = true;
 	if (peer) {
-- 
cgit v1.2.3


From 46fa062ad63146dd138ec0f017e71224471e8ea5 Mon Sep 17 00:00:00 2001
From: Jiri Benc <jbenc@redhat.com>
Date: Fri, 28 Aug 2015 20:48:19 +0200
Subject: ip_tunnels: convert the mode field of ip_tunnel_info to flags

The mode field holds a single bit of information only (whether the
ip_tunnel_info struct is for rx or tx). Change the mode field to bit flags.
This allows more mode flags to be added.

Signed-off-by: Jiri Benc <jbenc@redhat.com>
Acked-by: Alexei Starovoitov <ast@plumgrid.com>
Acked-by: Thomas Graf <tgraf@suug.ch>
Acked-by: Pravin B Shelar <pshelar@nicira.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/geneve.c       | 2 +-
 drivers/net/vxlan.c        | 2 +-
 include/net/dst_metadata.h | 1 -
 include/net/ip_tunnels.h   | 9 ++-------
 net/ipv4/ip_gre.c          | 2 +-
 net/ipv4/route.c           | 2 +-
 net/ipv6/route.c           | 2 +-
 7 files changed, 7 insertions(+), 13 deletions(-)

(limited to 'include/net')

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index 4357bae732d7..4a39c09f144c 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -623,7 +623,7 @@ static netdev_tx_t geneve_xmit(struct sk_buff *skb, struct net_device *dev)
 
 	if (geneve->collect_md) {
 		info = skb_tunnel_info(skb);
-		if (unlikely(info && info->mode != IP_TUNNEL_INFO_TX)) {
+		if (unlikely(info && !(info->mode & IP_TUNNEL_INFO_TX))) {
 			netdev_dbg(dev, "no tunnel metadata\n");
 			goto tx_error;
 		}
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 30e56cb58884..bd1b8cdf2bf6 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -2113,7 +2113,7 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
 	}
 
 	if (vxlan->flags & VXLAN_F_COLLECT_METADATA &&
-	    info && info->mode == IP_TUNNEL_INFO_TX) {
+	    info && info->mode & IP_TUNNEL_INFO_TX) {
 		vxlan_xmit_one(skb, dev, NULL, false);
 		return NETDEV_TX_OK;
 	}
diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h
index 60c03326c087..2b83f0d232e0 100644
--- a/include/net/dst_metadata.h
+++ b/include/net/dst_metadata.h
@@ -59,7 +59,6 @@ static inline struct metadata_dst *tun_rx_dst(__be16 flags,
 		return NULL;
 
 	info = &tun_dst->u.tun_info;
-	info->mode = IP_TUNNEL_INFO_RX;
 	info->key.tun_flags = flags;
 	info->key.tun_id = tunnel_id;
 	info->key.tp_src = 0;
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 224e4ecec91b..9bdb3948798f 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -50,13 +50,8 @@ struct ip_tunnel_key {
 	__be16			tp_dst;
 };
 
-/* Indicates whether the tunnel info structure represents receive
- * or transmit tunnel parameters.
- */
-enum {
-	IP_TUNNEL_INFO_RX,
-	IP_TUNNEL_INFO_TX,
-};
+/* Flags for ip_tunnel_info mode. */
+#define IP_TUNNEL_INFO_TX	0x01	/* represents tx tunnel parameters */
 
 struct ip_tunnel_info {
 	struct ip_tunnel_key	key;
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index faf1cde6f8da..1e813a9f9378 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -511,7 +511,7 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev)
 	int err;
 
 	tun_info = skb_tunnel_info(skb);
-	if (unlikely(!tun_info || tun_info->mode != IP_TUNNEL_INFO_TX))
+	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX)))
 		goto err_free_skb;
 
 	key = &tun_info->key;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 6b91879e9cbe..5f4a5565ad8b 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1696,7 +1696,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	 */
 
 	tun_info = skb_tunnel_info(skb);
-	if (tun_info && tun_info->mode == IP_TUNNEL_INFO_RX)
+	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
 		fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
 	else
 		fl4.flowi4_tun_key.tun_id = 0;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index df3e353a012d..308dd5f9158f 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1174,7 +1174,7 @@ void ip6_route_input(struct sk_buff *skb)
 	};
 
 	tun_info = skb_tunnel_info(skb);
-	if (tun_info && tun_info->mode == IP_TUNNEL_INFO_RX)
+	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
 		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
 	skb_dst_drop(skb);
 	skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
-- 
cgit v1.2.3


From 7f9562a1f405306eacb97f95d78cb996e33f27f5 Mon Sep 17 00:00:00 2001
From: Jiri Benc <jbenc@redhat.com>
Date: Fri, 28 Aug 2015 20:48:20 +0200
Subject: ip_tunnels: record IP version in tunnel info

There's currently nothing preventing directing packets with IPv6
encapsulation data to IPv4 tunnels (and vice versa). If this happens,
IPv6 addresses are incorrectly interpreted as IPv4 ones.

Track whether the given ip_tunnel_key contains IPv4 or IPv6 data. Store this
in ip_tunnel_info. Reject packets at appropriate places if they are supposed
to be encapsulated into an incompatible protocol.

Signed-off-by: Jiri Benc <jbenc@redhat.com>
Acked-by: Alexei Starovoitov <ast@plumgrid.com>
Acked-by: Thomas Graf <tgraf@suug.ch>
Acked-by: Pravin B Shelar <pshelar@nicira.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/geneve.c       |  2 ++
 drivers/net/vxlan.c        |  2 ++
 include/net/dst_metadata.h |  1 +
 include/net/ip_tunnels.h   | 10 ++++++++++
 net/core/filter.c          |  2 ++
 net/ipv4/ip_gre.c          |  3 ++-
 net/ipv4/ip_tunnel_core.c  |  2 +-
 net/openvswitch/flow.c     |  2 ++
 net/openvswitch/vport.c    |  2 ++
 9 files changed, 24 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index 4a39c09f144c..3908a22f23d1 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -627,6 +627,8 @@ static netdev_tx_t geneve_xmit(struct sk_buff *skb, struct net_device *dev)
 			netdev_dbg(dev, "no tunnel metadata\n");
 			goto tx_error;
 		}
+		if (info && ip_tunnel_info_af(info) != AF_INET)
+			goto tx_error;
 	}
 
 	rt = geneve_get_rt(skb, dev, &fl4, info);
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index bd1b8cdf2bf6..e3adfe0ef66b 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -1903,6 +1903,8 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 				  dev->name);
 			goto drop;
 		}
+		if (family != ip_tunnel_info_af(info))
+			goto drop;
 
 		dst_port = info->key.tp_dst ? : vxlan->cfg.dst_port;
 		vni = be64_to_cpu(info->key.tun_id);
diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h
index 2b83f0d232e0..d32f49cc621d 100644
--- a/include/net/dst_metadata.h
+++ b/include/net/dst_metadata.h
@@ -105,6 +105,7 @@ static inline struct metadata_dst *ipv6_tun_rx_dst(struct sk_buff *skb,
 	info->key.u.ipv6.dst = ip6h->daddr;
 	info->key.tos = ipv6_get_dsfield(ip6h);
 	info->key.ttl = ip6h->hop_limit;
+	info->mode = IP_TUNNEL_INFO_IPV6;
 	return tun_dst;
 }
 
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 9bdb3948798f..2b4fa06e91bd 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -4,6 +4,7 @@
 #include <linux/if_tunnel.h>
 #include <linux/netdevice.h>
 #include <linux/skbuff.h>
+#include <linux/socket.h>
 #include <linux/types.h>
 #include <linux/u64_stats_sync.h>
 #include <net/dsfield.h>
@@ -52,6 +53,7 @@ struct ip_tunnel_key {
 
 /* Flags for ip_tunnel_info mode. */
 #define IP_TUNNEL_INFO_TX	0x01	/* represents tx tunnel parameters */
+#define IP_TUNNEL_INFO_IPV6	0x02	/* key contains IPv6 addresses */
 
 struct ip_tunnel_info {
 	struct ip_tunnel_key	key;
@@ -208,6 +210,8 @@ static inline void __ip_tunnel_info_init(struct ip_tunnel_info *tun_info,
 
 	tun_info->options = opts;
 	tun_info->options_len = opts_len;
+
+	tun_info->mode = 0;
 }
 
 static inline void ip_tunnel_info_init(struct ip_tunnel_info *tun_info,
@@ -221,6 +225,12 @@ static inline void ip_tunnel_info_init(struct ip_tunnel_info *tun_info,
 			      tun_id, tun_flags, opts, opts_len);
 }
 
+static inline unsigned short ip_tunnel_info_af(const struct ip_tunnel_info
+					       *tun_info)
+{
+	return tun_info->mode & IP_TUNNEL_INFO_IPV6 ? AF_INET6 : AF_INET;
+}
+
 #ifdef CONFIG_INET
 
 int ip_tunnel_init(struct net_device *dev);
diff --git a/net/core/filter.c b/net/core/filter.c
index 66500d490995..13079f03902e 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1493,6 +1493,8 @@ static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
 
 	if (unlikely(size != sizeof(struct bpf_tunnel_key) || flags || !info))
 		return -EINVAL;
+	if (ip_tunnel_info_af(info) != AF_INET)
+		return -EINVAL;
 
 	to->tunnel_id = be64_to_cpu(info->key.tun_id);
 	to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src);
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 1e813a9f9378..bd0679d90519 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -511,7 +511,8 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev)
 	int err;
 
 	tun_info = skb_tunnel_info(skb);
-	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX)))
+	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
+		     ip_tunnel_info_af(tun_info) != AF_INET))
 		goto err_free_skb;
 
 	key = &tun_info->key;
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 934f2ac8ad61..0c756ade1cf7 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -356,7 +356,7 @@ static int ip6_tun_build_state(struct net_device *dev, struct nlattr *attr,
 	if (tb[LWTUNNEL_IP6_FLAGS])
 		tun_info->key.tun_flags = nla_get_u16(tb[LWTUNNEL_IP6_FLAGS]);
 
-	tun_info->mode = IP_TUNNEL_INFO_TX;
+	tun_info->mode = IP_TUNNEL_INFO_TX | IP_TUNNEL_INFO_IPV6;
 	tun_info->options = NULL;
 	tun_info->options_len = 0;
 
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index 5a3195e538ce..9760dc43bdb9 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -688,6 +688,8 @@ int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info,
 {
 	/* Extract metadata from packet. */
 	if (tun_info) {
+		if (ip_tunnel_info_af(tun_info) != AF_INET)
+			return -EINVAL;
 		memcpy(&key->tun_key, &tun_info->key, sizeof(key->tun_key));
 
 		if (tun_info->options) {
diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c
index e2dc9dac59e6..40164037928e 100644
--- a/net/openvswitch/vport.c
+++ b/net/openvswitch/vport.c
@@ -587,6 +587,8 @@ int ovs_tunnel_get_egress_info(struct ip_tunnel_info *egress_tun_info,
 
 	if (unlikely(!tun_info))
 		return -EINVAL;
+	if (ip_tunnel_info_af(tun_info) != AF_INET)
+		return -EINVAL;
 
 	tun_key = &tun_info->key;
 
-- 
cgit v1.2.3


From a43a9ef6a2e510fec61176ff2c34fab3e7d581da Mon Sep 17 00:00:00 2001
From: Jiri Benc <jbenc@redhat.com>
Date: Fri, 28 Aug 2015 20:48:22 +0200
Subject: vxlan: do not receive IPv4 packets on IPv6 socket

By default (subject to the sysctl settings), IPv6 sockets listen also for
IPv4 traffic. Vxlan is not prepared for that and expects IPv6 header in
packets received through an IPv6 socket.

In addition, it's currently not possible to have both IPv4 and IPv6 vxlan
tunnel on the same port (unless bindv6only sysctl is enabled), as it's not
possible to create and bind both IPv4 and IPv6 vxlan interfaces and there's
no way to specify both IPv4 and IPv6 remote/group IP addresses.

Set IPV6_V6ONLY on vxlan sockets to fix both of these issues. This is not
done globally in udp_tunnel, as l2tp and tipc seems to work okay when
receiving IPv4 packets on IPv6 socket and people may rely on this behavior.
The other tunnels (geneve and fou) do not support IPv6.

Signed-off-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vxlan.c       | 1 +
 include/net/udp_tunnel.h  | 3 ++-
 net/ipv6/ip6_udp_tunnel.c | 9 +++++++++
 3 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index e3adfe0ef66b..6c5269aea544 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -2530,6 +2530,7 @@ static struct socket *vxlan_create_sock(struct net *net, bool ipv6,
 		udp_conf.family = AF_INET6;
 		udp_conf.use_udp6_rx_checksums =
 		    !(flags & VXLAN_F_UDP_ZERO_CSUM6_RX);
+		udp_conf.ipv6_v6only = 1;
 	} else {
 		udp_conf.family = AF_INET;
 	}
diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h
index 35041d0fc21e..cb2f89f20f5c 100644
--- a/include/net/udp_tunnel.h
+++ b/include/net/udp_tunnel.h
@@ -31,7 +31,8 @@ struct udp_port_cfg {
 	__be16			peer_udp_port;
 	unsigned int		use_udp_checksums:1,
 				use_udp6_tx_checksums:1,
-				use_udp6_rx_checksums:1;
+				use_udp6_rx_checksums:1,
+				ipv6_v6only:1;
 };
 
 int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg,
diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c
index e1a1136bda7c..14dacf1df529 100644
--- a/net/ipv6/ip6_udp_tunnel.c
+++ b/net/ipv6/ip6_udp_tunnel.c
@@ -23,6 +23,15 @@ int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg,
 	if (err < 0)
 		goto error;
 
+	if (cfg->ipv6_v6only) {
+		int val = 1;
+
+		err = kernel_setsockopt(sock, IPPROTO_IPV6, IPV6_V6ONLY,
+					(char *) &val, sizeof(val));
+		if (err < 0)
+			goto error;
+	}
+
 	udp6_addr.sin6_family = AF_INET6;
 	memcpy(&udp6_addr.sin6_addr, &cfg->local_ip6,
 	       sizeof(udp6_addr.sin6_addr));
-- 
cgit v1.2.3


From c4c6bc314618f60ba69b0cbf93e506e4c38a11d2 Mon Sep 17 00:00:00 2001
From: Raghavendra K T <raghavendra.kt@linux.vnet.ibm.com>
Date: Sun, 30 Aug 2015 11:29:41 +0530
Subject: net: Introduce helper functions to get the per cpu data

Signed-off-by: Raghavendra K T <raghavendra.kt@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip.h   | 10 ++++++++++
 net/ipv4/af_inet.c | 41 +++++++++++++++++++++++++++--------------
 2 files changed, 37 insertions(+), 14 deletions(-)

(limited to 'include/net')

diff --git a/include/net/ip.h b/include/net/ip.h
index 7b9e1c782aa3..9b9ca2839399 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -202,10 +202,20 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
 #define NET_ADD_STATS_BH(net, field, adnd) SNMP_ADD_STATS_BH((net)->mib.net_statistics, field, adnd)
 #define NET_ADD_STATS_USER(net, field, adnd) SNMP_ADD_STATS_USER((net)->mib.net_statistics, field, adnd)
 
+u64 snmp_get_cpu_field(void __percpu *mib, int cpu, int offct);
 unsigned long snmp_fold_field(void __percpu *mib, int offt);
 #if BITS_PER_LONG==32
+u64 snmp_get_cpu_field64(void __percpu *mib, int cpu, int offct,
+			 size_t syncp_offset);
 u64 snmp_fold_field64(void __percpu *mib, int offt, size_t sync_off);
 #else
+static inline u64  snmp_get_cpu_field64(void __percpu *mib, int cpu, int offct,
+					size_t syncp_offset)
+{
+	return snmp_get_cpu_field(mib, cpu, offct);
+
+}
+
 static inline u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_off)
 {
 	return snmp_fold_field(mib, offt);
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 675e88cac2b4..0c69c0bbe1a1 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1452,38 +1452,51 @@ int inet_ctl_sock_create(struct sock **sk, unsigned short family,
 }
 EXPORT_SYMBOL_GPL(inet_ctl_sock_create);
 
+u64 snmp_get_cpu_field(void __percpu *mib, int cpu, int offt)
+{
+	return  *(((unsigned long *)per_cpu_ptr(mib, cpu)) + offt);
+}
+EXPORT_SYMBOL_GPL(snmp_get_cpu_field);
+
 unsigned long snmp_fold_field(void __percpu *mib, int offt)
 {
 	unsigned long res = 0;
 	int i;
 
 	for_each_possible_cpu(i)
-		res += *(((unsigned long *) per_cpu_ptr(mib, i)) + offt);
+		res += snmp_get_cpu_field(mib, i, offt);
 	return res;
 }
 EXPORT_SYMBOL_GPL(snmp_fold_field);
 
 #if BITS_PER_LONG==32
 
+u64 snmp_get_cpu_field64(void __percpu *mib, int cpu, int offct,
+			 size_t syncp_offset)
+{
+	void *bhptr;
+	struct u64_stats_sync *syncp;
+	u64 v;
+	unsigned int start;
+
+	bhptr = per_cpu_ptr(mib, cpu);
+	syncp = (struct u64_stats_sync *)(bhptr + syncp_offset);
+	do {
+		start = u64_stats_fetch_begin_irq(syncp);
+		v = *(((u64 *)bhptr) + offt);
+	} while (u64_stats_fetch_retry_irq(syncp, start));
+
+	return v;
+}
+EXPORT_SYMBOL_GPL(snmp_get_cpu_field64);
+
 u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_offset)
 {
 	u64 res = 0;
 	int cpu;
 
 	for_each_possible_cpu(cpu) {
-		void *bhptr;
-		struct u64_stats_sync *syncp;
-		u64 v;
-		unsigned int start;
-
-		bhptr = per_cpu_ptr(mib, cpu);
-		syncp = (struct u64_stats_sync *)(bhptr + syncp_offset);
-		do {
-			start = u64_stats_fetch_begin_irq(syncp);
-			v = *(((u64 *) bhptr) + offt);
-		} while (u64_stats_fetch_retry_irq(syncp, start));
-
-		res += v;
+		res += snmp_get_cpu_field(mib, cpu, offct, syncp_offset);
 	}
 	return res;
 }
-- 
cgit v1.2.3


From 4c22279848c531fc7f555d463daf3d0df963bd41 Mon Sep 17 00:00:00 2001
From: Pravin B Shelar <pshelar@nicira.com>
Date: Sun, 30 Aug 2015 18:09:38 -0700
Subject: ip-tunnel: Use API to access tunnel metadata options.

Currently tun-info options pointer is used in few cases to
pass options around. But tunnel options can be accessed using
ip_tunnel_info_opts() API without using the pointer. Following
patch removes the redundant pointer and consistently make use
of API.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Thomas Graf <tgraf@suug.ch>
Reviewed-by: Jesse Gross <jesse@nicira.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/geneve.c           |  9 ++----
 drivers/net/vxlan.c            |  4 +--
 include/net/dst_metadata.h     | 31 +++++++++----------
 include/net/ip_tunnels.h       | 67 +++++++++++++++++++-----------------------
 net/ipv4/ip_tunnel_core.c      |  2 --
 net/openvswitch/actions.c      |  8 +++--
 net/openvswitch/datapath.c     |  3 +-
 net/openvswitch/datapath.h     |  3 +-
 net/openvswitch/flow.c         |  7 +++--
 net/openvswitch/flow_netlink.c | 27 +++++++----------
 net/openvswitch/flow_netlink.h |  3 +-
 net/openvswitch/vport-geneve.c |  5 ++--
 net/openvswitch/vport-gre.c    |  5 ++--
 net/openvswitch/vport-vxlan.c  |  4 +--
 net/openvswitch/vport.c        | 27 +++++++++--------
 net/openvswitch/vport.h        |  7 +++--
 16 files changed, 100 insertions(+), 112 deletions(-)

(limited to 'include/net')

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index 68b0f0325fc7..da3259ce7c8d 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -143,7 +143,6 @@ static void geneve_rx(struct geneve_sock *gs, struct sk_buff *skb)
 
 	if (ip_tunnel_collect_metadata() || gs->collect_md) {
 		__be16 flags;
-		void *opts;
 
 		flags = TUNNEL_KEY | TUNNEL_GENEVE_OPT |
 			(gnvh->oam ? TUNNEL_OAM : 0) |
@@ -154,11 +153,9 @@ static void geneve_rx(struct geneve_sock *gs, struct sk_buff *skb)
 					 gnvh->opt_len * 4);
 		if (!tun_dst)
 			goto drop;
-
 		/* Update tunnel dst according to Geneve options. */
-		opts = ip_tunnel_info_opts(&tun_dst->u.tun_info,
-					   gnvh->opt_len * 4);
-		memcpy(opts, gnvh->options, gnvh->opt_len * 4);
+		ip_tunnel_info_opts_set(&tun_dst->u.tun_info,
+					gnvh->options, gnvh->opt_len * 4);
 	} else {
 		/* Drop packets w/ critical options,
 		 * since we don't support any...
@@ -663,7 +660,7 @@ static netdev_tx_t geneve_xmit(struct sk_buff *skb, struct net_device *dev)
 
 		tunnel_id_to_vni(key->tun_id, vni);
 		if (key->tun_flags & TUNNEL_GENEVE_OPT)
-			opts = ip_tunnel_info_opts(info, info->options_len);
+			opts = ip_tunnel_info_opts(info);
 
 		udp_csum = !!(key->tun_flags & TUNNEL_CSUM);
 		err = geneve_build_skb(rt, skb, key->tun_flags, vni,
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 6c5269aea544..ce988fd01b34 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -1271,7 +1271,7 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
 			goto drop;
 
 		info = &tun_dst->u.tun_info;
-		md = ip_tunnel_info_opts(info, sizeof(*md));
+		md = ip_tunnel_info_opts(info);
 	} else {
 		memset(md, 0, sizeof(*md));
 	}
@@ -1948,7 +1948,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 		tos = info->key.tos;
 
 		if (info->options_len)
-			md = ip_tunnel_info_opts(info, sizeof(*md));
+			md = ip_tunnel_info_opts(info);
 	} else {
 		md->gbp = skb->mark;
 	}
diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h
index d32f49cc621d..547ab8241593 100644
--- a/include/net/dst_metadata.h
+++ b/include/net/dst_metadata.h
@@ -48,21 +48,16 @@ static inline bool skb_valid_dst(const struct sk_buff *skb)
 struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags);
 struct metadata_dst __percpu *metadata_dst_alloc_percpu(u8 optslen, gfp_t flags);
 
-static inline struct metadata_dst *tun_rx_dst(__be16 flags,
-					      __be64 tunnel_id, int md_size)
+static inline struct metadata_dst *tun_rx_dst(int md_size)
 {
 	struct metadata_dst *tun_dst;
-	struct ip_tunnel_info *info;
 
 	tun_dst = metadata_dst_alloc(md_size, GFP_ATOMIC);
 	if (!tun_dst)
 		return NULL;
 
-	info = &tun_dst->u.tun_info;
-	info->key.tun_flags = flags;
-	info->key.tun_id = tunnel_id;
-	info->key.tp_src = 0;
-	info->key.tp_dst = 0;
+	tun_dst->u.tun_info.options_len = 0;
+	tun_dst->u.tun_info.mode = 0;
 	return tun_dst;
 }
 
@@ -73,17 +68,14 @@ static inline struct metadata_dst *ip_tun_rx_dst(struct sk_buff *skb,
 {
 	const struct iphdr *iph = ip_hdr(skb);
 	struct metadata_dst *tun_dst;
-	struct ip_tunnel_info *info;
 
-	tun_dst = tun_rx_dst(flags, tunnel_id, md_size);
+	tun_dst = tun_rx_dst(md_size);
 	if (!tun_dst)
 		return NULL;
 
-	info = &tun_dst->u.tun_info;
-	info->key.u.ipv4.src = iph->saddr;
-	info->key.u.ipv4.dst = iph->daddr;
-	info->key.tos = iph->tos;
-	info->key.ttl = iph->ttl;
+	ip_tunnel_key_init(&tun_dst->u.tun_info.key,
+			   iph->saddr, iph->daddr, iph->tos, iph->ttl,
+			   0, 0, tunnel_id, flags);
 	return tun_dst;
 }
 
@@ -96,16 +88,21 @@ static inline struct metadata_dst *ipv6_tun_rx_dst(struct sk_buff *skb,
 	struct metadata_dst *tun_dst;
 	struct ip_tunnel_info *info;
 
-	tun_dst = tun_rx_dst(flags, tunnel_id, md_size);
+	tun_dst = tun_rx_dst(md_size);
 	if (!tun_dst)
 		return NULL;
 
 	info = &tun_dst->u.tun_info;
+	info->mode = IP_TUNNEL_INFO_IPV6;
+	info->key.tun_flags = flags;
+	info->key.tun_id = tunnel_id;
+	info->key.tp_src = 0;
+	info->key.tp_dst = 0;
+
 	info->key.u.ipv6.src = ip6h->saddr;
 	info->key.u.ipv6.dst = ip6h->daddr;
 	info->key.tos = ipv6_get_dsfield(ip6h);
 	info->key.ttl = ip6h->hop_limit;
-	info->mode = IP_TUNNEL_INFO_IPV6;
 	return tun_dst;
 }
 
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 2b4fa06e91bd..9a6a3ba888e8 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -57,7 +57,6 @@ struct ip_tunnel_key {
 
 struct ip_tunnel_info {
 	struct ip_tunnel_key	key;
-	const void		*options;
 	u8			options_len;
 	u8			mode;
 };
@@ -180,49 +179,32 @@ int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *op,
 int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *op,
 			    unsigned int num);
 
-static inline void __ip_tunnel_info_init(struct ip_tunnel_info *tun_info,
-					 __be32 saddr, __be32 daddr,
-					 u8 tos, u8 ttl,
-					 __be16 tp_src, __be16 tp_dst,
-					 __be64 tun_id, __be16 tun_flags,
-					 const void *opts, u8 opts_len)
+static inline void ip_tunnel_key_init(struct ip_tunnel_key *key,
+				      __be32 saddr, __be32 daddr,
+				      u8 tos, u8 ttl,
+				      __be16 tp_src, __be16 tp_dst,
+				      __be64 tun_id, __be16 tun_flags)
 {
-	tun_info->key.tun_id = tun_id;
-	tun_info->key.u.ipv4.src = saddr;
-	tun_info->key.u.ipv4.dst = daddr;
-	memset((unsigned char *)&tun_info->key + IP_TUNNEL_KEY_IPV4_PAD,
+	key->tun_id = tun_id;
+	key->u.ipv4.src = saddr;
+	key->u.ipv4.dst = daddr;
+	memset((unsigned char *)key + IP_TUNNEL_KEY_IPV4_PAD,
 	       0, IP_TUNNEL_KEY_IPV4_PAD_LEN);
-	tun_info->key.tos = tos;
-	tun_info->key.ttl = ttl;
-	tun_info->key.tun_flags = tun_flags;
+	key->tos = tos;
+	key->ttl = ttl;
+	key->tun_flags = tun_flags;
 
 	/* For the tunnel types on the top of IPsec, the tp_src and tp_dst of
 	 * the upper tunnel are used.
 	 * E.g: GRE over IPSEC, the tp_src and tp_port are zero.
 	 */
-	tun_info->key.tp_src = tp_src;
-	tun_info->key.tp_dst = tp_dst;
+	key->tp_src = tp_src;
+	key->tp_dst = tp_dst;
 
 	/* Clear struct padding. */
-	if (sizeof(tun_info->key) != IP_TUNNEL_KEY_SIZE)
-		memset((unsigned char *)&tun_info->key + IP_TUNNEL_KEY_SIZE,
-		       0, sizeof(tun_info->key) - IP_TUNNEL_KEY_SIZE);
-
-	tun_info->options = opts;
-	tun_info->options_len = opts_len;
-
-	tun_info->mode = 0;
-}
-
-static inline void ip_tunnel_info_init(struct ip_tunnel_info *tun_info,
-				       const struct iphdr *iph,
-				       __be16 tp_src, __be16 tp_dst,
-				       __be64 tun_id, __be16 tun_flags,
-				       const void *opts, u8 opts_len)
-{
-	__ip_tunnel_info_init(tun_info, iph->saddr, iph->daddr,
-			      iph->tos, iph->ttl, tp_src, tp_dst,
-			      tun_id, tun_flags, opts, opts_len);
+	if (sizeof(*key) != IP_TUNNEL_KEY_SIZE)
+		memset((unsigned char *)key + IP_TUNNEL_KEY_SIZE,
+		       0, sizeof(*key) - IP_TUNNEL_KEY_SIZE);
 }
 
 static inline unsigned short ip_tunnel_info_af(const struct ip_tunnel_info
@@ -317,11 +299,24 @@ static inline void iptunnel_xmit_stats(int err,
 	}
 }
 
-static inline void *ip_tunnel_info_opts(struct ip_tunnel_info *info, size_t n)
+static inline void *ip_tunnel_info_opts(struct ip_tunnel_info *info)
 {
 	return info + 1;
 }
 
+static inline void ip_tunnel_info_opts_get(void *to,
+					   const struct ip_tunnel_info *info)
+{
+	memcpy(to, info + 1, info->options_len);
+}
+
+static inline void ip_tunnel_info_opts_set(struct ip_tunnel_info *info,
+					   const void *from, int len)
+{
+	memcpy(ip_tunnel_info_opts(info), from, len);
+	info->options_len = len;
+}
+
 static inline struct ip_tunnel_info *lwt_tun_info(struct lwtunnel_state *lwtstate)
 {
 	return (struct ip_tunnel_info *)lwtstate->data;
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 0c756ade1cf7..29ed6c5a5185 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -249,7 +249,6 @@ static int ip_tun_build_state(struct net_device *dev, struct nlattr *attr,
 		tun_info->key.tun_flags = nla_get_u16(tb[LWTUNNEL_IP_FLAGS]);
 
 	tun_info->mode = IP_TUNNEL_INFO_TX;
-	tun_info->options = NULL;
 	tun_info->options_len = 0;
 
 	*ts = new_state;
@@ -357,7 +356,6 @@ static int ip6_tun_build_state(struct net_device *dev, struct nlattr *attr,
 		tun_info->key.tun_flags = nla_get_u16(tb[LWTUNNEL_IP6_FLAGS]);
 
 	tun_info->mode = IP_TUNNEL_INFO_TX | IP_TUNNEL_INFO_IPV6;
-	tun_info->options = NULL;
 	tun_info->options_len = 0;
 
 	*ts = new_state;
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 090d9e3a460c..315f5330b6e5 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -793,11 +793,13 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb,
 			if (vport) {
 				int err;
 
+				upcall.egress_tun_info = &info;
 				err = ovs_vport_get_egress_tun_info(vport, skb,
-								    &info);
-				if (!err)
-					upcall.egress_tun_info = &info;
+								    &upcall);
+				if (err)
+					upcall.egress_tun_info = NULL;
 			}
+
 			break;
 		}
 
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 60c2ab8e6bc3..6fbd2decb19e 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -491,7 +491,8 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
 	if (upcall_info->egress_tun_info) {
 		nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_EGRESS_TUN_KEY);
 		err = ovs_nla_put_egress_tunnel_key(user_skb,
-						    upcall_info->egress_tun_info);
+						    upcall_info->egress_tun_info,
+						    upcall_info->egress_tun_opts);
 		BUG_ON(err);
 		nla_nest_end(user_skb, nla);
 	}
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index c05b7d9e7bf2..f88038a99f44 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -116,7 +116,8 @@ struct ovs_skb_cb {
  * @mru: If not zero, Maximum received IP fragment size.
  */
 struct dp_upcall_info {
-	const struct ip_tunnel_info *egress_tun_info;
+	struct ip_tunnel_info *egress_tun_info;
+	const void *egress_tun_opts;
 	const struct nlattr *userdata;
 	const struct nlattr *actions;
 	int actions_len;
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index bed8d09230cd..c8db44ab2ee7 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -702,12 +702,13 @@ int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info,
 			return -EINVAL;
 		memcpy(&key->tun_key, &tun_info->key, sizeof(key->tun_key));
 
-		if (tun_info->options) {
+		if (tun_info->options_len) {
 			BUILD_BUG_ON((1 << (sizeof(tun_info->options_len) *
 						   8)) - 1
 					> sizeof(key->tun_opts));
-			memcpy(TUN_METADATA_OPTS(key, tun_info->options_len),
-			       tun_info->options, tun_info->options_len);
+
+			ip_tunnel_info_opts_get(TUN_METADATA_OPTS(key, tun_info->options_len),
+						tun_info);
 			key->tun_opts_len = tun_info->options_len;
 		} else {
 			key->tun_opts_len = 0;
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index e22c5bfe8575..c92d6a262bc5 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -716,10 +716,11 @@ static int ipv4_tun_to_nlattr(struct sk_buff *skb,
 }
 
 int ovs_nla_put_egress_tunnel_key(struct sk_buff *skb,
-				  const struct ip_tunnel_info *egress_tun_info)
+				  const struct ip_tunnel_info *egress_tun_info,
+				  const void *egress_tun_opts)
 {
 	return __ipv4_tun_to_nlattr(skb, &egress_tun_info->key,
-				    egress_tun_info->options,
+				    egress_tun_opts,
 				    egress_tun_info->options_len);
 }
 
@@ -1876,20 +1877,14 @@ static int validate_and_copy_set_tun(const struct nlattr *attr,
 	tun_info = &tun_dst->u.tun_info;
 	tun_info->mode = IP_TUNNEL_INFO_TX;
 	tun_info->key = key.tun_key;
-	tun_info->options_len = key.tun_opts_len;
-
-	if (tun_info->options_len) {
-		/* We need to store the options in the action itself since
-		 * everything else will go away after flow setup. We can append
-		 * it to tun_info and then point there.
-		 */
-		memcpy((tun_info + 1),
-		       TUN_METADATA_OPTS(&key, key.tun_opts_len), key.tun_opts_len);
-		tun_info->options = (tun_info + 1);
-	} else {
-		tun_info->options = NULL;
-	}
 
+	/* We need to store the options in the action itself since
+	 * everything else will go away after flow setup. We can append
+	 * it to tun_info and then point there.
+	 */
+	ip_tunnel_info_opts_set(tun_info,
+				TUN_METADATA_OPTS(&key, key.tun_opts_len),
+				key.tun_opts_len);
 	add_nested_action_end(*sfa, start);
 
 	return err;
@@ -2345,7 +2340,7 @@ static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb)
 
 		err = ipv4_tun_to_nlattr(skb, &tun_info->key,
 					 tun_info->options_len ?
-						tun_info->options : NULL,
+					     ip_tunnel_info_opts(tun_info) : NULL,
 					 tun_info->options_len);
 		if (err)
 			return err;
diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h
index 07878e22e783..6ca3f0baf449 100644
--- a/net/openvswitch/flow_netlink.h
+++ b/net/openvswitch/flow_netlink.h
@@ -56,7 +56,8 @@ int ovs_nla_get_match(struct net *, struct sw_flow_match *,
 		      const struct nlattr *key, const struct nlattr *mask,
 		      bool log);
 int ovs_nla_put_egress_tunnel_key(struct sk_buff *,
-				  const struct ip_tunnel_info *);
+				  const struct ip_tunnel_info *,
+				  const void *egress_tun_opts);
 
 bool ovs_nla_get_ufid(struct sw_flow_id *, const struct nlattr *, bool log);
 int ovs_nla_get_identifier(struct sw_flow_id *sfid, const struct nlattr *ufid,
diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c
index 24c56e56fedd..2735e9c4a3b8 100644
--- a/net/openvswitch/vport-geneve.c
+++ b/net/openvswitch/vport-geneve.c
@@ -53,15 +53,14 @@ static int geneve_get_options(const struct vport *vport,
 }
 
 static int geneve_get_egress_tun_info(struct vport *vport, struct sk_buff *skb,
-				      struct ip_tunnel_info *egress_tun_info)
+				      struct dp_upcall_info *upcall)
 {
 	struct geneve_port *geneve_port = geneve_vport(vport);
 	struct net *net = ovs_dp_get_net(vport->dp);
 	__be16 dport = htons(geneve_port->port_no);
 	__be16 sport = udp_flow_src_port(net, skb, 1, USHRT_MAX, true);
 
-	return ovs_tunnel_get_egress_info(egress_tun_info,
-					  ovs_dp_get_net(vport->dp),
+	return ovs_tunnel_get_egress_info(upcall, ovs_dp_get_net(vport->dp),
 					  skb, IPPROTO_UDP, sport, dport);
 }
 
diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c
index 36c39843607e..4d24481669c9 100644
--- a/net/openvswitch/vport-gre.c
+++ b/net/openvswitch/vport-gre.c
@@ -85,10 +85,9 @@ static struct vport *gre_create(const struct vport_parms *parms)
 }
 
 static int gre_get_egress_tun_info(struct vport *vport, struct sk_buff *skb,
-				   struct ip_tunnel_info *egress_tun_info)
+				   struct dp_upcall_info *upcall)
 {
-	return ovs_tunnel_get_egress_info(egress_tun_info,
-					  ovs_dp_get_net(vport->dp),
+	return ovs_tunnel_get_egress_info(upcall, ovs_dp_get_net(vport->dp),
 					  skb, IPPROTO_GRE, 0, 0);
 }
 
diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c
index ed7b23f443ec..c11413d5075f 100644
--- a/net/openvswitch/vport-vxlan.c
+++ b/net/openvswitch/vport-vxlan.c
@@ -147,7 +147,7 @@ static struct vport *vxlan_create(const struct vport_parms *parms)
 }
 
 static int vxlan_get_egress_tun_info(struct vport *vport, struct sk_buff *skb,
-				     struct ip_tunnel_info *egress_tun_info)
+				     struct dp_upcall_info *upcall)
 {
 	struct vxlan_dev *vxlan = netdev_priv(vport->dev);
 	struct net *net = ovs_dp_get_net(vport->dp);
@@ -159,7 +159,7 @@ static int vxlan_get_egress_tun_info(struct vport *vport, struct sk_buff *skb,
 	inet_get_local_port_range(net, &port_min, &port_max);
 	src_port = udp_flow_src_port(net, skb, 0, 0, true);
 
-	return ovs_tunnel_get_egress_info(egress_tun_info, net,
+	return ovs_tunnel_get_egress_info(upcall, net,
 					  skb, IPPROTO_UDP,
 					  src_port, dst_port);
 }
diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c
index 1679dea7c6bc..dc81dc619aa2 100644
--- a/net/openvswitch/vport.c
+++ b/net/openvswitch/vport.c
@@ -487,13 +487,14 @@ void ovs_vport_deferred_free(struct vport *vport)
 }
 EXPORT_SYMBOL_GPL(ovs_vport_deferred_free);
 
-int ovs_tunnel_get_egress_info(struct ip_tunnel_info *egress_tun_info,
+int ovs_tunnel_get_egress_info(struct dp_upcall_info *upcall,
 			       struct net *net,
 			       struct sk_buff *skb,
 			       u8 ipproto,
 			       __be16 tp_src,
 			       __be16 tp_dst)
 {
+	struct ip_tunnel_info *egress_tun_info = upcall->egress_tun_info;
 	const struct ip_tunnel_info *tun_info = skb_tunnel_info(skb);
 	const struct ip_tunnel_key *tun_key;
 	u32 skb_mark = skb->mark;
@@ -520,26 +521,26 @@ int ovs_tunnel_get_egress_info(struct ip_tunnel_info *egress_tun_info,
 	/* Generate egress_tun_info based on tun_info,
 	 * saddr, tp_src and tp_dst
 	 */
-	__ip_tunnel_info_init(egress_tun_info,
-			      fl.saddr, tun_key->u.ipv4.dst,
-			      tun_key->tos,
-			      tun_key->ttl,
-			      tp_src, tp_dst,
-			      tun_key->tun_id,
-			      tun_key->tun_flags,
-			      tun_info->options,
-			      tun_info->options_len);
-
+	ip_tunnel_key_init(&egress_tun_info->key,
+			   fl.saddr, tun_key->u.ipv4.dst,
+			   tun_key->tos,
+			   tun_key->ttl,
+			   tp_src, tp_dst,
+			   tun_key->tun_id,
+			   tun_key->tun_flags);
+	egress_tun_info->options_len = tun_info->options_len;
+	egress_tun_info->mode = tun_info->mode;
+	upcall->egress_tun_opts = ip_tunnel_info_opts(egress_tun_info);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(ovs_tunnel_get_egress_info);
 
 int ovs_vport_get_egress_tun_info(struct vport *vport, struct sk_buff *skb,
-				  struct ip_tunnel_info *info)
+				  struct dp_upcall_info *upcall)
 {
 	/* get_egress_tun_info() is only implemented on tunnel ports. */
 	if (unlikely(!vport->ops->get_egress_tun_info))
 		return -EINVAL;
 
-	return vport->ops->get_egress_tun_info(vport, skb, info);
+	return vport->ops->get_egress_tun_info(vport, skb, upcall);
 }
diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h
index 4b6f4a5296c3..a413f3ae6a7b 100644
--- a/net/openvswitch/vport.h
+++ b/net/openvswitch/vport.h
@@ -53,14 +53,15 @@ int ovs_vport_set_upcall_portids(struct vport *, const struct nlattr *pids);
 int ovs_vport_get_upcall_portids(const struct vport *, struct sk_buff *);
 u32 ovs_vport_find_upcall_portid(const struct vport *, struct sk_buff *);
 
-int ovs_tunnel_get_egress_info(struct ip_tunnel_info *egress_tun_info,
+int ovs_tunnel_get_egress_info(struct dp_upcall_info *upcall,
 			       struct net *net,
 			       struct sk_buff *,
 			       u8 ipproto,
 			       __be16 tp_src,
 			       __be16 tp_dst);
+
 int ovs_vport_get_egress_tun_info(struct vport *vport, struct sk_buff *skb,
-				  struct ip_tunnel_info *info);
+				  struct dp_upcall_info *upcall);
 
 /**
  * struct vport_portids - array of netlink portids of a vport.
@@ -154,7 +155,7 @@ struct vport_ops {
 
 	void (*send)(struct vport *, struct sk_buff *);
 	int (*get_egress_tun_info)(struct vport *, struct sk_buff *,
-				   struct ip_tunnel_info *);
+				   struct dp_upcall_info *upcall);
 
 	struct module *owner;
 	struct list_head list;
-- 
cgit v1.2.3


From c3a8d9474684d391b0afc3970d9b249add15ec07 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Mon, 31 Aug 2015 15:58:47 +0200
Subject: tcp: use dctcp if enabled on the route to the initiator

Currently, the following case doesn't use DCTCP, even if it should:
A responder has f.e. Cubic as system wide default, but for a specific
route to the initiating host, DCTCP is being set in RTAX_CC_ALGO. The
initiating host then uses DCTCP as congestion control, but since the
initiator sets ECT(0), tcp_ecn_create_request() doesn't set ecn_ok,
and we have to fall back to Reno after 3WHS completes.

We were thinking on how to solve this in a minimal, non-intrusive
way without bloating tcp_ecn_create_request() needlessly: lets cache
the CA ecn option flag in RTAX_FEATURES. In other words, when ECT(0)
is set on the SYN packet, set ecn_ok=1 iff route RTAX_FEATURES
contains the unexposed (internal-only) DST_FEATURE_ECN_CA. This allows
to only do a single metric feature lookup inside tcp_ecn_create_request().

Joint work with Florian Westphal.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dst.h        | 6 ++++++
 include/net/tcp.h        | 2 +-
 net/core/rtnetlink.c     | 6 ++++++
 net/ipv4/fib_semantics.c | 6 +++++-
 net/ipv4/tcp_cong.c      | 9 ++++++---
 net/ipv4/tcp_input.c     | 7 +++++--
 net/ipv6/route.c         | 9 +++++++--
 7 files changed, 36 insertions(+), 9 deletions(-)

(limited to 'include/net')

diff --git a/include/net/dst.h b/include/net/dst.h
index 4c4801645371..9261d928303d 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -207,6 +207,12 @@ static inline void dst_metric_set(struct dst_entry *dst, int metric, u32 val)
 		p[metric-1] = val;
 }
 
+/* Kernel-internal feature bits that are unallocated in user space. */
+#define DST_FEATURE_ECN_CA	(1 << 31)
+
+#define DST_FEATURE_MASK	(DST_FEATURE_ECN_CA)
+#define DST_FEATURE_ECN_MASK	(DST_FEATURE_ECN_CA | RTAX_FEATURE_ECN)
+
 static inline u32
 dst_feature(const struct dst_entry *dst, u32 feature)
 {
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 4a7b03947a38..0cab28cd43a9 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -888,7 +888,7 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked);
 extern struct tcp_congestion_ops tcp_reno;
 
 struct tcp_congestion_ops *tcp_ca_find_key(u32 key);
-u32 tcp_ca_get_key_by_name(const char *name);
+u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca);
 #ifdef CONFIG_INET
 char *tcp_ca_get_name_by_key(u32 key, char *buffer);
 #else
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 788ceed39463..a466821d1441 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -678,6 +678,12 @@ int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics)
 					continue;
 				if (nla_put_string(skb, i + 1, name))
 					goto nla_put_failure;
+			} else if (i == RTAX_FEATURES - 1) {
+				u32 user_features = metrics[i] & RTAX_FEATURE_MASK;
+
+				BUILD_BUG_ON(RTAX_FEATURE_MASK & DST_FEATURE_MASK);
+				if (nla_put_u32(skb, i + 1, user_features))
+					goto nla_put_failure;
 			} else {
 				if (nla_put_u32(skb, i + 1, metrics[i]))
 					goto nla_put_failure;
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 115a08e70d43..992a9597daf8 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -879,6 +879,7 @@ static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc)
 static int
 fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg)
 {
+	bool ecn_ca = false;
 	struct nlattr *nla;
 	int remaining;
 
@@ -898,7 +899,7 @@ fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg)
 			char tmp[TCP_CA_NAME_MAX];
 
 			nla_strlcpy(tmp, nla, sizeof(tmp));
-			val = tcp_ca_get_key_by_name(tmp);
+			val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
 			if (val == TCP_CA_UNSPEC)
 				return -EINVAL;
 		} else {
@@ -913,6 +914,9 @@ fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg)
 		fi->fib_metrics[type - 1] = val;
 	}
 
+	if (ecn_ca)
+		fi->fib_metrics[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
+
 	return 0;
 }
 
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index a2ed23c595cf..93c4dc3ab23f 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -114,16 +114,19 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
 }
 EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);
 
-u32 tcp_ca_get_key_by_name(const char *name)
+u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca)
 {
 	const struct tcp_congestion_ops *ca;
-	u32 key;
+	u32 key = TCP_CA_UNSPEC;
 
 	might_sleep();
 
 	rcu_read_lock();
 	ca = __tcp_ca_find_autoload(name);
-	key = ca ? ca->key : TCP_CA_UNSPEC;
+	if (ca) {
+		key = ca->key;
+		*ecn_ca = ca->flags & TCP_CONG_NEEDS_ECN;
+	}
 	rcu_read_unlock();
 
 	return key;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index dc08e2352665..a8f515bb19c4 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6003,14 +6003,17 @@ static void tcp_ecn_create_request(struct request_sock *req,
 	const struct net *net = sock_net(listen_sk);
 	bool th_ecn = th->ece && th->cwr;
 	bool ect, ecn_ok;
+	u32 ecn_ok_dst;
 
 	if (!th_ecn)
 		return;
 
 	ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield);
-	ecn_ok = net->ipv4.sysctl_tcp_ecn || dst_feature(dst, RTAX_FEATURE_ECN);
+	ecn_ok_dst = dst_feature(dst, DST_FEATURE_ECN_MASK);
+	ecn_ok = net->ipv4.sysctl_tcp_ecn || ecn_ok_dst;
 
-	if ((!ect && ecn_ok) || tcp_ca_needs_ecn(listen_sk))
+	if ((!ect && ecn_ok) || tcp_ca_needs_ecn(listen_sk) ||
+	    (ecn_ok_dst & DST_FEATURE_ECN_CA))
 		inet_rsk(req)->ecn_ok = 1;
 }
 
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 8771530df45e..f45cac6f8356 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1698,6 +1698,7 @@ out:
 static int ip6_convert_metrics(struct mx6_config *mxc,
 			       const struct fib6_config *cfg)
 {
+	bool ecn_ca = false;
 	struct nlattr *nla;
 	int remaining;
 	u32 *mp;
@@ -1722,7 +1723,7 @@ static int ip6_convert_metrics(struct mx6_config *mxc,
 			char tmp[TCP_CA_NAME_MAX];
 
 			nla_strlcpy(tmp, nla, sizeof(tmp));
-			val = tcp_ca_get_key_by_name(tmp);
+			val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
 			if (val == TCP_CA_UNSPEC)
 				goto err;
 		} else {
@@ -1735,8 +1736,12 @@ static int ip6_convert_metrics(struct mx6_config *mxc,
 		__set_bit(type - 1, mxc->mx_valid);
 	}
 
-	mxc->mx = mp;
+	if (ecn_ca) {
+		__set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
+		mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
+	}
 
+	mxc->mx = mp;
 	return 0;
  err:
 	kfree(mp);
-- 
cgit v1.2.3


From c42858eaf4926eb2f44f3e26731b276ab966ac28 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 31 Aug 2015 13:57:34 -0700
Subject: gro_cells: remove spinlock protecting receive queues

As David pointed out, spinlock are no longer needed
to protect the per cpu queues used in gro cells infrastructure.

Also use new napi_complete_done() API so that gro_flush_timeout
tweaks have an effect.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/gro_cells.h | 18 +++++-------------
 1 file changed, 5 insertions(+), 13 deletions(-)

(limited to 'include/net')

diff --git a/include/net/gro_cells.h b/include/net/gro_cells.h
index 0f712c0bc0bf..cf6c74550baa 100644
--- a/include/net/gro_cells.h
+++ b/include/net/gro_cells.h
@@ -32,37 +32,28 @@ static inline void gro_cells_receive(struct gro_cells *gcells, struct sk_buff *s
 		return;
 	}
 
-	/* We run in BH context */
-	spin_lock(&cell->napi_skbs.lock);
-
 	__skb_queue_tail(&cell->napi_skbs, skb);
 	if (skb_queue_len(&cell->napi_skbs) == 1)
 		napi_schedule(&cell->napi);
-
-	spin_unlock(&cell->napi_skbs.lock);
 }
 
-/* called unser BH context */
+/* called under BH context */
 static inline int gro_cell_poll(struct napi_struct *napi, int budget)
 {
 	struct gro_cell *cell = container_of(napi, struct gro_cell, napi);
 	struct sk_buff *skb;
 	int work_done = 0;
 
-	spin_lock(&cell->napi_skbs.lock);
 	while (work_done < budget) {
 		skb = __skb_dequeue(&cell->napi_skbs);
 		if (!skb)
 			break;
-		spin_unlock(&cell->napi_skbs.lock);
 		napi_gro_receive(napi, skb);
 		work_done++;
-		spin_lock(&cell->napi_skbs.lock);
 	}
 
 	if (work_done < budget)
-		napi_complete(napi);
-	spin_unlock(&cell->napi_skbs.lock);
+		napi_complete_done(napi, work_done);
 	return work_done;
 }
 
@@ -77,7 +68,7 @@ static inline int gro_cells_init(struct gro_cells *gcells, struct net_device *de
 	for_each_possible_cpu(i) {
 		struct gro_cell *cell = per_cpu_ptr(gcells->cells, i);
 
-		skb_queue_head_init(&cell->napi_skbs);
+		__skb_queue_head_init(&cell->napi_skbs);
 		netif_napi_add(dev, &cell->napi, gro_cell_poll, 64);
 		napi_enable(&cell->napi);
 	}
@@ -92,8 +83,9 @@ static inline void gro_cells_destroy(struct gro_cells *gcells)
 		return;
 	for_each_possible_cpu(i) {
 		struct gro_cell *cell = per_cpu_ptr(gcells->cells, i);
+
 		netif_napi_del(&cell->napi);
-		skb_queue_purge(&cell->napi_skbs);
+		__skb_queue_purge(&cell->napi_skbs);
 	}
 	free_percpu(gcells->cells);
 	gcells->cells = NULL;
-- 
cgit v1.2.3


From 63b6c13dbb7d3e36f031629f7e4e86dacfcab8cf Mon Sep 17 00:00:00 2001
From: Pravin B Shelar <pshelar@nicira.com>
Date: Mon, 31 Aug 2015 20:05:57 -0700
Subject: tun_dst: Remove opts_size

opts_size is only written and never read. Following patch
removes this unused variable.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dst_metadata.h | 1 -
 net/core/dst.c             | 1 -
 2 files changed, 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h
index 547ab8241593..af9d5382f6cb 100644
--- a/include/net/dst_metadata.h
+++ b/include/net/dst_metadata.h
@@ -7,7 +7,6 @@
 
 struct metadata_dst {
 	struct dst_entry		dst;
-	size_t				opts_len;
 	union {
 		struct ip_tunnel_info	tun_info;
 	} u;
diff --git a/net/core/dst.c b/net/core/dst.c
index 477035ed7903..0771c8cb9307 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -378,7 +378,6 @@ static void __metadata_dst_init(struct metadata_dst *md_dst, u8 optslen)
 	dst->output = dst_md_discard_sk;
 
 	memset(dst + 1, 0, sizeof(*md_dst) + optslen - sizeof(*dst));
-	md_dst->opts_len = optslen;
 }
 
 struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags)
-- 
cgit v1.2.3


From 9cf94eab8b309e8bcc78b41dd1561c75b537dd0b Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Mon, 31 Aug 2015 19:11:02 +0200
Subject: netfilter: conntrack: use nf_ct_tmpl_free in CT/synproxy error paths

Commit 0838aa7fcfcd ("netfilter: fix netns dependencies with conntrack
templates") migrated templates to the new allocator api, but forgot to
update error paths for them in CT and synproxy to use nf_ct_tmpl_free()
instead of nf_conntrack_free().

Due to that, memory is being freed into the wrong kmemcache, but also
we drop the per net reference count of ct objects causing an imbalance.

In Brad's case, this leads to a wrap-around of net->ct.count and thus
lets __nf_conntrack_alloc() refuse to create a new ct object:

  [   10.340913] xt_addrtype: ipv6 does not support BROADCAST matching
  [   10.810168] nf_conntrack: table full, dropping packet
  [   11.917416] r8169 0000:07:00.0 eth0: link up
  [   11.917438] IPv6: ADDRCONF(NETDEV_CHANGE): eth0: link becomes ready
  [   12.815902] nf_conntrack: table full, dropping packet
  [   15.688561] nf_conntrack: table full, dropping packet
  [   15.689365] nf_conntrack: table full, dropping packet
  [   15.690169] nf_conntrack: table full, dropping packet
  [   15.690967] nf_conntrack: table full, dropping packet
  [...]

With slab debugging, it also reports the wrong kmemcache (kmalloc-512 vs.
nf_conntrack_ffffffff81ce75c0) and reports poison overwrites, etc. Thus,
to fix the problem, export and use nf_ct_tmpl_free() instead.

Fixes: 0838aa7fcfcd ("netfilter: fix netns dependencies with conntrack templates")
Reported-by: Brad Jackson <bjackson0971@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack.h | 1 +
 net/netfilter/nf_conntrack_core.c    | 3 ++-
 net/netfilter/nf_synproxy_core.c     | 2 +-
 net/netfilter/xt_CT.c                | 2 +-
 4 files changed, 5 insertions(+), 3 deletions(-)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index 37cd3911d5c5..4023c4ce260f 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -292,6 +292,7 @@ extern unsigned int nf_conntrack_hash_rnd;
 void init_nf_conntrack_hash_rnd(void);
 
 struct nf_conn *nf_ct_tmpl_alloc(struct net *net, u16 zone, gfp_t flags);
+void nf_ct_tmpl_free(struct nf_conn *tmpl);
 
 #define NF_CT_STAT_INC(net, count)	  __this_cpu_inc((net)->ct.stat->count)
 #define NF_CT_STAT_INC_ATOMIC(net, count) this_cpu_inc((net)->ct.stat->count)
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 3c20d02aee73..0625a42df108 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -320,12 +320,13 @@ out_free:
 }
 EXPORT_SYMBOL_GPL(nf_ct_tmpl_alloc);
 
-static void nf_ct_tmpl_free(struct nf_conn *tmpl)
+void nf_ct_tmpl_free(struct nf_conn *tmpl)
 {
 	nf_ct_ext_destroy(tmpl);
 	nf_ct_ext_free(tmpl);
 	kfree(tmpl);
 }
+EXPORT_SYMBOL_GPL(nf_ct_tmpl_free);
 
 static void
 destroy_conntrack(struct nf_conntrack *nfct)
diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c
index d7f168527903..d6ee8f8b19b6 100644
--- a/net/netfilter/nf_synproxy_core.c
+++ b/net/netfilter/nf_synproxy_core.c
@@ -378,7 +378,7 @@ static int __net_init synproxy_net_init(struct net *net)
 err3:
 	free_percpu(snet->stats);
 err2:
-	nf_conntrack_free(ct);
+	nf_ct_tmpl_free(ct);
 err1:
 	return err;
 }
diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c
index 43ddeee404e9..f3377ce1ff18 100644
--- a/net/netfilter/xt_CT.c
+++ b/net/netfilter/xt_CT.c
@@ -233,7 +233,7 @@ out:
 	return 0;
 
 err3:
-	nf_conntrack_free(ct);
+	nf_ct_tmpl_free(ct);
 err2:
 	nf_ct_l3proto_module_put(par->family);
 err1:
-- 
cgit v1.2.3


From 9b8ff51822893e743eee09350c1928daa3ef503f Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Tue, 1 Sep 2015 14:26:35 -0600
Subject: net: Make table id type u32

A number of VRF patches used 'int' for table id. It should be u32 to be
consistent with the rest of the stack.

Fixes:
4e3c89920cd3a ("net: Introduce VRF related flags and helpers")
15be405eb2ea9 ("net: Add inet_addr lookup by table")
30bbaa1950055 ("net: Fix up inet_addr_type checks")
021dd3b8a142d ("net: Add routes to the table associated with the device")
dc028da54ed35 ("inet: Move VRF table lookup to inlined function")
f6d3c19274c74 ("net: FIB tracepoints")

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Reviewed-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/route.h        |  2 +-
 include/net/vrf.h          | 24 ++++++++++++------------
 include/trace/events/fib.h |  6 +++---
 net/ipv4/af_inet.c         |  2 +-
 net/ipv4/fib_frontend.c    | 10 +++++-----
 net/ipv4/fib_semantics.c   |  2 +-
 6 files changed, 23 insertions(+), 23 deletions(-)

(limited to 'include/net')

diff --git a/include/net/route.h b/include/net/route.h
index 395d79bb556c..cc61cb95f059 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -188,7 +188,7 @@ void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk);
 void ip_rt_send_redirect(struct sk_buff *skb);
 
 unsigned int inet_addr_type(struct net *net, __be32 addr);
-unsigned int inet_addr_type_table(struct net *net, __be32 addr, int tb_id);
+unsigned int inet_addr_type_table(struct net *net, __be32 addr, u32 tb_id);
 unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
 				__be32 addr);
 unsigned int inet_addr_type_dev_table(struct net *net,
diff --git a/include/net/vrf.h b/include/net/vrf.h
index 5bfb16237fd7..593e6094ddd4 100644
--- a/include/net/vrf.h
+++ b/include/net/vrf.h
@@ -66,9 +66,9 @@ static inline int vrf_master_ifindex(const struct net_device *dev)
 }
 
 /* called with rcu_read_lock */
-static inline int vrf_dev_table_rcu(const struct net_device *dev)
+static inline u32 vrf_dev_table_rcu(const struct net_device *dev)
 {
-	int tb_id = 0;
+	u32 tb_id = 0;
 
 	if (dev) {
 		struct net_vrf_dev *vrf_ptr;
@@ -80,9 +80,9 @@ static inline int vrf_dev_table_rcu(const struct net_device *dev)
 	return tb_id;
 }
 
-static inline int vrf_dev_table(const struct net_device *dev)
+static inline u32 vrf_dev_table(const struct net_device *dev)
 {
-	int tb_id;
+	u32 tb_id;
 
 	rcu_read_lock();
 	tb_id = vrf_dev_table_rcu(dev);
@@ -91,10 +91,10 @@ static inline int vrf_dev_table(const struct net_device *dev)
 	return tb_id;
 }
 
-static inline int vrf_dev_table_ifindex(struct net *net, int ifindex)
+static inline u32 vrf_dev_table_ifindex(struct net *net, int ifindex)
 {
 	struct net_device *dev;
-	int tb_id = 0;
+	u32 tb_id = 0;
 
 	if (!ifindex)
 		return 0;
@@ -111,9 +111,9 @@ static inline int vrf_dev_table_ifindex(struct net *net, int ifindex)
 }
 
 /* called with rtnl */
-static inline int vrf_dev_table_rtnl(const struct net_device *dev)
+static inline u32 vrf_dev_table_rtnl(const struct net_device *dev)
 {
-	int tb_id = 0;
+	u32 tb_id = 0;
 
 	if (dev) {
 		struct net_vrf_dev *vrf_ptr;
@@ -149,22 +149,22 @@ static inline int vrf_master_ifindex(const struct net_device *dev)
 	return 0;
 }
 
-static inline int vrf_dev_table_rcu(const struct net_device *dev)
+static inline u32 vrf_dev_table_rcu(const struct net_device *dev)
 {
 	return 0;
 }
 
-static inline int vrf_dev_table(const struct net_device *dev)
+static inline u32 vrf_dev_table(const struct net_device *dev)
 {
 	return 0;
 }
 
-static inline int vrf_dev_table_ifindex(struct net *net, int ifindex)
+static inline u32 vrf_dev_table_ifindex(struct net *net, int ifindex)
 {
 	return 0;
 }
 
-static inline int vrf_dev_table_rtnl(const struct net_device *dev)
+static inline u32 vrf_dev_table_rtnl(const struct net_device *dev)
 {
 	return 0;
 }
diff --git a/include/trace/events/fib.h b/include/trace/events/fib.h
index acd1d22571a2..833cfcb6750d 100644
--- a/include/trace/events/fib.h
+++ b/include/trace/events/fib.h
@@ -11,12 +11,12 @@
 
 TRACE_EVENT(fib_table_lookup,
 
-	TP_PROTO(int tb_id, const struct flowi4 *flp),
+	TP_PROTO(u32 tb_id, const struct flowi4 *flp),
 
 	TP_ARGS(tb_id, flp),
 
 	TP_STRUCT__entry(
-		__field(	int,	tb_id		)
+		__field(	u32,	tb_id		)
 		__field(	int,	oif		)
 		__field(	int,	iif		)
 		__field(	__u8,	tos		)
@@ -43,7 +43,7 @@ TRACE_EVENT(fib_table_lookup,
 		*p32 = flp->daddr;
 	),
 
-	TP_printk("table %d oif %d iif %d src %pI4 dst %pI4 tos %d scope %d flags %x",
+	TP_printk("table %u oif %d iif %d src %pI4 dst %pI4 tos %d scope %d flags %x",
 		  __entry->tb_id, __entry->oif, __entry->iif,
 		  __entry->src, __entry->dst, __entry->tos, __entry->scope,
 		  __entry->flags)
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 96773a2f95a7..1d0c3adb6f34 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -428,7 +428,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 	struct net *net = sock_net(sk);
 	unsigned short snum;
 	int chk_addr_ret;
-	int tb_id = RT_TABLE_LOCAL;
+	u32 tb_id = RT_TABLE_LOCAL;
 	int err;
 
 	/* If the socket has its own bind function then use it. (RAW) */
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 4036c94dfbe1..6fcbd215cdbc 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -213,7 +213,7 @@ void fib_flush_external(struct net *net)
  */
 static inline unsigned int __inet_dev_addr_type(struct net *net,
 						const struct net_device *dev,
-						__be32 addr, int tb_id)
+						__be32 addr, u32 tb_id)
 {
 	struct flowi4		fl4 = { .daddr = addr };
 	struct fib_result	res;
@@ -240,7 +240,7 @@ static inline unsigned int __inet_dev_addr_type(struct net *net,
 	return ret;
 }
 
-unsigned int inet_addr_type_table(struct net *net, __be32 addr, int tb_id)
+unsigned int inet_addr_type_table(struct net *net, __be32 addr, u32 tb_id)
 {
 	return __inet_dev_addr_type(net, NULL, addr, tb_id);
 }
@@ -255,7 +255,7 @@ EXPORT_SYMBOL(inet_addr_type);
 unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
 				__be32 addr)
 {
-	int rt_table = vrf_dev_table(dev) ? : RT_TABLE_LOCAL;
+	u32 rt_table = vrf_dev_table(dev) ? : RT_TABLE_LOCAL;
 
 	return __inet_dev_addr_type(net, dev, addr, rt_table);
 }
@@ -268,7 +268,7 @@ unsigned int inet_addr_type_dev_table(struct net *net,
 				      const struct net_device *dev,
 				      __be32 addr)
 {
-	int rt_table = vrf_dev_table(dev) ? : RT_TABLE_LOCAL;
+	u32 rt_table = vrf_dev_table(dev) ? : RT_TABLE_LOCAL;
 
 	return __inet_dev_addr_type(net, NULL, addr, rt_table);
 }
@@ -803,7 +803,7 @@ out:
 static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa)
 {
 	struct net *net = dev_net(ifa->ifa_dev->dev);
-	int tb_id = vrf_dev_table_rtnl(ifa->ifa_dev->dev);
+	u32 tb_id = vrf_dev_table_rtnl(ifa->ifa_dev->dev);
 	struct fib_table *tb;
 	struct fib_config cfg = {
 		.fc_protocol = RTPROT_KERNEL,
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 992a9597daf8..064bd3caaa4f 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -863,7 +863,7 @@ static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc)
 {
 	if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst ||
 	    fib_prefsrc != cfg->fc_dst) {
-		int tb_id = cfg->fc_table;
+		u32 tb_id = cfg->fc_table;
 
 		if (tb_id == RT_TABLE_MAIN)
 			tb_id = RT_TABLE_LOCAL;
-- 
cgit v1.2.3


From e5276937ae6e654a811345f0716266f12e77bede Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Tue, 1 Sep 2015 09:24:23 -0700
Subject: flow_dissector: Move skb related functions to skbuff.h

Move the flow dissector functions that are specific to skbuffs into
skbuff.h out of flow_dissector.h. This makes flow_dissector.h have
no dependencies on skbuff.h.

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h       | 47 +++++++++++++++++++++++++++++++++++++++++
 include/net/flow_dissector.h | 50 --------------------------------------------
 2 files changed, 47 insertions(+), 50 deletions(-)

(limited to 'include/net')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 989307f991db..8a697c673b58 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -945,6 +945,53 @@ skb_set_hash(struct sk_buff *skb, __u32 hash, enum pkt_hash_types type)
 	skb->hash = hash;
 }
 
+void __skb_get_hash(struct sk_buff *skb);
+u32 skb_get_poff(const struct sk_buff *skb);
+u32 __skb_get_poff(const struct sk_buff *skb, void *data,
+		   const struct flow_keys *keys, int hlen);
+__be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
+			    void *data, int hlen_proto);
+
+static inline __be32 skb_flow_get_ports(const struct sk_buff *skb,
+					int thoff, u8 ip_proto)
+{
+	return __skb_flow_get_ports(skb, thoff, ip_proto, NULL, 0);
+}
+
+void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
+			     const struct flow_dissector_key *key,
+			     unsigned int key_count);
+
+bool __skb_flow_dissect(const struct sk_buff *skb,
+			struct flow_dissector *flow_dissector,
+			void *target_container,
+			void *data, __be16 proto, int nhoff, int hlen);
+
+static inline bool skb_flow_dissect(const struct sk_buff *skb,
+				    struct flow_dissector *flow_dissector,
+				    void *target_container)
+{
+	return __skb_flow_dissect(skb, flow_dissector, target_container,
+				  NULL, 0, 0, 0);
+}
+
+static inline bool skb_flow_dissect_flow_keys(const struct sk_buff *skb,
+					      struct flow_keys *flow)
+{
+	memset(flow, 0, sizeof(*flow));
+	return __skb_flow_dissect(skb, &flow_keys_dissector, flow,
+				  NULL, 0, 0, 0);
+}
+
+static inline bool skb_flow_dissect_flow_keys_buf(struct flow_keys *flow,
+						  void *data, __be16 proto,
+						  int nhoff, int hlen)
+{
+	memset(flow, 0, sizeof(*flow));
+	return __skb_flow_dissect(NULL, &flow_keys_buf_dissector, flow,
+				  data, proto, nhoff, hlen);
+}
+
 static inline __u32 skb_get_hash(struct sk_buff *skb)
 {
 	if (!skb->l4_hash && !skb->sw_hash)
diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index 1a8c22419936..6777a84c6f94 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -2,7 +2,6 @@
 #define _NET_FLOW_DISSECTOR_H
 
 #include <linux/types.h>
-#include <linux/skbuff.h>
 #include <linux/in6.h>
 #include <uapi/linux/if_ether.h>
 
@@ -134,23 +133,6 @@ struct flow_dissector {
 	unsigned short int offset[FLOW_DISSECTOR_KEY_MAX];
 };
 
-void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
-			     const struct flow_dissector_key *key,
-			     unsigned int key_count);
-
-bool __skb_flow_dissect(const struct sk_buff *skb,
-			struct flow_dissector *flow_dissector,
-			void *target_container,
-			void *data, __be16 proto, int nhoff, int hlen);
-
-static inline bool skb_flow_dissect(const struct sk_buff *skb,
-				    struct flow_dissector *flow_dissector,
-				    void *target_container)
-{
-	return __skb_flow_dissect(skb, flow_dissector, target_container,
-				  NULL, 0, 0, 0);
-}
-
 struct flow_keys {
 	struct flow_dissector_key_control control;
 #define FLOW_KEYS_HASH_START_FIELD basic
@@ -170,38 +152,6 @@ __be32 flow_get_u32_dst(const struct flow_keys *flow);
 extern struct flow_dissector flow_keys_dissector;
 extern struct flow_dissector flow_keys_buf_dissector;
 
-static inline bool skb_flow_dissect_flow_keys(const struct sk_buff *skb,
-					      struct flow_keys *flow)
-{
-	memset(flow, 0, sizeof(*flow));
-	return __skb_flow_dissect(skb, &flow_keys_dissector, flow,
-				  NULL, 0, 0, 0);
-}
-
-static inline bool skb_flow_dissect_flow_keys_buf(struct flow_keys *flow,
-						  void *data, __be16 proto,
-						  int nhoff, int hlen)
-{
-	memset(flow, 0, sizeof(*flow));
-	return __skb_flow_dissect(NULL, &flow_keys_buf_dissector, flow,
-				  data, proto, nhoff, hlen);
-}
-
-__be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
-			    void *data, int hlen_proto);
-
-static inline __be32 skb_flow_get_ports(const struct sk_buff *skb,
-					int thoff, u8 ip_proto)
-{
-	return __skb_flow_get_ports(skb, thoff, ip_proto, NULL, 0);
-}
-
-u32 flow_hash_from_keys(struct flow_keys *keys);
-void __skb_get_hash(struct sk_buff *skb);
-u32 skb_get_poff(const struct sk_buff *skb);
-u32 __skb_get_poff(const struct sk_buff *skb, void *data,
-		   const struct flow_keys *keys, int hlen);
-
 /* struct flow_keys_digest:
  *
  * This structure is used to hold a digest of the full flow keys. This is a
-- 
cgit v1.2.3


From bcc83839ffdb063dd2b0370cd85c4f825761fc59 Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Tue, 1 Sep 2015 09:24:24 -0700
Subject: skbuff: Make __skb_set_sw_hash a general function

Move __skb_set_sw_hash to skbuff.h and add __skb_set_hash which is
a common method (between __skb_set_sw_hash and skb_set_hash) to set
the hash in an skbuff.

Also, move skb_clear_hash to be closer to __skb_set_hash.

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h       | 45 ++++++++++++++++++++++++++++----------------
 include/net/flow_dissector.h |  5 +++++
 net/core/flow_dissector.c    | 18 ++++++------------
 3 files changed, 40 insertions(+), 28 deletions(-)

(limited to 'include/net')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 8a697c673b58..5d2c812e725b 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -937,14 +937,40 @@ enum pkt_hash_types {
 	PKT_HASH_TYPE_L4,	/* Input: src_IP, dst_IP, src_port, dst_port */
 };
 
-static inline void
-skb_set_hash(struct sk_buff *skb, __u32 hash, enum pkt_hash_types type)
+static inline void skb_clear_hash(struct sk_buff *skb)
 {
-	skb->l4_hash = (type == PKT_HASH_TYPE_L4);
+	skb->hash = 0;
 	skb->sw_hash = 0;
+	skb->l4_hash = 0;
+}
+
+static inline void skb_clear_hash_if_not_l4(struct sk_buff *skb)
+{
+	if (!skb->l4_hash)
+		skb_clear_hash(skb);
+}
+
+static inline void
+__skb_set_hash(struct sk_buff *skb, __u32 hash, bool is_sw, bool is_l4)
+{
+	skb->l4_hash = is_l4;
+	skb->sw_hash = is_sw;
 	skb->hash = hash;
 }
 
+static inline void
+skb_set_hash(struct sk_buff *skb, __u32 hash, enum pkt_hash_types type)
+{
+	/* Used by drivers to set hash from HW */
+	__skb_set_hash(skb, hash, false, type == PKT_HASH_TYPE_L4);
+}
+
+static inline void
+__skb_set_sw_hash(struct sk_buff *skb, __u32 hash, bool is_l4)
+{
+	__skb_set_hash(skb, hash, true, is_l4);
+}
+
 void __skb_get_hash(struct sk_buff *skb);
 u32 skb_get_poff(const struct sk_buff *skb);
 u32 __skb_get_poff(const struct sk_buff *skb, void *data,
@@ -1027,19 +1053,6 @@ static inline __u32 skb_get_hash_raw(const struct sk_buff *skb)
 	return skb->hash;
 }
 
-static inline void skb_clear_hash(struct sk_buff *skb)
-{
-	skb->hash = 0;
-	skb->sw_hash = 0;
-	skb->l4_hash = 0;
-}
-
-static inline void skb_clear_hash_if_not_l4(struct sk_buff *skb)
-{
-	if (!skb->l4_hash)
-		skb_clear_hash(skb);
-}
-
 static inline void skb_copy_hash(struct sk_buff *to, const struct sk_buff *from)
 {
 	to->hash = from->hash;
diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index 6777a84c6f94..af76c496f7db 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -167,4 +167,9 @@ struct flow_keys_digest {
 void make_flow_keys_digest(struct flow_keys_digest *digest,
 			   const struct flow_keys *flow);
 
+static inline bool flow_keys_have_l4(struct flow_keys *keys)
+{
+	return (keys->ports.ports || keys->tags.flow_label);
+}
+
 #endif
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 11e6540fa386..151b6e48b81f 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -590,15 +590,6 @@ void make_flow_keys_digest(struct flow_keys_digest *digest,
 }
 EXPORT_SYMBOL(make_flow_keys_digest);
 
-static inline void __skb_set_sw_hash(struct sk_buff *skb, u32 hash,
-				     struct flow_keys *keys)
-{
-	if (keys->ports.ports)
-		skb->l4_hash = 1;
-	skb->sw_hash = 1;
-	skb->hash = hash;
-}
-
 /**
  * __skb_get_hash: calculate a flow hash
  * @skb: sk_buff to calculate flow hash from
@@ -619,7 +610,8 @@ void __skb_get_hash(struct sk_buff *skb)
 	if (!hash)
 		return;
 
-	__skb_set_sw_hash(skb, hash, &keys);
+	__skb_set_sw_hash(skb, hash,
+			  flow_keys_have_l4(&keys));
 }
 EXPORT_SYMBOL(__skb_get_hash);
 
@@ -648,7 +640,8 @@ __u32 __skb_get_hash_flowi6(struct sk_buff *skb, struct flowi6 *fl6)
 	keys.tags.flow_label = (__force u32)fl6->flowlabel;
 	keys.basic.ip_proto = fl6->flowi6_proto;
 
-	__skb_set_sw_hash(skb, flow_hash_from_keys(&keys), &keys);
+	__skb_set_sw_hash(skb, flow_hash_from_keys(&keys),
+			  flow_keys_have_l4(&keys));
 
 	return skb->hash;
 }
@@ -668,7 +661,8 @@ __u32 __skb_get_hash_flowi4(struct sk_buff *skb, struct flowi4 *fl4)
 	keys.keyid.keyid = fl4->fl4_gre_key;
 	keys.basic.ip_proto = fl4->flowi4_proto;
 
-	__skb_set_sw_hash(skb, flow_hash_from_keys(&keys), &keys);
+	__skb_set_sw_hash(skb, flow_hash_from_keys(&keys),
+			  flow_keys_have_l4(&keys));
 
 	return skb->hash;
 }
-- 
cgit v1.2.3


From c6cc1ca7f4d70cbb3ea3a5ca163c5dabaf155cdb Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Tue, 1 Sep 2015 09:24:25 -0700
Subject: flowi: Abstract out functions to get flow hash based on flowi

Create __get_hash_from_flowi6 and __get_hash_from_flowi4 to get the
flow keys and hash based on flowi structures. These are called by
__skb_get_hash_flowi6 and __skb_get_hash_flowi4. Also, created
get_hash_from_flowi6 and get_hash_from_flowi4 which can be called
when just the hash value for a flowi is needed.

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h       | 16 ++++++++++++----
 include/net/flow.h           | 19 +++++++++++++++++++
 include/net/flow_dissector.h |  2 ++
 net/core/flow.c              | 36 ++++++++++++++++++++++++++++++++++++
 4 files changed, 69 insertions(+), 4 deletions(-)

(limited to 'include/net')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 5d2c812e725b..bbe41bccfc5f 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1030,8 +1030,12 @@ __u32 __skb_get_hash_flowi6(struct sk_buff *skb, struct flowi6 *fl6);
 
 static inline __u32 skb_get_hash_flowi6(struct sk_buff *skb, struct flowi6 *fl6)
 {
-	if (!skb->l4_hash && !skb->sw_hash)
-		__skb_get_hash_flowi6(skb, fl6);
+	if (!skb->l4_hash && !skb->sw_hash) {
+		struct flow_keys keys;
+
+		__skb_set_sw_hash(skb, __get_hash_from_flowi6(fl6, &keys),
+				  flow_keys_have_l4(&keys));
+	}
 
 	return skb->hash;
 }
@@ -1040,8 +1044,12 @@ __u32 __skb_get_hash_flowi4(struct sk_buff *skb, struct flowi4 *fl);
 
 static inline __u32 skb_get_hash_flowi4(struct sk_buff *skb, struct flowi4 *fl4)
 {
-	if (!skb->l4_hash && !skb->sw_hash)
-		__skb_get_hash_flowi4(skb, fl4);
+	if (!skb->l4_hash && !skb->sw_hash) {
+		struct flow_keys keys;
+
+		__skb_set_sw_hash(skb, __get_hash_from_flowi4(fl4, &keys),
+				  flow_keys_have_l4(&keys));
+	}
 
 	return skb->hash;
 }
diff --git a/include/net/flow.h b/include/net/flow.h
index 9e0297c4c11d..dafe97c3c048 100644
--- a/include/net/flow.h
+++ b/include/net/flow.h
@@ -10,6 +10,7 @@
 #include <linux/socket.h>
 #include <linux/in6.h>
 #include <linux/atomic.h>
+#include <net/flow_dissector.h>
 
 /*
  * ifindex generation is per-net namespace, and loopback is
@@ -243,4 +244,22 @@ void flow_cache_flush(struct net *net);
 void flow_cache_flush_deferred(struct net *net);
 extern atomic_t flow_cache_genid;
 
+__u32 __get_hash_from_flowi6(struct flowi6 *fl6, struct flow_keys *keys);
+
+static inline __u32 get_hash_from_flowi6(struct flowi6 *fl6)
+{
+	struct flow_keys keys;
+
+	return __get_hash_from_flowi6(fl6, &keys);
+}
+
+__u32 __get_hash_from_flowi4(struct flowi4 *fl4, struct flow_keys *keys);
+
+static inline __u32 get_hash_from_flowi4(struct flowi4 *fl4)
+{
+	struct flow_keys keys;
+
+	return __get_hash_from_flowi4(fl4, &keys);
+}
+
 #endif
diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index af76c496f7db..74fe160f0b05 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -172,4 +172,6 @@ static inline bool flow_keys_have_l4(struct flow_keys *keys)
 	return (keys->ports.ports || keys->tags.flow_label);
 }
 
+u32 flow_hash_from_keys(struct flow_keys *keys);
+
 #endif
diff --git a/net/core/flow.c b/net/core/flow.c
index 1033725be40b..61930bb0eb59 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -22,6 +22,7 @@
 #include <linux/cpumask.h>
 #include <linux/mutex.h>
 #include <net/flow.h>
+#include <net/flow_dissector.h>
 #include <linux/atomic.h>
 #include <linux/security.h>
 #include <net/net_namespace.h>
@@ -509,3 +510,38 @@ void flow_cache_fini(struct net *net)
 	fc->percpu = NULL;
 }
 EXPORT_SYMBOL(flow_cache_fini);
+
+__u32 __get_hash_from_flowi6(struct flowi6 *fl6, struct flow_keys *keys)
+{
+	memset(keys, 0, sizeof(*keys));
+
+	memcpy(&keys->addrs.v6addrs.src, &fl6->saddr,
+	    sizeof(keys->addrs.v6addrs.src));
+	memcpy(&keys->addrs.v6addrs.dst, &fl6->daddr,
+	    sizeof(keys->addrs.v6addrs.dst));
+	keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+	keys->ports.src = fl6->fl6_sport;
+	keys->ports.dst = fl6->fl6_dport;
+	keys->keyid.keyid = fl6->fl6_gre_key;
+	keys->tags.flow_label = (__force u32)fl6->flowlabel;
+	keys->basic.ip_proto = fl6->flowi6_proto;
+
+	return flow_hash_from_keys(keys);
+}
+EXPORT_SYMBOL(__get_hash_from_flowi6);
+
+__u32 __get_hash_from_flowi4(struct flowi4 *fl4, struct flow_keys *keys)
+{
+	memset(keys, 0, sizeof(*keys));
+
+	keys->addrs.v4addrs.src = fl4->saddr;
+	keys->addrs.v4addrs.dst = fl4->daddr;
+	keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+	keys->ports.src = fl4->fl4_sport;
+	keys->ports.dst = fl4->fl4_dport;
+	keys->keyid.keyid = fl4->fl4_gre_key;
+	keys->basic.ip_proto = fl4->flowi4_proto;
+
+	return flow_hash_from_keys(keys);
+}
+EXPORT_SYMBOL(__get_hash_from_flowi4);
-- 
cgit v1.2.3


From 807e165dc44fd93f9d378f861f0540a158d7343a Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Tue, 1 Sep 2015 09:24:28 -0700
Subject: flow_dissector: Add control/reporting of fragmentation

Add an input flag to flow dissector on rather dissection should be
attempted on a first fragment. Also add key_control flags to indicate
that a packet is a fragment or first fragment.

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/flow_dissector.h |  4 ++++
 net/core/flow_dissector.c    | 15 +++++++++++++--
 2 files changed, 17 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index 74fe160f0b05..34102270b086 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -12,6 +12,8 @@
 struct flow_dissector_key_control {
 	u16	thoff;
 	u16	addr_type;
+	u32	is_fragment:1;
+	u32	first_frag:1;
 };
 
 /**
@@ -122,6 +124,8 @@ enum flow_dissector_key_id {
 	FLOW_DISSECTOR_KEY_MAX,
 };
 
+#define FLOW_DISSECTOR_F_PARSE_1ST_FRAG		BIT(0)
+
 struct flow_dissector_key {
 	enum flow_dissector_key_id key_id;
 	size_t offset; /* offset of struct flow_dissector_key_*
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index c3d9807cb34e..7536a4669029 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -177,8 +177,6 @@ ip:
 		nhoff += iph->ihl * 4;
 
 		ip_proto = iph->protocol;
-		if (ip_is_fragment(iph))
-			ip_proto = 0;
 
 		if (!skb_flow_dissector_uses_key(flow_dissector,
 						 FLOW_DISSECTOR_KEY_IPV4_ADDRS))
@@ -189,6 +187,19 @@ ip:
 		memcpy(&key_addrs->v4addrs, &iph->saddr,
 		       sizeof(key_addrs->v4addrs));
 		key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+
+		if (ip_is_fragment(iph)) {
+			key_control->is_fragment = 1;
+
+			if (iph->frag_off & htons(IP_OFFSET)) {
+				goto out_good;
+			} else {
+				key_control->first_frag = 1;
+				if (!(flags & FLOW_DISSECTOR_F_PARSE_1ST_FRAG))
+					goto out_good;
+			}
+		}
+
 		break;
 	}
 	case htons(ETH_P_IPV6): {
-- 
cgit v1.2.3


From 8306b688f1a6621b9efe3b0d827e26750528b12a Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Tue, 1 Sep 2015 09:24:30 -0700
Subject: flow_dissector: Add flag to stop parsing at L3

Add an input flag to flow dissector on rather dissection should be
stopped when an L3 packet is encountered. This would be useful if a
caller just wanted to get IP addresses of the outermost header (e.g.
to do an L3 hash).

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/flow_dissector.h | 1 +
 net/core/flow_dissector.c    | 6 ++++++
 2 files changed, 7 insertions(+)

(limited to 'include/net')

diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index 34102270b086..bb81e3b576e7 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -125,6 +125,7 @@ enum flow_dissector_key_id {
 };
 
 #define FLOW_DISSECTOR_F_PARSE_1ST_FRAG		BIT(0)
+#define FLOW_DISSECTOR_F_STOP_AT_L3		BIT(1)
 
 struct flow_dissector_key {
 	enum flow_dissector_key_id key_id;
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 907de2f68b1f..94fd841f341f 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -200,6 +200,9 @@ ip:
 			}
 		}
 
+		if (flags & FLOW_DISSECTOR_F_STOP_AT_L3)
+			goto out_good;
+
 		break;
 	}
 	case htons(ETH_P_IPV6): {
@@ -238,6 +241,9 @@ ipv6:
 			}
 		}
 
+		if (flags & FLOW_DISSECTOR_F_STOP_AT_L3)
+			goto out_good;
+
 		break;
 	}
 	case htons(ETH_P_8021AD):
-- 
cgit v1.2.3


From 872b1abb1ed47a691f465fb3d285f6cf6bcd8663 Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Tue, 1 Sep 2015 09:24:31 -0700
Subject: flow_dissector: Add flag to stop parsing when an IPv6 flow label is
 seen

Add an input flag to flow dissector on rather dissection should be
stopped when a flow label is encountered. Presumably, the flow label
is derived from a sufficient hash of an inner transport packet so
further dissection is not needed (that is ports are not included in
the flow hash). Using the flow label instead of ports has the additional
benefit that packet fragments should hash to same value as non-fragments
for a flow (assuming that the same flow label is used).

We set this flag by default in for skb_get_hash.

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/flow_dissector.h | 1 +
 net/core/flow_dissector.c    | 5 ++++-
 2 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index bb81e3b576e7..66dbc3498efb 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -126,6 +126,7 @@ enum flow_dissector_key_id {
 
 #define FLOW_DISSECTOR_F_PARSE_1ST_FRAG		BIT(0)
 #define FLOW_DISSECTOR_F_STOP_AT_L3		BIT(1)
+#define FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL	BIT(2)
 
 struct flow_dissector_key {
 	enum flow_dissector_key_id key_id;
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 94fd841f341f..094e34354627 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -239,6 +239,8 @@ ipv6:
 								     target_container);
 				key_tags->flow_label = ntohl(flow_label);
 			}
+			if (flags & FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL)
+				goto out_good;
 		}
 
 		if (flags & FLOW_DISSECTOR_F_STOP_AT_L3)
@@ -599,7 +601,8 @@ EXPORT_SYMBOL(flow_hash_from_keys);
 static inline u32 ___skb_get_hash(const struct sk_buff *skb,
 				  struct flow_keys *keys, u32 keyval)
 {
-	if (!skb_flow_dissect_flow_keys(skb, keys, 0))
+	if (!skb_flow_dissect_flow_keys(skb, keys,
+					FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL))
 		return 0;
 
 	return __flow_hash_from_keys(keys, keyval);
-- 
cgit v1.2.3


From 823b96939578eae67b9d6c0e33a39d6a7b6401e7 Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Tue, 1 Sep 2015 09:24:32 -0700
Subject: flow_dissector: Add control/reporting of encapsulation

Add an input flag to flow dissector on rather dissection should stop
when encapsulation is detected (IP/IP or GRE). Also, add a key_control
flag that indicates encapsulation was encountered during the
dissection.

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/flow_dissector.h |  2 ++
 net/core/flow_dissector.c    | 15 +++++++++++++++
 2 files changed, 17 insertions(+)

(limited to 'include/net')

diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index 66dbc3498efb..bddd1089dbce 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -14,6 +14,7 @@ struct flow_dissector_key_control {
 	u16	addr_type;
 	u32	is_fragment:1;
 	u32	first_frag:1;
+	u32	encapsulation:1;
 };
 
 /**
@@ -127,6 +128,7 @@ enum flow_dissector_key_id {
 #define FLOW_DISSECTOR_F_PARSE_1ST_FRAG		BIT(0)
 #define FLOW_DISSECTOR_F_STOP_AT_L3		BIT(1)
 #define FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL	BIT(2)
+#define FLOW_DISSECTOR_F_STOP_AT_ENCAP		BIT(3)
 
 struct flow_dissector_key {
 	enum flow_dissector_key_id key_id;
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 094e34354627..8d890132e2d7 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -397,6 +397,11 @@ ip_proto_again:
 			proto = eth->h_proto;
 			nhoff += sizeof(*eth);
 		}
+
+		key_control->encapsulation = 1;
+		if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP)
+			goto out_good;
+
 		goto again;
 	}
 	case NEXTHDR_HOP:
@@ -444,9 +449,19 @@ ip_proto_again:
 	}
 	case IPPROTO_IPIP:
 		proto = htons(ETH_P_IP);
+
+		key_control->encapsulation = 1;
+		if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP)
+			goto out_good;
+
 		goto ip;
 	case IPPROTO_IPV6:
 		proto = htons(ETH_P_IPV6);
+
+		key_control->encapsulation = 1;
+		if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP)
+			goto out_good;
+
 		goto ipv6;
 	case IPPROTO_MPLS:
 		proto = htons(ETH_P_MPLS_UC);
-- 
cgit v1.2.3


From 4b36993d3df0834eff3b4172962de0343a4d9123 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 1 Sep 2015 16:46:08 -0700
Subject: flow_dissector: Don't use bit fields.

Just have a flags member instead.

   In file included from include/linux/linkage.h:4:0,
                    from include/linux/kernel.h:6,
                    from net/core/flow_dissector.c:1:
   In function 'flow_keys_hash_start',
       inlined from 'flow_hash_from_keys' at net/core/flow_dissector.c:553:34:
>> include/linux/compiler.h:447:38: error: call to '__compiletime_assert_459' declared with attribute error: BUILD_BUG_ON failed: FLOW_KEYS_HASH_OFFSET % sizeof(u32)

Reported-by: kbuild test robot <fengguang.wu@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/flow_dissector.h |  8 +++++---
 net/core/flow_dissector.c    | 14 +++++++-------
 2 files changed, 12 insertions(+), 10 deletions(-)

(limited to 'include/net')

diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index bddd1089dbce..8c8548cf5888 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -12,11 +12,13 @@
 struct flow_dissector_key_control {
 	u16	thoff;
 	u16	addr_type;
-	u32	is_fragment:1;
-	u32	first_frag:1;
-	u32	encapsulation:1;
+	u32	flags;
 };
 
+#define FLOW_DIS_IS_FRAGMENT	BIT(0)
+#define FLOW_DIS_FIRST_FRAG	BIT(1)
+#define FLOW_DIS_ENCAPSULATION	BIT(2)
+
 /**
  * struct flow_dissector_key_basic:
  * @thoff: Transport header offset
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index b563339436d0..8d32020303c6 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -189,12 +189,12 @@ ip:
 		key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
 
 		if (ip_is_fragment(iph)) {
-			key_control->is_fragment = 1;
+			key_control->flags |= FLOW_DIS_IS_FRAGMENT;
 
 			if (iph->frag_off & htons(IP_OFFSET)) {
 				goto out_good;
 			} else {
-				key_control->first_frag = 1;
+				key_control->flags |= FLOW_DIS_FIRST_FRAG;
 				if (!(flags & FLOW_DISSECTOR_F_PARSE_1ST_FRAG))
 					goto out_good;
 			}
@@ -398,7 +398,7 @@ ip_proto_again:
 			nhoff += sizeof(*eth);
 		}
 
-		key_control->encapsulation = 1;
+		key_control->flags |= FLOW_DIS_ENCAPSULATION;
 		if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP)
 			goto out_good;
 
@@ -434,12 +434,12 @@ ip_proto_again:
 		if (!fh)
 			goto out_bad;
 
-		key_control->is_fragment = 1;
+		key_control->flags |= FLOW_DIS_IS_FRAGMENT;
 
 		nhoff += sizeof(_fh);
 
 		if (!(fh->frag_off & htons(IP6_OFFSET))) {
-			key_control->first_frag = 1;
+			key_control->flags |= FLOW_DIS_FIRST_FRAG;
 			if (flags & FLOW_DISSECTOR_F_PARSE_1ST_FRAG) {
 				ip_proto = fh->nexthdr;
 				goto ip_proto_again;
@@ -450,7 +450,7 @@ ip_proto_again:
 	case IPPROTO_IPIP:
 		proto = htons(ETH_P_IP);
 
-		key_control->encapsulation = 1;
+		key_control->flags |= FLOW_DIS_ENCAPSULATION;
 		if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP)
 			goto out_good;
 
@@ -458,7 +458,7 @@ ip_proto_again:
 	case IPPROTO_IPV6:
 		proto = htons(ETH_P_IPV6);
 
-		key_control->encapsulation = 1;
+		key_control->flags |= FLOW_DIS_ENCAPSULATION;
 		if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP)
 			goto out_good;
 
-- 
cgit v1.2.3


From 20a17bf6c04e3eca8824c930ecc55ab832558e3b Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 1 Sep 2015 21:19:17 -0700
Subject: flow_dissector: Use 'const' where possible.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h    |  8 ++---
 include/net/flow.h        |  8 ++---
 net/core/flow_dissector.c | 79 ++++++++++++++++++++++++-----------------------
 3 files changed, 49 insertions(+), 46 deletions(-)

(limited to 'include/net')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index eabfb810bc62..2738d355cdf9 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1029,9 +1029,9 @@ static inline __u32 skb_get_hash(struct sk_buff *skb)
 	return skb->hash;
 }
 
-__u32 __skb_get_hash_flowi6(struct sk_buff *skb, struct flowi6 *fl6);
+__u32 __skb_get_hash_flowi6(struct sk_buff *skb, const struct flowi6 *fl6);
 
-static inline __u32 skb_get_hash_flowi6(struct sk_buff *skb, struct flowi6 *fl6)
+static inline __u32 skb_get_hash_flowi6(struct sk_buff *skb, const struct flowi6 *fl6)
 {
 	if (!skb->l4_hash && !skb->sw_hash) {
 		struct flow_keys keys;
@@ -1043,9 +1043,9 @@ static inline __u32 skb_get_hash_flowi6(struct sk_buff *skb, struct flowi6 *fl6)
 	return skb->hash;
 }
 
-__u32 __skb_get_hash_flowi4(struct sk_buff *skb, struct flowi4 *fl);
+__u32 __skb_get_hash_flowi4(struct sk_buff *skb, const struct flowi4 *fl);
 
-static inline __u32 skb_get_hash_flowi4(struct sk_buff *skb, struct flowi4 *fl4)
+static inline __u32 skb_get_hash_flowi4(struct sk_buff *skb, const struct flowi4 *fl4)
 {
 	if (!skb->l4_hash && !skb->sw_hash) {
 		struct flow_keys keys;
diff --git a/include/net/flow.h b/include/net/flow.h
index dafe97c3c048..acd6a096250e 100644
--- a/include/net/flow.h
+++ b/include/net/flow.h
@@ -244,18 +244,18 @@ void flow_cache_flush(struct net *net);
 void flow_cache_flush_deferred(struct net *net);
 extern atomic_t flow_cache_genid;
 
-__u32 __get_hash_from_flowi6(struct flowi6 *fl6, struct flow_keys *keys);
+__u32 __get_hash_from_flowi6(const struct flowi6 *fl6, struct flow_keys *keys);
 
-static inline __u32 get_hash_from_flowi6(struct flowi6 *fl6)
+static inline __u32 get_hash_from_flowi6(const struct flowi6 *fl6)
 {
 	struct flow_keys keys;
 
 	return __get_hash_from_flowi6(fl6, &keys);
 }
 
-__u32 __get_hash_from_flowi4(struct flowi4 *fl4, struct flow_keys *keys);
+__u32 __get_hash_from_flowi4(const struct flowi4 *fl4, struct flow_keys *keys);
 
-static inline __u32 get_hash_from_flowi4(struct flowi4 *fl4)
+static inline __u32 get_hash_from_flowi4(const struct flowi4 *fl4)
 {
 	struct flow_keys keys;
 
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 345a0408cfe4..d79699c9d1b9 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -19,14 +19,14 @@
 #include <net/flow_dissector.h>
 #include <scsi/fc/fc_fcoe.h>
 
-static bool skb_flow_dissector_uses_key(struct flow_dissector *flow_dissector,
-					enum flow_dissector_key_id key_id)
+static bool dissector_uses_key(const struct flow_dissector *flow_dissector,
+			       enum flow_dissector_key_id key_id)
 {
 	return flow_dissector->used_keys & (1 << key_id);
 }
 
-static void skb_flow_dissector_set_key(struct flow_dissector *flow_dissector,
-				       enum flow_dissector_key_id key_id)
+static void dissector_set_key(struct flow_dissector *flow_dissector,
+			      enum flow_dissector_key_id key_id)
 {
 	flow_dissector->used_keys |= (1 << key_id);
 }
@@ -51,20 +51,20 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
 		 * boundaries of unsigned short.
 		 */
 		BUG_ON(key->offset > USHRT_MAX);
-		BUG_ON(skb_flow_dissector_uses_key(flow_dissector,
-						   key->key_id));
+		BUG_ON(dissector_uses_key(flow_dissector,
+					  key->key_id));
 
-		skb_flow_dissector_set_key(flow_dissector, key->key_id);
+		dissector_set_key(flow_dissector, key->key_id);
 		flow_dissector->offset[key->key_id] = key->offset;
 	}
 
 	/* Ensure that the dissector always includes control and basic key.
 	 * That way we are able to avoid handling lack of these in fast path.
 	 */
-	BUG_ON(!skb_flow_dissector_uses_key(flow_dissector,
-					    FLOW_DISSECTOR_KEY_CONTROL));
-	BUG_ON(!skb_flow_dissector_uses_key(flow_dissector,
-					    FLOW_DISSECTOR_KEY_BASIC));
+	BUG_ON(!dissector_uses_key(flow_dissector,
+				   FLOW_DISSECTOR_KEY_CONTROL));
+	BUG_ON(!dissector_uses_key(flow_dissector,
+				   FLOW_DISSECTOR_KEY_BASIC));
 }
 EXPORT_SYMBOL(skb_flow_dissector_init);
 
@@ -154,8 +154,8 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
 					      FLOW_DISSECTOR_KEY_BASIC,
 					      target_container);
 
-	if (skb_flow_dissector_uses_key(flow_dissector,
-					FLOW_DISSECTOR_KEY_ETH_ADDRS)) {
+	if (dissector_uses_key(flow_dissector,
+			       FLOW_DISSECTOR_KEY_ETH_ADDRS)) {
 		struct ethhdr *eth = eth_hdr(skb);
 		struct flow_dissector_key_eth_addrs *key_eth_addrs;
 
@@ -178,8 +178,8 @@ ip:
 
 		ip_proto = iph->protocol;
 
-		if (!skb_flow_dissector_uses_key(flow_dissector,
-						 FLOW_DISSECTOR_KEY_IPV4_ADDRS))
+		if (!dissector_uses_key(flow_dissector,
+					FLOW_DISSECTOR_KEY_IPV4_ADDRS))
 			break;
 
 		key_addrs = skb_flow_dissector_target(flow_dissector,
@@ -218,8 +218,8 @@ ipv6:
 		ip_proto = iph->nexthdr;
 		nhoff += sizeof(struct ipv6hdr);
 
-		if (skb_flow_dissector_uses_key(flow_dissector,
-						FLOW_DISSECTOR_KEY_IPV6_ADDRS)) {
+		if (dissector_uses_key(flow_dissector,
+				       FLOW_DISSECTOR_KEY_IPV6_ADDRS)) {
 			struct flow_dissector_key_ipv6_addrs *key_ipv6_addrs;
 
 			key_ipv6_addrs = skb_flow_dissector_target(flow_dissector,
@@ -232,8 +232,8 @@ ipv6:
 
 		flow_label = ip6_flowlabel(iph);
 		if (flow_label) {
-			if (skb_flow_dissector_uses_key(flow_dissector,
-				FLOW_DISSECTOR_KEY_FLOW_LABEL)) {
+			if (dissector_uses_key(flow_dissector,
+					       FLOW_DISSECTOR_KEY_FLOW_LABEL)) {
 				key_tags = skb_flow_dissector_target(flow_dissector,
 								     FLOW_DISSECTOR_KEY_FLOW_LABEL,
 								     target_container);
@@ -257,8 +257,8 @@ ipv6:
 		if (!vlan)
 			goto out_bad;
 
-		if (skb_flow_dissector_uses_key(flow_dissector,
-						FLOW_DISSECTOR_KEY_VLANID)) {
+		if (dissector_uses_key(flow_dissector,
+				       FLOW_DISSECTOR_KEY_VLANID)) {
 			key_tags = skb_flow_dissector_target(flow_dissector,
 							     FLOW_DISSECTOR_KEY_VLANID,
 							     target_container);
@@ -298,8 +298,8 @@ ipv6:
 		if (!hdr)
 			goto out_bad;
 
-		if (skb_flow_dissector_uses_key(flow_dissector,
-						FLOW_DISSECTOR_KEY_TIPC_ADDRS)) {
+		if (dissector_uses_key(flow_dissector,
+				       FLOW_DISSECTOR_KEY_TIPC_ADDRS)) {
 			key_addrs = skb_flow_dissector_target(flow_dissector,
 							      FLOW_DISSECTOR_KEY_TIPC_ADDRS,
 							      target_container);
@@ -320,8 +320,8 @@ mpls:
 
 		if ((ntohl(hdr[0].entry) & MPLS_LS_LABEL_MASK) >>
 		     MPLS_LS_LABEL_SHIFT == MPLS_LABEL_ENTROPY) {
-			if (skb_flow_dissector_uses_key(flow_dissector,
-							FLOW_DISSECTOR_KEY_MPLS_ENTROPY)) {
+			if (dissector_uses_key(flow_dissector,
+					       FLOW_DISSECTOR_KEY_MPLS_ENTROPY)) {
 				key_keyid = skb_flow_dissector_target(flow_dissector,
 								      FLOW_DISSECTOR_KEY_MPLS_ENTROPY,
 								      target_container);
@@ -374,8 +374,8 @@ ip_proto_again:
 			if (!keyid)
 				goto out_bad;
 
-			if (skb_flow_dissector_uses_key(flow_dissector,
-							FLOW_DISSECTOR_KEY_GRE_KEYID)) {
+			if (dissector_uses_key(flow_dissector,
+					       FLOW_DISSECTOR_KEY_GRE_KEYID)) {
 				key_keyid = skb_flow_dissector_target(flow_dissector,
 								      FLOW_DISSECTOR_KEY_GRE_KEYID,
 								      target_container);
@@ -470,8 +470,8 @@ ip_proto_again:
 		break;
 	}
 
-	if (skb_flow_dissector_uses_key(flow_dissector,
-					FLOW_DISSECTOR_KEY_PORTS)) {
+	if (dissector_uses_key(flow_dissector,
+			       FLOW_DISSECTOR_KEY_PORTS)) {
 		key_ports = skb_flow_dissector_target(flow_dissector,
 						      FLOW_DISSECTOR_KEY_PORTS,
 						      target_container);
@@ -497,18 +497,21 @@ static __always_inline void __flow_hash_secret_init(void)
 	net_get_random_once(&hashrnd, sizeof(hashrnd));
 }
 
-static __always_inline u32 __flow_hash_words(u32 *words, u32 length, u32 keyval)
+static __always_inline u32 __flow_hash_words(const u32 *words, u32 length,
+					     u32 keyval)
 {
 	return jhash2(words, length, keyval);
 }
 
-static inline void *flow_keys_hash_start(struct flow_keys *flow)
+static inline const u32 *flow_keys_hash_start(const struct flow_keys *flow)
 {
+	const void *p = flow;
+
 	BUILD_BUG_ON(FLOW_KEYS_HASH_OFFSET % sizeof(u32));
-	return (void *)flow + FLOW_KEYS_HASH_OFFSET;
+	return (const u32 *)(p + FLOW_KEYS_HASH_OFFSET);
 }
 
-static inline size_t flow_keys_hash_length(struct flow_keys *flow)
+static inline size_t flow_keys_hash_length(const struct flow_keys *flow)
 {
 	size_t diff = FLOW_KEYS_HASH_OFFSET + sizeof(flow->addrs);
 	BUILD_BUG_ON((sizeof(*flow) - FLOW_KEYS_HASH_OFFSET) % sizeof(u32));
@@ -598,7 +601,7 @@ static inline u32 __flow_hash_from_keys(struct flow_keys *keys, u32 keyval)
 
 	__flow_hash_consistentify(keys);
 
-	hash = __flow_hash_words((u32 *)flow_keys_hash_start(keys),
+	hash = __flow_hash_words(flow_keys_hash_start(keys),
 				 flow_keys_hash_length(keys), keyval);
 	if (!hash)
 		hash = 1;
@@ -677,7 +680,7 @@ __u32 skb_get_hash_perturb(const struct sk_buff *skb, u32 perturb)
 }
 EXPORT_SYMBOL(skb_get_hash_perturb);
 
-__u32 __skb_get_hash_flowi6(struct sk_buff *skb, struct flowi6 *fl6)
+__u32 __skb_get_hash_flowi6(struct sk_buff *skb, const struct flowi6 *fl6)
 {
 	struct flow_keys keys;
 
@@ -701,7 +704,7 @@ __u32 __skb_get_hash_flowi6(struct sk_buff *skb, struct flowi6 *fl6)
 }
 EXPORT_SYMBOL(__skb_get_hash_flowi6);
 
-__u32 __skb_get_hash_flowi4(struct sk_buff *skb, struct flowi4 *fl4)
+__u32 __skb_get_hash_flowi4(struct sk_buff *skb, const struct flowi4 *fl4)
 {
 	struct flow_keys keys;
 
@@ -787,7 +790,7 @@ u32 skb_get_poff(const struct sk_buff *skb)
 	return __skb_get_poff(skb, skb->data, &keys, skb_headlen(skb));
 }
 
-__u32 __get_hash_from_flowi6(struct flowi6 *fl6, struct flow_keys *keys)
+__u32 __get_hash_from_flowi6(const struct flowi6 *fl6, struct flow_keys *keys)
 {
 	memset(keys, 0, sizeof(*keys));
 
@@ -806,7 +809,7 @@ __u32 __get_hash_from_flowi6(struct flowi6 *fl6, struct flow_keys *keys)
 }
 EXPORT_SYMBOL(__get_hash_from_flowi6);
 
-__u32 __get_hash_from_flowi4(struct flowi4 *fl4, struct flow_keys *keys)
+__u32 __get_hash_from_flowi4(const struct flowi4 *fl4, struct flow_keys *keys)
 {
 	memset(keys, 0, sizeof(*keys));
 
-- 
cgit v1.2.3


From 62da98656b62a5ca57f22263705175af8ded5aa1 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Thu, 3 Sep 2015 01:26:07 +0200
Subject: netfilter: nf_conntrack: make nf_ct_zone_dflt built-in

Fengguang reported, that some randconfig generated the following linker
issue with nf_ct_zone_dflt object involved:

  [...]
  CC      init/version.o
  LD      init/built-in.o
  net/built-in.o: In function `ipv4_conntrack_defrag':
  nf_defrag_ipv4.c:(.text+0x93e95): undefined reference to `nf_ct_zone_dflt'
  net/built-in.o: In function `ipv6_defrag':
  nf_defrag_ipv6_hooks.c:(.text+0xe3ffe): undefined reference to `nf_ct_zone_dflt'
  make: *** [vmlinux] Error 1

Given that configurations exist where we have a built-in part, which is
accessing nf_ct_zone_dflt such as the two handlers nf_ct_defrag_user()
and nf_ct6_defrag_user(), and a part that configures nf_conntrack as a
module, we must move nf_ct_zone_dflt into a fixed, guaranteed built-in
area when netfilter is configured in general.

Therefore, split the more generic parts into a common header under
include/linux/netfilter/ and move nf_ct_zone_dflt into the built-in
section that already holds parts related to CONFIG_NF_CONNTRACK in the
netfilter core. This fixes the issue on my side.

Fixes: 308ac9143ee2 ("netfilter: nf_conntrack: push zone object into functions")
Reported-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter.h                          |  2 ++
 .../linux/netfilter/nf_conntrack_zones_common.h    | 23 ++++++++++++++++++++++
 include/net/netfilter/nf_conntrack_zones.h         | 19 +-----------------
 net/netfilter/core.c                               |  6 ++++++
 net/netfilter/nf_conntrack_core.c                  |  7 -------
 5 files changed, 32 insertions(+), 25 deletions(-)
 create mode 100644 include/linux/netfilter/nf_conntrack_zones_common.h

(limited to 'include/net')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index d788ce62d826..36a652531791 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -368,6 +368,8 @@ nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl, u_int8_t family)
 #endif /*CONFIG_NETFILTER*/
 
 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+#include <linux/netfilter/nf_conntrack_zones_common.h>
+
 extern void (*ip_ct_attach)(struct sk_buff *, const struct sk_buff *) __rcu;
 void nf_ct_attach(struct sk_buff *, const struct sk_buff *);
 extern void (*nf_ct_destroy)(struct nf_conntrack *) __rcu;
diff --git a/include/linux/netfilter/nf_conntrack_zones_common.h b/include/linux/netfilter/nf_conntrack_zones_common.h
new file mode 100644
index 000000000000..5d7cf36d4766
--- /dev/null
+++ b/include/linux/netfilter/nf_conntrack_zones_common.h
@@ -0,0 +1,23 @@
+#ifndef _NF_CONNTRACK_ZONES_COMMON_H
+#define _NF_CONNTRACK_ZONES_COMMON_H
+
+#include <uapi/linux/netfilter/nf_conntrack_tuple_common.h>
+
+#define NF_CT_DEFAULT_ZONE_ID	0
+
+#define NF_CT_ZONE_DIR_ORIG	(1 << IP_CT_DIR_ORIGINAL)
+#define NF_CT_ZONE_DIR_REPL	(1 << IP_CT_DIR_REPLY)
+
+#define NF_CT_DEFAULT_ZONE_DIR	(NF_CT_ZONE_DIR_ORIG | NF_CT_ZONE_DIR_REPL)
+
+#define NF_CT_FLAG_MARK		1
+
+struct nf_conntrack_zone {
+	u16	id;
+	u8	flags;
+	u8	dir;
+};
+
+extern const struct nf_conntrack_zone nf_ct_zone_dflt;
+
+#endif /* _NF_CONNTRACK_ZONES_COMMON_H */
diff --git a/include/net/netfilter/nf_conntrack_zones.h b/include/net/netfilter/nf_conntrack_zones.h
index 5316c7b3a374..4e32512cef32 100644
--- a/include/net/netfilter/nf_conntrack_zones.h
+++ b/include/net/netfilter/nf_conntrack_zones.h
@@ -1,24 +1,7 @@
 #ifndef _NF_CONNTRACK_ZONES_H
 #define _NF_CONNTRACK_ZONES_H
 
-#include <linux/netfilter/nf_conntrack_tuple_common.h>
-
-#define NF_CT_DEFAULT_ZONE_ID	0
-
-#define NF_CT_ZONE_DIR_ORIG	(1 << IP_CT_DIR_ORIGINAL)
-#define NF_CT_ZONE_DIR_REPL	(1 << IP_CT_DIR_REPLY)
-
-#define NF_CT_DEFAULT_ZONE_DIR	(NF_CT_ZONE_DIR_ORIG | NF_CT_ZONE_DIR_REPL)
-
-#define NF_CT_FLAG_MARK		1
-
-struct nf_conntrack_zone {
-	u16	id;
-	u8	flags;
-	u8	dir;
-};
-
-extern const struct nf_conntrack_zone nf_ct_zone_dflt;
+#include <linux/netfilter/nf_conntrack_zones_common.h>
 
 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
 #include <net/netfilter/nf_conntrack_extend.h>
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 0b939b7ad724..8e47f8113495 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -388,6 +388,12 @@ EXPORT_SYMBOL(nf_conntrack_destroy);
 struct nfq_ct_hook __rcu *nfq_ct_hook __read_mostly;
 EXPORT_SYMBOL_GPL(nfq_ct_hook);
 
+/* Built-in default zone used e.g. by modules. */
+const struct nf_conntrack_zone nf_ct_zone_dflt = {
+	.id	= NF_CT_DEFAULT_ZONE_ID,
+	.dir	= NF_CT_DEFAULT_ZONE_DIR,
+};
+EXPORT_SYMBOL_GPL(nf_ct_zone_dflt);
 #endif /* CONFIG_NF_CONNTRACK */
 
 #ifdef CONFIG_NF_NAT_NEEDED
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index ac3be9b0629b..eedf0495f11f 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1286,13 +1286,6 @@ bool __nf_ct_kill_acct(struct nf_conn *ct,
 }
 EXPORT_SYMBOL_GPL(__nf_ct_kill_acct);
 
-/* Built-in default zone used e.g. by modules. */
-const struct nf_conntrack_zone nf_ct_zone_dflt = {
-	.id	= NF_CT_DEFAULT_ZONE_ID,
-	.dir	= NF_CT_DEFAULT_ZONE_DIR,
-};
-EXPORT_SYMBOL_GPL(nf_ct_zone_dflt);
-
 #ifdef CONFIG_NF_CONNTRACK_ZONES
 static struct nf_ct_ext_type nf_ct_zone_extend __read_mostly = {
 	.len	= sizeof(struct nf_conntrack_zone),
-- 
cgit v1.2.3


From 22f66895e60cfc55b92f6fa93f05bb3fbdbd0bed Mon Sep 17 00:00:00 2001
From: Avri Altman <avri.altman@intel.com>
Date: Tue, 18 Aug 2015 16:52:07 +0300
Subject: mac80211: protect non-HT BSS when HT TDLS traffic exists

HT TDLS traffic should be protected in a non-HT BSS to avoid
collisions. Therefore, when TDLS peers join/leave, check if
protection is (now) needed and set the ht_operation_mode of
the virtual interface according to the HT capabilities of the
TDLS peer(s).

This works because a non-HT BSS connection never sets (or
otherwise uses) the ht_operation_mode; it just means that
drivers must be aware that this field applies to all HT
traffic for this virtual interface, not just the traffic
within the BSS. Document that.

Signed-off-by: Avri Altman <avri.altman@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h |  4 ++-
 net/mac80211/tdls.c    | 70 +++++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 70 insertions(+), 4 deletions(-)

(limited to 'include/net')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index e3314e516681..bfc569498bfa 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -477,7 +477,9 @@ struct ieee80211_event {
  * @chandef: Channel definition for this BSS -- the hardware might be
  *	configured a higher bandwidth than this BSS uses, for example.
  * @ht_operation_mode: HT operation mode like in &struct ieee80211_ht_operation.
- *	This field is only valid when the channel type is one of the HT types.
+ *	This field is only valid when the channel is a wide HT/VHT channel.
+ *	Note that with TDLS this can be the case (channel is HT, protection must
+ *	be used from this field) even when the BSS association isn't using HT.
  * @cqm_rssi_thold: Connection quality monitor RSSI threshold, a zero value
  *	implies disabled
  * @cqm_rssi_hyst: Connection quality monitor RSSI hysteresis
diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c
index aee701a5649e..4e202d0679b2 100644
--- a/net/mac80211/tdls.c
+++ b/net/mac80211/tdls.c
@@ -1249,6 +1249,58 @@ static void iee80211_tdls_recalc_chanctx(struct ieee80211_sub_if_data *sdata)
 	mutex_unlock(&local->chanctx_mtx);
 }
 
+static int iee80211_tdls_have_ht_peers(struct ieee80211_sub_if_data *sdata)
+{
+	struct sta_info *sta;
+	bool result = false;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(sta, &sdata->local->sta_list, list) {
+		if (!sta->sta.tdls || sta->sdata != sdata || !sta->uploaded ||
+		    !test_sta_flag(sta, WLAN_STA_AUTHORIZED) ||
+		    !test_sta_flag(sta, WLAN_STA_TDLS_PEER_AUTH) ||
+		    !sta->sta.ht_cap.ht_supported)
+			continue;
+		result = true;
+		break;
+	}
+	rcu_read_unlock();
+
+	return result;
+}
+
+static void
+iee80211_tdls_recalc_ht_protection(struct ieee80211_sub_if_data *sdata,
+				   struct sta_info *sta)
+{
+	struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+	bool tdls_ht;
+	u16 protection = IEEE80211_HT_OP_MODE_PROTECTION_NONHT_MIXED |
+			 IEEE80211_HT_OP_MODE_NON_GF_STA_PRSNT |
+			 IEEE80211_HT_OP_MODE_NON_HT_STA_PRSNT;
+	u16 opmode;
+
+	/* Nothing to do if the BSS connection uses HT */
+	if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HT))
+		return;
+
+	tdls_ht = (sta && sta->sta.ht_cap.ht_supported) ||
+		  iee80211_tdls_have_ht_peers(sdata);
+
+	opmode = sdata->vif.bss_conf.ht_operation_mode;
+
+	if (tdls_ht)
+		opmode |= protection;
+	else
+		opmode &= ~protection;
+
+	if (opmode == sdata->vif.bss_conf.ht_operation_mode)
+		return;
+
+	sdata->vif.bss_conf.ht_operation_mode = opmode;
+	ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_HT);
+}
+
 int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev,
 			const u8 *peer, enum nl80211_tdls_operation oper)
 {
@@ -1274,6 +1326,10 @@ int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev,
 		return -ENOTSUPP;
 	}
 
+	/* protect possible bss_conf changes and avoid concurrency in
+	 * ieee80211_bss_info_change_notify()
+	 */
+	sdata_lock(sdata);
 	mutex_lock(&local->mtx);
 	tdls_dbg(sdata, "TDLS oper %d peer %pM\n", oper, peer);
 
@@ -1287,16 +1343,18 @@ int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev,
 
 		iee80211_tdls_recalc_chanctx(sdata);
 
-		rcu_read_lock();
+		mutex_lock(&local->sta_mtx);
 		sta = sta_info_get(sdata, peer);
 		if (!sta) {
-			rcu_read_unlock();
+			mutex_unlock(&local->sta_mtx);
 			ret = -ENOLINK;
 			break;
 		}
 
+		iee80211_tdls_recalc_ht_protection(sdata, sta);
+
 		set_sta_flag(sta, WLAN_STA_TDLS_PEER_AUTH);
-		rcu_read_unlock();
+		mutex_unlock(&local->sta_mtx);
 
 		WARN_ON_ONCE(is_zero_ether_addr(sdata->u.mgd.tdls_peer) ||
 			     !ether_addr_equal(sdata->u.mgd.tdls_peer, peer));
@@ -1318,6 +1376,11 @@ int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev,
 		ieee80211_flush_queues(local, sdata, false);
 
 		ret = sta_info_destroy_addr(sdata, peer);
+
+		mutex_lock(&local->sta_mtx);
+		iee80211_tdls_recalc_ht_protection(sdata, NULL);
+		mutex_unlock(&local->sta_mtx);
+
 		iee80211_tdls_recalc_chanctx(sdata);
 		break;
 	default:
@@ -1335,6 +1398,7 @@ int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev,
 				     &sdata->u.mgd.request_smps_work);
 
 	mutex_unlock(&local->mtx);
+	sdata_unlock(sdata);
 	return ret;
 }
 
-- 
cgit v1.2.3


From 33398cf2f360c5ce24c8a22436d52a06ad4e5eb5 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.cz>
Date: Tue, 8 Sep 2015 15:01:02 -0700
Subject: memcg: export struct mem_cgroup

mem_cgroup structure is defined in mm/memcontrol.c currently which means
that the code outside of this file has to use external API even for
trivial access stuff.

This patch exports mm_struct with its dependencies and makes some of the
exported functions inlines.  This even helps to reduce the code size a bit
(make defconfig + CONFIG_MEMCG=y)

  text		data    bss     dec     	 hex 	filename
  12355346        1823792 1089536 15268674         e8fb42 vmlinux.before
  12354970        1823792 1089536 15268298         e8f9ca vmlinux.after

This is not much (370B) but better than nothing.

We also save a function call in some hot paths like callers of
mem_cgroup_count_vm_event which is used for accounting.

The patch doesn't introduce any functional changes.

[vdavykov@parallels.com: inline memcg_kmem_is_active]
[vdavykov@parallels.com: do not expose type outside of CONFIG_MEMCG]
[akpm@linux-foundation.org: memcontrol.h needs eventfd.h for eventfd_ctx]
[akpm@linux-foundation.org: export mem_cgroup_from_task() to modules]
Signed-off-by: Michal Hocko <mhocko@suse.cz>
Reviewed-by: Vladimir Davydov <vdavydov@parallels.com>
Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h | 370 +++++++++++++++++++++++++++++++++++++++++----
 include/linux/swap.h       |  10 +-
 include/net/sock.h         |  28 ----
 mm/memcontrol.c            | 315 +-------------------------------------
 mm/memory-failure.c        |   2 +-
 mm/slab_common.c           |   2 +-
 mm/vmscan.c                |   2 +-
 7 files changed, 351 insertions(+), 378 deletions(-)

(limited to 'include/net')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 73b02b0a8f60..ab2f6880e27b 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -23,6 +23,11 @@
 #include <linux/vm_event_item.h>
 #include <linux/hardirq.h>
 #include <linux/jump_label.h>
+#include <linux/page_counter.h>
+#include <linux/vmpressure.h>
+#include <linux/eventfd.h>
+#include <linux/mmzone.h>
+#include <linux/writeback.h>
 
 struct mem_cgroup;
 struct page;
@@ -67,12 +72,221 @@ enum mem_cgroup_events_index {
 	MEMCG_NR_EVENTS,
 };
 
+/*
+ * Per memcg event counter is incremented at every pagein/pageout. With THP,
+ * it will be incremated by the number of pages. This counter is used for
+ * for trigger some periodic events. This is straightforward and better
+ * than using jiffies etc. to handle periodic memcg event.
+ */
+enum mem_cgroup_events_target {
+	MEM_CGROUP_TARGET_THRESH,
+	MEM_CGROUP_TARGET_SOFTLIMIT,
+	MEM_CGROUP_TARGET_NUMAINFO,
+	MEM_CGROUP_NTARGETS,
+};
+
+/*
+ * Bits in struct cg_proto.flags
+ */
+enum cg_proto_flags {
+	/* Currently active and new sockets should be assigned to cgroups */
+	MEMCG_SOCK_ACTIVE,
+	/* It was ever activated; we must disarm static keys on destruction */
+	MEMCG_SOCK_ACTIVATED,
+};
+
+struct cg_proto {
+	struct page_counter	memory_allocated;	/* Current allocated memory. */
+	struct percpu_counter	sockets_allocated;	/* Current number of sockets. */
+	int			memory_pressure;
+	long			sysctl_mem[3];
+	unsigned long		flags;
+	/*
+	 * memcg field is used to find which memcg we belong directly
+	 * Each memcg struct can hold more than one cg_proto, so container_of
+	 * won't really cut.
+	 *
+	 * The elegant solution would be having an inverse function to
+	 * proto_cgroup in struct proto, but that means polluting the structure
+	 * for everybody, instead of just for memcg users.
+	 */
+	struct mem_cgroup	*memcg;
+};
+
 #ifdef CONFIG_MEMCG
+struct mem_cgroup_stat_cpu {
+	long count[MEM_CGROUP_STAT_NSTATS];
+	unsigned long events[MEMCG_NR_EVENTS];
+	unsigned long nr_page_events;
+	unsigned long targets[MEM_CGROUP_NTARGETS];
+};
+
+struct mem_cgroup_reclaim_iter {
+	struct mem_cgroup *position;
+	/* scan generation, increased every round-trip */
+	unsigned int generation;
+};
+
+/*
+ * per-zone information in memory controller.
+ */
+struct mem_cgroup_per_zone {
+	struct lruvec		lruvec;
+	unsigned long		lru_size[NR_LRU_LISTS];
+
+	struct mem_cgroup_reclaim_iter	iter[DEF_PRIORITY + 1];
+
+	struct rb_node		tree_node;	/* RB tree node */
+	unsigned long		usage_in_excess;/* Set to the value by which */
+						/* the soft limit is exceeded*/
+	bool			on_tree;
+	struct mem_cgroup	*memcg;		/* Back pointer, we cannot */
+						/* use container_of	   */
+};
+
+struct mem_cgroup_per_node {
+	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
+};
+
+struct mem_cgroup_threshold {
+	struct eventfd_ctx *eventfd;
+	unsigned long threshold;
+};
+
+/* For threshold */
+struct mem_cgroup_threshold_ary {
+	/* An array index points to threshold just below or equal to usage. */
+	int current_threshold;
+	/* Size of entries[] */
+	unsigned int size;
+	/* Array of thresholds */
+	struct mem_cgroup_threshold entries[0];
+};
+
+struct mem_cgroup_thresholds {
+	/* Primary thresholds array */
+	struct mem_cgroup_threshold_ary *primary;
+	/*
+	 * Spare threshold array.
+	 * This is needed to make mem_cgroup_unregister_event() "never fail".
+	 * It must be able to store at least primary->size - 1 entries.
+	 */
+	struct mem_cgroup_threshold_ary *spare;
+};
+
+/*
+ * The memory controller data structure. The memory controller controls both
+ * page cache and RSS per cgroup. We would eventually like to provide
+ * statistics based on the statistics developed by Rik Van Riel for clock-pro,
+ * to help the administrator determine what knobs to tune.
+ */
+struct mem_cgroup {
+	struct cgroup_subsys_state css;
+
+	/* Accounted resources */
+	struct page_counter memory;
+	struct page_counter memsw;
+	struct page_counter kmem;
+
+	/* Normal memory consumption range */
+	unsigned long low;
+	unsigned long high;
+
+	unsigned long soft_limit;
+
+	/* vmpressure notifications */
+	struct vmpressure vmpressure;
+
+	/* css_online() has been completed */
+	int initialized;
+
+	/*
+	 * Should the accounting and control be hierarchical, per subtree?
+	 */
+	bool use_hierarchy;
+
+	/* protected by memcg_oom_lock */
+	bool		oom_lock;
+	int		under_oom;
+
+	int	swappiness;
+	/* OOM-Killer disable */
+	int		oom_kill_disable;
+
+	/* protect arrays of thresholds */
+	struct mutex thresholds_lock;
+
+	/* thresholds for memory usage. RCU-protected */
+	struct mem_cgroup_thresholds thresholds;
+
+	/* thresholds for mem+swap usage. RCU-protected */
+	struct mem_cgroup_thresholds memsw_thresholds;
+
+	/* For oom notifier event fd */
+	struct list_head oom_notify;
+
+	/*
+	 * Should we move charges of a task when a task is moved into this
+	 * mem_cgroup ? And what type of charges should we move ?
+	 */
+	unsigned long move_charge_at_immigrate;
+	/*
+	 * set > 0 if pages under this cgroup are moving to other cgroup.
+	 */
+	atomic_t		moving_account;
+	/* taken only while moving_account > 0 */
+	spinlock_t		move_lock;
+	struct task_struct	*move_lock_task;
+	unsigned long		move_lock_flags;
+	/*
+	 * percpu counter.
+	 */
+	struct mem_cgroup_stat_cpu __percpu *stat;
+	spinlock_t pcp_counter_lock;
+
+#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
+	struct cg_proto tcp_mem;
+#endif
+#if defined(CONFIG_MEMCG_KMEM)
+        /* Index in the kmem_cache->memcg_params.memcg_caches array */
+	int kmemcg_id;
+	bool kmem_acct_activated;
+	bool kmem_acct_active;
+#endif
+
+	int last_scanned_node;
+#if MAX_NUMNODES > 1
+	nodemask_t	scan_nodes;
+	atomic_t	numainfo_events;
+	atomic_t	numainfo_updating;
+#endif
+
+#ifdef CONFIG_CGROUP_WRITEBACK
+	struct list_head cgwb_list;
+	struct wb_domain cgwb_domain;
+#endif
+
+	/* List of events which userspace want to receive */
+	struct list_head event_list;
+	spinlock_t event_list_lock;
+
+	struct mem_cgroup_per_node *nodeinfo[0];
+	/* WARNING: nodeinfo must be the last member here */
+};
 extern struct cgroup_subsys_state *mem_cgroup_root_css;
 
-void mem_cgroup_events(struct mem_cgroup *memcg,
+/**
+ * mem_cgroup_events - count memory events against a cgroup
+ * @memcg: the memory cgroup
+ * @idx: the event index
+ * @nr: the number of events to account for
+ */
+static inline void mem_cgroup_events(struct mem_cgroup *memcg,
 		       enum mem_cgroup_events_index idx,
-		       unsigned int nr);
+		       unsigned int nr)
+{
+	this_cpu_add(memcg->stat->events[idx], nr);
+}
 
 bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg);
 
@@ -90,15 +304,31 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
 struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *);
 struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *);
 
-bool mem_cgroup_is_descendant(struct mem_cgroup *memcg,
-			      struct mem_cgroup *root);
 bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg);
 
 extern struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page);
 extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
 
 extern struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg);
-extern struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css);
+static inline
+struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){
+	return css ? container_of(css, struct mem_cgroup, css) : NULL;
+}
+
+struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
+				   struct mem_cgroup *,
+				   struct mem_cgroup_reclaim_cookie *);
+void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
+
+static inline bool mem_cgroup_is_descendant(struct mem_cgroup *memcg,
+			      struct mem_cgroup *root)
+{
+	if (root == memcg)
+		return true;
+	if (!root->use_hierarchy)
+		return false;
+	return cgroup_is_descendant(memcg->css.cgroup, root->css.cgroup);
+}
 
 static inline bool mm_match_cgroup(struct mm_struct *mm,
 				   struct mem_cgroup *memcg)
@@ -114,22 +344,65 @@ static inline bool mm_match_cgroup(struct mm_struct *mm,
 	return match;
 }
 
-extern struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg);
 extern struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page);
 
-struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
-				   struct mem_cgroup *,
-				   struct mem_cgroup_reclaim_cookie *);
-void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
+static inline bool mem_cgroup_disabled(void)
+{
+	if (memory_cgrp_subsys.disabled)
+		return true;
+	return false;
+}
 
 /*
  * For memory reclaim.
  */
-int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec);
-bool mem_cgroup_lruvec_online(struct lruvec *lruvec);
 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
-unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list);
-void mem_cgroup_update_lru_size(struct lruvec *, enum lru_list, int);
+
+void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
+		int nr_pages);
+
+static inline bool mem_cgroup_lruvec_online(struct lruvec *lruvec)
+{
+	struct mem_cgroup_per_zone *mz;
+	struct mem_cgroup *memcg;
+
+	if (mem_cgroup_disabled())
+		return true;
+
+	mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
+	memcg = mz->memcg;
+
+	return !!(memcg->css.flags & CSS_ONLINE);
+}
+
+static inline
+unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
+{
+	struct mem_cgroup_per_zone *mz;
+
+	mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
+	return mz->lru_size[lru];
+}
+
+static inline int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
+{
+	unsigned long inactive_ratio;
+	unsigned long inactive;
+	unsigned long active;
+	unsigned long gb;
+
+	inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON);
+	active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON);
+
+	gb = (inactive + active) >> (30 - PAGE_SHIFT);
+	if (gb)
+		inactive_ratio = int_sqrt(10 * gb);
+	else
+		inactive_ratio = 1;
+
+	return inactive * inactive_ratio < active;
+}
+
 extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
 					struct task_struct *p);
 
@@ -156,18 +429,26 @@ bool mem_cgroup_oom_synchronize(bool wait);
 extern int do_swap_account;
 #endif
 
-static inline bool mem_cgroup_disabled(void)
-{
-	if (memory_cgrp_subsys.disabled)
-		return true;
-	return false;
-}
-
 struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page);
-void mem_cgroup_update_page_stat(struct mem_cgroup *memcg,
-				 enum mem_cgroup_stat_index idx, int val);
 void mem_cgroup_end_page_stat(struct mem_cgroup *memcg);
 
+/**
+ * mem_cgroup_update_page_stat - update page state statistics
+ * @memcg: memcg to account against
+ * @idx: page state item to account
+ * @val: number of pages (positive or negative)
+ *
+ * See mem_cgroup_begin_page_stat() for locking requirements.
+ */
+static inline void mem_cgroup_update_page_stat(struct mem_cgroup *memcg,
+				 enum mem_cgroup_stat_index idx, int val)
+{
+	VM_BUG_ON(!rcu_read_lock_held());
+
+	if (memcg)
+		this_cpu_add(memcg->stat->count[idx], val);
+}
+
 static inline void mem_cgroup_inc_page_stat(struct mem_cgroup *memcg,
 					    enum mem_cgroup_stat_index idx)
 {
@@ -184,13 +465,31 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
 						gfp_t gfp_mask,
 						unsigned long *total_scanned);
 
-void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx);
 static inline void mem_cgroup_count_vm_event(struct mm_struct *mm,
 					     enum vm_event_item idx)
 {
+	struct mem_cgroup *memcg;
+
 	if (mem_cgroup_disabled())
 		return;
-	__mem_cgroup_count_vm_event(mm, idx);
+
+	rcu_read_lock();
+	memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
+	if (unlikely(!memcg))
+		goto out;
+
+	switch (idx) {
+	case PGFAULT:
+		this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);
+		break;
+	case PGMAJFAULT:
+		this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
+		break;
+	default:
+		BUG();
+	}
+out:
+	rcu_read_unlock();
 }
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 void mem_cgroup_split_huge_fixup(struct page *head);
@@ -275,12 +574,6 @@ static inline bool task_in_mem_cgroup(struct task_struct *task,
 	return true;
 }
 
-static inline struct cgroup_subsys_state
-		*mem_cgroup_css(struct mem_cgroup *memcg)
-{
-	return NULL;
-}
-
 static inline struct mem_cgroup *
 mem_cgroup_iter(struct mem_cgroup *root,
 		struct mem_cgroup *prev,
@@ -444,7 +737,10 @@ static inline bool memcg_kmem_enabled(void)
 	return static_key_false(&memcg_kmem_enabled_key);
 }
 
-bool memcg_kmem_is_active(struct mem_cgroup *memcg);
+static inline bool memcg_kmem_is_active(struct mem_cgroup *memcg)
+{
+	return memcg->kmem_acct_active;
+}
 
 /*
  * In general, we'll do everything in our power to not incur in any overhead
@@ -463,7 +759,15 @@ void __memcg_kmem_commit_charge(struct page *page,
 				       struct mem_cgroup *memcg, int order);
 void __memcg_kmem_uncharge_pages(struct page *page, int order);
 
-int memcg_cache_id(struct mem_cgroup *memcg);
+/*
+ * helper for acessing a memcg's index. It will be used as an index in the
+ * child cache array in kmem_cache, and also to derive its name. This function
+ * will return -1 when this is not a kmem-limited memcg.
+ */
+static inline int memcg_cache_id(struct mem_cgroup *memcg)
+{
+	return memcg ? memcg->kmemcg_id : -1;
+}
 
 struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep);
 void __memcg_kmem_put_cache(struct kmem_cache *cachep);
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 6282f1eb3d6a..2ce190709280 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -351,7 +351,15 @@ extern void check_move_unevictable_pages(struct page **, int nr_pages);
 extern int kswapd_run(int nid);
 extern void kswapd_stop(int nid);
 #ifdef CONFIG_MEMCG
-extern int mem_cgroup_swappiness(struct mem_cgroup *mem);
+static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg)
+{
+	/* root ? */
+	if (mem_cgroup_disabled() || !memcg->css.parent)
+		return vm_swappiness;
+
+	return memcg->swappiness;
+}
+
 #else
 static inline int mem_cgroup_swappiness(struct mem_cgroup *mem)
 {
diff --git a/include/net/sock.h b/include/net/sock.h
index 43c6abcf06ab..a98c71ea40c5 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1042,34 +1042,6 @@ struct proto {
 #endif
 };
 
-/*
- * Bits in struct cg_proto.flags
- */
-enum cg_proto_flags {
-	/* Currently active and new sockets should be assigned to cgroups */
-	MEMCG_SOCK_ACTIVE,
-	/* It was ever activated; we must disarm static keys on destruction */
-	MEMCG_SOCK_ACTIVATED,
-};
-
-struct cg_proto {
-	struct page_counter	memory_allocated;	/* Current allocated memory. */
-	struct percpu_counter	sockets_allocated;	/* Current number of sockets. */
-	int			memory_pressure;
-	long			sysctl_mem[3];
-	unsigned long		flags;
-	/*
-	 * memcg field is used to find which memcg we belong directly
-	 * Each memcg struct can hold more than one cg_proto, so container_of
-	 * won't really cut.
-	 *
-	 * The elegant solution would be having an inverse function to
-	 * proto_cgroup in struct proto, but that means polluting the structure
-	 * for everybody, instead of just for memcg users.
-	 */
-	struct mem_cgroup	*memcg;
-};
-
 int proto_register(struct proto *prot, int alloc_slab);
 void proto_unregister(struct proto *prot);
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 9871f13fc35b..6935f77589e7 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -111,56 +111,10 @@ static const char * const mem_cgroup_lru_names[] = {
 	"unevictable",
 };
 
-/*
- * Per memcg event counter is incremented at every pagein/pageout. With THP,
- * it will be incremated by the number of pages. This counter is used for
- * for trigger some periodic events. This is straightforward and better
- * than using jiffies etc. to handle periodic memcg event.
- */
-enum mem_cgroup_events_target {
-	MEM_CGROUP_TARGET_THRESH,
-	MEM_CGROUP_TARGET_SOFTLIMIT,
-	MEM_CGROUP_TARGET_NUMAINFO,
-	MEM_CGROUP_NTARGETS,
-};
 #define THRESHOLDS_EVENTS_TARGET 128
 #define SOFTLIMIT_EVENTS_TARGET 1024
 #define NUMAINFO_EVENTS_TARGET	1024
 
-struct mem_cgroup_stat_cpu {
-	long count[MEM_CGROUP_STAT_NSTATS];
-	unsigned long events[MEMCG_NR_EVENTS];
-	unsigned long nr_page_events;
-	unsigned long targets[MEM_CGROUP_NTARGETS];
-};
-
-struct reclaim_iter {
-	struct mem_cgroup *position;
-	/* scan generation, increased every round-trip */
-	unsigned int generation;
-};
-
-/*
- * per-zone information in memory controller.
- */
-struct mem_cgroup_per_zone {
-	struct lruvec		lruvec;
-	unsigned long		lru_size[NR_LRU_LISTS];
-
-	struct reclaim_iter	iter[DEF_PRIORITY + 1];
-
-	struct rb_node		tree_node;	/* RB tree node */
-	unsigned long		usage_in_excess;/* Set to the value by which */
-						/* the soft limit is exceeded*/
-	bool			on_tree;
-	struct mem_cgroup	*memcg;		/* Back pointer, we cannot */
-						/* use container_of	   */
-};
-
-struct mem_cgroup_per_node {
-	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
-};
-
 /*
  * Cgroups above their limits are maintained in a RB-Tree, independent of
  * their hierarchy representation
@@ -181,32 +135,6 @@ struct mem_cgroup_tree {
 
 static struct mem_cgroup_tree soft_limit_tree __read_mostly;
 
-struct mem_cgroup_threshold {
-	struct eventfd_ctx *eventfd;
-	unsigned long threshold;
-};
-
-/* For threshold */
-struct mem_cgroup_threshold_ary {
-	/* An array index points to threshold just below or equal to usage. */
-	int current_threshold;
-	/* Size of entries[] */
-	unsigned int size;
-	/* Array of thresholds */
-	struct mem_cgroup_threshold entries[0];
-};
-
-struct mem_cgroup_thresholds {
-	/* Primary thresholds array */
-	struct mem_cgroup_threshold_ary *primary;
-	/*
-	 * Spare threshold array.
-	 * This is needed to make mem_cgroup_unregister_event() "never fail".
-	 * It must be able to store at least primary->size - 1 entries.
-	 */
-	struct mem_cgroup_threshold_ary *spare;
-};
-
 /* for OOM */
 struct mem_cgroup_eventfd_list {
 	struct list_head list;
@@ -256,113 +184,6 @@ struct mem_cgroup_event {
 static void mem_cgroup_threshold(struct mem_cgroup *memcg);
 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
 
-/*
- * The memory controller data structure. The memory controller controls both
- * page cache and RSS per cgroup. We would eventually like to provide
- * statistics based on the statistics developed by Rik Van Riel for clock-pro,
- * to help the administrator determine what knobs to tune.
- */
-struct mem_cgroup {
-	struct cgroup_subsys_state css;
-
-	/* Accounted resources */
-	struct page_counter memory;
-	struct page_counter memsw;
-	struct page_counter kmem;
-
-	/* Normal memory consumption range */
-	unsigned long low;
-	unsigned long high;
-
-	unsigned long soft_limit;
-
-	/* vmpressure notifications */
-	struct vmpressure vmpressure;
-
-	/* css_online() has been completed */
-	int initialized;
-
-	/*
-	 * Should the accounting and control be hierarchical, per subtree?
-	 */
-	bool use_hierarchy;
-
-	/* protected by memcg_oom_lock */
-	bool		oom_lock;
-	int		under_oom;
-
-	int	swappiness;
-	/* OOM-Killer disable */
-	int		oom_kill_disable;
-
-	/* protect arrays of thresholds */
-	struct mutex thresholds_lock;
-
-	/* thresholds for memory usage. RCU-protected */
-	struct mem_cgroup_thresholds thresholds;
-
-	/* thresholds for mem+swap usage. RCU-protected */
-	struct mem_cgroup_thresholds memsw_thresholds;
-
-	/* For oom notifier event fd */
-	struct list_head oom_notify;
-
-	/*
-	 * Should we move charges of a task when a task is moved into this
-	 * mem_cgroup ? And what type of charges should we move ?
-	 */
-	unsigned long move_charge_at_immigrate;
-	/*
-	 * set > 0 if pages under this cgroup are moving to other cgroup.
-	 */
-	atomic_t		moving_account;
-	/* taken only while moving_account > 0 */
-	spinlock_t		move_lock;
-	struct task_struct	*move_lock_task;
-	unsigned long		move_lock_flags;
-	/*
-	 * percpu counter.
-	 */
-	struct mem_cgroup_stat_cpu __percpu *stat;
-	spinlock_t pcp_counter_lock;
-
-#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
-	struct cg_proto tcp_mem;
-#endif
-#if defined(CONFIG_MEMCG_KMEM)
-        /* Index in the kmem_cache->memcg_params.memcg_caches array */
-	int kmemcg_id;
-	bool kmem_acct_activated;
-	bool kmem_acct_active;
-#endif
-
-	int last_scanned_node;
-#if MAX_NUMNODES > 1
-	nodemask_t	scan_nodes;
-	atomic_t	numainfo_events;
-	atomic_t	numainfo_updating;
-#endif
-
-#ifdef CONFIG_CGROUP_WRITEBACK
-	struct list_head cgwb_list;
-	struct wb_domain cgwb_domain;
-#endif
-
-	/* List of events which userspace want to receive */
-	struct list_head event_list;
-	spinlock_t event_list_lock;
-
-	struct mem_cgroup_per_node *nodeinfo[0];
-	/* WARNING: nodeinfo must be the last member here */
-};
-
-#ifdef CONFIG_MEMCG_KMEM
-bool memcg_kmem_is_active(struct mem_cgroup *memcg)
-{
-	return memcg->kmem_acct_active;
-}
-#endif
-
 /* Stuffs for move charges at task migration. */
 /*
  * Types of charges to be moved.
@@ -423,11 +244,6 @@ enum res_type {
  */
 static DEFINE_MUTEX(memcg_create_mutex);
 
-struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
-{
-	return s ? container_of(s, struct mem_cgroup, css) : NULL;
-}
-
 /* Some nice accessors for the vmpressure. */
 struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
 {
@@ -593,11 +409,6 @@ mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone)
 	return &memcg->nodeinfo[nid]->zoneinfo[zid];
 }
 
-struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
-{
-	return &memcg->css;
-}
-
 /**
  * mem_cgroup_css_from_page - css of the memcg associated with a page
  * @page: page of interest
@@ -876,14 +687,6 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
 	__this_cpu_add(memcg->stat->nr_page_events, nr_pages);
 }
 
-unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
-{
-	struct mem_cgroup_per_zone *mz;
-
-	mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
-	return mz->lru_size[lru];
-}
-
 static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
 						  int nid,
 						  unsigned int lru_mask)
@@ -986,6 +789,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
 
 	return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
 }
+EXPORT_SYMBOL(mem_cgroup_from_task);
 
 static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
 {
@@ -1031,7 +835,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
 				   struct mem_cgroup *prev,
 				   struct mem_cgroup_reclaim_cookie *reclaim)
 {
-	struct reclaim_iter *uninitialized_var(iter);
+	struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
 	struct cgroup_subsys_state *css = NULL;
 	struct mem_cgroup *memcg = NULL;
 	struct mem_cgroup *pos = NULL;
@@ -1173,30 +977,6 @@ void mem_cgroup_iter_break(struct mem_cgroup *root,
 	     iter != NULL;				\
 	     iter = mem_cgroup_iter(NULL, iter, NULL))
 
-void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
-{
-	struct mem_cgroup *memcg;
-
-	rcu_read_lock();
-	memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
-	if (unlikely(!memcg))
-		goto out;
-
-	switch (idx) {
-	case PGFAULT:
-		this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);
-		break;
-	case PGMAJFAULT:
-		this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
-		break;
-	default:
-		BUG();
-	}
-out:
-	rcu_read_unlock();
-}
-EXPORT_SYMBOL(__mem_cgroup_count_vm_event);
-
 /**
  * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
  * @zone: zone of the wanted lruvec
@@ -1295,15 +1075,6 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
 	VM_BUG_ON((long)(*lru_size) < 0);
 }
 
-bool mem_cgroup_is_descendant(struct mem_cgroup *memcg, struct mem_cgroup *root)
-{
-	if (root == memcg)
-		return true;
-	if (!root->use_hierarchy)
-		return false;
-	return cgroup_is_descendant(memcg->css.cgroup, root->css.cgroup);
-}
-
 bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
 {
 	struct mem_cgroup *task_memcg;
@@ -1330,39 +1101,6 @@ bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
 	return ret;
 }
 
-int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
-{
-	unsigned long inactive_ratio;
-	unsigned long inactive;
-	unsigned long active;
-	unsigned long gb;
-
-	inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON);
-	active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON);
-
-	gb = (inactive + active) >> (30 - PAGE_SHIFT);
-	if (gb)
-		inactive_ratio = int_sqrt(10 * gb);
-	else
-		inactive_ratio = 1;
-
-	return inactive * inactive_ratio < active;
-}
-
-bool mem_cgroup_lruvec_online(struct lruvec *lruvec)
-{
-	struct mem_cgroup_per_zone *mz;
-	struct mem_cgroup *memcg;
-
-	if (mem_cgroup_disabled())
-		return true;
-
-	mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
-	memcg = mz->memcg;
-
-	return !!(memcg->css.flags & CSS_ONLINE);
-}
-
 #define mem_cgroup_from_counter(counter, member)	\
 	container_of(counter, struct mem_cgroup, member)
 
@@ -1394,15 +1132,6 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
 	return margin;
 }
 
-int mem_cgroup_swappiness(struct mem_cgroup *memcg)
-{
-	/* root ? */
-	if (mem_cgroup_disabled() || !memcg->css.parent)
-		return vm_swappiness;
-
-	return memcg->swappiness;
-}
-
 /*
  * A routine for checking "mem" is under move_account() or not.
  *
@@ -2067,23 +1796,6 @@ void mem_cgroup_end_page_stat(struct mem_cgroup *memcg)
 }
 EXPORT_SYMBOL(mem_cgroup_end_page_stat);
 
-/**
- * mem_cgroup_update_page_stat - update page state statistics
- * @memcg: memcg to account against
- * @idx: page state item to account
- * @val: number of pages (positive or negative)
- *
- * See mem_cgroup_begin_page_stat() for locking requirements.
- */
-void mem_cgroup_update_page_stat(struct mem_cgroup *memcg,
-				 enum mem_cgroup_stat_index idx, int val)
-{
-	VM_BUG_ON(!rcu_read_lock_held());
-
-	if (memcg)
-		this_cpu_add(memcg->stat->count[idx], val);
-}
-
 /*
  * size of first charge trial. "32" comes from vmscan.c's magic value.
  * TODO: maybe necessary to use big numbers in big irons.
@@ -2509,16 +2221,6 @@ void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages)
 	css_put_many(&memcg->css, nr_pages);
 }
 
-/*
- * helper for acessing a memcg's index. It will be used as an index in the
- * child cache array in kmem_cache, and also to derive its name. This function
- * will return -1 when this is not a kmem-limited memcg.
- */
-int memcg_cache_id(struct mem_cgroup *memcg)
-{
-	return memcg ? memcg->kmemcg_id : -1;
-}
-
 static int memcg_alloc_cache_id(void)
 {
 	int id, size;
@@ -5525,19 +5227,6 @@ struct cgroup_subsys memory_cgrp_subsys = {
 	.early_init = 0,
 };
 
-/**
- * mem_cgroup_events - count memory events against a cgroup
- * @memcg: the memory cgroup
- * @idx: the event index
- * @nr: the number of events to account for
- */
-void mem_cgroup_events(struct mem_cgroup *memcg,
-		       enum mem_cgroup_events_index idx,
-		       unsigned int nr)
-{
-	this_cpu_add(memcg->stat->events[idx], nr);
-}
-
 /**
  * mem_cgroup_low - check if memory consumption is below the normal range
  * @root: the highest ancestor to consider
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 1f4446a90cef..016c814101ed 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -146,7 +146,7 @@ static int hwpoison_filter_task(struct page *p)
 	if (!mem)
 		return -EINVAL;
 
-	css = mem_cgroup_css(mem);
+	css = &mem->css;
 	ino = cgroup_ino(css->cgroup);
 	css_put(css);
 
diff --git a/mm/slab_common.c b/mm/slab_common.c
index bde04a699ab6..5ce4faeb16fb 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -500,7 +500,7 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg,
 			     struct kmem_cache *root_cache)
 {
 	static char memcg_name_buf[NAME_MAX + 1]; /* protected by slab_mutex */
-	struct cgroup_subsys_state *css = mem_cgroup_css(memcg);
+	struct cgroup_subsys_state *css = &memcg->css;
 	struct memcg_cache_array *arr;
 	struct kmem_cache *s = NULL;
 	char *cache_name;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b1139039122a..bf23c88621ce 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -175,7 +175,7 @@ static bool sane_reclaim(struct scan_control *sc)
 	if (!memcg)
 		return true;
 #ifdef CONFIG_CGROUP_WRITEBACK
-	if (cgroup_on_dfl(mem_cgroup_css(memcg)->cgroup))
+	if (memcg->css.cgroup)
 		return true;
 #endif
 	return false;
-- 
cgit v1.2.3


From e752eb68811aeece2220e183e23369a34122fb5e Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Tue, 8 Sep 2015 15:01:16 -0700
Subject: memcg: move memcg_proto_active from sock.h

The only user is sock_update_memcg which is living in memcontrol.c so it
doesn't make much sense to pollute sock.h by this inline helper.  Move it
to memcontrol.c and open code it into its only caller.

Signed-off-by: Michal Hocko <mhocko@suse.com>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/net/sock.h | 5 -----
 mm/memcontrol.c    | 2 +-
 2 files changed, 1 insertion(+), 6 deletions(-)

(limited to 'include/net')

diff --git a/include/net/sock.h b/include/net/sock.h
index a98c71ea40c5..7aa78440559a 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1045,11 +1045,6 @@ struct proto {
 int proto_register(struct proto *prot, int alloc_slab);
 void proto_unregister(struct proto *prot);
 
-static inline bool memcg_proto_active(struct cg_proto *cg_proto)
-{
-	return test_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags);
-}
-
 #ifdef SOCK_REFCNT_DEBUG
 static inline void sk_refcnt_debug_inc(struct sock *sk)
 {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 3033e6c42229..1742a2db89c7 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -315,7 +315,7 @@ void sock_update_memcg(struct sock *sk)
 		rcu_read_lock();
 		memcg = mem_cgroup_from_task(current);
 		cg_proto = sk->sk_prot->proto_cgroup(memcg);
-		if (cg_proto && memcg_proto_active(cg_proto) &&
+		if (cg_proto && test_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags) &&
 		    css_tryget_online(&memcg->css)) {
 			sk->sk_cgrp = cg_proto;
 		}
-- 
cgit v1.2.3


From f53de1e9a4aaf8cbe08845da6f7ff26a078ac507 Mon Sep 17 00:00:00 2001
From: Phil Sutter <phil@nwl.cc>
Date: Wed, 9 Sep 2015 14:20:56 +0200
Subject: net: ipv6: use common fib_default_rule_pref

This switches IPv6 policy routing to use the shared
fib_default_rule_pref() function of IPv4 and DECnet. It is also used in
multicast routing for IPv4 as well as IPv6.

The motivation for this patch is a complaint about iproute2 behaving
inconsistent between IPv4 and IPv6 when adding policy rules: Formerly,
IPv6 rules were assigned a fixed priority of 0x3FFF whereas for IPv4 the
assigned priority value was decreased with each rule added.

Since then all users of the default_pref field have been converted to
assign the generic function fib_default_rule_pref(), fib_nl_newrule()
may just use it directly instead. Therefore get rid of the function
pointer altogether and make fib_default_rule_pref() static, as it's not
used outside fib_rules.c anymore.

Signed-off-by: Phil Sutter <phil@nwl.cc>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/fib_rules.h |  2 --
 net/core/fib_rules.c    | 10 +++-------
 net/decnet/dn_rules.c   |  1 -
 net/ipv4/fib_rules.c    |  1 -
 net/ipv4/ipmr.c         |  1 -
 net/ipv6/fib6_rules.c   |  6 ------
 net/ipv6/ip6mr.c        |  1 -
 7 files changed, 3 insertions(+), 19 deletions(-)

(limited to 'include/net')

diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h
index 4e8f804f4589..59160de702b6 100644
--- a/include/net/fib_rules.h
+++ b/include/net/fib_rules.h
@@ -66,7 +66,6 @@ struct fib_rules_ops {
 					   struct nlattr **);
 	int			(*fill)(struct fib_rule *, struct sk_buff *,
 					struct fib_rule_hdr *);
-	u32			(*default_pref)(struct fib_rules_ops *ops);
 	size_t			(*nlmsg_payload)(struct fib_rule *);
 
 	/* Called after modifications to the rules set, must flush
@@ -118,5 +117,4 @@ int fib_rules_lookup(struct fib_rules_ops *, struct flowi *, int flags,
 		     struct fib_lookup_arg *);
 int fib_default_rule_add(struct fib_rules_ops *, u32 pref, u32 table,
 			 u32 flags);
-u32 fib_default_rule_pref(struct fib_rules_ops *ops);
 #endif
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index ae8306e7c56f..bf77e3639ce0 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -44,7 +44,7 @@ int fib_default_rule_add(struct fib_rules_ops *ops,
 }
 EXPORT_SYMBOL(fib_default_rule_add);
 
-u32 fib_default_rule_pref(struct fib_rules_ops *ops)
+static u32 fib_default_rule_pref(struct fib_rules_ops *ops)
 {
 	struct list_head *pos;
 	struct fib_rule *rule;
@@ -60,7 +60,6 @@ u32 fib_default_rule_pref(struct fib_rules_ops *ops)
 
 	return 0;
 }
-EXPORT_SYMBOL(fib_default_rule_pref);
 
 static void notify_rule_change(int event, struct fib_rule *rule,
 			       struct fib_rules_ops *ops, struct nlmsghdr *nlh,
@@ -299,8 +298,8 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh)
 	}
 	rule->fr_net = net;
 
-	if (tb[FRA_PRIORITY])
-		rule->pref = nla_get_u32(tb[FRA_PRIORITY]);
+	rule->pref = tb[FRA_PRIORITY] ? nla_get_u32(tb[FRA_PRIORITY])
+	                              : fib_default_rule_pref(ops);
 
 	if (tb[FRA_IIFNAME]) {
 		struct net_device *dev;
@@ -350,9 +349,6 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh)
 	else
 		rule->suppress_ifgroup = -1;
 
-	if (!tb[FRA_PRIORITY] && ops->default_pref)
-		rule->pref = ops->default_pref(ops);
-
 	err = -EINVAL;
 	if (tb[FRA_GOTO]) {
 		if (rule->action != FR_ACT_GOTO)
diff --git a/net/decnet/dn_rules.c b/net/decnet/dn_rules.c
index 9d66a0f72f90..295bbd6a56f2 100644
--- a/net/decnet/dn_rules.c
+++ b/net/decnet/dn_rules.c
@@ -229,7 +229,6 @@ static const struct fib_rules_ops __net_initconst dn_fib_rules_ops_template = {
 	.configure	= dn_fib_rule_configure,
 	.compare	= dn_fib_rule_compare,
 	.fill		= dn_fib_rule_fill,
-	.default_pref	= fib_default_rule_pref,
 	.flush_cache	= dn_fib_rule_flush_cache,
 	.nlgroup	= RTNLGRP_DECnet_RULE,
 	.policy		= dn_fib_rule_policy,
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 18123d50f576..f2bda9e89c61 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -318,7 +318,6 @@ static const struct fib_rules_ops __net_initconst fib4_rules_ops_template = {
 	.delete		= fib4_rule_delete,
 	.compare	= fib4_rule_compare,
 	.fill		= fib4_rule_fill,
-	.default_pref	= fib_default_rule_pref,
 	.nlmsg_payload	= fib4_rule_nlmsg_payload,
 	.flush_cache	= fib4_rule_flush_cache,
 	.nlgroup	= RTNLGRP_IPV4_RULE,
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 3a2c0162c3ba..866ee89f5254 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -233,7 +233,6 @@ static const struct fib_rules_ops __net_initconst ipmr_rules_ops_template = {
 	.match		= ipmr_rule_match,
 	.configure	= ipmr_rule_configure,
 	.compare	= ipmr_rule_compare,
-	.default_pref	= fib_default_rule_pref,
 	.fill		= ipmr_rule_fill,
 	.nlgroup	= RTNLGRP_IPV4_RULE,
 	.policy		= ipmr_rule_policy,
diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index 2367a16eae58..9f777ec59a59 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -258,11 +258,6 @@ nla_put_failure:
 	return -ENOBUFS;
 }
 
-static u32 fib6_rule_default_pref(struct fib_rules_ops *ops)
-{
-	return 0x3FFF;
-}
-
 static size_t fib6_rule_nlmsg_payload(struct fib_rule *rule)
 {
 	return nla_total_size(16) /* dst */
@@ -279,7 +274,6 @@ static const struct fib_rules_ops __net_initconst fib6_rules_ops_template = {
 	.configure		= fib6_rule_configure,
 	.compare		= fib6_rule_compare,
 	.fill			= fib6_rule_fill,
-	.default_pref		= fib6_rule_default_pref,
 	.nlmsg_payload		= fib6_rule_nlmsg_payload,
 	.nlgroup		= RTNLGRP_IPV6_RULE,
 	.policy			= fib6_rule_policy,
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 5f36266b1f5e..0e004cc42a22 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -217,7 +217,6 @@ static const struct fib_rules_ops __net_initconst ip6mr_rules_ops_template = {
 	.match		= ip6mr_rule_match,
 	.configure	= ip6mr_rule_configure,
 	.compare	= ip6mr_rule_compare,
-	.default_pref	= fib_default_rule_pref,
 	.fill		= ip6mr_rule_fill,
 	.nlgroup	= RTNLGRP_IPV6_RULE,
 	.policy		= ip6mr_rule_policy,
-- 
cgit v1.2.3