From e6247027e5173c00efb2084d688d06ff835bc3b0 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 5 Dec 2013 04:45:08 -0800
Subject: net: introduce dev_consume_skb_any()

Some network drivers use dev_kfree_skb_any() and dev_kfree_skb_irq()
helpers to free skbs, both for dropped packets and TX completed ones.

We need to separate the two causes to get better diagnostics
given by dropwatch or "perf record -e skb:kfree_skb"

This patch provides two new helpers, dev_consume_skb_any() and
dev_consume_skb_irq() to be used for consumed skbs.

__dev_kfree_skb_irq() is slightly optimized to remove one
atomic_dec_and_test() in fast path, and use this_cpu_{r|w} accessors.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 53 +++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 44 insertions(+), 9 deletions(-)

(limited to 'include/linux/netdevice.h')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 7f0ed423a360..9d55e5188b96 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2368,17 +2368,52 @@ static inline int netif_copy_real_num_queues(struct net_device *to_dev,
 #define DEFAULT_MAX_NUM_RSS_QUEUES	(8)
 int netif_get_num_default_rss_queues(void);
 
-/* Use this variant when it is known for sure that it
- * is executing from hardware interrupt context or with hardware interrupts
- * disabled.
- */
-void dev_kfree_skb_irq(struct sk_buff *skb);
+enum skb_free_reason {
+	SKB_REASON_CONSUMED,
+	SKB_REASON_DROPPED,
+};
+
+void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason);
+void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason);
 
-/* Use this variant in places where it could be invoked
- * from either hardware interrupt or other context, with hardware interrupts
- * either disabled or enabled.
+/*
+ * It is not allowed to call kfree_skb() or consume_skb() from hardware
+ * interrupt context or with hardware interrupts being disabled.
+ * (in_irq() || irqs_disabled())
+ *
+ * We provide four helpers that can be used in following contexts :
+ *
+ * dev_kfree_skb_irq(skb) when caller drops a packet from irq context,
+ *  replacing kfree_skb(skb)
+ *
+ * dev_consume_skb_irq(skb) when caller consumes a packet from irq context.
+ *  Typically used in place of consume_skb(skb) in TX completion path
+ *
+ * dev_kfree_skb_any(skb) when caller doesn't know its current irq context,
+ *  replacing kfree_skb(skb)
+ *
+ * dev_consume_skb_any(skb) when caller doesn't know its current irq context,
+ *  and consumed a packet. Used in place of consume_skb(skb)
  */
-void dev_kfree_skb_any(struct sk_buff *skb);
+static inline void dev_kfree_skb_irq(struct sk_buff *skb)
+{
+	__dev_kfree_skb_irq(skb, SKB_REASON_DROPPED);
+}
+
+static inline void dev_consume_skb_irq(struct sk_buff *skb)
+{
+	__dev_kfree_skb_irq(skb, SKB_REASON_CONSUMED);
+}
+
+static inline void dev_kfree_skb_any(struct sk_buff *skb)
+{
+	__dev_kfree_skb_any(skb, SKB_REASON_DROPPED);
+}
+
+static inline void dev_consume_skb_any(struct sk_buff *skb)
+{
+	__dev_kfree_skb_any(skb, SKB_REASON_CONSUMED);
+}
 
 int netif_rx(struct sk_buff *skb);
 int netif_rx_ni(struct sk_buff *skb);
-- 
cgit v1.2.3


From 37cb0620073cb64101d9307931c135c70b2e3f04 Mon Sep 17 00:00:00 2001
From: Ying Xue <ying.xue@windriver.com>
Date: Tue, 10 Dec 2013 20:45:41 -0800
Subject: tipc: remove TIPC usage of field af_packet_priv in struct net_device

TIPC is currently using the field 'af_packet_priv' in struct net_device
as a handle to find the bearer instance associated to the given network
device. But, by doing so it is blocking other networking cleanups, such
as the one discussed here:

http://patchwork.ozlabs.org/patch/178044/

This commit removes this usage from TIPC. Instead, we introduce a new
field, 'tipc_ptr', to the net_device structure, to serve this purpose.
When TIPC bearer is enabled, the bearer object is associated to
'tipc_ptr'. When a TIPC packet arrives in the recv_msg() upcall
from a networking device, the bearer object can now be obtained from
'tipc_ptr'. When a bearer is disabled, the bearer object is detached
from its underlying network device by setting 'tipc_ptr' to NULL.

Additionally, an RCU lock is used to protect the new pointer.
Henceforth, the existing tipc_net_lock is used in write mode to
serialize write accesses to this pointer, while the new RCU lock is
applied on the read side to ensure that the pointer is 100% valid
within its wrapped area for all readers.

Signed-off-by: Ying Xue <ying.xue@windriver.com>
Cc: Patrick McHardy <kaber@trash.net>
Reviewed-by: Paul Gortmaker <paul.gortmaker@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  3 +++
 net/tipc/bearer.h         |  2 ++
 net/tipc/eth_media.c      | 55 +++++++++++++++++++++++++++--------------------
 net/tipc/ib_media.c       | 54 ++++++++++++++++++++++++++--------------------
 4 files changed, 68 insertions(+), 46 deletions(-)

(limited to 'include/linux/netdevice.h')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 9d55e5188b96..0ca8100f9fbc 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1282,6 +1282,9 @@ struct net_device {
 #endif
 #if IS_ENABLED(CONFIG_NET_DSA)
 	struct dsa_switch_tree	*dsa_ptr;	/* dsa specific data */
+#endif
+#if IS_ENABLED(CONFIG_TIPC)
+	struct tipc_bearer __rcu *tipc_ptr;	/* TIPC specific data */
 #endif
 	void 			*atalk_ptr;	/* AppleTalk link 	*/
 	struct in_device __rcu	*ip_ptr;	/* IPv4 specific data	*/
diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h
index e50266aa4d10..91b8d8b92373 100644
--- a/net/tipc/bearer.h
+++ b/net/tipc/bearer.h
@@ -105,6 +105,7 @@ struct tipc_media {
 
 /**
  * struct tipc_bearer - Generic TIPC bearer structure
+ * @dev: ptr to associated network device
  * @usr_handle: pointer to additional media-specific information about bearer
  * @mtu: max packet size bearer can support
  * @lock: spinlock for controlling access to bearer
@@ -127,6 +128,7 @@ struct tipc_media {
  * care of initializing all other fields.
  */
 struct tipc_bearer {
+	struct net_device *dev;
 	void *usr_handle;			/* initalized by media */
 	u32 mtu;				/* initalized by media */
 	struct tipc_media_addr addr;		/* initalized by media */
diff --git a/net/tipc/eth_media.c b/net/tipc/eth_media.c
index 37fb145476ec..c5f685dee15b 100644
--- a/net/tipc/eth_media.c
+++ b/net/tipc/eth_media.c
@@ -63,6 +63,9 @@ static int eth_started;
 
 static int recv_notification(struct notifier_block *nb, unsigned long evt,
 			     void *dv);
+static int recv_msg(struct sk_buff *buf, struct net_device *dev,
+		    struct packet_type *pt, struct net_device *orig_dev);
+
 /*
  * Network device notifier info
  */
@@ -71,6 +74,11 @@ static struct notifier_block notifier = {
 	.priority	= 0
 };
 
+static struct packet_type tipc_packet_type __read_mostly = {
+	.type = __constant_htons(ETH_P_TIPC),
+	.func = recv_msg,
+};
+
 /**
  * eth_media_addr_set - initialize Ethernet media address structure
  *
@@ -128,20 +136,25 @@ static int send_msg(struct sk_buff *buf, struct tipc_bearer *tb_ptr,
 static int recv_msg(struct sk_buff *buf, struct net_device *dev,
 		    struct packet_type *pt, struct net_device *orig_dev)
 {
-	struct eth_media *eb_ptr = (struct eth_media *)pt->af_packet_priv;
+	struct tipc_bearer *b_ptr;
 
 	if (!net_eq(dev_net(dev), &init_net)) {
 		kfree_skb(buf);
 		return NET_RX_DROP;
 	}
 
-	if (likely(eb_ptr->bearer)) {
+	rcu_read_lock();
+	b_ptr = rcu_dereference(dev->tipc_ptr);
+	if (likely(b_ptr)) {
 		if (likely(buf->pkt_type <= PACKET_BROADCAST)) {
 			buf->next = NULL;
-			tipc_recv_msg(buf, eb_ptr->bearer);
+			tipc_recv_msg(buf, b_ptr);
+			rcu_read_unlock();
 			return NET_RX_SUCCESS;
 		}
 	}
+	rcu_read_unlock();
+
 	kfree_skb(buf);
 	return NET_RX_DROP;
 }
@@ -151,10 +164,7 @@ static int recv_msg(struct sk_buff *buf, struct net_device *dev,
  */
 static void setup_media(struct work_struct *work)
 {
-	struct eth_media *eb_ptr =
-		container_of(work, struct eth_media, setup);
-
-	dev_add_pack(&eb_ptr->tipc_packet_type);
+	dev_add_pack(&tipc_packet_type);
 }
 
 /**
@@ -183,15 +193,11 @@ static int enable_media(struct tipc_bearer *tb_ptr)
 
 	/* Create Ethernet bearer for device */
 	eb_ptr->dev = dev;
-	eb_ptr->tipc_packet_type.type = htons(ETH_P_TIPC);
-	eb_ptr->tipc_packet_type.dev = dev;
-	eb_ptr->tipc_packet_type.func = recv_msg;
-	eb_ptr->tipc_packet_type.af_packet_priv = eb_ptr;
-	INIT_LIST_HEAD(&(eb_ptr->tipc_packet_type.list));
 	INIT_WORK(&eb_ptr->setup, setup_media);
 	schedule_work(&eb_ptr->setup);
 
 	/* Associate TIPC bearer with Ethernet bearer */
+	tb_ptr->dev = dev;
 	eb_ptr->bearer = tb_ptr;
 	tb_ptr->usr_handle = (void *)eb_ptr;
 	memset(tb_ptr->bcast_addr.value, 0, sizeof(tb_ptr->bcast_addr.value));
@@ -200,6 +206,7 @@ static int enable_media(struct tipc_bearer *tb_ptr)
 	tb_ptr->bcast_addr.broadcast = 1;
 	tb_ptr->mtu = dev->mtu;
 	eth_media_addr_set(tb_ptr, &tb_ptr->addr, (char *)dev->dev_addr);
+	rcu_assign_pointer(dev->tipc_ptr, tb_ptr);
 	return 0;
 }
 
@@ -213,7 +220,7 @@ static void cleanup_media(struct work_struct *work)
 	struct eth_media *eb_ptr =
 		container_of(work, struct eth_media, cleanup);
 
-	dev_remove_pack(&eb_ptr->tipc_packet_type);
+	dev_remove_pack(&tipc_packet_type);
 	dev_put(eb_ptr->dev);
 	eb_ptr->dev = NULL;
 }
@@ -232,6 +239,7 @@ static void disable_media(struct tipc_bearer *tb_ptr)
 	eb_ptr->bearer = NULL;
 	INIT_WORK(&eb_ptr->cleanup, cleanup_media);
 	schedule_work(&eb_ptr->cleanup);
+	RCU_INIT_POINTER(tb_ptr->dev->tipc_ptr, NULL);
 }
 
 /**
@@ -243,21 +251,20 @@ static void disable_media(struct tipc_bearer *tb_ptr)
 static int recv_notification(struct notifier_block *nb, unsigned long evt,
 			     void *ptr)
 {
+	struct tipc_bearer *b_ptr;
 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
-	struct eth_media *eb_ptr = &eth_media_array[0];
-	struct eth_media *stop = &eth_media_array[MAX_ETH_MEDIA];
 
 	if (!net_eq(dev_net(dev), &init_net))
 		return NOTIFY_DONE;
 
-	while ((eb_ptr->dev != dev)) {
-		if (++eb_ptr == stop)
-			return NOTIFY_DONE;	/* couldn't find device */
-	}
-	if (!eb_ptr->bearer)
+	rcu_read_lock();
+	b_ptr = rcu_dereference(dev->tipc_ptr);
+	if (!b_ptr) {
+		rcu_read_unlock();
 		return NOTIFY_DONE;		/* bearer had been disabled */
+	}
 
-	eb_ptr->bearer->mtu = dev->mtu;
+	b_ptr->mtu = dev->mtu;
 
 	switch (evt) {
 	case NETDEV_CHANGE:
@@ -266,13 +273,15 @@ static int recv_notification(struct notifier_block *nb, unsigned long evt,
 	case NETDEV_DOWN:
 	case NETDEV_CHANGEMTU:
 	case NETDEV_CHANGEADDR:
-		tipc_reset_bearer(eb_ptr->bearer);
+		tipc_reset_bearer(b_ptr);
 		break;
 	case NETDEV_UNREGISTER:
 	case NETDEV_CHANGENAME:
-		tipc_disable_bearer(eb_ptr->bearer->name);
+		tipc_disable_bearer(b_ptr->name);
 		break;
 	}
+	rcu_read_unlock();
+
 	return NOTIFY_OK;
 }
 
diff --git a/net/tipc/ib_media.c b/net/tipc/ib_media.c
index 48e1c07842e6..9fdf03cd672b 100644
--- a/net/tipc/ib_media.c
+++ b/net/tipc/ib_media.c
@@ -62,6 +62,13 @@ struct ib_media {
 
 static struct ib_media ib_media_array[MAX_IB_MEDIA];
 static int ib_started;
+static int recv_msg(struct sk_buff *buf, struct net_device *dev,
+		    struct packet_type *pt, struct net_device *orig_dev);
+
+static struct packet_type tipc_packet_type __read_mostly = {
+	.type = __constant_htons(ETH_P_TIPC),
+	.func = recv_msg,
+};
 
 /**
  * ib_media_addr_set - initialize Infiniband media address structure
@@ -120,20 +127,25 @@ static int send_msg(struct sk_buff *buf, struct tipc_bearer *tb_ptr,
 static int recv_msg(struct sk_buff *buf, struct net_device *dev,
 		    struct packet_type *pt, struct net_device *orig_dev)
 {
-	struct ib_media *ib_ptr = (struct ib_media *)pt->af_packet_priv;
+	struct tipc_bearer *b_ptr;
 
 	if (!net_eq(dev_net(dev), &init_net)) {
 		kfree_skb(buf);
 		return NET_RX_DROP;
 	}
 
-	if (likely(ib_ptr->bearer)) {
+	rcu_read_lock();
+	b_ptr = rcu_dereference(dev->tipc_ptr);
+	if (likely(b_ptr)) {
 		if (likely(buf->pkt_type <= PACKET_BROADCAST)) {
 			buf->next = NULL;
-			tipc_recv_msg(buf, ib_ptr->bearer);
+			tipc_recv_msg(buf, b_ptr);
+			rcu_read_unlock();
 			return NET_RX_SUCCESS;
 		}
 	}
+	rcu_read_unlock();
+
 	kfree_skb(buf);
 	return NET_RX_DROP;
 }
@@ -143,10 +155,7 @@ static int recv_msg(struct sk_buff *buf, struct net_device *dev,
  */
 static void setup_media(struct work_struct *work)
 {
-	struct ib_media *ib_ptr =
-		container_of(work, struct ib_media, setup);
-
-	dev_add_pack(&ib_ptr->tipc_packet_type);
+	dev_add_pack(&tipc_packet_type);
 }
 
 /**
@@ -175,15 +184,11 @@ static int enable_media(struct tipc_bearer *tb_ptr)
 
 	/* Create InfiniBand bearer for device */
 	ib_ptr->dev = dev;
-	ib_ptr->tipc_packet_type.type = htons(ETH_P_TIPC);
-	ib_ptr->tipc_packet_type.dev = dev;
-	ib_ptr->tipc_packet_type.func = recv_msg;
-	ib_ptr->tipc_packet_type.af_packet_priv = ib_ptr;
-	INIT_LIST_HEAD(&(ib_ptr->tipc_packet_type.list));
 	INIT_WORK(&ib_ptr->setup, setup_media);
 	schedule_work(&ib_ptr->setup);
 
 	/* Associate TIPC bearer with InfiniBand bearer */
+	tb_ptr->dev = dev;
 	ib_ptr->bearer = tb_ptr;
 	tb_ptr->usr_handle = (void *)ib_ptr;
 	memset(tb_ptr->bcast_addr.value, 0, sizeof(tb_ptr->bcast_addr.value));
@@ -192,6 +197,7 @@ static int enable_media(struct tipc_bearer *tb_ptr)
 	tb_ptr->bcast_addr.broadcast = 1;
 	tb_ptr->mtu = dev->mtu;
 	ib_media_addr_set(tb_ptr, &tb_ptr->addr, (char *)dev->dev_addr);
+	rcu_assign_pointer(dev->tipc_ptr, tb_ptr);
 	return 0;
 }
 
@@ -205,7 +211,7 @@ static void cleanup_bearer(struct work_struct *work)
 	struct ib_media *ib_ptr =
 		container_of(work, struct ib_media, cleanup);
 
-	dev_remove_pack(&ib_ptr->tipc_packet_type);
+	dev_remove_pack(&tipc_packet_type);
 	dev_put(ib_ptr->dev);
 	ib_ptr->dev = NULL;
 }
@@ -224,6 +230,7 @@ static void disable_media(struct tipc_bearer *tb_ptr)
 	ib_ptr->bearer = NULL;
 	INIT_WORK(&ib_ptr->cleanup, cleanup_bearer);
 	schedule_work(&ib_ptr->cleanup);
+	RCU_INIT_POINTER(tb_ptr->dev->tipc_ptr, NULL);
 }
 
 /**
@@ -235,21 +242,20 @@ static void disable_media(struct tipc_bearer *tb_ptr)
 static int recv_notification(struct notifier_block *nb, unsigned long evt,
 			     void *ptr)
 {
+	struct tipc_bearer *b_ptr;
 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
-	struct ib_media *ib_ptr = &ib_media_array[0];
-	struct ib_media *stop = &ib_media_array[MAX_IB_MEDIA];
 
 	if (!net_eq(dev_net(dev), &init_net))
 		return NOTIFY_DONE;
 
-	while ((ib_ptr->dev != dev)) {
-		if (++ib_ptr == stop)
-			return NOTIFY_DONE;	/* couldn't find device */
-	}
-	if (!ib_ptr->bearer)
+	rcu_read_lock();
+	b_ptr = rcu_dereference(dev->tipc_ptr);
+	if (!b_ptr) {
+		rcu_read_unlock();
 		return NOTIFY_DONE;		/* bearer had been disabled */
+	}
 
-	ib_ptr->bearer->mtu = dev->mtu;
+	b_ptr->mtu = dev->mtu;
 
 	switch (evt) {
 	case NETDEV_CHANGE:
@@ -258,13 +264,15 @@ static int recv_notification(struct notifier_block *nb, unsigned long evt,
 	case NETDEV_DOWN:
 	case NETDEV_CHANGEMTU:
 	case NETDEV_CHANGEADDR:
-		tipc_reset_bearer(ib_ptr->bearer);
+		tipc_reset_bearer(b_ptr);
 		break;
 	case NETDEV_UNREGISTER:
 	case NETDEV_CHANGENAME:
-		tipc_disable_bearer(ib_ptr->bearer->name);
+		tipc_disable_bearer(b_ptr->name);
 		break;
 	}
+	rcu_read_unlock();
+
 	return NOTIFY_OK;
 }
 
-- 
cgit v1.2.3


From 299603e8370a93dd5d8e8d800f0dff1ce2c53d36 Mon Sep 17 00:00:00 2001
From: Jerry Chu <hkchu@google.com>
Date: Wed, 11 Dec 2013 20:53:45 -0800
Subject: net-gro: Prepare GRO stack for the upcoming tunneling support

This patch modifies the GRO stack to avoid the use of "network_header"
and associated macros like ip_hdr() and ipv6_hdr() in order to allow
an arbitrary number of IP hdrs (v4 or v6) to be used in the
encapsulation chain. This lays the foundation for various IP
tunneling support (IP-in-IP, GRE, VXLAN, SIT,...) to be added later.

With this patch, the GRO stack traversing now is mostly based on
skb_gro_offset rather than special hdr offsets saved in skb (e.g.,
skb->network_header). As a result all but the top layer (i.e., the
transport layer) must have hdrs of the same length in order for
a pkt to be considered for aggregation. Therefore when adding a new
encap layer (e.g., for tunneling), one must check and skip flows
(e.g., by setting NAPI_GRO_CB(p)->same_flow to 0) that have a
different hdr length.

Note that unlike the network header, the transport header can and
will continue to be set by the GRO code since there will be at
most one "transport layer" in the encap chain.

Signed-off-by: H.K. Jerry Chu <hkchu@google.com>
Suggested-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  2 +-
 net/core/dev.c            | 76 ++++++++++++++++-------------------------------
 net/ipv4/af_inet.c        | 25 ++++++++++++----
 net/ipv4/tcp_offload.c    |  9 +++---
 net/ipv6/ip6_offload.c    | 54 ++++++++++++++++++++++++++-------
 net/ipv6/tcpv6_offload.c  |  6 ++--
 6 files changed, 97 insertions(+), 75 deletions(-)

(limited to 'include/linux/netdevice.h')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 0ca8100f9fbc..5260d2eae2e6 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1676,7 +1676,7 @@ struct offload_callbacks {
 	int			(*gso_send_check)(struct sk_buff *skb);
 	struct sk_buff		**(*gro_receive)(struct sk_buff **head,
 					       struct sk_buff *skb);
-	int			(*gro_complete)(struct sk_buff *skb);
+	int			(*gro_complete)(struct sk_buff *skb, int nhoff);
 };
 
 struct packet_offload {
diff --git a/net/core/dev.c b/net/core/dev.c
index 355df36360b4..c95d664b2b42 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3752,7 +3752,7 @@ static int napi_gro_complete(struct sk_buff *skb)
 		if (ptype->type != type || !ptype->callbacks.gro_complete)
 			continue;
 
-		err = ptype->callbacks.gro_complete(skb);
+		err = ptype->callbacks.gro_complete(skb, 0);
 		break;
 	}
 	rcu_read_unlock();
@@ -3818,6 +3818,23 @@ static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
 	}
 }
 
+static void skb_gro_reset_offset(struct sk_buff *skb)
+{
+	const struct skb_shared_info *pinfo = skb_shinfo(skb);
+	const skb_frag_t *frag0 = &pinfo->frags[0];
+
+	NAPI_GRO_CB(skb)->data_offset = 0;
+	NAPI_GRO_CB(skb)->frag0 = NULL;
+	NAPI_GRO_CB(skb)->frag0_len = 0;
+
+	if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
+	    pinfo->nr_frags &&
+	    !PageHighMem(skb_frag_page(frag0))) {
+		NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
+		NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
+	}
+}
+
 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 {
 	struct sk_buff **pp = NULL;
@@ -3833,6 +3850,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
 	if (skb_is_gso(skb) || skb_has_frag_list(skb))
 		goto normal;
 
+	skb_gro_reset_offset(skb);
 	gro_list_prepare(napi, skb);
 
 	rcu_read_lock();
@@ -3938,27 +3956,8 @@ static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
 	return ret;
 }
 
-static void skb_gro_reset_offset(struct sk_buff *skb)
-{
-	const struct skb_shared_info *pinfo = skb_shinfo(skb);
-	const skb_frag_t *frag0 = &pinfo->frags[0];
-
-	NAPI_GRO_CB(skb)->data_offset = 0;
-	NAPI_GRO_CB(skb)->frag0 = NULL;
-	NAPI_GRO_CB(skb)->frag0_len = 0;
-
-	if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
-	    pinfo->nr_frags &&
-	    !PageHighMem(skb_frag_page(frag0))) {
-		NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
-		NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
-	}
-}
-
 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 {
-	skb_gro_reset_offset(skb);
-
 	return napi_skb_finish(dev_gro_receive(napi, skb), skb);
 }
 EXPORT_SYMBOL(napi_gro_receive);
@@ -3992,12 +3991,7 @@ static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *
 {
 	switch (ret) {
 	case GRO_NORMAL:
-	case GRO_HELD:
-		skb->protocol = eth_type_trans(skb, skb->dev);
-
-		if (ret == GRO_HELD)
-			skb_gro_pull(skb, -ETH_HLEN);
-		else if (netif_receive_skb(skb))
+		if (netif_receive_skb(skb))
 			ret = GRO_DROP;
 		break;
 
@@ -4006,6 +4000,7 @@ static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *
 		napi_reuse_skb(napi, skb);
 		break;
 
+	case GRO_HELD:
 	case GRO_MERGED:
 		break;
 	}
@@ -4016,36 +4011,15 @@ static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *
 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
 {
 	struct sk_buff *skb = napi->skb;
-	struct ethhdr *eth;
-	unsigned int hlen;
-	unsigned int off;
 
 	napi->skb = NULL;
 
-	skb_reset_mac_header(skb);
-	skb_gro_reset_offset(skb);
-
-	off = skb_gro_offset(skb);
-	hlen = off + sizeof(*eth);
-	eth = skb_gro_header_fast(skb, off);
-	if (skb_gro_header_hard(skb, hlen)) {
-		eth = skb_gro_header_slow(skb, hlen, off);
-		if (unlikely(!eth)) {
-			napi_reuse_skb(napi, skb);
-			skb = NULL;
-			goto out;
-		}
+	if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr)))) {
+		napi_reuse_skb(napi, skb);
+		return NULL;
 	}
+	skb->protocol = eth_type_trans(skb, skb->dev);
 
-	skb_gro_pull(skb, sizeof(*eth));
-
-	/*
-	 * This works because the only protocols we care about don't require
-	 * special handling.  We'll fix it up properly at the end.
-	 */
-	skb->protocol = eth->h_proto;
-
-out:
 	return skb;
 }
 
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 70011e029ac1..ef4f9df6d698 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1377,8 +1377,12 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
 		if (!NAPI_GRO_CB(p)->same_flow)
 			continue;
 
-		iph2 = ip_hdr(p);
-
+		iph2 = (struct iphdr *)(p->data + off);
+		/* The above works because, with the exception of the top
+		 * (inner most) layer, we only aggregate pkts with the same
+		 * hdr length so all the hdrs we'll need to verify will start
+		 * at the same offset.
+		 */
 		if ((iph->protocol ^ iph2->protocol) |
 		    ((__force u32)iph->saddr ^ (__force u32)iph2->saddr) |
 		    ((__force u32)iph->daddr ^ (__force u32)iph2->daddr)) {
@@ -1397,6 +1401,11 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
 	}
 
 	NAPI_GRO_CB(skb)->flush |= flush;
+	skb_set_network_header(skb, off);
+	/* The above will be needed by the transport layer if there is one
+	 * immediately following this IP hdr.
+	 */
+
 	skb_gro_pull(skb, sizeof(*iph));
 	skb_set_transport_header(skb, skb_gro_offset(skb));
 
@@ -1411,10 +1420,10 @@ out:
 	return pp;
 }
 
-static int inet_gro_complete(struct sk_buff *skb)
+static int inet_gro_complete(struct sk_buff *skb, int nhoff)
 {
-	__be16 newlen = htons(skb->len - skb_network_offset(skb));
-	struct iphdr *iph = ip_hdr(skb);
+	__be16 newlen = htons(skb->len - nhoff);
+	struct iphdr *iph = (struct iphdr *)(skb->data + nhoff);
 	const struct net_offload *ops;
 	int proto = iph->protocol;
 	int err = -ENOSYS;
@@ -1427,7 +1436,11 @@ static int inet_gro_complete(struct sk_buff *skb)
 	if (WARN_ON(!ops || !ops->callbacks.gro_complete))
 		goto out_unlock;
 
-	err = ops->callbacks.gro_complete(skb);
+	/* Only need to add sizeof(*iph) to get to the next hdr below
+	 * because any hdr with option will have been flushed in
+	 * inet_gro_receive().
+	 */
+	err = ops->callbacks.gro_complete(skb, nhoff + sizeof(*iph));
 
 out_unlock:
 	rcu_read_unlock();
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 05606353c7e7..2658a27f540d 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -240,7 +240,7 @@ int tcp_gro_complete(struct sk_buff *skb)
 {
 	struct tcphdr *th = tcp_hdr(skb);
 
-	skb->csum_start = skb_transport_header(skb) - skb->head;
+	skb->csum_start = (unsigned char *)th - skb->head;
 	skb->csum_offset = offsetof(struct tcphdr, check);
 	skb->ip_summed = CHECKSUM_PARTIAL;
 
@@ -272,6 +272,7 @@ static int tcp_v4_gso_send_check(struct sk_buff *skb)
 
 static struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
 {
+	/* Use the IP hdr immediately proceeding for this transport */
 	const struct iphdr *iph = skb_gro_network_header(skb);
 	__wsum wsum;
 
@@ -303,13 +304,13 @@ skip_csum:
 	return tcp_gro_receive(head, skb);
 }
 
-static int tcp4_gro_complete(struct sk_buff *skb)
+static int tcp4_gro_complete(struct sk_buff *skb, int thoff)
 {
 	const struct iphdr *iph = ip_hdr(skb);
 	struct tcphdr *th = tcp_hdr(skb);
 
-	th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
-				  iph->saddr, iph->daddr, 0);
+	th->check = ~tcp_v4_check(skb->len - thoff, iph->saddr,
+				  iph->daddr, 0);
 	skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
 
 	return tcp_gro_complete(skb);
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index 4b851692b1f6..7540a0ed75ae 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -154,6 +154,35 @@ out:
 	return segs;
 }
 
+/* Return the total length of all the extension hdrs, following the same
+ * logic in ipv6_gso_pull_exthdrs() when parsing ext-hdrs.
+ */
+static int ipv6_exthdrs_len(struct ipv6hdr *iph,
+			    const struct net_offload **opps)
+{
+	struct ipv6_opt_hdr *opth = NULL;
+	int len = 0, proto, optlen;
+
+	proto = iph->nexthdr;
+	for (;;) {
+		if (proto != NEXTHDR_HOP) {
+			*opps = rcu_dereference(inet6_offloads[proto]);
+			if (unlikely(!(*opps)))
+				break;
+			if (!((*opps)->flags & INET6_PROTO_GSO_EXTHDR))
+				break;
+		}
+		if (opth == NULL)
+			opth = (void *)(iph+1);
+		else
+			opth = (void *)opth + optlen;
+		optlen = ipv6_optlen(opth);
+		len += optlen;
+		proto = opth->nexthdr;
+	}
+	return len;
+}
+
 static struct sk_buff **ipv6_gro_receive(struct sk_buff **head,
 					 struct sk_buff *skb)
 {
@@ -177,6 +206,7 @@ static struct sk_buff **ipv6_gro_receive(struct sk_buff **head,
 			goto out;
 	}
 
+	skb_set_network_header(skb, off);
 	skb_gro_pull(skb, sizeof(*iph));
 	skb_set_transport_header(skb, skb_gro_offset(skb));
 
@@ -211,12 +241,16 @@ static struct sk_buff **ipv6_gro_receive(struct sk_buff **head,
 		if (!NAPI_GRO_CB(p)->same_flow)
 			continue;
 
-		iph2 = ipv6_hdr(p);
+		iph2 = (struct ipv6hdr *)(p->data + off);
 		first_word = *(__be32 *)iph ^ *(__be32 *)iph2 ;
 
-		/* All fields must match except length and Traffic Class. */
-		if (nlen != skb_network_header_len(p) ||
-		    (first_word & htonl(0xF00FFFFF)) ||
+		/* All fields must match except length and Traffic Class.
+		 * XXX skbs on the gro_list have all been parsed and pulled
+		 * already so we don't need to compare nlen
+		 * (nlen != (sizeof(*iph2) + ipv6_exthdrs_len(iph2, &ops)))
+		 * memcmp() alone below is suffcient, right?
+		 */
+		 if ((first_word & htonl(0xF00FFFFF)) ||
 		    memcmp(&iph->nexthdr, &iph2->nexthdr,
 			   nlen - offsetof(struct ipv6hdr, nexthdr))) {
 			NAPI_GRO_CB(p)->same_flow = 0;
@@ -245,21 +279,21 @@ out:
 	return pp;
 }
 
-static int ipv6_gro_complete(struct sk_buff *skb)
+static int ipv6_gro_complete(struct sk_buff *skb, int nhoff)
 {
 	const struct net_offload *ops;
-	struct ipv6hdr *iph = ipv6_hdr(skb);
+	struct ipv6hdr *iph = (struct ipv6hdr *)(skb->data + nhoff);
 	int err = -ENOSYS;
 
-	iph->payload_len = htons(skb->len - skb_network_offset(skb) -
-				 sizeof(*iph));
+	iph->payload_len = htons(skb->len - nhoff - sizeof(*iph));
 
 	rcu_read_lock();
-	ops = rcu_dereference(inet6_offloads[NAPI_GRO_CB(skb)->proto]);
+
+	nhoff += sizeof(*iph) + ipv6_exthdrs_len(iph, &ops);
 	if (WARN_ON(!ops || !ops->callbacks.gro_complete))
 		goto out_unlock;
 
-	err = ops->callbacks.gro_complete(skb);
+	err = ops->callbacks.gro_complete(skb, nhoff);
 
 out_unlock:
 	rcu_read_unlock();
diff --git a/net/ipv6/tcpv6_offload.c b/net/ipv6/tcpv6_offload.c
index 6d18157dc32c..0d78132ff18a 100644
--- a/net/ipv6/tcpv6_offload.c
+++ b/net/ipv6/tcpv6_offload.c
@@ -66,13 +66,13 @@ skip_csum:
 	return tcp_gro_receive(head, skb);
 }
 
-static int tcp6_gro_complete(struct sk_buff *skb)
+static int tcp6_gro_complete(struct sk_buff *skb, int thoff)
 {
 	const struct ipv6hdr *iph = ipv6_hdr(skb);
 	struct tcphdr *th = tcp_hdr(skb);
 
-	th->check = ~tcp_v6_check(skb->len - skb_transport_offset(skb),
-				  &iph->saddr, &iph->daddr, 0);
+	th->check = ~tcp_v6_check(skb->len - thoff, &iph->saddr,
+				  &iph->daddr, 0);
 	skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6;
 
 	return tcp_gro_complete(skb);
-- 
cgit v1.2.3


From e001bfad913bf119fb67c1e8dd2d4ec1f5d392fa Mon Sep 17 00:00:00 2001
From: dingtianhong <dingtianhong@huawei.com>
Date: Fri, 13 Dec 2013 10:19:55 +0800
Subject: bonding: create bond_first_slave_rcu()

bond_first_slave_rcu() will be used instead of bond_first_slave()
inside rcu_read_lock() sections.

Following Jay Vosburgh's suggestion, struct netdev_adjacent should be
hidden from users who want to use it directly, so I added a new
function to get the first slave of the bond.

Suggested-by: Nikolay Aleksandrov <nikolay@redhat.com>
Suggested-by: Jay Vosburgh <fubar@us.ibm.com>
Suggested-by: Veaceslav Falico <vfalico@redhat.com>
Signed-off-by: Ding Tianhong <dingtianhong@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bonding.h |  4 ++++
 include/linux/netdevice.h     |  1 +
 net/core/dev.c                | 21 +++++++++++++++++++++
 3 files changed, 26 insertions(+)

(limited to 'include/linux/netdevice.h')

diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h
index 8283cbdec50a..8f0d6d0c383b 100644
--- a/drivers/net/bonding/bonding.h
+++ b/drivers/net/bonding/bonding.h
@@ -101,6 +101,10 @@
 		netdev_adjacent_get_private(bond_slave_list(bond)->prev) : \
 		NULL)
 
+/* Caller must have rcu_read_lock */
+#define bond_first_slave_rcu(bond) \
+	netdev_lower_get_first_private_rcu(bond->dev)
+
 #define bond_is_first_slave(bond, pos) (pos == bond_first_slave(bond))
 #define bond_is_last_slave(bond, pos) (pos == bond_last_slave(bond))
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 5260d2eae2e6..2c74d20dad34 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2907,6 +2907,7 @@ void *netdev_lower_get_next_private_rcu(struct net_device *dev,
 	     priv = netdev_lower_get_next_private_rcu(dev, &(iter)))
 
 void *netdev_adjacent_get_private(struct list_head *adj_list);
+void *netdev_lower_get_first_private_rcu(struct net_device *dev);
 struct net_device *netdev_master_upper_dev_get(struct net_device *dev);
 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev);
 int netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev);
diff --git a/net/core/dev.c b/net/core/dev.c
index c95d664b2b42..9d4369ece679 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4543,6 +4543,27 @@ void *netdev_lower_get_next_private_rcu(struct net_device *dev,
 }
 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
 
+/**
+ * netdev_lower_get_first_private_rcu - Get the first ->private from the
+ *				       lower neighbour list, RCU
+ *				       variant
+ * @dev: device
+ *
+ * Gets the first netdev_adjacent->private from the dev's lower neighbour
+ * list. The caller must hold RCU read lock.
+ */
+void *netdev_lower_get_first_private_rcu(struct net_device *dev)
+{
+	struct netdev_adjacent *lower;
+
+	lower = list_first_or_null_rcu(&dev->adj_list.lower,
+			struct netdev_adjacent, list);
+	if (lower)
+		return lower->private;
+	return NULL;
+}
+EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
+
 /**
  * netdev_master_upper_dev_get_rcu - Get master upper device
  * @dev: device
-- 
cgit v1.2.3


From 477bb93320cec7ae74d5ccfad4f2bfa0b28fbe90 Mon Sep 17 00:00:00 2001
From: stephen hemminger <shemming@brocade.com>
Date: Fri, 13 Dec 2013 12:35:56 -0800
Subject: net: remove dead code for add/del multiple

These functions to manipulate multiple addresses are not used anywhere
in the current net-next tree. Some out-of-tree code may be using these,
but too bad; they should submit their code upstream.

Also, make __hw_addr_flush local since it is only used by dev_addr_lists.c.

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 11 ------
 net/core/dev_addr_lists.c | 97 +----------------------------------------------
 2 files changed, 1 insertion(+), 107 deletions(-)

(limited to 'include/linux/netdevice.h')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 2c74d20dad34..a0dfcc8c002b 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2810,17 +2810,10 @@ int register_netdev(struct net_device *dev);
 void unregister_netdev(struct net_device *dev);
 
 /* General hardware address lists handling functions */
-int __hw_addr_add_multiple(struct netdev_hw_addr_list *to_list,
-			   struct netdev_hw_addr_list *from_list,
-			   int addr_len, unsigned char addr_type);
-void __hw_addr_del_multiple(struct netdev_hw_addr_list *to_list,
-			    struct netdev_hw_addr_list *from_list,
-			    int addr_len, unsigned char addr_type);
 int __hw_addr_sync(struct netdev_hw_addr_list *to_list,
 		   struct netdev_hw_addr_list *from_list, int addr_len);
 void __hw_addr_unsync(struct netdev_hw_addr_list *to_list,
 		      struct netdev_hw_addr_list *from_list, int addr_len);
-void __hw_addr_flush(struct netdev_hw_addr_list *list);
 void __hw_addr_init(struct netdev_hw_addr_list *list);
 
 /* Functions used for device addresses handling */
@@ -2828,10 +2821,6 @@ int dev_addr_add(struct net_device *dev, const unsigned char *addr,
 		 unsigned char addr_type);
 int dev_addr_del(struct net_device *dev, const unsigned char *addr,
 		 unsigned char addr_type);
-int dev_addr_add_multiple(struct net_device *to_dev,
-			  struct net_device *from_dev, unsigned char addr_type);
-int dev_addr_del_multiple(struct net_device *to_dev,
-			  struct net_device *from_dev, unsigned char addr_type);
 void dev_addr_flush(struct net_device *dev);
 int dev_addr_init(struct net_device *dev);
 
diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c
index ec40a849fc42..bb504a919e33 100644
--- a/net/core/dev_addr_lists.c
+++ b/net/core/dev_addr_lists.c
@@ -186,47 +186,6 @@ static int __hw_addr_sync_multiple(struct netdev_hw_addr_list *to_list,
 	return err;
 }
 
-int __hw_addr_add_multiple(struct netdev_hw_addr_list *to_list,
-			   struct netdev_hw_addr_list *from_list,
-			   int addr_len, unsigned char addr_type)
-{
-	int err;
-	struct netdev_hw_addr *ha, *ha2;
-	unsigned char type;
-
-	list_for_each_entry(ha, &from_list->list, list) {
-		type = addr_type ? addr_type : ha->type;
-		err = __hw_addr_add(to_list, ha->addr, addr_len, type);
-		if (err)
-			goto unroll;
-	}
-	return 0;
-
-unroll:
-	list_for_each_entry(ha2, &from_list->list, list) {
-		if (ha2 == ha)
-			break;
-		type = addr_type ? addr_type : ha2->type;
-		__hw_addr_del(to_list, ha2->addr, addr_len, type);
-	}
-	return err;
-}
-EXPORT_SYMBOL(__hw_addr_add_multiple);
-
-void __hw_addr_del_multiple(struct netdev_hw_addr_list *to_list,
-			    struct netdev_hw_addr_list *from_list,
-			    int addr_len, unsigned char addr_type)
-{
-	struct netdev_hw_addr *ha;
-	unsigned char type;
-
-	list_for_each_entry(ha, &from_list->list, list) {
-		type = addr_type ? addr_type : ha->type;
-		__hw_addr_del(to_list, ha->addr, addr_len, type);
-	}
-}
-EXPORT_SYMBOL(__hw_addr_del_multiple);
-
 /* This function only works where there is a strict 1-1 relationship
  * between source and destionation of they synch. If you ever need to
  * sync addresses to more then 1 destination, you need to use
@@ -264,7 +223,7 @@ void __hw_addr_unsync(struct netdev_hw_addr_list *to_list,
 }
 EXPORT_SYMBOL(__hw_addr_unsync);
 
-void __hw_addr_flush(struct netdev_hw_addr_list *list)
+static void __hw_addr_flush(struct netdev_hw_addr_list *list)
 {
 	struct netdev_hw_addr *ha, *tmp;
 
@@ -274,7 +233,6 @@ void __hw_addr_flush(struct netdev_hw_addr_list *list)
 	}
 	list->count = 0;
 }
-EXPORT_SYMBOL(__hw_addr_flush);
 
 void __hw_addr_init(struct netdev_hw_addr_list *list)
 {
@@ -400,59 +358,6 @@ int dev_addr_del(struct net_device *dev, const unsigned char *addr,
 }
 EXPORT_SYMBOL(dev_addr_del);
 
-/**
- *	dev_addr_add_multiple - Add device addresses from another device
- *	@to_dev: device to which addresses will be added
- *	@from_dev: device from which addresses will be added
- *	@addr_type: address type - 0 means type will be used from from_dev
- *
- *	Add device addresses of the one device to another.
- **
- *	The caller must hold the rtnl_mutex.
- */
-int dev_addr_add_multiple(struct net_device *to_dev,
-			  struct net_device *from_dev,
-			  unsigned char addr_type)
-{
-	int err;
-
-	ASSERT_RTNL();
-
-	if (from_dev->addr_len != to_dev->addr_len)
-		return -EINVAL;
-	err = __hw_addr_add_multiple(&to_dev->dev_addrs, &from_dev->dev_addrs,
-				     to_dev->addr_len, addr_type);
-	if (!err)
-		call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
-	return err;
-}
-EXPORT_SYMBOL(dev_addr_add_multiple);
-
-/**
- *	dev_addr_del_multiple - Delete device addresses by another device
- *	@to_dev: device where the addresses will be deleted
- *	@from_dev: device supplying the addresses to be deleted
- *	@addr_type: address type - 0 means type will be used from from_dev
- *
- *	Deletes addresses in to device by the list of addresses in from device.
- *
- *	The caller must hold the rtnl_mutex.
- */
-int dev_addr_del_multiple(struct net_device *to_dev,
-			  struct net_device *from_dev,
-			  unsigned char addr_type)
-{
-	ASSERT_RTNL();
-
-	if (from_dev->addr_len != to_dev->addr_len)
-		return -EINVAL;
-	__hw_addr_del_multiple(&to_dev->dev_addrs, &from_dev->dev_addrs,
-			       to_dev->addr_len, addr_type);
-	call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
-	return 0;
-}
-EXPORT_SYMBOL(dev_addr_del_multiple);
-
 /*
  * Unicast list handling functions
  */
-- 
cgit v1.2.3


From 2205369a314e12fcec4781cc73ac9c08fc2b47de Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 31 Dec 2013 16:23:35 -0500
Subject: vlan: Fix header ops passthru when doing TX VLAN offload.

When the vlan code detects that the real device can do TX VLAN offloads
in hardware, it tries to arrange for the real device's header_ops to
be invoked directly.

But it does so illegally, by simply hooking the real device's
header_ops up to the VLAN device.

This doesn't work because we will end up invoking a set of header_ops
routines which expect a device type which matches the real device, but
will see a VLAN device instead.

Fix this by providing a pass-thru set of header_ops which will arrange
to pass the proper real device instead.

To facilitate this add a dev_rebuild_header().  There are
implementations which provide a ->cache and ->create but not a
->rebuild (e.g. PLIP).  So we need a helper function just like
dev_hard_header() to avoid crashes.

Use this helper in the one existing place where the
header_ops->rebuild was being invoked, the neighbour code.

With lots of help from Florian Westphal.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  9 +++++++++
 net/8021q/vlan_dev.c      | 19 ++++++++++++++++++-
 net/core/neighbour.c      |  2 +-
 3 files changed, 28 insertions(+), 2 deletions(-)

(limited to 'include/linux/netdevice.h')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index d9a550bf3e8e..7514b9c37a39 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1912,6 +1912,15 @@ static inline int dev_parse_header(const struct sk_buff *skb,
 	return dev->header_ops->parse(skb, haddr);
 }
 
+static inline int dev_rebuild_header(struct sk_buff *skb)
+{
+	const struct net_device *dev = skb->dev;
+
+	if (!dev->header_ops || !dev->header_ops->rebuild)
+		return 0;
+	return dev->header_ops->rebuild(skb);
+}
+
 typedef int gifconf_func_t(struct net_device * dev, char __user * bufptr, int len);
 int register_gifconf(unsigned int family, gifconf_func_t *gifconf);
 static inline int unregister_gifconf(unsigned int family)
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 762896ebfcf5..47c908f1f626 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -530,6 +530,23 @@ static const struct header_ops vlan_header_ops = {
 	.parse	 = eth_header_parse,
 };
 
+static int vlan_passthru_hard_header(struct sk_buff *skb, struct net_device *dev,
+				     unsigned short type,
+				     const void *daddr, const void *saddr,
+				     unsigned int len)
+{
+	struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
+	struct net_device *real_dev = vlan->real_dev;
+
+	return dev_hard_header(skb, real_dev, type, daddr, saddr, len);
+}
+
+static const struct header_ops vlan_passthru_header_ops = {
+	.create	 = vlan_passthru_hard_header,
+	.rebuild = dev_rebuild_header,
+	.parse	 = eth_header_parse,
+};
+
 static struct device_type vlan_type = {
 	.name	= "vlan",
 };
@@ -573,7 +590,7 @@ static int vlan_dev_init(struct net_device *dev)
 
 	dev->needed_headroom = real_dev->needed_headroom;
 	if (real_dev->features & NETIF_F_HW_VLAN_CTAG_TX) {
-		dev->header_ops      = real_dev->header_ops;
+		dev->header_ops      = &vlan_passthru_header_ops;
 		dev->hard_header_len = real_dev->hard_header_len;
 	} else {
 		dev->header_ops      = &vlan_header_ops;
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 36b1443f9ae4..932c6d7cf666 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -1275,7 +1275,7 @@ int neigh_compat_output(struct neighbour *neigh, struct sk_buff *skb)
 
 	if (dev_hard_header(skb, dev, ntohs(skb->protocol), NULL, NULL,
 			    skb->len) < 0 &&
-	    dev->header_ops->rebuild(skb))
+	    dev_rebuild_header(skb))
 		return 0;
 
 	return dev_queue_xmit(skb);
-- 
cgit v1.2.3


From 1d143d9f0c833fcf38cc737eb0a8698fa2dd144c Mon Sep 17 00:00:00 2001
From: stephen hemminger <stephen@networkplumber.org>
Date: Sun, 29 Dec 2013 14:01:29 -0800
Subject: net: core functions cleanup

The following functions are not used outside of net/core/dev.c
and should be declared static.

  call_netdevice_notifiers_info
  __dev_remove_offload
  netdev_has_any_upper_dev
  __netdev_adjacent_dev_remove
  __netdev_adjacent_dev_link_lists
  __netdev_adjacent_dev_link
  __netdev_adjacent_dev_unlink_lists
  __netdev_adjacent_dev_unlink
  __netdev_adjacent_dev_link_neighbour
  __netdev_adjacent_dev_unlink_neighbour

And the following are never used and should be deleted
  netdev_lower_dev_get_private_rcu
  __netdev_find_adj_rcu

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  6 ----
 net/core/dev.c            | 82 +++++++++++++++--------------------------------
 2 files changed, 26 insertions(+), 62 deletions(-)

(limited to 'include/linux/netdevice.h')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 88afa8048a7c..bec60c481966 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1741,8 +1741,6 @@ netdev_notifier_info_to_dev(const struct netdev_notifier_info *info)
 	return info->dev;
 }
 
-int call_netdevice_notifiers_info(unsigned long val, struct net_device *dev,
-				  struct netdev_notifier_info *info);
 int call_netdevice_notifiers(unsigned long val, struct net_device *dev);
 
 
@@ -1809,7 +1807,6 @@ void dev_remove_pack(struct packet_type *pt);
 void __dev_remove_pack(struct packet_type *pt);
 void dev_add_offload(struct packet_offload *po);
 void dev_remove_offload(struct packet_offload *po);
-void __dev_remove_offload(struct packet_offload *po);
 
 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short flags,
 					unsigned short mask);
@@ -2867,7 +2864,6 @@ extern int		weight_p;
 extern int		bpf_jit_enable;
 
 bool netdev_has_upper_dev(struct net_device *dev, struct net_device *upper_dev);
-bool netdev_has_any_upper_dev(struct net_device *dev);
 struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
 						     struct list_head **iter);
 
@@ -2907,8 +2903,6 @@ int netdev_master_upper_dev_link_private(struct net_device *dev,
 					 void *private);
 void netdev_upper_dev_unlink(struct net_device *dev,
 			     struct net_device *upper_dev);
-void *netdev_lower_dev_get_private_rcu(struct net_device *dev,
-				       struct net_device *lower_dev);
 void *netdev_lower_dev_get_private(struct net_device *dev,
 				   struct net_device *lower_dev);
 int skb_checksum_help(struct sk_buff *skb);
diff --git a/net/core/dev.c b/net/core/dev.c
index cc9ab80581d7..77f43aa373fe 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -480,7 +480,7 @@ EXPORT_SYMBOL(dev_add_offload);
  *	and must not be freed until after all the CPU's have gone
  *	through a quiescent state.
  */
-void __dev_remove_offload(struct packet_offload *po)
+static void __dev_remove_offload(struct packet_offload *po)
 {
 	struct list_head *head = &offload_base;
 	struct packet_offload *po1;
@@ -498,7 +498,6 @@ void __dev_remove_offload(struct packet_offload *po)
 out:
 	spin_unlock(&offload_lock);
 }
-EXPORT_SYMBOL(__dev_remove_offload);
 
 /**
  *	dev_remove_offload	 - remove packet offload handler
@@ -1566,14 +1565,14 @@ EXPORT_SYMBOL(unregister_netdevice_notifier);
  *	are as for raw_notifier_call_chain().
  */
 
-int call_netdevice_notifiers_info(unsigned long val, struct net_device *dev,
-				  struct netdev_notifier_info *info)
+static int call_netdevice_notifiers_info(unsigned long val,
+					 struct net_device *dev,
+					 struct netdev_notifier_info *info)
 {
 	ASSERT_RTNL();
 	netdev_notifier_info_init(info, dev);
 	return raw_notifier_call_chain(&netdev_chain, val, info);
 }
-EXPORT_SYMBOL(call_netdevice_notifiers_info);
 
 /**
  *	call_netdevice_notifiers - call all network notifier blocks
@@ -4355,19 +4354,6 @@ struct netdev_adjacent {
 	struct rcu_head rcu;
 };
 
-static struct netdev_adjacent *__netdev_find_adj_rcu(struct net_device *dev,
-						     struct net_device *adj_dev,
-						     struct list_head *adj_list)
-{
-	struct netdev_adjacent *adj;
-
-	list_for_each_entry_rcu(adj, adj_list, list) {
-		if (adj->dev == adj_dev)
-			return adj;
-	}
-	return NULL;
-}
-
 static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev,
 						 struct net_device *adj_dev,
 						 struct list_head *adj_list)
@@ -4406,13 +4392,12 @@ EXPORT_SYMBOL(netdev_has_upper_dev);
  * Find out if a device is linked to an upper device and return true in case
  * it is. The caller must hold the RTNL lock.
  */
-bool netdev_has_any_upper_dev(struct net_device *dev)
+static bool netdev_has_any_upper_dev(struct net_device *dev)
 {
 	ASSERT_RTNL();
 
 	return !list_empty(&dev->all_adj_list.upper);
 }
-EXPORT_SYMBOL(netdev_has_any_upper_dev);
 
 /**
  * netdev_master_upper_dev_get - Get master upper device
@@ -4644,9 +4629,9 @@ free_adj:
 	return ret;
 }
 
-void __netdev_adjacent_dev_remove(struct net_device *dev,
-				  struct net_device *adj_dev,
-				  struct list_head *dev_list)
+static void __netdev_adjacent_dev_remove(struct net_device *dev,
+					 struct net_device *adj_dev,
+					 struct list_head *dev_list)
 {
 	struct netdev_adjacent *adj;
 	char linkname[IFNAMSIZ+7];
@@ -4684,11 +4669,11 @@ void __netdev_adjacent_dev_remove(struct net_device *dev,
 	kfree_rcu(adj, rcu);
 }
 
-int __netdev_adjacent_dev_link_lists(struct net_device *dev,
-				     struct net_device *upper_dev,
-				     struct list_head *up_list,
-				     struct list_head *down_list,
-				     void *private, bool master)
+static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
+					    struct net_device *upper_dev,
+					    struct list_head *up_list,
+					    struct list_head *down_list,
+					    void *private, bool master)
 {
 	int ret;
 
@@ -4707,8 +4692,8 @@ int __netdev_adjacent_dev_link_lists(struct net_device *dev,
 	return 0;
 }
 
-int __netdev_adjacent_dev_link(struct net_device *dev,
-			       struct net_device *upper_dev)
+static int __netdev_adjacent_dev_link(struct net_device *dev,
+				      struct net_device *upper_dev)
 {
 	return __netdev_adjacent_dev_link_lists(dev, upper_dev,
 						&dev->all_adj_list.upper,
@@ -4716,26 +4701,26 @@ int __netdev_adjacent_dev_link(struct net_device *dev,
 						NULL, false);
 }
 
-void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
-					struct net_device *upper_dev,
-					struct list_head *up_list,
-					struct list_head *down_list)
+static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
+					       struct net_device *upper_dev,
+					       struct list_head *up_list,
+					       struct list_head *down_list)
 {
 	__netdev_adjacent_dev_remove(dev, upper_dev, up_list);
 	__netdev_adjacent_dev_remove(upper_dev, dev, down_list);
 }
 
-void __netdev_adjacent_dev_unlink(struct net_device *dev,
-				  struct net_device *upper_dev)
+static void __netdev_adjacent_dev_unlink(struct net_device *dev,
+					 struct net_device *upper_dev)
 {
 	__netdev_adjacent_dev_unlink_lists(dev, upper_dev,
 					   &dev->all_adj_list.upper,
 					   &upper_dev->all_adj_list.lower);
 }
 
-int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
-					 struct net_device *upper_dev,
-					 void *private, bool master)
+static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
+						struct net_device *upper_dev,
+						void *private, bool master)
 {
 	int ret = __netdev_adjacent_dev_link(dev, upper_dev);
 
@@ -4754,8 +4739,8 @@ int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
 	return 0;
 }
 
-void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
-					    struct net_device *upper_dev)
+static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
+						   struct net_device *upper_dev)
 {
 	__netdev_adjacent_dev_unlink(dev, upper_dev);
 	__netdev_adjacent_dev_unlink_lists(dev, upper_dev,
@@ -4944,21 +4929,6 @@ void netdev_upper_dev_unlink(struct net_device *dev,
 }
 EXPORT_SYMBOL(netdev_upper_dev_unlink);
 
-void *netdev_lower_dev_get_private_rcu(struct net_device *dev,
-				       struct net_device *lower_dev)
-{
-	struct netdev_adjacent *lower;
-
-	if (!lower_dev)
-		return NULL;
-	lower = __netdev_find_adj_rcu(dev, lower_dev, &dev->adj_list.lower);
-	if (!lower)
-		return NULL;
-
-	return lower->private;
-}
-EXPORT_SYMBOL(netdev_lower_dev_get_private_rcu);
-
 void *netdev_lower_dev_get_private(struct net_device *dev,
 				   struct net_device *lower_dev)
 {
-- 
cgit v1.2.3


From 7a7ffbabf99445704be01bff5d7e360da908cf8e Mon Sep 17 00:00:00 2001
From: Wei-Chun Chao <weichunc@plumgrid.com>
Date: Thu, 26 Dec 2013 13:10:22 -0800
Subject: ipv4: fix tunneled VM traffic over hw VXLAN/GRE GSO NIC

VM to VM GSO traffic is broken if it goes through VXLAN or GRE
tunnel and the physical NIC on the host supports hardware VXLAN/GRE
GSO offload (e.g. bnx2x and next-gen mlx4).

Two issues -
(VXLAN) VM traffic has SKB_GSO_DODGY and SKB_GSO_UDP_TUNNEL with
SKB_GSO_TCP/UDP set depending on the inner protocol. GSO header
integrity check fails in udp4_ufo_fragment if inner protocol is
TCP. Also gso_segs is calculated incorrectly using skb->len that
includes tunnel header. Fix: robust check should only be applied
to the inner packet.

(VXLAN & GRE) Once GSO header integrity check passes, NULL segs
is returned and the original skb is sent to hardware. However the
tunnel header is already pulled. Fix: tunnel header needs to be
restored so that hardware can perform GSO properly on the original
packet.

Signed-off-by: Wei-Chun Chao <weichunc@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 13 +++++++++++++
 net/ipv4/gre_offload.c    | 11 +++++++----
 net/ipv4/udp.c            |  6 +++++-
 net/ipv4/udp_offload.c    | 37 +++++++++++++++++++------------------
 4 files changed, 44 insertions(+), 23 deletions(-)

(limited to 'include/linux/netdevice.h')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 7514b9c37a39..5faaadb0c74f 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3017,6 +3017,19 @@ static inline void netif_set_gso_max_size(struct net_device *dev,
 	dev->gso_max_size = size;
 }
 
+static inline void skb_gso_error_unwind(struct sk_buff *skb, __be16 protocol,
+					int pulled_hlen, u16 mac_offset,
+					int mac_len)
+{
+	skb->protocol = protocol;
+	skb->encapsulation = 1;
+	skb_push(skb, pulled_hlen);
+	skb_reset_transport_header(skb);
+	skb->mac_header = mac_offset;
+	skb->network_header = skb->mac_header + mac_len;
+	skb->mac_len = mac_len;
+}
+
 static inline bool netif_is_macvlan(struct net_device *dev)
 {
 	return dev->priv_flags & IFF_MACVLAN;
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index e5d436188464..2cd02f32f99f 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -28,6 +28,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
 	netdev_features_t enc_features;
 	int ghl = GRE_HEADER_SECTION;
 	struct gre_base_hdr *greh;
+	u16 mac_offset = skb->mac_header;
 	int mac_len = skb->mac_len;
 	__be16 protocol = skb->protocol;
 	int tnl_hlen;
@@ -58,13 +59,13 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
 	} else
 		csum = false;
 
+	if (unlikely(!pskb_may_pull(skb, ghl)))
+		goto out;
+
 	/* setup inner skb. */
 	skb->protocol = greh->protocol;
 	skb->encapsulation = 0;
 
-	if (unlikely(!pskb_may_pull(skb, ghl)))
-		goto out;
-
 	__skb_pull(skb, ghl);
 	skb_reset_mac_header(skb);
 	skb_set_network_header(skb, skb_inner_network_offset(skb));
@@ -73,8 +74,10 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
 	/* segment inner packet. */
 	enc_features = skb->dev->hw_enc_features & netif_skb_features(skb);
 	segs = skb_mac_gso_segment(skb, enc_features);
-	if (!segs || IS_ERR(segs))
+	if (!segs || IS_ERR(segs)) {
+		skb_gso_error_unwind(skb, protocol, ghl, mac_offset, mac_len);
 		goto out;
+	}
 
 	skb = segs;
 	tnl_hlen = skb_tnl_header_len(skb);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index f140048334ce..a7e4729e974b 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2478,6 +2478,7 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
 				       netdev_features_t features)
 {
 	struct sk_buff *segs = ERR_PTR(-EINVAL);
+	u16 mac_offset = skb->mac_header;
 	int mac_len = skb->mac_len;
 	int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb);
 	__be16 protocol = skb->protocol;
@@ -2497,8 +2498,11 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
 	/* segment inner packet. */
 	enc_features = skb->dev->hw_enc_features & netif_skb_features(skb);
 	segs = skb_mac_gso_segment(skb, enc_features);
-	if (!segs || IS_ERR(segs))
+	if (!segs || IS_ERR(segs)) {
+		skb_gso_error_unwind(skb, protocol, tnl_hlen, mac_offset,
+				     mac_len);
 		goto out;
+	}
 
 	outer_hlen = skb_tnl_header_len(skb);
 	skb = segs;
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 83206de2bc76..79c62bdcd3c5 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -41,6 +41,14 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
 {
 	struct sk_buff *segs = ERR_PTR(-EINVAL);
 	unsigned int mss;
+	int offset;
+	__wsum csum;
+
+	if (skb->encapsulation &&
+	    skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL) {
+		segs = skb_udp_tunnel_segment(skb, features);
+		goto out;
+	}
 
 	mss = skb_shinfo(skb)->gso_size;
 	if (unlikely(skb->len <= mss))
@@ -63,27 +71,20 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
 		goto out;
 	}
 
+	/* Do software UFO. Complete and fill in the UDP checksum as
+	 * HW cannot do checksum of UDP packets sent as multiple
+	 * IP fragments.
+	 */
+	offset = skb_checksum_start_offset(skb);
+	csum = skb_checksum(skb, offset, skb->len - offset, 0);
+	offset += skb->csum_offset;
+	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
+	skb->ip_summed = CHECKSUM_NONE;
+
 	/* Fragment the skb. IP headers of the fragments are updated in
 	 * inet_gso_segment()
 	 */
-	if (skb->encapsulation && skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL)
-		segs = skb_udp_tunnel_segment(skb, features);
-	else {
-		int offset;
-		__wsum csum;
-
-		/* Do software UFO. Complete and fill in the UDP checksum as
-		 * HW cannot do checksum of UDP packets sent as multiple
-		 * IP fragments.
-		 */
-		offset = skb_checksum_start_offset(skb);
-		csum = skb_checksum(skb, offset, skb->len - offset, 0);
-		offset += skb->csum_offset;
-		*(__sum16 *)(skb->data + offset) = csum_fold(csum);
-		skb->ip_summed = CHECKSUM_NONE;
-
-		segs = skb_segment(skb, features);
-	}
+	segs = skb_segment(skb, features);
 out:
 	return segs;
 }
-- 
cgit v1.2.3


From 86f8515f9721fa171483f0fe0391968fbb949cc9 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <dborkman@redhat.com>
Date: Sun, 29 Dec 2013 17:27:11 +0100
Subject: net: netprio: rename config to be more consistent with cgroup configs

While we're at it and have introduced CGROUP_NET_CLASSID, let's also make
NETPRIO_CGROUP more consistent with the rest of cgroups and rename it
to CONFIG_CGROUP_NET_PRIO so that for networking, we now have
CONFIG_CGROUP_NET_{PRIO,CLASSID}. This not only makes the CONFIG
option consistent among networking cgroups, but also among cgroups
CONFIG conventions in general as the vast majority has a prefix of
CONFIG_CGROUP_<SUBSYS>.

Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Cc: Zefan Li <lizefan@huawei.com>
Cc: cgroups@vger.kernel.org
Acked-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/cgroup_subsys.h |  2 +-
 include/linux/netdevice.h     |  2 +-
 include/net/netprio_cgroup.h  | 18 ++++++------------
 include/net/sock.h            |  2 +-
 net/Kconfig                   |  4 ++--
 net/core/Makefile             |  2 +-
 net/core/dev.c                |  2 +-
 net/core/sock.c               |  2 +-
 8 files changed, 14 insertions(+), 20 deletions(-)

(limited to 'include/linux/netdevice.h')

diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index 58bf94de4b8e..7b99d717411d 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -43,7 +43,7 @@ SUBSYS(blkio)
 SUBSYS(perf)
 #endif
 
-#if IS_SUBSYS_ENABLED(CONFIG_NETPRIO_CGROUP)
+#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_NET_PRIO)
 SUBSYS(net_prio)
 #endif
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 5260d2eae2e6..45cf68194aa8 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1444,7 +1444,7 @@ struct net_device {
 	/* max exchange id for FCoE LRO by ddp */
 	unsigned int		fcoe_ddp_xid;
 #endif
-#if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
+#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
 	struct netprio_map __rcu *priomap;
 #endif
 	/* phy device may attach itself for hardware timestamping */
diff --git a/include/net/netprio_cgroup.h b/include/net/netprio_cgroup.h
index 099d02782e22..dafc09f0fdbc 100644
--- a/include/net/netprio_cgroup.h
+++ b/include/net/netprio_cgroup.h
@@ -13,12 +13,12 @@
 
 #ifndef _NETPRIO_CGROUP_H
 #define _NETPRIO_CGROUP_H
+
 #include <linux/cgroup.h>
 #include <linux/hardirq.h>
 #include <linux/rcupdate.h>
 
-
-#if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
+#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
 struct netprio_map {
 	struct rcu_head rcu;
 	u32 priomap_len;
@@ -27,8 +27,7 @@ struct netprio_map {
 
 void sock_update_netprioidx(struct sock *sk);
 
-#if IS_BUILTIN(CONFIG_NETPRIO_CGROUP)
-
+#if IS_BUILTIN(CONFIG_CGROUP_NET_PRIO)
 static inline u32 task_netprioidx(struct task_struct *p)
 {
 	struct cgroup_subsys_state *css;
@@ -40,9 +39,7 @@ static inline u32 task_netprioidx(struct task_struct *p)
 	rcu_read_unlock();
 	return idx;
 }
-
-#elif IS_MODULE(CONFIG_NETPRIO_CGROUP)
-
+#elif IS_MODULE(CONFIG_CGROUP_NET_PRIO)
 static inline u32 task_netprioidx(struct task_struct *p)
 {
 	struct cgroup_subsys_state *css;
@@ -56,9 +53,7 @@ static inline u32 task_netprioidx(struct task_struct *p)
 	return idx;
 }
 #endif
-
-#else /* !CONFIG_NETPRIO_CGROUP */
-
+#else /* !CONFIG_CGROUP_NET_PRIO */
 static inline u32 task_netprioidx(struct task_struct *p)
 {
 	return 0;
@@ -66,6 +61,5 @@ static inline u32 task_netprioidx(struct task_struct *p)
 
 #define sock_update_netprioidx(sk)
 
-#endif /* CONFIG_NETPRIO_CGROUP */
-
+#endif /* CONFIG_CGROUP_NET_PRIO */
 #endif  /* _NET_CLS_CGROUP_H */
diff --git a/include/net/sock.h b/include/net/sock.h
index 2ef3c3eca47a..ef5e2be6eaf3 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -395,7 +395,7 @@ struct sock {
 	unsigned short		sk_ack_backlog;
 	unsigned short		sk_max_ack_backlog;
 	__u32			sk_priority;
-#if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
+#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
 	__u32			sk_cgrp_prioidx;
 #endif
 	struct pid		*sk_peer_pid;
diff --git a/net/Kconfig b/net/Kconfig
index 7da10b830d70..e411046a62e3 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -238,12 +238,12 @@ config XPS
 	depends on SMP
 	default y
 
-config NETPRIO_CGROUP
+config CGROUP_NET_PRIO
 	tristate "Network priority cgroup"
 	depends on CGROUPS
 	---help---
 	  Cgroup subsystem for use in assigning processes to network priorities on
-	  a per-interface basis
+	  a per-interface basis.
 
 config CGROUP_NET_CLASSID
 	boolean "Network classid cgroup"
diff --git a/net/core/Makefile b/net/core/Makefile
index 4839a2796964..9628c20acff6 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -21,5 +21,5 @@ obj-$(CONFIG_FIB_RULES) += fib_rules.o
 obj-$(CONFIG_TRACEPOINTS) += net-traces.o
 obj-$(CONFIG_NET_DROP_MONITOR) += drop_monitor.o
 obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o
-obj-$(CONFIG_NETPRIO_CGROUP) += netprio_cgroup.o
+obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o
 obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o
diff --git a/net/core/dev.c b/net/core/dev.c
index c95d664b2b42..888a79b2b8b9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2747,7 +2747,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 	return rc;
 }
 
-#if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
+#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
 static void skb_update_prio(struct sk_buff *skb)
 {
 	struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
diff --git a/net/core/sock.c b/net/core/sock.c
index 3f150729fb15..a29735c9a05d 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1308,7 +1308,7 @@ static void sk_prot_free(struct proto *prot, struct sock *sk)
 	module_put(owner);
 }
 
-#if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
+#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
 void sock_update_netprioidx(struct sock *sk)
 {
 	if (in_interrupt())
-- 
cgit v1.2.3


From 8f84985fec10de64a6b4cdfea45f2b0ab8f07c78 Mon Sep 17 00:00:00 2001
From: Li RongQing <roy.qing.li@gmail.com>
Date: Sat, 4 Jan 2014 13:57:59 +0800
Subject: net: unify the pcpu_tstats and br_cpu_netstats as one

They are the same, so unify them as one, pcpu_sw_netstats.

Define pcpu_sw_netstats in netdevice.h, remove pcpu_tstats
from if_tunnel.h and remove br_cpu_netstats from br_private.h.

Cc: Cong Wang <xiyou.wangcong@gmail.com>
Cc: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: Li RongQing <roy.qing.li@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vxlan.c       | 11 ++++++-----
 include/linux/if_tunnel.h |  9 ---------
 include/linux/netdevice.h | 11 ++++++++++-
 include/net/ip6_tunnel.h  |  2 +-
 include/net/ip_tunnels.h  |  4 ++--
 net/bridge/br_device.c    | 10 +++++-----
 net/bridge/br_input.c     |  2 +-
 net/bridge/br_private.h   | 10 +---------
 net/ipv4/ip_tunnel.c      |  9 +++++----
 net/ipv4/ip_vti.c         |  2 +-
 net/ipv6/ip6_gre.c        | 10 +++++-----
 net/ipv6/ip6_tunnel.c     | 12 ++++++------
 net/ipv6/ip6_vti.c        | 10 +++++-----
 net/ipv6/sit.c            | 10 +++++-----
 net/openvswitch/vport.c   | 12 ++++++------
 net/openvswitch/vport.h   |  2 +-
 16 files changed, 60 insertions(+), 66 deletions(-)

(limited to 'include/linux/netdevice.h')

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index aef44aa44fe3..474a99ed0222 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -1081,7 +1081,7 @@ static void vxlan_rcv(struct vxlan_sock *vs,
 	struct iphdr *oip = NULL;
 	struct ipv6hdr *oip6 = NULL;
 	struct vxlan_dev *vxlan;
-	struct pcpu_tstats *stats;
+	struct pcpu_sw_netstats *stats;
 	union vxlan_addr saddr;
 	__u32 vni;
 	int err = 0;
@@ -1587,11 +1587,12 @@ EXPORT_SYMBOL_GPL(vxlan_xmit_skb);
 static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan,
 			       struct vxlan_dev *dst_vxlan)
 {
-	struct pcpu_tstats *tx_stats = this_cpu_ptr(src_vxlan->dev->tstats);
-	struct pcpu_tstats *rx_stats = this_cpu_ptr(dst_vxlan->dev->tstats);
+	struct pcpu_sw_netstats *tx_stats, *rx_stats;
 	union vxlan_addr loopback;
 	union vxlan_addr *remote_ip = &dst_vxlan->default_dst.remote_ip;
 
+	tx_stats = this_cpu_ptr(src_vxlan->dev->tstats);
+	rx_stats = this_cpu_ptr(dst_vxlan->dev->tstats);
 	skb->pkt_type = PACKET_HOST;
 	skb->encapsulation = 0;
 	skb->dev = dst_vxlan->dev;
@@ -1897,12 +1898,12 @@ static int vxlan_init(struct net_device *dev)
 	struct vxlan_sock *vs;
 	int i;
 
-	dev->tstats = alloc_percpu(struct pcpu_tstats);
+	dev->tstats = alloc_percpu(struct pcpu_sw_netstats);
 	if (!dev->tstats)
 		return -ENOMEM;
 
 	for_each_possible_cpu(i) {
-		struct pcpu_tstats *vxlan_stats;
+		struct pcpu_sw_netstats *vxlan_stats;
 		vxlan_stats = per_cpu_ptr(dev->tstats, i);
 		u64_stats_init(&vxlan_stats->syncp);
 	}
diff --git a/include/linux/if_tunnel.h b/include/linux/if_tunnel.h
index f4e56ecd0b1a..712710bc0580 100644
--- a/include/linux/if_tunnel.h
+++ b/include/linux/if_tunnel.h
@@ -13,13 +13,4 @@
 #define for_each_ip_tunnel_rcu(pos, start) \
 	for (pos = rcu_dereference(start); pos; pos = rcu_dereference(pos->next))
 
-/* often modified stats are per cpu, other are shared (netdev->stats) */
-struct pcpu_tstats {
-	u64	rx_packets;
-	u64	rx_bytes;
-	u64	tx_packets;
-	u64	tx_bytes;
-	struct u64_stats_sync	syncp;
-};
-
 #endif /* _IF_TUNNEL_H_ */
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index bec60c481966..51c0fe258163 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1409,7 +1409,7 @@ struct net_device {
 	union {
 		void				*ml_priv;
 		struct pcpu_lstats __percpu	*lstats; /* loopback stats */
-		struct pcpu_tstats __percpu	*tstats; /* tunnel stats */
+		struct pcpu_sw_netstats __percpu	*tstats;
 		struct pcpu_dstats __percpu	*dstats; /* dummy stats */
 		struct pcpu_vstats __percpu	*vstats; /* veth stats */
 	};
@@ -1685,6 +1685,15 @@ struct packet_offload {
 	struct list_head	 list;
 };
 
+/* often modified stats are per cpu, other are shared (netdev->stats) */
+struct pcpu_sw_netstats {
+	u64     rx_packets;
+	u64     rx_bytes;
+	u64     tx_packets;
+	u64     tx_bytes;
+	struct u64_stats_sync   syncp;
+};
+
 #include <linux/notifier.h>
 
 /* netdevice notifier chain. Please remember to update the rtnetlink
diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h
index 6d1549c4893c..a5593dab6af7 100644
--- a/include/net/ip6_tunnel.h
+++ b/include/net/ip6_tunnel.h
@@ -79,7 +79,7 @@ static inline void ip6tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 	err = ip6_local_out(skb);
 
 	if (net_xmit_eval(err) == 0) {
-		struct pcpu_tstats *tstats = this_cpu_ptr(dev->tstats);
+		struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats);
 		u64_stats_update_begin(&tstats->syncp);
 		tstats->tx_bytes += pkt_len;
 		tstats->tx_packets++;
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 9e25b1bc31da..cd729becbb07 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -162,10 +162,10 @@ struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb, bool gre_csum,
 
 static inline void iptunnel_xmit_stats(int err,
 				       struct net_device_stats *err_stats,
-				       struct pcpu_tstats __percpu *stats)
+				       struct pcpu_sw_netstats __percpu *stats)
 {
 	if (err > 0) {
-		struct pcpu_tstats *tstats = this_cpu_ptr(stats);
+		struct pcpu_sw_netstats *tstats = this_cpu_ptr(stats);
 
 		u64_stats_update_begin(&tstats->syncp);
 		tstats->tx_bytes += err;
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index f00cfd2a0143..e4401a531afb 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -32,7 +32,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
 	const unsigned char *dest = skb->data;
 	struct net_bridge_fdb_entry *dst;
 	struct net_bridge_mdb_entry *mdst;
-	struct br_cpu_netstats *brstats = this_cpu_ptr(br->stats);
+	struct pcpu_sw_netstats *brstats = this_cpu_ptr(br->stats);
 	u16 vid = 0;
 
 	rcu_read_lock();
@@ -90,12 +90,12 @@ static int br_dev_init(struct net_device *dev)
 	struct net_bridge *br = netdev_priv(dev);
 	int i;
 
-	br->stats = alloc_percpu(struct br_cpu_netstats);
+	br->stats = alloc_percpu(struct pcpu_sw_netstats);
 	if (!br->stats)
 		return -ENOMEM;
 
 	for_each_possible_cpu(i) {
-		struct br_cpu_netstats *br_dev_stats;
+		struct pcpu_sw_netstats *br_dev_stats;
 		br_dev_stats = per_cpu_ptr(br->stats, i);
 		u64_stats_init(&br_dev_stats->syncp);
 	}
@@ -135,12 +135,12 @@ static struct rtnl_link_stats64 *br_get_stats64(struct net_device *dev,
 						struct rtnl_link_stats64 *stats)
 {
 	struct net_bridge *br = netdev_priv(dev);
-	struct br_cpu_netstats tmp, sum = { 0 };
+	struct pcpu_sw_netstats tmp, sum = { 0 };
 	unsigned int cpu;
 
 	for_each_possible_cpu(cpu) {
 		unsigned int start;
-		const struct br_cpu_netstats *bstats
+		const struct pcpu_sw_netstats *bstats
 			= per_cpu_ptr(br->stats, cpu);
 		do {
 			start = u64_stats_fetch_begin_bh(&bstats->syncp);
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 7e73c32e205d..bf8dc7d308d6 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -28,7 +28,7 @@ static int br_pass_frame_up(struct sk_buff *skb)
 {
 	struct net_device *indev, *brdev = BR_INPUT_SKB_CB(skb)->brdev;
 	struct net_bridge *br = netdev_priv(brdev);
-	struct br_cpu_netstats *brstats = this_cpu_ptr(br->stats);
+	struct pcpu_sw_netstats *brstats = this_cpu_ptr(br->stats);
 
 	u64_stats_update_begin(&brstats->syncp);
 	brstats->rx_packets++;
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 2e77d923c8ee..3733f152351c 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -210,21 +210,13 @@ static inline struct net_bridge_port *br_port_get_rtnl(const struct net_device *
 		rtnl_dereference(dev->rx_handler_data) : NULL;
 }
 
-struct br_cpu_netstats {
-	u64			rx_packets;
-	u64			rx_bytes;
-	u64			tx_packets;
-	u64			tx_bytes;
-	struct u64_stats_sync	syncp;
-};
-
 struct net_bridge
 {
 	spinlock_t			lock;
 	struct list_head		port_list;
 	struct net_device		*dev;
 
-	struct br_cpu_netstats __percpu *stats;
+	struct pcpu_sw_netstats		__percpu *stats;
 	spinlock_t			hash_lock;
 	struct hlist_head		hash[BR_HASH_SIZE];
 #ifdef CONFIG_BRIDGE_NETFILTER
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index e2c9cff26eb5..07a5ed374262 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -132,7 +132,8 @@ struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev,
 	int i;
 
 	for_each_possible_cpu(i) {
-		const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
+		const struct pcpu_sw_netstats *tstats =
+						   per_cpu_ptr(dev->tstats, i);
 		u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
 		unsigned int start;
 
@@ -460,7 +461,7 @@ static struct ip_tunnel *ip_tunnel_create(struct net *net,
 int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
 		  const struct tnl_ptk_info *tpi, bool log_ecn_error)
 {
-	struct pcpu_tstats *tstats;
+	struct pcpu_sw_netstats *tstats;
 	const struct iphdr *iph = ip_hdr(skb);
 	int err;
 
@@ -1049,12 +1050,12 @@ int ip_tunnel_init(struct net_device *dev)
 	int i, err;
 
 	dev->destructor	= ip_tunnel_dev_free;
-	dev->tstats = alloc_percpu(struct pcpu_tstats);
+	dev->tstats = alloc_percpu(struct pcpu_sw_netstats);
 	if (!dev->tstats)
 		return -ENOMEM;
 
 	for_each_possible_cpu(i) {
-		struct pcpu_tstats *ipt_stats;
+		struct pcpu_sw_netstats *ipt_stats;
 		ipt_stats = per_cpu_ptr(dev->tstats, i);
 		u64_stats_init(&ipt_stats->syncp);
 	}
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 52b802a0cd8c..0783200ad8d2 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -60,7 +60,7 @@ static int vti_rcv(struct sk_buff *skb)
 	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
 				  iph->saddr, iph->daddr, 0);
 	if (tunnel != NULL) {
-		struct pcpu_tstats *tstats;
+		struct pcpu_sw_netstats *tstats;
 		u32 oldmark = skb->mark;
 		int ret;
 
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index e27fb78c61f2..e7a440dd5c0d 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -498,7 +498,7 @@ static int ip6gre_rcv(struct sk_buff *skb)
 					  &ipv6h->saddr, &ipv6h->daddr, key,
 					  gre_proto);
 	if (tunnel) {
-		struct pcpu_tstats *tstats;
+		struct pcpu_sw_netstats *tstats;
 
 		if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb))
 			goto drop;
@@ -1265,12 +1265,12 @@ static int ip6gre_tunnel_init(struct net_device *dev)
 	if (ipv6_addr_any(&tunnel->parms.raddr))
 		dev->header_ops = &ip6gre_header_ops;
 
-	dev->tstats = alloc_percpu(struct pcpu_tstats);
+	dev->tstats = alloc_percpu(struct pcpu_sw_netstats);
 	if (!dev->tstats)
 		return -ENOMEM;
 
 	for_each_possible_cpu(i) {
-		struct pcpu_tstats *ip6gre_tunnel_stats;
+		struct pcpu_sw_netstats *ip6gre_tunnel_stats;
 		ip6gre_tunnel_stats = per_cpu_ptr(dev->tstats, i);
 		u64_stats_init(&ip6gre_tunnel_stats->syncp);
 	}
@@ -1466,12 +1466,12 @@ static int ip6gre_tap_init(struct net_device *dev)
 
 	ip6gre_tnl_link_config(tunnel, 1);
 
-	dev->tstats = alloc_percpu(struct pcpu_tstats);
+	dev->tstats = alloc_percpu(struct pcpu_sw_netstats);
 	if (!dev->tstats)
 		return -ENOMEM;
 
 	for_each_possible_cpu(i) {
-		struct pcpu_tstats *ip6gre_tap_stats;
+		struct pcpu_sw_netstats *ip6gre_tap_stats;
 		ip6gre_tap_stats = per_cpu_ptr(dev->tstats, i);
 		u64_stats_init(&ip6gre_tap_stats->syncp);
 	}
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 8d7c9867a445..02894216a46d 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -29,7 +29,6 @@
 #include <linux/if.h>
 #include <linux/in.h>
 #include <linux/ip.h>
-#include <linux/if_tunnel.h>
 #include <linux/net.h>
 #include <linux/in6.h>
 #include <linux/netdevice.h>
@@ -102,11 +101,12 @@ struct ip6_tnl_net {
 
 static struct net_device_stats *ip6_get_stats(struct net_device *dev)
 {
-	struct pcpu_tstats sum = { 0 };
+	struct pcpu_sw_netstats sum = { 0 };
 	int i;
 
 	for_each_possible_cpu(i) {
-		const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
+		const struct pcpu_sw_netstats *tstats =
+						   per_cpu_ptr(dev->tstats, i);
 
 		sum.rx_packets += tstats->rx_packets;
 		sum.rx_bytes   += tstats->rx_bytes;
@@ -784,7 +784,7 @@ static int ip6_tnl_rcv(struct sk_buff *skb, __u16 protocol,
 
 	if ((t = ip6_tnl_lookup(dev_net(skb->dev), &ipv6h->saddr,
 					&ipv6h->daddr)) != NULL) {
-		struct pcpu_tstats *tstats;
+		struct pcpu_sw_netstats *tstats;
 
 		if (t->parms.proto != ipproto && t->parms.proto != 0) {
 			rcu_read_unlock();
@@ -1497,12 +1497,12 @@ ip6_tnl_dev_init_gen(struct net_device *dev)
 
 	t->dev = dev;
 	t->net = dev_net(dev);
-	dev->tstats = alloc_percpu(struct pcpu_tstats);
+	dev->tstats = alloc_percpu(struct pcpu_sw_netstats);
 	if (!dev->tstats)
 		return -ENOMEM;
 
 	for_each_possible_cpu(i) {
-		struct pcpu_tstats *ip6_tnl_stats;
+		struct pcpu_sw_netstats *ip6_tnl_stats;
 		ip6_tnl_stats = per_cpu_ptr(dev->tstats, i);
 		u64_stats_init(&ip6_tnl_stats->syncp);
 	}
diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c
index ed94ba61dda0..da1d9e4d62ca 100644
--- a/net/ipv6/ip6_vti.c
+++ b/net/ipv6/ip6_vti.c
@@ -24,7 +24,6 @@
 #include <linux/if.h>
 #include <linux/in.h>
 #include <linux/ip.h>
-#include <linux/if_tunnel.h>
 #include <linux/net.h>
 #include <linux/in6.h>
 #include <linux/netdevice.h>
@@ -77,11 +76,12 @@ struct vti6_net {
 
 static struct net_device_stats *vti6_get_stats(struct net_device *dev)
 {
-	struct pcpu_tstats sum = { 0 };
+	struct pcpu_sw_netstats sum = { 0 };
 	int i;
 
 	for_each_possible_cpu(i) {
-		const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
+		const struct pcpu_sw_netstats *tstats =
+						   per_cpu_ptr(dev->tstats, i);
 
 		sum.rx_packets += tstats->rx_packets;
 		sum.rx_bytes   += tstats->rx_bytes;
@@ -312,7 +312,7 @@ static int vti6_rcv(struct sk_buff *skb)
 
 	if ((t = vti6_tnl_lookup(dev_net(skb->dev), &ipv6h->saddr,
 				 &ipv6h->daddr)) != NULL) {
-		struct pcpu_tstats *tstats;
+		struct pcpu_sw_netstats *tstats;
 
 		if (t->parms.proto != IPPROTO_IPV6 && t->parms.proto != 0) {
 			rcu_read_unlock();
@@ -753,7 +753,7 @@ static inline int vti6_dev_init_gen(struct net_device *dev)
 
 	t->dev = dev;
 	t->net = dev_net(dev);
-	dev->tstats = alloc_percpu(struct pcpu_tstats);
+	dev->tstats = alloc_percpu(struct pcpu_sw_netstats);
 	if (!dev->tstats)
 		return -ENOMEM;
 	return 0;
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 366fbba3359a..9937b2616713 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -671,7 +671,7 @@ static int ipip6_rcv(struct sk_buff *skb)
 	tunnel = ipip6_tunnel_lookup(dev_net(skb->dev), skb->dev,
 				     iph->saddr, iph->daddr);
 	if (tunnel != NULL) {
-		struct pcpu_tstats *tstats;
+		struct pcpu_sw_netstats *tstats;
 
 		if (tunnel->parms.iph.protocol != IPPROTO_IPV6 &&
 		    tunnel->parms.iph.protocol != 0)
@@ -1361,12 +1361,12 @@ static int ipip6_tunnel_init(struct net_device *dev)
 	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
 
 	ipip6_tunnel_bind_dev(dev);
-	dev->tstats = alloc_percpu(struct pcpu_tstats);
+	dev->tstats = alloc_percpu(struct pcpu_sw_netstats);
 	if (!dev->tstats)
 		return -ENOMEM;
 
 	for_each_possible_cpu(i) {
-		struct pcpu_tstats *ipip6_tunnel_stats;
+		struct pcpu_sw_netstats *ipip6_tunnel_stats;
 		ipip6_tunnel_stats = per_cpu_ptr(dev->tstats, i);
 		u64_stats_init(&ipip6_tunnel_stats->syncp);
 	}
@@ -1391,12 +1391,12 @@ static int __net_init ipip6_fb_tunnel_init(struct net_device *dev)
 	iph->ihl		= 5;
 	iph->ttl		= 64;
 
-	dev->tstats = alloc_percpu(struct pcpu_tstats);
+	dev->tstats = alloc_percpu(struct pcpu_sw_netstats);
 	if (!dev->tstats)
 		return -ENOMEM;
 
 	for_each_possible_cpu(i) {
-		struct pcpu_tstats *ipip6_fb_stats;
+		struct pcpu_sw_netstats *ipip6_fb_stats;
 		ipip6_fb_stats = per_cpu_ptr(dev->tstats, i);
 		u64_stats_init(&ipip6_fb_stats->syncp);
 	}
diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c
index d830a95f03a4..f5275dd29cd9 100644
--- a/net/openvswitch/vport.c
+++ b/net/openvswitch/vport.c
@@ -136,14 +136,14 @@ struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *ops,
 	vport->ops = ops;
 	INIT_HLIST_NODE(&vport->dp_hash_node);
 
-	vport->percpu_stats = alloc_percpu(struct pcpu_tstats);
+	vport->percpu_stats = alloc_percpu(struct pcpu_sw_netstats);
 	if (!vport->percpu_stats) {
 		kfree(vport);
 		return ERR_PTR(-ENOMEM);
 	}
 
 	for_each_possible_cpu(i) {
-		struct pcpu_tstats *vport_stats;
+		struct pcpu_sw_netstats *vport_stats;
 		vport_stats = per_cpu_ptr(vport->percpu_stats, i);
 		u64_stats_init(&vport_stats->syncp);
 	}
@@ -275,8 +275,8 @@ void ovs_vport_get_stats(struct vport *vport, struct ovs_vport_stats *stats)
 	spin_unlock_bh(&vport->stats_lock);
 
 	for_each_possible_cpu(i) {
-		const struct pcpu_tstats *percpu_stats;
-		struct pcpu_tstats local_stats;
+		const struct pcpu_sw_netstats *percpu_stats;
+		struct pcpu_sw_netstats local_stats;
 		unsigned int start;
 
 		percpu_stats = per_cpu_ptr(vport->percpu_stats, i);
@@ -344,7 +344,7 @@ int ovs_vport_get_options(const struct vport *vport, struct sk_buff *skb)
 void ovs_vport_receive(struct vport *vport, struct sk_buff *skb,
 		       struct ovs_key_ipv4_tunnel *tun_key)
 {
-	struct pcpu_tstats *stats;
+	struct pcpu_sw_netstats *stats;
 
 	stats = this_cpu_ptr(vport->percpu_stats);
 	u64_stats_update_begin(&stats->syncp);
@@ -370,7 +370,7 @@ int ovs_vport_send(struct vport *vport, struct sk_buff *skb)
 	int sent = vport->ops->send(vport, skb);
 
 	if (likely(sent > 0)) {
-		struct pcpu_tstats *stats;
+		struct pcpu_sw_netstats *stats;
 
 		stats = this_cpu_ptr(vport->percpu_stats);
 
diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h
index 1a9fbcec6e1b..bc97ef7fa2af 100644
--- a/net/openvswitch/vport.h
+++ b/net/openvswitch/vport.h
@@ -87,7 +87,7 @@ struct vport {
 	struct hlist_node dp_hash_node;
 	const struct vport_ops *ops;
 
-	struct pcpu_tstats __percpu *percpu_stats;
+	struct pcpu_sw_netstats __percpu *percpu_stats;
 
 	spinlock_t stats_lock;
 	struct vport_err_stats err_stats;
-- 
cgit v1.2.3


From bf5a755f5e9186406bbf50f4087100af5bd68e40 Mon Sep 17 00:00:00 2001
From: Jerry Chu <hkchu@google.com>
Date: Tue, 7 Jan 2014 10:23:19 -0800
Subject: net-gre-gro: Add GRE support to the GRO stack

This patch builds on top of commit 299603e8370a93dd5d8e8d800f0dff1ce2c53d36
("net-gro: Prepare GRO stack for the upcoming tunneling support") to add
support for standard GRE (RFC1701/RFC2784/RFC2890) to the GRO
stack. It also serves as an example for supporting other encapsulation
protocols in the GRO stack in the future.

The patch supports version 0 and all the flags (key, csum, seq#) but
will flush any pkt with the S (seq#) flag. This is because the S flag
is not supported by GSO, and a GRO pkt may end up in the forwarding path,
thus requiring GSO support to break it up correctly.

Currently the "packet_offload" structure only contains L3 (ETH_P_IP/
ETH_P_IPV6) GRO offload support so the encapped pkts are limited to
IP pkts (i.e., w/o L2 hdr). But support for other protocol types can
be easily added, as can support for GRE variations like NVGRE.

The patch also supports csum offload. Specifically if the csum flag is on
and the h/w is capable of checksumming the payload (CHECKSUM_COMPLETE),
the code will take advantage of the csum computed by the h/w when
validating the GRE csum.

Note that commit 60769a5dcd8755715c7143b4571d5c44f01796f1 "ipv4: gre:
add GRO capability" already introduces GRO capability to IPv4 GRE
tunnels, using the gro_cells infrastructure. But GRO is done after
GRE hdr has been removed (i.e., decapped). The following patch applies
GRO when pkts first come in (before hitting the GRE tunnel code). There
is some performance advantage for applying GRO as early as possible.
Also this approach is transparent to other subsystems like Open vSwitch
where GRE decap is handled outside of the IP stack hence making it
harder for the gro_cells stuff to apply. On the other hand, some NICs
are still not capable of hashing on the inner hdr of a GRE pkt (RSS).
In that case the GRO processing of pkts from the same remote host will
all happen on the same CPU and the performance may be suboptimal.

I'm including some rough preliminary performance numbers below. Note
that the performance will be highly dependent on traffic load and mix, as
usual. Moreover it also depends on NIC offload features hence the
following is by no means a comprehensive study. Local testing and tuning
will be needed to decide the best setting.

All tests spawned 50 copies of netperf TCP_STREAM and ran for 30 secs.
(super_netperf 50 -H 192.168.1.18 -l 30)

An IP GRE tunnel with only the key flag on (e.g., ip tunnel add gre1
mode gre local 10.246.17.18 remote 10.246.17.17 ttl 255 key 123)
is configured.

The GRO support for pkts AFTER decap is controlled through the device
feature of the GRE device (e.g., ethtool -K gre1 gro on/off).

1.1 ethtool -K gre1 gro off; ethtool -K eth0 gro off
thruput: 9.16Gbps
CPU utilization: 19%

1.2 ethtool -K gre1 gro on; ethtool -K eth0 gro off
thruput: 5.9Gbps
CPU utilization: 15%

1.3 ethtool -K gre1 gro off; ethtool -K eth0 gro on
thruput: 9.26Gbps
CPU utilization: 12-13%

1.4 ethtool -K gre1 gro on; ethtool -K eth0 gro on
thruput: 9.26Gbps
CPU utilization: 10%

The following tests were performed on a different NIC that is capable of
csum offload. I.e., the h/w is capable of computing IP payload csum
(CHECKSUM_COMPLETE).

2.1 ethtool -K gre1 gro on (hence will use gro_cells)

2.1.1 ethtool -K eth0 gro off; csum offload disabled
thruput: 8.53Gbps
CPU utilization: 9%

2.1.2 ethtool -K eth0 gro off; csum offload enabled
thruput: 8.97Gbps
CPU utilization: 7-8%

2.1.3 ethtool -K eth0 gro on; csum offload disabled
thruput: 8.83Gbps
CPU utilization: 5-6%

2.1.4 ethtool -K eth0 gro on; csum offload enabled
thruput: 8.98Gbps
CPU utilization: 5%

2.2 ethtool -K gre1 gro off

2.2.1 ethtool -K eth0 gro off; csum offload disabled
thruput: 5.93Gbps
CPU utilization: 9%

2.2.2 ethtool -K eth0 gro off; csum offload enabled
thruput: 5.62Gbps
CPU utilization: 8%

2.2.3 ethtool -K eth0 gro on; csum offload disabled
thruput: 7.69Gbps
CPU utilization: 8%

2.2.4 ethtool -K eth0 gro on; csum offload enabled
thruput: 8.96Gbps
CPU utilization: 5-6%

Signed-off-by: H.K. Jerry Chu <hkchu@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  18 +++++-
 net/core/dev.c            |  26 ++++++++
 net/ipv4/af_inet.c        |  10 ++-
 net/ipv4/gre_offload.c    | 160 ++++++++++++++++++++++++++++++++++++++++++++++
 net/ipv4/tcp_offload.c    |   7 +-
 net/ipv6/ip6_offload.c    |   2 +-
 6 files changed, 216 insertions(+), 7 deletions(-)

(limited to 'include/linux/netdevice.h')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index d9c961aa6a7f..a2a70cc70e7b 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1632,7 +1632,10 @@ struct napi_gro_cb {
 	int data_offset;
 
 	/* This is non-zero if the packet cannot be merged with the new skb. */
-	int flush;
+	u16	flush;
+
+	/* Save the IP ID here and check when we get to the transport layer */
+	u16	flush_id;
 
 	/* Number of segments aggregated. */
 	u16	count;
@@ -1651,6 +1654,9 @@ struct napi_gro_cb {
 	/* Used in ipv6_gro_receive() */
 	int	proto;
 
+	/* used to support CHECKSUM_COMPLETE for tunneling protocols */
+	__wsum	csum;
+
 	/* used in skb_gro_receive() slow path */
 	struct sk_buff *last;
 };
@@ -1900,6 +1906,14 @@ static inline void *skb_gro_network_header(struct sk_buff *skb)
 	       skb_network_offset(skb);
 }
 
+static inline void skb_gro_postpull_rcsum(struct sk_buff *skb,
+					const void *start, unsigned int len)
+{
+	if (skb->ip_summed == CHECKSUM_COMPLETE)
+		NAPI_GRO_CB(skb)->csum = csum_sub(NAPI_GRO_CB(skb)->csum,
+						  csum_partial(start, len, 0));
+}
+
 static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev,
 				  unsigned short type,
 				  const void *daddr, const void *saddr,
@@ -2440,6 +2454,8 @@ gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb);
 void napi_gro_flush(struct napi_struct *napi, bool flush_old);
 struct sk_buff *napi_get_frags(struct napi_struct *napi);
 gro_result_t napi_gro_frags(struct napi_struct *napi);
+struct packet_offload *gro_find_receive_by_type(__be16 type);
+struct packet_offload *gro_find_complete_by_type(__be16 type);
 
 static inline void napi_free_frags(struct napi_struct *napi)
 {
diff --git a/net/core/dev.c b/net/core/dev.c
index b3c574a88026..ce01847793c0 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3846,6 +3846,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
 
 	skb_gro_reset_offset(skb);
 	gro_list_prepare(napi, skb);
+	NAPI_GRO_CB(skb)->csum = skb->csum; /* Needed for CHECKSUM_COMPLETE */
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(ptype, head, list) {
@@ -3922,6 +3923,31 @@ normal:
 	goto pull;
 }
 
+struct packet_offload *gro_find_receive_by_type(__be16 type)
+{
+	struct list_head *offload_head = &offload_base;
+	struct packet_offload *ptype;
+
+	list_for_each_entry_rcu(ptype, offload_head, list) {
+		if (ptype->type != type || !ptype->callbacks.gro_receive)
+			continue;
+		return ptype;
+	}
+	return NULL;
+}
+
+struct packet_offload *gro_find_complete_by_type(__be16 type)
+{
+	struct list_head *offload_head = &offload_base;
+	struct packet_offload *ptype;
+
+	list_for_each_entry_rcu(ptype, offload_head, list) {
+		if (ptype->type != type || !ptype->callbacks.gro_complete)
+			continue;
+		return ptype;
+	}
+	return NULL;
+}
 
 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
 {
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index b8bc1a3d5cf1..6268a4751e64 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1391,9 +1391,15 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
 		NAPI_GRO_CB(p)->flush |=
 			(iph->ttl ^ iph2->ttl) |
 			(iph->tos ^ iph2->tos) |
-			(__force int)((iph->frag_off ^ iph2->frag_off) & htons(IP_DF)) |
-			((u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) ^ id);
+			((iph->frag_off ^ iph2->frag_off) & htons(IP_DF));
 
+		/* Save the IP ID check to be included later when we get to
+		 * the transport layer so only the inner most IP ID is checked.
+		 * This is because some GSO/TSO implementations do not
+		 * correctly increment the IP ID for the outer hdrs.
+		 */
+		NAPI_GRO_CB(p)->flush_id =
+			    ((u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) ^ id);
 		NAPI_GRO_CB(p)->flush |= flush;
 	}
 
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index 9138cfb10140..746a7b10d434 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -116,10 +116,170 @@ out:
 	return segs;
 }
 
+/* Compute the whole skb csum in s/w and store it, then verify GRO csum
+ * starting from gro_offset.
+ */
+static __sum16 gro_skb_checksum(struct sk_buff *skb)
+{
+	__sum16 sum;
+
+	skb->csum = skb_checksum(skb, 0, skb->len, 0);
+	NAPI_GRO_CB(skb)->csum = csum_sub(skb->csum,
+		csum_partial(skb->data, skb_gro_offset(skb), 0));
+	sum = csum_fold(NAPI_GRO_CB(skb)->csum);
+	if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE)) {
+		if (unlikely(!sum))
+			netdev_rx_csum_fault(skb->dev);
+	} else
+		skb->ip_summed = CHECKSUM_COMPLETE;
+
+	return sum;
+}
+
+static struct sk_buff **gre_gro_receive(struct sk_buff **head,
+					struct sk_buff *skb)
+{
+	struct sk_buff **pp = NULL;
+	struct sk_buff *p;
+	const struct gre_base_hdr *greh;
+	unsigned int hlen, grehlen;
+	unsigned int off;
+	int flush = 1;
+	struct packet_offload *ptype;
+	__be16 type;
+
+	off = skb_gro_offset(skb);
+	hlen = off + sizeof(*greh);
+	greh = skb_gro_header_fast(skb, off);
+	if (skb_gro_header_hard(skb, hlen)) {
+		greh = skb_gro_header_slow(skb, hlen, off);
+		if (unlikely(!greh))
+			goto out;
+	}
+
+	/* Only support version 0 and K (key), C (csum) flags. Note that
+	 * although the support for the S (seq#) flag can be added easily
+	 * for GRO, this is problematic for GSO hence can not be enabled
+	 * here because a GRO pkt may end up in the forwarding path, thus
+	 * requiring GSO support to break it up correctly.
+	 */
+	if ((greh->flags & ~(GRE_KEY|GRE_CSUM)) != 0)
+		goto out;
+
+	type = greh->protocol;
+
+	rcu_read_lock();
+	ptype = gro_find_receive_by_type(type);
+	if (ptype == NULL)
+		goto out_unlock;
+
+	grehlen = GRE_HEADER_SECTION;
+
+	if (greh->flags & GRE_KEY)
+		grehlen += GRE_HEADER_SECTION;
+
+	if (greh->flags & GRE_CSUM)
+		grehlen += GRE_HEADER_SECTION;
+
+	hlen = off + grehlen;
+	if (skb_gro_header_hard(skb, hlen)) {
+		greh = skb_gro_header_slow(skb, hlen, off);
+		if (unlikely(!greh))
+			goto out_unlock;
+	}
+	if (greh->flags & GRE_CSUM) { /* Need to verify GRE csum first */
+		__sum16 csum = 0;
+
+		if (skb->ip_summed == CHECKSUM_COMPLETE)
+			csum = csum_fold(NAPI_GRO_CB(skb)->csum);
+		/* Don't trust csum error calculated/reported by h/w */
+		if (skb->ip_summed == CHECKSUM_NONE || csum != 0)
+			csum = gro_skb_checksum(skb);
+
+		/* GRE CSUM is the 1's complement of the 1's complement sum
+		 * of the GRE hdr plus payload so it should add up to 0xffff
+		 * (and 0 after csum_fold()) just like the IPv4 hdr csum.
+		 */
+		if (csum)
+			goto out_unlock;
+	}
+	flush = 0;
+
+	for (p = *head; p; p = p->next) {
+		const struct gre_base_hdr *greh2;
+
+		if (!NAPI_GRO_CB(p)->same_flow)
+			continue;
+
+		/* The following checks are needed to ensure only pkts
+		 * from the same tunnel are considered for aggregation.
+		 * The criteria for "the same tunnel" includes:
+		 * 1) same version (we only support version 0 here)
+		 * 2) same protocol (we only support ETH_P_IP for now)
+		 * 3) same set of flags
+		 * 4) same key if the key field is present.
+		 */
+		greh2 = (struct gre_base_hdr *)(p->data + off);
+
+		if (greh2->flags != greh->flags ||
+		    greh2->protocol != greh->protocol) {
+			NAPI_GRO_CB(p)->same_flow = 0;
+			continue;
+		}
+		if (greh->flags & GRE_KEY) {
+			/* compare keys */
+			if (*(__be32 *)(greh2+1) != *(__be32 *)(greh+1)) {
+				NAPI_GRO_CB(p)->same_flow = 0;
+				continue;
+			}
+		}
+	}
+
+	skb_gro_pull(skb, grehlen);
+
+	/* Adjusted NAPI_GRO_CB(skb)->csum after skb_gro_pull()*/
+	skb_gro_postpull_rcsum(skb, greh, grehlen);
+
+	pp = ptype->callbacks.gro_receive(head, skb);
+
+out_unlock:
+	rcu_read_unlock();
+out:
+	NAPI_GRO_CB(skb)->flush |= flush;
+
+	return pp;
+}
+
+int gre_gro_complete(struct sk_buff *skb, int nhoff)
+{
+	struct gre_base_hdr *greh = (struct gre_base_hdr *)(skb->data + nhoff);
+	struct packet_offload *ptype;
+	unsigned int grehlen = sizeof(*greh);
+	int err = -ENOENT;
+	__be16 type;
+
+	type = greh->protocol;
+	if (greh->flags & GRE_KEY)
+		grehlen += GRE_HEADER_SECTION;
+
+	if (greh->flags & GRE_CSUM)
+		grehlen += GRE_HEADER_SECTION;
+
+	rcu_read_lock();
+	ptype = gro_find_complete_by_type(type);
+	if (ptype != NULL)
+		err = ptype->callbacks.gro_complete(skb, nhoff + grehlen);
+
+	rcu_read_unlock();
+	return err;
+}
+
 static const struct net_offload gre_offload = {
 	.callbacks = {
 		.gso_send_check = gre_gso_send_check,
 		.gso_segment = gre_gso_segment,
+		.gro_receive = gre_gro_receive,
+		.gro_complete = gre_gro_complete,
 	},
 };
 
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 2658a27f540d..771a3950d87a 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -197,7 +197,8 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
 	goto out_check_final;
 
 found:
-	flush = NAPI_GRO_CB(p)->flush;
+	/* Include the IP ID check below from the inner most IP hdr */
+	flush = NAPI_GRO_CB(p)->flush | NAPI_GRO_CB(p)->flush_id;
 	flush |= (__force int)(flags & TCP_FLAG_CWR);
 	flush |= (__force int)((flags ^ tcp_flag_word(th2)) &
 		  ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH));
@@ -230,7 +231,7 @@ out_check_final:
 		pp = head;
 
 out:
-	NAPI_GRO_CB(skb)->flush |= flush;
+	NAPI_GRO_CB(skb)->flush |= (flush != 0);
 
 	return pp;
 }
@@ -280,7 +281,7 @@ static struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *
 	if (NAPI_GRO_CB(skb)->flush)
 		goto skip_csum;
 
-	wsum = skb->csum;
+	wsum = NAPI_GRO_CB(skb)->csum;
 
 	switch (skb->ip_summed) {
 	case CHECKSUM_NONE:
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index 6fb4162fa785..1e8683b135bb 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -190,7 +190,7 @@ static struct sk_buff **ipv6_gro_receive(struct sk_buff **head,
 	unsigned int nlen;
 	unsigned int hlen;
 	unsigned int off;
-	int flush = 1;
+	u16 flush = 1;
 	int proto;
 	__wsum csum;
 
-- 
cgit v1.2.3


From f663dd9aaf9ed124f25f0f8452edf238f087ad50 Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Fri, 10 Jan 2014 16:18:26 +0800
Subject: net: core: explicitly select a txq before doing l2 forwarding

Currently, the tx queue is selected implicitly in ndo_dfwd_start_xmit(). This
will cause several issues:

- NETIF_F_LLTX was removed for macvlan, so the txq lock was taken for macvlan
  instead of the lower device, which misses the necessary txq synchronization
  for the lower device, such as txq stopping or freezing required by the dev
  watchdog or control path.
- dev_hard_start_xmit() was called with a NULL txq, which bypasses the net
  device watchdog.
- dev_hard_start_xmit() does not check txq everywhere, which will lead to a
  crash when tso is disabled for the lower device.

Fix this by explicitly introducing a new param for .ndo_select_queue() for just
selecting queues in the case of l2 forwarding offload. netdev_pick_tx() was also
extended to accept this parameter and dev_queue_xmit_accel() was used to do l2
forwarding transmission.

With these fixes, NETIF_F_LLTX can be preserved for macvlan and there's no need
to check txq against NULL in dev_hard_start_xmit(). Also there's no need to keep
a dedicated ndo_dfwd_start_xmit() and we can just reuse the code of
dev_queue_xmit() to do the transmission.

In the future, this will also be required for macvtap l2 forwarding support,
since it provides a necessary synchronization method.

Cc: John Fastabend <john.r.fastabend@intel.com>
Cc: Neil Horman <nhorman@tuxdriver.com>
Cc: e1000-devel@lists.sourceforge.net
Signed-off-by: Jason Wang <jasowang@redhat.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Acked-by: John Fastabend <john.r.fastabend@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_main.c                 |  3 ++-
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c |  3 ++-
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h |  3 ++-
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c   | 33 ++++++++++---------------
 drivers/net/ethernet/lantiq_etop.c              |  3 ++-
 drivers/net/ethernet/mellanox/mlx4/en_tx.c      |  3 ++-
 drivers/net/ethernet/mellanox/mlx4/mlx4_en.h    |  3 ++-
 drivers/net/ethernet/tile/tilegx.c              |  3 ++-
 drivers/net/macvlan.c                           |  9 +++----
 drivers/net/team/team.c                         |  3 ++-
 drivers/net/tun.c                               |  3 ++-
 drivers/net/wireless/mwifiex/main.c             |  3 ++-
 drivers/staging/bcm/Bcmnet.c                    |  3 ++-
 drivers/staging/netlogic/xlr_net.c              |  3 ++-
 drivers/staging/rtl8188eu/os_dep/os_intfs.c     |  3 ++-
 include/linux/netdevice.h                       | 12 ++++++---
 net/core/dev.c                                  | 29 +++++++++++++---------
 net/core/flow_dissector.c                       | 10 +++++---
 net/core/netpoll.c                              |  2 +-
 net/mac80211/iface.c                            |  6 +++--
 net/sched/sch_generic.c                         |  2 +-
 21 files changed, 80 insertions(+), 62 deletions(-)

(limited to 'include/linux/netdevice.h')

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 398e299ee1bd..4b8c58b0ec24 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -3732,7 +3732,8 @@ static inline int bond_slave_override(struct bonding *bond,
 }
 
 
-static u16 bond_select_queue(struct net_device *dev, struct sk_buff *skb)
+static u16 bond_select_queue(struct net_device *dev, struct sk_buff *skb,
+			     void *accel_priv)
 {
 	/*
 	 * This helper function exists to help dev_pick_tx get the correct
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
index 550412088dd0..bf811565ee24 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
@@ -1833,7 +1833,8 @@ void bnx2x_netif_stop(struct bnx2x *bp, int disable_hw)
 		bnx2x_napi_disable_cnic(bp);
 }
 
-u16 bnx2x_select_queue(struct net_device *dev, struct sk_buff *skb)
+u16 bnx2x_select_queue(struct net_device *dev, struct sk_buff *skb,
+		       void *accel_priv)
 {
 	struct bnx2x *bp = netdev_priv(dev);
 
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
index da8fcaa74495..41f3ca5ad972 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
@@ -524,7 +524,8 @@ int bnx2x_set_vf_mac(struct net_device *dev, int queue, u8 *mac);
 int bnx2x_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan, u8 qos);
 
 /* select_queue callback */
-u16 bnx2x_select_queue(struct net_device *dev, struct sk_buff *skb);
+u16 bnx2x_select_queue(struct net_device *dev, struct sk_buff *skb,
+		       void *accel_priv);
 
 static inline void bnx2x_update_rx_prod(struct bnx2x *bp,
 					struct bnx2x_fastpath *fp,
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index cc06854296a3..5bcc870f8367 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -6827,12 +6827,20 @@ static inline int ixgbe_maybe_stop_tx(struct ixgbe_ring *tx_ring, u16 size)
 	return __ixgbe_maybe_stop_tx(tx_ring, size);
 }
 
-#ifdef IXGBE_FCOE
-static u16 ixgbe_select_queue(struct net_device *dev, struct sk_buff *skb)
+static u16 ixgbe_select_queue(struct net_device *dev, struct sk_buff *skb,
+			      void *accel_priv)
 {
+	struct ixgbe_fwd_adapter *fwd_adapter = accel_priv;
+#ifdef IXGBE_FCOE
 	struct ixgbe_adapter *adapter;
 	struct ixgbe_ring_feature *f;
 	int txq;
+#endif
+
+	if (fwd_adapter)
+		return skb->queue_mapping + fwd_adapter->tx_base_queue;
+
+#ifdef IXGBE_FCOE
 
 	/*
 	 * only execute the code below if protocol is FCoE
@@ -6858,9 +6866,11 @@ static u16 ixgbe_select_queue(struct net_device *dev, struct sk_buff *skb)
 		txq -= f->indices;
 
 	return txq + f->offset;
+#else
+	return __netdev_pick_tx(dev, skb);
+#endif
 }
 
-#endif
 netdev_tx_t ixgbe_xmit_frame_ring(struct sk_buff *skb,
 			  struct ixgbe_adapter *adapter,
 			  struct ixgbe_ring *tx_ring)
@@ -7629,27 +7639,11 @@ static void ixgbe_fwd_del(struct net_device *pdev, void *priv)
 	kfree(fwd_adapter);
 }
 
-static netdev_tx_t ixgbe_fwd_xmit(struct sk_buff *skb,
-				  struct net_device *dev,
-				  void *priv)
-{
-	struct ixgbe_fwd_adapter *fwd_adapter = priv;
-	unsigned int queue;
-	struct ixgbe_ring *tx_ring;
-
-	queue = skb->queue_mapping + fwd_adapter->tx_base_queue;
-	tx_ring = fwd_adapter->real_adapter->tx_ring[queue];
-
-	return __ixgbe_xmit_frame(skb, dev, tx_ring);
-}
-
 static const struct net_device_ops ixgbe_netdev_ops = {
 	.ndo_open		= ixgbe_open,
 	.ndo_stop		= ixgbe_close,
 	.ndo_start_xmit		= ixgbe_xmit_frame,
-#ifdef IXGBE_FCOE
 	.ndo_select_queue	= ixgbe_select_queue,
-#endif
 	.ndo_set_rx_mode	= ixgbe_set_rx_mode,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_set_mac_address	= ixgbe_set_mac,
@@ -7689,7 +7683,6 @@ static const struct net_device_ops ixgbe_netdev_ops = {
 	.ndo_bridge_getlink	= ixgbe_ndo_bridge_getlink,
 	.ndo_dfwd_add_station	= ixgbe_fwd_add,
 	.ndo_dfwd_del_station	= ixgbe_fwd_del,
-	.ndo_dfwd_start_xmit	= ixgbe_fwd_xmit,
 };
 
 /**
diff --git a/drivers/net/ethernet/lantiq_etop.c b/drivers/net/ethernet/lantiq_etop.c
index 6a6c1f76d8e0..ec94a20d7099 100644
--- a/drivers/net/ethernet/lantiq_etop.c
+++ b/drivers/net/ethernet/lantiq_etop.c
@@ -619,7 +619,8 @@ ltq_etop_set_multicast_list(struct net_device *dev)
 }
 
 static u16
-ltq_etop_select_queue(struct net_device *dev, struct sk_buff *skb)
+ltq_etop_select_queue(struct net_device *dev, struct sk_buff *skb,
+		      void *accel_priv)
 {
 	/* we are currently only using the first queue */
 	return 0;
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
index f54ebd5a1702..a7fcd593b2db 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
@@ -592,7 +592,8 @@ static void build_inline_wqe(struct mlx4_en_tx_desc *tx_desc, struct sk_buff *sk
 	}
 }
 
-u16 mlx4_en_select_queue(struct net_device *dev, struct sk_buff *skb)
+u16 mlx4_en_select_queue(struct net_device *dev, struct sk_buff *skb,
+			 void *accel_priv)
 {
 	struct mlx4_en_priv *priv = netdev_priv(dev);
 	u16 rings_p_up = priv->num_tx_rings_p_up;
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
index f3758de59c05..d5758adceaa2 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
@@ -714,7 +714,8 @@ int mlx4_en_set_cq_moder(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq);
 int mlx4_en_arm_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq);
 
 void mlx4_en_tx_irq(struct mlx4_cq *mcq);
-u16 mlx4_en_select_queue(struct net_device *dev, struct sk_buff *skb);
+u16 mlx4_en_select_queue(struct net_device *dev, struct sk_buff *skb,
+			 void *accel_priv);
 netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev);
 
 int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv,
diff --git a/drivers/net/ethernet/tile/tilegx.c b/drivers/net/ethernet/tile/tilegx.c
index 628b736e5ae7..0e9fb3301b11 100644
--- a/drivers/net/ethernet/tile/tilegx.c
+++ b/drivers/net/ethernet/tile/tilegx.c
@@ -2080,7 +2080,8 @@ static int tile_net_tx(struct sk_buff *skb, struct net_device *dev)
 }
 
 /* Return subqueue id on this core (one per core). */
-static u16 tile_net_select_queue(struct net_device *dev, struct sk_buff *skb)
+static u16 tile_net_select_queue(struct net_device *dev, struct sk_buff *skb,
+				 void *accel_priv)
 {
 	return smp_processor_id();
 }
diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index 5360f73c9817..bc8faaec33f5 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -299,7 +299,7 @@ netdev_tx_t macvlan_start_xmit(struct sk_buff *skb,
 
 	if (vlan->fwd_priv) {
 		skb->dev = vlan->lowerdev;
-		ret = dev_hard_start_xmit(skb, skb->dev, NULL, vlan->fwd_priv);
+		ret = dev_queue_xmit_accel(skb, vlan->fwd_priv);
 	} else {
 		ret = macvlan_queue_xmit(skb, dev);
 	}
@@ -365,10 +365,8 @@ static int macvlan_open(struct net_device *dev)
 		 */
 		if (IS_ERR_OR_NULL(vlan->fwd_priv)) {
 			vlan->fwd_priv = NULL;
-		} else {
-			dev->features &= ~NETIF_F_LLTX;
+		} else
 			return 0;
-		}
 	}
 
 	err = -EBUSY;
@@ -702,8 +700,7 @@ static netdev_features_t macvlan_fix_features(struct net_device *dev,
 	features = netdev_increment_features(vlan->lowerdev->features,
 					     features,
 					     mask);
-	if (!vlan->fwd_priv)
-		features |= NETIF_F_LLTX;
+	features |= NETIF_F_LLTX;
 
 	return features;
 }
diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c
index 736050d6b451..b75ae5bde673 100644
--- a/drivers/net/team/team.c
+++ b/drivers/net/team/team.c
@@ -1647,7 +1647,8 @@ static netdev_tx_t team_xmit(struct sk_buff *skb, struct net_device *dev)
 	return NETDEV_TX_OK;
 }
 
-static u16 team_select_queue(struct net_device *dev, struct sk_buff *skb)
+static u16 team_select_queue(struct net_device *dev, struct sk_buff *skb,
+			     void *accel_priv)
 {
 	/*
 	 * This helper function exists to help dev_pick_tx get the correct
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 7c8343a4f918..ecec8029c5e8 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -348,7 +348,8 @@ unlock:
  * different rxq no. here. If we could not get rxhash, then we would
  * hope the rxq no. may help here.
  */
-static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb)
+static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb,
+			    void *accel_priv)
 {
 	struct tun_struct *tun = netdev_priv(dev);
 	struct tun_flow_entry *e;
diff --git a/drivers/net/wireless/mwifiex/main.c b/drivers/net/wireless/mwifiex/main.c
index 78e8a6666cc6..8bb8988c435c 100644
--- a/drivers/net/wireless/mwifiex/main.c
+++ b/drivers/net/wireless/mwifiex/main.c
@@ -746,7 +746,8 @@ static struct net_device_stats *mwifiex_get_stats(struct net_device *dev)
 }
 
 static u16
-mwifiex_netdev_select_wmm_queue(struct net_device *dev, struct sk_buff *skb)
+mwifiex_netdev_select_wmm_queue(struct net_device *dev, struct sk_buff *skb,
+				void *accel_priv)
 {
 	skb->priority = cfg80211_classify8021d(skb);
 	return mwifiex_1d_to_wmm_queue[skb->priority];
diff --git a/drivers/staging/bcm/Bcmnet.c b/drivers/staging/bcm/Bcmnet.c
index 53fee2f9a498..8dfdd2732bdc 100644
--- a/drivers/staging/bcm/Bcmnet.c
+++ b/drivers/staging/bcm/Bcmnet.c
@@ -39,7 +39,8 @@ static INT bcm_close(struct net_device *dev)
 	return 0;
 }
 
-static u16 bcm_select_queue(struct net_device *dev, struct sk_buff *skb)
+static u16 bcm_select_queue(struct net_device *dev, struct sk_buff *skb,
+			    void *accel_priv)
 {
 	return ClassifyPacket(netdev_priv(dev), skb);
 }
diff --git a/drivers/staging/netlogic/xlr_net.c b/drivers/staging/netlogic/xlr_net.c
index 235d2b1ec593..eedffed17e39 100644
--- a/drivers/staging/netlogic/xlr_net.c
+++ b/drivers/staging/netlogic/xlr_net.c
@@ -306,7 +306,8 @@ static netdev_tx_t xlr_net_start_xmit(struct sk_buff *skb,
 	return NETDEV_TX_OK;
 }
 
-static u16 xlr_net_select_queue(struct net_device *ndev, struct sk_buff *skb)
+static u16 xlr_net_select_queue(struct net_device *ndev, struct sk_buff *skb,
+				void *accel_priv)
 {
 	return (u16)smp_processor_id();
 }
diff --git a/drivers/staging/rtl8188eu/os_dep/os_intfs.c b/drivers/staging/rtl8188eu/os_dep/os_intfs.c
index 17659bb04bef..dd69e344e409 100644
--- a/drivers/staging/rtl8188eu/os_dep/os_intfs.c
+++ b/drivers/staging/rtl8188eu/os_dep/os_intfs.c
@@ -652,7 +652,8 @@ static unsigned int rtw_classify8021d(struct sk_buff *skb)
 	return dscp >> 5;
 }
 
-static u16 rtw_select_queue(struct net_device *dev, struct sk_buff *skb)
+static u16 rtw_select_queue(struct net_device *dev, struct sk_buff *skb,
+			    void *accel_priv)
 {
 	struct adapter	*padapter = rtw_netdev_priv(dev);
 	struct mlme_priv *pmlmepriv = &padapter->mlmepriv;
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 5faaadb0c74f..ce2a1f5f9a1e 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -769,7 +769,8 @@ struct netdev_phys_port_id {
  *        (can also return NETDEV_TX_LOCKED iff NETIF_F_LLTX)
  *	Required can not be NULL.
  *
- * u16 (*ndo_select_queue)(struct net_device *dev, struct sk_buff *skb);
+ * u16 (*ndo_select_queue)(struct net_device *dev, struct sk_buff *skb,
+ *                         void *accel_priv);
  *	Called to decide which queue to when device supports multiple
  *	transmit queues.
  *
@@ -990,7 +991,8 @@ struct net_device_ops {
 	netdev_tx_t		(*ndo_start_xmit) (struct sk_buff *skb,
 						   struct net_device *dev);
 	u16			(*ndo_select_queue)(struct net_device *dev,
-						    struct sk_buff *skb);
+						    struct sk_buff *skb,
+						    void *accel_priv);
 	void			(*ndo_change_rx_flags)(struct net_device *dev,
 						       int flags);
 	void			(*ndo_set_rx_mode)(struct net_device *dev);
@@ -1529,7 +1531,8 @@ static inline void netdev_for_each_tx_queue(struct net_device *dev,
 }
 
 struct netdev_queue *netdev_pick_tx(struct net_device *dev,
-				    struct sk_buff *skb);
+				    struct sk_buff *skb,
+				    void *accel_priv);
 u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb);
 
 /*
@@ -1819,6 +1822,7 @@ int dev_close(struct net_device *dev);
 void dev_disable_lro(struct net_device *dev);
 int dev_loopback_xmit(struct sk_buff *newskb);
 int dev_queue_xmit(struct sk_buff *skb);
+int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv);
 int register_netdevice(struct net_device *dev);
 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head);
 void unregister_netdevice_many(struct list_head *head);
@@ -2426,7 +2430,7 @@ int dev_change_carrier(struct net_device *, bool new_carrier);
 int dev_get_phys_port_id(struct net_device *dev,
 			 struct netdev_phys_port_id *ppid);
 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
-			struct netdev_queue *txq, void *accel_priv);
+			struct netdev_queue *txq);
 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb);
 
 extern int		netdev_budget;
diff --git a/net/core/dev.c b/net/core/dev.c
index 4fc17221545d..0ce469e5ec80 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2539,7 +2539,7 @@ static inline int skb_needs_linearize(struct sk_buff *skb,
 }
 
 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
-			struct netdev_queue *txq, void *accel_priv)
+			struct netdev_queue *txq)
 {
 	const struct net_device_ops *ops = dev->netdev_ops;
 	int rc = NETDEV_TX_OK;
@@ -2605,13 +2605,10 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
 			dev_queue_xmit_nit(skb, dev);
 
 		skb_len = skb->len;
-		if (accel_priv)
-			rc = ops->ndo_dfwd_start_xmit(skb, dev, accel_priv);
-		else
 			rc = ops->ndo_start_xmit(skb, dev);
 
 		trace_net_dev_xmit(skb, rc, dev, skb_len);
-		if (rc == NETDEV_TX_OK && txq)
+		if (rc == NETDEV_TX_OK)
 			txq_trans_update(txq);
 		return rc;
 	}
@@ -2627,10 +2624,7 @@ gso:
 			dev_queue_xmit_nit(nskb, dev);
 
 		skb_len = nskb->len;
-		if (accel_priv)
-			rc = ops->ndo_dfwd_start_xmit(nskb, dev, accel_priv);
-		else
-			rc = ops->ndo_start_xmit(nskb, dev);
+		rc = ops->ndo_start_xmit(nskb, dev);
 		trace_net_dev_xmit(nskb, rc, dev, skb_len);
 		if (unlikely(rc != NETDEV_TX_OK)) {
 			if (rc & ~NETDEV_TX_MASK)
@@ -2811,7 +2805,7 @@ EXPORT_SYMBOL(dev_loopback_xmit);
  *      the BH enable code must have IRQs enabled so that it will not deadlock.
  *          --BLG
  */
-int dev_queue_xmit(struct sk_buff *skb)
+int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
 {
 	struct net_device *dev = skb->dev;
 	struct netdev_queue *txq;
@@ -2827,7 +2821,7 @@ int dev_queue_xmit(struct sk_buff *skb)
 
 	skb_update_prio(skb);
 
-	txq = netdev_pick_tx(dev, skb);
+	txq = netdev_pick_tx(dev, skb, accel_priv);
 	q = rcu_dereference_bh(txq->qdisc);
 
 #ifdef CONFIG_NET_CLS_ACT
@@ -2863,7 +2857,7 @@ int dev_queue_xmit(struct sk_buff *skb)
 
 			if (!netif_xmit_stopped(txq)) {
 				__this_cpu_inc(xmit_recursion);
-				rc = dev_hard_start_xmit(skb, dev, txq, NULL);
+				rc = dev_hard_start_xmit(skb, dev, txq);
 				__this_cpu_dec(xmit_recursion);
 				if (dev_xmit_complete(rc)) {
 					HARD_TX_UNLOCK(dev, txq);
@@ -2892,8 +2886,19 @@ out:
 	rcu_read_unlock_bh();
 	return rc;
 }
+
+int dev_queue_xmit(struct sk_buff *skb)
+{
+	return __dev_queue_xmit(skb, NULL);
+}
 EXPORT_SYMBOL(dev_queue_xmit);
 
+int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
+{
+	return __dev_queue_xmit(skb, accel_priv);
+}
+EXPORT_SYMBOL(dev_queue_xmit_accel);
+
 
 /*=======================================================================
 			Receiver routines
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index d6ef17322500..2fc5beaf5783 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -395,17 +395,21 @@ u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
 EXPORT_SYMBOL(__netdev_pick_tx);
 
 struct netdev_queue *netdev_pick_tx(struct net_device *dev,
-				    struct sk_buff *skb)
+				    struct sk_buff *skb,
+				    void *accel_priv)
 {
 	int queue_index = 0;
 
 	if (dev->real_num_tx_queues != 1) {
 		const struct net_device_ops *ops = dev->netdev_ops;
 		if (ops->ndo_select_queue)
-			queue_index = ops->ndo_select_queue(dev, skb);
+			queue_index = ops->ndo_select_queue(dev, skb,
+							    accel_priv);
 		else
 			queue_index = __netdev_pick_tx(dev, skb);
-		queue_index = dev_cap_txqueue(dev, queue_index);
+
+		if (!accel_priv)
+			queue_index = dev_cap_txqueue(dev, queue_index);
 	}
 
 	skb_set_queue_mapping(skb, queue_index);
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 303097874633..19fe9c717ced 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -375,7 +375,7 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb,
 	if (skb_queue_len(&npinfo->txq) == 0 && !netpoll_owner_active(dev)) {
 		struct netdev_queue *txq;
 
-		txq = netdev_pick_tx(dev, skb);
+		txq = netdev_pick_tx(dev, skb, NULL);
 
 		/* try until next clock tick */
 		for (tries = jiffies_to_usecs(1)/USEC_PER_POLL;
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index 36c3a4cbcabf..a0757913046e 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -1061,7 +1061,8 @@ static void ieee80211_uninit(struct net_device *dev)
 }
 
 static u16 ieee80211_netdev_select_queue(struct net_device *dev,
-					 struct sk_buff *skb)
+					 struct sk_buff *skb,
+					 void *accel_priv)
 {
 	return ieee80211_select_queue(IEEE80211_DEV_TO_SUB_IF(dev), skb);
 }
@@ -1078,7 +1079,8 @@ static const struct net_device_ops ieee80211_dataif_ops = {
 };
 
 static u16 ieee80211_monitor_select_queue(struct net_device *dev,
-					  struct sk_buff *skb)
+					  struct sk_buff *skb,
+					  void *accel_priv)
 {
 	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 	struct ieee80211_local *local = sdata->local;
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 922a09406ba7..7fc899a943a8 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -126,7 +126,7 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
 
 	HARD_TX_LOCK(dev, txq, smp_processor_id());
 	if (!netif_xmit_frozen_or_stopped(txq))
-		ret = dev_hard_start_xmit(skb, dev, txq, NULL);
+		ret = dev_hard_start_xmit(skb, dev, txq);
 
 	HARD_TX_UNLOCK(dev, txq);
 
-- 
cgit v1.2.3


From 5bb025fae53889cc99a21058c5dd369bf8cce820 Mon Sep 17 00:00:00 2001
From: Veaceslav Falico <vfalico@redhat.com>
Date: Tue, 14 Jan 2014 21:58:51 +0100
Subject: net: rename sysfs symlinks on device name change

Currently, we don't rename the upper/lower_ifc symlinks in
/sys/class/net/*/, which might result in stale/duplicate links/names.

Fix this by adding netdev_adjacent_rename_links(dev, oldname) which renames
all the upper/lower interface's links to dev from the upper/lower_oldname
to the new name.

We don't need a rollback because only we control these symlinks, and if we
fail to rename them, sysfs will complain anyway.

Reported-by: Ding Tianhong <dingtianhong@huawei.com>
CC: Ding Tianhong <dingtianhong@huawei.com>
CC: "David S. Miller" <davem@davemloft.net>
CC: Eric Dumazet <edumazet@google.com>
CC: Nicolas Dichtel <nicolas.dichtel@6wind.com>
CC: Cong Wang <amwang@redhat.com>
Signed-off-by: Veaceslav Falico <vfalico@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  1 +
 net/core/dev.c            | 22 ++++++++++++++++++++++
 2 files changed, 23 insertions(+)

(limited to 'include/linux/netdevice.h')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 5c88ab19b3eb..30f6513f3b4d 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2941,6 +2941,7 @@ int netdev_master_upper_dev_link_private(struct net_device *dev,
 					 void *private);
 void netdev_upper_dev_unlink(struct net_device *dev,
 			     struct net_device *upper_dev);
+void netdev_adjacent_rename_links(struct net_device *dev, char *oldname);
 void *netdev_lower_dev_get_private(struct net_device *dev,
 				   struct net_device *lower_dev);
 int skb_checksum_help(struct sk_buff *skb);
diff --git a/net/core/dev.c b/net/core/dev.c
index 130d3bd0ce6f..995755733997 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1119,6 +1119,8 @@ rollback:
 
 	write_seqcount_end(&devnet_rename_seq);
 
+	netdev_adjacent_rename_links(dev, oldname);
+
 	write_lock_bh(&dev_base_lock);
 	hlist_del_rcu(&dev->name_hlist);
 	write_unlock_bh(&dev_base_lock);
@@ -1138,6 +1140,7 @@ rollback:
 			err = ret;
 			write_seqcount_begin(&devnet_rename_seq);
 			memcpy(dev->name, oldname, IFNAMSIZ);
+			memcpy(oldname, newname, IFNAMSIZ);
 			goto rollback;
 		} else {
 			pr_err("%s: name change rollback failed: %d\n",
@@ -4999,6 +5002,25 @@ void netdev_upper_dev_unlink(struct net_device *dev,
 }
 EXPORT_SYMBOL(netdev_upper_dev_unlink);
 
+void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
+{
+	struct netdev_adjacent *iter;
+
+	list_for_each_entry(iter, &dev->adj_list.upper, list) {
+		netdev_adjacent_sysfs_del(iter->dev, oldname,
+					  &iter->dev->adj_list.lower);
+		netdev_adjacent_sysfs_add(iter->dev, dev,
+					  &iter->dev->adj_list.lower);
+	}
+
+	list_for_each_entry(iter, &dev->adj_list.lower, list) {
+		netdev_adjacent_sysfs_del(iter->dev, oldname,
+					  &iter->dev->adj_list.upper);
+		netdev_adjacent_sysfs_add(iter->dev, dev,
+					  &iter->dev->adj_list.upper);
+	}
+}
+
 void *netdev_lower_dev_get_private(struct net_device *dev,
 				   struct net_device *lower_dev)
 {
-- 
cgit v1.2.3


From 1d486bfb66971ebacc2a46a23431ace9af70dc66 Mon Sep 17 00:00:00 2001
From: Veaceslav Falico <vfalico@redhat.com>
Date: Thu, 16 Jan 2014 00:02:18 +0100
Subject: net: add NETDEV_PRECHANGEMTU to notify before mtu change happens

Currently, if a device changes its mtu, first the change happens (involving
all the side effects), and after that the NETDEV_CHANGEMTU is sent so that
other devices can catch up with the new mtu. However, if they return
NOTIFY_BAD, then the change is reverted and an error is returned.

This is a really long and costly operation (sometimes). To fix this, add
NETDEV_PRECHANGEMTU notification which is called prior to any change
actually happening, and if any callee returns NOTIFY_BAD - the change is
aborted. This way we skip all the applying/reverting of the mtu.

CC: "David S. Miller" <davem@davemloft.net>
CC: Jiri Pirko <jiri@resnulli.us>
CC: Eric Dumazet <edumazet@google.com>
CC: Nicolas Dichtel <nicolas.dichtel@6wind.com>
CC: Cong Wang <amwang@redhat.com>
Signed-off-by: Veaceslav Falico <vfalico@redhat.com>
Acked-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 3 ++-
 net/core/dev.c            | 5 +++++
 2 files changed, 7 insertions(+), 1 deletion(-)

(limited to 'include/linux/netdevice.h')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 30f6513f3b4d..d7668b881d08 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1718,7 +1718,7 @@ struct pcpu_sw_netstats {
 #define NETDEV_CHANGE	0x0004	/* Notify device state change */
 #define NETDEV_REGISTER 0x0005
 #define NETDEV_UNREGISTER	0x0006
-#define NETDEV_CHANGEMTU	0x0007
+#define NETDEV_CHANGEMTU	0x0007 /* notify after mtu change happened */
 #define NETDEV_CHANGEADDR	0x0008
 #define NETDEV_GOING_DOWN	0x0009
 #define NETDEV_CHANGENAME	0x000A
@@ -1734,6 +1734,7 @@ struct pcpu_sw_netstats {
 #define NETDEV_JOIN		0x0014
 #define NETDEV_CHANGEUPPER	0x0015
 #define NETDEV_RESEND_IGMP	0x0016
+#define NETDEV_PRECHANGEMTU	0x0017 /* notify before mtu change happened */
 
 int register_netdevice_notifier(struct notifier_block *nb);
 int unregister_netdevice_notifier(struct notifier_block *nb);
diff --git a/net/core/dev.c b/net/core/dev.c
index b2c1869b04e3..f87bedd51eed 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5392,6 +5392,11 @@ int dev_set_mtu(struct net_device *dev, int new_mtu)
 	if (!netif_device_present(dev))
 		return -ENODEV;
 
+	err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
+	err = notifier_to_errno(err);
+	if (err)
+		return err;
+
 	orig_mtu = dev->mtu;
 	err = __dev_set_mtu(dev, new_mtu);
 
-- 
cgit v1.2.3


From a953be53ce40440acb4740edb48577b9468d4c3d Mon Sep 17 00:00:00 2001
From: Michael Dalton <mwdalton@google.com>
Date: Thu, 16 Jan 2014 22:23:28 -0800
Subject: net-sysfs: add support for device-specific rx queue sysfs attributes

Extend existing support for netdevice receive queue sysfs attributes to
permit a device-specific attribute group. Initial use case for this
support will be to allow the virtio-net device to export per-receive
queue mergeable receive buffer size.

Signed-off-by: Michael Dalton <mwdalton@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 35 +++++++++++++++++++++++++++++----
 net/core/dev.c            | 12 ++++++------
 net/core/net-sysfs.c      | 50 +++++++++++++++++++++++++++--------------------
 3 files changed, 66 insertions(+), 31 deletions(-)

(limited to 'include/linux/netdevice.h')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index d7668b881d08..e985231fe04b 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -668,15 +668,28 @@ extern struct rps_sock_flow_table __rcu *rps_sock_flow_table;
 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, u32 flow_id,
 			 u16 filter_id);
 #endif
+#endif /* CONFIG_RPS */
 
 /* This structure contains an instance of an RX queue. */
 struct netdev_rx_queue {
+#ifdef CONFIG_RPS
 	struct rps_map __rcu		*rps_map;
 	struct rps_dev_flow_table __rcu	*rps_flow_table;
+#endif
 	struct kobject			kobj;
 	struct net_device		*dev;
 } ____cacheline_aligned_in_smp;
-#endif /* CONFIG_RPS */
+
+/*
+ * RX queue sysfs structures and functions.
+ */
+struct rx_queue_attribute {
+	struct attribute attr;
+	ssize_t (*show)(struct netdev_rx_queue *queue,
+	    struct rx_queue_attribute *attr, char *buf);
+	ssize_t (*store)(struct netdev_rx_queue *queue,
+	    struct rx_queue_attribute *attr, const char *buf, size_t len);
+};
 
 #ifdef CONFIG_XPS
 /*
@@ -1313,7 +1326,7 @@ struct net_device {
 						   unicast) */
 
 
-#ifdef CONFIG_RPS
+#ifdef CONFIG_SYSFS
 	struct netdev_rx_queue	*_rx;
 
 	/* Number of RX queues allocated at register_netdev() time */
@@ -1424,6 +1437,8 @@ struct net_device {
 	struct device		dev;
 	/* space for optional device, statistics, and wireless sysfs groups */
 	const struct attribute_group *sysfs_groups[4];
+	/* space for optional per-rx queue attributes */
+	const struct attribute_group *sysfs_rx_queue_group;
 
 	/* rtnetlink link ops */
 	const struct rtnl_link_ops *rtnl_link_ops;
@@ -2375,7 +2390,7 @@ static inline bool netif_is_multiqueue(const struct net_device *dev)
 
 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq);
 
-#ifdef CONFIG_RPS
+#ifdef CONFIG_SYSFS
 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq);
 #else
 static inline int netif_set_real_num_rx_queues(struct net_device *dev,
@@ -2394,7 +2409,7 @@ static inline int netif_copy_real_num_queues(struct net_device *to_dev,
 					   from_dev->real_num_tx_queues);
 	if (err)
 		return err;
-#ifdef CONFIG_RPS
+#ifdef CONFIG_SYSFS
 	return netif_set_real_num_rx_queues(to_dev,
 					    from_dev->real_num_rx_queues);
 #else
@@ -2402,6 +2417,18 @@ static inline int netif_copy_real_num_queues(struct net_device *to_dev,
 #endif
 }
 
+#ifdef CONFIG_SYSFS
+static inline unsigned int get_netdev_rx_queue_index(
+		struct netdev_rx_queue *queue)
+{
+	struct net_device *dev = queue->dev;
+	int index = queue - dev->_rx;
+
+	BUG_ON(index >= dev->num_rx_queues);
+	return index;
+}
+#endif
+
 #define DEFAULT_MAX_NUM_RSS_QUEUES	(8)
 int netif_get_num_default_rss_queues(void);
 
diff --git a/net/core/dev.c b/net/core/dev.c
index f87bedd51eed..288df6232006 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2083,7 +2083,7 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
 }
 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
 
-#ifdef CONFIG_RPS
+#ifdef CONFIG_SYSFS
 /**
  *	netif_set_real_num_rx_queues - set actual number of RX queues used
  *	@dev: Network device
@@ -5764,7 +5764,7 @@ void netif_stacked_transfer_operstate(const struct net_device *rootdev,
 }
 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
 
-#ifdef CONFIG_RPS
+#ifdef CONFIG_SYSFS
 static int netif_alloc_rx_queues(struct net_device *dev)
 {
 	unsigned int i, count = dev->num_rx_queues;
@@ -6309,7 +6309,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 		return NULL;
 	}
 
-#ifdef CONFIG_RPS
+#ifdef CONFIG_SYSFS
 	if (rxqs < 1) {
 		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
 		return NULL;
@@ -6365,7 +6365,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 	if (netif_alloc_netdev_queues(dev))
 		goto free_all;
 
-#ifdef CONFIG_RPS
+#ifdef CONFIG_SYSFS
 	dev->num_rx_queues = rxqs;
 	dev->real_num_rx_queues = rxqs;
 	if (netif_alloc_rx_queues(dev))
@@ -6385,7 +6385,7 @@ free_all:
 free_pcpu:
 	free_percpu(dev->pcpu_refcnt);
 	netif_free_tx_queues(dev);
-#ifdef CONFIG_RPS
+#ifdef CONFIG_SYSFS
 	kfree(dev->_rx);
 #endif
 
@@ -6410,7 +6410,7 @@ void free_netdev(struct net_device *dev)
 	release_net(dev_net(dev));
 
 	netif_free_tx_queues(dev);
-#ifdef CONFIG_RPS
+#ifdef CONFIG_SYSFS
 	kfree(dev->_rx);
 #endif
 
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 49843bf7e43e..7eeadeecc5a2 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -498,17 +498,7 @@ static struct attribute_group wireless_group = {
 #define net_class_groups	NULL
 #endif /* CONFIG_SYSFS */
 
-#ifdef CONFIG_RPS
-/*
- * RX queue sysfs structures and functions.
- */
-struct rx_queue_attribute {
-	struct attribute attr;
-	ssize_t (*show)(struct netdev_rx_queue *queue,
-	    struct rx_queue_attribute *attr, char *buf);
-	ssize_t (*store)(struct netdev_rx_queue *queue,
-	    struct rx_queue_attribute *attr, const char *buf, size_t len);
-};
+#ifdef CONFIG_SYSFS
 #define to_rx_queue_attr(_attr) container_of(_attr,		\
     struct rx_queue_attribute, attr)
 
@@ -543,6 +533,7 @@ static const struct sysfs_ops rx_queue_sysfs_ops = {
 	.store = rx_queue_attr_store,
 };
 
+#ifdef CONFIG_RPS
 static ssize_t show_rps_map(struct netdev_rx_queue *queue,
 			    struct rx_queue_attribute *attribute, char *buf)
 {
@@ -718,16 +709,20 @@ static struct rx_queue_attribute rps_cpus_attribute =
 static struct rx_queue_attribute rps_dev_flow_table_cnt_attribute =
 	__ATTR(rps_flow_cnt, S_IRUGO | S_IWUSR,
 	    show_rps_dev_flow_table_cnt, store_rps_dev_flow_table_cnt);
+#endif /* CONFIG_RPS */
 
 static struct attribute *rx_queue_default_attrs[] = {
+#ifdef CONFIG_RPS
 	&rps_cpus_attribute.attr,
 	&rps_dev_flow_table_cnt_attribute.attr,
+#endif
 	NULL
 };
 
 static void rx_queue_release(struct kobject *kobj)
 {
 	struct netdev_rx_queue *queue = to_rx_queue(kobj);
+#ifdef CONFIG_RPS
 	struct rps_map *map;
 	struct rps_dev_flow_table *flow_table;
 
@@ -743,6 +738,7 @@ static void rx_queue_release(struct kobject *kobj)
 		RCU_INIT_POINTER(queue->rps_flow_table, NULL);
 		call_rcu(&flow_table->rcu, rps_dev_flow_table_release);
 	}
+#endif
 
 	memset(kobj, 0, sizeof(*kobj));
 	dev_put(queue->dev);
@@ -763,25 +759,36 @@ static int rx_queue_add_kobject(struct net_device *net, int index)
 	kobj->kset = net->queues_kset;
 	error = kobject_init_and_add(kobj, &rx_queue_ktype, NULL,
 	    "rx-%u", index);
-	if (error) {
-		kobject_put(kobj);
-		return error;
+	if (error)
+		goto exit;
+
+	if (net->sysfs_rx_queue_group) {
+		error = sysfs_create_group(kobj, net->sysfs_rx_queue_group);
+		if (error)
+			goto exit;
 	}
 
 	kobject_uevent(kobj, KOBJ_ADD);
 	dev_hold(queue->dev);
 
+	return error;
+exit:
+	kobject_put(kobj);
 	return error;
 }
-#endif /* CONFIG_RPS */
+#endif /* CONFIG_SYSFS */
 
 int
 net_rx_queue_update_kobjects(struct net_device *net, int old_num, int new_num)
 {
-#ifdef CONFIG_RPS
+#ifdef CONFIG_SYSFS
 	int i;
 	int error = 0;
 
+#ifndef CONFIG_RPS
+	if (!net->sysfs_rx_queue_group)
+		return 0;
+#endif
 	for (i = old_num; i < new_num; i++) {
 		error = rx_queue_add_kobject(net, i);
 		if (error) {
@@ -790,8 +797,12 @@ net_rx_queue_update_kobjects(struct net_device *net, int old_num, int new_num)
 		}
 	}
 
-	while (--i >= new_num)
+	while (--i >= new_num) {
+		if (net->sysfs_rx_queue_group)
+			sysfs_remove_group(&net->_rx[i].kobj,
+					   net->sysfs_rx_queue_group);
 		kobject_put(&net->_rx[i].kobj);
+	}
 
 	return error;
 #else
@@ -1155,9 +1166,6 @@ static int register_queue_kobjects(struct net_device *net)
 	    NULL, &net->dev.kobj);
 	if (!net->queues_kset)
 		return -ENOMEM;
-#endif
-
-#ifdef CONFIG_RPS
 	real_rx = net->real_num_rx_queues;
 #endif
 	real_tx = net->real_num_tx_queues;
@@ -1184,7 +1192,7 @@ static void remove_queue_kobjects(struct net_device *net)
 {
 	int real_rx = 0, real_tx = 0;
 
-#ifdef CONFIG_RPS
+#ifdef CONFIG_SYSFS
 	real_rx = net->real_num_rx_queues;
 #endif
 	real_tx = net->real_num_tx_queues;
-- 
cgit v1.2.3


From 1d3ee88ae0d605629bf369ab0b868dae8ca62a48 Mon Sep 17 00:00:00 2001
From: "sfeldma@cumulusnetworks.com" <sfeldma@cumulusnetworks.com>
Date: Thu, 16 Jan 2014 22:57:56 -0800
Subject: bonding: add netlink attributes to slave link dev

If link is IFF_SLAVE, extend link dev netlink attributes to include
slave attributes with new IFLA_SLAVE nest.  Add netlink notification
(RTM_NEWLINK) when slave status changes from backup to active, or
vice versa.

Adds new ndo_get_slave op to net_device_ops to fill skb with IFLA_SLAVE
attributes.  Currently only used by bonding driver, but could be
used by other aggregating devices with slaves.

Signed-off-by: Scott Feldman <sfeldma@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_main.c    |  1 +
 drivers/net/bonding/bond_netlink.c | 36 +++++++++++++++++++++++++
 drivers/net/bonding/bonding.h      | 11 ++++++--
 include/linux/netdevice.h          |  5 ++++
 include/uapi/linux/if_link.h       | 13 +++++++++
 net/core/rtnetlink.c               | 54 ++++++++++++++++++++++++++++++++++++++
 6 files changed, 118 insertions(+), 2 deletions(-)

(limited to 'include/linux/netdevice.h')

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index df85cec3e5d9..3220b488dd1e 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -3883,6 +3883,7 @@ static const struct net_device_ops bond_netdev_ops = {
 #endif
 	.ndo_add_slave		= bond_enslave,
 	.ndo_del_slave		= bond_release,
+	.ndo_get_slave		= bond_get_slave,
 	.ndo_fix_features	= bond_fix_features,
 };
 
diff --git a/drivers/net/bonding/bond_netlink.c b/drivers/net/bonding/bond_netlink.c
index 555c7837d8e6..21c648854a8c 100644
--- a/drivers/net/bonding/bond_netlink.c
+++ b/drivers/net/bonding/bond_netlink.c
@@ -22,6 +22,42 @@
 #include <linux/reciprocal_div.h>
 #include "bonding.h"
 
+int bond_get_slave(struct net_device *slave_dev, struct sk_buff *skb)
+{
+	struct slave *slave = bond_slave_get_rtnl(slave_dev);
+	const struct aggregator *agg;
+
+	if (nla_put_u8(skb, IFLA_SLAVE_STATE, bond_slave_state(slave)))
+		goto nla_put_failure;
+
+	if (nla_put_u8(skb, IFLA_SLAVE_MII_STATUS, slave->link))
+		goto nla_put_failure;
+
+	if (nla_put_u32(skb, IFLA_SLAVE_LINK_FAILURE_COUNT,
+			slave->link_failure_count))
+		goto nla_put_failure;
+
+	if (nla_put(skb, IFLA_SLAVE_PERM_HWADDR,
+		    slave_dev->addr_len, slave->perm_hwaddr))
+		goto nla_put_failure;
+
+	if (nla_put_u16(skb, IFLA_SLAVE_QUEUE_ID, slave->queue_id))
+		goto nla_put_failure;
+
+	if (slave->bond->params.mode == BOND_MODE_8023AD) {
+		agg = SLAVE_AD_INFO(slave).port.aggregator;
+		if (agg)
+			if (nla_put_u16(skb, IFLA_SLAVE_AD_AGGREGATOR_ID,
+					agg->aggregator_identifier))
+				goto nla_put_failure;
+	}
+
+	return 0;
+
+nla_put_failure:
+	return -EMSGSIZE;
+}
+
 static const struct nla_policy bond_policy[IFLA_BOND_MAX + 1] = {
 	[IFLA_BOND_MODE]		= { .type = NLA_U8 },
 	[IFLA_BOND_ACTIVE_SLAVE]	= { .type = NLA_U32 },
diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h
index 309757d8482b..8a935f8f2b3c 100644
--- a/drivers/net/bonding/bonding.h
+++ b/drivers/net/bonding/bonding.h
@@ -285,12 +285,18 @@ static inline bool bond_is_lb(const struct bonding *bond)
 
 static inline void bond_set_active_slave(struct slave *slave)
 {
-	slave->backup = 0;
+	if (slave->backup) {
+		slave->backup = 0;
+		rtmsg_ifinfo(RTM_NEWLINK, slave->dev, 0, GFP_KERNEL);
+	}
 }
 
 static inline void bond_set_backup_slave(struct slave *slave)
 {
-	slave->backup = 1;
+	if (!slave->backup) {
+		slave->backup = 1;
+		rtmsg_ifinfo(RTM_NEWLINK, slave->dev, 0, GFP_KERNEL);
+	}
 }
 
 static inline int bond_slave_state(struct slave *slave)
@@ -426,6 +432,7 @@ int bond_sysfs_slave_add(struct slave *slave);
 void bond_sysfs_slave_del(struct slave *slave);
 int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev);
 int bond_release(struct net_device *bond_dev, struct net_device *slave_dev);
+int bond_get_slave(struct net_device *slave_dev, struct sk_buff *skb);
 int bond_xmit_hash(struct bonding *bond, struct sk_buff *skb, int count);
 int bond_parse_parm(const char *mode_arg, const struct bond_parm_tbl *tbl);
 int bond_parm_tbl_lookup(int mode, const struct bond_parm_tbl *tbl);
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index e985231fe04b..83ce2aee65e6 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -921,6 +921,9 @@ struct netdev_phys_port_id {
  * int (*ndo_del_slave)(struct net_device *dev, struct net_device *slave_dev);
  *	Called to release previously enslaved netdev.
  *
+ * int (*ndo_get_slave)(struct net_device *slave_dev, struct sk_buff *skb);
+ *	Called to fill netlink skb with slave info.
+ *
  *      Feature/offload setting functions.
  * netdev_features_t (*ndo_fix_features)(struct net_device *dev,
  *		netdev_features_t features);
@@ -1093,6 +1096,8 @@ struct net_device_ops {
 						 struct net_device *slave_dev);
 	int			(*ndo_del_slave)(struct net_device *dev,
 						 struct net_device *slave_dev);
+	int			(*ndo_get_slave)(struct net_device *slave_dev,
+						 struct sk_buff *skb);
 	netdev_features_t	(*ndo_fix_features)(struct net_device *dev,
 						    netdev_features_t features);
 	int			(*ndo_set_features)(struct net_device *dev,
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 3e6bd3c7445d..ba2f3bf5fdf5 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -144,6 +144,7 @@ enum {
 	IFLA_NUM_RX_QUEUES,
 	IFLA_CARRIER,
 	IFLA_PHYS_PORT_ID,
+	IFLA_SLAVE,
 	__IFLA_MAX
 };
 
@@ -368,6 +369,18 @@ enum {
 
 #define IFLA_BOND_AD_INFO_MAX	(__IFLA_BOND_AD_INFO_MAX - 1)
 
+enum {
+	IFLA_SLAVE_STATE,
+	IFLA_SLAVE_MII_STATUS,
+	IFLA_SLAVE_LINK_FAILURE_COUNT,
+	IFLA_SLAVE_PERM_HWADDR,
+	IFLA_SLAVE_QUEUE_ID,
+	IFLA_SLAVE_AD_AGGREGATOR_ID,
+	__IFLA_SLAVE_MAX,
+};
+
+#define IFLA_SLAVE_MAX	(__IFLA_SLAVE_MAX - 1)
+
 /* SR-IOV virtual function management section */
 
 enum {
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index e6e7d582f901..4f85de7aca33 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -721,6 +721,28 @@ static size_t rtnl_port_size(const struct net_device *dev)
 		return port_self_size;
 }
 
+static size_t rtnl_bond_slave_size(const struct net_device *dev)
+{
+	struct net_device *bond;
+	size_t slave_size =
+		nla_total_size(sizeof(struct nlattr)) +	/* IFLA_SLAVE */
+		nla_total_size(1) +	/* IFLA_SLAVE_STATE */
+		nla_total_size(1) +	/* IFLA_SLAVE_MII_STATUS */
+		nla_total_size(4) +	/* IFLA_SLAVE_LINK_FAILURE_COUNT */
+		nla_total_size(MAX_ADDR_LEN) +	/* IFLA_SLAVE_PERM_HWADDR */
+		nla_total_size(2) +	/* IFLA_SLAVE_QUEUE_ID */
+		nla_total_size(2) +	/* IFLA_SLAVE_AD_AGGREGATOR_ID */
+		0;
+
+	if (netif_is_bond_slave((struct net_device *)dev)) {
+		bond = netdev_master_upper_dev_get((struct net_device *)dev);
+		if (bond && bond->netdev_ops->ndo_get_slave)
+			return slave_size;
+	}
+
+	return 0;
+}
+
 static noinline size_t if_nlmsg_size(const struct net_device *dev,
 				     u32 ext_filter_mask)
 {
@@ -750,6 +772,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
 	       + rtnl_port_size(dev) /* IFLA_VF_PORTS + IFLA_PORT_SELF */
 	       + rtnl_link_get_size(dev) /* IFLA_LINKINFO */
 	       + rtnl_link_get_af_size(dev) /* IFLA_AF_SPEC */
+	       + rtnl_bond_slave_size(dev) /* IFLA_SLAVE */
 	       + nla_total_size(MAX_PHYS_PORT_ID_LEN); /* IFLA_PHYS_PORT_ID */
 }
 
@@ -847,6 +870,34 @@ static int rtnl_phys_port_id_fill(struct sk_buff *skb, struct net_device *dev)
 	return 0;
 }
 
+static size_t rtnl_bond_slave_fill(struct sk_buff *skb, struct net_device *dev)
+{
+	struct net_device *bond;
+	struct nlattr *nest;
+	int err;
+
+	if (!netif_is_bond_slave(dev))
+		return 0;
+
+	bond = netdev_master_upper_dev_get(dev);
+	if (!bond || !bond->netdev_ops->ndo_get_slave)
+		return 0;
+
+	nest = nla_nest_start(skb, IFLA_SLAVE);
+	if (!nest)
+		return -EMSGSIZE;
+
+	err = bond->netdev_ops->ndo_get_slave(dev, skb);
+	if (err) {
+		nla_nest_cancel(skb, nest);
+		return (err == -EMSGSIZE) ? err : 0;
+	}
+
+	nla_nest_end(skb, nest);
+
+	return 0;
+}
+
 static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
 			    int type, u32 pid, u32 seq, u32 change,
 			    unsigned int flags, u32 ext_filter_mask)
@@ -1001,6 +1052,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
 	if (rtnl_port_fill(skb, dev))
 		goto nla_put_failure;
 
+	if (rtnl_bond_slave_fill(skb, dev))
+		goto nla_put_failure;
+
 	if (dev->rtnl_link_ops) {
 		if (rtnl_link_fill(skb, dev) < 0)
 			goto nla_put_failure;
-- 
cgit v1.2.3


From b582ef0990d457f7ce8ccf827af51a575ca0b4a6 Mon Sep 17 00:00:00 2001
From: Or Gerlitz <ogerlitz@mellanox.com>
Date: Mon, 20 Jan 2014 13:59:19 +0200
Subject: net: Add GRO support for UDP encapsulating protocols

Add GRO handlers for protocols that do UDP encapsulation, with the intent of
being able to coalesce packets which encapsulate packets belonging to
the same TCP session.

For GRO purposes, the destination UDP port takes the role of the ether type
field in the ethernet header or the next protocol in the IP header.

The UDP GRO handler will only attempt to coalesce packets whose destination
port has a registered GRO handler.

Use a mark on the skb GRO CB data to disallow (flush) running the udp gro receive
code twice on a packet. This solves the problem of udp encapsulated packets whose
inner VM packet is UDP and happens to carry a port which has registered offloads.

Signed-off-by: Shlomo Pongratz <shlomop@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  10 +++-
 include/net/protocol.h    |   3 +
 net/core/dev.c            |   1 +
 net/ipv4/udp_offload.c    | 143 ++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 156 insertions(+), 1 deletion(-)

(limited to 'include/linux/netdevice.h')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 83ce2aee65e6..c31022980e18 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1675,7 +1675,10 @@ struct napi_gro_cb {
 	unsigned long age;
 
 	/* Used in ipv6_gro_receive() */
-	int	proto;
+	u16	proto;
+
+	/* Used in udp_gro_receive */
+	u16	udp_mark;
 
 	/* used to support CHECKSUM_COMPLETE for tunneling protocols */
 	__wsum	csum;
@@ -1714,6 +1717,11 @@ struct packet_offload {
 	struct list_head	 list;
 };
 
+struct udp_offload {
+	__be16			 port;
+	struct offload_callbacks callbacks;
+};
+
 /* often modified stats are per cpu, other are shared (netdev->stats) */
 struct pcpu_sw_netstats {
 	u64     rx_packets;
diff --git a/include/net/protocol.h b/include/net/protocol.h
index 0e5f8665d7fb..a7e986b08147 100644
--- a/include/net/protocol.h
+++ b/include/net/protocol.h
@@ -108,6 +108,9 @@ int inet_del_offload(const struct net_offload *prot, unsigned char num);
 void inet_register_protosw(struct inet_protosw *p);
 void inet_unregister_protosw(struct inet_protosw *p);
 
+int  udp_add_offload(struct udp_offload *prot);
+void udp_del_offload(struct udp_offload *prot);
+
 #if IS_ENABLED(CONFIG_IPV6)
 int inet6_add_protocol(const struct inet6_protocol *prot, unsigned char num);
 int inet6_del_protocol(const struct inet6_protocol *prot, unsigned char num);
diff --git a/net/core/dev.c b/net/core/dev.c
index a578af589198..da92305c344f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3893,6 +3893,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
 		NAPI_GRO_CB(skb)->same_flow = 0;
 		NAPI_GRO_CB(skb)->flush = 0;
 		NAPI_GRO_CB(skb)->free = 0;
+		NAPI_GRO_CB(skb)->udp_mark = 0;
 
 		pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
 		break;
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 79c62bdcd3c5..ee853c55deea 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -14,6 +14,15 @@
 #include <net/udp.h>
 #include <net/protocol.h>
 
+static DEFINE_SPINLOCK(udp_offload_lock);
+static struct udp_offload_priv *udp_offload_base __read_mostly;
+
+struct udp_offload_priv {
+	struct udp_offload	*offload;
+	struct rcu_head		rcu;
+	struct udp_offload_priv __rcu *next;
+};
+
 static int udp4_ufo_send_check(struct sk_buff *skb)
 {
 	if (!pskb_may_pull(skb, sizeof(struct udphdr)))
@@ -89,10 +98,144 @@ out:
 	return segs;
 }
 
+int udp_add_offload(struct udp_offload *uo)
+{
+	struct udp_offload_priv **head = &udp_offload_base;
+	struct udp_offload_priv *new_offload = kzalloc(sizeof(*new_offload), GFP_KERNEL);
+
+	if (!new_offload)
+		return -ENOMEM;
+
+	new_offload->offload = uo;
+
+	spin_lock(&udp_offload_lock);
+	rcu_assign_pointer(new_offload->next, rcu_dereference(*head));
+	rcu_assign_pointer(*head, rcu_dereference(new_offload));
+	spin_unlock(&udp_offload_lock);
+
+	return 0;
+}
+EXPORT_SYMBOL(udp_add_offload);
+
+static void udp_offload_free_routine(struct rcu_head *head)
+{
+	struct udp_offload_priv *ou_priv = container_of(head, struct udp_offload_priv, rcu);
+	kfree(ou_priv);
+}
+
+void udp_del_offload(struct udp_offload *uo)
+{
+	struct udp_offload_priv __rcu **head = &udp_offload_base;
+	struct udp_offload_priv *uo_priv;
+
+	spin_lock(&udp_offload_lock);
+
+	uo_priv = rcu_dereference(*head);
+	for (; uo_priv != NULL;
+		uo_priv = rcu_dereference(*head)) {
+
+		if (uo_priv->offload == uo) {
+			rcu_assign_pointer(*head, rcu_dereference(uo_priv->next));
+			goto unlock;
+		}
+		head = &uo_priv->next;
+	}
+	pr_warn("udp_del_offload: didn't find offload for port %d\n", htons(uo->port));
+unlock:
+	spin_unlock(&udp_offload_lock);
+	if (uo_priv != NULL)
+		call_rcu(&uo_priv->rcu, udp_offload_free_routine);
+}
+EXPORT_SYMBOL(udp_del_offload);
+
+static struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
+{
+	struct udp_offload_priv *uo_priv;
+	struct sk_buff *p, **pp = NULL;
+	struct udphdr *uh, *uh2;
+	unsigned int hlen, off;
+	int flush = 1;
+
+	if (NAPI_GRO_CB(skb)->udp_mark ||
+	    (!skb->encapsulation && skb->ip_summed != CHECKSUM_COMPLETE))
+		goto out;
+
+	/* mark that this skb passed once through the udp gro layer */
+	NAPI_GRO_CB(skb)->udp_mark = 1;
+
+	off  = skb_gro_offset(skb);
+	hlen = off + sizeof(*uh);
+	uh   = skb_gro_header_fast(skb, off);
+	if (skb_gro_header_hard(skb, hlen)) {
+		uh = skb_gro_header_slow(skb, hlen, off);
+		if (unlikely(!uh))
+			goto out;
+	}
+
+	rcu_read_lock();
+	uo_priv = rcu_dereference(udp_offload_base);
+	for (; uo_priv != NULL; uo_priv = rcu_dereference(uo_priv->next)) {
+		if (uo_priv->offload->port == uh->dest &&
+		    uo_priv->offload->callbacks.gro_receive)
+			goto unflush;
+	}
+	goto out_unlock;
+
+unflush:
+	flush = 0;
+
+	for (p = *head; p; p = p->next) {
+		if (!NAPI_GRO_CB(p)->same_flow)
+			continue;
+
+		uh2 = (struct udphdr   *)(p->data + off);
+		if ((*(u32 *)&uh->source != *(u32 *)&uh2->source)) {
+			NAPI_GRO_CB(p)->same_flow = 0;
+			continue;
+		}
+	}
+
+	skb_gro_pull(skb, sizeof(struct udphdr)); /* pull encapsulating udp header */
+	pp = uo_priv->offload->callbacks.gro_receive(head, skb);
+
+out_unlock:
+	rcu_read_unlock();
+out:
+	NAPI_GRO_CB(skb)->flush |= flush;
+	return pp;
+}
+
+static int udp_gro_complete(struct sk_buff *skb, int nhoff)
+{
+	struct udp_offload_priv *uo_priv;
+	__be16 newlen = htons(skb->len - nhoff);
+	struct udphdr *uh = (struct udphdr *)(skb->data + nhoff);
+	int err = -ENOSYS;
+
+	uh->len = newlen;
+
+	rcu_read_lock();
+
+	uo_priv = rcu_dereference(udp_offload_base);
+	for (; uo_priv != NULL; uo_priv = rcu_dereference(uo_priv->next)) {
+		if (uo_priv->offload->port == uh->dest &&
+		    uo_priv->offload->callbacks.gro_complete)
+			break;
+	}
+
+	if (uo_priv != NULL)
+		err = uo_priv->offload->callbacks.gro_complete(skb, nhoff + sizeof(struct udphdr));
+
+	rcu_read_unlock();
+	return err;
+}
+
 static const struct net_offload udpv4_offload = {
 	.callbacks = {
 		.gso_send_check = udp4_ufo_send_check,
 		.gso_segment = udp4_ufo_fragment,
+		.gro_receive  =	udp_gro_receive,
+		.gro_complete =	udp_gro_complete,
 	},
 };
 
-- 
cgit v1.2.3


From a9517d0f43832d787a4a0348163d659641bfd83c Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@resnulli.us>
Date: Wed, 22 Jan 2014 09:05:57 +0100
Subject: rtnetlink: remove ndo_get_slave

The bond-specific ndo_get_slave API is no longer used and can be removed
now. This is now handled in a generic way in rtnl_link_ops.

Signed-off-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include/linux/netdevice.h')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index c31022980e18..440a02ee6f92 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -921,9 +921,6 @@ struct netdev_phys_port_id {
  * int (*ndo_del_slave)(struct net_device *dev, struct net_device *slave_dev);
  *	Called to release previously enslaved netdev.
  *
- * int (*ndo_get_slave)(struct net_device *slave_dev, struct sk_buff *skb);
- *	Called to fill netlink skb with slave info.
- *
  *      Feature/offload setting functions.
  * netdev_features_t (*ndo_fix_features)(struct net_device *dev,
  *		netdev_features_t features);
@@ -1096,8 +1093,6 @@ struct net_device_ops {
 						 struct net_device *slave_dev);
 	int			(*ndo_del_slave)(struct net_device *dev,
 						 struct net_device *slave_dev);
-	int			(*ndo_get_slave)(struct net_device *slave_dev,
-						 struct sk_buff *skb);
 	netdev_features_t	(*ndo_fix_features)(struct net_device *dev,
 						    netdev_features_t features);
 	int			(*ndo_set_features)(struct net_device *dev,
-- 
cgit v1.2.3


From d206940319c41df4299db75ed56142177bb2e5f6 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 13 Feb 2014 23:09:11 +0100
Subject: net: core: introduce netif_skb_dev_features

Will be used by upcoming ipv4 forward path change that needs to
determine feature mask using skb->dst->dev instead of skb->dev.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  7 ++++++-
 net/core/dev.c            | 22 ++++++++++++----------
 2 files changed, 18 insertions(+), 11 deletions(-)

(limited to 'include/linux/netdevice.h')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 440a02ee6f92..21d4e6be8949 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3068,7 +3068,12 @@ void netdev_change_features(struct net_device *dev);
 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
 					struct net_device *dev);
 
-netdev_features_t netif_skb_features(struct sk_buff *skb);
+netdev_features_t netif_skb_dev_features(struct sk_buff *skb,
+					 const struct net_device *dev);
+static inline netdev_features_t netif_skb_features(struct sk_buff *skb)
+{
+	return netif_skb_dev_features(skb, skb->dev);
+}
 
 static inline bool net_gso_ok(netdev_features_t features, int gso_type)
 {
diff --git a/net/core/dev.c b/net/core/dev.c
index 4ad1b78c9c77..b1b0c8d4d7df 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2420,7 +2420,7 @@ EXPORT_SYMBOL(netdev_rx_csum_fault);
  * 2. No high memory really exists on this machine.
  */
 
-static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
+static int illegal_highdma(const struct net_device *dev, struct sk_buff *skb)
 {
 #ifdef CONFIG_HIGHMEM
 	int i;
@@ -2495,34 +2495,36 @@ static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
 }
 
 static netdev_features_t harmonize_features(struct sk_buff *skb,
-	netdev_features_t features)
+					    const struct net_device *dev,
+					    netdev_features_t features)
 {
 	if (skb->ip_summed != CHECKSUM_NONE &&
 	    !can_checksum_protocol(features, skb_network_protocol(skb))) {
 		features &= ~NETIF_F_ALL_CSUM;
-	} else if (illegal_highdma(skb->dev, skb)) {
+	} else if (illegal_highdma(dev, skb)) {
 		features &= ~NETIF_F_SG;
 	}
 
 	return features;
 }
 
-netdev_features_t netif_skb_features(struct sk_buff *skb)
+netdev_features_t netif_skb_dev_features(struct sk_buff *skb,
+					 const struct net_device *dev)
 {
 	__be16 protocol = skb->protocol;
-	netdev_features_t features = skb->dev->features;
+	netdev_features_t features = dev->features;
 
-	if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
+	if (skb_shinfo(skb)->gso_segs > dev->gso_max_segs)
 		features &= ~NETIF_F_GSO_MASK;
 
 	if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) {
 		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
 		protocol = veh->h_vlan_encapsulated_proto;
 	} else if (!vlan_tx_tag_present(skb)) {
-		return harmonize_features(skb, features);
+		return harmonize_features(skb, dev, features);
 	}
 
-	features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX |
+	features &= (dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX |
 					       NETIF_F_HW_VLAN_STAG_TX);
 
 	if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD))
@@ -2530,9 +2532,9 @@ netdev_features_t netif_skb_features(struct sk_buff *skb)
 				NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_CTAG_TX |
 				NETIF_F_HW_VLAN_STAG_TX;
 
-	return harmonize_features(skb, features);
+	return harmonize_features(skb, dev, features);
 }
-EXPORT_SYMBOL(netif_skb_features);
+EXPORT_SYMBOL(netif_skb_dev_features);
 
 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
 			struct netdev_queue *txq)
-- 
cgit v1.2.3


From 99932d4fc03a13bb3e94938fe25458fabc8f2fc3 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <dborkman@redhat.com>
Date: Sun, 16 Feb 2014 15:55:20 +0100
Subject: netdevice: add queue selection fallback handler for ndo_select_queue

Add a new argument for ndo_select_queue() callback that passes a
fallback handler. This gets invoked through netdev_pick_tx();
fallback handler is currently __netdev_pick_tx() as most drivers
invoke this function within their customized implementation for
skbs that don't need any special handling. This fallback
handler can then be replaced on other call-sites with different
queue selection methods (e.g. in packet sockets, pktgen etc).

This also has the nice side-effect that __netdev_pick_tx() is
then only invoked from netdev_pick_tx() and export of that
function to modules can be undone.

Suggested-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_main.c                 | 2 +-
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c | 4 ++--
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h | 2 +-
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c   | 6 +++---
 drivers/net/ethernet/lantiq_etop.c              | 2 +-
 drivers/net/ethernet/mellanox/mlx4/en_tx.c      | 4 ++--
 drivers/net/ethernet/mellanox/mlx4/mlx4_en.h    | 2 +-
 drivers/net/ethernet/tile/tilegx.c              | 2 +-
 drivers/net/team/team.c                         | 2 +-
 drivers/net/tun.c                               | 2 +-
 drivers/net/wireless/mwifiex/main.c             | 2 +-
 drivers/staging/bcm/Bcmnet.c                    | 2 +-
 drivers/staging/netlogic/xlr_net.c              | 2 +-
 drivers/staging/rtl8188eu/os_dep/os_intfs.c     | 2 +-
 include/linux/netdevice.h                       | 9 ++++++---
 net/core/flow_dissector.c                       | 7 +++----
 net/mac80211/iface.c                            | 6 ++++--
 17 files changed, 31 insertions(+), 27 deletions(-)

(limited to 'include/linux/netdevice.h')

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 867664918715..1c6104d3501d 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -3707,7 +3707,7 @@ static inline int bond_slave_override(struct bonding *bond,
 
 
 static u16 bond_select_queue(struct net_device *dev, struct sk_buff *skb,
-			     void *accel_priv)
+			     void *accel_priv, select_queue_fallback_t fallback)
 {
 	/*
 	 * This helper function exists to help dev_pick_tx get the correct
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
index 9d7419e0390b..66c0df78c3ff 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
@@ -1873,7 +1873,7 @@ void bnx2x_netif_stop(struct bnx2x *bp, int disable_hw)
 }
 
 u16 bnx2x_select_queue(struct net_device *dev, struct sk_buff *skb,
-		       void *accel_priv)
+		       void *accel_priv, select_queue_fallback_t fallback)
 {
 	struct bnx2x *bp = netdev_priv(dev);
 
@@ -1895,7 +1895,7 @@ u16 bnx2x_select_queue(struct net_device *dev, struct sk_buff *skb,
 	}
 
 	/* select a non-FCoE queue */
-	return __netdev_pick_tx(dev, skb) % BNX2X_NUM_ETH_QUEUES(bp);
+	return fallback(dev, skb) % BNX2X_NUM_ETH_QUEUES(bp);
 }
 
 void bnx2x_set_num_queues(struct bnx2x *bp)
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
index bfc58d488bb5..a89a40f88c25 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
@@ -496,7 +496,7 @@ int bnx2x_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan, u8 qos);
 
 /* select_queue callback */
 u16 bnx2x_select_queue(struct net_device *dev, struct sk_buff *skb,
-		       void *accel_priv);
+		       void *accel_priv, select_queue_fallback_t fallback);
 
 static inline void bnx2x_update_rx_prod(struct bnx2x *bp,
 					struct bnx2x_fastpath *fp,
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 6d4ada72dfd0..18076c4178b4 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -6881,7 +6881,7 @@ static inline int ixgbe_maybe_stop_tx(struct ixgbe_ring *tx_ring, u16 size)
 }
 
 static u16 ixgbe_select_queue(struct net_device *dev, struct sk_buff *skb,
-			      void *accel_priv)
+			      void *accel_priv, select_queue_fallback_t fallback)
 {
 	struct ixgbe_fwd_adapter *fwd_adapter = accel_priv;
 #ifdef IXGBE_FCOE
@@ -6907,7 +6907,7 @@ static u16 ixgbe_select_queue(struct net_device *dev, struct sk_buff *skb,
 		if (adapter->flags & IXGBE_FLAG_FCOE_ENABLED)
 			break;
 	default:
-		return __netdev_pick_tx(dev, skb);
+		return fallback(dev, skb);
 	}
 
 	f = &adapter->ring_feature[RING_F_FCOE];
@@ -6920,7 +6920,7 @@ static u16 ixgbe_select_queue(struct net_device *dev, struct sk_buff *skb,
 
 	return txq + f->offset;
 #else
-	return __netdev_pick_tx(dev, skb);
+	return fallback(dev, skb);
 #endif
 }
 
diff --git a/drivers/net/ethernet/lantiq_etop.c b/drivers/net/ethernet/lantiq_etop.c
index 8f9266c64c75..fd4b6aecf6ee 100644
--- a/drivers/net/ethernet/lantiq_etop.c
+++ b/drivers/net/ethernet/lantiq_etop.c
@@ -619,7 +619,7 @@ ltq_etop_set_multicast_list(struct net_device *dev)
 
 static u16
 ltq_etop_select_queue(struct net_device *dev, struct sk_buff *skb,
-		      void *accel_priv)
+		      void *accel_priv, select_queue_fallback_t fallback)
 {
 	/* we are currently only using the first queue */
 	return 0;
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
index 8e8a7eb43a2c..13457032d15f 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
@@ -629,7 +629,7 @@ static void build_inline_wqe(struct mlx4_en_tx_desc *tx_desc, struct sk_buff *sk
 }
 
 u16 mlx4_en_select_queue(struct net_device *dev, struct sk_buff *skb,
-			 void *accel_priv)
+			 void *accel_priv, select_queue_fallback_t fallback)
 {
 	struct mlx4_en_priv *priv = netdev_priv(dev);
 	u16 rings_p_up = priv->num_tx_rings_p_up;
@@ -641,7 +641,7 @@ u16 mlx4_en_select_queue(struct net_device *dev, struct sk_buff *skb,
 	if (vlan_tx_tag_present(skb))
 		up = vlan_tx_tag_get(skb) >> VLAN_PRIO_SHIFT;
 
-	return __netdev_pick_tx(dev, skb) % rings_p_up + up * rings_p_up;
+	return fallback(dev, skb) % rings_p_up + up * rings_p_up;
 }
 
 static void mlx4_bf_copy(void __iomem *dst, unsigned long *src, unsigned bytecnt)
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
index 3af04c3f42ea..9ca223bc90fc 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
@@ -723,7 +723,7 @@ int mlx4_en_arm_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq);
 
 void mlx4_en_tx_irq(struct mlx4_cq *mcq);
 u16 mlx4_en_select_queue(struct net_device *dev, struct sk_buff *skb,
-			 void *accel_priv);
+			 void *accel_priv, select_queue_fallback_t fallback);
 netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev);
 
 int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv,
diff --git a/drivers/net/ethernet/tile/tilegx.c b/drivers/net/ethernet/tile/tilegx.c
index 023237a65720..17503da9f7a5 100644
--- a/drivers/net/ethernet/tile/tilegx.c
+++ b/drivers/net/ethernet/tile/tilegx.c
@@ -2071,7 +2071,7 @@ static int tile_net_tx(struct sk_buff *skb, struct net_device *dev)
 
 /* Return subqueue id on this core (one per core). */
 static u16 tile_net_select_queue(struct net_device *dev, struct sk_buff *skb,
-				 void *accel_priv)
+				 void *accel_priv, select_queue_fallback_t fallback)
 {
 	return smp_processor_id();
 }
diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c
index 28407426fd6f..c8624a8235ab 100644
--- a/drivers/net/team/team.c
+++ b/drivers/net/team/team.c
@@ -1648,7 +1648,7 @@ static netdev_tx_t team_xmit(struct sk_buff *skb, struct net_device *dev)
 }
 
 static u16 team_select_queue(struct net_device *dev, struct sk_buff *skb,
-			     void *accel_priv)
+			     void *accel_priv, select_queue_fallback_t fallback)
 {
 	/*
 	 * This helper function exists to help dev_pick_tx get the correct
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 44c4db8450f0..8fe9cb7d0f72 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -366,7 +366,7 @@ static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash)
  * hope the rxq no. may help here.
  */
 static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb,
-			    void *accel_priv)
+			    void *accel_priv, select_queue_fallback_t fallback)
 {
 	struct tun_struct *tun = netdev_priv(dev);
 	struct tun_flow_entry *e;
diff --git a/drivers/net/wireless/mwifiex/main.c b/drivers/net/wireless/mwifiex/main.c
index 4d79761b9c87..9d3d2758ec35 100644
--- a/drivers/net/wireless/mwifiex/main.c
+++ b/drivers/net/wireless/mwifiex/main.c
@@ -748,7 +748,7 @@ static struct net_device_stats *mwifiex_get_stats(struct net_device *dev)
 
 static u16
 mwifiex_netdev_select_wmm_queue(struct net_device *dev, struct sk_buff *skb,
-				void *accel_priv)
+				void *accel_priv, select_queue_fallback_t fallback)
 {
 	skb->priority = cfg80211_classify8021d(skb, NULL);
 	return mwifiex_1d_to_wmm_queue[skb->priority];
diff --git a/drivers/staging/bcm/Bcmnet.c b/drivers/staging/bcm/Bcmnet.c
index 8dfdd2732bdc..95a2358267ba 100644
--- a/drivers/staging/bcm/Bcmnet.c
+++ b/drivers/staging/bcm/Bcmnet.c
@@ -40,7 +40,7 @@ static INT bcm_close(struct net_device *dev)
 }
 
 static u16 bcm_select_queue(struct net_device *dev, struct sk_buff *skb,
-			    void *accel_priv)
+			    void *accel_priv, select_queue_fallback_t fallback)
 {
 	return ClassifyPacket(netdev_priv(dev), skb);
 }
diff --git a/drivers/staging/netlogic/xlr_net.c b/drivers/staging/netlogic/xlr_net.c
index eedffed17e39..6f9ac27730af 100644
--- a/drivers/staging/netlogic/xlr_net.c
+++ b/drivers/staging/netlogic/xlr_net.c
@@ -307,7 +307,7 @@ static netdev_tx_t xlr_net_start_xmit(struct sk_buff *skb,
 }
 
 static u16 xlr_net_select_queue(struct net_device *ndev, struct sk_buff *skb,
-				void *accel_priv)
+				void *accel_priv, select_queue_fallback_t fallback)
 {
 	return (u16)smp_processor_id();
 }
diff --git a/drivers/staging/rtl8188eu/os_dep/os_intfs.c b/drivers/staging/rtl8188eu/os_dep/os_intfs.c
index 68f98fa114d2..7c9ee58f47bb 100644
--- a/drivers/staging/rtl8188eu/os_dep/os_intfs.c
+++ b/drivers/staging/rtl8188eu/os_dep/os_intfs.c
@@ -653,7 +653,7 @@ static unsigned int rtw_classify8021d(struct sk_buff *skb)
 }
 
 static u16 rtw_select_queue(struct net_device *dev, struct sk_buff *skb,
-			    void *accel_priv)
+			    void *accel_priv, select_queue_fallback_t fallback)
 {
 	struct adapter	*padapter = rtw_netdev_priv(dev);
 	struct mlme_priv *pmlmepriv = &padapter->mlmepriv;
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 21d4e6be8949..1de9c136b066 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -752,6 +752,9 @@ struct netdev_phys_port_id {
 	unsigned char id_len;
 };
 
+typedef u16 (*select_queue_fallback_t)(struct net_device *dev,
+				       struct sk_buff *skb);
+
 /*
  * This structure defines the management hooks for network devices.
  * The following hooks can be defined; unless noted otherwise, they are
@@ -783,7 +786,7 @@ struct netdev_phys_port_id {
  *	Required can not be NULL.
  *
  * u16 (*ndo_select_queue)(struct net_device *dev, struct sk_buff *skb,
- *                         void *accel_priv);
+ *                         void *accel_priv, select_queue_fallback_t fallback);
  *	Called to decide which queue to when device supports multiple
  *	transmit queues.
  *
@@ -1005,7 +1008,8 @@ struct net_device_ops {
 						   struct net_device *dev);
 	u16			(*ndo_select_queue)(struct net_device *dev,
 						    struct sk_buff *skb,
-						    void *accel_priv);
+						    void *accel_priv,
+						    select_queue_fallback_t fallback);
 	void			(*ndo_change_rx_flags)(struct net_device *dev,
 						       int flags);
 	void			(*ndo_set_rx_mode)(struct net_device *dev);
@@ -1551,7 +1555,6 @@ static inline void netdev_for_each_tx_queue(struct net_device *dev,
 struct netdev_queue *netdev_pick_tx(struct net_device *dev,
 				    struct sk_buff *skb,
 				    void *accel_priv);
-u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb);
 
 /*
  * Net namespace inlines
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 87577d447554..75fe83f590ea 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -372,7 +372,7 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
 #endif
 }
 
-u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
+static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
 {
 	struct sock *sk = skb->sk;
 	int queue_index = sk_tx_queue_get(sk);
@@ -392,7 +392,6 @@ u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
 
 	return queue_index;
 }
-EXPORT_SYMBOL(__netdev_pick_tx);
 
 struct netdev_queue *netdev_pick_tx(struct net_device *dev,
 				    struct sk_buff *skb,
@@ -403,8 +402,8 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev,
 	if (dev->real_num_tx_queues != 1) {
 		const struct net_device_ops *ops = dev->netdev_ops;
 		if (ops->ndo_select_queue)
-			queue_index = ops->ndo_select_queue(dev, skb,
-							    accel_priv);
+			queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
+							    __netdev_pick_tx);
 		else
 			queue_index = __netdev_pick_tx(dev, skb);
 
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index d6d1f1df9119..ce1c44370610 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -1057,7 +1057,8 @@ static void ieee80211_uninit(struct net_device *dev)
 
 static u16 ieee80211_netdev_select_queue(struct net_device *dev,
 					 struct sk_buff *skb,
-					 void *accel_priv)
+					 void *accel_priv,
+					 select_queue_fallback_t fallback)
 {
 	return ieee80211_select_queue(IEEE80211_DEV_TO_SUB_IF(dev), skb);
 }
@@ -1075,7 +1076,8 @@ static const struct net_device_ops ieee80211_dataif_ops = {
 
 static u16 ieee80211_monitor_select_queue(struct net_device *dev,
 					  struct sk_buff *skb,
-					  void *accel_priv)
+					  void *accel_priv,
+					  select_queue_fallback_t fallback)
 {
 	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 	struct ieee80211_local *local = sdata->local;
-- 
cgit v1.2.3


From b9507bdaf40e91fea2b1c0c1ee7dc627c8ee6fd6 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <dborkman@redhat.com>
Date: Sun, 16 Feb 2014 15:55:21 +0100
Subject: netdevice: move netdev_cap_txqueue for shared usage to header

In order to allow users to invoke netdev_cap_txqueue, it needs to
be moved into the netdevice.h header file. While at it, also add a
kernel-doc header to document the API.

Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 20 ++++++++++++++++++++
 net/core/flow_dissector.c | 13 +------------
 2 files changed, 21 insertions(+), 12 deletions(-)

(limited to 'include/linux/netdevice.h')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 1de9c136b066..e8eeebd49a98 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2278,6 +2278,26 @@ static inline void netdev_reset_queue(struct net_device *dev_queue)
 	netdev_tx_reset_queue(netdev_get_tx_queue(dev_queue, 0));
 }
 
+/**
+ *	netdev_cap_txqueue - check if selected tx queue exceeds device queues
+ *	@dev: network device
+ *	@queue_index: given tx queue index
+ *
+ *	Returns 0 (after a rate-limited warning) if the given tx queue index is
+ *	>= dev->real_num_tx_queues, otherwise returns the index unchanged.
+ */
+static inline u16 netdev_cap_txqueue(struct net_device *dev, u16 queue_index)
+{
+	if (unlikely(queue_index >= dev->real_num_tx_queues)) {
+		net_warn_ratelimited("%s selects TX queue %d, but real number of TX queues is %d\n",
+				     dev->name, queue_index,
+				     dev->real_num_tx_queues);
+		return 0;
+	}
+
+	return queue_index;
+}
+
 /**
  *	netif_running - test if up
  *	@dev: network device
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 75fe83f590ea..e29e810663d7 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -323,17 +323,6 @@ u32 __skb_get_poff(const struct sk_buff *skb)
 	return poff;
 }
 
-static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
-{
-	if (unlikely(queue_index >= dev->real_num_tx_queues)) {
-		net_warn_ratelimited("%s selects TX queue %d, but real number of TX queues is %d\n",
-				     dev->name, queue_index,
-				     dev->real_num_tx_queues);
-		return 0;
-	}
-	return queue_index;
-}
-
 static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
 {
 #ifdef CONFIG_XPS
@@ -408,7 +397,7 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev,
 			queue_index = __netdev_pick_tx(dev, skb);
 
 		if (!accel_priv)
-			queue_index = dev_cap_txqueue(dev, queue_index);
+			queue_index = netdev_cap_txqueue(dev, queue_index);
 	}
 
 	skb_set_queue_mapping(skb, queue_index);
-- 
cgit v1.2.3


From 53d6471cef17262d3ad1c7ce8982a234244f68ec Mon Sep 17 00:00:00 2001
From: Vlad Yasevich <vyasevic@redhat.com>
Date: Thu, 27 Mar 2014 17:26:18 -0400
Subject: net: Account for all vlan headers in skb_mac_gso_segment

skb_network_protocol() already accounts for multiple vlan
headers that may be present in the skb.  However, skb_mac_gso_segment()
doesn't know anything about it and assumes that skb->mac_len
is set correctly to skip all mac headers.  That may not
always be the case.  If we are simply forwarding the packet (via
bridge or macvtap), some vlan headers may not be accounted for.

A simple solution is to allow skb_network_protocol to return
the vlan depth it has calculated.  This way skb_mac_gso_segment
will correctly skip all mac headers.

Signed-off-by: Vlad Yasevich <vyasevic@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  2 +-
 net/core/dev.c            | 13 +++++++++----
 net/core/skbuff.c         |  3 ++-
 3 files changed, 12 insertions(+), 6 deletions(-)

(limited to 'include/linux/netdevice.h')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index e8eeebd49a98..daafd9561cbc 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3014,7 +3014,7 @@ struct sk_buff *skb_gso_segment(struct sk_buff *skb, netdev_features_t features)
 {
 	return __skb_gso_segment(skb, features, true);
 }
-__be16 skb_network_protocol(struct sk_buff *skb);
+__be16 skb_network_protocol(struct sk_buff *skb, int *depth);
 
 static inline bool can_checksum_protocol(netdev_features_t features,
 					 __be16 protocol)
diff --git a/net/core/dev.c b/net/core/dev.c
index b1b0c8d4d7df..45fa2f11f84d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2286,7 +2286,7 @@ out:
 }
 EXPORT_SYMBOL(skb_checksum_help);
 
-__be16 skb_network_protocol(struct sk_buff *skb)
+__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
 {
 	__be16 type = skb->protocol;
 	int vlan_depth = ETH_HLEN;
@@ -2313,6 +2313,8 @@ __be16 skb_network_protocol(struct sk_buff *skb)
 		vlan_depth += VLAN_HLEN;
 	}
 
+	*depth = vlan_depth;
+
 	return type;
 }
 
@@ -2326,12 +2328,13 @@ struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
 {
 	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
 	struct packet_offload *ptype;
-	__be16 type = skb_network_protocol(skb);
+	int vlan_depth = skb->mac_len;
+	__be16 type = skb_network_protocol(skb, &vlan_depth);
 
 	if (unlikely(!type))
 		return ERR_PTR(-EINVAL);
 
-	__skb_pull(skb, skb->mac_len);
+	__skb_pull(skb, vlan_depth);
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(ptype, &offload_base, list) {
@@ -2498,8 +2501,10 @@ static netdev_features_t harmonize_features(struct sk_buff *skb,
 					    const struct net_device *dev,
 					    netdev_features_t features)
 {
+	int tmp;
+
 	if (skb->ip_summed != CHECKSUM_NONE &&
-	    !can_checksum_protocol(features, skb_network_protocol(skb))) {
+	    !can_checksum_protocol(features, skb_network_protocol(skb, &tmp))) {
 		features &= ~NETIF_F_ALL_CSUM;
 	} else if (illegal_highdma(dev, skb)) {
 		features &= ~NETIF_F_SG;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 97e5a2c3d947..90b96a11b974 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -2879,8 +2879,9 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
 	int err = -ENOMEM;
 	int i = 0;
 	int pos;
+	int dummy;
 
-	proto = skb_network_protocol(head_skb);
+	proto = skb_network_protocol(head_skb, &dummy);
 	if (unlikely(!proto))
 		return ERR_PTR(-EINVAL);
 
-- 
cgit v1.2.3