From 8fc54f68919298ff9689d980efb495707ef43f30 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 23 Aug 2014 20:58:54 +0200 Subject: net: use reciprocal_scale() helper Replace open codings of (((u64) * ) >> 32) with reciprocal_scale(). Signed-off-by: Daniel Borkmann Cc: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/core/dev.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index b65a5051361f..1421dad4cb29 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3124,8 +3124,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, } if (map) { - tcpu = map->cpus[((u64) hash * map->len) >> 32]; - + tcpu = map->cpus[reciprocal_scale(hash, map->len)]; if (cpu_online(tcpu)) { cpu = tcpu; goto done; -- cgit v1.2.3 From 573e8fca255a27e3573b51f9b183d62641c47a3d Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Fri, 22 Aug 2014 13:33:47 -0700 Subject: net: skb_gro_checksum_* functions Add skb_gro_checksum_validate, skb_gro_checksum_validate_zero_check, and skb_gro_checksum_simple_validate, and __skb_gro_checksum_complete. These are the cognates of the normal checksum functions but are used in the gro_receive path and operate on GRO related fields in sk_buffs. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/netdevice.h | 76 +++++++++++++++++++++++++++++++++++++++++++++-- net/core/dev.c | 34 ++++++++++++++++++++- 2 files changed, 107 insertions(+), 3 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7e2b0b8b5cd7..eb73444e1bd0 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1883,7 +1883,13 @@ struct napi_gro_cb { u16 proto; /* Used in udp_gro_receive */ - u16 udp_mark; + u8 udp_mark:1; + + /* GRO checksum is valid */ + u8 csum_valid:1; + + /* Number encapsulation layers crossed */ + u8 encapsulation; /* used to support CHECKSUM_COMPLETE for tunneling protocols */ __wsum csum; @@ -2154,11 +2160,77 @@ static inline void *skb_gro_network_header(struct sk_buff *skb) static inline void skb_gro_postpull_rcsum(struct sk_buff *skb, const void *start, unsigned int len) { - if (skb->ip_summed == CHECKSUM_COMPLETE) + if (NAPI_GRO_CB(skb)->csum_valid) NAPI_GRO_CB(skb)->csum = csum_sub(NAPI_GRO_CB(skb)->csum, csum_partial(start, len, 0)); } +/* GRO checksum functions. These are logical equivalents of the normal + * checksum functions (in skbuff.h) except that they operate on the GRO + * offsets and fields in sk_buff. + */ + +__sum16 __skb_gro_checksum_complete(struct sk_buff *skb); + +static inline bool __skb_gro_checksum_validate_needed(struct sk_buff *skb, + bool zero_okay, + __sum16 check) +{ + return (skb->ip_summed != CHECKSUM_PARTIAL && + (skb->ip_summed != CHECKSUM_UNNECESSARY || + (NAPI_GRO_CB(skb)->encapsulation > skb->encapsulation)) && + (!zero_okay || check)); +} + +static inline __sum16 __skb_gro_checksum_validate_complete(struct sk_buff *skb, + __wsum psum) +{ + if (NAPI_GRO_CB(skb)->csum_valid && + !csum_fold(csum_add(psum, NAPI_GRO_CB(skb)->csum))) + return 0; + + NAPI_GRO_CB(skb)->csum = psum; + + return __skb_gro_checksum_complete(skb); +} + +/* Update skb for CHECKSUM_UNNECESSARY when we verified a top level + * checksum or an encapsulated one during GRO. This saves work + * if we fallback to normal path with the packet. 
+ */ +static inline void skb_gro_incr_csum_unnecessary(struct sk_buff *skb) +{ + if (skb->ip_summed == CHECKSUM_UNNECESSARY) { + if (NAPI_GRO_CB(skb)->encapsulation) + skb->encapsulation = 1; + } else if (skb->ip_summed != CHECKSUM_PARTIAL) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + skb->encapsulation = 0; + } +} + +#define __skb_gro_checksum_validate(skb, proto, zero_okay, check, \ + compute_pseudo) \ +({ \ + __sum16 __ret = 0; \ + if (__skb_gro_checksum_validate_needed(skb, zero_okay, check)) \ + __ret = __skb_gro_checksum_validate_complete(skb, \ + compute_pseudo(skb, proto)); \ + if (!__ret) \ + skb_gro_incr_csum_unnecessary(skb); \ + __ret; \ +}) + +#define skb_gro_checksum_validate(skb, proto, compute_pseudo) \ + __skb_gro_checksum_validate(skb, proto, false, 0, compute_pseudo) + +#define skb_gro_checksum_validate_zero_check(skb, proto, check, \ + compute_pseudo) \ + __skb_gro_checksum_validate(skb, proto, true, check, compute_pseudo) + +#define skb_gro_checksum_simple_validate(skb) \ + __skb_gro_checksum_validate(skb, 0, false, 0, null_compute_pseudo) + static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, diff --git a/net/core/dev.c b/net/core/dev.c index 1421dad4cb29..b6a718ec11c1 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3962,7 +3962,13 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff goto normal; gro_list_prepare(napi, skb); - NAPI_GRO_CB(skb)->csum = skb->csum; /* Needed for CHECKSUM_COMPLETE */ + + if (skb->ip_summed == CHECKSUM_COMPLETE) { + NAPI_GRO_CB(skb)->csum = skb->csum; + NAPI_GRO_CB(skb)->csum_valid = 1; + } else { + NAPI_GRO_CB(skb)->csum_valid = 0; + } rcu_read_lock(); list_for_each_entry_rcu(ptype, head, list) { @@ -3975,6 +3981,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff NAPI_GRO_CB(skb)->flush = 0; NAPI_GRO_CB(skb)->free = 0; NAPI_GRO_CB(skb)->udp_mark = 0; + NAPI_GRO_CB(skb)->encapsulation = 0; pp = ptype->callbacks.gro_receive(&napi->gro_list, skb); break; @@ -4205,6 +4212,31 @@ gro_result_t napi_gro_frags(struct napi_struct *napi) } EXPORT_SYMBOL(napi_gro_frags); +/* Compute the checksum from gro_offset and return the folded value + * after adding in any pseudo checksum. + */ +__sum16 __skb_gro_checksum_complete(struct sk_buff *skb) +{ + __wsum wsum; + __sum16 sum; + + wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0); + + /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */ + sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum)); + if (likely(!sum)) { + if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && + !skb->csum_complete_sw) + netdev_rx_csum_fault(skb->dev); + } + + NAPI_GRO_CB(skb)->csum = wsum; + NAPI_GRO_CB(skb)->csum_valid = 1; + + return sum; +} +EXPORT_SYMBOL(__skb_gro_checksum_complete); + /* * net_rps_action_and_irq_enable sends any pending IPI's for rps. * Note: called with local irq disabled, but exits with local irq enabled. -- cgit v1.2.3 From 4798248e4e023170e937a65a1d30fcc52496dd42 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 22 Aug 2014 16:21:53 -0700 Subject: net: Add ops->ndo_xmit_flush() Signed-off-by: David S. 
Miller --- drivers/net/wan/dlci.c | 2 +- drivers/usb/gadget/function/f_ncm.c | 2 +- include/linux/netdevice.h | 35 +++++++++++++++++++++++++++++++++++ net/atm/mpc.c | 2 +- net/core/dev.c | 5 ++--- net/core/netpoll.c | 3 +-- net/core/pktgen.c | 4 +--- net/packet/af_packet.c | 3 +-- net/sched/sch_teql.c | 3 +-- 9 files changed, 44 insertions(+), 15 deletions(-) (limited to 'net/core/dev.c') diff --git a/drivers/net/wan/dlci.c b/drivers/net/wan/dlci.c index 43c9960dce1c..81b22a180aad 100644 --- a/drivers/net/wan/dlci.c +++ b/drivers/net/wan/dlci.c @@ -193,7 +193,7 @@ static netdev_tx_t dlci_transmit(struct sk_buff *skb, struct net_device *dev) struct dlci_local *dlp = netdev_priv(dev); if (skb) - dlp->slave->netdev_ops->ndo_start_xmit(skb, dlp->slave); + netdev_start_xmit(skb, dlp->slave); return NETDEV_TX_OK; } diff --git a/drivers/usb/gadget/function/f_ncm.c b/drivers/usb/gadget/function/f_ncm.c index bcdc882cd415..cb5d646db6a7 100644 --- a/drivers/usb/gadget/function/f_ncm.c +++ b/drivers/usb/gadget/function/f_ncm.c @@ -1101,7 +1101,7 @@ static void ncm_tx_tasklet(unsigned long data) /* Only send if data is available. */ if (ncm->skb_tx_data) { ncm->timer_force_tx = true; - ncm->netdev->netdev_ops->ndo_start_xmit(NULL, ncm->netdev); + netdev_start_xmit(NULL, ncm->netdev); ncm->timer_force_tx = false; } } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index eb73444e1bd0..220c50984688 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -782,6 +782,19 @@ typedef u16 (*select_queue_fallback_t)(struct net_device *dev, * (can also return NETDEV_TX_LOCKED iff NETIF_F_LLTX) * Required can not be NULL. * + * void (*ndo_xmit_flush)(struct net_device *dev, u16 queue); + * A driver implements this function when it wishes to support + * deferred TX queue flushing. The idea is that the expensive + * operation to trigger TX queue processing can be done after + * N calls to ndo_start_xmit rather than being done every single + * time. In this regime ndo_start_xmit will be called one or more + * times, and then a final ndo_xmit_flush call will be made to + * have the driver tell the device about the new pending TX queue + * entries. The kernel keeps track of which queues need flushing + * by monitoring skb->queue_mapping of the packets it submits to + * ndo_start_xmit. This is the queue value that will be passed + * to ndo_xmit_flush. 
+ * * u16 (*ndo_select_queue)(struct net_device *dev, struct sk_buff *skb, * void *accel_priv, select_queue_fallback_t fallback); * Called to decide which queue to when device supports multiple @@ -1005,6 +1018,7 @@ struct net_device_ops { int (*ndo_stop)(struct net_device *dev); netdev_tx_t (*ndo_start_xmit) (struct sk_buff *skb, struct net_device *dev); + void (*ndo_xmit_flush)(struct net_device *dev, u16 queue); u16 (*ndo_select_queue)(struct net_device *dev, struct sk_buff *skb, void *accel_priv, @@ -3430,6 +3444,27 @@ int __init dev_proc_init(void); #define dev_proc_init() 0 #endif +static inline netdev_tx_t __netdev_start_xmit(const struct net_device_ops *ops, + struct sk_buff *skb, struct net_device *dev) +{ + netdev_tx_t ret; + u16 q; + + q = skb->queue_mapping; + ret = ops->ndo_start_xmit(skb, dev); + if (dev_xmit_complete(ret) && ops->ndo_xmit_flush) + ops->ndo_xmit_flush(dev, q); + + return ret; +} + +static inline netdev_tx_t netdev_start_xmit(struct sk_buff *skb, struct net_device *dev) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + return __netdev_start_xmit(ops, skb, dev); +} + int netdev_class_create_file_ns(struct class_attribute *class_attr, const void *ns); void netdev_class_remove_file_ns(struct class_attribute *class_attr, diff --git a/net/atm/mpc.c b/net/atm/mpc.c index e8e0e7a8a23d..d662da161e5a 100644 --- a/net/atm/mpc.c +++ b/net/atm/mpc.c @@ -599,7 +599,7 @@ static netdev_tx_t mpc_send_packet(struct sk_buff *skb, } non_ip: - return mpc->old_ops->ndo_start_xmit(skb, dev); + return __netdev_start_xmit(mpc->old_ops, skb, dev); } static int atm_mpoa_vcc_attach(struct atm_vcc *vcc, void __user *arg) diff --git a/net/core/dev.c b/net/core/dev.c index b6a718ec11c1..26d296c2447c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2602,7 +2602,6 @@ EXPORT_SYMBOL(netif_skb_features); int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq) { - const struct net_device_ops *ops = dev->netdev_ops; int rc = NETDEV_TX_OK; unsigned int skb_len; @@ -2667,7 +2666,7 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, skb_len = skb->len; trace_net_dev_start_xmit(skb, dev); - rc = ops->ndo_start_xmit(skb, dev); + rc = netdev_start_xmit(skb, dev); trace_net_dev_xmit(skb, rc, dev, skb_len); if (rc == NETDEV_TX_OK) txq_trans_update(txq); @@ -2686,7 +2685,7 @@ gso: skb_len = nskb->len; trace_net_dev_start_xmit(nskb, dev); - rc = ops->ndo_start_xmit(nskb, dev); + rc = netdev_start_xmit(nskb, dev); trace_net_dev_xmit(nskb, rc, dev, skb_len); if (unlikely(rc != NETDEV_TX_OK)) { if (rc & ~NETDEV_TX_MASK) diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 907fb5e36c02..a5ad06828d67 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -72,7 +72,6 @@ module_param(carrier_timeout, uint, 0644); static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq) { - const struct net_device_ops *ops = dev->netdev_ops; int status = NETDEV_TX_OK; netdev_features_t features; @@ -92,7 +91,7 @@ static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev, skb->vlan_tci = 0; } - status = ops->ndo_start_xmit(skb, dev); + status = netdev_start_xmit(skb, dev); if (status == NETDEV_TX_OK) txq_trans_update(txq); diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 8b849ddfef2e..83e2b4b19eb7 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3285,8 +3285,6 @@ static void pktgen_wait_for_skb(struct pktgen_dev *pkt_dev) static void pktgen_xmit(struct pktgen_dev 
*pkt_dev) { struct net_device *odev = pkt_dev->odev; - netdev_tx_t (*xmit)(struct sk_buff *, struct net_device *) - = odev->netdev_ops->ndo_start_xmit; struct netdev_queue *txq; u16 queue_map; int ret; @@ -3339,7 +3337,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) goto unlock; } atomic_inc(&(pkt_dev->skb->users)); - ret = (*xmit)(pkt_dev->skb, odev); + ret = netdev_start_xmit(pkt_dev->skb, odev); switch (ret) { case NETDEV_TX_OK: diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 93896d2092f6..0dfa990d4eaa 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -240,7 +240,6 @@ static void __fanout_link(struct sock *sk, struct packet_sock *po); static int packet_direct_xmit(struct sk_buff *skb) { struct net_device *dev = skb->dev; - const struct net_device_ops *ops = dev->netdev_ops; netdev_features_t features; struct netdev_queue *txq; int ret = NETDEV_TX_BUSY; @@ -262,7 +261,7 @@ static int packet_direct_xmit(struct sk_buff *skb) HARD_TX_LOCK(dev, txq, smp_processor_id()); if (!netif_xmit_frozen_or_drv_stopped(txq)) { - ret = ops->ndo_start_xmit(skb, dev); + ret = netdev_start_xmit(skb, dev); if (ret == NETDEV_TX_OK) txq_trans_update(txq); } diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index bd33793b527e..64cd93ca8104 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c @@ -301,7 +301,6 @@ restart: do { struct net_device *slave = qdisc_dev(q); struct netdev_queue *slave_txq = netdev_get_tx_queue(slave, 0); - const struct net_device_ops *slave_ops = slave->netdev_ops; if (slave_txq->qdisc_sleeping != q) continue; @@ -317,7 +316,7 @@ restart: unsigned int length = qdisc_pkt_len(skb); if (!netif_xmit_frozen_or_stopped(slave_txq) && - slave_ops->ndo_start_xmit(skb, slave) == NETDEV_TX_OK) { + netdev_start_xmit(skb, slave) == NETDEV_TX_OK) { txq_trans_update(slave_txq); __netif_tx_unlock(slave_txq); master->slaves = NEXT_SLAVE(q); -- cgit v1.2.3 From 903ceff7ca7b4d80c083a80ee5163b74e9fa359f Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 17 Aug 2014 12:30:35 -0500 Subject: net: Replace get_cpu_var through this_cpu_ptr Replace uses of get_cpu_var for address calculation through this_cpu_ptr. Cc: netdev@vger.kernel.org Cc: Eric Dumazet Acked-by: David S. 
Miller Signed-off-by: Christoph Lameter Signed-off-by: Tejun Heo --- include/net/netfilter/nf_conntrack.h | 2 +- include/net/snmp.h | 6 +++--- net/core/dev.c | 14 +++++++------- net/core/drop_monitor.c | 2 +- net/core/skbuff.c | 2 +- net/ipv4/route.c | 4 ++-- net/ipv4/syncookies.c | 2 +- net/ipv4/tcp.c | 2 +- net/ipv4/tcp_output.c | 2 +- net/ipv6/syncookies.c | 2 +- net/rds/ib_rdma.c | 2 +- 11 files changed, 20 insertions(+), 20 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 37252f71a380..c8a7db605e03 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -242,7 +242,7 @@ extern s32 (*nf_ct_nat_offset)(const struct nf_conn *ct, DECLARE_PER_CPU(struct nf_conn, nf_conntrack_untracked); static inline struct nf_conn *nf_ct_untracked_get(void) { - return &__raw_get_cpu_var(nf_conntrack_untracked); + return raw_cpu_ptr(&nf_conntrack_untracked); } void nf_ct_untracked_status_or(unsigned long bits); diff --git a/include/net/snmp.h b/include/net/snmp.h index f1f27fdbb0d5..e154133877a2 100644 --- a/include/net/snmp.h +++ b/include/net/snmp.h @@ -168,7 +168,7 @@ struct linux_xfrm_mib { #define SNMP_ADD_STATS64_BH(mib, field, addend) \ do { \ - __typeof__(*mib) *ptr = __this_cpu_ptr(mib); \ + __typeof__(*mib) *ptr = raw_cpu_ptr(mib); \ u64_stats_update_begin(&ptr->syncp); \ ptr->mibs[field] += addend; \ u64_stats_update_end(&ptr->syncp); \ @@ -189,8 +189,8 @@ struct linux_xfrm_mib { #define SNMP_INC_STATS64(mib, field) SNMP_ADD_STATS64(mib, field, 1) #define SNMP_UPD_PO_STATS64_BH(mib, basefield, addend) \ do { \ - __typeof__(*mib) *ptr; \ - ptr = __this_cpu_ptr(mib); \ + __typeof__(*mib) *ptr; \ + ptr = raw_cpu_ptr((mib)); \ u64_stats_update_begin(&ptr->syncp); \ ptr->mibs[basefield##PKTS]++; \ ptr->mibs[basefield##OCTETS] += addend; \ diff --git a/net/core/dev.c b/net/core/dev.c index b65a5051361f..9ef13ff354fe 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2153,7 +2153,7 @@ static inline void __netif_reschedule(struct Qdisc *q) unsigned long flags; local_irq_save(flags); - sd = &__get_cpu_var(softnet_data); + sd = this_cpu_ptr(&softnet_data); q->next_sched = NULL; *sd->output_queue_tailp = q; sd->output_queue_tailp = &q->next_sched; @@ -3195,7 +3195,7 @@ static void rps_trigger_softirq(void *data) static int rps_ipi_queued(struct softnet_data *sd) { #ifdef CONFIG_RPS - struct softnet_data *mysd = &__get_cpu_var(softnet_data); + struct softnet_data *mysd = this_cpu_ptr(&softnet_data); if (sd != mysd) { sd->rps_ipi_next = mysd->rps_ipi_list; @@ -3222,7 +3222,7 @@ static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen) if (qlen < (netdev_max_backlog >> 1)) return false; - sd = &__get_cpu_var(softnet_data); + sd = this_cpu_ptr(&softnet_data); rcu_read_lock(); fl = rcu_dereference(sd->flow_limit); @@ -3369,7 +3369,7 @@ EXPORT_SYMBOL(netif_rx_ni); static void net_tx_action(struct softirq_action *h) { - struct softnet_data *sd = &__get_cpu_var(softnet_data); + struct softnet_data *sd = this_cpu_ptr(&softnet_data); if (sd->completion_queue) { struct sk_buff *clist; @@ -3794,7 +3794,7 @@ EXPORT_SYMBOL(netif_receive_skb); static void flush_backlog(void *arg) { struct net_device *dev = arg; - struct softnet_data *sd = &__get_cpu_var(softnet_data); + struct softnet_data *sd = this_cpu_ptr(&softnet_data); struct sk_buff *skb, *tmp; rps_lock(sd); @@ -4301,7 +4301,7 @@ void __napi_schedule(struct napi_struct *n) unsigned long flags; local_irq_save(flags); - 
____napi_schedule(&__get_cpu_var(softnet_data), n); + ____napi_schedule(this_cpu_ptr(&softnet_data), n); local_irq_restore(flags); } EXPORT_SYMBOL(__napi_schedule); @@ -4422,7 +4422,7 @@ EXPORT_SYMBOL(netif_napi_del); static void net_rx_action(struct softirq_action *h) { - struct softnet_data *sd = &__get_cpu_var(softnet_data); + struct softnet_data *sd = this_cpu_ptr(&softnet_data); unsigned long time_limit = jiffies + 2; int budget = netdev_budget; void *have; diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 50f9a9db5792..252e155c837b 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -146,7 +146,7 @@ static void trace_drop_common(struct sk_buff *skb, void *location) unsigned long flags; local_irq_save(flags); - data = &__get_cpu_var(dm_cpu_data); + data = this_cpu_ptr(&dm_cpu_data); spin_lock(&data->lock); dskb = data->skb; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 163b673f9e62..adfc7ee1acf2 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -345,7 +345,7 @@ static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) unsigned long flags; local_irq_save(flags); - nc = &__get_cpu_var(netdev_alloc_cache); + nc = this_cpu_ptr(&netdev_alloc_cache); if (unlikely(!nc->frag.page)) { refill: for (order = NETDEV_FRAG_PAGE_MAX_ORDER; ;) { diff --git a/net/ipv4/route.c b/net/ipv4/route.c index eaa4b000c7b4..7d6f4e021846 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1311,7 +1311,7 @@ static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt) if (rt_is_input_route(rt)) { p = (struct rtable **)&nh->nh_rth_input; } else { - p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output); + p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output); } orig = *p; @@ -1939,7 +1939,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res, do_cache = false; goto add; } - prth = __this_cpu_ptr(nh->nh_pcpu_rth_output); + prth = raw_cpu_ptr(nh->nh_pcpu_rth_output); } rth = rcu_dereference(*prth); if (rt_cache_valid(rth)) { diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index c0c75688896e..f83391bfdd76 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -40,7 +40,7 @@ static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, net_get_random_once(syncookie_secret, sizeof(syncookie_secret)); - tmp = __get_cpu_var(ipv4_cookie_scratch); + tmp = this_cpu_ptr(ipv4_cookie_scratch); memcpy(tmp + 4, syncookie_secret[c], sizeof(syncookie_secret[c])); tmp[0] = (__force u32)saddr; tmp[1] = (__force u32)daddr; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 541f26a67ba2..b2cab7770a11 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3058,7 +3058,7 @@ struct tcp_md5sig_pool *tcp_get_md5sig_pool(void) local_bh_disable(); p = ACCESS_ONCE(tcp_md5sig_pool); if (p) - return __this_cpu_ptr(p); + return raw_cpu_ptr(p); local_bh_enable(); return NULL; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 5a7c41fbc6d3..d145f7ef78f3 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -842,7 +842,7 @@ void tcp_wfree(struct sk_buff *skb) /* queue this socket to tasklet queue */ local_irq_save(flags); - tsq = &__get_cpu_var(tsq_tasklet); + tsq = this_cpu_ptr(&tsq_tasklet); list_add(&tp->tsq_node, &tsq->head); tasklet_schedule(&tsq->tasklet); local_irq_restore(flags); diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 83cea1d39466..637de5b87589 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -67,7 +67,7 @@ static u32 
cookie_hash(const struct in6_addr *saddr, const struct in6_addr *dadd net_get_random_once(syncookie6_secret, sizeof(syncookie6_secret)); - tmp = __get_cpu_var(ipv6_cookie_scratch); + tmp = this_cpu_ptr(ipv6_cookie_scratch); /* * we have 320 bits of information to hash, copy in the remaining diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index e8fdb172adbb..273b8bff6ba4 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -267,7 +267,7 @@ static inline struct rds_ib_mr *rds_ib_reuse_fmr(struct rds_ib_mr_pool *pool) unsigned long *flag; preempt_disable(); - flag = &__get_cpu_var(clean_list_grace); + flag = this_cpu_ptr(&clean_list_grace); set_bit(CLEAN_LIST_BUSY_BIT, flag); ret = llist_del_first(&pool->clean_list); if (ret) -- cgit v1.2.3 From 662880f4420340aad4f9a62a349c6c9d4faa1a5d Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Wed, 27 Aug 2014 21:26:56 -0700 Subject: net: Allow GRO to use and set levels of checksum unnecessary Allow GRO path to "consume" checksums provided in CHECKSUM_UNNECESSARY and to report new checksums verified for use in fallback to normal path. Change GRO checksum path to track csum_level using a csum_cnt field in NAPI_GRO_CB. On GRO initialization, if ip_summed is CHECKSUM_UNNECESSARY, set NAPI_GRO_CB(skb)->csum_cnt to skb->csum_level + 1. For each checksum verified, decrement NAPI_GRO_CB(skb)->csum_cnt while it is greater than zero. If a checksum is verified and NAPI_GRO_CB(skb)->csum_cnt == 0, we have verified a deeper checksum than originally indicated in the skbuff, so increment csum_level (or initialize to CHECKSUM_UNNECESSARY if ip_summed is CHECKSUM_NONE or CHECKSUM_COMPLETE). Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/netdevice.h | 26 ++++++++++++-------------- net/core/dev.c | 24 ++++++++++++++++-------- net/ipv4/gre_offload.c | 7 ++----- net/ipv4/udp_offload.c | 5 +++-- 4 files changed, 33 insertions(+), 29 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index dfc1d8b8bd0f..456eb1fe51e8 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1883,8 +1883,8 @@ struct napi_gro_cb { /* GRO checksum is valid */ u8 csum_valid:1; - /* Number encapsulation layers crossed */ - u8 encapsulation; + /* Number of checksums via CHECKSUM_UNNECESSARY */ + u8 csum_cnt:3; /* used to support CHECKSUM_COMPLETE for tunneling protocols */ __wsum csum; @@ -2179,8 +2179,7 @@ static inline bool __skb_gro_checksum_validate_needed(struct sk_buff *skb, __sum16 check) { return (skb->ip_summed != CHECKSUM_PARTIAL && - (skb->ip_summed != CHECKSUM_UNNECESSARY || - (NAPI_GRO_CB(skb)->encapsulation > skb->encapsulation)) && + NAPI_GRO_CB(skb)->csum_cnt == 0 && (!zero_okay || check)); } @@ -2196,18 +2195,17 @@ static inline __sum16 __skb_gro_checksum_validate_complete(struct sk_buff *skb, return __skb_gro_checksum_complete(skb); } -/* Update skb for CHECKSUM_UNNECESSARY when we verified a top level - * checksum or an encapsulated one during GRO. This saves work - * if we fallback to normal path with the packet.
- */ static inline void skb_gro_incr_csum_unnecessary(struct sk_buff *skb) { - if (skb->ip_summed == CHECKSUM_UNNECESSARY) { - if (NAPI_GRO_CB(skb)->encapsulation) - skb->encapsulation = 1; - } else if (skb->ip_summed != CHECKSUM_PARTIAL) { - skb->ip_summed = CHECKSUM_UNNECESSARY; - skb->encapsulation = 0; + if (NAPI_GRO_CB(skb)->csum_cnt > 0) { + /* Consume a checksum from CHECKSUM_UNNECESSARY */ + NAPI_GRO_CB(skb)->csum_cnt--; + } else { + /* Update skb for CHECKSUM_UNNECESSARY and csum_level when we + * verified a new top level checksum or an encapsulated one + * during GRO. This saves work if we fallback to normal path. + */ + __skb_incr_checksum_unnecessary(skb); } } diff --git a/net/core/dev.c b/net/core/dev.c index 26d296c2447c..a6077ef56345 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3962,13 +3962,6 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff gro_list_prepare(napi, skb); - if (skb->ip_summed == CHECKSUM_COMPLETE) { - NAPI_GRO_CB(skb)->csum = skb->csum; - NAPI_GRO_CB(skb)->csum_valid = 1; - } else { - NAPI_GRO_CB(skb)->csum_valid = 0; - } - rcu_read_lock(); list_for_each_entry_rcu(ptype, head, list) { if (ptype->type != type || !ptype->callbacks.gro_receive) @@ -3980,7 +3973,22 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff NAPI_GRO_CB(skb)->flush = 0; NAPI_GRO_CB(skb)->free = 0; NAPI_GRO_CB(skb)->udp_mark = 0; - NAPI_GRO_CB(skb)->encapsulation = 0; + + /* Setup for GRO checksum validation */ + switch (skb->ip_summed) { + case CHECKSUM_COMPLETE: + NAPI_GRO_CB(skb)->csum = skb->csum; + NAPI_GRO_CB(skb)->csum_valid = 1; + NAPI_GRO_CB(skb)->csum_cnt = 0; + break; + case CHECKSUM_UNNECESSARY: + NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1; + NAPI_GRO_CB(skb)->csum_valid = 0; + break; + default: + NAPI_GRO_CB(skb)->csum_cnt = 0; + NAPI_GRO_CB(skb)->csum_valid = 0; + } pp = ptype->callbacks.gro_receive(&napi->gro_list, skb); break; diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index d1bd16937d93..a4d7965fb880 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -172,12 +172,9 @@ static struct sk_buff **gre_gro_receive(struct sk_buff **head, } /* Don't bother verifying checksum if we're going to flush anyway. */ - if (greh->flags & GRE_CSUM) { - if (!NAPI_GRO_CB(skb)->flush && - skb_gro_checksum_simple_validate(skb)) + if ((greh->flags & GRE_CSUM) && !NAPI_GRO_CB(skb)->flush && + skb_gro_checksum_simple_validate(skb)) goto out_unlock; - NAPI_GRO_CB(skb)->encapsulation++; - } flush = 0; diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 8ed460e3753c..a6adff98382a 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -238,12 +238,13 @@ struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb, int flush = 1; if (NAPI_GRO_CB(skb)->udp_mark || - (!skb->encapsulation && !NAPI_GRO_CB(skb)->csum_valid)) + (skb->ip_summed != CHECKSUM_PARTIAL && + NAPI_GRO_CB(skb)->csum_cnt == 0 && + !NAPI_GRO_CB(skb)->csum_valid)) goto out; /* mark that this skb passed once through the udp gro layer */ NAPI_GRO_CB(skb)->udp_mark = 1; - NAPI_GRO_CB(skb)->encapsulation++; rcu_read_lock(); uo_priv = rcu_dereference(udp_offload_base); -- cgit v1.2.3 From 10b3ad8c21bb4b135768c30dd4c51a1c744da699 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 29 Aug 2014 21:07:24 -0700 Subject: net: Do txq_trans_update() in netdev_start_xmit() That way we don't have to audit every call site to make sure it is doing this properly. Signed-off-by: David S. 
Miller --- drivers/net/wan/dlci.c | 6 ++++-- include/linux/netdevice.h | 10 ++++++++-- net/core/dev.c | 7 ++----- net/core/netpoll.c | 4 +--- net/core/pktgen.c | 3 +-- net/packet/af_packet.c | 7 ++----- net/sched/sch_teql.c | 3 +-- 7 files changed, 19 insertions(+), 21 deletions(-) (limited to 'net/core/dev.c') diff --git a/drivers/net/wan/dlci.c b/drivers/net/wan/dlci.c index 81b22a180aad..6427e8283419 100644 --- a/drivers/net/wan/dlci.c +++ b/drivers/net/wan/dlci.c @@ -192,8 +192,10 @@ static netdev_tx_t dlci_transmit(struct sk_buff *skb, struct net_device *dev) { struct dlci_local *dlp = netdev_priv(dev); - if (skb) - netdev_start_xmit(skb, dlp->slave); + if (skb) { + struct netdev_queue *txq = skb_get_tx_queue(dev, skb); + netdev_start_xmit(skb, dlp->slave, txq); + } return NETDEV_TX_OK; } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 456eb1fe51e8..16171802ea7d 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3437,11 +3437,17 @@ static inline netdev_tx_t __netdev_start_xmit(const struct net_device_ops *ops, return ops->ndo_start_xmit(skb, dev); } -static inline netdev_tx_t netdev_start_xmit(struct sk_buff *skb, struct net_device *dev) +static inline netdev_tx_t netdev_start_xmit(struct sk_buff *skb, struct net_device *dev, + struct netdev_queue *txq) { const struct net_device_ops *ops = dev->netdev_ops; + int rc; - return __netdev_start_xmit(ops, skb, dev); + rc = __netdev_start_xmit(ops, skb, dev); + if (rc == NETDEV_TX_OK) + txq_trans_update(txq); + + return rc; } int netdev_class_create_file_ns(struct class_attribute *class_attr, diff --git a/net/core/dev.c b/net/core/dev.c index a6077ef56345..6392adaaa22f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2666,10 +2666,8 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, skb_len = skb->len; trace_net_dev_start_xmit(skb, dev); - rc = netdev_start_xmit(skb, dev); + rc = netdev_start_xmit(skb, dev, txq); trace_net_dev_xmit(skb, rc, dev, skb_len); - if (rc == NETDEV_TX_OK) - txq_trans_update(txq); return rc; } @@ -2685,7 +2683,7 @@ gso: skb_len = nskb->len; trace_net_dev_start_xmit(nskb, dev); - rc = netdev_start_xmit(nskb, dev); + rc = netdev_start_xmit(nskb, dev, txq); trace_net_dev_xmit(nskb, rc, dev, skb_len); if (unlikely(rc != NETDEV_TX_OK)) { if (rc & ~NETDEV_TX_MASK) @@ -2694,7 +2692,6 @@ gso: skb->next = nskb; return rc; } - txq_trans_update(txq); if (unlikely(netif_xmit_stopped(txq) && skb->next)) return NETDEV_TX_BUSY; } while (skb->next); diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 12b1df976562..05bc57edaa81 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -91,9 +91,7 @@ static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev, skb->vlan_tci = 0; } - status = netdev_start_xmit(skb, dev); - if (status == NETDEV_TX_OK) - txq_trans_update(txq); + status = netdev_start_xmit(skb, dev, txq); out: return status; diff --git a/net/core/pktgen.c b/net/core/pktgen.c index d81b540096c3..34bd2ff9f121 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3335,11 +3335,10 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) goto unlock; } atomic_inc(&(pkt_dev->skb->users)); - ret = netdev_start_xmit(pkt_dev->skb, odev); + ret = netdev_start_xmit(pkt_dev->skb, odev, txq); switch (ret) { case NETDEV_TX_OK: - txq_trans_update(txq); pkt_dev->last_ok = 1; pkt_dev->sofar++; pkt_dev->seq_num++; diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index b7a7f5a721bd..fe305a05a8fc 100644 --- a/net/packet/af_packet.c 
+++ b/net/packet/af_packet.c @@ -258,11 +258,8 @@ static int packet_direct_xmit(struct sk_buff *skb) local_bh_disable(); HARD_TX_LOCK(dev, txq, smp_processor_id()); - if (!netif_xmit_frozen_or_drv_stopped(txq)) { - ret = netdev_start_xmit(skb, dev); - if (ret == NETDEV_TX_OK) - txq_trans_update(txq); - } + if (!netif_xmit_frozen_or_drv_stopped(txq)) + ret = netdev_start_xmit(skb, dev, txq); HARD_TX_UNLOCK(dev, txq); local_bh_enable(); diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index 64cd93ca8104..193dc2cba1ec 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c @@ -316,8 +316,7 @@ restart: unsigned int length = qdisc_pkt_len(skb); if (!netif_xmit_frozen_or_stopped(slave_txq) && - netdev_start_xmit(skb, slave) == NETDEV_TX_OK) { - txq_trans_update(slave_txq); + netdev_start_xmit(skb, slave, slave_txq) == NETDEV_TX_OK) { __netif_tx_unlock(slave_txq); master->slaves = NEXT_SLAVE(q); netif_wake_queue(dev); -- cgit v1.2.3 From 2ea255137555052655c6a646c4e48ea7481494c7 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 29 Aug 2014 21:10:01 -0700 Subject: net: Create xmit_one() helper for dev_hard_start_xmit() Hopefully making the code a bit easier to read and digest. Signed-off-by: David S. Miller --- net/core/dev.c | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 6392adaaa22f..0fde7d2153db 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2599,11 +2599,27 @@ netdev_features_t netif_skb_features(struct sk_buff *skb) } EXPORT_SYMBOL(netif_skb_features); +static int xmit_one(struct sk_buff *skb, struct net_device *dev, + struct netdev_queue *txq) +{ + unsigned int len; + int rc; + + if (!list_empty(&ptype_all)) + dev_queue_xmit_nit(skb, dev); + + len = skb->len; + trace_net_dev_start_xmit(skb, dev); + rc = netdev_start_xmit(skb, dev, txq); + trace_net_dev_xmit(skb, rc, dev, len); + + return rc; +} + int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq) { int rc = NETDEV_TX_OK; - unsigned int skb_len; if (likely(!skb->next)) { netdev_features_t features; @@ -2661,14 +2677,7 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, } } - if (!list_empty(&ptype_all)) - dev_queue_xmit_nit(skb, dev); - - skb_len = skb->len; - trace_net_dev_start_xmit(skb, dev); - rc = netdev_start_xmit(skb, dev, txq); - trace_net_dev_xmit(skb, rc, dev, skb_len); - return rc; + return xmit_one(skb, dev, txq); } gso: @@ -2678,13 +2687,7 @@ gso: skb->next = nskb->next; nskb->next = NULL; - if (!list_empty(&ptype_all)) - dev_queue_xmit_nit(nskb, dev); - - skb_len = nskb->len; - trace_net_dev_start_xmit(nskb, dev); - rc = netdev_start_xmit(nskb, dev, txq); - trace_net_dev_xmit(nskb, rc, dev, skb_len); + rc = xmit_one(nskb, dev, txq); if (unlikely(rc != NETDEV_TX_OK)) { if (rc & ~NETDEV_TX_MASK) goto out_kfree_gso_skb; -- cgit v1.2.3 From 7f2e870f2a48a0524a3b03b04fa019311d16a7f7 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 29 Aug 2014 21:19:14 -0700 Subject: net: Move main gso loop out of dev_hard_start_xmit() into helper. There is a slight policy change happening here as well. The previous code would drop the entire rest of the GSO skb if any of them got, for example, a congestion notification. That makes no sense, anything NET_XMIT_MASK and below is something like congestion or policing. And in the congestion case it doesn't even mean the packet was actually dropped. 
Just continue until dev_xmit_complete() evaluates to false. Signed-off-by: David S. Miller --- net/core/dev.c | 48 +++++++++++++++++++++++++++++------------------- 1 file changed, 29 insertions(+), 19 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 0fde7d2153db..ab7bb809711e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2616,6 +2616,34 @@ static int xmit_one(struct sk_buff *skb, struct net_device *dev, return rc; } +static struct sk_buff *xmit_list(struct sk_buff *first, struct net_device *dev, + struct netdev_queue *txq, int *ret) +{ + struct sk_buff *skb = first; + int rc = NETDEV_TX_OK; + + while (skb) { + struct sk_buff *next = skb->next; + + skb->next = NULL; + rc = xmit_one(skb, dev, txq); + if (unlikely(!dev_xmit_complete(rc))) { + skb->next = next; + goto out; + } + + skb = next; + if (netif_xmit_stopped(txq) && skb) { + rc = NETDEV_TX_BUSY; + break; + } + } + +out: + *ret = rc; + return skb; +} + int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq) { @@ -2681,25 +2709,7 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, } gso: - do { - struct sk_buff *nskb = skb->next; - - skb->next = nskb->next; - nskb->next = NULL; - - rc = xmit_one(nskb, dev, txq); - if (unlikely(rc != NETDEV_TX_OK)) { - if (rc & ~NETDEV_TX_MASK) - goto out_kfree_gso_skb; - nskb->next = skb->next; - skb->next = nskb; - return rc; - } - if (unlikely(netif_xmit_stopped(txq) && skb->next)) - return NETDEV_TX_BUSY; - } while (skb->next); - -out_kfree_gso_skb: + skb->next = xmit_list(skb->next, dev, txq, &rc); if (likely(skb->next == NULL)) { skb->destructor = DEV_GSO_CB(skb)->destructor; consume_skb(skb); -- cgit v1.2.3 From fa2dbdc253c2aee2a760c64de454cb62469ec11d Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 29 Aug 2014 21:55:22 -0700 Subject: net: Pass a "more" indication down into netdev_start_xmit() code paths. For now it will always be false. Signed-off-by: David S. Miller --- drivers/net/wan/dlci.c | 2 +- include/linux/netdevice.h | 9 +++++---- net/atm/mpc.c | 2 +- net/core/dev.c | 2 +- net/core/netpoll.c | 2 +- net/core/pktgen.c | 2 +- net/packet/af_packet.c | 2 +- net/sched/sch_teql.c | 3 ++- 8 files changed, 13 insertions(+), 11 deletions(-) (limited to 'net/core/dev.c') diff --git a/drivers/net/wan/dlci.c b/drivers/net/wan/dlci.c index 6427e8283419..ae6ecf401189 100644 --- a/drivers/net/wan/dlci.c +++ b/drivers/net/wan/dlci.c @@ -194,7 +194,7 @@ static netdev_tx_t dlci_transmit(struct sk_buff *skb, struct net_device *dev) if (skb) { struct netdev_queue *txq = skb_get_tx_queue(dev, skb); - netdev_start_xmit(skb, dlp->slave, txq); + netdev_start_xmit(skb, dlp->slave, txq, false); } return NETDEV_TX_OK; } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 16171802ea7d..5050218c5b7f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3431,19 +3431,20 @@ int __init dev_proc_init(void); #endif static inline netdev_tx_t __netdev_start_xmit(const struct net_device_ops *ops, - struct sk_buff *skb, struct net_device *dev) + struct sk_buff *skb, struct net_device *dev, + bool more) { - skb->xmit_more = 0; + skb->xmit_more = more ? 
1 : 0; return ops->ndo_start_xmit(skb, dev); } static inline netdev_tx_t netdev_start_xmit(struct sk_buff *skb, struct net_device *dev, - struct netdev_queue *txq) + struct netdev_queue *txq, bool more) { const struct net_device_ops *ops = dev->netdev_ops; int rc; - rc = __netdev_start_xmit(ops, skb, dev); + rc = __netdev_start_xmit(ops, skb, dev, more); if (rc == NETDEV_TX_OK) txq_trans_update(txq); diff --git a/net/atm/mpc.c b/net/atm/mpc.c index d662da161e5a..0e982222d425 100644 --- a/net/atm/mpc.c +++ b/net/atm/mpc.c @@ -599,7 +599,7 @@ static netdev_tx_t mpc_send_packet(struct sk_buff *skb, } non_ip: - return __netdev_start_xmit(mpc->old_ops, skb, dev); + return __netdev_start_xmit(mpc->old_ops, skb, dev, false); } static int atm_mpoa_vcc_attach(struct atm_vcc *vcc, void __user *arg) diff --git a/net/core/dev.c b/net/core/dev.c index ab7bb809711e..f0ed5a611a97 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2610,7 +2610,7 @@ static int xmit_one(struct sk_buff *skb, struct net_device *dev, len = skb->len; trace_net_dev_start_xmit(skb, dev); - rc = netdev_start_xmit(skb, dev, txq); + rc = netdev_start_xmit(skb, dev, txq, false); trace_net_dev_xmit(skb, rc, dev, len); return rc; diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 05bc57edaa81..e6645b4f330a 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -91,7 +91,7 @@ static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev, skb->vlan_tci = 0; } - status = netdev_start_xmit(skb, dev, txq); + status = netdev_start_xmit(skb, dev, txq, false); out: return status; diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 34bd2ff9f121..5b36a9428c59 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3335,7 +3335,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) goto unlock; } atomic_inc(&(pkt_dev->skb->users)); - ret = netdev_start_xmit(pkt_dev->skb, odev, txq); + ret = netdev_start_xmit(pkt_dev->skb, odev, txq, false); switch (ret) { case NETDEV_TX_OK: diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index fe305a05a8fc..87d20f48ff06 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -259,7 +259,7 @@ static int packet_direct_xmit(struct sk_buff *skb) HARD_TX_LOCK(dev, txq, smp_processor_id()); if (!netif_xmit_frozen_or_drv_stopped(txq)) - ret = netdev_start_xmit(skb, dev, txq); + ret = netdev_start_xmit(skb, dev, txq, false); HARD_TX_UNLOCK(dev, txq); local_bh_enable(); diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index 193dc2cba1ec..aaa8d03ed054 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c @@ -316,7 +316,8 @@ restart: unsigned int length = qdisc_pkt_len(skb); if (!netif_xmit_frozen_or_stopped(slave_txq) && - netdev_start_xmit(skb, slave, slave_txq) == NETDEV_TX_OK) { + netdev_start_xmit(skb, slave, slave_txq, false) == + NETDEV_TX_OK) { __netif_tx_unlock(slave_txq); master->slaves = NEXT_SLAVE(q); netif_wake_queue(dev); -- cgit v1.2.3 From 95f6b3dda2a4a052f7dabe9998e4ffac491b7bc2 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 29 Aug 2014 21:57:30 -0700 Subject: net: Have xmit_list() signal more==true when appropriate. Signed-off-by: David S. 
Miller --- net/core/dev.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index f0ed5a611a97..6d82194e414b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2600,7 +2600,7 @@ netdev_features_t netif_skb_features(struct sk_buff *skb) EXPORT_SYMBOL(netif_skb_features); static int xmit_one(struct sk_buff *skb, struct net_device *dev, - struct netdev_queue *txq) + struct netdev_queue *txq, bool more) { unsigned int len; int rc; @@ -2610,7 +2610,7 @@ static int xmit_one(struct sk_buff *skb, struct net_device *dev, len = skb->len; trace_net_dev_start_xmit(skb, dev); - rc = netdev_start_xmit(skb, dev, txq, false); + rc = netdev_start_xmit(skb, dev, txq, more); trace_net_dev_xmit(skb, rc, dev, len); return rc; @@ -2626,7 +2626,7 @@ static struct sk_buff *xmit_list(struct sk_buff *first, struct net_device *dev, struct sk_buff *next = skb->next; skb->next = NULL; - rc = xmit_one(skb, dev, txq); + rc = xmit_one(skb, dev, txq, next != NULL); if (unlikely(!dev_xmit_complete(rc))) { skb->next = next; goto out; @@ -2705,7 +2705,7 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, } } - return xmit_one(skb, dev, txq); + return xmit_one(skb, dev, txq, false); } gso: -- cgit v1.2.3 From eae3f88ee44251bcca3a085f9565257c6f9f9e69 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sat, 30 Aug 2014 15:17:13 -0700 Subject: net: Separate out SKB validation logic from transmit path. dev_hard_start_xmit() does two things, it first validates and canonicalizes the SKB, then it actually sends it. Make a set of helper functions for doing the first part. Signed-off-by: David S. Miller --- net/core/dev.c | 125 ++++++++++++++++++++++++++++++++------------------------- 1 file changed, 71 insertions(+), 54 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 6d82194e414b..704a5434f77d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2644,80 +2644,97 @@ out: return skb; } -int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, - struct netdev_queue *txq) +struct sk_buff *validate_xmit_vlan(struct sk_buff *skb, netdev_features_t features) { - int rc = NETDEV_TX_OK; + if (vlan_tx_tag_present(skb) && + !vlan_hw_offload_capable(features, skb->vlan_proto)) { + skb = __vlan_put_tag(skb, skb->vlan_proto, + vlan_tx_tag_get(skb)); + if (skb) + skb->vlan_tci = 0; + } + return skb; +} - if (likely(!skb->next)) { - netdev_features_t features; +static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev) +{ + netdev_features_t features; - /* - * If device doesn't need skb->dst, release it right now while - * its hot in this cpu cache - */ - if (dev->priv_flags & IFF_XMIT_DST_RELEASE) - skb_dst_drop(skb); + if (skb->next) + return skb; - features = netif_skb_features(skb); + /* If device doesn't need skb->dst, release it right now while + * its hot in this cpu cache + */ + if (dev->priv_flags & IFF_XMIT_DST_RELEASE) + skb_dst_drop(skb); - if (vlan_tx_tag_present(skb) && - !vlan_hw_offload_capable(features, skb->vlan_proto)) { - skb = __vlan_put_tag(skb, skb->vlan_proto, - vlan_tx_tag_get(skb)); - if (unlikely(!skb)) - goto out; + features = netif_skb_features(skb); + skb = validate_xmit_vlan(skb, features); + if (unlikely(!skb)) + goto out_null; - skb->vlan_tci = 0; - } + /* If encapsulation offload request, verify we are testing + * hardware encapsulation features instead of standard + * features for the netdev + */ + if (skb->encapsulation) 
+ features &= dev->hw_enc_features; - /* If encapsulation offload request, verify we are testing - * hardware encapsulation features instead of standard - * features for the netdev - */ - if (skb->encapsulation) - features &= dev->hw_enc_features; + if (netif_needs_gso(skb, features)) { + if (unlikely(dev_gso_segment(skb, features))) + goto out_kfree_skb; + } else { + if (skb_needs_linearize(skb, features) && + __skb_linearize(skb)) + goto out_kfree_skb; - if (netif_needs_gso(skb, features)) { - if (unlikely(dev_gso_segment(skb, features))) - goto out_kfree_skb; - if (skb->next) - goto gso; - } else { - if (skb_needs_linearize(skb, features) && - __skb_linearize(skb)) + /* If packet is not checksummed and device does not + * support checksumming for this protocol, complete + * checksumming here. + */ + if (skb->ip_summed == CHECKSUM_PARTIAL) { + if (skb->encapsulation) + skb_set_inner_transport_header(skb, + skb_checksum_start_offset(skb)); + else + skb_set_transport_header(skb, + skb_checksum_start_offset(skb)); + if (!(features & NETIF_F_ALL_CSUM) && + skb_checksum_help(skb)) goto out_kfree_skb; - - /* If packet is not checksummed and device does not - * support checksumming for this protocol, complete - * checksumming here. - */ - if (skb->ip_summed == CHECKSUM_PARTIAL) { - if (skb->encapsulation) - skb_set_inner_transport_header(skb, - skb_checksum_start_offset(skb)); - else - skb_set_transport_header(skb, - skb_checksum_start_offset(skb)); - if (!(features & NETIF_F_ALL_CSUM) && - skb_checksum_help(skb)) - goto out_kfree_skb; - } } + } + + return skb; + +out_kfree_skb: + kfree_skb(skb); +out_null: + return NULL; +} + +int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, + struct netdev_queue *txq) +{ + int rc = NETDEV_TX_OK; + + skb = validate_xmit_skb(skb, dev); + if (!skb) + return rc; + if (likely(!skb->next)) return xmit_one(skb, dev, txq, false); - } -gso: skb->next = xmit_list(skb->next, dev, txq, &rc); if (likely(skb->next == NULL)) { skb->destructor = DEV_GSO_CB(skb)->destructor; consume_skb(skb); return rc; } -out_kfree_skb: + kfree_skb(skb); -out: + return rc; } EXPORT_SYMBOL_GPL(dev_hard_start_xmit); -- cgit v1.2.3 From 50cbe9ab5f8d92d2d4a327b56e96559d8f63a1fa Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sat, 30 Aug 2014 19:13:51 -0700 Subject: net: Validate xmit SKBs right when we pull them out of the qdisc. Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 1 + net/core/dev.c | 6 +----- net/sched/sch_generic.c | 5 ++++- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5050218c5b7f..47c49ba2dcf4 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2827,6 +2827,7 @@ int dev_set_mac_address(struct net_device *, struct sockaddr *); int dev_change_carrier(struct net_device *, bool new_carrier); int dev_get_phys_port_id(struct net_device *dev, struct netdev_phys_port_id *ppid); +struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev); int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq); int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb); diff --git a/net/core/dev.c b/net/core/dev.c index 704a5434f77d..75bc5b068a13 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2656,7 +2656,7 @@ struct sk_buff *validate_xmit_vlan(struct sk_buff *skb, netdev_features_t featur return skb; } -static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev) +struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev) { netdev_features_t features; @@ -2719,10 +2719,6 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, { int rc = NETDEV_TX_OK; - skb = validate_xmit_skb(skb, dev); - if (!skb) - return rc; - if (likely(!skb->next)) return xmit_one(skb, dev, txq, false); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 05b3f5d104af..f178798a5836 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -70,8 +70,11 @@ static inline struct sk_buff *dequeue_skb(struct Qdisc *q) } else skb = NULL; } else { - if (!(q->flags & TCQ_F_ONETXQUEUE) || !netif_xmit_frozen_or_stopped(txq)) + if (!(q->flags & TCQ_F_ONETXQUEUE) || !netif_xmit_frozen_or_stopped(txq)) { skb = q->dequeue(q); + if (skb) + skb = validate_xmit_skb(skb, qdisc_dev(q)); + } } return skb; -- cgit v1.2.3 From ce93718fb7cdbc064c3000ff59e4d3200bdfa744 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sat, 30 Aug 2014 19:22:20 -0700 Subject: net: Don't keep around original SKB when we software segment GSO frames. Just maintain the list properly by returning the head of the remaining SKB list from dev_hard_start_xmit(). Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 4 +-- net/core/dev.c | 79 +++++++++-------------------------------------- net/sched/sch_generic.c | 2 +- 3 files changed, 17 insertions(+), 68 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 47c49ba2dcf4..202c25a9aadf 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2828,8 +2828,8 @@ int dev_change_carrier(struct net_device *, bool new_carrier); int dev_get_phys_port_id(struct net_device *dev, struct netdev_phys_port_id *ppid); struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev); -int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, - struct netdev_queue *txq); +struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, + struct netdev_queue *txq, int *ret); int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb); int dev_forward_skb(struct net_device *dev, struct sk_buff *skb); bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb); diff --git a/net/core/dev.c b/net/core/dev.c index 75bc5b068a13..c89da4f306b1 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2485,52 +2485,6 @@ static int illegal_highdma(struct net_device *dev, struct sk_buff *skb) return 0; } -struct dev_gso_cb { - void (*destructor)(struct sk_buff *skb); -}; - -#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb) - -static void dev_gso_skb_destructor(struct sk_buff *skb) -{ - struct dev_gso_cb *cb; - - kfree_skb_list(skb->next); - skb->next = NULL; - - cb = DEV_GSO_CB(skb); - if (cb->destructor) - cb->destructor(skb); -} - -/** - * dev_gso_segment - Perform emulated hardware segmentation on skb. - * @skb: buffer to segment - * @features: device features as applicable to this skb - * - * This function segments the given skb and stores the list of segments - * in skb->next. - */ -static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features) -{ - struct sk_buff *segs; - - segs = skb_gso_segment(skb, features); - - /* Verifying header integrity only. */ - if (!segs) - return 0; - - if (IS_ERR(segs)) - return PTR_ERR(segs); - - skb->next = segs; - DEV_GSO_CB(skb)->destructor = skb->destructor; - skb->destructor = dev_gso_skb_destructor; - - return 0; -} - /* If MPLS offload request, verify we are testing hardware MPLS features * instead of standard features for the netdev. 
*/ @@ -2682,8 +2636,13 @@ struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev) features &= dev->hw_enc_features; if (netif_needs_gso(skb, features)) { - if (unlikely(dev_gso_segment(skb, features))) - goto out_kfree_skb; + struct sk_buff *segs; + + segs = skb_gso_segment(skb, features); + kfree_skb(skb); + if (IS_ERR(segs)) + segs = NULL; + skb = segs; } else { if (skb_needs_linearize(skb, features) && __skb_linearize(skb)) @@ -2714,26 +2673,16 @@ out_null: return NULL; } -int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, - struct netdev_queue *txq) +struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, + struct netdev_queue *txq, int *ret) { - int rc = NETDEV_TX_OK; - - if (likely(!skb->next)) - return xmit_one(skb, dev, txq, false); - - skb->next = xmit_list(skb->next, dev, txq, &rc); - if (likely(skb->next == NULL)) { - skb->destructor = DEV_GSO_CB(skb)->destructor; - consume_skb(skb); - return rc; + if (likely(!skb->next)) { + *ret = xmit_one(skb, dev, txq, false); + return skb; } - kfree_skb(skb); - - return rc; + return xmit_list(skb, dev, txq, ret); } -EXPORT_SYMBOL_GPL(dev_hard_start_xmit); static void qdisc_pkt_len_init(struct sk_buff *skb) { @@ -2945,7 +2894,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) if (!netif_xmit_stopped(txq)) { __this_cpu_inc(xmit_recursion); - rc = dev_hard_start_xmit(skb, dev, txq); + skb = dev_hard_start_xmit(skb, dev, txq, &rc); __this_cpu_dec(xmit_recursion); if (dev_xmit_complete(rc)) { HARD_TX_UNLOCK(dev, txq); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index f178798a5836..a8bf9f9928bd 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -129,7 +129,7 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, HARD_TX_LOCK(dev, txq, smp_processor_id()); if (!netif_xmit_frozen_or_stopped(txq)) - ret = dev_hard_start_xmit(skb, dev, txq); + skb = dev_hard_start_xmit(skb, dev, txq, &ret); HARD_TX_UNLOCK(dev, txq); -- cgit v1.2.3 From 8dcda22a5d0abaf347b21b057655f3809b91639d Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 1 Sep 2014 15:06:40 -0700 Subject: net: xmit_list() becomes dev_hard_start_xmit(). Now fundamentally we can process lists of SKBs as cheaply as single packets. Signed-off-by: David S. 
Miller --- net/core/dev.c | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index c89da4f306b1..6857d57aa294 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2570,8 +2570,8 @@ static int xmit_one(struct sk_buff *skb, struct net_device *dev, return rc; } -static struct sk_buff *xmit_list(struct sk_buff *first, struct net_device *dev, - struct netdev_queue *txq, int *ret) +struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev, + struct netdev_queue *txq, int *ret) { struct sk_buff *skb = first; int rc = NETDEV_TX_OK; @@ -2673,17 +2673,6 @@ out_null: return NULL; } -struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, - struct netdev_queue *txq, int *ret) -{ - if (likely(!skb->next)) { - *ret = xmit_one(skb, dev, txq, false); - return skb; - } - - return xmit_list(skb, dev, txq, ret); -} - static void qdisc_pkt_len_init(struct sk_buff *skb) { const struct skb_shared_info *shinfo = skb_shinfo(skb); -- cgit v1.2.3 From 5a21232983aa7acfe7fd26170832a9e0a4a7b4ae Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Sun, 31 Aug 2014 15:12:41 -0700 Subject: net: Support for csum_bad in skbuff This flag indicates that an invalid checksum was detected in the packet. __skb_mark_checksum_bad helper function was added to set this. Checksums can be marked bad from a driver or the GRO path (the latter is implemented in this patch). csum_bad is checked in __skb_checksum_validate_complete (i.e. calling that when ip_summed == CHECKSUM_NONE). csum_bad works in conjunction with ip_summed value. In the case that ip_summed is CHECKSUM_NONE and csum_bad is set, this implies that the first (or next) checksum encountered in the packet is bad. When ip_summed is CHECKSUM_UNNECESSARY, the first checksum after the last one validated is bad. For example, if ip_summed == CHECKSUM_UNNECESSARY, csum_level == 1, and csum_bad is set-- then the third checksum in the packet is bad. In the normal path, the packet will be dropped when processing the protocol layer of the bad checksum: __skb_decr_checksum_unnecessary called twice for the good checksums changing ip_summed to CHECKSUM_NONE so that __skb_checksum_validate_complete is called to validate the third checksum and that will fail since csum_bad is set. Signed-off-by: Tom Herbert Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 4 +++- include/linux/skbuff.h | 21 ++++++++++++++++++++- net/core/dev.c | 2 +- 3 files changed, 24 insertions(+), 3 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 202c25a9aadf..a0ab6d9d400a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2216,7 +2216,9 @@ static inline void skb_gro_incr_csum_unnecessary(struct sk_buff *skb) if (__skb_gro_checksum_validate_needed(skb, zero_okay, check)) \ __ret = __skb_gro_checksum_validate_complete(skb, \ compute_pseudo(skb, proto)); \ - if (!__ret) \ + if (__ret) \ + __skb_mark_checksum_bad(skb); \ + else \ skb_gro_incr_csum_unnecessary(skb); \ __ret; \ }) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index c93b5859a772..23710a243439 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -617,7 +617,8 @@ struct sk_buff { kmemcheck_bitfield_begin(flags3); __u8 csum_level:2; - /* 14 bit hole */ + __u8 csum_bad:1; + /* 13 bit hole */ kmemcheck_bitfield_end(flags3); __be16 inner_protocol; @@ -2825,6 +2826,21 @@ static inline void __skb_incr_checksum_unnecessary(struct sk_buff *skb) } } +static inline void __skb_mark_checksum_bad(struct sk_buff *skb) +{ + /* Mark current checksum as bad (typically called from GRO + * path). In the case that ip_summed is CHECKSUM_NONE + * this must be the first checksum encountered in the packet. + * When ip_summed is CHECKSUM_UNNECESSARY, this is the first + * checksum after the last one validated. For UDP, a zero + * checksum can not be marked as bad. + */ + + if (skb->ip_summed == CHECKSUM_NONE || + skb->ip_summed == CHECKSUM_UNNECESSARY) + skb->csum_bad = 1; +} + /* Check if we need to perform checksum complete validation. * * Returns true if checksum complete is needed, false otherwise @@ -2866,6 +2882,9 @@ static inline __sum16 __skb_checksum_validate_complete(struct sk_buff *skb, skb->csum_valid = 1; return 0; } + } else if (skb->csum_bad) { + /* ip_summed == CHECKSUM_NONE in this case */ + return 1; } skb->csum = psum; diff --git a/net/core/dev.c b/net/core/dev.c index 6857d57aa294..3774afc3bebf 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3918,7 +3918,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff if (!(skb->dev->features & NETIF_F_GRO)) goto normal; - if (skb_is_gso(skb) || skb_has_frag_list(skb)) + if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad) goto normal; gro_list_prepare(napi, skb); -- cgit v1.2.3 From 1f59533f9ca5634e7b8914252e48aee9d9cbe501 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 3 Sep 2014 17:56:09 +0200 Subject: qdisc: validate frames going through the direct_xmit path In commit 50cbe9ab5f8d ("net: Validate xmit SKBs right when we pull them out of the qdisc") the validation code was moved out of dev_hard_start_xmit and into dequeue_skb. However this overlooked the fact that we do not always enqueue the skb onto a qdisc. First situation is if qdisc have flag TCQ_F_CAN_BYPASS and qdisc is empty. Second situation is if there is no qdisc on the device, which is a common case for software devices. Originally spotted and inital patch by Alexander Duyck. As a result Alex was seeing issues trying to connect to a vhost_net interface after commit 50cbe9ab5f8d was applied. Added a call to validate_xmit_skb() in __dev_xmit_skb(), in the code path for qdiscs with TCQ_F_CAN_BYPASS flag, and in __dev_queue_xmit() when no qdisc. 
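For illustration, a minimal sketch of how the TCQ_F_CAN_BYPASS branch of __dev_xmit_skb() is shaped once this call is added (the qdisc_run_begin()/__qdisc_run()/qdisc_run_end() framing is recalled from the kernel of this era and is not part of the hunk itself; busylock/contention handling is omitted, so this is not the literal code):

        /* bypass path: qdisc is empty, transmit directly without enqueueing */
        if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
            qdisc_run_begin(q)) {
                qdisc_bstats_update(q, skb);

                /* validate here, since dequeue_skb() is never reached on this path */
                skb = validate_xmit_skb(skb, dev);      /* may return NULL or an skb list */
                if (skb && sch_direct_xmit(skb, q, dev, txq, root_lock))
                        __qdisc_run(q);                 /* queue may have filled meanwhile */
                else
                        qdisc_run_end(q);

                rc = NET_XMIT_SUCCESS;
        }
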
Also handle the error situation where dev_hard_start_xmit() could return a skb list, and does not return dev_xmit_complete(rc) and falls through to the kfree_skb(), in that situation it should call kfree_skb_list(). Fixes: 50cbe9ab5f8d ("net: Validate xmit SKBs right when we pull them out of the qdisc") Signed-off-by: Alexander Duyck Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- net/core/dev.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 3774afc3bebf..2f3dbd657570 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2739,7 +2739,8 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, qdisc_bstats_update(q, skb); - if (sch_direct_xmit(skb, q, dev, txq, root_lock)) { + skb = validate_xmit_skb(skb, dev); + if (skb && sch_direct_xmit(skb, q, dev, txq, root_lock)) { if (unlikely(contended)) { spin_unlock(&q->busylock); contended = false; @@ -2879,6 +2880,10 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT) goto recursion_alert; + skb = validate_xmit_skb(skb, dev); + if (!skb) + goto drop; + HARD_TX_LOCK(dev, txq, cpu); if (!netif_xmit_stopped(txq)) { @@ -2904,10 +2909,11 @@ recursion_alert: } rc = -ENETDOWN; +drop: rcu_read_unlock_bh(); atomic_long_inc(&dev->tx_dropped); - kfree_skb(skb); + kfree_skb_list(skb); return rc; out: rcu_read_unlock_bh(); -- cgit v1.2.3 From 46e5da40aec256155cfedee96dd21a75da941f2c Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 12 Sep 2014 20:04:52 -0700 Subject: net: qdisc: use rcu prefix and silence sparse warnings Add __rcu notation to qdisc handling by doing this we can make smatch output more legible. And anyways some of the cases should be using rcu_dereference() see qdisc_all_tx_empty(), qdisc_tx_chainging(), and so on. Also *wake_queue() API is commonly called from driver timer routines without rcu lock or rtnl lock. So I added rcu_read_lock() blocks around netif_wake_subqueue and netif_tx_wake_queue. Signed-off-by: John Fastabend Acked-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 29 ++++----------------------- include/net/sch_generic.h | 21 +++++++++++++------ net/core/dev.c | 51 +++++++++++++++++++++++++++++++++++++++++++++-- net/sched/sch_generic.c | 4 ++-- net/sched/sch_mqprio.c | 6 ++++-- net/sched/sch_teql.c | 13 +++++++----- 6 files changed, 82 insertions(+), 42 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ba72f6baae1a..ae721f53739e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -543,7 +543,7 @@ struct netdev_queue { * read mostly part */ struct net_device *dev; - struct Qdisc *qdisc; + struct Qdisc __rcu *qdisc; struct Qdisc *qdisc_sleeping; #ifdef CONFIG_SYSFS struct kobject kobj; @@ -2356,12 +2356,7 @@ static inline void input_queue_tail_incr_save(struct softnet_data *sd, DECLARE_PER_CPU_ALIGNED(struct softnet_data, softnet_data); void __netif_schedule(struct Qdisc *q); - -static inline void netif_schedule_queue(struct netdev_queue *txq) -{ - if (!(txq->state & QUEUE_STATE_ANY_XOFF)) - __netif_schedule(txq->qdisc); -} +void netif_schedule_queue(struct netdev_queue *txq); static inline void netif_tx_schedule_all(struct net_device *dev) { @@ -2397,11 +2392,7 @@ static inline void netif_tx_start_all_queues(struct net_device *dev) } } -static inline void netif_tx_wake_queue(struct netdev_queue *dev_queue) -{ - if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) - __netif_schedule(dev_queue->qdisc); -} +void netif_tx_wake_queue(struct netdev_queue *dev_queue); /** * netif_wake_queue - restart transmit @@ -2673,19 +2664,7 @@ static inline bool netif_subqueue_stopped(const struct net_device *dev, return __netif_subqueue_stopped(dev, skb_get_queue_mapping(skb)); } -/** - * netif_wake_subqueue - allow sending packets on subqueue - * @dev: network device - * @queue_index: sub queue index - * - * Resume individual transmit queue of a device with multiple transmit queues. 
- */ -static inline void netif_wake_subqueue(struct net_device *dev, u16 queue_index) -{ - struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index); - if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) - __netif_schedule(txq->qdisc); -} +void netif_wake_subqueue(struct net_device *dev, u16 queue_index); #ifdef CONFIG_XPS int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index a3cfb8ebeb53..56838ab29b42 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -259,7 +259,9 @@ static inline spinlock_t *qdisc_lock(struct Qdisc *qdisc) static inline struct Qdisc *qdisc_root(const struct Qdisc *qdisc) { - return qdisc->dev_queue->qdisc; + struct Qdisc *q = rcu_dereference_rtnl(qdisc->dev_queue->qdisc); + + return q; } static inline struct Qdisc *qdisc_root_sleeping(const struct Qdisc *qdisc) @@ -384,7 +386,7 @@ static inline void qdisc_reset_all_tx_gt(struct net_device *dev, unsigned int i) struct Qdisc *qdisc; for (; i < dev->num_tx_queues; i++) { - qdisc = netdev_get_tx_queue(dev, i)->qdisc; + qdisc = rtnl_dereference(netdev_get_tx_queue(dev, i)->qdisc); if (qdisc) { spin_lock_bh(qdisc_lock(qdisc)); qdisc_reset(qdisc); @@ -402,13 +404,18 @@ static inline void qdisc_reset_all_tx(struct net_device *dev) static inline bool qdisc_all_tx_empty(const struct net_device *dev) { unsigned int i; + + rcu_read_lock(); for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq = netdev_get_tx_queue(dev, i); - const struct Qdisc *q = txq->qdisc; + const struct Qdisc *q = rcu_dereference(txq->qdisc); - if (q->q.qlen) + if (q->q.qlen) { + rcu_read_unlock(); return false; + } } + rcu_read_unlock(); return true; } @@ -416,9 +423,10 @@ static inline bool qdisc_all_tx_empty(const struct net_device *dev) static inline bool qdisc_tx_changing(const struct net_device *dev) { unsigned int i; + for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq = netdev_get_tx_queue(dev, i); - if (txq->qdisc != txq->qdisc_sleeping) + if (rcu_access_pointer(txq->qdisc) != txq->qdisc_sleeping) return true; } return false; @@ -428,9 +436,10 @@ static inline bool qdisc_tx_changing(const struct net_device *dev) static inline bool qdisc_tx_is_noop(const struct net_device *dev) { unsigned int i; + for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq = netdev_get_tx_queue(dev, i); - if (txq->qdisc != &noop_qdisc) + if (rcu_access_pointer(txq->qdisc) != &noop_qdisc) return false; } return true; diff --git a/net/core/dev.c b/net/core/dev.c index 3c6a967e5830..b3d6dbc0c696 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2177,6 +2177,53 @@ static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb) return (struct dev_kfree_skb_cb *)skb->cb; } +void netif_schedule_queue(struct netdev_queue *txq) +{ + rcu_read_lock(); + if (!(txq->state & QUEUE_STATE_ANY_XOFF)) { + struct Qdisc *q = rcu_dereference(txq->qdisc); + + __netif_schedule(q); + } + rcu_read_unlock(); +} +EXPORT_SYMBOL(netif_schedule_queue); + +/** + * netif_wake_subqueue - allow sending packets on subqueue + * @dev: network device + * @queue_index: sub queue index + * + * Resume individual transmit queue of a device with multiple transmit queues. 
+ */ +void netif_wake_subqueue(struct net_device *dev, u16 queue_index) +{ + struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index); + + if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) { + struct Qdisc *q; + + rcu_read_lock(); + q = rcu_dereference(txq->qdisc); + __netif_schedule(q); + rcu_read_unlock(); + } +} +EXPORT_SYMBOL(netif_wake_subqueue); + +void netif_tx_wake_queue(struct netdev_queue *dev_queue) +{ + if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) { + struct Qdisc *q; + + rcu_read_lock(); + q = rcu_dereference(dev_queue->qdisc); + __netif_schedule(q); + rcu_read_unlock(); + } +} +EXPORT_SYMBOL(netif_tx_wake_queue); + void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason) { unsigned long flags; @@ -3432,7 +3479,7 @@ static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq) skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl); skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS); - q = rxq->qdisc; + q = rcu_dereference(rxq->qdisc); if (q != &noop_qdisc) { spin_lock(qdisc_lock(q)); if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) @@ -3449,7 +3496,7 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb, { struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue); - if (!rxq || rxq->qdisc == &noop_qdisc) + if (!rxq || rcu_access_pointer(rxq->qdisc) == &noop_qdisc) goto out; if (*pt_prev) { diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 19696ebe9ebc..346ef85617d3 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -783,7 +783,7 @@ static void dev_deactivate_queue(struct net_device *dev, struct Qdisc *qdisc_default = _qdisc_default; struct Qdisc *qdisc; - qdisc = dev_queue->qdisc; + qdisc = rtnl_dereference(dev_queue->qdisc); if (qdisc) { spin_lock_bh(qdisc_lock(qdisc)); @@ -876,7 +876,7 @@ static void dev_init_scheduler_queue(struct net_device *dev, { struct Qdisc *qdisc = _qdisc; - dev_queue->qdisc = qdisc; + rcu_assign_pointer(dev_queue->qdisc, qdisc); dev_queue->qdisc_sleeping = qdisc; } diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index 6749e2f540d0..37e7d25d21f1 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -231,7 +231,7 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb) memset(&sch->qstats, 0, sizeof(sch->qstats)); for (i = 0; i < dev->num_tx_queues; i++) { - qdisc = netdev_get_tx_queue(dev, i)->qdisc; + qdisc = rtnl_dereference(netdev_get_tx_queue(dev, i)->qdisc); spin_lock_bh(qdisc_lock(qdisc)); sch->q.qlen += qdisc->q.qlen; sch->bstats.bytes += qdisc->bstats.bytes; @@ -340,7 +340,9 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, spin_unlock_bh(d->lock); for (i = tc.offset; i < tc.offset + tc.count; i++) { - qdisc = netdev_get_tx_queue(dev, i)->qdisc; + struct netdev_queue *q = netdev_get_tx_queue(dev, i); + + qdisc = rtnl_dereference(q->qdisc); spin_lock_bh(qdisc_lock(qdisc)); bstats.bytes += qdisc->bstats.bytes; bstats.packets += qdisc->bstats.packets; diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index aaa8d03ed054..5cd291bd00e4 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c @@ -96,11 +96,14 @@ teql_dequeue(struct Qdisc *sch) struct teql_sched_data *dat = qdisc_priv(sch); struct netdev_queue *dat_queue; struct sk_buff *skb; + struct Qdisc *q; skb = __skb_dequeue(&dat->q); dat_queue = netdev_get_tx_queue(dat->m->dev, 0); + q = rcu_dereference_bh(dat_queue->qdisc); + if (skb == NULL) { - struct net_device *m = qdisc_dev(dat_queue->qdisc); 
+ struct net_device *m = qdisc_dev(q); if (m) { dat->m->slaves = sch; netif_wake_queue(m); @@ -108,7 +111,7 @@ teql_dequeue(struct Qdisc *sch) } else { qdisc_bstats_update(sch, skb); } - sch->q.qlen = dat->q.qlen + dat_queue->qdisc->q.qlen; + sch->q.qlen = dat->q.qlen + q->q.qlen; return skb; } @@ -157,9 +160,9 @@ teql_destroy(struct Qdisc *sch) txq = netdev_get_tx_queue(master->dev, 0); master->slaves = NULL; - root_lock = qdisc_root_sleeping_lock(txq->qdisc); + root_lock = qdisc_root_sleeping_lock(rtnl_dereference(txq->qdisc)); spin_lock_bh(root_lock); - qdisc_reset(txq->qdisc); + qdisc_reset(rtnl_dereference(txq->qdisc)); spin_unlock_bh(root_lock); } } @@ -266,7 +269,7 @@ static inline int teql_resolve(struct sk_buff *skb, struct dst_entry *dst = skb_dst(skb); int res; - if (txq->qdisc == &noop_qdisc) + if (rcu_access_pointer(txq->qdisc) == &noop_qdisc) return -ENODEV; if (!dev->header_ops || !dst) -- cgit v1.2.3 From 6c555490e0ce885a9caf0a045db69382a3ccbc9c Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Thu, 11 Sep 2014 15:35:09 -0700 Subject: ipv6: drop useless rcu_read_lock() in anycast These code is now protected by rtnl lock, rcu read lock is useless now. Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 ++-- net/core/dev.c | 14 ++++++++------ net/ipv6/anycast.c | 18 ++++++------------ 3 files changed, 16 insertions(+), 20 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ae721f53739e..ee38b948d9a0 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2083,8 +2083,8 @@ void __dev_remove_pack(struct packet_type *pt); void dev_add_offload(struct packet_offload *po); void dev_remove_offload(struct packet_offload *po); -struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short flags, - unsigned short mask); +struct net_device *__dev_get_by_flags(struct net *net, unsigned short flags, + unsigned short mask); struct net_device *dev_get_by_name(struct net *net, const char *name); struct net_device *dev_get_by_name_rcu(struct net *net, const char *name); struct net_device *__dev_get_by_name(struct net *net, const char *name); diff --git a/net/core/dev.c b/net/core/dev.c index b3d6dbc0c696..e916ba8caccf 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -897,23 +897,25 @@ struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type) EXPORT_SYMBOL(dev_getfirstbyhwtype); /** - * dev_get_by_flags_rcu - find any device with given flags + * __dev_get_by_flags - find any device with given flags * @net: the applicable net namespace * @if_flags: IFF_* values * @mask: bitmask of bits in if_flags to check * * Search for any interface with the given flags. Returns NULL if a device * is not found or a pointer to the device. Must be called inside - * rcu_read_lock(), and result refcount is unchanged. + * rtnl_lock(), and result refcount is unchanged. 
*/ -struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags, - unsigned short mask) +struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags, + unsigned short mask) { struct net_device *dev, *ret; + ASSERT_RTNL(); + ret = NULL; - for_each_netdev_rcu(net, dev) { + for_each_netdev(net, dev) { if (((dev->flags ^ if_flags) & mask) == 0) { ret = dev; break; @@ -921,7 +923,7 @@ struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags } return ret; } -EXPORT_SYMBOL(dev_get_by_flags_rcu); +EXPORT_SYMBOL(__dev_get_by_flags); /** * dev_valid_name - check if name is okay for network device diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index ff2de7d9d8e6..3b0429bc6b5a 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -78,7 +78,6 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr) pac->acl_addr = *addr; rtnl_lock(); - rcu_read_lock(); if (ifindex == 0) { struct rt6_info *rt; @@ -91,11 +90,11 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr) goto error; } else { /* router, no matching interface: just pick one */ - dev = dev_get_by_flags_rcu(net, IFF_UP, - IFF_UP | IFF_LOOPBACK); + dev = __dev_get_by_flags(net, IFF_UP, + IFF_UP | IFF_LOOPBACK); } } else - dev = dev_get_by_index_rcu(net, ifindex); + dev = __dev_get_by_index(net, ifindex); if (dev == NULL) { err = -ENODEV; @@ -137,7 +136,6 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr) } error: - rcu_read_unlock(); rtnl_unlock(); if (pac) sock_kfree_s(sk, pac, sizeof(*pac)); @@ -174,11 +172,9 @@ int ipv6_sock_ac_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) spin_unlock_bh(&ipv6_sk_ac_lock); rtnl_lock(); - rcu_read_lock(); - dev = dev_get_by_index_rcu(net, pac->acl_ifindex); + dev = __dev_get_by_index(net, pac->acl_ifindex); if (dev) ipv6_dev_ac_dec(dev, &pac->acl_addr); - rcu_read_unlock(); rtnl_unlock(); sock_kfree_s(sk, pac, sizeof(*pac)); @@ -203,12 +199,11 @@ void ipv6_sock_ac_close(struct sock *sk) prev_index = 0; rtnl_lock(); - rcu_read_lock(); while (pac) { struct ipv6_ac_socklist *next = pac->acl_next; if (pac->acl_ifindex != prev_index) { - dev = dev_get_by_index_rcu(net, pac->acl_ifindex); + dev = __dev_get_by_index(net, pac->acl_ifindex); prev_index = pac->acl_ifindex; } if (dev) @@ -216,7 +211,6 @@ void ipv6_sock_ac_close(struct sock *sk) sock_kfree_s(sk, pac, sizeof(*pac)); pac = next; } - rcu_read_unlock(); rtnl_unlock(); } @@ -341,7 +335,7 @@ int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr) return 0; } -/* called with rcu_read_lock() */ +/* called with rtnl_lock() */ static int ipv6_dev_ac_dec(struct net_device *dev, const struct in6_addr *addr) { struct inet6_dev *idev = __in6_dev_get(dev); -- cgit v1.2.3 From cecda693a969816bac5e470e1d9c9c0ef5567bca Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Fri, 19 Sep 2014 16:04:38 +0800 Subject: net: keep original skb which only needs header checking during software GSO Commit ce93718fb7cdbc064c3000ff59e4d3200bdfa744 ("net: Don't keep around original SKB when we software segment GSO frames") frees the original skb after software GSO even for dodgy gso skbs. This breaks the stream throughput from untrusted sources, since only header checking was done during software GSO instead of a true segmentation. This patch fixes this by freeing the original gso skb only when it was really segmented by software. 
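Put differently, the GSO branch of validate_xmit_skb() ends up shaped roughly like the sketch below (condensed from the hunk that follows; all names are taken from that hunk):

        segs = skb_gso_segment(skb, features);
        if (IS_ERR(segs)) {
                segs = NULL;            /* segmentation failed */
        } else if (segs) {
                consume_skb(skb);       /* really segmented: original skb no longer needed */
                skb = segs;             /* transmit the freshly built segment list instead */
        }
        /* segs == NULL: a dodgy gso skb only had its headers checked, keep the original */
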
Fixes ce93718fb7cdbc064c3000ff59e4d3200bdfa744 ("net: Don't keep around original SKB when we software segment GSO frames.") Cc: David S. Miller Cc: Eric Dumazet Signed-off-by: Jason Wang Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index e916ba8caccf..52cd71a4a343 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2694,10 +2694,12 @@ struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev) struct sk_buff *segs; segs = skb_gso_segment(skb, features); - kfree_skb(skb); - if (IS_ERR(segs)) + if (IS_ERR(segs)) { segs = NULL; - skb = segs; + } else if (segs) { + consume_skb(skb); + skb = segs; + } } else { if (skb_needs_linearize(skb, features) && __skb_linearize(skb)) -- cgit v1.2.3 From 53e50398968d43338c4d932114e68bc099fc5fbd Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Sat, 20 Sep 2014 14:52:30 -0700 Subject: net: Remove gso_send_check as an offload callback The send_check logic was only interesting in cases of TCP offload and UDP UFO where the checksum needed to be initialized to the pseudo header checksum. Now we've moved that logic into the related gso_segment functions so gso_send_check is no longer needed. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 - net/core/dev.c | 10 ---------- net/ipv4/af_inet.c | 36 ------------------------------------ net/ipv4/gre_offload.c | 11 +++-------- net/ipv4/tcp_offload.c | 6 ------ net/ipv4/udp_offload.c | 6 ------ net/ipv6/ip6_offload.c | 27 --------------------------- net/ipv6/tcpv6_offload.c | 6 ------ net/ipv6/udp_offload.c | 6 ------ net/mpls/mpls_gso.c | 7 ------- 10 files changed, 3 insertions(+), 113 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 4354b4307e37..9f5d293a0281 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1911,7 +1911,6 @@ struct packet_type { struct offload_callbacks { struct sk_buff *(*gso_segment)(struct sk_buff *skb, netdev_features_t features); - int (*gso_send_check)(struct sk_buff *skb); struct sk_buff **(*gro_receive)(struct sk_buff **head, struct sk_buff *skb); int (*gro_complete)(struct sk_buff *skb, int nhoff); diff --git a/net/core/dev.c b/net/core/dev.c index db0388607329..e2ced01c01e4 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2422,16 +2422,6 @@ struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb, rcu_read_lock(); list_for_each_entry_rcu(ptype, &offload_base, list) { if (ptype->type == type && ptype->callbacks.gso_segment) { - if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) { - int err; - - err = ptype->callbacks.gso_send_check(skb); - segs = ERR_PTR(err); - if (err || skb_gso_ok(skb, features)) - break; - __skb_push(skb, (skb->data - - skb_network_header(skb))); - } segs = ptype->callbacks.gso_segment(skb, features); break; } diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 72011cc4c13b..28e589c5f32d 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1197,40 +1197,6 @@ int inet_sk_rebuild_header(struct sock *sk) } EXPORT_SYMBOL(inet_sk_rebuild_header); -static int inet_gso_send_check(struct sk_buff *skb) -{ - const struct net_offload *ops; - const struct iphdr *iph; - int proto; - int ihl; - int err = -EINVAL; - - if (unlikely(!pskb_may_pull(skb, sizeof(*iph)))) - goto out; - - iph = ip_hdr(skb); - ihl = iph->ihl * 4; - if (ihl < sizeof(*iph)) - goto 
out; - - proto = iph->protocol; - - /* Warning: after this point, iph might be no longer valid */ - if (unlikely(!pskb_may_pull(skb, ihl))) - goto out; - __skb_pull(skb, ihl); - - skb_reset_transport_header(skb); - err = -EPROTONOSUPPORT; - - ops = rcu_dereference(inet_offloads[proto]); - if (likely(ops && ops->callbacks.gso_send_check)) - err = ops->callbacks.gso_send_check(skb); - -out: - return err; -} - static struct sk_buff *inet_gso_segment(struct sk_buff *skb, netdev_features_t features) { @@ -1655,7 +1621,6 @@ static int ipv4_proc_init(void); static struct packet_offload ip_packet_offload __read_mostly = { .type = cpu_to_be16(ETH_P_IP), .callbacks = { - .gso_send_check = inet_gso_send_check, .gso_segment = inet_gso_segment, .gro_receive = inet_gro_receive, .gro_complete = inet_gro_complete, @@ -1664,7 +1629,6 @@ static struct packet_offload ip_packet_offload __read_mostly = { static const struct net_offload ipip_offload = { .callbacks = { - .gso_send_check = inet_gso_send_check, .gso_segment = inet_gso_segment, .gro_receive = inet_gro_receive, .gro_complete = inet_gro_complete, diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index d3fe2ac05167..a77729503071 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -15,13 +15,6 @@ #include #include -static int gre_gso_send_check(struct sk_buff *skb) -{ - if (!skb->encapsulation) - return -EINVAL; - return 0; -} - static struct sk_buff *gre_gso_segment(struct sk_buff *skb, netdev_features_t features) { @@ -46,6 +39,9 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, SKB_GSO_IPIP))) goto out; + if (!skb->encapsulation) + goto out; + if (unlikely(!pskb_may_pull(skb, sizeof(*greh)))) goto out; @@ -256,7 +252,6 @@ static int gre_gro_complete(struct sk_buff *skb, int nhoff) static const struct net_offload gre_offload = { .callbacks = { - .gso_send_check = gre_gso_send_check, .gso_segment = gre_gso_segment, .gro_receive = gre_gro_receive, .gro_complete = gre_gro_complete, diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 7cd12b0458ff..5b90f2f447a5 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -288,11 +288,6 @@ int tcp_gro_complete(struct sk_buff *skb) } EXPORT_SYMBOL(tcp_gro_complete); -static int tcp_v4_gso_send_check(struct sk_buff *skb) -{ - return 0; -} - static struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb) { /* Don't bother verifying checksum if we're going to flush anyway. 
*/ @@ -320,7 +315,6 @@ static int tcp4_gro_complete(struct sk_buff *skb, int thoff) static const struct net_offload tcpv4_offload = { .callbacks = { - .gso_send_check = tcp_v4_gso_send_check, .gso_segment = tcp4_gso_segment, .gro_receive = tcp4_gro_receive, .gro_complete = tcp4_gro_complete, diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 2918cc914824..19ebe6a39ddc 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -25,11 +25,6 @@ struct udp_offload_priv { struct udp_offload_priv __rcu *next; }; -static int udp4_ufo_send_check(struct sk_buff *skb) -{ - return 0; -} - struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, netdev_features_t features) { @@ -346,7 +341,6 @@ static int udp4_gro_complete(struct sk_buff *skb, int nhoff) static const struct net_offload udpv4_offload = { .callbacks = { - .gso_send_check = udp4_ufo_send_check, .gso_segment = udp4_ufo_fragment, .gro_receive = udp4_gro_receive, .gro_complete = udp4_gro_complete, diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index 9952f3fce30a..9034f76ae013 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -53,31 +53,6 @@ static int ipv6_gso_pull_exthdrs(struct sk_buff *skb, int proto) return proto; } -static int ipv6_gso_send_check(struct sk_buff *skb) -{ - const struct ipv6hdr *ipv6h; - const struct net_offload *ops; - int err = -EINVAL; - - if (unlikely(!pskb_may_pull(skb, sizeof(*ipv6h)))) - goto out; - - ipv6h = ipv6_hdr(skb); - __skb_pull(skb, sizeof(*ipv6h)); - err = -EPROTONOSUPPORT; - - ops = rcu_dereference(inet6_offloads[ - ipv6_gso_pull_exthdrs(skb, ipv6h->nexthdr)]); - - if (likely(ops && ops->callbacks.gso_send_check)) { - skb_reset_transport_header(skb); - err = ops->callbacks.gso_send_check(skb); - } - -out: - return err; -} - static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, netdev_features_t features) { @@ -306,7 +281,6 @@ out_unlock: static struct packet_offload ipv6_packet_offload __read_mostly = { .type = cpu_to_be16(ETH_P_IPV6), .callbacks = { - .gso_send_check = ipv6_gso_send_check, .gso_segment = ipv6_gso_segment, .gro_receive = ipv6_gro_receive, .gro_complete = ipv6_gro_complete, @@ -315,7 +289,6 @@ static struct packet_offload ipv6_packet_offload __read_mostly = { static const struct net_offload sit_offload = { .callbacks = { - .gso_send_check = ipv6_gso_send_check, .gso_segment = ipv6_gso_segment, .gro_receive = ipv6_gro_receive, .gro_complete = ipv6_gro_complete, diff --git a/net/ipv6/tcpv6_offload.c b/net/ipv6/tcpv6_offload.c index 96253154db3a..c1ab77105b4c 100644 --- a/net/ipv6/tcpv6_offload.c +++ b/net/ipv6/tcpv6_offload.c @@ -15,11 +15,6 @@ #include #include "ip6_offload.h" -static int tcp_v6_gso_send_check(struct sk_buff *skb) -{ - return 0; -} - static struct sk_buff **tcp6_gro_receive(struct sk_buff **head, struct sk_buff *skb) { @@ -71,7 +66,6 @@ struct sk_buff *tcp6_gso_segment(struct sk_buff *skb, } static const struct net_offload tcpv6_offload = { .callbacks = { - .gso_send_check = tcp_v6_gso_send_check, .gso_segment = tcp6_gso_segment, .gro_receive = tcp6_gro_receive, .gro_complete = tcp6_gro_complete, diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index e4af6437ea3b..212ebfc7973f 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -17,11 +17,6 @@ #include #include "ip6_offload.h" -static int udp6_ufo_send_check(struct sk_buff *skb) -{ - return 0; -} - static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, netdev_features_t features) { @@ -166,7 +161,6 @@ static int 
udp6_gro_complete(struct sk_buff *skb, int nhoff) static const struct net_offload udpv6_offload = { .callbacks = { - .gso_send_check = udp6_ufo_send_check, .gso_segment = udp6_ufo_fragment, .gro_receive = udp6_gro_receive, .gro_complete = udp6_gro_complete, diff --git a/net/mpls/mpls_gso.c b/net/mpls/mpls_gso.c index 6b38d083e1c9..e28ed2ef5b06 100644 --- a/net/mpls/mpls_gso.c +++ b/net/mpls/mpls_gso.c @@ -65,15 +65,9 @@ out: return segs; } -static int mpls_gso_send_check(struct sk_buff *skb) -{ - return 0; -} - static struct packet_offload mpls_mc_offload = { .type = cpu_to_be16(ETH_P_MPLS_MC), .callbacks = { - .gso_send_check = mpls_gso_send_check, .gso_segment = mpls_gso_segment, }, }; @@ -81,7 +75,6 @@ static struct packet_offload mpls_mc_offload = { static struct packet_offload mpls_uc_offload = { .type = cpu_to_be16(ETH_P_MPLS_UC), .callbacks = { - .gso_send_check = mpls_gso_send_check, .gso_segment = mpls_gso_segment, }, }; -- cgit v1.2.3 From 6ea754eb761d9e7a8ac6fa462b05f9e4cf04fb6c Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 22 Sep 2014 11:10:50 -0700 Subject: net: Change netdev_ logging functions to return void No caller or macro uses the return value so make all the functions return void. Signed-off-by: Joe Perches Signed-off-by: David S. Miller --- include/linux/netdevice.h | 19 +++++++++---------- net/core/dev.c | 44 +++++++++++++++++--------------------------- 2 files changed, 26 insertions(+), 37 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9f5d293a0281..9b7fbacb6296 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3612,22 +3612,22 @@ static inline const char *netdev_reg_state(const struct net_device *dev) } __printf(3, 4) -int netdev_printk(const char *level, const struct net_device *dev, - const char *format, ...); +void netdev_printk(const char *level, const struct net_device *dev, + const char *format, ...); __printf(2, 3) -int netdev_emerg(const struct net_device *dev, const char *format, ...); +void netdev_emerg(const struct net_device *dev, const char *format, ...); __printf(2, 3) -int netdev_alert(const struct net_device *dev, const char *format, ...); +void netdev_alert(const struct net_device *dev, const char *format, ...); __printf(2, 3) -int netdev_crit(const struct net_device *dev, const char *format, ...); +void netdev_crit(const struct net_device *dev, const char *format, ...); __printf(2, 3) -int netdev_err(const struct net_device *dev, const char *format, ...); +void netdev_err(const struct net_device *dev, const char *format, ...); __printf(2, 3) -int netdev_warn(const struct net_device *dev, const char *format, ...); +void netdev_warn(const struct net_device *dev, const char *format, ...); __printf(2, 3) -int netdev_notice(const struct net_device *dev, const char *format, ...); +void netdev_notice(const struct net_device *dev, const char *format, ...); __printf(2, 3) -int netdev_info(const struct net_device *dev, const char *format, ...); +void netdev_info(const struct net_device *dev, const char *format, ...); #define MODULE_ALIAS_NETDEV(device) \ MODULE_ALIAS("netdev-" device) @@ -3645,7 +3645,6 @@ do { \ ({ \ if (0) \ netdev_printk(KERN_DEBUG, __dev, format, ##args); \ - 0; \ }) #endif diff --git a/net/core/dev.c b/net/core/dev.c index e2ced01c01e4..e55c546717d4 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7066,53 +7066,45 @@ const char *netdev_drivername(const struct net_device *dev) return empty; } -static int __netdev_printk(const char 
*level, const struct net_device *dev, - struct va_format *vaf) +static void __netdev_printk(const char *level, const struct net_device *dev, + struct va_format *vaf) { - int r; - if (dev && dev->dev.parent) { - r = dev_printk_emit(level[1] - '0', - dev->dev.parent, - "%s %s %s%s: %pV", - dev_driver_string(dev->dev.parent), - dev_name(dev->dev.parent), - netdev_name(dev), netdev_reg_state(dev), - vaf); + dev_printk_emit(level[1] - '0', + dev->dev.parent, + "%s %s %s%s: %pV", + dev_driver_string(dev->dev.parent), + dev_name(dev->dev.parent), + netdev_name(dev), netdev_reg_state(dev), + vaf); } else if (dev) { - r = printk("%s%s%s: %pV", level, netdev_name(dev), - netdev_reg_state(dev), vaf); + printk("%s%s%s: %pV", + level, netdev_name(dev), netdev_reg_state(dev), vaf); } else { - r = printk("%s(NULL net_device): %pV", level, vaf); + printk("%s(NULL net_device): %pV", level, vaf); } - - return r; } -int netdev_printk(const char *level, const struct net_device *dev, - const char *format, ...) +void netdev_printk(const char *level, const struct net_device *dev, + const char *format, ...) { struct va_format vaf; va_list args; - int r; va_start(args, format); vaf.fmt = format; vaf.va = &args; - r = __netdev_printk(level, dev, &vaf); + __netdev_printk(level, dev, &vaf); va_end(args); - - return r; } EXPORT_SYMBOL(netdev_printk); #define define_netdev_printk_level(func, level) \ -int func(const struct net_device *dev, const char *fmt, ...) \ +void func(const struct net_device *dev, const char *fmt, ...) \ { \ - int r; \ struct va_format vaf; \ va_list args; \ \ @@ -7121,11 +7113,9 @@ int func(const struct net_device *dev, const char *fmt, ...) \ vaf.fmt = fmt; \ vaf.va = &args; \ \ - r = __netdev_printk(level, dev, &vaf); \ + __netdev_printk(level, dev, &vaf); \ \ va_end(args); \ - \ - return r; \ } \ EXPORT_SYMBOL(func); -- cgit v1.2.3 From 7bced397510ab569d31de4c70b39e13355046387 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 30 Dec 2013 12:37:29 -0800 Subject: net_dma: simple removal Per commit "77873803363c net_dma: mark broken" net_dma is no longer used and there is no plan to fix it. This is the mechanical removal of bits in CONFIG_NET_DMA ifdef guards. Reverting the remainder of the net_dma induced changes is deferred to subsequent patches. Marked for stable due to Roman's report of a memory leak in dma_pin_iovec_pages(): https://lkml.org/lkml/2014/9/3/177 Cc: Dave Jiang Cc: Vinod Koul Cc: David Whipple Cc: Alexander Duyck Cc: Reported-by: Roman Gushchin Acked-by: David S. 
Miller Signed-off-by: Dan Williams --- Documentation/ABI/removed/net_dma | 8 + Documentation/networking/ip-sysctl.txt | 6 - drivers/dma/Kconfig | 12 -- drivers/dma/Makefile | 1 - drivers/dma/dmaengine.c | 104 ------------ drivers/dma/ioat/dma.c | 1 - drivers/dma/ioat/dma.h | 7 - drivers/dma/ioat/dma_v2.c | 1 - drivers/dma/ioat/dma_v3.c | 1 - drivers/dma/iovlock.c | 280 --------------------------------- include/linux/dmaengine.h | 22 +-- include/linux/skbuff.h | 8 +- include/linux/tcp.h | 8 - include/net/netdma.h | 32 ---- include/net/sock.h | 19 +-- include/net/tcp.h | 8 - kernel/sysctl_binary.c | 1 - net/core/Makefile | 1 - net/core/dev.c | 10 -- net/core/sock.c | 6 - net/core/user_dma.c | 131 --------------- net/dccp/proto.c | 4 +- net/ipv4/sysctl_net_ipv4.c | 9 -- net/ipv4/tcp.c | 147 ++--------------- net/ipv4/tcp_input.c | 61 ------- net/ipv4/tcp_ipv4.c | 18 +-- net/ipv6/tcp_ipv6.c | 13 +- net/llc/af_llc.c | 10 +- 28 files changed, 35 insertions(+), 894 deletions(-) create mode 100644 Documentation/ABI/removed/net_dma delete mode 100644 drivers/dma/iovlock.c delete mode 100644 include/net/netdma.h delete mode 100644 net/core/user_dma.c (limited to 'net/core/dev.c') diff --git a/Documentation/ABI/removed/net_dma b/Documentation/ABI/removed/net_dma new file mode 100644 index 000000000000..a173aecc2f18 --- /dev/null +++ b/Documentation/ABI/removed/net_dma @@ -0,0 +1,8 @@ +What: tcp_dma_copybreak sysctl +Date: Removed in kernel v3.13 +Contact: Dan Williams +Description: + Formerly the lower limit, in bytes, of the size of socket reads + that will be offloaded to a DMA copy engine. Removed due to + coherency issues of the cpu potentially touching the buffers + while dma is in flight. diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index ab42c95f9985..ea8f3b182e70 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -582,12 +582,6 @@ tcp_workaround_signed_windows - BOOLEAN not receive a window scaling option from them. Default: 0 -tcp_dma_copybreak - INTEGER - Lower limit, in bytes, of the size of socket reads that will be - offloaded to a DMA copy engine, if one is present in the system - and CONFIG_NET_DMA is enabled. - Default: 4096 - tcp_thin_linear_timeouts - BOOLEAN Enable dynamic triggering of linear timeouts for thin streams. If set, a check is performed upon retransmission by timeout to diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig index 605b016bcea4..6b5f37e01a70 100644 --- a/drivers/dma/Kconfig +++ b/drivers/dma/Kconfig @@ -368,18 +368,6 @@ config DMA_OF comment "DMA Clients" depends on DMA_ENGINE -config NET_DMA - bool "Network: TCP receive copy offload" - depends on DMA_ENGINE && NET - default (INTEL_IOATDMA || FSL_DMA) - depends on BROKEN - help - This enables the use of DMA engines in the network stack to - offload receive copy-to-user operations, freeing CPU cycles. - - Say Y here if you enabled INTEL_IOATDMA or FSL_DMA, otherwise - say N. 
- config ASYNC_TX_DMA bool "Async_tx: Offload support for the async_tx api" depends on DMA_ENGINE diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile index a029d0f4a1be..0c9dc7549327 100644 --- a/drivers/dma/Makefile +++ b/drivers/dma/Makefile @@ -6,7 +6,6 @@ obj-$(CONFIG_DMA_VIRTUAL_CHANNELS) += virt-dma.o obj-$(CONFIG_DMA_ACPI) += acpi-dma.o obj-$(CONFIG_DMA_OF) += of-dma.o -obj-$(CONFIG_NET_DMA) += iovlock.o obj-$(CONFIG_INTEL_MID_DMAC) += intel_mid_dma.o obj-$(CONFIG_DMATEST) += dmatest.o obj-$(CONFIG_INTEL_IOATDMA) += ioat/ diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index ed610b497518..268de183b519 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -1084,110 +1084,6 @@ dmaengine_get_unmap_data(struct device *dev, int nr, gfp_t flags) } EXPORT_SYMBOL(dmaengine_get_unmap_data); -/** - * dma_async_memcpy_pg_to_pg - offloaded copy from page to page - * @chan: DMA channel to offload copy to - * @dest_pg: destination page - * @dest_off: offset in page to copy to - * @src_pg: source page - * @src_off: offset in page to copy from - * @len: length - * - * Both @dest_page/@dest_off and @src_page/@src_off must be mappable to a bus - * address according to the DMA mapping API rules for streaming mappings. - * Both @dest_page/@dest_off and @src_page/@src_off must stay memory resident - * (kernel memory or locked user space pages). - */ -dma_cookie_t -dma_async_memcpy_pg_to_pg(struct dma_chan *chan, struct page *dest_pg, - unsigned int dest_off, struct page *src_pg, unsigned int src_off, - size_t len) -{ - struct dma_device *dev = chan->device; - struct dma_async_tx_descriptor *tx; - struct dmaengine_unmap_data *unmap; - dma_cookie_t cookie; - unsigned long flags; - - unmap = dmaengine_get_unmap_data(dev->dev, 2, GFP_NOWAIT); - if (!unmap) - return -ENOMEM; - - unmap->to_cnt = 1; - unmap->from_cnt = 1; - unmap->addr[0] = dma_map_page(dev->dev, src_pg, src_off, len, - DMA_TO_DEVICE); - unmap->addr[1] = dma_map_page(dev->dev, dest_pg, dest_off, len, - DMA_FROM_DEVICE); - unmap->len = len; - flags = DMA_CTRL_ACK; - tx = dev->device_prep_dma_memcpy(chan, unmap->addr[1], unmap->addr[0], - len, flags); - - if (!tx) { - dmaengine_unmap_put(unmap); - return -ENOMEM; - } - - dma_set_unmap(tx, unmap); - cookie = tx->tx_submit(tx); - dmaengine_unmap_put(unmap); - - preempt_disable(); - __this_cpu_add(chan->local->bytes_transferred, len); - __this_cpu_inc(chan->local->memcpy_count); - preempt_enable(); - - return cookie; -} -EXPORT_SYMBOL(dma_async_memcpy_pg_to_pg); - -/** - * dma_async_memcpy_buf_to_buf - offloaded copy between virtual addresses - * @chan: DMA channel to offload copy to - * @dest: destination address (virtual) - * @src: source address (virtual) - * @len: length - * - * Both @dest and @src must be mappable to a bus address according to the - * DMA mapping API rules for streaming mappings. - * Both @dest and @src must stay memory resident (kernel memory or locked - * user space pages). 
- */ -dma_cookie_t -dma_async_memcpy_buf_to_buf(struct dma_chan *chan, void *dest, - void *src, size_t len) -{ - return dma_async_memcpy_pg_to_pg(chan, virt_to_page(dest), - (unsigned long) dest & ~PAGE_MASK, - virt_to_page(src), - (unsigned long) src & ~PAGE_MASK, len); -} -EXPORT_SYMBOL(dma_async_memcpy_buf_to_buf); - -/** - * dma_async_memcpy_buf_to_pg - offloaded copy from address to page - * @chan: DMA channel to offload copy to - * @page: destination page - * @offset: offset in page to copy to - * @kdata: source address (virtual) - * @len: length - * - * Both @page/@offset and @kdata must be mappable to a bus address according - * to the DMA mapping API rules for streaming mappings. - * Both @page/@offset and @kdata must stay memory resident (kernel memory or - * locked user space pages) - */ -dma_cookie_t -dma_async_memcpy_buf_to_pg(struct dma_chan *chan, struct page *page, - unsigned int offset, void *kdata, size_t len) -{ - return dma_async_memcpy_pg_to_pg(chan, page, offset, - virt_to_page(kdata), - (unsigned long) kdata & ~PAGE_MASK, len); -} -EXPORT_SYMBOL(dma_async_memcpy_buf_to_pg); - void dma_async_tx_descriptor_init(struct dma_async_tx_descriptor *tx, struct dma_chan *chan) { diff --git a/drivers/dma/ioat/dma.c b/drivers/dma/ioat/dma.c index b76c1485933b..940c1502a8b5 100644 --- a/drivers/dma/ioat/dma.c +++ b/drivers/dma/ioat/dma.c @@ -1222,7 +1222,6 @@ int ioat1_dma_probe(struct ioatdma_device *device, int dca) err = ioat_probe(device); if (err) return err; - ioat_set_tcp_copy_break(4096); err = ioat_register(device); if (err) return err; diff --git a/drivers/dma/ioat/dma.h b/drivers/dma/ioat/dma.h index e982f00a9843..d63f68b1aa35 100644 --- a/drivers/dma/ioat/dma.h +++ b/drivers/dma/ioat/dma.h @@ -214,13 +214,6 @@ __dump_desc_dbg(struct ioat_chan_common *chan, struct ioat_dma_descriptor *hw, #define dump_desc_dbg(c, d) \ ({ if (d) __dump_desc_dbg(&c->base, d->hw, &d->txd, desc_id(d)); 0; }) -static inline void ioat_set_tcp_copy_break(unsigned long copybreak) -{ - #ifdef CONFIG_NET_DMA - sysctl_tcp_dma_copybreak = copybreak; - #endif -} - static inline struct ioat_chan_common * ioat_chan_by_index(struct ioatdma_device *device, int index) { diff --git a/drivers/dma/ioat/dma_v2.c b/drivers/dma/ioat/dma_v2.c index 2ce9be498608..695483e6be32 100644 --- a/drivers/dma/ioat/dma_v2.c +++ b/drivers/dma/ioat/dma_v2.c @@ -900,7 +900,6 @@ int ioat2_dma_probe(struct ioatdma_device *device, int dca) err = ioat_probe(device); if (err) return err; - ioat_set_tcp_copy_break(2048); list_for_each_entry(c, &dma->channels, device_node) { chan = to_chan_common(c); diff --git a/drivers/dma/ioat/dma_v3.c b/drivers/dma/ioat/dma_v3.c index 85971d6e9646..895f869d6c2c 100644 --- a/drivers/dma/ioat/dma_v3.c +++ b/drivers/dma/ioat/dma_v3.c @@ -1655,7 +1655,6 @@ int ioat3_dma_probe(struct ioatdma_device *device, int dca) err = ioat_probe(device); if (err) return err; - ioat_set_tcp_copy_break(262144); list_for_each_entry(c, &dma->channels, device_node) { chan = to_chan_common(c); diff --git a/drivers/dma/iovlock.c b/drivers/dma/iovlock.c deleted file mode 100644 index bb48a57c2fc1..000000000000 --- a/drivers/dma/iovlock.c +++ /dev/null @@ -1,280 +0,0 @@ -/* - * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved. - * Portions based on net/core/datagram.c and copyrighted by their authors. 
- * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 - * Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * The full GNU General Public License is included in this distribution in the - * file called COPYING. - */ - -/* - * This code allows the net stack to make use of a DMA engine for - * skb to iovec copies. - */ - -#include -#include -#include -#include /* for memcpy_toiovec */ -#include -#include - -static int num_pages_spanned(struct iovec *iov) -{ - return - ((PAGE_ALIGN((unsigned long)iov->iov_base + iov->iov_len) - - ((unsigned long)iov->iov_base & PAGE_MASK)) >> PAGE_SHIFT); -} - -/* - * Pin down all the iovec pages needed for len bytes. - * Return a struct dma_pinned_list to keep track of pages pinned down. - * - * We are allocating a single chunk of memory, and then carving it up into - * 3 sections, the latter 2 whose size depends on the number of iovecs and the - * total number of pages, respectively. - */ -struct dma_pinned_list *dma_pin_iovec_pages(struct iovec *iov, size_t len) -{ - struct dma_pinned_list *local_list; - struct page **pages; - int i; - int ret; - int nr_iovecs = 0; - int iovec_len_used = 0; - int iovec_pages_used = 0; - - /* don't pin down non-user-based iovecs */ - if (segment_eq(get_fs(), KERNEL_DS)) - return NULL; - - /* determine how many iovecs/pages there are, up front */ - do { - iovec_len_used += iov[nr_iovecs].iov_len; - iovec_pages_used += num_pages_spanned(&iov[nr_iovecs]); - nr_iovecs++; - } while (iovec_len_used < len); - - /* single kmalloc for pinned list, page_list[], and the page arrays */ - local_list = kmalloc(sizeof(*local_list) - + (nr_iovecs * sizeof (struct dma_page_list)) - + (iovec_pages_used * sizeof (struct page*)), GFP_KERNEL); - if (!local_list) - goto out; - - /* list of pages starts right after the page list array */ - pages = (struct page **) &local_list->page_list[nr_iovecs]; - - local_list->nr_iovecs = 0; - - for (i = 0; i < nr_iovecs; i++) { - struct dma_page_list *page_list = &local_list->page_list[i]; - - len -= iov[i].iov_len; - - if (!access_ok(VERIFY_WRITE, iov[i].iov_base, iov[i].iov_len)) - goto unpin; - - page_list->nr_pages = num_pages_spanned(&iov[i]); - page_list->base_address = iov[i].iov_base; - - page_list->pages = pages; - pages += page_list->nr_pages; - - /* pin pages down */ - down_read(¤t->mm->mmap_sem); - ret = get_user_pages( - current, - current->mm, - (unsigned long) iov[i].iov_base, - page_list->nr_pages, - 1, /* write */ - 0, /* force */ - page_list->pages, - NULL); - up_read(¤t->mm->mmap_sem); - - if (ret != page_list->nr_pages) - goto unpin; - - local_list->nr_iovecs = i + 1; - } - - return local_list; - -unpin: - dma_unpin_iovec_pages(local_list); -out: - return NULL; -} - -void dma_unpin_iovec_pages(struct dma_pinned_list *pinned_list) -{ - int i, j; - - if (!pinned_list) - return; - - for (i = 0; i < pinned_list->nr_iovecs; i++) { - struct dma_page_list *page_list = 
&pinned_list->page_list[i]; - for (j = 0; j < page_list->nr_pages; j++) { - set_page_dirty_lock(page_list->pages[j]); - page_cache_release(page_list->pages[j]); - } - } - - kfree(pinned_list); -} - - -/* - * We have already pinned down the pages we will be using in the iovecs. - * Each entry in iov array has corresponding entry in pinned_list->page_list. - * Using array indexing to keep iov[] and page_list[] in sync. - * Initial elements in iov array's iov->iov_len will be 0 if already copied into - * by another call. - * iov array length remaining guaranteed to be bigger than len. - */ -dma_cookie_t dma_memcpy_to_iovec(struct dma_chan *chan, struct iovec *iov, - struct dma_pinned_list *pinned_list, unsigned char *kdata, size_t len) -{ - int iov_byte_offset; - int copy; - dma_cookie_t dma_cookie = 0; - int iovec_idx; - int page_idx; - - if (!chan) - return memcpy_toiovec(iov, kdata, len); - - iovec_idx = 0; - while (iovec_idx < pinned_list->nr_iovecs) { - struct dma_page_list *page_list; - - /* skip already used-up iovecs */ - while (!iov[iovec_idx].iov_len) - iovec_idx++; - - page_list = &pinned_list->page_list[iovec_idx]; - - iov_byte_offset = ((unsigned long)iov[iovec_idx].iov_base & ~PAGE_MASK); - page_idx = (((unsigned long)iov[iovec_idx].iov_base & PAGE_MASK) - - ((unsigned long)page_list->base_address & PAGE_MASK)) >> PAGE_SHIFT; - - /* break up copies to not cross page boundary */ - while (iov[iovec_idx].iov_len) { - copy = min_t(int, PAGE_SIZE - iov_byte_offset, len); - copy = min_t(int, copy, iov[iovec_idx].iov_len); - - dma_cookie = dma_async_memcpy_buf_to_pg(chan, - page_list->pages[page_idx], - iov_byte_offset, - kdata, - copy); - /* poll for a descriptor slot */ - if (unlikely(dma_cookie < 0)) { - dma_async_issue_pending(chan); - continue; - } - - len -= copy; - iov[iovec_idx].iov_len -= copy; - iov[iovec_idx].iov_base += copy; - - if (!len) - return dma_cookie; - - kdata += copy; - iov_byte_offset = 0; - page_idx++; - } - iovec_idx++; - } - - /* really bad if we ever run out of iovecs */ - BUG(); - return -EFAULT; -} - -dma_cookie_t dma_memcpy_pg_to_iovec(struct dma_chan *chan, struct iovec *iov, - struct dma_pinned_list *pinned_list, struct page *page, - unsigned int offset, size_t len) -{ - int iov_byte_offset; - int copy; - dma_cookie_t dma_cookie = 0; - int iovec_idx; - int page_idx; - int err; - - /* this needs as-yet-unimplemented buf-to-buff, so punt. 
*/ - /* TODO: use dma for this */ - if (!chan || !pinned_list) { - u8 *vaddr = kmap(page); - err = memcpy_toiovec(iov, vaddr + offset, len); - kunmap(page); - return err; - } - - iovec_idx = 0; - while (iovec_idx < pinned_list->nr_iovecs) { - struct dma_page_list *page_list; - - /* skip already used-up iovecs */ - while (!iov[iovec_idx].iov_len) - iovec_idx++; - - page_list = &pinned_list->page_list[iovec_idx]; - - iov_byte_offset = ((unsigned long)iov[iovec_idx].iov_base & ~PAGE_MASK); - page_idx = (((unsigned long)iov[iovec_idx].iov_base & PAGE_MASK) - - ((unsigned long)page_list->base_address & PAGE_MASK)) >> PAGE_SHIFT; - - /* break up copies to not cross page boundary */ - while (iov[iovec_idx].iov_len) { - copy = min_t(int, PAGE_SIZE - iov_byte_offset, len); - copy = min_t(int, copy, iov[iovec_idx].iov_len); - - dma_cookie = dma_async_memcpy_pg_to_pg(chan, - page_list->pages[page_idx], - iov_byte_offset, - page, - offset, - copy); - /* poll for a descriptor slot */ - if (unlikely(dma_cookie < 0)) { - dma_async_issue_pending(chan); - continue; - } - - len -= copy; - iov[iovec_idx].iov_len -= copy; - iov[iovec_idx].iov_base += copy; - - if (!len) - return dma_cookie; - - offset += copy; - iov_byte_offset = 0; - page_idx++; - } - iovec_idx++; - } - - /* really bad if we ever run out of iovecs */ - BUG(); - return -EFAULT; -} diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index c5c92d59e531..3e382ecd1927 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -903,18 +903,6 @@ static inline void dmaengine_put(void) } #endif -#ifdef CONFIG_NET_DMA -#define net_dmaengine_get() dmaengine_get() -#define net_dmaengine_put() dmaengine_put() -#else -static inline void net_dmaengine_get(void) -{ -} -static inline void net_dmaengine_put(void) -{ -} -#endif - #ifdef CONFIG_ASYNC_TX_DMA #define async_dmaengine_get() dmaengine_get() #define async_dmaengine_put() dmaengine_put() @@ -936,16 +924,8 @@ async_dma_find_channel(enum dma_transaction_type type) return NULL; } #endif /* CONFIG_ASYNC_TX_DMA */ - -dma_cookie_t dma_async_memcpy_buf_to_buf(struct dma_chan *chan, - void *dest, void *src, size_t len); -dma_cookie_t dma_async_memcpy_buf_to_pg(struct dma_chan *chan, - struct page *page, unsigned int offset, void *kdata, size_t len); -dma_cookie_t dma_async_memcpy_pg_to_pg(struct dma_chan *chan, - struct page *dest_pg, unsigned int dest_off, struct page *src_pg, - unsigned int src_off, size_t len); void dma_async_tx_descriptor_init(struct dma_async_tx_descriptor *tx, - struct dma_chan *chan); + struct dma_chan *chan); static inline void async_tx_ack(struct dma_async_tx_descriptor *tx) { diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 5e1e6f2d98c2..bdbf7afad6b7 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -28,7 +28,6 @@ #include #include #include -#include #include #include #include @@ -515,11 +514,8 @@ struct sk_buff { /* 6/8 bit hole (depending on ndisc_nodetype presence) */ kmemcheck_bitfield_end(flags2); -#if defined CONFIG_NET_DMA || defined CONFIG_NET_RX_BUSY_POLL - union { - unsigned int napi_id; - dma_cookie_t dma_cookie; - }; +#ifdef CONFIG_NET_RX_BUSY_POLL + unsigned int napi_id; #endif #ifdef CONFIG_NETWORK_SECMARK __u32 secmark; diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 4ad0706d40eb..90895b8dc7f2 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -19,7 +19,6 @@ #include -#include #include #include #include @@ -169,13 +168,6 @@ struct tcp_sock { struct iovec *iov; int memory; int 
len; -#ifdef CONFIG_NET_DMA - /* members for async copy */ - struct dma_chan *dma_chan; - int wakeup; - struct dma_pinned_list *pinned_list; - dma_cookie_t dma_cookie; -#endif } ucopy; u32 snd_wl1; /* Sequence for window update */ diff --git a/include/net/netdma.h b/include/net/netdma.h deleted file mode 100644 index 8ba8ce284eeb..000000000000 --- a/include/net/netdma.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 - * Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * The full GNU General Public License is included in this distribution in the - * file called COPYING. - */ -#ifndef NETDMA_H -#define NETDMA_H -#ifdef CONFIG_NET_DMA -#include -#include - -int dma_skb_copy_datagram_iovec(struct dma_chan* chan, - struct sk_buff *skb, int offset, struct iovec *to, - size_t len, struct dma_pinned_list *pinned_list); - -#endif /* CONFIG_NET_DMA */ -#endif /* NETDMA_H */ diff --git a/include/net/sock.h b/include/net/sock.h index b9586a137cad..3353b47f3d40 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -231,7 +231,6 @@ struct cg_proto; * @sk_receive_queue: incoming packets * @sk_wmem_alloc: transmit queue bytes committed * @sk_write_queue: Packet sending queue - * @sk_async_wait_queue: DMA copied packets * @sk_omem_alloc: "o" is "option" or "other" * @sk_wmem_queued: persistent queue size * @sk_forward_alloc: space allocated forward @@ -354,10 +353,6 @@ struct sock { struct sk_filter __rcu *sk_filter; struct socket_wq __rcu *sk_wq; -#ifdef CONFIG_NET_DMA - struct sk_buff_head sk_async_wait_queue; -#endif - #ifdef CONFIG_XFRM struct xfrm_policy *sk_policy[2]; #endif @@ -2214,27 +2209,15 @@ void sock_tx_timestamp(struct sock *sk, __u8 *tx_flags); * sk_eat_skb - Release a skb if it is no longer needed * @sk: socket to eat this skb from * @skb: socket buffer to eat - * @copied_early: flag indicating whether DMA operations copied this data early * * This routine must be called with interrupts disabled or with the socket * locked so that the sk_buff queue operation is ok. 
*/ -#ifdef CONFIG_NET_DMA -static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb, bool copied_early) -{ - __skb_unlink(skb, &sk->sk_receive_queue); - if (!copied_early) - __kfree_skb(skb); - else - __skb_queue_tail(&sk->sk_async_wait_queue, skb); -} -#else -static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb, bool copied_early) +static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb) { __skb_unlink(skb, &sk->sk_receive_queue); __kfree_skb(skb); } -#endif static inline struct net *sock_net(const struct sock *sk) diff --git a/include/net/tcp.h b/include/net/tcp.h index 8c4dd63134d4..2c2f24ffa383 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -27,7 +27,6 @@ #include #include #include -#include #include #include #include @@ -267,7 +266,6 @@ extern int sysctl_tcp_adv_win_scale; extern int sysctl_tcp_tw_reuse; extern int sysctl_tcp_frto; extern int sysctl_tcp_low_latency; -extern int sysctl_tcp_dma_copybreak; extern int sysctl_tcp_nometrics_save; extern int sysctl_tcp_moderate_rcvbuf; extern int sysctl_tcp_tso_win_divisor; @@ -1023,12 +1021,6 @@ static inline void tcp_prequeue_init(struct tcp_sock *tp) tp->ucopy.len = 0; tp->ucopy.memory = 0; skb_queue_head_init(&tp->ucopy.prequeue); -#ifdef CONFIG_NET_DMA - tp->ucopy.dma_chan = NULL; - tp->ucopy.wakeup = 0; - tp->ucopy.pinned_list = NULL; - tp->ucopy.dma_cookie = 0; -#endif } bool tcp_prequeue(struct sock *sk, struct sk_buff *skb); diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 653cbbd9e7ad..d457005acedf 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -390,7 +390,6 @@ static const struct bin_table bin_net_ipv4_table[] = { { CTL_INT, NET_TCP_MTU_PROBING, "tcp_mtu_probing" }, { CTL_INT, NET_TCP_BASE_MSS, "tcp_base_mss" }, { CTL_INT, NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" }, - { CTL_INT, NET_TCP_DMA_COPYBREAK, "tcp_dma_copybreak" }, { CTL_INT, NET_TCP_SLOW_START_AFTER_IDLE, "tcp_slow_start_after_idle" }, { CTL_INT, NET_CIPSOV4_CACHE_ENABLE, "cipso_cache_enable" }, { CTL_INT, NET_CIPSOV4_CACHE_BUCKET_SIZE, "cipso_cache_bucket_size" }, diff --git a/net/core/Makefile b/net/core/Makefile index 9628c20acff6..5038f1ea0349 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -16,7 +16,6 @@ obj-y += net-sysfs.o obj-$(CONFIG_PROC_FS) += net-procfs.o obj-$(CONFIG_NET_PKTGEN) += pktgen.o obj-$(CONFIG_NETPOLL) += netpoll.o -obj-$(CONFIG_NET_DMA) += user_dma.o obj-$(CONFIG_FIB_RULES) += fib_rules.o obj-$(CONFIG_TRACEPOINTS) += net-traces.o obj-$(CONFIG_NET_DROP_MONITOR) += drop_monitor.o diff --git a/net/core/dev.c b/net/core/dev.c index b1b0c8d4d7df..5e37e9abe8c5 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1266,7 +1266,6 @@ static int __dev_open(struct net_device *dev) clear_bit(__LINK_STATE_START, &dev->state); else { dev->flags |= IFF_UP; - net_dmaengine_get(); dev_set_rx_mode(dev); dev_activate(dev); add_device_randomness(dev->dev_addr, dev->addr_len); @@ -1342,7 +1341,6 @@ static int __dev_close_many(struct list_head *head) ops->ndo_stop(dev); dev->flags &= ~IFF_UP; - net_dmaengine_put(); } return 0; @@ -4405,14 +4403,6 @@ static void net_rx_action(struct softirq_action *h) out: net_rps_action_and_irq_enable(sd); -#ifdef CONFIG_NET_DMA - /* - * There may not be any more sk_buffs coming right now, so push - * any pending DMA copies to hardware - */ - dma_issue_pending_all(); -#endif - return; softnet_break: diff --git a/net/core/sock.c b/net/core/sock.c index c0fc6bdad1e3..2f143c3b190a 100644 --- a/net/core/sock.c +++ 
b/net/core/sock.c @@ -1452,9 +1452,6 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) atomic_set(&newsk->sk_omem_alloc, 0); skb_queue_head_init(&newsk->sk_receive_queue); skb_queue_head_init(&newsk->sk_write_queue); -#ifdef CONFIG_NET_DMA - skb_queue_head_init(&newsk->sk_async_wait_queue); -#endif spin_lock_init(&newsk->sk_dst_lock); rwlock_init(&newsk->sk_callback_lock); @@ -2265,9 +2262,6 @@ void sock_init_data(struct socket *sock, struct sock *sk) skb_queue_head_init(&sk->sk_receive_queue); skb_queue_head_init(&sk->sk_write_queue); skb_queue_head_init(&sk->sk_error_queue); -#ifdef CONFIG_NET_DMA - skb_queue_head_init(&sk->sk_async_wait_queue); -#endif sk->sk_send_head = NULL; diff --git a/net/core/user_dma.c b/net/core/user_dma.c deleted file mode 100644 index 1b5fefdb8198..000000000000 --- a/net/core/user_dma.c +++ /dev/null @@ -1,131 +0,0 @@ -/* - * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved. - * Portions based on net/core/datagram.c and copyrighted by their authors. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 - * Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * The full GNU General Public License is included in this distribution in the - * file called COPYING. - */ - -/* - * This code allows the net stack to make use of a DMA engine for - * skb to iovec copies. - */ - -#include -#include -#include -#include -#include - -#define NET_DMA_DEFAULT_COPYBREAK 4096 - -int sysctl_tcp_dma_copybreak = NET_DMA_DEFAULT_COPYBREAK; -EXPORT_SYMBOL(sysctl_tcp_dma_copybreak); - -/** - * dma_skb_copy_datagram_iovec - Copy a datagram to an iovec. - * @skb - buffer to copy - * @offset - offset in the buffer to start copying from - * @iovec - io vector to copy to - * @len - amount of data to copy from buffer to iovec - * @pinned_list - locked iovec buffer data - * - * Note: the iovec is modified during the copy. - */ -int dma_skb_copy_datagram_iovec(struct dma_chan *chan, - struct sk_buff *skb, int offset, struct iovec *to, - size_t len, struct dma_pinned_list *pinned_list) -{ - int start = skb_headlen(skb); - int i, copy = start - offset; - struct sk_buff *frag_iter; - dma_cookie_t cookie = 0; - - /* Copy header. */ - if (copy > 0) { - if (copy > len) - copy = len; - cookie = dma_memcpy_to_iovec(chan, to, pinned_list, - skb->data + offset, copy); - if (cookie < 0) - goto fault; - len -= copy; - if (len == 0) - goto end; - offset += copy; - } - - /* Copy paged appendix. Hmm... why does this look so complicated? 
*/ - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { - int end; - const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; - - WARN_ON(start > offset + len); - - end = start + skb_frag_size(frag); - copy = end - offset; - if (copy > 0) { - struct page *page = skb_frag_page(frag); - - if (copy > len) - copy = len; - - cookie = dma_memcpy_pg_to_iovec(chan, to, pinned_list, page, - frag->page_offset + offset - start, copy); - if (cookie < 0) - goto fault; - len -= copy; - if (len == 0) - goto end; - offset += copy; - } - start = end; - } - - skb_walk_frags(skb, frag_iter) { - int end; - - WARN_ON(start > offset + len); - - end = start + frag_iter->len; - copy = end - offset; - if (copy > 0) { - if (copy > len) - copy = len; - cookie = dma_skb_copy_datagram_iovec(chan, frag_iter, - offset - start, - to, copy, - pinned_list); - if (cookie < 0) - goto fault; - len -= copy; - if (len == 0) - goto end; - offset += copy; - } - start = end; - } - -end: - if (!len) { - skb->dma_cookie = cookie; - return cookie; - } - -fault: - return -EFAULT; -} diff --git a/net/dccp/proto.c b/net/dccp/proto.c index eb892b4f4814..f9076f295b13 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -848,7 +848,7 @@ int dccp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, default: dccp_pr_debug("packet_type=%s\n", dccp_packet_name(dh->dccph_type)); - sk_eat_skb(sk, skb, false); + sk_eat_skb(sk, skb); } verify_sock_status: if (sock_flag(sk, SOCK_DONE)) { @@ -905,7 +905,7 @@ verify_sock_status: len = skb->len; found_fin_ok: if (!(flags & MSG_PEEK)) - sk_eat_skb(sk, skb, false); + sk_eat_skb(sk, skb); break; } while (1); out: diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 44eba052b43d..c3d2a48481f1 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -635,15 +635,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, -#ifdef CONFIG_NET_DMA - { - .procname = "tcp_dma_copybreak", - .data = &sysctl_tcp_dma_copybreak, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, -#endif { .procname = "tcp_slow_start_after_idle", .data = &sysctl_tcp_slow_start_after_idle, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 97c8f5620c43..28595a364f09 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -274,7 +274,6 @@ #include #include #include -#include #include #include @@ -1454,39 +1453,6 @@ static void tcp_prequeue_process(struct sock *sk) tp->ucopy.memory = 0; } -#ifdef CONFIG_NET_DMA -static void tcp_service_net_dma(struct sock *sk, bool wait) -{ - dma_cookie_t done, used; - dma_cookie_t last_issued; - struct tcp_sock *tp = tcp_sk(sk); - - if (!tp->ucopy.dma_chan) - return; - - last_issued = tp->ucopy.dma_cookie; - dma_async_issue_pending(tp->ucopy.dma_chan); - - do { - if (dma_async_is_tx_complete(tp->ucopy.dma_chan, - last_issued, &done, - &used) == DMA_COMPLETE) { - /* Safe to free early-copied skbs now */ - __skb_queue_purge(&sk->sk_async_wait_queue); - break; - } else { - struct sk_buff *skb; - while ((skb = skb_peek(&sk->sk_async_wait_queue)) && - (dma_async_is_complete(skb->dma_cookie, done, - used) == DMA_COMPLETE)) { - __skb_dequeue(&sk->sk_async_wait_queue); - kfree_skb(skb); - } - } - } while (wait); -} -#endif - static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) { struct sk_buff *skb; @@ -1504,7 +1470,7 @@ static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) * splitted a fat GRO packet, while we released socket lock * in skb_splice_bits() */ - 
sk_eat_skb(sk, skb, false); + sk_eat_skb(sk, skb); } return NULL; } @@ -1570,11 +1536,11 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, continue; } if (tcp_hdr(skb)->fin) { - sk_eat_skb(sk, skb, false); + sk_eat_skb(sk, skb); ++seq; break; } - sk_eat_skb(sk, skb, false); + sk_eat_skb(sk, skb); if (!desc->count) break; tp->copied_seq = seq; @@ -1612,7 +1578,6 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, int target; /* Read at least this many bytes */ long timeo; struct task_struct *user_recv = NULL; - bool copied_early = false; struct sk_buff *skb; u32 urg_hole = 0; @@ -1655,28 +1620,6 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); -#ifdef CONFIG_NET_DMA - tp->ucopy.dma_chan = NULL; - preempt_disable(); - skb = skb_peek_tail(&sk->sk_receive_queue); - { - int available = 0; - - if (skb) - available = TCP_SKB_CB(skb)->seq + skb->len - (*seq); - if ((available < target) && - (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) && - !sysctl_tcp_low_latency && - net_dma_find_channel()) { - preempt_enable(); - tp->ucopy.pinned_list = - dma_pin_iovec_pages(msg->msg_iov, len); - } else { - preempt_enable(); - } - } -#endif - do { u32 offset; @@ -1807,16 +1750,6 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, /* __ Set realtime policy in scheduler __ */ } -#ifdef CONFIG_NET_DMA - if (tp->ucopy.dma_chan) { - if (tp->rcv_wnd == 0 && - !skb_queue_empty(&sk->sk_async_wait_queue)) { - tcp_service_net_dma(sk, true); - tcp_cleanup_rbuf(sk, copied); - } else - dma_async_issue_pending(tp->ucopy.dma_chan); - } -#endif if (copied >= target) { /* Do not sleep, just process backlog. */ release_sock(sk); @@ -1824,11 +1757,6 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, } else sk_wait_data(sk, &timeo); -#ifdef CONFIG_NET_DMA - tcp_service_net_dma(sk, false); /* Don't block */ - tp->ucopy.wakeup = 0; -#endif - if (user_recv) { int chunk; @@ -1886,43 +1814,13 @@ do_prequeue: } if (!(flags & MSG_TRUNC)) { -#ifdef CONFIG_NET_DMA - if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) - tp->ucopy.dma_chan = net_dma_find_channel(); - - if (tp->ucopy.dma_chan) { - tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec( - tp->ucopy.dma_chan, skb, offset, - msg->msg_iov, used, - tp->ucopy.pinned_list); - - if (tp->ucopy.dma_cookie < 0) { - - pr_alert("%s: dma_cookie < 0\n", - __func__); - - /* Exception. Bailout! */ - if (!copied) - copied = -EFAULT; - break; - } - - dma_async_issue_pending(tp->ucopy.dma_chan); - - if ((offset + used) == skb->len) - copied_early = true; - - } else -#endif - { - err = skb_copy_datagram_iovec(skb, offset, - msg->msg_iov, used); - if (err) { - /* Exception. Bailout! */ - if (!copied) - copied = -EFAULT; - break; - } + err = skb_copy_datagram_iovec(skb, offset, + msg->msg_iov, used); + if (err) { + /* Exception. Bailout! */ + if (!copied) + copied = -EFAULT; + break; } } @@ -1942,19 +1840,15 @@ skip_copy: if (tcp_hdr(skb)->fin) goto found_fin_ok; - if (!(flags & MSG_PEEK)) { - sk_eat_skb(sk, skb, copied_early); - copied_early = false; - } + if (!(flags & MSG_PEEK)) + sk_eat_skb(sk, skb); continue; found_fin_ok: /* Process the FIN. 
*/ ++*seq; - if (!(flags & MSG_PEEK)) { - sk_eat_skb(sk, skb, copied_early); - copied_early = false; - } + if (!(flags & MSG_PEEK)) + sk_eat_skb(sk, skb); break; } while (len > 0); @@ -1977,16 +1871,6 @@ skip_copy: tp->ucopy.len = 0; } -#ifdef CONFIG_NET_DMA - tcp_service_net_dma(sk, true); /* Wait for queue to drain */ - tp->ucopy.dma_chan = NULL; - - if (tp->ucopy.pinned_list) { - dma_unpin_iovec_pages(tp->ucopy.pinned_list); - tp->ucopy.pinned_list = NULL; - } -#endif - /* According to UNIX98, msg_name/msg_namelen are ignored * on connected socket. I was just happy when found this 8) --ANK */ @@ -2330,9 +2214,6 @@ int tcp_disconnect(struct sock *sk, int flags) __skb_queue_purge(&sk->sk_receive_queue); tcp_write_queue_purge(sk); __skb_queue_purge(&tp->out_of_order_queue); -#ifdef CONFIG_NET_DMA - __skb_queue_purge(&sk->sk_async_wait_queue); -#endif inet->inet_dport = 0; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index eeaac399420d..1342e9851f97 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -73,7 +73,6 @@ #include #include #include -#include int sysctl_tcp_timestamps __read_mostly = 1; int sysctl_tcp_window_scaling __read_mostly = 1; @@ -4970,53 +4969,6 @@ static inline bool tcp_checksum_complete_user(struct sock *sk, __tcp_checksum_complete_user(sk, skb); } -#ifdef CONFIG_NET_DMA -static bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, - int hlen) -{ - struct tcp_sock *tp = tcp_sk(sk); - int chunk = skb->len - hlen; - int dma_cookie; - bool copied_early = false; - - if (tp->ucopy.wakeup) - return false; - - if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) - tp->ucopy.dma_chan = net_dma_find_channel(); - - if (tp->ucopy.dma_chan && skb_csum_unnecessary(skb)) { - - dma_cookie = dma_skb_copy_datagram_iovec(tp->ucopy.dma_chan, - skb, hlen, - tp->ucopy.iov, chunk, - tp->ucopy.pinned_list); - - if (dma_cookie < 0) - goto out; - - tp->ucopy.dma_cookie = dma_cookie; - copied_early = true; - - tp->ucopy.len -= chunk; - tp->copied_seq += chunk; - tcp_rcv_space_adjust(sk); - - if ((tp->ucopy.len == 0) || - (tcp_flag_word(tcp_hdr(skb)) & TCP_FLAG_PSH) || - (atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1))) { - tp->ucopy.wakeup = 1; - sk->sk_data_ready(sk, 0); - } - } else if (chunk > 0) { - tp->ucopy.wakeup = 1; - sk->sk_data_ready(sk, 0); - } -out: - return copied_early; -} -#endif /* CONFIG_NET_DMA */ - /* Does PAWS and seqno based validation of an incoming segment, flags will * play significant role here. 
*/ @@ -5201,14 +5153,6 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, if (tp->copied_seq == tp->rcv_nxt && len - tcp_header_len <= tp->ucopy.len) { -#ifdef CONFIG_NET_DMA - if (tp->ucopy.task == current && - sock_owned_by_user(sk) && - tcp_dma_try_early_copy(sk, skb, tcp_header_len)) { - copied_early = 1; - eaten = 1; - } -#endif if (tp->ucopy.task == current && sock_owned_by_user(sk) && !copied_early) { __set_current_state(TASK_RUNNING); @@ -5274,11 +5218,6 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, if (!copied_early || tp->rcv_nxt != tp->rcv_wup) __tcp_ack_snd_check(sk, 0); no_ack: -#ifdef CONFIG_NET_DMA - if (copied_early) - __skb_queue_tail(&sk->sk_async_wait_queue, skb); - else -#endif if (eaten) kfree_skb_partial(skb, fragstolen); sk->sk_data_ready(sk, 0); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 3cf976510497..737c2e270ee3 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -72,7 +72,6 @@ #include #include #include -#include #include #include #include @@ -1999,18 +1998,8 @@ process: bh_lock_sock_nested(sk); ret = 0; if (!sock_owned_by_user(sk)) { -#ifdef CONFIG_NET_DMA - struct tcp_sock *tp = tcp_sk(sk); - if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) - tp->ucopy.dma_chan = net_dma_find_channel(); - if (tp->ucopy.dma_chan) + if (!tcp_prequeue(sk, skb)) ret = tcp_v4_do_rcv(sk, skb); - else -#endif - { - if (!tcp_prequeue(sk, skb)) - ret = tcp_v4_do_rcv(sk, skb); - } } else if (unlikely(sk_add_backlog(sk, skb, sk->sk_rcvbuf + sk->sk_sndbuf))) { bh_unlock_sock(sk); @@ -2169,11 +2158,6 @@ void tcp_v4_destroy_sock(struct sock *sk) } #endif -#ifdef CONFIG_NET_DMA - /* Cleans up our sk_async_wait_queue */ - __skb_queue_purge(&sk->sk_async_wait_queue); -#endif - /* Clean prequeue, it must be empty really */ __skb_queue_purge(&tp->ucopy.prequeue); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 889079b2ea85..cb21fccf2089 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -59,7 +59,6 @@ #include #include #include -#include #include #include #include @@ -1520,18 +1519,8 @@ process: bh_lock_sock_nested(sk); ret = 0; if (!sock_owned_by_user(sk)) { -#ifdef CONFIG_NET_DMA - struct tcp_sock *tp = tcp_sk(sk); - if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) - tp->ucopy.dma_chan = net_dma_find_channel(); - if (tp->ucopy.dma_chan) + if (!tcp_prequeue(sk, skb)) ret = tcp_v6_do_rcv(sk, skb); - else -#endif - { - if (!tcp_prequeue(sk, skb)) - ret = tcp_v6_do_rcv(sk, skb); - } } else if (unlikely(sk_add_backlog(sk, skb, sk->sk_rcvbuf + sk->sk_sndbuf))) { bh_unlock_sock(sk); diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 0080d2b0a8ae..bb9cbc17d926 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -839,7 +839,7 @@ static int llc_ui_recvmsg(struct kiocb *iocb, struct socket *sock, if (!(flags & MSG_PEEK)) { spin_lock_irqsave(&sk->sk_receive_queue.lock, cpu_flags); - sk_eat_skb(sk, skb, false); + sk_eat_skb(sk, skb); spin_unlock_irqrestore(&sk->sk_receive_queue.lock, cpu_flags); *seq = 0; } @@ -861,10 +861,10 @@ copy_uaddr: llc_cmsg_rcv(msg, skb); if (!(flags & MSG_PEEK)) { - spin_lock_irqsave(&sk->sk_receive_queue.lock, cpu_flags); - sk_eat_skb(sk, skb, false); - spin_unlock_irqrestore(&sk->sk_receive_queue.lock, cpu_flags); - *seq = 0; + spin_lock_irqsave(&sk->sk_receive_queue.lock, cpu_flags); + sk_eat_skb(sk, skb); + spin_unlock_irqrestore(&sk->sk_receive_queue.lock, cpu_flags); + *seq = 0; } goto out; -- cgit v1.2.3 From 55a93b3ea780908b7d1b3a8cf1976223a9268d78 Mon Sep 17 00:00:00 2001 
From: Eric Dumazet Date: Fri, 3 Oct 2014 15:31:07 -0700 Subject: qdisc: validate skb without holding lock Validation of skb can be pretty expensive : GSO segmentation and/or checksum computations. We can do this without holding qdisc lock, so that other cpus can queue additional packets. Trick is that requeued packets were already validated, so we carry a boolean so that sch_direct_xmit() can validate a fresh skb list, or directly use an old one. Tested on 40Gb NIC (8 TX queues) and 200 concurrent flows, 48 threads host. Turning TSO on or off had no effect on throughput, only few more cpu cycles. Lock contention on qdisc lock disappeared. Same if disabling TX checksum offload. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- include/net/pkt_sched.h | 2 +- net/core/dev.c | 29 +++++++++++++++++++--- net/sched/sch_generic.c | 61 ++++++++++++++++++++++------------------------- 4 files changed, 56 insertions(+), 38 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9b7fbacb6296..910fb17ad148 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2821,7 +2821,7 @@ int dev_set_mac_address(struct net_device *, struct sockaddr *); int dev_change_carrier(struct net_device *, bool new_carrier); int dev_get_phys_port_id(struct net_device *dev, struct netdev_phys_port_id *ppid); -struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev); +struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev); struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq, int *ret); int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb); diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index 8bbe626e9ece..e4b3c828c1c2 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -99,7 +99,7 @@ void qdisc_put_stab(struct qdisc_size_table *tab); void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc); int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, struct net_device *dev, struct netdev_queue *txq, - spinlock_t *root_lock); + spinlock_t *root_lock, bool validate); void __qdisc_run(struct Qdisc *q); diff --git a/net/core/dev.c b/net/core/dev.c index e55c546717d4..1a90530f83ff 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2655,7 +2655,7 @@ struct sk_buff *validate_xmit_vlan(struct sk_buff *skb, netdev_features_t featur return skb; } -struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev) +static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev) { netdev_features_t features; @@ -2720,6 +2720,30 @@ out_null: return NULL; } +struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev) +{ + struct sk_buff *next, *head = NULL, *tail; + + while (skb) { + next = skb->next; + skb->next = NULL; + skb = validate_xmit_skb(skb, dev); + if (skb) { + struct sk_buff *end = skb; + + while (end->next) + end = end->next; + if (!head) + head = skb; + else + tail->next = skb; + tail = end; + } + skb = next; + } + return head; +} + static void qdisc_pkt_len_init(struct sk_buff *skb) { const struct skb_shared_info *shinfo = skb_shinfo(skb); @@ -2786,8 +2810,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, qdisc_bstats_update(q, skb); - skb = validate_xmit_skb(skb, dev); - if (skb && sch_direct_xmit(skb, q, dev, txq, root_lock)) { + if (sch_direct_xmit(skb, q, dev, txq, 
root_lock, true)) { if (unlikely(contended)) { spin_unlock(&q->busylock); contended = false; diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 797ebef73642..2b349a4de3c8 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -56,40 +56,34 @@ static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q) return 0; } -static struct sk_buff *try_bulk_dequeue_skb(struct Qdisc *q, - struct sk_buff *head_skb, - int bytelimit) +static void try_bulk_dequeue_skb(struct Qdisc *q, + struct sk_buff *skb, + const struct netdev_queue *txq) { - struct sk_buff *skb, *tail_skb = head_skb; + int bytelimit = qdisc_avail_bulklimit(txq) - skb->len; while (bytelimit > 0) { - skb = q->dequeue(q); - if (!skb) - break; + struct sk_buff *nskb = q->dequeue(q); - bytelimit -= skb->len; /* covers GSO len */ - skb = validate_xmit_skb(skb, qdisc_dev(q)); - if (!skb) + if (!nskb) break; - while (tail_skb->next) /* GSO list goto tail */ - tail_skb = tail_skb->next; - - tail_skb->next = skb; - tail_skb = skb; + bytelimit -= nskb->len; /* covers GSO len */ + skb->next = nskb; + skb = nskb; } - - return head_skb; + skb->next = NULL; } /* Note that dequeue_skb can possibly return a SKB list (via skb->next). * A requeued skb (via q->gso_skb) can also be a SKB list. */ -static inline struct sk_buff *dequeue_skb(struct Qdisc *q) +static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate) { struct sk_buff *skb = q->gso_skb; const struct netdev_queue *txq = q->dev_queue; + *validate = true; if (unlikely(skb)) { /* check the reason of requeuing without tx lock first */ txq = skb_get_tx_queue(txq->dev, skb); @@ -98,21 +92,16 @@ static inline struct sk_buff *dequeue_skb(struct Qdisc *q) q->q.qlen--; } else skb = NULL; + /* skb in gso_skb were already validated */ + *validate = false; } else { if (!(q->flags & TCQ_F_ONETXQUEUE) || !netif_xmit_frozen_or_stopped(txq)) { - int bytelimit = qdisc_avail_bulklimit(txq); - skb = q->dequeue(q); - if (skb) { - bytelimit -= skb->len; - skb = validate_xmit_skb(skb, qdisc_dev(q)); - } if (skb && qdisc_may_bulk(q)) - skb = try_bulk_dequeue_skb(q, skb, bytelimit); + try_bulk_dequeue_skb(q, skb, txq); } } - return skb; } @@ -156,19 +145,24 @@ static inline int handle_dev_cpu_collision(struct sk_buff *skb, */ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, struct net_device *dev, struct netdev_queue *txq, - spinlock_t *root_lock) + spinlock_t *root_lock, bool validate) { int ret = NETDEV_TX_BUSY; /* And release qdisc */ spin_unlock(root_lock); - HARD_TX_LOCK(dev, txq, smp_processor_id()); - if (!netif_xmit_frozen_or_stopped(txq)) - skb = dev_hard_start_xmit(skb, dev, txq, &ret); + /* Note that we validate skb (GSO, checksum, ...) 
outside of locks */ + if (validate) + skb = validate_xmit_skb_list(skb, dev); - HARD_TX_UNLOCK(dev, txq); + if (skb) { + HARD_TX_LOCK(dev, txq, smp_processor_id()); + if (!netif_xmit_frozen_or_stopped(txq)) + skb = dev_hard_start_xmit(skb, dev, txq, &ret); + HARD_TX_UNLOCK(dev, txq); + } spin_lock(root_lock); if (dev_xmit_complete(ret)) { @@ -217,9 +211,10 @@ static inline int qdisc_restart(struct Qdisc *q) struct net_device *dev; spinlock_t *root_lock; struct sk_buff *skb; + bool validate; /* Dequeue packet */ - skb = dequeue_skb(q); + skb = dequeue_skb(q, &validate); if (unlikely(!skb)) return 0; @@ -229,7 +224,7 @@ static inline int qdisc_restart(struct Qdisc *q) dev = qdisc_dev(q); txq = skb_get_tx_queue(dev, skb); - return sch_direct_xmit(skb, q, dev, txq, root_lock); + return sch_direct_xmit(skb, q, dev, txq, root_lock, validate); } void __qdisc_run(struct Qdisc *q) -- cgit v1.2.3 From bec3cfdca36bf43cfa3751ad7b56db1a307e0760 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 3 Oct 2014 20:59:19 -0700 Subject: net: skb_segment() provides list head and tail Its unfortunate we have to walk again skb list to find the tail after segmentation, even if data is probably hot in cpu caches. skb_segment() can store the tail of the list into segs->prev, and validate_xmit_skb_list() can immediately get the tail. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 27 +++++++++++++++------------ net/core/skbuff.c | 5 +++++ 2 files changed, 20 insertions(+), 12 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 1a90530f83ff..7d5691cc1f47 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2724,22 +2724,25 @@ struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *d { struct sk_buff *next, *head = NULL, *tail; - while (skb) { + for (; skb != NULL; skb = next) { next = skb->next; skb->next = NULL; + + /* in case skb wont be segmented, point to itself */ + skb->prev = skb; + skb = validate_xmit_skb(skb, dev); - if (skb) { - struct sk_buff *end = skb; + if (!skb) + continue; - while (end->next) - end = end->next; - if (!head) - head = skb; - else - tail->next = skb; - tail = end; - } - skb = next; + if (!head) + head = skb; + else + tail->next = skb; + /* If skb was segmented, skb->prev points to + * the last segment. If not, it still contains skb. + */ + tail = skb->prev; } return head; } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 9a423e2c5766..7b3df0d518ab 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3083,6 +3083,11 @@ perform_csum_check: } } while ((offset += len) < head_skb->len); + /* Some callers want to get the end of the list. + * Put it in segs->prev to avoid walking the list. + * (see validate_xmit_skb_list() for example) + */ + segs->prev = tail; return segs; err: -- cgit v1.2.3 From fcbeb976d7ce783fd58e63e61c196d9a8912b3be Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 5 Oct 2014 10:11:27 -0700 Subject: net: introduce netdevice gso_min_segs attribute Some TSO engines might have a too heavy setup cost, that impacts performance on hosts sending small bursts (2 MSS per packet). This patch adds a device gso_min_segs, allowing drivers to set a minimum segment size for TSO packets, according to the NIC performance. Tested on a mlx4 NIC, this allows to get a ~110% increase of throughput when sending 2 MSS per packet. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 4 +++- net/core/dev.c | 9 ++++++--- 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 22d54b9b700d..2df86f50261c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1416,6 +1416,8 @@ enum netdev_priv_flags { * @gso_max_size: Maximum size of generic segmentation offload * @gso_max_segs: Maximum number of segments that can be passed to the * NIC for GSO + * @gso_min_segs: Minimum number of segments that can be passed to the + * NIC for GSO * * @dcbnl_ops: Data Center Bridging netlink ops * @num_tc: Number of traffic classes in the net device @@ -1666,7 +1668,7 @@ struct net_device { unsigned int gso_max_size; #define GSO_MAX_SEGS 65535 u16 gso_max_segs; - + u16 gso_min_segs; #ifdef CONFIG_DCB const struct dcbnl_rtnl_ops *dcbnl_ops; #endif diff --git a/net/core/dev.c b/net/core/dev.c index 7d5691cc1f47..c1a4de275576 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2567,10 +2567,12 @@ static netdev_features_t harmonize_features(struct sk_buff *skb, netdev_features_t netif_skb_features(struct sk_buff *skb) { + const struct net_device *dev = skb->dev; + netdev_features_t features = dev->features; + u16 gso_segs = skb_shinfo(skb)->gso_segs; __be16 protocol = skb->protocol; - netdev_features_t features = skb->dev->features; - if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs) + if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs) features &= ~NETIF_F_GSO_MASK; if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) { @@ -2581,7 +2583,7 @@ netdev_features_t netif_skb_features(struct sk_buff *skb) } features = netdev_intersect_features(features, - skb->dev->vlan_features | + dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX); @@ -6661,6 +6663,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev->gso_max_size = GSO_MAX_SIZE; dev->gso_max_segs = GSO_MAX_SEGS; + dev->gso_min_segs = 0; INIT_LIST_HEAD(&dev->napi_list); INIT_LIST_HEAD(&dev->unreg_list); -- cgit v1.2.3 From 1ff0dc9499b25d016777f9b8d3ee486fd588ba59 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 6 Oct 2014 11:26:27 -0700 Subject: net: validate_xmit_vlan() is static Marking this as static allows compiler to inline it. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index c1a4de275576..a63b8c43c1b6 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2645,7 +2645,8 @@ out: return skb; } -struct sk_buff *validate_xmit_vlan(struct sk_buff *skb, netdev_features_t features) +static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb, + netdev_features_t features) { if (vlan_tx_tag_present(skb) && !vlan_hw_offload_capable(features, skb->vlan_proto)) { -- cgit v1.2.3 From 0287587884b15041203b3a362d485e1ab1f24445 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 5 Oct 2014 18:38:35 -0700 Subject: net: better IFF_XMIT_DST_RELEASE support Testing xmit_more support with netperf and connected UDP sockets, I found strange dst refcount false sharing. Current handling of IFF_XMIT_DST_RELEASE is not optimal. Dropping dst in validate_xmit_skb() is certainly too late in case packet was queued by cpu X but dequeued by cpu Y The logical point to take care of drop/force is in __dev_queue_xmit() before even taking qdisc lock. 
As Julian Anastasov pointed out, need for skb_dst() might come from some packet schedulers or classifiers. This patch adds new helper to cleanly express needs of various drivers or qdiscs/classifiers. Drivers that need skb_dst() in their ndo_start_xmit() should call following helper in their setup instead of the prior : dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; -> netif_keep_dst(dev); Instead of using a single bit, we use two bits, one being eventually rebuilt in bonding/team drivers. The other one, is permanent and blocks IFF_XMIT_DST_RELEASE being rebuilt in bonding/team. Eventually, we could add something smarter later. Signed-off-by: Eric Dumazet Cc: Julian Anastasov Signed-off-by: David S. Miller --- drivers/infiniband/ulp/ipoib/ipoib_main.c | 2 +- drivers/net/appletalk/ipddp.c | 2 +- drivers/net/bonding/bond_main.c | 9 ++++++--- drivers/net/eql.c | 2 +- drivers/net/ifb.c | 3 ++- drivers/net/loopback.c | 2 +- drivers/net/macvlan.c | 3 ++- drivers/net/ppp/ppp_generic.c | 2 +- drivers/net/team/team.c | 8 +++++--- drivers/net/vxlan.c | 2 +- drivers/net/wan/hdlc_fr.c | 2 +- drivers/s390/net/qeth_l3_main.c | 2 +- include/linux/netdevice.h | 8 ++++++++ net/8021q/vlan_dev.c | 3 ++- net/atm/clip.c | 2 +- net/core/dev.c | 19 +++++++++---------- net/ipv4/ip_gre.c | 2 +- net/ipv4/ip_vti.c | 2 +- net/ipv4/ipip.c | 2 +- net/ipv6/ip6_gre.c | 2 +- net/ipv6/ip6_tunnel.c | 2 +- net/ipv6/ip6_vti.c | 2 +- net/ipv6/sit.c | 2 +- net/sched/cls_flow.c | 2 ++ net/sched/cls_route.c | 1 + net/sched/sch_generic.c | 3 --- net/sched/sch_teql.c | 2 +- 27 files changed, 54 insertions(+), 39 deletions(-) (limited to 'net/core/dev.c') diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 13e6e0431592..58b5aa3b6f2d 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -1364,7 +1364,7 @@ void ipoib_setup(struct net_device *dev) dev->tx_queue_len = ipoib_sendq_size * 2; dev->features = (NETIF_F_VLAN_CHALLENGED | NETIF_F_HIGHDMA); - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); memcpy(dev->broadcast, ipv4_bcast_addr, INFINIBAND_ALEN); diff --git a/drivers/net/appletalk/ipddp.c b/drivers/net/appletalk/ipddp.c index 10d0dba572c2..e90c6a7333d7 100644 --- a/drivers/net/appletalk/ipddp.c +++ b/drivers/net/appletalk/ipddp.c @@ -74,7 +74,7 @@ static struct net_device * __init ipddp_init(void) if (!dev) return ERR_PTR(-ENOMEM); - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); strcpy(dev->name, "ipddp%d"); if (version_printed++ == 0) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 3ad5413d4f57..c9ac06cfe6b7 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1002,7 +1002,8 @@ static netdev_features_t bond_fix_features(struct net_device *dev, static void bond_compute_features(struct bonding *bond) { - unsigned int flags, dst_release_flag = IFF_XMIT_DST_RELEASE; + unsigned int dst_release_flag = IFF_XMIT_DST_RELEASE | + IFF_XMIT_DST_RELEASE_PERM; netdev_features_t vlan_features = BOND_VLAN_FEATURES; netdev_features_t enc_features = BOND_ENC_FEATURES; struct net_device *bond_dev = bond->dev; @@ -1038,8 +1039,10 @@ done: bond_dev->gso_max_segs = gso_max_segs; netif_set_gso_max_size(bond_dev, gso_max_size); - flags = bond_dev->priv_flags & ~IFF_XMIT_DST_RELEASE; - bond_dev->priv_flags = flags | dst_release_flag; + bond_dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + if ((bond_dev->priv_flags & IFF_XMIT_DST_RELEASE_PERM) && + 
dst_release_flag == (IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM)) + bond_dev->priv_flags |= IFF_XMIT_DST_RELEASE; netdev_change_features(bond_dev); } diff --git a/drivers/net/eql.c b/drivers/net/eql.c index 957e5c0cede3..a10ad74cc8d2 100644 --- a/drivers/net/eql.c +++ b/drivers/net/eql.c @@ -199,7 +199,7 @@ static void __init eql_setup(struct net_device *dev) dev->type = ARPHRD_SLIP; dev->tx_queue_len = 5; /* Hands them off fast */ - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); } static int eql_open(struct net_device *dev) diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c index d2d4a3d2237f..34f846b4bd05 100644 --- a/drivers/net/ifb.c +++ b/drivers/net/ifb.c @@ -185,7 +185,8 @@ static void ifb_setup(struct net_device *dev) dev->flags |= IFF_NOARP; dev->flags &= ~IFF_MULTICAST; - dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING); + dev->priv_flags &= ~IFF_TX_SKB_SHARING; + netif_keep_dst(dev); eth_hw_addr_random(dev); } diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c index 8f2262540561..c76283c2f84a 100644 --- a/drivers/net/loopback.c +++ b/drivers/net/loopback.c @@ -169,7 +169,7 @@ static void loopback_setup(struct net_device *dev) dev->type = ARPHRD_LOOPBACK; /* 0x0001*/ dev->flags = IFF_LOOPBACK; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); dev->hw_features = NETIF_F_ALL_TSO | NETIF_F_UFO; dev->features = NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_ALL_TSO diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index e8a453f1b458..38b4fae61f04 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -1025,7 +1025,8 @@ void macvlan_common_setup(struct net_device *dev) { ether_setup(dev); - dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING); + dev->priv_flags &= ~IFF_TX_SKB_SHARING; + netif_keep_dst(dev); dev->priv_flags |= IFF_UNICAST_FLT; dev->netdev_ops = &macvlan_netdev_ops; dev->destructor = free_netdev; diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c index fa0d71727894..80e6f3430f65 100644 --- a/drivers/net/ppp/ppp_generic.c +++ b/drivers/net/ppp/ppp_generic.c @@ -1103,7 +1103,7 @@ static void ppp_setup(struct net_device *dev) dev->type = ARPHRD_PPP; dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST; dev->features |= NETIF_F_NETNS_LOCAL; - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); } /* diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index 2277c3679a51..a94a9df3e6bd 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -970,7 +970,8 @@ static void __team_compute_features(struct team *team) struct team_port *port; u32 vlan_features = TEAM_VLAN_FEATURES & NETIF_F_ALL_FOR_ALL; unsigned short max_hard_header_len = ETH_HLEN; - unsigned int flags, dst_release_flag = IFF_XMIT_DST_RELEASE; + unsigned int dst_release_flag = IFF_XMIT_DST_RELEASE | + IFF_XMIT_DST_RELEASE_PERM; list_for_each_entry(port, &team->port_list, list) { vlan_features = netdev_increment_features(vlan_features, @@ -985,8 +986,9 @@ static void __team_compute_features(struct team *team) team->dev->vlan_features = vlan_features; team->dev->hard_header_len = max_hard_header_len; - flags = team->dev->priv_flags & ~IFF_XMIT_DST_RELEASE; - team->dev->priv_flags = flags | dst_release_flag; + team->dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + if (dst_release_flag == (IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM)) + team->dev->priv_flags |= IFF_XMIT_DST_RELEASE; netdev_change_features(team->dev); } diff 
--git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 2af795d6ba05..2a51e6e48e1e 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -2193,7 +2193,7 @@ static void vxlan_setup(struct net_device *dev) dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM; dev->hw_features |= NETIF_F_GSO_SOFTWARE; dev->hw_features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; INIT_LIST_HEAD(&vxlan->next); diff --git a/drivers/net/wan/hdlc_fr.c b/drivers/net/wan/hdlc_fr.c index e5c7e6165a4b..3ebed1c40abb 100644 --- a/drivers/net/wan/hdlc_fr.c +++ b/drivers/net/wan/hdlc_fr.c @@ -1047,7 +1047,7 @@ static void pvc_setup(struct net_device *dev) dev->flags = IFF_POINTOPOINT; dev->hard_header_len = 10; dev->addr_len = 2; - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); } static const struct net_device_ops pvc_ops = { diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c index f8427a2c4840..afebb9709763 100644 --- a/drivers/s390/net/qeth_l3_main.c +++ b/drivers/s390/net/qeth_l3_main.c @@ -3306,7 +3306,7 @@ static int qeth_l3_setup_netdev(struct qeth_card *card) card->dev->features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX | NETIF_F_HW_VLAN_CTAG_FILTER; - card->dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(card->dev); card->dev->gso_max_size = 15 * PAGE_SIZE; SET_NETDEV_DEV(card->dev, &card->gdev->dev); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2df86f50261c..3a4315b39d20 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1206,6 +1206,7 @@ enum netdev_priv_flags { IFF_SUPP_NOFCS = 1<<19, IFF_LIVE_ADDR_CHANGE = 1<<20, IFF_MACVLAN = 1<<21, + IFF_XMIT_DST_RELEASE_PERM = 1<<22, }; #define IFF_802_1Q_VLAN IFF_802_1Q_VLAN @@ -1230,6 +1231,7 @@ enum netdev_priv_flags { #define IFF_SUPP_NOFCS IFF_SUPP_NOFCS #define IFF_LIVE_ADDR_CHANGE IFF_LIVE_ADDR_CHANGE #define IFF_MACVLAN IFF_MACVLAN +#define IFF_XMIT_DST_RELEASE_PERM IFF_XMIT_DST_RELEASE_PERM /** * struct net_device - The DEVICE structure. @@ -3588,6 +3590,12 @@ static inline bool netif_supports_nofcs(struct net_device *dev) return dev->priv_flags & IFF_SUPP_NOFCS; } +/* This device needs to keep skb dst for qdisc enqueue or ndo_start_xmit() */ +static inline void netif_keep_dst(struct net_device *dev) +{ + dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM); +} + extern struct pernet_operations __net_initdata loopback_net_ops; /* Logging, debugging and troubleshooting/diagnostic helpers. */ diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 35a6b6b15e8a..0d441ec8763e 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -799,7 +799,8 @@ void vlan_setup(struct net_device *dev) ether_setup(dev); dev->priv_flags |= IFF_802_1Q_VLAN; - dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING); + dev->priv_flags &= ~IFF_TX_SKB_SHARING; + netif_keep_dst(dev); dev->tx_queue_len = 0; dev->netdev_ops = &vlan_netdev_ops; diff --git a/net/atm/clip.c b/net/atm/clip.c index 1d9eaa4f041a..17e55dfecbe2 100644 --- a/net/atm/clip.c +++ b/net/atm/clip.c @@ -501,7 +501,7 @@ static void clip_setup(struct net_device *dev) /* without any more elaborate queuing. 100 is a reasonable */ /* compromise between decent burst-tolerance and protection */ /* against memory hogs. 
*/ - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); } static int clip_create(int number) diff --git a/net/core/dev.c b/net/core/dev.c index a63b8c43c1b6..3c5bdaa44486 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2665,12 +2665,6 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device if (skb->next) return skb; - /* If device doesn't need skb->dst, release it right now while - * its hot in this cpu cache - */ - if (dev->priv_flags & IFF_XMIT_DST_RELEASE) - skb_dst_drop(skb); - features = netif_skb_features(skb); skb = validate_xmit_vlan(skb, features); if (unlikely(!skb)) @@ -2811,8 +2805,6 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, * waiting to be sent out; and the qdisc is not running - * xmit the skb directly. */ - if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE)) - skb_dst_force(skb); qdisc_bstats_update(q, skb); @@ -2827,7 +2819,6 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, rc = NET_XMIT_SUCCESS; } else { - skb_dst_force(skb); rc = q->enqueue(skb, q) & NET_XMIT_MASK; if (qdisc_run_begin(q)) { if (unlikely(contended)) { @@ -2924,6 +2915,14 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) skb_update_prio(skb); + /* If device/qdisc don't need skb->dst, release it right now while + * its hot in this cpu cache. + */ + if (dev->priv_flags & IFF_XMIT_DST_RELEASE) + skb_dst_drop(skb); + else + skb_dst_force(skb); + txq = netdev_pick_tx(dev, skb, accel_priv); q = rcu_dereference_bh(txq->qdisc); @@ -6674,7 +6673,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, INIT_LIST_HEAD(&dev->adj_list.lower); INIT_LIST_HEAD(&dev->all_adj_list.upper); INIT_LIST_HEAD(&dev->all_adj_list.lower); - dev->priv_flags = IFF_XMIT_DST_RELEASE; + dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; setup(dev); dev->num_tx_queues = txqs; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 0485ef18d254..12055fdbe716 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -510,7 +510,7 @@ static int ipgre_tunnel_init(struct net_device *dev) memcpy(dev->broadcast, &iph->daddr, 4); dev->flags = IFF_NOARP; - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); dev->addr_len = 4; if (iph->daddr) { diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index e453cb724a95..3e861011e4a3 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -364,7 +364,7 @@ static int vti_tunnel_init(struct net_device *dev) dev->iflink = 0; dev->addr_len = 4; dev->features |= NETIF_F_LLTX; - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); return ip_tunnel_init(dev); } diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index ea88ab3102a8..37096d64730e 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -289,7 +289,7 @@ static void ipip_tunnel_setup(struct net_device *dev) dev->iflink = 0; dev->addr_len = 4; dev->features |= NETIF_F_LLTX; - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); dev->features |= IPIP_FEATURES; dev->hw_features |= IPIP_FEATURES; diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 74b677916a70..de3b1c86b8d3 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1242,7 +1242,7 @@ static void ip6gre_tunnel_setup(struct net_device *dev) dev->flags |= IFF_NOARP; dev->iflink = 0; dev->addr_len = sizeof(struct in6_addr); - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); } static int ip6gre_tunnel_init(struct net_device *dev) diff --git a/net/ipv6/ip6_tunnel.c 
b/net/ipv6/ip6_tunnel.c index d3e8888ad611..9409887fb664 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1493,7 +1493,7 @@ static void ip6_tnl_dev_setup(struct net_device *dev) dev->mtu -= 8; dev->flags |= IFF_NOARP; dev->addr_len = sizeof(struct in6_addr); - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); /* This perm addr will be used as interface identifier by IPv6 */ dev->addr_assign_type = NET_ADDR_RANDOM; eth_random_addr(dev->perm_addr); diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 5833a2244467..d440bb585524 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -807,7 +807,7 @@ static void vti6_dev_setup(struct net_device *dev) dev->mtu = ETH_DATA_LEN; dev->flags |= IFF_NOARP; dev->addr_len = sizeof(struct in6_addr); - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); } /** diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 0d4e27466f82..6eab37cf5345 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -1364,7 +1364,7 @@ static void ipip6_tunnel_setup(struct net_device *dev) dev->hard_header_len = LL_MAX_HEADER + t_hlen; dev->mtu = ETH_DATA_LEN - t_hlen; dev->flags = IFF_NOARP; - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); dev->iflink = 0; dev->addr_len = 4; dev->features |= NETIF_F_LLTX; diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index a5d2b20db560..4ac515f2a6ce 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -493,6 +493,8 @@ static int flow_change(struct net *net, struct sk_buff *in_skb, tcf_exts_change(tp, &fnew->exts, &e); tcf_em_tree_change(tp, &fnew->ematches, &t); + netif_keep_dst(qdisc_dev(tp->q)); + if (tb[TCA_FLOW_KEYS]) { fnew->keymask = keymask; fnew->nkeys = nkeys; diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index 6f22baae0afa..109a329b7198 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -524,6 +524,7 @@ static int route4_change(struct net *net, struct sk_buff *in_skb, if (f->handle < f1->handle) break; + netif_keep_dst(qdisc_dev(tp->q)); rcu_assign_pointer(f->next, f1); rcu_assign_pointer(*fp, f); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 2b349a4de3c8..38d58e6cef07 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -47,7 +47,6 @@ EXPORT_SYMBOL(default_qdisc_ops); static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q) { - skb_dst_force(skb); q->gso_skb = skb; q->qstats.requeues++; q->q.qlen++; /* it's still part of the queue */ @@ -218,8 +217,6 @@ static inline int qdisc_restart(struct Qdisc *q) if (unlikely(!skb)) return 0; - WARN_ON_ONCE(skb_dst_is_noref(skb)); - root_lock = qdisc_lock(q); dev = qdisc_dev(q); txq = skb_get_tx_queue(dev, skb); diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index 5cd291bd00e4..6ada42396a24 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c @@ -470,7 +470,7 @@ static __init void teql_master_setup(struct net_device *dev) dev->tx_queue_len = 100; dev->flags = IFF_NOARP; dev->hard_header_len = LL_MAX_HEADER; - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); } static LIST_HEAD(master_dev_list); -- cgit v1.2.3 From 04ffcb255f22a2a988ce7393e6e72f6eb3fcb7aa Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 14 Oct 2014 15:19:06 -0700 Subject: net: Add ndo_gso_check Add ndo_gso_check which a device can define to indicate whether is is capable of doing GSO on a packet. This funciton would be called from the stack to determine whether software GSO is needed to be done. 
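For a sense of how a driver would use the hook, here is a minimal, hypothetical sketch for a UDP-tunnelling NIC that can only offload inner headers up to a fixed length (the names, the 80-byte limit and the heuristic are illustrative assumptions, not taken from this patch):

	#define SKETCH_MAX_TUN_HLEN 80	/* assumed hardware limit on inner header offset */

	static bool sketch_gso_check(struct sk_buff *skb, struct net_device *dev)
	{
		/* Non-encapsulated GSO packets are always handled in hardware. */
		if (!skb->encapsulation)
			return true;

		/* Returning false makes the stack fall back to software GSO for
		 * tunnelled packets whose inner mac header starts too far past
		 * the outer transport header for this hardware.
		 */
		return skb_inner_mac_header(skb) - skb_transport_header(skb) <=
		       SKETCH_MAX_TUN_HLEN;
	}

	static const struct net_device_ops sketch_netdev_ops = {
		/* ...the driver's other ndo_* callbacks... */
		.ndo_gso_check = sketch_gso_check,
	};

A real driver would key the check off whatever limits its hardware actually has; the only contract is the boolean return value consumed by netif_needs_gso() below.
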
A driver should populate this function if it advertises GSO types for which there are combinations that it wouldn't be able to handle. For instance a device that performs UDP tunneling might only implement support for transparent Ethernet bridging type of inner packets or might have limitations on lengths of inner headers. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- drivers/net/macvtap.c | 2 +- drivers/net/xen-netfront.c | 2 +- include/linux/netdevice.h | 12 +++++++++++- net/core/dev.c | 2 +- 4 files changed, 14 insertions(+), 4 deletions(-) (limited to 'net/core/dev.c') diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c index 0c6adaaf898c..65e2892342bd 100644 --- a/drivers/net/macvtap.c +++ b/drivers/net/macvtap.c @@ -298,7 +298,7 @@ static rx_handler_result_t macvtap_handle_frame(struct sk_buff **pskb) */ if (q->flags & IFF_VNET_HDR) features |= vlan->tap_features; - if (netif_needs_gso(skb, features)) { + if (netif_needs_gso(dev, skb, features)) { struct sk_buff *segs = __skb_gso_segment(skb, features, false); if (IS_ERR(segs)) diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index ca82f545ec2c..3c0b37574d05 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -638,7 +638,7 @@ static int xennet_start_xmit(struct sk_buff *skb, struct net_device *dev) if (unlikely(!netif_carrier_ok(dev) || (slots > 1 && !xennet_can_sg(dev)) || - netif_needs_gso(skb, netif_skb_features(skb)))) { + netif_needs_gso(dev, skb, netif_skb_features(skb)))) { spin_unlock_irqrestore(&queue->tx_lock, flags); goto drop; } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 838407aea705..74fd5d37f15a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -998,6 +998,12 @@ typedef u16 (*select_queue_fallback_t)(struct net_device *dev, * Callback to use for xmit over the accelerated station. This * is used in place of ndo_start_xmit on accelerated net * devices. + * bool (*ndo_gso_check) (struct sk_buff *skb, + * struct net_device *dev); + * Called by core transmit path to determine if device is capable of + * performing GSO on a packet. The device returns true if it is + * able to GSO the packet, false otherwise. If the return value is + * false the stack will do software GSO. 
*/ struct net_device_ops { int (*ndo_init)(struct net_device *dev); @@ -1147,6 +1153,8 @@ struct net_device_ops { struct net_device *dev, void *priv); int (*ndo_get_lock_subclass)(struct net_device *dev); + bool (*ndo_gso_check) (struct sk_buff *skb, + struct net_device *dev); }; /** @@ -3572,10 +3580,12 @@ static inline bool skb_gso_ok(struct sk_buff *skb, netdev_features_t features) (!skb_has_frag_list(skb) || (features & NETIF_F_FRAGLIST)); } -static inline bool netif_needs_gso(struct sk_buff *skb, +static inline bool netif_needs_gso(struct net_device *dev, struct sk_buff *skb, netdev_features_t features) { return skb_is_gso(skb) && (!skb_gso_ok(skb, features) || + (dev->netdev_ops->ndo_gso_check && + !dev->netdev_ops->ndo_gso_check(skb, dev)) || unlikely((skb->ip_summed != CHECKSUM_PARTIAL) && (skb->ip_summed != CHECKSUM_UNNECESSARY))); } diff --git a/net/core/dev.c b/net/core/dev.c index 4699dcfdc4ab..9f77a78c6b1c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2675,7 +2675,7 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device if (skb->encapsulation) features &= dev->hw_enc_features; - if (netif_needs_gso(skb, features)) { + if (netif_needs_gso(dev, skb, features)) { struct sk_buff *segs; segs = skb_gso_segment(skb, features); -- cgit v1.2.3 From 93a35f59f1b13a02674877e3efdf07ae47e34052 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 23 Oct 2014 06:30:30 -0700 Subject: net: napi_reuse_skb() should check pfmemalloc Do not reuse skb if it was pfmemalloc tainted, otherwise future frame might be dropped anyway. Signed-off-by: Eric Dumazet Signed-off-by: Roman Gushchin Signed-off-by: David S. Miller --- net/core/dev.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index b793e3521a36..945bbd001359 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4157,6 +4157,10 @@ EXPORT_SYMBOL(napi_gro_receive); static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) { + if (unlikely(skb->pfmemalloc)) { + consume_skb(skb); + return; + } __skb_pull(skb, skb_headlen(skb)); /* restore the reserve we had after netdev_alloc_skb_ip_align() */ skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb)); -- cgit v1.2.3 From bc9ad166e38ae1cdcb5323a8aa45dff834d68bfa Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 28 Oct 2014 18:05:13 -0700 Subject: net: introduce napi_schedule_irqoff() napi_schedule() can be called from any context and has to mask hard irqs. Add a variant that can only be called from hard interrupts handlers or when irqs are already masked. Many NIC drivers can use it from their hard IRQ handler instead of generic variant. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 13 +++++++++++++ net/core/dev.c | 15 ++++++++++++++- 2 files changed, 27 insertions(+), 1 deletion(-) (limited to 'net/core/dev.c') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 74fd5d37f15a..c85e06512246 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -386,6 +386,7 @@ typedef enum rx_handler_result rx_handler_result_t; typedef rx_handler_result_t rx_handler_func_t(struct sk_buff **pskb); void __napi_schedule(struct napi_struct *n); +void __napi_schedule_irqoff(struct napi_struct *n); static inline bool napi_disable_pending(struct napi_struct *n) { @@ -420,6 +421,18 @@ static inline void napi_schedule(struct napi_struct *n) __napi_schedule(n); } +/** + * napi_schedule_irqoff - schedule NAPI poll + * @n: napi context + * + * Variant of napi_schedule(), assuming hard irqs are masked. + */ +static inline void napi_schedule_irqoff(struct napi_struct *n) +{ + if (napi_schedule_prep(n)) + __napi_schedule_irqoff(n); +} + /* Try to reschedule poll. Called by dev->poll() after napi_complete(). */ static inline bool napi_reschedule(struct napi_struct *napi) { diff --git a/net/core/dev.c b/net/core/dev.c index b793e3521a36..759940cdf896 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4372,7 +4372,8 @@ static int process_backlog(struct napi_struct *napi, int quota) * __napi_schedule - schedule for receive * @n: entry to schedule * - * The entry's receive function will be scheduled to run + * The entry's receive function will be scheduled to run. + * Consider using __napi_schedule_irqoff() if hard irqs are masked. */ void __napi_schedule(struct napi_struct *n) { @@ -4384,6 +4385,18 @@ void __napi_schedule(struct napi_struct *n) } EXPORT_SYMBOL(__napi_schedule); +/** + * __napi_schedule_irqoff - schedule for receive + * @n: entry to schedule + * + * Variant of __napi_schedule() assuming hard irqs are masked + */ +void __napi_schedule_irqoff(struct napi_struct *n) +{ + ____napi_schedule(this_cpu_ptr(&softnet_data), n); +} +EXPORT_SYMBOL(__napi_schedule_irqoff); + void __napi_complete(struct napi_struct *n) { BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state)); -- cgit v1.2.3 From d75b1ade567ffab085e8adbbdacf0092d10cd09c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 2 Nov 2014 06:19:33 -0800 Subject: net: less interrupt masking in NAPI net_rx_action() can mask irqs a single time to transfert sd->poll_list into a private list, for a very short duration. Then, napi_complete() can avoid masking irqs again, and net_rx_action() only needs to mask irq again in slow path. This patch removes 2 couples of irq mask/unmask per typical NAPI run, more if multiple napi were triggered. Note this also allows to give control back to caller (do_softirq()) more often, so that other softirq handlers can be called a bit earlier, or ksoftirqd can be wakeup earlier under pressure. This was developed while testing an alternative to RX interrupt mitigation to reduce latencies while keeping or improving GRO aggregation on fast NIC. Idea is to test napi->gro_list at the end of a napi->poll() and reschedule one NAPI poll, but after servicing a full round of softirqs (timers, TX, rcu, ...). This will be allowed only if softirq is currently serviced by idle task or ksoftirqd, and resched not needed. Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Signed-off-by: David S. 
Miller --- net/core/dev.c | 68 +++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 43 insertions(+), 25 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index ebf778df58cd..40be481268de 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4316,20 +4316,28 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd) local_irq_enable(); } +static bool sd_has_rps_ipi_waiting(struct softnet_data *sd) +{ +#ifdef CONFIG_RPS + return sd->rps_ipi_list != NULL; +#else + return false; +#endif +} + static int process_backlog(struct napi_struct *napi, int quota) { int work = 0; struct softnet_data *sd = container_of(napi, struct softnet_data, backlog); -#ifdef CONFIG_RPS /* Check if we have pending ipi, its better to send them now, * not waiting net_rx_action() end. */ - if (sd->rps_ipi_list) { + if (sd_has_rps_ipi_waiting(sd)) { local_irq_disable(); net_rps_action_and_irq_enable(sd); } -#endif + napi->weight = weight_p; local_irq_disable(); while (1) { @@ -4356,7 +4364,6 @@ static int process_backlog(struct napi_struct *napi, int quota) * We can use a plain write instead of clear_bit(), * and we dont need an smp_mb() memory barrier. */ - list_del(&napi->poll_list); napi->state = 0; rps_unlock(sd); @@ -4406,7 +4413,7 @@ void __napi_complete(struct napi_struct *n) BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state)); BUG_ON(n->gro_list); - list_del(&n->poll_list); + list_del_init(&n->poll_list); smp_mb__before_atomic(); clear_bit(NAPI_STATE_SCHED, &n->state); } @@ -4424,9 +4431,15 @@ void napi_complete(struct napi_struct *n) return; napi_gro_flush(n, false); - local_irq_save(flags); - __napi_complete(n); - local_irq_restore(flags); + + if (likely(list_empty(&n->poll_list))) { + WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state)); + } else { + /* If n->poll_list is not empty, we need to mask irqs */ + local_irq_save(flags); + __napi_complete(n); + local_irq_restore(flags); + } } EXPORT_SYMBOL(napi_complete); @@ -4520,29 +4533,28 @@ static void net_rx_action(struct softirq_action *h) struct softnet_data *sd = this_cpu_ptr(&softnet_data); unsigned long time_limit = jiffies + 2; int budget = netdev_budget; + LIST_HEAD(list); + LIST_HEAD(repoll); void *have; local_irq_disable(); + list_splice_init(&sd->poll_list, &list); + local_irq_enable(); - while (!list_empty(&sd->poll_list)) { + while (!list_empty(&list)) { struct napi_struct *n; int work, weight; - /* If softirq window is exhuasted then punt. + /* If softirq window is exhausted then punt. * Allow this to run for 2 jiffies since which will allow * an average latency of 1.5/HZ. */ if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit))) goto softnet_break; - local_irq_enable(); - /* Even though interrupts have been re-enabled, this - * access is safe because interrupts can only add new - * entries to the tail of this list, and only ->poll() - * calls can remove this head entry from the list. - */ - n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list); + n = list_first_entry(&list, struct napi_struct, poll_list); + list_del_init(&n->poll_list); have = netpoll_poll_lock(n); @@ -4564,8 +4576,6 @@ static void net_rx_action(struct softirq_action *h) budget -= work; - local_irq_disable(); - /* Drivers must not modify the NAPI state if they * consume the entire weight. 
In such cases this code * still "owns" the NAPI instance and therefore can @@ -4573,32 +4583,40 @@ static void net_rx_action(struct softirq_action *h) */ if (unlikely(work == weight)) { if (unlikely(napi_disable_pending(n))) { - local_irq_enable(); napi_complete(n); - local_irq_disable(); } else { if (n->gro_list) { /* flush too old packets * If HZ < 1000, flush all packets. */ - local_irq_enable(); napi_gro_flush(n, HZ >= 1000); - local_irq_disable(); } - list_move_tail(&n->poll_list, &sd->poll_list); + list_add_tail(&n->poll_list, &repoll); } } netpoll_poll_unlock(have); } + + if (!sd_has_rps_ipi_waiting(sd) && + list_empty(&list) && + list_empty(&repoll)) + return; out: + local_irq_disable(); + + list_splice_tail_init(&sd->poll_list, &list); + list_splice_tail(&repoll, &list); + list_splice(&list, &sd->poll_list); + if (!list_empty(&sd->poll_list)) + __raise_softirq_irqoff(NET_RX_SOFTIRQ); + net_rps_action_and_irq_enable(sd); return; softnet_break: sd->time_squeeze++; - __raise_softirq_irqoff(NET_RX_SOFTIRQ); goto out; } -- cgit v1.2.3 From ff960a731788a7408b6f66ec4fd772ff18833211 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 29 Oct 2014 17:04:56 +0100 Subject: netdev, sched/wait: Fix sleeping inside wait event rtnl_lock_unregistering*() take rtnl_lock() -- a mutex -- inside a wait loop. The wait loop relies on current->state to function, but so does mutex_lock(), nesting them makes for the inner to destroy the outer state. Fix this using the new wait_woken() bits. Reported-by: Fengguang Wu Signed-off-by: Peter Zijlstra (Intel) Acked-by: David S. Miller Cc: Oleg Nesterov Cc: Cong Wang Cc: David Gibson Cc: Eric Biederman Cc: Eric Dumazet Cc: Jamal Hadi Salim Cc: Jerry Chu Cc: Jiri Pirko Cc: John Fastabend Cc: Linus Torvalds Cc: Nicolas Dichtel Cc: sfeldma@cumulusnetworks.com Cc: stephen hemminger Cc: Tom Gundersen Cc: Tom Herbert Cc: Veaceslav Falico Cc: Vlad Yasevich Cc: netdev@vger.kernel.org Link: http://lkml.kernel.org/r/20141029173110.GE15602@worktop.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- net/core/dev.c | 10 +++++----- net/core/rtnetlink.c | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index b793e3521a36..c5a9d73147a6 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7196,11 +7196,10 @@ static void __net_exit rtnl_lock_unregistering(struct list_head *net_list) */ struct net *net; bool unregistering; - DEFINE_WAIT(wait); + DEFINE_WAIT_FUNC(wait, woken_wake_function); + add_wait_queue(&netdev_unregistering_wq, &wait); for (;;) { - prepare_to_wait(&netdev_unregistering_wq, &wait, - TASK_UNINTERRUPTIBLE); unregistering = false; rtnl_lock(); list_for_each_entry(net, net_list, exit_list) { @@ -7212,9 +7211,10 @@ static void __net_exit rtnl_lock_unregistering(struct list_head *net_list) if (!unregistering) break; __rtnl_unlock(); - schedule(); + + wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); } - finish_wait(&netdev_unregistering_wq, &wait); + remove_wait_queue(&netdev_unregistering_wq, &wait); } static void __net_exit default_device_exit_batch(struct list_head *net_list) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index a6882686ca3a..b095296f711f 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -365,11 +365,10 @@ static void rtnl_lock_unregistering_all(void) { struct net *net; bool unregistering; - DEFINE_WAIT(wait); + DEFINE_WAIT_FUNC(wait, woken_wake_function); + add_wait_queue(&netdev_unregistering_wq, &wait); for 
(;;) { - prepare_to_wait(&netdev_unregistering_wq, &wait, - TASK_UNINTERRUPTIBLE); unregistering = false; rtnl_lock(); for_each_net(net) { @@ -381,9 +380,10 @@ static void rtnl_lock_unregistering_all(void) if (!unregistering) break; __rtnl_unlock(); - schedule(); + + wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); } - finish_wait(&netdev_unregistering_wq, &wait); + remove_wait_queue(&netdev_unregistering_wq, &wait); } /** -- cgit v1.2.3 From 25cd9ba0abc0749e5cb78e6493c6f6b3311ec6c5 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Mon, 6 Oct 2014 05:05:13 -0700 Subject: openvswitch: Add basic MPLS support to kernel Allow datapath to recognize and extract MPLS labels into flow keys and execute actions which push, pop, and set labels on packets. Based heavily on work by Leo Alterman, Ravi K, Isaku Yamahata and Joe Stringer. Cc: Ravi K Cc: Leo Alterman Cc: Isaku Yamahata Cc: Joe Stringer Signed-off-by: Simon Horman Signed-off-by: Jesse Gross Signed-off-by: Pravin B Shelar --- include/net/mpls.h | 39 +++++++++++ include/uapi/linux/openvswitch.h | 32 +++++++++ net/core/dev.c | 3 +- net/openvswitch/Kconfig | 1 + net/openvswitch/actions.c | 106 ++++++++++++++++++++++++++++- net/openvswitch/datapath.c | 6 +- net/openvswitch/flow.c | 30 +++++++++ net/openvswitch/flow.h | 17 +++-- net/openvswitch/flow_netlink.c | 139 ++++++++++++++++++++++++++++++++++----- net/openvswitch/flow_netlink.h | 2 +- 10 files changed, 345 insertions(+), 30 deletions(-) create mode 100644 include/net/mpls.h (limited to 'net/core/dev.c') diff --git a/include/net/mpls.h b/include/net/mpls.h new file mode 100644 index 000000000000..5b3b5addfb08 --- /dev/null +++ b/include/net/mpls.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2014 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#ifndef _NET_MPLS_H +#define _NET_MPLS_H 1 + +#include +#include + +#define MPLS_HLEN 4 + +static inline bool eth_p_mpls(__be16 eth_type) +{ + return eth_type == htons(ETH_P_MPLS_UC) || + eth_type == htons(ETH_P_MPLS_MC); +} + +/* + * For non-MPLS skbs this will correspond to the network header. + * For MPLS skbs it will be before the network_header as the MPLS + * label stack lies between the end of the mac header and the network + * header. That is, for MPLS skbs the end of the mac header + * is the top of the MPLS label stack. + */ +static inline unsigned char *skb_mpls_header(struct sk_buff *skb) +{ + return skb_mac_header(skb) + skb->mac_len; +} +#endif diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 435eabc5ffaa..631056b66f80 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -293,6 +293,9 @@ enum ovs_key_attr { OVS_KEY_ATTR_DP_HASH, /* u32 hash value. Value 0 indicates the hash is not computed by the datapath. */ OVS_KEY_ATTR_RECIRC_ID, /* u32 recirc id */ + OVS_KEY_ATTR_MPLS, /* array of struct ovs_key_mpls. + * The implementation may restrict + * the accepted length of the array. 
*/ #ifdef __KERNEL__ OVS_KEY_ATTR_TUNNEL_INFO, /* struct ovs_tunnel_info */ @@ -340,6 +343,10 @@ struct ovs_key_ethernet { __u8 eth_dst[ETH_ALEN]; }; +struct ovs_key_mpls { + __be32 mpls_lse; +}; + struct ovs_key_ipv4 { __be32 ipv4_src; __be32 ipv4_dst; @@ -483,6 +490,19 @@ enum ovs_userspace_attr { #define OVS_USERSPACE_ATTR_MAX (__OVS_USERSPACE_ATTR_MAX - 1) +/** + * struct ovs_action_push_mpls - %OVS_ACTION_ATTR_PUSH_MPLS action argument. + * @mpls_lse: MPLS label stack entry to push. + * @mpls_ethertype: Ethertype to set in the encapsulating ethernet frame. + * + * The only values @mpls_ethertype should ever be given are %ETH_P_MPLS_UC and + * %ETH_P_MPLS_MC, indicating MPLS unicast or multicast. Other are rejected. + */ +struct ovs_action_push_mpls { + __be32 mpls_lse; + __be16 mpls_ethertype; /* Either %ETH_P_MPLS_UC or %ETH_P_MPLS_MC */ +}; + /** * struct ovs_action_push_vlan - %OVS_ACTION_ATTR_PUSH_VLAN action argument. * @vlan_tpid: Tag protocol identifier (TPID) to push. @@ -534,6 +554,15 @@ struct ovs_action_hash { * @OVS_ACTION_ATTR_POP_VLAN: Pop the outermost 802.1Q header off the packet. * @OVS_ACTION_ATTR_SAMPLE: Probabilitically executes actions, as specified in * the nested %OVS_SAMPLE_ATTR_* attributes. + * @OVS_ACTION_ATTR_PUSH_MPLS: Push a new MPLS label stack entry onto the + * top of the packets MPLS label stack. Set the ethertype of the + * encapsulating frame to either %ETH_P_MPLS_UC or %ETH_P_MPLS_MC to + * indicate the new packet contents. + * @OVS_ACTION_ATTR_POP_MPLS: Pop an MPLS label stack entry off of the + * packet's MPLS label stack. Set the encapsulating frame's ethertype to + * indicate the new packet contents. This could potentially still be + * %ETH_P_MPLS if the resulting MPLS label stack is not empty. If there + * is no MPLS label stack, as determined by ethertype, no action is taken. * * Only a single header can be set with a single %OVS_ACTION_ATTR_SET. Not all * fields within a header are modifiable, e.g. the IPv4 protocol and fragment @@ -550,6 +579,9 @@ enum ovs_action_attr { OVS_ACTION_ATTR_SAMPLE, /* Nested OVS_SAMPLE_ATTR_*. */ OVS_ACTION_ATTR_RECIRC, /* u32 recirc_id. */ OVS_ACTION_ATTR_HASH, /* struct ovs_action_hash. */ + OVS_ACTION_ATTR_PUSH_MPLS, /* struct ovs_action_push_mpls. */ + OVS_ACTION_ATTR_POP_MPLS, /* __be16 ethertype. 
*/ + __OVS_ACTION_ATTR_MAX }; diff --git a/net/core/dev.c b/net/core/dev.c index 40be481268de..70bb609c283d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -118,6 +118,7 @@ #include #include #include +#include #include #include #include @@ -2530,7 +2531,7 @@ static netdev_features_t net_mpls_features(struct sk_buff *skb, netdev_features_t features, __be16 type) { - if (type == htons(ETH_P_MPLS_UC) || type == htons(ETH_P_MPLS_MC)) + if (eth_p_mpls(type)) features &= skb->dev->mpls_features; return features; diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig index 2a9673e39ca1..454ce12efbbf 100644 --- a/net/openvswitch/Kconfig +++ b/net/openvswitch/Kconfig @@ -30,6 +30,7 @@ config OPENVSWITCH config OPENVSWITCH_GRE tristate "Open vSwitch GRE tunneling support" + select NET_MPLS_GSO depends on INET depends on OPENVSWITCH depends on NET_IPGRE_DEMUX diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 922c133b1933..930b1b6e4cef 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -28,10 +28,12 @@ #include #include #include + #include #include #include #include +#include #include #include "datapath.h" @@ -118,6 +120,92 @@ static int make_writable(struct sk_buff *skb, int write_len) return pskb_expand_head(skb, 0, 0, GFP_ATOMIC); } +static int push_mpls(struct sk_buff *skb, + const struct ovs_action_push_mpls *mpls) +{ + __be32 *new_mpls_lse; + struct ethhdr *hdr; + + /* Networking stack do not allow simultaneous Tunnel and MPLS GSO. */ + if (skb->encapsulation) + return -ENOTSUPP; + + if (skb_cow_head(skb, MPLS_HLEN) < 0) + return -ENOMEM; + + skb_push(skb, MPLS_HLEN); + memmove(skb_mac_header(skb) - MPLS_HLEN, skb_mac_header(skb), + skb->mac_len); + skb_reset_mac_header(skb); + + new_mpls_lse = (__be32 *)skb_mpls_header(skb); + *new_mpls_lse = mpls->mpls_lse; + + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->csum = csum_add(skb->csum, csum_partial(new_mpls_lse, + MPLS_HLEN, 0)); + + hdr = eth_hdr(skb); + hdr->h_proto = mpls->mpls_ethertype; + + skb_set_inner_protocol(skb, skb->protocol); + skb->protocol = mpls->mpls_ethertype; + + return 0; +} + +static int pop_mpls(struct sk_buff *skb, const __be16 ethertype) +{ + struct ethhdr *hdr; + int err; + + err = make_writable(skb, skb->mac_len + MPLS_HLEN); + if (unlikely(err)) + return err; + + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->csum = csum_sub(skb->csum, + csum_partial(skb_mpls_header(skb), + MPLS_HLEN, 0)); + + memmove(skb_mac_header(skb) + MPLS_HLEN, skb_mac_header(skb), + skb->mac_len); + + __skb_pull(skb, MPLS_HLEN); + skb_reset_mac_header(skb); + + /* skb_mpls_header() is used to locate the ethertype + * field correctly in the presence of VLAN tags. + */ + hdr = (struct ethhdr *)(skb_mpls_header(skb) - ETH_HLEN); + hdr->h_proto = ethertype; + if (eth_p_mpls(skb->protocol)) + skb->protocol = ethertype; + return 0; +} + +static int set_mpls(struct sk_buff *skb, const __be32 *mpls_lse) +{ + __be32 *stack; + int err; + + err = make_writable(skb, skb->mac_len + MPLS_HLEN); + if (unlikely(err)) + return err; + + stack = (__be32 *)skb_mpls_header(skb); + if (skb->ip_summed == CHECKSUM_COMPLETE) { + __be32 diff[] = { ~(*stack), *mpls_lse }; + + skb->csum = ~csum_partial((char *)diff, sizeof(diff), + ~skb->csum); + } + + *stack = *mpls_lse; + + return 0; +} + /* remove VLAN header from packet and update csum accordingly. 
*/ static int __pop_vlan_tci(struct sk_buff *skb, __be16 *current_tci) { @@ -140,10 +228,12 @@ static int __pop_vlan_tci(struct sk_buff *skb, __be16 *current_tci) vlan_set_encap_proto(skb, vhdr); skb->mac_header += VLAN_HLEN; + if (skb_network_offset(skb) < ETH_HLEN) skb_set_network_header(skb, ETH_HLEN); - skb_reset_mac_len(skb); + /* Update mac_len for subsequent MPLS actions */ + skb_reset_mac_len(skb); return 0; } @@ -186,6 +276,8 @@ static int push_vlan(struct sk_buff *skb, const struct ovs_action_push_vlan *vla if (!__vlan_put_tag(skb, skb->vlan_proto, current_tag)) return -ENOMEM; + /* Update mac_len for subsequent MPLS actions */ + skb->mac_len += VLAN_HLEN; if (skb->ip_summed == CHECKSUM_COMPLETE) skb->csum = csum_add(skb->csum, csum_partial(skb->data @@ -612,6 +704,10 @@ static int execute_set_action(struct sk_buff *skb, case OVS_KEY_ATTR_SCTP: err = set_sctp(skb, nla_data(nested_attr)); break; + + case OVS_KEY_ATTR_MPLS: + err = set_mpls(skb, nla_data(nested_attr)); + break; } return err; @@ -690,6 +786,14 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, execute_hash(skb, key, a); break; + case OVS_ACTION_ATTR_PUSH_MPLS: + err = push_mpls(skb, nla_data(a)); + break; + + case OVS_ACTION_ATTR_POP_MPLS: + err = pop_mpls(skb, nla_get_be16(a)); + break; + case OVS_ACTION_ATTR_PUSH_VLAN: err = push_vlan(skb, nla_data(a)); if (unlikely(err)) /* skb already freed. */ diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index f18302f32049..688cb9bc0ef1 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -560,7 +560,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) goto err_flow_free; err = ovs_nla_copy_actions(a[OVS_PACKET_ATTR_ACTIONS], - &flow->key, 0, &acts); + &flow->key, &acts); if (err) goto err_flow_free; @@ -846,7 +846,7 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) goto err_kfree_flow; error = ovs_nla_copy_actions(a[OVS_FLOW_ATTR_ACTIONS], &new_flow->key, - 0, &acts); + &acts); if (error) { OVS_NLERR("Flow actions may not be safe on all matching packets.\n"); goto err_kfree_acts; @@ -953,7 +953,7 @@ static struct sw_flow_actions *get_flow_actions(const struct nlattr *a, return acts; ovs_flow_mask_key(&masked_key, key, mask); - error = ovs_nla_copy_actions(a, &masked_key, 0, &acts); + error = ovs_nla_copy_actions(a, &masked_key, &acts); if (error) { OVS_NLERR("Flow actions may not be safe on all matching packets.\n"); kfree(acts); diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 2b78789ea7c5..90a21010fc8f 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -42,6 +43,7 @@ #include #include #include +#include #include #include "datapath.h" @@ -480,6 +482,7 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) return -ENOMEM; skb_reset_network_header(skb); + skb_reset_mac_len(skb); __skb_push(skb, skb->data - skb_mac_header(skb)); /* Network layer. */ @@ -584,6 +587,33 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) memset(&key->ip, 0, sizeof(key->ip)); memset(&key->ipv4, 0, sizeof(key->ipv4)); } + } else if (eth_p_mpls(key->eth.type)) { + size_t stack_len = MPLS_HLEN; + + /* In the presence of an MPLS label stack the end of the L2 + * header and the beginning of the L3 header differ. + * + * Advance network_header to the beginning of the L3 + * header. 
mac_len corresponds to the end of the L2 header. + */ + while (1) { + __be32 lse; + + error = check_header(skb, skb->mac_len + stack_len); + if (unlikely(error)) + return 0; + + memcpy(&lse, skb_network_header(skb), MPLS_HLEN); + + if (stack_len == MPLS_HLEN) + memcpy(&key->mpls.top_lse, &lse, MPLS_HLEN); + + skb_set_network_header(skb, skb->mac_len + stack_len); + if (lse & htonl(MPLS_LS_S_MASK)) + break; + + stack_len += MPLS_HLEN; + } } else if (key->eth.type == htons(ETH_P_IPV6)) { int nh_len; /* IPv6 Header + Extensions */ diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index 71813318c8c7..4962bee81a11 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -102,12 +102,17 @@ struct sw_flow_key { __be16 tci; /* 0 if no VLAN, VLAN_TAG_PRESENT set otherwise. */ __be16 type; /* Ethernet frame type. */ } eth; - struct { - u8 proto; /* IP protocol or lower 8 bits of ARP opcode. */ - u8 tos; /* IP ToS. */ - u8 ttl; /* IP TTL/hop limit. */ - u8 frag; /* One of OVS_FRAG_TYPE_*. */ - } ip; + union { + struct { + __be32 top_lse; /* top label stack entry */ + } mpls; + struct { + u8 proto; /* IP protocol or lower 8 bits of ARP opcode. */ + u8 tos; /* IP ToS. */ + u8 ttl; /* IP TTL/hop limit. */ + u8 frag; /* One of OVS_FRAG_TYPE_*. */ + } ip; + }; struct { __be16 src; /* TCP/UDP/SCTP source port. */ __be16 dst; /* TCP/UDP/SCTP destination port. */ diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 939bcb32100f..569309c49cc0 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -46,6 +46,7 @@ #include #include #include +#include #include "flow_netlink.h" @@ -134,7 +135,8 @@ static bool match_validate(const struct sw_flow_match *match, | (1 << OVS_KEY_ATTR_ICMP) | (1 << OVS_KEY_ATTR_ICMPV6) | (1 << OVS_KEY_ATTR_ARP) - | (1 << OVS_KEY_ATTR_ND)); + | (1 << OVS_KEY_ATTR_ND) + | (1 << OVS_KEY_ATTR_MPLS)); /* Always allowed mask fields. 
*/ mask_allowed |= ((1 << OVS_KEY_ATTR_TUNNEL) @@ -149,6 +151,12 @@ static bool match_validate(const struct sw_flow_match *match, mask_allowed |= 1 << OVS_KEY_ATTR_ARP; } + if (eth_p_mpls(match->key->eth.type)) { + key_expected |= 1 << OVS_KEY_ATTR_MPLS; + if (match->mask && (match->mask->key.eth.type == htons(0xffff))) + mask_allowed |= 1 << OVS_KEY_ATTR_MPLS; + } + if (match->key->eth.type == htons(ETH_P_IP)) { key_expected |= 1 << OVS_KEY_ATTR_IPV4; if (match->mask && (match->mask->key.eth.type == htons(0xffff))) @@ -266,6 +274,7 @@ static const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = { [OVS_KEY_ATTR_RECIRC_ID] = sizeof(u32), [OVS_KEY_ATTR_DP_HASH] = sizeof(u32), [OVS_KEY_ATTR_TUNNEL] = -1, + [OVS_KEY_ATTR_MPLS] = sizeof(struct ovs_key_mpls), }; static bool is_all_zero(const u8 *fp, size_t size) @@ -735,6 +744,16 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs, attrs &= ~(1 << OVS_KEY_ATTR_ARP); } + if (attrs & (1 << OVS_KEY_ATTR_MPLS)) { + const struct ovs_key_mpls *mpls_key; + + mpls_key = nla_data(a[OVS_KEY_ATTR_MPLS]); + SW_FLOW_KEY_PUT(match, mpls.top_lse, + mpls_key->mpls_lse, is_mask); + + attrs &= ~(1 << OVS_KEY_ATTR_MPLS); + } + if (attrs & (1 << OVS_KEY_ATTR_TCP)) { const struct ovs_key_tcp *tcp_key; @@ -1140,6 +1159,14 @@ int ovs_nla_put_flow(const struct sw_flow_key *swkey, arp_key->arp_op = htons(output->ip.proto); ether_addr_copy(arp_key->arp_sha, output->ipv4.arp.sha); ether_addr_copy(arp_key->arp_tha, output->ipv4.arp.tha); + } else if (eth_p_mpls(swkey->eth.type)) { + struct ovs_key_mpls *mpls_key; + + nla = nla_reserve(skb, OVS_KEY_ATTR_MPLS, sizeof(*mpls_key)); + if (!nla) + goto nla_put_failure; + mpls_key = nla_data(nla); + mpls_key->mpls_lse = output->mpls.top_lse; } if ((swkey->eth.type == htons(ETH_P_IP) || @@ -1336,9 +1363,15 @@ static inline void add_nested_action_end(struct sw_flow_actions *sfa, a->nla_len = sfa->actions_len - st_offset; } +static int ovs_nla_copy_actions__(const struct nlattr *attr, + const struct sw_flow_key *key, + int depth, struct sw_flow_actions **sfa, + __be16 eth_type, __be16 vlan_tci); + static int validate_and_copy_sample(const struct nlattr *attr, const struct sw_flow_key *key, int depth, - struct sw_flow_actions **sfa) + struct sw_flow_actions **sfa, + __be16 eth_type, __be16 vlan_tci) { const struct nlattr *attrs[OVS_SAMPLE_ATTR_MAX + 1]; const struct nlattr *probability, *actions; @@ -1375,7 +1408,8 @@ static int validate_and_copy_sample(const struct nlattr *attr, if (st_acts < 0) return st_acts; - err = ovs_nla_copy_actions(actions, key, depth + 1, sfa); + err = ovs_nla_copy_actions__(actions, key, depth + 1, sfa, + eth_type, vlan_tci); if (err) return err; @@ -1385,10 +1419,10 @@ static int validate_and_copy_sample(const struct nlattr *attr, return 0; } -static int validate_tp_port(const struct sw_flow_key *flow_key) +static int validate_tp_port(const struct sw_flow_key *flow_key, + __be16 eth_type) { - if ((flow_key->eth.type == htons(ETH_P_IP) || - flow_key->eth.type == htons(ETH_P_IPV6)) && + if ((eth_type == htons(ETH_P_IP) || eth_type == htons(ETH_P_IPV6)) && (flow_key->tp.src || flow_key->tp.dst)) return 0; @@ -1483,7 +1517,7 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, static int validate_set(const struct nlattr *a, const struct sw_flow_key *flow_key, struct sw_flow_actions **sfa, - bool *set_tun) + bool *set_tun, __be16 eth_type) { const struct nlattr *ovs_key = nla_data(a); int key_type = nla_type(ovs_key); @@ -1508,6 +1542,9 @@ static int validate_set(const struct 
nlattr *a, break; case OVS_KEY_ATTR_TUNNEL: + if (eth_p_mpls(eth_type)) + return -EINVAL; + *set_tun = true; err = validate_and_copy_set_tun(a, sfa); if (err) @@ -1515,7 +1552,7 @@ static int validate_set(const struct nlattr *a, break; case OVS_KEY_ATTR_IPV4: - if (flow_key->eth.type != htons(ETH_P_IP)) + if (eth_type != htons(ETH_P_IP)) return -EINVAL; if (!flow_key->ip.proto) @@ -1531,7 +1568,7 @@ static int validate_set(const struct nlattr *a, break; case OVS_KEY_ATTR_IPV6: - if (flow_key->eth.type != htons(ETH_P_IPV6)) + if (eth_type != htons(ETH_P_IPV6)) return -EINVAL; if (!flow_key->ip.proto) @@ -1553,19 +1590,24 @@ static int validate_set(const struct nlattr *a, if (flow_key->ip.proto != IPPROTO_TCP) return -EINVAL; - return validate_tp_port(flow_key); + return validate_tp_port(flow_key, eth_type); case OVS_KEY_ATTR_UDP: if (flow_key->ip.proto != IPPROTO_UDP) return -EINVAL; - return validate_tp_port(flow_key); + return validate_tp_port(flow_key, eth_type); + + case OVS_KEY_ATTR_MPLS: + if (!eth_p_mpls(eth_type)) + return -EINVAL; + break; case OVS_KEY_ATTR_SCTP: if (flow_key->ip.proto != IPPROTO_SCTP) return -EINVAL; - return validate_tp_port(flow_key); + return validate_tp_port(flow_key, eth_type); default: return -EINVAL; @@ -1609,12 +1651,13 @@ static int copy_action(const struct nlattr *from, return 0; } -int ovs_nla_copy_actions(const struct nlattr *attr, - const struct sw_flow_key *key, - int depth, - struct sw_flow_actions **sfa) +static int ovs_nla_copy_actions__(const struct nlattr *attr, + const struct sw_flow_key *key, + int depth, struct sw_flow_actions **sfa, + __be16 eth_type, __be16 vlan_tci) { const struct nlattr *a; + bool out_tnl_port = false; int rem, err; if (depth >= SAMPLE_ACTION_DEPTH) @@ -1626,6 +1669,8 @@ int ovs_nla_copy_actions(const struct nlattr *attr, [OVS_ACTION_ATTR_OUTPUT] = sizeof(u32), [OVS_ACTION_ATTR_RECIRC] = sizeof(u32), [OVS_ACTION_ATTR_USERSPACE] = (u32)-1, + [OVS_ACTION_ATTR_PUSH_MPLS] = sizeof(struct ovs_action_push_mpls), + [OVS_ACTION_ATTR_POP_MPLS] = sizeof(__be16), [OVS_ACTION_ATTR_PUSH_VLAN] = sizeof(struct ovs_action_push_vlan), [OVS_ACTION_ATTR_POP_VLAN] = 0, [OVS_ACTION_ATTR_SET] = (u32)-1, @@ -1655,6 +1700,8 @@ int ovs_nla_copy_actions(const struct nlattr *attr, case OVS_ACTION_ATTR_OUTPUT: if (nla_get_u32(a) >= DP_MAX_PORTS) return -EINVAL; + out_tnl_port = false; + break; case OVS_ACTION_ATTR_HASH: { @@ -1671,6 +1718,7 @@ int ovs_nla_copy_actions(const struct nlattr *attr, } case OVS_ACTION_ATTR_POP_VLAN: + vlan_tci = htons(0); break; case OVS_ACTION_ATTR_PUSH_VLAN: @@ -1679,19 +1727,66 @@ int ovs_nla_copy_actions(const struct nlattr *attr, return -EINVAL; if (!(vlan->vlan_tci & htons(VLAN_TAG_PRESENT))) return -EINVAL; + vlan_tci = vlan->vlan_tci; break; case OVS_ACTION_ATTR_RECIRC: break; + case OVS_ACTION_ATTR_PUSH_MPLS: { + const struct ovs_action_push_mpls *mpls = nla_data(a); + + /* Networking stack do not allow simultaneous Tunnel + * and MPLS GSO. + */ + if (out_tnl_port) + return -EINVAL; + + if (!eth_p_mpls(mpls->mpls_ethertype)) + return -EINVAL; + /* Prohibit push MPLS other than to a white list + * for packets that have a known tag order. 
+ */ + if (vlan_tci & htons(VLAN_TAG_PRESENT) || + (eth_type != htons(ETH_P_IP) && + eth_type != htons(ETH_P_IPV6) && + eth_type != htons(ETH_P_ARP) && + eth_type != htons(ETH_P_RARP) && + !eth_p_mpls(eth_type))) + return -EINVAL; + eth_type = mpls->mpls_ethertype; + break; + } + + case OVS_ACTION_ATTR_POP_MPLS: + if (vlan_tci & htons(VLAN_TAG_PRESENT) || + !eth_p_mpls(eth_type)) + return -EINVAL; + + /* Disallow subsequent L2.5+ set and mpls_pop actions + * as there is no check here to ensure that the new + * eth_type is valid and thus set actions could + * write off the end of the packet or otherwise + * corrupt it. + * + * Support for these actions is planned using packet + * recirculation. + */ + eth_type = htons(0); + break; + case OVS_ACTION_ATTR_SET: - err = validate_set(a, key, sfa, &skip_copy); + err = validate_set(a, key, sfa, + &out_tnl_port, eth_type); if (err) return err; + + skip_copy = out_tnl_port; break; case OVS_ACTION_ATTR_SAMPLE: - err = validate_and_copy_sample(a, key, depth, sfa); + err = validate_and_copy_sample(a, key, depth, sfa, + eth_type, vlan_tci); if (err) return err; skip_copy = true; @@ -1713,6 +1808,14 @@ int ovs_nla_copy_actions(const struct nlattr *attr, return 0; } +int ovs_nla_copy_actions(const struct nlattr *attr, + const struct sw_flow_key *key, + struct sw_flow_actions **sfa) +{ + return ovs_nla_copy_actions__(attr, key, 0, sfa, key->eth.type, + key->eth.tci); +} + static int sample_action_to_attr(const struct nlattr *attr, struct sk_buff *skb) { const struct nlattr *a; diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h index 206e45add888..6355b1d01329 100644 --- a/net/openvswitch/flow_netlink.h +++ b/net/openvswitch/flow_netlink.h @@ -49,7 +49,7 @@ int ovs_nla_get_match(struct sw_flow_match *match, const struct nlattr *); int ovs_nla_copy_actions(const struct nlattr *attr, - const struct sw_flow_key *key, int depth, + const struct sw_flow_key *key, struct sw_flow_actions **sfa); int ovs_nla_put_actions(const struct nlattr *attr, int len, struct sk_buff *skb); -- cgit v1.2.3 From 3b47d30396bae4f0bd1ff0dbcd7c4f5077e7df4e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 6 Nov 2014 21:09:44 -0800 Subject: net: gro: add a per device gro flush timer Tuning coalescing parameters on NIC can be really hard. Servers can handle both bulk and RPC like traffic, with conflicting goals : bulk flows want as big GRO packets as possible, RPC want minimal latencies. To reach big GRO packets on 10Gbe NIC, one can use : ethtool -C eth0 rx-usecs 4 rx-frames 44 But this penalizes rpc sessions, with an increase of latencies, up to 50% in some cases, as NICs generally do not force an interrupt when a packet with TCP Push flag is received. Some NICs do not have an absolute timer, only a timer rearmed for every incoming packet. This patch uses a different strategy : Let GRO stack decides what do do, based on traffic pattern. Packets with Push flag wont be delayed. Packets without Push flag might be held in GRO engine, if we keep receiving data. This new mechanism is off by default, and shall be enabled by setting /sys/class/net/ethX/gro_flush_timeout to a value in nanosecond. To fully enable this mechanism, drivers should use napi_complete_done() instead of napi_complete(). Tested: Ran 200 netperf TCP_STREAM from A to B (10Gbe mlx4 link, 8 RX queues) Without this feature, we send back about 305,000 ACK per second. 
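A hypothetical "foo" driver illustrates the conversion mentioned above; only the completion call changes (foo_clean_rx() is an assumed, device-specific helper returning the number of packets processed):

#include <linux/netdevice.h>

struct foo_ring {
	struct napi_struct napi;
	/* device specific receive state ... */
};

static int foo_clean_rx(struct foo_ring *ring, int budget);	/* assumed helper */

static int foo_poll(struct napi_struct *napi, int budget)
{
	struct foo_ring *ring = container_of(napi, struct foo_ring, napi);
	int work_done = foo_clean_rx(ring, budget);

	if (work_done < budget)
		napi_complete_done(napi, work_done);	/* was napi_complete(napi) */

	return work_done;
}

Passing the real work_done lets the core arm the gro_flush_timeout timer instead of flushing GRO immediately.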
GRO aggregation ratio is low (811/305 = 2.65 segments per GRO packet) Setting a timer of 2000 nsec is enough to increase GRO packet sizes and reduce number of ACK packets. (811/19.2 = 42) Receiver performs less calls to upper stacks, less wakes up. This also reduces cpu usage on the sender, as it receives less ACK packets. Note that reducing number of wakes up increases cpu efficiency, but can decrease QPS, as applications wont have the chance to warmup cpu caches doing a partial read of RPC requests/answers if they fit in one skb. B:~# sar -n DEV 1 10 | grep eth0 | tail -1 Average: eth0 811269.80 305732.30 1199462.57 19705.72 0.00 0.00 0.50 B:~# echo 2000 >/sys/class/net/eth0/gro_flush_timeout B:~# sar -n DEV 1 10 | grep eth0 | tail -1 Average: eth0 811577.30 19230.80 1199916.51 1239.80 0.00 0.00 0.50 Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 20 ++++++++++---------- net/core/dev.c | 45 +++++++++++++++++++++++++++++++++++++++++---- net/core/net-sysfs.c | 18 ++++++++++++++++++ 3 files changed, 69 insertions(+), 14 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 90ac95900a11..888d5513fa4a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -314,6 +314,7 @@ struct napi_struct { struct net_device *dev; struct sk_buff *gro_list; struct sk_buff *skb; + struct hrtimer timer; struct list_head dev_list; struct hlist_node napi_hash_node; unsigned int napi_id; @@ -443,14 +444,19 @@ static inline bool napi_reschedule(struct napi_struct *napi) return false; } +void __napi_complete(struct napi_struct *n); +void napi_complete_done(struct napi_struct *n, int work_done); /** * napi_complete - NAPI processing complete * @n: napi context * * Mark NAPI processing as complete. + * Consider using napi_complete_done() instead. */ -void __napi_complete(struct napi_struct *n); -void napi_complete(struct napi_struct *n); +static inline void napi_complete(struct napi_struct *n) +{ + return napi_complete_done(n, 0); +} /** * napi_by_id - lookup a NAPI by napi_id @@ -485,14 +491,7 @@ void napi_hash_del(struct napi_struct *napi); * Stop NAPI from being scheduled on this context. * Waits till any outstanding processing completes. 
*/ -static inline void napi_disable(struct napi_struct *n) -{ - might_sleep(); - set_bit(NAPI_STATE_DISABLE, &n->state); - while (test_and_set_bit(NAPI_STATE_SCHED, &n->state)) - msleep(1); - clear_bit(NAPI_STATE_DISABLE, &n->state); -} +void napi_disable(struct napi_struct *n); /** * napi_enable - enable NAPI scheduling @@ -1603,6 +1602,7 @@ struct net_device { #endif + unsigned long gro_flush_timeout; rx_handler_func_t __rcu *rx_handler; void __rcu *rx_handler_data; diff --git a/net/core/dev.c b/net/core/dev.c index 70bb609c283d..bb09b0364619 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -134,6 +134,7 @@ #include #include #include +#include #include "net-sysfs.h" @@ -4412,7 +4413,6 @@ EXPORT_SYMBOL(__napi_schedule_irqoff); void __napi_complete(struct napi_struct *n) { BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state)); - BUG_ON(n->gro_list); list_del_init(&n->poll_list); smp_mb__before_atomic(); @@ -4420,7 +4420,7 @@ void __napi_complete(struct napi_struct *n) } EXPORT_SYMBOL(__napi_complete); -void napi_complete(struct napi_struct *n) +void napi_complete_done(struct napi_struct *n, int work_done) { unsigned long flags; @@ -4431,8 +4431,18 @@ void napi_complete(struct napi_struct *n) if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state))) return; - napi_gro_flush(n, false); + if (n->gro_list) { + unsigned long timeout = 0; + if (work_done) + timeout = n->dev->gro_flush_timeout; + + if (timeout) + hrtimer_start(&n->timer, ns_to_ktime(timeout), + HRTIMER_MODE_REL_PINNED); + else + napi_gro_flush(n, false); + } if (likely(list_empty(&n->poll_list))) { WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state)); } else { @@ -4442,7 +4452,7 @@ void napi_complete(struct napi_struct *n) local_irq_restore(flags); } } -EXPORT_SYMBOL(napi_complete); +EXPORT_SYMBOL(napi_complete_done); /* must be called under rcu_read_lock(), as we dont take a reference */ struct napi_struct *napi_by_id(unsigned int napi_id) @@ -4496,10 +4506,23 @@ void napi_hash_del(struct napi_struct *napi) } EXPORT_SYMBOL_GPL(napi_hash_del); +static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) +{ + struct napi_struct *napi; + + napi = container_of(timer, struct napi_struct, timer); + if (napi->gro_list) + napi_schedule(napi); + + return HRTIMER_NORESTART; +} + void netif_napi_add(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), int weight) { INIT_LIST_HEAD(&napi->poll_list); + hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); + napi->timer.function = napi_watchdog; napi->gro_count = 0; napi->gro_list = NULL; napi->skb = NULL; @@ -4518,6 +4541,20 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi, } EXPORT_SYMBOL(netif_napi_add); +void napi_disable(struct napi_struct *n) +{ + might_sleep(); + set_bit(NAPI_STATE_DISABLE, &n->state); + + while (test_and_set_bit(NAPI_STATE_SCHED, &n->state)) + msleep(1); + + hrtimer_cancel(&n->timer); + + clear_bit(NAPI_STATE_DISABLE, &n->state); +} +EXPORT_SYMBOL(napi_disable); + void netif_napi_del(struct napi_struct *napi) { list_del_init(&napi->dev_list); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 9dd06699b09c..1a24602cd54e 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -325,6 +325,23 @@ static ssize_t tx_queue_len_store(struct device *dev, } NETDEVICE_SHOW_RW(tx_queue_len, fmt_ulong); +static int change_gro_flush_timeout(struct net_device *dev, unsigned long val) +{ + dev->gro_flush_timeout = val; + return 0; +} + +static ssize_t gro_flush_timeout_store(struct 
device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + return netdev_store(dev, attr, buf, len, change_gro_flush_timeout); +} +NETDEVICE_SHOW_RW(gro_flush_timeout, fmt_ulong); + static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { @@ -422,6 +439,7 @@ static struct attribute *net_class_attrs[] = { &dev_attr_mtu.attr, &dev_attr_flags.attr, &dev_attr_tx_queue_len.attr, + &dev_attr_gro_flush_timeout.attr, &dev_attr_phys_port_id.attr, NULL, }; -- cgit v1.2.3 From fbe168ba91f7c327856f205699404284c2f09e36 Mon Sep 17 00:00:00 2001 From: Michal Kubeček Date: Thu, 13 Nov 2014 07:54:50 +0100 Subject: net: generic dev_disable_lro() stacked device handling Large receive offloading is known to cause problems if received packets are passed to other host. Therefore the kernel disables it by calling dev_disable_lro() whenever a network device is enslaved in a bridge or forwarding is enabled for it (or globally). For virtual devices we need to disable LRO on the underlying physical device (which is actually receiving the packets). Current dev_disable_lro() code handles this propagation for a vlan (including 802.1ad nested vlan), macvlan or a vlan on top of a macvlan. It doesn't handle other stacked devices and their combinations, in particular propagation from a bond to its slaves which often causes problems in virtualization setups. As we now have generic data structures describing the upper-lower device relationship, dev_disable_lro() can be generalized to disable LRO also for all lower devices (if any) once it is disabled for the device itself. For bonding and teaming devices, it is necessary to disable LRO not only on current slaves at the moment when dev_disable_lro() is called but also on any slave (port) added later. v2: use lower device links for all devices (including vlan and macvlan) Signed-off-by: Michal Kubecek Acked-by: Veaceslav Falico Signed-off-by: David S. 
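The iteration itself is generic; a short sketch (illustration only, hypothetical helper, assumes rtnl_lock() is held since the adjacency lists are protected by it) of walking one level of lower devices:

#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

static void foo_log_lower_devs(struct net_device *dev)
{
	struct net_device *lower_dev;
	struct list_head *iter;

	ASSERT_RTNL();

	/* one level only; the reworked dev_disable_lro() in the diff below
	 * recurses so that arbitrarily stacked setups (bond over vlan over
	 * a physical port, for example) are covered.
	 */
	netdev_for_each_lower_dev(dev, lower_dev, iter)
		netdev_info(lower_dev, "lower device of %s\n", dev->name);
}
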
Miller --- drivers/net/bonding/bond_main.c | 3 +++ drivers/net/team/team.c | 3 +++ net/core/dev.c | 15 +++++---------- 3 files changed, 11 insertions(+), 10 deletions(-) (limited to 'net/core/dev.c') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index b9b34566b9b8..8575fee8b359 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1526,6 +1526,9 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev) } #endif + if (!(bond_dev->features & NETIF_F_LRO)) + dev_disable_lro(slave_dev); + res = netdev_rx_handler_register(slave_dev, bond_handle_frame, new_slave); if (res) { diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index 2368395d8ae5..93e224217e24 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -1179,6 +1179,9 @@ static int team_port_add(struct team *team, struct net_device *port_dev) goto err_enable_netpoll; } + if (!(dev->features & NETIF_F_LRO)) + dev_disable_lro(port_dev); + err = netdev_rx_handler_register(port_dev, team_handle_frame, port); if (err) { diff --git a/net/core/dev.c b/net/core/dev.c index bb09b0364619..1ab168e0fdf7 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1437,22 +1437,17 @@ EXPORT_SYMBOL(dev_close); */ void dev_disable_lro(struct net_device *dev) { - /* - * If we're trying to disable lro on a vlan device - * use the underlying physical device instead - */ - if (is_vlan_dev(dev)) - dev = vlan_dev_real_dev(dev); - - /* the same for macvlan devices */ - if (netif_is_macvlan(dev)) - dev = macvlan_dev_real_dev(dev); + struct net_device *lower_dev; + struct list_head *iter; dev->wanted_features &= ~NETIF_F_LRO; netdev_update_features(dev); if (unlikely(dev->features & NETIF_F_LRO)) netdev_WARN(dev, "failed to disable LRO!\n"); + + netdev_for_each_lower_dev(dev, lower_dev, iter) + dev_disable_lro(lower_dev); } EXPORT_SYMBOL(dev_disable_lro); -- cgit v1.2.3 From 62749e2cb3c4a7da3eaa5c01a7e787aebeff8536 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 19 Nov 2014 14:04:58 +0100 Subject: vlan: rename __vlan_put_tag to vlan_insert_tag_set_proto Name fits better. Plus there's going to be introduced __vlan_insert_tag later on. Signed-off-by: Jiri Pirko Acked-by: Pravin B Shelar Signed-off-by: David S. 
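All converted call sites in the diff below share the same shape; as a standalone sketch (hypothetical foo_xmit_linear() is assumed to consume the skb), pushing an accelerated tag into the payload with the renamed helper looks like:

#include <linux/errno.h>
#include <linux/if_vlan.h>

static int foo_xmit_linear(struct sk_buff *skb);	/* assumed, device specific */

static int foo_xmit_tagged(struct sk_buff *skb)
{
	if (vlan_tx_tag_present(skb)) {
		skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto,
						vlan_tx_tag_get(skb));
		if (unlikely(!skb))
			return -ENOMEM;	/* helper already freed the original skb */
		skb->vlan_tci = 0;
	}

	return foo_xmit_linear(skb);
}
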
Miller --- drivers/net/bonding/bond_main.c | 4 ++-- drivers/net/ethernet/emulex/benet/be_main.c | 6 ++++-- drivers/net/vxlan.c | 12 ++++++------ include/linux/if_vlan.h | 8 +++++--- net/bridge/br_vlan.c | 4 ++-- net/core/dev.c | 4 ++-- net/core/netpoll.c | 4 ++-- net/ipv4/geneve.c | 11 +++++------ net/openvswitch/actions.c | 4 +++- net/openvswitch/datapath.c | 3 ++- net/openvswitch/vport-gre.c | 6 +++--- 11 files changed, 36 insertions(+), 30 deletions(-) (limited to 'net/core/dev.c') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index e26c68232032..c1d7da427a3e 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -2146,8 +2146,8 @@ static void bond_arp_send(struct net_device *slave_dev, int arp_op, netdev_dbg(slave_dev, "inner tag: proto %X vid %X\n", ntohs(outer_tag->vlan_proto), tags->vlan_id); - skb = __vlan_put_tag(skb, tags->vlan_proto, - tags->vlan_id); + skb = vlan_insert_tag_set_proto(skb, tags->vlan_proto, + tags->vlan_id); if (!skb) { net_err_ratelimited("failed to insert inner VLAN tag\n"); return; diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index 54160cc62656..d02fbc7694ea 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -887,7 +887,8 @@ static struct sk_buff *be_insert_vlan_in_pkt(struct be_adapter *adapter, } if (vlan_tag) { - skb = __vlan_put_tag(skb, htons(ETH_P_8021Q), vlan_tag); + skb = vlan_insert_tag_set_proto(skb, htons(ETH_P_8021Q), + vlan_tag); if (unlikely(!skb)) return skb; skb->vlan_tci = 0; @@ -896,7 +897,8 @@ static struct sk_buff *be_insert_vlan_in_pkt(struct be_adapter *adapter, /* Insert the outer VLAN, if any */ if (adapter->qnq_vid) { vlan_tag = adapter->qnq_vid; - skb = __vlan_put_tag(skb, htons(ETH_P_8021Q), vlan_tag); + skb = vlan_insert_tag_set_proto(skb, htons(ETH_P_8021Q), + vlan_tag); if (unlikely(!skb)) return skb; if (skip_hw_vlan) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 23b1e8c0d547..bb8fbab438e8 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1600,9 +1600,9 @@ static int vxlan6_xmit_skb(struct vxlan_sock *vs, return err; if (vlan_tx_tag_present(skb)) { - if (WARN_ON(!__vlan_put_tag(skb, - skb->vlan_proto, - vlan_tx_tag_get(skb)))) + skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto, + vlan_tx_tag_get(skb)); + if (WARN_ON(!skb)) return -ENOMEM; skb->vlan_tci = 0; @@ -1644,9 +1644,9 @@ int vxlan_xmit_skb(struct vxlan_sock *vs, return err; if (vlan_tx_tag_present(skb)) { - if (WARN_ON(!__vlan_put_tag(skb, - skb->vlan_proto, - vlan_tx_tag_get(skb)))) + skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto, + vlan_tx_tag_get(skb)); + if (WARN_ON(!skb)) return -ENOMEM; skb->vlan_tci = 0; diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 75b70a5e4a6d..46e4a15b9b55 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -320,8 +320,9 @@ static inline struct sk_buff *vlan_insert_tag(struct sk_buff *skb, } /** - * __vlan_put_tag - regular VLAN tag inserting + * vlan_insert_tag_set_proto - regular VLAN tag inserting * @skb: skbuff to tag + * @vlan_proto: VLAN encapsulation protocol * @vlan_tci: VLAN TCI to insert * * Inserts the VLAN tag into @skb as part of the payload @@ -330,8 +331,9 @@ static inline struct sk_buff *vlan_insert_tag(struct sk_buff *skb, * Following the skb_unshare() example, in case of error, the calling function * doesn't have to worry about freeing the original skb. 
*/ -static inline struct sk_buff *__vlan_put_tag(struct sk_buff *skb, - __be16 vlan_proto, u16 vlan_tci) +static inline struct sk_buff *vlan_insert_tag_set_proto(struct sk_buff *skb, + __be16 vlan_proto, + u16 vlan_tci) { skb = vlan_insert_tag(skb, vlan_proto, vlan_tci); if (skb) diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 150048fb99b0..97b8ddf57363 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -199,8 +199,8 @@ bool br_allowed_ingress(struct net_bridge *br, struct net_port_vlans *v, if (skb->vlan_proto != proto) { /* Protocol-mismatch, empty out vlan_tci for new tag */ skb_push(skb, ETH_HLEN); - skb = __vlan_put_tag(skb, skb->vlan_proto, - vlan_tx_tag_get(skb)); + skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto, + vlan_tx_tag_get(skb)); if (unlikely(!skb)) return false; diff --git a/net/core/dev.c b/net/core/dev.c index 1ab168e0fdf7..3611e60df407 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2645,8 +2645,8 @@ static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb, { if (vlan_tx_tag_present(skb) && !vlan_hw_offload_capable(features, skb->vlan_proto)) { - skb = __vlan_put_tag(skb, skb->vlan_proto, - vlan_tx_tag_get(skb)); + skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto, + vlan_tx_tag_get(skb)); if (skb) skb->vlan_tci = 0; } diff --git a/net/core/netpoll.c b/net/core/netpoll.c index e6645b4f330a..65d372384a3f 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -79,8 +79,8 @@ static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev, if (vlan_tx_tag_present(skb) && !vlan_hw_offload_capable(features, skb->vlan_proto)) { - skb = __vlan_put_tag(skb, skb->vlan_proto, - vlan_tx_tag_get(skb)); + skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto, + vlan_tx_tag_get(skb)); if (unlikely(!skb)) { /* This is actually a packet drop, but we * don't want the code that calls this diff --git a/net/ipv4/geneve.c b/net/ipv4/geneve.c index 31802afce34f..fd430a6a1c37 100644 --- a/net/ipv4/geneve.c +++ b/net/ipv4/geneve.c @@ -132,12 +132,11 @@ int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt, return err; if (vlan_tx_tag_present(skb)) { - if (unlikely(!__vlan_put_tag(skb, - skb->vlan_proto, - vlan_tx_tag_get(skb)))) { - err = -ENOMEM; - return err; - } + skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto, + vlan_tx_tag_get(skb)); + if (unlikely(!skb) + return -ENOMEM; + skb->vlan_tci = 0; } diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 749a30163071..426b9131ede0 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -287,7 +287,9 @@ static int push_vlan(struct sk_buff *skb, struct sw_flow_key *key, /* push down current VLAN tag */ current_tag = vlan_tx_tag_get(skb); - if (!__vlan_put_tag(skb, skb->vlan_proto, current_tag)) + skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto, + current_tag); + if (!skb) return -ENOMEM; /* Update mac_len for subsequent MPLS actions */ skb->mac_len += VLAN_HLEN; diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index ab141d49bb9d..c63e60e4d947 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -425,7 +425,8 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, if (!nskb) return -ENOMEM; - nskb = __vlan_put_tag(nskb, nskb->vlan_proto, vlan_tx_tag_get(nskb)); + nskb = vlan_insert_tag_set_proto(nskb, nskb->vlan_proto, + vlan_tx_tag_get(nskb)); if (!nskb) return -ENOMEM; diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c index 
8e61a5c6ae7c..777cd8c71d53 100644 --- a/net/openvswitch/vport-gre.c +++ b/net/openvswitch/vport-gre.c @@ -176,9 +176,9 @@ static int gre_tnl_send(struct vport *vport, struct sk_buff *skb) } if (vlan_tx_tag_present(skb)) { - if (unlikely(!__vlan_put_tag(skb, - skb->vlan_proto, - vlan_tx_tag_get(skb)))) { + skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto, + vlan_tx_tag_get(skb)); + if (unlikely(!skb) { err = -ENOMEM; goto err_free_rt; } -- cgit v1.2.3 From 5968250c868ceee680aa77395b24e6ddcae17d36 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 19 Nov 2014 14:04:59 +0100 Subject: vlan: introduce *vlan_hwaccel_push_inside helpers Use them to push skb->vlan_tci into the payload and avoid code duplication. Signed-off-by: Jiri Pirko Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 22 ++++++---------------- include/linux/if_vlan.h | 34 ++++++++++++++++++++++++++++++++++ net/core/dev.c | 8 ++------ net/core/netpoll.c | 4 +--- net/ipv4/geneve.c | 11 +++-------- net/openvswitch/datapath.c | 4 +--- net/openvswitch/vport-gre.c | 12 ++++-------- 7 files changed, 51 insertions(+), 44 deletions(-) (limited to 'net/core/dev.c') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index bb8fbab438e8..64d45fa3d997 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1599,14 +1599,9 @@ static int vxlan6_xmit_skb(struct vxlan_sock *vs, if (unlikely(err)) return err; - if (vlan_tx_tag_present(skb)) { - skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto, - vlan_tx_tag_get(skb)); - if (WARN_ON(!skb)) - return -ENOMEM; - - skb->vlan_tci = 0; - } + skb = vlan_hwaccel_push_inside(skb); + if (WARN_ON(!skb)) + return -ENOMEM; vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh)); vxh->vx_flags = htonl(VXLAN_FLAGS); @@ -1643,14 +1638,9 @@ int vxlan_xmit_skb(struct vxlan_sock *vs, if (unlikely(err)) return err; - if (vlan_tx_tag_present(skb)) { - skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto, - vlan_tx_tag_get(skb)); - if (WARN_ON(!skb)) - return -ENOMEM; - - skb->vlan_tci = 0; - } + skb = vlan_hwaccel_push_inside(skb); + if (WARN_ON(!skb)) + return -ENOMEM; vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh)); vxh->vx_flags = htonl(VXLAN_FLAGS); diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 46e4a15b9b55..291e6706876e 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -341,6 +341,40 @@ static inline struct sk_buff *vlan_insert_tag_set_proto(struct sk_buff *skb, return skb; } +/* + * __vlan_hwaccel_push_inside - pushes vlan tag to the payload + * @skb: skbuff to tag + * + * Pushes the VLAN tag from @skb->vlan_tci inside to the payload. + * + * Following the skb_unshare() example, in case of error, the calling function + * doesn't have to worry about freeing the original skb. + */ +static inline struct sk_buff *__vlan_hwaccel_push_inside(struct sk_buff *skb) +{ + skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto, + vlan_tx_tag_get(skb)); + if (likely(skb)) + skb->vlan_tci = 0; + return skb; +} +/* + * vlan_hwaccel_push_inside - pushes vlan tag to the payload + * @skb: skbuff to tag + * + * Checks is tag is present in @skb->vlan_tci and if it is, it pushes the + * VLAN tag from @skb->vlan_tci inside to the payload. + * + * Following the skb_unshare() example, in case of error, the calling function + * doesn't have to worry about freeing the original skb. 
+ */ +static inline struct sk_buff *vlan_hwaccel_push_inside(struct sk_buff *skb) +{ + if (vlan_tx_tag_present(skb)) + skb = __vlan_hwaccel_push_inside(skb); + return skb; +} + /** * __vlan_hwaccel_put_tag - hardware accelerated VLAN inserting * @skb: skbuff to tag diff --git a/net/core/dev.c b/net/core/dev.c index 3611e60df407..ac4836241a96 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2644,12 +2644,8 @@ static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb, netdev_features_t features) { if (vlan_tx_tag_present(skb) && - !vlan_hw_offload_capable(features, skb->vlan_proto)) { - skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto, - vlan_tx_tag_get(skb)); - if (skb) - skb->vlan_tci = 0; - } + !vlan_hw_offload_capable(features, skb->vlan_proto)) + skb = __vlan_hwaccel_push_inside(skb); return skb; } diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 65d372384a3f..e0ad5d16c9c5 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -79,8 +79,7 @@ static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev, if (vlan_tx_tag_present(skb) && !vlan_hw_offload_capable(features, skb->vlan_proto)) { - skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto, - vlan_tx_tag_get(skb)); + skb = __vlan_hwaccel_push_inside(skb); if (unlikely(!skb)) { /* This is actually a packet drop, but we * don't want the code that calls this @@ -88,7 +87,6 @@ static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev, */ goto out; } - skb->vlan_tci = 0; } status = netdev_start_xmit(skb, dev, txq, false); diff --git a/net/ipv4/geneve.c b/net/ipv4/geneve.c index fd430a6a1c37..a457232f0131 100644 --- a/net/ipv4/geneve.c +++ b/net/ipv4/geneve.c @@ -131,14 +131,9 @@ int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt, if (unlikely(err)) return err; - if (vlan_tx_tag_present(skb)) { - skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto, - vlan_tx_tag_get(skb)); - if (unlikely(!skb) - return -ENOMEM; - - skb->vlan_tci = 0; - } + skb = vlan_hwaccel_push_inside(skb); + if (unlikely(!skb)) + return -ENOMEM; gnvh = (struct genevehdr *)__skb_push(skb, sizeof(*gnvh) + opt_len); geneve_build_header(gnvh, tun_flags, vni, opt_len, opt); diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index c63e60e4d947..f37ca3e5824c 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -425,12 +425,10 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, if (!nskb) return -ENOMEM; - nskb = vlan_insert_tag_set_proto(nskb, nskb->vlan_proto, - vlan_tx_tag_get(nskb)); + nskb = __vlan_hwaccel_push_inside(nskb); if (!nskb) return -ENOMEM; - nskb->vlan_tci = 0; skb = nskb; } diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c index 777cd8c71d53..6b69df545b1d 100644 --- a/net/openvswitch/vport-gre.c +++ b/net/openvswitch/vport-gre.c @@ -175,14 +175,10 @@ static int gre_tnl_send(struct vport *vport, struct sk_buff *skb) goto err_free_rt; } - if (vlan_tx_tag_present(skb)) { - skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto, - vlan_tx_tag_get(skb)); - if (unlikely(!skb) { - err = -ENOMEM; - goto err_free_rt; - } - skb->vlan_tci = 0; + skb = vlan_hwaccel_push_inside(skb); + if (unlikely(!skb)) { + err = -ENOMEM; + goto err_free_rt; } /* Push Tunnel header. 
*/ -- cgit v1.2.3 From 02637fce3e0103ba086b9c33b6d529e69460e4b6 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 28 Nov 2014 14:34:16 +0100 Subject: net: rename netdev_phys_port_id to more generic name So this can be reused for identification of other "items" as well. Signed-off-by: Jiri Pirko Reviewed-by: Thomas Graf Acked-by: John Fastabend Acked-by: Andy Gospodarek Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c | 2 +- drivers/net/ethernet/intel/i40e/i40e_main.c | 2 +- drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 2 +- drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c | 2 +- include/linux/netdevice.h | 16 ++++++++-------- net/core/dev.c | 2 +- net/core/net-sysfs.c | 2 +- net/core/rtnetlink.c | 6 +++--- 8 files changed, 17 insertions(+), 17 deletions(-) (limited to 'net/core/dev.c') diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c index c4bd025c74c9..336ef3cf5773 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c @@ -12537,7 +12537,7 @@ static int bnx2x_validate_addr(struct net_device *dev) } static int bnx2x_get_phys_port_id(struct net_device *netdev, - struct netdev_phys_port_id *ppid) + struct netdev_phys_item_id *ppid) { struct bnx2x *bp = netdev_priv(netdev); diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 5ed5e4036dd9..9ae4270db0b3 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -7511,7 +7511,7 @@ static void i40e_del_vxlan_port(struct net_device *netdev, #endif static int i40e_get_phys_port_id(struct net_device *netdev, - struct netdev_phys_port_id *ppid) + struct netdev_phys_item_id *ppid) { struct i40e_netdev_priv *np = netdev_priv(netdev); struct i40e_pf *pf = np->vsi->back; diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index b7c99780aef3..1597fb07576c 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -2259,7 +2259,7 @@ static int mlx4_en_set_vf_link_state(struct net_device *dev, int vf, int link_st #define PORT_ID_BYTE_LEN 8 static int mlx4_en_get_phys_port_id(struct net_device *dev, - struct netdev_phys_port_id *ppid) + struct netdev_phys_item_id *ppid) { struct mlx4_en_priv *priv = netdev_priv(dev); struct mlx4_dev *mdev = priv->mdev->dev; diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c index 3227c8063edd..1aa25b13ace1 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c @@ -461,7 +461,7 @@ static void qlcnic_82xx_cancel_idc_work(struct qlcnic_adapter *adapter) } static int qlcnic_get_phys_port_id(struct net_device *netdev, - struct netdev_phys_port_id *ppid) + struct netdev_phys_item_id *ppid) { struct qlcnic_adapter *adapter = netdev_priv(netdev); struct qlcnic_hardware_context *ahw = adapter->ahw; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 589929cf4700..4bd41d72559d 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -754,13 +754,13 @@ struct netdev_fcoe_hbainfo { }; #endif -#define MAX_PHYS_PORT_ID_LEN 32 +#define MAX_PHYS_ITEM_ID_LEN 32 -/* This structure holds a unique identifier to identify the - * physical port used by a netdevice. 
+/* This structure holds a unique identifier to identify some + * physical item (port for example) used by a netdevice. */ -struct netdev_phys_port_id { - unsigned char id[MAX_PHYS_PORT_ID_LEN]; +struct netdev_phys_item_id { + unsigned char id[MAX_PHYS_ITEM_ID_LEN]; unsigned char id_len; }; @@ -976,7 +976,7 @@ typedef u16 (*select_queue_fallback_t)(struct net_device *dev, * USB_CDC_NOTIFY_NETWORK_CONNECTION) should NOT implement this function. * * int (*ndo_get_phys_port_id)(struct net_device *dev, - * struct netdev_phys_port_id *ppid); + * struct netdev_phys_item_id *ppid); * Called to get ID of physical port of this device. If driver does * not implement this, it is assumed that the hw is not able to have * multiple net devices on single physical port. @@ -1152,7 +1152,7 @@ struct net_device_ops { int (*ndo_change_carrier)(struct net_device *dev, bool new_carrier); int (*ndo_get_phys_port_id)(struct net_device *dev, - struct netdev_phys_port_id *ppid); + struct netdev_phys_item_id *ppid); void (*ndo_add_vxlan_port)(struct net_device *dev, sa_family_t sa_family, __be16 port); @@ -2870,7 +2870,7 @@ void dev_set_group(struct net_device *, int); int dev_set_mac_address(struct net_device *, struct sockaddr *); int dev_change_carrier(struct net_device *, bool new_carrier); int dev_get_phys_port_id(struct net_device *dev, - struct netdev_phys_port_id *ppid); + struct netdev_phys_item_id *ppid); struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev); struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq, int *ret); diff --git a/net/core/dev.c b/net/core/dev.c index ac4836241a96..0814a560e5f3 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5846,7 +5846,7 @@ EXPORT_SYMBOL(dev_change_carrier); * Get device physical port ID */ int dev_get_phys_port_id(struct net_device *dev, - struct netdev_phys_port_id *ppid) + struct netdev_phys_item_id *ppid) { const struct net_device_ops *ops = dev->netdev_ops; diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 1a24602cd54e..26c46f4726c5 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -404,7 +404,7 @@ static ssize_t phys_port_id_show(struct device *dev, return restart_syscall(); if (dev_isalive(netdev)) { - struct netdev_phys_port_id ppid; + struct netdev_phys_item_id ppid; ret = dev_get_phys_port_id(netdev, &ppid); if (!ret) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 1a233c1c8ab4..0640f1fbe9dd 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -869,7 +869,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + rtnl_port_size(dev, ext_filter_mask) /* IFLA_VF_PORTS + IFLA_PORT_SELF */ + rtnl_link_get_size(dev) /* IFLA_LINKINFO */ + rtnl_link_get_af_size(dev) /* IFLA_AF_SPEC */ - + nla_total_size(MAX_PHYS_PORT_ID_LEN); /* IFLA_PHYS_PORT_ID */ + + nla_total_size(MAX_PHYS_ITEM_ID_LEN); /* IFLA_PHYS_PORT_ID */ } static int rtnl_vf_ports_fill(struct sk_buff *skb, struct net_device *dev) @@ -953,7 +953,7 @@ static int rtnl_port_fill(struct sk_buff *skb, struct net_device *dev, static int rtnl_phys_port_id_fill(struct sk_buff *skb, struct net_device *dev) { int err; - struct netdev_phys_port_id ppid; + struct netdev_phys_item_id ppid; err = dev_get_phys_port_id(dev, &ppid); if (err) { @@ -1197,7 +1197,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_PROMISCUITY] = { .type = NLA_U32 }, [IFLA_NUM_TX_QUEUES] = { .type = NLA_U32 }, [IFLA_NUM_RX_QUEUES] = { .type = NLA_U32 }, - 
[IFLA_PHYS_PORT_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_PORT_ID_LEN }, + [IFLA_PHYS_PORT_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN }, [IFLA_CARRIER_CHANGES] = { .type = NLA_U32 }, /* ignored */ }; -- cgit v1.2.3 From 395eea6ccf2b253f81b4718ffbcae67d36fe2e69 Mon Sep 17 00:00:00 2001 From: Mahesh Bandewar Date: Wed, 3 Dec 2014 13:46:24 -0800 Subject: rtnetlink: delay RTM_DELLINK notification until after ndo_uninit() The commit 56bfa7ee7c ("unregister_netdevice : move RTM_DELLINK to until after ndo_uninit") tried to do this ealier but while doing so it created a problem. Unfortunately the delayed rtmsg_ifinfo() also delayed call to fill_info(). So this translated into asking driver to remove private state and then query it's private state. This could have catastropic consequences. This change breaks the rtmsg_ifinfo() into two parts - one takes the precise snapshot of the device by called fill_info() before calling the ndo_uninit() and the second part sends the notification using collected snapshot. It was brought to notice when last link is deleted from an ipvlan device when it has free-ed the port and the subsequent .fill_info() call is trying to get the info from the port. kernel: [ 255.139429] ------------[ cut here ]------------ kernel: [ 255.139439] WARNING: CPU: 12 PID: 11173 at net/core/rtnetlink.c:2238 rtmsg_ifinfo+0x100/0x110() kernel: [ 255.139493] Modules linked in: ipvlan bonding w1_therm ds2482 wire cdc_acm ehci_pci ehci_hcd i2c_dev i2c_i801 i2c_core msr cpuid bnx2x ptp pps_core mdio libcrc32c kernel: [ 255.139513] CPU: 12 PID: 11173 Comm: ip Not tainted 3.18.0-smp-DEV #167 kernel: [ 255.139514] Hardware name: Intel RML,PCH/Ibis_QC_18, BIOS 1.0.10 05/15/2012 kernel: [ 255.139515] 0000000000000009 ffff880851b6b828 ffffffff815d87f4 00000000000000e0 kernel: [ 255.139516] 0000000000000000 ffff880851b6b868 ffffffff8109c29c 0000000000000000 kernel: [ 255.139518] 00000000ffffffa6 00000000000000d0 ffffffff81aaf580 0000000000000011 kernel: [ 255.139520] Call Trace: kernel: [ 255.139527] [] dump_stack+0x46/0x58 kernel: [ 255.139531] [] warn_slowpath_common+0x8c/0xc0 kernel: [ 255.139540] [] warn_slowpath_null+0x1a/0x20 kernel: [ 255.139544] [] rtmsg_ifinfo+0x100/0x110 kernel: [ 255.139547] [] rollback_registered_many+0x1d5/0x2d0 kernel: [ 255.139549] [] unregister_netdevice_many+0x1f/0xb0 kernel: [ 255.139551] [] rtnl_dellink+0xbb/0x110 kernel: [ 255.139553] [] rtnetlink_rcv_msg+0xa0/0x240 kernel: [ 255.139557] [] ? rhashtable_lookup_compare+0x43/0x80 kernel: [ 255.139558] [] ? __rtnl_unlock+0x20/0x20 kernel: [ 255.139562] [] netlink_rcv_skb+0xb1/0xc0 kernel: [ 255.139563] [] rtnetlink_rcv+0x25/0x40 kernel: [ 255.139565] [] netlink_unicast+0x178/0x230 kernel: [ 255.139567] [] netlink_sendmsg+0x30f/0x420 kernel: [ 255.139571] [] sock_sendmsg+0x9c/0xd0 kernel: [ 255.139575] [] ? rw_copy_check_uvector+0x6f/0x130 kernel: [ 255.139577] [] ? copy_msghdr_from_user+0x139/0x1b0 kernel: [ 255.139578] [] ___sys_sendmsg+0x304/0x310 kernel: [ 255.139581] [] ? handle_mm_fault+0xca3/0xde0 kernel: [ 255.139585] [] ? destroy_inode+0x3c/0x70 kernel: [ 255.139589] [] ? __do_page_fault+0x20c/0x500 kernel: [ 255.139597] [] ? dput+0xb6/0x190 kernel: [ 255.139606] [] ? mntput+0x26/0x40 kernel: [ 255.139611] [] ? 
__fput+0x174/0x1e0 kernel: [ 255.139613] [] __sys_sendmsg+0x49/0x90 kernel: [ 255.139615] [] SyS_sendmsg+0x12/0x20 kernel: [ 255.139617] [] system_call_fastpath+0x12/0x17 kernel: [ 255.139619] ---[ end trace 5e6703e87d984f6b ]--- Signed-off-by: Mahesh Bandewar Reported-by: Toshiaki Makita Cc: Eric Dumazet Cc: Roopa Prabhu Cc: David S. Miller Acked-by: Eric Dumazet Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/linux/rtnetlink.h | 5 +++++ net/core/dev.c | 12 +++++++++--- net/core/rtnetlink.c | 25 +++++++++++++++++++++---- 3 files changed, 35 insertions(+), 7 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 3b0419072f88..5db76a32fcab 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -17,6 +17,11 @@ extern int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, long expires, u32 error); void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change, gfp_t flags); +struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, + unsigned change, gfp_t flags); +void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, + gfp_t flags); + /* RTNL is used as a global lock for all changes to network configuration */ extern void rtnl_lock(void); diff --git a/net/core/dev.c b/net/core/dev.c index 0814a560e5f3..dd3bf582e6f0 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5925,6 +5925,8 @@ static void rollback_registered_many(struct list_head *head) synchronize_net(); list_for_each_entry(dev, head, unreg_list) { + struct sk_buff *skb = NULL; + /* Shutdown queueing discipline. */ dev_shutdown(dev); @@ -5934,6 +5936,11 @@ static void rollback_registered_many(struct list_head *head) */ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); + if (!dev->rtnl_link_ops || + dev->rtnl_link_state == RTNL_LINK_INITIALIZED) + skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, + GFP_KERNEL); + /* * Flush the unicast and multicast chains */ @@ -5943,9 +5950,8 @@ static void rollback_registered_many(struct list_head *head) if (dev->netdev_ops->ndo_uninit) dev->netdev_ops->ndo_uninit(dev); - if (!dev->rtnl_link_ops || - dev->rtnl_link_state == RTNL_LINK_INITIALIZED) - rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL); + if (skb) + rtmsg_ifinfo_send(skb, dev, GFP_KERNEL); /* Notifier chain MUST detach us all upper devices. 
*/ WARN_ON(netdev_has_any_upper_dev(dev)); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 61cb7e7cc3c7..a9be2c161702 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2245,8 +2245,8 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } -void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, - gfp_t flags) +struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, + unsigned int change, gfp_t flags) { struct net *net = dev_net(dev); struct sk_buff *skb; @@ -2264,11 +2264,28 @@ void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, kfree_skb(skb); goto errout; } - rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, flags); - return; + return skb; errout: if (err < 0) rtnl_set_sk_err(net, RTNLGRP_LINK, err); + return NULL; +} + +void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags) +{ + struct net *net = dev_net(dev); + + rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, flags); +} + +void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, + gfp_t flags) +{ + struct sk_buff *skb; + + skb = rtmsg_ifinfo_build_skb(type, dev, change, flags); + if (skb) + rtmsg_ifinfo_send(skb, dev, flags); } EXPORT_SYMBOL(rtmsg_ifinfo); -- cgit v1.2.3 From e008f3f07ffba2be471ffd79bd1922af0042f936 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Mon, 8 Dec 2014 09:42:55 +0800 Subject: net: avoid to call skb_queue_len again the queue length of sd->input_pkt_queue has been put into qlen, and impossible to change, since hold the lock Signed-off-by: Li RongQing Acked-by: Eric Dumazet Cc: Sergei Shtylyov Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index dd3bf582e6f0..3f191da383f6 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3297,7 +3297,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, rps_lock(sd); qlen = skb_queue_len(&sd->input_pkt_queue); if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) { - if (skb_queue_len(&sd->input_pkt_queue)) { + if (qlen) { enqueue: __skb_queue_tail(&sd->input_pkt_queue, skb); input_queue_tail_incr_save(sd, qtail); -- cgit v1.2.3 From fd11a83dd3630ec6a60f8a702446532c5c7e1991 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Tue, 9 Dec 2014 19:40:49 -0800 Subject: net: Pull out core bits of __netdev_alloc_skb and add __napi_alloc_skb This change pulls the core functionality out of __netdev_alloc_skb and places them in a new function named __alloc_rx_skb. The reason for doing this is to make these bits accessible to a new function __napi_alloc_skb. In addition __alloc_rx_skb now has a new flags value that is used to determine which page frag pool to allocate from. If the SKB_ALLOC_NAPI flag is set then the NAPI pool is used. The advantage of this is that we do not have to use local_irq_save/restore when accessing the NAPI pool from NAPI context. In my test setup I saw at least 11ns of savings using the napi_alloc_skb function versus the netdev_alloc_skb function, most of this being due to the fact that we didn't have to call local_irq_save/restore. The main use case for napi_alloc_skb would be for things such as copybreak or page fragment based receive paths where an skb is allocated after the data has been received instead of before. Signed-off-by: Alexander Duyck Signed-off-by: David S. 
Miller --- include/linux/skbuff.h | 9 ++++++ net/core/dev.c | 2 +- net/core/skbuff.c | 74 +++++++++++++++++++++++++++++++++++++++++++++----- 3 files changed, 77 insertions(+), 8 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 736cc99f3f6c..85ab7d72b54c 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -151,6 +151,7 @@ struct net_device; struct scatterlist; struct pipe_inode_info; struct iov_iter; +struct napi_struct; #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) struct nf_conntrack { @@ -673,6 +674,7 @@ struct sk_buff { #define SKB_ALLOC_FCLONE 0x01 #define SKB_ALLOC_RX 0x02 +#define SKB_ALLOC_NAPI 0x04 /* Returns true if the skb was allocated from PFMEMALLOC reserves */ static inline bool skb_pfmemalloc(const struct sk_buff *skb) @@ -2165,6 +2167,13 @@ static inline struct sk_buff *netdev_alloc_skb_ip_align(struct net_device *dev, } void *napi_alloc_frag(unsigned int fragsz); +struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, + unsigned int length, gfp_t gfp_mask); +static inline struct sk_buff *napi_alloc_skb(struct napi_struct *napi, + unsigned int length) +{ + return __napi_alloc_skb(napi, length, GFP_ATOMIC); +} /** * __dev_alloc_pages - allocate page for network Rx diff --git a/net/core/dev.c b/net/core/dev.c index 3f191da383f6..80f798da3d9f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4172,7 +4172,7 @@ struct sk_buff *napi_get_frags(struct napi_struct *napi) struct sk_buff *skb = napi->skb; if (!skb) { - skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD); + skb = napi_alloc_skb(napi, GRO_MAX_HEAD); napi->skb = skb; } return skb; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 56ed17cd2151..ae13ef6b3ea7 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -444,10 +444,13 @@ void *napi_alloc_frag(unsigned int fragsz) EXPORT_SYMBOL(napi_alloc_frag); /** - * __netdev_alloc_skb - allocate an skbuff for rx on a specific device - * @dev: network device to receive on + * __alloc_rx_skb - allocate an skbuff for rx * @length: length to allocate * @gfp_mask: get_free_pages mask, passed to alloc_skb + * @flags: If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for + * allocations in case we have to fallback to __alloc_skb() + * If SKB_ALLOC_NAPI is set, page fragment will be allocated + * from napi_cache instead of netdev_cache. * * Allocate a new &sk_buff and assign it a usage count of one. The * buffer has unspecified headroom built in. Users should allocate @@ -456,11 +459,11 @@ EXPORT_SYMBOL(napi_alloc_frag); * * %NULL is returned if there is no free memory. */ -struct sk_buff *__netdev_alloc_skb(struct net_device *dev, - unsigned int length, gfp_t gfp_mask) +static struct sk_buff *__alloc_rx_skb(unsigned int length, gfp_t gfp_mask, + int flags) { struct sk_buff *skb = NULL; - unsigned int fragsz = SKB_DATA_ALIGN(length + NET_SKB_PAD) + + unsigned int fragsz = SKB_DATA_ALIGN(length) + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); if (fragsz <= PAGE_SIZE && !(gfp_mask & (__GFP_WAIT | GFP_DMA))) { @@ -469,7 +472,9 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, if (sk_memalloc_socks()) gfp_mask |= __GFP_MEMALLOC; - data = __netdev_alloc_frag(fragsz, gfp_mask); + data = (flags & SKB_ALLOC_NAPI) ? 
+ __napi_alloc_frag(fragsz, gfp_mask) : + __netdev_alloc_frag(fragsz, gfp_mask); if (likely(data)) { skb = build_skb(data, fragsz); @@ -477,17 +482,72 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, put_page(virt_to_head_page(data)); } } else { - skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, + skb = __alloc_skb(length, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE); } + return skb; +} + +/** + * __netdev_alloc_skb - allocate an skbuff for rx on a specific device + * @dev: network device to receive on + * @length: length to allocate + * @gfp_mask: get_free_pages mask, passed to alloc_skb + * + * Allocate a new &sk_buff and assign it a usage count of one. The + * buffer has NET_SKB_PAD headroom built in. Users should allocate + * the headroom they think they need without accounting for the + * built in space. The built in space is used for optimisations. + * + * %NULL is returned if there is no free memory. + */ +struct sk_buff *__netdev_alloc_skb(struct net_device *dev, + unsigned int length, gfp_t gfp_mask) +{ + struct sk_buff *skb; + + length += NET_SKB_PAD; + skb = __alloc_rx_skb(length, gfp_mask, 0); + if (likely(skb)) { skb_reserve(skb, NET_SKB_PAD); skb->dev = dev; } + return skb; } EXPORT_SYMBOL(__netdev_alloc_skb); +/** + * __napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance + * @napi: napi instance this buffer was allocated for + * @length: length to allocate + * @gfp_mask: get_free_pages mask, passed to alloc_skb and alloc_pages + * + * Allocate a new sk_buff for use in NAPI receive. This buffer will + * attempt to allocate the head from a special reserved region used + * only for NAPI Rx allocation. By doing this we can save several + * CPU cycles by avoiding having to disable and re-enable IRQs. + * + * %NULL is returned if there is no free memory. + */ +struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, + unsigned int length, gfp_t gfp_mask) +{ + struct sk_buff *skb; + + length += NET_SKB_PAD + NET_IP_ALIGN; + skb = __alloc_rx_skb(length, gfp_mask, SKB_ALLOC_NAPI); + + if (likely(skb)) { + skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN); + skb->dev = napi->dev; + } + + return skb; +} +EXPORT_SYMBOL(__napi_alloc_skb); + void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off, int size, unsigned int truesize) { -- cgit v1.2.3 From af6dabc9c70ae3f307685b1f32f52d60b1bf0527 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Fri, 19 Dec 2014 11:09:13 +0800 Subject: net: drop the packet when fails to do software segmentation or header check Commit cecda693a969816bac5e470e1d9c9c0ef5567bca ("net: keep original skb which only needs header checking during software GSO") keeps the original skb for packets that only needs header check, but it doesn't drop the packet if software segmentation or header check were failed. Fixes cecda693a9 ("net: keep original skb which only needs header checking during software GSO") Cc: Eric Dumazet Signed-off-by: Jason Wang Signed-off-by: David S. 
Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index f411c28d0a66..a989f8502412 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2673,7 +2673,7 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device segs = skb_gso_segment(skb, features); if (IS_ERR(segs)) { - segs = NULL; + goto out_kfree_skb; } else if (segs) { consume_skb(skb); skb = segs; -- cgit v1.2.3 From 726ce70e9e4050409243f3a1d735dc86bc6e6e57 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sun, 21 Dec 2014 07:16:21 +1100 Subject: net: Move napi polling code out of net_rx_action This patch creates a new function napi_poll and moves the napi polling code from net_rx_action into it. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/core/dev.c | 98 ++++++++++++++++++++++++++++++++-------------------------- 1 file changed, 54 insertions(+), 44 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index a989f8502412..493ae8ee569f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4557,6 +4557,59 @@ void netif_napi_del(struct napi_struct *napi) } EXPORT_SYMBOL(netif_napi_del); +static int napi_poll(struct napi_struct *n, struct list_head *repoll) +{ + void *have; + int work, weight; + + list_del_init(&n->poll_list); + + have = netpoll_poll_lock(n); + + weight = n->weight; + + /* This NAPI_STATE_SCHED test is for avoiding a race + * with netpoll's poll_napi(). Only the entity which + * obtains the lock and sees NAPI_STATE_SCHED set will + * actually make the ->poll() call. Therefore we avoid + * accidentally calling ->poll() when NAPI is not scheduled. + */ + work = 0; + if (test_bit(NAPI_STATE_SCHED, &n->state)) { + work = n->poll(n, weight); + trace_napi_poll(n); + } + + WARN_ON_ONCE(work > weight); + + if (likely(work < weight)) + goto out_unlock; + + /* Drivers must not modify the NAPI state if they + * consume the entire weight. In such cases this code + * still "owns" the NAPI instance and therefore can + * move the instance around on the list at-will. + */ + if (unlikely(napi_disable_pending(n))) { + napi_complete(n); + goto out_unlock; + } + + if (n->gro_list) { + /* flush too old packets + * If HZ < 1000, flush all packets. + */ + napi_gro_flush(n, HZ >= 1000); + } + + list_add_tail(&n->poll_list, repoll); + +out_unlock: + netpoll_poll_unlock(have); + + return work; +} + static void net_rx_action(struct softirq_action *h) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); @@ -4564,7 +4617,6 @@ static void net_rx_action(struct softirq_action *h) int budget = netdev_budget; LIST_HEAD(list); LIST_HEAD(repoll); - void *have; local_irq_disable(); list_splice_init(&sd->poll_list, &list); @@ -4572,7 +4624,6 @@ static void net_rx_action(struct softirq_action *h) while (!list_empty(&list)) { struct napi_struct *n; - int work, weight; /* If softirq window is exhausted then punt. * Allow this to run for 2 jiffies since which will allow @@ -4583,48 +4634,7 @@ static void net_rx_action(struct softirq_action *h) n = list_first_entry(&list, struct napi_struct, poll_list); - list_del_init(&n->poll_list); - - have = netpoll_poll_lock(n); - - weight = n->weight; - - /* This NAPI_STATE_SCHED test is for avoiding a race - * with netpoll's poll_napi(). Only the entity which - * obtains the lock and sees NAPI_STATE_SCHED set will - * actually make the ->poll() call. Therefore we avoid - * accidentally calling ->poll() when NAPI is not scheduled. 
- */ - work = 0; - if (test_bit(NAPI_STATE_SCHED, &n->state)) { - work = n->poll(n, weight); - trace_napi_poll(n); - } - - WARN_ON_ONCE(work > weight); - - budget -= work; - - /* Drivers must not modify the NAPI state if they - * consume the entire weight. In such cases this code - * still "owns" the NAPI instance and therefore can - * move the instance around on the list at-will. - */ - if (unlikely(work == weight)) { - if (unlikely(napi_disable_pending(n))) { - napi_complete(n); - } else { - if (n->gro_list) { - /* flush too old packets - * If HZ < 1000, flush all packets. - */ - napi_gro_flush(n, HZ >= 1000); - } - list_add_tail(&n->poll_list, &repoll); - } - } - - netpoll_poll_unlock(have); + budget -= napi_poll(n, &repoll); } if (!sd_has_rps_ipi_waiting(sd) && -- cgit v1.2.3 From 001ce546bb537bb5b7821f05633556a0c9787e32 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sun, 21 Dec 2014 07:16:22 +1100 Subject: net: Detect drivers that reschedule NAPI and exhaust budget The commit d75b1ade567ffab085e8adbbdacf0092d10cd09c (net: less interrupt masking in NAPI) required drivers to leave poll_list empty if the entire budget is consumed. We have already had two broken drivers so let's add a check for this. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/core/dev.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 493ae8ee569f..c0cf1293df06 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4602,6 +4602,15 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll) napi_gro_flush(n, HZ >= 1000); } + /* Some drivers may have called napi_schedule + * prior to exhausting their budget. + */ + if (unlikely(!list_empty(&n->poll_list))) { + pr_warn_once("%s: Budget exhausted after napi rescheduled\n", + n->dev ? n->dev->name : "backlog"); + goto out_unlock; + } + list_add_tail(&n->poll_list, repoll); out_unlock: -- cgit v1.2.3 From 6bd373ebbac4b13ecd39ddc37a0dc5ad4c5e4585 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sun, 21 Dec 2014 07:16:24 +1100 Subject: net: Always poll at least one device in net_rx_action We should only perform the softnet_break check after we have polled at least one device in net_rx_action. Otherwise a zero or negative setting of netdev_budget can lock up the whole system. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/core/dev.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index c0cf1293df06..b85eba9dfd88 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4634,16 +4634,15 @@ static void net_rx_action(struct softirq_action *h) while (!list_empty(&list)) { struct napi_struct *n; + n = list_first_entry(&list, struct napi_struct, poll_list); + budget -= napi_poll(n, &repoll); + /* If softirq window is exhausted then punt. * Allow this to run for 2 jiffies since which will allow * an average latency of 1.5/HZ. */ if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit))) goto softnet_break; - - - n = list_first_entry(&list, struct napi_struct, poll_list); - budget -= napi_poll(n, &repoll); } if (!sd_has_rps_ipi_waiting(sd) && -- cgit v1.2.3 From ceb8d5bf17d366534f32d2f60f41d905a5bc864b Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sun, 21 Dec 2014 07:16:25 +1100 Subject: net: Rearrange loop in net_rx_action This patch rearranges the loop in net_rx_action to reduce the amount of jumping back and forth when reading the code. 
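This run of NAPI patches tightens the contract between net_rx_action() and driver poll callbacks: a callback may process at most the budget it is given, may only call napi_complete() when it did strictly less work than that budget, and must not reschedule itself after consuming the whole budget (the warning added above now catches drivers that do). A minimal driver-side sketch of that contract, using hypothetical foo_* helpers that do not exist in the tree, could look like this:

static int foo_poll(struct napi_struct *napi, int budget)
{
        struct foo_rx_ring *ring = container_of(napi, struct foo_rx_ring, napi);
        int work_done = 0;

        /* Consume at most @budget packets from the RX ring. */
        while (work_done < budget && foo_rx_pending(ring)) {
                foo_process_one_rx(ring);       /* ends in napi_gro_receive() */
                work_done++;
        }

        if (work_done < budget) {
                /* Ring drained: leave polling mode and unmask the IRQ.
                 * This is the only case in which the poll callback may
                 * touch NAPI state.
                 */
                napi_complete(napi);
                foo_enable_rx_irq(ring);
        }

        /* When work_done == budget we return without calling
         * napi_complete() or napi_schedule(); napi_poll() keeps the
         * instance on the repoll list and net_rx_action() will invoke
         * us again.
         */
        return work_done;
}
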
Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/core/dev.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index b85eba9dfd88..c97ae6fec040 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4631,9 +4631,15 @@ static void net_rx_action(struct softirq_action *h) list_splice_init(&sd->poll_list, &list); local_irq_enable(); - while (!list_empty(&list)) { + for (;;) { struct napi_struct *n; + if (list_empty(&list)) { + if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll)) + return; + break; + } + n = list_first_entry(&list, struct napi_struct, poll_list); budget -= napi_poll(n, &repoll); @@ -4641,15 +4647,13 @@ static void net_rx_action(struct softirq_action *h) * Allow this to run for 2 jiffies since which will allow * an average latency of 1.5/HZ. */ - if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit))) - goto softnet_break; + if (unlikely(budget <= 0 || + time_after_eq(jiffies, time_limit))) { + sd->time_squeeze++; + break; + } } - if (!sd_has_rps_ipi_waiting(sd) && - list_empty(&list) && - list_empty(&repoll)) - return; -out: local_irq_disable(); list_splice_tail_init(&sd->poll_list, &list); @@ -4659,12 +4663,6 @@ out: __raise_softirq_irqoff(NET_RX_SOFTIRQ); net_rps_action_and_irq_enable(sd); - - return; - -softnet_break: - sd->time_squeeze++; - goto out; } struct netdev_adjacent { -- cgit v1.2.3 From d0edc7bf397a5e0f312bf8a1e87cfee0019dc07b Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Tue, 23 Dec 2014 16:20:11 -0800 Subject: mpls: Fix config check for mpls. Fixes MPLS GSO for case when mpls is compiled as kernel module. Fixes: 0d89d2035f ("MPLS: Add limited GSO support"). Signed-off-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index c97ae6fec040..67b6210a589a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2522,7 +2522,7 @@ static int illegal_highdma(struct net_device *dev, struct sk_buff *skb) /* If MPLS offload request, verify we are testing hardware MPLS features * instead of standard features for the netdev. */ -#ifdef CONFIG_NET_MPLS_GSO +#if IS_ENABLED(CONFIG_NET_MPLS_GSO) static netdev_features_t net_mpls_features(struct sk_buff *skb, netdev_features_t features, __be16 type) -- cgit v1.2.3 From 796f2da81bead71ffc91ef70912cd8d1827bf756 Mon Sep 17 00:00:00 2001 From: Toshiaki Makita Date: Mon, 22 Dec 2014 19:04:14 +0900 Subject: net: Fix stacked vlan offload features computation When vlan tags are stacked, it is very likely that the outer tag is stored in skb->vlan_tci and skb->protocol shows the inner tag's vlan_proto. Currently netif_skb_features() first looks at skb->protocol even if there is the outer tag in vlan_tci, thus it incorrectly retrieves the protocol encapsulated by the inner vlan instead of the inner vlan protocol. This allows GSO packets to be passed to HW and they end up being corrupted. Fixes: 58e998c6d239 ("offloading: Force software GSO for multiple vlan tags.") Signed-off-by: Toshiaki Makita Signed-off-by: David S. 
Miller --- net/core/dev.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 67b6210a589a..bd44e28c735e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2570,11 +2570,14 @@ netdev_features_t netif_skb_features(struct sk_buff *skb) if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs) features &= ~NETIF_F_GSO_MASK; - if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) { - struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; - protocol = veh->h_vlan_encapsulated_proto; - } else if (!vlan_tx_tag_present(skb)) { - return harmonize_features(skb, features); + if (!vlan_tx_tag_present(skb)) { + if (unlikely(protocol == htons(ETH_P_8021Q) || + protocol == htons(ETH_P_8021AD))) { + struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; + protocol = veh->h_vlan_encapsulated_proto; + } else { + return harmonize_features(skb, features); + } } features = netdev_intersect_features(features, -- cgit v1.2.3 From 2c26d34bbcc0b3f30385d5587aa232289e2eed8e Mon Sep 17 00:00:00 2001 From: Jay Vosburgh Date: Fri, 19 Dec 2014 15:32:00 -0800 Subject: net/core: Handle csum for CHECKSUM_COMPLETE VXLAN forwarding When using VXLAN tunnels and a sky2 device, I have experienced checksum failures of the following type: [ 4297.761899] eth0: hw csum failure [...] [ 4297.765223] Call Trace: [ 4297.765224] [] dump_stack+0x46/0x58 [ 4297.765235] [] netdev_rx_csum_fault+0x42/0x50 [ 4297.765238] [] ? skb_push+0x40/0x40 [ 4297.765240] [] __skb_checksum_complete+0xbc/0xd0 [ 4297.765243] [] tcp_v4_rcv+0x2e2/0x950 [ 4297.765246] [] ? ip_rcv_finish+0x360/0x360 These are reliably reproduced in a network topology of: container:eth0 == host(OVS VXLAN on VLAN) == bond0 == eth0 (sky2) -> switch When VXLAN encapsulated traffic is received from a similarly configured peer, the above warning is generated in the receive processing of the encapsulated packet. Note that the warning is associated with the container eth0. The skbs from sky2 have ip_summed set to CHECKSUM_COMPLETE, and because the packet is an encapsulated Ethernet frame, the checksum generated by the hardware includes the inner protocol and Ethernet headers. The receive code is careful to update the skb->csum, except in __dev_forward_skb, as called by dev_forward_skb. __dev_forward_skb calls eth_type_trans, which in turn calls skb_pull_inline(skb, ETH_HLEN) to skip over the Ethernet header, but does not update skb->csum when doing so. This patch resolves the problem by adding a call to skb_postpull_rcsum to update the skb->csum after the call to eth_type_trans. Signed-off-by: Jay Vosburgh Signed-off-by: David S. Miller --- net/core/dev.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index bd44e28c735e..0094562b732a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1694,6 +1694,7 @@ int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb) skb_scrub_packet(skb, true); skb->protocol = eth_type_trans(skb, dev); + skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); return 0; } -- cgit v1.2.3 From 5f35227ea34bb616c436d9da47fc325866c428f3 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Tue, 23 Dec 2014 22:37:26 -0800 Subject: net: Generalize ndo_gso_check to ndo_features_check GSO isn't the only offload feature with restrictions that potentially can't be expressed with the current features mechanism. 
Checksum is another although it's a general issue that could in theory apply to anything. Even if it may be possible to implement these restrictions in other ways, it can result in duplicate code or inefficient per-packet behavior. This generalizes ndo_gso_check so that drivers can remove any features that don't make sense for a given packet, similar to netif_skb_features(). It also converts existing driver restrictions to the new format, completing the work that was done to support tunnel protocols since the issues apply to checksums as well. By actually removing features from the set that are used to do offloading, it solves another problem with the existing interface. In these cases, GSO would run with the original set of features and not do anything because it appears that segmentation is not required. CC: Tom Herbert CC: Joe Stringer CC: Eric Dumazet CC: Hayes Wang Signed-off-by: Jesse Gross Acked-by: Tom Herbert Fixes: 04ffcb255f22 ("net: Add ndo_gso_check") Tested-by: Hayes Wang Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c | 8 ++++--- drivers/net/ethernet/emulex/benet/be_main.c | 8 ++++--- drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 10 +++++---- drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c | 8 ++++--- include/linux/netdevice.h | 20 +++++++++-------- include/net/vxlan.h | 28 ++++++++++++++++++++---- net/core/dev.c | 23 +++++++++++-------- 7 files changed, 70 insertions(+), 35 deletions(-) (limited to 'net/core/dev.c') diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c index 9f5e38769a29..72eef9fc883e 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c @@ -12553,9 +12553,11 @@ static int bnx2x_get_phys_port_id(struct net_device *netdev, return 0; } -static bool bnx2x_gso_check(struct sk_buff *skb, struct net_device *dev) +static netdev_features_t bnx2x_features_check(struct sk_buff *skb, + struct net_device *dev, + netdev_features_t features) { - return vxlan_gso_check(skb); + return vxlan_features_check(skb, features); } static const struct net_device_ops bnx2x_netdev_ops = { @@ -12589,7 +12591,7 @@ static const struct net_device_ops bnx2x_netdev_ops = { #endif .ndo_get_phys_port_id = bnx2x_get_phys_port_id, .ndo_set_vf_link_state = bnx2x_set_vf_link_state, - .ndo_gso_check = bnx2x_gso_check, + .ndo_features_check = bnx2x_features_check, }; static int bnx2x_set_coherency_mask(struct bnx2x *bp) diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index 196073110e32..41a0a5498da7 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -4459,9 +4459,11 @@ done: adapter->vxlan_port_count--; } -static bool be_gso_check(struct sk_buff *skb, struct net_device *dev) +static netdev_features_t be_features_check(struct sk_buff *skb, + struct net_device *dev, + netdev_features_t features) { - return vxlan_gso_check(skb); + return vxlan_features_check(skb, features); } #endif @@ -4492,7 +4494,7 @@ static const struct net_device_ops be_netdev_ops = { #ifdef CONFIG_BE2NET_VXLAN .ndo_add_vxlan_port = be_add_vxlan_port, .ndo_del_vxlan_port = be_del_vxlan_port, - .ndo_gso_check = be_gso_check, + .ndo_features_check = be_features_check, #endif }; diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index 190cbd931f6b..d0d6dc1b8e46 100644 --- 
a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -2365,9 +2365,11 @@ static void mlx4_en_del_vxlan_port(struct net_device *dev, queue_work(priv->mdev->workqueue, &priv->vxlan_del_task); } -static bool mlx4_en_gso_check(struct sk_buff *skb, struct net_device *dev) +static netdev_features_t mlx4_en_features_check(struct sk_buff *skb, + struct net_device *dev, + netdev_features_t features) { - return vxlan_gso_check(skb); + return vxlan_features_check(skb, features); } #endif @@ -2400,7 +2402,7 @@ static const struct net_device_ops mlx4_netdev_ops = { #ifdef CONFIG_MLX4_EN_VXLAN .ndo_add_vxlan_port = mlx4_en_add_vxlan_port, .ndo_del_vxlan_port = mlx4_en_del_vxlan_port, - .ndo_gso_check = mlx4_en_gso_check, + .ndo_features_check = mlx4_en_features_check, #endif }; @@ -2434,7 +2436,7 @@ static const struct net_device_ops mlx4_netdev_ops_master = { #ifdef CONFIG_MLX4_EN_VXLAN .ndo_add_vxlan_port = mlx4_en_add_vxlan_port, .ndo_del_vxlan_port = mlx4_en_del_vxlan_port, - .ndo_gso_check = mlx4_en_gso_check, + .ndo_features_check = mlx4_en_features_check, #endif }; diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c index 1aa25b13ace1..9929b97cfb36 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c @@ -505,9 +505,11 @@ static void qlcnic_del_vxlan_port(struct net_device *netdev, adapter->flags |= QLCNIC_DEL_VXLAN_PORT; } -static bool qlcnic_gso_check(struct sk_buff *skb, struct net_device *dev) +static netdev_features_t qlcnic_features_check(struct sk_buff *skb, + struct net_device *dev, + netdev_features_t features) { - return vxlan_gso_check(skb); + return vxlan_features_check(skb, features); } #endif @@ -532,7 +534,7 @@ static const struct net_device_ops qlcnic_netdev_ops = { #ifdef CONFIG_QLCNIC_VXLAN .ndo_add_vxlan_port = qlcnic_add_vxlan_port, .ndo_del_vxlan_port = qlcnic_del_vxlan_port, - .ndo_gso_check = qlcnic_gso_check, + .ndo_features_check = qlcnic_features_check, #endif #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = qlcnic_poll_controller, diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c31f74d76ebd..679e6e90aa4c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1012,12 +1012,15 @@ typedef u16 (*select_queue_fallback_t)(struct net_device *dev, * Callback to use for xmit over the accelerated station. This * is used in place of ndo_start_xmit on accelerated net * devices. - * bool (*ndo_gso_check) (struct sk_buff *skb, - * struct net_device *dev); + * netdev_features_t (*ndo_features_check) (struct sk_buff *skb, + * struct net_device *dev + * netdev_features_t features); * Called by core transmit path to determine if device is capable of - * performing GSO on a packet. The device returns true if it is - * able to GSO the packet, false otherwise. If the return value is - * false the stack will do software GSO. + * performing offload operations on a given packet. This is to give + * the device an opportunity to implement any restrictions that cannot + * be otherwise expressed by feature flags. The check is called with + * the set of features that the stack has calculated and it returns + * those the driver believes to be appropriate. 
* * int (*ndo_switch_parent_id_get)(struct net_device *dev, * struct netdev_phys_item_id *psid); @@ -1178,8 +1181,9 @@ struct net_device_ops { struct net_device *dev, void *priv); int (*ndo_get_lock_subclass)(struct net_device *dev); - bool (*ndo_gso_check) (struct sk_buff *skb, - struct net_device *dev); + netdev_features_t (*ndo_features_check) (struct sk_buff *skb, + struct net_device *dev, + netdev_features_t features); #ifdef CONFIG_NET_SWITCHDEV int (*ndo_switch_parent_id_get)(struct net_device *dev, struct netdev_phys_item_id *psid); @@ -3611,8 +3615,6 @@ static inline bool netif_needs_gso(struct net_device *dev, struct sk_buff *skb, netdev_features_t features) { return skb_is_gso(skb) && (!skb_gso_ok(skb, features) || - (dev->netdev_ops->ndo_gso_check && - !dev->netdev_ops->ndo_gso_check(skb, dev)) || unlikely((skb->ip_summed != CHECKSUM_PARTIAL) && (skb->ip_summed != CHECKSUM_UNNECESSARY))); } diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 57cccd0052e5..903461aa5644 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -1,6 +1,9 @@ #ifndef __NET_VXLAN_H #define __NET_VXLAN_H 1 +#include +#include +#include #include #include #include @@ -51,16 +54,33 @@ int vxlan_xmit_skb(struct vxlan_sock *vs, __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port, __be32 vni, bool xnet); -static inline bool vxlan_gso_check(struct sk_buff *skb) +static inline netdev_features_t vxlan_features_check(struct sk_buff *skb, + netdev_features_t features) { - if ((skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL) && + u8 l4_hdr = 0; + + if (!skb->encapsulation) + return features; + + switch (vlan_get_protocol(skb)) { + case htons(ETH_P_IP): + l4_hdr = ip_hdr(skb)->protocol; + break; + case htons(ETH_P_IPV6): + l4_hdr = ipv6_hdr(skb)->nexthdr; + break; + default: + return features;; + } + + if ((l4_hdr == IPPROTO_UDP) && (skb->inner_protocol_type != ENCAP_TYPE_ETHER || skb->inner_protocol != htons(ETH_P_TEB) || (skb_inner_mac_header(skb) - skb_transport_header(skb) != sizeof(struct udphdr) + sizeof(struct vxlanhdr)))) - return false; + return features & ~(NETIF_F_ALL_CSUM | NETIF_F_GSO_MASK); - return true; + return features; } /* IP header + UDP + VXLAN + Ethernet header */ diff --git a/net/core/dev.c b/net/core/dev.c index 0094562b732a..683d493aa1bf 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2563,7 +2563,7 @@ static netdev_features_t harmonize_features(struct sk_buff *skb, netdev_features_t netif_skb_features(struct sk_buff *skb) { - const struct net_device *dev = skb->dev; + struct net_device *dev = skb->dev; netdev_features_t features = dev->features; u16 gso_segs = skb_shinfo(skb)->gso_segs; __be16 protocol = skb->protocol; @@ -2571,13 +2571,20 @@ netdev_features_t netif_skb_features(struct sk_buff *skb) if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs) features &= ~NETIF_F_GSO_MASK; + /* If encapsulation offload request, verify we are testing + * hardware encapsulation features instead of standard + * features for the netdev + */ + if (skb->encapsulation) + features &= dev->hw_enc_features; + if (!vlan_tx_tag_present(skb)) { if (unlikely(protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD))) { struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; protocol = veh->h_vlan_encapsulated_proto; } else { - return harmonize_features(skb, features); + goto finalize; } } @@ -2595,6 +2602,11 @@ netdev_features_t netif_skb_features(struct sk_buff *skb) NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX); +finalize: + 
if (dev->netdev_ops->ndo_features_check) + features &= dev->netdev_ops->ndo_features_check(skb, dev, + features); + return harmonize_features(skb, features); } EXPORT_SYMBOL(netif_skb_features); @@ -2665,13 +2677,6 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device if (unlikely(!skb)) goto out_null; - /* If encapsulation offload request, verify we are testing - * hardware encapsulation features instead of standard - * features for the netdev - */ - if (skb->encapsulation) - features &= dev->hw_enc_features; - if (netif_needs_gso(dev, skb, features)) { struct sk_buff *segs; -- cgit v1.2.3
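For drivers without VXLAN offload, the same hook can encode whatever per-packet limits the silicon has. A hedged sketch under invented constraints — a hypothetical NIC (all foo_* names are made up) that can only checksum and segment plain, non-encapsulated TCP/IPv4 — might look like this:

static netdev_features_t foo_features_check(struct sk_buff *skb,
                                             struct net_device *dev,
                                             netdev_features_t features)
{
        /* Anything the invented hardware cannot handle falls back to
         * software checksumming and GSO by clearing the feature bits,
         * mirroring what vxlan_features_check() does above.
         */
        if (skb->encapsulation ||
            vlan_get_protocol(skb) != htons(ETH_P_IP) ||
            ip_hdr(skb)->protocol != IPPROTO_TCP)
                features &= ~(NETIF_F_ALL_CSUM | NETIF_F_GSO_MASK);

        return features;
}

static const struct net_device_ops foo_netdev_ops = {
        /* ... other callbacks ... */
        .ndo_features_check     = foo_features_check,
};

Returning a reduced feature set, rather than the boolean that ndo_gso_check returned, is what lets netif_skb_features() and validate_xmit_skb() actually fall back to software GSO and checksumming for such packets, which is the behavioural gap the commit message calls out.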