From 5293efe62df81908f2e90c9820c7edcc8e61f5e9 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 18 Aug 2016 01:00:39 +0200 Subject: bpf: add bpf_skb_change_tail helper This work adds a bpf_skb_change_tail() helper for tc BPF programs. The basic idea is to expand or shrink the skb in a controlled manner. The eBPF program can then rewrite the rest via helpers like bpf_skb_store_bytes(), bpf_lX_csum_replace() and others rather than passing a raw buffer for writing here. bpf_skb_change_tail() is really a slow path helper and intended for replies with f.e. ICMP control messages. Concept is similar to other helpers like bpf_skb_change_proto() helper to keep the helper without protocol specifics and let the BPF program mangle the remaining parts. A flags field has been added and is reserved for now should we extend the helper in future. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/skbuff.h | 43 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) (limited to 'include/linux/skbuff.h') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 0f665cb26b50..7047448e8129 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2295,7 +2295,7 @@ static inline int pskb_network_may_pull(struct sk_buff *skb, unsigned int len) int ___pskb_trim(struct sk_buff *skb, unsigned int len); -static inline void __skb_trim(struct sk_buff *skb, unsigned int len) +static inline void __skb_set_length(struct sk_buff *skb, unsigned int len) { if (unlikely(skb_is_nonlinear(skb))) { WARN_ON(1); @@ -2305,6 +2305,11 @@ static inline void __skb_trim(struct sk_buff *skb, unsigned int len) skb_set_tail_pointer(skb, len); } +static inline void __skb_trim(struct sk_buff *skb, unsigned int len) +{ + __skb_set_length(skb, len); +} + void skb_trim(struct sk_buff *skb, unsigned int len); static inline int __pskb_trim(struct sk_buff *skb, unsigned int len) @@ -2335,6 +2340,20 @@ static inline void pskb_trim_unique(struct sk_buff *skb, unsigned int len) BUG_ON(err); } +static inline int __skb_grow(struct sk_buff *skb, unsigned int len) +{ + unsigned int diff = len - skb->len; + + if (skb_tailroom(skb) < diff) { + int ret = pskb_expand_head(skb, 0, diff - skb_tailroom(skb), + GFP_ATOMIC); + if (ret) + return ret; + } + __skb_set_length(skb, len); + return 0; +} + /** * skb_orphan - orphan a buffer * @skb: buffer to orphan @@ -2938,6 +2957,21 @@ static inline int pskb_trim_rcsum(struct sk_buff *skb, unsigned int len) return __pskb_trim(skb, len); } +static inline int __skb_trim_rcsum(struct sk_buff *skb, unsigned int len) +{ + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->ip_summed = CHECKSUM_NONE; + __skb_trim(skb, len); + return 0; +} + +static inline int __skb_grow_rcsum(struct sk_buff *skb, unsigned int len) +{ + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->ip_summed = CHECKSUM_NONE; + return __skb_grow(skb, len); +} + #define skb_queue_walk(queue, skb) \ for (skb = (queue)->next; \ skb != (struct sk_buff *)(queue); \ @@ -3726,6 +3760,13 @@ static inline bool skb_is_gso_v6(const struct sk_buff *skb) return skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6; } +static inline void skb_gso_reset(struct sk_buff *skb) +{ + skb_shinfo(skb)->gso_size = 0; + skb_shinfo(skb)->gso_segs = 0; + skb_shinfo(skb)->gso_type = 0; +} + void __skb_warn_lro_forwarding(const struct sk_buff *skb); static inline bool skb_warn_if_lro(const struct sk_buff *skb) -- cgit v1.2.3 From 6bc506b4fb065eac3d89ca1ce37082e174493d9e Mon Sep 17 
00:00:00 2001 From: Ido Schimmel Date: Thu, 25 Aug 2016 18:42:37 +0200 Subject: bridge: switchdev: Add forward mark support for stacked devices switchdev_port_fwd_mark_set() is used to set the 'offload_fwd_mark' of port netdevs so that packets being flooded by the device won't be flooded twice. It works by assigning a unique identifier (the ifindex of the first bridge port) to bridge ports sharing the same parent ID. This prevents packets from being flooded twice by the same switch, but will flood packets through bridge ports belonging to a different switch. This method is problematic when stacked devices are taken into account, such as VLANs. In such cases, a physical port netdev can have upper devices being members in two different bridges, thus requiring two different 'offload_fwd_mark's to be configured on the port netdev, which is impossible. The main problem is that packet and netdev marking is performed at the physical netdev level, whereas flooding occurs between bridge ports, which are not necessarily port netdevs. Instead, packet and netdev marking should really be done in the bridge driver with the switch driver only telling it which packets it already forwarded. The bridge driver will mark such packets using the mark assigned to the ingress bridge port and will prevent the packet from being forwarded through any bridge port sharing the same mark (i.e. having the same parent ID). Remove the current switchdev 'offload_fwd_mark' implementation and instead implement the proposed method. In addition, make rocker - the sole user of the mark - use the proposed method. Signed-off-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- Documentation/networking/switchdev.txt | 13 ++--- drivers/net/ethernet/rocker/rocker_main.c | 2 +- drivers/net/ethernet/rocker/rocker_ofdpa.c | 4 -- include/linux/netdevice.h | 5 -- include/linux/skbuff.h | 13 ++--- include/net/switchdev.h | 6 --- net/bridge/Makefile | 2 + net/bridge/br_forward.c | 3 +- net/bridge/br_if.c | 10 ++-- net/bridge/br_input.c | 2 + net/bridge/br_private.h | 37 +++++++++++++ net/bridge/br_switchdev.c | 57 ++++++++++++++++++++ net/core/dev.c | 10 ---- net/switchdev/switchdev.c | 85 ------------------------------ 14 files changed, 117 insertions(+), 132 deletions(-) create mode 100644 net/bridge/br_switchdev.c (limited to 'include/linux/skbuff.h') diff --git a/Documentation/networking/switchdev.txt b/Documentation/networking/switchdev.txt index 31c39115834d..44235e83799b 100644 --- a/Documentation/networking/switchdev.txt +++ b/Documentation/networking/switchdev.txt @@ -283,15 +283,10 @@ be sent to the port netdev for processing by the bridge driver. The bridge should not reflood the packet to the same ports the device flooded, otherwise there will be duplicate packets on the wire. -To avoid duplicate packets, the device/driver should mark a packet as already -forwarded using skb->offload_fwd_mark. The same mark is set on the device -ports in the domain using dev->offload_fwd_mark. If the skb->offload_fwd_mark -is non-zero and matches the forwarding egress port's dev->skb_mark, the kernel -will drop the skb right before transmit on the egress port, with the -understanding that the device already forwarded the packet on same egress port. -The driver can use switchdev_port_fwd_mark_set() to set a globally unique mark -for port's dev->offload_fwd_mark, based on the port's parent ID (switch ID) and -a group ifindex. 
+To avoid duplicate packets, the switch driver should mark a packet as already +forwarded by setting the skb->offload_fwd_mark bit. The bridge driver will mark +the skb using the ingress bridge port's mark and prevent it from being forwarded +through any bridge port with the same mark. It is possible for the switch device to not handle flooding and push the packets up to the bridge driver for flooding. This is not ideal as the number diff --git a/drivers/net/ethernet/rocker/rocker_main.c b/drivers/net/ethernet/rocker/rocker_main.c index f0b09b05ed3f..1f0c08602eba 100644 --- a/drivers/net/ethernet/rocker/rocker_main.c +++ b/drivers/net/ethernet/rocker/rocker_main.c @@ -2412,7 +2412,7 @@ static int rocker_port_rx_proc(const struct rocker *rocker, skb->protocol = eth_type_trans(skb, rocker_port->dev); if (rx_flags & ROCKER_RX_FLAGS_FWD_OFFLOAD) - skb->offload_fwd_mark = rocker_port->dev->offload_fwd_mark; + skb->offload_fwd_mark = 1; rocker_port->dev->stats.rx_packets++; rocker_port->dev->stats.rx_bytes += skb->len; diff --git a/drivers/net/ethernet/rocker/rocker_ofdpa.c b/drivers/net/ethernet/rocker/rocker_ofdpa.c index 1ca796316173..fcad907baecf 100644 --- a/drivers/net/ethernet/rocker/rocker_ofdpa.c +++ b/drivers/net/ethernet/rocker/rocker_ofdpa.c @@ -2558,7 +2558,6 @@ static int ofdpa_port_init(struct rocker_port *rocker_port) struct ofdpa_port *ofdpa_port = rocker_port->wpriv; int err; - switchdev_port_fwd_mark_set(ofdpa_port->dev, NULL, false); rocker_port_set_learning(rocker_port, !!(ofdpa_port->brport_flags & BR_LEARNING)); @@ -2817,7 +2816,6 @@ static int ofdpa_port_bridge_join(struct ofdpa_port *ofdpa_port, ofdpa_port_internal_vlan_id_get(ofdpa_port, bridge->ifindex); ofdpa_port->bridge_dev = bridge; - switchdev_port_fwd_mark_set(ofdpa_port->dev, bridge, true); return ofdpa_port_vlan_add(ofdpa_port, NULL, OFDPA_UNTAGGED_VID, 0); } @@ -2836,8 +2834,6 @@ static int ofdpa_port_bridge_leave(struct ofdpa_port *ofdpa_port) ofdpa_port_internal_vlan_id_get(ofdpa_port, ofdpa_port->dev->ifindex); - switchdev_port_fwd_mark_set(ofdpa_port->dev, ofdpa_port->bridge_dev, - false); ofdpa_port->bridge_dev = NULL; err = ofdpa_port_vlan_add(ofdpa_port, NULL, OFDPA_UNTAGGED_VID, 0); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 794bb0733799..d122be9345c7 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1562,8 +1562,6 @@ enum netdev_priv_flags { * * @xps_maps: XXX: need comments on this one * - * @offload_fwd_mark: Offload device fwding mark - * * @watchdog_timeo: Represents the timeout that is used by * the watchdog (see dev_watchdog()) * @watchdog_timer: List of timers @@ -1814,9 +1812,6 @@ struct net_device { #ifdef CONFIG_NET_CLS_ACT struct tcf_proto __rcu *egress_cl_list; #endif -#ifdef CONFIG_NET_SWITCHDEV - u32 offload_fwd_mark; -#endif /* These may be needed for future network-power-down code. 
*/ struct timer_list watchdog_timer; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 7047448e8129..cfb7219be665 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -612,7 +612,6 @@ static inline bool skb_mstamp_after(const struct skb_mstamp *t1, * @no_fcs: Request NIC to treat last 4 bytes as Ethernet FCS * @napi_id: id of the NAPI struct this skb came from * @secmark: security marking - * @offload_fwd_mark: fwding offload mark * @mark: Generic packet mark * @vlan_proto: vlan encapsulation protocol * @vlan_tci: vlan tag control information @@ -730,7 +729,10 @@ struct sk_buff { __u8 ipvs_property:1; __u8 inner_protocol_type:1; __u8 remcsum_offload:1; - /* 3 or 5 bit hole */ +#ifdef CONFIG_NET_SWITCHDEV + __u8 offload_fwd_mark:1; +#endif + /* 2, 4 or 5 bit hole */ #ifdef CONFIG_NET_SCHED __u16 tc_index; /* traffic control index */ @@ -757,14 +759,9 @@ struct sk_buff { unsigned int sender_cpu; }; #endif - union { #ifdef CONFIG_NETWORK_SECMARK - __u32 secmark; + __u32 secmark; #endif -#ifdef CONFIG_NET_SWITCHDEV - __u32 offload_fwd_mark; -#endif - }; union { __u32 mark; diff --git a/include/net/switchdev.h b/include/net/switchdev.h index 62f6a967a1b7..82f5e0462021 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -347,12 +347,6 @@ static inline int switchdev_port_fdb_dump(struct sk_buff *skb, return idx; } -static inline void switchdev_port_fwd_mark_set(struct net_device *dev, - struct net_device *group_dev, - bool joining) -{ -} - static inline bool switchdev_port_same_parent_id(struct net_device *a, struct net_device *b) { diff --git a/net/bridge/Makefile b/net/bridge/Makefile index a1cda5d4718d..0aefc011b668 100644 --- a/net/bridge/Makefile +++ b/net/bridge/Makefile @@ -20,4 +20,6 @@ bridge-$(CONFIG_BRIDGE_IGMP_SNOOPING) += br_multicast.o br_mdb.o bridge-$(CONFIG_BRIDGE_VLAN_FILTERING) += br_vlan.o +bridge-$(CONFIG_NET_SWITCHDEV) += br_switchdev.o + obj-$(CONFIG_NETFILTER) += netfilter/ diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 63a83d8d7da3..32a02de39cd2 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -29,7 +29,8 @@ static inline int should_deliver(const struct net_bridge_port *p, vg = nbp_vlan_group_rcu(p); return ((p->flags & BR_HAIRPIN_MODE) || skb->dev != p->dev) && - br_allowed_egress(vg, skb) && p->state == BR_STATE_FORWARDING; + br_allowed_egress(vg, skb) && p->state == BR_STATE_FORWARDING && + nbp_switchdev_allowed_egress(p, skb); } int br_dev_queue_push_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index f2fede05d32c..1da3221845f1 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -545,6 +545,10 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) if (err) goto err5; + err = nbp_switchdev_mark_set(p); + if (err) + goto err6; + dev_disable_lro(dev); list_add_rcu(&p->list, &br->port_list); @@ -566,7 +570,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) err = nbp_vlan_init(p); if (err) { netdev_err(dev, "failed to initialize vlan filtering on this port\n"); - goto err6; + goto err7; } spin_lock_bh(&br->lock); @@ -589,12 +593,12 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) return 0; -err6: +err7: list_del_rcu(&p->list); br_fdb_delete_by_port(br, p, 0, 1); nbp_update_port_count(br); +err6: netdev_upper_dev_unlink(dev, br->dev); - err5: dev->priv_flags &= ~IFF_BRIDGE_PORT; netdev_rx_handler_unregister(dev); diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c 
index 8e486203d133..3132cfc80e9d 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -145,6 +145,8 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb if (!br_allowed_ingress(p->br, nbp_vlan_group_rcu(p), skb, &vid)) goto out; + nbp_switchdev_frame_mark(p, skb); + /* insert into forwarding database after filtering to avoid spoofing */ br = p->br; if (p->flags & BR_LEARNING) diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index aac2a6e6b008..2379b2b865c9 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -251,6 +251,9 @@ struct net_bridge_port #ifdef CONFIG_BRIDGE_VLAN_FILTERING struct net_bridge_vlan_group __rcu *vlgrp; #endif +#ifdef CONFIG_NET_SWITCHDEV + int offload_fwd_mark; +#endif }; #define br_auto_port(p) ((p)->flags & BR_AUTO_MASK) @@ -359,6 +362,11 @@ struct net_bridge struct timer_list gc_timer; struct kobject *ifobj; u32 auto_cnt; + +#ifdef CONFIG_NET_SWITCHDEV + int offload_fwd_mark; +#endif + #ifdef CONFIG_BRIDGE_VLAN_FILTERING struct net_bridge_vlan_group __rcu *vlgrp; u8 vlan_enabled; @@ -381,6 +389,10 @@ struct br_input_skb_cb { #ifdef CONFIG_BRIDGE_VLAN_FILTERING bool vlan_filtered; #endif + +#ifdef CONFIG_NET_SWITCHDEV + int offload_fwd_mark; +#endif }; #define BR_INPUT_SKB_CB(__skb) ((struct br_input_skb_cb *)(__skb)->cb) @@ -1034,4 +1046,29 @@ static inline int br_sysfs_addbr(struct net_device *dev) { return 0; } static inline void br_sysfs_delbr(struct net_device *dev) { return; } #endif /* CONFIG_SYSFS */ +/* br_switchdev.c */ +#ifdef CONFIG_NET_SWITCHDEV +int nbp_switchdev_mark_set(struct net_bridge_port *p); +void nbp_switchdev_frame_mark(const struct net_bridge_port *p, + struct sk_buff *skb); +bool nbp_switchdev_allowed_egress(const struct net_bridge_port *p, + const struct sk_buff *skb); +#else +static inline int nbp_switchdev_mark_set(struct net_bridge_port *p) +{ + return 0; +} + +static inline void nbp_switchdev_frame_mark(const struct net_bridge_port *p, + struct sk_buff *skb) +{ +} + +static inline bool nbp_switchdev_allowed_egress(const struct net_bridge_port *p, + const struct sk_buff *skb) +{ + return true; +} +#endif /* CONFIG_NET_SWITCHDEV */ + #endif diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c new file mode 100644 index 000000000000..f4097b900de1 --- /dev/null +++ b/net/bridge/br_switchdev.c @@ -0,0 +1,57 @@ +#include +#include +#include +#include +#include +#include + +#include "br_private.h" + +static int br_switchdev_mark_get(struct net_bridge *br, struct net_device *dev) +{ + struct net_bridge_port *p; + + /* dev is yet to be added to the port list. 
*/ + list_for_each_entry(p, &br->port_list, list) { + if (switchdev_port_same_parent_id(dev, p->dev)) + return p->offload_fwd_mark; + } + + return ++br->offload_fwd_mark; +} + +int nbp_switchdev_mark_set(struct net_bridge_port *p) +{ + struct switchdev_attr attr = { + .orig_dev = p->dev, + .id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID, + }; + int err; + + ASSERT_RTNL(); + + err = switchdev_port_attr_get(p->dev, &attr); + if (err) { + if (err == -EOPNOTSUPP) + return 0; + return err; + } + + p->offload_fwd_mark = br_switchdev_mark_get(p->br, p->dev); + + return 0; +} + +void nbp_switchdev_frame_mark(const struct net_bridge_port *p, + struct sk_buff *skb) +{ + if (skb->offload_fwd_mark && !WARN_ON_ONCE(!p->offload_fwd_mark)) + BR_INPUT_SKB_CB(skb)->offload_fwd_mark = p->offload_fwd_mark; +} + +bool nbp_switchdev_allowed_egress(const struct net_bridge_port *p, + const struct sk_buff *skb) +{ + return !skb->offload_fwd_mark || + BR_INPUT_SKB_CB(skb)->offload_fwd_mark != p->offload_fwd_mark; +} diff --git a/net/core/dev.c b/net/core/dev.c index 7feae74ca928..1d5c6dda1988 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3355,16 +3355,6 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) else skb_dst_force(skb); -#ifdef CONFIG_NET_SWITCHDEV - /* Don't forward if offload device already forwarded */ - if (skb->offload_fwd_mark && - skb->offload_fwd_mark == dev->offload_fwd_mark) { - consume_skb(skb); - rc = NET_XMIT_SUCCESS; - goto out; - } -#endif - txq = netdev_pick_tx(dev, skb, accel_priv); q = rcu_dereference_bh(txq->qdisc); diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 2c683f24d557..1031a0327fff 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -1305,88 +1305,3 @@ bool switchdev_port_same_parent_id(struct net_device *a, return netdev_phys_item_id_same(&a_attr.u.ppid, &b_attr.u.ppid); } EXPORT_SYMBOL_GPL(switchdev_port_same_parent_id); - -static u32 switchdev_port_fwd_mark_get(struct net_device *dev, - struct net_device *group_dev) -{ - struct net_device *lower_dev; - struct list_head *iter; - - netdev_for_each_lower_dev(group_dev, lower_dev, iter) { - if (lower_dev == dev) - continue; - if (switchdev_port_same_parent_id(dev, lower_dev)) - return lower_dev->offload_fwd_mark; - return switchdev_port_fwd_mark_get(dev, lower_dev); - } - - return dev->ifindex; -} - -static void switchdev_port_fwd_mark_reset(struct net_device *group_dev, - u32 old_mark, u32 *reset_mark) -{ - struct net_device *lower_dev; - struct list_head *iter; - - netdev_for_each_lower_dev(group_dev, lower_dev, iter) { - if (lower_dev->offload_fwd_mark == old_mark) { - if (!*reset_mark) - *reset_mark = lower_dev->ifindex; - lower_dev->offload_fwd_mark = *reset_mark; - } - switchdev_port_fwd_mark_reset(lower_dev, old_mark, reset_mark); - } -} - -/** - * switchdev_port_fwd_mark_set - Set port offload forwarding mark - * - * @dev: port device - * @group_dev: containing device - * @joining: true if dev is joining group; false if leaving group - * - * An ungrouped port's offload mark is just its ifindex. A grouped - * port's (member of a bridge, for example) offload mark is the ifindex - * of one of the ports in the group with the same parent (switch) ID. - * Ports on the same device in the same group will have the same mark. 
- * - * Example: - * - * br0 ifindex=9 - * sw1p1 ifindex=2 mark=2 - * sw1p2 ifindex=3 mark=2 - * sw2p1 ifindex=4 mark=5 - * sw2p2 ifindex=5 mark=5 - * - * If sw2p2 leaves the bridge, we'll have: - * - * br0 ifindex=9 - * sw1p1 ifindex=2 mark=2 - * sw1p2 ifindex=3 mark=2 - * sw2p1 ifindex=4 mark=4 - * sw2p2 ifindex=5 mark=5 - */ -void switchdev_port_fwd_mark_set(struct net_device *dev, - struct net_device *group_dev, - bool joining) -{ - u32 mark = dev->ifindex; - u32 reset_mark = 0; - - if (group_dev) { - ASSERT_RTNL(); - if (joining) - mark = switchdev_port_fwd_mark_get(dev, group_dev); - else if (dev->offload_fwd_mark == mark) - /* Ohoh, this port was the mark reference port, - * but it's leaving the group, so reset the - * mark for the remaining ports in the group. - */ - switchdev_port_fwd_mark_reset(group_dev, mark, - &reset_mark); - } - - dev->offload_fwd_mark = mark; -} -EXPORT_SYMBOL_GPL(switchdev_port_fwd_mark_set); -- cgit v1.2.3 From 9f5afeae51526b3ad7b7cb21ee8b145ce6ea7a7a Mon Sep 17 00:00:00 2001 From: Yaogong Wang Date: Wed, 7 Sep 2016 14:49:28 -0700 Subject: tcp: use an RB tree for ooo receive queue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Over the years, TCP BDP has increased by several orders of magnitude, and some people are considering to reach the 2 Gbytes limit. Even with current window scale limit of 14, ~1 Gbytes maps to ~740,000 MSS. In presence of packet losses (or reorders), TCP stores incoming packets into an out of order queue, and number of skbs sitting there waiting for the missing packets to be received can be in the 10^5 range. Most packets are appended to the tail of this queue, and when packets can finally be transferred to receive queue, we scan the queue from its head. However, in presence of heavy losses, we might have to find an arbitrary point in this queue, involving a linear scan for every incoming packet, throwing away cpu caches. This patch converts it to a RB tree, to get bounded latencies. Yaogong wrote a preliminary patch about 2 years ago. Eric did the rebase, added ofo_last_skb cache, polishing and tests. Tested with network dropping between 1 and 10 % packets, with good success (about 30 % increase of throughput in stress tests) Next step would be to also use an RB tree for the write queue at sender side ;) Signed-off-by: Yaogong Wang Signed-off-by: Eric Dumazet Cc: Yuchung Cheng Cc: Neal Cardwell Cc: Ilpo Järvinen Acked-By: Ilpo Järvinen Signed-off-by: David S. 
Miller --- include/linux/skbuff.h | 2 + include/linux/tcp.h | 7 +- include/net/tcp.h | 2 +- net/core/skbuff.c | 19 +++ net/ipv4/tcp.c | 4 +- net/ipv4/tcp_input.c | 330 +++++++++++++++++++++++++++-------------------- net/ipv4/tcp_ipv4.c | 2 +- net/ipv4/tcp_minisocks.c | 1 - 8 files changed, 218 insertions(+), 149 deletions(-) (limited to 'include/linux/skbuff.h') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index cfb7219be665..4c5662f05bda 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2402,6 +2402,8 @@ static inline void __skb_queue_purge(struct sk_buff_head *list) kfree_skb(skb); } +void skb_rbtree_purge(struct rb_root *root); + void *netdev_alloc_frag(unsigned int fragsz); struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int length, diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 7be9b1242354..c723a465125d 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -281,10 +281,9 @@ struct tcp_sock { struct sk_buff* lost_skb_hint; struct sk_buff *retransmit_skb_hint; - /* OOO segments go in this list. Note that socket lock must be held, - * as we do not use sk_buff_head lock. - */ - struct sk_buff_head out_of_order_queue; + /* OOO segments go in this rbtree. Socket lock must be held. */ + struct rb_root out_of_order_queue; + struct sk_buff *ooo_last_skb; /* cache rb_last(out_of_order_queue) */ /* SACKs data, these 2 need to be together (see tcp_options_write) */ struct tcp_sack_block duplicate_sack[1]; /* D-SACK block */ diff --git a/include/net/tcp.h b/include/net/tcp.h index d6ae36512429..fdfbedd61c67 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -640,7 +640,7 @@ static inline void tcp_fast_path_check(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - if (skb_queue_empty(&tp->out_of_order_queue) && + if (RB_EMPTY_ROOT(&tp->out_of_order_queue) && tp->rcv_wnd && atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf && !tp->urg_data) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 3864b4b68fa1..1e329d411242 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2444,6 +2444,25 @@ void skb_queue_purge(struct sk_buff_head *list) } EXPORT_SYMBOL(skb_queue_purge); +/** + * skb_rbtree_purge - empty a skb rbtree + * @root: root of the rbtree to empty + * + * Delete all buffers on an &sk_buff rbtree. Each buffer is removed from + * the list and one reference dropped. This function does not take + * any lock. Synchronization should be handled by the caller (e.g., TCP + * out-of-order queue is protected by the socket lock). 
+ */ +void skb_rbtree_purge(struct rb_root *root) +{ + struct sk_buff *skb, *next; + + rbtree_postorder_for_each_entry_safe(skb, next, root, rbnode) + kfree_skb(skb); + + *root = RB_ROOT; +} + /** * skb_queue_head - queue a buffer at the list head * @list: list to use diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 77311a92275c..a13fcb369f52 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -380,7 +380,7 @@ void tcp_init_sock(struct sock *sk) struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); - __skb_queue_head_init(&tp->out_of_order_queue); + tp->out_of_order_queue = RB_ROOT; tcp_init_xmit_timers(sk); tcp_prequeue_init(tp); INIT_LIST_HEAD(&tp->tsq_node); @@ -2243,7 +2243,7 @@ int tcp_disconnect(struct sock *sk, int flags) tcp_clear_xmit_timers(sk); __skb_queue_purge(&sk->sk_receive_queue); tcp_write_queue_purge(sk); - __skb_queue_purge(&tp->out_of_order_queue); + skb_rbtree_purge(&tp->out_of_order_queue); inet->inet_dport = 0; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8cd02c0b056c..a5934c4c8cd4 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4108,7 +4108,7 @@ void tcp_fin(struct sock *sk) /* It _is_ possible, that we have something out-of-order _after_ FIN. * Probably, we should reset in this case. For now drop them. */ - __skb_queue_purge(&tp->out_of_order_queue); + skb_rbtree_purge(&tp->out_of_order_queue); if (tcp_is_sack(tp)) tcp_sack_reset(&tp->rx_opt); sk_mem_reclaim(sk); @@ -4268,7 +4268,7 @@ static void tcp_sack_remove(struct tcp_sock *tp) int this_sack; /* Empty ofo queue, hence, all the SACKs are eaten. Clear. */ - if (skb_queue_empty(&tp->out_of_order_queue)) { + if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) { tp->rx_opt.num_sacks = 0; return; } @@ -4344,10 +4344,13 @@ static void tcp_ofo_queue(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); __u32 dsack_high = tp->rcv_nxt; + bool fin, fragstolen, eaten; struct sk_buff *skb, *tail; - bool fragstolen, eaten; + struct rb_node *p; - while ((skb = skb_peek(&tp->out_of_order_queue)) != NULL) { + p = rb_first(&tp->out_of_order_queue); + while (p) { + skb = rb_entry(p, struct sk_buff, rbnode); if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) break; @@ -4357,9 +4360,10 @@ static void tcp_ofo_queue(struct sock *sk) dsack_high = TCP_SKB_CB(skb)->end_seq; tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack); } + p = rb_next(p); + rb_erase(&skb->rbnode, &tp->out_of_order_queue); - __skb_unlink(skb, &tp->out_of_order_queue); - if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { + if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) { SOCK_DEBUG(sk, "ofo packet was already received\n"); tcp_drop(sk, skb); continue; @@ -4371,12 +4375,19 @@ static void tcp_ofo_queue(struct sock *sk) tail = skb_peek_tail(&sk->sk_receive_queue); eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen); tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); + fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; if (!eaten) __skb_queue_tail(&sk->sk_receive_queue, skb); - if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) - tcp_fin(sk); - if (eaten) + else kfree_skb_partial(skb, fragstolen); + + if (unlikely(fin)) { + tcp_fin(sk); + /* tcp_fin() purges tp->out_of_order_queue, + * so we must end this loop right now. 
+ */ + break; + } } } @@ -4403,8 +4414,10 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb, static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); + struct rb_node **p, *q, *parent; struct sk_buff *skb1; u32 seq, end_seq; + bool fragstolen; tcp_ecn_check_ce(tp, skb); @@ -4419,88 +4432,85 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) inet_csk_schedule_ack(sk); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE); + seq = TCP_SKB_CB(skb)->seq; + end_seq = TCP_SKB_CB(skb)->end_seq; SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n", - tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); + tp->rcv_nxt, seq, end_seq); - skb1 = skb_peek_tail(&tp->out_of_order_queue); - if (!skb1) { + p = &tp->out_of_order_queue.rb_node; + if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) { /* Initial out of order segment, build 1 SACK. */ if (tcp_is_sack(tp)) { tp->rx_opt.num_sacks = 1; - tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq; - tp->selective_acks[0].end_seq = - TCP_SKB_CB(skb)->end_seq; + tp->selective_acks[0].start_seq = seq; + tp->selective_acks[0].end_seq = end_seq; } - __skb_queue_head(&tp->out_of_order_queue, skb); + rb_link_node(&skb->rbnode, NULL, p); + rb_insert_color(&skb->rbnode, &tp->out_of_order_queue); + tp->ooo_last_skb = skb; goto end; } - seq = TCP_SKB_CB(skb)->seq; - end_seq = TCP_SKB_CB(skb)->end_seq; - - if (seq == TCP_SKB_CB(skb1)->end_seq) { - bool fragstolen; - - if (!tcp_try_coalesce(sk, skb1, skb, &fragstolen)) { - __skb_queue_after(&tp->out_of_order_queue, skb1, skb); - } else { - tcp_grow_window(sk, skb); - kfree_skb_partial(skb, fragstolen); - skb = NULL; + /* In the typical case, we are adding an skb to the end of the list. + * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup. + */ + if (tcp_try_coalesce(sk, tp->ooo_last_skb, skb, &fragstolen)) { +coalesce_done: + tcp_grow_window(sk, skb); + kfree_skb_partial(skb, fragstolen); + skb = NULL; + goto add_sack; + } + + /* Find place to insert this segment. Handle overlaps on the way. */ + parent = NULL; + while (*p) { + parent = *p; + skb1 = rb_entry(parent, struct sk_buff, rbnode); + if (before(seq, TCP_SKB_CB(skb1)->seq)) { + p = &parent->rb_left; + continue; } - - if (!tp->rx_opt.num_sacks || - tp->selective_acks[0].end_seq != seq) - goto add_sack; - - /* Common case: data arrive in order after hole. */ - tp->selective_acks[0].end_seq = end_seq; - goto end; - } - - /* Find place to insert this segment. */ - while (1) { - if (!after(TCP_SKB_CB(skb1)->seq, seq)) - break; - if (skb_queue_is_first(&tp->out_of_order_queue, skb1)) { - skb1 = NULL; - break; + if (before(seq, TCP_SKB_CB(skb1)->end_seq)) { + if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) { + /* All the bits are present. Drop. */ + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPOFOMERGE); + __kfree_skb(skb); + skb = NULL; + tcp_dsack_set(sk, seq, end_seq); + goto add_sack; + } + if (after(seq, TCP_SKB_CB(skb1)->seq)) { + /* Partial overlap. */ + tcp_dsack_set(sk, seq, TCP_SKB_CB(skb1)->end_seq); + } else { + /* skb's seq == skb1's seq and skb covers skb1. + * Replace skb1 with skb. 
+ */ + rb_replace_node(&skb1->rbnode, &skb->rbnode, + &tp->out_of_order_queue); + tcp_dsack_extend(sk, + TCP_SKB_CB(skb1)->seq, + TCP_SKB_CB(skb1)->end_seq); + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPOFOMERGE); + __kfree_skb(skb1); + goto add_sack; + } + } else if (tcp_try_coalesce(sk, skb1, skb, &fragstolen)) { + goto coalesce_done; } - skb1 = skb_queue_prev(&tp->out_of_order_queue, skb1); + p = &parent->rb_right; } - /* Do skb overlap to previous one? */ - if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) { - if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) { - /* All the bits are present. Drop. */ - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE); - tcp_drop(sk, skb); - skb = NULL; - tcp_dsack_set(sk, seq, end_seq); - goto add_sack; - } - if (after(seq, TCP_SKB_CB(skb1)->seq)) { - /* Partial overlap. */ - tcp_dsack_set(sk, seq, - TCP_SKB_CB(skb1)->end_seq); - } else { - if (skb_queue_is_first(&tp->out_of_order_queue, - skb1)) - skb1 = NULL; - else - skb1 = skb_queue_prev( - &tp->out_of_order_queue, - skb1); - } - } - if (!skb1) - __skb_queue_head(&tp->out_of_order_queue, skb); - else - __skb_queue_after(&tp->out_of_order_queue, skb1, skb); + /* Insert segment into RB tree. */ + rb_link_node(&skb->rbnode, parent, p); + rb_insert_color(&skb->rbnode, &tp->out_of_order_queue); - /* And clean segments covered by new one as whole. */ - while (!skb_queue_is_last(&tp->out_of_order_queue, skb)) { - skb1 = skb_queue_next(&tp->out_of_order_queue, skb); + /* Remove other segments covered by skb. */ + while ((q = rb_next(&skb->rbnode)) != NULL) { + skb1 = rb_entry(q, struct sk_buff, rbnode); if (!after(end_seq, TCP_SKB_CB(skb1)->seq)) break; @@ -4509,12 +4519,15 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) end_seq); break; } - __skb_unlink(skb1, &tp->out_of_order_queue); + rb_erase(&skb1->rbnode, &tp->out_of_order_queue); tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, TCP_SKB_CB(skb1)->end_seq); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE); tcp_drop(sk, skb1); } + /* If there is no skb after us, we are the last_skb ! */ + if (!q) + tp->ooo_last_skb = skb; add_sack: if (tcp_is_sack(tp)) @@ -4651,13 +4664,13 @@ queue_and_out: if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) tcp_fin(sk); - if (!skb_queue_empty(&tp->out_of_order_queue)) { + if (!RB_EMPTY_ROOT(&tp->out_of_order_queue)) { tcp_ofo_queue(sk); /* RFC2581. 4.2. SHOULD send immediate ACK, when * gap in queue is filled. */ - if (skb_queue_empty(&tp->out_of_order_queue)) + if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) inet_csk(sk)->icsk_ack.pingpong = 0; } @@ -4711,48 +4724,76 @@ drop: tcp_data_queue_ofo(sk, skb); } +static struct sk_buff *tcp_skb_next(struct sk_buff *skb, struct sk_buff_head *list) +{ + if (list) + return !skb_queue_is_last(list, skb) ? 
skb->next : NULL; + + return rb_entry_safe(rb_next(&skb->rbnode), struct sk_buff, rbnode); +} + static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb, - struct sk_buff_head *list) + struct sk_buff_head *list, + struct rb_root *root) { - struct sk_buff *next = NULL; + struct sk_buff *next = tcp_skb_next(skb, list); - if (!skb_queue_is_last(list, skb)) - next = skb_queue_next(list, skb); + if (list) + __skb_unlink(skb, list); + else + rb_erase(&skb->rbnode, root); - __skb_unlink(skb, list); __kfree_skb(skb); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED); return next; } +/* Insert skb into rb tree, ordered by TCP_SKB_CB(skb)->seq */ +static void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct sk_buff *skb1; + + while (*p) { + parent = *p; + skb1 = rb_entry(parent, struct sk_buff, rbnode); + if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq)) + p = &parent->rb_left; + else + p = &parent->rb_right; + } + rb_link_node(&skb->rbnode, parent, p); + rb_insert_color(&skb->rbnode, root); +} + /* Collapse contiguous sequence of skbs head..tail with * sequence numbers start..end. * - * If tail is NULL, this means until the end of the list. + * If tail is NULL, this means until the end of the queue. * * Segments with FIN/SYN are not collapsed (only because this * simplifies code) */ static void -tcp_collapse(struct sock *sk, struct sk_buff_head *list, - struct sk_buff *head, struct sk_buff *tail, - u32 start, u32 end) +tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root, + struct sk_buff *head, struct sk_buff *tail, u32 start, u32 end) { - struct sk_buff *skb, *n; + struct sk_buff *skb = head, *n; + struct sk_buff_head tmp; bool end_of_skbs; /* First, check that queue is collapsible and find - * the point where collapsing can be useful. */ - skb = head; + * the point where collapsing can be useful. + */ restart: - end_of_skbs = true; - skb_queue_walk_from_safe(list, skb, n) { - if (skb == tail) - break; + for (end_of_skbs = true; skb != NULL && skb != tail; skb = n) { + n = tcp_skb_next(skb, list); + /* No new bits? It is possible on ofo queue. */ if (!before(start, TCP_SKB_CB(skb)->end_seq)) { - skb = tcp_collapse_one(sk, skb, list); + skb = tcp_collapse_one(sk, skb, list, root); if (!skb) break; goto restart; @@ -4770,13 +4811,10 @@ restart: break; } - if (!skb_queue_is_last(list, skb)) { - struct sk_buff *next = skb_queue_next(list, skb); - if (next != tail && - TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(next)->seq) { - end_of_skbs = false; - break; - } + if (n && n != tail && + TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) { + end_of_skbs = false; + break; } /* Decided to skip this, advance start seq. */ @@ -4786,17 +4824,22 @@ restart: (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))) return; + __skb_queue_head_init(&tmp); + while (before(start, end)) { int copy = min_t(int, SKB_MAX_ORDER(0, 0), end - start); struct sk_buff *nskb; nskb = alloc_skb(copy, GFP_ATOMIC); if (!nskb) - return; + break; memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start; - __skb_queue_before(list, skb, nskb); + if (list) + __skb_queue_before(list, skb, nskb); + else + __skb_queue_tail(&tmp, nskb); /* defer rbtree insertion */ skb_set_owner_r(nskb, sk); /* Copy data, releasing collapsed skbs. 
*/ @@ -4814,14 +4857,17 @@ restart: start += size; } if (!before(start, TCP_SKB_CB(skb)->end_seq)) { - skb = tcp_collapse_one(sk, skb, list); + skb = tcp_collapse_one(sk, skb, list, root); if (!skb || skb == tail || (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))) - return; + goto end; } } } +end: + skb_queue_walk_safe(&tmp, skb, n) + tcp_rbtree_insert(root, skb); } /* Collapse ofo queue. Algorithm: select contiguous sequence of skbs @@ -4830,43 +4876,43 @@ restart: static void tcp_collapse_ofo_queue(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb = skb_peek(&tp->out_of_order_queue); - struct sk_buff *head; + struct sk_buff *skb, *head; + struct rb_node *p; u32 start, end; - if (!skb) + p = rb_first(&tp->out_of_order_queue); + skb = rb_entry_safe(p, struct sk_buff, rbnode); +new_range: + if (!skb) { + p = rb_last(&tp->out_of_order_queue); + /* Note: This is possible p is NULL here. We do not + * use rb_entry_safe(), as ooo_last_skb is valid only + * if rbtree is not empty. + */ + tp->ooo_last_skb = rb_entry(p, struct sk_buff, rbnode); return; - + } start = TCP_SKB_CB(skb)->seq; end = TCP_SKB_CB(skb)->end_seq; - head = skb; - - for (;;) { - struct sk_buff *next = NULL; - if (!skb_queue_is_last(&tp->out_of_order_queue, skb)) - next = skb_queue_next(&tp->out_of_order_queue, skb); - skb = next; + for (head = skb;;) { + skb = tcp_skb_next(skb, NULL); - /* Segment is terminated when we see gap or when - * we are at the end of all the queue. */ + /* Range is terminated when we see a gap or when + * we are at the queue end. + */ if (!skb || after(TCP_SKB_CB(skb)->seq, end) || before(TCP_SKB_CB(skb)->end_seq, start)) { - tcp_collapse(sk, &tp->out_of_order_queue, + tcp_collapse(sk, NULL, &tp->out_of_order_queue, head, skb, start, end); - head = skb; - if (!skb) - break; - /* Start new segment */ + goto new_range; + } + + if (unlikely(before(TCP_SKB_CB(skb)->seq, start))) start = TCP_SKB_CB(skb)->seq; + if (after(TCP_SKB_CB(skb)->end_seq, end)) end = TCP_SKB_CB(skb)->end_seq; - } else { - if (before(TCP_SKB_CB(skb)->seq, start)) - start = TCP_SKB_CB(skb)->seq; - if (after(TCP_SKB_CB(skb)->end_seq, end)) - end = TCP_SKB_CB(skb)->end_seq; - } } } @@ -4883,20 +4929,24 @@ static void tcp_collapse_ofo_queue(struct sock *sk) static bool tcp_prune_ofo_queue(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb; + struct rb_node *node, *prev; - if (skb_queue_empty(&tp->out_of_order_queue)) + if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) return false; NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED); - - while ((skb = __skb_dequeue_tail(&tp->out_of_order_queue)) != NULL) { - tcp_drop(sk, skb); + node = &tp->ooo_last_skb->rbnode; + do { + prev = rb_prev(node); + rb_erase(node, &tp->out_of_order_queue); + tcp_drop(sk, rb_entry(node, struct sk_buff, rbnode)); sk_mem_reclaim(sk); if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && !tcp_under_memory_pressure(sk)) break; - } + node = prev; + } while (node); + tp->ooo_last_skb = rb_entry(prev, struct sk_buff, rbnode); /* Reset SACK state. A conforming SACK implementation will * do the same at a timeout based retransmit. 
When a connection @@ -4930,7 +4980,7 @@ static int tcp_prune_queue(struct sock *sk) tcp_collapse_ofo_queue(sk); if (!skb_queue_empty(&sk->sk_receive_queue)) - tcp_collapse(sk, &sk->sk_receive_queue, + tcp_collapse(sk, &sk->sk_receive_queue, NULL, skb_peek(&sk->sk_receive_queue), NULL, tp->copied_seq, tp->rcv_nxt); @@ -5035,7 +5085,7 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) /* We ACK each frame or... */ tcp_in_quickack_mode(sk) || /* We have out of order data. */ - (ofo_possible && skb_peek(&tp->out_of_order_queue))) { + (ofo_possible && !RB_EMPTY_ROOT(&tp->out_of_order_queue))) { /* Then ack it now */ tcp_send_ack(sk); } else { diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a75bf48d7950..04b989328558 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1845,7 +1845,7 @@ void tcp_v4_destroy_sock(struct sock *sk) tcp_write_queue_purge(sk); /* Cleans up our, hopefully empty, out_of_order_queue. */ - __skb_queue_purge(&tp->out_of_order_queue); + skb_rbtree_purge(&tp->out_of_order_queue); #ifdef CONFIG_TCP_MD5SIG /* Clean up the MD5 key list, if any */ diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 4b95ec4ed2c8..f63c73dc0acb 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -488,7 +488,6 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->snd_cwnd_cnt = 0; tcp_init_xmit_timers(newsk); - __skb_queue_head_init(&newtp->out_of_order_queue); newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1; newtp->rx_opt.saw_tstamp = 0; -- cgit v1.2.3 From 36bbef52c7eb646ed6247055a2acd3851e317857 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 20 Sep 2016 00:26:13 +0200 Subject: bpf: direct packet write and access for helpers for clsact progs This work implements direct packet access for helpers and direct packet write in a similar fashion as already available for XDP types via commits 4acf6c0b84c9 ("bpf: enable direct packet data write for xdp progs") and 6841de8b0d03 ("bpf: allow helpers access the packet directly"), and as a complementary feature to the already available direct packet read for tc (cls/act) programs. For enabling this, we need to introduce two helpers, bpf_skb_pull_data() and bpf_csum_update(). The first is generally needed for both, read and write, because they would otherwise only be limited to the current linear skb head. Usually, when the data_end test fails, programs just bail out, or, in the direct read case, use bpf_skb_load_bytes() as an alternative to overcome this limitation. If such data sits in non-linear parts, we can just pull them in once with the new helper, retest and eventually access them. At the same time, this also makes sure the skb is uncloned, which is, of course, a necessary condition for direct write. As this needs to be an invariant for the write part only, the verifier detects writes and adds a prologue that is calling bpf_skb_pull_data() to effectively unclone the skb from the very beginning in case it is indeed cloned. The heuristic makes use of a similar trick that was done in 233577a22089 ("net: filter: constify detection of pkt_type_offset"). This comes at zero cost for other programs that do not use the direct write feature. 
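For illustration only -- this is not part of the patch and not the iproute2 example referenced further below, just a minimal restricted-C sketch (clang -target bpf) of the usage pattern the commit describes: pull non-linear data in once, re-check the bounds, write directly, and fix up CHECKSUM_COMPLETE via bpf_csum_update(). The section name, the rewrite offset and the helper stubs are assumptions of the sketch; protocol checksums (IP/TCP/UDP) covering the rewritten bytes would still need the existing csum_replace helpers.

#include <linux/bpf.h>
#include <linux/pkt_cls.h>

/* Helper stubs as typically provided by iproute2/samples headers. */
static int (*bpf_skb_pull_data)(struct __sk_buff *skb, __u32 len) =
        (void *) BPF_FUNC_skb_pull_data;
static __s64 (*bpf_csum_diff)(void *from, __u32 from_size, void *to,
                              __u32 to_size, __u32 seed) =
        (void *) BPF_FUNC_csum_diff;
static __s64 (*bpf_csum_update)(struct __sk_buff *skb, __u32 csum) =
        (void *) BPF_FUNC_csum_update;

#define REWRITE_OFF     64      /* hypothetical offset of a 4-byte field */

__attribute__((section("classifier"), used))
int rewrite_word(struct __sk_buff *skb)
{
        void *data = (void *)(long)skb->data;
        void *data_end = (void *)(long)skb->data_end;
        __u32 old_val, new_val = 0;
        __s64 diff;
        __u32 *p;

        /* Field not in the linear part? Pull it in once and re-check. */
        if (data + REWRITE_OFF + sizeof(__u32) > data_end) {
                if (bpf_skb_pull_data(skb, REWRITE_OFF + sizeof(__u32)) < 0)
                        return TC_ACT_OK;
                /* Previous packet pointers are invalidated; reload them. */
                data = (void *)(long)skb->data;
                data_end = (void *)(long)skb->data_end;
                if (data + REWRITE_OFF + sizeof(__u32) > data_end)
                        return TC_ACT_OK;
        }

        p = data + REWRITE_OFF;
        old_val = *p;
        diff = bpf_csum_diff(&old_val, sizeof(old_val),
                             &new_val, sizeof(new_val), 0);
        *p = new_val;                           /* direct packet write */
        bpf_csum_update(skb, (__u32)diff);      /* fixup CHECKSUM_COMPLETE */

        return TC_ACT_OK;
}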
Should a program use this feature only sparsely and has read access for the most parts with, for example, drop return codes, then such write action can be delegated to a tail called program for mitigating this cost of potential uncloning to a late point in time where it would have been paid similarly with the bpf_skb_store_bytes() as well. Advantage of direct write is that the writes are inlined whereas the helper cannot make any length assumptions and thus needs to generate a call to memcpy() also for small sizes, as well as cost of helper call itself with sanity checks are avoided. Plus, when direct read is already used, we don't need to cache or perform rechecks on the data boundaries (due to verifier invalidating previous checks for helpers that change skb->data), so more complex programs using rewrites can benefit from switching to direct read plus write. For direct packet access to helpers, we save the otherwise needed copy into a temp struct sitting on stack memory when use-case allows. Both facilities are enabled via may_access_direct_pkt_data() in verifier. For now, we limit this to map helpers and csum_diff, and can successively enable other helpers where we find it makes sense. Helpers that definitely cannot be allowed for this are those part of bpf_helper_changes_skb_data() since they can change underlying data, and those that write into memory as this could happen for packet typed args when still cloned. bpf_csum_update() helper accommodates for the fact that we need to fixup checksum_complete when using direct write instead of bpf_skb_store_bytes(), meaning the programs can use available helpers like bpf_csum_diff(), and implement csum_add(), csum_sub(), csum_block_add(), csum_block_sub() equivalents in eBPF together with the new helper. A usage example will be provided for iproute2's examples/bpf/ directory. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- include/linux/bpf.h | 4 +- include/linux/skbuff.h | 14 ++++- include/uapi/linux/bpf.h | 21 ++++++++ kernel/bpf/helpers.c | 3 ++ kernel/bpf/verifier.c | 54 ++++++++++++++----- net/core/filter.c | 134 +++++++++++++++++++++++++++++++++++++++++------ 6 files changed, 196 insertions(+), 34 deletions(-) (limited to 'include/linux/skbuff.h') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9a904f63f8c1..5691fdc83819 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -96,6 +96,7 @@ enum bpf_return_type { struct bpf_func_proto { u64 (*func)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); bool gpl_only; + bool pkt_access; enum bpf_return_type ret_type; enum bpf_arg_type arg1_type; enum bpf_arg_type arg2_type; @@ -151,7 +152,8 @@ struct bpf_verifier_ops { */ bool (*is_valid_access)(int off, int size, enum bpf_access_type type, enum bpf_reg_type *reg_type); - + int (*gen_prologue)(struct bpf_insn *insn, bool direct_write, + const struct bpf_prog *prog); u32 (*convert_ctx_access)(enum bpf_access_type type, int dst_reg, int src_reg, int ctx_off, struct bpf_insn *insn, struct bpf_prog *prog); diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 4c5662f05bda..c6dab3f7457c 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -676,13 +676,23 @@ struct sk_buff { */ kmemcheck_bitfield_begin(flags1); __u16 queue_mapping; + +/* if you move cloned around you also must adapt those constants */ +#ifdef __BIG_ENDIAN_BITFIELD +#define CLONED_MASK (1 << 7) +#else +#define CLONED_MASK 1 +#endif +#define CLONED_OFFSET() offsetof(struct sk_buff, __cloned_offset) + + __u8 __cloned_offset[0]; __u8 cloned:1, nohdr:1, fclone:2, peeked:1, head_frag:1, - xmit_more:1; - /* one bit hole */ + xmit_more:1, + __unused:1; /* one bit hole */ kmemcheck_bitfield_end(flags1); /* fields enclosed in headers_start/headers_end are copied diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f896dfac4ac0..e07432b9f8b8 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -398,6 +398,27 @@ enum bpf_func_id { */ BPF_FUNC_skb_change_tail, + /** + * bpf_skb_pull_data(skb, len) + * The helper will pull in non-linear data in case the + * skb is non-linear and not all of len are part of the + * linear section. Only needed for read/write with direct + * packet access. + * @skb: pointer to skb + * @len: len to make read/writeable + * Return: 0 on success or negative error + */ + BPF_FUNC_skb_pull_data, + + /** + * bpf_csum_update(skb, csum) + * Adds csum into skb->csum in case of CHECKSUM_COMPLETE. 
+ * @skb: pointer to skb + * @csum: csum to add + * Return: csum on success or negative error + */ + BPF_FUNC_csum_update, + __BPF_FUNC_MAX_ID, }; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index a5b8bf8cfcfd..39918402e6e9 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -36,6 +36,7 @@ BPF_CALL_2(bpf_map_lookup_elem, struct bpf_map *, map, void *, key) const struct bpf_func_proto bpf_map_lookup_elem_proto = { .func = bpf_map_lookup_elem, .gpl_only = false, + .pkt_access = true, .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_MAP_KEY, @@ -51,6 +52,7 @@ BPF_CALL_4(bpf_map_update_elem, struct bpf_map *, map, void *, key, const struct bpf_func_proto bpf_map_update_elem_proto = { .func = bpf_map_update_elem, .gpl_only = false, + .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_MAP_KEY, @@ -67,6 +69,7 @@ BPF_CALL_2(bpf_map_delete_elem, struct bpf_map *, map, void *, key) const struct bpf_func_proto bpf_map_delete_elem_proto = { .func = bpf_map_delete_elem, .gpl_only = false, + .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_MAP_KEY, diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index bc138f34e38c..3a75ee3bdcd1 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -196,6 +196,7 @@ struct verifier_env { u32 used_map_cnt; /* number of used maps */ u32 id_gen; /* used to generate unique reg IDs */ bool allow_ptr_leaks; + bool seen_direct_write; }; #define BPF_COMPLEXITY_LIMIT_INSNS 65536 @@ -204,6 +205,7 @@ struct verifier_env { struct bpf_call_arg_meta { struct bpf_map *map_ptr; bool raw_mode; + bool pkt_access; int regno; int access_size; }; @@ -654,10 +656,17 @@ static int check_map_access(struct verifier_env *env, u32 regno, int off, #define MAX_PACKET_OFF 0xffff -static bool may_write_pkt_data(enum bpf_prog_type type) +static bool may_access_direct_pkt_data(struct verifier_env *env, + const struct bpf_call_arg_meta *meta) { - switch (type) { + switch (env->prog->type) { + case BPF_PROG_TYPE_SCHED_CLS: + case BPF_PROG_TYPE_SCHED_ACT: case BPF_PROG_TYPE_XDP: + if (meta) + return meta->pkt_access; + + env->seen_direct_write = true; return true; default: return false; @@ -817,7 +826,7 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off, err = check_stack_read(state, off, size, value_regno); } } else if (state->regs[regno].type == PTR_TO_PACKET) { - if (t == BPF_WRITE && !may_write_pkt_data(env->prog->type)) { + if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL)) { verbose("cannot write into packet\n"); return -EACCES; } @@ -950,8 +959,8 @@ static int check_func_arg(struct verifier_env *env, u32 regno, return 0; } - if (type == PTR_TO_PACKET && !may_write_pkt_data(env->prog->type)) { - verbose("helper access to the packet is not allowed for clsact\n"); + if (type == PTR_TO_PACKET && !may_access_direct_pkt_data(env, meta)) { + verbose("helper access to the packet is not allowed\n"); return -EACCES; } @@ -1191,6 +1200,7 @@ static int check_call(struct verifier_env *env, int func_id) changes_data = bpf_helper_changes_skb_data(fn->func); memset(&meta, 0, sizeof(meta)); + meta.pkt_access = fn->pkt_access; /* We only support one arg being in raw mode at the moment, which * is sufficient for the helper functions we have right now. 
@@ -2675,18 +2685,35 @@ static void convert_pseudo_ld_imm64(struct verifier_env *env) */ static int convert_ctx_accesses(struct verifier_env *env) { - struct bpf_insn *insn = env->prog->insnsi; - int insn_cnt = env->prog->len; - struct bpf_insn insn_buf[16]; + const struct bpf_verifier_ops *ops = env->prog->aux->ops; + struct bpf_insn insn_buf[16], *insn; struct bpf_prog *new_prog; enum bpf_access_type type; - int i; + int i, insn_cnt, cnt; - if (!env->prog->aux->ops->convert_ctx_access) + if (ops->gen_prologue) { + cnt = ops->gen_prologue(insn_buf, env->seen_direct_write, + env->prog); + if (cnt >= ARRAY_SIZE(insn_buf)) { + verbose("bpf verifier is misconfigured\n"); + return -EINVAL; + } else if (cnt) { + new_prog = bpf_patch_insn_single(env->prog, 0, + insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + env->prog = new_prog; + } + } + + if (!ops->convert_ctx_access) return 0; + insn_cnt = env->prog->len; + insn = env->prog->insnsi; + for (i = 0; i < insn_cnt; i++, insn++) { - u32 insn_delta, cnt; + u32 insn_delta; if (insn->code == (BPF_LDX | BPF_MEM | BPF_W) || insn->code == (BPF_LDX | BPF_MEM | BPF_DW)) @@ -2703,9 +2730,8 @@ static int convert_ctx_accesses(struct verifier_env *env) continue; } - cnt = env->prog->aux->ops-> - convert_ctx_access(type, insn->dst_reg, insn->src_reg, - insn->off, insn_buf, env->prog); + cnt = ops->convert_ctx_access(type, insn->dst_reg, insn->src_reg, + insn->off, insn_buf, env->prog); if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { verbose("bpf verifier is misconfigured\n"); return -EINVAL; diff --git a/net/core/filter.c b/net/core/filter.c index 298b146b47e7..0920c2ac1d00 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1362,6 +1362,11 @@ static inline int bpf_try_make_writable(struct sk_buff *skb, return err; } +static int bpf_try_make_head_writable(struct sk_buff *skb) +{ + return bpf_try_make_writable(skb, skb_headlen(skb)); +} + static inline void bpf_push_mac_rcsum(struct sk_buff *skb) { if (skb_at_tc_ingress(skb)) @@ -1441,6 +1446,28 @@ static const struct bpf_func_proto bpf_skb_load_bytes_proto = { .arg4_type = ARG_CONST_STACK_SIZE, }; +BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, u32, len) +{ + /* Idea is the following: should the needed direct read/write + * test fail during runtime, we can pull in more data and redo + * again, since implicitly, we invalidate previous checks here. + * + * Or, since we know how much we need to make read/writeable, + * this can be done once at the program beginning for direct + * access case. By this we overcome limitations of only current + * headroom being accessible. + */ + return bpf_try_make_writable(skb, len ? 
: skb_headlen(skb)); +} + +static const struct bpf_func_proto bpf_skb_pull_data_proto = { + .func = bpf_skb_pull_data, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + BPF_CALL_5(bpf_l3_csum_replace, struct sk_buff *, skb, u32, offset, u64, from, u64, to, u64, flags) { @@ -1567,6 +1594,7 @@ BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size, static const struct bpf_func_proto bpf_csum_diff_proto = { .func = bpf_csum_diff, .gpl_only = false, + .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_STACK, .arg2_type = ARG_CONST_STACK_SIZE_OR_ZERO, @@ -1575,6 +1603,26 @@ static const struct bpf_func_proto bpf_csum_diff_proto = { .arg5_type = ARG_ANYTHING, }; +BPF_CALL_2(bpf_csum_update, struct sk_buff *, skb, __wsum, csum) +{ + /* The interface is to be used in combination with bpf_csum_diff() + * for direct packet writes. csum rotation for alignment as well + * as emulating csum_sub() can be done from the eBPF program. + */ + if (skb->ip_summed == CHECKSUM_COMPLETE) + return (skb->csum = csum_add(skb->csum, csum)); + + return -ENOTSUPP; +} + +static const struct bpf_func_proto bpf_csum_update_proto = { + .func = bpf_csum_update, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb) { return dev_forward_skb(dev, skb); @@ -1602,6 +1650,8 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb) BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags) { struct net_device *dev; + struct sk_buff *clone; + int ret; if (unlikely(flags & ~(BPF_F_INGRESS))) return -EINVAL; @@ -1610,14 +1660,25 @@ BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags) if (unlikely(!dev)) return -EINVAL; - skb = skb_clone(skb, GFP_ATOMIC); - if (unlikely(!skb)) + clone = skb_clone(skb, GFP_ATOMIC); + if (unlikely(!clone)) return -ENOMEM; - bpf_push_mac_rcsum(skb); + /* For direct write, we need to keep the invariant that the skbs + * we're dealing with need to be uncloned. Should uncloning fail + * here, we need to free the just generated clone to unclone once + * again. + */ + ret = bpf_try_make_head_writable(skb); + if (unlikely(ret)) { + kfree_skb(clone); + return -ENOMEM; + } + + bpf_push_mac_rcsum(clone); return flags & BPF_F_INGRESS ? 
- __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb); + __bpf_rx_skb(dev, clone) : __bpf_tx_skb(dev, clone); } static const struct bpf_func_proto bpf_clone_redirect_proto = { @@ -2063,19 +2124,14 @@ static const struct bpf_func_proto bpf_skb_change_tail_proto = { bool bpf_helper_changes_skb_data(void *func) { - if (func == bpf_skb_vlan_push) - return true; - if (func == bpf_skb_vlan_pop) - return true; - if (func == bpf_skb_store_bytes) - return true; - if (func == bpf_skb_change_proto) - return true; - if (func == bpf_skb_change_tail) - return true; - if (func == bpf_l3_csum_replace) - return true; - if (func == bpf_l4_csum_replace) + if (func == bpf_skb_vlan_push || + func == bpf_skb_vlan_pop || + func == bpf_skb_store_bytes || + func == bpf_skb_change_proto || + func == bpf_skb_change_tail || + func == bpf_skb_pull_data || + func == bpf_l3_csum_replace || + func == bpf_l4_csum_replace) return true; return false; @@ -2440,8 +2496,12 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) return &bpf_skb_store_bytes_proto; case BPF_FUNC_skb_load_bytes: return &bpf_skb_load_bytes_proto; + case BPF_FUNC_skb_pull_data: + return &bpf_skb_pull_data_proto; case BPF_FUNC_csum_diff: return &bpf_csum_diff_proto; + case BPF_FUNC_csum_update: + return &bpf_csum_update_proto; case BPF_FUNC_l3_csum_replace: return &bpf_l3_csum_replace_proto; case BPF_FUNC_l4_csum_replace: @@ -2533,6 +2593,45 @@ static bool sk_filter_is_valid_access(int off, int size, return __is_valid_access(off, size, type); } +static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write, + const struct bpf_prog *prog) +{ + struct bpf_insn *insn = insn_buf; + + if (!direct_write) + return 0; + + /* if (!skb->cloned) + * goto start; + * + * (Fast-path, otherwise approximation that we might be + * a clone, do the rest in helper.) + */ + *insn++ = BPF_LDX_MEM(BPF_B, BPF_REG_6, BPF_REG_1, CLONED_OFFSET()); + *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_6, CLONED_MASK); + *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 7); + + /* ret = bpf_skb_pull_data(skb, 0); */ + *insn++ = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1); + *insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_2, BPF_REG_2); + *insn++ = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_skb_pull_data); + /* if (!ret) + * goto restore; + * return TC_ACT_SHOT; + */ + *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2); + *insn++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, TC_ACT_SHOT); + *insn++ = BPF_EXIT_INSN(); + + /* restore: */ + *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6); + /* start: */ + *insn++ = prog->insnsi[0]; + + return insn - insn_buf; +} + static bool tc_cls_act_is_valid_access(int off, int size, enum bpf_access_type type, enum bpf_reg_type *reg_type) @@ -2810,6 +2909,7 @@ static const struct bpf_verifier_ops tc_cls_act_ops = { .get_func_proto = tc_cls_act_func_proto, .is_valid_access = tc_cls_act_is_valid_access, .convert_ctx_access = tc_cls_act_convert_ctx_access, + .gen_prologue = tc_cls_act_prologue, }; static const struct bpf_verifier_ops xdp_ops = { -- cgit v1.2.3 From bfca4c520f7ea78138ddccea2de18dc062b0fefd Mon Sep 17 00:00:00 2001 From: Shmulik Ladkani Date: Mon, 19 Sep 2016 19:11:09 +0300 Subject: net: skbuff: Export __skb_vlan_pop This exports the functionality of extracting the tag from the payload, without moving next vlan tag into hw accel tag. Signed-off-by: Shmulik Ladkani Signed-off-by: David S. 
From bfca4c520f7ea78138ddccea2de18dc062b0fefd Mon Sep 17 00:00:00 2001
From: Shmulik Ladkani
Date: Mon, 19 Sep 2016 19:11:09 +0300
Subject: net: skbuff: Export __skb_vlan_pop

This exports the functionality of extracting the VLAN tag from the
payload, without moving the next vlan tag into the hw accel tag.

Signed-off-by: Shmulik Ladkani
Signed-off-by: David S. Miller
---
 include/linux/skbuff.h | 1 +
 net/core/skbuff.c      | 7 +++++--
 2 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'include/linux/skbuff.h')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index c6dab3f7457c..9bf60b556bd2 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3085,6 +3085,7 @@ bool skb_gso_validate_mtu(const struct sk_buff *skb, unsigned int mtu);
 struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features);
 struct sk_buff *skb_vlan_untag(struct sk_buff *skb);
 int skb_ensure_writable(struct sk_buff *skb, int write_len);
+int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci);
 int skb_vlan_pop(struct sk_buff *skb);
 int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci);
 struct sk_buff *pskb_extract(struct sk_buff *skb, int off, int to_copy,
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 7bf82a28e10a..6c22351bd519 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -4522,8 +4522,10 @@ int skb_ensure_writable(struct sk_buff *skb, int write_len)
 }
 EXPORT_SYMBOL(skb_ensure_writable);
 
-/* remove VLAN header from packet and update csum accordingly. */
-static int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci)
+/* remove VLAN header from packet and update csum accordingly.
+ * expects a non skb_vlan_tag_present skb with a vlan tag payload
+ */
+int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci)
 {
 	struct vlan_hdr *vhdr;
 	unsigned int offset = skb->data - skb_mac_header(skb);
@@ -4554,6 +4556,7 @@ pull:
 	return err;
 }
+EXPORT_SYMBOL(__skb_vlan_pop);
 
 int skb_vlan_pop(struct sk_buff *skb)
 {
-- cgit v1.2.3
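Editorial note (illustration, not part of the patch): with __skb_vlan_pop() exported, kernel code outside net/core can pop a VLAN header that is still inline in the payload while leaving any hardware-accelerated tag alone. The function and variable names below are invented; only __skb_vlan_pop(), skb_vlan_tag_present() and the ETH_P_8021Q/ETH_P_8021AD constants come from the kernel.

	#include <linux/skbuff.h>
	#include <linux/if_vlan.h>

	/* Hypothetical caller: strip one VLAN header from the payload. */
	static int example_strip_payload_vlan(struct sk_buff *skb)
	{
		u16 vlan_tci;
		int err;

		/* __skb_vlan_pop() expects the tag to still be inline, i.e.
		 * no hw-accelerated tag and an 802.1Q/802.1ad ethertype.
		 */
		if (skb_vlan_tag_present(skb))
			return 0;
		if (skb->protocol != htons(ETH_P_8021Q) &&
		    skb->protocol != htons(ETH_P_8021AD))
			return 0;

		err = __skb_vlan_pop(skb, &vlan_tci);
		if (err)
			return err;

		/* vlan_tci now holds the TCI of the removed tag; unlike
		 * skb_vlan_pop(), the next inline tag (if any) is not
		 * promoted into the hw accel tag.
		 */
		pr_debug("popped inline vlan, tci=0x%04x\n", vlan_tci);
		return 0;
	}
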
From 25869262ef7af24ccde988867ac3eb1c3d4b88d4 Mon Sep 17 00:00:00 2001
From: Al Viro
Date: Sat, 17 Sep 2016 21:02:10 -0400
Subject: skb_splice_bits(): get rid of callback

Since pipe_lock is now the outermost lock, we don't need to drop and
regain socket locks around the call of splice_to_pipe() from
skb_splice_bits(), which kills the need to have a socket-specific
callback; we can just call splice_to_pipe() and be done with it.

Signed-off-by: Al Viro
---
 include/linux/skbuff.h |  8 +-------
 net/core/skbuff.c      | 28 ++--------------------------
 net/ipv4/tcp.c         |  3 +--
 net/kcm/kcmsock.c      | 16 +---------------
 net/unix/af_unix.c     | 17 +----------------
 5 files changed, 6 insertions(+), 66 deletions(-)

(limited to 'include/linux/skbuff.h')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 0f665cb26b50..f520251ec43f 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3021,15 +3021,9 @@ int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len);
 int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len);
 __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, u8 *to,
 			      int len, __wsum csum);
-ssize_t skb_socket_splice(struct sock *sk,
-			  struct pipe_inode_info *pipe,
-			  struct splice_pipe_desc *spd);
 int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset,
 		    struct pipe_inode_info *pipe, unsigned int len,
-		    unsigned int flags,
-		    ssize_t (*splice_cb)(struct sock *,
-					 struct pipe_inode_info *,
-					 struct splice_pipe_desc *));
+		    unsigned int flags);
 void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to);
 unsigned int skb_zerocopy_headlen(const struct sk_buff *from);
 int skb_zerocopy(struct sk_buff *to, struct sk_buff *from,
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 3864b4b68fa1..208a9bc4af74 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1962,37 +1962,13 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
 	return false;
 }
 
-ssize_t skb_socket_splice(struct sock *sk,
-			  struct pipe_inode_info *pipe,
-			  struct splice_pipe_desc *spd)
-{
-	int ret;
-
-	/* Drop the socket lock, otherwise we have reverse
-	 * locking dependencies between sk_lock and i_mutex
-	 * here as compared to sendfile(). We enter here
-	 * with the socket lock held, and splice_to_pipe() will
-	 * grab the pipe inode lock. For sendfile() emulation,
-	 * we call into ->sendpage() with the i_mutex lock held
-	 * and networking will grab the socket lock.
-	 */
-	release_sock(sk);
-	ret = splice_to_pipe(pipe, spd);
-	lock_sock(sk);
-
-	return ret;
-}
-
 /*
  * Map data from the skb to a pipe. Should handle both the linear part,
  * the fragments, and the frag list.
  */
 int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset,
 		    struct pipe_inode_info *pipe, unsigned int tlen,
-		    unsigned int flags,
-		    ssize_t (*splice_cb)(struct sock *,
-					 struct pipe_inode_info *,
-					 struct splice_pipe_desc *))
+		    unsigned int flags)
 {
 	struct partial_page partial[MAX_SKB_FRAGS];
 	struct page *pages[MAX_SKB_FRAGS];
@@ -2009,7 +1985,7 @@ int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset,
 	__skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk);
 
 	if (spd.nr_pages)
-		ret = splice_cb(sk, pipe, &spd);
+		ret = splice_to_pipe(pipe, &spd);
 
 	return ret;
 }
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index ffbb218de520..ddd2179538f7 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -688,8 +688,7 @@ static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
 	int ret;
 
 	ret = skb_splice_bits(skb, skb->sk, offset, tss->pipe,
-			      min(rd_desc->count, len), tss->flags,
-			      skb_socket_splice);
+			      min(rd_desc->count, len), tss->flags);
 	if (ret > 0)
 		rd_desc->count -= ret;
 	return ret;
diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
index 411693288648..8a720ba50061 100644
--- a/net/kcm/kcmsock.c
+++ b/net/kcm/kcmsock.c
@@ -1462,19 +1462,6 @@ out:
 	return copied ? : err;
 }
 
-static ssize_t kcm_sock_splice(struct sock *sk,
-			       struct pipe_inode_info *pipe,
-			       struct splice_pipe_desc *spd)
-{
-	int ret;
-
-	release_sock(sk);
-	ret = splice_to_pipe(pipe, spd);
-	lock_sock(sk);
-
-	return ret;
-}
-
 static ssize_t kcm_splice_read(struct socket *sock, loff_t *ppos,
 			       struct pipe_inode_info *pipe, size_t len,
 			       unsigned int flags)
@@ -1504,8 +1491,7 @@ static ssize_t kcm_splice_read(struct socket *sock, loff_t *ppos,
 	if (len > rxm->full_len)
 		len = rxm->full_len;
 
-	copied = skb_splice_bits(skb, sk, rxm->offset, pipe, len, flags,
-				 kcm_sock_splice);
+	copied = skb_splice_bits(skb, sk, rxm->offset, pipe, len, flags);
 	if (copied < 0) {
 		err = copied;
 		goto err_out;
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 8309687a56b0..145082e2ba36 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -2475,28 +2475,13 @@ static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
 	return unix_stream_read_generic(&state);
 }
 
-static ssize_t skb_unix_socket_splice(struct sock *sk,
-				      struct pipe_inode_info *pipe,
-				      struct splice_pipe_desc *spd)
-{
-	int ret;
-	struct unix_sock *u = unix_sk(sk);
-
-	mutex_unlock(&u->iolock);
-	ret = splice_to_pipe(pipe, spd);
-	mutex_lock(&u->iolock);
-
-	return ret;
-}
-
 static int unix_stream_splice_actor(struct sk_buff *skb,
 				    int skip, int chunk,
 				    struct unix_stream_read_state *state)
 {
 	return skb_splice_bits(skb, state->socket->sk,
 			       UNIXCB(skb).consumed + skip,
-			       state->pipe, chunk, state->splice_flags,
-			       skb_unix_socket_splice);
+			       state->pipe, chunk, state->splice_flags);
 }
 
 static ssize_t unix_stream_splice_read(struct socket *sock, loff_t *ppos,
-- cgit v1.2.3
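Editorial note (illustration, not from this commit): after this change every caller follows the same shape as the tcp, kcm and af_unix hunks above. A hypothetical protocol's splice actor just forwards to skb_splice_bits() and lets it call splice_to_pipe() directly, since pipe_lock now nests outside the socket lock. The function name below is invented; the skb_splice_bits() prototype matches the trimmed declaration in the header diff above.

	#include <linux/skbuff.h>
	#include <net/sock.h>

	/* Hypothetical splice actor for some protocol's ->splice_read() path. */
	static int example_splice_actor(struct sk_buff *skb, struct sock *sk,
					unsigned int offset,
					struct pipe_inode_info *pipe,
					unsigned int len, unsigned int flags)
	{
		/* No socket-specific callback any more: skb_splice_bits() maps
		 * the skb data into the pipe and calls splice_to_pipe() itself.
		 */
		return skb_splice_bits(skb, sk, offset, pipe, len, flags);
	}
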