summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2021-03-31 03:06:50 +0300
committerDavid S. Miller <davem@davemloft.net>2021-03-31 03:06:50 +0300
commitdf82e9c6dd8426577bf02392928c51c8e7ed30e1 (patch)
tree77d847de135f17ddea9d6e36089f33d8e16cfaf4
parentdc5fa2073f63965faad232af0adf9b7e50b5f340 (diff)
parenta062260a9d5fce22977744ed6dab64b7b1508ab3 (diff)
downloadlinux-df82e9c6dd8426577bf02392928c51c8e7ed30e1.tar.xz
Merge branch 'udp-gro-L4'
Paolo Abeni says: ==================== udp: GRO L4 improvements This series improves the UDP L4 - either 'forward' or 'frag_list' - co-existence with UDP tunnel GRO, allowing the first to take place correctly even for encapsulated UDP traffic. The first for patches are mostly bugfixes, addressing some GRO edge-cases when both tunnels and L4 are present, enabled and in use. The next 3 patches avoid unneeded segmentation when UDP GRO traffic traverses in the receive path UDP tunnels. Finally, some self-tests are included, covering the relevant GRO scenarios. Even if most patches are actually bugfixes, this series is targeting net-next, as overall it makes available a new feature. v2 -> v3: - no code changes, more verbose commit messages and comment in patch 1/8 v1 -> v2: - restrict post segmentation csum fixup to the only the relevant pkts - use individual 'accept_gso_type' fields instead of whole gso bitmask (Willem) - use only ipv6 addesses from test range in self-tests (Willem) - hopefully clarified most individual patches commit messages ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--drivers/net/bareudp.c1
-rw-r--r--drivers/net/geneve.c1
-rw-r--r--drivers/net/vxlan.c1
-rw-r--r--include/linux/udp.h22
-rw-r--r--include/net/udp.h23
-rw-r--r--net/ipv4/udp.c5
-rw-r--r--net/ipv4/udp_offload.c27
-rw-r--r--net/ipv6/udp.c1
-rw-r--r--net/ipv6/udp_offload.c3
-rw-r--r--tools/testing/selftests/net/Makefile1
-rwxr-xr-xtools/testing/selftests/net/udpgro_fwd.sh251
11 files changed, 323 insertions, 13 deletions
diff --git a/drivers/net/bareudp.c b/drivers/net/bareudp.c
index 7511bca9c15e..edfad93e7b68 100644
--- a/drivers/net/bareudp.c
+++ b/drivers/net/bareudp.c
@@ -218,6 +218,7 @@ static struct socket *bareudp_create_sock(struct net *net, __be16 port)
if (err < 0)
return ERR_PTR(err);
+ udp_allow_gso(sock->sk);
return sock;
}
diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index 4ac0373326ef..5d7a2b1469f4 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -461,6 +461,7 @@ static struct socket *geneve_create_sock(struct net *net, bool ipv6,
if (err < 0)
return ERR_PTR(err);
+ udp_allow_gso(sock->sk);
return sock;
}
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 7665817f3cb6..39ee1300cdd9 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -3484,6 +3484,7 @@ static struct socket *vxlan_create_sock(struct net *net, bool ipv6,
if (err < 0)
return ERR_PTR(err);
+ udp_allow_gso(sock->sk);
return sock;
}
diff --git a/include/linux/udp.h b/include/linux/udp.h
index aa84597bdc33..ae66dadd8543 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -51,7 +51,9 @@ struct udp_sock {
* different encapsulation layer set
* this
*/
- gro_enabled:1; /* Can accept GRO packets */
+ gro_enabled:1, /* Request GRO aggregation */
+ accept_udp_l4:1,
+ accept_udp_fraglist:1;
/*
* Following member retains the information to create a UDP header
* when the socket is uncorked.
@@ -131,8 +133,22 @@ static inline void udp_cmsg_recv(struct msghdr *msg, struct sock *sk,
static inline bool udp_unexpected_gso(struct sock *sk, struct sk_buff *skb)
{
- return !udp_sk(sk)->gro_enabled && skb_is_gso(skb) &&
- skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4;
+ if (!skb_is_gso(skb))
+ return false;
+
+ if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4 && !udp_sk(sk)->accept_udp_l4)
+ return true;
+
+ if (skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST && !udp_sk(sk)->accept_udp_fraglist)
+ return true;
+
+ return false;
+}
+
+static inline void udp_allow_gso(struct sock *sk)
+{
+ udp_sk(sk)->accept_udp_l4 = 1;
+ udp_sk(sk)->accept_udp_fraglist = 1;
}
#define udp_portaddr_for_each_entry(__sk, list) \
diff --git a/include/net/udp.h b/include/net/udp.h
index d4d064c59232..adf2ff8ac87c 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -515,6 +515,29 @@ static inline struct sk_buff *udp_rcv_segment(struct sock *sk,
return segs;
}
+static inline void udp_post_segment_fix_csum(struct sk_buff *skb)
+{
+ /* UDP-lite can't land here - no GRO */
+ WARN_ON_ONCE(UDP_SKB_CB(skb)->partial_cov);
+
+ /* UDP packets generated with UDP_SEGMENT and traversing:
+ *
+ * UDP tunnel(xmit) -> veth (segmentation) -> veth (gro) -> UDP tunnel (rx)
+ *
+ * can reach an UDP socket with CHECKSUM_NONE, because
+ * __iptunnel_pull_header() converts CHECKSUM_PARTIAL into NONE.
+ * SKB_GSO_UDP_L4 or SKB_GSO_FRAGLIST packets with no UDP tunnel will
+ * have a valid checksum, as the GRO engine validates the UDP csum
+ * before the aggregation and nobody strips such info in between.
+ * Instead of adding another check in the tunnel fastpath, we can force
+ * a valid csum after the segmentation.
+ * Additionally fixup the UDP CB.
+ */
+ UDP_SKB_CB(skb)->cscov = skb->len;
+ if (skb->ip_summed == CHECKSUM_NONE && !skb->csum_valid)
+ skb->csum_valid = 1;
+}
+
#ifdef CONFIG_BPF_SYSCALL
struct sk_psock;
struct proto *udp_bpf_get_proto(struct sock *sk, struct sk_psock *psock);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 4a0478b17243..c0695ce42dc5 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2178,6 +2178,8 @@ static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
segs = udp_rcv_segment(sk, skb, true);
skb_list_walk_safe(segs, skb, next) {
__skb_pull(skb, skb_transport_offset(skb));
+
+ udp_post_segment_fix_csum(skb);
ret = udp_queue_rcv_one_skb(sk, skb);
if (ret > 0)
ip_protocol_deliver_rcu(dev_net(skb->dev), skb, ret);
@@ -2664,9 +2666,12 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
case UDP_GRO:
lock_sock(sk);
+
+ /* when enabling GRO, accept the related GSO packet type */
if (valbool)
udp_tunnel_encap_enable(sk->sk_socket);
up->gro_enabled = valbool;
+ up->accept_udp_l4 = valbool;
release_sock(sk);
break;
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index c5b4b586570f..54e06b88af69 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -515,21 +515,24 @@ struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb,
unsigned int off = skb_gro_offset(skb);
int flush = 1;
+ /* we can do L4 aggregation only if the packet can't land in a tunnel
+ * otherwise we could corrupt the inner stream
+ */
NAPI_GRO_CB(skb)->is_flist = 0;
- if (skb->dev->features & NETIF_F_GRO_FRAGLIST)
- NAPI_GRO_CB(skb)->is_flist = sk ? !udp_sk(sk)->gro_enabled: 1;
+ if (!sk || !udp_sk(sk)->gro_receive) {
+ if (skb->dev->features & NETIF_F_GRO_FRAGLIST)
+ NAPI_GRO_CB(skb)->is_flist = sk ? !udp_sk(sk)->gro_enabled : 1;
- if ((!sk && (skb->dev->features & NETIF_F_GRO_UDP_FWD)) ||
- (sk && udp_sk(sk)->gro_enabled) || NAPI_GRO_CB(skb)->is_flist) {
- pp = call_gro_receive(udp_gro_receive_segment, head, skb);
+ if ((!sk && (skb->dev->features & NETIF_F_GRO_UDP_FWD)) ||
+ (sk && udp_sk(sk)->gro_enabled) || NAPI_GRO_CB(skb)->is_flist)
+ pp = call_gro_receive(udp_gro_receive_segment, head, skb);
return pp;
}
- if (!sk || NAPI_GRO_CB(skb)->encap_mark ||
+ if (NAPI_GRO_CB(skb)->encap_mark ||
(uh->check && skb->ip_summed != CHECKSUM_PARTIAL &&
NAPI_GRO_CB(skb)->csum_cnt == 0 &&
- !NAPI_GRO_CB(skb)->csum_valid) ||
- !udp_sk(sk)->gro_receive)
+ !NAPI_GRO_CB(skb)->csum_valid))
goto out;
/* mark that this skb passed once through the tunnel gro layer */
@@ -639,6 +642,11 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff,
skb_shinfo(skb)->gso_type = uh->check ? SKB_GSO_UDP_TUNNEL_CSUM
: SKB_GSO_UDP_TUNNEL;
+ /* clear the encap mark, so that inner frag_list gro_complete
+ * can take place
+ */
+ NAPI_GRO_CB(skb)->encap_mark = 0;
+
/* Set encapsulation before calling into inner gro_complete()
* functions to make them set up the inner offsets.
*/
@@ -662,7 +670,8 @@ INDIRECT_CALLABLE_SCOPE int udp4_gro_complete(struct sk_buff *skb, int nhoff)
const struct iphdr *iph = ip_hdr(skb);
struct udphdr *uh = (struct udphdr *)(skb->data + nhoff);
- if (NAPI_GRO_CB(skb)->is_flist) {
+ /* do fraglist only if there is no outer UDP encap (or we already processed it) */
+ if (NAPI_GRO_CB(skb)->is_flist && !NAPI_GRO_CB(skb)->encap_mark) {
uh->len = htons(skb->len - nhoff);
skb_shinfo(skb)->gso_type |= (SKB_GSO_FRAGLIST|SKB_GSO_UDP_L4);
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index d25e5a9252fd..fa2f54738392 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -749,6 +749,7 @@ static int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
skb_list_walk_safe(segs, skb, next) {
__skb_pull(skb, skb_transport_offset(skb));
+ udp_post_segment_fix_csum(skb);
ret = udpv6_queue_rcv_one_skb(sk, skb);
if (ret > 0)
ip6_protocol_deliver_rcu(dev_net(skb->dev), skb, ret,
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index faa823c24292..b3d9ed96e5ea 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -163,7 +163,8 @@ INDIRECT_CALLABLE_SCOPE int udp6_gro_complete(struct sk_buff *skb, int nhoff)
const struct ipv6hdr *ipv6h = ipv6_hdr(skb);
struct udphdr *uh = (struct udphdr *)(skb->data + nhoff);
- if (NAPI_GRO_CB(skb)->is_flist) {
+ /* do fraglist only if there is no outer UDP encap (or we already processed it) */
+ if (NAPI_GRO_CB(skb)->is_flist && !NAPI_GRO_CB(skb)->encap_mark) {
uh->len = htons(skb->len - nhoff);
skb_shinfo(skb)->gso_type |= (SKB_GSO_FRAGLIST|SKB_GSO_UDP_L4);
diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index 25f198bec0b2..2d71b283dde3 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -23,6 +23,7 @@ TEST_PROGS += drop_monitor_tests.sh
TEST_PROGS += vrf_route_leaking.sh
TEST_PROGS += bareudp.sh
TEST_PROGS += unicast_extensions.sh
+TEST_PROGS += udpgro_fwd.sh
TEST_PROGS_EXTENDED := in_netns.sh
TEST_GEN_FILES = socket nettest
TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy reuseport_addr_any
diff --git a/tools/testing/selftests/net/udpgro_fwd.sh b/tools/testing/selftests/net/udpgro_fwd.sh
new file mode 100755
index 000000000000..a8fa64136282
--- /dev/null
+++ b/tools/testing/selftests/net/udpgro_fwd.sh
@@ -0,0 +1,251 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+
+readonly BASE="ns-$(mktemp -u XXXXXX)"
+readonly SRC=2
+readonly DST=1
+readonly DST_NAT=100
+readonly NS_SRC=$BASE$SRC
+readonly NS_DST=$BASE$DST
+
+# "baremetal" network used for raw UDP traffic
+readonly BM_NET_V4=192.168.1.
+readonly BM_NET_V6=2001:db8::
+
+# "overlay" network used for UDP over UDP tunnel traffic
+readonly OL_NET_V4=172.16.1.
+readonly OL_NET_V6=2001:db8:1::
+readonly NPROCS=`nproc`
+
+cleanup() {
+ local ns
+ local -r jobs="$(jobs -p)"
+ [ -n "${jobs}" ] && kill -1 ${jobs} 2>/dev/null
+
+ for ns in $NS_SRC $NS_DST; do
+ ip netns del $ns 2>/dev/null
+ done
+}
+
+trap cleanup EXIT
+
+create_ns() {
+ local net
+ local ns
+
+ for ns in $NS_SRC $NS_DST; do
+ ip netns add $ns
+ ip -n $ns link set dev lo up
+ done
+
+ ip link add name veth$SRC type veth peer name veth$DST
+
+ for ns in $SRC $DST; do
+ ip link set dev veth$ns netns $BASE$ns
+ ip -n $BASE$ns link set dev veth$ns up
+ ip -n $BASE$ns addr add dev veth$ns $BM_NET_V4$ns/24
+ ip -n $BASE$ns addr add dev veth$ns $BM_NET_V6$ns/64 nodad
+ done
+ ip -n $NS_DST link set veth$DST xdp object ../bpf/xdp_dummy.o section xdp_dummy 2>/dev/null
+}
+
+create_vxlan_endpoint() {
+ local -r netns=$1
+ local -r bm_dev=$2
+ local -r bm_rem_addr=$3
+ local -r vxlan_dev=$4
+ local -r vxlan_id=$5
+ local -r vxlan_port=4789
+
+ ip -n $netns link set dev $bm_dev up
+ ip -n $netns link add dev $vxlan_dev type vxlan id $vxlan_id \
+ dstport $vxlan_port remote $bm_rem_addr
+ ip -n $netns link set dev $vxlan_dev up
+}
+
+create_vxlan_pair() {
+ local ns
+
+ create_ns
+
+ for ns in $SRC $DST; do
+ # note that 3 - $SRC == $DST and 3 - $DST == $SRC
+ create_vxlan_endpoint $BASE$ns veth$ns $BM_NET_V4$((3 - $ns)) vxlan$ns 4
+ ip -n $BASE$ns addr add dev vxlan$ns $OL_NET_V4$ns/24
+ done
+ for ns in $SRC $DST; do
+ create_vxlan_endpoint $BASE$ns veth$ns $BM_NET_V6$((3 - $ns)) vxlan6$ns 6
+ ip -n $BASE$ns addr add dev vxlan6$ns $OL_NET_V6$ns/24 nodad
+ done
+}
+
+is_ipv6() {
+ if [[ $1 =~ .*:.* ]]; then
+ return 0
+ fi
+ return 1
+}
+
+run_test() {
+ local -r msg=$1
+ local -r dst=$2
+ local -r pkts=$3
+ local -r vxpkts=$4
+ local bind=$5
+ local rx_args=""
+ local rx_family="-4"
+ local family=-4
+ local filter=IpInReceives
+ local ipt=iptables
+
+ printf "%-40s" "$msg"
+
+ if is_ipv6 $dst; then
+ # rx program does not support '-6' and implies ipv6 usage by default
+ rx_family=""
+ family=-6
+ filter=Ip6InReceives
+ ipt=ip6tables
+ fi
+
+ rx_args="$rx_family"
+ [ -n "$bind" ] && rx_args="$rx_args -b $bind"
+
+ # send a single GSO packet, segmented in 10 UDP frames.
+ # Always expect 10 UDP frames on RX side as rx socket does
+ # not enable GRO
+ ip netns exec $NS_DST $ipt -A INPUT -p udp --dport 4789
+ ip netns exec $NS_DST $ipt -A INPUT -p udp --dport 8000
+ ip netns exec $NS_DST ./udpgso_bench_rx -C 1000 -R 10 -n 10 -l 1300 $rx_args &
+ local spid=$!
+ sleep 0.1
+ ip netns exec $NS_SRC ./udpgso_bench_tx $family -M 1 -s 13000 -S 1300 -D $dst
+ local retc=$?
+ wait $spid
+ local rets=$?
+ if [ ${rets} -ne 0 ] || [ ${retc} -ne 0 ]; then
+ echo " fail client exit code $retc, server $rets"
+ ret=1
+ return
+ fi
+
+ local rcv=`ip netns exec $NS_DST $ipt"-save" -c | grep 'dport 8000' | \
+ sed -e 's/\[//' -e 's/:.*//'`
+ if [ $rcv != $pkts ]; then
+ echo " fail - received $rvs packets, expected $pkts"
+ ret=1
+ return
+ fi
+
+ local vxrcv=`ip netns exec $NS_DST $ipt"-save" -c | grep 'dport 4789' | \
+ sed -e 's/\[//' -e 's/:.*//'`
+
+ # upper net can generate a little noise, allow some tolerance
+ if [ $vxrcv -lt $vxpkts -o $vxrcv -gt $((vxpkts + 3)) ]; then
+ echo " fail - received $vxrcv vxlan packets, expected $vxpkts"
+ ret=1
+ return
+ fi
+ echo " ok"
+}
+
+run_bench() {
+ local -r msg=$1
+ local -r dst=$2
+ local family=-4
+
+ printf "%-40s" "$msg"
+ if [ $NPROCS -lt 2 ]; then
+ echo " skip - needed 2 CPUs found $NPROCS"
+ return
+ fi
+
+ is_ipv6 $dst && family=-6
+
+ # bind the sender and the receiver to different CPUs to try
+ # get reproducible results
+ ip netns exec $NS_DST bash -c "echo 2 > /sys/class/net/veth$DST/queues/rx-0/rps_cpus"
+ ip netns exec $NS_DST taskset 0x2 ./udpgso_bench_rx -C 1000 -R 10 &
+ local spid=$!
+ sleep 0.1
+ ip netns exec $NS_SRC taskset 0x1 ./udpgso_bench_tx $family -l 3 -S 1300 -D $dst
+ local retc=$?
+ wait $spid
+ local rets=$?
+ if [ ${rets} -ne 0 ] || [ ${retc} -ne 0 ]; then
+ echo " fail client exit code $retc, server $rets"
+ ret=1
+ return
+ fi
+}
+
+for family in 4 6; do
+ BM_NET=$BM_NET_V4
+ OL_NET=$OL_NET_V4
+ IPT=iptables
+ SUFFIX=24
+ VXDEV=vxlan
+
+ if [ $family = 6 ]; then
+ BM_NET=$BM_NET_V6
+ OL_NET=$OL_NET_V6
+ SUFFIX="64 nodad"
+ VXDEV=vxlan6
+ IPT=ip6tables
+ fi
+
+ echo "IPv$family"
+
+ create_ns
+ run_test "No GRO" $BM_NET$DST 10 0
+ cleanup
+
+ create_ns
+ ip netns exec $NS_DST ethtool -K veth$DST rx-gro-list on
+ run_test "GRO frag list" $BM_NET$DST 1 0
+ cleanup
+
+ # UDP GRO fwd skips aggregation when find an udp socket with the GRO option
+ # if there is an UDP tunnel in the running system, such lookup happen
+ # take place.
+ # use NAT to circumvent GRO FWD check
+ create_ns
+ ip -n $NS_DST addr add dev veth$DST $BM_NET$DST_NAT/$SUFFIX
+ ip netns exec $NS_DST ethtool -K veth$DST rx-udp-gro-forwarding on
+ ip netns exec $NS_DST $IPT -t nat -I PREROUTING -d $BM_NET$DST_NAT \
+ -j DNAT --to-destination $BM_NET$DST
+ run_test "GRO fwd" $BM_NET$DST_NAT 1 0 $BM_NET$DST
+ cleanup
+
+ create_ns
+ run_bench "UDP fwd perf" $BM_NET$DST
+ ip netns exec $NS_DST ethtool -K veth$DST rx-udp-gro-forwarding on
+ run_bench "UDP GRO fwd perf" $BM_NET$DST
+ cleanup
+
+ create_vxlan_pair
+ ip netns exec $NS_DST ethtool -K veth$DST rx-gro-list on
+ run_test "GRO frag list over UDP tunnel" $OL_NET$DST 1 1
+ cleanup
+
+ # use NAT to circumvent GRO FWD check
+ create_vxlan_pair
+ ip -n $NS_DST addr add dev $VXDEV$DST $OL_NET$DST_NAT/$SUFFIX
+ ip netns exec $NS_DST ethtool -K veth$DST rx-udp-gro-forwarding on
+ ip netns exec $NS_DST $IPT -t nat -I PREROUTING -d $OL_NET$DST_NAT \
+ -j DNAT --to-destination $OL_NET$DST
+
+ # load arp cache before running the test to reduce the amount of
+ # stray traffic on top of the UDP tunnel
+ ip netns exec $NS_SRC ping -q -c 1 $OL_NET$DST_NAT >/dev/null
+ run_test "GRO fwd over UDP tunnel" $OL_NET$DST_NAT 1 1 $OL_NET$DST
+ cleanup
+
+ create_vxlan_pair
+ run_bench "UDP tunnel fwd perf" $OL_NET$DST
+ ip netns exec $NS_DST ethtool -K veth$DST rx-udp-gro-forwarding on
+ run_bench "UDP tunnel GRO fwd perf" $OL_NET$DST
+ cleanup
+done
+
+exit $ret