summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJakub Kicinski <kuba@kernel.org>2021-10-26 04:02:16 +0300
committerJakub Kicinski <kuba@kernel.org>2021-10-26 04:02:17 +0300
commite43b76abf768cb10880282990eca54c10d0ff40c (patch)
treeaafe4c0416c1ccb732d061d9401ca760fb773490
parentfd559a943e3ad919b47ee480e3edb230a818a6b1 (diff)
parent12c8691de30734a29b84cae35a4bf1cccca6ad0a (diff)
downloadlinux-e43b76abf768cb10880282990eca54c10d0ff40c.tar.xz
Merge branch 'tcp-receive-path-optimizations'
Eric Dumazet says: ==================== tcp: receive path optimizations This series aims to reduce cache line misses in RX path. I am still working on better cache locality in tcp_sock but this will wait few more weeks. ==================== Link: https://lore.kernel.org/r/20211025164825.259415-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
-rw-r--r--include/linux/ipv6.h1
-rw-r--r--include/net/busy_poll.h3
-rw-r--r--include/net/inet_sock.h3
-rw-r--r--include/net/ip.h2
-rw-r--r--include/net/ipv6.h1
-rw-r--r--include/net/sock.h21
-rw-r--r--net/ipv4/ip_sockglue.c11
-rw-r--r--net/ipv4/tcp_ipv4.c25
-rw-r--r--net/ipv6/ipv6_sockglue.c11
-rw-r--r--net/ipv6/tcp_ipv6.c35
-rw-r--r--net/ipv6/udp.c4
11 files changed, 79 insertions, 38 deletions
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index ef4a69865737..c383630d3f06 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -282,7 +282,6 @@ struct ipv6_pinfo {
__be32 rcv_flowinfo;
__u32 dst_cookie;
- __u32 rx_dst_cookie;
struct ipv6_mc_socklist __rcu *ipv6_mc_list;
struct ipv6_ac_socklist *ipv6_ac_list;
diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h
index 40296ed976a9..4202c609bb0b 100644
--- a/include/net/busy_poll.h
+++ b/include/net/busy_poll.h
@@ -130,7 +130,8 @@ static inline void skb_mark_napi_id(struct sk_buff *skb,
static inline void sk_mark_napi_id(struct sock *sk, const struct sk_buff *skb)
{
#ifdef CONFIG_NET_RX_BUSY_POLL
- WRITE_ONCE(sk->sk_napi_id, skb->napi_id);
+ if (unlikely(READ_ONCE(sk->sk_napi_id) != skb->napi_id))
+ WRITE_ONCE(sk->sk_napi_id, skb->napi_id);
#endif
sk_rx_queue_set(sk, skb);
}
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 89163ef8cf4b..9e1111f5915b 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -207,11 +207,10 @@ struct inet_sock {
__be32 inet_saddr;
__s16 uc_ttl;
__u16 cmsg_flags;
+ struct ip_options_rcu __rcu *inet_opt;
__be16 inet_sport;
__u16 inet_id;
- struct ip_options_rcu __rcu *inet_opt;
- int rx_dst_ifindex;
__u8 tos;
__u8 min_ttl;
__u8 mc_ttl;
diff --git a/include/net/ip.h b/include/net/ip.h
index cf229a531194..b71e88507c4a 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -24,6 +24,7 @@
#include <linux/skbuff.h>
#include <linux/jhash.h>
#include <linux/sockptr.h>
+#include <linux/static_key.h>
#include <net/inet_sock.h>
#include <net/route.h>
@@ -750,6 +751,7 @@ void ip_cmsg_recv_offset(struct msghdr *msg, struct sock *sk,
struct sk_buff *skb, int tlen, int offset);
int ip_cmsg_send(struct sock *sk, struct msghdr *msg,
struct ipcm_cookie *ipc, bool allow_ipv6);
+DECLARE_STATIC_KEY_FALSE(ip4_min_ttl);
int ip_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
unsigned int optlen);
int ip_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index f2d0ecc257bb..c19bf51ded1d 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -1092,6 +1092,7 @@ struct in6_addr *fl6_update_dst(struct flowi6 *fl6,
/*
* socket options (ipv6_sockglue.c)
*/
+DECLARE_STATIC_KEY_FALSE(ip6_min_hopcount);
int ipv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
unsigned int optlen);
diff --git a/include/net/sock.h b/include/net/sock.h
index 596ba85611bc..b76be30674ef 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -259,6 +259,8 @@ struct bpf_local_storage;
* @sk_rcvbuf: size of receive buffer in bytes
* @sk_wq: sock wait queue and async head
* @sk_rx_dst: receive input route used by early demux
+ * @sk_rx_dst_ifindex: ifindex for @sk_rx_dst
+ * @sk_rx_dst_cookie: cookie for @sk_rx_dst
* @sk_dst_cache: destination cache
* @sk_dst_pending_confirm: need to confirm neighbour
* @sk_policy: flow policy
@@ -430,6 +432,9 @@ struct sock {
struct xfrm_policy __rcu *sk_policy[2];
#endif
struct dst_entry *sk_rx_dst;
+ int sk_rx_dst_ifindex;
+ u32 sk_rx_dst_cookie;
+
struct dst_entry __rcu *sk_dst_cache;
atomic_t sk_omem_alloc;
int sk_sndbuf;
@@ -1911,10 +1916,8 @@ static inline void sk_rx_queue_set(struct sock *sk, const struct sk_buff *skb)
if (skb_rx_queue_recorded(skb)) {
u16 rx_queue = skb_get_rx_queue(skb);
- if (WARN_ON_ONCE(rx_queue == NO_QUEUE_MAPPING))
- return;
-
- sk->sk_rx_queue_mapping = rx_queue;
+ if (unlikely(READ_ONCE(sk->sk_rx_queue_mapping) != rx_queue))
+ WRITE_ONCE(sk->sk_rx_queue_mapping, rx_queue);
}
#endif
}
@@ -1922,15 +1925,19 @@ static inline void sk_rx_queue_set(struct sock *sk, const struct sk_buff *skb)
static inline void sk_rx_queue_clear(struct sock *sk)
{
#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
- sk->sk_rx_queue_mapping = NO_QUEUE_MAPPING;
+ WRITE_ONCE(sk->sk_rx_queue_mapping, NO_QUEUE_MAPPING);
#endif
}
static inline int sk_rx_queue_get(const struct sock *sk)
{
#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
- if (sk && sk->sk_rx_queue_mapping != NO_QUEUE_MAPPING)
- return sk->sk_rx_queue_mapping;
+ if (sk) {
+ int res = READ_ONCE(sk->sk_rx_queue_mapping);
+
+ if (res != NO_QUEUE_MAPPING)
+ return res;
+ }
#endif
return -1;
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index b297bb28556e..38d29b175ca6 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -886,6 +886,8 @@ static int compat_ip_mcast_join_leave(struct sock *sk, int optname,
return ip_mc_leave_group(sk, &mreq);
}
+DEFINE_STATIC_KEY_FALSE(ip4_min_ttl);
+
static int do_ip_setsockopt(struct sock *sk, int level, int optname,
sockptr_t optval, unsigned int optlen)
{
@@ -1352,7 +1354,14 @@ static int do_ip_setsockopt(struct sock *sk, int level, int optname,
goto e_inval;
if (val < 0 || val > 255)
goto e_inval;
- inet->min_ttl = val;
+
+ if (val)
+ static_branch_enable(&ip4_min_ttl);
+
+ /* tcp_v4_err() and tcp_v4_rcv() might read min_ttl
+ * while we are changint it.
+ */
+ WRITE_ONCE(inet->min_ttl, val);
break;
default:
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 5e6d8cb82ce5..13d868c43284 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -508,9 +508,12 @@ int tcp_v4_err(struct sk_buff *skb, u32 info)
if (sk->sk_state == TCP_CLOSE)
goto out;
- if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
- __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
- goto out;
+ if (static_branch_unlikely(&ip4_min_ttl)) {
+ /* min_ttl can be changed concurrently from do_ip_setsockopt() */
+ if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
+ __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
+ goto out;
+ }
}
tp = tcp_sk(sk);
@@ -1703,7 +1706,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
sock_rps_save_rxhash(sk, skb);
sk_mark_napi_id(sk, skb);
if (dst) {
- if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
+ if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
!INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
dst, 0)) {
dst_release(dst);
@@ -1788,7 +1791,7 @@ int tcp_v4_early_demux(struct sk_buff *skb)
if (dst)
dst = dst_check(dst, 0);
if (dst &&
- inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
+ sk->sk_rx_dst_ifindex == skb->skb_iif)
skb_dst_set_noref(skb, dst);
}
}
@@ -2068,9 +2071,13 @@ process:
return 0;
}
}
- if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
- __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
- goto discard_and_relse;
+
+ if (static_branch_unlikely(&ip4_min_ttl)) {
+ /* min_ttl can be changed concurrently from do_ip_setsockopt() */
+ if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
+ __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
+ goto discard_and_relse;
+ }
}
if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
@@ -2195,7 +2202,7 @@ void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
if (dst && dst_hold_safe(dst)) {
sk->sk_rx_dst = dst;
- inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
+ sk->sk_rx_dst_ifindex = skb->skb_iif;
}
}
EXPORT_SYMBOL(inet_sk_rx_dst_set);
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index e4bdb09c5586..41efca817db4 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -55,6 +55,8 @@
struct ip6_ra_chain *ip6_ra_chain;
DEFINE_RWLOCK(ip6_ra_lock);
+DEFINE_STATIC_KEY_FALSE(ip6_min_hopcount);
+
int ip6_ra_control(struct sock *sk, int sel)
{
struct ip6_ra_chain *ra, *new_ra, **rap;
@@ -950,7 +952,14 @@ done:
goto e_inval;
if (val < 0 || val > 255)
goto e_inval;
- np->min_hopcount = val;
+
+ if (val)
+ static_branch_enable(&ip6_min_hopcount);
+
+ /* tcp_v6_err() and tcp_v6_rcv() might read min_hopcount
+ * while we are changing it.
+ */
+ WRITE_ONCE(np->min_hopcount, val);
retv = 0;
break;
case IPV6_DONTFRAG:
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 1ef27cd7dbff..c678e778c1fb 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -108,8 +108,8 @@ static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
const struct rt6_info *rt = (const struct rt6_info *)dst;
sk->sk_rx_dst = dst;
- inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
- tcp_inet6_sk(sk)->rx_dst_cookie = rt6_get_cookie(rt);
+ sk->sk_rx_dst_ifindex = skb->skb_iif;
+ sk->sk_rx_dst_cookie = rt6_get_cookie(rt);
}
}
@@ -414,9 +414,12 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
if (sk->sk_state == TCP_CLOSE)
goto out;
- if (ipv6_hdr(skb)->hop_limit < tcp_inet6_sk(sk)->min_hopcount) {
- __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
- goto out;
+ if (static_branch_unlikely(&ip6_min_hopcount)) {
+ /* min_hopcount can be changed concurrently from do_ipv6_setsockopt() */
+ if (ipv6_hdr(skb)->hop_limit < READ_ONCE(tcp_inet6_sk(sk)->min_hopcount)) {
+ __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
+ goto out;
+ }
}
tp = tcp_sk(sk);
@@ -569,7 +572,7 @@ done:
static void tcp_v6_reqsk_destructor(struct request_sock *req)
{
kfree(inet_rsk(req)->ipv6_opt);
- kfree_skb(inet_rsk(req)->pktopts);
+ consume_skb(inet_rsk(req)->pktopts);
}
#ifdef CONFIG_TCP_MD5SIG
@@ -1509,9 +1512,9 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
sock_rps_save_rxhash(sk, skb);
sk_mark_napi_id(sk, skb);
if (dst) {
- if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
+ if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
INDIRECT_CALL_1(dst->ops->check, ip6_dst_check,
- dst, np->rx_dst_cookie) == NULL) {
+ dst, sk->sk_rx_dst_cookie) == NULL) {
dst_release(dst);
sk->sk_rx_dst = NULL;
}
@@ -1591,7 +1594,7 @@ ipv6_pktoptions:
}
}
- kfree_skb(opt_skb);
+ consume_skb(opt_skb);
return 0;
}
@@ -1726,9 +1729,13 @@ process:
return 0;
}
}
- if (hdr->hop_limit < tcp_inet6_sk(sk)->min_hopcount) {
- __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
- goto discard_and_relse;
+
+ if (static_branch_unlikely(&ip6_min_hopcount)) {
+ /* min_hopcount can be changed concurrently from do_ipv6_setsockopt() */
+ if (hdr->hop_limit < READ_ONCE(tcp_inet6_sk(sk)->min_hopcount)) {
+ __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
+ goto discard_and_relse;
+ }
}
if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb))
@@ -1872,9 +1879,9 @@ INDIRECT_CALLABLE_SCOPE void tcp_v6_early_demux(struct sk_buff *skb)
struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
if (dst)
- dst = dst_check(dst, tcp_inet6_sk(sk)->rx_dst_cookie);
+ dst = dst_check(dst, sk->sk_rx_dst_cookie);
if (dst &&
- inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
+ sk->sk_rx_dst_ifindex == skb->skb_iif)
skb_dst_set_noref(skb, dst);
}
}
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 8d785232b479..14a94cddcf0b 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -884,7 +884,7 @@ static void udp6_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst)
if (udp_sk_rx_dst_set(sk, dst)) {
const struct rt6_info *rt = (const struct rt6_info *)dst;
- inet6_sk(sk)->rx_dst_cookie = rt6_get_cookie(rt);
+ sk->sk_rx_dst_cookie = rt6_get_cookie(rt);
}
}
@@ -1073,7 +1073,7 @@ INDIRECT_CALLABLE_SCOPE void udp_v6_early_demux(struct sk_buff *skb)
dst = READ_ONCE(sk->sk_rx_dst);
if (dst)
- dst = dst_check(dst, inet6_sk(sk)->rx_dst_cookie);
+ dst = dst_check(dst, sk->sk_rx_dst_cookie);
if (dst) {
/* set noref for now.
* any place which wants to hold dst has to call