From 7c268eaeec6388b7bee36aef3fb5e62c9222ad3b Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 14 Oct 2025 23:54:55 +0000 Subject: net: Allow opt-out from global protocol memory accounting. Some protocols (e.g., TCP, UDP) implement memory accounting for socket buffers and charge memory to per-protocol global counters pointed to by sk->sk_proto->memory_allocated. Sometimes, system processes do not want that limitation. For a similar purpose, there is SO_RESERVE_MEM for sockets under memcg. Also, by opting out of the per-protocol accounting, sockets under memcg can avoid paying costs for two orthogonal memory accounting mechanisms. A microbenchmark result is in the subsequent bpf patch. Let's allow opt-out from the per-protocol memory accounting if sk->sk_bypass_prot_mem is true. sk->sk_bypass_prot_mem and sk->sk_prot are placed in the same cache line, and sk_has_account() always fetches sk->sk_prot before accessing sk->sk_bypass_prot_mem, so there is no extra cache miss for this patch. The following patches will set sk->sk_bypass_prot_mem to true, and then, the per-protocol memory accounting will be skipped. Note that this does NOT disable memcg, but rather the per-protocol one. Another option not to use the hole in struct sock_common is create sk_prot variants like tcp_prot_bypass, but this would complicate SOCKMAP logic, tcp_bpf_prots etc. Signed-off-by: Kuniyuki Iwashima Signed-off-by: Martin KaFai Lau Reviewed-by: Shakeel Butt Reviewed-by: Eric Dumazet Acked-by: Roman Gushchin Link: https://patch.msgid.link/20251014235604.3057003-3-kuniyu@google.com --- include/net/proto_memory.h | 3 +++ include/net/sock.h | 3 +++ include/net/tcp.h | 3 +++ 3 files changed, 9 insertions(+) (limited to 'include') diff --git a/include/net/proto_memory.h b/include/net/proto_memory.h index 8e91a8fa31b5..ad6d703ce6fe 100644 --- a/include/net/proto_memory.h +++ b/include/net/proto_memory.h @@ -35,6 +35,9 @@ static inline bool sk_under_memory_pressure(const struct sock *sk) mem_cgroup_sk_under_memory_pressure(sk)) return true; + if (sk->sk_bypass_prot_mem) + return false; + return !!READ_ONCE(*sk->sk_prot->memory_pressure); } diff --git a/include/net/sock.h b/include/net/sock.h index 30ac2eb4ef9b..415e7381aa50 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -118,6 +118,7 @@ typedef __u64 __bitwise __addrpair; * @skc_reuseport: %SO_REUSEPORT setting * @skc_ipv6only: socket is IPV6 only * @skc_net_refcnt: socket is using net ref counting + * @skc_bypass_prot_mem: bypass the per-protocol memory accounting for skb * @skc_bound_dev_if: bound device index if != 0 * @skc_bind_node: bind hash linkage for various protocol lookup tables * @skc_portaddr_node: second hash linkage for UDP/UDP-Lite protocol @@ -174,6 +175,7 @@ struct sock_common { unsigned char skc_reuseport:1; unsigned char skc_ipv6only:1; unsigned char skc_net_refcnt:1; + unsigned char skc_bypass_prot_mem:1; int skc_bound_dev_if; union { struct hlist_node skc_bind_node; @@ -381,6 +383,7 @@ struct sock { #define sk_reuseport __sk_common.skc_reuseport #define sk_ipv6only __sk_common.skc_ipv6only #define sk_net_refcnt __sk_common.skc_net_refcnt +#define sk_bypass_prot_mem __sk_common.skc_bypass_prot_mem #define sk_bound_dev_if __sk_common.skc_bound_dev_if #define sk_bind_node __sk_common.skc_bind_node #define sk_prot __sk_common.skc_prot diff --git a/include/net/tcp.h b/include/net/tcp.h index 1e547138f4fb..439e327fdbfa 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -303,6 +303,9 @@ static inline bool tcp_under_memory_pressure(const struct sock *sk) mem_cgroup_sk_under_memory_pressure(sk)) return true; + if (sk->sk_bypass_prot_mem) + return false; + return READ_ONCE(tcp_memory_pressure); } /* -- cgit v1.2.3 From b46ab63181ff973ddce44ebc9ac24b269d42f481 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 14 Oct 2025 23:54:56 +0000 Subject: net: Introduce net.core.bypass_prot_mem sysctl. If a socket has sk->sk_bypass_prot_mem flagged, the socket opts out of the global protocol memory accounting. Let's control the flag by a new sysctl knob. The flag is written once during socket(2) and is inherited to child sockets. Tested with a script that creates local socket pairs and send()s a bunch of data without recv()ing. Setup: # mkdir /sys/fs/cgroup/test # echo $$ >> /sys/fs/cgroup/test/cgroup.procs # sysctl -q net.ipv4.tcp_mem="1000 1000 1000" # ulimit -n 524288 Without net.core.bypass_prot_mem, charged to tcp_mem & memcg # python3 pressure.py & # cat /sys/fs/cgroup/test/memory.stat | grep sock sock 22642688 <-------------------------------------- charged to memcg # cat /proc/net/sockstat| grep TCP TCP: inuse 2006 orphan 0 tw 0 alloc 2008 mem 5376 <-- charged to tcp_mem # ss -tn | head -n 5 State Recv-Q Send-Q Local Address:Port Peer Address:Port ESTAB 2000 0 127.0.0.1:34479 127.0.0.1:53188 ESTAB 2000 0 127.0.0.1:34479 127.0.0.1:49972 ESTAB 2000 0 127.0.0.1:34479 127.0.0.1:53868 ESTAB 2000 0 127.0.0.1:34479 127.0.0.1:53554 # nstat | grep Pressure || echo no pressure TcpExtTCPMemoryPressures 1 0.0 With net.core.bypass_prot_mem=1, charged to memcg only: # sysctl -q net.core.bypass_prot_mem=1 # python3 pressure.py & # cat /sys/fs/cgroup/test/memory.stat | grep sock sock 2757468160 <------------------------------------ charged to memcg # cat /proc/net/sockstat | grep TCP TCP: inuse 2006 orphan 0 tw 0 alloc 2008 mem 0 <- NOT charged to tcp_mem # ss -tn | head -n 5 State Recv-Q Send-Q Local Address:Port Peer Address:Port ESTAB 111000 0 127.0.0.1:36019 127.0.0.1:49026 ESTAB 110000 0 127.0.0.1:36019 127.0.0.1:45630 ESTAB 110000 0 127.0.0.1:36019 127.0.0.1:44870 ESTAB 111000 0 127.0.0.1:36019 127.0.0.1:45274 # nstat | grep Pressure || echo no pressure no pressure Signed-off-by: Kuniyuki Iwashima Signed-off-by: Martin KaFai Lau Reviewed-by: Shakeel Butt Reviewed-by: Eric Dumazet Acked-by: Roman Gushchin Link: https://patch.msgid.link/20251014235604.3057003-4-kuniyu@google.com --- Documentation/admin-guide/sysctl/net.rst | 8 ++++++++ include/net/netns/core.h | 1 + net/core/sock.c | 5 +++++ net/core/sysctl_net_core.c | 9 +++++++++ 4 files changed, 23 insertions(+) (limited to 'include') diff --git a/Documentation/admin-guide/sysctl/net.rst b/Documentation/admin-guide/sysctl/net.rst index 40749b3cd356..991773dcb9cf 100644 --- a/Documentation/admin-guide/sysctl/net.rst +++ b/Documentation/admin-guide/sysctl/net.rst @@ -212,6 +212,14 @@ mem_pcpu_rsv Per-cpu reserved forward alloc cache size in page units. Default 1MB per CPU. +bypass_prot_mem +--------------- + +Skip charging socket buffers to the global per-protocol memory +accounting controlled by net.ipv4.tcp_mem, net.ipv4.udp_mem, etc. + +Default: 0 (off) + rmem_default ------------ diff --git a/include/net/netns/core.h b/include/net/netns/core.h index cb9c3e4cd738..9ef3d70e5e9c 100644 --- a/include/net/netns/core.h +++ b/include/net/netns/core.h @@ -17,6 +17,7 @@ struct netns_core { int sysctl_optmem_max; u8 sysctl_txrehash; u8 sysctl_tstamp_allow_data; + u8 sysctl_bypass_prot_mem; #ifdef CONFIG_PROC_FS struct prot_inuse __percpu *prot_inuse; diff --git a/net/core/sock.c b/net/core/sock.c index 5bf208579c02..b78533fb9268 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2306,8 +2306,13 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, * why we need sk_prot_creator -acme */ sk->sk_prot = sk->sk_prot_creator = prot; + + if (READ_ONCE(net->core.sysctl_bypass_prot_mem)) + sk->sk_bypass_prot_mem = 1; + sk->sk_kern_sock = kern; sock_lock_init(sk); + sk->sk_net_refcnt = kern ? 0 : 1; if (likely(sk->sk_net_refcnt)) { get_net_track(net, &sk->ns_tracker, priority); diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index f79137826d7f..8d4decb2606f 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -683,6 +683,15 @@ static struct ctl_table netns_core_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE }, + { + .procname = "bypass_prot_mem", + .data = &init_net.core.sysctl_bypass_prot_mem, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE + }, /* sysctl_core_net_init() will set the values after this * to readonly in network namespaces */ -- cgit v1.2.3 From 38163af068810b388f6723a681dfd8c7b3680d38 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 14 Oct 2025 23:54:58 +0000 Subject: bpf: Introduce SK_BPF_BYPASS_PROT_MEM. If a socket has sk->sk_bypass_prot_mem flagged, the socket opts out of the global protocol memory accounting. This is easily controlled by net.core.bypass_prot_mem sysctl, but it lacks flexibility. Let's support flagging (and clearing) sk->sk_bypass_prot_mem via bpf_setsockopt() at the BPF_CGROUP_INET_SOCK_CREATE hook. int val = 1; bpf_setsockopt(ctx, SOL_SOCKET, SK_BPF_BYPASS_PROT_MEM, &val, sizeof(val)); As with net.core.bypass_prot_mem, this is inherited to child sockets, and BPF always takes precedence over sysctl at socket(2) and accept(2). SK_BPF_BYPASS_PROT_MEM is only supported at BPF_CGROUP_INET_SOCK_CREATE and not supported on other hooks for some reasons: 1. UDP charges memory under sk->sk_receive_queue.lock instead of lock_sock() 2. Modifying the flag after skb is charged to sk requires such adjustment during bpf_setsockopt() and complicates the logic unnecessarily We can support other hooks later if a real use case justifies that. Most changes are inline and hard to trace, but a microbenchmark on __sk_mem_raise_allocated() during neper/tcp_stream showed that more samples completed faster with sk->sk_bypass_prot_mem == 1. This will be more visible under tcp_mem pressure (but it's not a fair comparison). # bpftrace -e 'kprobe:__sk_mem_raise_allocated { @start[tid] = nsecs; } kretprobe:__sk_mem_raise_allocated /@start[tid]/ { @end[tid] = nsecs - @start[tid]; @times = hist(@end[tid]); delete(@start[tid]); }' # tcp_stream -6 -F 1000 -N -T 256 Without bpf prog: [128, 256) 3846 | | [256, 512) 1505326 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| [512, 1K) 1371006 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | [1K, 2K) 198207 |@@@@@@ | [2K, 4K) 31199 |@ | With bpf prog in the next patch: (must be attached before tcp_stream) # bpftool prog load sk_bypass_prot_mem.bpf.o /sys/fs/bpf/test type cgroup/sock_create # bpftool cgroup attach /sys/fs/cgroup/test cgroup_inet_sock_create pinned /sys/fs/bpf/test [128, 256) 6413 | | [256, 512) 1868425 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| [512, 1K) 1101697 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | [1K, 2K) 117031 |@@@@ | [2K, 4K) 11773 | | Signed-off-by: Kuniyuki Iwashima Signed-off-by: Martin KaFai Lau Acked-by: Roman Gushchin Link: https://patch.msgid.link/20251014235604.3057003-6-kuniyu@google.com --- include/uapi/linux/bpf.h | 2 ++ net/core/filter.c | 37 +++++++++++++++++++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 1 + 3 files changed, 40 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6829936d33f5..6eb75ad900b1 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -7200,6 +7200,8 @@ enum { TCP_BPF_SYN_MAC = 1007, /* Copy the MAC, IP[46], and TCP header */ TCP_BPF_SOCK_OPS_CB_FLAGS = 1008, /* Get or Set TCP sock ops flags */ SK_BPF_CB_FLAGS = 1009, /* Get or set sock ops flags in socket */ + SK_BPF_BYPASS_PROT_MEM = 1010, /* Get or Set sk->sk_bypass_prot_mem */ + }; enum { diff --git a/net/core/filter.c b/net/core/filter.c index ed3f0e536059..16105f52927d 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5733,9 +5733,37 @@ static const struct bpf_func_proto bpf_sock_addr_getsockopt_proto = { .arg5_type = ARG_CONST_SIZE, }; +static int sk_bpf_set_get_bypass_prot_mem(struct sock *sk, + char *optval, int optlen, + bool getopt) +{ + int val; + + if (optlen != sizeof(int)) + return -EINVAL; + + if (!sk_has_account(sk)) + return -EOPNOTSUPP; + + if (getopt) { + *(int *)optval = sk->sk_bypass_prot_mem; + return 0; + } + + val = *(int *)optval; + if (val < 0 || val > 1) + return -EINVAL; + + sk->sk_bypass_prot_mem = val; + return 0; +} + BPF_CALL_5(bpf_sock_create_setsockopt, struct sock *, sk, int, level, int, optname, char *, optval, int, optlen) { + if (level == SOL_SOCKET && optname == SK_BPF_BYPASS_PROT_MEM) + return sk_bpf_set_get_bypass_prot_mem(sk, optval, optlen, false); + return __bpf_setsockopt(sk, level, optname, optval, optlen); } @@ -5753,6 +5781,15 @@ static const struct bpf_func_proto bpf_sock_create_setsockopt_proto = { BPF_CALL_5(bpf_sock_create_getsockopt, struct sock *, sk, int, level, int, optname, char *, optval, int, optlen) { + if (level == SOL_SOCKET && optname == SK_BPF_BYPASS_PROT_MEM) { + int err = sk_bpf_set_get_bypass_prot_mem(sk, optval, optlen, true); + + if (err) + memset(optval, 0, optlen); + + return err; + } + return __bpf_getsockopt(sk, level, optname, optval, optlen); } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 6829936d33f5..9b17d937edf7 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -7200,6 +7200,7 @@ enum { TCP_BPF_SYN_MAC = 1007, /* Copy the MAC, IP[46], and TCP header */ TCP_BPF_SOCK_OPS_CB_FLAGS = 1008, /* Get or Set TCP sock ops flags */ SK_BPF_CB_FLAGS = 1009, /* Get or set sock ops flags in socket */ + SK_BPF_BYPASS_PROT_MEM = 1010, /* Get or Set sk->sk_bypass_prot_mem */ }; enum { -- cgit v1.2.3