From 7865dfb1eb941ddd25802a9e13b6ff5f3f4dc02f Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 21 Dec 2023 15:23:24 -0800 Subject: bpf: sockmap, added comments describing update proto rules Add a comment describing that the psock update proto callbback can be called multiple times and this must be safe. Signed-off-by: John Fastabend Signed-off-by: Martin KaFai Lau Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/r/20231221232327.43678-3-john.fastabend@gmail.com --- include/linux/skmsg.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index c953b8c0d2f4..888a4b217829 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -100,6 +100,11 @@ struct sk_psock { void (*saved_close)(struct sock *sk, long timeout); void (*saved_write_space)(struct sock *sk); void (*saved_data_ready)(struct sock *sk); + /* psock_update_sk_prot may be called with restore=false many times + * so the handler must be safe for this case. It will be called + * exactly once with restore=true when the psock is being destroyed + * and psock refcnt is zero, but before an RCU grace period. + */ int (*psock_update_sk_prot)(struct sock *sk, struct sk_psock *psock, bool restore); struct proto *sk_proto; -- cgit v1.2.3 From 9fc8e802048ad150e8032c4f3dbf40112160cfe9 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 21 Dec 2023 19:17:39 -0800 Subject: bpf: Add objcg to bpf_mem_alloc The objcg is a bpf_mem_alloc level property since all bpf_mem_cache's are with the same objcg. This patch made such a property explicit. The next patch will use this property to save and restore objcg for percpu unit allocator. Acked-by: Hou Tao Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20231222031739.1288590-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf_mem_alloc.h | 1 + kernel/bpf/memalloc.c | 11 ++++++----- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h index bb1223b21308..acef8c808599 100644 --- a/include/linux/bpf_mem_alloc.h +++ b/include/linux/bpf_mem_alloc.h @@ -11,6 +11,7 @@ struct bpf_mem_caches; struct bpf_mem_alloc { struct bpf_mem_caches __percpu *caches; struct bpf_mem_cache __percpu *cache; + struct obj_cgroup *objcg; bool percpu; struct work_struct work; }; diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index 288ec4a967d0..4a21050f0359 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -523,6 +523,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) if (memcg_bpf_enabled()) objcg = get_obj_cgroup_from_current(); #endif + ma->objcg = objcg; for_each_possible_cpu(cpu) { c = per_cpu_ptr(pc, cpu); c->unit_size = unit_size; @@ -542,6 +543,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) #ifdef CONFIG_MEMCG_KMEM objcg = get_obj_cgroup_from_current(); #endif + ma->objcg = objcg; for_each_possible_cpu(cpu) { cc = per_cpu_ptr(pcc, cpu); for (i = 0; i < NUM_CACHES; i++) { @@ -691,9 +693,8 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma) rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress); rcu_in_progress += atomic_read(&c->call_rcu_in_progress); } - /* objcg is the same across cpus */ - if (c->objcg) - obj_cgroup_put(c->objcg); + if (ma->objcg) + obj_cgroup_put(ma->objcg); destroy_mem_alloc(ma, rcu_in_progress); } if (ma->caches) { @@ -709,8 +710,8 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma) rcu_in_progress += atomic_read(&c->call_rcu_in_progress); } } - if (c->objcg) - obj_cgroup_put(c->objcg); + if (ma->objcg) + obj_cgroup_put(ma->objcg); destroy_mem_alloc(ma, rcu_in_progress); } } -- cgit v1.2.3 From c39aa3b289e9c10d0d246cd919b06809f13b72b8 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 21 Dec 2023 19:17:45 -0800 Subject: bpf: Allow per unit prefill for non-fix-size percpu memory allocator Commit 41a5db8d8161 ("Add support for non-fix-size percpu mem allocation") added support for non-fix-size percpu memory allocation. Such allocation will allocate percpu memory for all buckets on all cpus and the memory consumption is in the order to quadratic. For example, let us say, 4 cpus, unit size 16 bytes, so each cpu has 16 * 4 = 64 bytes, with 4 cpus, total will be 64 * 4 = 256 bytes. Then let us say, 8 cpus with the same unit size, each cpu has 16 * 8 = 128 bytes, with 8 cpus, total will be 128 * 8 = 1024 bytes. So if the number of cpus doubles, the number of memory consumption will be 4 times. So for a system with large number of cpus, the memory consumption goes up quickly with quadratic order. For example, for 4KB percpu allocation, 128 cpus. The total memory consumption will 4KB * 128 * 128 = 64MB. Things will become worse if the number of cpus is bigger (e.g., 512, 1024, etc.) In Commit 41a5db8d8161, the non-fix-size percpu memory allocation is done in boot time, so for system with large number of cpus, the initial percpu memory consumption is very visible. For example, for 128 cpu system, the total percpu memory allocation will be at least (16 + 32 + 64 + 96 + 128 + 196 + 256 + 512 + 1024 + 2048 + 4096) * 128 * 128 = ~138MB. which is pretty big. It will be even bigger for larger number of cpus. Note that the current prefill also allocates 4 entries if the unit size is less than 256. So on top of 138MB memory consumption, this will add more consumption with 3 * (16 + 32 + 64 + 96 + 128 + 196 + 256) * 128 * 128 = ~38MB. Next patch will try to reduce this memory consumption. Later on, Commit 1fda5bb66ad8 ("bpf: Do not allocate percpu memory at init stage") moved the non-fix-size percpu memory allocation to bpf verificaiton stage. Once a particular bpf_percpu_obj_new() is called by bpf program, the memory allocator will try to fill in the cache with all sizes, causing the same amount of percpu memory consumption as in the boot stage. To reduce the initial percpu memory consumption for non-fix-size percpu memory allocation, instead of filling the cache with all supported allocation sizes, this patch intends to fill the cache only for the requested size. As typically users will not use large percpu data structure, this can save memory significantly. For example, the allocation size is 64 bytes with 128 cpus. Then total percpu memory amount will be 64 * 128 * 128 = 1MB, much less than previous 138MB. Signed-off-by: Yonghong Song Acked-by: Hou Tao Link: https://lore.kernel.org/r/20231222031745.1289082-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf_mem_alloc.h | 7 ++++++ kernel/bpf/memalloc.c | 57 ++++++++++++++++++++++++++++++++++++++++++- kernel/bpf/verifier.c | 37 +++++++++++++++++----------- 3 files changed, 86 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h index acef8c808599..aaf004d94322 100644 --- a/include/linux/bpf_mem_alloc.h +++ b/include/linux/bpf_mem_alloc.h @@ -22,8 +22,15 @@ struct bpf_mem_alloc { * 'size = 0' is for bpf_mem_alloc which manages many fixed-size objects. * Alloc and free are done with bpf_mem_{alloc,free}() and the size of * the returned object is given by the size argument of bpf_mem_alloc(). + * If percpu equals true, error will be returned in order to avoid + * large memory consumption and the below bpf_mem_alloc_percpu_unit_init() + * should be used to do on-demand per-cpu allocation for each size. */ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu); +/* Initialize a non-fix-size percpu memory allocator */ +int bpf_mem_alloc_percpu_init(struct bpf_mem_alloc *ma, struct obj_cgroup *objcg); +/* The percpu allocation with a specific unit size. */ +int bpf_mem_alloc_percpu_unit_init(struct bpf_mem_alloc *ma, int size); void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma); /* kmalloc/kfree equivalent: */ diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index 4a21050f0359..f71da07eb8a0 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -121,6 +121,8 @@ struct bpf_mem_caches { struct bpf_mem_cache cache[NUM_CACHES]; }; +static const u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096}; + static struct llist_node notrace *__llist_del_first(struct llist_head *head) { struct llist_node *entry, *next; @@ -499,12 +501,14 @@ static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu) */ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) { - static u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096}; struct bpf_mem_caches *cc, __percpu *pcc; struct bpf_mem_cache *c, __percpu *pc; struct obj_cgroup *objcg = NULL; int cpu, i, unit_size, percpu_size = 0; + if (percpu && size == 0) + return -EINVAL; + /* room for llist_node and per-cpu pointer */ if (percpu) percpu_size = LLIST_NODE_SZ + sizeof(void *); @@ -524,6 +528,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) objcg = get_obj_cgroup_from_current(); #endif ma->objcg = objcg; + for_each_possible_cpu(cpu) { c = per_cpu_ptr(pc, cpu); c->unit_size = unit_size; @@ -562,6 +567,56 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) return 0; } +int bpf_mem_alloc_percpu_init(struct bpf_mem_alloc *ma, struct obj_cgroup *objcg) +{ + struct bpf_mem_caches __percpu *pcc; + + pcc = __alloc_percpu_gfp(sizeof(struct bpf_mem_caches), 8, GFP_KERNEL); + if (!pcc) + return -ENOMEM; + + ma->caches = pcc; + ma->objcg = objcg; + ma->percpu = true; + return 0; +} + +int bpf_mem_alloc_percpu_unit_init(struct bpf_mem_alloc *ma, int size) +{ + struct bpf_mem_caches *cc, __percpu *pcc; + int cpu, i, unit_size, percpu_size; + struct obj_cgroup *objcg; + struct bpf_mem_cache *c; + + i = bpf_mem_cache_idx(size); + if (i < 0) + return -EINVAL; + + /* room for llist_node and per-cpu pointer */ + percpu_size = LLIST_NODE_SZ + sizeof(void *); + + unit_size = sizes[i]; + objcg = ma->objcg; + pcc = ma->caches; + + for_each_possible_cpu(cpu) { + cc = per_cpu_ptr(pcc, cpu); + c = &cc->cache[i]; + if (cpu == 0 && c->unit_size) + break; + + c->unit_size = unit_size; + c->objcg = objcg; + c->percpu_size = percpu_size; + c->tgt = c; + + init_refill_work(c); + prefill_mem_cache(c, cpu); + } + + return 0; +} + static void drain_mem_cache(struct bpf_mem_cache *c) { bool percpu = !!c->percpu_size; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d4e31f61de0e..e9699a2cfe4f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12139,20 +12139,6 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl] && !bpf_global_ma_set) return -ENOMEM; - if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) { - if (!bpf_global_percpu_ma_set) { - mutex_lock(&bpf_percpu_ma_lock); - if (!bpf_global_percpu_ma_set) { - err = bpf_mem_alloc_init(&bpf_global_percpu_ma, 0, true); - if (!err) - bpf_global_percpu_ma_set = true; - } - mutex_unlock(&bpf_percpu_ma_lock); - if (err) - return err; - } - } - if (((u64)(u32)meta.arg_constant.value) != meta.arg_constant.value) { verbose(env, "local type ID argument must be in range [0, U32_MAX]\n"); return -EINVAL; @@ -12173,6 +12159,29 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return -EINVAL; } + if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) { + if (!bpf_global_percpu_ma_set) { + mutex_lock(&bpf_percpu_ma_lock); + if (!bpf_global_percpu_ma_set) { + /* Charge memory allocated with bpf_global_percpu_ma to + * root memcg. The obj_cgroup for root memcg is NULL. + */ + err = bpf_mem_alloc_percpu_init(&bpf_global_percpu_ma, NULL); + if (!err) + bpf_global_percpu_ma_set = true; + } + mutex_unlock(&bpf_percpu_ma_lock); + if (err) + return err; + } + + mutex_lock(&bpf_percpu_ma_lock); + err = bpf_mem_alloc_percpu_unit_init(&bpf_global_percpu_ma, ret_t->size); + mutex_unlock(&bpf_percpu_ma_lock); + if (err) + return err; + } + struct_meta = btf_find_struct_meta(ret_btf, ret_btf_id); if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) { if (!__btf_type_is_scalar_struct(env, ret_btf, ret_t, 0)) { -- cgit v1.2.3 From 98e20e5e13d2811898921f999288be7151a11954 Mon Sep 17 00:00:00 2001 From: Quentin Deslandes Date: Tue, 26 Dec 2023 14:07:42 +0100 Subject: bpfilter: remove bpfilter bpfilter was supposed to convert iptables filtering rules into BPF programs on the fly, from the kernel, through a usermode helper. The base code for the UMH was introduced in 2018, and couple of attempts (2, 3) tried to introduce the BPF program generate features but were abandoned. bpfilter now sits in a kernel tree unused and unusable, occasionally causing confusion amongst Linux users (4, 5). As bpfilter is now developed in a dedicated repository on GitHub (6), it was suggested a couple of times this year (LSFMM/BPF 2023, LPC 2023) to remove the deprecated kernel part of the project. This is the purpose of this patch. [1]: https://lore.kernel.org/lkml/20180522022230.2492505-1-ast@kernel.org/ [2]: https://lore.kernel.org/bpf/20210829183608.2297877-1-me@ubique.spb.ru/#t [3]: https://lore.kernel.org/lkml/20221224000402.476079-1-qde@naccy.de/ [4]: https://dxuuu.xyz/bpfilter.html [5]: https://github.com/linuxkit/linuxkit/pull/3904 [6]: https://github.com/facebook/bpfilter Signed-off-by: Quentin Deslandes Link: https://lore.kernel.org/r/20231226130745.465988-1-qde@naccy.de Signed-off-by: Alexei Starovoitov --- arch/loongarch/configs/loongson3_defconfig | 1 - include/linux/bpfilter.h | 24 ----- include/uapi/linux/bpfilter.h | 21 ----- net/Kconfig | 2 - net/Makefile | 1 - net/bpfilter/.gitignore | 2 - net/bpfilter/Kconfig | 23 ----- net/bpfilter/Makefile | 20 ----- net/bpfilter/bpfilter_kern.c | 136 ----------------------------- net/bpfilter/bpfilter_umh_blob.S | 7 -- net/bpfilter/main.c | 64 -------------- net/bpfilter/msgfmt.h | 17 ---- net/ipv4/Makefile | 2 - net/ipv4/bpfilter/Makefile | 2 - net/ipv4/bpfilter/sockopt.c | 71 --------------- net/ipv4/ip_sockglue.c | 12 --- tools/bpf/bpftool/feature.c | 4 - tools/testing/selftests/bpf/config.aarch64 | 1 - tools/testing/selftests/bpf/config.s390x | 1 - tools/testing/selftests/bpf/config.x86_64 | 1 - tools/testing/selftests/hid/config | 1 - 21 files changed, 413 deletions(-) delete mode 100644 include/linux/bpfilter.h delete mode 100644 include/uapi/linux/bpfilter.h delete mode 100644 net/bpfilter/.gitignore delete mode 100644 net/bpfilter/Kconfig delete mode 100644 net/bpfilter/Makefile delete mode 100644 net/bpfilter/bpfilter_kern.c delete mode 100644 net/bpfilter/bpfilter_umh_blob.S delete mode 100644 net/bpfilter/main.c delete mode 100644 net/bpfilter/msgfmt.h delete mode 100644 net/ipv4/bpfilter/Makefile delete mode 100644 net/ipv4/bpfilter/sockopt.c (limited to 'include/linux') diff --git a/arch/loongarch/configs/loongson3_defconfig b/arch/loongarch/configs/loongson3_defconfig index 9c333d133c30..60e331af9839 100644 --- a/arch/loongarch/configs/loongson3_defconfig +++ b/arch/loongarch/configs/loongson3_defconfig @@ -276,7 +276,6 @@ CONFIG_BRIDGE_EBT_T_NAT=m CONFIG_BRIDGE_EBT_ARP=m CONFIG_BRIDGE_EBT_IP=m CONFIG_BRIDGE_EBT_IP6=m -CONFIG_BPFILTER=y CONFIG_IP_SCTP=m CONFIG_RDS=y CONFIG_L2TP=m diff --git a/include/linux/bpfilter.h b/include/linux/bpfilter.h deleted file mode 100644 index 736ded4905e0..000000000000 --- a/include/linux/bpfilter.h +++ /dev/null @@ -1,24 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_BPFILTER_H -#define _LINUX_BPFILTER_H - -#include -#include -#include - -struct sock; -int bpfilter_ip_set_sockopt(struct sock *sk, int optname, sockptr_t optval, - unsigned int optlen); -int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char __user *optval, - int __user *optlen); - -struct bpfilter_umh_ops { - struct umd_info info; - /* since ip_getsockopt() can run in parallel, serialize access to umh */ - struct mutex lock; - int (*sockopt)(struct sock *sk, int optname, sockptr_t optval, - unsigned int optlen, bool is_set); - int (*start)(void); -}; -extern struct bpfilter_umh_ops bpfilter_ops; -#endif diff --git a/include/uapi/linux/bpfilter.h b/include/uapi/linux/bpfilter.h deleted file mode 100644 index cbc1f5813f50..000000000000 --- a/include/uapi/linux/bpfilter.h +++ /dev/null @@ -1,21 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef _UAPI_LINUX_BPFILTER_H -#define _UAPI_LINUX_BPFILTER_H - -#include - -enum { - BPFILTER_IPT_SO_SET_REPLACE = 64, - BPFILTER_IPT_SO_SET_ADD_COUNTERS = 65, - BPFILTER_IPT_SET_MAX, -}; - -enum { - BPFILTER_IPT_SO_GET_INFO = 64, - BPFILTER_IPT_SO_GET_ENTRIES = 65, - BPFILTER_IPT_SO_GET_REVISION_MATCH = 66, - BPFILTER_IPT_SO_GET_REVISION_TARGET = 67, - BPFILTER_IPT_GET_MAX, -}; - -#endif /* _UAPI_LINUX_BPFILTER_H */ diff --git a/net/Kconfig b/net/Kconfig index 3ec6bc98fa05..4adc47d0c9c2 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -233,8 +233,6 @@ source "net/bridge/netfilter/Kconfig" endif -source "net/bpfilter/Kconfig" - source "net/dccp/Kconfig" source "net/sctp/Kconfig" source "net/rds/Kconfig" diff --git a/net/Makefile b/net/Makefile index 4c4dc535453d..b06b5539e7a6 100644 --- a/net/Makefile +++ b/net/Makefile @@ -19,7 +19,6 @@ obj-$(CONFIG_TLS) += tls/ obj-$(CONFIG_XFRM) += xfrm/ obj-$(CONFIG_UNIX_SCM) += unix/ obj-y += ipv6/ -obj-$(CONFIG_BPFILTER) += bpfilter/ obj-$(CONFIG_PACKET) += packet/ obj-$(CONFIG_NET_KEY) += key/ obj-$(CONFIG_BRIDGE) += bridge/ diff --git a/net/bpfilter/.gitignore b/net/bpfilter/.gitignore deleted file mode 100644 index f34e85ee8204..000000000000 --- a/net/bpfilter/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -bpfilter_umh diff --git a/net/bpfilter/Kconfig b/net/bpfilter/Kconfig deleted file mode 100644 index 3d4a21462458..000000000000 --- a/net/bpfilter/Kconfig +++ /dev/null @@ -1,23 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -menuconfig BPFILTER - bool "BPF based packet filtering framework (BPFILTER)" - depends on BPF && INET - select USERMODE_DRIVER - help - This builds experimental bpfilter framework that is aiming to - provide netfilter compatible functionality via BPF - -if BPFILTER -config BPFILTER_UMH - tristate "bpfilter kernel module with user mode helper" - depends on CC_CAN_LINK - depends on m || CC_CAN_LINK_STATIC - default m - help - This builds bpfilter kernel module with embedded user mode helper - - Note: To compile this as built-in, your toolchain must support - building static binaries, since rootfs isn't mounted at the time - when __init functions are called and do_execv won't be able to find - the elf interpreter. -endif diff --git a/net/bpfilter/Makefile b/net/bpfilter/Makefile deleted file mode 100644 index cdac82b8c53a..000000000000 --- a/net/bpfilter/Makefile +++ /dev/null @@ -1,20 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -# -# Makefile for the Linux BPFILTER layer. -# - -userprogs := bpfilter_umh -bpfilter_umh-objs := main.o -userccflags += -I $(srctree)/tools/include/ -I $(srctree)/tools/include/uapi - -ifeq ($(CONFIG_BPFILTER_UMH), y) -# builtin bpfilter_umh should be linked with -static -# since rootfs isn't mounted at the time of __init -# function is called and do_execv won't find elf interpreter -userldflags += -static -endif - -$(obj)/bpfilter_umh_blob.o: $(obj)/bpfilter_umh - -obj-$(CONFIG_BPFILTER_UMH) += bpfilter.o -bpfilter-objs += bpfilter_kern.o bpfilter_umh_blob.o diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c deleted file mode 100644 index 97e129e3f31c..000000000000 --- a/net/bpfilter/bpfilter_kern.c +++ /dev/null @@ -1,136 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include -#include -#include -#include -#include -#include -#include -#include -#include "msgfmt.h" - -extern char bpfilter_umh_start; -extern char bpfilter_umh_end; - -static void shutdown_umh(void) -{ - struct umd_info *info = &bpfilter_ops.info; - struct pid *tgid = info->tgid; - - if (tgid) { - kill_pid(tgid, SIGKILL, 1); - wait_event(tgid->wait_pidfd, thread_group_exited(tgid)); - umd_cleanup_helper(info); - } -} - -static void __stop_umh(void) -{ - if (IS_ENABLED(CONFIG_INET)) - shutdown_umh(); -} - -static int bpfilter_send_req(struct mbox_request *req) -{ - struct mbox_reply reply; - loff_t pos = 0; - ssize_t n; - - if (!bpfilter_ops.info.tgid) - return -EFAULT; - pos = 0; - n = kernel_write(bpfilter_ops.info.pipe_to_umh, req, sizeof(*req), - &pos); - if (n != sizeof(*req)) { - pr_err("write fail %zd\n", n); - goto stop; - } - pos = 0; - n = kernel_read(bpfilter_ops.info.pipe_from_umh, &reply, sizeof(reply), - &pos); - if (n != sizeof(reply)) { - pr_err("read fail %zd\n", n); - goto stop; - } - return reply.status; -stop: - __stop_umh(); - return -EFAULT; -} - -static int bpfilter_process_sockopt(struct sock *sk, int optname, - sockptr_t optval, unsigned int optlen, - bool is_set) -{ - struct mbox_request req = { - .is_set = is_set, - .pid = current->pid, - .cmd = optname, - .addr = (uintptr_t)optval.user, - .len = optlen, - }; - if (sockptr_is_kernel(optval)) { - pr_err("kernel access not supported\n"); - return -EFAULT; - } - return bpfilter_send_req(&req); -} - -static int start_umh(void) -{ - struct mbox_request req = { .pid = current->pid }; - int err; - - /* fork usermode process */ - err = fork_usermode_driver(&bpfilter_ops.info); - if (err) - return err; - pr_info("Loaded bpfilter_umh pid %d\n", pid_nr(bpfilter_ops.info.tgid)); - - /* health check that usermode process started correctly */ - if (bpfilter_send_req(&req) != 0) { - shutdown_umh(); - return -EFAULT; - } - - return 0; -} - -static int __init load_umh(void) -{ - int err; - - err = umd_load_blob(&bpfilter_ops.info, - &bpfilter_umh_start, - &bpfilter_umh_end - &bpfilter_umh_start); - if (err) - return err; - - mutex_lock(&bpfilter_ops.lock); - err = start_umh(); - if (!err && IS_ENABLED(CONFIG_INET)) { - bpfilter_ops.sockopt = &bpfilter_process_sockopt; - bpfilter_ops.start = &start_umh; - } - mutex_unlock(&bpfilter_ops.lock); - if (err) - umd_unload_blob(&bpfilter_ops.info); - return err; -} - -static void __exit fini_umh(void) -{ - mutex_lock(&bpfilter_ops.lock); - if (IS_ENABLED(CONFIG_INET)) { - shutdown_umh(); - bpfilter_ops.start = NULL; - bpfilter_ops.sockopt = NULL; - } - mutex_unlock(&bpfilter_ops.lock); - - umd_unload_blob(&bpfilter_ops.info); -} -module_init(load_umh); -module_exit(fini_umh); -MODULE_LICENSE("GPL"); diff --git a/net/bpfilter/bpfilter_umh_blob.S b/net/bpfilter/bpfilter_umh_blob.S deleted file mode 100644 index 40311d10d2f2..000000000000 --- a/net/bpfilter/bpfilter_umh_blob.S +++ /dev/null @@ -1,7 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ - .section .init.rodata, "a" - .global bpfilter_umh_start -bpfilter_umh_start: - .incbin "net/bpfilter/bpfilter_umh" - .global bpfilter_umh_end -bpfilter_umh_end: diff --git a/net/bpfilter/main.c b/net/bpfilter/main.c deleted file mode 100644 index 291a92546246..000000000000 --- a/net/bpfilter/main.c +++ /dev/null @@ -1,64 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include "../../include/uapi/linux/bpf.h" -#include -#include "msgfmt.h" - -FILE *debug_f; - -static int handle_get_cmd(struct mbox_request *cmd) -{ - switch (cmd->cmd) { - case 0: - return 0; - default: - break; - } - return -ENOPROTOOPT; -} - -static int handle_set_cmd(struct mbox_request *cmd) -{ - return -ENOPROTOOPT; -} - -static void loop(void) -{ - while (1) { - struct mbox_request req; - struct mbox_reply reply; - int n; - - n = read(0, &req, sizeof(req)); - if (n != sizeof(req)) { - fprintf(debug_f, "invalid request %d\n", n); - return; - } - - reply.status = req.is_set ? - handle_set_cmd(&req) : - handle_get_cmd(&req); - - n = write(1, &reply, sizeof(reply)); - if (n != sizeof(reply)) { - fprintf(debug_f, "reply failed %d\n", n); - return; - } - } -} - -int main(void) -{ - debug_f = fopen("/dev/kmsg", "w"); - setvbuf(debug_f, 0, _IOLBF, 0); - fprintf(debug_f, "<5>Started bpfilter\n"); - loop(); - fclose(debug_f); - return 0; -} diff --git a/net/bpfilter/msgfmt.h b/net/bpfilter/msgfmt.h deleted file mode 100644 index 98d121c62945..000000000000 --- a/net/bpfilter/msgfmt.h +++ /dev/null @@ -1,17 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _NET_BPFILTER_MSGFMT_H -#define _NET_BPFILTER_MSGFMT_H - -struct mbox_request { - __u64 addr; - __u32 len; - __u32 is_set; - __u32 cmd; - __u32 pid; -}; - -struct mbox_reply { - __u32 status; -}; - -#endif diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index e144a02a6a61..ec36d2ec059e 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -16,8 +16,6 @@ obj-y := route.o inetpeer.o protocol.o \ inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o \ metrics.o netlink.o nexthop.o udp_tunnel_stub.o -obj-$(CONFIG_BPFILTER) += bpfilter/ - obj-$(CONFIG_NET_IP_TUNNEL) += ip_tunnel.o obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o obj-$(CONFIG_PROC_FS) += proc.o diff --git a/net/ipv4/bpfilter/Makefile b/net/ipv4/bpfilter/Makefile deleted file mode 100644 index 00af5305e05a..000000000000 --- a/net/ipv4/bpfilter/Makefile +++ /dev/null @@ -1,2 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -obj-$(CONFIG_BPFILTER) += sockopt.o diff --git a/net/ipv4/bpfilter/sockopt.c b/net/ipv4/bpfilter/sockopt.c deleted file mode 100644 index 193bcc2acccc..000000000000 --- a/net/ipv4/bpfilter/sockopt.c +++ /dev/null @@ -1,71 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include -#include -#include -#include -#include -#include -#include -#include - -struct bpfilter_umh_ops bpfilter_ops; -EXPORT_SYMBOL_GPL(bpfilter_ops); - -static int bpfilter_mbox_request(struct sock *sk, int optname, sockptr_t optval, - unsigned int optlen, bool is_set) -{ - int err; - mutex_lock(&bpfilter_ops.lock); - if (!bpfilter_ops.sockopt) { - mutex_unlock(&bpfilter_ops.lock); - request_module("bpfilter"); - mutex_lock(&bpfilter_ops.lock); - - if (!bpfilter_ops.sockopt) { - err = -ENOPROTOOPT; - goto out; - } - } - if (bpfilter_ops.info.tgid && - thread_group_exited(bpfilter_ops.info.tgid)) - umd_cleanup_helper(&bpfilter_ops.info); - - if (!bpfilter_ops.info.tgid) { - err = bpfilter_ops.start(); - if (err) - goto out; - } - err = bpfilter_ops.sockopt(sk, optname, optval, optlen, is_set); -out: - mutex_unlock(&bpfilter_ops.lock); - return err; -} - -int bpfilter_ip_set_sockopt(struct sock *sk, int optname, sockptr_t optval, - unsigned int optlen) -{ - return bpfilter_mbox_request(sk, optname, optval, optlen, true); -} - -int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char __user *optval, - int __user *optlen) -{ - int len; - - if (get_user(len, optlen)) - return -EFAULT; - - return bpfilter_mbox_request(sk, optname, USER_SOCKPTR(optval), len, - false); -} - -static int __init bpfilter_sockopt_init(void) -{ - mutex_init(&bpfilter_ops.lock); - bpfilter_ops.info.tgid = NULL; - bpfilter_ops.info.driver_name = "bpfilter_umh"; - - return 0; -} -device_initcall(bpfilter_sockopt_init); diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 66247e8b429e..7aa9dc0e6760 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -47,8 +47,6 @@ #include #include -#include - /* * SOL_IP control messages. */ @@ -1411,11 +1409,6 @@ int ip_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, return -ENOPROTOOPT; err = do_ip_setsockopt(sk, level, optname, optval, optlen); -#if IS_ENABLED(CONFIG_BPFILTER_UMH) - if (optname >= BPFILTER_IPT_SO_SET_REPLACE && - optname < BPFILTER_IPT_SET_MAX) - err = bpfilter_ip_set_sockopt(sk, optname, optval, optlen); -#endif #ifdef CONFIG_NETFILTER /* we need to exclude all possible ENOPROTOOPTs except default case */ if (err == -ENOPROTOOPT && optname != IP_HDRINCL && @@ -1763,11 +1756,6 @@ int ip_getsockopt(struct sock *sk, int level, err = do_ip_getsockopt(sk, level, optname, USER_SOCKPTR(optval), USER_SOCKPTR(optlen)); -#if IS_ENABLED(CONFIG_BPFILTER_UMH) - if (optname >= BPFILTER_IPT_SO_GET_INFO && - optname < BPFILTER_IPT_GET_MAX) - err = bpfilter_ip_get_sockopt(sk, optname, optval, optlen); -#endif #ifdef CONFIG_NETFILTER /* we need to exclude all possible ENOPROTOOPTs except default case */ if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS && diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c index edda4fc2c4d0..708733b0ea06 100644 --- a/tools/bpf/bpftool/feature.c +++ b/tools/bpf/bpftool/feature.c @@ -426,10 +426,6 @@ static void probe_kernel_image_config(const char *define_prefix) { "CONFIG_BPF_STREAM_PARSER", }, /* xt_bpf module for passing BPF programs to netfilter */ { "CONFIG_NETFILTER_XT_MATCH_BPF", }, - /* bpfilter back-end for iptables */ - { "CONFIG_BPFILTER", }, - /* bpftilter module with "user mode helper" */ - { "CONFIG_BPFILTER_UMH", }, /* test_bpf module for BPF tests */ { "CONFIG_TEST_BPF", }, diff --git a/tools/testing/selftests/bpf/config.aarch64 b/tools/testing/selftests/bpf/config.aarch64 index 29c8635c5722..3720b7611523 100644 --- a/tools/testing/selftests/bpf/config.aarch64 +++ b/tools/testing/selftests/bpf/config.aarch64 @@ -11,7 +11,6 @@ CONFIG_BLK_DEV_IO_TRACE=y CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_SD=y CONFIG_BONDING=y -CONFIG_BPFILTER=y CONFIG_BPF_JIT_ALWAYS_ON=y CONFIG_BPF_JIT_DEFAULT_ON=y CONFIG_BPF_PRELOAD_UMD=y diff --git a/tools/testing/selftests/bpf/config.s390x b/tools/testing/selftests/bpf/config.s390x index e93330382849..706931a8c2c6 100644 --- a/tools/testing/selftests/bpf/config.s390x +++ b/tools/testing/selftests/bpf/config.s390x @@ -9,7 +9,6 @@ CONFIG_BPF_JIT_ALWAYS_ON=y CONFIG_BPF_JIT_DEFAULT_ON=y CONFIG_BPF_PRELOAD=y CONFIG_BPF_PRELOAD_UMD=y -CONFIG_BPFILTER=y CONFIG_CGROUP_CPUACCT=y CONFIG_CGROUP_DEVICE=y CONFIG_CGROUP_FREEZER=y diff --git a/tools/testing/selftests/bpf/config.x86_64 b/tools/testing/selftests/bpf/config.x86_64 index b946088017f1..5680befae8c6 100644 --- a/tools/testing/selftests/bpf/config.x86_64 +++ b/tools/testing/selftests/bpf/config.x86_64 @@ -19,7 +19,6 @@ CONFIG_BOOTTIME_TRACING=y CONFIG_BPF_JIT_ALWAYS_ON=y CONFIG_BPF_PRELOAD=y CONFIG_BPF_PRELOAD_UMD=y -CONFIG_BPFILTER=y CONFIG_BSD_DISKLABEL=y CONFIG_BSD_PROCESS_ACCT=y CONFIG_CFS_BANDWIDTH=y diff --git a/tools/testing/selftests/hid/config b/tools/testing/selftests/hid/config index 4f425178b56f..1758b055f295 100644 --- a/tools/testing/selftests/hid/config +++ b/tools/testing/selftests/hid/config @@ -1,5 +1,4 @@ CONFIG_BPF_EVENTS=y -CONFIG_BPFILTER=y CONFIG_BPF_JIT_ALWAYS_ON=y CONFIG_BPF_JIT=y CONFIG_BPF_KPROBE_OVERRIDE=y -- cgit v1.2.3 From 19bfcdf9498aa968ea293417fbbc39e523527ca8 Mon Sep 17 00:00:00 2001 From: Dmitrii Dolgov <9erthalion6@gmail.com> Date: Wed, 3 Jan 2024 20:05:44 +0100 Subject: bpf: Relax tracing prog recursive attach rules Currently, it's not allowed to attach an fentry/fexit prog to another one fentry/fexit. At the same time it's not uncommon to see a tracing program with lots of logic in use, and the attachment limitation prevents usage of fentry/fexit for performance analysis (e.g. with "bpftool prog profile" command) in this case. An example could be falcosecurity libs project that uses tp_btf tracing programs. Following the corresponding discussion [1], the reason for that is to avoid tracing progs call cycles without introducing more complex solutions. But currently it seems impossible to load and attach tracing programs in a way that will form such a cycle. The limitation is coming from the fact that attach_prog_fd is specified at the prog load (thus making it impossible to attach to a program loaded after it in this way), as well as tracing progs not implementing link_detach. Replace "no same type" requirement with verification that no more than one level of attachment nesting is allowed. In this way only one fentry/fexit program could be attached to another fentry/fexit to cover profiling use case, and still no cycle could be formed. To implement, add a new field into bpf_prog_aux to track nested attachment for tracing programs. [1]: https://lore.kernel.org/bpf/20191108064039.2041889-16-ast@kernel.org/ Acked-by: Jiri Olsa Acked-by: Song Liu Signed-off-by: Dmitrii Dolgov <9erthalion6@gmail.com> Link: https://lore.kernel.org/r/20240103190559.14750-2-9erthalion6@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + kernel/bpf/syscall.c | 23 ++++++++++++++++++++++- kernel/bpf/verifier.c | 39 +++++++++++++++++++++++++-------------- 3 files changed, 48 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 7671530d6e4e..e30100597d0a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1449,6 +1449,7 @@ struct bpf_prog_aux { bool dev_bound; /* Program is bound to the netdev. */ bool offload_requested; /* Program is bound and offloaded to the netdev. */ bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */ + bool attach_tracing_prog; /* true if tracing another tracing program */ bool func_proto_unreliable; bool sleepable; bool tail_call_reachable; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 1bf9805ee185..d15f9998bc20 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2738,6 +2738,22 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) goto free_prog_sec; } + /* + * Bookkeeping for managing the program attachment chain. + * + * It might be tempting to set attach_tracing_prog flag at the attachment + * time, but this will not prevent from loading bunch of tracing prog + * first, then attach them one to another. + * + * The flag attach_tracing_prog is set for the whole program lifecycle, and + * doesn't have to be cleared in bpf_tracing_link_release, since tracing + * programs cannot change attachment target. + */ + if (type == BPF_PROG_TYPE_TRACING && dst_prog && + dst_prog->type == BPF_PROG_TYPE_TRACING) { + prog->aux->attach_tracing_prog = true; + } + /* find program type: socket_filter vs tracing_filter */ err = find_prog_type(type, prog); if (err < 0) @@ -3171,7 +3187,12 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, } if (tgt_prog_fd) { - /* For now we only allow new targets for BPF_PROG_TYPE_EXT */ + /* + * For now we only allow new targets for BPF_PROG_TYPE_EXT. If this + * part would be changed to implement the same for + * BPF_PROG_TYPE_TRACING, do not forget to update the way how + * attach_tracing_prog flag is set. + */ if (prog->type != BPF_PROG_TYPE_EXT) { err = -EINVAL; goto out_put_prog; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d5f4ff1eb235..adbf330d364b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -20317,6 +20317,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, struct bpf_attach_target_info *tgt_info) { bool prog_extension = prog->type == BPF_PROG_TYPE_EXT; + bool prog_tracing = prog->type == BPF_PROG_TYPE_TRACING; const char prefix[] = "btf_trace_"; int ret = 0, subprog = -1, i; const struct btf_type *t; @@ -20387,10 +20388,21 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, bpf_log(log, "Can attach to only JITed progs\n"); return -EINVAL; } - if (tgt_prog->type == prog->type) { - /* Cannot fentry/fexit another fentry/fexit program. - * Cannot attach program extension to another extension. - * It's ok to attach fentry/fexit to extension program. + if (prog_tracing) { + if (aux->attach_tracing_prog) { + /* + * Target program is an fentry/fexit which is already attached + * to another tracing program. More levels of nesting + * attachment are not allowed. + */ + bpf_log(log, "Cannot nest tracing program attach more than once\n"); + return -EINVAL; + } + } else if (tgt_prog->type == prog->type) { + /* + * To avoid potential call chain cycles, prevent attaching of a + * program extension to another extension. It's ok to attach + * fentry/fexit to extension program. */ bpf_log(log, "Cannot recursively attach\n"); return -EINVAL; @@ -20403,16 +20415,15 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, * except fentry/fexit. The reason is the following. * The fentry/fexit programs are used for performance * analysis, stats and can be attached to any program - * type except themselves. When extension program is - * replacing XDP function it is necessary to allow - * performance analysis of all functions. Both original - * XDP program and its program extension. Hence - * attaching fentry/fexit to BPF_PROG_TYPE_EXT is - * allowed. If extending of fentry/fexit was allowed it - * would be possible to create long call chain - * fentry->extension->fentry->extension beyond - * reasonable stack size. Hence extending fentry is not - * allowed. + * type. When extension program is replacing XDP function + * it is necessary to allow performance analysis of all + * functions. Both original XDP program and its program + * extension. Hence attaching fentry/fexit to + * BPF_PROG_TYPE_EXT is allowed. If extending of + * fentry/fexit was allowed it would be possible to create + * long call chain fentry->extension->fentry->extension + * beyond reasonable stack size. Hence extending fentry + * is not allowed. */ bpf_log(log, "Cannot extend fentry/fexit\n"); return -EINVAL; -- cgit v1.2.3