diff options
author | Jakub Kicinski <kuba@kernel.org> | 2023-10-27 06:02:40 +0300 |
---|---|---|
committer | Jakub Kicinski <kuba@kernel.org> | 2023-10-27 06:02:41 +0300 |
commit | c6f9b7138bf5c6b826175c9e0ad5f5dbfff4fa36 (patch) | |
tree | c7bf02f5524d49f829258fbda2cedab62e25efbd /include | |
parent | cc33a80b816406f900a53c7f98a50f6eacdd2e31 (diff) | |
parent | ea41b880cc85f0a992571f66e4554a69f7806246 (diff) | |
download | linux-c6f9b7138bf5c6b826175c9e0ad5f5dbfff4fa36.tar.xz |
Merge tag 'for-netdev' of ssh://gitolite.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next
Daniel Borkmann says:
====================
pull-request: bpf-next 2023-10-26
We've added 51 non-merge commits during the last 10 day(s) which contain
a total of 75 files changed, 5037 insertions(+), 200 deletions(-).
The main changes are:
1) Add open-coded task, css_task and css iterator support.
One of the use cases is customizable OOM victim selection via BPF,
from Chuyi Zhou.
2) Fix BPF verifier's iterator convergence logic to use exact states
comparison for convergence checks, from Eduard Zingerman,
Andrii Nakryiko and Alexei Starovoitov.
3) Add BPF programmable net device where bpf_mprog defines the logic
of its xmit routine. It can operate in L3 and L2 mode,
from Daniel Borkmann and Nikolay Aleksandrov.
4) Batch of fixes for BPF per-CPU kptr and re-enable unit_size checking
for global per-CPU allocator, from Hou Tao.
5) Fix libbpf which eagerly assumed that SHT_GNU_verdef ELF section
was going to be present whenever a binary has SHT_GNU_versym section,
from Andrii Nakryiko.
6) Fix BPF ringbuf correctness to fold smp_mb__before_atomic() into
atomic_set_release(), from Paul E. McKenney.
7) Add a warning if NAPI callback missed xdp_do_flush() under
CONFIG_DEBUG_NET which helps checking if drivers were missing
the former, from Sebastian Andrzej Siewior.
8) Fix missed RCU read-lock in bpf_task_under_cgroup() which was throwing
a warning under sleepable programs, from Yafang Shao.
9) Avoid unnecessary -EBUSY from htab_lock_bucket by disabling IRQ before
checking map_locked, from Song Liu.
10) Make BPF CI linked_list failure test more robust,
from Kumar Kartikeya Dwivedi.
11) Enable samples/bpf to be built as PIE in Fedora, from Viktor Malik.
12) Fix xsk starving when multiple xsk sockets were associated with
a single xsk_buff_pool, from Albert Huang.
13) Clarify the signed modulo implementation for the BPF ISA standardization
document that it uses truncated division, from Dave Thaler.
14) Improve BPF verifier's JEQ/JNE branch taken logic to also consider
signed bounds knowledge, from Andrii Nakryiko.
15) Add an option to XDP selftests to use multi-buffer AF_XDP
xdp_hw_metadata and mark used XDP programs as capable to use frags,
from Larysa Zaremba.
16) Fix bpftool's BTF dumper wrt printing a pointer value and another
one to fix struct_ops dump in an array, from Manu Bretelle.
* tag 'for-netdev' of ssh://gitolite.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next: (51 commits)
netkit: Remove explicit active/peer ptr initialization
selftests/bpf: Fix selftests broken by mitigations=off
samples/bpf: Allow building with custom bpftool
samples/bpf: Fix passing LDFLAGS to libbpf
samples/bpf: Allow building with custom CFLAGS/LDFLAGS
bpf: Add more WARN_ON_ONCE checks for mismatched alloc and free
selftests/bpf: Add selftests for netkit
selftests/bpf: Add netlink helper library
bpftool: Extend net dump with netkit progs
bpftool: Implement link show support for netkit
libbpf: Add link-based API for netkit
tools: Sync if_link uapi header
netkit, bpf: Add bpf programmable net device
bpf: Improve JEQ/JNE branch taken logic
bpf: Fold smp_mb__before_atomic() into atomic_set_release()
bpf: Fix unnecessary -EBUSY from htab_lock_bucket
xsk: Avoid starving the xsk further down the list
bpf: print full verifier states on infinite loop detection
selftests/bpf: test if state loops are detected in a tricky case
bpf: correct loop detection for iterators convergence
...
====================
Link: https://lore.kernel.org/r/20231026150509.2824-1-daniel@iogearbox.net
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Diffstat (limited to 'include')
-rw-r--r-- | include/linux/bpf.h | 4 | ||||
-rw-r--r-- | include/linux/bpf_mem_alloc.h | 1 | ||||
-rw-r--r-- | include/linux/bpf_verifier.h | 35 | ||||
-rw-r--r-- | include/linux/btf.h | 1 | ||||
-rw-r--r-- | include/linux/cgroup.h | 12 | ||||
-rw-r--r-- | include/linux/percpu.h | 1 | ||||
-rw-r--r-- | include/net/netkit.h | 38 | ||||
-rw-r--r-- | include/net/tcx.h | 7 | ||||
-rw-r--r-- | include/net/xdp_sock.h | 16 | ||||
-rw-r--r-- | include/uapi/linux/bpf.h | 14 | ||||
-rw-r--r-- | include/uapi/linux/if_link.h | 24 |
11 files changed, 132 insertions, 21 deletions
diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d3c51a507508..b4825d3cdb29 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2058,6 +2058,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec); bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b); void bpf_obj_free_timer(const struct btf_record *rec, void *obj); void bpf_obj_free_fields(const struct btf_record *rec, void *obj); +void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu); struct bpf_map *bpf_map_get(u32 ufd); struct bpf_map *bpf_map_get_with_uref(u32 ufd); @@ -2478,6 +2479,9 @@ void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data, enum bpf_dynptr_type type, u32 offset, u32 size); void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr); void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr); + +bool dev_check_flush(void); +bool cpu_map_check_flush(void); #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) { diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h index d644bbb298af..bb1223b21308 100644 --- a/include/linux/bpf_mem_alloc.h +++ b/include/linux/bpf_mem_alloc.h @@ -11,6 +11,7 @@ struct bpf_mem_caches; struct bpf_mem_alloc { struct bpf_mem_caches __percpu *caches; struct bpf_mem_cache __percpu *cache; + bool percpu; struct work_struct work; }; diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 94ec766432f5..24213a99cc79 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -373,10 +373,25 @@ struct bpf_verifier_state { struct bpf_active_lock active_lock; bool speculative; bool active_rcu_lock; + /* If this state was ever pointed-to by other state's loop_entry field + * this flag would be set to true. Used to avoid freeing such states + * while they are still in use. + */ + bool used_as_loop_entry; /* first and last insn idx of this verifier state */ u32 first_insn_idx; u32 last_insn_idx; + /* If this state is a part of states loop this field points to some + * parent of this state such that: + * - it is also a member of the same states loop; + * - DFS states traversal starting from initial state visits loop_entry + * state before this state. + * Used to compute topmost loop entry for state loops. + * State loops might appear because of open coded iterators logic. + * See get_loop_entry() for more information. + */ + struct bpf_verifier_state *loop_entry; /* jmp history recorded from first to last. * backtracking is using it to go from last to first. * For most states jmp_history_cnt is [0-3]. @@ -384,21 +399,21 @@ struct bpf_verifier_state { */ struct bpf_idx_pair *jmp_history; u32 jmp_history_cnt; + u32 dfs_depth; }; -#define bpf_get_spilled_reg(slot, frame) \ +#define bpf_get_spilled_reg(slot, frame, mask) \ (((slot < frame->allocated_stack / BPF_REG_SIZE) && \ - (frame->stack[slot].slot_type[0] == STACK_SPILL)) \ + ((1 << frame->stack[slot].slot_type[0]) & (mask))) \ ? &frame->stack[slot].spilled_ptr : NULL) /* Iterate over 'frame', setting 'reg' to either NULL or a spilled register. */ -#define bpf_for_each_spilled_reg(iter, frame, reg) \ - for (iter = 0, reg = bpf_get_spilled_reg(iter, frame); \ +#define bpf_for_each_spilled_reg(iter, frame, reg, mask) \ + for (iter = 0, reg = bpf_get_spilled_reg(iter, frame, mask); \ iter < frame->allocated_stack / BPF_REG_SIZE; \ - iter++, reg = bpf_get_spilled_reg(iter, frame)) + iter++, reg = bpf_get_spilled_reg(iter, frame, mask)) -/* Invoke __expr over regsiters in __vst, setting __state and __reg */ -#define bpf_for_each_reg_in_vstate(__vst, __state, __reg, __expr) \ +#define bpf_for_each_reg_in_vstate_mask(__vst, __state, __reg, __mask, __expr) \ ({ \ struct bpf_verifier_state *___vstate = __vst; \ int ___i, ___j; \ @@ -410,7 +425,7 @@ struct bpf_verifier_state { __reg = &___regs[___j]; \ (void)(__expr); \ } \ - bpf_for_each_spilled_reg(___j, __state, __reg) { \ + bpf_for_each_spilled_reg(___j, __state, __reg, __mask) { \ if (!__reg) \ continue; \ (void)(__expr); \ @@ -418,6 +433,10 @@ struct bpf_verifier_state { } \ }) +/* Invoke __expr over regsiters in __vst, setting __state and __reg */ +#define bpf_for_each_reg_in_vstate(__vst, __state, __reg, __expr) \ + bpf_for_each_reg_in_vstate_mask(__vst, __state, __reg, 1 << STACK_SPILL, __expr) + /* linked list of verifier states used to prune search */ struct bpf_verifier_state_list { struct bpf_verifier_state state; diff --git a/include/linux/btf.h b/include/linux/btf.h index 928113a80a95..c2231c64d60b 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -74,6 +74,7 @@ #define KF_ITER_NEW (1 << 8) /* kfunc implements BPF iter constructor */ #define KF_ITER_NEXT (1 << 9) /* kfunc implements BPF iter next method */ #define KF_ITER_DESTROY (1 << 10) /* kfunc implements BPF iter destructor */ +#define KF_RCU_PROTECTED (1 << 11) /* kfunc should be protected by rcu cs when they are invoked */ /* * Tag marking a kernel function as a kfunc. This is meant to minimize the diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index b307013b9c6c..0ef0af66080e 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -40,13 +40,11 @@ struct kernel_clone_args; #define CGROUP_WEIGHT_DFL 100 #define CGROUP_WEIGHT_MAX 10000 -/* walk only threadgroup leaders */ -#define CSS_TASK_ITER_PROCS (1U << 0) -/* walk all threaded css_sets in the domain */ -#define CSS_TASK_ITER_THREADED (1U << 1) - -/* internal flags */ -#define CSS_TASK_ITER_SKIPPED (1U << 16) +enum { + CSS_TASK_ITER_PROCS = (1U << 0), /* walk only threadgroup leaders */ + CSS_TASK_ITER_THREADED = (1U << 1), /* walk all threaded css_sets in the domain */ + CSS_TASK_ITER_SKIPPED = (1U << 16), /* internal flags */ +}; /* a css_task_iter should be treated as an opaque object */ struct css_task_iter { diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 68fac2e7cbe6..8c677f185901 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -132,6 +132,7 @@ extern void __init setup_per_cpu_areas(void); extern void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp) __alloc_size(1); extern void __percpu *__alloc_percpu(size_t size, size_t align) __alloc_size(1); extern void free_percpu(void __percpu *__pdata); +extern size_t pcpu_alloc_size(void __percpu *__pdata); DEFINE_FREE(free_percpu, void __percpu *, free_percpu(_T)) diff --git a/include/net/netkit.h b/include/net/netkit.h new file mode 100644 index 000000000000..0ba2e6b847ca --- /dev/null +++ b/include/net/netkit.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2023 Isovalent */ +#ifndef __NET_NETKIT_H +#define __NET_NETKIT_H + +#include <linux/bpf.h> + +#ifdef CONFIG_NETKIT +int netkit_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog); +int netkit_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); +int netkit_prog_detach(const union bpf_attr *attr, struct bpf_prog *prog); +int netkit_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr); +#else +static inline int netkit_prog_attach(const union bpf_attr *attr, + struct bpf_prog *prog) +{ + return -EINVAL; +} + +static inline int netkit_link_attach(const union bpf_attr *attr, + struct bpf_prog *prog) +{ + return -EINVAL; +} + +static inline int netkit_prog_detach(const union bpf_attr *attr, + struct bpf_prog *prog) +{ + return -EINVAL; +} + +static inline int netkit_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return -EINVAL; +} +#endif /* CONFIG_NETKIT */ +#endif /* __NET_NETKIT_H */ diff --git a/include/net/tcx.h b/include/net/tcx.h index 264f147953ba..04be9377785d 100644 --- a/include/net/tcx.h +++ b/include/net/tcx.h @@ -38,16 +38,11 @@ static inline struct tcx_entry *tcx_entry(struct bpf_mprog_entry *entry) return container_of(bundle, struct tcx_entry, bundle); } -static inline struct tcx_link *tcx_link(struct bpf_link *link) +static inline struct tcx_link *tcx_link(const struct bpf_link *link) { return container_of(link, struct tcx_link, link); } -static inline const struct tcx_link *tcx_link_const(const struct bpf_link *link) -{ - return tcx_link((struct bpf_link *)link); -} - void tcx_inc(void); void tcx_dec(void); diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 69b472604b86..f83128007fb0 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -63,6 +63,13 @@ struct xdp_sock { struct xsk_queue *tx ____cacheline_aligned_in_smp; struct list_head tx_list; + /* record the number of tx descriptors sent by this xsk and + * when it exceeds MAX_PER_SOCKET_BUDGET, an opportunity needs + * to be given to other xsks for sending tx descriptors, thereby + * preventing other XSKs from being starved. + */ + u32 tx_budget_spent; + /* Protects generic receive. */ spinlock_t rx_lock; @@ -109,4 +116,13 @@ static inline void __xsk_map_flush(void) #endif /* CONFIG_XDP_SOCKETS */ +#if defined(CONFIG_XDP_SOCKETS) && defined(CONFIG_DEBUG_NET) +bool xsk_map_check_flush(void); +#else +static inline bool xsk_map_check_flush(void) +{ + return false; +} +#endif + #endif /* _LINUX_XDP_SOCK_H */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7ba61b75bc0e..0f6cdf52b1da 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1052,6 +1052,8 @@ enum bpf_attach_type { BPF_CGROUP_UNIX_RECVMSG, BPF_CGROUP_UNIX_GETPEERNAME, BPF_CGROUP_UNIX_GETSOCKNAME, + BPF_NETKIT_PRIMARY, + BPF_NETKIT_PEER, __MAX_BPF_ATTACH_TYPE }; @@ -1071,6 +1073,7 @@ enum bpf_link_type { BPF_LINK_TYPE_NETFILTER = 10, BPF_LINK_TYPE_TCX = 11, BPF_LINK_TYPE_UPROBE_MULTI = 12, + BPF_LINK_TYPE_NETKIT = 13, MAX_BPF_LINK_TYPE, }; @@ -1656,6 +1659,13 @@ union bpf_attr { __u32 flags; __u32 pid; } uprobe_multi; + struct { + union { + __u32 relative_fd; + __u32 relative_id; + }; + __u64 expected_revision; + } netkit; }; } link_create; @@ -6576,6 +6586,10 @@ struct bpf_link_info { __u32 ifindex; __u32 attach_type; } tcx; + struct { + __u32 ifindex; + __u32 attach_type; + } netkit; }; } __attribute__((aligned(8))); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index f4191be137a4..29ff80da2775 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -758,6 +758,30 @@ struct tunnel_msg { __u32 ifindex; }; +/* netkit section */ +enum netkit_action { + NETKIT_NEXT = -1, + NETKIT_PASS = 0, + NETKIT_DROP = 2, + NETKIT_REDIRECT = 7, +}; + +enum netkit_mode { + NETKIT_L2, + NETKIT_L3, +}; + +enum { + IFLA_NETKIT_UNSPEC, + IFLA_NETKIT_PEER_INFO, + IFLA_NETKIT_PRIMARY, + IFLA_NETKIT_POLICY, + IFLA_NETKIT_PEER_POLICY, + IFLA_NETKIT_MODE, + __IFLA_NETKIT_MAX, +}; +#define IFLA_NETKIT_MAX (__IFLA_NETKIT_MAX - 1) + /* VXLAN section */ /* include statistics in the dump */ |