diff options
author | Jakub Kicinski <kuba@kernel.org> | 2020-12-04 18:48:11 +0300 |
---|---|---|
committer | Jakub Kicinski <kuba@kernel.org> | 2020-12-04 18:48:12 +0300 |
commit | a1dd1d86973182458da7798a95f26cfcbea599b4 (patch) | |
tree | 1adda22ea30ccfac7651a7eed7b7c90356f8243a /include | |
parent | 55fd59b003f6e8fd88cf16590e79823d7ccf3026 (diff) | |
parent | eceae70bdeaeb6b8ceb662983cf663ff352fbc96 (diff) | |
download | linux-a1dd1d86973182458da7798a95f26cfcbea599b4.tar.xz |
Merge https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next
Alexei Starovoitov says:
====================
pull-request: bpf-next 2020-12-03
The main changes are:
1) Support BTF in kernel modules, from Andrii.
2) Introduce preferred busy-polling, from Björn.
3) bpf_ima_inode_hash() and bpf_bprm_opts_set() helpers, from KP Singh.
4) Memcg-based memory accounting for bpf objects, from Roman.
5) Allow bpf_{s,g}etsockopt from cgroup bind{4,6} hooks, from Stanislav.
* https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next: (118 commits)
selftests/bpf: Fix invalid use of strncat in test_sockmap
libbpf: Use memcpy instead of strncpy to please GCC
selftests/bpf: Add fentry/fexit/fmod_ret selftest for kernel module
selftests/bpf: Add tp_btf CO-RE reloc test for modules
libbpf: Support attachment of BPF tracing programs to kernel modules
libbpf: Factor out low-level BPF program loading helper
bpf: Allow to specify kernel module BTFs when attaching BPF programs
bpf: Remove hard-coded btf_vmlinux assumption from BPF verifier
selftests/bpf: Add CO-RE relocs selftest relying on kernel module BTF
selftests/bpf: Add support for marking sub-tests as skipped
selftests/bpf: Add bpf_testmod kernel module for testing
libbpf: Add kernel module BTF support for CO-RE relocations
libbpf: Refactor CO-RE relocs to not assume a single BTF object
libbpf: Add internal helper to load BTF data by FD
bpf: Keep module's btf_data_size intact after load
bpf: Fix bpf_put_raw_tracepoint()'s use of __module_address()
selftests/bpf: Add Userspace tests for TCP_WINDOW_CLAMP
bpf: Adds support for setting window clamp
samples/bpf: Fix spelling mistake "recieving" -> "receiving"
bpf: Fix cold build of test_progs-no_alu32
...
====================
Link: https://lore.kernel.org/r/20201204021936.85653-1-alexei.starovoitov@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Diffstat (limited to 'include')
-rw-r--r-- | include/linux/bpf-cgroup.h | 12 | ||||
-rw-r--r-- | include/linux/bpf.h | 71 | ||||
-rw-r--r-- | include/linux/bpf_verifier.h | 28 | ||||
-rw-r--r-- | include/linux/btf.h | 6 | ||||
-rw-r--r-- | include/linux/ima.h | 6 | ||||
-rw-r--r-- | include/linux/memcontrol.h | 215 | ||||
-rw-r--r-- | include/linux/mm.h | 22 | ||||
-rw-r--r-- | include/linux/mm_types.h | 5 | ||||
-rw-r--r-- | include/linux/netdevice.h | 35 | ||||
-rw-r--r-- | include/linux/page-flags.h | 11 | ||||
-rw-r--r-- | include/net/busy_poll.h | 27 | ||||
-rw-r--r-- | include/net/sock.h | 6 | ||||
-rw-r--r-- | include/net/tcp.h | 1 | ||||
-rw-r--r-- | include/net/xdp.h | 3 | ||||
-rw-r--r-- | include/net/xdp_sock_drv.h | 7 | ||||
-rw-r--r-- | include/trace/events/writeback.h | 2 | ||||
-rw-r--r-- | include/uapi/asm-generic/socket.h | 3 | ||||
-rw-r--r-- | include/uapi/linux/bpf.h | 45 |
18 files changed, 399 insertions, 106 deletions
diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index ed71bd1a0825..72e69a0e1e8c 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -246,11 +246,11 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, __ret; \ }) -#define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr) \ - BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET4_BIND) +#define BPF_CGROUP_RUN_PROG_INET4_BIND_LOCK(sk, uaddr) \ + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET4_BIND, NULL) -#define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr) \ - BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET6_BIND) +#define BPF_CGROUP_RUN_PROG_INET6_BIND_LOCK(sk, uaddr) \ + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET6_BIND, NULL) #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (cgroup_bpf_enabled && \ sk->sk_prot->pre_connect) @@ -434,8 +434,8 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk) ({ 0; }) -#define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr) ({ 0; }) -#define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_INET4_BIND_LOCK(sk, uaddr) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_INET6_BIND_LOCK(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr) ({ 0; }) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 581b2a2e78eb..d05e75ed8c1b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -20,6 +20,8 @@ #include <linux/module.h> #include <linux/kallsyms.h> #include <linux/capability.h> +#include <linux/sched/mm.h> +#include <linux/slab.h> struct bpf_verifier_env; struct bpf_verifier_log; @@ -37,6 +39,7 @@ struct bpf_iter_aux_info; struct bpf_local_storage; struct bpf_local_storage_map; struct kobject; +struct mem_cgroup; extern struct idr btf_idr; extern spinlock_t btf_idr_lock; @@ -135,11 +138,6 @@ struct bpf_map_ops { const struct bpf_iter_seq_info *iter_seq_info; }; -struct bpf_map_memory { - u32 pages; - struct user_struct *user; -}; - struct bpf_map { /* The first two cachelines with read-mostly members of which some * are also accessed in fast-path (e.g. ops, max_entries). @@ -160,7 +158,9 @@ struct bpf_map { u32 btf_key_type_id; u32 btf_value_type_id; struct btf *btf; - struct bpf_map_memory memory; +#ifdef CONFIG_MEMCG_KMEM + struct mem_cgroup *memcg; +#endif char name[BPF_OBJ_NAME_LEN]; u32 btf_vmlinux_value_type_id; bool bypass_spec_v1; @@ -421,7 +421,10 @@ struct bpf_insn_access_aux { enum bpf_reg_type reg_type; union { int ctx_field_size; - u32 btf_id; + struct { + struct btf *btf; + u32 btf_id; + }; }; struct bpf_verifier_log *log; /* for verbose logs */ }; @@ -458,6 +461,7 @@ struct bpf_verifier_ops { struct bpf_insn *dst, struct bpf_prog *prog, u32 *target_size); int (*btf_struct_access)(struct bpf_verifier_log *log, + const struct btf *btf, const struct btf_type *t, int off, int size, enum bpf_access_type atype, u32 *next_btf_id); @@ -771,6 +775,7 @@ struct bpf_prog_aux { u32 ctx_arg_info_size; u32 max_rdonly_access; u32 max_rdwr_access; + struct btf *attach_btf; const struct bpf_ctx_arg_aux *ctx_arg_info; struct mutex dst_mutex; /* protects dst_* pointers below, *after* prog becomes visible */ struct bpf_prog *dst_prog; @@ -1005,7 +1010,6 @@ struct bpf_event_entry { bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp); int bpf_prog_calc_tag(struct bpf_prog *fp); -const char *kernel_type_name(u32 btf_type_id); const struct bpf_func_proto *bpf_get_trace_printk_proto(void); @@ -1202,8 +1206,6 @@ void bpf_prog_sub(struct bpf_prog *prog, int i); void bpf_prog_inc(struct bpf_prog *prog); struct bpf_prog * __must_check bpf_prog_inc_not_zero(struct bpf_prog *prog); void bpf_prog_put(struct bpf_prog *prog); -int __bpf_prog_charge(struct user_struct *user, u32 pages); -void __bpf_prog_uncharge(struct user_struct *user, u32 pages); void __bpf_free_used_maps(struct bpf_prog_aux *aux, struct bpf_map **used_maps, u32 len); @@ -1218,12 +1220,6 @@ void bpf_map_inc_with_uref(struct bpf_map *map); struct bpf_map * __must_check bpf_map_inc_not_zero(struct bpf_map *map); void bpf_map_put_with_uref(struct bpf_map *map); void bpf_map_put(struct bpf_map *map); -int bpf_map_charge_memlock(struct bpf_map *map, u32 pages); -void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages); -int bpf_map_charge_init(struct bpf_map_memory *mem, u64 size); -void bpf_map_charge_finish(struct bpf_map_memory *mem); -void bpf_map_charge_move(struct bpf_map_memory *dst, - struct bpf_map_memory *src); void *bpf_map_area_alloc(u64 size, int numa_node); void *bpf_map_area_mmapable_alloc(u64 size, int numa_node); void bpf_map_area_free(void *base); @@ -1240,6 +1236,34 @@ int generic_map_delete_batch(struct bpf_map *map, struct bpf_map *bpf_map_get_curr_or_next(u32 *id); struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id); +#ifdef CONFIG_MEMCG_KMEM +void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags, + int node); +void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags); +void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size, + size_t align, gfp_t flags); +#else +static inline void * +bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags, + int node) +{ + return kmalloc_node(size, flags, node); +} + +static inline void * +bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags) +{ + return kzalloc(size, flags); +} + +static inline void __percpu * +bpf_map_alloc_percpu(const struct bpf_map *map, size_t size, size_t align, + gfp_t flags) +{ + return __alloc_percpu_gfp(size, align, flags); +} +#endif + extern int sysctl_unprivileged_bpf_disabled; static inline bool bpf_allow_ptr_leaks(void) @@ -1430,12 +1454,13 @@ int bpf_prog_test_run_raw_tp(struct bpf_prog *prog, bool btf_ctx_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info); -int btf_struct_access(struct bpf_verifier_log *log, +int btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf, const struct btf_type *t, int off, int size, enum bpf_access_type atype, u32 *next_btf_id); bool btf_struct_ids_match(struct bpf_verifier_log *log, - int off, u32 id, u32 need_type_id); + const struct btf *btf, u32 id, int off, + const struct btf *need_btf, u32 need_type_id); int btf_distill_func_proto(struct bpf_verifier_log *log, struct btf *btf, @@ -1490,15 +1515,6 @@ bpf_prog_inc_not_zero(struct bpf_prog *prog) return ERR_PTR(-EOPNOTSUPP); } -static inline int __bpf_prog_charge(struct user_struct *user, u32 pages) -{ - return 0; -} - -static inline void __bpf_prog_uncharge(struct user_struct *user, u32 pages) -{ -} - static inline void bpf_link_init(struct bpf_link *link, enum bpf_link_type type, const struct bpf_link_ops *ops, struct bpf_prog *prog) @@ -1842,6 +1858,7 @@ extern const struct bpf_func_proto bpf_copy_from_user_proto; extern const struct bpf_func_proto bpf_snprintf_btf_proto; extern const struct bpf_func_proto bpf_per_cpu_ptr_proto; extern const struct bpf_func_proto bpf_this_cpu_ptr_proto; +extern const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 306869d4743b..e941fe1484e5 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -5,6 +5,7 @@ #define _LINUX_BPF_VERIFIER_H 1 #include <linux/bpf.h> /* for enum bpf_reg_type */ +#include <linux/btf.h> /* for struct btf and btf_id() */ #include <linux/filter.h> /* for MAX_BPF_STACK */ #include <linux/tnum.h> @@ -43,6 +44,8 @@ enum bpf_reg_liveness { struct bpf_reg_state { /* Ordering of fields matters. See states_equal() */ enum bpf_reg_type type; + /* Fixed part of pointer offset, pointer types only */ + s32 off; union { /* valid when type == PTR_TO_PACKET */ int range; @@ -52,15 +55,20 @@ struct bpf_reg_state { */ struct bpf_map *map_ptr; - u32 btf_id; /* for PTR_TO_BTF_ID */ + /* for PTR_TO_BTF_ID */ + struct { + struct btf *btf; + u32 btf_id; + }; u32 mem_size; /* for PTR_TO_MEM | PTR_TO_MEM_OR_NULL */ /* Max size from any of the above. */ - unsigned long raw; + struct { + unsigned long raw1; + unsigned long raw2; + } raw; }; - /* Fixed part of pointer offset, pointer types only */ - s32 off; /* For PTR_TO_PACKET, used to find other pointers with the same variable * offset, so they can share range knowledge. * For PTR_TO_MAP_VALUE_OR_NULL this is used to share which map value we @@ -311,7 +319,10 @@ struct bpf_insn_aux_data { struct { enum bpf_reg_type reg_type; /* type of pseudo_btf_id */ union { - u32 btf_id; /* btf_id for struct typed var */ + struct { + struct btf *btf; + u32 btf_id; /* btf_id for struct typed var */ + }; u32 mem_size; /* mem_size for non-struct typed var */ }; } btf_var; @@ -459,9 +470,12 @@ int check_ctx_reg(struct bpf_verifier_env *env, /* this lives here instead of in bpf.h because it needs to dereference tgt_prog */ static inline u64 bpf_trampoline_compute_key(const struct bpf_prog *tgt_prog, - u32 btf_id) + struct btf *btf, u32 btf_id) { - return tgt_prog ? (((u64)tgt_prog->aux->id) << 32 | btf_id) : btf_id; + if (tgt_prog) + return ((u64)tgt_prog->aux->id << 32) | btf_id; + else + return ((u64)btf_obj_id(btf) << 32) | 0x80000000 | btf_id; } int bpf_check_attach_target(struct bpf_verifier_log *log, diff --git a/include/linux/btf.h b/include/linux/btf.h index 2bf641829664..4c200f5d242b 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -18,6 +18,7 @@ struct btf_show; extern const struct file_operations btf_fops; +void btf_get(struct btf *btf); void btf_put(struct btf *btf); int btf_new_fd(const union bpf_attr *attr); struct btf *btf_get_by_fd(int fd); @@ -88,7 +89,8 @@ int btf_type_snprintf_show(const struct btf *btf, u32 type_id, void *obj, char *buf, int len, u64 flags); int btf_get_fd_by_id(u32 id); -u32 btf_id(const struct btf *btf); +u32 btf_obj_id(const struct btf *btf); +bool btf_is_kernel(const struct btf *btf); bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s, const struct btf_member *m, u32 expected_offset, u32 expected_size); @@ -206,6 +208,8 @@ static inline const struct btf_var_secinfo *btf_type_var_secinfo( } #ifdef CONFIG_BPF_SYSCALL +struct bpf_prog; + const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id); const char *btf_name_by_offset(const struct btf *btf, u32 offset); struct btf *btf_parse_vmlinux(void); diff --git a/include/linux/ima.h b/include/linux/ima.h index 8fa7bcfb2da2..7233a2751754 100644 --- a/include/linux/ima.h +++ b/include/linux/ima.h @@ -29,6 +29,7 @@ extern int ima_post_read_file(struct file *file, void *buf, loff_t size, enum kernel_read_file_id id); extern void ima_post_path_mknod(struct dentry *dentry); extern int ima_file_hash(struct file *file, char *buf, size_t buf_size); +extern int ima_inode_hash(struct inode *inode, char *buf, size_t buf_size); extern void ima_kexec_cmdline(int kernel_fd, const void *buf, int size); #ifdef CONFIG_IMA_KEXEC @@ -115,6 +116,11 @@ static inline int ima_file_hash(struct file *file, char *buf, size_t buf_size) return -EOPNOTSUPP; } +static inline int ima_inode_hash(struct inode *inode, char *buf, size_t buf_size) +{ + return -EOPNOTSUPP; +} + static inline void ima_kexec_cmdline(int kernel_fd, const void *buf, int size) {} #endif /* CONFIG_IMA */ diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 922a7f600465..320369c841f5 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -343,6 +343,175 @@ struct mem_cgroup { extern struct mem_cgroup *root_mem_cgroup; +enum page_memcg_data_flags { + /* page->memcg_data is a pointer to an objcgs vector */ + MEMCG_DATA_OBJCGS = (1UL << 0), + /* page has been accounted as a non-slab kernel page */ + MEMCG_DATA_KMEM = (1UL << 1), + /* the next bit after the last actual flag */ + __NR_MEMCG_DATA_FLAGS = (1UL << 2), +}; + +#define MEMCG_DATA_FLAGS_MASK (__NR_MEMCG_DATA_FLAGS - 1) + +/* + * page_memcg - get the memory cgroup associated with a page + * @page: a pointer to the page struct + * + * Returns a pointer to the memory cgroup associated with the page, + * or NULL. This function assumes that the page is known to have a + * proper memory cgroup pointer. It's not safe to call this function + * against some type of pages, e.g. slab pages or ex-slab pages. + * + * Any of the following ensures page and memcg binding stability: + * - the page lock + * - LRU isolation + * - lock_page_memcg() + * - exclusive reference + */ +static inline struct mem_cgroup *page_memcg(struct page *page) +{ + unsigned long memcg_data = page->memcg_data; + + VM_BUG_ON_PAGE(PageSlab(page), page); + VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_OBJCGS, page); + + return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); +} + +/* + * page_memcg_rcu - locklessly get the memory cgroup associated with a page + * @page: a pointer to the page struct + * + * Returns a pointer to the memory cgroup associated with the page, + * or NULL. This function assumes that the page is known to have a + * proper memory cgroup pointer. It's not safe to call this function + * against some type of pages, e.g. slab pages or ex-slab pages. + */ +static inline struct mem_cgroup *page_memcg_rcu(struct page *page) +{ + VM_BUG_ON_PAGE(PageSlab(page), page); + WARN_ON_ONCE(!rcu_read_lock_held()); + + return (struct mem_cgroup *)(READ_ONCE(page->memcg_data) & + ~MEMCG_DATA_FLAGS_MASK); +} + +/* + * page_memcg_check - get the memory cgroup associated with a page + * @page: a pointer to the page struct + * + * Returns a pointer to the memory cgroup associated with the page, + * or NULL. This function unlike page_memcg() can take any page + * as an argument. It has to be used in cases when it's not known if a page + * has an associated memory cgroup pointer or an object cgroups vector. + * + * Any of the following ensures page and memcg binding stability: + * - the page lock + * - LRU isolation + * - lock_page_memcg() + * - exclusive reference + */ +static inline struct mem_cgroup *page_memcg_check(struct page *page) +{ + /* + * Because page->memcg_data might be changed asynchronously + * for slab pages, READ_ONCE() should be used here. + */ + unsigned long memcg_data = READ_ONCE(page->memcg_data); + + if (memcg_data & MEMCG_DATA_OBJCGS) + return NULL; + + return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); +} + +/* + * PageMemcgKmem - check if the page has MemcgKmem flag set + * @page: a pointer to the page struct + * + * Checks if the page has MemcgKmem flag set. The caller must ensure that + * the page has an associated memory cgroup. It's not safe to call this function + * against some types of pages, e.g. slab pages. + */ +static inline bool PageMemcgKmem(struct page *page) +{ + VM_BUG_ON_PAGE(page->memcg_data & MEMCG_DATA_OBJCGS, page); + return page->memcg_data & MEMCG_DATA_KMEM; +} + +#ifdef CONFIG_MEMCG_KMEM +/* + * page_objcgs - get the object cgroups vector associated with a page + * @page: a pointer to the page struct + * + * Returns a pointer to the object cgroups vector associated with the page, + * or NULL. This function assumes that the page is known to have an + * associated object cgroups vector. It's not safe to call this function + * against pages, which might have an associated memory cgroup: e.g. + * kernel stack pages. + */ +static inline struct obj_cgroup **page_objcgs(struct page *page) +{ + unsigned long memcg_data = READ_ONCE(page->memcg_data); + + VM_BUG_ON_PAGE(memcg_data && !(memcg_data & MEMCG_DATA_OBJCGS), page); + VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_KMEM, page); + + return (struct obj_cgroup **)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); +} + +/* + * page_objcgs_check - get the object cgroups vector associated with a page + * @page: a pointer to the page struct + * + * Returns a pointer to the object cgroups vector associated with the page, + * or NULL. This function is safe to use if the page can be directly associated + * with a memory cgroup. + */ +static inline struct obj_cgroup **page_objcgs_check(struct page *page) +{ + unsigned long memcg_data = READ_ONCE(page->memcg_data); + + if (!memcg_data || !(memcg_data & MEMCG_DATA_OBJCGS)) + return NULL; + + VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_KMEM, page); + + return (struct obj_cgroup **)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); +} + +/* + * set_page_objcgs - associate a page with a object cgroups vector + * @page: a pointer to the page struct + * @objcgs: a pointer to the object cgroups vector + * + * Atomically associates a page with a vector of object cgroups. + */ +static inline bool set_page_objcgs(struct page *page, + struct obj_cgroup **objcgs) +{ + return !cmpxchg(&page->memcg_data, 0, (unsigned long)objcgs | + MEMCG_DATA_OBJCGS); +} +#else +static inline struct obj_cgroup **page_objcgs(struct page *page) +{ + return NULL; +} + +static inline struct obj_cgroup **page_objcgs_check(struct page *page) +{ + return NULL; +} + +static inline bool set_page_objcgs(struct page *page, + struct obj_cgroup **objcgs) +{ + return true; +} +#endif + static __always_inline bool memcg_stat_item_in_bytes(int idx) { if (idx == MEMCG_PERCPU_B) @@ -743,15 +912,19 @@ static inline void mod_memcg_state(struct mem_cgroup *memcg, static inline void __mod_memcg_page_state(struct page *page, int idx, int val) { - if (page->mem_cgroup) - __mod_memcg_state(page->mem_cgroup, idx, val); + struct mem_cgroup *memcg = page_memcg(page); + + if (memcg) + __mod_memcg_state(memcg, idx, val); } static inline void mod_memcg_page_state(struct page *page, int idx, int val) { - if (page->mem_cgroup) - mod_memcg_state(page->mem_cgroup, idx, val); + struct mem_cgroup *memcg = page_memcg(page); + + if (memcg) + mod_memcg_state(memcg, idx, val); } static inline unsigned long lruvec_page_state(struct lruvec *lruvec, @@ -834,16 +1007,17 @@ static inline void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx, int val) { struct page *head = compound_head(page); /* rmap on tail pages */ + struct mem_cgroup *memcg = page_memcg(head); pg_data_t *pgdat = page_pgdat(page); struct lruvec *lruvec; /* Untracked pages have no memcg, no lruvec. Update only the node */ - if (!head->mem_cgroup) { + if (!memcg) { __mod_node_page_state(pgdat, idx, val); return; } - lruvec = mem_cgroup_lruvec(head->mem_cgroup, pgdat); + lruvec = mem_cgroup_lruvec(memcg, pgdat); __mod_lruvec_state(lruvec, idx, val); } @@ -878,8 +1052,10 @@ static inline void count_memcg_events(struct mem_cgroup *memcg, static inline void count_memcg_page_event(struct page *page, enum vm_event_item idx) { - if (page->mem_cgroup) - count_memcg_events(page->mem_cgroup, idx, 1); + struct mem_cgroup *memcg = page_memcg(page); + + if (memcg) + count_memcg_events(memcg, idx, 1); } static inline void count_memcg_event_mm(struct mm_struct *mm, @@ -948,6 +1124,27 @@ void mem_cgroup_split_huge_fixup(struct page *head); struct mem_cgroup; +static inline struct mem_cgroup *page_memcg(struct page *page) +{ + return NULL; +} + +static inline struct mem_cgroup *page_memcg_rcu(struct page *page) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + return NULL; +} + +static inline struct mem_cgroup *page_memcg_check(struct page *page) +{ + return NULL; +} + +static inline bool PageMemcgKmem(struct page *page) +{ + return false; +} + static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) { return true; @@ -1437,7 +1634,7 @@ static inline void mem_cgroup_track_foreign_dirty(struct page *page, if (mem_cgroup_disabled()) return; - if (unlikely(&page->mem_cgroup->css != wb->memcg_css)) + if (unlikely(&page_memcg(page)->css != wb->memcg_css)) mem_cgroup_track_foreign_dirty_slowpath(page, wb); } diff --git a/include/linux/mm.h b/include/linux/mm.h index db6ae4d3fb4e..6b0c9d2c1d10 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1484,28 +1484,6 @@ static inline void set_page_links(struct page *page, enum zone_type zone, #endif } -#ifdef CONFIG_MEMCG -static inline struct mem_cgroup *page_memcg(struct page *page) -{ - return page->mem_cgroup; -} -static inline struct mem_cgroup *page_memcg_rcu(struct page *page) -{ - WARN_ON_ONCE(!rcu_read_lock_held()); - return READ_ONCE(page->mem_cgroup); -} -#else -static inline struct mem_cgroup *page_memcg(struct page *page) -{ - return NULL; -} -static inline struct mem_cgroup *page_memcg_rcu(struct page *page) -{ - WARN_ON_ONCE(!rcu_read_lock_held()); - return NULL; -} -#endif - /* * Some inline functions in vmstat.h depend on page_zone() */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 5a9238f6caad..80f5d755c037 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -199,10 +199,7 @@ struct page { atomic_t _refcount; #ifdef CONFIG_MEMCG - union { - struct mem_cgroup *mem_cgroup; - struct obj_cgroup **obj_cgroups; - }; + unsigned long memcg_data; #endif /* diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7f85c23d52ab..7bf167993c05 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -350,23 +350,25 @@ struct napi_struct { }; enum { - NAPI_STATE_SCHED, /* Poll is scheduled */ - NAPI_STATE_MISSED, /* reschedule a napi */ - NAPI_STATE_DISABLE, /* Disable pending */ - NAPI_STATE_NPSVC, /* Netpoll - don't dequeue from poll_list */ - NAPI_STATE_LISTED, /* NAPI added to system lists */ - NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */ - NAPI_STATE_IN_BUSY_POLL,/* sk_busy_loop() owns this NAPI */ + NAPI_STATE_SCHED, /* Poll is scheduled */ + NAPI_STATE_MISSED, /* reschedule a napi */ + NAPI_STATE_DISABLE, /* Disable pending */ + NAPI_STATE_NPSVC, /* Netpoll - don't dequeue from poll_list */ + NAPI_STATE_LISTED, /* NAPI added to system lists */ + NAPI_STATE_NO_BUSY_POLL, /* Do not add in napi_hash, no busy polling */ + NAPI_STATE_IN_BUSY_POLL, /* sk_busy_loop() owns this NAPI */ + NAPI_STATE_PREFER_BUSY_POLL, /* prefer busy-polling over softirq processing*/ }; enum { - NAPIF_STATE_SCHED = BIT(NAPI_STATE_SCHED), - NAPIF_STATE_MISSED = BIT(NAPI_STATE_MISSED), - NAPIF_STATE_DISABLE = BIT(NAPI_STATE_DISABLE), - NAPIF_STATE_NPSVC = BIT(NAPI_STATE_NPSVC), - NAPIF_STATE_LISTED = BIT(NAPI_STATE_LISTED), - NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL), - NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL), + NAPIF_STATE_SCHED = BIT(NAPI_STATE_SCHED), + NAPIF_STATE_MISSED = BIT(NAPI_STATE_MISSED), + NAPIF_STATE_DISABLE = BIT(NAPI_STATE_DISABLE), + NAPIF_STATE_NPSVC = BIT(NAPI_STATE_NPSVC), + NAPIF_STATE_LISTED = BIT(NAPI_STATE_LISTED), + NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL), + NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL), + NAPIF_STATE_PREFER_BUSY_POLL = BIT(NAPI_STATE_PREFER_BUSY_POLL), }; enum gro_result { @@ -437,6 +439,11 @@ static inline bool napi_disable_pending(struct napi_struct *n) return test_bit(NAPI_STATE_DISABLE, &n->state); } +static inline bool napi_prefer_busy_poll(struct napi_struct *n) +{ + return test_bit(NAPI_STATE_PREFER_BUSY_POLL, &n->state); +} + bool napi_schedule_prep(struct napi_struct *n); /** diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 4f6ba9379112..fc0e1bd48e73 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -715,9 +715,8 @@ PAGEFLAG_FALSE(DoubleMap) #define PAGE_MAPCOUNT_RESERVE -128 #define PG_buddy 0x00000080 #define PG_offline 0x00000100 -#define PG_kmemcg 0x00000200 -#define PG_table 0x00000400 -#define PG_guard 0x00000800 +#define PG_table 0x00000200 +#define PG_guard 0x00000400 #define PageType(page, flag) \ ((page->page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE) @@ -769,12 +768,6 @@ PAGE_TYPE_OPS(Buddy, buddy) PAGE_TYPE_OPS(Offline, offline) /* - * If kmemcg is enabled, the buddy allocator will set PageKmemcg() on - * pages allocated with __GFP_ACCOUNT. It gets cleared on page free. - */ -PAGE_TYPE_OPS(Kmemcg, kmemcg) - -/* * Marks pages in use as page tables. */ PAGE_TYPE_OPS(Table, table) diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index b001fa91c14e..73af4a64a599 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -23,6 +23,8 @@ */ #define MIN_NAPI_ID ((unsigned int)(NR_CPUS + 1)) +#define BUSY_POLL_BUDGET 8 + #ifdef CONFIG_NET_RX_BUSY_POLL struct napi_struct; @@ -43,7 +45,7 @@ bool sk_busy_loop_end(void *p, unsigned long start_time); void napi_busy_loop(unsigned int napi_id, bool (*loop_end)(void *, unsigned long), - void *loop_end_arg); + void *loop_end_arg, bool prefer_busy_poll, u16 budget); #else /* CONFIG_NET_RX_BUSY_POLL */ static inline unsigned long net_busy_loop_on(void) @@ -105,7 +107,9 @@ static inline void sk_busy_loop(struct sock *sk, int nonblock) unsigned int napi_id = READ_ONCE(sk->sk_napi_id); if (napi_id >= MIN_NAPI_ID) - napi_busy_loop(napi_id, nonblock ? NULL : sk_busy_loop_end, sk); + napi_busy_loop(napi_id, nonblock ? NULL : sk_busy_loop_end, sk, + READ_ONCE(sk->sk_prefer_busy_poll), + READ_ONCE(sk->sk_busy_poll_budget) ?: BUSY_POLL_BUDGET); #endif } @@ -131,13 +135,28 @@ static inline void sk_mark_napi_id(struct sock *sk, const struct sk_buff *skb) sk_rx_queue_set(sk, skb); } +static inline void __sk_mark_napi_id_once(struct sock *sk, unsigned int napi_id) +{ +#ifdef CONFIG_NET_RX_BUSY_POLL + if (!READ_ONCE(sk->sk_napi_id)) + WRITE_ONCE(sk->sk_napi_id, napi_id); +#endif +} + /* variant used for unconnected sockets */ static inline void sk_mark_napi_id_once(struct sock *sk, const struct sk_buff *skb) { #ifdef CONFIG_NET_RX_BUSY_POLL - if (!READ_ONCE(sk->sk_napi_id)) - WRITE_ONCE(sk->sk_napi_id, skb->napi_id); + __sk_mark_napi_id_once(sk, skb->napi_id); +#endif +} + +static inline void sk_mark_napi_id_once_xdp(struct sock *sk, + const struct xdp_buff *xdp) +{ +#ifdef CONFIG_NET_RX_BUSY_POLL + __sk_mark_napi_id_once(sk, xdp->rxq->napi_id); #endif } diff --git a/include/net/sock.h b/include/net/sock.h index f59764614e30..ffacdfdd9894 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -301,6 +301,8 @@ struct bpf_local_storage; * @sk_ack_backlog: current listen backlog * @sk_max_ack_backlog: listen backlog set in listen() * @sk_uid: user id of owner + * @sk_prefer_busy_poll: prefer busypolling over softirq processing + * @sk_busy_poll_budget: napi processing budget when busypolling * @sk_priority: %SO_PRIORITY setting * @sk_type: socket type (%SOCK_STREAM, etc) * @sk_protocol: which protocol this socket belongs in this network family @@ -479,6 +481,10 @@ struct sock { u32 sk_ack_backlog; u32 sk_max_ack_backlog; kuid_t sk_uid; +#ifdef CONFIG_NET_RX_BUSY_POLL + u8 sk_prefer_busy_poll; + u16 sk_busy_poll_budget; +#endif struct pid *sk_peer_pid; const struct cred *sk_peer_cred; long sk_rcvtimeo; diff --git a/include/net/tcp.h b/include/net/tcp.h index 4525d6256321..a62fb7f8a1e3 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -410,6 +410,7 @@ void tcp_syn_ack_timeout(const struct request_sock *req); int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len); int tcp_set_rcvlowat(struct sock *sk, int val); +int tcp_set_window_clamp(struct sock *sk, int val); void tcp_data_ready(struct sock *sk); #ifdef CONFIG_MMU int tcp_mmap(struct file *file, struct socket *sock, diff --git a/include/net/xdp.h b/include/net/xdp.h index 7d48b2ae217a..700ad5db7f5d 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -59,6 +59,7 @@ struct xdp_rxq_info { u32 queue_index; u32 reg_state; struct xdp_mem_info mem; + unsigned int napi_id; } ____cacheline_aligned; /* perf critical, avoid false-sharing */ struct xdp_txq_info { @@ -226,7 +227,7 @@ static inline void xdp_release_frame(struct xdp_frame *xdpf) } int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, - struct net_device *dev, u32 queue_index); + struct net_device *dev, u32 queue_index, unsigned int napi_id); void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq); void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq); bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq); diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index 5b1ee8a9976d..4e295541e396 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -13,6 +13,7 @@ void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries); bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc); +u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, struct xdp_desc *desc, u32 max); void xsk_tx_release(struct xsk_buff_pool *pool); struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev, u16 queue_id); @@ -128,6 +129,12 @@ static inline bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, return false; } +static inline u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, struct xdp_desc *desc, + u32 max) +{ + return 0; +} + static inline void xsk_tx_release(struct xsk_buff_pool *pool) { } diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 57d795365987..1efa463c4979 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -257,7 +257,7 @@ TRACE_EVENT(track_foreign_dirty, __entry->ino = inode ? inode->i_ino : 0; __entry->memcg_id = wb->memcg_css->id; __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); - __entry->page_cgroup_ino = cgroup_ino(page->mem_cgroup->css.cgroup); + __entry->page_cgroup_ino = cgroup_ino(page_memcg(page)->css.cgroup); ), TP_printk("bdi %s[%llu]: ino=%lu memcg_id=%u cgroup_ino=%lu page_cgroup_ino=%lu", diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h index 77f7c1638eb1..4dcd13d097a9 100644 --- a/include/uapi/asm-generic/socket.h +++ b/include/uapi/asm-generic/socket.h @@ -119,6 +119,9 @@ #define SO_DETACH_REUSEPORT_BPF 68 +#define SO_PREFER_BUSY_POLL 69 +#define SO_BUSY_POLL_BUDGET 70 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__)) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 162999b12790..1233f14f659f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -557,7 +557,12 @@ union bpf_attr { __aligned_u64 line_info; /* line info */ __u32 line_info_cnt; /* number of bpf_line_info records */ __u32 attach_btf_id; /* in-kernel BTF type id to attach to */ - __u32 attach_prog_fd; /* 0 to attach to vmlinux */ + union { + /* valid prog_fd to attach to bpf prog */ + __u32 attach_prog_fd; + /* or valid module BTF object fd or 0 to attach to vmlinux */ + __u32 attach_btf_obj_fd; + }; }; struct { /* anonymous struct used by BPF_OBJ_* commands */ @@ -3787,6 +3792,36 @@ union bpf_attr { * *ARG_PTR_TO_BTF_ID* of type *task_struct*. * Return * Pointer to the current task. + * + * long bpf_bprm_opts_set(struct linux_binprm *bprm, u64 flags) + * Description + * Set or clear certain options on *bprm*: + * + * **BPF_F_BPRM_SECUREEXEC** Set the secureexec bit + * which sets the **AT_SECURE** auxv for glibc. The bit + * is cleared if the flag is not specified. + * Return + * **-EINVAL** if invalid *flags* are passed, zero otherwise. + * + * u64 bpf_ktime_get_coarse_ns(void) + * Description + * Return a coarse-grained version of the time elapsed since + * system boot, in nanoseconds. Does not include time the system + * was suspended. + * + * See: **clock_gettime**\ (**CLOCK_MONOTONIC_COARSE**) + * Return + * Current *ktime*. + * + * long bpf_ima_inode_hash(struct inode *inode, void *dst, u32 size) + * Description + * Returns the stored IMA hash of the *inode* (if it's avaialable). + * If the hash is larger than *size*, then only *size* + * bytes will be copied to *dst* + * Return + * The **hash_algo** is returned on success, + * **-EOPNOTSUP** if IMA is disabled or **-EINVAL** if + * invalid arguments are passed. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3948,6 +3983,9 @@ union bpf_attr { FN(task_storage_get), \ FN(task_storage_delete), \ FN(get_current_task_btf), \ + FN(bprm_opts_set), \ + FN(ktime_get_coarse_ns), \ + FN(ima_inode_hash), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper @@ -4119,6 +4157,11 @@ enum bpf_lwt_encap_mode { BPF_LWT_ENCAP_IP, }; +/* Flags for bpf_bprm_opts_set helper */ +enum { + BPF_F_BPRM_SECUREEXEC = (1ULL << 0), +}; + #define __bpf_md_ptr(type, name) \ union { \ type name; \ |