From 27ae7997a66174cb8afd6a75b3989f5e0c1b9e5a Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 8 Jan 2020 16:35:03 -0800 Subject: bpf: Introduce BPF_PROG_TYPE_STRUCT_OPS This patch allows the kernel's struct ops (i.e. func ptr) to be implemented in BPF. The first use case in this series is the "struct tcp_congestion_ops" which will be introduced in a latter patch. This patch introduces a new prog type BPF_PROG_TYPE_STRUCT_OPS. The BPF_PROG_TYPE_STRUCT_OPS prog is verified against a particular func ptr of a kernel struct. The attr->attach_btf_id is the btf id of a kernel struct. The attr->expected_attach_type is the member "index" of that kernel struct. The first member of a struct starts with member index 0. That will avoid ambiguity when a kernel struct has multiple func ptrs with the same func signature. For example, a BPF_PROG_TYPE_STRUCT_OPS prog is written to implement the "init" func ptr of the "struct tcp_congestion_ops". The attr->attach_btf_id is the btf id of the "struct tcp_congestion_ops" of the _running_ kernel. The attr->expected_attach_type is 3. The ctx of BPF_PROG_TYPE_STRUCT_OPS is an array of u64 args saved by arch_prepare_bpf_trampoline that will be done in the next patch when introducing BPF_MAP_TYPE_STRUCT_OPS. "struct bpf_struct_ops" is introduced as a common interface for the kernel struct that supports BPF_PROG_TYPE_STRUCT_OPS prog. The supporting kernel struct will need to implement an instance of the "struct bpf_struct_ops". The supporting kernel struct also needs to implement a bpf_verifier_ops. During BPF_PROG_LOAD, bpf_struct_ops_find() will find the right bpf_verifier_ops by searching the attr->attach_btf_id. A new "btf_struct_access" is also added to the bpf_verifier_ops such that the supporting kernel struct can optionally provide its own specific check on accessing the func arg (e.g. provide limited write access). After btf_vmlinux is parsed, the new bpf_struct_ops_init() is called to initialize some values (e.g. the btf id of the supporting kernel struct) and it can only be done once the btf_vmlinux is available. The R0 checks at BPF_EXIT is excluded for the BPF_PROG_TYPE_STRUCT_OPS prog if the return type of the prog->aux->attach_func_proto is "void". Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200109003503.3855825-1-kafai@fb.com --- include/linux/bpf.h | 30 ++++++++++++++++++++++++++++++ include/linux/bpf_types.h | 4 ++++ include/linux/btf.h | 34 ++++++++++++++++++++++++++++++++++ include/uapi/linux/bpf.h | 1 + 4 files changed, 69 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b14e51d56a82..50f3b20ae284 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -349,6 +349,10 @@ struct bpf_verifier_ops { const struct bpf_insn *src, struct bpf_insn *dst, struct bpf_prog *prog, u32 *target_size); + int (*btf_struct_access)(struct bpf_verifier_log *log, + const struct btf_type *t, int off, int size, + enum bpf_access_type atype, + u32 *next_btf_id); }; struct bpf_prog_offload_ops { @@ -668,6 +672,32 @@ struct bpf_array_aux { struct work_struct work; }; +struct btf_type; +struct btf_member; + +#define BPF_STRUCT_OPS_MAX_NR_MEMBERS 64 +struct bpf_struct_ops { + const struct bpf_verifier_ops *verifier_ops; + int (*init)(struct btf *btf); + int (*check_member)(const struct btf_type *t, + const struct btf_member *member); + const struct btf_type *type; + const char *name; + struct btf_func_model func_models[BPF_STRUCT_OPS_MAX_NR_MEMBERS]; + u32 type_id; +}; + +#if defined(CONFIG_BPF_JIT) && defined(CONFIG_BPF_SYSCALL) +const struct bpf_struct_ops *bpf_struct_ops_find(u32 type_id); +void bpf_struct_ops_init(struct btf *btf); +#else +static inline const struct bpf_struct_ops *bpf_struct_ops_find(u32 type_id) +{ + return NULL; +} +static inline void bpf_struct_ops_init(struct btf *btf) { } +#endif + struct bpf_array { struct bpf_map map; u32 elem_size; diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 93740b3614d7..fadd243ffa2d 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -65,6 +65,10 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2, BPF_PROG_TYPE(BPF_PROG_TYPE_SK_REUSEPORT, sk_reuseport, struct sk_reuseport_md, struct sk_reuseport_kern) #endif +#if defined(CONFIG_BPF_JIT) +BPF_PROG_TYPE(BPF_PROG_TYPE_STRUCT_OPS, bpf_struct_ops, + void *, void *) +#endif BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops) diff --git a/include/linux/btf.h b/include/linux/btf.h index 79d4abc2556a..f74a09a7120b 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -53,6 +53,18 @@ bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s, u32 expected_offset, u32 expected_size); int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t); bool btf_type_is_void(const struct btf_type *t); +s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind); +const struct btf_type *btf_type_skip_modifiers(const struct btf *btf, + u32 id, u32 *res_id); +const struct btf_type *btf_type_resolve_ptr(const struct btf *btf, + u32 id, u32 *res_id); +const struct btf_type *btf_type_resolve_func_ptr(const struct btf *btf, + u32 id, u32 *res_id); + +#define for_each_member(i, struct_type, member) \ + for (i = 0, member = btf_type_member(struct_type); \ + i < btf_type_vlen(struct_type); \ + i++, member++) static inline bool btf_type_is_ptr(const struct btf_type *t) { @@ -84,6 +96,28 @@ static inline bool btf_type_is_func_proto(const struct btf_type *t) return BTF_INFO_KIND(t->info) == BTF_KIND_FUNC_PROTO; } +static inline u16 btf_type_vlen(const struct btf_type *t) +{ + return BTF_INFO_VLEN(t->info); +} + +static inline bool btf_type_kflag(const struct btf_type *t) +{ + return BTF_INFO_KFLAG(t->info); +} + +static inline u32 btf_member_bitfield_size(const struct btf_type *struct_type, + const struct btf_member *member) +{ + return btf_type_kflag(struct_type) ? BTF_MEMBER_BITFIELD_SIZE(member->offset) + : 0; +} + +static inline const struct btf_member *btf_type_member(const struct btf_type *t) +{ + return (const struct btf_member *)(t + 1); +} + #ifdef CONFIG_BPF_SYSCALL const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id); const char *btf_name_by_offset(const struct btf *btf, u32 offset); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7df436da542d..c1eeb3e0e116 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -174,6 +174,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, BPF_PROG_TYPE_CGROUP_SOCKOPT, BPF_PROG_TYPE_TRACING, + BPF_PROG_TYPE_STRUCT_OPS, }; enum bpf_attach_type { -- cgit v1.2.3 From 85d33df357b634649ddbe0a20fd2d0fc5732c3cb Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 8 Jan 2020 16:35:05 -0800 Subject: bpf: Introduce BPF_MAP_TYPE_STRUCT_OPS The patch introduces BPF_MAP_TYPE_STRUCT_OPS. The map value is a kernel struct with its func ptr implemented in bpf prog. This new map is the interface to register/unregister/introspect a bpf implemented kernel struct. The kernel struct is actually embedded inside another new struct (or called the "value" struct in the code). For example, "struct tcp_congestion_ops" is embbeded in: struct bpf_struct_ops_tcp_congestion_ops { refcount_t refcnt; enum bpf_struct_ops_state state; struct tcp_congestion_ops data; /* <-- kernel subsystem struct here */ } The map value is "struct bpf_struct_ops_tcp_congestion_ops". The "bpftool map dump" will then be able to show the state ("inuse"/"tobefree") and the number of subsystem's refcnt (e.g. number of tcp_sock in the tcp_congestion_ops case). This "value" struct is created automatically by a macro. Having a separate "value" struct will also make extending "struct bpf_struct_ops_XYZ" easier (e.g. adding "void (*init)(void)" to "struct bpf_struct_ops_XYZ" to do some initialization works before registering the struct_ops to the kernel subsystem). The libbpf will take care of finding and populating the "struct bpf_struct_ops_XYZ" from "struct XYZ". Register a struct_ops to a kernel subsystem: 1. Load all needed BPF_PROG_TYPE_STRUCT_OPS prog(s) 2. Create a BPF_MAP_TYPE_STRUCT_OPS with attr->btf_vmlinux_value_type_id set to the btf id "struct bpf_struct_ops_tcp_congestion_ops" of the running kernel. Instead of reusing the attr->btf_value_type_id, btf_vmlinux_value_type_id s added such that attr->btf_fd can still be used as the "user" btf which could store other useful sysadmin/debug info that may be introduced in the furture, e.g. creation-date/compiler-details/map-creator...etc. 3. Create a "struct bpf_struct_ops_tcp_congestion_ops" object as described in the running kernel btf. Populate the value of this object. The function ptr should be populated with the prog fds. 4. Call BPF_MAP_UPDATE with the object created in (3) as the map value. The key is always "0". During BPF_MAP_UPDATE, the code that saves the kernel-func-ptr's args as an array of u64 is generated. BPF_MAP_UPDATE also allows the specific struct_ops to do some final checks in "st_ops->init_member()" (e.g. ensure all mandatory func ptrs are implemented). If everything looks good, it will register this kernel struct to the kernel subsystem. The map will not allow further update from this point. Unregister a struct_ops from the kernel subsystem: BPF_MAP_DELETE with key "0". Introspect a struct_ops: BPF_MAP_LOOKUP_ELEM with key "0". The map value returned will have the prog _id_ populated as the func ptr. The map value state (enum bpf_struct_ops_state) will transit from: INIT (map created) => INUSE (map updated, i.e. reg) => TOBEFREE (map value deleted, i.e. unreg) The kernel subsystem needs to call bpf_struct_ops_get() and bpf_struct_ops_put() to manage the "refcnt" in the "struct bpf_struct_ops_XYZ". This patch uses a separate refcnt for the purose of tracking the subsystem usage. Another approach is to reuse the map->refcnt and then "show" (i.e. during map_lookup) the subsystem's usage by doing map->refcnt - map->usercnt to filter out the map-fd/pinned-map usage. However, that will also tie down the future semantics of map->refcnt and map->usercnt. The very first subsystem's refcnt (during reg()) holds one count to map->refcnt. When the very last subsystem's refcnt is gone, it will also release the map->refcnt. All bpf_prog will be freed when the map->refcnt reaches 0 (i.e. during map_free()). Here is how the bpftool map command will look like: [root@arch-fb-vm1 bpf]# bpftool map show 6: struct_ops name dctcp flags 0x0 key 4B value 256B max_entries 1 memlock 4096B btf_id 6 [root@arch-fb-vm1 bpf]# bpftool map dump id 6 [{ "value": { "refcnt": { "refs": { "counter": 1 } }, "state": 1, "data": { "list": { "next": 0, "prev": 0 }, "key": 0, "flags": 2, "init": 24, "release": 0, "ssthresh": 25, "cong_avoid": 30, "set_state": 27, "cwnd_event": 28, "in_ack_event": 26, "undo_cwnd": 29, "pkts_acked": 0, "min_tso_segs": 0, "sndbuf_expand": 0, "cong_control": 0, "get_info": 0, "name": [98,112,102,95,100,99,116,99,112,0,0,0,0,0,0,0 ], "owner": 0 } } } ] Misc Notes: * bpf_struct_ops_map_sys_lookup_elem() is added for syscall lookup. It does an inplace update on "*value" instead returning a pointer to syscall.c. Otherwise, it needs a separate copy of "zero" value for the BPF_STRUCT_OPS_STATE_INIT to avoid races. * The bpf_struct_ops_map_delete_elem() is also called without preempt_disable() from map_delete_elem(). It is because the "->unreg()" may requires sleepable context, e.g. the "tcp_unregister_congestion_control()". * "const" is added to some of the existing "struct btf_func_model *" function arg to avoid a compiler warning caused by this patch. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200109003505.3855919-1-kafai@fb.com --- arch/x86/net/bpf_jit_comp.c | 18 +- include/linux/bpf.h | 49 ++++- include/linux/bpf_types.h | 3 + include/linux/btf.h | 13 ++ include/uapi/linux/bpf.h | 7 +- kernel/bpf/bpf_struct_ops.c | 511 +++++++++++++++++++++++++++++++++++++++++++- kernel/bpf/btf.c | 20 +- kernel/bpf/map_in_map.c | 3 +- kernel/bpf/syscall.c | 52 +++-- kernel/bpf/trampoline.c | 8 +- kernel/bpf/verifier.c | 5 + 11 files changed, 642 insertions(+), 47 deletions(-) (limited to 'include') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 4c8a2d1f8470..9ba08e9abc09 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1328,7 +1328,7 @@ emit_jmp: return proglen; } -static void save_regs(struct btf_func_model *m, u8 **prog, int nr_args, +static void save_regs(const struct btf_func_model *m, u8 **prog, int nr_args, int stack_size) { int i; @@ -1344,7 +1344,7 @@ static void save_regs(struct btf_func_model *m, u8 **prog, int nr_args, -(stack_size - i * 8)); } -static void restore_regs(struct btf_func_model *m, u8 **prog, int nr_args, +static void restore_regs(const struct btf_func_model *m, u8 **prog, int nr_args, int stack_size) { int i; @@ -1361,7 +1361,7 @@ static void restore_regs(struct btf_func_model *m, u8 **prog, int nr_args, -(stack_size - i * 8)); } -static int invoke_bpf(struct btf_func_model *m, u8 **pprog, +static int invoke_bpf(const struct btf_func_model *m, u8 **pprog, struct bpf_prog **progs, int prog_cnt, int stack_size) { u8 *prog = *pprog; @@ -1456,7 +1456,8 @@ static int invoke_bpf(struct btf_func_model *m, u8 **pprog, * add rsp, 8 // skip eth_type_trans's frame * ret // return to its caller */ -int arch_prepare_bpf_trampoline(void *image, struct btf_func_model *m, u32 flags, +int arch_prepare_bpf_trampoline(void *image, void *image_end, + const struct btf_func_model *m, u32 flags, struct bpf_prog **fentry_progs, int fentry_cnt, struct bpf_prog **fexit_progs, int fexit_cnt, void *orig_call) @@ -1523,13 +1524,10 @@ int arch_prepare_bpf_trampoline(void *image, struct btf_func_model *m, u32 flags /* skip our return address and return to parent */ EMIT4(0x48, 0x83, 0xC4, 8); /* add rsp, 8 */ EMIT1(0xC3); /* ret */ - /* One half of the page has active running trampoline. - * Another half is an area for next trampoline. - * Make sure the trampoline generation logic doesn't overflow. - */ - if (WARN_ON_ONCE(prog - (u8 *)image > PAGE_SIZE / 2 - BPF_INSN_SAFETY)) + /* Make sure the trampoline generation logic doesn't overflow */ + if (WARN_ON_ONCE(prog > (u8 *)image_end - BPF_INSN_SAFETY)) return -EFAULT; - return 0; + return prog - (u8 *)image; } static int emit_cond_near_jump(u8 **pprog, void *func, void *ip, u8 jmp_cond) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 50f3b20ae284..a7bfe8a388c6 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -17,6 +17,7 @@ #include #include #include +#include struct bpf_verifier_env; struct bpf_verifier_log; @@ -106,6 +107,7 @@ struct bpf_map { struct btf *btf; struct bpf_map_memory memory; char name[BPF_OBJ_NAME_LEN]; + u32 btf_vmlinux_value_type_id; bool unpriv_array; bool frozen; /* write-once; write-protected by freeze_mutex */ /* 22 bytes hole */ @@ -183,7 +185,8 @@ static inline bool bpf_map_offload_neutral(const struct bpf_map *map) static inline bool bpf_map_support_seq_show(const struct bpf_map *map) { - return map->btf && map->ops->map_seq_show_elem; + return (map->btf_value_type_id || map->btf_vmlinux_value_type_id) && + map->ops->map_seq_show_elem; } int map_check_no_btf(const struct bpf_map *map, @@ -441,7 +444,8 @@ struct btf_func_model { * fentry = a set of program to run before calling original function * fexit = a set of program to run after original function */ -int arch_prepare_bpf_trampoline(void *image, struct btf_func_model *m, u32 flags, +int arch_prepare_bpf_trampoline(void *image, void *image_end, + const struct btf_func_model *m, u32 flags, struct bpf_prog **fentry_progs, int fentry_cnt, struct bpf_prog **fexit_progs, int fexit_cnt, void *orig_call); @@ -672,6 +676,7 @@ struct bpf_array_aux { struct work_struct work; }; +struct bpf_struct_ops_value; struct btf_type; struct btf_member; @@ -681,21 +686,61 @@ struct bpf_struct_ops { int (*init)(struct btf *btf); int (*check_member)(const struct btf_type *t, const struct btf_member *member); + int (*init_member)(const struct btf_type *t, + const struct btf_member *member, + void *kdata, const void *udata); + int (*reg)(void *kdata); + void (*unreg)(void *kdata); const struct btf_type *type; + const struct btf_type *value_type; const char *name; struct btf_func_model func_models[BPF_STRUCT_OPS_MAX_NR_MEMBERS]; u32 type_id; + u32 value_id; }; #if defined(CONFIG_BPF_JIT) && defined(CONFIG_BPF_SYSCALL) +#define BPF_MODULE_OWNER ((void *)((0xeB9FUL << 2) + POISON_POINTER_DELTA)) const struct bpf_struct_ops *bpf_struct_ops_find(u32 type_id); void bpf_struct_ops_init(struct btf *btf); +bool bpf_struct_ops_get(const void *kdata); +void bpf_struct_ops_put(const void *kdata); +int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key, + void *value); +static inline bool bpf_try_module_get(const void *data, struct module *owner) +{ + if (owner == BPF_MODULE_OWNER) + return bpf_struct_ops_get(data); + else + return try_module_get(owner); +} +static inline void bpf_module_put(const void *data, struct module *owner) +{ + if (owner == BPF_MODULE_OWNER) + bpf_struct_ops_put(data); + else + module_put(owner); +} #else static inline const struct bpf_struct_ops *bpf_struct_ops_find(u32 type_id) { return NULL; } static inline void bpf_struct_ops_init(struct btf *btf) { } +static inline bool bpf_try_module_get(const void *data, struct module *owner) +{ + return try_module_get(owner); +} +static inline void bpf_module_put(const void *data, struct module *owner) +{ + module_put(owner); +} +static inline int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, + void *key, + void *value) +{ + return -EINVAL; +} #endif struct bpf_array { diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index fadd243ffa2d..9f326e6ef885 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -109,3 +109,6 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, reuseport_array_ops) #endif BPF_MAP_TYPE(BPF_MAP_TYPE_QUEUE, queue_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_STACK, stack_map_ops) +#if defined(CONFIG_BPF_JIT) +BPF_MAP_TYPE(BPF_MAP_TYPE_STRUCT_OPS, bpf_struct_ops_map_ops) +#endif diff --git a/include/linux/btf.h b/include/linux/btf.h index f74a09a7120b..881e9b76ef49 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -7,6 +7,8 @@ #include #include +#define BTF_TYPE_EMIT(type) ((void)(type *)0) + struct btf; struct btf_member; struct btf_type; @@ -60,6 +62,10 @@ const struct btf_type *btf_type_resolve_ptr(const struct btf *btf, u32 id, u32 *res_id); const struct btf_type *btf_type_resolve_func_ptr(const struct btf *btf, u32 id, u32 *res_id); +const struct btf_type * +btf_resolve_size(const struct btf *btf, const struct btf_type *type, + u32 *type_size, const struct btf_type **elem_type, + u32 *total_nelems); #define for_each_member(i, struct_type, member) \ for (i = 0, member = btf_type_member(struct_type); \ @@ -106,6 +112,13 @@ static inline bool btf_type_kflag(const struct btf_type *t) return BTF_INFO_KFLAG(t->info); } +static inline u32 btf_member_bit_offset(const struct btf_type *struct_type, + const struct btf_member *member) +{ + return btf_type_kflag(struct_type) ? BTF_MEMBER_BIT_OFFSET(member->offset) + : member->offset; +} + static inline u32 btf_member_bitfield_size(const struct btf_type *struct_type, const struct btf_member *member) { diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c1eeb3e0e116..38059880963e 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -136,6 +136,7 @@ enum bpf_map_type { BPF_MAP_TYPE_STACK, BPF_MAP_TYPE_SK_STORAGE, BPF_MAP_TYPE_DEVMAP_HASH, + BPF_MAP_TYPE_STRUCT_OPS, }; /* Note that tracing related programs such as @@ -398,6 +399,10 @@ union bpf_attr { __u32 btf_fd; /* fd pointing to a BTF type data */ __u32 btf_key_type_id; /* BTF type_id of the key */ __u32 btf_value_type_id; /* BTF type_id of the value */ + __u32 btf_vmlinux_value_type_id;/* BTF type_id of a kernel- + * struct stored as the + * map value + */ }; struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ @@ -3350,7 +3355,7 @@ struct bpf_map_info { __u32 map_flags; char name[BPF_OBJ_NAME_LEN]; __u32 ifindex; - __u32 :32; + __u32 btf_vmlinux_value_type_id; __u64 netns_dev; __u64 netns_ino; __u32 btf_id; diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 2ea68fe34c33..ddf48f49914b 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -9,9 +9,68 @@ #include #include #include +#include +enum bpf_struct_ops_state { + BPF_STRUCT_OPS_STATE_INIT, + BPF_STRUCT_OPS_STATE_INUSE, + BPF_STRUCT_OPS_STATE_TOBEFREE, +}; + +#define BPF_STRUCT_OPS_COMMON_VALUE \ + refcount_t refcnt; \ + enum bpf_struct_ops_state state + +struct bpf_struct_ops_value { + BPF_STRUCT_OPS_COMMON_VALUE; + char data[0] ____cacheline_aligned_in_smp; +}; + +struct bpf_struct_ops_map { + struct bpf_map map; + const struct bpf_struct_ops *st_ops; + /* protect map_update */ + struct mutex lock; + /* progs has all the bpf_prog that is populated + * to the func ptr of the kernel's struct + * (in kvalue.data). + */ + struct bpf_prog **progs; + /* image is a page that has all the trampolines + * that stores the func args before calling the bpf_prog. + * A PAGE_SIZE "image" is enough to store all trampoline for + * "progs[]". + */ + void *image; + /* uvalue->data stores the kernel struct + * (e.g. tcp_congestion_ops) that is more useful + * to userspace than the kvalue. For example, + * the bpf_prog's id is stored instead of the kernel + * address of a func ptr. + */ + struct bpf_struct_ops_value *uvalue; + /* kvalue.data stores the actual kernel's struct + * (e.g. tcp_congestion_ops) that will be + * registered to the kernel subsystem. + */ + struct bpf_struct_ops_value kvalue; +}; + +#define VALUE_PREFIX "bpf_struct_ops_" +#define VALUE_PREFIX_LEN (sizeof(VALUE_PREFIX) - 1) + +/* bpf_struct_ops_##_name (e.g. bpf_struct_ops_tcp_congestion_ops) is + * the map's value exposed to the userspace and its btf-type-id is + * stored at the map->btf_vmlinux_value_type_id. + * + */ #define BPF_STRUCT_OPS_TYPE(_name) \ -extern struct bpf_struct_ops bpf_##_name; +extern struct bpf_struct_ops bpf_##_name; \ + \ +struct bpf_struct_ops_##_name { \ + BPF_STRUCT_OPS_COMMON_VALUE; \ + struct _name data ____cacheline_aligned_in_smp; \ +}; #include "bpf_struct_ops_types.h" #undef BPF_STRUCT_OPS_TYPE @@ -35,19 +94,50 @@ const struct bpf_verifier_ops bpf_struct_ops_verifier_ops = { const struct bpf_prog_ops bpf_struct_ops_prog_ops = { }; +static const struct btf_type *module_type; + void bpf_struct_ops_init(struct btf *btf) { + s32 type_id, value_id, module_id; const struct btf_member *member; struct bpf_struct_ops *st_ops; struct bpf_verifier_log log = {}; const struct btf_type *t; + char value_name[128]; const char *mname; - s32 type_id; u32 i, j; + /* Ensure BTF type is emitted for "struct bpf_struct_ops_##_name" */ +#define BPF_STRUCT_OPS_TYPE(_name) BTF_TYPE_EMIT(struct bpf_struct_ops_##_name); +#include "bpf_struct_ops_types.h" +#undef BPF_STRUCT_OPS_TYPE + + module_id = btf_find_by_name_kind(btf, "module", BTF_KIND_STRUCT); + if (module_id < 0) { + pr_warn("Cannot find struct module in btf_vmlinux\n"); + return; + } + module_type = btf_type_by_id(btf, module_id); + for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) { st_ops = bpf_struct_ops[i]; + if (strlen(st_ops->name) + VALUE_PREFIX_LEN >= + sizeof(value_name)) { + pr_warn("struct_ops name %s is too long\n", + st_ops->name); + continue; + } + sprintf(value_name, "%s%s", VALUE_PREFIX, st_ops->name); + + value_id = btf_find_by_name_kind(btf, value_name, + BTF_KIND_STRUCT); + if (value_id < 0) { + pr_warn("Cannot find struct %s in btf_vmlinux\n", + value_name); + continue; + } + type_id = btf_find_by_name_kind(btf, st_ops->name, BTF_KIND_STRUCT); if (type_id < 0) { @@ -98,6 +188,9 @@ void bpf_struct_ops_init(struct btf *btf) } else { st_ops->type_id = type_id; st_ops->type = t; + st_ops->value_id = value_id; + st_ops->value_type = btf_type_by_id(btf, + value_id); } } } @@ -105,6 +198,22 @@ void bpf_struct_ops_init(struct btf *btf) extern struct btf *btf_vmlinux; +static const struct bpf_struct_ops * +bpf_struct_ops_find_value(u32 value_id) +{ + unsigned int i; + + if (!value_id || !btf_vmlinux) + return NULL; + + for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) { + if (bpf_struct_ops[i]->value_id == value_id) + return bpf_struct_ops[i]; + } + + return NULL; +} + const struct bpf_struct_ops *bpf_struct_ops_find(u32 type_id) { unsigned int i; @@ -119,3 +228,401 @@ const struct bpf_struct_ops *bpf_struct_ops_find(u32 type_id) return NULL; } + +static int bpf_struct_ops_map_get_next_key(struct bpf_map *map, void *key, + void *next_key) +{ + if (key && *(u32 *)key == 0) + return -ENOENT; + + *(u32 *)next_key = 0; + return 0; +} + +int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key, + void *value) +{ + struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; + struct bpf_struct_ops_value *uvalue, *kvalue; + enum bpf_struct_ops_state state; + + if (unlikely(*(u32 *)key != 0)) + return -ENOENT; + + kvalue = &st_map->kvalue; + /* Pair with smp_store_release() during map_update */ + state = smp_load_acquire(&kvalue->state); + if (state == BPF_STRUCT_OPS_STATE_INIT) { + memset(value, 0, map->value_size); + return 0; + } + + /* No lock is needed. state and refcnt do not need + * to be updated together under atomic context. + */ + uvalue = (struct bpf_struct_ops_value *)value; + memcpy(uvalue, st_map->uvalue, map->value_size); + uvalue->state = state; + refcount_set(&uvalue->refcnt, refcount_read(&kvalue->refcnt)); + + return 0; +} + +static void *bpf_struct_ops_map_lookup_elem(struct bpf_map *map, void *key) +{ + return ERR_PTR(-EINVAL); +} + +static void bpf_struct_ops_map_put_progs(struct bpf_struct_ops_map *st_map) +{ + const struct btf_type *t = st_map->st_ops->type; + u32 i; + + for (i = 0; i < btf_type_vlen(t); i++) { + if (st_map->progs[i]) { + bpf_prog_put(st_map->progs[i]); + st_map->progs[i] = NULL; + } + } +} + +static int check_zero_holes(const struct btf_type *t, void *data) +{ + const struct btf_member *member; + u32 i, moff, msize, prev_mend = 0; + const struct btf_type *mtype; + + for_each_member(i, t, member) { + moff = btf_member_bit_offset(t, member) / 8; + if (moff > prev_mend && + memchr_inv(data + prev_mend, 0, moff - prev_mend)) + return -EINVAL; + + mtype = btf_type_by_id(btf_vmlinux, member->type); + mtype = btf_resolve_size(btf_vmlinux, mtype, &msize, + NULL, NULL); + if (IS_ERR(mtype)) + return PTR_ERR(mtype); + prev_mend = moff + msize; + } + + if (t->size > prev_mend && + memchr_inv(data + prev_mend, 0, t->size - prev_mend)) + return -EINVAL; + + return 0; +} + +static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 flags) +{ + struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; + const struct bpf_struct_ops *st_ops = st_map->st_ops; + struct bpf_struct_ops_value *uvalue, *kvalue; + const struct btf_member *member; + const struct btf_type *t = st_ops->type; + void *udata, *kdata; + int prog_fd, err = 0; + void *image; + u32 i; + + if (flags) + return -EINVAL; + + if (*(u32 *)key != 0) + return -E2BIG; + + err = check_zero_holes(st_ops->value_type, value); + if (err) + return err; + + uvalue = (struct bpf_struct_ops_value *)value; + err = check_zero_holes(t, uvalue->data); + if (err) + return err; + + if (uvalue->state || refcount_read(&uvalue->refcnt)) + return -EINVAL; + + uvalue = (struct bpf_struct_ops_value *)st_map->uvalue; + kvalue = (struct bpf_struct_ops_value *)&st_map->kvalue; + + mutex_lock(&st_map->lock); + + if (kvalue->state != BPF_STRUCT_OPS_STATE_INIT) { + err = -EBUSY; + goto unlock; + } + + memcpy(uvalue, value, map->value_size); + + udata = &uvalue->data; + kdata = &kvalue->data; + image = st_map->image; + + for_each_member(i, t, member) { + const struct btf_type *mtype, *ptype; + struct bpf_prog *prog; + u32 moff; + + moff = btf_member_bit_offset(t, member) / 8; + ptype = btf_type_resolve_ptr(btf_vmlinux, member->type, NULL); + if (ptype == module_type) { + if (*(void **)(udata + moff)) + goto reset_unlock; + *(void **)(kdata + moff) = BPF_MODULE_OWNER; + continue; + } + + err = st_ops->init_member(t, member, kdata, udata); + if (err < 0) + goto reset_unlock; + + /* The ->init_member() has handled this member */ + if (err > 0) + continue; + + /* If st_ops->init_member does not handle it, + * we will only handle func ptrs and zero-ed members + * here. Reject everything else. + */ + + /* All non func ptr member must be 0 */ + if (!ptype || !btf_type_is_func_proto(ptype)) { + u32 msize; + + mtype = btf_type_by_id(btf_vmlinux, member->type); + mtype = btf_resolve_size(btf_vmlinux, mtype, &msize, + NULL, NULL); + if (IS_ERR(mtype)) { + err = PTR_ERR(mtype); + goto reset_unlock; + } + + if (memchr_inv(udata + moff, 0, msize)) { + err = -EINVAL; + goto reset_unlock; + } + + continue; + } + + prog_fd = (int)(*(unsigned long *)(udata + moff)); + /* Similar check as the attr->attach_prog_fd */ + if (!prog_fd) + continue; + + prog = bpf_prog_get(prog_fd); + if (IS_ERR(prog)) { + err = PTR_ERR(prog); + goto reset_unlock; + } + st_map->progs[i] = prog; + + if (prog->type != BPF_PROG_TYPE_STRUCT_OPS || + prog->aux->attach_btf_id != st_ops->type_id || + prog->expected_attach_type != i) { + err = -EINVAL; + goto reset_unlock; + } + + err = arch_prepare_bpf_trampoline(image, + st_map->image + PAGE_SIZE, + &st_ops->func_models[i], 0, + &prog, 1, NULL, 0, NULL); + if (err < 0) + goto reset_unlock; + + *(void **)(kdata + moff) = image; + image += err; + + /* put prog_id to udata */ + *(unsigned long *)(udata + moff) = prog->aux->id; + } + + refcount_set(&kvalue->refcnt, 1); + bpf_map_inc(map); + + set_memory_ro((long)st_map->image, 1); + set_memory_x((long)st_map->image, 1); + err = st_ops->reg(kdata); + if (likely(!err)) { + /* Pair with smp_load_acquire() during lookup_elem(). + * It ensures the above udata updates (e.g. prog->aux->id) + * can be seen once BPF_STRUCT_OPS_STATE_INUSE is set. + */ + smp_store_release(&kvalue->state, BPF_STRUCT_OPS_STATE_INUSE); + goto unlock; + } + + /* Error during st_ops->reg(). It is very unlikely since + * the above init_member() should have caught it earlier + * before reg(). The only possibility is if there was a race + * in registering the struct_ops (under the same name) to + * a sub-system through different struct_ops's maps. + */ + set_memory_nx((long)st_map->image, 1); + set_memory_rw((long)st_map->image, 1); + bpf_map_put(map); + +reset_unlock: + bpf_struct_ops_map_put_progs(st_map); + memset(uvalue, 0, map->value_size); + memset(kvalue, 0, map->value_size); +unlock: + mutex_unlock(&st_map->lock); + return err; +} + +static int bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key) +{ + enum bpf_struct_ops_state prev_state; + struct bpf_struct_ops_map *st_map; + + st_map = (struct bpf_struct_ops_map *)map; + prev_state = cmpxchg(&st_map->kvalue.state, + BPF_STRUCT_OPS_STATE_INUSE, + BPF_STRUCT_OPS_STATE_TOBEFREE); + if (prev_state == BPF_STRUCT_OPS_STATE_INUSE) { + st_map->st_ops->unreg(&st_map->kvalue.data); + if (refcount_dec_and_test(&st_map->kvalue.refcnt)) + bpf_map_put(map); + } + + return 0; +} + +static void bpf_struct_ops_map_seq_show_elem(struct bpf_map *map, void *key, + struct seq_file *m) +{ + void *value; + + value = bpf_struct_ops_map_lookup_elem(map, key); + if (!value) + return; + + btf_type_seq_show(btf_vmlinux, map->btf_vmlinux_value_type_id, + value, m); + seq_puts(m, "\n"); +} + +static void bpf_struct_ops_map_free(struct bpf_map *map) +{ + struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; + + if (st_map->progs) + bpf_struct_ops_map_put_progs(st_map); + bpf_map_area_free(st_map->progs); + bpf_jit_free_exec(st_map->image); + bpf_map_area_free(st_map->uvalue); + bpf_map_area_free(st_map); +} + +static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr) +{ + if (attr->key_size != sizeof(unsigned int) || attr->max_entries != 1 || + attr->map_flags || !attr->btf_vmlinux_value_type_id) + return -EINVAL; + return 0; +} + +static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) +{ + const struct bpf_struct_ops *st_ops; + size_t map_total_size, st_map_size; + struct bpf_struct_ops_map *st_map; + const struct btf_type *t, *vt; + struct bpf_map_memory mem; + struct bpf_map *map; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + st_ops = bpf_struct_ops_find_value(attr->btf_vmlinux_value_type_id); + if (!st_ops) + return ERR_PTR(-ENOTSUPP); + + vt = st_ops->value_type; + if (attr->value_size != vt->size) + return ERR_PTR(-EINVAL); + + t = st_ops->type; + + st_map_size = sizeof(*st_map) + + /* kvalue stores the + * struct bpf_struct_ops_tcp_congestions_ops + */ + (vt->size - sizeof(struct bpf_struct_ops_value)); + map_total_size = st_map_size + + /* uvalue */ + sizeof(vt->size) + + /* struct bpf_progs **progs */ + btf_type_vlen(t) * sizeof(struct bpf_prog *); + err = bpf_map_charge_init(&mem, map_total_size); + if (err < 0) + return ERR_PTR(err); + + st_map = bpf_map_area_alloc(st_map_size, NUMA_NO_NODE); + if (!st_map) { + bpf_map_charge_finish(&mem); + return ERR_PTR(-ENOMEM); + } + st_map->st_ops = st_ops; + map = &st_map->map; + + st_map->uvalue = bpf_map_area_alloc(vt->size, NUMA_NO_NODE); + st_map->progs = + bpf_map_area_alloc(btf_type_vlen(t) * sizeof(struct bpf_prog *), + NUMA_NO_NODE); + st_map->image = bpf_jit_alloc_exec(PAGE_SIZE); + if (!st_map->uvalue || !st_map->progs || !st_map->image) { + bpf_struct_ops_map_free(map); + bpf_map_charge_finish(&mem); + return ERR_PTR(-ENOMEM); + } + + mutex_init(&st_map->lock); + set_vm_flush_reset_perms(st_map->image); + bpf_map_init_from_attr(map, attr); + bpf_map_charge_move(&map->memory, &mem); + + return map; +} + +const struct bpf_map_ops bpf_struct_ops_map_ops = { + .map_alloc_check = bpf_struct_ops_map_alloc_check, + .map_alloc = bpf_struct_ops_map_alloc, + .map_free = bpf_struct_ops_map_free, + .map_get_next_key = bpf_struct_ops_map_get_next_key, + .map_lookup_elem = bpf_struct_ops_map_lookup_elem, + .map_delete_elem = bpf_struct_ops_map_delete_elem, + .map_update_elem = bpf_struct_ops_map_update_elem, + .map_seq_show_elem = bpf_struct_ops_map_seq_show_elem, +}; + +/* "const void *" because some subsystem is + * passing a const (e.g. const struct tcp_congestion_ops *) + */ +bool bpf_struct_ops_get(const void *kdata) +{ + struct bpf_struct_ops_value *kvalue; + + kvalue = container_of(kdata, struct bpf_struct_ops_value, data); + + return refcount_inc_not_zero(&kvalue->refcnt); +} + +void bpf_struct_ops_put(const void *kdata) +{ + struct bpf_struct_ops_value *kvalue; + + kvalue = container_of(kdata, struct bpf_struct_ops_value, data); + if (refcount_dec_and_test(&kvalue->refcnt)) { + struct bpf_struct_ops_map *st_map; + + st_map = container_of(kvalue, struct bpf_struct_ops_map, + kvalue); + bpf_map_put(&st_map->map); + } +} diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 12af4a1bb1a4..81d9cf75cacd 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -500,13 +500,6 @@ static const char *btf_int_encoding_str(u8 encoding) return "UNKN"; } -static u32 btf_member_bit_offset(const struct btf_type *struct_type, - const struct btf_member *member) -{ - return btf_type_kflag(struct_type) ? BTF_MEMBER_BIT_OFFSET(member->offset) - : member->offset; -} - static u32 btf_type_int(const struct btf_type *t) { return *(u32 *)(t + 1); @@ -1089,7 +1082,7 @@ static const struct resolve_vertex *env_stack_peak(struct btf_verifier_env *env) * *elem_type: same as return type ("struct X") * *total_nelems: 1 */ -static const struct btf_type * +const struct btf_type * btf_resolve_size(const struct btf *btf, const struct btf_type *type, u32 *type_size, const struct btf_type **elem_type, u32 *total_nelems) @@ -1143,8 +1136,10 @@ resolved: return ERR_PTR(-EINVAL); *type_size = nelems * size; - *total_nelems = nelems; - *elem_type = type; + if (total_nelems) + *total_nelems = nelems; + if (elem_type) + *elem_type = type; return array_type ? : type; } @@ -1858,7 +1853,10 @@ static void btf_modifier_seq_show(const struct btf *btf, u32 type_id, void *data, u8 bits_offset, struct seq_file *m) { - t = btf_type_id_resolve(btf, &type_id); + if (btf->resolved_ids) + t = btf_type_id_resolve(btf, &type_id); + else + t = btf_type_skip_modifiers(btf, type_id, NULL); btf_type_ops(t)->seq_show(btf, t, type_id, data, bits_offset, m); } diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 5e9366b33f0f..b3c48d1533cb 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -22,7 +22,8 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) */ if (inner_map->map_type == BPF_MAP_TYPE_PROG_ARRAY || inner_map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE || - inner_map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) { + inner_map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE || + inner_map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { fdput(f); return ERR_PTR(-ENOTSUPP); } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 03a02ef4c496..f9db72a96ec0 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -628,7 +628,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, return ret; } -#define BPF_MAP_CREATE_LAST_FIELD btf_value_type_id +#define BPF_MAP_CREATE_LAST_FIELD btf_vmlinux_value_type_id /* called via syscall */ static int map_create(union bpf_attr *attr) { @@ -642,6 +642,14 @@ static int map_create(union bpf_attr *attr) if (err) return -EINVAL; + if (attr->btf_vmlinux_value_type_id) { + if (attr->map_type != BPF_MAP_TYPE_STRUCT_OPS || + attr->btf_key_type_id || attr->btf_value_type_id) + return -EINVAL; + } else if (attr->btf_key_type_id && !attr->btf_value_type_id) { + return -EINVAL; + } + f_flags = bpf_get_file_flag(attr->map_flags); if (f_flags < 0) return f_flags; @@ -664,32 +672,35 @@ static int map_create(union bpf_attr *attr) atomic64_set(&map->usercnt, 1); mutex_init(&map->freeze_mutex); - if (attr->btf_key_type_id || attr->btf_value_type_id) { + map->spin_lock_off = -EINVAL; + if (attr->btf_key_type_id || attr->btf_value_type_id || + /* Even the map's value is a kernel's struct, + * the bpf_prog.o must have BTF to begin with + * to figure out the corresponding kernel's + * counter part. Thus, attr->btf_fd has + * to be valid also. + */ + attr->btf_vmlinux_value_type_id) { struct btf *btf; - if (!attr->btf_value_type_id) { - err = -EINVAL; - goto free_map; - } - btf = btf_get_by_fd(attr->btf_fd); if (IS_ERR(btf)) { err = PTR_ERR(btf); goto free_map; } + map->btf = btf; - err = map_check_btf(map, btf, attr->btf_key_type_id, - attr->btf_value_type_id); - if (err) { - btf_put(btf); - goto free_map; + if (attr->btf_value_type_id) { + err = map_check_btf(map, btf, attr->btf_key_type_id, + attr->btf_value_type_id); + if (err) + goto free_map; } - map->btf = btf; map->btf_key_type_id = attr->btf_key_type_id; map->btf_value_type_id = attr->btf_value_type_id; - } else { - map->spin_lock_off = -EINVAL; + map->btf_vmlinux_value_type_id = + attr->btf_vmlinux_value_type_id; } err = security_bpf_map_alloc(map); @@ -888,6 +899,9 @@ static int map_lookup_elem(union bpf_attr *attr) } else if (map->map_type == BPF_MAP_TYPE_QUEUE || map->map_type == BPF_MAP_TYPE_STACK) { err = map->ops->map_peek_elem(map, value); + } else if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { + /* struct_ops map requires directly updating "value" */ + err = bpf_struct_ops_map_sys_lookup_elem(map, key, value); } else { rcu_read_lock(); if (map->ops->map_lookup_elem_sys_only) @@ -1003,7 +1017,8 @@ static int map_update_elem(union bpf_attr *attr) goto out; } else if (map->map_type == BPF_MAP_TYPE_CPUMAP || map->map_type == BPF_MAP_TYPE_SOCKHASH || - map->map_type == BPF_MAP_TYPE_SOCKMAP) { + map->map_type == BPF_MAP_TYPE_SOCKMAP || + map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { err = map->ops->map_update_elem(map, key, value, attr->flags); goto out; } else if (IS_FD_PROG_ARRAY(map)) { @@ -1092,7 +1107,9 @@ static int map_delete_elem(union bpf_attr *attr) if (bpf_map_is_dev_bound(map)) { err = bpf_map_offload_delete_elem(map, key); goto out; - } else if (IS_FD_PROG_ARRAY(map)) { + } else if (IS_FD_PROG_ARRAY(map) || + map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { + /* These maps require sleepable context */ err = map->ops->map_delete_elem(map, key); goto out; } @@ -2822,6 +2839,7 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map, info.btf_key_type_id = map->btf_key_type_id; info.btf_value_type_id = map->btf_value_type_id; } + info.btf_vmlinux_value_type_id = map->btf_vmlinux_value_type_id; if (bpf_map_is_dev_bound(map)) { err = bpf_map_offload_info_fill(&info, map); diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 505f4e4b31d2..79a04417050d 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -160,11 +160,12 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr) if (fexit_cnt) flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME; - err = arch_prepare_bpf_trampoline(new_image, &tr->func.model, flags, + err = arch_prepare_bpf_trampoline(new_image, new_image + PAGE_SIZE / 2, + &tr->func.model, flags, fentry, fentry_cnt, fexit, fexit_cnt, tr->func.addr); - if (err) + if (err < 0) goto out; if (tr->selector) @@ -296,7 +297,8 @@ void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start) } int __weak -arch_prepare_bpf_trampoline(void *image, struct btf_func_model *m, u32 flags, +arch_prepare_bpf_trampoline(void *image, void *image_end, + const struct btf_func_model *m, u32 flags, struct bpf_prog **fentry_progs, int fentry_cnt, struct bpf_prog **fexit_progs, int fexit_cnt, void *orig_call) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 586ed3c94b80..f5af759a8a5f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8155,6 +8155,11 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, return -EINVAL; } + if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { + verbose(env, "bpf_struct_ops map cannot be used in prog\n"); + return -EINVAL; + } + return 0; } -- cgit v1.2.3 From 0baf26b0fcd74bbfcef53c5d5e8bad2b99c8d0d2 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 8 Jan 2020 16:35:08 -0800 Subject: bpf: tcp: Support tcp_congestion_ops in bpf This patch makes "struct tcp_congestion_ops" to be the first user of BPF STRUCT_OPS. It allows implementing a tcp_congestion_ops in bpf. The BPF implemented tcp_congestion_ops can be used like regular kernel tcp-cc through sysctl and setsockopt. e.g. [root@arch-fb-vm1 bpf]# sysctl -a | egrep congestion net.ipv4.tcp_allowed_congestion_control = reno cubic bpf_cubic net.ipv4.tcp_available_congestion_control = reno bic cubic bpf_cubic net.ipv4.tcp_congestion_control = bpf_cubic There has been attempt to move the TCP CC to the user space (e.g. CCP in TCP). The common arguments are faster turn around, get away from long-tail kernel versions in production...etc, which are legit points. BPF has been the continuous effort to join both kernel and userspace upsides together (e.g. XDP to gain the performance advantage without bypassing the kernel). The recent BPF advancements (in particular BTF-aware verifier, BPF trampoline, BPF CO-RE...) made implementing kernel struct ops (e.g. tcp cc) possible in BPF. It allows a faster turnaround for testing algorithm in the production while leveraging the existing (and continue growing) BPF feature/framework instead of building one specifically for userspace TCP CC. This patch allows write access to a few fields in tcp-sock (in bpf_tcp_ca_btf_struct_access()). The optional "get_info" is unsupported now. It can be added later. One possible way is to output the info with a btf-id to describe the content. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200109003508.3856115-1-kafai@fb.com --- include/linux/filter.h | 2 + include/net/tcp.h | 2 + kernel/bpf/bpf_struct_ops_types.h | 7 +- net/core/filter.c | 2 +- net/ipv4/Makefile | 4 + net/ipv4/bpf_tcp_ca.c | 230 ++++++++++++++++++++++++++++++++++++++ net/ipv4/tcp_cong.c | 16 +-- net/ipv4/tcp_ipv4.c | 6 +- net/ipv4/tcp_minisocks.c | 4 +- net/ipv4/tcp_output.c | 4 +- 10 files changed, 261 insertions(+), 16 deletions(-) create mode 100644 net/ipv4/bpf_tcp_ca.c (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index 70e6dd960bca..a366a0b64a57 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -843,6 +843,8 @@ int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog); int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog, bpf_aux_classic_check_t trans, bool save_orig); void bpf_prog_destroy(struct bpf_prog *fp); +const struct bpf_func_proto * +bpf_base_func_proto(enum bpf_func_id func_id); int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk); int sk_attach_bpf(u32 ufd, struct sock *sk); diff --git a/include/net/tcp.h b/include/net/tcp.h index 7df37e2fddca..9dd975be7fdf 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1007,6 +1007,7 @@ enum tcp_ca_ack_event_flags { #define TCP_CONG_NON_RESTRICTED 0x1 /* Requires ECN/ECT set on all packets */ #define TCP_CONG_NEEDS_ECN 0x2 +#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN) union tcp_cc_info; @@ -1101,6 +1102,7 @@ u32 tcp_reno_undo_cwnd(struct sock *sk); void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked); extern struct tcp_congestion_ops tcp_reno; +struct tcp_congestion_ops *tcp_ca_find(const char *name); struct tcp_congestion_ops *tcp_ca_find_key(u32 key); u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca); #ifdef CONFIG_INET diff --git a/kernel/bpf/bpf_struct_ops_types.h b/kernel/bpf/bpf_struct_ops_types.h index 7bb13ff49ec2..066d83ea1c99 100644 --- a/kernel/bpf/bpf_struct_ops_types.h +++ b/kernel/bpf/bpf_struct_ops_types.h @@ -1,4 +1,9 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* internal file - do not include directly */ -/* To be filled in a later patch */ +#ifdef CONFIG_BPF_JIT +#ifdef CONFIG_INET +#include +BPF_STRUCT_OPS_TYPE(tcp_congestion_ops) +#endif +#endif diff --git a/net/core/filter.c b/net/core/filter.c index 42fd17c48c5f..a702761ef369 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5935,7 +5935,7 @@ bool bpf_helper_changes_pkt_data(void *func) return false; } -static const struct bpf_func_proto * +const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id) { switch (func_id) { diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index d57ecfaf89d4..9d97bace13c8 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -65,3 +65,7 @@ obj-$(CONFIG_NETLABEL) += cipso_ipv4.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ xfrm4_output.o xfrm4_protocol.o + +ifeq ($(CONFIG_BPF_JIT),y) +obj-$(CONFIG_BPF_SYSCALL) += bpf_tcp_ca.o +endif diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c new file mode 100644 index 000000000000..9c7745b958d7 --- /dev/null +++ b/net/ipv4/bpf_tcp_ca.c @@ -0,0 +1,230 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2019 Facebook */ + +#include +#include +#include +#include +#include +#include + +static u32 optional_ops[] = { + offsetof(struct tcp_congestion_ops, init), + offsetof(struct tcp_congestion_ops, release), + offsetof(struct tcp_congestion_ops, set_state), + offsetof(struct tcp_congestion_ops, cwnd_event), + offsetof(struct tcp_congestion_ops, in_ack_event), + offsetof(struct tcp_congestion_ops, pkts_acked), + offsetof(struct tcp_congestion_ops, min_tso_segs), + offsetof(struct tcp_congestion_ops, sndbuf_expand), + offsetof(struct tcp_congestion_ops, cong_control), +}; + +static u32 unsupported_ops[] = { + offsetof(struct tcp_congestion_ops, get_info), +}; + +static const struct btf_type *tcp_sock_type; +static u32 tcp_sock_id, sock_id; + +static int bpf_tcp_ca_init(struct btf *btf) +{ + s32 type_id; + + type_id = btf_find_by_name_kind(btf, "sock", BTF_KIND_STRUCT); + if (type_id < 0) + return -EINVAL; + sock_id = type_id; + + type_id = btf_find_by_name_kind(btf, "tcp_sock", BTF_KIND_STRUCT); + if (type_id < 0) + return -EINVAL; + tcp_sock_id = type_id; + tcp_sock_type = btf_type_by_id(btf, tcp_sock_id); + + return 0; +} + +static bool is_optional(u32 member_offset) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(optional_ops); i++) { + if (member_offset == optional_ops[i]) + return true; + } + + return false; +} + +static bool is_unsupported(u32 member_offset) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(unsupported_ops); i++) { + if (member_offset == unsupported_ops[i]) + return true; + } + + return false; +} + +extern struct btf *btf_vmlinux; + +static bool bpf_tcp_ca_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS) + return false; + if (type != BPF_READ) + return false; + if (off % size != 0) + return false; + + if (!btf_ctx_access(off, size, type, prog, info)) + return false; + + if (info->reg_type == PTR_TO_BTF_ID && info->btf_id == sock_id) + /* promote it to tcp_sock */ + info->btf_id = tcp_sock_id; + + return true; +} + +static int bpf_tcp_ca_btf_struct_access(struct bpf_verifier_log *log, + const struct btf_type *t, int off, + int size, enum bpf_access_type atype, + u32 *next_btf_id) +{ + size_t end; + + if (atype == BPF_READ) + return btf_struct_access(log, t, off, size, atype, next_btf_id); + + if (t != tcp_sock_type) { + bpf_log(log, "only read is supported\n"); + return -EACCES; + } + + switch (off) { + case bpf_ctx_range(struct inet_connection_sock, icsk_ca_priv): + end = offsetofend(struct inet_connection_sock, icsk_ca_priv); + break; + case offsetof(struct inet_connection_sock, icsk_ack.pending): + end = offsetofend(struct inet_connection_sock, + icsk_ack.pending); + break; + case offsetof(struct tcp_sock, snd_cwnd): + end = offsetofend(struct tcp_sock, snd_cwnd); + break; + case offsetof(struct tcp_sock, snd_cwnd_cnt): + end = offsetofend(struct tcp_sock, snd_cwnd_cnt); + break; + case offsetof(struct tcp_sock, snd_ssthresh): + end = offsetofend(struct tcp_sock, snd_ssthresh); + break; + case offsetof(struct tcp_sock, ecn_flags): + end = offsetofend(struct tcp_sock, ecn_flags); + break; + default: + bpf_log(log, "no write support to tcp_sock at off %d\n", off); + return -EACCES; + } + + if (off + size > end) { + bpf_log(log, + "write access at off %d with size %d beyond the member of tcp_sock ended at %zu\n", + off, size, end); + return -EACCES; + } + + return NOT_INIT; +} + +static const struct bpf_func_proto * +bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id, + const struct bpf_prog *prog) +{ + return bpf_base_func_proto(func_id); +} + +static const struct bpf_verifier_ops bpf_tcp_ca_verifier_ops = { + .get_func_proto = bpf_tcp_ca_get_func_proto, + .is_valid_access = bpf_tcp_ca_is_valid_access, + .btf_struct_access = bpf_tcp_ca_btf_struct_access, +}; + +static int bpf_tcp_ca_init_member(const struct btf_type *t, + const struct btf_member *member, + void *kdata, const void *udata) +{ + const struct tcp_congestion_ops *utcp_ca; + struct tcp_congestion_ops *tcp_ca; + size_t tcp_ca_name_len; + int prog_fd; + u32 moff; + + utcp_ca = (const struct tcp_congestion_ops *)udata; + tcp_ca = (struct tcp_congestion_ops *)kdata; + + moff = btf_member_bit_offset(t, member) / 8; + switch (moff) { + case offsetof(struct tcp_congestion_ops, flags): + if (utcp_ca->flags & ~TCP_CONG_MASK) + return -EINVAL; + tcp_ca->flags = utcp_ca->flags; + return 1; + case offsetof(struct tcp_congestion_ops, name): + tcp_ca_name_len = strnlen(utcp_ca->name, sizeof(utcp_ca->name)); + if (!tcp_ca_name_len || + tcp_ca_name_len == sizeof(utcp_ca->name)) + return -EINVAL; + if (tcp_ca_find(utcp_ca->name)) + return -EEXIST; + memcpy(tcp_ca->name, utcp_ca->name, sizeof(tcp_ca->name)); + return 1; + } + + if (!btf_type_resolve_func_ptr(btf_vmlinux, member->type, NULL)) + return 0; + + /* Ensure bpf_prog is provided for compulsory func ptr */ + prog_fd = (int)(*(unsigned long *)(udata + moff)); + if (!prog_fd && !is_optional(moff) && !is_unsupported(moff)) + return -EINVAL; + + return 0; +} + +static int bpf_tcp_ca_check_member(const struct btf_type *t, + const struct btf_member *member) +{ + if (is_unsupported(btf_member_bit_offset(t, member) / 8)) + return -ENOTSUPP; + return 0; +} + +static int bpf_tcp_ca_reg(void *kdata) +{ + return tcp_register_congestion_control(kdata); +} + +static void bpf_tcp_ca_unreg(void *kdata) +{ + tcp_unregister_congestion_control(kdata); +} + +/* Avoid sparse warning. It is only used in bpf_struct_ops.c. */ +extern struct bpf_struct_ops bpf_tcp_congestion_ops; + +struct bpf_struct_ops bpf_tcp_congestion_ops = { + .verifier_ops = &bpf_tcp_ca_verifier_ops, + .reg = bpf_tcp_ca_reg, + .unreg = bpf_tcp_ca_unreg, + .check_member = bpf_tcp_ca_check_member, + .init_member = bpf_tcp_ca_init_member, + .init = bpf_tcp_ca_init, + .name = "tcp_congestion_ops", +}; diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 3737ec096650..3172e31987be 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -21,7 +21,7 @@ static DEFINE_SPINLOCK(tcp_cong_list_lock); static LIST_HEAD(tcp_cong_list); /* Simple linear search, don't expect many entries! */ -static struct tcp_congestion_ops *tcp_ca_find(const char *name) +struct tcp_congestion_ops *tcp_ca_find(const char *name) { struct tcp_congestion_ops *e; @@ -162,7 +162,7 @@ void tcp_assign_congestion_control(struct sock *sk) rcu_read_lock(); ca = rcu_dereference(net->ipv4.tcp_congestion_control); - if (unlikely(!try_module_get(ca->owner))) + if (unlikely(!bpf_try_module_get(ca, ca->owner))) ca = &tcp_reno; icsk->icsk_ca_ops = ca; rcu_read_unlock(); @@ -208,7 +208,7 @@ void tcp_cleanup_congestion_control(struct sock *sk) if (icsk->icsk_ca_ops->release) icsk->icsk_ca_ops->release(sk); - module_put(icsk->icsk_ca_ops->owner); + bpf_module_put(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner); } /* Used by sysctl to change default congestion control */ @@ -222,12 +222,12 @@ int tcp_set_default_congestion_control(struct net *net, const char *name) ca = tcp_ca_find_autoload(net, name); if (!ca) { ret = -ENOENT; - } else if (!try_module_get(ca->owner)) { + } else if (!bpf_try_module_get(ca, ca->owner)) { ret = -EBUSY; } else { prev = xchg(&net->ipv4.tcp_congestion_control, ca); if (prev) - module_put(prev->owner); + bpf_module_put(prev, prev->owner); ca->flags |= TCP_CONG_NON_RESTRICTED; ret = 0; @@ -366,19 +366,19 @@ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, } else if (!load) { const struct tcp_congestion_ops *old_ca = icsk->icsk_ca_ops; - if (try_module_get(ca->owner)) { + if (bpf_try_module_get(ca, ca->owner)) { if (reinit) { tcp_reinit_congestion_control(sk, ca); } else { icsk->icsk_ca_ops = ca; - module_put(old_ca->owner); + bpf_module_put(old_ca, old_ca->owner); } } else { err = -EBUSY; } } else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || cap_net_admin)) { err = -EPERM; - } else if (!try_module_get(ca->owner)) { + } else if (!bpf_try_module_get(ca, ca->owner)) { err = -EBUSY; } else { tcp_reinit_congestion_control(sk, ca); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 4adac9c75343..317ccca548a2 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2678,7 +2678,8 @@ static void __net_exit tcp_sk_exit(struct net *net) int cpu; if (net->ipv4.tcp_congestion_control) - module_put(net->ipv4.tcp_congestion_control->owner); + bpf_module_put(net->ipv4.tcp_congestion_control, + net->ipv4.tcp_congestion_control->owner); for_each_possible_cpu(cpu) inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu)); @@ -2785,7 +2786,8 @@ static int __net_init tcp_sk_init(struct net *net) /* Reno is always built in */ if (!net_eq(net, &init_net) && - try_module_get(init_net.ipv4.tcp_congestion_control->owner)) + bpf_try_module_get(init_net.ipv4.tcp_congestion_control, + init_net.ipv4.tcp_congestion_control->owner)) net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control; else net->ipv4.tcp_congestion_control = &tcp_reno; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index c802bc80c400..ad3b56d9fa71 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -414,7 +414,7 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) rcu_read_lock(); ca = tcp_ca_find_key(ca_key); - if (likely(ca && try_module_get(ca->owner))) { + if (likely(ca && bpf_try_module_get(ca, ca->owner))) { icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst); icsk->icsk_ca_ops = ca; ca_got_dst = true; @@ -425,7 +425,7 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) /* If no valid choice made yet, assign current system default ca. */ if (!ca_got_dst && (!icsk->icsk_ca_setsockopt || - !try_module_get(icsk->icsk_ca_ops->owner))) + !bpf_try_module_get(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner))) tcp_assign_congestion_control(sk); tcp_set_ca_state(sk, TCP_CA_Open); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 58c92a7d671c..377cfab422df 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3368,8 +3368,8 @@ static void tcp_ca_dst_init(struct sock *sk, const struct dst_entry *dst) rcu_read_lock(); ca = tcp_ca_find_key(ca_key); - if (likely(ca && try_module_get(ca->owner))) { - module_put(icsk->icsk_ca_ops->owner); + if (likely(ca && bpf_try_module_get(ca, ca->owner))) { + bpf_module_put(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner); icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst); icsk->icsk_ca_ops = ca; } -- cgit v1.2.3 From 206057fe020ac5c037d5e2dd6562a9bd216ec765 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 8 Jan 2020 16:45:51 -0800 Subject: bpf: Add BPF_FUNC_tcp_send_ack helper Add a helper to send out a tcp-ack. It will be used in the later bpf_dctcp implementation that requires to send out an ack when the CE state changed. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200109004551.3900448-1-kafai@fb.com --- include/uapi/linux/bpf.h | 11 ++++++++++- net/ipv4/bpf_tcp_ca.c | 24 +++++++++++++++++++++++- 2 files changed, 33 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 38059880963e..2d6a2e572f56 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2837,6 +2837,14 @@ union bpf_attr { * Return * On success, the strictly positive length of the string, including * the trailing NUL character. On error, a negative value. + * + * int bpf_tcp_send_ack(void *tp, u32 rcv_nxt) + * Description + * Send out a tcp-ack. *tp* is the in-kernel struct tcp_sock. + * *rcv_nxt* is the ack_seq to be sent out. + * Return + * 0 on success, or a negative error in case of failure. + * */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2954,7 +2962,8 @@ union bpf_attr { FN(probe_read_user), \ FN(probe_read_kernel), \ FN(probe_read_user_str), \ - FN(probe_read_kernel_str), + FN(probe_read_kernel_str), \ + FN(tcp_send_ack), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index 9c7745b958d7..574972bc7299 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -143,11 +143,33 @@ static int bpf_tcp_ca_btf_struct_access(struct bpf_verifier_log *log, return NOT_INIT; } +BPF_CALL_2(bpf_tcp_send_ack, struct tcp_sock *, tp, u32, rcv_nxt) +{ + /* bpf_tcp_ca prog cannot have NULL tp */ + __tcp_send_ack((struct sock *)tp, rcv_nxt); + return 0; +} + +static const struct bpf_func_proto bpf_tcp_send_ack_proto = { + .func = bpf_tcp_send_ack, + .gpl_only = false, + /* In case we want to report error later */ + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg2_type = ARG_ANYTHING, + .btf_id = &tcp_sock_id, +}; + static const struct bpf_func_proto * bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { - return bpf_base_func_proto(func_id); + switch (func_id) { + case BPF_FUNC_tcp_send_ack: + return &bpf_tcp_send_ack_proto; + default: + return bpf_base_func_proto(func_id); + } } static const struct bpf_verifier_ops bpf_tcp_ca_verifier_ops = { -- cgit v1.2.3 From f5bfcd953d811dbb8913de36b96b38da6bb62135 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Tue, 7 Jan 2020 17:40:06 -0800 Subject: bpf: Document BPF_F_QUERY_EFFECTIVE flag Document BPF_F_QUERY_EFFECTIVE flag, mostly to clarify how it affects attach_flags what may not be obvious and what may lead to confision. Specifically attach_flags is returned only for target_fd but if programs are inherited from an ancestor cgroup then returned attach_flags for current cgroup may be confusing. For example, two effective programs of same attach_type can be returned but w/o BPF_F_ALLOW_MULTI in attach_flags. Simple repro: # bpftool c s /sys/fs/cgroup/path/to/task ID AttachType AttachFlags Name # bpftool c s /sys/fs/cgroup/path/to/task effective ID AttachType AttachFlags Name 95043 ingress tw_ipt_ingress 95048 ingress tw_ingress Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200108014006.938363-1-rdna@fb.com --- include/uapi/linux/bpf.h | 7 ++++++- tools/include/uapi/linux/bpf.h | 7 ++++++- 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2d6a2e572f56..52966e758fe5 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -359,7 +359,12 @@ enum bpf_attach_type { /* Enable memory-mapping BPF map */ #define BPF_F_MMAPABLE (1U << 10) -/* flags for BPF_PROG_QUERY */ +/* Flags for BPF_PROG_QUERY. */ + +/* Query effective (directly attached + inherited from ancestor cgroups) + * programs that will be executed for events within a cgroup. + * attach_flags with this flag are returned only for directly attached programs. + */ #define BPF_F_QUERY_EFFECTIVE (1U << 0) enum bpf_stack_build_id_status { diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 2d6a2e572f56..52966e758fe5 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -359,7 +359,12 @@ enum bpf_attach_type { /* Enable memory-mapping BPF map */ #define BPF_F_MMAPABLE (1U << 10) -/* flags for BPF_PROG_QUERY */ +/* Flags for BPF_PROG_QUERY. */ + +/* Query effective (directly attached + inherited from ancestor cgroups) + * programs that will be executed for events within a cgroup. + * attach_flags with this flag are returned only for directly attached programs. + */ #define BPF_F_QUERY_EFFECTIVE (1U << 0) enum bpf_stack_build_id_status { -- cgit v1.2.3 From 51c39bb1d5d105a02e29aa7960f0a395086e6342 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 9 Jan 2020 22:41:20 -0800 Subject: bpf: Introduce function-by-function verification New llvm and old llvm with libbpf help produce BTF that distinguish global and static functions. Unlike arguments of static function the arguments of global functions cannot be removed or optimized away by llvm. The compiler has to use exactly the arguments specified in a function prototype. The argument type information allows the verifier validate each global function independently. For now only supported argument types are pointer to context and scalars. In the future pointers to structures, sizes, pointer to packet data can be supported as well. Consider the following example: static int f1(int ...) { ... } int f3(int b); int f2(int a) { f1(a) + f3(a); } int f3(int b) { ... } int main(...) { f1(...) + f2(...) + f3(...); } The verifier will start its safety checks from the first global function f2(). It will recursively descend into f1() because it's static. Then it will check that arguments match for the f3() invocation inside f2(). It will not descend into f3(). It will finish f2() that has to be successfully verified for all possible values of 'a'. Then it will proceed with f3(). That function also has to be safe for all possible values of 'b'. Then it will start subprog 0 (which is main() function). It will recursively descend into f1() and will skip full check of f2() and f3(), since they are global. The order of processing global functions doesn't affect safety, since all global functions must be proven safe based on their arguments only. Such function by function verification can drastically improve speed of the verification and reduce complexity. Note that the stack limit of 512 still applies to the call chain regardless whether functions were static or global. The nested level of 8 also still applies. The same recursion prevention checks are in place as well. The type information and static/global kind is preserved after the verification hence in the above example global function f2() and f3() can be replaced later by equivalent functions with the same types that are loaded and verified later without affecting safety of this main() program. Such replacement (re-linking) of global functions is a subject of future patches. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200110064124.1760511-3-ast@kernel.org --- include/linux/bpf.h | 7 +- include/linux/bpf_verifier.h | 10 +- include/uapi/linux/btf.h | 6 ++ kernel/bpf/btf.c | 175 ++++++++++++++++++++++++------ kernel/bpf/verifier.c | 252 ++++++++++++++++++++++++++++++++++--------- 5 files changed, 366 insertions(+), 84 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a7bfe8a388c6..aed2bc39d72b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -566,6 +566,7 @@ static inline void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, #endif struct bpf_func_info_aux { + u16 linkage; bool unreliable; }; @@ -1081,7 +1082,11 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, const char *func_name, struct btf_func_model *m); -int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog); +struct bpf_reg_state; +int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, + struct bpf_reg_state *regs); +int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, + struct bpf_reg_state *reg); struct bpf_prog *bpf_prog_by_id(u32 id); diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 26e40de9ef55..5406e6e96585 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -304,11 +304,13 @@ struct bpf_insn_aux_data { u64 map_key_state; /* constant (32 bit) key tracking for maps */ int ctx_field_size; /* the ctx field size for load insn, maybe 0 */ int sanitize_stack_off; /* stack slot to be cleared */ - bool seen; /* this insn was processed by the verifier */ + u32 seen; /* this insn was processed by the verifier at env->pass_cnt */ bool zext_dst; /* this insn zero extends dst reg */ u8 alu_state; /* used in combination with alu_limit */ - bool prune_point; + + /* below fields are initialized once */ unsigned int orig_idx; /* original instruction index */ + bool prune_point; }; #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ @@ -379,6 +381,7 @@ struct bpf_verifier_env { int *insn_stack; int cur_stack; } cfg; + u32 pass_cnt; /* number of times do_check() was called */ u32 subprog_cnt; /* number of instructions analyzed by the verifier */ u32 prev_insn_processed, insn_processed; @@ -428,4 +431,7 @@ bpf_prog_offload_replace_insn(struct bpf_verifier_env *env, u32 off, void bpf_prog_offload_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt); +int check_ctx_reg(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, int regno); + #endif /* _LINUX_BPF_VERIFIER_H */ diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h index 1a2898c482ee..5a667107ad2c 100644 --- a/include/uapi/linux/btf.h +++ b/include/uapi/linux/btf.h @@ -146,6 +146,12 @@ enum { BTF_VAR_GLOBAL_EXTERN = 2, }; +enum btf_func_linkage { + BTF_FUNC_STATIC = 0, + BTF_FUNC_GLOBAL = 1, + BTF_FUNC_EXTERN = 2, +}; + /* BTF_KIND_VAR is followed by a single "struct btf_var" to describe * additional information related to the variable such as its linkage. */ diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 81d9cf75cacd..832b5d7fd892 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -2651,8 +2651,8 @@ static s32 btf_func_check_meta(struct btf_verifier_env *env, return -EINVAL; } - if (btf_type_vlen(t)) { - btf_verifier_log_type(env, t, "vlen != 0"); + if (btf_type_vlen(t) > BTF_FUNC_GLOBAL) { + btf_verifier_log_type(env, t, "Invalid func linkage"); return -EINVAL; } @@ -3506,7 +3506,8 @@ static u8 bpf_ctx_convert_map[] = { static const struct btf_member * btf_get_prog_ctx_type(struct bpf_verifier_log *log, struct btf *btf, - const struct btf_type *t, enum bpf_prog_type prog_type) + const struct btf_type *t, enum bpf_prog_type prog_type, + int arg) { const struct btf_type *conv_struct; const struct btf_type *ctx_struct; @@ -3527,12 +3528,13 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, struct btf *btf, * is not supported yet. * BPF_PROG_TYPE_RAW_TRACEPOINT is fine. */ - bpf_log(log, "BPF program ctx type is not a struct\n"); + if (log->level & BPF_LOG_LEVEL) + bpf_log(log, "arg#%d type is not a struct\n", arg); return NULL; } tname = btf_name_by_offset(btf, t->name_off); if (!tname) { - bpf_log(log, "BPF program ctx struct doesn't have a name\n"); + bpf_log(log, "arg#%d struct doesn't have a name\n", arg); return NULL; } /* prog_type is valid bpf program type. No need for bounds check. */ @@ -3565,11 +3567,12 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, struct btf *btf, static int btf_translate_to_vmlinux(struct bpf_verifier_log *log, struct btf *btf, const struct btf_type *t, - enum bpf_prog_type prog_type) + enum bpf_prog_type prog_type, + int arg) { const struct btf_member *prog_ctx_type, *kern_ctx_type; - prog_ctx_type = btf_get_prog_ctx_type(log, btf, t, prog_type); + prog_ctx_type = btf_get_prog_ctx_type(log, btf, t, prog_type, arg); if (!prog_ctx_type) return -ENOENT; kern_ctx_type = prog_ctx_type + 1; @@ -3731,7 +3734,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, info->reg_type = PTR_TO_BTF_ID; if (tgt_prog) { - ret = btf_translate_to_vmlinux(log, btf, t, tgt_prog->type); + ret = btf_translate_to_vmlinux(log, btf, t, tgt_prog->type, arg); if (ret > 0) { info->btf_id = ret; return true; @@ -4112,11 +4115,16 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, return 0; } -int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog) +/* Compare BTF of a function with given bpf_reg_state. + * Returns: + * EFAULT - there is a verifier bug. Abort verification. + * EINVAL - there is a type mismatch or BTF is not available. + * 0 - BTF matches with what bpf_reg_state expects. + * Only PTR_TO_CTX and SCALAR_VALUE states are recognized. + */ +int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, + struct bpf_reg_state *reg) { - struct bpf_verifier_state *st = env->cur_state; - struct bpf_func_state *func = st->frame[st->curframe]; - struct bpf_reg_state *reg = func->regs; struct bpf_verifier_log *log = &env->log; struct bpf_prog *prog = env->prog; struct btf *btf = prog->aux->btf; @@ -4126,27 +4134,30 @@ int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog) const char *tname; if (!prog->aux->func_info) - return 0; + return -EINVAL; btf_id = prog->aux->func_info[subprog].type_id; if (!btf_id) - return 0; + return -EFAULT; if (prog->aux->func_info_aux[subprog].unreliable) - return 0; + return -EINVAL; t = btf_type_by_id(btf, btf_id); if (!t || !btf_type_is_func(t)) { - bpf_log(log, "BTF of subprog %d doesn't point to KIND_FUNC\n", + /* These checks were already done by the verifier while loading + * struct bpf_func_info + */ + bpf_log(log, "BTF of func#%d doesn't point to KIND_FUNC\n", subprog); - return -EINVAL; + return -EFAULT; } tname = btf_name_by_offset(btf, t->name_off); t = btf_type_by_id(btf, t->type); if (!t || !btf_type_is_func_proto(t)) { - bpf_log(log, "Invalid type of func %s\n", tname); - return -EINVAL; + bpf_log(log, "Invalid BTF of func %s\n", tname); + return -EFAULT; } args = (const struct btf_param *)(t + 1); nargs = btf_type_vlen(t); @@ -4172,25 +4183,127 @@ int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog) bpf_log(log, "R%d is not a pointer\n", i + 1); goto out; } - /* If program is passing PTR_TO_CTX into subprogram - * check that BTF type matches. + /* If function expects ctx type in BTF check that caller + * is passing PTR_TO_CTX. */ - if (reg[i + 1].type == PTR_TO_CTX && - !btf_get_prog_ctx_type(log, btf, t, prog->type)) - goto out; - /* All other pointers are ok */ - continue; + if (btf_get_prog_ctx_type(log, btf, t, prog->type, i)) { + if (reg[i + 1].type != PTR_TO_CTX) { + bpf_log(log, + "arg#%d expected pointer to ctx, but got %s\n", + i, btf_kind_str[BTF_INFO_KIND(t->info)]); + goto out; + } + if (check_ctx_reg(env, ®[i + 1], i + 1)) + goto out; + continue; + } } - bpf_log(log, "Unrecognized argument type %s\n", - btf_kind_str[BTF_INFO_KIND(t->info)]); + bpf_log(log, "Unrecognized arg#%d type %s\n", + i, btf_kind_str[BTF_INFO_KIND(t->info)]); goto out; } return 0; out: - /* LLVM optimizations can remove arguments from static functions. */ - bpf_log(log, - "Type info disagrees with actual arguments due to compiler optimizations\n"); + /* Compiler optimizations can remove arguments from static functions + * or mismatched type can be passed into a global function. + * In such cases mark the function as unreliable from BTF point of view. + */ prog->aux->func_info_aux[subprog].unreliable = true; + return -EINVAL; +} + +/* Convert BTF of a function into bpf_reg_state if possible + * Returns: + * EFAULT - there is a verifier bug. Abort verification. + * EINVAL - cannot convert BTF. + * 0 - Successfully converted BTF into bpf_reg_state + * (either PTR_TO_CTX or SCALAR_VALUE). + */ +int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, + struct bpf_reg_state *reg) +{ + struct bpf_verifier_log *log = &env->log; + struct bpf_prog *prog = env->prog; + struct btf *btf = prog->aux->btf; + const struct btf_param *args; + const struct btf_type *t; + u32 i, nargs, btf_id; + const char *tname; + + if (!prog->aux->func_info || + prog->aux->func_info_aux[subprog].linkage != BTF_FUNC_GLOBAL) { + bpf_log(log, "Verifier bug\n"); + return -EFAULT; + } + + btf_id = prog->aux->func_info[subprog].type_id; + if (!btf_id) { + bpf_log(log, "Global functions need valid BTF\n"); + return -EFAULT; + } + + t = btf_type_by_id(btf, btf_id); + if (!t || !btf_type_is_func(t)) { + /* These checks were already done by the verifier while loading + * struct bpf_func_info + */ + bpf_log(log, "BTF of func#%d doesn't point to KIND_FUNC\n", + subprog); + return -EFAULT; + } + tname = btf_name_by_offset(btf, t->name_off); + + if (log->level & BPF_LOG_LEVEL) + bpf_log(log, "Validating %s() func#%d...\n", + tname, subprog); + + if (prog->aux->func_info_aux[subprog].unreliable) { + bpf_log(log, "Verifier bug in function %s()\n", tname); + return -EFAULT; + } + + t = btf_type_by_id(btf, t->type); + if (!t || !btf_type_is_func_proto(t)) { + bpf_log(log, "Invalid type of function %s()\n", tname); + return -EFAULT; + } + args = (const struct btf_param *)(t + 1); + nargs = btf_type_vlen(t); + if (nargs > 5) { + bpf_log(log, "Global function %s() with %d > 5 args. Buggy compiler.\n", + tname, nargs); + return -EINVAL; + } + /* check that function returns int */ + t = btf_type_by_id(btf, t->type); + while (btf_type_is_modifier(t)) + t = btf_type_by_id(btf, t->type); + if (!btf_type_is_int(t) && !btf_type_is_enum(t)) { + bpf_log(log, + "Global function %s() doesn't return scalar. Only those are supported.\n", + tname); + return -EINVAL; + } + /* Convert BTF function arguments into verifier types. + * Only PTR_TO_CTX and SCALAR are supported atm. + */ + for (i = 0; i < nargs; i++) { + t = btf_type_by_id(btf, args[i].type); + while (btf_type_is_modifier(t)) + t = btf_type_by_id(btf, t->type); + if (btf_type_is_int(t) || btf_type_is_enum(t)) { + reg[i + 1].type = SCALAR_VALUE; + continue; + } + if (btf_type_is_ptr(t) && + btf_get_prog_ctx_type(log, btf, t, prog->type, i)) { + reg[i + 1].type = PTR_TO_CTX; + continue; + } + bpf_log(log, "Arg#%d type %s in %s() is not supported yet.\n", + i, btf_kind_str[BTF_INFO_KIND(t->info)], tname); + return -EINVAL; + } return 0; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f5af759a8a5f..ca17dccc17ba 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1122,10 +1122,6 @@ static void init_reg_state(struct bpf_verifier_env *env, regs[BPF_REG_FP].type = PTR_TO_STACK; mark_reg_known_zero(env, regs, BPF_REG_FP); regs[BPF_REG_FP].frameno = state->frameno; - - /* 1st arg to a function */ - regs[BPF_REG_1].type = PTR_TO_CTX; - mark_reg_known_zero(env, regs, BPF_REG_1); } #define BPF_MAIN_FUNC (-1) @@ -2739,8 +2735,8 @@ static int get_callee_stack_depth(struct bpf_verifier_env *env, } #endif -static int check_ctx_reg(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, int regno) +int check_ctx_reg(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, int regno) { /* Access to ctx or passing it to a helper is only allowed in * its original, unmodified form. @@ -3956,12 +3952,26 @@ static int release_reference(struct bpf_verifier_env *env, return 0; } +static void clear_caller_saved_regs(struct bpf_verifier_env *env, + struct bpf_reg_state *regs) +{ + int i; + + /* after the call registers r0 - r5 were scratched */ + for (i = 0; i < CALLER_SAVED_REGS; i++) { + mark_reg_not_init(env, regs, caller_saved[i]); + check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); + } +} + static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx) { struct bpf_verifier_state *state = env->cur_state; + struct bpf_func_info_aux *func_info_aux; struct bpf_func_state *caller, *callee; int i, err, subprog, target_insn; + bool is_global = false; if (state->curframe + 1 >= MAX_CALL_FRAMES) { verbose(env, "the call stack of %d frames is too deep\n", @@ -3984,6 +3994,32 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return -EFAULT; } + func_info_aux = env->prog->aux->func_info_aux; + if (func_info_aux) + is_global = func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL; + err = btf_check_func_arg_match(env, subprog, caller->regs); + if (err == -EFAULT) + return err; + if (is_global) { + if (err) { + verbose(env, "Caller passes invalid args into func#%d\n", + subprog); + return err; + } else { + if (env->log.level & BPF_LOG_LEVEL) + verbose(env, + "Func#%d is global and valid. Skipping.\n", + subprog); + clear_caller_saved_regs(env, caller->regs); + + /* All global functions return SCALAR_VALUE */ + mark_reg_unknown(env, caller->regs, BPF_REG_0); + + /* continue with next insn after call */ + return 0; + } + } + callee = kzalloc(sizeof(*callee), GFP_KERNEL); if (!callee) return -ENOMEM; @@ -4010,18 +4046,11 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, for (i = BPF_REG_1; i <= BPF_REG_5; i++) callee->regs[i] = caller->regs[i]; - /* after the call registers r0 - r5 were scratched */ - for (i = 0; i < CALLER_SAVED_REGS; i++) { - mark_reg_not_init(env, caller->regs, caller_saved[i]); - check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); - } + clear_caller_saved_regs(env, caller->regs); /* only increment it after check_reg_arg() finished */ state->curframe++; - if (btf_check_func_arg_match(env, subprog)) - return -EINVAL; - /* and go analyze first insn of the callee */ *insn_idx = target_insn; @@ -6771,12 +6800,13 @@ static int check_btf_func(struct bpf_verifier_env *env, /* check type_id */ type = btf_type_by_id(btf, krecord[i].type_id); - if (!type || BTF_INFO_KIND(type->info) != BTF_KIND_FUNC) { + if (!type || !btf_type_is_func(type)) { verbose(env, "invalid type id %d in func info", krecord[i].type_id); ret = -EINVAL; goto err_free; } + info_aux[i].linkage = BTF_INFO_VLEN(type->info); prev_offset = krecord[i].insn_off; urecord += urec_size; } @@ -7756,35 +7786,13 @@ static bool reg_type_mismatch(enum bpf_reg_type src, enum bpf_reg_type prev) static int do_check(struct bpf_verifier_env *env) { - struct bpf_verifier_state *state; + struct bpf_verifier_state *state = env->cur_state; struct bpf_insn *insns = env->prog->insnsi; struct bpf_reg_state *regs; int insn_cnt = env->prog->len; bool do_print_state = false; int prev_insn_idx = -1; - env->prev_linfo = NULL; - - state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL); - if (!state) - return -ENOMEM; - state->curframe = 0; - state->speculative = false; - state->branches = 1; - state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL); - if (!state->frame[0]) { - kfree(state); - return -ENOMEM; - } - env->cur_state = state; - init_func_state(env, state->frame[0], - BPF_MAIN_FUNC /* callsite */, - 0 /* frameno */, - 0 /* subprogno, zero == main subprog */); - - if (btf_check_func_arg_match(env, 0)) - return -EINVAL; - for (;;) { struct bpf_insn *insn; u8 class; @@ -7862,7 +7870,7 @@ static int do_check(struct bpf_verifier_env *env) } regs = cur_regs(env); - env->insn_aux_data[env->insn_idx].seen = true; + env->insn_aux_data[env->insn_idx].seen = env->pass_cnt; prev_insn_idx = env->insn_idx; if (class == BPF_ALU || class == BPF_ALU64) { @@ -8082,7 +8090,7 @@ process_bpf_exit: return err; env->insn_idx++; - env->insn_aux_data[env->insn_idx].seen = true; + env->insn_aux_data[env->insn_idx].seen = env->pass_cnt; } else { verbose(env, "invalid BPF_LD mode\n"); return -EINVAL; @@ -8095,7 +8103,6 @@ process_bpf_exit: env->insn_idx++; } - env->prog->aux->stack_depth = env->subprog_info[0].stack_depth; return 0; } @@ -8372,7 +8379,7 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, memcpy(new_data + off + cnt - 1, old_data + off, sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1)); for (i = off; i < off + cnt - 1; i++) { - new_data[i].seen = true; + new_data[i].seen = env->pass_cnt; new_data[i].zext_dst = insn_has_def32(env, insn + i); } env->insn_aux_data = new_data; @@ -9484,6 +9491,7 @@ static void free_states(struct bpf_verifier_env *env) kfree(sl); sl = sln; } + env->free_list = NULL; if (!env->explored_states) return; @@ -9497,11 +9505,159 @@ static void free_states(struct bpf_verifier_env *env) kfree(sl); sl = sln; } + env->explored_states[i] = NULL; } +} - kvfree(env->explored_states); +/* The verifier is using insn_aux_data[] to store temporary data during + * verification and to store information for passes that run after the + * verification like dead code sanitization. do_check_common() for subprogram N + * may analyze many other subprograms. sanitize_insn_aux_data() clears all + * temporary data after do_check_common() finds that subprogram N cannot be + * verified independently. pass_cnt counts the number of times + * do_check_common() was run and insn->aux->seen tells the pass number + * insn_aux_data was touched. These variables are compared to clear temporary + * data from failed pass. For testing and experiments do_check_common() can be + * run multiple times even when prior attempt to verify is unsuccessful. + */ +static void sanitize_insn_aux_data(struct bpf_verifier_env *env) +{ + struct bpf_insn *insn = env->prog->insnsi; + struct bpf_insn_aux_data *aux; + int i, class; + + for (i = 0; i < env->prog->len; i++) { + class = BPF_CLASS(insn[i].code); + if (class != BPF_LDX && class != BPF_STX) + continue; + aux = &env->insn_aux_data[i]; + if (aux->seen != env->pass_cnt) + continue; + memset(aux, 0, offsetof(typeof(*aux), orig_idx)); + } } +static int do_check_common(struct bpf_verifier_env *env, int subprog) +{ + struct bpf_verifier_state *state; + struct bpf_reg_state *regs; + int ret, i; + + env->prev_linfo = NULL; + env->pass_cnt++; + + state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL); + if (!state) + return -ENOMEM; + state->curframe = 0; + state->speculative = false; + state->branches = 1; + state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL); + if (!state->frame[0]) { + kfree(state); + return -ENOMEM; + } + env->cur_state = state; + init_func_state(env, state->frame[0], + BPF_MAIN_FUNC /* callsite */, + 0 /* frameno */, + subprog); + + regs = state->frame[state->curframe]->regs; + if (subprog) { + ret = btf_prepare_func_args(env, subprog, regs); + if (ret) + goto out; + for (i = BPF_REG_1; i <= BPF_REG_5; i++) { + if (regs[i].type == PTR_TO_CTX) + mark_reg_known_zero(env, regs, i); + else if (regs[i].type == SCALAR_VALUE) + mark_reg_unknown(env, regs, i); + } + } else { + /* 1st arg to a function */ + regs[BPF_REG_1].type = PTR_TO_CTX; + mark_reg_known_zero(env, regs, BPF_REG_1); + ret = btf_check_func_arg_match(env, subprog, regs); + if (ret == -EFAULT) + /* unlikely verifier bug. abort. + * ret == 0 and ret < 0 are sadly acceptable for + * main() function due to backward compatibility. + * Like socket filter program may be written as: + * int bpf_prog(struct pt_regs *ctx) + * and never dereference that ctx in the program. + * 'struct pt_regs' is a type mismatch for socket + * filter that should be using 'struct __sk_buff'. + */ + goto out; + } + + ret = do_check(env); +out: + free_verifier_state(env->cur_state, true); + env->cur_state = NULL; + while (!pop_stack(env, NULL, NULL)); + free_states(env); + if (ret) + /* clean aux data in case subprog was rejected */ + sanitize_insn_aux_data(env); + return ret; +} + +/* Verify all global functions in a BPF program one by one based on their BTF. + * All global functions must pass verification. Otherwise the whole program is rejected. + * Consider: + * int bar(int); + * int foo(int f) + * { + * return bar(f); + * } + * int bar(int b) + * { + * ... + * } + * foo() will be verified first for R1=any_scalar_value. During verification it + * will be assumed that bar() already verified successfully and call to bar() + * from foo() will be checked for type match only. Later bar() will be verified + * independently to check that it's safe for R1=any_scalar_value. + */ +static int do_check_subprogs(struct bpf_verifier_env *env) +{ + struct bpf_prog_aux *aux = env->prog->aux; + int i, ret; + + if (!aux->func_info) + return 0; + + for (i = 1; i < env->subprog_cnt; i++) { + if (aux->func_info_aux[i].linkage != BTF_FUNC_GLOBAL) + continue; + env->insn_idx = env->subprog_info[i].start; + WARN_ON_ONCE(env->insn_idx == 0); + ret = do_check_common(env, i); + if (ret) { + return ret; + } else if (env->log.level & BPF_LOG_LEVEL) { + verbose(env, + "Func#%d is safe for any args that match its prototype\n", + i); + } + } + return 0; +} + +static int do_check_main(struct bpf_verifier_env *env) +{ + int ret; + + env->insn_idx = 0; + ret = do_check_common(env, 0); + if (!ret) + env->prog->aux->stack_depth = env->subprog_info[0].stack_depth; + return ret; +} + + static void print_verification_stats(struct bpf_verifier_env *env) { int i; @@ -9849,18 +10005,14 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, if (ret < 0) goto skip_full_check; - ret = do_check(env); - if (env->cur_state) { - free_verifier_state(env->cur_state, true); - env->cur_state = NULL; - } + ret = do_check_subprogs(env); + ret = ret ?: do_check_main(env); if (ret == 0 && bpf_prog_is_dev_bound(env->prog->aux)) ret = bpf_prog_offload_finalize(env); skip_full_check: - while (!pop_stack(env, NULL, NULL)); - free_states(env); + kvfree(env->explored_states); if (ret == 0) ret = check_max_stack_depth(env); -- cgit v1.2.3 From 8482941f09067da42f9c3362e15bfb3f3c19d610 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 14 Jan 2020 19:50:02 -0800 Subject: bpf: Add bpf_send_signal_thread() helper Commit 8b401f9ed244 ("bpf: implement bpf_send_signal() helper") added helper bpf_send_signal() which permits bpf program to send a signal to the current process. The signal may be delivered to any threads in the process. We found a use case where sending the signal to the current thread is more preferable. - A bpf program will collect the stack trace and then send signal to the user application. - The user application will add some thread specific information to the just collected stack trace for later analysis. If bpf_send_signal() is used, user application will need to check whether the thread receiving the signal matches the thread collecting the stack by checking thread id. If not, it will need to send signal to another thread through pthread_kill(). This patch proposed a new helper bpf_send_signal_thread(), which sends the signal to the thread corresponding to the current kernel task. This way, user space is guaranteed that bpf_program execution context and user space signal handling context are the same thread. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200115035002.602336-1-yhs@fb.com --- include/uapi/linux/bpf.h | 19 +++++++++++++++++-- kernel/trace/bpf_trace.c | 27 ++++++++++++++++++++++++--- tools/include/uapi/linux/bpf.h | 19 +++++++++++++++++-- 3 files changed, 58 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 52966e758fe5..a88837d4dd18 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2714,7 +2714,8 @@ union bpf_attr { * * int bpf_send_signal(u32 sig) * Description - * Send signal *sig* to the current task. + * Send signal *sig* to the process of the current task. + * The signal may be delivered to any of this process's threads. * Return * 0 on success or successfully queued. * @@ -2850,6 +2851,19 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * + * int bpf_send_signal_thread(u32 sig) + * Description + * Send signal *sig* to the thread corresponding to the current task. + * Return + * 0 on success or successfully queued. + * + * **-EBUSY** if work queue under nmi is full. + * + * **-EINVAL** if *sig* is invalid. + * + * **-EPERM** if no permission to send the *sig*. + * + * **-EAGAIN** if bpf program can try again. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2968,7 +2982,8 @@ union bpf_attr { FN(probe_read_kernel), \ FN(probe_read_user_str), \ FN(probe_read_kernel_str), \ - FN(tcp_send_ack), + FN(tcp_send_ack), \ + FN(send_signal_thread), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index e5ef4ae9edb5..19e793aa441a 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -703,6 +703,7 @@ struct send_signal_irq_work { struct irq_work irq_work; struct task_struct *task; u32 sig; + enum pid_type type; }; static DEFINE_PER_CPU(struct send_signal_irq_work, send_signal_work); @@ -712,10 +713,10 @@ static void do_bpf_send_signal(struct irq_work *entry) struct send_signal_irq_work *work; work = container_of(entry, struct send_signal_irq_work, irq_work); - group_send_sig_info(work->sig, SEND_SIG_PRIV, work->task, PIDTYPE_TGID); + group_send_sig_info(work->sig, SEND_SIG_PRIV, work->task, work->type); } -BPF_CALL_1(bpf_send_signal, u32, sig) +static int bpf_send_signal_common(u32 sig, enum pid_type type) { struct send_signal_irq_work *work = NULL; @@ -748,11 +749,17 @@ BPF_CALL_1(bpf_send_signal, u32, sig) */ work->task = current; work->sig = sig; + work->type = type; irq_work_queue(&work->irq_work); return 0; } - return group_send_sig_info(sig, SEND_SIG_PRIV, current, PIDTYPE_TGID); + return group_send_sig_info(sig, SEND_SIG_PRIV, current, type); +} + +BPF_CALL_1(bpf_send_signal, u32, sig) +{ + return bpf_send_signal_common(sig, PIDTYPE_TGID); } static const struct bpf_func_proto bpf_send_signal_proto = { @@ -762,6 +769,18 @@ static const struct bpf_func_proto bpf_send_signal_proto = { .arg1_type = ARG_ANYTHING, }; +BPF_CALL_1(bpf_send_signal_thread, u32, sig) +{ + return bpf_send_signal_common(sig, PIDTYPE_PID); +} + +static const struct bpf_func_proto bpf_send_signal_thread_proto = { + .func = bpf_send_signal_thread, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto * tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -822,6 +841,8 @@ tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) #endif case BPF_FUNC_send_signal: return &bpf_send_signal_proto; + case BPF_FUNC_send_signal_thread: + return &bpf_send_signal_thread_proto; default: return NULL; } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 52966e758fe5..a88837d4dd18 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2714,7 +2714,8 @@ union bpf_attr { * * int bpf_send_signal(u32 sig) * Description - * Send signal *sig* to the current task. + * Send signal *sig* to the process of the current task. + * The signal may be delivered to any of this process's threads. * Return * 0 on success or successfully queued. * @@ -2850,6 +2851,19 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * + * int bpf_send_signal_thread(u32 sig) + * Description + * Send signal *sig* to the thread corresponding to the current task. + * Return + * 0 on success or successfully queued. + * + * **-EBUSY** if work queue under nmi is full. + * + * **-EINVAL** if *sig* is invalid. + * + * **-EPERM** if no permission to send the *sig*. + * + * **-EAGAIN** if bpf program can try again. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2968,7 +2982,8 @@ union bpf_attr { FN(probe_read_kernel), \ FN(probe_read_user_str), \ FN(probe_read_kernel_str), \ - FN(tcp_send_ack), + FN(tcp_send_ack), \ + FN(send_signal_thread), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.2.3 From cb4d03ab499d4c040f4ab6fd4389d2b49f42b5a5 Mon Sep 17 00:00:00 2001 From: Brian Vazquez Date: Wed, 15 Jan 2020 10:43:01 -0800 Subject: bpf: Add generic support for lookup batch op This commit introduces generic support for the bpf_map_lookup_batch. This implementation can be used by almost all the bpf maps since its core implementation is relying on the existing map_get_next_key and map_lookup_elem. The bpf syscall subcommand introduced is: BPF_MAP_LOOKUP_BATCH The UAPI attribute is: struct { /* struct used by BPF_MAP_*_BATCH commands */ __aligned_u64 in_batch; /* start batch, * NULL to start from beginning */ __aligned_u64 out_batch; /* output: next start batch */ __aligned_u64 keys; __aligned_u64 values; __u32 count; /* input/output: * input: # of key/value * elements * output: # of filled elements */ __u32 map_fd; __u64 elem_flags; __u64 flags; } batch; in_batch/out_batch are opaque values use to communicate between user/kernel space, in_batch/out_batch must be of key_size length. To start iterating from the beginning in_batch must be null, count is the # of key/value elements to retrieve. Note that the 'keys' buffer must be a buffer of key_size * count size and the 'values' buffer must be value_size * count, where value_size must be aligned to 8 bytes by userspace if it's dealing with percpu maps. 'count' will contain the number of keys/values successfully retrieved. Note that 'count' is an input/output variable and it can contain a lower value after a call. If there's no more entries to retrieve, ENOENT will be returned. If error is ENOENT, count might be > 0 in case it copied some values but there were no more entries to retrieve. Note that if the return code is an error and not -EFAULT, count indicates the number of elements successfully processed. Suggested-by: Stanislav Fomichev Signed-off-by: Brian Vazquez Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200115184308.162644-3-brianvv@google.com --- include/linux/bpf.h | 5 ++ include/uapi/linux/bpf.h | 18 ++++++ kernel/bpf/syscall.c | 160 +++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 179 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index aed2bc39d72b..807744ecaa5a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -44,6 +44,8 @@ struct bpf_map_ops { int (*map_get_next_key)(struct bpf_map *map, void *key, void *next_key); void (*map_release_uref)(struct bpf_map *map); void *(*map_lookup_elem_sys_only)(struct bpf_map *map, void *key); + int (*map_lookup_batch)(struct bpf_map *map, const union bpf_attr *attr, + union bpf_attr __user *uattr); /* funcs callable from userspace and from eBPF programs */ void *(*map_lookup_elem)(struct bpf_map *map, void *key); @@ -982,6 +984,9 @@ void *bpf_map_area_alloc(u64 size, int numa_node); void *bpf_map_area_mmapable_alloc(u64 size, int numa_node); void bpf_map_area_free(void *base); void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr); +int generic_map_lookup_batch(struct bpf_map *map, + const union bpf_attr *attr, + union bpf_attr __user *uattr); extern int sysctl_unprivileged_bpf_disabled; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a88837d4dd18..0721b2c6bb8c 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -107,6 +107,7 @@ enum bpf_cmd { BPF_MAP_LOOKUP_AND_DELETE_ELEM, BPF_MAP_FREEZE, BPF_BTF_GET_NEXT_ID, + BPF_MAP_LOOKUP_BATCH, }; enum bpf_map_type { @@ -420,6 +421,23 @@ union bpf_attr { __u64 flags; }; + struct { /* struct used by BPF_MAP_*_BATCH commands */ + __aligned_u64 in_batch; /* start batch, + * NULL to start from beginning + */ + __aligned_u64 out_batch; /* output: next start batch */ + __aligned_u64 keys; + __aligned_u64 values; + __u32 count; /* input/output: + * input: # of key/value + * elements + * output: # of filled elements + */ + __u32 map_fd; + __u64 elem_flags; + __u64 flags; + } batch; + struct { /* anonymous struct used by BPF_PROG_LOAD command */ __u32 prog_type; /* one of enum bpf_prog_type */ __u32 insn_cnt; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 08b0b6e40454..d604ddbb1afb 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -219,10 +219,8 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value, void *ptr; int err; - if (bpf_map_is_dev_bound(map)) { - err = bpf_map_offload_lookup_elem(map, key, value); - return err; - } + if (bpf_map_is_dev_bound(map)) + return bpf_map_offload_lookup_elem(map, key, value); preempt_disable(); this_cpu_inc(bpf_prog_active); @@ -1220,6 +1218,109 @@ err_put: return err; } +#define MAP_LOOKUP_RETRIES 3 + +int generic_map_lookup_batch(struct bpf_map *map, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + void __user *uobatch = u64_to_user_ptr(attr->batch.out_batch); + void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch); + void __user *values = u64_to_user_ptr(attr->batch.values); + void __user *keys = u64_to_user_ptr(attr->batch.keys); + void *buf, *buf_prevkey, *prev_key, *key, *value; + int err, retry = MAP_LOOKUP_RETRIES; + u32 value_size, cp, max_count; + bool first_key = false; + + if (attr->batch.elem_flags & ~BPF_F_LOCK) + return -EINVAL; + + if ((attr->batch.elem_flags & BPF_F_LOCK) && + !map_value_has_spin_lock(map)) + return -EINVAL; + + value_size = bpf_map_value_size(map); + + max_count = attr->batch.count; + if (!max_count) + return 0; + + if (put_user(0, &uattr->batch.count)) + return -EFAULT; + + buf_prevkey = kmalloc(map->key_size, GFP_USER | __GFP_NOWARN); + if (!buf_prevkey) + return -ENOMEM; + + buf = kmalloc(map->key_size + value_size, GFP_USER | __GFP_NOWARN); + if (!buf) { + kvfree(buf_prevkey); + return -ENOMEM; + } + + err = -EFAULT; + first_key = false; + prev_key = NULL; + if (ubatch && copy_from_user(buf_prevkey, ubatch, map->key_size)) + goto free_buf; + key = buf; + value = key + map->key_size; + if (ubatch) + prev_key = buf_prevkey; + + for (cp = 0; cp < max_count;) { + rcu_read_lock(); + err = map->ops->map_get_next_key(map, prev_key, key); + rcu_read_unlock(); + if (err) + break; + err = bpf_map_copy_value(map, key, value, + attr->batch.elem_flags); + + if (err == -ENOENT) { + if (retry) { + retry--; + continue; + } + err = -EINTR; + break; + } + + if (err) + goto free_buf; + + if (copy_to_user(keys + cp * map->key_size, key, + map->key_size)) { + err = -EFAULT; + goto free_buf; + } + if (copy_to_user(values + cp * value_size, value, value_size)) { + err = -EFAULT; + goto free_buf; + } + + if (!prev_key) + prev_key = buf_prevkey; + + swap(prev_key, key); + retry = MAP_LOOKUP_RETRIES; + cp++; + } + + if (err == -EFAULT) + goto free_buf; + + if ((copy_to_user(&uattr->batch.count, &cp, sizeof(cp)) || + (cp && copy_to_user(uobatch, prev_key, map->key_size)))) + err = -EFAULT; + +free_buf: + kfree(buf_prevkey); + kfree(buf); + return err; +} + #define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD value static int map_lookup_and_delete_elem(union bpf_attr *attr) @@ -3076,6 +3177,54 @@ out: return err; } +#define BPF_MAP_BATCH_LAST_FIELD batch.flags + +#define BPF_DO_BATCH(fn) \ + do { \ + if (!fn) { \ + err = -ENOTSUPP; \ + goto err_put; \ + } \ + err = fn(map, attr, uattr); \ + } while (0) + +static int bpf_map_do_batch(const union bpf_attr *attr, + union bpf_attr __user *uattr, + int cmd) +{ + struct bpf_map *map; + int err, ufd; + struct fd f; + + if (CHECK_ATTR(BPF_MAP_BATCH)) + return -EINVAL; + + ufd = attr->batch.map_fd; + f = fdget(ufd); + map = __bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + + if (cmd == BPF_MAP_LOOKUP_BATCH && + !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) { + err = -EPERM; + goto err_put; + } + + if (cmd != BPF_MAP_LOOKUP_BATCH && + !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { + err = -EPERM; + goto err_put; + } + + if (cmd == BPF_MAP_LOOKUP_BATCH) + BPF_DO_BATCH(map->ops->map_lookup_batch); + +err_put: + fdput(f); + return err; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr = {}; @@ -3173,6 +3322,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_MAP_LOOKUP_AND_DELETE_ELEM: err = map_lookup_and_delete_elem(&attr); break; + case BPF_MAP_LOOKUP_BATCH: + err = bpf_map_do_batch(&attr, uattr, BPF_MAP_LOOKUP_BATCH); + break; default: err = -EINVAL; break; -- cgit v1.2.3 From aa2e93b8e58e18442edfb2427446732415bc215e Mon Sep 17 00:00:00 2001 From: Brian Vazquez Date: Wed, 15 Jan 2020 10:43:02 -0800 Subject: bpf: Add generic support for update and delete batch ops This commit adds generic support for update and delete batch ops that can be used for almost all the bpf maps. These commands share the same UAPI attr that lookup and lookup_and_delete batch ops use and the syscall commands are: BPF_MAP_UPDATE_BATCH BPF_MAP_DELETE_BATCH The main difference between update/delete and lookup batch ops is that for update/delete keys/values must be specified for userspace and because of that, neither in_batch nor out_batch are used. Suggested-by: Stanislav Fomichev Signed-off-by: Brian Vazquez Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200115184308.162644-4-brianvv@google.com --- include/linux/bpf.h | 10 +++++ include/uapi/linux/bpf.h | 2 + kernel/bpf/syscall.c | 115 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 127 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 807744ecaa5a..05466ad6cf1c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -46,6 +46,10 @@ struct bpf_map_ops { void *(*map_lookup_elem_sys_only)(struct bpf_map *map, void *key); int (*map_lookup_batch)(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr); + int (*map_update_batch)(struct bpf_map *map, const union bpf_attr *attr, + union bpf_attr __user *uattr); + int (*map_delete_batch)(struct bpf_map *map, const union bpf_attr *attr, + union bpf_attr __user *uattr); /* funcs callable from userspace and from eBPF programs */ void *(*map_lookup_elem)(struct bpf_map *map, void *key); @@ -987,6 +991,12 @@ void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr); int generic_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr); +int generic_map_update_batch(struct bpf_map *map, + const union bpf_attr *attr, + union bpf_attr __user *uattr); +int generic_map_delete_batch(struct bpf_map *map, + const union bpf_attr *attr, + union bpf_attr __user *uattr); extern int sysctl_unprivileged_bpf_disabled; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0721b2c6bb8c..d5320b0be459 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -108,6 +108,8 @@ enum bpf_cmd { BPF_MAP_FREEZE, BPF_BTF_GET_NEXT_ID, BPF_MAP_LOOKUP_BATCH, + BPF_MAP_UPDATE_BATCH, + BPF_MAP_DELETE_BATCH, }; enum bpf_map_type { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index d604ddbb1afb..ce8244d1ba99 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1218,6 +1218,111 @@ err_put: return err; } +int generic_map_delete_batch(struct bpf_map *map, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + void __user *keys = u64_to_user_ptr(attr->batch.keys); + u32 cp, max_count; + int err = 0; + void *key; + + if (attr->batch.elem_flags & ~BPF_F_LOCK) + return -EINVAL; + + if ((attr->batch.elem_flags & BPF_F_LOCK) && + !map_value_has_spin_lock(map)) { + return -EINVAL; + } + + max_count = attr->batch.count; + if (!max_count) + return 0; + + for (cp = 0; cp < max_count; cp++) { + key = __bpf_copy_key(keys + cp * map->key_size, map->key_size); + if (IS_ERR(key)) { + err = PTR_ERR(key); + break; + } + + if (bpf_map_is_dev_bound(map)) { + err = bpf_map_offload_delete_elem(map, key); + break; + } + + preempt_disable(); + __this_cpu_inc(bpf_prog_active); + rcu_read_lock(); + err = map->ops->map_delete_elem(map, key); + rcu_read_unlock(); + __this_cpu_dec(bpf_prog_active); + preempt_enable(); + maybe_wait_bpf_programs(map); + if (err) + break; + } + if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp))) + err = -EFAULT; + return err; +} + +int generic_map_update_batch(struct bpf_map *map, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + void __user *values = u64_to_user_ptr(attr->batch.values); + void __user *keys = u64_to_user_ptr(attr->batch.keys); + u32 value_size, cp, max_count; + int ufd = attr->map_fd; + void *key, *value; + struct fd f; + int err = 0; + + f = fdget(ufd); + if (attr->batch.elem_flags & ~BPF_F_LOCK) + return -EINVAL; + + if ((attr->batch.elem_flags & BPF_F_LOCK) && + !map_value_has_spin_lock(map)) { + return -EINVAL; + } + + value_size = bpf_map_value_size(map); + + max_count = attr->batch.count; + if (!max_count) + return 0; + + value = kmalloc(value_size, GFP_USER | __GFP_NOWARN); + if (!value) + return -ENOMEM; + + for (cp = 0; cp < max_count; cp++) { + key = __bpf_copy_key(keys + cp * map->key_size, map->key_size); + if (IS_ERR(key)) { + err = PTR_ERR(key); + break; + } + err = -EFAULT; + if (copy_from_user(value, values + cp * value_size, value_size)) + break; + + err = bpf_map_update_value(map, f, key, value, + attr->batch.elem_flags); + + if (err) + break; + } + + if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp))) + err = -EFAULT; + + kfree(value); + kfree(key); + return err; +} + #define MAP_LOOKUP_RETRIES 3 int generic_map_lookup_batch(struct bpf_map *map, @@ -3219,6 +3324,10 @@ static int bpf_map_do_batch(const union bpf_attr *attr, if (cmd == BPF_MAP_LOOKUP_BATCH) BPF_DO_BATCH(map->ops->map_lookup_batch); + else if (cmd == BPF_MAP_UPDATE_BATCH) + BPF_DO_BATCH(map->ops->map_update_batch); + else + BPF_DO_BATCH(map->ops->map_delete_batch); err_put: fdput(f); @@ -3325,6 +3434,12 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_MAP_LOOKUP_BATCH: err = bpf_map_do_batch(&attr, uattr, BPF_MAP_LOOKUP_BATCH); break; + case BPF_MAP_UPDATE_BATCH: + err = bpf_map_do_batch(&attr, uattr, BPF_MAP_UPDATE_BATCH); + break; + case BPF_MAP_DELETE_BATCH: + err = bpf_map_do_batch(&attr, uattr, BPF_MAP_DELETE_BATCH); + break; default: err = -EINVAL; break; -- cgit v1.2.3 From 057996380a42bb64ccc04383cfa9c0ace4ea11f0 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 15 Jan 2020 10:43:04 -0800 Subject: bpf: Add batch ops to all htab bpf map htab can't use generic batch support due some problematic behaviours inherent to the data structre, i.e. while iterating the bpf map a concurrent program might delete the next entry that batch was about to use, in that case there's no easy solution to retrieve the next entry, the issue has been discussed multiple times (see [1] and [2]). The only way hmap can be traversed without the problem previously exposed is by making sure that the map is traversing entire buckets. This commit implements those strict requirements for hmap, the implementation follows the same interaction that generic support with some exceptions: - If keys/values buffer are not big enough to traverse a bucket, ENOSPC will be returned. - out_batch contains the value of the next bucket in the iteration, not the next key, but this is transparent for the user since the user should never use out_batch for other than bpf batch syscalls. This commits implements BPF_MAP_LOOKUP_BATCH and adds support for new command BPF_MAP_LOOKUP_AND_DELETE_BATCH. Note that for update/delete batch ops it is possible to use the generic implementations. [1] https://lore.kernel.org/bpf/20190724165803.87470-1-brianvv@google.com/ [2] https://lore.kernel.org/bpf/20190906225434.3635421-1-yhs@fb.com/ Signed-off-by: Yonghong Song Signed-off-by: Brian Vazquez Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200115184308.162644-6-brianvv@google.com --- include/linux/bpf.h | 3 + include/uapi/linux/bpf.h | 1 + kernel/bpf/hashtab.c | 264 +++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 9 +- 4 files changed, 276 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 05466ad6cf1c..3517e32149a4 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -46,6 +46,9 @@ struct bpf_map_ops { void *(*map_lookup_elem_sys_only)(struct bpf_map *map, void *key); int (*map_lookup_batch)(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr); + int (*map_lookup_and_delete_batch)(struct bpf_map *map, + const union bpf_attr *attr, + union bpf_attr __user *uattr); int (*map_update_batch)(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr); int (*map_delete_batch)(struct bpf_map *map, const union bpf_attr *attr, diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d5320b0be459..033d90a2282d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -108,6 +108,7 @@ enum bpf_cmd { BPF_MAP_FREEZE, BPF_BTF_GET_NEXT_ID, BPF_MAP_LOOKUP_BATCH, + BPF_MAP_LOOKUP_AND_DELETE_BATCH, BPF_MAP_UPDATE_BATCH, BPF_MAP_DELETE_BATCH, }; diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 22066a62c8c9..2d182c4ee9d9 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -17,6 +17,16 @@ (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \ BPF_F_ACCESS_MASK | BPF_F_ZERO_SEED) +#define BATCH_OPS(_name) \ + .map_lookup_batch = \ + _name##_map_lookup_batch, \ + .map_lookup_and_delete_batch = \ + _name##_map_lookup_and_delete_batch, \ + .map_update_batch = \ + generic_map_update_batch, \ + .map_delete_batch = \ + generic_map_delete_batch + struct bucket { struct hlist_nulls_head head; raw_spinlock_t lock; @@ -1232,6 +1242,256 @@ static void htab_map_seq_show_elem(struct bpf_map *map, void *key, rcu_read_unlock(); } +static int +__htab_map_lookup_and_delete_batch(struct bpf_map *map, + const union bpf_attr *attr, + union bpf_attr __user *uattr, + bool do_delete, bool is_lru_map, + bool is_percpu) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + u32 bucket_cnt, total, key_size, value_size, roundup_key_size; + void *keys = NULL, *values = NULL, *value, *dst_key, *dst_val; + void __user *uvalues = u64_to_user_ptr(attr->batch.values); + void __user *ukeys = u64_to_user_ptr(attr->batch.keys); + void *ubatch = u64_to_user_ptr(attr->batch.in_batch); + u32 batch, max_count, size, bucket_size; + u64 elem_map_flags, map_flags; + struct hlist_nulls_head *head; + struct hlist_nulls_node *n; + unsigned long flags; + struct htab_elem *l; + struct bucket *b; + int ret = 0; + + elem_map_flags = attr->batch.elem_flags; + if ((elem_map_flags & ~BPF_F_LOCK) || + ((elem_map_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map))) + return -EINVAL; + + map_flags = attr->batch.flags; + if (map_flags) + return -EINVAL; + + max_count = attr->batch.count; + if (!max_count) + return 0; + + if (put_user(0, &uattr->batch.count)) + return -EFAULT; + + batch = 0; + if (ubatch && copy_from_user(&batch, ubatch, sizeof(batch))) + return -EFAULT; + + if (batch >= htab->n_buckets) + return -ENOENT; + + key_size = htab->map.key_size; + roundup_key_size = round_up(htab->map.key_size, 8); + value_size = htab->map.value_size; + size = round_up(value_size, 8); + if (is_percpu) + value_size = size * num_possible_cpus(); + total = 0; + /* while experimenting with hash tables with sizes ranging from 10 to + * 1000, it was observed that a bucket can have upto 5 entries. + */ + bucket_size = 5; + +alloc: + /* We cannot do copy_from_user or copy_to_user inside + * the rcu_read_lock. Allocate enough space here. + */ + keys = kvmalloc(key_size * bucket_size, GFP_USER | __GFP_NOWARN); + values = kvmalloc(value_size * bucket_size, GFP_USER | __GFP_NOWARN); + if (!keys || !values) { + ret = -ENOMEM; + goto after_loop; + } + +again: + preempt_disable(); + this_cpu_inc(bpf_prog_active); + rcu_read_lock(); +again_nocopy: + dst_key = keys; + dst_val = values; + b = &htab->buckets[batch]; + head = &b->head; + raw_spin_lock_irqsave(&b->lock, flags); + + bucket_cnt = 0; + hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) + bucket_cnt++; + + if (bucket_cnt > (max_count - total)) { + if (total == 0) + ret = -ENOSPC; + raw_spin_unlock_irqrestore(&b->lock, flags); + rcu_read_unlock(); + this_cpu_dec(bpf_prog_active); + preempt_enable(); + goto after_loop; + } + + if (bucket_cnt > bucket_size) { + bucket_size = bucket_cnt; + raw_spin_unlock_irqrestore(&b->lock, flags); + rcu_read_unlock(); + this_cpu_dec(bpf_prog_active); + preempt_enable(); + kvfree(keys); + kvfree(values); + goto alloc; + } + + hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { + memcpy(dst_key, l->key, key_size); + + if (is_percpu) { + int off = 0, cpu; + void __percpu *pptr; + + pptr = htab_elem_get_ptr(l, map->key_size); + for_each_possible_cpu(cpu) { + bpf_long_memcpy(dst_val + off, + per_cpu_ptr(pptr, cpu), size); + off += size; + } + } else { + value = l->key + roundup_key_size; + if (elem_map_flags & BPF_F_LOCK) + copy_map_value_locked(map, dst_val, value, + true); + else + copy_map_value(map, dst_val, value); + check_and_init_map_lock(map, dst_val); + } + if (do_delete) { + hlist_nulls_del_rcu(&l->hash_node); + if (is_lru_map) + bpf_lru_push_free(&htab->lru, &l->lru_node); + else + free_htab_elem(htab, l); + } + dst_key += key_size; + dst_val += value_size; + } + + raw_spin_unlock_irqrestore(&b->lock, flags); + /* If we are not copying data, we can go to next bucket and avoid + * unlocking the rcu. + */ + if (!bucket_cnt && (batch + 1 < htab->n_buckets)) { + batch++; + goto again_nocopy; + } + + rcu_read_unlock(); + this_cpu_dec(bpf_prog_active); + preempt_enable(); + if (bucket_cnt && (copy_to_user(ukeys + total * key_size, keys, + key_size * bucket_cnt) || + copy_to_user(uvalues + total * value_size, values, + value_size * bucket_cnt))) { + ret = -EFAULT; + goto after_loop; + } + + total += bucket_cnt; + batch++; + if (batch >= htab->n_buckets) { + ret = -ENOENT; + goto after_loop; + } + goto again; + +after_loop: + if (ret == -EFAULT) + goto out; + + /* copy # of entries and next batch */ + ubatch = u64_to_user_ptr(attr->batch.out_batch); + if (copy_to_user(ubatch, &batch, sizeof(batch)) || + put_user(total, &uattr->batch.count)) + ret = -EFAULT; + +out: + kvfree(keys); + kvfree(values); + return ret; +} + +static int +htab_percpu_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return __htab_map_lookup_and_delete_batch(map, attr, uattr, false, + false, true); +} + +static int +htab_percpu_map_lookup_and_delete_batch(struct bpf_map *map, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return __htab_map_lookup_and_delete_batch(map, attr, uattr, true, + false, true); +} + +static int +htab_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return __htab_map_lookup_and_delete_batch(map, attr, uattr, false, + false, false); +} + +static int +htab_map_lookup_and_delete_batch(struct bpf_map *map, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return __htab_map_lookup_and_delete_batch(map, attr, uattr, true, + false, false); +} + +static int +htab_lru_percpu_map_lookup_batch(struct bpf_map *map, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return __htab_map_lookup_and_delete_batch(map, attr, uattr, false, + true, true); +} + +static int +htab_lru_percpu_map_lookup_and_delete_batch(struct bpf_map *map, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return __htab_map_lookup_and_delete_batch(map, attr, uattr, true, + true, true); +} + +static int +htab_lru_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return __htab_map_lookup_and_delete_batch(map, attr, uattr, false, + true, false); +} + +static int +htab_lru_map_lookup_and_delete_batch(struct bpf_map *map, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return __htab_map_lookup_and_delete_batch(map, attr, uattr, true, + true, false); +} + const struct bpf_map_ops htab_map_ops = { .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, @@ -1242,6 +1502,7 @@ const struct bpf_map_ops htab_map_ops = { .map_delete_elem = htab_map_delete_elem, .map_gen_lookup = htab_map_gen_lookup, .map_seq_show_elem = htab_map_seq_show_elem, + BATCH_OPS(htab), }; const struct bpf_map_ops htab_lru_map_ops = { @@ -1255,6 +1516,7 @@ const struct bpf_map_ops htab_lru_map_ops = { .map_delete_elem = htab_lru_map_delete_elem, .map_gen_lookup = htab_lru_map_gen_lookup, .map_seq_show_elem = htab_map_seq_show_elem, + BATCH_OPS(htab_lru), }; /* Called from eBPF program */ @@ -1368,6 +1630,7 @@ const struct bpf_map_ops htab_percpu_map_ops = { .map_update_elem = htab_percpu_map_update_elem, .map_delete_elem = htab_map_delete_elem, .map_seq_show_elem = htab_percpu_map_seq_show_elem, + BATCH_OPS(htab_percpu), }; const struct bpf_map_ops htab_lru_percpu_map_ops = { @@ -1379,6 +1642,7 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = { .map_update_elem = htab_lru_percpu_map_update_elem, .map_delete_elem = htab_lru_map_delete_elem, .map_seq_show_elem = htab_percpu_map_seq_show_elem, + BATCH_OPS(htab_lru_percpu), }; static int fd_htab_map_alloc_check(union bpf_attr *attr) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index ce8244d1ba99..0d94d361379c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3310,7 +3310,8 @@ static int bpf_map_do_batch(const union bpf_attr *attr, if (IS_ERR(map)) return PTR_ERR(map); - if (cmd == BPF_MAP_LOOKUP_BATCH && + if ((cmd == BPF_MAP_LOOKUP_BATCH || + cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH) && !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) { err = -EPERM; goto err_put; @@ -3324,6 +3325,8 @@ static int bpf_map_do_batch(const union bpf_attr *attr, if (cmd == BPF_MAP_LOOKUP_BATCH) BPF_DO_BATCH(map->ops->map_lookup_batch); + else if (cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH) + BPF_DO_BATCH(map->ops->map_lookup_and_delete_batch); else if (cmd == BPF_MAP_UPDATE_BATCH) BPF_DO_BATCH(map->ops->map_update_batch); else @@ -3434,6 +3437,10 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_MAP_LOOKUP_BATCH: err = bpf_map_do_batch(&attr, uattr, BPF_MAP_LOOKUP_BATCH); break; + case BPF_MAP_LOOKUP_AND_DELETE_BATCH: + err = bpf_map_do_batch(&attr, uattr, + BPF_MAP_LOOKUP_AND_DELETE_BATCH); + break; case BPF_MAP_UPDATE_BATCH: err = bpf_map_do_batch(&attr, uattr, BPF_MAP_UPDATE_BATCH); break; -- cgit v1.2.3 From 75ccae62cb8d42a619323a85c577107b8b37d797 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Thu, 16 Jan 2020 16:14:44 +0100 Subject: xdp: Move devmap bulk queue into struct net_device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 96360004b862 ("xdp: Make devmap flush_list common for all map instances"), changed devmap flushing to be a global operation instead of a per-map operation. However, the queue structure used for bulking was still allocated as part of the containing map. This patch moves the devmap bulk queue into struct net_device. The motivation for this is reusing it for the non-map variant of XDP_REDIRECT, which will be changed in a subsequent commit. To avoid other fields of struct net_device moving to different cache lines, we also move a couple of other members around. We defer the actual allocation of the bulk queue structure until the NETDEV_REGISTER notification devmap.c. This makes it possible to check for ndo_xdp_xmit support before allocating the structure, which is not possible at the time struct net_device is allocated. However, we keep the freeing in free_netdev() to avoid adding another RCU callback on NETDEV_UNREGISTER. Because of this change, we lose the reference back to the map that originated the redirect, so change the tracepoint to always return 0 as the map ID and index. Otherwise no functional change is intended with this patch. After this patch, the relevant part of struct net_device looks like this, according to pahole: /* --- cacheline 14 boundary (896 bytes) --- */ struct netdev_queue * _tx __attribute__((__aligned__(64))); /* 896 8 */ unsigned int num_tx_queues; /* 904 4 */ unsigned int real_num_tx_queues; /* 908 4 */ struct Qdisc * qdisc; /* 912 8 */ unsigned int tx_queue_len; /* 920 4 */ spinlock_t tx_global_lock; /* 924 4 */ struct xdp_dev_bulk_queue * xdp_bulkq; /* 928 8 */ struct xps_dev_maps * xps_cpus_map; /* 936 8 */ struct xps_dev_maps * xps_rxqs_map; /* 944 8 */ struct mini_Qdisc * miniq_egress; /* 952 8 */ /* --- cacheline 15 boundary (960 bytes) --- */ struct hlist_head qdisc_hash[16]; /* 960 128 */ /* --- cacheline 17 boundary (1088 bytes) --- */ struct timer_list watchdog_timer; /* 1088 40 */ /* XXX last struct has 4 bytes of padding */ int watchdog_timeo; /* 1128 4 */ /* XXX 4 bytes hole, try to pack */ struct list_head todo_list; /* 1136 16 */ /* --- cacheline 18 boundary (1152 bytes) --- */ Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov Acked-by: Björn Töpel Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/157918768397.1458396.12673224324627072349.stgit@toke.dk --- include/linux/netdevice.h | 13 ++++++---- include/trace/events/xdp.h | 2 +- kernel/bpf/devmap.c | 63 ++++++++++++++++++++-------------------------- net/core/dev.c | 2 ++ 4 files changed, 38 insertions(+), 42 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2741aa35bec6..5ec3537fbdb1 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -876,6 +876,7 @@ enum bpf_netdev_command { struct bpf_prog_offload_ops; struct netlink_ext_ack; struct xdp_umem; +struct xdp_dev_bulk_queue; struct netdev_bpf { enum bpf_netdev_command command; @@ -1986,12 +1987,10 @@ struct net_device { unsigned int num_tx_queues; unsigned int real_num_tx_queues; struct Qdisc *qdisc; -#ifdef CONFIG_NET_SCHED - DECLARE_HASHTABLE (qdisc_hash, 4); -#endif unsigned int tx_queue_len; spinlock_t tx_global_lock; - int watchdog_timeo; + + struct xdp_dev_bulk_queue __percpu *xdp_bulkq; #ifdef CONFIG_XPS struct xps_dev_maps __rcu *xps_cpus_map; @@ -2001,11 +2000,15 @@ struct net_device { struct mini_Qdisc __rcu *miniq_egress; #endif +#ifdef CONFIG_NET_SCHED + DECLARE_HASHTABLE (qdisc_hash, 4); +#endif /* These may be needed for future network-power-down code. */ struct timer_list watchdog_timer; + int watchdog_timeo; - int __percpu *pcpu_refcnt; struct list_head todo_list; + int __percpu *pcpu_refcnt; struct list_head link_watch_list; diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index a7378bcd9928..72bad13d4a3c 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -278,7 +278,7 @@ TRACE_EVENT(xdp_devmap_xmit, ), TP_fast_assign( - __entry->map_id = map->id; + __entry->map_id = map ? map->id : 0; __entry->act = XDP_REDIRECT; __entry->map_index = map_index; __entry->drops = drops; diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index da9c832fc5c8..030d125c3839 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -53,13 +53,11 @@ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) #define DEV_MAP_BULK_SIZE 16 -struct bpf_dtab_netdev; - -struct xdp_bulk_queue { +struct xdp_dev_bulk_queue { struct xdp_frame *q[DEV_MAP_BULK_SIZE]; struct list_head flush_node; + struct net_device *dev; struct net_device *dev_rx; - struct bpf_dtab_netdev *obj; unsigned int count; }; @@ -67,9 +65,8 @@ struct bpf_dtab_netdev { struct net_device *dev; /* must be first member, due to tracepoint */ struct hlist_node index_hlist; struct bpf_dtab *dtab; - struct xdp_bulk_queue __percpu *bulkq; struct rcu_head rcu; - unsigned int idx; /* keep track of map index for tracepoint */ + unsigned int idx; }; struct bpf_dtab { @@ -219,7 +216,6 @@ static void dev_map_free(struct bpf_map *map) hlist_for_each_entry_safe(dev, next, head, index_hlist) { hlist_del_rcu(&dev->index_hlist); - free_percpu(dev->bulkq); dev_put(dev->dev); kfree(dev); } @@ -234,7 +230,6 @@ static void dev_map_free(struct bpf_map *map) if (!dev) continue; - free_percpu(dev->bulkq); dev_put(dev->dev); kfree(dev); } @@ -320,10 +315,9 @@ static int dev_map_hash_get_next_key(struct bpf_map *map, void *key, return -ENOENT; } -static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags) +static int bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags) { - struct bpf_dtab_netdev *obj = bq->obj; - struct net_device *dev = obj->dev; + struct net_device *dev = bq->dev; int sent = 0, drops = 0, err = 0; int i; @@ -346,8 +340,7 @@ static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags) out: bq->count = 0; - trace_xdp_devmap_xmit(&obj->dtab->map, obj->idx, - sent, drops, bq->dev_rx, dev, err); + trace_xdp_devmap_xmit(NULL, 0, sent, drops, bq->dev_rx, dev, err); bq->dev_rx = NULL; __list_del_clearprev(&bq->flush_node); return 0; @@ -374,7 +367,7 @@ error: void __dev_map_flush(void) { struct list_head *flush_list = this_cpu_ptr(&dev_map_flush_list); - struct xdp_bulk_queue *bq, *tmp; + struct xdp_dev_bulk_queue *bq, *tmp; rcu_read_lock(); list_for_each_entry_safe(bq, tmp, flush_list, flush_node) @@ -401,12 +394,12 @@ struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key) /* Runs under RCU-read-side, plus in softirq under NAPI protection. * Thus, safe percpu variable access. */ -static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf, +static int bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf, struct net_device *dev_rx) { struct list_head *flush_list = this_cpu_ptr(&dev_map_flush_list); - struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq); + struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq); if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) bq_xmit_all(bq, 0); @@ -444,7 +437,7 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, if (unlikely(!xdpf)) return -EOVERFLOW; - return bq_enqueue(dst, xdpf, dev_rx); + return bq_enqueue(dev, xdpf, dev_rx); } int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, @@ -483,7 +476,6 @@ static void __dev_map_entry_free(struct rcu_head *rcu) struct bpf_dtab_netdev *dev; dev = container_of(rcu, struct bpf_dtab_netdev, rcu); - free_percpu(dev->bulkq); dev_put(dev->dev); kfree(dev); } @@ -538,30 +530,15 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, u32 ifindex, unsigned int idx) { - gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; struct bpf_dtab_netdev *dev; - struct xdp_bulk_queue *bq; - int cpu; - dev = kmalloc_node(sizeof(*dev), gfp, dtab->map.numa_node); + dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN, + dtab->map.numa_node); if (!dev) return ERR_PTR(-ENOMEM); - dev->bulkq = __alloc_percpu_gfp(sizeof(*dev->bulkq), - sizeof(void *), gfp); - if (!dev->bulkq) { - kfree(dev); - return ERR_PTR(-ENOMEM); - } - - for_each_possible_cpu(cpu) { - bq = per_cpu_ptr(dev->bulkq, cpu); - bq->obj = dev; - } - dev->dev = dev_get_by_index(net, ifindex); if (!dev->dev) { - free_percpu(dev->bulkq); kfree(dev); return ERR_PTR(-EINVAL); } @@ -721,9 +698,23 @@ static int dev_map_notification(struct notifier_block *notifier, { struct net_device *netdev = netdev_notifier_info_to_dev(ptr); struct bpf_dtab *dtab; - int i; + int i, cpu; switch (event) { + case NETDEV_REGISTER: + if (!netdev->netdev_ops->ndo_xdp_xmit || netdev->xdp_bulkq) + break; + + /* will be freed in free_netdev() */ + netdev->xdp_bulkq = + __alloc_percpu_gfp(sizeof(struct xdp_dev_bulk_queue), + sizeof(void *), GFP_ATOMIC); + if (!netdev->xdp_bulkq) + return NOTIFY_BAD; + + for_each_possible_cpu(cpu) + per_cpu_ptr(netdev->xdp_bulkq, cpu)->dev = netdev; + break; case NETDEV_UNREGISTER: /* This rcu_read_lock/unlock pair is needed because * dev_map_list is an RCU list AND to ensure a delete diff --git a/net/core/dev.c b/net/core/dev.c index d99f88c58636..e7802a41ae7f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -9847,6 +9847,8 @@ void free_netdev(struct net_device *dev) free_percpu(dev->pcpu_refcnt); dev->pcpu_refcnt = NULL; + free_percpu(dev->xdp_bulkq); + dev->xdp_bulkq = NULL; netdev_unregister_lockdep_key(dev); -- cgit v1.2.3 From 1d233886dd904edbf239eeffe435c3308ae97625 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Thu, 16 Jan 2020 16:14:45 +0100 Subject: xdp: Use bulking for non-map XDP_REDIRECT and consolidate code paths MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since the bulk queue used by XDP_REDIRECT now lives in struct net_device, we can re-use the bulking for the non-map version of the bpf_redirect() helper. This is a simple matter of having xdp_do_redirect_slow() queue the frame on the bulk queue instead of sending it out with __bpf_tx_xdp(). Unfortunately we can't make the bpf_redirect() helper return an error if the ifindex doesn't exit (as bpf_redirect_map() does), because we don't have a reference to the network namespace of the ingress device at the time the helper is called. So we have to leave it as-is and keep the device lookup in xdp_do_redirect_slow(). Since this leaves less reason to have the non-map redirect code in a separate function, so we get rid of the xdp_do_redirect_slow() function entirely. This does lose us the tracepoint disambiguation, but fortunately the xdp_redirect and xdp_redirect_map tracepoints use the same tracepoint entry structures. This means both can contain a map index, so we can just amend the tracepoint definitions so we always emit the xdp_redirect(_err) tracepoints, but with the map ID only populated if a map is present. This means we retire the xdp_redirect_map(_err) tracepoints entirely, but keep the definitions around in case someone is still listening for them. With this change, the performance of the xdp_redirect sample program goes from 5Mpps to 8.4Mpps (a 68% increase). Since the flush functions are no longer map-specific, rename the flush() functions to drop _map from their names. One of the renamed functions is the xdp_do_flush_map() callback used in all the xdp-enabled drivers. To keep from having to update all drivers, use a #define to keep the old name working, and only update the virtual drivers in this patch. Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/157918768505.1458396.17518057312953572912.stgit@toke.dk --- drivers/net/tun.c | 4 +- drivers/net/veth.c | 2 +- drivers/net/virtio_net.c | 2 +- include/linux/bpf.h | 13 +++++- include/linux/filter.h | 10 ++++- include/trace/events/xdp.h | 101 ++++++++++++++++++++------------------------- kernel/bpf/devmap.c | 32 +++++++++----- net/core/filter.c | 90 +++++++++------------------------------- 8 files changed, 108 insertions(+), 146 deletions(-) (limited to 'include') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 683d371e6e82..3a5a6c655dda 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1718,7 +1718,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun, if (err < 0) goto err_xdp; if (err == XDP_REDIRECT) - xdp_do_flush_map(); + xdp_do_flush(); if (err != XDP_PASS) goto out; @@ -2549,7 +2549,7 @@ static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) } if (flush) - xdp_do_flush_map(); + xdp_do_flush(); rcu_read_unlock(); local_bh_enable(); diff --git a/drivers/net/veth.c b/drivers/net/veth.c index a552df37a347..1c89017beebb 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -769,7 +769,7 @@ static int veth_poll(struct napi_struct *napi, int budget) if (xdp_xmit & VETH_XDP_TX) veth_xdp_flush(rq->dev, &bq); if (xdp_xmit & VETH_XDP_REDIR) - xdp_do_flush_map(); + xdp_do_flush(); xdp_clear_return_frame_no_direct(); return done; diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 4d7d5434cc5d..c458cd313281 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -1432,7 +1432,7 @@ static int virtnet_poll(struct napi_struct *napi, int budget) virtqueue_napi_complete(napi, rq->vq, received); if (xdp_xmit & VIRTIO_XDP_REDIR) - xdp_do_flush_map(); + xdp_do_flush(); if (xdp_xmit & VIRTIO_XDP_TX) { sq = virtnet_xdp_sq(vi); diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 3517e32149a4..8e3b8f4ad183 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1056,7 +1056,9 @@ struct sk_buff; struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key); struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key); -void __dev_map_flush(void); +void __dev_flush(void); +int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, + struct net_device *dev_rx); int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, struct net_device *dev_rx); int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, @@ -1169,13 +1171,20 @@ static inline struct net_device *__dev_map_hash_lookup_elem(struct bpf_map *map return NULL; } -static inline void __dev_map_flush(void) +static inline void __dev_flush(void) { } struct xdp_buff; struct bpf_dtab_netdev; +static inline +int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, + struct net_device *dev_rx) +{ + return 0; +} + static inline int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, struct net_device *dev_rx) diff --git a/include/linux/filter.h b/include/linux/filter.h index a366a0b64a57..f349e2c0884c 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -918,7 +918,7 @@ static inline int xdp_ok_fwd_dev(const struct net_device *fwd, return 0; } -/* The pair of xdp_do_redirect and xdp_do_flush_map MUST be called in the +/* The pair of xdp_do_redirect and xdp_do_flush MUST be called in the * same cpu context. Further for best results no more than a single map * for the do_redirect/do_flush pair should be used. This limitation is * because we only track one map and force a flush when the map changes. @@ -929,7 +929,13 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *prog); -void xdp_do_flush_map(void); +void xdp_do_flush(void); + +/* The xdp_do_flush_map() helper has been renamed to drop the _map suffix, as + * it is no longer only flushing maps. Keep this define for compatibility + * until all drivers are updated - do not use xdp_do_flush_map() in new code! + */ +#define xdp_do_flush_map xdp_do_flush void bpf_warn_invalid_xdp_action(u32 act); diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index 72bad13d4a3c..b680973687b4 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -79,14 +79,26 @@ TRACE_EVENT(xdp_bulk_tx, __entry->sent, __entry->drops, __entry->err) ); +#ifndef __DEVMAP_OBJ_TYPE +#define __DEVMAP_OBJ_TYPE +struct _bpf_dtab_netdev { + struct net_device *dev; +}; +#endif /* __DEVMAP_OBJ_TYPE */ + +#define devmap_ifindex(tgt, map) \ + (((map->map_type == BPF_MAP_TYPE_DEVMAP || \ + map->map_type == BPF_MAP_TYPE_DEVMAP_HASH)) ? \ + ((struct _bpf_dtab_netdev *)tgt)->dev->ifindex : 0) + DECLARE_EVENT_CLASS(xdp_redirect_template, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, - int to_ifindex, int err, - const struct bpf_map *map, u32 map_index), + const void *tgt, int err, + const struct bpf_map *map, u32 index), - TP_ARGS(dev, xdp, to_ifindex, err, map, map_index), + TP_ARGS(dev, xdp, tgt, err, map, index), TP_STRUCT__entry( __field(int, prog_id) @@ -103,90 +115,65 @@ DECLARE_EVENT_CLASS(xdp_redirect_template, __entry->act = XDP_REDIRECT; __entry->ifindex = dev->ifindex; __entry->err = err; - __entry->to_ifindex = to_ifindex; + __entry->to_ifindex = map ? devmap_ifindex(tgt, map) : + index; __entry->map_id = map ? map->id : 0; - __entry->map_index = map_index; + __entry->map_index = map ? index : 0; ), - TP_printk("prog_id=%d action=%s ifindex=%d to_ifindex=%d err=%d", + TP_printk("prog_id=%d action=%s ifindex=%d to_ifindex=%d err=%d" + " map_id=%d map_index=%d", __entry->prog_id, __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), __entry->ifindex, __entry->to_ifindex, - __entry->err) + __entry->err, __entry->map_id, __entry->map_index) ); DEFINE_EVENT(xdp_redirect_template, xdp_redirect, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, - int to_ifindex, int err, - const struct bpf_map *map, u32 map_index), - TP_ARGS(dev, xdp, to_ifindex, err, map, map_index) + const void *tgt, int err, + const struct bpf_map *map, u32 index), + TP_ARGS(dev, xdp, tgt, err, map, index) ); DEFINE_EVENT(xdp_redirect_template, xdp_redirect_err, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, - int to_ifindex, int err, - const struct bpf_map *map, u32 map_index), - TP_ARGS(dev, xdp, to_ifindex, err, map, map_index) + const void *tgt, int err, + const struct bpf_map *map, u32 index), + TP_ARGS(dev, xdp, tgt, err, map, index) ); #define _trace_xdp_redirect(dev, xdp, to) \ - trace_xdp_redirect(dev, xdp, to, 0, NULL, 0); + trace_xdp_redirect(dev, xdp, NULL, 0, NULL, to); #define _trace_xdp_redirect_err(dev, xdp, to, err) \ - trace_xdp_redirect_err(dev, xdp, to, err, NULL, 0); + trace_xdp_redirect_err(dev, xdp, NULL, err, NULL, to); + +#define _trace_xdp_redirect_map(dev, xdp, to, map, index) \ + trace_xdp_redirect(dev, xdp, to, 0, map, index); + +#define _trace_xdp_redirect_map_err(dev, xdp, to, map, index, err) \ + trace_xdp_redirect_err(dev, xdp, to, err, map, index); -DEFINE_EVENT_PRINT(xdp_redirect_template, xdp_redirect_map, +/* not used anymore, but kept around so as not to break old programs */ +DEFINE_EVENT(xdp_redirect_template, xdp_redirect_map, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, - int to_ifindex, int err, - const struct bpf_map *map, u32 map_index), - TP_ARGS(dev, xdp, to_ifindex, err, map, map_index), - TP_printk("prog_id=%d action=%s ifindex=%d to_ifindex=%d err=%d" - " map_id=%d map_index=%d", - __entry->prog_id, - __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), - __entry->ifindex, __entry->to_ifindex, - __entry->err, - __entry->map_id, __entry->map_index) + const void *tgt, int err, + const struct bpf_map *map, u32 index), + TP_ARGS(dev, xdp, tgt, err, map, index) ); -DEFINE_EVENT_PRINT(xdp_redirect_template, xdp_redirect_map_err, +DEFINE_EVENT(xdp_redirect_template, xdp_redirect_map_err, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, - int to_ifindex, int err, - const struct bpf_map *map, u32 map_index), - TP_ARGS(dev, xdp, to_ifindex, err, map, map_index), - TP_printk("prog_id=%d action=%s ifindex=%d to_ifindex=%d err=%d" - " map_id=%d map_index=%d", - __entry->prog_id, - __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), - __entry->ifindex, __entry->to_ifindex, - __entry->err, - __entry->map_id, __entry->map_index) + const void *tgt, int err, + const struct bpf_map *map, u32 index), + TP_ARGS(dev, xdp, tgt, err, map, index) ); -#ifndef __DEVMAP_OBJ_TYPE -#define __DEVMAP_OBJ_TYPE -struct _bpf_dtab_netdev { - struct net_device *dev; -}; -#endif /* __DEVMAP_OBJ_TYPE */ - -#define devmap_ifindex(fwd, map) \ - ((map->map_type == BPF_MAP_TYPE_DEVMAP || \ - map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) ? \ - ((struct _bpf_dtab_netdev *)fwd)->dev->ifindex : 0) - -#define _trace_xdp_redirect_map(dev, xdp, fwd, map, idx) \ - trace_xdp_redirect_map(dev, xdp, devmap_ifindex(fwd, map), \ - 0, map, idx) - -#define _trace_xdp_redirect_map_err(dev, xdp, fwd, map, idx, err) \ - trace_xdp_redirect_map_err(dev, xdp, devmap_ifindex(fwd, map), \ - err, map, idx) - TRACE_EVENT(xdp_cpumap_kthread, TP_PROTO(int map_id, unsigned int processed, unsigned int drops, diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 030d125c3839..d5311009953f 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -81,7 +81,7 @@ struct bpf_dtab { u32 n_buckets; }; -static DEFINE_PER_CPU(struct list_head, dev_map_flush_list); +static DEFINE_PER_CPU(struct list_head, dev_flush_list); static DEFINE_SPINLOCK(dev_map_lock); static LIST_HEAD(dev_map_list); @@ -357,16 +357,16 @@ error: goto out; } -/* __dev_map_flush is called from xdp_do_flush_map() which _must_ be signaled +/* __dev_flush is called from xdp_do_flush() which _must_ be signaled * from the driver before returning from its napi->poll() routine. The poll() * routine is called either from busy_poll context or net_rx_action signaled * from NET_RX_SOFTIRQ. Either way the poll routine must complete before the * net device can be torn down. On devmap tear down we ensure the flush list * is empty before completing to ensure all flush operations have completed. */ -void __dev_map_flush(void) +void __dev_flush(void) { - struct list_head *flush_list = this_cpu_ptr(&dev_map_flush_list); + struct list_head *flush_list = this_cpu_ptr(&dev_flush_list); struct xdp_dev_bulk_queue *bq, *tmp; rcu_read_lock(); @@ -396,9 +396,8 @@ struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key) */ static int bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf, struct net_device *dev_rx) - { - struct list_head *flush_list = this_cpu_ptr(&dev_map_flush_list); + struct list_head *flush_list = this_cpu_ptr(&dev_flush_list); struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq); if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) @@ -419,10 +418,9 @@ static int bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf, return 0; } -int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, - struct net_device *dev_rx) +static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, + struct net_device *dev_rx) { - struct net_device *dev = dst->dev; struct xdp_frame *xdpf; int err; @@ -440,6 +438,20 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, return bq_enqueue(dev, xdpf, dev_rx); } +int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, + struct net_device *dev_rx) +{ + return __xdp_enqueue(dev, xdp, dev_rx); +} + +int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, + struct net_device *dev_rx) +{ + struct net_device *dev = dst->dev; + + return __xdp_enqueue(dev, xdp, dev_rx); +} + int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, struct bpf_prog *xdp_prog) { @@ -762,7 +774,7 @@ static int __init dev_map_init(void) register_netdevice_notifier(&dev_map_notifier); for_each_possible_cpu(cpu) - INIT_LIST_HEAD(&per_cpu(dev_map_flush_list, cpu)); + INIT_LIST_HEAD(&per_cpu(dev_flush_list, cpu)); return 0; } diff --git a/net/core/filter.c b/net/core/filter.c index d6f58dc4e1c4..17de6747d9e3 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3458,58 +3458,6 @@ static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = { .arg2_type = ARG_ANYTHING, }; -static int __bpf_tx_xdp(struct net_device *dev, - struct bpf_map *map, - struct xdp_buff *xdp, - u32 index) -{ - struct xdp_frame *xdpf; - int err, sent; - - if (!dev->netdev_ops->ndo_xdp_xmit) { - return -EOPNOTSUPP; - } - - err = xdp_ok_fwd_dev(dev, xdp->data_end - xdp->data); - if (unlikely(err)) - return err; - - xdpf = convert_to_xdp_frame(xdp); - if (unlikely(!xdpf)) - return -EOVERFLOW; - - sent = dev->netdev_ops->ndo_xdp_xmit(dev, 1, &xdpf, XDP_XMIT_FLUSH); - if (sent <= 0) - return sent; - return 0; -} - -static noinline int -xdp_do_redirect_slow(struct net_device *dev, struct xdp_buff *xdp, - struct bpf_prog *xdp_prog, struct bpf_redirect_info *ri) -{ - struct net_device *fwd; - u32 index = ri->tgt_index; - int err; - - fwd = dev_get_by_index_rcu(dev_net(dev), index); - ri->tgt_index = 0; - if (unlikely(!fwd)) { - err = -EINVAL; - goto err; - } - - err = __bpf_tx_xdp(fwd, NULL, xdp, 0); - if (unlikely(err)) - goto err; - - _trace_xdp_redirect(dev, xdp_prog, index); - return 0; -err: - _trace_xdp_redirect_err(dev, xdp_prog, index, err); - return err; -} - static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, struct bpf_map *map, struct xdp_buff *xdp) { @@ -3527,13 +3475,13 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, return 0; } -void xdp_do_flush_map(void) +void xdp_do_flush(void) { - __dev_map_flush(); + __dev_flush(); __cpu_map_flush(); __xsk_map_flush(); } -EXPORT_SYMBOL_GPL(xdp_do_flush_map); +EXPORT_SYMBOL_GPL(xdp_do_flush); static inline void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index) { @@ -3568,10 +3516,11 @@ void bpf_clear_redirect_map(struct bpf_map *map) } } -static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, - struct bpf_prog *xdp_prog, struct bpf_map *map, - struct bpf_redirect_info *ri) +int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, + struct bpf_prog *xdp_prog) { + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + struct bpf_map *map = READ_ONCE(ri->map); u32 index = ri->tgt_index; void *fwd = ri->tgt_value; int err; @@ -3580,7 +3529,18 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, ri->tgt_value = NULL; WRITE_ONCE(ri->map, NULL); - err = __bpf_tx_xdp_map(dev, fwd, map, xdp); + if (unlikely(!map)) { + fwd = dev_get_by_index_rcu(dev_net(dev), index); + if (unlikely(!fwd)) { + err = -EINVAL; + goto err; + } + + err = dev_xdp_enqueue(fwd, xdp, dev); + } else { + err = __bpf_tx_xdp_map(dev, fwd, map, xdp); + } + if (unlikely(err)) goto err; @@ -3590,18 +3550,6 @@ err: _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err); return err; } - -int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, - struct bpf_prog *xdp_prog) -{ - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); - struct bpf_map *map = READ_ONCE(ri->map); - - if (likely(map)) - return xdp_do_redirect_map(dev, xdp, xdp_prog, map, ri); - - return xdp_do_redirect_slow(dev, xdp, xdp_prog, ri); -} EXPORT_SYMBOL_GPL(xdp_do_redirect); static int xdp_do_generic_redirect_map(struct net_device *dev, -- cgit v1.2.3 From 58aa94f922c1b44e0919d1814d2eab5b9e8bf945 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 16 Jan 2020 16:14:46 +0100 Subject: devmap: Adjust tracepoint for map-less queue flush MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that we don't have a reference to a devmap when flushing the device bulk queue, let's change the the devmap_xmit tracepoint to remote the map_id and map_index fields entirely. Rearrange the fields so 'drops' and 'sent' stay in the same position in the tracepoint struct, to make it possible for the xdp_monitor utility to read both the old and the new format. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/157918768613.1458396.9165902403373826572.stgit@toke.dk --- include/trace/events/xdp.h | 29 ++++++++++++----------------- kernel/bpf/devmap.c | 2 +- samples/bpf/xdp_monitor_kern.c | 8 +++----- 3 files changed, 16 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index b680973687b4..b95d65e8c628 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -246,43 +246,38 @@ TRACE_EVENT(xdp_cpumap_enqueue, TRACE_EVENT(xdp_devmap_xmit, - TP_PROTO(const struct bpf_map *map, u32 map_index, - int sent, int drops, - const struct net_device *from_dev, - const struct net_device *to_dev, int err), + TP_PROTO(const struct net_device *from_dev, + const struct net_device *to_dev, + int sent, int drops, int err), - TP_ARGS(map, map_index, sent, drops, from_dev, to_dev, err), + TP_ARGS(from_dev, to_dev, sent, drops, err), TP_STRUCT__entry( - __field(int, map_id) + __field(int, from_ifindex) __field(u32, act) - __field(u32, map_index) + __field(int, to_ifindex) __field(int, drops) __field(int, sent) - __field(int, from_ifindex) - __field(int, to_ifindex) __field(int, err) ), TP_fast_assign( - __entry->map_id = map ? map->id : 0; + __entry->from_ifindex = from_dev->ifindex; __entry->act = XDP_REDIRECT; - __entry->map_index = map_index; + __entry->to_ifindex = to_dev->ifindex; __entry->drops = drops; __entry->sent = sent; - __entry->from_ifindex = from_dev->ifindex; - __entry->to_ifindex = to_dev->ifindex; __entry->err = err; ), TP_printk("ndo_xdp_xmit" - " map_id=%d map_index=%d action=%s" + " from_ifindex=%d to_ifindex=%d action=%s" " sent=%d drops=%d" - " from_ifindex=%d to_ifindex=%d err=%d", - __entry->map_id, __entry->map_index, + " err=%d", + __entry->from_ifindex, __entry->to_ifindex, __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), __entry->sent, __entry->drops, - __entry->from_ifindex, __entry->to_ifindex, __entry->err) + __entry->err) ); /* Expect users already include , but not xdp_priv.h */ diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index d5311009953f..de630f980282 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -340,7 +340,7 @@ static int bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags) out: bq->count = 0; - trace_xdp_devmap_xmit(NULL, 0, sent, drops, bq->dev_rx, dev, err); + trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, drops, err); bq->dev_rx = NULL; __list_del_clearprev(&bq->flush_node); return 0; diff --git a/samples/bpf/xdp_monitor_kern.c b/samples/bpf/xdp_monitor_kern.c index ad10fe700d7d..39458a44472e 100644 --- a/samples/bpf/xdp_monitor_kern.c +++ b/samples/bpf/xdp_monitor_kern.c @@ -222,14 +222,12 @@ struct bpf_map_def SEC("maps") devmap_xmit_cnt = { */ struct devmap_xmit_ctx { u64 __pad; // First 8 bytes are not accessible by bpf code - int map_id; // offset:8; size:4; signed:1; + int from_ifindex; // offset:8; size:4; signed:1; u32 act; // offset:12; size:4; signed:0; - u32 map_index; // offset:16; size:4; signed:0; + int to_ifindex; // offset:16; size:4; signed:1; int drops; // offset:20; size:4; signed:1; int sent; // offset:24; size:4; signed:1; - int from_ifindex; // offset:28; size:4; signed:1; - int to_ifindex; // offset:32; size:4; signed:1; - int err; // offset:36; size:4; signed:1; + int err; // offset:28; size:4; signed:1; }; SEC("tracepoint/xdp/xdp_devmap_xmit") -- cgit v1.2.3 From 43a825afc91e2b06af1e8e7422198e759c2c5e20 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Mon, 20 Jan 2020 10:29:17 +0100 Subject: xsk, net: Make sock_def_readable() have external linkage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit XDP sockets use the default implementation of struct sock's sk_data_ready callback, which is sock_def_readable(). This function is called in the XDP socket fast-path, and involves a retpoline. By letting sock_def_readable() have external linkage, and being called directly, the retpoline can be avoided. Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200120092917.13949-1-bjorn.topel@gmail.com --- include/net/sock.h | 2 ++ net/core/sock.c | 2 +- net/xdp/xsk.c | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 8dff68b4c316..0891c55f1e82 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2612,4 +2612,6 @@ static inline bool sk_dev_equal_l3scope(struct sock *sk, int dif) return false; } +void sock_def_readable(struct sock *sk); + #endif /* _SOCK_H */ diff --git a/net/core/sock.c b/net/core/sock.c index 8459ad579f73..a4c8fac781ff 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2786,7 +2786,7 @@ static void sock_def_error_report(struct sock *sk) rcu_read_unlock(); } -static void sock_def_readable(struct sock *sk) +void sock_def_readable(struct sock *sk) { struct socket_wq *wq; diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 02ada7ab8c6e..df600487a68d 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -217,7 +217,7 @@ static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) static void xsk_flush(struct xdp_sock *xs) { xskq_prod_submit(xs->rx); - xs->sk.sk_data_ready(&xs->sk); + sock_def_readable(&xs->sk); } int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) -- cgit v1.2.3 From be8704ff07d2374bcc5c675526f95e70c6459683 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 20 Jan 2020 16:53:46 -0800 Subject: bpf: Introduce dynamic program extensions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce dynamic program extensions. The users can load additional BPF functions and replace global functions in previously loaded BPF programs while these programs are executing. Global functions are verified individually by the verifier based on their types only. Hence the global function in the new program which types match older function can safely replace that corresponding function. This new function/program is called 'an extension' of old program. At load time the verifier uses (attach_prog_fd, attach_btf_id) pair to identify the function to be replaced. The BPF program type is derived from the target program into extension program. Technically bpf_verifier_ops is copied from target program. The BPF_PROG_TYPE_EXT program type is a placeholder. It has empty verifier_ops. The extension program can call the same bpf helper functions as target program. Single BPF_PROG_TYPE_EXT type is used to extend XDP, SKB and all other program types. The verifier allows only one level of replacement. Meaning that the extension program cannot recursively extend an extension. That also means that the maximum stack size is increasing from 512 to 1024 bytes and maximum function nesting level from 8 to 16. The programs don't always consume that much. The stack usage is determined by the number of on-stack variables used by the program. The verifier could have enforced 512 limit for combined original plus extension program, but it makes for difficult user experience. The main use case for extensions is to provide generic mechanism to plug external programs into policy program or function call chaining. BPF trampoline is used to track both fentry/fexit and program extensions because both are using the same nop slot at the beginning of every BPF function. Attaching fentry/fexit to a function that was replaced is not allowed. The opposite is true as well. Replacing a function that currently being analyzed with fentry/fexit is not allowed. The executable page allocated by BPF trampoline is not used by program extensions. This inefficiency will be optimized in future patches. Function by function verification of global function supports scalars and pointer to context only. Hence program extensions are supported for such class of global functions only. In the future the verifier will be extended with support to pointers to structures, arrays with sizes, etc. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Andrii Nakryiko Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20200121005348.2769920-2-ast@kernel.org --- include/linux/bpf.h | 10 ++- include/linux/bpf_types.h | 2 + include/linux/btf.h | 5 ++ include/uapi/linux/bpf.h | 1 + kernel/bpf/btf.c | 152 +++++++++++++++++++++++++++++++++++++++++++++- kernel/bpf/syscall.c | 15 ++++- kernel/bpf/trampoline.c | 41 ++++++++++++- kernel/bpf/verifier.c | 85 ++++++++++++++++++++------ 8 files changed, 283 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 8e3b8f4ad183..05d16615054c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -465,7 +465,8 @@ void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start); enum bpf_tramp_prog_type { BPF_TRAMP_FENTRY, BPF_TRAMP_FEXIT, - BPF_TRAMP_MAX + BPF_TRAMP_MAX, + BPF_TRAMP_REPLACE, /* more than MAX */ }; struct bpf_trampoline { @@ -480,6 +481,11 @@ struct bpf_trampoline { void *addr; bool ftrace_managed; } func; + /* if !NULL this is BPF_PROG_TYPE_EXT program that extends another BPF + * program by replacing one of its functions. func.addr is the address + * of the function it replaced. + */ + struct bpf_prog *extension_prog; /* list of BPF programs using this trampoline */ struct hlist_head progs_hlist[BPF_TRAMP_MAX]; /* Number of attached programs. A counter per kind. */ @@ -1107,6 +1113,8 @@ int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, struct bpf_reg_state *regs); int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, struct bpf_reg_state *reg); +int btf_check_type_match(struct bpf_verifier_env *env, struct bpf_prog *prog, + struct btf *btf, const struct btf_type *t); struct bpf_prog *bpf_prog_by_id(u32 id); diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 9f326e6ef885..c81d4ece79a4 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -68,6 +68,8 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_SK_REUSEPORT, sk_reuseport, #if defined(CONFIG_BPF_JIT) BPF_PROG_TYPE(BPF_PROG_TYPE_STRUCT_OPS, bpf_struct_ops, void *, void *) +BPF_PROG_TYPE(BPF_PROG_TYPE_EXT, bpf_extension, + void *, void *) #endif BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops) diff --git a/include/linux/btf.h b/include/linux/btf.h index 881e9b76ef49..5c1ea99b480f 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -107,6 +107,11 @@ static inline u16 btf_type_vlen(const struct btf_type *t) return BTF_INFO_VLEN(t->info); } +static inline u16 btf_func_linkage(const struct btf_type *t) +{ + return BTF_INFO_VLEN(t->info); +} + static inline bool btf_type_kflag(const struct btf_type *t) { return BTF_INFO_KFLAG(t->info); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 033d90a2282d..e81628eb059c 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -180,6 +180,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_CGROUP_SOCKOPT, BPF_PROG_TYPE_TRACING, BPF_PROG_TYPE_STRUCT_OPS, + BPF_PROG_TYPE_EXT, }; enum bpf_attach_type { diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 832b5d7fd892..32963b6d5a9c 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -276,6 +276,11 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = { [BTF_KIND_DATASEC] = "DATASEC", }; +static const char *btf_type_str(const struct btf_type *t) +{ + return btf_kind_str[BTF_INFO_KIND(t->info)]; +} + struct btf_kind_operations { s32 (*check_meta)(struct btf_verifier_env *env, const struct btf_type *t, @@ -4115,6 +4120,148 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, return 0; } +/* Compare BTFs of two functions assuming only scalars and pointers to context. + * t1 points to BTF_KIND_FUNC in btf1 + * t2 points to BTF_KIND_FUNC in btf2 + * Returns: + * EINVAL - function prototype mismatch + * EFAULT - verifier bug + * 0 - 99% match. The last 1% is validated by the verifier. + */ +int btf_check_func_type_match(struct bpf_verifier_log *log, + struct btf *btf1, const struct btf_type *t1, + struct btf *btf2, const struct btf_type *t2) +{ + const struct btf_param *args1, *args2; + const char *fn1, *fn2, *s1, *s2; + u32 nargs1, nargs2, i; + + fn1 = btf_name_by_offset(btf1, t1->name_off); + fn2 = btf_name_by_offset(btf2, t2->name_off); + + if (btf_func_linkage(t1) != BTF_FUNC_GLOBAL) { + bpf_log(log, "%s() is not a global function\n", fn1); + return -EINVAL; + } + if (btf_func_linkage(t2) != BTF_FUNC_GLOBAL) { + bpf_log(log, "%s() is not a global function\n", fn2); + return -EINVAL; + } + + t1 = btf_type_by_id(btf1, t1->type); + if (!t1 || !btf_type_is_func_proto(t1)) + return -EFAULT; + t2 = btf_type_by_id(btf2, t2->type); + if (!t2 || !btf_type_is_func_proto(t2)) + return -EFAULT; + + args1 = (const struct btf_param *)(t1 + 1); + nargs1 = btf_type_vlen(t1); + args2 = (const struct btf_param *)(t2 + 1); + nargs2 = btf_type_vlen(t2); + + if (nargs1 != nargs2) { + bpf_log(log, "%s() has %d args while %s() has %d args\n", + fn1, nargs1, fn2, nargs2); + return -EINVAL; + } + + t1 = btf_type_skip_modifiers(btf1, t1->type, NULL); + t2 = btf_type_skip_modifiers(btf2, t2->type, NULL); + if (t1->info != t2->info) { + bpf_log(log, + "Return type %s of %s() doesn't match type %s of %s()\n", + btf_type_str(t1), fn1, + btf_type_str(t2), fn2); + return -EINVAL; + } + + for (i = 0; i < nargs1; i++) { + t1 = btf_type_skip_modifiers(btf1, args1[i].type, NULL); + t2 = btf_type_skip_modifiers(btf2, args2[i].type, NULL); + + if (t1->info != t2->info) { + bpf_log(log, "arg%d in %s() is %s while %s() has %s\n", + i, fn1, btf_type_str(t1), + fn2, btf_type_str(t2)); + return -EINVAL; + } + if (btf_type_has_size(t1) && t1->size != t2->size) { + bpf_log(log, + "arg%d in %s() has size %d while %s() has %d\n", + i, fn1, t1->size, + fn2, t2->size); + return -EINVAL; + } + + /* global functions are validated with scalars and pointers + * to context only. And only global functions can be replaced. + * Hence type check only those types. + */ + if (btf_type_is_int(t1) || btf_type_is_enum(t1)) + continue; + if (!btf_type_is_ptr(t1)) { + bpf_log(log, + "arg%d in %s() has unrecognized type\n", + i, fn1); + return -EINVAL; + } + t1 = btf_type_skip_modifiers(btf1, t1->type, NULL); + t2 = btf_type_skip_modifiers(btf2, t2->type, NULL); + if (!btf_type_is_struct(t1)) { + bpf_log(log, + "arg%d in %s() is not a pointer to context\n", + i, fn1); + return -EINVAL; + } + if (!btf_type_is_struct(t2)) { + bpf_log(log, + "arg%d in %s() is not a pointer to context\n", + i, fn2); + return -EINVAL; + } + /* This is an optional check to make program writing easier. + * Compare names of structs and report an error to the user. + * btf_prepare_func_args() already checked that t2 struct + * is a context type. btf_prepare_func_args() will check + * later that t1 struct is a context type as well. + */ + s1 = btf_name_by_offset(btf1, t1->name_off); + s2 = btf_name_by_offset(btf2, t2->name_off); + if (strcmp(s1, s2)) { + bpf_log(log, + "arg%d %s(struct %s *) doesn't match %s(struct %s *)\n", + i, fn1, s1, fn2, s2); + return -EINVAL; + } + } + return 0; +} + +/* Compare BTFs of given program with BTF of target program */ +int btf_check_type_match(struct bpf_verifier_env *env, struct bpf_prog *prog, + struct btf *btf2, const struct btf_type *t2) +{ + struct btf *btf1 = prog->aux->btf; + const struct btf_type *t1; + u32 btf_id = 0; + + if (!prog->aux->func_info) { + bpf_log(&env->log, "Program extension requires BTF\n"); + return -EINVAL; + } + + btf_id = prog->aux->func_info[0].type_id; + if (!btf_id) + return -EFAULT; + + t1 = btf_type_by_id(btf1, btf_id); + if (!t1 || !btf_type_is_func(t1)) + return -EFAULT; + + return btf_check_func_type_match(&env->log, btf1, t1, btf2, t2); +} + /* Compare BTF of a function with given bpf_reg_state. * Returns: * EFAULT - there is a verifier bug. Abort verification. @@ -4224,6 +4371,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, { struct bpf_verifier_log *log = &env->log; struct bpf_prog *prog = env->prog; + enum bpf_prog_type prog_type = prog->type; struct btf *btf = prog->aux->btf; const struct btf_param *args; const struct btf_type *t; @@ -4261,6 +4409,8 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, bpf_log(log, "Verifier bug in function %s()\n", tname); return -EFAULT; } + if (prog_type == BPF_PROG_TYPE_EXT) + prog_type = prog->aux->linked_prog->type; t = btf_type_by_id(btf, t->type); if (!t || !btf_type_is_func_proto(t)) { @@ -4296,7 +4446,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, continue; } if (btf_type_is_ptr(t) && - btf_get_prog_ctx_type(log, btf, t, prog->type, i)) { + btf_get_prog_ctx_type(log, btf, t, prog_type, i)) { reg[i + 1].type = PTR_TO_CTX; continue; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 9a840c57f6df..a91ad518c050 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1932,13 +1932,15 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, switch (prog_type) { case BPF_PROG_TYPE_TRACING: case BPF_PROG_TYPE_STRUCT_OPS: + case BPF_PROG_TYPE_EXT: break; default: return -EINVAL; } } - if (prog_fd && prog_type != BPF_PROG_TYPE_TRACING) + if (prog_fd && prog_type != BPF_PROG_TYPE_TRACING && + prog_type != BPF_PROG_TYPE_EXT) return -EINVAL; switch (prog_type) { @@ -1981,6 +1983,10 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, default: return -EINVAL; } + case BPF_PROG_TYPE_EXT: + if (expected_attach_type) + return -EINVAL; + /* fallthrough */ default: return 0; } @@ -2183,7 +2189,8 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog) int tr_fd, err; if (prog->expected_attach_type != BPF_TRACE_FENTRY && - prog->expected_attach_type != BPF_TRACE_FEXIT) { + prog->expected_attach_type != BPF_TRACE_FEXIT && + prog->type != BPF_PROG_TYPE_EXT) { err = -EINVAL; goto out_put_prog; } @@ -2250,12 +2257,14 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) if (prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT && prog->type != BPF_PROG_TYPE_TRACING && + prog->type != BPF_PROG_TYPE_EXT && prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE) { err = -EINVAL; goto out_put_prog; } - if (prog->type == BPF_PROG_TYPE_TRACING) { + if (prog->type == BPF_PROG_TYPE_TRACING || + prog->type == BPF_PROG_TYPE_EXT) { if (attr->raw_tracepoint.name) { /* The attach point for this category of programs * should be specified via btf_id during program load. diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 7657ede7aee2..eb64c245052b 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -5,6 +5,12 @@ #include #include +/* dummy _ops. The verifier will operate on target program's ops. */ +const struct bpf_verifier_ops bpf_extension_verifier_ops = { +}; +const struct bpf_prog_ops bpf_extension_prog_ops = { +}; + /* btf_vmlinux has ~22k attachable functions. 1k htab is enough. */ #define TRAMPOLINE_HASH_BITS 10 #define TRAMPOLINE_TABLE_SIZE (1 << TRAMPOLINE_HASH_BITS) @@ -194,8 +200,10 @@ static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(enum bpf_attach_type t) switch (t) { case BPF_TRACE_FENTRY: return BPF_TRAMP_FENTRY; - default: + case BPF_TRACE_FEXIT: return BPF_TRAMP_FEXIT; + default: + return BPF_TRAMP_REPLACE; } } @@ -204,12 +212,31 @@ int bpf_trampoline_link_prog(struct bpf_prog *prog) enum bpf_tramp_prog_type kind; struct bpf_trampoline *tr; int err = 0; + int cnt; tr = prog->aux->trampoline; kind = bpf_attach_type_to_tramp(prog->expected_attach_type); mutex_lock(&tr->mutex); - if (tr->progs_cnt[BPF_TRAMP_FENTRY] + tr->progs_cnt[BPF_TRAMP_FEXIT] - >= BPF_MAX_TRAMP_PROGS) { + if (tr->extension_prog) { + /* cannot attach fentry/fexit if extension prog is attached. + * cannot overwrite extension prog either. + */ + err = -EBUSY; + goto out; + } + cnt = tr->progs_cnt[BPF_TRAMP_FENTRY] + tr->progs_cnt[BPF_TRAMP_FEXIT]; + if (kind == BPF_TRAMP_REPLACE) { + /* Cannot attach extension if fentry/fexit are in use. */ + if (cnt) { + err = -EBUSY; + goto out; + } + tr->extension_prog = prog; + err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, NULL, + prog->bpf_func); + goto out; + } + if (cnt >= BPF_MAX_TRAMP_PROGS) { err = -E2BIG; goto out; } @@ -240,9 +267,17 @@ int bpf_trampoline_unlink_prog(struct bpf_prog *prog) tr = prog->aux->trampoline; kind = bpf_attach_type_to_tramp(prog->expected_attach_type); mutex_lock(&tr->mutex); + if (kind == BPF_TRAMP_REPLACE) { + WARN_ON_ONCE(!tr->extension_prog); + err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, + tr->extension_prog->bpf_func, NULL); + tr->extension_prog = NULL; + goto out; + } hlist_del(&prog->aux->tramp_hlist); tr->progs_cnt[kind]--; err = bpf_trampoline_update(prog->aux->trampoline); +out: mutex_unlock(&tr->mutex); return err; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6defbec9eb62..9fe94f64f0ec 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9564,7 +9564,7 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) subprog); regs = state->frame[state->curframe]->regs; - if (subprog) { + if (subprog || env->prog->type == BPF_PROG_TYPE_EXT) { ret = btf_prepare_func_args(env, subprog, regs); if (ret) goto out; @@ -9742,6 +9742,7 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env) static int check_attach_btf_id(struct bpf_verifier_env *env) { struct bpf_prog *prog = env->prog; + bool prog_extension = prog->type == BPF_PROG_TYPE_EXT; struct bpf_prog *tgt_prog = prog->aux->linked_prog; u32 btf_id = prog->aux->attach_btf_id; const char prefix[] = "btf_trace_"; @@ -9757,7 +9758,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) if (prog->type == BPF_PROG_TYPE_STRUCT_OPS) return check_struct_ops_btf_id(env); - if (prog->type != BPF_PROG_TYPE_TRACING) + if (prog->type != BPF_PROG_TYPE_TRACING && !prog_extension) return 0; if (!btf_id) { @@ -9793,8 +9794,59 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) return -EINVAL; } conservative = aux->func_info_aux[subprog].unreliable; + if (prog_extension) { + if (conservative) { + verbose(env, + "Cannot replace static functions\n"); + return -EINVAL; + } + if (!prog->jit_requested) { + verbose(env, + "Extension programs should be JITed\n"); + return -EINVAL; + } + env->ops = bpf_verifier_ops[tgt_prog->type]; + } + if (!tgt_prog->jited) { + verbose(env, "Can attach to only JITed progs\n"); + return -EINVAL; + } + if (tgt_prog->type == prog->type) { + /* Cannot fentry/fexit another fentry/fexit program. + * Cannot attach program extension to another extension. + * It's ok to attach fentry/fexit to extension program. + */ + verbose(env, "Cannot recursively attach\n"); + return -EINVAL; + } + if (tgt_prog->type == BPF_PROG_TYPE_TRACING && + prog_extension && + (tgt_prog->expected_attach_type == BPF_TRACE_FENTRY || + tgt_prog->expected_attach_type == BPF_TRACE_FEXIT)) { + /* Program extensions can extend all program types + * except fentry/fexit. The reason is the following. + * The fentry/fexit programs are used for performance + * analysis, stats and can be attached to any program + * type except themselves. When extension program is + * replacing XDP function it is necessary to allow + * performance analysis of all functions. Both original + * XDP program and its program extension. Hence + * attaching fentry/fexit to BPF_PROG_TYPE_EXT is + * allowed. If extending of fentry/fexit was allowed it + * would be possible to create long call chain + * fentry->extension->fentry->extension beyond + * reasonable stack size. Hence extending fentry is not + * allowed. + */ + verbose(env, "Cannot extend fentry/fexit\n"); + return -EINVAL; + } key = ((u64)aux->id) << 32 | btf_id; } else { + if (prog_extension) { + verbose(env, "Cannot replace kernel functions\n"); + return -EINVAL; + } key = btf_id; } @@ -9832,6 +9884,10 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) prog->aux->attach_func_proto = t; prog->aux->attach_btf_trace = true; return 0; + default: + if (!prog_extension) + return -EINVAL; + /* fallthrough */ case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: if (!btf_type_is_func(t)) { @@ -9839,6 +9895,9 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) btf_id); return -EINVAL; } + if (prog_extension && + btf_check_type_match(env, prog, btf, t)) + return -EINVAL; t = btf_type_by_id(btf, t->type); if (!btf_type_is_func_proto(t)) return -EINVAL; @@ -9862,18 +9921,6 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) if (ret < 0) goto out; if (tgt_prog) { - if (!tgt_prog->jited) { - /* for now */ - verbose(env, "Can trace only JITed BPF progs\n"); - ret = -EINVAL; - goto out; - } - if (tgt_prog->type == BPF_PROG_TYPE_TRACING) { - /* prevent cycles */ - verbose(env, "Cannot recursively attach\n"); - ret = -EINVAL; - goto out; - } if (subprog == 0) addr = (long) tgt_prog->bpf_func; else @@ -9895,8 +9942,6 @@ out: if (ret) bpf_trampoline_put(tr); return ret; - default: - return -EINVAL; } } @@ -9966,10 +10011,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, goto skip_full_check; } - ret = check_attach_btf_id(env); - if (ret) - goto skip_full_check; - env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT); if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) env->strict_alignment = true; @@ -10006,6 +10047,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, if (ret < 0) goto skip_full_check; + ret = check_attach_btf_id(env); + if (ret) + goto skip_full_check; + ret = check_cfg(env); if (ret < 0) goto skip_full_check; -- cgit v1.2.3 From 5576b991e9c1a11d2cc21c4b94fc75ec27603896 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 22 Jan 2020 15:36:46 -0800 Subject: bpf: Add BPF_FUNC_jiffies64 This patch adds a helper to read the 64bit jiffies. It will be used in a later patch to implement the bpf_cubic.c. The helper is inlined for jit_requested and 64 BITS_PER_LONG as the map_gen_lookup(). Other cases could be considered together with map_gen_lookup() if needed. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200122233646.903260-1-kafai@fb.com --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 9 ++++++++- kernel/bpf/core.c | 1 + kernel/bpf/helpers.c | 12 ++++++++++++ kernel/bpf/verifier.c | 24 ++++++++++++++++++++++++ net/core/filter.c | 2 ++ 6 files changed, 48 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 05d16615054c..a9687861fd7e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1414,6 +1414,7 @@ extern const struct bpf_func_proto bpf_get_local_storage_proto; extern const struct bpf_func_proto bpf_strtol_proto; extern const struct bpf_func_proto bpf_strtoul_proto; extern const struct bpf_func_proto bpf_tcp_sock_proto; +extern const struct bpf_func_proto bpf_jiffies64_proto; /* Shared helpers among cBPF and eBPF. */ void bpf_user_rnd_init_once(void); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e81628eb059c..f1d74a2bd234 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2886,6 +2886,12 @@ union bpf_attr { * **-EPERM** if no permission to send the *sig*. * * **-EAGAIN** if bpf program can try again. + * + * u64 bpf_jiffies64(void) + * Description + * Obtain the 64bit jiffies + * Return + * The 64 bit jiffies */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3005,7 +3011,8 @@ union bpf_attr { FN(probe_read_user_str), \ FN(probe_read_kernel_str), \ FN(tcp_send_ack), \ - FN(send_signal_thread), + FN(send_signal_thread), \ + FN(jiffies64), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 29d47aae0dd1..973a20d49749 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2137,6 +2137,7 @@ const struct bpf_func_proto bpf_map_pop_elem_proto __weak; const struct bpf_func_proto bpf_map_peek_elem_proto __weak; const struct bpf_func_proto bpf_spin_lock_proto __weak; const struct bpf_func_proto bpf_spin_unlock_proto __weak; +const struct bpf_func_proto bpf_jiffies64_proto __weak; const struct bpf_func_proto bpf_get_prandom_u32_proto __weak; const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index cada974c9f4e..d8b7b110a1c5 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "../../lib/kstrtox.h" @@ -312,6 +313,17 @@ void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, preempt_enable(); } +BPF_CALL_0(bpf_jiffies64) +{ + return get_jiffies_64(); +} + +const struct bpf_func_proto bpf_jiffies64_proto = { + .func = bpf_jiffies64, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; + #ifdef CONFIG_CGROUPS BPF_CALL_0(bpf_get_current_cgroup_id) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9fe94f64f0ec..734abaa02123 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9445,6 +9445,30 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) goto patch_call_imm; } + if (prog->jit_requested && BITS_PER_LONG == 64 && + insn->imm == BPF_FUNC_jiffies64) { + struct bpf_insn ld_jiffies_addr[2] = { + BPF_LD_IMM64(BPF_REG_0, + (unsigned long)&jiffies), + }; + + insn_buf[0] = ld_jiffies_addr[0]; + insn_buf[1] = ld_jiffies_addr[1]; + insn_buf[2] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, + BPF_REG_0, 0); + cnt = 3; + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, + cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + continue; + } + patch_call_imm: fn = env->ops->get_func_proto(insn->imm, env->prog); /* all functions that have prototype and verifier allowed diff --git a/net/core/filter.c b/net/core/filter.c index 17de6747d9e3..4bf3e4aa8a7a 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5923,6 +5923,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_spin_unlock_proto; case BPF_FUNC_trace_printk: return bpf_get_trace_printk_proto(); + case BPF_FUNC_jiffies64: + return &bpf_jiffies64_proto; default: return NULL; } -- cgit v1.2.3