From 2a02759ef5f8a34792df22b41d5e10658fd7bbd3 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 15 Oct 2019 20:25:02 -0700 Subject: bpf: Add support for BTF pointers to interpreter Pointer to BTF object is a pointer to kernel object or NULL. The memory access in the interpreter has to be done via probe_kernel_read to avoid page faults. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191016032505.2089704-9-ast@kernel.org --- kernel/bpf/core.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'kernel/bpf/core.c') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 66088a9e9b9e..8a765bbd33f0 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1291,6 +1291,11 @@ bool bpf_opcode_in_insntable(u8 code) } #ifndef CONFIG_BPF_JIT_ALWAYS_ON +u64 __weak bpf_probe_read(void * dst, u32 size, const void * unsafe_ptr) +{ + memset(dst, 0, size); + return -EFAULT; +} /** * __bpf_prog_run - run eBPF program on a given context * @regs: is the array of MAX_BPF_EXT_REG eBPF pseudo-registers @@ -1310,6 +1315,10 @@ static u64 __no_fgcse ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u6 /* Non-UAPI available opcodes. */ [BPF_JMP | BPF_CALL_ARGS] = &&JMP_CALL_ARGS, [BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL, + [BPF_LDX | BPF_PROBE_MEM | BPF_B] = &&LDX_PROBE_MEM_B, + [BPF_LDX | BPF_PROBE_MEM | BPF_H] = &&LDX_PROBE_MEM_H, + [BPF_LDX | BPF_PROBE_MEM | BPF_W] = &&LDX_PROBE_MEM_W, + [BPF_LDX | BPF_PROBE_MEM | BPF_DW] = &&LDX_PROBE_MEM_DW, }; #undef BPF_INSN_3_LBL #undef BPF_INSN_2_LBL @@ -1542,6 +1551,16 @@ out: LDST(W, u32) LDST(DW, u64) #undef LDST +#define LDX_PROBE(SIZEOP, SIZE) \ + LDX_PROBE_MEM_##SIZEOP: \ + bpf_probe_read(&DST, SIZE, (const void *)(long) SRC); \ + CONT; + LDX_PROBE(B, 1) + LDX_PROBE(H, 2) + LDX_PROBE(W, 4) + LDX_PROBE(DW, 8) +#undef LDX_PROBE + STX_XADD_W: /* lock xadd *(u32 *)(dst_reg + off16) += src_reg */ atomic_add((u32) SRC, (atomic_t *)(unsigned long) (DST + insn->off)); -- cgit v1.2.3 From 3dec541b2e632d630fe7142ed44f0b3702ef1f8c Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 15 Oct 2019 20:25:03 -0700 Subject: bpf: Add support for BTF pointers to x86 JIT Pointer to BTF object is a pointer to kernel object or NULL. Such pointers can only be used by BPF_LDX instructions. The verifier changed their opcode from LDX|MEM|size to LDX|PROBE_MEM|size to make JITing easier. The number of entries in extable is the number of BPF_LDX insns that access kernel memory via "pointer to BTF type". Only these load instructions can fault. Since x86 extable is relative it has to be allocated in the same memory region as JITed code. Allocate it prior to last pass of JITing and let the last pass populate it. Pointer to extable in bpf_prog_aux is necessary to make page fault handling fast. Page fault handling is done in two steps: 1. bpf_prog_kallsyms_find() finds BPF program that page faulted. It's done by walking rb tree. 2. then extable for given bpf program is binary searched. This process is similar to how page faulting is done for kernel modules. The exception handler skips over faulting x86 instruction and initializes destination register with zero. This mimics exact behavior of bpf_probe_read (when probe_kernel_read faults dest is zeroed). JITs for other architectures can add support in similar way. Until then they will reject unknown opcode and fallback to interpreter. Since extable should be aligned and placed near JITed code make bpf_jit_binary_alloc() return 4 byte aligned image offset, so that extable aligning formula in bpf_int_jit_compile() doesn't need to rely on internal implementation of bpf_jit_binary_alloc(). On x86 gcc defaults to 16-byte alignment for regular kernel functions due to better performance. JITed code may be aligned to 16 in the future, but it will use 4 in the meantime. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191016032505.2089704-10-ast@kernel.org --- arch/x86/net/bpf_jit_comp.c | 97 +++++++++++++++++++++++++++++++++++++++++++-- include/linux/bpf.h | 3 ++ include/linux/extable.h | 10 +++++ kernel/bpf/core.c | 20 +++++++++- kernel/bpf/verifier.c | 1 + kernel/extable.c | 2 + 6 files changed, 128 insertions(+), 5 deletions(-) (limited to 'kernel/bpf/core.c') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 3ad2ba1ad855..8cd23d8309bf 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -9,7 +9,7 @@ #include #include #include - +#include #include #include @@ -123,6 +123,19 @@ static const int reg2hex[] = { [AUX_REG] = 3, /* R11 temp register */ }; +static const int reg2pt_regs[] = { + [BPF_REG_0] = offsetof(struct pt_regs, ax), + [BPF_REG_1] = offsetof(struct pt_regs, di), + [BPF_REG_2] = offsetof(struct pt_regs, si), + [BPF_REG_3] = offsetof(struct pt_regs, dx), + [BPF_REG_4] = offsetof(struct pt_regs, cx), + [BPF_REG_5] = offsetof(struct pt_regs, r8), + [BPF_REG_6] = offsetof(struct pt_regs, bx), + [BPF_REG_7] = offsetof(struct pt_regs, r13), + [BPF_REG_8] = offsetof(struct pt_regs, r14), + [BPF_REG_9] = offsetof(struct pt_regs, r15), +}; + /* * is_ereg() == true if BPF register 'reg' maps to x86-64 r8..r15 * which need extra byte of encoding. @@ -377,6 +390,19 @@ static void emit_mov_reg(u8 **pprog, bool is64, u32 dst_reg, u32 src_reg) *pprog = prog; } + +static bool ex_handler_bpf(const struct exception_table_entry *x, + struct pt_regs *regs, int trapnr, + unsigned long error_code, unsigned long fault_addr) +{ + u32 reg = x->fixup >> 8; + + /* jump over faulting load and clear dest register */ + *(unsigned long *)((void *)regs + reg) = 0; + regs->ip += x->fixup & 0xff; + return true; +} + static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, int oldproglen, struct jit_context *ctx) { @@ -384,7 +410,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, int insn_cnt = bpf_prog->len; bool seen_exit = false; u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY]; - int i, cnt = 0; + int i, cnt = 0, excnt = 0; int proglen = 0; u8 *prog = temp; @@ -778,14 +804,17 @@ stx: if (is_imm8(insn->off)) /* LDX: dst_reg = *(u8*)(src_reg + off) */ case BPF_LDX | BPF_MEM | BPF_B: + case BPF_LDX | BPF_PROBE_MEM | BPF_B: /* Emit 'movzx rax, byte ptr [rax + off]' */ EMIT3(add_2mod(0x48, src_reg, dst_reg), 0x0F, 0xB6); goto ldx; case BPF_LDX | BPF_MEM | BPF_H: + case BPF_LDX | BPF_PROBE_MEM | BPF_H: /* Emit 'movzx rax, word ptr [rax + off]' */ EMIT3(add_2mod(0x48, src_reg, dst_reg), 0x0F, 0xB7); goto ldx; case BPF_LDX | BPF_MEM | BPF_W: + case BPF_LDX | BPF_PROBE_MEM | BPF_W: /* Emit 'mov eax, dword ptr [rax+0x14]' */ if (is_ereg(dst_reg) || is_ereg(src_reg)) EMIT2(add_2mod(0x40, src_reg, dst_reg), 0x8B); @@ -793,6 +822,7 @@ stx: if (is_imm8(insn->off)) EMIT1(0x8B); goto ldx; case BPF_LDX | BPF_MEM | BPF_DW: + case BPF_LDX | BPF_PROBE_MEM | BPF_DW: /* Emit 'mov rax, qword ptr [rax+0x14]' */ EMIT2(add_2mod(0x48, src_reg, dst_reg), 0x8B); ldx: /* @@ -805,6 +835,48 @@ ldx: /* else EMIT1_off32(add_2reg(0x80, src_reg, dst_reg), insn->off); + if (BPF_MODE(insn->code) == BPF_PROBE_MEM) { + struct exception_table_entry *ex; + u8 *_insn = image + proglen; + s64 delta; + + if (!bpf_prog->aux->extable) + break; + + if (excnt >= bpf_prog->aux->num_exentries) { + pr_err("ex gen bug\n"); + return -EFAULT; + } + ex = &bpf_prog->aux->extable[excnt++]; + + delta = _insn - (u8 *)&ex->insn; + if (!is_simm32(delta)) { + pr_err("extable->insn doesn't fit into 32-bit\n"); + return -EFAULT; + } + ex->insn = delta; + + delta = (u8 *)ex_handler_bpf - (u8 *)&ex->handler; + if (!is_simm32(delta)) { + pr_err("extable->handler doesn't fit into 32-bit\n"); + return -EFAULT; + } + ex->handler = delta; + + if (dst_reg > BPF_REG_9) { + pr_err("verifier error\n"); + return -EFAULT; + } + /* + * Compute size of x86 insn and its target dest x86 register. + * ex_handler_bpf() will use lower 8 bits to adjust + * pt_regs->ip to jump over this x86 instruction + * and upper bits to figure out which pt_regs to zero out. + * End result: x86 insn "mov rbx, qword ptr [rax+0x14]" + * of 4 bytes will be ignored and rbx will be zero inited. + */ + ex->fixup = (prog - temp) | (reg2pt_regs[dst_reg] << 8); + } break; /* STX XADD: lock *(u32*)(dst_reg + off) += src_reg */ @@ -1058,6 +1130,11 @@ emit_jmp: addrs[i] = proglen; prog = temp; } + + if (image && excnt != bpf_prog->aux->num_exentries) { + pr_err("extable is not populated\n"); + return -EFAULT; + } return proglen; } @@ -1158,12 +1235,24 @@ out_image: break; } if (proglen == oldproglen) { - header = bpf_jit_binary_alloc(proglen, &image, - 1, jit_fill_hole); + /* + * The number of entries in extable is the number of BPF_LDX + * insns that access kernel memory via "pointer to BTF type". + * The verifier changed their opcode from LDX|MEM|size + * to LDX|PROBE_MEM|size to make JITing easier. + */ + u32 align = __alignof__(struct exception_table_entry); + u32 extable_size = prog->aux->num_exentries * + sizeof(struct exception_table_entry); + + /* allocate module memory for x86 insns and extable */ + header = bpf_jit_binary_alloc(roundup(proglen, align) + extable_size, + &image, align, jit_fill_hole); if (!header) { prog = orig_prog; goto out_addrs; } + prog->aux->extable = (void *) image + roundup(proglen, align); } oldproglen = proglen; cond_resched(); diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 028555fcd10d..a7330d75bb94 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -24,6 +24,7 @@ struct sock; struct seq_file; struct btf; struct btf_type; +struct exception_table_entry; extern struct idr btf_idr; extern spinlock_t btf_idr_lock; @@ -423,6 +424,8 @@ struct bpf_prog_aux { * main prog always has linfo_idx == 0 */ u32 linfo_idx; + u32 num_exentries; + struct exception_table_entry *extable; struct bpf_prog_stats __percpu *stats; union { struct work_struct work; diff --git a/include/linux/extable.h b/include/linux/extable.h index 81ecfaa83ad3..4ab9e78f313b 100644 --- a/include/linux/extable.h +++ b/include/linux/extable.h @@ -33,4 +33,14 @@ search_module_extables(unsigned long addr) } #endif /*CONFIG_MODULES*/ +#ifdef CONFIG_BPF_JIT +const struct exception_table_entry *search_bpf_extables(unsigned long addr); +#else +static inline const struct exception_table_entry * +search_bpf_extables(unsigned long addr) +{ + return NULL; +} +#endif + #endif /* _LINUX_EXTABLE_H */ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 8a765bbd33f0..673f5d40a93e 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -30,7 +30,7 @@ #include #include #include - +#include #include /* Registers */ @@ -712,6 +712,24 @@ bool is_bpf_text_address(unsigned long addr) return ret; } +const struct exception_table_entry *search_bpf_extables(unsigned long addr) +{ + const struct exception_table_entry *e = NULL; + struct bpf_prog *prog; + + rcu_read_lock(); + prog = bpf_prog_kallsyms_find(addr); + if (!prog) + goto out; + if (!prog->aux->num_exentries) + goto out; + + e = search_extable(prog->aux->extable, prog->aux->num_exentries, addr); +out: + rcu_read_unlock(); + return e; +} + int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, char *sym) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c4b6a2cfcd47..fba9ef6a831b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8729,6 +8729,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) return -EINVAL; } insn->code = BPF_LDX | BPF_PROBE_MEM | BPF_SIZE((insn)->code); + env->prog->aux->num_exentries++; continue; default: continue; diff --git a/kernel/extable.c b/kernel/extable.c index f6c9406eec7d..f6920a11e28a 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -56,6 +56,8 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr) e = search_kernel_exception_table(addr); if (!e) e = search_module_extables(addr); + if (!e) + e = search_bpf_extables(addr); return e; } -- cgit v1.2.3 From cd7455f1013ef96d5cbf5c05d2b7c06f273810a6 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 22 Oct 2019 15:57:23 +0200 Subject: bpf: Fix use after free in subprog's jited symbol removal syzkaller managed to trigger the following crash: [...] BUG: unable to handle page fault for address: ffffc90001923030 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD aa551067 P4D aa551067 PUD aa552067 PMD a572b067 PTE 80000000a1173163 Oops: 0000 [#1] PREEMPT SMP KASAN CPU: 0 PID: 7982 Comm: syz-executor912 Not tainted 5.4.0-rc3+ #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:bpf_jit_binary_hdr include/linux/filter.h:787 [inline] RIP: 0010:bpf_get_prog_addr_region kernel/bpf/core.c:531 [inline] RIP: 0010:bpf_tree_comp kernel/bpf/core.c:600 [inline] RIP: 0010:__lt_find include/linux/rbtree_latch.h:115 [inline] RIP: 0010:latch_tree_find include/linux/rbtree_latch.h:208 [inline] RIP: 0010:bpf_prog_kallsyms_find kernel/bpf/core.c:674 [inline] RIP: 0010:is_bpf_text_address+0x184/0x3b0 kernel/bpf/core.c:709 [...] Call Trace: kernel_text_address kernel/extable.c:147 [inline] __kernel_text_address+0x9a/0x110 kernel/extable.c:102 unwind_get_return_address+0x4c/0x90 arch/x86/kernel/unwind_frame.c:19 arch_stack_walk+0x98/0xe0 arch/x86/kernel/stacktrace.c:26 stack_trace_save+0xb6/0x150 kernel/stacktrace.c:123 save_stack mm/kasan/common.c:69 [inline] set_track mm/kasan/common.c:77 [inline] __kasan_kmalloc+0x11c/0x1b0 mm/kasan/common.c:510 kasan_slab_alloc+0xf/0x20 mm/kasan/common.c:518 slab_post_alloc_hook mm/slab.h:584 [inline] slab_alloc mm/slab.c:3319 [inline] kmem_cache_alloc+0x1f5/0x2e0 mm/slab.c:3483 getname_flags+0xba/0x640 fs/namei.c:138 getname+0x19/0x20 fs/namei.c:209 do_sys_open+0x261/0x560 fs/open.c:1091 __do_sys_open fs/open.c:1115 [inline] __se_sys_open fs/open.c:1110 [inline] __x64_sys_open+0x87/0x90 fs/open.c:1110 do_syscall_64+0xf7/0x1c0 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe [...] After further debugging it turns out that we walk kallsyms while in parallel we tear down a BPF program which contains subprograms that have been JITed though the program itself has not been fully exposed and is eventually bailing out with error. The bpf_prog_kallsyms_del_subprogs() in bpf_prog_load()'s error path removes the symbols, however, bpf_prog_free() tears down the JIT memory too early via scheduled work. Instead, it needs to properly respect RCU grace period as the kallsyms walk for BPF is under RCU. Fix it by refactoring __bpf_prog_put()'s tear down and reuse it in our error path where we defer final destruction when we have subprogs in the program. Fixes: 7d1982b4e335 ("bpf: fix panic in prog load calls cleanup") Fixes: 1c2a088a6626 ("bpf: x64: add JIT support for multi-function programs") Reported-by: syzbot+710043c5d1d5b5013bc7@syzkaller.appspotmail.com Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Tested-by: syzbot+710043c5d1d5b5013bc7@syzkaller.appspotmail.com Link: https://lore.kernel.org/bpf/55f6367324c2d7e9583fa9ccf5385dcbba0d7a6e.1571752452.git.daniel@iogearbox.net --- include/linux/filter.h | 1 - kernel/bpf/core.c | 2 +- kernel/bpf/syscall.c | 31 ++++++++++++++++++++----------- 3 files changed, 21 insertions(+), 13 deletions(-) (limited to 'kernel/bpf/core.c') diff --git a/include/linux/filter.h b/include/linux/filter.h index 2ce57645f3cd..0367a75f873b 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1099,7 +1099,6 @@ static inline void bpf_get_prog_name(const struct bpf_prog *prog, char *sym) #endif /* CONFIG_BPF_JIT */ -void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp); void bpf_prog_kallsyms_del_all(struct bpf_prog *fp); #define BPF_ANC BIT(15) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 66088a9e9b9e..ef0e1e3e66f4 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -502,7 +502,7 @@ int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt) return WARN_ON_ONCE(bpf_adj_branches(prog, off, off + cnt, off, false)); } -void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp) +static void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp) { int i; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 82eabd4e38ad..bcfc362de4f2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1332,18 +1332,26 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu) bpf_prog_free(aux->prog); } +static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred) +{ + bpf_prog_kallsyms_del_all(prog); + btf_put(prog->aux->btf); + kvfree(prog->aux->func_info); + bpf_prog_free_linfo(prog); + + if (deferred) + call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); + else + __bpf_prog_put_rcu(&prog->aux->rcu); +} + static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) { if (atomic_dec_and_test(&prog->aux->refcnt)) { perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0); /* bpf_prog_free_id() must be called first */ bpf_prog_free_id(prog, do_idr_lock); - bpf_prog_kallsyms_del_all(prog); - btf_put(prog->aux->btf); - kvfree(prog->aux->func_info); - bpf_prog_free_linfo(prog); - - call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); + __bpf_prog_put_noref(prog, true); } } @@ -1741,11 +1749,12 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) return err; free_used_maps: - bpf_prog_free_linfo(prog); - kvfree(prog->aux->func_info); - btf_put(prog->aux->btf); - bpf_prog_kallsyms_del_subprogs(prog); - free_used_maps(prog->aux); + /* In case we have subprogs, we need to wait for a grace + * period before we can tear down JIT memory since symbols + * are already exposed under kallsyms. + */ + __bpf_prog_put_noref(prog, prog->aux->func_cnt); + return err; free_prog: bpf_prog_uncharge_memlock(prog); free_prog_sec: -- cgit v1.2.3 From af91acbc62999d62e2ca1e80f660d20561ca55d3 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 30 Oct 2019 16:30:19 -0700 Subject: bpf: Fix bpf jit kallsym access Jiri reported crash when JIT is on, but net.core.bpf_jit_kallsyms is off. bpf_prog_kallsyms_find() was skipping addr->bpf_prog resolution logic in oops and stack traces. That's incorrect. It should only skip addr->name resolution for 'cat /proc/kallsyms'. That's what bpf_jit_kallsyms and bpf_jit_harden protect. Fixes: 3dec541b2e63 ("bpf: Add support for BTF pointers to x86 JIT") Reported-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20191030233019.1187404-1-ast@kernel.org --- kernel/bpf/core.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel/bpf/core.c') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 673f5d40a93e..8d3fbc86ca5e 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -668,9 +668,6 @@ static struct bpf_prog *bpf_prog_kallsyms_find(unsigned long addr) { struct latch_tree_node *n; - if (!bpf_jit_kallsyms_enabled()) - return NULL; - n = latch_tree_find((void *)addr, &bpf_tree, &bpf_tree_ops); return n ? container_of(n, struct bpf_prog_aux, ksym_tnode)->prog : -- cgit v1.2.3 From 6e07a6341277a1dbdf5ed0c41921033c234c1635 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 2 Nov 2019 00:18:00 +0100 Subject: bpf: Switch BPF probe insns to bpf_probe_read_kernel Commit 2a02759ef5f8 ("bpf: Add support for BTF pointers to interpreter") explicitly states that the pointer to BTF object is a pointer to a kernel object or NULL. Therefore we should also switch to using the strict kernel probe helper which is restricted to kernel addresses only when architectures have non-overlapping address spaces. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/d2b90827837685424a4b8008dfe0460558abfada.1572649915.git.daniel@iogearbox.net --- kernel/bpf/core.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel/bpf/core.c') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 8d3fbc86ca5e..df82d5a42b23 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1306,11 +1306,12 @@ bool bpf_opcode_in_insntable(u8 code) } #ifndef CONFIG_BPF_JIT_ALWAYS_ON -u64 __weak bpf_probe_read(void * dst, u32 size, const void * unsafe_ptr) +u64 __weak bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr) { memset(dst, 0, size); return -EFAULT; } + /** * __bpf_prog_run - run eBPF program on a given context * @regs: is the array of MAX_BPF_EXT_REG eBPF pseudo-registers @@ -1566,9 +1567,9 @@ out: LDST(W, u32) LDST(DW, u64) #undef LDST -#define LDX_PROBE(SIZEOP, SIZE) \ - LDX_PROBE_MEM_##SIZEOP: \ - bpf_probe_read(&DST, SIZE, (const void *)(long) SRC); \ +#define LDX_PROBE(SIZEOP, SIZE) \ + LDX_PROBE_MEM_##SIZEOP: \ + bpf_probe_read_kernel(&DST, SIZE, (const void *)(long) SRC); \ CONT; LDX_PROBE(B, 1) LDX_PROBE(H, 2) -- cgit v1.2.3 From 85d31dd07002315c0d1816543fdc392c8cc6b73a Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 6 Nov 2019 17:46:40 -0800 Subject: bpf: Account for insn->off when doing bpf_probe_read_kernel In the bpf interpreter mode, bpf_probe_read_kernel is used to read from PTR_TO_BTF_ID's kernel object. It currently missed considering the insn->off. This patch fixes it. Fixes: 2a02759ef5f8 ("bpf: Add support for BTF pointers to interpreter") Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20191107014640.384083-1-kafai@fb.com --- kernel/bpf/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/bpf/core.c') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 97e37d82a1cc..c1fde0303280 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1569,7 +1569,7 @@ out: #undef LDST #define LDX_PROBE(SIZEOP, SIZE) \ LDX_PROBE_MEM_##SIZEOP: \ - bpf_probe_read_kernel(&DST, SIZE, (const void *)(long) SRC); \ + bpf_probe_read_kernel(&DST, SIZE, (const void *)(long) (SRC + insn->off)); \ CONT; LDX_PROBE(B, 1) LDX_PROBE(H, 2) -- cgit v1.2.3 From b7b3fc8dd95bc02bd30680da258e09dda55270db Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Fri, 15 Nov 2019 13:37:22 +0100 Subject: bpf: Support doubleword alignment in bpf_jit_binary_alloc Currently passing alignment greater than 4 to bpf_jit_binary_alloc does not work: in such cases it silently aligns only to 4 bytes. On s390, in order to load a constant from memory in a large (>512k) BPF program, one must use lgrl instruction, whose memory operand must be aligned on an 8-byte boundary. This patch makes it possible to request 8-byte alignment from bpf_jit_binary_alloc, and also makes it issue a warning when an unsupported alignment is requested. Signed-off-by: Ilya Leoshkevich Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20191115123722.58462-1-iii@linux.ibm.com --- include/linux/filter.h | 6 ++++-- kernel/bpf/core.c | 4 ++++ 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel/bpf/core.c') diff --git a/include/linux/filter.h b/include/linux/filter.h index 7a6f8f6f1da4..ad80e9c6111c 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -515,10 +515,12 @@ struct sock_fprog_kern { struct sock_filter *filter; }; +/* Some arches need doubleword alignment for their instructions and/or data */ +#define BPF_IMAGE_ALIGNMENT 8 + struct bpf_binary_header { u32 pages; - /* Some arches need word alignment for their instructions */ - u8 image[] __aligned(4); + u8 image[] __aligned(BPF_IMAGE_ALIGNMENT); }; struct bpf_prog { diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index c1fde0303280..99693f3c4e99 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -31,6 +31,7 @@ #include #include #include +#include #include /* Registers */ @@ -815,6 +816,9 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, struct bpf_binary_header *hdr; u32 size, hole, start, pages; + WARN_ON_ONCE(!is_power_of_2(alignment) || + alignment > BPF_IMAGE_ALIGNMENT); + /* Most of BPF filters are really small, but if some of them * fill a page, allow at least 128 extra bytes to insert a * random section of illegal instructions. -- cgit v1.2.3 From 5964b2000f283ff5df366f718e0f083ebbaae977 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 14 Nov 2019 10:57:03 -0800 Subject: bpf: Add bpf_arch_text_poke() helper Add bpf_arch_text_poke() helper that is used by BPF trampoline logic to patch nops/calls in kernel text into calls into BPF trampoline and to patch calls/nops inside BPF programs too. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Song Liu Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20191114185720.1641606-4-ast@kernel.org --- arch/x86/net/bpf_jit_comp.c | 51 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/bpf.h | 8 +++++++ kernel/bpf/core.c | 6 ++++++ 3 files changed, 65 insertions(+) (limited to 'kernel/bpf/core.c') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index fb99d976ad6e..254b2889e881 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -9,9 +9,11 @@ #include #include #include +#include #include #include #include +#include static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len) { @@ -486,6 +488,55 @@ static int emit_call(u8 **pprog, void *func, void *ip) return 0; } +int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, + void *old_addr, void *new_addr) +{ + u8 old_insn[X86_CALL_SIZE] = {}; + u8 new_insn[X86_CALL_SIZE] = {}; + u8 *prog; + int ret; + + if (!is_kernel_text((long)ip)) + /* BPF trampoline in modules is not supported */ + return -EINVAL; + + if (old_addr) { + prog = old_insn; + ret = emit_call(&prog, old_addr, (void *)ip); + if (ret) + return ret; + } + if (new_addr) { + prog = new_insn; + ret = emit_call(&prog, new_addr, (void *)ip); + if (ret) + return ret; + } + ret = -EBUSY; + mutex_lock(&text_mutex); + switch (t) { + case BPF_MOD_NOP_TO_CALL: + if (memcmp(ip, ideal_nops[NOP_ATOMIC5], X86_CALL_SIZE)) + goto out; + text_poke_bp(ip, new_insn, X86_CALL_SIZE, NULL); + break; + case BPF_MOD_CALL_TO_CALL: + if (memcmp(ip, old_insn, X86_CALL_SIZE)) + goto out; + text_poke_bp(ip, new_insn, X86_CALL_SIZE, NULL); + break; + case BPF_MOD_CALL_TO_NOP: + if (memcmp(ip, old_insn, X86_CALL_SIZE)) + goto out; + text_poke_bp(ip, ideal_nops[NOP_ATOMIC5], X86_CALL_SIZE, NULL); + break; + } + ret = 0; +out: + mutex_unlock(&text_mutex); + return ret; +} + static bool ex_handler_bpf(const struct exception_table_entry *x, struct pt_regs *regs, int trapnr, unsigned long error_code, unsigned long fault_addr) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 7c7f518811a6..8b90db25348a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1157,4 +1157,12 @@ static inline u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type, } #endif /* CONFIG_INET */ +enum bpf_text_poke_type { + BPF_MOD_NOP_TO_CALL, + BPF_MOD_CALL_TO_CALL, + BPF_MOD_CALL_TO_NOP, +}; +int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, + void *addr1, void *addr2); + #endif /* _LINUX_BPF_H */ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 99693f3c4e99..434a0d920153 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2144,6 +2144,12 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to, return -EFAULT; } +int __weak bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, + void *addr1, void *addr2) +{ + return -ENOTSUPP; +} + DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key); EXPORT_SYMBOL(bpf_stats_enabled_key); -- cgit v1.2.3 From fec56f5890d93fc2ed74166c397dc186b1c25951 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 14 Nov 2019 10:57:04 -0800 Subject: bpf: Introduce BPF trampoline Introduce BPF trampoline concept to allow kernel code to call into BPF programs with practically zero overhead. The trampoline generation logic is architecture dependent. It's converting native calling convention into BPF calling convention. BPF ISA is 64-bit (even on 32-bit architectures). The registers R1 to R5 are used to pass arguments into BPF functions. The main BPF program accepts only single argument "ctx" in R1. Whereas CPU native calling convention is different. x86-64 is passing first 6 arguments in registers and the rest on the stack. x86-32 is passing first 3 arguments in registers. sparc64 is passing first 6 in registers. And so on. The trampolines between BPF and kernel already exist. BPF_CALL_x macros in include/linux/filter.h statically compile trampolines from BPF into kernel helpers. They convert up to five u64 arguments into kernel C pointers and integers. On 64-bit architectures this BPF_to_kernel trampolines are nops. On 32-bit architecture they're meaningful. The opposite job kernel_to_BPF trampolines is done by CAST_TO_U64 macros and __bpf_trace_##call() shim functions in include/trace/bpf_probe.h. They convert kernel function arguments into array of u64s that BPF program consumes via R1=ctx pointer. This patch set is doing the same job as __bpf_trace_##call() static trampolines, but dynamically for any kernel function. There are ~22k global kernel functions that are attachable via nop at function entry. The function arguments and types are described in BTF. The job of btf_distill_func_proto() function is to extract useful information from BTF into "function model" that architecture dependent trampoline generators will use to generate assembly code to cast kernel function arguments into array of u64s. For example the kernel function eth_type_trans has two pointers. They will be casted to u64 and stored into stack of generated trampoline. The pointer to that stack space will be passed into BPF program in R1. On x86-64 such generated trampoline will consume 16 bytes of stack and two stores of %rdi and %rsi into stack. The verifier will make sure that only two u64 are accessed read-only by BPF program. The verifier will also recognize the precise type of the pointers being accessed and will not allow typecasting of the pointer to a different type within BPF program. The tracing use case in the datacenter demonstrated that certain key kernel functions have (like tcp_retransmit_skb) have 2 or more kprobes that are always active. Other functions have both kprobe and kretprobe. So it is essential to keep both kernel code and BPF programs executing at maximum speed. Hence generated BPF trampoline is re-generated every time new program is attached or detached to maintain maximum performance. To avoid the high cost of retpoline the attached BPF programs are called directly. __bpf_prog_enter/exit() are used to support per-program execution stats. In the future this logic will be optimized further by adding support for bpf_stats_enabled_key inside generated assembly code. Introduction of preemptible and sleepable BPF programs will completely remove the need to call to __bpf_prog_enter/exit(). Detach of a BPF program from the trampoline should not fail. To avoid memory allocation in detach path the half of the page is used as a reserve and flipped after each attach/detach. 2k bytes is enough to call 40+ BPF programs directly which is enough for BPF tracing use cases. This limit can be increased in the future. BPF_TRACE_FENTRY programs have access to raw kernel function arguments while BPF_TRACE_FEXIT programs have access to kernel return value as well. Often kprobe BPF program remembers function arguments in a map while kretprobe fetches arguments from a map and analyzes them together with return value. BPF_TRACE_FEXIT accelerates this typical use case. Recursion prevention for kprobe BPF programs is done via per-cpu bpf_prog_active counter. In practice that turned out to be a mistake. It caused programs to randomly skip execution. The tracing tools missed results they were looking for. Hence BPF trampoline doesn't provide builtin recursion prevention. It's a job of BPF program itself and will be addressed in the follow up patches. BPF trampoline is intended to be used beyond tracing and fentry/fexit use cases in the future. For example to remove retpoline cost from XDP programs. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20191114185720.1641606-5-ast@kernel.org --- arch/x86/net/bpf_jit_comp.c | 211 +++++++++++++++++++++++++++++++++++- include/linux/bpf.h | 105 ++++++++++++++++++ include/uapi/linux/bpf.h | 2 + kernel/bpf/Makefile | 1 + kernel/bpf/btf.c | 77 +++++++++++++- kernel/bpf/core.c | 1 + kernel/bpf/syscall.c | 53 +++++++++- kernel/bpf/trampoline.c | 253 ++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 42 ++++++++ 9 files changed, 735 insertions(+), 10 deletions(-) create mode 100644 kernel/bpf/trampoline.c (limited to 'kernel/bpf/core.c') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 254b2889e881..be2b43a894f6 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -98,6 +98,7 @@ static int bpf_size_to_x86_bytes(int bpf_size) /* Pick a register outside of BPF range for JIT internal work */ #define AUX_REG (MAX_BPF_JIT_REG + 1) +#define X86_REG_R9 (MAX_BPF_JIT_REG + 2) /* * The following table maps BPF registers to x86-64 registers. @@ -106,8 +107,8 @@ static int bpf_size_to_x86_bytes(int bpf_size) * register in load/store instructions, it always needs an * extra byte of encoding and is callee saved. * - * Also x86-64 register R9 is unused. x86-64 register R10 is - * used for blinding (if enabled). + * x86-64 register R9 is not used by BPF programs, but can be used by BPF + * trampoline. x86-64 register R10 is used for blinding (if enabled). */ static const int reg2hex[] = { [BPF_REG_0] = 0, /* RAX */ @@ -123,6 +124,7 @@ static const int reg2hex[] = { [BPF_REG_FP] = 5, /* RBP readonly */ [BPF_REG_AX] = 2, /* R10 temp register */ [AUX_REG] = 3, /* R11 temp register */ + [X86_REG_R9] = 1, /* R9 register, 6th function argument */ }; static const int reg2pt_regs[] = { @@ -150,6 +152,7 @@ static bool is_ereg(u32 reg) BIT(BPF_REG_7) | BIT(BPF_REG_8) | BIT(BPF_REG_9) | + BIT(X86_REG_R9) | BIT(BPF_REG_AX)); } @@ -1233,6 +1236,210 @@ emit_jmp: return proglen; } +static void save_regs(struct btf_func_model *m, u8 **prog, int nr_args, + int stack_size) +{ + int i; + /* Store function arguments to stack. + * For a function that accepts two pointers the sequence will be: + * mov QWORD PTR [rbp-0x10],rdi + * mov QWORD PTR [rbp-0x8],rsi + */ + for (i = 0; i < min(nr_args, 6); i++) + emit_stx(prog, bytes_to_bpf_size(m->arg_size[i]), + BPF_REG_FP, + i == 5 ? X86_REG_R9 : BPF_REG_1 + i, + -(stack_size - i * 8)); +} + +static void restore_regs(struct btf_func_model *m, u8 **prog, int nr_args, + int stack_size) +{ + int i; + + /* Restore function arguments from stack. + * For a function that accepts two pointers the sequence will be: + * EMIT4(0x48, 0x8B, 0x7D, 0xF0); mov rdi,QWORD PTR [rbp-0x10] + * EMIT4(0x48, 0x8B, 0x75, 0xF8); mov rsi,QWORD PTR [rbp-0x8] + */ + for (i = 0; i < min(nr_args, 6); i++) + emit_ldx(prog, bytes_to_bpf_size(m->arg_size[i]), + i == 5 ? X86_REG_R9 : BPF_REG_1 + i, + BPF_REG_FP, + -(stack_size - i * 8)); +} + +static int invoke_bpf(struct btf_func_model *m, u8 **pprog, + struct bpf_prog **progs, int prog_cnt, int stack_size) +{ + u8 *prog = *pprog; + int cnt = 0, i; + + for (i = 0; i < prog_cnt; i++) { + if (emit_call(&prog, __bpf_prog_enter, prog)) + return -EINVAL; + /* remember prog start time returned by __bpf_prog_enter */ + emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0); + + /* arg1: lea rdi, [rbp - stack_size] */ + EMIT4(0x48, 0x8D, 0x7D, -stack_size); + /* arg2: progs[i]->insnsi for interpreter */ + if (!progs[i]->jited) + emit_mov_imm64(&prog, BPF_REG_2, + (long) progs[i]->insnsi >> 32, + (u32) (long) progs[i]->insnsi); + /* call JITed bpf program or interpreter */ + if (emit_call(&prog, progs[i]->bpf_func, prog)) + return -EINVAL; + + /* arg1: mov rdi, progs[i] */ + emit_mov_imm64(&prog, BPF_REG_1, (long) progs[i] >> 32, + (u32) (long) progs[i]); + /* arg2: mov rsi, rbx <- start time in nsec */ + emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6); + if (emit_call(&prog, __bpf_prog_exit, prog)) + return -EINVAL; + } + *pprog = prog; + return 0; +} + +/* Example: + * __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev); + * its 'struct btf_func_model' will be nr_args=2 + * The assembly code when eth_type_trans is executing after trampoline: + * + * push rbp + * mov rbp, rsp + * sub rsp, 16 // space for skb and dev + * push rbx // temp regs to pass start time + * mov qword ptr [rbp - 16], rdi // save skb pointer to stack + * mov qword ptr [rbp - 8], rsi // save dev pointer to stack + * call __bpf_prog_enter // rcu_read_lock and preempt_disable + * mov rbx, rax // remember start time in bpf stats are enabled + * lea rdi, [rbp - 16] // R1==ctx of bpf prog + * call addr_of_jited_FENTRY_prog + * movabsq rdi, 64bit_addr_of_struct_bpf_prog // unused if bpf stats are off + * mov rsi, rbx // prog start time + * call __bpf_prog_exit // rcu_read_unlock, preempt_enable and stats math + * mov rdi, qword ptr [rbp - 16] // restore skb pointer from stack + * mov rsi, qword ptr [rbp - 8] // restore dev pointer from stack + * pop rbx + * leave + * ret + * + * eth_type_trans has 5 byte nop at the beginning. These 5 bytes will be + * replaced with 'call generated_bpf_trampoline'. When it returns + * eth_type_trans will continue executing with original skb and dev pointers. + * + * The assembly code when eth_type_trans is called from trampoline: + * + * push rbp + * mov rbp, rsp + * sub rsp, 24 // space for skb, dev, return value + * push rbx // temp regs to pass start time + * mov qword ptr [rbp - 24], rdi // save skb pointer to stack + * mov qword ptr [rbp - 16], rsi // save dev pointer to stack + * call __bpf_prog_enter // rcu_read_lock and preempt_disable + * mov rbx, rax // remember start time if bpf stats are enabled + * lea rdi, [rbp - 24] // R1==ctx of bpf prog + * call addr_of_jited_FENTRY_prog // bpf prog can access skb and dev + * movabsq rdi, 64bit_addr_of_struct_bpf_prog // unused if bpf stats are off + * mov rsi, rbx // prog start time + * call __bpf_prog_exit // rcu_read_unlock, preempt_enable and stats math + * mov rdi, qword ptr [rbp - 24] // restore skb pointer from stack + * mov rsi, qword ptr [rbp - 16] // restore dev pointer from stack + * call eth_type_trans+5 // execute body of eth_type_trans + * mov qword ptr [rbp - 8], rax // save return value + * call __bpf_prog_enter // rcu_read_lock and preempt_disable + * mov rbx, rax // remember start time in bpf stats are enabled + * lea rdi, [rbp - 24] // R1==ctx of bpf prog + * call addr_of_jited_FEXIT_prog // bpf prog can access skb, dev, return value + * movabsq rdi, 64bit_addr_of_struct_bpf_prog // unused if bpf stats are off + * mov rsi, rbx // prog start time + * call __bpf_prog_exit // rcu_read_unlock, preempt_enable and stats math + * mov rax, qword ptr [rbp - 8] // restore eth_type_trans's return value + * pop rbx + * leave + * add rsp, 8 // skip eth_type_trans's frame + * ret // return to its caller + */ +int arch_prepare_bpf_trampoline(void *image, struct btf_func_model *m, u32 flags, + struct bpf_prog **fentry_progs, int fentry_cnt, + struct bpf_prog **fexit_progs, int fexit_cnt, + void *orig_call) +{ + int cnt = 0, nr_args = m->nr_args; + int stack_size = nr_args * 8; + u8 *prog; + + /* x86-64 supports up to 6 arguments. 7+ can be added in the future */ + if (nr_args > 6) + return -ENOTSUPP; + + if ((flags & BPF_TRAMP_F_RESTORE_REGS) && + (flags & BPF_TRAMP_F_SKIP_FRAME)) + return -EINVAL; + + if (flags & BPF_TRAMP_F_CALL_ORIG) + stack_size += 8; /* room for return value of orig_call */ + + if (flags & BPF_TRAMP_F_SKIP_FRAME) + /* skip patched call instruction and point orig_call to actual + * body of the kernel function. + */ + orig_call += X86_CALL_SIZE; + + prog = image; + + EMIT1(0x55); /* push rbp */ + EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */ + EMIT4(0x48, 0x83, 0xEC, stack_size); /* sub rsp, stack_size */ + EMIT1(0x53); /* push rbx */ + + save_regs(m, &prog, nr_args, stack_size); + + if (fentry_cnt) + if (invoke_bpf(m, &prog, fentry_progs, fentry_cnt, stack_size)) + return -EINVAL; + + if (flags & BPF_TRAMP_F_CALL_ORIG) { + if (fentry_cnt) + restore_regs(m, &prog, nr_args, stack_size); + + /* call original function */ + if (emit_call(&prog, orig_call, prog)) + return -EINVAL; + /* remember return value in a stack for bpf prog to access */ + emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8); + } + + if (fexit_cnt) + if (invoke_bpf(m, &prog, fexit_progs, fexit_cnt, stack_size)) + return -EINVAL; + + if (flags & BPF_TRAMP_F_RESTORE_REGS) + restore_regs(m, &prog, nr_args, stack_size); + + if (flags & BPF_TRAMP_F_CALL_ORIG) + /* restore original return value back into RAX */ + emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, -8); + + EMIT1(0x5B); /* pop rbx */ + EMIT1(0xC9); /* leave */ + if (flags & BPF_TRAMP_F_SKIP_FRAME) + /* skip our return address and return to parent */ + EMIT4(0x48, 0x83, 0xC4, 8); /* add rsp, 8 */ + EMIT1(0xC3); /* ret */ + /* One half of the page has active running trampoline. + * Another half is an area for next trampoline. + * Make sure the trampoline generation logic doesn't overflow. + */ + if (WARN_ON_ONCE(prog - (u8 *)image > PAGE_SIZE / 2 - BPF_INSN_SAFETY)) + return -EFAULT; + return 0; +} + struct x64_jit_data { struct bpf_binary_header *header; int *addrs; diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 8b90db25348a..0d4c5c224d79 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -14,6 +14,8 @@ #include #include #include +#include +#include struct bpf_verifier_env; struct bpf_verifier_log; @@ -384,6 +386,100 @@ struct bpf_prog_stats { struct u64_stats_sync syncp; } __aligned(2 * sizeof(u64)); +struct btf_func_model { + u8 ret_size; + u8 nr_args; + u8 arg_size[MAX_BPF_FUNC_ARGS]; +}; + +/* Restore arguments before returning from trampoline to let original function + * continue executing. This flag is used for fentry progs when there are no + * fexit progs. + */ +#define BPF_TRAMP_F_RESTORE_REGS BIT(0) +/* Call original function after fentry progs, but before fexit progs. + * Makes sense for fentry/fexit, normal calls and indirect calls. + */ +#define BPF_TRAMP_F_CALL_ORIG BIT(1) +/* Skip current frame and return to parent. Makes sense for fentry/fexit + * programs only. Should not be used with normal calls and indirect calls. + */ +#define BPF_TRAMP_F_SKIP_FRAME BIT(2) + +/* Different use cases for BPF trampoline: + * 1. replace nop at the function entry (kprobe equivalent) + * flags = BPF_TRAMP_F_RESTORE_REGS + * fentry = a set of programs to run before returning from trampoline + * + * 2. replace nop at the function entry (kprobe + kretprobe equivalent) + * flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME + * orig_call = fentry_ip + MCOUNT_INSN_SIZE + * fentry = a set of program to run before calling original function + * fexit = a set of program to run after original function + * + * 3. replace direct call instruction anywhere in the function body + * or assign a function pointer for indirect call (like tcp_congestion_ops->cong_avoid) + * With flags = 0 + * fentry = a set of programs to run before returning from trampoline + * With flags = BPF_TRAMP_F_CALL_ORIG + * orig_call = original callback addr or direct function addr + * fentry = a set of program to run before calling original function + * fexit = a set of program to run after original function + */ +int arch_prepare_bpf_trampoline(void *image, struct btf_func_model *m, u32 flags, + struct bpf_prog **fentry_progs, int fentry_cnt, + struct bpf_prog **fexit_progs, int fexit_cnt, + void *orig_call); +/* these two functions are called from generated trampoline */ +u64 notrace __bpf_prog_enter(void); +void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start); + +enum bpf_tramp_prog_type { + BPF_TRAMP_FENTRY, + BPF_TRAMP_FEXIT, + BPF_TRAMP_MAX +}; + +struct bpf_trampoline { + /* hlist for trampoline_table */ + struct hlist_node hlist; + /* serializes access to fields of this trampoline */ + struct mutex mutex; + refcount_t refcnt; + u64 key; + struct { + struct btf_func_model model; + void *addr; + } func; + /* list of BPF programs using this trampoline */ + struct hlist_head progs_hlist[BPF_TRAMP_MAX]; + /* Number of attached programs. A counter per kind. */ + int progs_cnt[BPF_TRAMP_MAX]; + /* Executable image of trampoline */ + void *image; + u64 selector; +}; +#ifdef CONFIG_BPF_JIT +struct bpf_trampoline *bpf_trampoline_lookup(u64 key); +int bpf_trampoline_link_prog(struct bpf_prog *prog); +int bpf_trampoline_unlink_prog(struct bpf_prog *prog); +void bpf_trampoline_put(struct bpf_trampoline *tr); +#else +static inline struct bpf_trampoline *bpf_trampoline_lookup(u64 key) +{ + return NULL; +} +static inline int bpf_trampoline_link_prog(struct bpf_prog *prog) +{ + return -ENOTSUPP; +} +static inline int bpf_trampoline_unlink_prog(struct bpf_prog *prog) +{ + return -ENOTSUPP; +} +static inline void bpf_trampoline_put(struct bpf_trampoline *tr) {} +#endif + struct bpf_prog_aux { atomic_t refcnt; u32 used_map_cnt; @@ -398,6 +494,9 @@ struct bpf_prog_aux { bool verifier_zext; /* Zero extensions has been inserted by verifier. */ bool offload_requested; bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */ + enum bpf_tramp_prog_type trampoline_prog_type; + struct bpf_trampoline *trampoline; + struct hlist_node tramp_hlist; /* BTF_KIND_FUNC_PROTO for valid attach_btf_id */ const struct btf_type *attach_func_proto; /* function name for valid attach_btf_id */ @@ -784,6 +883,12 @@ int btf_struct_access(struct bpf_verifier_log *log, u32 *next_btf_id); u32 btf_resolve_helper_id(struct bpf_verifier_log *log, void *, int); +int btf_distill_func_proto(struct bpf_verifier_log *log, + struct btf *btf, + const struct btf_type *func_proto, + const char *func_name, + struct btf_func_model *m); + #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) { diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index df6809a76404..69c200e6e696 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -201,6 +201,8 @@ enum bpf_attach_type { BPF_CGROUP_GETSOCKOPT, BPF_CGROUP_SETSOCKOPT, BPF_TRACE_RAW_TP, + BPF_TRACE_FENTRY, + BPF_TRACE_FEXIT, __MAX_BPF_ATTACH_TYPE }; diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index e1d9adb212f9..3f671bf617e8 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -6,6 +6,7 @@ obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o +obj-$(CONFIG_BPF_JIT) += trampoline.o obj-$(CONFIG_BPF_SYSCALL) += btf.o ifeq ($(CONFIG_NET),y) obj-$(CONFIG_BPF_SYSCALL) += devmap.o diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 4639c4ba9a9b..9e1164e5b429 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3517,13 +3517,18 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, args++; nr_args--; } - if (arg >= nr_args) { + + if (prog->expected_attach_type == BPF_TRACE_FEXIT && + arg == nr_args) { + /* function return type */ + t = btf_type_by_id(btf_vmlinux, t->type); + } else if (arg >= nr_args) { bpf_log(log, "func '%s' doesn't have %d-th argument\n", - tname, arg); + tname, arg + 1); return false; + } else { + t = btf_type_by_id(btf_vmlinux, args[arg].type); } - - t = btf_type_by_id(btf_vmlinux, args[arg].type); /* skip modifiers */ while (btf_type_is_modifier(t)) t = btf_type_by_id(btf_vmlinux, t->type); @@ -3784,6 +3789,70 @@ u32 btf_resolve_helper_id(struct bpf_verifier_log *log, void *fn, int arg) return btf_id; } +static int __get_type_size(struct btf *btf, u32 btf_id, + const struct btf_type **bad_type) +{ + const struct btf_type *t; + + if (!btf_id) + /* void */ + return 0; + t = btf_type_by_id(btf, btf_id); + while (t && btf_type_is_modifier(t)) + t = btf_type_by_id(btf, t->type); + if (!t) + return -EINVAL; + if (btf_type_is_ptr(t)) + /* kernel size of pointer. Not BPF's size of pointer*/ + return sizeof(void *); + if (btf_type_is_int(t) || btf_type_is_enum(t)) + return t->size; + *bad_type = t; + return -EINVAL; +} + +int btf_distill_func_proto(struct bpf_verifier_log *log, + struct btf *btf, + const struct btf_type *func, + const char *tname, + struct btf_func_model *m) +{ + const struct btf_param *args; + const struct btf_type *t; + u32 i, nargs; + int ret; + + args = (const struct btf_param *)(func + 1); + nargs = btf_type_vlen(func); + if (nargs >= MAX_BPF_FUNC_ARGS) { + bpf_log(log, + "The function %s has %d arguments. Too many.\n", + tname, nargs); + return -EINVAL; + } + ret = __get_type_size(btf, func->type, &t); + if (ret < 0) { + bpf_log(log, + "The function %s return type %s is unsupported.\n", + tname, btf_kind_str[BTF_INFO_KIND(t->info)]); + return -EINVAL; + } + m->ret_size = ret; + + for (i = 0; i < nargs; i++) { + ret = __get_type_size(btf, args[i].type, &t); + if (ret < 0) { + bpf_log(log, + "The function %s arg%d type %s is unsupported.\n", + tname, i, btf_kind_str[BTF_INFO_KIND(t->info)]); + return -EINVAL; + } + m->arg_size[i] = ret; + } + m->nr_args = nargs; + return 0; +} + void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj, struct seq_file *m) { diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 434a0d920153..da5a8b8e278f 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2015,6 +2015,7 @@ static void bpf_prog_free_deferred(struct work_struct *work) if (aux->prog->has_callchain_buf) put_callchain_buffers(); #endif + bpf_trampoline_put(aux->trampoline); for (i = 0; i < aux->func_cnt; i++) bpf_jit_free(aux->func[i]); if (aux->func_cnt) { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 6d9ce95e5a8d..e2e37bea86bc 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1799,6 +1799,49 @@ static int bpf_obj_get(const union bpf_attr *attr) attr->file_flags); } +static int bpf_tracing_prog_release(struct inode *inode, struct file *filp) +{ + struct bpf_prog *prog = filp->private_data; + + WARN_ON_ONCE(bpf_trampoline_unlink_prog(prog)); + bpf_prog_put(prog); + return 0; +} + +static const struct file_operations bpf_tracing_prog_fops = { + .release = bpf_tracing_prog_release, + .read = bpf_dummy_read, + .write = bpf_dummy_write, +}; + +static int bpf_tracing_prog_attach(struct bpf_prog *prog) +{ + int tr_fd, err; + + if (prog->expected_attach_type != BPF_TRACE_FENTRY && + prog->expected_attach_type != BPF_TRACE_FEXIT) { + err = -EINVAL; + goto out_put_prog; + } + + err = bpf_trampoline_link_prog(prog); + if (err) + goto out_put_prog; + + tr_fd = anon_inode_getfd("bpf-tracing-prog", &bpf_tracing_prog_fops, + prog, O_CLOEXEC); + if (tr_fd < 0) { + WARN_ON_ONCE(bpf_trampoline_unlink_prog(prog)); + err = tr_fd; + goto out_put_prog; + } + return tr_fd; + +out_put_prog: + bpf_prog_put(prog); + return err; +} + struct bpf_raw_tracepoint { struct bpf_raw_event_map *btp; struct bpf_prog *prog; @@ -1850,14 +1893,16 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) if (prog->type == BPF_PROG_TYPE_TRACING) { if (attr->raw_tracepoint.name) { - /* raw_tp name should not be specified in raw_tp - * programs that were verified via in-kernel BTF info + /* The attach point for this category of programs + * should be specified via btf_id during program load. */ err = -EINVAL; goto out_put_prog; } - /* raw_tp name is taken from type name instead */ - tp_name = prog->aux->attach_func_name; + if (prog->expected_attach_type == BPF_TRACE_RAW_TP) + tp_name = prog->aux->attach_func_name; + else + return bpf_tracing_prog_attach(prog); } else { if (strncpy_from_user(buf, u64_to_user_ptr(attr->raw_tracepoint.name), diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c new file mode 100644 index 000000000000..10ae59d65f13 --- /dev/null +++ b/kernel/bpf/trampoline.c @@ -0,0 +1,253 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2019 Facebook */ +#include +#include +#include + +/* btf_vmlinux has ~22k attachable functions. 1k htab is enough. */ +#define TRAMPOLINE_HASH_BITS 10 +#define TRAMPOLINE_TABLE_SIZE (1 << TRAMPOLINE_HASH_BITS) + +static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE]; + +/* serializes access to trampoline_table */ +static DEFINE_MUTEX(trampoline_mutex); + +struct bpf_trampoline *bpf_trampoline_lookup(u64 key) +{ + struct bpf_trampoline *tr; + struct hlist_head *head; + void *image; + int i; + + mutex_lock(&trampoline_mutex); + head = &trampoline_table[hash_64(key, TRAMPOLINE_HASH_BITS)]; + hlist_for_each_entry(tr, head, hlist) { + if (tr->key == key) { + refcount_inc(&tr->refcnt); + goto out; + } + } + tr = kzalloc(sizeof(*tr), GFP_KERNEL); + if (!tr) + goto out; + + /* is_root was checked earlier. No need for bpf_jit_charge_modmem() */ + image = bpf_jit_alloc_exec(PAGE_SIZE); + if (!image) { + kfree(tr); + tr = NULL; + goto out; + } + + tr->key = key; + INIT_HLIST_NODE(&tr->hlist); + hlist_add_head(&tr->hlist, head); + refcount_set(&tr->refcnt, 1); + mutex_init(&tr->mutex); + for (i = 0; i < BPF_TRAMP_MAX; i++) + INIT_HLIST_HEAD(&tr->progs_hlist[i]); + + set_vm_flush_reset_perms(image); + /* Keep image as writeable. The alternative is to keep flipping ro/rw + * everytime new program is attached or detached. + */ + set_memory_x((long)image, 1); + tr->image = image; +out: + mutex_unlock(&trampoline_mutex); + return tr; +} + +/* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50 + * bytes on x86. Pick a number to fit into PAGE_SIZE / 2 + */ +#define BPF_MAX_TRAMP_PROGS 40 + +static int bpf_trampoline_update(struct bpf_trampoline *tr) +{ + void *old_image = tr->image + ((tr->selector + 1) & 1) * PAGE_SIZE/2; + void *new_image = tr->image + (tr->selector & 1) * PAGE_SIZE/2; + struct bpf_prog *progs_to_run[BPF_MAX_TRAMP_PROGS]; + int fentry_cnt = tr->progs_cnt[BPF_TRAMP_FENTRY]; + int fexit_cnt = tr->progs_cnt[BPF_TRAMP_FEXIT]; + struct bpf_prog **progs, **fentry, **fexit; + u32 flags = BPF_TRAMP_F_RESTORE_REGS; + struct bpf_prog_aux *aux; + int err; + + if (fentry_cnt + fexit_cnt == 0) { + err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_CALL_TO_NOP, + old_image, NULL); + tr->selector = 0; + goto out; + } + + /* populate fentry progs */ + fentry = progs = progs_to_run; + hlist_for_each_entry(aux, &tr->progs_hlist[BPF_TRAMP_FENTRY], tramp_hlist) + *progs++ = aux->prog; + + /* populate fexit progs */ + fexit = progs; + hlist_for_each_entry(aux, &tr->progs_hlist[BPF_TRAMP_FEXIT], tramp_hlist) + *progs++ = aux->prog; + + if (fexit_cnt) + flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME; + + err = arch_prepare_bpf_trampoline(new_image, &tr->func.model, flags, + fentry, fentry_cnt, + fexit, fexit_cnt, + tr->func.addr); + if (err) + goto out; + + if (tr->selector) + /* progs already running at this address */ + err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_CALL_TO_CALL, + old_image, new_image); + else + /* first time registering */ + err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_NOP_TO_CALL, + NULL, new_image); + if (err) + goto out; + tr->selector++; +out: + return err; +} + +static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(enum bpf_attach_type t) +{ + switch (t) { + case BPF_TRACE_FENTRY: + return BPF_TRAMP_FENTRY; + default: + return BPF_TRAMP_FEXIT; + } +} + +int bpf_trampoline_link_prog(struct bpf_prog *prog) +{ + enum bpf_tramp_prog_type kind; + struct bpf_trampoline *tr; + int err = 0; + + tr = prog->aux->trampoline; + kind = bpf_attach_type_to_tramp(prog->expected_attach_type); + mutex_lock(&tr->mutex); + if (tr->progs_cnt[BPF_TRAMP_FENTRY] + tr->progs_cnt[BPF_TRAMP_FEXIT] + >= BPF_MAX_TRAMP_PROGS) { + err = -E2BIG; + goto out; + } + if (!hlist_unhashed(&prog->aux->tramp_hlist)) { + /* prog already linked */ + err = -EBUSY; + goto out; + } + hlist_add_head(&prog->aux->tramp_hlist, &tr->progs_hlist[kind]); + tr->progs_cnt[kind]++; + err = bpf_trampoline_update(prog->aux->trampoline); + if (err) { + hlist_del(&prog->aux->tramp_hlist); + tr->progs_cnt[kind]--; + } +out: + mutex_unlock(&tr->mutex); + return err; +} + +/* bpf_trampoline_unlink_prog() should never fail. */ +int bpf_trampoline_unlink_prog(struct bpf_prog *prog) +{ + enum bpf_tramp_prog_type kind; + struct bpf_trampoline *tr; + int err; + + tr = prog->aux->trampoline; + kind = bpf_attach_type_to_tramp(prog->expected_attach_type); + mutex_lock(&tr->mutex); + hlist_del(&prog->aux->tramp_hlist); + tr->progs_cnt[kind]--; + err = bpf_trampoline_update(prog->aux->trampoline); + mutex_unlock(&tr->mutex); + return err; +} + +void bpf_trampoline_put(struct bpf_trampoline *tr) +{ + if (!tr) + return; + mutex_lock(&trampoline_mutex); + if (!refcount_dec_and_test(&tr->refcnt)) + goto out; + WARN_ON_ONCE(mutex_is_locked(&tr->mutex)); + if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FENTRY]))) + goto out; + if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FEXIT]))) + goto out; + bpf_jit_free_exec(tr->image); + hlist_del(&tr->hlist); + kfree(tr); +out: + mutex_unlock(&trampoline_mutex); +} + +/* The logic is similar to BPF_PROG_RUN, but with explicit rcu and preempt that + * are needed for trampoline. The macro is split into + * call _bpf_prog_enter + * call prog->bpf_func + * call __bpf_prog_exit + */ +u64 notrace __bpf_prog_enter(void) +{ + u64 start = 0; + + rcu_read_lock(); + preempt_disable(); + if (static_branch_unlikely(&bpf_stats_enabled_key)) + start = sched_clock(); + return start; +} + +void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start) +{ + struct bpf_prog_stats *stats; + + if (static_branch_unlikely(&bpf_stats_enabled_key) && + /* static_key could be enabled in __bpf_prog_enter + * and disabled in __bpf_prog_exit. + * And vice versa. + * Hence check that 'start' is not zero. + */ + start) { + stats = this_cpu_ptr(prog->aux->stats); + u64_stats_update_begin(&stats->syncp); + stats->cnt++; + stats->nsecs += sched_clock() - start; + u64_stats_update_end(&stats->syncp); + } + preempt_enable(); + rcu_read_unlock(); +} + +int __weak +arch_prepare_bpf_trampoline(void *image, struct btf_func_model *m, u32 flags, + struct bpf_prog **fentry_progs, int fentry_cnt, + struct bpf_prog **fexit_progs, int fexit_cnt, + void *orig_call) +{ + return -ENOTSUPP; +} + +static int __init init_trampolines(void) +{ + int i; + + for (i = 0; i < TRAMPOLINE_TABLE_SIZE; i++) + INIT_HLIST_HEAD(&trampoline_table[i]); + return 0; +} +late_initcall(init_trampolines); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2f2374967b36..8f89cfa93e88 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9382,8 +9382,11 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) struct bpf_prog *prog = env->prog; u32 btf_id = prog->aux->attach_btf_id; const char prefix[] = "btf_trace_"; + struct bpf_trampoline *tr; const struct btf_type *t; const char *tname; + int ret = 0; + long addr; if (prog->type != BPF_PROG_TYPE_TRACING) return 0; @@ -9432,6 +9435,45 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) prog->aux->attach_func_proto = t; prog->aux->attach_btf_trace = true; return 0; + case BPF_TRACE_FENTRY: + case BPF_TRACE_FEXIT: + if (!btf_type_is_func(t)) { + verbose(env, "attach_btf_id %u is not a function\n", + btf_id); + return -EINVAL; + } + t = btf_type_by_id(btf_vmlinux, t->type); + if (!btf_type_is_func_proto(t)) + return -EINVAL; + tr = bpf_trampoline_lookup(btf_id); + if (!tr) + return -ENOMEM; + prog->aux->attach_func_name = tname; + prog->aux->attach_func_proto = t; + mutex_lock(&tr->mutex); + if (tr->func.addr) { + prog->aux->trampoline = tr; + goto out; + } + ret = btf_distill_func_proto(&env->log, btf_vmlinux, t, + tname, &tr->func.model); + if (ret < 0) + goto out; + addr = kallsyms_lookup_name(tname); + if (!addr) { + verbose(env, + "The address of function %s cannot be found\n", + tname); + ret = -ENOENT; + goto out; + } + tr->func.addr = (void *)addr; + prog->aux->trampoline = tr; +out: + mutex_unlock(&tr->mutex); + if (ret) + bpf_trampoline_put(tr); + return ret; default: return -EINVAL; } -- cgit v1.2.3 From 5b92a28aae4dd0f88778d540ecfdcdaec5a41723 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 14 Nov 2019 10:57:17 -0800 Subject: bpf: Support attaching tracing BPF program to other BPF programs Allow FENTRY/FEXIT BPF programs to attach to other BPF programs of any type including their subprograms. This feature allows snooping on input and output packets in XDP, TC programs including their return values. In order to do that the verifier needs to track types not only of vmlinux, but types of other BPF programs as well. The verifier also needs to translate uapi/linux/bpf.h types used by networking programs into kernel internal BTF types used by FENTRY/FEXIT BPF programs. In some cases LLVM optimizations can remove arguments from BPF subprograms without adjusting BTF info that LLVM backend knows. When BTF info disagrees with actual types that the verifiers sees the BPF trampoline has to fallback to conservative and treat all arguments as u64. The FENTRY/FEXIT program can still attach to such subprograms, but it won't be able to recognize pointer types like 'struct sk_buff *' and it won't be able to pass them to bpf_skb_output() for dumping packets to user space. The FENTRY/FEXIT program would need to use bpf_probe_read_kernel() instead. The BPF_PROG_LOAD command is extended with attach_prog_fd field. When it's set to zero the attach_btf_id is one vmlinux BTF type ids. When attach_prog_fd points to previously loaded BPF program the attach_btf_id is BTF type id of main function or one of its subprograms. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20191114185720.1641606-18-ast@kernel.org --- arch/x86/net/bpf_jit_comp.c | 3 +- include/linux/bpf.h | 1 + include/linux/btf.h | 1 + include/uapi/linux/bpf.h | 1 + kernel/bpf/btf.c | 70 +++++++++++++++++++++++++++++++++----- kernel/bpf/core.c | 2 ++ kernel/bpf/syscall.c | 19 ++++++++--- kernel/bpf/verifier.c | 83 +++++++++++++++++++++++++++++++++++++-------- 8 files changed, 152 insertions(+), 28 deletions(-) (limited to 'kernel/bpf/core.c') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index c06096df9118..2e586f579945 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -504,7 +504,8 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, u8 *prog; int ret; - if (!is_kernel_text((long)ip)) + if (!is_kernel_text((long)ip) && + !is_bpf_text_address((long)ip)) /* BPF trampoline in modules is not supported */ return -EINVAL; diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c70bf04726b4..5b81cde47314 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -495,6 +495,7 @@ struct bpf_prog_aux { u32 func_cnt; /* used by non-func prog as the number of func progs */ u32 func_idx; /* 0 for non-func prog, the index in func array for func prog */ u32 attach_btf_id; /* in-kernel BTF type id to attach to */ + struct bpf_prog *linked_prog; bool verifier_zext; /* Zero extensions has been inserted by verifier. */ bool offload_requested; bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */ diff --git a/include/linux/btf.h b/include/linux/btf.h index 9dee00859c5f..79d4abc2556a 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -88,6 +88,7 @@ static inline bool btf_type_is_func_proto(const struct btf_type *t) const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id); const char *btf_name_by_offset(const struct btf *btf, u32 offset); struct btf *btf_parse_vmlinux(void); +struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog); #else static inline const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 69c200e6e696..4842a134b202 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -425,6 +425,7 @@ union bpf_attr { __aligned_u64 line_info; /* line info */ __u32 line_info_cnt; /* number of bpf_line_info records */ __u32 attach_btf_id; /* in-kernel BTF type id to attach to */ + __u32 attach_prog_fd; /* 0 to attach to vmlinux */ }; struct { /* anonymous struct used by BPF_OBJ_* commands */ diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 4620267b186e..40efde5eedcb 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3530,6 +3530,20 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, struct btf *btf, return ctx_type; } +static int btf_translate_to_vmlinux(struct bpf_verifier_log *log, + struct btf *btf, + const struct btf_type *t, + enum bpf_prog_type prog_type) +{ + const struct btf_member *prog_ctx_type, *kern_ctx_type; + + prog_ctx_type = btf_get_prog_ctx_type(log, btf, t, prog_type); + if (!prog_ctx_type) + return -ENOENT; + kern_ctx_type = prog_ctx_type + 1; + return kern_ctx_type->type; +} + struct btf *btf_parse_vmlinux(void) { struct btf_verifier_env *env = NULL; @@ -3602,15 +3616,29 @@ errout: return ERR_PTR(err); } +struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog) +{ + struct bpf_prog *tgt_prog = prog->aux->linked_prog; + + if (tgt_prog) { + return tgt_prog->aux->btf; + } else { + return btf_vmlinux; + } +} + bool btf_ctx_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { const struct btf_type *t = prog->aux->attach_func_proto; + struct bpf_prog *tgt_prog = prog->aux->linked_prog; + struct btf *btf = bpf_prog_get_target_btf(prog); const char *tname = prog->aux->attach_func_name; struct bpf_verifier_log *log = info->log; const struct btf_param *args; u32 nr_args, arg; + int ret; if (off % 8) { bpf_log(log, "func '%s' offset %d is not multiple of 8\n", @@ -3619,7 +3647,8 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, } arg = off / 8; args = (const struct btf_param *)(t + 1); - nr_args = btf_type_vlen(t); + /* if (t == NULL) Fall back to default BPF prog with 5 u64 arguments */ + nr_args = t ? btf_type_vlen(t) : 5; if (prog->aux->attach_btf_trace) { /* skip first 'void *__data' argument in btf_trace_##name typedef */ args++; @@ -3628,18 +3657,24 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, if (prog->expected_attach_type == BPF_TRACE_FEXIT && arg == nr_args) { + if (!t) + /* Default prog with 5 args. 6th arg is retval. */ + return true; /* function return type */ - t = btf_type_by_id(btf_vmlinux, t->type); + t = btf_type_by_id(btf, t->type); } else if (arg >= nr_args) { bpf_log(log, "func '%s' doesn't have %d-th argument\n", tname, arg + 1); return false; } else { - t = btf_type_by_id(btf_vmlinux, args[arg].type); + if (!t) + /* Default prog with 5 args */ + return true; + t = btf_type_by_id(btf, args[arg].type); } /* skip modifiers */ while (btf_type_is_modifier(t)) - t = btf_type_by_id(btf_vmlinux, t->type); + t = btf_type_by_id(btf, t->type); if (btf_type_is_int(t)) /* accessing a scalar */ return true; @@ -3647,7 +3682,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, bpf_log(log, "func '%s' arg%d '%s' has type %s. Only pointer access is allowed\n", tname, arg, - __btf_name_by_offset(btf_vmlinux, t->name_off), + __btf_name_by_offset(btf, t->name_off), btf_kind_str[BTF_INFO_KIND(t->info)]); return false; } @@ -3662,10 +3697,19 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, info->reg_type = PTR_TO_BTF_ID; info->btf_id = t->type; - t = btf_type_by_id(btf_vmlinux, t->type); + if (tgt_prog) { + ret = btf_translate_to_vmlinux(log, btf, t, tgt_prog->type); + if (ret > 0) { + info->btf_id = ret; + return true; + } else { + return false; + } + } + t = btf_type_by_id(btf, t->type); /* skip modifiers */ while (btf_type_is_modifier(t)) - t = btf_type_by_id(btf_vmlinux, t->type); + t = btf_type_by_id(btf, t->type); if (!btf_type_is_struct(t)) { bpf_log(log, "func '%s' arg%d type %s is not a struct\n", @@ -3674,7 +3718,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, } bpf_log(log, "func '%s' arg%d has btf_id %d type %s '%s'\n", tname, arg, info->btf_id, btf_kind_str[BTF_INFO_KIND(t->info)], - __btf_name_by_offset(btf_vmlinux, t->name_off)); + __btf_name_by_offset(btf, t->name_off)); return true; } @@ -3954,6 +3998,16 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, u32 i, nargs; int ret; + if (!func) { + /* BTF function prototype doesn't match the verifier types. + * Fall back to 5 u64 args. + */ + for (i = 0; i < 5; i++) + m->arg_size[i] = 8; + m->ret_size = 8; + m->nr_args = 5; + return 0; + } args = (const struct btf_param *)(func + 1); nargs = btf_type_vlen(func); if (nargs >= MAX_BPF_FUNC_ARGS) { diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index da5a8b8e278f..b5945c3aaa8e 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2031,6 +2031,8 @@ void bpf_prog_free(struct bpf_prog *fp) { struct bpf_prog_aux *aux = fp->aux; + if (aux->linked_prog) + bpf_prog_put(aux->linked_prog); INIT_WORK(&aux->work, bpf_prog_free_deferred); schedule_work(&aux->work); } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 43ba647de720..c88c815c2154 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1577,7 +1577,7 @@ static void bpf_prog_load_fixup_attach_type(union bpf_attr *attr) static int bpf_prog_load_check_attach(enum bpf_prog_type prog_type, enum bpf_attach_type expected_attach_type, - u32 btf_id) + u32 btf_id, u32 prog_fd) { switch (prog_type) { case BPF_PROG_TYPE_TRACING: @@ -1585,7 +1585,7 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, return -EINVAL; break; default: - if (btf_id) + if (btf_id || prog_fd) return -EINVAL; break; } @@ -1636,7 +1636,7 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, } /* last field in 'union bpf_attr' used by this command */ -#define BPF_PROG_LOAD_LAST_FIELD attach_btf_id +#define BPF_PROG_LOAD_LAST_FIELD attach_prog_fd static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) { @@ -1679,7 +1679,8 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) bpf_prog_load_fixup_attach_type(attr); if (bpf_prog_load_check_attach(type, attr->expected_attach_type, - attr->attach_btf_id)) + attr->attach_btf_id, + attr->attach_prog_fd)) return -EINVAL; /* plain bpf_prog allocation */ @@ -1689,6 +1690,16 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) prog->expected_attach_type = attr->expected_attach_type; prog->aux->attach_btf_id = attr->attach_btf_id; + if (attr->attach_prog_fd) { + struct bpf_prog *tgt_prog; + + tgt_prog = bpf_prog_get(attr->attach_prog_fd); + if (IS_ERR(tgt_prog)) { + err = PTR_ERR(tgt_prog); + goto free_prog_nouncharge; + } + prog->aux->linked_prog = tgt_prog; + } prog->aux->offload_requested = !!attr->prog_ifindex; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 11910149ca2f..e9dc95a18d44 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9390,13 +9390,17 @@ static void print_verification_stats(struct bpf_verifier_env *env) static int check_attach_btf_id(struct bpf_verifier_env *env) { struct bpf_prog *prog = env->prog; + struct bpf_prog *tgt_prog = prog->aux->linked_prog; u32 btf_id = prog->aux->attach_btf_id; const char prefix[] = "btf_trace_"; + int ret = 0, subprog = -1, i; struct bpf_trampoline *tr; const struct btf_type *t; + bool conservative = true; const char *tname; - int ret = 0; + struct btf *btf; long addr; + u64 key; if (prog->type != BPF_PROG_TYPE_TRACING) return 0; @@ -9405,19 +9409,47 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) verbose(env, "Tracing programs must provide btf_id\n"); return -EINVAL; } - t = btf_type_by_id(btf_vmlinux, btf_id); + btf = bpf_prog_get_target_btf(prog); + if (!btf) { + verbose(env, + "FENTRY/FEXIT program can only be attached to another program annotated with BTF\n"); + return -EINVAL; + } + t = btf_type_by_id(btf, btf_id); if (!t) { verbose(env, "attach_btf_id %u is invalid\n", btf_id); return -EINVAL; } - tname = btf_name_by_offset(btf_vmlinux, t->name_off); + tname = btf_name_by_offset(btf, t->name_off); if (!tname) { verbose(env, "attach_btf_id %u doesn't have a name\n", btf_id); return -EINVAL; } + if (tgt_prog) { + struct bpf_prog_aux *aux = tgt_prog->aux; + + for (i = 0; i < aux->func_info_cnt; i++) + if (aux->func_info[i].type_id == btf_id) { + subprog = i; + break; + } + if (subprog == -1) { + verbose(env, "Subprog %s doesn't exist\n", tname); + return -EINVAL; + } + conservative = aux->func_info_aux[subprog].unreliable; + key = ((u64)aux->id) << 32 | btf_id; + } else { + key = btf_id; + } switch (prog->expected_attach_type) { case BPF_TRACE_RAW_TP: + if (tgt_prog) { + verbose(env, + "Only FENTRY/FEXIT progs are attachable to another BPF prog\n"); + return -EINVAL; + } if (!btf_type_is_typedef(t)) { verbose(env, "attach_btf_id %u is not a typedef\n", btf_id); @@ -9429,11 +9461,11 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) return -EINVAL; } tname += sizeof(prefix) - 1; - t = btf_type_by_id(btf_vmlinux, t->type); + t = btf_type_by_id(btf, t->type); if (!btf_type_is_ptr(t)) /* should never happen in valid vmlinux build */ return -EINVAL; - t = btf_type_by_id(btf_vmlinux, t->type); + t = btf_type_by_id(btf, t->type); if (!btf_type_is_func_proto(t)) /* should never happen in valid vmlinux build */ return -EINVAL; @@ -9452,30 +9484,51 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) btf_id); return -EINVAL; } - t = btf_type_by_id(btf_vmlinux, t->type); + t = btf_type_by_id(btf, t->type); if (!btf_type_is_func_proto(t)) return -EINVAL; - tr = bpf_trampoline_lookup(btf_id); + tr = bpf_trampoline_lookup(key); if (!tr) return -ENOMEM; prog->aux->attach_func_name = tname; + /* t is either vmlinux type or another program's type */ prog->aux->attach_func_proto = t; mutex_lock(&tr->mutex); if (tr->func.addr) { prog->aux->trampoline = tr; goto out; } - ret = btf_distill_func_proto(&env->log, btf_vmlinux, t, + if (tgt_prog && conservative) { + prog->aux->attach_func_proto = NULL; + t = NULL; + } + ret = btf_distill_func_proto(&env->log, btf, t, tname, &tr->func.model); if (ret < 0) goto out; - addr = kallsyms_lookup_name(tname); - if (!addr) { - verbose(env, - "The address of function %s cannot be found\n", - tname); - ret = -ENOENT; - goto out; + if (tgt_prog) { + if (!tgt_prog->jited) { + /* for now */ + verbose(env, "Can trace only JITed BPF progs\n"); + ret = -EINVAL; + goto out; + } + if (tgt_prog->type == BPF_PROG_TYPE_TRACING) { + /* prevent cycles */ + verbose(env, "Cannot recursively attach\n"); + ret = -EINVAL; + goto out; + } + addr = (long) tgt_prog->aux->func[subprog]->bpf_func; + } else { + addr = kallsyms_lookup_name(tname); + if (!addr) { + verbose(env, + "The address of function %s cannot be found\n", + tname); + ret = -ENOENT; + goto out; + } } tr->func.addr = (void *)addr; prog->aux->trampoline = tr; -- cgit v1.2.3 From 6332be04c039a72fca32ed0a4265bac58d606bb6 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 22 Nov 2019 21:07:55 +0100 Subject: bpf: Move bpf_free_used_maps into sleepable section We later on are going to need a sleepable context as opposed to plain RCU callback in order to untrack programs we need to poke at runtime and tracking as well as image update is performed under mutex. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/09823b1d5262876e9b83a8e75df04cf0467357a4.1574452833.git.daniel@iogearbox.net --- include/linux/bpf.h | 4 ++++ kernel/bpf/core.c | 23 +++++++++++++++++++++++ kernel/bpf/syscall.c | 20 -------------------- 3 files changed, 27 insertions(+), 20 deletions(-) (limited to 'kernel/bpf/core.c') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 7978b617caa8..561b920f0bf7 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1031,6 +1031,10 @@ static inline int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, { return -ENOTSUPP; } + +static inline void bpf_map_put(struct bpf_map *map) +{ +} #endif /* CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get_type(u32 ufd, diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index b5945c3aaa8e..0e825c164f1a 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2003,12 +2003,35 @@ int bpf_prog_array_copy_info(struct bpf_prog_array *array, : 0; } +static void bpf_free_cgroup_storage(struct bpf_prog_aux *aux) +{ + enum bpf_cgroup_storage_type stype; + + for_each_cgroup_storage_type(stype) { + if (!aux->cgroup_storage[stype]) + continue; + bpf_cgroup_storage_release(aux->prog, + aux->cgroup_storage[stype]); + } +} + +static void bpf_free_used_maps(struct bpf_prog_aux *aux) +{ + int i; + + bpf_free_cgroup_storage(aux); + for (i = 0; i < aux->used_map_cnt; i++) + bpf_map_put(aux->used_maps[i]); + kfree(aux->used_maps); +} + static void bpf_prog_free_deferred(struct work_struct *work) { struct bpf_prog_aux *aux; int i; aux = container_of(work, struct bpf_prog_aux, work); + bpf_free_used_maps(aux); if (bpf_prog_is_dev_bound(aux)) bpf_prog_offload_destroy(aux->prog); #ifdef CONFIG_PERF_EVENTS diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 4ae52eb05f41..373778da8489 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1302,25 +1302,6 @@ static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) return 0; } -/* drop refcnt on maps used by eBPF program and free auxilary data */ -static void free_used_maps(struct bpf_prog_aux *aux) -{ - enum bpf_cgroup_storage_type stype; - int i; - - for_each_cgroup_storage_type(stype) { - if (!aux->cgroup_storage[stype]) - continue; - bpf_cgroup_storage_release(aux->prog, - aux->cgroup_storage[stype]); - } - - for (i = 0; i < aux->used_map_cnt; i++) - bpf_map_put(aux->used_maps[i]); - - kfree(aux->used_maps); -} - int __bpf_prog_charge(struct user_struct *user, u32 pages) { unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; @@ -1415,7 +1396,6 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu) kvfree(aux->func_info); kfree(aux->func_info_aux); - free_used_maps(aux); bpf_prog_uncharge_memlock(aux->prog); security_bpf_prog_free(aux); bpf_prog_free(aux->prog); -- cgit v1.2.3 From 2beee5f57441413b64a9c2bd657e17beabb98d1c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 22 Nov 2019 21:07:56 +0100 Subject: bpf: Move owner type, jited info into array auxiliary data We're going to extend this with further information which is only relevant for prog array at this point. Given this info is not used in critical path, move it into its own structure such that the main array map structure can be kept on diet. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/b9ddccdb0f6f7026489ee955f16c96381e1e7238.1574452833.git.daniel@iogearbox.net --- include/linux/bpf.h | 18 +++++++++++------- kernel/bpf/arraymap.c | 32 ++++++++++++++++++++++++++++++-- kernel/bpf/core.c | 11 +++++------ kernel/bpf/map_in_map.c | 5 ++--- kernel/bpf/syscall.c | 16 ++++++---------- 5 files changed, 54 insertions(+), 28 deletions(-) (limited to 'kernel/bpf/core.c') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 561b920f0bf7..c3b29061284e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -560,17 +560,21 @@ struct bpf_prog_aux { }; }; +struct bpf_array_aux { + /* 'Ownership' of prog array is claimed by the first program that + * is going to use this map or by the first program which FD is + * stored in the map to make sure that all callers and callees have + * the same prog type and JITed flag. + */ + enum bpf_prog_type type; + bool jited; +}; + struct bpf_array { struct bpf_map map; u32 elem_size; u32 index_mask; - /* 'ownership' of prog_array is claimed by the first program that - * is going to use this map or by the first program which FD is stored - * in the map to make sure that all callers and callees have the same - * prog_type and JITed flag - */ - enum bpf_prog_type owner_prog_type; - bool owner_jited; + struct bpf_array_aux *aux; union { char value[0] __aligned(8); void *ptrs[0] __aligned(8); diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 633c8c701ff6..57da950ee55b 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -671,10 +671,38 @@ static void prog_array_map_seq_show_elem(struct bpf_map *map, void *key, rcu_read_unlock(); } +static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr) +{ + struct bpf_array_aux *aux; + struct bpf_map *map; + + aux = kzalloc(sizeof(*aux), GFP_KERNEL); + if (!aux) + return ERR_PTR(-ENOMEM); + + map = array_map_alloc(attr); + if (IS_ERR(map)) { + kfree(aux); + return map; + } + + container_of(map, struct bpf_array, map)->aux = aux; + return map; +} + +static void prog_array_map_free(struct bpf_map *map) +{ + struct bpf_array_aux *aux; + + aux = container_of(map, struct bpf_array, map)->aux; + kfree(aux); + fd_array_map_free(map); +} + const struct bpf_map_ops prog_array_map_ops = { .map_alloc_check = fd_array_map_alloc_check, - .map_alloc = array_map_alloc, - .map_free = fd_array_map_free, + .map_alloc = prog_array_map_alloc, + .map_free = prog_array_map_free, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, .map_delete_elem = fd_array_map_delete_elem, diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 0e825c164f1a..07af9c1d9cf1 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1691,18 +1691,17 @@ bool bpf_prog_array_compatible(struct bpf_array *array, if (fp->kprobe_override) return false; - if (!array->owner_prog_type) { + if (!array->aux->type) { /* There's no owner yet where we could check for * compatibility. */ - array->owner_prog_type = fp->type; - array->owner_jited = fp->jited; - + array->aux->type = fp->type; + array->aux->jited = fp->jited; return true; } - return array->owner_prog_type == fp->type && - array->owner_jited == fp->jited; + return array->aux->type == fp->type && + array->aux->jited == fp->jited; } static int bpf_check_tail_call(const struct bpf_prog *fp) diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 4cbe987be35b..5e9366b33f0f 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -17,9 +17,8 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) if (IS_ERR(inner_map)) return inner_map; - /* prog_array->owner_prog_type and owner_jited - * is a runtime binding. Doing static check alone - * in the verifier is not enough. + /* prog_array->aux->{type,jited} is a runtime binding. + * Doing static check alone in the verifier is not enough. */ if (inner_map->map_type == BPF_MAP_TYPE_PROG_ARRAY || inner_map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE || diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 373778da8489..b904d56ec686 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -389,13 +389,12 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) { const struct bpf_map *map = filp->private_data; const struct bpf_array *array; - u32 owner_prog_type = 0; - u32 owner_jited = 0; + u32 type = 0, jited = 0; if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) { array = container_of(map, struct bpf_array, map); - owner_prog_type = array->owner_prog_type; - owner_jited = array->owner_jited; + type = array->aux->type; + jited = array->aux->jited; } seq_printf(m, @@ -415,12 +414,9 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) map->memory.pages * 1ULL << PAGE_SHIFT, map->id, READ_ONCE(map->frozen)); - - if (owner_prog_type) { - seq_printf(m, "owner_prog_type:\t%u\n", - owner_prog_type); - seq_printf(m, "owner_jited:\t%u\n", - owner_jited); + if (type) { + seq_printf(m, "owner_prog_type:\t%u\n", type); + seq_printf(m, "owner_jited:\t%u\n", jited); } } #endif -- cgit v1.2.3 From a66886fe6c24ebeeb6dc10fbd9b75158029eacf7 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 22 Nov 2019 21:07:57 +0100 Subject: bpf: Add initial poke descriptor table for jit images Add initial poke table data structures and management to the BPF prog that can later be used by JITs. Also add an instance of poke specific data for tail call maps; plan for later work is to extend this also for BPF static keys. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/1db285ec2ea4207ee0455b3f8e191a4fc58b9ade.1574452833.git.daniel@iogearbox.net --- include/linux/bpf.h | 20 ++++++++++++++++++++ include/linux/filter.h | 10 ++++++++++ kernel/bpf/core.c | 34 ++++++++++++++++++++++++++++++++++ 3 files changed, 64 insertions(+) (limited to 'kernel/bpf/core.c') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c3b29061284e..312983bf7faa 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -488,6 +488,24 @@ struct bpf_func_info_aux { bool unreliable; }; +enum bpf_jit_poke_reason { + BPF_POKE_REASON_TAIL_CALL, +}; + +/* Descriptor of pokes pointing /into/ the JITed image. */ +struct bpf_jit_poke_descriptor { + void *ip; + union { + struct { + struct bpf_map *map; + u32 key; + } tail_call; + }; + bool ip_stable; + u8 adj_off; + u16 reason; +}; + struct bpf_prog_aux { atomic64_t refcnt; u32 used_map_cnt; @@ -513,6 +531,8 @@ struct bpf_prog_aux { const char *attach_func_name; struct bpf_prog **func; void *jit_data; /* JIT specific data. arch dependent */ + struct bpf_jit_poke_descriptor *poke_tab; + u32 size_poke_tab; struct latch_tree_node ksym_tnode; struct list_head ksym_lnode; const struct bpf_prog_ops *ops; diff --git a/include/linux/filter.h b/include/linux/filter.h index ad80e9c6111c..796b60d8cc6c 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -952,6 +952,9 @@ void *bpf_jit_alloc_exec(unsigned long size); void bpf_jit_free_exec(void *addr); void bpf_jit_free(struct bpf_prog *fp); +int bpf_jit_add_poke_descriptor(struct bpf_prog *prog, + struct bpf_jit_poke_descriptor *poke); + int bpf_jit_get_func_addr(const struct bpf_prog *prog, const struct bpf_insn *insn, bool extra_pass, u64 *func_addr, bool *func_addr_fixed); @@ -1055,6 +1058,13 @@ static inline bool bpf_prog_ebpf_jited(const struct bpf_prog *fp) return false; } +static inline int +bpf_jit_add_poke_descriptor(struct bpf_prog *prog, + struct bpf_jit_poke_descriptor *poke) +{ + return -ENOTSUPP; +} + static inline void bpf_jit_free(struct bpf_prog *fp) { bpf_prog_unlock_free(fp); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 07af9c1d9cf1..608b7085e0c9 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -256,6 +256,7 @@ void __bpf_prog_free(struct bpf_prog *fp) { if (fp->aux) { free_percpu(fp->aux->stats); + kfree(fp->aux->poke_tab); kfree(fp->aux); } vfree(fp); @@ -756,6 +757,39 @@ int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, return ret; } +int bpf_jit_add_poke_descriptor(struct bpf_prog *prog, + struct bpf_jit_poke_descriptor *poke) +{ + struct bpf_jit_poke_descriptor *tab = prog->aux->poke_tab; + static const u32 poke_tab_max = 1024; + u32 slot = prog->aux->size_poke_tab; + u32 size = slot + 1; + + if (size > poke_tab_max) + return -ENOSPC; + if (poke->ip || poke->ip_stable || poke->adj_off) + return -EINVAL; + + switch (poke->reason) { + case BPF_POKE_REASON_TAIL_CALL: + if (!poke->tail_call.map) + return -EINVAL; + break; + default: + return -EINVAL; + } + + tab = krealloc(tab, size * sizeof(*poke), GFP_KERNEL); + if (!tab) + return -ENOMEM; + + memcpy(&tab[slot], poke, sizeof(*poke)); + prog->aux->size_poke_tab = size; + prog->aux->poke_tab = tab; + + return slot; +} + static atomic_long_t bpf_jit_current; /* Can be overridden by an arch's JIT compiler if it has a custom, -- cgit v1.2.3 From da765a2f599304a81a25e77908d1790414ecdbb6 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 22 Nov 2019 21:07:58 +0100 Subject: bpf: Add poke dependency tracking for prog array maps This work adds program tracking to prog array maps. This is needed such that upon prog array updates/deletions we can fix up all programs which make use of this tail call map. We add ops->map_poke_{un,}track() helpers to maps to maintain the list of programs and ops->map_poke_run() for triggering the actual update. bpf_array_aux is extended to contain the list head and poke_mutex in order to serialize program patching during updates/deletions. bpf_free_used_maps() will untrack the program shortly before dropping the reference to the map. For clearing out the prog array once all urefs are dropped we need to use schedule_work() to have a sleepable context. The prog_array_map_poke_run() is triggered during updates/deletions and walks the maintained prog list. It checks in their poke_tabs whether the map and key is matching and runs the actual bpf_arch_text_poke() for patching in the nop or new jmp location. Depending on the type of update, we use one of BPF_MOD_{NOP_TO_JUMP,JUMP_TO_NOP,JUMP_TO_JUMP}. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/1fb364bb3c565b3e415d5ea348f036ff379e779d.1574452833.git.daniel@iogearbox.net --- include/linux/bpf.h | 12 ++++ kernel/bpf/arraymap.c | 183 ++++++++++++++++++++++++++++++++++++++++++++++++-- kernel/bpf/core.c | 9 ++- kernel/bpf/syscall.c | 20 ++++-- 4 files changed, 212 insertions(+), 12 deletions(-) (limited to 'kernel/bpf/core.c') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 312983bf7faa..c2f07fd410c1 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -22,6 +22,7 @@ struct bpf_verifier_env; struct bpf_verifier_log; struct perf_event; struct bpf_prog; +struct bpf_prog_aux; struct bpf_map; struct sock; struct seq_file; @@ -64,6 +65,12 @@ struct bpf_map_ops { const struct btf_type *key_type, const struct btf_type *value_type); + /* Prog poke tracking helpers. */ + int (*map_poke_track)(struct bpf_map *map, struct bpf_prog_aux *aux); + void (*map_poke_untrack)(struct bpf_map *map, struct bpf_prog_aux *aux); + void (*map_poke_run)(struct bpf_map *map, u32 key, struct bpf_prog *old, + struct bpf_prog *new); + /* Direct value access helpers. */ int (*map_direct_value_addr)(const struct bpf_map *map, u64 *imm, u32 off); @@ -588,6 +595,11 @@ struct bpf_array_aux { */ enum bpf_prog_type type; bool jited; + /* Programs with direct jumps into programs part of this array. */ + struct list_head poke_progs; + struct bpf_map *map; + struct mutex poke_mutex; + struct work_struct work; }; struct bpf_array { diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 57da950ee55b..58bdf5fd24cc 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -586,10 +586,17 @@ int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file, if (IS_ERR(new_ptr)) return PTR_ERR(new_ptr); - old_ptr = xchg(array->ptrs + index, new_ptr); + if (map->ops->map_poke_run) { + mutex_lock(&array->aux->poke_mutex); + old_ptr = xchg(array->ptrs + index, new_ptr); + map->ops->map_poke_run(map, index, old_ptr, new_ptr); + mutex_unlock(&array->aux->poke_mutex); + } else { + old_ptr = xchg(array->ptrs + index, new_ptr); + } + if (old_ptr) map->ops->map_fd_put_ptr(old_ptr); - return 0; } @@ -602,7 +609,15 @@ static int fd_array_map_delete_elem(struct bpf_map *map, void *key) if (index >= array->map.max_entries) return -E2BIG; - old_ptr = xchg(array->ptrs + index, NULL); + if (map->ops->map_poke_run) { + mutex_lock(&array->aux->poke_mutex); + old_ptr = xchg(array->ptrs + index, NULL); + map->ops->map_poke_run(map, index, old_ptr, NULL); + mutex_unlock(&array->aux->poke_mutex); + } else { + old_ptr = xchg(array->ptrs + index, NULL); + } + if (old_ptr) { map->ops->map_fd_put_ptr(old_ptr); return 0; @@ -671,6 +686,152 @@ static void prog_array_map_seq_show_elem(struct bpf_map *map, void *key, rcu_read_unlock(); } +struct prog_poke_elem { + struct list_head list; + struct bpf_prog_aux *aux; +}; + +static int prog_array_map_poke_track(struct bpf_map *map, + struct bpf_prog_aux *prog_aux) +{ + struct prog_poke_elem *elem; + struct bpf_array_aux *aux; + int ret = 0; + + aux = container_of(map, struct bpf_array, map)->aux; + mutex_lock(&aux->poke_mutex); + list_for_each_entry(elem, &aux->poke_progs, list) { + if (elem->aux == prog_aux) + goto out; + } + + elem = kmalloc(sizeof(*elem), GFP_KERNEL); + if (!elem) { + ret = -ENOMEM; + goto out; + } + + INIT_LIST_HEAD(&elem->list); + /* We must track the program's aux info at this point in time + * since the program pointer itself may not be stable yet, see + * also comment in prog_array_map_poke_run(). + */ + elem->aux = prog_aux; + + list_add_tail(&elem->list, &aux->poke_progs); +out: + mutex_unlock(&aux->poke_mutex); + return ret; +} + +static void prog_array_map_poke_untrack(struct bpf_map *map, + struct bpf_prog_aux *prog_aux) +{ + struct prog_poke_elem *elem, *tmp; + struct bpf_array_aux *aux; + + aux = container_of(map, struct bpf_array, map)->aux; + mutex_lock(&aux->poke_mutex); + list_for_each_entry_safe(elem, tmp, &aux->poke_progs, list) { + if (elem->aux == prog_aux) { + list_del_init(&elem->list); + kfree(elem); + break; + } + } + mutex_unlock(&aux->poke_mutex); +} + +static void prog_array_map_poke_run(struct bpf_map *map, u32 key, + struct bpf_prog *old, + struct bpf_prog *new) +{ + enum bpf_text_poke_type type; + struct prog_poke_elem *elem; + struct bpf_array_aux *aux; + + if (!old && new) + type = BPF_MOD_NOP_TO_JUMP; + else if (old && !new) + type = BPF_MOD_JUMP_TO_NOP; + else if (old && new) + type = BPF_MOD_JUMP_TO_JUMP; + else + return; + + aux = container_of(map, struct bpf_array, map)->aux; + WARN_ON_ONCE(!mutex_is_locked(&aux->poke_mutex)); + + list_for_each_entry(elem, &aux->poke_progs, list) { + struct bpf_jit_poke_descriptor *poke; + int i, ret; + + for (i = 0; i < elem->aux->size_poke_tab; i++) { + poke = &elem->aux->poke_tab[i]; + + /* Few things to be aware of: + * + * 1) We can only ever access aux in this context, but + * not aux->prog since it might not be stable yet and + * there could be danger of use after free otherwise. + * 2) Initially when we start tracking aux, the program + * is not JITed yet and also does not have a kallsyms + * entry. We skip these as poke->ip_stable is not + * active yet. The JIT will do the final fixup before + * setting it stable. The various poke->ip_stable are + * successively activated, so tail call updates can + * arrive from here while JIT is still finishing its + * final fixup for non-activated poke entries. + * 3) On program teardown, the program's kallsym entry gets + * removed out of RCU callback, but we can only untrack + * from sleepable context, therefore bpf_arch_text_poke() + * might not see that this is in BPF text section and + * bails out with -EINVAL. As these are unreachable since + * RCU grace period already passed, we simply skip them. + * 4) Also programs reaching refcount of zero while patching + * is in progress is okay since we're protected under + * poke_mutex and untrack the programs before the JIT + * buffer is freed. When we're still in the middle of + * patching and suddenly kallsyms entry of the program + * gets evicted, we just skip the rest which is fine due + * to point 3). + * 5) Any other error happening below from bpf_arch_text_poke() + * is a unexpected bug. + */ + if (!READ_ONCE(poke->ip_stable)) + continue; + if (poke->reason != BPF_POKE_REASON_TAIL_CALL) + continue; + if (poke->tail_call.map != map || + poke->tail_call.key != key) + continue; + + ret = bpf_arch_text_poke(poke->ip, type, + old ? (u8 *)old->bpf_func + + poke->adj_off : NULL, + new ? (u8 *)new->bpf_func + + poke->adj_off : NULL); + BUG_ON(ret < 0 && ret != -EINVAL); + } + } +} + +static void prog_array_map_clear_deferred(struct work_struct *work) +{ + struct bpf_map *map = container_of(work, struct bpf_array_aux, + work)->map; + bpf_fd_array_map_clear(map); + bpf_map_put(map); +} + +static void prog_array_map_clear(struct bpf_map *map) +{ + struct bpf_array_aux *aux = container_of(map, struct bpf_array, + map)->aux; + bpf_map_inc(map); + schedule_work(&aux->work); +} + static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr) { struct bpf_array_aux *aux; @@ -680,6 +841,10 @@ static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr) if (!aux) return ERR_PTR(-ENOMEM); + INIT_WORK(&aux->work, prog_array_map_clear_deferred); + INIT_LIST_HEAD(&aux->poke_progs); + mutex_init(&aux->poke_mutex); + map = array_map_alloc(attr); if (IS_ERR(map)) { kfree(aux); @@ -687,14 +852,21 @@ static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr) } container_of(map, struct bpf_array, map)->aux = aux; + aux->map = map; + return map; } static void prog_array_map_free(struct bpf_map *map) { + struct prog_poke_elem *elem, *tmp; struct bpf_array_aux *aux; aux = container_of(map, struct bpf_array, map)->aux; + list_for_each_entry_safe(elem, tmp, &aux->poke_progs, list) { + list_del_init(&elem->list); + kfree(elem); + } kfree(aux); fd_array_map_free(map); } @@ -703,13 +875,16 @@ const struct bpf_map_ops prog_array_map_ops = { .map_alloc_check = fd_array_map_alloc_check, .map_alloc = prog_array_map_alloc, .map_free = prog_array_map_free, + .map_poke_track = prog_array_map_poke_track, + .map_poke_untrack = prog_array_map_poke_untrack, + .map_poke_run = prog_array_map_poke_run, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = prog_fd_array_get_ptr, .map_fd_put_ptr = prog_fd_array_put_ptr, .map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem, - .map_release_uref = bpf_fd_array_map_clear, + .map_release_uref = prog_array_map_clear, .map_seq_show_elem = prog_array_map_seq_show_elem, }; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 608b7085e0c9..49e32acad7d8 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2050,11 +2050,16 @@ static void bpf_free_cgroup_storage(struct bpf_prog_aux *aux) static void bpf_free_used_maps(struct bpf_prog_aux *aux) { + struct bpf_map *map; int i; bpf_free_cgroup_storage(aux); - for (i = 0; i < aux->used_map_cnt; i++) - bpf_map_put(aux->used_maps[i]); + for (i = 0; i < aux->used_map_cnt; i++) { + map = aux->used_maps[i]; + if (map->ops->map_poke_untrack) + map->ops->map_poke_untrack(map, aux); + bpf_map_put(map); + } kfree(aux->used_maps); } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b904d56ec686..e3461ec59570 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -25,12 +25,13 @@ #include #include -#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \ - (map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ - (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ - (map)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) +#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ + (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ + (map)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) +#define IS_FD_PROG_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY) #define IS_FD_HASH(map) ((map)->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) -#define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_HASH(map)) +#define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map) || \ + IS_FD_HASH(map)) #define BPF_OBJ_FLAG_MASK (BPF_F_RDONLY | BPF_F_WRONLY) @@ -877,7 +878,7 @@ static int map_lookup_elem(union bpf_attr *attr) err = bpf_percpu_cgroup_storage_copy(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { err = bpf_stackmap_copy(map, key, value); - } else if (IS_FD_ARRAY(map)) { + } else if (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map)) { err = bpf_fd_array_map_lookup_elem(map, key, value); } else if (IS_FD_HASH(map)) { err = bpf_fd_htab_map_lookup_elem(map, key, value); @@ -1004,6 +1005,10 @@ static int map_update_elem(union bpf_attr *attr) map->map_type == BPF_MAP_TYPE_SOCKMAP) { err = map->ops->map_update_elem(map, key, value, attr->flags); goto out; + } else if (IS_FD_PROG_ARRAY(map)) { + err = bpf_fd_array_map_update_elem(map, f.file, key, value, + attr->flags); + goto out; } /* must increment bpf_prog_active to avoid kprobe+bpf triggering from @@ -1086,6 +1091,9 @@ static int map_delete_elem(union bpf_attr *attr) if (bpf_map_is_dev_bound(map)) { err = bpf_map_offload_delete_elem(map, key); goto out; + } else if (IS_FD_PROG_ARRAY(map)) { + err = map->ops->map_delete_elem(map, key); + goto out; } preempt_disable(); -- cgit v1.2.3 From a2ea07465c8d7984cc6b8b1f0b3324f9b138094a Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 16 Dec 2019 17:49:00 +0100 Subject: bpf: Fix missing prog untrack in release_maps Commit da765a2f5993 ("bpf: Add poke dependency tracking for prog array maps") wrongly assumed that in case of prog load errors, we're cleaning up all program tracking via bpf_free_used_maps(). However, it can happen that we're still at the point where we didn't copy map pointers into the prog's aux section such that env->prog->aux->used_maps is still zero, running into a UAF. In such case, the verifier has similar release_maps() helper that drops references to used maps from its env. Consolidate the release code into __bpf_free_used_maps() and call it from all sides to fix it. Fixes: da765a2f5993 ("bpf: Add poke dependency tracking for prog array maps") Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/1c2909484ca524ae9f55109b06f22b6213e76376.1576514756.git.daniel@iogearbox.net --- include/linux/bpf.h | 2 ++ kernel/bpf/core.c | 14 ++++++++++---- kernel/bpf/verifier.c | 14 ++------------ 3 files changed, 14 insertions(+), 16 deletions(-) (limited to 'kernel/bpf/core.c') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index ac7de5291509..085a59afba85 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -818,6 +818,8 @@ struct bpf_prog * __must_check bpf_prog_inc_not_zero(struct bpf_prog *prog); void bpf_prog_put(struct bpf_prog *prog); int __bpf_prog_charge(struct user_struct *user, u32 pages); void __bpf_prog_uncharge(struct user_struct *user, u32 pages); +void __bpf_free_used_maps(struct bpf_prog_aux *aux, + struct bpf_map **used_maps, u32 len); void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock); void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 49e32acad7d8..6231858df723 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2048,18 +2048,24 @@ static void bpf_free_cgroup_storage(struct bpf_prog_aux *aux) } } -static void bpf_free_used_maps(struct bpf_prog_aux *aux) +void __bpf_free_used_maps(struct bpf_prog_aux *aux, + struct bpf_map **used_maps, u32 len) { struct bpf_map *map; - int i; + u32 i; bpf_free_cgroup_storage(aux); - for (i = 0; i < aux->used_map_cnt; i++) { - map = aux->used_maps[i]; + for (i = 0; i < len; i++) { + map = used_maps[i]; if (map->ops->map_poke_untrack) map->ops->map_poke_untrack(map, aux); bpf_map_put(map); } +} + +static void bpf_free_used_maps(struct bpf_prog_aux *aux) +{ + __bpf_free_used_maps(aux, aux->used_maps, aux->used_map_cnt); kfree(aux->used_maps); } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 034ef81f935b..a1acdce77070 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8298,18 +8298,8 @@ next_insn: /* drop refcnt of maps used by the rejected program */ static void release_maps(struct bpf_verifier_env *env) { - enum bpf_cgroup_storage_type stype; - int i; - - for_each_cgroup_storage_type(stype) { - if (!env->prog->aux->cgroup_storage[stype]) - continue; - bpf_cgroup_storage_release(env->prog, - env->prog->aux->cgroup_storage[stype]); - } - - for (i = 0; i < env->used_map_cnt; i++) - bpf_map_put(env->used_maps[i]); + __bpf_free_used_maps(env->prog->aux, env->used_maps, + env->used_map_cnt); } /* convert pseudo BPF_LD_IMM64 into generic BPF_LD_IMM64 */ -- cgit v1.2.3 From e47304232b373362228bf233f17bd12b11c9aafc Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 17 Dec 2019 13:28:16 +0100 Subject: bpf: Fix cgroup local storage prog tracking Recently noticed that we're tracking programs related to local storage maps through their prog pointer. This is a wrong assumption since the prog pointer can still change throughout the verification process, for example, whenever bpf_patch_insn_single() is called. Therefore, the prog pointer that was assigned via bpf_cgroup_storage_assign() is not guaranteed to be the same as we pass in bpf_cgroup_storage_release() and the map would therefore remain in busy state forever. Fix this by using the prog's aux pointer which is stable throughout verification and beyond. Fixes: de9cbbaadba5 ("bpf: introduce cgroup storage maps") Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Cc: Roman Gushchin Cc: Martin KaFai Lau Link: https://lore.kernel.org/bpf/1471c69eca3022218666f909bc927a92388fd09e.1576580332.git.daniel@iogearbox.net --- include/linux/bpf-cgroup.h | 8 ++++---- kernel/bpf/core.c | 3 +-- kernel/bpf/local_storage.c | 24 ++++++++++++------------ kernel/bpf/verifier.c | 2 +- 4 files changed, 18 insertions(+), 19 deletions(-) (limited to 'kernel/bpf/core.c') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 169fd25f6bc2..9be71c195d74 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -157,8 +157,8 @@ void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage, struct cgroup *cgroup, enum bpf_attach_type type); void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage); -int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *map); -void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *map); +int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *map); +void bpf_cgroup_storage_release(struct bpf_prog_aux *aux, struct bpf_map *map); int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key, void *value); int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, @@ -360,9 +360,9 @@ static inline int cgroup_bpf_prog_query(const union bpf_attr *attr, static inline void bpf_cgroup_storage_set( struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]) {} -static inline int bpf_cgroup_storage_assign(struct bpf_prog *prog, +static inline int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *map) { return 0; } -static inline void bpf_cgroup_storage_release(struct bpf_prog *prog, +static inline void bpf_cgroup_storage_release(struct bpf_prog_aux *aux, struct bpf_map *map) {} static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc( struct bpf_prog *prog, enum bpf_cgroup_storage_type stype) { return NULL; } diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 6231858df723..af6b738cf435 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2043,8 +2043,7 @@ static void bpf_free_cgroup_storage(struct bpf_prog_aux *aux) for_each_cgroup_storage_type(stype) { if (!aux->cgroup_storage[stype]) continue; - bpf_cgroup_storage_release(aux->prog, - aux->cgroup_storage[stype]); + bpf_cgroup_storage_release(aux, aux->cgroup_storage[stype]); } } diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 2ba750725cb2..6bf605dd4b94 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -20,7 +20,7 @@ struct bpf_cgroup_storage_map { struct bpf_map map; spinlock_t lock; - struct bpf_prog *prog; + struct bpf_prog_aux *aux; struct rb_root root; struct list_head list; }; @@ -420,7 +420,7 @@ const struct bpf_map_ops cgroup_storage_map_ops = { .map_seq_show_elem = cgroup_storage_seq_show_elem, }; -int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *_map) +int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *_map) { enum bpf_cgroup_storage_type stype = cgroup_storage_type(_map); struct bpf_cgroup_storage_map *map = map_to_storage(_map); @@ -428,14 +428,14 @@ int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *_map) spin_lock_bh(&map->lock); - if (map->prog && map->prog != prog) + if (map->aux && map->aux != aux) goto unlock; - if (prog->aux->cgroup_storage[stype] && - prog->aux->cgroup_storage[stype] != _map) + if (aux->cgroup_storage[stype] && + aux->cgroup_storage[stype] != _map) goto unlock; - map->prog = prog; - prog->aux->cgroup_storage[stype] = _map; + map->aux = aux; + aux->cgroup_storage[stype] = _map; ret = 0; unlock: spin_unlock_bh(&map->lock); @@ -443,16 +443,16 @@ unlock: return ret; } -void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *_map) +void bpf_cgroup_storage_release(struct bpf_prog_aux *aux, struct bpf_map *_map) { enum bpf_cgroup_storage_type stype = cgroup_storage_type(_map); struct bpf_cgroup_storage_map *map = map_to_storage(_map); spin_lock_bh(&map->lock); - if (map->prog == prog) { - WARN_ON(prog->aux->cgroup_storage[stype] != _map); - map->prog = NULL; - prog->aux->cgroup_storage[stype] = NULL; + if (map->aux == aux) { + WARN_ON(aux->cgroup_storage[stype] != _map); + map->aux = NULL; + aux->cgroup_storage[stype] = NULL; } spin_unlock_bh(&map->lock); } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a1acdce77070..6ef71429d997 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8268,7 +8268,7 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) env->used_maps[env->used_map_cnt++] = map; if (bpf_map_is_cgroup_storage(map) && - bpf_cgroup_storage_assign(env->prog, map)) { + bpf_cgroup_storage_assign(env->prog->aux, map)) { verbose(env, "only one cgroup storage of each type is allowed\n"); fdput(f); return -EBUSY; -- cgit v1.2.3