From 60a3b2253c413cf601783b070507d7dd6620c954 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 2 Sep 2014 22:53:44 +0200 Subject: net: bpf: make eBPF interpreter images read-only With eBPF getting more extended and exposure to user space is on it's way, hardening the memory range the interpreter uses to steer its command flow seems appropriate. This patch moves the to be interpreted bytecode to read-only pages. In case we execute a corrupted BPF interpreter image for some reason e.g. caused by an attacker which got past a verifier stage, it would not only provide arbitrary read/write memory access but arbitrary function calls as well. After setting up the BPF interpreter image, its contents do not change until destruction time, thus we can setup the image on immutable made pages in order to mitigate modifications to that code. The idea is derived from commit 314beb9bcabf ("x86: bpf_jit_comp: secure bpf jit against spraying attacks"). This is possible because bpf_prog is not part of sk_filter anymore. After setup bpf_prog cannot be altered during its life-time. This prevents any modifications to the entire bpf_prog structure (incl. function/JIT image pointer). Every eBPF program (including classic BPF that are migrated) have to call bpf_prog_select_runtime() to select either interpreter or a JIT image as a last setup step, and they all are being freed via bpf_prog_free(), including non-JIT. Therefore, we can easily integrate this into the eBPF life-time, plus since we directly allocate a bpf_prog, we have no performance penalty. Tested with seccomp and test_bpf testsuite in JIT/non-JIT mode and manual inspection of kernel_page_tables. Brad Spengler proposed the same idea via Twitter during development of this patch. Joint work with Hannes Frederic Sowa. Suggested-by: Brad Spengler Signed-off-by: Daniel Borkmann Signed-off-by: Hannes Frederic Sowa Cc: Alexei Starovoitov Cc: Kees Cook Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/filter.h | 49 ++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 44 insertions(+), 5 deletions(-) (limited to 'include/linux/filter.h') diff --git a/include/linux/filter.h b/include/linux/filter.h index a5227ab8ccb1..c78994593355 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -9,6 +9,11 @@ #include #include #include +#include + +struct sk_buff; +struct sock; +struct seccomp_data; /* Internally used and optimized filter representation with extended * instruction set based on top of classic BPF. @@ -320,20 +325,23 @@ struct sock_fprog_kern { struct sock_filter *filter; }; -struct sk_buff; -struct sock; -struct seccomp_data; +struct bpf_work_struct { + struct bpf_prog *prog; + struct work_struct work; +}; struct bpf_prog { + u32 pages; /* Number of allocated pages */ u32 jited:1, /* Is our filter JIT'ed? */ len:31; /* Number of filter blocks */ struct sock_fprog_kern *orig_prog; /* Original BPF program */ + struct bpf_work_struct *work; /* Deferred free work struct */ unsigned int (*bpf_func)(const struct sk_buff *skb, const struct bpf_insn *filter); + /* Instructions for interpreter */ union { struct sock_filter insns[0]; struct bpf_insn insnsi[0]; - struct work_struct work; }; }; @@ -353,6 +361,26 @@ static inline unsigned int bpf_prog_size(unsigned int proglen) #define bpf_classic_proglen(fprog) (fprog->len * sizeof(fprog->filter[0])) +#ifdef CONFIG_DEBUG_SET_MODULE_RONX +static inline void bpf_prog_lock_ro(struct bpf_prog *fp) +{ + set_memory_ro((unsigned long)fp, fp->pages); +} + +static inline void bpf_prog_unlock_ro(struct bpf_prog *fp) +{ + set_memory_rw((unsigned long)fp, fp->pages); +} +#else +static inline void bpf_prog_lock_ro(struct bpf_prog *fp) +{ +} + +static inline void bpf_prog_unlock_ro(struct bpf_prog *fp) +{ +} +#endif /* CONFIG_DEBUG_SET_MODULE_RONX */ + int sk_filter(struct sock *sk, struct sk_buff *skb); void bpf_prog_select_runtime(struct bpf_prog *fp); @@ -361,6 +389,17 @@ void bpf_prog_free(struct bpf_prog *fp); int bpf_convert_filter(struct sock_filter *prog, int len, struct bpf_insn *new_prog, int *new_len); +struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags); +struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, + gfp_t gfp_extra_flags); +void __bpf_prog_free(struct bpf_prog *fp); + +static inline void bpf_prog_unlock_free(struct bpf_prog *fp) +{ + bpf_prog_unlock_ro(fp); + __bpf_prog_free(fp); +} + int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog); void bpf_prog_destroy(struct bpf_prog *fp); @@ -450,7 +489,7 @@ static inline void bpf_jit_compile(struct bpf_prog *fp) static inline void bpf_jit_free(struct bpf_prog *fp) { - kfree(fp); + bpf_prog_unlock_free(fp); } #endif /* CONFIG_BPF_JIT */ -- cgit v1.2.3 From 02ab695bb37ee9ad515df0d0790d5977505dd04a Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 4 Sep 2014 22:17:17 -0700 Subject: net: filter: add "load 64-bit immediate" eBPF instruction add BPF_LD_IMM64 instruction to load 64-bit immediate value into a register. All previous instructions were 8-byte. This is first 16-byte instruction. Two consecutive 'struct bpf_insn' blocks are interpreted as single instruction: insn[0].code = BPF_LD | BPF_DW | BPF_IMM insn[0].dst_reg = destination register insn[0].imm = lower 32-bit insn[1].code = 0 insn[1].imm = upper 32-bit All unused fields must be zero. Classic BPF has similar instruction: BPF_LD | BPF_W | BPF_IMM which loads 32-bit immediate value into a register. x64 JITs it as single 'movabsq %rax, imm64' arm64 may JIT as sequence of four 'movk x0, #imm16, lsl #shift' insn Note that old eBPF programs are binary compatible with new interpreter. It helps eBPF programs load 64-bit constant into a register with one instruction instead of using two registers and 4 instructions: BPF_MOV32_IMM(R1, imm32) BPF_ALU64_IMM(BPF_LSH, R1, 32) BPF_MOV32_IMM(R2, imm32) BPF_ALU64_REG(BPF_OR, R1, R2) User space generated programs will use this instruction to load constants only. To tell kernel that user space needs a pointer the _pseudo_ variant of this instruction may be added later, which will use extra bits of encoding to indicate what type of pointer user space is asking kernel to provide. For example 'off' or 'src_reg' fields can be used for such purpose. src_reg = 1 could mean that user space is asking kernel to validate and load in-kernel map pointer. src_reg = 2 could mean that user space needs readonly data section pointer src_reg = 3 could mean that user space needs a pointer to per-cpu local data All such future pseudo instructions will not be carrying the actual pointer as part of the instruction, but rather will be treated as a request to kernel to provide one. The kernel will verify the request_for_a_pointer, then will drop _pseudo_ marking and will store actual internal pointer inside the instruction, so the end result is the interpreter and JITs never see pseudo BPF_LD_IMM64 insns and only operate on generic BPF_LD_IMM64 that loads 64-bit immediate into a register. User space never operates on direct pointers and verifier can easily recognize request_for_pointer vs other instructions. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- Documentation/networking/filter.txt | 8 +++++++- arch/x86/net/bpf_jit_comp.c | 17 +++++++++++++++++ include/linux/filter.h | 18 ++++++++++++++++++ kernel/bpf/core.c | 5 +++++ lib/test_bpf.c | 21 +++++++++++++++++++++ 5 files changed, 68 insertions(+), 1 deletion(-) (limited to 'include/linux/filter.h') diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt index c48a9704bda8..81916ab5d96f 100644 --- a/Documentation/networking/filter.txt +++ b/Documentation/networking/filter.txt @@ -951,7 +951,7 @@ Size modifier is one of ... Mode modifier is one of: - BPF_IMM 0x00 /* classic BPF only, reserved in eBPF */ + BPF_IMM 0x00 /* used for 32-bit mov in classic BPF and 64-bit in eBPF */ BPF_ABS 0x20 BPF_IND 0x40 BPF_MEM 0x60 @@ -995,6 +995,12 @@ BPF_XADD | BPF_DW | BPF_STX: lock xadd *(u64 *)(dst_reg + off16) += src_reg Where size is one of: BPF_B or BPF_H or BPF_W or BPF_DW. Note that 1 and 2 byte atomic increments are not supported. +eBPF has one 16-byte instruction: BPF_LD | BPF_DW | BPF_IMM which consists +of two consecutive 'struct bpf_insn' 8-byte blocks and interpreted as single +instruction that loads 64-bit immediate value into a dst_reg. +Classic BPF has similar instruction: BPF_LD | BPF_W | BPF_IMM which loads +32-bit immediate value into a register. + Testing ------- diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 39ccfbb4a723..06f8c17f5484 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -393,6 +393,23 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, EMIT1_off32(add_1reg(0xB8, dst_reg), imm32); break; + case BPF_LD | BPF_IMM | BPF_DW: + if (insn[1].code != 0 || insn[1].src_reg != 0 || + insn[1].dst_reg != 0 || insn[1].off != 0) { + /* verifier must catch invalid insns */ + pr_err("invalid BPF_LD_IMM64 insn\n"); + return -EINVAL; + } + + /* movabsq %rax, imm64 */ + EMIT2(add_1mod(0x48, dst_reg), add_1reg(0xB8, dst_reg)); + EMIT(insn[0].imm, 4); + EMIT(insn[1].imm, 4); + + insn++; + i++; + break; + /* dst %= src, dst /= src, dst %= imm32, dst /= imm32 */ case BPF_ALU | BPF_MOD | BPF_X: case BPF_ALU | BPF_DIV | BPF_X: diff --git a/include/linux/filter.h b/include/linux/filter.h index c78994593355..bf323da77950 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -166,6 +166,24 @@ enum { .off = 0, \ .imm = IMM }) +/* BPF_LD_IMM64 macro encodes single 'load 64-bit immediate' insn */ +#define BPF_LD_IMM64(DST, IMM) \ + BPF_LD_IMM64_RAW(DST, 0, IMM) + +#define BPF_LD_IMM64_RAW(DST, SRC, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_LD | BPF_DW | BPF_IMM, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = 0, \ + .imm = (__u32) (IMM) }), \ + ((struct bpf_insn) { \ + .code = 0, /* zero is reserved opcode */ \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = 0, \ + .imm = ((__u64) (IMM)) >> 32 }) + /* Short form of mov based on type, BPF_X: dst_reg = src_reg, BPF_K: dst_reg = imm32 */ #define BPF_MOV64_RAW(TYPE, DST, SRC, IMM) \ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index b54bb2c2e494..2c2bfaacce66 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -242,6 +242,7 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn) [BPF_LD | BPF_IND | BPF_W] = &&LD_IND_W, [BPF_LD | BPF_IND | BPF_H] = &&LD_IND_H, [BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B, + [BPF_LD | BPF_IMM | BPF_DW] = &&LD_IMM_DW, }; void *ptr; int off; @@ -301,6 +302,10 @@ select_insn: ALU64_MOV_K: DST = IMM; CONT; + LD_IMM_DW: + DST = (u64) (u32) insn[0].imm | ((u64) (u32) insn[1].imm) << 32; + insn++; + CONT; ALU64_ARSH_X: (*(s64 *) &DST) >>= SRC; CONT; diff --git a/lib/test_bpf.c b/lib/test_bpf.c index 9a67456ba29a..413890815d3e 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -1735,6 +1735,27 @@ static struct bpf_test tests[] = { { }, { { 1, 0 } }, }, + { + "load 64-bit immediate", + .u.insns_int = { + BPF_LD_IMM64(R1, 0x567800001234L), + BPF_MOV64_REG(R2, R1), + BPF_MOV64_REG(R3, R2), + BPF_ALU64_IMM(BPF_RSH, R2, 32), + BPF_ALU64_IMM(BPF_LSH, R3, 32), + BPF_ALU64_IMM(BPF_RSH, R3, 32), + BPF_ALU64_IMM(BPF_MOV, R0, 0), + BPF_JMP_IMM(BPF_JEQ, R2, 0x5678, 1), + BPF_EXIT_INSN(), + BPF_JMP_IMM(BPF_JEQ, R3, 0x1234, 1), + BPF_EXIT_INSN(), + BPF_ALU64_IMM(BPF_MOV, R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } } + }, }; static struct net_device dev; -- cgit v1.2.3 From daedfb22451dd02b35c0549566cbb7cc06bdd53b Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 4 Sep 2014 22:17:18 -0700 Subject: net: filter: split filter.h and expose eBPF to user space allow user space to generate eBPF programs uapi/linux/bpf.h: eBPF instruction set definition linux/filter.h: the rest This patch only moves macro definitions, but practically it freezes existing eBPF instruction set, though new instructions can still be added in the future. These eBPF definitions cannot go into uapi/linux/filter.h, since the names may conflict with existing applications. Full eBPF ISA description is in Documentation/networking/filter.txt Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/filter.h | 56 +--------------------------------------- include/uapi/linux/Kbuild | 1 + include/uapi/linux/bpf.h | 65 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 67 insertions(+), 55 deletions(-) create mode 100644 include/uapi/linux/bpf.h (limited to 'include/linux/filter.h') diff --git a/include/linux/filter.h b/include/linux/filter.h index bf323da77950..8f82ef3f1cdd 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -10,58 +10,12 @@ #include #include #include +#include struct sk_buff; struct sock; struct seccomp_data; -/* Internally used and optimized filter representation with extended - * instruction set based on top of classic BPF. - */ - -/* instruction classes */ -#define BPF_ALU64 0x07 /* alu mode in double word width */ - -/* ld/ldx fields */ -#define BPF_DW 0x18 /* double word */ -#define BPF_XADD 0xc0 /* exclusive add */ - -/* alu/jmp fields */ -#define BPF_MOV 0xb0 /* mov reg to reg */ -#define BPF_ARSH 0xc0 /* sign extending arithmetic shift right */ - -/* change endianness of a register */ -#define BPF_END 0xd0 /* flags for endianness conversion: */ -#define BPF_TO_LE 0x00 /* convert to little-endian */ -#define BPF_TO_BE 0x08 /* convert to big-endian */ -#define BPF_FROM_LE BPF_TO_LE -#define BPF_FROM_BE BPF_TO_BE - -#define BPF_JNE 0x50 /* jump != */ -#define BPF_JSGT 0x60 /* SGT is signed '>', GT in x86 */ -#define BPF_JSGE 0x70 /* SGE is signed '>=', GE in x86 */ -#define BPF_CALL 0x80 /* function call */ -#define BPF_EXIT 0x90 /* function return */ - -/* Register numbers */ -enum { - BPF_REG_0 = 0, - BPF_REG_1, - BPF_REG_2, - BPF_REG_3, - BPF_REG_4, - BPF_REG_5, - BPF_REG_6, - BPF_REG_7, - BPF_REG_8, - BPF_REG_9, - BPF_REG_10, - __MAX_BPF_REG, -}; - -/* BPF has 10 general purpose 64-bit registers and stack frame. */ -#define MAX_BPF_REG __MAX_BPF_REG - /* ArgX, context and stack frame pointer register positions. Note, * Arg1, Arg2, Arg3, etc are used as argument mappings of function * calls in BPF_CALL instruction. @@ -322,14 +276,6 @@ enum { #define SK_RUN_FILTER(filter, ctx) \ (*filter->prog->bpf_func)(ctx, filter->prog->insnsi) -struct bpf_insn { - __u8 code; /* opcode */ - __u8 dst_reg:4; /* dest register */ - __u8 src_reg:4; /* source register */ - __s16 off; /* signed offset */ - __s32 imm; /* signed immediate constant */ -}; - #ifdef CONFIG_COMPAT /* A struct sock_filter is architecture independent. */ struct compat_sock_fprog { diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index 24e9033f8b3f..fb3f7b675229 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -67,6 +67,7 @@ header-y += bfs_fs.h header-y += binfmts.h header-y += blkpg.h header-y += blktrace_api.h +header-y += bpf.h header-y += bpqether.h header-y += bsg.h header-y += btrfs.h diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h new file mode 100644 index 000000000000..479ed0b6be16 --- /dev/null +++ b/include/uapi/linux/bpf.h @@ -0,0 +1,65 @@ +/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#ifndef _UAPI__LINUX_BPF_H__ +#define _UAPI__LINUX_BPF_H__ + +#include + +/* Extended instruction set based on top of classic BPF */ + +/* instruction classes */ +#define BPF_ALU64 0x07 /* alu mode in double word width */ + +/* ld/ldx fields */ +#define BPF_DW 0x18 /* double word */ +#define BPF_XADD 0xc0 /* exclusive add */ + +/* alu/jmp fields */ +#define BPF_MOV 0xb0 /* mov reg to reg */ +#define BPF_ARSH 0xc0 /* sign extending arithmetic shift right */ + +/* change endianness of a register */ +#define BPF_END 0xd0 /* flags for endianness conversion: */ +#define BPF_TO_LE 0x00 /* convert to little-endian */ +#define BPF_TO_BE 0x08 /* convert to big-endian */ +#define BPF_FROM_LE BPF_TO_LE +#define BPF_FROM_BE BPF_TO_BE + +#define BPF_JNE 0x50 /* jump != */ +#define BPF_JSGT 0x60 /* SGT is signed '>', GT in x86 */ +#define BPF_JSGE 0x70 /* SGE is signed '>=', GE in x86 */ +#define BPF_CALL 0x80 /* function call */ +#define BPF_EXIT 0x90 /* function return */ + +/* Register numbers */ +enum { + BPF_REG_0 = 0, + BPF_REG_1, + BPF_REG_2, + BPF_REG_3, + BPF_REG_4, + BPF_REG_5, + BPF_REG_6, + BPF_REG_7, + BPF_REG_8, + BPF_REG_9, + BPF_REG_10, + __MAX_BPF_REG, +}; + +/* BPF has 10 general purpose 64-bit registers and stack frame. */ +#define MAX_BPF_REG __MAX_BPF_REG + +struct bpf_insn { + __u8 code; /* opcode */ + __u8 dst_reg:4; /* dest register */ + __u8 src_reg:4; /* source register */ + __s16 off; /* signed offset */ + __s32 imm; /* signed immediate constant */ +}; + +#endif /* _UAPI__LINUX_BPF_H__ */ -- cgit v1.2.3 From 738cbe72adc5c8f2016c4c68aa5162631d4f27e1 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 8 Sep 2014 08:04:47 +0200 Subject: net: bpf: consolidate JIT binary allocator Introduced in commit 314beb9bcabf ("x86: bpf_jit_comp: secure bpf jit against spraying attacks") and later on replicated in aa2d2c73c21f ("s390/bpf,jit: address randomize and write protect jit code") for s390 architecture, write protection for BPF JIT images got added and a random start address of the JIT code, so that it's not on a page boundary anymore. Since both use a very similar allocator for the BPF binary header, we can consolidate this code into the BPF core as it's mostly JIT independant anyway. This will also allow for future archs that support DEBUG_SET_MODULE_RONX to just reuse instead of reimplementing it. JIT tested on x86_64 and s390x with BPF test suite. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Cc: Eric Dumazet Cc: Heiko Carstens Cc: Martin Schwidefsky Signed-off-by: David S. Miller --- arch/s390/net/bpf_jit_comp.c | 45 ++++++++------------------------------- arch/x86/net/bpf_jit_comp.c | 50 ++++++++++---------------------------------- include/linux/filter.h | 13 ++++++++++++ kernel/bpf/core.c | 39 ++++++++++++++++++++++++++++++++++ 4 files changed, 72 insertions(+), 75 deletions(-) (limited to 'include/linux/filter.h') diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index f2833c5b218a..b734f975c22e 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -5,11 +5,9 @@ * * Author(s): Martin Schwidefsky */ -#include #include #include #include -#include #include #include #include @@ -148,6 +146,12 @@ struct bpf_jit { ret; \ }) +static void bpf_jit_fill_hole(void *area, unsigned int size) +{ + /* Fill whole space with illegal instructions */ + memset(area, 0, size); +} + static void bpf_jit_prologue(struct bpf_jit *jit) { /* Save registers and create stack frame if necessary */ @@ -780,38 +784,6 @@ out: return -1; } -/* - * Note: for security reasons, bpf code will follow a randomly - * sized amount of illegal instructions. - */ -struct bpf_binary_header { - unsigned int pages; - u8 image[]; -}; - -static struct bpf_binary_header *bpf_alloc_binary(unsigned int bpfsize, - u8 **image_ptr) -{ - struct bpf_binary_header *header; - unsigned int sz, hole; - - /* Most BPF filters are really small, but if some of them fill a page, - * allow at least 128 extra bytes for illegal instructions. - */ - sz = round_up(bpfsize + sizeof(*header) + 128, PAGE_SIZE); - header = module_alloc(sz); - if (!header) - return NULL; - memset(header, 0, sz); - header->pages = sz / PAGE_SIZE; - hole = min(sz - (bpfsize + sizeof(*header)), PAGE_SIZE - sizeof(*header)); - /* Insert random number of illegal instructions before BPF code - * and make sure the first instruction starts at an even address. - */ - *image_ptr = &header->image[(prandom_u32() % hole) & -2]; - return header; -} - void bpf_jit_compile(struct bpf_prog *fp) { struct bpf_binary_header *header = NULL; @@ -850,7 +822,8 @@ void bpf_jit_compile(struct bpf_prog *fp) size = prg_len + lit_len; if (size >= BPF_SIZE_MAX) goto out; - header = bpf_alloc_binary(size, &jit.start); + header = bpf_jit_binary_alloc(size, &jit.start, + 2, bpf_jit_fill_hole); if (!header) goto out; jit.prg = jit.mid = jit.start + prg_len; @@ -884,7 +857,7 @@ void bpf_jit_free(struct bpf_prog *fp) goto free_filter; set_memory_rw(addr, header->pages); - module_free(NULL, header); + bpf_jit_binary_free(header); free_filter: bpf_prog_unlock_free(fp); diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 06f8c17f5484..9de0b5476b0c 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -8,12 +8,10 @@ * as published by the Free Software Foundation; version 2 * of the License. */ -#include -#include #include #include #include -#include +#include int bpf_jit_enable __read_mostly; @@ -109,39 +107,6 @@ static inline void bpf_flush_icache(void *start, void *end) #define CHOOSE_LOAD_FUNC(K, func) \ ((int)K < 0 ? ((int)K >= SKF_LL_OFF ? func##_negative_offset : func) : func##_positive_offset) -struct bpf_binary_header { - unsigned int pages; - /* Note : for security reasons, bpf code will follow a randomly - * sized amount of int3 instructions - */ - u8 image[]; -}; - -static struct bpf_binary_header *bpf_alloc_binary(unsigned int proglen, - u8 **image_ptr) -{ - unsigned int sz, hole; - struct bpf_binary_header *header; - - /* Most of BPF filters are really small, - * but if some of them fill a page, allow at least - * 128 extra bytes to insert a random section of int3 - */ - sz = round_up(proglen + sizeof(*header) + 128, PAGE_SIZE); - header = module_alloc(sz); - if (!header) - return NULL; - - memset(header, 0xcc, sz); /* fill whole space with int3 instructions */ - - header->pages = sz / PAGE_SIZE; - hole = min(sz - (proglen + sizeof(*header)), PAGE_SIZE - sizeof(*header)); - - /* insert a random number of int3 instructions before BPF code */ - *image_ptr = &header->image[prandom_u32() % hole]; - return header; -} - /* pick a register outside of BPF range for JIT internal work */ #define AUX_REG (MAX_BPF_REG + 1) @@ -206,6 +171,12 @@ static inline u8 add_2reg(u8 byte, u32 dst_reg, u32 src_reg) return byte + reg2hex[dst_reg] + (reg2hex[src_reg] << 3); } +static void jit_fill_hole(void *area, unsigned int size) +{ + /* fill whole space with int3 instructions */ + memset(area, 0xcc, size); +} + struct jit_context { unsigned int cleanup_addr; /* epilogue code offset */ bool seen_ld_abs; @@ -959,7 +930,7 @@ void bpf_int_jit_compile(struct bpf_prog *prog) if (proglen <= 0) { image = NULL; if (header) - module_free(NULL, header); + bpf_jit_binary_free(header); goto out; } if (image) { @@ -969,7 +940,8 @@ void bpf_int_jit_compile(struct bpf_prog *prog) break; } if (proglen == oldproglen) { - header = bpf_alloc_binary(proglen, &image); + header = bpf_jit_binary_alloc(proglen, &image, + 1, jit_fill_hole); if (!header) goto out; } @@ -998,7 +970,7 @@ void bpf_jit_free(struct bpf_prog *fp) goto free_filter; set_memory_rw(addr, header->pages); - module_free(NULL, header); + bpf_jit_binary_free(header); free_filter: bpf_prog_unlock_free(fp); diff --git a/include/linux/filter.h b/include/linux/filter.h index 8f82ef3f1cdd..868764fcffb8 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -289,6 +289,11 @@ struct sock_fprog_kern { struct sock_filter *filter; }; +struct bpf_binary_header { + unsigned int pages; + u8 image[]; +}; + struct bpf_work_struct { struct bpf_prog *prog; struct work_struct work; @@ -358,6 +363,14 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, gfp_t gfp_extra_flags); void __bpf_prog_free(struct bpf_prog *fp); +typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size); + +struct bpf_binary_header * +bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, + unsigned int alignment, + bpf_jit_fill_hole_t bpf_fill_ill_insns); +void bpf_jit_binary_free(struct bpf_binary_header *hdr); + static inline void bpf_prog_unlock_free(struct bpf_prog *fp) { bpf_prog_unlock_ro(fp); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 2c2bfaacce66..8ee520f0ec70 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -20,9 +20,12 @@ * Andi Kleen - Fix a few bad bugs and races. * Kris Katterjohn - Added many additional checks in bpf_check_classic() */ + #include #include #include +#include +#include #include /* Registers */ @@ -125,6 +128,42 @@ void __bpf_prog_free(struct bpf_prog *fp) } EXPORT_SYMBOL_GPL(__bpf_prog_free); +struct bpf_binary_header * +bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, + unsigned int alignment, + bpf_jit_fill_hole_t bpf_fill_ill_insns) +{ + struct bpf_binary_header *hdr; + unsigned int size, hole, start; + + /* Most of BPF filters are really small, but if some of them + * fill a page, allow at least 128 extra bytes to insert a + * random section of illegal instructions. + */ + size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE); + hdr = module_alloc(size); + if (hdr == NULL) + return NULL; + + /* Fill space with illegal/arch-dep instructions. */ + bpf_fill_ill_insns(hdr, size); + + hdr->pages = size / PAGE_SIZE; + hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)), + PAGE_SIZE - sizeof(*hdr)); + start = (prandom_u32() % hole) & ~(alignment - 1); + + /* Leave a random number of instructions before BPF code. */ + *image_ptr = &hdr->image[start]; + + return hdr; +} + +void bpf_jit_binary_free(struct bpf_binary_header *hdr) +{ + module_free(NULL, hdr); +} + /* Base function for offset calculation. Needs to go into .text section, * therefore keeping it non-static as well; will also be used by JITs * anyway later on, so do not let the compiler omit it. -- cgit v1.2.3 From 286aad3c4014ca825c447e07e24f8929e6d266d2 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 8 Sep 2014 08:04:49 +0200 Subject: net: bpf: be friendly to kmemcheck Reported by Mikulas Patocka, kmemcheck currently barks out a false positive since we don't have special kmemcheck annotation for bitfields used in bpf_prog structure. We currently have jited:1, len:31 and thus when accessing len while CONFIG_KMEMCHECK enabled, kmemcheck throws a warning that we're reading uninitialized memory. As we don't need the whole bit universe for pages member, we can just split it to u16 and use a bool flag for jited instead of a bitfield. Signed-off-by: Mikulas Patocka Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- arch/arm/net/bpf_jit_32.c | 2 +- arch/mips/net/bpf_jit.c | 2 +- arch/powerpc/net/bpf_jit_comp.c | 2 +- arch/s390/net/bpf_jit_comp.c | 2 +- arch/sparc/net/bpf_jit_comp.c | 2 +- arch/x86/net/bpf_jit_comp.c | 2 +- include/linux/filter.h | 6 +++--- net/core/filter.c | 2 +- 8 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include/linux/filter.h') diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index 2d1a5b93d91c..6b45f649eff0 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -933,7 +933,7 @@ void bpf_jit_compile(struct bpf_prog *fp) set_memory_ro((unsigned long)header, header->pages); fp->bpf_func = (void *)ctx.target; - fp->jited = 1; + fp->jited = true; out: kfree(ctx.offsets); return; diff --git a/arch/mips/net/bpf_jit.c b/arch/mips/net/bpf_jit.c index cfa83cf2447d..0e97ccd29fe3 100644 --- a/arch/mips/net/bpf_jit.c +++ b/arch/mips/net/bpf_jit.c @@ -1417,7 +1417,7 @@ void bpf_jit_compile(struct bpf_prog *fp) bpf_jit_dump(fp->len, alloc_size, 2, ctx.target); fp->bpf_func = (void *)ctx.target; - fp->jited = 1; + fp->jited = true; out: kfree(ctx.offsets); diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c index 40c53ff59124..cbae2dfd053c 100644 --- a/arch/powerpc/net/bpf_jit_comp.c +++ b/arch/powerpc/net/bpf_jit_comp.c @@ -686,7 +686,7 @@ void bpf_jit_compile(struct bpf_prog *fp) ((u64 *)image)[0] = (u64)code_base; ((u64 *)image)[1] = local_paca->kernel_toc; fp->bpf_func = (void *)image; - fp->jited = 1; + fp->jited = true; } out: kfree(addrs); diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index b734f975c22e..555f5c7e83ab 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -842,7 +842,7 @@ void bpf_jit_compile(struct bpf_prog *fp) if (jit.start) { set_memory_ro((unsigned long)header, header->pages); fp->bpf_func = (void *) jit.start; - fp->jited = 1; + fp->jited = true; } out: kfree(addrs); diff --git a/arch/sparc/net/bpf_jit_comp.c b/arch/sparc/net/bpf_jit_comp.c index f7a736b645e8..b2ad9dc5425e 100644 --- a/arch/sparc/net/bpf_jit_comp.c +++ b/arch/sparc/net/bpf_jit_comp.c @@ -801,7 +801,7 @@ cond_branch: f_offset = addrs[i + filter[i].jf]; if (image) { bpf_flush_icache(image, image + proglen); fp->bpf_func = (void *)image; - fp->jited = 1; + fp->jited = true; } out: kfree(addrs); diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 9de0b5476b0c..d56cd1f515bd 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -955,7 +955,7 @@ void bpf_int_jit_compile(struct bpf_prog *prog) bpf_flush_icache(header, image + proglen); set_memory_ro((unsigned long)header, header->pages); prog->bpf_func = (void *)image; - prog->jited = 1; + prog->jited = true; } out: kfree(addrs); diff --git a/include/linux/filter.h b/include/linux/filter.h index 868764fcffb8..4b59edead908 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -300,9 +300,9 @@ struct bpf_work_struct { }; struct bpf_prog { - u32 pages; /* Number of allocated pages */ - u32 jited:1, /* Is our filter JIT'ed? */ - len:31; /* Number of filter blocks */ + u16 pages; /* Number of allocated pages */ + bool jited; /* Is our filter JIT'ed? */ + u32 len; /* Number of filter blocks */ struct sock_fprog_kern *orig_prog; /* Original BPF program */ struct bpf_work_struct *work; /* Deferred free work struct */ unsigned int (*bpf_func)(const struct sk_buff *skb, diff --git a/net/core/filter.c b/net/core/filter.c index fa5b7d0f77ac..dfc716ffa44b 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -972,7 +972,7 @@ static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp) int err; fp->bpf_func = NULL; - fp->jited = 0; + fp->jited = false; err = bpf_check_classic(fp->insns, fp->len); if (err) { -- cgit v1.2.3 From b954d83421d51d822c42e5ab7b65069b25ad3005 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 10 Sep 2014 15:01:02 +0200 Subject: net: bpf: only build bpf_jit_binary_{alloc, free}() when jit selected Since BPF JIT depends on the availability of module_alloc() and module_free() helpers (HAVE_BPF_JIT and MODULES), we better build that code only in case we have BPF_JIT in our config enabled, just like with other JIT code. Fixes builds for arm/marzen_defconfig and sh/rsk7269_defconfig. ==================== kernel/built-in.o: In function `bpf_jit_binary_alloc': /home/cwang/linux/kernel/bpf/core.c:144: undefined reference to `module_alloc' kernel/built-in.o: In function `bpf_jit_binary_free': /home/cwang/linux/kernel/bpf/core.c:164: undefined reference to `module_free' make: *** [vmlinux] Error 1 ==================== Reported-by: Fengguang Wu Fixes: 738cbe72adc5 ("net: bpf: consolidate JIT binary allocator") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/filter.h | 78 +++++++++++++++++++++++++------------------------- kernel/bpf/core.c | 2 ++ 2 files changed, 41 insertions(+), 39 deletions(-) (limited to 'include/linux/filter.h') diff --git a/include/linux/filter.h b/include/linux/filter.h index 4b59edead908..1a0bc6d134d7 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -4,12 +4,18 @@ #ifndef __LINUX_FILTER_H__ #define __LINUX_FILTER_H__ +#include + #include #include #include +#include +#include #include -#include + #include + +#include #include struct sk_buff; @@ -363,14 +369,6 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, gfp_t gfp_extra_flags); void __bpf_prog_free(struct bpf_prog *fp); -typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size); - -struct bpf_binary_header * -bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, - unsigned int alignment, - bpf_jit_fill_hole_t bpf_fill_ill_insns); -void bpf_jit_binary_free(struct bpf_binary_header *hdr); - static inline void bpf_prog_unlock_free(struct bpf_prog *fp) { bpf_prog_unlock_ro(fp); @@ -393,6 +391,38 @@ void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp); u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); void bpf_int_jit_compile(struct bpf_prog *fp); +#ifdef CONFIG_BPF_JIT +typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size); + +struct bpf_binary_header * +bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, + unsigned int alignment, + bpf_jit_fill_hole_t bpf_fill_ill_insns); +void bpf_jit_binary_free(struct bpf_binary_header *hdr); + +void bpf_jit_compile(struct bpf_prog *fp); +void bpf_jit_free(struct bpf_prog *fp); + +static inline void bpf_jit_dump(unsigned int flen, unsigned int proglen, + u32 pass, void *image) +{ + pr_err("flen=%u proglen=%u pass=%u image=%pK\n", + flen, proglen, pass, image); + if (image) + print_hex_dump(KERN_ERR, "JIT code: ", DUMP_PREFIX_OFFSET, + 16, 1, image, proglen, false); +} +#else +static inline void bpf_jit_compile(struct bpf_prog *fp) +{ +} + +static inline void bpf_jit_free(struct bpf_prog *fp) +{ + bpf_prog_unlock_free(fp); +} +#endif /* CONFIG_BPF_JIT */ + #define BPF_ANC BIT(15) static inline u16 bpf_anc_helper(const struct sock_filter *ftest) @@ -440,36 +470,6 @@ static inline void *bpf_load_pointer(const struct sk_buff *skb, int k, return bpf_internal_load_pointer_neg_helper(skb, k, size); } -#ifdef CONFIG_BPF_JIT -#include -#include -#include - -void bpf_jit_compile(struct bpf_prog *fp); -void bpf_jit_free(struct bpf_prog *fp); - -static inline void bpf_jit_dump(unsigned int flen, unsigned int proglen, - u32 pass, void *image) -{ - pr_err("flen=%u proglen=%u pass=%u image=%pK\n", - flen, proglen, pass, image); - if (image) - print_hex_dump(KERN_ERR, "JIT code: ", DUMP_PREFIX_OFFSET, - 16, 1, image, proglen, false); -} -#else -#include - -static inline void bpf_jit_compile(struct bpf_prog *fp) -{ -} - -static inline void bpf_jit_free(struct bpf_prog *fp) -{ - bpf_prog_unlock_free(fp); -} -#endif /* CONFIG_BPF_JIT */ - static inline int bpf_tell_extensions(void) { return SKF_AD_MAX; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 8ee520f0ec70..8b7002488251 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -128,6 +128,7 @@ void __bpf_prog_free(struct bpf_prog *fp) } EXPORT_SYMBOL_GPL(__bpf_prog_free); +#ifdef CONFIG_BPF_JIT struct bpf_binary_header * bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, unsigned int alignment, @@ -163,6 +164,7 @@ void bpf_jit_binary_free(struct bpf_binary_header *hdr) { module_free(NULL, hdr); } +#endif /* CONFIG_BPF_JIT */ /* Base function for offset calculation. Needs to go into .text section, * therefore keeping it non-static as well; will also be used by JITs -- cgit v1.2.3 From 09756af46893c18839062976c3252e93a1beeba7 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 26 Sep 2014 00:17:00 -0700 Subject: bpf: expand BPF syscall with program load/unload eBPF programs are similar to kernel modules. They are loaded by the user process and automatically unloaded when process exits. Each eBPF program is a safe run-to-completion set of instructions. eBPF verifier statically determines that the program terminates and is safe to execute. The following syscall wrapper can be used to load the program: int bpf_prog_load(enum bpf_prog_type prog_type, const struct bpf_insn *insns, int insn_cnt, const char *license) { union bpf_attr attr = { .prog_type = prog_type, .insns = ptr_to_u64(insns), .insn_cnt = insn_cnt, .license = ptr_to_u64(license), }; return bpf(BPF_PROG_LOAD, &attr, sizeof(attr)); } where 'insns' is an array of eBPF instructions and 'license' is a string that must be GPL compatible to call helper functions marked gpl_only Upon succesful load the syscall returns prog_fd. Use close(prog_fd) to unload the program. User space tests and examples follow in the later patches Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 38 +++++++++++ include/linux/filter.h | 8 +-- include/uapi/linux/bpf.h | 26 ++++++++ kernel/bpf/core.c | 29 +++++---- kernel/bpf/syscall.c | 165 +++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 246 insertions(+), 20 deletions(-) (limited to 'include/linux/filter.h') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2887f3f9da59..92979182be81 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -46,4 +46,42 @@ void bpf_register_map_type(struct bpf_map_type_list *tl); void bpf_map_put(struct bpf_map *map); struct bpf_map *bpf_map_get(struct fd f); +/* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs + * to in-kernel helper functions and for adjusting imm32 field in BPF_CALL + * instructions after verifying + */ +struct bpf_func_proto { + u64 (*func)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); + bool gpl_only; +}; + +struct bpf_verifier_ops { + /* return eBPF function prototype for verification */ + const struct bpf_func_proto *(*get_func_proto)(enum bpf_func_id func_id); +}; + +struct bpf_prog_type_list { + struct list_head list_node; + struct bpf_verifier_ops *ops; + enum bpf_prog_type type; +}; + +void bpf_register_prog_type(struct bpf_prog_type_list *tl); + +struct bpf_prog; + +struct bpf_prog_aux { + atomic_t refcnt; + bool is_gpl_compatible; + enum bpf_prog_type prog_type; + struct bpf_verifier_ops *ops; + struct bpf_map **used_maps; + u32 used_map_cnt; + struct bpf_prog *prog; + struct work_struct work; +}; + +void bpf_prog_put(struct bpf_prog *prog); +struct bpf_prog *bpf_prog_get(u32 ufd); + #endif /* _LINUX_BPF_H */ diff --git a/include/linux/filter.h b/include/linux/filter.h index 1a0bc6d134d7..4ffc0958d85e 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -21,6 +21,7 @@ struct sk_buff; struct sock; struct seccomp_data; +struct bpf_prog_aux; /* ArgX, context and stack frame pointer register positions. Note, * Arg1, Arg2, Arg3, etc are used as argument mappings of function @@ -300,17 +301,12 @@ struct bpf_binary_header { u8 image[]; }; -struct bpf_work_struct { - struct bpf_prog *prog; - struct work_struct work; -}; - struct bpf_prog { u16 pages; /* Number of allocated pages */ bool jited; /* Is our filter JIT'ed? */ u32 len; /* Number of filter blocks */ struct sock_fprog_kern *orig_prog; /* Original BPF program */ - struct bpf_work_struct *work; /* Deferred free work struct */ + struct bpf_prog_aux *aux; /* Auxiliary fields */ unsigned int (*bpf_func)(const struct sk_buff *skb, const struct bpf_insn *filter); /* Instructions for interpreter */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 395cabd2ca0a..424f442016e7 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -99,12 +99,23 @@ enum bpf_cmd { * returns zero and stores next key or negative error */ BPF_MAP_GET_NEXT_KEY, + + /* verify and load eBPF program + * prog_fd = bpf(BPF_PROG_LOAD, union bpf_attr *attr, u32 size) + * Using attr->prog_type, attr->insns, attr->license + * returns fd or negative error + */ + BPF_PROG_LOAD, }; enum bpf_map_type { BPF_MAP_TYPE_UNSPEC, }; +enum bpf_prog_type { + BPF_PROG_TYPE_UNSPEC, +}; + union bpf_attr { struct { /* anonymous struct used by BPF_MAP_CREATE command */ __u32 map_type; /* one of enum bpf_map_type */ @@ -121,6 +132,21 @@ union bpf_attr { __aligned_u64 next_key; }; }; + + struct { /* anonymous struct used by BPF_PROG_LOAD command */ + __u32 prog_type; /* one of enum bpf_prog_type */ + __u32 insn_cnt; + __aligned_u64 insns; + __aligned_u64 license; + }; } __attribute__((aligned(8))); +/* integer value in 'imm' field of BPF_CALL instruction selects which helper + * function eBPF program intends to call + */ +enum bpf_func_id { + BPF_FUNC_unspec, + __BPF_FUNC_MAX_ID, +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 8b7002488251..f0c30c59b317 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -27,6 +27,7 @@ #include #include #include +#include /* Registers */ #define BPF_R0 regs[BPF_REG_0] @@ -71,7 +72,7 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) { gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO | gfp_extra_flags; - struct bpf_work_struct *ws; + struct bpf_prog_aux *aux; struct bpf_prog *fp; size = round_up(size, PAGE_SIZE); @@ -79,14 +80,14 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) if (fp == NULL) return NULL; - ws = kmalloc(sizeof(*ws), GFP_KERNEL | gfp_extra_flags); - if (ws == NULL) { + aux = kzalloc(sizeof(*aux), GFP_KERNEL | gfp_extra_flags); + if (aux == NULL) { vfree(fp); return NULL; } fp->pages = size / PAGE_SIZE; - fp->work = ws; + fp->aux = aux; return fp; } @@ -110,10 +111,10 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE); fp->pages = size / PAGE_SIZE; - /* We keep fp->work from fp_old around in the new + /* We keep fp->aux from fp_old around in the new * reallocated structure. */ - fp_old->work = NULL; + fp_old->aux = NULL; __bpf_prog_free(fp_old); } @@ -123,7 +124,7 @@ EXPORT_SYMBOL_GPL(bpf_prog_realloc); void __bpf_prog_free(struct bpf_prog *fp) { - kfree(fp->work); + kfree(fp->aux); vfree(fp); } EXPORT_SYMBOL_GPL(__bpf_prog_free); @@ -638,19 +639,19 @@ EXPORT_SYMBOL_GPL(bpf_prog_select_runtime); static void bpf_prog_free_deferred(struct work_struct *work) { - struct bpf_work_struct *ws; + struct bpf_prog_aux *aux; - ws = container_of(work, struct bpf_work_struct, work); - bpf_jit_free(ws->prog); + aux = container_of(work, struct bpf_prog_aux, work); + bpf_jit_free(aux->prog); } /* Free internal BPF program */ void bpf_prog_free(struct bpf_prog *fp) { - struct bpf_work_struct *ws = fp->work; + struct bpf_prog_aux *aux = fp->aux; - INIT_WORK(&ws->work, bpf_prog_free_deferred); - ws->prog = fp; - schedule_work(&ws->work); + INIT_WORK(&aux->work, bpf_prog_free_deferred); + aux->prog = fp; + schedule_work(&aux->work); } EXPORT_SYMBOL_GPL(bpf_prog_free); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index f94349ecaf61..0afb4eaa1887 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -14,6 +14,8 @@ #include #include #include +#include +#include static LIST_HEAD(bpf_map_types); @@ -334,6 +336,166 @@ err_put: return err; } +static LIST_HEAD(bpf_prog_types); + +static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) +{ + struct bpf_prog_type_list *tl; + + list_for_each_entry(tl, &bpf_prog_types, list_node) { + if (tl->type == type) { + prog->aux->ops = tl->ops; + prog->aux->prog_type = type; + return 0; + } + } + return -EINVAL; +} + +void bpf_register_prog_type(struct bpf_prog_type_list *tl) +{ + list_add(&tl->list_node, &bpf_prog_types); +} + +/* drop refcnt on maps used by eBPF program and free auxilary data */ +static void free_used_maps(struct bpf_prog_aux *aux) +{ + int i; + + for (i = 0; i < aux->used_map_cnt; i++) + bpf_map_put(aux->used_maps[i]); + + kfree(aux->used_maps); +} + +void bpf_prog_put(struct bpf_prog *prog) +{ + if (atomic_dec_and_test(&prog->aux->refcnt)) { + free_used_maps(prog->aux); + bpf_prog_free(prog); + } +} + +static int bpf_prog_release(struct inode *inode, struct file *filp) +{ + struct bpf_prog *prog = filp->private_data; + + bpf_prog_put(prog); + return 0; +} + +static const struct file_operations bpf_prog_fops = { + .release = bpf_prog_release, +}; + +static struct bpf_prog *get_prog(struct fd f) +{ + struct bpf_prog *prog; + + if (!f.file) + return ERR_PTR(-EBADF); + + if (f.file->f_op != &bpf_prog_fops) { + fdput(f); + return ERR_PTR(-EINVAL); + } + + prog = f.file->private_data; + + return prog; +} + +/* called by sockets/tracing/seccomp before attaching program to an event + * pairs with bpf_prog_put() + */ +struct bpf_prog *bpf_prog_get(u32 ufd) +{ + struct fd f = fdget(ufd); + struct bpf_prog *prog; + + prog = get_prog(f); + + if (IS_ERR(prog)) + return prog; + + atomic_inc(&prog->aux->refcnt); + fdput(f); + return prog; +} + +/* last field in 'union bpf_attr' used by this command */ +#define BPF_PROG_LOAD_LAST_FIELD license + +static int bpf_prog_load(union bpf_attr *attr) +{ + enum bpf_prog_type type = attr->prog_type; + struct bpf_prog *prog; + int err; + char license[128]; + bool is_gpl; + + if (CHECK_ATTR(BPF_PROG_LOAD)) + return -EINVAL; + + /* copy eBPF program license from user space */ + if (strncpy_from_user(license, u64_to_ptr(attr->license), + sizeof(license) - 1) < 0) + return -EFAULT; + license[sizeof(license) - 1] = 0; + + /* eBPF programs must be GPL compatible to use GPL-ed functions */ + is_gpl = license_is_gpl_compatible(license); + + if (attr->insn_cnt >= BPF_MAXINSNS) + return -EINVAL; + + /* plain bpf_prog allocation */ + prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER); + if (!prog) + return -ENOMEM; + + prog->len = attr->insn_cnt; + + err = -EFAULT; + if (copy_from_user(prog->insns, u64_to_ptr(attr->insns), + prog->len * sizeof(struct bpf_insn)) != 0) + goto free_prog; + + prog->orig_prog = NULL; + prog->jited = false; + + atomic_set(&prog->aux->refcnt, 1); + prog->aux->is_gpl_compatible = is_gpl; + + /* find program type: socket_filter vs tracing_filter */ + err = find_prog_type(type, prog); + if (err < 0) + goto free_prog; + + /* run eBPF verifier */ + /* err = bpf_check(prog, tb); */ + + if (err < 0) + goto free_used_maps; + + /* eBPF program is ready to be JITed */ + bpf_prog_select_runtime(prog); + + err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC); + + if (err < 0) + /* failed to allocate fd */ + goto free_used_maps; + + return err; + +free_used_maps: + free_used_maps(prog->aux); +free_prog: + bpf_prog_free(prog); + return err; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr = {}; @@ -395,6 +557,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_MAP_GET_NEXT_KEY: err = map_get_next_key(&attr); break; + case BPF_PROG_LOAD: + err = bpf_prog_load(&attr); + break; default: err = -EINVAL; break; -- cgit v1.2.3 From 0246e64d9a5fcd4805198de59b9b5cf1f974eb41 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 26 Sep 2014 00:17:04 -0700 Subject: bpf: handle pseudo BPF_LD_IMM64 insn eBPF programs passed from userspace are using pseudo BPF_LD_IMM64 instructions to refer to process-local map_fd. Scan the program for such instructions and if FDs are valid, convert them to 'struct bpf_map' pointers which will be used by verifier to check access to maps in bpf_map_lookup/update() calls. If program passes verifier, convert pseudo BPF_LD_IMM64 into generic by dropping BPF_PSEUDO_MAP_FD flag. Note that eBPF interpreter is generic and knows nothing about pseudo insns. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/filter.h | 6 ++ kernel/bpf/verifier.c | 147 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 153 insertions(+) (limited to 'include/linux/filter.h') diff --git a/include/linux/filter.h b/include/linux/filter.h index 4ffc0958d85e..ca95abd2bed1 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -145,6 +145,12 @@ struct bpf_prog_aux; .off = 0, \ .imm = ((__u64) (IMM)) >> 32 }) +#define BPF_PSEUDO_MAP_FD 1 + +/* pseudo BPF_LD_IMM64 insn used to refer to process-local map_fd */ +#define BPF_LD_MAP_FD(DST, MAP_FD) \ + BPF_LD_IMM64_RAW(DST, BPF_PSEUDO_MAP_FD, MAP_FD) + /* Short form of mov based on type, BPF_X: dst_reg = src_reg, BPF_K: dst_reg = imm32 */ #define BPF_MOV64_RAW(TYPE, DST, SRC, IMM) \ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 871edc1f2e1f..7227543e474b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -125,10 +125,15 @@ * are set to NOT_INIT to indicate that they are no longer readable. */ +#define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ + /* single container for all structs * one verifier_env per bpf_check() call */ struct verifier_env { + struct bpf_prog *prog; /* eBPF program being verified */ + struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ + u32 used_map_cnt; /* number of used maps */ }; /* verbose verifier prints what it's seeing @@ -300,6 +305,115 @@ static void print_bpf_insn(struct bpf_insn *insn) } } +/* return the map pointer stored inside BPF_LD_IMM64 instruction */ +static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn) +{ + u64 imm64 = ((u64) (u32) insn[0].imm) | ((u64) (u32) insn[1].imm) << 32; + + return (struct bpf_map *) (unsigned long) imm64; +} + +/* look for pseudo eBPF instructions that access map FDs and + * replace them with actual map pointers + */ +static int replace_map_fd_with_map_ptr(struct verifier_env *env) +{ + struct bpf_insn *insn = env->prog->insnsi; + int insn_cnt = env->prog->len; + int i, j; + + for (i = 0; i < insn_cnt; i++, insn++) { + if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) { + struct bpf_map *map; + struct fd f; + + if (i == insn_cnt - 1 || insn[1].code != 0 || + insn[1].dst_reg != 0 || insn[1].src_reg != 0 || + insn[1].off != 0) { + verbose("invalid bpf_ld_imm64 insn\n"); + return -EINVAL; + } + + if (insn->src_reg == 0) + /* valid generic load 64-bit imm */ + goto next_insn; + + if (insn->src_reg != BPF_PSEUDO_MAP_FD) { + verbose("unrecognized bpf_ld_imm64 insn\n"); + return -EINVAL; + } + + f = fdget(insn->imm); + + map = bpf_map_get(f); + if (IS_ERR(map)) { + verbose("fd %d is not pointing to valid bpf_map\n", + insn->imm); + fdput(f); + return PTR_ERR(map); + } + + /* store map pointer inside BPF_LD_IMM64 instruction */ + insn[0].imm = (u32) (unsigned long) map; + insn[1].imm = ((u64) (unsigned long) map) >> 32; + + /* check whether we recorded this map already */ + for (j = 0; j < env->used_map_cnt; j++) + if (env->used_maps[j] == map) { + fdput(f); + goto next_insn; + } + + if (env->used_map_cnt >= MAX_USED_MAPS) { + fdput(f); + return -E2BIG; + } + + /* remember this map */ + env->used_maps[env->used_map_cnt++] = map; + + /* hold the map. If the program is rejected by verifier, + * the map will be released by release_maps() or it + * will be used by the valid program until it's unloaded + * and all maps are released in free_bpf_prog_info() + */ + atomic_inc(&map->refcnt); + + fdput(f); +next_insn: + insn++; + i++; + } + } + + /* now all pseudo BPF_LD_IMM64 instructions load valid + * 'struct bpf_map *' into a register instead of user map_fd. + * These pointers will be used later by verifier to validate map access. + */ + return 0; +} + +/* drop refcnt of maps used by the rejected program */ +static void release_maps(struct verifier_env *env) +{ + int i; + + for (i = 0; i < env->used_map_cnt; i++) + bpf_map_put(env->used_maps[i]); +} + +/* convert pseudo BPF_LD_IMM64 into generic BPF_LD_IMM64 */ +static void convert_pseudo_ld_imm64(struct verifier_env *env) +{ + struct bpf_insn *insn = env->prog->insnsi; + int insn_cnt = env->prog->len; + int i; + + for (i = 0; i < insn_cnt; i++, insn++) + if (insn->code == (BPF_LD | BPF_IMM | BPF_DW)) + insn->src_reg = 0; +} + int bpf_check(struct bpf_prog *prog, union bpf_attr *attr) { char __user *log_ubuf = NULL; @@ -316,6 +430,8 @@ int bpf_check(struct bpf_prog *prog, union bpf_attr *attr) if (!env) return -ENOMEM; + env->prog = prog; + /* grab the mutex to protect few globals used by verifier */ mutex_lock(&bpf_verifier_lock); @@ -342,8 +458,14 @@ int bpf_check(struct bpf_prog *prog, union bpf_attr *attr) log_level = 0; } + ret = replace_map_fd_with_map_ptr(env); + if (ret < 0) + goto skip_full_check; + /* ret = do_check(env); */ +skip_full_check: + if (log_level && log_len >= log_size - 1) { BUG_ON(log_len >= log_size); /* verifier log exceeded user supplied buffer */ @@ -357,11 +479,36 @@ int bpf_check(struct bpf_prog *prog, union bpf_attr *attr) goto free_log_buf; } + if (ret == 0 && env->used_map_cnt) { + /* if program passed verifier, update used_maps in bpf_prog_info */ + prog->aux->used_maps = kmalloc_array(env->used_map_cnt, + sizeof(env->used_maps[0]), + GFP_KERNEL); + + if (!prog->aux->used_maps) { + ret = -ENOMEM; + goto free_log_buf; + } + + memcpy(prog->aux->used_maps, env->used_maps, + sizeof(env->used_maps[0]) * env->used_map_cnt); + prog->aux->used_map_cnt = env->used_map_cnt; + + /* program is valid. Convert pseudo bpf_ld_imm64 into generic + * bpf_ld_imm64 instructions + */ + convert_pseudo_ld_imm64(env); + } free_log_buf: if (log_level) vfree(log_buf); free_env: + if (!prog->aux->used_maps) + /* if we didn't copy map pointers into bpf_prog_info, release + * them now. Otherwise free_bpf_prog_info() will release them. + */ + release_maps(env); kfree(env); mutex_unlock(&bpf_verifier_lock); return ret; -- cgit v1.2.3