diff options
31 files changed, 987 insertions, 5 deletions
diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h index e2fe0700b3b4..9a20821b111c 100644 --- a/arch/alpha/include/uapi/asm/socket.h +++ b/arch/alpha/include/uapi/asm/socket.h @@ -89,4 +89,7 @@ #define SO_INCOMING_CPU 49 +#define SO_ATTACH_BPF 50 +#define SO_DETACH_BPF SO_DETACH_FILTER + #endif /* _UAPI_ASM_SOCKET_H */ diff --git a/arch/avr32/include/uapi/asm/socket.h b/arch/avr32/include/uapi/asm/socket.h index 92121b0f5b98..2b65ed6b277c 100644 --- a/arch/avr32/include/uapi/asm/socket.h +++ b/arch/avr32/include/uapi/asm/socket.h @@ -82,4 +82,7 @@ #define SO_INCOMING_CPU 49 +#define SO_ATTACH_BPF 50 +#define SO_DETACH_BPF SO_DETACH_FILTER + #endif /* _UAPI__ASM_AVR32_SOCKET_H */ diff --git a/arch/cris/include/uapi/asm/socket.h b/arch/cris/include/uapi/asm/socket.h index 60f60f5b9b35..e2503d9f1869 100644 --- a/arch/cris/include/uapi/asm/socket.h +++ b/arch/cris/include/uapi/asm/socket.h @@ -84,6 +84,9 @@ #define SO_INCOMING_CPU 49 +#define SO_ATTACH_BPF 50 +#define SO_DETACH_BPF SO_DETACH_FILTER + #endif /* _ASM_SOCKET_H */ diff --git a/arch/frv/include/uapi/asm/socket.h b/arch/frv/include/uapi/asm/socket.h index 2c6890209ea6..4823ad125578 100644 --- a/arch/frv/include/uapi/asm/socket.h +++ b/arch/frv/include/uapi/asm/socket.h @@ -82,5 +82,8 @@ #define SO_INCOMING_CPU 49 +#define SO_ATTACH_BPF 50 +#define SO_DETACH_BPF SO_DETACH_FILTER + #endif /* _ASM_SOCKET_H */ diff --git a/arch/ia64/include/uapi/asm/socket.h b/arch/ia64/include/uapi/asm/socket.h index 09a93fb566f6..59be3d87f86d 100644 --- a/arch/ia64/include/uapi/asm/socket.h +++ b/arch/ia64/include/uapi/asm/socket.h @@ -91,4 +91,7 @@ #define SO_INCOMING_CPU 49 +#define SO_ATTACH_BPF 50 +#define SO_DETACH_BPF SO_DETACH_FILTER + #endif /* _ASM_IA64_SOCKET_H */ diff --git a/arch/m32r/include/uapi/asm/socket.h b/arch/m32r/include/uapi/asm/socket.h index e8589819c274..7bc4cb273856 100644 --- a/arch/m32r/include/uapi/asm/socket.h +++ b/arch/m32r/include/uapi/asm/socket.h @@ -82,4 +82,7 @@ #define SO_INCOMING_CPU 49 +#define SO_ATTACH_BPF 50 +#define SO_DETACH_BPF SO_DETACH_FILTER + #endif /* _ASM_M32R_SOCKET_H */ diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h index 2e9ee8c55a10..dec3c850f36b 100644 --- a/arch/mips/include/uapi/asm/socket.h +++ b/arch/mips/include/uapi/asm/socket.h @@ -100,4 +100,7 @@ #define SO_INCOMING_CPU 49 +#define SO_ATTACH_BPF 50 +#define SO_DETACH_BPF SO_DETACH_FILTER + #endif /* _UAPI_ASM_SOCKET_H */ diff --git a/arch/mn10300/include/uapi/asm/socket.h b/arch/mn10300/include/uapi/asm/socket.h index f3492e8c9f70..cab7d6d50051 100644 --- a/arch/mn10300/include/uapi/asm/socket.h +++ b/arch/mn10300/include/uapi/asm/socket.h @@ -82,4 +82,7 @@ #define SO_INCOMING_CPU 49 +#define SO_ATTACH_BPF 50 +#define SO_DETACH_BPF SO_DETACH_FILTER + #endif /* _ASM_SOCKET_H */ diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h index 7984a1cab3da..a5cd40cd8ee1 100644 --- a/arch/parisc/include/uapi/asm/socket.h +++ b/arch/parisc/include/uapi/asm/socket.h @@ -81,4 +81,7 @@ #define SO_INCOMING_CPU 0x402A +#define SO_ATTACH_BPF 0x402B +#define SO_DETACH_BPF SO_DETACH_FILTER + #endif /* _UAPI_ASM_SOCKET_H */ diff --git a/arch/powerpc/include/uapi/asm/socket.h b/arch/powerpc/include/uapi/asm/socket.h index 3474e4ef166d..c046666038f8 100644 --- a/arch/powerpc/include/uapi/asm/socket.h +++ b/arch/powerpc/include/uapi/asm/socket.h @@ -89,4 +89,7 @@ #define SO_INCOMING_CPU 49 +#define SO_ATTACH_BPF 50 +#define SO_DETACH_BPF SO_DETACH_FILTER + #endif /* _ASM_POWERPC_SOCKET_H */ diff --git a/arch/s390/include/uapi/asm/socket.h b/arch/s390/include/uapi/asm/socket.h index 8457636c33e1..296942d56e6a 100644 --- a/arch/s390/include/uapi/asm/socket.h +++ b/arch/s390/include/uapi/asm/socket.h @@ -88,4 +88,7 @@ #define SO_INCOMING_CPU 49 +#define SO_ATTACH_BPF 50 +#define SO_DETACH_BPF SO_DETACH_FILTER + #endif /* _ASM_SOCKET_H */ diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h index 4a8003a94163..e6a16c40be5f 100644 --- a/arch/sparc/include/uapi/asm/socket.h +++ b/arch/sparc/include/uapi/asm/socket.h @@ -78,6 +78,9 @@ #define SO_INCOMING_CPU 0x0033 +#define SO_ATTACH_BPF 0x0034 +#define SO_DETACH_BPF SO_DETACH_FILTER + /* Security levels - as per NRL IPv6 - don't actually do anything */ #define SO_SECURITY_AUTHENTICATION 0x5001 #define SO_SECURITY_ENCRYPTION_TRANSPORT 0x5002 diff --git a/arch/xtensa/include/uapi/asm/socket.h b/arch/xtensa/include/uapi/asm/socket.h index c46f6a696849..4120af086160 100644 --- a/arch/xtensa/include/uapi/asm/socket.h +++ b/arch/xtensa/include/uapi/asm/socket.h @@ -93,4 +93,7 @@ #define SO_INCOMING_CPU 49 +#define SO_ATTACH_BPF 50 +#define SO_DETACH_BPF SO_DETACH_FILTER + #endif /* _XTENSA_SOCKET_H */ diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 75e94eaa228b..bbfceb756452 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -128,7 +128,11 @@ struct bpf_prog_aux { struct work_struct work; }; +#ifdef CONFIG_BPF_SYSCALL void bpf_prog_put(struct bpf_prog *prog); +#else +static inline void bpf_prog_put(struct bpf_prog *prog) {} +#endif struct bpf_prog *bpf_prog_get(u32 ufd); /* verify correctness of eBPF program */ int bpf_check(struct bpf_prog *fp, union bpf_attr *attr); diff --git a/include/linux/filter.h b/include/linux/filter.h index ca95abd2bed1..caac2087a4d5 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -381,6 +381,7 @@ int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog); void bpf_prog_destroy(struct bpf_prog *fp); int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk); +int sk_attach_bpf(u32 ufd, struct sock *sk); int sk_detach_filter(struct sock *sk); int bpf_check_classic(const struct sock_filter *filter, unsigned int flen); diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h index f541ccefd4ac..5c15c2a5c123 100644 --- a/include/uapi/asm-generic/socket.h +++ b/include/uapi/asm-generic/socket.h @@ -84,4 +84,7 @@ #define SO_INCOMING_CPU 49 +#define SO_ATTACH_BPF 50 +#define SO_DETACH_BPF SO_DETACH_FILTER + #endif /* __ASM_GENERIC_SOCKET_H */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4a3d0f84f178..45da7ec7d274 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -117,6 +117,7 @@ enum bpf_map_type { enum bpf_prog_type { BPF_PROG_TYPE_UNSPEC, + BPF_PROG_TYPE_SOCKET_FILTER, }; /* flags for BPF_MAP_UPDATE_ELEM command */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b6a1f7c14a67..a28e09c7825d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1172,6 +1172,70 @@ static int check_ld_imm(struct verifier_env *env, struct bpf_insn *insn) return 0; } +/* verify safety of LD_ABS|LD_IND instructions: + * - they can only appear in the programs where ctx == skb + * - since they are wrappers of function calls, they scratch R1-R5 registers, + * preserve R6-R9, and store return value into R0 + * + * Implicit input: + * ctx == skb == R6 == CTX + * + * Explicit input: + * SRC == any register + * IMM == 32-bit immediate + * + * Output: + * R0 - 8/16/32-bit skb data converted to cpu endianness + */ +static int check_ld_abs(struct verifier_env *env, struct bpf_insn *insn) +{ + struct reg_state *regs = env->cur_state.regs; + u8 mode = BPF_MODE(insn->code); + struct reg_state *reg; + int i, err; + + if (env->prog->aux->prog_type != BPF_PROG_TYPE_SOCKET_FILTER) { + verbose("BPF_LD_ABS|IND instructions are only allowed in socket filters\n"); + return -EINVAL; + } + + if (insn->dst_reg != BPF_REG_0 || insn->off != 0 || + (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) { + verbose("BPF_LD_ABS uses reserved fields\n"); + return -EINVAL; + } + + /* check whether implicit source operand (register R6) is readable */ + err = check_reg_arg(regs, BPF_REG_6, SRC_OP); + if (err) + return err; + + if (regs[BPF_REG_6].type != PTR_TO_CTX) { + verbose("at the time of BPF_LD_ABS|IND R6 != pointer to skb\n"); + return -EINVAL; + } + + if (mode == BPF_IND) { + /* check explicit source operand */ + err = check_reg_arg(regs, insn->src_reg, SRC_OP); + if (err) + return err; + } + + /* reset caller saved regs to unreadable */ + for (i = 0; i < CALLER_SAVED_REGS; i++) { + reg = regs + caller_saved[i]; + reg->type = NOT_INIT; + reg->imm = 0; + } + + /* mark destination R0 register as readable, since it contains + * the value fetched from the packet + */ + regs[BPF_REG_0].type = UNKNOWN_VALUE; + return 0; +} + /* non-recursive DFS pseudo code * 1 procedure DFS-iterative(G,v): * 2 label v as discovered @@ -1677,8 +1741,10 @@ process_bpf_exit: u8 mode = BPF_MODE(insn->code); if (mode == BPF_ABS || mode == BPF_IND) { - verbose("LD_ABS is not supported yet\n"); - return -EINVAL; + err = check_ld_abs(env, insn); + if (err) + return err; + } else if (mode == BPF_IMM) { err = check_ld_imm(env, insn); if (err) diff --git a/net/core/filter.c b/net/core/filter.c index 647b12265e18..8cc3c03078b3 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -44,6 +44,7 @@ #include <linux/ratelimit.h> #include <linux/seccomp.h> #include <linux/if_vlan.h> +#include <linux/bpf.h> /** * sk_filter - run a packet through a socket filter @@ -813,8 +814,12 @@ static void bpf_release_orig_filter(struct bpf_prog *fp) static void __bpf_prog_release(struct bpf_prog *prog) { - bpf_release_orig_filter(prog); - bpf_prog_free(prog); + if (prog->aux->prog_type == BPF_PROG_TYPE_SOCKET_FILTER) { + bpf_prog_put(prog); + } else { + bpf_release_orig_filter(prog); + bpf_prog_free(prog); + } } static void __sk_filter_release(struct sk_filter *fp) @@ -1088,6 +1093,94 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) } EXPORT_SYMBOL_GPL(sk_attach_filter); +#ifdef CONFIG_BPF_SYSCALL +int sk_attach_bpf(u32 ufd, struct sock *sk) +{ + struct sk_filter *fp, *old_fp; + struct bpf_prog *prog; + + if (sock_flag(sk, SOCK_FILTER_LOCKED)) + return -EPERM; + + prog = bpf_prog_get(ufd); + if (!prog) + return -EINVAL; + + if (prog->aux->prog_type != BPF_PROG_TYPE_SOCKET_FILTER) { + /* valid fd, but invalid program type */ + bpf_prog_put(prog); + return -EINVAL; + } + + fp = kmalloc(sizeof(*fp), GFP_KERNEL); + if (!fp) { + bpf_prog_put(prog); + return -ENOMEM; + } + fp->prog = prog; + + atomic_set(&fp->refcnt, 0); + + if (!sk_filter_charge(sk, fp)) { + __sk_filter_release(fp); + return -ENOMEM; + } + + old_fp = rcu_dereference_protected(sk->sk_filter, + sock_owned_by_user(sk)); + rcu_assign_pointer(sk->sk_filter, fp); + + if (old_fp) + sk_filter_uncharge(sk, old_fp); + + return 0; +} + +/* allow socket filters to call + * bpf_map_lookup_elem(), bpf_map_update_elem(), bpf_map_delete_elem() + */ +static const struct bpf_func_proto *sock_filter_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_map_lookup_elem: + return &bpf_map_lookup_elem_proto; + case BPF_FUNC_map_update_elem: + return &bpf_map_update_elem_proto; + case BPF_FUNC_map_delete_elem: + return &bpf_map_delete_elem_proto; + default: + return NULL; + } +} + +static bool sock_filter_is_valid_access(int off, int size, enum bpf_access_type type) +{ + /* skb fields cannot be accessed yet */ + return false; +} + +static struct bpf_verifier_ops sock_filter_ops = { + .get_func_proto = sock_filter_func_proto, + .is_valid_access = sock_filter_is_valid_access, +}; + +static struct bpf_prog_type_list tl = { + .ops = &sock_filter_ops, + .type = BPF_PROG_TYPE_SOCKET_FILTER, +}; + +static int __init register_sock_filter_ops(void) +{ + bpf_register_prog_type(&tl); + return 0; +} +late_initcall(register_sock_filter_ops); +#else +int sk_attach_bpf(u32 ufd, struct sock *sk) +{ + return -EOPNOTSUPP; +} +#endif int sk_detach_filter(struct sock *sk) { int ret = -ENOENT; diff --git a/net/core/sock.c b/net/core/sock.c index 0725cf0cb685..9a56b2000c3f 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -888,6 +888,19 @@ set_rcvbuf: } break; + case SO_ATTACH_BPF: + ret = -EINVAL; + if (optlen == sizeof(u32)) { + u32 ufd; + + ret = -EFAULT; + if (copy_from_user(&ufd, optval, sizeof(ufd))) + break; + + ret = sk_attach_bpf(ufd, sk); + } + break; + case SO_DETACH_FILTER: ret = sk_detach_filter(sk); break; diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 0718d9ce4619..b5b3600dcdf5 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -3,11 +3,31 @@ obj- := dummy.o # List of programs to build hostprogs-y := test_verifier test_maps +hostprogs-y += sock_example +hostprogs-y += sockex1 +hostprogs-y += sockex2 test_verifier-objs := test_verifier.o libbpf.o test_maps-objs := test_maps.o libbpf.o +sock_example-objs := sock_example.o libbpf.o +sockex1-objs := bpf_load.o libbpf.o sockex1_user.o +sockex2-objs := bpf_load.o libbpf.o sockex2_user.o # Tell kbuild to always build the programs always := $(hostprogs-y) +always += sockex1_kern.o +always += sockex2_kern.o HOSTCFLAGS += -I$(objtree)/usr/include + +HOSTCFLAGS_bpf_load.o += -I$(objtree)/usr/include -Wno-unused-variable +HOSTLOADLIBES_sockex1 += -lelf +HOSTLOADLIBES_sockex2 += -lelf + +# point this to your LLVM backend with bpf support +LLC=$(srctree)/tools/bpf/llvm/bld/Debug+Asserts/bin/llc + +%.o: %.c + clang $(NOSTDINC_FLAGS) $(LINUXINCLUDE) $(EXTRA_CFLAGS) \ + -D__KERNEL__ -Wno-unused-value -Wno-pointer-sign \ + -O2 -emit-llvm -c $< -o -| $(LLC) -march=bpf -filetype=obj -o $@ diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h new file mode 100644 index 000000000000..ca0333146006 --- /dev/null +++ b/samples/bpf/bpf_helpers.h @@ -0,0 +1,40 @@ +#ifndef __BPF_HELPERS_H +#define __BPF_HELPERS_H + +/* helper macro to place programs, maps, license in + * different sections in elf_bpf file. Section names + * are interpreted by elf_bpf loader + */ +#define SEC(NAME) __attribute__((section(NAME), used)) + +/* helper functions called from eBPF programs written in C */ +static void *(*bpf_map_lookup_elem)(void *map, void *key) = + (void *) BPF_FUNC_map_lookup_elem; +static int (*bpf_map_update_elem)(void *map, void *key, void *value, + unsigned long long flags) = + (void *) BPF_FUNC_map_update_elem; +static int (*bpf_map_delete_elem)(void *map, void *key) = + (void *) BPF_FUNC_map_delete_elem; + +/* llvm builtin functions that eBPF C program may use to + * emit BPF_LD_ABS and BPF_LD_IND instructions + */ +struct sk_buff; +unsigned long long load_byte(void *skb, + unsigned long long off) asm("llvm.bpf.load.byte"); +unsigned long long load_half(void *skb, + unsigned long long off) asm("llvm.bpf.load.half"); +unsigned long long load_word(void *skb, + unsigned long long off) asm("llvm.bpf.load.word"); + +/* a helper structure used by eBPF C program + * to describe map attributes to elf_bpf loader + */ +struct bpf_map_def { + unsigned int type; + unsigned int key_size; + unsigned int value_size; + unsigned int max_entries; +}; + +#endif diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c new file mode 100644 index 000000000000..1831d236382b --- /dev/null +++ b/samples/bpf/bpf_load.c @@ -0,0 +1,203 @@ +#include <stdio.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <libelf.h> +#include <gelf.h> +#include <errno.h> +#include <unistd.h> +#include <string.h> +#include <stdbool.h> +#include <linux/bpf.h> +#include <linux/filter.h> +#include "libbpf.h" +#include "bpf_helpers.h" +#include "bpf_load.h" + +static char license[128]; +static bool processed_sec[128]; +int map_fd[MAX_MAPS]; +int prog_fd[MAX_PROGS]; +int prog_cnt; + +static int load_and_attach(const char *event, struct bpf_insn *prog, int size) +{ + int fd; + bool is_socket = strncmp(event, "socket", 6) == 0; + + if (!is_socket) + /* tracing events tbd */ + return -1; + + fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, + prog, size, license); + + if (fd < 0) { + printf("bpf_prog_load() err=%d\n%s", errno, bpf_log_buf); + return -1; + } + + prog_fd[prog_cnt++] = fd; + + return 0; +} + +static int load_maps(struct bpf_map_def *maps, int len) +{ + int i; + + for (i = 0; i < len / sizeof(struct bpf_map_def); i++) { + + map_fd[i] = bpf_create_map(maps[i].type, + maps[i].key_size, + maps[i].value_size, + maps[i].max_entries); + if (map_fd[i] < 0) + return 1; + } + return 0; +} + +static int get_sec(Elf *elf, int i, GElf_Ehdr *ehdr, char **shname, + GElf_Shdr *shdr, Elf_Data **data) +{ + Elf_Scn *scn; + + scn = elf_getscn(elf, i); + if (!scn) + return 1; + + if (gelf_getshdr(scn, shdr) != shdr) + return 2; + + *shname = elf_strptr(elf, ehdr->e_shstrndx, shdr->sh_name); + if (!*shname || !shdr->sh_size) + return 3; + + *data = elf_getdata(scn, 0); + if (!*data || elf_getdata(scn, *data) != NULL) + return 4; + + return 0; +} + +static int parse_relo_and_apply(Elf_Data *data, Elf_Data *symbols, + GElf_Shdr *shdr, struct bpf_insn *insn) +{ + int i, nrels; + + nrels = shdr->sh_size / shdr->sh_entsize; + + for (i = 0; i < nrels; i++) { + GElf_Sym sym; + GElf_Rel rel; + unsigned int insn_idx; + + gelf_getrel(data, i, &rel); + + insn_idx = rel.r_offset / sizeof(struct bpf_insn); + + gelf_getsym(symbols, GELF_R_SYM(rel.r_info), &sym); + + if (insn[insn_idx].code != (BPF_LD | BPF_IMM | BPF_DW)) { + printf("invalid relo for insn[%d].code 0x%x\n", + insn_idx, insn[insn_idx].code); + return 1; + } + insn[insn_idx].src_reg = BPF_PSEUDO_MAP_FD; + insn[insn_idx].imm = map_fd[sym.st_value / sizeof(struct bpf_map_def)]; + } + + return 0; +} + +int load_bpf_file(char *path) +{ + int fd, i; + Elf *elf; + GElf_Ehdr ehdr; + GElf_Shdr shdr, shdr_prog; + Elf_Data *data, *data_prog, *symbols = NULL; + char *shname, *shname_prog; + + if (elf_version(EV_CURRENT) == EV_NONE) + return 1; + + fd = open(path, O_RDONLY, 0); + if (fd < 0) + return 1; + + elf = elf_begin(fd, ELF_C_READ, NULL); + + if (!elf) + return 1; + + if (gelf_getehdr(elf, &ehdr) != &ehdr) + return 1; + + /* scan over all elf sections to get license and map info */ + for (i = 1; i < ehdr.e_shnum; i++) { + + if (get_sec(elf, i, &ehdr, &shname, &shdr, &data)) + continue; + + if (0) /* helpful for llvm debugging */ + printf("section %d:%s data %p size %zd link %d flags %d\n", + i, shname, data->d_buf, data->d_size, + shdr.sh_link, (int) shdr.sh_flags); + + if (strcmp(shname, "license") == 0) { + processed_sec[i] = true; + memcpy(license, data->d_buf, data->d_size); + } else if (strcmp(shname, "maps") == 0) { + processed_sec[i] = true; + if (load_maps(data->d_buf, data->d_size)) + return 1; + } else if (shdr.sh_type == SHT_SYMTAB) { + symbols = data; + } + } + + /* load programs that need map fixup (relocations) */ + for (i = 1; i < ehdr.e_shnum; i++) { + + if (get_sec(elf, i, &ehdr, &shname, &shdr, &data)) + continue; + if (shdr.sh_type == SHT_REL) { + struct bpf_insn *insns; + + if (get_sec(elf, shdr.sh_info, &ehdr, &shname_prog, + &shdr_prog, &data_prog)) + continue; + + insns = (struct bpf_insn *) data_prog->d_buf; + + processed_sec[shdr.sh_info] = true; + processed_sec[i] = true; + + if (parse_relo_and_apply(data, symbols, &shdr, insns)) + continue; + + if (memcmp(shname_prog, "events/", 7) == 0 || + memcmp(shname_prog, "socket", 6) == 0) + load_and_attach(shname_prog, insns, data_prog->d_size); + } + } + + /* load programs that don't use maps */ + for (i = 1; i < ehdr.e_shnum; i++) { + + if (processed_sec[i]) + continue; + + if (get_sec(elf, i, &ehdr, &shname, &shdr, &data)) + continue; + + if (memcmp(shname, "events/", 7) == 0 || + memcmp(shname, "socket", 6) == 0) + load_and_attach(shname, data->d_buf, data->d_size); + } + + close(fd); + return 0; +} diff --git a/samples/bpf/bpf_load.h b/samples/bpf/bpf_load.h new file mode 100644 index 000000000000..27789a34f5e6 --- /dev/null +++ b/samples/bpf/bpf_load.h @@ -0,0 +1,24 @@ +#ifndef __BPF_LOAD_H +#define __BPF_LOAD_H + +#define MAX_MAPS 32 +#define MAX_PROGS 32 + +extern int map_fd[MAX_MAPS]; +extern int prog_fd[MAX_PROGS]; + +/* parses elf file compiled by llvm .c->.o + * . parses 'maps' section and creates maps via BPF syscall + * . parses 'license' section and passes it to syscall + * . parses elf relocations for BPF maps and adjusts BPF_LD_IMM64 insns by + * storing map_fd into insn->imm and marking such insns as BPF_PSEUDO_MAP_FD + * . loads eBPF programs via BPF syscall + * + * One ELF file can contain multiple BPF programs which will be loaded + * and their FDs stored stored in prog_fd array + * + * returns zero on success + */ +int load_bpf_file(char *path); + +#endif diff --git a/samples/bpf/libbpf.c b/samples/bpf/libbpf.c index 17bb520eb57f..46d50b7ddf79 100644 --- a/samples/bpf/libbpf.c +++ b/samples/bpf/libbpf.c @@ -7,6 +7,10 @@ #include <linux/netlink.h> #include <linux/bpf.h> #include <errno.h> +#include <net/ethernet.h> +#include <net/if.h> +#include <linux/if_packet.h> +#include <arpa/inet.h> #include "libbpf.h" static __u64 ptr_to_u64(void *ptr) @@ -93,3 +97,27 @@ int bpf_prog_load(enum bpf_prog_type prog_type, return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr)); } + +int open_raw_sock(const char *name) +{ + struct sockaddr_ll sll; + int sock; + + sock = socket(PF_PACKET, SOCK_RAW | SOCK_NONBLOCK | SOCK_CLOEXEC, htons(ETH_P_ALL)); + if (sock < 0) { + printf("cannot create raw socket\n"); + return -1; + } + + memset(&sll, 0, sizeof(sll)); + sll.sll_family = AF_PACKET; + sll.sll_ifindex = if_nametoindex(name); + sll.sll_protocol = htons(ETH_P_ALL); + if (bind(sock, (struct sockaddr *)&sll, sizeof(sll)) < 0) { + printf("bind to %s: %s\n", name, strerror(errno)); + close(sock); + return -1; + } + + return sock; +} diff --git a/samples/bpf/libbpf.h b/samples/bpf/libbpf.h index f8678e5f48bf..58c5fe1bdba1 100644 --- a/samples/bpf/libbpf.h +++ b/samples/bpf/libbpf.h @@ -15,7 +15,7 @@ int bpf_prog_load(enum bpf_prog_type prog_type, const struct bpf_insn *insns, int insn_len, const char *license); -#define LOG_BUF_SIZE 8192 +#define LOG_BUF_SIZE 65536 extern char bpf_log_buf[LOG_BUF_SIZE]; /* ALU ops on registers, bpf_add|sub|...: dst_reg += src_reg */ @@ -99,6 +99,16 @@ extern char bpf_log_buf[LOG_BUF_SIZE]; BPF_LD_IMM64_RAW(DST, BPF_PSEUDO_MAP_FD, MAP_FD) +/* Direct packet access, R0 = *(uint *) (skb->data + imm32) */ + +#define BPF_LD_ABS(SIZE, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_LD | BPF_SIZE(SIZE) | BPF_ABS, \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = 0, \ + .imm = IMM }) + /* Memory load, dst_reg = *(uint *) (src_reg + off16) */ #define BPF_LDX_MEM(SIZE, DST, SRC, OFF) \ @@ -169,4 +179,7 @@ extern char bpf_log_buf[LOG_BUF_SIZE]; .off = 0, \ .imm = 0 }) +/* create RAW socket and bind to interface 'name' */ +int open_raw_sock(const char *name); + #endif diff --git a/samples/bpf/sock_example.c b/samples/bpf/sock_example.c new file mode 100644 index 000000000000..c8ad0404416f --- /dev/null +++ b/samples/bpf/sock_example.c @@ -0,0 +1,101 @@ +/* eBPF example program: + * - creates arraymap in kernel with key 4 bytes and value 8 bytes + * + * - loads eBPF program: + * r0 = skb->data[ETH_HLEN + offsetof(struct iphdr, protocol)]; + * *(u32*)(fp - 4) = r0; + * // assuming packet is IPv4, lookup ip->proto in a map + * value = bpf_map_lookup_elem(map_fd, fp - 4); + * if (value) + * (*(u64*)value) += 1; + * + * - attaches this program to eth0 raw socket + * + * - every second user space reads map[tcp], map[udp], map[icmp] to see + * how many packets of given protocol were seen on eth0 + */ +#include <stdio.h> +#include <unistd.h> +#include <assert.h> +#include <linux/bpf.h> +#include <string.h> +#include <stdlib.h> +#include <errno.h> +#include <sys/socket.h> +#include <arpa/inet.h> +#include <linux/if_ether.h> +#include <linux/ip.h> +#include <stddef.h> +#include "libbpf.h" + +static int test_sock(void) +{ + int sock = -1, map_fd, prog_fd, i, key; + long long value = 0, tcp_cnt, udp_cnt, icmp_cnt; + + map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(key), sizeof(value), + 256); + if (map_fd < 0) { + printf("failed to create map '%s'\n", strerror(errno)); + goto cleanup; + } + + struct bpf_insn prog[] = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_LD_ABS(BPF_B, ETH_HLEN + offsetof(struct iphdr, protocol) /* R0 = ip->proto */), + BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */ + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */ + BPF_LD_MAP_FD(BPF_REG_1, map_fd), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), + BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */ + BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */ + BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */ + BPF_EXIT_INSN(), + }; + + prog_fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, prog, sizeof(prog), + "GPL"); + if (prog_fd < 0) { + printf("failed to load prog '%s'\n", strerror(errno)); + goto cleanup; + } + + sock = open_raw_sock("lo"); + + if (setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd, + sizeof(prog_fd)) < 0) { + printf("setsockopt %s\n", strerror(errno)); + goto cleanup; + } + + for (i = 0; i < 10; i++) { + key = IPPROTO_TCP; + assert(bpf_lookup_elem(map_fd, &key, &tcp_cnt) == 0); + + key = IPPROTO_UDP; + assert(bpf_lookup_elem(map_fd, &key, &udp_cnt) == 0); + + key = IPPROTO_ICMP; + assert(bpf_lookup_elem(map_fd, &key, &icmp_cnt) == 0); + + printf("TCP %lld UDP %lld ICMP %lld packets\n", + tcp_cnt, udp_cnt, icmp_cnt); + sleep(1); + } + +cleanup: + /* maps, programs, raw sockets will auto cleanup on process exit */ + return 0; +} + +int main(void) +{ + FILE *f; + + f = popen("ping -c5 localhost", "r"); + (void)f; + + return test_sock(); +} diff --git a/samples/bpf/sockex1_kern.c b/samples/bpf/sockex1_kern.c new file mode 100644 index 000000000000..066892662915 --- /dev/null +++ b/samples/bpf/sockex1_kern.c @@ -0,0 +1,25 @@ +#include <uapi/linux/bpf.h> +#include <uapi/linux/if_ether.h> +#include <uapi/linux/ip.h> +#include "bpf_helpers.h" + +struct bpf_map_def SEC("maps") my_map = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(long), + .max_entries = 256, +}; + +SEC("socket1") +int bpf_prog1(struct sk_buff *skb) +{ + int index = load_byte(skb, ETH_HLEN + offsetof(struct iphdr, protocol)); + long *value; + + value = bpf_map_lookup_elem(&my_map, &index); + if (value) + __sync_fetch_and_add(value, 1); + + return 0; +} +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/sockex1_user.c b/samples/bpf/sockex1_user.c new file mode 100644 index 000000000000..34a443ff3831 --- /dev/null +++ b/samples/bpf/sockex1_user.c @@ -0,0 +1,49 @@ +#include <stdio.h> +#include <assert.h> +#include <linux/bpf.h> +#include "libbpf.h" +#include "bpf_load.h" +#include <unistd.h> +#include <arpa/inet.h> + +int main(int ac, char **argv) +{ + char filename[256]; + FILE *f; + int i, sock; + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + + if (load_bpf_file(filename)) { + printf("%s", bpf_log_buf); + return 1; + } + + sock = open_raw_sock("lo"); + + assert(setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, prog_fd, + sizeof(prog_fd[0])) == 0); + + f = popen("ping -c5 localhost", "r"); + (void) f; + + for (i = 0; i < 5; i++) { + long long tcp_cnt, udp_cnt, icmp_cnt; + int key; + + key = IPPROTO_TCP; + assert(bpf_lookup_elem(map_fd[0], &key, &tcp_cnt) == 0); + + key = IPPROTO_UDP; + assert(bpf_lookup_elem(map_fd[0], &key, &udp_cnt) == 0); + + key = IPPROTO_ICMP; + assert(bpf_lookup_elem(map_fd[0], &key, &icmp_cnt) == 0); + + printf("TCP %lld UDP %lld ICMP %lld packets\n", + tcp_cnt, udp_cnt, icmp_cnt); + sleep(1); + } + + return 0; +} diff --git a/samples/bpf/sockex2_kern.c b/samples/bpf/sockex2_kern.c new file mode 100644 index 000000000000..6f0135f0f217 --- /dev/null +++ b/samples/bpf/sockex2_kern.c @@ -0,0 +1,215 @@ +#include <uapi/linux/bpf.h> +#include "bpf_helpers.h" +#include <uapi/linux/in.h> +#include <uapi/linux/if.h> +#include <uapi/linux/if_ether.h> +#include <uapi/linux/ip.h> +#include <uapi/linux/ipv6.h> +#include <uapi/linux/if_tunnel.h> +#define IP_MF 0x2000 +#define IP_OFFSET 0x1FFF + +struct vlan_hdr { + __be16 h_vlan_TCI; + __be16 h_vlan_encapsulated_proto; +}; + +struct flow_keys { + __be32 src; + __be32 dst; + union { + __be32 ports; + __be16 port16[2]; + }; + __u16 thoff; + __u8 ip_proto; +}; + +static inline int proto_ports_offset(__u64 proto) +{ + switch (proto) { + case IPPROTO_TCP: + case IPPROTO_UDP: + case IPPROTO_DCCP: + case IPPROTO_ESP: + case IPPROTO_SCTP: + case IPPROTO_UDPLITE: + return 0; + case IPPROTO_AH: + return 4; + default: + return 0; + } +} + +static inline int ip_is_fragment(struct sk_buff *ctx, __u64 nhoff) +{ + return load_half(ctx, nhoff + offsetof(struct iphdr, frag_off)) + & (IP_MF | IP_OFFSET); +} + +static inline __u32 ipv6_addr_hash(struct sk_buff *ctx, __u64 off) +{ + __u64 w0 = load_word(ctx, off); + __u64 w1 = load_word(ctx, off + 4); + __u64 w2 = load_word(ctx, off + 8); + __u64 w3 = load_word(ctx, off + 12); + + return (__u32)(w0 ^ w1 ^ w2 ^ w3); +} + +static inline __u64 parse_ip(struct sk_buff *skb, __u64 nhoff, __u64 *ip_proto, + struct flow_keys *flow) +{ + __u64 verlen; + + if (unlikely(ip_is_fragment(skb, nhoff))) + *ip_proto = 0; + else + *ip_proto = load_byte(skb, nhoff + offsetof(struct iphdr, protocol)); + + if (*ip_proto != IPPROTO_GRE) { + flow->src = load_word(skb, nhoff + offsetof(struct iphdr, saddr)); + flow->dst = load_word(skb, nhoff + offsetof(struct iphdr, daddr)); + } + + verlen = load_byte(skb, nhoff + 0/*offsetof(struct iphdr, ihl)*/); + if (likely(verlen == 0x45)) + nhoff += 20; + else + nhoff += (verlen & 0xF) << 2; + + return nhoff; +} + +static inline __u64 parse_ipv6(struct sk_buff *skb, __u64 nhoff, __u64 *ip_proto, + struct flow_keys *flow) +{ + *ip_proto = load_byte(skb, + nhoff + offsetof(struct ipv6hdr, nexthdr)); + flow->src = ipv6_addr_hash(skb, + nhoff + offsetof(struct ipv6hdr, saddr)); + flow->dst = ipv6_addr_hash(skb, + nhoff + offsetof(struct ipv6hdr, daddr)); + nhoff += sizeof(struct ipv6hdr); + + return nhoff; +} + +static inline bool flow_dissector(struct sk_buff *skb, struct flow_keys *flow) +{ + __u64 nhoff = ETH_HLEN; + __u64 ip_proto; + __u64 proto = load_half(skb, 12); + int poff; + + if (proto == ETH_P_8021AD) { + proto = load_half(skb, nhoff + offsetof(struct vlan_hdr, + h_vlan_encapsulated_proto)); + nhoff += sizeof(struct vlan_hdr); + } + + if (proto == ETH_P_8021Q) { + proto = load_half(skb, nhoff + offsetof(struct vlan_hdr, + h_vlan_encapsulated_proto)); + nhoff += sizeof(struct vlan_hdr); + } + + if (likely(proto == ETH_P_IP)) + nhoff = parse_ip(skb, nhoff, &ip_proto, flow); + else if (proto == ETH_P_IPV6) + nhoff = parse_ipv6(skb, nhoff, &ip_proto, flow); + else + return false; + + switch (ip_proto) { + case IPPROTO_GRE: { + struct gre_hdr { + __be16 flags; + __be16 proto; + }; + + __u64 gre_flags = load_half(skb, + nhoff + offsetof(struct gre_hdr, flags)); + __u64 gre_proto = load_half(skb, + nhoff + offsetof(struct gre_hdr, proto)); + + if (gre_flags & (GRE_VERSION|GRE_ROUTING)) + break; + + proto = gre_proto; + nhoff += 4; + if (gre_flags & GRE_CSUM) + nhoff += 4; + if (gre_flags & GRE_KEY) + nhoff += 4; + if (gre_flags & GRE_SEQ) + nhoff += 4; + + if (proto == ETH_P_8021Q) { + proto = load_half(skb, + nhoff + offsetof(struct vlan_hdr, + h_vlan_encapsulated_proto)); + nhoff += sizeof(struct vlan_hdr); + } + + if (proto == ETH_P_IP) + nhoff = parse_ip(skb, nhoff, &ip_proto, flow); + else if (proto == ETH_P_IPV6) + nhoff = parse_ipv6(skb, nhoff, &ip_proto, flow); + else + return false; + break; + } + case IPPROTO_IPIP: + nhoff = parse_ip(skb, nhoff, &ip_proto, flow); + break; + case IPPROTO_IPV6: + nhoff = parse_ipv6(skb, nhoff, &ip_proto, flow); + break; + default: + break; + } + + flow->ip_proto = ip_proto; + poff = proto_ports_offset(ip_proto); + if (poff >= 0) { + nhoff += poff; + flow->ports = load_word(skb, nhoff); + } + + flow->thoff = (__u16) nhoff; + + return true; +} + +struct bpf_map_def SEC("maps") hash_map = { + .type = BPF_MAP_TYPE_HASH, + .key_size = sizeof(__be32), + .value_size = sizeof(long), + .max_entries = 1024, +}; + +SEC("socket2") +int bpf_prog2(struct sk_buff *skb) +{ + struct flow_keys flow; + long *value; + u32 key; + + if (!flow_dissector(skb, &flow)) + return 0; + + key = flow.dst; + value = bpf_map_lookup_elem(&hash_map, &key); + if (value) { + __sync_fetch_and_add(value, 1); + } else { + long val = 1; + + bpf_map_update_elem(&hash_map, &key, &val, BPF_ANY); + } + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/sockex2_user.c b/samples/bpf/sockex2_user.c new file mode 100644 index 000000000000..d2d5f5a790d3 --- /dev/null +++ b/samples/bpf/sockex2_user.c @@ -0,0 +1,44 @@ +#include <stdio.h> +#include <assert.h> +#include <linux/bpf.h> +#include "libbpf.h" +#include "bpf_load.h" +#include <unistd.h> +#include <arpa/inet.h> + +int main(int ac, char **argv) +{ + char filename[256]; + FILE *f; + int i, sock; + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + + if (load_bpf_file(filename)) { + printf("%s", bpf_log_buf); + return 1; + } + + sock = open_raw_sock("lo"); + + assert(setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, prog_fd, + sizeof(prog_fd[0])) == 0); + + f = popen("ping -c5 localhost", "r"); + (void) f; + + for (i = 0; i < 5; i++) { + int key = 0, next_key; + long long value; + + while (bpf_get_next_key(map_fd[0], &key, &next_key) == 0) { + bpf_lookup_elem(map_fd[0], &next_key, &value); + printf("ip %s count %lld\n", + inet_ntoa((struct in_addr){htonl(next_key)}), + value); + key = next_key; + } + sleep(1); + } + return 0; +} |