diff options
44 files changed, 1828 insertions, 222 deletions
diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index a5edea07b8d5..41e2feb0cf4f 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -25,8 +25,6 @@ #include "bpf_jit_32.h" -int bpf_jit_enable __read_mostly; - /* * eBPF prog stack layout: * diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index cbfe890b0ee5..0775d5ab8ee9 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -31,8 +31,6 @@ #include "bpf_jit.h" -int bpf_jit_enable __read_mostly; - #define TMP_REG_1 (MAX_BPF_JIT_REG + 0) #define TMP_REG_2 (MAX_BPF_JIT_REG + 1) #define TCALL_CNT (MAX_BPF_JIT_REG + 2) diff --git a/arch/mips/net/bpf_jit.c b/arch/mips/net/bpf_jit.c index 44b925005dd3..4d8cb9bb8365 100644 --- a/arch/mips/net/bpf_jit.c +++ b/arch/mips/net/bpf_jit.c @@ -1207,8 +1207,6 @@ jmp_cmp: return 0; } -int bpf_jit_enable __read_mostly; - void bpf_jit_compile(struct bpf_prog *fp) { struct jit_ctx ctx; diff --git a/arch/mips/net/ebpf_jit.c b/arch/mips/net/ebpf_jit.c index 97069a1b6f43..4e347030ed2c 100644 --- a/arch/mips/net/ebpf_jit.c +++ b/arch/mips/net/ebpf_jit.c @@ -177,8 +177,6 @@ static u32 b_imm(unsigned int tgt, struct jit_ctx *ctx) (ctx->idx * 4) - 4; } -int bpf_jit_enable __read_mostly; - enum which_ebpf_reg { src_reg, src_reg_no_fp, diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c index f9941b3b5770..872d1f6dd11e 100644 --- a/arch/powerpc/net/bpf_jit_comp.c +++ b/arch/powerpc/net/bpf_jit_comp.c @@ -18,8 +18,6 @@ #include "bpf_jit32.h" -int bpf_jit_enable __read_mostly; - static inline void bpf_flush_icache(void *start, void *end) { smp_wmb(); diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index 6771c63b2bec..217a78e84865 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -21,8 +21,6 @@ #include "bpf_jit64.h" -int bpf_jit_enable __read_mostly; - static void bpf_jit_fill_ill_insns(void *area, unsigned int size) { memset32(area, BREAKPOINT_INSTRUCTION, size/4); diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 1dfadbd126f3..e50188773ff3 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -28,8 +28,6 @@ #include <asm/set_memory.h> #include "bpf_jit.h" -int bpf_jit_enable __read_mostly; - struct bpf_jit { u32 seen; /* Flags to remember seen eBPF instructions */ u32 seen_reg[16]; /* Array to remember which registers are used */ diff --git a/arch/sparc/net/bpf_jit_comp_32.c b/arch/sparc/net/bpf_jit_comp_32.c index 09e318eb34ee..3bd8ca95e521 100644 --- a/arch/sparc/net/bpf_jit_comp_32.c +++ b/arch/sparc/net/bpf_jit_comp_32.c @@ -11,8 +11,6 @@ #include "bpf_jit_32.h" -int bpf_jit_enable __read_mostly; - static inline bool is_simm13(unsigned int value) { return value + 0x1000 < 0x2000; diff --git a/arch/sparc/net/bpf_jit_comp_64.c b/arch/sparc/net/bpf_jit_comp_64.c index 635fdefd4ae2..50a24d7bd4c5 100644 --- a/arch/sparc/net/bpf_jit_comp_64.c +++ b/arch/sparc/net/bpf_jit_comp_64.c @@ -12,8 +12,6 @@ #include "bpf_jit_64.h" -int bpf_jit_enable __read_mostly; - static inline bool is_simm13(unsigned int value) { return value + 0x1000 < 0x2000; diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 87f214fbe66e..5acee5139e28 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -15,8 +15,6 @@ #include <asm/set_memory.h> #include <linux/bpf.h> -int bpf_jit_enable __read_mostly; - /* * assembly code in arch/x86/net/bpf_jit.S */ @@ -154,6 +152,11 @@ static bool is_ereg(u32 reg) BIT(BPF_REG_AX)); } +static bool is_axreg(u32 reg) +{ + return reg == BPF_REG_0; +} + /* add modifiers if 'reg' maps to x64 registers r8..r15 */ static u8 add_1mod(u8 byte, u32 reg) { @@ -447,16 +450,36 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, else if (is_ereg(dst_reg)) EMIT1(add_1mod(0x40, dst_reg)); + /* b3 holds 'normal' opcode, b2 short form only valid + * in case dst is eax/rax. + */ switch (BPF_OP(insn->code)) { - case BPF_ADD: b3 = 0xC0; break; - case BPF_SUB: b3 = 0xE8; break; - case BPF_AND: b3 = 0xE0; break; - case BPF_OR: b3 = 0xC8; break; - case BPF_XOR: b3 = 0xF0; break; + case BPF_ADD: + b3 = 0xC0; + b2 = 0x05; + break; + case BPF_SUB: + b3 = 0xE8; + b2 = 0x2D; + break; + case BPF_AND: + b3 = 0xE0; + b2 = 0x25; + break; + case BPF_OR: + b3 = 0xC8; + b2 = 0x0D; + break; + case BPF_XOR: + b3 = 0xF0; + b2 = 0x35; + break; } if (is_imm8(imm32)) EMIT3(0x83, add_1reg(b3, dst_reg), imm32); + else if (is_axreg(dst_reg)) + EMIT1_off32(b2, imm32); else EMIT2_off32(0x81, add_1reg(b3, dst_reg), imm32); break; diff --git a/drivers/net/ethernet/netronome/nfp/bpf/cmsg.c b/drivers/net/ethernet/netronome/nfp/bpf/cmsg.c index 71e6586acc36..80d3aa0fc9d3 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/cmsg.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/cmsg.c @@ -157,7 +157,14 @@ nfp_bpf_cmsg_wait_reply(struct nfp_app_bpf *bpf, enum nfp_bpf_cmsg_type type, int tag) { struct sk_buff *skb; - int err; + int i, err; + + for (i = 0; i < 50; i++) { + udelay(4); + skb = nfp_bpf_reply(bpf, tag); + if (skb) + return skb; + } err = wait_event_interruptible_timeout(bpf->cmsg_wq, skb = nfp_bpf_reply(bpf, tag), diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c b/drivers/net/ethernet/netronome/nfp/bpf/offload.c index e2859b2e9c6a..1a357aacc444 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c @@ -127,6 +127,7 @@ static int nfp_bpf_translate(struct nfp_net *nn, struct bpf_prog *prog) struct nfp_prog *nfp_prog = prog->aux->offload->dev_priv; unsigned int stack_size; unsigned int max_instr; + int err; stack_size = nn_readb(nn, NFP_NET_CFG_BPF_STACK_SZ) * 64; if (prog->aux->stack_depth > stack_size) { @@ -143,7 +144,14 @@ static int nfp_bpf_translate(struct nfp_net *nn, struct bpf_prog *prog) if (!nfp_prog->prog) return -ENOMEM; - return nfp_bpf_jit(nfp_prog); + err = nfp_bpf_jit(nfp_prog); + if (err) + return err; + + prog->aux->offload->jited_len = nfp_prog->prog_len * sizeof(u64); + prog->aux->offload->jited_image = nfp_prog->prog; + + return 0; } static int nfp_bpf_destroy(struct nfp_net *nn, struct bpf_prog *prog) @@ -168,6 +176,8 @@ nfp_bpf_map_get_next_key(struct bpf_offloaded_map *offmap, static int nfp_bpf_map_delete_elem(struct bpf_offloaded_map *offmap, void *key) { + if (offmap->map.map_type == BPF_MAP_TYPE_ARRAY) + return -EINVAL; return nfp_bpf_ctrl_del_entry(offmap, key); } diff --git a/drivers/net/netdevsim/bpf.c b/drivers/net/netdevsim/bpf.c index 5134d5c1306c..b3851bbefad3 100644 --- a/drivers/net/netdevsim/bpf.c +++ b/drivers/net/netdevsim/bpf.c @@ -17,6 +17,7 @@ #include <linux/bpf_verifier.h> #include <linux/debugfs.h> #include <linux/kernel.h> +#include <linux/mutex.h> #include <linux/rtnetlink.h> #include <net/pkt_cls.h> @@ -31,6 +32,19 @@ struct nsim_bpf_bound_prog { struct list_head l; }; +#define NSIM_BPF_MAX_KEYS 2 + +struct nsim_bpf_bound_map { + struct netdevsim *ns; + struct bpf_offloaded_map *map; + struct mutex mutex; + struct nsim_map_entry { + void *key; + void *value; + } entry[NSIM_BPF_MAX_KEYS]; + struct list_head l; +}; + static int nsim_debugfs_bpf_string_read(struct seq_file *file, void *data) { const char **str = file->private; @@ -284,6 +298,224 @@ nsim_setup_prog_hw_checks(struct netdevsim *ns, struct netdev_bpf *bpf) return 0; } +static bool +nsim_map_key_match(struct bpf_map *map, struct nsim_map_entry *e, void *key) +{ + return e->key && !memcmp(key, e->key, map->key_size); +} + +static int nsim_map_key_find(struct bpf_offloaded_map *offmap, void *key) +{ + struct nsim_bpf_bound_map *nmap = offmap->dev_priv; + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(nmap->entry); i++) + if (nsim_map_key_match(&offmap->map, &nmap->entry[i], key)) + return i; + + return -ENOENT; +} + +static int +nsim_map_alloc_elem(struct bpf_offloaded_map *offmap, unsigned int idx) +{ + struct nsim_bpf_bound_map *nmap = offmap->dev_priv; + + nmap->entry[idx].key = kmalloc(offmap->map.key_size, GFP_USER); + if (!nmap->entry[idx].key) + return -ENOMEM; + nmap->entry[idx].value = kmalloc(offmap->map.value_size, GFP_USER); + if (!nmap->entry[idx].value) { + kfree(nmap->entry[idx].key); + nmap->entry[idx].key = NULL; + return -ENOMEM; + } + + return 0; +} + +static int +nsim_map_get_next_key(struct bpf_offloaded_map *offmap, + void *key, void *next_key) +{ + struct nsim_bpf_bound_map *nmap = offmap->dev_priv; + int idx = -ENOENT; + + mutex_lock(&nmap->mutex); + + if (key) + idx = nsim_map_key_find(offmap, key); + if (idx == -ENOENT) + idx = 0; + else + idx++; + + for (; idx < ARRAY_SIZE(nmap->entry); idx++) { + if (nmap->entry[idx].key) { + memcpy(next_key, nmap->entry[idx].key, + offmap->map.key_size); + break; + } + } + + mutex_unlock(&nmap->mutex); + + if (idx == ARRAY_SIZE(nmap->entry)) + return -ENOENT; + return 0; +} + +static int +nsim_map_lookup_elem(struct bpf_offloaded_map *offmap, void *key, void *value) +{ + struct nsim_bpf_bound_map *nmap = offmap->dev_priv; + int idx; + + mutex_lock(&nmap->mutex); + + idx = nsim_map_key_find(offmap, key); + if (idx >= 0) + memcpy(value, nmap->entry[idx].value, offmap->map.value_size); + + mutex_unlock(&nmap->mutex); + + return idx < 0 ? idx : 0; +} + +static int +nsim_map_update_elem(struct bpf_offloaded_map *offmap, + void *key, void *value, u64 flags) +{ + struct nsim_bpf_bound_map *nmap = offmap->dev_priv; + int idx, err = 0; + + mutex_lock(&nmap->mutex); + + idx = nsim_map_key_find(offmap, key); + if (idx < 0 && flags == BPF_EXIST) { + err = idx; + goto exit_unlock; + } + if (idx >= 0 && flags == BPF_NOEXIST) { + err = -EEXIST; + goto exit_unlock; + } + + if (idx < 0) { + for (idx = 0; idx < ARRAY_SIZE(nmap->entry); idx++) + if (!nmap->entry[idx].key) + break; + if (idx == ARRAY_SIZE(nmap->entry)) { + err = -E2BIG; + goto exit_unlock; + } + + err = nsim_map_alloc_elem(offmap, idx); + if (err) + goto exit_unlock; + } + + memcpy(nmap->entry[idx].key, key, offmap->map.key_size); + memcpy(nmap->entry[idx].value, value, offmap->map.value_size); +exit_unlock: + mutex_unlock(&nmap->mutex); + + return err; +} + +static int nsim_map_delete_elem(struct bpf_offloaded_map *offmap, void *key) +{ + struct nsim_bpf_bound_map *nmap = offmap->dev_priv; + int idx; + + if (offmap->map.map_type == BPF_MAP_TYPE_ARRAY) + return -EINVAL; + + mutex_lock(&nmap->mutex); + + idx = nsim_map_key_find(offmap, key); + if (idx >= 0) { + kfree(nmap->entry[idx].key); + kfree(nmap->entry[idx].value); + memset(&nmap->entry[idx], 0, sizeof(nmap->entry[idx])); + } + + mutex_unlock(&nmap->mutex); + + return idx < 0 ? idx : 0; +} + +static const struct bpf_map_dev_ops nsim_bpf_map_ops = { + .map_get_next_key = nsim_map_get_next_key, + .map_lookup_elem = nsim_map_lookup_elem, + .map_update_elem = nsim_map_update_elem, + .map_delete_elem = nsim_map_delete_elem, +}; + +static int +nsim_bpf_map_alloc(struct netdevsim *ns, struct bpf_offloaded_map *offmap) +{ + struct nsim_bpf_bound_map *nmap; + unsigned int i; + int err; + + if (WARN_ON(offmap->map.map_type != BPF_MAP_TYPE_ARRAY && + offmap->map.map_type != BPF_MAP_TYPE_HASH)) + return -EINVAL; + if (offmap->map.max_entries > NSIM_BPF_MAX_KEYS) + return -ENOMEM; + if (offmap->map.map_flags) + return -EINVAL; + + nmap = kzalloc(sizeof(*nmap), GFP_USER); + if (!nmap) + return -ENOMEM; + + offmap->dev_priv = nmap; + nmap->ns = ns; + nmap->map = offmap; + mutex_init(&nmap->mutex); + + if (offmap->map.map_type == BPF_MAP_TYPE_ARRAY) { + for (i = 0; i < ARRAY_SIZE(nmap->entry); i++) { + u32 *key; + + err = nsim_map_alloc_elem(offmap, i); + if (err) + goto err_free; + key = nmap->entry[i].key; + *key = i; + } + } + + offmap->dev_ops = &nsim_bpf_map_ops; + list_add_tail(&nmap->l, &ns->bpf_bound_maps); + + return 0; + +err_free: + while (--i) { + kfree(nmap->entry[i].key); + kfree(nmap->entry[i].value); + } + kfree(nmap); + return err; +} + +static void nsim_bpf_map_free(struct bpf_offloaded_map *offmap) +{ + struct nsim_bpf_bound_map *nmap = offmap->dev_priv; + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(nmap->entry); i++) { + kfree(nmap->entry[i].key); + kfree(nmap->entry[i].value); + } + list_del_init(&nmap->l); + mutex_destroy(&nmap->mutex); + kfree(nmap); +} + int nsim_bpf(struct net_device *dev, struct netdev_bpf *bpf) { struct netdevsim *ns = netdev_priv(dev); @@ -328,6 +560,14 @@ int nsim_bpf(struct net_device *dev, struct netdev_bpf *bpf) return err; return nsim_xdp_set_prog(ns, bpf); + case BPF_OFFLOAD_MAP_ALLOC: + if (!ns->bpf_map_accept) + return -EOPNOTSUPP; + + return nsim_bpf_map_alloc(ns, bpf->offmap); + case BPF_OFFLOAD_MAP_FREE: + nsim_bpf_map_free(bpf->offmap); + return 0; default: return -EINVAL; } @@ -336,6 +576,7 @@ int nsim_bpf(struct net_device *dev, struct netdev_bpf *bpf) int nsim_bpf_init(struct netdevsim *ns) { INIT_LIST_HEAD(&ns->bpf_bound_progs); + INIT_LIST_HEAD(&ns->bpf_bound_maps); debugfs_create_u32("bpf_offloaded_id", 0400, ns->ddir, &ns->bpf_offloaded_id); @@ -362,12 +603,17 @@ int nsim_bpf_init(struct netdevsim *ns) debugfs_create_bool("bpf_xdpoffload_accept", 0600, ns->ddir, &ns->bpf_xdpoffload_accept); + ns->bpf_map_accept = true; + debugfs_create_bool("bpf_map_accept", 0600, ns->ddir, + &ns->bpf_map_accept); + return 0; } void nsim_bpf_uninit(struct netdevsim *ns) { WARN_ON(!list_empty(&ns->bpf_bound_progs)); + WARN_ON(!list_empty(&ns->bpf_bound_maps)); WARN_ON(ns->xdp_prog); WARN_ON(ns->bpf_offloaded); } diff --git a/drivers/net/netdevsim/netdevsim.h b/drivers/net/netdevsim/netdevsim.h index 32270de9395a..b80361200302 100644 --- a/drivers/net/netdevsim/netdevsim.h +++ b/drivers/net/netdevsim/netdevsim.h @@ -61,6 +61,9 @@ struct netdevsim { bool bpf_tc_non_bound_accept; bool bpf_xdpdrv_accept; bool bpf_xdpoffload_accept; + + bool bpf_map_accept; + struct list_head bpf_bound_maps; }; extern struct dentry *nsim_ddir; diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5c2c104dc2c5..66df387106de 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -234,6 +234,8 @@ struct bpf_prog_offload { struct list_head offloads; bool dev_state; const struct bpf_prog_offload_ops *dev_ops; + void *jited_image; + u32 jited_len; }; struct bpf_prog_aux { @@ -584,6 +586,8 @@ void bpf_prog_offload_destroy(struct bpf_prog *prog); int bpf_prog_offload_info_fill(struct bpf_prog_info *info, struct bpf_prog *prog); +int bpf_map_offload_info_fill(struct bpf_map_info *info, struct bpf_map *map); + int bpf_map_offload_lookup_elem(struct bpf_map *map, void *key, void *value); int bpf_map_offload_update_elem(struct bpf_map *map, void *key, void *value, u64 flags); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7c2259e8bc54..406c19d6016b 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -17,7 +17,7 @@ #define BPF_ALU64 0x07 /* alu mode in double word width */ /* ld/ldx fields */ -#define BPF_DW 0x18 /* double word */ +#define BPF_DW 0x18 /* double word (64-bit) */ #define BPF_XADD 0xc0 /* exclusive add */ /* alu/jmp fields */ @@ -938,6 +938,9 @@ struct bpf_map_info { __u32 max_entries; __u32 map_flags; char name[BPF_OBJ_NAME_LEN]; + __u32 ifindex; + __u64 netns_dev; + __u64 netns_ino; } __attribute__((aligned(8))); /* User bpf_sock_ops struct to access socket values and specify request ops diff --git a/include/uapi/linux/bpf_common.h b/include/uapi/linux/bpf_common.h index 18be90725ab0..ee97668bdadb 100644 --- a/include/uapi/linux/bpf_common.h +++ b/include/uapi/linux/bpf_common.h @@ -15,9 +15,10 @@ /* ld/ldx fields */ #define BPF_SIZE(code) ((code) & 0x18) -#define BPF_W 0x00 -#define BPF_H 0x08 -#define BPF_B 0x10 +#define BPF_W 0x00 /* 32-bit */ +#define BPF_H 0x08 /* 16-bit */ +#define BPF_B 0x10 /* 8-bit */ +/* eBPF BPF_DW 0x18 64-bit */ #define BPF_MODE(code) ((code) & 0xe0) #define BPF_IMM 0x00 #define BPF_ABS 0x20 diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index ab94d304a634..b1f66480135b 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -49,27 +49,35 @@ static int bpf_array_alloc_percpu(struct bpf_array *array) } /* Called from syscall */ -static struct bpf_map *array_map_alloc(union bpf_attr *attr) +static int array_map_alloc_check(union bpf_attr *attr) { bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; int numa_node = bpf_map_attr_numa_node(attr); - u32 elem_size, index_mask, max_entries; - bool unpriv = !capable(CAP_SYS_ADMIN); - struct bpf_array *array; - u64 array_size, mask64; /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || attr->value_size == 0 || attr->map_flags & ~ARRAY_CREATE_FLAG_MASK || (percpu && numa_node != NUMA_NO_NODE)) - return ERR_PTR(-EINVAL); + return -EINVAL; if (attr->value_size > KMALLOC_MAX_SIZE) /* if value_size is bigger, the user space won't be able to * access the elements. */ - return ERR_PTR(-E2BIG); + return -E2BIG; + + return 0; +} + +static struct bpf_map *array_map_alloc(union bpf_attr *attr) +{ + bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; + int numa_node = bpf_map_attr_numa_node(attr); + u32 elem_size, index_mask, max_entries; + bool unpriv = !capable(CAP_SYS_ADMIN); + struct bpf_array *array; + u64 array_size, mask64; elem_size = round_up(attr->value_size, 8); @@ -112,12 +120,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) array->map.unpriv_array = unpriv; /* copy mandatory map attributes */ - array->map.map_type = attr->map_type; - array->map.key_size = attr->key_size; - array->map.value_size = attr->value_size; - array->map.max_entries = attr->max_entries; - array->map.map_flags = attr->map_flags; - array->map.numa_node = numa_node; + bpf_map_init_from_attr(&array->map, attr); array->elem_size = elem_size; if (!percpu) @@ -327,6 +330,7 @@ static void array_map_free(struct bpf_map *map) } const struct bpf_map_ops array_map_ops = { + .map_alloc_check = array_map_alloc_check, .map_alloc = array_map_alloc, .map_free = array_map_free, .map_get_next_key = array_map_get_next_key, @@ -337,6 +341,7 @@ const struct bpf_map_ops array_map_ops = { }; const struct bpf_map_ops percpu_array_map_ops = { + .map_alloc_check = array_map_alloc_check, .map_alloc = array_map_alloc, .map_free = array_map_free, .map_get_next_key = array_map_get_next_key, @@ -345,12 +350,12 @@ const struct bpf_map_ops percpu_array_map_ops = { .map_delete_elem = array_map_delete_elem, }; -static struct bpf_map *fd_array_map_alloc(union bpf_attr *attr) +static int fd_array_map_alloc_check(union bpf_attr *attr) { /* only file descriptors can be stored in this type of map */ if (attr->value_size != sizeof(u32)) - return ERR_PTR(-EINVAL); - return array_map_alloc(attr); + return -EINVAL; + return array_map_alloc_check(attr); } static void fd_array_map_free(struct bpf_map *map) @@ -474,7 +479,8 @@ void bpf_fd_array_map_clear(struct bpf_map *map) } const struct bpf_map_ops prog_array_map_ops = { - .map_alloc = fd_array_map_alloc, + .map_alloc_check = fd_array_map_alloc_check, + .map_alloc = array_map_alloc, .map_free = fd_array_map_free, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, @@ -561,7 +567,8 @@ static void perf_event_fd_array_release(struct bpf_map *map, } const struct bpf_map_ops perf_event_array_map_ops = { - .map_alloc = fd_array_map_alloc, + .map_alloc_check = fd_array_map_alloc_check, + .map_alloc = array_map_alloc, .map_free = fd_array_map_free, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, @@ -592,7 +599,8 @@ static void cgroup_fd_array_free(struct bpf_map *map) } const struct bpf_map_ops cgroup_array_map_ops = { - .map_alloc = fd_array_map_alloc, + .map_alloc_check = fd_array_map_alloc_check, + .map_alloc = array_map_alloc, .map_free = cgroup_fd_array_free, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, @@ -610,7 +618,7 @@ static struct bpf_map *array_of_map_alloc(union bpf_attr *attr) if (IS_ERR(inner_map_meta)) return inner_map_meta; - map = fd_array_map_alloc(attr); + map = array_map_alloc(attr); if (IS_ERR(map)) { bpf_map_meta_free(inner_map_meta); return map; @@ -673,6 +681,7 @@ static u32 array_of_map_gen_lookup(struct bpf_map *map, } const struct bpf_map_ops array_of_maps_map_ops = { + .map_alloc_check = fd_array_map_alloc_check, .map_alloc = array_of_map_alloc, .map_free = array_of_map_free, .map_get_next_key = array_map_get_next_key, diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 887e28408f98..3aa0658add76 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -300,6 +300,11 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, } #ifdef CONFIG_BPF_JIT +/* All BPF JIT sysctl knobs here. */ +int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_ALWAYS_ON); +int bpf_jit_harden __read_mostly; +int bpf_jit_kallsyms __read_mostly; + static __always_inline void bpf_get_prog_addr_region(const struct bpf_prog *prog, unsigned long *symbol_start, @@ -381,8 +386,6 @@ static DEFINE_SPINLOCK(bpf_lock); static LIST_HEAD(bpf_kallsyms); static struct latch_tree_root bpf_tree __cacheline_aligned; -int bpf_jit_kallsyms __read_mostly; - static void bpf_prog_ksym_node_add(struct bpf_prog_aux *aux) { WARN_ON_ONCE(!list_empty(&aux->ksym_lnode)); @@ -563,8 +566,6 @@ void __weak bpf_jit_free(struct bpf_prog *fp) bpf_prog_unlock_free(fp); } -int bpf_jit_harden __read_mostly; - static int bpf_jit_blind_insn(const struct bpf_insn *from, const struct bpf_insn *aux, struct bpf_insn *to_buff) @@ -1379,9 +1380,13 @@ void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth) } #else -static unsigned int __bpf_prog_ret0(const void *ctx, - const struct bpf_insn *insn) +static unsigned int __bpf_prog_ret0_warn(const void *ctx, + const struct bpf_insn *insn) { + /* If this handler ever gets executed, then BPF_JIT_ALWAYS_ON + * is not working properly, so warn about it! + */ + WARN_ON_ONCE(1); return 0; } #endif @@ -1441,7 +1446,7 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1]; #else - fp->bpf_func = __bpf_prog_ret0; + fp->bpf_func = __bpf_prog_ret0_warn; #endif /* eBPF JITs can rewrite the program in case constant diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 584e02227671..d7ea96218516 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -591,9 +591,100 @@ unlock: raw_spin_unlock(&trie->lock); } -static int trie_get_next_key(struct bpf_map *map, void *key, void *next_key) +static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key) { - return -ENOTSUPP; + struct lpm_trie *trie = container_of(map, struct lpm_trie, map); + struct bpf_lpm_trie_key *key = _key, *next_key = _next_key; + struct lpm_trie_node *node, *next_node = NULL, *parent; + struct lpm_trie_node **node_stack = NULL; + struct lpm_trie_node __rcu **root; + int err = 0, stack_ptr = -1; + unsigned int next_bit; + size_t matchlen; + + /* The get_next_key follows postorder. For the 4 node example in + * the top of this file, the trie_get_next_key() returns the following + * one after another: + * 192.168.0.0/24 + * 192.168.1.0/24 + * 192.168.128.0/24 + * 192.168.0.0/16 + * + * The idea is to return more specific keys before less specific ones. + */ + + /* Empty trie */ + if (!rcu_dereference(trie->root)) + return -ENOENT; + + /* For invalid key, find the leftmost node in the trie */ + if (!key || key->prefixlen > trie->max_prefixlen) { + root = &trie->root; + goto find_leftmost; + } + + node_stack = kmalloc(trie->max_prefixlen * sizeof(struct lpm_trie_node *), + GFP_USER | __GFP_NOWARN); + if (!node_stack) + return -ENOMEM; + + /* Try to find the exact node for the given key */ + for (node = rcu_dereference(trie->root); node;) { + node_stack[++stack_ptr] = node; + matchlen = longest_prefix_match(trie, node, key); + if (node->prefixlen != matchlen || + node->prefixlen == key->prefixlen) + break; + + next_bit = extract_bit(key->data, node->prefixlen); + node = rcu_dereference(node->child[next_bit]); + } + if (!node || node->prefixlen != key->prefixlen || + (node->flags & LPM_TREE_NODE_FLAG_IM)) { + root = &trie->root; + goto find_leftmost; + } + + /* The node with the exactly-matching key has been found, + * find the first node in postorder after the matched node. + */ + node = node_stack[stack_ptr]; + while (stack_ptr > 0) { + parent = node_stack[stack_ptr - 1]; + if (rcu_dereference(parent->child[0]) == node && + rcu_dereference(parent->child[1])) { + root = &parent->child[1]; + goto find_leftmost; + } + if (!(parent->flags & LPM_TREE_NODE_FLAG_IM)) { + next_node = parent; + goto do_copy; + } + + node = parent; + stack_ptr--; + } + + /* did not find anything */ + err = -ENOENT; + goto free_stack; + +find_leftmost: + /* Find the leftmost non-intermediate node, all intermediate nodes + * have exact two children, so this function will never return NULL. + */ + for (node = rcu_dereference(*root); node;) { + if (!(node->flags & LPM_TREE_NODE_FLAG_IM)) + next_node = node; + node = rcu_dereference(node->child[0]); + } +do_copy: + next_key->prefixlen = next_node->prefixlen; + memcpy((void *)next_key + offsetof(struct bpf_lpm_trie_key, data), + next_node->data, trie->data_size); +free_stack: + kfree(node_stack); + return err; } const struct bpf_map_ops trie_map_ops = { diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index a88cebf368bf..c9401075b58c 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -230,9 +230,12 @@ int bpf_prog_offload_info_fill(struct bpf_prog_info *info, .prog = prog, .info = info, }; + struct bpf_prog_aux *aux = prog->aux; struct inode *ns_inode; struct path ns_path; + char __user *uinsns; void *res; + u32 ulen; res = ns_get_path_cb(&ns_path, bpf_prog_offload_info_fill_ns, &args); if (IS_ERR(res)) { @@ -241,6 +244,26 @@ int bpf_prog_offload_info_fill(struct bpf_prog_info *info, return PTR_ERR(res); } + down_read(&bpf_devs_lock); + + if (!aux->offload) { + up_read(&bpf_devs_lock); + return -ENODEV; + } + + ulen = info->jited_prog_len; + info->jited_prog_len = aux->offload->jited_len; + if (info->jited_prog_len & ulen) { + uinsns = u64_to_user_ptr(info->jited_prog_insns); + ulen = min_t(u32, info->jited_prog_len, ulen); + if (copy_to_user(uinsns, aux->offload->jited_image, ulen)) { + up_read(&bpf_devs_lock); + return -EFAULT; + } + } + + up_read(&bpf_devs_lock); + ns_inode = ns_path.dentry->d_inode; info->netns_dev = new_encode_dev(ns_inode->i_sb->s_dev); info->netns_ino = ns_inode->i_ino; @@ -276,7 +299,8 @@ struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr) if (!capable(CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); - if (attr->map_type != BPF_MAP_TYPE_HASH) + if (attr->map_type != BPF_MAP_TYPE_ARRAY && + attr->map_type != BPF_MAP_TYPE_HASH) return ERR_PTR(-EINVAL); offmap = kzalloc(sizeof(*offmap), GFP_USER); @@ -389,6 +413,61 @@ int bpf_map_offload_get_next_key(struct bpf_map *map, void *key, void *next_key) return ret; } +struct ns_get_path_bpf_map_args { + struct bpf_offloaded_map *offmap; + struct bpf_map_info *info; +}; + +static struct ns_common *bpf_map_offload_info_fill_ns(void *private_data) +{ + struct ns_get_path_bpf_map_args *args = private_data; + struct ns_common *ns; + struct net *net; + + rtnl_lock(); + down_read(&bpf_devs_lock); + + if (args->offmap->netdev) { + args->info->ifindex = args->offmap->netdev->ifindex; + net = dev_net(args->offmap->netdev); + get_net(net); + ns = &net->ns; + } else { + args->info->ifindex = 0; + ns = NULL; + } + + up_read(&bpf_devs_lock); + rtnl_unlock(); + + return ns; +} + +int bpf_map_offload_info_fill(struct bpf_map_info *info, struct bpf_map *map) +{ + struct ns_get_path_bpf_map_args args = { + .offmap = map_to_offmap(map), + .info = info, + }; + struct inode *ns_inode; + struct path ns_path; + void *res; + + res = ns_get_path_cb(&ns_path, bpf_map_offload_info_fill_ns, &args); + if (IS_ERR(res)) { + if (!info->ifindex) + return -ENODEV; + return PTR_ERR(res); + } + + ns_inode = ns_path.dentry->d_inode; + info->netns_dev = new_encode_dev(ns_inode->i_sb->s_dev); + info->netns_ino = ns_inode->i_ino; + path_put(&ns_path); + + return 0; +} + bool bpf_offload_dev_match(struct bpf_prog *prog, struct bpf_map *map) { struct bpf_offloaded_map *offmap; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c691b9e972e3..5bdb0cc84ad2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1504,6 +1504,8 @@ static int bpf_prog_test_run(const union bpf_attr *attr, struct bpf_prog *prog; int ret = -ENOTSUPP; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; if (CHECK_ATTR(BPF_PROG_TEST_RUN)) return -EINVAL; @@ -1724,19 +1726,6 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, goto done; } - ulen = info.jited_prog_len; - info.jited_prog_len = prog->jited_len; - if (info.jited_prog_len && ulen) { - if (bpf_dump_raw_ok()) { - uinsns = u64_to_user_ptr(info.jited_prog_insns); - ulen = min_t(u32, info.jited_prog_len, ulen); - if (copy_to_user(uinsns, prog->bpf_func, ulen)) - return -EFAULT; - } else { - info.jited_prog_insns = 0; - } - } - ulen = info.xlated_prog_len; info.xlated_prog_len = bpf_prog_insn_size(prog); if (info.xlated_prog_len && ulen) { @@ -1762,6 +1751,24 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, err = bpf_prog_offload_info_fill(&info, prog); if (err) return err; + goto done; + } + + /* NOTE: the following code is supposed to be skipped for offload. + * bpf_prog_offload_info_fill() is the place to fill similar fields + * for offload. + */ + ulen = info.jited_prog_len; + info.jited_prog_len = prog->jited_len; + if (info.jited_prog_len && ulen) { + if (bpf_dump_raw_ok()) { + uinsns = u64_to_user_ptr(info.jited_prog_insns); + ulen = min_t(u32, info.jited_prog_len, ulen); + if (copy_to_user(uinsns, prog->bpf_func, ulen)) + return -EFAULT; + } else { + info.jited_prog_insns = 0; + } } done: @@ -1794,6 +1801,12 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map, info.map_flags = map->map_flags; memcpy(info.name, map->name, sizeof(map->name)); + if (bpf_map_is_dev_bound(map)) { + err = bpf_map_offload_info_fill(&info, map); + if (err) + return err; + } + if (copy_to_user(uinfo, &info, info_len) || put_user(info_len, &uattr->info.info_len)) return -EFAULT; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8063dffd91a8..dfb138b46488 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1850,6 +1850,19 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, } } +static bool arg_type_is_mem_ptr(enum bpf_arg_type type) +{ + return type == ARG_PTR_TO_MEM || + type == ARG_PTR_TO_MEM_OR_NULL || + type == ARG_PTR_TO_UNINIT_MEM; +} + +static bool arg_type_is_mem_size(enum bpf_arg_type type) +{ + return type == ARG_CONST_SIZE || + type == ARG_CONST_SIZE_OR_ZERO; +} + static int check_func_arg(struct bpf_verifier_env *env, u32 regno, enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta) @@ -1899,9 +1912,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, expected_type = PTR_TO_CTX; if (type != expected_type) goto err_type; - } else if (arg_type == ARG_PTR_TO_MEM || - arg_type == ARG_PTR_TO_MEM_OR_NULL || - arg_type == ARG_PTR_TO_UNINIT_MEM) { + } else if (arg_type_is_mem_ptr(arg_type)) { expected_type = PTR_TO_STACK; /* One exception here. In case function allows for NULL to be * passed in as argument, it's a SCALAR_VALUE type. Final test @@ -1962,25 +1973,12 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, err = check_stack_boundary(env, regno, meta->map_ptr->value_size, false, NULL); - } else if (arg_type == ARG_CONST_SIZE || - arg_type == ARG_CONST_SIZE_OR_ZERO) { + } else if (arg_type_is_mem_size(arg_type)) { bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO); - /* bpf_xxx(..., buf, len) call will access 'len' bytes - * from stack pointer 'buf'. Check it - * note: regno == len, regno - 1 == buf - */ - if (regno == 0) { - /* kernel subsystem misconfigured verifier */ - verbose(env, - "ARG_CONST_SIZE cannot be first argument\n"); - return -EACCES; - } - /* The register is SCALAR_VALUE; the access check * happens using its boundaries. */ - if (!tnum_is_const(reg->var_off)) /* For unprivileged variable accesses, disable raw * mode so that the program is required to @@ -2124,7 +2122,7 @@ error: return -EINVAL; } -static int check_raw_mode(const struct bpf_func_proto *fn) +static bool check_raw_mode_ok(const struct bpf_func_proto *fn) { int count = 0; @@ -2139,7 +2137,44 @@ static int check_raw_mode(const struct bpf_func_proto *fn) if (fn->arg5_type == ARG_PTR_TO_UNINIT_MEM) count++; - return count > 1 ? -EINVAL : 0; + /* We only support one arg being in raw mode at the moment, + * which is sufficient for the helper functions we have + * right now. + */ + return count <= 1; +} + +static bool check_args_pair_invalid(enum bpf_arg_type arg_curr, + enum bpf_arg_type arg_next) +{ + return (arg_type_is_mem_ptr(arg_curr) && + !arg_type_is_mem_size(arg_next)) || + (!arg_type_is_mem_ptr(arg_curr) && + arg_type_is_mem_size(arg_next)); +} + +static bool check_arg_pair_ok(const struct bpf_func_proto *fn) +{ + /* bpf_xxx(..., buf, len) call will access 'len' + * bytes from memory 'buf'. Both arg types need + * to be paired, so make sure there's no buggy + * helper function specification. + */ + if (arg_type_is_mem_size(fn->arg1_type) || + arg_type_is_mem_ptr(fn->arg5_type) || + check_args_pair_invalid(fn->arg1_type, fn->arg2_type) || + check_args_pair_invalid(fn->arg2_type, fn->arg3_type) || + check_args_pair_invalid(fn->arg3_type, fn->arg4_type) || + check_args_pair_invalid(fn->arg4_type, fn->arg5_type)) + return false; + + return true; +} + +static int check_func_proto(const struct bpf_func_proto *fn) +{ + return check_raw_mode_ok(fn) && + check_arg_pair_ok(fn) ? 0 : -EINVAL; } /* Packet data might have moved, any old PTR_TO_PACKET[_META,_END] @@ -2295,7 +2330,6 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn if (env->ops->get_func_proto) fn = env->ops->get_func_proto(func_id); - if (!fn) { verbose(env, "unknown func %s#%d\n", func_id_name(func_id), func_id); @@ -2319,10 +2353,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn memset(&meta, 0, sizeof(meta)); meta.pkt_access = fn->pkt_access; - /* We only support one arg being in raw mode at the moment, which - * is sufficient for the helper functions we have right now. - */ - err = check_raw_mode(fn); + err = check_func_proto(fn); if (err) { verbose(env, "kernel subsystem misconfigured func %s#%d\n", func_id_name(func_id), func_id); @@ -4803,7 +4834,8 @@ process_bpf_exit: insn_idx++; } - verbose(env, "processed %d insns, stack depth ", insn_processed); + verbose(env, "processed %d insns (limit %d), stack depth ", + insn_processed, BPF_COMPLEXITY_LIMIT_INSNS); for (i = 0; i < env->subprog_cnt + 1; i++) { u32 depth = env->subprog_stack_depth[i]; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index f274468cbc45..fc2838ac8b78 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -245,7 +245,7 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, */ #define __BPF_TP_EMIT() __BPF_ARG3_TP() #define __BPF_TP(...) \ - __trace_printk(1 /* Fake ip will not be printed. */, \ + __trace_printk(0 /* Fake ip */, \ fmt, ##__VA_ARGS__) #define __BPF_ARG1_TP(...) \ diff --git a/lib/test_bpf.c b/lib/test_bpf.c index f369889e521d..e3938e395cba 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -6109,6 +6109,110 @@ static struct bpf_test tests[] = { { { ETH_HLEN, 42 } }, .fill_helper = bpf_fill_ld_abs_vlan_push_pop2, }, + /* Checking interpreter vs JIT wrt signed extended imms. */ + { + "JNE signed compare, test 1", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R1, 0xfefbbc12), + BPF_ALU32_IMM(BPF_MOV, R3, 0xffff0000), + BPF_MOV64_REG(R2, R1), + BPF_ALU64_REG(BPF_AND, R2, R3), + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_JMP_IMM(BPF_JNE, R2, -17104896, 1), + BPF_ALU32_IMM(BPF_MOV, R0, 2), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, + { + "JNE signed compare, test 2", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R1, 0xfefbbc12), + BPF_ALU32_IMM(BPF_MOV, R3, 0xffff0000), + BPF_MOV64_REG(R2, R1), + BPF_ALU64_REG(BPF_AND, R2, R3), + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_JMP_IMM(BPF_JNE, R2, 0xfefb0000, 1), + BPF_ALU32_IMM(BPF_MOV, R0, 2), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, + { + "JNE signed compare, test 3", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R1, 0xfefbbc12), + BPF_ALU32_IMM(BPF_MOV, R3, 0xffff0000), + BPF_ALU32_IMM(BPF_MOV, R4, 0xfefb0000), + BPF_MOV64_REG(R2, R1), + BPF_ALU64_REG(BPF_AND, R2, R3), + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_JMP_REG(BPF_JNE, R2, R4, 1), + BPF_ALU32_IMM(BPF_MOV, R0, 2), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 2 } }, + }, + { + "JNE signed compare, test 4", + .u.insns_int = { + BPF_LD_IMM64(R1, -17104896), + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_JMP_IMM(BPF_JNE, R1, -17104896, 1), + BPF_ALU32_IMM(BPF_MOV, R0, 2), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 2 } }, + }, + { + "JNE signed compare, test 5", + .u.insns_int = { + BPF_LD_IMM64(R1, 0xfefb0000), + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_JMP_IMM(BPF_JNE, R1, 0xfefb0000, 1), + BPF_ALU32_IMM(BPF_MOV, R0, 2), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, + { + "JNE signed compare, test 6", + .u.insns_int = { + BPF_LD_IMM64(R1, 0x7efb0000), + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_JMP_IMM(BPF_JNE, R1, 0x7efb0000, 1), + BPF_ALU32_IMM(BPF_MOV, R0, 2), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 2 } }, + }, + { + "JNE signed compare, test 7", + .u.insns = { + BPF_STMT(BPF_LD | BPF_IMM, 0xffff0000), + BPF_STMT(BPF_MISC | BPF_TAX, 0), + BPF_STMT(BPF_LD | BPF_IMM, 0xfefbbc12), + BPF_STMT(BPF_ALU | BPF_AND | BPF_X, 0), + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0xfefb0000, 1, 0), + BPF_STMT(BPF_RET | BPF_K, 1), + BPF_STMT(BPF_RET | BPF_K, 2), + }, + CLASSIC | FLAG_NO_DATA, + {}, + { { 0, 2 } }, + }, }; static struct net_device dev; diff --git a/net/core/filter.c b/net/core/filter.c index 08491b3b8742..18da42a81d0c 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2865,7 +2865,7 @@ static const struct bpf_func_proto bpf_skb_event_output_proto = { .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM, - .arg5_type = ARG_CONST_SIZE, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; static unsigned short bpf_tunnel_key_af(u64 flags) @@ -3154,7 +3154,7 @@ static const struct bpf_func_proto bpf_xdp_event_output_proto = { .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM, - .arg5_type = ARG_CONST_SIZE, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; BPF_CALL_1(bpf_get_socket_cookie, struct sk_buff *, skb) @@ -3460,6 +3460,8 @@ xdp_func_proto(enum bpf_func_id func_id) return &bpf_xdp_event_output_proto; case BPF_FUNC_get_smp_processor_id: return &bpf_get_smp_processor_id_proto; + case BPF_FUNC_csum_diff: + return &bpf_csum_diff_proto; case BPF_FUNC_xdp_adjust_head: return &bpf_xdp_adjust_head_proto; case BPF_FUNC_xdp_adjust_meta: @@ -4530,6 +4532,7 @@ const struct bpf_verifier_ops sk_filter_verifier_ops = { }; const struct bpf_prog_ops sk_filter_prog_ops = { + .test_run = bpf_prog_test_run_skb, }; const struct bpf_verifier_ops tc_cls_act_verifier_ops = { diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index a47ad6cd41c0..f2d0462611c3 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -25,6 +25,7 @@ static int zero = 0; static int one = 1; +static int two __maybe_unused = 2; static int min_sndbuf = SOCK_MIN_SNDBUF; static int min_rcvbuf = SOCK_MIN_RCVBUF; static int max_skb_frags = MAX_SKB_FRAGS; @@ -250,6 +251,46 @@ static int proc_do_rss_key(struct ctl_table *table, int write, return proc_dostring(&fake_table, write, buffer, lenp, ppos); } +#ifdef CONFIG_BPF_JIT +static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret, jit_enable = *(int *)table->data; + struct ctl_table tmp = *table; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + tmp.data = &jit_enable; + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + if (write && !ret) { + if (jit_enable < 2 || + (jit_enable == 2 && bpf_dump_raw_ok())) { + *(int *)table->data = jit_enable; + if (jit_enable == 2) + pr_warn("bpf_jit_enable = 2 was set! NEVER use this in production, only for JIT debugging!\n"); + } else { + ret = -EPERM; + } + } + return ret; +} + +# ifdef CONFIG_HAVE_EBPF_JIT +static int +proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + return proc_dointvec_minmax(table, write, buffer, lenp, ppos); +} +# endif +#endif + static struct ctl_table net_core_table[] = { #ifdef CONFIG_NET { @@ -325,13 +366,14 @@ static struct ctl_table net_core_table[] = { .data = &bpf_jit_enable, .maxlen = sizeof(int), .mode = 0644, -#ifndef CONFIG_BPF_JIT_ALWAYS_ON - .proc_handler = proc_dointvec -#else - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dointvec_minmax_bpf_enable, +# ifdef CONFIG_BPF_JIT_ALWAYS_ON .extra1 = &one, .extra2 = &one, -#endif +# else + .extra1 = &zero, + .extra2 = &two, +# endif }, # ifdef CONFIG_HAVE_EBPF_JIT { @@ -339,14 +381,18 @@ static struct ctl_table net_core_table[] = { .data = &bpf_jit_harden, .maxlen = sizeof(int), .mode = 0600, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax_bpf_restricted, + .extra1 = &zero, + .extra2 = &two, }, { .procname = "bpf_jit_kallsyms", .data = &bpf_jit_kallsyms, .maxlen = sizeof(int), .mode = 0600, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax_bpf_restricted, + .extra1 = &zero, + .extra2 = &one, }, # endif #endif diff --git a/net/socket.c b/net/socket.c index fbfae1ed3ff5..1536515b6437 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2613,15 +2613,6 @@ out_fs: core_initcall(sock_init); /* early initcall */ -static int __init jit_init(void) -{ -#ifdef CONFIG_BPF_JIT_ALWAYS_ON - bpf_jit_enable = 1; -#endif - return 0; -} -pure_initcall(jit_init); - #ifdef CONFIG_PROC_FS void socket_seq_show(struct seq_file *seq) { diff --git a/samples/bpf/xdp2skb_meta_kern.c b/samples/bpf/xdp2skb_meta_kern.c index 12e1024069c2..0c12048ac79f 100644 --- a/samples/bpf/xdp2skb_meta_kern.c +++ b/samples/bpf/xdp2skb_meta_kern.c @@ -35,15 +35,17 @@ int _xdp_mark(struct xdp_md *ctx) void *data, *data_end; int ret; - /* Reserve space in-front data pointer for our meta info. + /* Reserve space in-front of data pointer for our meta info. * (Notice drivers not supporting data_meta will fail here!) */ ret = bpf_xdp_adjust_meta(ctx, -(int)sizeof(*meta)); if (ret < 0) return XDP_ABORTED; - /* For some unknown reason, these ctx pointers must be read - * after bpf_xdp_adjust_meta, else verifier will reject prog. + /* Notice: Kernel-side verifier requires that loading of + * ctx->data MUST happen _after_ helper bpf_xdp_adjust_meta(), + * as pkt-data pointers are invalidated. Helpers that require + * this are determined/marked by bpf_helper_changes_pkt_data() */ data = (void *)(unsigned long)ctx->data; diff --git a/samples/bpf/xdp_monitor_kern.c b/samples/bpf/xdp_monitor_kern.c index c969141bfa8b..211db8ded0de 100644 --- a/samples/bpf/xdp_monitor_kern.c +++ b/samples/bpf/xdp_monitor_kern.c @@ -1,6 +1,7 @@ -/* XDP monitor tool, based on tracepoints +/* SPDX-License-Identifier: GPL-2.0 + * Copyright(c) 2017-2018 Jesper Dangaard Brouer, Red Hat Inc. * - * Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat Inc. + * XDP monitor tool, based on tracepoints */ #include <uapi/linux/bpf.h> #include "bpf_helpers.h" @@ -118,3 +119,92 @@ int trace_xdp_exception(struct xdp_exception_ctx *ctx) return 0; } + +/* Common stats data record shared with _user.c */ +struct datarec { + u64 processed; + u64 dropped; + u64 info; +}; +#define MAX_CPUS 64 + +struct bpf_map_def SEC("maps") cpumap_enqueue_cnt = { + .type = BPF_MAP_TYPE_PERCPU_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(struct datarec), + .max_entries = MAX_CPUS, +}; + +struct bpf_map_def SEC("maps") cpumap_kthread_cnt = { + .type = BPF_MAP_TYPE_PERCPU_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(struct datarec), + .max_entries = 1, +}; + +/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_enqueue/format + * Code in: kernel/include/trace/events/xdp.h + */ +struct cpumap_enqueue_ctx { + u64 __pad; // First 8 bytes are not accessible by bpf code + int map_id; // offset:8; size:4; signed:1; + u32 act; // offset:12; size:4; signed:0; + int cpu; // offset:16; size:4; signed:1; + unsigned int drops; // offset:20; size:4; signed:0; + unsigned int processed; // offset:24; size:4; signed:0; + int to_cpu; // offset:28; size:4; signed:1; +}; + +SEC("tracepoint/xdp/xdp_cpumap_enqueue") +int trace_xdp_cpumap_enqueue(struct cpumap_enqueue_ctx *ctx) +{ + u32 to_cpu = ctx->to_cpu; + struct datarec *rec; + + if (to_cpu >= MAX_CPUS) + return 1; + + rec = bpf_map_lookup_elem(&cpumap_enqueue_cnt, &to_cpu); + if (!rec) + return 0; + rec->processed += ctx->processed; + rec->dropped += ctx->drops; + + /* Record bulk events, then userspace can calc average bulk size */ + if (ctx->processed > 0) + rec->info += 1; + + return 0; +} + +/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_kthread/format + * Code in: kernel/include/trace/events/xdp.h + */ +struct cpumap_kthread_ctx { + u64 __pad; // First 8 bytes are not accessible by bpf code + int map_id; // offset:8; size:4; signed:1; + u32 act; // offset:12; size:4; signed:0; + int cpu; // offset:16; size:4; signed:1; + unsigned int drops; // offset:20; size:4; signed:0; + unsigned int processed; // offset:24; size:4; signed:0; + int sched; // offset:28; size:4; signed:1; +}; + +SEC("tracepoint/xdp/xdp_cpumap_kthread") +int trace_xdp_cpumap_kthread(struct cpumap_kthread_ctx *ctx) +{ + struct datarec *rec; + u32 key = 0; + + rec = bpf_map_lookup_elem(&cpumap_kthread_cnt, &key); + if (!rec) + return 0; + rec->processed += ctx->processed; + rec->dropped += ctx->drops; + + /* Count times kthread yielded CPU via schedule call */ + if (ctx->sched) + rec->info++; + + return 0; +} diff --git a/samples/bpf/xdp_monitor_user.c b/samples/bpf/xdp_monitor_user.c index eaba165b3549..eec14520d513 100644 --- a/samples/bpf/xdp_monitor_user.c +++ b/samples/bpf/xdp_monitor_user.c @@ -1,4 +1,5 @@ -/* Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc. +/* SPDX-License-Identifier: GPL-2.0 + * Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc. */ static const char *__doc__= "XDP monitor tool, based on tracepoints\n" @@ -40,6 +41,9 @@ static const struct option long_options[] = { {0, 0, NULL, 0 } }; +/* C standard specifies two constants, EXIT_SUCCESS(0) and EXIT_FAILURE(1) */ +#define EXIT_FAIL_MEM 5 + static void usage(char *argv[]) { int i; @@ -108,23 +112,93 @@ static const char *action2str(int action) return NULL; } +/* Common stats data record shared with _kern.c */ +struct datarec { + __u64 processed; + __u64 dropped; + __u64 info; +}; +#define MAX_CPUS 64 + +/* Userspace structs for collection of stats from maps */ struct record { - __u64 counter; __u64 timestamp; + struct datarec total; + struct datarec *cpu; +}; +struct u64rec { + __u64 processed; +}; +struct record_u64 { + /* record for _kern side __u64 values */ + __u64 timestamp; + struct u64rec total; + struct u64rec *cpu; }; struct stats_record { - struct record xdp_redir[REDIR_RES_MAX]; - struct record xdp_exception[XDP_ACTION_MAX]; + struct record_u64 xdp_redirect[REDIR_RES_MAX]; + struct record_u64 xdp_exception[XDP_ACTION_MAX]; + struct record xdp_cpumap_kthread; + struct record xdp_cpumap_enqueue[MAX_CPUS]; }; -static void stats_print_headers(bool err_only) +static bool map_collect_record(int fd, __u32 key, struct record *rec) { - if (err_only) - printf("\n%s\n", __doc_err_only__); + /* For percpu maps, userspace gets a value per possible CPU */ + unsigned int nr_cpus = bpf_num_possible_cpus(); + struct datarec values[nr_cpus]; + __u64 sum_processed = 0; + __u64 sum_dropped = 0; + __u64 sum_info = 0; + int i; + + if ((bpf_map_lookup_elem(fd, &key, values)) != 0) { + fprintf(stderr, + "ERR: bpf_map_lookup_elem failed key:0x%X\n", key); + return false; + } + /* Get time as close as possible to reading map contents */ + rec->timestamp = gettime(); - printf("%-14s %-11s %-10s %-18s %-9s\n", - "ACTION", "result", "pps ", "pps-human-readable", "measure-period"); + /* Record and sum values from each CPU */ + for (i = 0; i < nr_cpus; i++) { + rec->cpu[i].processed = values[i].processed; + sum_processed += values[i].processed; + rec->cpu[i].dropped = values[i].dropped; + sum_dropped += values[i].dropped; + rec->cpu[i].info = values[i].info; + sum_info += values[i].info; + } + rec->total.processed = sum_processed; + rec->total.dropped = sum_dropped; + rec->total.info = sum_info; + return true; +} + +static bool map_collect_record_u64(int fd, __u32 key, struct record_u64 *rec) +{ + /* For percpu maps, userspace gets a value per possible CPU */ + unsigned int nr_cpus = bpf_num_possible_cpus(); + struct u64rec values[nr_cpus]; + __u64 sum_total = 0; + int i; + + if ((bpf_map_lookup_elem(fd, &key, values)) != 0) { + fprintf(stderr, + "ERR: bpf_map_lookup_elem failed key:0x%X\n", key); + return false; + } + /* Get time as close as possible to reading map contents */ + rec->timestamp = gettime(); + + /* Record and sum values from each CPU */ + for (i = 0; i < nr_cpus; i++) { + rec->cpu[i].processed = values[i].processed; + sum_total += values[i].processed; + } + rec->total.processed = sum_total; + return true; } static double calc_period(struct record *r, struct record *p) @@ -139,77 +213,203 @@ static double calc_period(struct record *r, struct record *p) return period_; } -static double calc_pps(struct record *r, struct record *p, double period) +static double calc_period_u64(struct record_u64 *r, struct record_u64 *p) +{ + double period_ = 0; + __u64 period = 0; + + period = r->timestamp - p->timestamp; + if (period > 0) + period_ = ((double) period / NANOSEC_PER_SEC); + + return period_; +} + +static double calc_pps(struct datarec *r, struct datarec *p, double period) +{ + __u64 packets = 0; + double pps = 0; + + if (period > 0) { + packets = r->processed - p->processed; + pps = packets / period; + } + return pps; +} + +static double calc_pps_u64(struct u64rec *r, struct u64rec *p, double period) +{ + __u64 packets = 0; + double pps = 0; + + if (period > 0) { + packets = r->processed - p->processed; + pps = packets / period; + } + return pps; +} + +static double calc_drop(struct datarec *r, struct datarec *p, double period) +{ + __u64 packets = 0; + double pps = 0; + + if (period > 0) { + packets = r->dropped - p->dropped; + pps = packets / period; + } + return pps; +} + +static double calc_info(struct datarec *r, struct datarec *p, double period) { __u64 packets = 0; double pps = 0; if (period > 0) { - packets = r->counter - p->counter; + packets = r->info - p->info; pps = packets / period; } return pps; } -static void stats_print(struct stats_record *rec, - struct stats_record *prev, +static void stats_print(struct stats_record *stats_rec, + struct stats_record *stats_prev, bool err_only) { - double period = 0, pps = 0; - struct record *r, *p; - int i = 0; + unsigned int nr_cpus = bpf_num_possible_cpus(); + int rec_i = 0, i, to_cpu; + double t = 0, pps = 0; - char *fmt = "%-14s %-11s %-10.0f %'-18.0f %f\n"; + /* Header */ + printf("%-15s %-7s %-12s %-12s %-9s\n", + "XDP-event", "CPU:to", "pps", "drop-pps", "extra-info"); /* tracepoint: xdp:xdp_redirect_* */ if (err_only) - i = REDIR_ERROR; - - for (; i < REDIR_RES_MAX; i++) { - r = &rec->xdp_redir[i]; - p = &prev->xdp_redir[i]; - - if (p->timestamp) { - period = calc_period(r, p); - pps = calc_pps(r, p, period); + rec_i = REDIR_ERROR; + + for (; rec_i < REDIR_RES_MAX; rec_i++) { + struct record_u64 *rec, *prev; + char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %s\n"; + char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %s\n"; + + rec = &stats_rec->xdp_redirect[rec_i]; + prev = &stats_prev->xdp_redirect[rec_i]; + t = calc_period_u64(rec, prev); + + for (i = 0; i < nr_cpus; i++) { + struct u64rec *r = &rec->cpu[i]; + struct u64rec *p = &prev->cpu[i]; + + pps = calc_pps_u64(r, p, t); + if (pps > 0) + printf(fmt1, "XDP_REDIRECT", i, + rec_i ? 0.0: pps, rec_i ? pps : 0.0, + err2str(rec_i)); } - printf(fmt, "XDP_REDIRECT", err2str(i), pps, pps, period); + pps = calc_pps_u64(&rec->total, &prev->total, t); + printf(fmt2, "XDP_REDIRECT", "total", + rec_i ? 0.0: pps, rec_i ? pps : 0.0, err2str(rec_i)); } /* tracepoint: xdp:xdp_exception */ - for (i = 0; i < XDP_ACTION_MAX; i++) { - r = &rec->xdp_exception[i]; - p = &prev->xdp_exception[i]; - if (p->timestamp) { - period = calc_period(r, p); - pps = calc_pps(r, p, period); + for (rec_i = 0; rec_i < XDP_ACTION_MAX; rec_i++) { + struct record_u64 *rec, *prev; + char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %s\n"; + char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %s\n"; + + rec = &stats_rec->xdp_exception[rec_i]; + prev = &stats_prev->xdp_exception[rec_i]; + t = calc_period_u64(rec, prev); + + for (i = 0; i < nr_cpus; i++) { + struct u64rec *r = &rec->cpu[i]; + struct u64rec *p = &prev->cpu[i]; + + pps = calc_pps_u64(r, p, t); + if (pps > 0) + printf(fmt1, "Exception", i, + 0.0, pps, err2str(rec_i)); } + pps = calc_pps_u64(&rec->total, &prev->total, t); if (pps > 0) - printf(fmt, action2str(i), "Exception", - pps, pps, period); + printf(fmt2, "Exception", "total", + 0.0, pps, action2str(rec_i)); } - printf("\n"); -} -static __u64 get_key32_value64_percpu(int fd, __u32 key) -{ - /* For percpu maps, userspace gets a value per possible CPU */ - unsigned int nr_cpus = bpf_num_possible_cpus(); - __u64 values[nr_cpus]; - __u64 sum = 0; - int i; - - if ((bpf_map_lookup_elem(fd, &key, values)) != 0) { - fprintf(stderr, - "ERR: bpf_map_lookup_elem failed key:0x%X\n", key); - return 0; + /* cpumap enqueue stats */ + for (to_cpu = 0; to_cpu < MAX_CPUS; to_cpu++) { + char *fmt1 = "%-15s %3d:%-3d %'-12.0f %'-12.0f %'-10.2f %s\n"; + char *fmt2 = "%-15s %3s:%-3d %'-12.0f %'-12.0f %'-10.2f %s\n"; + struct record *rec, *prev; + char *info_str = ""; + double drop, info; + + rec = &stats_rec->xdp_cpumap_enqueue[to_cpu]; + prev = &stats_prev->xdp_cpumap_enqueue[to_cpu]; + t = calc_period(rec, prev); + for (i = 0; i < nr_cpus; i++) { + struct datarec *r = &rec->cpu[i]; + struct datarec *p = &prev->cpu[i]; + + pps = calc_pps(r, p, t); + drop = calc_drop(r, p, t); + info = calc_info(r, p, t); + if (info > 0) { + info_str = "bulk-average"; + info = pps / info; /* calc average bulk size */ + } + if (pps > 0) + printf(fmt1, "cpumap-enqueue", + i, to_cpu, pps, drop, info, info_str); + } + pps = calc_pps(&rec->total, &prev->total, t); + if (pps > 0) { + drop = calc_drop(&rec->total, &prev->total, t); + info = calc_info(&rec->total, &prev->total, t); + if (info > 0) { + info_str = "bulk-average"; + info = pps / info; /* calc average bulk size */ + } + printf(fmt2, "cpumap-enqueue", + "sum", to_cpu, pps, drop, info, info_str); + } } - /* Sum values from each CPU */ - for (i = 0; i < nr_cpus; i++) { - sum += values[i]; + /* cpumap kthread stats */ + { + char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %'-10.0f %s\n"; + char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %'-10.0f %s\n"; + struct record *rec, *prev; + double drop, info; + char *i_str = ""; + + rec = &stats_rec->xdp_cpumap_kthread; + prev = &stats_prev->xdp_cpumap_kthread; + t = calc_period(rec, prev); + for (i = 0; i < nr_cpus; i++) { + struct datarec *r = &rec->cpu[i]; + struct datarec *p = &prev->cpu[i]; + + pps = calc_pps(r, p, t); + drop = calc_drop(r, p, t); + info = calc_info(r, p, t); + if (info > 0) + i_str = "sched"; + if (pps > 0) + printf(fmt1, "cpumap-kthread", + i, pps, drop, info, i_str); + } + pps = calc_pps(&rec->total, &prev->total, t); + drop = calc_drop(&rec->total, &prev->total, t); + info = calc_info(&rec->total, &prev->total, t); + if (info > 0) + i_str = "sched-sum"; + printf(fmt2, "cpumap-kthread", "total", pps, drop, info, i_str); } - return sum; + + printf("\n"); } static bool stats_collect(struct stats_record *rec) @@ -222,25 +422,109 @@ static bool stats_collect(struct stats_record *rec) */ fd = map_data[0].fd; /* map0: redirect_err_cnt */ - for (i = 0; i < REDIR_RES_MAX; i++) { - rec->xdp_redir[i].timestamp = gettime(); - rec->xdp_redir[i].counter = get_key32_value64_percpu(fd, i); - } + for (i = 0; i < REDIR_RES_MAX; i++) + map_collect_record_u64(fd, i, &rec->xdp_redirect[i]); fd = map_data[1].fd; /* map1: exception_cnt */ for (i = 0; i < XDP_ACTION_MAX; i++) { - rec->xdp_exception[i].timestamp = gettime(); - rec->xdp_exception[i].counter = get_key32_value64_percpu(fd, i); + map_collect_record_u64(fd, i, &rec->xdp_exception[i]); } + fd = map_data[2].fd; /* map2: cpumap_enqueue_cnt */ + for (i = 0; i < MAX_CPUS; i++) + map_collect_record(fd, i, &rec->xdp_cpumap_enqueue[i]); + + fd = map_data[3].fd; /* map3: cpumap_kthread_cnt */ + map_collect_record(fd, 0, &rec->xdp_cpumap_kthread); + return true; } +static void *alloc_rec_per_cpu(int record_size) +{ + unsigned int nr_cpus = bpf_num_possible_cpus(); + void *array; + size_t size; + + size = record_size * nr_cpus; + array = malloc(size); + memset(array, 0, size); + if (!array) { + fprintf(stderr, "Mem alloc error (nr_cpus:%u)\n", nr_cpus); + exit(EXIT_FAIL_MEM); + } + return array; +} + +static struct stats_record *alloc_stats_record(void) +{ + struct stats_record *rec; + int rec_sz; + int i; + + /* Alloc main stats_record structure */ + rec = malloc(sizeof(*rec)); + memset(rec, 0, sizeof(*rec)); + if (!rec) { + fprintf(stderr, "Mem alloc error\n"); + exit(EXIT_FAIL_MEM); + } + + /* Alloc stats stored per CPU for each record */ + rec_sz = sizeof(struct u64rec); + for (i = 0; i < REDIR_RES_MAX; i++) + rec->xdp_redirect[i].cpu = alloc_rec_per_cpu(rec_sz); + + for (i = 0; i < XDP_ACTION_MAX; i++) + rec->xdp_exception[i].cpu = alloc_rec_per_cpu(rec_sz); + + rec_sz = sizeof(struct datarec); + rec->xdp_cpumap_kthread.cpu = alloc_rec_per_cpu(rec_sz); + + for (i = 0; i < MAX_CPUS; i++) + rec->xdp_cpumap_enqueue[i].cpu = alloc_rec_per_cpu(rec_sz); + + return rec; +} + +static void free_stats_record(struct stats_record *r) +{ + int i; + + for (i = 0; i < REDIR_RES_MAX; i++) + free(r->xdp_redirect[i].cpu); + + for (i = 0; i < XDP_ACTION_MAX; i++) + free(r->xdp_exception[i].cpu); + + free(r->xdp_cpumap_kthread.cpu); + + for (i = 0; i < MAX_CPUS; i++) + free(r->xdp_cpumap_enqueue[i].cpu); + + free(r); +} + +/* Pointer swap trick */ +static inline void swap(struct stats_record **a, struct stats_record **b) +{ + struct stats_record *tmp; + + tmp = *a; + *a = *b; + *b = tmp; +} + static void stats_poll(int interval, bool err_only) { - struct stats_record rec, prev; + struct stats_record *rec, *prev; - memset(&rec, 0, sizeof(rec)); + rec = alloc_stats_record(); + prev = alloc_stats_record(); + stats_collect(rec); + + if (err_only) + printf("\n%s\n", __doc_err_only__); /* Trick to pretty printf with thousands separators use %' */ setlocale(LC_NUMERIC, "en_US"); @@ -258,13 +542,15 @@ static void stats_poll(int interval, bool err_only) fflush(stdout); while (1) { - memcpy(&prev, &rec, sizeof(rec)); - stats_collect(&rec); - stats_print_headers(err_only); - stats_print(&rec, &prev, err_only); + swap(&prev, &rec); + stats_collect(rec); + stats_print(rec, prev, err_only); fflush(stdout); sleep(interval); } + + free_stats_record(rec); + free_stats_record(prev); } static void print_bpf_prog_info(void) diff --git a/tools/bpf/bpf_jit_disasm.c b/tools/bpf/bpf_jit_disasm.c index 30044bc4f389..58c2bab4ef6e 100644 --- a/tools/bpf/bpf_jit_disasm.c +++ b/tools/bpf/bpf_jit_disasm.c @@ -172,7 +172,8 @@ static uint8_t *get_last_jit_image(char *haystack, size_t hlen, { char *ptr, *pptr, *tmp; off_t off = 0; - int ret, flen, proglen, pass, ulen = 0; + unsigned int proglen; + int ret, flen, pass, ulen = 0; regmatch_t pmatch[1]; unsigned long base; regex_t regex; @@ -199,7 +200,7 @@ static uint8_t *get_last_jit_image(char *haystack, size_t hlen, } ptr = haystack + off - (pmatch[0].rm_eo - pmatch[0].rm_so); - ret = sscanf(ptr, "flen=%d proglen=%d pass=%d image=%lx", + ret = sscanf(ptr, "flen=%d proglen=%u pass=%d image=%lx", &flen, &proglen, &pass, &base); if (ret != 4) { regfree(®ex); @@ -239,7 +240,7 @@ static uint8_t *get_last_jit_image(char *haystack, size_t hlen, } assert(ulen == proglen); - printf("%d bytes emitted from JIT compiler (pass:%d, flen:%d)\n", + printf("%u bytes emitted from JIT compiler (pass:%d, flen:%d)\n", proglen, pass, flen); printf("%lx + <x>:\n", base); diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c index 6601c95a9258..0b482c0070e0 100644 --- a/tools/bpf/bpftool/common.c +++ b/tools/bpf/bpftool/common.c @@ -34,6 +34,7 @@ /* Author: Jakub Kicinski <kubakici@wp.pl> */ #include <errno.h> +#include <fcntl.h> #include <fts.h> #include <libgen.h> #include <mntent.h> @@ -433,6 +434,77 @@ ifindex_to_name_ns(__u32 ifindex, __u32 ns_dev, __u32 ns_ino, char *buf) return if_indextoname(ifindex, buf); } +static int read_sysfs_hex_int(char *path) +{ + char vendor_id_buf[8]; + int len; + int fd; + + fd = open(path, O_RDONLY); + if (fd < 0) { + p_err("Can't open %s: %s", path, strerror(errno)); + return -1; + } + + len = read(fd, vendor_id_buf, sizeof(vendor_id_buf)); + close(fd); + if (len < 0) { + p_err("Can't read %s: %s", path, strerror(errno)); + return -1; + } + if (len >= (int)sizeof(vendor_id_buf)) { + p_err("Value in %s too long", path); + return -1; + } + + vendor_id_buf[len] = 0; + + return strtol(vendor_id_buf, NULL, 0); +} + +static int read_sysfs_netdev_hex_int(char *devname, const char *entry_name) +{ + char full_path[64]; + + snprintf(full_path, sizeof(full_path), "/sys/class/net/%s/device/%s", + devname, entry_name); + + return read_sysfs_hex_int(full_path); +} + +const char *ifindex_to_bfd_name_ns(__u32 ifindex, __u64 ns_dev, __u64 ns_ino) +{ + char devname[IF_NAMESIZE]; + int vendor_id; + int device_id; + + if (!ifindex_to_name_ns(ifindex, ns_dev, ns_ino, devname)) { + p_err("Can't get net device name for ifindex %d: %s", ifindex, + strerror(errno)); + return NULL; + } + + vendor_id = read_sysfs_netdev_hex_int(devname, "vendor"); + if (vendor_id < 0) { + p_err("Can't get device vendor id for %s", devname); + return NULL; + } + + switch (vendor_id) { + case 0x19ee: + device_id = read_sysfs_netdev_hex_int(devname, "device"); + if (device_id != 0x4000 && + device_id != 0x6000 && + device_id != 0x6003) + p_info("Unknown NFP device ID, assuming it is NFP-6xxx arch"); + return "NFP-6xxx"; + default: + p_err("Can't get bfd arch name for device vendor id 0x%04x", + vendor_id); + return NULL; + } +} + void print_dev_plain(__u32 ifindex, __u64 ns_dev, __u64 ns_inode) { char name[IF_NAMESIZE]; diff --git a/tools/bpf/bpftool/jit_disasm.c b/tools/bpf/bpftool/jit_disasm.c index 57d32e8a1391..87439320ef70 100644 --- a/tools/bpf/bpftool/jit_disasm.c +++ b/tools/bpf/bpftool/jit_disasm.c @@ -76,7 +76,8 @@ static int fprintf_json(void *out, const char *fmt, ...) return 0; } -void disasm_print_insn(unsigned char *image, ssize_t len, int opcodes) +void disasm_print_insn(unsigned char *image, ssize_t len, int opcodes, + const char *arch) { disassembler_ftype disassemble; struct disassemble_info info; @@ -100,6 +101,19 @@ void disasm_print_insn(unsigned char *image, ssize_t len, int opcodes) else init_disassemble_info(&info, stdout, (fprintf_ftype) fprintf); + + /* Update architecture info for offload. */ + if (arch) { + const bfd_arch_info_type *inf = bfd_scan_arch(arch); + + if (inf) { + bfdf->arch_info = inf; + } else { + p_err("No libfd support for %s", arch); + return; + } + } + info.arch = bfd_get_arch(bfdf); info.mach = bfd_get_mach(bfdf); info.buffer = image; diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h index 65b526fe6e7e..b8e9584d6246 100644 --- a/tools/bpf/bpftool/main.h +++ b/tools/bpf/bpftool/main.h @@ -121,7 +121,10 @@ int do_cgroup(int argc, char **arg); int prog_parse_fd(int *argc, char ***argv); -void disasm_print_insn(unsigned char *image, ssize_t len, int opcodes); +void disasm_print_insn(unsigned char *image, ssize_t len, int opcodes, + const char *arch); void print_hex_data_json(uint8_t *data, size_t len); +const char *ifindex_to_bfd_name_ns(__u32 ifindex, __u64 ns_dev, __u64 ns_ino); + #endif diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c index 8d7db9d6b9cd..f95fa67bb498 100644 --- a/tools/bpf/bpftool/map.c +++ b/tools/bpf/bpftool/map.c @@ -66,6 +66,7 @@ static const char * const map_type_name[] = { [BPF_MAP_TYPE_HASH_OF_MAPS] = "hash_of_maps", [BPF_MAP_TYPE_DEVMAP] = "devmap", [BPF_MAP_TYPE_SOCKMAP] = "sockmap", + [BPF_MAP_TYPE_CPUMAP] = "cpumap", }; static unsigned int get_possible_cpus(void) @@ -428,6 +429,9 @@ static int show_map_close_json(int fd, struct bpf_map_info *info) jsonw_name(json_wtr, "flags"); jsonw_printf(json_wtr, "%#x", info->map_flags); + + print_dev_json(info->ifindex, info->netns_dev, info->netns_ino); + jsonw_uint_field(json_wtr, "bytes_key", info->key_size); jsonw_uint_field(json_wtr, "bytes_value", info->value_size); jsonw_uint_field(json_wtr, "max_entries", info->max_entries); @@ -469,7 +473,9 @@ static int show_map_close_plain(int fd, struct bpf_map_info *info) if (*info->name) printf("name %s ", info->name); - printf("flags 0x%x\n", info->map_flags); + printf("flags 0x%x", info->map_flags); + print_dev_plain(info->ifindex, info->netns_dev, info->netns_ino); + printf("\n"); printf("\tkey %uB value %uB max_entries %u", info->key_size, info->value_size, info->max_entries); diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index 099e21cf1b5c..e8e2baaf93c2 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -776,7 +776,17 @@ static int do_dump(int argc, char **argv) } } else { if (member_len == &info.jited_prog_len) { - disasm_print_insn(buf, *member_len, opcodes); + const char *name = NULL; + + if (info.ifindex) { + name = ifindex_to_bfd_name_ns(info.ifindex, + info.netns_dev, + info.netns_ino); + if (!name) + goto err_free; + } + + disasm_print_insn(buf, *member_len, opcodes, name); } else { kernel_syms_load(&dd); if (json_output) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 69f96af4a569..af1f49ad8b88 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -900,6 +900,9 @@ struct xdp_md { __u32 data; __u32 data_end; __u32 data_meta; + /* Below access go through struct xdp_rxq_info */ + __u32 ingress_ifindex; /* rxq->dev->ifindex */ + __u32 rx_queue_index; /* rxq->queue_index */ }; enum sk_action { @@ -935,6 +938,9 @@ struct bpf_map_info { __u32 max_entries; __u32 map_flags; char name[BPF_OBJ_NAME_LEN]; + __u32 ifindex; + __u64 netns_dev; + __u64 netns_ino; } __attribute__((aligned(8))); /* User bpf_sock_ops struct to access socket values and specify request ops @@ -956,6 +962,12 @@ struct bpf_sock_ops { __u32 local_ip6[4]; /* Stored in network byte order */ __u32 remote_port; /* Stored in network byte order */ __u32 local_port; /* stored in host byte order */ + __u32 is_fullsock; /* Some TCP fields are only valid if + * there is a full socket. If not, the + * fields read as zero. + */ + __u32 snd_cwnd; + __u32 srtt_us; /* Averaged RTT << 3 in usecs */ }; /* List of known BPF sock_ops operators. @@ -1010,7 +1022,8 @@ struct bpf_perf_event_value { #define BPF_DEVCG_DEV_CHAR (1ULL << 1) struct bpf_cgroup_dev_ctx { - __u32 access_type; /* (access << 16) | type */ + /* access_type encoded as (BPF_DEVCG_ACC_* << 16) | BPF_DEVCG_DEV_* */ + __u32 access_type; __u32 major; __u32 minor; }; diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index 541d9d7fad5a..1e09d77f1948 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -3,3 +3,10 @@ test_maps test_lru_map test_lpm_map test_tag +FEATURE-DUMP.libbpf +fixdep +test_align +test_dev_cgroup +test_progs +test_verifier_log +feature diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index a8aa7e251c8e..3a44b655d852 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -19,7 +19,8 @@ TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test_obj_id.o \ test_pkt_md_access.o test_xdp_redirect.o test_xdp_meta.o sockmap_parse_prog.o \ sockmap_verdict_prog.o dev_cgroup.o sample_ret0.o test_tracepoint.o \ - test_l4lb_noinline.o test_xdp_noinline.o test_stacktrace_map.o + test_l4lb_noinline.o test_xdp_noinline.o test_stacktrace_map.o \ + sample_map_ret0.o TEST_PROGS := test_kmod.sh test_xdp_redirect.sh test_xdp_meta.sh \ test_offload.py diff --git a/tools/testing/selftests/bpf/sample_map_ret0.c b/tools/testing/selftests/bpf/sample_map_ret0.c new file mode 100644 index 000000000000..0756303676ac --- /dev/null +++ b/tools/testing/selftests/bpf/sample_map_ret0.c @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) */ +#include <linux/bpf.h> +#include "bpf_helpers.h" + +struct bpf_map_def SEC("maps") htab = { + .type = BPF_MAP_TYPE_HASH, + .key_size = sizeof(__u32), + .value_size = sizeof(long), + .max_entries = 2, +}; + +struct bpf_map_def SEC("maps") array = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(__u32), + .value_size = sizeof(long), + .max_entries = 2, +}; + +/* Sample program which should always load for testing control paths. */ +SEC(".text") int func() +{ + __u64 key64 = 0; + __u32 key = 0; + long *value; + + value = bpf_map_lookup_elem(&htab, &key); + if (!value) + return 1; + value = bpf_map_lookup_elem(&array, &key64); + if (!value) + return 1; + + return 0; +} diff --git a/tools/testing/selftests/bpf/test_lpm_map.c b/tools/testing/selftests/bpf/test_lpm_map.c index f61480641b6e..081510853c6d 100644 --- a/tools/testing/selftests/bpf/test_lpm_map.c +++ b/tools/testing/selftests/bpf/test_lpm_map.c @@ -521,6 +521,126 @@ static void test_lpm_delete(void) close(map_fd); } +static void test_lpm_get_next_key(void) +{ + struct bpf_lpm_trie_key *key_p, *next_key_p; + size_t key_size; + __u32 value = 0; + int map_fd; + + key_size = sizeof(*key_p) + sizeof(__u32); + key_p = alloca(key_size); + next_key_p = alloca(key_size); + + map_fd = bpf_create_map(BPF_MAP_TYPE_LPM_TRIE, key_size, sizeof(value), + 100, BPF_F_NO_PREALLOC); + assert(map_fd >= 0); + + /* empty tree. get_next_key should return ENOENT */ + assert(bpf_map_get_next_key(map_fd, NULL, key_p) == -1 && + errno == ENOENT); + + /* get and verify the first key, get the second one should fail. */ + key_p->prefixlen = 16; + inet_pton(AF_INET, "192.168.0.0", key_p->data); + assert(bpf_map_update_elem(map_fd, key_p, &value, 0) == 0); + + memset(key_p, 0, key_size); + assert(bpf_map_get_next_key(map_fd, NULL, key_p) == 0); + assert(key_p->prefixlen == 16 && key_p->data[0] == 192 && + key_p->data[1] == 168); + + assert(bpf_map_get_next_key(map_fd, key_p, next_key_p) == -1 && + errno == ENOENT); + + /* no exact matching key should get the first one in post order. */ + key_p->prefixlen = 8; + assert(bpf_map_get_next_key(map_fd, NULL, key_p) == 0); + assert(key_p->prefixlen == 16 && key_p->data[0] == 192 && + key_p->data[1] == 168); + + /* add one more element (total two) */ + key_p->prefixlen = 24; + inet_pton(AF_INET, "192.168.0.0", key_p->data); + assert(bpf_map_update_elem(map_fd, key_p, &value, 0) == 0); + + memset(key_p, 0, key_size); + assert(bpf_map_get_next_key(map_fd, NULL, key_p) == 0); + assert(key_p->prefixlen == 24 && key_p->data[0] == 192 && + key_p->data[1] == 168 && key_p->data[2] == 0); + + memset(next_key_p, 0, key_size); + assert(bpf_map_get_next_key(map_fd, key_p, next_key_p) == 0); + assert(next_key_p->prefixlen == 16 && next_key_p->data[0] == 192 && + next_key_p->data[1] == 168); + + memcpy(key_p, next_key_p, key_size); + assert(bpf_map_get_next_key(map_fd, key_p, next_key_p) == -1 && + errno == ENOENT); + + /* Add one more element (total three) */ + key_p->prefixlen = 24; + inet_pton(AF_INET, "192.168.128.0", key_p->data); + assert(bpf_map_update_elem(map_fd, key_p, &value, 0) == 0); + + memset(key_p, 0, key_size); + assert(bpf_map_get_next_key(map_fd, NULL, key_p) == 0); + assert(key_p->prefixlen == 24 && key_p->data[0] == 192 && + key_p->data[1] == 168 && key_p->data[2] == 0); + + memset(next_key_p, 0, key_size); + assert(bpf_map_get_next_key(map_fd, key_p, next_key_p) == 0); + assert(next_key_p->prefixlen == 24 && next_key_p->data[0] == 192 && + next_key_p->data[1] == 168 && next_key_p->data[2] == 128); + + memcpy(key_p, next_key_p, key_size); + assert(bpf_map_get_next_key(map_fd, key_p, next_key_p) == 0); + assert(next_key_p->prefixlen == 16 && next_key_p->data[0] == 192 && + next_key_p->data[1] == 168); + + memcpy(key_p, next_key_p, key_size); + assert(bpf_map_get_next_key(map_fd, key_p, next_key_p) == -1 && + errno == ENOENT); + + /* Add one more element (total four) */ + key_p->prefixlen = 24; + inet_pton(AF_INET, "192.168.1.0", key_p->data); + assert(bpf_map_update_elem(map_fd, key_p, &value, 0) == 0); + + memset(key_p, 0, key_size); + assert(bpf_map_get_next_key(map_fd, NULL, key_p) == 0); + assert(key_p->prefixlen == 24 && key_p->data[0] == 192 && + key_p->data[1] == 168 && key_p->data[2] == 0); + + memset(next_key_p, 0, key_size); + assert(bpf_map_get_next_key(map_fd, key_p, next_key_p) == 0); + assert(next_key_p->prefixlen == 24 && next_key_p->data[0] == 192 && + next_key_p->data[1] == 168 && next_key_p->data[2] == 1); + + memcpy(key_p, next_key_p, key_size); + assert(bpf_map_get_next_key(map_fd, key_p, next_key_p) == 0); + assert(next_key_p->prefixlen == 24 && next_key_p->data[0] == 192 && + next_key_p->data[1] == 168 && next_key_p->data[2] == 128); + + memcpy(key_p, next_key_p, key_size); + assert(bpf_map_get_next_key(map_fd, key_p, next_key_p) == 0); + assert(next_key_p->prefixlen == 16 && next_key_p->data[0] == 192 && + next_key_p->data[1] == 168); + + memcpy(key_p, next_key_p, key_size); + assert(bpf_map_get_next_key(map_fd, key_p, next_key_p) == -1 && + errno == ENOENT); + + /* no exact matching key should return the first one in post order */ + key_p->prefixlen = 22; + inet_pton(AF_INET, "192.168.1.0", key_p->data); + assert(bpf_map_get_next_key(map_fd, key_p, next_key_p) == 0); + assert(next_key_p->prefixlen == 24 && next_key_p->data[0] == 192 && + next_key_p->data[1] == 168 && next_key_p->data[2] == 0); + + close(map_fd); +} + int main(void) { struct rlimit limit = { RLIM_INFINITY, RLIM_INFINITY }; @@ -545,6 +665,8 @@ int main(void) test_lpm_delete(); + test_lpm_get_next_key(); + printf("test_lpm: OK\n"); return 0; } diff --git a/tools/testing/selftests/bpf/test_offload.py b/tools/testing/selftests/bpf/test_offload.py index e3c750f17cb8..833b9c1ec450 100755 --- a/tools/testing/selftests/bpf/test_offload.py +++ b/tools/testing/selftests/bpf/test_offload.py @@ -20,6 +20,7 @@ import os import pprint import random import string +import struct import subprocess import time @@ -156,6 +157,14 @@ def bpftool_prog_list(expected=None, ns=""): (len(progs), expected)) return progs +def bpftool_map_list(expected=None, ns=""): + _, maps = bpftool("map show", JSON=True, ns=ns, fail=True) + if expected is not None: + if len(maps) != expected: + fail(True, "%d BPF maps loaded, expected %d" % + (len(maps), expected)) + return maps + def bpftool_prog_list_wait(expected=0, n_retry=20): for i in range(n_retry): nprogs = len(bpftool_prog_list()) @@ -164,6 +173,14 @@ def bpftool_prog_list_wait(expected=0, n_retry=20): time.sleep(0.05) raise Exception("Time out waiting for program counts to stabilize want %d, have %d" % (expected, nprogs)) +def bpftool_map_list_wait(expected=0, n_retry=20): + for i in range(n_retry): + nmaps = len(bpftool_map_list()) + if nmaps == expected: + return + time.sleep(0.05) + raise Exception("Time out waiting for map counts to stabilize want %d, have %d" % (expected, nmaps)) + def ip(args, force=False, JSON=True, ns="", fail=True): if force: args = "-force " + args @@ -193,6 +210,26 @@ def mknetns(n_retry=10): return name return None +def int2str(fmt, val): + ret = [] + for b in struct.pack(fmt, val): + ret.append(int(b)) + return " ".join(map(lambda x: str(x), ret)) + +def str2int(strtab): + inttab = [] + for i in strtab: + inttab.append(int(i, 16)) + ba = bytearray(inttab) + if len(strtab) == 4: + fmt = "I" + elif len(strtab) == 8: + fmt = "Q" + else: + raise Exception("String array of len %d can't be unpacked to an int" % + (len(strtab))) + return struct.unpack(fmt, ba)[0] + class DebugfsDir: """ Class for accessing DebugFS directories as a dictionary. @@ -311,13 +348,13 @@ class NetdevSim: return ip("link set dev %s mtu %d" % (self.dev["ifname"], mtu), fail=fail) - def set_xdp(self, bpf, mode, force=False, fail=True): + def set_xdp(self, bpf, mode, force=False, JSON=True, fail=True): return ip("link set dev %s xdp%s %s" % (self.dev["ifname"], mode, bpf), - force=force, fail=fail) + force=force, JSON=JSON, fail=fail) - def unset_xdp(self, mode, force=False, fail=True): + def unset_xdp(self, mode, force=False, JSON=True, fail=True): return ip("link set dev %s xdp%s off" % (self.dev["ifname"], mode), - force=force, fail=fail) + force=force, JSON=JSON, fail=fail) def ip_link_show(self, xdp): _, link = ip("link show dev %s" % (self['ifname'])) @@ -390,12 +427,16 @@ class NetdevSim: ################################################################################ def clean_up(): + global files, netns, devs + for dev in devs: dev.remove() for f in files: cmd("rm -f %s" % (f)) for ns in netns: cmd("ip netns delete %s" % (ns)) + files = [] + netns = [] def pin_prog(file_name, idx=0): progs = bpftool_prog_list(expected=(idx + 1)) @@ -405,16 +446,31 @@ def pin_prog(file_name, idx=0): return file_name, bpf_pinned(file_name) -def check_dev_info(other_ns, ns, pin_file=None, removed=False): - if removed: - bpftool_prog_list(expected=0) - ret, err = bpftool("prog show pin %s" % (pin_file), fail=False) - fail(ret == 0, "Showing prog with removed device did not fail") - fail(err["error"].find("No such device") == -1, - "Showing prog with removed device expected ENODEV, error is %s" % - (err["error"])) - return - progs = bpftool_prog_list(expected=int(not removed), ns=ns) +def pin_map(file_name, idx=0, expected=1): + maps = bpftool_map_list(expected=expected) + m = maps[idx] + bpftool("map pin id %d %s" % (m["id"], file_name)) + files.append(file_name) + + return file_name, bpf_pinned(file_name) + +def check_dev_info_removed(prog_file=None, map_file=None): + bpftool_prog_list(expected=0) + ret, err = bpftool("prog show pin %s" % (prog_file), fail=False) + fail(ret == 0, "Showing prog with removed device did not fail") + fail(err["error"].find("No such device") == -1, + "Showing prog with removed device expected ENODEV, error is %s" % + (err["error"])) + + bpftool_map_list(expected=0) + ret, err = bpftool("map show pin %s" % (map_file), fail=False) + fail(ret == 0, "Showing map with removed device did not fail") + fail(err["error"].find("No such device") == -1, + "Showing map with removed device expected ENODEV, error is %s" % + (err["error"])) + +def check_dev_info(other_ns, ns, prog_file=None, map_file=None, removed=False): + progs = bpftool_prog_list(expected=1, ns=ns) prog = progs[0] fail("dev" not in prog.keys(), "Device parameters not reported") @@ -423,16 +479,17 @@ def check_dev_info(other_ns, ns, pin_file=None, removed=False): fail("ns_dev" not in dev.keys(), "Device parameters not reported") fail("ns_inode" not in dev.keys(), "Device parameters not reported") - if not removed and not other_ns: + if not other_ns: fail("ifname" not in dev.keys(), "Ifname not reported") fail(dev["ifname"] != sim["ifname"], "Ifname incorrect %s vs %s" % (dev["ifname"], sim["ifname"])) else: fail("ifname" in dev.keys(), "Ifname is reported for other ns") - if removed: - fail(dev["ifindex"] != 0, "Device perameters not zero on removed") - fail(dev["ns_dev"] != 0, "Device perameters not zero on removed") - fail(dev["ns_inode"] != 0, "Device perameters not zero on removed") + + maps = bpftool_map_list(expected=2, ns=ns) + for m in maps: + fail("dev" not in m.keys(), "Device parameters not reported") + fail(dev != m["dev"], "Map's device different than program's") # Parse command line parser = argparse.ArgumentParser() @@ -464,7 +521,7 @@ if out.find("/sys/kernel/debug type debugfs") == -1: cmd("mount -t debugfs none /sys/kernel/debug") # Check samples are compiled -samples = ["sample_ret0.o"] +samples = ["sample_ret0.o", "sample_map_ret0.o"] for s in samples: ret, out = cmd("ls %s/%s" % (bpf_test_dir, s), fail=False) skip(ret != 0, "sample %s/%s not found, please compile it" % @@ -739,8 +796,9 @@ try: bpftool_prog_list_wait(expected=0) sim = NetdevSim() - sim.set_ethtool_tc_offloads(True) - sim.set_xdp(obj, "offload") + map_obj = bpf_obj("sample_map_ret0.o") + start_test("Test loading program with maps...") + sim.set_xdp(map_obj, "offload", JSON=False) # map fixup msg breaks JSON start_test("Test bpftool bound info reporting (own ns)...") check_dev_info(False, "") @@ -757,11 +815,111 @@ try: sim.set_ns("") check_dev_info(False, "") - pin_file, _ = pin_prog("/sys/fs/bpf/tmp") + prog_file, _ = pin_prog("/sys/fs/bpf/tmp_prog") + map_file, _ = pin_map("/sys/fs/bpf/tmp_map", idx=1, expected=2) sim.remove() start_test("Test bpftool bound info reporting (removed dev)...") - check_dev_info(True, "", pin_file=pin_file, removed=True) + check_dev_info_removed(prog_file=prog_file, map_file=map_file) + + # Remove all pinned files and reinstantiate the netdev + clean_up() + bpftool_prog_list_wait(expected=0) + + sim = NetdevSim() + + start_test("Test map update (no flags)...") + sim.set_xdp(map_obj, "offload", JSON=False) # map fixup msg breaks JSON + maps = bpftool_map_list(expected=2) + array = maps[0] if maps[0]["type"] == "array" else maps[1] + htab = maps[0] if maps[0]["type"] == "hash" else maps[1] + for m in maps: + for i in range(2): + bpftool("map update id %d key %s value %s" % + (m["id"], int2str("I", i), int2str("Q", i * 3))) + + for m in maps: + ret, _ = bpftool("map update id %d key %s value %s" % + (m["id"], int2str("I", 3), int2str("Q", 3 * 3)), + fail=False) + fail(ret == 0, "added too many entries") + + start_test("Test map update (exists)...") + for m in maps: + for i in range(2): + bpftool("map update id %d key %s value %s exist" % + (m["id"], int2str("I", i), int2str("Q", i * 3))) + + for m in maps: + ret, err = bpftool("map update id %d key %s value %s exist" % + (m["id"], int2str("I", 3), int2str("Q", 3 * 3)), + fail=False) + fail(ret == 0, "updated non-existing key") + fail(err["error"].find("No such file or directory") == -1, + "expected ENOENT, error is '%s'" % (err["error"])) + + start_test("Test map update (noexist)...") + for m in maps: + for i in range(2): + ret, err = bpftool("map update id %d key %s value %s noexist" % + (m["id"], int2str("I", i), int2str("Q", i * 3)), + fail=False) + fail(ret == 0, "updated existing key") + fail(err["error"].find("File exists") == -1, + "expected EEXIST, error is '%s'" % (err["error"])) + + start_test("Test map dump...") + for m in maps: + _, entries = bpftool("map dump id %d" % (m["id"])) + for i in range(2): + key = str2int(entries[i]["key"]) + fail(key != i, "expected key %d, got %d" % (key, i)) + val = str2int(entries[i]["value"]) + fail(val != i * 3, "expected value %d, got %d" % (val, i * 3)) + + start_test("Test map getnext...") + for m in maps: + _, entry = bpftool("map getnext id %d" % (m["id"])) + key = str2int(entry["next_key"]) + fail(key != 0, "next key %d, expected %d" % (key, 0)) + _, entry = bpftool("map getnext id %d key %s" % + (m["id"], int2str("I", 0))) + key = str2int(entry["next_key"]) + fail(key != 1, "next key %d, expected %d" % (key, 1)) + ret, err = bpftool("map getnext id %d key %s" % + (m["id"], int2str("I", 1)), fail=False) + fail(ret == 0, "got next key past the end of map") + fail(err["error"].find("No such file or directory") == -1, + "expected ENOENT, error is '%s'" % (err["error"])) + + start_test("Test map delete (htab)...") + for i in range(2): + bpftool("map delete id %d key %s" % (htab["id"], int2str("I", i))) + + start_test("Test map delete (array)...") + for i in range(2): + ret, err = bpftool("map delete id %d key %s" % + (htab["id"], int2str("I", i)), fail=False) + fail(ret == 0, "removed entry from an array") + fail(err["error"].find("No such file or directory") == -1, + "expected ENOENT, error is '%s'" % (err["error"])) + + start_test("Test map remove...") + sim.unset_xdp("offload") + bpftool_map_list_wait(expected=0) + sim.remove() + + sim = NetdevSim() + sim.set_xdp(map_obj, "offload", JSON=False) # map fixup msg breaks JSON + sim.remove() + bpftool_map_list_wait(expected=0) + + start_test("Test map creation fail path...") + sim = NetdevSim() + sim.dfs["bpf_map_accept"] = "N" + ret, _ = sim.set_xdp(map_obj, "offload", JSON=False, fail=False) + fail(ret == 0, + "netdevsim didn't refuse to create a map with offload disabled") print("%s: OK" % (os.path.basename(__file__))) diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 38c0f850f315..fb82d29ee863 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -29,6 +29,7 @@ #include <linux/filter.h> #include <linux/bpf_perf_event.h> #include <linux/bpf.h> +#include <linux/if_ether.h> #include <bpf/bpf.h> @@ -49,6 +50,8 @@ #define MAX_INSNS 512 #define MAX_FIXUPS 8 #define MAX_NR_MAPS 4 +#define POINTER_VALUE 0xcafe4all +#define TEST_DATA_LEN 64 #define F_NEEDS_EFFICIENT_UNALIGNED_ACCESS (1 << 0) #define F_LOAD_WITH_STRICT_ALIGNMENT (1 << 1) @@ -62,6 +65,7 @@ struct bpf_test { int fixup_map_in_map[MAX_FIXUPS]; const char *errstr; const char *errstr_unpriv; + uint32_t retval; enum { UNDEF, ACCEPT, @@ -95,6 +99,94 @@ static struct bpf_test tests[] = { BPF_EXIT_INSN(), }, .result = ACCEPT, + .retval = -3, + }, + { + "DIV32 by 0, zero check 1", + .insns = { + BPF_MOV32_IMM(BPF_REG_0, 42), + BPF_MOV32_IMM(BPF_REG_1, 0), + BPF_MOV32_IMM(BPF_REG_2, 1), + BPF_ALU32_REG(BPF_DIV, BPF_REG_2, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 0, + }, + { + "DIV32 by 0, zero check 2", + .insns = { + BPF_MOV32_IMM(BPF_REG_0, 42), + BPF_LD_IMM64(BPF_REG_1, 0xffffffff00000000LL), + BPF_MOV32_IMM(BPF_REG_2, 1), + BPF_ALU32_REG(BPF_DIV, BPF_REG_2, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 0, + }, + { + "DIV64 by 0, zero check", + .insns = { + BPF_MOV32_IMM(BPF_REG_0, 42), + BPF_MOV32_IMM(BPF_REG_1, 0), + BPF_MOV32_IMM(BPF_REG_2, 1), + BPF_ALU64_REG(BPF_DIV, BPF_REG_2, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 0, + }, + { + "MOD32 by 0, zero check 1", + .insns = { + BPF_MOV32_IMM(BPF_REG_0, 42), + BPF_MOV32_IMM(BPF_REG_1, 0), + BPF_MOV32_IMM(BPF_REG_2, 1), + BPF_ALU32_REG(BPF_MOD, BPF_REG_2, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 0, + }, + { + "MOD32 by 0, zero check 2", + .insns = { + BPF_MOV32_IMM(BPF_REG_0, 42), + BPF_LD_IMM64(BPF_REG_1, 0xffffffff00000000LL), + BPF_MOV32_IMM(BPF_REG_2, 1), + BPF_ALU32_REG(BPF_MOD, BPF_REG_2, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 0, + }, + { + "MOD64 by 0, zero check", + .insns = { + BPF_MOV32_IMM(BPF_REG_0, 42), + BPF_MOV32_IMM(BPF_REG_1, 0), + BPF_MOV32_IMM(BPF_REG_2, 1), + BPF_ALU64_REG(BPF_MOD, BPF_REG_2, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 0, + }, + { + "empty prog", + .insns = { + }, + .errstr = "last insn is not an exit or jmp", + .result = REJECT, + }, + { + "only exit insn", + .insns = { + BPF_EXIT_INSN(), + }, + .errstr = "R0 !read_ok", + .result = REJECT, }, { "unreachable", @@ -210,6 +302,7 @@ static struct bpf_test tests[] = { BPF_EXIT_INSN(), }, .result = ACCEPT, + .retval = 1, }, { "test8 ld_imm64", @@ -517,6 +610,7 @@ static struct bpf_test tests[] = { .errstr_unpriv = "R0 leaks addr", .result = ACCEPT, .result_unpriv = REJECT, + .retval = POINTER_VALUE, }, { "check valid spill/fill, skb mark", @@ -803,6 +897,7 @@ static struct bpf_test tests[] = { .errstr_unpriv = "R1 pointer comparison", .result_unpriv = REJECT, .result = ACCEPT, + .retval = -ENOENT, }, { "jump test 4", @@ -1823,6 +1918,7 @@ static struct bpf_test tests[] = { BPF_EXIT_INSN(), }, .result = ACCEPT, + .retval = 0xfaceb00c, }, { "PTR_TO_STACK store/load - bad alignment on off", @@ -1881,6 +1977,7 @@ static struct bpf_test tests[] = { .result = ACCEPT, .result_unpriv = REJECT, .errstr_unpriv = "R0 leaks addr", + .retval = POINTER_VALUE, }, { "unpriv: add const to pointer", @@ -2054,6 +2151,7 @@ static struct bpf_test tests[] = { BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_6, 0), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_hash_recalc), + BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, .result = ACCEPT, @@ -2841,6 +2939,7 @@ static struct bpf_test tests[] = { }, .result = ACCEPT, .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .retval = 1, }, { "direct packet access: test12 (and, good access)", @@ -2865,6 +2964,7 @@ static struct bpf_test tests[] = { }, .result = ACCEPT, .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .retval = 1, }, { "direct packet access: test13 (branches, good access)", @@ -2895,6 +2995,7 @@ static struct bpf_test tests[] = { }, .result = ACCEPT, .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .retval = 1, }, { "direct packet access: test14 (pkt_ptr += 0, CONST_IMM, good access)", @@ -2918,6 +3019,7 @@ static struct bpf_test tests[] = { }, .result = ACCEPT, .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .retval = 1, }, { "direct packet access: test15 (spill with xadd)", @@ -3204,6 +3306,7 @@ static struct bpf_test tests[] = { }, .result = ACCEPT, .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .retval = 1, }, { "direct packet access: test28 (marking on <=, bad access)", @@ -5823,6 +5926,7 @@ static struct bpf_test tests[] = { }, .result = ACCEPT, .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .retval = 0 /* csum_diff of 64-byte packet */, }, { "helper access to variable memory: size = 0 not allowed on NULL (!ARG_PTR_TO_MEM_OR_NULL)", @@ -6191,6 +6295,7 @@ static struct bpf_test tests[] = { }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = ACCEPT, + .retval = 42 /* ultimate return value */, }, { "ld_ind: check calling conv, r1", @@ -6262,6 +6367,7 @@ static struct bpf_test tests[] = { BPF_EXIT_INSN(), }, .result = ACCEPT, + .retval = 1, }, { "check bpf_perf_event_data->sample_period byte load permitted", @@ -7249,6 +7355,7 @@ static struct bpf_test tests[] = { }, .fixup_map1 = { 3 }, .result = ACCEPT, + .retval = POINTER_VALUE, .result_unpriv = REJECT, .errstr_unpriv = "R0 leaks addr as return value" }, @@ -7269,6 +7376,7 @@ static struct bpf_test tests[] = { }, .fixup_map1 = { 3 }, .result = ACCEPT, + .retval = POINTER_VALUE, .result_unpriv = REJECT, .errstr_unpriv = "R0 leaks addr as return value" }, @@ -7710,6 +7818,7 @@ static struct bpf_test tests[] = { BPF_EXIT_INSN(), }, .result = ACCEPT, + .retval = TEST_DATA_LEN, .prog_type = BPF_PROG_TYPE_SCHED_CLS, }, { @@ -8851,6 +8960,7 @@ static struct bpf_test tests[] = { .errstr_unpriv = "function calls to other bpf functions are allowed for root only", .result_unpriv = REJECT, .result = ACCEPT, + .retval = 1, }, { "calls: overlapping caller/callee", @@ -9046,6 +9156,7 @@ static struct bpf_test tests[] = { }, .prog_type = BPF_PROG_TYPE_SCHED_ACT, .result = ACCEPT, + .retval = TEST_DATA_LEN, }, { "calls: callee using args1", @@ -9058,6 +9169,7 @@ static struct bpf_test tests[] = { .errstr_unpriv = "allowed for root only", .result_unpriv = REJECT, .result = ACCEPT, + .retval = POINTER_VALUE, }, { "calls: callee using wrong args2", @@ -9088,6 +9200,7 @@ static struct bpf_test tests[] = { .errstr_unpriv = "allowed for root only", .result_unpriv = REJECT, .result = ACCEPT, + .retval = TEST_DATA_LEN + TEST_DATA_LEN - ETH_HLEN - ETH_HLEN, }, { "calls: callee changing pkt pointers", @@ -9136,6 +9249,7 @@ static struct bpf_test tests[] = { }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = ACCEPT, + .retval = TEST_DATA_LEN + TEST_DATA_LEN, }, { "calls: calls with stack arith", @@ -9154,6 +9268,7 @@ static struct bpf_test tests[] = { }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = ACCEPT, + .retval = 42, }, { "calls: calls with misaligned stack access", @@ -9187,6 +9302,7 @@ static struct bpf_test tests[] = { }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = ACCEPT, + .retval = 43, }, { "calls: calls control flow, jump test 2", @@ -9679,6 +9795,7 @@ static struct bpf_test tests[] = { }, .prog_type = BPF_PROG_TYPE_XDP, .result = ACCEPT, + .retval = 42, }, { "calls: write into callee stack frame", @@ -10290,6 +10407,7 @@ static struct bpf_test tests[] = { }, .result = ACCEPT, .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .retval = POINTER_VALUE, }, { "calls: pkt_ptr spill into caller stack 2", @@ -10355,6 +10473,7 @@ static struct bpf_test tests[] = { }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = ACCEPT, + .retval = 1, }, { "calls: pkt_ptr spill into caller stack 4", @@ -10388,6 +10507,7 @@ static struct bpf_test tests[] = { }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = ACCEPT, + .retval = 1, }, { "calls: pkt_ptr spill into caller stack 5", @@ -10796,10 +10916,12 @@ static void do_test_single(struct bpf_test *test, bool unpriv, int fd_prog, expected_ret, reject_from_alignment; struct bpf_insn *prog = test->insns; int prog_len = probe_filter_length(prog); + char data_in[TEST_DATA_LEN] = {}; int prog_type = test->prog_type; int map_fds[MAX_NR_MAPS]; const char *expected_err; - int i; + uint32_t retval; + int i, err; for (i = 0; i < MAX_NR_MAPS; i++) map_fds[i] = -1; @@ -10842,6 +10964,19 @@ static void do_test_single(struct bpf_test *test, bool unpriv, } } + if (fd_prog >= 0) { + err = bpf_prog_test_run(fd_prog, 1, data_in, sizeof(data_in), + NULL, NULL, &retval, NULL); + if (err && errno != 524/*ENOTSUPP*/ && errno != EPERM) { + printf("Unexpected bpf_prog_test_run error\n"); + goto fail_log; + } + if (!err && retval != test->retval && + test->retval != POINTER_VALUE) { + printf("FAIL retval %d != %d\n", retval, test->retval); + goto fail_log; + } + } (*passes)++; printf("OK%s\n", reject_from_alignment ? " (NOTE: reject due to unknown alignment)" : ""); |