Diffstat (limited to 'kernel/bpf')

 -rw-r--r--  kernel/bpf/btf.c      |  46
 -rw-r--r--  kernel/bpf/cgroup.c   |  61
 -rw-r--r--  kernel/bpf/core.c     |  32
 -rw-r--r--  kernel/bpf/devmap.c   |   7
 -rw-r--r--  kernel/bpf/hashtab.c  |  16
 -rw-r--r--  kernel/bpf/offload.c  | 223
 -rw-r--r--  kernel/bpf/sockmap.c  | 300
 -rw-r--r--  kernel/bpf/syscall.c  | 103
 -rw-r--r--  kernel/bpf/verifier.c |  13

9 files changed, 518 insertions(+), 283 deletions(-)
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 2d49d18b793a..9704934252b3 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -450,7 +450,7 @@ static const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id) */ static bool btf_type_int_is_regular(const struct btf_type *t) { - u16 nr_bits, nr_bytes; + u8 nr_bits, nr_bytes; u32 int_data; int_data = btf_type_int(t); @@ -991,38 +991,38 @@ static void btf_int_bits_seq_show(const struct btf *btf, void *data, u8 bits_offset, struct seq_file *m) { + u16 left_shift_bits, right_shift_bits; u32 int_data = btf_type_int(t); - u16 nr_bits = BTF_INT_BITS(int_data); - u16 total_bits_offset; - u16 nr_copy_bytes; - u16 nr_copy_bits; - u8 nr_upper_bits; - union { - u64 u64_num; - u8 u8_nums[8]; - } print_num; + u8 nr_bits = BTF_INT_BITS(int_data); + u8 total_bits_offset; + u8 nr_copy_bytes; + u8 nr_copy_bits; + u64 print_num; + /* + * bits_offset is at most 7. + * BTF_INT_OFFSET() cannot exceed 64 bits. + */ total_bits_offset = bits_offset + BTF_INT_OFFSET(int_data); data += BITS_ROUNDDOWN_BYTES(total_bits_offset); bits_offset = BITS_PER_BYTE_MASKED(total_bits_offset); nr_copy_bits = nr_bits + bits_offset; nr_copy_bytes = BITS_ROUNDUP_BYTES(nr_copy_bits); - print_num.u64_num = 0; - memcpy(&print_num.u64_num, data, nr_copy_bytes); + print_num = 0; + memcpy(&print_num, data, nr_copy_bytes); - /* Ditch the higher order bits */ - nr_upper_bits = BITS_PER_BYTE_MASKED(nr_copy_bits); - if (nr_upper_bits) { - /* We need to mask out some bits of the upper byte. */ - u8 mask = (1 << nr_upper_bits) - 1; - - print_num.u8_nums[nr_copy_bytes - 1] &= mask; - } +#ifdef __BIG_ENDIAN_BITFIELD + left_shift_bits = bits_offset; +#else + left_shift_bits = BITS_PER_U64 - nr_copy_bits; +#endif + right_shift_bits = BITS_PER_U64 - nr_bits; - print_num.u64_num >>= bits_offset; + print_num <<= left_shift_bits; + print_num >>= right_shift_bits; - seq_printf(m, "0x%llx", print_num.u64_num); + seq_printf(m, "0x%llx", print_num); } static void btf_int_seq_show(const struct btf *btf, const struct btf_type *t, @@ -1032,7 +1032,7 @@ static void btf_int_seq_show(const struct btf *btf, const struct btf_type *t, u32 int_data = btf_type_int(t); u8 encoding = BTF_INT_ENCODING(int_data); bool sign = encoding & BTF_INT_SIGNED; - u32 nr_bits = BTF_INT_BITS(int_data); + u8 nr_bits = BTF_INT_BITS(int_data); if (bits_offset || BTF_INT_OFFSET(int_data) || BITS_PER_BYTE_MASKED(nr_bits)) { diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index f7c00bd6f8e4..badabb0b435c 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -95,7 +95,7 @@ static int compute_effective_progs(struct cgroup *cgrp, enum bpf_attach_type type, struct bpf_prog_array __rcu **array) { - struct bpf_prog_array __rcu *progs; + struct bpf_prog_array *progs; struct bpf_prog_list *pl; struct cgroup *p = cgrp; int cnt = 0; @@ -120,13 +120,12 @@ static int compute_effective_progs(struct cgroup *cgrp, &p->bpf.progs[type], node) { if (!pl->prog) continue; - rcu_dereference_protected(progs, 1)-> - progs[cnt++] = pl->prog; + progs->progs[cnt++] = pl->prog; } p = cgroup_parent(p); } while (p); - *array = progs; + rcu_assign_pointer(*array, progs); return 0; } @@ -428,6 +427,60 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, return ret; } +int cgroup_bpf_prog_attach(const union bpf_attr *attr, + enum bpf_prog_type ptype, struct bpf_prog *prog) +{ + struct cgroup *cgrp; + int ret; + + cgrp = cgroup_get_from_fd(attr->target_fd); + if (IS_ERR(cgrp)) + return PTR_ERR(cgrp); + + ret = 
cgroup_bpf_attach(cgrp, prog, attr->attach_type, + attr->attach_flags); + cgroup_put(cgrp); + return ret; +} + +int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) +{ + struct bpf_prog *prog; + struct cgroup *cgrp; + int ret; + + cgrp = cgroup_get_from_fd(attr->target_fd); + if (IS_ERR(cgrp)) + return PTR_ERR(cgrp); + + prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); + if (IS_ERR(prog)) + prog = NULL; + + ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, 0); + if (prog) + bpf_prog_put(prog); + + cgroup_put(cgrp); + return ret; +} + +int cgroup_bpf_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + struct cgroup *cgrp; + int ret; + + cgrp = cgroup_get_from_fd(attr->query.target_fd); + if (IS_ERR(cgrp)) + return PTR_ERR(cgrp); + + ret = cgroup_bpf_query(cgrp, attr, uattr); + + cgroup_put(cgrp); + return ret; +} + /** * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering * @sk: The socket sending or receiving traffic diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index a9e6c04d0f4a..253aa8e79c7b 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -598,8 +598,6 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, bpf_fill_ill_insns(hdr, size); hdr->pages = size / PAGE_SIZE; - hdr->locked = 0; - hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)), PAGE_SIZE - sizeof(*hdr)); start = (get_random_int() % hole) & ~(alignment - 1); @@ -1450,22 +1448,6 @@ static int bpf_check_tail_call(const struct bpf_prog *fp) return 0; } -static int bpf_prog_check_pages_ro_locked(const struct bpf_prog *fp) -{ -#ifdef CONFIG_ARCH_HAS_SET_MEMORY - int i, err; - - for (i = 0; i < fp->aux->func_cnt; i++) { - err = bpf_prog_check_pages_ro_single(fp->aux->func[i]); - if (err) - return err; - } - - return bpf_prog_check_pages_ro_single(fp); -#endif - return 0; -} - static void bpf_prog_select_func(struct bpf_prog *fp) { #ifndef CONFIG_BPF_JIT_ALWAYS_ON @@ -1524,17 +1506,7 @@ finalize: * all eBPF JITs might immediately support all features. */ *err = bpf_check_tail_call(fp); - if (*err) - return fp; - - /* Checkpoint: at this point onwards any cBPF -> eBPF or - * native eBPF program is read-only. If we failed to change - * the page attributes (e.g. allocation failure from - * splitting large pages), then reject the whole program - * in order to guarantee not ending up with any W+X pages - * from BPF side in kernel. 
- */ - *err = bpf_prog_check_pages_ro_locked(fp); + return fp; } EXPORT_SYMBOL_GPL(bpf_prog_select_runtime); @@ -1566,7 +1538,7 @@ static struct { .null_prog = NULL, }; -struct bpf_prog_array __rcu *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags) +struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags) { if (prog_cnt) return kzalloc(sizeof(struct bpf_prog_array) + diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 642c97f6d1b8..d361fc1e3bf3 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -334,10 +334,15 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, { struct net_device *dev = dst->dev; struct xdp_frame *xdpf; + int err; if (!dev->netdev_ops->ndo_xdp_xmit) return -EOPNOTSUPP; + err = xdp_ok_fwd_dev(dev, xdp->data_end - xdp->data); + if (unlikely(err)) + return err; + xdpf = convert_to_xdp_frame(xdp); if (unlikely(!xdpf)) return -EOVERFLOW; @@ -350,7 +355,7 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, { int err; - err = __xdp_generic_ok_fwd_dev(skb, dst->dev); + err = xdp_ok_fwd_dev(dst->dev, skb->len); if (unlikely(err)) return err; skb->dev = dst->dev; diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 3ca2198a6d22..513d9dfcf4ee 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -747,13 +747,15 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, * old element will be freed immediately. * Otherwise return an error */ - atomic_dec(&htab->count); - return ERR_PTR(-E2BIG); + l_new = ERR_PTR(-E2BIG); + goto dec_count; } l_new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN, htab->map.numa_node); - if (!l_new) - return ERR_PTR(-ENOMEM); + if (!l_new) { + l_new = ERR_PTR(-ENOMEM); + goto dec_count; + } } memcpy(l_new->key, key, key_size); @@ -766,7 +768,8 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, GFP_ATOMIC | __GFP_NOWARN); if (!pptr) { kfree(l_new); - return ERR_PTR(-ENOMEM); + l_new = ERR_PTR(-ENOMEM); + goto dec_count; } } @@ -780,6 +783,9 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, l_new->hash = hash; return l_new; +dec_count: + atomic_dec(&htab->count); + return l_new; } static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old, diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index ac747d5cf7c6..177a52436394 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -18,19 +18,43 @@ #include <linux/bug.h> #include <linux/kdev_t.h> #include <linux/list.h> +#include <linux/lockdep.h> #include <linux/netdevice.h> #include <linux/printk.h> #include <linux/proc_ns.h> +#include <linux/rhashtable.h> #include <linux/rtnetlink.h> #include <linux/rwsem.h> -/* Protects bpf_prog_offload_devs, bpf_map_offload_devs and offload members +/* Protects offdevs, members of bpf_offload_netdev and offload members * of all progs. * RTNL lock cannot be taken when holding this lock. 
*/ static DECLARE_RWSEM(bpf_devs_lock); -static LIST_HEAD(bpf_prog_offload_devs); -static LIST_HEAD(bpf_map_offload_devs); + +struct bpf_offload_dev { + struct list_head netdevs; +}; + +struct bpf_offload_netdev { + struct rhash_head l; + struct net_device *netdev; + struct bpf_offload_dev *offdev; + struct list_head progs; + struct list_head maps; + struct list_head offdev_netdevs; +}; + +static const struct rhashtable_params offdevs_params = { + .nelem_hint = 4, + .key_len = sizeof(struct net_device *), + .key_offset = offsetof(struct bpf_offload_netdev, netdev), + .head_offset = offsetof(struct bpf_offload_netdev, l), + .automatic_shrinking = true, +}; + +static struct rhashtable offdevs; +static bool offdevs_inited; static int bpf_dev_offload_check(struct net_device *netdev) { @@ -41,8 +65,19 @@ static int bpf_dev_offload_check(struct net_device *netdev) return 0; } +static struct bpf_offload_netdev * +bpf_offload_find_netdev(struct net_device *netdev) +{ + lockdep_assert_held(&bpf_devs_lock); + + if (!offdevs_inited) + return NULL; + return rhashtable_lookup_fast(&offdevs, &netdev, offdevs_params); +} + int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) { + struct bpf_offload_netdev *ondev; struct bpf_prog_offload *offload; int err; @@ -66,12 +101,13 @@ int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) goto err_maybe_put; down_write(&bpf_devs_lock); - if (offload->netdev->reg_state != NETREG_REGISTERED) { + ondev = bpf_offload_find_netdev(offload->netdev); + if (!ondev) { err = -EINVAL; goto err_unlock; } prog->aux->offload = offload; - list_add_tail(&offload->offloads, &bpf_prog_offload_devs); + list_add_tail(&offload->offloads, &ondev->progs); dev_put(offload->netdev); up_write(&bpf_devs_lock); @@ -294,6 +330,7 @@ static int bpf_map_offload_ndo(struct bpf_offloaded_map *offmap, struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr) { struct net *net = current->nsproxy->net_ns; + struct bpf_offload_netdev *ondev; struct bpf_offloaded_map *offmap; int err; @@ -316,11 +353,17 @@ struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr) if (err) goto err_unlock; + ondev = bpf_offload_find_netdev(offmap->netdev); + if (!ondev) { + err = -EINVAL; + goto err_unlock; + } + err = bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_ALLOC); if (err) goto err_unlock; - list_add_tail(&offmap->offloads, &bpf_map_offload_devs); + list_add_tail(&offmap->offloads, &ondev->maps); up_write(&bpf_devs_lock); rtnl_unlock(); @@ -468,77 +511,159 @@ int bpf_map_offload_info_fill(struct bpf_map_info *info, struct bpf_map *map) return 0; } -bool bpf_offload_dev_match(struct bpf_prog *prog, struct bpf_map *map) +static bool __bpf_offload_dev_match(struct bpf_prog *prog, + struct net_device *netdev) { - struct bpf_offloaded_map *offmap; + struct bpf_offload_netdev *ondev1, *ondev2; struct bpf_prog_offload *offload; - bool ret; if (!bpf_prog_is_dev_bound(prog->aux)) return false; - if (!bpf_map_is_dev_bound(map)) - return bpf_map_offload_neutral(map); - down_read(&bpf_devs_lock); offload = prog->aux->offload; - offmap = map_to_offmap(map); + if (!offload) + return false; + if (offload->netdev == netdev) + return true; - ret = offload && offload->netdev == offmap->netdev; + ondev1 = bpf_offload_find_netdev(offload->netdev); + ondev2 = bpf_offload_find_netdev(netdev); + + return ondev1 && ondev2 && ondev1->offdev == ondev2->offdev; +} + +bool bpf_offload_dev_match(struct bpf_prog *prog, struct net_device *netdev) +{ + bool ret; + + down_read(&bpf_devs_lock); + ret = 
__bpf_offload_dev_match(prog, netdev); up_read(&bpf_devs_lock); return ret; } +EXPORT_SYMBOL_GPL(bpf_offload_dev_match); -static void bpf_offload_orphan_all_progs(struct net_device *netdev) +bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map) { - struct bpf_prog_offload *offload, *tmp; + struct bpf_offloaded_map *offmap; + bool ret; - list_for_each_entry_safe(offload, tmp, &bpf_prog_offload_devs, offloads) - if (offload->netdev == netdev) - __bpf_prog_offload_destroy(offload->prog); + if (!bpf_map_is_dev_bound(map)) + return bpf_map_offload_neutral(map); + offmap = map_to_offmap(map); + + down_read(&bpf_devs_lock); + ret = __bpf_offload_dev_match(prog, offmap->netdev); + up_read(&bpf_devs_lock); + + return ret; } -static void bpf_offload_orphan_all_maps(struct net_device *netdev) +int bpf_offload_dev_netdev_register(struct bpf_offload_dev *offdev, + struct net_device *netdev) { - struct bpf_offloaded_map *offmap, *tmp; + struct bpf_offload_netdev *ondev; + int err; - list_for_each_entry_safe(offmap, tmp, &bpf_map_offload_devs, offloads) - if (offmap->netdev == netdev) - __bpf_map_offload_destroy(offmap); + ondev = kzalloc(sizeof(*ondev), GFP_KERNEL); + if (!ondev) + return -ENOMEM; + + ondev->netdev = netdev; + ondev->offdev = offdev; + INIT_LIST_HEAD(&ondev->progs); + INIT_LIST_HEAD(&ondev->maps); + + down_write(&bpf_devs_lock); + err = rhashtable_insert_fast(&offdevs, &ondev->l, offdevs_params); + if (err) { + netdev_warn(netdev, "failed to register for BPF offload\n"); + goto err_unlock_free; + } + + list_add(&ondev->offdev_netdevs, &offdev->netdevs); + up_write(&bpf_devs_lock); + return 0; + +err_unlock_free: + up_write(&bpf_devs_lock); + kfree(ondev); + return err; } +EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_register); -static int bpf_offload_notification(struct notifier_block *notifier, - ulong event, void *ptr) +void bpf_offload_dev_netdev_unregister(struct bpf_offload_dev *offdev, + struct net_device *netdev) { - struct net_device *netdev = netdev_notifier_info_to_dev(ptr); + struct bpf_offload_netdev *ondev, *altdev; + struct bpf_offloaded_map *offmap, *mtmp; + struct bpf_prog_offload *offload, *ptmp; ASSERT_RTNL(); - switch (event) { - case NETDEV_UNREGISTER: - /* ignore namespace changes */ - if (netdev->reg_state != NETREG_UNREGISTERING) - break; - - down_write(&bpf_devs_lock); - bpf_offload_orphan_all_progs(netdev); - bpf_offload_orphan_all_maps(netdev); - up_write(&bpf_devs_lock); - break; - default: - break; + down_write(&bpf_devs_lock); + ondev = rhashtable_lookup_fast(&offdevs, &netdev, offdevs_params); + if (WARN_ON(!ondev)) + goto unlock; + + WARN_ON(rhashtable_remove_fast(&offdevs, &ondev->l, offdevs_params)); + list_del(&ondev->offdev_netdevs); + + /* Try to move the objects to another netdev of the device */ + altdev = list_first_entry_or_null(&offdev->netdevs, + struct bpf_offload_netdev, + offdev_netdevs); + if (altdev) { + list_for_each_entry(offload, &ondev->progs, offloads) + offload->netdev = altdev->netdev; + list_splice_init(&ondev->progs, &altdev->progs); + + list_for_each_entry(offmap, &ondev->maps, offloads) + offmap->netdev = altdev->netdev; + list_splice_init(&ondev->maps, &altdev->maps); + } else { + list_for_each_entry_safe(offload, ptmp, &ondev->progs, offloads) + __bpf_prog_offload_destroy(offload->prog); + list_for_each_entry_safe(offmap, mtmp, &ondev->maps, offloads) + __bpf_map_offload_destroy(offmap); } - return NOTIFY_OK; -} -static struct notifier_block bpf_offload_notifier = { - .notifier_call = bpf_offload_notification, 
-}; + WARN_ON(!list_empty(&ondev->progs)); + WARN_ON(!list_empty(&ondev->maps)); + kfree(ondev); +unlock: + up_write(&bpf_devs_lock); +} +EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_unregister); -static int __init bpf_offload_init(void) +struct bpf_offload_dev *bpf_offload_dev_create(void) { - register_netdevice_notifier(&bpf_offload_notifier); - return 0; + struct bpf_offload_dev *offdev; + int err; + + down_write(&bpf_devs_lock); + if (!offdevs_inited) { + err = rhashtable_init(&offdevs, &offdevs_params); + if (err) + return ERR_PTR(err); + offdevs_inited = true; + } + up_write(&bpf_devs_lock); + + offdev = kzalloc(sizeof(*offdev), GFP_KERNEL); + if (!offdev) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&offdev->netdevs); + + return offdev; } +EXPORT_SYMBOL_GPL(bpf_offload_dev_create); -subsys_initcall(bpf_offload_init); +void bpf_offload_dev_destroy(struct bpf_offload_dev *offdev) +{ + WARN_ON(!list_empty(&offdev->netdevs)); + kfree(offdev); +} +EXPORT_SYMBOL_GPL(bpf_offload_dev_destroy); diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 52a91d816c0e..0b38be5a955c 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -72,6 +72,7 @@ struct bpf_htab { u32 n_buckets; u32 elem_size; struct bpf_sock_progs progs; + struct rcu_head rcu; }; struct htab_elem { @@ -89,8 +90,8 @@ enum smap_psock_state { struct smap_psock_map_entry { struct list_head list; struct sock **entry; - struct htab_elem *hash_link; - struct bpf_htab *htab; + struct htab_elem __rcu *hash_link; + struct bpf_htab __rcu *htab; }; struct smap_psock { @@ -120,6 +121,7 @@ struct smap_psock { struct bpf_prog *bpf_parse; struct bpf_prog *bpf_verdict; struct list_head maps; + spinlock_t maps_lock; /* Back reference used when sock callback trigger sockmap operations */ struct sock *sock; @@ -140,6 +142,7 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); static int bpf_tcp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags); +static void bpf_tcp_close(struct sock *sk, long timeout); static inline struct smap_psock *smap_psock_sk(const struct sock *sk) { @@ -161,7 +164,42 @@ out: return !empty; } -static struct proto tcp_bpf_proto; +enum { + SOCKMAP_IPV4, + SOCKMAP_IPV6, + SOCKMAP_NUM_PROTS, +}; + +enum { + SOCKMAP_BASE, + SOCKMAP_TX, + SOCKMAP_NUM_CONFIGS, +}; + +static struct proto *saved_tcpv6_prot __read_mostly; +static DEFINE_SPINLOCK(tcpv6_prot_lock); +static struct proto bpf_tcp_prots[SOCKMAP_NUM_PROTS][SOCKMAP_NUM_CONFIGS]; +static void build_protos(struct proto prot[SOCKMAP_NUM_CONFIGS], + struct proto *base) +{ + prot[SOCKMAP_BASE] = *base; + prot[SOCKMAP_BASE].close = bpf_tcp_close; + prot[SOCKMAP_BASE].recvmsg = bpf_tcp_recvmsg; + prot[SOCKMAP_BASE].stream_memory_read = bpf_tcp_stream_read; + + prot[SOCKMAP_TX] = prot[SOCKMAP_BASE]; + prot[SOCKMAP_TX].sendmsg = bpf_tcp_sendmsg; + prot[SOCKMAP_TX].sendpage = bpf_tcp_sendpage; +} + +static void update_sk_prot(struct sock *sk, struct smap_psock *psock) +{ + int family = sk->sk_family == AF_INET6 ? SOCKMAP_IPV6 : SOCKMAP_IPV4; + int conf = psock->bpf_tx_msg ? 
SOCKMAP_TX : SOCKMAP_BASE; + + sk->sk_prot = &bpf_tcp_prots[family][conf]; +} + static int bpf_tcp_init(struct sock *sk) { struct smap_psock *psock; @@ -181,14 +219,17 @@ static int bpf_tcp_init(struct sock *sk) psock->save_close = sk->sk_prot->close; psock->sk_proto = sk->sk_prot; - if (psock->bpf_tx_msg) { - tcp_bpf_proto.sendmsg = bpf_tcp_sendmsg; - tcp_bpf_proto.sendpage = bpf_tcp_sendpage; - tcp_bpf_proto.recvmsg = bpf_tcp_recvmsg; - tcp_bpf_proto.stream_memory_read = bpf_tcp_stream_read; + /* Build IPv6 sockmap whenever the address of tcpv6_prot changes */ + if (sk->sk_family == AF_INET6 && + unlikely(sk->sk_prot != smp_load_acquire(&saved_tcpv6_prot))) { + spin_lock_bh(&tcpv6_prot_lock); + if (likely(sk->sk_prot != saved_tcpv6_prot)) { + build_protos(bpf_tcp_prots[SOCKMAP_IPV6], sk->sk_prot); + smp_store_release(&saved_tcpv6_prot, sk->sk_prot); + } + spin_unlock_bh(&tcpv6_prot_lock); } - - sk->sk_prot = &tcp_bpf_proto; + update_sk_prot(sk, psock); rcu_read_unlock(); return 0; } @@ -219,24 +260,64 @@ out: rcu_read_unlock(); } +static struct htab_elem *lookup_elem_raw(struct hlist_head *head, + u32 hash, void *key, u32 key_size) +{ + struct htab_elem *l; + + hlist_for_each_entry_rcu(l, head, hash_node) { + if (l->hash == hash && !memcmp(&l->key, key, key_size)) + return l; + } + + return NULL; +} + +static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash) +{ + return &htab->buckets[hash & (htab->n_buckets - 1)]; +} + +static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash) +{ + return &__select_bucket(htab, hash)->head; +} + static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) { atomic_dec(&htab->count); kfree_rcu(l, rcu); } +static struct smap_psock_map_entry *psock_map_pop(struct sock *sk, + struct smap_psock *psock) +{ + struct smap_psock_map_entry *e; + + spin_lock_bh(&psock->maps_lock); + e = list_first_entry_or_null(&psock->maps, + struct smap_psock_map_entry, + list); + if (e) + list_del(&e->list); + spin_unlock_bh(&psock->maps_lock); + return e; +} + static void bpf_tcp_close(struct sock *sk, long timeout) { void (*close_fun)(struct sock *sk, long timeout); - struct smap_psock_map_entry *e, *tmp; + struct smap_psock_map_entry *e; struct sk_msg_buff *md, *mtmp; struct smap_psock *psock; struct sock *osk; + lock_sock(sk); rcu_read_lock(); psock = smap_psock_sk(sk); if (unlikely(!psock)) { rcu_read_unlock(); + release_sock(sk); return sk->sk_prot->close(sk, timeout); } @@ -247,7 +328,6 @@ static void bpf_tcp_close(struct sock *sk, long timeout) */ close_fun = psock->save_close; - write_lock_bh(&sk->sk_callback_lock); if (psock->cork) { free_start_sg(psock->sock, psock->cork); kfree(psock->cork); @@ -260,21 +340,40 @@ static void bpf_tcp_close(struct sock *sk, long timeout) kfree(md); } - list_for_each_entry_safe(e, tmp, &psock->maps, list) { + e = psock_map_pop(sk, psock); + while (e) { if (e->entry) { osk = cmpxchg(e->entry, sk, NULL); if (osk == sk) { - list_del(&e->list); smap_release_sock(psock, sk); } } else { - hlist_del_rcu(&e->hash_link->hash_node); - smap_release_sock(psock, e->hash_link->sk); - free_htab_elem(e->htab, e->hash_link); + struct htab_elem *link = rcu_dereference(e->hash_link); + struct bpf_htab *htab = rcu_dereference(e->htab); + struct hlist_head *head; + struct htab_elem *l; + struct bucket *b; + + b = __select_bucket(htab, link->hash); + head = &b->head; + raw_spin_lock_bh(&b->lock); + l = lookup_elem_raw(head, + link->hash, link->key, + htab->map.key_size); + /* If another thread deleted 
this object skip deletion. + * The refcnt on psock may or may not be zero. + */ + if (l) { + hlist_del_rcu(&link->hash_node); + smap_release_sock(psock, link->sk); + free_htab_elem(htab, link); + } + raw_spin_unlock_bh(&b->lock); } + e = psock_map_pop(sk, psock); } - write_unlock_bh(&sk->sk_callback_lock); rcu_read_unlock(); + release_sock(sk); close_fun(sk, timeout); } @@ -472,7 +571,8 @@ static int free_sg(struct sock *sk, int start, struct sk_msg_buff *md) while (sg[i].length) { free += sg[i].length; sk_mem_uncharge(sk, sg[i].length); - put_page(sg_page(&sg[i])); + if (!md->skb) + put_page(sg_page(&sg[i])); sg[i].length = 0; sg[i].page_link = 0; sg[i].offset = 0; @@ -481,6 +581,8 @@ static int free_sg(struct sock *sk, int start, struct sk_msg_buff *md) if (i == MAX_SKB_FRAGS) i = 0; } + if (md->skb) + consume_skb(md->skb); return free; } @@ -623,11 +725,8 @@ static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send, { bool ingress = !!(md->flags & BPF_F_INGRESS); struct smap_psock *psock; - struct scatterlist *sg; int err = 0; - sg = md->sg_data; - rcu_read_lock(); psock = smap_psock_sk(sk); if (unlikely(!psock)) @@ -1111,8 +1210,7 @@ static void bpf_tcp_msg_add(struct smap_psock *psock, static int bpf_tcp_ulp_register(void) { - tcp_bpf_proto = tcp_prot; - tcp_bpf_proto.close = bpf_tcp_close; + build_protos(bpf_tcp_prots[SOCKMAP_IPV4], &tcp_prot); /* Once BPF TX ULP is registered it is never unregistered. It * will be in the ULP list for the lifetime of the system. Doing * duplicate registers is not a problem. @@ -1135,7 +1233,7 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb) */ TCP_SKB_CB(skb)->bpf.sk_redir = NULL; skb->sk = psock->sock; - bpf_compute_data_pointers(skb); + bpf_compute_data_end_sk_skb(skb); preempt_disable(); rc = (*prog->bpf_func)(skb, prog->insnsi); preempt_enable(); @@ -1357,7 +1455,9 @@ static void smap_release_sock(struct smap_psock *psock, struct sock *sock) { if (refcount_dec_and_test(&psock->refcnt)) { tcp_cleanup_ulp(sock); + write_lock_bh(&sock->sk_callback_lock); smap_stop_sock(psock, sock); + write_unlock_bh(&sock->sk_callback_lock); clear_bit(SMAP_TX_RUNNING, &psock->state); rcu_assign_sk_user_data(sock, NULL); call_rcu_sched(&psock->rcu, smap_destroy_psock); @@ -1388,7 +1488,7 @@ static int smap_parse_func_strparser(struct strparser *strp, * any socket yet. 
*/ skb->sk = psock->sock; - bpf_compute_data_pointers(skb); + bpf_compute_data_end_sk_skb(skb); rc = (*prog->bpf_func)(skb, prog->insnsi); skb->sk = NULL; rcu_read_unlock(); @@ -1508,6 +1608,7 @@ static struct smap_psock *smap_init_psock(struct sock *sock, int node) INIT_LIST_HEAD(&psock->maps); INIT_LIST_HEAD(&psock->ingress); refcount_set(&psock->refcnt, 1); + spin_lock_init(&psock->maps_lock); rcu_assign_sk_user_data(sock, psock); sock_hold(sock); @@ -1564,18 +1665,32 @@ free_stab: return ERR_PTR(err); } -static void smap_list_remove(struct smap_psock *psock, - struct sock **entry, - struct htab_elem *hash_link) +static void smap_list_map_remove(struct smap_psock *psock, + struct sock **entry) { struct smap_psock_map_entry *e, *tmp; + spin_lock_bh(&psock->maps_lock); list_for_each_entry_safe(e, tmp, &psock->maps, list) { - if (e->entry == entry || e->hash_link == hash_link) { + if (e->entry == entry) + list_del(&e->list); + } + spin_unlock_bh(&psock->maps_lock); +} + +static void smap_list_hash_remove(struct smap_psock *psock, + struct htab_elem *hash_link) +{ + struct smap_psock_map_entry *e, *tmp; + + spin_lock_bh(&psock->maps_lock); + list_for_each_entry_safe(e, tmp, &psock->maps, list) { + struct htab_elem *c = rcu_dereference(e->hash_link); + + if (c == hash_link) list_del(&e->list); - break; - } } + spin_unlock_bh(&psock->maps_lock); } static void sock_map_free(struct bpf_map *map) @@ -1601,7 +1716,6 @@ static void sock_map_free(struct bpf_map *map) if (!sock) continue; - write_lock_bh(&sock->sk_callback_lock); psock = smap_psock_sk(sock); /* This check handles a racing sock event that can get the * sk_callback_lock before this case but after xchg happens @@ -1609,10 +1723,9 @@ static void sock_map_free(struct bpf_map *map) * to be null and queued for garbage collection. */ if (likely(psock)) { - smap_list_remove(psock, &stab->sock_map[i], NULL); + smap_list_map_remove(psock, &stab->sock_map[i]); smap_release_sock(psock, sock); } - write_unlock_bh(&sock->sk_callback_lock); } rcu_read_unlock(); @@ -1661,17 +1774,15 @@ static int sock_map_delete_elem(struct bpf_map *map, void *key) if (!sock) return -EINVAL; - write_lock_bh(&sock->sk_callback_lock); psock = smap_psock_sk(sock); if (!psock) goto out; if (psock->bpf_parse) smap_stop_sock(psock, sock); - smap_list_remove(psock, &stab->sock_map[k], NULL); + smap_list_map_remove(psock, &stab->sock_map[k]); smap_release_sock(psock, sock); out: - write_unlock_bh(&sock->sk_callback_lock); return 0; } @@ -1752,7 +1863,6 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map, } } - write_lock_bh(&sock->sk_callback_lock); psock = smap_psock_sk(sock); /* 2. Do not allow inheriting programs if psock exists and has @@ -1789,7 +1899,7 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map, e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN); if (!e) { err = -ENOMEM; - goto out_progs; + goto out_free; } } @@ -1809,7 +1919,9 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map, if (err) goto out_free; smap_init_progs(psock, verdict, parse); + write_lock_bh(&sock->sk_callback_lock); smap_start_sock(psock, sock); + write_unlock_bh(&sock->sk_callback_lock); } /* 4. 
Place psock in sockmap for use and stop any programs on @@ -1819,9 +1931,10 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map, */ if (map_link) { e->entry = map_link; + spin_lock_bh(&psock->maps_lock); list_add_tail(&e->list, &psock->maps); + spin_unlock_bh(&psock->maps_lock); } - write_unlock_bh(&sock->sk_callback_lock); return err; out_free: smap_release_sock(psock, sock); @@ -1832,7 +1945,6 @@ out_progs: } if (tx_msg) bpf_prog_put(tx_msg); - write_unlock_bh(&sock->sk_callback_lock); kfree(e); return err; } @@ -1869,10 +1981,8 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, if (osock) { struct smap_psock *opsock = smap_psock_sk(osock); - write_lock_bh(&osock->sk_callback_lock); - smap_list_remove(opsock, &stab->sock_map[i], NULL); + smap_list_map_remove(opsock, &stab->sock_map[i]); smap_release_sock(opsock, osock); - write_unlock_bh(&osock->sk_callback_lock); } out: return err; @@ -1915,6 +2025,24 @@ int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type) return 0; } +int sockmap_get_from_fd(const union bpf_attr *attr, int type, + struct bpf_prog *prog) +{ + int ufd = attr->target_fd; + struct bpf_map *map; + struct fd f; + int err; + + f = fdget(ufd); + map = __bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + + err = sock_map_prog(map, prog, attr->attach_type); + fdput(f); + return err; +} + static void *sock_map_lookup(struct bpf_map *map, void *key) { return NULL; @@ -1944,7 +2072,13 @@ static int sock_map_update_elem(struct bpf_map *map, return -EOPNOTSUPP; } + lock_sock(skops.sk); + preempt_disable(); + rcu_read_lock(); err = sock_map_ctx_update_elem(&skops, map, key, flags); + rcu_read_unlock(); + preempt_enable(); + release_sock(skops.sk); fput(socket->file); return err; } @@ -2043,14 +2177,13 @@ free_htab: return ERR_PTR(err); } -static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash) +static void __bpf_htab_free(struct rcu_head *rcu) { - return &htab->buckets[hash & (htab->n_buckets - 1)]; -} + struct bpf_htab *htab; -static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash) -{ - return &__select_bucket(htab, hash)->head; + htab = container_of(rcu, struct bpf_htab, rcu); + bpf_map_area_free(htab->buckets); + kfree(htab); } static void sock_hash_free(struct bpf_map *map) @@ -2069,16 +2202,18 @@ static void sock_hash_free(struct bpf_map *map) */ rcu_read_lock(); for (i = 0; i < htab->n_buckets; i++) { - struct hlist_head *head = select_bucket(htab, i); + struct bucket *b = __select_bucket(htab, i); + struct hlist_head *head; struct hlist_node *n; struct htab_elem *l; + raw_spin_lock_bh(&b->lock); + head = &b->head; hlist_for_each_entry_safe(l, n, head, hash_node) { struct sock *sock = l->sk; struct smap_psock *psock; hlist_del_rcu(&l->hash_node); - write_lock_bh(&sock->sk_callback_lock); psock = smap_psock_sk(sock); /* This check handles a racing sock event that can get * the sk_callback_lock before this case but after xchg @@ -2086,16 +2221,15 @@ static void sock_hash_free(struct bpf_map *map) * (psock) to be null and queued for garbage collection. 
*/ if (likely(psock)) { - smap_list_remove(psock, NULL, l); + smap_list_hash_remove(psock, l); smap_release_sock(psock, sock); } - write_unlock_bh(&sock->sk_callback_lock); - kfree(l); + free_htab_elem(htab, l); } + raw_spin_unlock_bh(&b->lock); } rcu_read_unlock(); - bpf_map_area_free(htab->buckets); - kfree(htab); + call_rcu(&htab->rcu, __bpf_htab_free); } static struct htab_elem *alloc_sock_hash_elem(struct bpf_htab *htab, @@ -2122,19 +2256,6 @@ static struct htab_elem *alloc_sock_hash_elem(struct bpf_htab *htab, return l_new; } -static struct htab_elem *lookup_elem_raw(struct hlist_head *head, - u32 hash, void *key, u32 key_size) -{ - struct htab_elem *l; - - hlist_for_each_entry_rcu(l, head, hash_node) { - if (l->hash == hash && !memcmp(&l->key, key, key_size)) - return l; - } - - return NULL; -} - static inline u32 htab_map_hash(const void *key, u32 key_len) { return jhash(key, key_len, 0); @@ -2230,7 +2351,10 @@ static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops, if (err) goto err; - /* bpf_map_update_elem() can be called in_irq() */ + /* psock is valid here because otherwise above *ctx_update_elem would + * have thrown an error. It is safe to skip error check. + */ + psock = smap_psock_sk(sock); raw_spin_lock_bh(&b->lock); l_old = lookup_elem_raw(head, hash, key, key_size); if (l_old && map_flags == BPF_NOEXIST) { @@ -2248,15 +2372,12 @@ static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops, goto bucket_err; } - psock = smap_psock_sk(sock); - if (unlikely(!psock)) { - err = -EINVAL; - goto bucket_err; - } - - e->hash_link = l_new; - e->htab = container_of(map, struct bpf_htab, map); + rcu_assign_pointer(e->hash_link, l_new); + rcu_assign_pointer(e->htab, + container_of(map, struct bpf_htab, map)); + spin_lock_bh(&psock->maps_lock); list_add_tail(&e->list, &psock->maps); + spin_unlock_bh(&psock->maps_lock); /* add new element to the head of the list, so that * concurrent search will find it before old elem @@ -2266,19 +2387,17 @@ static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops, psock = smap_psock_sk(l_old->sk); hlist_del_rcu(&l_old->hash_node); - smap_list_remove(psock, NULL, l_old); + smap_list_hash_remove(psock, l_old); smap_release_sock(psock, l_old->sk); free_htab_elem(htab, l_old); } raw_spin_unlock_bh(&b->lock); return 0; bucket_err: + smap_release_sock(psock, sock); raw_spin_unlock_bh(&b->lock); err: kfree(e); - psock = smap_psock_sk(sock); - if (psock) - smap_release_sock(psock, sock); return err; } @@ -2300,7 +2419,13 @@ static int sock_hash_update_elem(struct bpf_map *map, return -EINVAL; } + lock_sock(skops.sk); + preempt_disable(); + rcu_read_lock(); err = sock_hash_ctx_update_elem(&skops, map, key, flags); + rcu_read_unlock(); + preempt_enable(); + release_sock(skops.sk); fput(socket->file); return err; } @@ -2326,7 +2451,6 @@ static int sock_hash_delete_elem(struct bpf_map *map, void *key) struct smap_psock *psock; hlist_del_rcu(&l->hash_node); - write_lock_bh(&sock->sk_callback_lock); psock = smap_psock_sk(sock); /* This check handles a racing sock event that can get the * sk_callback_lock before this case but after xchg happens @@ -2334,10 +2458,9 @@ static int sock_hash_delete_elem(struct bpf_map *map, void *key) * to be null and queued for garbage collection. 
*/ if (likely(psock)) { - smap_list_remove(psock, NULL, l); + smap_list_hash_remove(psock, l); smap_release_sock(psock, sock); } - write_unlock_bh(&sock->sk_callback_lock); free_htab_elem(htab, l); ret = 0; } @@ -2359,10 +2482,8 @@ struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key) b = __select_bucket(htab, hash); head = &b->head; - raw_spin_lock_bh(&b->lock); l = lookup_elem_raw(head, hash, key, key_size); sk = l ? l->sk : NULL; - raw_spin_unlock_bh(&b->lock); return sk; } @@ -2383,6 +2504,7 @@ const struct bpf_map_ops sock_hash_ops = { .map_get_next_key = sock_hash_get_next_key, .map_update_elem = sock_hash_update_elem, .map_delete_elem = sock_hash_delete_elem, + .map_release_uref = sock_map_release, }; BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 35dc466641f2..a31a1ba0f8ea 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -735,7 +735,9 @@ static int map_update_elem(union bpf_attr *attr) if (bpf_map_is_dev_bound(map)) { err = bpf_map_offload_update_elem(map, key, value, attr->flags); goto out; - } else if (map->map_type == BPF_MAP_TYPE_CPUMAP) { + } else if (map->map_type == BPF_MAP_TYPE_CPUMAP || + map->map_type == BPF_MAP_TYPE_SOCKHASH || + map->map_type == BPF_MAP_TYPE_SOCKMAP) { err = map->ops->map_update_elem(map, key, value, attr->flags); goto out; } @@ -1483,8 +1485,6 @@ out_free_tp: return err; } -#ifdef CONFIG_CGROUP_BPF - static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, enum bpf_attach_type attach_type) { @@ -1499,40 +1499,6 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, #define BPF_PROG_ATTACH_LAST_FIELD attach_flags -static int sockmap_get_from_fd(const union bpf_attr *attr, - int type, bool attach) -{ - struct bpf_prog *prog = NULL; - int ufd = attr->target_fd; - struct bpf_map *map; - struct fd f; - int err; - - f = fdget(ufd); - map = __bpf_map_get(f); - if (IS_ERR(map)) - return PTR_ERR(map); - - if (attach) { - prog = bpf_prog_get_type(attr->attach_bpf_fd, type); - if (IS_ERR(prog)) { - fdput(f); - return PTR_ERR(prog); - } - } - - err = sock_map_prog(map, prog, attr->attach_type); - if (err) { - fdput(f); - if (prog) - bpf_prog_put(prog); - return err; - } - - fdput(f); - return 0; -} - #define BPF_F_ATTACH_MASK \ (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI) @@ -1540,7 +1506,6 @@ static int bpf_prog_attach(const union bpf_attr *attr) { enum bpf_prog_type ptype; struct bpf_prog *prog; - struct cgroup *cgrp; int ret; if (!capable(CAP_NET_ADMIN)) @@ -1577,12 +1542,15 @@ static int bpf_prog_attach(const union bpf_attr *attr) ptype = BPF_PROG_TYPE_CGROUP_DEVICE; break; case BPF_SK_MSG_VERDICT: - return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, true); + ptype = BPF_PROG_TYPE_SK_MSG; + break; case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: - return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, true); + ptype = BPF_PROG_TYPE_SK_SKB; + break; case BPF_LIRC_MODE2: - return lirc_prog_attach(attr); + ptype = BPF_PROG_TYPE_LIRC_MODE2; + break; default: return -EINVAL; } @@ -1596,18 +1564,20 @@ static int bpf_prog_attach(const union bpf_attr *attr) return -EINVAL; } - cgrp = cgroup_get_from_fd(attr->target_fd); - if (IS_ERR(cgrp)) { - bpf_prog_put(prog); - return PTR_ERR(cgrp); + switch (ptype) { + case BPF_PROG_TYPE_SK_SKB: + case BPF_PROG_TYPE_SK_MSG: + ret = sockmap_get_from_fd(attr, ptype, prog); + break; + case BPF_PROG_TYPE_LIRC_MODE2: + ret = lirc_prog_attach(attr, prog); + break; + 
default: + ret = cgroup_bpf_prog_attach(attr, ptype, prog); } - ret = cgroup_bpf_attach(cgrp, prog, attr->attach_type, - attr->attach_flags); if (ret) bpf_prog_put(prog); - cgroup_put(cgrp); - return ret; } @@ -1616,9 +1586,6 @@ static int bpf_prog_attach(const union bpf_attr *attr) static int bpf_prog_detach(const union bpf_attr *attr) { enum bpf_prog_type ptype; - struct bpf_prog *prog; - struct cgroup *cgrp; - int ret; if (!capable(CAP_NET_ADMIN)) return -EPERM; @@ -1651,29 +1618,17 @@ static int bpf_prog_detach(const union bpf_attr *attr) ptype = BPF_PROG_TYPE_CGROUP_DEVICE; break; case BPF_SK_MSG_VERDICT: - return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, false); + return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, NULL); case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: - return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, false); + return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, NULL); case BPF_LIRC_MODE2: return lirc_prog_detach(attr); default: return -EINVAL; } - cgrp = cgroup_get_from_fd(attr->target_fd); - if (IS_ERR(cgrp)) - return PTR_ERR(cgrp); - - prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); - if (IS_ERR(prog)) - prog = NULL; - - ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, 0); - if (prog) - bpf_prog_put(prog); - cgroup_put(cgrp); - return ret; + return cgroup_bpf_prog_detach(attr, ptype); } #define BPF_PROG_QUERY_LAST_FIELD query.prog_cnt @@ -1681,9 +1636,6 @@ static int bpf_prog_detach(const union bpf_attr *attr) static int bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) { - struct cgroup *cgrp; - int ret; - if (!capable(CAP_NET_ADMIN)) return -EPERM; if (CHECK_ATTR(BPF_PROG_QUERY)) @@ -1711,14 +1663,9 @@ static int bpf_prog_query(const union bpf_attr *attr, default: return -EINVAL; } - cgrp = cgroup_get_from_fd(attr->query.target_fd); - if (IS_ERR(cgrp)) - return PTR_ERR(cgrp); - ret = cgroup_bpf_query(cgrp, attr, uattr); - cgroup_put(cgrp); - return ret; + + return cgroup_bpf_prog_query(attr, uattr); } -#endif /* CONFIG_CGROUP_BPF */ #define BPF_PROG_TEST_RUN_LAST_FIELD test.duration @@ -2365,7 +2312,6 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_OBJ_GET: err = bpf_obj_get(&attr); break; -#ifdef CONFIG_CGROUP_BPF case BPF_PROG_ATTACH: err = bpf_prog_attach(&attr); break; @@ -2375,7 +2321,6 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_PROG_QUERY: err = bpf_prog_query(&attr, uattr); break; -#endif case BPF_PROG_TEST_RUN: err = bpf_prog_test_run(&attr, uattr); break; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9e2bf834f13a..25e47c195874 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5054,7 +5054,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, } if ((bpf_prog_is_dev_bound(prog->aux) || bpf_map_is_dev_bound(map)) && - !bpf_offload_dev_match(prog, map)) { + !bpf_offload_prog_map_match(prog, map)) { verbose(env, "offload device mismatch between prog and map\n"); return -EINVAL; } @@ -5430,6 +5430,10 @@ static int jit_subprogs(struct bpf_verifier_env *env) if (insn->code != (BPF_JMP | BPF_CALL) || insn->src_reg != BPF_PSEUDO_CALL) continue; + /* Upon error here we cannot fall back to interpreter but + * need a hard reject of the program. Thus -EFAULT is + * propagated in any case. + */ subprog = find_subprog(env, i + insn->imm + 1); if (subprog < 0) { WARN_ONCE(1, "verifier bug. 
No program starts at insn %d\n", @@ -5450,7 +5454,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) func = kcalloc(env->subprog_cnt, sizeof(prog), GFP_KERNEL); if (!func) - return -ENOMEM; + goto out_undo_insn; for (i = 0; i < env->subprog_cnt; i++) { subprog_start = subprog_end; @@ -5515,7 +5519,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) tmp = bpf_int_jit_compile(func[i]); if (tmp != func[i] || func[i]->bpf_func != old_bpf_func) { verbose(env, "JIT doesn't support bpf-to-bpf calls\n"); - err = -EFAULT; + err = -ENOTSUPP; goto out_free; } cond_resched(); @@ -5552,6 +5556,7 @@ out_free: if (func[i]) bpf_jit_free(func[i]); kfree(func); +out_undo_insn: /* cleanup main prog to be interpreted */ prog->jit_requested = 0; for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { @@ -5578,6 +5583,8 @@ static int fixup_call_args(struct bpf_verifier_env *env) err = jit_subprogs(env); if (err == 0) return 0; + if (err == -EFAULT) + return err; } #ifndef CONFIG_BPF_JIT_ALWAYS_ON for (i = 0; i < prog->len; i++, insn++) { |
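The btf_int_bits_seq_show() rework above drops the union-and-byte-mask
approach in favor of two logical shifts on an unsigned 64-bit value, which
also reduces the big-endian case to a different left-shift count. A minimal
userspace sketch of the little-endian path follows; demo_extract() and
BITS_PER_U64 are illustrative names, not kernel API, and the example assumes
a little-endian host.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define BITS_PER_U64 64

/* Extract nr_bits starting at bit offset bit_off (0-7) from data. */
static uint64_t demo_extract(const void *data, uint8_t bit_off, uint8_t nr_bits)
{
        uint8_t nr_copy_bits = nr_bits + bit_off;
        uint8_t nr_copy_bytes = (nr_copy_bits + 7) / 8;
        uint64_t v = 0;

        memcpy(&v, data, nr_copy_bytes);
        /* Shift the unwanted high-order bits out the top, then shift the
         * field down to bit 0; logical shifts on the unsigned value do the
         * masking that the old u8_nums[] union did by hand.
         */
        v <<= BITS_PER_U64 - nr_copy_bits;
        v >>= BITS_PER_U64 - nr_bits;
        return v;
}

int main(void)
{
        uint8_t raw = 0xbc;     /* bits 4-7 hold 0b1011 */

        printf("0x%llx\n", (unsigned long long)demo_extract(&raw, 4, 4));
        return 0;
}

This prints 0xb: the left shift discards the copied-but-unwanted upper bits,
and the right shift drops the bit_off low-order bits in the same motion.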
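The compute_effective_progs() fix builds the array through a plain pointer
and only hands it to readers via rcu_assign_pointer(), whose release barrier
guarantees the initialization is visible before the pointer is. A hedged C11
stand-in for that publish-after-init ordering (struct demo_array,
demo_publish() and the acquire-load reader are illustrative; the kernel uses
RCU, not C11 atomics):

#include <stdatomic.h>
#include <stdlib.h>

struct demo_array {
        int cnt;
        void *progs[];
};

static _Atomic(struct demo_array *) shared;

static int demo_publish(int cnt)
{
        struct demo_array *a = calloc(1, sizeof(*a) + cnt * sizeof(void *));

        if (!a)
                return -1;
        a->cnt = cnt;           /* fully initialize first ... */
        /* ... then publish with release semantics, as rcu_assign_pointer()
         * does, so a reader can never observe a half-built array.
         */
        atomic_store_explicit(&shared, a, memory_order_release);
        return 0;
}

static struct demo_array *demo_read(void)
{
        /* Pairs with the release store above; rcu_dereference() provides
         * the analogous ordering on the kernel side.
         */
        return atomic_load_explicit(&shared, memory_order_acquire);
}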
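bpf_tcp_init() now rebuilds the IPv6 proto templates only when the address
of tcpv6_prot changes, using the classic double-checked pattern: a racy
acquire-load fast path, then a re-check under a lock before the release
store. The same shape with C11 atomics and a pthread mutex standing in for
smp_load_acquire()/smp_store_release() and the bh spinlock (every demo_*
name is illustrative):

#include <pthread.h>
#include <stdatomic.h>

struct demo_proto {
        void (*close)(void);
};

static _Atomic(const struct demo_proto *) saved_base;
static pthread_mutex_t rebuild_lock = PTHREAD_MUTEX_INITIALIZER;
static struct demo_proto demo_prots[2];

static void demo_build(struct demo_proto prot[2],
                       const struct demo_proto *base)
{
        prot[0] = *base;        /* stands in for build_protos() */
        prot[1] = *base;
}

static void demo_maybe_rebuild(const struct demo_proto *base)
{
        /* Fast path: no lock when the templates are already current. */
        if (atomic_load_explicit(&saved_base, memory_order_acquire) == base)
                return;

        pthread_mutex_lock(&rebuild_lock);
        /* Re-check under the lock in case another thread won the race. */
        if (atomic_load_explicit(&saved_base, memory_order_relaxed) != base) {
                demo_build(demo_prots, base);
                atomic_store_explicit(&saved_base, base,
                                      memory_order_release);
        }
        pthread_mutex_unlock(&rebuild_lock);
}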
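Finally, bpf_tcp_close() stops walking psock->maps with
list_for_each_entry_safe() and instead pops one entry at a time under
psock->maps_lock via psock_map_pop(), so the lock is never held across the
per-entry teardown. The same pattern in portable C, with a pthread mutex and
a singly linked list replacing the kernel primitives (again, all names are
illustrative):

#include <pthread.h>
#include <stdlib.h>

struct demo_entry {
        struct demo_entry *next;
};

static struct demo_entry *demo_head;
static pthread_mutex_t demo_lock = PTHREAD_MUTEX_INITIALIZER;

/* Detach and return the first entry, or NULL; mirrors psock_map_pop(). */
static struct demo_entry *demo_pop(void)
{
        struct demo_entry *e;

        pthread_mutex_lock(&demo_lock);
        e = demo_head;
        if (e)
                demo_head = e->next;
        pthread_mutex_unlock(&demo_lock);
        return e;
}

static void demo_drain(void)
{
        struct demo_entry *e;

        /* The lock is never held while an entry is processed, so the
         * per-entry work may itself sleep or take other locks -- the
         * reason bpf_tcp_close() switched to this shape.
         */
        while ((e = demo_pop()) != NULL)
                free(e);
}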