diff options
Diffstat (limited to 'kernel/bpf')
-rw-r--r-- | kernel/bpf/Makefile | 7 | ||||
-rw-r--r-- | kernel/bpf/arraymap.c | 51 | ||||
-rw-r--r-- | kernel/bpf/btf.c | 5 | ||||
-rw-r--r-- | kernel/bpf/cgroup.c | 82 | ||||
-rw-r--r-- | kernel/bpf/core.c | 5 | ||||
-rw-r--r-- | kernel/bpf/hashtab.c | 31 | ||||
-rw-r--r-- | kernel/bpf/helpers.c | 68 | ||||
-rw-r--r-- | kernel/bpf/local_storage.c | 174 | ||||
-rw-r--r-- | kernel/bpf/map_in_map.c | 3 | ||||
-rw-r--r-- | kernel/bpf/offload.c | 18 | ||||
-rw-r--r-- | kernel/bpf/queue_stack_maps.c | 288 | ||||
-rw-r--r-- | kernel/bpf/sockmap.c | 2580 | ||||
-rw-r--r-- | kernel/bpf/stackmap.c | 4 | ||||
-rw-r--r-- | kernel/bpf/syscall.c | 148 | ||||
-rw-r--r-- | kernel/bpf/verifier.c | 985 | ||||
-rw-r--r-- | kernel/bpf/xskmap.c | 12 |
16 files changed, 1501 insertions, 2960 deletions
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 0488b8258321..4c2fa3ac56f6 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -3,7 +3,7 @@ obj-y := core.o obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o -obj-$(CONFIG_BPF_SYSCALL) += local_storage.o +obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o obj-$(CONFIG_BPF_SYSCALL) += btf.o ifeq ($(CONFIG_NET),y) @@ -13,11 +13,6 @@ ifeq ($(CONFIG_XDP_SOCKETS),y) obj-$(CONFIG_BPF_SYSCALL) += xskmap.o endif obj-$(CONFIG_BPF_SYSCALL) += offload.o -ifeq ($(CONFIG_STREAM_PARSER),y) -ifeq ($(CONFIG_INET),y) -obj-$(CONFIG_BPF_SYSCALL) += sockmap.o -endif -endif endif ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 0c17aab3ce5f..24583da9ffd1 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -358,6 +358,29 @@ static void array_map_seq_show_elem(struct bpf_map *map, void *key, rcu_read_unlock(); } +static void percpu_array_map_seq_show_elem(struct bpf_map *map, void *key, + struct seq_file *m) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + u32 index = *(u32 *)key; + void __percpu *pptr; + int cpu; + + rcu_read_lock(); + + seq_printf(m, "%u: {\n", *(u32 *)key); + pptr = array->pptrs[index & array->index_mask]; + for_each_possible_cpu(cpu) { + seq_printf(m, "\tcpu%d: ", cpu); + btf_type_seq_show(map->btf, map->btf_value_type_id, + per_cpu_ptr(pptr, cpu), m); + seq_puts(m, "\n"); + } + seq_puts(m, "}\n"); + + rcu_read_unlock(); +} + static int array_map_check_btf(const struct bpf_map *map, const struct btf_type *key_type, const struct btf_type *value_type) @@ -398,6 +421,7 @@ const struct bpf_map_ops percpu_array_map_ops = { .map_lookup_elem = percpu_array_map_lookup_elem, .map_update_elem = array_map_update_elem, .map_delete_elem = array_map_delete_elem, + .map_seq_show_elem = percpu_array_map_seq_show_elem, .map_check_btf = array_map_check_btf, }; @@ -425,7 +449,7 @@ static void fd_array_map_free(struct bpf_map *map) static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key) { - return NULL; + return ERR_PTR(-EOPNOTSUPP); } /* only called from syscall */ @@ -529,6 +553,29 @@ static void bpf_fd_array_map_clear(struct bpf_map *map) fd_array_map_delete_elem(map, &i); } +static void prog_array_map_seq_show_elem(struct bpf_map *map, void *key, + struct seq_file *m) +{ + void **elem, *ptr; + u32 prog_id; + + rcu_read_lock(); + + elem = array_map_lookup_elem(map, key); + if (elem) { + ptr = READ_ONCE(*elem); + if (ptr) { + seq_printf(m, "%u: ", *(u32 *)key); + prog_id = prog_fd_array_sys_lookup_elem(ptr); + btf_type_seq_show(map->btf, map->btf_value_type_id, + &prog_id, m); + seq_puts(m, "\n"); + } + } + + rcu_read_unlock(); +} + const struct bpf_map_ops prog_array_map_ops = { .map_alloc_check = fd_array_map_alloc_check, .map_alloc = array_map_alloc, @@ -540,7 +587,7 @@ const struct bpf_map_ops prog_array_map_ops = { .map_fd_put_ptr = prog_fd_array_put_ptr, .map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem, .map_release_uref = bpf_fd_array_map_clear, - .map_check_btf = map_check_no_btf, + .map_seq_show_elem = prog_array_map_seq_show_elem, }; static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file, diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 2590700237c1..378cef70341c 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -1844,7 +1844,7 @@ static int btf_check_all_metas(struct btf_verifier_env *env) hdr = &btf->hdr; cur = btf->nohdr_data + hdr->type_off; - end = btf->nohdr_data + hdr->type_len; + end = cur + hdr->type_len; env->log_type_id = 1; while (cur < end) { @@ -2114,6 +2114,9 @@ static int btf_parse_hdr(struct btf_verifier_env *env, void __user *btf_data, hdr = &btf->hdr; + if (hdr->hdr_len != hdr_len) + return -EINVAL; + btf_verifier_log_hdr(env, btf_data_size); if (hdr->magic != BTF_MAGIC) { diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 6a7d931bbc55..9425c2fb872f 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -25,6 +25,7 @@ EXPORT_SYMBOL(cgroup_bpf_enabled_key); */ void cgroup_bpf_put(struct cgroup *cgrp) { + enum bpf_cgroup_storage_type stype; unsigned int type; for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) { @@ -34,8 +35,10 @@ void cgroup_bpf_put(struct cgroup *cgrp) list_for_each_entry_safe(pl, tmp, progs, node) { list_del(&pl->node); bpf_prog_put(pl->prog); - bpf_cgroup_storage_unlink(pl->storage); - bpf_cgroup_storage_free(pl->storage); + for_each_cgroup_storage_type(stype) { + bpf_cgroup_storage_unlink(pl->storage[stype]); + bpf_cgroup_storage_free(pl->storage[stype]); + } kfree(pl); static_branch_dec(&cgroup_bpf_enabled_key); } @@ -97,6 +100,7 @@ static int compute_effective_progs(struct cgroup *cgrp, enum bpf_attach_type type, struct bpf_prog_array __rcu **array) { + enum bpf_cgroup_storage_type stype; struct bpf_prog_array *progs; struct bpf_prog_list *pl; struct cgroup *p = cgrp; @@ -125,7 +129,9 @@ static int compute_effective_progs(struct cgroup *cgrp, continue; progs->items[cnt].prog = pl->prog; - progs->items[cnt].cgroup_storage = pl->storage; + for_each_cgroup_storage_type(stype) + progs->items[cnt].cgroup_storage[stype] = + pl->storage[stype]; cnt++; } } while ((p = cgroup_parent(p))); @@ -232,7 +238,9 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, { struct list_head *progs = &cgrp->bpf.progs[type]; struct bpf_prog *old_prog = NULL; - struct bpf_cgroup_storage *storage, *old_storage = NULL; + struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE], + *old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {NULL}; + enum bpf_cgroup_storage_type stype; struct bpf_prog_list *pl; bool pl_was_allocated; int err; @@ -254,34 +262,44 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS) return -E2BIG; - storage = bpf_cgroup_storage_alloc(prog); - if (IS_ERR(storage)) - return -ENOMEM; + for_each_cgroup_storage_type(stype) { + storage[stype] = bpf_cgroup_storage_alloc(prog, stype); + if (IS_ERR(storage[stype])) { + storage[stype] = NULL; + for_each_cgroup_storage_type(stype) + bpf_cgroup_storage_free(storage[stype]); + return -ENOMEM; + } + } if (flags & BPF_F_ALLOW_MULTI) { list_for_each_entry(pl, progs, node) { if (pl->prog == prog) { /* disallow attaching the same prog twice */ - bpf_cgroup_storage_free(storage); + for_each_cgroup_storage_type(stype) + bpf_cgroup_storage_free(storage[stype]); return -EINVAL; } } pl = kmalloc(sizeof(*pl), GFP_KERNEL); if (!pl) { - bpf_cgroup_storage_free(storage); + for_each_cgroup_storage_type(stype) + bpf_cgroup_storage_free(storage[stype]); return -ENOMEM; } pl_was_allocated = true; pl->prog = prog; - pl->storage = storage; + for_each_cgroup_storage_type(stype) + pl->storage[stype] = storage[stype]; list_add_tail(&pl->node, progs); } else { if (list_empty(progs)) { pl = kmalloc(sizeof(*pl), GFP_KERNEL); if (!pl) { - bpf_cgroup_storage_free(storage); + for_each_cgroup_storage_type(stype) + bpf_cgroup_storage_free(storage[stype]); return -ENOMEM; } pl_was_allocated = true; @@ -289,12 +307,15 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, } else { pl = list_first_entry(progs, typeof(*pl), node); old_prog = pl->prog; - old_storage = pl->storage; - bpf_cgroup_storage_unlink(old_storage); + for_each_cgroup_storage_type(stype) { + old_storage[stype] = pl->storage[stype]; + bpf_cgroup_storage_unlink(old_storage[stype]); + } pl_was_allocated = false; } pl->prog = prog; - pl->storage = storage; + for_each_cgroup_storage_type(stype) + pl->storage[stype] = storage[stype]; } cgrp->bpf.flags[type] = flags; @@ -304,21 +325,27 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, goto cleanup; static_branch_inc(&cgroup_bpf_enabled_key); - if (old_storage) - bpf_cgroup_storage_free(old_storage); + for_each_cgroup_storage_type(stype) { + if (!old_storage[stype]) + continue; + bpf_cgroup_storage_free(old_storage[stype]); + } if (old_prog) { bpf_prog_put(old_prog); static_branch_dec(&cgroup_bpf_enabled_key); } - bpf_cgroup_storage_link(storage, cgrp, type); + for_each_cgroup_storage_type(stype) + bpf_cgroup_storage_link(storage[stype], cgrp, type); return 0; cleanup: /* and cleanup the prog list */ pl->prog = old_prog; - bpf_cgroup_storage_free(pl->storage); - pl->storage = old_storage; - bpf_cgroup_storage_link(old_storage, cgrp, type); + for_each_cgroup_storage_type(stype) { + bpf_cgroup_storage_free(pl->storage[stype]); + pl->storage[stype] = old_storage[stype]; + bpf_cgroup_storage_link(old_storage[stype], cgrp, type); + } if (pl_was_allocated) { list_del(&pl->node); kfree(pl); @@ -339,6 +366,7 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, enum bpf_attach_type type, u32 unused_flags) { struct list_head *progs = &cgrp->bpf.progs[type]; + enum bpf_cgroup_storage_type stype; u32 flags = cgrp->bpf.flags[type]; struct bpf_prog *old_prog = NULL; struct bpf_prog_list *pl; @@ -385,8 +413,10 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, /* now can actually delete it from this cgroup list */ list_del(&pl->node); - bpf_cgroup_storage_unlink(pl->storage); - bpf_cgroup_storage_free(pl->storage); + for_each_cgroup_storage_type(stype) { + bpf_cgroup_storage_unlink(pl->storage[stype]); + bpf_cgroup_storage_free(pl->storage[stype]); + } kfree(pl); if (list_empty(progs)) /* last program was detached, reset flags to zero */ @@ -523,6 +553,7 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, { unsigned int offset = skb->data - skb_network_header(skb); struct sock *save_sk; + void *saved_data_end; struct cgroup *cgrp; int ret; @@ -536,8 +567,13 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, save_sk = skb->sk; skb->sk = sk; __skb_push(skb, offset); + + /* compute pointers for the bpf prog */ + bpf_compute_and_save_data_end(skb, &saved_data_end); + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb, bpf_prog_run_save_cb); + bpf_restore_data_end(skb, saved_data_end); __skb_pull(skb, offset); skb->sk = save_sk; return ret == 1 ? 0 : -EPERM; @@ -677,6 +713,8 @@ cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_current_uid_gid_proto; case BPF_FUNC_get_local_storage: return &bpf_get_local_storage_proto; + case BPF_FUNC_get_current_cgroup_id: + return &bpf_get_current_cgroup_id_proto; case BPF_FUNC_trace_printk: if (capable(CAP_SYS_ADMIN)) return bpf_get_trace_printk_proto(); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 3f5bf1af0826..7c7eeea8cffc 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1783,6 +1783,9 @@ BPF_CALL_0(bpf_user_rnd_u32) const struct bpf_func_proto bpf_map_lookup_elem_proto __weak; const struct bpf_func_proto bpf_map_update_elem_proto __weak; const struct bpf_func_proto bpf_map_delete_elem_proto __weak; +const struct bpf_func_proto bpf_map_push_elem_proto __weak; +const struct bpf_func_proto bpf_map_pop_elem_proto __weak; +const struct bpf_func_proto bpf_map_peek_elem_proto __weak; const struct bpf_func_proto bpf_get_prandom_u32_proto __weak; const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak; @@ -1792,8 +1795,6 @@ const struct bpf_func_proto bpf_ktime_get_ns_proto __weak; const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak; const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak; const struct bpf_func_proto bpf_get_current_comm_proto __weak; -const struct bpf_func_proto bpf_sock_map_update_proto __weak; -const struct bpf_func_proto bpf_sock_hash_update_proto __weak; const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak; const struct bpf_func_proto bpf_get_local_storage_proto __weak; diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 03cc59ee9c95..2c1790288138 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1285,6 +1285,35 @@ int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, return ret; } +static void htab_percpu_map_seq_show_elem(struct bpf_map *map, void *key, + struct seq_file *m) +{ + struct htab_elem *l; + void __percpu *pptr; + int cpu; + + rcu_read_lock(); + + l = __htab_map_lookup_elem(map, key); + if (!l) { + rcu_read_unlock(); + return; + } + + btf_type_seq_show(map->btf, map->btf_key_type_id, key, m); + seq_puts(m, ": {\n"); + pptr = htab_elem_get_ptr(l, map->key_size); + for_each_possible_cpu(cpu) { + seq_printf(m, "\tcpu%d: ", cpu); + btf_type_seq_show(map->btf, map->btf_value_type_id, + per_cpu_ptr(pptr, cpu), m); + seq_puts(m, "\n"); + } + seq_puts(m, "}\n"); + + rcu_read_unlock(); +} + const struct bpf_map_ops htab_percpu_map_ops = { .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, @@ -1293,6 +1322,7 @@ const struct bpf_map_ops htab_percpu_map_ops = { .map_lookup_elem = htab_percpu_map_lookup_elem, .map_update_elem = htab_percpu_map_update_elem, .map_delete_elem = htab_map_delete_elem, + .map_seq_show_elem = htab_percpu_map_seq_show_elem, }; const struct bpf_map_ops htab_lru_percpu_map_ops = { @@ -1303,6 +1333,7 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = { .map_lookup_elem = htab_lru_percpu_map_lookup_elem, .map_update_elem = htab_lru_percpu_map_update_elem, .map_delete_elem = htab_lru_map_delete_elem, + .map_seq_show_elem = htab_percpu_map_seq_show_elem, }; static int fd_htab_map_alloc_check(union bpf_attr *attr) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 1991466b8327..ab0d5e3f9892 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -76,6 +76,49 @@ const struct bpf_func_proto bpf_map_delete_elem_proto = { .arg2_type = ARG_PTR_TO_MAP_KEY, }; +BPF_CALL_3(bpf_map_push_elem, struct bpf_map *, map, void *, value, u64, flags) +{ + return map->ops->map_push_elem(map, value, flags); +} + +const struct bpf_func_proto bpf_map_push_elem_proto = { + .func = bpf_map_push_elem, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_MAP_VALUE, + .arg3_type = ARG_ANYTHING, +}; + +BPF_CALL_2(bpf_map_pop_elem, struct bpf_map *, map, void *, value) +{ + return map->ops->map_pop_elem(map, value); +} + +const struct bpf_func_proto bpf_map_pop_elem_proto = { + .func = bpf_map_pop_elem, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_UNINIT_MAP_VALUE, +}; + +BPF_CALL_2(bpf_map_peek_elem, struct bpf_map *, map, void *, value) +{ + return map->ops->map_peek_elem(map, value); +} + +const struct bpf_func_proto bpf_map_peek_elem_proto = { + .func = bpf_map_pop_elem, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_UNINIT_MAP_VALUE, +}; + const struct bpf_func_proto bpf_get_prandom_u32_proto = { .func = bpf_user_rnd_u32, .gpl_only = false, @@ -194,16 +237,28 @@ const struct bpf_func_proto bpf_get_current_cgroup_id_proto = { .ret_type = RET_INTEGER, }; -DECLARE_PER_CPU(void*, bpf_cgroup_storage); +#ifdef CONFIG_CGROUP_BPF +DECLARE_PER_CPU(struct bpf_cgroup_storage*, + bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]); BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags) { - /* map and flags arguments are not used now, - * but provide an ability to extend the API - * for other types of local storages. - * verifier checks that their values are correct. + /* flags argument is not used now, + * but provides an ability to extend the API. + * verifier checks that its value is correct. */ - return (unsigned long) this_cpu_read(bpf_cgroup_storage); + enum bpf_cgroup_storage_type stype = cgroup_storage_type(map); + struct bpf_cgroup_storage *storage; + void *ptr; + + storage = this_cpu_read(bpf_cgroup_storage[stype]); + + if (stype == BPF_CGROUP_STORAGE_SHARED) + ptr = &READ_ONCE(storage->buf)->data[0]; + else + ptr = this_cpu_ptr(storage->percpu_buf); + + return (unsigned long)ptr; } const struct bpf_func_proto bpf_get_local_storage_proto = { @@ -214,3 +269,4 @@ const struct bpf_func_proto bpf_get_local_storage_proto = { .arg2_type = ARG_ANYTHING, }; #endif +#endif diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 22ad967d1e5f..c97a8f968638 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -7,7 +7,8 @@ #include <linux/rbtree.h> #include <linux/slab.h> -DEFINE_PER_CPU(void*, bpf_cgroup_storage); +DEFINE_PER_CPU(struct bpf_cgroup_storage*, + bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]); #ifdef CONFIG_CGROUP_BPF @@ -129,7 +130,7 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key, struct bpf_cgroup_storage *storage; struct bpf_storage_buffer *new; - if (flags & BPF_NOEXIST) + if (flags != BPF_ANY && flags != BPF_EXIST) return -EINVAL; storage = cgroup_storage_lookup((struct bpf_cgroup_storage_map *)map, @@ -151,6 +152,71 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key, return 0; } +int bpf_percpu_cgroup_storage_copy(struct bpf_map *_map, void *_key, + void *value) +{ + struct bpf_cgroup_storage_map *map = map_to_storage(_map); + struct bpf_cgroup_storage_key *key = _key; + struct bpf_cgroup_storage *storage; + int cpu, off = 0; + u32 size; + + rcu_read_lock(); + storage = cgroup_storage_lookup(map, key, false); + if (!storage) { + rcu_read_unlock(); + return -ENOENT; + } + + /* per_cpu areas are zero-filled and bpf programs can only + * access 'value_size' of them, so copying rounded areas + * will not leak any kernel data + */ + size = round_up(_map->value_size, 8); + for_each_possible_cpu(cpu) { + bpf_long_memcpy(value + off, + per_cpu_ptr(storage->percpu_buf, cpu), size); + off += size; + } + rcu_read_unlock(); + return 0; +} + +int bpf_percpu_cgroup_storage_update(struct bpf_map *_map, void *_key, + void *value, u64 map_flags) +{ + struct bpf_cgroup_storage_map *map = map_to_storage(_map); + struct bpf_cgroup_storage_key *key = _key; + struct bpf_cgroup_storage *storage; + int cpu, off = 0; + u32 size; + + if (map_flags != BPF_ANY && map_flags != BPF_EXIST) + return -EINVAL; + + rcu_read_lock(); + storage = cgroup_storage_lookup(map, key, false); + if (!storage) { + rcu_read_unlock(); + return -ENOENT; + } + + /* the user space will provide round_up(value_size, 8) bytes that + * will be copied into per-cpu area. bpf programs can only access + * value_size of it. During lookup the same extra bytes will be + * returned or zeros which were zero-filled by percpu_alloc, + * so no kernel data leaks possible + */ + size = round_up(_map->value_size, 8); + for_each_possible_cpu(cpu) { + bpf_long_memcpy(per_cpu_ptr(storage->percpu_buf, cpu), + value + off, size); + off += size; + } + rcu_read_unlock(); + return 0; +} + static int cgroup_storage_get_next_key(struct bpf_map *_map, void *_key, void *_next_key) { @@ -195,6 +261,9 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) if (attr->key_size != sizeof(struct bpf_cgroup_storage_key)) return ERR_PTR(-EINVAL); + if (attr->value_size == 0) + return ERR_PTR(-EINVAL); + if (attr->value_size > PAGE_SIZE) return ERR_PTR(-E2BIG); @@ -251,6 +320,7 @@ const struct bpf_map_ops cgroup_storage_map_ops = { int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *_map) { + enum bpf_cgroup_storage_type stype = cgroup_storage_type(_map); struct bpf_cgroup_storage_map *map = map_to_storage(_map); int ret = -EBUSY; @@ -258,11 +328,12 @@ int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *_map) if (map->prog && map->prog != prog) goto unlock; - if (prog->aux->cgroup_storage && prog->aux->cgroup_storage != _map) + if (prog->aux->cgroup_storage[stype] && + prog->aux->cgroup_storage[stype] != _map) goto unlock; map->prog = prog; - prog->aux->cgroup_storage = _map; + prog->aux->cgroup_storage[stype] = _map; ret = 0; unlock: spin_unlock_bh(&map->lock); @@ -272,70 +343,117 @@ unlock: void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *_map) { + enum bpf_cgroup_storage_type stype = cgroup_storage_type(_map); struct bpf_cgroup_storage_map *map = map_to_storage(_map); spin_lock_bh(&map->lock); if (map->prog == prog) { - WARN_ON(prog->aux->cgroup_storage != _map); + WARN_ON(prog->aux->cgroup_storage[stype] != _map); map->prog = NULL; - prog->aux->cgroup_storage = NULL; + prog->aux->cgroup_storage[stype] = NULL; } spin_unlock_bh(&map->lock); } -struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog) +static size_t bpf_cgroup_storage_calculate_size(struct bpf_map *map, u32 *pages) +{ + size_t size; + + if (cgroup_storage_type(map) == BPF_CGROUP_STORAGE_SHARED) { + size = sizeof(struct bpf_storage_buffer) + map->value_size; + *pages = round_up(sizeof(struct bpf_cgroup_storage) + size, + PAGE_SIZE) >> PAGE_SHIFT; + } else { + size = map->value_size; + *pages = round_up(round_up(size, 8) * num_possible_cpus(), + PAGE_SIZE) >> PAGE_SHIFT; + } + + return size; +} + +struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog, + enum bpf_cgroup_storage_type stype) { struct bpf_cgroup_storage *storage; struct bpf_map *map; + gfp_t flags; + size_t size; u32 pages; - map = prog->aux->cgroup_storage; + map = prog->aux->cgroup_storage[stype]; if (!map) return NULL; - pages = round_up(sizeof(struct bpf_cgroup_storage) + - sizeof(struct bpf_storage_buffer) + - map->value_size, PAGE_SIZE) >> PAGE_SHIFT; + size = bpf_cgroup_storage_calculate_size(map, &pages); + if (bpf_map_charge_memlock(map, pages)) return ERR_PTR(-EPERM); storage = kmalloc_node(sizeof(struct bpf_cgroup_storage), __GFP_ZERO | GFP_USER, map->numa_node); - if (!storage) { - bpf_map_uncharge_memlock(map, pages); - return ERR_PTR(-ENOMEM); - } + if (!storage) + goto enomem; - storage->buf = kmalloc_node(sizeof(struct bpf_storage_buffer) + - map->value_size, __GFP_ZERO | GFP_USER, - map->numa_node); - if (!storage->buf) { - bpf_map_uncharge_memlock(map, pages); - kfree(storage); - return ERR_PTR(-ENOMEM); + flags = __GFP_ZERO | GFP_USER; + + if (stype == BPF_CGROUP_STORAGE_SHARED) { + storage->buf = kmalloc_node(size, flags, map->numa_node); + if (!storage->buf) + goto enomem; + } else { + storage->percpu_buf = __alloc_percpu_gfp(size, 8, flags); + if (!storage->percpu_buf) + goto enomem; } storage->map = (struct bpf_cgroup_storage_map *)map; return storage; + +enomem: + bpf_map_uncharge_memlock(map, pages); + kfree(storage); + return ERR_PTR(-ENOMEM); +} + +static void free_shared_cgroup_storage_rcu(struct rcu_head *rcu) +{ + struct bpf_cgroup_storage *storage = + container_of(rcu, struct bpf_cgroup_storage, rcu); + + kfree(storage->buf); + kfree(storage); +} + +static void free_percpu_cgroup_storage_rcu(struct rcu_head *rcu) +{ + struct bpf_cgroup_storage *storage = + container_of(rcu, struct bpf_cgroup_storage, rcu); + + free_percpu(storage->percpu_buf); + kfree(storage); } void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage) { - u32 pages; + enum bpf_cgroup_storage_type stype; struct bpf_map *map; + u32 pages; if (!storage) return; map = &storage->map->map; - pages = round_up(sizeof(struct bpf_cgroup_storage) + - sizeof(struct bpf_storage_buffer) + - map->value_size, PAGE_SIZE) >> PAGE_SHIFT; + + bpf_cgroup_storage_calculate_size(map, &pages); bpf_map_uncharge_memlock(map, pages); - kfree_rcu(storage->buf, rcu); - kfree_rcu(storage, rcu); + stype = cgroup_storage_type(map); + if (stype == BPF_CGROUP_STORAGE_SHARED) + call_rcu(&storage->rcu, free_shared_cgroup_storage_rcu); + else + call_rcu(&storage->rcu, free_percpu_cgroup_storage_rcu); } void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage, diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 3bfbf4464416..99d243e1ad6e 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -24,7 +24,8 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) * in the verifier is not enough. */ if (inner_map->map_type == BPF_MAP_TYPE_PROG_ARRAY || - inner_map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE) { + inner_map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE || + inner_map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) { fdput(f); return ERR_PTR(-ENOTSUPP); } diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 177a52436394..8e93c47f0779 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -172,6 +172,24 @@ int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env, return ret; } +int bpf_prog_offload_finalize(struct bpf_verifier_env *env) +{ + struct bpf_prog_offload *offload; + int ret = -ENODEV; + + down_read(&bpf_devs_lock); + offload = env->prog->aux->offload; + if (offload) { + if (offload->dev_ops->finalize) + ret = offload->dev_ops->finalize(env); + else + ret = 0; + } + up_read(&bpf_devs_lock); + + return ret; +} + static void __bpf_prog_offload_destroy(struct bpf_prog *prog) { struct bpf_prog_offload *offload = prog->aux->offload; diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c new file mode 100644 index 000000000000..12a93fb37449 --- /dev/null +++ b/kernel/bpf/queue_stack_maps.c @@ -0,0 +1,288 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * queue_stack_maps.c: BPF queue and stack maps + * + * Copyright (c) 2018 Politecnico di Torino + */ +#include <linux/bpf.h> +#include <linux/list.h> +#include <linux/slab.h> +#include "percpu_freelist.h" + +#define QUEUE_STACK_CREATE_FLAG_MASK \ + (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) + + +struct bpf_queue_stack { + struct bpf_map map; + raw_spinlock_t lock; + u32 head, tail; + u32 size; /* max_entries + 1 */ + + char elements[0] __aligned(8); +}; + +static struct bpf_queue_stack *bpf_queue_stack(struct bpf_map *map) +{ + return container_of(map, struct bpf_queue_stack, map); +} + +static bool queue_stack_map_is_empty(struct bpf_queue_stack *qs) +{ + return qs->head == qs->tail; +} + +static bool queue_stack_map_is_full(struct bpf_queue_stack *qs) +{ + u32 head = qs->head + 1; + + if (unlikely(head >= qs->size)) + head = 0; + + return head == qs->tail; +} + +/* Called from syscall */ +static int queue_stack_map_alloc_check(union bpf_attr *attr) +{ + /* check sanity of attributes */ + if (attr->max_entries == 0 || attr->key_size != 0 || + attr->map_flags & ~QUEUE_STACK_CREATE_FLAG_MASK) + return -EINVAL; + + if (attr->value_size > KMALLOC_MAX_SIZE) + /* if value_size is bigger, the user space won't be able to + * access the elements. + */ + return -E2BIG; + + return 0; +} + +static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr) +{ + int ret, numa_node = bpf_map_attr_numa_node(attr); + struct bpf_queue_stack *qs; + u32 size, value_size; + u64 queue_size, cost; + + size = attr->max_entries + 1; + value_size = attr->value_size; + + queue_size = sizeof(*qs) + (u64) value_size * size; + + cost = queue_size; + if (cost >= U32_MAX - PAGE_SIZE) + return ERR_PTR(-E2BIG); + + cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + + ret = bpf_map_precharge_memlock(cost); + if (ret < 0) + return ERR_PTR(ret); + + qs = bpf_map_area_alloc(queue_size, numa_node); + if (!qs) + return ERR_PTR(-ENOMEM); + + memset(qs, 0, sizeof(*qs)); + + bpf_map_init_from_attr(&qs->map, attr); + + qs->map.pages = cost; + qs->size = size; + + raw_spin_lock_init(&qs->lock); + + return &qs->map; +} + +/* Called when map->refcnt goes to zero, either from workqueue or from syscall */ +static void queue_stack_map_free(struct bpf_map *map) +{ + struct bpf_queue_stack *qs = bpf_queue_stack(map); + + /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, + * so the programs (can be more than one that used this map) were + * disconnected from events. Wait for outstanding critical sections in + * these programs to complete + */ + synchronize_rcu(); + + bpf_map_area_free(qs); +} + +static int __queue_map_get(struct bpf_map *map, void *value, bool delete) +{ + struct bpf_queue_stack *qs = bpf_queue_stack(map); + unsigned long flags; + int err = 0; + void *ptr; + + raw_spin_lock_irqsave(&qs->lock, flags); + + if (queue_stack_map_is_empty(qs)) { + err = -ENOENT; + goto out; + } + + ptr = &qs->elements[qs->tail * qs->map.value_size]; + memcpy(value, ptr, qs->map.value_size); + + if (delete) { + if (unlikely(++qs->tail >= qs->size)) + qs->tail = 0; + } + +out: + raw_spin_unlock_irqrestore(&qs->lock, flags); + return err; +} + + +static int __stack_map_get(struct bpf_map *map, void *value, bool delete) +{ + struct bpf_queue_stack *qs = bpf_queue_stack(map); + unsigned long flags; + int err = 0; + void *ptr; + u32 index; + + raw_spin_lock_irqsave(&qs->lock, flags); + + if (queue_stack_map_is_empty(qs)) { + err = -ENOENT; + goto out; + } + + index = qs->head - 1; + if (unlikely(index >= qs->size)) + index = qs->size - 1; + + ptr = &qs->elements[index * qs->map.value_size]; + memcpy(value, ptr, qs->map.value_size); + + if (delete) + qs->head = index; + +out: + raw_spin_unlock_irqrestore(&qs->lock, flags); + return err; +} + +/* Called from syscall or from eBPF program */ +static int queue_map_peek_elem(struct bpf_map *map, void *value) +{ + return __queue_map_get(map, value, false); +} + +/* Called from syscall or from eBPF program */ +static int stack_map_peek_elem(struct bpf_map *map, void *value) +{ + return __stack_map_get(map, value, false); +} + +/* Called from syscall or from eBPF program */ +static int queue_map_pop_elem(struct bpf_map *map, void *value) +{ + return __queue_map_get(map, value, true); +} + +/* Called from syscall or from eBPF program */ +static int stack_map_pop_elem(struct bpf_map *map, void *value) +{ + return __stack_map_get(map, value, true); +} + +/* Called from syscall or from eBPF program */ +static int queue_stack_map_push_elem(struct bpf_map *map, void *value, + u64 flags) +{ + struct bpf_queue_stack *qs = bpf_queue_stack(map); + unsigned long irq_flags; + int err = 0; + void *dst; + + /* BPF_EXIST is used to force making room for a new element in case the + * map is full + */ + bool replace = (flags & BPF_EXIST); + + /* Check supported flags for queue and stack maps */ + if (flags & BPF_NOEXIST || flags > BPF_EXIST) + return -EINVAL; + + raw_spin_lock_irqsave(&qs->lock, irq_flags); + + if (queue_stack_map_is_full(qs)) { + if (!replace) { + err = -E2BIG; + goto out; + } + /* advance tail pointer to overwrite oldest element */ + if (unlikely(++qs->tail >= qs->size)) + qs->tail = 0; + } + + dst = &qs->elements[qs->head * qs->map.value_size]; + memcpy(dst, value, qs->map.value_size); + + if (unlikely(++qs->head >= qs->size)) + qs->head = 0; + +out: + raw_spin_unlock_irqrestore(&qs->lock, irq_flags); + return err; +} + +/* Called from syscall or from eBPF program */ +static void *queue_stack_map_lookup_elem(struct bpf_map *map, void *key) +{ + return NULL; +} + +/* Called from syscall or from eBPF program */ +static int queue_stack_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 flags) +{ + return -EINVAL; +} + +/* Called from syscall or from eBPF program */ +static int queue_stack_map_delete_elem(struct bpf_map *map, void *key) +{ + return -EINVAL; +} + +/* Called from syscall */ +static int queue_stack_map_get_next_key(struct bpf_map *map, void *key, + void *next_key) +{ + return -EINVAL; +} + +const struct bpf_map_ops queue_map_ops = { + .map_alloc_check = queue_stack_map_alloc_check, + .map_alloc = queue_stack_map_alloc, + .map_free = queue_stack_map_free, + .map_lookup_elem = queue_stack_map_lookup_elem, + .map_update_elem = queue_stack_map_update_elem, + .map_delete_elem = queue_stack_map_delete_elem, + .map_push_elem = queue_stack_map_push_elem, + .map_pop_elem = queue_map_pop_elem, + .map_peek_elem = queue_map_peek_elem, + .map_get_next_key = queue_stack_map_get_next_key, +}; + +const struct bpf_map_ops stack_map_ops = { + .map_alloc_check = queue_stack_map_alloc_check, + .map_alloc = queue_stack_map_alloc, + .map_free = queue_stack_map_free, + .map_lookup_elem = queue_stack_map_lookup_elem, + .map_update_elem = queue_stack_map_update_elem, + .map_delete_elem = queue_stack_map_delete_elem, + .map_push_elem = queue_stack_map_push_elem, + .map_pop_elem = stack_map_pop_elem, + .map_peek_elem = stack_map_peek_elem, + .map_get_next_key = queue_stack_map_get_next_key, +}; diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c deleted file mode 100644 index 488ef9663c01..000000000000 --- a/kernel/bpf/sockmap.c +++ /dev/null @@ -1,2580 +0,0 @@ -/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - */ - -/* A BPF sock_map is used to store sock objects. This is primarly used - * for doing socket redirect with BPF helper routines. - * - * A sock map may have BPF programs attached to it, currently a program - * used to parse packets and a program to provide a verdict and redirect - * decision on the packet are supported. Any programs attached to a sock - * map are inherited by sock objects when they are added to the map. If - * no BPF programs are attached the sock object may only be used for sock - * redirect. - * - * A sock object may be in multiple maps, but can only inherit a single - * parse or verdict program. If adding a sock object to a map would result - * in having multiple parsing programs the update will return an EBUSY error. - * - * For reference this program is similar to devmap used in XDP context - * reviewing these together may be useful. For an example please review - * ./samples/bpf/sockmap/. - */ -#include <linux/bpf.h> -#include <net/sock.h> -#include <linux/filter.h> -#include <linux/errno.h> -#include <linux/file.h> -#include <linux/kernel.h> -#include <linux/net.h> -#include <linux/skbuff.h> -#include <linux/workqueue.h> -#include <linux/list.h> -#include <linux/mm.h> -#include <net/strparser.h> -#include <net/tcp.h> -#include <linux/ptr_ring.h> -#include <net/inet_common.h> -#include <linux/sched/signal.h> - -#define SOCK_CREATE_FLAG_MASK \ - (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) - -struct bpf_sock_progs { - struct bpf_prog *bpf_tx_msg; - struct bpf_prog *bpf_parse; - struct bpf_prog *bpf_verdict; -}; - -struct bpf_stab { - struct bpf_map map; - struct sock **sock_map; - struct bpf_sock_progs progs; - raw_spinlock_t lock; -}; - -struct bucket { - struct hlist_head head; - raw_spinlock_t lock; -}; - -struct bpf_htab { - struct bpf_map map; - struct bucket *buckets; - atomic_t count; - u32 n_buckets; - u32 elem_size; - struct bpf_sock_progs progs; - struct rcu_head rcu; -}; - -struct htab_elem { - struct rcu_head rcu; - struct hlist_node hash_node; - u32 hash; - struct sock *sk; - char key[0]; -}; - -enum smap_psock_state { - SMAP_TX_RUNNING, -}; - -struct smap_psock_map_entry { - struct list_head list; - struct bpf_map *map; - struct sock **entry; - struct htab_elem __rcu *hash_link; -}; - -struct smap_psock { - struct rcu_head rcu; - refcount_t refcnt; - - /* datapath variables */ - struct sk_buff_head rxqueue; - bool strp_enabled; - - /* datapath error path cache across tx work invocations */ - int save_rem; - int save_off; - struct sk_buff *save_skb; - - /* datapath variables for tx_msg ULP */ - struct sock *sk_redir; - int apply_bytes; - int cork_bytes; - int sg_size; - int eval; - struct sk_msg_buff *cork; - struct list_head ingress; - - struct strparser strp; - struct bpf_prog *bpf_tx_msg; - struct bpf_prog *bpf_parse; - struct bpf_prog *bpf_verdict; - struct list_head maps; - spinlock_t maps_lock; - - /* Back reference used when sock callback trigger sockmap operations */ - struct sock *sock; - unsigned long state; - - struct work_struct tx_work; - struct work_struct gc_work; - - struct proto *sk_proto; - void (*save_close)(struct sock *sk, long timeout); - void (*save_data_ready)(struct sock *sk); - void (*save_write_space)(struct sock *sk); -}; - -static void smap_release_sock(struct smap_psock *psock, struct sock *sock); -static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, - int nonblock, int flags, int *addr_len); -static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); -static int bpf_tcp_sendpage(struct sock *sk, struct page *page, - int offset, size_t size, int flags); -static void bpf_tcp_close(struct sock *sk, long timeout); - -static inline struct smap_psock *smap_psock_sk(const struct sock *sk) -{ - return rcu_dereference_sk_user_data(sk); -} - -static bool bpf_tcp_stream_read(const struct sock *sk) -{ - struct smap_psock *psock; - bool empty = true; - - rcu_read_lock(); - psock = smap_psock_sk(sk); - if (unlikely(!psock)) - goto out; - empty = list_empty(&psock->ingress); -out: - rcu_read_unlock(); - return !empty; -} - -enum { - SOCKMAP_IPV4, - SOCKMAP_IPV6, - SOCKMAP_NUM_PROTS, -}; - -enum { - SOCKMAP_BASE, - SOCKMAP_TX, - SOCKMAP_NUM_CONFIGS, -}; - -static struct proto *saved_tcpv6_prot __read_mostly; -static DEFINE_SPINLOCK(tcpv6_prot_lock); -static struct proto bpf_tcp_prots[SOCKMAP_NUM_PROTS][SOCKMAP_NUM_CONFIGS]; -static void build_protos(struct proto prot[SOCKMAP_NUM_CONFIGS], - struct proto *base) -{ - prot[SOCKMAP_BASE] = *base; - prot[SOCKMAP_BASE].close = bpf_tcp_close; - prot[SOCKMAP_BASE].recvmsg = bpf_tcp_recvmsg; - prot[SOCKMAP_BASE].stream_memory_read = bpf_tcp_stream_read; - - prot[SOCKMAP_TX] = prot[SOCKMAP_BASE]; - prot[SOCKMAP_TX].sendmsg = bpf_tcp_sendmsg; - prot[SOCKMAP_TX].sendpage = bpf_tcp_sendpage; -} - -static void update_sk_prot(struct sock *sk, struct smap_psock *psock) -{ - int family = sk->sk_family == AF_INET6 ? SOCKMAP_IPV6 : SOCKMAP_IPV4; - int conf = psock->bpf_tx_msg ? SOCKMAP_TX : SOCKMAP_BASE; - - sk->sk_prot = &bpf_tcp_prots[family][conf]; -} - -static int bpf_tcp_init(struct sock *sk) -{ - struct smap_psock *psock; - - rcu_read_lock(); - psock = smap_psock_sk(sk); - if (unlikely(!psock)) { - rcu_read_unlock(); - return -EINVAL; - } - - if (unlikely(psock->sk_proto)) { - rcu_read_unlock(); - return -EBUSY; - } - - psock->save_close = sk->sk_prot->close; - psock->sk_proto = sk->sk_prot; - - /* Build IPv6 sockmap whenever the address of tcpv6_prot changes */ - if (sk->sk_family == AF_INET6 && - unlikely(sk->sk_prot != smp_load_acquire(&saved_tcpv6_prot))) { - spin_lock_bh(&tcpv6_prot_lock); - if (likely(sk->sk_prot != saved_tcpv6_prot)) { - build_protos(bpf_tcp_prots[SOCKMAP_IPV6], sk->sk_prot); - smp_store_release(&saved_tcpv6_prot, sk->sk_prot); - } - spin_unlock_bh(&tcpv6_prot_lock); - } - update_sk_prot(sk, psock); - rcu_read_unlock(); - return 0; -} - -static void smap_release_sock(struct smap_psock *psock, struct sock *sock); -static int free_start_sg(struct sock *sk, struct sk_msg_buff *md, bool charge); - -static void bpf_tcp_release(struct sock *sk) -{ - struct smap_psock *psock; - - rcu_read_lock(); - psock = smap_psock_sk(sk); - if (unlikely(!psock)) - goto out; - - if (psock->cork) { - free_start_sg(psock->sock, psock->cork, true); - kfree(psock->cork); - psock->cork = NULL; - } - - if (psock->sk_proto) { - sk->sk_prot = psock->sk_proto; - psock->sk_proto = NULL; - } -out: - rcu_read_unlock(); -} - -static struct htab_elem *lookup_elem_raw(struct hlist_head *head, - u32 hash, void *key, u32 key_size) -{ - struct htab_elem *l; - - hlist_for_each_entry_rcu(l, head, hash_node) { - if (l->hash == hash && !memcmp(&l->key, key, key_size)) - return l; - } - - return NULL; -} - -static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash) -{ - return &htab->buckets[hash & (htab->n_buckets - 1)]; -} - -static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash) -{ - return &__select_bucket(htab, hash)->head; -} - -static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) -{ - atomic_dec(&htab->count); - kfree_rcu(l, rcu); -} - -static struct smap_psock_map_entry *psock_map_pop(struct sock *sk, - struct smap_psock *psock) -{ - struct smap_psock_map_entry *e; - - spin_lock_bh(&psock->maps_lock); - e = list_first_entry_or_null(&psock->maps, - struct smap_psock_map_entry, - list); - if (e) - list_del(&e->list); - spin_unlock_bh(&psock->maps_lock); - return e; -} - -static void bpf_tcp_close(struct sock *sk, long timeout) -{ - void (*close_fun)(struct sock *sk, long timeout); - struct smap_psock_map_entry *e; - struct sk_msg_buff *md, *mtmp; - struct smap_psock *psock; - struct sock *osk; - - lock_sock(sk); - rcu_read_lock(); - psock = smap_psock_sk(sk); - if (unlikely(!psock)) { - rcu_read_unlock(); - release_sock(sk); - return sk->sk_prot->close(sk, timeout); - } - - /* The psock may be destroyed anytime after exiting the RCU critial - * section so by the time we use close_fun the psock may no longer - * be valid. However, bpf_tcp_close is called with the sock lock - * held so the close hook and sk are still valid. - */ - close_fun = psock->save_close; - - if (psock->cork) { - free_start_sg(psock->sock, psock->cork, true); - kfree(psock->cork); - psock->cork = NULL; - } - - list_for_each_entry_safe(md, mtmp, &psock->ingress, list) { - list_del(&md->list); - free_start_sg(psock->sock, md, true); - kfree(md); - } - - e = psock_map_pop(sk, psock); - while (e) { - if (e->entry) { - struct bpf_stab *stab = container_of(e->map, struct bpf_stab, map); - - raw_spin_lock_bh(&stab->lock); - osk = *e->entry; - if (osk == sk) { - *e->entry = NULL; - smap_release_sock(psock, sk); - } - raw_spin_unlock_bh(&stab->lock); - } else { - struct htab_elem *link = rcu_dereference(e->hash_link); - struct bpf_htab *htab = container_of(e->map, struct bpf_htab, map); - struct hlist_head *head; - struct htab_elem *l; - struct bucket *b; - - b = __select_bucket(htab, link->hash); - head = &b->head; - raw_spin_lock_bh(&b->lock); - l = lookup_elem_raw(head, - link->hash, link->key, - htab->map.key_size); - /* If another thread deleted this object skip deletion. - * The refcnt on psock may or may not be zero. - */ - if (l && l == link) { - hlist_del_rcu(&link->hash_node); - smap_release_sock(psock, link->sk); - free_htab_elem(htab, link); - } - raw_spin_unlock_bh(&b->lock); - } - kfree(e); - e = psock_map_pop(sk, psock); - } - rcu_read_unlock(); - release_sock(sk); - close_fun(sk, timeout); -} - -enum __sk_action { - __SK_DROP = 0, - __SK_PASS, - __SK_REDIRECT, - __SK_NONE, -}; - -static struct tcp_ulp_ops bpf_tcp_ulp_ops __read_mostly = { - .name = "bpf_tcp", - .uid = TCP_ULP_BPF, - .user_visible = false, - .owner = NULL, - .init = bpf_tcp_init, - .release = bpf_tcp_release, -}; - -static int memcopy_from_iter(struct sock *sk, - struct sk_msg_buff *md, - struct iov_iter *from, int bytes) -{ - struct scatterlist *sg = md->sg_data; - int i = md->sg_curr, rc = -ENOSPC; - - do { - int copy; - char *to; - - if (md->sg_copybreak >= sg[i].length) { - md->sg_copybreak = 0; - - if (++i == MAX_SKB_FRAGS) - i = 0; - - if (i == md->sg_end) - break; - } - - copy = sg[i].length - md->sg_copybreak; - to = sg_virt(&sg[i]) + md->sg_copybreak; - md->sg_copybreak += copy; - - if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY) - rc = copy_from_iter_nocache(to, copy, from); - else - rc = copy_from_iter(to, copy, from); - - if (rc != copy) { - rc = -EFAULT; - goto out; - } - - bytes -= copy; - if (!bytes) - break; - - md->sg_copybreak = 0; - if (++i == MAX_SKB_FRAGS) - i = 0; - } while (i != md->sg_end); -out: - md->sg_curr = i; - return rc; -} - -static int bpf_tcp_push(struct sock *sk, int apply_bytes, - struct sk_msg_buff *md, - int flags, bool uncharge) -{ - bool apply = apply_bytes; - struct scatterlist *sg; - int offset, ret = 0; - struct page *p; - size_t size; - - while (1) { - sg = md->sg_data + md->sg_start; - size = (apply && apply_bytes < sg->length) ? - apply_bytes : sg->length; - offset = sg->offset; - - tcp_rate_check_app_limited(sk); - p = sg_page(sg); -retry: - ret = do_tcp_sendpages(sk, p, offset, size, flags); - if (ret != size) { - if (ret > 0) { - if (apply) - apply_bytes -= ret; - - sg->offset += ret; - sg->length -= ret; - size -= ret; - offset += ret; - if (uncharge) - sk_mem_uncharge(sk, ret); - goto retry; - } - - return ret; - } - - if (apply) - apply_bytes -= ret; - sg->offset += ret; - sg->length -= ret; - if (uncharge) - sk_mem_uncharge(sk, ret); - - if (!sg->length) { - put_page(p); - md->sg_start++; - if (md->sg_start == MAX_SKB_FRAGS) - md->sg_start = 0; - sg_init_table(sg, 1); - - if (md->sg_start == md->sg_end) - break; - } - - if (apply && !apply_bytes) - break; - } - return 0; -} - -static inline void bpf_compute_data_pointers_sg(struct sk_msg_buff *md) -{ - struct scatterlist *sg = md->sg_data + md->sg_start; - - if (md->sg_copy[md->sg_start]) { - md->data = md->data_end = 0; - } else { - md->data = sg_virt(sg); - md->data_end = md->data + sg->length; - } -} - -static void return_mem_sg(struct sock *sk, int bytes, struct sk_msg_buff *md) -{ - struct scatterlist *sg = md->sg_data; - int i = md->sg_start; - - do { - int uncharge = (bytes < sg[i].length) ? bytes : sg[i].length; - - sk_mem_uncharge(sk, uncharge); - bytes -= uncharge; - if (!bytes) - break; - i++; - if (i == MAX_SKB_FRAGS) - i = 0; - } while (i != md->sg_end); -} - -static void free_bytes_sg(struct sock *sk, int bytes, - struct sk_msg_buff *md, bool charge) -{ - struct scatterlist *sg = md->sg_data; - int i = md->sg_start, free; - - while (bytes && sg[i].length) { - free = sg[i].length; - if (bytes < free) { - sg[i].length -= bytes; - sg[i].offset += bytes; - if (charge) - sk_mem_uncharge(sk, bytes); - break; - } - - if (charge) - sk_mem_uncharge(sk, sg[i].length); - put_page(sg_page(&sg[i])); - bytes -= sg[i].length; - sg[i].length = 0; - sg[i].page_link = 0; - sg[i].offset = 0; - i++; - - if (i == MAX_SKB_FRAGS) - i = 0; - } - md->sg_start = i; -} - -static int free_sg(struct sock *sk, int start, - struct sk_msg_buff *md, bool charge) -{ - struct scatterlist *sg = md->sg_data; - int i = start, free = 0; - - while (sg[i].length) { - free += sg[i].length; - if (charge) - sk_mem_uncharge(sk, sg[i].length); - if (!md->skb) - put_page(sg_page(&sg[i])); - sg[i].length = 0; - sg[i].page_link = 0; - sg[i].offset = 0; - i++; - - if (i == MAX_SKB_FRAGS) - i = 0; - } - if (md->skb) - consume_skb(md->skb); - - return free; -} - -static int free_start_sg(struct sock *sk, struct sk_msg_buff *md, bool charge) -{ - int free = free_sg(sk, md->sg_start, md, charge); - - md->sg_start = md->sg_end; - return free; -} - -static int free_curr_sg(struct sock *sk, struct sk_msg_buff *md) -{ - return free_sg(sk, md->sg_curr, md, true); -} - -static int bpf_map_msg_verdict(int _rc, struct sk_msg_buff *md) -{ - return ((_rc == SK_PASS) ? - (md->sk_redir ? __SK_REDIRECT : __SK_PASS) : - __SK_DROP); -} - -static unsigned int smap_do_tx_msg(struct sock *sk, - struct smap_psock *psock, - struct sk_msg_buff *md) -{ - struct bpf_prog *prog; - unsigned int rc, _rc; - - preempt_disable(); - rcu_read_lock(); - - /* If the policy was removed mid-send then default to 'accept' */ - prog = READ_ONCE(psock->bpf_tx_msg); - if (unlikely(!prog)) { - _rc = SK_PASS; - goto verdict; - } - - bpf_compute_data_pointers_sg(md); - md->sk = sk; - rc = (*prog->bpf_func)(md, prog->insnsi); - psock->apply_bytes = md->apply_bytes; - - /* Moving return codes from UAPI namespace into internal namespace */ - _rc = bpf_map_msg_verdict(rc, md); - - /* The psock has a refcount on the sock but not on the map and because - * we need to drop rcu read lock here its possible the map could be - * removed between here and when we need it to execute the sock - * redirect. So do the map lookup now for future use. - */ - if (_rc == __SK_REDIRECT) { - if (psock->sk_redir) - sock_put(psock->sk_redir); - psock->sk_redir = do_msg_redirect_map(md); - if (!psock->sk_redir) { - _rc = __SK_DROP; - goto verdict; - } - sock_hold(psock->sk_redir); - } -verdict: - rcu_read_unlock(); - preempt_enable(); - - return _rc; -} - -static int bpf_tcp_ingress(struct sock *sk, int apply_bytes, - struct smap_psock *psock, - struct sk_msg_buff *md, int flags) -{ - bool apply = apply_bytes; - size_t size, copied = 0; - struct sk_msg_buff *r; - int err = 0, i; - - r = kzalloc(sizeof(struct sk_msg_buff), __GFP_NOWARN | GFP_KERNEL); - if (unlikely(!r)) - return -ENOMEM; - - lock_sock(sk); - r->sg_start = md->sg_start; - i = md->sg_start; - - do { - size = (apply && apply_bytes < md->sg_data[i].length) ? - apply_bytes : md->sg_data[i].length; - - if (!sk_wmem_schedule(sk, size)) { - if (!copied) - err = -ENOMEM; - break; - } - - sk_mem_charge(sk, size); - r->sg_data[i] = md->sg_data[i]; - r->sg_data[i].length = size; - md->sg_data[i].length -= size; - md->sg_data[i].offset += size; - copied += size; - - if (md->sg_data[i].length) { - get_page(sg_page(&r->sg_data[i])); - r->sg_end = (i + 1) == MAX_SKB_FRAGS ? 0 : i + 1; - } else { - i++; - if (i == MAX_SKB_FRAGS) - i = 0; - r->sg_end = i; - } - - if (apply) { - apply_bytes -= size; - if (!apply_bytes) - break; - } - } while (i != md->sg_end); - - md->sg_start = i; - - if (!err) { - list_add_tail(&r->list, &psock->ingress); - sk->sk_data_ready(sk); - } else { - free_start_sg(sk, r, true); - kfree(r); - } - - release_sock(sk); - return err; -} - -static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send, - struct sk_msg_buff *md, - int flags) -{ - bool ingress = !!(md->flags & BPF_F_INGRESS); - struct smap_psock *psock; - int err = 0; - - rcu_read_lock(); - psock = smap_psock_sk(sk); - if (unlikely(!psock)) - goto out_rcu; - - if (!refcount_inc_not_zero(&psock->refcnt)) - goto out_rcu; - - rcu_read_unlock(); - - if (ingress) { - err = bpf_tcp_ingress(sk, send, psock, md, flags); - } else { - lock_sock(sk); - err = bpf_tcp_push(sk, send, md, flags, false); - release_sock(sk); - } - smap_release_sock(psock, sk); - return err; -out_rcu: - rcu_read_unlock(); - return 0; -} - -static inline void bpf_md_init(struct smap_psock *psock) -{ - if (!psock->apply_bytes) { - psock->eval = __SK_NONE; - if (psock->sk_redir) { - sock_put(psock->sk_redir); - psock->sk_redir = NULL; - } - } -} - -static void apply_bytes_dec(struct smap_psock *psock, int i) -{ - if (psock->apply_bytes) { - if (psock->apply_bytes < i) - psock->apply_bytes = 0; - else - psock->apply_bytes -= i; - } -} - -static int bpf_exec_tx_verdict(struct smap_psock *psock, - struct sk_msg_buff *m, - struct sock *sk, - int *copied, int flags) -{ - bool cork = false, enospc = (m->sg_start == m->sg_end); - struct sock *redir; - int err = 0; - int send; - -more_data: - if (psock->eval == __SK_NONE) - psock->eval = smap_do_tx_msg(sk, psock, m); - - if (m->cork_bytes && - m->cork_bytes > psock->sg_size && !enospc) { - psock->cork_bytes = m->cork_bytes - psock->sg_size; - if (!psock->cork) { - psock->cork = kcalloc(1, - sizeof(struct sk_msg_buff), - GFP_ATOMIC | __GFP_NOWARN); - - if (!psock->cork) { - err = -ENOMEM; - goto out_err; - } - } - memcpy(psock->cork, m, sizeof(*m)); - goto out_err; - } - - send = psock->sg_size; - if (psock->apply_bytes && psock->apply_bytes < send) - send = psock->apply_bytes; - - switch (psock->eval) { - case __SK_PASS: - err = bpf_tcp_push(sk, send, m, flags, true); - if (unlikely(err)) { - *copied -= free_start_sg(sk, m, true); - break; - } - - apply_bytes_dec(psock, send); - psock->sg_size -= send; - break; - case __SK_REDIRECT: - redir = psock->sk_redir; - apply_bytes_dec(psock, send); - - if (psock->cork) { - cork = true; - psock->cork = NULL; - } - - return_mem_sg(sk, send, m); - release_sock(sk); - - err = bpf_tcp_sendmsg_do_redirect(redir, send, m, flags); - lock_sock(sk); - - if (unlikely(err < 0)) { - int free = free_start_sg(sk, m, false); - - psock->sg_size = 0; - if (!cork) - *copied -= free; - } else { - psock->sg_size -= send; - } - - if (cork) { - free_start_sg(sk, m, true); - psock->sg_size = 0; - kfree(m); - m = NULL; - err = 0; - } - break; - case __SK_DROP: - default: - free_bytes_sg(sk, send, m, true); - apply_bytes_dec(psock, send); - *copied -= send; - psock->sg_size -= send; - err = -EACCES; - break; - } - - if (likely(!err)) { - bpf_md_init(psock); - if (m && - m->sg_data[m->sg_start].page_link && - m->sg_data[m->sg_start].length) - goto more_data; - } - -out_err: - return err; -} - -static int bpf_wait_data(struct sock *sk, - struct smap_psock *psk, int flags, - long timeo, int *err) -{ - int rc; - - DEFINE_WAIT_FUNC(wait, woken_wake_function); - - add_wait_queue(sk_sleep(sk), &wait); - sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); - rc = sk_wait_event(sk, &timeo, - !list_empty(&psk->ingress) || - !skb_queue_empty(&sk->sk_receive_queue), - &wait); - sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); - remove_wait_queue(sk_sleep(sk), &wait); - - return rc; -} - -static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, - int nonblock, int flags, int *addr_len) -{ - struct iov_iter *iter = &msg->msg_iter; - struct smap_psock *psock; - int copied = 0; - - if (unlikely(flags & MSG_ERRQUEUE)) - return inet_recv_error(sk, msg, len, addr_len); - if (!skb_queue_empty(&sk->sk_receive_queue)) - return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); - - rcu_read_lock(); - psock = smap_psock_sk(sk); - if (unlikely(!psock)) - goto out; - - if (unlikely(!refcount_inc_not_zero(&psock->refcnt))) - goto out; - rcu_read_unlock(); - - lock_sock(sk); -bytes_ready: - while (copied != len) { - struct scatterlist *sg; - struct sk_msg_buff *md; - int i; - - md = list_first_entry_or_null(&psock->ingress, - struct sk_msg_buff, list); - if (unlikely(!md)) - break; - i = md->sg_start; - do { - struct page *page; - int n, copy; - - sg = &md->sg_data[i]; - copy = sg->length; - page = sg_page(sg); - - if (copied + copy > len) - copy = len - copied; - - n = copy_page_to_iter(page, sg->offset, copy, iter); - if (n != copy) { - md->sg_start = i; - release_sock(sk); - smap_release_sock(psock, sk); - return -EFAULT; - } - - copied += copy; - sg->offset += copy; - sg->length -= copy; - sk_mem_uncharge(sk, copy); - - if (!sg->length) { - i++; - if (i == MAX_SKB_FRAGS) - i = 0; - if (!md->skb) - put_page(page); - } - if (copied == len) - break; - } while (i != md->sg_end); - md->sg_start = i; - - if (!sg->length && md->sg_start == md->sg_end) { - list_del(&md->list); - if (md->skb) - consume_skb(md->skb); - kfree(md); - } - } - - if (!copied) { - long timeo; - int data; - int err = 0; - - timeo = sock_rcvtimeo(sk, nonblock); - data = bpf_wait_data(sk, psock, flags, timeo, &err); - - if (data) { - if (!skb_queue_empty(&sk->sk_receive_queue)) { - release_sock(sk); - smap_release_sock(psock, sk); - copied = tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); - return copied; - } - goto bytes_ready; - } - - if (err) - copied = err; - } - - release_sock(sk); - smap_release_sock(psock, sk); - return copied; -out: - rcu_read_unlock(); - return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); -} - - -static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) -{ - int flags = msg->msg_flags | MSG_NO_SHARED_FRAGS; - struct sk_msg_buff md = {0}; - unsigned int sg_copy = 0; - struct smap_psock *psock; - int copied = 0, err = 0; - struct scatterlist *sg; - long timeo; - - /* Its possible a sock event or user removed the psock _but_ the ops - * have not been reprogrammed yet so we get here. In this case fallback - * to tcp_sendmsg. Note this only works because we _only_ ever allow - * a single ULP there is no hierarchy here. - */ - rcu_read_lock(); - psock = smap_psock_sk(sk); - if (unlikely(!psock)) { - rcu_read_unlock(); - return tcp_sendmsg(sk, msg, size); - } - - /* Increment the psock refcnt to ensure its not released while sending a - * message. Required because sk lookup and bpf programs are used in - * separate rcu critical sections. Its OK if we lose the map entry - * but we can't lose the sock reference. - */ - if (!refcount_inc_not_zero(&psock->refcnt)) { - rcu_read_unlock(); - return tcp_sendmsg(sk, msg, size); - } - - sg = md.sg_data; - sg_init_marker(sg, MAX_SKB_FRAGS); - rcu_read_unlock(); - - lock_sock(sk); - timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); - - while (msg_data_left(msg)) { - struct sk_msg_buff *m = NULL; - bool enospc = false; - int copy; - - if (sk->sk_err) { - err = -sk->sk_err; - goto out_err; - } - - copy = msg_data_left(msg); - if (!sk_stream_memory_free(sk)) - goto wait_for_sndbuf; - - m = psock->cork_bytes ? psock->cork : &md; - m->sg_curr = m->sg_copybreak ? m->sg_curr : m->sg_end; - err = sk_alloc_sg(sk, copy, m->sg_data, - m->sg_start, &m->sg_end, &sg_copy, - m->sg_end - 1); - if (err) { - if (err != -ENOSPC) - goto wait_for_memory; - enospc = true; - copy = sg_copy; - } - - err = memcopy_from_iter(sk, m, &msg->msg_iter, copy); - if (err < 0) { - free_curr_sg(sk, m); - goto out_err; - } - - psock->sg_size += copy; - copied += copy; - sg_copy = 0; - - /* When bytes are being corked skip running BPF program and - * applying verdict unless there is no more buffer space. In - * the ENOSPC case simply run BPF prorgram with currently - * accumulated data. We don't have much choice at this point - * we could try extending the page frags or chaining complex - * frags but even in these cases _eventually_ we will hit an - * OOM scenario. More complex recovery schemes may be - * implemented in the future, but BPF programs must handle - * the case where apply_cork requests are not honored. The - * canonical method to verify this is to check data length. - */ - if (psock->cork_bytes) { - if (copy > psock->cork_bytes) - psock->cork_bytes = 0; - else - psock->cork_bytes -= copy; - - if (psock->cork_bytes && !enospc) - goto out_cork; - - /* All cork bytes accounted for re-run filter */ - psock->eval = __SK_NONE; - psock->cork_bytes = 0; - } - - err = bpf_exec_tx_verdict(psock, m, sk, &copied, flags); - if (unlikely(err < 0)) - goto out_err; - continue; -wait_for_sndbuf: - set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); -wait_for_memory: - err = sk_stream_wait_memory(sk, &timeo); - if (err) { - if (m && m != psock->cork) - free_start_sg(sk, m, true); - goto out_err; - } - } -out_err: - if (err < 0) - err = sk_stream_error(sk, msg->msg_flags, err); -out_cork: - release_sock(sk); - smap_release_sock(psock, sk); - return copied ? copied : err; -} - -static int bpf_tcp_sendpage(struct sock *sk, struct page *page, - int offset, size_t size, int flags) -{ - struct sk_msg_buff md = {0}, *m = NULL; - int err = 0, copied = 0; - struct smap_psock *psock; - struct scatterlist *sg; - bool enospc = false; - - rcu_read_lock(); - psock = smap_psock_sk(sk); - if (unlikely(!psock)) - goto accept; - - if (!refcount_inc_not_zero(&psock->refcnt)) - goto accept; - rcu_read_unlock(); - - lock_sock(sk); - - if (psock->cork_bytes) { - m = psock->cork; - sg = &m->sg_data[m->sg_end]; - } else { - m = &md; - sg = m->sg_data; - sg_init_marker(sg, MAX_SKB_FRAGS); - } - - /* Catch case where ring is full and sendpage is stalled. */ - if (unlikely(m->sg_end == m->sg_start && - m->sg_data[m->sg_end].length)) - goto out_err; - - psock->sg_size += size; - sg_set_page(sg, page, size, offset); - get_page(page); - m->sg_copy[m->sg_end] = true; - sk_mem_charge(sk, size); - m->sg_end++; - copied = size; - - if (m->sg_end == MAX_SKB_FRAGS) - m->sg_end = 0; - - if (m->sg_end == m->sg_start) - enospc = true; - - if (psock->cork_bytes) { - if (size > psock->cork_bytes) - psock->cork_bytes = 0; - else - psock->cork_bytes -= size; - - if (psock->cork_bytes && !enospc) - goto out_err; - - /* All cork bytes accounted for re-run filter */ - psock->eval = __SK_NONE; - psock->cork_bytes = 0; - } - - err = bpf_exec_tx_verdict(psock, m, sk, &copied, flags); -out_err: - release_sock(sk); - smap_release_sock(psock, sk); - return copied ? copied : err; -accept: - rcu_read_unlock(); - return tcp_sendpage(sk, page, offset, size, flags); -} - -static void bpf_tcp_msg_add(struct smap_psock *psock, - struct sock *sk, - struct bpf_prog *tx_msg) -{ - struct bpf_prog *orig_tx_msg; - - orig_tx_msg = xchg(&psock->bpf_tx_msg, tx_msg); - if (orig_tx_msg) - bpf_prog_put(orig_tx_msg); -} - -static int bpf_tcp_ulp_register(void) -{ - build_protos(bpf_tcp_prots[SOCKMAP_IPV4], &tcp_prot); - /* Once BPF TX ULP is registered it is never unregistered. It - * will be in the ULP list for the lifetime of the system. Doing - * duplicate registers is not a problem. - */ - return tcp_register_ulp(&bpf_tcp_ulp_ops); -} - -static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb) -{ - struct bpf_prog *prog = READ_ONCE(psock->bpf_verdict); - int rc; - - if (unlikely(!prog)) - return __SK_DROP; - - skb_orphan(skb); - /* We need to ensure that BPF metadata for maps is also cleared - * when we orphan the skb so that we don't have the possibility - * to reference a stale map. - */ - TCP_SKB_CB(skb)->bpf.sk_redir = NULL; - skb->sk = psock->sock; - bpf_compute_data_end_sk_skb(skb); - preempt_disable(); - rc = (*prog->bpf_func)(skb, prog->insnsi); - preempt_enable(); - skb->sk = NULL; - - /* Moving return codes from UAPI namespace into internal namespace */ - return rc == SK_PASS ? - (TCP_SKB_CB(skb)->bpf.sk_redir ? __SK_REDIRECT : __SK_PASS) : - __SK_DROP; -} - -static int smap_do_ingress(struct smap_psock *psock, struct sk_buff *skb) -{ - struct sock *sk = psock->sock; - int copied = 0, num_sg; - struct sk_msg_buff *r; - - r = kzalloc(sizeof(struct sk_msg_buff), __GFP_NOWARN | GFP_ATOMIC); - if (unlikely(!r)) - return -EAGAIN; - - if (!sk_rmem_schedule(sk, skb, skb->len)) { - kfree(r); - return -EAGAIN; - } - - sg_init_table(r->sg_data, MAX_SKB_FRAGS); - num_sg = skb_to_sgvec(skb, r->sg_data, 0, skb->len); - if (unlikely(num_sg < 0)) { - kfree(r); - return num_sg; - } - sk_mem_charge(sk, skb->len); - copied = skb->len; - r->sg_start = 0; - r->sg_end = num_sg == MAX_SKB_FRAGS ? 0 : num_sg; - r->skb = skb; - list_add_tail(&r->list, &psock->ingress); - sk->sk_data_ready(sk); - return copied; -} - -static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb) -{ - struct smap_psock *peer; - struct sock *sk; - __u32 in; - int rc; - - rc = smap_verdict_func(psock, skb); - switch (rc) { - case __SK_REDIRECT: - sk = do_sk_redirect_map(skb); - if (!sk) { - kfree_skb(skb); - break; - } - - peer = smap_psock_sk(sk); - in = (TCP_SKB_CB(skb)->bpf.flags) & BPF_F_INGRESS; - - if (unlikely(!peer || sock_flag(sk, SOCK_DEAD) || - !test_bit(SMAP_TX_RUNNING, &peer->state))) { - kfree_skb(skb); - break; - } - - if (!in && sock_writeable(sk)) { - skb_set_owner_w(skb, sk); - skb_queue_tail(&peer->rxqueue, skb); - schedule_work(&peer->tx_work); - break; - } else if (in && - atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) { - skb_queue_tail(&peer->rxqueue, skb); - schedule_work(&peer->tx_work); - break; - } - /* Fall through and free skb otherwise */ - case __SK_DROP: - default: - kfree_skb(skb); - } -} - -static void smap_report_sk_error(struct smap_psock *psock, int err) -{ - struct sock *sk = psock->sock; - - sk->sk_err = err; - sk->sk_error_report(sk); -} - -static void smap_read_sock_strparser(struct strparser *strp, - struct sk_buff *skb) -{ - struct smap_psock *psock; - - rcu_read_lock(); - psock = container_of(strp, struct smap_psock, strp); - smap_do_verdict(psock, skb); - rcu_read_unlock(); -} - -/* Called with lock held on socket */ -static void smap_data_ready(struct sock *sk) -{ - struct smap_psock *psock; - - rcu_read_lock(); - psock = smap_psock_sk(sk); - if (likely(psock)) { - write_lock_bh(&sk->sk_callback_lock); - strp_data_ready(&psock->strp); - write_unlock_bh(&sk->sk_callback_lock); - } - rcu_read_unlock(); -} - -static void smap_tx_work(struct work_struct *w) -{ - struct smap_psock *psock; - struct sk_buff *skb; - int rem, off, n; - - psock = container_of(w, struct smap_psock, tx_work); - - /* lock sock to avoid losing sk_socket at some point during loop */ - lock_sock(psock->sock); - if (psock->save_skb) { - skb = psock->save_skb; - rem = psock->save_rem; - off = psock->save_off; - psock->save_skb = NULL; - goto start; - } - - while ((skb = skb_dequeue(&psock->rxqueue))) { - __u32 flags; - - rem = skb->len; - off = 0; -start: - flags = (TCP_SKB_CB(skb)->bpf.flags) & BPF_F_INGRESS; - do { - if (likely(psock->sock->sk_socket)) { - if (flags) - n = smap_do_ingress(psock, skb); - else - n = skb_send_sock_locked(psock->sock, - skb, off, rem); - } else { - n = -EINVAL; - } - - if (n <= 0) { - if (n == -EAGAIN) { - /* Retry when space is available */ - psock->save_skb = skb; - psock->save_rem = rem; - psock->save_off = off; - goto out; - } - /* Hard errors break pipe and stop xmit */ - smap_report_sk_error(psock, n ? -n : EPIPE); - clear_bit(SMAP_TX_RUNNING, &psock->state); - kfree_skb(skb); - goto out; - } - rem -= n; - off += n; - } while (rem); - - if (!flags) - kfree_skb(skb); - } -out: - release_sock(psock->sock); -} - -static void smap_write_space(struct sock *sk) -{ - struct smap_psock *psock; - void (*write_space)(struct sock *sk); - - rcu_read_lock(); - psock = smap_psock_sk(sk); - if (likely(psock && test_bit(SMAP_TX_RUNNING, &psock->state))) - schedule_work(&psock->tx_work); - write_space = psock->save_write_space; - rcu_read_unlock(); - write_space(sk); -} - -static void smap_stop_sock(struct smap_psock *psock, struct sock *sk) -{ - if (!psock->strp_enabled) - return; - sk->sk_data_ready = psock->save_data_ready; - sk->sk_write_space = psock->save_write_space; - psock->save_data_ready = NULL; - psock->save_write_space = NULL; - strp_stop(&psock->strp); - psock->strp_enabled = false; -} - -static void smap_destroy_psock(struct rcu_head *rcu) -{ - struct smap_psock *psock = container_of(rcu, - struct smap_psock, rcu); - - /* Now that a grace period has passed there is no longer - * any reference to this sock in the sockmap so we can - * destroy the psock, strparser, and bpf programs. But, - * because we use workqueue sync operations we can not - * do it in rcu context - */ - schedule_work(&psock->gc_work); -} - -static bool psock_is_smap_sk(struct sock *sk) -{ - return inet_csk(sk)->icsk_ulp_ops == &bpf_tcp_ulp_ops; -} - -static void smap_release_sock(struct smap_psock *psock, struct sock *sock) -{ - if (refcount_dec_and_test(&psock->refcnt)) { - if (psock_is_smap_sk(sock)) - tcp_cleanup_ulp(sock); - write_lock_bh(&sock->sk_callback_lock); - smap_stop_sock(psock, sock); - write_unlock_bh(&sock->sk_callback_lock); - clear_bit(SMAP_TX_RUNNING, &psock->state); - rcu_assign_sk_user_data(sock, NULL); - call_rcu_sched(&psock->rcu, smap_destroy_psock); - } -} - -static int smap_parse_func_strparser(struct strparser *strp, - struct sk_buff *skb) -{ - struct smap_psock *psock; - struct bpf_prog *prog; - int rc; - - rcu_read_lock(); - psock = container_of(strp, struct smap_psock, strp); - prog = READ_ONCE(psock->bpf_parse); - - if (unlikely(!prog)) { - rcu_read_unlock(); - return skb->len; - } - - /* Attach socket for bpf program to use if needed we can do this - * because strparser clones the skb before handing it to a upper - * layer, meaning skb_orphan has been called. We NULL sk on the - * way out to ensure we don't trigger a BUG_ON in skb/sk operations - * later and because we are not charging the memory of this skb to - * any socket yet. - */ - skb->sk = psock->sock; - bpf_compute_data_end_sk_skb(skb); - rc = (*prog->bpf_func)(skb, prog->insnsi); - skb->sk = NULL; - rcu_read_unlock(); - return rc; -} - -static int smap_read_sock_done(struct strparser *strp, int err) -{ - return err; -} - -static int smap_init_sock(struct smap_psock *psock, - struct sock *sk) -{ - static const struct strp_callbacks cb = { - .rcv_msg = smap_read_sock_strparser, - .parse_msg = smap_parse_func_strparser, - .read_sock_done = smap_read_sock_done, - }; - - return strp_init(&psock->strp, sk, &cb); -} - -static void smap_init_progs(struct smap_psock *psock, - struct bpf_prog *verdict, - struct bpf_prog *parse) -{ - struct bpf_prog *orig_parse, *orig_verdict; - - orig_parse = xchg(&psock->bpf_parse, parse); - orig_verdict = xchg(&psock->bpf_verdict, verdict); - - if (orig_verdict) - bpf_prog_put(orig_verdict); - if (orig_parse) - bpf_prog_put(orig_parse); -} - -static void smap_start_sock(struct smap_psock *psock, struct sock *sk) -{ - if (sk->sk_data_ready == smap_data_ready) - return; - psock->save_data_ready = sk->sk_data_ready; - psock->save_write_space = sk->sk_write_space; - sk->sk_data_ready = smap_data_ready; - sk->sk_write_space = smap_write_space; - psock->strp_enabled = true; -} - -static void sock_map_remove_complete(struct bpf_stab *stab) -{ - bpf_map_area_free(stab->sock_map); - kfree(stab); -} - -static void smap_gc_work(struct work_struct *w) -{ - struct smap_psock_map_entry *e, *tmp; - struct sk_msg_buff *md, *mtmp; - struct smap_psock *psock; - - psock = container_of(w, struct smap_psock, gc_work); - - /* no callback lock needed because we already detached sockmap ops */ - if (psock->strp_enabled) - strp_done(&psock->strp); - - cancel_work_sync(&psock->tx_work); - __skb_queue_purge(&psock->rxqueue); - - /* At this point all strparser and xmit work must be complete */ - if (psock->bpf_parse) - bpf_prog_put(psock->bpf_parse); - if (psock->bpf_verdict) - bpf_prog_put(psock->bpf_verdict); - if (psock->bpf_tx_msg) - bpf_prog_put(psock->bpf_tx_msg); - - if (psock->cork) { - free_start_sg(psock->sock, psock->cork, true); - kfree(psock->cork); - } - - list_for_each_entry_safe(md, mtmp, &psock->ingress, list) { - list_del(&md->list); - free_start_sg(psock->sock, md, true); - kfree(md); - } - - list_for_each_entry_safe(e, tmp, &psock->maps, list) { - list_del(&e->list); - kfree(e); - } - - if (psock->sk_redir) - sock_put(psock->sk_redir); - - sock_put(psock->sock); - kfree(psock); -} - -static struct smap_psock *smap_init_psock(struct sock *sock, int node) -{ - struct smap_psock *psock; - - psock = kzalloc_node(sizeof(struct smap_psock), - GFP_ATOMIC | __GFP_NOWARN, - node); - if (!psock) - return ERR_PTR(-ENOMEM); - - psock->eval = __SK_NONE; - psock->sock = sock; - skb_queue_head_init(&psock->rxqueue); - INIT_WORK(&psock->tx_work, smap_tx_work); - INIT_WORK(&psock->gc_work, smap_gc_work); - INIT_LIST_HEAD(&psock->maps); - INIT_LIST_HEAD(&psock->ingress); - refcount_set(&psock->refcnt, 1); - spin_lock_init(&psock->maps_lock); - - rcu_assign_sk_user_data(sock, psock); - sock_hold(sock); - return psock; -} - -static struct bpf_map *sock_map_alloc(union bpf_attr *attr) -{ - struct bpf_stab *stab; - u64 cost; - int err; - - if (!capable(CAP_NET_ADMIN)) - return ERR_PTR(-EPERM); - - /* check sanity of attributes */ - if (attr->max_entries == 0 || attr->key_size != 4 || - attr->value_size != 4 || attr->map_flags & ~SOCK_CREATE_FLAG_MASK) - return ERR_PTR(-EINVAL); - - err = bpf_tcp_ulp_register(); - if (err && err != -EEXIST) - return ERR_PTR(err); - - stab = kzalloc(sizeof(*stab), GFP_USER); - if (!stab) - return ERR_PTR(-ENOMEM); - - bpf_map_init_from_attr(&stab->map, attr); - raw_spin_lock_init(&stab->lock); - - /* make sure page count doesn't overflow */ - cost = (u64) stab->map.max_entries * sizeof(struct sock *); - err = -EINVAL; - if (cost >= U32_MAX - PAGE_SIZE) - goto free_stab; - - stab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - - /* if map size is larger than memlock limit, reject it early */ - err = bpf_map_precharge_memlock(stab->map.pages); - if (err) - goto free_stab; - - err = -ENOMEM; - stab->sock_map = bpf_map_area_alloc(stab->map.max_entries * - sizeof(struct sock *), - stab->map.numa_node); - if (!stab->sock_map) - goto free_stab; - - return &stab->map; -free_stab: - kfree(stab); - return ERR_PTR(err); -} - -static void smap_list_map_remove(struct smap_psock *psock, - struct sock **entry) -{ - struct smap_psock_map_entry *e, *tmp; - - spin_lock_bh(&psock->maps_lock); - list_for_each_entry_safe(e, tmp, &psock->maps, list) { - if (e->entry == entry) { - list_del(&e->list); - kfree(e); - } - } - spin_unlock_bh(&psock->maps_lock); -} - -static void smap_list_hash_remove(struct smap_psock *psock, - struct htab_elem *hash_link) -{ - struct smap_psock_map_entry *e, *tmp; - - spin_lock_bh(&psock->maps_lock); - list_for_each_entry_safe(e, tmp, &psock->maps, list) { - struct htab_elem *c = rcu_dereference(e->hash_link); - - if (c == hash_link) { - list_del(&e->list); - kfree(e); - } - } - spin_unlock_bh(&psock->maps_lock); -} - -static void sock_map_free(struct bpf_map *map) -{ - struct bpf_stab *stab = container_of(map, struct bpf_stab, map); - int i; - - synchronize_rcu(); - - /* At this point no update, lookup or delete operations can happen. - * However, be aware we can still get a socket state event updates, - * and data ready callabacks that reference the psock from sk_user_data - * Also psock worker threads are still in-flight. So smap_release_sock - * will only free the psock after cancel_sync on the worker threads - * and a grace period expire to ensure psock is really safe to remove. - */ - rcu_read_lock(); - raw_spin_lock_bh(&stab->lock); - for (i = 0; i < stab->map.max_entries; i++) { - struct smap_psock *psock; - struct sock *sock; - - sock = stab->sock_map[i]; - if (!sock) - continue; - stab->sock_map[i] = NULL; - psock = smap_psock_sk(sock); - /* This check handles a racing sock event that can get the - * sk_callback_lock before this case but after xchg happens - * causing the refcnt to hit zero and sock user data (psock) - * to be null and queued for garbage collection. - */ - if (likely(psock)) { - smap_list_map_remove(psock, &stab->sock_map[i]); - smap_release_sock(psock, sock); - } - } - raw_spin_unlock_bh(&stab->lock); - rcu_read_unlock(); - - sock_map_remove_complete(stab); -} - -static int sock_map_get_next_key(struct bpf_map *map, void *key, void *next_key) -{ - struct bpf_stab *stab = container_of(map, struct bpf_stab, map); - u32 i = key ? *(u32 *)key : U32_MAX; - u32 *next = (u32 *)next_key; - - if (i >= stab->map.max_entries) { - *next = 0; - return 0; - } - - if (i == stab->map.max_entries - 1) - return -ENOENT; - - *next = i + 1; - return 0; -} - -struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key) -{ - struct bpf_stab *stab = container_of(map, struct bpf_stab, map); - - if (key >= map->max_entries) - return NULL; - - return READ_ONCE(stab->sock_map[key]); -} - -static int sock_map_delete_elem(struct bpf_map *map, void *key) -{ - struct bpf_stab *stab = container_of(map, struct bpf_stab, map); - struct smap_psock *psock; - int k = *(u32 *)key; - struct sock *sock; - - if (k >= map->max_entries) - return -EINVAL; - - raw_spin_lock_bh(&stab->lock); - sock = stab->sock_map[k]; - stab->sock_map[k] = NULL; - raw_spin_unlock_bh(&stab->lock); - if (!sock) - return -EINVAL; - - psock = smap_psock_sk(sock); - if (!psock) - return 0; - if (psock->bpf_parse) { - write_lock_bh(&sock->sk_callback_lock); - smap_stop_sock(psock, sock); - write_unlock_bh(&sock->sk_callback_lock); - } - smap_list_map_remove(psock, &stab->sock_map[k]); - smap_release_sock(psock, sock); - return 0; -} - -/* Locking notes: Concurrent updates, deletes, and lookups are allowed and are - * done inside rcu critical sections. This ensures on updates that the psock - * will not be released via smap_release_sock() until concurrent updates/deletes - * complete. All operations operate on sock_map using cmpxchg and xchg - * operations to ensure we do not get stale references. Any reads into the - * map must be done with READ_ONCE() because of this. - * - * A psock is destroyed via call_rcu and after any worker threads are cancelled - * and syncd so we are certain all references from the update/lookup/delete - * operations as well as references in the data path are no longer in use. - * - * Psocks may exist in multiple maps, but only a single set of parse/verdict - * programs may be inherited from the maps it belongs to. A reference count - * is kept with the total number of references to the psock from all maps. The - * psock will not be released until this reaches zero. The psock and sock - * user data data use the sk_callback_lock to protect critical data structures - * from concurrent access. This allows us to avoid two updates from modifying - * the user data in sock and the lock is required anyways for modifying - * callbacks, we simply increase its scope slightly. - * - * Rules to follow, - * - psock must always be read inside RCU critical section - * - sk_user_data must only be modified inside sk_callback_lock and read - * inside RCU critical section. - * - psock->maps list must only be read & modified inside sk_callback_lock - * - sock_map must use READ_ONCE and (cmp)xchg operations - * - BPF verdict/parse programs must use READ_ONCE and xchg operations - */ - -static int __sock_map_ctx_update_elem(struct bpf_map *map, - struct bpf_sock_progs *progs, - struct sock *sock, - void *key) -{ - struct bpf_prog *verdict, *parse, *tx_msg; - struct smap_psock *psock; - bool new = false; - int err = 0; - - /* 1. If sock map has BPF programs those will be inherited by the - * sock being added. If the sock is already attached to BPF programs - * this results in an error. - */ - verdict = READ_ONCE(progs->bpf_verdict); - parse = READ_ONCE(progs->bpf_parse); - tx_msg = READ_ONCE(progs->bpf_tx_msg); - - if (parse && verdict) { - /* bpf prog refcnt may be zero if a concurrent attach operation - * removes the program after the above READ_ONCE() but before - * we increment the refcnt. If this is the case abort with an - * error. - */ - verdict = bpf_prog_inc_not_zero(verdict); - if (IS_ERR(verdict)) - return PTR_ERR(verdict); - - parse = bpf_prog_inc_not_zero(parse); - if (IS_ERR(parse)) { - bpf_prog_put(verdict); - return PTR_ERR(parse); - } - } - - if (tx_msg) { - tx_msg = bpf_prog_inc_not_zero(tx_msg); - if (IS_ERR(tx_msg)) { - if (parse && verdict) { - bpf_prog_put(parse); - bpf_prog_put(verdict); - } - return PTR_ERR(tx_msg); - } - } - - psock = smap_psock_sk(sock); - - /* 2. Do not allow inheriting programs if psock exists and has - * already inherited programs. This would create confusion on - * which parser/verdict program is running. If no psock exists - * create one. Inside sk_callback_lock to ensure concurrent create - * doesn't update user data. - */ - if (psock) { - if (!psock_is_smap_sk(sock)) { - err = -EBUSY; - goto out_progs; - } - if (READ_ONCE(psock->bpf_parse) && parse) { - err = -EBUSY; - goto out_progs; - } - if (READ_ONCE(psock->bpf_tx_msg) && tx_msg) { - err = -EBUSY; - goto out_progs; - } - if (!refcount_inc_not_zero(&psock->refcnt)) { - err = -EAGAIN; - goto out_progs; - } - } else { - psock = smap_init_psock(sock, map->numa_node); - if (IS_ERR(psock)) { - err = PTR_ERR(psock); - goto out_progs; - } - - set_bit(SMAP_TX_RUNNING, &psock->state); - new = true; - } - - /* 3. At this point we have a reference to a valid psock that is - * running. Attach any BPF programs needed. - */ - if (tx_msg) - bpf_tcp_msg_add(psock, sock, tx_msg); - if (new) { - err = tcp_set_ulp_id(sock, TCP_ULP_BPF); - if (err) - goto out_free; - } - - if (parse && verdict && !psock->strp_enabled) { - err = smap_init_sock(psock, sock); - if (err) - goto out_free; - smap_init_progs(psock, verdict, parse); - write_lock_bh(&sock->sk_callback_lock); - smap_start_sock(psock, sock); - write_unlock_bh(&sock->sk_callback_lock); - } - - return err; -out_free: - smap_release_sock(psock, sock); -out_progs: - if (parse && verdict) { - bpf_prog_put(parse); - bpf_prog_put(verdict); - } - if (tx_msg) - bpf_prog_put(tx_msg); - return err; -} - -static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, - struct bpf_map *map, - void *key, u64 flags) -{ - struct bpf_stab *stab = container_of(map, struct bpf_stab, map); - struct bpf_sock_progs *progs = &stab->progs; - struct sock *osock, *sock = skops->sk; - struct smap_psock_map_entry *e; - struct smap_psock *psock; - u32 i = *(u32 *)key; - int err; - - if (unlikely(flags > BPF_EXIST)) - return -EINVAL; - if (unlikely(i >= stab->map.max_entries)) - return -E2BIG; - - e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN); - if (!e) - return -ENOMEM; - - err = __sock_map_ctx_update_elem(map, progs, sock, key); - if (err) - goto out; - - /* psock guaranteed to be present. */ - psock = smap_psock_sk(sock); - raw_spin_lock_bh(&stab->lock); - osock = stab->sock_map[i]; - if (osock && flags == BPF_NOEXIST) { - err = -EEXIST; - goto out_unlock; - } - if (!osock && flags == BPF_EXIST) { - err = -ENOENT; - goto out_unlock; - } - - e->entry = &stab->sock_map[i]; - e->map = map; - spin_lock_bh(&psock->maps_lock); - list_add_tail(&e->list, &psock->maps); - spin_unlock_bh(&psock->maps_lock); - - stab->sock_map[i] = sock; - if (osock) { - psock = smap_psock_sk(osock); - smap_list_map_remove(psock, &stab->sock_map[i]); - smap_release_sock(psock, osock); - } - raw_spin_unlock_bh(&stab->lock); - return 0; -out_unlock: - smap_release_sock(psock, sock); - raw_spin_unlock_bh(&stab->lock); -out: - kfree(e); - return err; -} - -int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type) -{ - struct bpf_sock_progs *progs; - struct bpf_prog *orig; - - if (map->map_type == BPF_MAP_TYPE_SOCKMAP) { - struct bpf_stab *stab = container_of(map, struct bpf_stab, map); - - progs = &stab->progs; - } else if (map->map_type == BPF_MAP_TYPE_SOCKHASH) { - struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - - progs = &htab->progs; - } else { - return -EINVAL; - } - - switch (type) { - case BPF_SK_MSG_VERDICT: - orig = xchg(&progs->bpf_tx_msg, prog); - break; - case BPF_SK_SKB_STREAM_PARSER: - orig = xchg(&progs->bpf_parse, prog); - break; - case BPF_SK_SKB_STREAM_VERDICT: - orig = xchg(&progs->bpf_verdict, prog); - break; - default: - return -EOPNOTSUPP; - } - - if (orig) - bpf_prog_put(orig); - - return 0; -} - -int sockmap_get_from_fd(const union bpf_attr *attr, int type, - struct bpf_prog *prog) -{ - int ufd = attr->target_fd; - struct bpf_map *map; - struct fd f; - int err; - - f = fdget(ufd); - map = __bpf_map_get(f); - if (IS_ERR(map)) - return PTR_ERR(map); - - err = sock_map_prog(map, prog, attr->attach_type); - fdput(f); - return err; -} - -static void *sock_map_lookup(struct bpf_map *map, void *key) -{ - return NULL; -} - -static int sock_map_update_elem(struct bpf_map *map, - void *key, void *value, u64 flags) -{ - struct bpf_sock_ops_kern skops; - u32 fd = *(u32 *)value; - struct socket *socket; - int err; - - socket = sockfd_lookup(fd, &err); - if (!socket) - return err; - - skops.sk = socket->sk; - if (!skops.sk) { - fput(socket->file); - return -EINVAL; - } - - if (skops.sk->sk_type != SOCK_STREAM || - skops.sk->sk_protocol != IPPROTO_TCP) { - fput(socket->file); - return -EOPNOTSUPP; - } - - lock_sock(skops.sk); - preempt_disable(); - rcu_read_lock(); - err = sock_map_ctx_update_elem(&skops, map, key, flags); - rcu_read_unlock(); - preempt_enable(); - release_sock(skops.sk); - fput(socket->file); - return err; -} - -static void sock_map_release(struct bpf_map *map) -{ - struct bpf_sock_progs *progs; - struct bpf_prog *orig; - - if (map->map_type == BPF_MAP_TYPE_SOCKMAP) { - struct bpf_stab *stab = container_of(map, struct bpf_stab, map); - - progs = &stab->progs; - } else { - struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - - progs = &htab->progs; - } - - orig = xchg(&progs->bpf_parse, NULL); - if (orig) - bpf_prog_put(orig); - orig = xchg(&progs->bpf_verdict, NULL); - if (orig) - bpf_prog_put(orig); - - orig = xchg(&progs->bpf_tx_msg, NULL); - if (orig) - bpf_prog_put(orig); -} - -static struct bpf_map *sock_hash_alloc(union bpf_attr *attr) -{ - struct bpf_htab *htab; - int i, err; - u64 cost; - - if (!capable(CAP_NET_ADMIN)) - return ERR_PTR(-EPERM); - - /* check sanity of attributes */ - if (attr->max_entries == 0 || - attr->key_size == 0 || - attr->value_size != 4 || - attr->map_flags & ~SOCK_CREATE_FLAG_MASK) - return ERR_PTR(-EINVAL); - - if (attr->key_size > MAX_BPF_STACK) - /* eBPF programs initialize keys on stack, so they cannot be - * larger than max stack size - */ - return ERR_PTR(-E2BIG); - - err = bpf_tcp_ulp_register(); - if (err && err != -EEXIST) - return ERR_PTR(err); - - htab = kzalloc(sizeof(*htab), GFP_USER); - if (!htab) - return ERR_PTR(-ENOMEM); - - bpf_map_init_from_attr(&htab->map, attr); - - htab->n_buckets = roundup_pow_of_two(htab->map.max_entries); - htab->elem_size = sizeof(struct htab_elem) + - round_up(htab->map.key_size, 8); - err = -EINVAL; - if (htab->n_buckets == 0 || - htab->n_buckets > U32_MAX / sizeof(struct bucket)) - goto free_htab; - - cost = (u64) htab->n_buckets * sizeof(struct bucket) + - (u64) htab->elem_size * htab->map.max_entries; - - if (cost >= U32_MAX - PAGE_SIZE) - goto free_htab; - - htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - err = bpf_map_precharge_memlock(htab->map.pages); - if (err) - goto free_htab; - - err = -ENOMEM; - htab->buckets = bpf_map_area_alloc( - htab->n_buckets * sizeof(struct bucket), - htab->map.numa_node); - if (!htab->buckets) - goto free_htab; - - for (i = 0; i < htab->n_buckets; i++) { - INIT_HLIST_HEAD(&htab->buckets[i].head); - raw_spin_lock_init(&htab->buckets[i].lock); - } - - return &htab->map; -free_htab: - kfree(htab); - return ERR_PTR(err); -} - -static void __bpf_htab_free(struct rcu_head *rcu) -{ - struct bpf_htab *htab; - - htab = container_of(rcu, struct bpf_htab, rcu); - bpf_map_area_free(htab->buckets); - kfree(htab); -} - -static void sock_hash_free(struct bpf_map *map) -{ - struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - int i; - - synchronize_rcu(); - - /* At this point no update, lookup or delete operations can happen. - * However, be aware we can still get a socket state event updates, - * and data ready callabacks that reference the psock from sk_user_data - * Also psock worker threads are still in-flight. So smap_release_sock - * will only free the psock after cancel_sync on the worker threads - * and a grace period expire to ensure psock is really safe to remove. - */ - rcu_read_lock(); - for (i = 0; i < htab->n_buckets; i++) { - struct bucket *b = __select_bucket(htab, i); - struct hlist_head *head; - struct hlist_node *n; - struct htab_elem *l; - - raw_spin_lock_bh(&b->lock); - head = &b->head; - hlist_for_each_entry_safe(l, n, head, hash_node) { - struct sock *sock = l->sk; - struct smap_psock *psock; - - hlist_del_rcu(&l->hash_node); - psock = smap_psock_sk(sock); - /* This check handles a racing sock event that can get - * the sk_callback_lock before this case but after xchg - * causing the refcnt to hit zero and sock user data - * (psock) to be null and queued for garbage collection. - */ - if (likely(psock)) { - smap_list_hash_remove(psock, l); - smap_release_sock(psock, sock); - } - free_htab_elem(htab, l); - } - raw_spin_unlock_bh(&b->lock); - } - rcu_read_unlock(); - call_rcu(&htab->rcu, __bpf_htab_free); -} - -static struct htab_elem *alloc_sock_hash_elem(struct bpf_htab *htab, - void *key, u32 key_size, u32 hash, - struct sock *sk, - struct htab_elem *old_elem) -{ - struct htab_elem *l_new; - - if (atomic_inc_return(&htab->count) > htab->map.max_entries) { - if (!old_elem) { - atomic_dec(&htab->count); - return ERR_PTR(-E2BIG); - } - } - l_new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN, - htab->map.numa_node); - if (!l_new) { - atomic_dec(&htab->count); - return ERR_PTR(-ENOMEM); - } - - memcpy(l_new->key, key, key_size); - l_new->sk = sk; - l_new->hash = hash; - return l_new; -} - -static inline u32 htab_map_hash(const void *key, u32 key_len) -{ - return jhash(key, key_len, 0); -} - -static int sock_hash_get_next_key(struct bpf_map *map, - void *key, void *next_key) -{ - struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - struct htab_elem *l, *next_l; - struct hlist_head *h; - u32 hash, key_size; - int i = 0; - - WARN_ON_ONCE(!rcu_read_lock_held()); - - key_size = map->key_size; - if (!key) - goto find_first_elem; - hash = htab_map_hash(key, key_size); - h = select_bucket(htab, hash); - - l = lookup_elem_raw(h, hash, key, key_size); - if (!l) - goto find_first_elem; - next_l = hlist_entry_safe( - rcu_dereference_raw(hlist_next_rcu(&l->hash_node)), - struct htab_elem, hash_node); - if (next_l) { - memcpy(next_key, next_l->key, key_size); - return 0; - } - - /* no more elements in this hash list, go to the next bucket */ - i = hash & (htab->n_buckets - 1); - i++; - -find_first_elem: - /* iterate over buckets */ - for (; i < htab->n_buckets; i++) { - h = select_bucket(htab, i); - - /* pick first element in the bucket */ - next_l = hlist_entry_safe( - rcu_dereference_raw(hlist_first_rcu(h)), - struct htab_elem, hash_node); - if (next_l) { - /* if it's not empty, just return it */ - memcpy(next_key, next_l->key, key_size); - return 0; - } - } - - /* iterated over all buckets and all elements */ - return -ENOENT; -} - -static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops, - struct bpf_map *map, - void *key, u64 map_flags) -{ - struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - struct bpf_sock_progs *progs = &htab->progs; - struct htab_elem *l_new = NULL, *l_old; - struct smap_psock_map_entry *e = NULL; - struct hlist_head *head; - struct smap_psock *psock; - u32 key_size, hash; - struct sock *sock; - struct bucket *b; - int err; - - sock = skops->sk; - - if (sock->sk_type != SOCK_STREAM || - sock->sk_protocol != IPPROTO_TCP) - return -EOPNOTSUPP; - - if (unlikely(map_flags > BPF_EXIST)) - return -EINVAL; - - e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN); - if (!e) - return -ENOMEM; - - WARN_ON_ONCE(!rcu_read_lock_held()); - key_size = map->key_size; - hash = htab_map_hash(key, key_size); - b = __select_bucket(htab, hash); - head = &b->head; - - err = __sock_map_ctx_update_elem(map, progs, sock, key); - if (err) - goto err; - - /* psock is valid here because otherwise above *ctx_update_elem would - * have thrown an error. It is safe to skip error check. - */ - psock = smap_psock_sk(sock); - raw_spin_lock_bh(&b->lock); - l_old = lookup_elem_raw(head, hash, key, key_size); - if (l_old && map_flags == BPF_NOEXIST) { - err = -EEXIST; - goto bucket_err; - } - if (!l_old && map_flags == BPF_EXIST) { - err = -ENOENT; - goto bucket_err; - } - - l_new = alloc_sock_hash_elem(htab, key, key_size, hash, sock, l_old); - if (IS_ERR(l_new)) { - err = PTR_ERR(l_new); - goto bucket_err; - } - - rcu_assign_pointer(e->hash_link, l_new); - e->map = map; - spin_lock_bh(&psock->maps_lock); - list_add_tail(&e->list, &psock->maps); - spin_unlock_bh(&psock->maps_lock); - - /* add new element to the head of the list, so that - * concurrent search will find it before old elem - */ - hlist_add_head_rcu(&l_new->hash_node, head); - if (l_old) { - psock = smap_psock_sk(l_old->sk); - - hlist_del_rcu(&l_old->hash_node); - smap_list_hash_remove(psock, l_old); - smap_release_sock(psock, l_old->sk); - free_htab_elem(htab, l_old); - } - raw_spin_unlock_bh(&b->lock); - return 0; -bucket_err: - smap_release_sock(psock, sock); - raw_spin_unlock_bh(&b->lock); -err: - kfree(e); - return err; -} - -static int sock_hash_update_elem(struct bpf_map *map, - void *key, void *value, u64 flags) -{ - struct bpf_sock_ops_kern skops; - u32 fd = *(u32 *)value; - struct socket *socket; - int err; - - socket = sockfd_lookup(fd, &err); - if (!socket) - return err; - - skops.sk = socket->sk; - if (!skops.sk) { - fput(socket->file); - return -EINVAL; - } - - lock_sock(skops.sk); - preempt_disable(); - rcu_read_lock(); - err = sock_hash_ctx_update_elem(&skops, map, key, flags); - rcu_read_unlock(); - preempt_enable(); - release_sock(skops.sk); - fput(socket->file); - return err; -} - -static int sock_hash_delete_elem(struct bpf_map *map, void *key) -{ - struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - struct hlist_head *head; - struct bucket *b; - struct htab_elem *l; - u32 hash, key_size; - int ret = -ENOENT; - - key_size = map->key_size; - hash = htab_map_hash(key, key_size); - b = __select_bucket(htab, hash); - head = &b->head; - - raw_spin_lock_bh(&b->lock); - l = lookup_elem_raw(head, hash, key, key_size); - if (l) { - struct sock *sock = l->sk; - struct smap_psock *psock; - - hlist_del_rcu(&l->hash_node); - psock = smap_psock_sk(sock); - /* This check handles a racing sock event that can get the - * sk_callback_lock before this case but after xchg happens - * causing the refcnt to hit zero and sock user data (psock) - * to be null and queued for garbage collection. - */ - if (likely(psock)) { - smap_list_hash_remove(psock, l); - smap_release_sock(psock, sock); - } - free_htab_elem(htab, l); - ret = 0; - } - raw_spin_unlock_bh(&b->lock); - return ret; -} - -struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key) -{ - struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - struct hlist_head *head; - struct htab_elem *l; - u32 key_size, hash; - struct bucket *b; - struct sock *sk; - - key_size = map->key_size; - hash = htab_map_hash(key, key_size); - b = __select_bucket(htab, hash); - head = &b->head; - - l = lookup_elem_raw(head, hash, key, key_size); - sk = l ? l->sk : NULL; - return sk; -} - -const struct bpf_map_ops sock_map_ops = { - .map_alloc = sock_map_alloc, - .map_free = sock_map_free, - .map_lookup_elem = sock_map_lookup, - .map_get_next_key = sock_map_get_next_key, - .map_update_elem = sock_map_update_elem, - .map_delete_elem = sock_map_delete_elem, - .map_release_uref = sock_map_release, - .map_check_btf = map_check_no_btf, -}; - -const struct bpf_map_ops sock_hash_ops = { - .map_alloc = sock_hash_alloc, - .map_free = sock_hash_free, - .map_lookup_elem = sock_map_lookup, - .map_get_next_key = sock_hash_get_next_key, - .map_update_elem = sock_hash_update_elem, - .map_delete_elem = sock_hash_delete_elem, - .map_release_uref = sock_map_release, - .map_check_btf = map_check_no_btf, -}; - -BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock, - struct bpf_map *, map, void *, key, u64, flags) -{ - WARN_ON_ONCE(!rcu_read_lock_held()); - return sock_map_ctx_update_elem(bpf_sock, map, key, flags); -} - -const struct bpf_func_proto bpf_sock_map_update_proto = { - .func = bpf_sock_map_update, - .gpl_only = false, - .pkt_access = true, - .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_CONST_MAP_PTR, - .arg3_type = ARG_PTR_TO_MAP_KEY, - .arg4_type = ARG_ANYTHING, -}; - -BPF_CALL_4(bpf_sock_hash_update, struct bpf_sock_ops_kern *, bpf_sock, - struct bpf_map *, map, void *, key, u64, flags) -{ - WARN_ON_ONCE(!rcu_read_lock_held()); - return sock_hash_ctx_update_elem(bpf_sock, map, key, flags); -} - -const struct bpf_func_proto bpf_sock_hash_update_proto = { - .func = bpf_sock_hash_update, - .gpl_only = false, - .pkt_access = true, - .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_CONST_MAP_PTR, - .arg3_type = ARG_PTR_TO_MAP_KEY, - .arg4_type = ARG_ANYTHING, -}; diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 8061a439ef18..90daf285de03 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -505,7 +505,7 @@ const struct bpf_func_proto bpf_get_stack_proto = { /* Called from eBPF program */ static void *stack_map_lookup_elem(struct bpf_map *map, void *key) { - return NULL; + return ERR_PTR(-EOPNOTSUPP); } /* Called from syscall */ @@ -600,7 +600,7 @@ static void stack_map_free(struct bpf_map *map) put_callchain_buffers(); } -const struct bpf_map_ops stack_map_ops = { +const struct bpf_map_ops stack_trace_map_ops = { .map_alloc = stack_map_alloc, .map_free = stack_map_free, .map_get_next_key = stack_map_get_next_key, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 8339d81cba1d..ccb93277aae2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -30,7 +30,6 @@ #include <linux/cred.h> #include <linux/timekeeping.h> #include <linux/ctype.h> -#include <linux/btf.h> #include <linux/nospec.h> #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \ @@ -652,6 +651,17 @@ int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) return -ENOTSUPP; } +static void *__bpf_copy_key(void __user *ukey, u64 key_size) +{ + if (key_size) + return memdup_user(ukey, key_size); + + if (ukey) + return ERR_PTR(-EINVAL); + + return NULL; +} + /* last field in 'union bpf_attr' used by this command */ #define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value @@ -679,7 +689,7 @@ static int map_lookup_elem(union bpf_attr *attr) goto err_put; } - key = memdup_user(ukey, map->key_size); + key = __bpf_copy_key(ukey, map->key_size); if (IS_ERR(key)) { err = PTR_ERR(key); goto err_put; @@ -687,7 +697,8 @@ static int map_lookup_elem(union bpf_attr *attr) if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || - map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) + map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY || + map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) value_size = round_up(map->value_size, 8) * num_possible_cpus(); else if (IS_FD_MAP(map)) value_size = sizeof(u32); @@ -706,6 +717,8 @@ static int map_lookup_elem(union bpf_attr *attr) err = bpf_percpu_hash_copy(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { err = bpf_percpu_array_copy(map, key, value); + } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) { + err = bpf_percpu_cgroup_storage_copy(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { err = bpf_stackmap_copy(map, key, value); } else if (IS_FD_ARRAY(map)) { @@ -714,13 +727,21 @@ static int map_lookup_elem(union bpf_attr *attr) err = bpf_fd_htab_map_lookup_elem(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) { err = bpf_fd_reuseport_array_lookup_elem(map, key, value); + } else if (map->map_type == BPF_MAP_TYPE_QUEUE || + map->map_type == BPF_MAP_TYPE_STACK) { + err = map->ops->map_peek_elem(map, value); } else { rcu_read_lock(); ptr = map->ops->map_lookup_elem(map, key); - if (ptr) + if (IS_ERR(ptr)) { + err = PTR_ERR(ptr); + } else if (!ptr) { + err = -ENOENT; + } else { + err = 0; memcpy(value, ptr, value_size); + } rcu_read_unlock(); - err = ptr ? 0 : -ENOENT; } if (err) @@ -741,6 +762,17 @@ err_put: return err; } +static void maybe_wait_bpf_programs(struct bpf_map *map) +{ + /* Wait for any running BPF programs to complete so that + * userspace, when we return to it, knows that all programs + * that could be running use the new map value. + */ + if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS || + map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) + synchronize_rcu(); +} + #define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags static int map_update_elem(union bpf_attr *attr) @@ -767,7 +799,7 @@ static int map_update_elem(union bpf_attr *attr) goto err_put; } - key = memdup_user(ukey, map->key_size); + key = __bpf_copy_key(ukey, map->key_size); if (IS_ERR(key)) { err = PTR_ERR(key); goto err_put; @@ -775,7 +807,8 @@ static int map_update_elem(union bpf_attr *attr) if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || - map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) + map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY || + map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) value_size = round_up(map->value_size, 8) * num_possible_cpus(); else value_size = map->value_size; @@ -810,6 +843,9 @@ static int map_update_elem(union bpf_attr *attr) err = bpf_percpu_hash_update(map, key, value, attr->flags); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { err = bpf_percpu_array_update(map, key, value, attr->flags); + } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) { + err = bpf_percpu_cgroup_storage_update(map, key, value, + attr->flags); } else if (IS_FD_ARRAY(map)) { rcu_read_lock(); err = bpf_fd_array_map_update_elem(map, f.file, key, value, @@ -824,6 +860,9 @@ static int map_update_elem(union bpf_attr *attr) /* rcu_read_lock() is not needed */ err = bpf_fd_reuseport_array_update_elem(map, key, value, attr->flags); + } else if (map->map_type == BPF_MAP_TYPE_QUEUE || + map->map_type == BPF_MAP_TYPE_STACK) { + err = map->ops->map_push_elem(map, value, attr->flags); } else { rcu_read_lock(); err = map->ops->map_update_elem(map, key, value, attr->flags); @@ -831,6 +870,7 @@ static int map_update_elem(union bpf_attr *attr) } __this_cpu_dec(bpf_prog_active); preempt_enable(); + maybe_wait_bpf_programs(map); out: free_value: kfree(value); @@ -865,7 +905,7 @@ static int map_delete_elem(union bpf_attr *attr) goto err_put; } - key = memdup_user(ukey, map->key_size); + key = __bpf_copy_key(ukey, map->key_size); if (IS_ERR(key)) { err = PTR_ERR(key); goto err_put; @@ -883,6 +923,7 @@ static int map_delete_elem(union bpf_attr *attr) rcu_read_unlock(); __this_cpu_dec(bpf_prog_active); preempt_enable(); + maybe_wait_bpf_programs(map); out: kfree(key); err_put: @@ -917,7 +958,7 @@ static int map_get_next_key(union bpf_attr *attr) } if (ukey) { - key = memdup_user(ukey, map->key_size); + key = __bpf_copy_key(ukey, map->key_size); if (IS_ERR(key)) { err = PTR_ERR(key); goto err_put; @@ -958,6 +999,69 @@ err_put: return err; } +#define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD value + +static int map_lookup_and_delete_elem(union bpf_attr *attr) +{ + void __user *ukey = u64_to_user_ptr(attr->key); + void __user *uvalue = u64_to_user_ptr(attr->value); + int ufd = attr->map_fd; + struct bpf_map *map; + void *key, *value; + u32 value_size; + struct fd f; + int err; + + if (CHECK_ATTR(BPF_MAP_LOOKUP_AND_DELETE_ELEM)) + return -EINVAL; + + f = fdget(ufd); + map = __bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + + if (!(f.file->f_mode & FMODE_CAN_WRITE)) { + err = -EPERM; + goto err_put; + } + + key = __bpf_copy_key(ukey, map->key_size); + if (IS_ERR(key)) { + err = PTR_ERR(key); + goto err_put; + } + + value_size = map->value_size; + + err = -ENOMEM; + value = kmalloc(value_size, GFP_USER | __GFP_NOWARN); + if (!value) + goto free_key; + + if (map->map_type == BPF_MAP_TYPE_QUEUE || + map->map_type == BPF_MAP_TYPE_STACK) { + err = map->ops->map_pop_elem(map, value); + } else { + err = -ENOTSUPP; + } + + if (err) + goto free_value; + + if (copy_to_user(uvalue, value, value_size) != 0) + goto free_value; + + err = 0; + +free_value: + kfree(value); +free_key: + kfree(key); +err_put: + fdput(f); + return err; +} + static const struct bpf_prog_ops * const bpf_prog_types[] = { #define BPF_PROG_TYPE(_id, _name) \ [_id] = & _name ## _prog_ops, @@ -989,10 +1093,15 @@ static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) /* drop refcnt on maps used by eBPF program and free auxilary data */ static void free_used_maps(struct bpf_prog_aux *aux) { + enum bpf_cgroup_storage_type stype; int i; - if (aux->cgroup_storage) - bpf_cgroup_storage_release(aux->prog, aux->cgroup_storage); + for_each_cgroup_storage_type(stype) { + if (!aux->cgroup_storage[stype]) + continue; + bpf_cgroup_storage_release(aux->prog, + aux->cgroup_storage[stype]); + } for (i = 0; i < aux->used_map_cnt; i++) bpf_map_put(aux->used_maps[i]); @@ -1616,6 +1725,9 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_LIRC_MODE2: ptype = BPF_PROG_TYPE_LIRC_MODE2; break; + case BPF_FLOW_DISSECTOR: + ptype = BPF_PROG_TYPE_FLOW_DISSECTOR; + break; default: return -EINVAL; } @@ -1632,11 +1744,14 @@ static int bpf_prog_attach(const union bpf_attr *attr) switch (ptype) { case BPF_PROG_TYPE_SK_SKB: case BPF_PROG_TYPE_SK_MSG: - ret = sockmap_get_from_fd(attr, ptype, prog); + ret = sock_map_get_from_fd(attr, prog); break; case BPF_PROG_TYPE_LIRC_MODE2: ret = lirc_prog_attach(attr, prog); break; + case BPF_PROG_TYPE_FLOW_DISSECTOR: + ret = skb_flow_dissector_bpf_prog_attach(attr, prog); + break; default: ret = cgroup_bpf_prog_attach(attr, ptype, prog); } @@ -1683,12 +1798,14 @@ static int bpf_prog_detach(const union bpf_attr *attr) ptype = BPF_PROG_TYPE_CGROUP_DEVICE; break; case BPF_SK_MSG_VERDICT: - return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, NULL); + return sock_map_get_from_fd(attr, NULL); case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: - return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, NULL); + return sock_map_get_from_fd(attr, NULL); case BPF_LIRC_MODE2: return lirc_prog_detach(attr); + case BPF_FLOW_DISSECTOR: + return skb_flow_dissector_bpf_prog_detach(attr); default: return -EINVAL; } @@ -2418,6 +2535,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_TASK_FD_QUERY: err = bpf_task_fd_query(&attr, uattr); break; + case BPF_MAP_LOOKUP_AND_DELETE_ELEM: + err = map_lookup_and_delete_elem(&attr); + break; default: err = -EINVAL; break; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 92246117d2b0..98fa0be35370 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1,5 +1,6 @@ /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com * Copyright (c) 2016 Facebook + * Copyright (c) 2018 Covalent IO, Inc. http://covalent.io * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -80,8 +81,8 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = { * (like pointer plus pointer becomes SCALAR_VALUE type) * * When verifier sees load or store instructions the type of base register - * can be: PTR_TO_MAP_VALUE, PTR_TO_CTX, PTR_TO_STACK. These are three pointer - * types recognized by check_mem_access() function. + * can be: PTR_TO_MAP_VALUE, PTR_TO_CTX, PTR_TO_STACK, PTR_TO_SOCKET. These are + * four pointer types recognized by check_mem_access() function. * * PTR_TO_MAP_VALUE means that this register is pointing to 'map element value' * and the range of [ptr, ptr + map's value_size) is accessible. @@ -140,6 +141,24 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = { * * After the call R0 is set to return type of the function and registers R1-R5 * are set to NOT_INIT to indicate that they are no longer readable. + * + * The following reference types represent a potential reference to a kernel + * resource which, after first being allocated, must be checked and freed by + * the BPF program: + * - PTR_TO_SOCKET_OR_NULL, PTR_TO_SOCKET + * + * When the verifier sees a helper call return a reference type, it allocates a + * pointer id for the reference and stores it in the current function state. + * Similar to the way that PTR_TO_MAP_VALUE_OR_NULL is converted into + * PTR_TO_MAP_VALUE, PTR_TO_SOCKET_OR_NULL becomes PTR_TO_SOCKET when the type + * passes through a NULL-check conditional. For the branch wherein the state is + * changed to CONST_IMM, the verifier releases the reference. + * + * For each helper function that allocates a reference, such as + * bpf_sk_lookup_tcp(), there is a corresponding release function, such as + * bpf_sk_release(). When a reference type passes into the release function, + * the verifier also releases the reference. If any unchecked or unreleased + * reference remains at the end of the program, the verifier rejects it. */ /* verifier_state + insn_idx are pushed to stack when branch is encountered */ @@ -189,6 +208,7 @@ struct bpf_call_arg_meta { int access_size; s64 msize_smax_value; u64 msize_umax_value; + int ptr_id; }; static DEFINE_MUTEX(bpf_verifier_lock); @@ -249,6 +269,46 @@ static bool type_is_pkt_pointer(enum bpf_reg_type type) type == PTR_TO_PACKET_META; } +static bool reg_type_may_be_null(enum bpf_reg_type type) +{ + return type == PTR_TO_MAP_VALUE_OR_NULL || + type == PTR_TO_SOCKET_OR_NULL; +} + +static bool type_is_refcounted(enum bpf_reg_type type) +{ + return type == PTR_TO_SOCKET; +} + +static bool type_is_refcounted_or_null(enum bpf_reg_type type) +{ + return type == PTR_TO_SOCKET || type == PTR_TO_SOCKET_OR_NULL; +} + +static bool reg_is_refcounted(const struct bpf_reg_state *reg) +{ + return type_is_refcounted(reg->type); +} + +static bool reg_is_refcounted_or_null(const struct bpf_reg_state *reg) +{ + return type_is_refcounted_or_null(reg->type); +} + +static bool arg_type_is_refcounted(enum bpf_arg_type type) +{ + return type == ARG_PTR_TO_SOCKET; +} + +/* Determine whether the function releases some resources allocated by another + * function call. The first reference type argument will be assumed to be + * released by release_reference(). + */ +static bool is_release_function(enum bpf_func_id func_id) +{ + return func_id == BPF_FUNC_sk_release; +} + /* string representation of 'enum bpf_reg_type' */ static const char * const reg_type_str[] = { [NOT_INIT] = "?", @@ -261,6 +321,16 @@ static const char * const reg_type_str[] = { [PTR_TO_PACKET] = "pkt", [PTR_TO_PACKET_META] = "pkt_meta", [PTR_TO_PACKET_END] = "pkt_end", + [PTR_TO_FLOW_KEYS] = "flow_keys", + [PTR_TO_SOCKET] = "sock", + [PTR_TO_SOCKET_OR_NULL] = "sock_or_null", +}; + +static char slot_type_char[] = { + [STACK_INVALID] = '?', + [STACK_SPILL] = 'r', + [STACK_MISC] = 'm', + [STACK_ZERO] = '0', }; static void print_liveness(struct bpf_verifier_env *env, @@ -349,72 +419,179 @@ static void print_verifier_state(struct bpf_verifier_env *env, } } for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { - if (state->stack[i].slot_type[0] == STACK_SPILL) { - verbose(env, " fp%d", - (-i - 1) * BPF_REG_SIZE); - print_liveness(env, state->stack[i].spilled_ptr.live); + char types_buf[BPF_REG_SIZE + 1]; + bool valid = false; + int j; + + for (j = 0; j < BPF_REG_SIZE; j++) { + if (state->stack[i].slot_type[j] != STACK_INVALID) + valid = true; + types_buf[j] = slot_type_char[ + state->stack[i].slot_type[j]]; + } + types_buf[BPF_REG_SIZE] = 0; + if (!valid) + continue; + verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); + print_liveness(env, state->stack[i].spilled_ptr.live); + if (state->stack[i].slot_type[0] == STACK_SPILL) verbose(env, "=%s", reg_type_str[state->stack[i].spilled_ptr.type]); - } - if (state->stack[i].slot_type[0] == STACK_ZERO) - verbose(env, " fp%d=0", (-i - 1) * BPF_REG_SIZE); + else + verbose(env, "=%s", types_buf); + } + if (state->acquired_refs && state->refs[0].id) { + verbose(env, " refs=%d", state->refs[0].id); + for (i = 1; i < state->acquired_refs; i++) + if (state->refs[i].id) + verbose(env, ",%d", state->refs[i].id); } verbose(env, "\n"); } -static int copy_stack_state(struct bpf_func_state *dst, - const struct bpf_func_state *src) -{ - if (!src->stack) - return 0; - if (WARN_ON_ONCE(dst->allocated_stack < src->allocated_stack)) { - /* internal bug, make state invalid to reject the program */ - memset(dst, 0, sizeof(*dst)); - return -EFAULT; - } - memcpy(dst->stack, src->stack, - sizeof(*src->stack) * (src->allocated_stack / BPF_REG_SIZE)); - return 0; -} +#define COPY_STATE_FN(NAME, COUNT, FIELD, SIZE) \ +static int copy_##NAME##_state(struct bpf_func_state *dst, \ + const struct bpf_func_state *src) \ +{ \ + if (!src->FIELD) \ + return 0; \ + if (WARN_ON_ONCE(dst->COUNT < src->COUNT)) { \ + /* internal bug, make state invalid to reject the program */ \ + memset(dst, 0, sizeof(*dst)); \ + return -EFAULT; \ + } \ + memcpy(dst->FIELD, src->FIELD, \ + sizeof(*src->FIELD) * (src->COUNT / SIZE)); \ + return 0; \ +} +/* copy_reference_state() */ +COPY_STATE_FN(reference, acquired_refs, refs, 1) +/* copy_stack_state() */ +COPY_STATE_FN(stack, allocated_stack, stack, BPF_REG_SIZE) +#undef COPY_STATE_FN + +#define REALLOC_STATE_FN(NAME, COUNT, FIELD, SIZE) \ +static int realloc_##NAME##_state(struct bpf_func_state *state, int size, \ + bool copy_old) \ +{ \ + u32 old_size = state->COUNT; \ + struct bpf_##NAME##_state *new_##FIELD; \ + int slot = size / SIZE; \ + \ + if (size <= old_size || !size) { \ + if (copy_old) \ + return 0; \ + state->COUNT = slot * SIZE; \ + if (!size && old_size) { \ + kfree(state->FIELD); \ + state->FIELD = NULL; \ + } \ + return 0; \ + } \ + new_##FIELD = kmalloc_array(slot, sizeof(struct bpf_##NAME##_state), \ + GFP_KERNEL); \ + if (!new_##FIELD) \ + return -ENOMEM; \ + if (copy_old) { \ + if (state->FIELD) \ + memcpy(new_##FIELD, state->FIELD, \ + sizeof(*new_##FIELD) * (old_size / SIZE)); \ + memset(new_##FIELD + old_size / SIZE, 0, \ + sizeof(*new_##FIELD) * (size - old_size) / SIZE); \ + } \ + state->COUNT = slot * SIZE; \ + kfree(state->FIELD); \ + state->FIELD = new_##FIELD; \ + return 0; \ +} +/* realloc_reference_state() */ +REALLOC_STATE_FN(reference, acquired_refs, refs, 1) +/* realloc_stack_state() */ +REALLOC_STATE_FN(stack, allocated_stack, stack, BPF_REG_SIZE) +#undef REALLOC_STATE_FN /* do_check() starts with zero-sized stack in struct bpf_verifier_state to * make it consume minimal amount of memory. check_stack_write() access from * the program calls into realloc_func_state() to grow the stack size. * Note there is a non-zero 'parent' pointer inside bpf_verifier_state - * which this function copies over. It points to previous bpf_verifier_state - * which is never reallocated + * which realloc_stack_state() copies over. It points to previous + * bpf_verifier_state which is never reallocated. */ -static int realloc_func_state(struct bpf_func_state *state, int size, - bool copy_old) +static int realloc_func_state(struct bpf_func_state *state, int stack_size, + int refs_size, bool copy_old) { - u32 old_size = state->allocated_stack; - struct bpf_stack_state *new_stack; - int slot = size / BPF_REG_SIZE; + int err = realloc_reference_state(state, refs_size, copy_old); + if (err) + return err; + return realloc_stack_state(state, stack_size, copy_old); +} + +/* Acquire a pointer id from the env and update the state->refs to include + * this new pointer reference. + * On success, returns a valid pointer id to associate with the register + * On failure, returns a negative errno. + */ +static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx) +{ + struct bpf_func_state *state = cur_func(env); + int new_ofs = state->acquired_refs; + int id, err; - if (size <= old_size || !size) { - if (copy_old) + err = realloc_reference_state(state, state->acquired_refs + 1, true); + if (err) + return err; + id = ++env->id_gen; + state->refs[new_ofs].id = id; + state->refs[new_ofs].insn_idx = insn_idx; + + return id; +} + +/* release function corresponding to acquire_reference_state(). Idempotent. */ +static int __release_reference_state(struct bpf_func_state *state, int ptr_id) +{ + int i, last_idx; + + if (!ptr_id) + return -EFAULT; + + last_idx = state->acquired_refs - 1; + for (i = 0; i < state->acquired_refs; i++) { + if (state->refs[i].id == ptr_id) { + if (last_idx && i != last_idx) + memcpy(&state->refs[i], &state->refs[last_idx], + sizeof(*state->refs)); + memset(&state->refs[last_idx], 0, sizeof(*state->refs)); + state->acquired_refs--; return 0; - state->allocated_stack = slot * BPF_REG_SIZE; - if (!size && old_size) { - kfree(state->stack); - state->stack = NULL; } - return 0; } - new_stack = kmalloc_array(slot, sizeof(struct bpf_stack_state), - GFP_KERNEL); - if (!new_stack) - return -ENOMEM; - if (copy_old) { - if (state->stack) - memcpy(new_stack, state->stack, - sizeof(*new_stack) * (old_size / BPF_REG_SIZE)); - memset(new_stack + old_size / BPF_REG_SIZE, 0, - sizeof(*new_stack) * (size - old_size) / BPF_REG_SIZE); - } - state->allocated_stack = slot * BPF_REG_SIZE; - kfree(state->stack); - state->stack = new_stack; + return -EFAULT; +} + +/* variation on the above for cases where we expect that there must be an + * outstanding reference for the specified ptr_id. + */ +static int release_reference_state(struct bpf_verifier_env *env, int ptr_id) +{ + struct bpf_func_state *state = cur_func(env); + int err; + + err = __release_reference_state(state, ptr_id); + if (WARN_ON_ONCE(err != 0)) + verbose(env, "verifier internal error: can't release reference\n"); + return err; +} + +static int transfer_reference_state(struct bpf_func_state *dst, + struct bpf_func_state *src) +{ + int err = realloc_reference_state(dst, src->acquired_refs, false); + if (err) + return err; + err = copy_reference_state(dst, src); + if (err) + return err; return 0; } @@ -422,6 +599,7 @@ static void free_func_state(struct bpf_func_state *state) { if (!state) return; + kfree(state->refs); kfree(state->stack); kfree(state); } @@ -447,10 +625,14 @@ static int copy_func_state(struct bpf_func_state *dst, { int err; - err = realloc_func_state(dst, src->allocated_stack, false); + err = realloc_func_state(dst, src->allocated_stack, src->acquired_refs, + false); + if (err) + return err; + memcpy(dst, src, offsetof(struct bpf_func_state, acquired_refs)); + err = copy_reference_state(dst, src); if (err) return err; - memcpy(dst, src, offsetof(struct bpf_func_state, allocated_stack)); return copy_stack_state(dst, src); } @@ -466,7 +648,6 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, dst_state->frame[i] = NULL; } dst_state->curframe = src->curframe; - dst_state->parent = src->parent; for (i = 0; i <= src->curframe; i++) { dst = dst_state->frame[i]; if (!dst) { @@ -553,7 +734,9 @@ static void __mark_reg_not_init(struct bpf_reg_state *reg); */ static void __mark_reg_known(struct bpf_reg_state *reg, u64 imm) { - reg->id = 0; + /* Clear id, off, and union(map_ptr, range) */ + memset(((u8 *)reg) + sizeof(reg->type), 0, + offsetof(struct bpf_reg_state, var_off) - sizeof(reg->type)); reg->var_off = tnum_const(imm); reg->smin_value = (s64)imm; reg->smax_value = (s64)imm; @@ -572,7 +755,6 @@ static void __mark_reg_known_zero(struct bpf_reg_state *reg) static void __mark_reg_const_zero(struct bpf_reg_state *reg) { __mark_reg_known(reg, 0); - reg->off = 0; reg->type = SCALAR_VALUE; } @@ -683,9 +865,12 @@ static void __mark_reg_unbounded(struct bpf_reg_state *reg) /* Mark a register as having a completely unknown (scalar) value. */ static void __mark_reg_unknown(struct bpf_reg_state *reg) { + /* + * Clear type, id, off, and union(map_ptr, range) and + * padding between 'type' and union + */ + memset(reg, 0, offsetof(struct bpf_reg_state, var_off)); reg->type = SCALAR_VALUE; - reg->id = 0; - reg->off = 0; reg->var_off = tnum_unknown; reg->frameno = 0; __mark_reg_unbounded(reg); @@ -732,6 +917,7 @@ static void init_reg_state(struct bpf_verifier_env *env, for (i = 0; i < MAX_BPF_REG; i++) { mark_reg_not_init(env, regs, i); regs[i].live = REG_LIVE_NONE; + regs[i].parent = NULL; } /* frame pointer */ @@ -823,10 +1009,6 @@ static int check_subprogs(struct bpf_verifier_env *env) verbose(env, "function calls to other bpf functions are allowed for root only\n"); return -EPERM; } - if (bpf_prog_is_dev_bound(env->prog->aux)) { - verbose(env, "function calls in offloaded programs are not supported yet\n"); - return -EINVAL; - } ret = add_subprog(env, i + insn[i].imm + 1); if (ret < 0) return ret; @@ -876,74 +1058,21 @@ next: return 0; } -static -struct bpf_verifier_state *skip_callee(struct bpf_verifier_env *env, - const struct bpf_verifier_state *state, - struct bpf_verifier_state *parent, - u32 regno) -{ - struct bpf_verifier_state *tmp = NULL; - - /* 'parent' could be a state of caller and - * 'state' could be a state of callee. In such case - * parent->curframe < state->curframe - * and it's ok for r1 - r5 registers - * - * 'parent' could be a callee's state after it bpf_exit-ed. - * In such case parent->curframe > state->curframe - * and it's ok for r0 only - */ - if (parent->curframe == state->curframe || - (parent->curframe < state->curframe && - regno >= BPF_REG_1 && regno <= BPF_REG_5) || - (parent->curframe > state->curframe && - regno == BPF_REG_0)) - return parent; - - if (parent->curframe > state->curframe && - regno >= BPF_REG_6) { - /* for callee saved regs we have to skip the whole chain - * of states that belong to callee and mark as LIVE_READ - * the registers before the call - */ - tmp = parent; - while (tmp && tmp->curframe != state->curframe) { - tmp = tmp->parent; - } - if (!tmp) - goto bug; - parent = tmp; - } else { - goto bug; - } - return parent; -bug: - verbose(env, "verifier bug regno %d tmp %p\n", regno, tmp); - verbose(env, "regno %d parent frame %d current frame %d\n", - regno, parent->curframe, state->curframe); - return NULL; -} - +/* Parentage chain of this register (or stack slot) should take care of all + * issues like callee-saved registers, stack slot allocation time, etc. + */ static int mark_reg_read(struct bpf_verifier_env *env, - const struct bpf_verifier_state *state, - struct bpf_verifier_state *parent, - u32 regno) + const struct bpf_reg_state *state, + struct bpf_reg_state *parent) { bool writes = parent == state->parent; /* Observe write marks */ - if (regno == BPF_REG_FP) - /* We don't need to worry about FP liveness because it's read-only */ - return 0; - while (parent) { /* if read wasn't screened by an earlier write ... */ - if (writes && state->frame[state->curframe]->regs[regno].live & REG_LIVE_WRITTEN) + if (writes && state->live & REG_LIVE_WRITTEN) break; - parent = skip_callee(env, state, parent, regno); - if (!parent) - return -EFAULT; /* ... then we depend on parent's value */ - parent->frame[parent->curframe]->regs[regno].live |= REG_LIVE_READ; + parent->live |= REG_LIVE_READ; state = parent; parent = state->parent; writes = true; @@ -969,7 +1098,10 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, verbose(env, "R%d !read_ok\n", regno); return -EACCES; } - return mark_reg_read(env, vstate, vstate->parent, regno); + /* We don't need to worry about FP liveness because it's read-only */ + if (regno != BPF_REG_FP) + return mark_reg_read(env, ®s[regno], + regs[regno].parent); } else { /* check whether register used as dest operand can be written to */ if (regno == BPF_REG_FP) { @@ -993,7 +1125,10 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_PACKET: case PTR_TO_PACKET_META: case PTR_TO_PACKET_END: + case PTR_TO_FLOW_KEYS: case CONST_PTR_TO_MAP: + case PTR_TO_SOCKET: + case PTR_TO_SOCKET_OR_NULL: return true; default: return false; @@ -1018,7 +1153,7 @@ static int check_stack_write(struct bpf_verifier_env *env, enum bpf_reg_type type; err = realloc_func_state(state, round_up(slot + 1, BPF_REG_SIZE), - true); + state->acquired_refs, true); if (err) return err; /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0, @@ -1080,8 +1215,8 @@ static int check_stack_write(struct bpf_verifier_env *env, } else { u8 type = STACK_MISC; - /* regular write of data into stack */ - state->stack[spi].spilled_ptr = (struct bpf_reg_state) {}; + /* regular write of data into stack destroys any spilled ptr */ + state->stack[spi].spilled_ptr.type = NOT_INIT; /* only mark the slot as written if all 8 bytes were written * otherwise read propagation may incorrectly stop too soon @@ -1106,61 +1241,6 @@ static int check_stack_write(struct bpf_verifier_env *env, return 0; } -/* registers of every function are unique and mark_reg_read() propagates - * the liveness in the following cases: - * - from callee into caller for R1 - R5 that were used as arguments - * - from caller into callee for R0 that used as result of the call - * - from caller to the same caller skipping states of the callee for R6 - R9, - * since R6 - R9 are callee saved by implicit function prologue and - * caller's R6 != callee's R6, so when we propagate liveness up to - * parent states we need to skip callee states for R6 - R9. - * - * stack slot marking is different, since stacks of caller and callee are - * accessible in both (since caller can pass a pointer to caller's stack to - * callee which can pass it to another function), hence mark_stack_slot_read() - * has to propagate the stack liveness to all parent states at given frame number. - * Consider code: - * f1() { - * ptr = fp - 8; - * *ptr = ctx; - * call f2 { - * .. = *ptr; - * } - * .. = *ptr; - * } - * First *ptr is reading from f1's stack and mark_stack_slot_read() has - * to mark liveness at the f1's frame and not f2's frame. - * Second *ptr is also reading from f1's stack and mark_stack_slot_read() has - * to propagate liveness to f2 states at f1's frame level and further into - * f1 states at f1's frame level until write into that stack slot - */ -static void mark_stack_slot_read(struct bpf_verifier_env *env, - const struct bpf_verifier_state *state, - struct bpf_verifier_state *parent, - int slot, int frameno) -{ - bool writes = parent == state->parent; /* Observe write marks */ - - while (parent) { - if (parent->frame[frameno]->allocated_stack <= slot * BPF_REG_SIZE) - /* since LIVE_WRITTEN mark is only done for full 8-byte - * write the read marks are conservative and parent - * state may not even have the stack allocated. In such case - * end the propagation, since the loop reached beginning - * of the function - */ - break; - /* if read wasn't screened by an earlier write ... */ - if (writes && state->frame[frameno]->stack[slot].spilled_ptr.live & REG_LIVE_WRITTEN) - break; - /* ... then we depend on parent's value */ - parent->frame[frameno]->stack[slot].spilled_ptr.live |= REG_LIVE_READ; - state = parent; - parent = state->parent; - writes = true; - } -} - static int check_stack_read(struct bpf_verifier_env *env, struct bpf_func_state *reg_state /* func where register points to */, int off, int size, int value_regno) @@ -1198,8 +1278,8 @@ static int check_stack_read(struct bpf_verifier_env *env, */ state->regs[value_regno].live |= REG_LIVE_WRITTEN; } - mark_stack_slot_read(env, vstate, vstate->parent, spi, - reg_state->frameno); + mark_reg_read(env, ®_state->stack[spi].spilled_ptr, + reg_state->stack[spi].spilled_ptr.parent); return 0; } else { int zeros = 0; @@ -1215,8 +1295,8 @@ static int check_stack_read(struct bpf_verifier_env *env, off, i, size); return -EACCES; } - mark_stack_slot_read(env, vstate, vstate->parent, spi, - reg_state->frameno); + mark_reg_read(env, ®_state->stack[spi].spilled_ptr, + reg_state->stack[spi].spilled_ptr.parent); if (value_regno >= 0) { if (zeros == size) { /* any size read into register is zero extended, @@ -1321,6 +1401,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, case BPF_PROG_TYPE_LWT_XMIT: case BPF_PROG_TYPE_SK_SKB: case BPF_PROG_TYPE_SK_MSG: + case BPF_PROG_TYPE_FLOW_DISSECTOR: if (meta) return meta->pkt_access; @@ -1404,6 +1485,40 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, return -EACCES; } +static int check_flow_keys_access(struct bpf_verifier_env *env, int off, + int size) +{ + if (size < 0 || off < 0 || + (u64)off + size > sizeof(struct bpf_flow_keys)) { + verbose(env, "invalid access to flow keys off=%d size=%d\n", + off, size); + return -EACCES; + } + return 0; +} + +static int check_sock_access(struct bpf_verifier_env *env, u32 regno, int off, + int size, enum bpf_access_type t) +{ + struct bpf_reg_state *regs = cur_regs(env); + struct bpf_reg_state *reg = ®s[regno]; + struct bpf_insn_access_aux info; + + if (reg->smin_value < 0) { + verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", + regno); + return -EACCES; + } + + if (!bpf_sock_is_valid_access(off, size, t, &info)) { + verbose(env, "invalid bpf_sock access off=%d size=%d\n", + off, size); + return -EACCES; + } + + return 0; +} + static bool __is_pointer_value(bool allow_ptr_leaks, const struct bpf_reg_state *reg) { @@ -1413,25 +1528,39 @@ static bool __is_pointer_value(bool allow_ptr_leaks, return reg->type != SCALAR_VALUE; } +static struct bpf_reg_state *reg_state(struct bpf_verifier_env *env, int regno) +{ + return cur_regs(env) + regno; +} + static bool is_pointer_value(struct bpf_verifier_env *env, int regno) { - return __is_pointer_value(env->allow_ptr_leaks, cur_regs(env) + regno); + return __is_pointer_value(env->allow_ptr_leaks, reg_state(env, regno)); } static bool is_ctx_reg(struct bpf_verifier_env *env, int regno) { - const struct bpf_reg_state *reg = cur_regs(env) + regno; + const struct bpf_reg_state *reg = reg_state(env, regno); - return reg->type == PTR_TO_CTX; + return reg->type == PTR_TO_CTX || + reg->type == PTR_TO_SOCKET; } static bool is_pkt_reg(struct bpf_verifier_env *env, int regno) { - const struct bpf_reg_state *reg = cur_regs(env) + regno; + const struct bpf_reg_state *reg = reg_state(env, regno); return type_is_pkt_pointer(reg->type); } +static bool is_flow_key_reg(struct bpf_verifier_env *env, int regno) +{ + const struct bpf_reg_state *reg = reg_state(env, regno); + + /* Separate to is_ctx_reg() since we still want to allow BPF_ST here. */ + return reg->type == PTR_TO_FLOW_KEYS; +} + static int check_pkt_ptr_alignment(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int off, int size, bool strict) @@ -1505,6 +1634,9 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, * right in front, treat it the very same way. */ return check_pkt_ptr_alignment(env, reg, off, size, strict); + case PTR_TO_FLOW_KEYS: + pointer_desc = "flow keys "; + break; case PTR_TO_MAP_VALUE: pointer_desc = "value "; break; @@ -1519,6 +1651,9 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, */ strict = true; break; + case PTR_TO_SOCKET: + pointer_desc = "sock "; + break; default: break; } @@ -1727,9 +1862,6 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn else mark_reg_known_zero(env, regs, value_regno); - regs[value_regno].id = 0; - regs[value_regno].off = 0; - regs[value_regno].range = 0; regs[value_regno].type = reg_type; } @@ -1778,6 +1910,25 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn err = check_packet_access(env, regno, off, size, false); if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); + } else if (reg->type == PTR_TO_FLOW_KEYS) { + if (t == BPF_WRITE && value_regno >= 0 && + is_pointer_value(env, value_regno)) { + verbose(env, "R%d leaks addr into flow keys\n", + value_regno); + return -EACCES; + } + + err = check_flow_keys_access(env, off, size); + if (!err && t == BPF_READ && value_regno >= 0) + mark_reg_unknown(env, regs, value_regno); + } else if (reg->type == PTR_TO_SOCKET) { + if (t == BPF_WRITE) { + verbose(env, "cannot write into socket\n"); + return -EACCES; + } + err = check_sock_access(env, regno, off, size, t); + if (!err && value_regno >= 0) + mark_reg_unknown(env, regs, value_regno); } else { verbose(env, "R%d invalid mem access '%s'\n", regno, reg_type_str[reg->type]); @@ -1818,10 +1969,11 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins } if (is_ctx_reg(env, insn->dst_reg) || - is_pkt_reg(env, insn->dst_reg)) { + is_pkt_reg(env, insn->dst_reg) || + is_flow_key_reg(env, insn->dst_reg)) { verbose(env, "BPF_XADD stores into R%d %s is not allowed\n", - insn->dst_reg, is_ctx_reg(env, insn->dst_reg) ? - "context" : "packet"); + insn->dst_reg, + reg_type_str[reg_state(env, insn->dst_reg)->type]); return -EACCES; } @@ -1846,7 +1998,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, int access_size, bool zero_size_allowed, struct bpf_call_arg_meta *meta) { - struct bpf_reg_state *reg = cur_regs(env) + regno; + struct bpf_reg_state *reg = reg_state(env, regno); struct bpf_func_state *state = func(env, reg); int off, i, slot, spi; @@ -1908,8 +2060,8 @@ mark: /* reading any byte out of 8-byte 'spill_slot' will cause * the whole slot to be marked as 'read' */ - mark_stack_slot_read(env, env->cur_state, env->cur_state->parent, - spi, state->frameno); + mark_reg_read(env, &state->stack[spi].spilled_ptr, + state->stack[spi].spilled_ptr.parent); } return update_stack_depth(env, state, off); } @@ -1978,7 +2130,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, } if (arg_type == ARG_PTR_TO_MAP_KEY || - arg_type == ARG_PTR_TO_MAP_VALUE) { + arg_type == ARG_PTR_TO_MAP_VALUE || + arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE) { expected_type = PTR_TO_STACK; if (!type_is_pkt_pointer(type) && type != PTR_TO_MAP_VALUE && type != expected_type) @@ -1999,6 +2152,16 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, err = check_ctx_reg(env, reg, regno); if (err < 0) return err; + } else if (arg_type == ARG_PTR_TO_SOCKET) { + expected_type = PTR_TO_SOCKET; + if (type != expected_type) + goto err_type; + if (meta->ptr_id || !reg->id) { + verbose(env, "verifier internal error: mismatched references meta=%d, reg=%d\n", + meta->ptr_id, reg->id); + return -EFAULT; + } + meta->ptr_id = reg->id; } else if (arg_type_is_mem_ptr(arg_type)) { expected_type = PTR_TO_STACK; /* One exception here. In case function allows for NULL to be @@ -2038,7 +2201,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, err = check_helper_mem_access(env, regno, meta->map_ptr->key_size, false, NULL); - } else if (arg_type == ARG_PTR_TO_MAP_VALUE) { + } else if (arg_type == ARG_PTR_TO_MAP_VALUE || + arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE) { /* bpf_map_xxx(..., map_ptr, ..., value) call: * check [value, value + map->value_size) validity */ @@ -2047,9 +2211,10 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, verbose(env, "invalid map_ptr to access map->value\n"); return -EACCES; } + meta->raw_mode = (arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE); err = check_helper_mem_access(env, regno, meta->map_ptr->value_size, false, - NULL); + meta); } else if (arg_type_is_mem_size(arg_type)) { bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO); @@ -2129,6 +2294,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, goto error; break; case BPF_MAP_TYPE_CGROUP_STORAGE: + case BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE: if (func_id != BPF_FUNC_get_local_storage) goto error; break; @@ -2171,6 +2337,13 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (func_id != BPF_FUNC_sk_select_reuseport) goto error; break; + case BPF_MAP_TYPE_QUEUE: + case BPF_MAP_TYPE_STACK: + if (func_id != BPF_FUNC_map_peek_elem && + func_id != BPF_FUNC_map_pop_elem && + func_id != BPF_FUNC_map_push_elem) + goto error; + break; default: break; } @@ -2219,13 +2392,21 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, goto error; break; case BPF_FUNC_get_local_storage: - if (map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE) + if (map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE && + map->map_type != BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) goto error; break; case BPF_FUNC_sk_select_reuseport: if (map->map_type != BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) goto error; break; + case BPF_FUNC_map_peek_elem: + case BPF_FUNC_map_pop_elem: + case BPF_FUNC_map_push_elem: + if (map->map_type != BPF_MAP_TYPE_QUEUE && + map->map_type != BPF_MAP_TYPE_STACK) + goto error; + break; default: break; } @@ -2286,10 +2467,32 @@ static bool check_arg_pair_ok(const struct bpf_func_proto *fn) return true; } +static bool check_refcount_ok(const struct bpf_func_proto *fn) +{ + int count = 0; + + if (arg_type_is_refcounted(fn->arg1_type)) + count++; + if (arg_type_is_refcounted(fn->arg2_type)) + count++; + if (arg_type_is_refcounted(fn->arg3_type)) + count++; + if (arg_type_is_refcounted(fn->arg4_type)) + count++; + if (arg_type_is_refcounted(fn->arg5_type)) + count++; + + /* We only support one arg being unreferenced at the moment, + * which is sufficient for the helper functions we have right now. + */ + return count <= 1; +} + static int check_func_proto(const struct bpf_func_proto *fn) { return check_raw_mode_ok(fn) && - check_arg_pair_ok(fn) ? 0 : -EINVAL; + check_arg_pair_ok(fn) && + check_refcount_ok(fn) ? 0 : -EINVAL; } /* Packet data might have moved, any old PTR_TO_PACKET[_META,_END] @@ -2305,10 +2508,9 @@ static void __clear_all_pkt_pointers(struct bpf_verifier_env *env, if (reg_is_pkt_pointer_any(®s[i])) mark_reg_unknown(env, regs, i); - for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { - if (state->stack[i].slot_type[0] != STACK_SPILL) + bpf_for_each_spilled_reg(i, state, reg) { + if (!reg) continue; - reg = &state->stack[i].spilled_ptr; if (reg_is_pkt_pointer_any(reg)) __mark_reg_unknown(reg); } @@ -2323,12 +2525,45 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) __clear_all_pkt_pointers(env, vstate->frame[i]); } +static void release_reg_references(struct bpf_verifier_env *env, + struct bpf_func_state *state, int id) +{ + struct bpf_reg_state *regs = state->regs, *reg; + int i; + + for (i = 0; i < MAX_BPF_REG; i++) + if (regs[i].id == id) + mark_reg_unknown(env, regs, i); + + bpf_for_each_spilled_reg(i, state, reg) { + if (!reg) + continue; + if (reg_is_refcounted(reg) && reg->id == id) + __mark_reg_unknown(reg); + } +} + +/* The pointer with the specified id has released its reference to kernel + * resources. Identify all copies of the same pointer and clear the reference. + */ +static int release_reference(struct bpf_verifier_env *env, + struct bpf_call_arg_meta *meta) +{ + struct bpf_verifier_state *vstate = env->cur_state; + int i; + + for (i = 0; i <= vstate->curframe; i++) + release_reg_references(env, vstate->frame[i], meta->ptr_id); + + return release_reference_state(env, meta->ptr_id); +} + static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx) { struct bpf_verifier_state *state = env->cur_state; struct bpf_func_state *caller, *callee; - int i, subprog, target_insn; + int i, err, subprog, target_insn; if (state->curframe + 1 >= MAX_CALL_FRAMES) { verbose(env, "the call stack of %d frames is too deep\n", @@ -2366,11 +2601,18 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, state->curframe + 1 /* frameno within this callchain */, subprog /* subprog number within this prog */); - /* copy r1 - r5 args that callee can access */ + /* Transfer references to the callee */ + err = transfer_reference_state(callee, caller); + if (err) + return err; + + /* copy r1 - r5 args that callee can access. The copy includes parent + * pointers, which connects us up to the liveness chain + */ for (i = BPF_REG_1; i <= BPF_REG_5; i++) callee->regs[i] = caller->regs[i]; - /* after the call regsiters r0 - r5 were scratched */ + /* after the call registers r0 - r5 were scratched */ for (i = 0; i < CALLER_SAVED_REGS; i++) { mark_reg_not_init(env, caller->regs, caller_saved[i]); check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); @@ -2396,6 +2638,7 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) struct bpf_verifier_state *state = env->cur_state; struct bpf_func_state *caller, *callee; struct bpf_reg_state *r0; + int err; callee = state->frame[state->curframe]; r0 = &callee->regs[BPF_REG_0]; @@ -2415,6 +2658,11 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) /* return to the caller whatever r0 had in the callee */ caller->regs[BPF_REG_0] = *r0; + /* Transfer references to the caller */ + err = transfer_reference_state(caller, callee); + if (err) + return err; + *insn_idx = callee->callsite + 1; if (env->log.level) { verbose(env, "returning from callee:\n"); @@ -2454,7 +2702,10 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, if (func_id != BPF_FUNC_tail_call && func_id != BPF_FUNC_map_lookup_elem && func_id != BPF_FUNC_map_update_elem && - func_id != BPF_FUNC_map_delete_elem) + func_id != BPF_FUNC_map_delete_elem && + func_id != BPF_FUNC_map_push_elem && + func_id != BPF_FUNC_map_pop_elem && + func_id != BPF_FUNC_map_peek_elem) return 0; if (meta->map_ptr == NULL) { @@ -2471,6 +2722,18 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, return 0; } +static int check_reference_leak(struct bpf_verifier_env *env) +{ + struct bpf_func_state *state = cur_func(env); + int i; + + for (i = 0; i < state->acquired_refs; i++) { + verbose(env, "Unreleased reference id=%d alloc_insn=%d\n", + state->refs[i].id, state->refs[i].insn_idx); + } + return state->acquired_refs ? -EINVAL : 0; +} + static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn_idx) { const struct bpf_func_proto *fn = NULL; @@ -2549,6 +2812,18 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn return err; } + if (func_id == BPF_FUNC_tail_call) { + err = check_reference_leak(env); + if (err) { + verbose(env, "tail_call would lead to reference leak\n"); + return err; + } + } else if (is_release_function(func_id)) { + err = release_reference(env, &meta); + if (err) + return err; + } + regs = cur_regs(env); /* check that flags argument in get_local_storage(map, flags) is 0, @@ -2580,7 +2855,6 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; /* There is no offset yet applied, variable or fixed */ mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].off = 0; /* remember map_ptr, so that check_map_access() * can check 'value_size' boundary of memory access * to map element returned from bpf_map_lookup_elem() @@ -2592,6 +2866,13 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn } regs[BPF_REG_0].map_ptr = meta.map_ptr; regs[BPF_REG_0].id = ++env->id_gen; + } else if (fn->ret_type == RET_PTR_TO_SOCKET_OR_NULL) { + int id = acquire_reference_state(env, insn_idx); + if (id < 0) + return id; + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_SOCKET_OR_NULL; + regs[BPF_REG_0].id = id; } else { verbose(env, "unknown return type %d of func %s#%d\n", fn->ret_type, func_id_name(func_id), func_id); @@ -2722,20 +3003,20 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, return -EACCES; } - if (ptr_reg->type == PTR_TO_MAP_VALUE_OR_NULL) { - verbose(env, "R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n", - dst); - return -EACCES; - } - if (ptr_reg->type == CONST_PTR_TO_MAP) { - verbose(env, "R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n", - dst); + switch (ptr_reg->type) { + case PTR_TO_MAP_VALUE_OR_NULL: + verbose(env, "R%d pointer arithmetic on %s prohibited, null-check it first\n", + dst, reg_type_str[ptr_reg->type]); return -EACCES; - } - if (ptr_reg->type == PTR_TO_PACKET_END) { - verbose(env, "R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n", - dst); + case CONST_PTR_TO_MAP: + case PTR_TO_PACKET_END: + case PTR_TO_SOCKET: + case PTR_TO_SOCKET_OR_NULL: + verbose(env, "R%d pointer arithmetic on %s prohibited\n", + dst, reg_type_str[ptr_reg->type]); return -EACCES; + default: + break; } /* In case of 'scalar += pointer', dst_reg inherits pointer type and id. @@ -2896,6 +3177,15 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, u64 umin_val, umax_val; u64 insn_bitness = (BPF_CLASS(insn->code) == BPF_ALU64) ? 64 : 32; + if (insn_bitness == 32) { + /* Relevant for 32-bit RSH: Information can propagate towards + * LSB, so it isn't sufficient to only truncate the output to + * 32 bits. + */ + coerce_reg_to_size(dst_reg, 4); + coerce_reg_to_size(&src_reg, 4); + } + smin_val = src_reg.smin_value; smax_val = src_reg.smax_value; umin_val = src_reg.umin_value; @@ -3131,7 +3421,6 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, if (BPF_CLASS(insn->code) != BPF_ALU64) { /* 32-bit ALU ops are (32,32)->32 */ coerce_reg_to_size(dst_reg, 4); - coerce_reg_to_size(&src_reg, 4); } __reg_deduce_bounds(dst_reg); @@ -3163,7 +3452,7 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, * an arbitrary scalar. Disallow all math except * pointer subtraction */ - if (opcode == BPF_SUB){ + if (opcode == BPF_SUB && env->allow_ptr_leaks) { mark_reg_unknown(env, regs, insn->dst_reg); return 0; } @@ -3447,10 +3736,9 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, for (j = 0; j <= vstate->curframe; j++) { state = vstate->frame[j]; - for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { - if (state->stack[i].slot_type[0] != STACK_SPILL) + bpf_for_each_spilled_reg(i, state, reg) { + if (!reg) continue; - reg = &state->stack[i].spilled_ptr; if (reg->type == type && reg->id == dst_reg->id) reg->range = max(reg->range, new_range); } @@ -3656,12 +3944,11 @@ static void reg_combine_min_max(struct bpf_reg_state *true_src, } } -static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id, - bool is_null) +static void mark_ptr_or_null_reg(struct bpf_func_state *state, + struct bpf_reg_state *reg, u32 id, + bool is_null) { - struct bpf_reg_state *reg = ®s[regno]; - - if (reg->type == PTR_TO_MAP_VALUE_OR_NULL && reg->id == id) { + if (reg_type_may_be_null(reg->type) && reg->id == id) { /* Old offset (both fixed and variable parts) should * have been known-zero, because we don't allow pointer * arithmetic on pointers that might be NULL. @@ -3674,40 +3961,49 @@ static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id, } if (is_null) { reg->type = SCALAR_VALUE; - } else if (reg->map_ptr->inner_map_meta) { - reg->type = CONST_PTR_TO_MAP; - reg->map_ptr = reg->map_ptr->inner_map_meta; - } else { - reg->type = PTR_TO_MAP_VALUE; + } else if (reg->type == PTR_TO_MAP_VALUE_OR_NULL) { + if (reg->map_ptr->inner_map_meta) { + reg->type = CONST_PTR_TO_MAP; + reg->map_ptr = reg->map_ptr->inner_map_meta; + } else { + reg->type = PTR_TO_MAP_VALUE; + } + } else if (reg->type == PTR_TO_SOCKET_OR_NULL) { + reg->type = PTR_TO_SOCKET; + } + if (is_null || !reg_is_refcounted(reg)) { + /* We don't need id from this point onwards anymore, + * thus we should better reset it, so that state + * pruning has chances to take effect. + */ + reg->id = 0; } - /* We don't need id from this point onwards anymore, thus we - * should better reset it, so that state pruning has chances - * to take effect. - */ - reg->id = 0; } } /* The logic is similar to find_good_pkt_pointers(), both could eventually * be folded together at some point. */ -static void mark_map_regs(struct bpf_verifier_state *vstate, u32 regno, - bool is_null) +static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno, + bool is_null) { struct bpf_func_state *state = vstate->frame[vstate->curframe]; - struct bpf_reg_state *regs = state->regs; + struct bpf_reg_state *reg, *regs = state->regs; u32 id = regs[regno].id; int i, j; + if (reg_is_refcounted_or_null(®s[regno]) && is_null) + __release_reference_state(state, id); + for (i = 0; i < MAX_BPF_REG; i++) - mark_map_reg(regs, i, id, is_null); + mark_ptr_or_null_reg(state, ®s[i], id, is_null); for (j = 0; j <= vstate->curframe; j++) { state = vstate->frame[j]; - for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { - if (state->stack[i].slot_type[0] != STACK_SPILL) + bpf_for_each_spilled_reg(i, state, reg) { + if (!reg) continue; - mark_map_reg(&state->stack[i].spilled_ptr, 0, id, is_null); + mark_ptr_or_null_reg(state, reg, id, is_null); } } } @@ -3909,12 +4205,14 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, /* detect if R == 0 where R is returned from bpf_map_lookup_elem() */ if (BPF_SRC(insn->code) == BPF_K && insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) && - dst_reg->type == PTR_TO_MAP_VALUE_OR_NULL) { - /* Mark all identical map registers in each branch as either + reg_type_may_be_null(dst_reg->type)) { + /* Mark all identical registers in each branch as either * safe or unknown depending R == 0 or R != 0 conditional. */ - mark_map_regs(this_branch, insn->dst_reg, opcode == BPF_JNE); - mark_map_regs(other_branch, insn->dst_reg, opcode == BPF_JEQ); + mark_ptr_or_null_regs(this_branch, insn->dst_reg, + opcode == BPF_JNE); + mark_ptr_or_null_regs(other_branch, insn->dst_reg, + opcode == BPF_JEQ); } else if (!try_match_pkt_pointers(insn, dst_reg, ®s[insn->src_reg], this_branch, other_branch) && is_pointer_value(env, insn->dst_reg)) { @@ -4037,6 +4335,16 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) if (err) return err; + /* Disallow usage of BPF_LD_[ABS|IND] with reference tracking, as + * gen_ld_abs() may terminate the program at runtime, leading to + * reference leak. + */ + err = check_reference_leak(env); + if (err) { + verbose(env, "BPF_LD_[ABS|IND] cannot be mixed with socket references\n"); + return err; + } + if (regs[BPF_REG_6].type != PTR_TO_CTX) { verbose(env, "at the time of BPF_LD_ABS|IND R6 != pointer to skb\n"); @@ -4370,7 +4678,7 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, /* explored state didn't use this */ return true; - equal = memcmp(rold, rcur, offsetof(struct bpf_reg_state, frameno)) == 0; + equal = memcmp(rold, rcur, offsetof(struct bpf_reg_state, parent)) == 0; if (rold->type == PTR_TO_STACK) /* two stack pointers are equal only if they're pointing to @@ -4451,6 +4759,9 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, case PTR_TO_CTX: case CONST_PTR_TO_MAP: case PTR_TO_PACKET_END: + case PTR_TO_FLOW_KEYS: + case PTR_TO_SOCKET: + case PTR_TO_SOCKET_OR_NULL: /* Only valid matches are exact, which memcmp() above * would have accepted */ @@ -4526,6 +4837,14 @@ static bool stacksafe(struct bpf_func_state *old, return true; } +static bool refsafe(struct bpf_func_state *old, struct bpf_func_state *cur) +{ + if (old->acquired_refs != cur->acquired_refs) + return false; + return !memcmp(old->refs, cur->refs, + sizeof(*old->refs) * old->acquired_refs); +} + /* compare two verifier states * * all states stored in state_list are known to be valid, since @@ -4571,6 +4890,9 @@ static bool func_states_equal(struct bpf_func_state *old, if (!stacksafe(old, cur, idmap)) goto out_free; + + if (!refsafe(old, cur)) + goto out_free; ret = true; out_free: kfree(idmap); @@ -4603,7 +4925,7 @@ static bool states_equal(struct bpf_verifier_env *env, * equivalent state (jump target or such) we didn't arrive by the straight-line * code, so read marks in the state must propagate to the parent regardless * of the state's write marks. That's what 'parent == state->parent' comparison - * in mark_reg_read() and mark_stack_slot_read() is for. + * in mark_reg_read() is for. */ static int propagate_liveness(struct bpf_verifier_env *env, const struct bpf_verifier_state *vstate, @@ -4624,7 +4946,8 @@ static int propagate_liveness(struct bpf_verifier_env *env, if (vparent->frame[vparent->curframe]->regs[i].live & REG_LIVE_READ) continue; if (vstate->frame[vstate->curframe]->regs[i].live & REG_LIVE_READ) { - err = mark_reg_read(env, vstate, vparent, i); + err = mark_reg_read(env, &vstate->frame[vstate->curframe]->regs[i], + &vparent->frame[vstate->curframe]->regs[i]); if (err) return err; } @@ -4639,7 +4962,8 @@ static int propagate_liveness(struct bpf_verifier_env *env, if (parent->stack[i].spilled_ptr.live & REG_LIVE_READ) continue; if (state->stack[i].spilled_ptr.live & REG_LIVE_READ) - mark_stack_slot_read(env, vstate, vparent, i, frame); + mark_reg_read(env, &state->stack[i].spilled_ptr, + &parent->stack[i].spilled_ptr); } } return err; @@ -4649,7 +4973,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) { struct bpf_verifier_state_list *new_sl; struct bpf_verifier_state_list *sl; - struct bpf_verifier_state *cur = env->cur_state; + struct bpf_verifier_state *cur = env->cur_state, *new; int i, j, err; sl = env->explored_states[insn_idx]; @@ -4691,16 +5015,18 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) return -ENOMEM; /* add new state to the head of linked list */ - err = copy_verifier_state(&new_sl->state, cur); + new = &new_sl->state; + err = copy_verifier_state(new, cur); if (err) { - free_verifier_state(&new_sl->state, false); + free_verifier_state(new, false); kfree(new_sl); return err; } new_sl->next = env->explored_states[insn_idx]; env->explored_states[insn_idx] = new_sl; /* connect new state to parentage chain */ - cur->parent = &new_sl->state; + for (i = 0; i < BPF_REG_FP; i++) + cur_regs(env)[i].parent = &new->frame[new->curframe]->regs[i]; /* clear write marks in current state: the writes we did are not writes * our child did, so they don't screen off its reads from us. * (There are no read marks in current state, because reads always mark @@ -4713,13 +5039,48 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) /* all stack frames are accessible from callee, clear them all */ for (j = 0; j <= cur->curframe; j++) { struct bpf_func_state *frame = cur->frame[j]; + struct bpf_func_state *newframe = new->frame[j]; - for (i = 0; i < frame->allocated_stack / BPF_REG_SIZE; i++) + for (i = 0; i < frame->allocated_stack / BPF_REG_SIZE; i++) { frame->stack[i].spilled_ptr.live = REG_LIVE_NONE; + frame->stack[i].spilled_ptr.parent = + &newframe->stack[i].spilled_ptr; + } } return 0; } +/* Return true if it's OK to have the same insn return a different type. */ +static bool reg_type_mismatch_ok(enum bpf_reg_type type) +{ + switch (type) { + case PTR_TO_CTX: + case PTR_TO_SOCKET: + case PTR_TO_SOCKET_OR_NULL: + return false; + default: + return true; + } +} + +/* If an instruction was previously used with particular pointer types, then we + * need to be careful to avoid cases such as the below, where it may be ok + * for one branch accessing the pointer, but not ok for the other branch: + * + * R1 = sock_ptr + * goto X; + * ... + * R1 = some_other_valid_ptr; + * goto X; + * ... + * R2 = *(u32 *)(R1 + 0); + */ +static bool reg_type_mismatch(enum bpf_reg_type src, enum bpf_reg_type prev) +{ + return src != prev && (!reg_type_mismatch_ok(src) || + !reg_type_mismatch_ok(prev)); +} + static int do_check(struct bpf_verifier_env *env) { struct bpf_verifier_state *state; @@ -4734,7 +5095,6 @@ static int do_check(struct bpf_verifier_env *env) if (!state) return -ENOMEM; state->curframe = 0; - state->parent = NULL; state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL); if (!state->frame[0]) { kfree(state); @@ -4814,6 +5174,7 @@ static int do_check(struct bpf_verifier_env *env) regs = cur_regs(env); env->insn_aux_data[insn_idx].seen = true; + if (class == BPF_ALU || class == BPF_ALU64) { err = check_alu_op(env, insn); if (err) @@ -4853,9 +5214,7 @@ static int do_check(struct bpf_verifier_env *env) */ *prev_src_type = src_reg_type; - } else if (src_reg_type != *prev_src_type && - (src_reg_type == PTR_TO_CTX || - *prev_src_type == PTR_TO_CTX)) { + } else if (reg_type_mismatch(src_reg_type, *prev_src_type)) { /* ABuser program is trying to use the same insn * dst_reg = *(u32*) (src_reg + off) * with different pointer types: @@ -4900,9 +5259,7 @@ static int do_check(struct bpf_verifier_env *env) if (*prev_dst_type == NOT_INIT) { *prev_dst_type = dst_reg_type; - } else if (dst_reg_type != *prev_dst_type && - (dst_reg_type == PTR_TO_CTX || - *prev_dst_type == PTR_TO_CTX)) { + } else if (reg_type_mismatch(dst_reg_type, *prev_dst_type)) { verbose(env, "same insn cannot be used with different pointers\n"); return -EINVAL; } @@ -4919,8 +5276,9 @@ static int do_check(struct bpf_verifier_env *env) return err; if (is_ctx_reg(env, insn->dst_reg)) { - verbose(env, "BPF_ST stores into R%d context is not allowed\n", - insn->dst_reg); + verbose(env, "BPF_ST stores into R%d %s is not allowed\n", + insn->dst_reg, + reg_type_str[reg_state(env, insn->dst_reg)->type]); return -EACCES; } @@ -4982,6 +5340,10 @@ static int do_check(struct bpf_verifier_env *env) continue; } + err = check_reference_leak(env); + if (err) + return err; + /* eBPF calling convetion is such that R0 is used * to return the value from eBPF program. * Make sure that it's readable at this time @@ -5095,6 +5457,12 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, return 0; } +static bool bpf_map_is_cgroup_storage(struct bpf_map *map) +{ + return (map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE || + map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE); +} + /* look for pseudo eBPF instructions that access map FDs and * replace them with actual map pointers */ @@ -5185,10 +5553,9 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) } env->used_maps[env->used_map_cnt++] = map; - if (map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE && + if (bpf_map_is_cgroup_storage(map) && bpf_cgroup_storage_assign(env->prog, map)) { - verbose(env, - "only one cgroup storage is allowed\n"); + verbose(env, "only one cgroup storage of each type is allowed\n"); fdput(f); return -EBUSY; } @@ -5217,11 +5584,15 @@ next_insn: /* drop refcnt of maps used by the rejected program */ static void release_maps(struct bpf_verifier_env *env) { + enum bpf_cgroup_storage_type stype; int i; - if (env->prog->aux->cgroup_storage) + for_each_cgroup_storage_type(stype) { + if (!env->prog->aux->cgroup_storage[stype]) + continue; bpf_cgroup_storage_release(env->prog, - env->prog->aux->cgroup_storage); + env->prog->aux->cgroup_storage[stype]); + } for (i = 0; i < env->used_map_cnt; i++) bpf_map_put(env->used_maps[i]); @@ -5319,8 +5690,10 @@ static void sanitize_dead_code(struct bpf_verifier_env *env) } } -/* convert load instructions that access fields of 'struct __sk_buff' - * into sequence of instructions that access fields of 'struct sk_buff' +/* convert load instructions that access fields of a context type into a + * sequence of instructions that access fields of the underlying structure: + * struct __sk_buff -> struct sk_buff + * struct bpf_sock_ops -> struct sock */ static int convert_ctx_accesses(struct bpf_verifier_env *env) { @@ -5349,12 +5722,14 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) } } - if (!ops->convert_ctx_access || bpf_prog_is_dev_bound(env->prog->aux)) + if (bpf_prog_is_dev_bound(env->prog->aux)) return 0; insn = env->prog->insnsi + delta; for (i = 0; i < insn_cnt; i++, insn++) { + bpf_convert_ctx_access_t convert_ctx_access; + if (insn->code == (BPF_LDX | BPF_MEM | BPF_B) || insn->code == (BPF_LDX | BPF_MEM | BPF_H) || insn->code == (BPF_LDX | BPF_MEM | BPF_W) || @@ -5396,8 +5771,18 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) continue; } - if (env->insn_aux_data[i + delta].ptr_type != PTR_TO_CTX) + switch (env->insn_aux_data[i + delta].ptr_type) { + case PTR_TO_CTX: + if (!ops->convert_ctx_access) + continue; + convert_ctx_access = ops->convert_ctx_access; + break; + case PTR_TO_SOCKET: + convert_ctx_access = bpf_sock_convert_ctx_access; + break; + default: continue; + } ctx_field_size = env->insn_aux_data[i + delta].ctx_field_size; size = BPF_LDST_BYTES(insn); @@ -5429,8 +5814,8 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) } target_size = 0; - cnt = ops->convert_ctx_access(type, insn, insn_buf, env->prog, - &target_size); + cnt = convert_ctx_access(type, insn, insn_buf, env->prog, + &target_size); if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf) || (ctx_field_size && !target_size)) { verbose(env, "bpf verifier is misconfigured\n"); @@ -5621,10 +6006,10 @@ static int fixup_call_args(struct bpf_verifier_env *env) struct bpf_insn *insn = prog->insnsi; int i, depth; #endif - int err; + int err = 0; - err = 0; - if (env->prog->jit_requested) { + if (env->prog->jit_requested && + !bpf_prog_is_dev_bound(env->prog->aux)) { err = jit_subprogs(env); if (err == 0) return 0; @@ -5793,7 +6178,10 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) if (prog->jit_requested && BITS_PER_LONG == 64 && (insn->imm == BPF_FUNC_map_lookup_elem || insn->imm == BPF_FUNC_map_update_elem || - insn->imm == BPF_FUNC_map_delete_elem)) { + insn->imm == BPF_FUNC_map_delete_elem || + insn->imm == BPF_FUNC_map_push_elem || + insn->imm == BPF_FUNC_map_pop_elem || + insn->imm == BPF_FUNC_map_peek_elem)) { aux = &env->insn_aux_data[i + delta]; if (bpf_map_ptr_poisoned(aux)) goto patch_call_imm; @@ -5826,6 +6214,14 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) BUILD_BUG_ON(!__same_type(ops->map_update_elem, (int (*)(struct bpf_map *map, void *key, void *value, u64 flags))NULL)); + BUILD_BUG_ON(!__same_type(ops->map_push_elem, + (int (*)(struct bpf_map *map, void *value, + u64 flags))NULL)); + BUILD_BUG_ON(!__same_type(ops->map_pop_elem, + (int (*)(struct bpf_map *map, void *value))NULL)); + BUILD_BUG_ON(!__same_type(ops->map_peek_elem, + (int (*)(struct bpf_map *map, void *value))NULL)); + switch (insn->imm) { case BPF_FUNC_map_lookup_elem: insn->imm = BPF_CAST_CALL(ops->map_lookup_elem) - @@ -5839,6 +6235,18 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) insn->imm = BPF_CAST_CALL(ops->map_delete_elem) - __bpf_call_base; continue; + case BPF_FUNC_map_push_elem: + insn->imm = BPF_CAST_CALL(ops->map_push_elem) - + __bpf_call_base; + continue; + case BPF_FUNC_map_pop_elem: + insn->imm = BPF_CAST_CALL(ops->map_pop_elem) - + __bpf_call_base; + continue; + case BPF_FUNC_map_peek_elem: + insn->imm = BPF_CAST_CALL(ops->map_peek_elem) - + __bpf_call_base; + continue; } goto patch_call_imm; @@ -5962,6 +6370,9 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) env->cur_state = NULL; } + if (ret == 0 && bpf_prog_is_dev_bound(env->prog->aux)) + ret = bpf_prog_offload_finalize(env); + skip_full_check: while (!pop_stack(env, NULL, NULL)); free_states(env); diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index 9f8463afda9c..686d244e798d 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -154,7 +154,7 @@ void __xsk_map_flush(struct bpf_map *map) static void *xsk_map_lookup_elem(struct bpf_map *map, void *key) { - return NULL; + return ERR_PTR(-EOPNOTSUPP); } static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value, @@ -192,11 +192,8 @@ static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value, sock_hold(sock->sk); old_xs = xchg(&m->xsk_map[i], xs); - if (old_xs) { - /* Make sure we've flushed everything. */ - synchronize_net(); + if (old_xs) sock_put((struct sock *)old_xs); - } sockfd_put(sock); return 0; @@ -212,11 +209,8 @@ static int xsk_map_delete_elem(struct bpf_map *map, void *key) return -EINVAL; old_xs = xchg(&m->xsk_map[k], NULL); - if (old_xs) { - /* Make sure we've flushed everything. */ - synchronize_net(); + if (old_xs) sock_put((struct sock *)old_xs); - } return 0; } |