From fca16e51078e8e5c0af839426b3d2dcd2bede135 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Fri, 26 Jul 2019 18:06:53 +0200 Subject: xdp: Refactor devmap allocation code for reuse MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The subsequent patch to add a new devmap sub-type can re-use much of the initialisation and allocation code, so refactor it into separate functions. Signed-off-by: Toke Høiland-Jørgensen Acked-by: Yonghong Song Acked-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov --- kernel/bpf/devmap.c | 136 ++++++++++++++++++++++++++++++++-------------------- 1 file changed, 83 insertions(+), 53 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index d83cf8ccc872..a0501266bdb8 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -60,9 +60,9 @@ struct xdp_bulk_queue { struct bpf_dtab_netdev { struct net_device *dev; /* must be first member, due to tracepoint */ struct bpf_dtab *dtab; - unsigned int bit; struct xdp_bulk_queue __percpu *bulkq; struct rcu_head rcu; + unsigned int idx; /* keep track of map index for tracepoint */ }; struct bpf_dtab { @@ -75,28 +75,21 @@ struct bpf_dtab { static DEFINE_SPINLOCK(dev_map_lock); static LIST_HEAD(dev_map_list); -static struct bpf_map *dev_map_alloc(union bpf_attr *attr) +static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) { - struct bpf_dtab *dtab; int err, cpu; u64 cost; - if (!capable(CAP_NET_ADMIN)) - return ERR_PTR(-EPERM); - /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK) - return ERR_PTR(-EINVAL); + return -EINVAL; /* Lookup returns a pointer straight to dev->ifindex, so make sure the * verifier prevents writes from the BPF side */ attr->map_flags |= BPF_F_RDONLY_PROG; - dtab = kzalloc(sizeof(*dtab), GFP_USER); - if (!dtab) - return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&dtab->map, attr); @@ -107,9 +100,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) /* if map size is larger than memlock limit, reject it */ err = bpf_map_charge_init(&dtab->map.memory, cost); if (err) - goto free_dtab; - - err = -ENOMEM; + return -EINVAL; dtab->flush_list = alloc_percpu(struct list_head); if (!dtab->flush_list) @@ -124,19 +115,38 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) if (!dtab->netdev_map) goto free_percpu; - spin_lock(&dev_map_lock); - list_add_tail_rcu(&dtab->list, &dev_map_list); - spin_unlock(&dev_map_lock); - - return &dtab->map; + return 0; free_percpu: free_percpu(dtab->flush_list); free_charge: bpf_map_charge_finish(&dtab->map.memory); -free_dtab: - kfree(dtab); - return ERR_PTR(err); + return -ENOMEM; +} + +static struct bpf_map *dev_map_alloc(union bpf_attr *attr) +{ + struct bpf_dtab *dtab; + int err; + + if (!capable(CAP_NET_ADMIN)) + return ERR_PTR(-EPERM); + + dtab = kzalloc(sizeof(*dtab), GFP_USER); + if (!dtab) + return ERR_PTR(-ENOMEM); + + err = dev_map_init_map(dtab, attr); + if (err) { + kfree(dtab); + return ERR_PTR(err); + } + + spin_lock(&dev_map_lock); + list_add_tail_rcu(&dtab->list, &dev_map_list); + spin_unlock(&dev_map_lock); + + return &dtab->map; } static void dev_map_free(struct bpf_map *map) @@ -235,7 +245,7 @@ static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags, out: bq->count = 0; - trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit, + trace_xdp_devmap_xmit(&obj->dtab->map, obj->idx, sent, drops, bq->dev_rx, dev, err); bq->dev_rx = NULL; __list_del_clearprev(&bq->flush_node); @@ -412,17 +422,52 @@ static int dev_map_delete_elem(struct bpf_map *map, void *key) return 0; } -static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, - u64 map_flags) +static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, + struct bpf_dtab *dtab, + u32 ifindex, + unsigned int idx) { - struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); - struct net *net = current->nsproxy->net_ns; gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; + struct bpf_dtab_netdev *dev; + struct xdp_bulk_queue *bq; + int cpu; + + dev = kmalloc_node(sizeof(*dev), gfp, dtab->map.numa_node); + if (!dev) + return ERR_PTR(-ENOMEM); + + dev->bulkq = __alloc_percpu_gfp(sizeof(*dev->bulkq), + sizeof(void *), gfp); + if (!dev->bulkq) { + kfree(dev); + return ERR_PTR(-ENOMEM); + } + + for_each_possible_cpu(cpu) { + bq = per_cpu_ptr(dev->bulkq, cpu); + bq->obj = dev; + } + + dev->dev = dev_get_by_index(net, ifindex); + if (!dev->dev) { + free_percpu(dev->bulkq); + kfree(dev); + return ERR_PTR(-EINVAL); + } + + dev->idx = idx; + dev->dtab = dtab; + + return dev; +} + +static int __dev_map_update_elem(struct net *net, struct bpf_map *map, + void *key, void *value, u64 map_flags) +{ + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *dev, *old_dev; u32 ifindex = *(u32 *)value; - struct xdp_bulk_queue *bq; u32 i = *(u32 *)key; - int cpu; if (unlikely(map_flags > BPF_EXIST)) return -EINVAL; @@ -434,31 +479,9 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, if (!ifindex) { dev = NULL; } else { - dev = kmalloc_node(sizeof(*dev), gfp, map->numa_node); - if (!dev) - return -ENOMEM; - - dev->bulkq = __alloc_percpu_gfp(sizeof(*dev->bulkq), - sizeof(void *), gfp); - if (!dev->bulkq) { - kfree(dev); - return -ENOMEM; - } - - for_each_possible_cpu(cpu) { - bq = per_cpu_ptr(dev->bulkq, cpu); - bq->obj = dev; - } - - dev->dev = dev_get_by_index(net, ifindex); - if (!dev->dev) { - free_percpu(dev->bulkq); - kfree(dev); - return -EINVAL; - } - - dev->bit = i; - dev->dtab = dtab; + dev = __dev_map_alloc_node(net, dtab, ifindex, i); + if (IS_ERR(dev)) + return PTR_ERR(dev); } /* Use call_rcu() here to ensure rcu critical sections have completed @@ -472,6 +495,13 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, return 0; } +static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + return __dev_map_update_elem(current->nsproxy->net_ns, + map, key, value, map_flags); +} + const struct bpf_map_ops dev_map_ops = { .map_alloc = dev_map_alloc, .map_free = dev_map_free, -- cgit v1.2.3 From 6f9d451ab1a33728adb72d7ff66a7b374d665176 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Fri, 26 Jul 2019 18:06:55 +0200 Subject: xdp: Add devmap_hash map type for looking up devices by hashed index MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A common pattern when using xdp_redirect_map() is to create a device map where the lookup key is simply ifindex. Because device maps are arrays, this leaves holes in the map, and the map has to be sized to fit the largest ifindex, regardless of how many devices actually are actually needed in the map. This patch adds a second type of device map where the key is looked up using a hashmap, instead of being used as an array index. This allows maps to be densely packed, so they can be smaller. Signed-off-by: Toke Høiland-Jørgensen Acked-by: Yonghong Song Acked-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 7 ++ include/linux/bpf_types.h | 1 + include/trace/events/xdp.h | 3 +- include/uapi/linux/bpf.h | 1 + kernel/bpf/devmap.c | 200 +++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 2 + net/core/filter.c | 9 +- 7 files changed, 220 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index bfdb54dd2ad1..f9a506147c8a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -713,6 +713,7 @@ struct xdp_buff; struct sk_buff; struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key); +struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key); void __dev_map_flush(struct bpf_map *map); int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, struct net_device *dev_rx); @@ -799,6 +800,12 @@ static inline struct net_device *__dev_map_lookup_elem(struct bpf_map *map, return NULL; } +static inline struct net_device *__dev_map_hash_lookup_elem(struct bpf_map *map, + u32 key) +{ + return NULL; +} + static inline void __dev_map_flush(struct bpf_map *map) { } diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index eec5aeeeaf92..36a9c2325176 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -62,6 +62,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY_OF_MAPS, array_of_maps_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops) #ifdef CONFIG_NET BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP_HASH, dev_map_hash_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_SK_STORAGE, sk_storage_map_ops) #if defined(CONFIG_BPF_STREAM_PARSER) BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops) diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index 68899fdc985b..8c8420230a10 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -175,7 +175,8 @@ struct _bpf_dtab_netdev { #endif /* __DEVMAP_OBJ_TYPE */ #define devmap_ifindex(fwd, map) \ - ((map->map_type == BPF_MAP_TYPE_DEVMAP) ? \ + ((map->map_type == BPF_MAP_TYPE_DEVMAP || \ + map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) ? \ ((struct _bpf_dtab_netdev *)fwd)->dev->ifindex : 0) #define _trace_xdp_redirect_map(dev, xdp, fwd, map, idx) \ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e985f07a98ed..6bbef0c7f585 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -134,6 +134,7 @@ enum bpf_map_type { BPF_MAP_TYPE_QUEUE, BPF_MAP_TYPE_STACK, BPF_MAP_TYPE_SK_STORAGE, + BPF_MAP_TYPE_DEVMAP_HASH, }; /* Note that tracing related programs such as diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index a0501266bdb8..9af048a932b5 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -37,6 +37,12 @@ * notifier hook walks the map we know that new dev references can not be * added by the user because core infrastructure ensures dev_get_by_index() * calls will fail at this point. + * + * The devmap_hash type is a map type which interprets keys as ifindexes and + * indexes these using a hashmap. This allows maps that use ifindex as key to be + * densely packed instead of having holes in the lookup array for unused + * ifindexes. The setup and packet enqueue/send code is shared between the two + * types of devmap; only the lookup and insertion is different. */ #include #include @@ -59,6 +65,7 @@ struct xdp_bulk_queue { struct bpf_dtab_netdev { struct net_device *dev; /* must be first member, due to tracepoint */ + struct hlist_node index_hlist; struct bpf_dtab *dtab; struct xdp_bulk_queue __percpu *bulkq; struct rcu_head rcu; @@ -70,11 +77,30 @@ struct bpf_dtab { struct bpf_dtab_netdev **netdev_map; struct list_head __percpu *flush_list; struct list_head list; + + /* these are only used for DEVMAP_HASH type maps */ + struct hlist_head *dev_index_head; + spinlock_t index_lock; + unsigned int items; + u32 n_buckets; }; static DEFINE_SPINLOCK(dev_map_lock); static LIST_HEAD(dev_map_list); +static struct hlist_head *dev_map_create_hash(unsigned int entries) +{ + int i; + struct hlist_head *hash; + + hash = kmalloc_array(entries, sizeof(*hash), GFP_KERNEL); + if (hash != NULL) + for (i = 0; i < entries; i++) + INIT_HLIST_HEAD(&hash[i]); + + return hash; +} + static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) { int err, cpu; @@ -97,6 +123,14 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *); cost += sizeof(struct list_head) * num_possible_cpus(); + if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { + dtab->n_buckets = roundup_pow_of_two(dtab->map.max_entries); + + if (!dtab->n_buckets) /* Overflow check */ + return -EINVAL; + cost += sizeof(struct hlist_head) * dtab->n_buckets; + } + /* if map size is larger than memlock limit, reject it */ err = bpf_map_charge_init(&dtab->map.memory, cost); if (err) @@ -115,8 +149,18 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) if (!dtab->netdev_map) goto free_percpu; + if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { + dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets); + if (!dtab->dev_index_head) + goto free_map_area; + + spin_lock_init(&dtab->index_lock); + } + return 0; +free_map_area: + bpf_map_area_free(dtab->netdev_map); free_percpu: free_percpu(dtab->flush_list); free_charge: @@ -198,6 +242,7 @@ static void dev_map_free(struct bpf_map *map) free_percpu(dtab->flush_list); bpf_map_area_free(dtab->netdev_map); + kfree(dtab->dev_index_head); kfree(dtab); } @@ -218,6 +263,70 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key) return 0; } +static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab, + int idx) +{ + return &dtab->dev_index_head[idx & (dtab->n_buckets - 1)]; +} + +struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key) +{ + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + struct hlist_head *head = dev_map_index_hash(dtab, key); + struct bpf_dtab_netdev *dev; + + hlist_for_each_entry_rcu(dev, head, index_hlist) + if (dev->idx == key) + return dev; + + return NULL; +} + +static int dev_map_hash_get_next_key(struct bpf_map *map, void *key, + void *next_key) +{ + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + u32 idx, *next = next_key; + struct bpf_dtab_netdev *dev, *next_dev; + struct hlist_head *head; + int i = 0; + + if (!key) + goto find_first; + + idx = *(u32 *)key; + + dev = __dev_map_hash_lookup_elem(map, idx); + if (!dev) + goto find_first; + + next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&dev->index_hlist)), + struct bpf_dtab_netdev, index_hlist); + + if (next_dev) { + *next = next_dev->idx; + return 0; + } + + i = idx & (dtab->n_buckets - 1); + i++; + + find_first: + for (; i < dtab->n_buckets; i++) { + head = dev_map_index_hash(dtab, i); + + next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)), + struct bpf_dtab_netdev, + index_hlist); + if (next_dev) { + *next = next_dev->idx; + return 0; + } + } + + return -ENOENT; +} + static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags, bool in_napi_ctx) { @@ -373,6 +482,15 @@ static void *dev_map_lookup_elem(struct bpf_map *map, void *key) return dev ? &dev->ifindex : NULL; } +static void *dev_map_hash_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_dtab_netdev *obj = __dev_map_hash_lookup_elem(map, + *(u32 *)key); + struct net_device *dev = obj ? obj->dev : NULL; + + return dev ? &dev->ifindex : NULL; +} + static void dev_map_flush_old(struct bpf_dtab_netdev *dev) { if (dev->dev->netdev_ops->ndo_xdp_xmit) { @@ -422,6 +540,28 @@ static int dev_map_delete_elem(struct bpf_map *map, void *key) return 0; } +static int dev_map_hash_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + struct bpf_dtab_netdev *old_dev; + int k = *(u32 *)key; + unsigned long flags; + int ret = -ENOENT; + + spin_lock_irqsave(&dtab->index_lock, flags); + + old_dev = __dev_map_hash_lookup_elem(map, k); + if (old_dev) { + dtab->items--; + hlist_del_init_rcu(&old_dev->index_hlist); + call_rcu(&old_dev->rcu, __dev_map_entry_free); + ret = 0; + } + spin_unlock_irqrestore(&dtab->index_lock, flags); + + return ret; +} + static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, struct bpf_dtab *dtab, u32 ifindex, @@ -502,6 +642,56 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, map, key, value, map_flags); } +static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map, + void *key, void *value, u64 map_flags) +{ + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + struct bpf_dtab_netdev *dev, *old_dev; + u32 ifindex = *(u32 *)value; + u32 idx = *(u32 *)key; + unsigned long flags; + + if (unlikely(map_flags > BPF_EXIST || !ifindex)) + return -EINVAL; + + old_dev = __dev_map_hash_lookup_elem(map, idx); + if (old_dev && (map_flags & BPF_NOEXIST)) + return -EEXIST; + + dev = __dev_map_alloc_node(net, dtab, ifindex, idx); + if (IS_ERR(dev)) + return PTR_ERR(dev); + + spin_lock_irqsave(&dtab->index_lock, flags); + + if (old_dev) { + hlist_del_rcu(&old_dev->index_hlist); + } else { + if (dtab->items >= dtab->map.max_entries) { + spin_unlock_irqrestore(&dtab->index_lock, flags); + call_rcu(&dev->rcu, __dev_map_entry_free); + return -E2BIG; + } + dtab->items++; + } + + hlist_add_head_rcu(&dev->index_hlist, + dev_map_index_hash(dtab, idx)); + spin_unlock_irqrestore(&dtab->index_lock, flags); + + if (old_dev) + call_rcu(&old_dev->rcu, __dev_map_entry_free); + + return 0; +} + +static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + return __dev_map_hash_update_elem(current->nsproxy->net_ns, + map, key, value, map_flags); +} + const struct bpf_map_ops dev_map_ops = { .map_alloc = dev_map_alloc, .map_free = dev_map_free, @@ -512,6 +702,16 @@ const struct bpf_map_ops dev_map_ops = { .map_check_btf = map_check_no_btf, }; +const struct bpf_map_ops dev_map_hash_ops = { + .map_alloc = dev_map_alloc, + .map_free = dev_map_free, + .map_get_next_key = dev_map_hash_get_next_key, + .map_lookup_elem = dev_map_hash_lookup_elem, + .map_update_elem = dev_map_hash_update_elem, + .map_delete_elem = dev_map_hash_delete_elem, + .map_check_btf = map_check_no_btf, +}; + static int dev_map_notification(struct notifier_block *notifier, ulong event, void *ptr) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5900cbb966b1..cef851cd5c36 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3457,6 +3457,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, goto error; break; case BPF_MAP_TYPE_DEVMAP: + case BPF_MAP_TYPE_DEVMAP_HASH: if (func_id != BPF_FUNC_redirect_map && func_id != BPF_FUNC_map_lookup_elem) goto error; @@ -3539,6 +3540,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, break; case BPF_FUNC_redirect_map: if (map->map_type != BPF_MAP_TYPE_DEVMAP && + map->map_type != BPF_MAP_TYPE_DEVMAP_HASH && map->map_type != BPF_MAP_TYPE_CPUMAP && map->map_type != BPF_MAP_TYPE_XSKMAP) goto error; diff --git a/net/core/filter.c b/net/core/filter.c index 3961437ccc50..1eee70f80fba 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3517,7 +3517,8 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, int err; switch (map->map_type) { - case BPF_MAP_TYPE_DEVMAP: { + case BPF_MAP_TYPE_DEVMAP: + case BPF_MAP_TYPE_DEVMAP_HASH: { struct bpf_dtab_netdev *dst = fwd; err = dev_map_enqueue(dst, xdp, dev_rx); @@ -3554,6 +3555,7 @@ void xdp_do_flush_map(void) if (map) { switch (map->map_type) { case BPF_MAP_TYPE_DEVMAP: + case BPF_MAP_TYPE_DEVMAP_HASH: __dev_map_flush(map); break; case BPF_MAP_TYPE_CPUMAP: @@ -3574,6 +3576,8 @@ static inline void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index) switch (map->map_type) { case BPF_MAP_TYPE_DEVMAP: return __dev_map_lookup_elem(map, index); + case BPF_MAP_TYPE_DEVMAP_HASH: + return __dev_map_hash_lookup_elem(map, index); case BPF_MAP_TYPE_CPUMAP: return __cpu_map_lookup_elem(map, index); case BPF_MAP_TYPE_XSKMAP: @@ -3655,7 +3659,8 @@ static int xdp_do_generic_redirect_map(struct net_device *dev, ri->tgt_value = NULL; WRITE_ONCE(ri->map, NULL); - if (map->map_type == BPF_MAP_TYPE_DEVMAP) { + if (map->map_type == BPF_MAP_TYPE_DEVMAP || + map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { struct bpf_dtab_netdev *dst = fwd; err = dev_map_generic_redirect(dst, skb, xdp_prog); -- cgit v1.2.3 From 9babe825da769cfbd7080f95e6bddbe98ce6b95d Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 29 Jul 2019 14:51:10 -0700 Subject: bpf: always allocate at least 16 bytes for setsockopt hook Since we always allocate memory, allocate just a little bit more for the BPF program in case it need to override user input with bigger value. The canonical example is TCP_CONGESTION where input string might be too small to override (nv -> bbr or cubic). 16 bytes are chosen to match the size of TCP_CA_NAME_MAX and can be extended in the future if needed. Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov --- kernel/bpf/cgroup.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 0a00eaca6fae..6a6a154cfa7b 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -964,7 +964,6 @@ static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen) return -ENOMEM; ctx->optval_end = ctx->optval + max_optlen; - ctx->optlen = max_optlen; return 0; } @@ -984,7 +983,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, .level = *level, .optname = *optname, }; - int ret; + int ret, max_optlen; /* Opportunistic check to see whether we have any BPF program * attached to the hook so we don't waste time allocating @@ -994,10 +993,18 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT)) return 0; - ret = sockopt_alloc_buf(&ctx, *optlen); + /* Allocate a bit more than the initial user buffer for + * BPF program. The canonical use case is overriding + * TCP_CONGESTION(nv) to TCP_CONGESTION(cubic). + */ + max_optlen = max_t(int, 16, *optlen); + + ret = sockopt_alloc_buf(&ctx, max_optlen); if (ret) return ret; + ctx.optlen = *optlen; + if (copy_from_user(ctx.optval, optval, *optlen) != 0) { ret = -EFAULT; goto out; @@ -1016,7 +1023,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, if (ctx.optlen == -1) { /* optlen set to -1, bypass kernel */ ret = 1; - } else if (ctx.optlen > *optlen || ctx.optlen < -1) { + } else if (ctx.optlen > max_optlen || ctx.optlen < -1) { /* optlen is out of bounds */ ret = -EFAULT; } else { @@ -1063,6 +1070,8 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, if (ret) return ret; + ctx.optlen = max_optlen; + if (!retval) { /* If kernel getsockopt finished successfully, * copy whatever was returned to the user back -- cgit v1.2.3 From 341dfcf8d78eaa3a2dc96dea06f0392eb2978364 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 12 Aug 2019 11:39:47 -0700 Subject: btf: expose BTF info through sysfs Make .BTF section allocated and expose its contents through sysfs. /sys/kernel/btf directory is created to contain all the BTFs present inside kernel. Currently there is only kernel's main BTF, represented as /sys/kernel/btf/kernel file. Once kernel modules' BTFs are supported, each module will expose its BTF as /sys/kernel/btf/ file. Current approach relies on a few pieces coming together: 1. pahole is used to take almost final vmlinux image (modulo .BTF and kallsyms) and generate .BTF section by converting DWARF info into BTF. This section is not allocated and not mapped to any segment, though, so is not yet accessible from inside kernel at runtime. 2. objcopy dumps .BTF contents into binary file and subsequently convert binary file into linkable object file with automatically generated symbols _binary__btf_kernel_bin_start and _binary__btf_kernel_bin_end, pointing to start and end, respectively, of BTF raw data. 3. final vmlinux image is generated by linking this object file (and kallsyms, if necessary). sysfs_btf.c then creates /sys/kernel/btf/kernel file and exposes embedded BTF contents through it. This allows, e.g., libbpf and bpftool access BTF info at well-known location, without resorting to searching for vmlinux image on disk (location of which is not standardized and vmlinux image might not be even available in some scenarios, e.g., inside qemu during testing). Alternative approach using .incbin assembler directive to embed BTF contents directly was attempted but didn't work, because sysfs_proc.o is not re-compiled during link-vmlinux.sh stage. This is required, though, to update embedded BTF data (initially empty data is embedded, then pahole generates BTF info and we need to regenerate sysfs_btf.o with updated contents, but it's too late at that point). If BTF couldn't be generated due to missing or too old pahole, sysfs_btf.c handles that gracefully by detecting that _binary__btf_kernel_bin_start (weak symbol) is 0 and not creating /sys/kernel/btf at all. v2->v3: - added Documentation/ABI/testing/sysfs-kernel-btf (Greg K-H); - created proper kobject (btf_kobj) for btf directory (Greg K-H); - undo v2 change of reusing vmlinux, as it causes extra kallsyms pass due to initially missing __binary__btf_kernel_bin_{start/end} symbols; v1->v2: - allow kallsyms stage to re-use vmlinux generated by gen_btf(); Reviewed-by: Greg Kroah-Hartman Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann --- Documentation/ABI/testing/sysfs-kernel-btf | 17 ++++++++++ kernel/bpf/Makefile | 3 ++ kernel/bpf/sysfs_btf.c | 51 +++++++++++++++++++++++++++++ scripts/link-vmlinux.sh | 52 +++++++++++++++++++----------- 4 files changed, 104 insertions(+), 19 deletions(-) create mode 100644 Documentation/ABI/testing/sysfs-kernel-btf create mode 100644 kernel/bpf/sysfs_btf.c (limited to 'kernel') diff --git a/Documentation/ABI/testing/sysfs-kernel-btf b/Documentation/ABI/testing/sysfs-kernel-btf new file mode 100644 index 000000000000..5390f8001f96 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-kernel-btf @@ -0,0 +1,17 @@ +What: /sys/kernel/btf +Date: Aug 2019 +KernelVersion: 5.5 +Contact: bpf@vger.kernel.org +Description: + Contains BTF type information and related data for kernel and + kernel modules. + +What: /sys/kernel/btf/kernel +Date: Aug 2019 +KernelVersion: 5.5 +Contact: bpf@vger.kernel.org +Description: + Read-only binary attribute exposing kernel's own BTF type + information with description of all internal kernel types. See + Documentation/bpf/btf.rst for detailed description of format + itself. diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 29d781061cd5..e1d9adb212f9 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -22,3 +22,6 @@ obj-$(CONFIG_CGROUP_BPF) += cgroup.o ifeq ($(CONFIG_INET),y) obj-$(CONFIG_BPF_SYSCALL) += reuseport_array.o endif +ifeq ($(CONFIG_SYSFS),y) +obj-$(CONFIG_DEBUG_INFO_BTF) += sysfs_btf.o +endif diff --git a/kernel/bpf/sysfs_btf.c b/kernel/bpf/sysfs_btf.c new file mode 100644 index 000000000000..092e63b9758b --- /dev/null +++ b/kernel/bpf/sysfs_btf.c @@ -0,0 +1,51 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Provide kernel BTF information for introspection and use by eBPF tools. + */ +#include +#include +#include +#include +#include + +/* See scripts/link-vmlinux.sh, gen_btf() func for details */ +extern char __weak _binary__btf_kernel_bin_start[]; +extern char __weak _binary__btf_kernel_bin_end[]; + +static ssize_t +btf_kernel_read(struct file *file, struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buf, loff_t off, size_t len) +{ + memcpy(buf, _binary__btf_kernel_bin_start + off, len); + return len; +} + +static struct bin_attribute bin_attr_btf_kernel __ro_after_init = { + .attr = { .name = "kernel", .mode = 0444, }, + .read = btf_kernel_read, +}; + +static struct kobject *btf_kobj; + +static int __init btf_kernel_init(void) +{ + int err; + + if (!_binary__btf_kernel_bin_start) + return 0; + + btf_kobj = kobject_create_and_add("btf", kernel_kobj); + if (IS_ERR(btf_kobj)) { + err = PTR_ERR(btf_kobj); + btf_kobj = NULL; + return err; + } + + bin_attr_btf_kernel.size = _binary__btf_kernel_bin_end - + _binary__btf_kernel_bin_start; + + return sysfs_create_bin_file(btf_kobj, &bin_attr_btf_kernel); +} + +subsys_initcall(btf_kernel_init); diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index a7124f895b24..cb93832c6ad7 100755 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -56,8 +56,8 @@ modpost_link() } # Link of vmlinux -# ${1} - optional extra .o files -# ${2} - output file +# ${1} - output file +# ${@:2} - optional extra .o files vmlinux_link() { local lds="${objtree}/${KBUILD_LDS}" @@ -70,9 +70,9 @@ vmlinux_link() --start-group \ ${KBUILD_VMLINUX_LIBS} \ --end-group \ - ${1}" + ${@:2}" - ${LD} ${KBUILD_LDFLAGS} ${LDFLAGS_vmlinux} -o ${2} \ + ${LD} ${KBUILD_LDFLAGS} ${LDFLAGS_vmlinux} -o ${1} \ -T ${lds} ${objects} else objects="-Wl,--whole-archive \ @@ -81,9 +81,9 @@ vmlinux_link() -Wl,--start-group \ ${KBUILD_VMLINUX_LIBS} \ -Wl,--end-group \ - ${1}" + ${@:2}" - ${CC} ${CFLAGS_vmlinux} -o ${2} \ + ${CC} ${CFLAGS_vmlinux} -o ${1} \ -Wl,-T,${lds} \ ${objects} \ -lutil -lrt -lpthread @@ -92,23 +92,34 @@ vmlinux_link() } # generate .BTF typeinfo from DWARF debuginfo +# ${1} - vmlinux image +# ${2} - file to dump raw BTF data into gen_btf() { - local pahole_ver; + local pahole_ver + local bin_arch if ! [ -x "$(command -v ${PAHOLE})" ]; then info "BTF" "${1}: pahole (${PAHOLE}) is not available" - return 0 + return 1 fi pahole_ver=$(${PAHOLE} --version | sed -E 's/v([0-9]+)\.([0-9]+)/\1\2/') if [ "${pahole_ver}" -lt "113" ]; then info "BTF" "${1}: pahole version $(${PAHOLE} --version) is too old, need at least v1.13" - return 0 + return 1 fi - info "BTF" ${1} + info "BTF" ${2} + vmlinux_link ${1} LLVM_OBJCOPY=${OBJCOPY} ${PAHOLE} -J ${1} + + # dump .BTF section into raw binary file to link with final vmlinux + bin_arch=$(${OBJDUMP} -f ${1} | grep architecture | \ + cut -d, -f1 | cut -d' ' -f2) + ${OBJCOPY} --dump-section .BTF=.btf.kernel.bin ${1} 2>/dev/null + ${OBJCOPY} -I binary -O ${CONFIG_OUTPUT_FORMAT} -B ${bin_arch} \ + --rename-section .data=.BTF .btf.kernel.bin ${2} } # Create ${2} .o file with all symbols from the ${1} object file @@ -153,6 +164,7 @@ sortextable() # Delete output files in case of error cleanup() { + rm -f .btf.* rm -f .tmp_System.map rm -f .tmp_kallsyms* rm -f .tmp_vmlinux* @@ -215,6 +227,13 @@ ${MAKE} -f "${srctree}/scripts/Makefile.modpost" vmlinux.o info MODINFO modules.builtin.modinfo ${OBJCOPY} -j .modinfo -O binary vmlinux.o modules.builtin.modinfo +btf_kernel_bin_o="" +if [ -n "${CONFIG_DEBUG_INFO_BTF}" ]; then + if gen_btf .tmp_vmlinux.btf .btf.kernel.bin.o ; then + btf_kernel_bin_o=.btf.kernel.bin.o + fi +fi + kallsymso="" kallsyms_vmlinux="" if [ -n "${CONFIG_KALLSYMS}" ]; then @@ -246,11 +265,11 @@ if [ -n "${CONFIG_KALLSYMS}" ]; then kallsyms_vmlinux=.tmp_vmlinux2 # step 1 - vmlinux_link "" .tmp_vmlinux1 + vmlinux_link .tmp_vmlinux1 ${btf_kernel_bin_o} kallsyms .tmp_vmlinux1 .tmp_kallsyms1.o # step 2 - vmlinux_link .tmp_kallsyms1.o .tmp_vmlinux2 + vmlinux_link .tmp_vmlinux2 .tmp_kallsyms1.o ${btf_kernel_bin_o} kallsyms .tmp_vmlinux2 .tmp_kallsyms2.o # step 3 @@ -261,18 +280,13 @@ if [ -n "${CONFIG_KALLSYMS}" ]; then kallsymso=.tmp_kallsyms3.o kallsyms_vmlinux=.tmp_vmlinux3 - vmlinux_link .tmp_kallsyms2.o .tmp_vmlinux3 - + vmlinux_link .tmp_vmlinux3 .tmp_kallsyms2.o ${btf_kernel_bin_o} kallsyms .tmp_vmlinux3 .tmp_kallsyms3.o fi fi info LD vmlinux -vmlinux_link "${kallsymso}" vmlinux - -if [ -n "${CONFIG_DEBUG_INFO_BTF}" ]; then - gen_btf vmlinux -fi +vmlinux_link vmlinux "${kallsymso}" "${btf_kernel_bin_o}" if [ -n "${CONFIG_BUILDTIME_EXTABLE_SORT}" ]; then info SORTEX vmlinux -- cgit v1.2.3 From 7fd785685e2243bb639b31557e258d11464c3489 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 13 Aug 2019 11:54:42 -0700 Subject: btf: rename /sys/kernel/btf/kernel into /sys/kernel/btf/vmlinux Expose kernel's BTF under the name vmlinux to be more uniform with using kernel module names as file names in the future. Fixes: 341dfcf8d78e ("btf: expose BTF info through sysfs") Suggested-by: Daniel Borkmann Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann --- Documentation/ABI/testing/sysfs-kernel-btf | 2 +- kernel/bpf/sysfs_btf.c | 30 +++++++++++++++--------------- scripts/link-vmlinux.sh | 18 +++++++++--------- 3 files changed, 25 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/Documentation/ABI/testing/sysfs-kernel-btf b/Documentation/ABI/testing/sysfs-kernel-btf index 5390f8001f96..2c9744b2cd59 100644 --- a/Documentation/ABI/testing/sysfs-kernel-btf +++ b/Documentation/ABI/testing/sysfs-kernel-btf @@ -6,7 +6,7 @@ Description: Contains BTF type information and related data for kernel and kernel modules. -What: /sys/kernel/btf/kernel +What: /sys/kernel/btf/vmlinux Date: Aug 2019 KernelVersion: 5.5 Contact: bpf@vger.kernel.org diff --git a/kernel/bpf/sysfs_btf.c b/kernel/bpf/sysfs_btf.c index 092e63b9758b..4659349fc795 100644 --- a/kernel/bpf/sysfs_btf.c +++ b/kernel/bpf/sysfs_btf.c @@ -9,30 +9,30 @@ #include /* See scripts/link-vmlinux.sh, gen_btf() func for details */ -extern char __weak _binary__btf_kernel_bin_start[]; -extern char __weak _binary__btf_kernel_bin_end[]; +extern char __weak _binary__btf_vmlinux_bin_start[]; +extern char __weak _binary__btf_vmlinux_bin_end[]; static ssize_t -btf_kernel_read(struct file *file, struct kobject *kobj, - struct bin_attribute *bin_attr, - char *buf, loff_t off, size_t len) +btf_vmlinux_read(struct file *file, struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buf, loff_t off, size_t len) { - memcpy(buf, _binary__btf_kernel_bin_start + off, len); + memcpy(buf, _binary__btf_vmlinux_bin_start + off, len); return len; } -static struct bin_attribute bin_attr_btf_kernel __ro_after_init = { - .attr = { .name = "kernel", .mode = 0444, }, - .read = btf_kernel_read, +static struct bin_attribute bin_attr_btf_vmlinux __ro_after_init = { + .attr = { .name = "vmlinux", .mode = 0444, }, + .read = btf_vmlinux_read, }; static struct kobject *btf_kobj; -static int __init btf_kernel_init(void) +static int __init btf_vmlinux_init(void) { int err; - if (!_binary__btf_kernel_bin_start) + if (!_binary__btf_vmlinux_bin_start) return 0; btf_kobj = kobject_create_and_add("btf", kernel_kobj); @@ -42,10 +42,10 @@ static int __init btf_kernel_init(void) return err; } - bin_attr_btf_kernel.size = _binary__btf_kernel_bin_end - - _binary__btf_kernel_bin_start; + bin_attr_btf_vmlinux.size = _binary__btf_vmlinux_bin_end - + _binary__btf_vmlinux_bin_start; - return sysfs_create_bin_file(btf_kobj, &bin_attr_btf_kernel); + return sysfs_create_bin_file(btf_kobj, &bin_attr_btf_vmlinux); } -subsys_initcall(btf_kernel_init); +subsys_initcall(btf_vmlinux_init); diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index cb93832c6ad7..f7933c606f27 100755 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -117,9 +117,9 @@ gen_btf() # dump .BTF section into raw binary file to link with final vmlinux bin_arch=$(${OBJDUMP} -f ${1} | grep architecture | \ cut -d, -f1 | cut -d' ' -f2) - ${OBJCOPY} --dump-section .BTF=.btf.kernel.bin ${1} 2>/dev/null + ${OBJCOPY} --dump-section .BTF=.btf.vmlinux.bin ${1} 2>/dev/null ${OBJCOPY} -I binary -O ${CONFIG_OUTPUT_FORMAT} -B ${bin_arch} \ - --rename-section .data=.BTF .btf.kernel.bin ${2} + --rename-section .data=.BTF .btf.vmlinux.bin ${2} } # Create ${2} .o file with all symbols from the ${1} object file @@ -227,10 +227,10 @@ ${MAKE} -f "${srctree}/scripts/Makefile.modpost" vmlinux.o info MODINFO modules.builtin.modinfo ${OBJCOPY} -j .modinfo -O binary vmlinux.o modules.builtin.modinfo -btf_kernel_bin_o="" +btf_vmlinux_bin_o="" if [ -n "${CONFIG_DEBUG_INFO_BTF}" ]; then - if gen_btf .tmp_vmlinux.btf .btf.kernel.bin.o ; then - btf_kernel_bin_o=.btf.kernel.bin.o + if gen_btf .tmp_vmlinux.btf .btf.vmlinux.bin.o ; then + btf_vmlinux_bin_o=.btf.vmlinux.bin.o fi fi @@ -265,11 +265,11 @@ if [ -n "${CONFIG_KALLSYMS}" ]; then kallsyms_vmlinux=.tmp_vmlinux2 # step 1 - vmlinux_link .tmp_vmlinux1 ${btf_kernel_bin_o} + vmlinux_link .tmp_vmlinux1 ${btf_vmlinux_bin_o} kallsyms .tmp_vmlinux1 .tmp_kallsyms1.o # step 2 - vmlinux_link .tmp_vmlinux2 .tmp_kallsyms1.o ${btf_kernel_bin_o} + vmlinux_link .tmp_vmlinux2 .tmp_kallsyms1.o ${btf_vmlinux_bin_o} kallsyms .tmp_vmlinux2 .tmp_kallsyms2.o # step 3 @@ -280,13 +280,13 @@ if [ -n "${CONFIG_KALLSYMS}" ]; then kallsymso=.tmp_kallsyms3.o kallsyms_vmlinux=.tmp_vmlinux3 - vmlinux_link .tmp_vmlinux3 .tmp_kallsyms2.o ${btf_kernel_bin_o} + vmlinux_link .tmp_vmlinux3 .tmp_kallsyms2.o ${btf_vmlinux_bin_o} kallsyms .tmp_vmlinux3 .tmp_kallsyms3.o fi fi info LD vmlinux -vmlinux_link vmlinux "${kallsymso}" "${btf_kernel_bin_o}" +vmlinux_link vmlinux "${kallsymso}" "${btf_vmlinux_bin_o}" if [ -n "${CONFIG_BUILDTIME_EXTABLE_SORT}" ]; then info SORTEX vmlinux -- cgit v1.2.3