Diffstat (limited to 'net')
 net/bpfilter/main.c |  2
 net/core/filter.c   | 72
 net/core/xdp.c      | 11
 net/sched/act_bpf.c |  2
 net/sched/cls_bpf.c |  3
 net/xdp/xsk.c       |  4
 net/xdp/xsk.h       |  4
 net/xdp/xskmap.c    | 29

8 files changed, 65 insertions(+), 62 deletions(-)
diff --git a/net/bpfilter/main.c b/net/bpfilter/main.c
index 05e1cfc1e5cd..291a92546246 100644
--- a/net/bpfilter/main.c
+++ b/net/bpfilter/main.c
@@ -57,7 +57,7 @@ int main(void)
 {
 	debug_f = fopen("/dev/kmsg", "w");
 	setvbuf(debug_f, 0, _IOLBF, 0);
-	fprintf(debug_f, "Started bpfilter\n");
+	fprintf(debug_f, "<5>Started bpfilter\n");
 	loop();
 	fclose(debug_f);
 	return 0;
diff --git a/net/core/filter.c b/net/core/filter.c
index 0b13d8157a8f..d22895caa164 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3235,15 +3235,12 @@ static int bpf_skb_net_hdr_pop(struct sk_buff *skb, u32 off, u32 len)
 	return ret;
 }
 
-static int bpf_skb_proto_4_to_6(struct sk_buff *skb, u64 flags)
+static int bpf_skb_proto_4_to_6(struct sk_buff *skb)
 {
 	const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr);
 	u32 off = skb_mac_header_len(skb);
 	int ret;
 
-	if (skb_is_gso(skb) && !skb_is_gso_tcp(skb))
-		return -ENOTSUPP;
-
 	ret = skb_cow(skb, len_diff);
 	if (unlikely(ret < 0))
 		return ret;
@@ -3255,21 +3252,11 @@ static int bpf_skb_proto_4_to_6(struct sk_buff *skb, u64 flags)
 	if (skb_is_gso(skb)) {
 		struct skb_shared_info *shinfo = skb_shinfo(skb);
 
-		/* SKB_GSO_TCPV4 needs to be changed into
-		 * SKB_GSO_TCPV6.
-		 */
+		/* SKB_GSO_TCPV4 needs to be changed into SKB_GSO_TCPV6. */
 		if (shinfo->gso_type & SKB_GSO_TCPV4) {
 			shinfo->gso_type &= ~SKB_GSO_TCPV4;
 			shinfo->gso_type |= SKB_GSO_TCPV6;
 		}
-
-		/* Due to IPv6 header, MSS needs to be downgraded. */
-		if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO))
-			skb_decrease_gso_size(shinfo, len_diff);
-
-		/* Header must be checked, and gso_segs recomputed. */
-		shinfo->gso_type |= SKB_GSO_DODGY;
-		shinfo->gso_segs = 0;
 	}
 
 	skb->protocol = htons(ETH_P_IPV6);
@@ -3278,15 +3265,12 @@ static int bpf_skb_proto_4_to_6(struct sk_buff *skb, u64 flags)
 	return 0;
 }
 
-static int bpf_skb_proto_6_to_4(struct sk_buff *skb, u64 flags)
+static int bpf_skb_proto_6_to_4(struct sk_buff *skb)
 {
 	const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr);
 	u32 off = skb_mac_header_len(skb);
 	int ret;
 
-	if (skb_is_gso(skb) && !skb_is_gso_tcp(skb))
-		return -ENOTSUPP;
-
 	ret = skb_unclone(skb, GFP_ATOMIC);
 	if (unlikely(ret < 0))
 		return ret;
@@ -3298,21 +3282,11 @@ static int bpf_skb_proto_6_to_4(struct sk_buff *skb, u64 flags)
 	if (skb_is_gso(skb)) {
 		struct skb_shared_info *shinfo = skb_shinfo(skb);
 
-		/* SKB_GSO_TCPV6 needs to be changed into
-		 * SKB_GSO_TCPV4.
-		 */
+		/* SKB_GSO_TCPV6 needs to be changed into SKB_GSO_TCPV4. */
 		if (shinfo->gso_type & SKB_GSO_TCPV6) {
 			shinfo->gso_type &= ~SKB_GSO_TCPV6;
 			shinfo->gso_type |= SKB_GSO_TCPV4;
 		}
-
-		/* Due to IPv4 header, MSS can be upgraded. */
-		if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO))
-			skb_increase_gso_size(shinfo, len_diff);
-
-		/* Header must be checked, and gso_segs recomputed. */
-		shinfo->gso_type |= SKB_GSO_DODGY;
-		shinfo->gso_segs = 0;
 	}
 
 	skb->protocol = htons(ETH_P_IP);
@@ -3321,17 +3295,17 @@ static int bpf_skb_proto_6_to_4(struct sk_buff *skb, u64 flags)
 	return 0;
 }
 
-static int bpf_skb_proto_xlat(struct sk_buff *skb, __be16 to_proto, u64 flags)
+static int bpf_skb_proto_xlat(struct sk_buff *skb, __be16 to_proto)
 {
 	__be16 from_proto = skb->protocol;
 
 	if (from_proto == htons(ETH_P_IP) &&
 	    to_proto == htons(ETH_P_IPV6))
-		return bpf_skb_proto_4_to_6(skb, flags);
+		return bpf_skb_proto_4_to_6(skb);
 
 	if (from_proto == htons(ETH_P_IPV6) &&
 	    to_proto == htons(ETH_P_IP))
-		return bpf_skb_proto_6_to_4(skb, flags);
+		return bpf_skb_proto_6_to_4(skb);
 
 	return -ENOTSUPP;
 }
@@ -3341,7 +3315,7 @@ BPF_CALL_3(bpf_skb_change_proto, struct sk_buff *, skb, __be16, proto,
 {
 	int ret;
 
-	if (unlikely(flags & ~(BPF_F_ADJ_ROOM_FIXED_GSO)))
+	if (unlikely(flags))
 		return -EINVAL;
 
 	/* General idea is that this helper does the basic groundwork
@@ -3361,7 +3335,7 @@ BPF_CALL_3(bpf_skb_change_proto, struct sk_buff *, skb, __be16, proto,
 	 * that. For offloads, we mark packet as dodgy, so that headers
 	 * need to be verified first.
 	 */
-	ret = bpf_skb_proto_xlat(skb, proto, flags);
+	ret = bpf_skb_proto_xlat(skb, proto);
 	bpf_compute_data_pointers(skb);
 	return ret;
 }
@@ -3923,6 +3897,34 @@ static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = {
 	.arg2_type	= ARG_ANYTHING,
 };
 
+/* XDP_REDIRECT works by a three-step process, implemented in the functions
+ * below:
+ *
+ * 1. The bpf_redirect() and bpf_redirect_map() helpers will lookup the target
+ *    of the redirect and store it (along with some other metadata) in a per-CPU
+ *    struct bpf_redirect_info.
+ *
+ * 2. When the program returns the XDP_REDIRECT return code, the driver will
+ *    call xdp_do_redirect() which will use the information in struct
+ *    bpf_redirect_info to actually enqueue the frame into a map type-specific
+ *    bulk queue structure.
+ *
+ * 3. Before exiting its NAPI poll loop, the driver will call xdp_do_flush(),
+ *    which will flush all the different bulk queues, thus completing the
+ *    redirect.
+ *
+ * Pointers to the map entries will be kept around for this whole sequence of
+ * steps, protected by RCU. However, there is no top-level rcu_read_lock() in
+ * the core code; instead, the RCU protection relies on everything happening
+ * inside a single NAPI poll sequence, which means it's between a pair of calls
+ * to local_bh_disable()/local_bh_enable().
+ *
+ * The map entries are marked as __rcu and the map code makes sure to
+ * dereference those pointers with rcu_dereference_check() in a way that works
+ * for both sections that hold an rcu_read_lock() and sections that are
+ * called from NAPI without a separate rcu_read_lock(). The code below does not
+ * use RCU annotations, but relies on those in the map code.
+ */
 void xdp_do_flush(void)
 {
 	__dev_flush();
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 725d20f1b100..cc92ccb38432 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -113,8 +113,13 @@ static void mem_allocator_disconnect(void *allocator)
 void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq)
 {
 	struct xdp_mem_allocator *xa;
+	int type = xdp_rxq->mem.type;
 	int id = xdp_rxq->mem.id;
 
+	/* Reset mem info to defaults */
+	xdp_rxq->mem.id = 0;
+	xdp_rxq->mem.type = 0;
+
 	if (xdp_rxq->reg_state != REG_STATE_REGISTERED) {
 		WARN(1, "Missing register, driver bug");
 		return;
@@ -123,7 +128,7 @@ void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq)
 	if (id == 0)
 		return;
 
-	if (xdp_rxq->mem.type == MEM_TYPE_PAGE_POOL) {
+	if (type == MEM_TYPE_PAGE_POOL) {
 		rcu_read_lock();
 		xa = rhashtable_lookup(mem_id_ht, &id, mem_id_rht_params);
 		page_pool_destroy(xa->page_pool);
@@ -144,10 +149,6 @@ void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq)
 
 	xdp_rxq->reg_state = REG_STATE_UNREGISTERED;
 	xdp_rxq->dev = NULL;
-
-	/* Reset mem info to defaults */
-	xdp_rxq->mem.id = 0;
-	xdp_rxq->mem.type = 0;
 }
 EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg);
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index e48e980c3b93..e409a0005717 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -43,7 +43,6 @@ static int tcf_bpf_act(struct sk_buff *skb, const struct tc_action *act,
 	tcf_lastuse_update(&prog->tcf_tm);
 	bstats_cpu_update(this_cpu_ptr(prog->common.cpu_bstats), skb);
 
-	rcu_read_lock();
 	filter = rcu_dereference(prog->filter);
 	if (at_ingress) {
 		__skb_push(skb, skb->mac_len);
@@ -56,7 +55,6 @@ static int tcf_bpf_act(struct sk_buff *skb, const struct tc_action *act,
 	}
 	if (skb_sk_is_prefetched(skb) && filter_res != TC_ACT_OK)
 		skb_orphan(skb);
-	rcu_read_unlock();
 
 	/* A BPF program may overwrite the default action opcode.
 	 * Similarly as in cls_bpf, if filter_res == -1 we use the
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index 6e3e63db0e01..fa739efa59f4 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -85,8 +85,6 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 	struct cls_bpf_prog *prog;
 	int ret = -1;
 
-	/* Needed here for accessing maps. */
-	rcu_read_lock();
 	list_for_each_entry_rcu(prog, &head->plist, link) {
 		int filter_res;
 
@@ -131,7 +129,6 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 			break;
 	}
 
-	rcu_read_unlock();
 	return ret;
 }
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index cd62d4ba87a9..996da915f520 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -749,7 +749,7 @@ static void xsk_unbind_dev(struct xdp_sock *xs)
 }
 
 static struct xsk_map *xsk_get_map_list_entry(struct xdp_sock *xs,
-					      struct xdp_sock ***map_entry)
+					      struct xdp_sock __rcu ***map_entry)
 {
 	struct xsk_map *map = NULL;
 	struct xsk_map_node *node;
@@ -785,7 +785,7 @@ static void xsk_delete_from_maps(struct xdp_sock *xs)
 	 * might be updates to the map between
 	 * xsk_get_map_list_entry() and xsk_map_try_sock_delete().
 	 */
-	struct xdp_sock **map_entry = NULL;
+	struct xdp_sock __rcu **map_entry = NULL;
 	struct xsk_map *map;
 
 	while ((map = xsk_get_map_list_entry(xs, &map_entry))) {
diff --git a/net/xdp/xsk.h b/net/xdp/xsk.h
index edcf249ad1f1..a4bc4749faac 100644
--- a/net/xdp/xsk.h
+++ b/net/xdp/xsk.h
@@ -31,7 +31,7 @@ struct xdp_mmap_offsets_v1 {
 struct xsk_map_node {
 	struct list_head node;
 	struct xsk_map *map;
-	struct xdp_sock **map_entry;
+	struct xdp_sock __rcu **map_entry;
 };
 
 static inline struct xdp_sock *xdp_sk(struct sock *sk)
@@ -40,7 +40,7 @@ static inline struct xdp_sock *xdp_sk(struct sock *sk)
 }
 
 void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs,
-			     struct xdp_sock **map_entry);
+			     struct xdp_sock __rcu **map_entry);
 void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id);
 int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool,
 			u16 queue_id);
diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c
index 9df75ea4a567..2e48d0e094d9 100644
--- a/net/xdp/xskmap.c
+++ b/net/xdp/xskmap.c
@@ -12,7 +12,7 @@
 #include "xsk.h"
 
 static struct xsk_map_node *xsk_map_node_alloc(struct xsk_map *map,
-					       struct xdp_sock **map_entry)
+					       struct xdp_sock __rcu **map_entry)
 {
 	struct xsk_map_node *node;
 
@@ -42,7 +42,7 @@ static void xsk_map_sock_add(struct xdp_sock *xs, struct xsk_map_node *node)
 }
 
 static void xsk_map_sock_delete(struct xdp_sock *xs,
-				struct xdp_sock **map_entry)
+				struct xdp_sock __rcu **map_entry)
 {
 	struct xsk_map_node *n, *tmp;
 
@@ -124,6 +124,10 @@ static int xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
 	return insn - insn_buf;
 }
 
+/* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or
+ * by local_bh_disable() (from XDP calls inside NAPI). The
+ * rcu_read_lock_bh_held() below makes lockdep accept both.
+ */
 static void *__xsk_map_lookup_elem(struct bpf_map *map, u32 key)
 {
 	struct xsk_map *m = container_of(map, struct xsk_map, map);
@@ -131,12 +135,11 @@ static void *__xsk_map_lookup_elem(struct bpf_map *map, u32 key)
 	if (key >= map->max_entries)
 		return NULL;
 
-	return READ_ONCE(m->xsk_map[key]);
+	return rcu_dereference_check(m->xsk_map[key], rcu_read_lock_bh_held());
 }
 
 static void *xsk_map_lookup_elem(struct bpf_map *map, void *key)
 {
-	WARN_ON_ONCE(!rcu_read_lock_held());
 	return __xsk_map_lookup_elem(map, *(u32 *)key);
 }
 
@@ -149,7 +152,8 @@ static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value,
 			       u64 map_flags)
 {
 	struct xsk_map *m = container_of(map, struct xsk_map, map);
-	struct xdp_sock *xs, *old_xs, **map_entry;
+	struct xdp_sock __rcu **map_entry;
+	struct xdp_sock *xs, *old_xs;
 	u32 i = *(u32 *)key, fd = *(u32 *)value;
 	struct xsk_map_node *node;
 	struct socket *sock;
@@ -179,7 +183,7 @@ static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value,
 	}
 
 	spin_lock_bh(&m->lock);
-	old_xs = READ_ONCE(*map_entry);
+	old_xs = rcu_dereference_protected(*map_entry, lockdep_is_held(&m->lock));
 	if (old_xs == xs) {
 		err = 0;
 		goto out;
@@ -191,7 +195,7 @@ static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value,
 		goto out;
 	}
 	xsk_map_sock_add(xs, node);
-	WRITE_ONCE(*map_entry, xs);
+	rcu_assign_pointer(*map_entry, xs);
 	if (old_xs)
 		xsk_map_sock_delete(old_xs, map_entry);
 	spin_unlock_bh(&m->lock);
@@ -208,7 +212,8 @@ out:
 static int xsk_map_delete_elem(struct bpf_map *map, void *key)
 {
 	struct xsk_map *m = container_of(map, struct xsk_map, map);
-	struct xdp_sock *old_xs, **map_entry;
+	struct xdp_sock __rcu **map_entry;
+	struct xdp_sock *old_xs;
 	int k = *(u32 *)key;
 
 	if (k >= map->max_entries)
@@ -216,7 +221,7 @@ static int xsk_map_delete_elem(struct bpf_map *map, void *key)
 
 	spin_lock_bh(&m->lock);
 	map_entry = &m->xsk_map[k];
-	old_xs = xchg(map_entry, NULL);
+	old_xs = unrcu_pointer(xchg(map_entry, NULL));
 	if (old_xs)
 		xsk_map_sock_delete(old_xs, map_entry);
 	spin_unlock_bh(&m->lock);
@@ -231,11 +236,11 @@ static int xsk_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
 }
 
 void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs,
-			     struct xdp_sock **map_entry)
+			     struct xdp_sock __rcu **map_entry)
 {
 	spin_lock_bh(&map->lock);
-	if (READ_ONCE(*map_entry) == xs) {
-		WRITE_ONCE(*map_entry, NULL);
+	if (rcu_access_pointer(*map_entry) == xs) {
+		rcu_assign_pointer(*map_entry, NULL);
 		xsk_map_sock_delete(xs, map_entry);
 	}
 	spin_unlock_bh(&map->lock);
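
For context, the redirect flow documented in the new filter.c comment is driven from an XDP program like the minimal sketch below, which steers frames into an XSKMAP of the kind whose entries this series annotates with __rcu. This is illustrative only and not part of the patch: the map name, section name, entry count, and keying by RX queue index are assumptions; only bpf_redirect_map(), BPF_MAP_TYPE_XSKMAP, and the xdp_md fields come from the existing kernel/libbpf API.

/* SPDX-License-Identifier: GPL-2.0 */
/* Illustrative sketch, not part of this patch. Step 1 of the documented flow
 * (bpf_redirect_map() recording the target in the per-CPU bpf_redirect_info)
 * happens here; steps 2 and 3 (xdp_do_redirect()/xdp_do_flush()) run in the
 * driver's NAPI poll loop, which is also what provides the RCU protection
 * described in the comments above.
 */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_XSKMAP);	/* entries are the __rcu-annotated xdp_sock pointers */
	__uint(max_entries, 64);		/* assumed size */
	__type(key, __u32);
	__type(value, __u32);
} xsks_map SEC(".maps");			/* hypothetical map name */

SEC("xdp")
int xdp_sock_redirect(struct xdp_md *ctx)
{
	/* Key the map by RX queue; fall back to XDP_PASS if the slot is empty. */
	return bpf_redirect_map(&xsks_map, ctx->rx_queue_index, XDP_PASS);
}

char _license[] SEC("license") = "GPL";

A userspace loader would then insert AF_XDP socket fds into xsks_map at the matching queue indices, which is exactly the xsk_map_update_elem() path that the hunks above convert to rcu_assign_pointer()/rcu_dereference_protected().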