From 887596095ec2a9ea39ffcf98f27bf2e77c5eb512 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 23 Feb 2021 10:49:26 -0800 Subject: bpf: Clean up sockmap related Kconfigs As suggested by John, clean up sockmap related Kconfigs: Reduce the scope of CONFIG_BPF_STREAM_PARSER down to TCP stream parser, to reflect its name. Make the rest sockmap code simply depend on CONFIG_BPF_SYSCALL and CONFIG_INET, the latter is still needed at this point because of TCP/UDP proto update. And leave CONFIG_NET_SOCK_MSG untouched, as it is used by non-sockmap cases. Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Reviewed-by: Lorenz Bauer Acked-by: John Fastabend Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210223184934.6054-2-xiyou.wangcong@gmail.com --- include/net/tcp.h | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'include/net/tcp.h') diff --git a/include/net/tcp.h b/include/net/tcp.h index 963cd86d12dd..c00e125dcfb9 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2222,25 +2222,27 @@ void tcp_update_ulp(struct sock *sk, struct proto *p, __MODULE_INFO(alias, alias_userspace, name); \ __MODULE_INFO(alias, alias_tcp_ulp, "tcp-ulp-" name) +#ifdef CONFIG_NET_SOCK_MSG struct sk_msg; struct sk_psock; -#ifdef CONFIG_BPF_STREAM_PARSER +#ifdef CONFIG_BPF_SYSCALL struct proto *tcp_bpf_get_proto(struct sock *sk, struct sk_psock *psock); void tcp_bpf_clone(const struct sock *sk, struct sock *newsk); -#else -static inline void tcp_bpf_clone(const struct sock *sk, struct sock *newsk) -{ -} -#endif /* CONFIG_BPF_STREAM_PARSER */ +#endif /* CONFIG_BPF_SYSCALL */ -#ifdef CONFIG_NET_SOCK_MSG int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, u32 bytes, int flags); int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, int len, int flags); #endif /* CONFIG_NET_SOCK_MSG */ +#if !defined(CONFIG_BPF_SYSCALL) || !defined(CONFIG_NET_SOCK_MSG) +static inline void tcp_bpf_clone(const struct sock *sk, struct sock *newsk) +{ +} +#endif + #ifdef CONFIG_CGROUP_BPF static inline void bpf_skops_init_skb(struct bpf_sock_ops_kern *skops, struct sk_buff *skb, -- cgit v1.2.3 From 16137b09a66f2b75090f1e56a9ba0e27ef845ebc Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 23 Feb 2021 10:49:28 -0800 Subject: bpf: Compute data_end dynamically with JIT code Currently, we compute ->data_end with a compile-time constant offset of skb. But as Jakub pointed out, we can actually compute it in eBPF JIT code at run-time, so that we can competely get rid of ->data_end. This is similar to skb_shinfo(skb) computation in bpf_convert_shinfo_access(). Suggested-by: Jakub Sitnicki Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210223184934.6054-4-xiyou.wangcong@gmail.com --- include/net/tcp.h | 6 ------ net/core/filter.c | 48 ++++++++++++++++++++++++++++-------------------- net/core/skmsg.c | 1 - 3 files changed, 28 insertions(+), 27 deletions(-) (limited to 'include/net/tcp.h') diff --git a/include/net/tcp.h b/include/net/tcp.h index c00e125dcfb9..947ef5da6867 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -886,18 +886,12 @@ struct tcp_skb_cb { struct { __u32 flags; struct sock *sk_redir; - void *data_end; } bpf; }; }; #define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0])) -static inline void bpf_compute_data_end_sk_skb(struct sk_buff *skb) -{ - TCP_SKB_CB(skb)->bpf.data_end = skb->data + skb_headlen(skb); -} - static inline bool tcp_skb_bpf_ingress(const struct sk_buff *skb) { return TCP_SKB_CB(skb)->bpf.flags & BPF_F_INGRESS; diff --git a/net/core/filter.c b/net/core/filter.c index adfdad234674..13bcf248ee7b 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1863,10 +1863,7 @@ static const struct bpf_func_proto bpf_sk_fullsock_proto = { static inline int sk_skb_try_make_writable(struct sk_buff *skb, unsigned int write_len) { - int err = __bpf_try_make_writable(skb, write_len); - - bpf_compute_data_end_sk_skb(skb); - return err; + return __bpf_try_make_writable(skb, write_len); } BPF_CALL_2(sk_skb_pull_data, struct sk_buff *, skb, u32, len) @@ -3577,7 +3574,6 @@ BPF_CALL_4(sk_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, return -ENOMEM; __skb_pull(skb, len_diff_abs); } - bpf_compute_data_end_sk_skb(skb); if (tls_sw_has_ctx_rx(skb->sk)) { struct strp_msg *rxm = strp_msg(skb); @@ -3742,10 +3738,7 @@ static const struct bpf_func_proto bpf_skb_change_tail_proto = { BPF_CALL_3(sk_skb_change_tail, struct sk_buff *, skb, u32, new_len, u64, flags) { - int ret = __bpf_skb_change_tail(skb, new_len, flags); - - bpf_compute_data_end_sk_skb(skb); - return ret; + return __bpf_skb_change_tail(skb, new_len, flags); } static const struct bpf_func_proto sk_skb_change_tail_proto = { @@ -3808,10 +3801,7 @@ static const struct bpf_func_proto bpf_skb_change_head_proto = { BPF_CALL_3(sk_skb_change_head, struct sk_buff *, skb, u32, head_room, u64, flags) { - int ret = __bpf_skb_change_head(skb, head_room, flags); - - bpf_compute_data_end_sk_skb(skb); - return ret; + return __bpf_skb_change_head(skb, head_room, flags); } static const struct bpf_func_proto sk_skb_change_head_proto = { @@ -9655,22 +9645,40 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, return insn - insn_buf; } +/* data_end = skb->data + skb_headlen() */ +static struct bpf_insn *bpf_convert_data_end_access(const struct bpf_insn *si, + struct bpf_insn *insn) +{ + /* si->dst_reg = skb->data */ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), + si->dst_reg, si->src_reg, + offsetof(struct sk_buff, data)); + /* AX = skb->len */ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, len), + BPF_REG_AX, si->src_reg, + offsetof(struct sk_buff, len)); + /* si->dst_reg = skb->data + skb->len */ + *insn++ = BPF_ALU64_REG(BPF_ADD, si->dst_reg, BPF_REG_AX); + /* AX = skb->data_len */ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data_len), + BPF_REG_AX, si->src_reg, + offsetof(struct sk_buff, data_len)); + /* si->dst_reg = skb->data + skb->len - skb->data_len */ + *insn++ = BPF_ALU64_REG(BPF_SUB, si->dst_reg, BPF_REG_AX); + + return insn; +} + static u32 sk_skb_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; - int off; switch (si->off) { case offsetof(struct __sk_buff, data_end): - off = si->off; - off -= offsetof(struct __sk_buff, data_end); - off += offsetof(struct sk_buff, cb); - off += offsetof(struct tcp_skb_cb, bpf.data_end); - *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, - si->src_reg, off); + insn = bpf_convert_data_end_access(si, insn); break; default: return bpf_convert_ctx_access(type, si, insn_buf, prog, diff --git a/net/core/skmsg.c b/net/core/skmsg.c index d00c9a4b47e7..8822001ab3dc 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -746,7 +746,6 @@ EXPORT_SYMBOL_GPL(sk_psock_msg_verdict); static int sk_psock_bpf_run(struct sk_psock *psock, struct bpf_prog *prog, struct sk_buff *skb) { - bpf_compute_data_end_sk_skb(skb); return bpf_prog_run_pin_on_cpu(prog, skb); } -- cgit v1.2.3 From e3526bb92a2084cdaec6cb2855bcec98b280426c Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 23 Feb 2021 10:49:29 -0800 Subject: skmsg: Move sk_redir from TCP_SKB_CB to skb Currently TCP_SKB_CB() is hard-coded in skmsg code, it certainly does not work for any other non-TCP protocols. We can move them to skb ext, but it introduces a memory allocation on fast path. Fortunately, we only need to a word-size to store all the information, because the flags actually only contains 1 bit so can be just packed into the lowest bit of the "pointer", which is stored as unsigned long. Inside struct sk_buff, '_skb_refdst' can be reused because skb dst is no longer needed after ->sk_data_ready() so we can just drop it. Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210223184934.6054-5-xiyou.wangcong@gmail.com --- include/linux/skbuff.h | 3 +++ include/linux/skmsg.h | 38 ++++++++++++++++++++++++++++++++++++++ include/net/tcp.h | 19 ------------------- net/core/skmsg.c | 31 +++++++++++++++++++------------ net/core/sock_map.c | 8 ++------ 5 files changed, 62 insertions(+), 37 deletions(-) (limited to 'include/net/tcp.h') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 6d0a33d1c0db..bd84f799c952 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -755,6 +755,9 @@ struct sk_buff { void (*destructor)(struct sk_buff *skb); }; struct list_head tcp_tsorted_anchor; +#ifdef CONFIG_NET_SOCK_MSG + unsigned long _sk_redir; +#endif }; #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 22e26f82de33..e0de45527bb6 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -455,4 +455,42 @@ static inline bool sk_psock_strp_enabled(struct sk_psock *psock) return false; return !!psock->saved_data_ready; } + +#if IS_ENABLED(CONFIG_NET_SOCK_MSG) + +/* We only have one bit so far. */ +#define BPF_F_PTR_MASK ~(BPF_F_INGRESS) + +static inline bool skb_bpf_ingress(const struct sk_buff *skb) +{ + unsigned long sk_redir = skb->_sk_redir; + + return sk_redir & BPF_F_INGRESS; +} + +static inline void skb_bpf_set_ingress(struct sk_buff *skb) +{ + skb->_sk_redir |= BPF_F_INGRESS; +} + +static inline void skb_bpf_set_redir(struct sk_buff *skb, struct sock *sk_redir, + bool ingress) +{ + skb->_sk_redir = (unsigned long)sk_redir; + if (ingress) + skb->_sk_redir |= BPF_F_INGRESS; +} + +static inline struct sock *skb_bpf_redirect_fetch(const struct sk_buff *skb) +{ + unsigned long sk_redir = skb->_sk_redir; + + return (struct sock *)(sk_redir & BPF_F_PTR_MASK); +} + +static inline void skb_bpf_redirect_clear(struct sk_buff *skb) +{ + skb->_sk_redir = 0; +} +#endif /* CONFIG_NET_SOCK_MSG */ #endif /* _LINUX_SKMSG_H */ diff --git a/include/net/tcp.h b/include/net/tcp.h index 947ef5da6867..075de26f449d 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -883,30 +883,11 @@ struct tcp_skb_cb { struct inet6_skb_parm h6; #endif } header; /* For incoming skbs */ - struct { - __u32 flags; - struct sock *sk_redir; - } bpf; }; }; #define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0])) -static inline bool tcp_skb_bpf_ingress(const struct sk_buff *skb) -{ - return TCP_SKB_CB(skb)->bpf.flags & BPF_F_INGRESS; -} - -static inline struct sock *tcp_skb_bpf_redirect_fetch(struct sk_buff *skb) -{ - return TCP_SKB_CB(skb)->bpf.sk_redir; -} - -static inline void tcp_skb_bpf_redirect_clear(struct sk_buff *skb) -{ - TCP_SKB_CB(skb)->bpf.sk_redir = NULL; -} - extern const struct inet_connection_sock_af_ops ipv4_specific; #if IS_ENABLED(CONFIG_IPV6) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 8822001ab3dc..409258367bea 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -525,7 +525,8 @@ static void sk_psock_backlog(struct work_struct *work) len = skb->len; off = 0; start: - ingress = tcp_skb_bpf_ingress(skb); + ingress = skb_bpf_ingress(skb); + skb_bpf_redirect_clear(skb); do { ret = -EIO; if (likely(psock->sk->sk_socket)) @@ -631,7 +632,12 @@ void __sk_psock_purge_ingress_msg(struct sk_psock *psock) static void sk_psock_zap_ingress(struct sk_psock *psock) { - __skb_queue_purge(&psock->ingress_skb); + struct sk_buff *skb; + + while ((skb = __skb_dequeue(&psock->ingress_skb)) != NULL) { + skb_bpf_redirect_clear(skb); + kfree_skb(skb); + } __sk_psock_purge_ingress_msg(psock); } @@ -754,7 +760,7 @@ static void sk_psock_skb_redirect(struct sk_buff *skb) struct sk_psock *psock_other; struct sock *sk_other; - sk_other = tcp_skb_bpf_redirect_fetch(skb); + sk_other = skb_bpf_redirect_fetch(skb); /* This error is a buggy BPF program, it returned a redirect * return code, but then didn't set a redirect interface. */ @@ -804,9 +810,10 @@ int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb) * TLS context. */ skb->sk = psock->sk; - tcp_skb_bpf_redirect_clear(skb); + skb_dst_drop(skb); + skb_bpf_redirect_clear(skb); ret = sk_psock_bpf_run(psock, prog, skb); - ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb)); + ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); skb->sk = NULL; } sk_psock_tls_verdict_apply(skb, psock->sk, ret); @@ -818,7 +825,6 @@ EXPORT_SYMBOL_GPL(sk_psock_tls_strp_read); static void sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb, int verdict) { - struct tcp_skb_cb *tcp; struct sock *sk_other; int err = -EIO; @@ -830,8 +836,7 @@ static void sk_psock_verdict_apply(struct sk_psock *psock, goto out_free; } - tcp = TCP_SKB_CB(skb); - tcp->bpf.flags |= BPF_F_INGRESS; + skb_bpf_set_ingress(skb); /* If the queue is empty then we can submit directly * into the msg queue. If its not empty we have to @@ -892,9 +897,10 @@ static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb) skb_set_owner_r(skb, sk); prog = READ_ONCE(psock->progs.skb_verdict); if (likely(prog)) { - tcp_skb_bpf_redirect_clear(skb); + skb_dst_drop(skb); + skb_bpf_redirect_clear(skb); ret = sk_psock_bpf_run(psock, prog, skb); - ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb)); + ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); } sk_psock_verdict_apply(psock, skb, ret); out: @@ -1011,9 +1017,10 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb, skb_set_owner_r(skb, sk); prog = READ_ONCE(psock->progs.skb_verdict); if (likely(prog)) { - tcp_skb_bpf_redirect_clear(skb); + skb_dst_drop(skb); + skb_bpf_redirect_clear(skb); ret = sk_psock_bpf_run(psock, prog, skb); - ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb)); + ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); } sk_psock_verdict_apply(psock, skb, ret); out: diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 1a28a5c2c61e..dbfcd7006338 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -657,7 +657,6 @@ const struct bpf_func_proto bpf_sock_map_update_proto = { BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, struct bpf_map *, map, u32, key, u64, flags) { - struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); struct sock *sk; if (unlikely(flags & ~(BPF_F_INGRESS))) @@ -667,8 +666,7 @@ BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, if (unlikely(!sk || !sock_map_redirect_allowed(sk))) return SK_DROP; - tcb->bpf.flags = flags; - tcb->bpf.sk_redir = sk; + skb_bpf_set_redir(skb, sk, flags & BPF_F_INGRESS); return SK_PASS; } @@ -1250,7 +1248,6 @@ const struct bpf_func_proto bpf_sock_hash_update_proto = { BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb, struct bpf_map *, map, void *, key, u64, flags) { - struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); struct sock *sk; if (unlikely(flags & ~(BPF_F_INGRESS))) @@ -1260,8 +1257,7 @@ BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb, if (unlikely(!sk || !sock_map_redirect_allowed(sk))) return SK_DROP; - tcb->bpf.flags = flags; - tcb->bpf.sk_redir = sk; + skb_bpf_set_redir(skb, sk, flags & BPF_F_INGRESS); return SK_PASS; } -- cgit v1.2.3