From 887596095ec2a9ea39ffcf98f27bf2e77c5eb512 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 23 Feb 2021 10:49:26 -0800 Subject: bpf: Clean up sockmap related Kconfigs As suggested by John, clean up sockmap related Kconfigs: Reduce the scope of CONFIG_BPF_STREAM_PARSER down to TCP stream parser, to reflect its name. Make the rest sockmap code simply depend on CONFIG_BPF_SYSCALL and CONFIG_INET, the latter is still needed at this point because of TCP/UDP proto update. And leave CONFIG_NET_SOCK_MSG untouched, as it is used by non-sockmap cases. Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Reviewed-by: Lorenz Bauer Acked-by: John Fastabend Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210223184934.6054-2-xiyou.wangcong@gmail.com --- net/core/skmsg.c | 145 ++++++++++++++++++++++++++++++------------------------- 1 file changed, 79 insertions(+), 66 deletions(-) (limited to 'net/core/skmsg.c') diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 1261512d6807..e017744111e1 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -645,15 +645,15 @@ static void sk_psock_link_destroy(struct sk_psock *psock) } } +static void sk_psock_done_strp(struct sk_psock *psock); + static void sk_psock_destroy_deferred(struct work_struct *gc) { struct sk_psock *psock = container_of(gc, struct sk_psock, gc); /* No sk_callback_lock since already detached. */ - /* Parser has been stopped */ - if (psock->progs.skb_parser) - strp_done(&psock->parser.strp); + sk_psock_done_strp(psock); cancel_work_sync(&psock->work); @@ -750,14 +750,6 @@ static int sk_psock_bpf_run(struct sk_psock *psock, struct bpf_prog *prog, return bpf_prog_run_pin_on_cpu(prog, skb); } -static struct sk_psock *sk_psock_from_strp(struct strparser *strp) -{ - struct sk_psock_parser *parser; - - parser = container_of(strp, struct sk_psock_parser, strp); - return container_of(parser, struct sk_psock, parser); -} - static void sk_psock_skb_redirect(struct sk_buff *skb) { struct sk_psock *psock_other; @@ -866,6 +858,24 @@ out_free: } } +static void sk_psock_write_space(struct sock *sk) +{ + struct sk_psock *psock; + void (*write_space)(struct sock *sk) = NULL; + + rcu_read_lock(); + psock = sk_psock(sk); + if (likely(psock)) { + if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) + schedule_work(&psock->work); + write_space = psock->saved_write_space; + } + rcu_read_unlock(); + if (write_space) + write_space(sk); +} + +#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb) { struct sk_psock *psock; @@ -897,6 +907,14 @@ static int sk_psock_strp_read_done(struct strparser *strp, int err) return err; } +static struct sk_psock *sk_psock_from_strp(struct strparser *strp) +{ + struct sk_psock_parser *parser; + + parser = container_of(strp, struct sk_psock_parser, strp); + return container_of(parser, struct sk_psock, parser); +} + static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb) { struct sk_psock *psock = sk_psock_from_strp(strp); @@ -933,6 +951,56 @@ static void sk_psock_strp_data_ready(struct sock *sk) rcu_read_unlock(); } +int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock) +{ + static const struct strp_callbacks cb = { + .rcv_msg = sk_psock_strp_read, + .read_sock_done = sk_psock_strp_read_done, + .parse_msg = sk_psock_strp_parse, + }; + + psock->parser.enabled = false; + return strp_init(&psock->parser.strp, sk, &cb); +} + +void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock) +{ + struct sk_psock_parser *parser = &psock->parser; + + if (parser->enabled) + return; + + parser->saved_data_ready = sk->sk_data_ready; + sk->sk_data_ready = sk_psock_strp_data_ready; + sk->sk_write_space = sk_psock_write_space; + parser->enabled = true; +} + +void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock) +{ + struct sk_psock_parser *parser = &psock->parser; + + if (!parser->enabled) + return; + + sk->sk_data_ready = parser->saved_data_ready; + parser->saved_data_ready = NULL; + strp_stop(&parser->strp); + parser->enabled = false; +} + +static void sk_psock_done_strp(struct sk_psock *psock) +{ + /* Parser has been stopped */ + if (psock->progs.skb_parser) + strp_done(&psock->parser.strp); +} +#else +static void sk_psock_done_strp(struct sk_psock *psock) +{ +} +#endif /* CONFIG_BPF_STREAM_PARSER */ + static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb, unsigned int offset, size_t orig_len) { @@ -984,35 +1052,6 @@ static void sk_psock_verdict_data_ready(struct sock *sk) sock->ops->read_sock(sk, &desc, sk_psock_verdict_recv); } -static void sk_psock_write_space(struct sock *sk) -{ - struct sk_psock *psock; - void (*write_space)(struct sock *sk) = NULL; - - rcu_read_lock(); - psock = sk_psock(sk); - if (likely(psock)) { - if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) - schedule_work(&psock->work); - write_space = psock->saved_write_space; - } - rcu_read_unlock(); - if (write_space) - write_space(sk); -} - -int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock) -{ - static const struct strp_callbacks cb = { - .rcv_msg = sk_psock_strp_read, - .read_sock_done = sk_psock_strp_read_done, - .parse_msg = sk_psock_strp_parse, - }; - - psock->parser.enabled = false; - return strp_init(&psock->parser.strp, sk, &cb); -} - void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock) { struct sk_psock_parser *parser = &psock->parser; @@ -1026,32 +1065,6 @@ void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock) parser->enabled = true; } -void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock) -{ - struct sk_psock_parser *parser = &psock->parser; - - if (parser->enabled) - return; - - parser->saved_data_ready = sk->sk_data_ready; - sk->sk_data_ready = sk_psock_strp_data_ready; - sk->sk_write_space = sk_psock_write_space; - parser->enabled = true; -} - -void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock) -{ - struct sk_psock_parser *parser = &psock->parser; - - if (!parser->enabled) - return; - - sk->sk_data_ready = parser->saved_data_ready; - parser->saved_data_ready = NULL; - strp_stop(&parser->strp); - parser->enabled = false; -} - void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock) { struct sk_psock_parser *parser = &psock->parser; -- cgit v1.2.3 From 5a685cd94b21a88efa6be77169eddef525368034 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 23 Feb 2021 10:49:27 -0800 Subject: skmsg: Get rid of struct sk_psock_parser struct sk_psock_parser is embedded in sk_psock, it is unnecessary as skb verdict also uses ->saved_data_ready. We can simply fold these fields into sk_psock, and get rid of ->enabled. Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210223184934.6054-3-xiyou.wangcong@gmail.com --- include/linux/skmsg.h | 19 +++++++----------- net/core/skmsg.c | 53 ++++++++++++++++----------------------------------- net/core/sock_map.c | 8 ++++---- 3 files changed, 27 insertions(+), 53 deletions(-) (limited to 'net/core/skmsg.c') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index db7a08be4725..22e26f82de33 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -70,14 +70,6 @@ struct sk_psock_link { void *link_raw; }; -struct sk_psock_parser { -#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) - struct strparser strp; -#endif - bool enabled; - void (*saved_data_ready)(struct sock *sk); -}; - struct sk_psock_work_state { struct sk_buff *skb; u32 len; @@ -92,7 +84,9 @@ struct sk_psock { u32 eval; struct sk_msg *cork; struct sk_psock_progs progs; - struct sk_psock_parser parser; +#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) + struct strparser strp; +#endif struct sk_buff_head ingress_skb; struct list_head ingress_msg; unsigned long state; @@ -102,6 +96,7 @@ struct sk_psock { void (*saved_unhash)(struct sock *sk); void (*saved_close)(struct sock *sk, long timeout); void (*saved_write_space)(struct sock *sk); + void (*saved_data_ready)(struct sock *sk); struct proto *sk_proto; struct sk_psock_work_state work_state; struct work_struct work; @@ -418,8 +413,8 @@ static inline void sk_psock_put(struct sock *sk, struct sk_psock *psock) static inline void sk_psock_data_ready(struct sock *sk, struct sk_psock *psock) { - if (psock->parser.enabled) - psock->parser.saved_data_ready(sk); + if (psock->saved_data_ready) + psock->saved_data_ready(sk); else sk->sk_data_ready(sk); } @@ -458,6 +453,6 @@ static inline bool sk_psock_strp_enabled(struct sk_psock *psock) { if (!psock) return false; - return psock->parser.enabled; + return !!psock->saved_data_ready; } #endif /* _LINUX_SKMSG_H */ diff --git a/net/core/skmsg.c b/net/core/skmsg.c index e017744111e1..d00c9a4b47e7 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -907,17 +907,9 @@ static int sk_psock_strp_read_done(struct strparser *strp, int err) return err; } -static struct sk_psock *sk_psock_from_strp(struct strparser *strp) -{ - struct sk_psock_parser *parser; - - parser = container_of(strp, struct sk_psock_parser, strp); - return container_of(parser, struct sk_psock, parser); -} - static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb) { - struct sk_psock *psock = sk_psock_from_strp(strp); + struct sk_psock *psock = container_of(strp, struct sk_psock, strp); struct bpf_prog *prog; int ret = skb->len; @@ -941,10 +933,10 @@ static void sk_psock_strp_data_ready(struct sock *sk) psock = sk_psock(sk); if (likely(psock)) { if (tls_sw_has_ctx_rx(sk)) { - psock->parser.saved_data_ready(sk); + psock->saved_data_ready(sk); } else { write_lock_bh(&sk->sk_callback_lock); - strp_data_ready(&psock->parser.strp); + strp_data_ready(&psock->strp); write_unlock_bh(&sk->sk_callback_lock); } } @@ -959,41 +951,34 @@ int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock) .parse_msg = sk_psock_strp_parse, }; - psock->parser.enabled = false; - return strp_init(&psock->parser.strp, sk, &cb); + return strp_init(&psock->strp, sk, &cb); } void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock) { - struct sk_psock_parser *parser = &psock->parser; - - if (parser->enabled) + if (psock->saved_data_ready) return; - parser->saved_data_ready = sk->sk_data_ready; + psock->saved_data_ready = sk->sk_data_ready; sk->sk_data_ready = sk_psock_strp_data_ready; sk->sk_write_space = sk_psock_write_space; - parser->enabled = true; } void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock) { - struct sk_psock_parser *parser = &psock->parser; - - if (!parser->enabled) + if (!psock->saved_data_ready) return; - sk->sk_data_ready = parser->saved_data_ready; - parser->saved_data_ready = NULL; - strp_stop(&parser->strp); - parser->enabled = false; + sk->sk_data_ready = psock->saved_data_ready; + psock->saved_data_ready = NULL; + strp_stop(&psock->strp); } static void sk_psock_done_strp(struct sk_psock *psock) { /* Parser has been stopped */ if (psock->progs.skb_parser) - strp_done(&psock->parser.strp); + strp_done(&psock->strp); } #else static void sk_psock_done_strp(struct sk_psock *psock) @@ -1054,25 +1039,19 @@ static void sk_psock_verdict_data_ready(struct sock *sk) void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock) { - struct sk_psock_parser *parser = &psock->parser; - - if (parser->enabled) + if (psock->saved_data_ready) return; - parser->saved_data_ready = sk->sk_data_ready; + psock->saved_data_ready = sk->sk_data_ready; sk->sk_data_ready = sk_psock_verdict_data_ready; sk->sk_write_space = sk_psock_write_space; - parser->enabled = true; } void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock) { - struct sk_psock_parser *parser = &psock->parser; - - if (!parser->enabled) + if (!psock->saved_data_ready) return; - sk->sk_data_ready = parser->saved_data_ready; - parser->saved_data_ready = NULL; - parser->enabled = false; + sk->sk_data_ready = psock->saved_data_ready; + psock->saved_data_ready = NULL; } diff --git a/net/core/sock_map.c b/net/core/sock_map.c index ee3334dd3a38..1a28a5c2c61e 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -148,9 +148,9 @@ static void sock_map_del_link(struct sock *sk, struct bpf_map *map = link->map; struct bpf_stab *stab = container_of(map, struct bpf_stab, map); - if (psock->parser.enabled && stab->progs.skb_parser) + if (psock->saved_data_ready && stab->progs.skb_parser) strp_stop = true; - if (psock->parser.enabled && stab->progs.skb_verdict) + if (psock->saved_data_ready && stab->progs.skb_verdict) verdict_stop = true; list_del(&link->list); sk_psock_free_link(link); @@ -283,14 +283,14 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, goto out_drop; write_lock_bh(&sk->sk_callback_lock); - if (skb_parser && skb_verdict && !psock->parser.enabled) { + if (skb_parser && skb_verdict && !psock->saved_data_ready) { ret = sk_psock_init_strp(sk, psock); if (ret) goto out_unlock_drop; psock_set_prog(&psock->progs.skb_verdict, skb_verdict); psock_set_prog(&psock->progs.skb_parser, skb_parser); sk_psock_start_strp(sk, psock); - } else if (!skb_parser && skb_verdict && !psock->parser.enabled) { + } else if (!skb_parser && skb_verdict && !psock->saved_data_ready) { psock_set_prog(&psock->progs.skb_verdict, skb_verdict); sk_psock_start_verdict(sk,psock); } -- cgit v1.2.3 From 16137b09a66f2b75090f1e56a9ba0e27ef845ebc Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 23 Feb 2021 10:49:28 -0800 Subject: bpf: Compute data_end dynamically with JIT code Currently, we compute ->data_end with a compile-time constant offset of skb. But as Jakub pointed out, we can actually compute it in eBPF JIT code at run-time, so that we can competely get rid of ->data_end. This is similar to skb_shinfo(skb) computation in bpf_convert_shinfo_access(). Suggested-by: Jakub Sitnicki Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210223184934.6054-4-xiyou.wangcong@gmail.com --- include/net/tcp.h | 6 ------ net/core/filter.c | 48 ++++++++++++++++++++++++++++-------------------- net/core/skmsg.c | 1 - 3 files changed, 28 insertions(+), 27 deletions(-) (limited to 'net/core/skmsg.c') diff --git a/include/net/tcp.h b/include/net/tcp.h index c00e125dcfb9..947ef5da6867 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -886,18 +886,12 @@ struct tcp_skb_cb { struct { __u32 flags; struct sock *sk_redir; - void *data_end; } bpf; }; }; #define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0])) -static inline void bpf_compute_data_end_sk_skb(struct sk_buff *skb) -{ - TCP_SKB_CB(skb)->bpf.data_end = skb->data + skb_headlen(skb); -} - static inline bool tcp_skb_bpf_ingress(const struct sk_buff *skb) { return TCP_SKB_CB(skb)->bpf.flags & BPF_F_INGRESS; diff --git a/net/core/filter.c b/net/core/filter.c index adfdad234674..13bcf248ee7b 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1863,10 +1863,7 @@ static const struct bpf_func_proto bpf_sk_fullsock_proto = { static inline int sk_skb_try_make_writable(struct sk_buff *skb, unsigned int write_len) { - int err = __bpf_try_make_writable(skb, write_len); - - bpf_compute_data_end_sk_skb(skb); - return err; + return __bpf_try_make_writable(skb, write_len); } BPF_CALL_2(sk_skb_pull_data, struct sk_buff *, skb, u32, len) @@ -3577,7 +3574,6 @@ BPF_CALL_4(sk_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, return -ENOMEM; __skb_pull(skb, len_diff_abs); } - bpf_compute_data_end_sk_skb(skb); if (tls_sw_has_ctx_rx(skb->sk)) { struct strp_msg *rxm = strp_msg(skb); @@ -3742,10 +3738,7 @@ static const struct bpf_func_proto bpf_skb_change_tail_proto = { BPF_CALL_3(sk_skb_change_tail, struct sk_buff *, skb, u32, new_len, u64, flags) { - int ret = __bpf_skb_change_tail(skb, new_len, flags); - - bpf_compute_data_end_sk_skb(skb); - return ret; + return __bpf_skb_change_tail(skb, new_len, flags); } static const struct bpf_func_proto sk_skb_change_tail_proto = { @@ -3808,10 +3801,7 @@ static const struct bpf_func_proto bpf_skb_change_head_proto = { BPF_CALL_3(sk_skb_change_head, struct sk_buff *, skb, u32, head_room, u64, flags) { - int ret = __bpf_skb_change_head(skb, head_room, flags); - - bpf_compute_data_end_sk_skb(skb); - return ret; + return __bpf_skb_change_head(skb, head_room, flags); } static const struct bpf_func_proto sk_skb_change_head_proto = { @@ -9655,22 +9645,40 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, return insn - insn_buf; } +/* data_end = skb->data + skb_headlen() */ +static struct bpf_insn *bpf_convert_data_end_access(const struct bpf_insn *si, + struct bpf_insn *insn) +{ + /* si->dst_reg = skb->data */ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), + si->dst_reg, si->src_reg, + offsetof(struct sk_buff, data)); + /* AX = skb->len */ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, len), + BPF_REG_AX, si->src_reg, + offsetof(struct sk_buff, len)); + /* si->dst_reg = skb->data + skb->len */ + *insn++ = BPF_ALU64_REG(BPF_ADD, si->dst_reg, BPF_REG_AX); + /* AX = skb->data_len */ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data_len), + BPF_REG_AX, si->src_reg, + offsetof(struct sk_buff, data_len)); + /* si->dst_reg = skb->data + skb->len - skb->data_len */ + *insn++ = BPF_ALU64_REG(BPF_SUB, si->dst_reg, BPF_REG_AX); + + return insn; +} + static u32 sk_skb_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; - int off; switch (si->off) { case offsetof(struct __sk_buff, data_end): - off = si->off; - off -= offsetof(struct __sk_buff, data_end); - off += offsetof(struct sk_buff, cb); - off += offsetof(struct tcp_skb_cb, bpf.data_end); - *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, - si->src_reg, off); + insn = bpf_convert_data_end_access(si, insn); break; default: return bpf_convert_ctx_access(type, si, insn_buf, prog, diff --git a/net/core/skmsg.c b/net/core/skmsg.c index d00c9a4b47e7..8822001ab3dc 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -746,7 +746,6 @@ EXPORT_SYMBOL_GPL(sk_psock_msg_verdict); static int sk_psock_bpf_run(struct sk_psock *psock, struct bpf_prog *prog, struct sk_buff *skb) { - bpf_compute_data_end_sk_skb(skb); return bpf_prog_run_pin_on_cpu(prog, skb); } -- cgit v1.2.3 From e3526bb92a2084cdaec6cb2855bcec98b280426c Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 23 Feb 2021 10:49:29 -0800 Subject: skmsg: Move sk_redir from TCP_SKB_CB to skb Currently TCP_SKB_CB() is hard-coded in skmsg code, it certainly does not work for any other non-TCP protocols. We can move them to skb ext, but it introduces a memory allocation on fast path. Fortunately, we only need to a word-size to store all the information, because the flags actually only contains 1 bit so can be just packed into the lowest bit of the "pointer", which is stored as unsigned long. Inside struct sk_buff, '_skb_refdst' can be reused because skb dst is no longer needed after ->sk_data_ready() so we can just drop it. Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210223184934.6054-5-xiyou.wangcong@gmail.com --- include/linux/skbuff.h | 3 +++ include/linux/skmsg.h | 38 ++++++++++++++++++++++++++++++++++++++ include/net/tcp.h | 19 ------------------- net/core/skmsg.c | 31 +++++++++++++++++++------------ net/core/sock_map.c | 8 ++------ 5 files changed, 62 insertions(+), 37 deletions(-) (limited to 'net/core/skmsg.c') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 6d0a33d1c0db..bd84f799c952 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -755,6 +755,9 @@ struct sk_buff { void (*destructor)(struct sk_buff *skb); }; struct list_head tcp_tsorted_anchor; +#ifdef CONFIG_NET_SOCK_MSG + unsigned long _sk_redir; +#endif }; #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 22e26f82de33..e0de45527bb6 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -455,4 +455,42 @@ static inline bool sk_psock_strp_enabled(struct sk_psock *psock) return false; return !!psock->saved_data_ready; } + +#if IS_ENABLED(CONFIG_NET_SOCK_MSG) + +/* We only have one bit so far. */ +#define BPF_F_PTR_MASK ~(BPF_F_INGRESS) + +static inline bool skb_bpf_ingress(const struct sk_buff *skb) +{ + unsigned long sk_redir = skb->_sk_redir; + + return sk_redir & BPF_F_INGRESS; +} + +static inline void skb_bpf_set_ingress(struct sk_buff *skb) +{ + skb->_sk_redir |= BPF_F_INGRESS; +} + +static inline void skb_bpf_set_redir(struct sk_buff *skb, struct sock *sk_redir, + bool ingress) +{ + skb->_sk_redir = (unsigned long)sk_redir; + if (ingress) + skb->_sk_redir |= BPF_F_INGRESS; +} + +static inline struct sock *skb_bpf_redirect_fetch(const struct sk_buff *skb) +{ + unsigned long sk_redir = skb->_sk_redir; + + return (struct sock *)(sk_redir & BPF_F_PTR_MASK); +} + +static inline void skb_bpf_redirect_clear(struct sk_buff *skb) +{ + skb->_sk_redir = 0; +} +#endif /* CONFIG_NET_SOCK_MSG */ #endif /* _LINUX_SKMSG_H */ diff --git a/include/net/tcp.h b/include/net/tcp.h index 947ef5da6867..075de26f449d 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -883,30 +883,11 @@ struct tcp_skb_cb { struct inet6_skb_parm h6; #endif } header; /* For incoming skbs */ - struct { - __u32 flags; - struct sock *sk_redir; - } bpf; }; }; #define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0])) -static inline bool tcp_skb_bpf_ingress(const struct sk_buff *skb) -{ - return TCP_SKB_CB(skb)->bpf.flags & BPF_F_INGRESS; -} - -static inline struct sock *tcp_skb_bpf_redirect_fetch(struct sk_buff *skb) -{ - return TCP_SKB_CB(skb)->bpf.sk_redir; -} - -static inline void tcp_skb_bpf_redirect_clear(struct sk_buff *skb) -{ - TCP_SKB_CB(skb)->bpf.sk_redir = NULL; -} - extern const struct inet_connection_sock_af_ops ipv4_specific; #if IS_ENABLED(CONFIG_IPV6) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 8822001ab3dc..409258367bea 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -525,7 +525,8 @@ static void sk_psock_backlog(struct work_struct *work) len = skb->len; off = 0; start: - ingress = tcp_skb_bpf_ingress(skb); + ingress = skb_bpf_ingress(skb); + skb_bpf_redirect_clear(skb); do { ret = -EIO; if (likely(psock->sk->sk_socket)) @@ -631,7 +632,12 @@ void __sk_psock_purge_ingress_msg(struct sk_psock *psock) static void sk_psock_zap_ingress(struct sk_psock *psock) { - __skb_queue_purge(&psock->ingress_skb); + struct sk_buff *skb; + + while ((skb = __skb_dequeue(&psock->ingress_skb)) != NULL) { + skb_bpf_redirect_clear(skb); + kfree_skb(skb); + } __sk_psock_purge_ingress_msg(psock); } @@ -754,7 +760,7 @@ static void sk_psock_skb_redirect(struct sk_buff *skb) struct sk_psock *psock_other; struct sock *sk_other; - sk_other = tcp_skb_bpf_redirect_fetch(skb); + sk_other = skb_bpf_redirect_fetch(skb); /* This error is a buggy BPF program, it returned a redirect * return code, but then didn't set a redirect interface. */ @@ -804,9 +810,10 @@ int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb) * TLS context. */ skb->sk = psock->sk; - tcp_skb_bpf_redirect_clear(skb); + skb_dst_drop(skb); + skb_bpf_redirect_clear(skb); ret = sk_psock_bpf_run(psock, prog, skb); - ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb)); + ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); skb->sk = NULL; } sk_psock_tls_verdict_apply(skb, psock->sk, ret); @@ -818,7 +825,6 @@ EXPORT_SYMBOL_GPL(sk_psock_tls_strp_read); static void sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb, int verdict) { - struct tcp_skb_cb *tcp; struct sock *sk_other; int err = -EIO; @@ -830,8 +836,7 @@ static void sk_psock_verdict_apply(struct sk_psock *psock, goto out_free; } - tcp = TCP_SKB_CB(skb); - tcp->bpf.flags |= BPF_F_INGRESS; + skb_bpf_set_ingress(skb); /* If the queue is empty then we can submit directly * into the msg queue. If its not empty we have to @@ -892,9 +897,10 @@ static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb) skb_set_owner_r(skb, sk); prog = READ_ONCE(psock->progs.skb_verdict); if (likely(prog)) { - tcp_skb_bpf_redirect_clear(skb); + skb_dst_drop(skb); + skb_bpf_redirect_clear(skb); ret = sk_psock_bpf_run(psock, prog, skb); - ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb)); + ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); } sk_psock_verdict_apply(psock, skb, ret); out: @@ -1011,9 +1017,10 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb, skb_set_owner_r(skb, sk); prog = READ_ONCE(psock->progs.skb_verdict); if (likely(prog)) { - tcp_skb_bpf_redirect_clear(skb); + skb_dst_drop(skb); + skb_bpf_redirect_clear(skb); ret = sk_psock_bpf_run(psock, prog, skb); - ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb)); + ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); } sk_psock_verdict_apply(psock, skb, ret); out: diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 1a28a5c2c61e..dbfcd7006338 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -657,7 +657,6 @@ const struct bpf_func_proto bpf_sock_map_update_proto = { BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, struct bpf_map *, map, u32, key, u64, flags) { - struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); struct sock *sk; if (unlikely(flags & ~(BPF_F_INGRESS))) @@ -667,8 +666,7 @@ BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, if (unlikely(!sk || !sock_map_redirect_allowed(sk))) return SK_DROP; - tcb->bpf.flags = flags; - tcb->bpf.sk_redir = sk; + skb_bpf_set_redir(skb, sk, flags & BPF_F_INGRESS); return SK_PASS; } @@ -1250,7 +1248,6 @@ const struct bpf_func_proto bpf_sock_hash_update_proto = { BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb, struct bpf_map *, map, void *, key, u64, flags) { - struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); struct sock *sk; if (unlikely(flags & ~(BPF_F_INGRESS))) @@ -1260,8 +1257,7 @@ BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb, if (unlikely(!sk || !sock_map_redirect_allowed(sk))) return SK_DROP; - tcb->bpf.flags = flags; - tcb->bpf.sk_redir = sk; + skb_bpf_set_redir(skb, sk, flags & BPF_F_INGRESS); return SK_PASS; } -- cgit v1.2.3 From ae8b8332fbb512f53bf50ff6a7586dd0f90ed18a Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 23 Feb 2021 10:49:30 -0800 Subject: sock_map: Rename skb_parser and skb_verdict These two eBPF programs are tied to BPF_SK_SKB_STREAM_PARSER and BPF_SK_SKB_STREAM_VERDICT, rename them to reflect the fact they are only used for TCP. And save the name 'skb_verdict' for general use later. Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Reviewed-by: Lorenz Bauer Acked-by: John Fastabend Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210223184934.6054-6-xiyou.wangcong@gmail.com --- include/linux/skmsg.h | 8 +-- net/core/skmsg.c | 14 ++--- net/core/sock_map.c | 60 +++++++++++----------- .../selftests/bpf/prog_tests/sockmap_listen.c | 8 +-- .../selftests/bpf/progs/test_sockmap_listen.c | 4 +- 5 files changed, 47 insertions(+), 47 deletions(-) (limited to 'net/core/skmsg.c') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index e0de45527bb6..d9f6ec4a9cf2 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -56,8 +56,8 @@ struct sk_msg { struct sk_psock_progs { struct bpf_prog *msg_parser; - struct bpf_prog *skb_parser; - struct bpf_prog *skb_verdict; + struct bpf_prog *stream_parser; + struct bpf_prog *stream_verdict; }; enum sk_psock_state_bits { @@ -443,8 +443,8 @@ static inline int psock_replace_prog(struct bpf_prog **pprog, static inline void psock_progs_drop(struct sk_psock_progs *progs) { psock_set_prog(&progs->msg_parser, NULL); - psock_set_prog(&progs->skb_parser, NULL); - psock_set_prog(&progs->skb_verdict, NULL); + psock_set_prog(&progs->stream_parser, NULL); + psock_set_prog(&progs->stream_verdict, NULL); } int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb); diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 409258367bea..35f9caa3b125 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -691,9 +691,9 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock) write_lock_bh(&sk->sk_callback_lock); sk_psock_restore_proto(sk, psock); rcu_assign_sk_user_data(sk, NULL); - if (psock->progs.skb_parser) + if (psock->progs.stream_parser) sk_psock_stop_strp(sk, psock); - else if (psock->progs.skb_verdict) + else if (psock->progs.stream_verdict) sk_psock_stop_verdict(sk, psock); write_unlock_bh(&sk->sk_callback_lock); sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED); @@ -803,7 +803,7 @@ int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb) int ret = __SK_PASS; rcu_read_lock(); - prog = READ_ONCE(psock->progs.skb_verdict); + prog = READ_ONCE(psock->progs.stream_verdict); if (likely(prog)) { /* We skip full set_owner_r here because if we do a SK_PASS * or SK_DROP we can skip skb memory accounting and use the @@ -895,7 +895,7 @@ static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb) goto out; } skb_set_owner_r(skb, sk); - prog = READ_ONCE(psock->progs.skb_verdict); + prog = READ_ONCE(psock->progs.stream_verdict); if (likely(prog)) { skb_dst_drop(skb); skb_bpf_redirect_clear(skb); @@ -919,7 +919,7 @@ static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb) int ret = skb->len; rcu_read_lock(); - prog = READ_ONCE(psock->progs.skb_parser); + prog = READ_ONCE(psock->progs.stream_parser); if (likely(prog)) { skb->sk = psock->sk; ret = sk_psock_bpf_run(psock, prog, skb); @@ -982,7 +982,7 @@ void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock) static void sk_psock_done_strp(struct sk_psock *psock) { /* Parser has been stopped */ - if (psock->progs.skb_parser) + if (psock->progs.stream_parser) strp_done(&psock->strp); } #else @@ -1015,7 +1015,7 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb, goto out; } skb_set_owner_r(skb, sk); - prog = READ_ONCE(psock->progs.skb_verdict); + prog = READ_ONCE(psock->progs.stream_verdict); if (likely(prog)) { skb_dst_drop(skb); skb_bpf_redirect_clear(skb); diff --git a/net/core/sock_map.c b/net/core/sock_map.c index dbfcd7006338..69785070f02d 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -148,9 +148,9 @@ static void sock_map_del_link(struct sock *sk, struct bpf_map *map = link->map; struct bpf_stab *stab = container_of(map, struct bpf_stab, map); - if (psock->saved_data_ready && stab->progs.skb_parser) + if (psock->saved_data_ready && stab->progs.stream_parser) strp_stop = true; - if (psock->saved_data_ready && stab->progs.skb_verdict) + if (psock->saved_data_ready && stab->progs.stream_verdict) verdict_stop = true; list_del(&link->list); sk_psock_free_link(link); @@ -224,23 +224,23 @@ out: static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, struct sock *sk) { - struct bpf_prog *msg_parser, *skb_parser, *skb_verdict; + struct bpf_prog *msg_parser, *stream_parser, *stream_verdict; struct sk_psock *psock; int ret; - skb_verdict = READ_ONCE(progs->skb_verdict); - if (skb_verdict) { - skb_verdict = bpf_prog_inc_not_zero(skb_verdict); - if (IS_ERR(skb_verdict)) - return PTR_ERR(skb_verdict); + stream_verdict = READ_ONCE(progs->stream_verdict); + if (stream_verdict) { + stream_verdict = bpf_prog_inc_not_zero(stream_verdict); + if (IS_ERR(stream_verdict)) + return PTR_ERR(stream_verdict); } - skb_parser = READ_ONCE(progs->skb_parser); - if (skb_parser) { - skb_parser = bpf_prog_inc_not_zero(skb_parser); - if (IS_ERR(skb_parser)) { - ret = PTR_ERR(skb_parser); - goto out_put_skb_verdict; + stream_parser = READ_ONCE(progs->stream_parser); + if (stream_parser) { + stream_parser = bpf_prog_inc_not_zero(stream_parser); + if (IS_ERR(stream_parser)) { + ret = PTR_ERR(stream_parser); + goto out_put_stream_verdict; } } @@ -249,7 +249,7 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, msg_parser = bpf_prog_inc_not_zero(msg_parser); if (IS_ERR(msg_parser)) { ret = PTR_ERR(msg_parser); - goto out_put_skb_parser; + goto out_put_stream_parser; } } @@ -261,8 +261,8 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, if (psock) { if ((msg_parser && READ_ONCE(psock->progs.msg_parser)) || - (skb_parser && READ_ONCE(psock->progs.skb_parser)) || - (skb_verdict && READ_ONCE(psock->progs.skb_verdict))) { + (stream_parser && READ_ONCE(psock->progs.stream_parser)) || + (stream_verdict && READ_ONCE(psock->progs.stream_verdict))) { sk_psock_put(sk, psock); ret = -EBUSY; goto out_progs; @@ -283,15 +283,15 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, goto out_drop; write_lock_bh(&sk->sk_callback_lock); - if (skb_parser && skb_verdict && !psock->saved_data_ready) { + if (stream_parser && stream_verdict && !psock->saved_data_ready) { ret = sk_psock_init_strp(sk, psock); if (ret) goto out_unlock_drop; - psock_set_prog(&psock->progs.skb_verdict, skb_verdict); - psock_set_prog(&psock->progs.skb_parser, skb_parser); + psock_set_prog(&psock->progs.stream_verdict, stream_verdict); + psock_set_prog(&psock->progs.stream_parser, stream_parser); sk_psock_start_strp(sk, psock); - } else if (!skb_parser && skb_verdict && !psock->saved_data_ready) { - psock_set_prog(&psock->progs.skb_verdict, skb_verdict); + } else if (!stream_parser && stream_verdict && !psock->saved_data_ready) { + psock_set_prog(&psock->progs.stream_verdict, stream_verdict); sk_psock_start_verdict(sk,psock); } write_unlock_bh(&sk->sk_callback_lock); @@ -303,12 +303,12 @@ out_drop: out_progs: if (msg_parser) bpf_prog_put(msg_parser); -out_put_skb_parser: - if (skb_parser) - bpf_prog_put(skb_parser); -out_put_skb_verdict: - if (skb_verdict) - bpf_prog_put(skb_verdict); +out_put_stream_parser: + if (stream_parser) + bpf_prog_put(stream_parser); +out_put_stream_verdict: + if (stream_verdict) + bpf_prog_put(stream_verdict); return ret; } @@ -1459,11 +1459,11 @@ int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, break; #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) case BPF_SK_SKB_STREAM_PARSER: - pprog = &progs->skb_parser; + pprog = &progs->stream_parser; break; #endif case BPF_SK_SKB_STREAM_VERDICT: - pprog = &progs->skb_verdict; + pprog = &progs->stream_verdict; break; default: return -EOPNOTSUPP; diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c index d7d65a700799..c26e6bf05e49 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c @@ -1014,8 +1014,8 @@ static void test_skb_redir_to_connected(struct test_sockmap_listen *skel, struct bpf_map *inner_map, int family, int sotype) { - int verdict = bpf_program__fd(skel->progs.prog_skb_verdict); - int parser = bpf_program__fd(skel->progs.prog_skb_parser); + int verdict = bpf_program__fd(skel->progs.prog_stream_verdict); + int parser = bpf_program__fd(skel->progs.prog_stream_parser); int verdict_map = bpf_map__fd(skel->maps.verdict_map); int sock_map = bpf_map__fd(inner_map); int err; @@ -1125,8 +1125,8 @@ static void test_skb_redir_to_listening(struct test_sockmap_listen *skel, struct bpf_map *inner_map, int family, int sotype) { - int verdict = bpf_program__fd(skel->progs.prog_skb_verdict); - int parser = bpf_program__fd(skel->progs.prog_skb_parser); + int verdict = bpf_program__fd(skel->progs.prog_stream_verdict); + int parser = bpf_program__fd(skel->progs.prog_stream_parser); int verdict_map = bpf_map__fd(skel->maps.verdict_map); int sock_map = bpf_map__fd(inner_map); int err; diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_listen.c b/tools/testing/selftests/bpf/progs/test_sockmap_listen.c index a3a366c57ce1..fa221141e9c1 100644 --- a/tools/testing/selftests/bpf/progs/test_sockmap_listen.c +++ b/tools/testing/selftests/bpf/progs/test_sockmap_listen.c @@ -31,13 +31,13 @@ struct { static volatile bool test_sockmap; /* toggled by user-space */ SEC("sk_skb/stream_parser") -int prog_skb_parser(struct __sk_buff *skb) +int prog_stream_parser(struct __sk_buff *skb) { return skb->len; } SEC("sk_skb/stream_verdict") -int prog_skb_verdict(struct __sk_buff *skb) +int prog_stream_verdict(struct __sk_buff *skb) { unsigned int *count; __u32 zero = 0; -- cgit v1.2.3 From cd81cefb1abc52bd164f4d9760cd22eadc0e4468 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 23 Feb 2021 10:49:32 -0800 Subject: skmsg: Make __sk_psock_purge_ingress_msg() static It is only used within skmsg.c so can become static. Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210223184934.6054-8-xiyou.wangcong@gmail.com --- include/linux/skmsg.h | 2 -- net/core/skmsg.c | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'net/core/skmsg.c') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index d9f6ec4a9cf2..676d48e08159 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -340,8 +340,6 @@ static inline void sk_psock_free_link(struct sk_psock_link *link) struct sk_psock_link *sk_psock_link_pop(struct sk_psock *psock); -void __sk_psock_purge_ingress_msg(struct sk_psock *psock); - static inline void sk_psock_cork_free(struct sk_psock *psock) { if (psock->cork) { diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 35f9caa3b125..46e29d2c0c48 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -619,7 +619,7 @@ struct sk_psock_link *sk_psock_link_pop(struct sk_psock *psock) return link; } -void __sk_psock_purge_ingress_msg(struct sk_psock *psock) +static void __sk_psock_purge_ingress_msg(struct sk_psock *psock) { struct sk_msg *msg, *tmp; -- cgit v1.2.3 From 533342322276b06b4db260c413ce907238851e9b Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 23 Feb 2021 10:49:33 -0800 Subject: skmsg: Get rid of sk_psock_bpf_run() It is now nearly identical to bpf_prog_run_pin_on_cpu() and it has an unused parameter 'psock', so we can just get rid of it and call bpf_prog_run_pin_on_cpu() directly. Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210223184934.6054-9-xiyou.wangcong@gmail.com --- net/core/skmsg.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'net/core/skmsg.c') diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 46e29d2c0c48..07f54015238a 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -749,12 +749,6 @@ out: } EXPORT_SYMBOL_GPL(sk_psock_msg_verdict); -static int sk_psock_bpf_run(struct sk_psock *psock, struct bpf_prog *prog, - struct sk_buff *skb) -{ - return bpf_prog_run_pin_on_cpu(prog, skb); -} - static void sk_psock_skb_redirect(struct sk_buff *skb) { struct sk_psock *psock_other; @@ -812,7 +806,7 @@ int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb) skb->sk = psock->sk; skb_dst_drop(skb); skb_bpf_redirect_clear(skb); - ret = sk_psock_bpf_run(psock, prog, skb); + ret = bpf_prog_run_pin_on_cpu(prog, skb); ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); skb->sk = NULL; } @@ -899,7 +893,7 @@ static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb) if (likely(prog)) { skb_dst_drop(skb); skb_bpf_redirect_clear(skb); - ret = sk_psock_bpf_run(psock, prog, skb); + ret = bpf_prog_run_pin_on_cpu(prog, skb); ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); } sk_psock_verdict_apply(psock, skb, ret); @@ -922,7 +916,7 @@ static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb) prog = READ_ONCE(psock->progs.stream_parser); if (likely(prog)) { skb->sk = psock->sk; - ret = sk_psock_bpf_run(psock, prog, skb); + ret = bpf_prog_run_pin_on_cpu(prog, skb); skb->sk = NULL; } rcu_read_unlock(); @@ -1019,7 +1013,7 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb, if (likely(prog)) { skb_dst_drop(skb); skb_bpf_redirect_clear(skb); - ret = sk_psock_bpf_run(psock, prog, skb); + ret = bpf_prog_run_pin_on_cpu(prog, skb); ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); } sk_psock_verdict_apply(psock, skb, ret); -- cgit v1.2.3 From 37f0e514db660f03f8982b8f4fbbd4b2740abe7d Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 30 Mar 2021 19:32:22 -0700 Subject: skmsg: Lock ingress_skb when purging Currently we purge the ingress_skb queue only when psock refcnt goes down to 0, so locking the queue is not necessary, but in order to be called during ->close, we have to lock it here. Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Acked-by: Jakub Sitnicki Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210331023237.41094-2-xiyou.wangcong@gmail.com --- net/core/skmsg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core/skmsg.c') diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 07f54015238a..bebf84ed4e30 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -634,7 +634,7 @@ static void sk_psock_zap_ingress(struct sk_psock *psock) { struct sk_buff *skb; - while ((skb = __skb_dequeue(&psock->ingress_skb)) != NULL) { + while ((skb = skb_dequeue(&psock->ingress_skb)) != NULL) { skb_bpf_redirect_clear(skb); kfree_skb(skb); } -- cgit v1.2.3 From b01fd6e802b6d0a635176f943315670b679d8d7b Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 30 Mar 2021 19:32:23 -0700 Subject: skmsg: Introduce a spinlock to protect ingress_msg Currently we rely on lock_sock to protect ingress_msg, it is too big for this, we can actually just use a spinlock to protect this list like protecting other skb queues. __tcp_bpf_recvmsg() is still special because of peeking, it still has to use lock_sock. Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Acked-by: Jakub Sitnicki Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210331023237.41094-3-xiyou.wangcong@gmail.com --- include/linux/skmsg.h | 46 ++++++++++++++++++++++++++++++++++++++++++++++ net/core/skmsg.c | 3 +++ net/ipv4/tcp_bpf.c | 18 ++++++------------ 3 files changed, 55 insertions(+), 12 deletions(-) (limited to 'net/core/skmsg.c') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 6c09d94be2e9..f2d45a73b2b2 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -89,6 +89,7 @@ struct sk_psock { #endif struct sk_buff_head ingress_skb; struct list_head ingress_msg; + spinlock_t ingress_lock; unsigned long state; struct list_head link; spinlock_t link_lock; @@ -284,7 +285,45 @@ static inline struct sk_psock *sk_psock(const struct sock *sk) static inline void sk_psock_queue_msg(struct sk_psock *psock, struct sk_msg *msg) { + spin_lock_bh(&psock->ingress_lock); list_add_tail(&msg->list, &psock->ingress_msg); + spin_unlock_bh(&psock->ingress_lock); +} + +static inline struct sk_msg *sk_psock_dequeue_msg(struct sk_psock *psock) +{ + struct sk_msg *msg; + + spin_lock_bh(&psock->ingress_lock); + msg = list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list); + if (msg) + list_del(&msg->list); + spin_unlock_bh(&psock->ingress_lock); + return msg; +} + +static inline struct sk_msg *sk_psock_peek_msg(struct sk_psock *psock) +{ + struct sk_msg *msg; + + spin_lock_bh(&psock->ingress_lock); + msg = list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list); + spin_unlock_bh(&psock->ingress_lock); + return msg; +} + +static inline struct sk_msg *sk_psock_next_msg(struct sk_psock *psock, + struct sk_msg *msg) +{ + struct sk_msg *ret; + + spin_lock_bh(&psock->ingress_lock); + if (list_is_last(&msg->list, &psock->ingress_msg)) + ret = NULL; + else + ret = list_next_entry(msg, list); + spin_unlock_bh(&psock->ingress_lock); + return ret; } static inline bool sk_psock_queue_empty(const struct sk_psock *psock) @@ -292,6 +331,13 @@ static inline bool sk_psock_queue_empty(const struct sk_psock *psock) return psock ? list_empty(&psock->ingress_msg) : true; } +static inline void kfree_sk_msg(struct sk_msg *msg) +{ + if (msg->skb) + consume_skb(msg->skb); + kfree(msg); +} + static inline void sk_psock_report_error(struct sk_psock *psock, int err) { struct sock *sk = psock->sk; diff --git a/net/core/skmsg.c b/net/core/skmsg.c index bebf84ed4e30..305dddc51857 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -592,6 +592,7 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node) INIT_WORK(&psock->work, sk_psock_backlog); INIT_LIST_HEAD(&psock->ingress_msg); + spin_lock_init(&psock->ingress_lock); skb_queue_head_init(&psock->ingress_skb); sk_psock_set_state(psock, SK_PSOCK_TX_ENABLED); @@ -638,7 +639,9 @@ static void sk_psock_zap_ingress(struct sk_psock *psock) skb_bpf_redirect_clear(skb); kfree_skb(skb); } + spin_lock_bh(&psock->ingress_lock); __sk_psock_purge_ingress_msg(psock); + spin_unlock_bh(&psock->ingress_lock); } static void sk_psock_link_destroy(struct sk_psock *psock) diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 17c322b875fd..ae980716d896 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -18,9 +18,7 @@ int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, struct sk_msg *msg_rx; int i, copied = 0; - msg_rx = list_first_entry_or_null(&psock->ingress_msg, - struct sk_msg, list); - + msg_rx = sk_psock_peek_msg(psock); while (copied != len) { struct scatterlist *sge; @@ -68,22 +66,18 @@ int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, } while (i != msg_rx->sg.end); if (unlikely(peek)) { - if (msg_rx == list_last_entry(&psock->ingress_msg, - struct sk_msg, list)) + msg_rx = sk_psock_next_msg(psock, msg_rx); + if (!msg_rx) break; - msg_rx = list_next_entry(msg_rx, list); continue; } msg_rx->sg.start = i; if (!sge->length && msg_rx->sg.start == msg_rx->sg.end) { - list_del(&msg_rx->list); - if (msg_rx->skb) - consume_skb(msg_rx->skb); - kfree(msg_rx); + msg_rx = sk_psock_dequeue_msg(psock); + kfree_sk_msg(msg_rx); } - msg_rx = list_first_entry_or_null(&psock->ingress_msg, - struct sk_msg, list); + msg_rx = sk_psock_peek_msg(psock); } return copied; -- cgit v1.2.3 From 799aa7f98d53e0f541fa6b4dc9aa47b4ff2178e3 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 30 Mar 2021 19:32:25 -0700 Subject: skmsg: Avoid lock_sock() in sk_psock_backlog() We do not have to lock the sock to avoid losing sk_socket, instead we can purge all the ingress queues when we close the socket. Sending or receiving packets after orphaning socket makes no sense. We do purge these queues when psock refcnt reaches zero but here we want to purge them explicitly in sock_map_close(). There are also some nasty race conditions on testing bit SK_PSOCK_TX_ENABLED and queuing/canceling the psock work, we can expand psock->ingress_lock a bit to protect them too. As noticed by John, we still have to lock the psock->work, because the same work item could be running concurrently on different CPU's. Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210331023237.41094-5-xiyou.wangcong@gmail.com --- include/linux/skmsg.h | 2 ++ net/core/skmsg.c | 50 ++++++++++++++++++++++++++++++++++---------------- net/core/sock_map.c | 1 + 3 files changed, 37 insertions(+), 16 deletions(-) (limited to 'net/core/skmsg.c') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index f2d45a73b2b2..7382c4b518d7 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -99,6 +99,7 @@ struct sk_psock { void (*saved_write_space)(struct sock *sk); void (*saved_data_ready)(struct sock *sk); struct proto *sk_proto; + struct mutex work_mutex; struct sk_psock_work_state work_state; struct work_struct work; union { @@ -347,6 +348,7 @@ static inline void sk_psock_report_error(struct sk_psock *psock, int err) } struct sk_psock *sk_psock_init(struct sock *sk, int node); +void sk_psock_stop(struct sk_psock *psock, bool wait); #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock); diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 305dddc51857..9c25020086a9 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -497,7 +497,7 @@ static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb, if (!ingress) { if (!sock_writeable(psock->sk)) return -EAGAIN; - return skb_send_sock_locked(psock->sk, skb, off, len); + return skb_send_sock(psock->sk, skb, off, len); } return sk_psock_skb_ingress(psock, skb); } @@ -511,8 +511,7 @@ static void sk_psock_backlog(struct work_struct *work) u32 len, off; int ret; - /* Lock sock to avoid losing sk_socket during loop. */ - lock_sock(psock->sk); + mutex_lock(&psock->work_mutex); if (state->skb) { skb = state->skb; len = state->len; @@ -529,7 +528,7 @@ start: skb_bpf_redirect_clear(skb); do { ret = -EIO; - if (likely(psock->sk->sk_socket)) + if (!sock_flag(psock->sk, SOCK_DEAD)) ret = sk_psock_handle_skb(psock, skb, off, len, ingress); if (ret <= 0) { @@ -553,7 +552,7 @@ start: kfree_skb(skb); } end: - release_sock(psock->sk); + mutex_unlock(&psock->work_mutex); } struct sk_psock *sk_psock_init(struct sock *sk, int node) @@ -591,6 +590,7 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node) spin_lock_init(&psock->link_lock); INIT_WORK(&psock->work, sk_psock_backlog); + mutex_init(&psock->work_mutex); INIT_LIST_HEAD(&psock->ingress_msg); spin_lock_init(&psock->ingress_lock); skb_queue_head_init(&psock->ingress_skb); @@ -631,7 +631,7 @@ static void __sk_psock_purge_ingress_msg(struct sk_psock *psock) } } -static void sk_psock_zap_ingress(struct sk_psock *psock) +static void __sk_psock_zap_ingress(struct sk_psock *psock) { struct sk_buff *skb; @@ -639,9 +639,7 @@ static void sk_psock_zap_ingress(struct sk_psock *psock) skb_bpf_redirect_clear(skb); kfree_skb(skb); } - spin_lock_bh(&psock->ingress_lock); __sk_psock_purge_ingress_msg(psock); - spin_unlock_bh(&psock->ingress_lock); } static void sk_psock_link_destroy(struct sk_psock *psock) @@ -654,6 +652,18 @@ static void sk_psock_link_destroy(struct sk_psock *psock) } } +void sk_psock_stop(struct sk_psock *psock, bool wait) +{ + spin_lock_bh(&psock->ingress_lock); + sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED); + sk_psock_cork_free(psock); + __sk_psock_zap_ingress(psock); + spin_unlock_bh(&psock->ingress_lock); + + if (wait) + cancel_work_sync(&psock->work); +} + static void sk_psock_done_strp(struct sk_psock *psock); static void sk_psock_destroy_deferred(struct work_struct *gc) @@ -665,12 +675,12 @@ static void sk_psock_destroy_deferred(struct work_struct *gc) sk_psock_done_strp(psock); cancel_work_sync(&psock->work); + mutex_destroy(&psock->work_mutex); psock_progs_drop(&psock->progs); sk_psock_link_destroy(psock); sk_psock_cork_free(psock); - sk_psock_zap_ingress(psock); if (psock->sk_redir) sock_put(psock->sk_redir); @@ -688,8 +698,7 @@ static void sk_psock_destroy(struct rcu_head *rcu) void sk_psock_drop(struct sock *sk, struct sk_psock *psock) { - sk_psock_cork_free(psock); - sk_psock_zap_ingress(psock); + sk_psock_stop(psock, false); write_lock_bh(&sk->sk_callback_lock); sk_psock_restore_proto(sk, psock); @@ -699,7 +708,6 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock) else if (psock->progs.stream_verdict) sk_psock_stop_verdict(sk, psock); write_unlock_bh(&sk->sk_callback_lock); - sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED); call_rcu(&psock->rcu, sk_psock_destroy); } @@ -770,14 +778,20 @@ static void sk_psock_skb_redirect(struct sk_buff *skb) * error that caused the pipe to break. We can't send a packet on * a socket that is in this state so we drop the skb. */ - if (!psock_other || sock_flag(sk_other, SOCK_DEAD) || - !sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) { + if (!psock_other || sock_flag(sk_other, SOCK_DEAD)) { + kfree_skb(skb); + return; + } + spin_lock_bh(&psock_other->ingress_lock); + if (!sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) { + spin_unlock_bh(&psock_other->ingress_lock); kfree_skb(skb); return; } skb_queue_tail(&psock_other->ingress_skb, skb); schedule_work(&psock_other->work); + spin_unlock_bh(&psock_other->ingress_lock); } static void sk_psock_tls_verdict_apply(struct sk_buff *skb, struct sock *sk, int verdict) @@ -845,8 +859,12 @@ static void sk_psock_verdict_apply(struct sk_psock *psock, err = sk_psock_skb_ingress_self(psock, skb); } if (err < 0) { - skb_queue_tail(&psock->ingress_skb, skb); - schedule_work(&psock->work); + spin_lock_bh(&psock->ingress_lock); + if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) { + skb_queue_tail(&psock->ingress_skb, skb); + schedule_work(&psock->work); + } + spin_unlock_bh(&psock->ingress_lock); } break; case __SK_REDIRECT: diff --git a/net/core/sock_map.c b/net/core/sock_map.c index dd53a7771d7e..e564fdeaada1 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -1540,6 +1540,7 @@ void sock_map_close(struct sock *sk, long timeout) saved_close = psock->saved_close; sock_map_remove_links(sk, psock); rcu_read_unlock(); + sk_psock_stop(psock, true); release_sock(sk); saved_close(sk, timeout); } -- cgit v1.2.3 From 7786dfc41a74e0567557b5c4a28fc8482f5f5691 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 30 Mar 2021 19:32:26 -0700 Subject: skmsg: Use rcu work for destroying psock The RCU callback sk_psock_destroy() only queues work psock->gc, so we can just switch to rcu work to simplify the code. Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210331023237.41094-6-xiyou.wangcong@gmail.com --- include/linux/skmsg.h | 5 +---- net/core/skmsg.c | 17 +++++------------ 2 files changed, 6 insertions(+), 16 deletions(-) (limited to 'net/core/skmsg.c') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 7382c4b518d7..e7aba150539d 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -102,10 +102,7 @@ struct sk_psock { struct mutex work_mutex; struct sk_psock_work_state work_state; struct work_struct work; - union { - struct rcu_head rcu; - struct work_struct gc; - }; + struct rcu_work rwork; }; int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len, diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 9c25020086a9..d43d43905d2c 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -666,10 +666,10 @@ void sk_psock_stop(struct sk_psock *psock, bool wait) static void sk_psock_done_strp(struct sk_psock *psock); -static void sk_psock_destroy_deferred(struct work_struct *gc) +static void sk_psock_destroy(struct work_struct *work) { - struct sk_psock *psock = container_of(gc, struct sk_psock, gc); - + struct sk_psock *psock = container_of(to_rcu_work(work), + struct sk_psock, rwork); /* No sk_callback_lock since already detached. */ sk_psock_done_strp(psock); @@ -688,14 +688,6 @@ static void sk_psock_destroy_deferred(struct work_struct *gc) kfree(psock); } -static void sk_psock_destroy(struct rcu_head *rcu) -{ - struct sk_psock *psock = container_of(rcu, struct sk_psock, rcu); - - INIT_WORK(&psock->gc, sk_psock_destroy_deferred); - schedule_work(&psock->gc); -} - void sk_psock_drop(struct sock *sk, struct sk_psock *psock) { sk_psock_stop(psock, false); @@ -709,7 +701,8 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock) sk_psock_stop_verdict(sk, psock); write_unlock_bh(&sk->sk_callback_lock); - call_rcu(&psock->rcu, sk_psock_destroy); + INIT_RCU_WORK(&psock->rwork, sk_psock_destroy); + queue_rcu_work(system_wq, &psock->rwork); } EXPORT_SYMBOL_GPL(sk_psock_drop); -- cgit v1.2.3 From 190179f65ba8bc18dc1d38435b7932505ca5544f Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 30 Mar 2021 19:32:27 -0700 Subject: skmsg: Use GFP_KERNEL in sk_psock_create_ingress_msg() This function is only called in process context. Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210331023237.41094-7-xiyou.wangcong@gmail.com --- net/core/skmsg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core/skmsg.c') diff --git a/net/core/skmsg.c b/net/core/skmsg.c index d43d43905d2c..656eceab73bc 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -410,7 +410,7 @@ static struct sk_msg *sk_psock_create_ingress_msg(struct sock *sk, if (!sk_rmem_schedule(sk, skb, skb->truesize)) return NULL; - msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_ATOMIC); + msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_KERNEL); if (unlikely(!msg)) return NULL; -- cgit v1.2.3 From a7ba4558e69a3c2ae4ca521f015832ef44799538 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 30 Mar 2021 19:32:30 -0700 Subject: sock_map: Introduce BPF_SK_SKB_VERDICT Reusing BPF_SK_SKB_STREAM_VERDICT is possible but its name is confusing and more importantly we still want to distinguish them from user-space. So we can just reuse the stream verdict code but introduce a new type of eBPF program, skb_verdict. Users are not allowed to attach stream_verdict and skb_verdict programs to the same map. Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210331023237.41094-10-xiyou.wangcong@gmail.com --- include/linux/skmsg.h | 2 ++ include/uapi/linux/bpf.h | 1 + kernel/bpf/syscall.c | 1 + net/core/skmsg.c | 4 +++- net/core/sock_map.c | 28 ++++++++++++++++++++++++++++ tools/bpf/bpftool/common.c | 1 + tools/bpf/bpftool/prog.c | 1 + tools/include/uapi/linux/bpf.h | 1 + 8 files changed, 38 insertions(+), 1 deletion(-) (limited to 'net/core/skmsg.c') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index e7aba150539d..c83dbc2d81d9 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -58,6 +58,7 @@ struct sk_psock_progs { struct bpf_prog *msg_parser; struct bpf_prog *stream_parser; struct bpf_prog *stream_verdict; + struct bpf_prog *skb_verdict; }; enum sk_psock_state_bits { @@ -487,6 +488,7 @@ static inline void psock_progs_drop(struct sk_psock_progs *progs) psock_set_prog(&progs->msg_parser, NULL); psock_set_prog(&progs->stream_parser, NULL); psock_set_prog(&progs->stream_verdict, NULL); + psock_set_prog(&progs->skb_verdict, NULL); } int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 598716742593..49371eba98ba 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -957,6 +957,7 @@ enum bpf_attach_type { BPF_XDP_CPUMAP, BPF_SK_LOOKUP, BPF_XDP, + BPF_SK_SKB_VERDICT, __MAX_BPF_ATTACH_TYPE }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 9603de81811a..6428634da57e 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2948,6 +2948,7 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) return BPF_PROG_TYPE_SK_MSG; case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: + case BPF_SK_SKB_VERDICT: return BPF_PROG_TYPE_SK_SKB; case BPF_LIRC_MODE2: return BPF_PROG_TYPE_LIRC_MODE2; diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 656eceab73bc..a045812d7c78 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -697,7 +697,7 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock) rcu_assign_sk_user_data(sk, NULL); if (psock->progs.stream_parser) sk_psock_stop_strp(sk, psock); - else if (psock->progs.stream_verdict) + else if (psock->progs.stream_verdict || psock->progs.skb_verdict) sk_psock_stop_verdict(sk, psock); write_unlock_bh(&sk->sk_callback_lock); @@ -1024,6 +1024,8 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb, } skb_set_owner_r(skb, sk); prog = READ_ONCE(psock->progs.stream_verdict); + if (!prog) + prog = READ_ONCE(psock->progs.skb_verdict); if (likely(prog)) { skb_dst_drop(skb); skb_bpf_redirect_clear(skb); diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 42d797291d34..c2a0411e08a8 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -156,6 +156,8 @@ static void sock_map_del_link(struct sock *sk, strp_stop = true; if (psock->saved_data_ready && stab->progs.stream_verdict) verdict_stop = true; + if (psock->saved_data_ready && stab->progs.skb_verdict) + verdict_stop = true; list_del(&link->list); sk_psock_free_link(link); } @@ -232,6 +234,7 @@ static int sock_map_link(struct bpf_map *map, struct sock *sk) struct sk_psock_progs *progs = sock_map_progs(map); struct bpf_prog *stream_verdict = NULL; struct bpf_prog *stream_parser = NULL; + struct bpf_prog *skb_verdict = NULL; struct bpf_prog *msg_parser = NULL; struct sk_psock *psock; int ret; @@ -268,6 +271,15 @@ static int sock_map_link(struct bpf_map *map, struct sock *sk) } } + skb_verdict = READ_ONCE(progs->skb_verdict); + if (skb_verdict) { + skb_verdict = bpf_prog_inc_not_zero(skb_verdict); + if (IS_ERR(skb_verdict)) { + ret = PTR_ERR(skb_verdict); + goto out_put_msg_parser; + } + } + no_progs: psock = sock_map_psock_get_checked(sk); if (IS_ERR(psock)) { @@ -278,6 +290,9 @@ no_progs: if (psock) { if ((msg_parser && READ_ONCE(psock->progs.msg_parser)) || (stream_parser && READ_ONCE(psock->progs.stream_parser)) || + (skb_verdict && READ_ONCE(psock->progs.skb_verdict)) || + (skb_verdict && READ_ONCE(psock->progs.stream_verdict)) || + (stream_verdict && READ_ONCE(psock->progs.skb_verdict)) || (stream_verdict && READ_ONCE(psock->progs.stream_verdict))) { sk_psock_put(sk, psock); ret = -EBUSY; @@ -309,6 +324,9 @@ no_progs: } else if (!stream_parser && stream_verdict && !psock->saved_data_ready) { psock_set_prog(&psock->progs.stream_verdict, stream_verdict); sk_psock_start_verdict(sk,psock); + } else if (!stream_verdict && skb_verdict && !psock->saved_data_ready) { + psock_set_prog(&psock->progs.skb_verdict, skb_verdict); + sk_psock_start_verdict(sk, psock); } write_unlock_bh(&sk->sk_callback_lock); return 0; @@ -317,6 +335,9 @@ out_unlock_drop: out_drop: sk_psock_put(sk, psock); out_progs: + if (skb_verdict) + bpf_prog_put(skb_verdict); +out_put_msg_parser: if (msg_parser) bpf_prog_put(msg_parser); out_put_stream_parser: @@ -1442,8 +1463,15 @@ static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, break; #endif case BPF_SK_SKB_STREAM_VERDICT: + if (progs->skb_verdict) + return -EBUSY; pprog = &progs->stream_verdict; break; + case BPF_SK_SKB_VERDICT: + if (progs->stream_verdict) + return -EBUSY; + pprog = &progs->skb_verdict; + break; default: return -EOPNOTSUPP; } diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c index 65303664417e..1828bba19020 100644 --- a/tools/bpf/bpftool/common.c +++ b/tools/bpf/bpftool/common.c @@ -57,6 +57,7 @@ const char * const attach_type_name[__MAX_BPF_ATTACH_TYPE] = { [BPF_SK_SKB_STREAM_PARSER] = "sk_skb_stream_parser", [BPF_SK_SKB_STREAM_VERDICT] = "sk_skb_stream_verdict", + [BPF_SK_SKB_VERDICT] = "sk_skb_verdict", [BPF_SK_MSG_VERDICT] = "sk_msg_verdict", [BPF_LIRC_MODE2] = "lirc_mode2", [BPF_FLOW_DISSECTOR] = "flow_dissector", diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index f2b915b20546..3f067d2d7584 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -76,6 +76,7 @@ enum dump_mode { static const char * const attach_type_strings[] = { [BPF_SK_SKB_STREAM_PARSER] = "stream_parser", [BPF_SK_SKB_STREAM_VERDICT] = "stream_verdict", + [BPF_SK_SKB_VERDICT] = "skb_verdict", [BPF_SK_MSG_VERDICT] = "msg_verdict", [BPF_FLOW_DISSECTOR] = "flow_dissector", [__MAX_BPF_ATTACH_TYPE] = NULL, diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index ab9f2233607c..69902603012c 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -957,6 +957,7 @@ enum bpf_attach_type { BPF_XDP_CPUMAP, BPF_SK_LOOKUP, BPF_XDP, + BPF_SK_SKB_VERDICT, __MAX_BPF_ATTACH_TYPE }; -- cgit v1.2.3 From 8a59f9d1e3d4340659fdfee8879dc09a6f2546e1 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 30 Mar 2021 19:32:31 -0700 Subject: sock: Introduce sk->sk_prot->psock_update_sk_prot() Currently sockmap calls into each protocol to update the struct proto and replace it. This certainly won't work when the protocol is implemented as a module, for example, AF_UNIX. Introduce a new ops sk->sk_prot->psock_update_sk_prot(), so each protocol can implement its own way to replace the struct proto. This also helps get rid of symbol dependencies on CONFIG_INET. Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210331023237.41094-11-xiyou.wangcong@gmail.com --- include/linux/skmsg.h | 18 +++--------------- include/net/sock.h | 3 +++ include/net/tcp.h | 1 + include/net/udp.h | 1 + net/core/skmsg.c | 5 ----- net/core/sock_map.c | 24 ++++-------------------- net/ipv4/tcp_bpf.c | 24 +++++++++++++++++++++--- net/ipv4/tcp_ipv4.c | 3 +++ net/ipv4/udp.c | 3 +++ net/ipv4/udp_bpf.c | 15 +++++++++++++-- net/ipv6/tcp_ipv6.c | 3 +++ net/ipv6/udp.c | 3 +++ 12 files changed, 58 insertions(+), 45 deletions(-) (limited to 'net/core/skmsg.c') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index c83dbc2d81d9..5e800ddc2dc6 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -99,6 +99,7 @@ struct sk_psock { void (*saved_close)(struct sock *sk, long timeout); void (*saved_write_space)(struct sock *sk); void (*saved_data_ready)(struct sock *sk); + int (*psock_update_sk_prot)(struct sock *sk, bool restore); struct proto *sk_proto; struct mutex work_mutex; struct sk_psock_work_state work_state; @@ -395,25 +396,12 @@ static inline void sk_psock_cork_free(struct sk_psock *psock) } } -static inline void sk_psock_update_proto(struct sock *sk, - struct sk_psock *psock, - struct proto *ops) -{ - /* Pairs with lockless read in sk_clone_lock() */ - WRITE_ONCE(sk->sk_prot, ops); -} - static inline void sk_psock_restore_proto(struct sock *sk, struct sk_psock *psock) { sk->sk_prot->unhash = psock->saved_unhash; - if (inet_csk_has_ulp(sk)) { - tcp_update_ulp(sk, psock->sk_proto, psock->saved_write_space); - } else { - sk->sk_write_space = psock->saved_write_space; - /* Pairs with lockless read in sk_clone_lock() */ - WRITE_ONCE(sk->sk_prot, psock->sk_proto); - } + if (psock->psock_update_sk_prot) + psock->psock_update_sk_prot(sk, true); } static inline void sk_psock_set_state(struct sk_psock *psock, diff --git a/include/net/sock.h b/include/net/sock.h index 0b6266fd6bf6..8b4155e756c2 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1184,6 +1184,9 @@ struct proto { void (*unhash)(struct sock *sk); void (*rehash)(struct sock *sk); int (*get_port)(struct sock *sk, unsigned short snum); +#ifdef CONFIG_BPF_SYSCALL + int (*psock_update_sk_prot)(struct sock *sk, bool restore); +#endif /* Keeping track of sockets in use */ #ifdef CONFIG_PROC_FS diff --git a/include/net/tcp.h b/include/net/tcp.h index 075de26f449d..2efa4e5ea23d 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2203,6 +2203,7 @@ struct sk_psock; #ifdef CONFIG_BPF_SYSCALL struct proto *tcp_bpf_get_proto(struct sock *sk, struct sk_psock *psock); +int tcp_bpf_update_proto(struct sock *sk, bool restore); void tcp_bpf_clone(const struct sock *sk, struct sock *newsk); #endif /* CONFIG_BPF_SYSCALL */ diff --git a/include/net/udp.h b/include/net/udp.h index d4d064c59232..df7cc1edc200 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -518,6 +518,7 @@ static inline struct sk_buff *udp_rcv_segment(struct sock *sk, #ifdef CONFIG_BPF_SYSCALL struct sk_psock; struct proto *udp_bpf_get_proto(struct sock *sk, struct sk_psock *psock); +int udp_bpf_update_proto(struct sock *sk, bool restore); #endif #endif /* _UDP_H */ diff --git a/net/core/skmsg.c b/net/core/skmsg.c index a045812d7c78..9fc83f7cc1a0 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -562,11 +562,6 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node) write_lock_bh(&sk->sk_callback_lock); - if (inet_csk_has_ulp(sk)) { - psock = ERR_PTR(-EINVAL); - goto out; - } - if (sk->sk_user_data) { psock = ERR_PTR(-EBUSY); goto out; diff --git a/net/core/sock_map.c b/net/core/sock_map.c index c2a0411e08a8..2915c7c8778b 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -185,26 +185,10 @@ static void sock_map_unref(struct sock *sk, void *link_raw) static int sock_map_init_proto(struct sock *sk, struct sk_psock *psock) { - struct proto *prot; - - switch (sk->sk_type) { - case SOCK_STREAM: - prot = tcp_bpf_get_proto(sk, psock); - break; - - case SOCK_DGRAM: - prot = udp_bpf_get_proto(sk, psock); - break; - - default: + if (!sk->sk_prot->psock_update_sk_prot) return -EINVAL; - } - - if (IS_ERR(prot)) - return PTR_ERR(prot); - - sk_psock_update_proto(sk, psock, prot); - return 0; + psock->psock_update_sk_prot = sk->sk_prot->psock_update_sk_prot; + return sk->sk_prot->psock_update_sk_prot(sk, false); } static struct sk_psock *sock_map_psock_get_checked(struct sock *sk) @@ -556,7 +540,7 @@ static bool sock_map_redirect_allowed(const struct sock *sk) static bool sock_map_sk_is_suitable(const struct sock *sk) { - return sk_is_tcp(sk) || sk_is_udp(sk); + return !!sk->sk_prot->psock_update_sk_prot; } static bool sock_map_sk_state_allowed(const struct sock *sk) diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index ae980716d896..ac8cfbaeacd2 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -595,20 +595,38 @@ static int tcp_bpf_assert_proto_ops(struct proto *ops) ops->sendpage == tcp_sendpage ? 0 : -ENOTSUPP; } -struct proto *tcp_bpf_get_proto(struct sock *sk, struct sk_psock *psock) +int tcp_bpf_update_proto(struct sock *sk, bool restore) { + struct sk_psock *psock = sk_psock(sk); int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4; int config = psock->progs.msg_parser ? TCP_BPF_TX : TCP_BPF_BASE; + if (restore) { + if (inet_csk_has_ulp(sk)) { + tcp_update_ulp(sk, psock->sk_proto, psock->saved_write_space); + } else { + sk->sk_write_space = psock->saved_write_space; + /* Pairs with lockless read in sk_clone_lock() */ + WRITE_ONCE(sk->sk_prot, psock->sk_proto); + } + return 0; + } + + if (inet_csk_has_ulp(sk)) + return -EINVAL; + if (sk->sk_family == AF_INET6) { if (tcp_bpf_assert_proto_ops(psock->sk_proto)) - return ERR_PTR(-EINVAL); + return -EINVAL; tcp_bpf_check_v6_needs_rebuild(psock->sk_proto); } - return &tcp_bpf_prots[family][config]; + /* Pairs with lockless read in sk_clone_lock() */ + WRITE_ONCE(sk->sk_prot, &tcp_bpf_prots[family][config]); + return 0; } +EXPORT_SYMBOL_GPL(tcp_bpf_update_proto); /* If a child got cloned from a listening socket that had tcp_bpf * protocol callbacks installed, we need to restore the callbacks to diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index daad4f99db32..dfc6d1c0e710 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2806,6 +2806,9 @@ struct proto tcp_prot = { .hash = inet_hash, .unhash = inet_unhash, .get_port = inet_csk_get_port, +#ifdef CONFIG_BPF_SYSCALL + .psock_update_sk_prot = tcp_bpf_update_proto, +#endif .enter_memory_pressure = tcp_enter_memory_pressure, .leave_memory_pressure = tcp_leave_memory_pressure, .stream_memory_free = tcp_stream_memory_free, diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 4a0478b17243..38952aaee3a1 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2849,6 +2849,9 @@ struct proto udp_prot = { .unhash = udp_lib_unhash, .rehash = udp_v4_rehash, .get_port = udp_v4_get_port, +#ifdef CONFIG_BPF_SYSCALL + .psock_update_sk_prot = udp_bpf_update_proto, +#endif .memory_allocated = &udp_memory_allocated, .sysctl_mem = sysctl_udp_mem, .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min), diff --git a/net/ipv4/udp_bpf.c b/net/ipv4/udp_bpf.c index 7a94791efc1a..6001f93cd3a0 100644 --- a/net/ipv4/udp_bpf.c +++ b/net/ipv4/udp_bpf.c @@ -41,12 +41,23 @@ static int __init udp_bpf_v4_build_proto(void) } core_initcall(udp_bpf_v4_build_proto); -struct proto *udp_bpf_get_proto(struct sock *sk, struct sk_psock *psock) +int udp_bpf_update_proto(struct sock *sk, bool restore) { int family = sk->sk_family == AF_INET ? UDP_BPF_IPV4 : UDP_BPF_IPV6; + struct sk_psock *psock = sk_psock(sk); + + if (restore) { + sk->sk_write_space = psock->saved_write_space; + /* Pairs with lockless read in sk_clone_lock() */ + WRITE_ONCE(sk->sk_prot, psock->sk_proto); + return 0; + } if (sk->sk_family == AF_INET6) udp_bpf_check_v6_needs_rebuild(psock->sk_proto); - return &udp_bpf_prots[family]; + /* Pairs with lockless read in sk_clone_lock() */ + WRITE_ONCE(sk->sk_prot, &udp_bpf_prots[family]); + return 0; } +EXPORT_SYMBOL_GPL(udp_bpf_update_proto); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index d0f007741e8e..bff22d6ef516 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2139,6 +2139,9 @@ struct proto tcpv6_prot = { .hash = inet6_hash, .unhash = inet_unhash, .get_port = inet_csk_get_port, +#ifdef CONFIG_BPF_SYSCALL + .psock_update_sk_prot = tcp_bpf_update_proto, +#endif .enter_memory_pressure = tcp_enter_memory_pressure, .leave_memory_pressure = tcp_leave_memory_pressure, .stream_memory_free = tcp_stream_memory_free, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index d25e5a9252fd..ef2c75bb4771 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1713,6 +1713,9 @@ struct proto udpv6_prot = { .unhash = udp_lib_unhash, .rehash = udp_v6_rehash, .get_port = udp_v6_get_port, +#ifdef CONFIG_BPF_SYSCALL + .psock_update_sk_prot = udp_bpf_update_proto, +#endif .memory_allocated = &udp_memory_allocated, .sysctl_mem = sysctl_udp_mem, .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min), -- cgit v1.2.3 From 2bc793e3272a13e337416c057cb81c5396ad91d1 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 30 Mar 2021 19:32:33 -0700 Subject: skmsg: Extract __tcp_bpf_recvmsg() and tcp_bpf_wait_data() Although these two functions are only used by TCP, they are not specific to TCP at all, both operate on skmsg and ingress_msg, so fit in net/core/skmsg.c very well. And we will need them for non-TCP, so rename and move them to skmsg.c and export them to modules. Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210331023237.41094-13-xiyou.wangcong@gmail.com --- include/linux/skmsg.h | 4 ++ include/net/tcp.h | 2 - net/core/skmsg.c | 98 +++++++++++++++++++++++++++++++++++++++++++++++++ net/ipv4/tcp_bpf.c | 100 +------------------------------------------------- net/tls/tls_sw.c | 4 +- 5 files changed, 106 insertions(+), 102 deletions(-) (limited to 'net/core/skmsg.c') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 5e800ddc2dc6..f78e90a04a69 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -125,6 +125,10 @@ int sk_msg_zerocopy_from_iter(struct sock *sk, struct iov_iter *from, struct sk_msg *msg, u32 bytes); int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from, struct sk_msg *msg, u32 bytes); +int sk_msg_wait_data(struct sock *sk, struct sk_psock *psock, int flags, + long timeo, int *err); +int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, + int len, int flags); static inline void sk_msg_check_to_free(struct sk_msg *msg, u32 i, u32 bytes) { diff --git a/include/net/tcp.h b/include/net/tcp.h index 2efa4e5ea23d..31b1696c62ba 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2209,8 +2209,6 @@ void tcp_bpf_clone(const struct sock *sk, struct sock *newsk); int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, u32 bytes, int flags); -int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, - struct msghdr *msg, int len, int flags); #endif /* CONFIG_NET_SOCK_MSG */ #if !defined(CONFIG_BPF_SYSCALL) || !defined(CONFIG_NET_SOCK_MSG) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 9fc83f7cc1a0..92a83c02562a 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -399,6 +399,104 @@ out: } EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter); +int sk_msg_wait_data(struct sock *sk, struct sk_psock *psock, int flags, + long timeo, int *err) +{ + DEFINE_WAIT_FUNC(wait, woken_wake_function); + int ret = 0; + + if (sk->sk_shutdown & RCV_SHUTDOWN) + return 1; + + if (!timeo) + return ret; + + add_wait_queue(sk_sleep(sk), &wait); + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); + ret = sk_wait_event(sk, &timeo, + !list_empty(&psock->ingress_msg) || + !skb_queue_empty(&sk->sk_receive_queue), &wait); + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); + remove_wait_queue(sk_sleep(sk), &wait); + return ret; +} +EXPORT_SYMBOL_GPL(sk_msg_wait_data); + +/* Receive sk_msg from psock->ingress_msg to @msg. */ +int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, + int len, int flags) +{ + struct iov_iter *iter = &msg->msg_iter; + int peek = flags & MSG_PEEK; + struct sk_msg *msg_rx; + int i, copied = 0; + + msg_rx = sk_psock_peek_msg(psock); + while (copied != len) { + struct scatterlist *sge; + + if (unlikely(!msg_rx)) + break; + + i = msg_rx->sg.start; + do { + struct page *page; + int copy; + + sge = sk_msg_elem(msg_rx, i); + copy = sge->length; + page = sg_page(sge); + if (copied + copy > len) + copy = len - copied; + copy = copy_page_to_iter(page, sge->offset, copy, iter); + if (!copy) + return copied ? copied : -EFAULT; + + copied += copy; + if (likely(!peek)) { + sge->offset += copy; + sge->length -= copy; + if (!msg_rx->skb) + sk_mem_uncharge(sk, copy); + msg_rx->sg.size -= copy; + + if (!sge->length) { + sk_msg_iter_var_next(i); + if (!msg_rx->skb) + put_page(page); + } + } else { + /* Lets not optimize peek case if copy_page_to_iter + * didn't copy the entire length lets just break. + */ + if (copy != sge->length) + return copied; + sk_msg_iter_var_next(i); + } + + if (copied == len) + break; + } while (i != msg_rx->sg.end); + + if (unlikely(peek)) { + msg_rx = sk_psock_next_msg(psock, msg_rx); + if (!msg_rx) + break; + continue; + } + + msg_rx->sg.start = i; + if (!sge->length && msg_rx->sg.start == msg_rx->sg.end) { + msg_rx = sk_psock_dequeue_msg(psock); + kfree_sk_msg(msg_rx); + } + msg_rx = sk_psock_peek_msg(psock); + } + + return copied; +} +EXPORT_SYMBOL_GPL(sk_msg_recvmsg); + static struct sk_msg *sk_psock_create_ingress_msg(struct sock *sk, struct sk_buff *skb) { diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index ac8cfbaeacd2..3d622a0d0753 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -10,80 +10,6 @@ #include #include -int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, - struct msghdr *msg, int len, int flags) -{ - struct iov_iter *iter = &msg->msg_iter; - int peek = flags & MSG_PEEK; - struct sk_msg *msg_rx; - int i, copied = 0; - - msg_rx = sk_psock_peek_msg(psock); - while (copied != len) { - struct scatterlist *sge; - - if (unlikely(!msg_rx)) - break; - - i = msg_rx->sg.start; - do { - struct page *page; - int copy; - - sge = sk_msg_elem(msg_rx, i); - copy = sge->length; - page = sg_page(sge); - if (copied + copy > len) - copy = len - copied; - copy = copy_page_to_iter(page, sge->offset, copy, iter); - if (!copy) - return copied ? copied : -EFAULT; - - copied += copy; - if (likely(!peek)) { - sge->offset += copy; - sge->length -= copy; - if (!msg_rx->skb) - sk_mem_uncharge(sk, copy); - msg_rx->sg.size -= copy; - - if (!sge->length) { - sk_msg_iter_var_next(i); - if (!msg_rx->skb) - put_page(page); - } - } else { - /* Lets not optimize peek case if copy_page_to_iter - * didn't copy the entire length lets just break. - */ - if (copy != sge->length) - return copied; - sk_msg_iter_var_next(i); - } - - if (copied == len) - break; - } while (i != msg_rx->sg.end); - - if (unlikely(peek)) { - msg_rx = sk_psock_next_msg(psock, msg_rx); - if (!msg_rx) - break; - continue; - } - - msg_rx->sg.start = i; - if (!sge->length && msg_rx->sg.start == msg_rx->sg.end) { - msg_rx = sk_psock_dequeue_msg(psock); - kfree_sk_msg(msg_rx); - } - msg_rx = sk_psock_peek_msg(psock); - } - - return copied; -} -EXPORT_SYMBOL_GPL(__tcp_bpf_recvmsg); - static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock, struct sk_msg *msg, u32 apply_bytes, int flags) { @@ -237,28 +163,6 @@ static bool tcp_bpf_stream_read(const struct sock *sk) return !empty; } -static int tcp_bpf_wait_data(struct sock *sk, struct sk_psock *psock, - int flags, long timeo, int *err) -{ - DEFINE_WAIT_FUNC(wait, woken_wake_function); - int ret = 0; - - if (sk->sk_shutdown & RCV_SHUTDOWN) - return 1; - - if (!timeo) - return ret; - - add_wait_queue(sk_sleep(sk), &wait); - sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); - ret = sk_wait_event(sk, &timeo, - !list_empty(&psock->ingress_msg) || - !skb_queue_empty(&sk->sk_receive_queue), &wait); - sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); - remove_wait_queue(sk_sleep(sk), &wait); - return ret; -} - static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len) { @@ -278,13 +182,13 @@ static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, } lock_sock(sk); msg_bytes_ready: - copied = __tcp_bpf_recvmsg(sk, psock, msg, len, flags); + copied = sk_msg_recvmsg(sk, psock, msg, len, flags); if (!copied) { int data, err = 0; long timeo; timeo = sock_rcvtimeo(sk, nonblock); - data = tcp_bpf_wait_data(sk, psock, flags, timeo, &err); + data = sk_msg_wait_data(sk, psock, flags, timeo, &err); if (data) { if (!sk_psock_queue_empty(psock)) goto msg_bytes_ready; diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 01d933ae5f16..1dcb34dfd56b 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1789,8 +1789,8 @@ int tls_sw_recvmsg(struct sock *sk, skb = tls_wait_data(sk, psock, flags, timeo, &err); if (!skb) { if (psock) { - int ret = __tcp_bpf_recvmsg(sk, psock, - msg, len, flags); + int ret = sk_msg_recvmsg(sk, psock, msg, len, + flags); if (ret > 0) { decrypted += ret; -- cgit v1.2.3