summaryrefslogtreecommitdiff
path: root/net/core/skmsg.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2021-04-29 21:57:23 +0300
committerLinus Torvalds <torvalds@linux-foundation.org>2021-04-29 21:57:23 +0300
commit9d31d2338950293ec19d9b095fbaa9030899dcb4 (patch)
treee688040d0557c24a2eeb9f6c9c223d949f6f7ef9 /net/core/skmsg.c
parent635de956a7f5a6ffcb04f29d70630c64c717b56b (diff)
parent4a52dd8fefb45626dace70a63c0738dbd83b7edb (diff)
downloadlinux-9d31d2338950293ec19d9b095fbaa9030899dcb4.tar.xz
Merge tag 'net-next-5.13' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next
Pull networking updates from Jakub Kicinski: "Core: - bpf: - allow bpf programs calling kernel functions (initially to reuse TCP congestion control implementations) - enable task local storage for tracing programs - remove the need to store per-task state in hash maps, and allow tracing programs access to task local storage previously added for BPF_LSM - add bpf_for_each_map_elem() helper, allowing programs to walk all map elements in a more robust and easier to verify fashion - sockmap: support UDP and cross-protocol BPF_SK_SKB_VERDICT redirection - lpm: add support for batched ops in LPM trie - add BTF_KIND_FLOAT support - mostly to allow use of BTF on s390 which has floats in its headers files - improve BPF syscall documentation and extend the use of kdoc parsing scripts we already employ for bpf-helpers - libbpf, bpftool: support static linking of BPF ELF files - improve support for encapsulation of L2 packets - xdp: restructure redirect actions to avoid a runtime lookup, improving performance by 4-8% in microbenchmarks - xsk: build skb by page (aka generic zerocopy xmit) - improve performance of software AF_XDP path by 33% for devices which don't need headers in the linear skb part (e.g. virtio) - nexthop: resilient next-hop groups - improve path stability on next-hops group changes (incl. offload for mlxsw) - ipv6: segment routing: add support for IPv4 decapsulation - icmp: add support for RFC 8335 extended PROBE messages - inet: use bigger hash table for IP ID generation - tcp: deal better with delayed TX completions - make sure we don't give up on fast TCP retransmissions only because driver is slow in reporting that it completed transmitting the original - tcp: reorder tcp_congestion_ops for better cache locality - mptcp: - add sockopt support for common TCP options - add support for common TCP msg flags - include multiple address ids in RM_ADDR - add reset option support for resetting one subflow - udp: GRO L4 improvements - improve 'forward' / 'frag_list' co-existence with UDP tunnel GRO, allowing the first to take place correctly even for encapsulated UDP traffic - micro-optimize dev_gro_receive() and flow dissection, avoid retpoline overhead on VLAN and TEB GRO - use less memory for sysctls, add a new sysctl type, to allow using u8 instead of "int" and "long" and shrink networking sysctls - veth: allow GRO without XDP - this allows aggregating UDP packets before handing them off to routing, bridge, OvS, etc. - allow specifing ifindex when device is moved to another namespace - netfilter: - nft_socket: add support for cgroupsv2 - nftables: add catch-all set element - special element used to define a default action in case normal lookup missed - use net_generic infra in many modules to avoid allocating per-ns memory unnecessarily - xps: improve the xps handling to avoid potential out-of-bound accesses and use-after-free when XPS change race with other re-configuration under traffic - add a config knob to turn off per-cpu netdev refcnt to catch underflows in testing Device APIs: - add WWAN subsystem to organize the WWAN interfaces better and hopefully start driving towards more unified and vendor- independent APIs - ethtool: - add interface for reading IEEE MIB stats (incl. mlx5 and bnxt support) - allow network drivers to dump arbitrary SFP EEPROM data, current offset+length API was a poor fit for modern SFP which define EEPROM in terms of pages (incl. mlx5 support) - act_police, flow_offload: add support for packet-per-second policing (incl. offload for nfp) - psample: add additional metadata attributes like transit delay for packets sampled from switch HW (and corresponding egress and policy-based sampling in the mlxsw driver) - dsa: improve support for sandwiched LAGs with bridge and DSA - netfilter: - flowtable: use direct xmit in topologies with IP forwarding, bridging, vlans etc. - nftables: counter hardware offload support - Bluetooth: - improvements for firmware download w/ Intel devices - add support for reading AOSP vendor capabilities - add support for virtio transport driver - mac80211: - allow concurrent monitor iface and ethernet rx decap - set priority and queue mapping for injected frames - phy: add support for Clause-45 PHY Loopback - pci/iov: add sysfs MSI-X vector assignment interface to distribute MSI-X resources to VFs (incl. mlx5 support) New hardware/drivers: - dsa: mv88e6xxx: add support for Marvell mv88e6393x - 11-port Ethernet switch with 8x 1-Gigabit Ethernet and 3x 10-Gigabit interfaces. - dsa: support for legacy Broadcom tags used on BCM5325, BCM5365 and BCM63xx switches - Microchip KSZ8863 and KSZ8873; 3x 10/100Mbps Ethernet switches - ath11k: support for QCN9074 a 802.11ax device - Bluetooth: Broadcom BCM4330 and BMC4334 - phy: Marvell 88X2222 transceiver support - mdio: add BCM6368 MDIO mux bus controller - r8152: support RTL8153 and RTL8156 (USB Ethernet) chips - mana: driver for Microsoft Azure Network Adapter (MANA) - Actions Semi Owl Ethernet MAC - can: driver for ETAS ES58X CAN/USB interfaces Pure driver changes: - add XDP support to: enetc, igc, stmmac - add AF_XDP support to: stmmac - virtio: - page_to_skb() use build_skb when there's sufficient tailroom (21% improvement for 1000B UDP frames) - support XDP even without dedicated Tx queues - share the Tx queues with the stack when necessary - mlx5: - flow rules: add support for mirroring with conntrack, matching on ICMP, GTP, flex filters and more - support packet sampling with flow offloads - persist uplink representor netdev across eswitch mode changes - allow coexistence of CQE compression and HW time-stamping - add ethtool extended link error state reporting - ice, iavf: support flow filters, UDP Segmentation Offload - dpaa2-switch: - move the driver out of staging - add spanning tree (STP) support - add rx copybreak support - add tc flower hardware offload on ingress traffic - ionic: - implement Rx page reuse - support HW PTP time-stamping - octeon: support TC hardware offloads - flower matching on ingress and egress ratelimitting. - stmmac: - add RX frame steering based on VLAN priority in tc flower - support frame preemption (FPE) - intel: add cross time-stamping freq difference adjustment - ocelot: - support forwarding of MRP frames in HW - support multiple bridges - support PTP Sync one-step timestamping - dsa: mv88e6xxx, dpaa2-switch: offload bridge port flags like learning, flooding etc. - ipa: add IPA v4.5, v4.9 and v4.11 support (Qualcomm SDX55, SM8350, SC7280 SoCs) - mt7601u: enable TDLS support - mt76: - add support for 802.3 rx frames (mt7915/mt7615) - mt7915 flash pre-calibration support - mt7921/mt7663 runtime power management fixes" * tag 'net-next-5.13' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (2451 commits) net: selftest: fix build issue if INET is disabled net: netrom: nr_in: Remove redundant assignment to ns net: tun: Remove redundant assignment to ret net: phy: marvell: add downshift support for M88E1240 net: dsa: ksz: Make reg_mib_cnt a u8 as it never exceeds 255 net/sched: act_ct: Remove redundant ct get and check icmp: standardize naming of RFC 8335 PROBE constants bpf, selftests: Update array map tests for per-cpu batched ops bpf: Add batched ops support for percpu array bpf: Implement formatted output helpers with bstr_printf seq_file: Add a seq_bprintf function sfc: adjust efx->xdp_tx_queue_count with the real number of initialized queues net:nfc:digital: Fix a double free in digital_tg_recv_dep_req net: fix a concurrency bug in l2tp_tunnel_register() net/smc: Remove redundant assignment to rc mpls: Remove redundant assignment to err llc2: Remove redundant assignment to rc net/tls: Remove redundant initialization of record rds: Remove redundant assignment to nr_sig dt-bindings: net: mdio-gpio: add compatible for microchip,mdio-smi0 ...
Diffstat (limited to 'net/core/skmsg.c')
-rw-r--r--net/core/skmsg.c383
1 files changed, 242 insertions, 141 deletions
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 5def3a2e85be..43ce17a6a585 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -399,6 +399,104 @@ out:
}
EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter);
+int sk_msg_wait_data(struct sock *sk, struct sk_psock *psock, int flags,
+ long timeo, int *err)
+{
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+ int ret = 0;
+
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ return 1;
+
+ if (!timeo)
+ return ret;
+
+ add_wait_queue(sk_sleep(sk), &wait);
+ sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+ ret = sk_wait_event(sk, &timeo,
+ !list_empty(&psock->ingress_msg) ||
+ !skb_queue_empty(&sk->sk_receive_queue), &wait);
+ sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+ remove_wait_queue(sk_sleep(sk), &wait);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(sk_msg_wait_data);
+
+/* Receive sk_msg from psock->ingress_msg to @msg. */
+int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
+ int len, int flags)
+{
+ struct iov_iter *iter = &msg->msg_iter;
+ int peek = flags & MSG_PEEK;
+ struct sk_msg *msg_rx;
+ int i, copied = 0;
+
+ msg_rx = sk_psock_peek_msg(psock);
+ while (copied != len) {
+ struct scatterlist *sge;
+
+ if (unlikely(!msg_rx))
+ break;
+
+ i = msg_rx->sg.start;
+ do {
+ struct page *page;
+ int copy;
+
+ sge = sk_msg_elem(msg_rx, i);
+ copy = sge->length;
+ page = sg_page(sge);
+ if (copied + copy > len)
+ copy = len - copied;
+ copy = copy_page_to_iter(page, sge->offset, copy, iter);
+ if (!copy)
+ return copied ? copied : -EFAULT;
+
+ copied += copy;
+ if (likely(!peek)) {
+ sge->offset += copy;
+ sge->length -= copy;
+ if (!msg_rx->skb)
+ sk_mem_uncharge(sk, copy);
+ msg_rx->sg.size -= copy;
+
+ if (!sge->length) {
+ sk_msg_iter_var_next(i);
+ if (!msg_rx->skb)
+ put_page(page);
+ }
+ } else {
+ /* Lets not optimize peek case if copy_page_to_iter
+ * didn't copy the entire length lets just break.
+ */
+ if (copy != sge->length)
+ return copied;
+ sk_msg_iter_var_next(i);
+ }
+
+ if (copied == len)
+ break;
+ } while (i != msg_rx->sg.end);
+
+ if (unlikely(peek)) {
+ msg_rx = sk_psock_next_msg(psock, msg_rx);
+ if (!msg_rx)
+ break;
+ continue;
+ }
+
+ msg_rx->sg.start = i;
+ if (!sge->length && msg_rx->sg.start == msg_rx->sg.end) {
+ msg_rx = sk_psock_dequeue_msg(psock);
+ kfree_sk_msg(msg_rx);
+ }
+ msg_rx = sk_psock_peek_msg(psock);
+ }
+
+ return copied;
+}
+EXPORT_SYMBOL_GPL(sk_msg_recvmsg);
+
static struct sk_msg *sk_psock_create_ingress_msg(struct sock *sk,
struct sk_buff *skb)
{
@@ -410,7 +508,7 @@ static struct sk_msg *sk_psock_create_ingress_msg(struct sock *sk,
if (!sk_rmem_schedule(sk, skb, skb->truesize))
return NULL;
- msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_ATOMIC);
+ msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_KERNEL);
if (unlikely(!msg))
return NULL;
@@ -498,7 +596,7 @@ static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb,
if (!ingress) {
if (!sock_writeable(psock->sk))
return -EAGAIN;
- return skb_send_sock_locked(psock->sk, skb, off, len);
+ return skb_send_sock(psock->sk, skb, off, len);
}
return sk_psock_skb_ingress(psock, skb);
}
@@ -512,8 +610,7 @@ static void sk_psock_backlog(struct work_struct *work)
u32 len, off;
int ret;
- /* Lock sock to avoid losing sk_socket during loop. */
- lock_sock(psock->sk);
+ mutex_lock(&psock->work_mutex);
if (state->skb) {
skb = state->skb;
len = state->len;
@@ -526,10 +623,11 @@ static void sk_psock_backlog(struct work_struct *work)
len = skb->len;
off = 0;
start:
- ingress = tcp_skb_bpf_ingress(skb);
+ ingress = skb_bpf_ingress(skb);
+ skb_bpf_redirect_clear(skb);
do {
ret = -EIO;
- if (likely(psock->sk->sk_socket))
+ if (!sock_flag(psock->sk, SOCK_DEAD))
ret = sk_psock_handle_skb(psock, skb, off,
len, ingress);
if (ret <= 0) {
@@ -553,7 +651,7 @@ start:
kfree_skb(skb);
}
end:
- release_sock(psock->sk);
+ mutex_unlock(&psock->work_mutex);
}
struct sk_psock *sk_psock_init(struct sock *sk, int node)
@@ -563,11 +661,6 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node)
write_lock_bh(&sk->sk_callback_lock);
- if (inet_csk_has_ulp(sk)) {
- psock = ERR_PTR(-EINVAL);
- goto out;
- }
-
if (sk->sk_user_data) {
psock = ERR_PTR(-EBUSY);
goto out;
@@ -591,7 +684,9 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node)
spin_lock_init(&psock->link_lock);
INIT_WORK(&psock->work, sk_psock_backlog);
+ mutex_init(&psock->work_mutex);
INIT_LIST_HEAD(&psock->ingress_msg);
+ spin_lock_init(&psock->ingress_lock);
skb_queue_head_init(&psock->ingress_skb);
sk_psock_set_state(psock, SK_PSOCK_TX_ENABLED);
@@ -619,7 +714,7 @@ struct sk_psock_link *sk_psock_link_pop(struct sk_psock *psock)
return link;
}
-void __sk_psock_purge_ingress_msg(struct sk_psock *psock)
+static void __sk_psock_purge_ingress_msg(struct sk_psock *psock)
{
struct sk_msg *msg, *tmp;
@@ -630,9 +725,14 @@ void __sk_psock_purge_ingress_msg(struct sk_psock *psock)
}
}
-static void sk_psock_zap_ingress(struct sk_psock *psock)
+static void __sk_psock_zap_ingress(struct sk_psock *psock)
{
- __skb_queue_purge(&psock->ingress_skb);
+ struct sk_buff *skb;
+
+ while ((skb = skb_dequeue(&psock->ingress_skb)) != NULL) {
+ skb_bpf_redirect_clear(skb);
+ kfree_skb(skb);
+ }
__sk_psock_purge_ingress_msg(psock);
}
@@ -646,23 +746,35 @@ static void sk_psock_link_destroy(struct sk_psock *psock)
}
}
-static void sk_psock_destroy_deferred(struct work_struct *gc)
+void sk_psock_stop(struct sk_psock *psock, bool wait)
{
- struct sk_psock *psock = container_of(gc, struct sk_psock, gc);
+ spin_lock_bh(&psock->ingress_lock);
+ sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
+ sk_psock_cork_free(psock);
+ __sk_psock_zap_ingress(psock);
+ spin_unlock_bh(&psock->ingress_lock);
+ if (wait)
+ cancel_work_sync(&psock->work);
+}
+
+static void sk_psock_done_strp(struct sk_psock *psock);
+
+static void sk_psock_destroy(struct work_struct *work)
+{
+ struct sk_psock *psock = container_of(to_rcu_work(work),
+ struct sk_psock, rwork);
/* No sk_callback_lock since already detached. */
- /* Parser has been stopped */
- if (psock->progs.skb_parser)
- strp_done(&psock->parser.strp);
+ sk_psock_done_strp(psock);
cancel_work_sync(&psock->work);
+ mutex_destroy(&psock->work_mutex);
psock_progs_drop(&psock->progs);
sk_psock_link_destroy(psock);
sk_psock_cork_free(psock);
- sk_psock_zap_ingress(psock);
if (psock->sk_redir)
sock_put(psock->sk_redir);
@@ -670,30 +782,21 @@ static void sk_psock_destroy_deferred(struct work_struct *gc)
kfree(psock);
}
-static void sk_psock_destroy(struct rcu_head *rcu)
-{
- struct sk_psock *psock = container_of(rcu, struct sk_psock, rcu);
-
- INIT_WORK(&psock->gc, sk_psock_destroy_deferred);
- schedule_work(&psock->gc);
-}
-
void sk_psock_drop(struct sock *sk, struct sk_psock *psock)
{
- sk_psock_cork_free(psock);
- sk_psock_zap_ingress(psock);
+ sk_psock_stop(psock, false);
write_lock_bh(&sk->sk_callback_lock);
sk_psock_restore_proto(sk, psock);
rcu_assign_sk_user_data(sk, NULL);
- if (psock->progs.skb_parser)
+ if (psock->progs.stream_parser)
sk_psock_stop_strp(sk, psock);
- else if (psock->progs.skb_verdict)
+ else if (psock->progs.stream_verdict || psock->progs.skb_verdict)
sk_psock_stop_verdict(sk, psock);
write_unlock_bh(&sk->sk_callback_lock);
- sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
- call_rcu(&psock->rcu, sk_psock_destroy);
+ INIT_RCU_WORK(&psock->rwork, sk_psock_destroy);
+ queue_rcu_work(system_wq, &psock->rwork);
}
EXPORT_SYMBOL_GPL(sk_psock_drop);
@@ -744,27 +847,12 @@ out:
}
EXPORT_SYMBOL_GPL(sk_psock_msg_verdict);
-static int sk_psock_bpf_run(struct sk_psock *psock, struct bpf_prog *prog,
- struct sk_buff *skb)
-{
- bpf_compute_data_end_sk_skb(skb);
- return bpf_prog_run_pin_on_cpu(prog, skb);
-}
-
-static struct sk_psock *sk_psock_from_strp(struct strparser *strp)
-{
- struct sk_psock_parser *parser;
-
- parser = container_of(strp, struct sk_psock_parser, strp);
- return container_of(parser, struct sk_psock, parser);
-}
-
static void sk_psock_skb_redirect(struct sk_buff *skb)
{
struct sk_psock *psock_other;
struct sock *sk_other;
- sk_other = tcp_skb_bpf_redirect_fetch(skb);
+ sk_other = skb_bpf_redirect_fetch(skb);
/* This error is a buggy BPF program, it returned a redirect
* return code, but then didn't set a redirect interface.
*/
@@ -777,14 +865,20 @@ static void sk_psock_skb_redirect(struct sk_buff *skb)
* error that caused the pipe to break. We can't send a packet on
* a socket that is in this state so we drop the skb.
*/
- if (!psock_other || sock_flag(sk_other, SOCK_DEAD) ||
- !sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) {
+ if (!psock_other || sock_flag(sk_other, SOCK_DEAD)) {
+ kfree_skb(skb);
+ return;
+ }
+ spin_lock_bh(&psock_other->ingress_lock);
+ if (!sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) {
+ spin_unlock_bh(&psock_other->ingress_lock);
kfree_skb(skb);
return;
}
skb_queue_tail(&psock_other->ingress_skb, skb);
schedule_work(&psock_other->work);
+ spin_unlock_bh(&psock_other->ingress_lock);
}
static void sk_psock_tls_verdict_apply(struct sk_buff *skb, struct sock *sk, int verdict)
@@ -806,12 +900,13 @@ int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb)
int ret = __SK_PASS;
rcu_read_lock();
- prog = READ_ONCE(psock->progs.skb_verdict);
+ prog = READ_ONCE(psock->progs.stream_verdict);
if (likely(prog)) {
skb->sk = psock->sk;
- tcp_skb_bpf_redirect_clear(skb);
- ret = sk_psock_bpf_run(psock, prog, skb);
- ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb));
+ skb_dst_drop(skb);
+ skb_bpf_redirect_clear(skb);
+ ret = bpf_prog_run_pin_on_cpu(prog, skb);
+ ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb));
skb->sk = NULL;
}
sk_psock_tls_verdict_apply(skb, psock->sk, ret);
@@ -823,7 +918,6 @@ EXPORT_SYMBOL_GPL(sk_psock_tls_strp_read);
static void sk_psock_verdict_apply(struct sk_psock *psock,
struct sk_buff *skb, int verdict)
{
- struct tcp_skb_cb *tcp;
struct sock *sk_other;
int err = -EIO;
@@ -835,8 +929,7 @@ static void sk_psock_verdict_apply(struct sk_psock *psock,
goto out_free;
}
- tcp = TCP_SKB_CB(skb);
- tcp->bpf.flags |= BPF_F_INGRESS;
+ skb_bpf_set_ingress(skb);
/* If the queue is empty then we can submit directly
* into the msg queue. If its not empty we have to
@@ -848,8 +941,12 @@ static void sk_psock_verdict_apply(struct sk_psock *psock,
err = sk_psock_skb_ingress_self(psock, skb);
}
if (err < 0) {
- skb_queue_tail(&psock->ingress_skb, skb);
- schedule_work(&psock->work);
+ spin_lock_bh(&psock->ingress_lock);
+ if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
+ skb_queue_tail(&psock->ingress_skb, skb);
+ schedule_work(&psock->work);
+ }
+ spin_unlock_bh(&psock->ingress_lock);
}
break;
case __SK_REDIRECT:
@@ -862,6 +959,24 @@ out_free:
}
}
+static void sk_psock_write_space(struct sock *sk)
+{
+ struct sk_psock *psock;
+ void (*write_space)(struct sock *sk) = NULL;
+
+ rcu_read_lock();
+ psock = sk_psock(sk);
+ if (likely(psock)) {
+ if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))
+ schedule_work(&psock->work);
+ write_space = psock->saved_write_space;
+ }
+ rcu_read_unlock();
+ if (write_space)
+ write_space(sk);
+}
+
+#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb)
{
struct sk_psock *psock;
@@ -876,12 +991,13 @@ static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb)
kfree_skb(skb);
goto out;
}
- prog = READ_ONCE(psock->progs.skb_verdict);
+ prog = READ_ONCE(psock->progs.stream_verdict);
if (likely(prog)) {
skb->sk = sk;
- tcp_skb_bpf_redirect_clear(skb);
- ret = sk_psock_bpf_run(psock, prog, skb);
- ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb));
+ skb_dst_drop(skb);
+ skb_bpf_redirect_clear(skb);
+ ret = bpf_prog_run_pin_on_cpu(prog, skb);
+ ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb));
skb->sk = NULL;
}
sk_psock_verdict_apply(psock, skb, ret);
@@ -896,15 +1012,15 @@ static int sk_psock_strp_read_done(struct strparser *strp, int err)
static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb)
{
- struct sk_psock *psock = sk_psock_from_strp(strp);
+ struct sk_psock *psock = container_of(strp, struct sk_psock, strp);
struct bpf_prog *prog;
int ret = skb->len;
rcu_read_lock();
- prog = READ_ONCE(psock->progs.skb_parser);
+ prog = READ_ONCE(psock->progs.stream_parser);
if (likely(prog)) {
skb->sk = psock->sk;
- ret = sk_psock_bpf_run(psock, prog, skb);
+ ret = bpf_prog_run_pin_on_cpu(prog, skb);
skb->sk = NULL;
}
rcu_read_unlock();
@@ -920,16 +1036,59 @@ static void sk_psock_strp_data_ready(struct sock *sk)
psock = sk_psock(sk);
if (likely(psock)) {
if (tls_sw_has_ctx_rx(sk)) {
- psock->parser.saved_data_ready(sk);
+ psock->saved_data_ready(sk);
} else {
write_lock_bh(&sk->sk_callback_lock);
- strp_data_ready(&psock->parser.strp);
+ strp_data_ready(&psock->strp);
write_unlock_bh(&sk->sk_callback_lock);
}
}
rcu_read_unlock();
}
+int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock)
+{
+ static const struct strp_callbacks cb = {
+ .rcv_msg = sk_psock_strp_read,
+ .read_sock_done = sk_psock_strp_read_done,
+ .parse_msg = sk_psock_strp_parse,
+ };
+
+ return strp_init(&psock->strp, sk, &cb);
+}
+
+void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock)
+{
+ if (psock->saved_data_ready)
+ return;
+
+ psock->saved_data_ready = sk->sk_data_ready;
+ sk->sk_data_ready = sk_psock_strp_data_ready;
+ sk->sk_write_space = sk_psock_write_space;
+}
+
+void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock)
+{
+ if (!psock->saved_data_ready)
+ return;
+
+ sk->sk_data_ready = psock->saved_data_ready;
+ psock->saved_data_ready = NULL;
+ strp_stop(&psock->strp);
+}
+
+static void sk_psock_done_strp(struct sk_psock *psock)
+{
+ /* Parser has been stopped */
+ if (psock->progs.stream_parser)
+ strp_done(&psock->strp);
+}
+#else
+static void sk_psock_done_strp(struct sk_psock *psock)
+{
+}
+#endif /* CONFIG_BPF_STREAM_PARSER */
+
static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb,
unsigned int offset, size_t orig_len)
{
@@ -953,12 +1112,15 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb,
kfree_skb(skb);
goto out;
}
- prog = READ_ONCE(psock->progs.skb_verdict);
+ prog = READ_ONCE(psock->progs.stream_verdict);
+ if (!prog)
+ prog = READ_ONCE(psock->progs.skb_verdict);
if (likely(prog)) {
skb->sk = sk;
- tcp_skb_bpf_redirect_clear(skb);
- ret = sk_psock_bpf_run(psock, prog, skb);
- ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb));
+ skb_dst_drop(skb);
+ skb_bpf_redirect_clear(skb);
+ ret = bpf_prog_run_pin_on_cpu(prog, skb);
+ ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb));
skb->sk = NULL;
}
sk_psock_verdict_apply(psock, skb, ret);
@@ -982,82 +1144,21 @@ static void sk_psock_verdict_data_ready(struct sock *sk)
sock->ops->read_sock(sk, &desc, sk_psock_verdict_recv);
}
-static void sk_psock_write_space(struct sock *sk)
-{
- struct sk_psock *psock;
- void (*write_space)(struct sock *sk) = NULL;
-
- rcu_read_lock();
- psock = sk_psock(sk);
- if (likely(psock)) {
- if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))
- schedule_work(&psock->work);
- write_space = psock->saved_write_space;
- }
- rcu_read_unlock();
- if (write_space)
- write_space(sk);
-}
-
-int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock)
-{
- static const struct strp_callbacks cb = {
- .rcv_msg = sk_psock_strp_read,
- .read_sock_done = sk_psock_strp_read_done,
- .parse_msg = sk_psock_strp_parse,
- };
-
- psock->parser.enabled = false;
- return strp_init(&psock->parser.strp, sk, &cb);
-}
-
void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock)
{
- struct sk_psock_parser *parser = &psock->parser;
-
- if (parser->enabled)
+ if (psock->saved_data_ready)
return;
- parser->saved_data_ready = sk->sk_data_ready;
+ psock->saved_data_ready = sk->sk_data_ready;
sk->sk_data_ready = sk_psock_verdict_data_ready;
sk->sk_write_space = sk_psock_write_space;
- parser->enabled = true;
-}
-
-void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock)
-{
- struct sk_psock_parser *parser = &psock->parser;
-
- if (parser->enabled)
- return;
-
- parser->saved_data_ready = sk->sk_data_ready;
- sk->sk_data_ready = sk_psock_strp_data_ready;
- sk->sk_write_space = sk_psock_write_space;
- parser->enabled = true;
-}
-
-void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock)
-{
- struct sk_psock_parser *parser = &psock->parser;
-
- if (!parser->enabled)
- return;
-
- sk->sk_data_ready = parser->saved_data_ready;
- parser->saved_data_ready = NULL;
- strp_stop(&parser->strp);
- parser->enabled = false;
}
void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock)
{
- struct sk_psock_parser *parser = &psock->parser;
-
- if (!parser->enabled)
+ if (!psock->saved_data_ready)
return;
- sk->sk_data_ready = parser->saved_data_ready;
- parser->saved_data_ready = NULL;
- parser->enabled = false;
+ sk->sk_data_ready = psock->saved_data_ready;
+ psock->saved_data_ready = NULL;
}