From ee5675ecdf7a4e713ed21d98a70c2871d6ebed01 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 16 Mar 2023 01:10:07 +0000 Subject: net/packet: convert po->origdev to an atomic flag syzbot/KCAN reported that po->origdev can be read while another thread is changing its value. We can avoid this splat by converting this field to an actual bit. Following patches will convert remaining 1bit fields. Fixes: 80feaacb8a64 ("[AF_PACKET]: Add option to return orig_dev to userspace.") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- net/packet/internal.h | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) (limited to 'net/packet/internal.h') diff --git a/net/packet/internal.h b/net/packet/internal.h index 48af35b1aed2..178cd1852238 100644 --- a/net/packet/internal.h +++ b/net/packet/internal.h @@ -116,9 +116,9 @@ struct packet_sock { int copy_thresh; spinlock_t bind_lock; struct mutex pg_vec_lock; + unsigned long flags; unsigned int running; /* bind_lock must be held */ unsigned int auxdata:1, /* writer must hold sock lock */ - origdev:1, has_vnet_hdr:1, tp_loss:1, tp_tx_has_off:1; @@ -144,4 +144,24 @@ static inline struct packet_sock *pkt_sk(struct sock *sk) return (struct packet_sock *)sk; } +enum packet_sock_flags { + PACKET_SOCK_ORIGDEV, +}; + +static inline void packet_sock_flag_set(struct packet_sock *po, + enum packet_sock_flags flag, + bool val) +{ + if (val) + set_bit(flag, &po->flags); + else + clear_bit(flag, &po->flags); +} + +static inline bool packet_sock_flag(const struct packet_sock *po, + enum packet_sock_flags flag) +{ + return test_bit(flag, &po->flags); +} + #endif -- cgit v1.2.3 From fd53c297aa7b077ae98a3d3d2d3aa278a1686ba6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 16 Mar 2023 01:10:08 +0000 Subject: net/packet: convert po->auxdata to an atomic flag po->auxdata can be read while another thread is changing its value, potentially raising KCSAN splat. Convert it to PACKET_SOCK_AUXDATA flag. Fixes: 8dc419447415 ("[PACKET]: Add optional checksum computation for recvmsg") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/packet/af_packet.c | 8 +++----- net/packet/diag.c | 2 +- net/packet/internal.h | 4 ++-- 3 files changed, 6 insertions(+), 8 deletions(-) (limited to 'net/packet/internal.h') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index af7c44169b86..ecd9fc27e360 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -3514,7 +3514,7 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa, copy_len); } - if (pkt_sk(sk)->auxdata) { + if (packet_sock_flag(pkt_sk(sk), PACKET_SOCK_AUXDATA)) { struct tpacket_auxdata aux; aux.tp_status = TP_STATUS_USER; @@ -3900,9 +3900,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; - lock_sock(sk); - po->auxdata = !!val; - release_sock(sk); + packet_sock_flag_set(po, PACKET_SOCK_AUXDATA, val); return 0; } case PACKET_ORIGDEV: @@ -4060,7 +4058,7 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, break; case PACKET_AUXDATA: - val = po->auxdata; + val = packet_sock_flag(po, PACKET_SOCK_AUXDATA); break; case PACKET_ORIGDEV: val = packet_sock_flag(po, PACKET_SOCK_ORIGDEV); diff --git a/net/packet/diag.c b/net/packet/diag.c index e1ac9bb375b3..d704c7bf51b2 100644 --- a/net/packet/diag.c +++ b/net/packet/diag.c @@ -23,7 +23,7 @@ static int pdiag_put_info(const struct packet_sock *po, struct sk_buff *nlskb) pinfo.pdi_flags = 0; if (po->running) pinfo.pdi_flags |= PDI_RUNNING; - if (po->auxdata) + if (packet_sock_flag(po, PACKET_SOCK_AUXDATA)) pinfo.pdi_flags |= PDI_AUXDATA; if (packet_sock_flag(po, PACKET_SOCK_ORIGDEV)) pinfo.pdi_flags |= PDI_ORIGDEV; diff --git a/net/packet/internal.h b/net/packet/internal.h index 178cd1852238..3bae8ea7a36f 100644 --- a/net/packet/internal.h +++ b/net/packet/internal.h @@ -118,8 +118,7 @@ struct packet_sock { struct mutex pg_vec_lock; unsigned long flags; unsigned int running; /* bind_lock must be held */ - unsigned int auxdata:1, /* writer must hold sock lock */ - has_vnet_hdr:1, + unsigned int has_vnet_hdr:1, /* writer must hold sock lock */ tp_loss:1, tp_tx_has_off:1; int pressure; @@ -146,6 +145,7 @@ static inline struct packet_sock *pkt_sk(struct sock *sk) enum packet_sock_flags { PACKET_SOCK_ORIGDEV, + PACKET_SOCK_AUXDATA, }; static inline void packet_sock_flag_set(struct packet_sock *po, -- cgit v1.2.3 From 7438344660fa55b33b8234c1797c886eb73667a7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 16 Mar 2023 01:10:10 +0000 Subject: net/packet: convert po->tp_tx_has_off to an atomic flag This is to use existing space in po->flags, and reclaim the storage used by the non atomic bit fields. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/packet/af_packet.c | 6 +++--- net/packet/internal.h | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'net/packet/internal.h') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index a27a811fa2b0..7800dc622ff3 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2672,7 +2672,7 @@ static int tpacket_parse_header(struct packet_sock *po, void *frame, return -EMSGSIZE; } - if (unlikely(po->tp_tx_has_off)) { + if (unlikely(packet_sock_flag(po, PACKET_SOCK_TX_HAS_OFF))) { int off_min, off_max; off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll); @@ -3993,7 +3993,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, lock_sock(sk); if (!po->rx_ring.pg_vec && !po->tx_ring.pg_vec) - po->tp_tx_has_off = !!val; + packet_sock_flag_set(po, PACKET_SOCK_TX_HAS_OFF, val); release_sock(sk); return 0; @@ -4120,7 +4120,7 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, lv = sizeof(rstats); break; case PACKET_TX_HAS_OFF: - val = po->tp_tx_has_off; + val = packet_sock_flag(po, PACKET_SOCK_TX_HAS_OFF); break; case PACKET_QDISC_BYPASS: val = packet_use_direct_xmit(po); diff --git a/net/packet/internal.h b/net/packet/internal.h index 3bae8ea7a36f..0d16a581e271 100644 --- a/net/packet/internal.h +++ b/net/packet/internal.h @@ -119,8 +119,7 @@ struct packet_sock { unsigned long flags; unsigned int running; /* bind_lock must be held */ unsigned int has_vnet_hdr:1, /* writer must hold sock lock */ - tp_loss:1, - tp_tx_has_off:1; + tp_loss:1; int pressure; int ifindex; /* bound device */ __be16 num; @@ -146,6 +145,7 @@ static inline struct packet_sock *pkt_sk(struct sock *sk) enum packet_sock_flags { PACKET_SOCK_ORIGDEV, PACKET_SOCK_AUXDATA, + PACKET_SOCK_TX_HAS_OFF, }; static inline void packet_sock_flag_set(struct packet_sock *po, -- cgit v1.2.3 From 164bddace2e03f6005e650cb88f101a66ebdc05a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 16 Mar 2023 01:10:11 +0000 Subject: net/packet: convert po->tp_loss to an atomic flag tp_loss can be read locklessly. Convert it to an atomic flag to avoid races. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/packet/af_packet.c | 6 +++--- net/packet/diag.c | 2 +- net/packet/internal.h | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'net/packet/internal.h') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 7800dc622ff3..119063c8a1c5 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2843,7 +2843,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) if (unlikely(tp_len < 0)) { tpacket_error: - if (po->tp_loss) { + if (packet_sock_flag(po, PACKET_SOCK_TP_LOSS)) { __packet_set_status(po, ph, TP_STATUS_AVAILABLE); packet_increment_head(&po->tx_ring); @@ -3886,7 +3886,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) { ret = -EBUSY; } else { - po->tp_loss = !!val; + packet_sock_flag_set(po, PACKET_SOCK_TP_LOSS, val); ret = 0; } release_sock(sk); @@ -4095,7 +4095,7 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, val = po->tp_reserve; break; case PACKET_LOSS: - val = po->tp_loss; + val = packet_sock_flag(po, PACKET_SOCK_TP_LOSS); break; case PACKET_TIMESTAMP: val = READ_ONCE(po->tp_tstamp); diff --git a/net/packet/diag.c b/net/packet/diag.c index 0abca32e2f87..8bb4ce6a8e61 100644 --- a/net/packet/diag.c +++ b/net/packet/diag.c @@ -29,7 +29,7 @@ static int pdiag_put_info(const struct packet_sock *po, struct sk_buff *nlskb) pinfo.pdi_flags |= PDI_ORIGDEV; if (po->has_vnet_hdr) pinfo.pdi_flags |= PDI_VNETHDR; - if (po->tp_loss) + if (packet_sock_flag(po, PACKET_SOCK_TP_LOSS)) pinfo.pdi_flags |= PDI_LOSS; return nla_put(nlskb, PACKET_DIAG_INFO, sizeof(pinfo), &pinfo); diff --git a/net/packet/internal.h b/net/packet/internal.h index 0d16a581e271..9d406a92ede8 100644 --- a/net/packet/internal.h +++ b/net/packet/internal.h @@ -118,8 +118,7 @@ struct packet_sock { struct mutex pg_vec_lock; unsigned long flags; unsigned int running; /* bind_lock must be held */ - unsigned int has_vnet_hdr:1, /* writer must hold sock lock */ - tp_loss:1; + unsigned int has_vnet_hdr:1; /* writer must hold sock lock */ int pressure; int ifindex; /* bound device */ __be16 num; @@ -146,6 +145,7 @@ enum packet_sock_flags { PACKET_SOCK_ORIGDEV, PACKET_SOCK_AUXDATA, PACKET_SOCK_TX_HAS_OFF, + PACKET_SOCK_TP_LOSS, }; static inline void packet_sock_flag_set(struct packet_sock *po, -- cgit v1.2.3 From 50d935eafee292fc432d5ac8c8715a6492961abc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 16 Mar 2023 01:10:12 +0000 Subject: net/packet: convert po->has_vnet_hdr to an atomic flag po->has_vnet_hdr can be read locklessly. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/packet/af_packet.c | 19 ++++++++++--------- net/packet/diag.c | 2 +- net/packet/internal.h | 2 +- 3 files changed, 12 insertions(+), 11 deletions(-) (limited to 'net/packet/internal.h') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 119063c8a1c5..5a6b05e17ca2 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2309,7 +2309,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, netoff = TPACKET_ALIGN(po->tp_hdrlen + (maclen < 16 ? 16 : maclen)) + po->tp_reserve; - if (po->has_vnet_hdr) { + if (packet_sock_flag(po, PACKET_SOCK_HAS_VNET_HDR)) { netoff += sizeof(struct virtio_net_hdr); do_vnet = true; } @@ -2780,7 +2780,8 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) size_max = po->tx_ring.frame_size - (po->tp_hdrlen - sizeof(struct sockaddr_ll)); - if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !po->has_vnet_hdr) + if ((size_max > dev->mtu + reserve + VLAN_HLEN) && + !packet_sock_flag(po, PACKET_SOCK_HAS_VNET_HDR)) size_max = dev->mtu + reserve + VLAN_HLEN; reinit_completion(&po->skb_completion); @@ -2809,7 +2810,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) status = TP_STATUS_SEND_REQUEST; hlen = LL_RESERVED_SPACE(dev); tlen = dev->needed_tailroom; - if (po->has_vnet_hdr) { + if (packet_sock_flag(po, PACKET_SOCK_HAS_VNET_HDR)) { vnet_hdr = data; data += sizeof(*vnet_hdr); tp_len -= sizeof(*vnet_hdr); @@ -2837,7 +2838,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) addr, hlen, copylen, &sockc); if (likely(tp_len >= 0) && tp_len > dev->mtu + reserve && - !po->has_vnet_hdr && + !packet_sock_flag(po, PACKET_SOCK_HAS_VNET_HDR) && !packet_extra_vlan_len_allowed(dev, skb)) tp_len = -EMSGSIZE; @@ -2856,7 +2857,7 @@ tpacket_error: } } - if (po->has_vnet_hdr) { + if (packet_sock_flag(po, PACKET_SOCK_HAS_VNET_HDR)) { if (virtio_net_hdr_to_skb(skb, vnet_hdr, vio_le())) { tp_len = -EINVAL; goto tpacket_error; @@ -2991,7 +2992,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) if (sock->type == SOCK_RAW) reserve = dev->hard_header_len; - if (po->has_vnet_hdr) { + if (packet_sock_flag(po, PACKET_SOCK_HAS_VNET_HDR)) { err = packet_snd_vnet_parse(msg, &len, &vnet_hdr); if (err) goto out_unlock; @@ -3451,7 +3452,7 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, packet_rcv_try_clear_pressure(pkt_sk(sk)); - if (pkt_sk(sk)->has_vnet_hdr) { + if (packet_sock_flag(pkt_sk(sk), PACKET_SOCK_HAS_VNET_HDR)) { err = packet_rcv_vnet(msg, skb, &len); if (err) goto out_free; @@ -3931,7 +3932,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) { ret = -EBUSY; } else { - po->has_vnet_hdr = !!val; + packet_sock_flag_set(po, PACKET_SOCK_HAS_VNET_HDR, val); ret = 0; } release_sock(sk); @@ -4065,7 +4066,7 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, val = packet_sock_flag(po, PACKET_SOCK_ORIGDEV); break; case PACKET_VNET_HDR: - val = po->has_vnet_hdr; + val = packet_sock_flag(po, PACKET_SOCK_HAS_VNET_HDR); break; case PACKET_VERSION: val = po->tp_version; diff --git a/net/packet/diag.c b/net/packet/diag.c index 8bb4ce6a8e61..56240aaf032b 100644 --- a/net/packet/diag.c +++ b/net/packet/diag.c @@ -27,7 +27,7 @@ static int pdiag_put_info(const struct packet_sock *po, struct sk_buff *nlskb) pinfo.pdi_flags |= PDI_AUXDATA; if (packet_sock_flag(po, PACKET_SOCK_ORIGDEV)) pinfo.pdi_flags |= PDI_ORIGDEV; - if (po->has_vnet_hdr) + if (packet_sock_flag(po, PACKET_SOCK_HAS_VNET_HDR)) pinfo.pdi_flags |= PDI_VNETHDR; if (packet_sock_flag(po, PACKET_SOCK_TP_LOSS)) pinfo.pdi_flags |= PDI_LOSS; diff --git a/net/packet/internal.h b/net/packet/internal.h index 9d406a92ede8..2521176807f4 100644 --- a/net/packet/internal.h +++ b/net/packet/internal.h @@ -118,7 +118,6 @@ struct packet_sock { struct mutex pg_vec_lock; unsigned long flags; unsigned int running; /* bind_lock must be held */ - unsigned int has_vnet_hdr:1; /* writer must hold sock lock */ int pressure; int ifindex; /* bound device */ __be16 num; @@ -146,6 +145,7 @@ enum packet_sock_flags { PACKET_SOCK_AUXDATA, PACKET_SOCK_TX_HAS_OFF, PACKET_SOCK_TP_LOSS, + PACKET_SOCK_HAS_VNET_HDR, }; static inline void packet_sock_flag_set(struct packet_sock *po, -- cgit v1.2.3 From 61edf479818e63978cabd243b82ca80f8948a313 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 16 Mar 2023 01:10:13 +0000 Subject: net/packet: convert po->running to an atomic flag Instead of consuming 32 bits for po->running, use one available bit in po->flags. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/packet/af_packet.c | 20 ++++++++++---------- net/packet/diag.c | 2 +- net/packet/internal.h | 2 +- 3 files changed, 12 insertions(+), 12 deletions(-) (limited to 'net/packet/internal.h') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 5a6b05e17ca2..ec446452bbe8 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -340,14 +340,14 @@ static void __register_prot_hook(struct sock *sk) { struct packet_sock *po = pkt_sk(sk); - if (!po->running) { + if (!packet_sock_flag(po, PACKET_SOCK_RUNNING)) { if (po->fanout) __fanout_link(sk, po); else dev_add_pack(&po->prot_hook); sock_hold(sk); - po->running = 1; + packet_sock_flag_set(po, PACKET_SOCK_RUNNING, 1); } } @@ -369,7 +369,7 @@ static void __unregister_prot_hook(struct sock *sk, bool sync) lockdep_assert_held_once(&po->bind_lock); - po->running = 0; + packet_sock_flag_set(po, PACKET_SOCK_RUNNING, 0); if (po->fanout) __fanout_unlink(sk, po); @@ -389,7 +389,7 @@ static void unregister_prot_hook(struct sock *sk, bool sync) { struct packet_sock *po = pkt_sk(sk); - if (po->running) + if (packet_sock_flag(po, PACKET_SOCK_RUNNING)) __unregister_prot_hook(sk, sync); } @@ -1782,7 +1782,7 @@ static int fanout_add(struct sock *sk, struct fanout_args *args) err = -EINVAL; spin_lock(&po->bind_lock); - if (po->running && + if (packet_sock_flag(po, PACKET_SOCK_RUNNING) && match->type == type && match->prot_hook.type == po->prot_hook.type && match->prot_hook.dev == po->prot_hook.dev) { @@ -3222,7 +3222,7 @@ static int packet_do_bind(struct sock *sk, const char *name, int ifindex, if (need_rehook) { dev_hold(dev); - if (po->running) { + if (packet_sock_flag(po, PACKET_SOCK_RUNNING)) { rcu_read_unlock(); /* prevents packet_notifier() from calling * register_prot_hook() @@ -3235,7 +3235,7 @@ static int packet_do_bind(struct sock *sk, const char *name, int ifindex, dev->ifindex); } - BUG_ON(po->running); + BUG_ON(packet_sock_flag(po, PACKET_SOCK_RUNNING)); WRITE_ONCE(po->num, proto); po->prot_hook.type = proto; @@ -4159,7 +4159,7 @@ static int packet_notifier(struct notifier_block *this, case NETDEV_DOWN: if (dev->ifindex == po->ifindex) { spin_lock(&po->bind_lock); - if (po->running) { + if (packet_sock_flag(po, PACKET_SOCK_RUNNING)) { __unregister_prot_hook(sk, false); sk->sk_err = ENETDOWN; if (!sock_flag(sk, SOCK_DEAD)) @@ -4470,7 +4470,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, /* Detach socket from network */ spin_lock(&po->bind_lock); - was_running = po->running; + was_running = packet_sock_flag(po, PACKET_SOCK_RUNNING); num = po->num; if (was_running) { WRITE_ONCE(po->num, 0); @@ -4681,7 +4681,7 @@ static int packet_seq_show(struct seq_file *seq, void *v) s->sk_type, ntohs(READ_ONCE(po->num)), READ_ONCE(po->ifindex), - po->running, + packet_sock_flag(po, PACKET_SOCK_RUNNING), atomic_read(&s->sk_rmem_alloc), from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)), sock_i_ino(s)); diff --git a/net/packet/diag.c b/net/packet/diag.c index 56240aaf032b..de4ced5cf3e8 100644 --- a/net/packet/diag.c +++ b/net/packet/diag.c @@ -21,7 +21,7 @@ static int pdiag_put_info(const struct packet_sock *po, struct sk_buff *nlskb) pinfo.pdi_tstamp = READ_ONCE(po->tp_tstamp); pinfo.pdi_flags = 0; - if (po->running) + if (packet_sock_flag(po, PACKET_SOCK_RUNNING)) pinfo.pdi_flags |= PDI_RUNNING; if (packet_sock_flag(po, PACKET_SOCK_AUXDATA)) pinfo.pdi_flags |= PDI_AUXDATA; diff --git a/net/packet/internal.h b/net/packet/internal.h index 2521176807f4..58f042c63172 100644 --- a/net/packet/internal.h +++ b/net/packet/internal.h @@ -117,7 +117,6 @@ struct packet_sock { spinlock_t bind_lock; struct mutex pg_vec_lock; unsigned long flags; - unsigned int running; /* bind_lock must be held */ int pressure; int ifindex; /* bound device */ __be16 num; @@ -146,6 +145,7 @@ enum packet_sock_flags { PACKET_SOCK_TX_HAS_OFF, PACKET_SOCK_TP_LOSS, PACKET_SOCK_HAS_VNET_HDR, + PACKET_SOCK_RUNNING, }; static inline void packet_sock_flag_set(struct packet_sock *po, -- cgit v1.2.3 From 791a3e9f1a86fe8eb09173c9788493b8b5c957f4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 16 Mar 2023 01:10:14 +0000 Subject: net/packet: convert po->pressure to an atomic flag Not only this removes some READ_ONCE()/WRITE_ONCE(), this also removes one integer. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/packet/af_packet.c | 14 ++++++++------ net/packet/internal.h | 2 +- 2 files changed, 9 insertions(+), 7 deletions(-) (limited to 'net/packet/internal.h') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index ec446452bbe8..7b9367b233d3 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1307,22 +1307,23 @@ static int __packet_rcv_has_room(const struct packet_sock *po, static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb) { - int pressure, ret; + bool pressure; + int ret; ret = __packet_rcv_has_room(po, skb); pressure = ret != ROOM_NORMAL; - if (READ_ONCE(po->pressure) != pressure) - WRITE_ONCE(po->pressure, pressure); + if (packet_sock_flag(po, PACKET_SOCK_PRESSURE) != pressure) + packet_sock_flag_set(po, PACKET_SOCK_PRESSURE, pressure); return ret; } static void packet_rcv_try_clear_pressure(struct packet_sock *po) { - if (READ_ONCE(po->pressure) && + if (packet_sock_flag(po, PACKET_SOCK_PRESSURE) && __packet_rcv_has_room(po, NULL) == ROOM_NORMAL) - WRITE_ONCE(po->pressure, 0); + packet_sock_flag_set(po, PACKET_SOCK_PRESSURE, false); } static void packet_sock_destruct(struct sock *sk) @@ -1409,7 +1410,8 @@ static unsigned int fanout_demux_rollover(struct packet_fanout *f, i = j = min_t(int, po->rollover->sock, num - 1); do { po_next = pkt_sk(rcu_dereference(f->arr[i])); - if (po_next != po_skip && !READ_ONCE(po_next->pressure) && + if (po_next != po_skip && + !packet_sock_flag(po_next, PACKET_SOCK_PRESSURE) && packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) { if (i != j) po->rollover->sock = i; diff --git a/net/packet/internal.h b/net/packet/internal.h index 58f042c63172..680703dbce5e 100644 --- a/net/packet/internal.h +++ b/net/packet/internal.h @@ -117,7 +117,6 @@ struct packet_sock { spinlock_t bind_lock; struct mutex pg_vec_lock; unsigned long flags; - int pressure; int ifindex; /* bound device */ __be16 num; struct packet_rollover *rollover; @@ -146,6 +145,7 @@ enum packet_sock_flags { PACKET_SOCK_TP_LOSS, PACKET_SOCK_HAS_VNET_HDR, PACKET_SOCK_RUNNING, + PACKET_SOCK_PRESSURE, }; static inline void packet_sock_flag_set(struct packet_sock *po, -- cgit v1.2.3 From 68ac9a8b6e65c7cbbe96541353dab1b3f8de2043 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 17 Mar 2023 15:55:31 +0000 Subject: af_packet: preserve const qualifier in pkt_sk() We can change pkt_sk() to propagate const qualifier of its argument, thanks to container_of_const() This should avoid some potential errors caused by accidental (const -> not_const) promotion. Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/packet/internal.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'net/packet/internal.h') diff --git a/net/packet/internal.h b/net/packet/internal.h index 680703dbce5e..e793e99646f1 100644 --- a/net/packet/internal.h +++ b/net/packet/internal.h @@ -133,10 +133,7 @@ struct packet_sock { atomic_t tp_drops ____cacheline_aligned_in_smp; }; -static inline struct packet_sock *pkt_sk(struct sock *sk) -{ - return (struct packet_sock *)sk; -} +#define pkt_sk(ptr) container_of_const(ptr, struct packet_sock, sk) enum packet_sock_flags { PACKET_SOCK_ORIGDEV, -- cgit v1.2.3 From 105a201ebf3312990b96c4fbaade22e31402f8cc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 17 Mar 2023 16:20:02 +0000 Subject: net/packet: remove po->xmit Use PACKET_SOCK_QDISC_BYPASS atomic bit instead of a pointer. This removes one indirect call in fast path, and READ_ONCE()/WRITE_ONCE() annotations as well. Signed-off-by: Eric Dumazet Suggested-by: Willem de Bruijn Cc: Daniel Borkmann Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/packet/af_packet.c | 24 +++++++++--------------- net/packet/internal.h | 2 +- 2 files changed, 10 insertions(+), 16 deletions(-) (limited to 'net/packet/internal.h') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 7b9367b233d3..497193f73030 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -270,8 +270,11 @@ static noinline struct sk_buff *nf_hook_direct_egress(struct sk_buff *skb) } #endif -static int packet_direct_xmit(struct sk_buff *skb) +static int packet_xmit(const struct packet_sock *po, struct sk_buff *skb) { + if (!packet_sock_flag(po, PACKET_SOCK_QDISC_BYPASS)) + return dev_queue_xmit(skb); + #ifdef CONFIG_NETFILTER_EGRESS if (nf_hook_egress_active()) { skb = nf_hook_direct_egress(skb); @@ -305,12 +308,6 @@ static void packet_cached_dev_reset(struct packet_sock *po) RCU_INIT_POINTER(po->cached_dev, NULL); } -static bool packet_use_direct_xmit(const struct packet_sock *po) -{ - /* Paired with WRITE_ONCE() in packet_setsockopt() */ - return READ_ONCE(po->xmit) == packet_direct_xmit; -} - static u16 packet_pick_tx_queue(struct sk_buff *skb) { struct net_device *dev = skb->dev; @@ -2872,8 +2869,7 @@ tpacket_error: packet_inc_pending(&po->tx_ring); status = TP_STATUS_SEND_REQUEST; - /* Paired with WRITE_ONCE() in packet_setsockopt() */ - err = READ_ONCE(po->xmit)(skb); + err = packet_xmit(po, skb); if (unlikely(err != 0)) { if (err > 0) err = net_xmit_errno(err); @@ -3076,8 +3072,8 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) virtio_net_hdr_set_proto(skb, &vnet_hdr); } - /* Paired with WRITE_ONCE() in packet_setsockopt() */ - err = READ_ONCE(po->xmit)(skb); + err = packet_xmit(po, skb); + if (unlikely(err != 0)) { if (err > 0) err = net_xmit_errno(err); @@ -3359,7 +3355,6 @@ static int packet_create(struct net *net, struct socket *sock, int protocol, init_completion(&po->skb_completion); sk->sk_family = PF_PACKET; po->num = proto; - po->xmit = dev_queue_xmit; err = packet_alloc_pending(po); if (err) @@ -4010,8 +4005,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; - /* Paired with all lockless reads of po->xmit */ - WRITE_ONCE(po->xmit, val ? packet_direct_xmit : dev_queue_xmit); + packet_sock_flag_set(po, PACKET_SOCK_QDISC_BYPASS, val); return 0; } default: @@ -4126,7 +4120,7 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, val = packet_sock_flag(po, PACKET_SOCK_TX_HAS_OFF); break; case PACKET_QDISC_BYPASS: - val = packet_use_direct_xmit(po); + val = packet_sock_flag(po, PACKET_SOCK_QDISC_BYPASS); break; default: return -ENOPROTOOPT; diff --git a/net/packet/internal.h b/net/packet/internal.h index e793e99646f1..27930f69f368 100644 --- a/net/packet/internal.h +++ b/net/packet/internal.h @@ -128,7 +128,6 @@ struct packet_sock { unsigned int tp_tstamp; struct completion skb_completion; struct net_device __rcu *cached_dev; - int (*xmit)(struct sk_buff *skb); struct packet_type prot_hook ____cacheline_aligned_in_smp; atomic_t tp_drops ____cacheline_aligned_in_smp; }; @@ -143,6 +142,7 @@ enum packet_sock_flags { PACKET_SOCK_HAS_VNET_HDR, PACKET_SOCK_RUNNING, PACKET_SOCK_PRESSURE, + PACKET_SOCK_QDISC_BYPASS, }; static inline void packet_sock_flag_set(struct packet_sock *po, -- cgit v1.2.3 From dfc39d4026fb2432363c0f77543c4cf3adca4c7b Mon Sep 17 00:00:00 2001 From: Jianfeng Tan Date: Wed, 19 Apr 2023 15:24:16 +0800 Subject: net/packet: support mergeable feature of virtio Packet sockets, like tap, can be used as the backend for kernel vhost. In packet sockets, virtio net header size is currently hardcoded to be the size of struct virtio_net_hdr, which is 10 bytes; however, it is not always the case: some virtio features, such as mrg_rxbuf, need virtio net header to be 12-byte long. Mergeable buffers, as a virtio feature, is worthy of supporting: packets that are larger than one-mbuf size will be dropped in vhost worker's handle_rx if mrg_rxbuf feature is not used, but large packets cannot be avoided and increasing mbuf's size is not economical. With this virtio feature enabled by virtio-user, packet sockets with hardcoded 10-byte virtio net header will parse mac head incorrectly in packet_snd by taking the last two bytes of virtio net header as part of mac header. This incorrect mac header parsing will cause packet to be dropped due to invalid ether head checking in later under-layer device packet receiving. By adding extra field vnet_hdr_sz with utilizing holes in struct packet_sock to record currently used virtio net header size and supporting extra sockopt PACKET_VNET_HDR_SZ to set specified vnet_hdr_sz, packet sockets can know the exact length of virtio net header that virtio user gives. In packet_snd, tpacket_snd and packet_recvmsg, instead of using hardcoded virtio net header size, it can get the exact vnet_hdr_sz from corresponding packet_sock, and parse mac header correctly based on this information to avoid the packets being mistakenly dropped. Signed-off-by: Jianfeng Tan Co-developed-by: Anqi Shen Signed-off-by: Anqi Shen Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/uapi/linux/if_packet.h | 1 + net/packet/af_packet.c | 95 +++++++++++++++++++++++++----------------- net/packet/diag.c | 2 +- net/packet/internal.h | 2 +- 4 files changed, 60 insertions(+), 40 deletions(-) (limited to 'net/packet/internal.h') diff --git a/include/uapi/linux/if_packet.h b/include/uapi/linux/if_packet.h index 78c981d6a9d4..9efc42382fdb 100644 --- a/include/uapi/linux/if_packet.h +++ b/include/uapi/linux/if_packet.h @@ -59,6 +59,7 @@ struct sockaddr_ll { #define PACKET_ROLLOVER_STATS 21 #define PACKET_FANOUT_DATA 22 #define PACKET_IGNORE_OUTGOING 23 +#define PACKET_VNET_HDR_SZ 24 #define PACKET_FANOUT_HASH 0 #define PACKET_FANOUT_LB 1 diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 568f8d76e3c1..6080c0db0814 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2090,18 +2090,18 @@ static unsigned int run_filter(struct sk_buff *skb, } static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb, - size_t *len) + size_t *len, int vnet_hdr_sz) { - struct virtio_net_hdr vnet_hdr; + struct virtio_net_hdr_mrg_rxbuf vnet_hdr = { .num_buffers = 0 }; - if (*len < sizeof(vnet_hdr)) + if (*len < vnet_hdr_sz) return -EINVAL; - *len -= sizeof(vnet_hdr); + *len -= vnet_hdr_sz; - if (virtio_net_hdr_from_skb(skb, &vnet_hdr, vio_le(), true, 0)) + if (virtio_net_hdr_from_skb(skb, (struct virtio_net_hdr *)&vnet_hdr, vio_le(), true, 0)) return -EINVAL; - return memcpy_to_msg(msg, (void *)&vnet_hdr, sizeof(vnet_hdr)); + return memcpy_to_msg(msg, (void *)&vnet_hdr, vnet_hdr_sz); } /* @@ -2250,7 +2250,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, __u32 ts_status; bool is_drop_n_account = false; unsigned int slot_id = 0; - bool do_vnet = false; + int vnet_hdr_sz = 0; /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT. * We may add members to them until current aligned size without forcing @@ -2308,10 +2308,9 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, netoff = TPACKET_ALIGN(po->tp_hdrlen + (maclen < 16 ? 16 : maclen)) + po->tp_reserve; - if (packet_sock_flag(po, PACKET_SOCK_HAS_VNET_HDR)) { - netoff += sizeof(struct virtio_net_hdr); - do_vnet = true; - } + vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz); + if (vnet_hdr_sz) + netoff += vnet_hdr_sz; macoff = netoff - maclen; } if (netoff > USHRT_MAX) { @@ -2337,7 +2336,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, snaplen = po->rx_ring.frame_size - macoff; if ((int)snaplen < 0) { snaplen = 0; - do_vnet = false; + vnet_hdr_sz = 0; } } } else if (unlikely(macoff + snaplen > @@ -2351,7 +2350,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, if (unlikely((int)snaplen < 0)) { snaplen = 0; macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len; - do_vnet = false; + vnet_hdr_sz = 0; } } spin_lock(&sk->sk_receive_queue.lock); @@ -2367,7 +2366,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, __set_bit(slot_id, po->rx_ring.rx_owner_map); } - if (do_vnet && + if (vnet_hdr_sz && virtio_net_hdr_from_skb(skb, h.raw + macoff - sizeof(struct virtio_net_hdr), vio_le(), true, 0)) { @@ -2551,16 +2550,26 @@ static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len) } static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len, - struct virtio_net_hdr *vnet_hdr) + struct virtio_net_hdr *vnet_hdr, int vnet_hdr_sz) { - if (*len < sizeof(*vnet_hdr)) + int ret; + + if (*len < vnet_hdr_sz) return -EINVAL; - *len -= sizeof(*vnet_hdr); + *len -= vnet_hdr_sz; if (!copy_from_iter_full(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter)) return -EFAULT; - return __packet_snd_vnet_parse(vnet_hdr, *len); + ret = __packet_snd_vnet_parse(vnet_hdr, *len); + if (ret) + return ret; + + /* move iter to point to the start of mac header */ + if (vnet_hdr_sz != sizeof(struct virtio_net_hdr)) + iov_iter_advance(&msg->msg_iter, vnet_hdr_sz - sizeof(struct virtio_net_hdr)); + + return 0; } static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, @@ -2722,6 +2731,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) void *ph; DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name); bool need_wait = !(msg->msg_flags & MSG_DONTWAIT); + int vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz); unsigned char *addr = NULL; int tp_len, size_max; void *data; @@ -2779,8 +2789,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) size_max = po->tx_ring.frame_size - (po->tp_hdrlen - sizeof(struct sockaddr_ll)); - if ((size_max > dev->mtu + reserve + VLAN_HLEN) && - !packet_sock_flag(po, PACKET_SOCK_HAS_VNET_HDR)) + if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !vnet_hdr_sz) size_max = dev->mtu + reserve + VLAN_HLEN; reinit_completion(&po->skb_completion); @@ -2809,10 +2818,10 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) status = TP_STATUS_SEND_REQUEST; hlen = LL_RESERVED_SPACE(dev); tlen = dev->needed_tailroom; - if (packet_sock_flag(po, PACKET_SOCK_HAS_VNET_HDR)) { + if (vnet_hdr_sz) { vnet_hdr = data; - data += sizeof(*vnet_hdr); - tp_len -= sizeof(*vnet_hdr); + data += vnet_hdr_sz; + tp_len -= vnet_hdr_sz; if (tp_len < 0 || __packet_snd_vnet_parse(vnet_hdr, tp_len)) { tp_len = -EINVAL; @@ -2837,7 +2846,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) addr, hlen, copylen, &sockc); if (likely(tp_len >= 0) && tp_len > dev->mtu + reserve && - !packet_sock_flag(po, PACKET_SOCK_HAS_VNET_HDR) && + !vnet_hdr_sz && !packet_extra_vlan_len_allowed(dev, skb)) tp_len = -EMSGSIZE; @@ -2856,7 +2865,7 @@ tpacket_error: } } - if (packet_sock_flag(po, PACKET_SOCK_HAS_VNET_HDR)) { + if (vnet_hdr_sz) { if (virtio_net_hdr_to_skb(skb, vnet_hdr, vio_le())) { tp_len = -EINVAL; goto tpacket_error; @@ -2946,7 +2955,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) struct virtio_net_hdr vnet_hdr = { 0 }; int offset = 0; struct packet_sock *po = pkt_sk(sk); - bool has_vnet_hdr = false; + int vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz); int hlen, tlen, linear; int extra_len = 0; @@ -2990,11 +2999,10 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) if (sock->type == SOCK_RAW) reserve = dev->hard_header_len; - if (packet_sock_flag(po, PACKET_SOCK_HAS_VNET_HDR)) { - err = packet_snd_vnet_parse(msg, &len, &vnet_hdr); + if (vnet_hdr_sz) { + err = packet_snd_vnet_parse(msg, &len, &vnet_hdr, vnet_hdr_sz); if (err) goto out_unlock; - has_vnet_hdr = true; } if (unlikely(sock_flag(sk, SOCK_NOFCS))) { @@ -3064,11 +3072,11 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) packet_parse_headers(skb, sock); - if (has_vnet_hdr) { + if (vnet_hdr_sz) { err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le()); if (err) goto out_free; - len += sizeof(vnet_hdr); + len += vnet_hdr_sz; virtio_net_hdr_set_proto(skb, &vnet_hdr); } @@ -3408,7 +3416,7 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, struct sock *sk = sock->sk; struct sk_buff *skb; int copied, err; - int vnet_hdr_len = 0; + int vnet_hdr_len = READ_ONCE(pkt_sk(sk)->vnet_hdr_sz); unsigned int origlen = 0; err = -EINVAL; @@ -3449,11 +3457,10 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, packet_rcv_try_clear_pressure(pkt_sk(sk)); - if (packet_sock_flag(pkt_sk(sk), PACKET_SOCK_HAS_VNET_HDR)) { - err = packet_rcv_vnet(msg, skb, &len); + if (vnet_hdr_len) { + err = packet_rcv_vnet(msg, skb, &len, vnet_hdr_len); if (err) goto out_free; - vnet_hdr_len = sizeof(struct virtio_net_hdr); } /* You lose any data beyond the buffer you gave. If it worries @@ -3915,8 +3922,9 @@ packet_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, return 0; } case PACKET_VNET_HDR: + case PACKET_VNET_HDR_SZ: { - int val; + int val, hdr_len; if (sock->type != SOCK_RAW) return -EINVAL; @@ -3925,11 +3933,19 @@ packet_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; + if (optname == PACKET_VNET_HDR_SZ) { + if (val && val != sizeof(struct virtio_net_hdr) && + val != sizeof(struct virtio_net_hdr_mrg_rxbuf)) + return -EINVAL; + hdr_len = val; + } else { + hdr_len = val ? sizeof(struct virtio_net_hdr) : 0; + } lock_sock(sk); if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) { ret = -EBUSY; } else { - packet_sock_flag_set(po, PACKET_SOCK_HAS_VNET_HDR, val); + WRITE_ONCE(po->vnet_hdr_sz, hdr_len); ret = 0; } release_sock(sk); @@ -4062,7 +4078,10 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, val = packet_sock_flag(po, PACKET_SOCK_ORIGDEV); break; case PACKET_VNET_HDR: - val = packet_sock_flag(po, PACKET_SOCK_HAS_VNET_HDR); + val = !!READ_ONCE(po->vnet_hdr_sz); + break; + case PACKET_VNET_HDR_SZ: + val = READ_ONCE(po->vnet_hdr_sz); break; case PACKET_VERSION: val = po->tp_version; diff --git a/net/packet/diag.c b/net/packet/diag.c index de4ced5cf3e8..d0c4eda4cdc6 100644 --- a/net/packet/diag.c +++ b/net/packet/diag.c @@ -27,7 +27,7 @@ static int pdiag_put_info(const struct packet_sock *po, struct sk_buff *nlskb) pinfo.pdi_flags |= PDI_AUXDATA; if (packet_sock_flag(po, PACKET_SOCK_ORIGDEV)) pinfo.pdi_flags |= PDI_ORIGDEV; - if (packet_sock_flag(po, PACKET_SOCK_HAS_VNET_HDR)) + if (READ_ONCE(po->vnet_hdr_sz)) pinfo.pdi_flags |= PDI_VNETHDR; if (packet_sock_flag(po, PACKET_SOCK_TP_LOSS)) pinfo.pdi_flags |= PDI_LOSS; diff --git a/net/packet/internal.h b/net/packet/internal.h index 27930f69f368..63f4865202c1 100644 --- a/net/packet/internal.h +++ b/net/packet/internal.h @@ -118,6 +118,7 @@ struct packet_sock { struct mutex pg_vec_lock; unsigned long flags; int ifindex; /* bound device */ + u8 vnet_hdr_sz; __be16 num; struct packet_rollover *rollover; struct packet_mclist *mclist; @@ -139,7 +140,6 @@ enum packet_sock_flags { PACKET_SOCK_AUXDATA, PACKET_SOCK_TX_HAS_OFF, PACKET_SOCK_TP_LOSS, - PACKET_SOCK_HAS_VNET_HDR, PACKET_SOCK_RUNNING, PACKET_SOCK_PRESSURE, PACKET_SOCK_QDISC_BYPASS, -- cgit v1.2.3