From 5648b5e1169ff1d6d6a46c35c0b5fbebd2a5cbb2 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 20 Oct 2021 18:08:10 +0200 Subject: netfilter: nfnetlink_queue: fix OOB when mac header was cleared On 64bit platforms the MAC header is set to 0xffff on allocation and also when a helper like skb_unset_mac_header() is called. dev_parse_header may call skb_mac_header() which assumes valid mac offset: BUG: KASAN: use-after-free in eth_header_parse+0x75/0x90 Read of size 6 at addr ffff8881075a5c05 by task nf-queue/1364 Call Trace: memcpy+0x20/0x60 eth_header_parse+0x75/0x90 __nfqnl_enqueue_packet+0x1a61/0x3380 __nf_queue+0x597/0x1300 nf_queue+0xf/0x40 nf_hook_slow+0xed/0x190 nf_hook+0x184/0x440 ip_output+0x1c0/0x2a0 nf_reinject+0x26f/0x700 nfqnl_recv_verdict+0xa16/0x18b0 nfnetlink_rcv_msg+0x506/0xe70 The existing code only works if the skb has a mac header. Fixes: 2c38de4c1f8da7 ("netfilter: fix looped (broad|multi)cast's MAC handling") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nfnetlink_queue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 4c3fbaaeb103..4acc4b8e9fe5 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -560,7 +560,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, goto nla_put_failure; if (indev && entskb->dev && - entskb->mac_header != entskb->network_header) { + skb_mac_header_was_set(entskb)) { struct nfqnl_msg_packet_hw phw; int len; -- cgit v1.2.3 From 2199f562730dd1382946e0a2532afc38cd444129 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Thu, 21 Oct 2021 15:02:55 +0200 Subject: ipvs: autoload ipvs on genl access MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The kernel provides the functionality to automatically load modules providing genl families. Use this to remove the need for users to manually load the module. Signed-off-by: Thomas Weißschuh Acked-by: Julian Anastasov Acked-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_ctl.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 29ec3ef63edc..0ff94c66641f 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -48,6 +48,8 @@ #include +MODULE_ALIAS_GENL_FAMILY(IPVS_GENL_NAME); + /* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */ static DEFINE_MUTEX(__ip_vs_mutex); -- cgit v1.2.3 From 26499499cae642a906e7d501ac5342ca148ea8b1 Mon Sep 17 00:00:00 2001 From: Yuiko Oshino Date: Mon, 1 Nov 2021 12:21:19 -0400 Subject: net: phy: microchip_t1: add lan87xx_config_rgmii_delay for lan87xx phy Add a function to initialize phy rgmii delay according to phydev->interface. Signed-off-by: Yuiko Oshino Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/20211101162119.29275-1-yuiko.oshino@microchip.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/microchip_t1.c | 44 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) diff --git a/drivers/net/phy/microchip_t1.c b/drivers/net/phy/microchip_t1.c index a4de3d2081c5..bc50224d43dd 100644 --- a/drivers/net/phy/microchip_t1.c +++ b/drivers/net/phy/microchip_t1.c @@ -28,6 +28,11 @@ #define LAN87XX_MASK_LINK_UP (0x0004) #define LAN87XX_MASK_LINK_DOWN (0x0002) +/* MISC Control 1 Register */ +#define LAN87XX_CTRL_1 (0x11) +#define LAN87XX_MASK_RGMII_TXC_DLY_EN (0x4000) +#define LAN87XX_MASK_RGMII_RXC_DLY_EN (0x2000) + /* phyaccess nested types */ #define PHYACC_ATTR_MODE_READ 0 #define PHYACC_ATTR_MODE_WRITE 1 @@ -112,6 +117,43 @@ static int access_ereg_modify_changed(struct phy_device *phydev, return rc; } +static int lan87xx_config_rgmii_delay(struct phy_device *phydev) +{ + int rc; + + if (!phy_interface_is_rgmii(phydev)) + return 0; + + rc = access_ereg(phydev, PHYACC_ATTR_MODE_READ, + PHYACC_ATTR_BANK_MISC, LAN87XX_CTRL_1, 0); + if (rc < 0) + return rc; + + switch (phydev->interface) { + case PHY_INTERFACE_MODE_RGMII: + rc &= ~LAN87XX_MASK_RGMII_TXC_DLY_EN; + rc &= ~LAN87XX_MASK_RGMII_RXC_DLY_EN; + break; + case PHY_INTERFACE_MODE_RGMII_ID: + rc |= LAN87XX_MASK_RGMII_TXC_DLY_EN; + rc |= LAN87XX_MASK_RGMII_RXC_DLY_EN; + break; + case PHY_INTERFACE_MODE_RGMII_RXID: + rc &= ~LAN87XX_MASK_RGMII_TXC_DLY_EN; + rc |= LAN87XX_MASK_RGMII_RXC_DLY_EN; + break; + case PHY_INTERFACE_MODE_RGMII_TXID: + rc |= LAN87XX_MASK_RGMII_TXC_DLY_EN; + rc &= ~LAN87XX_MASK_RGMII_RXC_DLY_EN; + break; + default: + return 0; + } + + return access_ereg(phydev, PHYACC_ATTR_MODE_WRITE, + PHYACC_ATTR_BANK_MISC, LAN87XX_CTRL_1, rc); +} + static int lan87xx_phy_init(struct phy_device *phydev) { static const struct access_ereg_val init[] = { @@ -185,7 +227,7 @@ static int lan87xx_phy_init(struct phy_device *phydev) return rc; } - return 0; + return lan87xx_config_rgmii_delay(phydev); } static int lan87xx_phy_config_intr(struct phy_device *phydev) -- cgit v1.2.3 From d52bcb47bdf971a59a2467975d2405fcfcb2fa19 Mon Sep 17 00:00:00 2001 From: Maxim Kiselev Date: Mon, 1 Nov 2021 18:23:41 +0300 Subject: net: davinci_emac: Fix interrupt pacing disable This patch allows to use 0 for `coal->rx_coalesce_usecs` param to disable rx irq coalescing. Previously we could enable rx irq coalescing via ethtool (For ex: `ethtool -C eth0 rx-usecs 2000`) but we couldn't disable it because this part rejects 0 value: if (!coal->rx_coalesce_usecs) return -EINVAL; Fixes: 84da2658a619 ("TI DaVinci EMAC : Implement interrupt pacing functionality.") Signed-off-by: Maxim Kiselev Reviewed-by: Grygorii Strashko Link: https://lore.kernel.org/r/20211101152343.4193233-1-bigunclemax@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/ti/davinci_emac.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/ti/davinci_emac.c b/drivers/net/ethernet/ti/davinci_emac.c index 2d2dcf70563f..d55f06120ce7 100644 --- a/drivers/net/ethernet/ti/davinci_emac.c +++ b/drivers/net/ethernet/ti/davinci_emac.c @@ -420,8 +420,20 @@ static int emac_set_coalesce(struct net_device *ndev, u32 int_ctrl, num_interrupts = 0; u32 prescale = 0, addnl_dvdr = 1, coal_intvl = 0; - if (!coal->rx_coalesce_usecs) - return -EINVAL; + if (!coal->rx_coalesce_usecs) { + priv->coal_intvl = 0; + + switch (priv->version) { + case EMAC_VERSION_2: + emac_ctrl_write(EMAC_DM646X_CMINTCTRL, 0); + break; + default: + emac_ctrl_write(EMAC_CTRL_EWINTTCNT, 0); + break; + } + + return 0; + } coal_intvl = coal->rx_coalesce_usecs; -- cgit v1.2.3 From 6ab9f57a648953e2326b9ad000783c122d133c9d Mon Sep 17 00:00:00 2001 From: Wan Jiabing Date: Mon, 1 Nov 2021 22:03:12 -0400 Subject: bnxt_en: avoid newline at end of message in NL_SET_ERR_MSG_MOD Fix following coccicheck warning: ./drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c:446:8-56: WARNING avoid newline at end of message in NL_SET_ERR_MSG_MOD. Signed-off-by: Wan Jiabing Link: https://lore.kernel.org/r/20211102020312.16567-1-wanjiabing@vivo.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c index ce790e9b45c3..5c464ea73576 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c @@ -443,7 +443,7 @@ static int bnxt_dl_reload_down(struct devlink *dl, bool netns_change, case DEVLINK_RELOAD_ACTION_DRIVER_REINIT: { if (BNXT_PF(bp) && bp->pf.active_vfs) { NL_SET_ERR_MSG_MOD(extack, - "reload is unsupported when VFs are allocated\n"); + "reload is unsupported when VFs are allocated"); return -EOPNOTSUPP; } rtnl_lock(); -- cgit v1.2.3 From 236f57fe1b8853fb3505502c0f94ae64d153ae92 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 2 Nov 2021 09:24:33 +0100 Subject: net: marvell: prestera: Add explicit padding MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On m68k: In function ‘prestera_hw_build_tests’, inlined from ‘prestera_hw_switch_init’ at drivers/net/ethernet/marvell/prestera/prestera_hw.c:788:2: ././include/linux/compiler_types.h:335:38: error: call to ‘__compiletime_assert_345’ declared with attribute error: BUILD_BUG_ON failed: sizeof(struct prestera_msg_switch_attr_req) != 16 ... The driver assumes structure members are naturally aligned, but does not add explicit padding, thus breaking architectures where integral values are not always naturally aligned (e.g. on m68k, __alignof(int) is 2, not 4). Fixes: bb5dbf2cc64d5cfa ("net: marvell: prestera: add firmware v4.0 support") Signed-off-by: Geert Uytterhoeven Reviewed-by: Arnd Bergmann Link: https://lore.kernel.org/r/20211102082433.3820514-1-geert@linux-m68k.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/prestera/prestera_hw.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/net/ethernet/marvell/prestera/prestera_hw.c b/drivers/net/ethernet/marvell/prestera/prestera_hw.c index 41ba17cb2965..4f5f52dcdd9d 100644 --- a/drivers/net/ethernet/marvell/prestera/prestera_hw.c +++ b/drivers/net/ethernet/marvell/prestera/prestera_hw.c @@ -189,6 +189,7 @@ struct prestera_msg_switch_attr_req { struct prestera_msg_cmd cmd; __le32 attr; union prestera_msg_switch_param param; + u8 pad[2]; }; struct prestera_msg_switch_init_resp { @@ -313,6 +314,7 @@ struct prestera_msg_port_info_resp { __le32 hw_id; __le32 dev_id; __le16 fp_id; + u8 pad[2]; }; struct prestera_msg_vlan_req { @@ -345,11 +347,13 @@ struct prestera_msg_bridge_req { __le32 port; __le32 dev; __le16 bridge; + u8 pad[2]; }; struct prestera_msg_bridge_resp { struct prestera_msg_ret ret; __le16 bridge; + u8 pad[2]; }; struct prestera_msg_acl_action { @@ -408,16 +412,19 @@ struct prestera_msg_acl_ruleset_bind_req { __le32 port; __le32 dev; __le16 ruleset_id; + u8 pad[2]; }; struct prestera_msg_acl_ruleset_req { struct prestera_msg_cmd cmd; __le16 id; + u8 pad[2]; }; struct prestera_msg_acl_ruleset_resp { struct prestera_msg_ret ret; __le16 id; + u8 pad[2]; }; struct prestera_msg_span_req { @@ -425,11 +432,13 @@ struct prestera_msg_span_req { __le32 port; __le32 dev; u8 id; + u8 pad[3]; }; struct prestera_msg_span_resp { struct prestera_msg_ret ret; u8 id; + u8 pad[3]; }; struct prestera_msg_stp_req { @@ -443,6 +452,7 @@ struct prestera_msg_stp_req { struct prestera_msg_rxtx_req { struct prestera_msg_cmd cmd; u8 use_sdma; + u8 pad[3]; }; struct prestera_msg_rxtx_resp { @@ -455,12 +465,14 @@ struct prestera_msg_lag_req { __le32 port; __le32 dev; __le16 lag_id; + u8 pad[2]; }; struct prestera_msg_cpu_code_counter_req { struct prestera_msg_cmd cmd; u8 counter_type; u8 code; + u8 pad[2]; }; struct mvsw_msg_cpu_code_counter_ret { -- cgit v1.2.3 From c4777efa751d293e369aec464ce6875e957be255 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 1 Nov 2021 17:45:55 -0700 Subject: net: add and use skb_unclone_keeptruesize() helper While commit 097b9146c0e2 ("net: fix up truesize of cloned skb in skb_prepare_for_shift()") fixed immediate issues found when KFENCE was enabled/tested, there are still similar issues, when tcp_trim_head() hits KFENCE while the master skb is cloned. This happens under heavy networking TX workloads, when the TX completion might be delayed after incoming ACK. This patch fixes the WARNING in sk_stream_kill_queues when sk->sk_mem_queued/sk->sk_forward_alloc are not zero. Fixes: d3fb45f370d9 ("mm, kfence: insert KFENCE hooks for SLAB") Signed-off-by: Eric Dumazet Acked-by: Marco Elver Link: https://lore.kernel.org/r/20211102004555.1359210-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 16 ++++++++++++++++ net/core/skbuff.c | 14 +------------- net/ipv4/tcp_output.c | 6 +++--- 3 files changed, 20 insertions(+), 16 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 0bd6520329f6..a63e13082397 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1675,6 +1675,22 @@ static inline int skb_unclone(struct sk_buff *skb, gfp_t pri) return 0; } +/* This variant of skb_unclone() makes sure skb->truesize is not changed */ +static inline int skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri) +{ + might_sleep_if(gfpflags_allow_blocking(pri)); + + if (skb_cloned(skb)) { + unsigned int save = skb->truesize; + int res; + + res = pskb_expand_head(skb, 0, 0, pri); + skb->truesize = save; + return res; + } + return 0; +} + /** * skb_header_cloned - is the header a clone * @skb: buffer to check diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 67a9188d8a49..3ec42cdee16a 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3449,19 +3449,7 @@ EXPORT_SYMBOL(skb_split); */ static int skb_prepare_for_shift(struct sk_buff *skb) { - int ret = 0; - - if (skb_cloned(skb)) { - /* Save and restore truesize: pskb_expand_head() may reallocate - * memory where ksize(kmalloc(S)) != ksize(kmalloc(S)), but we - * cannot change truesize at this point. - */ - unsigned int save_truesize = skb->truesize; - - ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC); - skb->truesize = save_truesize; - } - return ret; + return skb_unclone_keeptruesize(skb, GFP_ATOMIC); } /** diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 6fbbf1558033..76cc1641beb4 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1559,7 +1559,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, return -ENOMEM; } - if (skb_unclone(skb, gfp)) + if (skb_unclone_keeptruesize(skb, gfp)) return -ENOMEM; /* Get a new skb... force flag on. */ @@ -1667,7 +1667,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) { u32 delta_truesize; - if (skb_unclone(skb, GFP_ATOMIC)) + if (skb_unclone_keeptruesize(skb, GFP_ATOMIC)) return -ENOMEM; delta_truesize = __pskb_trim_head(skb, len); @@ -3166,7 +3166,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) cur_mss, GFP_ATOMIC)) return -ENOMEM; /* We'll try again later. */ } else { - if (skb_unclone(skb, GFP_ATOMIC)) + if (skb_unclone_keeptruesize(skb, GFP_ATOMIC)) return -ENOMEM; diff = tcp_skb_pcount(skb); -- cgit v1.2.3 From 18635d524870888b60625abef086b53c2df3ad89 Mon Sep 17 00:00:00 2001 From: Shay Agroskin Date: Tue, 2 Nov 2021 13:04:00 +0200 Subject: MAINTAINERS: Update ENA maintainers information The ENA driver is no longer maintained by Netanel and Guy Signed-off-by: Shay Agroskin Link: https://lore.kernel.org/r/20211102110358.193920-1-shayagr@amazon.com Signed-off-by: Jakub Kicinski --- MAINTAINERS | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index f96aa662ee32..ecf8ec3d2339 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -872,9 +872,10 @@ F: Documentation/devicetree/bindings/thermal/amazon,al-thermal.txt F: drivers/thermal/thermal_mmio.c AMAZON ETHERNET DRIVERS -M: Netanel Belgazal +M: Shay Agroskin M: Arthur Kiyanovski -R: Guy Tzalik +R: David Arinzon +R: Noam Dagan R: Saeed Bishara L: netdev@vger.kernel.org S: Supported -- cgit v1.2.3 From db2434343b2c29817fe1fa63919e9c56218a46e8 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Tue, 2 Nov 2021 21:03:53 +0800 Subject: amt: fix error return code in amt_init() Return error code when alloc_workqueue() fails in amt_init(). Reported-by: Hulk Robot Signed-off-by: Yang Yingliang Reviewed-by: Taehee Yoo Link: https://lore.kernel.org/r/20211102130353.1666999-1-yangyingliang@huawei.com Signed-off-by: Jakub Kicinski --- drivers/net/amt.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/amt.c b/drivers/net/amt.c index 60a7053a9cf7..d8c9ed9f8a81 100644 --- a/drivers/net/amt.c +++ b/drivers/net/amt.c @@ -3259,8 +3259,10 @@ static int __init amt_init(void) goto unregister_notifier; amt_wq = alloc_workqueue("amt", WQ_UNBOUND, 1); - if (!amt_wq) + if (!amt_wq) { + err = -ENOMEM; goto rtnl_unregister; + } spin_lock_init(&source_gc_lock); spin_lock_bh(&source_gc_lock); -- cgit v1.2.3 From a4414341b58397e703c066d28081c58f5057e948 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Wed, 3 Nov 2021 11:45:07 +0800 Subject: amt: Remove duplicate include Clean up the following includecheck warning: ./drivers/net/amt.c: net/protocol.h is included more than once. Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Reviewed-by: Taehee Yoo Signed-off-by: David S. Miller --- drivers/net/amt.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/amt.c b/drivers/net/amt.c index d8c9ed9f8a81..896c9e2857f0 100644 --- a/drivers/net/amt.c +++ b/drivers/net/amt.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.3 From ca3676f94b8f40f52d285f9aef36dfd6725bfc14 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Wed, 3 Nov 2021 10:44:55 +0800 Subject: kselftests/net: add missed icmp.sh test to Makefile When generating the selftests to another folder, the icmp.sh test will miss as it is not in Makefile, e.g. make -C tools/testing/selftests/ install \ TARGETS="net" INSTALL_PATH=/tmp/kselftests Fixes: 7e9838b7915e ("selftests/net: Add icmp.sh for testing ICMP dummy address responses") Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- tools/testing/selftests/net/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index aee76d1bb9da..7b079b01aa1b 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -12,7 +12,7 @@ TEST_PROGS += udpgro_bench.sh udpgro.sh test_vxlan_under_vrf.sh reuseport_addr_a TEST_PROGS += test_vxlan_fdb_changelink.sh so_txtime.sh ipv6_flowlabel.sh TEST_PROGS += tcp_fastopen_backup_key.sh fcnal-test.sh l2tp.sh traceroute.sh TEST_PROGS += fin_ack_lat.sh fib_nexthop_multiprefix.sh fib_nexthops.sh -TEST_PROGS += altnames.sh icmp_redirect.sh ip6_gre_headroom.sh +TEST_PROGS += altnames.sh icmp.sh icmp_redirect.sh ip6_gre_headroom.sh TEST_PROGS += route_localnet.sh TEST_PROGS += reuseaddr_ports_exhausted.sh TEST_PROGS += txtimestamp.sh -- cgit v1.2.3 From b99ac1841147eefd8d8b52fcf00d7d917949ae7f Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Wed, 3 Nov 2021 10:44:56 +0800 Subject: kselftests/net: add missed setup_loopback.sh/setup_veth.sh to Makefile When generating the selftests to another folder, the include file setup_loopback.sh/setup_veth.sh for gro.sh/gre_gro.sh are missing as they are not in Makefile, e.g. make -C tools/testing/selftests/ install \ TARGETS="net" INSTALL_PATH=/tmp/kselftests Fixes: 7d1575014a63 ("selftests/net: GRO coalesce test") Fixes: 9af771d2ec04 ("selftests/net: allow GRO coalesce test on veth") Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- tools/testing/selftests/net/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 7b079b01aa1b..8c3d0709b870 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -30,7 +30,7 @@ TEST_PROGS += ioam6.sh TEST_PROGS += gro.sh TEST_PROGS += gre_gso.sh TEST_PROGS += cmsg_so_mark.sh -TEST_PROGS_EXTENDED := in_netns.sh +TEST_PROGS_EXTENDED := in_netns.sh setup_loopback.sh setup_veth.sh TEST_GEN_FILES = socket nettest TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy reuseport_addr_any TEST_GEN_FILES += tcp_mmap tcp_inq psock_snd txring_overwrite -- cgit v1.2.3 From 653e7f19b4a0a632cead2390281bde352d3d3273 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Wed, 3 Nov 2021 10:44:57 +0800 Subject: kselftests/net: add missed SRv6 tests When generating the selftests to another folder, the SRv6 tests are missing as they are not in Makefile, e.g. make -C tools/testing/selftests/ install \ TARGETS="net" INSTALL_PATH=/tmp/kselftests Fixes: 03a0b567a03d ("selftests: seg6: add selftest for SRv6 End.DT46 Behavior") Fixes: 2195444e09b4 ("selftests: add selftest for the SRv6 End.DT4 behavior") Fixes: 2bc035538e16 ("selftests: add selftest for the SRv6 End.DT6 (VRF) behavior") Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- tools/testing/selftests/net/Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 8c3d0709b870..256dcd17cd8d 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -30,6 +30,9 @@ TEST_PROGS += ioam6.sh TEST_PROGS += gro.sh TEST_PROGS += gre_gso.sh TEST_PROGS += cmsg_so_mark.sh +TEST_PROGS += srv6_end_dt46_l3vpn_test.sh +TEST_PROGS += srv6_end_dt4_l3vpn_test.sh +TEST_PROGS += srv6_end_dt6_l3vpn_test.sh TEST_PROGS_EXTENDED := in_netns.sh setup_loopback.sh setup_veth.sh TEST_GEN_FILES = socket nettest TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy reuseport_addr_any -- cgit v1.2.3 From 8883deb50eb6529ae1fd4641e402da8ab4f720d2 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Wed, 3 Nov 2021 10:44:58 +0800 Subject: kselftests/net: add missed vrf_strict_mode_test.sh test to Makefile When generating the selftests to another folder, the vrf_strict_mode_test.sh test will miss as it is not in Makefile, e.g. make -C tools/testing/selftests/ install \ TARGETS="net" INSTALL_PATH=/tmp/kselftests Fixes: 8735e6eaa438 ("selftests: add selftest for the VRF strict mode") Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- tools/testing/selftests/net/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 256dcd17cd8d..218a24f0567e 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -33,6 +33,7 @@ TEST_PROGS += cmsg_so_mark.sh TEST_PROGS += srv6_end_dt46_l3vpn_test.sh TEST_PROGS += srv6_end_dt4_l3vpn_test.sh TEST_PROGS += srv6_end_dt6_l3vpn_test.sh +TEST_PROGS += vrf_strict_mode_test.sh TEST_PROGS_EXTENDED := in_netns.sh setup_loopback.sh setup_veth.sh TEST_GEN_FILES = socket nettest TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy reuseport_addr_any -- cgit v1.2.3 From 17b67370c38de2a878debf39dcbc704a206af4d0 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Wed, 3 Nov 2021 10:44:59 +0800 Subject: kselftests/net: add missed toeplitz.sh/toeplitz_client.sh to Makefile When generating the selftests to another folder, the toeplitz.sh and toeplitz_client.sh are missing as they are not in Makefile, e.g. make -C tools/testing/selftests/ install \ TARGETS="net" INSTALL_PATH=/tmp/kselftests Making them under TEST_PROGS_EXTENDED as they test NIC hardware features and are not intended to be run from kselftests. Fixes: 5ebfb4cc3048 ("selftests/net: toeplitz test") Reviewed-by: Willem de Bruijn Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- tools/testing/selftests/net/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 218a24f0567e..7615f29831eb 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -35,6 +35,7 @@ TEST_PROGS += srv6_end_dt4_l3vpn_test.sh TEST_PROGS += srv6_end_dt6_l3vpn_test.sh TEST_PROGS += vrf_strict_mode_test.sh TEST_PROGS_EXTENDED := in_netns.sh setup_loopback.sh setup_veth.sh +TEST_PROGS_EXTENDED += toeplitz_client.sh toeplitz.sh TEST_GEN_FILES = socket nettest TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy reuseport_addr_any TEST_GEN_FILES += tcp_mmap tcp_inq psock_snd txring_overwrite -- cgit v1.2.3 From c081d53f97a1a90a38e4296dd3d6fda5e38dca2c Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 2 Nov 2021 08:02:47 -0400 Subject: security: pass asoc to sctp_assoc_request and sctp_sk_clone This patch is to move secid and peer_secid from endpoint to association, and pass asoc to sctp_assoc_request and sctp_sk_clone instead of ep. As ep is the local endpoint and asoc represents a connection, and in SCTP one sk/ep could have multiple asoc/connection, saving secid/peer_secid for new asoc will overwrite the old asoc's. Note that since asoc can be passed as NULL, security_sctp_assoc_request() is moved to the place right after the new_asoc is created in sctp_sf_do_5_1B_init() and sctp_sf_do_unexpected_init(). v1->v2: - fix the description of selinux_netlbl_skbuff_setsid(), as Jakub noticed. - fix the annotation in selinux_sctp_assoc_request(), as Richard Noticed. Fixes: 72e89f50084c ("security: Add support for SCTP security hooks") Reported-by: Prashanth Prahlad Reviewed-by: Richard Haines Tested-by: Richard Haines Signed-off-by: Xin Long Signed-off-by: David S. Miller --- Documentation/security/SCTP.rst | 28 ++++++++++++++-------------- include/linux/lsm_hook_defs.h | 4 ++-- include/linux/lsm_hooks.h | 8 ++++---- include/linux/security.h | 10 +++++----- include/net/sctp/structs.h | 20 ++++++++++---------- net/sctp/sm_statefuns.c | 26 +++++++++++++------------- net/sctp/socket.c | 5 ++--- security/security.c | 8 ++++---- security/selinux/hooks.c | 22 +++++++++++----------- security/selinux/include/netlabel.h | 4 ++-- security/selinux/netlabel.c | 18 +++++++++--------- 11 files changed, 76 insertions(+), 77 deletions(-) diff --git a/Documentation/security/SCTP.rst b/Documentation/security/SCTP.rst index 0bcf6c1245ee..415b548d9ce0 100644 --- a/Documentation/security/SCTP.rst +++ b/Documentation/security/SCTP.rst @@ -26,11 +26,11 @@ described in the `SCTP SELinux Support`_ chapter. security_sctp_assoc_request() ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Passes the ``@ep`` and ``@chunk->skb`` of the association INIT packet to the +Passes the ``@asoc`` and ``@chunk->skb`` of the association INIT packet to the security module. Returns 0 on success, error on failure. :: - @ep - pointer to sctp endpoint structure. + @asoc - pointer to sctp association structure. @skb - pointer to skbuff of association packet. @@ -117,9 +117,9 @@ Called whenever a new socket is created by **accept**\(2) calls **sctp_peeloff**\(3). :: - @ep - pointer to current sctp endpoint structure. + @asoc - pointer to current sctp association structure. @sk - pointer to current sock structure. - @sk - pointer to new sock structure. + @newsk - pointer to new sock structure. security_inet_conn_established() @@ -200,22 +200,22 @@ hooks with the SELinux specifics expanded below:: security_sctp_assoc_request() ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Passes the ``@ep`` and ``@chunk->skb`` of the association INIT packet to the +Passes the ``@asoc`` and ``@chunk->skb`` of the association INIT packet to the security module. Returns 0 on success, error on failure. :: - @ep - pointer to sctp endpoint structure. + @asoc - pointer to sctp association structure. @skb - pointer to skbuff of association packet. The security module performs the following operations: - IF this is the first association on ``@ep->base.sk``, then set the peer + IF this is the first association on ``@asoc->base.sk``, then set the peer sid to that in ``@skb``. This will ensure there is only one peer sid - assigned to ``@ep->base.sk`` that may support multiple associations. + assigned to ``@asoc->base.sk`` that may support multiple associations. - ELSE validate the ``@ep->base.sk peer_sid`` against the ``@skb peer sid`` + ELSE validate the ``@asoc->base.sk peer_sid`` against the ``@skb peer sid`` to determine whether the association should be allowed or denied. - Set the sctp ``@ep sid`` to socket's sid (from ``ep->base.sk``) with + Set the sctp ``@asoc sid`` to socket's sid (from ``asoc->base.sk``) with MLS portion taken from ``@skb peer sid``. This will be used by SCTP TCP style sockets and peeled off connections as they cause a new socket to be generated. @@ -259,13 +259,13 @@ security_sctp_sk_clone() Called whenever a new socket is created by **accept**\(2) (i.e. a TCP style socket) or when a socket is 'peeled off' e.g userspace calls **sctp_peeloff**\(3). ``security_sctp_sk_clone()`` will set the new -sockets sid and peer sid to that contained in the ``@ep sid`` and -``@ep peer sid`` respectively. +sockets sid and peer sid to that contained in the ``@asoc sid`` and +``@asoc peer sid`` respectively. :: - @ep - pointer to current sctp endpoint structure. + @asoc - pointer to current sctp association structure. @sk - pointer to current sock structure. - @sk - pointer to new sock structure. + @newsk - pointer to new sock structure. security_inet_conn_established() diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index a9ac70ae01ab..df8de62f4710 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -329,11 +329,11 @@ LSM_HOOK(int, 0, tun_dev_create, void) LSM_HOOK(int, 0, tun_dev_attach_queue, void *security) LSM_HOOK(int, 0, tun_dev_attach, struct sock *sk, void *security) LSM_HOOK(int, 0, tun_dev_open, void *security) -LSM_HOOK(int, 0, sctp_assoc_request, struct sctp_endpoint *ep, +LSM_HOOK(int, 0, sctp_assoc_request, struct sctp_association *asoc, struct sk_buff *skb) LSM_HOOK(int, 0, sctp_bind_connect, struct sock *sk, int optname, struct sockaddr *address, int addrlen) -LSM_HOOK(void, LSM_RET_VOID, sctp_sk_clone, struct sctp_endpoint *ep, +LSM_HOOK(void, LSM_RET_VOID, sctp_sk_clone, struct sctp_association *asoc, struct sock *sk, struct sock *newsk) #endif /* CONFIG_SECURITY_NETWORK */ diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 0bada4df23fc..d45b6f6e27fd 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -1027,9 +1027,9 @@ * Security hooks for SCTP * * @sctp_assoc_request: - * Passes the @ep and @chunk->skb of the association INIT packet to + * Passes the @asoc and @chunk->skb of the association INIT packet to * the security module. - * @ep pointer to sctp endpoint structure. + * @asoc pointer to sctp association structure. * @skb pointer to skbuff of association packet. * Return 0 on success, error on failure. * @sctp_bind_connect: @@ -1047,9 +1047,9 @@ * Called whenever a new socket is created by accept(2) (i.e. a TCP * style socket) or when a socket is 'peeled off' e.g userspace * calls sctp_peeloff(3). - * @ep pointer to current sctp endpoint structure. + * @asoc pointer to current sctp association structure. * @sk pointer to current sock structure. - * @sk pointer to new sock structure. + * @newsk pointer to new sock structure. * * Security hooks for Infiniband * diff --git a/include/linux/security.h b/include/linux/security.h index 7e0ba63b5dde..bbf44a466832 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -179,7 +179,7 @@ struct xfrm_policy; struct xfrm_state; struct xfrm_user_sec_ctx; struct seq_file; -struct sctp_endpoint; +struct sctp_association; #ifdef CONFIG_MMU extern unsigned long mmap_min_addr; @@ -1425,10 +1425,10 @@ int security_tun_dev_create(void); int security_tun_dev_attach_queue(void *security); int security_tun_dev_attach(struct sock *sk, void *security); int security_tun_dev_open(void *security); -int security_sctp_assoc_request(struct sctp_endpoint *ep, struct sk_buff *skb); +int security_sctp_assoc_request(struct sctp_association *asoc, struct sk_buff *skb); int security_sctp_bind_connect(struct sock *sk, int optname, struct sockaddr *address, int addrlen); -void security_sctp_sk_clone(struct sctp_endpoint *ep, struct sock *sk, +void security_sctp_sk_clone(struct sctp_association *asoc, struct sock *sk, struct sock *newsk); #else /* CONFIG_SECURITY_NETWORK */ @@ -1631,7 +1631,7 @@ static inline int security_tun_dev_open(void *security) return 0; } -static inline int security_sctp_assoc_request(struct sctp_endpoint *ep, +static inline int security_sctp_assoc_request(struct sctp_association *asoc, struct sk_buff *skb) { return 0; @@ -1644,7 +1644,7 @@ static inline int security_sctp_bind_connect(struct sock *sk, int optname, return 0; } -static inline void security_sctp_sk_clone(struct sctp_endpoint *ep, +static inline void security_sctp_sk_clone(struct sctp_association *asoc, struct sock *sk, struct sock *newsk) { diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 651bba654d77..899c29c326ba 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -1355,16 +1355,6 @@ struct sctp_endpoint { reconf_enable:1; __u8 strreset_enable; - - /* Security identifiers from incoming (INIT). These are set by - * security_sctp_assoc_request(). These will only be used by - * SCTP TCP type sockets and peeled off connections as they - * cause a new socket to be generated. security_sctp_sk_clone() - * will then plug these into the new socket. - */ - - u32 secid; - u32 peer_secid; }; /* Recover the outter endpoint structure. */ @@ -2104,6 +2094,16 @@ struct sctp_association { __u64 abandoned_unsent[SCTP_PR_INDEX(MAX) + 1]; __u64 abandoned_sent[SCTP_PR_INDEX(MAX) + 1]; + /* Security identifiers from incoming (INIT). These are set by + * security_sctp_assoc_request(). These will only be used by + * SCTP TCP type sockets and peeled off connections as they + * cause a new socket to be generated. security_sctp_sk_clone() + * will then plug these into the new socket. + */ + + u32 secid; + u32 peer_secid; + struct rcu_head rcu; }; diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index fb3da4d8f4a3..3206374209bc 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -326,11 +326,6 @@ enum sctp_disposition sctp_sf_do_5_1B_init(struct net *net, struct sctp_packet *packet; int len; - /* Update socket peer label if first association. */ - if (security_sctp_assoc_request((struct sctp_endpoint *)ep, - chunk->skb)) - return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); - /* 6.10 Bundling * An endpoint MUST NOT bundle INIT, INIT ACK or * SHUTDOWN COMPLETE with any other chunks. @@ -415,6 +410,12 @@ enum sctp_disposition sctp_sf_do_5_1B_init(struct net *net, if (!new_asoc) goto nomem; + /* Update socket peer label if first association. */ + if (security_sctp_assoc_request(new_asoc, chunk->skb)) { + sctp_association_free(new_asoc); + return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); + } + if (sctp_assoc_set_bind_addr_from_ep(new_asoc, sctp_scope(sctp_source(chunk)), GFP_ATOMIC) < 0) @@ -780,7 +781,6 @@ enum sctp_disposition sctp_sf_do_5_1D_ce(struct net *net, } } - /* Delay state machine commands until later. * * Re-build the bind address for the association is done in @@ -1517,11 +1517,6 @@ static enum sctp_disposition sctp_sf_do_unexpected_init( struct sctp_packet *packet; int len; - /* Update socket peer label if first association. */ - if (security_sctp_assoc_request((struct sctp_endpoint *)ep, - chunk->skb)) - return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); - /* 6.10 Bundling * An endpoint MUST NOT bundle INIT, INIT ACK or * SHUTDOWN COMPLETE with any other chunks. @@ -1594,6 +1589,12 @@ static enum sctp_disposition sctp_sf_do_unexpected_init( if (!new_asoc) goto nomem; + /* Update socket peer label if first association. */ + if (security_sctp_assoc_request(new_asoc, chunk->skb)) { + sctp_association_free(new_asoc); + return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); + } + if (sctp_assoc_set_bind_addr_from_ep(new_asoc, sctp_scope(sctp_source(chunk)), GFP_ATOMIC) < 0) goto nomem; @@ -2255,8 +2256,7 @@ enum sctp_disposition sctp_sf_do_5_2_4_dupcook( } /* Update socket peer label if first association. */ - if (security_sctp_assoc_request((struct sctp_endpoint *)ep, - chunk->skb)) { + if (security_sctp_assoc_request(new_asoc, chunk->skb)) { sctp_association_free(new_asoc); return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); } diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 6b937bfd4751..33391254fa82 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -9412,7 +9412,6 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk, struct inet_sock *inet = inet_sk(sk); struct inet_sock *newinet; struct sctp_sock *sp = sctp_sk(sk); - struct sctp_endpoint *ep = sp->ep; newsk->sk_type = sk->sk_type; newsk->sk_bound_dev_if = sk->sk_bound_dev_if; @@ -9457,9 +9456,9 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk, net_enable_timestamp(); /* Set newsk security attributes from original sk and connection - * security attribute from ep. + * security attribute from asoc. */ - security_sctp_sk_clone(ep, sk, newsk); + security_sctp_sk_clone(asoc, sk, newsk); } static inline void sctp_copy_descendant(struct sock *sk_to, diff --git a/security/security.c b/security/security.c index 95e30fadba78..c88167a414b4 100644 --- a/security/security.c +++ b/security/security.c @@ -2367,9 +2367,9 @@ int security_tun_dev_open(void *security) } EXPORT_SYMBOL(security_tun_dev_open); -int security_sctp_assoc_request(struct sctp_endpoint *ep, struct sk_buff *skb) +int security_sctp_assoc_request(struct sctp_association *asoc, struct sk_buff *skb) { - return call_int_hook(sctp_assoc_request, 0, ep, skb); + return call_int_hook(sctp_assoc_request, 0, asoc, skb); } EXPORT_SYMBOL(security_sctp_assoc_request); @@ -2381,10 +2381,10 @@ int security_sctp_bind_connect(struct sock *sk, int optname, } EXPORT_SYMBOL(security_sctp_bind_connect); -void security_sctp_sk_clone(struct sctp_endpoint *ep, struct sock *sk, +void security_sctp_sk_clone(struct sctp_association *asoc, struct sock *sk, struct sock *newsk) { - call_void_hook(sctp_sk_clone, ep, sk, newsk); + call_void_hook(sctp_sk_clone, asoc, sk, newsk); } EXPORT_SYMBOL(security_sctp_sk_clone); diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index ea7b2876a5ae..62d30c0a30c2 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -5339,10 +5339,10 @@ static void selinux_sock_graft(struct sock *sk, struct socket *parent) * connect(2), sctp_connectx(3) or sctp_sendmsg(3) (with no association * already present). */ -static int selinux_sctp_assoc_request(struct sctp_endpoint *ep, +static int selinux_sctp_assoc_request(struct sctp_association *asoc, struct sk_buff *skb) { - struct sk_security_struct *sksec = ep->base.sk->sk_security; + struct sk_security_struct *sksec = asoc->base.sk->sk_security; struct common_audit_data ad; struct lsm_network_audit net = {0,}; u8 peerlbl_active; @@ -5359,7 +5359,7 @@ static int selinux_sctp_assoc_request(struct sctp_endpoint *ep, /* This will return peer_sid = SECSID_NULL if there are * no peer labels, see security_net_peersid_resolve(). */ - err = selinux_skb_peerlbl_sid(skb, ep->base.sk->sk_family, + err = selinux_skb_peerlbl_sid(skb, asoc->base.sk->sk_family, &peer_sid); if (err) return err; @@ -5383,7 +5383,7 @@ static int selinux_sctp_assoc_request(struct sctp_endpoint *ep, */ ad.type = LSM_AUDIT_DATA_NET; ad.u.net = &net; - ad.u.net->sk = ep->base.sk; + ad.u.net->sk = asoc->base.sk; err = avc_has_perm(&selinux_state, sksec->peer_sid, peer_sid, sksec->sclass, SCTP_SOCKET__ASSOCIATION, &ad); @@ -5392,7 +5392,7 @@ static int selinux_sctp_assoc_request(struct sctp_endpoint *ep, } /* Compute the MLS component for the connection and store - * the information in ep. This will be used by SCTP TCP type + * the information in asoc. This will be used by SCTP TCP type * sockets and peeled off connections as they cause a new * socket to be generated. selinux_sctp_sk_clone() will then * plug this into the new socket. @@ -5401,11 +5401,11 @@ static int selinux_sctp_assoc_request(struct sctp_endpoint *ep, if (err) return err; - ep->secid = conn_sid; - ep->peer_secid = peer_sid; + asoc->secid = conn_sid; + asoc->peer_secid = peer_sid; /* Set any NetLabel labels including CIPSO/CALIPSO options. */ - return selinux_netlbl_sctp_assoc_request(ep, skb); + return selinux_netlbl_sctp_assoc_request(asoc, skb); } /* Check if sctp IPv4/IPv6 addresses are valid for binding or connecting @@ -5490,7 +5490,7 @@ static int selinux_sctp_bind_connect(struct sock *sk, int optname, } /* Called whenever a new socket is created by accept(2) or sctp_peeloff(3). */ -static void selinux_sctp_sk_clone(struct sctp_endpoint *ep, struct sock *sk, +static void selinux_sctp_sk_clone(struct sctp_association *asoc, struct sock *sk, struct sock *newsk) { struct sk_security_struct *sksec = sk->sk_security; @@ -5502,8 +5502,8 @@ static void selinux_sctp_sk_clone(struct sctp_endpoint *ep, struct sock *sk, if (!selinux_policycap_extsockclass()) return selinux_sk_clone_security(sk, newsk); - newsksec->sid = ep->secid; - newsksec->peer_sid = ep->peer_secid; + newsksec->sid = asoc->secid; + newsksec->peer_sid = asoc->peer_secid; newsksec->sclass = sksec->sclass; selinux_netlbl_sctp_sk_clone(sk, newsk); } diff --git a/security/selinux/include/netlabel.h b/security/selinux/include/netlabel.h index 0c58f62dc6ab..4d0456d3d459 100644 --- a/security/selinux/include/netlabel.h +++ b/security/selinux/include/netlabel.h @@ -39,7 +39,7 @@ int selinux_netlbl_skbuff_getsid(struct sk_buff *skb, int selinux_netlbl_skbuff_setsid(struct sk_buff *skb, u16 family, u32 sid); -int selinux_netlbl_sctp_assoc_request(struct sctp_endpoint *ep, +int selinux_netlbl_sctp_assoc_request(struct sctp_association *asoc, struct sk_buff *skb); int selinux_netlbl_inet_conn_request(struct request_sock *req, u16 family); void selinux_netlbl_inet_csk_clone(struct sock *sk, u16 family); @@ -98,7 +98,7 @@ static inline int selinux_netlbl_skbuff_setsid(struct sk_buff *skb, return 0; } -static inline int selinux_netlbl_sctp_assoc_request(struct sctp_endpoint *ep, +static inline int selinux_netlbl_sctp_assoc_request(struct sctp_association *asoc, struct sk_buff *skb) { return 0; diff --git a/security/selinux/netlabel.c b/security/selinux/netlabel.c index 29b88e81869b..1321f15799e2 100644 --- a/security/selinux/netlabel.c +++ b/security/selinux/netlabel.c @@ -261,30 +261,30 @@ skbuff_setsid_return: /** * selinux_netlbl_sctp_assoc_request - Label an incoming sctp association. - * @ep: incoming association endpoint. + * @asoc: incoming association. * @skb: the packet. * * Description: - * A new incoming connection is represented by @ep, ...... + * A new incoming connection is represented by @asoc, ...... * Returns zero on success, negative values on failure. * */ -int selinux_netlbl_sctp_assoc_request(struct sctp_endpoint *ep, +int selinux_netlbl_sctp_assoc_request(struct sctp_association *asoc, struct sk_buff *skb) { int rc; struct netlbl_lsm_secattr secattr; - struct sk_security_struct *sksec = ep->base.sk->sk_security; + struct sk_security_struct *sksec = asoc->base.sk->sk_security; struct sockaddr_in addr4; struct sockaddr_in6 addr6; - if (ep->base.sk->sk_family != PF_INET && - ep->base.sk->sk_family != PF_INET6) + if (asoc->base.sk->sk_family != PF_INET && + asoc->base.sk->sk_family != PF_INET6) return 0; netlbl_secattr_init(&secattr); rc = security_netlbl_sid_to_secattr(&selinux_state, - ep->secid, &secattr); + asoc->secid, &secattr); if (rc != 0) goto assoc_request_return; @@ -294,11 +294,11 @@ int selinux_netlbl_sctp_assoc_request(struct sctp_endpoint *ep, if (ip_hdr(skb)->version == 4) { addr4.sin_family = AF_INET; addr4.sin_addr.s_addr = ip_hdr(skb)->saddr; - rc = netlbl_conn_setattr(ep->base.sk, (void *)&addr4, &secattr); + rc = netlbl_conn_setattr(asoc->base.sk, (void *)&addr4, &secattr); } else if (IS_ENABLED(CONFIG_IPV6) && ip_hdr(skb)->version == 6) { addr6.sin6_family = AF_INET6; addr6.sin6_addr = ipv6_hdr(skb)->saddr; - rc = netlbl_conn_setattr(ep->base.sk, (void *)&addr6, &secattr); + rc = netlbl_conn_setattr(asoc->base.sk, (void *)&addr6, &secattr); } else { rc = -EAFNOSUPPORT; } -- cgit v1.2.3 From e215dab1c49070cd75620afd801f777207a5b65c Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 2 Nov 2021 08:02:48 -0400 Subject: security: call security_sctp_assoc_request in sctp_sf_do_5_1D_ce The asoc created when receives the INIT chunk is a temporary one, it will be deleted after INIT_ACK chunk is replied. So for the real asoc created in sctp_sf_do_5_1D_ce() when the COOKIE_ECHO chunk is received, security_sctp_assoc_request() should also be called. v1->v2: - fix some typo and grammar errors, noticed by Ondrej. Fixes: 72e89f50084c ("security: Add support for SCTP security hooks") Reported-by: Prashanth Prahlad Reviewed-by: Richard Haines Tested-by: Richard Haines Signed-off-by: Xin Long Signed-off-by: David S. Miller --- Documentation/security/SCTP.rst | 15 +++++++++------ net/sctp/sm_statefuns.c | 5 +++++ 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/Documentation/security/SCTP.rst b/Documentation/security/SCTP.rst index 415b548d9ce0..d5fd6ccc3dcb 100644 --- a/Documentation/security/SCTP.rst +++ b/Documentation/security/SCTP.rst @@ -151,9 +151,9 @@ establishing an association. INIT ---------------------------------------------> sctp_sf_do_5_1B_init() Respond to an INIT chunk. - SCTP peer endpoint "A" is - asking for an association. Call - security_sctp_assoc_request() + SCTP peer endpoint "A" is asking + for a temporary association. + Call security_sctp_assoc_request() to set the peer label if first association. If not first association, check @@ -163,9 +163,12 @@ establishing an association. | discard the packet. | COOKIE ECHO ------------------------------------------> - | - | - | + sctp_sf_do_5_1D_ce() + Respond to an COOKIE ECHO chunk. + Confirm the cookie and create a + permanent association. + Call security_sctp_assoc_request() to + do the same as for INIT chunk Response. <------------------------------------------- COOKIE ACK | | sctp_sf_do_5_1E_ca | diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 3206374209bc..b818532c3fc2 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -781,6 +781,11 @@ enum sctp_disposition sctp_sf_do_5_1D_ce(struct net *net, } } + if (security_sctp_assoc_request(new_asoc, chunk->skb)) { + sctp_association_free(new_asoc); + return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); + } + /* Delay state machine commands until later. * * Re-build the bind address for the association is done in -- cgit v1.2.3 From 7c2ef0240e6abfd3cc59511339517358350a8910 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 2 Nov 2021 08:02:49 -0400 Subject: security: add sctp_assoc_established hook security_sctp_assoc_established() is added to replace security_inet_conn_established() called in sctp_sf_do_5_1E_ca(), so that asoc can be accessed in security subsystem and save the peer secid to asoc->peer_secid. v1->v2: - fix the return value of security_sctp_assoc_established() in security.h, found by kernel test robot and Ondrej. Fixes: 72e89f50084c ("security: Add support for SCTP security hooks") Reported-by: Prashanth Prahlad Reviewed-by: Richard Haines Tested-by: Richard Haines Signed-off-by: Xin Long Signed-off-by: David S. Miller --- Documentation/security/SCTP.rst | 22 ++++++++++------------ include/linux/lsm_hook_defs.h | 2 ++ include/linux/lsm_hooks.h | 5 +++++ include/linux/security.h | 7 +++++++ net/sctp/sm_statefuns.c | 2 +- security/security.c | 7 +++++++ 6 files changed, 32 insertions(+), 13 deletions(-) diff --git a/Documentation/security/SCTP.rst b/Documentation/security/SCTP.rst index d5fd6ccc3dcb..406cc68b8808 100644 --- a/Documentation/security/SCTP.rst +++ b/Documentation/security/SCTP.rst @@ -15,10 +15,7 @@ For security module support, three SCTP specific hooks have been implemented:: security_sctp_assoc_request() security_sctp_bind_connect() security_sctp_sk_clone() - -Also the following security hook has been utilised:: - - security_inet_conn_established() + security_sctp_assoc_established() The usage of these hooks are described below with the SELinux implementation described in the `SCTP SELinux Support`_ chapter. @@ -122,11 +119,12 @@ calls **sctp_peeloff**\(3). @newsk - pointer to new sock structure. -security_inet_conn_established() +security_sctp_assoc_established() ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Called when a COOKIE ACK is received:: +Called when a COOKIE ACK is received, and the peer secid will be +saved into ``@asoc->peer_secid`` for client:: - @sk - pointer to sock structure. + @asoc - pointer to sctp association structure. @skb - pointer to skbuff of the COOKIE ACK packet. @@ -134,7 +132,7 @@ Security Hooks used for Association Establishment ------------------------------------------------- The following diagram shows the use of ``security_sctp_bind_connect()``, -``security_sctp_assoc_request()``, ``security_inet_conn_established()`` when +``security_sctp_assoc_request()``, ``security_sctp_assoc_established()`` when establishing an association. :: @@ -172,7 +170,7 @@ establishing an association. <------------------------------------------- COOKIE ACK | | sctp_sf_do_5_1E_ca | - Call security_inet_conn_established() | + Call security_sctp_assoc_established() | to set the peer label. | | | | If SCTP_SOCKET_TCP or peeled off @@ -198,7 +196,7 @@ hooks with the SELinux specifics expanded below:: security_sctp_assoc_request() security_sctp_bind_connect() security_sctp_sk_clone() - security_inet_conn_established() + security_sctp_assoc_established() security_sctp_assoc_request() @@ -271,12 +269,12 @@ sockets sid and peer sid to that contained in the ``@asoc sid`` and @newsk - pointer to new sock structure. -security_inet_conn_established() +security_sctp_assoc_established() ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Called when a COOKIE ACK is received where it sets the connection's peer sid to that in ``@skb``:: - @sk - pointer to sock structure. + @asoc - pointer to sctp association structure. @skb - pointer to skbuff of the COOKIE ACK packet. diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index df8de62f4710..442a611fa0fb 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -335,6 +335,8 @@ LSM_HOOK(int, 0, sctp_bind_connect, struct sock *sk, int optname, struct sockaddr *address, int addrlen) LSM_HOOK(void, LSM_RET_VOID, sctp_sk_clone, struct sctp_association *asoc, struct sock *sk, struct sock *newsk) +LSM_HOOK(void, LSM_RET_VOID, sctp_assoc_established, struct sctp_association *asoc, + struct sk_buff *skb) #endif /* CONFIG_SECURITY_NETWORK */ #ifdef CONFIG_SECURITY_INFINIBAND diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index d45b6f6e27fd..d6823214d5c1 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -1050,6 +1050,11 @@ * @asoc pointer to current sctp association structure. * @sk pointer to current sock structure. * @newsk pointer to new sock structure. + * @sctp_assoc_established: + * Passes the @asoc and @chunk->skb of the association COOKIE_ACK packet + * to the security module. + * @asoc pointer to sctp association structure. + * @skb pointer to skbuff of association packet. * * Security hooks for Infiniband * diff --git a/include/linux/security.h b/include/linux/security.h index bbf44a466832..06eac4e61a13 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -1430,6 +1430,8 @@ int security_sctp_bind_connect(struct sock *sk, int optname, struct sockaddr *address, int addrlen); void security_sctp_sk_clone(struct sctp_association *asoc, struct sock *sk, struct sock *newsk); +void security_sctp_assoc_established(struct sctp_association *asoc, + struct sk_buff *skb); #else /* CONFIG_SECURITY_NETWORK */ static inline int security_unix_stream_connect(struct sock *sock, @@ -1649,6 +1651,11 @@ static inline void security_sctp_sk_clone(struct sctp_association *asoc, struct sock *newsk) { } + +static inline void security_sctp_assoc_established(struct sctp_association *asoc, + struct sk_buff *skb) +{ +} #endif /* CONFIG_SECURITY_NETWORK */ #ifdef CONFIG_SECURITY_INFINIBAND diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index b818532c3fc2..5fabaa54b77d 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -946,7 +946,7 @@ enum sctp_disposition sctp_sf_do_5_1E_ca(struct net *net, sctp_add_cmd_sf(commands, SCTP_CMD_INIT_COUNTER_RESET, SCTP_NULL()); /* Set peer label for connection. */ - security_inet_conn_established(ep->base.sk, chunk->skb); + security_sctp_assoc_established((struct sctp_association *)asoc, chunk->skb); /* RFC 2960 5.1 Normal Establishment of an Association * diff --git a/security/security.c b/security/security.c index c88167a414b4..779a9edea0a0 100644 --- a/security/security.c +++ b/security/security.c @@ -2388,6 +2388,13 @@ void security_sctp_sk_clone(struct sctp_association *asoc, struct sock *sk, } EXPORT_SYMBOL(security_sctp_sk_clone); +void security_sctp_assoc_established(struct sctp_association *asoc, + struct sk_buff *skb) +{ + call_void_hook(sctp_assoc_established, asoc, skb); +} +EXPORT_SYMBOL(security_sctp_assoc_established); + #endif /* CONFIG_SECURITY_NETWORK */ #ifdef CONFIG_SECURITY_INFINIBAND -- cgit v1.2.3 From e7310c94024cdf099c0d29e6903dd6fe9205bb60 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 2 Nov 2021 08:02:50 -0400 Subject: security: implement sctp_assoc_established hook in selinux Different from selinux_inet_conn_established(), it also gives the secid to asoc->peer_secid in selinux_sctp_assoc_established(), as one UDP-type socket may have more than one asocs. Note that peer_secid in asoc will save the peer secid for this asoc connection, and peer_sid in sksec will just keep the peer secid for the latest connection. So the right use should be do peeloff for UDP-type socket if there will be multiple asocs in one socket, so that the peeloff socket has the right label for its asoc. v1->v2: - call selinux_inet_conn_established() to reduce some code duplication in selinux_sctp_assoc_established(), as Ondrej suggested. - when doing peeloff, it calls sock_create() where it actually gets secid for socket from socket_sockcreate_sid(). So reuse SECSID_WILD to ensure the peeloff socket keeps using that secid after calling selinux_sctp_sk_clone() for client side. Fixes: 72e89f50084c ("security: Add support for SCTP security hooks") Reported-by: Prashanth Prahlad Reviewed-by: Richard Haines Tested-by: Richard Haines Signed-off-by: Xin Long Signed-off-by: David S. Miller --- security/selinux/hooks.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 62d30c0a30c2..5e5215fe2e83 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -5502,7 +5502,8 @@ static void selinux_sctp_sk_clone(struct sctp_association *asoc, struct sock *sk if (!selinux_policycap_extsockclass()) return selinux_sk_clone_security(sk, newsk); - newsksec->sid = asoc->secid; + if (asoc->secid != SECSID_WILD) + newsksec->sid = asoc->secid; newsksec->peer_sid = asoc->peer_secid; newsksec->sclass = sksec->sclass; selinux_netlbl_sctp_sk_clone(sk, newsk); @@ -5558,6 +5559,16 @@ static void selinux_inet_conn_established(struct sock *sk, struct sk_buff *skb) selinux_skb_peerlbl_sid(skb, family, &sksec->peer_sid); } +static void selinux_sctp_assoc_established(struct sctp_association *asoc, + struct sk_buff *skb) +{ + struct sk_security_struct *sksec = asoc->base.sk->sk_security; + + selinux_inet_conn_established(asoc->base.sk, skb); + asoc->peer_secid = sksec->peer_sid; + asoc->secid = SECSID_WILD; +} + static int selinux_secmark_relabel_packet(u32 sid) { const struct task_security_struct *__tsec; @@ -7228,6 +7239,7 @@ static struct security_hook_list selinux_hooks[] __lsm_ro_after_init = { LSM_HOOK_INIT(sctp_assoc_request, selinux_sctp_assoc_request), LSM_HOOK_INIT(sctp_sk_clone, selinux_sctp_sk_clone), LSM_HOOK_INIT(sctp_bind_connect, selinux_sctp_bind_connect), + LSM_HOOK_INIT(sctp_assoc_established, selinux_sctp_assoc_established), LSM_HOOK_INIT(inet_conn_request, selinux_inet_conn_request), LSM_HOOK_INIT(inet_csk_clone, selinux_inet_csk_clone), LSM_HOOK_INIT(inet_conn_established, selinux_inet_conn_established), -- cgit v1.2.3 From aedddb4e45b34426cfbfa84454b6f203712733c5 Mon Sep 17 00:00:00 2001 From: Lin Ma Date: Tue, 2 Nov 2021 16:10:21 +0800 Subject: NFC: add necessary privilege flags in netlink layer The CAP_NET_ADMIN checks are needed to prevent attackers faking a device under NCIUARTSETDRIVER and exploit privileged commands. This patch add GENL_ADMIN_PERM flags in genl_ops to fulfill the check. Except for commands like NFC_CMD_GET_DEVICE, NFC_CMD_GET_TARGET, NFC_CMD_LLC_GET_PARAMS, and NFC_CMD_GET_SE, which are mainly information- read operations. Signed-off-by: Lin Ma Signed-off-by: David S. Miller --- net/nfc/netlink.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index 49089c50872e..334f63c9529e 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -1664,31 +1664,37 @@ static const struct genl_ops nfc_genl_ops[] = { .cmd = NFC_CMD_DEV_UP, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_dev_up, + .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_DEV_DOWN, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_dev_down, + .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_START_POLL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_start_poll, + .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_STOP_POLL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_stop_poll, + .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_DEP_LINK_UP, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_dep_link_up, + .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_DEP_LINK_DOWN, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_dep_link_down, + .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_GET_TARGET, @@ -1706,26 +1712,31 @@ static const struct genl_ops nfc_genl_ops[] = { .cmd = NFC_CMD_LLC_SET_PARAMS, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_llc_set_params, + .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_LLC_SDREQ, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_llc_sdreq, + .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_FW_DOWNLOAD, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_fw_download, + .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_ENABLE_SE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_enable_se, + .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_DISABLE_SE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_disable_se, + .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_GET_SE, @@ -1737,21 +1748,25 @@ static const struct genl_ops nfc_genl_ops[] = { .cmd = NFC_CMD_SE_IO, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_se_io, + .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_ACTIVATE_TARGET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_activate_target, + .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_VENDOR, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_vendor_cmd, + .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_DEACTIVATE_TARGET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_deactivate_target, + .flags = GENL_ADMIN_PERM, }, }; -- cgit v1.2.3 From acaea0d5a63406c052444ad3a7cb54241adaf805 Mon Sep 17 00:00:00 2001 From: Zhang Mingyu Date: Wed, 3 Nov 2021 06:46:17 +0000 Subject: net:ipv6:Remove unneeded semicolon Eliminate the following coccinelle check warning: net/ipv6/seg6.c:381:2-3 Reported-by: Zeal Robot Signed-off-by: Zhang Mingyu Signed-off-by: David S. Miller --- net/ipv6/seg6.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c index 5daa1c3ed83b..a8b5784afb1a 100644 --- a/net/ipv6/seg6.c +++ b/net/ipv6/seg6.c @@ -378,7 +378,7 @@ static int __net_init seg6_net_init(struct net *net) kfree(rcu_dereference_raw(sdata->tun_src)); kfree(sdata); return -ENOMEM; - }; + } #endif return 0; -- cgit v1.2.3 From 9b65b17db72313b7a4fe9bc9502928c88be57986 Mon Sep 17 00:00:00 2001 From: Talal Ahmad Date: Tue, 2 Nov 2021 22:58:44 -0400 Subject: net: avoid double accounting for pure zerocopy skbs Track skbs containing only zerocopy data and avoid charging them to kernel memory to correctly account the memory utilization for msg_zerocopy. All of the data in such skbs is held in user pages which are already accounted to user. Before this change, they are charged again in kernel in __zerocopy_sg_from_iter. The charging in kernel is excessive because data is not being copied into skb frags. This excessive charging can lead to kernel going into memory pressure state which impacts all sockets in the system adversely. Mark pure zerocopy skbs with a SKBFL_PURE_ZEROCOPY flag and remove charge/uncharge for data in such skbs. Initially, an skb is marked pure zerocopy when it is empty and in zerocopy path. skb can then change from a pure zerocopy skb to mixed data skb (zerocopy and copy data) if it is at tail of write queue and there is room available in it and non-zerocopy data is being sent in the next sendmsg call. At this time sk_mem_charge is done for the pure zerocopied data and the pure zerocopy flag is unmarked. We found that this happens very rarely on workloads that pass MSG_ZEROCOPY. A pure zerocopy skb can later be coalesced into normal skb if they are next to each other in queue but this patch prevents coalescing from happening. This avoids complexity of charging when skb downgrades from pure zerocopy to mixed. This is also rare. In sk_wmem_free_skb, if it is a pure zerocopy skb, an sk_mem_uncharge for SKB_TRUESIZE(skb_end_offset(skb)) is done for sk_mem_charge in tcp_skb_entail for an skb without data. Testing with the msg_zerocopy.c benchmark between two hosts(100G nics) with zerocopy showed that before this patch the 'sock' variable in memory.stat for cgroup2 that tracks sum of sk_forward_alloc, sk_rmem_alloc and sk_wmem_queued is around 1822720 and with this change it is 0. This is due to no charge to sk_forward_alloc for zerocopy data and shows memory utilization for kernel is lowered. With this commit we don't see the warning we saw in previous commit which resulted in commit 84882cf72cd774cf16fd338bdbf00f69ac9f9194. Signed-off-by: Talal Ahmad Acked-by: Arjun Roy Acked-by: Soheil Hassas Yeganeh Signed-off-by: Willem de Bruijn Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skbuff.h | 19 ++++++++++++++++++- include/net/tcp.h | 8 ++++++-- net/core/datagram.c | 3 ++- net/core/skbuff.c | 3 ++- net/ipv4/tcp.c | 22 ++++++++++++++++++++-- net/ipv4/tcp_output.c | 7 +++++-- 6 files changed, 53 insertions(+), 9 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index a63e13082397..686a666d073d 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -454,9 +454,15 @@ enum { * all frags to avoid possible bad checksum */ SKBFL_SHARED_FRAG = BIT(1), + + /* segment contains only zerocopy data and should not be + * charged to the kernel memory. + */ + SKBFL_PURE_ZEROCOPY = BIT(2), }; #define SKBFL_ZEROCOPY_FRAG (SKBFL_ZEROCOPY_ENABLE | SKBFL_SHARED_FRAG) +#define SKBFL_ALL_ZEROCOPY (SKBFL_ZEROCOPY_FRAG | SKBFL_PURE_ZEROCOPY) /* * The callback notifies userspace to release buffers when skb DMA is done in @@ -1464,6 +1470,17 @@ static inline struct ubuf_info *skb_zcopy(struct sk_buff *skb) return is_zcopy ? skb_uarg(skb) : NULL; } +static inline bool skb_zcopy_pure(const struct sk_buff *skb) +{ + return skb_shinfo(skb)->flags & SKBFL_PURE_ZEROCOPY; +} + +static inline bool skb_pure_zcopy_same(const struct sk_buff *skb1, + const struct sk_buff *skb2) +{ + return skb_zcopy_pure(skb1) == skb_zcopy_pure(skb2); +} + static inline void net_zcopy_get(struct ubuf_info *uarg) { refcount_inc(&uarg->refcnt); @@ -1528,7 +1545,7 @@ static inline void skb_zcopy_clear(struct sk_buff *skb, bool zerocopy_success) if (!skb_zcopy_is_nouarg(skb)) uarg->callback(skb, uarg, zerocopy_success); - skb_shinfo(skb)->flags &= ~SKBFL_ZEROCOPY_FRAG; + skb_shinfo(skb)->flags &= ~SKBFL_ALL_ZEROCOPY; } } diff --git a/include/net/tcp.h b/include/net/tcp.h index 70972f3ac8fa..4da22b41bde6 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -293,7 +293,10 @@ static inline bool tcp_out_of_memory(struct sock *sk) static inline void tcp_wmem_free_skb(struct sock *sk, struct sk_buff *skb) { sk_wmem_queued_add(sk, -skb->truesize); - sk_mem_uncharge(sk, skb->truesize); + if (!skb_zcopy_pure(skb)) + sk_mem_uncharge(sk, skb->truesize); + else + sk_mem_uncharge(sk, SKB_TRUESIZE(skb_end_offset(skb))); __kfree_skb(skb); } @@ -974,7 +977,8 @@ static inline bool tcp_skb_can_collapse(const struct sk_buff *to, const struct sk_buff *from) { return likely(tcp_skb_can_collapse_to(to) && - mptcp_skb_can_collapse(to, from)); + mptcp_skb_can_collapse(to, from) && + skb_pure_zcopy_same(to, from)); } /* Events passed to congestion control interface */ diff --git a/net/core/datagram.c b/net/core/datagram.c index 15ab9ffb27fe..ee290776c661 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -646,7 +646,8 @@ int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb, skb->truesize += truesize; if (sk && sk->sk_type == SOCK_STREAM) { sk_wmem_queued_add(sk, truesize); - sk_mem_charge(sk, truesize); + if (!skb_zcopy_pure(skb)) + sk_mem_charge(sk, truesize); } else { refcount_add(truesize, &skb->sk->sk_wmem_alloc); } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 3ec42cdee16a..ba2f38246f07 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3433,8 +3433,9 @@ static inline void skb_split_no_header(struct sk_buff *skb, void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len) { int pos = skb_headlen(skb); + const int zc_flags = SKBFL_SHARED_FRAG | SKBFL_PURE_ZEROCOPY; - skb_shinfo(skb1)->flags |= skb_shinfo(skb)->flags & SKBFL_SHARED_FRAG; + skb_shinfo(skb1)->flags |= skb_shinfo(skb)->flags & zc_flags; skb_zerocopy_clone(skb1, skb, 0); if (len < pos) /* Split line is inside header. */ skb_split_inside_header(skb, skb1, len, pos); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index bc7f419184aa..b461ae573afc 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -863,6 +863,7 @@ struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp, if (likely(skb)) { bool mem_scheduled; + skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); if (force_schedule) { mem_scheduled = true; sk_forced_mem_schedule(sk, skb->truesize); @@ -1319,6 +1320,15 @@ new_segment: copy = min_t(int, copy, pfrag->size - pfrag->offset); + /* skb changing from pure zc to mixed, must charge zc */ + if (unlikely(skb_zcopy_pure(skb))) { + if (!sk_wmem_schedule(sk, skb->data_len)) + goto wait_for_space; + + sk_mem_charge(sk, skb->data_len); + skb_shinfo(skb)->flags &= ~SKBFL_PURE_ZEROCOPY; + } + if (!sk_wmem_schedule(sk, copy)) goto wait_for_space; @@ -1339,8 +1349,16 @@ new_segment: } pfrag->offset += copy; } else { - if (!sk_wmem_schedule(sk, copy)) - goto wait_for_space; + /* First append to a fragless skb builds initial + * pure zerocopy skb + */ + if (!skb->len) + skb_shinfo(skb)->flags |= SKBFL_PURE_ZEROCOPY; + + if (!skb_zcopy_pure(skb)) { + if (!sk_wmem_schedule(sk, copy)) + goto wait_for_space; + } err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg); if (err == -EMSGSIZE || err == -EEXIST) { diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 76cc1641beb4..6f7860e283c6 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1677,7 +1677,8 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) if (delta_truesize) { skb->truesize -= delta_truesize; sk_wmem_queued_add(sk, -delta_truesize); - sk_mem_uncharge(sk, delta_truesize); + if (!skb_zcopy_pure(skb)) + sk_mem_uncharge(sk, delta_truesize); } /* Any change of skb->len requires recalculation of tso factor. */ @@ -2295,7 +2296,9 @@ static bool tcp_can_coalesce_send_queue_head(struct sock *sk, int len) if (len <= skb->len) break; - if (unlikely(TCP_SKB_CB(skb)->eor) || tcp_has_tx_tstamp(skb)) + if (unlikely(TCP_SKB_CB(skb)->eor) || + tcp_has_tx_tstamp(skb) || + !skb_pure_zcopy_same(skb, next)) return false; len -= skb->len; -- cgit v1.2.3 From 1aabe578dd86e9f2867c4db4fba9a15f4ba1825d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 2 Nov 2021 15:02:36 -0700 Subject: ethtool: fix ethtool msg len calculation for pause stats ETHTOOL_A_PAUSE_STAT_MAX is the MAX attribute id, so we need to subtract non-stats and add one to get a count (IOW -2+1 == -1). Otherwise we'll see: ethnl cmd 21: calculated reply length 40, but consumed 52 Fixes: 9a27a33027f2 ("ethtool: add standard pause stats") Signed-off-by: Jakub Kicinski Reviewed-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/linux/ethtool_netlink.h | 3 +++ include/uapi/linux/ethtool_netlink.h | 4 +++- net/ethtool/pause.c | 3 +-- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/include/linux/ethtool_netlink.h b/include/linux/ethtool_netlink.h index 1e7bf78cb382..aba348d58ff6 100644 --- a/include/linux/ethtool_netlink.h +++ b/include/linux/ethtool_netlink.h @@ -10,6 +10,9 @@ #define __ETHTOOL_LINK_MODE_MASK_NWORDS \ DIV_ROUND_UP(__ETHTOOL_LINK_MODE_MASK_NBITS, 32) +#define ETHTOOL_PAUSE_STAT_CNT (__ETHTOOL_A_PAUSE_STAT_CNT - \ + ETHTOOL_A_PAUSE_STAT_TX_FRAMES) + enum ethtool_multicast_groups { ETHNL_MCGRP_MONITOR, }; diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index ca5fbb59fa42..999777d32dcf 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -411,7 +411,9 @@ enum { ETHTOOL_A_PAUSE_STAT_TX_FRAMES, ETHTOOL_A_PAUSE_STAT_RX_FRAMES, - /* add new constants above here */ + /* add new constants above here + * adjust ETHTOOL_PAUSE_STAT_CNT if adding non-stats! + */ __ETHTOOL_A_PAUSE_STAT_CNT, ETHTOOL_A_PAUSE_STAT_MAX = (__ETHTOOL_A_PAUSE_STAT_CNT - 1) }; diff --git a/net/ethtool/pause.c b/net/ethtool/pause.c index 9009f412151e..ee1e5806bc93 100644 --- a/net/ethtool/pause.c +++ b/net/ethtool/pause.c @@ -56,8 +56,7 @@ static int pause_reply_size(const struct ethnl_req_info *req_base, if (req_base->flags & ETHTOOL_FLAG_STATS) n += nla_total_size(0) + /* _PAUSE_STATS */ - nla_total_size_64bit(sizeof(u64)) * - (ETHTOOL_A_PAUSE_STAT_MAX - 2); + nla_total_size_64bit(sizeof(u64)) * ETHTOOL_PAUSE_STAT_CNT; return n; } -- cgit v1.2.3 From 250962e4684678629afd2feeaefdc40c5db501f4 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Wed, 3 Nov 2021 16:28:43 +0800 Subject: net: udp6: replace __UDP_INC_STATS() with __UDP6_INC_STATS() __UDP_INC_STATS() is used in udpv6_queue_rcv_one_skb() when encap_rcv() fails. __UDP6_INC_STATS() should be used here, so replace it with __UDP6_INC_STATS(). Signed-off-by: Menglong Dong Signed-off-by: David S. Miller --- net/ipv6/udp.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 12c12619ee35..e43b31d25fb6 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -700,9 +700,9 @@ static int udpv6_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb) ret = encap_rcv(sk, skb); if (ret <= 0) { - __UDP_INC_STATS(sock_net(sk), - UDP_MIB_INDATAGRAMS, - is_udplite); + __UDP6_INC_STATS(sock_net(sk), + UDP_MIB_INDATAGRAMS, + is_udplite); return -ret; } } -- cgit v1.2.3 From 563bcbae3ba233c275c244bfce2efe12938f5363 Mon Sep 17 00:00:00 2001 From: Ziyang Xuan Date: Tue, 2 Nov 2021 10:12:18 +0800 Subject: net: vlan: fix a UAF in vlan_dev_real_dev() The real_dev of a vlan net_device may be freed after unregister_vlan_dev(). Access the real_dev continually by vlan_dev_real_dev() will trigger the UAF problem for the real_dev like following: ================================================================== BUG: KASAN: use-after-free in vlan_dev_real_dev+0xf9/0x120 Call Trace: kasan_report.cold+0x83/0xdf vlan_dev_real_dev+0xf9/0x120 is_eth_port_of_netdev_filter.part.0+0xb1/0x2c0 is_eth_port_of_netdev_filter+0x28/0x40 ib_enum_roce_netdev+0x1a3/0x300 ib_enum_all_roce_netdevs+0xc7/0x140 netdevice_event_work_handler+0x9d/0x210 ... Freed by task 9288: kasan_save_stack+0x1b/0x40 kasan_set_track+0x1c/0x30 kasan_set_free_info+0x20/0x30 __kasan_slab_free+0xfc/0x130 slab_free_freelist_hook+0xdd/0x240 kfree+0xe4/0x690 kvfree+0x42/0x50 device_release+0x9f/0x240 kobject_put+0x1c8/0x530 put_device+0x1b/0x30 free_netdev+0x370/0x540 ppp_destroy_interface+0x313/0x3d0 ... Move the put_device(real_dev) to vlan_dev_free(). Ensure real_dev not be freed before vlan_dev unregistered. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzbot+e4df4e1389e28972e955@syzkaller.appspotmail.com Signed-off-by: Ziyang Xuan Reviewed-by: Jason Gunthorpe Signed-off-by: David S. Miller --- net/8021q/vlan.c | 3 --- net/8021q/vlan_dev.c | 3 +++ 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 55275ef9a31a..a3a0a5e994f5 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -123,9 +123,6 @@ void unregister_vlan_dev(struct net_device *dev, struct list_head *head) } vlan_vid_del(real_dev, vlan->vlan_proto, vlan_id); - - /* Get rid of the vlan's reference to real_dev */ - dev_put(real_dev); } int vlan_check_real_dev(struct net_device *real_dev, diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 90330b893134..ab6dee28536d 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -843,6 +843,9 @@ static void vlan_dev_free(struct net_device *dev) free_percpu(vlan->vlan_pcpu_stats); vlan->vlan_pcpu_stats = NULL; + + /* Get rid of the vlan's reference to real_dev */ + dev_put(vlan->real_dev); } void vlan_setup(struct net_device *dev) -- cgit v1.2.3 From 5f15d392dcb4aa250a63d6f2c5adfc26c0aedc78 Mon Sep 17 00:00:00 2001 From: Ansuel Smith Date: Tue, 2 Nov 2021 19:30:41 +0100 Subject: net: dsa: qca8k: make sure PAD0 MAC06 exchange is disabled Some device set MAC06 exchange in the bootloader. This cause some problem as we don't support this strange mode and we just set the port6 as the primary CPU port. With MAC06 exchange, PAD0 reg configure port6 instead of port0. Add an extra check and explicitly disable MAC06 exchange to correctly configure the port PAD config. Signed-off-by: Ansuel Smith Fixes: 3fcf734aa482 ("net: dsa: qca8k: add support for cpu port 6") Reviewed-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/dsa/qca8k.c | 8 ++++++++ drivers/net/dsa/qca8k.h | 1 + 2 files changed, 9 insertions(+) diff --git a/drivers/net/dsa/qca8k.c b/drivers/net/dsa/qca8k.c index ea7f12778922..a429c9750add 100644 --- a/drivers/net/dsa/qca8k.c +++ b/drivers/net/dsa/qca8k.c @@ -1109,6 +1109,14 @@ qca8k_setup(struct dsa_switch *ds) if (ret) return ret; + /* Make sure MAC06 is disabled */ + ret = qca8k_reg_clear(priv, QCA8K_REG_PORT0_PAD_CTRL, + QCA8K_PORT0_PAD_MAC06_EXCHANGE_EN); + if (ret) { + dev_err(priv->dev, "failed disabling MAC06 exchange"); + return ret; + } + /* Enable CPU Port */ ret = qca8k_reg_set(priv, QCA8K_REG_GLOBAL_FW_CTRL0, QCA8K_GLOBAL_FW_CTRL0_CPU_PORT_EN); diff --git a/drivers/net/dsa/qca8k.h b/drivers/net/dsa/qca8k.h index e10571a398c9..128b8cf85e08 100644 --- a/drivers/net/dsa/qca8k.h +++ b/drivers/net/dsa/qca8k.h @@ -34,6 +34,7 @@ #define QCA8K_MASK_CTRL_DEVICE_ID_MASK GENMASK(15, 8) #define QCA8K_MASK_CTRL_DEVICE_ID(x) ((x) >> 8) #define QCA8K_REG_PORT0_PAD_CTRL 0x004 +#define QCA8K_PORT0_PAD_MAC06_EXCHANGE_EN BIT(31) #define QCA8K_PORT0_PAD_SGMII_RXCLK_FALLING_EDGE BIT(19) #define QCA8K_PORT0_PAD_SGMII_TXCLK_FALLING_EDGE BIT(18) #define QCA8K_REG_PORT5_PAD_CTRL 0x008 -- cgit v1.2.3 From 92f62485b3715882cd397b0cbd80a96d179b86d6 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 2 Nov 2021 21:31:22 +0200 Subject: net: dsa: felix: fix broken VLAN-tagged PTP under VLAN-aware bridge Normally it is expected that the dsa_device_ops :: rcv() method finishes parsing the DSA tag and consumes it, then never looks at it again. But commit c0bcf537667c ("net: dsa: ocelot: add hardware timestamping support for Felix") added support for RX timestamping in a very unconventional way. On this switch, a partial timestamp is available in the DSA header, but the driver got away with not parsing that timestamp right away, but instead delayed that parsing for a little longer: dsa_switch_rcv(): nskb = cpu_dp->rcv(skb, dev); <------------- not here -> ocelot_rcv() ... skb = nskb; skb_push(skb, ETH_HLEN); skb->pkt_type = PACKET_HOST; skb->protocol = eth_type_trans(skb, skb->dev); ... if (dsa_skb_defer_rx_timestamp(p, skb)) <--- but here -> felix_rxtstamp() return 0; When in felix_rxtstamp(), this driver accounted for the fact that eth_type_trans() happened in the meanwhile, so it got a hold of the extraction header again by subtracting (ETH_HLEN + OCELOT_TAG_LEN) bytes from the current skb->data. This worked for quite some time but was quite fragile from the very beginning. Not to mention that having DSA tag parsing split in two different files, under different folders (net/dsa/tag_ocelot.c vs drivers/net/dsa/ocelot/felix.c) made it quite non-obvious for patches to come that they might break this. Finally, the blamed commit does the following: at the end of ocelot_rcv(), it checks whether the skb payload contains a VLAN header. If it does, and this port is under a VLAN-aware bridge, that VLAN ID might not be correct in the sense that the packet might have suffered VLAN rewriting due to TCAM rules (VCAP IS1). So we consume the VLAN ID from the skb payload using __skb_vlan_pop(), and take the classified VLAN ID from the DSA tag, and construct a hwaccel VLAN tag with the classified VLAN, and the skb payload is VLAN-untagged. The big problem is that __skb_vlan_pop() does: memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN); __skb_pull(skb, VLAN_HLEN); aka it moves the Ethernet header 4 bytes to the right, and pulls 4 bytes from the skb headroom (effectively also moving skb->data, by definition). So for felix_rxtstamp()'s fragile logic, all bets are off now. Instead of having the "extraction" pointer point to the DSA header, it actually points to 4 bytes _inside_ the extraction header. Corollary, the last 4 bytes of the "extraction" header are in fact 4 stale bytes of the destination MAC address from the Ethernet header, from prior to the __skb_vlan_pop() movement. So of course, RX timestamps are completely bogus when the system is configured in this way. The fix is actually very simple: just don't structure the code like that. For better or worse, the DSA PTP timestamping API does not offer a straightforward way for drivers to present their RX timestamps, but other drivers (sja1105) have established a simple mechanism to carry their RX timestamp from dsa_device_ops :: rcv() all the way to dsa_switch_ops :: port_rxtstamp() and even later. That mechanism is to simply save the partial timestamp to the skb->cb, and complete it later. Question: why don't we simply populate the skb's struct skb_shared_hwtstamps from ocelot_rcv(), and bother with this complication of propagating the timestamp to felix_rxtstamp()? Answer: dsa_switch_ops :: port_rxtstamp() answers the question whether PTP packets need sleepable context to retrieve the full RX timestamp. Currently felix_rxtstamp() answers "no, thanks" to that question, and calls ocelot_ptp_gettime64() from softirq atomic context. This is understandable, since Felix VSC9959 is a PCIe memory-mapped switch, so hardware access does not require sleeping. But the felix driver is preparing for the introduction of other switches where hardware access is over a slow bus like SPI or MDIO: https://lore.kernel.org/lkml/20210814025003.2449143-1-colin.foster@in-advantage.com/ So I would like to keep this code structure, so the rework needed when that driver will need PTP support will be minimal (answer "yes, I need deferred context for this skb's RX timestamp", then the partial timestamp will still be found in the skb->cb. Fixes: ea440cd2d9b2 ("net: dsa: tag_ocelot: use VLAN information from tagging header when available") Reported-by: Po Liu Cc: Yangbo Lu Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/dsa/ocelot/felix.c | 9 +++------ include/linux/dsa/ocelot.h | 1 + net/dsa/tag_ocelot.c | 3 +++ 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c index 83808e7dbdda..327cc4654806 100644 --- a/drivers/net/dsa/ocelot/felix.c +++ b/drivers/net/dsa/ocelot/felix.c @@ -1370,12 +1370,12 @@ out: static bool felix_rxtstamp(struct dsa_switch *ds, int port, struct sk_buff *skb, unsigned int type) { - u8 *extraction = skb->data - ETH_HLEN - OCELOT_TAG_LEN; + u32 tstamp_lo = OCELOT_SKB_CB(skb)->tstamp_lo; struct skb_shared_hwtstamps *shhwtstamps; struct ocelot *ocelot = ds->priv; - u32 tstamp_lo, tstamp_hi; struct timespec64 ts; - u64 tstamp, val; + u32 tstamp_hi; + u64 tstamp; /* If the "no XTR IRQ" workaround is in use, tell DSA to defer this skb * for RX timestamping. Then free it, and poll for its copy through @@ -1390,9 +1390,6 @@ static bool felix_rxtstamp(struct dsa_switch *ds, int port, ocelot_ptp_gettime64(&ocelot->ptp_info, &ts); tstamp = ktime_set(ts.tv_sec, ts.tv_nsec); - ocelot_xfh_get_rew_val(extraction, &val); - tstamp_lo = (u32)val; - tstamp_hi = tstamp >> 32; if ((tstamp & 0xffffffff) < tstamp_lo) tstamp_hi--; diff --git a/include/linux/dsa/ocelot.h b/include/linux/dsa/ocelot.h index d42010cf5468..7ee708ad7df2 100644 --- a/include/linux/dsa/ocelot.h +++ b/include/linux/dsa/ocelot.h @@ -12,6 +12,7 @@ struct ocelot_skb_cb { struct sk_buff *clone; unsigned int ptp_class; /* valid only for clones */ + u32 tstamp_lo; u8 ptp_cmd; u8 ts_id; }; diff --git a/net/dsa/tag_ocelot.c b/net/dsa/tag_ocelot.c index cd60b94fc175..de1c849a0a70 100644 --- a/net/dsa/tag_ocelot.c +++ b/net/dsa/tag_ocelot.c @@ -101,6 +101,7 @@ static struct sk_buff *ocelot_rcv(struct sk_buff *skb, struct dsa_port *dp; u8 *extraction; u16 vlan_tpid; + u64 rew_val; /* Revert skb->data by the amount consumed by the DSA master, * so it points to the beginning of the frame. @@ -130,6 +131,7 @@ static struct sk_buff *ocelot_rcv(struct sk_buff *skb, ocelot_xfh_get_qos_class(extraction, &qos_class); ocelot_xfh_get_tag_type(extraction, &tag_type); ocelot_xfh_get_vlan_tci(extraction, &vlan_tci); + ocelot_xfh_get_rew_val(extraction, &rew_val); skb->dev = dsa_master_find_slave(netdev, 0, src_port); if (!skb->dev) @@ -143,6 +145,7 @@ static struct sk_buff *ocelot_rcv(struct sk_buff *skb, dsa_default_offload_fwd_mark(skb); skb->priority = qos_class; + OCELOT_SKB_CB(skb)->tstamp_lo = rew_val; /* Ocelot switches copy frames unmodified to the CPU. However, it is * possible for the user to request a VLAN modification through -- cgit v1.2.3 From 6429e46304ac7820eebbea2bf5d73b90c18e0e06 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Thu, 28 Oct 2021 10:47:21 +0100 Subject: libfs: Move shmem_exchange to simple_rename_exchange Move shmem_exchange and make it available to other callers. Suggested-by: Miklos Szeredi Signed-off-by: Lorenz Bauer Signed-off-by: Daniel Borkmann Acked-by: Miklos Szeredi Cc: Al Viro Cc: Christian Brauner Cc: Greg Kroah-Hartman Link: https://lore.kernel.org/bpf/20211028094724.59043-2-lmb@cloudflare.com --- fs/libfs.c | 24 ++++++++++++++++++++++++ include/linux/fs.h | 2 ++ mm/shmem.c | 24 +----------------------- 3 files changed, 27 insertions(+), 23 deletions(-) diff --git a/fs/libfs.c b/fs/libfs.c index 51b4de3b3447..1cf144dc9ed2 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -448,6 +448,30 @@ int simple_rmdir(struct inode *dir, struct dentry *dentry) } EXPORT_SYMBOL(simple_rmdir); +int simple_rename_exchange(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + bool old_is_dir = d_is_dir(old_dentry); + bool new_is_dir = d_is_dir(new_dentry); + + if (old_dir != new_dir && old_is_dir != new_is_dir) { + if (old_is_dir) { + drop_nlink(old_dir); + inc_nlink(new_dir); + } else { + drop_nlink(new_dir); + inc_nlink(old_dir); + } + } + old_dir->i_ctime = old_dir->i_mtime = + new_dir->i_ctime = new_dir->i_mtime = + d_inode(old_dentry)->i_ctime = + d_inode(new_dentry)->i_ctime = current_time(old_dir); + + return 0; +} +EXPORT_SYMBOL_GPL(simple_rename_exchange); + int simple_rename(struct user_namespace *mnt_userns, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) diff --git a/include/linux/fs.h b/include/linux/fs.h index f3cfca5edc9a..bc4e97b82ddd 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3383,6 +3383,8 @@ extern int simple_open(struct inode *inode, struct file *file); extern int simple_link(struct dentry *, struct inode *, struct dentry *); extern int simple_unlink(struct inode *, struct dentry *); extern int simple_rmdir(struct inode *, struct dentry *); +extern int simple_rename_exchange(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry); extern int simple_rename(struct user_namespace *, struct inode *, struct dentry *, struct inode *, struct dentry *, unsigned int); diff --git a/mm/shmem.c b/mm/shmem.c index 17e344e26e73..56616aabe0a1 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2947,28 +2947,6 @@ static int shmem_rmdir(struct inode *dir, struct dentry *dentry) return shmem_unlink(dir, dentry); } -static int shmem_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) -{ - bool old_is_dir = d_is_dir(old_dentry); - bool new_is_dir = d_is_dir(new_dentry); - - if (old_dir != new_dir && old_is_dir != new_is_dir) { - if (old_is_dir) { - drop_nlink(old_dir); - inc_nlink(new_dir); - } else { - drop_nlink(new_dir); - inc_nlink(old_dir); - } - } - old_dir->i_ctime = old_dir->i_mtime = - new_dir->i_ctime = new_dir->i_mtime = - d_inode(old_dentry)->i_ctime = - d_inode(new_dentry)->i_ctime = current_time(old_dir); - - return 0; -} - static int shmem_whiteout(struct user_namespace *mnt_userns, struct inode *old_dir, struct dentry *old_dentry) { @@ -3014,7 +2992,7 @@ static int shmem_rename2(struct user_namespace *mnt_userns, return -EINVAL; if (flags & RENAME_EXCHANGE) - return shmem_exchange(old_dir, old_dentry, new_dir, new_dentry); + return simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry); if (!simple_empty(new_dentry)) return -ENOTEMPTY; -- cgit v1.2.3 From 3871cb8cf741dcd8ebaec4f960be9479da2f176b Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Thu, 28 Oct 2021 10:47:22 +0100 Subject: libfs: Support RENAME_EXCHANGE in simple_rename() Allow atomic exchange via RENAME_EXCHANGE when using simple_rename. This affects binderfs, ramfs, hubetlbfs and bpffs. Signed-off-by: Lorenz Bauer Signed-off-by: Daniel Borkmann Acked-by: Christian Brauner Cc: Al Viro Cc: Miklos Szeredi Cc: Greg Kroah-Hartman Link: https://lore.kernel.org/bpf/20211028094724.59043-3-lmb@cloudflare.com --- fs/libfs.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/libfs.c b/fs/libfs.c index 1cf144dc9ed2..ba7438ab9371 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -479,9 +479,12 @@ int simple_rename(struct user_namespace *mnt_userns, struct inode *old_dir, struct inode *inode = d_inode(old_dentry); int they_are_dirs = d_is_dir(old_dentry); - if (flags & ~RENAME_NOREPLACE) + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) return -EINVAL; + if (flags & RENAME_EXCHANGE) + return simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry); + if (!simple_empty(new_dentry)) return -ENOTEMPTY; -- cgit v1.2.3 From 9fc23c22e5745decc93ba5789bdcf2b093f21145 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Thu, 28 Oct 2021 10:47:23 +0100 Subject: selftests/bpf: Convert test_bpffs to ASSERT macros Remove usage of deprecated CHECK macros. Signed-off-by: Lorenz Bauer Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20211028094724.59043-4-lmb@cloudflare.com --- .../testing/selftests/bpf/prog_tests/test_bpffs.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/test_bpffs.c b/tools/testing/selftests/bpf/prog_tests/test_bpffs.c index 172c999e523c..533e3f3a459a 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_bpffs.c +++ b/tools/testing/selftests/bpf/prog_tests/test_bpffs.c @@ -29,43 +29,43 @@ static int read_iter(char *file) static int fn(void) { - int err, duration = 0; + int err; err = unshare(CLONE_NEWNS); - if (CHECK(err, "unshare", "failed: %d\n", errno)) + if (!ASSERT_OK(err, "unshare")) goto out; err = mount("", "/", "", MS_REC | MS_PRIVATE, NULL); - if (CHECK(err, "mount /", "failed: %d\n", errno)) + if (!ASSERT_OK(err, "mount /")) goto out; err = umount(TDIR); - if (CHECK(err, "umount " TDIR, "failed: %d\n", errno)) + if (!ASSERT_OK(err, "umount " TDIR)) goto out; err = mount("none", TDIR, "tmpfs", 0, NULL); - if (CHECK(err, "mount", "mount root failed: %d\n", errno)) + if (!ASSERT_OK(err, "mount tmpfs")) goto out; err = mkdir(TDIR "/fs1", 0777); - if (CHECK(err, "mkdir "TDIR"/fs1", "failed: %d\n", errno)) + if (!ASSERT_OK(err, "mkdir " TDIR "/fs1")) goto out; err = mkdir(TDIR "/fs2", 0777); - if (CHECK(err, "mkdir "TDIR"/fs2", "failed: %d\n", errno)) + if (!ASSERT_OK(err, "mkdir " TDIR "/fs2")) goto out; err = mount("bpf", TDIR "/fs1", "bpf", 0, NULL); - if (CHECK(err, "mount bpffs "TDIR"/fs1", "failed: %d\n", errno)) + if (!ASSERT_OK(err, "mount bpffs " TDIR "/fs1")) goto out; err = mount("bpf", TDIR "/fs2", "bpf", 0, NULL); - if (CHECK(err, "mount bpffs " TDIR "/fs2", "failed: %d\n", errno)) + if (!ASSERT_OK(err, "mount bpffs " TDIR "/fs2")) goto out; err = read_iter(TDIR "/fs1/maps.debug"); - if (CHECK(err, "reading " TDIR "/fs1/maps.debug", "failed\n")) + if (!ASSERT_OK(err, "reading " TDIR "/fs1/maps.debug")) goto out; err = read_iter(TDIR "/fs2/progs.debug"); - if (CHECK(err, "reading " TDIR "/fs2/progs.debug", "failed\n")) + if (!ASSERT_OK(err, "reading " TDIR "/fs2/progs.debug")) goto out; out: umount(TDIR "/fs1"); -- cgit v1.2.3 From 7e5ad817ec297f91a2fa5c423a39a458a4701bca Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Thu, 28 Oct 2021 10:47:24 +0100 Subject: selftests/bpf: Test RENAME_EXCHANGE and RENAME_NOREPLACE on bpffs Add tests to exercise the behaviour of RENAME_EXCHANGE and RENAME_NOREPLACE on bpffs. The former checks that after an exchange the inode of two directories has changed. The latter checks that the source still exists after a failed rename. Generally, having support for renameat2(RENAME_EXCHANGE) in bpffs fixes atomic upgrades of our sk_lookup control plane. Signed-off-by: Lorenz Bauer Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20211028094724.59043-5-lmb@cloudflare.com --- .../testing/selftests/bpf/prog_tests/test_bpffs.c | 65 +++++++++++++++++++++- 1 file changed, 64 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/test_bpffs.c b/tools/testing/selftests/bpf/prog_tests/test_bpffs.c index 533e3f3a459a..d29ebfeef9c5 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_bpffs.c +++ b/tools/testing/selftests/bpf/prog_tests/test_bpffs.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2020 Facebook */ #define _GNU_SOURCE +#include #include #include #include @@ -29,7 +30,8 @@ static int read_iter(char *file) static int fn(void) { - int err; + struct stat a, b, c; + int err, map; err = unshare(CLONE_NEWNS); if (!ASSERT_OK(err, "unshare")) @@ -67,6 +69,67 @@ static int fn(void) err = read_iter(TDIR "/fs2/progs.debug"); if (!ASSERT_OK(err, "reading " TDIR "/fs2/progs.debug")) goto out; + + err = mkdir(TDIR "/fs1/a", 0777); + if (!ASSERT_OK(err, "creating " TDIR "/fs1/a")) + goto out; + err = mkdir(TDIR "/fs1/a/1", 0777); + if (!ASSERT_OK(err, "creating " TDIR "/fs1/a/1")) + goto out; + err = mkdir(TDIR "/fs1/b", 0777); + if (!ASSERT_OK(err, "creating " TDIR "/fs1/b")) + goto out; + + map = bpf_create_map(BPF_MAP_TYPE_ARRAY, 4, 4, 1, 0); + if (!ASSERT_GT(map, 0, "create_map(ARRAY)")) + goto out; + err = bpf_obj_pin(map, TDIR "/fs1/c"); + if (!ASSERT_OK(err, "pin map")) + goto out; + close(map); + + /* Check that RENAME_EXCHANGE works for directories. */ + err = stat(TDIR "/fs1/a", &a); + if (!ASSERT_OK(err, "stat(" TDIR "/fs1/a)")) + goto out; + err = renameat2(0, TDIR "/fs1/a", 0, TDIR "/fs1/b", RENAME_EXCHANGE); + if (!ASSERT_OK(err, "renameat2(/fs1/a, /fs1/b, RENAME_EXCHANGE)")) + goto out; + err = stat(TDIR "/fs1/b", &b); + if (!ASSERT_OK(err, "stat(" TDIR "/fs1/b)")) + goto out; + if (!ASSERT_EQ(a.st_ino, b.st_ino, "b should have a's inode")) + goto out; + err = access(TDIR "/fs1/b/1", F_OK); + if (!ASSERT_OK(err, "access(" TDIR "/fs1/b/1)")) + goto out; + + /* Check that RENAME_EXCHANGE works for mixed file types. */ + err = stat(TDIR "/fs1/c", &c); + if (!ASSERT_OK(err, "stat(" TDIR "/fs1/map)")) + goto out; + err = renameat2(0, TDIR "/fs1/c", 0, TDIR "/fs1/b", RENAME_EXCHANGE); + if (!ASSERT_OK(err, "renameat2(/fs1/c, /fs1/b, RENAME_EXCHANGE)")) + goto out; + err = stat(TDIR "/fs1/b", &b); + if (!ASSERT_OK(err, "stat(" TDIR "/fs1/b)")) + goto out; + if (!ASSERT_EQ(c.st_ino, b.st_ino, "b should have c's inode")) + goto out; + err = access(TDIR "/fs1/c/1", F_OK); + if (!ASSERT_OK(err, "access(" TDIR "/fs1/c/1)")) + goto out; + + /* Check that RENAME_NOREPLACE works. */ + err = renameat2(0, TDIR "/fs1/b", 0, TDIR "/fs1/a", RENAME_NOREPLACE); + if (!ASSERT_ERR(err, "renameat2(RENAME_NOREPLACE)")) { + err = -EINVAL; + goto out; + } + err = access(TDIR "/fs1/b", F_OK); + if (!ASSERT_OK(err, "access(" TDIR "/fs1/b)")) + goto out; + out: umount(TDIR "/fs1"); umount(TDIR "/fs2"); -- cgit v1.2.3 From 401a33da3a45cc05859b121314f8ab52c2c01977 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 2 Nov 2021 22:41:13 -0700 Subject: selftests/bpf: Make netcnt selftests serial to avoid spurious failures When running `./test_progs -j` test_netcnt fails with a very high probability, undercounting number of packets received (9999 vs expected 10000). It seems to be conflicting with other cgroup/skb selftests. So make it serial for now to make parallel mode more robust. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20211103054113.2130582-1-andrii@kernel.org --- tools/testing/selftests/bpf/prog_tests/netcnt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/netcnt.c b/tools/testing/selftests/bpf/prog_tests/netcnt.c index 6ede48bde91b..954964f0ac3d 100644 --- a/tools/testing/selftests/bpf/prog_tests/netcnt.c +++ b/tools/testing/selftests/bpf/prog_tests/netcnt.c @@ -8,7 +8,7 @@ #define CG_NAME "/netcnt" -void test_netcnt(void) +void serial_test_netcnt(void) { union percpu_net_cnt *percpu_netcnt = NULL; struct bpf_cgroup_storage_key key; -- cgit v1.2.3 From f30d4968e9aee737e174fc97942af46cfb49b484 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 1 Nov 2021 23:45:35 -0700 Subject: bpf: Do not reject when the stack read size is different from the tracked scalar size Below is a simplified case from a report in bcc [0]: r4 = 20 *(u32 *)(r10 -4) = r4 *(u32 *)(r10 -8) = r4 /* r4 state is tracked */ r4 = *(u64 *)(r10 -8) /* Read more than the tracked 32bit scalar. * verifier rejects as 'corrupted spill memory'. */ After commit 354e8f1970f8 ("bpf: Support <8-byte scalar spill and refill"), the 8-byte aligned 32bit spill is also tracked by the verifier and the register state is stored. However, if 8 bytes are read from the stack instead of the tracked 4 byte scalar, then verifier currently rejects the program as "corrupted spill memory". This patch fixes this case by allowing it to read but marks the register as unknown. Also note that, if the prog is trying to corrupt/leak an earlier spilled pointer by spilling another <8 bytes register on top, this has already been rejected in the check_stack_write_fixed_off(). [0] https://github.com/iovisor/bcc/pull/3683 Fixes: 354e8f1970f8 ("bpf: Support <8-byte scalar spill and refill") Reported-by: Hengqi Chen Reported-by: Yonghong Song Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Tested-by: Hengqi Chen Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20211102064535.316018-1-kafai@fb.com --- kernel/bpf/verifier.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f0dca726ebfd..5f8d9128860a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3088,9 +3088,12 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, reg = ®_state->stack[spi].spilled_ptr; if (is_spilled_reg(®_state->stack[spi])) { - if (size != BPF_REG_SIZE) { - u8 scalar_size = 0; + u8 spill_size = 1; + + for (i = BPF_REG_SIZE - 1; i > 0 && stype[i - 1] == STACK_SPILL; i--) + spill_size++; + if (size != BPF_REG_SIZE || spill_size != BPF_REG_SIZE) { if (reg->type != SCALAR_VALUE) { verbose_linfo(env, env->insn_idx, "; "); verbose(env, "invalid size of register fill\n"); @@ -3101,10 +3104,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, if (dst_regno < 0) return 0; - for (i = BPF_REG_SIZE; i > 0 && stype[i - 1] == STACK_SPILL; i--) - scalar_size++; - - if (!(off % BPF_REG_SIZE) && size == scalar_size) { + if (!(off % BPF_REG_SIZE) && size == spill_size) { /* The earlier check_reg_arg() has decided the * subreg_def for this insn. Save it first. */ @@ -3128,12 +3128,6 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, state->regs[dst_regno].live |= REG_LIVE_WRITTEN; return 0; } - for (i = 1; i < BPF_REG_SIZE; i++) { - if (stype[(slot - i) % BPF_REG_SIZE] != STACK_SPILL) { - verbose(env, "corrupted spill memory\n"); - return -EACCES; - } - } if (dst_regno >= 0) { /* restore register state from stack */ -- cgit v1.2.3 From c08455dec5acf4668f5d1eb099f7fedb29f2de5f Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 1 Nov 2021 23:45:41 -0700 Subject: selftests/bpf: Verifier test on refill from a smaller spill This patch adds a verifier test to ensure the verifier can read 8 bytes from the stack after two 32bit write at fp-4 and fp-8. The test is similar to the reported case from bcc [0]. [0] https://github.com/iovisor/bcc/pull/3683 Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20211102064541.316414-1-kafai@fb.com --- tools/testing/selftests/bpf/verifier/spill_fill.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tools/testing/selftests/bpf/verifier/spill_fill.c b/tools/testing/selftests/bpf/verifier/spill_fill.c index c9991c3f3bd2..7ab3de108761 100644 --- a/tools/testing/selftests/bpf/verifier/spill_fill.c +++ b/tools/testing/selftests/bpf/verifier/spill_fill.c @@ -265,3 +265,20 @@ .result = ACCEPT, .prog_type = BPF_PROG_TYPE_SCHED_CLS, }, +{ + "Spill a u32 scalar at fp-4 and then at fp-8", + .insns = { + /* r4 = 4321 */ + BPF_MOV32_IMM(BPF_REG_4, 4321), + /* *(u32 *)(r10 -4) = r4 */ + BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_4, -4), + /* *(u32 *)(r10 -8) = r4 */ + BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_4, -8), + /* r4 = *(u64 *)(r10 -8) */ + BPF_LDX_MEM(BPF_DW, BPF_REG_4, BPF_REG_10, -8), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, +}, -- cgit v1.2.3 From 1a8c7778bcde5981463a5b9f9b2caa44a327ff93 Mon Sep 17 00:00:00 2001 From: Brett Creeley Date: Fri, 26 Feb 2021 13:19:23 -0800 Subject: ice: Fix VF true promiscuous mode When a VF requests promiscuous mode and it's trusted and true promiscuous mode is enabled the PF driver attempts to enable unicast and/or multicast promiscuous mode filters based on the request. This is fine, but there are a couple issues with the current code. [1] The define to configure the unicast promiscuous mode mask also includes bits to configure the multicast promiscuous mode mask, which causes multicast to be set/cleared unintentionally. [2] All 4 cases for enable/disable unicast/multicast mode are not handled in the promiscuous mode message handler, which causes unexpected results regarding the current promiscuous mode settings. To fix [1] make sure any promiscuous mask defines include the correct bits for each of the promiscuous modes. To fix [2] make sure that all 4 cases are handled since there are 2 bits (FLAG_VF_UNICAST_PROMISC and FLAG_VF_MULTICAST_PROMISC) that can be either set or cleared. Also, since either unicast and/or multicast promiscuous configuration can fail, introduce two separate error values to handle each of these cases. Fixes: 01b5e89aab49 ("ice: Add VF promiscuous support") Signed-off-by: Brett Creeley Tested-by: Tony Brelinski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice.h | 5 +- drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c | 78 ++++++++++++------------ 2 files changed, 40 insertions(+), 43 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h index bf4ecd9a517c..b2db39ee5f85 100644 --- a/drivers/net/ethernet/intel/ice/ice.h +++ b/drivers/net/ethernet/intel/ice/ice.h @@ -165,13 +165,10 @@ #define ice_for_each_chnl_tc(i) \ for ((i) = ICE_CHNL_START_TC; (i) < ICE_CHNL_MAX_TC; (i)++) -#define ICE_UCAST_PROMISC_BITS (ICE_PROMISC_UCAST_TX | ICE_PROMISC_MCAST_TX | \ - ICE_PROMISC_UCAST_RX | ICE_PROMISC_MCAST_RX) +#define ICE_UCAST_PROMISC_BITS (ICE_PROMISC_UCAST_TX | ICE_PROMISC_UCAST_RX) #define ICE_UCAST_VLAN_PROMISC_BITS (ICE_PROMISC_UCAST_TX | \ - ICE_PROMISC_MCAST_TX | \ ICE_PROMISC_UCAST_RX | \ - ICE_PROMISC_MCAST_RX | \ ICE_PROMISC_VLAN_TX | \ ICE_PROMISC_VLAN_RX) diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c index 2ac21484b876..9b699419c933 100644 --- a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c @@ -3013,6 +3013,7 @@ bool ice_is_any_vf_in_promisc(struct ice_pf *pf) static int ice_vc_cfg_promiscuous_mode_msg(struct ice_vf *vf, u8 *msg) { enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; + enum ice_status mcast_status = 0, ucast_status = 0; bool rm_promisc, alluni = false, allmulti = false; struct virtchnl_promisc_info *info = (struct virtchnl_promisc_info *)msg; @@ -3105,52 +3106,51 @@ static int ice_vc_cfg_promiscuous_mode_msg(struct ice_vf *vf, u8 *msg) goto error_param; } } else { - enum ice_status status; - u8 promisc_m; - - if (alluni) { - if (vf->port_vlan_info || vsi->num_vlan) - promisc_m = ICE_UCAST_VLAN_PROMISC_BITS; - else - promisc_m = ICE_UCAST_PROMISC_BITS; - } else if (allmulti) { - if (vf->port_vlan_info || vsi->num_vlan) - promisc_m = ICE_MCAST_VLAN_PROMISC_BITS; - else - promisc_m = ICE_MCAST_PROMISC_BITS; + u8 mcast_m, ucast_m; + + if (vf->port_vlan_info || vsi->num_vlan > 1) { + mcast_m = ICE_MCAST_VLAN_PROMISC_BITS; + ucast_m = ICE_UCAST_VLAN_PROMISC_BITS; } else { - if (vf->port_vlan_info || vsi->num_vlan) - promisc_m = ICE_UCAST_VLAN_PROMISC_BITS; - else - promisc_m = ICE_UCAST_PROMISC_BITS; + mcast_m = ICE_MCAST_PROMISC_BITS; + ucast_m = ICE_UCAST_PROMISC_BITS; } - /* Configure multicast/unicast with or without VLAN promiscuous - * mode - */ - status = ice_vf_set_vsi_promisc(vf, vsi, promisc_m, rm_promisc); - if (status) { - dev_err(dev, "%sable Tx/Rx filter promiscuous mode on VF-%d failed, error: %s\n", - rm_promisc ? "dis" : "en", vf->vf_id, - ice_stat_str(status)); - v_ret = ice_err_to_virt_err(status); - goto error_param; - } else { - dev_dbg(dev, "%sable Tx/Rx filter promiscuous mode on VF-%d succeeded\n", - rm_promisc ? "dis" : "en", vf->vf_id); + ucast_status = ice_vf_set_vsi_promisc(vf, vsi, ucast_m, + !alluni); + if (ucast_status) { + dev_err(dev, "%sable Tx/Rx filter promiscuous mode on VF-%d failed\n", + alluni ? "en" : "dis", vf->vf_id); + v_ret = ice_err_to_virt_err(ucast_status); + } + + mcast_status = ice_vf_set_vsi_promisc(vf, vsi, mcast_m, + !allmulti); + if (mcast_status) { + dev_err(dev, "%sable Tx/Rx filter promiscuous mode on VF-%d failed\n", + allmulti ? "en" : "dis", vf->vf_id); + v_ret = ice_err_to_virt_err(mcast_status); } } - if (allmulti && - !test_and_set_bit(ICE_VF_STATE_MC_PROMISC, vf->vf_states)) - dev_info(dev, "VF %u successfully set multicast promiscuous mode\n", vf->vf_id); - else if (!allmulti && test_and_clear_bit(ICE_VF_STATE_MC_PROMISC, vf->vf_states)) - dev_info(dev, "VF %u successfully unset multicast promiscuous mode\n", vf->vf_id); + if (!mcast_status) { + if (allmulti && + !test_and_set_bit(ICE_VF_STATE_MC_PROMISC, vf->vf_states)) + dev_info(dev, "VF %u successfully set multicast promiscuous mode\n", + vf->vf_id); + else if (!allmulti && test_and_clear_bit(ICE_VF_STATE_MC_PROMISC, vf->vf_states)) + dev_info(dev, "VF %u successfully unset multicast promiscuous mode\n", + vf->vf_id); + } - if (alluni && !test_and_set_bit(ICE_VF_STATE_UC_PROMISC, vf->vf_states)) - dev_info(dev, "VF %u successfully set unicast promiscuous mode\n", vf->vf_id); - else if (!alluni && test_and_clear_bit(ICE_VF_STATE_UC_PROMISC, vf->vf_states)) - dev_info(dev, "VF %u successfully unset unicast promiscuous mode\n", vf->vf_id); + if (!ucast_status) { + if (alluni && !test_and_set_bit(ICE_VF_STATE_UC_PROMISC, vf->vf_states)) + dev_info(dev, "VF %u successfully set unicast promiscuous mode\n", + vf->vf_id); + else if (!alluni && test_and_clear_bit(ICE_VF_STATE_UC_PROMISC, vf->vf_states)) + dev_info(dev, "VF %u successfully unset unicast promiscuous mode\n", + vf->vf_id); + } error_param: return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_CONFIG_PROMISCUOUS_MODE, -- cgit v1.2.3 From 0299faeaf8eb982103e4388af61fd94feb9c2d9f Mon Sep 17 00:00:00 2001 From: Brett Creeley Date: Wed, 5 May 2021 14:17:57 -0700 Subject: ice: Remove toggling of antispoof for VF trusted promiscuous mode Currently when a trusted VF enables promiscuous mode spoofchk will be disabled. This is wrong and should only be modified from the ndo_set_vf_spoofchk callback. Fix this by removing the call to toggle spoofchk for trusted VFs. Fixes: 01b5e89aab49 ("ice: Add VF promiscuous support") Signed-off-by: Brett Creeley Tested-by: Tony Brelinski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c index 9b699419c933..3f8f94732a1f 100644 --- a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c @@ -3055,24 +3055,6 @@ static int ice_vc_cfg_promiscuous_mode_msg(struct ice_vf *vf, u8 *msg) rm_promisc = !allmulti && !alluni; if (vsi->num_vlan || vf->port_vlan_info) { - struct ice_vsi *pf_vsi = ice_get_main_vsi(pf); - struct net_device *pf_netdev; - - if (!pf_vsi) { - v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; - } - - pf_netdev = pf_vsi->netdev; - - ret = ice_set_vf_spoofchk(pf_netdev, vf->vf_id, rm_promisc); - if (ret) { - dev_err(dev, "Failed to update spoofchk to %s for VF %d VSI %d when setting promiscuous mode\n", - rm_promisc ? "ON" : "OFF", vf->vf_id, - vsi->vsi_num); - v_ret = VIRTCHNL_STATUS_ERR_PARAM; - } - if (rm_promisc) ret = ice_cfg_vlan_pruning(vsi, true); else -- cgit v1.2.3 From ce572a5b88d5ca6737b5e23da9892792fd708ad3 Mon Sep 17 00:00:00 2001 From: Sylwester Dziedziuch Date: Thu, 6 May 2021 08:40:03 -0700 Subject: ice: Fix replacing VF hardware MAC to existing MAC filter VF was not able to change its hardware MAC address in case the new address was already present in the MAC filter list. Change the handling of VF add mac request to not return if requested MAC address is already present on the list and check if its hardware MAC needs to be updated in this case. Fixes: ed4c068d46f6 ("ice: Enable ip link show on the PF to display VF unicast MAC(s)") Signed-off-by: Sylwester Dziedziuch Tested-by: Tony Brelinski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c index 3f8f94732a1f..650ad7f56829 100644 --- a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c @@ -3806,6 +3806,7 @@ ice_vc_add_mac_addr(struct ice_vf *vf, struct ice_vsi *vsi, struct device *dev = ice_pf_to_dev(vf->pf); u8 *mac_addr = vc_ether_addr->addr; enum ice_status status; + int ret = 0; /* device MAC already added */ if (ether_addr_equal(mac_addr, vf->dev_lan_addr.addr)) @@ -3818,20 +3819,23 @@ ice_vc_add_mac_addr(struct ice_vf *vf, struct ice_vsi *vsi, status = ice_fltr_add_mac(vsi, mac_addr, ICE_FWD_TO_VSI); if (status == ICE_ERR_ALREADY_EXISTS) { - dev_err(dev, "MAC %pM already exists for VF %d\n", mac_addr, + dev_dbg(dev, "MAC %pM already exists for VF %d\n", mac_addr, vf->vf_id); - return -EEXIST; + /* don't return since we might need to update + * the primary MAC in ice_vfhw_mac_add() below + */ + ret = -EEXIST; } else if (status) { dev_err(dev, "Failed to add MAC %pM for VF %d\n, error %s\n", mac_addr, vf->vf_id, ice_stat_str(status)); return -EIO; + } else { + vf->num_mac++; } ice_vfhw_mac_add(vf, vc_ether_addr); - vf->num_mac++; - - return 0; + return ret; } /** -- cgit v1.2.3 From b385cca47363316c6d9a74ae9db407bbc281f815 Mon Sep 17 00:00:00 2001 From: Brett Creeley Date: Thu, 9 Sep 2021 14:38:08 -0700 Subject: ice: Fix not stopping Tx queues for VFs When a VF is removed and/or reset its Tx queues need to be stopped from the PF. This is done by calling the ice_dis_vf_qs() function, which calls ice_vsi_stop_lan_tx_rings(). Currently ice_dis_vf_qs() is protected by the VF state bit ICE_VF_STATE_QS_ENA. Unfortunately, this is causing the Tx queues to not be disabled in some cases and when the VF tries to re-enable/reconfigure its Tx queues over virtchnl the op is failing. This is because a VF can be reset and/or removed before the ICE_VF_STATE_QS_ENA bit is set, but the Tx queues were already configured via ice_vsi_cfg_single_txq() in the VIRTCHNL_OP_CONFIG_VSI_QUEUES op. However, the ICE_VF_STATE_QS_ENA bit is set on a successful VIRTCHNL_OP_ENABLE_QUEUES, which will always happen after the VIRTCHNL_OP_CONFIG_VSI_QUEUES op. This was causing the following error message when loading the ice driver, creating VFs, and modifying VF trust in an endless loop: [35274.192484] ice 0000:88:00.0: Failed to set LAN Tx queue context, error: ICE_ERR_PARAM [35274.193074] ice 0000:88:00.0: VF 0 failed opcode 6, retval: -5 [35274.193640] iavf 0000:88:01.0: PF returned error -5 (IAVF_ERR_PARAM) to our request 6 Fix this by always calling ice_dis_vf_qs() and silencing the error message in ice_vsi_stop_tx_ring() since the calling code ignores the return anyway. Also, all other places that call ice_vsi_stop_tx_ring() catch the error, so this doesn't affect those flows since there was no change to the values the function returns. Other solutions were considered (i.e. tracking which VF queues had been "started/configured" in VIRTCHNL_OP_CONFIG_VSI_QUEUES, but it seemed more complicated than it was worth. This solution also brings in the chance for other unexpected conditions due to invalid state bit checks. So, the proposed solution seemed like the best option since there is no harm in failing to stop Tx queues that were never started. This issue can be seen using the following commands: for i in {0..50}; do rmmod ice modprobe ice sleep 1 echo 1 > /sys/class/net/ens785f0/device/sriov_numvfs echo 1 > /sys/class/net/ens785f1/device/sriov_numvfs ip link set ens785f1 vf 0 trust on ip link set ens785f0 vf 0 trust on sleep 2 echo 0 > /sys/class/net/ens785f0/device/sriov_numvfs echo 0 > /sys/class/net/ens785f1/device/sriov_numvfs sleep 1 echo 1 > /sys/class/net/ens785f0/device/sriov_numvfs echo 1 > /sys/class/net/ens785f1/device/sriov_numvfs ip link set ens785f1 vf 0 trust on ip link set ens785f0 vf 0 trust on done Fixes: 77ca27c41705 ("ice: add support for virtchnl_queue_select.[tx|rx]_queues bitmap") Signed-off-by: Brett Creeley Tested-by: Konrad Jankowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_base.c | 2 +- drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_base.c b/drivers/net/ethernet/intel/ice/ice_base.c index fa6cd63cbf1f..1efc635cc0f5 100644 --- a/drivers/net/ethernet/intel/ice/ice_base.c +++ b/drivers/net/ethernet/intel/ice/ice_base.c @@ -962,7 +962,7 @@ ice_vsi_stop_tx_ring(struct ice_vsi *vsi, enum ice_disq_rst_src rst_src, } else if (status == ICE_ERR_DOES_NOT_EXIST) { dev_dbg(ice_pf_to_dev(vsi->back), "LAN Tx queues do not exist, nothing to disable\n"); } else if (status) { - dev_err(ice_pf_to_dev(vsi->back), "Failed to disable LAN Tx queues, error: %s\n", + dev_dbg(ice_pf_to_dev(vsi->back), "Failed to disable LAN Tx queues, error: %s\n", ice_stat_str(status)); return -ENODEV; } diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c index 650ad7f56829..3f727df3b6fb 100644 --- a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c @@ -638,8 +638,7 @@ void ice_free_vfs(struct ice_pf *pf) /* Avoid wait time by stopping all VFs at the same time */ ice_for_each_vf(pf, i) - if (test_bit(ICE_VF_STATE_QS_ENA, pf->vf[i].vf_states)) - ice_dis_vf_qs(&pf->vf[i]); + ice_dis_vf_qs(&pf->vf[i]); tmp = pf->num_alloc_vfs; pf->num_qps_per_vf = 0; @@ -1695,8 +1694,7 @@ bool ice_reset_vf(struct ice_vf *vf, bool is_vflr) vsi = ice_get_vf_vsi(vf); - if (test_bit(ICE_VF_STATE_QS_ENA, vf->vf_states)) - ice_dis_vf_qs(vf); + ice_dis_vf_qs(vf); /* Call Disable LAN Tx queue AQ whether or not queues are * enabled. This is needed for successful completion of VFR. -- cgit v1.2.3 From e6ba5273d4ede03d075d7a116b8edad1f6115f4d Mon Sep 17 00:00:00 2001 From: Brett Creeley Date: Thu, 9 Sep 2021 14:38:09 -0700 Subject: ice: Fix race conditions between virtchnl handling and VF ndo ops The VF can be configured via the PF's ndo ops at the same time the PF is receiving/handling virtchnl messages. This has many issues, with one of them being the ndo op could be actively resetting a VF (i.e. resetting it to the default state and deleting/re-adding the VF's VSI) while a virtchnl message is being handled. The following error was seen because a VF ndo op was used to change a VF's trust setting while the VIRTCHNL_OP_CONFIG_VSI_QUEUES was ongoing: [35274.192484] ice 0000:88:00.0: Failed to set LAN Tx queue context, error: ICE_ERR_PARAM [35274.193074] ice 0000:88:00.0: VF 0 failed opcode 6, retval: -5 [35274.193640] iavf 0000:88:01.0: PF returned error -5 (IAVF_ERR_PARAM) to our request 6 Fix this by making sure the virtchnl handling and VF ndo ops that trigger VF resets cannot run concurrently. This is done by adding a struct mutex cfg_lock to each VF structure. For VF ndo ops, the mutex will be locked around the critical operations and VFR. Since the ndo ops will trigger a VFR, the virtchnl thread will use mutex_trylock(). This is done because if any other thread (i.e. VF ndo op) has the mutex, then that means the current VF message being handled is no longer valid, so just ignore it. This issue can be seen using the following commands: for i in {0..50}; do rmmod ice modprobe ice sleep 1 echo 1 > /sys/class/net/ens785f0/device/sriov_numvfs echo 1 > /sys/class/net/ens785f1/device/sriov_numvfs ip link set ens785f1 vf 0 trust on ip link set ens785f0 vf 0 trust on sleep 2 echo 0 > /sys/class/net/ens785f0/device/sriov_numvfs echo 0 > /sys/class/net/ens785f1/device/sriov_numvfs sleep 1 echo 1 > /sys/class/net/ens785f0/device/sriov_numvfs echo 1 > /sys/class/net/ens785f1/device/sriov_numvfs ip link set ens785f1 vf 0 trust on ip link set ens785f0 vf 0 trust on done Fixes: 7c710869d64e ("ice: Add handlers for VF netdevice operations") Signed-off-by: Brett Creeley Tested-by: Konrad Jankowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c | 25 ++++++++++++++++++++++++ drivers/net/ethernet/intel/ice/ice_virtchnl_pf.h | 5 +++++ 2 files changed, 30 insertions(+) diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c index 3f727df3b6fb..217ff5e9a6f1 100644 --- a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c @@ -650,6 +650,8 @@ void ice_free_vfs(struct ice_pf *pf) set_bit(ICE_VF_STATE_DIS, pf->vf[i].vf_states); ice_free_vf_res(&pf->vf[i]); } + + mutex_destroy(&pf->vf[i].cfg_lock); } if (ice_sriov_free_msix_res(pf)) @@ -1946,6 +1948,8 @@ static void ice_set_dflt_settings_vfs(struct ice_pf *pf) ice_vf_fdir_init(vf); ice_vc_set_dflt_vf_ops(&vf->vc_ops); + + mutex_init(&vf->cfg_lock); } } @@ -4135,6 +4139,8 @@ ice_set_vf_port_vlan(struct net_device *netdev, int vf_id, u16 vlan_id, u8 qos, return 0; } + mutex_lock(&vf->cfg_lock); + vf->port_vlan_info = vlanprio; if (vf->port_vlan_info) @@ -4144,6 +4150,7 @@ ice_set_vf_port_vlan(struct net_device *netdev, int vf_id, u16 vlan_id, u8 qos, dev_info(dev, "Clearing port VLAN on VF %d\n", vf_id); ice_vc_reset_vf(vf); + mutex_unlock(&vf->cfg_lock); return 0; } @@ -4683,6 +4690,15 @@ error_handler: return; } + /* VF is being configured in another context that triggers a VFR, so no + * need to process this message + */ + if (!mutex_trylock(&vf->cfg_lock)) { + dev_info(dev, "VF %u is being configured in another context that will trigger a VFR, so there is no need to handle this message\n", + vf->vf_id); + return; + } + switch (v_opcode) { case VIRTCHNL_OP_VERSION: err = ops->get_ver_msg(vf, msg); @@ -4771,6 +4787,8 @@ error_handler: dev_info(dev, "PF failed to honor VF %d, opcode %d, error %d\n", vf_id, v_opcode, err); } + + mutex_unlock(&vf->cfg_lock); } /** @@ -4886,6 +4904,8 @@ int ice_set_vf_mac(struct net_device *netdev, int vf_id, u8 *mac) return -EINVAL; } + mutex_lock(&vf->cfg_lock); + /* VF is notified of its new MAC via the PF's response to the * VIRTCHNL_OP_GET_VF_RESOURCES message after the VF has been reset */ @@ -4904,6 +4924,7 @@ int ice_set_vf_mac(struct net_device *netdev, int vf_id, u8 *mac) } ice_vc_reset_vf(vf); + mutex_unlock(&vf->cfg_lock); return 0; } @@ -4938,11 +4959,15 @@ int ice_set_vf_trust(struct net_device *netdev, int vf_id, bool trusted) if (trusted == vf->trusted) return 0; + mutex_lock(&vf->cfg_lock); + vf->trusted = trusted; ice_vc_reset_vf(vf); dev_info(ice_pf_to_dev(pf), "VF %u is now %strusted\n", vf_id, trusted ? "" : "un"); + mutex_unlock(&vf->cfg_lock); + return 0; } diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.h b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.h index 5ff93a08f54c..7e28ecbbe7af 100644 --- a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.h +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.h @@ -100,6 +100,11 @@ struct ice_vc_vf_ops { struct ice_vf { struct ice_pf *pf; + /* Used during virtchnl message handling and NDO ops against the VF + * that will trigger a VFR + */ + struct mutex cfg_lock; + u16 vf_id; /* VF ID in the PF space */ u16 lan_vsi_idx; /* index into PF struct */ u16 ctrl_vsi_idx; -- cgit v1.2.3 From a985442fdecb59504e3a2f1cfdd3c53af017ea5b Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Thu, 4 Nov 2021 11:46:13 +0100 Subject: selftests: net: properly support IPv6 in GSO GRE test Explicitly pass -6 to netcat when the test is using IPv6 to prevent failures. Also make sure to pass "-N" to netcat to close the socket after EOF on the client side, otherwise we would always hit the timeout and the test would fail. Without this fix applied: TEST: GREv6/v4 - copy file w/ TSO [FAIL] TEST: GREv6/v4 - copy file w/ GSO [FAIL] TEST: GREv6/v6 - copy file w/ TSO [FAIL] TEST: GREv6/v6 - copy file w/ GSO [FAIL] With this fix applied: TEST: GREv6/v4 - copy file w/ TSO [ OK ] TEST: GREv6/v4 - copy file w/ GSO [ OK ] TEST: GREv6/v6 - copy file w/ TSO [ OK ] TEST: GREv6/v6 - copy file w/ GSO [ OK ] Fixes: 025efa0a82df ("selftests: add simple GSO GRE test") Signed-off-by: Andrea Righi Signed-off-by: David S. Miller --- tools/testing/selftests/net/gre_gso.sh | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/net/gre_gso.sh b/tools/testing/selftests/net/gre_gso.sh index facbb0c80443..fdeb44d621eb 100755 --- a/tools/testing/selftests/net/gre_gso.sh +++ b/tools/testing/selftests/net/gre_gso.sh @@ -116,17 +116,18 @@ gre_gst_test_checks() { local name=$1 local addr=$2 + local proto=$3 - $NS_EXEC nc -kl $port >/dev/null & + $NS_EXEC nc $proto -kl $port >/dev/null & PID=$! while ! $NS_EXEC ss -ltn | grep -q $port; do ((i++)); sleep 0.01; done - cat $TMPFILE | timeout 1 nc $addr $port + cat $TMPFILE | timeout 1 nc $proto -N $addr $port log_test $? 0 "$name - copy file w/ TSO" ethtool -K veth0 tso off - cat $TMPFILE | timeout 1 nc $addr $port + cat $TMPFILE | timeout 1 nc $proto -N $addr $port log_test $? 0 "$name - copy file w/ GSO" ethtool -K veth0 tso on @@ -155,7 +156,7 @@ gre6_gso_test() sleep 2 gre_gst_test_checks GREv6/v4 172.16.2.2 - gre_gst_test_checks GREv6/v6 2001:db8:1::2 + gre_gst_test_checks GREv6/v6 2001:db8:1::2 -6 cleanup } -- cgit v1.2.3 From 3b65abb8d8a650e50ff5448ac38992ef8a74c584 Mon Sep 17 00:00:00 2001 From: Leonard Crestez Date: Thu, 4 Nov 2021 00:17:51 +0200 Subject: tcp: Use BIT() for OPTION_* constants Extending these flags using the existing (1 << x) pattern triggers complaints from checkpatch. Instead of ignoring checkpatch modify the existing values to use BIT(x) style in a separate commit. Signed-off-by: Leonard Crestez Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 6f7860e283c6..2e6e5a70168e 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -408,13 +408,13 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp) return tp->snd_una != tp->snd_up; } -#define OPTION_SACK_ADVERTISE (1 << 0) -#define OPTION_TS (1 << 1) -#define OPTION_MD5 (1 << 2) -#define OPTION_WSCALE (1 << 3) -#define OPTION_FAST_OPEN_COOKIE (1 << 8) -#define OPTION_SMC (1 << 9) -#define OPTION_MPTCP (1 << 10) +#define OPTION_SACK_ADVERTISE BIT(0) +#define OPTION_TS BIT(1) +#define OPTION_MD5 BIT(2) +#define OPTION_WSCALE BIT(3) +#define OPTION_FAST_OPEN_COOKIE BIT(8) +#define OPTION_SMC BIT(9) +#define OPTION_MPTCP BIT(10) static void smc_options_write(__be32 *ptr, u16 *options) { -- cgit v1.2.3 From d00c8ee31729248ba40b4ab25cd3b3b580c6f87c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 3 Nov 2021 16:49:11 -0700 Subject: net: fix possible NULL deref in sock_reserve_memory Sanity check in sock_reserve_memory() was not enough to prevent malicious user to trigger a NULL deref. In this case, the isse is that sk_prot->memory_allocated is NULL. Use standard sk_has_account() helper to deal with this. BUG: KASAN: null-ptr-deref in instrument_atomic_read_write include/linux/instrumented.h:101 [inline] BUG: KASAN: null-ptr-deref in atomic_long_add_return include/linux/atomic/atomic-instrumented.h:1218 [inline] BUG: KASAN: null-ptr-deref in sk_memory_allocated_add include/net/sock.h:1371 [inline] BUG: KASAN: null-ptr-deref in sock_reserve_memory net/core/sock.c:994 [inline] BUG: KASAN: null-ptr-deref in sock_setsockopt+0x22ab/0x2b30 net/core/sock.c:1443 Write of size 8 at addr 0000000000000000 by task syz-executor.0/11270 CPU: 1 PID: 11270 Comm: syz-executor.0 Not tainted 5.15.0-syzkaller #0 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.14.0-2 04/01/2014 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106 __kasan_report mm/kasan/report.c:446 [inline] kasan_report.cold+0x66/0xdf mm/kasan/report.c:459 check_region_inline mm/kasan/generic.c:183 [inline] kasan_check_range+0x13d/0x180 mm/kasan/generic.c:189 instrument_atomic_read_write include/linux/instrumented.h:101 [inline] atomic_long_add_return include/linux/atomic/atomic-instrumented.h:1218 [inline] sk_memory_allocated_add include/net/sock.h:1371 [inline] sock_reserve_memory net/core/sock.c:994 [inline] sock_setsockopt+0x22ab/0x2b30 net/core/sock.c:1443 __sys_setsockopt+0x4f8/0x610 net/socket.c:2172 __do_sys_setsockopt net/socket.c:2187 [inline] __se_sys_setsockopt net/socket.c:2184 [inline] __x64_sys_setsockopt+0xba/0x150 net/socket.c:2184 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7f56076d5ae9 Code: ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 bc ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007f5604c4b188 EFLAGS: 00000246 ORIG_RAX: 0000000000000036 RAX: ffffffffffffffda RBX: 00007f56077e8f60 RCX: 00007f56076d5ae9 RDX: 0000000000000049 RSI: 0000000000000001 RDI: 0000000000000003 RBP: 00007f560772ff25 R08: 000000000000fec7 R09: 0000000000000000 R10: 0000000020000000 R11: 0000000000000246 R12: 0000000000000000 R13: 00007fffb61a100f R14: 00007f5604c4b300 R15: 0000000000022000 Fixes: 2bb2f5fb21b0 ("net: add new socket option SO_RESERVE_MEM") Signed-off-by: Eric Dumazet Reported-by: syzbot Acked-by: Wei Wang Signed-off-by: David S. Miller --- net/core/sock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/sock.c b/net/core/sock.c index 9862eefce21e..8f2b2f2c0e7b 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -976,7 +976,7 @@ static int sock_reserve_memory(struct sock *sk, int bytes) bool charged; int pages; - if (!mem_cgroup_sockets_enabled || !sk->sk_memcg) + if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk)) return -EOPNOTSUPP; if (!bytes) -- cgit v1.2.3 From a38bc45a08e9759f04d61669f45941d6624d173c Mon Sep 17 00:00:00 2001 From: Kleber Sacilotto de Souza Date: Mon, 1 Nov 2021 15:53:17 +0100 Subject: selftests/net: Fix reuseport_bpf_numa by skipping unavailable nodes In some platforms the numa node numbers are not necessarily consecutive, meaning that not all nodes from 0 to the value returned by numa_max_node() are available on the system. Using node numbers which are not available results on errors from libnuma such as: ---- IPv4 UDP ---- send node 0, receive socket 0 libnuma: Warning: Cannot read node cpumask from sysfs ./reuseport_bpf_numa: failed to pin to node: No such file or directory Fix it by checking if the node number bit is set on numa_nodes_ptr, which is defined on libnuma as "Set with all nodes the kernel has exposed to userspace". Signed-off-by: Kleber Sacilotto de Souza Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20211101145317.286118-1-kleber.souza@canonical.com --- tools/testing/selftests/net/reuseport_bpf_numa.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/testing/selftests/net/reuseport_bpf_numa.c b/tools/testing/selftests/net/reuseport_bpf_numa.c index c9f478b40996..b2eebf669b8c 100644 --- a/tools/testing/selftests/net/reuseport_bpf_numa.c +++ b/tools/testing/selftests/net/reuseport_bpf_numa.c @@ -211,12 +211,16 @@ static void test(int *rcv_fd, int len, int family, int proto) /* Forward iterate */ for (node = 0; node < len; ++node) { + if (!numa_bitmask_isbitset(numa_nodes_ptr, node)) + continue; send_from_node(node, family, proto); receive_on_node(rcv_fd, len, epfd, node, proto); } /* Reverse iterate */ for (node = len - 1; node >= 0; --node) { + if (!numa_bitmask_isbitset(numa_nodes_ptr, node)) + continue; send_from_node(node, family, proto); receive_on_node(rcv_fd, len, epfd, node, proto); } -- cgit v1.2.3 From 96d0c9be432dfd4908e96dde7cab860368a348ab Mon Sep 17 00:00:00 2001 From: Guo Zhengkui Date: Wed, 3 Nov 2021 20:16:06 +0800 Subject: devlink: fix flexible_array.cocci warning Fix following coccicheck warning: ./net/core/devlink.c:69:6-10: WARNING use flexible-array member instead Signed-off-by: Guo Zhengkui Link: https://lore.kernel.org/r/20211103121607.27490-1-guozhengkui@vivo.com Signed-off-by: Jakub Kicinski --- net/core/devlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/devlink.c b/net/core/devlink.c index 6b5ee862429e..5ba4f9434acd 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -66,7 +66,7 @@ struct devlink { u8 reload_failed:1; refcount_t refcount; struct completion comp; - char priv[0] __aligned(NETDEV_ALIGN); + char priv[] __aligned(NETDEV_ALIGN); }; void *devlink_priv(struct devlink *devlink) -- cgit v1.2.3 From a4db9055fdb9cf607775c66d39796caf6439ec92 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Wed, 3 Nov 2021 22:08:28 +0100 Subject: net: phy: fix duplex out of sync problem while changing settings As reported by Zhang there's a small issue if in forced mode the duplex mode changes with the link staying up [0]. In this case the MAC isn't notified about the change. The proposed patch relies on the phylib state machine and ignores the fact that there are drivers that uses phylib but not the phylib state machine. So let's don't change the behavior for such drivers and fix it w/o re-adding state PHY_FORCING for the case that phylib state machine is used. [0] https://lore.kernel.org/netdev/a5c26ffd-4ee4-a5e6-4103-873208ce0dc5@huawei.com/T/ Fixes: 2bd229df5e2e ("net: phy: remove state PHY_FORCING") Reported-by: Zhang Changzhong Tested-by: Zhang Changzhong Signed-off-by: Heiner Kallweit Link: https://lore.kernel.org/r/7b8b9456-a93f-abbc-1dc5-a2c2542f932c@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/phy.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index a3bfb156c83d..beb2b66da132 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -815,7 +815,12 @@ int phy_ethtool_ksettings_set(struct phy_device *phydev, phydev->mdix_ctrl = cmd->base.eth_tp_mdix_ctrl; /* Restart the PHY */ - _phy_start_aneg(phydev); + if (phy_is_started(phydev)) { + phydev->state = PHY_UP; + phy_trigger_machine(phydev); + } else { + _phy_start_aneg(phydev); + } mutex_unlock(&phydev->lock); return 0; -- cgit v1.2.3 From 1e4b50f06d970d8da3474d2a0354450416710bda Mon Sep 17 00:00:00 2001 From: Eugene Syromiatnikov Date: Wed, 3 Nov 2021 20:09:42 +0100 Subject: mctp: handle the struct sockaddr_mctp padding fields In order to have the padding fields actually usable in the future, there have to be checks that user space doesn't supply non-zero garbage there. It is also worth setting these padding fields to zero, unless it is known that they have been already zeroed. Cc: stable@vger.kernel.org # v5.15 Fixes: 5a20dd46b8b84593 ("mctp: Be explicit about struct sockaddr_mctp padding") Signed-off-by: Eugene Syromiatnikov Acked-by: Jeremy Kerr Signed-off-by: Jakub Kicinski --- net/mctp/af_mctp.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/net/mctp/af_mctp.c b/net/mctp/af_mctp.c index d344b02a1cde..bc88159f8844 100644 --- a/net/mctp/af_mctp.c +++ b/net/mctp/af_mctp.c @@ -33,6 +33,12 @@ static int mctp_release(struct socket *sock) return 0; } +/* Generic sockaddr checks, padding checks only so far */ +static bool mctp_sockaddr_is_ok(const struct sockaddr_mctp *addr) +{ + return !addr->__smctp_pad0 && !addr->__smctp_pad1; +} + static int mctp_bind(struct socket *sock, struct sockaddr *addr, int addrlen) { struct sock *sk = sock->sk; @@ -52,6 +58,9 @@ static int mctp_bind(struct socket *sock, struct sockaddr *addr, int addrlen) /* it's a valid sockaddr for MCTP, cast and do protocol checks */ smctp = (struct sockaddr_mctp *)addr; + if (!mctp_sockaddr_is_ok(smctp)) + return -EINVAL; + lock_sock(sk); /* TODO: allow rebind */ @@ -87,6 +96,8 @@ static int mctp_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) return -EINVAL; if (addr->smctp_family != AF_MCTP) return -EINVAL; + if (!mctp_sockaddr_is_ok(addr)) + return -EINVAL; if (addr->smctp_tag & ~(MCTP_TAG_MASK | MCTP_TAG_OWNER)) return -EINVAL; @@ -198,11 +209,13 @@ static int mctp_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, addr = msg->msg_name; addr->smctp_family = AF_MCTP; + addr->__smctp_pad0 = 0; addr->smctp_network = cb->net; addr->smctp_addr.s_addr = hdr->src; addr->smctp_type = type; addr->smctp_tag = hdr->flags_seq_tag & (MCTP_HDR_TAG_MASK | MCTP_HDR_FLAG_TO); + addr->__smctp_pad1 = 0; msg->msg_namelen = sizeof(*addr); if (msk->addr_ext) { -- cgit v1.2.3 From e9ea574ec1c27e555e7f78cbbcd28af91889d529 Mon Sep 17 00:00:00 2001 From: Eugene Syromiatnikov Date: Wed, 3 Nov 2021 20:09:46 +0100 Subject: mctp: handle the struct sockaddr_mctp_ext padding field struct sockaddr_mctp_ext.__smctp_paddin0 has to be checked for being set to zero, otherwise it cannot be utilised in the future. Fixes: 99ce45d5e7dbde39 ("mctp: Implement extended addressing") Signed-off-by: Eugene Syromiatnikov Acked-by: Jeremy Kerr Signed-off-by: Jakub Kicinski --- net/mctp/af_mctp.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/net/mctp/af_mctp.c b/net/mctp/af_mctp.c index bc88159f8844..871cf6266125 100644 --- a/net/mctp/af_mctp.c +++ b/net/mctp/af_mctp.c @@ -39,6 +39,13 @@ static bool mctp_sockaddr_is_ok(const struct sockaddr_mctp *addr) return !addr->__smctp_pad0 && !addr->__smctp_pad1; } +static bool mctp_sockaddr_ext_is_ok(const struct sockaddr_mctp_ext *addr) +{ + return !addr->__smctp_pad0[0] && + !addr->__smctp_pad0[1] && + !addr->__smctp_pad0[2]; +} + static int mctp_bind(struct socket *sock, struct sockaddr *addr, int addrlen) { struct sock *sk = sock->sk; @@ -135,7 +142,8 @@ static int mctp_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) DECLARE_SOCKADDR(struct sockaddr_mctp_ext *, extaddr, msg->msg_name); - if (extaddr->smctp_halen > sizeof(cb->haddr)) { + if (!mctp_sockaddr_ext_is_ok(extaddr) || + extaddr->smctp_halen > sizeof(cb->haddr)) { rc = -EINVAL; goto err_free; } @@ -224,6 +232,7 @@ static int mctp_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, msg->msg_namelen = sizeof(*ae); ae->smctp_ifindex = cb->ifindex; ae->smctp_halen = cb->halen; + memset(ae->__smctp_pad0, 0x0, sizeof(ae->__smctp_pad0)); memset(ae->smctp_haddr, 0x0, sizeof(ae->smctp_haddr)); memcpy(ae->smctp_haddr, cb->haddr, cb->halen); } -- cgit v1.2.3 From b93c6a911a3fe926b00add28f3b932007827c4ca Mon Sep 17 00:00:00 2001 From: Huang Guobin Date: Tue, 2 Nov 2021 17:37:33 +0800 Subject: bonding: Fix a use-after-free problem when bond_sysfs_slave_add() failed When I do fuzz test for bonding device interface, I got the following use-after-free Calltrace: ================================================================== BUG: KASAN: use-after-free in bond_enslave+0x1521/0x24f0 Read of size 8 at addr ffff88825bc11c00 by task ifenslave/7365 CPU: 5 PID: 7365 Comm: ifenslave Tainted: G E 5.15.0-rc1+ #13 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1 04/01/2014 Call Trace: dump_stack_lvl+0x6c/0x8b print_address_description.constprop.0+0x48/0x70 kasan_report.cold+0x82/0xdb __asan_load8+0x69/0x90 bond_enslave+0x1521/0x24f0 bond_do_ioctl+0x3e0/0x450 dev_ifsioc+0x2ba/0x970 dev_ioctl+0x112/0x710 sock_do_ioctl+0x118/0x1b0 sock_ioctl+0x2e0/0x490 __x64_sys_ioctl+0x118/0x150 do_syscall_64+0x35/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7f19159cf577 Code: b3 66 90 48 8b 05 11 89 2c 00 64 c7 00 26 00 00 00 48 c7 c0 ff ff ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 b8 10 00 00 00 0f 05 <48> 3d 01 f0 ff ff 78 RSP: 002b:00007ffeb3083c78 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 00007ffeb3084bca RCX: 00007f19159cf577 RDX: 00007ffeb3083ce0 RSI: 0000000000008990 RDI: 0000000000000003 RBP: 00007ffeb3084bc4 R08: 0000000000000040 R09: 0000000000000000 R10: 00007ffeb3084bc0 R11: 0000000000000246 R12: 00007ffeb3083ce0 R13: 0000000000000000 R14: 0000000000000000 R15: 00007ffeb3083cb0 Allocated by task 7365: kasan_save_stack+0x23/0x50 __kasan_kmalloc+0x83/0xa0 kmem_cache_alloc_trace+0x22e/0x470 bond_enslave+0x2e1/0x24f0 bond_do_ioctl+0x3e0/0x450 dev_ifsioc+0x2ba/0x970 dev_ioctl+0x112/0x710 sock_do_ioctl+0x118/0x1b0 sock_ioctl+0x2e0/0x490 __x64_sys_ioctl+0x118/0x150 do_syscall_64+0x35/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xae Freed by task 7365: kasan_save_stack+0x23/0x50 kasan_set_track+0x20/0x30 kasan_set_free_info+0x24/0x40 __kasan_slab_free+0xf2/0x130 kfree+0xd1/0x5c0 slave_kobj_release+0x61/0x90 kobject_put+0x102/0x180 bond_sysfs_slave_add+0x7a/0xa0 bond_enslave+0x11b6/0x24f0 bond_do_ioctl+0x3e0/0x450 dev_ifsioc+0x2ba/0x970 dev_ioctl+0x112/0x710 sock_do_ioctl+0x118/0x1b0 sock_ioctl+0x2e0/0x490 __x64_sys_ioctl+0x118/0x150 do_syscall_64+0x35/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xae Last potentially related work creation: kasan_save_stack+0x23/0x50 kasan_record_aux_stack+0xb7/0xd0 insert_work+0x43/0x190 __queue_work+0x2e3/0x970 delayed_work_timer_fn+0x3e/0x50 call_timer_fn+0x148/0x470 run_timer_softirq+0x8a8/0xc50 __do_softirq+0x107/0x55f Second to last potentially related work creation: kasan_save_stack+0x23/0x50 kasan_record_aux_stack+0xb7/0xd0 insert_work+0x43/0x190 __queue_work+0x2e3/0x970 __queue_delayed_work+0x130/0x180 queue_delayed_work_on+0xa7/0xb0 bond_enslave+0xe25/0x24f0 bond_do_ioctl+0x3e0/0x450 dev_ifsioc+0x2ba/0x970 dev_ioctl+0x112/0x710 sock_do_ioctl+0x118/0x1b0 sock_ioctl+0x2e0/0x490 __x64_sys_ioctl+0x118/0x150 do_syscall_64+0x35/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xae The buggy address belongs to the object at ffff88825bc11c00 which belongs to the cache kmalloc-1k of size 1024 The buggy address is located 0 bytes inside of 1024-byte region [ffff88825bc11c00, ffff88825bc12000) The buggy address belongs to the page: page:ffffea00096f0400 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x25bc10 head:ffffea00096f0400 order:3 compound_mapcount:0 compound_pincount:0 flags: 0x57ff00000010200(slab|head|node=1|zone=2|lastcpupid=0x7ff) raw: 057ff00000010200 ffffea0009a71c08 ffff888240001968 ffff88810004dbc0 raw: 0000000000000000 00000000000a000a 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff88825bc11b00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff88825bc11b80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc >ffff88825bc11c00: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff88825bc11c80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff88825bc11d00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ================================================================== Put new_slave in bond_sysfs_slave_add() will cause use-after-free problems when new_slave is accessed in the subsequent error handling process. Since new_slave will be put in the subsequent error handling process, remove the unnecessary put to fix it. In addition, when sysfs_create_file() fails, if some files have been crea- ted successfully, we need to call sysfs_remove_file() to remove them. Since there are sysfs_create_files() & sysfs_remove_files() can be used, use these two functions instead. Fixes: 7afcaec49696 (bonding: use kobject_put instead of _del after kobject_add) Signed-off-by: Huang Guobin Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- drivers/net/bonding/bond_sysfs_slave.c | 36 +++++++++++----------------------- 1 file changed, 11 insertions(+), 25 deletions(-) diff --git a/drivers/net/bonding/bond_sysfs_slave.c b/drivers/net/bonding/bond_sysfs_slave.c index fd07561da034..6a6cdd0bb258 100644 --- a/drivers/net/bonding/bond_sysfs_slave.c +++ b/drivers/net/bonding/bond_sysfs_slave.c @@ -108,15 +108,15 @@ static ssize_t ad_partner_oper_port_state_show(struct slave *slave, char *buf) } static SLAVE_ATTR_RO(ad_partner_oper_port_state); -static const struct slave_attribute *slave_attrs[] = { - &slave_attr_state, - &slave_attr_mii_status, - &slave_attr_link_failure_count, - &slave_attr_perm_hwaddr, - &slave_attr_queue_id, - &slave_attr_ad_aggregator_id, - &slave_attr_ad_actor_oper_port_state, - &slave_attr_ad_partner_oper_port_state, +static const struct attribute *slave_attrs[] = { + &slave_attr_state.attr, + &slave_attr_mii_status.attr, + &slave_attr_link_failure_count.attr, + &slave_attr_perm_hwaddr.attr, + &slave_attr_queue_id.attr, + &slave_attr_ad_aggregator_id.attr, + &slave_attr_ad_actor_oper_port_state.attr, + &slave_attr_ad_partner_oper_port_state.attr, NULL }; @@ -137,24 +137,10 @@ const struct sysfs_ops slave_sysfs_ops = { int bond_sysfs_slave_add(struct slave *slave) { - const struct slave_attribute **a; - int err; - - for (a = slave_attrs; *a; ++a) { - err = sysfs_create_file(&slave->kobj, &((*a)->attr)); - if (err) { - kobject_put(&slave->kobj); - return err; - } - } - - return 0; + return sysfs_create_files(&slave->kobj, slave_attrs); } void bond_sysfs_slave_del(struct slave *slave) { - const struct slave_attribute **a; - - for (a = slave_attrs; *a; ++a) - sysfs_remove_file(&slave->kobj, &((*a)->attr)); + sysfs_remove_files(&slave->kobj, slave_attrs); } -- cgit v1.2.3 From af1877b6cad16bdd8d8d93ca1c7b37e8f21ef4e3 Mon Sep 17 00:00:00 2001 From: Tony Lu Date: Wed, 3 Nov 2021 20:48:37 +0800 Subject: net/smc: Print function name in smcr_link_down tracepoint This makes the output of smcr_link_down tracepoint easier to use and understand without additional translating function's pointer address. It prints the function name with offset: -0 [000] ..s. 69.087164: smcr_link_down: lnk=00000000dab41cdc lgr=000000007d5d8e24 state=0 rc=1 dev=mlx5_0 location=smc_wr_tx_tasklet_fn+0x5ef/0x6f0 [smc] Link: https://lore.kernel.org/netdev/11f17a34-fd35-f2ec-3f20-dd0c34e55fde@linux.ibm.com/ Signed-off-by: Tony Lu Reviewed-by: Wen Gu Signed-off-by: David S. Miller --- net/smc/smc_tracepoint.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/smc/smc_tracepoint.h b/net/smc/smc_tracepoint.h index b4c36795a928..ec17f29646f5 100644 --- a/net/smc/smc_tracepoint.h +++ b/net/smc/smc_tracepoint.h @@ -99,7 +99,7 @@ TRACE_EVENT(smcr_link_down, __entry->location = location; ), - TP_printk("lnk=%p lgr=%p state=%d dev=%s location=%p", + TP_printk("lnk=%p lgr=%p state=%d dev=%s location=%pS", __entry->lnk, __entry->lgr, __entry->state, __get_str(name), __entry->location) -- cgit v1.2.3 From 0c500ef5d3395b68f615486c90aaf28868e0032c Mon Sep 17 00:00:00 2001 From: luo penghao Date: Thu, 4 Nov 2021 06:21:38 +0000 Subject: tg3: Remove redundant assignments The assignment of err will be overwritten next, so this statement should be deleted. The clang_analyzer complains as follows: drivers/net/ethernet/broadcom/tg3.c:5506:2: warning: Value stored to 'expected_sg_dig_ctrl' is never read Reported-by: Zeal Robot Signed-off-by: luo penghao Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/tg3.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c index b1328c5524b5..85ca3909859d 100644 --- a/drivers/net/ethernet/broadcom/tg3.c +++ b/drivers/net/ethernet/broadcom/tg3.c @@ -5503,7 +5503,6 @@ static bool tg3_setup_fiber_hw_autoneg(struct tg3 *tp, u32 mac_status) int workaround, port_a; serdes_cfg = 0; - expected_sg_dig_ctrl = 0; workaround = 0; port_a = 1; current_link_up = false; -- cgit v1.2.3 From d7be1d1cfb4d3215c06e98ac6f6c4e99293d8e3c Mon Sep 17 00:00:00 2001 From: Yang Guang Date: Thu, 4 Nov 2021 14:21:58 +0800 Subject: octeontx2-af: use swap() to make code cleaner Use the macro 'swap()' defined in 'include/linux/minmax.h' to avoid opencoding it. Reported-by: Zeal Robot Signed-off-by: Yang Guang Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c index bb6b42bbefa4..c0005a1feee6 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c @@ -2450,9 +2450,7 @@ alloc: bmap = mcam->bmap_reverse; start = mcam->bmap_entries - start; end = mcam->bmap_entries - end; - index = start; - start = end; - end = index; + swap(start, end); } else { bmap = mcam->bmap; } -- cgit v1.2.3 From f6a510102c0553f683550238cadfab6368f34c24 Mon Sep 17 00:00:00 2001 From: Yang Guang Date: Thu, 4 Nov 2021 14:53:50 +0800 Subject: sfc: use swap() to make code cleaner Use the macro 'swap()' defined in 'include/linux/minmax.h' to avoid opencoding it. Reported-by: Zeal Robot Signed-off-by: Yang Guang Signed-off-by: David S. Miller --- drivers/net/ethernet/sfc/falcon/efx.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/sfc/falcon/efx.c b/drivers/net/ethernet/sfc/falcon/efx.c index c68837a951f4..314c9c69eb0e 100644 --- a/drivers/net/ethernet/sfc/falcon/efx.c +++ b/drivers/net/ethernet/sfc/falcon/efx.c @@ -817,9 +817,7 @@ ef4_realloc_channels(struct ef4_nic *efx, u32 rxq_entries, u32 txq_entries) efx->rxq_entries = rxq_entries; efx->txq_entries = txq_entries; for (i = 0; i < efx->n_channels; i++) { - channel = efx->channel[i]; - efx->channel[i] = other_channel[i]; - other_channel[i] = channel; + swap(efx->channel[i], other_channel[i]); } /* Restart buffer table allocation */ @@ -863,9 +861,7 @@ rollback: efx->rxq_entries = old_rxq_entries; efx->txq_entries = old_txq_entries; for (i = 0; i < efx->n_channels; i++) { - channel = efx->channel[i]; - efx->channel[i] = other_channel[i]; - other_channel[i] = channel; + swap(efx->channel[i], other_channel[i]); } goto out; } -- cgit v1.2.3 From 9cbc3367968de69017a87a1118b62490ac1bdd0a Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 4 Nov 2021 14:34:42 +0100 Subject: octeontx2-pf: select CONFIG_NET_DEVLINK The octeontx2 pf nic driver failsz to link when the devlink support is not reachable: aarch64-linux-ld: drivers/net/ethernet/marvell/octeontx2/nic/otx2_devlink.o: in function `otx2_dl_mcam_count_get': otx2_devlink.c:(.text+0x10): undefined reference to `devlink_priv' aarch64-linux-ld: drivers/net/ethernet/marvell/octeontx2/nic/otx2_devlink.o: in function `otx2_dl_mcam_count_validate': otx2_devlink.c:(.text+0x50): undefined reference to `devlink_priv' aarch64-linux-ld: drivers/net/ethernet/marvell/octeontx2/nic/otx2_devlink.o: in function `otx2_dl_mcam_count_set': otx2_devlink.c:(.text+0xd0): undefined reference to `devlink_priv' aarch64-linux-ld: drivers/net/ethernet/marvell/octeontx2/nic/otx2_devlink.o: in function `otx2_devlink_info_get': otx2_devlink.c:(.text+0x150): undefined reference to `devlink_priv' This is already selected by the admin function driver, but not the actual nic, which might be built-in when the af driver is not. Fixes: 2da489432747 ("octeontx2-pf: devlink params support to set mcam entry count") Signed-off-by: Arnd Bergmann Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/octeontx2/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/marvell/octeontx2/Kconfig b/drivers/net/ethernet/marvell/octeontx2/Kconfig index 3f982ccf2c85..639893d87055 100644 --- a/drivers/net/ethernet/marvell/octeontx2/Kconfig +++ b/drivers/net/ethernet/marvell/octeontx2/Kconfig @@ -31,6 +31,7 @@ config NDC_DIS_DYNAMIC_CACHING config OCTEONTX2_PF tristate "Marvell OcteonTX2 NIC Physical Function driver" select OCTEONTX2_MBOX + select NET_DEVLINK depends on (64BIT && COMPILE_TEST) || ARM64 depends on PCI depends on PTP_1588_CLOCK_OPTIONAL -- cgit v1.2.3 From 827beb7781d3dbba1a8cd8dc364cc3cb3fc13b11 Mon Sep 17 00:00:00 2001 From: Xu Wang Date: Fri, 5 Nov 2021 01:42:17 +0000 Subject: net: ethernet: litex: Remove unnecessary print function dev_err() The print function dev_err() is redundant because platform_get_irq() already prints an error. Signed-off-by: Xu Wang Reviewed-by: Cai Huoqing Signed-off-by: David S. Miller --- drivers/net/ethernet/litex/litex_liteeth.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/litex/litex_liteeth.c b/drivers/net/ethernet/litex/litex_liteeth.c index 3d9385a4989b..ab9fa1525053 100644 --- a/drivers/net/ethernet/litex/litex_liteeth.c +++ b/drivers/net/ethernet/litex/litex_liteeth.c @@ -242,10 +242,8 @@ static int liteeth_probe(struct platform_device *pdev) priv->dev = &pdev->dev; irq = platform_get_irq(pdev, 0); - if (irq < 0) { - dev_err(&pdev->dev, "Failed to get IRQ %d\n", irq); + if (irq < 0) return irq; - } netdev->irq = irq; priv->base = devm_platform_ioremap_resource_byname(pdev, "mac"); -- cgit v1.2.3 From 69dfccbc1186f7091f97b70a9437d6a51313834d Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Fri, 5 Nov 2021 15:35:41 +0800 Subject: net: udp: correct the document for udp_mem udp_mem is a vector of 3 INTEGERs, which is used to limit the number of pages allowed for queueing by all UDP sockets. However, sk_has_memory_pressure() in __sk_mem_raise_allocated() always return false for udp, as memory pressure is not supported by udp, which means that __sk_mem_raise_allocated() will fail once pages allocated for udp socket exceeds udp_mem[0]. Therefor, udp_mem[0] is the only one that limit the number of pages. However, the document of udp_mem just express that udp_mem[2] is the limitation. So, just fix it. Signed-off-by: Menglong Dong Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.rst | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index c61cc0219f4c..c04431144f7a 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -1004,13 +1004,11 @@ udp_l3mdev_accept - BOOLEAN udp_mem - vector of 3 INTEGERs: min, pressure, max Number of pages allowed for queueing by all UDP sockets. - min: Below this number of pages UDP is not bothered about its - memory appetite. When amount of memory allocated by UDP exceeds - this number, UDP starts to moderate memory usage. + min: Number of pages allowed for queueing by all UDP sockets. pressure: This value was introduced to follow format of tcp_mem. - max: Number of pages allowed for queueing by all UDP sockets. + max: This value was introduced to follow format of tcp_mem. Default is calculated at boot time from amount of available memory. -- cgit v1.2.3 From 6789a4c05127d3f9257db6767fd7ede614e0241f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 4 Nov 2021 10:55:27 -0700 Subject: net: ax88796c: hide ax88796c_dt_ids if !CONFIG_OF Build bot says: >> drivers/net/ethernet/asix/ax88796c_main.c:1116:34: warning: unused variable 'ax88796c_dt_ids' [-Wunused-const-variable] static const struct of_device_id ax88796c_dt_ids[] = { ^ The only reference to this array is wrapped in of_match_ptr(). Reported-by: kernel test robot Fixes: a97c69ba4f30 ("net: ax88796c: ASIX AX88796C SPI Ethernet Adapter Driver") Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- drivers/net/ethernet/asix/ax88796c_main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/asix/ax88796c_main.c b/drivers/net/ethernet/asix/ax88796c_main.c index 4b0c5a09fd57..2086de05385c 100644 --- a/drivers/net/ethernet/asix/ax88796c_main.c +++ b/drivers/net/ethernet/asix/ax88796c_main.c @@ -1114,11 +1114,13 @@ static int ax88796c_remove(struct spi_device *spi) return 0; } +#ifdef CONFIG_OF static const struct of_device_id ax88796c_dt_ids[] = { { .compatible = "asix,ax88796c" }, {}, }; MODULE_DEVICE_TABLE(of, ax88796c_dt_ids); +#endif static const struct spi_device_id asix_id[] = { { "ax88796c", 0 }, -- cgit v1.2.3 From 3f81c579912855f19ed1a72af8133485a6119fba Mon Sep 17 00:00:00 2001 From: Yang Li Date: Fri, 5 Nov 2021 15:12:50 +0800 Subject: amt: Fix NULL but dereferenced coccicheck error Eliminate the following coccicheck warning: ./drivers/net/amt.c:2795:6-9: ERROR: amt is NULL but dereferenced. Reported-by: Abaci Robot Signed-off-by: Yang Li Signed-off-by: David S. Miller --- drivers/net/amt.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/amt.c b/drivers/net/amt.c index 896c9e2857f0..cfd6c8cb4e97 100644 --- a/drivers/net/amt.c +++ b/drivers/net/amt.c @@ -2766,7 +2766,7 @@ static int amt_err_lookup(struct sock *sk, struct sk_buff *skb) rcu_read_lock_bh(); amt = rcu_dereference_sk_user_data(sk); if (!amt) - goto drop; + goto out; if (amt->mode != AMT_MODE_GATEWAY) goto drop; @@ -2788,6 +2788,7 @@ static int amt_err_lookup(struct sock *sk, struct sk_buff *skb) default: goto drop; } +out: rcu_read_unlock_bh(); return 0; drop: -- cgit v1.2.3 From 9dcc00715a7c0aea0d3afe1e935f4b4aefbeb294 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 5 Nov 2021 10:29:39 +0100 Subject: ax88796c: fix ioctl callback The timestamp ioctls are now handled by the ndo_eth_ioctl() callback, not the old ndo_do_ioctl(), but oax88796 introduced the function for the old way. Move it over to ndo_eth_ioctl() to actually allow calling it from user space. Fixes: a97c69ba4f30 ("net: ax88796c: ASIX AX88796C SPI Ethernet Adapter Driver") Fixes: a76053707dbf ("dev_ioctl: split out ndo_eth_ioctl") Signed-off-by: Arnd Bergmann Acked-by: Lukasz Stelmach Signed-off-by: David S. Miller --- drivers/net/ethernet/asix/ax88796c_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/asix/ax88796c_main.c b/drivers/net/ethernet/asix/ax88796c_main.c index 2086de05385c..e230d8d0ff73 100644 --- a/drivers/net/ethernet/asix/ax88796c_main.c +++ b/drivers/net/ethernet/asix/ax88796c_main.c @@ -934,7 +934,7 @@ static const struct net_device_ops ax88796c_netdev_ops = { .ndo_stop = ax88796c_close, .ndo_start_xmit = ax88796c_start_xmit, .ndo_get_stats64 = ax88796c_get_stats64, - .ndo_do_ioctl = ax88796c_ioctl, + .ndo_eth_ioctl = ax88796c_ioctl, .ndo_set_mac_address = eth_mac_addr, .ndo_set_features = ax88796c_set_features, }; -- cgit v1.2.3 From a6785bd7d83c9e73c6a6aa33d30a071460074728 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 5 Nov 2021 10:29:40 +0100 Subject: octeontx2-nicvf: fix ioctl callback The mii ioctls are now handled by the ndo_eth_ioctl() callback, not the old ndo_do_ioctl(), but octeontx2-nicvf introduced the function for the old way. Move it over to ndo_eth_ioctl() to actually allow calling it from user space. Fixes: 43510ef4ddad ("octeontx2-nicvf: Add PTP hardware clock support to NIX VF") Fixes: a76053707dbf ("dev_ioctl: split out ndo_eth_ioctl") Signed-off-by: Arnd Bergmann Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c index e6cb8cd0787d..78944ad3492f 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c @@ -501,7 +501,7 @@ static const struct net_device_ops otx2vf_netdev_ops = { .ndo_set_features = otx2vf_set_features, .ndo_get_stats64 = otx2_get_stats64, .ndo_tx_timeout = otx2_tx_timeout, - .ndo_do_ioctl = otx2_ioctl, + .ndo_eth_ioctl = otx2_ioctl, }; static int otx2_wq_init(struct otx2_nic *vf) -- cgit v1.2.3 From dce981c42151e1f0176b0788c2e1bdc3f1e2bc1f Mon Sep 17 00:00:00 2001 From: Zhang Mingyu Date: Fri, 5 Nov 2021 01:27:17 +0000 Subject: amt: remove duplicate include in amt.c 'net/protocol.h' included in 'drivers/net/amt.c' is duplicated. Reported-by: Zeal Robot Signed-off-by: Zhang Mingyu Signed-off-by: David S. Miller --- drivers/net/amt.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/amt.c b/drivers/net/amt.c index cfd6c8cb4e97..c384b2694f9e 100644 --- a/drivers/net/amt.c +++ b/drivers/net/amt.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.3 From a46a5036e7d2c537995ed331b7b8727d0e28390c Mon Sep 17 00:00:00 2001 From: Volodymyr Mytnyk Date: Thu, 4 Nov 2021 15:12:52 +0200 Subject: net: marvell: prestera: fix patchwork build problems fix the remaining build issues reported by patchwork in firmware v4.0 support commit which has been already merged. Fix patchwork issues: - source inline - checkpatch Fixes: bb5dbf2cc64d ("net: marvell: prestera: add firmware v4.0 support") Signed-off-by: Volodymyr Mytnyk Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/prestera/prestera_ethtool.c | 3 ++- drivers/net/ethernet/marvell/prestera/prestera_hw.c | 3 ++- drivers/net/ethernet/marvell/prestera/prestera_main.c | 6 ++++-- drivers/net/ethernet/marvell/prestera/prestera_pci.c | 3 ++- 4 files changed, 10 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/marvell/prestera/prestera_ethtool.c b/drivers/net/ethernet/marvell/prestera/prestera_ethtool.c index 6011454dba71..40d5b89573bb 100644 --- a/drivers/net/ethernet/marvell/prestera/prestera_ethtool.c +++ b/drivers/net/ethernet/marvell/prestera/prestera_ethtool.c @@ -499,7 +499,8 @@ static void prestera_port_mdix_get(struct ethtool_link_ksettings *ecmd, { struct prestera_port_phy_state *state = &port->state_phy; - if (prestera_hw_port_phy_mode_get(port, &state->mdix, NULL, NULL, NULL)) { + if (prestera_hw_port_phy_mode_get(port, + &state->mdix, NULL, NULL, NULL)) { netdev_warn(port->dev, "MDIX params get failed"); state->mdix = ETH_TP_MDI_INVALID; } diff --git a/drivers/net/ethernet/marvell/prestera/prestera_hw.c b/drivers/net/ethernet/marvell/prestera/prestera_hw.c index 4f5f52dcdd9d..bc3c9310678a 100644 --- a/drivers/net/ethernet/marvell/prestera/prestera_hw.c +++ b/drivers/net/ethernet/marvell/prestera/prestera_hw.c @@ -1356,7 +1356,8 @@ int prestera_hw_port_speed_get(const struct prestera_port *port, u32 *speed) int prestera_hw_port_autoneg_restart(struct prestera_port *port) { struct prestera_msg_port_attr_req req = { - .attr = __cpu_to_le32(PRESTERA_CMD_PORT_ATTR_PHY_AUTONEG_RESTART), + .attr = + __cpu_to_le32(PRESTERA_CMD_PORT_ATTR_PHY_AUTONEG_RESTART), .port = __cpu_to_le32(port->hw_id), .dev = __cpu_to_le32(port->dev_id), }; diff --git a/drivers/net/ethernet/marvell/prestera/prestera_main.c b/drivers/net/ethernet/marvell/prestera/prestera_main.c index 625b40149fac..4369a3ffad45 100644 --- a/drivers/net/ethernet/marvell/prestera/prestera_main.c +++ b/drivers/net/ethernet/marvell/prestera/prestera_main.c @@ -405,7 +405,8 @@ static int prestera_port_create(struct prestera_switch *sw, u32 id) err = prestera_port_cfg_mac_write(port, &cfg_mac); if (err) { - dev_err(prestera_dev(sw), "Failed to set port(%u) mac mode\n", id); + dev_err(prestera_dev(sw), + "Failed to set port(%u) mac mode\n", id); goto err_port_init; } @@ -418,7 +419,8 @@ static int prestera_port_create(struct prestera_switch *sw, u32 id) false, 0, 0, port->cfg_phy.mdix); if (err) { - dev_err(prestera_dev(sw), "Failed to set port(%u) phy mode\n", id); + dev_err(prestera_dev(sw), + "Failed to set port(%u) phy mode\n", id); goto err_port_init; } } diff --git a/drivers/net/ethernet/marvell/prestera/prestera_pci.c b/drivers/net/ethernet/marvell/prestera/prestera_pci.c index 5d4d410b07c8..461259b3655a 100644 --- a/drivers/net/ethernet/marvell/prestera/prestera_pci.c +++ b/drivers/net/ethernet/marvell/prestera/prestera_pci.c @@ -411,7 +411,8 @@ static int prestera_fw_cmd_send(struct prestera_fw *fw, int qid, goto cmd_exit; } - memcpy_fromio(out_msg, prestera_fw_cmdq_buf(fw, qid) + in_size, ret_size); + memcpy_fromio(out_msg, + prestera_fw_cmdq_buf(fw, qid) + in_size, ret_size); cmd_exit: prestera_fw_write(fw, PRESTERA_CMDQ_REQ_CTL_REG(qid), -- cgit v1.2.3 From e41ac2020bca4acdb7485ddca34098f68d3af5ae Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Fri, 5 Nov 2021 01:58:13 +0000 Subject: bpftool: Install libbpf headers for the bootstrap version, too We recently changed bpftool's Makefile to make it install libbpf's headers locally instead of pulling them from the source directory of the library. Although bpftool needs two versions of libbpf, a "regular" one and a "bootstrap" version, we would only install headers for the regular libbpf build. Given that this build always occurs before the bootstrap build when building bpftool, this is enough to ensure that the bootstrap bpftool will have access to the headers exported through the regular libbpf build. However, this did not account for the case when we only want the bootstrap version of bpftool, through the "bootstrap" target. For example, perf needs the bootstrap version only, to generate BPF skeletons. In that case, when are the headers installed? For some time, the issue has been masked, because we had a step (the installation of headers internal to libbpf) which would depend on the regular build of libbpf and hence trigger the export of the headers, just for the sake of creating a directory. But this changed with commit 8b6c46241c77 ("bpftool: Remove Makefile dep. on $(LIBBPF) for $(LIBBPF_INTERNAL_HDRS)"), where we cleaned up that stage and removed the dependency on the regular libbpf build. As a result, when we only want the bootstrap bpftool version, the regular libbpf is no longer built. The bootstrap libbpf version is built, but headers are not exported, and the bootstrap bpftool build fails because of the missing headers. To fix this, we also install the library headers for the bootstrap version of libbpf, to use them for the bootstrap bpftool and for generating the skeletons. Fixes: f012ade10b34 ("bpftool: Install libbpf headers instead of including the dir") Reported-by: Arnaldo Carvalho de Melo Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann Tested-by: Arnaldo Carvalho de Melo Link: https://lore.kernel.org/bpf/20211105015813.6171-1-quentin@isovalent.com --- tools/bpf/bpftool/Makefile | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile index c0c30e56988f..7cfba11c3014 100644 --- a/tools/bpf/bpftool/Makefile +++ b/tools/bpf/bpftool/Makefile @@ -22,24 +22,29 @@ else _OUTPUT := $(CURDIR) endif BOOTSTRAP_OUTPUT := $(_OUTPUT)/bootstrap/ + LIBBPF_OUTPUT := $(_OUTPUT)/libbpf/ LIBBPF_DESTDIR := $(LIBBPF_OUTPUT) LIBBPF_INCLUDE := $(LIBBPF_DESTDIR)/include LIBBPF_HDRS_DIR := $(LIBBPF_INCLUDE)/bpf +LIBBPF := $(LIBBPF_OUTPUT)libbpf.a -LIBBPF = $(LIBBPF_OUTPUT)libbpf.a -LIBBPF_BOOTSTRAP_OUTPUT = $(BOOTSTRAP_OUTPUT)libbpf/ -LIBBPF_BOOTSTRAP = $(LIBBPF_BOOTSTRAP_OUTPUT)libbpf.a +LIBBPF_BOOTSTRAP_OUTPUT := $(BOOTSTRAP_OUTPUT)libbpf/ +LIBBPF_BOOTSTRAP_DESTDIR := $(LIBBPF_BOOTSTRAP_OUTPUT) +LIBBPF_BOOTSTRAP_INCLUDE := $(LIBBPF_BOOTSTRAP_DESTDIR)/include +LIBBPF_BOOTSTRAP_HDRS_DIR := $(LIBBPF_BOOTSTRAP_INCLUDE)/bpf +LIBBPF_BOOTSTRAP := $(LIBBPF_BOOTSTRAP_OUTPUT)libbpf.a # We need to copy hashmap.h and nlattr.h which is not otherwise exported by # libbpf, but still required by bpftool. LIBBPF_INTERNAL_HDRS := $(addprefix $(LIBBPF_HDRS_DIR)/,hashmap.h nlattr.h) +LIBBPF_BOOTSTRAP_INTERNAL_HDRS := $(addprefix $(LIBBPF_BOOTSTRAP_HDRS_DIR)/,hashmap.h) ifeq ($(BPFTOOL_VERSION),) BPFTOOL_VERSION := $(shell make -rR --no-print-directory -sC ../../.. kernelversion) endif -$(LIBBPF_OUTPUT) $(BOOTSTRAP_OUTPUT) $(LIBBPF_BOOTSTRAP_OUTPUT) $(LIBBPF_HDRS_DIR): +$(LIBBPF_OUTPUT) $(BOOTSTRAP_OUTPUT) $(LIBBPF_BOOTSTRAP_OUTPUT) $(LIBBPF_HDRS_DIR) $(LIBBPF_BOOTSTRAP_HDRS_DIR): $(QUIET_MKDIR)mkdir -p $@ $(LIBBPF): $(wildcard $(BPF_DIR)/*.[ch] $(BPF_DIR)/Makefile) | $(LIBBPF_OUTPUT) @@ -52,7 +57,12 @@ $(LIBBPF_INTERNAL_HDRS): $(LIBBPF_HDRS_DIR)/%.h: $(BPF_DIR)/%.h | $(LIBBPF_HDRS_ $(LIBBPF_BOOTSTRAP): $(wildcard $(BPF_DIR)/*.[ch] $(BPF_DIR)/Makefile) | $(LIBBPF_BOOTSTRAP_OUTPUT) $(Q)$(MAKE) -C $(BPF_DIR) OUTPUT=$(LIBBPF_BOOTSTRAP_OUTPUT) \ - ARCH= CC=$(HOSTCC) LD=$(HOSTLD) $@ + DESTDIR=$(LIBBPF_BOOTSTRAP_DESTDIR) prefix= \ + ARCH= CC=$(HOSTCC) LD=$(HOSTLD) $@ install_headers + +$(LIBBPF_BOOTSTRAP_INTERNAL_HDRS): $(LIBBPF_BOOTSTRAP_HDRS_DIR)/%.h: $(BPF_DIR)/%.h | $(LIBBPF_BOOTSTRAP_HDRS_DIR) + $(call QUIET_INSTALL, $@) + $(Q)install -m 644 -t $(LIBBPF_BOOTSTRAP_HDRS_DIR) $< $(LIBBPF)-clean: FORCE | $(LIBBPF_OUTPUT) $(call QUIET_CLEAN, libbpf) @@ -172,11 +182,11 @@ else $(Q)cp "$(VMLINUX_H)" $@ endif -$(OUTPUT)%.bpf.o: skeleton/%.bpf.c $(OUTPUT)vmlinux.h $(LIBBPF) +$(OUTPUT)%.bpf.o: skeleton/%.bpf.c $(OUTPUT)vmlinux.h $(LIBBPF_BOOTSTRAP) $(QUIET_CLANG)$(CLANG) \ -I$(if $(OUTPUT),$(OUTPUT),.) \ -I$(srctree)/tools/include/uapi/ \ - -I$(LIBBPF_INCLUDE) \ + -I$(LIBBPF_BOOTSTRAP_INCLUDE) \ -g -O2 -Wall -target bpf -c $< -o $@ && $(LLVM_STRIP) -g $@ $(OUTPUT)%.skel.h: $(OUTPUT)%.bpf.o $(BPFTOOL_BOOTSTRAP) @@ -209,8 +219,10 @@ $(BPFTOOL_BOOTSTRAP): $(BOOTSTRAP_OBJS) $(LIBBPF_BOOTSTRAP) $(OUTPUT)bpftool: $(OBJS) $(LIBBPF) $(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) -o $@ $(OBJS) $(LIBS) -$(BOOTSTRAP_OUTPUT)%.o: %.c $(LIBBPF_INTERNAL_HDRS) | $(BOOTSTRAP_OUTPUT) - $(QUIET_CC)$(HOSTCC) $(CFLAGS) -c -MMD -o $@ $< +$(BOOTSTRAP_OUTPUT)%.o: %.c $(LIBBPF_BOOTSTRAP_INTERNAL_HDRS) | $(BOOTSTRAP_OUTPUT) + $(QUIET_CC)$(HOSTCC) \ + $(subst -I$(LIBBPF_INCLUDE),-I$(LIBBPF_BOOTSTRAP_INCLUDE),$(CFLAGS)) \ + -c -MMD -o $@ $< $(OUTPUT)%.o: %.c $(QUIET_CC)$(CC) $(CFLAGS) -c -MMD -o $@ $< @@ -257,6 +269,6 @@ doc-uninstall: FORCE: .SECONDARY: -.PHONY: all FORCE clean install-bin install uninstall +.PHONY: all FORCE bootstrap clean install-bin install uninstall .PHONY: doc doc-clean doc-install doc-uninstall .DEFAULT_GOAL := all -- cgit v1.2.3 From 64165ddf8ea184631c65e3bbc8d59f6d940590ca Mon Sep 17 00:00:00 2001 From: Mehrdad Arshad Rad Date: Thu, 4 Nov 2021 10:13:54 -0700 Subject: libbpf: Fix lookup_and_delete_elem_flags error reporting Fix bpf_map_lookup_and_delete_elem_flags() to pass the return code through libbpf_err_errno() as we do similarly in bpf_map_lookup_and_delete_elem(). Fixes: f12b65432728 ("libbpf: Streamline error reporting for low-level APIs") Signed-off-by: Mehrdad Arshad Rad Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20211104171354.11072-1-arshad.rad@gmail.com --- tools/lib/bpf/bpf.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index c09cbb868c9f..725701235fd8 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -515,6 +515,7 @@ int bpf_map_lookup_and_delete_elem(int fd, const void *key, void *value) int bpf_map_lookup_and_delete_elem_flags(int fd, const void *key, void *value, __u64 flags) { union bpf_attr attr; + int ret; memset(&attr, 0, sizeof(attr)); attr.map_fd = fd; @@ -522,7 +523,8 @@ int bpf_map_lookup_and_delete_elem_flags(int fd, const void *key, void *value, _ attr.value = ptr_to_u64(value); attr.flags = flags; - return sys_bpf(BPF_MAP_LOOKUP_AND_DELETE_ELEM, &attr, sizeof(attr)); + ret = sys_bpf(BPF_MAP_LOOKUP_AND_DELETE_ELEM, &attr, sizeof(attr)); + return libbpf_err_errno(ret); } int bpf_map_delete_elem(int fd, const void *key) -- cgit v1.2.3 From 8b4ac13abe7d82da0e0d22a9ba2e27301559a93e Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Wed, 27 Oct 2021 11:35:50 +0800 Subject: selftests/bpf/xdp_redirect_multi: Put the logs to tmp folder The xdp_redirect_multi test logs are created in selftest folder and not cleaned after test. Let's creat a tmp dir and remove the logs after testing. Fixes: d23292476297 ("selftests/bpf: Add xdp_redirect_multi test") Suggested-by: Jiri Benc Signed-off-by: Hangbin Liu Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20211027033553.962413-2-liuhangbin@gmail.com --- .../selftests/bpf/test_xdp_redirect_multi.sh | 35 +++++++++++----------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/tools/testing/selftests/bpf/test_xdp_redirect_multi.sh b/tools/testing/selftests/bpf/test_xdp_redirect_multi.sh index 351955c2bdfd..c1653f6d7f77 100755 --- a/tools/testing/selftests/bpf/test_xdp_redirect_multi.sh +++ b/tools/testing/selftests/bpf/test_xdp_redirect_multi.sh @@ -31,6 +31,7 @@ IFACES="" DRV_MODE="xdpgeneric xdpdrv xdpegress" PASS=0 FAIL=0 +LOG_DIR=$(mktemp -d) test_pass() { @@ -100,17 +101,17 @@ do_egress_tests() local mode=$1 # mac test - ip netns exec ns2 tcpdump -e -i veth0 -nn -l -e &> mac_ns1-2_${mode}.log & - ip netns exec ns3 tcpdump -e -i veth0 -nn -l -e &> mac_ns1-3_${mode}.log & + ip netns exec ns2 tcpdump -e -i veth0 -nn -l -e &> ${LOG_DIR}/mac_ns1-2_${mode}.log & + ip netns exec ns3 tcpdump -e -i veth0 -nn -l -e &> ${LOG_DIR}/mac_ns1-3_${mode}.log & sleep 0.5 ip netns exec ns1 ping 192.0.2.254 -i 0.1 -c 4 &> /dev/null sleep 0.5 pkill -9 tcpdump # mac check - grep -q "${veth_mac[2]} > ff:ff:ff:ff:ff:ff" mac_ns1-2_${mode}.log && \ + grep -q "${veth_mac[2]} > ff:ff:ff:ff:ff:ff" ${LOG_DIR}/mac_ns1-2_${mode}.log && \ test_pass "$mode mac ns1-2" || test_fail "$mode mac ns1-2" - grep -q "${veth_mac[3]} > ff:ff:ff:ff:ff:ff" mac_ns1-3_${mode}.log && \ + grep -q "${veth_mac[3]} > ff:ff:ff:ff:ff:ff" ${LOG_DIR}/mac_ns1-3_${mode}.log && \ test_pass "$mode mac ns1-3" || test_fail "$mode mac ns1-3" } @@ -121,9 +122,9 @@ do_ping_tests() # ping6 test: echo request should be redirect back to itself, not others ip netns exec ns1 ip neigh add 2001:db8::2 dev veth0 lladdr 00:00:00:00:00:02 - ip netns exec ns1 tcpdump -i veth0 -nn -l -e &> ns1-1_${mode}.log & - ip netns exec ns2 tcpdump -i veth0 -nn -l -e &> ns1-2_${mode}.log & - ip netns exec ns3 tcpdump -i veth0 -nn -l -e &> ns1-3_${mode}.log & + ip netns exec ns1 tcpdump -i veth0 -nn -l -e &> ${LOG_DIR}/ns1-1_${mode}.log & + ip netns exec ns2 tcpdump -i veth0 -nn -l -e &> ${LOG_DIR}/ns1-2_${mode}.log & + ip netns exec ns3 tcpdump -i veth0 -nn -l -e &> ${LOG_DIR}/ns1-3_${mode}.log & sleep 0.5 # ARP test ip netns exec ns1 ping 192.0.2.254 -i 0.1 -c 4 &> /dev/null @@ -135,32 +136,32 @@ do_ping_tests() pkill -9 tcpdump # All netns should receive the redirect arp requests - [ $(grep -c "who-has 192.0.2.254" ns1-1_${mode}.log) -gt 4 ] && \ + [ $(grep -c "who-has 192.0.2.254" ${LOG_DIR}/ns1-1_${mode}.log) -gt 4 ] && \ test_pass "$mode arp(F_BROADCAST) ns1-1" || \ test_fail "$mode arp(F_BROADCAST) ns1-1" - [ $(grep -c "who-has 192.0.2.254" ns1-2_${mode}.log) -le 4 ] && \ + [ $(grep -c "who-has 192.0.2.254" ${LOG_DIR}/ns1-2_${mode}.log) -le 4 ] && \ test_pass "$mode arp(F_BROADCAST) ns1-2" || \ test_fail "$mode arp(F_BROADCAST) ns1-2" - [ $(grep -c "who-has 192.0.2.254" ns1-3_${mode}.log) -le 4 ] && \ + [ $(grep -c "who-has 192.0.2.254" ${LOG_DIR}/ns1-3_${mode}.log) -le 4 ] && \ test_pass "$mode arp(F_BROADCAST) ns1-3" || \ test_fail "$mode arp(F_BROADCAST) ns1-3" # ns1 should not receive the redirect echo request, others should - [ $(grep -c "ICMP echo request" ns1-1_${mode}.log) -eq 4 ] && \ + [ $(grep -c "ICMP echo request" ${LOG_DIR}/ns1-1_${mode}.log) -eq 4 ] && \ test_pass "$mode IPv4 (F_BROADCAST|F_EXCLUDE_INGRESS) ns1-1" || \ test_fail "$mode IPv4 (F_BROADCAST|F_EXCLUDE_INGRESS) ns1-1" - [ $(grep -c "ICMP echo request" ns1-2_${mode}.log) -eq 4 ] && \ + [ $(grep -c "ICMP echo request" ${LOG_DIR}/ns1-2_${mode}.log) -eq 4 ] && \ test_pass "$mode IPv4 (F_BROADCAST|F_EXCLUDE_INGRESS) ns1-2" || \ test_fail "$mode IPv4 (F_BROADCAST|F_EXCLUDE_INGRESS) ns1-2" - [ $(grep -c "ICMP echo request" ns1-3_${mode}.log) -eq 4 ] && \ + [ $(grep -c "ICMP echo request" ${LOG_DIR}/ns1-3_${mode}.log) -eq 4 ] && \ test_pass "$mode IPv4 (F_BROADCAST|F_EXCLUDE_INGRESS) ns1-3" || \ test_fail "$mode IPv4 (F_BROADCAST|F_EXCLUDE_INGRESS) ns1-3" # ns1 should receive the echo request, ns2 should not - [ $(grep -c "ICMP6, echo request" ns1-1_${mode}.log) -eq 4 ] && \ + [ $(grep -c "ICMP6, echo request" ${LOG_DIR}/ns1-1_${mode}.log) -eq 4 ] && \ test_pass "$mode IPv6 (no flags) ns1-1" || \ test_fail "$mode IPv6 (no flags) ns1-1" - [ $(grep -c "ICMP6, echo request" ns1-2_${mode}.log) -eq 0 ] && \ + [ $(grep -c "ICMP6, echo request" ${LOG_DIR}/ns1-2_${mode}.log) -eq 0 ] && \ test_pass "$mode IPv6 (no flags) ns1-2" || \ test_fail "$mode IPv6 (no flags) ns1-2" } @@ -176,7 +177,7 @@ do_tests() xdpgeneric) drv_p="-S";; esac - ./xdp_redirect_multi $drv_p $IFACES &> xdp_redirect_${mode}.log & + ./xdp_redirect_multi $drv_p $IFACES &> ${LOG_DIR}/xdp_redirect_${mode}.log & xdp_pid=$! sleep 1 @@ -192,13 +193,13 @@ do_tests() trap clean_up 0 2 3 6 9 check_env -rm -f xdp_redirect_*.log ns*.log mac_ns*.log for mode in ${DRV_MODE}; do setup_ns $mode do_tests $mode clean_up done +rm -rf ${LOG_DIR} echo "Summary: PASS $PASS, FAIL $FAIL" [ $FAIL -eq 0 ] && exit 0 || exit 1 -- cgit v1.2.3 From f53ea9dbf78d42a10e2392b5c59362ccc224fd1d Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Wed, 27 Oct 2021 11:35:51 +0800 Subject: selftests/bpf/xdp_redirect_multi: Use arping to accurate the arp number The arp request number triggered by ping none exist address is not accurate, which may lead the test false negative/positive. Change to use arping to accurate the arp number. Also do not use grep pattern match for dot. Fixes: d23292476297 ("selftests/bpf: Add xdp_redirect_multi test") Suggested-by: Jiri Benc Signed-off-by: Hangbin Liu Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20211027033553.962413-3-liuhangbin@gmail.com --- tools/testing/selftests/bpf/test_xdp_redirect_multi.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/bpf/test_xdp_redirect_multi.sh b/tools/testing/selftests/bpf/test_xdp_redirect_multi.sh index c1653f6d7f77..e14dc41b52f2 100755 --- a/tools/testing/selftests/bpf/test_xdp_redirect_multi.sh +++ b/tools/testing/selftests/bpf/test_xdp_redirect_multi.sh @@ -127,7 +127,7 @@ do_ping_tests() ip netns exec ns3 tcpdump -i veth0 -nn -l -e &> ${LOG_DIR}/ns1-3_${mode}.log & sleep 0.5 # ARP test - ip netns exec ns1 ping 192.0.2.254 -i 0.1 -c 4 &> /dev/null + ip netns exec ns1 arping -q -c 2 -I veth0 192.0.2.254 # IPv4 test ip netns exec ns1 ping 192.0.2.253 -i 0.1 -c 4 &> /dev/null # IPv6 test @@ -136,13 +136,13 @@ do_ping_tests() pkill -9 tcpdump # All netns should receive the redirect arp requests - [ $(grep -c "who-has 192.0.2.254" ${LOG_DIR}/ns1-1_${mode}.log) -gt 4 ] && \ + [ $(grep -cF "who-has 192.0.2.254" ${LOG_DIR}/ns1-1_${mode}.log) -eq 4 ] && \ test_pass "$mode arp(F_BROADCAST) ns1-1" || \ test_fail "$mode arp(F_BROADCAST) ns1-1" - [ $(grep -c "who-has 192.0.2.254" ${LOG_DIR}/ns1-2_${mode}.log) -le 4 ] && \ + [ $(grep -cF "who-has 192.0.2.254" ${LOG_DIR}/ns1-2_${mode}.log) -eq 2 ] && \ test_pass "$mode arp(F_BROADCAST) ns1-2" || \ test_fail "$mode arp(F_BROADCAST) ns1-2" - [ $(grep -c "who-has 192.0.2.254" ${LOG_DIR}/ns1-3_${mode}.log) -le 4 ] && \ + [ $(grep -cF "who-has 192.0.2.254" ${LOG_DIR}/ns1-3_${mode}.log) -eq 2 ] && \ test_pass "$mode arp(F_BROADCAST) ns1-3" || \ test_fail "$mode arp(F_BROADCAST) ns1-3" -- cgit v1.2.3 From 648c3677062fbd14d754b853daebb295426771e8 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Wed, 27 Oct 2021 11:35:52 +0800 Subject: selftests/bpf/xdp_redirect_multi: Give tcpdump a chance to terminate cleanly No need to kill tcpdump with -9. Fixes: d23292476297 ("selftests/bpf: Add xdp_redirect_multi test") Suggested-by: Jiri Benc Signed-off-by: Hangbin Liu Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20211027033553.962413-4-liuhangbin@gmail.com --- tools/testing/selftests/bpf/test_xdp_redirect_multi.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/test_xdp_redirect_multi.sh b/tools/testing/selftests/bpf/test_xdp_redirect_multi.sh index e14dc41b52f2..d4cdb76cdf9e 100755 --- a/tools/testing/selftests/bpf/test_xdp_redirect_multi.sh +++ b/tools/testing/selftests/bpf/test_xdp_redirect_multi.sh @@ -106,7 +106,7 @@ do_egress_tests() sleep 0.5 ip netns exec ns1 ping 192.0.2.254 -i 0.1 -c 4 &> /dev/null sleep 0.5 - pkill -9 tcpdump + pkill tcpdump # mac check grep -q "${veth_mac[2]} > ff:ff:ff:ff:ff:ff" ${LOG_DIR}/mac_ns1-2_${mode}.log && \ @@ -133,7 +133,7 @@ do_ping_tests() # IPv6 test ip netns exec ns1 ping6 2001:db8::2 -i 0.1 -c 2 &> /dev/null sleep 0.5 - pkill -9 tcpdump + pkill tcpdump # All netns should receive the redirect arp requests [ $(grep -cF "who-has 192.0.2.254" ${LOG_DIR}/ns1-1_${mode}.log) -eq 4 ] && \ -- cgit v1.2.3 From 8955c1a329873385775081e029d9a7c6aa9037e1 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Wed, 27 Oct 2021 11:35:53 +0800 Subject: selftests/bpf/xdp_redirect_multi: Limit the tests in netns As I want to test both DEVMAP and DEVMAP_HASH in XDP multicast redirect, I limited DEVMAP max entries to a small value for performace. When the test runs after amount of interface creating/deleting tests. The interface index will exceed the map max entries and xdp_redirect_multi will error out with "Get interfacesInterface index to large". Fix this issue by limit the tests in netns and specify the ifindex when creating interfaces. Fixes: d23292476297 ("selftests/bpf: Add xdp_redirect_multi test") Reported-by: Jiri Benc Signed-off-by: Hangbin Liu Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20211027033553.962413-5-liuhangbin@gmail.com --- .../selftests/bpf/test_xdp_redirect_multi.sh | 23 ++++++++++++++-------- tools/testing/selftests/bpf/xdp_redirect_multi.c | 4 ++-- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/bpf/test_xdp_redirect_multi.sh b/tools/testing/selftests/bpf/test_xdp_redirect_multi.sh index d4cdb76cdf9e..05f872740999 100755 --- a/tools/testing/selftests/bpf/test_xdp_redirect_multi.sh +++ b/tools/testing/selftests/bpf/test_xdp_redirect_multi.sh @@ -2,11 +2,11 @@ # SPDX-License-Identifier: GPL-2.0 # # Test topology: -# - - - - - - - - - - - - - - - - - - - - - - - - - -# | veth1 veth2 veth3 | ... init net +# - - - - - - - - - - - - - - - - - - - +# | veth1 veth2 veth3 | ns0 # - -| - - - - - - | - - - - - - | - - # --------- --------- --------- -# | veth0 | | veth0 | | veth0 | ... +# | veth0 | | veth0 | | veth0 | # --------- --------- --------- # ns1 ns2 ns3 # @@ -51,6 +51,7 @@ clean_up() ip link del veth$i 2> /dev/null ip netns del ns$i 2> /dev/null done + ip netns del ns0 2> /dev/null } # Kselftest framework requirement - SKIP code is 4. @@ -78,10 +79,12 @@ setup_ns() mode="xdpdrv" fi + ip netns add ns0 for i in $(seq $NUM); do ip netns add ns$i - ip link add veth$i type veth peer name veth0 netns ns$i - ip link set veth$i up + ip -n ns$i link add veth0 index 2 type veth \ + peer name veth$i netns ns0 index $((1 + $i)) + ip -n ns0 link set veth$i up ip -n ns$i link set veth0 up ip -n ns$i addr add 192.0.2.$i/24 dev veth0 @@ -92,7 +95,7 @@ setup_ns() xdp_dummy.o sec xdp &> /dev/null || \ { test_fail "Unable to load dummy xdp" && exit 1; } IFACES="$IFACES veth$i" - veth_mac[$i]=$(ip link show veth$i | awk '/link\/ether/ {print $2}') + veth_mac[$i]=$(ip -n ns0 link show veth$i | awk '/link\/ether/ {print $2}') done } @@ -177,9 +180,13 @@ do_tests() xdpgeneric) drv_p="-S";; esac - ./xdp_redirect_multi $drv_p $IFACES &> ${LOG_DIR}/xdp_redirect_${mode}.log & + ip netns exec ns0 ./xdp_redirect_multi $drv_p $IFACES &> ${LOG_DIR}/xdp_redirect_${mode}.log & xdp_pid=$! sleep 1 + if ! ps -p $xdp_pid > /dev/null; then + test_fail "$mode xdp_redirect_multi start failed" + return 1 + fi if [ "$mode" = "xdpegress" ]; then do_egress_tests $mode @@ -190,7 +197,7 @@ do_tests() kill $xdp_pid } -trap clean_up 0 2 3 6 9 +trap clean_up EXIT check_env diff --git a/tools/testing/selftests/bpf/xdp_redirect_multi.c b/tools/testing/selftests/bpf/xdp_redirect_multi.c index 3696a8f32c23..f5ffba341c17 100644 --- a/tools/testing/selftests/bpf/xdp_redirect_multi.c +++ b/tools/testing/selftests/bpf/xdp_redirect_multi.c @@ -129,7 +129,7 @@ int main(int argc, char **argv) goto err_out; } - printf("Get interfaces"); + printf("Get interfaces:"); for (i = 0; i < MAX_IFACE_NUM && argv[optind + i]; i++) { ifaces[i] = if_nametoindex(argv[optind + i]); if (!ifaces[i]) @@ -139,7 +139,7 @@ int main(int argc, char **argv) goto err_out; } if (ifaces[i] > MAX_INDEX_NUM) { - printf("Interface index to large\n"); + printf(" interface index too large\n"); goto err_out; } printf(" %d", ifaces[i]); -- cgit v1.2.3 From f47d4ffe3a84ae11fc4bddc37939b9719467042c Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Wed, 3 Nov 2021 12:54:53 +0100 Subject: riscv, bpf: Fix RV32 broken build, and silence RV64 warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 252c765bd764 ("riscv, bpf: Add BPF exception tables") only addressed RV64, and broke the RV32 build [1]. Fix by gating the exception tables code with CONFIG_ARCH_RV64I. Further, silence a "-Wmissing-prototypes" warning [2] in the RV64 BPF JIT. [1] https://lore.kernel.org/llvm/202111020610.9oy9Rr0G-lkp@intel.com/ [2] https://lore.kernel.org/llvm/202110290334.2zdMyRq4-lkp@intel.com/ Fixes: 252c765bd764 ("riscv, bpf: Add BPF exception tables") Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann Reviewed-by: Tong Tiangen Link: https://lore.kernel.org/bpf/20211103115453.397209-1-bjorn@kernel.org --- arch/riscv/mm/extable.c | 4 ++-- arch/riscv/net/bpf_jit_comp64.c | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/riscv/mm/extable.c b/arch/riscv/mm/extable.c index 18bf338303b6..ddb7d3b99e89 100644 --- a/arch/riscv/mm/extable.c +++ b/arch/riscv/mm/extable.c @@ -11,7 +11,7 @@ #include #include -#ifdef CONFIG_BPF_JIT +#if defined(CONFIG_BPF_JIT) && defined(CONFIG_ARCH_RV64I) int rv_bpf_fixup_exception(const struct exception_table_entry *ex, struct pt_regs *regs); #endif @@ -23,7 +23,7 @@ int fixup_exception(struct pt_regs *regs) if (!fixup) return 0; -#ifdef CONFIG_BPF_JIT +#if defined(CONFIG_BPF_JIT) && defined(CONFIG_ARCH_RV64I) if (regs->epc >= BPF_JIT_REGION_START && regs->epc < BPF_JIT_REGION_END) return rv_bpf_fixup_exception(fixup, regs); #endif diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c index 2ca345c7b0bf..f2a779c7e225 100644 --- a/arch/riscv/net/bpf_jit_comp64.c +++ b/arch/riscv/net/bpf_jit_comp64.c @@ -459,6 +459,8 @@ static int emit_call(bool fixed, u64 addr, struct rv_jit_context *ctx) #define BPF_FIXUP_OFFSET_MASK GENMASK(26, 0) #define BPF_FIXUP_REG_MASK GENMASK(31, 27) +int rv_bpf_fixup_exception(const struct exception_table_entry *ex, + struct pt_regs *regs); int rv_bpf_fixup_exception(const struct exception_table_entry *ex, struct pt_regs *regs) { -- cgit v1.2.3 From 70bf363d7adb3a428773bc905011d0ff923ba747 Mon Sep 17 00:00:00 2001 From: Nghia Le Date: Thu, 4 Nov 2021 21:37:40 +0700 Subject: ipv6: remove useless assignment to newinet in tcp_v6_syn_recv_sock() The newinet value is initialized with inet_sk() in a block code to handle sockets for the ETH_P_IP protocol. Along this code path, newinet is never read. Thus, assignment to newinet is needless and can be removed. Signed-off-by: Nghia Le Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20211104143740.32446-1-nghialm78@gmail.com Signed-off-by: Jakub Kicinski --- net/ipv6/tcp_ipv6.c | 1 - 1 file changed, 1 deletion(-) diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 2cc9b0e53ad1..551fce49841d 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1263,7 +1263,6 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * inet_sk(newsk)->pinet6 = tcp_inet6_sk(newsk); - newinet = inet_sk(newsk); newnp = tcp_inet6_sk(newsk); newtp = tcp_sk(newsk); -- cgit v1.2.3 From c0f49d98006f2db3333b917caac65bce2af9865c Mon Sep 17 00:00:00 2001 From: Zhang Changzhong Date: Thu, 28 Oct 2021 22:38:25 +0800 Subject: can: j1939: j1939_tp_cmd_recv(): ignore abort message in the BAM transport This patch prevents BAM transport from being closed by receiving abort message, as specified in SAE-J1939-82 2015 (A.3.3 Row 4). Fixes: 9d71dd0c7009 ("can: add support of SAE J1939 protocol") Link: https://lore.kernel.org/all/1635431907-15617-2-git-send-email-zhangchangzhong@huawei.com Cc: stable@vger.kernel.org Signed-off-by: Zhang Changzhong Acked-by: Oleksij Rempel Signed-off-by: Marc Kleine-Budde --- net/can/j1939/transport.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c index 6c0a0ebdd024..05eb3d059e17 100644 --- a/net/can/j1939/transport.c +++ b/net/can/j1939/transport.c @@ -2085,6 +2085,12 @@ static void j1939_tp_cmd_recv(struct j1939_priv *priv, struct sk_buff *skb) break; case J1939_ETP_CMD_ABORT: /* && J1939_TP_CMD_ABORT */ + if (j1939_cb_is_broadcast(skcb)) { + netdev_err_once(priv->ndev, "%s: abort to broadcast (%02x), ignoring!\n", + __func__, skcb->addr.sa); + return; + } + if (j1939_tp_im_transmitter(skcb)) j1939_xtp_rx_abort(priv, skb, true); -- cgit v1.2.3 From a79305e156db3d24fcd8eb649cdb3c3b2350e5c2 Mon Sep 17 00:00:00 2001 From: Zhang Changzhong Date: Thu, 28 Oct 2021 22:38:26 +0800 Subject: can: j1939: j1939_can_recv(): ignore messages with invalid source address According to SAE-J1939-82 2015 (A.3.6 Row 2), a receiver should never send TP.CM_CTS to the global address, so we can add a check in j1939_can_recv() to drop messages with invalid source address. Fixes: 9d71dd0c7009 ("can: add support of SAE J1939 protocol") Link: https://lore.kernel.org/all/1635431907-15617-3-git-send-email-zhangchangzhong@huawei.com Cc: stable@vger.kernel.org Signed-off-by: Zhang Changzhong Acked-by: Oleksij Rempel Signed-off-by: Marc Kleine-Budde --- net/can/j1939/main.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/net/can/j1939/main.c b/net/can/j1939/main.c index 9bc55ecb37f9..8452b0fbb78c 100644 --- a/net/can/j1939/main.c +++ b/net/can/j1939/main.c @@ -75,6 +75,13 @@ static void j1939_can_recv(struct sk_buff *iskb, void *data) skcb->addr.pgn = (cf->can_id >> 8) & J1939_PGN_MAX; /* set default message type */ skcb->addr.type = J1939_TP; + + if (!j1939_address_is_valid(skcb->addr.sa)) { + netdev_err_once(priv->ndev, "%s: sa is broadcast address, ignoring!\n", + __func__); + goto done; + } + if (j1939_pgn_is_pdu1(skcb->addr.pgn)) { /* Type 1: with destination address */ skcb->addr.da = skcb->addr.pgn; -- cgit v1.2.3 From 164051a6ab5445bd97f719f50b16db8b32174269 Mon Sep 17 00:00:00 2001 From: Zhang Changzhong Date: Thu, 28 Oct 2021 22:38:27 +0800 Subject: can: j1939: j1939_tp_cmd_recv(): check the dst address of TP.CM_BAM The TP.CM_BAM message must be sent to the global address [1], so add a check to drop TP.CM_BAM sent to a non-global address. Without this patch, the receiver will treat the following packets as normal RTS/CTS transport: 18EC0102#20090002FF002301 18EB0102#0100000000000000 18EB0102#020000FFFFFFFFFF [1] SAE-J1939-82 2015 A.3.3 Row 1. Fixes: 9d71dd0c7009 ("can: add support of SAE J1939 protocol") Link: https://lore.kernel.org/all/1635431907-15617-4-git-send-email-zhangchangzhong@huawei.com Cc: stable@vger.kernel.org Signed-off-by: Zhang Changzhong Acked-by: Oleksij Rempel Signed-off-by: Marc Kleine-Budde --- net/can/j1939/transport.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c index 05eb3d059e17..a271688780a2 100644 --- a/net/can/j1939/transport.c +++ b/net/can/j1939/transport.c @@ -2023,6 +2023,11 @@ static void j1939_tp_cmd_recv(struct j1939_priv *priv, struct sk_buff *skb) extd = J1939_ETP; fallthrough; case J1939_TP_CMD_BAM: + if (cmd == J1939_TP_CMD_BAM && !j1939_cb_is_broadcast(skcb)) { + netdev_err_once(priv->ndev, "%s: BAM to unicast (%02x), ignoring!\n", + __func__, skcb->addr.sa); + return; + } fallthrough; case J1939_TP_CMD_RTS: if (skcb->addr.type != extd) -- cgit v1.2.3 From d9447f768bc8c60623e4bb3ce65b8f4654d33a50 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Wed, 27 Oct 2021 03:07:40 +0900 Subject: can: etas_es58x: es58x_rx_err_msg(): fix memory leak in error path In es58x_rx_err_msg(), if can->do_set_mode() fails, the function directly returns without calling netif_rx(skb). This means that the skb previously allocated by alloc_can_err_skb() is not freed. In other terms, this is a memory leak. This patch simply removes the return statement in the error branch and let the function continue. Issue was found with GCC -fanalyzer, please follow the link below for details. Fixes: 8537257874e9 ("can: etas_es58x: add core support for ETAS ES58X CAN USB interfaces") Link: https://lore.kernel.org/all/20211026180740.1953265-1-mailhol.vincent@wanadoo.fr Signed-off-by: Vincent Mailhol Signed-off-by: Marc Kleine-Budde --- drivers/net/can/usb/etas_es58x/es58x_core.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/net/can/usb/etas_es58x/es58x_core.c b/drivers/net/can/usb/etas_es58x/es58x_core.c index 96a13c770e4a..24627ab14626 100644 --- a/drivers/net/can/usb/etas_es58x/es58x_core.c +++ b/drivers/net/can/usb/etas_es58x/es58x_core.c @@ -664,7 +664,7 @@ int es58x_rx_err_msg(struct net_device *netdev, enum es58x_err error, struct can_device_stats *can_stats = &can->can_stats; struct can_frame *cf = NULL; struct sk_buff *skb; - int ret; + int ret = 0; if (!netif_running(netdev)) { if (net_ratelimit()) @@ -823,8 +823,6 @@ int es58x_rx_err_msg(struct net_device *netdev, enum es58x_err error, can->state = CAN_STATE_BUS_OFF; can_bus_off(netdev); ret = can->do_set_mode(netdev, CAN_MODE_STOP); - if (ret) - return ret; } break; @@ -881,7 +879,7 @@ int es58x_rx_err_msg(struct net_device *netdev, enum es58x_err error, ES58X_EVENT_BUSOFF, timestamp); } - return 0; + return ret; } /** -- cgit v1.2.3 From 3f1c7aa28498e52a5e6aa2f1b89bf35c63352cfd Mon Sep 17 00:00:00 2001 From: Stephane Grosjean Date: Thu, 21 Oct 2021 10:15:04 +0200 Subject: can: peak_usb: always ask for BERR reporting for PCAN-USB devices Since for the PCAN-USB, the management of the transition to the ERROR_WARNING or ERROR_PASSIVE state is done according to the error counters, these must be requested unconditionally. Link: https://lore.kernel.org/all/20211021081505.18223-2-s.grosjean@peak-system.com Fixes: c11dcee75830 ("can: peak_usb: pcan_usb_decode_error(): upgrade handling of bus state changes") Cc: stable@vger.kernel.org Signed-off-by: Stephane Grosjean Signed-off-by: Marc Kleine-Budde --- drivers/net/can/usb/peak_usb/pcan_usb.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/drivers/net/can/usb/peak_usb/pcan_usb.c b/drivers/net/can/usb/peak_usb/pcan_usb.c index 837b3fecd71e..af8d3dadbbb8 100644 --- a/drivers/net/can/usb/peak_usb/pcan_usb.c +++ b/drivers/net/can/usb/peak_usb/pcan_usb.c @@ -841,14 +841,14 @@ static int pcan_usb_start(struct peak_usb_device *dev) pdev->bec.rxerr = 0; pdev->bec.txerr = 0; - /* be notified on error counter changes (if requested by user) */ - if (dev->can.ctrlmode & CAN_CTRLMODE_BERR_REPORTING) { - err = pcan_usb_set_err_frame(dev, PCAN_USB_BERR_MASK); - if (err) - netdev_warn(dev->netdev, - "Asking for BERR reporting error %u\n", - err); - } + /* always ask the device for BERR reporting, to be able to switch from + * WARNING to PASSIVE state + */ + err = pcan_usb_set_err_frame(dev, PCAN_USB_BERR_MASK); + if (err) + netdev_warn(dev->netdev, + "Asking for BERR reporting error %u\n", + err); /* if revision greater than 3, can put silent mode on/off */ if (dev->device_rev > 3) { @@ -986,7 +986,6 @@ const struct peak_usb_adapter pcan_usb = { .device_id = PCAN_USB_PRODUCT_ID, .ctrl_count = 1, .ctrlmode_supported = CAN_CTRLMODE_3_SAMPLES | CAN_CTRLMODE_LISTENONLY | - CAN_CTRLMODE_BERR_REPORTING | CAN_CTRLMODE_CC_LEN8_DLC, .clock = { .freq = PCAN_USB_CRYSTAL_HZ / 2, -- cgit v1.2.3 From 6b78ba3e51f9a2fa5b48eef959acc8b6f02cbf1f Mon Sep 17 00:00:00 2001 From: Stephane Grosjean Date: Thu, 21 Oct 2021 10:15:05 +0200 Subject: can: peak_usb: exchange the order of information messages Proposes the possible update of the PCAN-USB firmware after indicating its name and current version. Link: https://lore.kernel.org/all/20211021081505.18223-3-s.grosjean@peak-system.com Signed-off-by: Stephane Grosjean Signed-off-by: Marc Kleine-Budde --- drivers/net/can/usb/peak_usb/pcan_usb.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/net/can/usb/peak_usb/pcan_usb.c b/drivers/net/can/usb/peak_usb/pcan_usb.c index af8d3dadbbb8..876218752766 100644 --- a/drivers/net/can/usb/peak_usb/pcan_usb.c +++ b/drivers/net/can/usb/peak_usb/pcan_usb.c @@ -883,6 +883,11 @@ static int pcan_usb_init(struct peak_usb_device *dev) return err; } + dev_info(dev->netdev->dev.parent, + "PEAK-System %s adapter hwrev %u serial %08X (%u channel)\n", + pcan_usb.name, dev->device_rev, serial_number, + pcan_usb.ctrl_count); + /* Since rev 4.1, PCAN-USB is able to make single-shot as well as * looped back frames. */ @@ -896,11 +901,6 @@ static int pcan_usb_init(struct peak_usb_device *dev) "Firmware update available. Please contact support@peak-system.com\n"); } - dev_info(dev->netdev->dev.parent, - "PEAK-System %s adapter hwrev %u serial %08X (%u channel)\n", - pcan_usb.name, dev->device_rev, serial_number, - pcan_usb.ctrl_count); - return 0; } -- cgit v1.2.3 From 3990ed4c426652fcd469f8c9dc08156294b36c28 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 5 Nov 2021 18:40:14 -0700 Subject: bpf: Stop caching subprog index in the bpf_pseudo_func insn This patch is to fix an out-of-bound access issue when jit-ing the bpf_pseudo_func insn (i.e. ld_imm64 with src_reg == BPF_PSEUDO_FUNC) In jit_subprog(), it currently reuses the subprog index cached in insn[1].imm. This subprog index is an index into a few array related to subprogs. For example, in jit_subprog(), it is an index to the newly allocated 'struct bpf_prog **func' array. The subprog index was cached in insn[1].imm after add_subprog(). However, this could become outdated (and too big in this case) if some subprogs are completely removed during dead code elimination (in adjust_subprog_starts_after_remove). The cached index in insn[1].imm is not updated accordingly and causing out-of-bound issue in the later jit_subprog(). Unlike bpf_pseudo_'func' insn, the current bpf_pseudo_'call' insn is handling the DCE properly by calling find_subprog(insn->imm) to figure out the index instead of caching the subprog index. The existing bpf_adj_branches() will adjust the insn->imm whenever insn is added or removed. Instead of having two ways handling subprog index, this patch is to make bpf_pseudo_func works more like bpf_pseudo_call. First change is to stop caching the subprog index result in insn[1].imm after add_subprog(). The verification process will use find_subprog(insn->imm) to figure out the subprog index. Second change is in bpf_adj_branches() and have it to adjust the insn->imm for the bpf_pseudo_func insn also whenever insn is added or removed. Third change is in jit_subprog(). Like the bpf_pseudo_call handling, bpf_pseudo_func temporarily stores the find_subprog() result in insn->off. It is fine because the prog's insn has been finalized at this point. insn->off will be reset back to 0 later to avoid confusing the userspace prog dump tool. Fixes: 69c087ba6225 ("bpf: Add bpf_for_each_map_elem() helper") Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211106014014.651018-1-kafai@fb.com --- include/linux/bpf.h | 6 ++++++ kernel/bpf/core.c | 7 +++++++ kernel/bpf/verifier.c | 37 ++++++++++++++----------------------- 3 files changed, 27 insertions(+), 23 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2be6dfd68df9..f715e8863f4d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -484,6 +484,12 @@ bpf_ctx_record_field_size(struct bpf_insn_access_aux *aux, u32 size) aux->ctx_field_size = size; } +static inline bool bpf_pseudo_func(const struct bpf_insn *insn) +{ + return insn->code == (BPF_LD | BPF_IMM | BPF_DW) && + insn->src_reg == BPF_PSEUDO_FUNC; +} + struct bpf_prog_ops { int (*test_run)(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 327e3996eadb..2405e39d800f 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -390,6 +390,13 @@ static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, s32 end_old, i = end_new; insn = prog->insnsi + end_old; } + if (bpf_pseudo_func(insn)) { + ret = bpf_adj_delta_to_imm(insn, pos, end_old, + end_new, i, probe_pass); + if (ret) + return ret; + continue; + } code = insn->code; if ((BPF_CLASS(code) != BPF_JMP && BPF_CLASS(code) != BPF_JMP32) || diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5f8d9128860a..890b3ec375a3 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -240,12 +240,6 @@ static bool bpf_pseudo_kfunc_call(const struct bpf_insn *insn) insn->src_reg == BPF_PSEUDO_KFUNC_CALL; } -static bool bpf_pseudo_func(const struct bpf_insn *insn) -{ - return insn->code == (BPF_LD | BPF_IMM | BPF_DW) && - insn->src_reg == BPF_PSEUDO_FUNC; -} - struct bpf_call_arg_meta { struct bpf_map *map_ptr; bool raw_mode; @@ -1960,16 +1954,10 @@ static int add_subprog_and_kfunc(struct bpf_verifier_env *env) return -EPERM; } - if (bpf_pseudo_func(insn)) { + if (bpf_pseudo_func(insn) || bpf_pseudo_call(insn)) ret = add_subprog(env, i + insn->imm + 1); - if (ret >= 0) - /* remember subprog */ - insn[1].imm = ret; - } else if (bpf_pseudo_call(insn)) { - ret = add_subprog(env, i + insn->imm + 1); - } else { + else ret = add_kfunc_call(env, insn->imm, insn->off); - } if (ret < 0) return ret; @@ -9387,7 +9375,8 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) if (insn->src_reg == BPF_PSEUDO_FUNC) { struct bpf_prog_aux *aux = env->prog->aux; - u32 subprogno = insn[1].imm; + u32 subprogno = find_subprog(env, + env->insn_idx + insn->imm + 1); if (!aux->func_info) { verbose(env, "missing btf func_info\n"); @@ -12557,14 +12546,9 @@ static int jit_subprogs(struct bpf_verifier_env *env) return 0; for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { - if (bpf_pseudo_func(insn)) { - env->insn_aux_data[i].call_imm = insn->imm; - /* subprog is encoded in insn[1].imm */ + if (!bpf_pseudo_func(insn) && !bpf_pseudo_call(insn)) continue; - } - if (!bpf_pseudo_call(insn)) - continue; /* Upon error here we cannot fall back to interpreter but * need a hard reject of the program. Thus -EFAULT is * propagated in any case. @@ -12585,6 +12569,12 @@ static int jit_subprogs(struct bpf_verifier_env *env) env->insn_aux_data[i].call_imm = insn->imm; /* point imm to __bpf_call_base+1 from JITs point of view */ insn->imm = 1; + if (bpf_pseudo_func(insn)) + /* jit (e.g. x86_64) may emit fewer instructions + * if it learns a u32 imm is the same as a u64 imm. + * Force a non zero here. + */ + insn[1].imm = 1; } err = bpf_prog_alloc_jited_linfo(prog); @@ -12669,7 +12659,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) insn = func[i]->insnsi; for (j = 0; j < func[i]->len; j++, insn++) { if (bpf_pseudo_func(insn)) { - subprog = insn[1].imm; + subprog = insn->off; insn[0].imm = (u32)(long)func[subprog]->bpf_func; insn[1].imm = ((u64)(long)func[subprog]->bpf_func) >> 32; continue; @@ -12720,7 +12710,8 @@ static int jit_subprogs(struct bpf_verifier_env *env) for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { if (bpf_pseudo_func(insn)) { insn[0].imm = env->insn_aux_data[i].call_imm; - insn[1].imm = find_subprog(env, i + insn[0].imm + 1); + insn[1].imm = insn->off; + insn->off = 0; continue; } if (!bpf_pseudo_call(insn)) -- cgit v1.2.3 From d99341b373215cf32bfb7f341fb3e720e0e791ef Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 5 Nov 2021 18:40:20 -0700 Subject: bpf: selftest: Trigger a DCE on the whole subprog This patch adds a test to trigger the DCE to remove the whole subprog to ensure the verifier does not depend on a stable subprog index. The DCE is done by testing a global const. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211106014020.651638-1-kafai@fb.com --- tools/testing/selftests/bpf/progs/for_each_array_map_elem.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/for_each_array_map_elem.c b/tools/testing/selftests/bpf/progs/for_each_array_map_elem.c index df918b2469da..52f6995ff29c 100644 --- a/tools/testing/selftests/bpf/progs/for_each_array_map_elem.c +++ b/tools/testing/selftests/bpf/progs/for_each_array_map_elem.c @@ -23,6 +23,16 @@ struct callback_ctx { int output; }; +const volatile int bypass_unused = 1; + +static __u64 +unused_subprog(struct bpf_map *map, __u32 *key, __u64 *val, + struct callback_ctx *data) +{ + data->output = 0; + return 1; +} + static __u64 check_array_elem(struct bpf_map *map, __u32 *key, __u64 *val, struct callback_ctx *data) @@ -54,6 +64,8 @@ int test_pkt_access(struct __sk_buff *skb) data.output = 0; bpf_for_each_map_elem(&arraymap, check_array_elem, &data, 0); + if (!bypass_unused) + bpf_for_each_map_elem(&arraymap, unused_subprog, &data, 0); arraymap_output = data.output; bpf_for_each_map_elem(&percpu_map, check_percpu_elem, (void *)0, 0); -- cgit v1.2.3 From 691204bd66b34ba982e19988e6eba9f6321dfe6c Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Fri, 15 Oct 2021 19:46:59 +0200 Subject: can: mcp251xfd: mcp251xfd_irq(): add missing can_rx_offload_threaded_irq_finish() in case of bus off The function can_rx_offload_threaded_irq_finish() is needed to trigger the NAPI thread to deliver read CAN frames to the networking stack. This patch adds the missing call to can_rx_offload_threaded_irq_finish() in case of a bus off, before leaving the interrupt handler to avoid packet starvation. Link: https://lore.kernel.org/all/20211106201526.44292-1-mkl@pengutronix.de Fixes: 30bfec4fec59 ("can: rx-offload: can_rx_offload_threaded_irq_finish(): add new function to be called from threaded interrupt") Cc: stable@vger.kernel.org Signed-off-by: Marc Kleine-Budde --- drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c index 673861ab665a..212fcd1554e4 100644 --- a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c +++ b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c @@ -2290,8 +2290,10 @@ static irqreturn_t mcp251xfd_irq(int irq, void *dev_id) * check will fail, too. So leave IRQ handler * directly. */ - if (priv->can.state == CAN_STATE_BUS_OFF) + if (priv->can.state == CAN_STATE_BUS_OFF) { + can_rx_offload_threaded_irq_finish(&priv->offload); return IRQ_HANDLED; + } } handled = IRQ_HANDLED; -- cgit v1.2.3 From 69c55f6e7669d46bb40e41f6e2b218428178368a Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Tue, 19 Oct 2021 17:00:04 +0200 Subject: can: mcp251xfd: mcp251xfd_chip_start(): fix error handling for mcp251xfd_chip_rx_int_enable() This patch fixes the error handling for mcp251xfd_chip_rx_int_enable(). Instead just returning the error, properly shut down the chip. Link: https://lore.kernel.org/all/20211106201526.44292-2-mkl@pengutronix.de Fixes: 55e5b97f003e ("can: mcp25xxfd: add driver for Microchip MCP25xxFD SPI CAN") Signed-off-by: Marc Kleine-Budde --- drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c index 212fcd1554e4..e16dc482f327 100644 --- a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c +++ b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c @@ -1092,7 +1092,7 @@ static int mcp251xfd_chip_start(struct mcp251xfd_priv *priv) err = mcp251xfd_chip_rx_int_enable(priv); if (err) - return err; + goto out_chip_stop; err = mcp251xfd_chip_ecc_init(priv); if (err) -- cgit v1.2.3 From 08fcdfa6e3ae394ce44ad27485f9722e7eb4e142 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Sun, 7 Nov 2021 15:14:00 +0100 Subject: nfc: port100: lower verbosity of cancelled URB messages It is not an error to receive an URB with -ENOENT because it can come from regular user operations, e.g. pressing CTRL+C when running nfctool from neard. Make it a debugging message, not an error. Signed-off-by: Krzysztof Kozlowski Signed-off-by: David S. Miller --- drivers/nfc/port100.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/nfc/port100.c b/drivers/nfc/port100.c index 16ceb763594f..d7db1a0e6be1 100644 --- a/drivers/nfc/port100.c +++ b/drivers/nfc/port100.c @@ -624,7 +624,7 @@ static void port100_recv_response(struct urb *urb) break; /* success */ case -ECONNRESET: case -ENOENT: - nfc_err(&dev->interface->dev, + nfc_dbg(&dev->interface->dev, "The urb has been canceled (status %d)\n", urb->status); goto sched_wq; case -ESHUTDOWN: @@ -678,7 +678,7 @@ static void port100_recv_ack(struct urb *urb) break; /* success */ case -ECONNRESET: case -ENOENT: - nfc_err(&dev->interface->dev, + nfc_dbg(&dev->interface->dev, "The urb has been stopped (status %d)\n", urb->status); goto sched_wq; case -ESHUTDOWN: @@ -942,7 +942,7 @@ static void port100_send_complete(struct urb *urb) break; /* success */ case -ECONNRESET: case -ENOENT: - nfc_err(&dev->interface->dev, + nfc_dbg(&dev->interface->dev, "The urb has been stopped (status %d)\n", urb->status); break; case -ESHUTDOWN: -- cgit v1.2.3 From 85879f131d78151847baf29f9557c5be1aa8e066 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 5 Nov 2021 20:14:41 -0700 Subject: net: hisilicon: fix hsn3_ethtool kernel-doc warnings Fix kernel-doc warnings and spacing in hns3_ethtool.c: hns3_ethtool.c:246: warning: No description found for return value of 'hns3_lp_run_test' hns3_ethtool.c:408: warning: expecting prototype for hns3_nic_self_test(). Prototype was for hns3_self_test() instead Signed-off-by: Randy Dunlap Reported-by: kernel test robot Cc: Peng Li Cc: Guangbin Huang Cc: Yisen Zhuang Cc: Salil Mehta Cc: "David S. Miller" Cc: Jakub Kicinski Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c index 5ebd96f6833d..057024278cf4 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c @@ -238,9 +238,11 @@ static void hns3_lb_clear_tx_ring(struct hns3_nic_priv *priv, u32 start_ringid, } /** - * hns3_lp_run_test - run loopback test + * hns3_lp_run_test - run loopback test * @ndev: net device * @mode: loopback type + * + * Return: %0 for success or a NIC loopback test error code on failure */ static int hns3_lp_run_test(struct net_device *ndev, enum hnae3_loop mode) { @@ -398,7 +400,7 @@ static void hns3_do_selftest(struct net_device *ndev, int (*st_param)[2], } /** - * hns3_nic_self_test - self test + * hns3_self_test - self test * @ndev: net device * @eth_test: test cmd * @data: test result -- cgit v1.2.3 From 8ac9dfd58b138f7e82098a4e0a0d46858b12215b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 5 Nov 2021 14:42:14 -0700 Subject: llc: fix out-of-bound array index in llc_sk_dev_hash() Both ifindex and LLC_SK_DEV_HASH_ENTRIES are signed. This means that (ifindex % LLC_SK_DEV_HASH_ENTRIES) is negative if @ifindex is negative. We could simply make LLC_SK_DEV_HASH_ENTRIES unsigned. In this patch I chose to use hash_32() to get more entropy from @ifindex, like llc_sk_laddr_hashfn(). UBSAN: array-index-out-of-bounds in ./include/net/llc.h:75:26 index -43 is out of range for type 'hlist_head [64]' CPU: 1 PID: 20999 Comm: syz-executor.3 Not tainted 5.15.0-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106 ubsan_epilogue+0xb/0x5a lib/ubsan.c:151 __ubsan_handle_out_of_bounds.cold+0x62/0x6c lib/ubsan.c:291 llc_sk_dev_hash include/net/llc.h:75 [inline] llc_sap_add_socket+0x49c/0x520 net/llc/llc_conn.c:697 llc_ui_bind+0x680/0xd70 net/llc/af_llc.c:404 __sys_bind+0x1e9/0x250 net/socket.c:1693 __do_sys_bind net/socket.c:1704 [inline] __se_sys_bind net/socket.c:1702 [inline] __x64_sys_bind+0x6f/0xb0 net/socket.c:1702 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7fa503407ae9 Fixes: 6d2e3ea28446 ("llc: use a device based hash table to speed up multicast delivery") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- include/net/llc.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/net/llc.h b/include/net/llc.h index fd1f9a3fd8dd..e250dca03963 100644 --- a/include/net/llc.h +++ b/include/net/llc.h @@ -72,7 +72,9 @@ struct llc_sap { static inline struct hlist_head *llc_sk_dev_hash(struct llc_sap *sap, int ifindex) { - return &sap->sk_dev_hash[ifindex % LLC_SK_DEV_HASH_ENTRIES]; + u32 bucket = hash_32(ifindex, LLC_SK_DEV_HASH_BITS); + + return &sap->sk_dev_hash[bucket]; } static inline -- cgit v1.2.3 From e7ea51cd879c8214a824717d28a169b5f2262c02 Mon Sep 17 00:00:00 2001 From: Alexey Khoroshilov Date: Fri, 5 Nov 2021 20:30:27 +0300 Subject: sctp: remove unreachable code from sctp_sf_violation_chunk() sctp_sf_violation_chunk() is not called with asoc argument equal to NULL, but if that happens it would lead to NULL pointer dereference in sctp_vtag_verify(). The patch removes code that handles NULL asoc in sctp_sf_violation_chunk(). Found by Linux Verification Center (linuxtesting.org) with SVACE. Signed-off-by: Alexey Khoroshilov Proposed-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/sm_statefuns.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 5fabaa54b77d..39ba82ee87ce 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -4898,9 +4898,6 @@ static enum sctp_disposition sctp_sf_violation_chunk( { static const char err_str[] = "The following chunk violates protocol:"; - if (!asoc) - return sctp_sf_violation(net, ep, asoc, type, arg, commands); - return sctp_sf_abort_violation(net, ep, asoc, arg, commands, err_str, sizeof(err_str)); } -- cgit v1.2.3 From e1464db5c57ef393dde8126f09d2b04d166acf16 Mon Sep 17 00:00:00 2001 From: Volodymyr Mytnyk Date: Fri, 5 Nov 2021 18:49:24 +0200 Subject: net: marvell: prestera: fix hw structure laid out The prestera FW v4.0 support commit has been merged accidentally w/o review comments addressed and waiting for the final patch set to be uploaded. So, fix the remaining comments related to structure laid out and build issues. Reported-by: kernel test robot Fixes: bb5dbf2cc64d ("net: marvell: prestera: add firmware v4.0 support") Signed-off-by: Volodymyr Mytnyk Signed-off-by: David S. Miller --- .../net/ethernet/marvell/prestera/prestera_hw.c | 131 +++++++++++---------- 1 file changed, 68 insertions(+), 63 deletions(-) diff --git a/drivers/net/ethernet/marvell/prestera/prestera_hw.c b/drivers/net/ethernet/marvell/prestera/prestera_hw.c index bc3c9310678a..9b8b1ed474fc 100644 --- a/drivers/net/ethernet/marvell/prestera/prestera_hw.c +++ b/drivers/net/ethernet/marvell/prestera/prestera_hw.c @@ -180,109 +180,113 @@ struct prestera_msg_common_resp { struct prestera_msg_ret ret; }; -union prestera_msg_switch_param { - u8 mac[ETH_ALEN]; - __le32 ageing_timeout_ms; -} __packed; - struct prestera_msg_switch_attr_req { struct prestera_msg_cmd cmd; __le32 attr; - union prestera_msg_switch_param param; - u8 pad[2]; + union { + __le32 ageing_timeout_ms; + struct { + u8 mac[ETH_ALEN]; + u8 __pad[2]; + }; + } param; }; struct prestera_msg_switch_init_resp { struct prestera_msg_ret ret; __le32 port_count; __le32 mtu_max; - u8 switch_id; - u8 lag_max; - u8 lag_member_max; __le32 size_tbl_router_nexthop; -} __packed __aligned(4); + u8 switch_id; + u8 lag_max; + u8 lag_member_max; +}; struct prestera_msg_event_port_param { union { struct { - u8 oper; __le32 mode; __le32 speed; + u8 oper; u8 duplex; u8 fc; u8 fec; - } __packed mac; + } mac; struct { - u8 mdix; __le64 lmode_bmap; + u8 mdix; u8 fc; - } __packed phy; - } __packed; -} __packed __aligned(4); + u8 __pad[2]; + } __packed phy; /* make sure always 12 bytes size */ + }; +}; struct prestera_msg_port_cap_param { __le64 link_mode; - u8 type; - u8 fec; - u8 fc; - u8 transceiver; + u8 type; + u8 fec; + u8 fc; + u8 transceiver; }; struct prestera_msg_port_flood_param { u8 type; u8 enable; + u8 __pad[2]; }; union prestera_msg_port_param { + __le32 mtu; + __le32 speed; + __le32 link_mode; u8 admin_state; u8 oper_state; - __le32 mtu; u8 mac[ETH_ALEN]; u8 accept_frm_type; - __le32 speed; u8 learning; u8 flood; - __le32 link_mode; u8 type; u8 duplex; u8 fec; u8 fc; - union { struct { - u8 admin:1; + u8 admin; u8 fc; u8 ap_enable; + u8 __reserved[5]; union { struct { __le32 mode; - u8 inband:1; __le32 speed; - u8 duplex; - u8 fec; - u8 fec_supp; - } __packed reg_mode; + u8 inband; + u8 duplex; + u8 fec; + u8 fec_supp; + } reg_mode; struct { __le32 mode; __le32 speed; - u8 fec; - u8 fec_supp; - } __packed ap_modes[PRESTERA_AP_PORT_MAX]; - } __packed; - } __packed mac; + u8 fec; + u8 fec_supp; + u8 __pad[2]; + } ap_modes[PRESTERA_AP_PORT_MAX]; + }; + } mac; struct { - u8 admin:1; - u8 adv_enable; __le64 modes; __le32 mode; + u8 admin; + u8 adv_enable; u8 mdix; - } __packed phy; - } __packed link; + u8 __pad; + } phy; + } link; struct prestera_msg_port_cap_param cap; struct prestera_msg_port_flood_param flood_ext; struct prestera_msg_event_port_param link_evt; -} __packed; +}; struct prestera_msg_port_attr_req { struct prestera_msg_cmd cmd; @@ -290,14 +294,12 @@ struct prestera_msg_port_attr_req { __le32 port; __le32 dev; union prestera_msg_port_param param; -} __packed __aligned(4); - +}; struct prestera_msg_port_attr_resp { struct prestera_msg_ret ret; union prestera_msg_port_param param; -} __packed __aligned(4); - +}; struct prestera_msg_port_stats_resp { struct prestera_msg_ret ret; @@ -322,13 +324,13 @@ struct prestera_msg_vlan_req { __le32 port; __le32 dev; __le16 vid; - u8 is_member; - u8 is_tagged; + u8 is_member; + u8 is_tagged; }; struct prestera_msg_fdb_req { struct prestera_msg_cmd cmd; - u8 dest_type; + __le32 flush_mode; union { struct { __le32 port; @@ -336,11 +338,12 @@ struct prestera_msg_fdb_req { }; __le16 lag_id; } dest; - u8 mac[ETH_ALEN]; __le16 vid; - u8 dynamic; - __le32 flush_mode; -} __packed __aligned(4); + u8 dest_type; + u8 dynamic; + u8 mac[ETH_ALEN]; + u8 __pad[2]; +}; struct prestera_msg_bridge_req { struct prestera_msg_cmd cmd; @@ -363,11 +366,12 @@ struct prestera_msg_acl_action { struct prestera_msg_acl_match { __le32 type; + __le32 __reserved; union { struct { u8 key; u8 mask; - } __packed u8; + } u8; struct { __le16 key; __le16 mask; @@ -383,7 +387,7 @@ struct prestera_msg_acl_match { struct { u8 key[ETH_ALEN]; u8 mask[ETH_ALEN]; - } __packed mac; + } mac; } keymask; }; @@ -446,7 +450,8 @@ struct prestera_msg_stp_req { __le32 port; __le32 dev; __le16 vid; - u8 state; + u8 state; + u8 __pad; }; struct prestera_msg_rxtx_req { @@ -497,21 +502,21 @@ union prestera_msg_event_fdb_param { struct prestera_msg_event_fdb { struct prestera_msg_event id; - u8 dest_type; + __le32 vid; union { __le32 port_id; __le16 lag_id; } dest; - __le32 vid; union prestera_msg_event_fdb_param param; -} __packed __aligned(4); + u8 dest_type; +}; -static inline void prestera_hw_build_tests(void) +static void prestera_hw_build_tests(void) { /* check requests */ BUILD_BUG_ON(sizeof(struct prestera_msg_common_req) != 4); BUILD_BUG_ON(sizeof(struct prestera_msg_switch_attr_req) != 16); - BUILD_BUG_ON(sizeof(struct prestera_msg_port_attr_req) != 120); + BUILD_BUG_ON(sizeof(struct prestera_msg_port_attr_req) != 144); BUILD_BUG_ON(sizeof(struct prestera_msg_port_info_req) != 8); BUILD_BUG_ON(sizeof(struct prestera_msg_vlan_req) != 16); BUILD_BUG_ON(sizeof(struct prestera_msg_fdb_req) != 28); @@ -528,7 +533,7 @@ static inline void prestera_hw_build_tests(void) /* check responses */ BUILD_BUG_ON(sizeof(struct prestera_msg_common_resp) != 8); BUILD_BUG_ON(sizeof(struct prestera_msg_switch_init_resp) != 24); - BUILD_BUG_ON(sizeof(struct prestera_msg_port_attr_resp) != 112); + BUILD_BUG_ON(sizeof(struct prestera_msg_port_attr_resp) != 136); BUILD_BUG_ON(sizeof(struct prestera_msg_port_stats_resp) != 248); BUILD_BUG_ON(sizeof(struct prestera_msg_port_info_resp) != 20); BUILD_BUG_ON(sizeof(struct prestera_msg_bridge_resp) != 12); @@ -561,9 +566,9 @@ static int __prestera_cmd_ret(struct prestera_switch *sw, if (err) return err; - if (__le32_to_cpu(ret->cmd.type) != PRESTERA_CMD_TYPE_ACK) + if (ret->cmd.type != __cpu_to_le32(PRESTERA_CMD_TYPE_ACK)) return -EBADE; - if (__le32_to_cpu(ret->status) != PRESTERA_CMD_ACK_OK) + if (ret->status != __cpu_to_le32(PRESTERA_CMD_ACK_OK)) return -EINVAL; return 0; -- cgit v1.2.3 From 62b12ab5dff038ea43b69207b1f42ddc2f0a0b09 Mon Sep 17 00:00:00 2001 From: Anders Roxell Date: Fri, 5 Nov 2021 17:45:11 +0100 Subject: selftests: net: tls: remove unused variable and code When building selftests/net with clang, the compiler warn about the function abs() see below: tls.c:657:15: warning: variable 'len_compared' set but not used [-Wunused-but-set-variable] unsigned int len_compared = 0; ^ Rework to remove the unused variable and the for-loop where the variable 'len_compared' was assinged. Fixes: 7f657d5bf507 ("selftests: tls: add selftests for TLS sockets") Signed-off-by: Anders Roxell Reviewed-by: Nick Desaulniers Signed-off-by: David S. Miller --- tools/testing/selftests/net/tls.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c index d3047e251fe9..e61fc4c32ba2 100644 --- a/tools/testing/selftests/net/tls.c +++ b/tools/testing/selftests/net/tls.c @@ -654,7 +654,6 @@ TEST_F(tls, recvmsg_single_max) TEST_F(tls, recvmsg_multiple) { unsigned int msg_iovlen = 1024; - unsigned int len_compared = 0; struct iovec vec[1024]; char *iov_base[1024]; unsigned int iov_len = 16; @@ -675,8 +674,6 @@ TEST_F(tls, recvmsg_multiple) hdr.msg_iovlen = msg_iovlen; hdr.msg_iov = vec; EXPECT_NE(recvmsg(self->cfd, &hdr, 0), -1); - for (i = 0; i < msg_iovlen; i++) - len_compared += iov_len; for (i = 0; i < msg_iovlen; i++) free(iov_base[i]); -- cgit v1.2.3 From 9fec40f850658e00a14a7dd9e06f7fbc7e59cc4a Mon Sep 17 00:00:00 2001 From: Chengfeng Ye Date: Fri, 5 Nov 2021 06:36:36 -0700 Subject: nfc: pn533: Fix double free when pn533_fill_fragment_skbs() fails skb is already freed by dev_kfree_skb in pn533_fill_fragment_skbs, but follow error handler branch when pn533_fill_fragment_skbs() fails, skb is freed again, results in double free issue. Fix this by not free skb in error path of pn533_fill_fragment_skbs. Fixes: 963a82e07d4e ("NFC: pn533: Split large Tx frames in chunks") Fixes: 93ad42020c2d ("NFC: pn533: Target mode Tx fragmentation support") Signed-off-by: Chengfeng Ye Reviewed-by: Dan Carpenter Reviewed-by: Krzysztof Kozlowski Signed-off-by: David S. Miller --- drivers/nfc/pn533/pn533.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/nfc/pn533/pn533.c b/drivers/nfc/pn533/pn533.c index 787bcbd290f7..a491db46e3bd 100644 --- a/drivers/nfc/pn533/pn533.c +++ b/drivers/nfc/pn533/pn533.c @@ -2216,7 +2216,7 @@ static int pn533_fill_fragment_skbs(struct pn533 *dev, struct sk_buff *skb) frag = pn533_alloc_skb(dev, frag_size); if (!frag) { skb_queue_purge(&dev->fragment_skb); - break; + return -ENOMEM; } if (!dev->tgt_mode) { @@ -2285,7 +2285,7 @@ static int pn533_transceive(struct nfc_dev *nfc_dev, /* jumbo frame ? */ if (skb->len > PN533_CMD_DATAEXCH_DATA_MAXLEN) { rc = pn533_fill_fragment_skbs(dev, skb); - if (rc <= 0) + if (rc < 0) goto error; skb = skb_dequeue(&dev->fragment_skb); @@ -2353,7 +2353,7 @@ static int pn533_tm_send(struct nfc_dev *nfc_dev, struct sk_buff *skb) /* let's split in multiple chunks if size's too big */ if (skb->len > PN533_CMD_DATAEXCH_DATA_MAXLEN) { rc = pn533_fill_fragment_skbs(dev, skb); - if (rc <= 0) + if (rc < 0) goto error; /* get the first skb */ -- cgit v1.2.3 From c45231a7668d6b632534f692b10592ea375b55b0 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 7 Nov 2021 21:13:07 +0100 Subject: litex_liteeth: Fix a double free in the remove function 'netdev' is a managed resource allocated in the probe using 'devm_alloc_etherdev()'. It must not be freed explicitly in the remove function. Fixes: ee7da21ac4c3 ("net: Add driver for LiteX's LiteETH network interface") Signed-off-by: Christophe JAILLET Signed-off-by: David S. Miller --- drivers/net/ethernet/litex/litex_liteeth.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/ethernet/litex/litex_liteeth.c b/drivers/net/ethernet/litex/litex_liteeth.c index ab9fa1525053..fdd99f0de424 100644 --- a/drivers/net/ethernet/litex/litex_liteeth.c +++ b/drivers/net/ethernet/litex/litex_liteeth.c @@ -287,7 +287,6 @@ static int liteeth_remove(struct platform_device *pdev) struct net_device *netdev = platform_get_drvdata(pdev); unregister_netdev(netdev); - free_netdev(netdev); return 0; } -- cgit v1.2.3 From 40a34121ac1dc52ed9cd34a8f4e48e32517a52fd Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 3 Nov 2021 13:47:32 -0700 Subject: bpf, sockmap: Use stricter sk state checks in sk_lookup_assign In order to fix an issue with sockets in TCP sockmap redirect cases we plan to allow CLOSE state sockets to exist in the sockmap. However, the check in bpf_sk_lookup_assign() currently only invalidates sockets in the TCP_ESTABLISHED case relying on the checks on sockmap insert to ensure we never SOCK_CLOSE state sockets in the map. To prepare for this change we flip the logic in bpf_sk_lookup_assign() to explicitly test for the accepted cases. Namely, a tcp socket in TCP_LISTEN or a udp socket in TCP_CLOSE state. This also makes the code more resilent to future changes. Suggested-by: Jakub Sitnicki Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20211103204736.248403-2-john.fastabend@gmail.com --- include/linux/skmsg.h | 12 ++++++++++++ net/core/filter.c | 6 ++++-- net/core/sock_map.c | 6 ------ 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index b4256847c707..584d94be9c8b 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -507,6 +507,18 @@ static inline bool sk_psock_strp_enabled(struct sk_psock *psock) return !!psock->saved_data_ready; } +static inline bool sk_is_tcp(const struct sock *sk) +{ + return sk->sk_type == SOCK_STREAM && + sk->sk_protocol == IPPROTO_TCP; +} + +static inline bool sk_is_udp(const struct sock *sk) +{ + return sk->sk_type == SOCK_DGRAM && + sk->sk_protocol == IPPROTO_UDP; +} + #if IS_ENABLED(CONFIG_NET_SOCK_MSG) #define BPF_F_STRPARSER (1UL << 1) diff --git a/net/core/filter.c b/net/core/filter.c index 8e8d3b49c297..a68418268e92 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -10423,8 +10423,10 @@ BPF_CALL_3(bpf_sk_lookup_assign, struct bpf_sk_lookup_kern *, ctx, return -EINVAL; if (unlikely(sk && sk_is_refcounted(sk))) return -ESOCKTNOSUPPORT; /* reject non-RCU freed sockets */ - if (unlikely(sk && sk->sk_state == TCP_ESTABLISHED)) - return -ESOCKTNOSUPPORT; /* reject connected sockets */ + if (unlikely(sk && sk_is_tcp(sk) && sk->sk_state != TCP_LISTEN)) + return -ESOCKTNOSUPPORT; /* only accept TCP socket in LISTEN */ + if (unlikely(sk && sk_is_udp(sk) && sk->sk_state != TCP_CLOSE)) + return -ESOCKTNOSUPPORT; /* only accept UDP socket in CLOSE */ /* Check if socket is suitable for packet L3/L4 protocol */ if (sk && sk->sk_protocol != ctx->protocol) diff --git a/net/core/sock_map.c b/net/core/sock_map.c index e252b8ec2b85..f39ef79ced67 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -511,12 +511,6 @@ static bool sock_map_op_okay(const struct bpf_sock_ops_kern *ops) ops->op == BPF_SOCK_OPS_TCP_LISTEN_CB; } -static bool sk_is_tcp(const struct sock *sk) -{ - return sk->sk_type == SOCK_STREAM && - sk->sk_protocol == IPPROTO_TCP; -} - static bool sock_map_redirect_allowed(const struct sock *sk) { if (sk_is_tcp(sk)) -- cgit v1.2.3 From b8b8315e39ffaca82e79d86dde26e9144addf66b Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 3 Nov 2021 13:47:33 -0700 Subject: bpf, sockmap: Remove unhash handler for BPF sockmap usage We do not need to handle unhash from BPF side we can simply wait for the close to happen. The original concern was a socket could transition from ESTABLISHED state to a new state while the BPF hook was still attached. But, we convinced ourself this is no longer possible and we also improved BPF sockmap to handle listen sockets so this is no longer a problem. More importantly though there are cases where unhash is called when data is in the receive queue. The BPF unhash logic will flush this data which is wrong. To be correct it should keep the data in the receive queue and allow a receiving application to continue reading the data. This may happen when tcp_abort() is received for example. Instead of complicating the logic in unhash simply moving all this to tcp_close() hook solves this. Fixes: 51199405f9672 ("bpf: skb_verdict, support SK_PASS on RX BPF path") Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Tested-by: Jussi Maki Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20211103204736.248403-3-john.fastabend@gmail.com --- net/ipv4/tcp_bpf.c | 1 - 1 file changed, 1 deletion(-) diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 5f4d6f45d87f..246f725b78c9 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -475,7 +475,6 @@ static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS], struct proto *base) { prot[TCP_BPF_BASE] = *base; - prot[TCP_BPF_BASE].unhash = sock_map_unhash; prot[TCP_BPF_BASE].close = sock_map_close; prot[TCP_BPF_BASE].recvmsg = tcp_bpf_recvmsg; prot[TCP_BPF_BASE].sock_is_readable = sk_msg_is_readable; -- cgit v1.2.3 From c5d2177a72a1659554922728fc407f59950aa929 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 3 Nov 2021 13:47:34 -0700 Subject: bpf, sockmap: Fix race in ingress receive verdict with redirect to self A socket in a sockmap may have different combinations of programs attached depending on configuration. There can be no programs in which case the socket acts as a sink only. There can be a TX program in this case a BPF program is attached to sending side, but no RX program is attached. There can be an RX program only where sends have no BPF program attached, but receives are hooked with BPF. And finally, both TX and RX programs may be attached. Giving us the permutations: None, Tx, Rx, and TxRx To date most of our use cases have been TX case being used as a fast datapath to directly copy between local application and a userspace proxy. Or Rx cases and TxRX applications that are operating an in kernel based proxy. The traffic in the first case where we hook applications into a userspace application looks like this: AppA redirect AppB Tx <-----------> Rx | | + + TCP <--> lo <--> TCP In this case all traffic from AppA (after 3whs) is copied into the AppB ingress queue and no traffic is ever on the TCP recieive_queue. In the second case the application never receives, except in some rare error cases, traffic on the actual user space socket. Instead the send happens in the kernel. AppProxy socket pool sk0 ------------->{sk1,sk2, skn} ^ | | | | v ingress lb egress TCP TCP Here because traffic is never read off the socket with userspace recv() APIs there is only ever one reader on the sk receive_queue. Namely the BPF programs. However, we've started to introduce a third configuration where the BPF program on receive should process the data, but then the normal case is to push the data into the receive queue of AppB. AppB recv() (userspace) ----------------------- tcp_bpf_recvmsg() (kernel) | | | | | | ingress_msgQ | | | RX_BPF | | | v v sk->receive_queue This is different from the App{A,B} redirect because traffic is first received on the sk->receive_queue. Now for the issue. The tcp_bpf_recvmsg() handler first checks the ingress_msg queue for any data handled by the BPF rx program and returned with PASS code so that it was enqueued on the ingress msg queue. Then if no data exists on that queue it checks the socket receive queue. Unfortunately, this is the same receive_queue the BPF program is reading data off of. So we get a race. Its possible for the recvmsg() hook to pull data off the receive_queue before the BPF hook has a chance to read it. It typically happens when an application is banging on recv() and getting EAGAINs. Until they manage to race with the RX BPF program. To fix this we note that before this patch at attach time when the socket is loaded into the map we check if it needs a TX program or just the base set of proto bpf hooks. Then it uses the above general RX hook regardless of if we have a BPF program attached at rx or not. This patch now extends this check to handle all cases enumerated above, TX, RX, TXRX, and none. And to fix above race when an RX program is attached we use a new hook that is nearly identical to the old one except now we do not let the recv() call skip the RX BPF program. Now only the BPF program pulls data from sk->receive_queue and recv() only pulls data from the ingress msgQ post BPF program handling. With this resolved our AppB from above has been up and running for many hours without detecting any errors. We do this by correlating counters in RX BPF events and the AppB to ensure data is never skipping the BPF program. Selftests, was not able to detect this because we only run them for a short period of time on well ordered send/recvs so we don't get any of the noise we see in real application environments. Fixes: 51199405f9672 ("bpf: skb_verdict, support SK_PASS on RX BPF path") Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Tested-by: Jussi Maki Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20211103204736.248403-4-john.fastabend@gmail.com --- net/ipv4/tcp_bpf.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 246f725b78c9..f70aa0932bd6 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -172,6 +172,41 @@ static int tcp_msg_wait_data(struct sock *sk, struct sk_psock *psock, return ret; } +static int tcp_bpf_recvmsg_parser(struct sock *sk, + struct msghdr *msg, + size_t len, + int nonblock, + int flags, + int *addr_len) +{ + struct sk_psock *psock; + int copied; + + if (unlikely(flags & MSG_ERRQUEUE)) + return inet_recv_error(sk, msg, len, addr_len); + + psock = sk_psock_get(sk); + if (unlikely(!psock)) + return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); + + lock_sock(sk); +msg_bytes_ready: + copied = sk_msg_recvmsg(sk, psock, msg, len, flags); + if (!copied) { + long timeo; + int data; + + timeo = sock_rcvtimeo(sk, nonblock); + data = tcp_msg_wait_data(sk, psock, timeo); + if (data && !sk_psock_queue_empty(psock)) + goto msg_bytes_ready; + copied = -EAGAIN; + } + release_sock(sk); + sk_psock_put(sk, psock); + return copied; +} + static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len) { @@ -464,6 +499,8 @@ enum { enum { TCP_BPF_BASE, TCP_BPF_TX, + TCP_BPF_RX, + TCP_BPF_TXRX, TCP_BPF_NUM_CFGS, }; @@ -482,6 +519,12 @@ static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS], prot[TCP_BPF_TX] = prot[TCP_BPF_BASE]; prot[TCP_BPF_TX].sendmsg = tcp_bpf_sendmsg; prot[TCP_BPF_TX].sendpage = tcp_bpf_sendpage; + + prot[TCP_BPF_RX] = prot[TCP_BPF_BASE]; + prot[TCP_BPF_RX].recvmsg = tcp_bpf_recvmsg_parser; + + prot[TCP_BPF_TXRX] = prot[TCP_BPF_TX]; + prot[TCP_BPF_TXRX].recvmsg = tcp_bpf_recvmsg_parser; } static void tcp_bpf_check_v6_needs_rebuild(struct proto *ops) @@ -519,6 +562,10 @@ int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4; int config = psock->progs.msg_parser ? TCP_BPF_TX : TCP_BPF_BASE; + if (psock->progs.stream_verdict || psock->progs.skb_verdict) { + config = (config == TCP_BPF_TX) ? TCP_BPF_TXRX : TCP_BPF_RX; + } + if (restore) { if (inet_csk_has_ulp(sk)) { /* TLS does not have an unhash proto in SW cases, -- cgit v1.2.3 From e0dc3b93bd7bcff8c3813d1df43e0908499c7cf0 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 3 Nov 2021 13:47:35 -0700 Subject: bpf: sockmap, strparser, and tls are reusing qdisc_skb_cb and colliding Strparser is reusing the qdisc_skb_cb struct to stash the skb message handling progress, e.g. offset and length of the skb. First this is poorly named and inherits a struct from qdisc that doesn't reflect the actual usage of cb[] at this layer. But, more importantly strparser is using the following to access its metadata. (struct _strp_msg *)((void *)skb->cb + offsetof(struct qdisc_skb_cb, data)) Where _strp_msg is defined as: struct _strp_msg { struct strp_msg strp; /* 0 8 */ int accum_len; /* 8 4 */ /* size: 12, cachelines: 1, members: 2 */ /* last cacheline: 12 bytes */ }; So we use 12 bytes of ->data[] in struct. However in BPF code running parser and verdict the user has read capabilities into the data[] array as well. Its not too problematic, but we should not be exposing internal state to BPF program. If its really needed then we can use the probe_read() APIs which allow reading kernel memory. And I don't believe cb[] layer poses any API breakage by moving this around because programs can't depend on cb[] across layers. In order to fix another issue with a ctx rewrite we need to stash a temp variable somewhere. To make this work cleanly this patch builds a cb struct for sk_skb types called sk_skb_cb struct. Then we can use this consistently in the strparser, sockmap space. Additionally we can start allowing ->cb[] write access after this. Fixes: 604326b41a6fb ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Tested-by: Jussi Maki Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20211103204736.248403-5-john.fastabend@gmail.com --- include/net/strparser.h | 16 +++++++++++++++- net/core/filter.c | 22 ++++++++++++++++++++++ net/strparser/strparser.c | 10 +--------- 3 files changed, 38 insertions(+), 10 deletions(-) diff --git a/include/net/strparser.h b/include/net/strparser.h index 1d20b98493a1..bec1439bd3be 100644 --- a/include/net/strparser.h +++ b/include/net/strparser.h @@ -54,10 +54,24 @@ struct strp_msg { int offset; }; +struct _strp_msg { + /* Internal cb structure. struct strp_msg must be first for passing + * to upper layer. + */ + struct strp_msg strp; + int accum_len; +}; + +struct sk_skb_cb { +#define SK_SKB_CB_PRIV_LEN 20 + unsigned char data[SK_SKB_CB_PRIV_LEN]; + struct _strp_msg strp; +}; + static inline struct strp_msg *strp_msg(struct sk_buff *skb) { return (struct strp_msg *)((void *)skb->cb + - offsetof(struct qdisc_skb_cb, data)); + offsetof(struct sk_skb_cb, strp)); } /* Structure for an attached lower socket */ diff --git a/net/core/filter.c b/net/core/filter.c index a68418268e92..c3936d0724b8 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -9782,11 +9782,33 @@ static u32 sk_skb_convert_ctx_access(enum bpf_access_type type, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; + int off; switch (si->off) { case offsetof(struct __sk_buff, data_end): insn = bpf_convert_data_end_access(si, insn); break; + case offsetof(struct __sk_buff, cb[0]) ... + offsetofend(struct __sk_buff, cb[4]) - 1: + BUILD_BUG_ON(sizeof_field(struct sk_skb_cb, data) < 20); + BUILD_BUG_ON((offsetof(struct sk_buff, cb) + + offsetof(struct sk_skb_cb, data)) % + sizeof(__u64)); + + prog->cb_access = 1; + off = si->off; + off -= offsetof(struct __sk_buff, cb[0]); + off += offsetof(struct sk_buff, cb); + off += offsetof(struct sk_skb_cb, data); + if (type == BPF_WRITE) + *insn++ = BPF_STX_MEM(BPF_SIZE(si->code), si->dst_reg, + si->src_reg, off); + else + *insn++ = BPF_LDX_MEM(BPF_SIZE(si->code), si->dst_reg, + si->src_reg, off); + break; + + default: return bpf_convert_ctx_access(type, si, insn_buf, prog, target_size); diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c index 9c0343568d2a..1a72c67afed5 100644 --- a/net/strparser/strparser.c +++ b/net/strparser/strparser.c @@ -27,18 +27,10 @@ static struct workqueue_struct *strp_wq; -struct _strp_msg { - /* Internal cb structure. struct strp_msg must be first for passing - * to upper layer. - */ - struct strp_msg strp; - int accum_len; -}; - static inline struct _strp_msg *_strp_msg(struct sk_buff *skb) { return (struct _strp_msg *)((void *)skb->cb + - offsetof(struct qdisc_skb_cb, data)); + offsetof(struct sk_skb_cb, strp)); } /* Lower lock held */ -- cgit v1.2.3 From b2c4618162ec615a15883a804cce7e27afecfa58 Mon Sep 17 00:00:00 2001 From: Jussi Maki Date: Wed, 3 Nov 2021 13:47:36 -0700 Subject: bpf, sockmap: sk_skb data_end access incorrect when src_reg = dst_reg The current conversion of skb->data_end reads like this: ; data_end = (void*)(long)skb->data_end; 559: (79) r1 = *(u64 *)(r2 +200) ; r1 = skb->data 560: (61) r11 = *(u32 *)(r2 +112) ; r11 = skb->len 561: (0f) r1 += r11 562: (61) r11 = *(u32 *)(r2 +116) 563: (1f) r1 -= r11 But similar to the case in 84f44df664e9 ("bpf: sock_ops sk access may stomp registers when dst_reg = src_reg"), the code will read an incorrect skb->len when src == dst. In this case we end up generating this xlated code: ; data_end = (void*)(long)skb->data_end; 559: (79) r1 = *(u64 *)(r1 +200) ; r1 = skb->data 560: (61) r11 = *(u32 *)(r1 +112) ; r11 = (skb->data)->len 561: (0f) r1 += r11 562: (61) r11 = *(u32 *)(r1 +116) 563: (1f) r1 -= r11 ... where line 560 is the reading 4B of (skb->data + 112) instead of the intended skb->len Here the skb pointer in r1 gets set to skb->data and the later deref for skb->len ends up following skb->data instead of skb. This fixes the issue similarly to the patch mentioned above by creating an additional temporary variable and using to store the register when dst_reg = src_reg. We name the variable bpf_temp_reg and place it in the cb context for sk_skb. Then we restore from the temp to ensure nothing is lost. Fixes: 16137b09a66f2 ("bpf: Compute data_end dynamically with JIT code") Signed-off-by: Jussi Maki Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20211103204736.248403-6-john.fastabend@gmail.com --- include/net/strparser.h | 4 ++++ net/core/filter.c | 36 ++++++++++++++++++++++++++++++------ 2 files changed, 34 insertions(+), 6 deletions(-) diff --git a/include/net/strparser.h b/include/net/strparser.h index bec1439bd3be..732b7097d78e 100644 --- a/include/net/strparser.h +++ b/include/net/strparser.h @@ -66,6 +66,10 @@ struct sk_skb_cb { #define SK_SKB_CB_PRIV_LEN 20 unsigned char data[SK_SKB_CB_PRIV_LEN]; struct _strp_msg strp; + /* temp_reg is a temporary register used for bpf_convert_data_end_access + * when dst_reg == src_reg. + */ + u64 temp_reg; }; static inline struct strp_msg *strp_msg(struct sk_buff *skb) diff --git a/net/core/filter.c b/net/core/filter.c index c3936d0724b8..e471c9b09670 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -9756,22 +9756,46 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, static struct bpf_insn *bpf_convert_data_end_access(const struct bpf_insn *si, struct bpf_insn *insn) { - /* si->dst_reg = skb->data */ + int reg; + int temp_reg_off = offsetof(struct sk_buff, cb) + + offsetof(struct sk_skb_cb, temp_reg); + + if (si->src_reg == si->dst_reg) { + /* We need an extra register, choose and save a register. */ + reg = BPF_REG_9; + if (si->src_reg == reg || si->dst_reg == reg) + reg--; + if (si->src_reg == reg || si->dst_reg == reg) + reg--; + *insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, reg, temp_reg_off); + } else { + reg = si->dst_reg; + } + + /* reg = skb->data */ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), - si->dst_reg, si->src_reg, + reg, si->src_reg, offsetof(struct sk_buff, data)); /* AX = skb->len */ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, len), BPF_REG_AX, si->src_reg, offsetof(struct sk_buff, len)); - /* si->dst_reg = skb->data + skb->len */ - *insn++ = BPF_ALU64_REG(BPF_ADD, si->dst_reg, BPF_REG_AX); + /* reg = skb->data + skb->len */ + *insn++ = BPF_ALU64_REG(BPF_ADD, reg, BPF_REG_AX); /* AX = skb->data_len */ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data_len), BPF_REG_AX, si->src_reg, offsetof(struct sk_buff, data_len)); - /* si->dst_reg = skb->data + skb->len - skb->data_len */ - *insn++ = BPF_ALU64_REG(BPF_SUB, si->dst_reg, BPF_REG_AX); + + /* reg = skb->data + skb->len - skb->data_len */ + *insn++ = BPF_ALU64_REG(BPF_SUB, reg, BPF_REG_AX); + + if (si->src_reg == si->dst_reg) { + /* Restore the saved register */ + *insn++ = BPF_MOV64_REG(BPF_REG_AX, si->src_reg); + *insn++ = BPF_MOV64_REG(si->dst_reg, reg); + *insn++ = BPF_LDX_MEM(BPF_DW, reg, BPF_REG_AX, temp_reg_off); + } return insn; } -- cgit v1.2.3 From 54f0bad6686cdc50a3f4c5f7c4252c5018511459 Mon Sep 17 00:00:00 2001 From: Jean Sacren Date: Sun, 7 Nov 2021 23:59:41 -0700 Subject: net: sungem_phy: fix code indentation Remove extra space in front of the return statement. Fixes: eb5b5b2ff96e ("sungem_phy: support bcm5461 phy, autoneg.") Signed-off-by: Jean Sacren Signed-off-by: David S. Miller --- drivers/net/sungem_phy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/sungem_phy.c b/drivers/net/sungem_phy.c index 291fa449993f..4daac5fda073 100644 --- a/drivers/net/sungem_phy.c +++ b/drivers/net/sungem_phy.c @@ -409,7 +409,7 @@ static int genmii_read_link(struct mii_phy *phy) * though magic-aneg shouldn't prevent this case from occurring */ - return 0; + return 0; } static int generic_suspend(struct mii_phy* phy) -- cgit v1.2.3 From 3e0588c291d6ce225f2b891753ca41d45ba42469 Mon Sep 17 00:00:00 2001 From: Lin Ma Date: Mon, 8 Nov 2021 18:37:21 +0800 Subject: hamradio: defer ax25 kfree after unregister_netdev There is a possible race condition (use-after-free) like below (USE) | (FREE) ax25_sendmsg | ax25_queue_xmit | dev_queue_xmit | __dev_queue_xmit | __dev_xmit_skb | sch_direct_xmit | ... xmit_one | netdev_start_xmit | tty_ldisc_kill __netdev_start_xmit | mkiss_close ax_xmit | kfree ax_encaps | | Even though there are two synchronization primitives before the kfree: 1. wait_for_completion(&ax->dead). This can prevent the race with routines from mkiss_ioctl. However, it cannot stop the routine coming from upper layer, i.e., the ax25_sendmsg. 2. netif_stop_queue(ax->dev). It seems that this line of code aims to halt the transmit queue but it fails to stop the routine that already being xmit. This patch reorder the kfree after the unregister_netdev to avoid the possible UAF as the unregister_netdev() is well synchronized and won't return if there is a running routine. Signed-off-by: Lin Ma Signed-off-by: David S. Miller --- drivers/net/hamradio/mkiss.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/net/hamradio/mkiss.c b/drivers/net/hamradio/mkiss.c index 867252a0247b..e2b332b54f06 100644 --- a/drivers/net/hamradio/mkiss.c +++ b/drivers/net/hamradio/mkiss.c @@ -792,13 +792,14 @@ static void mkiss_close(struct tty_struct *tty) */ netif_stop_queue(ax->dev); - /* Free all AX25 frame buffers. */ - kfree(ax->rbuff); - kfree(ax->xbuff); - ax->tty = NULL; unregister_netdev(ax->dev); + + /* Free all AX25 frame buffers after unreg. */ + kfree(ax->rbuff); + kfree(ax->xbuff); + free_netdev(ax->dev); } -- cgit v1.2.3 From 0b9111922b1f399aba6ed1e1b8f2079c3da1aed8 Mon Sep 17 00:00:00 2001 From: Lin Ma Date: Mon, 8 Nov 2021 18:37:59 +0800 Subject: hamradio: defer 6pack kfree after unregister_netdev There is a possible race condition (use-after-free) like below (USE) | (FREE) dev_queue_xmit | __dev_queue_xmit | __dev_xmit_skb | sch_direct_xmit | ... xmit_one | netdev_start_xmit | tty_ldisc_kill __netdev_start_xmit | 6pack_close sp_xmit | kfree sp_encaps | | According to the patch "defer ax25 kfree after unregister_netdev", this patch reorder the kfree after the unregister_netdev to avoid the possible UAF as the unregister_netdev() is well synchronized and won't return if there is a running routine. Signed-off-by: Lin Ma Signed-off-by: David S. Miller --- drivers/net/hamradio/6pack.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/hamradio/6pack.c b/drivers/net/hamradio/6pack.c index f4e8793e995d..fb0a3825edd0 100644 --- a/drivers/net/hamradio/6pack.c +++ b/drivers/net/hamradio/6pack.c @@ -672,11 +672,13 @@ static void sixpack_close(struct tty_struct *tty) del_timer_sync(&sp->tx_t); del_timer_sync(&sp->resync_t); - /* Free all 6pack frame buffers. */ + unregister_netdev(sp->dev); + + /* Free all 6pack frame buffers after unreg. */ kfree(sp->rbuff); kfree(sp->xbuff); - unregister_netdev(sp->dev); + free_netdev(sp->dev); } /* Perform I/O control on an active 6pack channel. */ -- cgit v1.2.3 From 1c360cc1cc883fbdf0a258b4df376571fbeac5ee Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 9 Nov 2021 14:47:36 +0300 Subject: gve: Fix off by one in gve_tx_timeout() The priv->ntfy_blocks[] has "priv->num_ntfy_blks" elements so this > needs to be >= to prevent an off by one bug. The priv->ntfy_blocks[] array is allocated in gve_alloc_notify_blocks(). Fixes: 87a7f321bb6a ("gve: Recover from queue stall due to missed IRQ") Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- drivers/net/ethernet/google/gve/gve_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c index 6b02ef432eda..59b66f679e46 100644 --- a/drivers/net/ethernet/google/gve/gve_main.c +++ b/drivers/net/ethernet/google/gve/gve_main.c @@ -1137,7 +1137,7 @@ static void gve_tx_timeout(struct net_device *dev, unsigned int txqueue) goto reset; ntfy_idx = gve_tx_idx_to_ntfy(priv, txqueue); - if (ntfy_idx > priv->num_ntfy_blks) + if (ntfy_idx >= priv->num_ntfy_blks) goto reset; block = &priv->ntfy_blocks[ntfy_idx]; -- cgit v1.2.3 From 9758aba8542bb43029d077303d05df1d00a8dbb5 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 8 Nov 2021 12:12:24 +0100 Subject: amt: add IPV6 Kconfig dependency This driver cannot be built-in if IPV6 is a loadable module: x86_64-linux-ld: drivers/net/amt.o: in function `amt_build_mld_gq': amt.c:(.text+0x2e7d): undefined reference to `ipv6_dev_get_saddr' Add the idiomatic Kconfig dependency that all such modules have. Fixes: b9022b53adad ("amt: add control plane of amt interface") Signed-off-by: Arnd Bergmann Acked-by: Taehee Yoo Signed-off-by: David S. Miller --- drivers/net/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index 034dbd487c33..10506a4b66ef 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -294,6 +294,7 @@ config GTP config AMT tristate "Automatic Multicast Tunneling (AMT)" depends on INET && IP_MULTICAST + depends on IPV6 || !IPV6 select NET_UDP_TUNNEL help This allows one to create AMT(Automatic Multicast Tunneling) -- cgit v1.2.3 From dc2fc9f03c5c410d8f01c2206b3d529f80b13733 Mon Sep 17 00:00:00 2001 From: Marek Behún Date: Thu, 4 Nov 2021 18:17:47 +0100 Subject: net: dsa: mv88e6xxx: Don't support >1G speeds on 6191X on ports other than 10 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Model 88E6191X only supports >1G speeds on port 10. Port 0 and 9 are only 1G. Fixes: de776d0d316f ("net: dsa: mv88e6xxx: add support for mv88e6393x family") Signed-off-by: Marek Behún Cc: Russell King (Oracle) Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/20211104171747.10509-1-kabel@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/dsa/mv88e6xxx/chip.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index 14c678a9e41b..f00cbf5753b9 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -640,7 +640,10 @@ static void mv88e6393x_phylink_validate(struct mv88e6xxx_chip *chip, int port, unsigned long *mask, struct phylink_link_state *state) { - if (port == 0 || port == 9 || port == 10) { + bool is_6191x = + chip->info->prod_num == MV88E6XXX_PORT_SWITCH_ID_PROD_6191X; + + if (((port == 0 || port == 9) && !is_6191x) || port == 10) { phylink_set(mask, 10000baseT_Full); phylink_set(mask, 10000baseKR_Full); phylink_set(mask, 10000baseCR_Full); -- cgit v1.2.3 From 43aa4937994f39a5ffc1a581b5ca382a1a2a8b1e Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Mon, 8 Nov 2021 14:53:40 +0000 Subject: amt: use cancel_delayed_work() instead of flush_delayed_work() in amt_fini() When the amt module is being removed, it calls flush_delayed_work() to exit source_gc_wq. But it wouldn't be exited properly because the amt_source_gc_work(), which is the callback function of source_gc_wq internally calls mod_delayed_work() again. So, amt_source_gc_work() would be called after the amt module is removed. Therefore kernel panic would occur. In order to avoid it, cancel_delayed_work() should be used instead of flush_delayed_work(). Test commands: modprobe amt modprobe -rv amt Splat looks like: BUG: unable to handle page fault for address: fffffbfff80f50db #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 1237ee067 P4D 1237ee067 PUD 1237b2067 PMD 100c11067 PTE 0 Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC KASAN PTI CPU: 1 PID: 0 Comm: swapper/1 Not tainted 5.15.0+ #27 5a0ebebc29fe5c40c68bea90197606c3a832b09f RIP: 0010:run_timer_softirq+0x221/0xfc0 Code: 00 00 4c 89 e1 4c 8b 30 48 c1 e9 03 80 3c 29 00 0f 85 ed 0b 00 00 4d 89 34 24 4d 85 f6 74 19 49 8d 7e 08 48 89 f9 48 c1 e9 03 <80> 3c 29 00 0f 85 fa 0b 00 00 4d 89 66 08 83 04 24 01 49 89 d4 48 RSP: 0018:ffff888119009e50 EFLAGS: 00010806 RAX: ffff8881191f8a80 RBX: 00000000007ffe2a RCX: 1ffffffff80f50db RDX: ffff888119009ed0 RSI: 0000000000000008 RDI: ffffffffc07a86d8 RBP: dffffc0000000000 R08: ffff8881191f8280 R09: ffffed102323f061 R10: ffff8881191f8307 R11: ffffed102323f060 R12: ffff888119009ec8 R13: 00000000000000c0 R14: ffffffffc07a86d0 R15: ffff8881191f82e8 FS: 0000000000000000(0000) GS:ffff888119000000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: fffffbfff80f50db CR3: 00000001062dc002 CR4: 00000000003706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ? add_timer+0x650/0x650 ? kvm_clock_read+0x14/0x30 ? ktime_get+0xb9/0x180 ? rcu_read_lock_held_common+0xe/0xa0 ? rcu_read_lock_sched_held+0x56/0xc0 ? rcu_read_lock_bh_held+0xa0/0xa0 ? hrtimer_interrupt+0x271/0x790 __do_softirq+0x1d0/0x88f irq_exit_rcu+0xe7/0x120 sysvec_apic_timer_interrupt+0x8a/0xb0 [ ... ] Fixes: bc54e49c140b ("amt: add multicast(IGMP) report message handler") Signed-off-by: Taehee Yoo Link: https://lore.kernel.org/r/20211108145340.17208-1-ap420073@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/amt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/amt.c b/drivers/net/amt.c index c384b2694f9e..47a04c330885 100644 --- a/drivers/net/amt.c +++ b/drivers/net/amt.c @@ -3286,7 +3286,7 @@ static void __exit amt_fini(void) { rtnl_link_unregister(&amt_link_ops); unregister_netdevice_notifier(&amt_notifier_block); - flush_delayed_work(&source_gc_wq); + cancel_delayed_work(&source_gc_wq); __amt_source_gc_work(); destroy_workqueue(amt_wq); } -- cgit v1.2.3 From 6dc25401cba4d428328eade8ceae717633fdd702 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 8 Nov 2021 10:08:15 -0800 Subject: net/sched: sch_taprio: fix undefined behavior in ktime_mono_to_any 1) if q->tk_offset == TK_OFFS_MAX, then get_tcp_tstamp() calls ktime_mono_to_any() with out-of-bound value. 2) if q->tk_offset is changed in taprio_parse_clockid(), taprio_get_time() might also call ktime_mono_to_any() with out-of-bound value as sysbot found: UBSAN: array-index-out-of-bounds in kernel/time/timekeeping.c:908:27 index 3 is out of range for type 'ktime_t *[3]' CPU: 1 PID: 25668 Comm: kworker/u4:0 Not tainted 5.15.0-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Workqueue: bat_events batadv_iv_send_outstanding_bat_ogm_packet Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106 ubsan_epilogue+0xb/0x5a lib/ubsan.c:151 __ubsan_handle_out_of_bounds.cold+0x62/0x6c lib/ubsan.c:291 ktime_mono_to_any+0x1d4/0x1e0 kernel/time/timekeeping.c:908 get_tcp_tstamp net/sched/sch_taprio.c:322 [inline] get_packet_txtime net/sched/sch_taprio.c:353 [inline] taprio_enqueue_one+0x5b0/0x1460 net/sched/sch_taprio.c:420 taprio_enqueue+0x3b1/0x730 net/sched/sch_taprio.c:485 dev_qdisc_enqueue+0x40/0x300 net/core/dev.c:3785 __dev_xmit_skb net/core/dev.c:3869 [inline] __dev_queue_xmit+0x1f6e/0x3630 net/core/dev.c:4194 batadv_send_skb_packet+0x4a9/0x5f0 net/batman-adv/send.c:108 batadv_iv_ogm_send_to_if net/batman-adv/bat_iv_ogm.c:393 [inline] batadv_iv_ogm_emit net/batman-adv/bat_iv_ogm.c:421 [inline] batadv_iv_send_outstanding_bat_ogm_packet+0x6d7/0x8e0 net/batman-adv/bat_iv_ogm.c:1701 process_one_work+0x9b2/0x1690 kernel/workqueue.c:2298 worker_thread+0x658/0x11f0 kernel/workqueue.c:2445 kthread+0x405/0x4f0 kernel/kthread.c:327 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:295 Fixes: 7ede7b03484b ("taprio: make clock reference conversions easier") Fixes: 54002066100b ("taprio: Adjust timestamps for TCP packets") Signed-off-by: Eric Dumazet Cc: Vedang Patel Reported-by: syzbot Reviewed-by: Vinicius Costa Gomes Link: https://lore.kernel.org/r/20211108180815.1822479-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- net/sched/sch_taprio.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 9ab068fa2672..377f896bdedc 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -95,18 +95,22 @@ static ktime_t sched_base_time(const struct sched_gate_list *sched) return ns_to_ktime(sched->base_time); } -static ktime_t taprio_get_time(struct taprio_sched *q) +static ktime_t taprio_mono_to_any(const struct taprio_sched *q, ktime_t mono) { - ktime_t mono = ktime_get(); + /* This pairs with WRITE_ONCE() in taprio_parse_clockid() */ + enum tk_offsets tk_offset = READ_ONCE(q->tk_offset); - switch (q->tk_offset) { + switch (tk_offset) { case TK_OFFS_MAX: return mono; default: - return ktime_mono_to_any(mono, q->tk_offset); + return ktime_mono_to_any(mono, tk_offset); } +} - return KTIME_MAX; +static ktime_t taprio_get_time(const struct taprio_sched *q) +{ + return taprio_mono_to_any(q, ktime_get()); } static void taprio_free_sched_cb(struct rcu_head *head) @@ -319,7 +323,7 @@ static ktime_t get_tcp_tstamp(struct taprio_sched *q, struct sk_buff *skb) return 0; } - return ktime_mono_to_any(skb->skb_mstamp_ns, q->tk_offset); + return taprio_mono_to_any(q, skb->skb_mstamp_ns); } /* There are a few scenarios where we will have to modify the txtime from @@ -1352,6 +1356,7 @@ static int taprio_parse_clockid(struct Qdisc *sch, struct nlattr **tb, } } else if (tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) { int clockid = nla_get_s32(tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]); + enum tk_offsets tk_offset; /* We only support static clockids and we don't allow * for it to be modified after the first init. @@ -1366,22 +1371,24 @@ static int taprio_parse_clockid(struct Qdisc *sch, struct nlattr **tb, switch (clockid) { case CLOCK_REALTIME: - q->tk_offset = TK_OFFS_REAL; + tk_offset = TK_OFFS_REAL; break; case CLOCK_MONOTONIC: - q->tk_offset = TK_OFFS_MAX; + tk_offset = TK_OFFS_MAX; break; case CLOCK_BOOTTIME: - q->tk_offset = TK_OFFS_BOOT; + tk_offset = TK_OFFS_BOOT; break; case CLOCK_TAI: - q->tk_offset = TK_OFFS_TAI; + tk_offset = TK_OFFS_TAI; break; default: NL_SET_ERR_MSG(extack, "Invalid 'clockid'"); err = -EINVAL; goto out; } + /* This pairs with READ_ONCE() in taprio_mono_to_any */ + WRITE_ONCE(q->tk_offset, tk_offset); q->clockid = clockid; } else { -- cgit v1.2.3 From 8f1bc38bbb516826ede8c96cb73a884221f1a314 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 8 Nov 2021 20:18:17 +0000 Subject: net: mana: Fix spelling mistake "calledd" -> "called" There is a spelling mistake in a dev_info message. Fix it. Signed-off-by: Colin Ian King Reviewed-by: Dexuan Cui Link: https://lore.kernel.org/r/20211108201817.43121-1-colin.i.king@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/microsoft/mana/gdma_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c index c96ac81212f7..636dfef24a6c 100644 --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c @@ -1424,7 +1424,7 @@ static void mana_gd_shutdown(struct pci_dev *pdev) { struct gdma_context *gc = pci_get_drvdata(pdev); - dev_info(&pdev->dev, "Shutdown was calledd\n"); + dev_info(&pdev->dev, "Shutdown was called\n"); mana_remove(&gc->mana, true); -- cgit v1.2.3 From 3b4c6566c158e0449d490165c1a64d9e410b3007 Mon Sep 17 00:00:00 2001 From: Guangbin Huang Date: Wed, 10 Nov 2021 21:42:49 +0800 Subject: net: hns3: fix failed to add reuse multicast mac addr to hardware when mc mac table is full Currently, when driver is failed to add a new multicast mac address to hardware due to the multicast mac table is full, it will directly return. In this case, if the multicast mac list has some reuse addresses after the new address, those reuse addresses will never be added to hardware. To fix this problem, if function hclge_add_mc_addr_common() returns -ENOSPC, hclge_sync_vport_mac_list() should judge whether continue or stop to add next address. As function hclge_sync_vport_mac_list() needs parameter mac_type to know whether is uc or mc, refine this function to add parameter mac_type and remove parameter sync. So does function hclge_unsync_vport_mac_list(). Fixes: ee4bcd3b7ae4 ("net: hns3: refactor the MAC address configure") Signed-off-by: Guangbin Huang Signed-off-by: David S. Miller --- .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 50 ++++++++++++---------- 1 file changed, 27 insertions(+), 23 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 2e41aa2d1df8..eb96bea9e3ce 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -8949,8 +8949,11 @@ int hclge_add_mc_addr_common(struct hclge_vport *vport, err_no_space: /* if already overflow, not to print each time */ - if (!(vport->overflow_promisc_flags & HNAE3_OVERFLOW_MPE)) + if (!(vport->overflow_promisc_flags & HNAE3_OVERFLOW_MPE)) { + vport->overflow_promisc_flags |= HNAE3_OVERFLOW_MPE; dev_err(&hdev->pdev->dev, "mc mac vlan table is full\n"); + } + return -ENOSPC; } @@ -9006,12 +9009,17 @@ int hclge_rm_mc_addr_common(struct hclge_vport *vport, static void hclge_sync_vport_mac_list(struct hclge_vport *vport, struct list_head *list, - int (*sync)(struct hclge_vport *, - const unsigned char *)) + enum HCLGE_MAC_ADDR_TYPE mac_type) { + int (*sync)(struct hclge_vport *vport, const unsigned char *addr); struct hclge_mac_node *mac_node, *tmp; int ret; + if (mac_type == HCLGE_MAC_ADDR_UC) + sync = hclge_add_uc_addr_common; + else + sync = hclge_add_mc_addr_common; + list_for_each_entry_safe(mac_node, tmp, list, node) { ret = sync(vport, mac_node->mac_addr); if (!ret) { @@ -9023,8 +9031,13 @@ static void hclge_sync_vport_mac_list(struct hclge_vport *vport, /* If one unicast mac address is existing in hardware, * we need to try whether other unicast mac addresses * are new addresses that can be added. + * Multicast mac address can be reusable, even though + * there is no space to add new multicast mac address, + * we should check whether other mac addresses are + * existing in hardware for reuse. */ - if (ret != -EEXIST) + if ((mac_type == HCLGE_MAC_ADDR_UC && ret != -EEXIST) || + (mac_type == HCLGE_MAC_ADDR_MC && ret != -ENOSPC)) break; } } @@ -9032,12 +9045,17 @@ static void hclge_sync_vport_mac_list(struct hclge_vport *vport, static void hclge_unsync_vport_mac_list(struct hclge_vport *vport, struct list_head *list, - int (*unsync)(struct hclge_vport *, - const unsigned char *)) + enum HCLGE_MAC_ADDR_TYPE mac_type) { + int (*unsync)(struct hclge_vport *vport, const unsigned char *addr); struct hclge_mac_node *mac_node, *tmp; int ret; + if (mac_type == HCLGE_MAC_ADDR_UC) + unsync = hclge_rm_uc_addr_common; + else + unsync = hclge_rm_mc_addr_common; + list_for_each_entry_safe(mac_node, tmp, list, node) { ret = unsync(vport, mac_node->mac_addr); if (!ret || ret == -ENOENT) { @@ -9168,17 +9186,8 @@ stop_traverse: spin_unlock_bh(&vport->mac_list_lock); /* delete first, in order to get max mac table space for adding */ - if (mac_type == HCLGE_MAC_ADDR_UC) { - hclge_unsync_vport_mac_list(vport, &tmp_del_list, - hclge_rm_uc_addr_common); - hclge_sync_vport_mac_list(vport, &tmp_add_list, - hclge_add_uc_addr_common); - } else { - hclge_unsync_vport_mac_list(vport, &tmp_del_list, - hclge_rm_mc_addr_common); - hclge_sync_vport_mac_list(vport, &tmp_add_list, - hclge_add_mc_addr_common); - } + hclge_unsync_vport_mac_list(vport, &tmp_del_list, mac_type); + hclge_sync_vport_mac_list(vport, &tmp_add_list, mac_type); /* if some mac addresses were added/deleted fail, move back to the * mac_list, and retry at next time. @@ -9337,12 +9346,7 @@ static void hclge_uninit_vport_mac_list(struct hclge_vport *vport, spin_unlock_bh(&vport->mac_list_lock); - if (mac_type == HCLGE_MAC_ADDR_UC) - hclge_unsync_vport_mac_list(vport, &tmp_del_list, - hclge_rm_uc_addr_common); - else - hclge_unsync_vport_mac_list(vport, &tmp_del_list, - hclge_rm_mc_addr_common); + hclge_unsync_vport_mac_list(vport, &tmp_del_list, mac_type); if (!list_empty(&tmp_del_list)) dev_warn(&hdev->pdev->dev, -- cgit v1.2.3 From beb27ca451a57a1c0e52b5268703f3c3173c1f8c Mon Sep 17 00:00:00 2001 From: Jie Wang Date: Wed, 10 Nov 2021 21:42:50 +0800 Subject: net: hns3: fix ROCE base interrupt vector initialization bug Currently, NIC init ROCE interrupt vector with MSIX interrupt. But ROCE use pci_irq_vector() to get interrupt vector, which adds the relative interrupt vector again and gets wrong interrupt vector. So fixes it by assign relative interrupt vector to ROCE instead of MSIX interrupt vector and delete the unused struct member base_msi_vector declaration of hclgevf_dev. Fixes: 46a3df9f9718 ("net: hns3: Add HNS3 Acceleration Engine & Compatibility Layer Support") Signed-off-by: Jie Wang Signed-off-by: Guangbin Huang Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 6 +----- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h | 2 -- drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c | 5 +---- drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h | 2 -- 4 files changed, 2 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index eb96bea9e3ce..0fc2b81f4712 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -2581,7 +2581,7 @@ static int hclge_init_roce_base_info(struct hclge_vport *vport) if (hdev->num_msi < hdev->num_nic_msi + hdev->num_roce_msi) return -EINVAL; - roce->rinfo.base_vector = hdev->roce_base_vector; + roce->rinfo.base_vector = hdev->num_nic_msi; roce->rinfo.netdev = nic->kinfo.netdev; roce->rinfo.roce_io_base = hdev->hw.io_base; @@ -2617,10 +2617,6 @@ static int hclge_init_msi(struct hclge_dev *hdev) hdev->num_msi = vectors; hdev->num_msi_left = vectors; - hdev->base_msi_vector = pdev->irq; - hdev->roce_base_vector = hdev->base_msi_vector + - hdev->num_nic_msi; - hdev->vector_status = devm_kcalloc(&pdev->dev, hdev->num_msi, sizeof(u16), GFP_KERNEL); if (!hdev->vector_status) { diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h index 9e1eede599ec..21013776de55 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h @@ -904,12 +904,10 @@ struct hclge_dev { u16 num_msi; u16 num_msi_left; u16 num_msi_used; - u32 base_msi_vector; u16 *vector_status; int *vector_irq; u16 num_nic_msi; /* Num of nic vectors for this PF */ u16 num_roce_msi; /* Num of roce vectors for this PF */ - int roce_base_vector; unsigned long service_timer_period; unsigned long service_timer_previous; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c index 645b2c0011e6..98332dad804d 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c @@ -2557,7 +2557,7 @@ static int hclgevf_init_roce_base_info(struct hclgevf_dev *hdev) hdev->num_msi_left == 0) return -EINVAL; - roce->rinfo.base_vector = hdev->roce_base_vector; + roce->rinfo.base_vector = hdev->roce_base_msix_offset; roce->rinfo.netdev = nic->kinfo.netdev; roce->rinfo.roce_io_base = hdev->hw.io_base; @@ -2823,9 +2823,6 @@ static int hclgevf_init_msi(struct hclgevf_dev *hdev) hdev->num_msi = vectors; hdev->num_msi_left = vectors; - hdev->base_msi_vector = pdev->irq; - hdev->roce_base_vector = pdev->irq + hdev->roce_base_msix_offset; - hdev->vector_status = devm_kcalloc(&pdev->dev, hdev->num_msi, sizeof(u16), GFP_KERNEL); if (!hdev->vector_status) { diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h index 28288d7e3303..4bd922b47501 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h @@ -308,8 +308,6 @@ struct hclgevf_dev { u16 num_nic_msix; /* Num of nic vectors for this VF */ u16 num_roce_msix; /* Num of roce vectors for this VF */ u16 roce_base_msix_offset; - int roce_base_vector; - u32 base_msi_vector; u16 *vector_status; int *vector_irq; -- cgit v1.2.3 From 0b653a81a26d66ffe526a54c2177e24fb1400301 Mon Sep 17 00:00:00 2001 From: Jie Wang Date: Wed, 10 Nov 2021 21:42:51 +0800 Subject: net: hns3: fix pfc packet number incorrect after querying pfc parameters Currently, driver will send command to firmware to query pfc packet number when user uses dcb tool to get pfc parameters. However, the periodic service task will also periodically query and record MAC statistics, including pfc packet number. As the hardware registers of statistics is cleared after reading, it will cause pfc packet number of MAC statistics are not correct after using dcb tool to get pfc parameters. To fix this problem, when user uses dcb tool to get pfc parameters, driver updates MAC statistics firstly and then get pfc packet number from MAC statistics. Fixes: 64fd2300fcc1 ("net: hns3: add support for querying pfc puase packets statistic") Signed-off-by: Jie Wang Signed-off-by: Guangbin Huang Signed-off-by: David S. Miller --- .../net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c | 18 +++--- .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 4 +- .../ethernet/hisilicon/hns3/hns3pf/hclge_main.h | 4 ++ .../net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c | 68 +++++++++++----------- .../net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h | 4 +- 5 files changed, 48 insertions(+), 50 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c index 91cb578f56b8..90013c131e94 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c @@ -286,28 +286,24 @@ err_out: static int hclge_ieee_getpfc(struct hnae3_handle *h, struct ieee_pfc *pfc) { - u64 requests[HNAE3_MAX_TC], indications[HNAE3_MAX_TC]; struct hclge_vport *vport = hclge_get_vport(h); struct hclge_dev *hdev = vport->back; int ret; - u8 i; memset(pfc, 0, sizeof(*pfc)); pfc->pfc_cap = hdev->pfc_max; pfc->pfc_en = hdev->tm_info.pfc_en; - ret = hclge_pfc_tx_stats_get(hdev, requests); - if (ret) + ret = hclge_mac_update_stats(hdev); + if (ret) { + dev_err(&hdev->pdev->dev, + "failed to update MAC stats, ret = %d.\n", ret); return ret; + } - ret = hclge_pfc_rx_stats_get(hdev, indications); - if (ret) - return ret; + hclge_pfc_tx_stats_get(hdev, pfc->requests); + hclge_pfc_rx_stats_get(hdev, pfc->indications); - for (i = 0; i < HCLGE_MAX_TC_NUM; i++) { - pfc->requests[i] = requests[i]; - pfc->indications[i] = indications[i]; - } return 0; } diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 0fc2b81f4712..21aec4e470cf 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -26,8 +26,6 @@ #include "hclge_devlink.h" #define HCLGE_NAME "hclge" -#define HCLGE_STATS_READ(p, offset) (*(u64 *)((u8 *)(p) + (offset))) -#define HCLGE_MAC_STATS_FIELD_OFF(f) (offsetof(struct hclge_mac_stats, f)) #define HCLGE_BUF_SIZE_UNIT 256U #define HCLGE_BUF_MUL_BY 2 @@ -587,7 +585,7 @@ static int hclge_mac_query_reg_num(struct hclge_dev *hdev, u32 *reg_num) return 0; } -static int hclge_mac_update_stats(struct hclge_dev *hdev) +int hclge_mac_update_stats(struct hclge_dev *hdev) { /* The firmware supports the new statistics acquisition method */ if (hdev->ae_dev->dev_specs.mac_stats_num) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h index 21013776de55..3c95c957d1e3 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h @@ -852,6 +852,9 @@ struct hclge_vf_vlan_cfg { (y) = (_k_ ^ ~_v_) & (_k_); \ } while (0) +#define HCLGE_MAC_STATS_FIELD_OFF(f) (offsetof(struct hclge_mac_stats, f)) +#define HCLGE_STATS_READ(p, offset) (*(u64 *)((u8 *)(p) + (offset))) + #define HCLGE_MAC_TNL_LOG_SIZE 8 #define HCLGE_VPORT_NUM 256 struct hclge_dev { @@ -1166,4 +1169,5 @@ void hclge_inform_vf_promisc_info(struct hclge_vport *vport); int hclge_dbg_dump_rst_info(struct hclge_dev *hdev, char *buf, int len); int hclge_push_vf_link_status(struct hclge_vport *vport); int hclge_enable_vport_vlan_filter(struct hclge_vport *vport, bool request_en); +int hclge_mac_update_stats(struct hclge_dev *hdev); #endif diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c index 95074e91a846..a50e2edbf4a0 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c @@ -113,50 +113,50 @@ static int hclge_shaper_para_calc(u32 ir, u8 shaper_level, return 0; } -static int hclge_pfc_stats_get(struct hclge_dev *hdev, - enum hclge_opcode_type opcode, u64 *stats) -{ - struct hclge_desc desc[HCLGE_TM_PFC_PKT_GET_CMD_NUM]; - int ret, i, j; - - if (!(opcode == HCLGE_OPC_QUERY_PFC_RX_PKT_CNT || - opcode == HCLGE_OPC_QUERY_PFC_TX_PKT_CNT)) - return -EINVAL; - - for (i = 0; i < HCLGE_TM_PFC_PKT_GET_CMD_NUM - 1; i++) { - hclge_cmd_setup_basic_desc(&desc[i], opcode, true); - desc[i].flag |= cpu_to_le16(HCLGE_CMD_FLAG_NEXT); - } - - hclge_cmd_setup_basic_desc(&desc[i], opcode, true); +static const u16 hclge_pfc_tx_stats_offset[] = { + HCLGE_MAC_STATS_FIELD_OFF(mac_tx_pfc_pri0_pkt_num), + HCLGE_MAC_STATS_FIELD_OFF(mac_tx_pfc_pri1_pkt_num), + HCLGE_MAC_STATS_FIELD_OFF(mac_tx_pfc_pri2_pkt_num), + HCLGE_MAC_STATS_FIELD_OFF(mac_tx_pfc_pri3_pkt_num), + HCLGE_MAC_STATS_FIELD_OFF(mac_tx_pfc_pri4_pkt_num), + HCLGE_MAC_STATS_FIELD_OFF(mac_tx_pfc_pri5_pkt_num), + HCLGE_MAC_STATS_FIELD_OFF(mac_tx_pfc_pri6_pkt_num), + HCLGE_MAC_STATS_FIELD_OFF(mac_tx_pfc_pri7_pkt_num) +}; - ret = hclge_cmd_send(&hdev->hw, desc, HCLGE_TM_PFC_PKT_GET_CMD_NUM); - if (ret) - return ret; +static const u16 hclge_pfc_rx_stats_offset[] = { + HCLGE_MAC_STATS_FIELD_OFF(mac_rx_pfc_pri0_pkt_num), + HCLGE_MAC_STATS_FIELD_OFF(mac_rx_pfc_pri1_pkt_num), + HCLGE_MAC_STATS_FIELD_OFF(mac_rx_pfc_pri2_pkt_num), + HCLGE_MAC_STATS_FIELD_OFF(mac_rx_pfc_pri3_pkt_num), + HCLGE_MAC_STATS_FIELD_OFF(mac_rx_pfc_pri4_pkt_num), + HCLGE_MAC_STATS_FIELD_OFF(mac_rx_pfc_pri5_pkt_num), + HCLGE_MAC_STATS_FIELD_OFF(mac_rx_pfc_pri6_pkt_num), + HCLGE_MAC_STATS_FIELD_OFF(mac_rx_pfc_pri7_pkt_num) +}; - for (i = 0; i < HCLGE_TM_PFC_PKT_GET_CMD_NUM; i++) { - struct hclge_pfc_stats_cmd *pfc_stats = - (struct hclge_pfc_stats_cmd *)desc[i].data; +static void hclge_pfc_stats_get(struct hclge_dev *hdev, bool tx, u64 *stats) +{ + const u16 *offset; + int i; - for (j = 0; j < HCLGE_TM_PFC_NUM_GET_PER_CMD; j++) { - u32 index = i * HCLGE_TM_PFC_PKT_GET_CMD_NUM + j; + if (tx) + offset = hclge_pfc_tx_stats_offset; + else + offset = hclge_pfc_rx_stats_offset; - if (index < HCLGE_MAX_TC_NUM) - stats[index] = - le64_to_cpu(pfc_stats->pkt_num[j]); - } - } - return 0; + for (i = 0; i < HCLGE_MAX_TC_NUM; i++) + stats[i] = HCLGE_STATS_READ(&hdev->mac_stats, offset[i]); } -int hclge_pfc_rx_stats_get(struct hclge_dev *hdev, u64 *stats) +void hclge_pfc_rx_stats_get(struct hclge_dev *hdev, u64 *stats) { - return hclge_pfc_stats_get(hdev, HCLGE_OPC_QUERY_PFC_RX_PKT_CNT, stats); + hclge_pfc_stats_get(hdev, false, stats); } -int hclge_pfc_tx_stats_get(struct hclge_dev *hdev, u64 *stats) +void hclge_pfc_tx_stats_get(struct hclge_dev *hdev, u64 *stats) { - return hclge_pfc_stats_get(hdev, HCLGE_OPC_QUERY_PFC_TX_PKT_CNT, stats); + hclge_pfc_stats_get(hdev, true, stats); } int hclge_mac_pause_en_cfg(struct hclge_dev *hdev, bool tx, bool rx) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h index 2ee9b795f71d..1db7f40b4525 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h @@ -228,8 +228,8 @@ int hclge_tm_dwrr_cfg(struct hclge_dev *hdev); int hclge_tm_init_hw(struct hclge_dev *hdev, bool init); int hclge_mac_pause_en_cfg(struct hclge_dev *hdev, bool tx, bool rx); int hclge_pause_addr_cfg(struct hclge_dev *hdev, const u8 *mac_addr); -int hclge_pfc_rx_stats_get(struct hclge_dev *hdev, u64 *stats); -int hclge_pfc_tx_stats_get(struct hclge_dev *hdev, u64 *stats); +void hclge_pfc_rx_stats_get(struct hclge_dev *hdev, u64 *stats); +void hclge_pfc_tx_stats_get(struct hclge_dev *hdev, u64 *stats); int hclge_tm_qs_shaper_cfg(struct hclge_vport *vport, int max_tx_rate); int hclge_tm_get_qset_num(struct hclge_dev *hdev, u16 *qset_num); int hclge_tm_get_pri_num(struct hclge_dev *hdev, u8 *pri_num); -- cgit v1.2.3 From 3b6db4a0492beed36545a2bc6075117faecebfe2 Mon Sep 17 00:00:00 2001 From: Yufeng Mo Date: Wed, 10 Nov 2021 21:42:52 +0800 Subject: net: hns3: sync rx ring head in echo common pull When the driver processes rx packets, the head pointer is updated only after the number of received packets reaches 16. However, hardware relies on the head pointer to calculate the number of FBDs. As a result, the hardware calculates the FBD incorrectly. Therefore, the driver proactively updates the head pointer in each common poll to ensure that the number of FBDs calculated by the hardware is correct. Fixes: 68752b24f51a ("net: hns3: schedule the polling again when allocation fails") Signed-off-by: Yufeng Mo Signed-off-by: Guangbin Huang Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 7 +++++ .../net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c | 1 + .../net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h | 1 + .../ethernet/hisilicon/hns3/hns3vf/hclgevf_cmd.c | 32 ++++++++++++++++++++++ .../ethernet/hisilicon/hns3/hns3vf/hclgevf_cmd.h | 9 ++++++ 5 files changed, 50 insertions(+) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index a2b993d62822..9ccebbaa0d69 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -4210,6 +4210,13 @@ int hns3_clean_rx_ring(struct hns3_enet_ring *ring, int budget, } out: + /* sync head pointer before exiting, since hardware will calculate + * FBD number with head pointer + */ + if (unused_count > 0) + failure = failure || + hns3_nic_alloc_rx_buffers(ring, unused_count); + return failure ? budget : recv_pkts; } diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c index c327df9dbac4..c5d5466810bb 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c @@ -483,6 +483,7 @@ static int hclge_firmware_compat_config(struct hclge_dev *hdev, bool en) if (hnae3_dev_phy_imp_supported(hdev)) hnae3_set_bit(compat, HCLGE_PHY_IMP_EN_B, 1); hnae3_set_bit(compat, HCLGE_MAC_STATS_EXT_EN_B, 1); + hnae3_set_bit(compat, HCLGE_SYNC_RX_RING_HEAD_EN_B, 1); req->compat = cpu_to_le32(compat); } diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h index c38b57fc6c6a..d24e59028798 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h @@ -1151,6 +1151,7 @@ struct hclge_query_ppu_pf_other_int_dfx_cmd { #define HCLGE_NCSI_ERROR_REPORT_EN_B 1 #define HCLGE_PHY_IMP_EN_B 2 #define HCLGE_MAC_STATS_EXT_EN_B 3 +#define HCLGE_SYNC_RX_RING_HEAD_EN_B 4 struct hclge_firmware_compat_cmd { __le32 compat; u8 rsv[20]; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_cmd.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_cmd.c index f89bfb352adf..e605c2c5bcce 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_cmd.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_cmd.c @@ -434,8 +434,28 @@ err_csq: return ret; } +static int hclgevf_firmware_compat_config(struct hclgevf_dev *hdev, bool en) +{ + struct hclgevf_firmware_compat_cmd *req; + struct hclgevf_desc desc; + u32 compat = 0; + + hclgevf_cmd_setup_basic_desc(&desc, HCLGEVF_OPC_IMP_COMPAT_CFG, false); + + if (en) { + req = (struct hclgevf_firmware_compat_cmd *)desc.data; + + hnae3_set_bit(compat, HCLGEVF_SYNC_RX_RING_HEAD_EN_B, 1); + + req->compat = cpu_to_le32(compat); + } + + return hclgevf_cmd_send(&hdev->hw, &desc, 1); +} + int hclgevf_cmd_init(struct hclgevf_dev *hdev) { + struct hnae3_ae_dev *ae_dev = pci_get_drvdata(hdev->pdev); int ret; spin_lock_bh(&hdev->hw.cmq.csq.lock); @@ -484,6 +504,17 @@ int hclgevf_cmd_init(struct hclgevf_dev *hdev) hnae3_get_field(hdev->fw_version, HNAE3_FW_VERSION_BYTE0_MASK, HNAE3_FW_VERSION_BYTE0_SHIFT)); + if (ae_dev->dev_version >= HNAE3_DEVICE_VERSION_V3) { + /* ask the firmware to enable some features, driver can work + * without it. + */ + ret = hclgevf_firmware_compat_config(hdev, true); + if (ret) + dev_warn(&hdev->pdev->dev, + "Firmware compatible features not enabled(%d).\n", + ret); + } + return 0; err_cmd_init: @@ -508,6 +539,7 @@ static void hclgevf_cmd_uninit_regs(struct hclgevf_hw *hw) void hclgevf_cmd_uninit(struct hclgevf_dev *hdev) { + hclgevf_firmware_compat_config(hdev, false); set_bit(HCLGEVF_STATE_CMD_DISABLE, &hdev->state); /* wait to ensure that the firmware completes the possible left * over commands. diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_cmd.h b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_cmd.h index 39d0b589c720..edc9e154061a 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_cmd.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_cmd.h @@ -15,6 +15,12 @@ struct hclgevf_hw; struct hclgevf_dev; +#define HCLGEVF_SYNC_RX_RING_HEAD_EN_B 4 +struct hclgevf_firmware_compat_cmd { + __le32 compat; + u8 rsv[20]; +}; + struct hclgevf_desc { __le16 opcode; __le16 flag; @@ -107,6 +113,9 @@ enum hclgevf_opcode_type { HCLGEVF_OPC_RSS_TC_MODE = 0x0D08, /* Mailbox cmd */ HCLGEVF_OPC_MBX_VF_TO_PF = 0x2001, + + /* IMP stats command */ + HCLGEVF_OPC_IMP_COMPAT_CFG = 0x701A, }; #define HCLGEVF_TQP_REG_OFFSET 0x80000 -- cgit v1.2.3 From e140c7983e3054be0652bf914f4454f16c5520b0 Mon Sep 17 00:00:00 2001 From: Yufeng Mo Date: Wed, 10 Nov 2021 21:42:53 +0800 Subject: net: hns3: fix kernel crash when unload VF while it is being reset When fully configure VLANs for a VF, then unload the VF while triggering a reset to PF, will cause a kernel crash because the irq is already uninit. [ 293.177579] ------------[ cut here ]------------ [ 293.183502] kernel BUG at drivers/pci/msi.c:352! [ 293.189547] Internal error: Oops - BUG: 0 [#1] SMP ...... [ 293.390124] Workqueue: hclgevf hclgevf_service_task [hclgevf] [ 293.402627] pstate: 80c00009 (Nzcv daif +PAN +UAO) [ 293.414324] pc : free_msi_irqs+0x19c/0x1b8 [ 293.425429] lr : free_msi_irqs+0x18c/0x1b8 [ 293.436545] sp : ffff00002716fbb0 [ 293.446950] x29: ffff00002716fbb0 x28: 0000000000000000 [ 293.459519] x27: 0000000000000000 x26: ffff45b91ea16b00 [ 293.472183] x25: 0000000000000000 x24: ffffa587b08f4700 [ 293.484717] x23: ffffc591ac30e000 x22: ffffa587b08f8428 [ 293.497190] x21: ffffc591ac30e300 x20: 0000000000000000 [ 293.509594] x19: ffffa58a062a8300 x18: 0000000000000000 [ 293.521949] x17: 0000000000000000 x16: ffff45b91dcc3f48 [ 293.534013] x15: 0000000000000000 x14: 0000000000000000 [ 293.545883] x13: 0000000000000040 x12: 0000000000000228 [ 293.557508] x11: 0000000000000020 x10: 0000000000000040 [ 293.568889] x9 : ffff45b91ea1e190 x8 : ffffc591802d0000 [ 293.580123] x7 : ffffc591802d0148 x6 : 0000000000000120 [ 293.591190] x5 : ffffc591802d0000 x4 : 0000000000000000 [ 293.602015] x3 : 0000000000000000 x2 : 0000000000000000 [ 293.612624] x1 : 00000000000004a4 x0 : ffffa58a1e0c6b80 [ 293.623028] Call trace: [ 293.630340] free_msi_irqs+0x19c/0x1b8 [ 293.638849] pci_disable_msix+0x118/0x140 [ 293.647452] pci_free_irq_vectors+0x20/0x38 [ 293.656081] hclgevf_uninit_msi+0x44/0x58 [hclgevf] [ 293.665309] hclgevf_reset_rebuild+0x1ac/0x2e0 [hclgevf] [ 293.674866] hclgevf_reset+0x358/0x400 [hclgevf] [ 293.683545] hclgevf_reset_service_task+0xd0/0x1b0 [hclgevf] [ 293.693325] hclgevf_service_task+0x4c/0x2e8 [hclgevf] [ 293.702307] process_one_work+0x1b0/0x448 [ 293.710034] worker_thread+0x54/0x468 [ 293.717331] kthread+0x134/0x138 [ 293.724114] ret_from_fork+0x10/0x18 [ 293.731324] Code: f940b000 b4ffff00 a903e7b8 f90017b6 (d4210000) This patch fixes the problem by waiting for the VF reset done while unloading the VF. Fixes: e2cb1dec9779 ("net: hns3: Add HNS3 VF HCL(Hardware Compatibility Layer) Support") Signed-off-by: Yufeng Mo Signed-off-by: Guangbin Huang Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c | 5 +++++ drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h | 2 ++ 2 files changed, 7 insertions(+) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c index 98332dad804d..25c419d40066 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c @@ -3010,7 +3010,10 @@ static void hclgevf_uninit_client_instance(struct hnae3_client *client, /* un-init roce, if it exists */ if (hdev->roce_client) { + while (test_bit(HCLGEVF_STATE_RST_HANDLING, &hdev->state)) + msleep(HCLGEVF_WAIT_RESET_DONE); clear_bit(HCLGEVF_STATE_ROCE_REGISTERED, &hdev->state); + hdev->roce_client->ops->uninit_instance(&hdev->roce, 0); hdev->roce_client = NULL; hdev->roce.client = NULL; @@ -3019,6 +3022,8 @@ static void hclgevf_uninit_client_instance(struct hnae3_client *client, /* un-init nic/unic, if this was not called by roce client */ if (client->ops->uninit_instance && hdev->nic_client && client->type != HNAE3_CLIENT_ROCE) { + while (test_bit(HCLGEVF_STATE_RST_HANDLING, &hdev->state)) + msleep(HCLGEVF_WAIT_RESET_DONE); clear_bit(HCLGEVF_STATE_NIC_REGISTERED, &hdev->state); client->ops->uninit_instance(&hdev->nic, 0); diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h index 4bd922b47501..f6f736c0091c 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h @@ -109,6 +109,8 @@ #define HCLGEVF_VF_RST_ING 0x07008 #define HCLGEVF_VF_RST_ING_BIT BIT(16) +#define HCLGEVF_WAIT_RESET_DONE 100 + #define HCLGEVF_RSS_IND_TBL_SIZE 512 #define HCLGEVF_RSS_SET_BITMAP_MSK 0xffff #define HCLGEVF_RSS_KEY_SIZE 40 -- cgit v1.2.3 From 1122eac19476c5ccf200009d4e4dc9b11458019c Mon Sep 17 00:00:00 2001 From: Guangbin Huang Date: Wed, 10 Nov 2021 21:42:54 +0800 Subject: net: hns3: fix some mac statistics is always 0 in device version V2 When driver queries the register number of mac statistics from firmware, the old firmware runs in device version V2 only returns number of valid registers, not include number of three reserved registers among of them. It cause driver doesn't record the last three data when query mac statistics. To fix this problem, driver never query register number in device version V2 and set it to a fixed value which include three reserved registers. Fixes: c8af2887c941 ("net: hns3: add support pause/pfc durations for mac statistics") Signed-off-by: Guangbin Huang Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 10 ++++++++++ drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h | 2 +- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 21aec4e470cf..de9cadf9b9f3 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -566,6 +566,16 @@ static int hclge_mac_query_reg_num(struct hclge_dev *hdev, u32 *reg_num) struct hclge_desc desc; int ret; + /* Driver needs total register number of both valid registers and + * reserved registers, but the old firmware only returns number + * of valid registers in device V2. To be compatible with these + * devices, driver uses a fixed value. + */ + if (hdev->ae_dev->dev_version == HNAE3_DEVICE_VERSION_V2) { + *reg_num = HCLGE_MAC_STATS_MAX_NUM_V1; + return 0; + } + hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_QUERY_MAC_REG_NUM, true); ret = hclge_cmd_send(&hdev->hw, &desc, 1); if (ret) { diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h index 3c95c957d1e3..ebba603483a0 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h @@ -404,7 +404,7 @@ struct hclge_tm_info { }; /* max number of mac statistics on each version */ -#define HCLGE_MAC_STATS_MAX_NUM_V1 84 +#define HCLGE_MAC_STATS_MAX_NUM_V1 87 #define HCLGE_MAC_STATS_MAX_NUM_V2 105 struct hclge_comm_stats_str { -- cgit v1.2.3 From 91fcc79bff406e30c35dcde9fd8568f2b8712e03 Mon Sep 17 00:00:00 2001 From: Guangbin Huang Date: Wed, 10 Nov 2021 21:42:55 +0800 Subject: net: hns3: remove check VF uc mac exist when set by PF If users set unicast mac address for VFs by PF, they need to guarantee all VFs' address is different. This patch removes the check mac address exist of VFs, for usrs can refresh mac addresses of all VFs directly without need to modify the exist mac address to other value firstly. Fixes: 8e6de441b8e6 ("net: hns3: add support for configuring VF MAC from the host") Signed-off-by: Guangbin Huang Signed-off-by: David S. Miller --- .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 36 ---------------------- 1 file changed, 36 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index de9cadf9b9f3..c2a58101144e 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -9418,36 +9418,6 @@ static int hclge_get_mac_ethertype_cmd_status(struct hclge_dev *hdev, return return_status; } -static bool hclge_check_vf_mac_exist(struct hclge_vport *vport, int vf_idx, - u8 *mac_addr) -{ - struct hclge_mac_vlan_tbl_entry_cmd req; - struct hclge_dev *hdev = vport->back; - struct hclge_desc desc; - u16 egress_port = 0; - int i; - - if (is_zero_ether_addr(mac_addr)) - return false; - - memset(&req, 0, sizeof(req)); - hnae3_set_field(egress_port, HCLGE_MAC_EPORT_VFID_M, - HCLGE_MAC_EPORT_VFID_S, vport->vport_id); - req.egress_port = cpu_to_le16(egress_port); - hclge_prepare_mac_addr(&req, mac_addr, false); - - if (hclge_lookup_mac_vlan_tbl(vport, &req, &desc, false) != -ENOENT) - return true; - - vf_idx += HCLGE_VF_VPORT_START_NUM; - for (i = HCLGE_VF_VPORT_START_NUM; i < hdev->num_alloc_vport; i++) - if (i != vf_idx && - ether_addr_equal(mac_addr, hdev->vport[i].vf_info.mac)) - return true; - - return false; -} - static int hclge_set_vf_mac(struct hnae3_handle *handle, int vf, u8 *mac_addr) { @@ -9465,12 +9435,6 @@ static int hclge_set_vf_mac(struct hnae3_handle *handle, int vf, return 0; } - if (hclge_check_vf_mac_exist(vport, vf, mac_addr)) { - dev_err(&hdev->pdev->dev, "Specified MAC(=%pM) exists!\n", - mac_addr); - return -EEXIST; - } - ether_addr_copy(vport->vf_info.mac, mac_addr); if (test_bit(HCLGE_VPORT_STATE_ALIVE, &vport->state)) { -- cgit v1.2.3 From 688db0c7a4a69ddc8b8143a1cac01eb20082a3aa Mon Sep 17 00:00:00 2001 From: Guangbin Huang Date: Wed, 10 Nov 2021 21:42:56 +0800 Subject: net: hns3: allow configure ETS bandwidth of all TCs Currently, driver only allow configuring ETS bandwidth of TCs according to the max TC number queried from firmware. However, the hardware actually supports 8 TCs and users may need to configure ETS bandwidth of all TCs, so remove the restriction. Fixes: 330baff5423b ("net: hns3: add ETS TC weight setting in SSU module") Signed-off-by: Guangbin Huang Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c | 2 +- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c | 9 +-------- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c index 90013c131e94..375ebf105a9a 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c @@ -129,7 +129,7 @@ static int hclge_ets_sch_mode_validate(struct hclge_dev *hdev, u32 total_ets_bw = 0; u8 i; - for (i = 0; i < hdev->tc_max; i++) { + for (i = 0; i < HNAE3_MAX_TC; i++) { switch (ets->tc_tsa[i]) { case IEEE_8021QAZ_TSA_STRICT: if (hdev->tm_info.tc_info[i].tc_sch_mode != diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c index a50e2edbf4a0..429652a8cde1 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c @@ -1123,7 +1123,6 @@ static int hclge_tm_pri_tc_base_dwrr_cfg(struct hclge_dev *hdev) static int hclge_tm_ets_tc_dwrr_cfg(struct hclge_dev *hdev) { -#define DEFAULT_TC_WEIGHT 1 #define DEFAULT_TC_OFFSET 14 struct hclge_ets_tc_weight_cmd *ets_weight; @@ -1136,13 +1135,7 @@ static int hclge_tm_ets_tc_dwrr_cfg(struct hclge_dev *hdev) for (i = 0; i < HNAE3_MAX_TC; i++) { struct hclge_pg_info *pg_info; - ets_weight->tc_weight[i] = DEFAULT_TC_WEIGHT; - - if (!(hdev->hw_tc_map & BIT(i))) - continue; - - pg_info = - &hdev->tm_info.pg_info[hdev->tm_info.tc_info[i].pgid]; + pg_info = &hdev->tm_info.pg_info[hdev->tm_info.tc_info[i].pgid]; ets_weight->tc_weight[i] = pg_info->tc_dwrr[i]; } -- cgit v1.2.3 From e7e4785fa30f9b5d1b60ed2d8e221891325dfc5f Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Fri, 5 Nov 2021 16:55:29 +0100 Subject: selftests: net: test_vxlan_under_vrf: fix HV connectivity test It looks like test_vxlan_under_vrf.sh is always failing to verify the connectivity test during the ping between the two simulated VMs. This is due to the fact that veth-hv in each VM should have a distinct MAC address. Fix by setting a unique MAC address on each simulated VM interface. Without this fix: $ sudo ./tools/testing/selftests/net/test_vxlan_under_vrf.sh Checking HV connectivity [ OK ] Check VM connectivity through VXLAN (underlay in the default VRF) [FAIL] With this fix applied: $ sudo ./tools/testing/selftests/net/test_vxlan_under_vrf.sh Checking HV connectivity [ OK ] Check VM connectivity through VXLAN (underlay in the default VRF) [ OK ] Check VM connectivity through VXLAN (underlay in a VRF) [FAIL] NOTE: the connectivity test with the underlay VRF is still failing; it seems that ARP requests are blocked at the simulated hypervisor level, probably due to some missing ARP forwarding rules. This requires more investigation (in the meantime we may consider to set that test as expected failure - XFAIL). Signed-off-by: Andrea Righi Signed-off-by: David S. Miller --- tools/testing/selftests/net/test_vxlan_under_vrf.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/net/test_vxlan_under_vrf.sh b/tools/testing/selftests/net/test_vxlan_under_vrf.sh index 534c8b7699ab..ea5a7a808f12 100755 --- a/tools/testing/selftests/net/test_vxlan_under_vrf.sh +++ b/tools/testing/selftests/net/test_vxlan_under_vrf.sh @@ -101,6 +101,8 @@ setup-vm() { ip -netns hv-$id link set veth-tap master br0 ip -netns hv-$id link set veth-tap up + ip link set veth-hv address 02:1d:8d:dd:0c:6$id + ip link set veth-hv netns vm-$id ip -netns vm-$id addr add 10.0.0.$id/24 dev veth-hv ip -netns vm-$id link set veth-hv up -- cgit v1.2.3 From f64ab8e4f368f48afb08ae91928e103d17b235e9 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 8 Nov 2021 22:28:54 +0200 Subject: net: stmmac: allow a tc-taprio base-time of zero Commit fe28c53ed71d ("net: stmmac: fix taprio configuration when base_time is in the past") allowed some base time values in the past, but apparently not all, the base-time value of 0 (Jan 1st 1970) is still explicitly denied by the driver. Remove the bogus check. Fixes: b60189e0392f ("net: stmmac: Integrate EST with TAPRIO scheduler API") Signed-off-by: Vladimir Oltean Reviewed-by: Kurt Kanzenbach Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c index 8160087ee92f..1c4ea0b1b845 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c @@ -786,8 +786,6 @@ static int tc_setup_taprio(struct stmmac_priv *priv, goto disable; if (qopt->num_entries >= dep) return -EINVAL; - if (!qopt->base_time) - return -ERANGE; if (!qopt->cycle_time) return -ERANGE; -- cgit v1.2.3 From 7a166854b4e24c57d56b3eba9fe1594985ee0a2c Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Mon, 8 Nov 2021 22:28:55 +0100 Subject: net: ethernet: ti: cpsw_ale: Fix access to un-initialized memory It is spurious to allocate a bitmap without initializing it. So, better safe than sorry, initialize it to 0 at least to have some known values. While at it, switch to the devm_bitmap_ API which is less verbose. Fixes: 4b41d3436796 ("net: ethernet: ti: cpsw: allow untagged traffic on host port") Signed-off-by: Christophe JAILLET Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/cpsw_ale.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/ti/cpsw_ale.c b/drivers/net/ethernet/ti/cpsw_ale.c index 0c75e0576ee1..1ef0aaef5c61 100644 --- a/drivers/net/ethernet/ti/cpsw_ale.c +++ b/drivers/net/ethernet/ti/cpsw_ale.c @@ -1299,10 +1299,8 @@ struct cpsw_ale *cpsw_ale_create(struct cpsw_ale_params *params) if (!ale) return ERR_PTR(-ENOMEM); - ale->p0_untag_vid_mask = - devm_kmalloc_array(params->dev, BITS_TO_LONGS(VLAN_N_VID), - sizeof(unsigned long), - GFP_KERNEL); + ale->p0_untag_vid_mask = devm_bitmap_zalloc(params->dev, VLAN_N_VID, + GFP_KERNEL); if (!ale->p0_untag_vid_mask) return ERR_PTR(-ENOMEM); -- cgit v1.2.3 From bb7bbb6e36474933540c24ae1f1ad651b843981f Mon Sep 17 00:00:00 2001 From: Marek Behún Date: Mon, 8 Nov 2021 22:49:18 +0100 Subject: net: marvell: mvpp2: Fix wrong SerDes reconfiguration order MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit bfe301ebbc94 ("net: mvpp2: convert to use mac_prepare()/mac_finish()") introduced a bug wherein it leaves the MAC RESET register asserted after mac_finish(), due to wrong order of function calls. Before it was: .mac_config() mvpp22_mode_reconfigure() assert reset mvpp2_xlg_config() deassert reset Now it is: .mac_prepare() .mac_config() mvpp2_xlg_config() deassert reset .mac_finish() mvpp2_xlg_config() assert reset Obviously this is wrong. This bug is triggered when phylink tries to change the PHY interface mode from a GMAC mode (sgmii, 1000base-x, 2500base-x) to XLG mode (10gbase-r, xaui). The XLG mode does not work since reset is left asserted. Only after ifconfig down && ifconfig up is called will the XLG mode work. Move the call to mvpp22_mode_reconfigure() to .mac_prepare() implementation. Since some of the subsequent functions need to know whether the interface is being changed, we unfortunately also need to pass around the new interface mode before setting port->phy_interface. Fixes: bfe301ebbc94 ("net: mvpp2: convert to use mac_prepare()/mac_finish()") Signed-off-by: Marek Behún Signed-off-by: Russell King (Oracle) Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c | 38 +++++++++++++------------ 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c index 587def69a6f7..2b18d89d9756 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c @@ -1605,7 +1605,7 @@ static void mvpp22_gop_fca_set_periodic_timer(struct mvpp2_port *port) mvpp22_gop_fca_enable_periodic(port, true); } -static int mvpp22_gop_init(struct mvpp2_port *port) +static int mvpp22_gop_init(struct mvpp2_port *port, phy_interface_t interface) { struct mvpp2 *priv = port->priv; u32 val; @@ -1613,7 +1613,7 @@ static int mvpp22_gop_init(struct mvpp2_port *port) if (!priv->sysctrl_base) return 0; - switch (port->phy_interface) { + switch (interface) { case PHY_INTERFACE_MODE_RGMII: case PHY_INTERFACE_MODE_RGMII_ID: case PHY_INTERFACE_MODE_RGMII_RXID: @@ -1743,15 +1743,15 @@ static void mvpp22_gop_setup_irq(struct mvpp2_port *port) * lanes by the physical layer. This is why configurations like * "PPv2 (2500BaseX) - COMPHY (2500SGMII)" are valid. */ -static int mvpp22_comphy_init(struct mvpp2_port *port) +static int mvpp22_comphy_init(struct mvpp2_port *port, + phy_interface_t interface) { int ret; if (!port->comphy) return 0; - ret = phy_set_mode_ext(port->comphy, PHY_MODE_ETHERNET, - port->phy_interface); + ret = phy_set_mode_ext(port->comphy, PHY_MODE_ETHERNET, interface); if (ret) return ret; @@ -2172,7 +2172,8 @@ static void mvpp22_pcs_reset_assert(struct mvpp2_port *port) writel(val & ~MVPP22_XPCS_CFG0_RESET_DIS, xpcs + MVPP22_XPCS_CFG0); } -static void mvpp22_pcs_reset_deassert(struct mvpp2_port *port) +static void mvpp22_pcs_reset_deassert(struct mvpp2_port *port, + phy_interface_t interface) { struct mvpp2 *priv = port->priv; void __iomem *mpcs, *xpcs; @@ -2184,7 +2185,7 @@ static void mvpp22_pcs_reset_deassert(struct mvpp2_port *port) mpcs = priv->iface_base + MVPP22_MPCS_BASE(port->gop_id); xpcs = priv->iface_base + MVPP22_XPCS_BASE(port->gop_id); - switch (port->phy_interface) { + switch (interface) { case PHY_INTERFACE_MODE_10GBASER: val = readl(mpcs + MVPP22_MPCS_CLK_RESET); val |= MAC_CLK_RESET_MAC | MAC_CLK_RESET_SD_RX | @@ -4529,7 +4530,8 @@ static int mvpp2_poll(struct napi_struct *napi, int budget) return rx_done; } -static void mvpp22_mode_reconfigure(struct mvpp2_port *port) +static void mvpp22_mode_reconfigure(struct mvpp2_port *port, + phy_interface_t interface) { u32 ctrl3; @@ -4540,18 +4542,18 @@ static void mvpp22_mode_reconfigure(struct mvpp2_port *port) mvpp22_pcs_reset_assert(port); /* comphy reconfiguration */ - mvpp22_comphy_init(port); + mvpp22_comphy_init(port, interface); /* gop reconfiguration */ - mvpp22_gop_init(port); + mvpp22_gop_init(port, interface); - mvpp22_pcs_reset_deassert(port); + mvpp22_pcs_reset_deassert(port, interface); if (mvpp2_port_supports_xlg(port)) { ctrl3 = readl(port->base + MVPP22_XLG_CTRL3_REG); ctrl3 &= ~MVPP22_XLG_CTRL3_MACMODESELECT_MASK; - if (mvpp2_is_xlg(port->phy_interface)) + if (mvpp2_is_xlg(interface)) ctrl3 |= MVPP22_XLG_CTRL3_MACMODESELECT_10G; else ctrl3 |= MVPP22_XLG_CTRL3_MACMODESELECT_GMAC; @@ -4559,7 +4561,7 @@ static void mvpp22_mode_reconfigure(struct mvpp2_port *port) writel(ctrl3, port->base + MVPP22_XLG_CTRL3_REG); } - if (mvpp2_port_supports_xlg(port) && mvpp2_is_xlg(port->phy_interface)) + if (mvpp2_port_supports_xlg(port) && mvpp2_is_xlg(interface)) mvpp2_xlg_max_rx_size_set(port); else mvpp2_gmac_max_rx_size_set(port); @@ -4579,7 +4581,7 @@ static void mvpp2_start_dev(struct mvpp2_port *port) mvpp2_interrupts_enable(port); if (port->priv->hw_version >= MVPP22) - mvpp22_mode_reconfigure(port); + mvpp22_mode_reconfigure(port, port->phy_interface); if (port->phylink) { phylink_start(port->phylink); @@ -6444,6 +6446,9 @@ static int mvpp2__mac_prepare(struct phylink_config *config, unsigned int mode, mvpp22_gop_mask_irq(port); phy_power_off(port->comphy); + + /* Reconfigure the serdes lanes */ + mvpp22_mode_reconfigure(port, interface); } } @@ -6498,9 +6503,6 @@ static int mvpp2_mac_finish(struct phylink_config *config, unsigned int mode, port->phy_interface != interface) { port->phy_interface = interface; - /* Reconfigure the serdes lanes */ - mvpp22_mode_reconfigure(port); - /* Unmask interrupts */ mvpp22_gop_unmask_irq(port); } @@ -6961,7 +6963,7 @@ static int mvpp2_port_probe(struct platform_device *pdev, * driver does this, we can remove this code. */ if (port->comphy) { - err = mvpp22_comphy_init(port); + err = mvpp22_comphy_init(port, port->phy_interface); if (err == 0) phy_power_off(port->comphy); } -- cgit v1.2.3 From c7cd82b90599fa10915f41e3dd9098a77d0aa7b6 Mon Sep 17 00:00:00 2001 From: Eiichi Tsukata Date: Tue, 9 Nov 2021 00:15:02 +0000 Subject: vsock: prevent unnecessary refcnt inc for nonblocking connect Currently vosck_connect() increments sock refcount for nonblocking socket each time it's called, which can lead to memory leak if it's called multiple times because connect timeout function decrements sock refcount only once. Fixes it by making vsock_connect() return -EALREADY immediately when sock state is already SS_CONNECTING. Fixes: d021c344051a ("VSOCK: Introduce VM Sockets") Reviewed-by: Stefano Garzarella Signed-off-by: Eiichi Tsukata Signed-off-by: David S. Miller --- net/vmw_vsock/af_vsock.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 7d851eb3a683..ed0df839c38c 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1322,6 +1322,8 @@ static int vsock_connect(struct socket *sock, struct sockaddr *addr, * non-blocking call. */ err = -EALREADY; + if (flags & O_NONBLOCK) + goto out; break; default: if ((sk->sk_state == TCP_LISTEN) || -- cgit v1.2.3 From af0a51113cb7445435488e5f9514598f2b52d7a7 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Tue, 9 Nov 2021 17:17:34 +0100 Subject: selftests: forwarding: Fix packet matching in mirroring selftests In commit 6de6e46d27ef ("cls_flower: Fix inability to match GRE/IPIP packets"), cls_flower was fixed to match an outer packet of a tunneled packet as would be expected, rather than dissecting to the inner packet and matching on that. This fix uncovered several issues in packet matching in mirroring selftests: - in mirror_gre_bridge_1d_vlan.sh and mirror_gre_vlan_bridge_1q.sh, the vlan_ethtype match is copied around as "ip", even as some of the tests are running over ip6gretap. This is fixed by using an "ipv6" for vlan_ethtype in the ip6gretap tests. - in mirror_gre_changes.sh, a filter to count GRE packets is set up to match TTL of 50. This used to trigger in the offloaded datapath, where the envelope TTL was matched, but not in the software datapath, which considered TTL of the inner packet. Now that both match consistently, all the packets were double-counted. This is fixed by marking the filter as skip_hw, leaving only the SW datapath component active. Fixes: 6de6e46d27ef ("cls_flower: Fix inability to match GRE/IPIP packets") Signed-off-by: Petr Machata Signed-off-by: David S. Miller --- .../selftests/net/forwarding/mirror_gre_bridge_1d_vlan.sh | 2 +- .../testing/selftests/net/forwarding/mirror_gre_changes.sh | 2 +- .../selftests/net/forwarding/mirror_gre_vlan_bridge_1q.sh | 13 +++++++------ tools/testing/selftests/net/forwarding/mirror_lib.sh | 3 ++- tools/testing/selftests/net/forwarding/mirror_vlan.sh | 4 ++-- 5 files changed, 13 insertions(+), 11 deletions(-) diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1d_vlan.sh b/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1d_vlan.sh index f8cda822c1ce..1b27f2b0f196 100755 --- a/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1d_vlan.sh +++ b/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1d_vlan.sh @@ -80,7 +80,7 @@ test_gretap() test_ip6gretap() { - test_vlan_match gt6 'skip_hw vlan_id 555 vlan_ethtype ip' \ + test_vlan_match gt6 'skip_hw vlan_id 555 vlan_ethtype ipv6' \ "mirror to ip6gretap" } diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_changes.sh b/tools/testing/selftests/net/forwarding/mirror_gre_changes.sh index 472bd023e2a5..aff88f78e339 100755 --- a/tools/testing/selftests/net/forwarding/mirror_gre_changes.sh +++ b/tools/testing/selftests/net/forwarding/mirror_gre_changes.sh @@ -74,7 +74,7 @@ test_span_gre_ttl() mirror_install $swp1 ingress $tundev "matchall $tcflags" tc filter add dev $h3 ingress pref 77 prot $prot \ - flower ip_ttl 50 action pass + flower skip_hw ip_ttl 50 action pass mirror_test v$h1 192.0.2.1 192.0.2.2 $h3 77 0 diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_vlan_bridge_1q.sh b/tools/testing/selftests/net/forwarding/mirror_gre_vlan_bridge_1q.sh index 880e3ab9d088..c8a9b5bd841f 100755 --- a/tools/testing/selftests/net/forwarding/mirror_gre_vlan_bridge_1q.sh +++ b/tools/testing/selftests/net/forwarding/mirror_gre_vlan_bridge_1q.sh @@ -141,7 +141,7 @@ test_gretap() test_ip6gretap() { - test_vlan_match gt6 'skip_hw vlan_id 555 vlan_ethtype ip' \ + test_vlan_match gt6 'skip_hw vlan_id 555 vlan_ethtype ipv6' \ "mirror to ip6gretap" } @@ -218,6 +218,7 @@ test_ip6gretap_forbidden_egress() test_span_gre_untagged_egress() { local tundev=$1; shift + local ul_proto=$1; shift local what=$1; shift RET=0 @@ -225,7 +226,7 @@ test_span_gre_untagged_egress() mirror_install $swp1 ingress $tundev "matchall $tcflags" quick_test_span_gre_dir $tundev ingress - quick_test_span_vlan_dir $h3 555 ingress + quick_test_span_vlan_dir $h3 555 ingress "$ul_proto" h3_addr_add_del del $h3.555 bridge vlan add dev $swp3 vid 555 pvid untagged @@ -233,7 +234,7 @@ test_span_gre_untagged_egress() sleep 5 quick_test_span_gre_dir $tundev ingress - fail_test_span_vlan_dir $h3 555 ingress + fail_test_span_vlan_dir $h3 555 ingress "$ul_proto" h3_addr_add_del del $h3 bridge vlan add dev $swp3 vid 555 @@ -241,7 +242,7 @@ test_span_gre_untagged_egress() sleep 5 quick_test_span_gre_dir $tundev ingress - quick_test_span_vlan_dir $h3 555 ingress + quick_test_span_vlan_dir $h3 555 ingress "$ul_proto" mirror_uninstall $swp1 ingress @@ -250,12 +251,12 @@ test_span_gre_untagged_egress() test_gretap_untagged_egress() { - test_span_gre_untagged_egress gt4 "mirror to gretap" + test_span_gre_untagged_egress gt4 ip "mirror to gretap" } test_ip6gretap_untagged_egress() { - test_span_gre_untagged_egress gt6 "mirror to ip6gretap" + test_span_gre_untagged_egress gt6 ipv6 "mirror to ip6gretap" } test_span_gre_fdb_roaming() diff --git a/tools/testing/selftests/net/forwarding/mirror_lib.sh b/tools/testing/selftests/net/forwarding/mirror_lib.sh index 6406cd76a19d..3e8ebeff3019 100644 --- a/tools/testing/selftests/net/forwarding/mirror_lib.sh +++ b/tools/testing/selftests/net/forwarding/mirror_lib.sh @@ -115,13 +115,14 @@ do_test_span_vlan_dir_ips() local dev=$1; shift local vid=$1; shift local direction=$1; shift + local ul_proto=$1; shift local ip1=$1; shift local ip2=$1; shift # Install the capture as skip_hw to avoid double-counting of packets. # The traffic is meant for local box anyway, so will be trapped to # kernel. - vlan_capture_install $dev "skip_hw vlan_id $vid vlan_ethtype ip" + vlan_capture_install $dev "skip_hw vlan_id $vid vlan_ethtype $ul_proto" mirror_test v$h1 $ip1 $ip2 $dev 100 $expect mirror_test v$h2 $ip2 $ip1 $dev 100 $expect vlan_capture_uninstall $dev diff --git a/tools/testing/selftests/net/forwarding/mirror_vlan.sh b/tools/testing/selftests/net/forwarding/mirror_vlan.sh index 9ab2ce77b332..0b44e148235e 100755 --- a/tools/testing/selftests/net/forwarding/mirror_vlan.sh +++ b/tools/testing/selftests/net/forwarding/mirror_vlan.sh @@ -85,9 +85,9 @@ test_tagged_vlan_dir() RET=0 mirror_install $swp1 $direction $swp3.555 "matchall $tcflags" - do_test_span_vlan_dir_ips 10 "$h3.555" 111 "$direction" \ + do_test_span_vlan_dir_ips 10 "$h3.555" 111 "$direction" ip \ 192.0.2.17 192.0.2.18 - do_test_span_vlan_dir_ips 0 "$h3.555" 555 "$direction" \ + do_test_span_vlan_dir_ips 0 "$h3.555" 555 "$direction" ip \ 192.0.2.17 192.0.2.18 mirror_uninstall $swp1 $direction -- cgit v1.2.3 From 68eabc348148ae051631e8dab13c3b1a85c82896 Mon Sep 17 00:00:00 2001 From: Aleksander Jan Bajkowski Date: Tue, 9 Nov 2021 23:23:54 +0100 Subject: net: ethernet: lantiq_etop: Fix compilation error This fixes the error detected when compiling the driver. Fixes: 14d4e308e0aa ("net: lantiq: configure the burst length in ethernet drivers") Reported-by: kernel test robot Signed-off-by: Aleksander Jan Bajkowski Signed-off-by: David S. Miller --- drivers/net/ethernet/lantiq_etop.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/lantiq_etop.c b/drivers/net/ethernet/lantiq_etop.c index 2258e3f19161..6433c909c6b2 100644 --- a/drivers/net/ethernet/lantiq_etop.c +++ b/drivers/net/ethernet/lantiq_etop.c @@ -262,7 +262,7 @@ ltq_etop_hw_init(struct net_device *dev) /* enable crc generation */ ltq_etop_w32(PPE32_CGEN, LQ_PPE32_ENET_MAC_CFG); - ltq_dma_init_port(DMA_PORT_ETOP, priv->tx_burst_len, rx_burst_len); + ltq_dma_init_port(DMA_PORT_ETOP, priv->tx_burst_len, priv->rx_burst_len); for (i = 0; i < MAX_DMA_CHAN; i++) { int irq = LTQ_DMA_CH0_INT + i; -- cgit v1.2.3 From 721111b1b29c67fd18ac2f69b3a48c06ba996762 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 10 Nov 2021 11:11:09 +0300 Subject: gve: fix unmatched u64_stats_update_end() The u64_stats_update_end() call is supposed to be inside the curly braces so it pairs with the u64_stats_update_begin(). Fixes: 37149e9374bf ("gve: Implement packet continuation for RX.") Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- drivers/net/ethernet/google/gve/gve_rx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/google/gve/gve_rx.c b/drivers/net/ethernet/google/gve/gve_rx.c index c8500babbd1d..3d04b5aff331 100644 --- a/drivers/net/ethernet/google/gve/gve_rx.c +++ b/drivers/net/ethernet/google/gve/gve_rx.c @@ -500,7 +500,8 @@ static struct sk_buff *gve_rx_skb(struct gve_priv *priv, struct gve_rx_ring *rx, rx->rx_copied_pkt++; rx->rx_frag_copy_cnt++; rx->rx_copybreak_pkt++; - } u64_stats_update_end(&rx->statss); + u64_stats_update_end(&rx->statss); + } } else { if (rx->data.raw_addressing) { int recycle = gve_rx_can_recycle_buffer(page_info); -- cgit v1.2.3 From c7ebe23cee350fb187ee00ff445b01e11de0bfe9 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 10 Nov 2021 11:07:06 +0300 Subject: net/mlx5: Lag, fix a potential Oops with mlx5_lag_create_definer() There is a minus character missing from ERR_PTR(ENOMEM) so if this allocation fails it will lead to an Oops in the caller. Fixes: dc48516ec7d3 ("net/mlx5: Lag, add support to create definers for LAG") Signed-off-by: Dan Carpenter Reviewed-by: Leon Romanovsky Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.c index adc836b3d857..ad63dd45c8fb 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.c @@ -289,7 +289,7 @@ mlx5_lag_create_definer(struct mlx5_lag *ldev, enum netdev_lag_hash hash, lag_definer = kzalloc(sizeof(*lag_definer), GFP_KERNEL); if (!lag_definer) - return ERR_PTR(ENOMEM); + return ERR_PTR(-ENOMEM); match_definer_mask = kvzalloc(MLX5_FLD_SZ_BYTES(match_definer, match_mask), -- cgit v1.2.3 From e5d5aadcf3cd59949316df49c27cb21788d7efe4 Mon Sep 17 00:00:00 2001 From: Dust Li Date: Wed, 10 Nov 2021 15:02:34 +0800 Subject: net/smc: fix sk_refcnt underflow on linkdown and fallback We got the following WARNING when running ab/nginx test with RDMA link flapping (up-down-up). The reason is when smc_sock fallback and at linkdown happens simultaneously, we may got the following situation: __smc_lgr_terminate() --> smc_conn_kill() --> smc_close_active_abort() smc_sock->sk_state = SMC_CLOSED sock_put(smc_sock) smc_sock was set to SMC_CLOSED and sock_put() been called when terminate the link group. But later application call close() on the socket, then we got: __smc_release(): if (smc_sock->fallback) smc_sock->sk_state = SMC_CLOSED sock_put(smc_sock) Again we set the smc_sock to CLOSED through it's already in CLOSED state, and double put the refcnt, so the following warning happens: refcount_t: underflow; use-after-free. WARNING: CPU: 5 PID: 860 at lib/refcount.c:28 refcount_warn_saturate+0x8d/0xf0 Modules linked in: CPU: 5 PID: 860 Comm: nginx Not tainted 5.10.46+ #403 Hardware name: Alibaba Cloud Alibaba Cloud ECS, BIOS 8c24b4c 04/01/2014 RIP: 0010:refcount_warn_saturate+0x8d/0xf0 Code: 05 5c 1e b5 01 01 e8 52 25 bc ff 0f 0b c3 80 3d 4f 1e b5 01 00 75 ad 48 RSP: 0018:ffffc90000527e50 EFLAGS: 00010286 RAX: 0000000000000026 RBX: ffff8881300df2c0 RCX: 0000000000000027 RDX: 0000000000000000 RSI: ffff88813bd58040 RDI: ffff88813bd58048 RBP: 0000000000000000 R08: 0000000000000003 R09: 0000000000000001 R10: ffff8881300df2c0 R11: ffffc90000527c78 R12: ffff8881300df340 R13: ffff8881300df930 R14: ffff88810b3dad80 R15: ffff8881300df4f8 FS: 00007f739de8fb80(0000) GS:ffff88813bd40000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000000000a01b008 CR3: 0000000111b64003 CR4: 00000000003706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: smc_release+0x353/0x3f0 __sock_release+0x3d/0xb0 sock_close+0x11/0x20 __fput+0x93/0x230 task_work_run+0x65/0xa0 exit_to_user_mode_prepare+0xf9/0x100 syscall_exit_to_user_mode+0x27/0x190 entry_SYSCALL_64_after_hwframe+0x44/0xa9 This patch adds check in __smc_release() to make sure we won't do an extra sock_put() and set the socket to CLOSED when its already in CLOSED state. Fixes: 51f1de79ad8e (net/smc: replace sock_put worker by socket refcounting) Signed-off-by: Dust Li Reviewed-by: Tony Lu Signed-off-by: Dust Li Acked-by: Karsten Graul Signed-off-by: David S. Miller --- net/smc/af_smc.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 0cf7ed2f5d41..59284da9116d 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -149,14 +149,18 @@ static int __smc_release(struct smc_sock *smc) sock_set_flag(sk, SOCK_DEAD); sk->sk_shutdown |= SHUTDOWN_MASK; } else { - if (sk->sk_state != SMC_LISTEN && sk->sk_state != SMC_INIT) - sock_put(sk); /* passive closing */ - if (sk->sk_state == SMC_LISTEN) { - /* wake up clcsock accept */ - rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR); + if (sk->sk_state != SMC_CLOSED) { + if (sk->sk_state != SMC_LISTEN && + sk->sk_state != SMC_INIT) + sock_put(sk); /* passive closing */ + if (sk->sk_state == SMC_LISTEN) { + /* wake up clcsock accept */ + rc = kernel_sock_shutdown(smc->clcsock, + SHUT_RDWR); + } + sk->sk_state = SMC_CLOSED; + sk->sk_state_change(sk); } - sk->sk_state = SMC_CLOSED; - sk->sk_state_change(sk); smc_restore_fallback_changes(smc); } -- cgit v1.2.3 From 0315a075f1343966ea2d9a085666a88a69ea6a3d Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Wed, 10 Nov 2021 20:56:05 +0100 Subject: net: fix premature exit from NAPI state polling in napi_disable() Commit 719c57197010 ("net: make napi_disable() symmetric with enable") accidentally introduced a bug sometimes leading to a kernel BUG when bringing an iface up/down under heavy traffic load. Prior to this commit, napi_disable() was polling n->state until none of (NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC) is set and then always flip them. Now there's a possibility to get away with the NAPIF_STATE_SCHE unset as 'continue' drops us to the cmpxchg() call with an uninitialized variable, rather than straight to another round of the state check. Error path looks like: napi_disable(): unsigned long val, new; /* new is uninitialized */ do { val = READ_ONCE(n->state); /* NAPIF_STATE_NPSVC and/or NAPIF_STATE_SCHED is set */ if (val & (NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC)) { /* true */ usleep_range(20, 200); continue; /* go straight to the condition check */ } new = val | <...> } while (cmpxchg(&n->state, val, new) != val); /* state == val, cmpxchg() writes garbage */ napi_enable(): do { val = READ_ONCE(n->state); BUG_ON(!test_bit(NAPI_STATE_SCHED, &val)); /* 50/50 boom */ <...> while the typical BUG splat is like: [ 172.652461] ------------[ cut here ]------------ [ 172.652462] kernel BUG at net/core/dev.c:6937! [ 172.656914] invalid opcode: 0000 [#1] PREEMPT SMP PTI [ 172.661966] CPU: 36 PID: 2829 Comm: xdp_redirect_cp Tainted: G I 5.15.0 #42 [ 172.670222] Hardware name: Intel Corporation S2600WFT/S2600WFT, BIOS SE5C620.86B.02.01.0014.082620210524 08/26/2021 [ 172.680646] RIP: 0010:napi_enable+0x5a/0xd0 [ 172.684832] Code: 07 49 81 cc 00 01 00 00 4c 89 e2 48 89 d8 80 e6 fb f0 48 0f b1 55 10 48 39 c3 74 10 48 8b 5d 10 f6 c7 04 75 3d f6 c3 01 75 b4 <0f> 0b 5b 5d 41 5c c3 65 ff 05 b8 e5 61 53 48 c7 c6 c0 f3 34 ad 48 [ 172.703578] RSP: 0018:ffffa3c9497477a8 EFLAGS: 00010246 [ 172.708803] RAX: ffffa3c96615a014 RBX: 0000000000000000 RCX: ffff8a4b575301a0 < snip > [ 172.782403] Call Trace: [ 172.784857] [ 172.786963] ice_up_complete+0x6f/0x210 [ice] [ 172.791349] ice_xdp+0x136/0x320 [ice] [ 172.795108] ? ice_change_mtu+0x180/0x180 [ice] [ 172.799648] dev_xdp_install+0x61/0xe0 [ 172.803401] dev_xdp_attach+0x1e0/0x550 [ 172.807240] dev_change_xdp_fd+0x1e6/0x220 [ 172.811338] do_setlink+0xee8/0x1010 [ 172.814917] rtnl_setlink+0xe5/0x170 [ 172.818499] ? bpf_lsm_binder_set_context_mgr+0x10/0x10 [ 172.823732] ? security_capable+0x36/0x50 < snip > Fix this by replacing 'do { } while (cmpxchg())' with an "infinite" for-loop with an explicit break. From v1 [0]: - just use a for-loop to simplify both the fix and the existing code (Eric). [0] https://lore.kernel.org/netdev/20211110191126.1214-1-alexandr.lobakin@intel.com Fixes: 719c57197010 ("net: make napi_disable() symmetric with enable") Suggested-by: Eric Dumazet # for-loop Signed-off-by: Alexander Lobakin Reviewed-by: Jesse Brandeburg Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20211110195605.1304-1-alexandr.lobakin@intel.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index edeb811c454e..15ac064b5562 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6928,7 +6928,7 @@ void napi_disable(struct napi_struct *n) might_sleep(); set_bit(NAPI_STATE_DISABLE, &n->state); - do { + for ( ; ; ) { val = READ_ONCE(n->state); if (val & (NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC)) { usleep_range(20, 200); @@ -6937,7 +6937,10 @@ void napi_disable(struct napi_struct *n) new = val | NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC; new &= ~(NAPIF_STATE_THREADED | NAPIF_STATE_PREFER_BUSY_POLL); - } while (cmpxchg(&n->state, val, new) != val); + + if (cmpxchg(&n->state, val, new) == val) + break; + } hrtimer_cancel(&n->timer); -- cgit v1.2.3 From 4ca110bf8d9b31a60f8f8ff6706ea147d38ad97c Mon Sep 17 00:00:00 2001 From: Rahul Lakkireddy Date: Thu, 11 Nov 2021 15:55:16 +0530 Subject: cxgb4: fix eeprom len when diagnostics not implemented Ensure diagnostics monitoring support is implemented for the SFF 8472 compliant port module and set the correct length for ethtool port module eeprom read. Fixes: f56ec6766dcf ("cxgb4: Add support for ethtool i2c dump") Signed-off-by: Manoj Malviya Signed-off-by: Rahul Lakkireddy Signed-off-by: David S. Miller --- drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c | 7 +++++-- drivers/net/ethernet/chelsio/cxgb4/t4_hw.h | 2 ++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c index 5903bdb78916..129352bbe114 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c @@ -2015,12 +2015,15 @@ static int cxgb4_get_module_info(struct net_device *dev, if (ret) return ret; - if (!sff8472_comp || (sff_diag_type & 4)) { + if (!sff8472_comp || (sff_diag_type & SFP_DIAG_ADDRMODE)) { modinfo->type = ETH_MODULE_SFF_8079; modinfo->eeprom_len = ETH_MODULE_SFF_8079_LEN; } else { modinfo->type = ETH_MODULE_SFF_8472; - modinfo->eeprom_len = ETH_MODULE_SFF_8472_LEN; + if (sff_diag_type & SFP_DIAG_IMPLEMENTED) + modinfo->eeprom_len = ETH_MODULE_SFF_8472_LEN; + else + modinfo->eeprom_len = ETH_MODULE_SFF_8472_LEN / 2; } break; diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.h b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.h index 002fc62ea726..63bc956d2037 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.h +++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.h @@ -293,6 +293,8 @@ enum { #define I2C_PAGE_SIZE 0x100 #define SFP_DIAG_TYPE_ADDR 0x5c #define SFP_DIAG_TYPE_LEN 0x1 +#define SFP_DIAG_ADDRMODE BIT(2) +#define SFP_DIAG_IMPLEMENTED BIT(6) #define SFF_8472_COMP_ADDR 0x5e #define SFF_8472_COMP_LEN 0x1 #define SFF_REV_ADDR 0x1 -- cgit v1.2.3 From 29cd386750412297fd064c01c87bc40a26f24047 Mon Sep 17 00:00:00 2001 From: M Chetan Kumar Date: Wed, 10 Nov 2021 21:50:36 +0530 Subject: net: wwan: iosm: fix compilation warning curr_phase is unused. Removed the dead code. Fixes: 8d9be0634181 ("net: wwan: iosm: transport layer support for fw flashing/cd") Reported-by: kernel test robot Signed-off-by: M Chetan Kumar Reviewed-by: Loic Poulain Signed-off-by: David S. Miller --- drivers/net/wwan/iosm/iosm_ipc_imem_ops.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/wwan/iosm/iosm_ipc_imem_ops.c b/drivers/net/wwan/iosm/iosm_ipc_imem_ops.c index b885a6570235..825e8e5ffb2a 100644 --- a/drivers/net/wwan/iosm/iosm_ipc_imem_ops.c +++ b/drivers/net/wwan/iosm/iosm_ipc_imem_ops.c @@ -394,12 +394,10 @@ void ipc_imem_sys_devlink_close(struct iosm_devlink *ipc_devlink) int boot_check_timeout = BOOT_CHECK_DEFAULT_TIMEOUT; enum ipc_mem_exec_stage exec_stage; struct ipc_mem_channel *channel; - enum ipc_phase curr_phase; int status = 0; u32 tail = 0; channel = ipc_imem->ipc_devlink->devlink_sio.channel; - curr_phase = ipc_imem->phase; /* Increase the total wait time to boot_check_timeout */ do { exec_stage = ipc_mmio_get_exec_stage(ipc_imem->mmio); -- cgit v1.2.3 From d336509cb9d03970911878bb77f0497f64fda061 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 11 Nov 2021 06:57:17 -0500 Subject: selftests/net: udpgso_bench_rx: fix port argument The below commit added optional support for passing a bind address. It configures the sockaddr bind arguments before parsing options and reconfigures on options -b and -4. This broke support for passing port (-p) on its own. Configure sockaddr after parsing all arguments. Fixes: 3327a9c46352 ("selftests: add functionals test for UDP GRO") Reported-by: Eric Dumazet Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- tools/testing/selftests/net/udpgso_bench_rx.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/net/udpgso_bench_rx.c b/tools/testing/selftests/net/udpgso_bench_rx.c index 76a24052f4b4..6a193425c367 100644 --- a/tools/testing/selftests/net/udpgso_bench_rx.c +++ b/tools/testing/selftests/net/udpgso_bench_rx.c @@ -293,19 +293,17 @@ static void usage(const char *filepath) static void parse_opts(int argc, char **argv) { + const char *bind_addr = NULL; int c; - /* bind to any by default */ - setup_sockaddr(PF_INET6, "::", &cfg_bind_addr); while ((c = getopt(argc, argv, "4b:C:Gl:n:p:rR:S:tv")) != -1) { switch (c) { case '4': cfg_family = PF_INET; cfg_alen = sizeof(struct sockaddr_in); - setup_sockaddr(PF_INET, "0.0.0.0", &cfg_bind_addr); break; case 'b': - setup_sockaddr(cfg_family, optarg, &cfg_bind_addr); + bind_addr = optarg; break; case 'C': cfg_connect_timeout_ms = strtoul(optarg, NULL, 0); @@ -341,6 +339,11 @@ static void parse_opts(int argc, char **argv) } } + if (!bind_addr) + bind_addr = cfg_family == PF_INET6 ? "::" : "0.0.0.0"; + + setup_sockaddr(cfg_family, bind_addr, &cfg_bind_addr); + if (optind != argc) usage(argv[0]); -- cgit v1.2.3