From a54d51fb2dfb846aedf3751af501e9688db447f5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 18 Jan 2024 20:17:49 +0000 Subject: udp: fix busy polling Generic sk_busy_loop_end() only looks at sk->sk_receive_queue for presence of packets. Problem is that for UDP sockets after blamed commit, some packets could be present in another queue: udp_sk(sk)->reader_queue In some cases, a busy poller could spin until timeout expiration, even if some packets are available in udp_sk(sk)->reader_queue. v3: - make sk_busy_loop_end() nicer (Willem) v2: - add a READ_ONCE(sk->sk_family) in sk_is_inet() to avoid KCSAN splats. - add a sk_is_inet() check in sk_is_udp() (Willem feedback) - add a sk_is_inet() check in sk_is_tcp(). Fixes: 2276f58ac589 ("udp: use a separate rx queue for packet reception") Signed-off-by: Eric Dumazet Reviewed-by: Paolo Abeni Reviewed-by: Willem de Bruijn Reviewed-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- include/linux/skmsg.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 888a4b217829..e65ec3fd2799 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -505,12 +505,6 @@ static inline bool sk_psock_strp_enabled(struct sk_psock *psock) return !!psock->saved_data_ready; } -static inline bool sk_is_udp(const struct sock *sk) -{ - return sk->sk_type == SOCK_DGRAM && - sk->sk_protocol == IPPROTO_UDP; -} - #if IS_ENABLED(CONFIG_NET_SOCK_MSG) #define BPF_F_STRPARSER (1UL << 1) -- cgit v1.2.3 From 73ae7e1c7644a8c33ba526302a10267cdbc249f8 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Thu, 11 Jan 2024 17:57:44 +0100 Subject: ata: libata-sata: improve sysfs description for ATA_LPM_UNKNOWN Currently, both ATA_LPM_UNKNOWN (0) and ATA_LPM_MAX_POWER (1) displays as "max_performance" in sysfs. This is quite misleading as they are not the same. For ATA_LPM_UNKNOWN, ata_eh_set_lpm() will not be called at all, leaving the configuration in unknown state. For ATA_LPM_MAX_POWER, ata_eh_set_lpm() is called, and setting the policy to ATA_LPM_MAX_POWER. This also matches the description of the SATA_MOBILE_LPM_POLICY Kconfig: 0 => Keep firmware settings 1 => Maximum performance Thus, update the sysfs description for ATA_LPM_UNKNOWN to match reality. While at it, update libata.h to mention that the ascii descriptions are in libata-sata.c and not in libata-scsi.c. Reviewed-by: Damien Le Moal Signed-off-by: Niklas Cassel --- drivers/ata/libata-sata.c | 2 +- include/linux/libata.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/ata/libata-sata.c b/drivers/ata/libata-sata.c index b6656c287175..0fb1934875f2 100644 --- a/drivers/ata/libata-sata.c +++ b/drivers/ata/libata-sata.c @@ -784,7 +784,7 @@ bool sata_lpm_ignore_phy_events(struct ata_link *link) EXPORT_SYMBOL_GPL(sata_lpm_ignore_phy_events); static const char *ata_lpm_policy_names[] = { - [ATA_LPM_UNKNOWN] = "max_performance", + [ATA_LPM_UNKNOWN] = "keep_firmware_settings", [ATA_LPM_MAX_POWER] = "max_performance", [ATA_LPM_MED_POWER] = "medium_power", [ATA_LPM_MED_POWER_WITH_DIPM] = "med_power_with_dipm", diff --git a/include/linux/libata.h b/include/linux/libata.h index 1dbb14daccfa..26d68115afb8 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -471,7 +471,7 @@ enum ata_completion_errors { /* * Link power management policy: If you alter this, you also need to - * alter libata-scsi.c (for the ascii descriptions) + * alter libata-sata.c (for the ascii descriptions) */ enum ata_lpm_policy { ATA_LPM_UNKNOWN, -- cgit v1.2.3 From 41353fbad4f551e82c2792f7e82ac225c79cc710 Mon Sep 17 00:00:00 2001 From: Guixin Liu Date: Thu, 18 Jan 2024 20:51:45 +0800 Subject: nvmet: unify aer type enum The host and target use two definition of aer type, unify them into a single one. Signed-off-by: Guixin Liu Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/target/core.c | 4 ++-- drivers/nvme/target/discovery.c | 2 +- include/linux/nvme.h | 6 ------ 3 files changed, 3 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index d26aa30f8702..fa35e65915f0 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -248,7 +248,7 @@ void nvmet_ns_changed(struct nvmet_subsys *subsys, u32 nsid) nvmet_add_to_changed_ns_log(ctrl, cpu_to_le32(nsid)); if (nvmet_aen_bit_disabled(ctrl, NVME_AEN_BIT_NS_ATTR)) continue; - nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE, + nvmet_add_async_event(ctrl, NVME_AER_NOTICE, NVME_AER_NOTICE_NS_CHANGED, NVME_LOG_CHANGED_NS); } @@ -265,7 +265,7 @@ void nvmet_send_ana_event(struct nvmet_subsys *subsys, continue; if (nvmet_aen_bit_disabled(ctrl, NVME_AEN_BIT_ANA_CHANGE)) continue; - nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE, + nvmet_add_async_event(ctrl, NVME_AER_NOTICE, NVME_AER_NOTICE_ANA, NVME_LOG_ANA); } mutex_unlock(&subsys->lock); diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c index 668d257fa986..68e82ccc0e4e 100644 --- a/drivers/nvme/target/discovery.c +++ b/drivers/nvme/target/discovery.c @@ -21,7 +21,7 @@ static void __nvmet_disc_changed(struct nvmet_port *port, if (nvmet_aen_bit_disabled(ctrl, NVME_AEN_BIT_DISC_CHANGE)) return; - nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE, + nvmet_add_async_event(ctrl, NVME_AER_NOTICE, NVME_AER_NOTICE_DISC_CHANGED, NVME_LOG_DISC); } diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 462c21e0e417..68eff8c86ce3 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -816,12 +816,6 @@ struct nvme_reservation_status_ext { struct nvme_registered_ctrl_ext regctl_eds[]; }; -enum nvme_async_event_type { - NVME_AER_TYPE_ERROR = 0, - NVME_AER_TYPE_SMART = 1, - NVME_AER_TYPE_NOTICE = 2, -}; - /* I/O commands */ enum nvme_opcode { -- cgit v1.2.3 From 25461ce8b3d28528f2c55f5e737e99d2906eda83 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Fri, 15 Dec 2023 19:31:14 -0800 Subject: net/mlx5e: Use the correct lag ports number when creating TISes The cited commit moved the code of mlx5e_create_tises() and changed the loop to create TISes over MLX5_MAX_PORTS constant value, instead of getting the correct lag ports supported by the device, which can cause FW errors on devices with less than MLX5_MAX_PORTS ports. Change that back to mlx5e_get_num_lag_ports(mdev). Also IPoIB interfaces create there own TISes, they don't use the eth TISes, pass a flag to indicate that. This fixes the following errors that might appear in kernel log: mlx5_cmd_out_err:808:(pid 650): CREATE_TIS(0x912) op_mod(0x0) failed, status bad parameter(0x3), syndrome (0x595b5d), err(-22) mlx5e_create_mdev_resources:174:(pid 650): alloc tises failed, -22 Fixes: b25bd37c859f ("net/mlx5: Move TISes from priv to mdev HW resources") Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en_common.c | 21 +++++++++++++-------- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 2 +- .../net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c | 2 +- include/linux/mlx5/driver.h | 1 + 5 files changed, 17 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 0bfe1ca8a364..55c6ace0acd5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -1124,7 +1124,7 @@ static inline bool mlx5_tx_swp_supported(struct mlx5_core_dev *mdev) extern const struct ethtool_ops mlx5e_ethtool_ops; int mlx5e_create_mkey(struct mlx5_core_dev *mdev, u32 pdn, u32 *mkey); -int mlx5e_create_mdev_resources(struct mlx5_core_dev *mdev); +int mlx5e_create_mdev_resources(struct mlx5_core_dev *mdev, bool create_tises); void mlx5e_destroy_mdev_resources(struct mlx5_core_dev *mdev); int mlx5e_refresh_tirs(struct mlx5e_priv *priv, bool enable_uc_lb, bool enable_mc_lb); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_common.c b/drivers/net/ethernet/mellanox/mlx5/core/en_common.c index 67f546683e85..6ed3a32b7e22 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_common.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_common.c @@ -95,7 +95,7 @@ static void mlx5e_destroy_tises(struct mlx5_core_dev *mdev, u32 tisn[MLX5_MAX_PO { int tc, i; - for (i = 0; i < MLX5_MAX_PORTS; i++) + for (i = 0; i < mlx5e_get_num_lag_ports(mdev); i++) for (tc = 0; tc < MLX5_MAX_NUM_TC; tc++) mlx5e_destroy_tis(mdev, tisn[i][tc]); } @@ -110,7 +110,7 @@ static int mlx5e_create_tises(struct mlx5_core_dev *mdev, u32 tisn[MLX5_MAX_PORT int tc, i; int err; - for (i = 0; i < MLX5_MAX_PORTS; i++) { + for (i = 0; i < mlx5e_get_num_lag_ports(mdev); i++) { for (tc = 0; tc < MLX5_MAX_NUM_TC; tc++) { u32 in[MLX5_ST_SZ_DW(create_tis_in)] = {}; void *tisc; @@ -140,7 +140,7 @@ err_close_tises: return err; } -int mlx5e_create_mdev_resources(struct mlx5_core_dev *mdev) +int mlx5e_create_mdev_resources(struct mlx5_core_dev *mdev, bool create_tises) { struct mlx5e_hw_objs *res = &mdev->mlx5e_res.hw_objs; int err; @@ -169,11 +169,15 @@ int mlx5e_create_mdev_resources(struct mlx5_core_dev *mdev) goto err_destroy_mkey; } - err = mlx5e_create_tises(mdev, res->tisn); - if (err) { - mlx5_core_err(mdev, "alloc tises failed, %d\n", err); - goto err_destroy_bfreg; + if (create_tises) { + err = mlx5e_create_tises(mdev, res->tisn); + if (err) { + mlx5_core_err(mdev, "alloc tises failed, %d\n", err); + goto err_destroy_bfreg; + } + res->tisn_valid = true; } + INIT_LIST_HEAD(&res->td.tirs_list); mutex_init(&res->td.list_lock); @@ -203,7 +207,8 @@ void mlx5e_destroy_mdev_resources(struct mlx5_core_dev *mdev) mlx5_crypto_dek_cleanup(mdev->mlx5e_res.dek_priv); mdev->mlx5e_res.dek_priv = NULL; - mlx5e_destroy_tises(mdev, res->tisn); + if (res->tisn_valid) + mlx5e_destroy_tises(mdev, res->tisn); mlx5_free_bfreg(mdev, &res->bfreg); mlx5_core_destroy_mkey(mdev, res->mkey); mlx5_core_dealloc_transport_domain(mdev, res->td.tdn); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index b5f1c4ca38ba..c8e8f512803e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -5992,7 +5992,7 @@ static int mlx5e_resume(struct auxiliary_device *adev) if (netif_device_present(netdev)) return 0; - err = mlx5e_create_mdev_resources(mdev); + err = mlx5e_create_mdev_resources(mdev, true); if (err) return err; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c index 58845121954c..d77be1b4dd9c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c @@ -783,7 +783,7 @@ static int mlx5_rdma_setup_rn(struct ib_device *ibdev, u32 port_num, } /* This should only be called once per mdev */ - err = mlx5e_create_mdev_resources(mdev); + err = mlx5e_create_mdev_resources(mdev, false); if (err) goto destroy_ht; } diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 8c55ff351e5f..41f03b352401 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -681,6 +681,7 @@ struct mlx5e_resources { struct mlx5_sq_bfreg bfreg; #define MLX5_MAX_NUM_TC 8 u32 tisn[MLX5_MAX_PORTS][MLX5_MAX_NUM_TC]; + bool tisn_valid; } hw_objs; struct net_device *uplink_netdev; struct mutex uplink_netdev_lock; -- cgit v1.2.3 From cfbc3608a8c69b48bf238bd68f768192f0238e0d Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Tue, 19 Dec 2023 14:46:20 +0200 Subject: net/mlx5: Fix query of sd_group field The sd_group field moved in the HW spec from the MPIR register to the vport context. Align the query accordingly. Fixes: f5e956329960 ("net/mlx5: Expose Management PCIe Index Register (MPIR)") Signed-off-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/vport.c | 21 +++++++++++++++++++++ include/linux/mlx5/mlx5_ifc.h | 10 +++++++--- include/linux/mlx5/vport.h | 1 + 3 files changed, 29 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c index 21753f327868..1005bb6935b6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c @@ -440,6 +440,27 @@ out: } EXPORT_SYMBOL_GPL(mlx5_query_nic_vport_system_image_guid); +int mlx5_query_nic_vport_sd_group(struct mlx5_core_dev *mdev, u8 *sd_group) +{ + int outlen = MLX5_ST_SZ_BYTES(query_nic_vport_context_out); + u32 *out; + int err; + + out = kvzalloc(outlen, GFP_KERNEL); + if (!out) + return -ENOMEM; + + err = mlx5_query_nic_vport_context(mdev, 0, out); + if (err) + goto out; + + *sd_group = MLX5_GET(query_nic_vport_context_out, out, + nic_vport_context.sd_group); +out: + kvfree(out); + return err; +} + int mlx5_query_nic_vport_node_guid(struct mlx5_core_dev *mdev, u64 *node_guid) { u32 *out; diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index bf5320b28b8b..37230253f9f1 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -4036,8 +4036,13 @@ struct mlx5_ifc_nic_vport_context_bits { u8 affiliation_criteria[0x4]; u8 affiliated_vhca_id[0x10]; - u8 reserved_at_60[0xd0]; + u8 reserved_at_60[0xa0]; + u8 reserved_at_100[0x1]; + u8 sd_group[0x3]; + u8 reserved_at_104[0x1c]; + + u8 reserved_at_120[0x10]; u8 mtu[0x10]; u8 system_image_guid[0x40]; @@ -10122,8 +10127,7 @@ struct mlx5_ifc_mpir_reg_bits { u8 reserved_at_20[0x20]; u8 local_port[0x8]; - u8 reserved_at_28[0x15]; - u8 sd_group[0x3]; + u8 reserved_at_28[0x18]; u8 reserved_at_60[0x20]; }; diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h index fbb9bf447889..c36cc6d82926 100644 --- a/include/linux/mlx5/vport.h +++ b/include/linux/mlx5/vport.h @@ -72,6 +72,7 @@ int mlx5_query_nic_vport_mtu(struct mlx5_core_dev *mdev, u16 *mtu); int mlx5_modify_nic_vport_mtu(struct mlx5_core_dev *mdev, u16 mtu); int mlx5_query_nic_vport_system_image_guid(struct mlx5_core_dev *mdev, u64 *system_image_guid); +int mlx5_query_nic_vport_sd_group(struct mlx5_core_dev *mdev, u8 *sd_group); int mlx5_query_nic_vport_node_guid(struct mlx5_core_dev *mdev, u64 *node_guid); int mlx5_modify_nic_vport_node_guid(struct mlx5_core_dev *mdev, u16 vport, u64 node_guid); -- cgit v1.2.3 From ec7cc38ef9f83553102e84c82536971a81630739 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Sat, 30 Dec 2023 22:40:37 +0200 Subject: net/mlx5: Bridge, fix multicast packets sent to uplink To enable multicast packets which are offloaded in bridge multicast offload mode to be sent also to uplink, FTE bit uplink_hairpin_en should be set. Add this bit to FTE for the bridge multicast offload rules. Fixes: 18c2916cee12 ("net/mlx5: Bridge, snoop igmp/mld packets") Signed-off-by: Moshe Shemesh Reviewed-by: Gal Pressman Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/esw/bridge_mcast.c | 3 +++ drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c | 2 ++ include/linux/mlx5/fs.h | 1 + include/linux/mlx5/mlx5_ifc.h | 2 +- 4 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge_mcast.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge_mcast.c index a7ed87e9d842..22dd30cf8033 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge_mcast.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge_mcast.c @@ -83,6 +83,7 @@ mlx5_esw_bridge_mdb_flow_create(u16 esw_owner_vhca_id, struct mlx5_esw_bridge_md i++; } + rule_spec->flow_context.flags |= FLOW_CONTEXT_UPLINK_HAIRPIN_EN; rule_spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; dmac_v = MLX5_ADDR_OF(fte_match_param, rule_spec->match_value, outer_headers.dmac_47_16); ether_addr_copy(dmac_v, entry->key.addr); @@ -587,6 +588,7 @@ mlx5_esw_bridge_mcast_vlan_flow_create(u16 vlan_proto, struct mlx5_esw_bridge_po if (!rule_spec) return ERR_PTR(-ENOMEM); + rule_spec->flow_context.flags |= FLOW_CONTEXT_UPLINK_HAIRPIN_EN; rule_spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT; @@ -662,6 +664,7 @@ mlx5_esw_bridge_mcast_fwd_flow_create(struct mlx5_esw_bridge_port *port) dest.vport.flags = MLX5_FLOW_DEST_VPORT_VHCA_ID; dest.vport.vhca_id = port->esw_owner_vhca_id; } + rule_spec->flow_context.flags |= FLOW_CONTEXT_UPLINK_HAIRPIN_EN; handle = mlx5_add_flow_rules(port->mcast.ft, rule_spec, &flow_act, &dest, 1); kvfree(rule_spec); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c index 1616a6144f7b..9b8599c200e2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c @@ -566,6 +566,8 @@ static int mlx5_cmd_set_fte(struct mlx5_core_dev *dev, fte->flow_context.flow_tag); MLX5_SET(flow_context, in_flow_context, flow_source, fte->flow_context.flow_source); + MLX5_SET(flow_context, in_flow_context, uplink_hairpin_en, + !!(fte->flow_context.flags & FLOW_CONTEXT_UPLINK_HAIRPIN_EN)); MLX5_SET(flow_context, in_flow_context, extended_destination, extended_dest); diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 6f7725238abc..3fb428ce7d1c 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -132,6 +132,7 @@ struct mlx5_flow_handle; enum { FLOW_CONTEXT_HAS_TAG = BIT(0), + FLOW_CONTEXT_UPLINK_HAIRPIN_EN = BIT(1), }; struct mlx5_flow_context { diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 37230253f9f1..c726f90ab752 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -3576,7 +3576,7 @@ struct mlx5_ifc_flow_context_bits { u8 action[0x10]; u8 extended_destination[0x1]; - u8 reserved_at_81[0x1]; + u8 uplink_hairpin_en[0x1]; u8 flow_source[0x2]; u8 encrypt_decrypt_type[0x4]; u8 destination_list_size[0x18]; -- cgit v1.2.3 From 56062d60f117dccfb5281869e0ab61e090baf864 Mon Sep 17 00:00:00 2001 From: Richard Palethorpe Date: Wed, 10 Jan 2024 15:01:22 +0200 Subject: x86/entry/ia32: Ensure s32 is sign extended to s64 Presently ia32 registers stored in ptregs are unconditionally cast to unsigned int by the ia32 stub. They are then cast to long when passed to __se_sys*, but will not be sign extended. This takes the sign of the syscall argument into account in the ia32 stub. It still casts to unsigned int to avoid implementation specific behavior. However then casts to int or unsigned int as necessary. So that the following cast to long sign extends the value. This fixes the io_pgetevents02 LTP test when compiled with -m32. Presently the systemcall io_pgetevents_time64() unexpectedly accepts -1 for the maximum number of events. It doesn't appear other systemcalls with signed arguments are effected because they all have compat variants defined and wired up. Fixes: ebeb8c82ffaf ("syscalls/x86: Use 'struct pt_regs' based syscall calling for IA32_EMULATION and x32") Suggested-by: Arnd Bergmann Signed-off-by: Richard Palethorpe Signed-off-by: Nikolay Borisov Signed-off-by: Thomas Gleixner Reviewed-by: Arnd Bergmann Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240110130122.3836513-1-nik.borisov@suse.com Link: https://lore.kernel.org/ltp/20210921130127.24131-1-rpalethorpe@suse.com/ --- arch/x86/include/asm/syscall_wrapper.h | 25 +++++++++++++++++++++---- include/linux/syscalls.h | 1 + 2 files changed, 22 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/syscall_wrapper.h b/arch/x86/include/asm/syscall_wrapper.h index 21f9407be5d3..7e88705e907f 100644 --- a/arch/x86/include/asm/syscall_wrapper.h +++ b/arch/x86/include/asm/syscall_wrapper.h @@ -58,12 +58,29 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs); ,,regs->di,,regs->si,,regs->dx \ ,,regs->r10,,regs->r8,,regs->r9) \ + +/* SYSCALL_PT_ARGS is Adapted from s390x */ +#define SYSCALL_PT_ARG6(m, t1, t2, t3, t4, t5, t6) \ + SYSCALL_PT_ARG5(m, t1, t2, t3, t4, t5), m(t6, (regs->bp)) +#define SYSCALL_PT_ARG5(m, t1, t2, t3, t4, t5) \ + SYSCALL_PT_ARG4(m, t1, t2, t3, t4), m(t5, (regs->di)) +#define SYSCALL_PT_ARG4(m, t1, t2, t3, t4) \ + SYSCALL_PT_ARG3(m, t1, t2, t3), m(t4, (regs->si)) +#define SYSCALL_PT_ARG3(m, t1, t2, t3) \ + SYSCALL_PT_ARG2(m, t1, t2), m(t3, (regs->dx)) +#define SYSCALL_PT_ARG2(m, t1, t2) \ + SYSCALL_PT_ARG1(m, t1), m(t2, (regs->cx)) +#define SYSCALL_PT_ARG1(m, t1) m(t1, (regs->bx)) +#define SYSCALL_PT_ARGS(x, ...) SYSCALL_PT_ARG##x(__VA_ARGS__) + +#define __SC_COMPAT_CAST(t, a) \ + (__typeof(__builtin_choose_expr(__TYPE_IS_L(t), 0, 0U))) \ + (unsigned int)a + /* Mapping of registers to parameters for syscalls on i386 */ #define SC_IA32_REGS_TO_ARGS(x, ...) \ - __MAP(x,__SC_ARGS \ - ,,(unsigned int)regs->bx,,(unsigned int)regs->cx \ - ,,(unsigned int)regs->dx,,(unsigned int)regs->si \ - ,,(unsigned int)regs->di,,(unsigned int)regs->bp) + SYSCALL_PT_ARGS(x, __SC_COMPAT_CAST, \ + __MAP(x, __SC_TYPE, __VA_ARGS__)) \ #define __SYS_STUB0(abi, name) \ long __##abi##_##name(const struct pt_regs *regs); \ diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index cdba4d0c6d4a..77eb9b0e7685 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -128,6 +128,7 @@ struct mnt_id_req; #define __TYPE_IS_LL(t) (__TYPE_AS(t, 0LL) || __TYPE_AS(t, 0ULL)) #define __SC_LONG(t, a) __typeof(__builtin_choose_expr(__TYPE_IS_LL(t), 0LL, 0L)) a #define __SC_CAST(t, a) (__force t) a +#define __SC_TYPE(t, a) t #define __SC_ARGS(t, a) a #define __SC_TEST(t, a) (void)BUILD_BUG_ON_ZERO(!__TYPE_IS_LL(t) && sizeof(t) > sizeof(long)) -- cgit v1.2.3 From 2f8c7c3715f2c6fb51a4ecc0905c04dd78a3da29 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 24 Jan 2024 13:24:24 +0000 Subject: spi: Raise limit on number of chip selects As reported by Guenter the limit we've got on the number of chip selects is set too low for some systems, raise the limit. We should really remove the hard coded limit but this is needed as a fix so let's do the simple thing and raise the limit for now. Fixes: 4d8ff6b0991d ("spi: Add multi-cs memories support in SPI core") Reported-by: Guenter Roeck Suggested-by: Guenter Roeck Signed-off-by: Mark Brown Link: https://msgid.link/r/20240124-spi-multi-cs-max-v2-1-df6fc5ab1abc@kernel.org Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 471fe2ff9066..600fbd5daf68 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -21,7 +21,7 @@ #include /* Max no. of CS supported per spi device */ -#define SPI_CS_CNT_MAX 4 +#define SPI_CS_CNT_MAX 16 struct dma_chan; struct software_node; -- cgit v1.2.3 From 90383cc07895183c75a0db2460301c2ffd912359 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 24 Jan 2024 11:15:33 -0800 Subject: exec: Distinguish in_execve from in_exec Just to help distinguish the fs->in_exec flag from the current->in_execve flag, add comments in check_unsafe_exec() and copy_fs() for more context. Also note that in_execve is only used by TOMOYO now. Cc: Kentaro Takeda Cc: Tetsuo Handa Cc: Alexander Viro Cc: Christian Brauner Cc: Jan Kara Cc: Eric Biederman Cc: Andrew Morton Cc: Sebastian Andrzej Siewior Cc: linux-fsdevel@vger.kernel.org Cc: linux-mm@kvack.org Signed-off-by: Kees Cook --- fs/exec.c | 1 + include/linux/sched.h | 2 +- kernel/fork.c | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/exec.c b/fs/exec.c index 39d773021fff..d179abb78a1c 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1633,6 +1633,7 @@ static void check_unsafe_exec(struct linux_binprm *bprm) } rcu_read_unlock(); + /* "users" and "in_exec" locked for copy_fs() */ if (p->fs->users > n_fs) bprm->unsafe |= LSM_UNSAFE_SHARE; else diff --git a/include/linux/sched.h b/include/linux/sched.h index cdb8ea53c365..ffe8f618ab86 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -920,7 +920,7 @@ struct task_struct { unsigned sched_rt_mutex:1; #endif - /* Bit to tell LSMs we're in execve(): */ + /* Bit to tell TOMOYO we're in execve(): */ unsigned in_execve:1; unsigned in_iowait:1; #ifndef TIF_RESTORE_SIGMASK diff --git a/kernel/fork.c b/kernel/fork.c index 47ff3b35352e..0d944e92a43f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1748,6 +1748,7 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk) if (clone_flags & CLONE_FS) { /* tsk->fs is already what we want */ spin_lock(&fs->lock); + /* "users" and "in_exec" locked for check_unsafe_exec() */ if (fs->in_exec) { spin_unlock(&fs->lock); return -EAGAIN; -- cgit v1.2.3 From c4608d1bf7c6536d1a3d233eb21e50678681564e Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Wed, 20 Dec 2023 22:59:43 -0800 Subject: mm: mmap: map MAP_STACK to VM_NOHUGEPAGE commit efa7df3e3bb5 ("mm: align larger anonymous mappings on THP boundaries") incured regression for stress-ng pthread benchmark [1]. It is because THP get allocated to pthread's stack area much more possible than before. Pthread's stack area is allocated by mmap without VM_GROWSDOWN or VM_GROWSUP flag, so kernel can't tell whether it is a stack area or not. The MAP_STACK flag is used to mark the stack area, but it is a no-op on Linux. Mapping MAP_STACK to VM_NOHUGEPAGE to prevent from allocating THP for such stack area. With this change the stack area looks like: fffd18e10000-fffd19610000 rw-p 00000000 00:00 0 Size: 8192 kB KernelPageSize: 4 kB MMUPageSize: 4 kB Rss: 12 kB Pss: 12 kB Pss_Dirty: 12 kB Shared_Clean: 0 kB Shared_Dirty: 0 kB Private_Clean: 0 kB Private_Dirty: 12 kB Referenced: 12 kB Anonymous: 12 kB KSM: 0 kB LazyFree: 0 kB AnonHugePages: 0 kB ShmemPmdMapped: 0 kB FilePmdMapped: 0 kB Shared_Hugetlb: 0 kB Private_Hugetlb: 0 kB Swap: 0 kB SwapPss: 0 kB Locked: 0 kB THPeligible: 0 VmFlags: rd wr mr mw me ac nh The "nh" flag is set. [1] https://lore.kernel.org/linux-mm/202312192310.56367035-oliver.sang@intel.com/ Link: https://lkml.kernel.org/r/20231221065943.2803551-2-shy828301@gmail.com Fixes: efa7df3e3bb5 ("mm: align larger anonymous mappings on THP boundaries") Signed-off-by: Yang Shi Reported-by: kernel test robot Tested-by: Oliver Sang Reviewed-by: Yin Fengwei Cc: Rik van Riel Cc: Matthew Wilcox Cc: Christopher Lameter Cc: Huang, Ying Cc: Signed-off-by: Andrew Morton --- include/linux/mman.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mman.h b/include/linux/mman.h index 40d94411d492..dc7048824be8 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -156,6 +156,7 @@ calc_vm_flag_bits(unsigned long flags) return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) | _calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ) | _calc_vm_trans(flags, MAP_SYNC, VM_SYNC ) | + _calc_vm_trans(flags, MAP_STACK, VM_NOHUGEPAGE) | arch_calc_vm_flag_bits(flags); } -- cgit v1.2.3 From f6564fce256a3944aa1bc76cb3c40e792d97c1eb Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 18 Jan 2024 11:59:14 +0100 Subject: mm, kmsan: fix infinite recursion due to RCU critical section Alexander Potapenko writes in [1]: "For every memory access in the code instrumented by KMSAN we call kmsan_get_metadata() to obtain the metadata for the memory being accessed. For virtual memory the metadata pointers are stored in the corresponding `struct page`, therefore we need to call virt_to_page() to get them. According to the comment in arch/x86/include/asm/page.h, virt_to_page(kaddr) returns a valid pointer iff virt_addr_valid(kaddr) is true, so KMSAN needs to call virt_addr_valid() as well. To avoid recursion, kmsan_get_metadata() must not call instrumented code, therefore ./arch/x86/include/asm/kmsan.h forks parts of arch/x86/mm/physaddr.c to check whether a virtual address is valid or not. But the introduction of rcu_read_lock() to pfn_valid() added instrumented RCU API calls to virt_to_page_or_null(), which is called by kmsan_get_metadata(), so there is an infinite recursion now. I do not think it is correct to stop that recursion by doing kmsan_enter_runtime()/kmsan_exit_runtime() in kmsan_get_metadata(): that would prevent instrumented functions called from within the runtime from tracking the shadow values, which might introduce false positives." Fix the issue by switching pfn_valid() to the _sched() variant of rcu_read_lock/unlock(), which does not require calling into RCU. Given the critical section in pfn_valid() is very small, this is a reasonable trade-off (with preemptible RCU). KMSAN further needs to be careful to suppress calls into the scheduler, which would be another source of recursion. This can be done by wrapping the call to pfn_valid() into preempt_disable/enable_no_resched(). The downside is that this sacrifices breaking scheduling guarantees; however, a kernel compiled with KMSAN has already given up any performance guarantees due to being heavily instrumented. Note, KMSAN code already disables tracing via Makefile, and since mmzone.h is included, it is not necessary to use the notrace variant, which is generally preferred in all other cases. Link: https://lkml.kernel.org/r/20240115184430.2710652-1-glider@google.com [1] Link: https://lkml.kernel.org/r/20240118110022.2538350-1-elver@google.com Fixes: 5ec8e8ea8b77 ("mm/sparsemem: fix race in accessing memory_section->usage") Signed-off-by: Marco Elver Reported-by: Alexander Potapenko Reported-by: syzbot+93a9e8a3dea8d6085e12@syzkaller.appspotmail.com Reviewed-by: Alexander Potapenko Tested-by: Alexander Potapenko Cc: Charan Teja Kalla Cc: Borislav Petkov (AMD) Cc: Dave Hansen Cc: Dmitry Vyukov Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- arch/x86/include/asm/kmsan.h | 17 ++++++++++++++++- include/linux/mmzone.h | 6 +++--- 2 files changed, 19 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/kmsan.h b/arch/x86/include/asm/kmsan.h index 8fa6ac0e2d76..d91b37f5b4bb 100644 --- a/arch/x86/include/asm/kmsan.h +++ b/arch/x86/include/asm/kmsan.h @@ -64,6 +64,7 @@ static inline bool kmsan_virt_addr_valid(void *addr) { unsigned long x = (unsigned long)addr; unsigned long y = x - __START_KERNEL_map; + bool ret; /* use the carry flag to determine if x was < __START_KERNEL_map */ if (unlikely(x > y)) { @@ -79,7 +80,21 @@ static inline bool kmsan_virt_addr_valid(void *addr) return false; } - return pfn_valid(x >> PAGE_SHIFT); + /* + * pfn_valid() relies on RCU, and may call into the scheduler on exiting + * the critical section. However, this would result in recursion with + * KMSAN. Therefore, disable preemption here, and re-enable preemption + * below while suppressing reschedules to avoid recursion. + * + * Note, this sacrifices occasionally breaking scheduling guarantees. + * Although, a kernel compiled with KMSAN has already given up on any + * performance guarantees due to being heavily instrumented. + */ + preempt_disable(); + ret = pfn_valid(x >> PAGE_SHIFT); + preempt_enable_no_resched(); + + return ret; } #endif /* !MODULE */ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 4ed33b127821..a497f189d988 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -2013,9 +2013,9 @@ static inline int pfn_valid(unsigned long pfn) if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS) return 0; ms = __pfn_to_section(pfn); - rcu_read_lock(); + rcu_read_lock_sched(); if (!valid_section(ms)) { - rcu_read_unlock(); + rcu_read_unlock_sched(); return 0; } /* @@ -2023,7 +2023,7 @@ static inline int pfn_valid(unsigned long pfn) * the entire section-sized span. */ ret = early_section(ms) || pfn_section_valid(ms, pfn); - rcu_read_unlock(); + rcu_read_unlock_sched(); return ret; } -- cgit v1.2.3 From a22fe1d6dec7e98535b97249fdc95c2be79120bb Mon Sep 17 00:00:00 2001 From: Frank Li Date: Tue, 23 Jan 2024 12:28:41 -0500 Subject: dmaengine: fix is_slave_direction() return false when DMA_DEV_TO_DEV is_slave_direction() should return true when direction is DMA_DEV_TO_DEV. Fixes: 49920bc66984 ("dmaengine: add new enum dma_transfer_direction") Signed-off-by: Frank Li Link: https://lore.kernel.org/r/20240123172842.3764529-1-Frank.Li@nxp.com Signed-off-by: Vinod Koul --- include/linux/dmaengine.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 3df70d6131c8..752dbde4cec1 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -953,7 +953,8 @@ static inline int dmaengine_slave_config(struct dma_chan *chan, static inline bool is_slave_direction(enum dma_transfer_direction direction) { - return (direction == DMA_MEM_TO_DEV) || (direction == DMA_DEV_TO_MEM); + return (direction == DMA_MEM_TO_DEV) || (direction == DMA_DEV_TO_MEM) || + (direction == DMA_DEV_TO_DEV); } static inline struct dma_async_tx_descriptor *dmaengine_prep_slave_single( -- cgit v1.2.3 From 5a287d3d2b9de2b3e747132c615599907ba5c3c1 Mon Sep 17 00:00:00 2001 From: Ondrej Mosnacek Date: Fri, 26 Jan 2024 19:45:31 +0100 Subject: lsm: fix default return value of the socket_getpeersec_*() hooks For these hooks the true "neutral" value is -EOPNOTSUPP, which is currently what is returned when no LSM provides this hook and what LSMs return when there is no security context set on the socket. Correct the value in and adjust the dispatch functions in security/security.c to avoid issues when the BPF LSM is enabled. Cc: stable@vger.kernel.org Fixes: 98e828a0650f ("security: Refactor declaration of LSM hooks") Signed-off-by: Ondrej Mosnacek [PM: subject line tweak] Signed-off-by: Paul Moore --- include/linux/lsm_hook_defs.h | 4 ++-- security/security.c | 31 +++++++++++++++++++++++++++---- 2 files changed, 29 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 185924c56378..76458b6d53da 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -315,9 +315,9 @@ LSM_HOOK(int, 0, socket_getsockopt, struct socket *sock, int level, int optname) LSM_HOOK(int, 0, socket_setsockopt, struct socket *sock, int level, int optname) LSM_HOOK(int, 0, socket_shutdown, struct socket *sock, int how) LSM_HOOK(int, 0, socket_sock_rcv_skb, struct sock *sk, struct sk_buff *skb) -LSM_HOOK(int, 0, socket_getpeersec_stream, struct socket *sock, +LSM_HOOK(int, -ENOPROTOOPT, socket_getpeersec_stream, struct socket *sock, sockptr_t optval, sockptr_t optlen, unsigned int len) -LSM_HOOK(int, 0, socket_getpeersec_dgram, struct socket *sock, +LSM_HOOK(int, -ENOPROTOOPT, socket_getpeersec_dgram, struct socket *sock, struct sk_buff *skb, u32 *secid) LSM_HOOK(int, 0, sk_alloc_security, struct sock *sk, int family, gfp_t priority) LSM_HOOK(void, LSM_RET_VOID, sk_free_security, struct sock *sk) diff --git a/security/security.c b/security/security.c index 6196ccaba433..3aaad75c9ce8 100644 --- a/security/security.c +++ b/security/security.c @@ -4624,8 +4624,20 @@ EXPORT_SYMBOL(security_sock_rcv_skb); int security_socket_getpeersec_stream(struct socket *sock, sockptr_t optval, sockptr_t optlen, unsigned int len) { - return call_int_hook(socket_getpeersec_stream, -ENOPROTOOPT, sock, - optval, optlen, len); + struct security_hook_list *hp; + int rc; + + /* + * Only one module will provide a security context. + */ + hlist_for_each_entry(hp, &security_hook_heads.socket_getpeersec_stream, + list) { + rc = hp->hook.socket_getpeersec_stream(sock, optval, optlen, + len); + if (rc != LSM_RET_DEFAULT(socket_getpeersec_stream)) + return rc; + } + return LSM_RET_DEFAULT(socket_getpeersec_stream); } /** @@ -4645,8 +4657,19 @@ int security_socket_getpeersec_stream(struct socket *sock, sockptr_t optval, int security_socket_getpeersec_dgram(struct socket *sock, struct sk_buff *skb, u32 *secid) { - return call_int_hook(socket_getpeersec_dgram, -ENOPROTOOPT, sock, - skb, secid); + struct security_hook_list *hp; + int rc; + + /* + * Only one module will provide a security context. + */ + hlist_for_each_entry(hp, &security_hook_heads.socket_getpeersec_dgram, + list) { + rc = hp->hook.socket_getpeersec_dgram(sock, skb, secid); + if (rc != LSM_RET_DEFAULT(socket_getpeersec_dgram)) + return rc; + } + return LSM_RET_DEFAULT(socket_getpeersec_dgram); } EXPORT_SYMBOL(security_socket_getpeersec_dgram); -- cgit v1.2.3 From 764ad6b02777d77dca3659ca490f0898aa593670 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Wed, 24 Jan 2024 12:26:59 +0100 Subject: HID: bpf: use __bpf_kfunc instead of noinline Follow the docs at Documentation/bpf/kfuncs.rst: - declare the function with `__bpf_kfunc` - disables missing prototype warnings, which allows to remove them from include/linux/hid-bpf.h Removing the prototypes is not an issue because we currently have to redeclare them when writing the BPF program. They will eventually be generated by bpftool directly AFAIU. Link: https://lore.kernel.org/r/20240124-b4-hid-bpf-fixes-v2-3-052520b1e5e6@kernel.org Signed-off-by: Benjamin Tissoires --- drivers/hid/bpf/hid_bpf_dispatch.c | 18 +++++++++++++----- include/linux/hid_bpf.h | 11 ----------- 2 files changed, 13 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/drivers/hid/bpf/hid_bpf_dispatch.c b/drivers/hid/bpf/hid_bpf_dispatch.c index 7903c8638e81..470ae2c29c94 100644 --- a/drivers/hid/bpf/hid_bpf_dispatch.c +++ b/drivers/hid/bpf/hid_bpf_dispatch.c @@ -143,6 +143,9 @@ u8 *call_hid_bpf_rdesc_fixup(struct hid_device *hdev, u8 *rdesc, unsigned int *s } EXPORT_SYMBOL_GPL(call_hid_bpf_rdesc_fixup); +/* Disables missing prototype warnings */ +__bpf_kfunc_start_defs(); + /** * hid_bpf_get_data - Get the kernel memory pointer associated with the context @ctx * @@ -152,7 +155,7 @@ EXPORT_SYMBOL_GPL(call_hid_bpf_rdesc_fixup); * * @returns %NULL on error, an %__u8 memory pointer on success */ -noinline __u8 * +__bpf_kfunc __u8 * hid_bpf_get_data(struct hid_bpf_ctx *ctx, unsigned int offset, const size_t rdwr_buf_size) { struct hid_bpf_ctx_kern *ctx_kern; @@ -167,6 +170,7 @@ hid_bpf_get_data(struct hid_bpf_ctx *ctx, unsigned int offset, const size_t rdwr return ctx_kern->data + offset; } +__bpf_kfunc_end_defs(); /* * The following set contains all functions we agree BPF programs @@ -274,6 +278,9 @@ static int do_hid_bpf_attach_prog(struct hid_device *hdev, int prog_fd, struct b return fd; } +/* Disables missing prototype warnings */ +__bpf_kfunc_start_defs(); + /** * hid_bpf_attach_prog - Attach the given @prog_fd to the given HID device * @@ -286,7 +293,7 @@ static int do_hid_bpf_attach_prog(struct hid_device *hdev, int prog_fd, struct b * is pinned to the BPF file system). */ /* called from syscall */ -noinline int +__bpf_kfunc int hid_bpf_attach_prog(unsigned int hid_id, int prog_fd, __u32 flags) { struct hid_device *hdev; @@ -338,7 +345,7 @@ hid_bpf_attach_prog(unsigned int hid_id, int prog_fd, __u32 flags) * * @returns A pointer to &struct hid_bpf_ctx on success, %NULL on error. */ -noinline struct hid_bpf_ctx * +__bpf_kfunc struct hid_bpf_ctx * hid_bpf_allocate_context(unsigned int hid_id) { struct hid_device *hdev; @@ -371,7 +378,7 @@ hid_bpf_allocate_context(unsigned int hid_id) * @ctx: the HID-BPF context to release * */ -noinline void +__bpf_kfunc void hid_bpf_release_context(struct hid_bpf_ctx *ctx) { struct hid_bpf_ctx_kern *ctx_kern; @@ -397,7 +404,7 @@ hid_bpf_release_context(struct hid_bpf_ctx *ctx) * * @returns %0 on success, a negative error code otherwise. */ -noinline int +__bpf_kfunc int hid_bpf_hw_request(struct hid_bpf_ctx *ctx, __u8 *buf, size_t buf__sz, enum hid_report_type rtype, enum hid_class_request reqtype) { @@ -465,6 +472,7 @@ hid_bpf_hw_request(struct hid_bpf_ctx *ctx, __u8 *buf, size_t buf__sz, kfree(dma_data); return ret; } +__bpf_kfunc_end_defs(); /* our HID-BPF entrypoints */ BTF_SET8_START(hid_bpf_fmodret_ids) diff --git a/include/linux/hid_bpf.h b/include/linux/hid_bpf.h index 840cd254172d..7118ac28d468 100644 --- a/include/linux/hid_bpf.h +++ b/include/linux/hid_bpf.h @@ -77,17 +77,6 @@ enum hid_bpf_attach_flags { int hid_bpf_device_event(struct hid_bpf_ctx *ctx); int hid_bpf_rdesc_fixup(struct hid_bpf_ctx *ctx); -/* Following functions are kfunc that we export to BPF programs */ -/* available everywhere in HID-BPF */ -__u8 *hid_bpf_get_data(struct hid_bpf_ctx *ctx, unsigned int offset, const size_t __sz); - -/* only available in syscall */ -int hid_bpf_attach_prog(unsigned int hid_id, int prog_fd, __u32 flags); -int hid_bpf_hw_request(struct hid_bpf_ctx *ctx, __u8 *buf, size_t buf__sz, - enum hid_report_type rtype, enum hid_class_request reqtype); -struct hid_bpf_ctx *hid_bpf_allocate_context(unsigned int hid_id); -void hid_bpf_release_context(struct hid_bpf_ctx *ctx); - /* * Below is HID internal */ -- cgit v1.2.3 From 1e560864159d002b453da42bd2c13a1805515a20 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Tue, 30 Jan 2024 11:02:43 +0100 Subject: PCI/ASPM: Fix deadlock when enabling ASPM A last minute revert in 6.7-final introduced a potential deadlock when enabling ASPM during probe of Qualcomm PCIe controllers as reported by lockdep: ============================================ WARNING: possible recursive locking detected 6.7.0 #40 Not tainted -------------------------------------------- kworker/u16:5/90 is trying to acquire lock: ffffacfa78ced000 (pci_bus_sem){++++}-{3:3}, at: pcie_aspm_pm_state_change+0x58/0xdc but task is already holding lock: ffffacfa78ced000 (pci_bus_sem){++++}-{3:3}, at: pci_walk_bus+0x34/0xbc other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(pci_bus_sem); lock(pci_bus_sem); *** DEADLOCK *** Call trace: print_deadlock_bug+0x25c/0x348 __lock_acquire+0x10a4/0x2064 lock_acquire+0x1e8/0x318 down_read+0x60/0x184 pcie_aspm_pm_state_change+0x58/0xdc pci_set_full_power_state+0xa8/0x114 pci_set_power_state+0xc4/0x120 qcom_pcie_enable_aspm+0x1c/0x3c [pcie_qcom] pci_walk_bus+0x64/0xbc qcom_pcie_host_post_init_2_7_0+0x28/0x34 [pcie_qcom] The deadlock can easily be reproduced on machines like the Lenovo ThinkPad X13s by adding a delay to increase the race window during asynchronous probe where another thread can take a write lock. Add a new pci_set_power_state_locked() and associated helper functions that can be called with the PCI bus semaphore held to avoid taking the read lock twice. Link: https://lore.kernel.org/r/ZZu0qx2cmn7IwTyQ@hovoldconsulting.com Link: https://lore.kernel.org/r/20240130100243.11011-1-johan+linaro@kernel.org Fixes: f93e71aea6c6 ("Revert "PCI/ASPM: Remove pcie_aspm_pm_state_change()"") Signed-off-by: Johan Hovold Signed-off-by: Bjorn Helgaas Cc: # 6.7 --- drivers/pci/bus.c | 49 +++++++++++++-------- drivers/pci/controller/dwc/pcie-qcom.c | 2 +- drivers/pci/pci.c | 78 ++++++++++++++++++++++------------ drivers/pci/pci.h | 4 +- drivers/pci/pcie/aspm.c | 13 ++++-- include/linux/pci.h | 5 +++ 6 files changed, 101 insertions(+), 50 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c index 9c2137dae429..826b5016a101 100644 --- a/drivers/pci/bus.c +++ b/drivers/pci/bus.c @@ -386,21 +386,8 @@ void pci_bus_add_devices(const struct pci_bus *bus) } EXPORT_SYMBOL(pci_bus_add_devices); -/** pci_walk_bus - walk devices on/under bus, calling callback. - * @top bus whose devices should be walked - * @cb callback to be called for each device found - * @userdata arbitrary pointer to be passed to callback. - * - * Walk the given bus, including any bridged devices - * on buses under this bus. Call the provided callback - * on each device found. - * - * We check the return of @cb each time. If it returns anything - * other than 0, we break out. - * - */ -void pci_walk_bus(struct pci_bus *top, int (*cb)(struct pci_dev *, void *), - void *userdata) +static void __pci_walk_bus(struct pci_bus *top, int (*cb)(struct pci_dev *, void *), + void *userdata, bool locked) { struct pci_dev *dev; struct pci_bus *bus; @@ -408,7 +395,8 @@ void pci_walk_bus(struct pci_bus *top, int (*cb)(struct pci_dev *, void *), int retval; bus = top; - down_read(&pci_bus_sem); + if (!locked) + down_read(&pci_bus_sem); next = top->devices.next; for (;;) { if (next == &bus->devices) { @@ -431,10 +419,37 @@ void pci_walk_bus(struct pci_bus *top, int (*cb)(struct pci_dev *, void *), if (retval) break; } - up_read(&pci_bus_sem); + if (!locked) + up_read(&pci_bus_sem); +} + +/** + * pci_walk_bus - walk devices on/under bus, calling callback. + * @top: bus whose devices should be walked + * @cb: callback to be called for each device found + * @userdata: arbitrary pointer to be passed to callback + * + * Walk the given bus, including any bridged devices + * on buses under this bus. Call the provided callback + * on each device found. + * + * We check the return of @cb each time. If it returns anything + * other than 0, we break out. + */ +void pci_walk_bus(struct pci_bus *top, int (*cb)(struct pci_dev *, void *), void *userdata) +{ + __pci_walk_bus(top, cb, userdata, false); } EXPORT_SYMBOL_GPL(pci_walk_bus); +void pci_walk_bus_locked(struct pci_bus *top, int (*cb)(struct pci_dev *, void *), void *userdata) +{ + lockdep_assert_held(&pci_bus_sem); + + __pci_walk_bus(top, cb, userdata, true); +} +EXPORT_SYMBOL_GPL(pci_walk_bus_locked); + struct pci_bus *pci_bus_get(struct pci_bus *bus) { if (bus) diff --git a/drivers/pci/controller/dwc/pcie-qcom.c b/drivers/pci/controller/dwc/pcie-qcom.c index 10f2d0bb86be..2ce2a3bd932b 100644 --- a/drivers/pci/controller/dwc/pcie-qcom.c +++ b/drivers/pci/controller/dwc/pcie-qcom.c @@ -972,7 +972,7 @@ static int qcom_pcie_enable_aspm(struct pci_dev *pdev, void *userdata) * Downstream devices need to be in D0 state before enabling PCI PM * substates. */ - pci_set_power_state(pdev, PCI_D0); + pci_set_power_state_locked(pdev, PCI_D0); pci_enable_link_state_locked(pdev, PCIE_LINK_STATE_ALL); return 0; diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index d8f11a078924..9ab9b1008d8b 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -1354,6 +1354,7 @@ end: /** * pci_set_full_power_state - Put a PCI device into D0 and update its state * @dev: PCI device to power up + * @locked: whether pci_bus_sem is held * * Call pci_power_up() to put @dev into D0, read from its PCI_PM_CTRL register * to confirm the state change, restore its BARs if they might be lost and @@ -1363,7 +1364,7 @@ end: * to D0, it is more efficient to use pci_power_up() directly instead of this * function. */ -static int pci_set_full_power_state(struct pci_dev *dev) +static int pci_set_full_power_state(struct pci_dev *dev, bool locked) { u16 pmcsr; int ret; @@ -1399,7 +1400,7 @@ static int pci_set_full_power_state(struct pci_dev *dev) } if (dev->bus->self) - pcie_aspm_pm_state_change(dev->bus->self); + pcie_aspm_pm_state_change(dev->bus->self, locked); return 0; } @@ -1428,10 +1429,22 @@ void pci_bus_set_current_state(struct pci_bus *bus, pci_power_t state) pci_walk_bus(bus, __pci_dev_set_current_state, &state); } +static void __pci_bus_set_current_state(struct pci_bus *bus, pci_power_t state, bool locked) +{ + if (!bus) + return; + + if (locked) + pci_walk_bus_locked(bus, __pci_dev_set_current_state, &state); + else + pci_walk_bus(bus, __pci_dev_set_current_state, &state); +} + /** * pci_set_low_power_state - Put a PCI device into a low-power state. * @dev: PCI device to handle. * @state: PCI power state (D1, D2, D3hot) to put the device into. + * @locked: whether pci_bus_sem is held * * Use the device's PCI_PM_CTRL register to put it into a low-power state. * @@ -1442,7 +1455,7 @@ void pci_bus_set_current_state(struct pci_bus *bus, pci_power_t state) * 0 if device already is in the requested state. * 0 if device's power state has been successfully changed. */ -static int pci_set_low_power_state(struct pci_dev *dev, pci_power_t state) +static int pci_set_low_power_state(struct pci_dev *dev, pci_power_t state, bool locked) { u16 pmcsr; @@ -1496,29 +1509,12 @@ static int pci_set_low_power_state(struct pci_dev *dev, pci_power_t state) pci_power_name(state)); if (dev->bus->self) - pcie_aspm_pm_state_change(dev->bus->self); + pcie_aspm_pm_state_change(dev->bus->self, locked); return 0; } -/** - * pci_set_power_state - Set the power state of a PCI device - * @dev: PCI device to handle. - * @state: PCI power state (D0, D1, D2, D3hot) to put the device into. - * - * Transition a device to a new power state, using the platform firmware and/or - * the device's PCI PM registers. - * - * RETURN VALUE: - * -EINVAL if the requested state is invalid. - * -EIO if device does not support PCI PM or its PM capabilities register has a - * wrong version, or device doesn't support the requested state. - * 0 if the transition is to D1 or D2 but D1 and D2 are not supported. - * 0 if device already is in the requested state. - * 0 if the transition is to D3 but D3 is not supported. - * 0 if device's power state has been successfully changed. - */ -int pci_set_power_state(struct pci_dev *dev, pci_power_t state) +static int __pci_set_power_state(struct pci_dev *dev, pci_power_t state, bool locked) { int error; @@ -1542,7 +1538,7 @@ int pci_set_power_state(struct pci_dev *dev, pci_power_t state) return 0; if (state == PCI_D0) - return pci_set_full_power_state(dev); + return pci_set_full_power_state(dev, locked); /* * This device is quirked not to be put into D3, so don't put it in @@ -1556,16 +1552,16 @@ int pci_set_power_state(struct pci_dev *dev, pci_power_t state) * To put the device in D3cold, put it into D3hot in the native * way, then put it into D3cold using platform ops. */ - error = pci_set_low_power_state(dev, PCI_D3hot); + error = pci_set_low_power_state(dev, PCI_D3hot, locked); if (pci_platform_power_transition(dev, PCI_D3cold)) return error; /* Powering off a bridge may power off the whole hierarchy */ if (dev->current_state == PCI_D3cold) - pci_bus_set_current_state(dev->subordinate, PCI_D3cold); + __pci_bus_set_current_state(dev->subordinate, PCI_D3cold, locked); } else { - error = pci_set_low_power_state(dev, state); + error = pci_set_low_power_state(dev, state, locked); if (pci_platform_power_transition(dev, state)) return error; @@ -1573,8 +1569,38 @@ int pci_set_power_state(struct pci_dev *dev, pci_power_t state) return 0; } + +/** + * pci_set_power_state - Set the power state of a PCI device + * @dev: PCI device to handle. + * @state: PCI power state (D0, D1, D2, D3hot) to put the device into. + * + * Transition a device to a new power state, using the platform firmware and/or + * the device's PCI PM registers. + * + * RETURN VALUE: + * -EINVAL if the requested state is invalid. + * -EIO if device does not support PCI PM or its PM capabilities register has a + * wrong version, or device doesn't support the requested state. + * 0 if the transition is to D1 or D2 but D1 and D2 are not supported. + * 0 if device already is in the requested state. + * 0 if the transition is to D3 but D3 is not supported. + * 0 if device's power state has been successfully changed. + */ +int pci_set_power_state(struct pci_dev *dev, pci_power_t state) +{ + return __pci_set_power_state(dev, state, false); +} EXPORT_SYMBOL(pci_set_power_state); +int pci_set_power_state_locked(struct pci_dev *dev, pci_power_t state) +{ + lockdep_assert_held(&pci_bus_sem); + + return __pci_set_power_state(dev, state, true); +} +EXPORT_SYMBOL(pci_set_power_state_locked); + #define PCI_EXP_SAVE_REGS 7 static struct pci_cap_saved_state *_pci_find_saved_cap(struct pci_dev *pci_dev, diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 2336a8d1edab..e9750b1b19ba 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -571,12 +571,12 @@ int pcie_retrain_link(struct pci_dev *pdev, bool use_lt); #ifdef CONFIG_PCIEASPM void pcie_aspm_init_link_state(struct pci_dev *pdev); void pcie_aspm_exit_link_state(struct pci_dev *pdev); -void pcie_aspm_pm_state_change(struct pci_dev *pdev); +void pcie_aspm_pm_state_change(struct pci_dev *pdev, bool locked); void pcie_aspm_powersave_config_link(struct pci_dev *pdev); #else static inline void pcie_aspm_init_link_state(struct pci_dev *pdev) { } static inline void pcie_aspm_exit_link_state(struct pci_dev *pdev) { } -static inline void pcie_aspm_pm_state_change(struct pci_dev *pdev) { } +static inline void pcie_aspm_pm_state_change(struct pci_dev *pdev, bool locked) { } static inline void pcie_aspm_powersave_config_link(struct pci_dev *pdev) { } #endif diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c index 5a0066ecc3c5..bc0bd86695ec 100644 --- a/drivers/pci/pcie/aspm.c +++ b/drivers/pci/pcie/aspm.c @@ -1003,8 +1003,11 @@ void pcie_aspm_exit_link_state(struct pci_dev *pdev) up_read(&pci_bus_sem); } -/* @pdev: the root port or switch downstream port */ -void pcie_aspm_pm_state_change(struct pci_dev *pdev) +/* + * @pdev: the root port or switch downstream port + * @locked: whether pci_bus_sem is held + */ +void pcie_aspm_pm_state_change(struct pci_dev *pdev, bool locked) { struct pcie_link_state *link = pdev->link_state; @@ -1014,12 +1017,14 @@ void pcie_aspm_pm_state_change(struct pci_dev *pdev) * Devices changed PM state, we should recheck if latency * meets all functions' requirement */ - down_read(&pci_bus_sem); + if (!locked) + down_read(&pci_bus_sem); mutex_lock(&aspm_lock); pcie_update_aspm_capable(link->root); pcie_config_aspm_path(link); mutex_unlock(&aspm_lock); - up_read(&pci_bus_sem); + if (!locked) + up_read(&pci_bus_sem); } void pcie_aspm_powersave_config_link(struct pci_dev *pdev) diff --git a/include/linux/pci.h b/include/linux/pci.h index add9368e6314..7ab0d13672da 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1422,6 +1422,7 @@ int pci_load_and_free_saved_state(struct pci_dev *dev, struct pci_saved_state **state); int pci_platform_power_transition(struct pci_dev *dev, pci_power_t state); int pci_set_power_state(struct pci_dev *dev, pci_power_t state); +int pci_set_power_state_locked(struct pci_dev *dev, pci_power_t state); pci_power_t pci_choose_state(struct pci_dev *dev, pm_message_t state); bool pci_pme_capable(struct pci_dev *dev, pci_power_t state); void pci_pme_active(struct pci_dev *dev, bool enable); @@ -1625,6 +1626,8 @@ int pci_scan_bridge(struct pci_bus *bus, struct pci_dev *dev, int max, void pci_walk_bus(struct pci_bus *top, int (*cb)(struct pci_dev *, void *), void *userdata); +void pci_walk_bus_locked(struct pci_bus *top, int (*cb)(struct pci_dev *, void *), + void *userdata); int pci_cfg_space_size(struct pci_dev *dev); unsigned char pci_bus_max_busnr(struct pci_bus *bus); void pci_setup_bridge(struct pci_bus *bus); @@ -2025,6 +2028,8 @@ static inline int pci_save_state(struct pci_dev *dev) { return 0; } static inline void pci_restore_state(struct pci_dev *dev) { } static inline int pci_set_power_state(struct pci_dev *dev, pci_power_t state) { return 0; } +static inline int pci_set_power_state_locked(struct pci_dev *dev, pci_power_t state) +{ return 0; } static inline int pci_wake_from_d3(struct pci_dev *dev, bool enable) { return 0; } static inline pci_power_t pci_choose_state(struct pci_dev *dev, -- cgit v1.2.3 From 97f7cf1cd80eeed3b7c808b7c12463295c751001 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Mon, 29 Jan 2024 10:57:01 +0100 Subject: netfilter: ipset: fix performance regression in swap operation The patch "netfilter: ipset: fix race condition between swap/destroy and kernel side add/del/test", commit 28628fa9 fixes a race condition. But the synchronize_rcu() added to the swap function unnecessarily slows it down: it can safely be moved to destroy and use call_rcu() instead. Eric Dumazet pointed out that simply calling the destroy functions as rcu callback does not work: sets with timeout use garbage collectors which need cancelling at destroy which can wait. Therefore the destroy functions are split into two: cancelling garbage collectors safely at executing the command received by netlink and moving the remaining part only into the rcu callback. Link: https://lore.kernel.org/lkml/C0829B10-EAA6-4809-874E-E1E9C05A8D84@automattic.com/ Fixes: 28628fa952fe ("netfilter: ipset: fix race condition between swap/destroy and kernel side add/del/test") Reported-by: Ale Crismani Reported-by: David Wang <00107082@163.com> Tested-by: David Wang <00107082@163.com> Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/ipset/ip_set.h | 4 ++++ net/netfilter/ipset/ip_set_bitmap_gen.h | 14 ++++++++++--- net/netfilter/ipset/ip_set_core.c | 37 +++++++++++++++++++++++++-------- net/netfilter/ipset/ip_set_hash_gen.h | 15 ++++++++++--- net/netfilter/ipset/ip_set_list_set.c | 13 +++++++++--- 5 files changed, 65 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h index e8c350a3ade1..e9f4f845d760 100644 --- a/include/linux/netfilter/ipset/ip_set.h +++ b/include/linux/netfilter/ipset/ip_set.h @@ -186,6 +186,8 @@ struct ip_set_type_variant { /* Return true if "b" set is the same as "a" * according to the create set parameters */ bool (*same_set)(const struct ip_set *a, const struct ip_set *b); + /* Cancel ongoing garbage collectors before destroying the set*/ + void (*cancel_gc)(struct ip_set *set); /* Region-locking is used */ bool region_lock; }; @@ -242,6 +244,8 @@ extern void ip_set_type_unregister(struct ip_set_type *set_type); /* A generic IP set */ struct ip_set { + /* For call_cru in destroy */ + struct rcu_head rcu; /* The name of the set */ char name[IPSET_MAXNAMELEN]; /* Lock protecting the set data */ diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h index 21f7860e8fa1..cb48a2b9cb9f 100644 --- a/net/netfilter/ipset/ip_set_bitmap_gen.h +++ b/net/netfilter/ipset/ip_set_bitmap_gen.h @@ -30,6 +30,7 @@ #define mtype_del IPSET_TOKEN(MTYPE, _del) #define mtype_list IPSET_TOKEN(MTYPE, _list) #define mtype_gc IPSET_TOKEN(MTYPE, _gc) +#define mtype_cancel_gc IPSET_TOKEN(MTYPE, _cancel_gc) #define mtype MTYPE #define get_ext(set, map, id) ((map)->extensions + ((set)->dsize * (id))) @@ -59,9 +60,6 @@ mtype_destroy(struct ip_set *set) { struct mtype *map = set->data; - if (SET_WITH_TIMEOUT(set)) - del_timer_sync(&map->gc); - if (set->dsize && set->extensions & IPSET_EXT_DESTROY) mtype_ext_cleanup(set); ip_set_free(map->members); @@ -290,6 +288,15 @@ mtype_gc(struct timer_list *t) add_timer(&map->gc); } +static void +mtype_cancel_gc(struct ip_set *set) +{ + struct mtype *map = set->data; + + if (SET_WITH_TIMEOUT(set)) + del_timer_sync(&map->gc); +} + static const struct ip_set_type_variant mtype = { .kadt = mtype_kadt, .uadt = mtype_uadt, @@ -303,6 +310,7 @@ static const struct ip_set_type_variant mtype = { .head = mtype_head, .list = mtype_list, .same_set = mtype_same_set, + .cancel_gc = mtype_cancel_gc, }; #endif /* __IP_SET_BITMAP_IP_GEN_H */ diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 4c133e06be1d..bcaad9c009fe 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -1182,6 +1182,14 @@ ip_set_destroy_set(struct ip_set *set) kfree(set); } +static void +ip_set_destroy_set_rcu(struct rcu_head *head) +{ + struct ip_set *set = container_of(head, struct ip_set, rcu); + + ip_set_destroy_set(set); +} + static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const attr[]) { @@ -1193,8 +1201,6 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info, if (unlikely(protocol_min_failed(attr))) return -IPSET_ERR_PROTOCOL; - /* Must wait for flush to be really finished in list:set */ - rcu_barrier(); /* Commands are serialized and references are * protected by the ip_set_ref_lock. @@ -1206,8 +1212,10 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info, * counter, so if it's already zero, we can proceed * without holding the lock. */ - read_lock_bh(&ip_set_ref_lock); if (!attr[IPSET_ATTR_SETNAME]) { + /* Must wait for flush to be really finished in list:set */ + rcu_barrier(); + read_lock_bh(&ip_set_ref_lock); for (i = 0; i < inst->ip_set_max; i++) { s = ip_set(inst, i); if (s && (s->ref || s->ref_netlink)) { @@ -1221,6 +1229,8 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info, s = ip_set(inst, i); if (s) { ip_set(inst, i) = NULL; + /* Must cancel garbage collectors */ + s->variant->cancel_gc(s); ip_set_destroy_set(s); } } @@ -1228,6 +1238,9 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info, inst->is_destroyed = false; } else { u32 flags = flag_exist(info->nlh); + u16 features = 0; + + read_lock_bh(&ip_set_ref_lock); s = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]), &i); if (!s) { @@ -1238,10 +1251,16 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info, ret = -IPSET_ERR_BUSY; goto out; } + features = s->type->features; ip_set(inst, i) = NULL; read_unlock_bh(&ip_set_ref_lock); - - ip_set_destroy_set(s); + if (features & IPSET_TYPE_NAME) { + /* Must wait for flush to be really finished */ + rcu_barrier(); + } + /* Must cancel garbage collectors */ + s->variant->cancel_gc(s); + call_rcu(&s->rcu, ip_set_destroy_set_rcu); } return 0; out: @@ -1394,9 +1413,6 @@ static int ip_set_swap(struct sk_buff *skb, const struct nfnl_info *info, ip_set(inst, to_id) = from; write_unlock_bh(&ip_set_ref_lock); - /* Make sure all readers of the old set pointers are completed. */ - synchronize_rcu(); - return 0; } @@ -2409,8 +2425,11 @@ ip_set_fini(void) { nf_unregister_sockopt(&so_set); nfnetlink_subsys_unregister(&ip_set_netlink_subsys); - unregister_pernet_subsys(&ip_set_net_ops); + + /* Wait for call_rcu() in destroy */ + rcu_barrier(); + pr_debug("these are the famous last words\n"); } diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index cbf80da9a01c..1136510521a8 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -222,6 +222,7 @@ static const union nf_inet_addr zeromask = {}; #undef mtype_gc_do #undef mtype_gc #undef mtype_gc_init +#undef mtype_cancel_gc #undef mtype_variant #undef mtype_data_match @@ -266,6 +267,7 @@ static const union nf_inet_addr zeromask = {}; #define mtype_gc_do IPSET_TOKEN(MTYPE, _gc_do) #define mtype_gc IPSET_TOKEN(MTYPE, _gc) #define mtype_gc_init IPSET_TOKEN(MTYPE, _gc_init) +#define mtype_cancel_gc IPSET_TOKEN(MTYPE, _cancel_gc) #define mtype_variant IPSET_TOKEN(MTYPE, _variant) #define mtype_data_match IPSET_TOKEN(MTYPE, _data_match) @@ -450,9 +452,6 @@ mtype_destroy(struct ip_set *set) struct htype *h = set->data; struct list_head *l, *lt; - if (SET_WITH_TIMEOUT(set)) - cancel_delayed_work_sync(&h->gc.dwork); - mtype_ahash_destroy(set, ipset_dereference_nfnl(h->table), true); list_for_each_safe(l, lt, &h->ad) { list_del(l); @@ -599,6 +598,15 @@ mtype_gc_init(struct htable_gc *gc) queue_delayed_work(system_power_efficient_wq, &gc->dwork, HZ); } +static void +mtype_cancel_gc(struct ip_set *set) +{ + struct htype *h = set->data; + + if (SET_WITH_TIMEOUT(set)) + cancel_delayed_work_sync(&h->gc.dwork); +} + static int mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, struct ip_set_ext *mext, u32 flags); @@ -1441,6 +1449,7 @@ static const struct ip_set_type_variant mtype_variant = { .uref = mtype_uref, .resize = mtype_resize, .same_set = mtype_same_set, + .cancel_gc = mtype_cancel_gc, .region_lock = true, }; diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c index e162636525cf..6c3f28bc59b3 100644 --- a/net/netfilter/ipset/ip_set_list_set.c +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -426,9 +426,6 @@ list_set_destroy(struct ip_set *set) struct list_set *map = set->data; struct set_elem *e, *n; - if (SET_WITH_TIMEOUT(set)) - timer_shutdown_sync(&map->gc); - list_for_each_entry_safe(e, n, &map->members, list) { list_del(&e->list); ip_set_put_byindex(map->net, e->id); @@ -545,6 +542,15 @@ list_set_same_set(const struct ip_set *a, const struct ip_set *b) a->extensions == b->extensions; } +static void +list_set_cancel_gc(struct ip_set *set) +{ + struct list_set *map = set->data; + + if (SET_WITH_TIMEOUT(set)) + timer_shutdown_sync(&map->gc); +} + static const struct ip_set_type_variant set_variant = { .kadt = list_set_kadt, .uadt = list_set_uadt, @@ -558,6 +564,7 @@ static const struct ip_set_type_variant set_variant = { .head = list_set_head, .list = list_set_list, .same_set = list_set_same_set, + .cancel_gc = list_set_cancel_gc, }; static void -- cgit v1.2.3 From f9e9115d0c014dec3278d68823eaff159f98f4d6 Mon Sep 17 00:00:00 2001 From: Caleb Sander Date: Wed, 31 Jan 2024 09:43:13 -0700 Subject: nvme: take const cmd pointer in read-only helpers nvme_is_fabrics() and nvme_is_write() only read struct nvme_command, so take it by const pointer. This allows callers to pass a const pointer and communicates that these functions don't modify the command. Signed-off-by: Caleb Sander Reviewed-by: Chaitanya Kulkarni Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- include/linux/nvme.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 68eff8c86ce3..bc605ec4a3fd 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -1812,7 +1812,7 @@ struct nvme_command { }; }; -static inline bool nvme_is_fabrics(struct nvme_command *cmd) +static inline bool nvme_is_fabrics(const struct nvme_command *cmd) { return cmd->common.opcode == nvme_fabrics_command; } @@ -1831,7 +1831,7 @@ struct nvme_error_slot { __u8 resv2[24]; }; -static inline bool nvme_is_write(struct nvme_command *cmd) +static inline bool nvme_is_write(const struct nvme_command *cmd) { /* * What a mess... -- cgit v1.2.3 From 54ce1927eb787f7bbb7ee664841c8f5932703f39 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Wed, 31 Jan 2024 15:55:38 -0800 Subject: cxl/cper: Fix errant CPER prints for CXL events Jonathan reports that CXL CPER events dump an extra generic error message. {1}[Hardware Error]: Hardware error from APEI Generic Hardware Error Source: 1 {1}[Hardware Error]: event severity: recoverable {1}[Hardware Error]: Error 0, type: recoverable {1}[Hardware Error]: section type: unknown, fbcd0a77-c260-417f-85a9-088b1621eba6 {1}[Hardware Error]: section length: 0x90 {1}[Hardware Error]: 00000000: 00000090 00000007 00000000 0d938086 ................ {1}[Hardware Error]: 00000010: 00100000 00000000 00040000 00000000 ................ ... CXL events were rerouted though the CXL subsystem for additional processing. However, when that work was done it was missed that cper_estatus_print_section() continued with a generic error message which is confusing. Teach CPER print code to ignore printing details of some section types. Assign the CXL event GUIDs to this set to prevent confusing unknown prints. Reported-by: Jonathan Cameron Suggested-by: Jonathan Cameron Signed-off-by: Ira Weiny Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Reviewed-by: Alison Schofield Signed-off-by: Ard Biesheuvel --- drivers/acpi/apei/ghes.c | 26 -------------------------- drivers/firmware/efi/cper.c | 19 +++++++++++++++++++ include/linux/cper.h | 23 +++++++++++++++++++++++ 3 files changed, 42 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index 7b7c605166e0..fe825a432c5b 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -680,32 +680,6 @@ static void ghes_defer_non_standard_event(struct acpi_hest_generic_data *gdata, static DECLARE_RWSEM(cxl_cper_rw_sem); static cxl_cper_callback cper_callback; -/* CXL Event record UUIDs are formatted as GUIDs and reported in section type */ - -/* - * General Media Event Record - * CXL rev 3.0 Section 8.2.9.2.1.1; Table 8-43 - */ -#define CPER_SEC_CXL_GEN_MEDIA_GUID \ - GUID_INIT(0xfbcd0a77, 0xc260, 0x417f, \ - 0x85, 0xa9, 0x08, 0x8b, 0x16, 0x21, 0xeb, 0xa6) - -/* - * DRAM Event Record - * CXL rev 3.0 section 8.2.9.2.1.2; Table 8-44 - */ -#define CPER_SEC_CXL_DRAM_GUID \ - GUID_INIT(0x601dcbb3, 0x9c06, 0x4eab, \ - 0xb8, 0xaf, 0x4e, 0x9b, 0xfb, 0x5c, 0x96, 0x24) - -/* - * Memory Module Event Record - * CXL rev 3.0 section 8.2.9.2.1.3; Table 8-45 - */ -#define CPER_SEC_CXL_MEM_MODULE_GUID \ - GUID_INIT(0xfe927475, 0xdd59, 0x4339, \ - 0xa5, 0x86, 0x79, 0xba, 0xb1, 0x13, 0xb7, 0x74) - static void cxl_cper_post_event(enum cxl_event_type event_type, struct cxl_cper_event_rec *rec) { diff --git a/drivers/firmware/efi/cper.c b/drivers/firmware/efi/cper.c index 35c37f667781..9b3884ff81e6 100644 --- a/drivers/firmware/efi/cper.c +++ b/drivers/firmware/efi/cper.c @@ -523,6 +523,17 @@ static void cper_print_tstamp(const char *pfx, } } +struct ignore_section { + guid_t guid; + const char *name; +}; + +static const struct ignore_section ignore_sections[] = { + { .guid = CPER_SEC_CXL_GEN_MEDIA_GUID, .name = "CXL General Media Event" }, + { .guid = CPER_SEC_CXL_DRAM_GUID, .name = "CXL DRAM Event" }, + { .guid = CPER_SEC_CXL_MEM_MODULE_GUID, .name = "CXL Memory Module Event" }, +}; + static void cper_estatus_print_section(const char *pfx, struct acpi_hest_generic_data *gdata, int sec_no) @@ -543,6 +554,14 @@ cper_estatus_print_section(const char *pfx, struct acpi_hest_generic_data *gdata printk("%s""fru_text: %.20s\n", pfx, gdata->fru_text); snprintf(newpfx, sizeof(newpfx), "%s ", pfx); + + for (int i = 0; i < ARRAY_SIZE(ignore_sections); i++) { + if (guid_equal(sec_type, &ignore_sections[i].guid)) { + printk("%ssection_type: %s\n", newpfx, ignore_sections[i].name); + return; + } + } + if (guid_equal(sec_type, &CPER_SEC_PROC_GENERIC)) { struct cper_sec_proc_generic *proc_err = acpi_hest_get_payload(gdata); diff --git a/include/linux/cper.h b/include/linux/cper.h index c1a7dc325121..265b0f8fc0b3 100644 --- a/include/linux/cper.h +++ b/include/linux/cper.h @@ -90,6 +90,29 @@ enum { GUID_INIT(0x667DD791, 0xC6B3, 0x4c27, 0x8A, 0x6B, 0x0F, 0x8E, \ 0x72, 0x2D, 0xEB, 0x41) +/* CXL Event record UUIDs are formatted as GUIDs and reported in section type */ +/* + * General Media Event Record + * CXL rev 3.0 Section 8.2.9.2.1.1; Table 8-43 + */ +#define CPER_SEC_CXL_GEN_MEDIA_GUID \ + GUID_INIT(0xfbcd0a77, 0xc260, 0x417f, \ + 0x85, 0xa9, 0x08, 0x8b, 0x16, 0x21, 0xeb, 0xa6) +/* + * DRAM Event Record + * CXL rev 3.0 section 8.2.9.2.1.2; Table 8-44 + */ +#define CPER_SEC_CXL_DRAM_GUID \ + GUID_INIT(0x601dcbb3, 0x9c06, 0x4eab, \ + 0xb8, 0xaf, 0x4e, 0x9b, 0xfb, 0x5c, 0x96, 0x24) +/* + * Memory Module Event Record + * CXL rev 3.0 section 8.2.9.2.1.3; Table 8-45 + */ +#define CPER_SEC_CXL_MEM_MODULE_GUID \ + GUID_INIT(0xfe927475, 0xdd59, 0x4339, \ + 0xa5, 0x86, 0x79, 0xba, 0xb1, 0x13, 0xb7, 0x74) + /* * Flags bits definitions for flags in struct cper_record_header * If set, the error has been recovered -- cgit v1.2.3 From dad6a09f3148257ac1773cd90934d721d68ab595 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 29 Jan 2024 15:56:36 -0800 Subject: hrtimer: Report offline hrtimer enqueue The hrtimers migration on CPU-down hotplug process has been moved earlier, before the CPU actually goes to die. This leaves a small window of opportunity to queue an hrtimer in a blind spot, leaving it ignored. For example a practical case has been reported with RCU waking up a SCHED_FIFO task right before the CPUHP_AP_IDLE_DEAD stage, queuing that way a sched/rt timer to the local offline CPU. Make sure such situations never go unnoticed and warn when that happens. Fixes: 5c0930ccaad5 ("hrtimers: Push pending hrtimers away from outgoing CPU earlier") Reported-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240129235646.3171983-4-boqun.feng@gmail.com --- include/linux/hrtimer.h | 4 +++- kernel/time/hrtimer.c | 3 +++ 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 87e3bedf8eb0..641c4567cfa7 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -157,6 +157,7 @@ enum hrtimer_base_type { * @max_hang_time: Maximum time spent in hrtimer_interrupt * @softirq_expiry_lock: Lock which is taken while softirq based hrtimer are * expired + * @online: CPU is online from an hrtimers point of view * @timer_waiters: A hrtimer_cancel() invocation waits for the timer * callback to finish. * @expires_next: absolute time of the next event, is required for remote @@ -179,7 +180,8 @@ struct hrtimer_cpu_base { unsigned int hres_active : 1, in_hrtirq : 1, hang_detected : 1, - softirq_activated : 1; + softirq_activated : 1, + online : 1; #ifdef CONFIG_HIGH_RES_TIMERS unsigned int nr_events; unsigned short nr_retries; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 760793998cdd..edb0f821dcea 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1085,6 +1085,7 @@ static int enqueue_hrtimer(struct hrtimer *timer, enum hrtimer_mode mode) { debug_activate(timer, mode); + WARN_ON_ONCE(!base->cpu_base->online); base->cpu_base->active_bases |= 1 << base->index; @@ -2183,6 +2184,7 @@ int hrtimers_prepare_cpu(unsigned int cpu) cpu_base->softirq_next_timer = NULL; cpu_base->expires_next = KTIME_MAX; cpu_base->softirq_expires_next = KTIME_MAX; + cpu_base->online = 1; hrtimer_cpu_base_init_expiry_lock(cpu_base); return 0; } @@ -2250,6 +2252,7 @@ int hrtimers_cpu_dying(unsigned int dying_cpu) smp_call_function_single(ncpu, retrigger_next_event, NULL, 0); raw_spin_unlock(&new_base->lock); + old_base->online = 0; raw_spin_unlock(&old_base->lock); return 0; -- cgit v1.2.3 From f814bdda774c183b0cc15ec8f3b6e7c6f4527ba5 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 23 Jan 2024 18:58:26 +0100 Subject: blk-wbt: Fix detection of dirty-throttled tasks The detection of dirty-throttled tasks in blk-wbt has been subtly broken since its beginning in 2016. Namely if we are doing cgroup writeback and the throttled task is not in the root cgroup, balance_dirty_pages() will set dirty_sleep for the non-root bdi_writeback structure. However blk-wbt checks dirty_sleep only in the root cgroup bdi_writeback structure. Thus detection of recently throttled tasks is not working in this case (we noticed this when we switched to cgroup v2 and suddently writeback was slow). Since blk-wbt has no easy way to get to proper bdi_writeback and furthermore its intention has always been to work on the whole device rather than on individual cgroups, just move the dirty_sleep timestamp from bdi_writeback to backing_dev_info. That fixes the checking for recently throttled task and saves memory for everybody as a bonus. CC: stable@vger.kernel.org Fixes: b57d74aff9ab ("writeback: track if we're sleeping on progress in balance_dirty_pages()") Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20240123175826.21452-1-jack@suse.cz [axboe: fixup indentation errors] Signed-off-by: Jens Axboe --- block/blk-wbt.c | 4 ++-- include/linux/backing-dev-defs.h | 7 +++++-- mm/backing-dev.c | 2 +- mm/page-writeback.c | 2 +- 4 files changed, 9 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 5ba3cd574eac..0c0e270a8265 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -163,9 +163,9 @@ static void wb_timestamp(struct rq_wb *rwb, unsigned long *var) */ static bool wb_recent_wait(struct rq_wb *rwb) { - struct bdi_writeback *wb = &rwb->rqos.disk->bdi->wb; + struct backing_dev_info *bdi = rwb->rqos.disk->bdi; - return time_before(jiffies, wb->dirty_sleep + HZ); + return time_before(jiffies, bdi->last_bdp_sleep + HZ); } static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb, diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index ae12696ec492..2ad261082bba 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -141,8 +141,6 @@ struct bdi_writeback { struct delayed_work dwork; /* work item used for writeback */ struct delayed_work bw_dwork; /* work item used for bandwidth estimate */ - unsigned long dirty_sleep; /* last wait */ - struct list_head bdi_node; /* anchored at bdi->wb_list */ #ifdef CONFIG_CGROUP_WRITEBACK @@ -179,6 +177,11 @@ struct backing_dev_info { * any dirty wbs, which is depended upon by bdi_has_dirty(). */ atomic_long_t tot_write_bandwidth; + /* + * Jiffies when last process was dirty throttled on this bdi. Used by + * blk-wbt. + */ + unsigned long last_bdp_sleep; struct bdi_writeback wb; /* the root writeback info for this bdi */ struct list_head wb_list; /* list of all wbs */ diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 1e3447bccdb1..e039d05304dd 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -436,7 +436,6 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi, INIT_LIST_HEAD(&wb->work_list); INIT_DELAYED_WORK(&wb->dwork, wb_workfn); INIT_DELAYED_WORK(&wb->bw_dwork, wb_update_bandwidth_workfn); - wb->dirty_sleep = jiffies; err = fprop_local_init_percpu(&wb->completions, gfp); if (err) @@ -921,6 +920,7 @@ int bdi_init(struct backing_dev_info *bdi) INIT_LIST_HEAD(&bdi->bdi_list); INIT_LIST_HEAD(&bdi->wb_list); init_waitqueue_head(&bdi->wb_waitq); + bdi->last_bdp_sleep = jiffies; return cgwb_bdi_init(bdi); } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index cd4e4ae77c40..cc37fa7f3364 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1921,7 +1921,7 @@ pause: break; } __set_current_state(TASK_KILLABLE); - wb->dirty_sleep = now; + bdi->last_bdp_sleep = jiffies; io_schedule_timeout(pause); current->dirty_paused_when = now + pause; -- cgit v1.2.3 From cd7d469c25704d414d71bf3644f163fb74e7996b Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Fri, 13 Oct 2023 13:55:44 +0800 Subject: libceph: fail sparse-read if the data length doesn't match Once this happens that means there have bugs. Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- include/linux/ceph/osd_client.h | 3 ++- net/ceph/osd_client.c | 18 +++++++++++++++--- 2 files changed, 17 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index fa018d5864e7..f66f6aac74f6 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -45,6 +45,7 @@ enum ceph_sparse_read_state { CEPH_SPARSE_READ_HDR = 0, CEPH_SPARSE_READ_EXTENTS, CEPH_SPARSE_READ_DATA_LEN, + CEPH_SPARSE_READ_DATA_PRE, CEPH_SPARSE_READ_DATA, }; @@ -64,7 +65,7 @@ struct ceph_sparse_read { u64 sr_req_len; /* orig request length */ u64 sr_pos; /* current pos in buffer */ int sr_index; /* current extent index */ - __le32 sr_datalen; /* length of actual data */ + u32 sr_datalen; /* length of actual data */ u32 sr_count; /* extent count in reply */ int sr_ext_len; /* length of extent array */ struct ceph_sparse_extent *sr_extent; /* extent array */ diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 625622016f57..2cea35e4ff8e 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -5857,8 +5857,8 @@ static int osd_sparse_read(struct ceph_connection *con, struct ceph_osd *o = con->private; struct ceph_sparse_read *sr = &o->o_sparse_read; u32 count = sr->sr_count; - u64 eoff, elen; - int ret; + u64 eoff, elen, len = 0; + int i, ret; switch (sr->sr_state) { case CEPH_SPARSE_READ_HDR: @@ -5903,8 +5903,20 @@ next_op: convert_extent_map(sr); ret = sizeof(sr->sr_datalen); *pbuf = (char *)&sr->sr_datalen; - sr->sr_state = CEPH_SPARSE_READ_DATA; + sr->sr_state = CEPH_SPARSE_READ_DATA_PRE; break; + case CEPH_SPARSE_READ_DATA_PRE: + /* Convert sr_datalen to host-endian */ + sr->sr_datalen = le32_to_cpu((__force __le32)sr->sr_datalen); + for (i = 0; i < count; i++) + len += sr->sr_extent[i].len; + if (sr->sr_datalen != len) { + pr_warn_ratelimited("data len %u != extent len %llu\n", + sr->sr_datalen, len); + return -EREMOTEIO; + } + sr->sr_state = CEPH_SPARSE_READ_DATA; + fallthrough; case CEPH_SPARSE_READ_DATA: if (sr->sr_index >= count) { sr->sr_state = CEPH_SPARSE_READ_HDR; -- cgit v1.2.3 From 8e46a2d068c92a905d01cbb018b00d66991585ab Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Thu, 14 Dec 2023 16:01:03 +0800 Subject: libceph: just wait for more data to be available on the socket A short read may occur while reading the message footer from the socket. Later, when the socket is ready for another read, the messenger invokes all read_partial_*() handlers, including read_partial_sparse_msg_data(). The expectation is that read_partial_sparse_msg_data() would bail, allowing the messenger to invoke read_partial() for the footer and pick up where it left off. However read_partial_sparse_msg_data() violates that and ends up calling into the state machine in the OSD client. The sparse-read state machine assumes that it's a new op and interprets some piece of the footer as the sparse-read header and returns bogus extents/data length, etc. To determine whether read_partial_sparse_msg_data() should bail, let's reuse cursor->total_resid. Because once it reaches to zero that means all the extents and data have been successfully received in last read, else it could break out when partially reading any of the extents and data. And then osd_sparse_read() could continue where it left off. [ idryomov: changelog ] Link: https://tracker.ceph.com/issues/63586 Fixes: d396f89db39a ("libceph: add sparse read support to msgr1") Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- include/linux/ceph/messenger.h | 2 +- net/ceph/messenger_v1.c | 25 +++++++++++++------------ net/ceph/messenger_v2.c | 4 ++-- net/ceph/osd_client.c | 9 +++------ 4 files changed, 19 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 2eaaabbe98cb..1717cc57cdac 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -283,7 +283,7 @@ struct ceph_msg { struct kref kref; bool more_to_follow; bool needs_out_seq; - bool sparse_read; + u64 sparse_read_total; int front_alloc_len; struct ceph_msgpool *pool; diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c index 4cb60bacf5f5..0cb61c76b9b8 100644 --- a/net/ceph/messenger_v1.c +++ b/net/ceph/messenger_v1.c @@ -160,8 +160,9 @@ static size_t sizeof_footer(struct ceph_connection *con) static void prepare_message_data(struct ceph_msg *msg, u32 data_len) { /* Initialize data cursor if it's not a sparse read */ - if (!msg->sparse_read) - ceph_msg_data_cursor_init(&msg->cursor, msg, data_len); + u64 len = msg->sparse_read_total ? : data_len; + + ceph_msg_data_cursor_init(&msg->cursor, msg, len); } /* @@ -1036,7 +1037,7 @@ static int read_partial_sparse_msg_data(struct ceph_connection *con) if (do_datacrc) crc = con->in_data_crc; - do { + while (cursor->total_resid) { if (con->v1.in_sr_kvec.iov_base) ret = read_partial_message_chunk(con, &con->v1.in_sr_kvec, @@ -1044,23 +1045,23 @@ static int read_partial_sparse_msg_data(struct ceph_connection *con) &crc); else if (cursor->sr_resid > 0) ret = read_partial_sparse_msg_extent(con, &crc); - - if (ret <= 0) { - if (do_datacrc) - con->in_data_crc = crc; - return ret; - } + if (ret <= 0) + break; memset(&con->v1.in_sr_kvec, 0, sizeof(con->v1.in_sr_kvec)); ret = con->ops->sparse_read(con, cursor, (char **)&con->v1.in_sr_kvec.iov_base); + if (ret <= 0) { + ret = ret ? ret : 1; /* must return > 0 to indicate success */ + break; + } con->v1.in_sr_len = ret; - } while (ret > 0); + } if (do_datacrc) con->in_data_crc = crc; - return ret < 0 ? ret : 1; /* must return > 0 to indicate success */ + return ret; } static int read_partial_msg_data(struct ceph_connection *con) @@ -1253,7 +1254,7 @@ static int read_partial_message(struct ceph_connection *con) if (!m->num_data_items) return -EIO; - if (m->sparse_read) + if (m->sparse_read_total) ret = read_partial_sparse_msg_data(con); else if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE)) ret = read_partial_msg_data_bounce(con); diff --git a/net/ceph/messenger_v2.c b/net/ceph/messenger_v2.c index f8ec60e1aba3..a0ca5414b333 100644 --- a/net/ceph/messenger_v2.c +++ b/net/ceph/messenger_v2.c @@ -1128,7 +1128,7 @@ static int decrypt_tail(struct ceph_connection *con) struct sg_table enc_sgt = {}; struct sg_table sgt = {}; struct page **pages = NULL; - bool sparse = con->in_msg->sparse_read; + bool sparse = !!con->in_msg->sparse_read_total; int dpos = 0; int tail_len; int ret; @@ -2060,7 +2060,7 @@ static int prepare_read_tail_plain(struct ceph_connection *con) } if (data_len(msg)) { - if (msg->sparse_read) + if (msg->sparse_read_total) con->v2.in_state = IN_S_PREPARE_SPARSE_DATA; else con->v2.in_state = IN_S_PREPARE_READ_DATA; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 2cea35e4ff8e..9d078b37fe0b 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -5510,7 +5510,7 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, } m = ceph_msg_get(req->r_reply); - m->sparse_read = (bool)srlen; + m->sparse_read_total = srlen; dout("get_reply tid %lld %p\n", tid, m); @@ -5777,11 +5777,8 @@ static int prep_next_sparse_read(struct ceph_connection *con, } if (o->o_sparse_op_idx < 0) { - u64 srlen = sparse_data_requested(req); - - dout("%s: [%d] starting new sparse read req. srlen=0x%llx\n", - __func__, o->o_osd, srlen); - ceph_msg_data_cursor_init(cursor, con->in_msg, srlen); + dout("%s: [%d] starting new sparse read req\n", + __func__, o->o_osd); } else { u64 end; -- cgit v1.2.3 From 4356e9f841f7fbb945521cef3577ba394c65f3fc Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 9 Feb 2024 12:39:31 -0800 Subject: work around gcc bugs with 'asm goto' with outputs We've had issues with gcc and 'asm goto' before, and we created a 'asm_volatile_goto()' macro for that in the past: see commits 3f0116c3238a ("compiler/gcc4: Add quirk for 'asm goto' miscompilation bug") and a9f180345f53 ("compiler/gcc4: Make quirk for asm_volatile_goto() unconditional"). Then, much later, we ended up removing the workaround in commit 43c249ea0b1e ("compiler-gcc.h: remove ancient workaround for gcc PR 58670") because we no longer supported building the kernel with the affected gcc versions, but we left the macro uses around. Now, Sean Christopherson reports a new version of a very similar problem, which is fixed by re-applying that ancient workaround. But the problem in question is limited to only the 'asm goto with outputs' cases, so instead of re-introducing the old workaround as-is, let's rename and limit the workaround to just that much less common case. It looks like there are at least two separate issues that all hit in this area: (a) some versions of gcc don't mark the asm goto as 'volatile' when it has outputs: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98619 https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110420 which is easy to work around by just adding the 'volatile' by hand. (b) Internal compiler errors: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110422 which are worked around by adding the extra empty 'asm' as a barrier, as in the original workaround. but the problem Sean sees may be a third thing since it involves bad code generation (not an ICE) even with the manually added 'volatile'. but the same old workaround works for this case, even if this feels a bit like voodoo programming and may only be hiding the issue. Reported-and-tested-by: Sean Christopherson Link: https://lore.kernel.org/all/20240208220604.140859-1-seanjc@google.com/ Cc: Nick Desaulniers Cc: Uros Bizjak Cc: Jakub Jelinek Cc: Andrew Pinski Signed-off-by: Linus Torvalds --- arch/arc/include/asm/jump_label.h | 4 ++-- arch/arm/include/asm/jump_label.h | 4 ++-- arch/arm64/include/asm/alternative-macros.h | 4 ++-- arch/arm64/include/asm/jump_label.h | 4 ++-- arch/csky/include/asm/jump_label.h | 4 ++-- arch/loongarch/include/asm/jump_label.h | 4 ++-- arch/mips/include/asm/jump_label.h | 4 ++-- arch/parisc/include/asm/jump_label.h | 4 ++-- arch/powerpc/include/asm/jump_label.h | 4 ++-- arch/powerpc/include/asm/uaccess.h | 12 ++++++------ arch/powerpc/kernel/irq_64.c | 2 +- arch/riscv/include/asm/arch_hweight.h | 4 ++-- arch/riscv/include/asm/bitops.h | 8 ++++---- arch/riscv/include/asm/checksum.h | 2 +- arch/riscv/include/asm/cpufeature.h | 4 ++-- arch/riscv/include/asm/jump_label.h | 4 ++-- arch/riscv/lib/csum.c | 10 +++++----- arch/s390/include/asm/jump_label.h | 4 ++-- arch/sparc/include/asm/jump_label.h | 4 ++-- arch/um/include/asm/cpufeature.h | 2 +- arch/x86/include/asm/cpufeature.h | 2 +- arch/x86/include/asm/jump_label.h | 6 +++--- arch/x86/include/asm/rmwcc.h | 2 +- arch/x86/include/asm/special_insns.h | 2 +- arch/x86/include/asm/uaccess.h | 10 +++++----- arch/x86/kvm/svm/svm_ops.h | 6 +++--- arch/x86/kvm/vmx/vmx.c | 4 ++-- arch/x86/kvm/vmx/vmx_ops.h | 6 +++--- arch/xtensa/include/asm/jump_label.h | 4 ++-- include/linux/compiler-gcc.h | 19 +++++++++++++++++++ include/linux/compiler_types.h | 4 ++-- net/netfilter/nft_set_pipapo_avx2.c | 2 +- samples/bpf/asm_goto_workaround.h | 8 ++++---- tools/arch/x86/include/asm/rmwcc.h | 2 +- tools/include/linux/compiler_types.h | 4 ++-- 35 files changed, 96 insertions(+), 77 deletions(-) (limited to 'include/linux') diff --git a/arch/arc/include/asm/jump_label.h b/arch/arc/include/asm/jump_label.h index 9d9618079739..a339223d9e05 100644 --- a/arch/arc/include/asm/jump_label.h +++ b/arch/arc/include/asm/jump_label.h @@ -31,7 +31,7 @@ static __always_inline bool arch_static_branch(struct static_key *key, bool branch) { - asm_volatile_goto(".balign "__stringify(JUMP_LABEL_NOP_SIZE)" \n" + asm goto(".balign "__stringify(JUMP_LABEL_NOP_SIZE)" \n" "1: \n" "nop \n" ".pushsection __jump_table, \"aw\" \n" @@ -47,7 +47,7 @@ l_yes: static __always_inline bool arch_static_branch_jump(struct static_key *key, bool branch) { - asm_volatile_goto(".balign "__stringify(JUMP_LABEL_NOP_SIZE)" \n" + asm goto(".balign "__stringify(JUMP_LABEL_NOP_SIZE)" \n" "1: \n" "b %l[l_yes] \n" ".pushsection __jump_table, \"aw\" \n" diff --git a/arch/arm/include/asm/jump_label.h b/arch/arm/include/asm/jump_label.h index e12d7d096fc0..e4eb54f6cd9f 100644 --- a/arch/arm/include/asm/jump_label.h +++ b/arch/arm/include/asm/jump_label.h @@ -11,7 +11,7 @@ static __always_inline bool arch_static_branch(struct static_key *key, bool branch) { - asm_volatile_goto("1:\n\t" + asm goto("1:\n\t" WASM(nop) "\n\t" ".pushsection __jump_table, \"aw\"\n\t" ".word 1b, %l[l_yes], %c0\n\t" @@ -25,7 +25,7 @@ l_yes: static __always_inline bool arch_static_branch_jump(struct static_key *key, bool branch) { - asm_volatile_goto("1:\n\t" + asm goto("1:\n\t" WASM(b) " %l[l_yes]\n\t" ".pushsection __jump_table, \"aw\"\n\t" ".word 1b, %l[l_yes], %c0\n\t" diff --git a/arch/arm64/include/asm/alternative-macros.h b/arch/arm64/include/asm/alternative-macros.h index 210bb43cff2c..d328f549b1a6 100644 --- a/arch/arm64/include/asm/alternative-macros.h +++ b/arch/arm64/include/asm/alternative-macros.h @@ -229,7 +229,7 @@ alternative_has_cap_likely(const unsigned long cpucap) if (!cpucap_is_possible(cpucap)) return false; - asm_volatile_goto( + asm goto( ALTERNATIVE_CB("b %l[l_no]", %[cpucap], alt_cb_patch_nops) : : [cpucap] "i" (cpucap) @@ -247,7 +247,7 @@ alternative_has_cap_unlikely(const unsigned long cpucap) if (!cpucap_is_possible(cpucap)) return false; - asm_volatile_goto( + asm goto( ALTERNATIVE("nop", "b %l[l_yes]", %[cpucap]) : : [cpucap] "i" (cpucap) diff --git a/arch/arm64/include/asm/jump_label.h b/arch/arm64/include/asm/jump_label.h index 48ddc0f45d22..6aafbb789991 100644 --- a/arch/arm64/include/asm/jump_label.h +++ b/arch/arm64/include/asm/jump_label.h @@ -18,7 +18,7 @@ static __always_inline bool arch_static_branch(struct static_key * const key, const bool branch) { - asm_volatile_goto( + asm goto( "1: nop \n\t" " .pushsection __jump_table, \"aw\" \n\t" " .align 3 \n\t" @@ -35,7 +35,7 @@ l_yes: static __always_inline bool arch_static_branch_jump(struct static_key * const key, const bool branch) { - asm_volatile_goto( + asm goto( "1: b %l[l_yes] \n\t" " .pushsection __jump_table, \"aw\" \n\t" " .align 3 \n\t" diff --git a/arch/csky/include/asm/jump_label.h b/arch/csky/include/asm/jump_label.h index 98a3f4b168bd..ef2e37a10a0f 100644 --- a/arch/csky/include/asm/jump_label.h +++ b/arch/csky/include/asm/jump_label.h @@ -12,7 +12,7 @@ static __always_inline bool arch_static_branch(struct static_key *key, bool branch) { - asm_volatile_goto( + asm goto( "1: nop32 \n" " .pushsection __jump_table, \"aw\" \n" " .align 2 \n" @@ -29,7 +29,7 @@ label: static __always_inline bool arch_static_branch_jump(struct static_key *key, bool branch) { - asm_volatile_goto( + asm goto( "1: bsr32 %l[label] \n" " .pushsection __jump_table, \"aw\" \n" " .align 2 \n" diff --git a/arch/loongarch/include/asm/jump_label.h b/arch/loongarch/include/asm/jump_label.h index 3cea299a5ef5..29acfe3de3fa 100644 --- a/arch/loongarch/include/asm/jump_label.h +++ b/arch/loongarch/include/asm/jump_label.h @@ -22,7 +22,7 @@ static __always_inline bool arch_static_branch(struct static_key * const key, const bool branch) { - asm_volatile_goto( + asm goto( "1: nop \n\t" JUMP_TABLE_ENTRY : : "i"(&((char *)key)[branch]) : : l_yes); @@ -35,7 +35,7 @@ l_yes: static __always_inline bool arch_static_branch_jump(struct static_key * const key, const bool branch) { - asm_volatile_goto( + asm goto( "1: b %l[l_yes] \n\t" JUMP_TABLE_ENTRY : : "i"(&((char *)key)[branch]) : : l_yes); diff --git a/arch/mips/include/asm/jump_label.h b/arch/mips/include/asm/jump_label.h index 081be98c71ef..ff5d388502d4 100644 --- a/arch/mips/include/asm/jump_label.h +++ b/arch/mips/include/asm/jump_label.h @@ -39,7 +39,7 @@ extern void jump_label_apply_nops(struct module *mod); static __always_inline bool arch_static_branch(struct static_key *key, bool branch) { - asm_volatile_goto("1:\t" B_INSN " 2f\n\t" + asm goto("1:\t" B_INSN " 2f\n\t" "2:\t.insn\n\t" ".pushsection __jump_table, \"aw\"\n\t" WORD_INSN " 1b, %l[l_yes], %0\n\t" @@ -53,7 +53,7 @@ l_yes: static __always_inline bool arch_static_branch_jump(struct static_key *key, bool branch) { - asm_volatile_goto("1:\t" J_INSN " %l[l_yes]\n\t" + asm goto("1:\t" J_INSN " %l[l_yes]\n\t" ".pushsection __jump_table, \"aw\"\n\t" WORD_INSN " 1b, %l[l_yes], %0\n\t" ".popsection\n\t" diff --git a/arch/parisc/include/asm/jump_label.h b/arch/parisc/include/asm/jump_label.h index 94428798b6aa..317ebc5edc9f 100644 --- a/arch/parisc/include/asm/jump_label.h +++ b/arch/parisc/include/asm/jump_label.h @@ -12,7 +12,7 @@ static __always_inline bool arch_static_branch(struct static_key *key, bool branch) { - asm_volatile_goto("1:\n\t" + asm goto("1:\n\t" "nop\n\t" ".pushsection __jump_table, \"aw\"\n\t" ".align %1\n\t" @@ -29,7 +29,7 @@ l_yes: static __always_inline bool arch_static_branch_jump(struct static_key *key, bool branch) { - asm_volatile_goto("1:\n\t" + asm goto("1:\n\t" "b,n %l[l_yes]\n\t" ".pushsection __jump_table, \"aw\"\n\t" ".align %1\n\t" diff --git a/arch/powerpc/include/asm/jump_label.h b/arch/powerpc/include/asm/jump_label.h index 93ce3ec25387..2f2a86ed2280 100644 --- a/arch/powerpc/include/asm/jump_label.h +++ b/arch/powerpc/include/asm/jump_label.h @@ -17,7 +17,7 @@ static __always_inline bool arch_static_branch(struct static_key *key, bool branch) { - asm_volatile_goto("1:\n\t" + asm goto("1:\n\t" "nop # arch_static_branch\n\t" ".pushsection __jump_table, \"aw\"\n\t" ".long 1b - ., %l[l_yes] - .\n\t" @@ -32,7 +32,7 @@ l_yes: static __always_inline bool arch_static_branch_jump(struct static_key *key, bool branch) { - asm_volatile_goto("1:\n\t" + asm goto("1:\n\t" "b %l[l_yes] # arch_static_branch_jump\n\t" ".pushsection __jump_table, \"aw\"\n\t" ".long 1b - ., %l[l_yes] - .\n\t" diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h index f1f9890f50d3..de10437fd206 100644 --- a/arch/powerpc/include/asm/uaccess.h +++ b/arch/powerpc/include/asm/uaccess.h @@ -74,7 +74,7 @@ __pu_failed: \ /* -mprefixed can generate offsets beyond range, fall back hack */ #ifdef CONFIG_PPC_KERNEL_PREFIXED #define __put_user_asm_goto(x, addr, label, op) \ - asm_volatile_goto( \ + asm goto( \ "1: " op " %0,0(%1) # put_user\n" \ EX_TABLE(1b, %l2) \ : \ @@ -83,7 +83,7 @@ __pu_failed: \ : label) #else #define __put_user_asm_goto(x, addr, label, op) \ - asm_volatile_goto( \ + asm goto( \ "1: " op "%U1%X1 %0,%1 # put_user\n" \ EX_TABLE(1b, %l2) \ : \ @@ -97,7 +97,7 @@ __pu_failed: \ __put_user_asm_goto(x, ptr, label, "std") #else /* __powerpc64__ */ #define __put_user_asm2_goto(x, addr, label) \ - asm_volatile_goto( \ + asm goto( \ "1: stw%X1 %0, %1\n" \ "2: stw%X1 %L0, %L1\n" \ EX_TABLE(1b, %l2) \ @@ -146,7 +146,7 @@ do { \ /* -mprefixed can generate offsets beyond range, fall back hack */ #ifdef CONFIG_PPC_KERNEL_PREFIXED #define __get_user_asm_goto(x, addr, label, op) \ - asm_volatile_goto( \ + asm_goto_output( \ "1: "op" %0,0(%1) # get_user\n" \ EX_TABLE(1b, %l2) \ : "=r" (x) \ @@ -155,7 +155,7 @@ do { \ : label) #else #define __get_user_asm_goto(x, addr, label, op) \ - asm_volatile_goto( \ + asm_goto_output( \ "1: "op"%U1%X1 %0, %1 # get_user\n" \ EX_TABLE(1b, %l2) \ : "=r" (x) \ @@ -169,7 +169,7 @@ do { \ __get_user_asm_goto(x, addr, label, "ld") #else /* __powerpc64__ */ #define __get_user_asm2_goto(x, addr, label) \ - asm_volatile_goto( \ + asm_goto_output( \ "1: lwz%X1 %0, %1\n" \ "2: lwz%X1 %L0, %L1\n" \ EX_TABLE(1b, %l2) \ diff --git a/arch/powerpc/kernel/irq_64.c b/arch/powerpc/kernel/irq_64.c index 938e66829eae..d5c48d1b0a31 100644 --- a/arch/powerpc/kernel/irq_64.c +++ b/arch/powerpc/kernel/irq_64.c @@ -230,7 +230,7 @@ again: * This allows interrupts to be unmasked without hard disabling, and * also without new hard interrupts coming in ahead of pending ones. */ - asm_volatile_goto( + asm goto( "1: \n" " lbz 9,%0(13) \n" " cmpwi 9,0 \n" diff --git a/arch/riscv/include/asm/arch_hweight.h b/arch/riscv/include/asm/arch_hweight.h index c20236a0725b..85b2c443823e 100644 --- a/arch/riscv/include/asm/arch_hweight.h +++ b/arch/riscv/include/asm/arch_hweight.h @@ -20,7 +20,7 @@ static __always_inline unsigned int __arch_hweight32(unsigned int w) { #ifdef CONFIG_RISCV_ISA_ZBB - asm_volatile_goto(ALTERNATIVE("j %l[legacy]", "nop", 0, + asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0, RISCV_ISA_EXT_ZBB, 1) : : : : legacy); @@ -51,7 +51,7 @@ static inline unsigned int __arch_hweight8(unsigned int w) static __always_inline unsigned long __arch_hweight64(__u64 w) { # ifdef CONFIG_RISCV_ISA_ZBB - asm_volatile_goto(ALTERNATIVE("j %l[legacy]", "nop", 0, + asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0, RISCV_ISA_EXT_ZBB, 1) : : : : legacy); diff --git a/arch/riscv/include/asm/bitops.h b/arch/riscv/include/asm/bitops.h index 9ffc35537024..329d8244a9b3 100644 --- a/arch/riscv/include/asm/bitops.h +++ b/arch/riscv/include/asm/bitops.h @@ -39,7 +39,7 @@ static __always_inline unsigned long variable__ffs(unsigned long word) { int num; - asm_volatile_goto(ALTERNATIVE("j %l[legacy]", "nop", 0, + asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0, RISCV_ISA_EXT_ZBB, 1) : : : : legacy); @@ -95,7 +95,7 @@ static __always_inline unsigned long variable__fls(unsigned long word) { int num; - asm_volatile_goto(ALTERNATIVE("j %l[legacy]", "nop", 0, + asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0, RISCV_ISA_EXT_ZBB, 1) : : : : legacy); @@ -154,7 +154,7 @@ static __always_inline int variable_ffs(int x) if (!x) return 0; - asm_volatile_goto(ALTERNATIVE("j %l[legacy]", "nop", 0, + asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0, RISCV_ISA_EXT_ZBB, 1) : : : : legacy); @@ -209,7 +209,7 @@ static __always_inline int variable_fls(unsigned int x) if (!x) return 0; - asm_volatile_goto(ALTERNATIVE("j %l[legacy]", "nop", 0, + asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0, RISCV_ISA_EXT_ZBB, 1) : : : : legacy); diff --git a/arch/riscv/include/asm/checksum.h b/arch/riscv/include/asm/checksum.h index a5b60b54b101..88e6f1499e88 100644 --- a/arch/riscv/include/asm/checksum.h +++ b/arch/riscv/include/asm/checksum.h @@ -53,7 +53,7 @@ static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl) IS_ENABLED(CONFIG_RISCV_ALTERNATIVE)) { unsigned long fold_temp; - asm_volatile_goto(ALTERNATIVE("j %l[no_zbb]", "nop", 0, + asm goto(ALTERNATIVE("j %l[no_zbb]", "nop", 0, RISCV_ISA_EXT_ZBB, 1) : : diff --git a/arch/riscv/include/asm/cpufeature.h b/arch/riscv/include/asm/cpufeature.h index 5a626ed2c47a..0bd11862b760 100644 --- a/arch/riscv/include/asm/cpufeature.h +++ b/arch/riscv/include/asm/cpufeature.h @@ -80,7 +80,7 @@ riscv_has_extension_likely(const unsigned long ext) "ext must be < RISCV_ISA_EXT_MAX"); if (IS_ENABLED(CONFIG_RISCV_ALTERNATIVE)) { - asm_volatile_goto( + asm goto( ALTERNATIVE("j %l[l_no]", "nop", 0, %[ext], 1) : : [ext] "i" (ext) @@ -103,7 +103,7 @@ riscv_has_extension_unlikely(const unsigned long ext) "ext must be < RISCV_ISA_EXT_MAX"); if (IS_ENABLED(CONFIG_RISCV_ALTERNATIVE)) { - asm_volatile_goto( + asm goto( ALTERNATIVE("nop", "j %l[l_yes]", 0, %[ext], 1) : : [ext] "i" (ext) diff --git a/arch/riscv/include/asm/jump_label.h b/arch/riscv/include/asm/jump_label.h index 14a5ea8d8ef0..4a35d787c019 100644 --- a/arch/riscv/include/asm/jump_label.h +++ b/arch/riscv/include/asm/jump_label.h @@ -17,7 +17,7 @@ static __always_inline bool arch_static_branch(struct static_key * const key, const bool branch) { - asm_volatile_goto( + asm goto( " .align 2 \n\t" " .option push \n\t" " .option norelax \n\t" @@ -39,7 +39,7 @@ label: static __always_inline bool arch_static_branch_jump(struct static_key * const key, const bool branch) { - asm_volatile_goto( + asm goto( " .align 2 \n\t" " .option push \n\t" " .option norelax \n\t" diff --git a/arch/riscv/lib/csum.c b/arch/riscv/lib/csum.c index af3df5274ccb..74af3ab520b6 100644 --- a/arch/riscv/lib/csum.c +++ b/arch/riscv/lib/csum.c @@ -53,7 +53,7 @@ __sum16 csum_ipv6_magic(const struct in6_addr *saddr, * support, so nop when Zbb is available and jump when Zbb is * not available. */ - asm_volatile_goto(ALTERNATIVE("j %l[no_zbb]", "nop", 0, + asm goto(ALTERNATIVE("j %l[no_zbb]", "nop", 0, RISCV_ISA_EXT_ZBB, 1) : : @@ -170,7 +170,7 @@ do_csum_with_alignment(const unsigned char *buff, int len) * support, so nop when Zbb is available and jump when Zbb is * not available. */ - asm_volatile_goto(ALTERNATIVE("j %l[no_zbb]", "nop", 0, + asm goto(ALTERNATIVE("j %l[no_zbb]", "nop", 0, RISCV_ISA_EXT_ZBB, 1) : : @@ -178,7 +178,7 @@ do_csum_with_alignment(const unsigned char *buff, int len) : no_zbb); #ifdef CONFIG_32BIT - asm_volatile_goto(".option push \n\ + asm_goto_output(".option push \n\ .option arch,+zbb \n\ rori %[fold_temp], %[csum], 16 \n\ andi %[offset], %[offset], 1 \n\ @@ -193,7 +193,7 @@ do_csum_with_alignment(const unsigned char *buff, int len) return (unsigned short)csum; #else /* !CONFIG_32BIT */ - asm_volatile_goto(".option push \n\ + asm_goto_output(".option push \n\ .option arch,+zbb \n\ rori %[fold_temp], %[csum], 32 \n\ add %[csum], %[fold_temp], %[csum] \n\ @@ -257,7 +257,7 @@ do_csum_no_alignment(const unsigned char *buff, int len) * support, so nop when Zbb is available and jump when Zbb is * not available. */ - asm_volatile_goto(ALTERNATIVE("j %l[no_zbb]", "nop", 0, + asm goto(ALTERNATIVE("j %l[no_zbb]", "nop", 0, RISCV_ISA_EXT_ZBB, 1) : : diff --git a/arch/s390/include/asm/jump_label.h b/arch/s390/include/asm/jump_label.h index 895f774bbcc5..bf78cf381dfc 100644 --- a/arch/s390/include/asm/jump_label.h +++ b/arch/s390/include/asm/jump_label.h @@ -25,7 +25,7 @@ */ static __always_inline bool arch_static_branch(struct static_key *key, bool branch) { - asm_volatile_goto("0: brcl 0,%l[label]\n" + asm goto("0: brcl 0,%l[label]\n" ".pushsection __jump_table,\"aw\"\n" ".balign 8\n" ".long 0b-.,%l[label]-.\n" @@ -39,7 +39,7 @@ label: static __always_inline bool arch_static_branch_jump(struct static_key *key, bool branch) { - asm_volatile_goto("0: brcl 15,%l[label]\n" + asm goto("0: brcl 15,%l[label]\n" ".pushsection __jump_table,\"aw\"\n" ".balign 8\n" ".long 0b-.,%l[label]-.\n" diff --git a/arch/sparc/include/asm/jump_label.h b/arch/sparc/include/asm/jump_label.h index 94eb529dcb77..2718cbea826a 100644 --- a/arch/sparc/include/asm/jump_label.h +++ b/arch/sparc/include/asm/jump_label.h @@ -10,7 +10,7 @@ static __always_inline bool arch_static_branch(struct static_key *key, bool branch) { - asm_volatile_goto("1:\n\t" + asm goto("1:\n\t" "nop\n\t" "nop\n\t" ".pushsection __jump_table, \"aw\"\n\t" @@ -26,7 +26,7 @@ l_yes: static __always_inline bool arch_static_branch_jump(struct static_key *key, bool branch) { - asm_volatile_goto("1:\n\t" + asm goto("1:\n\t" "b %l[l_yes]\n\t" "nop\n\t" ".pushsection __jump_table, \"aw\"\n\t" diff --git a/arch/um/include/asm/cpufeature.h b/arch/um/include/asm/cpufeature.h index 4b6d1b526bc1..66fe06db872f 100644 --- a/arch/um/include/asm/cpufeature.h +++ b/arch/um/include/asm/cpufeature.h @@ -75,7 +75,7 @@ extern void setup_clear_cpu_cap(unsigned int bit); */ static __always_inline bool _static_cpu_has(u16 bit) { - asm_volatile_goto("1: jmp 6f\n" + asm goto("1: jmp 6f\n" "2:\n" ".skip -(((5f-4f) - (2b-1b)) > 0) * " "((5f-4f) - (2b-1b)),0x90\n" diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index a26bebbdff87..a1273698fc43 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -168,7 +168,7 @@ extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit); */ static __always_inline bool _static_cpu_has(u16 bit) { - asm_volatile_goto( + asm goto( ALTERNATIVE_TERNARY("jmp 6f", %P[feature], "", "jmp %l[t_no]") ".pushsection .altinstr_aux,\"ax\"\n" "6:\n" diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h index 071572e23d3a..cbbef32517f0 100644 --- a/arch/x86/include/asm/jump_label.h +++ b/arch/x86/include/asm/jump_label.h @@ -24,7 +24,7 @@ static __always_inline bool arch_static_branch(struct static_key *key, bool branch) { - asm_volatile_goto("1:" + asm goto("1:" "jmp %l[l_yes] # objtool NOPs this \n\t" JUMP_TABLE_ENTRY : : "i" (key), "i" (2 | branch) : : l_yes); @@ -38,7 +38,7 @@ l_yes: static __always_inline bool arch_static_branch(struct static_key * const key, const bool branch) { - asm_volatile_goto("1:" + asm goto("1:" ".byte " __stringify(BYTES_NOP5) "\n\t" JUMP_TABLE_ENTRY : : "i" (key), "i" (branch) : : l_yes); @@ -52,7 +52,7 @@ l_yes: static __always_inline bool arch_static_branch_jump(struct static_key * const key, const bool branch) { - asm_volatile_goto("1:" + asm goto("1:" "jmp %l[l_yes]\n\t" JUMP_TABLE_ENTRY : : "i" (key), "i" (branch) : : l_yes); diff --git a/arch/x86/include/asm/rmwcc.h b/arch/x86/include/asm/rmwcc.h index 4b081e0d3306..363266cbcada 100644 --- a/arch/x86/include/asm/rmwcc.h +++ b/arch/x86/include/asm/rmwcc.h @@ -13,7 +13,7 @@ #define __GEN_RMWcc(fullop, _var, cc, clobbers, ...) \ ({ \ bool c = false; \ - asm_volatile_goto (fullop "; j" #cc " %l[cc_label]" \ + asm goto (fullop "; j" #cc " %l[cc_label]" \ : : [var] "m" (_var), ## __VA_ARGS__ \ : clobbers : cc_label); \ if (0) { \ diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index d6cd9344f6c7..48f8dd47cf68 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -205,7 +205,7 @@ static inline void clwb(volatile void *__p) #ifdef CONFIG_X86_USER_SHADOW_STACK static inline int write_user_shstk_64(u64 __user *addr, u64 val) { - asm_volatile_goto("1: wrussq %[val], (%[addr])\n" + asm goto("1: wrussq %[val], (%[addr])\n" _ASM_EXTABLE(1b, %l[fail]) :: [addr] "r" (addr), [val] "r" (val) :: fail); diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 5c367c1290c3..237dc8cdd12b 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -133,7 +133,7 @@ extern int __get_user_bad(void); #ifdef CONFIG_X86_32 #define __put_user_goto_u64(x, addr, label) \ - asm_volatile_goto("\n" \ + asm goto("\n" \ "1: movl %%eax,0(%1)\n" \ "2: movl %%edx,4(%1)\n" \ _ASM_EXTABLE_UA(1b, %l2) \ @@ -295,7 +295,7 @@ do { \ } while (0) #define __get_user_asm(x, addr, itype, ltype, label) \ - asm_volatile_goto("\n" \ + asm_goto_output("\n" \ "1: mov"itype" %[umem],%[output]\n" \ _ASM_EXTABLE_UA(1b, %l2) \ : [output] ltype(x) \ @@ -375,7 +375,7 @@ do { \ __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold); \ __typeof__(*(_ptr)) __old = *_old; \ __typeof__(*(_ptr)) __new = (_new); \ - asm_volatile_goto("\n" \ + asm_goto_output("\n" \ "1: " LOCK_PREFIX "cmpxchg"itype" %[new], %[ptr]\n"\ _ASM_EXTABLE_UA(1b, %l[label]) \ : CC_OUT(z) (success), \ @@ -394,7 +394,7 @@ do { \ __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold); \ __typeof__(*(_ptr)) __old = *_old; \ __typeof__(*(_ptr)) __new = (_new); \ - asm_volatile_goto("\n" \ + asm_goto_output("\n" \ "1: " LOCK_PREFIX "cmpxchg8b %[ptr]\n" \ _ASM_EXTABLE_UA(1b, %l[label]) \ : CC_OUT(z) (success), \ @@ -477,7 +477,7 @@ struct __large_struct { unsigned long buf[100]; }; * aliasing issues. */ #define __put_user_goto(x, addr, itype, ltype, label) \ - asm_volatile_goto("\n" \ + asm goto("\n" \ "1: mov"itype" %0,%1\n" \ _ASM_EXTABLE_UA(1b, %l2) \ : : ltype(x), "m" (__m(addr)) \ diff --git a/arch/x86/kvm/svm/svm_ops.h b/arch/x86/kvm/svm/svm_ops.h index 36c8af87a707..4e725854c63a 100644 --- a/arch/x86/kvm/svm/svm_ops.h +++ b/arch/x86/kvm/svm/svm_ops.h @@ -8,7 +8,7 @@ #define svm_asm(insn, clobber...) \ do { \ - asm_volatile_goto("1: " __stringify(insn) "\n\t" \ + asm goto("1: " __stringify(insn) "\n\t" \ _ASM_EXTABLE(1b, %l[fault]) \ ::: clobber : fault); \ return; \ @@ -18,7 +18,7 @@ fault: \ #define svm_asm1(insn, op1, clobber...) \ do { \ - asm_volatile_goto("1: " __stringify(insn) " %0\n\t" \ + asm goto("1: " __stringify(insn) " %0\n\t" \ _ASM_EXTABLE(1b, %l[fault]) \ :: op1 : clobber : fault); \ return; \ @@ -28,7 +28,7 @@ fault: \ #define svm_asm2(insn, op1, op2, clobber...) \ do { \ - asm_volatile_goto("1: " __stringify(insn) " %1, %0\n\t" \ + asm goto("1: " __stringify(insn) " %1, %0\n\t" \ _ASM_EXTABLE(1b, %l[fault]) \ :: op1, op2 : clobber : fault); \ return; \ diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index e262bc2ba4e5..1111d9d08903 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -738,7 +738,7 @@ static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx, */ static int kvm_cpu_vmxoff(void) { - asm_volatile_goto("1: vmxoff\n\t" + asm goto("1: vmxoff\n\t" _ASM_EXTABLE(1b, %l[fault]) ::: "cc", "memory" : fault); @@ -2784,7 +2784,7 @@ static int kvm_cpu_vmxon(u64 vmxon_pointer) cr4_set_bits(X86_CR4_VMXE); - asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t" + asm goto("1: vmxon %[vmxon_pointer]\n\t" _ASM_EXTABLE(1b, %l[fault]) : : [vmxon_pointer] "m"(vmxon_pointer) : : fault); diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h index f41ce3c24123..8060e5fc6dbd 100644 --- a/arch/x86/kvm/vmx/vmx_ops.h +++ b/arch/x86/kvm/vmx/vmx_ops.h @@ -94,7 +94,7 @@ static __always_inline unsigned long __vmcs_readl(unsigned long field) #ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT - asm_volatile_goto("1: vmread %[field], %[output]\n\t" + asm_goto_output("1: vmread %[field], %[output]\n\t" "jna %l[do_fail]\n\t" _ASM_EXTABLE(1b, %l[do_exception]) @@ -188,7 +188,7 @@ static __always_inline unsigned long vmcs_readl(unsigned long field) #define vmx_asm1(insn, op1, error_args...) \ do { \ - asm_volatile_goto("1: " __stringify(insn) " %0\n\t" \ + asm goto("1: " __stringify(insn) " %0\n\t" \ ".byte 0x2e\n\t" /* branch not taken hint */ \ "jna %l[error]\n\t" \ _ASM_EXTABLE(1b, %l[fault]) \ @@ -205,7 +205,7 @@ fault: \ #define vmx_asm2(insn, op1, op2, error_args...) \ do { \ - asm_volatile_goto("1: " __stringify(insn) " %1, %0\n\t" \ + asm goto("1: " __stringify(insn) " %1, %0\n\t" \ ".byte 0x2e\n\t" /* branch not taken hint */ \ "jna %l[error]\n\t" \ _ASM_EXTABLE(1b, %l[fault]) \ diff --git a/arch/xtensa/include/asm/jump_label.h b/arch/xtensa/include/asm/jump_label.h index c812bf85021c..46c8596259d2 100644 --- a/arch/xtensa/include/asm/jump_label.h +++ b/arch/xtensa/include/asm/jump_label.h @@ -13,7 +13,7 @@ static __always_inline bool arch_static_branch(struct static_key *key, bool branch) { - asm_volatile_goto("1:\n\t" + asm goto("1:\n\t" "_nop\n\t" ".pushsection __jump_table, \"aw\"\n\t" ".word 1b, %l[l_yes], %c0\n\t" @@ -38,7 +38,7 @@ static __always_inline bool arch_static_branch_jump(struct static_key *key, * make it reachable and wrap both into a no-transform block * to avoid any assembler interference with this. */ - asm_volatile_goto("1:\n\t" + asm goto("1:\n\t" ".begin no-transform\n\t" "_j %l[l_yes]\n\t" "2:\n\t" diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index aebb65bf95a7..c1a963be7d28 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -64,6 +64,25 @@ __builtin_unreachable(); \ } while (0) +/* + * GCC 'asm goto' with outputs miscompiles certain code sequences: + * + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110420 + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110422 + * + * Work it around via the same compiler barrier quirk that we used + * to use for the old 'asm goto' workaround. + * + * Also, always mark such 'asm goto' statements as volatile: all + * asm goto statements are supposed to be volatile as per the + * documentation, but some versions of gcc didn't actually do + * that for asms with outputs: + * + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98619 + */ +#define asm_goto_output(x...) \ + do { asm volatile goto(x); asm (""); } while (0) + #if defined(CONFIG_ARCH_USE_BUILTIN_BSWAP) #define __HAVE_BUILTIN_BSWAP32__ #define __HAVE_BUILTIN_BSWAP64__ diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 6f1ca49306d2..663d8791c871 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -362,8 +362,8 @@ struct ftrace_likely_data { #define __member_size(p) __builtin_object_size(p, 1) #endif -#ifndef asm_volatile_goto -#define asm_volatile_goto(x...) asm goto(x) +#ifndef asm_goto_output +#define asm_goto_output(x...) asm goto(x) #endif #ifdef CONFIG_CC_HAS_ASM_INLINE diff --git a/net/netfilter/nft_set_pipapo_avx2.c b/net/netfilter/nft_set_pipapo_avx2.c index 90e275bb3e5d..a3a8ddca9918 100644 --- a/net/netfilter/nft_set_pipapo_avx2.c +++ b/net/netfilter/nft_set_pipapo_avx2.c @@ -57,7 +57,7 @@ /* Jump to label if @reg is zero */ #define NFT_PIPAPO_AVX2_NOMATCH_GOTO(reg, label) \ - asm_volatile_goto("vptest %%ymm" #reg ", %%ymm" #reg ";" \ + asm goto("vptest %%ymm" #reg ", %%ymm" #reg ";" \ "je %l[" #label "]" : : : : label) /* Store 256 bits from YMM register into memory. Contrary to bucket load diff --git a/samples/bpf/asm_goto_workaround.h b/samples/bpf/asm_goto_workaround.h index 7048bb3594d6..634e81d83efd 100644 --- a/samples/bpf/asm_goto_workaround.h +++ b/samples/bpf/asm_goto_workaround.h @@ -4,14 +4,14 @@ #define __ASM_GOTO_WORKAROUND_H /* - * This will bring in asm_volatile_goto and asm_inline macro definitions + * This will bring in asm_goto_output and asm_inline macro definitions * if enabled by compiler and config options. */ #include -#ifdef asm_volatile_goto -#undef asm_volatile_goto -#define asm_volatile_goto(x...) asm volatile("invalid use of asm_volatile_goto") +#ifdef asm_goto_output +#undef asm_goto_output +#define asm_goto_output(x...) asm volatile("invalid use of asm_goto_output") #endif /* diff --git a/tools/arch/x86/include/asm/rmwcc.h b/tools/arch/x86/include/asm/rmwcc.h index 11ff975242ca..e2ff22b379a4 100644 --- a/tools/arch/x86/include/asm/rmwcc.h +++ b/tools/arch/x86/include/asm/rmwcc.h @@ -4,7 +4,7 @@ #define __GEN_RMWcc(fullop, var, cc, ...) \ do { \ - asm_volatile_goto (fullop "; j" cc " %l[cc_label]" \ + asm goto (fullop "; j" cc " %l[cc_label]" \ : : "m" (var), ## __VA_ARGS__ \ : "memory" : cc_label); \ return 0; \ diff --git a/tools/include/linux/compiler_types.h b/tools/include/linux/compiler_types.h index 1bdd834bdd57..d09f9dc172a4 100644 --- a/tools/include/linux/compiler_types.h +++ b/tools/include/linux/compiler_types.h @@ -36,8 +36,8 @@ #include #endif -#ifndef asm_volatile_goto -#define asm_volatile_goto(x...) asm goto(x) +#ifndef asm_goto_output +#define asm_goto_output(x...) asm goto(x) #endif #endif /* __LINUX_COMPILER_TYPES_H */ -- cgit v1.2.3