From 62b1fa69f31969e11d03529d3a80599ddda2d043 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Fri, 15 Nov 2024 22:52:40 +1300 Subject: KVM: Export hardware virtualization enabling/disabling functions To support TDX, KVM will need to enable TDX during KVM module loading time. Enabling TDX requires enabling hardware virtualization first so that all online CPUs (and the new CPU going online) are in post-VMXON state. KVM by default enables hardware virtualization but that is done in kvm_init(), which must be the last step after all initialization is done thus is too late for enabling TDX. Export functions to enable/disable hardware virtualization so that TDX code can use them to handle hardware virtualization enabling before kvm_init(). Signed-off-by: Kai Huang Message-ID: Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index f34f4cfaa513..1e75fa114f34 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2571,4 +2571,12 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, struct kvm_pre_fault_memory *range); #endif +#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING +int kvm_enable_virtualization(void); +void kvm_disable_virtualization(void); +#else +static inline int kvm_enable_virtualization(void) { return 0; } +static inline void kvm_disable_virtualization(void) { } +#endif + #endif -- cgit v1.2.3 From fcdbdf63431c9faf639bffc957ea2ce9b545432e Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Fri, 15 Nov 2024 22:52:41 +1300 Subject: KVM: VMX: Initialize TDX during KVM module load Before KVM can use TDX to create and run TDX guests, TDX needs to be initialized from two perspectives: 1) TDX module must be initialized properly to a working state; 2) A per-cpu TDX initialization, a.k.a the TDH.SYS.LP.INIT SEAMCALL must be done on any logical cpu before it can run any other TDX SEAMCALLs. The TDX host core-kernel provides two functions to do the above two respectively: tdx_enable() and tdx_cpu_enable(). There are two options in terms of when to initialize TDX: initialize TDX at KVM module loading time, or when creating the first TDX guest. Choose to initialize TDX during KVM module loading time: Initializing TDX module is both memory and CPU time consuming: 1) the kernel needs to allocate a non-trivial size(~1/256) of system memory as metadata used by TDX module to track each TDX-usable memory page's status; 2) the TDX module needs to initialize this metadata, one entry for each TDX-usable memory page. Also, the kernel uses alloc_contig_pages() to allocate those metadata chunks, because they are large and need to be physically contiguous. alloc_contig_pages() can fail. If initializing TDX when creating the first TDX guest, then there's chance that KVM won't be able to run any TDX guests albeit KVM _declares_ to be able to support TDX. This isn't good for the user. On the other hand, initializing TDX at KVM module loading time can make sure KVM is providing a consistent view of whether KVM can support TDX to the user. Always only try to initialize TDX after VMX has been initialized. TDX is based on VMX, and if VMX fails to initialize then TDX is likely to be broken anyway. Also, in practice, supporting TDX will require part of VMX and common x86 infrastructure in working order, so TDX cannot be enabled alone w/o VMX support. 
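For orientation, the resulting module-load ordering can be sketched roughly as below. This is a simplified view of the vt_init()/vt_exit() changes in the diff that follows, with the vCPU size/align computation and other unrelated setup elided; it is illustrative, not the literal patch:

/* Simplified sketch of the load-time ordering (illustrative, not the exact diff). */
static int __init vt_init(void)
{
	int r;

	r = vmx_init();		/* VMX first: SEAMCALLs need CPUs in post-VMXON state */
	if (r)
		return r;

	r = tdx_bringup();	/* may leave TDX disabled yet still return 0 */
	if (r)
		goto err_tdx_bringup;

	/* Common KVM init must come last; after this /dev/kvm is exposed. */
	r = kvm_init(sizeof(struct vcpu_vmx), __alignof__(struct vcpu_vmx),
		     THIS_MODULE);
	if (r)
		goto err_kvm_init;

	return 0;

err_kvm_init:
	tdx_cleanup();
err_tdx_bringup:
	vmx_exit();
	return r;
}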
There are two cases that can result in failure to initialize TDX: 1) TDX cannot be supported (e.g., because of TDX is not supported or enabled by hardware, or module is not loaded, or missing some dependency in KVM's configuration); 2) Any unexpected error during TDX bring-up. For the first case only mark TDX is disabled but still allow KVM module to be loaded. For the second case just fail to load the KVM module so that the user can be aware. Because TDX costs additional memory, don't enable TDX by default. Add a new module parameter 'enable_tdx' to allow the user to opt-in. Note, the name tdx_init() has already been taken by the early boot code. Use tdx_bringup() for initializing TDX (and tdx_cleanup() since KVM doesn't actually teardown TDX). They don't match vt_init()/vt_exit(), vmx_init()/vmx_exit() etc but it's not end of the world. Also, once initialized, the TDX module cannot be disabled and enabled again w/o the TDX module runtime update, which isn't supported by the kernel. After TDX is enabled, nothing needs to be done when KVM disables hardware virtualization, e.g., when offlining CPU, or during suspend/resume. TDX host core-kernel code internally tracks TDX status and can handle "multiple enabling" scenario. Similar to KVM_AMD_SEV, add a new KVM_INTEL_TDX Kconfig to guide KVM TDX code. Make it depend on INTEL_TDX_HOST but not replace INTEL_TDX_HOST because in the longer term there's a use case that requires making SEAMCALLs w/o KVM as mentioned by Dan [1]. Link: https://lore.kernel.org/6723fc2070a96_60c3294dc@dwillia2-mobl3.amr.corp.intel.com.notmuch/ [1] Signed-off-by: Kai Huang Message-ID: <162f9dee05c729203b9ad6688db1ca2960b4b502.1731664295.git.kai.huang@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/tdx.h | 3 + arch/x86/include/asm/tdx_global_metadata.h | 44 ++++++++ arch/x86/kvm/Kconfig | 10 ++ arch/x86/kvm/Makefile | 1 + arch/x86/kvm/vmx/main.c | 9 ++ arch/x86/kvm/vmx/tdx.c | 162 ++++++++++++++++++++++++++++ arch/x86/kvm/vmx/tdx.h | 13 +++ arch/x86/virt/vmx/tdx/tdx.c | 14 +++ arch/x86/virt/vmx/tdx/tdx.h | 1 - arch/x86/virt/vmx/tdx/tdx_global_metadata.h | 44 -------- include/linux/kvm_host.h | 1 + virt/kvm/kvm_main.c | 3 +- 12 files changed, 259 insertions(+), 46 deletions(-) create mode 100644 arch/x86/include/asm/tdx_global_metadata.h create mode 100644 arch/x86/kvm/vmx/tdx.c create mode 100644 arch/x86/kvm/vmx/tdx.h delete mode 100644 arch/x86/virt/vmx/tdx/tdx_global_metadata.h (limited to 'include/linux') diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index 75a91869453d..1a8c687603aa 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -34,6 +34,7 @@ #ifndef __ASSEMBLY__ #include +#include #include /* @@ -121,6 +122,7 @@ static inline u64 sc_retry(sc_func_t func, u64 fn, int tdx_cpu_enable(void); int tdx_enable(void); const char *tdx_dump_mce_info(struct mce *m); +const struct tdx_sys_info *tdx_get_sysinfo(void); int tdx_guest_keyid_alloc(void); void tdx_guest_keyid_free(unsigned int keyid); @@ -179,6 +181,7 @@ static inline void tdx_init(void) { } static inline int tdx_cpu_enable(void) { return -ENODEV; } static inline int tdx_enable(void) { return -ENODEV; } static inline const char *tdx_dump_mce_info(struct mce *m) { return NULL; } +static inline const struct tdx_sys_info *tdx_get_sysinfo(void) { return NULL; } #endif /* CONFIG_INTEL_TDX_HOST */ #endif /* !__ASSEMBLY__ */ diff --git a/arch/x86/include/asm/tdx_global_metadata.h b/arch/x86/include/asm/tdx_global_metadata.h new file mode 100644 index 
000000000000..060a2ad744bf --- /dev/null +++ b/arch/x86/include/asm/tdx_global_metadata.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Automatically generated TDX global metadata structures. */ +#ifndef _X86_VIRT_TDX_AUTO_GENERATED_TDX_GLOBAL_METADATA_H +#define _X86_VIRT_TDX_AUTO_GENERATED_TDX_GLOBAL_METADATA_H + +#include + +struct tdx_sys_info_features { + u64 tdx_features0; +}; + +struct tdx_sys_info_tdmr { + u16 max_tdmrs; + u16 max_reserved_per_tdmr; + u16 pamt_4k_entry_size; + u16 pamt_2m_entry_size; + u16 pamt_1g_entry_size; +}; + +struct tdx_sys_info_td_ctrl { + u16 tdr_base_size; + u16 tdcs_base_size; + u16 tdvps_base_size; +}; + +struct tdx_sys_info_td_conf { + u64 attributes_fixed0; + u64 attributes_fixed1; + u64 xfam_fixed0; + u64 xfam_fixed1; + u16 num_cpuid_config; + u16 max_vcpus_per_td; + u64 cpuid_config_leaves[128]; + u64 cpuid_config_values[128][2]; +}; + +struct tdx_sys_info { + struct tdx_sys_info_features features; + struct tdx_sys_info_tdmr tdmr; + struct tdx_sys_info_td_ctrl td_ctrl; + struct tdx_sys_info_td_conf td_conf; +}; + +#endif diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index ea2c4f21c1ca..fe8cbee6f614 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -128,6 +128,16 @@ config X86_SGX_KVM If unsure, say N. +config KVM_INTEL_TDX + bool "Intel Trust Domain Extensions (TDX) support" + default y + depends on INTEL_TDX_HOST + help + Provides support for launching Intel Trust Domain Extensions (TDX) + confidential VMs on Intel processors. + + If unsure, say N. + config KVM_AMD tristate "KVM for AMD processors support" depends on KVM && (CPU_SUP_AMD || CPU_SUP_HYGON) diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index f9dddb8cb466..a5d362c7b504 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -20,6 +20,7 @@ kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \ kvm-intel-$(CONFIG_X86_SGX_KVM) += vmx/sgx.o kvm-intel-$(CONFIG_KVM_HYPERV) += vmx/hyperv.o vmx/hyperv_evmcs.o +kvm-intel-$(CONFIG_KVM_INTEL_TDX) += vmx/tdx.o kvm-amd-y += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c index 54cf95cb8d42..97c453187cc1 100644 --- a/arch/x86/kvm/vmx/main.c +++ b/arch/x86/kvm/vmx/main.c @@ -6,6 +6,7 @@ #include "nested.h" #include "pmu.h" #include "posted_intr.h" +#include "tdx.h" #define VMX_REQUIRED_APICV_INHIBITS \ (BIT(APICV_INHIBIT_REASON_DISABLED) | \ @@ -172,6 +173,7 @@ struct kvm_x86_init_ops vt_init_ops __initdata = { static void __exit vt_exit(void) { kvm_exit(); + tdx_cleanup(); vmx_exit(); } module_exit(vt_exit); @@ -184,6 +186,11 @@ static int __init vt_init(void) if (r) return r; + /* tdx_init() has been taken */ + r = tdx_bringup(); + if (r) + goto err_tdx_bringup; + /* * Common KVM initialization _must_ come last, after this, /dev/kvm is * exposed to userspace! 
@@ -196,6 +203,8 @@ static int __init vt_init(void) return 0; err_kvm_init: + tdx_cleanup(); +err_tdx_bringup: vmx_exit(); return r; } diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c new file mode 100644 index 000000000000..3c089ed3b843 --- /dev/null +++ b/arch/x86/kvm/vmx/tdx.c @@ -0,0 +1,162 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include "capabilities.h" +#include "tdx.h" + +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +static bool enable_tdx __ro_after_init; +module_param_named(tdx, enable_tdx, bool, 0444); + +static enum cpuhp_state tdx_cpuhp_state; + +static int tdx_online_cpu(unsigned int cpu) +{ + unsigned long flags; + int r; + + /* Sanity check CPU is already in post-VMXON */ + WARN_ON_ONCE(!(cr4_read_shadow() & X86_CR4_VMXE)); + + local_irq_save(flags); + r = tdx_cpu_enable(); + local_irq_restore(flags); + + return r; +} + +static void __do_tdx_cleanup(void) +{ + /* + * Once TDX module is initialized, it cannot be disabled and + * re-initialized again w/o runtime update (which isn't + * supported by kernel). Only need to remove the cpuhp here. + * The TDX host core code tracks TDX status and can handle + * 'multiple enabling' scenario. + */ + WARN_ON_ONCE(!tdx_cpuhp_state); + cpuhp_remove_state_nocalls_cpuslocked(tdx_cpuhp_state); + tdx_cpuhp_state = 0; +} + +static void __tdx_cleanup(void) +{ + cpus_read_lock(); + __do_tdx_cleanup(); + cpus_read_unlock(); +} + +static int __init __do_tdx_bringup(void) +{ + int r; + + /* + * TDX-specific cpuhp callback to call tdx_cpu_enable() on all + * online CPUs before calling tdx_enable(), and on any new + * going-online CPU to make sure it is ready for TDX guest. + */ + r = cpuhp_setup_state_cpuslocked(CPUHP_AP_ONLINE_DYN, + "kvm/cpu/tdx:online", + tdx_online_cpu, NULL); + if (r < 0) + return r; + + tdx_cpuhp_state = r; + + r = tdx_enable(); + if (r) + __do_tdx_cleanup(); + + return r; +} + +static int __init __tdx_bringup(void) +{ + int r; + + /* + * Enabling TDX requires enabling hardware virtualization first, + * as making SEAMCALLs requires CPU being in post-VMXON state. + */ + r = kvm_enable_virtualization(); + if (r) + return r; + + cpus_read_lock(); + r = __do_tdx_bringup(); + cpus_read_unlock(); + + if (r) + goto tdx_bringup_err; + + /* + * Leave hardware virtualization enabled after TDX is enabled + * successfully. TDX CPU hotplug depends on this. + */ + return 0; +tdx_bringup_err: + kvm_disable_virtualization(); + return r; +} + +void tdx_cleanup(void) +{ + if (enable_tdx) { + __tdx_cleanup(); + kvm_disable_virtualization(); + } +} + +int __init tdx_bringup(void) +{ + int r; + + if (!enable_tdx) + return 0; + + if (!cpu_feature_enabled(X86_FEATURE_TDX_HOST_PLATFORM)) { + pr_err("tdx: no TDX private KeyIDs available\n"); + goto success_disable_tdx; + } + + if (!enable_virt_at_load) { + pr_err("tdx: tdx requires kvm.enable_virt_at_load=1\n"); + goto success_disable_tdx; + } + + /* + * Ideally KVM should probe whether TDX module has been loaded + * first and then try to bring it up. But TDX needs to use SEAMCALL + * to probe whether the module is loaded (there is no CPUID or MSR + * for that), and making SEAMCALL requires enabling virtualization + * first, just like the rest steps of bringing up TDX module. + * + * So, for simplicity do everything in __tdx_bringup(); the first + * SEAMCALL will return -ENODEV when the module is not loaded. 
The + * only complication is having to make sure that initialization + * SEAMCALLs don't return TDX_SEAMCALL_VMFAILINVALID in other + * cases. + */ + r = __tdx_bringup(); + if (r) { + /* + * Disable TDX only but don't fail to load module if + * the TDX module could not be loaded. No need to print + * message saying "module is not loaded" because it was + * printed when the first SEAMCALL failed. + */ + if (r == -ENODEV) + goto success_disable_tdx; + + enable_tdx = 0; + } + + return r; + +success_disable_tdx: + enable_tdx = 0; + return 0; +} diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h new file mode 100644 index 000000000000..9d4a0e8265bf --- /dev/null +++ b/arch/x86/kvm/vmx/tdx.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __KVM_X86_VMX_TDX_H +#define __KVM_X86_VMX_TDX_H + +#ifdef CONFIG_KVM_INTEL_TDX +int tdx_bringup(void); +void tdx_cleanup(void); +#else +static inline int tdx_bringup(void) { return 0; } +static inline void tdx_cleanup(void) {} +#endif + +#endif diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c index 0122051af6b3..9f0c482c1a03 100644 --- a/arch/x86/virt/vmx/tdx/tdx.c +++ b/arch/x86/virt/vmx/tdx/tdx.c @@ -1462,6 +1462,20 @@ void __init tdx_init(void) check_tdx_erratum(); } +const struct tdx_sys_info *tdx_get_sysinfo(void) +{ + const struct tdx_sys_info *p = NULL; + + /* Make sure all fields in @tdx_sysinfo have been populated */ + mutex_lock(&tdx_module_lock); + if (tdx_module_status == TDX_MODULE_INITIALIZED) + p = (const struct tdx_sys_info *)&tdx_sysinfo; + mutex_unlock(&tdx_module_lock); + + return p; +} +EXPORT_SYMBOL_GPL(tdx_get_sysinfo); + int tdx_guest_keyid_alloc(void) { return ida_alloc_range(&tdx_guest_keyid_pool, tdx_guest_keyid_start, diff --git a/arch/x86/virt/vmx/tdx/tdx.h b/arch/x86/virt/vmx/tdx/tdx.h index 62cb7832c42d..da384387d4eb 100644 --- a/arch/x86/virt/vmx/tdx/tdx.h +++ b/arch/x86/virt/vmx/tdx/tdx.h @@ -3,7 +3,6 @@ #define _X86_VIRT_TDX_H #include -#include "tdx_global_metadata.h" /* * This file contains both macros and data structures defined by the TDX diff --git a/arch/x86/virt/vmx/tdx/tdx_global_metadata.h b/arch/x86/virt/vmx/tdx/tdx_global_metadata.h deleted file mode 100644 index 060a2ad744bf..000000000000 --- a/arch/x86/virt/vmx/tdx/tdx_global_metadata.h +++ /dev/null @@ -1,44 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* Automatically generated TDX global metadata structures. 
*/ -#ifndef _X86_VIRT_TDX_AUTO_GENERATED_TDX_GLOBAL_METADATA_H -#define _X86_VIRT_TDX_AUTO_GENERATED_TDX_GLOBAL_METADATA_H - -#include - -struct tdx_sys_info_features { - u64 tdx_features0; -}; - -struct tdx_sys_info_tdmr { - u16 max_tdmrs; - u16 max_reserved_per_tdmr; - u16 pamt_4k_entry_size; - u16 pamt_2m_entry_size; - u16 pamt_1g_entry_size; -}; - -struct tdx_sys_info_td_ctrl { - u16 tdr_base_size; - u16 tdcs_base_size; - u16 tdvps_base_size; -}; - -struct tdx_sys_info_td_conf { - u64 attributes_fixed0; - u64 attributes_fixed1; - u64 xfam_fixed0; - u64 xfam_fixed1; - u16 num_cpuid_config; - u16 max_vcpus_per_td; - u64 cpuid_config_leaves[128]; - u64 cpuid_config_values[128][2]; -}; - -struct tdx_sys_info { - struct tdx_sys_info_features features; - struct tdx_sys_info_tdmr tdmr; - struct tdx_sys_info_td_ctrl td_ctrl; - struct tdx_sys_info_td_conf td_conf; -}; - -#endif diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 1e75fa114f34..3bfe3140f444 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2284,6 +2284,7 @@ static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu) } #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING +extern bool enable_virt_at_load; extern bool kvm_rebooting; #endif diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 6e40383fbe47..622b5a99078a 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -5464,8 +5464,9 @@ static struct miscdevice kvm_dev = { }; #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING -static bool enable_virt_at_load = true; +bool enable_virt_at_load = true; module_param(enable_virt_at_load, bool, 0444); +EXPORT_SYMBOL_GPL(enable_virt_at_load); __visible bool kvm_rebooting; EXPORT_SYMBOL_GPL(kvm_rebooting); -- cgit v1.2.3 From 7c035bea94074b19ed560a4f23a76c5a6c8e594f Mon Sep 17 00:00:00 2001 From: Zhiming Hu Date: Wed, 19 Feb 2025 09:02:51 -0500 Subject: KVM: TDX: Register TDX host key IDs to cgroup misc controller TDX host key IDs (HKID) are limit resources in a machine, and the misc cgroup lets the machine owner track their usage and limits the possibility of abusing them outside the owner's control. The cgroup v2 miscellaneous subsystem was introduced to control the resource of AMD SEV & SEV-ES ASIDs. Likewise introduce HKIDs as a misc resource. 
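The pairing this patch sets up mirrors the SEV ASID handling: capacity is published at TDX bring-up via misc_cg_set_capacity(), one HKID is charged to the creating task's misc cgroup when a TD allocates its key ID, and the charge is dropped when the key ID is freed. A condensed sketch of that pairing is below; the helper names are hypothetical (the patch open-codes this in __tdx_td_init() and tdx_hkid_free()), and error handling is trimmed:

/* Hypothetical helpers condensing the charge/uncharge pairing added below. */
static int tdx_misc_cg_charge_hkid(struct kvm_tdx *kvm_tdx)
{
	int ret;

	kvm_tdx->misc_cg = get_current_misc_cg();
	ret = misc_cg_try_charge(MISC_CG_RES_TDX, kvm_tdx->misc_cg, 1);
	if (ret) {
		put_misc_cg(kvm_tdx->misc_cg);
		kvm_tdx->misc_cg = NULL;
	}
	return ret;
}

static void tdx_misc_cg_uncharge_hkid(struct kvm_tdx *kvm_tdx)
{
	misc_cg_uncharge(MISC_CG_RES_TDX, kvm_tdx->misc_cg, 1);
	put_misc_cg(kvm_tdx->misc_cg);
	kvm_tdx->misc_cg = NULL;
}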
Signed-off-by: Zhiming Hu Signed-off-by: Isaku Yamahata Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/tdx.h | 2 ++ arch/x86/kvm/vmx/tdx.c | 14 ++++++++++++++ arch/x86/kvm/vmx/tdx.h | 1 + arch/x86/virt/vmx/tdx/tdx.c | 6 ++++++ include/linux/misc_cgroup.h | 4 ++++ kernel/cgroup/misc.c | 4 ++++ 6 files changed, 31 insertions(+) (limited to 'include/linux') diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index 1a8c687603aa..2879fc518a32 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -125,6 +125,7 @@ const char *tdx_dump_mce_info(struct mce *m); const struct tdx_sys_info *tdx_get_sysinfo(void); int tdx_guest_keyid_alloc(void); +u32 tdx_get_nr_guest_keyids(void); void tdx_guest_keyid_free(unsigned int keyid); struct tdx_td { @@ -180,6 +181,7 @@ u64 tdh_phymem_page_wbinvd_tdr(struct tdx_td *td); static inline void tdx_init(void) { } static inline int tdx_cpu_enable(void) { return -ENODEV; } static inline int tdx_enable(void) { return -ENODEV; } +static inline u32 tdx_get_nr_guest_keyids(void) { return 0; } static inline const char *tdx_dump_mce_info(struct mce *m) { return NULL; } static inline const struct tdx_sys_info *tdx_get_sysinfo(void) { return NULL; } #endif /* CONFIG_INTEL_TDX_HOST */ diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index 2ac925ecccd5..01166cb8f2e6 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include #include +#include #include #include "capabilities.h" #include "mmu.h" @@ -140,6 +141,9 @@ static inline void tdx_hkid_free(struct kvm_tdx *kvm_tdx) tdx_guest_keyid_free(kvm_tdx->hkid); kvm_tdx->hkid = -1; atomic_dec(&nr_configured_hkid); + misc_cg_uncharge(MISC_CG_RES_TDX, kvm_tdx->misc_cg, 1); + put_misc_cg(kvm_tdx->misc_cg); + kvm_tdx->misc_cg = NULL; } static inline bool is_hkid_assigned(struct kvm_tdx *kvm_tdx) @@ -675,6 +679,10 @@ static int __tdx_td_init(struct kvm *kvm, struct td_params *td_params, if (ret < 0) return ret; kvm_tdx->hkid = ret; + kvm_tdx->misc_cg = get_current_misc_cg(); + ret = misc_cg_try_charge(MISC_CG_RES_TDX, kvm_tdx->misc_cg, 1); + if (ret) + goto free_hkid; ret = -ENOMEM; @@ -1459,6 +1467,11 @@ static int __init __tdx_bringup(void) goto get_sysinfo_err; } + if (misc_cg_set_capacity(MISC_CG_RES_TDX, tdx_get_nr_guest_keyids())) { + r = -EINVAL; + goto get_sysinfo_err; + } + /* * Leave hardware virtualization enabled after TDX is enabled * successfully. TDX CPU hotplug depends on this. 
@@ -1475,6 +1488,7 @@ tdx_bringup_err: void tdx_cleanup(void) { if (enable_tdx) { + misc_cg_set_capacity(MISC_CG_RES_TDX, 0); __tdx_cleanup(); kvm_disable_virtualization(); } diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h index 6ec7ac1d91e3..0559126c8f9d 100644 --- a/arch/x86/kvm/vmx/tdx.h +++ b/arch/x86/kvm/vmx/tdx.h @@ -21,6 +21,7 @@ enum kvm_tdx_state { struct kvm_tdx { struct kvm kvm; + struct misc_cg *misc_cg; int hkid; enum kvm_tdx_state state; diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c index 9f0c482c1a03..3a272e9ff2ca 100644 --- a/arch/x86/virt/vmx/tdx/tdx.c +++ b/arch/x86/virt/vmx/tdx/tdx.c @@ -1476,6 +1476,12 @@ const struct tdx_sys_info *tdx_get_sysinfo(void) } EXPORT_SYMBOL_GPL(tdx_get_sysinfo); +u32 tdx_get_nr_guest_keyids(void) +{ + return tdx_nr_guest_keyids; +} +EXPORT_SYMBOL_GPL(tdx_get_nr_guest_keyids); + int tdx_guest_keyid_alloc(void) { return ida_alloc_range(&tdx_guest_keyid_pool, tdx_guest_keyid_start, diff --git a/include/linux/misc_cgroup.h b/include/linux/misc_cgroup.h index 49eef10c8e59..8c0e4f4d71be 100644 --- a/include/linux/misc_cgroup.h +++ b/include/linux/misc_cgroup.h @@ -17,6 +17,10 @@ enum misc_res_type { MISC_CG_RES_SEV, /** @MISC_CG_RES_SEV_ES: AMD SEV-ES ASIDs resource */ MISC_CG_RES_SEV_ES, +#endif +#ifdef CONFIG_INTEL_TDX_HOST + /* Intel TDX HKIDs resource */ + MISC_CG_RES_TDX, #endif /** @MISC_CG_RES_TYPES: count of enum misc_res_type constants */ MISC_CG_RES_TYPES diff --git a/kernel/cgroup/misc.c b/kernel/cgroup/misc.c index 0e26068995a6..264aad22c967 100644 --- a/kernel/cgroup/misc.c +++ b/kernel/cgroup/misc.c @@ -24,6 +24,10 @@ static const char *const misc_res_name[] = { /* AMD SEV-ES ASIDs resource */ "sev_es", #endif +#ifdef CONFIG_INTEL_TDX_HOST + /* Intel TDX HKIDs resource */ + "tdx", +#endif }; /* Root misc cgroup */ -- cgit v1.2.3 From c4a92f12cf35b83ce81757f6e5e8eb6223b87388 Mon Sep 17 00:00:00 2001 From: Yan Zhao Date: Mon, 13 Jan 2025 11:08:41 +0800 Subject: KVM: Add parameter "kvm" to kvm_cpu_dirty_log_size() and its callers Add a parameter "kvm" to kvm_cpu_dirty_log_size() and down to its callers: kvm_dirty_ring_get_rsvd_entries(), kvm_dirty_ring_alloc(). This is a preparation to make cpu_dirty_log_size a per-VM value rather than a system-wide value. No function changes expected. Signed-off-by: Yan Zhao Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 2 +- include/linux/kvm_dirty_ring.h | 11 ++++++----- virt/kvm/dirty_ring.c | 11 ++++++----- virt/kvm/kvm_main.c | 4 ++-- 4 files changed, 15 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 3c208291154c..a0c736c11091 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1311,7 +1311,7 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); } -int kvm_cpu_dirty_log_size(void) +int kvm_cpu_dirty_log_size(struct kvm *kvm) { return kvm_x86_ops.cpu_dirty_log_size; } diff --git a/include/linux/kvm_dirty_ring.h b/include/linux/kvm_dirty_ring.h index 4862c98d80d3..da4d9b5f58f1 100644 --- a/include/linux/kvm_dirty_ring.h +++ b/include/linux/kvm_dirty_ring.h @@ -32,7 +32,7 @@ struct kvm_dirty_ring { * If CONFIG_HAVE_HVM_DIRTY_RING not defined, kvm_dirty_ring.o should * not be included as well, so define these nop functions for the arch. 
*/ -static inline u32 kvm_dirty_ring_get_rsvd_entries(void) +static inline u32 kvm_dirty_ring_get_rsvd_entries(struct kvm *kvm) { return 0; } @@ -42,7 +42,7 @@ static inline bool kvm_use_dirty_bitmap(struct kvm *kvm) return true; } -static inline int kvm_dirty_ring_alloc(struct kvm_dirty_ring *ring, +static inline int kvm_dirty_ring_alloc(struct kvm *kvm, struct kvm_dirty_ring *ring, int index, u32 size) { return 0; @@ -71,11 +71,12 @@ static inline void kvm_dirty_ring_free(struct kvm_dirty_ring *ring) #else /* CONFIG_HAVE_KVM_DIRTY_RING */ -int kvm_cpu_dirty_log_size(void); +int kvm_cpu_dirty_log_size(struct kvm *kvm); bool kvm_use_dirty_bitmap(struct kvm *kvm); bool kvm_arch_allow_write_without_running_vcpu(struct kvm *kvm); -u32 kvm_dirty_ring_get_rsvd_entries(void); -int kvm_dirty_ring_alloc(struct kvm_dirty_ring *ring, int index, u32 size); +u32 kvm_dirty_ring_get_rsvd_entries(struct kvm *kvm); +int kvm_dirty_ring_alloc(struct kvm *kvm, struct kvm_dirty_ring *ring, + int index, u32 size); /* * called with kvm->slots_lock held, returns the number of diff --git a/virt/kvm/dirty_ring.c b/virt/kvm/dirty_ring.c index 7bc74969a819..d14ffc7513ee 100644 --- a/virt/kvm/dirty_ring.c +++ b/virt/kvm/dirty_ring.c @@ -11,14 +11,14 @@ #include #include "kvm_mm.h" -int __weak kvm_cpu_dirty_log_size(void) +int __weak kvm_cpu_dirty_log_size(struct kvm *kvm) { return 0; } -u32 kvm_dirty_ring_get_rsvd_entries(void) +u32 kvm_dirty_ring_get_rsvd_entries(struct kvm *kvm) { - return KVM_DIRTY_RING_RSVD_ENTRIES + kvm_cpu_dirty_log_size(); + return KVM_DIRTY_RING_RSVD_ENTRIES + kvm_cpu_dirty_log_size(kvm); } bool kvm_use_dirty_bitmap(struct kvm *kvm) @@ -74,14 +74,15 @@ static void kvm_reset_dirty_gfn(struct kvm *kvm, u32 slot, u64 offset, u64 mask) KVM_MMU_UNLOCK(kvm); } -int kvm_dirty_ring_alloc(struct kvm_dirty_ring *ring, int index, u32 size) +int kvm_dirty_ring_alloc(struct kvm *kvm, struct kvm_dirty_ring *ring, + int index, u32 size) { ring->dirty_gfns = vzalloc(size); if (!ring->dirty_gfns) return -ENOMEM; ring->size = size / sizeof(struct kvm_dirty_gfn); - ring->soft_limit = ring->size - kvm_dirty_ring_get_rsvd_entries(); + ring->soft_limit = ring->size - kvm_dirty_ring_get_rsvd_entries(kvm); ring->dirty_index = 0; ring->reset_index = 0; ring->index = index; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 622b5a99078a..549537da3062 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -4108,7 +4108,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, unsigned long id) goto vcpu_free_run_page; if (kvm->dirty_ring_size) { - r = kvm_dirty_ring_alloc(&vcpu->dirty_ring, + r = kvm_dirty_ring_alloc(kvm, &vcpu->dirty_ring, id, kvm->dirty_ring_size); if (r) goto arch_vcpu_destroy; @@ -4847,7 +4847,7 @@ static int kvm_vm_ioctl_enable_dirty_log_ring(struct kvm *kvm, u32 size) return -EINVAL; /* Should be bigger to keep the reserved entries, or a page */ - if (size < kvm_dirty_ring_get_rsvd_entries() * + if (size < kvm_dirty_ring_get_rsvd_entries(kvm) * sizeof(struct kvm_dirty_gfn) || size < PAGE_SIZE) return -EINVAL; -- cgit v1.2.3 From 44428e4936022a7a31743017849b167e64f33a32 Mon Sep 17 00:00:00 2001 From: Binbin Wu Date: Sat, 22 Feb 2025 09:42:18 +0800 Subject: KVM: x86: Move pv_unhalted check out of kvm_vcpu_has_events() Move pv_unhalted check out of kvm_vcpu_has_events(), check pv_unhalted explicitly when handling PV unhalt and expose kvm_vcpu_has_events(). 
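For reference, this is what the resulting runnability check looks like (taken from the x86.c hunk further below); the next paragraph explains why the pv_unhalted check no longer belongs inside kvm_vcpu_has_events() itself:

/* After this patch: PV unhalt is checked explicitly by the callers that need it. */
int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
{
	return kvm_vcpu_running(vcpu) || vcpu->arch.pv.pv_unhalted ||
	       kvm_vcpu_has_events(vcpu);
}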
kvm_vcpu_has_events() returns true if pv_unhalted is set, and pv_unhalted is only cleared on transitions to KVM_MP_STATE_RUNNABLE. If the guest initiates a spurious wakeup, pv_unhalted could be left set in perpetuity. Currently, this is not problematic because kvm_vcpu_has_events() is only called when handling PV unhalt. However, if kvm_vcpu_has_events() is used for other purposes in the future, it could return the unexpected results. Export kvm_vcpu_has_events() for its usage in broader contexts. Suggested-by: Sean Christopherson Signed-off-by: Binbin Wu Message-ID: <20250222014225.897298-3-binbin.wu@linux.intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 11 +++++------ include/linux/kvm_host.h | 1 + 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a792207a0dd1..3cae210ffaa4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -11143,7 +11143,7 @@ static bool kvm_vcpu_running(struct kvm_vcpu *vcpu) !vcpu->arch.apf.halted); } -static bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) +bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) { if (!list_empty_careful(&vcpu->async_pf.done)) return true; @@ -11152,9 +11152,6 @@ static bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) kvm_apic_init_sipi_allowed(vcpu)) return true; - if (vcpu->arch.pv.pv_unhalted) - return true; - if (kvm_is_exception_pending(vcpu)) return true; @@ -11192,10 +11189,12 @@ static bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) return false; } +EXPORT_SYMBOL_GPL(kvm_vcpu_has_events); int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) { - return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu); + return kvm_vcpu_running(vcpu) || vcpu->arch.pv.pv_unhalted || + kvm_vcpu_has_events(vcpu); } /* Called within kvm->srcu read side. */ @@ -11331,7 +11330,7 @@ static int __kvm_emulate_halt(struct kvm_vcpu *vcpu, int state, int reason) */ ++vcpu->stat.halt_exits; if (lapic_in_kernel(vcpu)) { - if (kvm_vcpu_has_events(vcpu)) + if (kvm_vcpu_has_events(vcpu) || vcpu->arch.pv.pv_unhalted) vcpu->arch.pv.pv_unhalted = false; else vcpu->arch.mp_state = state; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 3bfe3140f444..ed1968f6f841 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1609,6 +1609,7 @@ void kvm_arch_disable_virtualization(void); int kvm_arch_enable_virtualization_cpu(void); void kvm_arch_disable_virtualization_cpu(void); #endif +bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu); int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu); bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu); int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu); -- cgit v1.2.3 From 0b28b7080ef5a323c3afa3860c3d45d627629830 Mon Sep 17 00:00:00 2001 From: Marek Behún Date: Tue, 4 Feb 2025 14:14:12 +0100 Subject: platform: cznic: Add keyctl helpers for Turris platform MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some Turris devices support signing messages with a per-device unique asymmetric key that was created on the device at manufacture time. Add helper module that helps to expose this ability via the keyctl() syscall. A device-specific driver can register a signing key by calling devm_turris_signing_key_create(). Both the `.turris-signing-keys` keyring and the signing key are created with only the VIEW, READ and SEARCH permissions for userspace - it is impossible to link / unlink / move them, set their attributes, or unlink the keyring from userspace. 
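As a usage illustration only (a hypothetical driver, not part of this patch), a device-specific driver would fill in a struct turris_signing_key_subtype describing its key and register it from probe(); the sizes, callbacks, and helper names below are made up:

/* Hypothetical driver code showing how the helper is meant to be used. */
#include <linux/platform_device.h>
#include <linux/turris-signing-key.h>

struct example_priv {
	const void *public_key;		/* raw public key bytes */
};

/* Hypothetical firmware call that signs a fixed-size digest. */
int example_fw_sign(struct device *dev, const void *msg, void *sig);

static const void *example_get_public_key(const struct key *key)
{
	struct example_priv *priv = dev_get_drvdata(turris_signing_key_get_dev(key));

	return priv->public_key;
}

static int example_sign(const struct key *key, const void *msg, void *sig)
{
	return example_fw_sign(turris_signing_key_get_dev(key), msg, sig);
}

static const struct turris_signing_key_subtype example_subtype = {
	.key_size		= 521,		/* illustrative values only */
	.data_size		= 64,		/* signs a sha512 digest */
	.sig_size		= 136,
	.public_key_size	= 67,
	.hash_algo		= "sha512",
	.get_public_key		= example_get_public_key,
	.sign			= example_sign,
};

static int example_probe(struct platform_device *pdev)
{
	/* ... device-specific setup ... */
	return devm_turris_signing_key_create(&pdev->dev, &example_subtype,
					      "Example device signing key");
}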
Signed-off-by: Marek Behún Signed-off-by: Arnd Bergmann --- MAINTAINERS | 1 + drivers/platform/cznic/Kconfig | 5 + drivers/platform/cznic/Makefile | 2 + drivers/platform/cznic/turris-signing-key.c | 192 ++++++++++++++++++++++++++++ include/linux/turris-signing-key.h | 33 +++++ 5 files changed, 233 insertions(+) create mode 100644 drivers/platform/cznic/turris-signing-key.c create mode 100644 include/linux/turris-signing-key.h (limited to 'include/linux') diff --git a/MAINTAINERS b/MAINTAINERS index 6182a8a0e1c7..6a90c12c22ad 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2441,6 +2441,7 @@ F: include/dt-bindings/bus/moxtet.h F: include/linux/armada-37xx-rwtm-mailbox.h F: include/linux/moxtet.h F: include/linux/turris-omnia-mcu-interface.h +F: include/linux/turris-signing-key.h ARM/FARADAY FA526 PORT M: Hans Ulli Kroll diff --git a/drivers/platform/cznic/Kconfig b/drivers/platform/cznic/Kconfig index 49c383eb6785..2b4c91ede6d8 100644 --- a/drivers/platform/cznic/Kconfig +++ b/drivers/platform/cznic/Kconfig @@ -77,4 +77,9 @@ config TURRIS_OMNIA_MCU_TRNG endif # TURRIS_OMNIA_MCU +config TURRIS_SIGNING_KEY + tristate + depends on KEYS + depends on ASYMMETRIC_KEY_TYPE + endif # CZNIC_PLATFORMS diff --git a/drivers/platform/cznic/Makefile b/drivers/platform/cznic/Makefile index ce6d997f34d6..2b8606a9486b 100644 --- a/drivers/platform/cznic/Makefile +++ b/drivers/platform/cznic/Makefile @@ -6,3 +6,5 @@ turris-omnia-mcu-$(CONFIG_TURRIS_OMNIA_MCU_GPIO) += turris-omnia-mcu-gpio.o turris-omnia-mcu-$(CONFIG_TURRIS_OMNIA_MCU_SYSOFF_WAKEUP) += turris-omnia-mcu-sys-off-wakeup.o turris-omnia-mcu-$(CONFIG_TURRIS_OMNIA_MCU_TRNG) += turris-omnia-mcu-trng.o turris-omnia-mcu-$(CONFIG_TURRIS_OMNIA_MCU_WATCHDOG) += turris-omnia-mcu-watchdog.o + +obj-$(CONFIG_TURRIS_SIGNING_KEY) += turris-signing-key.o diff --git a/drivers/platform/cznic/turris-signing-key.c b/drivers/platform/cznic/turris-signing-key.c new file mode 100644 index 000000000000..3b12e5245fb7 --- /dev/null +++ b/drivers/platform/cznic/turris-signing-key.c @@ -0,0 +1,192 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Some of CZ.NIC's Turris devices support signing messages with a per-device unique asymmetric + * cryptographic key that was burned into the device at manufacture. + * + * This helper module exposes this message signing ability via the keyctl() syscall. Upon load, it + * creates the `.turris-signing-keys` keyring. A device-specific driver then has to create a signing + * key by calling devm_turris_signing_key_create(). 
+ * + * 2025 by Marek Behún + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +static int turris_signing_key_instantiate(struct key *, struct key_preparsed_payload *) +{ + return 0; +} + +static void turris_signing_key_describe(const struct key *key, struct seq_file *m) +{ + const struct turris_signing_key_subtype *subtype = dereference_key_rcu(key); + + if (!subtype) + return; + + seq_printf(m, "%s: %*phN", key->description, subtype->public_key_size, + subtype->get_public_key(key)); +} + +static long turris_signing_key_read(const struct key *key, char *buffer, size_t buflen) +{ + const struct turris_signing_key_subtype *subtype = dereference_key_rcu(key); + + if (!subtype) + return -EIO; + + if (buffer) { + if (buflen > subtype->public_key_size) + buflen = subtype->public_key_size; + + memcpy(buffer, subtype->get_public_key(key), subtype->public_key_size); + } + + return subtype->public_key_size; +} + +static bool turris_signing_key_asym_valid_params(const struct turris_signing_key_subtype *subtype, + const struct kernel_pkey_params *params) +{ + if (params->encoding && strcmp(params->encoding, "raw")) + return false; + + if (params->hash_algo && strcmp(params->hash_algo, subtype->hash_algo)) + return false; + + return true; +} + +static int turris_signing_key_asym_query(const struct kernel_pkey_params *params, + struct kernel_pkey_query *info) +{ + const struct turris_signing_key_subtype *subtype = dereference_key_rcu(params->key); + + if (!subtype) + return -EIO; + + if (!turris_signing_key_asym_valid_params(subtype, params)) + return -EINVAL; + + info->supported_ops = KEYCTL_SUPPORTS_SIGN; + info->key_size = subtype->key_size; + info->max_data_size = subtype->data_size; + info->max_sig_size = subtype->sig_size; + info->max_enc_size = 0; + info->max_dec_size = 0; + + return 0; +} + +static int turris_signing_key_asym_eds_op(struct kernel_pkey_params *params, + const void *in, void *out) +{ + const struct turris_signing_key_subtype *subtype = dereference_key_rcu(params->key); + int err; + + if (!subtype) + return -EIO; + + if (!turris_signing_key_asym_valid_params(subtype, params)) + return -EINVAL; + + if (params->op != kernel_pkey_sign) + return -EOPNOTSUPP; + + if (params->in_len != subtype->data_size || params->out_len != subtype->sig_size) + return -EINVAL; + + err = subtype->sign(params->key, in, out); + if (err) + return err; + + return subtype->sig_size; +} + +static struct key_type turris_signing_key_type = { + .name = "turris-signing-key", + .instantiate = turris_signing_key_instantiate, + .describe = turris_signing_key_describe, + .read = turris_signing_key_read, + .asym_query = turris_signing_key_asym_query, + .asym_eds_op = turris_signing_key_asym_eds_op, +}; + +static struct key *turris_signing_keyring; + +static void turris_signing_key_release(void *key) +{ + key_unlink(turris_signing_keyring, key); + key_put(key); +} + +int +devm_turris_signing_key_create(struct device *dev, const struct turris_signing_key_subtype *subtype, + const char *desc) +{ + struct key *key; + key_ref_t kref; + + kref = key_create(make_key_ref(turris_signing_keyring, true), + turris_signing_key_type.name, desc, NULL, 0, + (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | KEY_USR_READ | + KEY_USR_SEARCH, + KEY_ALLOC_BUILT_IN | KEY_ALLOC_SET_KEEP | KEY_ALLOC_NOT_IN_QUOTA); + if (IS_ERR(kref)) + return PTR_ERR(kref); + + key = key_ref_to_ptr(kref); + key->payload.data[1] = dev; + rcu_assign_keypointer(key, subtype); + + return 
devm_add_action_or_reset(dev, turris_signing_key_release, key); +} +EXPORT_SYMBOL_GPL(devm_turris_signing_key_create); + +static int turris_signing_key_init(void) +{ + int err; + + err = register_key_type(&turris_signing_key_type); + if (err) + return err; + + turris_signing_keyring = keyring_alloc(".turris-signing-keys", + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, current_cred(), + (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | + KEY_USR_READ | KEY_USR_SEARCH, + KEY_ALLOC_BUILT_IN | KEY_ALLOC_SET_KEEP | + KEY_ALLOC_NOT_IN_QUOTA, + NULL, NULL); + if (IS_ERR(turris_signing_keyring)) { + pr_err("Cannot allocate Turris keyring\n"); + + unregister_key_type(&turris_signing_key_type); + + return PTR_ERR(turris_signing_keyring); + } + + return 0; +} +module_init(turris_signing_key_init); + +static void turris_signing_key_exit(void) +{ + key_put(turris_signing_keyring); + unregister_key_type(&turris_signing_key_type); +} +module_exit(turris_signing_key_exit); + +MODULE_AUTHOR("Marek Behun "); +MODULE_DESCRIPTION("CZ.NIC's Turris signing key helper"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/turris-signing-key.h b/include/linux/turris-signing-key.h new file mode 100644 index 000000000000..032ca8cbf636 --- /dev/null +++ b/include/linux/turris-signing-key.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * 2025 by Marek Behún + */ + +#ifndef __TURRIS_SIGNING_KEY_H +#define __TURRIS_SIGNING_KEY_H + +#include +#include + +struct device; + +struct turris_signing_key_subtype { + u16 key_size; + u8 data_size; + u8 sig_size; + u8 public_key_size; + const char *hash_algo; + const void *(*get_public_key)(const struct key *key); + int (*sign)(const struct key *key, const void *msg, void *signature); +}; + +static inline struct device *turris_signing_key_get_dev(const struct key *key) +{ + return key->payload.data[1]; +} + +int +devm_turris_signing_key_create(struct device *dev, const struct turris_signing_key_subtype *subtype, + const char *desc); + +#endif /* __TURRIS_SIGNING_KEY_H */ -- cgit v1.2.3 From 78a0323506f01e8017a5826cd7e91951c13184fa Mon Sep 17 00:00:00 2001 From: Sohil Mehta Date: Thu, 27 Mar 2025 23:46:22 +0000 Subject: x86/nmi: Consolidate NMI panic variables Commit: c305a4e98378 ("x86: Move sysctls into arch/x86") recently moved the sysctl handling of panic_on_unrecovered_nmi and panic_on_io_nmi to x86-specific code. These variables no longer need to be declared in the generic header file. Relocate the variable definitions and declarations closer to where they are used. This makes all the NMI panic options consistent and easier to track. [ mingo: Fixed up the SHA1 of the commit reference. 
] Signed-off-by: Sohil Mehta Signed-off-by: Ingo Molnar Reviewed-by: Kai Huang Acked-by: Peter Zijlstra (Intel) Reviewed-by: Nikolay Borisov Cc: Joel Granados Link: https://lore.kernel.org/r/20250327234629.3953536-3-sohil.mehta@intel.com --- arch/x86/include/asm/nmi.h | 2 ++ arch/x86/kernel/dumpstack.c | 2 -- arch/x86/kernel/nmi.c | 3 +++ include/linux/panic.h | 2 -- 4 files changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index 9cf96cce02fc..f85aea7bf7f1 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h @@ -17,6 +17,8 @@ extern void release_evntsel_nmi(unsigned int); #endif /* CONFIG_X86_LOCAL_APIC */ extern int unknown_nmi_panic; +extern int panic_on_unrecovered_nmi; +extern int panic_on_io_nmi; #define NMI_FLAG_FIRST 1 diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index c6fefd4585f8..71ee20102a8a 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -23,8 +23,6 @@ #include #include -int panic_on_unrecovered_nmi; -int panic_on_io_nmi; static int die_counter; static struct pt_regs exec_summary_regs; diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 9a95d00f1423..671d846ed620 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -91,6 +91,9 @@ static DEFINE_PER_CPU(struct nmi_stats, nmi_stats); static int ignore_nmis __read_mostly; int unknown_nmi_panic; +int panic_on_unrecovered_nmi; +int panic_on_io_nmi; + /* * Prevent NMI reason port (0x61) being accessed simultaneously, can * only be used in NMI handler. diff --git a/include/linux/panic.h b/include/linux/panic.h index 2494d51707ef..4adc65766935 100644 --- a/include/linux/panic.h +++ b/include/linux/panic.h @@ -20,8 +20,6 @@ extern bool panic_triggering_all_cpu_backtrace; extern int panic_timeout; extern unsigned long panic_print; extern int panic_on_oops; -extern int panic_on_unrecovered_nmi; -extern int panic_on_io_nmi; extern int panic_on_warn; extern unsigned long panic_on_taint; -- cgit v1.2.3 From 642989287350fa4964c37df0f3769094072421c3 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 24 Mar 2025 17:59:59 +0100 Subject: firmware: turris-mox-rwtm: fix building without CONFIG_KEYS "struct key" is defined conditionally, so the code referencing it must be made conditional as well: In file included from drivers/firmware/turris-mox-rwtm.c:29: include/linux/turris-signing-key.h: In function 'turris_signing_key_get_dev': include/linux/turris-signing-key.h:26:19: error: invalid use of undefined type 'const struct key' 26 | return key->payload.data[1]; | ^~ Signed-off-by: Arnd Bergmann --- include/linux/turris-signing-key.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/turris-signing-key.h b/include/linux/turris-signing-key.h index 032ca8cbf636..8a435b73c3a9 100644 --- a/include/linux/turris-signing-key.h +++ b/include/linux/turris-signing-key.h @@ -11,6 +11,7 @@ struct device; +#ifdef CONFIG_KEYS struct turris_signing_key_subtype { u16 key_size; u8 data_size; @@ -29,5 +30,6 @@ static inline struct device *turris_signing_key_get_dev(const struct key *key) int devm_turris_signing_key_create(struct device *dev, const struct turris_signing_key_subtype *subtype, const char *desc); +#endif #endif /* __TURRIS_SIGNING_KEY_H */ -- cgit v1.2.3 From f6e9a26e2d488c743757d66898ae91c53ffbe528 Mon Sep 17 00:00:00 2001 From: JP Kobryn Date: Thu, 3 Apr 2025 18:10:46 -0700 Subject: cgroup: move rstat base stat 
objects into their own struct This non-functional change serves as preparation for moving to subsystem-based rstat trees. The base stats are not an actual subsystem, but in future commits they will have exclusive rstat trees just as other subsystems will. Moving the base stat objects into a new struct allows the cgroup_rstat_cpu struct to become more compact since it now only contains the minimum amount of pointers needed for rstat participation. Subsystems will (in future commits) make use of the compact cgroup_rstat_cpu struct while avoiding the memory overhead of the base stat objects which they will not use. An instance of the new struct cgroup_rstat_base_cpu was placed on the cgroup struct so it can retain ownership of these base stats common to all cgroups. A helper function was added for looking up the cpu-specific base stats of a given cgroup. Finally, initialization and variable names were adjusted where applicable. Signed-off-by: JP Kobryn Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 38 +++++++++++--------- kernel/cgroup/cgroup.c | 8 +++-- kernel/cgroup/rstat.c | 84 +++++++++++++++++++++++++++------------------ 3 files changed, 79 insertions(+), 51 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 5bc8f55c8cca..1bf2e8db6dac 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -344,10 +344,29 @@ struct cgroup_base_stat { * frequency decreases the cost of each read. * * This struct hosts both the fields which implement the above - - * updated_children and updated_next - and the fields which track basic - * resource statistics on top of it - bsync, bstat and last_bstat. + * updated_children and updated_next. */ struct cgroup_rstat_cpu { + /* + * Child cgroups with stat updates on this cpu since the last read + * are linked on the parent's ->updated_children through + * ->updated_next. + * + * In addition to being more compact, singly-linked list pointing + * to the cgroup makes it unnecessary for each per-cpu struct to + * point back to the associated cgroup. + * + * Protected by per-cpu cgroup_rstat_cpu_lock. + */ + struct cgroup *updated_children; /* terminated by self cgroup */ + struct cgroup *updated_next; /* NULL iff not on the list */ +}; + +/* + * This struct hosts the fields which track basic resource statistics on + * top of it - bsync, bstat and last_bstat. + */ +struct cgroup_rstat_base_cpu { /* * ->bsync protects ->bstat. These are the only fields which get * updated in the hot path. @@ -374,20 +393,6 @@ struct cgroup_rstat_cpu { * deltas to propagate to the per-cpu subtree_bstat. */ struct cgroup_base_stat last_subtree_bstat; - - /* - * Child cgroups with stat updates on this cpu since the last read - * are linked on the parent's ->updated_children through - * ->updated_next. - * - * In addition to being more compact, singly-linked list pointing - * to the cgroup makes it unnecessary for each per-cpu struct to - * point back to the associated cgroup. - * - * Protected by per-cpu cgroup_rstat_cpu_lock. 
- */ - struct cgroup *updated_children; /* terminated by self cgroup */ - struct cgroup *updated_next; /* NULL iff not on the list */ }; struct cgroup_freezer_state { @@ -518,6 +523,7 @@ struct cgroup { /* per-cpu recursive resource statistics */ struct cgroup_rstat_cpu __percpu *rstat_cpu; + struct cgroup_rstat_base_cpu __percpu *rstat_base_cpu; struct list_head rstat_css_list; /* diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 49d622205997..e1f0284cbbb8 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -161,10 +161,14 @@ static struct static_key_true *cgroup_subsys_on_dfl_key[] = { }; #undef SUBSYS -static DEFINE_PER_CPU(struct cgroup_rstat_cpu, cgrp_dfl_root_rstat_cpu); +static DEFINE_PER_CPU(struct cgroup_rstat_cpu, root_rstat_cpu); +static DEFINE_PER_CPU(struct cgroup_rstat_base_cpu, root_rstat_base_cpu); /* the default hierarchy */ -struct cgroup_root cgrp_dfl_root = { .cgrp.rstat_cpu = &cgrp_dfl_root_rstat_cpu }; +struct cgroup_root cgrp_dfl_root = { + .cgrp.rstat_cpu = &root_rstat_cpu, + .cgrp.rstat_base_cpu = &root_rstat_base_cpu, +}; EXPORT_SYMBOL_GPL(cgrp_dfl_root); /* diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index b2239156b7de..918931f68de3 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -19,6 +19,12 @@ static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu) return per_cpu_ptr(cgrp->rstat_cpu, cpu); } +static struct cgroup_rstat_base_cpu *cgroup_rstat_base_cpu( + struct cgroup *cgrp, int cpu) +{ + return per_cpu_ptr(cgrp->rstat_base_cpu, cpu); +} + /* * Helper functions for rstat per CPU lock (cgroup_rstat_cpu_lock). * @@ -352,12 +358,22 @@ int cgroup_rstat_init(struct cgroup *cgrp) return -ENOMEM; } + if (!cgrp->rstat_base_cpu) { + cgrp->rstat_base_cpu = alloc_percpu(struct cgroup_rstat_base_cpu); + if (!cgrp->rstat_cpu) { + free_percpu(cgrp->rstat_cpu); + return -ENOMEM; + } + } + /* ->updated_children list is self terminated */ for_each_possible_cpu(cpu) { struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); + struct cgroup_rstat_base_cpu *rstatbc = + cgroup_rstat_base_cpu(cgrp, cpu); rstatc->updated_children = cgrp; - u64_stats_init(&rstatc->bsync); + u64_stats_init(&rstatbc->bsync); } return 0; @@ -380,6 +396,8 @@ void cgroup_rstat_exit(struct cgroup *cgrp) free_percpu(cgrp->rstat_cpu); cgrp->rstat_cpu = NULL; + free_percpu(cgrp->rstat_base_cpu); + cgrp->rstat_base_cpu = NULL; } void __init cgroup_rstat_boot(void) @@ -420,9 +438,9 @@ static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat, static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) { - struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); + struct cgroup_rstat_base_cpu *rstatbc = cgroup_rstat_base_cpu(cgrp, cpu); struct cgroup *parent = cgroup_parent(cgrp); - struct cgroup_rstat_cpu *prstatc; + struct cgroup_rstat_base_cpu *prstatbc; struct cgroup_base_stat delta; unsigned seq; @@ -432,15 +450,15 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) /* fetch the current per-cpu values */ do { - seq = __u64_stats_fetch_begin(&rstatc->bsync); - delta = rstatc->bstat; - } while (__u64_stats_fetch_retry(&rstatc->bsync, seq)); + seq = __u64_stats_fetch_begin(&rstatbc->bsync); + delta = rstatbc->bstat; + } while (__u64_stats_fetch_retry(&rstatbc->bsync, seq)); /* propagate per-cpu delta to cgroup and per-cpu global statistics */ - cgroup_base_stat_sub(&delta, &rstatc->last_bstat); + cgroup_base_stat_sub(&delta, &rstatbc->last_bstat); cgroup_base_stat_add(&cgrp->bstat, 
&delta); - cgroup_base_stat_add(&rstatc->last_bstat, &delta); - cgroup_base_stat_add(&rstatc->subtree_bstat, &delta); + cgroup_base_stat_add(&rstatbc->last_bstat, &delta); + cgroup_base_stat_add(&rstatbc->subtree_bstat, &delta); /* propagate cgroup and per-cpu global delta to parent (unless that's root) */ if (cgroup_parent(parent)) { @@ -449,73 +467,73 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) cgroup_base_stat_add(&parent->bstat, &delta); cgroup_base_stat_add(&cgrp->last_bstat, &delta); - delta = rstatc->subtree_bstat; - prstatc = cgroup_rstat_cpu(parent, cpu); - cgroup_base_stat_sub(&delta, &rstatc->last_subtree_bstat); - cgroup_base_stat_add(&prstatc->subtree_bstat, &delta); - cgroup_base_stat_add(&rstatc->last_subtree_bstat, &delta); + delta = rstatbc->subtree_bstat; + prstatbc = cgroup_rstat_base_cpu(parent, cpu); + cgroup_base_stat_sub(&delta, &rstatbc->last_subtree_bstat); + cgroup_base_stat_add(&prstatbc->subtree_bstat, &delta); + cgroup_base_stat_add(&rstatbc->last_subtree_bstat, &delta); } } -static struct cgroup_rstat_cpu * +static struct cgroup_rstat_base_cpu * cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp, unsigned long *flags) { - struct cgroup_rstat_cpu *rstatc; + struct cgroup_rstat_base_cpu *rstatbc; - rstatc = get_cpu_ptr(cgrp->rstat_cpu); - *flags = u64_stats_update_begin_irqsave(&rstatc->bsync); - return rstatc; + rstatbc = get_cpu_ptr(cgrp->rstat_base_cpu); + *flags = u64_stats_update_begin_irqsave(&rstatbc->bsync); + return rstatbc; } static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp, - struct cgroup_rstat_cpu *rstatc, + struct cgroup_rstat_base_cpu *rstatbc, unsigned long flags) { - u64_stats_update_end_irqrestore(&rstatc->bsync, flags); + u64_stats_update_end_irqrestore(&rstatbc->bsync, flags); cgroup_rstat_updated(cgrp, smp_processor_id()); - put_cpu_ptr(rstatc); + put_cpu_ptr(rstatbc); } void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec) { - struct cgroup_rstat_cpu *rstatc; + struct cgroup_rstat_base_cpu *rstatbc; unsigned long flags; - rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags); - rstatc->bstat.cputime.sum_exec_runtime += delta_exec; - cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags); + rstatbc = cgroup_base_stat_cputime_account_begin(cgrp, &flags); + rstatbc->bstat.cputime.sum_exec_runtime += delta_exec; + cgroup_base_stat_cputime_account_end(cgrp, rstatbc, flags); } void __cgroup_account_cputime_field(struct cgroup *cgrp, enum cpu_usage_stat index, u64 delta_exec) { - struct cgroup_rstat_cpu *rstatc; + struct cgroup_rstat_base_cpu *rstatbc; unsigned long flags; - rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags); + rstatbc = cgroup_base_stat_cputime_account_begin(cgrp, &flags); switch (index) { case CPUTIME_NICE: - rstatc->bstat.ntime += delta_exec; + rstatbc->bstat.ntime += delta_exec; fallthrough; case CPUTIME_USER: - rstatc->bstat.cputime.utime += delta_exec; + rstatbc->bstat.cputime.utime += delta_exec; break; case CPUTIME_SYSTEM: case CPUTIME_IRQ: case CPUTIME_SOFTIRQ: - rstatc->bstat.cputime.stime += delta_exec; + rstatbc->bstat.cputime.stime += delta_exec; break; #ifdef CONFIG_SCHED_CORE case CPUTIME_FORCEIDLE: - rstatc->bstat.forceidle_sum += delta_exec; + rstatbc->bstat.forceidle_sum += delta_exec; break; #endif default: break; } - cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags); + cgroup_base_stat_cputime_account_end(cgrp, rstatbc, flags); } /* -- cgit v1.2.3 From 845a7245801142bfff411bc84afa8cdbc789562f Mon Sep 17 00:00:00 2001 
From: JP Kobryn Date: Thu, 3 Apr 2025 18:10:47 -0700 Subject: cgroup: add helper for checking when css is cgroup::self The cgroup struct has a css field called "self". The main difference between this css and the others found in the cgroup::subsys array is that cgroup::self has a NULL subsystem pointer. There are several places where checks are performed to determine whether the css in question is cgroup::self or not. Instead of accessing css->ss directly, introduce a helper function that shows the intent and use where applicable. Signed-off-by: JP Kobryn Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 5 +++++ kernel/cgroup/cgroup.c | 4 ++-- 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index e7da3c3b098b..65d95bb2199f 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -347,6 +347,11 @@ static inline bool css_is_dying(struct cgroup_subsys_state *css) return css->flags & CSS_DYING; } +static inline bool css_is_cgroup(struct cgroup_subsys_state *css) +{ + return css->ss == NULL; +} + static inline void cgroup_get(struct cgroup *cgrp) { css_get(&cgrp->self); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index e1f0284cbbb8..d98994b106ed 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1706,7 +1706,7 @@ static void css_clear_dir(struct cgroup_subsys_state *css) css->flags &= ~CSS_VISIBLE; - if (!css->ss) { + if (css_is_cgroup(css)) { if (cgroup_on_dfl(cgrp)) { cgroup_addrm_files(css, cgrp, cgroup_base_files, false); @@ -1738,7 +1738,7 @@ static int css_populate_dir(struct cgroup_subsys_state *css) if (css->flags & CSS_VISIBLE) return 0; - if (!css->ss) { + if (css_is_cgroup(css)) { if (cgroup_on_dfl(cgrp)) { ret = cgroup_addrm_files(css, cgrp, cgroup_base_files, true); -- cgit v1.2.3 From a97915559f5c5ff1972d678b94fd460c72a3b5f2 Mon Sep 17 00:00:00 2001 From: JP Kobryn Date: Thu, 3 Apr 2025 18:10:48 -0700 Subject: cgroup: change rstat function signatures from cgroup-based to css-based This non-functional change serves as preparation for moving to subsystem-based rstat trees. To simplify future commits, change the signatures of existing cgroup-based rstat functions to become css-based and rename them to reflect that. Though the signatures have changed, the implementations have not. Within these functions use the css->cgroup pointer to obtain the associated cgroup and allow code to function the same just as it did before this patch. At applicable call sites, pass the subsystem-specific css pointer as an argument or pass a pointer to cgroup::self if not in subsystem context. Note that cgroup_rstat_updated_list() and cgroup_rstat_push_children() are not altered yet since there would be a larger amount of css to cgroup conversions which may overcomplicate the code at this intermediate phase. 
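The call-site conversion is mechanical: subsystem code now passes its own css, while cgroup-core code passes &cgrp->self; for example (mirroring the blk-cgroup.c and cgroup.c hunks below):

/* Subsystem context (blk-cgroup): pass the subsystem's own css. */
css_rstat_updated(&blkcg->css, cpu);	/* was cgroup_rstat_updated(blkcg->css.cgroup, cpu) */
css_rstat_flush(&blkcg->css);		/* was cgroup_rstat_flush(blkcg->css.cgroup) */

/* Cgroup core (no subsystem): pass the cgroup's "self" css. */
ret = css_rstat_init(&cgrp->self);	/* was cgroup_rstat_init(cgrp) */
css_rstat_exit(&cgrp->self);		/* was cgroup_rstat_exit(cgrp) */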
Signed-off-by: JP Kobryn Signed-off-by: Tejun Heo --- block/blk-cgroup.c | 6 +- include/linux/cgroup-defs.h | 2 +- include/linux/cgroup.h | 4 +- kernel/cgroup/cgroup-internal.h | 4 +- kernel/cgroup/cgroup.c | 30 ++++---- kernel/cgroup/rstat.c | 80 +++++++++++++--------- mm/memcontrol.c | 4 +- .../bpf/progs/cgroup_hierarchical_stats.c | 9 +-- 8 files changed, 78 insertions(+), 61 deletions(-) (limited to 'include/linux') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 5905f277057b..0560ea402856 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1144,7 +1144,7 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu) /* * We source root cgroup stats from the system-wide stats to avoid * tracking the same information twice and incurring overhead when no - * cgroups are defined. For that reason, cgroup_rstat_flush in + * cgroups are defined. For that reason, css_rstat_flush in * blkcg_print_stat does not actually fill out the iostat in the root * cgroup's blkcg_gq. * @@ -1253,7 +1253,7 @@ static int blkcg_print_stat(struct seq_file *sf, void *v) if (!seq_css(sf)->parent) blkcg_fill_root_iostats(); else - cgroup_rstat_flush(blkcg->css.cgroup); + css_rstat_flush(&blkcg->css); rcu_read_lock(); hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { @@ -2243,7 +2243,7 @@ void blk_cgroup_bio_start(struct bio *bio) } u64_stats_update_end_irqrestore(&bis->sync, flags); - cgroup_rstat_updated(blkcg->css.cgroup, cpu); + css_rstat_updated(&blkcg->css, cpu); put_cpu(); } diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 1bf2e8db6dac..e58bfb880111 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -536,7 +536,7 @@ struct cgroup { /* * A singly-linked list of cgroup structures to be rstat flushed. * This is a scratch field to be used exclusively by - * cgroup_rstat_flush_locked() and protected by cgroup_rstat_lock. + * css_rstat_flush_locked() and protected by cgroup_rstat_lock. */ struct cgroup *rstat_flush_next; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 65d95bb2199f..1f5b0a4a3356 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -693,8 +693,8 @@ static inline void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen) /* * cgroup scalable recursive statistics. */ -void cgroup_rstat_updated(struct cgroup *cgrp, int cpu); -void cgroup_rstat_flush(struct cgroup *cgrp); +void css_rstat_updated(struct cgroup_subsys_state *css, int cpu); +void css_rstat_flush(struct cgroup_subsys_state *css); /* * Basic resource stats. 
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 95ab39e1ec8f..c161d34be634 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -270,8 +270,8 @@ int cgroup_task_count(const struct cgroup *cgrp); /* * rstat.c */ -int cgroup_rstat_init(struct cgroup *cgrp); -void cgroup_rstat_exit(struct cgroup *cgrp); +int css_rstat_init(struct cgroup_subsys_state *css); +void css_rstat_exit(struct cgroup_subsys_state *css); void cgroup_rstat_boot(void); void cgroup_base_stat_cputime_show(struct seq_file *seq); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index d98994b106ed..c284df1efc9f 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1362,7 +1362,7 @@ static void cgroup_destroy_root(struct cgroup_root *root) cgroup_unlock(); - cgroup_rstat_exit(cgrp); + css_rstat_exit(&cgrp->self); kernfs_destroy_root(root->kf_root); cgroup_free_root(root); } @@ -2136,7 +2136,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) if (ret) goto destroy_root; - ret = cgroup_rstat_init(root_cgrp); + ret = css_rstat_init(&root_cgrp->self); if (ret) goto destroy_root; @@ -2178,7 +2178,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) goto out; exit_stats: - cgroup_rstat_exit(root_cgrp); + css_rstat_exit(&root_cgrp->self); destroy_root: kernfs_destroy_root(root->kf_root); root->kf_root = NULL; @@ -5435,7 +5435,7 @@ static void css_free_rwork_fn(struct work_struct *work) cgroup_put(cgroup_parent(cgrp)); kernfs_put(cgrp->kn); psi_cgroup_free(cgrp); - cgroup_rstat_exit(cgrp); + css_rstat_exit(css); kfree(cgrp); } else { /* @@ -5465,7 +5465,7 @@ static void css_release_work_fn(struct work_struct *work) /* css release path */ if (!list_empty(&css->rstat_css_node)) { - cgroup_rstat_flush(cgrp); + css_rstat_flush(css); list_del_rcu(&css->rstat_css_node); } @@ -5493,7 +5493,7 @@ static void css_release_work_fn(struct work_struct *work) /* cgroup release path */ TRACE_CGROUP_PATH(release, cgrp); - cgroup_rstat_flush(cgrp); + css_rstat_flush(css); spin_lock_irq(&css_set_lock); for (tcgrp = cgroup_parent(cgrp); tcgrp; @@ -5686,17 +5686,13 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, if (ret) goto out_free_cgrp; - ret = cgroup_rstat_init(cgrp); - if (ret) - goto out_cancel_ref; - /* create the directory */ kn = kernfs_create_dir_ns(parent->kn, name, mode, current_fsuid(), current_fsgid(), cgrp, NULL); if (IS_ERR(kn)) { ret = PTR_ERR(kn); - goto out_stat_exit; + goto out_cancel_ref; } cgrp->kn = kn; @@ -5706,6 +5702,14 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, cgrp->root = root; cgrp->level = level; + /* + * Now that init_cgroup_housekeeping() has been called and cgrp->self + * is setup, it is safe to perform rstat initialization on it. 
+ */ + ret = css_rstat_init(&cgrp->self); + if (ret) + goto out_stat_exit; + ret = psi_cgroup_alloc(cgrp); if (ret) goto out_kernfs_remove; @@ -5776,10 +5780,10 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, out_psi_free: psi_cgroup_free(cgrp); +out_stat_exit: + css_rstat_exit(&cgrp->self); out_kernfs_remove: kernfs_remove(cgrp->kn); -out_stat_exit: - cgroup_rstat_exit(cgrp); out_cancel_ref: percpu_ref_exit(&cgrp->self.refcnt); out_free_cgrp: diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 918931f68de3..4a8834a70ca6 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -34,9 +34,10 @@ static struct cgroup_rstat_base_cpu *cgroup_rstat_base_cpu( * operations without handling high-frequency fast-path "update" events. */ static __always_inline -unsigned long _cgroup_rstat_cpu_lock(raw_spinlock_t *cpu_lock, int cpu, - struct cgroup *cgrp, const bool fast_path) +unsigned long _css_rstat_cpu_lock(raw_spinlock_t *cpu_lock, int cpu, + struct cgroup_subsys_state *css, const bool fast_path) { + struct cgroup *cgrp = css->cgroup; unsigned long flags; bool contended; @@ -67,10 +68,12 @@ unsigned long _cgroup_rstat_cpu_lock(raw_spinlock_t *cpu_lock, int cpu, } static __always_inline -void _cgroup_rstat_cpu_unlock(raw_spinlock_t *cpu_lock, int cpu, - struct cgroup *cgrp, unsigned long flags, +void _css_rstat_cpu_unlock(raw_spinlock_t *cpu_lock, int cpu, + struct cgroup_subsys_state *css, unsigned long flags, const bool fast_path) { + struct cgroup *cgrp = css->cgroup; + if (fast_path) trace_cgroup_rstat_cpu_unlock_fastpath(cgrp, cpu, false); else @@ -80,16 +83,17 @@ void _cgroup_rstat_cpu_unlock(raw_spinlock_t *cpu_lock, int cpu, } /** - * cgroup_rstat_updated - keep track of updated rstat_cpu - * @cgrp: target cgroup + * css_rstat_updated - keep track of updated rstat_cpu + * @css: target cgroup subsystem state * @cpu: cpu on which rstat_cpu was updated * - * @cgrp's rstat_cpu on @cpu was updated. Put it on the parent's matching - * rstat_cpu->updated_children list. See the comment on top of + * @css->cgroup's rstat_cpu on @cpu was updated. Put it on the parent's + * matching rstat_cpu->updated_children list. See the comment on top of * cgroup_rstat_cpu definition for details. 
*/ -__bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) +__bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu) { + struct cgroup *cgrp = css->cgroup; raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu); unsigned long flags; @@ -104,7 +108,7 @@ __bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) if (data_race(cgroup_rstat_cpu(cgrp, cpu)->updated_next)) return; - flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, cgrp, true); + flags = _css_rstat_cpu_lock(cpu_lock, cpu, css, true); /* put @cgrp and all ancestors on the corresponding updated lists */ while (true) { @@ -132,7 +136,7 @@ __bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) cgrp = parent; } - _cgroup_rstat_cpu_unlock(cpu_lock, cpu, cgrp, flags, true); + _css_rstat_cpu_unlock(cpu_lock, cpu, css, flags, true); } /** @@ -213,7 +217,7 @@ static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu) struct cgroup *head = NULL, *parent, *child; unsigned long flags; - flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, root, false); + flags = _css_rstat_cpu_lock(cpu_lock, cpu, &root->self, false); /* Return NULL if this subtree is not on-list */ if (!rstatc->updated_next) @@ -250,14 +254,14 @@ static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu) if (child != root) head = cgroup_rstat_push_children(head, child, cpu); unlock_ret: - _cgroup_rstat_cpu_unlock(cpu_lock, cpu, root, flags, false); + _css_rstat_cpu_unlock(cpu_lock, cpu, &root->self, flags, false); return head; } /* * A hook for bpf stat collectors to attach to and flush their stats. - * Together with providing bpf kfuncs for cgroup_rstat_updated() and - * cgroup_rstat_flush(), this enables a complete workflow where bpf progs that + * Together with providing bpf kfuncs for css_rstat_updated() and + * css_rstat_flush(), this enables a complete workflow where bpf progs that * collect cgroup stats can integrate with rstat for efficient flushing. * * A static noinline declaration here could cause the compiler to optimize away @@ -285,9 +289,11 @@ __bpf_hook_end(); * value -1 is used when obtaining the main lock else this is the CPU * number processed last. */ -static inline void __cgroup_rstat_lock(struct cgroup *cgrp, int cpu_in_loop) +static inline void __css_rstat_lock(struct cgroup_subsys_state *css, + int cpu_in_loop) __acquires(&cgroup_rstat_lock) { + struct cgroup *cgrp = css->cgroup; bool contended; contended = !spin_trylock_irq(&cgroup_rstat_lock); @@ -298,28 +304,32 @@ static inline void __cgroup_rstat_lock(struct cgroup *cgrp, int cpu_in_loop) trace_cgroup_rstat_locked(cgrp, cpu_in_loop, contended); } -static inline void __cgroup_rstat_unlock(struct cgroup *cgrp, int cpu_in_loop) +static inline void __css_rstat_unlock(struct cgroup_subsys_state *css, + int cpu_in_loop) __releases(&cgroup_rstat_lock) { + struct cgroup *cgrp = css->cgroup; + trace_cgroup_rstat_unlock(cgrp, cpu_in_loop, false); spin_unlock_irq(&cgroup_rstat_lock); } /** - * cgroup_rstat_flush - flush stats in @cgrp's subtree - * @cgrp: target cgroup + * css_rstat_flush - flush stats in @css->cgroup's subtree + * @css: target cgroup subsystem state * - * Collect all per-cpu stats in @cgrp's subtree into the global counters + * Collect all per-cpu stats in @css->cgroup's subtree into the global counters * and propagate them upwards. After this function returns, all cgroups in * the subtree have up-to-date ->stat. 
* - * This also gets all cgroups in the subtree including @cgrp off the + * This also gets all cgroups in the subtree including @css->cgroup off the * ->updated_children lists. * * This function may block. */ -__bpf_kfunc void cgroup_rstat_flush(struct cgroup *cgrp) +__bpf_kfunc void css_rstat_flush(struct cgroup_subsys_state *css) { + struct cgroup *cgrp = css->cgroup; int cpu; might_sleep(); @@ -327,7 +337,7 @@ __bpf_kfunc void cgroup_rstat_flush(struct cgroup *cgrp) struct cgroup *pos; /* Reacquire for each CPU to avoid disabling IRQs too long */ - __cgroup_rstat_lock(cgrp, cpu); + __css_rstat_lock(css, cpu); pos = cgroup_rstat_updated_list(cgrp, cpu); for (; pos; pos = pos->rstat_flush_next) { struct cgroup_subsys_state *css; @@ -341,14 +351,15 @@ __bpf_kfunc void cgroup_rstat_flush(struct cgroup *cgrp) css->ss->css_rstat_flush(css, cpu); rcu_read_unlock(); } - __cgroup_rstat_unlock(cgrp, cpu); + __css_rstat_unlock(css, cpu); if (!cond_resched()) cpu_relax(); } } -int cgroup_rstat_init(struct cgroup *cgrp) +int css_rstat_init(struct cgroup_subsys_state *css) { + struct cgroup *cgrp = css->cgroup; int cpu; /* the root cgrp has rstat_cpu preallocated */ @@ -379,11 +390,12 @@ int cgroup_rstat_init(struct cgroup *cgrp) return 0; } -void cgroup_rstat_exit(struct cgroup *cgrp) +void css_rstat_exit(struct cgroup_subsys_state *css) { + struct cgroup *cgrp = css->cgroup; int cpu; - cgroup_rstat_flush(cgrp); + css_rstat_flush(&cgrp->self); /* sanity check */ for_each_possible_cpu(cpu) { @@ -490,7 +502,7 @@ static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp, unsigned long flags) { u64_stats_update_end_irqrestore(&rstatbc->bsync, flags); - cgroup_rstat_updated(cgrp, smp_processor_id()); + css_rstat_updated(&cgrp->self, smp_processor_id()); put_cpu_ptr(rstatbc); } @@ -592,12 +604,12 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq) struct cgroup_base_stat bstat; if (cgroup_parent(cgrp)) { - cgroup_rstat_flush(cgrp); - __cgroup_rstat_lock(cgrp, -1); + css_rstat_flush(&cgrp->self); + __css_rstat_lock(&cgrp->self, -1); bstat = cgrp->bstat; cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime, &bstat.cputime.utime, &bstat.cputime.stime); - __cgroup_rstat_unlock(cgrp, -1); + __css_rstat_unlock(&cgrp->self, -1); } else { root_cgroup_cputime(&bstat); } @@ -619,10 +631,10 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq) cgroup_force_idle_show(seq, &bstat); } -/* Add bpf kfuncs for cgroup_rstat_updated() and cgroup_rstat_flush() */ +/* Add bpf kfuncs for css_rstat_updated() and css_rstat_flush() */ BTF_KFUNCS_START(bpf_rstat_kfunc_ids) -BTF_ID_FLAGS(func, cgroup_rstat_updated) -BTF_ID_FLAGS(func, cgroup_rstat_flush, KF_SLEEPABLE) +BTF_ID_FLAGS(func, css_rstat_updated) +BTF_ID_FLAGS(func, css_rstat_flush, KF_SLEEPABLE) BTF_KFUNCS_END(bpf_rstat_kfunc_ids) static const struct btf_kfunc_id_set bpf_rstat_kfunc_set = { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 83c2df73e4b6..7152d9623cc8 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -579,7 +579,7 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val) if (!val) return; - cgroup_rstat_updated(memcg->css.cgroup, cpu); + css_rstat_updated(&memcg->css, cpu); statc = this_cpu_ptr(memcg->vmstats_percpu); for (; statc; statc = statc->parent) { stats_updates = READ_ONCE(statc->stats_updates) + abs(val); @@ -611,7 +611,7 @@ static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force) if (mem_cgroup_is_root(memcg)) WRITE_ONCE(flush_last_time, jiffies_64); - 
cgroup_rstat_flush(memcg->css.cgroup); + css_rstat_flush(&memcg->css); } /* diff --git a/tools/testing/selftests/bpf/progs/cgroup_hierarchical_stats.c b/tools/testing/selftests/bpf/progs/cgroup_hierarchical_stats.c index c74362854948..ff189a736ad8 100644 --- a/tools/testing/selftests/bpf/progs/cgroup_hierarchical_stats.c +++ b/tools/testing/selftests/bpf/progs/cgroup_hierarchical_stats.c @@ -37,8 +37,9 @@ struct { __type(value, struct attach_counter); } attach_counters SEC(".maps"); -extern void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) __ksym; -extern void cgroup_rstat_flush(struct cgroup *cgrp) __ksym; +extern void css_rstat_updated( + struct cgroup_subsys_state *css, int cpu) __ksym; +extern void css_rstat_flush(struct cgroup_subsys_state *css) __ksym; static uint64_t cgroup_id(struct cgroup *cgrp) { @@ -75,7 +76,7 @@ int BPF_PROG(counter, struct cgroup *dst_cgrp, struct task_struct *leader, else if (create_percpu_attach_counter(cg_id, 1)) return 0; - cgroup_rstat_updated(dst_cgrp, bpf_get_smp_processor_id()); + css_rstat_updated(&dst_cgrp->self, bpf_get_smp_processor_id()); return 0; } @@ -141,7 +142,7 @@ int BPF_PROG(dumper, struct bpf_iter_meta *meta, struct cgroup *cgrp) return 1; /* Flush the stats to make sure we get the most updated numbers */ - cgroup_rstat_flush(cgrp); + css_rstat_flush(&cgrp->self); total_counter = bpf_map_lookup_elem(&attach_counters, &cg_id); if (!total_counter) { -- cgit v1.2.3 From dd8a9807fa03666bff52cb28472fb227eaac36c9 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 31 Mar 2025 13:35:59 +0300 Subject: spi: Group CS related fields in struct spi_device The CS related fields are sparse in the struct spi_device. Group them. While at it, fix the comment style of cs_index_mask. Signed-off-by: Andy Shevchenko Link: https://patch.msgid.link/20250331103609.4160281-1-andriy.shevchenko@linux.intel.com Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 0ba5e49bace4..e10f0ebb8250 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -136,13 +136,6 @@ extern void spi_transfer_cs_change_delay_exec(struct spi_message *msg, * @max_speed_hz: Maximum clock rate to be used with this chip * (on this board); may be changed by the device's driver. * The spi_transfer.speed_hz can override this for each transfer. - * @chip_select: Array of physical chipselect, spi->chipselect[i] gives - * the corresponding physical CS for logical CS i. - * @mode: The spi mode defines how data is clocked out and in. - * This may be changed by the device's driver. - * The "active low" default for chipselect mode can be overridden - * (by specifying SPI_CS_HIGH) as can the "MSB first" default for - * each word in a transfer (by specifying SPI_LSB_FIRST). * @bits_per_word: Data transfers involve one or more words; word sizes * like eight or 12 bits are common. In-memory wordsizes are * powers of two bytes (e.g. 20 bit samples use 32 bits). @@ -150,6 +143,11 @@ extern void spi_transfer_cs_change_delay_exec(struct spi_message *msg, * default (0) indicating protocol words are eight bit bytes. * The spi_transfer.bits_per_word can override this for each transfer. * @rt: Make the pump thread real time priority. + * @mode: The spi mode defines how data is clocked out and in. + * This may be changed by the device's driver. 
+ * The "active low" default for chipselect mode can be overridden + * (by specifying SPI_CS_HIGH) as can the "MSB first" default for + * each word in a transfer (by specifying SPI_LSB_FIRST). * @irq: Negative, or the number passed to request_irq() to receive * interrupts from this device. * @controller_state: Controller's runtime state @@ -162,8 +160,7 @@ extern void spi_transfer_cs_change_delay_exec(struct spi_message *msg, * the device will bind to the named driver and only the named driver. * Do not set directly, because core frees it; use driver_set_override() to * set or clear it. - * @cs_gpiod: Array of GPIO descriptors of the corresponding chipselect lines - * (optional, NULL when not using a GPIO line) + * @pcpu_statistics: statistics for the spi_device * @word_delay: delay to be inserted between consecutive * words of a transfer * @cs_setup: delay to be introduced by the controller after CS is asserted @@ -171,8 +168,11 @@ extern void spi_transfer_cs_change_delay_exec(struct spi_message *msg, * @cs_inactive: delay to be introduced by the controller after CS is * deasserted. If @cs_change_delay is used from @spi_transfer, then the * two delays will be added up. - * @pcpu_statistics: statistics for the spi_device + * @chip_select: Array of physical chipselect, spi->chipselect[i] gives + * the corresponding physical CS for logical CS i. * @cs_index_mask: Bit mask of the active chipselect(s) in the chipselect array + * @cs_gpiod: Array of GPIO descriptors of the corresponding chipselect lines + * (optional, NULL when not using a GPIO line) * * A @spi_device is used to interchange data between an SPI target device * (usually a discrete chip) and CPU memory. @@ -187,7 +187,6 @@ struct spi_device { struct device dev; struct spi_controller *controller; u32 max_speed_hz; - u8 chip_select[SPI_CS_CNT_MAX]; u8 bits_per_word; bool rt; #define SPI_NO_TX BIT(31) /* No transmit wire */ @@ -218,23 +217,29 @@ struct spi_device { void *controller_data; char modalias[SPI_NAME_SIZE]; const char *driver_override; - struct gpio_desc *cs_gpiod[SPI_CS_CNT_MAX]; /* Chip select gpio desc */ + + /* The statistics */ + struct spi_statistics __percpu *pcpu_statistics; + struct spi_delay word_delay; /* Inter-word delay */ + /* CS delays */ struct spi_delay cs_setup; struct spi_delay cs_hold; struct spi_delay cs_inactive; - /* The statistics */ - struct spi_statistics __percpu *pcpu_statistics; + u8 chip_select[SPI_CS_CNT_MAX]; - /* Bit mask of the chipselect(s) that the driver need to use from - * the chipselect array.When the controller is capable to handle + /* + * Bit mask of the chipselect(s) that the driver need to use from + * the chipselect array. When the controller is capable to handle * multiple chip selects & memories are connected in parallel * then more than one bit need to be set in cs_index_mask. */ u32 cs_index_mask : SPI_CS_CNT_MAX; + struct gpio_desc *cs_gpiod[SPI_CS_CNT_MAX]; /* Chip select gpio desc */ + /* * Likely need more hooks for more protocol options affecting how * the controller talks to each chip, like: -- cgit v1.2.3 From f7b86e0e75bc234751cb7a82d888083a57ef28b2 Mon Sep 17 00:00:00 2001 From: Ashish Kalra Date: Mon, 24 Mar 2025 21:15:17 +0000 Subject: crypto: ccp - Add new SEV/SNP platform shutdown API Add new API interface to do SEV/SNP platform shutdown when KVM module is unloaded. 
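A hedged sketch of the intended caller; the exit function name is hypothetical and only shows the new API being invoked from a module unload path:

#include <linux/module.h>
#include <linux/psp-sev.h>

static void __exit example_sev_consumer_exit(void)
{
	/* No-op unless the PSP/SEV device is initialized. */
	sev_platform_shutdown();
}
module_exit(example_sev_consumer_exit);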
Reviewed-by: Dionna Glaze Reviewed-by: Tom Lendacky Signed-off-by: Ashish Kalra Signed-off-by: Herbert Xu --- drivers/crypto/ccp/sev-dev.c | 9 +++++++++ include/linux/psp-sev.h | 3 +++ 2 files changed, 12 insertions(+) (limited to 'include/linux') diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c index 6fdbb3bf44b5..671347702ae7 100644 --- a/drivers/crypto/ccp/sev-dev.c +++ b/drivers/crypto/ccp/sev-dev.c @@ -2468,6 +2468,15 @@ static void sev_firmware_shutdown(struct sev_device *sev) mutex_unlock(&sev_cmd_mutex); } +void sev_platform_shutdown(void) +{ + if (!psp_master || !psp_master->sev_data) + return; + + sev_firmware_shutdown(psp_master->sev_data); +} +EXPORT_SYMBOL_GPL(sev_platform_shutdown); + void sev_dev_destroy(struct psp_device *psp) { struct sev_device *sev = psp->sev_data; diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h index f3cad182d4ef..0b3a36bdaa90 100644 --- a/include/linux/psp-sev.h +++ b/include/linux/psp-sev.h @@ -954,6 +954,7 @@ int sev_do_cmd(int cmd, void *data, int *psp_ret); void *psp_copy_user_blob(u64 uaddr, u32 len); void *snp_alloc_firmware_page(gfp_t mask); void snp_free_firmware_page(void *addr); +void sev_platform_shutdown(void); #else /* !CONFIG_CRYPTO_DEV_SP_PSP */ @@ -988,6 +989,8 @@ static inline void *snp_alloc_firmware_page(gfp_t mask) static inline void snp_free_firmware_page(void *addr) { } +static inline void sev_platform_shutdown(void) { } + #endif /* CONFIG_CRYPTO_DEV_SP_PSP */ #endif /* __PSP_SEV_H__ */ -- cgit v1.2.3 From 1be1cd03a93339f14c8f4fe300bca321fddc6478 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 3 Apr 2025 18:59:14 +0300 Subject: gpiolib: acpi: Reduce memory footprint for struct acpi_gpio_params The line_index member in the struct acpi_gpio_params replicates what is covered in the ACPI GpioIo() or GpioInt() resource. The value there is limited to 16-bit one, so we don't really need to have a full 32-bit storage for it. Together with followed boolean the structure will be smaller. add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-3 (-3) Function old new delta acpi_gpio_property_lookup 417 414 -3 Total: Before=15361, After=15358, chg -0.02% `pahole` difference before and after: - /* size: 12, cachelines: 1, members: 3 */ - /* padding: 3 */ + /* size: 8, cachelines: 1, members: 3 */ + /* padding: 1 */ Acked-by: Mika Westerberg Link: https://lore.kernel.org/r/20250403160034.2680485-4-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko --- include/linux/gpio/consumer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h index 45b651c05b9c..899179972bec 100644 --- a/include/linux/gpio/consumer.h +++ b/include/linux/gpio/consumer.h @@ -587,7 +587,7 @@ struct gpio_desc *devm_fwnode_gpiod_get(struct device *dev, struct acpi_gpio_params { unsigned int crs_entry_index; - unsigned int line_index; + unsigned short line_index; bool active_low; }; -- cgit v1.2.3 From 5741909697a31cfb08e45d56b4211959fb791487 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 19 Mar 2025 14:01:32 +1100 Subject: VFS: improve interface for lookup_one functions The family of functions: lookup_one() lookup_one_unlocked() lookup_one_positive_unlocked() appear designed to be used by external clients of the filesystem rather than by filesystems acting on themselves as the lookup_one_len family are used. 
They are used by: btrfs/ioctl - which is a user-space interface rather than an internal activity exportfs - i.e. from nfsd or the open_by_handle_at interface overlayfs - at access the underlying filesystems smb/server - for file service They should be used by nfsd (more than just the exportfs path) and cachefs but aren't. It would help if the documentation didn't claim they should "not be called by generic code". Also the path component name is passed as "name" and "len" which are (confusingly?) separate by the "base". In some cases the len in simply "strlen" and so passing a qstr using QSTR() would make the calling clearer. Other callers do pass separate name and len which are stored in a struct. Sometimes these are already stored in a qstr, other times it easily could be. So this patch changes these three functions to receive a 'struct qstr *', and improves the documentation. QSTR_LEN() is added to make it easy to pass a QSTR containing a known len. [brauner@kernel.org: take a struct qstr pointer] Signed-off-by: NeilBrown Link: https://lore.kernel.org/r/20250319031545.2999807-2-neil@brown.name Signed-off-by: Christian Brauner --- Documentation/filesystems/porting.rst | 9 +++++++ fs/btrfs/ioctl.c | 9 +++---- fs/exportfs/expfs.c | 5 ++-- fs/namei.c | 51 +++++++++++++++-------------------- fs/overlayfs/namei.c | 12 ++++----- fs/overlayfs/overlayfs.h | 2 +- fs/overlayfs/readdir.c | 9 ++++--- fs/smb/server/smb2pdu.c | 7 ++--- include/linux/dcache.h | 3 ++- include/linux/namei.h | 9 +++---- 10 files changed, 58 insertions(+), 58 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst index 767b2927c762..57dcba6de743 100644 --- a/Documentation/filesystems/porting.rst +++ b/Documentation/filesystems/porting.rst @@ -1203,3 +1203,12 @@ should use d_drop();d_splice_alias() and return the result of the latter. If a positive dentry cannot be returned for some reason, in-kernel clients such as cachefiles, nfsd, smb/server may not perform ideally but will fail-safe. + +--- + +** mandatory** + +lookup_one(), lookup_one_unlocked(), lookup_one_positive_unlocked() now +take a qstr instead of a name and len. These, not the "one_len" +versions, should be used whenever accessing a filesystem from outside +that filesysmtem, through a mount point - which will have a mnt_idmap. 
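A minimal caller-side sketch of the qstr-based convention just documented; example_lookup_child is hypothetical and simply shows the old name/len pair being wrapped with QSTR_LEN():

#include <linux/dcache.h>
#include <linux/namei.h>

static struct dentry *example_lookup_child(struct mnt_idmap *idmap,
					   struct dentry *base,
					   const char *name, int len)
{
	/* was: lookup_one_unlocked(idmap, name, base, len); */
	return lookup_one_unlocked(idmap, &QSTR_LEN(name, len), base);
}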
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a13d81bb56a0..502303438dd7 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -909,7 +909,7 @@ static noinline int btrfs_mksubvol(const struct path *parent, if (error == -EINTR) return error; - dentry = lookup_one(idmap, name, parent->dentry, namelen); + dentry = lookup_one(idmap, &QSTR_LEN(name, namelen), parent->dentry); error = PTR_ERR(dentry); if (IS_ERR(dentry)) goto out_unlock; @@ -2288,7 +2288,6 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, struct btrfs_ioctl_vol_args_v2 *vol_args2 = NULL; struct mnt_idmap *idmap = file_mnt_idmap(file); char *subvol_name, *subvol_name_ptr = NULL; - int subvol_namelen; int ret = 0; bool destroy_parent = false; @@ -2411,10 +2410,8 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, goto out; } - subvol_namelen = strlen(subvol_name); - if (strchr(subvol_name, '/') || - strncmp(subvol_name, "..", subvol_namelen) == 0) { + strcmp(subvol_name, "..") == 0) { ret = -EINVAL; goto free_subvol_name; } @@ -2427,7 +2424,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, ret = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT); if (ret == -EINTR) goto free_subvol_name; - dentry = lookup_one(idmap, subvol_name, parent, subvol_namelen); + dentry = lookup_one(idmap, &QSTR(subvol_name), parent); if (IS_ERR(dentry)) { ret = PTR_ERR(dentry); goto out_unlock_dir; diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 128dd092916b..f0ede3e81cf7 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -143,7 +143,7 @@ static struct dentry *reconnect_one(struct vfsmount *mnt, if (err) goto out_err; dprintk("%s: found name: %s\n", __func__, nbuf); - tmp = lookup_one_unlocked(mnt_idmap(mnt), nbuf, parent, strlen(nbuf)); + tmp = lookup_one_unlocked(mnt_idmap(mnt), &QSTR(nbuf), parent); if (IS_ERR(tmp)) { dprintk("lookup failed: %ld\n", PTR_ERR(tmp)); err = PTR_ERR(tmp); @@ -549,8 +549,7 @@ exportfs_decode_fh_raw(struct vfsmount *mnt, struct fid *fid, int fh_len, } inode_lock(target_dir->d_inode); - nresult = lookup_one(mnt_idmap(mnt), nbuf, - target_dir, strlen(nbuf)); + nresult = lookup_one(mnt_idmap(mnt), &QSTR(nbuf), target_dir); if (!IS_ERR(nresult)) { if (unlikely(nresult->d_inode != result->d_inode)) { dput(nresult); diff --git a/fs/namei.c b/fs/namei.c index 360a86ca1f02..e499fb609271 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2922,19 +2922,17 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) EXPORT_SYMBOL(lookup_one_len); /** - * lookup_one - filesystem helper to lookup single pathname component + * lookup_one - lookup single pathname component * @idmap: idmap of the mount the lookup is performed from - * @name: pathname component to lookup + * @name: qstr holding pathname component to lookup * @base: base directory to lookup from - * @len: maximum length @len should be interpreted to * - * Note that this routine is purely a helper for filesystem usage and should - * not be called by generic code. + * This can be used for in-kernel filesystem clients such as file servers. * * The caller must hold base->i_mutex. 
*/ -struct dentry *lookup_one(struct mnt_idmap *idmap, const char *name, - struct dentry *base, int len) +struct dentry *lookup_one(struct mnt_idmap *idmap, struct qstr *name, + struct dentry *base) { struct dentry *dentry; struct qstr this; @@ -2942,7 +2940,7 @@ struct dentry *lookup_one(struct mnt_idmap *idmap, const char *name, WARN_ON_ONCE(!inode_is_locked(base->d_inode)); - err = lookup_one_common(idmap, name, base, len, &this); + err = lookup_one_common(idmap, name->name, base, name->len, &this); if (err) return ERR_PTR(err); @@ -2952,27 +2950,24 @@ struct dentry *lookup_one(struct mnt_idmap *idmap, const char *name, EXPORT_SYMBOL(lookup_one); /** - * lookup_one_unlocked - filesystem helper to lookup single pathname component + * lookup_one_unlocked - lookup single pathname component * @idmap: idmap of the mount the lookup is performed from - * @name: pathname component to lookup + * @name: qstr olding pathname component to lookup * @base: base directory to lookup from - * @len: maximum length @len should be interpreted to * - * Note that this routine is purely a helper for filesystem usage and should - * not be called by generic code. + * This can be used for in-kernel filesystem clients such as file servers. * * Unlike lookup_one_len, it should be called without the parent - * i_mutex held, and will take the i_mutex itself if necessary. + * i_rwsem held, and will take the i_rwsem itself if necessary. */ struct dentry *lookup_one_unlocked(struct mnt_idmap *idmap, - const char *name, struct dentry *base, - int len) + struct qstr *name, struct dentry *base) { struct qstr this; int err; struct dentry *ret; - err = lookup_one_common(idmap, name, base, len, &this); + err = lookup_one_common(idmap, name->name, base, name->len, &this); if (err) return ERR_PTR(err); @@ -2984,12 +2979,10 @@ struct dentry *lookup_one_unlocked(struct mnt_idmap *idmap, EXPORT_SYMBOL(lookup_one_unlocked); /** - * lookup_one_positive_unlocked - filesystem helper to lookup single - * pathname component + * lookup_one_positive_unlocked - lookup single pathname component * @idmap: idmap of the mount the lookup is performed from - * @name: pathname component to lookup + * @name: qstr holding pathname component to lookup * @base: base directory to lookup from - * @len: maximum length @len should be interpreted to * * This helper will yield ERR_PTR(-ENOENT) on negatives. The helper returns * known positive or ERR_PTR(). This is what most of the users want. @@ -2998,16 +2991,15 @@ EXPORT_SYMBOL(lookup_one_unlocked); * time, so callers of lookup_one_unlocked() need to be very careful; pinned * positives have >d_inode stable, so this one avoids such problems. * - * Note that this routine is purely a helper for filesystem usage and should - * not be called by generic code. + * This can be used for in-kernel filesystem clients such as file servers. * - * The helper should be called without i_mutex held. + * The helper should be called without i_rwsem held. 
*/ struct dentry *lookup_one_positive_unlocked(struct mnt_idmap *idmap, - const char *name, - struct dentry *base, int len) + struct qstr *name, + struct dentry *base) { - struct dentry *ret = lookup_one_unlocked(idmap, name, base, len); + struct dentry *ret = lookup_one_unlocked(idmap, name, base); if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) { dput(ret); @@ -3032,7 +3024,7 @@ EXPORT_SYMBOL(lookup_one_positive_unlocked); struct dentry *lookup_one_len_unlocked(const char *name, struct dentry *base, int len) { - return lookup_one_unlocked(&nop_mnt_idmap, name, base, len); + return lookup_one_unlocked(&nop_mnt_idmap, &QSTR_LEN(name, len), base); } EXPORT_SYMBOL(lookup_one_len_unlocked); @@ -3047,7 +3039,8 @@ EXPORT_SYMBOL(lookup_one_len_unlocked); struct dentry *lookup_positive_unlocked(const char *name, struct dentry *base, int len) { - return lookup_one_positive_unlocked(&nop_mnt_idmap, name, base, len); + return lookup_one_positive_unlocked(&nop_mnt_idmap, + &QSTR_LEN(name, len), base); } EXPORT_SYMBOL(lookup_positive_unlocked); diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index be5c65d6f848..ac6e893e846a 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -205,8 +205,8 @@ static struct dentry *ovl_lookup_positive_unlocked(struct ovl_lookup_data *d, struct dentry *base, int len, bool drop_negative) { - struct dentry *ret = lookup_one_unlocked(mnt_idmap(d->layer->mnt), name, - base, len); + struct dentry *ret = lookup_one_unlocked(mnt_idmap(d->layer->mnt), + &QSTR_LEN(name, len), base); if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) { if (drop_negative && ret->d_lockref.count == 1) { @@ -789,8 +789,8 @@ struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper, if (err) return ERR_PTR(err); - index = lookup_one_positive_unlocked(ovl_upper_mnt_idmap(ofs), name.name, - ofs->workdir, name.len); + index = lookup_one_positive_unlocked(ovl_upper_mnt_idmap(ofs), &name, + ofs->workdir); if (IS_ERR(index)) { err = PTR_ERR(index); if (err == -ENOENT) { @@ -1371,7 +1371,7 @@ out: bool ovl_lower_positive(struct dentry *dentry) { struct ovl_entry *poe = OVL_E(dentry->d_parent); - const struct qstr *name = &dentry->d_name; + struct qstr *name = &dentry->d_name; const struct cred *old_cred; unsigned int i; bool positive = false; @@ -1396,7 +1396,7 @@ bool ovl_lower_positive(struct dentry *dentry) this = lookup_one_positive_unlocked( mnt_idmap(parentpath->layer->mnt), - name->name, parentpath->dentry, name->len); + name, parentpath->dentry); if (IS_ERR(this)) { switch (PTR_ERR(this)) { case -ENOENT: diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 6f2f8f4cfbbc..0db8d415c6d8 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -402,7 +402,7 @@ static inline struct dentry *ovl_lookup_upper(struct ovl_fs *ofs, const char *name, struct dentry *base, int len) { - return lookup_one(ovl_upper_mnt_idmap(ofs), name, base, len); + return lookup_one(ovl_upper_mnt_idmap(ofs), &QSTR_LEN(name, len), base); } static inline bool ovl_open_flags_need_copy_up(int flags) diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 881ec5592da5..2fa450a7854d 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -271,7 +271,6 @@ static bool ovl_fill_merge(struct dir_context *ctx, const char *name, static int ovl_check_whiteouts(const struct path *path, struct ovl_readdir_data *rdd) { int err; - struct ovl_cache_entry *p; struct dentry *dentry, *dir = path->dentry; const struct 
cred *old_cred; @@ -280,9 +279,11 @@ static int ovl_check_whiteouts(const struct path *path, struct ovl_readdir_data err = down_write_killable(&dir->d_inode->i_rwsem); if (!err) { while (rdd->first_maybe_whiteout) { - p = rdd->first_maybe_whiteout; + struct ovl_cache_entry *p = + rdd->first_maybe_whiteout; rdd->first_maybe_whiteout = p->next_maybe_whiteout; - dentry = lookup_one(mnt_idmap(path->mnt), p->name, dir, p->len); + dentry = lookup_one(mnt_idmap(path->mnt), + &QSTR_LEN(p->name, p->len), dir); if (!IS_ERR(dentry)) { p->is_whiteout = ovl_is_whiteout(dentry); dput(dentry); @@ -492,7 +493,7 @@ static int ovl_cache_update(const struct path *path, struct ovl_cache_entry *p, } } /* This checks also for xwhiteouts */ - this = lookup_one(mnt_idmap(path->mnt), p->name, dir, p->len); + this = lookup_one(mnt_idmap(path->mnt), &QSTR_LEN(p->name, p->len), dir); if (IS_ERR_OR_NULL(this) || !this->d_inode) { /* Mark a stale entry */ p->is_whiteout = true; diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index d24d95d15d87..0894128b9266 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -4117,9 +4117,10 @@ static int process_query_dir_entries(struct smb2_query_dir_private *priv) return -EINVAL; lock_dir(priv->dir_fp); - dent = lookup_one(idmap, priv->d_info->name, - priv->dir_fp->filp->f_path.dentry, - priv->d_info->name_len); + dent = lookup_one(idmap, + &QSTR_LEN(priv->d_info->name, + priv->d_info->name_len), + priv->dir_fp->filp->f_path.dentry); unlock_dir(priv->dir_fp); if (IS_ERR(dent)) { diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 8d1395f945bf..e974e63bcdbc 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -57,7 +57,8 @@ struct qstr { }; #define QSTR_INIT(n,l) { { { .len = l } }, .name = n } -#define QSTR(n) (struct qstr)QSTR_INIT(n, strlen(n)) +#define QSTR_LEN(n,l) (struct qstr)QSTR_INIT(n,l) +#define QSTR(n) QSTR_LEN(n, strlen(n)) extern const struct qstr empty_name; extern const struct qstr slash_name; diff --git a/include/linux/namei.h b/include/linux/namei.h index e3042176cdf4..ba02304a2a8a 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -73,13 +73,12 @@ extern struct dentry *try_lookup_one_len(const char *, struct dentry *, int); extern struct dentry *lookup_one_len(const char *, struct dentry *, int); extern struct dentry *lookup_one_len_unlocked(const char *, struct dentry *, int); extern struct dentry *lookup_positive_unlocked(const char *, struct dentry *, int); -struct dentry *lookup_one(struct mnt_idmap *, const char *, struct dentry *, int); +struct dentry *lookup_one(struct mnt_idmap *, struct qstr *, struct dentry *); struct dentry *lookup_one_unlocked(struct mnt_idmap *idmap, - const char *name, struct dentry *base, - int len); + struct qstr *name, struct dentry *base); struct dentry *lookup_one_positive_unlocked(struct mnt_idmap *idmap, - const char *name, - struct dentry *base, int len); + struct qstr *name, + struct dentry *base); extern int follow_down_one(struct path *); extern int follow_down(struct path *path, unsigned int flags); -- cgit v1.2.3 From 0a02e1f4a54ace747304687ced3b76d159e58914 Mon Sep 17 00:00:00 2001 From: Yixun Lan Date: Wed, 26 Mar 2025 06:06:19 +0800 Subject: irqdomain: Support three-cell scheme interrupts Add new function *_twothreecell() to extend support to parse three-cell interrupts which encoded as , the translate function will retrieve irq number and flag from last two cells. 
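As a hedged sketch (the ops instance below is hypothetical), a driver that accepts either encoding could plug the new translate helper straight into its domain ops:

#include <linux/irqdomain.h>

static const struct irq_domain_ops example_gpio_irq_domain_ops = {
	/* Handles both two-cell and three-cell interrupt specifiers. */
	.translate	= irq_domain_translate_twothreecell,
	/* .alloc, .free and friends omitted from this sketch. */
};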
This API will be used in gpio irq driver which need to work with two or three cells cases. Signed-off-by: Yixun Lan Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250326-04-gpio-irq-threecell-v3-1-aab006ab0e00@gentoo.org --- include/linux/irqdomain.h | 20 ++++++++--------- kernel/irq/irqdomain.c | 56 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index bb7111105296..df7e9278c8ac 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -571,16 +571,16 @@ int irq_domain_xlate_twocell(struct irq_domain *d, struct device_node *ctrlr, int irq_domain_xlate_onetwocell(struct irq_domain *d, struct device_node *ctrlr, const u32 *intspec, unsigned int intsize, irq_hw_number_t *out_hwirq, unsigned int *out_type); - -int irq_domain_translate_twocell(struct irq_domain *d, - struct irq_fwspec *fwspec, - unsigned long *out_hwirq, - unsigned int *out_type); - -int irq_domain_translate_onecell(struct irq_domain *d, - struct irq_fwspec *fwspec, - unsigned long *out_hwirq, - unsigned int *out_type); +int irq_domain_xlate_twothreecell(struct irq_domain *d, struct device_node *ctrlr, + const u32 *intspec, unsigned int intsize, + irq_hw_number_t *out_hwirq, unsigned int *out_type); + +int irq_domain_translate_onecell(struct irq_domain *d, struct irq_fwspec *fwspec, + unsigned long *out_hwirq, unsigned int *out_type); +int irq_domain_translate_twocell(struct irq_domain *d, struct irq_fwspec *fwspec, + unsigned long *out_hwirq, unsigned int *out_type); +int irq_domain_translate_twothreecell(struct irq_domain *d, struct irq_fwspec *fwspec, + unsigned long *out_hwirq, unsigned int *out_type); /* IPI functions */ int irq_reserve_ipi(struct irq_domain *domain, const struct cpumask *dest); diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 9d5c8651492d..b294c3ff73b6 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1132,6 +1132,31 @@ int irq_domain_xlate_twocell(struct irq_domain *d, struct device_node *ctrlr, } EXPORT_SYMBOL_GPL(irq_domain_xlate_twocell); +/** + * irq_domain_xlate_twothreecell() - Generic xlate for direct two or three cell bindings + * @d: Interrupt domain involved in the translation + * @ctrlr: The device tree node for the device whose interrupt is translated + * @intspec: The interrupt specifier data from the device tree + * @intsize: The number of entries in @intspec + * @out_hwirq: Pointer to storage for the hardware interrupt number + * @out_type: Pointer to storage for the interrupt type + * + * Device Tree interrupt specifier translation function for two or three + * cell bindings, where the cell values map directly to the hardware + * interrupt number and the type specifier. 
+ */ +int irq_domain_xlate_twothreecell(struct irq_domain *d, struct device_node *ctrlr, + const u32 *intspec, unsigned int intsize, + irq_hw_number_t *out_hwirq, unsigned int *out_type) +{ + struct irq_fwspec fwspec; + + of_phandle_args_to_fwspec(ctrlr, intspec, intsize, &fwspec); + + return irq_domain_translate_twothreecell(d, &fwspec, out_hwirq, out_type); +} +EXPORT_SYMBOL_GPL(irq_domain_xlate_twothreecell); + /** * irq_domain_xlate_onetwocell() - Generic xlate for one or two cell bindings * @d: Interrupt domain involved in the translation @@ -1216,6 +1241,37 @@ int irq_domain_translate_twocell(struct irq_domain *d, } EXPORT_SYMBOL_GPL(irq_domain_translate_twocell); +/** + * irq_domain_translate_twothreecell() - Generic translate for direct two or three cell + * bindings + * @d: Interrupt domain involved in the translation + * @fwspec: The firmware interrupt specifier to translate + * @out_hwirq: Pointer to storage for the hardware interrupt number + * @out_type: Pointer to storage for the interrupt type + * + * Firmware interrupt specifier translation function for two or three cell + * specifications, where the parameter values map directly to the hardware + * interrupt number and the type specifier. + */ +int irq_domain_translate_twothreecell(struct irq_domain *d, struct irq_fwspec *fwspec, + unsigned long *out_hwirq, unsigned int *out_type) +{ + if (fwspec->param_count == 2) { + *out_hwirq = fwspec->param[0]; + *out_type = fwspec->param[1] & IRQ_TYPE_SENSE_MASK; + return 0; + } + + if (fwspec->param_count == 3) { + *out_hwirq = fwspec->param[1]; + *out_type = fwspec->param[2] & IRQ_TYPE_SENSE_MASK; + return 0; + } + + return -EINVAL; +} +EXPORT_SYMBOL_GPL(irq_domain_translate_twothreecell); + int irq_domain_alloc_descs(int virq, unsigned int cnt, irq_hw_number_t hwirq, int node, const struct irq_affinity_desc *affinity) { -- cgit v1.2.3 From 7b73c12c6ebf006ad496f0e38a605d92dfe05157 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Apr 2025 15:59:59 +0100 Subject: shmem: Add shmem_writeout() This will be the replacement for shmem_writepage(). 
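A hedged usage sketch; example_shmem_swap_out is hypothetical and assumes the folio is already locked and dirty, as for the old ->writepage path:

#include <linux/shmem_fs.h>
#include <linux/writeback.h>

static int example_shmem_swap_out(struct folio *folio)
{
	struct writeback_control wbc = {
		.sync_mode	= WB_SYNC_NONE,
		.nr_to_write	= 1,
	};

	/* Moves the folio from the page cache to the swap cache. */
	return shmem_writeout(folio, &wbc);
}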
Signed-off-by: "Matthew Wilcox (Oracle)" Link: https://lore.kernel.org/r/20250402150005.2309458-6-willy@infradead.org Reviewed-by: Baolin Wang Signed-off-by: Christian Brauner --- include/linux/shmem_fs.h | 7 ++++--- mm/shmem.c | 20 ++++++++++++++------ 2 files changed, 18 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 0b273a7b9f01..5f03a39a26f7 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -104,10 +104,11 @@ static inline bool shmem_mapping(struct address_space *mapping) return false; } #endif /* CONFIG_SHMEM */ -extern void shmem_unlock_mapping(struct address_space *mapping); -extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, +void shmem_unlock_mapping(struct address_space *mapping); +struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp_mask); -extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end); +int shmem_writeout(struct folio *folio, struct writeback_control *wbc); +void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end); int shmem_unuse(unsigned int type); #ifdef CONFIG_TRANSPARENT_HUGEPAGE diff --git a/mm/shmem.c b/mm/shmem.c index 99327c30507c..7d377ceae035 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1536,12 +1536,20 @@ int shmem_unuse(unsigned int type) return error; } -/* - * Move the page from the page cache to the swap cache. - */ static int shmem_writepage(struct page *page, struct writeback_control *wbc) { - struct folio *folio = page_folio(page); + return shmem_writeout(page_folio(page), wbc); +} + +/** + * shmem_writeout - Write the folio to swap + * @folio: The folio to write + * @wbc: How writeback is to be done + * + * Move the folio from the page cache to the swap cache. + */ +int shmem_writeout(struct folio *folio, struct writeback_control *wbc) +{ struct address_space *mapping = folio->mapping; struct inode *inode = mapping->host; struct shmem_inode_info *info = SHMEM_I(inode); @@ -1586,9 +1594,8 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) try_split: /* Ensure the subpages are still dirty */ folio_test_set_dirty(folio); - if (split_huge_page_to_list_to_order(page, wbc->list, 0)) + if (split_folio_to_list(folio, wbc->list)) goto redirty; - folio = page_folio(page); folio_clear_dirty(folio); } @@ -1660,6 +1667,7 @@ redirty: folio_unlock(folio); return 0; } +EXPORT_SYMBOL_GPL(shmem_writeout); #if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS) static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) -- cgit v1.2.3 From 6b0dfabb35550cb7b0808585dea6c24971d685d3 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Apr 2025 16:00:03 +0100 Subject: fs: Remove aops->writepage All callers and implementations are now removed, so remove the operation and update the documentation to match. 
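With ->writepage gone, a filesystem's address_space_operations describe writeback through ->writepages alone. A minimal hedged sketch, where the example_* callbacks are hypothetical stubs:

#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>

static int example_read_folio(struct file *file, struct folio *folio)
{
	folio_unlock(folio);		/* stub for the sketch */
	return 0;
}

static int example_writepages(struct address_space *mapping,
			      struct writeback_control *wbc)
{
	return 0;			/* stub for the sketch */
}

static const struct address_space_operations example_aops = {
	.read_folio	= example_read_folio,
	.writepages	= example_writepages,
	.dirty_folio	= filemap_dirty_folio,
};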
Signed-off-by: "Matthew Wilcox (Oracle)" Link: https://lore.kernel.org/r/20250402150005.2309458-10-willy@infradead.org Signed-off-by: Christian Brauner --- Documentation/admin-guide/cgroup-v2.rst | 2 +- Documentation/filesystems/fscrypt.rst | 2 +- Documentation/filesystems/locking.rst | 54 ++------------------------------- Documentation/filesystems/vfs.rst | 39 ++++++------------------ fs/buffer.c | 4 +-- include/linux/fs.h | 1 - mm/vmscan.c | 1 - 7 files changed, 15 insertions(+), 88 deletions(-) (limited to 'include/linux') diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 1a16ce68a4d7..9e7de8e70048 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -3019,7 +3019,7 @@ Filesystem Support for Writeback -------------------------------- A filesystem can support cgroup writeback by updating -address_space_operations->writepage[s]() to annotate bio's using the +address_space_operations->writepages() to annotate bio's using the following two functions. wbc_init_bio(@wbc, @bio) diff --git a/Documentation/filesystems/fscrypt.rst b/Documentation/filesystems/fscrypt.rst index e80329908549..3d22e2db732d 100644 --- a/Documentation/filesystems/fscrypt.rst +++ b/Documentation/filesystems/fscrypt.rst @@ -1409,7 +1409,7 @@ read the ciphertext into the page cache and decrypt it in-place. The folio lock must be held until decryption has finished, to prevent the folio from becoming visible to userspace prematurely. -For the write path (->writepage()) of regular files, filesystems +For the write path (->writepages()) of regular files, filesystems cannot encrypt data in-place in the page cache, since the cached plaintext must be preserved. Instead, filesystems must encrypt into a temporary buffer or "bounce page", then write out the temporary diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index 0ec0bb6eb0fb..2e567e341c3b 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -249,7 +249,6 @@ address_space_operations ======================== prototypes:: - int (*writepage)(struct page *page, struct writeback_control *wbc); int (*read_folio)(struct file *, struct folio *); int (*writepages)(struct address_space *, struct writeback_control *); bool (*dirty_folio)(struct address_space *, struct folio *folio); @@ -280,7 +279,6 @@ locking rules: ====================== ======================== ========= =============== ops folio locked i_rwsem invalidate_lock ====================== ======================== ========= =============== -writepage: yes, unlocks (see below) read_folio: yes, unlocks shared writepages: dirty_folio: maybe @@ -309,54 +307,6 @@ completion. ->readahead() unlocks the folios that I/O is attempted on like ->read_folio(). -->writepage() is used for two purposes: for "memory cleansing" and for -"sync". These are quite different operations and the behaviour may differ -depending upon the mode. - -If writepage is called for sync (wbc->sync_mode != WBC_SYNC_NONE) then -it *must* start I/O against the page, even if that would involve -blocking on in-progress I/O. - -If writepage is called for memory cleansing (sync_mode == -WBC_SYNC_NONE) then its role is to get as much writeout underway as -possible. So writepage should try to avoid blocking against -currently-in-progress I/O. 
- -If the filesystem is not called for "sync" and it determines that it -would need to block against in-progress I/O to be able to start new I/O -against the page the filesystem should redirty the page with -redirty_page_for_writepage(), then unlock the page and return zero. -This may also be done to avoid internal deadlocks, but rarely. - -If the filesystem is called for sync then it must wait on any -in-progress I/O and then start new I/O. - -The filesystem should unlock the page synchronously, before returning to the -caller, unless ->writepage() returns special WRITEPAGE_ACTIVATE -value. WRITEPAGE_ACTIVATE means that page cannot really be written out -currently, and VM should stop calling ->writepage() on this page for some -time. VM does this by moving page to the head of the active list, hence the -name. - -Unless the filesystem is going to redirty_page_for_writepage(), unlock the page -and return zero, writepage *must* run set_page_writeback() against the page, -followed by unlocking it. Once set_page_writeback() has been run against the -page, write I/O can be submitted and the write I/O completion handler must run -end_page_writeback() once the I/O is complete. If no I/O is submitted, the -filesystem must run end_page_writeback() against the page before returning from -writepage. - -That is: after 2.5.12, pages which are under writeout are *not* locked. Note, -if the filesystem needs the page to be locked during writeout, that is ok, too, -the page is allowed to be unlocked at any point in time between the calls to -set_page_writeback() and end_page_writeback(). - -Note, failure to run either redirty_page_for_writepage() or the combination of -set_page_writeback()/end_page_writeback() on a page submitted to writepage -will leave the page itself marked clean but it will be tagged as dirty in the -radix tree. This incoherency can lead to all sorts of hard-to-debug problems -in the filesystem like having dirty inodes at umount and losing written data. - ->writepages() is used for periodic writeback and for syscall-initiated sync operations. The address_space should start I/O against at least ``*nr_to_write`` pages. ``*nr_to_write`` must be decremented for each page @@ -364,8 +314,8 @@ which is written. The address_space implementation may write more (or less) pages than ``*nr_to_write`` asks for, but it should try to be reasonably close. If nr_to_write is NULL, all dirty pages must be written. -writepages should _only_ write pages which are present on -mapping->io_pages. +writepages should _only_ write pages which are present in +mapping->i_pages. ->dirty_folio() is called from various places in the kernel when the target folio is marked as needing writeback. The folio cannot be diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst index ae79c30b6c0c..bf051c7da6b8 100644 --- a/Documentation/filesystems/vfs.rst +++ b/Documentation/filesystems/vfs.rst @@ -716,9 +716,8 @@ page lookup by address, and keeping track of pages tagged as Dirty or Writeback. The first can be used independently to the others. The VM can try to -either write dirty pages in order to clean them, or release clean pages -in order to reuse them. To do this it can call the ->writepage method -on dirty pages, and ->release_folio on clean folios with the private +release clean pages in order to reuse them. To do this it can call +->release_folio on clean folios with the private flag set. 
Clean pages without PagePrivate and with no external references will be released without notice being given to the address_space. @@ -731,8 +730,8 @@ maintains information about the PG_Dirty and PG_Writeback status of each page, so that pages with either of these flags can be found quickly. The Dirty tag is primarily used by mpage_writepages - the default -->writepages method. It uses the tag to find dirty pages to call -->writepage on. If mpage_writepages is not used (i.e. the address +->writepages method. It uses the tag to find dirty pages to +write back. If mpage_writepages is not used (i.e. the address provides its own ->writepages) , the PAGECACHE_TAG_DIRTY tag is almost unused. write_inode_now and sync_inode do use it (through __sync_single_inode) to check if ->writepages has been successful in @@ -756,23 +755,23 @@ pages, however the address_space has finer control of write sizes. The read process essentially only requires 'read_folio'. The write process is more complicated and uses write_begin/write_end or -dirty_folio to write data into the address_space, and writepage and +dirty_folio to write data into the address_space, and writepages to writeback data to storage. Adding and removing pages to/from an address_space is protected by the inode's i_mutex. When data is written to a page, the PG_Dirty flag should be set. It -typically remains set until writepage asks for it to be written. This +typically remains set until writepages asks for it to be written. This should clear PG_Dirty and set PG_Writeback. It can be actually written at any point after PG_Dirty is clear. Once it is known to be safe, PG_Writeback is cleared. Writeback makes use of a writeback_control structure to direct the -operations. This gives the writepage and writepages operations some +operations. This gives the writepages operation some information about the nature of and reason for the writeback request, and the constraints under which it is being done. It is also used to -return information back to the caller about the result of a writepage or +return information back to the caller about the result of a writepages request. @@ -819,7 +818,6 @@ cache in your filesystem. The following members are defined: .. code-block:: c struct address_space_operations { - int (*writepage)(struct page *page, struct writeback_control *wbc); int (*read_folio)(struct file *, struct folio *); int (*writepages)(struct address_space *, struct writeback_control *); bool (*dirty_folio)(struct address_space *, struct folio *); @@ -848,25 +846,6 @@ cache in your filesystem. The following members are defined: int (*swap_rw)(struct kiocb *iocb, struct iov_iter *iter); }; -``writepage`` - called by the VM to write a dirty page to backing store. This - may happen for data integrity reasons (i.e. 'sync'), or to free - up memory (flush). The difference can be seen in - wbc->sync_mode. The PG_Dirty flag has been cleared and - PageLocked is true. writepage should start writeout, should set - PG_Writeback, and should make sure the page is unlocked, either - synchronously or asynchronously when the write operation - completes. - - If wbc->sync_mode is WB_SYNC_NONE, ->writepage doesn't have to - try too hard if there are problems, and may choose to write out - other pages from the mapping if that is easier (e.g. due to - internal dependencies). If it chooses not to start writeout, it - should return AOP_WRITEPAGE_ACTIVATE so that the VM will not - keep calling ->writepage on that page. - - See the file "Locking" for more details. 
- ``read_folio`` Called by the page cache to read a folio from the backing store. The 'file' argument supplies authentication information to network @@ -909,7 +888,7 @@ cache in your filesystem. The following members are defined: given and that many pages should be written if possible. If no ->writepages is given, then mpage_writepages is used instead. This will choose pages from the address space that are tagged as - DIRTY and will pass them to ->writepage. + DIRTY and will write them back. ``dirty_folio`` called by the VM to mark a folio as dirty. This is particularly diff --git a/fs/buffer.c b/fs/buffer.c index c7abb4a029dc..b99dc69dba37 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2695,7 +2695,7 @@ unlock: EXPORT_SYMBOL(block_truncate_page); /* - * The generic ->writepage function for buffer-backed address_spaces + * The generic write folio function for buffer-backed address_spaces */ int block_write_full_folio(struct folio *folio, struct writeback_control *wbc, void *get_block) @@ -2715,7 +2715,7 @@ int block_write_full_folio(struct folio *folio, struct writeback_control *wbc, /* * The folio straddles i_size. It must be zeroed out on each and every - * writepage invocation because it may be mmapped. "A file is mapped + * writeback invocation because it may be mmapped. "A file is mapped * in multiples of the page size. For a file that is not a multiple of * the page size, the remaining memory is zeroed when mapped, and * writes to that region are not written out to the file." diff --git a/include/linux/fs.h b/include/linux/fs.h index 016b0fe1536e..f7beefb917a8 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -433,7 +433,6 @@ static inline bool is_sync_kiocb(struct kiocb *kiocb) } struct address_space_operations { - int (*writepage)(struct page *page, struct writeback_control *wbc); int (*read_folio)(struct file *, struct folio *); /* Write back some dirty pages from this mapping. */ diff --git a/mm/vmscan.c b/mm/vmscan.c index d172c998d592..6d56f4816171 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -648,7 +648,6 @@ typedef enum { /* * pageout is called by shrink_folio_list() for each dirty folio. - * Calls ->writepage(). */ static pageout_t pageout(struct folio *folio, struct address_space *mapping, struct swap_iocb **plug, struct list_head *folio_list) -- cgit v1.2.3 From 559b3bbfa978ce3b23dc9c52d09a0eddca52c439 Mon Sep 17 00:00:00 2001 From: James Bottomley Date: Thu, 27 Mar 2025 10:06:10 -0400 Subject: locking/percpu-rwsem: add freezable alternative to down_read Percpu-rwsems are used for superblock locking. However, we know the read percpu-rwsem we take for sb_start_write() on a frozen filesystem needs not to inhibit system from suspending or hibernating. That means it needs to wait with TASK_UNINTERRUPTIBLE | TASK_FREEZABLE. Introduce a new percpu_down_read_freezable() that allows us to control whether TASK_FREEZABLE is added to the wait flags. 
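A hedged sketch of the read side; the semaphore and function names are hypothetical, and writers are unaffected and keep using percpu_down_write()/percpu_up_write():

#include <linux/percpu-rwsem.h>

static DEFINE_STATIC_PERCPU_RWSEM(example_rwsem);

static void example_reader(void)
{
	/* May wait in TASK_UNINTERRUPTIBLE | TASK_FREEZABLE while a writer holds the lock. */
	percpu_down_read_freezable(&example_rwsem, true);
	/* ... read-side critical section ... */
	percpu_up_read(&example_rwsem);
}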
Signed-off-by: James Bottomley Link: https://lore.kernel.org/r/20250327140613.25178-2-James.Bottomley@HansenPartnership.com Signed-off-by: Christian Brauner --- include/linux/percpu-rwsem.h | 20 ++++++++++++++++---- kernel/locking/percpu-rwsem.c | 13 ++++++++----- 2 files changed, 24 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h index af7d75ede619..288f5235649a 100644 --- a/include/linux/percpu-rwsem.h +++ b/include/linux/percpu-rwsem.h @@ -43,9 +43,10 @@ is_static struct percpu_rw_semaphore name = { \ #define DEFINE_STATIC_PERCPU_RWSEM(name) \ __DEFINE_PERCPU_RWSEM(name, static) -extern bool __percpu_down_read(struct percpu_rw_semaphore *, bool); +extern bool __percpu_down_read(struct percpu_rw_semaphore *, bool, bool); -static inline void percpu_down_read(struct percpu_rw_semaphore *sem) +static inline void percpu_down_read_internal(struct percpu_rw_semaphore *sem, + bool freezable) { might_sleep(); @@ -63,7 +64,7 @@ static inline void percpu_down_read(struct percpu_rw_semaphore *sem) if (likely(rcu_sync_is_idle(&sem->rss))) this_cpu_inc(*sem->read_count); else - __percpu_down_read(sem, false); /* Unconditional memory barrier */ + __percpu_down_read(sem, false, freezable); /* Unconditional memory barrier */ /* * The preempt_enable() prevents the compiler from * bleeding the critical section out. @@ -71,6 +72,17 @@ static inline void percpu_down_read(struct percpu_rw_semaphore *sem) preempt_enable(); } +static inline void percpu_down_read(struct percpu_rw_semaphore *sem) +{ + percpu_down_read_internal(sem, false); +} + +static inline void percpu_down_read_freezable(struct percpu_rw_semaphore *sem, + bool freeze) +{ + percpu_down_read_internal(sem, freeze); +} + static inline bool percpu_down_read_trylock(struct percpu_rw_semaphore *sem) { bool ret = true; @@ -82,7 +94,7 @@ static inline bool percpu_down_read_trylock(struct percpu_rw_semaphore *sem) if (likely(rcu_sync_is_idle(&sem->rss))) this_cpu_inc(*sem->read_count); else - ret = __percpu_down_read(sem, true); /* Unconditional memory barrier */ + ret = __percpu_down_read(sem, true, false); /* Unconditional memory barrier */ preempt_enable(); /* * The barrier() from preempt_enable() prevents the compiler from diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index d6964fc29f51..ef234469baac 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -138,7 +138,8 @@ static int percpu_rwsem_wake_function(struct wait_queue_entry *wq_entry, return !reader; /* wake (readers until) 1 writer */ } -static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader) +static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader, + bool freeze) { DEFINE_WAIT_FUNC(wq_entry, percpu_rwsem_wake_function); bool wait; @@ -156,7 +157,8 @@ static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader) spin_unlock_irq(&sem->waiters.lock); while (wait) { - set_current_state(TASK_UNINTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE | + (freeze ? 
TASK_FREEZABLE : 0)); if (!smp_load_acquire(&wq_entry.private)) break; schedule(); @@ -164,7 +166,8 @@ static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader) __set_current_state(TASK_RUNNING); } -bool __sched __percpu_down_read(struct percpu_rw_semaphore *sem, bool try) +bool __sched __percpu_down_read(struct percpu_rw_semaphore *sem, bool try, + bool freeze) { if (__percpu_down_read_trylock(sem)) return true; @@ -174,7 +177,7 @@ bool __sched __percpu_down_read(struct percpu_rw_semaphore *sem, bool try) trace_contention_begin(sem, LCB_F_PERCPU | LCB_F_READ); preempt_enable(); - percpu_rwsem_wait(sem, /* .reader = */ true); + percpu_rwsem_wait(sem, /* .reader = */ true, freeze); preempt_disable(); trace_contention_end(sem, 0); @@ -237,7 +240,7 @@ void __sched percpu_down_write(struct percpu_rw_semaphore *sem) */ if (!__percpu_down_write_trylock(sem)) { trace_contention_begin(sem, LCB_F_PERCPU | LCB_F_WRITE); - percpu_rwsem_wait(sem, /* .reader = */ false); + percpu_rwsem_wait(sem, /* .reader = */ false, false); contended = true; } -- cgit v1.2.3 From f73bae83675b860cae9f58dba233b6be92e750ca Mon Sep 17 00:00:00 2001 From: James Bottomley Date: Thu, 27 Mar 2025 10:06:11 -0400 Subject: fs: allow all writers to be frozen During freeze/thaw we need to be able to freeze all writers during suspend/hibernate. Otherwise tasks such as systemd-journald that mmap a file and write to it will not be frozen after we've already frozen the filesystem. This has some risk of not being able to freeze processes in case a process has acquired SB_FREEZE_PAGEFAULT under mmap_sem or SB_FREEZE_INTERNAL under some other filesystem specific lock. If the filesystem is frozen, a task can block on the frozen filesystem with e.g., mmap_sem held. If some other task then blocks on grabbing that mmap_sem, hibernation will fail because it is unable to hibernate a task holding mmap_sem. This could be fixed by making a range of filesystem related locks use freezable sleeping. That's impractical and not warranted just for suspend/hibernate. Assume that this is an infrequent problem and we've given userspace a way to skip filesystem freezing through a sysfs file. Link: https://lore.kernel.org/r/20250402-work-freeze-v2-2-6719a97b52ac@kernel.org Link: https://lore.kernel.org/r/20250327140613.25178-3-James.Bottomley@HansenPartnership.com [brauner: make all freeze levels set TASK_FREEZABLE and rewrite commit message] Reviewed-by: Jan Kara Signed-off-by: James Bottomley Signed-off-by: Christian Brauner --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 016b0fe1536e..35b81f497904 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1780,7 +1780,7 @@ static inline void __sb_end_write(struct super_block *sb, int level) static inline void __sb_start_write(struct super_block *sb, int level) { - percpu_down_read(sb->s_writers.rw_sem + level - 1); + percpu_down_read_freezable(sb->s_writers.rw_sem + level - 1, true); } static inline bool __sb_start_write_trylock(struct super_block *sb, int level) -- cgit v1.2.3 From 2992476528aeecbaee17ba0a6396a817481205a3 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sat, 29 Mar 2025 09:42:17 +0100 Subject: super: use a common iterator (Part 1) Use a common iterator for all callbacks.
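Illustrative sketch only (callback and counter names are hypothetical, not from the patch): with the common iterator every walker is expressed as a (struct super_block *, void *) callback, and the caller selects shared vs. exclusive superblock locking. Part 2 later replaces the bool with iterator flags and makes the helper static again.

#include <linux/fs.h>

/* Hypothetical callback: count superblocks backed by a block device. */
static void example_count_bdev_sb(struct super_block *sb, void *arg)
{
	unsigned int *count = arg;

	/* __iterate_supers() has pinned and locked sb before calling us. */
	if (sb->s_bdev)
		(*count)++;
}

static unsigned int example_count_bdev_supers(void)
{
	unsigned int count = 0;

	/* false = shared locking; true = exclusive, as emergency remount/thaw use. */
	__iterate_supers(example_count_bdev_sb, &count, false);
	return count;
}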
Link: https://lore.kernel.org/r/20250329-work-freeze-v2-4-a47af37ecc3d@kernel.org Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/super.c | 67 +++++++++++------------------------------------------- include/linux/fs.h | 6 ++++- 2 files changed, 18 insertions(+), 55 deletions(-) (limited to 'include/linux') diff --git a/fs/super.c b/fs/super.c index c67ea3cdda41..0dd208804a74 100644 --- a/fs/super.c +++ b/fs/super.c @@ -887,37 +887,7 @@ void drop_super_exclusive(struct super_block *sb) } EXPORT_SYMBOL(drop_super_exclusive); -static void __iterate_supers(void (*f)(struct super_block *)) -{ - struct super_block *sb, *p = NULL; - - spin_lock(&sb_lock); - list_for_each_entry(sb, &super_blocks, s_list) { - if (super_flags(sb, SB_DYING)) - continue; - sb->s_count++; - spin_unlock(&sb_lock); - - f(sb); - - spin_lock(&sb_lock); - if (p) - __put_super(p); - p = sb; - } - if (p) - __put_super(p); - spin_unlock(&sb_lock); -} -/** - * iterate_supers - call function for all active superblocks - * @f: function to call - * @arg: argument to pass to it - * - * Scans the superblock list and calls given function, passing it - * locked superblock and given argument. - */ -void iterate_supers(void (*f)(struct super_block *, void *), void *arg) +void __iterate_supers(void (*f)(struct super_block *, void *), void *arg, bool excl) { struct super_block *sb, *p = NULL; @@ -927,14 +897,13 @@ void iterate_supers(void (*f)(struct super_block *, void *), void *arg) if (super_flags(sb, SB_DYING)) continue; - sb->s_count++; spin_unlock(&sb_lock); - locked = super_lock_shared(sb); + locked = super_lock(sb, excl); if (locked) { f(sb, arg); - super_unlock_shared(sb); + super_unlock(sb, excl); } spin_lock(&sb_lock); @@ -1111,11 +1080,9 @@ cancel_readonly: return retval; } -static void do_emergency_remount_callback(struct super_block *sb) +static void do_emergency_remount_callback(struct super_block *sb, void *unused) { - bool locked = super_lock_excl(sb); - - if (locked && sb->s_root && sb->s_bdev && !sb_rdonly(sb)) { + if (sb->s_bdev && !sb_rdonly(sb)) { struct fs_context *fc; fc = fs_context_for_reconfigure(sb->s_root, @@ -1126,13 +1093,11 @@ static void do_emergency_remount_callback(struct super_block *sb) put_fs_context(fc); } } - if (locked) - super_unlock_excl(sb); } static void do_emergency_remount(struct work_struct *work) { - __iterate_supers(do_emergency_remount_callback); + __iterate_supers(do_emergency_remount_callback, NULL, true); kfree(work); printk("Emergency Remount complete\n"); } @@ -1148,24 +1113,18 @@ void emergency_remount(void) } } -static void do_thaw_all_callback(struct super_block *sb) +static void do_thaw_all_callback(struct super_block *sb, void *unused) { - bool locked = super_lock_excl(sb); - - if (locked && sb->s_root) { - if (IS_ENABLED(CONFIG_BLOCK)) - while (sb->s_bdev && !bdev_thaw(sb->s_bdev)) - pr_warn("Emergency Thaw on %pg\n", sb->s_bdev); - thaw_super_locked(sb, FREEZE_HOLDER_USERSPACE); - return; - } - if (locked) - super_unlock_excl(sb); + if (IS_ENABLED(CONFIG_BLOCK)) + while (sb->s_bdev && !bdev_thaw(sb->s_bdev)) + pr_warn("Emergency Thaw on %pg\n", sb->s_bdev); + thaw_super_locked(sb, FREEZE_HOLDER_USERSPACE); + return; } static void do_thaw_all(struct work_struct *work) { - __iterate_supers(do_thaw_all_callback); + __iterate_supers(do_thaw_all_callback, NULL, true); kfree(work); printk(KERN_WARNING "Emergency Thaw complete\n"); } diff --git a/include/linux/fs.h b/include/linux/fs.h index 35b81f497904..5e007bffd00e 100644 --- a/include/linux/fs.h +++ 
b/include/linux/fs.h @@ -3515,7 +3515,11 @@ extern void put_filesystem(struct file_system_type *fs); extern struct file_system_type *get_fs_type(const char *name); extern void drop_super(struct super_block *sb); extern void drop_super_exclusive(struct super_block *sb); -extern void iterate_supers(void (*)(struct super_block *, void *), void *); +void __iterate_supers(void (*f)(struct super_block *, void *), void *arg, bool excl); +static inline void iterate_supers(void (*f)(struct super_block *, void *), void *arg) +{ + __iterate_supers(f, arg, false); +} extern void iterate_supers_type(struct file_system_type *, void (*)(struct super_block *, void *), void *); -- cgit v1.2.3 From b47e42d10e8c20525febccbd6e0dc8528861aea4 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sat, 29 Mar 2025 09:42:18 +0100 Subject: super: use common iterator (Part 2) Use a common iterator for all callbacks. We could go for something even more elaborate (advance step-by-step similar to iov_iter) but I really don't think this is warranted. Link: https://lore.kernel.org/r/20250329-work-freeze-v2-5-a47af37ecc3d@kernel.org Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/super.c | 48 +++++++++++++++++++++++++++++++++++++++--------- include/linux/fs.h | 6 +----- 2 files changed, 40 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/fs/super.c b/fs/super.c index 0dd208804a74..796e7f402a41 100644 --- a/fs/super.c +++ b/fs/super.c @@ -887,21 +887,46 @@ void drop_super_exclusive(struct super_block *sb) } EXPORT_SYMBOL(drop_super_exclusive); -void __iterate_supers(void (*f)(struct super_block *, void *), void *arg, bool excl) +enum super_iter_flags_t { + SUPER_ITER_EXCL = (1U << 0), + SUPER_ITER_UNLOCKED = (1U << 1), + SUPER_ITER_REVERSE = (1U << 2), +}; + +static inline struct super_block *first_super(enum super_iter_flags_t flags) +{ + if (flags & SUPER_ITER_REVERSE) + return list_last_entry(&super_blocks, struct super_block, s_list); + return list_first_entry(&super_blocks, struct super_block, s_list); +} + +static inline struct super_block *next_super(struct super_block *sb, + enum super_iter_flags_t flags) +{ + if (flags & SUPER_ITER_REVERSE) + return list_prev_entry(sb, s_list); + return list_next_entry(sb, s_list); +} + +static void __iterate_supers(void (*f)(struct super_block *, void *), void *arg, + enum super_iter_flags_t flags) { struct super_block *sb, *p = NULL; + bool excl = flags & SUPER_ITER_EXCL; - spin_lock(&sb_lock); - list_for_each_entry(sb, &super_blocks, s_list) { - bool locked; + guard(spinlock)(&sb_lock); + for (sb = first_super(flags); + !list_entry_is_head(sb, &super_blocks, s_list); + sb = next_super(sb, flags)) { if (super_flags(sb, SB_DYING)) continue; sb->s_count++; spin_unlock(&sb_lock); - locked = super_lock(sb, excl); - if (locked) { + if (flags & SUPER_ITER_UNLOCKED) { + f(sb, arg); + } else if (super_lock(sb, excl)) { f(sb, arg); super_unlock(sb, excl); } @@ -913,7 +938,11 @@ void __iterate_supers(void (*f)(struct super_block *, void *), void *arg, bool e } if (p) __put_super(p); - spin_unlock(&sb_lock); +} + +void iterate_supers(void (*f)(struct super_block *, void *), void *arg) +{ + __iterate_supers(f, arg, 0); } /** @@ -1097,7 +1126,8 @@ static void do_emergency_remount_callback(struct super_block *sb, void *unused) static void do_emergency_remount(struct work_struct *work) { - __iterate_supers(do_emergency_remount_callback, NULL, true); + __iterate_supers(do_emergency_remount_callback, NULL, + SUPER_ITER_EXCL | SUPER_ITER_REVERSE); 
kfree(work); printk("Emergency Remount complete\n"); } @@ -1124,7 +1154,7 @@ static void do_thaw_all_callback(struct super_block *sb, void *unused) static void do_thaw_all(struct work_struct *work) { - __iterate_supers(do_thaw_all_callback, NULL, true); + __iterate_supers(do_thaw_all_callback, NULL, SUPER_ITER_EXCL); kfree(work); printk(KERN_WARNING "Emergency Thaw complete\n"); } diff --git a/include/linux/fs.h b/include/linux/fs.h index 5e007bffd00e..96f8c8a794ea 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3515,11 +3515,7 @@ extern void put_filesystem(struct file_system_type *fs); extern struct file_system_type *get_fs_type(const char *name); extern void drop_super(struct super_block *sb); extern void drop_super_exclusive(struct super_block *sb); -void __iterate_supers(void (*f)(struct super_block *, void *), void *arg, bool excl); -static inline void iterate_supers(void (*f)(struct super_block *, void *), void *arg) -{ - __iterate_supers(f, arg, false); -} +extern void iterate_supers(void (*f)(struct super_block *, void *), void *arg); extern void iterate_supers_type(struct file_system_type *, void (*)(struct super_block *, void *), void *); -- cgit v1.2.3 From 06f2f68a670aae28b825065439301831e74da880 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 13 Mar 2025 15:31:15 +0100 Subject: genirq/generic-chip: Make locking unconditional The SMP conditional wrappers around raw_spin_[un]lock() have no real value. On !SMP kernels the lock operations are NOOPs except for a preempt_disable/enable() pair on PREEMPT enabled kernels, which are not really worth to optimize for. Aside of that this evades lockdep on !SMP kernels. Remove the !SMP stubs and make it unconditional. No functional change. Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250313142524.011345765@linutronix.de --- include/linux/irq.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index dd5df1e2d032..500772943207 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -1222,7 +1222,6 @@ static inline struct irq_chip_type *irq_data_get_chip_type(struct irq_data *d) #define IRQ_MSK(n) (u32)((n) < 32 ? ((1 << (n)) - 1) : UINT_MAX) -#ifdef CONFIG_SMP static inline void irq_gc_lock(struct irq_chip_generic *gc) { raw_spin_lock(&gc->lock); @@ -1232,10 +1231,6 @@ static inline void irq_gc_unlock(struct irq_chip_generic *gc) { raw_spin_unlock(&gc->lock); } -#else -static inline void irq_gc_lock(struct irq_chip_generic *gc) { } -static inline void irq_gc_unlock(struct irq_chip_generic *gc) { } -#endif /* * The irqsave variants are for usage in non interrupt code. Do not use -- cgit v1.2.3 From 7ae844a6650c5c15ccfbf76ed767e7f2cc61ec1d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 13 Mar 2025 15:31:28 +0100 Subject: genirq/generic-chip: Remove unused lock wrappers All users are converted to lock guards. Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250313142524.388478168@linutronix.de --- include/linux/irq.h | 20 -------------------- 1 file changed, 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index 500772943207..d896d3a471ec 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -1222,26 +1222,6 @@ static inline struct irq_chip_type *irq_data_get_chip_type(struct irq_data *d) #define IRQ_MSK(n) (u32)((n) < 32 ? 
((1 << (n)) - 1) : UINT_MAX) -static inline void irq_gc_lock(struct irq_chip_generic *gc) -{ - raw_spin_lock(&gc->lock); -} - -static inline void irq_gc_unlock(struct irq_chip_generic *gc) -{ - raw_spin_unlock(&gc->lock); -} - -/* - * The irqsave variants are for usage in non interrupt code. Do not use - * them in irq_chip callbacks. Use irq_gc_lock() instead. - */ -#define irq_gc_lock_irqsave(gc, flags) \ - raw_spin_lock_irqsave(&(gc)->lock, flags) - -#define irq_gc_unlock_irqrestore(gc, flags) \ - raw_spin_unlock_irqrestore(&(gc)->lock, flags) - static inline void irq_reg_writel(struct irq_chip_generic *gc, u32 val, int reg_offset) { -- cgit v1.2.3 From af1c968d25c7c44cd2738349c479f8b610a3fc40 Mon Sep 17 00:00:00 2001 From: Cezary Rojewski Date: Mon, 7 Apr 2025 13:23:47 +0200 Subject: ASoC: Intel: avs: PTL-based platforms support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Define handlers specific to ACE platforms, that Frisco Lake (FCL), a PantherLake (PTL)-based platform, is founded upon. Most operations are still inherited from their predecessors with the major difference being AudioDSP cores management - replaced by DSP-domain power management. Software has to ensure the DSP domain is both powered on and its power-gating disabled before it can be utilized for streaming. Reviewed-by: Amadeusz Sławiński Signed-off-by: Cezary Rojewski Acked-by: Liam Girdwood Link: https://patch.msgid.link/20250407112352.3720779-6-cezary.rojewski@intel.com Signed-off-by: Mark Brown --- include/linux/pci_ids.h | 1 + sound/soc/intel/avs/Makefile | 6 +- sound/soc/intel/avs/avs.h | 7 ++ sound/soc/intel/avs/core.c | 29 ++++++ sound/soc/intel/avs/dsp.c | 2 - sound/soc/intel/avs/lnl.c | 27 ++++++ sound/soc/intel/avs/mtl.c | 200 ++++++++++++++++++++++++++++++++++++++++ sound/soc/intel/avs/ptl.c | 98 ++++++++++++++++++++ sound/soc/intel/avs/registers.h | 38 ++++++++ 9 files changed, 403 insertions(+), 5 deletions(-) create mode 100644 sound/soc/intel/avs/lnl.c create mode 100644 sound/soc/intel/avs/mtl.c create mode 100644 sound/soc/intel/avs/ptl.c (limited to 'include/linux') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 2e28182c3af0..981ed45cc45e 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -3070,6 +3070,7 @@ #define PCI_DEVICE_ID_INTEL_5100_21 0x65f5 #define PCI_DEVICE_ID_INTEL_5100_22 0x65f6 #define PCI_DEVICE_ID_INTEL_IOAT_SCNB 0x65ff +#define PCI_DEVICE_ID_INTEL_HDA_FCL 0x67a8 #define PCI_DEVICE_ID_INTEL_82371SB_0 0x7000 #define PCI_DEVICE_ID_INTEL_82371SB_1 0x7010 #define PCI_DEVICE_ID_INTEL_82371SB_2 0x7020 diff --git a/sound/soc/intel/avs/Makefile b/sound/soc/intel/avs/Makefile index 5139a019a4ad..576dc0da381d 100644 --- a/sound/soc/intel/avs/Makefile +++ b/sound/soc/intel/avs/Makefile @@ -1,10 +1,10 @@ # SPDX-License-Identifier: GPL-2.0-only snd-soc-avs-y := dsp.o ipc.o messages.o utils.o core.o loader.o \ - topology.o path.o pcm.o board_selection.o control.o \ - sysfs.o + topology.o path.o pcm.o board_selection.o control.o \ + sysfs.o snd-soc-avs-y += cldma.o -snd-soc-avs-y += skl.o apl.o cnl.o icl.o tgl.o +snd-soc-avs-y += skl.o apl.o cnl.o icl.o tgl.o mtl.o lnl.o ptl.o snd-soc-avs-y += trace.o # tell define_trace.h where to find the trace header diff --git a/sound/soc/intel/avs/avs.h b/sound/soc/intel/avs/avs.h index ec5502f9d5cb..f5553aba813a 100644 --- a/sound/soc/intel/avs/avs.h +++ b/sound/soc/intel/avs/avs.h @@ -69,6 +69,7 @@ extern const struct avs_dsp_ops avs_apl_dsp_ops; extern const struct 
avs_dsp_ops avs_cnl_dsp_ops; extern const struct avs_dsp_ops avs_icl_dsp_ops; extern const struct avs_dsp_ops avs_tgl_dsp_ops; +extern const struct avs_dsp_ops avs_ptl_dsp_ops; #define AVS_PLATATTR_CLDMA BIT_ULL(0) #define AVS_PLATATTR_IMR BIT_ULL(1) @@ -267,8 +268,14 @@ void avs_ipc_block(struct avs_ipc *ipc); int avs_dsp_disable_d0ix(struct avs_dev *adev); int avs_dsp_enable_d0ix(struct avs_dev *adev); +int avs_mtl_core_power(struct avs_dev *adev, u32 core_mask, bool power); +int avs_mtl_core_reset(struct avs_dev *adev, u32 core_mask, bool power); +int avs_mtl_core_stall(struct avs_dev *adev, u32 core_mask, bool stall); +int avs_lnl_core_stall(struct avs_dev *adev, u32 core_mask, bool stall); +void avs_mtl_interrupt_control(struct avs_dev *adev, bool enable); void avs_skl_ipc_interrupt(struct avs_dev *adev); irqreturn_t avs_cnl_dsp_interrupt(struct avs_dev *adev); +irqreturn_t avs_mtl_dsp_interrupt(struct avs_dev *adev); int avs_apl_enable_logs(struct avs_dev *adev, enum avs_log_enable enable, u32 aging_period, u32 fifo_full_period, unsigned long resource_mask, u32 *priorities); int avs_icl_enable_logs(struct avs_dev *adev, enum avs_log_enable enable, u32 aging_period, diff --git a/sound/soc/intel/avs/core.c b/sound/soc/intel/avs/core.c index 1495e163d47e..a7aa3a6cde9e 100644 --- a/sound/soc/intel/avs/core.c +++ b/sound/soc/intel/avs/core.c @@ -762,6 +762,11 @@ static const struct avs_sram_spec apl_sram_spec = { .window_size = APL_ADSP_SRAM_WINDOW_SIZE, }; +static const struct avs_sram_spec mtl_sram_spec = { + .base_offset = MTL_ADSP_SRAM_BASE_OFFSET, + .window_size = MTL_ADSP_SRAM_WINDOW_SIZE, +}; + static const struct avs_hipc_spec skl_hipc_spec = { .req_offset = SKL_ADSP_REG_HIPCI, .req_ext_offset = SKL_ADSP_REG_HIPCIE, @@ -798,6 +803,18 @@ static const struct avs_hipc_spec cnl_hipc_spec = { .sts_offset = APL_ADSP_SRAM_BASE_OFFSET, }; +static const struct avs_hipc_spec lnl_hipc_spec = { + .req_offset = MTL_REG_HfIPCxIDR, + .req_ext_offset = MTL_REG_HfIPCxIDD, + .req_busy_mask = MTL_HfIPCxIDR_BUSY, + .ack_offset = MTL_REG_HfIPCxIDA, + .ack_done_mask = MTL_HfIPCxIDA_DONE, + .rsp_offset = MTL_REG_HfIPCxTDR, + .rsp_busy_mask = MTL_HfIPCxTDR_BUSY, + .ctl_offset = MTL_REG_HfIPCxCTL, + .sts_offset = LNL_REG_HfDFR(0), +}; + static const struct avs_spec skl_desc = { .name = "skl", .min_fw_version = { 9, 21, 0, 4732 }, @@ -865,6 +882,16 @@ AVS_TGL_BASED_SPEC(ehl, 30); AVS_TGL_BASED_SPEC(adl, 35); AVS_TGL_BASED_SPEC(adl_n, 35); +static const struct avs_spec fcl_desc = { + .name = "fcl", + .min_fw_version = { 0 }, + .dsp_ops = &avs_ptl_dsp_ops, + .core_init_mask = 1, + .attributes = AVS_PLATATTR_IMR | AVS_PLATATTR_ACE | AVS_PLATATTR_ALTHDA, + .sram = &mtl_sram_spec, + .hipc = &lnl_hipc_spec, +}; + static const struct pci_device_id avs_ids[] = { { PCI_DEVICE_DATA(INTEL, HDA_SKL_LP, &skl_desc) }, { PCI_DEVICE_DATA(INTEL, HDA_SKL, &skl_desc) }, @@ -900,6 +927,7 @@ static const struct pci_device_id avs_ids[] = { { PCI_DEVICE_DATA(INTEL, HDA_RPL_P_1, &adl_desc) }, { PCI_DEVICE_DATA(INTEL, HDA_RPL_M, &adl_desc) }, { PCI_DEVICE_DATA(INTEL, HDA_RPL_PX, &adl_desc) }, + { PCI_DEVICE_DATA(INTEL, HDA_FCL, &fcl_desc) }, { 0 } }; MODULE_DEVICE_TABLE(pci, avs_ids); @@ -931,3 +959,4 @@ MODULE_FIRMWARE("intel/tgl/dsp_basefw.bin"); MODULE_FIRMWARE("intel/ehl/dsp_basefw.bin"); MODULE_FIRMWARE("intel/adl/dsp_basefw.bin"); MODULE_FIRMWARE("intel/adl_n/dsp_basefw.bin"); +MODULE_FIRMWARE("intel/fcl/dsp_basefw.bin"); diff --git a/sound/soc/intel/avs/dsp.c b/sound/soc/intel/avs/dsp.c index b9de691e9b9b..464bd6859182 
100644 --- a/sound/soc/intel/avs/dsp.c +++ b/sound/soc/intel/avs/dsp.c @@ -12,8 +12,6 @@ #include "registers.h" #include "trace.h" -#define AVS_ADSPCS_INTERVAL_US 500 -#define AVS_ADSPCS_TIMEOUT_US 50000 #define AVS_ADSPCS_DELAY_US 1000 int avs_dsp_core_power(struct avs_dev *adev, u32 core_mask, bool power) diff --git a/sound/soc/intel/avs/lnl.c b/sound/soc/intel/avs/lnl.c new file mode 100644 index 000000000000..03208596dfb1 --- /dev/null +++ b/sound/soc/intel/avs/lnl.c @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright(c) 2021-2025 Intel Corporation + * + * Authors: Cezary Rojewski + * Amadeusz Slawinski + */ + +#include +#include "avs.h" +#include "registers.h" + +int avs_lnl_core_stall(struct avs_dev *adev, u32 core_mask, bool stall) +{ + struct hdac_bus *bus = &adev->base.core; + struct hdac_ext_link *hlink; + int ret; + + ret = avs_mtl_core_stall(adev, core_mask, stall); + + /* On unstall, route interrupts from the links to the DSP firmware. */ + if (!ret && !stall) + list_for_each_entry(hlink, &bus->hlink_list, list) + snd_hdac_updatel(hlink->ml_addr, AZX_REG_ML_LCTL, AZX_ML_LCTL_OFLEN, + AZX_ML_LCTL_OFLEN); + return ret; +} diff --git a/sound/soc/intel/avs/mtl.c b/sound/soc/intel/avs/mtl.c new file mode 100644 index 000000000000..e7b7915b2a82 --- /dev/null +++ b/sound/soc/intel/avs/mtl.c @@ -0,0 +1,200 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright(c) 2021-2025 Intel Corporation + * + * Authors: Cezary Rojewski + * Amadeusz Slawinski + */ + +#include +#include "avs.h" +#include "registers.h" +#include "trace.h" + +#define MTL_HfDSSGBL_BASE 0x1000 +#define MTL_REG_HfDSSCS (MTL_HfDSSGBL_BASE + 0x0) +#define MTL_HfDSSCS_SPA BIT(16) +#define MTL_HfDSSCS_CPA BIT(24) + +#define MTL_DSPCS_BASE 0x178D00 +#define MTL_REG_DSPCCTL (MTL_DSPCS_BASE + 0x4) +#define MTL_DSPCCTL_SPA BIT(0) +#define MTL_DSPCCTL_CPA BIT(8) +#define MTL_DSPCCTL_OSEL GENMASK(25, 24) +#define MTL_DSPCCTL_OSEL_HOST BIT(25) + +#define MTL_HfINT_BASE 0x1100 +#define MTL_REG_HfINTIPPTR (MTL_HfINT_BASE + 0x8) +#define MTL_REG_HfHIPCIE (MTL_HfINT_BASE + 0x40) +#define MTL_HfINTIPPTR_PTR GENMASK(20, 0) +#define MTL_HfHIPCIE_IE BIT(0) + +#define MTL_DWICTL_INTENL_IE BIT(0) +#define MTL_DWICTL_FINALSTATUSL_IPC BIT(0) /* same as ADSPIS_IPC */ + +static int avs_mtl_core_power_on(struct avs_dev *adev) +{ + u32 reg; + int ret; + + /* Power up DSP domain. */ + snd_hdac_adsp_updatel(adev, MTL_REG_HfDSSCS, MTL_HfDSSCS_SPA, MTL_HfDSSCS_SPA); + trace_avs_dsp_core_op(1, AVS_MAIN_CORE_MASK, "power dsp", true); + + ret = snd_hdac_adsp_readl_poll(adev, MTL_REG_HfDSSCS, reg, + (reg & MTL_HfDSSCS_CPA) == MTL_HfDSSCS_CPA, + AVS_ADSPCS_INTERVAL_US, AVS_ADSPCS_TIMEOUT_US); + if (ret) { + dev_err(adev->dev, "power on domain dsp failed: %d\n", ret); + return ret; + } + + /* Prevent power gating of DSP domain. */ + snd_hdac_adsp_updatel(adev, MTL_REG_HfPWRCTL, MTL_HfPWRCTL_WPDSPHPxPG, + MTL_HfPWRCTL_WPDSPHPxPG); + trace_avs_dsp_core_op(1, AVS_MAIN_CORE_MASK, "prevent dsp PG", true); + + ret = snd_hdac_adsp_readl_poll(adev, MTL_REG_HfPWRSTS, reg, + (reg & MTL_HfPWRSTS_DSPHPxPGS) == MTL_HfPWRSTS_DSPHPxPGS, + AVS_ADSPCS_INTERVAL_US, AVS_ADSPCS_TIMEOUT_US); + + /* Set ownership to HOST. */ + snd_hdac_adsp_updatel(adev, MTL_REG_DSPCCTL, MTL_DSPCCTL_OSEL, MTL_DSPCCTL_OSEL_HOST); + return ret; +} + +static int avs_mtl_core_power_off(struct avs_dev *adev) +{ + u32 reg; + + /* Allow power gating of DSP domain. No STS polling as HOST is only one of its users. 
*/ + snd_hdac_adsp_updatel(adev, MTL_REG_HfPWRCTL, MTL_HfPWRCTL_WPDSPHPxPG, 0); + trace_avs_dsp_core_op(0, AVS_MAIN_CORE_MASK, "allow dsp pg", false); + + /* Power down DSP domain. */ + snd_hdac_adsp_updatel(adev, MTL_REG_HfDSSCS, MTL_HfDSSCS_SPA, 0); + trace_avs_dsp_core_op(0, AVS_MAIN_CORE_MASK, "power dsp", false); + + return snd_hdac_adsp_readl_poll(adev, MTL_REG_HfDSSCS, reg, + (reg & MTL_HfDSSCS_CPA) == 0, + AVS_ADSPCS_INTERVAL_US, AVS_ADSPCS_TIMEOUT_US); +} + +int avs_mtl_core_power(struct avs_dev *adev, u32 core_mask, bool power) +{ + core_mask &= AVS_MAIN_CORE_MASK; + if (!core_mask) + return 0; + + if (power) + return avs_mtl_core_power_on(adev); + return avs_mtl_core_power_off(adev); +} + +int avs_mtl_core_reset(struct avs_dev *adev, u32 core_mask, bool power) +{ + /* No logical equivalent on ACE 1.x. */ + return 0; +} + +int avs_mtl_core_stall(struct avs_dev *adev, u32 core_mask, bool stall) +{ + u32 value, reg; + int ret; + + core_mask &= AVS_MAIN_CORE_MASK; + if (!core_mask) + return 0; + + value = snd_hdac_adsp_readl(adev, MTL_REG_DSPCCTL); + trace_avs_dsp_core_op(value, core_mask, "stall", stall); + if (value == UINT_MAX) + return 0; + + value = stall ? 0 : MTL_DSPCCTL_SPA; + snd_hdac_adsp_updatel(adev, MTL_REG_DSPCCTL, MTL_DSPCCTL_SPA, value); + + value = stall ? 0 : MTL_DSPCCTL_CPA; + ret = snd_hdac_adsp_readl_poll(adev, MTL_REG_DSPCCTL, + reg, (reg & MTL_DSPCCTL_CPA) == value, + AVS_ADSPCS_INTERVAL_US, AVS_ADSPCS_TIMEOUT_US); + if (ret) + dev_err(adev->dev, "core_mask %d %sstall failed: %d\n", + core_mask, stall ? "" : "un", ret); + return ret; +} + +static void avs_mtl_ipc_interrupt(struct avs_dev *adev) +{ + const struct avs_spec *spec = adev->spec; + u32 hipc_ack, hipc_rsp; + + snd_hdac_adsp_updatel(adev, spec->hipc->ctl_offset, + AVS_ADSP_HIPCCTL_DONE | AVS_ADSP_HIPCCTL_BUSY, 0); + + hipc_ack = snd_hdac_adsp_readl(adev, spec->hipc->ack_offset); + hipc_rsp = snd_hdac_adsp_readl(adev, spec->hipc->rsp_offset); + + /* DSP acked host's request. */ + if (hipc_ack & spec->hipc->ack_done_mask) { + complete(&adev->ipc->done_completion); + + /* Tell DSP it has our attention. */ + snd_hdac_adsp_updatel(adev, spec->hipc->ack_offset, spec->hipc->ack_done_mask, + spec->hipc->ack_done_mask); + } + + /* DSP sent new response to process. */ + if (hipc_rsp & spec->hipc->rsp_busy_mask) { + union avs_reply_msg msg; + + msg.primary = snd_hdac_adsp_readl(adev, MTL_REG_HfIPCxTDR); + msg.ext.val = snd_hdac_adsp_readl(adev, MTL_REG_HfIPCxTDD); + + avs_dsp_process_response(adev, msg.val); + + /* Tell DSP we accepted its message. */ + snd_hdac_adsp_updatel(adev, MTL_REG_HfIPCxTDR, + MTL_HfIPCxTDR_BUSY, MTL_HfIPCxTDR_BUSY); + /* Ack this response. 
*/ + snd_hdac_adsp_updatel(adev, MTL_REG_HfIPCxTDA, MTL_HfIPCxTDA_BUSY, 0); + } + + snd_hdac_adsp_updatel(adev, spec->hipc->ctl_offset, + AVS_ADSP_HIPCCTL_DONE | AVS_ADSP_HIPCCTL_BUSY, + AVS_ADSP_HIPCCTL_DONE | AVS_ADSP_HIPCCTL_BUSY); +} + +irqreturn_t avs_mtl_dsp_interrupt(struct avs_dev *adev) +{ + u32 adspis = snd_hdac_adsp_readl(adev, MTL_DWICTL_REG_FINALSTATUSL); + irqreturn_t ret = IRQ_NONE; + + if (adspis == UINT_MAX) + return ret; + + if (adspis & MTL_DWICTL_FINALSTATUSL_IPC) { + avs_mtl_ipc_interrupt(adev); + ret = IRQ_HANDLED; + } + + return ret; +} + +void avs_mtl_interrupt_control(struct avs_dev *adev, bool enable) +{ + if (enable) { + snd_hdac_adsp_updatel(adev, MTL_DWICTL_REG_INTENL, MTL_DWICTL_INTENL_IE, + MTL_DWICTL_INTENL_IE); + snd_hdac_adsp_updatew(adev, MTL_REG_HfHIPCIE, MTL_HfHIPCIE_IE, MTL_HfHIPCIE_IE); + snd_hdac_adsp_updatel(adev, MTL_REG_HfIPCxCTL, AVS_ADSP_HIPCCTL_DONE, + AVS_ADSP_HIPCCTL_DONE); + snd_hdac_adsp_updatel(adev, MTL_REG_HfIPCxCTL, AVS_ADSP_HIPCCTL_BUSY, + AVS_ADSP_HIPCCTL_BUSY); + } else { + snd_hdac_adsp_updatel(adev, MTL_REG_HfIPCxCTL, AVS_ADSP_HIPCCTL_BUSY, 0); + snd_hdac_adsp_updatel(adev, MTL_REG_HfIPCxCTL, AVS_ADSP_HIPCCTL_DONE, 0); + snd_hdac_adsp_updatew(adev, MTL_REG_HfHIPCIE, MTL_HfHIPCIE_IE, 0); + snd_hdac_adsp_updatel(adev, MTL_DWICTL_REG_INTENL, MTL_DWICTL_INTENL_IE, 0); + } +} diff --git a/sound/soc/intel/avs/ptl.c b/sound/soc/intel/avs/ptl.c new file mode 100644 index 000000000000..2be4b545c91d --- /dev/null +++ b/sound/soc/intel/avs/ptl.c @@ -0,0 +1,98 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright(c) 2024-2025 Intel Corporation + * + * Authors: Cezary Rojewski + * Amadeusz Slawinski + */ + +#include +#include "avs.h" +#include "registers.h" +#include "trace.h" + +#define MTL_HfDSSGBL_BASE 0x1000 +#define MTL_REG_HfDSSCS (MTL_HfDSSGBL_BASE + 0x0) +#define MTL_HfDSSCS_SPA BIT(16) +#define MTL_HfDSSCS_CPA BIT(24) + +#define MTL_DSPCS_BASE 0x178D00 +#define MTL_REG_DSPCCTL (MTL_DSPCS_BASE + 0x4) +#define MTL_DSPCCTL_OSEL GENMASK(25, 24) +#define MTL_DSPCCTL_OSEL_HOST BIT(25) + +static int avs_ptl_core_power_on(struct avs_dev *adev) +{ + u32 reg; + int ret; + + /* Power up DSP domain. */ + snd_hdac_adsp_updatel(adev, MTL_REG_HfDSSCS, MTL_HfDSSCS_SPA, MTL_HfDSSCS_SPA); + trace_avs_dsp_core_op(1, AVS_MAIN_CORE_MASK, "power dsp", true); + + ret = snd_hdac_adsp_readl_poll(adev, MTL_REG_HfDSSCS, reg, + (reg & MTL_HfDSSCS_CPA) == MTL_HfDSSCS_CPA, + AVS_ADSPCS_INTERVAL_US, AVS_ADSPCS_TIMEOUT_US); + if (ret) { + dev_err(adev->dev, "power on domain dsp failed: %d\n", ret); + return ret; + } + + /* Prevent power gating of DSP domain. */ + snd_hdac_adsp_updatel(adev, MTL_REG_HfPWRCTL2, MTL_HfPWRCTL2_WPDSPHPxPG, + MTL_HfPWRCTL2_WPDSPHPxPG); + trace_avs_dsp_core_op(1, AVS_MAIN_CORE_MASK, "prevent dsp PG", true); + + ret = snd_hdac_adsp_readl_poll(adev, MTL_REG_HfPWRSTS2, reg, + (reg & MTL_HfPWRSTS2_DSPHPxPGS) == MTL_HfPWRSTS2_DSPHPxPGS, + AVS_ADSPCS_INTERVAL_US, AVS_ADSPCS_TIMEOUT_US); + + /* Set ownership to HOST. */ + snd_hdac_adsp_updatel(adev, MTL_REG_DSPCCTL, MTL_DSPCCTL_OSEL, MTL_DSPCCTL_OSEL_HOST); + return ret; +} + +static int avs_ptl_core_power_off(struct avs_dev *adev) +{ + u32 reg; + + /* Allow power gating of DSP domain. No STS polling as HOST is only one of its users. */ + snd_hdac_adsp_updatel(adev, MTL_REG_HfPWRCTL2, MTL_HfPWRCTL2_WPDSPHPxPG, 0); + trace_avs_dsp_core_op(0, AVS_MAIN_CORE_MASK, "allow dsp pg", false); + + /* Power down DSP domain. 
*/ + snd_hdac_adsp_updatel(adev, MTL_REG_HfDSSCS, MTL_HfDSSCS_SPA, 0); + trace_avs_dsp_core_op(0, AVS_MAIN_CORE_MASK, "power dsp", false); + + return snd_hdac_adsp_readl_poll(adev, MTL_REG_HfDSSCS, reg, + (reg & MTL_HfDSSCS_CPA) == 0, + AVS_ADSPCS_INTERVAL_US, AVS_ADSPCS_TIMEOUT_US); +} + +static int avs_ptl_core_power(struct avs_dev *adev, u32 core_mask, bool power) +{ + core_mask &= AVS_MAIN_CORE_MASK; + if (!core_mask) + return 0; + + if (power) + return avs_ptl_core_power_on(adev); + return avs_ptl_core_power_off(adev); +} + +const struct avs_dsp_ops avs_ptl_dsp_ops = { + .power = avs_ptl_core_power, + .reset = avs_mtl_core_reset, + .stall = avs_lnl_core_stall, + .dsp_interrupt = avs_mtl_dsp_interrupt, + .int_control = avs_mtl_interrupt_control, + .load_basefw = avs_hda_load_basefw, + .load_lib = avs_hda_load_library, + .transfer_mods = avs_hda_transfer_modules, + .log_buffer_offset = avs_icl_log_buffer_offset, + .log_buffer_status = avs_apl_log_buffer_status, + .coredump = avs_apl_coredump, + .d0ix_toggle = avs_icl_d0ix_toggle, + .set_d0ix = avs_icl_set_d0ix, + AVS_SET_ENABLE_LOGS_OP(icl) +}; diff --git a/sound/soc/intel/avs/registers.h b/sound/soc/intel/avs/registers.h index 4db0cdf68ffc..97767882ffa1 100644 --- a/sound/soc/intel/avs/registers.h +++ b/sound/soc/intel/avs/registers.h @@ -35,6 +35,8 @@ #define AVS_ADSPCS_CSTALL_MASK(cm) ((cm) << 8) #define AVS_ADSPCS_SPA_MASK(cm) ((cm) << 16) #define AVS_ADSPCS_CPA_MASK(cm) ((cm) << 24) +#define AVS_ADSPCS_INTERVAL_US 500 +#define AVS_ADSPCS_TIMEOUT_US 10000 #define AVS_MAIN_CORE_MASK BIT(0) #define AVS_ADSP_HIPCCTL_BUSY BIT(0) @@ -67,11 +69,47 @@ #define CNL_ADSP_HIPCIDR_BUSY BIT(31) #define CNL_ADSP_HIPCIDA_DONE BIT(31) +/* MTL Intel HOST Inter-Processor Communication Registers */ +#define MTL_HfIPC_BASE 0x73000 +#define MTL_REG_HfIPCxTDR (MTL_HfIPC_BASE + 0x200) +#define MTL_REG_HfIPCxTDA (MTL_HfIPC_BASE + 0x204) +#define MTL_REG_HfIPCxIDR (MTL_HfIPC_BASE + 0x210) +#define MTL_REG_HfIPCxIDA (MTL_HfIPC_BASE + 0x214) +#define MTL_REG_HfIPCxCTL (MTL_HfIPC_BASE + 0x228) +#define MTL_REG_HfIPCxTDD (MTL_HfIPC_BASE + 0x300) +#define MTL_REG_HfIPCxIDD (MTL_HfIPC_BASE + 0x380) + +#define MTL_HfIPCxTDR_BUSY BIT(31) +#define MTL_HfIPCxTDA_BUSY BIT(31) +#define MTL_HfIPCxIDR_BUSY BIT(31) +#define MTL_HfIPCxIDA_DONE BIT(31) + +#define MTL_HfFLV_BASE 0x162000 +#define MTL_REG_HfFLGP(x, y) (MTL_HfFLV_BASE + 0x1200 + (x) * 0x20 + (y) * 0x08) +#define LNL_REG_HfDFR(x) (0x160200 + (x) * 0x8) + +#define MTL_DWICTL_BASE 0x1800 +#define MTL_DWICTL_REG_INTENL (MTL_DWICTL_BASE + 0x0) +#define MTL_DWICTL_REG_FINALSTATUSL (MTL_DWICTL_BASE + 0x30) + +#define MTL_HfPMCCU_BASE 0x1D00 +#define MTL_REG_HfCLKCTL (MTL_HfPMCCU_BASE + 0x10) +#define MTL_REG_HfPWRCTL (MTL_HfPMCCU_BASE + 0x18) +#define MTL_REG_HfPWRSTS (MTL_HfPMCCU_BASE + 0x1C) +#define MTL_REG_HfPWRCTL2 (MTL_HfPMCCU_BASE + 0x20) +#define MTL_REG_HfPWRSTS2 (MTL_HfPMCCU_BASE + 0x24) +#define MTL_HfPWRCTL_WPDSPHPxPG BIT(0) +#define MTL_HfPWRSTS_DSPHPxPGS BIT(0) +#define MTL_HfPWRCTL2_WPDSPHPxPG BIT(0) +#define MTL_HfPWRSTS2_DSPHPxPGS BIT(0) + /* Intel HD Audio SRAM windows base addresses */ #define SKL_ADSP_SRAM_BASE_OFFSET 0x8000 #define SKL_ADSP_SRAM_WINDOW_SIZE 0x2000 #define APL_ADSP_SRAM_BASE_OFFSET 0x80000 #define APL_ADSP_SRAM_WINDOW_SIZE 0x20000 +#define MTL_ADSP_SRAM_BASE_OFFSET 0x180000 +#define MTL_ADSP_SRAM_WINDOW_SIZE 0x8000 /* Constants used when accessing SRAM, space shared with firmware */ #define AVS_FW_REG_BASE(adev) ((adev)->spec->hipc->sts_offset) -- cgit v1.2.3 From 
83b9ae77f06607d19f7d3dcc6008742051137b27 Mon Sep 17 00:00:00 2001 From: Cezary Rojewski Date: Fri, 4 Apr 2025 11:03:30 +0200 Subject: lib/string_helpers: Introduce parse_int_array() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Existing parse_int_array_user() works with __user buffers only. Separate array parsing from __user bits so the functionality can be utilized with kernel buffers too. Cc: Andy Shevchenko Reviewed-by: Amadeusz Sławiński Signed-off-by: Cezary Rojewski Reviewed-by: Andy Shevchenko Link: https://patch.msgid.link/20250404090337.3564117-2-cezary.rojewski@intel.com Signed-off-by: Mark Brown --- include/linux/string_helpers.h | 1 + lib/string_helpers.c | 39 +++++++++++++++++++++------------------ 2 files changed, 22 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/string_helpers.h b/include/linux/string_helpers.h index e93fbb5b0c01..3fb88a1e9898 100644 --- a/include/linux/string_helpers.h +++ b/include/linux/string_helpers.h @@ -31,6 +31,7 @@ enum string_size_units { int string_get_size(u64 size, u64 blk_size, const enum string_size_units units, char *buf, int len); +int parse_int_array(const char *buf, size_t count, int **array); int parse_int_array_user(const char __user *from, size_t count, int **array); #define UNESCAPE_SPACE BIT(0) diff --git a/lib/string_helpers.c b/lib/string_helpers.c index 91fa37b5c510..ffb8ead6d4cd 100644 --- a/lib/string_helpers.c +++ b/lib/string_helpers.c @@ -138,6 +138,25 @@ int string_get_size(u64 size, u64 blk_size, const enum string_size_units units, } EXPORT_SYMBOL(string_get_size); +int parse_int_array(const char *buf, size_t count, int **array) +{ + int *ints, nints; + + get_options(buf, 0, &nints); + if (!nints) + return -ENOENT; + + ints = kcalloc(nints + 1, sizeof(*ints), GFP_KERNEL); + if (!ints) + return -ENOMEM; + + get_options(buf, nints + 1, ints); + *array = ints; + + return 0; +} +EXPORT_SYMBOL(parse_int_array); + /** * parse_int_array_user - Split string into a sequence of integers * @from: The user space buffer to read from @@ -153,30 +172,14 @@ EXPORT_SYMBOL(string_get_size); */ int parse_int_array_user(const char __user *from, size_t count, int **array) { - int *ints, nints; char *buf; - int ret = 0; + int ret; buf = memdup_user_nul(from, count); if (IS_ERR(buf)) return PTR_ERR(buf); - get_options(buf, 0, &nints); - if (!nints) { - ret = -ENOENT; - goto free_buf; - } - - ints = kcalloc(nints + 1, sizeof(*ints), GFP_KERNEL); - if (!ints) { - ret = -ENOMEM; - goto free_buf; - } - - get_options(buf, nints + 1, ints); - *array = ints; - -free_buf: + ret = parse_int_array(buf, count, array); kfree(buf); return ret; } -- cgit v1.2.3 From 5f3e0b4a1f59ab616a09a45fddddefa6ef756229 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Mon, 7 Apr 2025 01:58:05 +0200 Subject: fs: predict not having to do anything in fdput() This matches the annotation in fdget().
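For context, a hedged sketch (the caller and its names are hypothetical) of the pattern the unlikely() hint targets: most fdget() calls return a borrowed reference, so FDPUT_FPUT stays clear and fdput() is a no-op.

#include <linux/file.h>
#include <linux/errno.h>

/* Hypothetical helper operating on a user-supplied file descriptor. */
static int example_peek_file_flags(unsigned int ufd, unsigned int *flags_out)
{
	struct fd f = fdget(ufd);

	if (!fd_file(f))
		return -EBADF;

	*flags_out = fd_file(f)->f_flags;

	/*
	 * Common case: the reference was borrowed, FDPUT_FPUT is not set in
	 * fd.word and fdput() does nothing; fput() runs only on the slow
	 * path, which the unlikely() annotation moves out of line.
	 */
	fdput(f);
	return 0;
}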
Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/20250406235806.1637000-2-mjguzik@gmail.com Signed-off-by: Christian Brauner --- include/linux/file.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/file.h b/include/linux/file.h index 302f11355b10..af1768d934a0 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -59,7 +59,7 @@ static inline struct fd CLONED_FD(struct file *f) static inline void fdput(struct fd fd) { - if (fd.word & FDPUT_FPUT) + if (unlikely(fd.word & FDPUT_FPUT)) fput(fd_file(fd)); } -- cgit v1.2.3 From fa6fe07d1536361a227d655e69ca270faf28fdbe Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 19 Mar 2025 14:01:35 +1100 Subject: VFS: rename lookup_one_len family to lookup_noperm and remove permission check The lookup_one_len family of functions is (now) only used internally by a filesystem on itself either - in a context where permission checking is irrelevant such as by a virtual filesystem populating itself, or xfs accessing its ORPHANAGE or dquota accessing the quota file; or - in a context where a permission check (MAY_EXEC on the parent) has just been performed such as a network filesystem finding in "silly-rename" file in the same directory. This is also the context after the _parentat() functions where currently lookup_one_qstr_excl() is used. So the permission check is pointless. The name "one_len" is unhelpful in understanding the purpose of these functions and should be changed. Most of the callers pass the len as "strlen()" so using a qstr and QSTR() can simplify the code. This patch renames these functions (include lookup_positive_unlocked() which is part of the family despite the name) to have a name based on "lookup_noperm". They are changed to receive a 'struct qstr' instead of separate name and len. In a few cases the use of QSTR() results in a new call to strlen(). try_lookup_noperm() takes a pointer to a qstr instead of the whole qstr. This is consistent with d_hash_and_lookup() (which is nearly identical) and useful for lookup_noperm_unlocked(). The new lookup_noperm_common() doesn't take a qstr yet. That will be tidied up in a subsequent patch. Signed-off-by: NeilBrown Link: https://lore.kernel.org/r/20250319031545.2999807-5-neil@brown.name Signed-off-by: Christian Brauner --- Documentation/filesystems/porting.rst | 20 ++++++++ arch/s390/hypfs/inode.c | 2 +- drivers/android/binderfs.c | 4 +- drivers/infiniband/hw/qib/qib_fs.c | 4 +- fs/afs/dir.c | 2 +- fs/afs/dir_silly.c | 6 +-- fs/autofs/dev-ioctl.c | 3 +- fs/binfmt_misc.c | 2 +- fs/debugfs/inode.c | 6 +-- fs/ecryptfs/inode.c | 16 ++++--- fs/kernfs/mount.c | 2 +- fs/namei.c | 87 +++++++++++++++++++++-------------- fs/nfs/unlink.c | 11 ++--- fs/overlayfs/export.c | 6 +-- fs/overlayfs/namei.c | 2 +- fs/quota/dquot.c | 2 +- fs/smb/client/cached_dir.c | 5 +- fs/smb/client/cifsfs.c | 3 +- fs/tracefs/inode.c | 2 +- fs/xfs/scrub/orphanage.c | 3 +- include/linux/namei.h | 8 ++-- ipc/mqueue.c | 5 +- kernel/bpf/inode.c | 2 +- security/apparmor/apparmorfs.c | 4 +- security/inode.c | 2 +- 25 files changed, 122 insertions(+), 87 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst index 57dcba6de743..9150de7f64f1 100644 --- a/Documentation/filesystems/porting.rst +++ b/Documentation/filesystems/porting.rst @@ -1212,3 +1212,23 @@ lookup_one(), lookup_one_unlocked(), lookup_one_positive_unlocked() now take a qstr instead of a name and len. 
These, not the "one_len" versions, should be used whenever accessing a filesystem from outside that filesysmtem, through a mount point - which will have a mnt_idmap. + +--- + +** mandatory** + +Functions try_lookup_one_len(), lookup_one_len(), +lookup_one_len_unlocked() and lookup_positive_unlocked() have been +renamed to try_lookup_noperm(), lookup_noperm(), +lookup_noperm_unlocked(), lookup_noperm_positive_unlocked(). They now +take a qstr instead of separate name and length. QSTR() can be used +when strlen() is needed for the length. + +For try_lookup_noperm() a reference to the qstr is passed in case the +hash might subsequently be needed. + +These function no longer do any permission checking - they previously +checked that the caller has 'X' permission on the parent. They must +ONLY be used internally by a filesystem on itself when it knows that +permissions are irrelevant or in a context where permission checks have +already been performed such as after vfs_path_parent_lookup() diff --git a/arch/s390/hypfs/inode.c b/arch/s390/hypfs/inode.c index 04ea1c03a5ff..96409573c75d 100644 --- a/arch/s390/hypfs/inode.c +++ b/arch/s390/hypfs/inode.c @@ -342,7 +342,7 @@ static struct dentry *hypfs_create_file(struct dentry *parent, const char *name, struct inode *inode; inode_lock(d_inode(parent)); - dentry = lookup_one_len(name, parent, strlen(name)); + dentry = lookup_noperm(&QSTR(name), parent); if (IS_ERR(dentry)) { dentry = ERR_PTR(-ENOMEM); goto fail; diff --git a/drivers/android/binderfs.c b/drivers/android/binderfs.c index 94c6446604fc..98da8c4eea59 100644 --- a/drivers/android/binderfs.c +++ b/drivers/android/binderfs.c @@ -187,7 +187,7 @@ static int binderfs_binder_device_create(struct inode *ref_inode, inode_lock(d_inode(root)); /* look it up */ - dentry = lookup_one_len(name, root, name_len); + dentry = lookup_noperm(&QSTR(name), root); if (IS_ERR(dentry)) { inode_unlock(d_inode(root)); ret = PTR_ERR(dentry); @@ -487,7 +487,7 @@ static struct dentry *binderfs_create_dentry(struct dentry *parent, { struct dentry *dentry; - dentry = lookup_one_len(name, parent, strlen(name)); + dentry = lookup_noperm(&QSTR(name), parent); if (IS_ERR(dentry)) return dentry; diff --git a/drivers/infiniband/hw/qib/qib_fs.c b/drivers/infiniband/hw/qib/qib_fs.c index b9f4a2937c3a..2098de762bf5 100644 --- a/drivers/infiniband/hw/qib/qib_fs.c +++ b/drivers/infiniband/hw/qib/qib_fs.c @@ -90,7 +90,7 @@ static int create_file(const char *name, umode_t mode, int error; inode_lock(d_inode(parent)); - *dentry = lookup_one_len(name, parent, strlen(name)); + *dentry = lookup_noperm(&QSTR(name), parent); if (!IS_ERR(*dentry)) error = qibfs_mknod(d_inode(parent), *dentry, mode, fops, data); @@ -433,7 +433,7 @@ static int remove_device_files(struct super_block *sb, char unit[10]; snprintf(unit, sizeof(unit), "%u", dd->unit); - dir = lookup_one_len_unlocked(unit, sb->s_root, strlen(unit)); + dir = lookup_noperm_unlocked(&QSTR(unit), sb->s_root); if (IS_ERR(dir)) { pr_err("Lookup of %s failed\n", unit); diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 9e7b1fe82c27..bfb69e066672 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -943,7 +943,7 @@ static struct dentry *afs_lookup_atsys(struct inode *dir, struct dentry *dentry) } strcpy(p, name); - ret = lookup_one_len(buf, dentry->d_parent, len); + ret = lookup_noperm(&QSTR(buf), dentry->d_parent); if (IS_ERR(ret) || d_is_positive(ret)) goto out_s; dput(ret); diff --git a/fs/afs/dir_silly.c b/fs/afs/dir_silly.c index a1e581946b93..0b80eb93fa40 100644 --- a/fs/afs/dir_silly.c +++ 
b/fs/afs/dir_silly.c @@ -113,16 +113,14 @@ int afs_sillyrename(struct afs_vnode *dvnode, struct afs_vnode *vnode, sdentry = NULL; do { - int slen; - dput(sdentry); sillycounter++; /* Create a silly name. Note that the ".__afs" prefix is * understood by the salvager and must not be changed. */ - slen = scnprintf(silly, sizeof(silly), ".__afs%04X", sillycounter); - sdentry = lookup_one_len(silly, dentry->d_parent, slen); + scnprintf(silly, sizeof(silly), ".__afs%04X", sillycounter); + sdentry = lookup_noperm(&QSTR(silly), dentry->d_parent); /* N.B. Better to return EBUSY here ... it could be dangerous * to delete the file while it's in use. diff --git a/fs/autofs/dev-ioctl.c b/fs/autofs/dev-ioctl.c index c5a6aae12d2c..d8dd150cbd74 100644 --- a/fs/autofs/dev-ioctl.c +++ b/fs/autofs/dev-ioctl.c @@ -459,7 +459,8 @@ static int autofs_dev_ioctl_timeout(struct file *fp, "the parent autofs mount timeout which could " "prevent shutdown\n"); - dentry = try_lookup_one_len(param->path, base, path_len); + dentry = try_lookup_noperm(&QSTR_LEN(param->path, path_len), + base); if (IS_ERR_OR_NULL(dentry)) return dentry ? PTR_ERR(dentry) : -ENOENT; ino = autofs_dentry_ino(dentry); diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 5a7ebd160724..432fbf4fc334 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -842,7 +842,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer, } inode_lock(d_inode(root)); - dentry = lookup_one_len(e->name, root, strlen(e->name)); + dentry = lookup_noperm(&QSTR(e->name), root); err = PTR_ERR(dentry); if (IS_ERR(dentry)) goto out; diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 75715d8877ee..30c4944e1862 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -346,7 +346,7 @@ struct dentry *debugfs_lookup(const char *name, struct dentry *parent) if (!parent) parent = debugfs_mount->mnt_root; - dentry = lookup_positive_unlocked(name, parent, strlen(name)); + dentry = lookup_noperm_positive_unlocked(&QSTR(name), parent); if (IS_ERR(dentry)) return NULL; return dentry; @@ -388,7 +388,7 @@ static struct dentry *start_creating(const char *name, struct dentry *parent) if (unlikely(IS_DEADDIR(d_inode(parent)))) dentry = ERR_PTR(-ENOENT); else - dentry = lookup_one_len(name, parent, strlen(name)); + dentry = lookup_noperm(&QSTR(name), parent); if (!IS_ERR(dentry) && d_really_is_positive(dentry)) { if (d_is_dir(dentry)) pr_err("Directory '%s' with parent '%s' already present!\n", @@ -872,7 +872,7 @@ int __printf(2, 3) debugfs_change_name(struct dentry *dentry, const char *fmt, . 
} if (strcmp(old_name.name.name, new_name) == 0) goto out; - target = lookup_one_len(new_name, parent, strlen(new_name)); + target = lookup_noperm(&QSTR(new_name), parent); if (IS_ERR(target)) { error = PTR_ERR(target); goto out; diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 51a5c54eb740..493d7f194956 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -394,8 +394,8 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, char *encrypted_and_encoded_name = NULL; struct ecryptfs_mount_crypt_stat *mount_crypt_stat; struct dentry *lower_dir_dentry, *lower_dentry; - const char *name = ecryptfs_dentry->d_name.name; - size_t len = ecryptfs_dentry->d_name.len; + struct qstr qname = QSTR_INIT(ecryptfs_dentry->d_name.name, + ecryptfs_dentry->d_name.len); struct dentry *res; int rc = 0; @@ -404,23 +404,25 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, mount_crypt_stat = &ecryptfs_superblock_to_private( ecryptfs_dentry->d_sb)->mount_crypt_stat; if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) { + size_t len = qname.len; rc = ecryptfs_encrypt_and_encode_filename( &encrypted_and_encoded_name, &len, - mount_crypt_stat, name, len); + mount_crypt_stat, qname.name, len); if (rc) { printk(KERN_ERR "%s: Error attempting to encrypt and encode " "filename; rc = [%d]\n", __func__, rc); return ERR_PTR(rc); } - name = encrypted_and_encoded_name; + qname.name = encrypted_and_encoded_name; + qname.len = len; } - lower_dentry = lookup_one_len_unlocked(name, lower_dir_dentry, len); + lower_dentry = lookup_noperm_unlocked(&qname, lower_dir_dentry); if (IS_ERR(lower_dentry)) { - ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned " + ecryptfs_printk(KERN_DEBUG, "%s: lookup_noperm() returned " "[%ld] on lower_dentry = [%s]\n", __func__, PTR_ERR(lower_dentry), - name); + qname.name); res = ERR_CAST(lower_dentry); } else { res = ecryptfs_lookup_interpose(ecryptfs_dentry, lower_dentry); diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index 5124e196c2bf..a82fbce25c28 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -255,7 +255,7 @@ struct dentry *kernfs_node_dentry(struct kernfs_node *kn, dput(dentry); return ERR_PTR(-ENOMEM); } - dtmp = lookup_positive_unlocked(name, dentry, strlen(name)); + dtmp = lookup_noperm_positive_unlocked(&QSTR(name), dentry); dput(dentry); kfree(name); if (IS_ERR(dtmp)) diff --git a/fs/namei.c b/fs/namei.c index e499fb609271..f397ab5704e7 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2834,9 +2834,9 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, } EXPORT_SYMBOL(vfs_path_lookup); -static int lookup_one_common(struct mnt_idmap *idmap, - const char *name, struct dentry *base, int len, - struct qstr *this) +static int lookup_noperm_common(const char *name, struct dentry *base, + int len, + struct qstr *this) { this->name = name; this->len = len; @@ -2861,50 +2861,58 @@ static int lookup_one_common(struct mnt_idmap *idmap, if (err < 0) return err; } + return 0; +} +static int lookup_one_common(struct mnt_idmap *idmap, + const char *name, struct dentry *base, int len, + struct qstr *this) { + int err; + err = lookup_noperm_common(name, base, len, this); + if (err < 0) + return err; return inode_permission(idmap, base->d_inode, MAY_EXEC); } /** - * try_lookup_one_len - filesystem helper to lookup single pathname component - * @name: pathname component to lookup + * try_lookup_noperm - filesystem helper to lookup single pathname component + * @name: qstr storing pathname component 
to lookup * @base: base directory to lookup from - * @len: maximum length @len should be interpreted to * * Look up a dentry by name in the dcache, returning NULL if it does not * currently exist. The function does not try to create a dentry. * * Note that this routine is purely a helper for filesystem usage and should - * not be called by generic code. + * not be called by generic code. It does no permission checking. * * No locks need be held - only a counted reference to @base is needed. * */ -struct dentry *try_lookup_one_len(const char *name, struct dentry *base, int len) +struct dentry *try_lookup_noperm(struct qstr *name, struct dentry *base) { struct qstr this; int err; - err = lookup_one_common(&nop_mnt_idmap, name, base, len, &this); + err = lookup_noperm_common(name->name, base, name->len, &this); if (err) return ERR_PTR(err); - return lookup_dcache(&this, base, 0); + name->hash = this.hash; + return lookup_dcache(name, base, 0); } -EXPORT_SYMBOL(try_lookup_one_len); +EXPORT_SYMBOL(try_lookup_noperm); /** - * lookup_one_len - filesystem helper to lookup single pathname component - * @name: pathname component to lookup + * lookup_noperm - filesystem helper to lookup single pathname component + * @name: qstr storing pathname component to lookup * @base: base directory to lookup from - * @len: maximum length @len should be interpreted to * * Note that this routine is purely a helper for filesystem usage and should - * not be called by generic code. + * not be called by generic code. It does no permission checking. * * The caller must hold base->i_mutex. */ -struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) +struct dentry *lookup_noperm(struct qstr *name, struct dentry *base) { struct dentry *dentry; struct qstr this; @@ -2912,14 +2920,14 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) WARN_ON_ONCE(!inode_is_locked(base->d_inode)); - err = lookup_one_common(&nop_mnt_idmap, name, base, len, &this); + err = lookup_noperm_common(name->name, base, name->len, &this); if (err) return ERR_PTR(err); dentry = lookup_dcache(&this, base, 0); return dentry ? dentry : __lookup_slow(&this, base, 0); } -EXPORT_SYMBOL(lookup_one_len); +EXPORT_SYMBOL(lookup_noperm); /** * lookup_one - lookup single pathname component @@ -2957,7 +2965,7 @@ EXPORT_SYMBOL(lookup_one); * * This can be used for in-kernel filesystem clients such as file servers. * - * Unlike lookup_one_len, it should be called without the parent + * Unlike lookup_one, it should be called without the parent * i_rwsem held, and will take the i_rwsem itself if necessary. */ struct dentry *lookup_one_unlocked(struct mnt_idmap *idmap, @@ -3010,39 +3018,48 @@ struct dentry *lookup_one_positive_unlocked(struct mnt_idmap *idmap, EXPORT_SYMBOL(lookup_one_positive_unlocked); /** - * lookup_one_len_unlocked - filesystem helper to lookup single pathname component + * lookup_noperm_unlocked - filesystem helper to lookup single pathname component * @name: pathname component to lookup * @base: base directory to lookup from - * @len: maximum length @len should be interpreted to * * Note that this routine is purely a helper for filesystem usage and should - * not be called by generic code. + * not be called by generic code. It does no permission checking. * - * Unlike lookup_one_len, it should be called without the parent - * i_mutex held, and will take the i_mutex itself if necessary. 
+ * Unlike lookup_noperm, it should be called without the parent + * i_rwsem held, and will take the i_rwsem itself if necessary. */ -struct dentry *lookup_one_len_unlocked(const char *name, - struct dentry *base, int len) +struct dentry *lookup_noperm_unlocked(struct qstr *name, struct dentry *base) { - return lookup_one_unlocked(&nop_mnt_idmap, &QSTR_LEN(name, len), base); + struct dentry *ret; + + ret = try_lookup_noperm(name, base); + if (!ret) + ret = lookup_slow(name, base, 0); + return ret; } -EXPORT_SYMBOL(lookup_one_len_unlocked); +EXPORT_SYMBOL(lookup_noperm_unlocked); /* - * Like lookup_one_len_unlocked(), except that it yields ERR_PTR(-ENOENT) + * Like lookup_noperm_unlocked(), except that it yields ERR_PTR(-ENOENT) * on negatives. Returns known positive or ERR_PTR(); that's what * most of the users want. Note that pinned negative with unlocked parent - * _can_ become positive at any time, so callers of lookup_one_len_unlocked() + * _can_ become positive at any time, so callers of lookup_noperm_unlocked() * need to be very careful; pinned positives have ->d_inode stable, so * this one avoids such problems. */ -struct dentry *lookup_positive_unlocked(const char *name, - struct dentry *base, int len) +struct dentry *lookup_noperm_positive_unlocked(struct qstr *name, + struct dentry *base) { - return lookup_one_positive_unlocked(&nop_mnt_idmap, - &QSTR_LEN(name, len), base); + struct dentry *ret; + + ret = lookup_noperm_unlocked(name, base); + if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) { + dput(ret); + ret = ERR_PTR(-ENOENT); + } + return ret; } -EXPORT_SYMBOL(lookup_positive_unlocked); +EXPORT_SYMBOL(lookup_noperm_positive_unlocked); #ifdef CONFIG_UNIX98_PTYS int path_pts(struct path *path) diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index bf77399696a7..b55467911648 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -464,18 +464,17 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry) sdentry = NULL; do { - int slen; dput(sdentry); sillycounter++; - slen = scnprintf(silly, sizeof(silly), - SILLYNAME_PREFIX "%0*llx%0*x", - SILLYNAME_FILEID_LEN, fileid, - SILLYNAME_COUNTER_LEN, sillycounter); + scnprintf(silly, sizeof(silly), + SILLYNAME_PREFIX "%0*llx%0*x", + SILLYNAME_FILEID_LEN, fileid, + SILLYNAME_COUNTER_LEN, sillycounter); dfprintk(VFS, "NFS: trying to rename %pd to %s\n", dentry, silly); - sdentry = lookup_one_len(silly, dentry->d_parent, slen); + sdentry = lookup_noperm(&QSTR(silly), dentry->d_parent); /* * N.B. Better to return EBUSY here ... it could be * dangerous to delete the file while it's in use. diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index 444aeeccb6da..83f80fdb1567 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -385,11 +385,9 @@ static struct dentry *ovl_lookup_real_one(struct dentry *connected, */ take_dentry_name_snapshot(&name, real); /* - * No idmap handling here: it's an internal lookup. Could skip - * permission checking altogether, but for now just use non-idmap - * transformed ids. + * No idmap handling here: it's an internal lookup. 
*/ - this = lookup_one_len(name.name.name, connected, name.name.len); + this = lookup_noperm(&name.name, connected); release_dentry_name_snapshot(&name); err = PTR_ERR(this); if (IS_ERR(this)) { diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index ac6e893e846a..bf722daf19a9 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -757,7 +757,7 @@ struct dentry *ovl_get_index_fh(struct ovl_fs *ofs, struct ovl_fh *fh) if (err) return ERR_PTR(err); - index = lookup_positive_unlocked(name.name, ofs->workdir, name.len); + index = lookup_noperm_positive_unlocked(&name, ofs->workdir); kfree(name.name); if (IS_ERR(index)) { if (PTR_ERR(index) == -ENOENT) diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 825c5c2e0962..df4a9b348769 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2560,7 +2560,7 @@ int dquot_quota_on_mount(struct super_block *sb, char *qf_name, struct dentry *dentry; int error; - dentry = lookup_positive_unlocked(qf_name, sb->s_root, strlen(qf_name)); + dentry = lookup_noperm_positive_unlocked(&QSTR(qf_name), sb->s_root); if (IS_ERR(dentry)) return PTR_ERR(dentry); diff --git a/fs/smb/client/cached_dir.c b/fs/smb/client/cached_dir.c index fe738623cf1b..e6fc92667d41 100644 --- a/fs/smb/client/cached_dir.c +++ b/fs/smb/client/cached_dir.c @@ -109,7 +109,8 @@ path_to_dentry(struct cifs_sb_info *cifs_sb, const char *path) while (*s && *s != sep) s++; - child = lookup_positive_unlocked(p, dentry, s - p); + child = lookup_noperm_positive_unlocked(&QSTR_LEN(p, s - p), + dentry); dput(dentry); dentry = child; } while (!IS_ERR(dentry)); @@ -207,7 +208,7 @@ replay_again: spin_unlock(&cfids->cfid_list_lock); /* - * Skip any prefix paths in @path as lookup_positive_unlocked() ends up + * Skip any prefix paths in @path as lookup_noperm_positive_unlocked() ends up * calling ->lookup() which already adds those through * build_path_from_dentry(). Also, do it earlier as we might reconnect * below when trying to send compounded request and then potentially diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index a08c42363ffc..fb04e263611c 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -929,7 +929,8 @@ cifs_get_root(struct smb3_fs_context *ctx, struct super_block *sb) while (*s && *s != sep) s++; - child = lookup_positive_unlocked(p, dentry, s - p); + child = lookup_noperm_positive_unlocked(&QSTR_LEN(p, s - p), + dentry); dput(dentry); dentry = child; } while (!IS_ERR(dentry)); diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index cb1af30b49f5..a3fd3cc591bd 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -555,7 +555,7 @@ struct dentry *tracefs_start_creating(const char *name, struct dentry *parent) if (unlikely(IS_DEADDIR(d_inode(parent)))) dentry = ERR_PTR(-ENOENT); else - dentry = lookup_one_len(name, parent, strlen(name)); + dentry = lookup_noperm(&QSTR(name), parent); if (!IS_ERR(dentry) && d_inode(dentry)) { dput(dentry); dentry = ERR_PTR(-EEXIST); diff --git a/fs/xfs/scrub/orphanage.c b/fs/xfs/scrub/orphanage.c index 3537f3cca6d5..475bb899b2c6 100644 --- a/fs/xfs/scrub/orphanage.c +++ b/fs/xfs/scrub/orphanage.c @@ -153,8 +153,7 @@ xrep_orphanage_create( /* Try to find the orphanage directory. 
*/ inode_lock_nested(root_inode, I_MUTEX_PARENT); - orphanage_dentry = lookup_one_len(ORPHANAGE, root_dentry, - strlen(ORPHANAGE)); + orphanage_dentry = lookup_noperm(&QSTR(ORPHANAGE), root_dentry); if (IS_ERR(orphanage_dentry)) { error = PTR_ERR(orphanage_dentry); goto out_unlock_root; diff --git a/include/linux/namei.h b/include/linux/namei.h index ba02304a2a8a..cb6ce97782e0 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -69,10 +69,10 @@ int vfs_path_parent_lookup(struct filename *filename, unsigned int flags, int vfs_path_lookup(struct dentry *, struct vfsmount *, const char *, unsigned int, struct path *); -extern struct dentry *try_lookup_one_len(const char *, struct dentry *, int); -extern struct dentry *lookup_one_len(const char *, struct dentry *, int); -extern struct dentry *lookup_one_len_unlocked(const char *, struct dentry *, int); -extern struct dentry *lookup_positive_unlocked(const char *, struct dentry *, int); +extern struct dentry *try_lookup_noperm(struct qstr *, struct dentry *); +extern struct dentry *lookup_noperm(struct qstr *, struct dentry *); +extern struct dentry *lookup_noperm_unlocked(struct qstr *, struct dentry *); +extern struct dentry *lookup_noperm_positive_unlocked(struct qstr *, struct dentry *); struct dentry *lookup_one(struct mnt_idmap *, struct qstr *, struct dentry *); struct dentry *lookup_one_unlocked(struct mnt_idmap *idmap, struct qstr *name, struct dentry *base); diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 35b4f8659904..82ed2d3c9846 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -913,7 +913,7 @@ static int do_mq_open(const char __user *u_name, int oflag, umode_t mode, ro = mnt_want_write(mnt); /* we'll drop it in any case */ inode_lock(d_inode(root)); - path.dentry = lookup_one_len(name->name, root, strlen(name->name)); + path.dentry = lookup_noperm(&QSTR(name->name), root); if (IS_ERR(path.dentry)) { error = PTR_ERR(path.dentry); goto out_putfd; @@ -969,8 +969,7 @@ SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name) if (err) goto out_name; inode_lock_nested(d_inode(mnt->mnt_root), I_MUTEX_PARENT); - dentry = lookup_one_len(name->name, mnt->mnt_root, - strlen(name->name)); + dentry = lookup_noperm(&QSTR(name->name), mnt->mnt_root); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); goto out_unlock; diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index dc3aa91a6ba0..5c2e96b19392 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -421,7 +421,7 @@ static int bpf_iter_link_pin_kernel(struct dentry *parent, int ret; inode_lock(parent->d_inode); - dentry = lookup_one_len(name, parent, strlen(name)); + dentry = lookup_noperm(&QSTR(name), parent); if (IS_ERR(dentry)) { inode_unlock(parent->d_inode); return PTR_ERR(dentry); diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c index 6039afae4bfc..0aef34b9609b 100644 --- a/security/apparmor/apparmorfs.c +++ b/security/apparmor/apparmorfs.c @@ -283,7 +283,7 @@ static struct dentry *aafs_create(const char *name, umode_t mode, dir = d_inode(parent); inode_lock(dir); - dentry = lookup_one_len(name, parent, strlen(name)); + dentry = lookup_noperm(&QSTR(name), parent); if (IS_ERR(dentry)) { error = PTR_ERR(dentry); goto fail_lock; @@ -2551,7 +2551,7 @@ static int aa_mk_null_file(struct dentry *parent) return error; inode_lock(d_inode(parent)); - dentry = lookup_one_len(NULL_FILE_NAME, parent, strlen(NULL_FILE_NAME)); + dentry = lookup_noperm(&QSTR(NULL_FILE_NAME), parent); if (IS_ERR(dentry)) { error = PTR_ERR(dentry); goto out; diff --git 
a/security/inode.c b/security/inode.c index da3ab44c8e57..3913501621fa 100644 --- a/security/inode.c +++ b/security/inode.c @@ -128,7 +128,7 @@ static struct dentry *securityfs_create_dentry(const char *name, umode_t mode, dir = d_inode(parent); inode_lock(dir); - dentry = lookup_one_len(name, parent, strlen(name)); + dentry = lookup_noperm(&QSTR(name), parent); if (IS_ERR(dentry)) goto out; -- cgit v1.2.3 From 06c567403ae5a0b56005c2d4a184c903f572c844 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 19 Mar 2025 14:01:36 +1100 Subject: Use try_lookup_noperm() instead of d_hash_and_lookup() outside of VFS try_lookup_noperm() and d_hash_and_lookup() are nearly identical. The former does some validation of the name where the latter doesn't. Outside of the VFS that validation is likely valuable, and having only one exported function for this task is certainly a good idea. So make d_hash_and_lookup() local to VFS files and change all other callers to try_lookup_noperm(). Note that the arguments are swapped. Signed-off-by: NeilBrown Link: https://lore.kernel.org/r/20250319031545.2999807-6-neil@brown.name Signed-off-by: Christian Brauner --- Documentation/filesystems/porting.rst | 11 +++++++++++ fs/dcache.c | 1 - fs/efivarfs/super.c | 15 +++++---------- fs/internal.h | 1 + fs/proc/base.c | 2 +- fs/smb/client/readdir.c | 3 ++- fs/xfs/scrub/orphanage.c | 4 ++-- include/linux/dcache.h | 1 - net/sunrpc/rpc_pipe.c | 12 ++++++------ security/selinux/selinuxfs.c | 4 ++-- 10 files changed, 30 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst index 9150de7f64f1..3111ef5592f3 100644 --- a/Documentation/filesystems/porting.rst +++ b/Documentation/filesystems/porting.rst @@ -1232,3 +1232,14 @@ checked that the caller has 'X' permission on the parent. They must ONLY be used internally by a filesystem on itself when it knows that permissions are irrelevant or in a context where permission checks have already been performed such as after vfs_path_parent_lookup() + +--- + +** mandatory** + +d_hash_and_lookup() is no longer exported or available outside the VFS. +Use try_lookup_noperm() instead. This adds name validation and takes +arguments in the opposite order but is otherwise identical. + +Using try_lookup_noperm() will require linux/namei.h to be included. 
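As a rough illustration of that conversion, here is a minimal sketch of a hypothetical out-of-VFS caller (find_child() is an illustrative name, not taken from any of the patched files); it relies only on the QSTR() helper already used throughout this series:

#include <linux/dcache.h>
#include <linux/namei.h>	/* now needed for try_lookup_noperm() */

/* Look up a child of @dir by name in the dcache, without permission checks. */
static struct dentry *find_child(struct dentry *dir, const char *name)
{
	/* before: return d_hash_and_lookup(dir, &QSTR(name)); */
	return try_lookup_noperm(&QSTR(name), dir);	/* note the swapped arguments */
}

As with d_hash_and_lookup(), a NULL return means the entry is not currently in the dcache; validation and other errors come back as ERR_PTR().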
+ diff --git a/fs/dcache.c b/fs/dcache.c index bd5aa136153a..89f4acab08c0 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2412,7 +2412,6 @@ struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name) } return d_lookup(dir, name); } -EXPORT_SYMBOL(d_hash_and_lookup); /* * When a file is deleted, we have two options: diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index 0486e9b68bc6..b2de4079864c 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "internal.h" @@ -204,7 +205,6 @@ bool efivarfs_variable_is_present(efi_char16_t *variable_name, char *name = efivar_get_utf8name(variable_name, vendor); struct super_block *sb = data; struct dentry *dentry; - struct qstr qstr; if (!name) /* @@ -217,9 +217,7 @@ bool efivarfs_variable_is_present(efi_char16_t *variable_name, */ return true; - qstr.name = name; - qstr.len = strlen(name); - dentry = d_hash_and_lookup(sb->s_root, &qstr); + dentry = try_lookup_noperm(&QSTR(name), sb->s_root); kfree(name); if (!IS_ERR_OR_NULL(dentry)) dput(dentry); @@ -404,8 +402,8 @@ static bool efivarfs_actor(struct dir_context *ctx, const char *name, int len, { unsigned long size; struct efivarfs_ctx *ectx = container_of(ctx, struct efivarfs_ctx, ctx); - struct qstr qstr = { .name = name, .len = len }; - struct dentry *dentry = d_hash_and_lookup(ectx->sb->s_root, &qstr); + struct dentry *dentry = try_lookup_noperm(&QSTR_LEN(name, len), + ectx->sb->s_root); struct inode *inode; struct efivar_entry *entry; int err; @@ -441,7 +439,6 @@ static int efivarfs_check_missing(efi_char16_t *name16, efi_guid_t vendor, char *name; struct super_block *sb = data; struct dentry *dentry; - struct qstr qstr; int err; if (guid_equal(&vendor, &LINUX_EFI_RANDOM_SEED_TABLE_GUID)) @@ -451,9 +448,7 @@ static int efivarfs_check_missing(efi_char16_t *name16, efi_guid_t vendor, if (!name) return -ENOMEM; - qstr.name = name; - qstr.len = strlen(name); - dentry = d_hash_and_lookup(sb->s_root, &qstr); + dentry = try_lookup_noperm(&QSTR(name), sb->s_root); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); goto out; diff --git a/fs/internal.h b/fs/internal.h index b9b3e29a73fd..213bf3226213 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -66,6 +66,7 @@ int do_linkat(int olddfd, struct filename *old, int newdfd, int vfs_tmpfile(struct mnt_idmap *idmap, const struct path *parentpath, struct file *file, umode_t mode); +struct dentry *d_hash_and_lookup(struct dentry *, struct qstr *); /* * namespace.c diff --git a/fs/proc/base.c b/fs/proc/base.c index b0d4e1908b22..fe33a5843fbd 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2121,7 +2121,7 @@ bool proc_fill_cache(struct file *file, struct dir_context *ctx, unsigned type = DT_UNKNOWN; ino_t ino = 1; - child = d_hash_and_lookup(dir, &qname); + child = try_lookup_noperm(&qname, dir); if (!child) { DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); child = d_alloc_parallel(dir, &qname, &wq); diff --git a/fs/smb/client/readdir.c b/fs/smb/client/readdir.c index 50f96259d9ad..7329ec532bcf 100644 --- a/fs/smb/client/readdir.c +++ b/fs/smb/client/readdir.c @@ -9,6 +9,7 @@ * */ #include +#include #include #include #include @@ -78,7 +79,7 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name, cifs_dbg(FYI, "%s: for %s\n", __func__, name->name); - dentry = d_hash_and_lookup(parent, name); + dentry = try_lookup_noperm(name, parent); if (!dentry) { /* * If we know that the inode will need to be revalidated diff --git a/fs/xfs/scrub/orphanage.c b/fs/xfs/scrub/orphanage.c index 
475bb899b2c6..9c12cb844231 100644 --- a/fs/xfs/scrub/orphanage.c +++ b/fs/xfs/scrub/orphanage.c @@ -444,7 +444,7 @@ xrep_adoption_check_dcache( if (!d_orphanage) return 0; - d_child = d_hash_and_lookup(d_orphanage, &qname); + d_child = try_lookup_noperm(&qname, d_orphanage); if (d_child) { trace_xrep_adoption_check_child(sc->mp, d_child); @@ -481,7 +481,7 @@ xrep_adoption_zap_dcache( if (!d_orphanage) return; - d_child = d_hash_and_lookup(d_orphanage, &qname); + d_child = try_lookup_noperm(&qname, d_orphanage); while (d_child != NULL) { trace_xrep_adoption_invalidate_child(sc->mp, d_child); diff --git a/include/linux/dcache.h b/include/linux/dcache.h index e974e63bcdbc..a324f82df562 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -288,7 +288,6 @@ extern void d_exchange(struct dentry *, struct dentry *); extern struct dentry *d_ancestor(struct dentry *, struct dentry *); extern struct dentry *d_lookup(const struct dentry *, const struct qstr *); -extern struct dentry *d_hash_and_lookup(struct dentry *, struct qstr *); static inline unsigned d_count(const struct dentry *dentry) { diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index eadc00410ebc..98f78cd55905 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -631,7 +631,7 @@ static struct dentry *__rpc_lookup_create_exclusive(struct dentry *parent, const char *name) { struct qstr q = QSTR(name); - struct dentry *dentry = d_hash_and_lookup(parent, &q); + struct dentry *dentry = try_lookup_noperm(&q, parent); if (!dentry) { dentry = d_alloc(parent, &q); if (!dentry) @@ -658,7 +658,7 @@ static void __rpc_depopulate(struct dentry *parent, for (i = start; i < eof; i++) { name.name = files[i].name; name.len = strlen(files[i].name); - dentry = d_hash_and_lookup(parent, &name); + dentry = try_lookup_noperm(&name, parent); if (dentry == NULL) continue; @@ -1190,7 +1190,7 @@ static const struct rpc_filelist files[] = { struct dentry *rpc_d_lookup_sb(const struct super_block *sb, const unsigned char *dir_name) { - return d_hash_and_lookup(sb->s_root, &QSTR(dir_name)); + return try_lookup_noperm(&QSTR(dir_name), sb->s_root); } EXPORT_SYMBOL_GPL(rpc_d_lookup_sb); @@ -1301,7 +1301,7 @@ rpc_gssd_dummy_populate(struct dentry *root, struct rpc_pipe *pipe_data) struct dentry *pipe_dentry = NULL; /* We should never get this far if "gssd" doesn't exist */ - gssd_dentry = d_hash_and_lookup(root, &QSTR(files[RPCAUTH_gssd].name)); + gssd_dentry = try_lookup_noperm(&QSTR(files[RPCAUTH_gssd].name), root); if (!gssd_dentry) return ERR_PTR(-ENOENT); @@ -1311,8 +1311,8 @@ rpc_gssd_dummy_populate(struct dentry *root, struct rpc_pipe *pipe_data) goto out; } - clnt_dentry = d_hash_and_lookup(gssd_dentry, - &QSTR(gssd_dummy_clnt_dir[0].name)); + clnt_dentry = try_lookup_noperm(&QSTR(gssd_dummy_clnt_dir[0].name), + gssd_dentry); if (!clnt_dentry) { __rpc_depopulate(gssd_dentry, gssd_dummy_clnt_dir, 0, 1); pipe_dentry = ERR_PTR(-ENOENT); diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c index 47480eb2189b..e67a8ce4b64c 100644 --- a/security/selinux/selinuxfs.c +++ b/security/selinux/selinuxfs.c @@ -2158,8 +2158,8 @@ static int __init init_sel_fs(void) return err; } - selinux_null.dentry = d_hash_and_lookup(selinux_null.mnt->mnt_root, - &null_name); + selinux_null.dentry = try_lookup_noperm(&null_name, + selinux_null.mnt->mnt_root); if (IS_ERR(selinux_null.dentry)) { pr_err("selinuxfs: could not lookup null!\n"); err = PTR_ERR(selinux_null.dentry); -- cgit v1.2.3 From 
da916e96e2dedcb2d40de77a7def833d315b81a6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 25 Oct 2024 10:21:41 +0200 Subject: perf: Make perf_pmu_unregister() useable Previously it was only safe to call perf_pmu_unregister() if there were no active events of that pmu around -- which was impossible to guarantee since it races all sorts against perf_init_event(). Rework the whole thing by: - keeping track of all events for a given pmu - 'hiding' the pmu from perf_init_event() - waiting for the appropriate (s)rcu grace periods such that all prior references to the PMU will be completed - detaching all still existing events of that pmu (see first point) and moving them to a new REVOKED state. - actually freeing the pmu data. Where notably the new REVOKED state must inhibit all event actions from reaching code that wants to use event->pmu. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ravi Bangoria Link: https://lkml.kernel.org/r/20250307193723.525402029@infradead.org --- include/linux/perf_event.h | 15 ++- kernel/events/core.c | 320 ++++++++++++++++++++++++++++++++++++++------- 2 files changed, 280 insertions(+), 55 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 0069ba6866a4..7f49a58b271d 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -325,6 +325,9 @@ struct perf_output_handle; struct pmu { struct list_head entry; + spinlock_t events_lock; + struct list_head events; + struct module *module; struct device *dev; struct device *parent; @@ -622,9 +625,10 @@ struct perf_addr_filter_range { * enum perf_event_state - the states of an event: */ enum perf_event_state { - PERF_EVENT_STATE_DEAD = -4, - PERF_EVENT_STATE_EXIT = -3, - PERF_EVENT_STATE_ERROR = -2, + PERF_EVENT_STATE_DEAD = -5, + PERF_EVENT_STATE_REVOKED = -4, /* pmu gone, must not touch */ + PERF_EVENT_STATE_EXIT = -3, /* task died, still inherit */ + PERF_EVENT_STATE_ERROR = -2, /* scheduling error, can enable */ PERF_EVENT_STATE_OFF = -1, PERF_EVENT_STATE_INACTIVE = 0, PERF_EVENT_STATE_ACTIVE = 1, @@ -865,6 +869,7 @@ struct perf_event { void *security; #endif struct list_head sb_list; + struct list_head pmu_list; /* * Certain events gets forwarded to another pmu internally by over- @@ -1155,7 +1160,7 @@ extern void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags); extern void perf_event_itrace_started(struct perf_event *event); extern int perf_pmu_register(struct pmu *pmu, const char *name, int type); -extern void perf_pmu_unregister(struct pmu *pmu); +extern int perf_pmu_unregister(struct pmu *pmu); extern void __perf_event_task_sched_in(struct task_struct *prev, struct task_struct *task); @@ -1760,7 +1765,7 @@ static inline bool needs_branch_stack(struct perf_event *event) static inline bool has_aux(struct perf_event *event) { - return event->pmu->setup_aux; + return event->pmu && event->pmu->setup_aux; } static inline bool has_aux_action(struct perf_event *event) diff --git a/kernel/events/core.c b/kernel/events/core.c index 985b5c7d04e0..2eb9cd5d86a1 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -208,6 +208,7 @@ static void perf_ctx_unlock(struct perf_cpu_context *cpuctx, } #define TASK_TOMBSTONE ((void *)-1L) +#define EVENT_TOMBSTONE ((void *)-1L) static bool is_kernel_event(struct perf_event *event) { @@ -2336,6 +2337,11 @@ static void perf_child_detach(struct perf_event *event) sync_child_event(event); list_del_init(&event->child_list); + /* + * Cannot set to NULL, as that would confuse 
the situation vs + * not being a child event. See for example unaccount_event(). + */ + event->parent = EVENT_TOMBSTONE; } static bool is_orphaned_event(struct perf_event *event) @@ -2457,8 +2463,9 @@ ctx_time_update_event(struct perf_event_context *ctx, struct perf_event *event) #define DETACH_GROUP 0x01UL #define DETACH_CHILD 0x02UL -#define DETACH_DEAD 0x04UL -#define DETACH_EXIT 0x08UL +#define DETACH_EXIT 0x04UL +#define DETACH_REVOKE 0x08UL +#define DETACH_DEAD 0x10UL /* * Cross CPU call to remove a performance event @@ -2484,18 +2491,21 @@ __perf_remove_from_context(struct perf_event *event, */ if (flags & DETACH_EXIT) state = PERF_EVENT_STATE_EXIT; + if (flags & DETACH_REVOKE) + state = PERF_EVENT_STATE_REVOKED; if (flags & DETACH_DEAD) { event->pending_disable = 1; state = PERF_EVENT_STATE_DEAD; } event_sched_out(event, ctx); - perf_event_set_state(event, min(event->state, state)); if (flags & DETACH_GROUP) perf_group_detach(event); if (flags & DETACH_CHILD) perf_child_detach(event); list_del_event(event, ctx); + event->state = min(event->state, state); + if (!pmu_ctx->nr_events) { pmu_ctx->rotate_necessary = 0; @@ -4523,7 +4533,8 @@ out: static void perf_remove_from_owner(struct perf_event *event); static void perf_event_exit_event(struct perf_event *event, - struct perf_event_context *ctx); + struct perf_event_context *ctx, + bool revoke); /* * Removes all events from the current task that have been marked @@ -4550,7 +4561,7 @@ static void perf_event_remove_on_exec(struct perf_event_context *ctx) modified = true; - perf_event_exit_event(event, ctx); + perf_event_exit_event(event, ctx, false); } raw_spin_lock_irqsave(&ctx->lock, flags); @@ -5132,6 +5143,7 @@ static bool is_sb_event(struct perf_event *event) attr->context_switch || attr->text_poke || attr->bpf_event) return true; + return false; } @@ -5528,6 +5540,8 @@ static void perf_free_addr_filters(struct perf_event *event); /* vs perf_event_alloc() error */ static void __free_event(struct perf_event *event) { + struct pmu *pmu = event->pmu; + if (event->attach_state & PERF_ATTACH_CALLCHAIN) put_callchain_buffers(); @@ -5557,6 +5571,7 @@ static void __free_event(struct perf_event *event) * put_pmu_ctx() needs an event->ctx reference, because of * epc->ctx. */ + WARN_ON_ONCE(!pmu); WARN_ON_ONCE(!event->ctx); WARN_ON_ONCE(event->pmu_ctx->ctx != event->ctx); put_pmu_ctx(event->pmu_ctx); @@ -5569,8 +5584,13 @@ static void __free_event(struct perf_event *event) if (event->ctx) put_ctx(event->ctx); - if (event->pmu) - module_put(event->pmu->module); + if (pmu) { + module_put(pmu->module); + scoped_guard (spinlock, &pmu->events_lock) { + list_del(&event->pmu_list); + wake_up_var(pmu); + } + } call_rcu(&event->rcu_head, free_event_rcu); } @@ -5605,22 +5625,6 @@ static void _free_event(struct perf_event *event) __free_event(event); } -/* - * Used to free events which have a known refcount of 1, such as in error paths - * where the event isn't exposed yet and inherited events. - */ -static void free_event(struct perf_event *event) -{ - if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1, - "unexpected event refcount: %ld; ptr=%p\n", - atomic_long_read(&event->refcount), event)) { - /* leak to avoid use-after-free */ - return; - } - - _free_event(event); -} - /* * Remove user event from the owner task. */ @@ -5724,7 +5728,11 @@ int perf_event_release_kernel(struct perf_event *event) * Thus this guarantees that we will in fact observe and kill _ALL_ * child events. 
*/ - perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD); + if (event->state > PERF_EVENT_STATE_REVOKED) { + perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD); + } else { + event->state = PERF_EVENT_STATE_DEAD; + } perf_event_ctx_unlock(event, ctx); @@ -6013,7 +6021,7 @@ __perf_read(struct perf_event *event, char __user *buf, size_t count) * error state (i.e. because it was pinned but it couldn't be * scheduled on to the CPU at some point). */ - if (event->state == PERF_EVENT_STATE_ERROR) + if (event->state <= PERF_EVENT_STATE_ERROR) return 0; if (count < event->read_size) @@ -6052,8 +6060,14 @@ static __poll_t perf_poll(struct file *file, poll_table *wait) struct perf_buffer *rb; __poll_t events = EPOLLHUP; + if (event->state <= PERF_EVENT_STATE_REVOKED) + return EPOLLERR; + poll_wait(file, &event->waitq, wait); + if (event->state <= PERF_EVENT_STATE_REVOKED) + return EPOLLERR; + if (is_event_hup(event)) return events; @@ -6232,6 +6246,9 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon void (*func)(struct perf_event *); u32 flags = arg; + if (event->state <= PERF_EVENT_STATE_REVOKED) + return -ENODEV; + switch (cmd) { case PERF_EVENT_IOC_ENABLE: func = _perf_event_enable; @@ -6607,9 +6624,22 @@ void ring_buffer_put(struct perf_buffer *rb) call_rcu(&rb->rcu_head, rb_free_rcu); } +typedef void (*mapped_f)(struct perf_event *event, struct mm_struct *mm); + +#define get_mapped(event, func) \ +({ struct pmu *pmu; \ + mapped_f f = NULL; \ + guard(rcu)(); \ + pmu = READ_ONCE(event->pmu); \ + if (pmu) \ + f = pmu->func; \ + f; \ +}) + static void perf_mmap_open(struct vm_area_struct *vma) { struct perf_event *event = vma->vm_file->private_data; + mapped_f mapped = get_mapped(event, event_mapped); atomic_inc(&event->mmap_count); atomic_inc(&event->rb->mmap_count); @@ -6617,8 +6647,8 @@ static void perf_mmap_open(struct vm_area_struct *vma) if (vma->vm_pgoff) atomic_inc(&event->rb->aux_mmap_count); - if (event->pmu->event_mapped) - event->pmu->event_mapped(event, vma->vm_mm); + if (mapped) + mapped(event, vma->vm_mm); } static void perf_pmu_output_stop(struct perf_event *event); @@ -6634,14 +6664,16 @@ static void perf_pmu_output_stop(struct perf_event *event); static void perf_mmap_close(struct vm_area_struct *vma) { struct perf_event *event = vma->vm_file->private_data; + mapped_f unmapped = get_mapped(event, event_unmapped); struct perf_buffer *rb = ring_buffer_get(event); struct user_struct *mmap_user = rb->mmap_user; int mmap_locked = rb->mmap_locked; unsigned long size = perf_data_size(rb); bool detach_rest = false; - if (event->pmu->event_unmapped) - event->pmu->event_unmapped(event, vma->vm_mm); + /* FIXIES vs perf_pmu_unregister() */ + if (unmapped) + unmapped(event, vma->vm_mm); /* * The AUX buffer is strictly a sub-buffer, serialize using aux_mutex @@ -6834,6 +6866,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) unsigned long nr_pages; long user_extra = 0, extra = 0; int ret, flags = 0; + mapped_f mapped; /* * Don't allow mmap() of inherited per-task counters. This would @@ -6864,6 +6897,16 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) mutex_lock(&event->mmap_mutex); ret = -EINVAL; + /* + * This relies on __pmu_detach_event() taking mmap_mutex after marking + * the event REVOKED. Either we observe the state, or __pmu_detach_event() + * will detach the rb created here. 
+ */ + if (event->state <= PERF_EVENT_STATE_REVOKED) { + ret = -ENODEV; + goto unlock; + } + if (vma->vm_pgoff == 0) { nr_pages -= 1; @@ -7042,8 +7085,9 @@ aux_unlock: if (!ret) ret = map_range(rb, vma); - if (!ret && event->pmu->event_mapped) - event->pmu->event_mapped(event, vma->vm_mm); + mapped = get_mapped(event, event_mapped); + if (mapped) + mapped(event, vma->vm_mm); return ret; } @@ -7054,6 +7098,9 @@ static int perf_fasync(int fd, struct file *filp, int on) struct perf_event *event = filp->private_data; int retval; + if (event->state <= PERF_EVENT_STATE_REVOKED) + return -ENODEV; + inode_lock(inode); retval = fasync_helper(fd, filp, on, &event->fasync); inode_unlock(inode); @@ -11062,6 +11109,9 @@ static int __perf_event_set_bpf_prog(struct perf_event *event, { bool is_kprobe, is_uprobe, is_tracepoint, is_syscall_tp; + if (event->state <= PERF_EVENT_STATE_REVOKED) + return -ENODEV; + if (!perf_event_is_tracing(event)) return perf_event_set_bpf_handler(event, prog, bpf_cookie); @@ -12245,6 +12295,9 @@ int perf_pmu_register(struct pmu *_pmu, const char *name, int type) if (!pmu->event_idx) pmu->event_idx = perf_event_idx_default; + INIT_LIST_HEAD(&pmu->events); + spin_lock_init(&pmu->events_lock); + /* * Now that the PMU is complete, make it visible to perf_try_init_event(). */ @@ -12258,21 +12311,143 @@ int perf_pmu_register(struct pmu *_pmu, const char *name, int type) } EXPORT_SYMBOL_GPL(perf_pmu_register); -void perf_pmu_unregister(struct pmu *pmu) +static void __pmu_detach_event(struct pmu *pmu, struct perf_event *event, + struct perf_event_context *ctx) +{ + /* + * De-schedule the event and mark it REVOKED. + */ + perf_event_exit_event(event, ctx, true); + + /* + * All _free_event() bits that rely on event->pmu: + * + * Notably, perf_mmap() relies on the ordering here. + */ + scoped_guard (mutex, &event->mmap_mutex) { + WARN_ON_ONCE(pmu->event_unmapped); + /* + * Mostly an empty lock sequence, such that perf_mmap(), which + * relies on mmap_mutex, is sure to observe the state change. 
+ */ + } + + perf_event_free_bpf_prog(event); + perf_free_addr_filters(event); + + if (event->destroy) { + event->destroy(event); + event->destroy = NULL; + } + + if (event->pmu_ctx) { + put_pmu_ctx(event->pmu_ctx); + event->pmu_ctx = NULL; + } + + exclusive_event_destroy(event); + module_put(pmu->module); + + event->pmu = NULL; /* force fault instead of UAF */ +} + +static void pmu_detach_event(struct pmu *pmu, struct perf_event *event) +{ + struct perf_event_context *ctx; + + ctx = perf_event_ctx_lock(event); + __pmu_detach_event(pmu, event, ctx); + perf_event_ctx_unlock(event, ctx); + + scoped_guard (spinlock, &pmu->events_lock) + list_del(&event->pmu_list); +} + +static struct perf_event *pmu_get_event(struct pmu *pmu) +{ + struct perf_event *event; + + guard(spinlock)(&pmu->events_lock); + list_for_each_entry(event, &pmu->events, pmu_list) { + if (atomic_long_inc_not_zero(&event->refcount)) + return event; + } + + return NULL; +} + +static bool pmu_empty(struct pmu *pmu) +{ + guard(spinlock)(&pmu->events_lock); + return list_empty(&pmu->events); +} + +static void pmu_detach_events(struct pmu *pmu) +{ + struct perf_event *event; + + for (;;) { + event = pmu_get_event(pmu); + if (!event) + break; + + pmu_detach_event(pmu, event); + put_event(event); + } + + /* + * wait for pending _free_event()s + */ + wait_var_event(pmu, pmu_empty(pmu)); +} + +int perf_pmu_unregister(struct pmu *pmu) { scoped_guard (mutex, &pmus_lock) { + if (!idr_cmpxchg(&pmu_idr, pmu->type, pmu, NULL)) + return -EINVAL; + list_del_rcu(&pmu->entry); - idr_remove(&pmu_idr, pmu->type); } /* * We dereference the pmu list under both SRCU and regular RCU, so * synchronize against both of those. + * + * Notably, the entirety of event creation, from perf_init_event() + * (which will now fail, because of the above) until + * perf_install_in_context() should be under SRCU such that + * this synchronizes against event creation. This avoids trying to + * detach events that are not fully formed. */ synchronize_srcu(&pmus_srcu); synchronize_rcu(); + if (pmu->event_unmapped && !pmu_empty(pmu)) { + /* + * Can't force remove events when pmu::event_unmapped() + * is used in perf_mmap_close(). + */ + guard(mutex)(&pmus_lock); + idr_cmpxchg(&pmu_idr, pmu->type, NULL, pmu); + list_add_rcu(&pmu->entry, &pmus); + return -EBUSY; + } + + scoped_guard (mutex, &pmus_lock) + idr_remove(&pmu_idr, pmu->type); + + /* + * PMU is removed from the pmus list, so no new events will + * be created, now take care of the existing ones. + */ + pmu_detach_events(pmu); + + /* + * PMU is unused, make it go away. + */ perf_pmu_free(pmu); + return 0; } EXPORT_SYMBOL_GPL(perf_pmu_unregister); @@ -12366,7 +12541,7 @@ static struct pmu *perf_init_event(struct perf_event *event) struct pmu *pmu; int type, ret; - guard(srcu)(&pmus_srcu); + guard(srcu)(&pmus_srcu); /* pmu idr/list access */ /* * Save original type before calling pmu->event_init() since certain @@ -12590,6 +12765,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, INIT_LIST_HEAD(&event->active_entry); INIT_LIST_HEAD(&event->addr_filters.list); INIT_HLIST_NODE(&event->hlist_entry); + INIT_LIST_HEAD(&event->pmu_list); init_waitqueue_head(&event->waitq); @@ -12768,6 +12944,13 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, /* symmetric to unaccount_event() in _free_event() */ account_event(event); + /* + * Event creation should be under SRCU, see perf_pmu_unregister(). 
+ */ + lockdep_assert_held(&pmus_srcu); + scoped_guard (spinlock, &pmu->events_lock) + list_add(&event->pmu_list, &pmu->events); + return_ptr(event); } @@ -12967,6 +13150,9 @@ set: goto unlock; if (output_event) { + if (output_event->state <= PERF_EVENT_STATE_REVOKED) + goto unlock; + /* get the rb we want to redirect to */ rb = ring_buffer_get(output_event); if (!rb) @@ -13148,6 +13334,11 @@ SYSCALL_DEFINE5(perf_event_open, if (event_fd < 0) return event_fd; + /* + * Event creation should be under SRCU, see perf_pmu_unregister(). + */ + guard(srcu)(&pmus_srcu); + CLASS(fd, group)(group_fd); // group_fd == -1 => empty if (group_fd != -1) { if (!is_perf_file(group)) { @@ -13155,6 +13346,10 @@ SYSCALL_DEFINE5(perf_event_open, goto err_fd; } group_leader = fd_file(group)->private_data; + if (group_leader->state <= PERF_EVENT_STATE_REVOKED) { + err = -ENODEV; + goto err_fd; + } if (flags & PERF_FLAG_FD_OUTPUT) output_event = group_leader; if (flags & PERF_FLAG_FD_NO_GROUP) @@ -13451,7 +13646,7 @@ err_cred: if (task) up_read(&task->signal->exec_update_lock); err_alloc: - free_event(event); + put_event(event); err_task: if (task) put_task_struct(task); @@ -13488,6 +13683,11 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, if (attr->aux_output || attr->aux_action) return ERR_PTR(-EINVAL); + /* + * Event creation should be under SRCU, see perf_pmu_unregister(). + */ + guard(srcu)(&pmus_srcu); + event = perf_event_alloc(attr, cpu, task, NULL, NULL, overflow_handler, context, -1); if (IS_ERR(event)) { @@ -13559,7 +13759,7 @@ err_unlock: perf_unpin_context(ctx); put_ctx(ctx); err_alloc: - free_event(event); + put_event(event); err: return ERR_PTR(err); } @@ -13699,10 +13899,15 @@ static void sync_child_event(struct perf_event *child_event) } static void -perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx) +perf_event_exit_event(struct perf_event *event, + struct perf_event_context *ctx, bool revoke) { struct perf_event *parent_event = event->parent; - unsigned long detach_flags = 0; + unsigned long detach_flags = DETACH_EXIT; + bool is_child = !!parent_event; + + if (parent_event == EVENT_TOMBSTONE) + parent_event = NULL; if (parent_event) { /* @@ -13717,22 +13922,29 @@ perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx) * Do destroy all inherited groups, we don't care about those * and being thorough is better. */ - detach_flags = DETACH_GROUP | DETACH_CHILD; + detach_flags |= DETACH_GROUP | DETACH_CHILD; mutex_lock(&parent_event->child_mutex); } - perf_remove_from_context(event, detach_flags | DETACH_EXIT); + if (revoke) + detach_flags |= DETACH_GROUP | DETACH_REVOKE; + perf_remove_from_context(event, detach_flags); /* * Child events can be freed. */ - if (parent_event) { - mutex_unlock(&parent_event->child_mutex); - /* - * Kick perf_poll() for is_event_hup(); - */ - perf_event_wakeup(parent_event); - put_event(event); + if (is_child) { + if (parent_event) { + mutex_unlock(&parent_event->child_mutex); + /* + * Kick perf_poll() for is_event_hup(); + */ + perf_event_wakeup(parent_event); + /* + * pmu_detach_event() will have an extra refcount. 
+ */ + put_event(event); + } return; } @@ -13796,7 +14008,7 @@ static void perf_event_exit_task_context(struct task_struct *task, bool exit) perf_event_task(task, ctx, 0); list_for_each_entry_safe(child_event, next, &ctx->event_list, event_entry) - perf_event_exit_event(child_event, ctx); + perf_event_exit_event(child_event, ctx, false); mutex_unlock(&ctx->mutex); @@ -13949,6 +14161,14 @@ inherit_event(struct perf_event *parent_event, if (parent_event->parent) parent_event = parent_event->parent; + if (parent_event->state <= PERF_EVENT_STATE_REVOKED) + return NULL; + + /* + * Event creation should be under SRCU, see perf_pmu_unregister(). + */ + guard(srcu)(&pmus_srcu); + child_event = perf_event_alloc(&parent_event->attr, parent_event->cpu, child, @@ -13962,7 +14182,7 @@ inherit_event(struct perf_event *parent_event, pmu_ctx = find_get_pmu_context(child_event->pmu, child_ctx, child_event); if (IS_ERR(pmu_ctx)) { - free_event(child_event); + put_event(child_event); return ERR_CAST(pmu_ctx); } child_event->pmu_ctx = pmu_ctx; @@ -13977,7 +14197,7 @@ inherit_event(struct perf_event *parent_event, if (is_orphaned_event(parent_event) || !atomic_long_inc_not_zero(&parent_event->refcount)) { mutex_unlock(&parent_event->child_mutex); - free_event(child_event); + put_event(child_event); return NULL; } -- cgit v1.2.3 From 4dfe3232cc04325a09e96f6c7f9546ba6c0b132b Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 27 Mar 2025 12:52:13 -0700 Subject: perf/x86: Add dynamic constraint More and more features require a dynamic event constraint, e.g., branch counter logging, auto counter reload, Arch PEBS, etc. Add a generic flag, PMU_FL_DYN_CONSTRAINT, to indicate the case. It avoids keeping adding the individual flag in intel_cpuc_prepare(). Add a variable dyn_constraint in the struct hw_perf_event to track the dynamic constraint of the event. Apply it if it's updated. Apply the generic dynamic constraint for branch counter logging. Many features on and after V6 require dynamic constraint. So unconditionally set the flag for V6+. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Tested-by: Thomas Falcon Link: https://lkml.kernel.org/r/20250327195217.2683619-2-kan.liang@linux.intel.com --- arch/x86/events/core.c | 1 + arch/x86/events/intel/core.c | 21 +++++++++++++++------ arch/x86/events/intel/lbr.c | 2 +- arch/x86/events/perf_event.h | 1 + include/linux/perf_event.h | 1 + 5 files changed, 19 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 6866cc5acb0b..a0fe51e0c00f 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -674,6 +674,7 @@ static int __x86_pmu_event_init(struct perf_event *event) event->hw.idx = -1; event->hw.last_cpu = -1; event->hw.last_tag = ~0ULL; + event->hw.dyn_constraint = ~0ULL; /* mark unused */ event->hw.extra_reg.idx = EXTRA_REG_NONE; diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 09d2d66c9f21..972492832d7f 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3730,10 +3730,9 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx, if (cpuc->excl_cntrs) return intel_get_excl_constraints(cpuc, event, idx, c2); - /* Not all counters support the branch counter feature. 
*/ - if (branch_sample_counters(event)) { + if (event->hw.dyn_constraint != ~0ULL) { c2 = dyn_constraint(cpuc, c2, idx); - c2->idxmsk64 &= x86_pmu.lbr_counters; + c2->idxmsk64 &= event->hw.dyn_constraint; c2->weight = hweight64(c2->idxmsk64); } @@ -4135,15 +4134,19 @@ static int intel_pmu_hw_config(struct perf_event *event) leader = event->group_leader; if (branch_sample_call_stack(leader)) return -EINVAL; - if (branch_sample_counters(leader)) + if (branch_sample_counters(leader)) { num++; + leader->hw.dyn_constraint &= x86_pmu.lbr_counters; + } leader->hw.flags |= PERF_X86_EVENT_BRANCH_COUNTERS; for_each_sibling_event(sibling, leader) { if (branch_sample_call_stack(sibling)) return -EINVAL; - if (branch_sample_counters(sibling)) + if (branch_sample_counters(sibling)) { num++; + sibling->hw.dyn_constraint &= x86_pmu.lbr_counters; + } } if (num > fls(x86_pmu.lbr_counters)) @@ -4943,7 +4946,7 @@ int intel_cpuc_prepare(struct cpu_hw_events *cpuc, int cpu) goto err; } - if (x86_pmu.flags & (PMU_FL_EXCL_CNTRS | PMU_FL_TFA | PMU_FL_BR_CNTR)) { + if (x86_pmu.flags & (PMU_FL_EXCL_CNTRS | PMU_FL_TFA | PMU_FL_DYN_CONSTRAINT)) { size_t sz = X86_PMC_IDX_MAX * sizeof(struct event_constraint); cpuc->constraint_list = kzalloc_node(sz, GFP_KERNEL, cpu_to_node(cpu)); @@ -6664,6 +6667,12 @@ __init int intel_pmu_init(void) pr_cont(" AnyThread deprecated, "); } + /* + * Many features on and after V6 require dynamic constraint, + * e.g., Arch PEBS, ACR. + */ + if (version >= 6) + x86_pmu.flags |= PMU_FL_DYN_CONSTRAINT; /* * Install the hw-cache-events table: */ diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index f44c3d866f24..05acd6449ceb 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -1618,7 +1618,7 @@ void __init intel_pmu_arch_lbr_init(void) x86_pmu.lbr_nr = lbr_nr; if (!!x86_pmu.lbr_counters) - x86_pmu.flags |= PMU_FL_BR_CNTR; + x86_pmu.flags |= PMU_FL_BR_CNTR | PMU_FL_DYN_CONSTRAINT; if (x86_pmu.lbr_mispred) static_branch_enable(&x86_lbr_mispred); diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 2c0ce0e9545e..f5ba1658a889 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -1042,6 +1042,7 @@ do { \ #define PMU_FL_MEM_LOADS_AUX 0x100 /* Require an auxiliary event for the complete memory info */ #define PMU_FL_RETIRE_LATENCY 0x200 /* Support Retire Latency in PEBS */ #define PMU_FL_BR_CNTR 0x400 /* Support branch counter logging */ +#define PMU_FL_DYN_CONSTRAINT 0x800 /* Needs dynamic constraint */ #define EVENT_VAR(_id) event_attr_##_id #define EVENT_PTR(_id) &event_attr_##_id.attr.attr diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 7f49a58b271d..54dad174ed7a 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -158,6 +158,7 @@ struct hw_perf_event { struct { /* hardware */ u64 config; u64 last_tag; + u64 dyn_constraint; unsigned long config_base; unsigned long event_base; int event_base_rdpmc; -- cgit v1.2.3 From c9449c8506a5df5052ef4d17867699517b10b55a Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 27 Mar 2025 12:52:15 -0700 Subject: perf: Extend the bit width of the arch-specific flag The auto counter reload feature requires an event flag to indicate an auto counter reload group, which can only be scheduled on specific counters that enumerated in CPUID. However, the hw_perf_event.flags has run out on X86. Two solutions were considered to address the issue. - Currently, 20 bits are reserved for the architecture-specific flags. 
Only the bit 31 is used for the generic flag. There is still plenty of space left. Reserve 8 more bits for the arch-specific flags. - Add a new X86 specific hw_perf_event.flags1 to support more flags. The former is implemented. Enough room is still left in the global generic flag. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Tested-by: Thomas Falcon Link: https://lkml.kernel.org/r/20250327195217.2683619-4-kan.liang@linux.intel.com --- arch/x86/events/perf_event_flags.h | 41 +++++++++++++++++++------------------- include/linux/perf_event.h | 2 +- 2 files changed, 22 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/events/perf_event_flags.h b/arch/x86/events/perf_event_flags.h index 1d9e385649b5..70078334e4a3 100644 --- a/arch/x86/events/perf_event_flags.h +++ b/arch/x86/events/perf_event_flags.h @@ -2,23 +2,24 @@ /* * struct hw_perf_event.flags flags */ -PERF_ARCH(PEBS_LDLAT, 0x00001) /* ld+ldlat data address sampling */ -PERF_ARCH(PEBS_ST, 0x00002) /* st data address sampling */ -PERF_ARCH(PEBS_ST_HSW, 0x00004) /* haswell style datala, store */ -PERF_ARCH(PEBS_LD_HSW, 0x00008) /* haswell style datala, load */ -PERF_ARCH(PEBS_NA_HSW, 0x00010) /* haswell style datala, unknown */ -PERF_ARCH(EXCL, 0x00020) /* HT exclusivity on counter */ -PERF_ARCH(DYNAMIC, 0x00040) /* dynamic alloc'd constraint */ -PERF_ARCH(PEBS_CNTR, 0x00080) /* PEBS counters snapshot */ -PERF_ARCH(EXCL_ACCT, 0x00100) /* accounted EXCL event */ -PERF_ARCH(AUTO_RELOAD, 0x00200) /* use PEBS auto-reload */ -PERF_ARCH(LARGE_PEBS, 0x00400) /* use large PEBS */ -PERF_ARCH(PEBS_VIA_PT, 0x00800) /* use PT buffer for PEBS */ -PERF_ARCH(PAIR, 0x01000) /* Large Increment per Cycle */ -PERF_ARCH(LBR_SELECT, 0x02000) /* Save/Restore MSR_LBR_SELECT */ -PERF_ARCH(TOPDOWN, 0x04000) /* Count Topdown slots/metrics events */ -PERF_ARCH(PEBS_STLAT, 0x08000) /* st+stlat data address sampling */ -PERF_ARCH(AMD_BRS, 0x10000) /* AMD Branch Sampling */ -PERF_ARCH(PEBS_LAT_HYBRID, 0x20000) /* ld and st lat for hybrid */ -PERF_ARCH(NEEDS_BRANCH_STACK, 0x40000) /* require branch stack setup */ -PERF_ARCH(BRANCH_COUNTERS, 0x80000) /* logs the counters in the extra space of each branch */ +PERF_ARCH(PEBS_LDLAT, 0x0000001) /* ld+ldlat data address sampling */ +PERF_ARCH(PEBS_ST, 0x0000002) /* st data address sampling */ +PERF_ARCH(PEBS_ST_HSW, 0x0000004) /* haswell style datala, store */ +PERF_ARCH(PEBS_LD_HSW, 0x0000008) /* haswell style datala, load */ +PERF_ARCH(PEBS_NA_HSW, 0x0000010) /* haswell style datala, unknown */ +PERF_ARCH(EXCL, 0x0000020) /* HT exclusivity on counter */ +PERF_ARCH(DYNAMIC, 0x0000040) /* dynamic alloc'd constraint */ +PERF_ARCH(PEBS_CNTR, 0x0000080) /* PEBS counters snapshot */ +PERF_ARCH(EXCL_ACCT, 0x0000100) /* accounted EXCL event */ +PERF_ARCH(AUTO_RELOAD, 0x0000200) /* use PEBS auto-reload */ +PERF_ARCH(LARGE_PEBS, 0x0000400) /* use large PEBS */ +PERF_ARCH(PEBS_VIA_PT, 0x0000800) /* use PT buffer for PEBS */ +PERF_ARCH(PAIR, 0x0001000) /* Large Increment per Cycle */ +PERF_ARCH(LBR_SELECT, 0x0002000) /* Save/Restore MSR_LBR_SELECT */ +PERF_ARCH(TOPDOWN, 0x0004000) /* Count Topdown slots/metrics events */ +PERF_ARCH(PEBS_STLAT, 0x0008000) /* st+stlat data address sampling */ +PERF_ARCH(AMD_BRS, 0x0010000) /* AMD Branch Sampling */ +PERF_ARCH(PEBS_LAT_HYBRID, 0x0020000) /* ld and st lat for hybrid */ +PERF_ARCH(NEEDS_BRANCH_STACK, 0x0040000) /* require branch stack setup */ +PERF_ARCH(BRANCH_COUNTERS, 0x0080000) /* logs the counters in the extra space of each branch */ 
+PERF_ARCH(ACR, 0x0100000) /* Auto counter reload */ diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 54dad174ed7a..5c547329cf02 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -144,7 +144,7 @@ struct hw_perf_event_extra { * PERF_EVENT_FLAG_ARCH bits are reserved for architecture-specific * usage. */ -#define PERF_EVENT_FLAG_ARCH 0x000fffff +#define PERF_EVENT_FLAG_ARCH 0x0fffffff #define PERF_EVENT_FLAG_USER_READ_CNT 0x80000000 static_assert((PERF_EVENT_FLAG_USER_READ_CNT & PERF_EVENT_FLAG_ARCH) == 0); -- cgit v1.2.3 From ec980e4facef8110f6fce27e5b6344660117f01f Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 27 Mar 2025 12:52:17 -0700 Subject: perf/x86/intel: Support auto counter reload MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The relative rates among two or more events are useful for performance analysis, e.g., a high branch miss rate may indicate a performance issue. Usually, the samples with a relative rate that exceeds some threshold are more useful. However, the traditional sampling takes samples of events separately. To get the relative rates among two or more events, a high sample rate is required, which can bring high overhead. Many samples taken in the non-hotspot area are also dropped (useless) in the post-process. The auto counter reload (ACR) feature takes samples when the relative rate of two or more events exceeds some threshold, which provides the fine-grained information at a low cost. To support the feature, two sets of MSRs are introduced. For a given counter IA32_PMC_GPn_CTR/IA32_PMC_FXm_CTR, bit fields in the IA32_PMC_GPn_CFG_B/IA32_PMC_FXm_CFG_B MSR indicate which counter(s) can cause a reload of that counter. The reload value is stored in the IA32_PMC_GPn_CFG_C/IA32_PMC_FXm_CFG_C. The details can be found at Intel SDM (085), Volume 3, 21.9.11 Auto Counter Reload. In the hw_config(), an ACR event is specially configured, because the cause/reloadable counter mask has to be applied to the dyn_constraint. Besides the HW limit, e.g., not support perf metrics, PDist and etc, a SW limit is applied as well. ACR events in a group must be contiguous. It facilitates the later conversion from the event idx to the counter idx. Otherwise, the intel_pmu_acr_late_setup() has to traverse the whole event list again to find the "cause" event. Also, add a new flag PERF_X86_EVENT_ACR to indicate an ACR group, which is set to the group leader. The late setup() is also required for an ACR group. It's to convert the event idx to the counter idx, and saved it in hw.config1. The ACR configuration MSRs are only updated in the enable_event(). The disable_event() doesn't clear the ACR CFG register. Add acr_cfg_b/acr_cfg_c in the struct cpu_hw_events to cache the MSR values. It can avoid a MSR write if the value is not changed. Expose an acr_mask to the sysfs. The perf tool can utilize the new format to configure the relation of events in the group. The bit sequence of the acr_mask follows the events enabled order of the group. Example: Here is the snippet of the mispredict.c. Since the array has a random numbers, jumps are random and often mispredicted. The mispredicted rate depends on the compared value. For the Loop1, ~11% of all branches are mispredicted. For the Loop2, ~21% of all branches are mispredicted. main() { ... for (i = 0; i < N; i++) data[i] = rand() % 256; ... /* Loop 1 */ for (k = 0; k < 50; k++) for (i = 0; i < N; i++) if (data[i] >= 64) sum += data[i]; ... ... 
/* Loop 2 */ for (k = 0; k < 50; k++) for (i = 0; i < N; i++) if (data[i] >= 128) sum += data[i]; ... } Usually, a code with a high branch miss rate means a bad performance. To understand the branch miss rate of the codes, the traditional method usually samples both branches and branch-misses events. E.g., perf record -e "{cpu_atom/branch-misses/ppu, cpu_atom/branch-instructions/u}" -c 1000000 -- ./mispredict [ perf record: Woken up 4 times to write data ] [ perf record: Captured and wrote 0.925 MB perf.data (5106 samples) ] The 5106 samples are from both events and spread in both Loops. In the post-process stage, a user can know that the Loop 2 has a 21% branch miss rate. Then they can focus on the samples of branch-misses events for the Loop 2. With this patch, the user can generate the samples only when the branch miss rate > 20%. For example, perf record -e "{cpu_atom/branch-misses,period=200000,acr_mask=0x2/ppu, cpu_atom/branch-instructions,period=1000000,acr_mask=0x3/u}" -- ./mispredict (Two different periods are applied to branch-misses and branch-instructions. The ratio is set to 20%. If the branch-instructions is overflowed first, the branch-miss rate < 20%. No samples should be generated. All counters should be automatically reloaded. If the branch-misses is overflowed first, the branch-miss rate > 20%. A sample triggered by the branch-misses event should be generated. Just the counter of the branch-instructions should be automatically reloaded. The branch-misses event should only be automatically reloaded when the branch-instructions is overflowed. So the "cause" event is the branch-instructions event. The acr_mask is set to 0x2, since the event index in the group of branch-instructions is 1. The branch-instructions event is automatically reloaded no matter which events are overflowed. So the "cause" events are the branch-misses and the branch-instructions event. The acr_mask should be set to 0x3.) [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.098 MB perf.data (2498 samples) ] $perf report Percent │154: movl $0x0,-0x14(%rbp) │ ↓ jmp 1af │ for (i = j; i < N; i++) │15d: mov -0x10(%rbp),%eax │ mov %eax,-0x18(%rbp) │ ↓ jmp 1a2 │ if (data[i] >= 128) │165: mov -0x18(%rbp),%eax │ cltq │ lea 0x0(,%rax,4),%rdx │ mov -0x8(%rbp),%rax │ add %rdx,%rax │ mov (%rax),%eax │ ┌──cmp $0x7f,%eax 100.00 0.00 │ ├──jle 19e │ │sum += data[i]; The 2498 samples are all from the branch-misses events for the Loop 2. The number of samples and overhead is significantly reduced without losing any information. 
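For reference, the same ACR group can also be set up directly against the raw perf_event_open() syscall. The sketch below mirrors the perf-record invocation above (periods 200000/1000000, acr_mask 0x2/0x3); it is only an illustration: the helper names are made up, error handling is minimal, and the acr_mask value is simply carried in attr.config2 per the new "config2:0-63" format, with bit N referring to the N-th event of the group in enable order.

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <unistd.h>

static int open_event(struct perf_event_attr *attr, int group_fd)
{
	/* pid == 0, cpu == -1: profile the calling thread on any CPU. */
	return syscall(SYS_perf_event_open, attr, 0, -1, group_fd, 0);
}

static int open_acr_group(void)
{
	struct perf_event_attr miss = {0}, br = {0};
	int leader, sibling;

	miss.size          = sizeof(miss);
	miss.type          = PERF_TYPE_HARDWARE;
	miss.config        = PERF_COUNT_HW_BRANCH_MISSES;
	miss.sample_period = 200000;
	miss.config2       = 0x2;	/* reload when event 1 (branch-instructions) overflows */
	miss.disabled      = 1;		/* enable the whole group later via ioctl() */

	br.size            = sizeof(br);
	br.type            = PERF_TYPE_HARDWARE;
	br.config          = PERF_COUNT_HW_BRANCH_INSTRUCTIONS;
	br.sample_period   = 1000000;
	br.config2         = 0x3;	/* reload when either event overflows */

	leader = open_event(&miss, -1);
	if (leader < 0)
		return -1;
	sibling = open_event(&br, leader);
	return sibling < 0 ? -1 : leader;
}

The perf tool's acr_mask= term resolves to exactly these config2 bits through the sysfs format attribute added below.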
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Tested-by: Thomas Falcon Link: https://lkml.kernel.org/r/20250327195217.2683619-6-kan.liang@linux.intel.com --- arch/x86/events/core.c | 2 +- arch/x86/events/intel/core.c | 226 ++++++++++++++++++++++++++++++++++++++- arch/x86/events/perf_event.h | 10 ++ arch/x86/include/asm/msr-index.h | 4 + include/linux/perf_event.h | 1 + 5 files changed, 240 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index a0fe51e0c00f..f53ae1fd986f 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -755,7 +755,7 @@ void x86_pmu_enable_all(int added) } } -static inline int is_x86_event(struct perf_event *event) +int is_x86_event(struct perf_event *event) { int i; diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 876678aa3895..3152a018c502 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2603,7 +2603,8 @@ static void intel_pmu_del_event(struct perf_event *event) intel_pmu_lbr_del(event); if (event->attr.precise_ip) intel_pmu_pebs_del(event); - if (is_pebs_counter_event_group(event)) + if (is_pebs_counter_event_group(event) || + is_acr_event_group(event)) this_cpu_ptr(&cpu_hw_events)->n_late_setup--; } @@ -2882,6 +2883,52 @@ static void intel_pmu_enable_fixed(struct perf_event *event) cpuc->fixed_ctrl_val |= bits; } +static void intel_pmu_config_acr(int idx, u64 mask, u32 reload) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + int msr_b, msr_c; + + if (!mask && !cpuc->acr_cfg_b[idx]) + return; + + if (idx < INTEL_PMC_IDX_FIXED) { + msr_b = MSR_IA32_PMC_V6_GP0_CFG_B; + msr_c = MSR_IA32_PMC_V6_GP0_CFG_C; + } else { + msr_b = MSR_IA32_PMC_V6_FX0_CFG_B; + msr_c = MSR_IA32_PMC_V6_FX0_CFG_C; + idx -= INTEL_PMC_IDX_FIXED; + } + + if (cpuc->acr_cfg_b[idx] != mask) { + wrmsrl(msr_b + x86_pmu.addr_offset(idx, false), mask); + cpuc->acr_cfg_b[idx] = mask; + } + /* Only need to update the reload value when there is a valid config value. */ + if (mask && cpuc->acr_cfg_c[idx] != reload) { + wrmsrl(msr_c + x86_pmu.addr_offset(idx, false), reload); + cpuc->acr_cfg_c[idx] = reload; + } +} + +static void intel_pmu_enable_acr(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (!is_acr_event_group(event) || !event->attr.config2) { + /* + * The disable doesn't clear the ACR CFG register. + * Check and clear the ACR CFG register. + */ + intel_pmu_config_acr(hwc->idx, 0, 0); + return; + } + + intel_pmu_config_acr(hwc->idx, hwc->config1, -hwc->sample_period); +} + +DEFINE_STATIC_CALL_NULL(intel_pmu_enable_acr_event, intel_pmu_enable_acr); + static void intel_pmu_enable_event(struct perf_event *event) { u64 enable_mask = ARCH_PERFMON_EVENTSEL_ENABLE; @@ -2896,9 +2943,12 @@ static void intel_pmu_enable_event(struct perf_event *event) if (branch_sample_counters(event)) enable_mask |= ARCH_PERFMON_EVENTSEL_BR_CNTR; intel_set_masks(event, idx); + static_call_cond(intel_pmu_enable_acr_event)(event); __x86_pmu_enable_event(hwc, enable_mask); break; case INTEL_PMC_IDX_FIXED ... INTEL_PMC_IDX_FIXED_BTS - 1: + static_call_cond(intel_pmu_enable_acr_event)(event); + fallthrough; case INTEL_PMC_IDX_METRIC_BASE ... 
INTEL_PMC_IDX_METRIC_END: intel_pmu_enable_fixed(event); break; @@ -2916,6 +2966,31 @@ static void intel_pmu_enable_event(struct perf_event *event) } } +static void intel_pmu_acr_late_setup(struct cpu_hw_events *cpuc) +{ + struct perf_event *event, *leader; + int i, j, idx; + + for (i = 0; i < cpuc->n_events; i++) { + leader = cpuc->event_list[i]; + if (!is_acr_event_group(leader)) + continue; + + /* The ACR events must be contiguous. */ + for (j = i; j < cpuc->n_events; j++) { + event = cpuc->event_list[j]; + if (event->group_leader != leader->group_leader) + break; + for_each_set_bit(idx, (unsigned long *)&event->attr.config2, X86_PMC_IDX_MAX) { + if (WARN_ON_ONCE(i + idx > cpuc->n_events)) + return; + __set_bit(cpuc->assign[i + idx], (unsigned long *)&event->hw.config1); + } + } + i = j - 1; + } +} + void intel_pmu_late_setup(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -2924,6 +2999,7 @@ void intel_pmu_late_setup(void) return; intel_pmu_pebs_late_setup(cpuc); + intel_pmu_acr_late_setup(cpuc); } static void intel_pmu_add_event(struct perf_event *event) @@ -2932,7 +3008,8 @@ static void intel_pmu_add_event(struct perf_event *event) intel_pmu_pebs_add(event); if (intel_pmu_needs_branch_stack(event)) intel_pmu_lbr_add(event); - if (is_pebs_counter_event_group(event)) + if (is_pebs_counter_event_group(event) || + is_acr_event_group(event)) this_cpu_ptr(&cpu_hw_events)->n_late_setup++; } @@ -4087,6 +4164,39 @@ end: return start; } +static inline bool intel_pmu_has_acr(struct pmu *pmu) +{ + return !!hybrid(pmu, acr_cause_mask64); +} + +static bool intel_pmu_is_acr_group(struct perf_event *event) +{ + /* The group leader has the ACR flag set */ + if (is_acr_event_group(event)) + return true; + + /* The acr_mask is set */ + if (event->attr.config2) + return true; + + return false; +} + +static inline void intel_pmu_set_acr_cntr_constr(struct perf_event *event, + u64 *cause_mask, int *num) +{ + event->hw.dyn_constraint &= hybrid(event->pmu, acr_cntr_mask64); + *cause_mask |= event->attr.config2; + *num += 1; +} + +static inline void intel_pmu_set_acr_caused_constr(struct perf_event *event, + int idx, u64 cause_mask) +{ + if (test_bit(idx, (unsigned long *)&cause_mask)) + event->hw.dyn_constraint &= hybrid(event->pmu, acr_cause_mask64); +} + static int intel_pmu_hw_config(struct perf_event *event) { int ret = x86_pmu_hw_config(event); @@ -4215,6 +4325,94 @@ static int intel_pmu_hw_config(struct perf_event *event) event->attr.precise_ip) event->group_leader->hw.flags |= PERF_X86_EVENT_PEBS_CNTR; + if (intel_pmu_has_acr(event->pmu) && intel_pmu_is_acr_group(event)) { + struct perf_event *sibling, *leader = event->group_leader; + struct pmu *pmu = event->pmu; + bool has_sw_event = false; + int num = 0, idx = 0; + u64 cause_mask = 0; + + /* Not support perf metrics */ + if (is_metric_event(event)) + return -EINVAL; + + /* Not support freq mode */ + if (event->attr.freq) + return -EINVAL; + + /* PDist is not supported */ + if (event->attr.config2 && event->attr.precise_ip > 2) + return -EINVAL; + + /* The reload value cannot exceeds the max period */ + if (event->attr.sample_period > x86_pmu.max_period) + return -EINVAL; + /* + * The counter-constraints of each event cannot be finalized + * unless the whole group is scanned. However, it's hard + * to know whether the event is the last one of the group. + * Recalculate the counter-constraints for each event when + * adding a new event. + * + * The group is traversed twice, which may be optimized later. 
+ * In the first round, + * - Find all events which do reload when other events + * overflow and set the corresponding counter-constraints + * - Add all events, which can cause other events reload, + * in the cause_mask + * - Error out if the number of events exceeds the HW limit + * - The ACR events must be contiguous. + * Error out if there are non-X86 events between ACR events. + * This is not a HW limit, but a SW limit. + * With the assumption, the intel_pmu_acr_late_setup() can + * easily convert the event idx to counter idx without + * traversing the whole event list. + */ + if (!is_x86_event(leader)) + return -EINVAL; + + if (leader->attr.config2) + intel_pmu_set_acr_cntr_constr(leader, &cause_mask, &num); + + if (leader->nr_siblings) { + for_each_sibling_event(sibling, leader) { + if (!is_x86_event(sibling)) { + has_sw_event = true; + continue; + } + if (!sibling->attr.config2) + continue; + if (has_sw_event) + return -EINVAL; + intel_pmu_set_acr_cntr_constr(sibling, &cause_mask, &num); + } + } + if (leader != event && event->attr.config2) { + if (has_sw_event) + return -EINVAL; + intel_pmu_set_acr_cntr_constr(event, &cause_mask, &num); + } + + if (hweight64(cause_mask) > hweight64(hybrid(pmu, acr_cause_mask64)) || + num > hweight64(hybrid(event->pmu, acr_cntr_mask64))) + return -EINVAL; + /* + * In the second round, apply the counter-constraints for + * the events which can cause other events reload. + */ + intel_pmu_set_acr_caused_constr(leader, idx++, cause_mask); + + if (leader->nr_siblings) { + for_each_sibling_event(sibling, leader) + intel_pmu_set_acr_caused_constr(sibling, idx++, cause_mask); + } + + if (leader != event) + intel_pmu_set_acr_caused_constr(event, idx, cause_mask); + + leader->hw.flags |= PERF_X86_EVENT_ACR; + } + if ((event->attr.type == PERF_TYPE_HARDWARE) || (event->attr.type == PERF_TYPE_HW_CACHE)) return 0; @@ -6061,6 +6259,21 @@ td_is_visible(struct kobject *kobj, struct attribute *attr, int i) return attr->mode; } +PMU_FORMAT_ATTR(acr_mask, "config2:0-63"); + +static struct attribute *format_acr_attrs[] = { + &format_attr_acr_mask.attr, + NULL +}; + +static umode_t +acr_is_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + struct device *dev = kobj_to_dev(kobj); + + return intel_pmu_has_acr(dev_get_drvdata(dev)) ? 
attr->mode : 0; +} + static struct attribute_group group_events_td = { .name = "events", .is_visible = td_is_visible, @@ -6103,6 +6316,12 @@ static struct attribute_group group_format_evtsel_ext = { .is_visible = evtsel_ext_is_visible, }; +static struct attribute_group group_format_acr = { + .name = "format", + .attrs = format_acr_attrs, + .is_visible = acr_is_visible, +}; + static struct attribute_group group_default = { .attrs = intel_pmu_attrs, .is_visible = default_is_visible, @@ -6117,6 +6336,7 @@ static const struct attribute_group *attr_update[] = { &group_format_extra, &group_format_extra_skl, &group_format_evtsel_ext, + &group_format_acr, &group_default, NULL, }; @@ -6401,6 +6621,7 @@ static const struct attribute_group *hybrid_attr_update[] = { &group_caps_lbr, &hybrid_group_format_extra, &group_format_evtsel_ext, + &group_format_acr, &group_default, &hybrid_group_cpus, NULL, @@ -6593,6 +6814,7 @@ static __always_inline void intel_pmu_init_skt(struct pmu *pmu) intel_pmu_init_grt(pmu); hybrid(pmu, event_constraints) = intel_skt_event_constraints; hybrid(pmu, extra_regs) = intel_cmt_extra_regs; + static_call_update(intel_pmu_enable_acr_event, intel_pmu_enable_acr); } __init int intel_pmu_init(void) diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index ab9af2e4da1b..46bbb503aca1 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -120,6 +120,11 @@ static inline bool is_pebs_counter_event_group(struct perf_event *event) return event->group_leader->hw.flags & PERF_X86_EVENT_PEBS_CNTR; } +static inline bool is_acr_event_group(struct perf_event *event) +{ + return event->group_leader->hw.flags & PERF_X86_EVENT_ACR; +} + struct amd_nb { int nb_id; /* NorthBridge id */ int refcnt; /* reference count */ @@ -287,6 +292,10 @@ struct cpu_hw_events { u64 fixed_ctrl_val; u64 active_fixed_ctrl_val; + /* Intel ACR configuration */ + u64 acr_cfg_b[X86_PMC_IDX_MAX]; + u64 acr_cfg_c[X86_PMC_IDX_MAX]; + /* * Intel LBR bits */ @@ -1103,6 +1112,7 @@ static struct perf_pmu_format_hybrid_attr format_attr_hybrid_##_name = {\ .pmu_type = _pmu, \ } +int is_x86_event(struct perf_event *event); struct pmu *x86_get_pmu(unsigned int cpu); extern struct x86_pmu x86_pmu __read_mostly; diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index e6134ef2263d..53da787b9326 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -594,7 +594,11 @@ /* V6 PMON MSR range */ #define MSR_IA32_PMC_V6_GP0_CTR 0x1900 #define MSR_IA32_PMC_V6_GP0_CFG_A 0x1901 +#define MSR_IA32_PMC_V6_GP0_CFG_B 0x1902 +#define MSR_IA32_PMC_V6_GP0_CFG_C 0x1903 #define MSR_IA32_PMC_V6_FX0_CTR 0x1980 +#define MSR_IA32_PMC_V6_FX0_CFG_B 0x1982 +#define MSR_IA32_PMC_V6_FX0_CFG_C 0x1983 #define MSR_IA32_PMC_V6_STEP 4 /* KeyID partitioning between MKTME and TDX */ diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 5c547329cf02..947ad12dfdbe 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -157,6 +157,7 @@ struct hw_perf_event { union { struct { /* hardware */ u64 config; + u64 config1; u64 last_tag; u64 dyn_constraint; unsigned long config_base; -- cgit v1.2.3 From a36283e2b683f172aa1760c77325e50b16c0f792 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 7 Apr 2025 17:45:41 +0200 Subject: udp_tunnel: create a fastpath GRO lookup. Most UDP tunnels bind a socket to a local port, with ANY address, no peer and no interface index specified. 
Additionally it's quite common to have a single tunnel device per namespace. Track in each namespace the UDP tunnel socket respecting the above. When only a single one is present, store a reference in the netns. When such reference is not NULL, UDP tunnel GRO lookup just need to match the incoming packet destination port vs the socket local port. The tunnel socket never sets the reuse[port] flag[s]. When bound to no address and interface, no other socket can exist in the same netns matching the specified local port. Matching packets with non-local destination addresses will be aggregated, and eventually segmented as needed - no behavior changes intended. Restrict the optimization to kernel sockets only: it covers all the relevant use-cases, and user-space owned sockets could be disconnected and rebound after setup_udp_tunnel_sock(), breaking the uniqueness assumption Note that the UDP tunnel socket reference is stored into struct netns_ipv4 for both IPv4 and IPv6 tunnels. That is intentional to keep all the fastpath-related netns fields in the same struct and allow cacheline-based optimization. Currently both the IPv4 and IPv6 socket pointer share the same cacheline as the `udp_table` field. Signed-off-by: Paolo Abeni Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/41d16bc8d1257d567f9344c445b4ae0b4a91ede4.1744040675.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- include/linux/udp.h | 16 ++++++++++++++++ include/net/netns/ipv4.h | 11 +++++++++++ include/net/udp.h | 1 + include/net/udp_tunnel.h | 12 ++++++++++++ net/ipv4/udp.c | 13 ++++++++++++- net/ipv4/udp_offload.c | 37 +++++++++++++++++++++++++++++++++++++ net/ipv4/udp_tunnel_core.c | 13 +++++++++++++ net/ipv6/udp.c | 2 ++ net/ipv6/udp_offload.c | 5 +++++ 9 files changed, 109 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/udp.h b/include/linux/udp.h index 0807e21cfec9..895240177f4f 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -101,6 +101,13 @@ struct udp_sock { /* Cache friendly copy of sk->sk_peek_off >= 0 */ bool peeking_with_offset; + + /* + * Accounting for the tunnel GRO fastpath. + * Unprotected by compilers guard, as it uses space available in + * the last UDP socket cacheline. + */ + struct hlist_node tunnel_list; }; #define udp_test_bit(nr, sk) \ @@ -219,4 +226,13 @@ static inline void udp_allow_gso(struct sock *sk) #define IS_UDPLITE(__sk) (__sk->sk_protocol == IPPROTO_UDPLITE) +static inline struct sock *udp_tunnel_sk(const struct net *net, bool is_ipv6) +{ +#if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) + return rcu_dereference(net->ipv4.udp_tunnel_gro[is_ipv6].sk); +#else + return NULL; +#endif +} + #endif /* _LINUX_UDP_H */ diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 650b2dc9199f..6373e3f17da8 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -47,6 +47,11 @@ struct sysctl_fib_multipath_hash_seed { }; #endif +struct udp_tunnel_gro { + struct sock __rcu *sk; + struct hlist_head list; +}; + struct netns_ipv4 { /* Cacheline organization can be found documented in * Documentation/networking/net_cachelines/netns_ipv4_sysctl.rst. 
@@ -85,6 +90,11 @@ struct netns_ipv4 { struct inet_timewait_death_row tcp_death_row; struct udp_table *udp_table; +#if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) + /* Not in a pernet subsys because need to be available at GRO stage */ + struct udp_tunnel_gro udp_tunnel_gro[2]; +#endif + #ifdef CONFIG_SYSCTL struct ctl_table_header *forw_hdr; struct ctl_table_header *frags_hdr; @@ -277,4 +287,5 @@ struct netns_ipv4 { struct hlist_head *inet_addr_lst; struct delayed_work addr_chk_work; }; + #endif diff --git a/include/net/udp.h b/include/net/udp.h index 6e89520e100d..a772510b2aa5 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -290,6 +290,7 @@ static inline void udp_lib_init_sock(struct sock *sk) struct udp_sock *up = udp_sk(sk); skb_queue_head_init(&up->reader_queue); + INIT_HLIST_NODE(&up->tunnel_list); up->forward_threshold = sk->sk_rcvbuf >> 2; set_bit(SOCK_CUSTOM_SOCKOPT, &sk->sk_socket->flags); } diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index a93dc51f6323..1bb2b852e90e 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -191,6 +191,18 @@ static inline int udp_tunnel_handle_offloads(struct sk_buff *skb, bool udp_csum) } #endif +#if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) +void udp_tunnel_update_gro_lookup(struct net *net, struct sock *sk, bool add); +#else +static inline void udp_tunnel_update_gro_lookup(struct net *net, + struct sock *sk, bool add) {} +#endif + +static inline void udp_tunnel_cleanup_gro(struct sock *sk) +{ + udp_tunnel_update_gro_lookup(sock_net(sk), sk, false); +} + static inline void udp_tunnel_encap_enable(struct sock *sk) { if (udp_test_and_set_bit(ENCAP_ENABLED, sk)) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 2742cc7602bb..04cd3353f1d4 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2897,8 +2897,10 @@ void udp_destroy_sock(struct sock *sk) if (encap_destroy) encap_destroy(sk); } - if (udp_test_bit(ENCAP_ENABLED, sk)) + if (udp_test_bit(ENCAP_ENABLED, sk)) { static_branch_dec(&udp_encap_needed_key); + udp_tunnel_cleanup_gro(sk); + } } } @@ -3810,6 +3812,15 @@ fallback: static int __net_init udp_pernet_init(struct net *net) { +#if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) + int i; + + /* No tunnel is configured */ + for (i = 0; i < ARRAY_SIZE(net->ipv4.udp_tunnel_gro); ++i) { + INIT_HLIST_HEAD(&net->ipv4.udp_tunnel_gro[i].list); + RCU_INIT_POINTER(net->ipv4.udp_tunnel_gro[i].sk, NULL); + } +#endif udp_sysctl_init(net); udp_set_table(net); diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 2c0725583be3..69fc4eae8250 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -12,6 +12,38 @@ #include #include #include +#include + +#if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) +static DEFINE_SPINLOCK(udp_tunnel_gro_lock); + +void udp_tunnel_update_gro_lookup(struct net *net, struct sock *sk, bool add) +{ + bool is_ipv6 = sk->sk_family == AF_INET6; + struct udp_sock *tup, *up = udp_sk(sk); + struct udp_tunnel_gro *udp_tunnel_gro; + + spin_lock(&udp_tunnel_gro_lock); + udp_tunnel_gro = &net->ipv4.udp_tunnel_gro[is_ipv6]; + if (add) + hlist_add_head(&up->tunnel_list, &udp_tunnel_gro->list); + else if (up->tunnel_list.pprev) + hlist_del_init(&up->tunnel_list); + + if (udp_tunnel_gro->list.first && + !udp_tunnel_gro->list.first->next) { + tup = hlist_entry(udp_tunnel_gro->list.first, struct udp_sock, + tunnel_list); + + rcu_assign_pointer(udp_tunnel_gro->sk, (struct sock *)tup); + } else { + RCU_INIT_POINTER(udp_tunnel_gro->sk, NULL); + } + + spin_unlock(&udp_tunnel_gro_lock); +} 
+EXPORT_SYMBOL_GPL(udp_tunnel_update_gro_lookup); +#endif static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, netdev_features_t features, @@ -635,8 +667,13 @@ static struct sock *udp4_gro_lookup_skb(struct sk_buff *skb, __be16 sport, { const struct iphdr *iph = skb_gro_network_header(skb); struct net *net = dev_net_rcu(skb->dev); + struct sock *sk; int iif, sdif; + sk = udp_tunnel_sk(net, false); + if (sk && dport == htons(sk->sk_num)) + return sk; + inet_get_iif_sdif(skb, &iif, &sdif); return __udp4_lib_lookup(net, iph->saddr, sport, diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c index 619a53eb672d..3d1214e6df0e 100644 --- a/net/ipv4/udp_tunnel_core.c +++ b/net/ipv4/udp_tunnel_core.c @@ -58,6 +58,15 @@ error: } EXPORT_SYMBOL(udp_sock_create4); +static bool sk_saddr_any(struct sock *sk) +{ +#if IS_ENABLED(CONFIG_IPV6) + return ipv6_addr_any(&sk->sk_v6_rcv_saddr); +#else + return !sk->sk_rcv_saddr; +#endif +} + void setup_udp_tunnel_sock(struct net *net, struct socket *sock, struct udp_tunnel_sock_cfg *cfg) { @@ -80,6 +89,10 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock, udp_sk(sk)->gro_complete = cfg->gro_complete; udp_tunnel_encap_enable(sk); + + if (!sk->sk_dport && !sk->sk_bound_dev_if && sk_saddr_any(sk) && + sk->sk_kern_sock) + udp_tunnel_update_gro_lookup(net, sk, true); } EXPORT_SYMBOL_GPL(setup_udp_tunnel_sock); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 024458ef163c..7317f8e053f1 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include #include @@ -1825,6 +1826,7 @@ void udpv6_destroy_sock(struct sock *sk) if (udp_test_bit(ENCAP_ENABLED, sk)) { static_branch_dec(&udpv6_encap_needed_key); udp_encap_disable(); + udp_tunnel_cleanup_gro(sk); } } } diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index 404212dfc99a..d8445ac1b2e4 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -118,8 +118,13 @@ static struct sock *udp6_gro_lookup_skb(struct sk_buff *skb, __be16 sport, { const struct ipv6hdr *iph = skb_gro_network_header(skb); struct net *net = dev_net_rcu(skb->dev); + struct sock *sk; int iif, sdif; + sk = udp_tunnel_sk(net, true); + if (sk && dport == htons(sk->sk_num)) + return sk; + inet6_get_iif_sdif(skb, &iif, &sdif); return __udp6_lib_lookup(net, &iph->saddr, sport, -- cgit v1.2.3 From 420aabef3ab5fa743afb4d3d391f03ef0e777ca8 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Mon, 7 Apr 2025 21:01:02 +0200 Subject: net: Drop unused @sk of __skb_try_recv_from_queue() __skb_try_recv_from_queue() deals with a queue, @sk is not used since commit e427cad6eee4 ("net: datagram: drop 'destructor' argument from several helpers"). Remove sk from function parameters, adapt callers. No functional change intended. 
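For illustration, a call site after the change looks roughly like the sketch below; it is not copied from an in-tree caller (the real call sites are updated in the diff that follows), only the new signature is taken from the patch:

  #include <linux/skbuff.h>
  #include <linux/spinlock.h>

  /* Sketch of a caller after the change: @sk is simply no longer passed. */
  static struct sk_buff *recv_one(struct sk_buff_head *queue, unsigned int flags)
  {
  	struct sk_buff *skb, *last;
  	int off = 0, err = 0;

  	spin_lock_bh(&queue->lock);
  	skb = __skb_try_recv_from_queue(queue, flags, &off, &err, &last);
  	spin_unlock_bh(&queue->lock);

  	return skb;
  }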
Signed-off-by: Michal Luczaj Reviewed-by: Joe Damato Link: https://patch.msgid.link/20250407-cleanup-drop-param-sk-v1-1-cd076979afac@rbox.co Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 3 +-- net/core/datagram.c | 5 ++--- net/ipv4/udp.c | 8 ++++---- 3 files changed, 7 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index b974a277975a..f1381aff0f89 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4105,8 +4105,7 @@ static inline void skb_frag_list_init(struct sk_buff *skb) int __skb_wait_for_more_packets(struct sock *sk, struct sk_buff_head *queue, int *err, long *timeo_p, const struct sk_buff *skb); -struct sk_buff *__skb_try_recv_from_queue(struct sock *sk, - struct sk_buff_head *queue, +struct sk_buff *__skb_try_recv_from_queue(struct sk_buff_head *queue, unsigned int flags, int *off, int *err, struct sk_buff **last); diff --git a/net/core/datagram.c b/net/core/datagram.c index f0693707aece..f0634f0cb834 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -163,8 +163,7 @@ done: return skb; } -struct sk_buff *__skb_try_recv_from_queue(struct sock *sk, - struct sk_buff_head *queue, +struct sk_buff *__skb_try_recv_from_queue(struct sk_buff_head *queue, unsigned int flags, int *off, int *err, struct sk_buff **last) @@ -261,7 +260,7 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, * However, this function was correct in any case. 8) */ spin_lock_irqsave(&queue->lock, cpu_flags); - skb = __skb_try_recv_from_queue(sk, queue, flags, off, &error, + skb = __skb_try_recv_from_queue(queue, flags, off, &error, last); spin_unlock_irqrestore(&queue->lock, cpu_flags); if (error) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 04cd3353f1d4..8867fe687888 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1942,8 +1942,8 @@ struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags, error = -EAGAIN; do { spin_lock_bh(&queue->lock); - skb = __skb_try_recv_from_queue(sk, queue, flags, off, - err, &last); + skb = __skb_try_recv_from_queue(queue, flags, off, err, + &last); if (skb) { if (!(flags & MSG_PEEK)) udp_skb_destructor(sk, skb); @@ -1964,8 +1964,8 @@ struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags, spin_lock(&sk_queue->lock); skb_queue_splice_tail_init(sk_queue, queue); - skb = __skb_try_recv_from_queue(sk, queue, flags, off, - err, &last); + skb = __skb_try_recv_from_queue(queue, flags, off, err, + &last); if (skb && !(flags & MSG_PEEK)) udp_skb_dtor_locked(sk, skb); spin_unlock(&sk_queue->lock); -- cgit v1.2.3 From 265daffe788aa1cc5925d0afcde4fe6e99c66638 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 7 Apr 2025 09:08:14 +0200 Subject: gpio: provide gpiod_is_equal() There are users in the kernel that directly compare raw GPIO descriptor pointers in order to determine whether they refer to the same physical GPIO pin. This accidentally works like this but is not guaranteed by any API contract. Let's provide a comparator function that hides the actual logic. 
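A consumer-side conversion would then look something like the following sketch (illustrative only, error handling omitted, and the consumer names are made up for this example):

  #include <linux/device.h>
  #include <linux/gpio/consumer.h>

  /* Illustrative only; "reset" and "other-reset" are hypothetical names. */
  static void check_shared_reset(struct device *dev)
  {
  	struct gpio_desc *reset = devm_gpiod_get(dev, "reset", GPIOD_OUT_LOW);
  	struct gpio_desc *other = devm_gpiod_get_optional(dev, "other-reset",
  							  GPIOD_OUT_LOW);

  	/* Previously open-coded as: if (reset == other) */
  	if (gpiod_is_equal(reset, other))
  		dev_info(dev, "both functions use the same physical line\n");
  }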
Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20250407-gpiod-is-equal-v1-1-7d85f568ae6e@linaro.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib.c | 14 ++++++++++++++ include/linux/gpio/consumer.h | 9 +++++++++ 2 files changed, 23 insertions(+) (limited to 'include/linux') diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index b8197502a5ac..2e5b6982e76d 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -265,6 +265,20 @@ struct gpio_device *gpiod_to_gpio_device(struct gpio_desc *desc) } EXPORT_SYMBOL_GPL(gpiod_to_gpio_device); +/** + * gpiod_is_equal() - Check if two GPIO descriptors refer to the same pin. + * @desc: Descriptor to compare. + * @other: The second descriptor to compare against. + * + * Returns: + * True if the descriptors refer to the same physical pin. False otherwise. + */ +bool gpiod_is_equal(struct gpio_desc *desc, struct gpio_desc *other) +{ + return desc == other; +} +EXPORT_SYMBOL_GPL(gpiod_is_equal); + /** * gpio_device_get_base() - Get the base GPIO number allocated by this device * @gdev: GPIO device diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h index 45b651c05b9c..7355abadaef4 100644 --- a/include/linux/gpio/consumer.h +++ b/include/linux/gpio/consumer.h @@ -180,6 +180,8 @@ struct gpio_desc *devm_fwnode_gpiod_get_index(struct device *dev, enum gpiod_flags flags, const char *label); +bool gpiod_is_equal(struct gpio_desc *desc, struct gpio_desc *other); + #else /* CONFIG_GPIOLIB */ #include @@ -547,6 +549,13 @@ struct gpio_desc *devm_fwnode_gpiod_get_index(struct device *dev, return ERR_PTR(-ENOSYS); } +static inline bool +gpiod_is_equal(struct gpio_desc *desc, struct gpio_desc *other) +{ + WARN_ON(desc || other); + return false; +} + #endif /* CONFIG_GPIOLIB */ #if IS_ENABLED(CONFIG_GPIOLIB) && IS_ENABLED(CONFIG_HTE) -- cgit v1.2.3 From dd293df6395a2c9e0fc4faa8defeceaa907e7717 Mon Sep 17 00:00:00 2001 From: Joel Granados Date: Sun, 29 Dec 2024 16:11:25 +0100 Subject: tracing: Move trace sysctls into trace.c Move trace ctl tables into their own const array in kernel/trace/trace.c. The sysctl table register is called with subsys_initcall placing if after its original place in proc_root_init. This is part of a greater effort to move ctl tables into their respective subsystems which will reduce the merge conflicts in kernel/sysctl.c. 
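The general shape of such a move is a const ctl_table array kept next to the variables it controls, registered from an initcall in the owning subsystem. Sketched below with a made-up knob (the real tables are in the diff that follows):

  #include <linux/init.h>
  #include <linux/sysctl.h>

  static int my_knob;	/* made-up knob, for illustration only */

  static const struct ctl_table my_sysctl_table[] = {
  	{
  		.procname	= "my_knob",
  		.data		= &my_knob,
  		.maxlen		= sizeof(my_knob),
  		.mode		= 0644,
  		.proc_handler	= proc_dointvec,
  	},
  };

  static int __init init_my_sysctls(void)
  {
  	register_sysctl_init("kernel", my_sysctl_table);
  	return 0;
  }
  subsys_initcall(init_my_sysctls);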
Signed-off-by: Joel Granados Acked-by: Steven Rostedt (Google) --- include/linux/ftrace.h | 7 ------- kernel/sysctl.c | 24 ------------------------ kernel/trace/trace.c | 36 +++++++++++++++++++++++++++++++++++- 3 files changed, 35 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index fbabc3d848b3..59774513ae45 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -1298,16 +1298,9 @@ static inline void unpause_graph_tracing(void) { } #ifdef CONFIG_TRACING enum ftrace_dump_mode; -#define MAX_TRACER_SIZE 100 -extern char ftrace_dump_on_oops[]; extern int ftrace_dump_on_oops_enabled(void); -extern int tracepoint_printk; extern void disable_trace_on_warning(void); -extern int __disable_trace_on_warning; - -int tracepoint_printk_sysctl(const struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); #else /* CONFIG_TRACING */ static inline void disable_trace_on_warning(void) { } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 811c50072e03..324ae43a3d50 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -44,7 +44,6 @@ #include #include #include -#include #include #include #include @@ -1662,29 +1661,6 @@ static const struct ctl_table kern_table[] = { .proc_handler = stack_trace_sysctl, }, #endif -#ifdef CONFIG_TRACING - { - .procname = "ftrace_dump_on_oops", - .data = &ftrace_dump_on_oops, - .maxlen = MAX_TRACER_SIZE, - .mode = 0644, - .proc_handler = proc_dostring, - }, - { - .procname = "traceoff_on_warning", - .data = &__disable_trace_on_warning, - .maxlen = sizeof(__disable_trace_on_warning), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "tracepoint_printk", - .data = &tracepoint_printk, - .maxlen = sizeof(tracepoint_printk), - .mode = 0644, - .proc_handler = tracepoint_printk_sysctl, - }, -#endif #ifdef CONFIG_MODULES { .procname = "modprobe", diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index b581e388a9d9..bd9c30dd1183 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -120,6 +120,7 @@ static int tracing_disabled = 1; cpumask_var_t __read_mostly tracing_buffer_mask; +#define MAX_TRACER_SIZE 100 /* * ftrace_dump_on_oops - variable to dump ftrace buffer on oops * @@ -142,7 +143,40 @@ cpumask_var_t __read_mostly tracing_buffer_mask; char ftrace_dump_on_oops[MAX_TRACER_SIZE] = "0"; /* When set, tracing will stop when a WARN*() is hit */ -int __disable_trace_on_warning; +static int __disable_trace_on_warning; + +int tracepoint_printk_sysctl(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); +static const struct ctl_table trace_sysctl_table[] = { + { + .procname = "ftrace_dump_on_oops", + .data = &ftrace_dump_on_oops, + .maxlen = MAX_TRACER_SIZE, + .mode = 0644, + .proc_handler = proc_dostring, + }, + { + .procname = "traceoff_on_warning", + .data = &__disable_trace_on_warning, + .maxlen = sizeof(__disable_trace_on_warning), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "tracepoint_printk", + .data = &tracepoint_printk, + .maxlen = sizeof(tracepoint_printk), + .mode = 0644, + .proc_handler = tracepoint_printk_sysctl, + }, +}; + +static int __init init_trace_sysctls(void) +{ + register_sysctl_init("kernel", trace_sysctl_table); + return 0; +} +subsys_initcall(init_trace_sysctls); #ifdef CONFIG_TRACE_EVAL_MAP_FILE /* Map of enums to their values, for "eval_map" file */ -- cgit v1.2.3 From 67049b53e06fa1758df1463789f286a7cba67c50 Mon Sep 17 00:00:00 2001 From: Joel Granados 
Date: Wed, 8 Jan 2025 12:55:58 +0100 Subject: stack_tracer: move sysctl registration to kernel/trace/trace_stack.c Move stack_tracer_enabled into trace_stack_sysctl_table. This is part of a greater effort to move ctl tables into their respective subsystems which will reduce the merge conflicts in kernel/sysctl.c. Acked-by: Steven Rostedt (Google) Signed-off-by: Joel Granados --- include/linux/ftrace.h | 2 -- kernel/sysctl.c | 10 ---------- kernel/trace/trace_stack.c | 22 +++++++++++++++++++++- 3 files changed, 21 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 59774513ae45..95851a6fb942 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -569,8 +569,6 @@ static inline void arch_ftrace_set_direct_caller(struct ftrace_regs *fregs, #ifdef CONFIG_STACK_TRACER -extern int stack_tracer_enabled; - int stack_trace_sysctl(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 324ae43a3d50..e5430594dfd6 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -58,7 +58,6 @@ #ifdef CONFIG_X86 #include -#include #include #endif #ifdef CONFIG_SPARC @@ -1652,15 +1651,6 @@ static const struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif -#ifdef CONFIG_STACK_TRACER - { - .procname = "stack_tracer_enabled", - .data = &stack_tracer_enabled, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = stack_trace_sysctl, - }, -#endif #ifdef CONFIG_MODULES { .procname = "modprobe", diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 14c6f272c4d8..e34223c8065d 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -32,7 +32,7 @@ static arch_spinlock_t stack_trace_max_lock = DEFINE_PER_CPU(int, disable_stack_tracer); static DEFINE_MUTEX(stack_sysctl_mutex); -int stack_tracer_enabled; +static int stack_tracer_enabled; static void print_max_stack(void) { @@ -578,3 +578,23 @@ static __init int stack_trace_init(void) } device_initcall(stack_trace_init); + + +static const struct ctl_table trace_stack_sysctl_table[] = { + { + .procname = "stack_tracer_enabled", + .data = &stack_tracer_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = stack_trace_sysctl, + }, +}; + +static int __init init_trace_stack_sysctls(void) +{ + register_sysctl_init("kernel", trace_stack_sysctl_table); + return 0; +} +subsys_initcall(init_trace_stack_sysctls); + + -- cgit v1.2.3 From e1bdbbc98279164d910d2de82a745f090a8b249f Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Mon, 7 Apr 2025 13:36:55 -0500 Subject: ACPI: Add missing prototype for non CONFIG_SUSPEND/CONFIG_X86 case acpi_register_lps0_dev() and acpi_unregister_lps0_dev() may be used in drivers that don't require CONFIG_SUSPEND or compile on !X86. Add prototypes for those cases. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202502191627.fRgoBwcZ-lkp@intel.com/ Signed-off-by: Mario Limonciello Link: https://patch.msgid.link/20250407183656.1503446-1-superm1@kernel.org Signed-off-by: Rafael J. 
Wysocki --- include/linux/acpi.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 3f2e93ed9730..fc372bbaa547 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -1125,13 +1125,13 @@ void acpi_os_set_prepare_extended_sleep(int (*func)(u8 sleep_state, acpi_status acpi_os_prepare_extended_sleep(u8 sleep_state, u32 val_a, u32 val_b); -#if defined(CONFIG_SUSPEND) && defined(CONFIG_X86) struct acpi_s2idle_dev_ops { struct list_head list_node; void (*prepare)(void); void (*check)(void); void (*restore)(void); }; +#if defined(CONFIG_SUSPEND) && defined(CONFIG_X86) int acpi_register_lps0_dev(struct acpi_s2idle_dev_ops *arg); void acpi_unregister_lps0_dev(struct acpi_s2idle_dev_ops *arg); int acpi_get_lps0_constraint(struct acpi_device *adev); @@ -1140,6 +1140,13 @@ static inline int acpi_get_lps0_constraint(struct device *dev) { return ACPI_STATE_UNKNOWN; } +static inline int acpi_register_lps0_dev(struct acpi_s2idle_dev_ops *arg) +{ + return -ENODEV; +} +static inline void acpi_unregister_lps0_dev(struct acpi_s2idle_dev_ops *arg) +{ +} #endif /* CONFIG_SUSPEND && CONFIG_X86 */ void arch_reserve_mem_area(acpi_physical_address addr, size_t size); #else -- cgit v1.2.3 From 092d00ead733563f6d278295e0b5c5f97558b726 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Mar 2025 11:56:38 +0100 Subject: cleanup: Provide retain_and_null_ptr() In cases where an allocation is consumed by another function, the allocation needs to be retained on success or freed on failure. The code pattern is usually: struct foo *f = kzalloc(sizeof(*f), GFP_KERNEL); struct bar *b; ,,, // Initialize f ... if (ret) goto free; ... bar = bar_create(f); if (!bar) { ret = -ENOMEM; goto free; } ... return 0; free: kfree(f); return ret; This prevents using __free(kfree) on @f because there is no canonical way to tell the cleanup code that the allocation should not be freed. Abusing no_free_ptr() by force ignoring the return value is not really a sensible option either. Provide an explicit macro retain_and_null_ptr(), which NULLs the cleanup pointer. That makes it easy to analyze and reason about. Signed-off-by: Thomas Gleixner Reviewed-by: Jonathan Cameron Reviewed-by: James Bottomley Link: https://lore.kernel.org/all/20250319105506.083538907@linutronix.de --- include/linux/cleanup.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h index 7e57047e1564..7093e1d08af0 100644 --- a/include/linux/cleanup.h +++ b/include/linux/cleanup.h @@ -216,6 +216,25 @@ const volatile void * __must_check_fn(const volatile void *val) #define return_ptr(p) return no_free_ptr(p) +/* + * Only for situations where an allocation is handed in to another function + * and consumed by that function on success. + * + * struct foo *f __free(kfree) = kzalloc(sizeof(*f), GFP_KERNEL); + * + * setup(f); + * if (some_condition) + * return -EINVAL; + * .... + * ret = bar(f); + * if (!ret) + * retain_and_null_ptr(f); + * return ret; + * + * After retain_and_null_ptr(f) the variable f is NULL and cannot be + * dereferenced anymore. 
+ */ +#define retain_and_null_ptr(p) ((void)__get_and_null(p, NULL)) /* * DEFINE_CLASS(name, type, exit, init, init_args...): -- cgit v1.2.3 From 0dac2b09303c89ffb21d25d40bf6aefd39f214b0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Mar 2025 11:56:39 +0100 Subject: genirq/msi: Use lock guards for MSI descriptor locking Provide a lock guard for MSI descriptor locking and update the core code accordingly. No functional change intended. Signed-off-by: Thomas Gleixner Reviewed-by: Jonathan Cameron Link: https://lore.kernel.org/all/20250319105506.144672678@linutronix.de --- include/linux/irqdomain.h | 2 + include/linux/msi.h | 3 ++ kernel/irq/msi.c | 109 +++++++++++++++++----------------------------- 3 files changed, 45 insertions(+), 69 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index bb7111105296..2f955716cf82 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -281,6 +281,8 @@ static inline struct fwnode_handle *irq_domain_alloc_fwnode(phys_addr_t *pa) void irq_domain_free_fwnode(struct fwnode_handle *fwnode); +DEFINE_FREE(irq_domain_free_fwnode, struct fwnode_handle *, if (_T) irq_domain_free_fwnode(_T)) + struct irq_domain_chip_generic_info; /** diff --git a/include/linux/msi.h b/include/linux/msi.h index 86e42742fd0f..4e439916dbcf 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -232,6 +232,9 @@ int msi_setup_device_data(struct device *dev); void msi_lock_descs(struct device *dev); void msi_unlock_descs(struct device *dev); +DEFINE_LOCK_GUARD_1(msi_descs_lock, struct device, msi_lock_descs(_T->lock), + msi_unlock_descs(_T->lock)); + struct msi_desc *msi_domain_first_desc(struct device *dev, unsigned int domid, enum msi_desc_filter filter); diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 5c8d43cdb0a3..5747d2078782 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -448,7 +448,6 @@ EXPORT_SYMBOL_GPL(msi_next_desc); unsigned int msi_domain_get_virq(struct device *dev, unsigned int domid, unsigned int index) { struct msi_desc *desc; - unsigned int ret = 0; bool pcimsi = false; struct xarray *xa; @@ -462,7 +461,7 @@ unsigned int msi_domain_get_virq(struct device *dev, unsigned int domid, unsigne if (dev_is_pci(dev) && domid == MSI_DEFAULT_DOMAIN) pcimsi = to_pci_dev(dev)->msi_enabled; - msi_lock_descs(dev); + guard(msi_descs_lock)(dev); xa = &dev->msi.data->__domains[domid].store; desc = xa_load(xa, pcimsi ? 0 : index); if (desc && desc->irq) { @@ -471,16 +470,12 @@ unsigned int msi_domain_get_virq(struct device *dev, unsigned int domid, unsigne * PCI-MSIX and platform MSI use a descriptor per * interrupt. 
*/ - if (pcimsi) { - if (index < desc->nvec_used) - ret = desc->irq + index; - } else { - ret = desc->irq; - } + if (!pcimsi) + return desc->irq; + if (index < desc->nvec_used) + return desc->irq + index; } - - msi_unlock_descs(dev); - return ret; + return 0; } EXPORT_SYMBOL_GPL(msi_domain_get_virq); @@ -998,9 +993,8 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid, void *chip_data) { struct irq_domain *domain, *parent = dev->msi.domain; - struct fwnode_handle *fwnode, *fwnalloced = NULL; - struct msi_domain_template *bundle; const struct msi_parent_ops *pops; + struct fwnode_handle *fwnode; if (!irq_domain_is_msi_parent(parent)) return false; @@ -1008,7 +1002,8 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid, if (domid >= MSI_MAX_DEVICE_IRQDOMAINS) return false; - bundle = kmemdup(template, sizeof(*bundle), GFP_KERNEL); + struct msi_domain_template *bundle __free(kfree) = + kmemdup(template, sizeof(*bundle), GFP_KERNEL); if (!bundle) return false; @@ -1031,41 +1026,36 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid, * node as they are not guaranteed to have a fwnode. They are never * looked up and always handled in the context of the device. */ - if (bundle->info.flags & MSI_FLAG_USE_DEV_FWNODE) - fwnode = dev->fwnode; + struct fwnode_handle *fwnode_alloced __free(irq_domain_free_fwnode) = NULL; + + if (!(bundle->info.flags & MSI_FLAG_USE_DEV_FWNODE)) + fwnode = fwnode_alloced = irq_domain_alloc_named_fwnode(bundle->name); else - fwnode = fwnalloced = irq_domain_alloc_named_fwnode(bundle->name); + fwnode = dev->fwnode; if (!fwnode) - goto free_bundle; + return false; if (msi_setup_device_data(dev)) - goto free_fwnode; - - msi_lock_descs(dev); + return false; + guard(msi_descs_lock)(dev); if (WARN_ON_ONCE(msi_get_device_domain(dev, domid))) - goto fail; + return false; if (!pops->init_dev_msi_info(dev, parent, parent, &bundle->info)) - goto fail; + return false; domain = __msi_create_irq_domain(fwnode, &bundle->info, IRQ_DOMAIN_FLAG_MSI_DEVICE, parent); if (!domain) - goto fail; + return false; + /* @bundle and @fwnode_alloced are now in use. 
Prevent cleanup */ + retain_and_null_ptr(bundle); + retain_and_null_ptr(fwnode_alloced); domain->dev = dev; dev->msi.data->__domains[domid].domain = domain; - msi_unlock_descs(dev); return true; - -fail: - msi_unlock_descs(dev); -free_fwnode: - irq_domain_free_fwnode(fwnalloced); -free_bundle: - kfree(bundle); - return false; } /** @@ -1079,12 +1069,10 @@ void msi_remove_device_irq_domain(struct device *dev, unsigned int domid) struct msi_domain_info *info; struct irq_domain *domain; - msi_lock_descs(dev); - + guard(msi_descs_lock)(dev); domain = msi_get_device_domain(dev, domid); - if (!domain || !irq_domain_is_msi_device(domain)) - goto unlock; + return; dev->msi.data->__domains[domid].domain = NULL; info = domain->host_data; @@ -1093,9 +1081,6 @@ void msi_remove_device_irq_domain(struct device *dev, unsigned int domid) irq_domain_remove(domain); irq_domain_free_fwnode(fwnode); kfree(container_of(info, struct msi_domain_template, info)); - -unlock: - msi_unlock_descs(dev); } /** @@ -1111,16 +1096,14 @@ bool msi_match_device_irq_domain(struct device *dev, unsigned int domid, { struct msi_domain_info *info; struct irq_domain *domain; - bool ret = false; - msi_lock_descs(dev); + guard(msi_descs_lock)(dev); domain = msi_get_device_domain(dev, domid); if (domain && irq_domain_is_msi_device(domain)) { info = domain->host_data; - ret = info->bus_token == bus_token; + return info->bus_token == bus_token; } - msi_unlock_descs(dev); - return ret; + return false; } static int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev, @@ -1391,12 +1374,9 @@ int msi_domain_alloc_irqs_range_locked(struct device *dev, unsigned int domid, int msi_domain_alloc_irqs_range(struct device *dev, unsigned int domid, unsigned int first, unsigned int last) { - int ret; - msi_lock_descs(dev); - ret = msi_domain_alloc_irqs_range_locked(dev, domid, first, last); - msi_unlock_descs(dev); - return ret; + guard(msi_descs_lock)(dev); + return msi_domain_alloc_irqs_range_locked(dev, domid, first, last); } EXPORT_SYMBOL_GPL(msi_domain_alloc_irqs_range); @@ -1500,12 +1480,8 @@ struct msi_map msi_domain_alloc_irq_at(struct device *dev, unsigned int domid, u const struct irq_affinity_desc *affdesc, union msi_instance_cookie *icookie) { - struct msi_map map; - - msi_lock_descs(dev); - map = __msi_domain_alloc_irq_at(dev, domid, index, affdesc, icookie); - msi_unlock_descs(dev); - return map; + guard(msi_descs_lock)(dev); + return __msi_domain_alloc_irq_at(dev, domid, index, affdesc, icookie); } /** @@ -1542,13 +1518,11 @@ int msi_device_domain_alloc_wired(struct irq_domain *domain, unsigned int hwirq, icookie.value = ((u64)type << 32) | hwirq; - msi_lock_descs(dev); + guard(msi_descs_lock)(dev); if (WARN_ON_ONCE(msi_get_device_domain(dev, domid) != domain)) map.index = -EINVAL; else map = __msi_domain_alloc_irq_at(dev, domid, MSI_ANY_INDEX, NULL, &icookie); - msi_unlock_descs(dev); - return map.index >= 0 ? 
map.virq : map.index; } @@ -1641,9 +1615,8 @@ void msi_domain_free_irqs_range_locked(struct device *dev, unsigned int domid, void msi_domain_free_irqs_range(struct device *dev, unsigned int domid, unsigned int first, unsigned int last) { - msi_lock_descs(dev); + guard(msi_descs_lock)(dev); msi_domain_free_irqs_range_locked(dev, domid, first, last); - msi_unlock_descs(dev); } EXPORT_SYMBOL_GPL(msi_domain_free_irqs_all); @@ -1673,9 +1646,8 @@ void msi_domain_free_irqs_all_locked(struct device *dev, unsigned int domid) */ void msi_domain_free_irqs_all(struct device *dev, unsigned int domid) { - msi_lock_descs(dev); + guard(msi_descs_lock)(dev); msi_domain_free_irqs_all_locked(dev, domid); - msi_unlock_descs(dev); } /** @@ -1694,12 +1666,11 @@ void msi_device_domain_free_wired(struct irq_domain *domain, unsigned int virq) if (WARN_ON_ONCE(!dev || !desc || domain->bus_token != DOMAIN_BUS_WIRED_TO_MSI)) return; - msi_lock_descs(dev); - if (!WARN_ON_ONCE(msi_get_device_domain(dev, MSI_DEFAULT_DOMAIN) != domain)) { - msi_domain_free_irqs_range_locked(dev, MSI_DEFAULT_DOMAIN, desc->msi_index, - desc->msi_index); - } - msi_unlock_descs(dev); + guard(msi_descs_lock)(dev); + if (WARN_ON_ONCE(msi_get_device_domain(dev, MSI_DEFAULT_DOMAIN) != domain)) + return; + msi_domain_free_irqs_range_locked(dev, MSI_DEFAULT_DOMAIN, desc->msi_index, + desc->msi_index); } /** -- cgit v1.2.3 From 9357e329cdeb6f2d27b431a22d4965700bec478a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Mar 2025 11:57:01 +0100 Subject: genirq/msi: Rename msi_[un]lock_descs() Now that all abuse is gone and the legit users are converted to guard(msi_descs_lock), rename the lock functions and document them as internal. No functional change. Signed-off-by: Thomas Gleixner Reviewed-by: Jonathan Cameron Link: https://lore.kernel.org/all/20250319105506.864699741@linutronix.de --- include/linux/msi.h | 8 ++++---- kernel/irq/msi.c | 16 ++++++++++------ 2 files changed, 14 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index 4e439916dbcf..8c0ec9fc05a3 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -229,11 +229,11 @@ struct msi_dev_domain { int msi_setup_device_data(struct device *dev); -void msi_lock_descs(struct device *dev); -void msi_unlock_descs(struct device *dev); +void __msi_lock_descs(struct device *dev); +void __msi_unlock_descs(struct device *dev); -DEFINE_LOCK_GUARD_1(msi_descs_lock, struct device, msi_lock_descs(_T->lock), - msi_unlock_descs(_T->lock)); +DEFINE_LOCK_GUARD_1(msi_descs_lock, struct device, __msi_lock_descs(_T->lock), + __msi_unlock_descs(_T->lock)); struct msi_desc *msi_domain_first_desc(struct device *dev, unsigned int domid, enum msi_desc_filter filter); diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 5747d2078782..a8f7701c2929 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -343,26 +343,30 @@ int msi_setup_device_data(struct device *dev) } /** - * msi_lock_descs - Lock the MSI descriptor storage of a device + * __msi_lock_descs - Lock the MSI descriptor storage of a device * @dev: Device to operate on + * + * Internal function for guard(msi_descs_lock). Don't use in code. 
*/ -void msi_lock_descs(struct device *dev) +void __msi_lock_descs(struct device *dev) { mutex_lock(&dev->msi.data->mutex); } -EXPORT_SYMBOL_GPL(msi_lock_descs); +EXPORT_SYMBOL_GPL(__msi_lock_descs); /** - * msi_unlock_descs - Unlock the MSI descriptor storage of a device + * __msi_unlock_descs - Unlock the MSI descriptor storage of a device * @dev: Device to operate on + * + * Internal function for guard(msi_descs_lock). Don't use in code. */ -void msi_unlock_descs(struct device *dev) +void __msi_unlock_descs(struct device *dev) { /* Invalidate the index which was cached by the iterator */ dev->msi.data->__iter_idx = MSI_XA_MAX_INDEX; mutex_unlock(&dev->msi.data->mutex); } -EXPORT_SYMBOL_GPL(msi_unlock_descs); +EXPORT_SYMBOL_GPL(__msi_unlock_descs); static struct msi_desc *msi_find_desc(struct msi_device_data *md, unsigned int domid, enum msi_desc_filter filter) -- cgit v1.2.3 From 6fec833b9d70c54ceacbf7d07665215fbd0cddef Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 28 Mar 2025 21:42:48 +0100 Subject: cpufreq: Add and use cpufreq policy locking guards Introduce "read" and "write" locking guards for cpufreq policies and use them where applicable in the cpufreq core. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar Reviewed-by: Mario Limonciello Acked-by: Sudeep Holla Tested-by: Sudeep Holla Link: https://patch.msgid.link/8518682.T7Z3S40VBb@rjwysocki.net --- drivers/cpufreq/cpufreq.c | 121 +++++++++++++++++++++------------------------- include/linux/cpufreq.h | 6 +++ 2 files changed, 60 insertions(+), 67 deletions(-) (limited to 'include/linux') diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 31ede4d998d2..a3abd9c0440d 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1009,17 +1009,16 @@ static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) { struct cpufreq_policy *policy = to_policy(kobj); struct freq_attr *fattr = to_attr(attr); - ssize_t ret = -EBUSY; if (!fattr->show) return -EIO; - down_read(&policy->rwsem); + guard(cpufreq_policy_read)(policy); + if (likely(!policy_is_inactive(policy))) - ret = fattr->show(policy, buf); - up_read(&policy->rwsem); + return fattr->show(policy, buf); - return ret; + return -EBUSY; } static ssize_t store(struct kobject *kobj, struct attribute *attr, @@ -1027,17 +1026,16 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr, { struct cpufreq_policy *policy = to_policy(kobj); struct freq_attr *fattr = to_attr(attr); - ssize_t ret = -EBUSY; if (!fattr->store) return -EIO; - down_write(&policy->rwsem); + guard(cpufreq_policy_write)(policy); + if (likely(!policy_is_inactive(policy))) - ret = fattr->store(policy, buf, count); - up_write(&policy->rwsem); + return fattr->store(policy, buf, count); - return ret; + return -EBUSY; } static void cpufreq_sysfs_release(struct kobject *kobj) @@ -1195,7 +1193,8 @@ static int cpufreq_add_policy_cpu(struct cpufreq_policy *policy, unsigned int cp if (cpumask_test_cpu(cpu, policy->cpus)) return 0; - down_write(&policy->rwsem); + guard(cpufreq_policy_write)(policy); + if (has_target()) cpufreq_stop_governor(policy); @@ -1206,7 +1205,7 @@ static int cpufreq_add_policy_cpu(struct cpufreq_policy *policy, unsigned int cp if (ret) pr_err("%s: Failed to start governor\n", __func__); } - up_write(&policy->rwsem); + return ret; } @@ -1226,9 +1225,10 @@ static void handle_update(struct work_struct *work) container_of(work, struct cpufreq_policy, update); pr_debug("handle_update for cpu %u 
called\n", policy->cpu); - down_write(&policy->rwsem); + + guard(cpufreq_policy_write)(policy); + refresh_frequency_limits(policy); - up_write(&policy->rwsem); } static int cpufreq_notifier_min(struct notifier_block *nb, unsigned long freq, @@ -1254,11 +1254,11 @@ static void cpufreq_policy_put_kobj(struct cpufreq_policy *policy) struct kobject *kobj; struct completion *cmp; - down_write(&policy->rwsem); - cpufreq_stats_free_table(policy); - kobj = &policy->kobj; - cmp = &policy->kobj_unregister; - up_write(&policy->rwsem); + scoped_guard(cpufreq_policy_write, policy) { + cpufreq_stats_free_table(policy); + kobj = &policy->kobj; + cmp = &policy->kobj_unregister; + } kobject_put(kobj); /* @@ -1409,7 +1409,7 @@ static int cpufreq_policy_online(struct cpufreq_policy *policy, unsigned int j; int ret; - down_write(&policy->rwsem); + guard(cpufreq_policy_write)(policy); policy->cpu = cpu; policy->governor = NULL; @@ -1586,10 +1586,7 @@ static int cpufreq_policy_online(struct cpufreq_policy *policy, goto out_destroy_policy; } -out_unlock: - up_write(&policy->rwsem); - - return ret; + return 0; out_destroy_policy: for_each_cpu(j, policy->real_cpus) @@ -1606,7 +1603,7 @@ out_exit_policy: out_clear_policy: cpumask_clear(policy->cpus); - goto out_unlock; + return ret; } static int cpufreq_online(unsigned int cpu) @@ -1754,11 +1751,10 @@ static int cpufreq_offline(unsigned int cpu) return 0; } - down_write(&policy->rwsem); + guard(cpufreq_policy_write)(policy); __cpufreq_offline(cpu, policy); - up_write(&policy->rwsem); return 0; } @@ -1775,33 +1771,29 @@ static void cpufreq_remove_dev(struct device *dev, struct subsys_interface *sif) if (!policy) return; - down_write(&policy->rwsem); + scoped_guard(cpufreq_policy_write, policy) { + if (cpu_online(cpu)) + __cpufreq_offline(cpu, policy); - if (cpu_online(cpu)) - __cpufreq_offline(cpu, policy); + remove_cpu_dev_symlink(policy, cpu, dev); - remove_cpu_dev_symlink(policy, cpu, dev); + if (!cpumask_empty(policy->real_cpus)) + return; - if (!cpumask_empty(policy->real_cpus)) { - up_write(&policy->rwsem); - return; - } + /* + * Unregister cpufreq cooling once all the CPUs of the policy + * are removed. + */ + if (cpufreq_thermal_control_enabled(cpufreq_driver)) { + cpufreq_cooling_unregister(policy->cdev); + policy->cdev = NULL; + } - /* - * Unregister cpufreq cooling once all the CPUs of the policy are - * removed. 
- */ - if (cpufreq_thermal_control_enabled(cpufreq_driver)) { - cpufreq_cooling_unregister(policy->cdev); - policy->cdev = NULL; + /* We did light-weight exit earlier, do full tear down now */ + if (cpufreq_driver->offline && cpufreq_driver->exit) + cpufreq_driver->exit(policy); } - /* We did light-weight exit earlier, do full tear down now */ - if (cpufreq_driver->offline && cpufreq_driver->exit) - cpufreq_driver->exit(policy); - - up_write(&policy->rwsem); - cpufreq_policy_free(policy); } @@ -1954,15 +1946,16 @@ unsigned int cpufreq_get(unsigned int cpu) struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); unsigned int ret_freq = 0; - if (policy) { - down_read(&policy->rwsem); + if (!policy) + return 0; + + scoped_guard(cpufreq_policy_read, policy) { if (cpufreq_driver->get) ret_freq = __cpufreq_get(policy); - up_read(&policy->rwsem); - - cpufreq_cpu_put(policy); } + cpufreq_cpu_put(policy); + return ret_freq; } EXPORT_SYMBOL(cpufreq_get); @@ -2022,9 +2015,9 @@ void cpufreq_suspend(void) for_each_active_policy(policy) { if (has_target()) { - down_write(&policy->rwsem); - cpufreq_stop_governor(policy); - up_write(&policy->rwsem); + scoped_guard(cpufreq_policy_write, policy) { + cpufreq_stop_governor(policy); + } } if (cpufreq_driver->suspend && cpufreq_driver->suspend(policy)) @@ -2065,9 +2058,9 @@ void cpufreq_resume(void) pr_err("%s: Failed to resume driver: %s\n", __func__, cpufreq_driver->name); } else if (has_target()) { - down_write(&policy->rwsem); - ret = cpufreq_start_governor(policy); - up_write(&policy->rwsem); + scoped_guard(cpufreq_policy_write, policy) { + ret = cpufreq_start_governor(policy); + } if (ret) pr_err("%s: Failed to start governor for CPU%u's policy\n", @@ -2434,15 +2427,9 @@ int cpufreq_driver_target(struct cpufreq_policy *policy, unsigned int target_freq, unsigned int relation) { - int ret; + guard(cpufreq_policy_write)(policy); - down_write(&policy->rwsem); - - ret = __cpufreq_driver_target(policy, target_freq, relation); - - up_write(&policy->rwsem); - - return ret; + return __cpufreq_driver_target(policy, target_freq, relation); } EXPORT_SYMBOL_GPL(cpufreq_driver_target); diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 400fee6427a5..cb972d2aa8df 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -170,6 +170,12 @@ struct cpufreq_policy { struct notifier_block nb_max; }; +DEFINE_GUARD(cpufreq_policy_write, struct cpufreq_policy *, + down_write(&_T->rwsem), up_write(&_T->rwsem)) + +DEFINE_GUARD(cpufreq_policy_read, struct cpufreq_policy *, + down_read(&_T->rwsem), up_read(&_T->rwsem)) + /* * Used for passing new cpufreq policy data to the cpufreq driver's ->verify() * callback for sanitization. That callback is only expected to modify the min -- cgit v1.2.3 From c7282dce257480f0f22ed3db69cfb400a18709f4 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 28 Mar 2025 21:45:38 +0100 Subject: cpufreq: Drop cpufreq_cpu_acquire() and cpufreq_cpu_release() Since cpufreq_cpu_acquire() and cpufreq_cpu_release() have no more users in the tree, remove them. No intentional functional impact. Signed-off-by: Rafael J. 
Wysocki Acked-by: Viresh Kumar Reviewed-by: Mario Limonciello Acked-by: Sudeep Holla Tested-by: Sudeep Holla Link: https://patch.msgid.link/3880470.kQq0lBPeGt@rjwysocki.net --- drivers/cpufreq/cpufreq.c | 45 --------------------------------------------- include/linux/cpufreq.h | 2 -- 2 files changed, 47 deletions(-) (limited to 'include/linux') diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 40244ff620b6..2ed873777fb5 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -255,51 +255,6 @@ void cpufreq_cpu_put(struct cpufreq_policy *policy) } EXPORT_SYMBOL_GPL(cpufreq_cpu_put); -/** - * cpufreq_cpu_release - Unlock a policy and decrement its usage counter. - * @policy: cpufreq policy returned by cpufreq_cpu_acquire(). - */ -void cpufreq_cpu_release(struct cpufreq_policy *policy) -{ - if (WARN_ON(!policy)) - return; - - lockdep_assert_held(&policy->rwsem); - - up_write(&policy->rwsem); - - cpufreq_cpu_put(policy); -} - -/** - * cpufreq_cpu_acquire - Find policy for a CPU, mark it as busy and lock it. - * @cpu: CPU to find the policy for. - * - * Call cpufreq_cpu_get() to get a reference on the cpufreq policy for @cpu and - * if the policy returned by it is not NULL, acquire its rwsem for writing. - * Return the policy if it is active or release it and return NULL otherwise. - * - * The policy returned by this function has to be released with the help of - * cpufreq_cpu_release() in order to release its rwsem and balance its usage - * counter properly. - */ -struct cpufreq_policy *cpufreq_cpu_acquire(unsigned int cpu) -{ - struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); - - if (!policy) - return NULL; - - down_write(&policy->rwsem); - - if (policy_is_inactive(policy)) { - cpufreq_cpu_release(policy); - return NULL; - } - - return policy; -} - /********************************************************************* * EXTERNALLY AFFECTING FREQUENCY CHANGES * *********************************************************************/ diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index cb972d2aa8df..a33a094ef755 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -241,8 +241,6 @@ void disable_cpufreq(void); u64 get_cpu_idle_time(unsigned int cpu, u64 *wall, int io_busy); -struct cpufreq_policy *cpufreq_cpu_acquire(unsigned int cpu); -void cpufreq_cpu_release(struct cpufreq_policy *policy); int cpufreq_get_policy(struct cpufreq_policy *policy, unsigned int cpu); void refresh_frequency_limits(struct cpufreq_policy *policy); void cpufreq_update_policy(unsigned int cpu); -- cgit v1.2.3 From eaff6b62d3439ca6ee00dba4f77673a8c37dac20 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 28 Mar 2025 21:48:54 +0100 Subject: cpufreq: Pass policy pointer to ->update_limits() Since cpufreq_update_limits() obtains a cpufreq policy pointer for the given CPU and reference counts the corresponding policy object, it may as well pass the policy pointer to the cpufreq driver's ->update_limits() callback which allows that callback to avoid invoking cpufreq_cpu_get() for the same CPU. Accordingly, redefine ->update_limits() to take a policy pointer instead of a CPU number and update both drivers implementing it, intel_pstate and amd-pstate, as needed. Signed-off-by: Rafael J. 
Wysocki Acked-by: Viresh Kumar Reviewed-by: Mario Limonciello Acked-by: Srinivas Pandruvada Acked-by: Sudeep Holla Tested-by: Sudeep Holla Link: https://patch.msgid.link/8560367.NyiUUSuA9g@rjwysocki.net --- drivers/cpufreq/amd-pstate.c | 7 ++----- drivers/cpufreq/cpufreq.c | 2 +- drivers/cpufreq/intel_pstate.c | 29 ++++++++++++++++++----------- include/linux/cpufreq.h | 2 +- 4 files changed, 22 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 6789eed1bb5b..b9d59c7425f5 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -821,19 +821,16 @@ static void amd_pstate_init_prefcore(struct amd_cpudata *cpudata) schedule_work(&sched_prefcore_work); } -static void amd_pstate_update_limits(unsigned int cpu) +static void amd_pstate_update_limits(struct cpufreq_policy *policy) { - struct cpufreq_policy *policy __free(put_cpufreq_policy) = cpufreq_cpu_get(cpu); struct amd_cpudata *cpudata; u32 prev_high = 0, cur_high = 0; bool highest_perf_changed = false; + unsigned int cpu = policy->cpu; if (!amd_pstate_prefcore) return; - if (!policy) - return; - if (amd_get_highest_perf(cpu, &cur_high)) return; diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index c885e0ec174f..2b91ba503b32 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -2769,7 +2769,7 @@ void cpufreq_update_limits(unsigned int cpu) return; if (cpufreq_driver->update_limits) - cpufreq_driver->update_limits(cpu); + cpufreq_driver->update_limits(policy); else cpufreq_policy_refresh(policy); } diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 108e4c6a371e..f5ca04b98b92 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -1353,14 +1353,9 @@ static void intel_pstate_update_policies(void) cpufreq_update_policy(cpu); } -static bool intel_pstate_update_max_freq(struct cpudata *cpudata) +static void __intel_pstate_update_max_freq(struct cpufreq_policy *policy, + struct cpudata *cpudata) { - struct cpufreq_policy *policy __free(put_cpufreq_policy); - - policy = cpufreq_cpu_get(cpudata->cpu); - if (!policy) - return false; - guard(cpufreq_policy_write)(policy); if (hwp_active) @@ -1370,16 +1365,28 @@ static bool intel_pstate_update_max_freq(struct cpudata *cpudata) cpudata->pstate.max_freq : cpudata->pstate.turbo_freq; refresh_frequency_limits(policy); +} + +static bool intel_pstate_update_max_freq(struct cpudata *cpudata) +{ + struct cpufreq_policy *policy __free(put_cpufreq_policy); + + policy = cpufreq_cpu_get(cpudata->cpu); + if (!policy) + return false; + + __intel_pstate_update_max_freq(policy, cpudata); return true; } -static void intel_pstate_update_limits(unsigned int cpu) +static void intel_pstate_update_limits(struct cpufreq_policy *policy) { - struct cpudata *cpudata = all_cpu_data[cpu]; + struct cpudata *cpudata = all_cpu_data[policy->cpu]; - if (intel_pstate_update_max_freq(cpudata)) - hybrid_update_capacity(cpudata); + __intel_pstate_update_max_freq(policy, cpudata); + + hybrid_update_capacity(cpudata); } static void intel_pstate_update_limits_for_all(void) diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index a33a094ef755..f3cf2adea18f 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -399,7 +399,7 @@ struct cpufreq_driver { unsigned int (*get)(unsigned int cpu); /* Called to update policy limits on firmware notifications. 
*/ - void (*update_limits)(unsigned int cpu); + void (*update_limits)(struct cpufreq_policy *policy); /* optional */ int (*bios_limit)(int cpu, unsigned int *limit); -- cgit v1.2.3 From 855c634930f04c21434fae9c41a7a7372b6ac879 Mon Sep 17 00:00:00 2001 From: Philipp Stanner Date: Thu, 27 Mar 2025 12:07:08 +0100 Subject: PCI: Remove pcim_iounmap_regions() All users of the deprecated function pcim_iounmap_regions() have been ported by now. Remove it. Signed-off-by: Philipp Stanner Signed-off-by: Bjorn Helgaas Reviewed-by: Zijun Hu Reviewed-by: Andy Shevchenko Link: https://patch.msgid.link/20250327110707.20025-4-phasta@kernel.org --- Documentation/driver-api/driver-model/devres.rst | 1 - drivers/pci/devres.c | 24 ------------------------ include/linux/pci.h | 1 - 3 files changed, 26 deletions(-) (limited to 'include/linux') diff --git a/Documentation/driver-api/driver-model/devres.rst b/Documentation/driver-api/driver-model/devres.rst index d75728eb05f8..601f1a74d34d 100644 --- a/Documentation/driver-api/driver-model/devres.rst +++ b/Documentation/driver-api/driver-model/devres.rst @@ -396,7 +396,6 @@ PCI pcim_iomap_regions() : do request_region() and iomap() on multiple BARs pcim_iomap_table() : array of mapped addresses indexed by BAR pcim_iounmap() : do iounmap() on a single BAR - pcim_iounmap_regions() : do iounmap() and release_region() on multiple BARs pcim_pin_device() : keep PCI device enabled after release pcim_set_mwi() : enable Memory-Write-Invalidate PCI transaction diff --git a/drivers/pci/devres.c b/drivers/pci/devres.c index 73047316889e..0317c56ac27d 100644 --- a/drivers/pci/devres.c +++ b/drivers/pci/devres.c @@ -955,30 +955,6 @@ err: } EXPORT_SYMBOL(pcim_request_all_regions); -/** - * pcim_iounmap_regions - Unmap and release PCI BARs (DEPRECATED) - * @pdev: PCI device to map IO resources for - * @mask: Mask of BARs to unmap and release - * - * Unmap and release regions specified by @mask. - * - * This function is DEPRECATED. Do not use it in new code. - * Use pcim_iounmap_region() instead. - */ -void pcim_iounmap_regions(struct pci_dev *pdev, int mask) -{ - int i; - - for (i = 0; i < PCI_STD_NUM_BARS; i++) { - if (!mask_contains_bar(mask, i)) - continue; - - pcim_iounmap_region(pdev, i); - pcim_remove_bar_from_legacy_table(pdev, i); - } -} -EXPORT_SYMBOL(pcim_iounmap_regions); - /** * pcim_iomap_range - Create a ranged __iomap mapping within a PCI BAR * @pdev: PCI device to map IO resources for diff --git a/include/linux/pci.h b/include/linux/pci.h index 0e8e3fd77e96..a60fdb344d9e 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -2322,7 +2322,6 @@ void pcim_iounmap(struct pci_dev *pdev, void __iomem *addr); void __iomem * const *pcim_iomap_table(struct pci_dev *pdev); int pcim_request_region(struct pci_dev *pdev, int bar, const char *name); int pcim_iomap_regions(struct pci_dev *pdev, int mask, const char *name); -void pcim_iounmap_regions(struct pci_dev *pdev, int mask); void __iomem *pcim_iomap_range(struct pci_dev *pdev, int bar, unsigned long offset, unsigned long len); -- cgit v1.2.3 From a82dc19db13649aa4232ce37cb6f4ceff851e2fe Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 8 Apr 2025 12:59:48 -0700 Subject: net: avoid potential race between netdev_get_by_index_lock() and netns switch netdev_get_by_index_lock() performs following steps: rcu_lock(); dev = lookup(netns, ifindex); dev_get(dev); rcu_unlock(); [... lock & validate the dev ...] 
return dev Validation right now only checks if the device is registered but since the lookup is netns-aware we must also protect against the device switching netns right after we dropped the RCU lock. Otherwise the caller in netns1 may get a pointer to a device which has just switched to netns2. We can't hold the lock for the entire netns change process (because of the NETDEV_UNREGISTER notifier), and there's no existing marking to indicate that the netns is unlisted because of netns move, so add one. AFAIU none of the existing netdev_get_by_index_lock() callers can suffer from this problem (NAPI code double checks the netns membership and other callers are either under rtnl_lock or not ns-sensitive), so this patch does not have to be treated as a fix. Reviewed-by: Joe Damato Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20250408195956.412733-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 6 +++++- net/core/dev.c | 25 ++++++++++++++++++------- net/core/dev.h | 2 +- 3 files changed, 24 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index cf3b6445817b..8e9be80bc167 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1952,6 +1952,7 @@ enum netdev_reg_state { * @priv_destructor: Called from unregister * @npinfo: XXX: need comments on this one * @nd_net: Network namespace this network device is inside + * protected by @lock * * @ml_priv: Mid-layer private * @ml_priv_type: Mid-layer private type @@ -2359,6 +2360,9 @@ struct net_device { bool dismantle; + /** @moving_ns: device is changing netns, protected by @lock */ + bool moving_ns; + enum { RTNL_LINK_INITIALIZED, RTNL_LINK_INITIALIZING, @@ -2521,7 +2525,7 @@ struct net_device { * @net_shaper_hierarchy, @reg_state, @threaded * * Double protects: - * @up + * @up, @moving_ns, @nd_net * * Double ops protects: * @real_num_rx_queues, @real_num_tx_queues diff --git a/net/core/dev.c b/net/core/dev.c index 4ccc6dc5303e..8c13e5339cc9 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -828,7 +828,7 @@ netdev_napi_by_id_lock(struct net *net, unsigned int napi_id) dev_hold(dev); rcu_read_unlock(); - dev = __netdev_put_lock(dev); + dev = __netdev_put_lock(dev, net); if (!dev) return NULL; @@ -1039,10 +1039,11 @@ struct net_device *dev_get_by_napi_id(unsigned int napi_id) * This helper is intended for locking net_device after it has been looked up * using a lockless lookup helper. Lock prevents the instance from going away. 
*/ -struct net_device *__netdev_put_lock(struct net_device *dev) +struct net_device *__netdev_put_lock(struct net_device *dev, struct net *net) { netdev_lock(dev); - if (dev->reg_state > NETREG_REGISTERED) { + if (dev->reg_state > NETREG_REGISTERED || + dev->moving_ns || !net_eq(dev_net(dev), net)) { netdev_unlock(dev); dev_put(dev); return NULL; @@ -1070,7 +1071,7 @@ struct net_device *netdev_get_by_index_lock(struct net *net, int ifindex) if (!dev) return NULL; - return __netdev_put_lock(dev); + return __netdev_put_lock(dev, net); } struct net_device * @@ -1090,7 +1091,7 @@ netdev_xa_find_lock(struct net *net, struct net_device *dev, dev_hold(dev); rcu_read_unlock(); - dev = __netdev_put_lock(dev); + dev = __netdev_put_lock(dev, net); if (dev) return dev; @@ -12157,7 +12158,11 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, netif_close(dev); /* And unlink it from device chain */ unlist_netdevice(dev); - netdev_unlock_ops(dev); + + if (!netdev_need_ops_lock(dev)) + netdev_lock(dev); + dev->moving_ns = true; + netdev_unlock(dev); synchronize_net(); @@ -12193,7 +12198,9 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, move_netdevice_notifiers_dev_net(dev, net); /* Actually switch the network namespace */ + netdev_lock(dev); dev_net_set(dev, net); + netdev_unlock(dev); dev->ifindex = new_ifindex; if (new_name[0]) { @@ -12219,7 +12226,11 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, err = netdev_change_owner(dev, net_old, net); WARN_ON(err); - netdev_lock_ops(dev); + netdev_lock(dev); + dev->moving_ns = false; + if (!netdev_need_ops_lock(dev)) + netdev_unlock(dev); + /* Add the device back in the hashes */ list_netdevice(dev); /* Notify protocols, that a new device appeared. */ diff --git a/net/core/dev.h b/net/core/dev.h index 710abc05ebdb..e7bf21f52fc7 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -30,7 +30,7 @@ netdev_napi_by_id_lock(struct net *net, unsigned int napi_id); struct net_device *dev_get_by_napi_id(unsigned int napi_id); struct net_device *netdev_get_by_index_lock(struct net *net, int ifindex); -struct net_device *__netdev_put_lock(struct net_device *dev); +struct net_device *__netdev_put_lock(struct net_device *dev, struct net *net); struct net_device * netdev_xa_find_lock(struct net *net, struct net_device *dev, unsigned long *index); -- cgit v1.2.3 From 606048cbd8346e616cfaee01b0143d072534136d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 8 Apr 2025 12:59:49 -0700 Subject: net: designate XSK pool pointers in queues as "ops protected" Read accesses go via xsk_get_pool_from_qid(), the call coming from the core and gve look safe (other "ops locked" drivers don't support XSK). Write accesses go via xsk_reg_pool_at_qid() and xsk_clear_pool_at_qid(). Former is already under the ops lock, latter is not (both coming from the workqueue via xp_clear_dev() and NETDEV_UNREGISTER via xsk_notifier()). 
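As a rough illustration of the resulting contract (hypothetical foo_* helper, not part of this patch): a reader of the per-queue pool pointer is expected to hold the instance ops lock, e.g.:

        /* Hypothetical reader, for illustration only: the queue's pool
         * pointer is "ops protected", so look it up with the instance
         * ops lock held.
         */
        static struct xsk_buff_pool *foo_get_pool(struct net_device *dev,
                                                  u16 queue_id)
        {
                netdev_ops_assert_locked(dev);

                return xsk_get_pool_from_qid(dev, queue_id);
        }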
Acked-by: Stanislav Fomichev Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20250408195956.412733-3-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 1 + include/net/netdev_rx_queue.h | 6 +++--- net/xdp/xsk_buff_pool.c | 6 +++++- 3 files changed, 9 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 8e9be80bc167..7242fb8a22fc 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -688,6 +688,7 @@ struct netdev_queue { /* Subordinate device that the queue has been assigned to */ struct net_device *sb_dev; #ifdef CONFIG_XDP_SOCKETS + /* "ops protected", see comment about net_device::lock */ struct xsk_buff_pool *pool; #endif diff --git a/include/net/netdev_rx_queue.h b/include/net/netdev_rx_queue.h index b2238b551dce..8cdcd138b33f 100644 --- a/include/net/netdev_rx_queue.h +++ b/include/net/netdev_rx_queue.h @@ -20,12 +20,12 @@ struct netdev_rx_queue { struct net_device *dev; netdevice_tracker dev_tracker; + /* All fields below are "ops protected", + * see comment about net_device::lock + */ #ifdef CONFIG_XDP_SOCKETS struct xsk_buff_pool *pool; #endif - /* NAPI instance for the queue - * "ops protected", see comment about net_device::lock - */ struct napi_struct *napi; struct pp_memory_provider_params mp_params; } ____cacheline_aligned_in_smp; diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 25a76c5ce0f1..cbf2129e808b 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -266,13 +266,17 @@ int xp_assign_dev_shared(struct xsk_buff_pool *pool, struct xdp_sock *umem_xs, void xp_clear_dev(struct xsk_buff_pool *pool) { + struct net_device *netdev = pool->netdev; + if (!pool->netdev) return; + netdev_lock_ops(netdev); xp_disable_drv_zc(pool); xsk_clear_pool_at_qid(pool->netdev, pool->queue_id); - dev_put(pool->netdev); pool->netdev = NULL; + netdev_unlock_ops(netdev); + dev_put(netdev); } static void xp_release_deferred(struct work_struct *work) -- cgit v1.2.3 From 03df156dd3a6d5992f17682cd5c3b11e5ffdae02 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 8 Apr 2025 12:59:52 -0700 Subject: xdp: double protect netdev->xdp_flags with netdev->lock Protect xdp_features with netdev->lock. This way pure readers no longer have to take rtnl_lock to access the field. This includes calling NETDEV_XDP_FEAT_CHANGE under the lock. Looks like that's fine for bonding, the only "real" listener, it's the same as ethtool feature change. In terms of normal drivers - only GVE need special consideration (other drivers don't use instance lock or don't support XDP). It calls xdp_set_features_flag() helper from gve_init_priv() which in turn is called from gve_reset_recovery() (locked), or prior to netdev registration. So switch to _locked. 
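As a rough usage sketch (hypothetical foo_* driver code, not from this patch): a caller that already holds the instance lock, such as a reset path, uses the new _locked variant, while other callers keep using the plain helper, which now takes the lock internally:

        /* Hypothetical driver helper, for illustration only. */
        static void foo_set_xdp_features(struct net_device *dev, bool have_zc)
        {
                xdp_features_t val = NETDEV_XDP_ACT_BASIC |
                                     NETDEV_XDP_ACT_REDIRECT;

                if (have_zc)
                        val |= NETDEV_XDP_ACT_XSK_ZEROCOPY;

                /* caller already holds the instance lock (e.g. reset path) */
                xdp_set_features_flag_locked(dev, val);
        }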
Reviewed-by: Joe Damato Acked-by: Stanislav Fomichev Acked-by: Harshitha Ramamurthy Acked-by: Martin KaFai Lau Link: https://patch.msgid.link/20250408195956.412733-6-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/networking/netdevices.rst | 1 + drivers/net/ethernet/google/gve/gve_main.c | 2 +- include/linux/netdevice.h | 2 +- include/net/xdp.h | 1 + net/core/lock_debug.c | 2 +- net/core/xdp.c | 12 +++++++++++- 6 files changed, 16 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/Documentation/networking/netdevices.rst b/Documentation/networking/netdevices.rst index 6c2d8945f597..d6357472d3f1 100644 --- a/Documentation/networking/netdevices.rst +++ b/Documentation/networking/netdevices.rst @@ -354,6 +354,7 @@ For devices with locked ops, currently only the following notifiers are running under the lock: * ``NETDEV_REGISTER`` * ``NETDEV_UP`` +* ``NETDEV_XDP_FEAT_CHANGE`` The following notifiers are running without the lock: * ``NETDEV_UNREGISTER`` diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c index f9a73c956861..7a249baee316 100644 --- a/drivers/net/ethernet/google/gve/gve_main.c +++ b/drivers/net/ethernet/google/gve/gve_main.c @@ -2185,7 +2185,7 @@ static void gve_set_netdev_xdp_features(struct gve_priv *priv) xdp_features = 0; } - xdp_set_features_flag(priv->dev, xdp_features); + xdp_set_features_flag_locked(priv->dev, xdp_features); } static int gve_init_priv(struct gve_priv *priv, bool skip_describe_device) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7242fb8a22fc..dece2ae396a1 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2526,7 +2526,7 @@ struct net_device { * @net_shaper_hierarchy, @reg_state, @threaded * * Double protects: - * @up, @moving_ns, @nd_net + * @up, @moving_ns, @nd_net, @xdp_flags * * Double ops protects: * @real_num_rx_queues, @real_num_tx_queues diff --git a/include/net/xdp.h b/include/net/xdp.h index 48efacbaa35d..20e41b5ff319 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -616,6 +616,7 @@ struct xdp_metadata_ops { u32 bpf_xdp_metadata_kfunc_id(int id); bool bpf_dev_bound_kfunc_id(u32 btf_id); void xdp_set_features_flag(struct net_device *dev, xdp_features_t val); +void xdp_set_features_flag_locked(struct net_device *dev, xdp_features_t val); void xdp_features_set_redirect_target(struct net_device *dev, bool support_sg); void xdp_features_clear_redirect_target(struct net_device *dev); #else diff --git a/net/core/lock_debug.c b/net/core/lock_debug.c index b7f22dc92a6f..598c443ef2f3 100644 --- a/net/core/lock_debug.c +++ b/net/core/lock_debug.c @@ -20,6 +20,7 @@ int netdev_debug_event(struct notifier_block *nb, unsigned long event, switch (cmd) { case NETDEV_REGISTER: case NETDEV_UP: + case NETDEV_XDP_FEAT_CHANGE: netdev_ops_assert_locked(dev); fallthrough; case NETDEV_DOWN: @@ -58,7 +59,6 @@ int netdev_debug_event(struct notifier_block *nb, unsigned long event, case NETDEV_OFFLOAD_XSTATS_DISABLE: case NETDEV_OFFLOAD_XSTATS_REPORT_USED: case NETDEV_OFFLOAD_XSTATS_REPORT_DELTA: - case NETDEV_XDP_FEAT_CHANGE: ASSERT_RTNL(); break; diff --git a/net/core/xdp.c b/net/core/xdp.c index f86eedad586a..3cd0db9c9d2d 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -17,6 +17,7 @@ #include #include +#include #include #include /* struct xdp_mem_allocator */ #include @@ -991,17 +992,26 @@ static int __init xdp_metadata_init(void) } late_initcall(xdp_metadata_init); -void xdp_set_features_flag(struct net_device *dev, 
xdp_features_t val) +void xdp_set_features_flag_locked(struct net_device *dev, xdp_features_t val) { val &= NETDEV_XDP_ACT_MASK; if (dev->xdp_features == val) return; + netdev_assert_locked_or_invisible(dev); dev->xdp_features = val; if (dev->reg_state == NETREG_REGISTERED) call_netdevice_notifiers(NETDEV_XDP_FEAT_CHANGE, dev); } +EXPORT_SYMBOL_GPL(xdp_set_features_flag_locked); + +void xdp_set_features_flag(struct net_device *dev, xdp_features_t val) +{ + netdev_lock(dev); + xdp_set_features_flag_locked(dev, val); + netdev_unlock(dev); +} EXPORT_SYMBOL_GPL(xdp_set_features_flag); void xdp_features_set_redirect_target(struct net_device *dev, bool support_sg) -- cgit v1.2.3 From 229671ac60e298b85c2644f52d7e487e9f487d06 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 8 Apr 2025 20:27:42 +0000 Subject: net: remove cpu stall in txq_trans_update() txq_trans_update() currently uses txq->xmit_lock_owner to conditionally update txq->trans_start. For regular devices, txq->xmit_lock_owner is updated from HARD_TX_LOCK() and HARD_TX_UNLOCK(), and this apparently causes cpu stalls. Using dev->lltx, which sits in a read-mostly cache-line, and already used in HARD_TX_LOCK() and HARD_TX_UNLOCK() helps cpu prediction. On an AMD EPYC 7B12 dual socket server, tcp_rr with 128 threads and 30,000 flows gets a 5 % increase in throughput. As explained in commit 95ecba62e2fd ("net: fix races in netdev_tx_sent_queue()/dev_watchdog()") I am planning to no longer update txq->trans_start in the fast path in a followup patch. Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20250408202742.2145516-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/ti/am65-cpsw-nuss.c | 2 +- include/linux/netdevice.h | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c index c9fd34787c99..e78de79a5d78 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c @@ -427,7 +427,7 @@ static void am65_cpsw_nuss_ndo_host_tx_timeout(struct net_device *ndev, if (netif_tx_queue_stopped(netif_txq)) { /* try recover if stopped by us */ - txq_trans_update(netif_txq); + txq_trans_update(ndev, netif_txq); netif_tx_wake_queue(netif_txq); } } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index dece2ae396a1..a28a08046615 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4693,9 +4693,10 @@ static inline void __netif_tx_unlock_bh(struct netdev_queue *txq) /* * txq->trans_start can be read locklessly from dev_watchdog() */ -static inline void txq_trans_update(struct netdev_queue *txq) +static inline void txq_trans_update(const struct net_device *dev, + struct netdev_queue *txq) { - if (txq->xmit_lock_owner != -1) + if (!dev->lltx) WRITE_ONCE(txq->trans_start, jiffies); } @@ -5214,7 +5215,7 @@ static inline netdev_tx_t netdev_start_xmit(struct sk_buff *skb, struct net_devi rc = __netdev_start_xmit(ops, skb, dev, more); if (rc == NETDEV_TX_OK) - txq_trans_update(txq); + txq_trans_update(dev, txq); return rc; } -- cgit v1.2.3 From 1490cbb9dbfd0eabfe45f9b674097aea7e6760fc Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 10 Mar 2025 16:54:51 +0200 Subject: device property: Split fwnode_get_child_node_count() The new helper is introduced to allow counting the child firmware nodes of their parent without requiring a device to be passed. 
This also makes the fwnode and device property API more symmetrical with the rest. Signed-off-by: Andy Shevchenko Acked-by: "Rafael J. Wysocki" Reviewed-by: Sakari Ailus Reviewed-by: Heikki Krogerus Reviewed-by: Jonathan Cameron Link: https://lore.kernel.org/r/20250310150835.3139322-2-andriy.shevchenko@linux.intel.com Signed-off-by: Lee Jones --- drivers/base/property.c | 12 ++++++------ include/linux/property.h | 7 ++++++- 2 files changed, 12 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/property.c b/drivers/base/property.c index c1392743df9c..805f75b35115 100644 --- a/drivers/base/property.c +++ b/drivers/base/property.c @@ -928,22 +928,22 @@ bool fwnode_device_is_available(const struct fwnode_handle *fwnode) EXPORT_SYMBOL_GPL(fwnode_device_is_available); /** - * device_get_child_node_count - return the number of child nodes for device - * @dev: Device to count the child nodes for + * fwnode_get_child_node_count - return the number of child nodes for a given firmware node + * @fwnode: Pointer to the parent firmware node * - * Return: the number of child nodes for a given device. + * Return: the number of child nodes for a given firmware node. */ -unsigned int device_get_child_node_count(const struct device *dev) +unsigned int fwnode_get_child_node_count(const struct fwnode_handle *fwnode) { struct fwnode_handle *child; unsigned int count = 0; - device_for_each_child_node(dev, child) + fwnode_for_each_child_node(fwnode, child) count++; return count; } -EXPORT_SYMBOL_GPL(device_get_child_node_count); +EXPORT_SYMBOL_GPL(fwnode_get_child_node_count); bool device_dma_supported(const struct device *dev) { diff --git a/include/linux/property.h b/include/linux/property.h index e214ecd241eb..bc5bfc98176b 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -208,7 +208,12 @@ DEFINE_FREE(fwnode_handle, struct fwnode_handle *, fwnode_handle_put(_T)) int fwnode_irq_get(const struct fwnode_handle *fwnode, unsigned int index); int fwnode_irq_get_byname(const struct fwnode_handle *fwnode, const char *name); -unsigned int device_get_child_node_count(const struct device *dev); +unsigned int fwnode_get_child_node_count(const struct fwnode_handle *fwnode); + +static inline unsigned int device_get_child_node_count(const struct device *dev) +{ + return fwnode_get_child_node_count(dev_fwnode(dev)); +} static inline int device_property_read_u8(const struct device *dev, const char *propname, u8 *val) -- cgit v1.2.3 From 7e3711eb87c584ed224a7ad7000eba36e6fa3a51 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Fri, 21 Mar 2025 10:53:55 +0100 Subject: fbdev: Track display blanking state Store the display's blank status in struct fb_info.blank and track it in fb_blank(). As an extra, the status is now available from the sysfs blank attribute. Support for blanking is optional. Therefore framebuffer_alloc() initializes the state to FB_BLANK_UNBLANK (i.e., the display is on). If the fb_blank callback has been set, register_framebuffer() sets the state to FB_BLANK_POWERDOWN. On the first modeset, the call to fb_blank() will update it to _UNBLANK. This is important, as listeners to FB_EVENT_BLANK will now see the display being switched on. 
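For illustration, a minimal sketch of a driver-side fb_blank callback under these rules (hypothetical foo_* names; a panel that only distinguishes on and off). fb_blank() stores the new state in info->blank before invoking the callback and rolls it back on error:

        /* Hypothetical fb_blank implementation, for illustration only. */
        static int foo_fb_blank(int blank, struct fb_info *info)
        {
                struct foo_panel *panel = info->par;

                if (blank == FB_BLANK_UNBLANK)
                        return foo_panel_power_on(panel);

                /* NORMAL, VSYNC/HSYNC_SUSPEND and POWERDOWN all power off */
                return foo_panel_power_off(panel);
        }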
Signed-off-by: Thomas Zimmermann Acked-by: Simona Vetter Link: https://lore.kernel.org/r/20250321095517.313713-3-tzimmermann@suse.de Signed-off-by: Lee Jones --- drivers/video/fbdev/core/fb_info.c | 1 + drivers/video/fbdev/core/fbmem.c | 17 ++++++++++++++++- drivers/video/fbdev/core/fbsysfs.c | 8 ++++---- include/linux/fb.h | 2 ++ 4 files changed, 23 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/video/fbdev/core/fb_info.c b/drivers/video/fbdev/core/fb_info.c index 4847ebe50d7d..52f9bd2c5417 100644 --- a/drivers/video/fbdev/core/fb_info.c +++ b/drivers/video/fbdev/core/fb_info.c @@ -42,6 +42,7 @@ struct fb_info *framebuffer_alloc(size_t size, struct device *dev) info->device = dev; info->fbcon_rotate_hint = -1; + info->blank = FB_BLANK_UNBLANK; #if IS_ENABLED(CONFIG_FB_BACKLIGHT) mutex_init(&info->bl_curve_mutex); diff --git a/drivers/video/fbdev/core/fbmem.c b/drivers/video/fbdev/core/fbmem.c index 39e2b81473ad..5d1529d300b7 100644 --- a/drivers/video/fbdev/core/fbmem.c +++ b/drivers/video/fbdev/core/fbmem.c @@ -341,6 +341,7 @@ EXPORT_SYMBOL(fb_set_var); int fb_blank(struct fb_info *info, int blank) { + int old_blank = info->blank; struct fb_event event; int ret; @@ -353,13 +354,19 @@ int fb_blank(struct fb_info *info, int blank) event.info = info; event.data = &blank; + info->blank = blank; + ret = info->fbops->fb_blank(blank, info); if (ret) - return ret; + goto err; fb_notifier_call_chain(FB_EVENT_BLANK, &event); return 0; + +err: + info->blank = old_blank; + return ret; } EXPORT_SYMBOL(fb_blank); @@ -408,6 +415,14 @@ static int do_register_framebuffer(struct fb_info *fb_info) mutex_init(&fb_info->lock); mutex_init(&fb_info->mm_lock); + /* + * With an fb_blank callback present, we assume that the + * display is blank, so that fb_blank() enables it on the + * first modeset. + */ + if (fb_info->fbops->fb_blank) + fb_info->blank = FB_BLANK_POWERDOWN; + fb_device_create(fb_info); if (fb_info->pixmap.addr == NULL) { diff --git a/drivers/video/fbdev/core/fbsysfs.c b/drivers/video/fbdev/core/fbsysfs.c index 06d75c767579..b8344c40073b 100644 --- a/drivers/video/fbdev/core/fbsysfs.c +++ b/drivers/video/fbdev/core/fbsysfs.c @@ -242,11 +242,11 @@ static ssize_t store_blank(struct device *device, return count; } -static ssize_t show_blank(struct device *device, - struct device_attribute *attr, char *buf) +static ssize_t show_blank(struct device *device, struct device_attribute *attr, char *buf) { -// struct fb_info *fb_info = dev_get_drvdata(device); - return 0; + struct fb_info *fb_info = dev_get_drvdata(device); + + return sysfs_emit(buf, "%d\n", fb_info->blank); } static ssize_t store_console(struct device *device, diff --git a/include/linux/fb.h b/include/linux/fb.h index cd653862ab99..348aa2e146e2 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -472,6 +472,8 @@ struct fb_info { struct list_head modelist; /* mode list */ struct fb_videomode *mode; /* current mode */ + int blank; /* current blanking; see FB_BLANK_ constants */ + #if IS_ENABLED(CONFIG_FB_BACKLIGHT) /* assigned backlight device */ /* set before framebuffer registration, -- cgit v1.2.3 From 726491f2038ec71122d45700f3abf36fdb277aaa Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Fri, 21 Mar 2025 10:53:57 +0100 Subject: backlight: Implement fbdev tracking with blank state from event Look at the blank state provided by FB_EVENT_BLANK to determine whether to enable or disable a backlight. Remove the tracking fields from struct backlight_device.
Tracking requires three variables, fb_on, prev_fb_on and the backlight's use_count. If fb_on is true, the display has been unblanked. The backlight needs to be enabled if the display was blanked before (i.e., prev_fb_on is false) or if use_count is still at 0. If fb_on is false, the display has been blanked. In this case, the backlight has to be disabled if the display was unblanked before and the backlight's use_count is greater than 0. This change removes fbdev state tracking from backlight. All the backlight requires is its own use counter and information about changes to the display. Removing fbdev internals makes backlight drivers easier to integrate into other display drivers, such as DRM. Signed-off-by: Thomas Zimmermann Reviewed-by: "Daniel Thompson (RISCstar)" Acked-by: Simona Vetter Link: https://lore.kernel.org/r/20250321095517.313713-5-tzimmermann@suse.de Signed-off-by: Lee Jones --- drivers/video/backlight/backlight.c | 14 +++++++------- include/linux/backlight.h | 10 +--------- 2 files changed, 8 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/drivers/video/backlight/backlight.c b/drivers/video/backlight/backlight.c index f699e5827ccb..bb01f57c4683 100644 --- a/drivers/video/backlight/backlight.c +++ b/drivers/video/backlight/backlight.c @@ -98,9 +98,9 @@ static int fb_notifier_callback(struct notifier_block *self, struct backlight_device *bd; struct fb_event *evdata = data; struct fb_info *info = evdata->info; + const int *fb_blank = evdata->data; struct backlight_device *fb_bd = fb_bl_device(info); - int node = info->node; - int fb_blank = 0; + bool fb_on, prev_fb_on; /* If we aren't interested in this event, skip it immediately ... */ if (event != FB_EVENT_BLANK) @@ -116,15 +116,15 @@ static int fb_notifier_callback(struct notifier_block *self, if (fb_bd && fb_bd != bd) goto out; - fb_blank = *(int *)evdata->data; - if (fb_blank == FB_BLANK_UNBLANK && !bd->fb_bl_on[node]) { - bd->fb_bl_on[node] = true; + fb_on = fb_blank[0] == FB_BLANK_UNBLANK; + prev_fb_on = fb_blank[1] == FB_BLANK_UNBLANK; + + if (fb_on && (!prev_fb_on || !bd->use_count)) { if (!bd->use_count++) { bd->props.state &= ~BL_CORE_FBBLANK; backlight_update_status(bd); } - } else if (fb_blank != FB_BLANK_UNBLANK && bd->fb_bl_on[node]) { - bd->fb_bl_on[node] = false; + } else if (!fb_on && prev_fb_on && bd->use_count) { if (!(--bd->use_count)) { bd->props.state |= BL_CORE_FBBLANK; backlight_update_status(bd); diff --git a/include/linux/backlight.h b/include/linux/backlight.h index f5652e5a9060..03723a5478f8 100644 --- a/include/linux/backlight.h +++ b/include/linux/backlight.h @@ -294,15 +294,7 @@ struct backlight_device { struct device dev; /** - * @fb_bl_on: The state of individual fbdev's. - * - * Multiple fbdev's may share one backlight device. The fb_bl_on - * records the state of the individual fbdev. - */ - bool fb_bl_on[FB_MAX]; - - /** - * @use_count: The number of uses of fb_bl_on. + * @use_count: The number of unblanked displays. */ int use_count; };
If the framebuffer device has a specific backlight, only update this one; otherwise update all. v4: - protect backlight declarations with IS_REACHABLE() (kernel test robot) v3: - declare empty fb_bl_notify_blank() as static inline (kernel test robot) Signed-off-by: Thomas Zimmermann Acked-by: Simona Vetter Reviewed-by: "Daniel Thompson (RISCstar)" Link: https://lore.kernel.org/r/20250321095517.313713-7-tzimmermann@suse.de Signed-off-by: Lee Jones --- drivers/video/backlight/backlight.c | 85 +++++---------------------------- drivers/video/fbdev/core/fb_backlight.c | 12 +++++ drivers/video/fbdev/core/fbmem.c | 2 + include/linux/backlight.h | 22 ++++++--- include/linux/fb.h | 4 ++ 5 files changed, 46 insertions(+), 79 deletions(-) (limited to 'include/linux') diff --git a/drivers/video/backlight/backlight.c b/drivers/video/backlight/backlight.c index 1c43f579396f..9dc93c5e480b 100644 --- a/drivers/video/backlight/backlight.c +++ b/drivers/video/backlight/backlight.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #ifdef CONFIG_PMAC_BACKLIGHT @@ -57,10 +56,10 @@ * a hot-key to adjust backlight, the driver must notify the backlight * core that brightness has changed using backlight_force_update(). * - * The backlight driver core receives notifications from fbdev and - * if the event is FB_EVENT_BLANK and if the value of blank, from the - * FBIOBLANK ioctrl, results in a change in the backlight state the - * update_status() operation is called. + * Display drives can control the backlight device's status using + * backlight_notify_blank() and backlight_notify_blank_all(). If this + * results in a change in the backlight state the functions call the + * update_status() operation. */ static struct list_head backlight_dev_list; @@ -78,11 +77,8 @@ static const char *const backlight_scale_types[] = { [BACKLIGHT_SCALE_NON_LINEAR] = "non-linear", }; -#if defined(CONFIG_FB_CORE) || (defined(CONFIG_FB_CORE_MODULE) && \ - defined(CONFIG_BACKLIGHT_CLASS_DEVICE_MODULE)) -static void backlight_notify_blank(struct backlight_device *bd, - struct device *display_dev, - bool fb_on, bool prev_fb_on) +void backlight_notify_blank(struct backlight_device *bd, struct device *display_dev, + bool fb_on, bool prev_fb_on) { guard(mutex)(&bd->ops_lock); @@ -103,68 +99,18 @@ static void backlight_notify_blank(struct backlight_device *bd, } } } +EXPORT_SYMBOL(backlight_notify_blank); -/* - * fb_notifier_callback - * - * This callback gets called when something important happens inside a - * framebuffer driver. The backlight core only cares about FB_BLANK_UNBLANK - * which is reported to the driver using backlight_update_status() - * as a state change. - * - * There may be several fbdev's connected to the backlight device, - * in which case they are kept track of. A state change is only reported - * if there is a change in backlight for the specified fbdev. - */ -static int fb_notifier_callback(struct notifier_block *self, - unsigned long event, void *data) +void backlight_notify_blank_all(struct device *display_dev, bool fb_on, bool prev_fb_on) { struct backlight_device *bd; - struct fb_event *evdata = data; - struct fb_info *info = evdata->info; - const int *fb_blank = evdata->data; - struct backlight_device *fb_bd = fb_bl_device(info); - bool fb_on, prev_fb_on; - - /* If we aren't interested in this event, skip it immediately ...
*/ - if (event != FB_EVENT_BLANK) - return 0; - - bd = container_of(self, struct backlight_device, fb_notif); - - if (fb_bd && fb_bd != bd) - return 0; - - fb_on = fb_blank[0] == FB_BLANK_UNBLANK; - prev_fb_on = fb_blank[1] == FB_BLANK_UNBLANK; - - backlight_notify_blank(bd, info->device, fb_on, prev_fb_on); - - return 0; -} - -static int backlight_register_fb(struct backlight_device *bd) -{ - memset(&bd->fb_notif, 0, sizeof(bd->fb_notif)); - bd->fb_notif.notifier_call = fb_notifier_callback; - return fb_register_client(&bd->fb_notif); -} + guard(mutex)(&backlight_dev_list_mutex); -static void backlight_unregister_fb(struct backlight_device *bd) -{ - fb_unregister_client(&bd->fb_notif); -} -#else -static inline int backlight_register_fb(struct backlight_device *bd) -{ - return 0; + list_for_each_entry(bd, &backlight_dev_list, entry) + backlight_notify_blank(bd, display_dev, fb_on, prev_fb_on); } - -static inline void backlight_unregister_fb(struct backlight_device *bd) -{ -} -#endif /* CONFIG_FB_CORE */ +EXPORT_SYMBOL(backlight_notify_blank_all); static void backlight_generate_event(struct backlight_device *bd, enum backlight_update_reason reason) @@ -455,12 +401,6 @@ struct backlight_device *backlight_device_register(const char *name, return ERR_PTR(rc); } - rc = backlight_register_fb(new_bd); - if (rc) { - device_unregister(&new_bd->dev); - return ERR_PTR(rc); - } - new_bd->ops = ops; #ifdef CONFIG_PMAC_BACKLIGHT @@ -547,7 +487,6 @@ void backlight_device_unregister(struct backlight_device *bd) bd->ops = NULL; mutex_unlock(&bd->ops_lock); - backlight_unregister_fb(bd); device_unregister(&bd->dev); } EXPORT_SYMBOL(backlight_device_unregister); diff --git a/drivers/video/fbdev/core/fb_backlight.c b/drivers/video/fbdev/core/fb_backlight.c index 6fdaa9f81be9..dbed9696f4c5 100644 --- a/drivers/video/fbdev/core/fb_backlight.c +++ b/drivers/video/fbdev/core/fb_backlight.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-or-later +#include #include #include #include @@ -36,4 +37,15 @@ struct backlight_device *fb_bl_device(struct fb_info *info) return info->bl_dev; } EXPORT_SYMBOL(fb_bl_device); + +void fb_bl_notify_blank(struct fb_info *info, int old_blank) +{ + bool on = info->blank == FB_BLANK_UNBLANK; + bool prev_on = old_blank == FB_BLANK_UNBLANK; + + if (info->bl_dev) + backlight_notify_blank(info->bl_dev, info->device, on, prev_on); + else + backlight_notify_blank_all(info->device, on, prev_on); +} #endif diff --git a/drivers/video/fbdev/core/fbmem.c b/drivers/video/fbdev/core/fbmem.c index 9650b641d8e8..c931f270ac34 100644 --- a/drivers/video/fbdev/core/fbmem.c +++ b/drivers/video/fbdev/core/fbmem.c @@ -363,6 +363,8 @@ int fb_blank(struct fb_info *info, int blank) if (ret) goto err; + fb_bl_notify_blank(info, old_blank); + fb_notifier_call_chain(FB_EVENT_BLANK, &event); return 0; diff --git a/include/linux/backlight.h b/include/linux/backlight.h index 03723a5478f8..10e626db7eee 100644 --- a/include/linux/backlight.h +++ b/include/linux/backlight.h @@ -12,7 +12,6 @@ #include #include #include -#include #include /** @@ -278,11 +277,6 @@ struct backlight_device { */ const struct backlight_ops *ops; - /** - * @fb_notif: The framebuffer notifier block - */ - struct notifier_block fb_notif; - /** * @entry: List entry of all registered backlight devices */ @@ -400,6 +394,22 @@ struct backlight_device *backlight_device_get_by_type(enum backlight_type type); int backlight_device_set_brightness(struct backlight_device *bd, unsigned long brightness); +#if 
IS_REACHABLE(CONFIG_BACKLIGHT_CLASS_DEVICE) +void backlight_notify_blank(struct backlight_device *bd, + struct device *display_dev, + bool fb_on, bool prev_fb_on); +void backlight_notify_blank_all(struct device *display_dev, + bool fb_on, bool prev_fb_on); +#else +static inline void backlight_notify_blank(struct backlight_device *bd, + struct device *display_dev, + bool fb_on, bool prev_fb_on) +{ } +static inline void backlight_notify_blank_all(struct device *display_dev, + bool fb_on, bool prev_fb_on) +{ } +#endif + #define to_backlight_device(obj) container_of(obj, struct backlight_device, dev) /** diff --git a/include/linux/fb.h b/include/linux/fb.h index 348aa2e146e2..835e13d6eba8 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -758,11 +758,15 @@ extern void fb_bl_default_curve(struct fb_info *fb_info, u8 off, u8 min, u8 max) #if IS_ENABLED(CONFIG_FB_BACKLIGHT) struct backlight_device *fb_bl_device(struct fb_info *info); +void fb_bl_notify_blank(struct fb_info *info, int old_blank); #else static inline struct backlight_device *fb_bl_device(struct fb_info *info) { return NULL; } + +static inline void fb_bl_notify_blank(struct fb_info *info, int old_blank) +{ } #endif static inline struct lcd_device *fb_lcd_device(struct fb_info *info) -- cgit v1.2.3 From bc70cc84f5a2ebfd7e7112e9b8837e0c7954fc65 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Fri, 21 Mar 2025 10:54:01 +0100 Subject: backlight: lcd: Replace fb events with a dedicated function call Remove support for fb events from the lcd subsystem. Provide the helper lcd_notify_blank_all() instead. In fbdev, call lcd_notify_blank_all() to inform the lcd subsystem of changes to a display's blank state. Fbdev maintains a list of all installed notifiers. Instead of fbdev notifiers, maintain an internal list of lcd devices. 
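A rough caller-side sketch (hypothetical foo_* display driver, not part of this patch): with the fbdev notifier gone, a display driver can inform the lcd class directly, using the LCD_POWER_* constants from the lcd header:

        /* Hypothetical display driver hook, for illustration only. */
        static void foo_display_blank(struct foo_display *disp, bool blank)
        {
                lcd_notify_blank_all(disp->dev,
                                     blank ? LCD_POWER_OFF : LCD_POWER_ON);
        }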
v3: - export lcd_notify_mode_change_all() (kernel test robot) v2: - maintain global list of lcd devices - avoid IS_REACHABLE() in source file - use lock guards - initialize lcd list and list mutex Signed-off-by: Thomas Zimmermann Acked-by: Simona Vetter Reviewed-by: "Daniel Thompson (RISCstar)" Link: https://lore.kernel.org/r/20250321095517.313713-9-tzimmermann@suse.de Signed-off-by: Lee Jones --- drivers/video/backlight/lcd.c | 98 +++++++++++----------------------------- drivers/video/fbdev/core/fbmem.c | 39 ++++++++++++++-- include/linux/lcd.h | 21 +++++++-- 3 files changed, 79 insertions(+), 79 deletions(-) (limited to 'include/linux') diff --git a/drivers/video/backlight/lcd.c b/drivers/video/backlight/lcd.c index f57ff8bcc2fa..affe5c52471a 100644 --- a/drivers/video/backlight/lcd.c +++ b/drivers/video/backlight/lcd.c @@ -15,9 +15,11 @@ #include #include #include -#include #include +static DEFINE_MUTEX(lcd_dev_list_mutex); +static LIST_HEAD(lcd_dev_list); + static void lcd_notify_blank(struct lcd_device *ld, struct device *display_dev, int power) { @@ -31,6 +33,17 @@ static void lcd_notify_blank(struct lcd_device *ld, struct device *display_dev, ld->ops->set_power(ld, power); } +void lcd_notify_blank_all(struct device *display_dev, int power) +{ + struct lcd_device *ld; + + guard(mutex)(&lcd_dev_list_mutex); + + list_for_each_entry(ld, &lcd_dev_list, entry) + lcd_notify_blank(ld, display_dev, power); +} +EXPORT_SYMBOL(lcd_notify_blank_all); + static void lcd_notify_mode_change(struct lcd_device *ld, struct device *display_dev, unsigned int width, unsigned int height) { @@ -44,75 +57,17 @@ static void lcd_notify_mode_change(struct lcd_device *ld, struct device *display ld->ops->set_mode(ld, width, height); } -#if defined(CONFIG_FB) || (defined(CONFIG_FB_MODULE) && \ - defined(CONFIG_LCD_CLASS_DEVICE_MODULE)) -static int to_lcd_power(int fb_blank) +void lcd_notify_mode_change_all(struct device *display_dev, + unsigned int width, unsigned int height) { - switch (fb_blank) { - case FB_BLANK_UNBLANK: - return LCD_POWER_ON; - /* deprecated; TODO: should become 'off' */ - case FB_BLANK_NORMAL: - return LCD_POWER_REDUCED; - case FB_BLANK_VSYNC_SUSPEND: - return LCD_POWER_REDUCED_VSYNC_SUSPEND; - /* 'off' */ - case FB_BLANK_HSYNC_SUSPEND: - case FB_BLANK_POWERDOWN: - default: - return LCD_POWER_OFF; - } -} + struct lcd_device *ld; -/* This callback gets called when something important happens inside a - * framebuffer driver. We're looking if that important event is blanking, - * and if it is, we're switching lcd power as well ... 
- */ -static int fb_notifier_callback(struct notifier_block *self, - unsigned long event, void *data) -{ - struct lcd_device *ld = container_of(self, struct lcd_device, fb_notif); - struct fb_event *evdata = data; - struct fb_info *info = evdata->info; - struct lcd_device *fb_lcd = fb_lcd_device(info); - - if (fb_lcd && fb_lcd != ld) - return 0; - - if (event == FB_EVENT_BLANK) { - int power = to_lcd_power(*(int *)evdata->data); - - lcd_notify_blank(ld, info->device, power); - } else { - const struct fb_videomode *videomode = evdata->data; - - lcd_notify_mode_change(ld, info->device, videomode->xres, videomode->yres); - } + guard(mutex)(&lcd_dev_list_mutex); - return 0; + list_for_each_entry(ld, &lcd_dev_list, entry) + lcd_notify_mode_change(ld, display_dev, width, height); } - -static int lcd_register_fb(struct lcd_device *ld) -{ - memset(&ld->fb_notif, 0, sizeof(ld->fb_notif)); - ld->fb_notif.notifier_call = fb_notifier_callback; - return fb_register_client(&ld->fb_notif); -} - -static void lcd_unregister_fb(struct lcd_device *ld) -{ - fb_unregister_client(&ld->fb_notif); -} -#else -static int lcd_register_fb(struct lcd_device *ld) -{ - return 0; -} - -static inline void lcd_unregister_fb(struct lcd_device *ld) -{ -} -#endif /* CONFIG_FB */ +EXPORT_SYMBOL(lcd_notify_mode_change_all); static ssize_t lcd_power_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -263,11 +218,8 @@ struct lcd_device *lcd_device_register(const char *name, struct device *parent, return ERR_PTR(rc); } - rc = lcd_register_fb(new_ld); - if (rc) { - device_unregister(&new_ld->dev); - return ERR_PTR(rc); - } + guard(mutex)(&lcd_dev_list_mutex); + list_add(&new_ld->entry, &lcd_dev_list); return new_ld; } @@ -284,10 +236,12 @@ void lcd_device_unregister(struct lcd_device *ld) if (!ld) return; + guard(mutex)(&lcd_dev_list_mutex); + list_del(&ld->entry); + mutex_lock(&ld->ops_lock); ld->ops = NULL; mutex_unlock(&ld->ops_lock); - lcd_unregister_fb(ld); device_unregister(&ld->dev); } diff --git a/drivers/video/fbdev/core/fbmem.c b/drivers/video/fbdev/core/fbmem.c index c931f270ac34..001662c606d7 100644 --- a/drivers/video/fbdev/core/fbmem.c +++ b/drivers/video/fbdev/core/fbmem.c @@ -15,6 +15,7 @@ #include #include #include +#include #include