From 90503f9ffee927c3abdc94a4862d13ae71ea9442 Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Thu, 12 Feb 2026 15:30:39 -0800 Subject: powercap: intel_rapl: Use unit conversion macros from units.h Replace hardcoded numeric constants with standard unit conversion macros from linux/units.h for better code clarity and self-documentation. Add MICROJOULE_PER_JOULE and NANOJOULE_PER_JOULE to units.h to support energy unit conversions, following the existing pattern for power units. No functional changes. Signed-off-by: Kuppuswamy Sathyanarayanan Acked-by: Srinivas Pandruvada Link: https://patch.msgid.link/20260212233044.329790-8-sathyanarayanan.kuppuswamy@linux.intel.com Signed-off-by: Rafael J. Wysocki --- include/linux/units.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/units.h b/include/linux/units.h index 80d57c50b9e3..c6d78988613a 100644 --- a/include/linux/units.h +++ b/include/linux/units.h @@ -57,6 +57,9 @@ #define MICROWATT_PER_MILLIWATT 1000UL #define MICROWATT_PER_WATT 1000000UL +#define MICROJOULE_PER_JOULE 1000000UL +#define NANOJOULE_PER_JOULE 1000000000UL + #define BYTES_PER_KBIT (KILO / BITS_PER_BYTE) #define BYTES_PER_MBIT (MEGA / BITS_PER_BYTE) #define BYTES_PER_GBIT (GIGA / BITS_PER_BYTE) -- cgit v1.2.3 From d7ca7d1488cc916dbf0a6a594abbda81d4eaeee9 Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Thu, 12 Feb 2026 15:30:40 -0800 Subject: powercap: intel_rapl: Allow interface drivers to configure rapl_defaults RAPL default settings vary across different RAPL interfaces (MSR, TPMI, MMIO). Currently, these defaults are stored in the common RAPL driver, which requires interface-specific handling logic and makes the common layer unnecessarily complex. There is no strong reason for the common code to own these defaults, since they are inherently interface-specific. To prepare for moving default configuration into the individual interface drivers, 1. 
Move struct rapl_defaults into a shared header so that interface drivers can directly populate their own default settings. 2. Change the @defaults field in struct rapl_if_priv from void * to const struct rapl_defaults * to improve type safety and readability and update the common driver to use the typed defaults structure. 3. Update all internal getter functions and local pointers to use const struct rapl_defaults * to maintain const-correctness. 4. Rename and export the common helper functions (check_unit, set_floor_freq, compute_time_window) so interface drivers may reuse or override them as appropriate. No functional changes. This is a preparatory refactoring to allow interface drivers to supply their own RAPL default settings. Co-developed-by: Zhang Rui Signed-off-by: Zhang Rui Signed-off-by: Kuppuswamy Sathyanarayanan Acked-by: Srinivas Pandruvada Link: https://patch.msgid.link/20260212233044.329790-9-sathyanarayanan.kuppuswamy@linux.intel.com Signed-off-by: Rafael J. Wysocki --- drivers/powercap/intel_rapl_common.c | 64 ++++++++++++++++-------------------- include/linux/intel_rapl.h | 17 ++++++++-- 2 files changed, 43 insertions(+), 38 deletions(-) (limited to 'include') diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c index 380893baf987..7c95eb658c16 100644 --- a/drivers/powercap/intel_rapl_common.c +++ b/drivers/powercap/intel_rapl_common.c @@ -221,20 +221,10 @@ static int get_pl_prim(struct rapl_domain *rd, int pl, enum pl_prims prim) #define power_zone_to_rapl_domain(_zone) \ container_of(_zone, struct rapl_domain, power_zone) -struct rapl_defaults { - u8 floor_freq_reg_addr; - int (*check_unit)(struct rapl_domain *rd); - void (*set_floor_freq)(struct rapl_domain *rd, bool mode); - u64 (*compute_time_window)(struct rapl_domain *rd, u64 val, - bool to_raw); - unsigned int dram_domain_energy_unit; - unsigned int psys_domain_energy_unit; - bool spr_psys_bits; -}; -static struct rapl_defaults *defaults_msr; +static 
const struct rapl_defaults *defaults_msr; static const struct rapl_defaults defaults_tpmi; -static struct rapl_defaults *get_defaults(struct rapl_package *rp) +static const struct rapl_defaults *get_defaults(struct rapl_package *rp) { return rp->priv->defaults; } @@ -351,7 +341,7 @@ static int find_nr_power_limit(struct rapl_domain *rd) static int set_domain_enable(struct powercap_zone *power_zone, bool mode) { struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone); - struct rapl_defaults *defaults = get_defaults(rd->rp); + const struct rapl_defaults *defaults = get_defaults(rd->rp); u64 val; int ret; @@ -640,7 +630,7 @@ static u64 rapl_unit_xlate(struct rapl_domain *rd, enum unit_type type, u64 value, int to_raw) { u64 units = 1; - struct rapl_defaults *defaults = get_defaults(rd->rp); + const struct rapl_defaults *defaults = get_defaults(rd->rp); u64 scale = 1; switch (type) { @@ -785,11 +775,11 @@ static int rapl_config(struct rapl_package *rp) /* MMIO I/F shares the same register layout as MSR registers */ case RAPL_IF_MMIO: case RAPL_IF_MSR: - rp->priv->defaults = (void *)defaults_msr; + rp->priv->defaults = defaults_msr; rp->priv->rpi = (void *)rpi_msr; break; case RAPL_IF_TPMI: - rp->priv->defaults = (void *)&defaults_tpmi; + rp->priv->defaults = &defaults_tpmi; rp->priv->rpi = (void *)rpi_tpmi; break; default: @@ -806,7 +796,7 @@ static int rapl_config(struct rapl_package *rp) static enum rapl_primitives prim_fixups(struct rapl_domain *rd, enum rapl_primitives prim) { - struct rapl_defaults *defaults = get_defaults(rd->rp); + const struct rapl_defaults *defaults = get_defaults(rd->rp); if (!defaults->spr_psys_bits) return prim; @@ -951,7 +941,7 @@ static int rapl_write_pl_data(struct rapl_domain *rd, int pl, * power unit : microWatts : Represented in milliWatts by default * time unit : microseconds: Represented in seconds by default */ -static int rapl_check_unit_core(struct rapl_domain *rd) +int rapl_default_check_unit(struct rapl_domain *rd) { 
struct reg_action ra; u32 value; @@ -978,6 +968,7 @@ static int rapl_check_unit_core(struct rapl_domain *rd) return 0; } +EXPORT_SYMBOL_NS_GPL(rapl_default_check_unit, "INTEL_RAPL"); static int rapl_check_unit_atom(struct rapl_domain *rd) { @@ -1071,7 +1062,7 @@ static void package_power_limit_irq_restore(struct rapl_package *rp) wrmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h); } -static void set_floor_freq_default(struct rapl_domain *rd, bool mode) +void rapl_default_set_floor_freq(struct rapl_domain *rd, bool mode) { int i; @@ -1085,11 +1076,12 @@ static void set_floor_freq_default(struct rapl_domain *rd, bool mode) rapl_write_pl_data(rd, i, PL_CLAMP, mode); } } +EXPORT_SYMBOL_NS_GPL(rapl_default_set_floor_freq, "INTEL_RAPL"); static void set_floor_freq_atom(struct rapl_domain *rd, bool enable) { static u32 power_ctrl_orig_val; - struct rapl_defaults *defaults = get_defaults(rd->rp); + const struct rapl_defaults *defaults = get_defaults(rd->rp); u32 mdata; if (!defaults->floor_freq_reg_addr) { @@ -1110,8 +1102,7 @@ static void set_floor_freq_atom(struct rapl_domain *rd, bool enable) defaults->floor_freq_reg_addr, mdata); } -static u64 rapl_compute_time_window_core(struct rapl_domain *rd, u64 value, - bool to_raw) +u64 rapl_default_compute_time_window(struct rapl_domain *rd, u64 value, bool to_raw) { u64 f, y; /* fraction and exp. 
used for time unit */ @@ -1142,6 +1133,7 @@ static u64 rapl_compute_time_window_core(struct rapl_domain *rd, u64 value, } return value; } +EXPORT_SYMBOL_NS_GPL(rapl_default_compute_time_window, "INTEL_RAPL"); static u64 rapl_compute_time_window_atom(struct rapl_domain *rd, u64 value, bool to_raw) @@ -1187,28 +1179,28 @@ static int rapl_check_unit_tpmi(struct rapl_domain *rd) static const struct rapl_defaults defaults_tpmi = { .check_unit = rapl_check_unit_tpmi, /* Reuse existing logic, ignore the PL_CLAMP failures and enable all Power Limits */ - .set_floor_freq = set_floor_freq_default, - .compute_time_window = rapl_compute_time_window_core, + .set_floor_freq = rapl_default_set_floor_freq, + .compute_time_window = rapl_default_compute_time_window, }; static const struct rapl_defaults rapl_defaults_core = { .floor_freq_reg_addr = 0, - .check_unit = rapl_check_unit_core, - .set_floor_freq = set_floor_freq_default, - .compute_time_window = rapl_compute_time_window_core, + .check_unit = rapl_default_check_unit, + .set_floor_freq = rapl_default_set_floor_freq, + .compute_time_window = rapl_default_compute_time_window, }; static const struct rapl_defaults rapl_defaults_hsw_server = { - .check_unit = rapl_check_unit_core, - .set_floor_freq = set_floor_freq_default, - .compute_time_window = rapl_compute_time_window_core, + .check_unit = rapl_default_check_unit, + .set_floor_freq = rapl_default_set_floor_freq, + .compute_time_window = rapl_default_compute_time_window, .dram_domain_energy_unit = 15300, }; static const struct rapl_defaults rapl_defaults_spr_server = { - .check_unit = rapl_check_unit_core, - .set_floor_freq = set_floor_freq_default, - .compute_time_window = rapl_compute_time_window_core, + .check_unit = rapl_default_check_unit, + .set_floor_freq = rapl_default_set_floor_freq, + .compute_time_window = rapl_default_compute_time_window, .psys_domain_energy_unit = NANOJOULE_PER_JOULE, .spr_psys_bits = true, }; @@ -1242,7 +1234,7 @@ static const struct 
rapl_defaults rapl_defaults_cht = { }; static const struct rapl_defaults rapl_defaults_amd = { - .check_unit = rapl_check_unit_core, + .check_unit = rapl_default_check_unit, }; static const struct x86_cpu_id rapl_ids[] __initconst = { @@ -1448,7 +1440,7 @@ static int rapl_check_domain(int domain, struct rapl_package *rp) */ static int rapl_get_domain_unit(struct rapl_domain *rd) { - struct rapl_defaults *defaults = get_defaults(rd->rp); + const struct rapl_defaults *defaults = get_defaults(rd->rp); int ret; if (!rd->regs[RAPL_DOMAIN_REG_UNIT].val) { @@ -2341,7 +2333,7 @@ static int __init rapl_init(void) id = x86_match_cpu(rapl_ids); if (id) { - defaults_msr = (struct rapl_defaults *)id->driver_data; + defaults_msr = (const struct rapl_defaults *)id->driver_data; rapl_msr_platdev = platform_device_alloc("intel_rapl_msr", 0); if (!rapl_msr_platdev) diff --git a/include/linux/intel_rapl.h b/include/linux/intel_rapl.h index fa1f328d6712..6d694099a3ad 100644 --- a/include/linux/intel_rapl.h +++ b/include/linux/intel_rapl.h @@ -128,6 +128,16 @@ struct reg_action { int err; }; +struct rapl_defaults { + u8 floor_freq_reg_addr; + int (*check_unit)(struct rapl_domain *rd); + void (*set_floor_freq)(struct rapl_domain *rd, bool mode); + u64 (*compute_time_window)(struct rapl_domain *rd, u64 val, bool to_raw); + unsigned int dram_domain_energy_unit; + unsigned int psys_domain_energy_unit; + bool spr_psys_bits; +}; + /** * struct rapl_if_priv: private data for different RAPL interfaces * @control_type: Each RAPL interface must have its own powercap @@ -142,7 +152,7 @@ struct reg_action { * registers. * @write_raw: Callback for writing RAPL interface specific * registers. 
- * @defaults: internal pointer to interface default settings + * @defaults: pointer to default settings * @rpi: internal pointer to interface primitive info */ struct rapl_if_priv { @@ -154,7 +164,7 @@ struct rapl_if_priv { int limits[RAPL_DOMAIN_MAX]; int (*read_raw)(int id, struct reg_action *ra, bool pmu_ctx); int (*write_raw)(int id, struct reg_action *ra); - void *defaults; + const struct rapl_defaults *defaults; void *rpi; }; @@ -211,6 +221,9 @@ void rapl_remove_package_cpuslocked(struct rapl_package *rp); struct rapl_package *rapl_find_package_domain(int id, struct rapl_if_priv *priv, bool id_is_cpu); struct rapl_package *rapl_add_package(int id, struct rapl_if_priv *priv, bool id_is_cpu); void rapl_remove_package(struct rapl_package *rp); +int rapl_default_check_unit(struct rapl_domain *rd); +void rapl_default_set_floor_freq(struct rapl_domain *rd, bool mode); +u64 rapl_default_compute_time_window(struct rapl_domain *rd, u64 value, bool to_raw); #ifdef CONFIG_PERF_EVENTS int rapl_package_add_pmu(struct rapl_package *rp); -- cgit v1.2.3 From 16c1e8385b3bb65d412d7a60107f8894587c63fa Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sat, 14 Mar 2026 15:25:44 -0400 Subject: cpufreq: optimize policy_is_shared() Switch to cpumask_nth() over cpumask_weight(), as it may return earlier - as soon as the function counts the required number of CPUs. Signed-off-by: Yury Norov Acked-by: Viresh Kumar Reviewed-by: Zhongqiu Han Link: https://patch.msgid.link/20260314192544.605914-1-ynorov@nvidia.com Signed-off-by: Rafael J.
Wysocki --- include/linux/cpufreq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index cc894fc38971..8ca2bcb3d7ae 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -232,7 +232,7 @@ static inline bool policy_is_inactive(struct cpufreq_policy *policy) static inline bool policy_is_shared(struct cpufreq_policy *policy) { - return cpumask_weight(policy->cpus) > 1; + return cpumask_nth(1, policy->cpus) < nr_cpumask_bits; } #ifdef CONFIG_CPU_FREQ -- cgit v1.2.3 From 16fb8d8a0e050e8f151da7dd2e03ccc500dfd8da Mon Sep 17 00:00:00 2001 From: Henry Tseng Date: Tue, 24 Mar 2026 17:09:48 +0800 Subject: cpufreq: acpi-cpufreq: use DMI max speed when CPPC is unavailable On AMD Ryzen Embedded V1780B (Family 17h, Zen 1), the BIOS does not provide ACPI _CPC objects and the CPU does not support MSR-based CPPC (X86_FEATURE_CPPC). The _PSS table only lists nominal P-states (P0 = 3350 MHz), so when get_max_boost_ratio() fails at cppc_get_perf_caps(), cpuinfo_max_freq reports only the base frequency instead of the rated boost frequency (3600 MHz). dmesg: ACPI CPPC: No CPC descriptor for CPU:0 acpi_cpufreq: CPU0: Unable to get performance capabilities (-19) cppc-cpufreq already has a DMI fallback (cppc_get_dmi_max_khz()) that reads the processor max speed from SMBIOS Type 4. Export it and reuse it in acpi-cpufreq as a last-resort source for the boost frequency. A sanity check ensures the DMI value is above the _PSS P0 frequency and within 2x of it; values outside that range are ignored and the existing arch_set_max_freq_ratio() path is taken instead. The 2x upper bound is based on a survey of the AMD Ryzen Embedded V1000 series, where the highest boost-to-base ratio is 1.8x (V1404I: 2.0 GHz base / 3.6 GHz boost). 
The DMI lookup and sanity check are wrapped in a helper, acpi_cpufreq_resolve_max_freq(), which falls through to arch_set_max_freq_ratio() if the DMI value is absent or out of range. Tested on AMD Ryzen Embedded V1780B with v7.0-rc4: Before: cpuinfo_max_freq = 3350000 (base only) After: cpuinfo_max_freq = 3600000 (includes boost) Link: https://www.amd.com/en/products/embedded/ryzen/ryzen-v1000-series.html#specifications Signed-off-by: Henry Tseng Link: https://patch.msgid.link/20260324090948.1667340-1-henrytseng@qnap.com Signed-off-by: Rafael J. Wysocki --- drivers/acpi/cppc_acpi.c | 3 ++- drivers/cpufreq/acpi-cpufreq.c | 31 ++++++++++++++++++++++++------- include/acpi/cppc_acpi.h | 1 + 3 files changed, 27 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c index f0e513e9ed5d..f53de414acf2 100644 --- a/drivers/acpi/cppc_acpi.c +++ b/drivers/acpi/cppc_acpi.c @@ -1944,7 +1944,7 @@ static void cppc_find_dmi_mhz(const struct dmi_header *dm, void *private) } /* Look up the max frequency in DMI */ -static u64 cppc_get_dmi_max_khz(void) +u64 cppc_get_dmi_max_khz(void) { u16 mhz = 0; @@ -1958,6 +1958,7 @@ static u64 cppc_get_dmi_max_khz(void) return KHZ_PER_MHZ * mhz; } +EXPORT_SYMBOL_GPL(cppc_get_dmi_max_khz); /* * If CPPC lowest_freq and nominal_freq registers are exposed then we can diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c index e7eff6c2f092..21639d9ac753 100644 --- a/drivers/cpufreq/acpi-cpufreq.c +++ b/drivers/cpufreq/acpi-cpufreq.c @@ -675,6 +675,29 @@ static inline u64 get_max_boost_ratio(unsigned int cpu, u64 *nominal_freq) } #endif +static void acpi_cpufreq_resolve_max_freq(struct cpufreq_policy *policy, + unsigned int pss_max_freq) +{ +#ifdef CONFIG_ACPI_CPPC_LIB + u64 max_speed = cppc_get_dmi_max_khz(); + /* + * Use DMI "Max Speed" if it looks plausible: must be + * above _PSS P0 frequency and within 2x of it. 
+ */ + if (max_speed > pss_max_freq && max_speed < pss_max_freq * 2) { + policy->cpuinfo.max_freq = max_speed; + return; + } +#endif + /* + * If the maximum "boost" frequency is unknown, ask the arch + * scale-invariance code to use the "nominal" performance for + * CPU utilization scaling so as to prevent the schedutil + * governor from selecting inadequate CPU frequencies. + */ + arch_set_max_freq_ratio(true); +} + static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) { struct cpufreq_frequency_table *freq_table; @@ -849,13 +872,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) policy->cpuinfo.max_freq = freq * max_boost_ratio >> SCHED_CAPACITY_SHIFT; } else { - /* - * If the maximum "boost" frequency is unknown, ask the arch - * scale-invariance code to use the "nominal" performance for - * CPU utilization scaling so as to prevent the schedutil - * governor from selecting inadequate CPU frequencies. - */ - arch_set_max_freq_ratio(true); + acpi_cpufreq_resolve_max_freq(policy, freq_table[0].frequency); } policy->freq_table = freq_table; diff --git a/include/acpi/cppc_acpi.h b/include/acpi/cppc_acpi.h index 4d644f03098e..e6c5ef3173c5 100644 --- a/include/acpi/cppc_acpi.h +++ b/include/acpi/cppc_acpi.h @@ -156,6 +156,7 @@ extern int cppc_set_enable(int cpu, bool enable); extern int cppc_get_perf_caps(int cpu, struct cppc_perf_caps *caps); extern bool cppc_perf_ctrs_in_pcc_cpu(unsigned int cpu); extern bool cppc_perf_ctrs_in_pcc(void); +extern u64 cppc_get_dmi_max_khz(void); extern unsigned int cppc_perf_to_khz(struct cppc_perf_caps *caps, unsigned int perf); extern unsigned int cppc_khz_to_perf(struct cppc_perf_caps *caps, unsigned int freq); extern bool acpi_cpc_valid(void); -- cgit v1.2.3 From f3b536878a3cf47e5193a96176a3ca2aaf0d848f Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 11 Mar 2026 22:14:44 -0700 Subject: powercap: correct kernel-doc function parameter names Use the correct function parameter names in kernel-doc 
comments to avoid these warnings: Warning: include/linux/powercap.h:254 function parameter 'name' not described in 'powercap_register_control_type' Warning: include/linux/powercap.h:298 function parameter 'nr_constraints' not described in 'powercap_register_zone' Signed-off-by: Randy Dunlap Link: https://patch.msgid.link/20260312051444.685136-1-rdunlap@infradead.org Signed-off-by: Rafael J. Wysocki --- include/linux/powercap.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/powercap.h b/include/linux/powercap.h index 3d557bbcd2c7..603419db924c 100644 --- a/include/linux/powercap.h +++ b/include/linux/powercap.h @@ -238,7 +238,7 @@ static inline void *powercap_get_zone_data(struct powercap_zone *power_zone) * Advantage of this parameter is that client can embed * this data in its data structures and allocate in a * single call, preventing multiple allocations. -* @control_type_name: The Name of this control_type, which will be shown +* @name: The Name of this control_type, which will be shown * in the sysfs Interface. * @ops: Callbacks for control type. This parameter is optional. * @@ -277,7 +277,7 @@ int powercap_unregister_control_type(struct powercap_control_type *instance); * @name: A name for this zone. * @parent: A pointer to the parent power zone instance if any or NULL * @ops: Pointer to zone operation callback structure. -* @no_constraints: Number of constraints for this zone +* @nr_constraints: Number of constraints for this zone * @const_ops: Pointer to constraint callback structure * * Register a power zone under a given control type. A power zone must register -- cgit v1.2.3 From 8765715b4e8a1dd24ab5d507c42fc0bcd3d83f5c Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Fri, 13 Mar 2026 11:53:27 -0700 Subject: powercap: intel_rapl: Remove unused AVERAGE_POWER primitive The AVERAGE_POWER primitive and RAPL_PRIMITIVE_DERIVED flag are not used anywhere in the code. 
Remove them to simplify the primitive handling logic. No functional changes. Co-developed-by: Zhang Rui Signed-off-by: Zhang Rui Signed-off-by: Kuppuswamy Sathyanarayanan Acked-by: Srinivas Pandruvada Link: https://patch.msgid.link/20260313185333.2370733-2-sathyanarayanan.kuppuswamy@linux.intel.com Signed-off-by: Rafael J. Wysocki --- drivers/powercap/intel_rapl_common.c | 13 ------------- include/linux/intel_rapl.h | 1 - 2 files changed, 14 deletions(-) (limited to 'include') diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c index 6a2153039f73..e099514e6c56 100644 --- a/drivers/powercap/intel_rapl_common.c +++ b/drivers/powercap/intel_rapl_common.c @@ -90,7 +90,6 @@ #define TPMI_INFO_MAX_TIME_WIN_MASK GENMASK_ULL(60, 54) /* Non HW constants */ -#define RAPL_PRIMITIVE_DERIVED BIT(1) /* not from raw data */ #define RAPL_PRIMITIVE_DUMMY BIT(2) #define ENERGY_UNIT_SCALE 1000 /* scale from driver unit to powercap unit */ @@ -703,9 +702,6 @@ static struct rapl_primitive_info rpi_msr[NR_RAPL_PRIMITIVES] = { 19, RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0), [PSYS_TIME_WINDOW2] = PRIMITIVE_INFO_INIT(PSYS_TIME_WINDOW2, PSYS_TIME_WINDOW2_MASK, 51, RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0), - /* non-hardware */ - [AVERAGE_POWER] = PRIMITIVE_INFO_INIT(AVERAGE_POWER, 0, 0, 0, POWER_UNIT, - RAPL_PRIMITIVE_DERIVED), }; /* RAPL primitives for TPMI I/F */ @@ -745,9 +741,6 @@ static struct rapl_primitive_info rpi_tpmi[NR_RAPL_PRIMITIVES] = { 54, RAPL_DOMAIN_REG_INFO, TIME_UNIT, 0), [THROTTLED_TIME] = PRIMITIVE_INFO_INIT(THROTTLED_TIME, PERF_STATUS_THROTTLE_TIME_MASK, 0, RAPL_DOMAIN_REG_PERF, TIME_UNIT, 0), - /* non-hardware */ - [AVERAGE_POWER] = PRIMITIVE_INFO_INIT(AVERAGE_POWER, 0, 0, 0, POWER_UNIT, - RAPL_PRIMITIVE_DERIVED), }; static struct rapl_primitive_info *get_rpi(struct rapl_package *rp, int prim) @@ -841,12 +834,6 @@ static int rapl_read_data_raw(struct rapl_domain *rd, if (!ra.reg.val) return -EINVAL; - /* non-hardware data are collected by the 
polling thread */ - if (rpi->flag & RAPL_PRIMITIVE_DERIVED) { - *data = rd->rdd.primitives[prim]; - return 0; - } - ra.mask = rpi->mask; if (rd->rp->priv->read_raw(get_rid(rd->rp), &ra, pmu_ctx)) { diff --git a/include/linux/intel_rapl.h b/include/linux/intel_rapl.h index 6d694099a3ad..9e6bd654be1f 100644 --- a/include/linux/intel_rapl.h +++ b/include/linux/intel_rapl.h @@ -77,7 +77,6 @@ enum rapl_primitives { PSYS_TIME_WINDOW1, PSYS_TIME_WINDOW2, /* below are not raw primitive data */ - AVERAGE_POWER, NR_RAPL_PRIMITIVES, }; -- cgit v1.2.3 From 6e39ba4e5a82aa5469b2ac517b74a71accb0540f Mon Sep 17 00:00:00 2001 From: Pierre Gondois Date: Thu, 26 Mar 2026 21:44:01 +0100 Subject: cpufreq: Add boost_freq_req QoS request The Power Management Quality of Service (PM QoS) allows aggregating constraints from multiple entities. It is currently used to manage the min/max frequency of a given policy. Frequency constraints can come for instance from: - Thermal framework: acpi_thermal_cpufreq_init() - Firmware: _PPC objects: acpi_processor_ppc_init() - User: by setting policyX/scaling_[min|max]_freq The minimum of the max frequency constraints is used to compute the resulting maximum allowed frequency. When enabling boost frequencies, the same frequency request object (policy->max_freq_req) that handles requests from users is used. As a result, when setting: - scaling_max_freq - boost The last sysfs file used overwrites the request from the other sysfs file. To avoid this, create a per-policy boost_freq_req to save the boost constraints instead of overwriting the last scaling_max_freq constraint. policy_set_boost() calls the cpufreq set_boost callback. Update the newly added boost_freq_req request from there: - whenever boost is toggled - to cover all possible paths In the existing .set_boost() callbacks: - Don't update policy->max as this is done through the qos notifier cpufreq_notifier_max() which calls cpufreq_set_policy().
- Remove freq_qos_update_request() calls as the qos request is now done in policy_set_boost() and updates the new boost_freq_req $ ## Init state scaling_max_freq:1000000 cpuinfo_max_freq:1000000 $ echo 700000 > scaling_max_freq scaling_max_freq:700000 cpuinfo_max_freq:1000000 $ echo 1 > ../boost scaling_max_freq:1200000 cpuinfo_max_freq:1200000 $ echo 800000 > scaling_max_freq scaling_max_freq:800000 cpuinfo_max_freq:1200000 $ ## Final step: $ ## Without the patches: $ echo 0 > ../boost scaling_max_freq:1000000 cpuinfo_max_freq:1000000 $ ## With the patches: $ echo 0 > ../boost scaling_max_freq:800000 cpuinfo_max_freq:1000000 Note: cpufreq_frequency_table_cpuinfo() updates policy->min and max from: A. cpufreq_boost_set_sw() \-cpufreq_frequency_table_cpuinfo() B. cpufreq_policy_online() \-cpufreq_table_validate_and_sort() \-cpufreq_frequency_table_cpuinfo() Keep these updates as some drivers expect policy->min and max to be set through B. Reviewed-by: Lifeng Zheng Signed-off-by: Pierre Gondois Acked-by: Viresh Kumar Link: https://patch.msgid.link/20260326204404.1401849-3-pierre.gondois@arm.com Signed-off-by: Rafael J. 
Wysocki --- drivers/cpufreq/amd-pstate.c | 2 -- drivers/cpufreq/cppc_cpufreq.c | 10 ++------- drivers/cpufreq/cpufreq.c | 46 ++++++++++++++++++++++++++++-------------- include/linux/cpufreq.h | 1 + 4 files changed, 34 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 5aa9fcd80cf5..d0675d6a19fe 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -769,8 +769,6 @@ static int amd_pstate_cpu_boost_update(struct cpufreq_policy *policy, bool on) else if (policy->cpuinfo.max_freq > nominal_freq) policy->cpuinfo.max_freq = nominal_freq; - policy->max = policy->cpuinfo.max_freq; - if (cppc_state == AMD_PSTATE_PASSIVE) { ret = freq_qos_update_request(&cpudata->req[1], policy->cpuinfo.max_freq); if (ret < 0) diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index 011f35cb47b9..f4f574fbe547 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -807,17 +807,11 @@ static int cppc_cpufreq_set_boost(struct cpufreq_policy *policy, int state) { struct cppc_cpudata *cpu_data = policy->driver_data; struct cppc_perf_caps *caps = &cpu_data->perf_caps; - int ret; if (state) - policy->max = cppc_perf_to_khz(caps, caps->highest_perf); + policy->cpuinfo.max_freq = cppc_perf_to_khz(caps, caps->highest_perf); else - policy->max = cppc_perf_to_khz(caps, caps->nominal_perf); - policy->cpuinfo.max_freq = policy->max; - - ret = freq_qos_update_request(policy->max_freq_req, policy->max); - if (ret < 0) - return ret; + policy->cpuinfo.max_freq = cppc_perf_to_khz(caps, caps->nominal_perf); return 0; } diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index b127f5cb682c..c0aa970c7a67 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -609,10 +609,19 @@ static int policy_set_boost(struct cpufreq_policy *policy, bool enable) policy->boost_enabled = enable; ret = cpufreq_driver->set_boost(policy, 
enable); - if (ret) + if (ret) { policy->boost_enabled = !policy->boost_enabled; + return ret; + } - return ret; + ret = freq_qos_update_request(policy->boost_freq_req, policy->cpuinfo.max_freq); + if (ret < 0) { + policy->boost_enabled = !policy->boost_enabled; + cpufreq_driver->set_boost(policy, policy->boost_enabled); + return ret; + } + + return 0; } static ssize_t store_local_boost(struct cpufreq_policy *policy, @@ -1377,6 +1386,7 @@ static void cpufreq_policy_free(struct cpufreq_policy *policy) } freq_qos_remove_request(policy->min_freq_req); + freq_qos_remove_request(policy->boost_freq_req); kfree(policy->min_freq_req); cpufreq_policy_put_kobj(policy); @@ -1442,26 +1452,38 @@ static int cpufreq_policy_online(struct cpufreq_policy *policy, cpumask_and(policy->cpus, policy->cpus, cpu_online_mask); if (new_policy) { + unsigned int count; + for_each_cpu(j, policy->related_cpus) { per_cpu(cpufreq_cpu_data, j) = policy; add_cpu_dev_symlink(policy, j, get_cpu_device(j)); } - policy->min_freq_req = kzalloc(2 * sizeof(*policy->min_freq_req), + count = policy->boost_supported ? 3 : 2; + policy->min_freq_req = kzalloc(count * sizeof(*policy->min_freq_req), GFP_KERNEL); if (!policy->min_freq_req) { ret = -ENOMEM; goto out_destroy_policy; } + if (policy->boost_supported) { + policy->boost_freq_req = policy->min_freq_req + 2; + + ret = freq_qos_add_request(&policy->constraints, + policy->boost_freq_req, + FREQ_QOS_MAX, + policy->cpuinfo.max_freq); + if (ret < 0) { + policy->boost_freq_req = NULL; + goto out_destroy_policy; + } + } + ret = freq_qos_add_request(&policy->constraints, policy->min_freq_req, FREQ_QOS_MIN, FREQ_QOS_MIN_DEFAULT_VALUE); if (ret < 0) { - /* - * So we don't call freq_qos_remove_request() for an - * uninitialized request. 
- */ kfree(policy->min_freq_req); policy->min_freq_req = NULL; goto out_destroy_policy; @@ -2785,16 +2807,10 @@ int cpufreq_boost_set_sw(struct cpufreq_policy *policy, int state) return -ENXIO; ret = cpufreq_frequency_table_cpuinfo(policy); - if (ret) { + if (ret) pr_err("%s: Policy frequency update failed\n", __func__); - return ret; - } - - ret = freq_qos_update_request(policy->max_freq_req, policy->max); - if (ret < 0) - return ret; - return 0; + return ret; } EXPORT_SYMBOL_GPL(cpufreq_boost_set_sw); diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 8ca2bcb3d7ae..b6f6c7d06912 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -81,6 +81,7 @@ struct cpufreq_policy { struct freq_constraints constraints; struct freq_qos_request *min_freq_req; struct freq_qos_request *max_freq_req; + struct freq_qos_request *boost_freq_req; struct cpufreq_frequency_table *freq_table; enum cpufreq_table_sorting freq_table_sorted; -- cgit v1.2.3 From 9266b4da051a410d9e6c5c0b0ef0c877855aa1b8 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 31 Mar 2026 10:33:46 +0530 Subject: cpufreq: Allocate QoS freq_req objects with policy A recent change exposed a bug in the error path: if freq_qos_add_request(boost_freq_req) fails, min_freq_req may remain a valid pointer even though it was never successfully added. During policy teardown, this leads to an unconditional call to freq_qos_remove_request(), triggering a WARN. The current design allocates all three freq_req objects together, making the lifetime rules unclear and error handling fragile. Simplify this by allocating the QoS freq_req objects at policy allocation time. The policy itself is dynamically allocated, and two of the three requests are always needed anyway. This ensures consistent lifetime management and eliminates the inconsistent state in failure paths. 
Reported-by: Zhongqiu Han Fixes: 6e39ba4e5a82 ("cpufreq: Add boost_freq_req QoS request") Signed-off-by: Viresh Kumar Reviewed-by: Lifeng Zheng Tested-by: Pierre Gondois Reviewed-by: Zhongqiu Han Link: https://patch.msgid.link/a293f29d841b86c51f34699c6e717e01858d8ada.1774933424.git.viresh.kumar@linaro.org Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq.c | 53 +++++++++++++---------------------------------- include/linux/cpufreq.h | 6 +++--- 2 files changed, 17 insertions(+), 42 deletions(-) (limited to 'include') diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index c0aa970c7a67..f4a949f1e48f 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -614,7 +614,7 @@ static int policy_set_boost(struct cpufreq_policy *policy, bool enable) return ret; } - ret = freq_qos_update_request(policy->boost_freq_req, policy->cpuinfo.max_freq); + ret = freq_qos_update_request(&policy->boost_freq_req, policy->cpuinfo.max_freq); if (ret < 0) { policy->boost_enabled = !policy->boost_enabled; cpufreq_driver->set_boost(policy, policy->boost_enabled); @@ -769,7 +769,7 @@ static ssize_t store_##file_name \ if (ret) \ return ret; \ \ - ret = freq_qos_update_request(policy->object##_freq_req, val);\ + ret = freq_qos_update_request(&policy->object##_freq_req, val); \ return ret >= 0 ? count : ret; \ } @@ -1374,7 +1374,7 @@ static void cpufreq_policy_free(struct cpufreq_policy *policy) /* Cancel any pending policy->update work before freeing the policy. 
*/ cancel_work_sync(&policy->update); - if (policy->max_freq_req) { + if (freq_qos_request_active(&policy->max_freq_req)) { /* * Remove max_freq_req after sending CPUFREQ_REMOVE_POLICY * notification, since CPUFREQ_CREATE_POLICY notification was @@ -1382,12 +1382,13 @@ static void cpufreq_policy_free(struct cpufreq_policy *policy) */ blocking_notifier_call_chain(&cpufreq_policy_notifier_list, CPUFREQ_REMOVE_POLICY, policy); - freq_qos_remove_request(policy->max_freq_req); + freq_qos_remove_request(&policy->max_freq_req); } - freq_qos_remove_request(policy->min_freq_req); - freq_qos_remove_request(policy->boost_freq_req); - kfree(policy->min_freq_req); + if (freq_qos_request_active(&policy->min_freq_req)) + freq_qos_remove_request(&policy->min_freq_req); + if (freq_qos_request_active(&policy->boost_freq_req)) + freq_qos_remove_request(&policy->boost_freq_req); cpufreq_policy_put_kobj(policy); free_cpumask_var(policy->real_cpus); @@ -1452,57 +1453,31 @@ static int cpufreq_policy_online(struct cpufreq_policy *policy, cpumask_and(policy->cpus, policy->cpus, cpu_online_mask); if (new_policy) { - unsigned int count; - for_each_cpu(j, policy->related_cpus) { per_cpu(cpufreq_cpu_data, j) = policy; add_cpu_dev_symlink(policy, j, get_cpu_device(j)); } - count = policy->boost_supported ? 
3 : 2; - policy->min_freq_req = kzalloc(count * sizeof(*policy->min_freq_req), - GFP_KERNEL); - if (!policy->min_freq_req) { - ret = -ENOMEM; - goto out_destroy_policy; - } - if (policy->boost_supported) { - policy->boost_freq_req = policy->min_freq_req + 2; - ret = freq_qos_add_request(&policy->constraints, - policy->boost_freq_req, + &policy->boost_freq_req, FREQ_QOS_MAX, policy->cpuinfo.max_freq); - if (ret < 0) { - policy->boost_freq_req = NULL; + if (ret < 0) goto out_destroy_policy; - } } ret = freq_qos_add_request(&policy->constraints, - policy->min_freq_req, FREQ_QOS_MIN, + &policy->min_freq_req, FREQ_QOS_MIN, FREQ_QOS_MIN_DEFAULT_VALUE); - if (ret < 0) { - kfree(policy->min_freq_req); - policy->min_freq_req = NULL; + if (ret < 0) goto out_destroy_policy; - } - - /* - * This must be initialized right here to avoid calling - * freq_qos_remove_request() on uninitialized request in case - * of errors. - */ - policy->max_freq_req = policy->min_freq_req + 1; ret = freq_qos_add_request(&policy->constraints, - policy->max_freq_req, FREQ_QOS_MAX, + &policy->max_freq_req, FREQ_QOS_MAX, FREQ_QOS_MAX_DEFAULT_VALUE); - if (ret < 0) { - policy->max_freq_req = NULL; + if (ret < 0) goto out_destroy_policy; - } blocking_notifier_call_chain(&cpufreq_policy_notifier_list, CPUFREQ_CREATE_POLICY, policy); diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index b6f6c7d06912..9b10eb486ece 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -79,9 +79,9 @@ struct cpufreq_policy { * called, but you're in IRQ context */ struct freq_constraints constraints; - struct freq_qos_request *min_freq_req; - struct freq_qos_request *max_freq_req; - struct freq_qos_request *boost_freq_req; + struct freq_qos_request min_freq_req; + struct freq_qos_request max_freq_req; + struct freq_qos_request boost_freq_req; struct cpufreq_frequency_table *freq_table; enum cpufreq_table_sorting freq_table_sorted; -- cgit v1.2.3 From 04bcbed4cd33495d05ba98857a748e416ab603b7 Mon 
Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Tue, 31 Mar 2026 14:19:46 -0700 Subject: powercap: intel_rapl: Move primitive info to header for interface drivers RAPL primitive information varies across different RAPL interfaces (MSR, TPMI, MMIO). Keeping them in the common code adds no benefit, but requires interface-specific handling logic and makes the common layer unnecessarily complex. Move the primitive info infrastructure to the shared header to allow interface drivers to configure RAPL primitives. Specific changes: 1. Move struct rapl_primitive_info, enum unit_type, and PRIMITIVE_INFO_INIT macro to intel_rapl.h. 2. Change the @rpi field in struct rapl_if_priv from void * to struct rapl_primitive_info * to improve type safety and eliminate unnecessary casts. No functional changes. This is a preparatory refactoring to allow interface drivers to supply their own RAPL primitive settings. Co-developed-by: Zhang Rui Signed-off-by: Zhang Rui Signed-off-by: Kuppuswamy Sathyanarayanan Link: https://patch.msgid.link/20260331211950.3329932-4-sathyanarayanan.kuppuswamy@linux.intel.com Signed-off-by: Rafael J. Wysocki --- drivers/powercap/intel_rapl_common.c | 32 ++------------------------------ include/linux/intel_rapl.h | 32 ++++++++++++++++++++++++++++++-- 2 files changed, 32 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c index f2637cc2cc6a..ffc9d0378257 100644 --- a/drivers/powercap/intel_rapl_common.c +++ b/drivers/powercap/intel_rapl_common.c @@ -100,13 +100,6 @@ #define RAPL_EVENT_MASK GENMASK(7, 0) -enum unit_type { - ARBITRARY_UNIT, /* no translation */ - POWER_UNIT, - ENERGY_UNIT, - TIME_UNIT, -}; - static const char *pl_names[NR_POWER_LIMITS] = { [POWER_LIMIT1] = "long_term", [POWER_LIMIT2] = "short_term", @@ -208,27 +201,6 @@ static const struct rapl_defaults *get_defaults(struct rapl_package *rp) return rp->priv->defaults; } -/* per domain data. 
used to describe individual knobs such that access function - * can be consolidated into one instead of many inline functions. - */ -struct rapl_primitive_info { - const char *name; - u64 mask; - int shift; - enum rapl_domain_reg_id id; - enum unit_type unit; - u32 flag; -}; - -#define PRIMITIVE_INFO_INIT(p, m, s, i, u, f) { \ - .name = #p, \ - .mask = m, \ - .shift = s, \ - .id = i, \ - .unit = u, \ - .flag = f \ - } - static void rapl_init_domains(struct rapl_package *rp); static int rapl_read_data_raw(struct rapl_domain *rd, enum rapl_primitives prim, @@ -748,10 +720,10 @@ static int rapl_config(struct rapl_package *rp) /* MMIO I/F shares the same register layout as MSR registers */ case RAPL_IF_MMIO: case RAPL_IF_MSR: - rp->priv->rpi = (void *)rpi_msr; + rp->priv->rpi = rpi_msr; break; case RAPL_IF_TPMI: - rp->priv->rpi = (void *)rpi_tpmi; + rp->priv->rpi = rpi_tpmi; break; default: return -EINVAL; diff --git a/include/linux/intel_rapl.h b/include/linux/intel_rapl.h index 9e6bd654be1f..01f290de3586 100644 --- a/include/linux/intel_rapl.h +++ b/include/linux/intel_rapl.h @@ -137,6 +137,34 @@ struct rapl_defaults { bool spr_psys_bits; }; +#define PRIMITIVE_INFO_INIT(p, m, s, i, u, f) { \ + .name = #p, \ + .mask = m, \ + .shift = s, \ + .id = i, \ + .unit = u, \ + .flag = f \ + } + +enum unit_type { + ARBITRARY_UNIT, /* no translation */ + POWER_UNIT, + ENERGY_UNIT, + TIME_UNIT, +}; + +/* per domain data. used to describe individual knobs such that access function + * can be consolidated into one instead of many inline functions. + */ +struct rapl_primitive_info { + const char *name; + u64 mask; + int shift; + enum rapl_domain_reg_id id; + enum unit_type unit; + u32 flag; +}; + /** * struct rapl_if_priv: private data for different RAPL interfaces * @control_type: Each RAPL interface must have its own powercap @@ -152,7 +180,7 @@ struct rapl_defaults { * @write_raw: Callback for writing RAPL interface specific * registers. 
* @defaults: pointer to default settings - * @rpi: internal pointer to interface primitive info + * @rpi: pointer to interface primitive info */ struct rapl_if_priv { enum rapl_if_type type; @@ -164,7 +192,7 @@ struct rapl_if_priv { int (*read_raw)(int id, struct reg_action *ra, bool pmu_ctx); int (*write_raw)(int id, struct reg_action *ra); const struct rapl_defaults *defaults; - void *rpi; + struct rapl_primitive_info *rpi; }; #ifdef CONFIG_PERF_EVENTS -- cgit v1.2.3 From c3bb8d4f5d802ec1a16f018e82030bccb7a053a4 Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Tue, 31 Mar 2026 14:19:50 -0700 Subject: powercap: intel_rapl: Consolidate PL4 and PMU support flags into rapl_defaults Currently, PL4 and MSR-based RAPL PMU support are detected using separate CPU ID tables (pl4_support_ids and pmu_support_ids) in the MSR driver probe path. This creates a maintenance burden since adding a new CPU requires updates in two places: the rapl_ids table and one or both of these capability tables. Consolidate PL4 and PMU capability information directly into struct rapl_defaults by adding msr_pl4_support and msr_pmu_support flags. This allows per-CPU capability to be expressed in a single place alongside other per-CPU defaults, eliminating the duplicate CPU ID tables entirely. No functional changes are intended. Co-developed-by: Zhang Rui Signed-off-by: Zhang Rui Acked-by: Srinivas Pandruvada Signed-off-by: Kuppuswamy Sathyanarayanan Link: https://patch.msgid.link/20260331211950.3329932-8-sathyanarayanan.kuppuswamy@linux.intel.com Signed-off-by: Rafael J. 
Wysocki --- drivers/powercap/intel_rapl_msr.c | 83 +++++++++++++++++---------------------- include/linux/intel_rapl.h | 2 + 2 files changed, 38 insertions(+), 47 deletions(-) (limited to 'include') diff --git a/drivers/powercap/intel_rapl_msr.c b/drivers/powercap/intel_rapl_msr.c index cfb35973f0b5..a34543e66446 100644 --- a/drivers/powercap/intel_rapl_msr.c +++ b/drivers/powercap/intel_rapl_msr.c @@ -216,33 +216,6 @@ static int rapl_msr_write_raw(int cpu, struct reg_action *ra) return ra->err; } -/* List of verified CPUs. */ -static const struct x86_cpu_id pl4_support_ids[] = { - X86_MATCH_VFM(INTEL_ICELAKE_L, NULL), - X86_MATCH_VFM(INTEL_TIGERLAKE_L, NULL), - X86_MATCH_VFM(INTEL_ALDERLAKE, NULL), - X86_MATCH_VFM(INTEL_ALDERLAKE_L, NULL), - X86_MATCH_VFM(INTEL_ATOM_GRACEMONT, NULL), - X86_MATCH_VFM(INTEL_RAPTORLAKE, NULL), - X86_MATCH_VFM(INTEL_RAPTORLAKE_P, NULL), - X86_MATCH_VFM(INTEL_METEORLAKE, NULL), - X86_MATCH_VFM(INTEL_METEORLAKE_L, NULL), - X86_MATCH_VFM(INTEL_ARROWLAKE_U, NULL), - X86_MATCH_VFM(INTEL_ARROWLAKE_H, NULL), - X86_MATCH_VFM(INTEL_PANTHERLAKE_L, NULL), - X86_MATCH_VFM(INTEL_WILDCATLAKE_L, NULL), - X86_MATCH_VFM(INTEL_NOVALAKE, NULL), - X86_MATCH_VFM(INTEL_NOVALAKE_L, NULL), - {} -}; - -/* List of MSR-based RAPL PMU support CPUs */ -static const struct x86_cpu_id pmu_support_ids[] = { - X86_MATCH_VFM(INTEL_PANTHERLAKE_L, NULL), - X86_MATCH_VFM(INTEL_WILDCATLAKE_L, NULL), - {} -}; - static int rapl_check_unit_atom(struct rapl_domain *rd) { struct reg_action ra; @@ -420,6 +393,23 @@ static const struct rapl_defaults rapl_defaults_amd = { .check_unit = rapl_default_check_unit, }; +static const struct rapl_defaults rapl_defaults_core_pl4 = { + .floor_freq_reg_addr = 0, + .check_unit = rapl_default_check_unit, + .set_floor_freq = rapl_default_set_floor_freq, + .compute_time_window = rapl_default_compute_time_window, + .msr_pl4_support = 1, +}; + +static const struct rapl_defaults rapl_defaults_core_pl4_pmu = { + .floor_freq_reg_addr = 0, + 
.check_unit = rapl_default_check_unit, + .set_floor_freq = rapl_default_set_floor_freq, + .compute_time_window = rapl_default_compute_time_window, + .msr_pl4_support = 1, + .msr_pmu_support = 1, +}; + static const struct x86_cpu_id rapl_ids[] = { X86_MATCH_VFM(INTEL_SANDYBRIDGE, &rapl_defaults_core), X86_MATCH_VFM(INTEL_SANDYBRIDGE_X, &rapl_defaults_core), @@ -443,35 +433,35 @@ static const struct x86_cpu_id rapl_ids[] = { X86_MATCH_VFM(INTEL_KABYLAKE_L, &rapl_defaults_core), X86_MATCH_VFM(INTEL_KABYLAKE, &rapl_defaults_core), X86_MATCH_VFM(INTEL_CANNONLAKE_L, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_ICELAKE_L, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_ICELAKE_L, &rapl_defaults_core_pl4), X86_MATCH_VFM(INTEL_ICELAKE, &rapl_defaults_core), X86_MATCH_VFM(INTEL_ICELAKE_NNPI, &rapl_defaults_core), X86_MATCH_VFM(INTEL_ICELAKE_X, &rapl_defaults_hsw_server), X86_MATCH_VFM(INTEL_ICELAKE_D, &rapl_defaults_hsw_server), X86_MATCH_VFM(INTEL_COMETLAKE_L, &rapl_defaults_core), X86_MATCH_VFM(INTEL_COMETLAKE, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_TIGERLAKE_L, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_TIGERLAKE_L, &rapl_defaults_core_pl4), X86_MATCH_VFM(INTEL_TIGERLAKE, &rapl_defaults_core), X86_MATCH_VFM(INTEL_ROCKETLAKE, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_ALDERLAKE, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_ALDERLAKE_L, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_ATOM_GRACEMONT, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_RAPTORLAKE, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_RAPTORLAKE_P, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_ALDERLAKE, &rapl_defaults_core_pl4), + X86_MATCH_VFM(INTEL_ALDERLAKE_L, &rapl_defaults_core_pl4), + X86_MATCH_VFM(INTEL_ATOM_GRACEMONT, &rapl_defaults_core_pl4), + X86_MATCH_VFM(INTEL_RAPTORLAKE, &rapl_defaults_core_pl4), + X86_MATCH_VFM(INTEL_RAPTORLAKE_P, &rapl_defaults_core_pl4), X86_MATCH_VFM(INTEL_RAPTORLAKE_S, &rapl_defaults_core), X86_MATCH_VFM(INTEL_BARTLETTLAKE, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_METEORLAKE, 
&rapl_defaults_core), - X86_MATCH_VFM(INTEL_METEORLAKE_L, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_METEORLAKE, &rapl_defaults_core_pl4), + X86_MATCH_VFM(INTEL_METEORLAKE_L, &rapl_defaults_core_pl4), X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, &rapl_defaults_spr_server), X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, &rapl_defaults_spr_server), X86_MATCH_VFM(INTEL_LUNARLAKE_M, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_PANTHERLAKE_L, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_WILDCATLAKE_L, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_NOVALAKE, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_NOVALAKE_L, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_ARROWLAKE_H, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_PANTHERLAKE_L, &rapl_defaults_core_pl4_pmu), + X86_MATCH_VFM(INTEL_WILDCATLAKE_L, &rapl_defaults_core_pl4_pmu), + X86_MATCH_VFM(INTEL_NOVALAKE, &rapl_defaults_core_pl4), + X86_MATCH_VFM(INTEL_NOVALAKE_L, &rapl_defaults_core_pl4), + X86_MATCH_VFM(INTEL_ARROWLAKE_H, &rapl_defaults_core_pl4), X86_MATCH_VFM(INTEL_ARROWLAKE, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_ARROWLAKE_U, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_ARROWLAKE_U, &rapl_defaults_core_pl4), X86_MATCH_VFM(INTEL_LAKEFIELD, &rapl_defaults_core), X86_MATCH_VFM(INTEL_ATOM_SILVERMONT, &rapl_defaults_byt), @@ -498,7 +488,6 @@ MODULE_DEVICE_TABLE(x86cpu, rapl_ids); static int rapl_msr_probe(struct platform_device *pdev) { - const struct x86_cpu_id *id = x86_match_cpu(pl4_support_ids); int ret; switch (boot_cpu_data.x86_vendor) { @@ -518,16 +507,16 @@ static int rapl_msr_probe(struct platform_device *pdev) rapl_msr_priv->defaults = (const struct rapl_defaults *)pdev->dev.platform_data; rapl_msr_priv->rpi = rpi_msr; - if (id) { + if (rapl_msr_priv->defaults->msr_pl4_support) { rapl_msr_priv->limits[RAPL_DOMAIN_PACKAGE] |= BIT(POWER_LIMIT4); rapl_msr_priv->regs[RAPL_DOMAIN_PACKAGE][RAPL_DOMAIN_REG_PL4].msr = MSR_VR_CURRENT_CONFIG; - pr_info("PL4 support detected.\n"); + pr_info("PL4 support detected (updated).\n"); } - if 
(x86_match_cpu(pmu_support_ids)) { + if (rapl_msr_priv->defaults->msr_pmu_support) { rapl_msr_pmu = true; - pr_info("MSR-based RAPL PMU support enabled\n"); + pr_info("MSR-based RAPL PMU support enabled (updated)\n"); } rapl_msr_priv->control_type = powercap_register_control_type(NULL, "intel-rapl", NULL); diff --git a/include/linux/intel_rapl.h b/include/linux/intel_rapl.h index 01f290de3586..328004f605c3 100644 --- a/include/linux/intel_rapl.h +++ b/include/linux/intel_rapl.h @@ -135,6 +135,8 @@ struct rapl_defaults { unsigned int dram_domain_energy_unit; unsigned int psys_domain_energy_unit; bool spr_psys_bits; + bool msr_pl4_support; + bool msr_pmu_support; }; #define PRIMITIVE_INFO_INIT(p, m, s, i, u, f) { \ -- cgit v1.2.3 From c03791085adcd61fa9b766ab303c7d0941d7378d Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Mon, 16 Mar 2026 08:18:49 +0000 Subject: cpufreq: Pass the policy to cpufreq_driver->adjust_perf() cpufreq_cpu_get() can sleep on PREEMPT_RT in presence of concurrent writer(s), however amd-pstate depends on fetching the cpudata via the policy's driver data which necessitates grabbing the reference. Since schedutil governor can call "cpufreq_driver->adjust_perf()" during sched_tick/enqueue/dequeue with rq_lock held and IRQs disabled, fetching the policy object using the cpufreq_cpu_get() helper in the scheduler fast-path leads to "BUG: scheduling while atomic" on PREEMPT_RT [1]. Pass the cached cpufreq policy object in sg_policy to the adjust_perf() instead of just the CPU. The CPU can be inferred using "policy->cpu". The lifetime of cpufreq_policy object outlasts that of the governor and the cpufreq driver (allocated when the CPU is onlined and only reclaimed when the CPU is offlined / the CPU device is removed) which makes it safe to be referenced throughout the governor's lifetime.
Closes: https://lore.kernel.org/all/20250731092316.3191-1-spasswolf@web.de/ [1] Fixes: 1d215f0319c2 ("cpufreq: amd-pstate: Add fast switch function for AMD P-State") Reported-by: Bert Karwatzki Acked-by: Viresh Kumar Signed-off-by: K Prateek Nayak Acked-by: Gary Guo # Rust Reviewed-by: Gautham R. Shenoy Reviewed-by: Zhongqiu Han Link: https://lore.kernel.org/r/20260316081849.19368-3-kprateek.nayak@amd.com Signed-off-by: Mario Limonciello (AMD) --- drivers/cpufreq/amd-pstate.c | 3 +-- drivers/cpufreq/cpufreq.c | 6 +++--- drivers/cpufreq/intel_pstate.c | 4 ++-- include/linux/cpufreq.h | 4 ++-- kernel/sched/cpufreq_schedutil.c | 5 +++-- rust/kernel/cpufreq.rs | 13 ++++++------- 6 files changed, 17 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 2ea4d27fe020..c825fab0bf5c 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -788,13 +788,12 @@ static unsigned int amd_pstate_fast_switch(struct cpufreq_policy *policy, return policy->cur; } -static void amd_pstate_adjust_perf(unsigned int cpu, +static void amd_pstate_adjust_perf(struct cpufreq_policy *policy, unsigned long _min_perf, unsigned long target_perf, unsigned long capacity) { u8 max_perf, min_perf, des_perf, cap_perf; - struct cpufreq_policy *policy __free(put_cpufreq_policy) = cpufreq_cpu_get(cpu); struct amd_cpudata *cpudata; union perf_cached perf; diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 277884d91913..90e939069cde 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -2231,7 +2231,7 @@ EXPORT_SYMBOL_GPL(cpufreq_driver_fast_switch); /** * cpufreq_driver_adjust_perf - Adjust CPU performance level in one go. - * @cpu: Target CPU. + * @policy: cpufreq policy object of the target CPU. * @min_perf: Minimum (required) performance level (units of @capacity). * @target_perf: Target (desired) performance level (units of @capacity).
* @capacity: Capacity of the target CPU. @@ -2250,12 +2250,12 @@ EXPORT_SYMBOL_GPL(cpufreq_driver_fast_switch); * parallel with either ->target() or ->target_index() or ->fast_switch() for * the same CPU. */ -void cpufreq_driver_adjust_perf(unsigned int cpu, +void cpufreq_driver_adjust_perf(struct cpufreq_policy *policy, unsigned long min_perf, unsigned long target_perf, unsigned long capacity) { - cpufreq_driver->adjust_perf(cpu, min_perf, target_perf, capacity); + cpufreq_driver->adjust_perf(policy, min_perf, target_perf, capacity); } /** diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 11c58af41900..0f50034e4b68 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -3239,12 +3239,12 @@ static unsigned int intel_cpufreq_fast_switch(struct cpufreq_policy *policy, return target_pstate * cpu->pstate.scaling; } -static void intel_cpufreq_adjust_perf(unsigned int cpunum, +static void intel_cpufreq_adjust_perf(struct cpufreq_policy *policy, unsigned long min_perf, unsigned long target_perf, unsigned long capacity) { - struct cpudata *cpu = all_cpu_data[cpunum]; + struct cpudata *cpu = all_cpu_data[policy->cpu]; u64 hwp_cap = READ_ONCE(cpu->hwp_cap_cached); int old_pstate = cpu->pstate.current_pstate; int cap_pstate, min_pstate, max_pstate, target_pstate; diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index cc894fc38971..4317c5a312bd 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -372,7 +372,7 @@ struct cpufreq_driver { * conditions) scale invariance can be disabled, which causes the * schedutil governor to fall back to the latter. 
*/ - void (*adjust_perf)(unsigned int cpu, + void (*adjust_perf)(struct cpufreq_policy *policy, unsigned long min_perf, unsigned long target_perf, unsigned long capacity); @@ -617,7 +617,7 @@ struct cpufreq_governor { /* Pass a target to the cpufreq driver */ unsigned int cpufreq_driver_fast_switch(struct cpufreq_policy *policy, unsigned int target_freq); -void cpufreq_driver_adjust_perf(unsigned int cpu, +void cpufreq_driver_adjust_perf(struct cpufreq_policy *policy, unsigned long min_perf, unsigned long target_perf, unsigned long capacity); diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 153232dd8276..ae9fd211cec1 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -461,6 +461,7 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time, unsigned int flags) { struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); + struct sugov_policy *sg_policy = sg_cpu->sg_policy; unsigned long prev_util = sg_cpu->util; unsigned long max_cap; @@ -482,10 +483,10 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time, if (sugov_hold_freq(sg_cpu) && sg_cpu->util < prev_util) sg_cpu->util = prev_util; - cpufreq_driver_adjust_perf(sg_cpu->cpu, sg_cpu->bw_min, + cpufreq_driver_adjust_perf(sg_policy->policy, sg_cpu->bw_min, sg_cpu->util, max_cap); - sg_cpu->sg_policy->last_freq_update_time = time; + sg_policy->last_freq_update_time = time; } static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) diff --git a/rust/kernel/cpufreq.rs b/rust/kernel/cpufreq.rs index f5adee48d40c..d8d26870bea2 100644 --- a/rust/kernel/cpufreq.rs +++ b/rust/kernel/cpufreq.rs @@ -1257,18 +1257,17 @@ impl Registration { /// # Safety /// /// - This function may only be called from the cpufreq C infrastructure. + /// - The pointer arguments must be valid pointers. 
unsafe extern "C" fn adjust_perf_callback( - cpu: c_uint, + ptr: *mut bindings::cpufreq_policy, min_perf: c_ulong, target_perf: c_ulong, capacity: c_ulong, ) { - // SAFETY: The C API guarantees that `cpu` refers to a valid CPU number. - let cpu_id = unsafe { CpuId::from_u32_unchecked(cpu) }; - - if let Ok(mut policy) = PolicyCpu::from_cpu(cpu_id) { - T::adjust_perf(&mut policy, min_perf, target_perf, capacity); - } + // SAFETY: The `ptr` is guaranteed to be valid by the contract with the C code for the + // lifetime of `policy`. + let policy = unsafe { Policy::from_raw_mut(ptr) }; + T::adjust_perf(policy, min_perf, target_perf, capacity); } /// Driver's `get_intermediate` callback. -- cgit v1.2.3