50 files changed, 1160 insertions, 384 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 2b0692a642a7..8870a29f92a8 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -7041,3 +7041,10 @@ management firmware translates the requests into actual hardware states (core frequency, data fabric and memory clocks etc.) + active + Use the amd_pstate_epp driver instance as the scaling driver. + The driver provides a hint to the CPPC firmware indicating + whether software wants to bias toward performance (0x0) or + energy efficiency (0xff). The CPPC power algorithm then + calculates the runtime workload and adjusts the real-time + core frequency. diff --git a/Documentation/admin-guide/pm/amd-pstate.rst b/Documentation/admin-guide/pm/amd-pstate.rst index 5376d53faaa8..d143e72cf93e 100644 --- a/Documentation/admin-guide/pm/amd-pstate.rst +++ b/Documentation/admin-guide/pm/amd-pstate.rst @@ -230,8 +230,8 @@ with :c:macro:`MSR_AMD_CPPC_ENABLE` or ``cppc_set_enable``, it will respond to the request from AMD P-States. -User Space Interface in ``sysfs`` -================================== +User Space Interface in ``sysfs`` - Per-policy control +====================================================== ``amd-pstate`` exposes several global attributes (files) in ``sysfs`` to control its functionality at the system level. They are located in the @@ -262,6 +262,25 @@ lowest non-linear performance in `AMD CPPC Performance Capability <perf_cap_>`_.) This attribute is read-only. +``energy_performance_available_preferences`` + +A list of all the supported EPP preferences that can be used for +``energy_performance_preference`` on this system. +These profiles represent different hints that are provided +to the low-level firmware about the user's desired performance vs. energy +efficiency tradeoff. ``default`` means that the EPP value is set by the +platform firmware. This attribute is read-only. + +``energy_performance_preference`` + +The current energy performance preference can be read from this attribute, +and the user can change it according to energy or performance needs. +The list of supported profiles can be read from the +``energy_performance_available_preferences`` attribute. All the profiles map to +integer values between 0 and 255 when the EPP feature is enabled by the platform +firmware; if the EPP feature is disabled, the driver ignores the written value. +This attribute is read-write. + Other performance and frequency values can be read back from ``/sys/devices/system/cpu/cpuX/acpi_cppc/``, see :ref:`cppc_sysfs`. @@ -280,8 +299,30 @@ module which supports the new AMD P-States mechanism on most of the future AMD platforms. The AMD P-States mechanism is the more performance and energy efficiency frequency management method on AMD processors. -Kernel Module Options for ``amd-pstate`` -========================================= + +AMD Pstate Driver Operation Modes +================================= + +``amd_pstate`` CPPC has two operation modes: CPPC autonomous (active) mode and +CPPC non-autonomous (passive) mode. +Active mode and passive mode can be chosen with different kernel parameters. +In autonomous mode, CPPC ignores requests written to the Desired Performance +Target register and takes into account only the values set in the Minimum Requested +Performance, Maximum Requested Performance, and Energy Performance Preference +registers. When autonomous mode is disabled, it considers only the Desired Performance Target.
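To make the two modes concrete, here is a minimal sketch of how a per-CPU CPPC request value could be composed in each mode. It assumes the kernel's ``AMD_CPPC_*`` request-field macros; ``build_cppc_req()`` is a hypothetical helper for illustration, not a function added by this series::

    /* Sketch only: AMD_CPPC_{MIN,MAX,DES}_PERF() come from <asm/msr-index.h>. */
    static u64 build_cppc_req(u32 min_perf, u32 max_perf, u32 des_perf,
                              u32 epp, bool autonomous)
    {
            u64 value = 0;

            value |= AMD_CPPC_MIN_PERF(min_perf);
            value |= AMD_CPPC_MAX_PERF(max_perf);
            /* Autonomous (active) mode writes a zero desired perf so that
             * the CPPC firmware picks the operating point by itself. */
            value |= AMD_CPPC_DES_PERF(autonomous ? 0 : des_perf);
            /* The EPP hint occupies bits 31:24: 0x0 biases toward
             * performance, 0xff toward energy efficiency. */
            if (autonomous)
                    value |= (u64)epp << 24;

            return value;
    }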
+ +Active Mode +------------ + +``amd_pstate=active`` + +This is the low-level firmware control mode, implemented by the ``amd_pstate_epp`` +driver when ``amd_pstate=active`` is passed to the kernel on the command line. +In this mode, the ``amd_pstate_epp`` driver provides a hint to the CPPC firmware +indicating whether software wants to bias toward performance (0x0) or energy +efficiency (0xff). The CPPC power algorithm then calculates the runtime workload +and adjusts the real-time core frequency according to the power supply, thermal +conditions, core voltage and other hardware conditions. Passive Mode ------------ @@ -298,6 +339,35 @@ processor must provide at least nominal performance requested and go higher if current operating conditions allow. +User Space Interface in ``sysfs`` - General +=========================================== + +Global Attributes +----------------- + +``amd-pstate`` exposes several global attributes (files) in ``sysfs`` to +control its functionality at the system level. They are located in the +``/sys/devices/system/cpu/amd-pstate/`` directory and affect all CPUs. + +``status`` + Operation mode of the driver: "active", "passive" or "disable". + + "active" + The driver is functional and in the ``active`` mode. + + "passive" + The driver is functional and in the ``passive`` mode. + + "disable" + The driver is unregistered and currently not functional. + + This attribute can be written to in order to change the driver's + operation mode or to unregister it. The string written to it must be + one of the possible values listed above and, if the write is successful, + the driver will switch over to the operation mode represented by that + string - or be unregistered in the "disable" case. + ``cpupower`` tool support for ``amd-pstate`` =============================================== diff --git a/Documentation/devicetree/bindings/cpufreq/cpufreq-qcom-hw.yaml b/Documentation/devicetree/bindings/cpufreq/cpufreq-qcom-hw.yaml index 99e159bc5fb1..e4aa8c67d532 100644 --- a/Documentation/devicetree/bindings/cpufreq/cpufreq-qcom-hw.yaml +++ b/Documentation/devicetree/bindings/cpufreq/cpufreq-qcom-hw.yaml @@ -26,8 +26,13 @@ properties: items: - enum: - qcom,qdu1000-cpufreq-epss + - qcom,sc7280-cpufreq-epss + - qcom,sc8280xp-cpufreq-epss - qcom,sm6375-cpufreq-epss - qcom,sm8250-cpufreq-epss + - qcom,sm8350-cpufreq-epss + - qcom,sm8450-cpufreq-epss + - qcom,sm8550-cpufreq-epss - const: qcom,cpufreq-epss reg: diff --git a/Documentation/devicetree/bindings/cpufreq/qcom-cpufreq-nvmem.yaml b/Documentation/devicetree/bindings/cpufreq/qcom-cpufreq-nvmem.yaml index 9c086eac6ca7..6f5e7904181f 100644 --- a/Documentation/devicetree/bindings/cpufreq/qcom-cpufreq-nvmem.yaml +++ b/Documentation/devicetree/bindings/cpufreq/qcom-cpufreq-nvmem.yaml @@ -17,6 +17,9 @@ description: | on the CPU OPP in use. The CPUFreq driver sets the CPR power domain level according to the required OPPs defined in the CPU OPP tables. + In the old implementation, efuses are parsed to select the correct OPP table + and voltage; CPR is not supported/used.
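As an illustration of the efuse-based selection described in the paragraph above, here is a rough sketch of how a fused speedbin could be mapped to the ``opp-supported-hw`` bitmap used by the Kryo OPP tables. Assumptions are flagged in the comments: the ``"speed_bin"`` nvmem cell name is made up for the example, and the long-standing ``dev_pm_opp_set_supported_hw()`` helper is used for brevity (recent kernels route this through ``dev_pm_opp_set_config()``)::

    #include <linux/bits.h>
    #include <linux/err.h>
    #include <linux/nvmem-consumer.h>
    #include <linux/pm_opp.h>
    #include <linux/slab.h>

    static int qcom_set_supported_hw(struct device *cpu_dev, bool is_msm8996sg)
    {
            struct opp_table *table;
            struct nvmem_cell *cell;
            u8 *speedbin;
            u32 versions;
            size_t len;

            /* "speed_bin" is an assumed nvmem cell name for this sketch. */
            cell = nvmem_cell_get(cpu_dev, "speed_bin");
            if (IS_ERR(cell))
                    return PTR_ERR(cell);

            speedbin = nvmem_cell_read(cell, &len);
            nvmem_cell_put(cell);
            if (IS_ERR(speedbin))
                    return PTR_ERR(speedbin);

            /* MSM8996 uses bits 0..3; MSM8996SG uses the same speedbin
             * value shifted left by 4 (bits 4..6), per the binding below. */
            versions = is_msm8996sg ? BIT(*speedbin + 4) : BIT(*speedbin);
            kfree(speedbin);

            table = dev_pm_opp_set_supported_hw(cpu_dev, &versions, 1);
            return PTR_ERR_OR_ZERO(table);
    }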
+ select: properties: compatible: @@ -33,37 +36,65 @@ select: required: - compatible -properties: - cpus: - type: object - - patternProperties: - '^cpu@[0-9a-f]+$': - type: object - - properties: - power-domains: - maxItems: 1 - - power-domain-names: - items: - - const: cpr - - required: - - power-domains - - power-domain-names - patternProperties: '^opp-table(-[a-z0-9]+)?$': - if: + allOf: + - if: + properties: + compatible: + const: operating-points-v2-kryo-cpu + then: + $ref: /schemas/opp/opp-v2-kryo-cpu.yaml# + + - if: + properties: + compatible: + const: operating-points-v2-qcom-level + then: + $ref: /schemas/opp/opp-v2-qcom-level.yaml# + + unevaluatedProperties: false + +allOf: + - if: properties: compatible: - const: operating-points-v2-kryo-cpu + contains: + enum: + - qcom,qcs404 + then: + properties: + cpus: + type: object + + patternProperties: + '^cpu@[0-9a-f]+$': + type: object + + properties: + power-domains: + maxItems: 1 + + power-domain-names: + items: + - const: cpr + + required: + - power-domains + - power-domain-names + patternProperties: - '^opp-?[0-9]+$': - required: - - required-opps + '^opp-table(-[a-z0-9]+)?$': + if: + properties: + compatible: + const: operating-points-v2-kryo-cpu + then: + patternProperties: + '^opp-?[0-9]+$': + required: + - required-opps additionalProperties: true diff --git a/Documentation/devicetree/bindings/opp/opp-v2-kryo-cpu.yaml b/Documentation/devicetree/bindings/opp/opp-v2-kryo-cpu.yaml index 60cf3cbde4c5..bbbad31ae4ca 100644 --- a/Documentation/devicetree/bindings/opp/opp-v2-kryo-cpu.yaml +++ b/Documentation/devicetree/bindings/opp/opp-v2-kryo-cpu.yaml @@ -50,12 +50,22 @@ patternProperties: opp-supported-hw: description: | A single 32 bit bitmap value, representing compatible HW. - Bitmap: + Bitmap for MSM8996 format: 0: MSM8996, speedbin 0 1: MSM8996, speedbin 1 2: MSM8996, speedbin 2 - 3-31: unused - maximum: 0x7 + 3: MSM8996, speedbin 3 + 4-31: unused + + Bitmap for MSM8996SG format (speedbin shifted of 4 left): + 0-3: unused + 4: MSM8996SG, speedbin 0 + 5: MSM8996SG, speedbin 1 + 6: MSM8996SG, speedbin 2 + 7-31: unused + enum: [0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, + 0x9, 0xd, 0xe, 0xf, + 0x10, 0x20, 0x30, 0x70] clock-latency-ns: true @@ -106,6 +116,7 @@ examples: L2_0: l2-cache { compatible = "cache"; cache-level = <2>; + cache-unified; }; }; @@ -140,6 +151,7 @@ examples: L2_1: l2-cache { compatible = "cache"; cache-level = <2>; + cache-unified; }; }; diff --git a/Documentation/devicetree/bindings/opp/opp-v2-qcom-level.yaml b/Documentation/devicetree/bindings/opp/opp-v2-qcom-level.yaml index b9ce2e099ce9..a30ef93213c0 100644 --- a/Documentation/devicetree/bindings/opp/opp-v2-qcom-level.yaml +++ b/Documentation/devicetree/bindings/opp/opp-v2-qcom-level.yaml @@ -30,7 +30,9 @@ patternProperties: this OPP node. Sometimes several corners/levels shares a certain fuse corner/level. A fuse corner/level contains e.g. ref uV, min uV, and max uV. - $ref: /schemas/types.yaml#/definitions/uint32 + $ref: /schemas/types.yaml#/definitions/uint32-array + minItems: 1 + maxItems: 2 required: - opp-level diff --git a/Documentation/power/suspend-and-interrupts.rst b/Documentation/power/suspend-and-interrupts.rst index 4cda6617709a..dfbace2f4600 100644 --- a/Documentation/power/suspend-and-interrupts.rst +++ b/Documentation/power/suspend-and-interrupts.rst @@ -67,7 +67,7 @@ That may involve turning on a special signal handling logic within the platform during system sleep so as to trigger a system wakeup when needed. 
For example, the platform may include a dedicated interrupt controller used specifically for handling system wakeup events. Then, if a given interrupt line is supposed to -wake up the system from sleep sates, the corresponding input of that interrupt +wake up the system from sleep states, the corresponding input of that interrupt controller needs to be enabled to receive signals from the line in question. After wakeup, it generally is better to disable that input to prevent the dedicated controller from triggering interrupts unnecessarily. diff --git a/arch/mips/include/asm/mach-loongson32/cpufreq.h b/arch/mips/include/asm/mach-loongson32/cpufreq.h deleted file mode 100644 index e422a32883ae..000000000000 --- a/arch/mips/include/asm/mach-loongson32/cpufreq.h +++ /dev/null @@ -1,18 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Copyright (c) 2014 Zhang, Keguang <keguang.zhang@gmail.com> - * - * Loongson 1 CPUFreq platform support. - */ - -#ifndef __ASM_MACH_LOONGSON32_CPUFREQ_H -#define __ASM_MACH_LOONGSON32_CPUFREQ_H - -struct plat_ls1x_cpufreq { - const char *clk_name; /* CPU clk */ - const char *osc_clk_name; /* OSC clk */ - unsigned int max_freq; /* in kHz */ - unsigned int min_freq; /* in kHz */ -}; - -#endif /* __ASM_MACH_LOONGSON32_CPUFREQ_H */ diff --git a/arch/mips/include/asm/mach-loongson32/platform.h b/arch/mips/include/asm/mach-loongson32/platform.h index eb83e2741887..86e1a6aab4e5 100644 --- a/arch/mips/include/asm/mach-loongson32/platform.h +++ b/arch/mips/include/asm/mach-loongson32/platform.h @@ -12,7 +12,6 @@ #include <nand.h> extern struct platform_device ls1x_uart_pdev; -extern struct platform_device ls1x_cpufreq_pdev; extern struct platform_device ls1x_eth0_pdev; extern struct platform_device ls1x_eth1_pdev; extern struct platform_device ls1x_ehci_pdev; diff --git a/arch/mips/loongson32/common/platform.c b/arch/mips/loongson32/common/platform.c index 311dc1580bbd..64d7979394e6 100644 --- a/arch/mips/loongson32/common/platform.c +++ b/arch/mips/loongson32/common/platform.c @@ -15,7 +15,6 @@ #include <platform.h> #include <loongson1.h> -#include <cpufreq.h> #include <dma.h> #include <nand.h> @@ -62,21 +61,6 @@ void __init ls1x_serial_set_uartclk(struct platform_device *pdev) p->uartclk = clk_get_rate(clk); } -/* CPUFreq */ -static struct plat_ls1x_cpufreq ls1x_cpufreq_pdata = { - .clk_name = "cpu_clk", - .osc_clk_name = "osc_clk", - .max_freq = 266 * 1000, - .min_freq = 33 * 1000, -}; - -struct platform_device ls1x_cpufreq_pdev = { - .name = "ls1x-cpufreq", - .dev = { - .platform_data = &ls1x_cpufreq_pdata, - }, -}; - /* Synopsys Ethernet GMAC */ static struct stmmac_mdio_bus_data ls1x_mdio_bus_data = { .phy_mask = 0, diff --git a/arch/mips/loongson32/ls1b/board.c b/arch/mips/loongson32/ls1b/board.c index 727e06718dab..fed8d432ef20 100644 --- a/arch/mips/loongson32/ls1b/board.c +++ b/arch/mips/loongson32/ls1b/board.c @@ -35,7 +35,6 @@ static const struct gpio_led_platform_data ls1x_led_pdata __initconst = { static struct platform_device *ls1b_platform_devices[] __initdata = { &ls1x_uart_pdev, - &ls1x_cpufreq_pdev, &ls1x_eth0_pdev, &ls1x_eth1_pdev, &ls1x_ehci_pdev, diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index e57cd31bfec4..b650cde3f64d 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -739,6 +739,7 @@ void __cpuidle arch_cpu_idle(void) { static_call(x86_idle)(); } +EXPORT_SYMBOL_GPL(arch_cpu_idle); #ifdef CONFIG_XEN bool xen_set_default_idle(void) diff --git a/drivers/acpi/cppc_acpi.c 
b/drivers/acpi/cppc_acpi.c index 0f17b1c32718..02d83c807271 100644 --- a/drivers/acpi/cppc_acpi.c +++ b/drivers/acpi/cppc_acpi.c @@ -1154,6 +1154,19 @@ int cppc_get_nominal_perf(int cpunum, u64 *nominal_perf) } /** + * cppc_get_epp_perf - Get the epp register value. + * @cpunum: CPU from which to get epp preference value. + * @epp_perf: Return address. + * + * Return: 0 for success, -EIO otherwise. + */ +int cppc_get_epp_perf(int cpunum, u64 *epp_perf) +{ + return cppc_get_perf(cpunum, ENERGY_PERF, epp_perf); +} +EXPORT_SYMBOL_GPL(cppc_get_epp_perf); + +/** * cppc_get_perf_caps - Get a CPU's performance capabilities. * @cpunum: CPU from which to get capabilities info. * @perf_caps: ptr to cppc_perf_caps. See cppc_acpi.h @@ -1365,6 +1378,60 @@ out_err: } EXPORT_SYMBOL_GPL(cppc_get_perf_ctrs); +/* + * Set Energy Performance Preference Register value through + * Performance Controls Interface + */ +int cppc_set_epp_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls, bool enable) +{ + int pcc_ss_id = per_cpu(cpu_pcc_subspace_idx, cpu); + struct cpc_register_resource *epp_set_reg; + struct cpc_register_resource *auto_sel_reg; + struct cpc_desc *cpc_desc = per_cpu(cpc_desc_ptr, cpu); + struct cppc_pcc_data *pcc_ss_data = NULL; + int ret; + + if (!cpc_desc) { + pr_debug("No CPC descriptor for CPU:%d\n", cpu); + return -ENODEV; + } + + auto_sel_reg = &cpc_desc->cpc_regs[AUTO_SEL_ENABLE]; + epp_set_reg = &cpc_desc->cpc_regs[ENERGY_PERF]; + + if (CPC_IN_PCC(epp_set_reg) || CPC_IN_PCC(auto_sel_reg)) { + if (pcc_ss_id < 0) { + pr_debug("Invalid pcc_ss_id for CPU:%d\n", cpu); + return -ENODEV; + } + + if (CPC_SUPPORTED(auto_sel_reg)) { + ret = cpc_write(cpu, auto_sel_reg, enable); + if (ret) + return ret; + } + + if (CPC_SUPPORTED(epp_set_reg)) { + ret = cpc_write(cpu, epp_set_reg, perf_ctrls->energy_perf); + if (ret) + return ret; + } + + pcc_ss_data = pcc_data[pcc_ss_id]; + + down_write(&pcc_ss_data->pcc_lock); + /* after writing CPC, transfer the ownership of PCC to platform */ + ret = send_pcc_cmd(pcc_ss_id, CMD_WRITE); + up_write(&pcc_ss_data->pcc_lock); + } else { + ret = -ENOTSUPP; + pr_debug("_CPC in PCC is not supported\n"); + } + + return ret; +} +EXPORT_SYMBOL_GPL(cppc_set_epp_perf); + /** * cppc_set_enable - Set to enable CPPC on the processor by writing the * Continuous Performance Control package EnableRegister field. diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c index 967bcf9d415e..6097644ebdc5 100644 --- a/drivers/base/power/domain.c +++ b/drivers/base/power/domain.c @@ -220,13 +220,10 @@ static void genpd_debug_add(struct generic_pm_domain *genpd); static void genpd_debug_remove(struct generic_pm_domain *genpd) { - struct dentry *d; - if (!genpd_debugfs_dir) return; - d = debugfs_lookup(genpd->name, genpd_debugfs_dir); - debugfs_remove(d); + debugfs_lookup_and_remove(genpd->name, genpd_debugfs_dir); } static void genpd_update_accounting(struct generic_pm_domain *genpd) diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c index 98f7b3d7d669..4545669cb973 100644 --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c @@ -1864,6 +1864,10 @@ static bool pm_runtime_need_not_resume(struct device *dev) * sure the device is put into low power state and it should only be used during * system-wide PM transitions to sleep states. It assumes that the analogous * pm_runtime_force_resume() will be used to resume the device. 
+ * + * Do not use with DPM_FLAG_SMART_SUSPEND as this can lead to an inconsistent + * state where this function has called the ->runtime_suspend callback but the + * PM core marks the driver as runtime active. */ int pm_runtime_force_suspend(struct device *dev) { diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig index 8466f78651fc..2c839bd2b051 100644 --- a/drivers/cpufreq/Kconfig +++ b/drivers/cpufreq/Kconfig @@ -3,7 +3,6 @@ menu "CPU Frequency scaling" config CPU_FREQ bool "CPU Frequency scaling" - select SRCU help CPU Frequency scaling allows you to change the clock speed of CPUs on the fly. This is a nice method to save power, because @@ -271,15 +270,6 @@ config LOONGSON2_CPUFREQ Loongson2F and its successors support this feature. If in doubt, say N. - -config LOONGSON1_CPUFREQ - tristate "Loongson1 CPUFreq Driver" - depends on LOONGSON1_LS1B - help - This option adds a CPUFreq driver for loongson1 processors which - support software configurable cpu frequency. - - If in doubt, say N. endif if SPARC64 diff --git a/drivers/cpufreq/Makefile b/drivers/cpufreq/Makefile index a19842fbd521..ef8510774913 100644 --- a/drivers/cpufreq/Makefile +++ b/drivers/cpufreq/Makefile @@ -104,7 +104,6 @@ obj-$(CONFIG_POWERNV_CPUFREQ) += powernv-cpufreq.o obj-$(CONFIG_BMIPS_CPUFREQ) += bmips-cpufreq.o obj-$(CONFIG_IA64_ACPI_CPUFREQ) += ia64-acpi-cpufreq.o obj-$(CONFIG_LOONGSON2_CPUFREQ) += loongson2_cpufreq.o -obj-$(CONFIG_LOONGSON1_CPUFREQ) += loongson1-cpufreq.o obj-$(CONFIG_SH_CPU_FREQ) += sh-cpufreq.o obj-$(CONFIG_SPARC_US2E_CPUFREQ) += sparc-us2e-cpufreq.o obj-$(CONFIG_SPARC_US3_CPUFREQ) += sparc-us3-cpufreq.o diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index c17bd845f5fc..45c88894fd8e 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -59,8 +59,171 @@ * we disable it by default to go acpi-cpufreq on these processors and add a * module parameter to be able to enable it manually for debugging. */ +static struct cpufreq_driver *current_pstate_driver; static struct cpufreq_driver amd_pstate_driver; -static int cppc_load __initdata; +static struct cpufreq_driver amd_pstate_epp_driver; +static int cppc_state = AMD_PSTATE_DISABLE; +struct kobject *amd_pstate_kobj; + +/* + * AMD Energy Preference Performance (EPP) + * The EPP is used in the CCLK DPM controller to drive + * the frequency that a core is going to operate during + * short periods of activity. 
EPP values will be utilized for + * different OS profiles (balanced, performance, power savings) + * display strings corresponding to EPP index in the + * energy_perf_strings[] + * index String + *------------------------------------- + * 0 default + * 1 performance + * 2 balance_performance + * 3 balance_power + * 4 power + */ +enum energy_perf_value_index { + EPP_INDEX_DEFAULT = 0, + EPP_INDEX_PERFORMANCE, + EPP_INDEX_BALANCE_PERFORMANCE, + EPP_INDEX_BALANCE_POWERSAVE, + EPP_INDEX_POWERSAVE, +}; + +static const char * const energy_perf_strings[] = { + [EPP_INDEX_DEFAULT] = "default", + [EPP_INDEX_PERFORMANCE] = "performance", + [EPP_INDEX_BALANCE_PERFORMANCE] = "balance_performance", + [EPP_INDEX_BALANCE_POWERSAVE] = "balance_power", + [EPP_INDEX_POWERSAVE] = "power", + NULL +}; + +static unsigned int epp_values[] = { + [EPP_INDEX_DEFAULT] = 0, + [EPP_INDEX_PERFORMANCE] = AMD_CPPC_EPP_PERFORMANCE, + [EPP_INDEX_BALANCE_PERFORMANCE] = AMD_CPPC_EPP_BALANCE_PERFORMANCE, + [EPP_INDEX_BALANCE_POWERSAVE] = AMD_CPPC_EPP_BALANCE_POWERSAVE, + [EPP_INDEX_POWERSAVE] = AMD_CPPC_EPP_POWERSAVE, + }; + +static inline int get_mode_idx_from_str(const char *str, size_t size) +{ + int i; + + for (i=0; i < AMD_PSTATE_MAX; i++) { + if (!strncmp(str, amd_pstate_mode_string[i], size)) + return i; + } + return -EINVAL; +} + +static DEFINE_MUTEX(amd_pstate_limits_lock); +static DEFINE_MUTEX(amd_pstate_driver_lock); + +static s16 amd_pstate_get_epp(struct amd_cpudata *cpudata, u64 cppc_req_cached) +{ + u64 epp; + int ret; + + if (boot_cpu_has(X86_FEATURE_CPPC)) { + if (!cppc_req_cached) { + epp = rdmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, + &cppc_req_cached); + if (epp) + return epp; + } + epp = (cppc_req_cached >> 24) & 0xFF; + } else { + ret = cppc_get_epp_perf(cpudata->cpu, &epp); + if (ret < 0) { + pr_debug("Could not retrieve energy perf value (%d)\n", ret); + return -EIO; + } + } + + return (s16)(epp & 0xff); +} + +static int amd_pstate_get_energy_pref_index(struct amd_cpudata *cpudata) +{ + s16 epp; + int index = -EINVAL; + + epp = amd_pstate_get_epp(cpudata, 0); + if (epp < 0) + return epp; + + switch (epp) { + case AMD_CPPC_EPP_PERFORMANCE: + index = EPP_INDEX_PERFORMANCE; + break; + case AMD_CPPC_EPP_BALANCE_PERFORMANCE: + index = EPP_INDEX_BALANCE_PERFORMANCE; + break; + case AMD_CPPC_EPP_BALANCE_POWERSAVE: + index = EPP_INDEX_BALANCE_POWERSAVE; + break; + case AMD_CPPC_EPP_POWERSAVE: + index = EPP_INDEX_POWERSAVE; + break; + default: + break; + } + + return index; +} + +static int amd_pstate_set_epp(struct amd_cpudata *cpudata, u32 epp) +{ + int ret; + struct cppc_perf_ctrls perf_ctrls; + + if (boot_cpu_has(X86_FEATURE_CPPC)) { + u64 value = READ_ONCE(cpudata->cppc_req_cached); + + value &= ~GENMASK_ULL(31, 24); + value |= (u64)epp << 24; + WRITE_ONCE(cpudata->cppc_req_cached, value); + + ret = wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value); + if (!ret) + cpudata->epp_cached = epp; + } else { + perf_ctrls.energy_perf = epp; + ret = cppc_set_epp_perf(cpudata->cpu, &perf_ctrls, 1); + if (ret) { + pr_debug("failed to set energy perf value (%d)\n", ret); + return ret; + } + cpudata->epp_cached = epp; + } + + return ret; +} + +static int amd_pstate_set_energy_pref_index(struct amd_cpudata *cpudata, + int pref_index) +{ + int epp = -EINVAL; + int ret; + + if (!pref_index) { + pr_debug("EPP pref_index is invalid\n"); + return -EINVAL; + } + + if (epp == -EINVAL) + epp = epp_values[pref_index]; + + if (epp > 0 && cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) { + pr_debug("EPP cannot be set under 
performance policy\n"); + return -EBUSY; + } + + ret = amd_pstate_set_epp(cpudata, epp); + + return ret; +} static inline int pstate_enable(bool enable) { @@ -70,11 +233,21 @@ static inline int pstate_enable(bool enable) static int cppc_enable(bool enable) { int cpu, ret = 0; + struct cppc_perf_ctrls perf_ctrls; for_each_present_cpu(cpu) { ret = cppc_set_enable(cpu, enable); if (ret) return ret; + + /* Enable autonomous mode for EPP */ + if (cppc_state == AMD_PSTATE_ACTIVE) { + /* Set desired perf as zero to allow EPP firmware control */ + perf_ctrls.desired_perf = 0; + ret = cppc_set_perf(cpu, &perf_ctrls); + if (ret) + return ret; + } } return ret; @@ -418,7 +591,7 @@ static void amd_pstate_boost_init(struct amd_cpudata *cpudata) return; cpudata->boost_supported = true; - amd_pstate_driver.boost_enabled = true; + current_pstate_driver->boost_enabled = true; } static void amd_perf_ctl_reset(unsigned int cpu) @@ -501,6 +674,8 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy) policy->driver_data = cpudata; amd_pstate_boost_init(cpudata); + if (!current_pstate_driver->adjust_perf) + current_pstate_driver->adjust_perf = amd_pstate_adjust_perf; return 0; @@ -561,7 +736,7 @@ static ssize_t show_amd_pstate_max_freq(struct cpufreq_policy *policy, if (max_freq < 0) return max_freq; - return sprintf(&buf[0], "%u\n", max_freq); + return sysfs_emit(buf, "%u\n", max_freq); } static ssize_t show_amd_pstate_lowest_nonlinear_freq(struct cpufreq_policy *policy, @@ -574,7 +749,7 @@ static ssize_t show_amd_pstate_lowest_nonlinear_freq(struct cpufreq_policy *poli if (freq < 0) return freq; - return sprintf(&buf[0], "%u\n", freq); + return sysfs_emit(buf, "%u\n", freq); } /* @@ -589,13 +764,151 @@ static ssize_t show_amd_pstate_highest_perf(struct cpufreq_policy *policy, perf = READ_ONCE(cpudata->highest_perf); - return sprintf(&buf[0], "%u\n", perf); + return sysfs_emit(buf, "%u\n", perf); +} + +static ssize_t show_energy_performance_available_preferences( + struct cpufreq_policy *policy, char *buf) +{ + int i = 0; + int offset = 0; + + while (energy_perf_strings[i] != NULL) + offset += sysfs_emit_at(buf, offset, "%s ", energy_perf_strings[i++]); + + sysfs_emit_at(buf, offset, "\n"); + + return offset; +} + +static ssize_t store_energy_performance_preference( + struct cpufreq_policy *policy, const char *buf, size_t count) +{ + struct amd_cpudata *cpudata = policy->driver_data; + char str_preference[21]; + ssize_t ret; + + ret = sscanf(buf, "%20s", str_preference); + if (ret != 1) + return -EINVAL; + + ret = match_string(energy_perf_strings, -1, str_preference); + if (ret < 0) + return -EINVAL; + + mutex_lock(&amd_pstate_limits_lock); + ret = amd_pstate_set_energy_pref_index(cpudata, ret); + mutex_unlock(&amd_pstate_limits_lock); + + return ret ?: count; +} + +static ssize_t show_energy_performance_preference( + struct cpufreq_policy *policy, char *buf) +{ + struct amd_cpudata *cpudata = policy->driver_data; + int preference; + + preference = amd_pstate_get_energy_pref_index(cpudata); + if (preference < 0) + return preference; + + return sysfs_emit(buf, "%s\n", energy_perf_strings[preference]); +} + +static ssize_t amd_pstate_show_status(char *buf) +{ + if (!current_pstate_driver) + return sysfs_emit(buf, "disable\n"); + + return sysfs_emit(buf, "%s\n", amd_pstate_mode_string[cppc_state]); +} + +static void amd_pstate_driver_cleanup(void) +{ + current_pstate_driver = NULL; +} + +static int amd_pstate_update_status(const char *buf, size_t size) +{ + int ret = 0; + int mode_idx; + + if (size > 7 
|| size < 6) + return -EINVAL; + mode_idx = get_mode_idx_from_str(buf, size); + + switch(mode_idx) { + case AMD_PSTATE_DISABLE: + if (!current_pstate_driver) + return -EINVAL; + if (cppc_state == AMD_PSTATE_ACTIVE) + return -EBUSY; + cpufreq_unregister_driver(current_pstate_driver); + amd_pstate_driver_cleanup(); + break; + case AMD_PSTATE_PASSIVE: + if (current_pstate_driver) { + if (current_pstate_driver == &amd_pstate_driver) + return 0; + cpufreq_unregister_driver(current_pstate_driver); + cppc_state = AMD_PSTATE_PASSIVE; + current_pstate_driver = &amd_pstate_driver; + } + + ret = cpufreq_register_driver(current_pstate_driver); + break; + case AMD_PSTATE_ACTIVE: + if (current_pstate_driver) { + if (current_pstate_driver == &amd_pstate_epp_driver) + return 0; + cpufreq_unregister_driver(current_pstate_driver); + current_pstate_driver = &amd_pstate_epp_driver; + cppc_state = AMD_PSTATE_ACTIVE; + } + + ret = cpufreq_register_driver(current_pstate_driver); + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} + +static ssize_t show_status(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + ssize_t ret; + + mutex_lock(&amd_pstate_driver_lock); + ret = amd_pstate_show_status(buf); + mutex_unlock(&amd_pstate_driver_lock); + + return ret; +} + +static ssize_t store_status(struct kobject *a, struct kobj_attribute *b, + const char *buf, size_t count) +{ + char *p = memchr(buf, '\n', count); + int ret; + + mutex_lock(&amd_pstate_driver_lock); + ret = amd_pstate_update_status(buf, p ? p - buf : count); + mutex_unlock(&amd_pstate_driver_lock); + + return ret < 0 ? ret : count; } cpufreq_freq_attr_ro(amd_pstate_max_freq); cpufreq_freq_attr_ro(amd_pstate_lowest_nonlinear_freq); cpufreq_freq_attr_ro(amd_pstate_highest_perf); +cpufreq_freq_attr_rw(energy_performance_preference); +cpufreq_freq_attr_ro(energy_performance_available_preferences); +define_one_global_rw(status); static struct freq_attr *amd_pstate_attr[] = { &amd_pstate_max_freq, @@ -604,6 +917,313 @@ static struct freq_attr *amd_pstate_attr[] = { NULL, }; +static struct freq_attr *amd_pstate_epp_attr[] = { + &amd_pstate_max_freq, + &amd_pstate_lowest_nonlinear_freq, + &amd_pstate_highest_perf, + &energy_performance_preference, + &energy_performance_available_preferences, + NULL, +}; + +static struct attribute *pstate_global_attributes[] = { + &status.attr, + NULL +}; + +static const struct attribute_group amd_pstate_global_attr_group = { + .attrs = pstate_global_attributes, +}; + +static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) +{ + int min_freq, max_freq, nominal_freq, lowest_nonlinear_freq, ret; + struct amd_cpudata *cpudata; + struct device *dev; + u64 value; + + /* + * Resetting PERF_CTL_MSR will put the CPU in P0 frequency, + * which is ideal for initialization process. 
+ */ + amd_perf_ctl_reset(policy->cpu); + dev = get_cpu_device(policy->cpu); + if (!dev) + return -ENODEV; + + cpudata = kzalloc(sizeof(*cpudata), GFP_KERNEL); + if (!cpudata) + return -ENOMEM; + + cpudata->cpu = policy->cpu; + cpudata->epp_policy = 0; + + ret = amd_pstate_init_perf(cpudata); + if (ret) + goto free_cpudata1; + + min_freq = amd_get_min_freq(cpudata); + max_freq = amd_get_max_freq(cpudata); + nominal_freq = amd_get_nominal_freq(cpudata); + lowest_nonlinear_freq = amd_get_lowest_nonlinear_freq(cpudata); + if (min_freq < 0 || max_freq < 0 || min_freq > max_freq) { + dev_err(dev, "min_freq(%d) or max_freq(%d) value is incorrect\n", + min_freq, max_freq); + ret = -EINVAL; + goto free_cpudata1; + } + + policy->cpuinfo.min_freq = min_freq; + policy->cpuinfo.max_freq = max_freq; + /* It will be updated by governor */ + policy->cur = policy->cpuinfo.min_freq; + + /* Initial processor data capability frequencies */ + cpudata->max_freq = max_freq; + cpudata->min_freq = min_freq; + cpudata->nominal_freq = nominal_freq; + cpudata->lowest_nonlinear_freq = lowest_nonlinear_freq; + + policy->driver_data = cpudata; + + cpudata->epp_cached = amd_pstate_get_epp(cpudata, 0); + + policy->min = policy->cpuinfo.min_freq; + policy->max = policy->cpuinfo.max_freq; + + /* + * Set the policy to powersave to provide a valid fallback value in case + * the default cpufreq governor is neither powersave nor performance. + */ + policy->policy = CPUFREQ_POLICY_POWERSAVE; + + if (boot_cpu_has(X86_FEATURE_CPPC)) { + policy->fast_switch_possible = true; + ret = rdmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, &value); + if (ret) + return ret; + WRITE_ONCE(cpudata->cppc_req_cached, value); + + ret = rdmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_CAP1, &value); + if (ret) + return ret; + WRITE_ONCE(cpudata->cppc_cap1_cached, value); + } + amd_pstate_boost_init(cpudata); + + return 0; + +free_cpudata1: + kfree(cpudata); + return ret; +} + +static int amd_pstate_epp_cpu_exit(struct cpufreq_policy *policy) +{ + pr_debug("CPU %d exiting\n", policy->cpu); + policy->fast_switch_possible = false; + return 0; +} + +static void amd_pstate_epp_init(unsigned int cpu) +{ + struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); + struct amd_cpudata *cpudata = policy->driver_data; + u32 max_perf, min_perf; + u64 value; + s16 epp; + + max_perf = READ_ONCE(cpudata->highest_perf); + min_perf = READ_ONCE(cpudata->lowest_perf); + + value = READ_ONCE(cpudata->cppc_req_cached); + + if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) + min_perf = max_perf; + + /* Initial min/max values for CPPC Performance Controls Register */ + value &= ~AMD_CPPC_MIN_PERF(~0L); + value |= AMD_CPPC_MIN_PERF(min_perf); + + value &= ~AMD_CPPC_MAX_PERF(~0L); + value |= AMD_CPPC_MAX_PERF(max_perf); + + /* CPPC EPP feature require to set zero to the desire perf bit */ + value &= ~AMD_CPPC_DES_PERF(~0L); + value |= AMD_CPPC_DES_PERF(0); + + if (cpudata->epp_policy == cpudata->policy) + goto skip_epp; + + cpudata->epp_policy = cpudata->policy; + + /* Get BIOS pre-defined epp value */ + epp = amd_pstate_get_epp(cpudata, value); + if (epp < 0) { + /** + * This return value can only be negative for shared_memory + * systems where EPP register read/write not supported. 
+ */ + goto skip_epp; + } + + if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) + epp = 0; + + /* Set initial EPP value */ + if (boot_cpu_has(X86_FEATURE_CPPC)) { + value &= ~GENMASK_ULL(31, 24); + value |= (u64)epp << 24; + } + + WRITE_ONCE(cpudata->cppc_req_cached, value); + amd_pstate_set_epp(cpudata, epp); +skip_epp: + cpufreq_cpu_put(policy); +} + +static int amd_pstate_epp_set_policy(struct cpufreq_policy *policy) +{ + struct amd_cpudata *cpudata = policy->driver_data; + + if (!policy->cpuinfo.max_freq) + return -ENODEV; + + pr_debug("set_policy: cpuinfo.max %u policy->max %u\n", + policy->cpuinfo.max_freq, policy->max); + + cpudata->policy = policy->policy; + + amd_pstate_epp_init(policy->cpu); + + return 0; +} + +static void amd_pstate_epp_reenable(struct amd_cpudata *cpudata) +{ + struct cppc_perf_ctrls perf_ctrls; + u64 value, max_perf; + int ret; + + ret = amd_pstate_enable(true); + if (ret) + pr_err("failed to enable amd pstate during resume, return %d\n", ret); + + value = READ_ONCE(cpudata->cppc_req_cached); + max_perf = READ_ONCE(cpudata->highest_perf); + + if (boot_cpu_has(X86_FEATURE_CPPC)) { + wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value); + } else { + perf_ctrls.max_perf = max_perf; + perf_ctrls.energy_perf = AMD_CPPC_ENERGY_PERF_PREF(cpudata->epp_cached); + cppc_set_perf(cpudata->cpu, &perf_ctrls); + } +} + +static int amd_pstate_epp_cpu_online(struct cpufreq_policy *policy) +{ + struct amd_cpudata *cpudata = policy->driver_data; + + pr_debug("AMD CPU Core %d going online\n", cpudata->cpu); + + if (cppc_state == AMD_PSTATE_ACTIVE) { + amd_pstate_epp_reenable(cpudata); + cpudata->suspended = false; + } + + return 0; +} + +static void amd_pstate_epp_offline(struct cpufreq_policy *policy) +{ + struct amd_cpudata *cpudata = policy->driver_data; + struct cppc_perf_ctrls perf_ctrls; + int min_perf; + u64 value; + + min_perf = READ_ONCE(cpudata->lowest_perf); + value = READ_ONCE(cpudata->cppc_req_cached); + + mutex_lock(&amd_pstate_limits_lock); + if (boot_cpu_has(X86_FEATURE_CPPC)) { + cpudata->epp_policy = CPUFREQ_POLICY_UNKNOWN; + + /* Set max perf same as min perf */ + value &= ~AMD_CPPC_MAX_PERF(~0L); + value |= AMD_CPPC_MAX_PERF(min_perf); + value &= ~AMD_CPPC_MIN_PERF(~0L); + value |= AMD_CPPC_MIN_PERF(min_perf); + wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value); + } else { + perf_ctrls.desired_perf = 0; + perf_ctrls.max_perf = min_perf; + perf_ctrls.energy_perf = AMD_CPPC_ENERGY_PERF_PREF(HWP_EPP_BALANCE_POWERSAVE); + cppc_set_perf(cpudata->cpu, &perf_ctrls); + } + mutex_unlock(&amd_pstate_limits_lock); +} + +static int amd_pstate_epp_cpu_offline(struct cpufreq_policy *policy) +{ + struct amd_cpudata *cpudata = policy->driver_data; + + pr_debug("AMD CPU Core %d going offline\n", cpudata->cpu); + + if (cpudata->suspended) + return 0; + + if (cppc_state == AMD_PSTATE_ACTIVE) + amd_pstate_epp_offline(policy); + + return 0; +} + +static int amd_pstate_epp_verify_policy(struct cpufreq_policy_data *policy) +{ + cpufreq_verify_within_cpu_limits(policy); + pr_debug("policy_max =%d, policy_min=%d\n", policy->max, policy->min); + return 0; +} + +static int amd_pstate_epp_suspend(struct cpufreq_policy *policy) +{ + struct amd_cpudata *cpudata = policy->driver_data; + int ret; + + /* avoid suspending when EPP is not enabled */ + if (cppc_state != AMD_PSTATE_ACTIVE) + return 0; + + /* set this flag to avoid setting core offline*/ + cpudata->suspended = true; + + /* disable CPPC in lowlevel firmware */ + ret = amd_pstate_enable(false); + if (ret) + pr_err("failed to 
suspend, return %d\n", ret); + + return 0; +} + +static int amd_pstate_epp_resume(struct cpufreq_policy *policy) +{ + struct amd_cpudata *cpudata = policy->driver_data; + + if (cpudata->suspended) { + mutex_lock(&amd_pstate_limits_lock); + + /* enable amd pstate from suspend state*/ + amd_pstate_epp_reenable(cpudata); + + mutex_unlock(&amd_pstate_limits_lock); + + cpudata->suspended = false; + } + + return 0; +} + static struct cpufreq_driver amd_pstate_driver = { .flags = CPUFREQ_CONST_LOOPS | CPUFREQ_NEED_UPDATE_LIMITS, .verify = amd_pstate_verify, @@ -617,6 +1237,20 @@ static struct cpufreq_driver amd_pstate_driver = { .attr = amd_pstate_attr, }; +static struct cpufreq_driver amd_pstate_epp_driver = { + .flags = CPUFREQ_CONST_LOOPS, + .verify = amd_pstate_epp_verify_policy, + .setpolicy = amd_pstate_epp_set_policy, + .init = amd_pstate_epp_cpu_init, + .exit = amd_pstate_epp_cpu_exit, + .offline = amd_pstate_epp_cpu_offline, + .online = amd_pstate_epp_cpu_online, + .suspend = amd_pstate_epp_suspend, + .resume = amd_pstate_epp_resume, + .name = "amd_pstate_epp", + .attr = amd_pstate_epp_attr, +}; + static int __init amd_pstate_init(void) { int ret; @@ -626,10 +1260,10 @@ static int __init amd_pstate_init(void) /* * by default the pstate driver is disabled to load * enable the amd_pstate passive mode driver explicitly - * with amd_pstate=passive in kernel command line + * with amd_pstate=passive or other modes in kernel command line */ - if (!cppc_load) { - pr_debug("driver load is disabled, boot with amd_pstate=passive to enable this\n"); + if (cppc_state == AMD_PSTATE_DISABLE) { + pr_debug("driver load is disabled, boot with specific mode to enable this\n"); return -ENODEV; } @@ -645,7 +1279,8 @@ static int __init amd_pstate_init(void) /* capability check */ if (boot_cpu_has(X86_FEATURE_CPPC)) { pr_debug("AMD CPPC MSR based functionality is supported\n"); - amd_pstate_driver.adjust_perf = amd_pstate_adjust_perf; + if (cppc_state == AMD_PSTATE_PASSIVE) + current_pstate_driver->adjust_perf = amd_pstate_adjust_perf; } else { pr_debug("AMD CPPC shared memory based functionality is supported\n"); static_call_update(amd_pstate_enable, cppc_enable); @@ -656,31 +1291,63 @@ static int __init amd_pstate_init(void) /* enable amd pstate feature */ ret = amd_pstate_enable(true); if (ret) { - pr_err("failed to enable amd-pstate with return %d\n", ret); + pr_err("failed to enable with return %d\n", ret); return ret; } - ret = cpufreq_register_driver(&amd_pstate_driver); + ret = cpufreq_register_driver(current_pstate_driver); if (ret) - pr_err("failed to register amd_pstate_driver with return %d\n", - ret); + pr_err("failed to register with return %d\n", ret); + + amd_pstate_kobj = kobject_create_and_add("amd_pstate", &cpu_subsys.dev_root->kobj); + if (!amd_pstate_kobj) { + ret = -EINVAL; + pr_err("global sysfs registration failed.\n"); + goto kobject_free; + } + + ret = sysfs_create_group(amd_pstate_kobj, &amd_pstate_global_attr_group); + if (ret) { + pr_err("sysfs attribute export failed with error %d.\n", ret); + goto global_attr_free; + } return ret; + +global_attr_free: + kobject_put(amd_pstate_kobj); +kobject_free: + cpufreq_unregister_driver(current_pstate_driver); + return ret; } device_initcall(amd_pstate_init); static int __init amd_pstate_param(char *str) { + size_t size; + int mode_idx; + if (!str) return -EINVAL; - if (!strcmp(str, "disable")) { - cppc_load = 0; - pr_info("driver is explicitly disabled\n"); - } else if (!strcmp(str, "passive")) - cppc_load = 1; + size = strlen(str); + 
mode_idx = get_mode_idx_from_str(str, size); - return 0; + if (mode_idx >= AMD_PSTATE_DISABLE && mode_idx < AMD_PSTATE_MAX) { + cppc_state = mode_idx; + if (cppc_state == AMD_PSTATE_DISABLE) + pr_info("driver is explicitly disabled\n"); + + if (cppc_state == AMD_PSTATE_ACTIVE) + current_pstate_driver = &amd_pstate_epp_driver; + + if (cppc_state == AMD_PSTATE_PASSIVE) + current_pstate_driver = &amd_pstate_driver; + + return 0; + } + + return -EINVAL; } early_param("amd_pstate", amd_pstate_param); diff --git a/drivers/cpufreq/brcmstb-avs-cpufreq.c b/drivers/cpufreq/brcmstb-avs-cpufreq.c index 4153150e20db..ffea6402189d 100644 --- a/drivers/cpufreq/brcmstb-avs-cpufreq.c +++ b/drivers/cpufreq/brcmstb-avs-cpufreq.c @@ -751,10 +751,7 @@ static int brcm_avs_cpufreq_probe(struct platform_device *pdev) static int brcm_avs_cpufreq_remove(struct platform_device *pdev) { - int ret; - - ret = cpufreq_unregister_driver(&brcm_avs_driver); - WARN_ON(ret); + cpufreq_unregister_driver(&brcm_avs_driver); brcm_avs_prepare_uninit(pdev); diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 7e56a42750ea..6d8fd3b8dcb5 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -993,7 +993,7 @@ static const struct sysfs_ops sysfs_ops = { .store = store, }; -static struct kobj_type ktype_cpufreq = { +static const struct kobj_type ktype_cpufreq = { .sysfs_ops = &sysfs_ops, .default_groups = cpufreq_groups, .release = cpufreq_sysfs_release, @@ -2904,12 +2904,12 @@ EXPORT_SYMBOL_GPL(cpufreq_register_driver); * Returns zero if successful, and -EINVAL if the cpufreq_driver is * currently not initialised. */ -int cpufreq_unregister_driver(struct cpufreq_driver *driver) +void cpufreq_unregister_driver(struct cpufreq_driver *driver) { unsigned long flags; - if (!cpufreq_driver || (driver != cpufreq_driver)) - return -EINVAL; + if (WARN_ON(!cpufreq_driver || (driver != cpufreq_driver))) + return; pr_debug("unregistering driver %s\n", driver->name); @@ -2926,8 +2926,6 @@ int cpufreq_unregister_driver(struct cpufreq_driver *driver) write_unlock_irqrestore(&cpufreq_driver_lock, flags); cpus_read_unlock(); - - return 0; } EXPORT_SYMBOL_GPL(cpufreq_unregister_driver); diff --git a/drivers/cpufreq/davinci-cpufreq.c b/drivers/cpufreq/davinci-cpufreq.c index 9e97f60f8199..ebb3a8102681 100644 --- a/drivers/cpufreq/davinci-cpufreq.c +++ b/drivers/cpufreq/davinci-cpufreq.c @@ -133,12 +133,14 @@ static int __init davinci_cpufreq_probe(struct platform_device *pdev) static int __exit davinci_cpufreq_remove(struct platform_device *pdev) { + cpufreq_unregister_driver(&davinci_driver); + clk_put(cpufreq.armclk); if (cpufreq.asyncclk) clk_put(cpufreq.asyncclk); - return cpufreq_unregister_driver(&davinci_driver); + return 0; } static struct platform_driver davinci_cpufreq_driver = { diff --git a/drivers/cpufreq/loongson1-cpufreq.c b/drivers/cpufreq/loongson1-cpufreq.c deleted file mode 100644 index fb72d709db56..000000000000 --- a/drivers/cpufreq/loongson1-cpufreq.c +++ /dev/null @@ -1,222 +0,0 @@ -/* - * CPU Frequency Scaling for Loongson 1 SoC - * - * Copyright (C) 2014-2016 Zhang, Keguang <keguang.zhang@gmail.com> - * - * This file is licensed under the terms of the GNU General Public - * License version 2. This program is licensed "as is" without any - * warranty of any kind, whether express or implied. 
- */ - -#include <linux/clk.h> -#include <linux/clk-provider.h> -#include <linux/cpu.h> -#include <linux/cpufreq.h> -#include <linux/delay.h> -#include <linux/io.h> -#include <linux/module.h> -#include <linux/platform_device.h> -#include <linux/slab.h> - -#include <cpufreq.h> -#include <loongson1.h> - -struct ls1x_cpufreq { - struct device *dev; - struct clk *clk; /* CPU clk */ - struct clk *mux_clk; /* MUX of CPU clk */ - struct clk *pll_clk; /* PLL clk */ - struct clk *osc_clk; /* OSC clk */ - unsigned int max_freq; - unsigned int min_freq; -}; - -static struct ls1x_cpufreq *cpufreq; - -static int ls1x_cpufreq_notifier(struct notifier_block *nb, - unsigned long val, void *data) -{ - if (val == CPUFREQ_POSTCHANGE) - current_cpu_data.udelay_val = loops_per_jiffy; - - return NOTIFY_OK; -} - -static struct notifier_block ls1x_cpufreq_notifier_block = { - .notifier_call = ls1x_cpufreq_notifier -}; - -static int ls1x_cpufreq_target(struct cpufreq_policy *policy, - unsigned int index) -{ - struct device *cpu_dev = get_cpu_device(policy->cpu); - unsigned int old_freq, new_freq; - - old_freq = policy->cur; - new_freq = policy->freq_table[index].frequency; - - /* - * The procedure of reconfiguring CPU clk is as below. - * - * - Reparent CPU clk to OSC clk - * - Reset CPU clock (very important) - * - Reconfigure CPU DIV - * - Reparent CPU clk back to CPU DIV clk - */ - - clk_set_parent(policy->clk, cpufreq->osc_clk); - __raw_writel(__raw_readl(LS1X_CLK_PLL_DIV) | RST_CPU_EN | RST_CPU, - LS1X_CLK_PLL_DIV); - __raw_writel(__raw_readl(LS1X_CLK_PLL_DIV) & ~(RST_CPU_EN | RST_CPU), - LS1X_CLK_PLL_DIV); - clk_set_rate(cpufreq->mux_clk, new_freq * 1000); - clk_set_parent(policy->clk, cpufreq->mux_clk); - dev_dbg(cpu_dev, "%u KHz --> %u KHz\n", old_freq, new_freq); - - return 0; -} - -static int ls1x_cpufreq_init(struct cpufreq_policy *policy) -{ - struct device *cpu_dev = get_cpu_device(policy->cpu); - struct cpufreq_frequency_table *freq_tbl; - unsigned int pll_freq, freq; - int steps, i; - - pll_freq = clk_get_rate(cpufreq->pll_clk) / 1000; - - steps = 1 << DIV_CPU_WIDTH; - freq_tbl = kcalloc(steps, sizeof(*freq_tbl), GFP_KERNEL); - if (!freq_tbl) - return -ENOMEM; - - for (i = 0; i < (steps - 1); i++) { - freq = pll_freq / (i + 1); - if ((freq < cpufreq->min_freq) || (freq > cpufreq->max_freq)) - freq_tbl[i].frequency = CPUFREQ_ENTRY_INVALID; - else - freq_tbl[i].frequency = freq; - dev_dbg(cpu_dev, - "cpufreq table: index %d: frequency %d\n", i, - freq_tbl[i].frequency); - } - freq_tbl[i].frequency = CPUFREQ_TABLE_END; - - policy->clk = cpufreq->clk; - cpufreq_generic_init(policy, freq_tbl, 0); - - return 0; -} - -static int ls1x_cpufreq_exit(struct cpufreq_policy *policy) -{ - kfree(policy->freq_table); - return 0; -} - -static struct cpufreq_driver ls1x_cpufreq_driver = { - .name = "cpufreq-ls1x", - .flags = CPUFREQ_NEED_INITIAL_FREQ_CHECK, - .verify = cpufreq_generic_frequency_table_verify, - .target_index = ls1x_cpufreq_target, - .get = cpufreq_generic_get, - .init = ls1x_cpufreq_init, - .exit = ls1x_cpufreq_exit, - .attr = cpufreq_generic_attr, -}; - -static int ls1x_cpufreq_remove(struct platform_device *pdev) -{ - cpufreq_unregister_notifier(&ls1x_cpufreq_notifier_block, - CPUFREQ_TRANSITION_NOTIFIER); - cpufreq_unregister_driver(&ls1x_cpufreq_driver); - - return 0; -} - -static int ls1x_cpufreq_probe(struct platform_device *pdev) -{ - struct plat_ls1x_cpufreq *pdata = dev_get_platdata(&pdev->dev); - struct clk *clk; - int ret; - - if (!pdata || !pdata->clk_name || !pdata->osc_clk_name) { - 
dev_err(&pdev->dev, "platform data missing\n"); - return -EINVAL; - } - - cpufreq = - devm_kzalloc(&pdev->dev, sizeof(struct ls1x_cpufreq), GFP_KERNEL); - if (!cpufreq) - return -ENOMEM; - - cpufreq->dev = &pdev->dev; - - clk = devm_clk_get(&pdev->dev, pdata->clk_name); - if (IS_ERR(clk)) { - dev_err(&pdev->dev, "unable to get %s clock\n", - pdata->clk_name); - return PTR_ERR(clk); - } - cpufreq->clk = clk; - - clk = clk_get_parent(clk); - if (IS_ERR(clk)) { - dev_err(&pdev->dev, "unable to get parent of %s clock\n", - __clk_get_name(cpufreq->clk)); - return PTR_ERR(clk); - } - cpufreq->mux_clk = clk; - - clk = clk_get_parent(clk); - if (IS_ERR(clk)) { - dev_err(&pdev->dev, "unable to get parent of %s clock\n", - __clk_get_name(cpufreq->mux_clk)); - return PTR_ERR(clk); - } - cpufreq->pll_clk = clk; - - clk = devm_clk_get(&pdev->dev, pdata->osc_clk_name); - if (IS_ERR(clk)) { - dev_err(&pdev->dev, "unable to get %s clock\n", - pdata->osc_clk_name); - return PTR_ERR(clk); - } - cpufreq->osc_clk = clk; - - cpufreq->max_freq = pdata->max_freq; - cpufreq->min_freq = pdata->min_freq; - - ret = cpufreq_register_driver(&ls1x_cpufreq_driver); - if (ret) { - dev_err(&pdev->dev, - "failed to register CPUFreq driver: %d\n", ret); - return ret; - } - - ret = cpufreq_register_notifier(&ls1x_cpufreq_notifier_block, - CPUFREQ_TRANSITION_NOTIFIER); - - if (ret) { - dev_err(&pdev->dev, - "failed to register CPUFreq notifier: %d\n",ret); - cpufreq_unregister_driver(&ls1x_cpufreq_driver); - } - - return ret; -} - -static struct platform_driver ls1x_cpufreq_platdrv = { - .probe = ls1x_cpufreq_probe, - .remove = ls1x_cpufreq_remove, - .driver = { - .name = "ls1x-cpufreq", - }, -}; - -module_platform_driver(ls1x_cpufreq_platdrv); - -MODULE_ALIAS("platform:ls1x-cpufreq"); -MODULE_AUTHOR("Kelvin Cheung <keguang.zhang@gmail.com>"); -MODULE_DESCRIPTION("Loongson1 CPUFreq driver"); -MODULE_LICENSE("GPL"); diff --git a/drivers/cpufreq/mediatek-cpufreq-hw.c b/drivers/cpufreq/mediatek-cpufreq-hw.c index f80339779084..b22f5cc8a463 100644 --- a/drivers/cpufreq/mediatek-cpufreq-hw.c +++ b/drivers/cpufreq/mediatek-cpufreq-hw.c @@ -317,13 +317,16 @@ static int mtk_cpufreq_hw_driver_probe(struct platform_device *pdev) static int mtk_cpufreq_hw_driver_remove(struct platform_device *pdev) { - return cpufreq_unregister_driver(&cpufreq_mtk_hw_driver); + cpufreq_unregister_driver(&cpufreq_mtk_hw_driver); + + return 0; } static const struct of_device_id mtk_cpufreq_hw_match[] = { { .compatible = "mediatek,cpufreq-hw", .data = &cpufreq_mtk_offsets }, {} }; +MODULE_DEVICE_TABLE(of, mtk_cpufreq_hw_match); static struct platform_driver mtk_cpufreq_hw_driver = { .probe = mtk_cpufreq_hw_driver_probe, diff --git a/drivers/cpufreq/omap-cpufreq.c b/drivers/cpufreq/omap-cpufreq.c index 1b50df06c6bc..81649a1969b6 100644 --- a/drivers/cpufreq/omap-cpufreq.c +++ b/drivers/cpufreq/omap-cpufreq.c @@ -184,7 +184,9 @@ static int omap_cpufreq_probe(struct platform_device *pdev) static int omap_cpufreq_remove(struct platform_device *pdev) { - return cpufreq_unregister_driver(&omap_driver); + cpufreq_unregister_driver(&omap_driver); + + return 0; } static struct platform_driver omap_cpufreq_platdrv = { diff --git a/drivers/cpufreq/qcom-cpufreq-hw.c b/drivers/cpufreq/qcom-cpufreq-hw.c index d3f55ca06ed3..2f581d2d617d 100644 --- a/drivers/cpufreq/qcom-cpufreq-hw.c +++ b/drivers/cpufreq/qcom-cpufreq-hw.c @@ -770,7 +770,9 @@ of_exit: static int qcom_cpufreq_hw_driver_remove(struct platform_device *pdev) { - return 
cpufreq_unregister_driver(&cpufreq_qcom_hw_driver); + cpufreq_unregister_driver(&cpufreq_qcom_hw_driver); + + return 0; } static struct platform_driver qcom_cpufreq_hw_driver = { diff --git a/drivers/cpufreq/tegra194-cpufreq.c b/drivers/cpufreq/tegra194-cpufreq.c index 4596c3e323aa..5890e25d7f77 100644 --- a/drivers/cpufreq/tegra194-cpufreq.c +++ b/drivers/cpufreq/tegra194-cpufreq.c @@ -411,7 +411,8 @@ static int tegra194_cpufreq_set_target(struct cpufreq_policy *policy, static struct cpufreq_driver tegra194_cpufreq_driver = { .name = "tegra194", - .flags = CPUFREQ_CONST_LOOPS | CPUFREQ_NEED_INITIAL_FREQ_CHECK, + .flags = CPUFREQ_CONST_LOOPS | CPUFREQ_NEED_INITIAL_FREQ_CHECK | + CPUFREQ_IS_COOLING_DEV, .verify = cpufreq_generic_frequency_table_verify, .target_index = tegra194_cpufreq_set_target, .get = tegra194_get_speed, diff --git a/drivers/cpuidle/Kconfig b/drivers/cpuidle/Kconfig index ff71dd662880..cac5997dca50 100644 --- a/drivers/cpuidle/Kconfig +++ b/drivers/cpuidle/Kconfig @@ -74,6 +74,7 @@ endmenu config HALTPOLL_CPUIDLE tristate "Halt poll cpuidle driver" depends on X86 && KVM_GUEST + select CPU_IDLE_GOV_HALTPOLL default y help This option enables halt poll cpuidle driver, which allows to poll diff --git a/drivers/cpuidle/Kconfig.arm b/drivers/cpuidle/Kconfig.arm index 747aa537389b..a1ee475d180d 100644 --- a/drivers/cpuidle/Kconfig.arm +++ b/drivers/cpuidle/Kconfig.arm @@ -24,6 +24,14 @@ config ARM_PSCI_CPUIDLE It provides an idle driver that is capable of detecting and managing idle states through the PSCI firmware interface. + The driver has limitations when used with PREEMPT_RT: + - If the idle states are described with the non-hierarchical layout, + all idle states are still available. + + - If the idle states are described with the hierarchical layout, + only the idle states defined per CPU are available, but not the ones + being shared among a group of CPUs (aka cluster idle states). + config ARM_PSCI_CPUIDLE_DOMAIN bool "PSCI CPU idle Domain" depends on ARM_PSCI_CPUIDLE @@ -102,6 +110,7 @@ config ARM_MVEBU_V7_CPUIDLE config ARM_TEGRA_CPUIDLE bool "CPU Idle Driver for NVIDIA Tegra SoCs" depends on (ARCH_TEGRA || COMPILE_TEST) && !ARM64 && MMU + depends on ARCH_SUSPEND_POSSIBLE select ARCH_NEEDS_CPU_IDLE_COUPLED if SMP select ARM_CPU_SUSPEND help @@ -110,6 +119,7 @@ config ARM_TEGRA_CPUIDLE config ARM_QCOM_SPM_CPUIDLE bool "CPU Idle Driver for Qualcomm Subsystem Power Manager (SPM)" depends on (ARCH_QCOM || COMPILE_TEST) && !ARM64 && MMU + depends on ARCH_SUSPEND_POSSIBLE select ARM_CPU_SUSPEND select CPU_IDLE_MULTIPLE_DRIVERS select DT_IDLE_STATES diff --git a/drivers/cpuidle/cpuidle-haltpoll.c b/drivers/cpuidle/cpuidle-haltpoll.c index 3a39a7f48b77..e66df22f9695 100644 --- a/drivers/cpuidle/cpuidle-haltpoll.c +++ b/drivers/cpuidle/cpuidle-haltpoll.c @@ -32,7 +32,7 @@ static int default_enter_idle(struct cpuidle_device *dev, local_irq_enable(); return index; } - default_idle(); + arch_cpu_idle(); return index; } diff --git a/drivers/cpuidle/cpuidle-psci-domain.c b/drivers/cpuidle/cpuidle-psci-domain.c index c80cf9ddabd8..6ad2954948a5 100644 --- a/drivers/cpuidle/cpuidle-psci-domain.c +++ b/drivers/cpuidle/cpuidle-psci-domain.c @@ -64,8 +64,11 @@ static int psci_pd_init(struct device_node *np, bool use_osi) pd->flags |= GENPD_FLAG_IRQ_SAFE | GENPD_FLAG_CPU_DOMAIN; - /* Allow power off when OSI has been successfully enabled. */ - if (use_osi) + /* + * Allow power off when OSI has been successfully enabled. + * PREEMPT_RT is not yet ready to enter domain idle states. 
+ */ + if (use_osi && !IS_ENABLED(CONFIG_PREEMPT_RT)) pd->power_off = psci_pd_power_off; else pd->flags |= GENPD_FLAG_ALWAYS_ON; diff --git a/drivers/cpuidle/cpuidle-psci.c b/drivers/cpuidle/cpuidle-psci.c index 312a34ef28dc..6de027f9f6f5 100644 --- a/drivers/cpuidle/cpuidle-psci.c +++ b/drivers/cpuidle/cpuidle-psci.c @@ -222,6 +222,9 @@ static int psci_dt_cpu_init_topology(struct cpuidle_driver *drv, if (!psci_has_osi_support()) return 0; + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + return 0; + data->dev = psci_dt_attach_cpu(cpu); if (IS_ERR_OR_NULL(data->dev)) return PTR_ERR_OR_ZERO(data->dev); diff --git a/drivers/cpuidle/driver.c b/drivers/cpuidle/driver.c index f70aa17e2a8e..d9cda7f6ccb9 100644 --- a/drivers/cpuidle/driver.c +++ b/drivers/cpuidle/driver.c @@ -183,11 +183,15 @@ static void __cpuidle_driver_init(struct cpuidle_driver *drv) s->target_residency_ns = s->target_residency * NSEC_PER_USEC; else if (s->target_residency_ns < 0) s->target_residency_ns = 0; + else + s->target_residency = div_u64(s->target_residency_ns, NSEC_PER_USEC); if (s->exit_latency > 0) s->exit_latency_ns = s->exit_latency * NSEC_PER_USEC; else if (s->exit_latency_ns < 0) s->exit_latency_ns = 0; + else + s->exit_latency = div_u64(s->exit_latency_ns, NSEC_PER_USEC); } } diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c index d9262db79cae..987fc5f3997d 100644 --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c @@ -2,8 +2,13 @@ /* * Timer events oriented CPU idle governor * + * TEO governor: * Copyright (C) 2018 - 2021 Intel Corporation * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com> + * + * Util-awareness mechanism: + * Copyright (C) 2022 Arm Ltd. + * Author: Kajetan Puchalski <kajetan.puchalski@arm.com> */ /** @@ -99,15 +104,56 @@ * select the given idle state instead of the candidate one. * * 3. By default, select the candidate state. + * + * Util-awareness mechanism: + * + * The idea behind the util-awareness extension is that there are two distinct + * scenarios for the CPU which should result in two different approaches to idle + * state selection - utilized and not utilized. + * + * In this case, 'utilized' means that the average runqueue util of the CPU is + * above a certain threshold. + * + * When the CPU is utilized while going into idle, more likely than not it will + * be woken up to do more work soon and so a shallower idle state should be + * selected to minimise latency and maximise performance. When the CPU is not + * being utilized, the usual metrics-based approach to selecting the deepest + * available idle state should be preferred to take advantage of the power + * saving. + * + * In order to achieve this, the governor uses a utilization threshold. + * The threshold is computed per-CPU as a percentage of the CPU's capacity + * by bit shifting the capacity value. Based on testing, the shift of 6 (~1.56%) + * seems to be getting the best results. + * + * Before selecting the next idle state, the governor compares the current CPU + * util to the precomputed util threshold. If it's below, it defaults to the + * TEO metrics mechanism. If it's above, the closest shallower idle state will + * be selected instead, as long as it is not a polling state.
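+ *
+ * As a worked example (the capacity numbers are illustrative, added for
+ * clarity): a big core with arch_scale_cpu_capacity() == 1024 gets a
+ * threshold of 1024 >> 6 = 16, so it counts as utilized once
+ * sched_cpu_util() exceeds 16 (~1.56% of its capacity); a little core with
+ * capacity 160 gets a threshold of 160 >> 6 = 2.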
*/ #include <linux/cpuidle.h> #include <linux/jiffies.h> #include <linux/kernel.h> +#include <linux/sched.h> #include <linux/sched/clock.h> +#include <linux/sched/topology.h> #include <linux/tick.h> /* + * The number of bits to shift the CPU's capacity by in order to determine + * the utilization threshold. + * + * 6 was chosen based on testing as the number that achieved the best balance + * of power and performance on average. + * + * The resulting threshold is high enough to not be triggered by background + * noise and low enough to react quickly when activity starts to ramp up. + */ +#define UTIL_THRESHOLD_SHIFT 6 + + /* * The PULSE value is added to metrics when they grow and the DECAY_SHIFT value * is used for decreasing metrics on a regular basis. */ @@ -137,9 +183,11 @@ struct teo_bin { * @time_span_ns: Time between idle state selection and post-wakeup update. * @sleep_length_ns: Time till the closest timer event (at the selection time). * @state_bins: Idle state data bins for this CPU. - * @total: Grand total of the "intercepts" and "hits" mertics for all bins. + * @total: Grand total of the "intercepts" and "hits" metrics for all bins. * @next_recent_idx: Index of the next @recent_idx entry to update. * @recent_idx: Indices of bins corresponding to recent "intercepts". + * @util_threshold: Threshold above which the CPU is considered utilized. + * @utilized: Whether the last sleep on the CPU happened while utilized. */ struct teo_cpu { s64 time_span_ns; @@ -148,11 +196,30 @@ struct teo_cpu { unsigned int total; int next_recent_idx; int recent_idx[NR_RECENT]; + unsigned long util_threshold; + bool utilized; }; static DEFINE_PER_CPU(struct teo_cpu, teo_cpus); /** + * teo_cpu_is_utilized - Check if the CPU's util is above the threshold + * @cpu: Target CPU + * @cpu_data: Governor CPU data for the target CPU + */ +#ifdef CONFIG_SMP +static bool teo_cpu_is_utilized(int cpu, struct teo_cpu *cpu_data) +{ + return sched_cpu_util(cpu) > cpu_data->util_threshold; +} +#else +static bool teo_cpu_is_utilized(int cpu, struct teo_cpu *cpu_data) +{ + return false; +} +#endif + +/** * teo_update - Update CPU metrics after wakeup. * @drv: cpuidle driver containing state data. * @dev: Target CPU. @@ -258,15 +325,17 @@ static s64 teo_middle_of_bin(int idx, struct cpuidle_driver *drv) * @dev: Target CPU. * @state_idx: Index of the capping idle state. * @duration_ns: Idle duration value to match. + * @no_poll: Don't consider polling states. */ static int teo_find_shallower_state(struct cpuidle_driver *drv, struct cpuidle_device *dev, int state_idx, - s64 duration_ns) + s64 duration_ns, bool no_poll) { int i; for (i = state_idx - 1; i >= 0; i--) { - if (dev->states_usage[i].disable) + if (dev->states_usage[i].disable || + (no_poll && drv->states[i].flags & CPUIDLE_FLAG_POLLING)) continue; state_idx = i; @@ -321,6 +390,22 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, goto end; } + cpu_data->utilized = teo_cpu_is_utilized(dev->cpu, cpu_data); + /* + * If the CPU is being utilized over the threshold and there are only 2 + * states to choose from, the metrics need not be considered, so choose + * the shallowest non-polling state and exit.
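+ * (Illustration under an assumed configuration: a driver exposing only a + * polling state and one regular idle state. The loop below then always + * picks the regular state while the CPU is utilized.)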
+ */ + if (drv->state_count < 3 && cpu_data->utilized) { + for (i = 0; i < drv->state_count; ++i) { + if (!dev->states_usage[i].disable && + !(drv->states[i].flags & CPUIDLE_FLAG_POLLING)) { + idx = i; + goto end; + } + } + } + /* * Find the deepest idle state whose target residency does not exceed * the current sleep length and the deepest idle state not deeper than @@ -452,6 +537,13 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, if (idx > constraint_idx) idx = constraint_idx; + /* + * If the CPU is being utilized over the threshold, choose a shallower + * non-polling state to improve latency + */ + if (cpu_data->utilized) + idx = teo_find_shallower_state(drv, dev, idx, duration_ns, true); + end: /* * Don't stop the tick if the selected state is a polling one or if the @@ -469,7 +561,7 @@ end: */ if (idx > idx0 && drv->states[idx].target_residency_ns > delta_tick) - idx = teo_find_shallower_state(drv, dev, idx, delta_tick); + idx = teo_find_shallower_state(drv, dev, idx, delta_tick, false); } return idx; @@ -508,9 +600,11 @@ static int teo_enable_device(struct cpuidle_driver *drv, struct cpuidle_device *dev) { struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); + unsigned long max_capacity = arch_scale_cpu_capacity(dev->cpu); int i; memset(cpu_data, 0, sizeof(*cpu_data)); + cpu_data->util_threshold = max_capacity >> UTIL_THRESHOLD_SHIFT; for (i = 0; i < NR_RECENT; i++) cpu_data->recent_idx[i] = -1; diff --git a/drivers/cpuidle/sysfs.c b/drivers/cpuidle/sysfs.c index 2b496a53cbca..48948b171749 100644 --- a/drivers/cpuidle/sysfs.c +++ b/drivers/cpuidle/sysfs.c @@ -200,7 +200,7 @@ static void cpuidle_sysfs_release(struct kobject *kobj) complete(&kdev->kobj_unregister); } -static struct kobj_type ktype_cpuidle = { +static const struct kobj_type ktype_cpuidle = { .sysfs_ops = &cpuidle_sysfs_ops, .release = cpuidle_sysfs_release, }; @@ -447,7 +447,7 @@ static void cpuidle_state_sysfs_release(struct kobject *kobj) complete(&state_obj->kobj_unregister); } -static struct kobj_type ktype_state_cpuidle = { +static const struct kobj_type ktype_state_cpuidle = { .sysfs_ops = &cpuidle_state_sysfs_ops, .default_groups = cpuidle_state_default_groups, .release = cpuidle_state_sysfs_release, @@ -594,7 +594,7 @@ static struct attribute *cpuidle_driver_default_attrs[] = { }; ATTRIBUTE_GROUPS(cpuidle_driver_default); -static struct kobj_type ktype_driver_cpuidle = { +static const struct kobj_type ktype_driver_cpuidle = { .sysfs_ops = &cpuidle_driver_sysfs_ops, .default_groups = cpuidle_driver_default_groups, .release = cpuidle_driver_sysfs_release, diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index e2d64a8f9422..938c17f25d94 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -1424,6 +1424,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, &idle_cpu_adl_l), X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_N, &idle_cpu_adl_n), X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &idle_cpu_spr), + X86_MATCH_INTEL_FAM6_MODEL(EMERALDRAPIDS_X, &idle_cpu_spr), X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL, &idle_cpu_knl), X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM, &idle_cpu_knl), X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT, &idle_cpu_bxt), @@ -1859,6 +1860,7 @@ static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv) skx_idle_state_table_update(); break; case INTEL_FAM6_SAPPHIRERAPIDS_X: + case INTEL_FAM6_EMERALDRAPIDS_X: spr_idle_state_table_update(); break; case 
INTEL_FAM6_ALDERLAKE: diff --git a/drivers/opp/Kconfig b/drivers/opp/Kconfig index e8ce47b32735..d7c649a1a981 100644 --- a/drivers/opp/Kconfig +++ b/drivers/opp/Kconfig @@ -1,7 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only config PM_OPP bool - select SRCU help SOCs have a standard set of tuples consisting of frequency and voltage pairs that the device will support per voltage domain. This diff --git a/drivers/opp/debugfs.c b/drivers/opp/debugfs.c index 96a30a032c5f..2c7fb683441e 100644 --- a/drivers/opp/debugfs.c +++ b/drivers/opp/debugfs.c @@ -235,7 +235,7 @@ static void opp_migrate_dentry(struct opp_device *opp_dev, dentry = debugfs_rename(rootdir, opp_dev->dentry, rootdir, opp_table->dentry_name); - if (!dentry) { + if (IS_ERR(dentry)) { dev_err(dev, "%s: Failed to rename link from: %s to %s\n", __func__, dev_name(opp_dev->dev), dev_name(dev)); return; diff --git a/drivers/powercap/idle_inject.c b/drivers/powercap/idle_inject.c index fe86a09e3b67..c03b5402c03b 100644 --- a/drivers/powercap/idle_inject.c +++ b/drivers/powercap/idle_inject.c @@ -155,10 +155,12 @@ void idle_inject_set_duration(struct idle_inject_device *ii_dev, unsigned int run_duration_us, unsigned int idle_duration_us) { - if (run_duration_us && idle_duration_us) { + if (run_duration_us + idle_duration_us) { WRITE_ONCE(ii_dev->run_duration_us, run_duration_us); WRITE_ONCE(ii_dev->idle_duration_us, idle_duration_us); } + if (!run_duration_us) + pr_debug("CPU is forced to 100 percent idle\n"); } /** @@ -201,7 +203,7 @@ int idle_inject_start(struct idle_inject_device *ii_dev) unsigned int idle_duration_us = READ_ONCE(ii_dev->idle_duration_us); unsigned int run_duration_us = READ_ONCE(ii_dev->run_duration_us); - if (!idle_duration_us || !run_duration_us) + if (!(idle_duration_us + run_duration_us)) return -EINVAL; pr_debug("Starting injecting idle cycles on CPUs '%*pbl'\n", diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c index 26d00b1853b4..8970c7b80884 100644 --- a/drivers/powercap/intel_rapl_common.c +++ b/drivers/powercap/intel_rapl_common.c @@ -999,7 +999,15 @@ static u64 rapl_compute_time_window_core(struct rapl_package *rp, u64 value, do_div(value, rp->time_unit); y = ilog2(value); - f = div64_u64(4 * (value - (1 << y)), 1 << y); + + /* + * The target hardware field is 7 bits wide, so return all ones + * if the exponent is too large. 
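+ * + * For context (derived from the code below, not wording from the patch): the + * encoded result is (y & 0x1f) | ((f & 0x3) << 5), a 5-bit exponent y plus a + * 2-bit fraction f standing for roughly (1 + f/4) * 2^y time units, so an + * exponent above 0x1f cannot be represented and 0x7f is the widest window.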
+ */ + if (y > 0x1f) + return 0x7f; + + f = div64_u64(4 * (value - (1ULL << y)), 1ULL << y); value = (y & 0x1f) | ((f & 0x3) << 5); } return value; @@ -1113,7 +1121,10 @@ static const struct x86_cpu_id rapl_ids[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, &rapl_defaults_core), X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, &rapl_defaults_core), X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_S, &rapl_defaults_core), + X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE, &rapl_defaults_core), + X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE_L, &rapl_defaults_core), X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &rapl_defaults_spr_server), + X86_MATCH_INTEL_FAM6_MODEL(EMERALDRAPIDS_X, &rapl_defaults_spr_server), X86_MATCH_INTEL_FAM6_MODEL(LAKEFIELD, &rapl_defaults_core), X86_MATCH_INTEL_FAM6_MODEL(ATOM_SILVERMONT, &rapl_defaults_byt), diff --git a/drivers/powercap/powercap_sys.c b/drivers/powercap/powercap_sys.c index 1f968353d479..e180dee0f83d 100644 --- a/drivers/powercap/powercap_sys.c +++ b/drivers/powercap/powercap_sys.c @@ -530,9 +530,6 @@ struct powercap_zone *powercap_register_zone( power_zone->name = kstrdup(name, GFP_KERNEL); if (!power_zone->name) goto err_name_alloc; - dev_set_name(&power_zone->dev, "%s:%x", - dev_name(power_zone->dev.parent), - power_zone->id); power_zone->constraints = kcalloc(nr_constraints, sizeof(*power_zone->constraints), GFP_KERNEL); @@ -555,9 +552,16 @@ struct powercap_zone *powercap_register_zone( power_zone->dev_attr_groups[0] = &power_zone->dev_zone_attr_group; power_zone->dev_attr_groups[1] = NULL; power_zone->dev.groups = power_zone->dev_attr_groups; + dev_set_name(&power_zone->dev, "%s:%x", + dev_name(power_zone->dev.parent), + power_zone->id); result = device_register(&power_zone->dev); - if (result) - goto err_dev_ret; + if (result) { + put_device(&power_zone->dev); + mutex_unlock(&control_type->lock); + + return ERR_PTR(result); + } control_type->nr_zones++; mutex_unlock(&control_type->lock); diff --git a/include/acpi/cppc_acpi.h b/include/acpi/cppc_acpi.h index c5614444031f..6b487a5bd638 100644 --- a/include/acpi/cppc_acpi.h +++ b/include/acpi/cppc_acpi.h @@ -108,12 +108,14 @@ struct cppc_perf_caps { u32 lowest_nonlinear_perf; u32 lowest_freq; u32 nominal_freq; + u32 energy_perf; }; struct cppc_perf_ctrls { u32 max_perf; u32 min_perf; u32 desired_perf; + u32 energy_perf; }; struct cppc_perf_fb_ctrs { @@ -149,6 +151,8 @@ extern bool cpc_ffh_supported(void); extern bool cpc_supported_by_cpu(void); extern int cpc_read_ffh(int cpunum, struct cpc_reg *reg, u64 *val); extern int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val); +extern int cppc_get_epp_perf(int cpunum, u64 *epp_perf); +extern int cppc_set_epp_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls, bool enable); #else /* !CONFIG_ACPI_CPPC_LIB */ static inline int cppc_get_desired_perf(int cpunum, u64 *desired_perf) { @@ -202,6 +206,14 @@ static inline int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val) { return -ENOTSUPP; } +static inline int cppc_set_epp_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls, bool enable) +{ + return -ENOTSUPP; +} +static inline int cppc_get_epp_perf(int cpunum, u64 *epp_perf) +{ + return -ENOTSUPP; +} #endif /* !CONFIG_ACPI_CPPC_LIB */ #endif /* _CPPC_ACPI_H*/ diff --git a/include/linux/amd-pstate.h b/include/linux/amd-pstate.h index 1c4b8659f171..f5f22418e64b 100644 --- a/include/linux/amd-pstate.h +++ b/include/linux/amd-pstate.h @@ -12,6 +12,11 @@ #include <linux/pm_qos.h> +#define AMD_CPPC_EPP_PERFORMANCE 0x00 +#define AMD_CPPC_EPP_BALANCE_PERFORMANCE 0x80 +#define 
AMD_CPPC_EPP_BALANCE_POWERSAVE 0xBF +#define AMD_CPPC_EPP_POWERSAVE 0xFF + /********************************************************************* * AMD P-state INTERFACE * *********************************************************************/ @@ -47,6 +52,10 @@ struct amd_aperf_mperf { * @prev: Last Aperf/Mperf/tsc count value read from register * @freq: current cpu frequency value * @boost_supported: check whether the Processor or SBIOS supports boost mode + * @epp_policy: Last saved policy used to set energy-performance preference + * @epp_cached: Cached CPPC energy-performance preference value + * @policy: Cpufreq policy value + * @cppc_cap1_cached: Cached MSR_AMD_CPPC_CAP1 register value * * The amd_cpudata is key private data for each CPU thread in AMD P-State, and * represents all the attributes and goals that AMD P-State requests at runtime. @@ -72,6 +81,29 @@ struct amd_cpudata { u64 freq; bool boost_supported; + + /* EPP feature related attributes */ + s16 epp_policy; + s16 epp_cached; + u32 policy; + u64 cppc_cap1_cached; + bool suspended; }; +/* + * enum amd_pstate_mode - driver working mode of amd pstate + */ +enum amd_pstate_mode { + AMD_PSTATE_DISABLE = 0, + AMD_PSTATE_PASSIVE, + AMD_PSTATE_ACTIVE, + AMD_PSTATE_MAX, +}; + +static const char * const amd_pstate_mode_string[] = { + [AMD_PSTATE_DISABLE] = "disable", + [AMD_PSTATE_PASSIVE] = "passive", + [AMD_PSTATE_ACTIVE] = "active", + NULL, +}; #endif /* _LINUX_AMD_PSTATE_H */ diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 6a94a6eaad27..65623233ab2f 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -448,7 +448,7 @@ struct cpufreq_driver { #define CPUFREQ_NO_AUTO_DYNAMIC_SWITCHING BIT(6) int cpufreq_register_driver(struct cpufreq_driver *driver_data); -int cpufreq_unregister_driver(struct cpufreq_driver *driver_data); +void cpufreq_unregister_driver(struct cpufreq_driver *driver_data); bool cpufreq_driver_test_flags(u16 flags); const char *cpufreq_get_current_driver(void); diff --git a/include/linux/pm.h b/include/linux/pm.h index 93cd34f00822..035d9649eba4 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -379,9 +379,13 @@ const struct dev_pm_ops name = { \ const struct dev_pm_ops name; \ __EXPORT_SYMBOL(name, sec, ns); \ const struct dev_pm_ops name +#define EXPORT_PM_FN_GPL(name) EXPORT_SYMBOL_GPL(name) +#define EXPORT_PM_FN_NS_GPL(name, ns) EXPORT_SYMBOL_NS_GPL(name, ns) #else #define _EXPORT_DEV_PM_OPS(name, sec, ns) \ static __maybe_unused const struct dev_pm_ops __static_##name +#define EXPORT_PM_FN_GPL(name) +#define EXPORT_PM_FN_NS_GPL(name, ns) #endif #define EXPORT_DEV_PM_OPS(name) _EXPORT_DEV_PM_OPS(name, "", "") diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 60a1d3051cc7..4b31629c5be4 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -118,7 +118,6 @@ config PM_SLEEP def_bool y depends on SUSPEND || HIBERNATE_CALLBACKS select PM - select SRCU config PM_SLEEP_SMP def_bool y diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index f82111837b8d..7b44f5b89fa1 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -87,10 +87,7 @@ static void em_debug_create_pd(struct device *dev) static void em_debug_remove_pd(struct device *dev) { - struct dentry *debug_dir; - - debug_dir = debugfs_lookup(dev_name(dev), rootdir); - debugfs_remove_recursive(debug_dir); + debugfs_lookup_and_remove(dev_name(dev), rootdir); } static int __init em_debug_init(void) diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 277434b6c0bf..36a1df48280c 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -581,7 +581,7 @@ static int save_image(struct swap_map_handle *handle, return ret; } -/** +/* * Structure used for CRC32. */ struct crc_data { @@ -596,7 +596,7 @@ struct crc_data { unsigned char *unc[LZO_THREADS]; /* uncompressed data */ }; -/** +/* * CRC32 update function that runs in its own thread. */ static int crc32_threadfn(void *data) @@ -623,7 +623,7 @@ static int crc32_threadfn(void *data) } return 0; } -/** +/* * Structure used for LZO data compression. */ struct cmp_data { @@ -640,7 +640,7 @@ struct cmp_data { unsigned char wrk[LZO1X_1_MEM_COMPRESS]; /* compression workspace */ }; -/** +/* * Compression function that runs in its own thread. */ static int lzo_compress_threadfn(void *data) @@ -948,9 +948,9 @@ out_finish: return error; } -/** +/* * The following functions allow us to read data using a swap map - * in a file-alike way + * in a file-like way. */ static void release_swap_reader(struct swap_map_handle *handle) @@ -1107,7 +1107,7 @@ static int load_image(struct swap_map_handle *handle, return ret; } -/** +/* * Structure used for LZO data decompression. */ struct dec_data { @@ -1123,7 +1123,7 @@ struct dec_data { unsigned char cmp[LZO_CMP_SIZE]; /* compressed buffer */ }; -/** +/* * Decompression function that runs in its own thread. */ static int lzo_decompress_threadfn(void *data) diff --git a/tools/power/pm-graph/sleepgraph.py b/tools/power/pm-graph/sleepgraph.py index c60c90f35d18..82c09cd25cc2 100755 --- a/tools/power/pm-graph/sleepgraph.py +++ b/tools/power/pm-graph/sleepgraph.py @@ -120,9 +120,9 @@ class SystemValues: cgexp = False testdir = '' outdir = '' - tpath = '/sys/kernel/debug/tracing/' + tpath = '/sys/kernel/tracing/' fpdtpath = '/sys/firmware/acpi/tables/FPDT' - epath = '/sys/kernel/debug/tracing/events/power/' + epath = '/sys/kernel/tracing/events/power/' pmdpath = '/sys/power/pm_debug_messages' s0ixpath = '/sys/module/intel_pmc_core/parameters/warn_on_s0ix_failures' s0ixres = '/sys/devices/system/cpu/cpuidle/low_power_idle_system_residency_us' diff --git a/tools/power/x86/amd_pstate_tracer/amd_pstate_trace.py b/tools/power/x86/amd_pstate_tracer/amd_pstate_trace.py index 2dea4032ac56..904df0ea0a1e 100755 --- a/tools/power/x86/amd_pstate_tracer/amd_pstate_trace.py +++ b/tools/power/x86/amd_pstate_tracer/amd_pstate_trace.py @@ -248,7 +248,7 @@ def signal_handler(signal, frame): ipt.free_trace_buffer() sys.exit(0) -trace_file = "/sys/kernel/debug/tracing/events/amd_cpu/enable" +trace_file = "/sys/kernel/tracing/events/amd_cpu/enable" signal.signal(signal.SIGINT, signal_handler) interval = "" @@ -319,7 +319,7 @@ print(cur_version) cleanup_data_files() if interval: - file_name = "/sys/kernel/debug/tracing/trace" + file_name = "/sys/kernel/tracing/trace" ipt.clear_trace_file() ipt.set_trace_buffer_size(memory) ipt.enable_trace(trace_file) diff --git a/tools/power/x86/intel_pstate_tracer/intel_pstate_tracer.py b/tools/power/x86/intel_pstate_tracer/intel_pstate_tracer.py index b46e9eb8f5aa..ec3323100e1a 100755 --- a/tools/power/x86/intel_pstate_tracer/intel_pstate_tracer.py +++ b/tools/power/x86/intel_pstate_tracer/intel_pstate_tracer.py @@ -373,7 +373,7 @@ def clear_trace_file(): """ Clear trace file """ try: - f_handle = open('/sys/kernel/debug/tracing/trace', 'w') + f_handle = open('/sys/kernel/tracing/trace', 'w') f_handle.close() except: print('IO error clearing trace file ') @@ -401,7 +401,7 @@ def set_trace_buffer_size(memory): """ Set trace buffer size """ 
try: - with open('/sys/kernel/debug/tracing/buffer_size_kb', 'w') as fp: + with open('/sys/kernel/tracing/buffer_size_kb', 'w') as fp: fp.write(memory) except: print('IO error setting trace buffer size ') @@ -411,7 +411,7 @@ def free_trace_buffer(): """ Free the trace buffer memory """ try: - open('/sys/kernel/debug/tracing/buffer_size_kb' + open('/sys/kernel/tracing/buffer_size_kb' , 'w').write("1") except: print('IO error freeing trace buffer ') @@ -495,7 +495,7 @@ def signal_handler(signal, frame): sys.exit(0) if __name__ == "__main__": - trace_file = "/sys/kernel/debug/tracing/events/power/pstate_sample/enable" + trace_file = "/sys/kernel/tracing/events/power/pstate_sample/enable" signal.signal(signal.SIGINT, signal_handler) interval = "" @@ -569,7 +569,7 @@ if __name__ == "__main__": cleanup_data_files() if interval: - filename = "/sys/kernel/debug/tracing/trace" + filename = "/sys/kernel/tracing/trace" clear_trace_file() set_trace_buffer_size(memory) enable_trace(trace_file)
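As a closing illustration of the CPPC energy-performance plumbing added above, here is a minimal, hypothetical sketch of how the new cppc_get_epp_perf() helper and the AMD_CPPC_EPP_* hint values could be used together from kernel code. It is not part of this series: epp_hint_name() and show_epp() are invented names, and the classification boundaries are an assumption for illustration only.

#include <acpi/cppc_acpi.h>
#include <linux/amd-pstate.h>
#include <linux/printk.h>

/*
 * Hypothetical helper: classify a raw 0-255 EPP hint against the
 * predefined AMD_CPPC_EPP_* points (boundaries assumed for illustration).
 */
static const char *epp_hint_name(u64 epp)
{
	if (epp == AMD_CPPC_EPP_PERFORMANCE)		/* 0x00 */
		return "performance";
	if (epp <= AMD_CPPC_EPP_BALANCE_PERFORMANCE)	/* <= 0x80 */
		return "balance_performance";
	if (epp <= AMD_CPPC_EPP_BALANCE_POWERSAVE)	/* <= 0xBF */
		return "balance_power";
	return "power";					/* up to 0xFF */
}

/* Hypothetical helper: read and report the current EPP hint of one CPU. */
static int show_epp(int cpu)
{
	u64 epp;
	int ret;

	/* The !CONFIG_ACPI_CPPC_LIB stub above returns -ENOTSUPP. */
	ret = cppc_get_epp_perf(cpu, &epp);
	if (ret)
		return ret;

	pr_info("CPU%d: EPP hint %llu (%s)\n", cpu, epp, epp_hint_name(epp));
	return 0;
}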