diff options
-rw-r--r-- | Documentation/ABI/testing/sysfs-devices-system-cpu | 69 | ||||
-rw-r--r-- | drivers/cpufreq/acpi-cpufreq.c | 18 | ||||
-rw-r--r-- | drivers/cpufreq/cpufreq.c | 98 | ||||
-rw-r--r-- | drivers/cpufreq/cpufreq_governor.c | 2 | ||||
-rw-r--r-- | drivers/cpufreq/intel_pstate.c | 73 | ||||
-rw-r--r-- | drivers/cpufreq/powernv-cpufreq.c | 124 | ||||
-rw-r--r-- | drivers/cpuidle/governors/menu.c | 50 | ||||
-rw-r--r-- | drivers/idle/intel_idle.c | 133 |
8 files changed, 407 insertions, 160 deletions
diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index b683e8ee69ec..16501334b99f 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -271,3 +271,72 @@ Description: Parameters for the CPU cache attributes - WriteBack: data is written only to the cache line and the modified cache line is written to main memory only when it is replaced + +What: /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats + /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/turbo_stat + /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/sub_turbo_stat + /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/unthrottle + /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/powercap + /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/overtemp + /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/supply_fault + /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/overcurrent + /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/occ_reset +Date: March 2016 +Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org> + Linux for PowerPC mailing list <linuxppc-dev@ozlabs.org> +Description: POWERNV CPUFreq driver's frequency throttle stats directory and + attributes + + 'cpuX/cpufreq/throttle_stats' directory contains the CPU frequency + throttle stat attributes for the chip. The throttle stats of a cpu + is common across all the cpus belonging to a chip. Below are the + throttle attributes exported in the 'throttle_stats' directory: + + - turbo_stat : This file gives the total number of times the max + frequency is throttled to lower frequency in turbo (at and above + nominal frequency) range of frequencies. + + - sub_turbo_stat : This file gives the total number of times the + max frequency is throttled to lower frequency in sub-turbo(below + nominal frequency) range of frequencies. + + - unthrottle : This file gives the total number of times the max + frequency is unthrottled after being throttled. + + - powercap : This file gives the total number of times the max + frequency is throttled due to 'Power Capping'. + + - overtemp : This file gives the total number of times the max + frequency is throttled due to 'CPU Over Temperature'. + + - supply_fault : This file gives the total number of times the + max frequency is throttled due to 'Power Supply Failure'. + + - overcurrent : This file gives the total number of times the + max frequency is throttled due to 'Overcurrent'. + + - occ_reset : This file gives the total number of times the max + frequency is throttled due to 'OCC Reset'. + + The sysfs attributes representing different throttle reasons like + powercap, overtemp, supply_fault, overcurrent and occ_reset map to + the reasons provided by OCC firmware for throttling the frequency. + +What: /sys/devices/system/cpu/cpufreq/policyX/throttle_stats + /sys/devices/system/cpu/cpufreq/policyX/throttle_stats/turbo_stat + /sys/devices/system/cpu/cpufreq/policyX/throttle_stats/sub_turbo_stat + /sys/devices/system/cpu/cpufreq/policyX/throttle_stats/unthrottle + /sys/devices/system/cpu/cpufreq/policyX/throttle_stats/powercap + /sys/devices/system/cpu/cpufreq/policyX/throttle_stats/overtemp + /sys/devices/system/cpu/cpufreq/policyX/throttle_stats/supply_fault + /sys/devices/system/cpu/cpufreq/policyX/throttle_stats/overcurrent + /sys/devices/system/cpu/cpufreq/policyX/throttle_stats/occ_reset +Date: March 2016 +Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org> + Linux for PowerPC mailing list <linuxppc-dev@ozlabs.org> +Description: POWERNV CPUFreq driver's frequency throttle stats directory and + attributes + + 'policyX/throttle_stats' directory and all the attributes are same as + the /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats directory and + attributes which give the frequency throttle information of the chip. diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c index 59a7b380fbe2..fb5712141040 100644 --- a/drivers/cpufreq/acpi-cpufreq.c +++ b/drivers/cpufreq/acpi-cpufreq.c @@ -245,7 +245,7 @@ static unsigned extract_freq(u32 val, struct acpi_cpufreq_data *data) } } -u32 cpu_freq_read_intel(struct acpi_pct_register *not_used) +static u32 cpu_freq_read_intel(struct acpi_pct_register *not_used) { u32 val, dummy; @@ -253,7 +253,7 @@ u32 cpu_freq_read_intel(struct acpi_pct_register *not_used) return val; } -void cpu_freq_write_intel(struct acpi_pct_register *not_used, u32 val) +static void cpu_freq_write_intel(struct acpi_pct_register *not_used, u32 val) { u32 lo, hi; @@ -262,7 +262,7 @@ void cpu_freq_write_intel(struct acpi_pct_register *not_used, u32 val) wrmsr(MSR_IA32_PERF_CTL, lo, hi); } -u32 cpu_freq_read_amd(struct acpi_pct_register *not_used) +static u32 cpu_freq_read_amd(struct acpi_pct_register *not_used) { u32 val, dummy; @@ -270,12 +270,12 @@ u32 cpu_freq_read_amd(struct acpi_pct_register *not_used) return val; } -void cpu_freq_write_amd(struct acpi_pct_register *not_used, u32 val) +static void cpu_freq_write_amd(struct acpi_pct_register *not_used, u32 val) { wrmsr(MSR_AMD_PERF_CTL, val, 0); } -u32 cpu_freq_read_io(struct acpi_pct_register *reg) +static u32 cpu_freq_read_io(struct acpi_pct_register *reg) { u32 val; @@ -283,7 +283,7 @@ u32 cpu_freq_read_io(struct acpi_pct_register *reg) return val; } -void cpu_freq_write_io(struct acpi_pct_register *reg, u32 val) +static void cpu_freq_write_io(struct acpi_pct_register *reg, u32 val) { acpi_os_write_port(reg->address, val, reg->bit_width); } @@ -514,8 +514,10 @@ static int boost_notify(struct notifier_block *nb, unsigned long action, */ switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: + case CPU_DOWN_FAILED: + case CPU_DOWN_FAILED_FROZEN: + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: boost_set_msrs(acpi_cpufreq_driver.boost_enabled, cpumask); break; diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 4c7825856eab..b87596b591b3 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -76,6 +76,7 @@ static inline bool has_target(void) /* internal prototypes */ static int cpufreq_governor(struct cpufreq_policy *policy, unsigned int event); static unsigned int __cpufreq_get(struct cpufreq_policy *policy); +static int cpufreq_start_governor(struct cpufreq_policy *policy); /** * Two notifier lists: the "policy" list is involved in the @@ -964,10 +965,7 @@ static int cpufreq_add_policy_cpu(struct cpufreq_policy *policy, unsigned int cp cpumask_set_cpu(cpu, policy->cpus); if (has_target()) { - ret = cpufreq_governor(policy, CPUFREQ_GOV_START); - if (!ret) - ret = cpufreq_governor(policy, CPUFREQ_GOV_LIMITS); - + ret = cpufreq_start_governor(policy); if (ret) pr_err("%s: Failed to start governor\n", __func__); } @@ -1308,10 +1306,7 @@ static void cpufreq_offline(unsigned int cpu) /* Start governor again for active policy */ if (!policy_is_inactive(policy)) { if (has_target()) { - ret = cpufreq_governor(policy, CPUFREQ_GOV_START); - if (!ret) - ret = cpufreq_governor(policy, CPUFREQ_GOV_LIMITS); - + ret = cpufreq_start_governor(policy); if (ret) pr_err("%s: Failed to start governor\n", __func__); } @@ -1401,9 +1396,17 @@ unsigned int cpufreq_quick_get(unsigned int cpu) { struct cpufreq_policy *policy; unsigned int ret_freq = 0; + unsigned long flags; - if (cpufreq_driver && cpufreq_driver->setpolicy && cpufreq_driver->get) - return cpufreq_driver->get(cpu); + read_lock_irqsave(&cpufreq_driver_lock, flags); + + if (cpufreq_driver && cpufreq_driver->setpolicy && cpufreq_driver->get) { + ret_freq = cpufreq_driver->get(cpu); + read_unlock_irqrestore(&cpufreq_driver_lock, flags); + return ret_freq; + } + + read_unlock_irqrestore(&cpufreq_driver_lock, flags); policy = cpufreq_cpu_get(cpu); if (policy) { @@ -1484,6 +1487,24 @@ unsigned int cpufreq_get(unsigned int cpu) } EXPORT_SYMBOL(cpufreq_get); +static unsigned int cpufreq_update_current_freq(struct cpufreq_policy *policy) +{ + unsigned int new_freq; + + new_freq = cpufreq_driver->get(policy->cpu); + if (!new_freq) + return 0; + + if (!policy->cur) { + pr_debug("cpufreq: Driver did not initialize current freq\n"); + policy->cur = new_freq; + } else if (policy->cur != new_freq && has_target()) { + cpufreq_out_of_sync(policy, new_freq); + } + + return new_freq; +} + static struct subsys_interface cpufreq_interface = { .name = "cpufreq", .subsys = &cpu_subsys, @@ -1583,9 +1604,7 @@ void cpufreq_resume(void) policy); } else { down_write(&policy->rwsem); - ret = cpufreq_governor(policy, CPUFREQ_GOV_START); - if (!ret) - cpufreq_governor(policy, CPUFREQ_GOV_LIMITS); + ret = cpufreq_start_governor(policy); up_write(&policy->rwsem); if (ret) @@ -1593,17 +1612,6 @@ void cpufreq_resume(void) __func__, policy); } } - - /* - * schedule call cpufreq_update_policy() for first-online CPU, as that - * wouldn't be hotplugged-out on suspend. It will verify that the - * current freq is in sync with what we believe it to be. - */ - policy = cpufreq_cpu_get_raw(cpumask_first(cpu_online_mask)); - if (WARN_ON(!policy)) - return; - - schedule_work(&policy->update); } /** @@ -1927,6 +1935,17 @@ static int cpufreq_governor(struct cpufreq_policy *policy, unsigned int event) return ret; } +static int cpufreq_start_governor(struct cpufreq_policy *policy) +{ + int ret; + + if (cpufreq_driver->get && !cpufreq_driver->setpolicy) + cpufreq_update_current_freq(policy); + + ret = cpufreq_governor(policy, CPUFREQ_GOV_START); + return ret ? ret : cpufreq_governor(policy, CPUFREQ_GOV_LIMITS); +} + int cpufreq_register_governor(struct cpufreq_governor *governor) { int err; @@ -2063,8 +2082,10 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy, return cpufreq_driver->setpolicy(new_policy); } - if (new_policy->governor == policy->governor) - goto out; + if (new_policy->governor == policy->governor) { + pr_debug("cpufreq: governor limits update\n"); + return cpufreq_governor(policy, CPUFREQ_GOV_LIMITS); + } pr_debug("governor switch\n"); @@ -2092,10 +2113,11 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy, policy->governor = new_policy->governor; ret = cpufreq_governor(policy, CPUFREQ_GOV_POLICY_INIT); if (!ret) { - ret = cpufreq_governor(policy, CPUFREQ_GOV_START); - if (!ret) - goto out; - + ret = cpufreq_start_governor(policy); + if (!ret) { + pr_debug("cpufreq: governor change\n"); + return 0; + } cpufreq_governor(policy, CPUFREQ_GOV_POLICY_EXIT); } @@ -2106,14 +2128,10 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy, if (cpufreq_governor(policy, CPUFREQ_GOV_POLICY_INIT)) policy->governor = NULL; else - cpufreq_governor(policy, CPUFREQ_GOV_START); + cpufreq_start_governor(policy); } return ret; - - out: - pr_debug("governor: change or update limits\n"); - return cpufreq_governor(policy, CPUFREQ_GOV_LIMITS); } /** @@ -2144,19 +2162,11 @@ int cpufreq_update_policy(unsigned int cpu) * -> ask driver for current freq and notify governors about a change */ if (cpufreq_driver->get && !cpufreq_driver->setpolicy) { - new_policy.cur = cpufreq_driver->get(cpu); + new_policy.cur = cpufreq_update_current_freq(policy); if (WARN_ON(!new_policy.cur)) { ret = -EIO; goto unlock; } - - if (!policy->cur) { - pr_debug("Driver did not initialize current freq\n"); - policy->cur = new_policy.cur; - } else { - if (policy->cur != new_policy.cur && has_target()) - cpufreq_out_of_sync(policy, new_policy.cur); - } } ret = cpufreq_set_policy(policy, &new_policy); diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index 1c25ef405616..10a5cfeae8c5 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -329,7 +329,7 @@ static void dbs_irq_work(struct irq_work *irq_work) struct policy_dbs_info *policy_dbs; policy_dbs = container_of(irq_work, struct policy_dbs_info, irq_work); - schedule_work(&policy_dbs->work); + schedule_work_on(smp_processor_id(), &policy_dbs->work); } static void dbs_update_util_handler(struct update_util_data *data, u64 time, diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index cb5607495816..4b644526fd59 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -134,7 +134,7 @@ struct pstate_funcs { int (*get_min)(void); int (*get_turbo)(void); int (*get_scaling)(void); - void (*set)(struct cpudata*, int pstate); + u64 (*get_val)(struct cpudata*, int pstate); void (*get_vid)(struct cpudata *); int32_t (*get_target_pstate)(struct cpudata *); }; @@ -565,7 +565,7 @@ static int atom_get_turbo_pstate(void) return value & 0x7F; } -static void atom_set_pstate(struct cpudata *cpudata, int pstate) +static u64 atom_get_val(struct cpudata *cpudata, int pstate) { u64 val; int32_t vid_fp; @@ -585,9 +585,7 @@ static void atom_set_pstate(struct cpudata *cpudata, int pstate) if (pstate > cpudata->pstate.max_pstate) vid = cpudata->vid.turbo; - val |= vid; - - wrmsrl_on_cpu(cpudata->cpu, MSR_IA32_PERF_CTL, val); + return val | vid; } static int silvermont_get_scaling(void) @@ -711,7 +709,7 @@ static inline int core_get_scaling(void) return 100000; } -static void core_set_pstate(struct cpudata *cpudata, int pstate) +static u64 core_get_val(struct cpudata *cpudata, int pstate) { u64 val; @@ -719,7 +717,7 @@ static void core_set_pstate(struct cpudata *cpudata, int pstate) if (limits->no_turbo && !limits->turbo_disabled) val |= (u64)1 << 32; - wrmsrl(MSR_IA32_PERF_CTL, val); + return val; } static int knl_get_turbo_pstate(void) @@ -750,7 +748,7 @@ static struct cpu_defaults core_params = { .get_min = core_get_min_pstate, .get_turbo = core_get_turbo_pstate, .get_scaling = core_get_scaling, - .set = core_set_pstate, + .get_val = core_get_val, .get_target_pstate = get_target_pstate_use_performance, }, }; @@ -769,7 +767,7 @@ static struct cpu_defaults silvermont_params = { .get_max_physical = atom_get_max_pstate, .get_min = atom_get_min_pstate, .get_turbo = atom_get_turbo_pstate, - .set = atom_set_pstate, + .get_val = atom_get_val, .get_scaling = silvermont_get_scaling, .get_vid = atom_get_vid, .get_target_pstate = get_target_pstate_use_cpu_load, @@ -790,7 +788,7 @@ static struct cpu_defaults airmont_params = { .get_max_physical = atom_get_max_pstate, .get_min = atom_get_min_pstate, .get_turbo = atom_get_turbo_pstate, - .set = atom_set_pstate, + .get_val = atom_get_val, .get_scaling = airmont_get_scaling, .get_vid = atom_get_vid, .get_target_pstate = get_target_pstate_use_cpu_load, @@ -812,7 +810,7 @@ static struct cpu_defaults knl_params = { .get_min = core_get_min_pstate, .get_turbo = knl_get_turbo_pstate, .get_scaling = core_get_scaling, - .set = core_set_pstate, + .get_val = core_get_val, .get_target_pstate = get_target_pstate_use_performance, }, }; @@ -839,25 +837,24 @@ static void intel_pstate_get_min_max(struct cpudata *cpu, int *min, int *max) *min = clamp_t(int, min_perf, cpu->pstate.min_pstate, max_perf); } -static void intel_pstate_set_pstate(struct cpudata *cpu, int pstate, bool force) +static inline void intel_pstate_record_pstate(struct cpudata *cpu, int pstate) { - int max_perf, min_perf; - - if (force) { - update_turbo_state(); - - intel_pstate_get_min_max(cpu, &min_perf, &max_perf); - - pstate = clamp_t(int, pstate, min_perf, max_perf); - - if (pstate == cpu->pstate.current_pstate) - return; - } trace_cpu_frequency(pstate * cpu->pstate.scaling, cpu->cpu); - cpu->pstate.current_pstate = pstate; +} - pstate_funcs.set(cpu, pstate); +static void intel_pstate_set_min_pstate(struct cpudata *cpu) +{ + int pstate = cpu->pstate.min_pstate; + + intel_pstate_record_pstate(cpu, pstate); + /* + * Generally, there is no guarantee that this code will always run on + * the CPU being updated, so force the register update to run on the + * right CPU. + */ + wrmsrl_on_cpu(cpu->cpu, MSR_IA32_PERF_CTL, + pstate_funcs.get_val(cpu, pstate)); } static void intel_pstate_get_cpu_pstates(struct cpudata *cpu) @@ -870,7 +867,8 @@ static void intel_pstate_get_cpu_pstates(struct cpudata *cpu) if (pstate_funcs.get_vid) pstate_funcs.get_vid(cpu); - intel_pstate_set_pstate(cpu, cpu->pstate.min_pstate, false); + + intel_pstate_set_min_pstate(cpu); } static inline void intel_pstate_calc_busy(struct cpudata *cpu) @@ -997,6 +995,21 @@ static inline int32_t get_target_pstate_use_performance(struct cpudata *cpu) return cpu->pstate.current_pstate - pid_calc(&cpu->pid, core_busy); } +static inline void intel_pstate_update_pstate(struct cpudata *cpu, int pstate) +{ + int max_perf, min_perf; + + update_turbo_state(); + + intel_pstate_get_min_max(cpu, &min_perf, &max_perf); + pstate = clamp_t(int, pstate, min_perf, max_perf); + if (pstate == cpu->pstate.current_pstate) + return; + + intel_pstate_record_pstate(cpu, pstate); + wrmsrl(MSR_IA32_PERF_CTL, pstate_funcs.get_val(cpu, pstate)); +} + static inline void intel_pstate_adjust_busy_pstate(struct cpudata *cpu) { int from, target_pstate; @@ -1006,7 +1019,7 @@ static inline void intel_pstate_adjust_busy_pstate(struct cpudata *cpu) target_pstate = pstate_funcs.get_target_pstate(cpu); - intel_pstate_set_pstate(cpu, target_pstate, true); + intel_pstate_update_pstate(cpu, target_pstate); sample = &cpu->sample; trace_pstate_sample(fp_toint(sample->core_pct_busy), @@ -1180,7 +1193,7 @@ static void intel_pstate_stop_cpu(struct cpufreq_policy *policy) if (hwp_active) return; - intel_pstate_set_pstate(cpu, cpu->pstate.min_pstate, false); + intel_pstate_set_min_pstate(cpu); } static int intel_pstate_cpu_init(struct cpufreq_policy *policy) @@ -1255,7 +1268,7 @@ static void copy_cpu_funcs(struct pstate_funcs *funcs) pstate_funcs.get_min = funcs->get_min; pstate_funcs.get_turbo = funcs->get_turbo; pstate_funcs.get_scaling = funcs->get_scaling; - pstate_funcs.set = funcs->set; + pstate_funcs.get_val = funcs->get_val; pstate_funcs.get_vid = funcs->get_vid; pstate_funcs.get_target_pstate = funcs->get_target_pstate; diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index 50bf12033bbc..39ac78c94be0 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -44,7 +44,6 @@ static struct cpufreq_frequency_table powernv_freqs[POWERNV_MAX_PSTATES+1]; static bool rebooting, throttled, occ_reset; -static unsigned int *core_to_chip_map; static const char * const throttle_reason[] = { "No throttling", @@ -55,6 +54,16 @@ static const char * const throttle_reason[] = { "OCC Reset" }; +enum throttle_reason_type { + NO_THROTTLE = 0, + POWERCAP, + CPU_OVERTEMP, + POWER_SUPPLY_FAILURE, + OVERCURRENT, + OCC_RESET_THROTTLE, + OCC_MAX_REASON +}; + static struct chip { unsigned int id; bool throttled; @@ -62,9 +71,13 @@ static struct chip { u8 throttle_reason; cpumask_t mask; struct work_struct throttle; + int throttle_turbo; + int throttle_sub_turbo; + int reason[OCC_MAX_REASON]; } *chips; static int nr_chips; +static DEFINE_PER_CPU(struct chip *, chip_info); /* * Note: The set of pstates consists of contiguous integers, the @@ -196,6 +209,42 @@ static struct freq_attr *powernv_cpu_freq_attr[] = { NULL, }; +#define throttle_attr(name, member) \ +static ssize_t name##_show(struct cpufreq_policy *policy, char *buf) \ +{ \ + struct chip *chip = per_cpu(chip_info, policy->cpu); \ + \ + return sprintf(buf, "%u\n", chip->member); \ +} \ + \ +static struct freq_attr throttle_attr_##name = __ATTR_RO(name) \ + +throttle_attr(unthrottle, reason[NO_THROTTLE]); +throttle_attr(powercap, reason[POWERCAP]); +throttle_attr(overtemp, reason[CPU_OVERTEMP]); +throttle_attr(supply_fault, reason[POWER_SUPPLY_FAILURE]); +throttle_attr(overcurrent, reason[OVERCURRENT]); +throttle_attr(occ_reset, reason[OCC_RESET_THROTTLE]); +throttle_attr(turbo_stat, throttle_turbo); +throttle_attr(sub_turbo_stat, throttle_sub_turbo); + +static struct attribute *throttle_attrs[] = { + &throttle_attr_unthrottle.attr, + &throttle_attr_powercap.attr, + &throttle_attr_overtemp.attr, + &throttle_attr_supply_fault.attr, + &throttle_attr_overcurrent.attr, + &throttle_attr_occ_reset.attr, + &throttle_attr_turbo_stat.attr, + &throttle_attr_sub_turbo_stat.attr, + NULL, +}; + +static const struct attribute_group throttle_attr_grp = { + .name = "throttle_stats", + .attrs = throttle_attrs, +}; + /* Helper routines */ /* Access helpers to power mgt SPR */ @@ -324,34 +373,35 @@ static inline unsigned int get_nominal_index(void) static void powernv_cpufreq_throttle_check(void *data) { + struct chip *chip; unsigned int cpu = smp_processor_id(); - unsigned int chip_id = core_to_chip_map[cpu_core_index_of_thread(cpu)]; unsigned long pmsr; - int pmsr_pmax, i; + int pmsr_pmax; pmsr = get_pmspr(SPRN_PMSR); - - for (i = 0; i < nr_chips; i++) - if (chips[i].id == chip_id) - break; + chip = this_cpu_read(chip_info); /* Check for Pmax Capping */ pmsr_pmax = (s8)PMSR_MAX(pmsr); if (pmsr_pmax != powernv_pstate_info.max) { - if (chips[i].throttled) + if (chip->throttled) goto next; - chips[i].throttled = true; - if (pmsr_pmax < powernv_pstate_info.nominal) + chip->throttled = true; + if (pmsr_pmax < powernv_pstate_info.nominal) { pr_warn_once("CPU %d on Chip %u has Pmax reduced below nominal frequency (%d < %d)\n", - cpu, chips[i].id, pmsr_pmax, + cpu, chip->id, pmsr_pmax, powernv_pstate_info.nominal); - trace_powernv_throttle(chips[i].id, - throttle_reason[chips[i].throttle_reason], + chip->throttle_sub_turbo++; + } else { + chip->throttle_turbo++; + } + trace_powernv_throttle(chip->id, + throttle_reason[chip->throttle_reason], pmsr_pmax); - } else if (chips[i].throttled) { - chips[i].throttled = false; - trace_powernv_throttle(chips[i].id, - throttle_reason[chips[i].throttle_reason], + } else if (chip->throttled) { + chip->throttled = false; + trace_powernv_throttle(chip->id, + throttle_reason[chip->throttle_reason], pmsr_pmax); } @@ -411,6 +461,21 @@ static int powernv_cpufreq_cpu_init(struct cpufreq_policy *policy) for (i = 0; i < threads_per_core; i++) cpumask_set_cpu(base + i, policy->cpus); + if (!policy->driver_data) { + int ret; + + ret = sysfs_create_group(&policy->kobj, &throttle_attr_grp); + if (ret) { + pr_info("Failed to create throttle stats directory for cpu %d\n", + policy->cpu); + return ret; + } + /* + * policy->driver_data is used as a flag for one-time + * creation of throttle sysfs files. + */ + policy->driver_data = policy; + } return cpufreq_table_validate_and_show(policy, powernv_freqs); } @@ -517,8 +582,10 @@ static int powernv_cpufreq_occ_msg(struct notifier_block *nb, break; if (omsg.throttle_status >= 0 && - omsg.throttle_status <= OCC_MAX_THROTTLE_STATUS) + omsg.throttle_status <= OCC_MAX_THROTTLE_STATUS) { chips[i].throttle_reason = omsg.throttle_status; + chips[i].reason[omsg.throttle_status]++; + } if (!omsg.throttle_status) chips[i].restore = true; @@ -558,47 +625,34 @@ static int init_chip_info(void) unsigned int chip[256]; unsigned int cpu, i; unsigned int prev_chip_id = UINT_MAX; - cpumask_t cpu_mask; - int ret = -ENOMEM; - - core_to_chip_map = kcalloc(cpu_nr_cores(), sizeof(unsigned int), - GFP_KERNEL); - if (!core_to_chip_map) - goto out; - cpumask_copy(&cpu_mask, cpu_possible_mask); - for_each_cpu(cpu, &cpu_mask) { + for_each_possible_cpu(cpu) { unsigned int id = cpu_to_chip_id(cpu); if (prev_chip_id != id) { prev_chip_id = id; chip[nr_chips++] = id; } - core_to_chip_map[cpu_core_index_of_thread(cpu)] = id; - cpumask_andnot(&cpu_mask, &cpu_mask, cpu_sibling_mask(cpu)); } chips = kcalloc(nr_chips, sizeof(struct chip), GFP_KERNEL); if (!chips) - goto free_chip_map; + return -ENOMEM; for (i = 0; i < nr_chips; i++) { chips[i].id = chip[i]; cpumask_copy(&chips[i].mask, cpumask_of_node(chip[i])); INIT_WORK(&chips[i].throttle, powernv_cpufreq_work_fn); + for_each_cpu(cpu, &chips[i].mask) + per_cpu(chip_info, cpu) = &chips[i]; } return 0; -free_chip_map: - kfree(core_to_chip_map); -out: - return ret; } static inline void clean_chip_info(void) { kfree(chips); - kfree(core_to_chip_map); } static inline void unregister_all_notifiers(void) diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index 27fc733cb5b9..03d38c291de6 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -196,7 +196,7 @@ static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev); * of points is below a threshold. If it is... then use the * average of these 8 points as the estimated value. */ -static void get_typical_interval(struct menu_device *data) +static unsigned int get_typical_interval(struct menu_device *data) { int i, divisor; unsigned int max, thresh, avg; @@ -253,9 +253,7 @@ again: if (likely(variance <= U64_MAX/36)) { if ((((u64)avg*avg > variance*36) && (divisor * 4 >= INTERVALS * 3)) || variance <= 400) { - if (data->next_timer_us > avg) - data->predicted_us = avg; - return; + return avg; } } @@ -269,7 +267,7 @@ again: * with sporadic activity with a bunch of short pauses. */ if ((divisor * 4) <= INTERVALS * 3) - return; + return UINT_MAX; thresh = max - 1; goto again; @@ -286,6 +284,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev) int latency_req = pm_qos_request(PM_QOS_CPU_DMA_LATENCY); int i; unsigned int interactivity_req; + unsigned int expected_interval; unsigned long nr_iowaiters, cpu_load; if (data->needs_update) { @@ -312,32 +311,43 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev) data->correction_factor[data->bucket], RESOLUTION * DECAY); - get_typical_interval(data); - - /* - * Performance multiplier defines a minimum predicted idle - * duration / latency ratio. Adjust the latency limit if - * necessary. - */ - interactivity_req = data->predicted_us / performance_multiplier(nr_iowaiters, cpu_load); - if (latency_req > interactivity_req) - latency_req = interactivity_req; + expected_interval = get_typical_interval(data); + expected_interval = min(expected_interval, data->next_timer_us); if (CPUIDLE_DRIVER_STATE_START > 0) { - data->last_state_idx = CPUIDLE_DRIVER_STATE_START - 1; + struct cpuidle_state *s = &drv->states[CPUIDLE_DRIVER_STATE_START]; + unsigned int polling_threshold; + /* * We want to default to C1 (hlt), not to busy polling - * unless the timer is happening really really soon. + * unless the timer is happening really really soon, or + * C1's exit latency exceeds the user configured limit. */ - if (interactivity_req > 20 && - !drv->states[CPUIDLE_DRIVER_STATE_START].disabled && - dev->states_usage[CPUIDLE_DRIVER_STATE_START].disable == 0) + polling_threshold = max_t(unsigned int, 20, s->target_residency); + if (data->next_timer_us > polling_threshold && + latency_req > s->exit_latency && !s->disabled && + !dev->states_usage[CPUIDLE_DRIVER_STATE_START].disable) data->last_state_idx = CPUIDLE_DRIVER_STATE_START; + else + data->last_state_idx = CPUIDLE_DRIVER_STATE_START - 1; } else { data->last_state_idx = CPUIDLE_DRIVER_STATE_START; } /* + * Use the lowest expected idle interval to pick the idle state. + */ + data->predicted_us = min(data->predicted_us, expected_interval); + + /* + * Use the performance multiplier and the user-configurable + * latency_req to determine the maximum exit latency. + */ + interactivity_req = data->predicted_us / performance_multiplier(nr_iowaiters, cpu_load); + if (latency_req > interactivity_req) + latency_req = interactivity_req; + + /* * Find the idle state with the lowest power while satisfying * our constraints. */ diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index cd4510a63375..ba947df5a8c7 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -65,7 +65,7 @@ #include <asm/mwait.h> #include <asm/msr.h> -#define INTEL_IDLE_VERSION "0.4" +#define INTEL_IDLE_VERSION "0.4.1" #define PREFIX "intel_idle: " static struct cpuidle_driver intel_idle_driver = { @@ -716,6 +716,26 @@ static struct cpuidle_state avn_cstates[] = { { .enter = NULL } }; +static struct cpuidle_state knl_cstates[] = { + { + .name = "C1-KNL", + .desc = "MWAIT 0x00", + .flags = MWAIT2flg(0x00), + .exit_latency = 1, + .target_residency = 2, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze }, + { + .name = "C6-KNL", + .desc = "MWAIT 0x10", + .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 120, + .target_residency = 500, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze }, + { + .enter = NULL } +}; /** * intel_idle @@ -890,6 +910,10 @@ static const struct idle_cpu idle_cpu_avn = { .disable_promotion_to_c1e = true, }; +static const struct idle_cpu idle_cpu_knl = { + .state_table = knl_cstates, +}; + #define ICPU(model, cpu) \ { X86_VENDOR_INTEL, 6, model, X86_FEATURE_MWAIT, (unsigned long)&cpu } @@ -921,6 +945,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = { ICPU(0x56, idle_cpu_bdw), ICPU(0x4e, idle_cpu_skl), ICPU(0x5e, idle_cpu_skl), + ICPU(0x57, idle_cpu_knl), {} }; MODULE_DEVICE_TABLE(x86cpu, intel_idle_ids); @@ -994,36 +1019,92 @@ static void intel_idle_cpuidle_devices_uninit(void) } /* - * intel_idle_state_table_update() + * ivt_idle_state_table_update(void) * - * Update the default state_table for this CPU-id - * - * Currently used to access tuned IVT multi-socket targets + * Tune IVT multi-socket targets * Assumption: num_sockets == (max_package_num + 1) */ -void intel_idle_state_table_update(void) +static void ivt_idle_state_table_update(void) { /* IVT uses a different table for 1-2, 3-4, and > 4 sockets */ - if (boot_cpu_data.x86_model == 0x3e) { /* IVT */ - int cpu, package_num, num_sockets = 1; - - for_each_online_cpu(cpu) { - package_num = topology_physical_package_id(cpu); - if (package_num + 1 > num_sockets) { - num_sockets = package_num + 1; - - if (num_sockets > 4) { - cpuidle_state_table = ivt_cstates_8s; - return; - } + int cpu, package_num, num_sockets = 1; + + for_each_online_cpu(cpu) { + package_num = topology_physical_package_id(cpu); + if (package_num + 1 > num_sockets) { + num_sockets = package_num + 1; + + if (num_sockets > 4) { + cpuidle_state_table = ivt_cstates_8s; + return; } } + } + + if (num_sockets > 2) + cpuidle_state_table = ivt_cstates_4s; - if (num_sockets > 2) - cpuidle_state_table = ivt_cstates_4s; - /* else, 1 and 2 socket systems use default ivt_cstates */ + /* else, 1 and 2 socket systems use default ivt_cstates */ +} +/* + * sklh_idle_state_table_update(void) + * + * On SKL-H (model 0x5e) disable C8 and C9 if: + * C10 is enabled and SGX disabled + */ +static void sklh_idle_state_table_update(void) +{ + unsigned long long msr; + unsigned int eax, ebx, ecx, edx; + + + /* if PC10 disabled via cmdline intel_idle.max_cstate=7 or shallower */ + if (max_cstate <= 7) + return; + + /* if PC10 not present in CPUID.MWAIT.EDX */ + if ((mwait_substates & (0xF << 28)) == 0) + return; + + rdmsrl(MSR_NHM_SNB_PKG_CST_CFG_CTL, msr); + + /* PC10 is not enabled in PKG C-state limit */ + if ((msr & 0xF) != 8) + return; + + ecx = 0; + cpuid(7, &eax, &ebx, &ecx, &edx); + + /* if SGX is present */ + if (ebx & (1 << 2)) { + + rdmsrl(MSR_IA32_FEATURE_CONTROL, msr); + + /* if SGX is enabled */ + if (msr & (1 << 18)) + return; + } + + skl_cstates[5].disabled = 1; /* C8-SKL */ + skl_cstates[6].disabled = 1; /* C9-SKL */ +} +/* + * intel_idle_state_table_update() + * + * Update the default state_table for this CPU-id + */ + +static void intel_idle_state_table_update(void) +{ + switch (boot_cpu_data.x86_model) { + + case 0x3e: /* IVT */ + ivt_idle_state_table_update(); + break; + case 0x5e: /* SKL-H */ + sklh_idle_state_table_update(); + break; } - return; } /* @@ -1063,6 +1144,14 @@ static int __init intel_idle_cpuidle_driver_init(void) if (num_substates == 0) continue; + /* if state marked as disabled, skip it */ + if (cpuidle_state_table[cstate].disabled != 0) { + pr_debug(PREFIX "state %s is disabled", + cpuidle_state_table[cstate].name); + continue; + } + + if (((mwait_cstate + 1) > 2) && !boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) mark_tsc_unstable("TSC halts in idle" |