summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/ABI/testing/sysfs-devices-system-cpu69
-rw-r--r--drivers/cpufreq/acpi-cpufreq.c18
-rw-r--r--drivers/cpufreq/cpufreq.c98
-rw-r--r--drivers/cpufreq/cpufreq_governor.c2
-rw-r--r--drivers/cpufreq/intel_pstate.c73
-rw-r--r--drivers/cpufreq/powernv-cpufreq.c124
-rw-r--r--drivers/cpuidle/governors/menu.c50
-rw-r--r--drivers/idle/intel_idle.c133
8 files changed, 407 insertions, 160 deletions
diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu
index b683e8ee69ec..16501334b99f 100644
--- a/Documentation/ABI/testing/sysfs-devices-system-cpu
+++ b/Documentation/ABI/testing/sysfs-devices-system-cpu
@@ -271,3 +271,72 @@ Description: Parameters for the CPU cache attributes
- WriteBack: data is written only to the cache line and
the modified cache line is written to main
memory only when it is replaced
+
+What: /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats
+ /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/turbo_stat
+ /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/sub_turbo_stat
+ /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/unthrottle
+ /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/powercap
+ /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/overtemp
+ /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/supply_fault
+ /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/overcurrent
+ /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/occ_reset
+Date: March 2016
+Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org>
+ Linux for PowerPC mailing list <linuxppc-dev@ozlabs.org>
+Description: POWERNV CPUFreq driver's frequency throttle stats directory and
+ attributes
+
+ 'cpuX/cpufreq/throttle_stats' directory contains the CPU frequency
+ throttle stat attributes for the chip. The throttle stats of a cpu
+ is common across all the cpus belonging to a chip. Below are the
+ throttle attributes exported in the 'throttle_stats' directory:
+
+ - turbo_stat : This file gives the total number of times the max
+ frequency is throttled to lower frequency in turbo (at and above
+ nominal frequency) range of frequencies.
+
+ - sub_turbo_stat : This file gives the total number of times the
+ max frequency is throttled to lower frequency in sub-turbo(below
+ nominal frequency) range of frequencies.
+
+ - unthrottle : This file gives the total number of times the max
+ frequency is unthrottled after being throttled.
+
+ - powercap : This file gives the total number of times the max
+ frequency is throttled due to 'Power Capping'.
+
+ - overtemp : This file gives the total number of times the max
+ frequency is throttled due to 'CPU Over Temperature'.
+
+ - supply_fault : This file gives the total number of times the
+ max frequency is throttled due to 'Power Supply Failure'.
+
+ - overcurrent : This file gives the total number of times the
+ max frequency is throttled due to 'Overcurrent'.
+
+ - occ_reset : This file gives the total number of times the max
+ frequency is throttled due to 'OCC Reset'.
+
+ The sysfs attributes representing different throttle reasons like
+ powercap, overtemp, supply_fault, overcurrent and occ_reset map to
+ the reasons provided by OCC firmware for throttling the frequency.
+
+What: /sys/devices/system/cpu/cpufreq/policyX/throttle_stats
+ /sys/devices/system/cpu/cpufreq/policyX/throttle_stats/turbo_stat
+ /sys/devices/system/cpu/cpufreq/policyX/throttle_stats/sub_turbo_stat
+ /sys/devices/system/cpu/cpufreq/policyX/throttle_stats/unthrottle
+ /sys/devices/system/cpu/cpufreq/policyX/throttle_stats/powercap
+ /sys/devices/system/cpu/cpufreq/policyX/throttle_stats/overtemp
+ /sys/devices/system/cpu/cpufreq/policyX/throttle_stats/supply_fault
+ /sys/devices/system/cpu/cpufreq/policyX/throttle_stats/overcurrent
+ /sys/devices/system/cpu/cpufreq/policyX/throttle_stats/occ_reset
+Date: March 2016
+Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org>
+ Linux for PowerPC mailing list <linuxppc-dev@ozlabs.org>
+Description: POWERNV CPUFreq driver's frequency throttle stats directory and
+ attributes
+
+ 'policyX/throttle_stats' directory and all the attributes are same as
+ the /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats directory and
+ attributes which give the frequency throttle information of the chip.
diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c
index 59a7b380fbe2..fb5712141040 100644
--- a/drivers/cpufreq/acpi-cpufreq.c
+++ b/drivers/cpufreq/acpi-cpufreq.c
@@ -245,7 +245,7 @@ static unsigned extract_freq(u32 val, struct acpi_cpufreq_data *data)
}
}
-u32 cpu_freq_read_intel(struct acpi_pct_register *not_used)
+static u32 cpu_freq_read_intel(struct acpi_pct_register *not_used)
{
u32 val, dummy;
@@ -253,7 +253,7 @@ u32 cpu_freq_read_intel(struct acpi_pct_register *not_used)
return val;
}
-void cpu_freq_write_intel(struct acpi_pct_register *not_used, u32 val)
+static void cpu_freq_write_intel(struct acpi_pct_register *not_used, u32 val)
{
u32 lo, hi;
@@ -262,7 +262,7 @@ void cpu_freq_write_intel(struct acpi_pct_register *not_used, u32 val)
wrmsr(MSR_IA32_PERF_CTL, lo, hi);
}
-u32 cpu_freq_read_amd(struct acpi_pct_register *not_used)
+static u32 cpu_freq_read_amd(struct acpi_pct_register *not_used)
{
u32 val, dummy;
@@ -270,12 +270,12 @@ u32 cpu_freq_read_amd(struct acpi_pct_register *not_used)
return val;
}
-void cpu_freq_write_amd(struct acpi_pct_register *not_used, u32 val)
+static void cpu_freq_write_amd(struct acpi_pct_register *not_used, u32 val)
{
wrmsr(MSR_AMD_PERF_CTL, val, 0);
}
-u32 cpu_freq_read_io(struct acpi_pct_register *reg)
+static u32 cpu_freq_read_io(struct acpi_pct_register *reg)
{
u32 val;
@@ -283,7 +283,7 @@ u32 cpu_freq_read_io(struct acpi_pct_register *reg)
return val;
}
-void cpu_freq_write_io(struct acpi_pct_register *reg, u32 val)
+static void cpu_freq_write_io(struct acpi_pct_register *reg, u32 val)
{
acpi_os_write_port(reg->address, val, reg->bit_width);
}
@@ -514,8 +514,10 @@ static int boost_notify(struct notifier_block *nb, unsigned long action,
*/
switch (action) {
- case CPU_UP_PREPARE:
- case CPU_UP_PREPARE_FROZEN:
+ case CPU_DOWN_FAILED:
+ case CPU_DOWN_FAILED_FROZEN:
+ case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
boost_set_msrs(acpi_cpufreq_driver.boost_enabled, cpumask);
break;
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 4c7825856eab..b87596b591b3 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -76,6 +76,7 @@ static inline bool has_target(void)
/* internal prototypes */
static int cpufreq_governor(struct cpufreq_policy *policy, unsigned int event);
static unsigned int __cpufreq_get(struct cpufreq_policy *policy);
+static int cpufreq_start_governor(struct cpufreq_policy *policy);
/**
* Two notifier lists: the "policy" list is involved in the
@@ -964,10 +965,7 @@ static int cpufreq_add_policy_cpu(struct cpufreq_policy *policy, unsigned int cp
cpumask_set_cpu(cpu, policy->cpus);
if (has_target()) {
- ret = cpufreq_governor(policy, CPUFREQ_GOV_START);
- if (!ret)
- ret = cpufreq_governor(policy, CPUFREQ_GOV_LIMITS);
-
+ ret = cpufreq_start_governor(policy);
if (ret)
pr_err("%s: Failed to start governor\n", __func__);
}
@@ -1308,10 +1306,7 @@ static void cpufreq_offline(unsigned int cpu)
/* Start governor again for active policy */
if (!policy_is_inactive(policy)) {
if (has_target()) {
- ret = cpufreq_governor(policy, CPUFREQ_GOV_START);
- if (!ret)
- ret = cpufreq_governor(policy, CPUFREQ_GOV_LIMITS);
-
+ ret = cpufreq_start_governor(policy);
if (ret)
pr_err("%s: Failed to start governor\n", __func__);
}
@@ -1401,9 +1396,17 @@ unsigned int cpufreq_quick_get(unsigned int cpu)
{
struct cpufreq_policy *policy;
unsigned int ret_freq = 0;
+ unsigned long flags;
- if (cpufreq_driver && cpufreq_driver->setpolicy && cpufreq_driver->get)
- return cpufreq_driver->get(cpu);
+ read_lock_irqsave(&cpufreq_driver_lock, flags);
+
+ if (cpufreq_driver && cpufreq_driver->setpolicy && cpufreq_driver->get) {
+ ret_freq = cpufreq_driver->get(cpu);
+ read_unlock_irqrestore(&cpufreq_driver_lock, flags);
+ return ret_freq;
+ }
+
+ read_unlock_irqrestore(&cpufreq_driver_lock, flags);
policy = cpufreq_cpu_get(cpu);
if (policy) {
@@ -1484,6 +1487,24 @@ unsigned int cpufreq_get(unsigned int cpu)
}
EXPORT_SYMBOL(cpufreq_get);
+static unsigned int cpufreq_update_current_freq(struct cpufreq_policy *policy)
+{
+ unsigned int new_freq;
+
+ new_freq = cpufreq_driver->get(policy->cpu);
+ if (!new_freq)
+ return 0;
+
+ if (!policy->cur) {
+ pr_debug("cpufreq: Driver did not initialize current freq\n");
+ policy->cur = new_freq;
+ } else if (policy->cur != new_freq && has_target()) {
+ cpufreq_out_of_sync(policy, new_freq);
+ }
+
+ return new_freq;
+}
+
static struct subsys_interface cpufreq_interface = {
.name = "cpufreq",
.subsys = &cpu_subsys,
@@ -1583,9 +1604,7 @@ void cpufreq_resume(void)
policy);
} else {
down_write(&policy->rwsem);
- ret = cpufreq_governor(policy, CPUFREQ_GOV_START);
- if (!ret)
- cpufreq_governor(policy, CPUFREQ_GOV_LIMITS);
+ ret = cpufreq_start_governor(policy);
up_write(&policy->rwsem);
if (ret)
@@ -1593,17 +1612,6 @@ void cpufreq_resume(void)
__func__, policy);
}
}
-
- /*
- * schedule call cpufreq_update_policy() for first-online CPU, as that
- * wouldn't be hotplugged-out on suspend. It will verify that the
- * current freq is in sync with what we believe it to be.
- */
- policy = cpufreq_cpu_get_raw(cpumask_first(cpu_online_mask));
- if (WARN_ON(!policy))
- return;
-
- schedule_work(&policy->update);
}
/**
@@ -1927,6 +1935,17 @@ static int cpufreq_governor(struct cpufreq_policy *policy, unsigned int event)
return ret;
}
+static int cpufreq_start_governor(struct cpufreq_policy *policy)
+{
+ int ret;
+
+ if (cpufreq_driver->get && !cpufreq_driver->setpolicy)
+ cpufreq_update_current_freq(policy);
+
+ ret = cpufreq_governor(policy, CPUFREQ_GOV_START);
+ return ret ? ret : cpufreq_governor(policy, CPUFREQ_GOV_LIMITS);
+}
+
int cpufreq_register_governor(struct cpufreq_governor *governor)
{
int err;
@@ -2063,8 +2082,10 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy,
return cpufreq_driver->setpolicy(new_policy);
}
- if (new_policy->governor == policy->governor)
- goto out;
+ if (new_policy->governor == policy->governor) {
+ pr_debug("cpufreq: governor limits update\n");
+ return cpufreq_governor(policy, CPUFREQ_GOV_LIMITS);
+ }
pr_debug("governor switch\n");
@@ -2092,10 +2113,11 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy,
policy->governor = new_policy->governor;
ret = cpufreq_governor(policy, CPUFREQ_GOV_POLICY_INIT);
if (!ret) {
- ret = cpufreq_governor(policy, CPUFREQ_GOV_START);
- if (!ret)
- goto out;
-
+ ret = cpufreq_start_governor(policy);
+ if (!ret) {
+ pr_debug("cpufreq: governor change\n");
+ return 0;
+ }
cpufreq_governor(policy, CPUFREQ_GOV_POLICY_EXIT);
}
@@ -2106,14 +2128,10 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy,
if (cpufreq_governor(policy, CPUFREQ_GOV_POLICY_INIT))
policy->governor = NULL;
else
- cpufreq_governor(policy, CPUFREQ_GOV_START);
+ cpufreq_start_governor(policy);
}
return ret;
-
- out:
- pr_debug("governor: change or update limits\n");
- return cpufreq_governor(policy, CPUFREQ_GOV_LIMITS);
}
/**
@@ -2144,19 +2162,11 @@ int cpufreq_update_policy(unsigned int cpu)
* -> ask driver for current freq and notify governors about a change
*/
if (cpufreq_driver->get && !cpufreq_driver->setpolicy) {
- new_policy.cur = cpufreq_driver->get(cpu);
+ new_policy.cur = cpufreq_update_current_freq(policy);
if (WARN_ON(!new_policy.cur)) {
ret = -EIO;
goto unlock;
}
-
- if (!policy->cur) {
- pr_debug("Driver did not initialize current freq\n");
- policy->cur = new_policy.cur;
- } else {
- if (policy->cur != new_policy.cur && has_target())
- cpufreq_out_of_sync(policy, new_policy.cur);
- }
}
ret = cpufreq_set_policy(policy, &new_policy);
diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c
index 1c25ef405616..10a5cfeae8c5 100644
--- a/drivers/cpufreq/cpufreq_governor.c
+++ b/drivers/cpufreq/cpufreq_governor.c
@@ -329,7 +329,7 @@ static void dbs_irq_work(struct irq_work *irq_work)
struct policy_dbs_info *policy_dbs;
policy_dbs = container_of(irq_work, struct policy_dbs_info, irq_work);
- schedule_work(&policy_dbs->work);
+ schedule_work_on(smp_processor_id(), &policy_dbs->work);
}
static void dbs_update_util_handler(struct update_util_data *data, u64 time,
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index cb5607495816..4b644526fd59 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -134,7 +134,7 @@ struct pstate_funcs {
int (*get_min)(void);
int (*get_turbo)(void);
int (*get_scaling)(void);
- void (*set)(struct cpudata*, int pstate);
+ u64 (*get_val)(struct cpudata*, int pstate);
void (*get_vid)(struct cpudata *);
int32_t (*get_target_pstate)(struct cpudata *);
};
@@ -565,7 +565,7 @@ static int atom_get_turbo_pstate(void)
return value & 0x7F;
}
-static void atom_set_pstate(struct cpudata *cpudata, int pstate)
+static u64 atom_get_val(struct cpudata *cpudata, int pstate)
{
u64 val;
int32_t vid_fp;
@@ -585,9 +585,7 @@ static void atom_set_pstate(struct cpudata *cpudata, int pstate)
if (pstate > cpudata->pstate.max_pstate)
vid = cpudata->vid.turbo;
- val |= vid;
-
- wrmsrl_on_cpu(cpudata->cpu, MSR_IA32_PERF_CTL, val);
+ return val | vid;
}
static int silvermont_get_scaling(void)
@@ -711,7 +709,7 @@ static inline int core_get_scaling(void)
return 100000;
}
-static void core_set_pstate(struct cpudata *cpudata, int pstate)
+static u64 core_get_val(struct cpudata *cpudata, int pstate)
{
u64 val;
@@ -719,7 +717,7 @@ static void core_set_pstate(struct cpudata *cpudata, int pstate)
if (limits->no_turbo && !limits->turbo_disabled)
val |= (u64)1 << 32;
- wrmsrl(MSR_IA32_PERF_CTL, val);
+ return val;
}
static int knl_get_turbo_pstate(void)
@@ -750,7 +748,7 @@ static struct cpu_defaults core_params = {
.get_min = core_get_min_pstate,
.get_turbo = core_get_turbo_pstate,
.get_scaling = core_get_scaling,
- .set = core_set_pstate,
+ .get_val = core_get_val,
.get_target_pstate = get_target_pstate_use_performance,
},
};
@@ -769,7 +767,7 @@ static struct cpu_defaults silvermont_params = {
.get_max_physical = atom_get_max_pstate,
.get_min = atom_get_min_pstate,
.get_turbo = atom_get_turbo_pstate,
- .set = atom_set_pstate,
+ .get_val = atom_get_val,
.get_scaling = silvermont_get_scaling,
.get_vid = atom_get_vid,
.get_target_pstate = get_target_pstate_use_cpu_load,
@@ -790,7 +788,7 @@ static struct cpu_defaults airmont_params = {
.get_max_physical = atom_get_max_pstate,
.get_min = atom_get_min_pstate,
.get_turbo = atom_get_turbo_pstate,
- .set = atom_set_pstate,
+ .get_val = atom_get_val,
.get_scaling = airmont_get_scaling,
.get_vid = atom_get_vid,
.get_target_pstate = get_target_pstate_use_cpu_load,
@@ -812,7 +810,7 @@ static struct cpu_defaults knl_params = {
.get_min = core_get_min_pstate,
.get_turbo = knl_get_turbo_pstate,
.get_scaling = core_get_scaling,
- .set = core_set_pstate,
+ .get_val = core_get_val,
.get_target_pstate = get_target_pstate_use_performance,
},
};
@@ -839,25 +837,24 @@ static void intel_pstate_get_min_max(struct cpudata *cpu, int *min, int *max)
*min = clamp_t(int, min_perf, cpu->pstate.min_pstate, max_perf);
}
-static void intel_pstate_set_pstate(struct cpudata *cpu, int pstate, bool force)
+static inline void intel_pstate_record_pstate(struct cpudata *cpu, int pstate)
{
- int max_perf, min_perf;
-
- if (force) {
- update_turbo_state();
-
- intel_pstate_get_min_max(cpu, &min_perf, &max_perf);
-
- pstate = clamp_t(int, pstate, min_perf, max_perf);
-
- if (pstate == cpu->pstate.current_pstate)
- return;
- }
trace_cpu_frequency(pstate * cpu->pstate.scaling, cpu->cpu);
-
cpu->pstate.current_pstate = pstate;
+}
- pstate_funcs.set(cpu, pstate);
+static void intel_pstate_set_min_pstate(struct cpudata *cpu)
+{
+ int pstate = cpu->pstate.min_pstate;
+
+ intel_pstate_record_pstate(cpu, pstate);
+ /*
+ * Generally, there is no guarantee that this code will always run on
+ * the CPU being updated, so force the register update to run on the
+ * right CPU.
+ */
+ wrmsrl_on_cpu(cpu->cpu, MSR_IA32_PERF_CTL,
+ pstate_funcs.get_val(cpu, pstate));
}
static void intel_pstate_get_cpu_pstates(struct cpudata *cpu)
@@ -870,7 +867,8 @@ static void intel_pstate_get_cpu_pstates(struct cpudata *cpu)
if (pstate_funcs.get_vid)
pstate_funcs.get_vid(cpu);
- intel_pstate_set_pstate(cpu, cpu->pstate.min_pstate, false);
+
+ intel_pstate_set_min_pstate(cpu);
}
static inline void intel_pstate_calc_busy(struct cpudata *cpu)
@@ -997,6 +995,21 @@ static inline int32_t get_target_pstate_use_performance(struct cpudata *cpu)
return cpu->pstate.current_pstate - pid_calc(&cpu->pid, core_busy);
}
+static inline void intel_pstate_update_pstate(struct cpudata *cpu, int pstate)
+{
+ int max_perf, min_perf;
+
+ update_turbo_state();
+
+ intel_pstate_get_min_max(cpu, &min_perf, &max_perf);
+ pstate = clamp_t(int, pstate, min_perf, max_perf);
+ if (pstate == cpu->pstate.current_pstate)
+ return;
+
+ intel_pstate_record_pstate(cpu, pstate);
+ wrmsrl(MSR_IA32_PERF_CTL, pstate_funcs.get_val(cpu, pstate));
+}
+
static inline void intel_pstate_adjust_busy_pstate(struct cpudata *cpu)
{
int from, target_pstate;
@@ -1006,7 +1019,7 @@ static inline void intel_pstate_adjust_busy_pstate(struct cpudata *cpu)
target_pstate = pstate_funcs.get_target_pstate(cpu);
- intel_pstate_set_pstate(cpu, target_pstate, true);
+ intel_pstate_update_pstate(cpu, target_pstate);
sample = &cpu->sample;
trace_pstate_sample(fp_toint(sample->core_pct_busy),
@@ -1180,7 +1193,7 @@ static void intel_pstate_stop_cpu(struct cpufreq_policy *policy)
if (hwp_active)
return;
- intel_pstate_set_pstate(cpu, cpu->pstate.min_pstate, false);
+ intel_pstate_set_min_pstate(cpu);
}
static int intel_pstate_cpu_init(struct cpufreq_policy *policy)
@@ -1255,7 +1268,7 @@ static void copy_cpu_funcs(struct pstate_funcs *funcs)
pstate_funcs.get_min = funcs->get_min;
pstate_funcs.get_turbo = funcs->get_turbo;
pstate_funcs.get_scaling = funcs->get_scaling;
- pstate_funcs.set = funcs->set;
+ pstate_funcs.get_val = funcs->get_val;
pstate_funcs.get_vid = funcs->get_vid;
pstate_funcs.get_target_pstate = funcs->get_target_pstate;
diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c
index 50bf12033bbc..39ac78c94be0 100644
--- a/drivers/cpufreq/powernv-cpufreq.c
+++ b/drivers/cpufreq/powernv-cpufreq.c
@@ -44,7 +44,6 @@
static struct cpufreq_frequency_table powernv_freqs[POWERNV_MAX_PSTATES+1];
static bool rebooting, throttled, occ_reset;
-static unsigned int *core_to_chip_map;
static const char * const throttle_reason[] = {
"No throttling",
@@ -55,6 +54,16 @@ static const char * const throttle_reason[] = {
"OCC Reset"
};
+enum throttle_reason_type {
+ NO_THROTTLE = 0,
+ POWERCAP,
+ CPU_OVERTEMP,
+ POWER_SUPPLY_FAILURE,
+ OVERCURRENT,
+ OCC_RESET_THROTTLE,
+ OCC_MAX_REASON
+};
+
static struct chip {
unsigned int id;
bool throttled;
@@ -62,9 +71,13 @@ static struct chip {
u8 throttle_reason;
cpumask_t mask;
struct work_struct throttle;
+ int throttle_turbo;
+ int throttle_sub_turbo;
+ int reason[OCC_MAX_REASON];
} *chips;
static int nr_chips;
+static DEFINE_PER_CPU(struct chip *, chip_info);
/*
* Note: The set of pstates consists of contiguous integers, the
@@ -196,6 +209,42 @@ static struct freq_attr *powernv_cpu_freq_attr[] = {
NULL,
};
+#define throttle_attr(name, member) \
+static ssize_t name##_show(struct cpufreq_policy *policy, char *buf) \
+{ \
+ struct chip *chip = per_cpu(chip_info, policy->cpu); \
+ \
+ return sprintf(buf, "%u\n", chip->member); \
+} \
+ \
+static struct freq_attr throttle_attr_##name = __ATTR_RO(name) \
+
+throttle_attr(unthrottle, reason[NO_THROTTLE]);
+throttle_attr(powercap, reason[POWERCAP]);
+throttle_attr(overtemp, reason[CPU_OVERTEMP]);
+throttle_attr(supply_fault, reason[POWER_SUPPLY_FAILURE]);
+throttle_attr(overcurrent, reason[OVERCURRENT]);
+throttle_attr(occ_reset, reason[OCC_RESET_THROTTLE]);
+throttle_attr(turbo_stat, throttle_turbo);
+throttle_attr(sub_turbo_stat, throttle_sub_turbo);
+
+static struct attribute *throttle_attrs[] = {
+ &throttle_attr_unthrottle.attr,
+ &throttle_attr_powercap.attr,
+ &throttle_attr_overtemp.attr,
+ &throttle_attr_supply_fault.attr,
+ &throttle_attr_overcurrent.attr,
+ &throttle_attr_occ_reset.attr,
+ &throttle_attr_turbo_stat.attr,
+ &throttle_attr_sub_turbo_stat.attr,
+ NULL,
+};
+
+static const struct attribute_group throttle_attr_grp = {
+ .name = "throttle_stats",
+ .attrs = throttle_attrs,
+};
+
/* Helper routines */
/* Access helpers to power mgt SPR */
@@ -324,34 +373,35 @@ static inline unsigned int get_nominal_index(void)
static void powernv_cpufreq_throttle_check(void *data)
{
+ struct chip *chip;
unsigned int cpu = smp_processor_id();
- unsigned int chip_id = core_to_chip_map[cpu_core_index_of_thread(cpu)];
unsigned long pmsr;
- int pmsr_pmax, i;
+ int pmsr_pmax;
pmsr = get_pmspr(SPRN_PMSR);
-
- for (i = 0; i < nr_chips; i++)
- if (chips[i].id == chip_id)
- break;
+ chip = this_cpu_read(chip_info);
/* Check for Pmax Capping */
pmsr_pmax = (s8)PMSR_MAX(pmsr);
if (pmsr_pmax != powernv_pstate_info.max) {
- if (chips[i].throttled)
+ if (chip->throttled)
goto next;
- chips[i].throttled = true;
- if (pmsr_pmax < powernv_pstate_info.nominal)
+ chip->throttled = true;
+ if (pmsr_pmax < powernv_pstate_info.nominal) {
pr_warn_once("CPU %d on Chip %u has Pmax reduced below nominal frequency (%d < %d)\n",
- cpu, chips[i].id, pmsr_pmax,
+ cpu, chip->id, pmsr_pmax,
powernv_pstate_info.nominal);
- trace_powernv_throttle(chips[i].id,
- throttle_reason[chips[i].throttle_reason],
+ chip->throttle_sub_turbo++;
+ } else {
+ chip->throttle_turbo++;
+ }
+ trace_powernv_throttle(chip->id,
+ throttle_reason[chip->throttle_reason],
pmsr_pmax);
- } else if (chips[i].throttled) {
- chips[i].throttled = false;
- trace_powernv_throttle(chips[i].id,
- throttle_reason[chips[i].throttle_reason],
+ } else if (chip->throttled) {
+ chip->throttled = false;
+ trace_powernv_throttle(chip->id,
+ throttle_reason[chip->throttle_reason],
pmsr_pmax);
}
@@ -411,6 +461,21 @@ static int powernv_cpufreq_cpu_init(struct cpufreq_policy *policy)
for (i = 0; i < threads_per_core; i++)
cpumask_set_cpu(base + i, policy->cpus);
+ if (!policy->driver_data) {
+ int ret;
+
+ ret = sysfs_create_group(&policy->kobj, &throttle_attr_grp);
+ if (ret) {
+ pr_info("Failed to create throttle stats directory for cpu %d\n",
+ policy->cpu);
+ return ret;
+ }
+ /*
+ * policy->driver_data is used as a flag for one-time
+ * creation of throttle sysfs files.
+ */
+ policy->driver_data = policy;
+ }
return cpufreq_table_validate_and_show(policy, powernv_freqs);
}
@@ -517,8 +582,10 @@ static int powernv_cpufreq_occ_msg(struct notifier_block *nb,
break;
if (omsg.throttle_status >= 0 &&
- omsg.throttle_status <= OCC_MAX_THROTTLE_STATUS)
+ omsg.throttle_status <= OCC_MAX_THROTTLE_STATUS) {
chips[i].throttle_reason = omsg.throttle_status;
+ chips[i].reason[omsg.throttle_status]++;
+ }
if (!omsg.throttle_status)
chips[i].restore = true;
@@ -558,47 +625,34 @@ static int init_chip_info(void)
unsigned int chip[256];
unsigned int cpu, i;
unsigned int prev_chip_id = UINT_MAX;
- cpumask_t cpu_mask;
- int ret = -ENOMEM;
-
- core_to_chip_map = kcalloc(cpu_nr_cores(), sizeof(unsigned int),
- GFP_KERNEL);
- if (!core_to_chip_map)
- goto out;
- cpumask_copy(&cpu_mask, cpu_possible_mask);
- for_each_cpu(cpu, &cpu_mask) {
+ for_each_possible_cpu(cpu) {
unsigned int id = cpu_to_chip_id(cpu);
if (prev_chip_id != id) {
prev_chip_id = id;
chip[nr_chips++] = id;
}
- core_to_chip_map[cpu_core_index_of_thread(cpu)] = id;
- cpumask_andnot(&cpu_mask, &cpu_mask, cpu_sibling_mask(cpu));
}
chips = kcalloc(nr_chips, sizeof(struct chip), GFP_KERNEL);
if (!chips)
- goto free_chip_map;
+ return -ENOMEM;
for (i = 0; i < nr_chips; i++) {
chips[i].id = chip[i];
cpumask_copy(&chips[i].mask, cpumask_of_node(chip[i]));
INIT_WORK(&chips[i].throttle, powernv_cpufreq_work_fn);
+ for_each_cpu(cpu, &chips[i].mask)
+ per_cpu(chip_info, cpu) = &chips[i];
}
return 0;
-free_chip_map:
- kfree(core_to_chip_map);
-out:
- return ret;
}
static inline void clean_chip_info(void)
{
kfree(chips);
- kfree(core_to_chip_map);
}
static inline void unregister_all_notifiers(void)
diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index 27fc733cb5b9..03d38c291de6 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -196,7 +196,7 @@ static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev);
* of points is below a threshold. If it is... then use the
* average of these 8 points as the estimated value.
*/
-static void get_typical_interval(struct menu_device *data)
+static unsigned int get_typical_interval(struct menu_device *data)
{
int i, divisor;
unsigned int max, thresh, avg;
@@ -253,9 +253,7 @@ again:
if (likely(variance <= U64_MAX/36)) {
if ((((u64)avg*avg > variance*36) && (divisor * 4 >= INTERVALS * 3))
|| variance <= 400) {
- if (data->next_timer_us > avg)
- data->predicted_us = avg;
- return;
+ return avg;
}
}
@@ -269,7 +267,7 @@ again:
* with sporadic activity with a bunch of short pauses.
*/
if ((divisor * 4) <= INTERVALS * 3)
- return;
+ return UINT_MAX;
thresh = max - 1;
goto again;
@@ -286,6 +284,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
int latency_req = pm_qos_request(PM_QOS_CPU_DMA_LATENCY);
int i;
unsigned int interactivity_req;
+ unsigned int expected_interval;
unsigned long nr_iowaiters, cpu_load;
if (data->needs_update) {
@@ -312,32 +311,43 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
data->correction_factor[data->bucket],
RESOLUTION * DECAY);
- get_typical_interval(data);
-
- /*
- * Performance multiplier defines a minimum predicted idle
- * duration / latency ratio. Adjust the latency limit if
- * necessary.
- */
- interactivity_req = data->predicted_us / performance_multiplier(nr_iowaiters, cpu_load);
- if (latency_req > interactivity_req)
- latency_req = interactivity_req;
+ expected_interval = get_typical_interval(data);
+ expected_interval = min(expected_interval, data->next_timer_us);
if (CPUIDLE_DRIVER_STATE_START > 0) {
- data->last_state_idx = CPUIDLE_DRIVER_STATE_START - 1;
+ struct cpuidle_state *s = &drv->states[CPUIDLE_DRIVER_STATE_START];
+ unsigned int polling_threshold;
+
/*
* We want to default to C1 (hlt), not to busy polling
- * unless the timer is happening really really soon.
+ * unless the timer is happening really really soon, or
+ * C1's exit latency exceeds the user configured limit.
*/
- if (interactivity_req > 20 &&
- !drv->states[CPUIDLE_DRIVER_STATE_START].disabled &&
- dev->states_usage[CPUIDLE_DRIVER_STATE_START].disable == 0)
+ polling_threshold = max_t(unsigned int, 20, s->target_residency);
+ if (data->next_timer_us > polling_threshold &&
+ latency_req > s->exit_latency && !s->disabled &&
+ !dev->states_usage[CPUIDLE_DRIVER_STATE_START].disable)
data->last_state_idx = CPUIDLE_DRIVER_STATE_START;
+ else
+ data->last_state_idx = CPUIDLE_DRIVER_STATE_START - 1;
} else {
data->last_state_idx = CPUIDLE_DRIVER_STATE_START;
}
/*
+ * Use the lowest expected idle interval to pick the idle state.
+ */
+ data->predicted_us = min(data->predicted_us, expected_interval);
+
+ /*
+ * Use the performance multiplier and the user-configurable
+ * latency_req to determine the maximum exit latency.
+ */
+ interactivity_req = data->predicted_us / performance_multiplier(nr_iowaiters, cpu_load);
+ if (latency_req > interactivity_req)
+ latency_req = interactivity_req;
+
+ /*
* Find the idle state with the lowest power while satisfying
* our constraints.
*/
diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index cd4510a63375..ba947df5a8c7 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -65,7 +65,7 @@
#include <asm/mwait.h>
#include <asm/msr.h>
-#define INTEL_IDLE_VERSION "0.4"
+#define INTEL_IDLE_VERSION "0.4.1"
#define PREFIX "intel_idle: "
static struct cpuidle_driver intel_idle_driver = {
@@ -716,6 +716,26 @@ static struct cpuidle_state avn_cstates[] = {
{
.enter = NULL }
};
+static struct cpuidle_state knl_cstates[] = {
+ {
+ .name = "C1-KNL",
+ .desc = "MWAIT 0x00",
+ .flags = MWAIT2flg(0x00),
+ .exit_latency = 1,
+ .target_residency = 2,
+ .enter = &intel_idle,
+ .enter_freeze = intel_idle_freeze },
+ {
+ .name = "C6-KNL",
+ .desc = "MWAIT 0x10",
+ .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED,
+ .exit_latency = 120,
+ .target_residency = 500,
+ .enter = &intel_idle,
+ .enter_freeze = intel_idle_freeze },
+ {
+ .enter = NULL }
+};
/**
* intel_idle
@@ -890,6 +910,10 @@ static const struct idle_cpu idle_cpu_avn = {
.disable_promotion_to_c1e = true,
};
+static const struct idle_cpu idle_cpu_knl = {
+ .state_table = knl_cstates,
+};
+
#define ICPU(model, cpu) \
{ X86_VENDOR_INTEL, 6, model, X86_FEATURE_MWAIT, (unsigned long)&cpu }
@@ -921,6 +945,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = {
ICPU(0x56, idle_cpu_bdw),
ICPU(0x4e, idle_cpu_skl),
ICPU(0x5e, idle_cpu_skl),
+ ICPU(0x57, idle_cpu_knl),
{}
};
MODULE_DEVICE_TABLE(x86cpu, intel_idle_ids);
@@ -994,36 +1019,92 @@ static void intel_idle_cpuidle_devices_uninit(void)
}
/*
- * intel_idle_state_table_update()
+ * ivt_idle_state_table_update(void)
*
- * Update the default state_table for this CPU-id
- *
- * Currently used to access tuned IVT multi-socket targets
+ * Tune IVT multi-socket targets
* Assumption: num_sockets == (max_package_num + 1)
*/
-void intel_idle_state_table_update(void)
+static void ivt_idle_state_table_update(void)
{
/* IVT uses a different table for 1-2, 3-4, and > 4 sockets */
- if (boot_cpu_data.x86_model == 0x3e) { /* IVT */
- int cpu, package_num, num_sockets = 1;
-
- for_each_online_cpu(cpu) {
- package_num = topology_physical_package_id(cpu);
- if (package_num + 1 > num_sockets) {
- num_sockets = package_num + 1;
-
- if (num_sockets > 4) {
- cpuidle_state_table = ivt_cstates_8s;
- return;
- }
+ int cpu, package_num, num_sockets = 1;
+
+ for_each_online_cpu(cpu) {
+ package_num = topology_physical_package_id(cpu);
+ if (package_num + 1 > num_sockets) {
+ num_sockets = package_num + 1;
+
+ if (num_sockets > 4) {
+ cpuidle_state_table = ivt_cstates_8s;
+ return;
}
}
+ }
+
+ if (num_sockets > 2)
+ cpuidle_state_table = ivt_cstates_4s;
- if (num_sockets > 2)
- cpuidle_state_table = ivt_cstates_4s;
- /* else, 1 and 2 socket systems use default ivt_cstates */
+ /* else, 1 and 2 socket systems use default ivt_cstates */
+}
+/*
+ * sklh_idle_state_table_update(void)
+ *
+ * On SKL-H (model 0x5e) disable C8 and C9 if:
+ * C10 is enabled and SGX disabled
+ */
+static void sklh_idle_state_table_update(void)
+{
+ unsigned long long msr;
+ unsigned int eax, ebx, ecx, edx;
+
+
+ /* if PC10 disabled via cmdline intel_idle.max_cstate=7 or shallower */
+ if (max_cstate <= 7)
+ return;
+
+ /* if PC10 not present in CPUID.MWAIT.EDX */
+ if ((mwait_substates & (0xF << 28)) == 0)
+ return;
+
+ rdmsrl(MSR_NHM_SNB_PKG_CST_CFG_CTL, msr);
+
+ /* PC10 is not enabled in PKG C-state limit */
+ if ((msr & 0xF) != 8)
+ return;
+
+ ecx = 0;
+ cpuid(7, &eax, &ebx, &ecx, &edx);
+
+ /* if SGX is present */
+ if (ebx & (1 << 2)) {
+
+ rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
+
+ /* if SGX is enabled */
+ if (msr & (1 << 18))
+ return;
+ }
+
+ skl_cstates[5].disabled = 1; /* C8-SKL */
+ skl_cstates[6].disabled = 1; /* C9-SKL */
+}
+/*
+ * intel_idle_state_table_update()
+ *
+ * Update the default state_table for this CPU-id
+ */
+
+static void intel_idle_state_table_update(void)
+{
+ switch (boot_cpu_data.x86_model) {
+
+ case 0x3e: /* IVT */
+ ivt_idle_state_table_update();
+ break;
+ case 0x5e: /* SKL-H */
+ sklh_idle_state_table_update();
+ break;
}
- return;
}
/*
@@ -1063,6 +1144,14 @@ static int __init intel_idle_cpuidle_driver_init(void)
if (num_substates == 0)
continue;
+ /* if state marked as disabled, skip it */
+ if (cpuidle_state_table[cstate].disabled != 0) {
+ pr_debug(PREFIX "state %s is disabled",
+ cpuidle_state_table[cstate].name);
+ continue;
+ }
+
+
if (((mwait_cstate + 1) > 2) &&
!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
mark_tsc_unstable("TSC halts in idle"