summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/admin-guide/kernel-parameters.txt3
-rw-r--r--Documentation/admin-guide/pm/intel_pstate.rst3
-rw-r--r--drivers/cpufreq/cpufreq_governor.c45
-rw-r--r--drivers/cpufreq/intel_pstate.c27
4 files changed, 47 insertions, 31 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index fb8752b42ec8..77e671e55993 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2316,6 +2316,9 @@
per_cpu_perf_limits
Allow per-logical-CPU P-State performance control limits using
cpufreq sysfs interface
+ no_cas
+ Do not enable capacity-aware scheduling (CAS) on
+ hybrid systems
intremap= [X86-64,Intel-IOMMU,EARLY]
on enable Interrupt Remapping (default)
diff --git a/Documentation/admin-guide/pm/intel_pstate.rst b/Documentation/admin-guide/pm/intel_pstate.rst
index bf13ad25a32f..78fc83ed2a7e 100644
--- a/Documentation/admin-guide/pm/intel_pstate.rst
+++ b/Documentation/admin-guide/pm/intel_pstate.rst
@@ -696,6 +696,9 @@ of them have to be prepended with the ``intel_pstate=`` prefix.
Use per-logical-CPU P-State limits (see `Coordination of P-state
Limits`_ for details).
+``no_cas``
+ Do not enable capacity-aware scheduling (CAS) which is enabled by
+ default on hybrid systems.
Diagnostics and Tuning
======================
diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c
index af44ee6a6430..1a7fcaf39cc9 100644
--- a/drivers/cpufreq/cpufreq_governor.c
+++ b/drivers/cpufreq/cpufreq_governor.c
@@ -145,7 +145,23 @@ unsigned int dbs_update(struct cpufreq_policy *policy)
time_elapsed = update_time - j_cdbs->prev_update_time;
j_cdbs->prev_update_time = update_time;
- idle_time = cur_idle_time - j_cdbs->prev_cpu_idle;
+ /*
+ * cur_idle_time could be smaller than j_cdbs->prev_cpu_idle if
+ * it's obtained from get_cpu_idle_time_jiffy() when NOHZ is
+ * off, where idle_time is calculated by the difference between
+ * time elapsed in jiffies and "busy time" obtained from CPU
+ * statistics. If a CPU is 100% busy, the time elapsed and busy
+ * time should grow with the same amount in two consecutive
+ * samples, but in practice there could be a tiny difference,
+ * making the accumulated idle time decrease sometimes. Hence,
+ * in this case, idle_time should be regarded as 0 in order to
+ * make the further process correct.
+ */
+ if (cur_idle_time > j_cdbs->prev_cpu_idle)
+ idle_time = cur_idle_time - j_cdbs->prev_cpu_idle;
+ else
+ idle_time = 0;
+
j_cdbs->prev_cpu_idle = cur_idle_time;
if (ignore_nice) {
@@ -162,7 +178,7 @@ unsigned int dbs_update(struct cpufreq_policy *policy)
* calls, so the previous load value can be used then.
*/
load = j_cdbs->prev_load;
- } else if (unlikely((int)idle_time > 2 * sampling_rate &&
+ } else if (unlikely(idle_time > 2 * sampling_rate &&
j_cdbs->prev_load)) {
/*
* If the CPU had gone completely idle and a task has
@@ -189,30 +205,15 @@ unsigned int dbs_update(struct cpufreq_policy *policy)
load = j_cdbs->prev_load;
j_cdbs->prev_load = 0;
} else {
- if (time_elapsed >= idle_time) {
+ if (time_elapsed > idle_time)
load = 100 * (time_elapsed - idle_time) / time_elapsed;
- } else {
- /*
- * That can happen if idle_time is returned by
- * get_cpu_idle_time_jiffy(). In that case
- * idle_time is roughly equal to the difference
- * between time_elapsed and "busy time" obtained
- * from CPU statistics. Then, the "busy time"
- * can end up being greater than time_elapsed
- * (for example, if jiffies_64 and the CPU
- * statistics are updated by different CPUs),
- * so idle_time may in fact be negative. That
- * means, though, that the CPU was busy all
- * the time (on the rough average) during the
- * last sampling interval and 100 can be
- * returned as the load.
- */
- load = (int)idle_time < 0 ? 100 : 0;
- }
+ else
+ load = 0;
+
j_cdbs->prev_load = load;
}
- if (unlikely((int)idle_time > 2 * sampling_rate)) {
+ if (unlikely(idle_time > 2 * sampling_rate)) {
unsigned int periods = idle_time / sampling_rate;
if (periods < idle_periods)
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 9c4cc01fd51a..fa81054fa411 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -936,6 +936,8 @@ static struct freq_attr *hwp_cpufreq_attrs[] = {
NULL,
};
+static bool no_cas __ro_after_init;
+
static struct cpudata *hybrid_max_perf_cpu __read_mostly;
/*
* Protects hybrid_max_perf_cpu, the capacity_perf fields in struct cpudata,
@@ -1041,6 +1043,10 @@ static void hybrid_refresh_cpu_capacity_scaling(void)
static void hybrid_init_cpu_capacity_scaling(bool refresh)
{
+ /* Bail out if enabling capacity-aware scheduling is prohibited. */
+ if (no_cas)
+ return;
+
/*
* If hybrid_max_perf_cpu is set at this point, the hybrid CPU capacity
* scaling has been enabled already and the driver is just changing the
@@ -3688,6 +3694,15 @@ static int __init intel_pstate_init(void)
if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
return -ENODEV;
+ /*
+ * The Intel pstate driver will be ignored if the platform
+ * firmware has its own power management modes.
+ */
+ if (intel_pstate_platform_pwr_mgmt_exists()) {
+ pr_info("P-states controlled by the platform\n");
+ return -ENODEV;
+ }
+
id = x86_match_cpu(hwp_support_ids);
if (id) {
hwp_forced = intel_pstate_hwp_is_enabled();
@@ -3743,15 +3758,6 @@ static int __init intel_pstate_init(void)
default_driver = &intel_cpufreq;
hwp_cpu_matched:
- /*
- * The Intel pstate driver will be ignored if the platform
- * firmware has its own power management modes.
- */
- if (intel_pstate_platform_pwr_mgmt_exists()) {
- pr_info("P-states controlled by the platform\n");
- return -ENODEV;
- }
-
if (!hwp_active && hwp_only)
return -ENOTSUPP;
@@ -3835,6 +3841,9 @@ static int __init intel_pstate_setup(char *str)
if (!strcmp(str, "no_hwp"))
no_hwp = 1;
+ if (!strcmp(str, "no_cas"))
+ no_cas = true;
+
if (!strcmp(str, "force"))
force_load = 1;
if (!strcmp(str, "hwp_only"))