diff options
-rw-r--r-- | Documentation/power/energy-model.rst | 24 | ||||
-rw-r--r-- | drivers/cpufreq/mediatek-cpufreq-hw.c | 4 | ||||
-rw-r--r-- | drivers/cpufreq/scmi-cpufreq.c | 4 | ||||
-rw-r--r-- | drivers/cpuidle/cpuidle-psci-domain.c | 4 | ||||
-rw-r--r-- | drivers/cpuidle/cpuidle-riscv-sbi.c | 4 | ||||
-rw-r--r-- | drivers/idle/intel_idle.c | 133 | ||||
-rw-r--r-- | drivers/opp/of.c | 6 | ||||
-rw-r--r-- | drivers/powercap/dtpm_cpu.c | 2 | ||||
-rw-r--r-- | drivers/thermal/cpufreq_cooling.c | 2 | ||||
-rw-r--r-- | drivers/thermal/devfreq_cooling.c | 8 | ||||
-rw-r--r-- | include/linux/energy_model.h | 35 | ||||
-rw-r--r-- | kernel/power/energy_model.c | 65 |
12 files changed, 240 insertions, 51 deletions
diff --git a/Documentation/power/energy-model.rst b/Documentation/power/energy-model.rst index 49549aab41b4..feb257b7f350 100644 --- a/Documentation/power/energy-model.rst +++ b/Documentation/power/energy-model.rst @@ -123,6 +123,26 @@ allows a platform to register EM power values which are reflecting total power (static + dynamic). These power values might be coming directly from experiments and measurements. +Registration of 'artificial' EM +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +There is an option to provide a custom callback for drivers missing detailed +knowledge about power value for each performance state. The callback +.get_cost() is optional and provides the 'cost' values used by the EAS. +This is useful for platforms that only provide information on relative +efficiency between CPU types, where one could use the information to +create an abstract power model. But even an abstract power model can +sometimes be hard to fit in, given the input power value size restrictions. +The .get_cost() allows to provide the 'cost' values which reflect the +efficiency of the CPUs. This would allow to provide EAS information which +has different relation than what would be forced by the EM internal +formulas calculating 'cost' values. To register an EM for such platform, the +driver must set the flag 'milliwatts' to 0, provide .get_power() callback +and provide .get_cost() callback. The EM framework would handle such platform +properly during registration. A flag EM_PERF_DOMAIN_ARTIFICIAL is set for such +platform. Special care should be taken by other frameworks which are using EM +to test and treat this flag properly. + Registration of 'simple' EM ~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -181,8 +201,8 @@ EM framework:: -> drivers/cpufreq/foo_cpufreq.c - 01 static int est_power(unsigned long *mW, unsigned long *KHz, - 02 struct device *dev) + 01 static int est_power(struct device *dev, unsigned long *mW, + 02 unsigned long *KHz) 03 { 04 long freq, power; 05 diff --git a/drivers/cpufreq/mediatek-cpufreq-hw.c b/drivers/cpufreq/mediatek-cpufreq-hw.c index 0a94c56ddad2..813cccbfe934 100644 --- a/drivers/cpufreq/mediatek-cpufreq-hw.c +++ b/drivers/cpufreq/mediatek-cpufreq-hw.c @@ -51,8 +51,8 @@ static const u16 cpufreq_mtk_offsets[REG_ARRAY_SIZE] = { }; static int __maybe_unused -mtk_cpufreq_get_cpu_power(unsigned long *mW, - unsigned long *KHz, struct device *cpu_dev) +mtk_cpufreq_get_cpu_power(struct device *cpu_dev, unsigned long *mW, + unsigned long *KHz) { struct mtk_cpufreq_data *data; struct cpufreq_policy *policy; diff --git a/drivers/cpufreq/scmi-cpufreq.c b/drivers/cpufreq/scmi-cpufreq.c index 919fa6e3f462..6d2a4cf46db7 100644 --- a/drivers/cpufreq/scmi-cpufreq.c +++ b/drivers/cpufreq/scmi-cpufreq.c @@ -96,8 +96,8 @@ scmi_get_sharing_cpus(struct device *cpu_dev, struct cpumask *cpumask) } static int __maybe_unused -scmi_get_cpu_power(unsigned long *power, unsigned long *KHz, - struct device *cpu_dev) +scmi_get_cpu_power(struct device *cpu_dev, unsigned long *power, + unsigned long *KHz) { unsigned long Hz; int ret, domain; diff --git a/drivers/cpuidle/cpuidle-psci-domain.c b/drivers/cpuidle/cpuidle-psci-domain.c index 755bbdfc5b82..3db4fca1172b 100644 --- a/drivers/cpuidle/cpuidle-psci-domain.c +++ b/drivers/cpuidle/cpuidle-psci-domain.c @@ -52,7 +52,7 @@ static int psci_pd_init(struct device_node *np, bool use_osi) struct generic_pm_domain *pd; struct psci_pd_provider *pd_provider; struct dev_power_governor *pd_gov; - int ret = -ENOMEM, state_count = 0; + int ret = -ENOMEM; pd = dt_idle_pd_alloc(np, psci_dt_parse_state_node); if (!pd) @@ -71,7 +71,7 @@ static int psci_pd_init(struct device_node *np, bool use_osi) pd->flags |= GENPD_FLAG_ALWAYS_ON; /* Use governor for CPU PM domains if it has some states to manage. */ - pd_gov = state_count > 0 ? &pm_domain_cpu_gov : NULL; + pd_gov = pd->states ? &pm_domain_cpu_gov : NULL; ret = pm_genpd_init(pd, pd_gov, false); if (ret) diff --git a/drivers/cpuidle/cpuidle-riscv-sbi.c b/drivers/cpuidle/cpuidle-riscv-sbi.c index 5c852e671992..1151e5e2ba82 100644 --- a/drivers/cpuidle/cpuidle-riscv-sbi.c +++ b/drivers/cpuidle/cpuidle-riscv-sbi.c @@ -414,7 +414,7 @@ static int sbi_pd_init(struct device_node *np) struct generic_pm_domain *pd; struct sbi_pd_provider *pd_provider; struct dev_power_governor *pd_gov; - int ret = -ENOMEM, state_count = 0; + int ret = -ENOMEM; pd = dt_idle_pd_alloc(np, sbi_dt_parse_state_node); if (!pd) @@ -433,7 +433,7 @@ static int sbi_pd_init(struct device_node *np) pd->flags |= GENPD_FLAG_ALWAYS_ON; /* Use governor for CPU PM domains if it has some states to manage. */ - pd_gov = state_count > 0 ? &pm_domain_cpu_gov : NULL; + pd_gov = pd->states ? &pm_domain_cpu_gov : NULL; ret = pm_genpd_init(pd, pd_gov, false); if (ret) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 47551ab73ca8..b9bb94bd0f67 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -765,6 +765,106 @@ static struct cpuidle_state icx_cstates[] __initdata = { }; /* + * On AlderLake C1 has to be disabled if C1E is enabled, and vice versa. + * C1E is enabled only if "C1E promotion" bit is set in MSR_IA32_POWER_CTL. + * But in this case there is effectively no C1, because C1 requests are + * promoted to C1E. If the "C1E promotion" bit is cleared, then both C1 + * and C1E requests end up with C1, so there is effectively no C1E. + * + * By default we enable C1E and disable C1 by marking it with + * 'CPUIDLE_FLAG_UNUSABLE'. + */ +static struct cpuidle_state adl_cstates[] __initdata = { + { + .name = "C1", + .desc = "MWAIT 0x00", + .flags = MWAIT2flg(0x00) | CPUIDLE_FLAG_UNUSABLE, + .exit_latency = 1, + .target_residency = 1, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .name = "C1E", + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, + .exit_latency = 2, + .target_residency = 4, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .name = "C6", + .desc = "MWAIT 0x20", + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 220, + .target_residency = 600, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .name = "C8", + .desc = "MWAIT 0x40", + .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 280, + .target_residency = 800, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .name = "C10", + .desc = "MWAIT 0x60", + .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 680, + .target_residency = 2000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .enter = NULL } +}; + +static struct cpuidle_state adl_l_cstates[] __initdata = { + { + .name = "C1", + .desc = "MWAIT 0x00", + .flags = MWAIT2flg(0x00) | CPUIDLE_FLAG_UNUSABLE, + .exit_latency = 1, + .target_residency = 1, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .name = "C1E", + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, + .exit_latency = 2, + .target_residency = 4, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .name = "C6", + .desc = "MWAIT 0x20", + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 170, + .target_residency = 500, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .name = "C8", + .desc = "MWAIT 0x40", + .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 200, + .target_residency = 600, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .name = "C10", + .desc = "MWAIT 0x60", + .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 230, + .target_residency = 700, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .enter = NULL } +}; + +/* * On Sapphire Rapids Xeon C1 has to be disabled if C1E is enabled, and vice * versa. On SPR C1E is enabled only if "C1E promotion" bit is set in * MSR_IA32_POWER_CTL. But in this case there effectively no C1, because C1 @@ -1147,6 +1247,14 @@ static const struct idle_cpu idle_cpu_icx __initconst = { .use_acpi = true, }; +static const struct idle_cpu idle_cpu_adl __initconst = { + .state_table = adl_cstates, +}; + +static const struct idle_cpu idle_cpu_adl_l __initconst = { + .state_table = adl_l_cstates, +}; + static const struct idle_cpu idle_cpu_spr __initconst = { .state_table = spr_cstates, .disable_promotion_to_c1e = true, @@ -1215,6 +1323,8 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X, &idle_cpu_skx), X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &idle_cpu_icx), X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, &idle_cpu_icx), + X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, &idle_cpu_adl), + X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, &idle_cpu_adl_l), X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &idle_cpu_spr), X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL, &idle_cpu_knl), X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM, &idle_cpu_knl), @@ -1574,6 +1684,25 @@ static void __init skx_idle_state_table_update(void) } /** + * adl_idle_state_table_update - Adjust AlderLake idle states table. + */ +static void __init adl_idle_state_table_update(void) +{ + /* Check if user prefers C1 over C1E. */ + if (preferred_states_mask & BIT(1) && !(preferred_states_mask & BIT(2))) { + cpuidle_state_table[0].flags &= ~CPUIDLE_FLAG_UNUSABLE; + cpuidle_state_table[1].flags |= CPUIDLE_FLAG_UNUSABLE; + + /* Disable C1E by clearing the "C1E promotion" bit. */ + c1e_promotion = C1E_PROMOTION_DISABLE; + return; + } + + /* Make sure C1E is enabled by default */ + c1e_promotion = C1E_PROMOTION_ENABLE; +} + +/** * spr_idle_state_table_update - Adjust Sapphire Rapids idle states table. */ static void __init spr_idle_state_table_update(void) @@ -1642,6 +1771,10 @@ static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv) case INTEL_FAM6_SAPPHIRERAPIDS_X: spr_idle_state_table_update(); break; + case INTEL_FAM6_ALDERLAKE: + case INTEL_FAM6_ALDERLAKE_L: + adl_idle_state_table_update(); + break; } for (cstate = 0; cstate < CPUIDLE_STATE_MAX; ++cstate) { diff --git a/drivers/opp/of.c b/drivers/opp/of.c index 440ab5a03df9..485ea980bde7 100644 --- a/drivers/opp/of.c +++ b/drivers/opp/of.c @@ -1448,7 +1448,7 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_get_of_node); * Returns 0 on success or a proper -EINVAL value in case of error. */ static int __maybe_unused -_get_dt_power(unsigned long *mW, unsigned long *kHz, struct device *dev) +_get_dt_power(struct device *dev, unsigned long *mW, unsigned long *kHz) { struct dev_pm_opp *opp; unsigned long opp_freq, opp_power; @@ -1482,8 +1482,8 @@ _get_dt_power(unsigned long *mW, unsigned long *kHz, struct device *dev) * Returns -EINVAL if the power calculation failed because of missing * parameters, 0 otherwise. */ -static int __maybe_unused _get_power(unsigned long *mW, unsigned long *kHz, - struct device *dev) +static int __maybe_unused _get_power(struct device *dev, unsigned long *mW, + unsigned long *kHz) { struct dev_pm_opp *opp; struct device_node *np; diff --git a/drivers/powercap/dtpm_cpu.c b/drivers/powercap/dtpm_cpu.c index bca2f912d349..f5eced0842b3 100644 --- a/drivers/powercap/dtpm_cpu.c +++ b/drivers/powercap/dtpm_cpu.c @@ -211,7 +211,7 @@ static int __dtpm_cpu_setup(int cpu, struct dtpm *parent) return 0; pd = em_cpu_get(cpu); - if (!pd) + if (!pd || em_is_artificial(pd)) return -EINVAL; dtpm_cpu = kzalloc(sizeof(*dtpm_cpu), GFP_KERNEL); diff --git a/drivers/thermal/cpufreq_cooling.c b/drivers/thermal/cpufreq_cooling.c index 0bfb8eebd126..b8151d95a806 100644 --- a/drivers/thermal/cpufreq_cooling.c +++ b/drivers/thermal/cpufreq_cooling.c @@ -328,7 +328,7 @@ static inline bool em_is_sane(struct cpufreq_cooling_device *cpufreq_cdev, struct cpufreq_policy *policy; unsigned int nr_levels; - if (!em) + if (!em || em_is_artificial(em)) return false; policy = cpufreq_cdev->policy; diff --git a/drivers/thermal/devfreq_cooling.c b/drivers/thermal/devfreq_cooling.c index 4310cb342a9f..b04dcbbf721a 100644 --- a/drivers/thermal/devfreq_cooling.c +++ b/drivers/thermal/devfreq_cooling.c @@ -358,6 +358,7 @@ of_devfreq_cooling_register_power(struct device_node *np, struct devfreq *df, struct thermal_cooling_device *cdev; struct device *dev = df->dev.parent; struct devfreq_cooling_device *dfc; + struct em_perf_domain *em; char *name; int err, num_opps; @@ -367,8 +368,9 @@ of_devfreq_cooling_register_power(struct device_node *np, struct devfreq *df, dfc->devfreq = df; - dfc->em_pd = em_pd_get(dev); - if (dfc->em_pd) { + em = em_pd_get(dev); + if (em && !em_is_artificial(em)) { + dfc->em_pd = em; devfreq_cooling_ops.get_requested_power = devfreq_cooling_get_requested_power; devfreq_cooling_ops.state2power = devfreq_cooling_state2power; @@ -379,7 +381,7 @@ of_devfreq_cooling_register_power(struct device_node *np, struct devfreq *df, num_opps = em_pd_nr_perf_states(dfc->em_pd); } else { /* Backward compatibility for drivers which do not use IPA */ - dev_dbg(dev, "missing EM for cooling device\n"); + dev_dbg(dev, "missing proper EM for cooling device\n"); num_opps = dev_pm_opp_get_opp_count(dev); diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 9f3c400bc52d..8419bffb4398 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -67,11 +67,16 @@ struct em_perf_domain { * * EM_PERF_DOMAIN_SKIP_INEFFICIENCIES: Skip inefficient states when estimating * energy consumption. + * + * EM_PERF_DOMAIN_ARTIFICIAL: The power values are artificial and might be + * created by platform missing real power information */ #define EM_PERF_DOMAIN_MILLIWATTS BIT(0) #define EM_PERF_DOMAIN_SKIP_INEFFICIENCIES BIT(1) +#define EM_PERF_DOMAIN_ARTIFICIAL BIT(2) #define em_span_cpus(em) (to_cpumask((em)->cpus)) +#define em_is_artificial(em) ((em)->flags & EM_PERF_DOMAIN_ARTIFICIAL) #ifdef CONFIG_ENERGY_MODEL #define EM_MAX_POWER 0xFFFF @@ -96,11 +101,11 @@ struct em_data_callback { /** * active_power() - Provide power at the next performance state of * a device + * @dev : Device for which we do this operation (can be a CPU) * @power : Active power at the performance state * (modified) * @freq : Frequency at the performance state in kHz * (modified) - * @dev : Device for which we do this operation (can be a CPU) * * active_power() must find the lowest performance state of 'dev' above * 'freq' and update 'power' and 'freq' to the matching active power @@ -112,11 +117,32 @@ struct em_data_callback { * * Return 0 on success. */ - int (*active_power)(unsigned long *power, unsigned long *freq, - struct device *dev); + int (*active_power)(struct device *dev, unsigned long *power, + unsigned long *freq); + + /** + * get_cost() - Provide the cost at the given performance state of + * a device + * @dev : Device for which we do this operation (can be a CPU) + * @freq : Frequency at the performance state in kHz + * @cost : The cost value for the performance state + * (modified) + * + * In case of CPUs, the cost is the one of a single CPU in the domain. + * It is expected to fit in the [0, EM_MAX_POWER] range due to internal + * usage in EAS calculation. + * + * Return 0 on success, or appropriate error value in case of failure. + */ + int (*get_cost)(struct device *dev, unsigned long freq, + unsigned long *cost); }; -#define EM_DATA_CB(_active_power_cb) { .active_power = &_active_power_cb } #define EM_SET_ACTIVE_POWER_CB(em_cb, cb) ((em_cb).active_power = cb) +#define EM_ADV_DATA_CB(_active_power_cb, _cost_cb) \ + { .active_power = _active_power_cb, \ + .get_cost = _cost_cb } +#define EM_DATA_CB(_active_power_cb) \ + EM_ADV_DATA_CB(_active_power_cb, NULL) struct em_perf_domain *em_cpu_get(int cpu); struct em_perf_domain *em_pd_get(struct device *dev); @@ -264,6 +290,7 @@ static inline int em_pd_nr_perf_states(struct em_perf_domain *pd) #else struct em_data_callback {}; +#define EM_ADV_DATA_CB(_active_power_cb, _cost_cb) { } #define EM_DATA_CB(_active_power_cb) { } #define EM_SET_ACTIVE_POWER_CB(em_cb, cb) do { } while (0) diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index 0153b0ca7b23..6c373f2960e7 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -54,28 +54,15 @@ static int em_debug_cpus_show(struct seq_file *s, void *unused) } DEFINE_SHOW_ATTRIBUTE(em_debug_cpus); -static int em_debug_units_show(struct seq_file *s, void *unused) +static int em_debug_flags_show(struct seq_file *s, void *unused) { struct em_perf_domain *pd = s->private; - char *units = (pd->flags & EM_PERF_DOMAIN_MILLIWATTS) ? - "milliWatts" : "bogoWatts"; - seq_printf(s, "%s\n", units); + seq_printf(s, "%#lx\n", pd->flags); return 0; } -DEFINE_SHOW_ATTRIBUTE(em_debug_units); - -static int em_debug_skip_inefficiencies_show(struct seq_file *s, void *unused) -{ - struct em_perf_domain *pd = s->private; - int enabled = (pd->flags & EM_PERF_DOMAIN_SKIP_INEFFICIENCIES) ? 1 : 0; - - seq_printf(s, "%d\n", enabled); - - return 0; -} -DEFINE_SHOW_ATTRIBUTE(em_debug_skip_inefficiencies); +DEFINE_SHOW_ATTRIBUTE(em_debug_flags); static void em_debug_create_pd(struct device *dev) { @@ -89,9 +76,8 @@ static void em_debug_create_pd(struct device *dev) debugfs_create_file("cpus", 0444, d, dev->em_pd->cpus, &em_debug_cpus_fops); - debugfs_create_file("units", 0444, d, dev->em_pd, &em_debug_units_fops); - debugfs_create_file("skip-inefficiencies", 0444, d, dev->em_pd, - &em_debug_skip_inefficiencies_fops); + debugfs_create_file("flags", 0444, d, dev->em_pd, + &em_debug_flags_fops); /* Create a sub-directory for each performance state */ for (i = 0; i < dev->em_pd->nr_perf_states; i++) @@ -121,7 +107,8 @@ static void em_debug_remove_pd(struct device *dev) {} #endif static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd, - int nr_states, struct em_data_callback *cb) + int nr_states, struct em_data_callback *cb, + unsigned long flags) { unsigned long power, freq, prev_freq = 0, prev_cost = ULONG_MAX; struct em_perf_state *table; @@ -139,7 +126,7 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd, * lowest performance state of 'dev' above 'freq' and updates * 'power' and 'freq' accordingly. */ - ret = cb->active_power(&power, &freq, dev); + ret = cb->active_power(dev, &power, &freq); if (ret) { dev_err(dev, "EM: invalid perf. state: %d\n", ret); @@ -173,10 +160,22 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd, /* Compute the cost of each performance state. */ fmax = (u64) table[nr_states - 1].frequency; for (i = nr_states - 1; i >= 0; i--) { - unsigned long power_res = em_scale_power(table[i].power); + unsigned long power_res, cost; + + if (flags & EM_PERF_DOMAIN_ARTIFICIAL) { + ret = cb->get_cost(dev, table[i].frequency, &cost); + if (ret || !cost || cost > EM_MAX_POWER) { + dev_err(dev, "EM: invalid cost %lu %d\n", + cost, ret); + goto free_ps_table; + } + } else { + power_res = em_scale_power(table[i].power); + cost = div64_u64(fmax * power_res, table[i].frequency); + } + + table[i].cost = cost; - table[i].cost = div64_u64(fmax * power_res, - table[i].frequency); if (table[i].cost >= prev_cost) { table[i].flags = EM_PERF_STATE_INEFFICIENT; dev_dbg(dev, "EM: OPP:%lu is inefficient\n", @@ -197,7 +196,8 @@ free_ps_table: } static int em_create_pd(struct device *dev, int nr_states, - struct em_data_callback *cb, cpumask_t *cpus) + struct em_data_callback *cb, cpumask_t *cpus, + unsigned long flags) { struct em_perf_domain *pd; struct device *cpu_dev; @@ -215,7 +215,7 @@ static int em_create_pd(struct device *dev, int nr_states, return -ENOMEM; } - ret = em_create_perf_table(dev, pd, nr_states, cb); + ret = em_create_perf_table(dev, pd, nr_states, cb, flags); if (ret) { kfree(pd); return ret; @@ -259,6 +259,8 @@ static void em_cpufreq_update_efficiencies(struct device *dev) found++; } + cpufreq_cpu_put(policy); + if (!found) return; @@ -332,6 +334,7 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, bool milliwatts) { unsigned long cap, prev_cap = 0; + unsigned long flags = 0; int cpu, ret; if (!dev || !nr_states || !cb) @@ -378,12 +381,16 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, } } - ret = em_create_pd(dev, nr_states, cb, cpus); + if (milliwatts) + flags |= EM_PERF_DOMAIN_MILLIWATTS; + else if (cb->get_cost) + flags |= EM_PERF_DOMAIN_ARTIFICIAL; + + ret = em_create_pd(dev, nr_states, cb, cpus, flags); if (ret) goto unlock; - if (milliwatts) - dev->em_pd->flags |= EM_PERF_DOMAIN_MILLIWATTS; + dev->em_pd->flags |= flags; em_cpufreq_update_efficiencies(dev); |