diff options
Diffstat (limited to 'drivers/thermal/intel')
-rw-r--r-- | drivers/thermal/intel/Kconfig | 3 | ||||
-rw-r--r-- | drivers/thermal/intel/intel_powerclamp.c | 379 |
2 files changed, 159 insertions, 223 deletions
diff --git a/drivers/thermal/intel/Kconfig b/drivers/thermal/intel/Kconfig index e50fd260484a..b5808f92702d 100644 --- a/drivers/thermal/intel/Kconfig +++ b/drivers/thermal/intel/Kconfig @@ -3,6 +3,9 @@ config INTEL_POWERCLAMP tristate "Intel PowerClamp idle injection driver" depends on X86 depends on CPU_SUP_INTEL + depends on CPU_IDLE + select POWERCAP + select IDLE_INJECT help Enable this to enable Intel PowerClamp idle injection driver. This enforce idle time which results in more package C-state residency. The diff --git a/drivers/thermal/intel/intel_powerclamp.c b/drivers/thermal/intel/intel_powerclamp.c index 2f4cbfdf26a0..209bb5f5ae17 100644 --- a/drivers/thermal/intel/intel_powerclamp.c +++ b/drivers/thermal/intel/intel_powerclamp.c @@ -2,7 +2,7 @@ /* * intel_powerclamp.c - package c-state idle injection * - * Copyright (c) 2012, Intel Corporation. + * Copyright (c) 2012-2023, Intel Corporation. * * Authors: * Arjan van de Ven <arjan@linux.intel.com> @@ -27,21 +27,15 @@ #include <linux/module.h> #include <linux/kernel.h> #include <linux/delay.h> -#include <linux/kthread.h> #include <linux/cpu.h> #include <linux/thermal.h> -#include <linux/slab.h> -#include <linux/tick.h> #include <linux/debugfs.h> #include <linux/seq_file.h> -#include <linux/sched/rt.h> -#include <uapi/linux/sched/types.h> +#include <linux/idle_inject.h> -#include <asm/nmi.h> #include <asm/msr.h> #include <asm/mwait.h> #include <asm/cpu_device_id.h> -#include <asm/hardirq.h> #define MAX_TARGET_RATIO (50U) /* For each undisturbed clamping period (no extra wake ups during idle time), @@ -59,35 +53,26 @@ static unsigned int target_mwait; static struct dentry *debug_dir; static bool poll_pkg_cstate_enable; -/* user selected target */ -static unsigned int set_target_ratio; +/* Idle ratio observed using package C-state counters */ static unsigned int current_ratio; -static bool should_skip; -static unsigned int control_cpu; /* The cpu assigned to collect stat and update - * control parameters. default to BSP but BSP - * can be offlined. - */ -static bool clamping; +/* Skip the idle injection till set to true */ +static bool should_skip; -struct powerclamp_worker_data { - struct kthread_worker *worker; - struct kthread_work balancing_work; - struct kthread_delayed_work idle_injection_work; +struct powerclamp_data { unsigned int cpu; unsigned int count; unsigned int guard; unsigned int window_size_now; unsigned int target_ratio; - unsigned int duration_jiffies; bool clamping; }; -static struct powerclamp_worker_data __percpu *worker_data; +static struct powerclamp_data powerclamp_data; + static struct thermal_cooling_device *cooling_dev; -static unsigned long *cpu_clamping_mask; /* bit map for tracking per cpu - * clamping kthread worker - */ + +static DEFINE_MUTEX(powerclamp_lock); static unsigned int duration; static unsigned int pkg_cstate_ratio_cur; @@ -306,7 +291,7 @@ static void adjust_compensation(int target_ratio, unsigned int win) if (d->confidence >= CONFIDENCE_OK) return; - delta = set_target_ratio - current_ratio; + delta = powerclamp_data.target_ratio - current_ratio; /* filter out bad data */ if (delta >= 0 && delta <= (1+target_ratio/10)) { if (d->steady_comp) @@ -345,82 +330,39 @@ static bool powerclamp_adjust_controls(unsigned int target_ratio, adjust_compensation(target_ratio, win); /* if we are above target+guard, skip */ - return set_target_ratio + guard <= current_ratio; + return powerclamp_data.target_ratio + guard <= current_ratio; } -static void clamp_balancing_func(struct kthread_work *work) +/* + * This function calculates runtime from the current target ratio. + * This function gets called under powerclamp_lock. + */ +static unsigned int get_run_time(void) { - struct powerclamp_worker_data *w_data; - int sleeptime; - unsigned long target_jiffies; unsigned int compensated_ratio; - int interval; /* jiffies to sleep for each attempt */ - - w_data = container_of(work, struct powerclamp_worker_data, - balancing_work); + unsigned int runtime; /* * make sure user selected ratio does not take effect until * the next round. adjust target_ratio if user has changed * target such that we can converge quickly. */ - w_data->target_ratio = READ_ONCE(set_target_ratio); - w_data->guard = 1 + w_data->target_ratio / 20; - w_data->window_size_now = window_size; - w_data->duration_jiffies = msecs_to_jiffies(duration); - w_data->count++; + powerclamp_data.guard = 1 + powerclamp_data.target_ratio / 20; + powerclamp_data.window_size_now = window_size; /* * systems may have different ability to enter package level * c-states, thus we need to compensate the injected idle ratio * to achieve the actual target reported by the HW. */ - compensated_ratio = w_data->target_ratio + - get_compensation(w_data->target_ratio); + compensated_ratio = powerclamp_data.target_ratio + + get_compensation(powerclamp_data.target_ratio); if (compensated_ratio <= 0) compensated_ratio = 1; - interval = w_data->duration_jiffies * 100 / compensated_ratio; - - /* align idle time */ - target_jiffies = roundup(jiffies, interval); - sleeptime = target_jiffies - jiffies; - if (sleeptime <= 0) - sleeptime = 1; - - if (clamping && w_data->clamping && cpu_online(w_data->cpu)) - kthread_queue_delayed_work(w_data->worker, - &w_data->idle_injection_work, - sleeptime); -} -static void clamp_idle_injection_func(struct kthread_work *work) -{ - struct powerclamp_worker_data *w_data; + runtime = duration * 100 / compensated_ratio - duration; - w_data = container_of(work, struct powerclamp_worker_data, - idle_injection_work.work); - - /* - * only elected controlling cpu can collect stats and update - * control parameters. - */ - if (w_data->cpu == control_cpu && - !(w_data->count % w_data->window_size_now)) { - should_skip = - powerclamp_adjust_controls(w_data->target_ratio, - w_data->guard, - w_data->window_size_now); - smp_mb(); - } - - if (should_skip) - goto balance; - - play_idle(jiffies_to_usecs(w_data->duration_jiffies)); - -balance: - if (clamping && w_data->clamping && cpu_online(w_data->cpu)) - kthread_queue_work(w_data->worker, &w_data->balancing_work); + return runtime; } /* @@ -456,127 +398,135 @@ static void poll_pkg_cstate(struct work_struct *dummy) msr_last = msr_now; tsc_last = tsc_now; - if (true == clamping) + mutex_lock(&powerclamp_lock); + if (powerclamp_data.clamping) schedule_delayed_work(&poll_pkg_cstate_work, HZ); + mutex_unlock(&powerclamp_lock); } -static void start_power_clamp_worker(unsigned long cpu) -{ - struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu); - struct kthread_worker *worker; - - worker = kthread_create_worker_on_cpu(cpu, 0, "kidle_inj/%ld", cpu); - if (IS_ERR(worker)) - return; - - w_data->worker = worker; - w_data->count = 0; - w_data->cpu = cpu; - w_data->clamping = true; - set_bit(cpu, cpu_clamping_mask); - sched_set_fifo(worker->task); - kthread_init_work(&w_data->balancing_work, clamp_balancing_func); - kthread_init_delayed_work(&w_data->idle_injection_work, - clamp_idle_injection_func); - kthread_queue_work(w_data->worker, &w_data->balancing_work); -} +static struct idle_inject_device *ii_dev; -static void stop_power_clamp_worker(unsigned long cpu) +/* + * This function is called from idle injection core on timer expiry + * for the run duration. This allows powerclamp to readjust or skip + * injecting idle for this cycle. + */ +static bool idle_inject_update(void) { - struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu); + bool update = false; - if (!w_data->worker) - return; + /* We can't sleep in this callback */ + if (!mutex_trylock(&powerclamp_lock)) + return true; - w_data->clamping = false; - /* - * Make sure that all works that get queued after this point see - * the clamping disabled. The counter part is not needed because - * there is an implicit memory barrier when the queued work - * is proceed. - */ - smp_wmb(); - kthread_cancel_work_sync(&w_data->balancing_work); - kthread_cancel_delayed_work_sync(&w_data->idle_injection_work); - /* - * The balancing work still might be queued here because - * the handling of the "clapming" variable, cancel, and queue - * operations are not synchronized via a lock. But it is not - * a big deal. The balancing work is fast and destroy kthread - * will wait for it. - */ - clear_bit(w_data->cpu, cpu_clamping_mask); - kthread_destroy_worker(w_data->worker); + if (!(powerclamp_data.count % powerclamp_data.window_size_now)) { - w_data->worker = NULL; -} + should_skip = powerclamp_adjust_controls(powerclamp_data.target_ratio, + powerclamp_data.guard, + powerclamp_data.window_size_now); + update = true; + } -static int start_power_clamp(void) -{ - unsigned long cpu; + if (update) { + unsigned int runtime = get_run_time(); - set_target_ratio = clamp(set_target_ratio, 0U, MAX_TARGET_RATIO - 1); - /* prevent cpu hotplug */ - cpus_read_lock(); + idle_inject_set_duration(ii_dev, runtime, duration); + } - /* prefer BSP */ - control_cpu = cpumask_first(cpu_online_mask); + powerclamp_data.count++; - clamping = true; - if (poll_pkg_cstate_enable) - schedule_delayed_work(&poll_pkg_cstate_work, 0); + mutex_unlock(&powerclamp_lock); - /* start one kthread worker per online cpu */ - for_each_online_cpu(cpu) { - start_power_clamp_worker(cpu); - } - cpus_read_unlock(); + if (should_skip) + return false; - return 0; + return true; } -static void end_power_clamp(void) +/* This function starts idle injection by calling idle_inject_start() */ +static void trigger_idle_injection(void) { - int i; + unsigned int runtime = get_run_time(); + idle_inject_set_duration(ii_dev, runtime, duration); + idle_inject_start(ii_dev); + powerclamp_data.clamping = true; +} + +/* + * This function is called from start_power_clamp() to register + * CPUS with powercap idle injection register and set default + * idle duration and latency. + */ +static int powerclamp_idle_injection_register(void) +{ /* - * Block requeuing in all the kthread workers. They will flush and - * stop faster. + * The idle inject core will only inject for online CPUs, + * So we can register for all present CPUs. In this way + * if some CPU goes online/offline while idle inject + * is registered, nothing additional calls are required. + * The same runtime and idle time is applicable for + * newly onlined CPUs if any. + * + * Here cpu_present_mask can be used as is. + * cast to (struct cpumask *) is required as the + * cpu_present_mask is const struct cpumask *, otherwise + * there will be compiler warnings. */ - clamping = false; - for_each_set_bit(i, cpu_clamping_mask, num_possible_cpus()) { - pr_debug("clamping worker for cpu %d alive, destroy\n", i); - stop_power_clamp_worker(i); + ii_dev = idle_inject_register_full((struct cpumask *)cpu_present_mask, + idle_inject_update); + if (!ii_dev) { + pr_err("powerclamp: idle_inject_register failed\n"); + return -EAGAIN; } + + idle_inject_set_duration(ii_dev, TICK_USEC, duration); + idle_inject_set_latency(ii_dev, UINT_MAX); + + return 0; } -static int powerclamp_cpu_online(unsigned int cpu) +/* + * This function is called from end_power_clamp() to stop idle injection + * and unregister CPUS from powercap idle injection core. + */ +static void remove_idle_injection(void) { - if (clamping == false) - return 0; - start_power_clamp_worker(cpu); - /* prefer BSP as controlling CPU */ - if (cpu == 0) { - control_cpu = 0; - smp_mb(); - } - return 0; + if (!powerclamp_data.clamping) + return; + + powerclamp_data.clamping = false; + idle_inject_stop(ii_dev); } -static int powerclamp_cpu_predown(unsigned int cpu) +/* + * This function is called when user change the cooling device + * state from zero to some other value. + */ +static int start_power_clamp(void) { - if (clamping == false) - return 0; + int ret; - stop_power_clamp_worker(cpu); - if (cpu != control_cpu) - return 0; + ret = powerclamp_idle_injection_register(); + if (!ret) { + trigger_idle_injection(); + if (poll_pkg_cstate_enable) + schedule_delayed_work(&poll_pkg_cstate_work, 0); + } - control_cpu = cpumask_first(cpu_online_mask); - if (control_cpu == cpu) - control_cpu = cpumask_next(cpu, cpu_online_mask); - smp_mb(); - return 0; + return ret; +} + +/* + * This function is called when user change the cooling device + * state from non zero value zero. + */ +static void end_power_clamp(void) +{ + if (powerclamp_data.clamping) { + remove_idle_injection(); + idle_inject_unregister(ii_dev); + } } static int powerclamp_get_max_state(struct thermal_cooling_device *cdev, @@ -590,16 +540,20 @@ static int powerclamp_get_max_state(struct thermal_cooling_device *cdev, static int powerclamp_get_cur_state(struct thermal_cooling_device *cdev, unsigned long *state) { - if (clamping) { + mutex_lock(&powerclamp_lock); + + if (powerclamp_data.clamping) { if (poll_pkg_cstate_enable) *state = pkg_cstate_ratio_cur; else - *state = set_target_ratio; + *state = powerclamp_data.target_ratio; } else { /* to save power, do not poll idle ratio while not clamping */ *state = -1; /* indicates invalid state */ } + mutex_unlock(&powerclamp_lock); + return 0; } @@ -608,24 +562,32 @@ static int powerclamp_set_cur_state(struct thermal_cooling_device *cdev, { int ret = 0; + mutex_lock(&powerclamp_lock); + new_target_ratio = clamp(new_target_ratio, 0UL, - (unsigned long) (MAX_TARGET_RATIO-1)); - if (set_target_ratio == 0 && new_target_ratio > 0) { + (unsigned long) (MAX_TARGET_RATIO - 1)); + if (!powerclamp_data.target_ratio && new_target_ratio > 0) { pr_info("Start idle injection to reduce power\n"); - set_target_ratio = new_target_ratio; + powerclamp_data.target_ratio = new_target_ratio; ret = start_power_clamp(); + if (ret) + powerclamp_data.target_ratio = 0; goto exit_set; - } else if (set_target_ratio > 0 && new_target_ratio == 0) { + } else if (powerclamp_data.target_ratio > 0 && new_target_ratio == 0) { pr_info("Stop forced idle injection\n"); end_power_clamp(); - set_target_ratio = 0; + powerclamp_data.target_ratio = 0; } else /* adjust currently running */ { - set_target_ratio = new_target_ratio; - /* make new set_target_ratio visible to other cpus */ - smp_mb(); + unsigned int runtime; + + powerclamp_data.target_ratio = new_target_ratio; + runtime = get_run_time(); + idle_inject_set_duration(ii_dev, runtime, duration); } exit_set: + mutex_unlock(&powerclamp_lock); + return ret; } @@ -666,7 +628,6 @@ static int powerclamp_debug_show(struct seq_file *m, void *unused) { int i = 0; - seq_printf(m, "controlling cpu: %d\n", control_cpu); seq_printf(m, "pct confidence steady dynamic (compensation)\n"); for (i = 0; i < MAX_TARGET_RATIO; i++) { seq_printf(m, "%d\t%lu\t%lu\t%lu\n", @@ -689,78 +650,50 @@ static inline void powerclamp_create_debug_files(void) &powerclamp_debug_fops); } -static enum cpuhp_state hp_state; - static int __init powerclamp_init(void) { int retval; - cpu_clamping_mask = bitmap_zalloc(num_possible_cpus(), GFP_KERNEL); - if (!cpu_clamping_mask) - return -ENOMEM; - /* probe cpu features and ids here */ retval = powerclamp_probe(); if (retval) - goto exit_free; + return retval; /* set default limit, maybe adjusted during runtime based on feedback */ window_size = 2; - retval = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, - "thermal/intel_powerclamp:online", - powerclamp_cpu_online, - powerclamp_cpu_predown); - if (retval < 0) - goto exit_free; - - hp_state = retval; - - worker_data = alloc_percpu(struct powerclamp_worker_data); - if (!worker_data) { - retval = -ENOMEM; - goto exit_unregister; - } if (topology_max_packages() == 1 && topology_max_die_per_package() == 1) poll_pkg_cstate_enable = true; cooling_dev = thermal_cooling_device_register("intel_powerclamp", NULL, - &powerclamp_cooling_ops); - if (IS_ERR(cooling_dev)) { - retval = -ENODEV; - goto exit_free_thread; - } + &powerclamp_cooling_ops); + if (IS_ERR(cooling_dev)) + return -ENODEV; if (!duration) - duration = jiffies_to_msecs(DEFAULT_DURATION_JIFFIES); + duration = jiffies_to_usecs(DEFAULT_DURATION_JIFFIES); powerclamp_create_debug_files(); return 0; - -exit_free_thread: - free_percpu(worker_data); -exit_unregister: - cpuhp_remove_state_nocalls(hp_state); -exit_free: - bitmap_free(cpu_clamping_mask); - return retval; } module_init(powerclamp_init); static void __exit powerclamp_exit(void) { + mutex_lock(&powerclamp_lock); end_power_clamp(); - cpuhp_remove_state_nocalls(hp_state); - free_percpu(worker_data); + mutex_unlock(&powerclamp_lock); + thermal_cooling_device_unregister(cooling_dev); - bitmap_free(cpu_clamping_mask); cancel_delayed_work_sync(&poll_pkg_cstate_work); debugfs_remove_recursive(debug_dir); } module_exit(powerclamp_exit); +MODULE_IMPORT_NS(IDLE_INJECT); + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>"); MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@linux.intel.com>"); |