From 34e2c555f3e13c90e9284e23d00f03be8a6e06c5 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 15 Feb 2016 20:20:42 +0100 Subject: cpufreq: Add mechanism for registering utilization update callbacks Introduce a mechanism by which parts of the cpufreq subsystem ("setpolicy" drivers or the core) can register callbacks to be executed from cpufreq_update_util() which is invoked by the scheduler's update_load_avg() on CPU utilization changes. This allows the "setpolicy" drivers to dispense with their timers and do all of the computations they need and frequency/voltage adjustments in the update_load_avg() code path, among other things. The update_load_avg() changes were suggested by Peter Zijlstra. Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar Acked-by: Peter Zijlstra (Intel) Acked-by: Ingo Molnar --- include/linux/cpufreq.h | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'include') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index d0bf555b6bbf..704d85bf7242 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -151,6 +151,36 @@ static inline bool policy_is_shared(struct cpufreq_policy *policy) extern struct kobject *cpufreq_global_kobject; #ifdef CONFIG_CPU_FREQ +void cpufreq_update_util(u64 time, unsigned long util, unsigned long max); + +/** + * cpufreq_trigger_update - Trigger CPU performance state evaluation if needed. + * @time: Current time. + * + * The way cpufreq is currently arranged requires it to evaluate the CPU + * performance state (frequency/voltage) on a regular basis to prevent it from + * being stuck in a completely inadequate performance level for too long. + * That is not guaranteed to happen if the updates are only triggered from CFS, + * though, because they may not be coming in if RT or deadline tasks are active + * all the time (or there are RT and DL tasks only). + * + * As a workaround for that issue, this function is called by the RT and DL + * sched classes to trigger extra cpufreq updates to prevent it from stalling, + * but that really is a band-aid. Going forward it should be replaced with + * solutions targeted more specifically at RT and DL tasks. + */ +static inline void cpufreq_trigger_update(u64 time) +{ + cpufreq_update_util(time, ULONG_MAX, 0); +} + +struct update_util_data { + void (*func)(struct update_util_data *data, + u64 time, unsigned long util, unsigned long max); +}; + +void cpufreq_set_update_util_data(int cpu, struct update_util_data *data); + unsigned int cpufreq_get(unsigned int cpu); unsigned int cpufreq_quick_get(unsigned int cpu); unsigned int cpufreq_quick_get_max(unsigned int cpu); @@ -162,6 +192,10 @@ int cpufreq_update_policy(unsigned int cpu); bool have_governor_per_policy(void); struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy); #else +static inline void cpufreq_update_util(u64 time, unsigned long util, + unsigned long max) {} +static inline void cpufreq_trigger_update(u64 time) {} + static inline unsigned int cpufreq_get(unsigned int cpu) { return 0; -- cgit v1.2.3 From 68e80dae09033d778b98dc88e5bfe8fdade188e5 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 9 Feb 2016 09:01:35 +0530 Subject: Revert "cpufreq: Drop rwsem lock around CPUFREQ_GOV_POLICY_EXIT" Earlier, when the struct freq-attr was used to represent governor attributes, the standard cpufreq show/store sysfs attribute callbacks were applied to the governor tunable attributes and they always acquire the policy->rwsem lock before carrying out the operation. That could have resulted in an ABBA deadlock if governor tunable attributes are removed under policy->rwsem while one of them is being accessed concurrently (if sysfs attributes removal wins the race, it will wait for the access to complete with policy->rwsem held while the attribute callback will block on policy->rwsem indefinitely). We attempted to address this issue by dropping policy->rwsem around governor tunable attributes removal (that is, around invocations of the ->governor callback with the event arg equal to CPUFREQ_GOV_POLICY_EXIT) in cpufreq_set_policy(), but that opened up race conditions that had not been possible with policy->rwsem held all the time. The previous commit, "cpufreq: governor: New sysfs show/store callbacks for governor tunables", fixed the original ABBA deadlock by adding new governor specific show/store callbacks. We don't have to drop rwsem around invocations of governor event CPUFREQ_GOV_POLICY_EXIT anymore, and original fix can be reverted now. Fixes: 955ef4833574 (cpufreq: Drop rwsem lock around CPUFREQ_GOV_POLICY_EXIT) Signed-off-by: Viresh Kumar Reported-by: Juri Lelli Tested-by: Juri Lelli Tested-by: Shilpasri G Bhat Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq.c | 5 ----- include/linux/cpufreq.h | 4 ---- 2 files changed, 9 deletions(-) (limited to 'include') diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index e172b2a02c1d..e92e9eab7c6c 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -2205,10 +2205,7 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy, return ret; } - up_write(&policy->rwsem); ret = __cpufreq_governor(policy, CPUFREQ_GOV_POLICY_EXIT); - down_write(&policy->rwsem); - if (ret) { pr_err("%s: Failed to Exit Governor: %s (%d)\n", __func__, old_gov->name, ret); @@ -2224,9 +2221,7 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy, if (!ret) goto out; - up_write(&policy->rwsem); __cpufreq_governor(policy, CPUFREQ_GOV_POLICY_EXIT); - down_write(&policy->rwsem); } /* new governor failed, so re-start old one */ diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 704d85bf7242..cac3d1ba8200 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -100,10 +100,6 @@ struct cpufreq_policy { * - Any routine that will write to the policy structure and/or may take away * the policy altogether (eg. CPU hotplug), will hold this lock in write * mode before doing so. - * - * Additional rules: - * - Lock should not be held across - * __cpufreq_governor(data, CPUFREQ_GOV_POLICY_EXIT); */ struct rw_semaphore rwsem; -- cgit v1.2.3 From 242aa883a64d8c54cfeee47f3603b21bc705e081 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Mon, 22 Feb 2016 16:36:44 +0530 Subject: cpufreq: Remove 'policy->governor_enabled' The entire sequence of events (like INIT/START or STOP/EXIT) for which cpufreq_governor() is called, is guaranteed to be protected by policy->rwsem now. The additional checks that were added earlier (as we were forced to drop policy->rwsem before calling cpufreq_governor() for EXIT event), aren't required anymore. Over that, they weren't sufficient really. They just take care of START/STOP events, but not INIT/EXIT and the state machine was never maintained properly by them. Kill the unnecessary checks and policy->governor_enabled field. Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq.c | 17 ----------------- include/linux/cpufreq.h | 1 - 2 files changed, 18 deletions(-) (limited to 'include') diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index b3d05a905034..dd568aaf2728 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -2010,17 +2010,6 @@ static int cpufreq_governor(struct cpufreq_policy *policy, unsigned int event) pr_debug("%s: for CPU %u, event %u\n", __func__, policy->cpu, event); - if ((policy->governor_enabled && event == CPUFREQ_GOV_START) - || (!policy->governor_enabled - && (event == CPUFREQ_GOV_LIMITS || event == CPUFREQ_GOV_STOP))) { - return -EBUSY; - } - - if (event == CPUFREQ_GOV_STOP) - policy->governor_enabled = false; - else if (event == CPUFREQ_GOV_START) - policy->governor_enabled = true; - ret = policy->governor->governor(policy, event); if (!ret) { @@ -2028,12 +2017,6 @@ static int cpufreq_governor(struct cpufreq_policy *policy, unsigned int event) policy->governor->initialized++; else if (event == CPUFREQ_GOV_POLICY_EXIT) policy->governor->initialized--; - } else { - /* Restore original values */ - if (event == CPUFREQ_GOV_STOP) - policy->governor_enabled = true; - else if (event == CPUFREQ_GOV_START) - policy->governor_enabled = false; } if (((event == CPUFREQ_GOV_POLICY_INIT) && ret) || diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index cac3d1ba8200..a50c5b2e3bf2 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -80,7 +80,6 @@ struct cpufreq_policy { unsigned int last_policy; /* policy before unplug */ struct cpufreq_governor *governor; /* see below */ void *governor_data; - bool governor_enabled; /* governor start/stop flag */ char last_governor[CPUFREQ_NAME_LEN]; /* last governor used */ struct work_struct update; /* if update_policy() needs to be -- cgit v1.2.3 From adaf9fcd136970e480d7ca834c0cf25ce922ea74 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 10 Mar 2016 20:44:47 +0100 Subject: cpufreq: Move scheduler-related code to the sched directory Create cpufreq.c under kernel/sched/ and move the cpufreq code related to the scheduler to that file and to sched.h. Redefine cpufreq_update_util() as a static inline function to avoid function calls at its call sites in the scheduler code (as suggested by Peter Zijlstra). Also move the definition of struct update_util_data and declaration of cpufreq_set_update_util_data() from include/linux/cpufreq.h to include/linux/sched.h. Signed-off-by: Rafael J. Wysocki Acked-by: Peter Zijlstra (Intel) --- drivers/cpufreq/cpufreq.c | 53 -------------------------------------- drivers/cpufreq/cpufreq_governor.c | 1 + include/linux/cpufreq.h | 34 ------------------------ include/linux/sched.h | 9 +++++++ kernel/sched/Makefile | 1 + kernel/sched/cpufreq.c | 37 ++++++++++++++++++++++++++ kernel/sched/sched.h | 49 ++++++++++++++++++++++++++++++++++- 7 files changed, 96 insertions(+), 88 deletions(-) create mode 100644 kernel/sched/cpufreq.c (limited to 'include') diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 6eca12ab71d7..58e1a39b4d22 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -103,59 +103,6 @@ static struct cpufreq_driver *cpufreq_driver; static DEFINE_PER_CPU(struct cpufreq_policy *, cpufreq_cpu_data); static DEFINE_RWLOCK(cpufreq_driver_lock); -static DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); - -/** - * cpufreq_set_update_util_data - Populate the CPU's update_util_data pointer. - * @cpu: The CPU to set the pointer for. - * @data: New pointer value. - * - * Set and publish the update_util_data pointer for the given CPU. That pointer - * points to a struct update_util_data object containing a callback function - * to call from cpufreq_update_util(). That function will be called from an RCU - * read-side critical section, so it must not sleep. - * - * Callers must use RCU-sched callbacks to free any memory that might be - * accessed via the old update_util_data pointer or invoke synchronize_sched() - * right after this function to avoid use-after-free. - */ -void cpufreq_set_update_util_data(int cpu, struct update_util_data *data) -{ - if (WARN_ON(data && !data->func)) - return; - - rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), data); -} -EXPORT_SYMBOL_GPL(cpufreq_set_update_util_data); - -/** - * cpufreq_update_util - Take a note about CPU utilization changes. - * @time: Current time. - * @util: Current utilization. - * @max: Utilization ceiling. - * - * This function is called by the scheduler on every invocation of - * update_load_avg() on the CPU whose utilization is being updated. - * - * It can only be called from RCU-sched read-side critical sections. - */ -void cpufreq_update_util(u64 time, unsigned long util, unsigned long max) -{ - struct update_util_data *data; - -#ifdef CONFIG_LOCKDEP - WARN_ON(debug_locks && !rcu_read_lock_sched_held()); -#endif - - data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)); - /* - * If this isn't inside of an RCU-sched read-side critical section, data - * may become NULL after the check below. - */ - if (data) - data->func(data, time, util, max); -} - /* Flag to suspend/resume CPUFreq governors */ static bool cpufreq_suspended; diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index db46190bb246..1c25ef405616 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -18,6 +18,7 @@ #include #include +#include #include #include "cpufreq_governor.h" diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index a50c5b2e3bf2..a5ea52f793f3 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -146,36 +146,6 @@ static inline bool policy_is_shared(struct cpufreq_policy *policy) extern struct kobject *cpufreq_global_kobject; #ifdef CONFIG_CPU_FREQ -void cpufreq_update_util(u64 time, unsigned long util, unsigned long max); - -/** - * cpufreq_trigger_update - Trigger CPU performance state evaluation if needed. - * @time: Current time. - * - * The way cpufreq is currently arranged requires it to evaluate the CPU - * performance state (frequency/voltage) on a regular basis to prevent it from - * being stuck in a completely inadequate performance level for too long. - * That is not guaranteed to happen if the updates are only triggered from CFS, - * though, because they may not be coming in if RT or deadline tasks are active - * all the time (or there are RT and DL tasks only). - * - * As a workaround for that issue, this function is called by the RT and DL - * sched classes to trigger extra cpufreq updates to prevent it from stalling, - * but that really is a band-aid. Going forward it should be replaced with - * solutions targeted more specifically at RT and DL tasks. - */ -static inline void cpufreq_trigger_update(u64 time) -{ - cpufreq_update_util(time, ULONG_MAX, 0); -} - -struct update_util_data { - void (*func)(struct update_util_data *data, - u64 time, unsigned long util, unsigned long max); -}; - -void cpufreq_set_update_util_data(int cpu, struct update_util_data *data); - unsigned int cpufreq_get(unsigned int cpu); unsigned int cpufreq_quick_get(unsigned int cpu); unsigned int cpufreq_quick_get_max(unsigned int cpu); @@ -187,10 +157,6 @@ int cpufreq_update_policy(unsigned int cpu); bool have_governor_per_policy(void); struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy); #else -static inline void cpufreq_update_util(u64 time, unsigned long util, - unsigned long max) {} -static inline void cpufreq_trigger_update(u64 time) {} - static inline unsigned int cpufreq_get(unsigned int cpu) { return 0; diff --git a/include/linux/sched.h b/include/linux/sched.h index a10494a94cc3..913e755ef7b8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -3207,4 +3207,13 @@ static inline unsigned long rlimit_max(unsigned int limit) return task_rlimit_max(current, limit); } +#ifdef CONFIG_CPU_FREQ +struct update_util_data { + void (*func)(struct update_util_data *data, + u64 time, unsigned long util, unsigned long max); +}; + +void cpufreq_set_update_util_data(int cpu, struct update_util_data *data); +#endif /* CONFIG_CPU_FREQ */ + #endif diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 67687973ce80..9507522164ac 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -19,3 +19,4 @@ obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o +obj-$(CONFIG_CPU_FREQ) += cpufreq.o diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c new file mode 100644 index 000000000000..928c4ba32f68 --- /dev/null +++ b/kernel/sched/cpufreq.c @@ -0,0 +1,37 @@ +/* + * Scheduler code and data structures related to cpufreq. + * + * Copyright (C) 2016, Intel Corporation + * Author: Rafael J. Wysocki + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include "sched.h" + +DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); + +/** + * cpufreq_set_update_util_data - Populate the CPU's update_util_data pointer. + * @cpu: The CPU to set the pointer for. + * @data: New pointer value. + * + * Set and publish the update_util_data pointer for the given CPU. That pointer + * points to a struct update_util_data object containing a callback function + * to call from cpufreq_update_util(). That function will be called from an RCU + * read-side critical section, so it must not sleep. + * + * Callers must use RCU-sched callbacks to free any memory that might be + * accessed via the old update_util_data pointer or invoke synchronize_sched() + * right after this function to avoid use-after-free. + */ +void cpufreq_set_update_util_data(int cpu, struct update_util_data *data) +{ + if (WARN_ON(data && !data->func)) + return; + + rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), data); +} +EXPORT_SYMBOL_GPL(cpufreq_set_update_util_data); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f042190c8002..faf7e2758dd0 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -9,7 +9,6 @@ #include #include #include -#include #include "cpupri.h" #include "cpudeadline.h" @@ -1739,3 +1738,51 @@ static inline u64 irq_time_read(int cpu) } #endif /* CONFIG_64BIT */ #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ + +#ifdef CONFIG_CPU_FREQ +DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); + +/** + * cpufreq_update_util - Take a note about CPU utilization changes. + * @time: Current time. + * @util: Current utilization. + * @max: Utilization ceiling. + * + * This function is called by the scheduler on every invocation of + * update_load_avg() on the CPU whose utilization is being updated. + * + * It can only be called from RCU-sched read-side critical sections. + */ +static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned long max) +{ + struct update_util_data *data; + + data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)); + if (data) + data->func(data, time, util, max); +} + +/** + * cpufreq_trigger_update - Trigger CPU performance state evaluation if needed. + * @time: Current time. + * + * The way cpufreq is currently arranged requires it to evaluate the CPU + * performance state (frequency/voltage) on a regular basis to prevent it from + * being stuck in a completely inadequate performance level for too long. + * That is not guaranteed to happen if the updates are only triggered from CFS, + * though, because they may not be coming in if RT or deadline tasks are active + * all the time (or there are RT and DL tasks only). + * + * As a workaround for that issue, this function is called by the RT and DL + * sched classes to trigger extra cpufreq updates to prevent it from stalling, + * but that really is a band-aid. Going forward it should be replaced with + * solutions targeted more specifically at RT and DL tasks. + */ +static inline void cpufreq_trigger_update(u64 time) +{ + cpufreq_update_util(time, ULONG_MAX, 0); +} +#else +static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned long max) {} +static inline void cpufreq_trigger_update(u64 time) {} +#endif /* CONFIG_CPU_FREQ */ -- cgit v1.2.3