From b57100a3d9ced8c2b78e87d313f514a3338d016e Mon Sep 17 00:00:00 2001 From: Malaya Kumar Rout Date: Tue, 14 Oct 2025 01:00:27 +0530 Subject: PM: console: Fix memory allocation error handling in pm_vt_switch_required() The pm_vt_switch_required() function fails silently when memory allocation fails, offering no indication to callers that the operation was unsuccessful. This behavior prevents drivers from handling allocation errors correctly or implementing retry mechanisms. By ensuring that failures are reported back to the caller, drivers can make informed decisions, improve robustness, and avoid unexpected behavior during critical power management operations. Change the function signature to return an integer error code and modify the implementation to return -ENOMEM when kmalloc() fails. Update both the function declaration and the inline stub in include/linux/pm.h to maintain consistency across CONFIG_VT_CONSOLE_SLEEP configurations. The function now returns: - 0 on success (including when updating existing entries) - -ENOMEM when memory allocation fails This change improves error reporting without breaking existing callers, as the current callers in drivers/video/fbdev/core/fbmem.c already ignore the return value, making this a backward-compatible improvement. Reviewed-by: Lyude Paul Signed-off-by: Malaya Kumar Rout Reviewed-by: Dhruva Gole Reviewed-by: Lyude Paul Link: https://patch.msgid.link/20251013193028.89570-1-mrout@redhat.com Signed-off-by: Rafael J. Wysocki --- include/linux/pm.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/pm.h b/include/linux/pm.h index cc7b2dc28574..a72e42eec130 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -25,11 +25,12 @@ extern void (*pm_power_off)(void); struct device; /* we have a circular dep with device.h */ #ifdef CONFIG_VT_CONSOLE_SLEEP -extern void pm_vt_switch_required(struct device *dev, bool required); +extern int pm_vt_switch_required(struct device *dev, bool required); extern void pm_vt_switch_unregister(struct device *dev); #else -static inline void pm_vt_switch_required(struct device *dev, bool required) +static inline int pm_vt_switch_required(struct device *dev, bool required) { + return 0; } static inline void pm_vt_switch_unregister(struct device *dev) { -- cgit v1.2.3 From cbe5aeedecc72314c3a8fd0d41d9b270f576aee1 Mon Sep 17 00:00:00 2001 From: Changwoo Min Date: Tue, 21 Oct 2025 07:09:05 +0900 Subject: PM: EM: Assign a unique ID when creating a performance domain It is necessary to refer to a specific performance domain from a userspace. For example, the energy model of a particular performance domain is updated. To this end, assign a unique ID to each performance domain to address it, and manage them in a global linked list to look up a specific one by matching ID. IDA is used for ID assignment, and the mutex is used to protect the global list from concurrent access. Note that the mutex (em_pd_list_mutex) is not supposed to hold while holding em_pd_mutex to avoid ABBA deadlock. Signed-off-by: Changwoo Min Reviewed-by: Lukasz Luba Link: https://patch.msgid.link/20251020220914.320832-2-changwoo@igalia.com Signed-off-by: Rafael J. Wysocki --- include/linux/energy_model.h | 4 ++++ kernel/power/energy_model.c | 30 +++++++++++++++++++++++++++++- 2 files changed, 33 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 61d50571ad88..43aa6153dc57 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -54,6 +54,8 @@ struct em_perf_table { /** * struct em_perf_domain - Performance domain * @em_table: Pointer to the runtime modifiable em_perf_table + * @node: node in em_pd_list (in energy_model.c) + * @id: A unique ID number for each performance domain * @nr_perf_states: Number of performance states * @min_perf_state: Minimum allowed Performance State index * @max_perf_state: Maximum allowed Performance State index @@ -71,6 +73,8 @@ struct em_perf_table { */ struct em_perf_domain { struct em_perf_table __rcu *em_table; + struct list_head node; + int id; int nr_perf_states; int min_perf_state; int max_perf_state; diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index 5f17d2e8e954..2047b546ad11 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -23,6 +23,16 @@ */ static DEFINE_MUTEX(em_pd_mutex); +/* + * Manage performance domains with IDs. One can iterate the performance domains + * through the list and pick one with their associated ID. The mutex serializes + * the list access. When holding em_pd_list_mutex, em_pd_mutex should not be + * taken to avoid potential deadlock. + */ +static DEFINE_IDA(em_pd_ida); +static LIST_HEAD(em_pd_list); +static DEFINE_MUTEX(em_pd_list_mutex); + static void em_cpufreq_update_efficiencies(struct device *dev, struct em_perf_state *table); static void em_check_capacity_update(void); @@ -396,7 +406,7 @@ static int em_create_pd(struct device *dev, int nr_states, struct em_perf_table *em_table; struct em_perf_domain *pd; struct device *cpu_dev; - int cpu, ret, num_cpus; + int cpu, ret, num_cpus, id; if (_is_cpu_device(dev)) { num_cpus = cpumask_weight(cpus); @@ -420,6 +430,13 @@ static int em_create_pd(struct device *dev, int nr_states, pd->nr_perf_states = nr_states; + INIT_LIST_HEAD(&pd->node); + + id = ida_alloc(&em_pd_ida, GFP_KERNEL); + if (id < 0) + return -ENOMEM; + pd->id = id; + em_table = em_table_alloc(pd); if (!em_table) goto free_pd; @@ -444,6 +461,7 @@ free_pd_table: kfree(em_table); free_pd: kfree(pd); + ida_free(&em_pd_ida, id); return -EINVAL; } @@ -660,6 +678,10 @@ int em_dev_register_pd_no_update(struct device *dev, unsigned int nr_states, unlock: mutex_unlock(&em_pd_mutex); + mutex_lock(&em_pd_list_mutex); + list_add_tail(&dev->em_pd->node, &em_pd_list); + mutex_unlock(&em_pd_list_mutex); + return ret; } EXPORT_SYMBOL_GPL(em_dev_register_pd_no_update); @@ -678,6 +700,10 @@ void em_dev_unregister_perf_domain(struct device *dev) if (_is_cpu_device(dev)) return; + mutex_lock(&em_pd_list_mutex); + list_del_init(&dev->em_pd->node); + mutex_unlock(&em_pd_list_mutex); + /* * The mutex separates all register/unregister requests and protects * from potential clean-up/setup issues in the debugfs directories. @@ -689,6 +715,8 @@ void em_dev_unregister_perf_domain(struct device *dev) em_table_free(rcu_dereference_protected(dev->em_pd->em_table, lockdep_is_held(&em_pd_mutex))); + ida_free(&em_pd_ida, dev->em_pd->id); + kfree(dev->em_pd); dev->em_pd = NULL; mutex_unlock(&em_pd_mutex); -- cgit v1.2.3 From bd26631ccdfd11701fa29e665a7f041875ba9423 Mon Sep 17 00:00:00 2001 From: Changwoo Min Date: Tue, 21 Oct 2025 07:09:07 +0900 Subject: PM: EM: Add em.yaml and autogen files Add a generic netlink spec in YAML format and autogenerate boilerplate code using ynl-regen.sh to introduce a generic netlink for the energy model. It allows a userspace program to read the performance domain and its energy model. It notifies the userspace program when a performance domain is created or deleted or its energy model is updated through a multicast interface. Specifically, it supports two commands: - EM_CMD_GET_PDS: Get the list of information for all performance domains. - EM_CMD_GET_PD_TABLE: Get the energy model table of a performance domain. Also, it supports three notification events: - EM_CMD_PD_CREATED: When a performance domain is created. - EM_CMD_PD_DELETED: When a performance domain is deleted. - EM_CMD_PD_UPDATED: When the energy model table of a performance domain is updated. Finally, update MAINTAINERS to include new files. Signed-off-by: Changwoo Min Reviewed-by: Lukasz Luba Link: https://patch.msgid.link/20251020220914.320832-4-changwoo@igalia.com Signed-off-by: Rafael J. Wysocki --- Documentation/netlink/specs/em.yaml | 113 ++++++++++++++++++++++++++++++++++++ MAINTAINERS | 3 + include/uapi/linux/energy_model.h | 62 ++++++++++++++++++++ kernel/power/em_netlink_autogen.c | 48 +++++++++++++++ kernel/power/em_netlink_autogen.h | 23 ++++++++ 5 files changed, 249 insertions(+) create mode 100644 Documentation/netlink/specs/em.yaml create mode 100644 include/uapi/linux/energy_model.h create mode 100644 kernel/power/em_netlink_autogen.c create mode 100644 kernel/power/em_netlink_autogen.h (limited to 'include') diff --git a/Documentation/netlink/specs/em.yaml b/Documentation/netlink/specs/em.yaml new file mode 100644 index 000000000000..9905ca482325 --- /dev/null +++ b/Documentation/netlink/specs/em.yaml @@ -0,0 +1,113 @@ +# SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) + +name: em + +doc: | + Energy model netlink interface to notify its changes. + +protocol: genetlink + +uapi-header: linux/energy_model.h + +attribute-sets: + - + name: pds + attributes: + - + name: pd + type: nest + nested-attributes: pd + multi-attr: true + - + name: pd + attributes: + - + name: pad + type: pad + - + name: pd-id + type: u32 + - + name: flags + type: u64 + - + name: cpus + type: string + - + name: pd-table + attributes: + - + name: pd-id + type: u32 + - + name: ps + type: nest + nested-attributes: ps + multi-attr: true + - + name: ps + attributes: + - + name: pad + type: pad + - + name: performance + type: u64 + - + name: frequency + type: u64 + - + name: power + type: u64 + - + name: cost + type: u64 + - + name: flags + type: u64 + +operations: + list: + - + name: get-pds + attribute-set: pds + doc: Get the list of information for all performance domains. + do: + reply: + attributes: + - pd + - + name: get-pd-table + attribute-set: pd-table + doc: Get the energy model table of a performance domain. + do: + request: + attributes: + - pd-id + reply: + attributes: + - pd-id + - ps + - + name: pd-created + doc: A performance domain is created. + notify: get-pd-table + mcgrp: event + - + name: pd-updated + doc: A performance domain is updated. + notify: get-pd-table + mcgrp: event + - + name: pd-deleted + doc: A performance domain is deleted. + attribute-set: pd-table + event: + attributes: + - pd-id + mcgrp: event + +mcast-groups: + list: + - + name: event diff --git a/MAINTAINERS b/MAINTAINERS index 545a4776795e..e6b3bab9dbeb 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9181,6 +9181,9 @@ S: Maintained F: kernel/power/energy_model.c F: include/linux/energy_model.h F: Documentation/power/energy-model.rst +F: Documentation/netlink/specs/em.yaml +F: include/uapi/linux/energy_model.h +F: kernel/power/em_netlink_autogen.* EPAPR HYPERVISOR BYTE CHANNEL DEVICE DRIVER M: Laurentiu Tudor diff --git a/include/uapi/linux/energy_model.h b/include/uapi/linux/energy_model.h new file mode 100644 index 000000000000..4ec4c0eabbbb --- /dev/null +++ b/include/uapi/linux/energy_model.h @@ -0,0 +1,62 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/em.yaml */ +/* YNL-GEN uapi header */ + +#ifndef _UAPI_LINUX_ENERGY_MODEL_H +#define _UAPI_LINUX_ENERGY_MODEL_H + +#define EM_FAMILY_NAME "em" +#define EM_FAMILY_VERSION 1 + +enum { + EM_A_PDS_PD = 1, + + __EM_A_PDS_MAX, + EM_A_PDS_MAX = (__EM_A_PDS_MAX - 1) +}; + +enum { + EM_A_PD_PAD = 1, + EM_A_PD_PD_ID, + EM_A_PD_FLAGS, + EM_A_PD_CPUS, + + __EM_A_PD_MAX, + EM_A_PD_MAX = (__EM_A_PD_MAX - 1) +}; + +enum { + EM_A_PD_TABLE_PD_ID = 1, + EM_A_PD_TABLE_PS, + + __EM_A_PD_TABLE_MAX, + EM_A_PD_TABLE_MAX = (__EM_A_PD_TABLE_MAX - 1) +}; + +enum { + EM_A_PS_PAD = 1, + EM_A_PS_PERFORMANCE, + EM_A_PS_FREQUENCY, + EM_A_PS_POWER, + EM_A_PS_COST, + EM_A_PS_FLAGS, + + __EM_A_PS_MAX, + EM_A_PS_MAX = (__EM_A_PS_MAX - 1) +}; + +enum { + EM_CMD_GET_PDS = 1, + EM_CMD_GET_PD_TABLE, + EM_CMD_PD_CREATED, + EM_CMD_PD_UPDATED, + EM_CMD_PD_DELETED, + + __EM_CMD_MAX, + EM_CMD_MAX = (__EM_CMD_MAX - 1) +}; + +#define EM_MCGRP_EVENT "event" + +#endif /* _UAPI_LINUX_ENERGY_MODEL_H */ diff --git a/kernel/power/em_netlink_autogen.c b/kernel/power/em_netlink_autogen.c new file mode 100644 index 000000000000..a7a09ab1d1c2 --- /dev/null +++ b/kernel/power/em_netlink_autogen.c @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/em.yaml */ +/* YNL-GEN kernel source */ + +#include +#include + +#include "em_netlink_autogen.h" + +#include + +/* EM_CMD_GET_PD_TABLE - do */ +static const struct nla_policy em_get_pd_table_nl_policy[EM_A_PD_TABLE_PD_ID + 1] = { + [EM_A_PD_TABLE_PD_ID] = { .type = NLA_U32, }, +}; + +/* Ops table for em */ +static const struct genl_split_ops em_nl_ops[] = { + { + .cmd = EM_CMD_GET_PDS, + .doit = em_nl_get_pds_doit, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = EM_CMD_GET_PD_TABLE, + .doit = em_nl_get_pd_table_doit, + .policy = em_get_pd_table_nl_policy, + .maxattr = EM_A_PD_TABLE_PD_ID, + .flags = GENL_CMD_CAP_DO, + }, +}; + +static const struct genl_multicast_group em_nl_mcgrps[] = { + [EM_NLGRP_EVENT] = { "event", }, +}; + +struct genl_family em_nl_family __ro_after_init = { + .name = EM_FAMILY_NAME, + .version = EM_FAMILY_VERSION, + .netnsok = true, + .parallel_ops = true, + .module = THIS_MODULE, + .split_ops = em_nl_ops, + .n_split_ops = ARRAY_SIZE(em_nl_ops), + .mcgrps = em_nl_mcgrps, + .n_mcgrps = ARRAY_SIZE(em_nl_mcgrps), +}; diff --git a/kernel/power/em_netlink_autogen.h b/kernel/power/em_netlink_autogen.h new file mode 100644 index 000000000000..78ce609641f1 --- /dev/null +++ b/kernel/power/em_netlink_autogen.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/em.yaml */ +/* YNL-GEN kernel header */ + +#ifndef _LINUX_EM_GEN_H +#define _LINUX_EM_GEN_H + +#include +#include + +#include + +int em_nl_get_pds_doit(struct sk_buff *skb, struct genl_info *info); +int em_nl_get_pd_table_doit(struct sk_buff *skb, struct genl_info *info); + +enum { + EM_NLGRP_EVENT, +}; + +extern struct genl_family em_nl_family; + +#endif /* _LINUX_EM_GEN_H */ -- cgit v1.2.3 From 8e4ec90701efec7f2814c89b398d6d4272636814 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 20 Oct 2025 07:55:55 -1000 Subject: freezer: Clarify that only cgroup1 freezer uses PM freezer cgroup1 freezer piggybacks on the PM freezer, which inadvertently allowed userspace to produce uninterruptible tasks at will. To avoid the issue, cgroup2 freezer switched to a separate job control based mechanism. While this happened a long time ago, the code and comment haven't been updated making it confusing to people who aren't familiar with the history. Rename cgroup_freezing() to cgroup1_freezing() and update comments on top of freezing() and frozen() to clarify that cgroup2 freezer isn't covered by the PM freezer mechanism. Signed-off-by: Tejun Heo Suggested-by: Qu Wenruo Link: https://patch.msgid.link/aPZ3q6Hm865NicBC@slm.duckdns.org Signed-off-by: Rafael J. Wysocki --- include/linux/freezer.h | 12 ++++++++---- kernel/cgroup/legacy_freezer.c | 2 +- kernel/freezer.c | 2 +- 3 files changed, 10 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/freezer.h b/include/linux/freezer.h index 32884c9721e5..0a8c6c4d1a82 100644 --- a/include/linux/freezer.h +++ b/include/linux/freezer.h @@ -22,14 +22,18 @@ extern bool pm_nosig_freezing; /* PM nosig freezing in effect */ extern unsigned int freeze_timeout_msecs; /* - * Check if a process has been frozen + * Check if a process has been frozen for PM or cgroup1 freezer. Note that + * cgroup2 freezer uses the job control mechanism and does not interact with + * the PM freezer. */ extern bool frozen(struct task_struct *p); extern bool freezing_slow_path(struct task_struct *p); /* - * Check if there is a request to freeze a process + * Check if there is a request to freeze a task from PM or cgroup1 freezer. + * Note that cgroup2 freezer uses the job control mechanism and does not + * interact with the PM freezer. */ static inline bool freezing(struct task_struct *p) { @@ -63,9 +67,9 @@ extern bool freeze_task(struct task_struct *p); extern bool set_freezable(void); #ifdef CONFIG_CGROUP_FREEZER -extern bool cgroup_freezing(struct task_struct *task); +extern bool cgroup1_freezing(struct task_struct *task); #else /* !CONFIG_CGROUP_FREEZER */ -static inline bool cgroup_freezing(struct task_struct *task) +static inline bool cgroup1_freezing(struct task_struct *task) { return false; } diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c index dd9417425d92..915b02f65980 100644 --- a/kernel/cgroup/legacy_freezer.c +++ b/kernel/cgroup/legacy_freezer.c @@ -63,7 +63,7 @@ static struct freezer *parent_freezer(struct freezer *freezer) return css_freezer(freezer->css.parent); } -bool cgroup_freezing(struct task_struct *task) +bool cgroup1_freezing(struct task_struct *task) { bool ret; diff --git a/kernel/freezer.c b/kernel/freezer.c index ddc11a8bd2ea..a76bf957fb32 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -44,7 +44,7 @@ bool freezing_slow_path(struct task_struct *p) if (tsk_is_oom_victim(p)) return false; - if (pm_nosig_freezing || cgroup_freezing(p)) + if (pm_nosig_freezing || cgroup1_freezing(p)) return true; if (pm_freezing && !(p->flags & PF_KTHREAD)) -- cgit v1.2.3 From 0ca04993dac9b0d21ffbfd22bf54cc43ec2c49f2 Mon Sep 17 00:00:00 2001 From: "Mario Limonciello (AMD)" Date: Wed, 12 Nov 2025 16:40:23 -0600 Subject: PM: Introduce new PMSG_POWEROFF event PMSG_POWEROFF will be used for the PM core to allow differentiating between a hibernation or shutdown sequence when re-using callbacks for common code. Hibernation is started by writing a hibernation method (such as 'platform' 'shutdown', or 'reboot') to use into /sys/power/disk and writing 'disk' to /sys/power/state. Shutdown is initiated with the reboot() syscall with arguments on whether to halt the system or power it off. Tested-by: Eric Naim Signed-off-by: Mario Limonciello (AMD) Link: https://patch.msgid.link/20251112224025.2051702-2-superm1@kernel.org Signed-off-by: Rafael J. Wysocki --- drivers/base/power/main.c | 5 +++++ include/linux/pm.h | 3 +++ include/trace/events/power.h | 3 ++- 3 files changed, 10 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 7a8807ec9a5d..38fc8a978b88 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -96,6 +96,8 @@ static const char *pm_verb(int event) return "restore"; case PM_EVENT_RECOVER: return "recover"; + case PM_EVENT_POWEROFF: + return "poweroff"; default: return "(unknown PM event)"; } @@ -368,6 +370,7 @@ static pm_callback_t pm_op(const struct dev_pm_ops *ops, pm_message_t state) case PM_EVENT_FREEZE: case PM_EVENT_QUIESCE: return ops->freeze; + case PM_EVENT_POWEROFF: case PM_EVENT_HIBERNATE: return ops->poweroff; case PM_EVENT_THAW: @@ -402,6 +405,7 @@ static pm_callback_t pm_late_early_op(const struct dev_pm_ops *ops, case PM_EVENT_FREEZE: case PM_EVENT_QUIESCE: return ops->freeze_late; + case PM_EVENT_POWEROFF: case PM_EVENT_HIBERNATE: return ops->poweroff_late; case PM_EVENT_THAW: @@ -436,6 +440,7 @@ static pm_callback_t pm_noirq_op(const struct dev_pm_ops *ops, pm_message_t stat case PM_EVENT_FREEZE: case PM_EVENT_QUIESCE: return ops->freeze_noirq; + case PM_EVENT_POWEROFF: case PM_EVENT_HIBERNATE: return ops->poweroff_noirq; case PM_EVENT_THAW: diff --git a/include/linux/pm.h b/include/linux/pm.h index a72e42eec130..7f69f739f613 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -508,6 +508,7 @@ const struct dev_pm_ops name = { \ * RECOVER Creation of a hibernation image or restoration of the main * memory contents from a hibernation image has failed, call * ->thaw() and ->complete() for all devices. + * POWEROFF System will poweroff, call ->poweroff() for all devices. * * The following PM_EVENT_ messages are defined for internal use by * kernel subsystems. They are never issued by the PM core. @@ -538,6 +539,7 @@ const struct dev_pm_ops name = { \ #define PM_EVENT_USER 0x0100 #define PM_EVENT_REMOTE 0x0200 #define PM_EVENT_AUTO 0x0400 +#define PM_EVENT_POWEROFF 0x0800 #define PM_EVENT_SLEEP (PM_EVENT_SUSPEND | PM_EVENT_HIBERNATE) #define PM_EVENT_USER_SUSPEND (PM_EVENT_USER | PM_EVENT_SUSPEND) @@ -552,6 +554,7 @@ const struct dev_pm_ops name = { \ #define PMSG_QUIESCE ((struct pm_message){ .event = PM_EVENT_QUIESCE, }) #define PMSG_SUSPEND ((struct pm_message){ .event = PM_EVENT_SUSPEND, }) #define PMSG_HIBERNATE ((struct pm_message){ .event = PM_EVENT_HIBERNATE, }) +#define PMSG_POWEROFF ((struct pm_message){ .event = PM_EVENT_POWEROFF, }) #define PMSG_RESUME ((struct pm_message){ .event = PM_EVENT_RESUME, }) #define PMSG_THAW ((struct pm_message){ .event = PM_EVENT_THAW, }) #define PMSG_RESTORE ((struct pm_message){ .event = PM_EVENT_RESTORE, }) diff --git a/include/trace/events/power.h b/include/trace/events/power.h index 82904291c2b8..370f8df2fdb4 100644 --- a/include/trace/events/power.h +++ b/include/trace/events/power.h @@ -179,7 +179,8 @@ TRACE_EVENT(pstate_sample, { PM_EVENT_HIBERNATE, "hibernate" }, \ { PM_EVENT_THAW, "thaw" }, \ { PM_EVENT_RESTORE, "restore" }, \ - { PM_EVENT_RECOVER, "recover" }) + { PM_EVENT_RECOVER, "recover" }, \ + { PM_EVENT_POWEROFF, "poweroff" }) DEFINE_EVENT(cpu, cpu_frequency, -- cgit v1.2.3 From ef8057b07c72a817537856b98d6e7493b9404eaf Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 13 Nov 2025 20:33:33 +0100 Subject: PM: runtime: Wrapper macros for ACQUIRE()/ACQUIRE_ERR() Add wrapper macros for ACQUIRE()/ACQUIRE_ERR() and runtime PM usage counter guards introduced recently: pm_runtime_active_try, pm_runtime_active_auto_try, pm_runtime_active_try_enabled, and pm_runtime_active_auto_try_enabled. The new macros should be more straightforward to use. For example, they can be used for rewriting a piece of code like below: ACQUIRE(pm_runtime_active_try, pm)(dev); if ((ret = ACQUIRE_ERR(pm_runtime_active_try, &pm))) return ret; in the following way: PM_RUNTIME_ACQUIRE(dev, pm); if ((ret = PM_RUNTIME_ACQUIRE_ERR(&pm))) return ret; If the original code does not care about the specific error code returned when attepmting to resume the device: ACQUIRE(pm_runtime_active_try, pm)(dev); if (ACQUIRE_ERR(pm_runtime_active_try, &pm)) return -ENXIO; it may be changed like this: PM_RUNTIME_ACQUIRE(dev, pm); if (PM_RUNTIME_ACQUIRE_ERR(&pm)) return -ENXIO; Link: https://lore.kernel.org/linux-pm/5068916.31r3eYUQgx@rafael.j.wysocki/ Signed-off-by: Rafael J. Wysocki Reviewed-by: Dan Williams Reviewed-by: Dhruva Gole Reviewed-by: Jonathan Cameron Reviewed-by: Frank Li Link: https://patch.msgid.link/3400866.aeNJFYEL58@rafael.j.wysocki --- include/linux/pm_runtime.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include') diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index 0b436e15f4cd..911d7a4d32c1 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -637,6 +637,30 @@ DEFINE_GUARD_COND(pm_runtime_active_auto, _try, DEFINE_GUARD_COND(pm_runtime_active_auto, _try_enabled, pm_runtime_resume_and_get(_T), _RET == 0) +/* ACQUIRE() wrapper macros for the guards defined above. */ + +#define PM_RUNTIME_ACQUIRE(_dev, _var) \ + ACQUIRE(pm_runtime_active_try, _var)(_dev) + +#define PM_RUNTIME_ACQUIRE_AUTOSUSPEND(_dev, _var) \ + ACQUIRE(pm_runtime_active_auto_try, _var)(_dev) + +#define PM_RUNTIME_ACQUIRE_IF_ENABLED(_dev, _var) \ + ACQUIRE(pm_runtime_active_try_enabled, _var)(_dev) + +#define PM_RUNTIME_ACQUIRE_IF_ENABLED_AUTOSUSPEND(_dev, _var) \ + ACQUIRE(pm_runtime_active_auto_try_enabled, _var)(_dev) + +/* + * ACQUIRE_ERR() wrapper macro for guard pm_runtime_active. + * + * Always check PM_RUNTIME_ACQUIRE_ERR() after using one of the + * PM_RUNTIME_ACQUIRE*() macros defined above (yes, it can be used with + * any of them) and if it is nonzero, avoid accessing the given device. + */ +#define PM_RUNTIME_ACQUIRE_ERR(_var_ptr) \ + ACQUIRE_ERR(pm_runtime_active, _var_ptr) + /** * pm_runtime_put_sync - Drop device usage counter and run "idle check" if 0. * @dev: Target device. -- cgit v1.2.3 From 1d6c915819f5b805c35487b6ce5923e31a28266b Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Thu, 20 Nov 2025 16:05:38 -0800 Subject: powercap: intel_rapl: Prepare read_raw() interface for atomic-context callers The current read_raw() implementation of the TPMI, MMIO and MSR interfaces does not distinguish between atomic and non-atomic callers. rapl_msr_read_raw() uses rdmsrq_safe_on_cpu(), which can sleep and issue cross CPU calls. When MSR-based RAPL PMU support is enabled, PMU event handlers can invoke this function from atomic context where sleeping or rescheduling is not allowed. In atomic context, the caller is already executing on the target CPU, so a direct rdmsrq() is sufficient. To support such usage, introduce an atomic flag to the read_raw() interface to allow callers pass the context information. Modify the common RAPL code to propagate this flag, and set the flag to reflect the calling contexts. Utilize the atomic flag in rapl_msr_read_raw() to perform direct MSR read with rdmsrq() when running in atomic context, and a sanity check to ensure target CPU matches the current CPU for such use cases. The TPMI and MMIO implementations do not require special atomic handling, so the flag is ignored in those paths. This is a preparatory patch for adding MSR-based RAPL PMU support. Signed-off-by: Kuppuswamy Sathyanarayanan Reviewed-by: Srinivas Pandruvada [ rjw: Subject tweak ] Link: https://patch.msgid.link/20251121000539.386069-2-sathyanarayanan.kuppuswamy@linux.intel.com Signed-off-by: Rafael J. Wysocki --- drivers/powercap/intel_rapl_common.c | 24 ++++++++++++---------- drivers/powercap/intel_rapl_msr.c | 16 ++++++++++++++- drivers/powercap/intel_rapl_tpmi.c | 2 +- .../intel/int340x_thermal/processor_thermal_rapl.c | 2 +- include/linux/intel_rapl.h | 2 +- 5 files changed, 31 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c index 57bebd07c7d0..47ec34d4c099 100644 --- a/drivers/powercap/intel_rapl_common.c +++ b/drivers/powercap/intel_rapl_common.c @@ -253,7 +253,8 @@ struct rapl_primitive_info { static void rapl_init_domains(struct rapl_package *rp); static int rapl_read_data_raw(struct rapl_domain *rd, enum rapl_primitives prim, - bool xlate, u64 *data); + bool xlate, u64 *data, + bool atomic); static int rapl_write_data_raw(struct rapl_domain *rd, enum rapl_primitives prim, unsigned long long value); @@ -289,7 +290,7 @@ static int get_energy_counter(struct powercap_zone *power_zone, cpus_read_lock(); rd = power_zone_to_rapl_domain(power_zone); - if (!rapl_read_data_raw(rd, ENERGY_COUNTER, true, &energy_now)) { + if (!rapl_read_data_raw(rd, ENERGY_COUNTER, true, &energy_now, false)) { *energy_raw = energy_now; cpus_read_unlock(); @@ -830,7 +831,8 @@ prim_fixups(struct rapl_domain *rd, enum rapl_primitives prim) * 63-------------------------- 31--------------------------- 0 */ static int rapl_read_data_raw(struct rapl_domain *rd, - enum rapl_primitives prim, bool xlate, u64 *data) + enum rapl_primitives prim, bool xlate, u64 *data, + bool atomic) { u64 value; enum rapl_primitives prim_fixed = prim_fixups(rd, prim); @@ -852,7 +854,7 @@ static int rapl_read_data_raw(struct rapl_domain *rd, ra.mask = rpi->mask; - if (rd->rp->priv->read_raw(get_rid(rd->rp), &ra)) { + if (rd->rp->priv->read_raw(get_rid(rd->rp), &ra, atomic)) { pr_debug("failed to read reg 0x%llx for %s:%s\n", ra.reg.val, rd->rp->name, rd->name); return -EIO; } @@ -904,7 +906,7 @@ static int rapl_read_pl_data(struct rapl_domain *rd, int pl, if (!is_pl_valid(rd, pl)) return -EINVAL; - return rapl_read_data_raw(rd, prim, xlate, data); + return rapl_read_data_raw(rd, prim, xlate, data, false); } static int rapl_write_pl_data(struct rapl_domain *rd, int pl, @@ -941,7 +943,7 @@ static int rapl_check_unit_core(struct rapl_domain *rd) ra.reg = rd->regs[RAPL_DOMAIN_REG_UNIT]; ra.mask = ~0; - if (rd->rp->priv->read_raw(get_rid(rd->rp), &ra)) { + if (rd->rp->priv->read_raw(get_rid(rd->rp), &ra, false)) { pr_err("Failed to read power unit REG 0x%llx on %s:%s, exit.\n", ra.reg.val, rd->rp->name, rd->name); return -ENODEV; @@ -969,7 +971,7 @@ static int rapl_check_unit_atom(struct rapl_domain *rd) ra.reg = rd->regs[RAPL_DOMAIN_REG_UNIT]; ra.mask = ~0; - if (rd->rp->priv->read_raw(get_rid(rd->rp), &ra)) { + if (rd->rp->priv->read_raw(get_rid(rd->rp), &ra, false)) { pr_err("Failed to read power unit REG 0x%llx on %s:%s, exit.\n", ra.reg.val, rd->rp->name, rd->name); return -ENODEV; @@ -1156,7 +1158,7 @@ static int rapl_check_unit_tpmi(struct rapl_domain *rd) ra.reg = rd->regs[RAPL_DOMAIN_REG_UNIT]; ra.mask = ~0; - if (rd->rp->priv->read_raw(get_rid(rd->rp), &ra)) { + if (rd->rp->priv->read_raw(get_rid(rd->rp), &ra, false)) { pr_err("Failed to read power unit REG 0x%llx on %s:%s, exit.\n", ra.reg.val, rd->rp->name, rd->name); return -ENODEV; @@ -1328,7 +1330,7 @@ static void rapl_update_domain_data(struct rapl_package *rp) struct rapl_primitive_info *rpi = get_rpi(rp, prim); if (!rapl_read_data_raw(&rp->domains[dmn], prim, - rpi->unit, &val)) + rpi->unit, &val, false)) rp->domains[dmn].rdd.primitives[prim] = val; } } @@ -1428,7 +1430,7 @@ static int rapl_check_domain(int domain, struct rapl_package *rp) */ ra.mask = ENERGY_STATUS_MASK; - if (rp->priv->read_raw(get_rid(rp), &ra) || !ra.value) + if (rp->priv->read_raw(get_rid(rp), &ra, false) || !ra.value) return -ENODEV; return 0; @@ -1639,7 +1641,7 @@ static u64 event_read_counter(struct perf_event *event) if (event->hw.idx < 0) return 0; - ret = rapl_read_data_raw(&rp->domains[event->hw.idx], ENERGY_COUNTER, false, &val); + ret = rapl_read_data_raw(&rp->domains[event->hw.idx], ENERGY_COUNTER, false, &val, true); /* Return 0 for failed read */ if (ret) diff --git a/drivers/powercap/intel_rapl_msr.c b/drivers/powercap/intel_rapl_msr.c index c6b9a7debc35..6e3c50af0912 100644 --- a/drivers/powercap/intel_rapl_msr.c +++ b/drivers/powercap/intel_rapl_msr.c @@ -102,12 +102,26 @@ static int rapl_cpu_down_prep(unsigned int cpu) return 0; } -static int rapl_msr_read_raw(int cpu, struct reg_action *ra) +static int rapl_msr_read_raw(int cpu, struct reg_action *ra, bool atomic) { + /* + * When called from atomic-context (eg PMU event handler) + * perform MSR read directly using rdmsrq(). + */ + if (atomic) { + if (unlikely(smp_processor_id() != cpu)) + return -EIO; + + rdmsrq(ra->reg.msr, ra->value); + goto out; + } + if (rdmsrq_safe_on_cpu(cpu, ra->reg.msr, &ra->value)) { pr_debug("failed to read msr 0x%x on cpu %d\n", ra->reg.msr, cpu); return -EIO; } + +out: ra->value &= ra->mask; return 0; } diff --git a/drivers/powercap/intel_rapl_tpmi.c b/drivers/powercap/intel_rapl_tpmi.c index 82201bf4685d..0a0b85f4528b 100644 --- a/drivers/powercap/intel_rapl_tpmi.c +++ b/drivers/powercap/intel_rapl_tpmi.c @@ -60,7 +60,7 @@ static DEFINE_MUTEX(tpmi_rapl_lock); static struct powercap_control_type *tpmi_control_type; -static int tpmi_rapl_read_raw(int id, struct reg_action *ra) +static int tpmi_rapl_read_raw(int id, struct reg_action *ra, bool atomic) { if (!ra->reg.mmio) return -EINVAL; diff --git a/drivers/thermal/intel/int340x_thermal/processor_thermal_rapl.c b/drivers/thermal/intel/int340x_thermal/processor_thermal_rapl.c index bde2cc386afd..bf51a17c5be6 100644 --- a/drivers/thermal/intel/int340x_thermal/processor_thermal_rapl.c +++ b/drivers/thermal/intel/int340x_thermal/processor_thermal_rapl.c @@ -19,7 +19,7 @@ static const struct rapl_mmio_regs rapl_mmio_default = { .limits[RAPL_DOMAIN_DRAM] = BIT(POWER_LIMIT2), }; -static int rapl_mmio_read_raw(int cpu, struct reg_action *ra) +static int rapl_mmio_read_raw(int cpu, struct reg_action *ra, bool atomic) { if (!ra->reg.mmio) return -EINVAL; diff --git a/include/linux/intel_rapl.h b/include/linux/intel_rapl.h index c0397423d3a8..e9ade2ff4af6 100644 --- a/include/linux/intel_rapl.h +++ b/include/linux/intel_rapl.h @@ -152,7 +152,7 @@ struct rapl_if_priv { union rapl_reg reg_unit; union rapl_reg regs[RAPL_DOMAIN_MAX][RAPL_DOMAIN_REG_MAX]; int limits[RAPL_DOMAIN_MAX]; - int (*read_raw)(int id, struct reg_action *ra); + int (*read_raw)(int id, struct reg_action *ra, bool atomic); int (*write_raw)(int id, struct reg_action *ra); void *defaults; void *rpi; -- cgit v1.2.3 From 447c4e8338dbfad517769d26b53d633b88d51184 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Thu, 30 Oct 2025 20:26:28 +0200 Subject: PM / devfreq: Move governor.h to a public header location Some device drivers (and out-of-tree modules) might want to define device-specific device governors. Rather than restricting all of them to be a part of drivers/devfreq/ (which is not possible for out-of-tree drivers anyway) move governor.h to include/linux/devfreq-governor.h and update all drivers to use it. The devfreq_cpu_data is only used internally, by the passive governor, so it is moved to the driver source rather than being a part of the public interface. Reported-by: Robie Basak Acked-by: Jon Hunter Signed-off-by: Dmitry Baryshkov Reviewed-by: Bjorn Andersson Acked-by: MyungJoo Ham Signed-off-by: Chanwoo Choi Link: https://patchwork.kernel.org/project/linux-pm/patch/20251030-governor-public-v2-1-432a11a9975a@oss.qualcomm.com/ --- drivers/devfreq/devfreq.c | 2 +- drivers/devfreq/governor.h | 127 ------------------------------ drivers/devfreq/governor_passive.c | 27 ++++++- drivers/devfreq/governor_performance.c | 2 +- drivers/devfreq/governor_powersave.c | 2 +- drivers/devfreq/governor_simpleondemand.c | 2 +- drivers/devfreq/governor_userspace.c | 2 +- drivers/devfreq/hisi_uncore_freq.c | 3 +- drivers/devfreq/tegra30-devfreq.c | 3 +- include/linux/devfreq-governor.h | 102 ++++++++++++++++++++++++ 10 files changed, 135 insertions(+), 137 deletions(-) delete mode 100644 drivers/devfreq/governor.h create mode 100644 include/linux/devfreq-governor.h (limited to 'include') diff --git a/drivers/devfreq/devfreq.c b/drivers/devfreq/devfreq.c index 2e8d01d47f69..00979f2e0e27 100644 --- a/drivers/devfreq/devfreq.c +++ b/drivers/devfreq/devfreq.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -28,7 +29,6 @@ #include #include #include -#include "governor.h" #define CREATE_TRACE_POINTS #include diff --git a/drivers/devfreq/governor.h b/drivers/devfreq/governor.h deleted file mode 100644 index 0adfebc0467a..000000000000 --- a/drivers/devfreq/governor.h +++ /dev/null @@ -1,127 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * governor.h - internal header for devfreq governors. - * - * Copyright (C) 2011 Samsung Electronics - * MyungJoo Ham - * - * This header is for devfreq governors in drivers/devfreq/ - */ - -#ifndef _GOVERNOR_H -#define _GOVERNOR_H - -#include - -#define DEVFREQ_NAME_LEN 16 - -#define to_devfreq(DEV) container_of((DEV), struct devfreq, dev) - -/* Devfreq events */ -#define DEVFREQ_GOV_START 0x1 -#define DEVFREQ_GOV_STOP 0x2 -#define DEVFREQ_GOV_UPDATE_INTERVAL 0x3 -#define DEVFREQ_GOV_SUSPEND 0x4 -#define DEVFREQ_GOV_RESUME 0x5 - -#define DEVFREQ_MIN_FREQ 0 -#define DEVFREQ_MAX_FREQ ULONG_MAX - -/* - * Definition of the governor feature flags - * - DEVFREQ_GOV_FLAG_IMMUTABLE - * : This governor is never changeable to other governors. - * - DEVFREQ_GOV_FLAG_IRQ_DRIVEN - * : The devfreq won't schedule the work for this governor. - */ -#define DEVFREQ_GOV_FLAG_IMMUTABLE BIT(0) -#define DEVFREQ_GOV_FLAG_IRQ_DRIVEN BIT(1) - -/* - * Definition of governor attribute flags except for common sysfs attributes - * - DEVFREQ_GOV_ATTR_POLLING_INTERVAL - * : Indicate polling_interval sysfs attribute - * - DEVFREQ_GOV_ATTR_TIMER - * : Indicate timer sysfs attribute - */ -#define DEVFREQ_GOV_ATTR_POLLING_INTERVAL BIT(0) -#define DEVFREQ_GOV_ATTR_TIMER BIT(1) - -/** - * struct devfreq_cpu_data - Hold the per-cpu data - * @node: list node - * @dev: reference to cpu device. - * @first_cpu: the cpumask of the first cpu of a policy. - * @opp_table: reference to cpu opp table. - * @cur_freq: the current frequency of the cpu. - * @min_freq: the min frequency of the cpu. - * @max_freq: the max frequency of the cpu. - * - * This structure stores the required cpu_data of a cpu. - * This is auto-populated by the governor. - */ -struct devfreq_cpu_data { - struct list_head node; - - struct device *dev; - unsigned int first_cpu; - - struct opp_table *opp_table; - unsigned int cur_freq; - unsigned int min_freq; - unsigned int max_freq; -}; - -/** - * struct devfreq_governor - Devfreq policy governor - * @node: list node - contains registered devfreq governors - * @name: Governor's name - * @attrs: Governor's sysfs attribute flags - * @flags: Governor's feature flags - * @get_target_freq: Returns desired operating frequency for the device. - * Basically, get_target_freq will run - * devfreq_dev_profile.get_dev_status() to get the - * status of the device (load = busy_time / total_time). - * @event_handler: Callback for devfreq core framework to notify events - * to governors. Events include per device governor - * init and exit, opp changes out of devfreq, suspend - * and resume of per device devfreq during device idle. - * - * Note that the callbacks are called with devfreq->lock locked by devfreq. - */ -struct devfreq_governor { - struct list_head node; - - const char name[DEVFREQ_NAME_LEN]; - const u64 attrs; - const u64 flags; - int (*get_target_freq)(struct devfreq *this, unsigned long *freq); - int (*event_handler)(struct devfreq *devfreq, - unsigned int event, void *data); -}; - -void devfreq_monitor_start(struct devfreq *devfreq); -void devfreq_monitor_stop(struct devfreq *devfreq); -void devfreq_monitor_suspend(struct devfreq *devfreq); -void devfreq_monitor_resume(struct devfreq *devfreq); -void devfreq_update_interval(struct devfreq *devfreq, unsigned int *delay); - -int devfreq_add_governor(struct devfreq_governor *governor); -int devfreq_remove_governor(struct devfreq_governor *governor); - -int devm_devfreq_add_governor(struct device *dev, - struct devfreq_governor *governor); - -int devfreq_update_status(struct devfreq *devfreq, unsigned long freq); -int devfreq_update_target(struct devfreq *devfreq, unsigned long freq); -void devfreq_get_freq_range(struct devfreq *devfreq, unsigned long *min_freq, - unsigned long *max_freq); - -static inline int devfreq_update_stats(struct devfreq *df) -{ - if (!df->profile->get_dev_status) - return -EINVAL; - - return df->profile->get_dev_status(df->dev.parent, &df->last_status); -} -#endif /* _GOVERNOR_H */ diff --git a/drivers/devfreq/governor_passive.c b/drivers/devfreq/governor_passive.c index 953cf9a1e9f7..8cd6f9a59f64 100644 --- a/drivers/devfreq/governor_passive.c +++ b/drivers/devfreq/governor_passive.c @@ -14,8 +14,33 @@ #include #include #include +#include #include -#include "governor.h" + +/** + * struct devfreq_cpu_data - Hold the per-cpu data + * @node: list node + * @dev: reference to cpu device. + * @first_cpu: the cpumask of the first cpu of a policy. + * @opp_table: reference to cpu opp table. + * @cur_freq: the current frequency of the cpu. + * @min_freq: the min frequency of the cpu. + * @max_freq: the max frequency of the cpu. + * + * This structure stores the required cpu_data of a cpu. + * This is auto-populated by the governor. + */ +struct devfreq_cpu_data { + struct list_head node; + + struct device *dev; + unsigned int first_cpu; + + struct opp_table *opp_table; + unsigned int cur_freq; + unsigned int min_freq; + unsigned int max_freq; +}; static struct devfreq_cpu_data * get_parent_cpu_data(struct devfreq_passive_data *p_data, diff --git a/drivers/devfreq/governor_performance.c b/drivers/devfreq/governor_performance.c index 2e4e981446fa..fdb22bf512cf 100644 --- a/drivers/devfreq/governor_performance.c +++ b/drivers/devfreq/governor_performance.c @@ -7,8 +7,8 @@ */ #include +#include #include -#include "governor.h" static int devfreq_performance_func(struct devfreq *df, unsigned long *freq) diff --git a/drivers/devfreq/governor_powersave.c b/drivers/devfreq/governor_powersave.c index f059e8814804..ee2d6ec8a512 100644 --- a/drivers/devfreq/governor_powersave.c +++ b/drivers/devfreq/governor_powersave.c @@ -7,8 +7,8 @@ */ #include +#include #include -#include "governor.h" static int devfreq_powersave_func(struct devfreq *df, unsigned long *freq) diff --git a/drivers/devfreq/governor_simpleondemand.c b/drivers/devfreq/governor_simpleondemand.c index c23435736367..9c69b96df5f9 100644 --- a/drivers/devfreq/governor_simpleondemand.c +++ b/drivers/devfreq/governor_simpleondemand.c @@ -9,8 +9,8 @@ #include #include #include +#include #include -#include "governor.h" /* Default constants for DevFreq-Simple-Ondemand (DFSO) */ #define DFSO_UPTHRESHOLD (90) diff --git a/drivers/devfreq/governor_userspace.c b/drivers/devfreq/governor_userspace.c index 175de0c0b50e..395174f93960 100644 --- a/drivers/devfreq/governor_userspace.c +++ b/drivers/devfreq/governor_userspace.c @@ -9,11 +9,11 @@ #include #include #include +#include #include #include #include #include -#include "governor.h" struct userspace_data { unsigned long user_frequency; diff --git a/drivers/devfreq/hisi_uncore_freq.c b/drivers/devfreq/hisi_uncore_freq.c index 96d1815059e3..b8e4621c57eb 100644 --- a/drivers/devfreq/hisi_uncore_freq.c +++ b/drivers/devfreq/hisi_uncore_freq.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -26,8 +27,6 @@ #include #include -#include "governor.h" - struct hisi_uncore_pcc_data { u16 status; u16 resv; diff --git a/drivers/devfreq/tegra30-devfreq.c b/drivers/devfreq/tegra30-devfreq.c index 4a4f0106ab9d..77cbb204087c 100644 --- a/drivers/devfreq/tegra30-devfreq.c +++ b/drivers/devfreq/tegra30-devfreq.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -21,8 +22,6 @@ #include -#include "governor.h" - #define ACTMON_GLB_STATUS 0x0 #define ACTMON_GLB_PERIOD_CTRL 0x4 diff --git a/include/linux/devfreq-governor.h b/include/linux/devfreq-governor.h new file mode 100644 index 000000000000..dfdd0160a29f --- /dev/null +++ b/include/linux/devfreq-governor.h @@ -0,0 +1,102 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * governor.h - internal header for devfreq governors. + * + * Copyright (C) 2011 Samsung Electronics + * MyungJoo Ham + * + * This header is for devfreq governors + */ + +#ifndef __LINUX_DEVFREQ_DEVFREQ_H__ +#define __LINUX_DEVFREQ_DEVFREQ_H__ + +#include + +#define DEVFREQ_NAME_LEN 16 + +#define to_devfreq(DEV) container_of((DEV), struct devfreq, dev) + +/* Devfreq events */ +#define DEVFREQ_GOV_START 0x1 +#define DEVFREQ_GOV_STOP 0x2 +#define DEVFREQ_GOV_UPDATE_INTERVAL 0x3 +#define DEVFREQ_GOV_SUSPEND 0x4 +#define DEVFREQ_GOV_RESUME 0x5 + +#define DEVFREQ_MIN_FREQ 0 +#define DEVFREQ_MAX_FREQ ULONG_MAX + +/* + * Definition of the governor feature flags + * - DEVFREQ_GOV_FLAG_IMMUTABLE + * : This governor is never changeable to other governors. + * - DEVFREQ_GOV_FLAG_IRQ_DRIVEN + * : The devfreq won't schedule the work for this governor. + */ +#define DEVFREQ_GOV_FLAG_IMMUTABLE BIT(0) +#define DEVFREQ_GOV_FLAG_IRQ_DRIVEN BIT(1) + +/* + * Definition of governor attribute flags except for common sysfs attributes + * - DEVFREQ_GOV_ATTR_POLLING_INTERVAL + * : Indicate polling_interval sysfs attribute + * - DEVFREQ_GOV_ATTR_TIMER + * : Indicate timer sysfs attribute + */ +#define DEVFREQ_GOV_ATTR_POLLING_INTERVAL BIT(0) +#define DEVFREQ_GOV_ATTR_TIMER BIT(1) + +/** + * struct devfreq_governor - Devfreq policy governor + * @node: list node - contains registered devfreq governors + * @name: Governor's name + * @attrs: Governor's sysfs attribute flags + * @flags: Governor's feature flags + * @get_target_freq: Returns desired operating frequency for the device. + * Basically, get_target_freq will run + * devfreq_dev_profile.get_dev_status() to get the + * status of the device (load = busy_time / total_time). + * @event_handler: Callback for devfreq core framework to notify events + * to governors. Events include per device governor + * init and exit, opp changes out of devfreq, suspend + * and resume of per device devfreq during device idle. + * + * Note that the callbacks are called with devfreq->lock locked by devfreq. + */ +struct devfreq_governor { + struct list_head node; + + const char name[DEVFREQ_NAME_LEN]; + const u64 attrs; + const u64 flags; + int (*get_target_freq)(struct devfreq *this, unsigned long *freq); + int (*event_handler)(struct devfreq *devfreq, + unsigned int event, void *data); +}; + +void devfreq_monitor_start(struct devfreq *devfreq); +void devfreq_monitor_stop(struct devfreq *devfreq); +void devfreq_monitor_suspend(struct devfreq *devfreq); +void devfreq_monitor_resume(struct devfreq *devfreq); +void devfreq_update_interval(struct devfreq *devfreq, unsigned int *delay); + +int devfreq_add_governor(struct devfreq_governor *governor); +int devfreq_remove_governor(struct devfreq_governor *governor); + +int devm_devfreq_add_governor(struct device *dev, + struct devfreq_governor *governor); + +int devfreq_update_status(struct devfreq *devfreq, unsigned long freq); +int devfreq_update_target(struct devfreq *devfreq, unsigned long freq); +void devfreq_get_freq_range(struct devfreq *devfreq, unsigned long *min_freq, + unsigned long *max_freq); + +static inline int devfreq_update_stats(struct devfreq *df) +{ + if (!df->profile->get_dev_status) + return -EINVAL; + + return df->profile->get_dev_status(df->dev.parent, &df->last_status); +} +#endif /* __LINUX_DEVFREQ_DEVFREQ_H__ */ -- cgit v1.2.3 From a4e6512a79d8486dccf3e8b066e5d6bd5ff95446 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Tue, 25 Nov 2025 12:26:42 +0100 Subject: PM: QoS: Introduce a CPU system wakeup QoS limit Some platforms supports multiple low power states for CPUs that can be used when entering system-wide suspend. Currently we are always selecting the deepest possible state for the CPUs, which can break the system wakeup latency constraint that may be required for a use case. Let's take the first step towards addressing this problem, by introducing an interface for user space, that allows us to specify the CPU system wakeup QoS limit. Subsequent changes will start taking into account the new QoS limit. Reviewed-by: Dhruva Gole Reviewed-by: Kevin Hilman (TI) Tested-by: Kevin Hilman (TI) Signed-off-by: Ulf Hansson Link: https://patch.msgid.link/20251125112650.329269-2-ulf.hansson@linaro.org Signed-off-by: Rafael J. Wysocki --- include/linux/pm_qos.h | 9 +++++ kernel/power/Kconfig | 11 +++++ kernel/power/qos.c | 106 +++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 126 insertions(+) (limited to 'include') diff --git a/include/linux/pm_qos.h b/include/linux/pm_qos.h index 4a69d4af3ff8..6cea4455f867 100644 --- a/include/linux/pm_qos.h +++ b/include/linux/pm_qos.h @@ -162,6 +162,15 @@ static inline void cpu_latency_qos_update_request(struct pm_qos_request *req, static inline void cpu_latency_qos_remove_request(struct pm_qos_request *req) {} #endif +#ifdef CONFIG_PM_QOS_CPU_SYSTEM_WAKEUP +s32 cpu_wakeup_latency_qos_limit(void); +#else +static inline s32 cpu_wakeup_latency_qos_limit(void) +{ + return PM_QOS_RESUME_LATENCY_NO_CONSTRAINT; +} +#endif + #ifdef CONFIG_PM enum pm_qos_flags_status __dev_pm_qos_flags(struct device *dev, s32 mask); enum pm_qos_flags_status dev_pm_qos_flags(struct device *dev, s32 mask); diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 54a623680019..05337f437cca 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -202,6 +202,17 @@ config PM_WAKELOCKS_GC depends on PM_WAKELOCKS default y +config PM_QOS_CPU_SYSTEM_WAKEUP + bool "User space interface for CPU system wakeup QoS" + depends on CPU_IDLE + help + Enable this to allow user space via the cpu_wakeup_latency file to + specify a CPU system wakeup latency limit. + + This may be particularly useful for platforms supporting multiple low + power states for CPUs during system-wide suspend and s2idle in + particular. + config PM bool "Device power management core functionality" help diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 4244b069442e..f7d8064e9adc 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -415,6 +415,105 @@ static struct miscdevice cpu_latency_qos_miscdev = { .fops = &cpu_latency_qos_fops, }; +#ifdef CONFIG_PM_QOS_CPU_SYSTEM_WAKEUP +/* The CPU system wakeup latency QoS. */ +static struct pm_qos_constraints cpu_wakeup_latency_constraints = { + .list = PLIST_HEAD_INIT(cpu_wakeup_latency_constraints.list), + .target_value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT, + .default_value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT, + .no_constraint_value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT, + .type = PM_QOS_MIN, +}; + +/** + * cpu_wakeup_latency_qos_limit - Current CPU system wakeup latency QoS limit. + * + * Returns the current CPU system wakeup latency QoS limit that may have been + * requested by user space. + */ +s32 cpu_wakeup_latency_qos_limit(void) +{ + return pm_qos_read_value(&cpu_wakeup_latency_constraints); +} + +static int cpu_wakeup_latency_qos_open(struct inode *inode, struct file *filp) +{ + struct pm_qos_request *req; + + req = kzalloc(sizeof(*req), GFP_KERNEL); + if (!req) + return -ENOMEM; + + req->qos = &cpu_wakeup_latency_constraints; + pm_qos_update_target(req->qos, &req->node, PM_QOS_ADD_REQ, + PM_QOS_RESUME_LATENCY_NO_CONSTRAINT); + filp->private_data = req; + + return 0; +} + +static int cpu_wakeup_latency_qos_release(struct inode *inode, + struct file *filp) +{ + struct pm_qos_request *req = filp->private_data; + + filp->private_data = NULL; + pm_qos_update_target(req->qos, &req->node, PM_QOS_REMOVE_REQ, + PM_QOS_RESUME_LATENCY_NO_CONSTRAINT); + kfree(req); + + return 0; +} + +static ssize_t cpu_wakeup_latency_qos_read(struct file *filp, char __user *buf, + size_t count, loff_t *f_pos) +{ + s32 value = pm_qos_read_value(&cpu_wakeup_latency_constraints); + + return simple_read_from_buffer(buf, count, f_pos, &value, sizeof(s32)); +} + +static ssize_t cpu_wakeup_latency_qos_write(struct file *filp, + const char __user *buf, + size_t count, loff_t *f_pos) +{ + struct pm_qos_request *req = filp->private_data; + s32 value; + + if (count == sizeof(s32)) { + if (copy_from_user(&value, buf, sizeof(s32))) + return -EFAULT; + } else { + int ret; + + ret = kstrtos32_from_user(buf, count, 16, &value); + if (ret) + return ret; + } + + if (value < 0) + return -EINVAL; + + pm_qos_update_target(req->qos, &req->node, PM_QOS_UPDATE_REQ, value); + + return count; +} + +static const struct file_operations cpu_wakeup_latency_qos_fops = { + .open = cpu_wakeup_latency_qos_open, + .release = cpu_wakeup_latency_qos_release, + .read = cpu_wakeup_latency_qos_read, + .write = cpu_wakeup_latency_qos_write, + .llseek = noop_llseek, +}; + +static struct miscdevice cpu_wakeup_latency_qos_miscdev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "cpu_wakeup_latency", + .fops = &cpu_wakeup_latency_qos_fops, +}; +#endif /* CONFIG_PM_QOS_CPU_SYSTEM_WAKEUP */ + static int __init cpu_latency_qos_init(void) { int ret; @@ -424,6 +523,13 @@ static int __init cpu_latency_qos_init(void) pr_err("%s: %s setup failed\n", __func__, cpu_latency_qos_miscdev.name); +#ifdef CONFIG_PM_QOS_CPU_SYSTEM_WAKEUP + ret = misc_register(&cpu_wakeup_latency_qos_miscdev); + if (ret < 0) + pr_err("%s: %s setup failed\n", __func__, + cpu_wakeup_latency_qos_miscdev.name); +#endif + return ret; } late_initcall(cpu_latency_qos_init); -- cgit v1.2.3 From 8e7de6dc420979f4e4443807b71dcc8b72d8c4a9 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Tue, 25 Nov 2025 12:26:43 +0100 Subject: pmdomain: Respect the CPU system wakeup QoS limit for s2idle A CPU system wakeup QoS limit may have been requested by user space. To avoid breaking this constraint when entering a low power state during s2idle through genpd, let's extend the corresponding genpd governor for CPUs. More precisely, during s2idle let the genpd governor select a suitable domain idle state, by taking into account the QoS limit. Reviewed-by: Dhruva Gole Reviewed-by: Kevin Hilman (TI) Tested-by: Kevin Hilman (TI) Signed-off-by: Ulf Hansson Link: https://patch.msgid.link/20251125112650.329269-3-ulf.hansson@linaro.org Signed-off-by: Rafael J. Wysocki --- drivers/pmdomain/core.c | 10 ++++++++-- drivers/pmdomain/governor.c | 27 +++++++++++++++++++++++++++ include/linux/pm_domain.h | 1 + 3 files changed, 36 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/pmdomain/core.c b/drivers/pmdomain/core.c index 61c2277c9ce3..4fd546ef0448 100644 --- a/drivers/pmdomain/core.c +++ b/drivers/pmdomain/core.c @@ -1425,8 +1425,14 @@ static void genpd_sync_power_off(struct generic_pm_domain *genpd, bool use_lock, return; } - /* Choose the deepest state when suspending */ - genpd->state_idx = genpd->state_count - 1; + if (genpd->gov && genpd->gov->system_power_down_ok) { + if (!genpd->gov->system_power_down_ok(&genpd->domain)) + return; + } else { + /* Default to the deepest state. */ + genpd->state_idx = genpd->state_count - 1; + } + if (_genpd_power_off(genpd, false)) { genpd->states[genpd->state_idx].rejected++; return; diff --git a/drivers/pmdomain/governor.c b/drivers/pmdomain/governor.c index 39359811a930..bd1b9d66d4a5 100644 --- a/drivers/pmdomain/governor.c +++ b/drivers/pmdomain/governor.c @@ -415,9 +415,36 @@ static bool cpu_power_down_ok(struct dev_pm_domain *pd) return false; } +static bool cpu_system_power_down_ok(struct dev_pm_domain *pd) +{ + s64 constraint_ns = cpu_wakeup_latency_qos_limit() * NSEC_PER_USEC; + struct generic_pm_domain *genpd = pd_to_genpd(pd); + int state_idx = genpd->state_count - 1; + + if (!(genpd->flags & GENPD_FLAG_CPU_DOMAIN)) { + genpd->state_idx = state_idx; + return true; + } + + /* Find the deepest state for the latency constraint. */ + while (state_idx >= 0) { + s64 latency_ns = genpd->states[state_idx].power_off_latency_ns + + genpd->states[state_idx].power_on_latency_ns; + + if (latency_ns <= constraint_ns) { + genpd->state_idx = state_idx; + return true; + } + state_idx--; + } + + return false; +} + struct dev_power_governor pm_domain_cpu_gov = { .suspend_ok = default_suspend_ok, .power_down_ok = cpu_power_down_ok, + .system_power_down_ok = cpu_system_power_down_ok, }; #endif diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index f67a2cb7d781..93ba0143ca47 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -153,6 +153,7 @@ enum genpd_sync_state { }; struct dev_power_governor { + bool (*system_power_down_ok)(struct dev_pm_domain *domain); bool (*power_down_ok)(struct dev_pm_domain *domain); bool (*suspend_ok)(struct device *dev); }; -- cgit v1.2.3 From 99b42445f4a4aaff75eca24dfc9e6e376292dd48 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Tue, 25 Nov 2025 12:26:45 +0100 Subject: sched: idle: Respect the CPU system wakeup QoS limit for s2idle A CPU system wakeup QoS limit may have been requested by user space. To avoid breaking this constraint when entering a low power state during s2idle, let's start to take into account the QoS limit. Acked-by: Peter Zijlstra (Intel) Reviewed-by: Dhruva Gole Reviewed-by: Kevin Hilman (TI) Tested-by: Kevin Hilman (TI) Signed-off-by: Ulf Hansson Link: https://patch.msgid.link/20251125112650.329269-5-ulf.hansson@linaro.org Signed-off-by: Rafael J. Wysocki --- drivers/cpuidle/cpuidle.c | 12 +++++++----- include/linux/cpuidle.h | 6 ++++-- kernel/sched/idle.c | 12 +++++++----- 3 files changed, 18 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index 56132e843c99..c7876e9e024f 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -184,20 +184,22 @@ static noinstr void enter_s2idle_proper(struct cpuidle_driver *drv, * cpuidle_enter_s2idle - Enter an idle state suitable for suspend-to-idle. * @drv: cpuidle driver for the given CPU. * @dev: cpuidle device for the given CPU. + * @latency_limit_ns: Idle state exit latency limit * * If there are states with the ->enter_s2idle callback, find the deepest of * them and enter it with frozen tick. */ -int cpuidle_enter_s2idle(struct cpuidle_driver *drv, struct cpuidle_device *dev) +int cpuidle_enter_s2idle(struct cpuidle_driver *drv, struct cpuidle_device *dev, + u64 latency_limit_ns) { int index; /* - * Find the deepest state with ->enter_s2idle present, which guarantees - * that interrupts won't be enabled when it exits and allows the tick to - * be frozen safely. + * Find the deepest state with ->enter_s2idle present that meets the + * specified latency limit, which guarantees that interrupts won't be + * enabled when it exits and allows the tick to be frozen safely. */ - index = find_deepest_state(drv, dev, U64_MAX, 0, true); + index = find_deepest_state(drv, dev, latency_limit_ns, 0, true); if (index > 0) { enter_s2idle_proper(drv, dev, index); local_irq_enable(); diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index a9ee4fe55dcf..4073690504a7 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -248,7 +248,8 @@ extern int cpuidle_find_deepest_state(struct cpuidle_driver *drv, struct cpuidle_device *dev, u64 latency_limit_ns); extern int cpuidle_enter_s2idle(struct cpuidle_driver *drv, - struct cpuidle_device *dev); + struct cpuidle_device *dev, + u64 latency_limit_ns); extern void cpuidle_use_deepest_state(u64 latency_limit_ns); #else static inline int cpuidle_find_deepest_state(struct cpuidle_driver *drv, @@ -256,7 +257,8 @@ static inline int cpuidle_find_deepest_state(struct cpuidle_driver *drv, u64 latency_limit_ns) {return -ENODEV; } static inline int cpuidle_enter_s2idle(struct cpuidle_driver *drv, - struct cpuidle_device *dev) + struct cpuidle_device *dev, + u64 latency_limit_ns) {return -ENODEV; } static inline void cpuidle_use_deepest_state(u64 latency_limit_ns) { diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index c39b089d4f09..c1c3d0166610 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -131,12 +131,13 @@ void __cpuidle default_idle_call(void) } static int call_cpuidle_s2idle(struct cpuidle_driver *drv, - struct cpuidle_device *dev) + struct cpuidle_device *dev, + u64 max_latency_ns) { if (current_clr_polling_and_test()) return -EBUSY; - return cpuidle_enter_s2idle(drv, dev); + return cpuidle_enter_s2idle(drv, dev, max_latency_ns); } static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev, @@ -205,12 +206,13 @@ static void cpuidle_idle_call(void) u64 max_latency_ns; if (idle_should_enter_s2idle()) { + max_latency_ns = cpu_wakeup_latency_qos_limit() * + NSEC_PER_USEC; - entered_state = call_cpuidle_s2idle(drv, dev); + entered_state = call_cpuidle_s2idle(drv, dev, + max_latency_ns); if (entered_state > 0) goto exit_idle; - - max_latency_ns = U64_MAX; } else { max_latency_ns = dev->forced_idle_latency_limit_ns; } -- cgit v1.2.3