From f996fc9671d088bd5f52a70f18c64bfe3d0e418f Mon Sep 17 00:00:00 2001 From: Bojan Smojver Date: Thu, 9 Sep 2010 23:06:23 +0200 Subject: PM / Hibernate: Compress hibernation image with LZO Compress hibernation image with LZO in order to save on I/O and therefore time to hibernate/thaw. [rjw: Added hibernate=nocompress command line option instead of just nocompress which would be confusing, fixed a couple of compiler warnings, fixed kerneldoc comments, minor cleanups.] Signed-off-by: Bojan Smojver Signed-off-by: Rafael J. Wysocki --- kernel/power/Kconfig | 2 + kernel/power/hibernate.c | 13 +++ kernel/power/power.h | 1 + kernel/power/swap.c | 290 +++++++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 299 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index ca6066a6952e..cb57eb99215f 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -137,6 +137,8 @@ config SUSPEND_FREEZER config HIBERNATION bool "Hibernation (aka 'suspend to disk')" depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE + select LZO_COMPRESS + select LZO_DECOMPRESS select SUSPEND_NVS if HAS_IOMEM ---help--- Enable the suspend to disk (STD) functionality, which is usually diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 8dc31e02ae12..6c9c9dc48c75 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -29,6 +29,7 @@ #include "power.h" +static int nocompress = 0; static int noresume = 0; static char resume_file[256] = CONFIG_PM_STD_PARTITION; dev_t swsusp_resume_device; @@ -638,6 +639,8 @@ int hibernate(void) if (hibernation_mode == HIBERNATION_PLATFORM) flags |= SF_PLATFORM_MODE; + if (nocompress) + flags |= SF_NOCOMPRESS_MODE; pr_debug("PM: writing image.\n"); error = swsusp_write(flags); swsusp_free(); @@ -1004,6 +1007,15 @@ static int __init resume_offset_setup(char *str) return 1; } +static int __init hibernate_setup(char *str) +{ + if (!strncmp(str, "noresume", 8)) + noresume = 1; + else if (!strncmp(str, "nocompress", 10)) + nocompress = 1; + return 1; +} + static int __init noresume_setup(char *str) { noresume = 1; @@ -1013,3 +1025,4 @@ static int __init noresume_setup(char *str) __setup("noresume", noresume_setup); __setup("resume_offset=", resume_offset_setup); __setup("resume=", resume_setup); +__setup("hibernate=", hibernate_setup); diff --git a/kernel/power/power.h b/kernel/power/power.h index 006270fe382d..c7e42e47eb0b 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -134,6 +134,7 @@ extern int swsusp_swap_in_use(void); * the image header. */ #define SF_PLATFORM_MODE 1 +#define SF_NOCOMPRESS_MODE 2 /* kernel/power/hibernate.c */ extern int swsusp_check(void); diff --git a/kernel/power/swap.c b/kernel/power/swap.c index e6a5bdf61a37..3dc0552cddfb 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -24,6 +24,8 @@ #include #include #include +#include +#include #include "power.h" @@ -357,6 +359,18 @@ static int swap_writer_finish(struct swap_map_handle *handle, return error; } +/* We need to remember how much compressed data we need to read. */ +#define LZO_HEADER sizeof(size_t) + +/* Number of pages/bytes we'll compress at one time. */ +#define LZO_UNC_PAGES 32 +#define LZO_UNC_SIZE (LZO_UNC_PAGES * PAGE_SIZE) + +/* Number of pages/bytes we need for compressed data (worst case). */ +#define LZO_CMP_PAGES DIV_ROUND_UP(lzo1x_worst_compress(LZO_UNC_SIZE) + \ + LZO_HEADER, PAGE_SIZE) +#define LZO_CMP_SIZE (LZO_CMP_PAGES * PAGE_SIZE) + /** * save_image - save the suspend image data */ @@ -404,6 +418,137 @@ static int save_image(struct swap_map_handle *handle, return ret; } + +/** + * save_image_lzo - Save the suspend image data compressed with LZO. + * @handle: Swap mam handle to use for saving the image. + * @snapshot: Image to read data from. + * @nr_to_write: Number of pages to save. + */ +static int save_image_lzo(struct swap_map_handle *handle, + struct snapshot_handle *snapshot, + unsigned int nr_to_write) +{ + unsigned int m; + int ret = 0; + int nr_pages; + int err2; + struct bio *bio; + struct timeval start; + struct timeval stop; + size_t off, unc_len, cmp_len; + unsigned char *unc, *cmp, *wrk, *page; + + page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); + if (!page) { + printk(KERN_ERR "PM: Failed to allocate LZO page\n"); + return -ENOMEM; + } + + wrk = vmalloc(LZO1X_1_MEM_COMPRESS); + if (!wrk) { + printk(KERN_ERR "PM: Failed to allocate LZO workspace\n"); + free_page((unsigned long)page); + return -ENOMEM; + } + + unc = vmalloc(LZO_UNC_SIZE); + if (!unc) { + printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n"); + vfree(wrk); + free_page((unsigned long)page); + return -ENOMEM; + } + + cmp = vmalloc(LZO_CMP_SIZE); + if (!cmp) { + printk(KERN_ERR "PM: Failed to allocate LZO compressed\n"); + vfree(unc); + vfree(wrk); + free_page((unsigned long)page); + return -ENOMEM; + } + + printk(KERN_INFO + "PM: Compressing and saving image data (%u pages) ... ", + nr_to_write); + m = nr_to_write / 100; + if (!m) + m = 1; + nr_pages = 0; + bio = NULL; + do_gettimeofday(&start); + for (;;) { + for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) { + ret = snapshot_read_next(snapshot); + if (ret < 0) + goto out_finish; + + if (!ret) + break; + + memcpy(unc + off, data_of(*snapshot), PAGE_SIZE); + + if (!(nr_pages % m)) + printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m); + nr_pages++; + } + + if (!off) + break; + + unc_len = off; + ret = lzo1x_1_compress(unc, unc_len, + cmp + LZO_HEADER, &cmp_len, wrk); + if (ret < 0) { + printk(KERN_ERR "PM: LZO compression failed\n"); + break; + } + + if (unlikely(!cmp_len || + cmp_len > lzo1x_worst_compress(unc_len))) { + printk(KERN_ERR "PM: Invalid LZO compressed length\n"); + ret = -1; + break; + } + + *(size_t *)cmp = cmp_len; + + /* + * Given we are writing one page at a time to disk, we copy + * that much from the buffer, although the last bit will likely + * be smaller than full page. This is OK - we saved the length + * of the compressed data, so any garbage at the end will be + * discarded when we read it. + */ + for (off = 0; off < LZO_HEADER + cmp_len; off += PAGE_SIZE) { + memcpy(page, cmp + off, PAGE_SIZE); + + ret = swap_write_page(handle, page, &bio); + if (ret) + goto out_finish; + } + } + +out_finish: + err2 = hib_wait_on_bio_chain(&bio); + do_gettimeofday(&stop); + if (!ret) + ret = err2; + if (!ret) + printk(KERN_CONT "\b\b\b\bdone\n"); + else + printk(KERN_CONT "\n"); + swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); + + vfree(cmp); + vfree(unc); + vfree(wrk); + free_page((unsigned long)page); + + return ret; +} + /** * enough_swap - Make sure we have enough swap to save the image. * @@ -411,12 +556,16 @@ static int save_image(struct swap_map_handle *handle, * space avaiable from the resume partition. */ -static int enough_swap(unsigned int nr_pages) +static int enough_swap(unsigned int nr_pages, unsigned int flags) { unsigned int free_swap = count_swap_pages(root_swap, 1); + unsigned int required; pr_debug("PM: Free swap pages: %u\n", free_swap); - return free_swap > nr_pages + PAGES_FOR_IO; + + required = PAGES_FOR_IO + ((flags & SF_NOCOMPRESS_MODE) ? + nr_pages : (nr_pages * LZO_CMP_PAGES) / LZO_UNC_PAGES + 1); + return free_swap > required; } /** @@ -443,7 +592,7 @@ int swsusp_write(unsigned int flags) printk(KERN_ERR "PM: Cannot get swap writer\n"); return error; } - if (!enough_swap(pages)) { + if (!enough_swap(pages, flags)) { printk(KERN_ERR "PM: Not enough free swap\n"); error = -ENOSPC; goto out_finish; @@ -458,8 +607,11 @@ int swsusp_write(unsigned int flags) } header = (struct swsusp_info *)data_of(snapshot); error = swap_write_page(&handle, header, NULL); - if (!error) - error = save_image(&handle, &snapshot, pages - 1); + if (!error) { + error = (flags & SF_NOCOMPRESS_MODE) ? + save_image(&handle, &snapshot, pages - 1) : + save_image_lzo(&handle, &snapshot, pages - 1); + } out_finish: error = swap_writer_finish(&handle, flags, error); return error; @@ -589,6 +741,127 @@ static int load_image(struct swap_map_handle *handle, return error; } +/** + * load_image_lzo - Load compressed image data and decompress them with LZO. + * @handle: Swap map handle to use for loading data. + * @snapshot: Image to copy uncompressed data into. + * @nr_to_read: Number of pages to load. + */ +static int load_image_lzo(struct swap_map_handle *handle, + struct snapshot_handle *snapshot, + unsigned int nr_to_read) +{ + unsigned int m; + int error = 0; + struct timeval start; + struct timeval stop; + unsigned nr_pages; + size_t off, unc_len, cmp_len; + unsigned char *unc, *cmp, *page; + + page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); + if (!page) { + printk(KERN_ERR "PM: Failed to allocate LZO page\n"); + return -ENOMEM; + } + + unc = vmalloc(LZO_UNC_SIZE); + if (!unc) { + printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n"); + free_page((unsigned long)page); + return -ENOMEM; + } + + cmp = vmalloc(LZO_CMP_SIZE); + if (!cmp) { + printk(KERN_ERR "PM: Failed to allocate LZO compressed\n"); + vfree(unc); + free_page((unsigned long)page); + return -ENOMEM; + } + + printk(KERN_INFO + "PM: Loading and decompressing image data (%u pages) ... ", + nr_to_read); + m = nr_to_read / 100; + if (!m) + m = 1; + nr_pages = 0; + do_gettimeofday(&start); + + error = snapshot_write_next(snapshot); + if (error <= 0) + goto out_finish; + + for (;;) { + error = swap_read_page(handle, page, NULL); /* sync */ + if (error) + break; + + cmp_len = *(size_t *)page; + if (unlikely(!cmp_len || + cmp_len > lzo1x_worst_compress(LZO_UNC_SIZE))) { + printk(KERN_ERR "PM: Invalid LZO compressed length\n"); + error = -1; + break; + } + + memcpy(cmp, page, PAGE_SIZE); + for (off = PAGE_SIZE; off < LZO_HEADER + cmp_len; off += PAGE_SIZE) { + error = swap_read_page(handle, page, NULL); /* sync */ + if (error) + goto out_finish; + + memcpy(cmp + off, page, PAGE_SIZE); + } + + unc_len = LZO_UNC_SIZE; + error = lzo1x_decompress_safe(cmp + LZO_HEADER, cmp_len, + unc, &unc_len); + if (error < 0) { + printk(KERN_ERR "PM: LZO decompression failed\n"); + break; + } + + if (unlikely(!unc_len || + unc_len > LZO_UNC_SIZE || + unc_len & (PAGE_SIZE - 1))) { + printk(KERN_ERR "PM: Invalid LZO uncompressed length\n"); + error = -1; + break; + } + + for (off = 0; off < unc_len; off += PAGE_SIZE) { + memcpy(data_of(*snapshot), unc + off, PAGE_SIZE); + + if (!(nr_pages % m)) + printk("\b\b\b\b%3d%%", nr_pages / m); + nr_pages++; + + error = snapshot_write_next(snapshot); + if (error <= 0) + goto out_finish; + } + } + +out_finish: + do_gettimeofday(&stop); + if (!error) { + printk("\b\b\b\bdone\n"); + snapshot_write_finalize(snapshot); + if (!snapshot_image_loaded(snapshot)) + error = -ENODATA; + } else + printk("\n"); + swsusp_show_speed(&start, &stop, nr_to_read, "Read"); + + vfree(cmp); + vfree(unc); + free_page((unsigned long)page); + + return error; +} + /** * swsusp_read - read the hibernation image. * @flags_p: flags passed by the "frozen" kernel in the image header should @@ -612,8 +885,11 @@ int swsusp_read(unsigned int *flags_p) goto end; if (!error) error = swap_read_page(&handle, header, NULL); - if (!error) - error = load_image(&handle, &snapshot, header->pages - 1); + if (!error) { + error = (*flags_p & SF_NOCOMPRESS_MODE) ? + load_image(&handle, &snapshot, header->pages - 1) : + load_image_lzo(&handle, &snapshot, header->pages - 1); + } swap_reader_finish(&handle); end: if (!error) -- cgit v1.2.3 From ede890c2c069d611ece0e184103a6b9236ce416a Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 12 Sep 2010 21:40:01 +0200 Subject: PM: Fix unmet dependency warning from kconfig Fix the following build warning: warning: (PM_SLEEP_SMP && SMP && (ARCH_SUSPEND_POSSIBLE || \ ARCH_HIBERNATION_POSSIBLE) && PM_SLEEP) selects HOTPLUG_CPU which \ has unmet direct dependencies (SMP && HOTPLUG) by selecting HOTPLUG along with CPU_HOTPLUG. Signed-off-by: Rafael J. Wysocki Acked-by: Randy Dunlap --- kernel/power/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index cb57eb99215f..e45894c696ee 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -86,6 +86,7 @@ config PM_SLEEP_SMP depends on SMP depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE depends on PM_SLEEP + select HOTPLUG select HOTPLUG_CPU default y -- cgit v1.2.3 From bcb5ba8b4e8a5ae14b27351bdf499dd4c3bcc944 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 20 Sep 2010 19:44:17 +0200 Subject: PM / Runtime: Use alloc_workqueue() for creating the PM workqueue Although we need the PM workqueue to be freezable, we don't need it to be singlethread. Also, the number of concurrent work items running on a single CPU need not be constrained. For these reasons use alloc_workqueue() directly, with suitable arguments, instead of create_freezeable_workqueue(), to create the runtime PM workqueue. Signed-off-by: Rafael J. Wysocki Acked-by: Tejun Heo --- kernel/power/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index 62b0bc6e4983..0a28d4db3597 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -308,7 +308,7 @@ EXPORT_SYMBOL_GPL(pm_wq); static int __init pm_start_workqueue(void) { - pm_wq = create_freezeable_workqueue("pm"); + pm_wq = alloc_workqueue("pm", WQ_FREEZEABLE, 0); return pm_wq ? 0 : -ENOMEM; } -- cgit v1.2.3 From 266f1a25eff5ff98c498d7754a419aacfd88f71c Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 20 Sep 2010 19:44:38 +0200 Subject: PM / Hibernate: Improve comments in hibernate_preallocate_memory() One comment in hibernate_preallocate_memory() is wrong, so fix it and add one more comment to clarify the meaning of the fixed one. Signed-off-by: Rafael J. Wysocki --- kernel/power/snapshot.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index d3f795f01bbc..d9191b40cf6e 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1318,12 +1318,14 @@ int hibernate_preallocate_memory(void) /* Compute the maximum number of saveable pages to leave in memory. */ max_size = (count - (size + PAGES_FOR_IO)) / 2 - 2 * SPARE_PAGES; + /* Compute the desired number of image pages specified by image_size. */ size = DIV_ROUND_UP(image_size, PAGE_SIZE); if (size > max_size) size = max_size; /* - * If the maximum is not less than the current number of saveable pages - * in memory, allocate page frames for the image and we're done. + * If the desired number of image pages is at least as large as the + * current number of saveable pages in memory, allocate page frames for + * the image and we're done. */ if (size >= saveable) { pages = preallocate_image_highmem(save_highmem); -- cgit v1.2.3 From ac5c24ec1e983313ef0015258fba6f630e54e7cf Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 20 Sep 2010 19:44:56 +0200 Subject: PM / Hibernate: Make default image size depend on total RAM size The default hibernation image size is currently hard coded and euqal to 500 MB, which is not a reasonable default on many contemporary systems. Make it equal 2/5 of the total RAM size (this is slightly below the maximum, i.e. 1/2 of the total RAM size, and seems to be generally suitable). Signed-off-by: Rafael J. Wysocki Tested-by: M. Vefa Bicakci --- Documentation/power/interface.txt | 2 +- kernel/power/main.c | 1 + kernel/power/power.h | 9 ++++++++- kernel/power/snapshot.c | 7 ++++++- 4 files changed, 16 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/Documentation/power/interface.txt b/Documentation/power/interface.txt index e67211fe0ee2..c537834af005 100644 --- a/Documentation/power/interface.txt +++ b/Documentation/power/interface.txt @@ -57,7 +57,7 @@ smallest image possible. In particular, if "0" is written to this file, the suspend image will be as small as possible. Reading from this file will display the current image size limit, which -is set to 500 MB by default. +is set to 2/5 of available RAM by default. /sys/power/pm_trace controls the code which saves the last PM event point in the RTC across reboots, so that you can debug a machine that just hangs diff --git a/kernel/power/main.c b/kernel/power/main.c index 0a28d4db3597..f06ad6eff37a 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -321,6 +321,7 @@ static int __init pm_init(void) int error = pm_start_workqueue(); if (error) return error; + hibernate_image_size_init(); power_kobj = kobject_create_and_add("power", NULL); if (!power_kobj) return -ENOMEM; diff --git a/kernel/power/power.h b/kernel/power/power.h index c7e42e47eb0b..03634be55f62 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -14,6 +14,9 @@ struct swsusp_info { } __attribute__((aligned(PAGE_SIZE))); #ifdef CONFIG_HIBERNATION +/* kernel/power/snapshot.c */ +extern void __init hibernate_image_size_init(void); + #ifdef CONFIG_ARCH_HIBERNATION_HEADER /* Maximum size of architecture specific data in a hibernation header */ #define MAX_ARCH_HEADER_SIZE (sizeof(struct new_utsname) + 4) @@ -49,7 +52,11 @@ static inline char *check_image_kernel(struct swsusp_info *info) extern int hibernation_snapshot(int platform_mode); extern int hibernation_restore(int platform_mode); extern int hibernation_platform_enter(void); -#endif + +#else /* !CONFIG_HIBERNATION */ + +static inline void hibernate_image_size_init(void) {} +#endif /* !CONFIG_HIBERNATION */ extern int pfn_is_nosave(unsigned long); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index d9191b40cf6e..ac7eb109f196 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -46,7 +46,12 @@ static void swsusp_unset_page_forbidden(struct page *); * size will not exceed N bytes, but if that is impossible, it will * try to create the smallest image possible. */ -unsigned long image_size = 500 * 1024 * 1024; +unsigned long image_size; + +void __init hibernate_image_size_init(void) +{ + image_size = ((totalram_pages * 2) / 5) * PAGE_SIZE; +} /* List of PBEs needed for restoring the pages that were allocated before * the suspend and included in the suspend image, but have also been -- cgit v1.2.3 From 074037ec79bea73edf1b1ec72fef1010e83e3cc5 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 22 Sep 2010 22:09:10 +0200 Subject: PM / Wakeup: Introduce wakeup source objects and event statistics (v3) Introduce struct wakeup_source for representing system wakeup sources within the kernel and for collecting statistics related to them. Make the recently introduced helper functions pm_wakeup_event(), pm_stay_awake() and pm_relax() use struct wakeup_source objects internally, so that wakeup statistics associated with wakeup devices can be collected and reported in a consistent way (the definition of pm_relax() is changed, which is harmless, because this function is not called directly by anyone yet). Introduce new wakeup-related sysfs device attributes in /sys/devices/.../power for reporting the device wakeup statistics. Change the global wakeup events counters event_count and events_in_progress into atomic variables, so that it is not necessary to acquire a global spinlock in pm_wakeup_event(), pm_stay_awake() and pm_relax(), which should allow us to avoid lock contention in these functions on SMP systems with many wakeup devices. Signed-off-by: Rafael J. Wysocki Acked-by: Greg Kroah-Hartman --- Documentation/ABI/testing/sysfs-devices-power | 70 ++++ drivers/base/power/main.c | 4 +- drivers/base/power/power.h | 1 + drivers/base/power/runtime.c | 2 - drivers/base/power/sysfs.c | 121 +++++- drivers/base/power/wakeup.c | 528 ++++++++++++++++++++++---- include/linux/pm.h | 16 +- include/linux/pm_wakeup.h | 127 +++++-- include/linux/suspend.h | 4 +- kernel/power/main.c | 8 +- 10 files changed, 756 insertions(+), 125 deletions(-) (limited to 'kernel') diff --git a/Documentation/ABI/testing/sysfs-devices-power b/Documentation/ABI/testing/sysfs-devices-power index 6123c523bfd7..6bb2dd3c3a71 100644 --- a/Documentation/ABI/testing/sysfs-devices-power +++ b/Documentation/ABI/testing/sysfs-devices-power @@ -77,3 +77,73 @@ Description: devices this attribute is set to "enabled" by bus type code or device drivers and in that cases it should be safe to leave the default value. + +What: /sys/devices/.../power/wakeup_count +Date: September 2010 +Contact: Rafael J. Wysocki +Description: + The /sys/devices/.../wakeup_count attribute contains the number + of signaled wakeup events associated with the device. This + attribute is read-only. If the device is not enabled to wake up + the system from sleep states, this attribute is empty. + +What: /sys/devices/.../power/wakeup_active_count +Date: September 2010 +Contact: Rafael J. Wysocki +Description: + The /sys/devices/.../wakeup_active_count attribute contains the + number of times the processing of wakeup events associated with + the device was completed (at the kernel level). This attribute + is read-only. If the device is not enabled to wake up the + system from sleep states, this attribute is empty. + +What: /sys/devices/.../power/wakeup_hit_count +Date: September 2010 +Contact: Rafael J. Wysocki +Description: + The /sys/devices/.../wakeup_hit_count attribute contains the + number of times the processing of a wakeup event associated with + the device might prevent the system from entering a sleep state. + This attribute is read-only. If the device is not enabled to + wake up the system from sleep states, this attribute is empty. + +What: /sys/devices/.../power/wakeup_active +Date: September 2010 +Contact: Rafael J. Wysocki +Description: + The /sys/devices/.../wakeup_active attribute contains either 1, + or 0, depending on whether or not a wakeup event associated with + the device is being processed (1). This attribute is read-only. + If the device is not enabled to wake up the system from sleep + states, this attribute is empty. + +What: /sys/devices/.../power/wakeup_total_time_ms +Date: September 2010 +Contact: Rafael J. Wysocki +Description: + The /sys/devices/.../wakeup_total_time_ms attribute contains + the total time of processing wakeup events associated with the + device, in milliseconds. This attribute is read-only. If the + device is not enabled to wake up the system from sleep states, + this attribute is empty. + +What: /sys/devices/.../power/wakeup_max_time_ms +Date: September 2010 +Contact: Rafael J. Wysocki +Description: + The /sys/devices/.../wakeup_max_time_ms attribute contains + the maximum time of processing a single wakeup event associated + with the device, in milliseconds. This attribute is read-only. + If the device is not enabled to wake up the system from sleep + states, this attribute is empty. + +What: /sys/devices/.../power/wakeup_last_time_ms +Date: September 2010 +Contact: Rafael J. Wysocki +Description: + The /sys/devices/.../wakeup_last_time_ms attribute contains + the value of the monotonic clock corresponding to the time of + signaling the last wakeup event associated with the device, in + milliseconds. This attribute is read-only. If the device is + not enabled to wake up the system from sleep states, this + attribute is empty. diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index db677199403c..7ae6fe414c38 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -60,7 +60,8 @@ void device_pm_init(struct device *dev) dev->power.status = DPM_ON; init_completion(&dev->power.completion); complete_all(&dev->power.completion); - dev->power.wakeup_count = 0; + dev->power.wakeup = NULL; + spin_lock_init(&dev->power.lock); pm_runtime_init(dev); } @@ -120,6 +121,7 @@ void device_pm_remove(struct device *dev) mutex_lock(&dpm_list_mtx); list_del_init(&dev->power.entry); mutex_unlock(&dpm_list_mtx); + device_wakeup_disable(dev); pm_runtime_remove(dev); } diff --git a/drivers/base/power/power.h b/drivers/base/power/power.h index c0bd03c83b9c..8b2745c69e2a 100644 --- a/drivers/base/power/power.h +++ b/drivers/base/power/power.h @@ -34,6 +34,7 @@ extern void device_pm_move_last(struct device *); static inline void device_pm_init(struct device *dev) { + spin_lock_init(&dev->power.lock); pm_runtime_init(dev); } diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c index b78c401ffa73..e9520ad2d716 100644 --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c @@ -1099,8 +1099,6 @@ EXPORT_SYMBOL_GPL(pm_runtime_allow); */ void pm_runtime_init(struct device *dev) { - spin_lock_init(&dev->power.lock); - dev->power.runtime_status = RPM_SUSPENDED; dev->power.idle_notification = false; diff --git a/drivers/base/power/sysfs.c b/drivers/base/power/sysfs.c index e56b4388fe61..8859780817e1 100644 --- a/drivers/base/power/sysfs.c +++ b/drivers/base/power/sysfs.c @@ -210,11 +210,122 @@ static DEVICE_ATTR(wakeup, 0644, wake_show, wake_store); static ssize_t wakeup_count_show(struct device *dev, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%lu\n", dev->power.wakeup_count); + unsigned long count = 0; + bool enabled = false; + + spin_lock_irq(&dev->power.lock); + if (dev->power.wakeup) { + count = dev->power.wakeup->event_count; + enabled = true; + } + spin_unlock_irq(&dev->power.lock); + return enabled ? sprintf(buf, "%lu\n", count) : sprintf(buf, "\n"); } static DEVICE_ATTR(wakeup_count, 0444, wakeup_count_show, NULL); -#endif + +static ssize_t wakeup_active_count_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + unsigned long count = 0; + bool enabled = false; + + spin_lock_irq(&dev->power.lock); + if (dev->power.wakeup) { + count = dev->power.wakeup->active_count; + enabled = true; + } + spin_unlock_irq(&dev->power.lock); + return enabled ? sprintf(buf, "%lu\n", count) : sprintf(buf, "\n"); +} + +static DEVICE_ATTR(wakeup_active_count, 0444, wakeup_active_count_show, NULL); + +static ssize_t wakeup_hit_count_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + unsigned long count = 0; + bool enabled = false; + + spin_lock_irq(&dev->power.lock); + if (dev->power.wakeup) { + count = dev->power.wakeup->hit_count; + enabled = true; + } + spin_unlock_irq(&dev->power.lock); + return enabled ? sprintf(buf, "%lu\n", count) : sprintf(buf, "\n"); +} + +static DEVICE_ATTR(wakeup_hit_count, 0444, wakeup_hit_count_show, NULL); + +static ssize_t wakeup_active_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + unsigned int active = 0; + bool enabled = false; + + spin_lock_irq(&dev->power.lock); + if (dev->power.wakeup) { + active = dev->power.wakeup->active; + enabled = true; + } + spin_unlock_irq(&dev->power.lock); + return enabled ? sprintf(buf, "%u\n", active) : sprintf(buf, "\n"); +} + +static DEVICE_ATTR(wakeup_active, 0444, wakeup_active_show, NULL); + +static ssize_t wakeup_total_time_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + s64 msec = 0; + bool enabled = false; + + spin_lock_irq(&dev->power.lock); + if (dev->power.wakeup) { + msec = ktime_to_ms(dev->power.wakeup->total_time); + enabled = true; + } + spin_unlock_irq(&dev->power.lock); + return enabled ? sprintf(buf, "%lld\n", msec) : sprintf(buf, "\n"); +} + +static DEVICE_ATTR(wakeup_total_time_ms, 0444, wakeup_total_time_show, NULL); + +static ssize_t wakeup_max_time_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + s64 msec = 0; + bool enabled = false; + + spin_lock_irq(&dev->power.lock); + if (dev->power.wakeup) { + msec = ktime_to_ms(dev->power.wakeup->max_time); + enabled = true; + } + spin_unlock_irq(&dev->power.lock); + return enabled ? sprintf(buf, "%lld\n", msec) : sprintf(buf, "\n"); +} + +static DEVICE_ATTR(wakeup_max_time_ms, 0444, wakeup_max_time_show, NULL); + +static ssize_t wakeup_last_time_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + s64 msec = 0; + bool enabled = false; + + spin_lock_irq(&dev->power.lock); + if (dev->power.wakeup) { + msec = ktime_to_ms(dev->power.wakeup->last_time); + enabled = true; + } + spin_unlock_irq(&dev->power.lock); + return enabled ? sprintf(buf, "%lld\n", msec) : sprintf(buf, "\n"); +} + +static DEVICE_ATTR(wakeup_last_time_ms, 0444, wakeup_last_time_show, NULL); +#endif /* CONFIG_PM_SLEEP */ #ifdef CONFIG_PM_ADVANCED_DEBUG #ifdef CONFIG_PM_RUNTIME @@ -288,6 +399,12 @@ static struct attribute * power_attrs[] = { &dev_attr_wakeup.attr, #ifdef CONFIG_PM_SLEEP &dev_attr_wakeup_count.attr, + &dev_attr_wakeup_active_count.attr, + &dev_attr_wakeup_hit_count.attr, + &dev_attr_wakeup_active.attr, + &dev_attr_wakeup_total_time_ms.attr, + &dev_attr_wakeup_max_time_ms.attr, + &dev_attr_wakeup_last_time_ms.attr, #endif #ifdef CONFIG_PM_ADVANCED_DEBUG &dev_attr_async.attr, diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c index eb594facfc3f..03751a01dbad 100644 --- a/drivers/base/power/wakeup.c +++ b/drivers/base/power/wakeup.c @@ -11,7 +11,10 @@ #include #include #include -#include + +#include "power.h" + +#define TIMEOUT 100 /* * If set, the suspend/hibernate code will abort transitions to a sleep state @@ -20,18 +23,244 @@ bool events_check_enabled; /* The counter of registered wakeup events. */ -static unsigned long event_count; +static atomic_t event_count = ATOMIC_INIT(0); /* A preserved old value of event_count. */ -static unsigned long saved_event_count; +static unsigned int saved_count; /* The counter of wakeup events being processed. */ -static unsigned long events_in_progress; +static atomic_t events_in_progress = ATOMIC_INIT(0); static DEFINE_SPINLOCK(events_lock); static void pm_wakeup_timer_fn(unsigned long data); -static DEFINE_TIMER(events_timer, pm_wakeup_timer_fn, 0, 0); -static unsigned long events_timer_expires; +static LIST_HEAD(wakeup_sources); + +/** + * wakeup_source_create - Create a struct wakeup_source object. + * @name: Name of the new wakeup source. + */ +struct wakeup_source *wakeup_source_create(const char *name) +{ + struct wakeup_source *ws; + + ws = kzalloc(sizeof(*ws), GFP_KERNEL); + if (!ws) + return NULL; + + spin_lock_init(&ws->lock); + if (name) + ws->name = kstrdup(name, GFP_KERNEL); + + return ws; +} +EXPORT_SYMBOL_GPL(wakeup_source_create); + +/** + * wakeup_source_destroy - Destroy a struct wakeup_source object. + * @ws: Wakeup source to destroy. + */ +void wakeup_source_destroy(struct wakeup_source *ws) +{ + if (!ws) + return; + + spin_lock_irq(&ws->lock); + while (ws->active) { + spin_unlock_irq(&ws->lock); + + schedule_timeout_interruptible(msecs_to_jiffies(TIMEOUT)); + + spin_lock_irq(&ws->lock); + } + spin_unlock_irq(&ws->lock); + + kfree(ws->name); + kfree(ws); +} +EXPORT_SYMBOL_GPL(wakeup_source_destroy); + +/** + * wakeup_source_add - Add given object to the list of wakeup sources. + * @ws: Wakeup source object to add to the list. + */ +void wakeup_source_add(struct wakeup_source *ws) +{ + if (WARN_ON(!ws)) + return; + + setup_timer(&ws->timer, pm_wakeup_timer_fn, (unsigned long)ws); + ws->active = false; + + spin_lock_irq(&events_lock); + list_add_rcu(&ws->entry, &wakeup_sources); + spin_unlock_irq(&events_lock); + synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(wakeup_source_add); + +/** + * wakeup_source_remove - Remove given object from the wakeup sources list. + * @ws: Wakeup source object to remove from the list. + */ +void wakeup_source_remove(struct wakeup_source *ws) +{ + if (WARN_ON(!ws)) + return; + + spin_lock_irq(&events_lock); + list_del_rcu(&ws->entry); + spin_unlock_irq(&events_lock); + synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(wakeup_source_remove); + +/** + * wakeup_source_register - Create wakeup source and add it to the list. + * @name: Name of the wakeup source to register. + */ +struct wakeup_source *wakeup_source_register(const char *name) +{ + struct wakeup_source *ws; + + ws = wakeup_source_create(name); + if (ws) + wakeup_source_add(ws); + + return ws; +} +EXPORT_SYMBOL_GPL(wakeup_source_register); + +/** + * wakeup_source_unregister - Remove wakeup source from the list and remove it. + * @ws: Wakeup source object to unregister. + */ +void wakeup_source_unregister(struct wakeup_source *ws) +{ + wakeup_source_remove(ws); + wakeup_source_destroy(ws); +} +EXPORT_SYMBOL_GPL(wakeup_source_unregister); + +/** + * device_wakeup_attach - Attach a wakeup source object to a device object. + * @dev: Device to handle. + * @ws: Wakeup source object to attach to @dev. + * + * This causes @dev to be treated as a wakeup device. + */ +static int device_wakeup_attach(struct device *dev, struct wakeup_source *ws) +{ + spin_lock_irq(&dev->power.lock); + if (dev->power.wakeup) { + spin_unlock_irq(&dev->power.lock); + return -EEXIST; + } + dev->power.wakeup = ws; + spin_unlock_irq(&dev->power.lock); + return 0; +} + +/** + * device_wakeup_enable - Enable given device to be a wakeup source. + * @dev: Device to handle. + * + * Create a wakeup source object, register it and attach it to @dev. + */ +int device_wakeup_enable(struct device *dev) +{ + struct wakeup_source *ws; + int ret; + + if (!dev || !dev->power.can_wakeup) + return -EINVAL; + + ws = wakeup_source_register(dev_name(dev)); + if (!ws) + return -ENOMEM; + + ret = device_wakeup_attach(dev, ws); + if (ret) + wakeup_source_unregister(ws); + + return ret; +} +EXPORT_SYMBOL_GPL(device_wakeup_enable); + +/** + * device_wakeup_detach - Detach a device's wakeup source object from it. + * @dev: Device to detach the wakeup source object from. + * + * After it returns, @dev will not be treated as a wakeup device any more. + */ +static struct wakeup_source *device_wakeup_detach(struct device *dev) +{ + struct wakeup_source *ws; + + spin_lock_irq(&dev->power.lock); + ws = dev->power.wakeup; + dev->power.wakeup = NULL; + spin_unlock_irq(&dev->power.lock); + return ws; +} + +/** + * device_wakeup_disable - Do not regard a device as a wakeup source any more. + * @dev: Device to handle. + * + * Detach the @dev's wakeup source object from it, unregister this wakeup source + * object and destroy it. + */ +int device_wakeup_disable(struct device *dev) +{ + struct wakeup_source *ws; + + if (!dev || !dev->power.can_wakeup) + return -EINVAL; + + ws = device_wakeup_detach(dev); + if (ws) + wakeup_source_unregister(ws); + + return 0; +} +EXPORT_SYMBOL_GPL(device_wakeup_disable); + +/** + * device_init_wakeup - Device wakeup initialization. + * @dev: Device to handle. + * @enable: Whether or not to enable @dev as a wakeup device. + * + * By default, most devices should leave wakeup disabled. The exceptions are + * devices that everyone expects to be wakeup sources: keyboards, power buttons, + * possibly network interfaces, etc. + */ +int device_init_wakeup(struct device *dev, bool enable) +{ + int ret = 0; + + if (enable) { + device_set_wakeup_capable(dev, true); + ret = device_wakeup_enable(dev); + } else { + device_set_wakeup_capable(dev, false); + } + + return ret; +} +EXPORT_SYMBOL_GPL(device_init_wakeup); + +/** + * device_set_wakeup_enable - Enable or disable a device to wake up the system. + * @dev: Device to handle. + */ +int device_set_wakeup_enable(struct device *dev, bool enable) +{ + if (!dev || !dev->power.can_wakeup) + return -EINVAL; + + return enable ? device_wakeup_enable(dev) : device_wakeup_disable(dev); +} +EXPORT_SYMBOL_GPL(device_set_wakeup_enable); /* * The functions below use the observation that each wakeup event starts a @@ -55,118 +284,259 @@ static unsigned long events_timer_expires; * knowledge, however, may not be available to it, so it can simply specify time * to wait before the system can be suspended and pass it as the second * argument of pm_wakeup_event(). + * + * It is valid to call pm_relax() after pm_wakeup_event(), in which case the + * "no suspend" period will be ended either by the pm_relax(), or by the timer + * function executed when the timer expires, whichever comes first. */ +/** + * wakup_source_activate - Mark given wakeup source as active. + * @ws: Wakeup source to handle. + * + * Update the @ws' statistics and, if @ws has just been activated, notify the PM + * core of the event by incrementing the counter of of wakeup events being + * processed. + */ +static void wakeup_source_activate(struct wakeup_source *ws) +{ + ws->active = true; + ws->active_count++; + ws->timer_expires = jiffies; + ws->last_time = ktime_get(); + + atomic_inc(&events_in_progress); +} + +/** + * __pm_stay_awake - Notify the PM core of a wakeup event. + * @ws: Wakeup source object associated with the source of the event. + * + * It is safe to call this function from interrupt context. + */ +void __pm_stay_awake(struct wakeup_source *ws) +{ + unsigned long flags; + + if (!ws) + return; + + spin_lock_irqsave(&ws->lock, flags); + ws->event_count++; + if (!ws->active) + wakeup_source_activate(ws); + spin_unlock_irqrestore(&ws->lock, flags); +} +EXPORT_SYMBOL_GPL(__pm_stay_awake); + /** * pm_stay_awake - Notify the PM core that a wakeup event is being processed. * @dev: Device the wakeup event is related to. * - * Notify the PM core of a wakeup event (signaled by @dev) by incrementing the - * counter of wakeup events being processed. If @dev is not NULL, the counter - * of wakeup events related to @dev is incremented too. + * Notify the PM core of a wakeup event (signaled by @dev) by calling + * __pm_stay_awake for the @dev's wakeup source object. * * Call this function after detecting of a wakeup event if pm_relax() is going * to be called directly after processing the event (and possibly passing it to * user space for further processing). - * - * It is safe to call this function from interrupt context. */ void pm_stay_awake(struct device *dev) { unsigned long flags; - spin_lock_irqsave(&events_lock, flags); - if (dev) - dev->power.wakeup_count++; + if (!dev) + return; - events_in_progress++; - spin_unlock_irqrestore(&events_lock, flags); + spin_lock_irqsave(&dev->power.lock, flags); + __pm_stay_awake(dev->power.wakeup); + spin_unlock_irqrestore(&dev->power.lock, flags); } +EXPORT_SYMBOL_GPL(pm_stay_awake); /** - * pm_relax - Notify the PM core that processing of a wakeup event has ended. + * wakup_source_deactivate - Mark given wakeup source as inactive. + * @ws: Wakeup source to handle. * - * Notify the PM core that a wakeup event has been processed by decrementing - * the counter of wakeup events being processed and incrementing the counter - * of registered wakeup events. + * Update the @ws' statistics and notify the PM core that the wakeup source has + * become inactive by decrementing the counter of wakeup events being processed + * and incrementing the counter of registered wakeup events. + */ +static void wakeup_source_deactivate(struct wakeup_source *ws) +{ + ktime_t duration; + ktime_t now; + + ws->relax_count++; + /* + * __pm_relax() may be called directly or from a timer function. + * If it is called directly right after the timer function has been + * started, but before the timer function calls __pm_relax(), it is + * possible that __pm_stay_awake() will be called in the meantime and + * will set ws->active. Then, ws->active may be cleared immediately + * by the __pm_relax() called from the timer function, but in such a + * case ws->relax_count will be different from ws->active_count. + */ + if (ws->relax_count != ws->active_count) { + ws->relax_count--; + return; + } + + ws->active = false; + + now = ktime_get(); + duration = ktime_sub(now, ws->last_time); + ws->total_time = ktime_add(ws->total_time, duration); + if (ktime_to_ns(duration) > ktime_to_ns(ws->max_time)) + ws->max_time = duration; + + del_timer(&ws->timer); + + /* + * event_count has to be incremented before events_in_progress is + * modified, so that the callers of pm_check_wakeup_events() and + * pm_save_wakeup_count() don't see the old value of event_count and + * events_in_progress equal to zero at the same time. + */ + atomic_inc(&event_count); + smp_mb__before_atomic_dec(); + atomic_dec(&events_in_progress); +} + +/** + * __pm_relax - Notify the PM core that processing of a wakeup event has ended. + * @ws: Wakeup source object associated with the source of the event. * * Call this function for wakeup events whose processing started with calling - * pm_stay_awake(). + * __pm_stay_awake(). * * It is safe to call it from interrupt context. */ -void pm_relax(void) +void __pm_relax(struct wakeup_source *ws) { unsigned long flags; - spin_lock_irqsave(&events_lock, flags); - if (events_in_progress) { - events_in_progress--; - event_count++; - } - spin_unlock_irqrestore(&events_lock, flags); + if (!ws) + return; + + spin_lock_irqsave(&ws->lock, flags); + if (ws->active) + wakeup_source_deactivate(ws); + spin_unlock_irqrestore(&ws->lock, flags); +} +EXPORT_SYMBOL_GPL(__pm_relax); + +/** + * pm_relax - Notify the PM core that processing of a wakeup event has ended. + * @dev: Device that signaled the event. + * + * Execute __pm_relax() for the @dev's wakeup source object. + */ +void pm_relax(struct device *dev) +{ + unsigned long flags; + + if (!dev) + return; + + spin_lock_irqsave(&dev->power.lock, flags); + __pm_relax(dev->power.wakeup); + spin_unlock_irqrestore(&dev->power.lock, flags); } +EXPORT_SYMBOL_GPL(pm_relax); /** * pm_wakeup_timer_fn - Delayed finalization of a wakeup event. + * @data: Address of the wakeup source object associated with the event source. * - * Decrease the counter of wakeup events being processed after it was increased - * by pm_wakeup_event(). + * Call __pm_relax() for the wakeup source whose address is stored in @data. */ static void pm_wakeup_timer_fn(unsigned long data) +{ + __pm_relax((struct wakeup_source *)data); +} + +/** + * __pm_wakeup_event - Notify the PM core of a wakeup event. + * @ws: Wakeup source object associated with the event source. + * @msec: Anticipated event processing time (in milliseconds). + * + * Notify the PM core of a wakeup event whose source is @ws that will take + * approximately @msec milliseconds to be processed by the kernel. If @ws is + * not active, activate it. If @msec is nonzero, set up the @ws' timer to + * execute pm_wakeup_timer_fn() in future. + * + * It is safe to call this function from interrupt context. + */ +void __pm_wakeup_event(struct wakeup_source *ws, unsigned int msec) { unsigned long flags; + unsigned long expires; - spin_lock_irqsave(&events_lock, flags); - if (events_timer_expires - && time_before_eq(events_timer_expires, jiffies)) { - events_in_progress--; - events_timer_expires = 0; + if (!ws) + return; + + spin_lock_irqsave(&ws->lock, flags); + + ws->event_count++; + if (!ws->active) + wakeup_source_activate(ws); + + if (!msec) { + wakeup_source_deactivate(ws); + goto unlock; } - spin_unlock_irqrestore(&events_lock, flags); + + expires = jiffies + msecs_to_jiffies(msec); + if (!expires) + expires = 1; + + if (time_after(expires, ws->timer_expires)) { + mod_timer(&ws->timer, expires); + ws->timer_expires = expires; + } + + unlock: + spin_unlock_irqrestore(&ws->lock, flags); } +EXPORT_SYMBOL_GPL(__pm_wakeup_event); + /** * pm_wakeup_event - Notify the PM core of a wakeup event. * @dev: Device the wakeup event is related to. * @msec: Anticipated event processing time (in milliseconds). * - * Notify the PM core of a wakeup event (signaled by @dev) that will take - * approximately @msec milliseconds to be processed by the kernel. Increment - * the counter of registered wakeup events and (if @msec is nonzero) set up - * the wakeup events timer to execute pm_wakeup_timer_fn() in future (if the - * timer has not been set up already, increment the counter of wakeup events - * being processed). If @dev is not NULL, the counter of wakeup events related - * to @dev is incremented too. - * - * It is safe to call this function from interrupt context. + * Call __pm_wakeup_event() for the @dev's wakeup source object. */ void pm_wakeup_event(struct device *dev, unsigned int msec) { unsigned long flags; - spin_lock_irqsave(&events_lock, flags); - event_count++; - if (dev) - dev->power.wakeup_count++; - - if (msec) { - unsigned long expires; + if (!dev) + return; - expires = jiffies + msecs_to_jiffies(msec); - if (!expires) - expires = 1; + spin_lock_irqsave(&dev->power.lock, flags); + __pm_wakeup_event(dev->power.wakeup, msec); + spin_unlock_irqrestore(&dev->power.lock, flags); +} +EXPORT_SYMBOL_GPL(pm_wakeup_event); - if (!events_timer_expires - || time_after(expires, events_timer_expires)) { - if (!events_timer_expires) - events_in_progress++; +/** + * pm_wakeup_update_hit_counts - Update hit counts of all active wakeup sources. + */ +static void pm_wakeup_update_hit_counts(void) +{ + unsigned long flags; + struct wakeup_source *ws; - mod_timer(&events_timer, expires); - events_timer_expires = expires; - } + rcu_read_lock(); + list_for_each_entry_rcu(ws, &wakeup_sources, entry) { + spin_lock_irqsave(&ws->lock, flags); + if (ws->active) + ws->hit_count++; + spin_unlock_irqrestore(&ws->lock, flags); } - spin_unlock_irqrestore(&events_lock, flags); + rcu_read_unlock(); } /** @@ -184,10 +554,13 @@ bool pm_check_wakeup_events(void) spin_lock_irqsave(&events_lock, flags); if (events_check_enabled) { - ret = (event_count == saved_event_count) && !events_in_progress; + ret = ((unsigned int)atomic_read(&event_count) == saved_count) + && !atomic_read(&events_in_progress); events_check_enabled = ret; } spin_unlock_irqrestore(&events_lock, flags); + if (!ret) + pm_wakeup_update_hit_counts(); return ret; } @@ -202,24 +575,20 @@ bool pm_check_wakeup_events(void) * drop down to zero has been interrupted by a signal (and the current number * of wakeup events being processed is still nonzero). Otherwise return true. */ -bool pm_get_wakeup_count(unsigned long *count) +bool pm_get_wakeup_count(unsigned int *count) { bool ret; - spin_lock_irq(&events_lock); if (capable(CAP_SYS_ADMIN)) events_check_enabled = false; - while (events_in_progress && !signal_pending(current)) { - spin_unlock_irq(&events_lock); - - schedule_timeout_interruptible(msecs_to_jiffies(100)); - - spin_lock_irq(&events_lock); + while (atomic_read(&events_in_progress) && !signal_pending(current)) { + pm_wakeup_update_hit_counts(); + schedule_timeout_interruptible(msecs_to_jiffies(TIMEOUT)); } - *count = event_count; - ret = !events_in_progress; - spin_unlock_irq(&events_lock); + + ret = !atomic_read(&events_in_progress); + *count = atomic_read(&event_count); return ret; } @@ -232,16 +601,19 @@ bool pm_get_wakeup_count(unsigned long *count) * old number of registered wakeup events to be used by pm_check_wakeup_events() * and return true. Otherwise return false. */ -bool pm_save_wakeup_count(unsigned long count) +bool pm_save_wakeup_count(unsigned int count) { bool ret = false; spin_lock_irq(&events_lock); - if (count == event_count && !events_in_progress) { - saved_event_count = count; + if (count == (unsigned int)atomic_read(&event_count) + && !atomic_read(&events_in_progress)) { + saved_count = count; events_check_enabled = true; ret = true; } spin_unlock_irq(&events_lock); + if (!ret) + pm_wakeup_update_hit_counts(); return ret; } diff --git a/include/linux/pm.h b/include/linux/pm.h index 52e8c55ff314..a84118911ced 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -448,23 +448,24 @@ enum rpm_request { RPM_REQ_RESUME, }; +struct wakeup_source; + struct dev_pm_info { pm_message_t power_state; unsigned int can_wakeup:1; - unsigned int should_wakeup:1; unsigned async_suspend:1; enum dpm_state status; /* Owned by the PM core */ + spinlock_t lock; #ifdef CONFIG_PM_SLEEP struct list_head entry; struct completion completion; - unsigned long wakeup_count; + struct wakeup_source *wakeup; #endif #ifdef CONFIG_PM_RUNTIME struct timer_list suspend_timer; unsigned long timer_expires; struct work_struct work; wait_queue_head_t wait_queue; - spinlock_t lock; atomic_t usage_count; atomic_t child_count; unsigned int disable_depth:3; @@ -559,11 +560,6 @@ extern void __suspend_report_result(const char *function, void *fn, int ret); } while (0) extern void device_pm_wait_for_dev(struct device *sub, struct device *dev); - -/* drivers/base/power/wakeup.c */ -extern void pm_wakeup_event(struct device *dev, unsigned int msec); -extern void pm_stay_awake(struct device *dev); -extern void pm_relax(void); #else /* !CONFIG_PM_SLEEP */ #define device_pm_lock() do {} while (0) @@ -577,10 +573,6 @@ static inline int dpm_suspend_start(pm_message_t state) #define suspend_report_result(fn, ret) do {} while (0) static inline void device_pm_wait_for_dev(struct device *a, struct device *b) {} - -static inline void pm_wakeup_event(struct device *dev, unsigned int msec) {} -static inline void pm_stay_awake(struct device *dev) {} -static inline void pm_relax(void) {} #endif /* !CONFIG_PM_SLEEP */ /* How to reorder dpm_list after device_move() */ diff --git a/include/linux/pm_wakeup.h b/include/linux/pm_wakeup.h index 76aca48722ae..9cff00dd6b63 100644 --- a/include/linux/pm_wakeup.h +++ b/include/linux/pm_wakeup.h @@ -2,6 +2,7 @@ * pm_wakeup.h - Power management wakeup interface * * Copyright (C) 2008 Alan Stern + * Copyright (C) 2010 Rafael J. Wysocki, Novell Inc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -27,19 +28,77 @@ #include -#ifdef CONFIG_PM - -/* Changes to device_may_wakeup take effect on the next pm state change. +/** + * struct wakeup_source - Representation of wakeup sources * - * By default, most devices should leave wakeup disabled. The exceptions - * are devices that everyone expects to be wakeup sources: keyboards, - * power buttons, possibly network interfaces, etc. + * @total_time: Total time this wakeup source has been active. + * @max_time: Maximum time this wakeup source has been continuously active. + * @last_time: Monotonic clock when the wakeup source's was activated last time. + * @event_count: Number of signaled wakeup events. + * @active_count: Number of times the wakeup sorce was activated. + * @relax_count: Number of times the wakeup sorce was deactivated. + * @hit_count: Number of times the wakeup sorce might abort system suspend. + * @active: Status of the wakeup source. */ -static inline void device_init_wakeup(struct device *dev, bool val) +struct wakeup_source { + char *name; + struct list_head entry; + spinlock_t lock; + struct timer_list timer; + unsigned long timer_expires; + ktime_t total_time; + ktime_t max_time; + ktime_t last_time; + unsigned long event_count; + unsigned long active_count; + unsigned long relax_count; + unsigned long hit_count; + unsigned int active:1; +}; + +#ifdef CONFIG_PM_SLEEP + +/* + * Changes to device_may_wakeup take effect on the next pm state change. + */ + +static inline void device_set_wakeup_capable(struct device *dev, bool capable) +{ + dev->power.can_wakeup = capable; +} + +static inline bool device_can_wakeup(struct device *dev) +{ + return dev->power.can_wakeup; +} + + + +static inline bool device_may_wakeup(struct device *dev) { - dev->power.can_wakeup = dev->power.should_wakeup = val; + return dev->power.can_wakeup && !!dev->power.wakeup; } +/* drivers/base/power/wakeup.c */ +extern struct wakeup_source *wakeup_source_create(const char *name); +extern void wakeup_source_destroy(struct wakeup_source *ws); +extern void wakeup_source_add(struct wakeup_source *ws); +extern void wakeup_source_remove(struct wakeup_source *ws); +extern struct wakeup_source *wakeup_source_register(const char *name); +extern void wakeup_source_unregister(struct wakeup_source *ws); +extern int device_wakeup_enable(struct device *dev); +extern int device_wakeup_disable(struct device *dev); +extern int device_init_wakeup(struct device *dev, bool val); +extern int device_set_wakeup_enable(struct device *dev, bool enable); +extern void __pm_stay_awake(struct wakeup_source *ws); +extern void pm_stay_awake(struct device *dev); +extern void __pm_relax(struct wakeup_source *ws); +extern void pm_relax(struct device *dev); +extern void __pm_wakeup_event(struct wakeup_source *ws, unsigned int msec); +extern void pm_wakeup_event(struct device *dev, unsigned int msec); + +#else /* !CONFIG_PM_SLEEP */ + static inline void device_set_wakeup_capable(struct device *dev, bool capable) { dev->power.can_wakeup = capable; @@ -50,43 +109,63 @@ static inline bool device_can_wakeup(struct device *dev) return dev->power.can_wakeup; } -static inline void device_set_wakeup_enable(struct device *dev, bool enable) +static inline bool device_may_wakeup(struct device *dev) { - dev->power.should_wakeup = enable; + return false; } -static inline bool device_may_wakeup(struct device *dev) +static inline struct wakeup_source *wakeup_source_create(const char *name) { - return dev->power.can_wakeup && dev->power.should_wakeup; + return NULL; } -#else /* !CONFIG_PM */ +static inline void wakeup_source_destroy(struct wakeup_source *ws) {} + +static inline void wakeup_source_add(struct wakeup_source *ws) {} -/* For some reason the following routines work even without CONFIG_PM */ -static inline void device_init_wakeup(struct device *dev, bool val) +static inline void wakeup_source_remove(struct wakeup_source *ws) {} + +static inline struct wakeup_source *wakeup_source_register(const char *name) { - dev->power.can_wakeup = val; + return NULL; } -static inline void device_set_wakeup_capable(struct device *dev, bool capable) +static inline void wakeup_source_unregister(struct wakeup_source *ws) {} + +static inline int device_wakeup_enable(struct device *dev) { - dev->power.can_wakeup = capable; + return -EINVAL; } -static inline bool device_can_wakeup(struct device *dev) +static inline int device_wakeup_disable(struct device *dev) { - return dev->power.can_wakeup; + return 0; } -static inline void device_set_wakeup_enable(struct device *dev, bool enable) +static inline int device_init_wakeup(struct device *dev, bool val) { + dev->power.can_wakeup = val; + return val ? -EINVAL : 0; } -static inline bool device_may_wakeup(struct device *dev) + +static inline int device_set_wakeup_enable(struct device *dev, bool enable) { - return false; + return -EINVAL; } -#endif /* !CONFIG_PM */ +static inline void __pm_stay_awake(struct wakeup_source *ws) {} + +static inline void pm_stay_awake(struct device *dev) {} + +static inline void __pm_relax(struct wakeup_source *ws) {} + +static inline void pm_relax(struct device *dev) {} + +static inline void __pm_wakeup_event(struct wakeup_source *ws, unsigned int msec) {} + +static inline void pm_wakeup_event(struct device *dev, unsigned int msec) {} + +#endif /* !CONFIG_PM_SLEEP */ #endif /* _LINUX_PM_WAKEUP_H */ diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 4af270ec2204..6b1712c51102 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -293,8 +293,8 @@ extern int unregister_pm_notifier(struct notifier_block *nb); extern bool events_check_enabled; extern bool pm_check_wakeup_events(void); -extern bool pm_get_wakeup_count(unsigned long *count); -extern bool pm_save_wakeup_count(unsigned long count); +extern bool pm_get_wakeup_count(unsigned int *count); +extern bool pm_save_wakeup_count(unsigned int count); #else /* !CONFIG_PM_SLEEP */ static inline int register_pm_notifier(struct notifier_block *nb) diff --git a/kernel/power/main.c b/kernel/power/main.c index f06ad6eff37a..6b12a0cf4d9f 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -237,18 +237,18 @@ static ssize_t wakeup_count_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - unsigned long val; + unsigned int val; - return pm_get_wakeup_count(&val) ? sprintf(buf, "%lu\n", val) : -EINTR; + return pm_get_wakeup_count(&val) ? sprintf(buf, "%u\n", val) : -EINTR; } static ssize_t wakeup_count_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { - unsigned long val; + unsigned int val; - if (sscanf(buf, "%lu", &val) == 1) { + if (sscanf(buf, "%u", &val) == 1) { if (pm_save_wakeup_count(val)) return n; } -- cgit v1.2.3 From d0941ead3fdd31aafff992d211bcefdbff1eaedb Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 28 Sep 2010 23:31:22 +0200 Subject: PM / Hibernate: Make some boot messages look less scary The hibernate resume code checks if there is an image to resume from on every boot and, if the kernel is built with CONFIG_PM_DEBUG set and the image is not present, it prints some scary messages suggesting there was a boot error of some sort. Apparently, some users are confused by them, so make them look less scary and adjust the other hibernate resume debug messages to match them. Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 12 ++++++------ kernel/power/swap.c | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 6c9c9dc48c75..657272e91d0a 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -708,7 +708,7 @@ static int software_resume(void) goto Unlock; } - pr_debug("PM: Checking image partition %s\n", resume_file); + pr_debug("PM: Checking hibernation image partition %s\n", resume_file); /* Check if the device is there */ swsusp_resume_device = name_to_dev_t(resume_file); @@ -733,10 +733,10 @@ static int software_resume(void) } Check_image: - pr_debug("PM: Resume from partition %d:%d\n", + pr_debug("PM: Hibernation image partition %d:%d present\n", MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device)); - pr_debug("PM: Checking hibernation image.\n"); + pr_debug("PM: Looking for hibernation image.\n"); error = swsusp_check(); if (error) goto Unlock; @@ -768,14 +768,14 @@ static int software_resume(void) goto Done; } - pr_debug("PM: Reading hibernation image.\n"); + pr_debug("PM: Loading hibernation image.\n"); error = swsusp_read(&flags); swsusp_close(FMODE_READ); if (!error) hibernation_restore(flags & SF_PLATFORM_MODE); - printk(KERN_ERR "PM: Restore failed, recovering.\n"); + printk(KERN_ERR "PM: Failed to load hibernation image, recovering.\n"); swsusp_free(); thaw_processes(); Done: @@ -788,7 +788,7 @@ static int software_resume(void) /* For success case, the suspend path will release the lock */ Unlock: mutex_unlock(&pm_mutex); - pr_debug("PM: Resume from disk failed.\n"); + pr_debug("PM: Hibernation image not present or could not be loaded.\n"); return error; close_finish: swsusp_close(FMODE_READ); diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 3dc0552cddfb..2dfa87869b49 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -929,13 +929,13 @@ put: if (error) blkdev_put(hib_resume_bdev, FMODE_READ); else - pr_debug("PM: Signature found, resuming\n"); + pr_debug("PM: Image signature found, resuming\n"); } else { error = PTR_ERR(hib_resume_bdev); } if (error) - pr_debug("PM: Error %d checking image file\n", error); + pr_debug("PM: Image not found (code %d)\n", error); return error; } -- cgit v1.2.3 From dbeeec5fe868f2e2e92fe94daa2c5a047240fdc4 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 4 Oct 2010 22:07:32 +0200 Subject: PM: Allow wakeup events to abort freezing of tasks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If there is a wakeup event during the freezing of tasks, suspend or hibernation will fail anyway. Since try_to_freeze_tasks() can take up to 20 seconds to complete or fail, aborting it as soon as a wakeup event is detected improves the worst case wakeup latency. Based on a patch from Arve Hjønnevåg. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek --- include/linux/suspend.h | 2 ++ kernel/power/process.c | 11 +++++++++-- 2 files changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 6b1712c51102..26697514c5ec 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -308,6 +308,8 @@ static inline int unregister_pm_notifier(struct notifier_block *nb) } #define pm_notifier(fn, pri) do { (void)(fn); } while (0) + +static inline bool pm_check_wakeup_events(void) { return true; } #endif /* !CONFIG_PM_SLEEP */ extern struct mutex pm_mutex; diff --git a/kernel/power/process.c b/kernel/power/process.c index 028a99598f49..e50b4c1b2a0f 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -40,6 +40,7 @@ static int try_to_freeze_tasks(bool sig_only) struct timeval start, end; u64 elapsed_csecs64; unsigned int elapsed_csecs; + bool wakeup = false; do_gettimeofday(&start); @@ -78,6 +79,11 @@ static int try_to_freeze_tasks(bool sig_only) if (!todo || time_after(jiffies, end_time)) break; + if (!pm_check_wakeup_events()) { + wakeup = true; + break; + } + /* * We need to retry, but first give the freezing tasks some * time to enter the regrigerator. @@ -97,8 +103,9 @@ static int try_to_freeze_tasks(bool sig_only) * but it cleans up leftover PF_FREEZE requests. */ printk("\n"); - printk(KERN_ERR "Freezing of tasks failed after %d.%02d seconds " + printk(KERN_ERR "Freezing of tasks %s after %d.%02d seconds " "(%d tasks refusing to freeze, wq_busy=%d):\n", + wakeup ? "aborted" : "failed", elapsed_csecs / 100, elapsed_csecs % 100, todo - wq_busy, wq_busy); @@ -107,7 +114,7 @@ static int try_to_freeze_tasks(bool sig_only) read_lock(&tasklist_lock); do_each_thread(g, p) { task_lock(p); - if (freezing(p) && !freezer_should_skip(p)) + if (!wakeup && freezing(p) && !freezer_should_skip(p)) sched_show_task(p); cancel_freezing(p); task_unlock(p); -- cgit v1.2.3 From 3624eb04c24861ab296842414f9752a393e68372 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 4 Oct 2010 22:08:12 +0200 Subject: PM / Hibernate: Modify signature used to mark swap Since we are adding compression to the kernel's hibernate code, change signature used by it to mark swap spaces, so that earlier kernels don't attempt to restore compressed images they cannot handle. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek --- kernel/power/swap.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 2dfa87869b49..916eaa790399 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -29,7 +29,7 @@ #include "power.h" -#define SWSUSP_SIG "S1SUSPEND" +#define HIBERNATE_SIG "LINHIB0001" /* * The swap map is a data structure used for keeping track of each page @@ -195,7 +195,7 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) || !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) { memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10); - memcpy(swsusp_header->sig,SWSUSP_SIG, 10); + memcpy(swsusp_header->sig, HIBERNATE_SIG, 10); swsusp_header->image = handle->first_sector; swsusp_header->flags = flags; error = hib_bio_write_page(swsusp_resume_block, @@ -916,7 +916,7 @@ int swsusp_check(void) if (error) goto put; - if (!memcmp(SWSUSP_SIG, swsusp_header->sig, 10)) { + if (!memcmp(HIBERNATE_SIG, swsusp_header->sig, 10)) { memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); /* Reset swap signature now */ error = hib_bio_write_page(swsusp_resume_block, -- cgit v1.2.3 From d33ac60beaf2c7dee5cd90aba7c1eb385dd70937 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 12 Oct 2010 00:00:25 +0200 Subject: PM: Add sysfs attr for rechecking dev hash from PM trace If the device which fails to resume is part of a loadable kernel module it won't be checked at startup against the magic number stored in the RTC. Add a read-only sysfs attribute /sys/power/pm_trace_dev_match which contains a list of newline separated devices (usually just the one) which currently match the last magic number. This allows the device which is failing to resume to be found after the modules are loaded again. Signed-off-by: James Hogan Signed-off-by: Rafael J. Wysocki --- Documentation/ABI/testing/sysfs-power | 29 +++++++++++++++++++++++++++++ Documentation/power/s2ram.txt | 7 +++++++ drivers/base/power/trace.c | 31 +++++++++++++++++++++++++++++++ include/linux/resume-trace.h | 2 ++ kernel/power/main.c | 18 ++++++++++++++++++ 5 files changed, 87 insertions(+) (limited to 'kernel') diff --git a/Documentation/ABI/testing/sysfs-power b/Documentation/ABI/testing/sysfs-power index 2875f1f74a07..194ca446ac28 100644 --- a/Documentation/ABI/testing/sysfs-power +++ b/Documentation/ABI/testing/sysfs-power @@ -99,9 +99,38 @@ Description: dmesg -s 1000000 | grep 'hash matches' + If you do not get any matches (or they appear to be false + positives), it is possible that the last PM event point + referred to a device created by a loadable kernel module. In + this case cat /sys/power/pm_trace_dev_match (see below) after + your system is started up and the kernel modules are loaded. + CAUTION: Using it will cause your machine's real-time (CMOS) clock to be set to a random invalid time after a resume. +What; /sys/power/pm_trace_dev_match +Date: October 2010 +Contact: James Hogan +Description: + The /sys/power/pm_trace_dev_match file contains the name of the + device associated with the last PM event point saved in the RTC + across reboots when pm_trace has been used. More precisely it + contains the list of current devices (including those + registered by loadable kernel modules since boot) which match + the device hash in the RTC at boot, with a newline after each + one. + + The advantage of this file over the hash matches printed to the + kernel log (see /sys/power/pm_trace), is that it includes + devices created after boot by loadable kernel modules. + + Due to the small hash size necessary to fit in the RTC, it is + possible that more than one device matches the hash, in which + case further investigation is required to determine which + device is causing the problem. Note that genuine RTC clock + values (such as when pm_trace has not been used), can still + match a device and output it's name here. + What: /sys/power/pm_async Date: January 2009 Contact: Rafael J. Wysocki diff --git a/Documentation/power/s2ram.txt b/Documentation/power/s2ram.txt index 514b94fc931e..1bdfa0443773 100644 --- a/Documentation/power/s2ram.txt +++ b/Documentation/power/s2ram.txt @@ -49,6 +49,13 @@ machine that doesn't boot) is: device (lspci and /sys/devices/pci* is your friend), and see if you can fix it, disable it, or trace into its resume function. + If no device matches the hash (or any matches appear to be false positives), + the culprit may be a device from a loadable kernel module that is not loaded + until after the hash is checked. You can check the hash against the current + devices again after more modules are loaded using sysfs: + + cat /sys/power/pm_trace_dev_match + For example, the above happens to be the VGA device on my EVO, which I used to run with "radeonfb" (it's an ATI Radeon mobility). It turns out that "radeonfb" simply cannot resume that device - it tries to set the diff --git a/drivers/base/power/trace.c b/drivers/base/power/trace.c index 17e24e3f4422..9f4258df4cfd 100644 --- a/drivers/base/power/trace.c +++ b/drivers/base/power/trace.c @@ -207,6 +207,37 @@ static int show_dev_hash(unsigned int value) static unsigned int hash_value_early_read; +int show_trace_dev_match(char *buf, size_t size) +{ + unsigned int value = hash_value_early_read / (USERHASH * FILEHASH); + int ret = 0; + struct list_head *entry; + + /* + * It's possible that multiple devices will match the hash and we can't + * tell which is the culprit, so it's best to output them all. + */ + device_pm_lock(); + entry = dpm_list.prev; + while (size && entry != &dpm_list) { + struct device *dev = to_device(entry); + unsigned int hash = hash_string(DEVSEED, dev_name(dev), + DEVHASH); + if (hash == value) { + int len = snprintf(buf, size, "%s\n", + dev_driver_string(dev)); + if (len > size) + len = size; + buf += len; + ret += len; + size -= len; + } + entry = entry->prev; + } + device_pm_unlock(); + return ret; +} + static int early_resume_init(void) { hash_value_early_read = read_magic_time(); diff --git a/include/linux/resume-trace.h b/include/linux/resume-trace.h index bc8c3881c729..f31db2368782 100644 --- a/include/linux/resume-trace.h +++ b/include/linux/resume-trace.h @@ -3,6 +3,7 @@ #ifdef CONFIG_PM_TRACE #include +#include extern int pm_trace_enabled; @@ -14,6 +15,7 @@ static inline int pm_trace_is_enabled(void) struct device; extern void set_trace_device(struct device *); extern void generate_resume_trace(const void *tracedata, unsigned int user); +extern int show_trace_dev_match(char *buf, size_t size); #define TRACE_DEVICE(dev) do { \ if (pm_trace_enabled) \ diff --git a/kernel/power/main.c b/kernel/power/main.c index 6b12a0cf4d9f..7b5db6a8561e 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -281,12 +281,30 @@ pm_trace_store(struct kobject *kobj, struct kobj_attribute *attr, } power_attr(pm_trace); + +static ssize_t pm_trace_dev_match_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return show_trace_dev_match(buf, PAGE_SIZE); +} + +static ssize_t +pm_trace_dev_match_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t n) +{ + return -EINVAL; +} + +power_attr(pm_trace_dev_match); + #endif /* CONFIG_PM_TRACE */ static struct attribute * g[] = { &state_attr.attr, #ifdef CONFIG_PM_TRACE &pm_trace_attr.attr, + &pm_trace_dev_match_attr.attr, #endif #ifdef CONFIG_PM_SLEEP &pm_async_attr.attr, -- cgit v1.2.3 From e1f60b292ffd61151403327aa19ff7a1871820bd Mon Sep 17 00:00:00 2001 From: Nishanth Menon Date: Wed, 13 Oct 2010 00:13:10 +0200 Subject: PM: Introduce library for device-specific OPPs (v7) SoCs have a standard set of tuples consisting of frequency and voltage pairs that the device will support per voltage domain. These are called Operating Performance Points or OPPs. The actual definitions of OPP varies over silicon versions. For a specific domain, we can have a set of {frequency, voltage} pairs. As the kernel boots and more information is available, a default set of these are activated based on the precise nature of device. Further on operation, based on conditions prevailing in the system (such as temperature), some OPP availability may be temporarily controlled by the SoC frameworks. To implement an OPP, some sort of power management support is necessary hence this library depends on CONFIG_PM. Contributions include: Sanjeev Premi for the initial concept: http://patchwork.kernel.org/patch/50998/ Kevin Hilman for converting original design to device-based. Kevin Hilman and Paul Walmsey for cleaning up many of the function abstractions, improvements and data structure handling. Romit Dasgupta for using enums instead of opp pointers. Thara Gopinath, Eduardo Valentin and Vishwanath BS for fixes and cleanups. Linus Walleij for recommending this layer be made generic for usage in other architectures beyond OMAP and ARM. Mark Brown, Andrew Morton, Rafael J. Wysocki, Paul E. McKenney for valuable improvements. Discussions and comments from: http://marc.info/?l=linux-omap&m=126033945313269&w=2 http://marc.info/?l=linux-omap&m=125482970102327&w=2 http://marc.info/?t=125809247500002&r=1&w=2 http://marc.info/?l=linux-omap&m=126025973426007&w=2 http://marc.info/?t=128152609200064&r=1&w=2 http://marc.info/?t=128468723000002&r=1&w=2 incorporated. v1: http://marc.info/?t=128468723000002&r=1&w=2 Signed-off-by: Nishanth Menon Signed-off-by: Kevin Hilman Signed-off-by: Rafael J. Wysocki --- Documentation/power/00-INDEX | 2 + Documentation/power/opp.txt | 375 ++++++++++++++++++++++++++ drivers/base/power/Makefile | 1 + drivers/base/power/opp.c | 628 +++++++++++++++++++++++++++++++++++++++++++ include/linux/opp.h | 105 ++++++++ kernel/power/Kconfig | 14 + 6 files changed, 1125 insertions(+) create mode 100644 Documentation/power/opp.txt create mode 100644 drivers/base/power/opp.c create mode 100644 include/linux/opp.h (limited to 'kernel') diff --git a/Documentation/power/00-INDEX b/Documentation/power/00-INDEX index fb742c213c9e..45e9d4a91284 100644 --- a/Documentation/power/00-INDEX +++ b/Documentation/power/00-INDEX @@ -14,6 +14,8 @@ interface.txt - Power management user interface in /sys/power notifiers.txt - Registering suspend notifiers in device drivers +opp.txt + - Operating Performance Point library pci.txt - How the PCI Subsystem Does Power Management pm_qos_interface.txt diff --git a/Documentation/power/opp.txt b/Documentation/power/opp.txt new file mode 100644 index 000000000000..44d87ad3cea9 --- /dev/null +++ b/Documentation/power/opp.txt @@ -0,0 +1,375 @@ +*=============* +* OPP Library * +*=============* + +(C) 2009-2010 Nishanth Menon , Texas Instruments Incorporated + +Contents +-------- +1. Introduction +2. Initial OPP List Registration +3. OPP Search Functions +4. OPP Availability Control Functions +5. OPP Data Retrieval Functions +6. Cpufreq Table Generation +7. Data Structures + +1. Introduction +=============== +Complex SoCs of today consists of a multiple sub-modules working in conjunction. +In an operational system executing varied use cases, not all modules in the SoC +need to function at their highest performing frequency all the time. To +facilitate this, sub-modules in a SoC are grouped into domains, allowing some +domains to run at lower voltage and frequency while other domains are loaded +more. The set of discrete tuples consisting of frequency and voltage pairs that +the device will support per domain are called Operating Performance Points or +OPPs. + +OPP library provides a set of helper functions to organize and query the OPP +information. The library is located in drivers/base/power/opp.c and the header +is located in include/linux/opp.h. OPP library can be enabled by enabling +CONFIG_PM_OPP from power management menuconfig menu. OPP library depends on +CONFIG_PM as certain SoCs such as Texas Instrument's OMAP framework allows to +optionally boot at a certain OPP without needing cpufreq. + +Typical usage of the OPP library is as follows: +(users) -> registers a set of default OPPs -> (library) +SoC framework -> modifies on required cases certain OPPs -> OPP layer + -> queries to search/retrieve information -> + +OPP layer expects each domain to be represented by a unique device pointer. SoC +framework registers a set of initial OPPs per device with the OPP layer. This +list is expected to be an optimally small number typically around 5 per device. +This initial list contains a set of OPPs that the framework expects to be safely +enabled by default in the system. + +Note on OPP Availability: +------------------------ +As the system proceeds to operate, SoC framework may choose to make certain +OPPs available or not available on each device based on various external +factors. Example usage: Thermal management or other exceptional situations where +SoC framework might choose to disable a higher frequency OPP to safely continue +operations until that OPP could be re-enabled if possible. + +OPP library facilitates this concept in it's implementation. The following +operational functions operate only on available opps: +opp_find_freq_{ceil, floor}, opp_get_voltage, opp_get_freq, opp_get_opp_count +and opp_init_cpufreq_table + +opp_find_freq_exact is meant to be used to find the opp pointer which can then +be used for opp_enable/disable functions to make an opp available as required. + +WARNING: Users of OPP library should refresh their availability count using +get_opp_count if opp_enable/disable functions are invoked for a device, the +exact mechanism to trigger these or the notification mechanism to other +dependent subsystems such as cpufreq are left to the discretion of the SoC +specific framework which uses the OPP library. Similar care needs to be taken +care to refresh the cpufreq table in cases of these operations. + +WARNING on OPP List locking mechanism: +------------------------------------------------- +OPP library uses RCU for exclusivity. RCU allows the query functions to operate +in multiple contexts and this synchronization mechanism is optimal for a read +intensive operations on data structure as the OPP library caters to. + +To ensure that the data retrieved are sane, the users such as SoC framework +should ensure that the section of code operating on OPP queries are locked +using RCU read locks. The opp_find_freq_{exact,ceil,floor}, +opp_get_{voltage, freq, opp_count} fall into this category. + +opp_{add,enable,disable} are updaters which use mutex and implement it's own +RCU locking mechanisms. opp_init_cpufreq_table acts as an updater and uses +mutex to implment RCU updater strategy. These functions should *NOT* be called +under RCU locks and other contexts that prevent blocking functions in RCU or +mutex operations from working. + +2. Initial OPP List Registration +================================ +The SoC implementation calls opp_add function iteratively to add OPPs per +device. It is expected that the SoC framework will register the OPP entries +optimally- typical numbers range to be less than 5. The list generated by +registering the OPPs is maintained by OPP library throughout the device +operation. The SoC framework can subsequently control the availability of the +OPPs dynamically using the opp_enable / disable functions. + +opp_add - Add a new OPP for a specific domain represented by the device pointer. + The OPP is defined using the frequency and voltage. Once added, the OPP + is assumed to be available and control of it's availability can be done + with the opp_enable/disable functions. OPP library internally stores + and manages this information in the opp struct. This function may be + used by SoC framework to define a optimal list as per the demands of + SoC usage environment. + + WARNING: Do not use this function in interrupt context. + + Example: + soc_pm_init() + { + /* Do things */ + r = opp_add(mpu_dev, 1000000, 900000); + if (!r) { + pr_err("%s: unable to register mpu opp(%d)\n", r); + goto no_cpufreq; + } + /* Do cpufreq things */ + no_cpufreq: + /* Do remaining things */ + } + +3. OPP Search Functions +======================= +High level framework such as cpufreq operates on frequencies. To map the +frequency back to the corresponding OPP, OPP library provides handy functions +to search the OPP list that OPP library internally manages. These search +functions return the matching pointer representing the opp if a match is +found, else returns error. These errors are expected to be handled by standard +error checks such as IS_ERR() and appropriate actions taken by the caller. + +opp_find_freq_exact - Search for an OPP based on an *exact* frequency and + availability. This function is especially useful to enable an OPP which + is not available by default. + Example: In a case when SoC framework detects a situation where a + higher frequency could be made available, it can use this function to + find the OPP prior to call the opp_enable to actually make it available. + rcu_read_lock(); + opp = opp_find_freq_exact(dev, 1000000000, false); + rcu_read_unlock(); + /* dont operate on the pointer.. just do a sanity check.. */ + if (IS_ERR(opp)) { + pr_err("frequency not disabled!\n"); + /* trigger appropriate actions.. */ + } else { + opp_enable(dev,1000000000); + } + + NOTE: This is the only search function that operates on OPPs which are + not available. + +opp_find_freq_floor - Search for an available OPP which is *at most* the + provided frequency. This function is useful while searching for a lesser + match OR operating on OPP information in the order of decreasing + frequency. + Example: To find the highest opp for a device: + freq = ULONG_MAX; + rcu_read_lock(); + opp_find_freq_floor(dev, &freq); + rcu_read_unlock(); + +opp_find_freq_ceil - Search for an available OPP which is *at least* the + provided frequency. This function is useful while searching for a + higher match OR operating on OPP information in the order of increasing + frequency. + Example 1: To find the lowest opp for a device: + freq = 0; + rcu_read_lock(); + opp_find_freq_ceil(dev, &freq); + rcu_read_unlock(); + Example 2: A simplified implementation of a SoC cpufreq_driver->target: + soc_cpufreq_target(..) + { + /* Do stuff like policy checks etc. */ + /* Find the best frequency match for the req */ + rcu_read_lock(); + opp = opp_find_freq_ceil(dev, &freq); + rcu_read_unlock(); + if (!IS_ERR(opp)) + soc_switch_to_freq_voltage(freq); + else + /* do something when we cant satisfy the req */ + /* do other stuff */ + } + +4. OPP Availability Control Functions +===================================== +A default OPP list registered with the OPP library may not cater to all possible +situation. The OPP library provides a set of functions to modify the +availability of a OPP within the OPP list. This allows SoC frameworks to have +fine grained dynamic control of which sets of OPPs are operationally available. +These functions are intended to *temporarily* remove an OPP in conditions such +as thermal considerations (e.g. don't use OPPx until the temperature drops). + +WARNING: Do not use these functions in interrupt context. + +opp_enable - Make a OPP available for operation. + Example: Lets say that 1GHz OPP is to be made available only if the + SoC temperature is lower than a certain threshold. The SoC framework + implementation might choose to do something as follows: + if (cur_temp < temp_low_thresh) { + /* Enable 1GHz if it was disabled */ + rcu_read_lock(); + opp = opp_find_freq_exact(dev, 1000000000, false); + rcu_read_unlock(); + /* just error check */ + if (!IS_ERR(opp)) + ret = opp_enable(dev, 1000000000); + else + goto try_something_else; + } + +opp_disable - Make an OPP to be not available for operation + Example: Lets say that 1GHz OPP is to be disabled if the temperature + exceeds a threshold value. The SoC framework implementation might + choose to do something as follows: + if (cur_temp > temp_high_thresh) { + /* Disable 1GHz if it was enabled */ + rcu_read_lock(); + opp = opp_find_freq_exact(dev, 1000000000, true); + rcu_read_unlock(); + /* just error check */ + if (!IS_ERR(opp)) + ret = opp_disable(dev, 1000000000); + else + goto try_something_else; + } + +5. OPP Data Retrieval Functions +=============================== +Since OPP library abstracts away the OPP information, a set of functions to pull +information from the OPP structure is necessary. Once an OPP pointer is +retrieved using the search functions, the following functions can be used by SoC +framework to retrieve the information represented inside the OPP layer. + +opp_get_voltage - Retrieve the voltage represented by the opp pointer. + Example: At a cpufreq transition to a different frequency, SoC + framework requires to set the voltage represented by the OPP using + the regulator framework to the Power Management chip providing the + voltage. + soc_switch_to_freq_voltage(freq) + { + /* do things */ + rcu_read_lock(); + opp = opp_find_freq_ceil(dev, &freq); + v = opp_get_voltage(opp); + rcu_read_unlock(); + if (v) + regulator_set_voltage(.., v); + /* do other things */ + } + +opp_get_freq - Retrieve the freq represented by the opp pointer. + Example: Lets say the SoC framework uses a couple of helper functions + we could pass opp pointers instead of doing additional parameters to + handle quiet a bit of data parameters. + soc_cpufreq_target(..) + { + /* do things.. */ + max_freq = ULONG_MAX; + rcu_read_lock(); + max_opp = opp_find_freq_floor(dev,&max_freq); + requested_opp = opp_find_freq_ceil(dev,&freq); + if (!IS_ERR(max_opp) && !IS_ERR(requested_opp)) + r = soc_test_validity(max_opp, requested_opp); + rcu_read_unlock(); + /* do other things */ + } + soc_test_validity(..) + { + if(opp_get_voltage(max_opp) < opp_get_voltage(requested_opp)) + return -EINVAL; + if(opp_get_freq(max_opp) < opp_get_freq(requested_opp)) + return -EINVAL; + /* do things.. */ + } + +opp_get_opp_count - Retrieve the number of available opps for a device + Example: Lets say a co-processor in the SoC needs to know the available + frequencies in a table, the main processor can notify as following: + soc_notify_coproc_available_frequencies() + { + /* Do things */ + rcu_read_lock(); + num_available = opp_get_opp_count(dev); + speeds = kzalloc(sizeof(u32) * num_available, GFP_KERNEL); + /* populate the table in increasing order */ + freq = 0; + while (!IS_ERR(opp = opp_find_freq_ceil(dev, &freq))) { + speeds[i] = freq; + freq++; + i++; + } + rcu_read_unlock(); + + soc_notify_coproc(AVAILABLE_FREQs, speeds, num_available); + /* Do other things */ + } + +6. Cpufreq Table Generation +=========================== +opp_init_cpufreq_table - cpufreq framework typically is initialized with + cpufreq_frequency_table_cpuinfo which is provided with the list of + frequencies that are available for operation. This function provides + a ready to use conversion routine to translate the OPP layer's internal + information about the available frequencies into a format readily + providable to cpufreq. + + WARNING: Do not use this function in interrupt context. + + Example: + soc_pm_init() + { + /* Do things */ + r = opp_init_cpufreq_table(dev, &freq_table); + if (!r) + cpufreq_frequency_table_cpuinfo(policy, freq_table); + /* Do other things */ + } + + NOTE: This function is available only if CONFIG_CPU_FREQ is enabled in + addition to CONFIG_PM as power management feature is required to + dynamically scale voltage and frequency in a system. + +7. Data Structures +================== +Typically an SoC contains multiple voltage domains which are variable. Each +domain is represented by a device pointer. The relationship to OPP can be +represented as follows: +SoC + |- device 1 + | |- opp 1 (availability, freq, voltage) + | |- opp 2 .. + ... ... + | `- opp n .. + |- device 2 + ... + `- device m + +OPP library maintains a internal list that the SoC framework populates and +accessed by various functions as described above. However, the structures +representing the actual OPPs and domains are internal to the OPP library itself +to allow for suitable abstraction reusable across systems. + +struct opp - The internal data structure of OPP library which is used to + represent an OPP. In addition to the freq, voltage, availability + information, it also contains internal book keeping information required + for the OPP library to operate on. Pointer to this structure is + provided back to the users such as SoC framework to be used as a + identifier for OPP in the interactions with OPP layer. + + WARNING: The struct opp pointer should not be parsed or modified by the + users. The defaults of for an instance is populated by opp_add, but the + availability of the OPP can be modified by opp_enable/disable functions. + +struct device - This is used to identify a domain to the OPP layer. The + nature of the device and it's implementation is left to the user of + OPP library such as the SoC framework. + +Overall, in a simplistic view, the data structure operations is represented as +following: + +Initialization / modification: + +-----+ /- opp_enable +opp_add --> | opp | <------- + | +-----+ \- opp_disable + \-------> domain_info(device) + +Search functions: + /-- opp_find_freq_ceil ---\ +-----+ +domain_info<---- opp_find_freq_exact -----> | opp | + \-- opp_find_freq_floor ---/ +-----+ + +Retrieval functions: ++-----+ /- opp_get_voltage +| opp | <--- ++-----+ \- opp_get_freq + +domain_info <- opp_get_opp_count diff --git a/drivers/base/power/Makefile b/drivers/base/power/Makefile index cbccf9a3cee4..abe46edfe5b4 100644 --- a/drivers/base/power/Makefile +++ b/drivers/base/power/Makefile @@ -3,6 +3,7 @@ obj-$(CONFIG_PM_SLEEP) += main.o wakeup.o obj-$(CONFIG_PM_RUNTIME) += runtime.o obj-$(CONFIG_PM_OPS) += generic_ops.o obj-$(CONFIG_PM_TRACE_RTC) += trace.o +obj-$(CONFIG_PM_OPP) += opp.o ccflags-$(CONFIG_DEBUG_DRIVER) := -DDEBUG ccflags-$(CONFIG_PM_VERBOSE) += -DDEBUG diff --git a/drivers/base/power/opp.c b/drivers/base/power/opp.c new file mode 100644 index 000000000000..2bb9b4cf59d7 --- /dev/null +++ b/drivers/base/power/opp.c @@ -0,0 +1,628 @@ +/* + * Generic OPP Interface + * + * Copyright (C) 2009-2010 Texas Instruments Incorporated. + * Nishanth Menon + * Romit Dasgupta + * Kevin Hilman + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Internal data structure organization with the OPP layer library is as + * follows: + * dev_opp_list (root) + * |- device 1 (represents voltage domain 1) + * | |- opp 1 (availability, freq, voltage) + * | |- opp 2 .. + * ... ... + * | `- opp n .. + * |- device 2 (represents the next voltage domain) + * ... + * `- device m (represents mth voltage domain) + * device 1, 2.. are represented by dev_opp structure while each opp + * is represented by the opp structure. + */ + +/** + * struct opp - Generic OPP description structure + * @node: opp list node. The nodes are maintained throughout the lifetime + * of boot. It is expected only an optimal set of OPPs are + * added to the library by the SoC framework. + * RCU usage: opp list is traversed with RCU locks. node + * modification is possible realtime, hence the modifications + * are protected by the dev_opp_list_lock for integrity. + * IMPORTANT: the opp nodes should be maintained in increasing + * order. + * @available: true/false - marks if this OPP as available or not + * @rate: Frequency in hertz + * @u_volt: Nominal voltage in microvolts corresponding to this OPP + * @dev_opp: points back to the device_opp struct this opp belongs to + * + * This structure stores the OPP information for a given device. + */ +struct opp { + struct list_head node; + + bool available; + unsigned long rate; + unsigned long u_volt; + + struct device_opp *dev_opp; +}; + +/** + * struct device_opp - Device opp structure + * @node: list node - contains the devices with OPPs that + * have been registered. Nodes once added are not modified in this + * list. + * RCU usage: nodes are not modified in the list of device_opp, + * however addition is possible and is secured by dev_opp_list_lock + * @dev: device pointer + * @opp_list: list of opps + * + * This is an internal data structure maintaining the link to opps attached to + * a device. This structure is not meant to be shared to users as it is + * meant for book keeping and private to OPP library + */ +struct device_opp { + struct list_head node; + + struct device *dev; + struct list_head opp_list; +}; + +/* + * The root of the list of all devices. All device_opp structures branch off + * from here, with each device_opp containing the list of opp it supports in + * various states of availability. + */ +static LIST_HEAD(dev_opp_list); +/* Lock to allow exclusive modification to the device and opp lists */ +static DEFINE_MUTEX(dev_opp_list_lock); + +/** + * find_device_opp() - find device_opp struct using device pointer + * @dev: device pointer used to lookup device OPPs + * + * Search list of device OPPs for one containing matching device. Does a RCU + * reader operation to grab the pointer needed. + * + * Returns pointer to 'struct device_opp' if found, otherwise -ENODEV or + * -EINVAL based on type of error. + * + * Locking: This function must be called under rcu_read_lock(). device_opp + * is a RCU protected pointer. This means that device_opp is valid as long + * as we are under RCU lock. + */ +static struct device_opp *find_device_opp(struct device *dev) +{ + struct device_opp *tmp_dev_opp, *dev_opp = ERR_PTR(-ENODEV); + + if (unlikely(IS_ERR_OR_NULL(dev))) { + pr_err("%s: Invalid parameters\n", __func__); + return ERR_PTR(-EINVAL); + } + + list_for_each_entry_rcu(tmp_dev_opp, &dev_opp_list, node) { + if (tmp_dev_opp->dev == dev) { + dev_opp = tmp_dev_opp; + break; + } + } + + return dev_opp; +} + +/** + * opp_get_voltage() - Gets the voltage corresponding to an available opp + * @opp: opp for which voltage has to be returned for + * + * Return voltage in micro volt corresponding to the opp, else + * return 0 + * + * Locking: This function must be called under rcu_read_lock(). opp is a rcu + * protected pointer. This means that opp which could have been fetched by + * opp_find_freq_{exact,ceil,floor} functions is valid as long as we are + * under RCU lock. The pointer returned by the opp_find_freq family must be + * used in the same section as the usage of this function with the pointer + * prior to unlocking with rcu_read_unlock() to maintain the integrity of the + * pointer. + */ +unsigned long opp_get_voltage(struct opp *opp) +{ + struct opp *tmp_opp; + unsigned long v = 0; + + tmp_opp = rcu_dereference(opp); + if (unlikely(IS_ERR_OR_NULL(tmp_opp)) || !tmp_opp->available) + pr_err("%s: Invalid parameters\n", __func__); + else + v = tmp_opp->u_volt; + + return v; +} + +/** + * opp_get_freq() - Gets the frequency corresponding to an available opp + * @opp: opp for which frequency has to be returned for + * + * Return frequency in hertz corresponding to the opp, else + * return 0 + * + * Locking: This function must be called under rcu_read_lock(). opp is a rcu + * protected pointer. This means that opp which could have been fetched by + * opp_find_freq_{exact,ceil,floor} functions is valid as long as we are + * under RCU lock. The pointer returned by the opp_find_freq family must be + * used in the same section as the usage of this function with the pointer + * prior to unlocking with rcu_read_unlock() to maintain the integrity of the + * pointer. + */ +unsigned long opp_get_freq(struct opp *opp) +{ + struct opp *tmp_opp; + unsigned long f = 0; + + tmp_opp = rcu_dereference(opp); + if (unlikely(IS_ERR_OR_NULL(tmp_opp)) || !tmp_opp->available) + pr_err("%s: Invalid parameters\n", __func__); + else + f = tmp_opp->rate; + + return f; +} + +/** + * opp_get_opp_count() - Get number of opps available in the opp list + * @dev: device for which we do this operation + * + * This function returns the number of available opps if there are any, + * else returns 0 if none or the corresponding error value. + * + * Locking: This function must be called under rcu_read_lock(). This function + * internally references two RCU protected structures: device_opp and opp which + * are safe as long as we are under a common RCU locked section. + */ +int opp_get_opp_count(struct device *dev) +{ + struct device_opp *dev_opp; + struct opp *temp_opp; + int count = 0; + + dev_opp = find_device_opp(dev); + if (IS_ERR(dev_opp)) { + int r = PTR_ERR(dev_opp); + dev_err(dev, "%s: device OPP not found (%d)\n", __func__, r); + return r; + } + + list_for_each_entry_rcu(temp_opp, &dev_opp->opp_list, node) { + if (temp_opp->available) + count++; + } + + return count; +} + +/** + * opp_find_freq_exact() - search for an exact frequency + * @dev: device for which we do this operation + * @freq: frequency to search for + * @is_available: true/false - match for available opp + * + * Searches for exact match in the opp list and returns pointer to the matching + * opp if found, else returns ERR_PTR in case of error and should be handled + * using IS_ERR. + * + * Note: available is a modifier for the search. if available=true, then the + * match is for exact matching frequency and is available in the stored OPP + * table. if false, the match is for exact frequency which is not available. + * + * This provides a mechanism to enable an opp which is not available currently + * or the opposite as well. + * + * Locking: This function must be called under rcu_read_lock(). opp is a rcu + * protected pointer. The reason for the same is that the opp pointer which is + * returned will remain valid for use with opp_get_{voltage, freq} only while + * under the locked area. The pointer returned must be used prior to unlocking + * with rcu_read_unlock() to maintain the integrity of the pointer. + */ +struct opp *opp_find_freq_exact(struct device *dev, unsigned long freq, + bool available) +{ + struct device_opp *dev_opp; + struct opp *temp_opp, *opp = ERR_PTR(-ENODEV); + + dev_opp = find_device_opp(dev); + if (IS_ERR(dev_opp)) { + int r = PTR_ERR(dev_opp); + dev_err(dev, "%s: device OPP not found (%d)\n", __func__, r); + return ERR_PTR(r); + } + + list_for_each_entry_rcu(temp_opp, &dev_opp->opp_list, node) { + if (temp_opp->available == available && + temp_opp->rate == freq) { + opp = temp_opp; + break; + } + } + + return opp; +} + +/** + * opp_find_freq_ceil() - Search for an rounded ceil freq + * @dev: device for which we do this operation + * @freq: Start frequency + * + * Search for the matching ceil *available* OPP from a starting freq + * for a device. + * + * Returns matching *opp and refreshes *freq accordingly, else returns + * ERR_PTR in case of error and should be handled using IS_ERR. + * + * Locking: This function must be called under rcu_read_lock(). opp is a rcu + * protected pointer. The reason for the same is that the opp pointer which is + * returned will remain valid for use with opp_get_{voltage, freq} only while + * under the locked area. The pointer returned must be used prior to unlocking + * with rcu_read_unlock() to maintain the integrity of the pointer. + */ +struct opp *opp_find_freq_ceil(struct device *dev, unsigned long *freq) +{ + struct device_opp *dev_opp; + struct opp *temp_opp, *opp = ERR_PTR(-ENODEV); + + if (!dev || !freq) { + dev_err(dev, "%s: Invalid argument freq=%p\n", __func__, freq); + return ERR_PTR(-EINVAL); + } + + dev_opp = find_device_opp(dev); + if (IS_ERR(dev_opp)) + return opp; + + list_for_each_entry_rcu(temp_opp, &dev_opp->opp_list, node) { + if (temp_opp->available && temp_opp->rate >= *freq) { + opp = temp_opp; + *freq = opp->rate; + break; + } + } + + return opp; +} + +/** + * opp_find_freq_floor() - Search for a rounded floor freq + * @dev: device for which we do this operation + * @freq: Start frequency + * + * Search for the matching floor *available* OPP from a starting freq + * for a device. + * + * Returns matching *opp and refreshes *freq accordingly, else returns + * ERR_PTR in case of error and should be handled using IS_ERR. + * + * Locking: This function must be called under rcu_read_lock(). opp is a rcu + * protected pointer. The reason for the same is that the opp pointer which is + * returned will remain valid for use with opp_get_{voltage, freq} only while + * under the locked area. The pointer returned must be used prior to unlocking + * with rcu_read_unlock() to maintain the integrity of the pointer. + */ +struct opp *opp_find_freq_floor(struct device *dev, unsigned long *freq) +{ + struct device_opp *dev_opp; + struct opp *temp_opp, *opp = ERR_PTR(-ENODEV); + + if (!dev || !freq) { + dev_err(dev, "%s: Invalid argument freq=%p\n", __func__, freq); + return ERR_PTR(-EINVAL); + } + + dev_opp = find_device_opp(dev); + if (IS_ERR(dev_opp)) + return opp; + + list_for_each_entry_rcu(temp_opp, &dev_opp->opp_list, node) { + if (temp_opp->available) { + /* go to the next node, before choosing prev */ + if (temp_opp->rate > *freq) + break; + else + opp = temp_opp; + } + } + if (!IS_ERR(opp)) + *freq = opp->rate; + + return opp; +} + +/** + * opp_add() - Add an OPP table from a table definitions + * @dev: device for which we do this operation + * @freq: Frequency in Hz for this OPP + * @u_volt: Voltage in uVolts for this OPP + * + * This function adds an opp definition to the opp list and returns status. + * The opp is made available by default and it can be controlled using + * opp_enable/disable functions. + * + * Locking: The internal device_opp and opp structures are RCU protected. + * Hence this function internally uses RCU updater strategy with mutex locks + * to keep the integrity of the internal data structures. Callers should ensure + * that this function is *NOT* called under RCU protection or in contexts where + * mutex cannot be locked. + */ +int opp_add(struct device *dev, unsigned long freq, unsigned long u_volt) +{ + struct device_opp *dev_opp = NULL; + struct opp *opp, *new_opp; + struct list_head *head; + + /* allocate new OPP node */ + new_opp = kzalloc(sizeof(struct opp), GFP_KERNEL); + if (!new_opp) { + dev_warn(dev, "%s: Unable to create new OPP node\n", __func__); + return -ENOMEM; + } + + /* Hold our list modification lock here */ + mutex_lock(&dev_opp_list_lock); + + /* Check for existing list for 'dev' */ + dev_opp = find_device_opp(dev); + if (IS_ERR(dev_opp)) { + /* + * Allocate a new device OPP table. In the infrequent case + * where a new device is needed to be added, we pay this + * penalty. + */ + dev_opp = kzalloc(sizeof(struct device_opp), GFP_KERNEL); + if (!dev_opp) { + mutex_unlock(&dev_opp_list_lock); + kfree(new_opp); + dev_warn(dev, + "%s: Unable to create device OPP structure\n", + __func__); + return -ENOMEM; + } + + dev_opp->dev = dev; + INIT_LIST_HEAD(&dev_opp->opp_list); + + /* Secure the device list modification */ + list_add_rcu(&dev_opp->node, &dev_opp_list); + } + + /* populate the opp table */ + new_opp->dev_opp = dev_opp; + new_opp->rate = freq; + new_opp->u_volt = u_volt; + new_opp->available = true; + + /* Insert new OPP in order of increasing frequency */ + head = &dev_opp->opp_list; + list_for_each_entry_rcu(opp, &dev_opp->opp_list, node) { + if (new_opp->rate < opp->rate) + break; + else + head = &opp->node; + } + + list_add_rcu(&new_opp->node, head); + mutex_unlock(&dev_opp_list_lock); + + return 0; +} + +/** + * opp_set_availability() - helper to set the availability of an opp + * @dev: device for which we do this operation + * @freq: OPP frequency to modify availability + * @availability_req: availability status requested for this opp + * + * Set the availability of an OPP with an RCU operation, opp_{enable,disable} + * share a common logic which is isolated here. + * + * Returns -EINVAL for bad pointers, -ENOMEM if no memory available for the + * copy operation, returns 0 if no modifcation was done OR modification was + * successful. + * + * Locking: The internal device_opp and opp structures are RCU protected. + * Hence this function internally uses RCU updater strategy with mutex locks to + * keep the integrity of the internal data structures. Callers should ensure + * that this function is *NOT* called under RCU protection or in contexts where + * mutex locking or synchronize_rcu() blocking calls cannot be used. + */ +static int opp_set_availability(struct device *dev, unsigned long freq, + bool availability_req) +{ + struct device_opp *tmp_dev_opp, *dev_opp = NULL; + struct opp *new_opp, *tmp_opp, *opp = ERR_PTR(-ENODEV); + int r = 0; + + /* keep the node allocated */ + new_opp = kmalloc(sizeof(struct opp), GFP_KERNEL); + if (!new_opp) { + dev_warn(dev, "%s: Unable to create OPP\n", __func__); + return -ENOMEM; + } + + mutex_lock(&dev_opp_list_lock); + + /* Find the device_opp */ + list_for_each_entry(tmp_dev_opp, &dev_opp_list, node) { + if (dev == tmp_dev_opp->dev) { + dev_opp = tmp_dev_opp; + break; + } + } + if (IS_ERR(dev_opp)) { + r = PTR_ERR(dev_opp); + dev_warn(dev, "%s: Device OPP not found (%d)\n", __func__, r); + goto unlock; + } + + /* Do we have the frequency? */ + list_for_each_entry(tmp_opp, &dev_opp->opp_list, node) { + if (tmp_opp->rate == freq) { + opp = tmp_opp; + break; + } + } + if (IS_ERR(opp)) { + r = PTR_ERR(opp); + goto unlock; + } + + /* Is update really needed? */ + if (opp->available == availability_req) + goto unlock; + /* copy the old data over */ + *new_opp = *opp; + + /* plug in new node */ + new_opp->available = availability_req; + + list_replace_rcu(&opp->node, &new_opp->node); + mutex_unlock(&dev_opp_list_lock); + synchronize_rcu(); + + /* clean up old opp */ + new_opp = opp; + goto out; + +unlock: + mutex_unlock(&dev_opp_list_lock); +out: + kfree(new_opp); + return r; +} + +/** + * opp_enable() - Enable a specific OPP + * @dev: device for which we do this operation + * @freq: OPP frequency to enable + * + * Enables a provided opp. If the operation is valid, this returns 0, else the + * corresponding error value. It is meant to be used for users an OPP available + * after being temporarily made unavailable with opp_disable. + * + * Locking: The internal device_opp and opp structures are RCU protected. + * Hence this function indirectly uses RCU and mutex locks to keep the + * integrity of the internal data structures. Callers should ensure that + * this function is *NOT* called under RCU protection or in contexts where + * mutex locking or synchronize_rcu() blocking calls cannot be used. + */ +int opp_enable(struct device *dev, unsigned long freq) +{ + return opp_set_availability(dev, freq, true); +} + +/** + * opp_disable() - Disable a specific OPP + * @dev: device for which we do this operation + * @freq: OPP frequency to disable + * + * Disables a provided opp. If the operation is valid, this returns + * 0, else the corresponding error value. It is meant to be a temporary + * control by users to make this OPP not available until the circumstances are + * right to make it available again (with a call to opp_enable). + * + * Locking: The internal device_opp and opp structures are RCU protected. + * Hence this function indirectly uses RCU and mutex locks to keep the + * integrity of the internal data structures. Callers should ensure that + * this function is *NOT* called under RCU protection or in contexts where + * mutex locking or synchronize_rcu() blocking calls cannot be used. + */ +int opp_disable(struct device *dev, unsigned long freq) +{ + return opp_set_availability(dev, freq, false); +} + +#ifdef CONFIG_CPU_FREQ +/** + * opp_init_cpufreq_table() - create a cpufreq table for a device + * @dev: device for which we do this operation + * @table: Cpufreq table returned back to caller + * + * Generate a cpufreq table for a provided device- this assumes that the + * opp list is already initialized and ready for usage. + * + * This function allocates required memory for the cpufreq table. It is + * expected that the caller does the required maintenance such as freeing + * the table as required. + * + * Returns -EINVAL for bad pointers, -ENODEV if the device is not found, -ENOMEM + * if no memory available for the operation (table is not populated), returns 0 + * if successful and table is populated. + * + * WARNING: It is important for the callers to ensure refreshing their copy of + * the table if any of the mentioned functions have been invoked in the interim. + * + * Locking: The internal device_opp and opp structures are RCU protected. + * To simplify the logic, we pretend we are updater and hold relevant mutex here + * Callers should ensure that this function is *NOT* called under RCU protection + * or in contexts where mutex locking cannot be used. + */ +int opp_init_cpufreq_table(struct device *dev, + struct cpufreq_frequency_table **table) +{ + struct device_opp *dev_opp; + struct opp *opp; + struct cpufreq_frequency_table *freq_table; + int i = 0; + + /* Pretend as if I am an updater */ + mutex_lock(&dev_opp_list_lock); + + dev_opp = find_device_opp(dev); + if (IS_ERR(dev_opp)) { + int r = PTR_ERR(dev_opp); + mutex_unlock(&dev_opp_list_lock); + dev_err(dev, "%s: Device OPP not found (%d)\n", __func__, r); + return r; + } + + freq_table = kzalloc(sizeof(struct cpufreq_frequency_table) * + (opp_get_opp_count(dev) + 1), GFP_KERNEL); + if (!freq_table) { + mutex_unlock(&dev_opp_list_lock); + dev_warn(dev, "%s: Unable to allocate frequency table\n", + __func__); + return -ENOMEM; + } + + list_for_each_entry(opp, &dev_opp->opp_list, node) { + if (opp->available) { + freq_table[i].index = i; + freq_table[i].frequency = opp->rate / 1000; + i++; + } + } + mutex_unlock(&dev_opp_list_lock); + + freq_table[i].index = i; + freq_table[i].frequency = CPUFREQ_TABLE_END; + + *table = &freq_table[0]; + + return 0; +} +#endif /* CONFIG_CPU_FREQ */ diff --git a/include/linux/opp.h b/include/linux/opp.h new file mode 100644 index 000000000000..5449945d589f --- /dev/null +++ b/include/linux/opp.h @@ -0,0 +1,105 @@ +/* + * Generic OPP Interface + * + * Copyright (C) 2009-2010 Texas Instruments Incorporated. + * Nishanth Menon + * Romit Dasgupta + * Kevin Hilman + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef __LINUX_OPP_H__ +#define __LINUX_OPP_H__ + +#include +#include + +struct opp; + +#if defined(CONFIG_PM_OPP) + +unsigned long opp_get_voltage(struct opp *opp); + +unsigned long opp_get_freq(struct opp *opp); + +int opp_get_opp_count(struct device *dev); + +struct opp *opp_find_freq_exact(struct device *dev, unsigned long freq, + bool available); + +struct opp *opp_find_freq_floor(struct device *dev, unsigned long *freq); + +struct opp *opp_find_freq_ceil(struct device *dev, unsigned long *freq); + +int opp_add(struct device *dev, unsigned long freq, unsigned long u_volt); + +int opp_enable(struct device *dev, unsigned long freq); + +int opp_disable(struct device *dev, unsigned long freq); + +#else +static inline unsigned long opp_get_voltage(struct opp *opp) +{ + return 0; +} + +static inline unsigned long opp_get_freq(struct opp *opp) +{ + return 0; +} + +static inline int opp_get_opp_count(struct device *dev) +{ + return 0; +} + +static inline struct opp *opp_find_freq_exact(struct device *dev, + unsigned long freq, bool available) +{ + return ERR_PTR(-EINVAL); +} + +static inline struct opp *opp_find_freq_floor(struct device *dev, + unsigned long *freq) +{ + return ERR_PTR(-EINVAL); +} + +static inline struct opp *opp_find_freq_ceil(struct device *dev, + unsigned long *freq) +{ + return ERR_PTR(-EINVAL); +} + +static inline int opp_add(struct device *dev, unsigned long freq, + unsigned long u_volt) +{ + return -EINVAL; +} + +static inline int opp_enable(struct device *dev, unsigned long freq) +{ + return 0; +} + +static inline int opp_disable(struct device *dev, unsigned long freq) +{ + return 0; +} +#endif /* CONFIG_PM */ + +#if defined(CONFIG_CPU_FREQ) && defined(CONFIG_PM_OPP) +int opp_init_cpufreq_table(struct device *dev, + struct cpufreq_frequency_table **table); +#else +static inline int opp_init_cpufreq_table(struct device *dev, + struct cpufreq_frequency_table **table) +{ + return -EINVAL; +} +#endif /* CONFIG_CPU_FREQ */ + +#endif /* __LINUX_OPP_H__ */ diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index e45894c696ee..29bff6117abc 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -245,3 +245,17 @@ config PM_OPS bool depends on PM_SLEEP || PM_RUNTIME default y + +config PM_OPP + bool "Operating Performance Point (OPP) Layer library" + depends on PM + ---help--- + SOCs have a standard set of tuples consisting of frequency and + voltage pairs that the device will support per voltage domain. This + is called Operating Performance Point or OPP. The actual definitions + of OPP varies over silicon within the same family of devices. + + OPP layer organizes the data internally using device pointers + representing individual voltage domains and provides SOC + implementations a ready to use framework to manage OPPs. + For more information, read -- cgit v1.2.3