From 4f88651125e2ca8b106b6f65b65ea45776517bf3 Mon Sep 17 00:00:00 2001 From: Chuck Ebbert <76306.1226@compuserve.com> Date: Thu, 23 Mar 2006 02:59:34 -0800 Subject: [PATCH] i386: allow disabling X86_FEATURE_SEP at boot Allow the x86 "sep" feature to be disabled at bootup. This forces use of the int80 vsyscall. Mainly for testing or benchmarking the int80 vsyscall code. Signed-off-by: Chuck Ebbert <76306.1226@compuserve.com> Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/kernel-parameters.txt | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index fc99075e0af4..880be3a30d8d 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1008,7 +1008,9 @@ running once the system is up. noexec=on: enable non-executable mappings (default) noexec=off: disable nn-executable mappings - nofxsr [BUGS=IA-32] + nofxsr [BUGS=IA-32] Disables x86 floating point extended + register save and restore. The kernel will only save + legacy floating-point registers on task switch. nohlt [BUGS=ARM] @@ -1053,6 +1055,8 @@ running once the system is up. nosbagart [IA-64] + nosep [BUGS=IA-32] Disables x86 SYSENTER/SYSEXIT support. + nosmp [SMP] Tells an SMP kernel to act as a UP kernel. nosync [HW,M68K] Disables sync negotiation for all devices. -- cgit v1.2.3 From 543cc27d09643640cbc34189c03a40beb8227aef Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Thu, 23 Mar 2006 03:00:02 -0800 Subject: [PATCH] swsusp: documentation updates Update suspend-to-RAM documentation with new machines, and makes message when processes can't be stopped little clearer. (In one case, waiting longer actually did help). From: "Rafael J. Wysocki" Warn in the documentation that data may be lost if there are some filesystems mounted from USB devices before suspend. [Thanks to Alan Stern for providing the answer to the question in the Q:-A: part.] Signed-off-by: Pavel Machek Signed-off-by: Rafael J. Wysocki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/power/swsusp.txt | 51 +++++++++++++++++++++++++---- Documentation/power/video.txt | 74 +++++++++++++++++++----------------------- kernel/power/process.c | 2 +- 3 files changed, 78 insertions(+), 49 deletions(-) (limited to 'Documentation') diff --git a/Documentation/power/swsusp.txt b/Documentation/power/swsusp.txt index b28b7f04abb8..d7814a113ee1 100644 --- a/Documentation/power/swsusp.txt +++ b/Documentation/power/swsusp.txt @@ -17,6 +17,11 @@ Some warnings, first. * but it will probably only crash. * * (*) suspend/resume support is needed to make it safe. + * + * If you have any filesystems on USB devices mounted before suspend, + * they won't be accessible after resume and you may lose data, as though + * you have unplugged the USB devices with mounted filesystems on them + * (see the FAQ below for details). You need to append resume=/dev/your_swap_partition to kernel command line. Then you suspend by @@ -27,19 +32,18 @@ echo shutdown > /sys/power/disk; echo disk > /sys/power/state echo platform > /sys/power/disk; echo disk > /sys/power/state +. If you have SATA disks, you'll need recent kernels with SATA suspend +support. For suspend and resume to work, make sure your disk drivers +are built into kernel -- not modules. [There's way to make +suspend/resume with modular disk drivers, see FAQ, but you probably +should not do that.] + If you want to limit the suspend image size to N bytes, do echo N > /sys/power/image_size before suspend (it is limited to 500 MB by default). -Encrypted suspend image: ------------------------- -If you want to store your suspend image encrypted with a temporary -key to prevent data gathering after resume you must compile -crypto and the aes algorithm into the kernel - modules won't work -as they cannot be loaded at resume time. - Article about goals and implementation of Software Suspend for Linux ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -333,4 +337,37 @@ init=/bin/bash, then swapon and starting suspend sequence manually usually does the trick. Then it is good idea to try with latest vanilla kernel. +Q: How can distributions ship a swsusp-supporting kernel with modular +disk drivers (especially SATA)? + +A: Well, it can be done, load the drivers, then do echo into +/sys/power/disk/resume file from initrd. Be sure not to mount +anything, not even read-only mount, or you are going to lose your +data. + +Q: How do I make suspend more verbose? + +A: If you want to see any non-error kernel messages on the virtual +terminal the kernel switches to during suspend, you have to set the +kernel console loglevel to at least 5, for example by doing + + echo 5 > /proc/sys/kernel/printk + +Q: Is this true that if I have a mounted filesystem on a USB device and +I suspend to disk, I can lose data unless the filesystem has been mounted +with "sync"? + +A: That's right. It depends on your hardware, and it could be true even for +suspend-to-RAM. In fact, even with "-o sync" you can lose data if your +programs have information in buffers they haven't written out to disk. + +If you're lucky, your hardware will support low-power modes for USB +controllers while the system is asleep. Lots of hardware doesn't, +however. Shutting off the power to a USB controller is equivalent to +unplugging all the attached devices. + +Remember that it's always a bad idea to unplug a disk drive containing a +mounted filesystem. With USB that's true even when your system is asleep! +The safest thing is to unmount all USB-based filesystems before suspending +and remount them after resuming. diff --git a/Documentation/power/video.txt b/Documentation/power/video.txt index 912bed87c758..d18a57d1a531 100644 --- a/Documentation/power/video.txt +++ b/Documentation/power/video.txt @@ -1,7 +1,7 @@ Video issues with S3 resume ~~~~~~~~~~~~~~~~~~~~~~~~~~~ - 2003-2005, Pavel Machek + 2003-2006, Pavel Machek During S3 resume, hardware needs to be reinitialized. For most devices, this is easy, and kernel driver knows how to do @@ -15,6 +15,27 @@ run normally so video card is normally initialized. It should not be problem for S1 standby, because hardware should retain its state over that. +We either have to run video BIOS during early resume, or interpret it +using vbetool later, or maybe nothing is neccessary on particular +system because video state is preserved. Unfortunately different +methods work on different systems, and no known method suits all of +them. + +Userland application called s2ram has been developed; it contains long +whitelist of systems, and automatically selects working method for a +given system. It can be downloaded from CVS at +www.sf.net/projects/suspend . If you get a system that is not in the +whitelist, please try to find a working solution, and submit whitelist +entry so that work does not need to be repeated. + +Currently, VBE_SAVE method (6 below) works on most +systems. Unfortunately, vbetool only runs after userland is resumed, +so it makes debugging of early resume problems +hard/impossible. Methods that do not rely on userland are preferable. + +Details +~~~~~~~ + There are a few types of systems where video works after S3 resume: (1) systems where video state is preserved over S3. @@ -104,6 +125,7 @@ HP NX7000 ??? (*) HP Pavilion ZD7000 vbetool post needed, need open-source nv driver for X HP Omnibook XE3 athlon version none (1) HP Omnibook XE3GC none (1), video is S3 Savage/IX-MV +HP Omnibook 5150 none (1), (S1 also works OK) IBM TP T20, model 2647-44G none (1), video is S3 Inc. 86C270-294 Savage/IX-MV, vesafb gets "interesting" but X work. IBM TP A31 / Type 2652-M5G s3_mode (3) [works ok with BIOS 1.04 2002-08-23, but not at all with BIOS 1.11 2004-11-05 :-(] IBM TP R32 / Type 2658-MMG none (1) @@ -120,18 +142,24 @@ IBM ThinkPad T42p (2373-GTG) s3_bios (2) IBM TP X20 ??? (*) IBM TP X30 s3_bios (2) IBM TP X31 / Type 2672-XXH none (1), use radeontool (http://fdd.com/software/radeon/) to turn off backlight. -IBM TP X32 none (1), but backlight is on and video is trashed after long suspend +IBM TP X32 none (1), but backlight is on and video is trashed after long suspend. s3_bios,s3_mode (4) works too. Perhaps that gets better results? IBM Thinkpad X40 Type 2371-7JG s3_bios,s3_mode (4) +IBM TP 600e none(1), but a switch to console and back to X is needed Medion MD4220 ??? (*) Samsung P35 vbetool needed (6) -Sharp PC-AR10 (ATI rage) none (1) +Sharp PC-AR10 (ATI rage) none (1), backlight does not switch off Sony Vaio PCG-C1VRX/K s3_bios (2) Sony Vaio PCG-F403 ??? (*) +Sony Vaio PCG-GRT995MP none (1), works with 'nv' X driver +Sony Vaio PCG-GR7/K none (1), but needs radeonfb, use radeontool (http://fdd.com/software/radeon/) to turn off backlight. Sony Vaio PCG-N505SN ??? (*) Sony Vaio vgn-s260 X or boot-radeon can init it (5) +Sony Vaio vgn-S580BH vga=normal, but suspend from X. Console will be blank unless you return to X. +Sony Vaio vgn-FS115B s3_bios (2),s3_mode (4) Toshiba Libretto L5 none (1) -Toshiba Satellite 4030CDT s3_mode (3) -Toshiba Satellite 4080XCDT s3_mode (3) +Toshiba Portege 3020CT s3_mode (3) +Toshiba Satellite 4030CDT s3_mode (3) (S1 also works OK) +Toshiba Satellite 4080XCDT s3_mode (3) (S1 also works OK) Toshiba Satellite 4090XCDT ??? (*) Toshiba Satellite P10-554 s3_bios,s3_mode (4)(****) Toshiba M30 (2) xor X with nvidia driver using internal AGP @@ -151,39 +179,3 @@ Asus A7V8X nVidia RIVA TNT2 model 64 s3_bios,s3_mode (4) (***) To be tested with a newer kernel. (****) Not with SMP kernel, UP only. - -VBEtool details -~~~~~~~~~~~~~~~ -(with thanks to Carl-Daniel Hailfinger) - -First, boot into X and run the following script ONCE: -#!/bin/bash -statedir=/root/s3/state -mkdir -p $statedir -chvt 2 -sleep 1 -vbetool vbestate save >$statedir/vbe - - -To suspend and resume properly, call the following script as root: -#!/bin/bash -statedir=/root/s3/state -curcons=`fgconsole` -fuser /dev/tty$curcons 2>/dev/null|xargs ps -o comm= -p|grep -q X && chvt 2 -cat /dev/vcsa >$statedir/vcsa -sync -echo 3 >/proc/acpi/sleep -sync -vbetool post -vbetool vbestate restore <$statedir/vbe -cat $statedir/vcsa >/dev/vcsa -rckbd restart -chvt $[curcons%6+1] -chvt $curcons - - -Unless you change your graphics card or other hardware configuration, -the state once saved will be OK for every resume afterwards. -NOTE: The "rckbd restart" command may be different for your -distribution. Simply replace it with the command you would use to -set the fonts on screen. diff --git a/kernel/power/process.c b/kernel/power/process.c index 28de118f7a0b..02a1b3a9fa90 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -83,7 +83,7 @@ int freeze_processes(void) yield(); /* Yield is okay here */ if (todo && time_after(jiffies, start_time + TIMEOUT)) { printk( "\n" ); - printk(KERN_ERR " stopping tasks failed (%d tasks remaining)\n", todo ); + printk(KERN_ERR " stopping tasks timed out (%d tasks remaining)\n", todo ); break; } } while(todo); -- cgit v1.2.3 From 6e1819d615f24ce0726a7d0bd3dd0152d7b21654 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 23 Mar 2006 03:00:03 -0800 Subject: [PATCH] swsusp: userland interface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch introduces a user space interface for swsusp. The interface is based on a special character device, called the snapshot device, that allows user space processes to perform suspend and resume-related operations with the help of some ioctls and the read()/write() functions.  Additionally it allows these processes to allocate free swap pages from a selected swap partition, called the resume partition, so that they know which sectors of the resume partition are available to them. The interface uses the same low-level system memory snapshot-handling functions that are used by the built-it swap-writing/reading code of swsusp. The interface documentation is included in the patch. The patch assumes that the major and minor numbers of the snapshot device will be 10 (ie. misc device) and 231, the registration of which has already been requested. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/power/userland-swsusp.txt | 149 ++++++++++++++++ init/do_mounts_initrd.c | 1 + kernel/power/Makefile | 2 +- kernel/power/power.h | 14 ++ kernel/power/snapshot.c | 9 +- kernel/power/user.c | 301 ++++++++++++++++++++++++++++++++ mm/swapfile.c | 6 +- 7 files changed, 475 insertions(+), 7 deletions(-) create mode 100644 Documentation/power/userland-swsusp.txt create mode 100644 kernel/power/user.c (limited to 'Documentation') diff --git a/Documentation/power/userland-swsusp.txt b/Documentation/power/userland-swsusp.txt new file mode 100644 index 000000000000..94058220aaf0 --- /dev/null +++ b/Documentation/power/userland-swsusp.txt @@ -0,0 +1,149 @@ +Documentation for userland software suspend interface + (C) 2006 Rafael J. Wysocki + +First, the warnings at the beginning of swsusp.txt still apply. + +Second, you should read the FAQ in swsusp.txt _now_ if you have not +done it already. + +Now, to use the userland interface for software suspend you need special +utilities that will read/write the system memory snapshot from/to the +kernel. Such utilities are available, for example, from +. You may want to have +a look at them if you are going to develop your own suspend/resume +utilities. + +The interface consists of a character device providing the open(), +release(), read(), and write() operations as well as several ioctl() +commands defined in kernel/power/power.h. The major and minor +numbers of the device are, respectively, 10 and 231, and they can +be read from /sys/class/misc/snapshot/dev. + +The device can be open either for reading or for writing. If open for +reading, it is considered to be in the suspend mode. Otherwise it is +assumed to be in the resume mode. The device cannot be open for reading +and writing. It is also impossible to have the device open more than once +at a time. + +The ioctl() commands recognized by the device are: + +SNAPSHOT_FREEZE - freeze user space processes (the current process is + not frozen); this is required for SNAPSHOT_ATOMIC_SNAPSHOT + and SNAPSHOT_ATOMIC_RESTORE to succeed + +SNAPSHOT_UNFREEZE - thaw user space processes frozen by SNAPSHOT_FREEZE + +SNAPSHOT_ATOMIC_SNAPSHOT - create a snapshot of the system memory; the + last argument of ioctl() should be a pointer to an int variable, + the value of which will indicate whether the call returned after + creating the snapshot (1) or after restoring the system memory state + from it (0) (after resume the system finds itself finishing the + SNAPSHOT_ATOMIC_SNAPSHOT ioctl() again); after the snapshot + has been created the read() operation can be used to transfer + it out of the kernel + +SNAPSHOT_ATOMIC_RESTORE - restore the system memory state from the + uploaded snapshot image; before calling it you should transfer + the system memory snapshot back to the kernel using the write() + operation; this call will not succeed if the snapshot + image is not available to the kernel + +SNAPSHOT_FREE - free memory allocated for the snapshot image + +SNAPSHOT_SET_IMAGE_SIZE - set the preferred maximum size of the image + (the kernel will do its best to ensure the image size will not exceed + this number, but if it turns out to be impossible, the kernel will + create the smallest image possible) + +SNAPSHOT_AVAIL_SWAP - return the amount of available swap in bytes (the last + argument should be a pointer to an unsigned int variable that will + contain the result if the call is successful). + +SNAPSHOT_GET_SWAP_PAGE - allocate a swap page from the resume partition + (the last argument should be a pointer to a loff_t variable that + will contain the swap page offset if the call is successful) + +SNAPSHOT_FREE_SWAP_PAGES - free all swap pages allocated with + SNAPSHOT_GET_SWAP_PAGE + +SNAPSHOT_SET_SWAP_FILE - set the resume partition (the last ioctl() argument + should specify the device's major and minor numbers in the old + two-byte format, as returned by the stat() function in the .st_rdev + member of the stat structure); it is recommended to always use this + call, because the code to set the resume partition could be removed from + future kernels + +The device's read() operation can be used to transfer the snapshot image from +the kernel. It has the following limitations: +- you cannot read() more than one virtual memory page at a time +- read()s accross page boundaries are impossible (ie. if ypu read() 1/2 of + a page in the previous call, you will only be able to read() + _at_ _most_ 1/2 of the page in the next call) + +The device's write() operation is used for uploading the system memory snapshot +into the kernel. It has the same limitations as the read() operation. + +The release() operation frees all memory allocated for the snapshot image +and all swap pages allocated with SNAPSHOT_GET_SWAP_PAGE (if any). +Thus it is not necessary to use either SNAPSHOT_FREE or +SNAPSHOT_FREE_SWAP_PAGES before closing the device (in fact it will also +unfreeze user space processes frozen by SNAPSHOT_UNFREEZE if they are +still frozen when the device is being closed). + +Currently it is assumed that the userland utilities reading/writing the +snapshot image from/to the kernel will use a swap parition, called the resume +partition, as storage space. However, this is not really required, as they +can use, for example, a special (blank) suspend partition or a file on a partition +that is unmounted before SNAPSHOT_ATOMIC_SNAPSHOT and mounted afterwards. + +These utilities SHOULD NOT make any assumptions regarding the ordering of +data within the snapshot image, except for the image header that MAY be +assumed to start with an swsusp_info structure, as specified in +kernel/power/power.h. This structure MAY be used by the userland utilities +to obtain some information about the snapshot image, such as the size +of the snapshot image, including the metadata and the header itself, +contained in the .size member of swsusp_info. + +The snapshot image MUST be written to the kernel unaltered (ie. all of the image +data, metadata and header MUST be written in _exactly_ the same amount, form +and order in which they have been read). Otherwise, the behavior of the +resumed system may be totally unpredictable. + +While executing SNAPSHOT_ATOMIC_RESTORE the kernel checks if the +structure of the snapshot image is consistent with the information stored +in the image header. If any inconsistencies are detected, +SNAPSHOT_ATOMIC_RESTORE will not succeed. Still, this is not a fool-proof +mechanism and the userland utilities using the interface SHOULD use additional +means, such as checksums, to ensure the integrity of the snapshot image. + +The suspending and resuming utilities MUST lock themselves in memory, +preferrably using mlockall(), before calling SNAPSHOT_FREEZE. + +The suspending utility MUST check the value stored by SNAPSHOT_ATOMIC_SNAPSHOT +in the memory location pointed to by the last argument of ioctl() and proceed +in accordance with it: +1. If the value is 1 (ie. the system memory snapshot has just been + created and the system is ready for saving it): + (a) The suspending utility MUST NOT close the snapshot device + _unless_ the whole suspend procedure is to be cancelled, in + which case, if the snapshot image has already been saved, the + suspending utility SHOULD destroy it, preferrably by zapping + its header. If the suspend is not to be cancelled, the + system MUST be powered off or rebooted after the snapshot + image has been saved. + (b) The suspending utility SHOULD NOT attempt to perform any + file system operations (including reads) on the file systems + that were mounted before SNAPSHOT_ATOMIC_SNAPSHOT has been + called. However, it MAY mount a file system that was not + mounted at that time and perform some operations on it (eg. + use it for saving the image). +2. If the value is 0 (ie. the system state has just been restored from + the snapshot image), the suspending utility MUST close the snapshot + device. Afterwards it will be treated as a regular userland process, + so it need not exit. + +The resuming utility SHOULD NOT attempt to mount any file systems that could +be mounted before suspend and SHOULD NOT attempt to perform any operations +involving such file systems. + +For details, please refer to the source code. diff --git a/init/do_mounts_initrd.c b/init/do_mounts_initrd.c index a05cabd0fd10..405f9031af87 100644 --- a/init/do_mounts_initrd.c +++ b/init/do_mounts_initrd.c @@ -56,6 +56,7 @@ static void __init handle_initrd(void) sys_chroot("."); mount_devfs_fs (); + current->flags |= PF_NOFREEZE; pid = kernel_thread(do_linuxrc, "/linuxrc", SIGCHLD); if (pid > 0) { while (pid != sys_wait4(-1, NULL, 0, NULL)) diff --git a/kernel/power/Makefile b/kernel/power/Makefile index bb91a0615303..8d0af3d37a4b 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -5,7 +5,7 @@ endif obj-y := main.o process.o console.o obj-$(CONFIG_PM_LEGACY) += pm.o -obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o snapshot.o swap.o +obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o snapshot.o swap.o user.o obj-$(CONFIG_SUSPEND_SMP) += smp.o diff --git a/kernel/power/power.h b/kernel/power/power.h index 5d1abffbb9ce..42c431c8bdde 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -8,6 +8,7 @@ struct swsusp_info { int cpus; unsigned long image_pages; unsigned long pages; + unsigned long size; } __attribute__((aligned(PAGE_SIZE))); @@ -65,6 +66,19 @@ extern int snapshot_read_next(struct snapshot_handle *handle, size_t count); extern int snapshot_write_next(struct snapshot_handle *handle, size_t count); int snapshot_image_loaded(struct snapshot_handle *handle); +#define SNAPSHOT_IOC_MAGIC '3' +#define SNAPSHOT_FREEZE _IO(SNAPSHOT_IOC_MAGIC, 1) +#define SNAPSHOT_UNFREEZE _IO(SNAPSHOT_IOC_MAGIC, 2) +#define SNAPSHOT_ATOMIC_SNAPSHOT _IOW(SNAPSHOT_IOC_MAGIC, 3, void *) +#define SNAPSHOT_ATOMIC_RESTORE _IO(SNAPSHOT_IOC_MAGIC, 4) +#define SNAPSHOT_FREE _IO(SNAPSHOT_IOC_MAGIC, 5) +#define SNAPSHOT_SET_IMAGE_SIZE _IOW(SNAPSHOT_IOC_MAGIC, 6, unsigned long) +#define SNAPSHOT_AVAIL_SWAP _IOR(SNAPSHOT_IOC_MAGIC, 7, void *) +#define SNAPSHOT_GET_SWAP_PAGE _IOR(SNAPSHOT_IOC_MAGIC, 8, void *) +#define SNAPSHOT_FREE_SWAP_PAGES _IO(SNAPSHOT_IOC_MAGIC, 9) +#define SNAPSHOT_SET_SWAP_FILE _IOW(SNAPSHOT_IOC_MAGIC, 10, unsigned int) +#define SNAPSHOT_IOC_MAXNR 10 + /** * The bitmap is used for tracing allocated swap pages * diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index cc349437fb72..0036955357e0 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -37,6 +37,7 @@ struct pbe *pagedir_nosave; static unsigned int nr_copy_pages; static unsigned int nr_meta_pages; +static unsigned long *buffer; #ifdef CONFIG_HIGHMEM unsigned int count_highmem_pages(void) @@ -389,7 +390,7 @@ struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, int safe_needed free_pagedir(pblist); pblist = NULL; } else - create_pbe_list(pblist, nr_pages); + create_pbe_list(pblist, nr_pages); return pblist; } @@ -418,6 +419,7 @@ void swsusp_free(void) nr_copy_pages = 0; nr_meta_pages = 0; pagedir_nosave = NULL; + buffer = NULL; } @@ -523,6 +525,8 @@ static void init_header(struct swsusp_info *info) info->cpus = num_online_cpus(); info->image_pages = nr_copy_pages; info->pages = nr_copy_pages + nr_meta_pages + 1; + info->size = info->pages; + info->size <<= PAGE_SHIFT; } /** @@ -568,8 +572,6 @@ static inline struct pbe *pack_orig_addresses(unsigned long *buf, struct pbe *pb int snapshot_read_next(struct snapshot_handle *handle, size_t count) { - static unsigned long *buffer; - if (handle->page > nr_meta_pages + nr_copy_pages) return 0; if (!buffer) { @@ -774,7 +776,6 @@ static int create_image(struct snapshot_handle *handle) int snapshot_write_next(struct snapshot_handle *handle, size_t count) { - static unsigned long *buffer; int error = 0; if (handle->prev && handle->page > nr_meta_pages + nr_copy_pages) diff --git a/kernel/power/user.c b/kernel/power/user.c new file mode 100644 index 000000000000..8cabc405ca10 --- /dev/null +++ b/kernel/power/user.c @@ -0,0 +1,301 @@ +/* + * linux/kernel/power/user.c + * + * This file provides the user space interface for software suspend/resume. + * + * Copyright (C) 2006 Rafael J. Wysocki + * + * This file is released under the GPLv2. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "power.h" + +#define SNAPSHOT_MINOR 231 + +static struct snapshot_data { + struct snapshot_handle handle; + int swap; + struct bitmap_page *bitmap; + int mode; + char frozen; + char ready; +} snapshot_state; + +static atomic_t device_available = ATOMIC_INIT(1); + +static int snapshot_open(struct inode *inode, struct file *filp) +{ + struct snapshot_data *data; + + if (!atomic_add_unless(&device_available, -1, 0)) + return -EBUSY; + + if ((filp->f_flags & O_ACCMODE) == O_RDWR) + return -ENOSYS; + + nonseekable_open(inode, filp); + data = &snapshot_state; + filp->private_data = data; + memset(&data->handle, 0, sizeof(struct snapshot_handle)); + if ((filp->f_flags & O_ACCMODE) == O_RDONLY) { + data->swap = swsusp_resume_device ? swap_type_of(swsusp_resume_device) : -1; + data->mode = O_RDONLY; + } else { + data->swap = -1; + data->mode = O_WRONLY; + } + data->bitmap = NULL; + data->frozen = 0; + data->ready = 0; + + return 0; +} + +static int snapshot_release(struct inode *inode, struct file *filp) +{ + struct snapshot_data *data; + + swsusp_free(); + data = filp->private_data; + free_all_swap_pages(data->swap, data->bitmap); + free_bitmap(data->bitmap); + if (data->frozen) { + down(&pm_sem); + thaw_processes(); + enable_nonboot_cpus(); + up(&pm_sem); + } + atomic_inc(&device_available); + return 0; +} + +static ssize_t snapshot_read(struct file *filp, char __user *buf, + size_t count, loff_t *offp) +{ + struct snapshot_data *data; + ssize_t res; + + data = filp->private_data; + res = snapshot_read_next(&data->handle, count); + if (res > 0) { + if (copy_to_user(buf, data_of(data->handle), res)) + res = -EFAULT; + else + *offp = data->handle.offset; + } + return res; +} + +static ssize_t snapshot_write(struct file *filp, const char __user *buf, + size_t count, loff_t *offp) +{ + struct snapshot_data *data; + ssize_t res; + + data = filp->private_data; + res = snapshot_write_next(&data->handle, count); + if (res > 0) { + if (copy_from_user(data_of(data->handle), buf, res)) + res = -EFAULT; + else + *offp = data->handle.offset; + } + return res; +} + +static int snapshot_ioctl(struct inode *inode, struct file *filp, + unsigned int cmd, unsigned long arg) +{ + int error = 0; + struct snapshot_data *data; + loff_t offset, avail; + + if (_IOC_TYPE(cmd) != SNAPSHOT_IOC_MAGIC) + return -ENOTTY; + if (_IOC_NR(cmd) > SNAPSHOT_IOC_MAXNR) + return -ENOTTY; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + data = filp->private_data; + + switch (cmd) { + + case SNAPSHOT_FREEZE: + if (data->frozen) + break; + sys_sync(); + down(&pm_sem); + pm_prepare_console(); + disable_nonboot_cpus(); + if (freeze_processes()) { + thaw_processes(); + enable_nonboot_cpus(); + pm_restore_console(); + error = -EBUSY; + } + up(&pm_sem); + if (!error) + data->frozen = 1; + break; + + case SNAPSHOT_UNFREEZE: + if (!data->frozen) + break; + down(&pm_sem); + thaw_processes(); + enable_nonboot_cpus(); + pm_restore_console(); + up(&pm_sem); + data->frozen = 0; + break; + + case SNAPSHOT_ATOMIC_SNAPSHOT: + if (data->mode != O_RDONLY || !data->frozen || data->ready) { + error = -EPERM; + break; + } + down(&pm_sem); + /* Free memory before shutting down devices. */ + error = swsusp_shrink_memory(); + if (!error) { + error = device_suspend(PMSG_FREEZE); + if (!error) { + in_suspend = 1; + error = swsusp_suspend(); + device_resume(); + } + } + up(&pm_sem); + if (!error) + error = put_user(in_suspend, (unsigned int __user *)arg); + if (!error) + data->ready = 1; + break; + + case SNAPSHOT_ATOMIC_RESTORE: + if (data->mode != O_WRONLY || !data->frozen || + !snapshot_image_loaded(&data->handle)) { + error = -EPERM; + break; + } + down(&pm_sem); + pm_prepare_console(); + error = device_suspend(PMSG_FREEZE); + if (!error) { + error = swsusp_resume(); + device_resume(); + } + pm_restore_console(); + up(&pm_sem); + break; + + case SNAPSHOT_FREE: + swsusp_free(); + memset(&data->handle, 0, sizeof(struct snapshot_handle)); + data->ready = 0; + break; + + case SNAPSHOT_SET_IMAGE_SIZE: + image_size = arg; + break; + + case SNAPSHOT_AVAIL_SWAP: + avail = count_swap_pages(data->swap, 1); + avail <<= PAGE_SHIFT; + error = put_user(avail, (loff_t __user *)arg); + break; + + case SNAPSHOT_GET_SWAP_PAGE: + if (data->swap < 0 || data->swap >= MAX_SWAPFILES) { + error = -ENODEV; + break; + } + if (!data->bitmap) { + data->bitmap = alloc_bitmap(count_swap_pages(data->swap, 0)); + if (!data->bitmap) { + error = -ENOMEM; + break; + } + } + offset = alloc_swap_page(data->swap, data->bitmap); + if (offset) { + offset <<= PAGE_SHIFT; + error = put_user(offset, (loff_t __user *)arg); + } else { + error = -ENOSPC; + } + break; + + case SNAPSHOT_FREE_SWAP_PAGES: + if (data->swap < 0 || data->swap >= MAX_SWAPFILES) { + error = -ENODEV; + break; + } + free_all_swap_pages(data->swap, data->bitmap); + free_bitmap(data->bitmap); + data->bitmap = NULL; + break; + + case SNAPSHOT_SET_SWAP_FILE: + if (!data->bitmap) { + /* + * User space encodes device types as two-byte values, + * so we need to recode them + */ + if (old_decode_dev(arg)) { + data->swap = swap_type_of(old_decode_dev(arg)); + if (data->swap < 0) + error = -ENODEV; + } else { + data->swap = -1; + error = -EINVAL; + } + } else { + error = -EPERM; + } + break; + + default: + error = -ENOTTY; + + } + + return error; +} + +static struct file_operations snapshot_fops = { + .open = snapshot_open, + .release = snapshot_release, + .read = snapshot_read, + .write = snapshot_write, + .llseek = no_llseek, + .ioctl = snapshot_ioctl, +}; + +static struct miscdevice snapshot_device = { + .minor = SNAPSHOT_MINOR, + .name = "snapshot", + .fops = &snapshot_fops, +}; + +static int __init snapshot_device_init(void) +{ + return misc_register(&snapshot_device); +}; + +device_initcall(snapshot_device_init); diff --git a/mm/swapfile.c b/mm/swapfile.c index 4d11f9d84666..39aa9d129612 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -428,14 +428,16 @@ int swap_type_of(dev_t device) { int i; - if (!device) - return -EINVAL; spin_lock(&swap_lock); for (i = 0; i < nr_swapfiles; i++) { struct inode *inode; if (!(swap_info[i].flags & SWP_WRITEOK)) continue; + if (!device) { + spin_unlock(&swap_lock); + return i; + } inode = swap_info->swap_file->f_dentry->d_inode; if (S_ISBLK(inode->i_mode) && device == MKDEV(imajor(inode), iminor(inode))) { -- cgit v1.2.3 From dd287796d608fcdc3fe5e8fdb5bf762a8f1bc32a Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 23 Mar 2006 03:00:57 -0800 Subject: [PATCH] pause_on_oops command line option Attempt to fix the problem wherein people's oops reports scroll off the screen due to repeated oopsing or to oopses on other CPUs. If this happens the user can reboot with the `pause_on_oops=' option. It will allow the first oopsing CPU to print an oops record just a single time. Second oopsing attempts, or oopses on other CPUs will cause those CPUs to enter a tight loop until the specified number of seconds have elapsed. The patch implements the infrastructure generically in the expectation that architectures other than x86 will find it useful. Cc: Dave Jones Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/kernel-parameters.txt | 5 ++ arch/i386/kernel/traps.c | 3 ++ arch/i386/mm/fault.c | 39 +++++++++------ include/linux/kernel.h | 3 ++ kernel/panic.c | 97 ++++++++++++++++++++++++++++++++++++- 5 files changed, 130 insertions(+), 17 deletions(-) (limited to 'Documentation') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 880be3a30d8d..7b7382d0f758 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1126,6 +1126,11 @@ running once the system is up. pas16= [HW,SCSI] See header of drivers/scsi/pas16.c. + pause_on_oops= + Halt all CPUs after the first oops has been printed for + the specified number of seconds. This is to be used if + your oopses keep scrolling off the screen. + pcbit= [HW,ISDN] pcd. [PARIDE] diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index 1b7ad4115d81..de5386b01d38 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -352,6 +352,8 @@ void die(const char * str, struct pt_regs * regs, long err) static int die_counter; unsigned long flags; + oops_enter(); + if (die.lock_owner != raw_smp_processor_id()) { console_verbose(); spin_lock_irqsave(&die.lock, flags); @@ -404,6 +406,7 @@ void die(const char * str, struct pt_regs * regs, long err) ssleep(5); panic("Fatal exception"); } + oops_exit(); do_exit(SIGSEGV); } diff --git a/arch/i386/mm/fault.c b/arch/i386/mm/fault.c index 47a3b72ec7b6..7f0fcf219a26 100644 --- a/arch/i386/mm/fault.c +++ b/arch/i386/mm/fault.c @@ -509,24 +509,31 @@ no_context: bust_spinlocks(1); -#ifdef CONFIG_X86_PAE - if (error_code & 16) { - pte_t *pte = lookup_address(address); - - if (pte && pte_present(*pte) && !pte_exec_kernel(*pte)) - printk(KERN_CRIT "kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n", current->uid); + if (oops_may_print()) { + #ifdef CONFIG_X86_PAE + if (error_code & 16) { + pte_t *pte = lookup_address(address); + + if (pte && pte_present(*pte) && !pte_exec_kernel(*pte)) + printk(KERN_CRIT "kernel tried to execute " + "NX-protected page - exploit attempt? " + "(uid: %d)\n", current->uid); + } + #endif + if (address < PAGE_SIZE) + printk(KERN_ALERT "BUG: unable to handle kernel NULL " + "pointer dereference"); + else + printk(KERN_ALERT "BUG: unable to handle kernel paging" + " request"); + printk(" at virtual address %08lx\n",address); + printk(KERN_ALERT " printing eip:\n"); + printk("%08lx\n", regs->eip); } -#endif - if (address < PAGE_SIZE) - printk(KERN_ALERT "BUG: unable to handle kernel NULL pointer dereference"); - else - printk(KERN_ALERT "BUG: unable to handle kernel paging request"); - printk(" at virtual address %08lx\n",address); - printk(KERN_ALERT " printing eip:\n"); - printk("%08lx\n", regs->eip); page = read_cr3(); page = ((unsigned long *) __va(page))[address >> 22]; - printk(KERN_ALERT "*pde = %08lx\n", page); + if (oops_may_print()) + printk(KERN_ALERT "*pde = %08lx\n", page); /* * We must not directly access the pte in the highpte * case, the page table might be allocated in highmem. @@ -534,7 +541,7 @@ no_context: * it's allocated already. */ #ifndef CONFIG_HIGHPTE - if (page & 1) { + if ((page & 1) && oops_may_print()) { page &= PAGE_MASK; address &= 0x003ff000; page = ((unsigned long *) __va(page))[address >> PAGE_SHIFT]; diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 3b507bf05d09..bb6e7ddee2fd 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -91,6 +91,9 @@ extern struct notifier_block *panic_notifier_list; extern long (*panic_blink)(long time); NORET_TYPE void panic(const char * fmt, ...) __attribute__ ((NORET_AND format (printf, 1, 2))); +extern void oops_enter(void); +extern void oops_exit(void); +extern int oops_may_print(void); fastcall NORET_TYPE void do_exit(long error_code) ATTRIB_NORET; NORET_TYPE void complete_and_exit(struct completion *, long) diff --git a/kernel/panic.c b/kernel/panic.c index 126dc43f1c74..acd95adddb93 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -20,10 +20,13 @@ #include #include -int panic_timeout; int panic_on_oops; int tainted; +static int pause_on_oops; +static int pause_on_oops_flag; +static DEFINE_SPINLOCK(pause_on_oops_lock); +int panic_timeout; EXPORT_SYMBOL(panic_timeout); struct notifier_block *panic_notifier_list; @@ -174,3 +177,95 @@ void add_taint(unsigned flag) tainted |= flag; } EXPORT_SYMBOL(add_taint); + +static int __init pause_on_oops_setup(char *str) +{ + pause_on_oops = simple_strtoul(str, NULL, 0); + return 1; +} +__setup("pause_on_oops=", pause_on_oops_setup); + +static void spin_msec(int msecs) +{ + int i; + + for (i = 0; i < msecs; i++) { + touch_nmi_watchdog(); + mdelay(1); + } +} + +/* + * It just happens that oops_enter() and oops_exit() are identically + * implemented... + */ +static void do_oops_enter_exit(void) +{ + unsigned long flags; + static int spin_counter; + + if (!pause_on_oops) + return; + + spin_lock_irqsave(&pause_on_oops_lock, flags); + if (pause_on_oops_flag == 0) { + /* This CPU may now print the oops message */ + pause_on_oops_flag = 1; + } else { + /* We need to stall this CPU */ + if (!spin_counter) { + /* This CPU gets to do the counting */ + spin_counter = pause_on_oops; + do { + spin_unlock(&pause_on_oops_lock); + spin_msec(MSEC_PER_SEC); + spin_lock(&pause_on_oops_lock); + } while (--spin_counter); + pause_on_oops_flag = 0; + } else { + /* This CPU waits for a different one */ + while (spin_counter) { + spin_unlock(&pause_on_oops_lock); + spin_msec(1); + spin_lock(&pause_on_oops_lock); + } + } + } + spin_unlock_irqrestore(&pause_on_oops_lock, flags); +} + +/* + * Return true if the calling CPU is allowed to print oops-related info. This + * is a bit racy.. + */ +int oops_may_print(void) +{ + return pause_on_oops_flag == 0; +} + +/* + * Called when the architecture enters its oops handler, before it prints + * anything. If this is the first CPU to oops, and it's oopsing the first time + * then let it proceed. + * + * This is all enabled by the pause_on_oops kernel boot option. We do all this + * to ensure that oopses don't scroll off the screen. It has the side-effect + * of preventing later-oopsing CPUs from mucking up the display, too. + * + * It turns out that the CPU which is allowed to print ends up pausing for the + * right duration, whereas all the other CPUs pause for twice as long: once in + * oops_enter(), once in oops_exit(). + */ +void oops_enter(void) +{ + do_oops_enter_exit(); +} + +/* + * Called when the architecture exits its oops handler, after printing + * everything. + */ +void oops_exit(void) +{ + do_oops_enter_exit(); +} -- cgit v1.2.3