Linux-2.6.12-rc2

Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip!
author: Linus Torvalds <torvalds@ppc970.osdl.org> 2005-04-17 02:20:36 +0400
committer: Linus Torvalds <torvalds@ppc970.osdl.org> 2005-04-17 02:20:36 +0400
commit: 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree: 0bba044c4ce775e45a88a51686b5d9f90697ea9d /kernel/power
download: linux-1da177e4c3f41524e886b7f1b8a0c1fc7321cac2.tar.xz
11 files changed, 2844 insertions, 0 deletions
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
new file mode 100644
index 000000000000..696387ffe49c
--- /dev/null
+++ b/kernel/power/Kconfig
@@ -0,0 +1,74 @@
+config PM
+	bool "Power Management support"
+	---help---
+	  "Power Management" means that parts of your computer are shut
+	  off or put into a power conserving "sleep" mode if they are not
+	  being used.  There are two competing standards for doing this: APM
+	  and ACPI.  If you want to use either one, say Y here and then also
+	  to the requisite support below.
+
+	  Power Management is most important for battery powered laptop
+	  computers; if you have a laptop, check out the Linux Laptop home
+	  page on the WWW at <http://www.linux-on-laptops.com/> or
+	  Tuxmobil - Linux on Mobile Computers at <http://www.tuxmobil.org/>
+	  and the Battery Powered Linux mini-HOWTO, available from
+	  <http://www.tldp.org/docs.html#howto>.
+
+	  Note that, even if you say N here, Linux on the x86 architecture
+	  will issue the hlt instruction if nothing is to be done, thereby
+	  sending the processor to sleep and saving power.
+
+config PM_DEBUG
+	bool "Power Management Debug Support"
+	depends on PM
+	---help---
+	This option enables verbose debugging support in the Power Management
+	code. This is helpful when debugging and reporting various PM bugs, 
+	like suspend support.
+
+config SOFTWARE_SUSPEND
+	bool "Software Suspend (EXPERIMENTAL)"
+	depends on EXPERIMENTAL && PM && SWAP
+	---help---
+	  Enable the possibility of suspending the machine.
+	  It doesn't need APM.
+	  You may suspend your machine by 'swsusp' or 'shutdown -z <time>' 
+	  (patch for sysvinit needed). 
+
+	  It creates an image which is saved in your active swap. Upon next
+	  boot, pass the 'resume=/dev/swappartition' argument to the kernel to
+	  have it detect the saved image, restore memory state from it, and
+	  continue to run as before. If you do not want the previous state to
+	  be reloaded, then use the 'noresume' kernel argument. However, note
+	  that your partitions will be fsck'd and you must re-mkswap your swap
+	  partitions. It does not work with swap files.
+
+	  Right now you may boot without resuming and then later resume but
+	  in meantime you cannot use those swap partitions/files which were
+	  involved in suspending. Also in this case there is a risk that buffers
+	  on disk won't match with saved ones.
+
+	  For more information take a look at <file:Documentation/power/swsusp.txt>.
+
+config PM_STD_PARTITION
+	string "Default resume partition"
+	depends on SOFTWARE_SUSPEND
+	default ""
+	---help---
+	  The default resume partition is the partition that the suspend-
+	  to-disk implementation will look for a suspended disk image. 
+
+	  The partition specified here will be different for almost every user. 
+	  It should be a valid swap partition (at least for now) that is turned
+	  on before suspending. 
+
+	  The partition specified can be overridden by specifying:
+
+		resume=/dev/<other device> 
+
+	  which will set the resume partition to the device specified. 
+
+	  Note there is currently not a way to specify which device to save the
+	  suspended image to. It will simply pick the first available swap 
+	  device.
+
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
new file mode 100644
index 000000000000..fbdc634135a7
--- /dev/null
+++ b/kernel/power/Makefile
@@ -0,0 +1,11 @@
+
+ifeq ($(CONFIG_PM_DEBUG),y)
+EXTRA_CFLAGS	+=	-DDEBUG
+endif
+
+swsusp-smp-$(CONFIG_SMP)	+= smp.o
+
+obj-y				:= main.o process.o console.o pm.o
+obj-$(CONFIG_SOFTWARE_SUSPEND)	+= swsusp.o $(swsusp-smp-y) disk.o
+
+obj-$(CONFIG_MAGIC_SYSRQ)	+= poweroff.o
diff --git a/kernel/power/console.c b/kernel/power/console.c
new file mode 100644
index 000000000000..7ff375e7c95f
--- /dev/null
+++ b/kernel/power/console.c
@@ -0,0 +1,58 @@
+/*
+ * drivers/power/process.c - Functions for saving/restoring console.
+ *
+ * Originally from swsusp.
+ */
+
+#include <linux/vt_kern.h>
+#include <linux/kbd_kern.h>
+#include <linux/console.h>
+#include "power.h"
+
+static int new_loglevel = 10;
+static int orig_loglevel;
+#ifdef SUSPEND_CONSOLE
+static int orig_fgconsole, orig_kmsg;
+#endif
+
+int pm_prepare_console(void)
+{
+	orig_loglevel = console_loglevel;
+	console_loglevel = new_loglevel;
+
+#ifdef SUSPEND_CONSOLE
+	acquire_console_sem();
+
+	orig_fgconsole = fg_console;
+
+	if (vc_allocate(SUSPEND_CONSOLE)) {
+	  /* we can't have a free VC for now. Too bad,
+	   * we don't want to mess the screen for now. */
+		release_console_sem();
+		return 1;
+	}
+
+	set_console(SUSPEND_CONSOLE);
+	release_console_sem();
+
+	if (vt_waitactive(SUSPEND_CONSOLE)) {
+		pr_debug("Suspend: Can't switch VCs.");
+		return 1;
+	}
+	orig_kmsg = kmsg_redirect;
+	kmsg_redirect = SUSPEND_CONSOLE;
+#endif
+	return 0;
+}
+
+void pm_restore_console(void)
+{
+	console_loglevel = orig_loglevel;
+#ifdef SUSPEND_CONSOLE
+	acquire_console_sem();
+	set_console(orig_fgconsole);
+	release_console_sem();
+	kmsg_redirect = orig_kmsg;
+#endif
+	return;
+}
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
new file mode 100644
index 000000000000..02b6764034dc
--- /dev/null
+++ b/kernel/power/disk.c
@@ -0,0 +1,431 @@
+/*
+ * kernel/power/disk.c - Suspend-to-disk support.
+ *
+ * Copyright (c) 2003 Patrick Mochel
+ * Copyright (c) 2003 Open Source Development Lab
+ * Copyright (c) 2004 Pavel Machek <pavel@suse.cz>
+ *
+ * This file is released under the GPLv2.
+ *
+ */
+
+#include <linux/suspend.h>
+#include <linux/syscalls.h>
+#include <linux/reboot.h>
+#include <linux/string.h>
+#include <linux/device.h>
+#include <linux/delay.h>
+#include <linux/fs.h>
+#include "power.h"
+
+
+extern suspend_disk_method_t pm_disk_mode;
+extern struct pm_ops * pm_ops;
+
+extern int swsusp_suspend(void);
+extern int swsusp_write(void);
+extern int swsusp_check(void);
+extern int swsusp_read(void);
+extern void swsusp_close(void);
+extern int swsusp_resume(void);
+extern int swsusp_free(void);
+
+
+static int noresume = 0;
+char resume_file[256] = CONFIG_PM_STD_PARTITION;
+dev_t swsusp_resume_device;
+
+/**
+ *	power_down - Shut machine down for hibernate.
+ *	@mode:		Suspend-to-disk mode
+ *
+ *	Use the platform driver, if configured so, and return gracefully if it
+ *	fails.
+ *	Otherwise, try to power off and reboot. If they fail, halt the machine,
+ *	there ain't no turning back.
+ */
+
+static void power_down(suspend_disk_method_t mode)
+{
+	unsigned long flags;
+	int error = 0;
+
+	local_irq_save(flags);
+	switch(mode) {
+	case PM_DISK_PLATFORM:
+ 		device_shutdown();
+		error = pm_ops->enter(PM_SUSPEND_DISK);
+		break;
+	case PM_DISK_SHUTDOWN:
+		printk("Powering off system\n");
+		device_shutdown();
+		machine_power_off();
+		break;
+	case PM_DISK_REBOOT:
+		device_shutdown();
+		machine_restart(NULL);
+		break;
+	}
+	machine_halt();
+	/* Valid image is on the disk, if we continue we risk serious data corruption
+	   after resume. */
+	printk(KERN_CRIT "Please power me down manually\n");
+	while(1);
+}
+
+
+static int in_suspend __nosavedata = 0;
+
+
+/**
+ *	free_some_memory -  Try to free as much memory as possible
+ *
+ *	... but do not OOM-kill anyone
+ *
+ *	Notice: all userland should be stopped at this point, or
+ *	livelock is possible.
+ */
+
+static void free_some_memory(void)
+{
+	unsigned int i = 0;
+	unsigned int tmp;
+	unsigned long pages = 0;
+	char *p = "-\\|/";
+
+	printk("Freeing memory...  ");
+	while ((tmp = shrink_all_memory(10000))) {
+		pages += tmp;
+		printk("\b%c", p[i]);
+		i++;
+		if (i > 3)
+			i = 0;
+	}
+	printk("\bdone (%li pages freed)\n", pages);
+}
+
+
+static inline void platform_finish(void)
+{
+	if (pm_disk_mode == PM_DISK_PLATFORM) {
+		if (pm_ops && pm_ops->finish)
+			pm_ops->finish(PM_SUSPEND_DISK);
+	}
+}
+
+static void finish(void)
+{
+	device_resume();
+	platform_finish();
+	enable_nonboot_cpus();
+	thaw_processes();
+	pm_restore_console();
+}
+
+
+static int prepare_processes(void)
+{
+	int error;
+
+	pm_prepare_console();
+
+	sys_sync();
+
+	if (freeze_processes()) {
+		error = -EBUSY;
+		return error;
+	}
+
+	if (pm_disk_mode == PM_DISK_PLATFORM) {
+		if (pm_ops && pm_ops->prepare) {
+			if ((error = pm_ops->prepare(PM_SUSPEND_DISK)))
+				return error;
+		}
+	}
+
+	/* Free memory before shutting down devices. */
+	free_some_memory();
+
+	return 0;
+}
+
+static void unprepare_processes(void)
+{
+	enable_nonboot_cpus();
+	thaw_processes();
+	pm_restore_console();
+}
+
+static int prepare_devices(void)
+{
+	int error;
+
+	disable_nonboot_cpus();
+	if ((error = device_suspend(PMSG_FREEZE))) {
+		printk("Some devices failed to suspend\n");
+		platform_finish();
+		enable_nonboot_cpus();
+		return error;
+	}
+
+	return 0;
+}
+
+/**
+ *	pm_suspend_disk - The granpappy of power management.
+ *
+ *	If we're going through the firmware, then get it over with quickly.
+ *
+ *	If not, then call swsusp to do its thing, then figure out how
+ *	to power down the system.
+ */
+
+int pm_suspend_disk(void)
+{
+	int error;
+
+	error = prepare_processes();
+	if (!error) {
+		error = prepare_devices();
+	}
+
+	if (error) {
+		unprepare_processes();
+		return error;
+	}
+
+	pr_debug("PM: Attempting to suspend to disk.\n");
+	if (pm_disk_mode == PM_DISK_FIRMWARE)
+		return pm_ops->enter(PM_SUSPEND_DISK);
+
+	pr_debug("PM: snapshotting memory.\n");
+	in_suspend = 1;
+	if ((error = swsusp_suspend()))
+		goto Done;
+
+	if (in_suspend) {
+		pr_debug("PM: writing image.\n");
+		error = swsusp_write();
+		if (!error)
+			power_down(pm_disk_mode);
+	} else
+		pr_debug("PM: Image restored successfully.\n");
+	swsusp_free();
+ Done:
+	finish();
+	return error;
+}
+
+
+/**
+ *	software_resume - Resume from a saved image.
+ *
+ *	Called as a late_initcall (so all devices are discovered and
+ *	initialized), we call swsusp to see if we have a saved image or not.
+ *	If so, we quiesce devices, the restore the saved image. We will
+ *	return above (in pm_suspend_disk() ) if everything goes well.
+ *	Otherwise, we fail gracefully and return to the normally
+ *	scheduled program.
+ *
+ */
+
+static int software_resume(void)
+{
+	int error;
+
+	if (noresume) {
+		/**
+		 * FIXME: If noresume is specified, we need to find the partition
+		 * and reset it back to normal swap space.
+		 */
+		return 0;
+	}
+
+	pr_debug("PM: Checking swsusp image.\n");
+
+	if ((error = swsusp_check()))
+		goto Done;
+
+	pr_debug("PM: Preparing processes for restore.\n");
+
+	if ((error = prepare_processes())) {
+		swsusp_close();
+		goto Cleanup;
+	}
+
+	pr_debug("PM: Reading swsusp image.\n");
+
+	if ((error = swsusp_read()))
+		goto Cleanup;
+
+	pr_debug("PM: Preparing devices for restore.\n");
+
+	if ((error = prepare_devices()))
+		goto Free;
+
+	mb();
+
+	pr_debug("PM: Restoring saved image.\n");
+	swsusp_resume();
+	pr_debug("PM: Restore failed, recovering.n");
+	finish();
+ Free:
+	swsusp_free();
+ Cleanup:
+	unprepare_processes();
+ Done:
+	pr_debug("PM: Resume from disk failed.\n");
+	return 0;
+}
+
+late_initcall(software_resume);
+
+
+static char * pm_disk_modes[] = {
+	[PM_DISK_FIRMWARE]	= "firmware",
+	[PM_DISK_PLATFORM]	= "platform",
+	[PM_DISK_SHUTDOWN]	= "shutdown",
+	[PM_DISK_REBOOT]	= "reboot",
+};
+
+/**
+ *	disk - Control suspend-to-disk mode
+ *
+ *	Suspend-to-disk can be handled in several ways. The greatest
+ *	distinction is who writes memory to disk - the firmware or the OS.
+ *	If the firmware does it, we assume that it also handles suspending
+ *	the system.
+ *	If the OS does it, then we have three options for putting the system
+ *	to sleep - using the platform driver (e.g. ACPI or other PM registers),
+ *	powering off the system or rebooting the system (for testing).
+ *
+ *	The system will support either 'firmware' or 'platform', and that is
+ *	known a priori (and encoded in pm_ops). But, the user may choose
+ *	'shutdown' or 'reboot' as alternatives.
+ *
+ *	show() will display what the mode is currently set to.
+ *	store() will accept one of
+ *
+ *	'firmware'
+ *	'platform'
+ *	'shutdown'
+ *	'reboot'
+ *
+ *	It will only change to 'firmware' or 'platform' if the system
+ *	supports it (as determined from pm_ops->pm_disk_mode).
+ */
+
+static ssize_t disk_show(struct subsystem * subsys, char * buf)
+{
+	return sprintf(buf, "%s\n", pm_disk_modes[pm_disk_mode]);
+}
+
+
+static ssize_t disk_store(struct subsystem * s, const char * buf, size_t n)
+{
+	int error = 0;
+	int i;
+	int len;
+	char *p;
+	suspend_disk_method_t mode = 0;
+
+	p = memchr(buf, '\n', n);
+	len = p ? p - buf : n;
+
+	down(&pm_sem);
+	for (i = PM_DISK_FIRMWARE; i < PM_DISK_MAX; i++) {
+		if (!strncmp(buf, pm_disk_modes[i], len)) {
+			mode = i;
+			break;
+		}
+	}
+	if (mode) {
+		if (mode == PM_DISK_SHUTDOWN || mode == PM_DISK_REBOOT)
+			pm_disk_mode = mode;
+		else {
+			if (pm_ops && pm_ops->enter &&
+			    (mode == pm_ops->pm_disk_mode))
+				pm_disk_mode = mode;
+			else
+				error = -EINVAL;
+		}
+	} else
+		error = -EINVAL;
+
+	pr_debug("PM: suspend-to-disk mode set to '%s'\n",
+		 pm_disk_modes[mode]);
+	up(&pm_sem);
+	return error ? error : n;
+}
+
+power_attr(disk);
+
+static ssize_t resume_show(struct subsystem * subsys, char *buf)
+{
+	return sprintf(buf,"%d:%d\n", MAJOR(swsusp_resume_device),
+		       MINOR(swsusp_resume_device));
+}
+
+static ssize_t resume_store(struct subsystem * subsys, const char * buf, size_t n)
+{
+	int len;
+	char *p;
+	unsigned int maj, min;
+	int error = -EINVAL;
+	dev_t res;
+
+	p = memchr(buf, '\n', n);
+	len = p ? p - buf : n;
+
+	if (sscanf(buf, "%u:%u", &maj, &min) == 2) {
+		res = MKDEV(maj,min);
+		if (maj == MAJOR(res) && min == MINOR(res)) {
+			swsusp_resume_device = res;
+			printk("Attempting manual resume\n");
+			noresume = 0;
+			software_resume();
+		}
+	}
+
+	return error >= 0 ? n : error;
+}
+
+power_attr(resume);
+
+static struct attribute * g[] = {
+	&disk_attr.attr,
+	&resume_attr.attr,
+	NULL,
+};
+
+
+static struct attribute_group attr_group = {
+	.attrs = g,
+};
+
+
+static int __init pm_disk_init(void)
+{
+	return sysfs_create_group(&power_subsys.kset.kobj,&attr_group);
+}
+
+core_initcall(pm_disk_init);
+
+
+static int __init resume_setup(char *str)
+{
+	if (noresume)
+		return 1;
+
+	strncpy( resume_file, str, 255 );
+	return 1;
+}
+
+static int __init noresume_setup(char *str)
+{
+	noresume = 1;
+	return 1;
+}
+
+__setup("noresume", noresume_setup);
+__setup("resume=", resume_setup);
diff --git a/kernel/power/main.c b/kernel/power/main.c
new file mode 100644
index 000000000000..7960ddf04a57
--- /dev/null
+++ b/kernel/power/main.c
@@ -0,0 +1,269 @@
+/*
+ * kernel/power/main.c - PM subsystem core functionality.
+ *
+ * Copyright (c) 2003 Patrick Mochel
+ * Copyright (c) 2003 Open Source Development Lab
+ * 
+ * This file is released under the GPLv2
+ *
+ */
+
+#include <linux/suspend.h>
+#include <linux/kobject.h>
+#include <linux/string.h>
+#include <linux/delay.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/pm.h>
+
+
+#include "power.h"
+
+DECLARE_MUTEX(pm_sem);
+
+struct pm_ops * pm_ops = NULL;
+suspend_disk_method_t pm_disk_mode = PM_DISK_SHUTDOWN;
+
+/**
+ *	pm_set_ops - Set the global power method table. 
+ *	@ops:	Pointer to ops structure.
+ */
+
+void pm_set_ops(struct pm_ops * ops)
+{
+	down(&pm_sem);
+	pm_ops = ops;
+	up(&pm_sem);
+}
+
+
+/**
+ *	suspend_prepare - Do prep work before entering low-power state.
+ *	@state:		State we're entering.
+ *
+ *	This is common code that is called for each state that we're 
+ *	entering. Allocate a console, stop all processes, then make sure
+ *	the platform can enter the requested state.
+ */
+
+static int suspend_prepare(suspend_state_t state)
+{
+	int error = 0;
+
+	if (!pm_ops || !pm_ops->enter)
+		return -EPERM;
+
+	pm_prepare_console();
+
+	if (freeze_processes()) {
+		error = -EAGAIN;
+		goto Thaw;
+	}
+
+	if (pm_ops->prepare) {
+		if ((error = pm_ops->prepare(state)))
+			goto Thaw;
+	}
+
+	if ((error = device_suspend(PMSG_SUSPEND))) {
+		printk(KERN_ERR "Some devices failed to suspend\n");
+		goto Finish;
+	}
+	return 0;
+ Finish:
+	if (pm_ops->finish)
+		pm_ops->finish(state);
+ Thaw:
+	thaw_processes();
+	pm_restore_console();
+	return error;
+}
+
+
+static int suspend_enter(suspend_state_t state)
+{
+	int error = 0;
+	unsigned long flags;
+
+	local_irq_save(flags);
+
+	if ((error = device_power_down(PMSG_SUSPEND))) {
+		printk(KERN_ERR "Some devices failed to power down\n");
+		goto Done;
+	}
+	error = pm_ops->enter(state);
+	device_power_up();
+ Done:
+	local_irq_restore(flags);
+	return error;
+}
+
+
+/**
+ *	suspend_finish - Do final work before exiting suspend sequence.
+ *	@state:		State we're coming out of.
+ *
+ *	Call platform code to clean up, restart processes, and free the 
+ *	console that we've allocated. This is not called for suspend-to-disk.
+ */
+
+static void suspend_finish(suspend_state_t state)
+{
+	device_resume();
+	if (pm_ops && pm_ops->finish)
+		pm_ops->finish(state);
+	thaw_processes();
+	pm_restore_console();
+}
+
+
+
+
+static char * pm_states[] = {
+	[PM_SUSPEND_STANDBY]	= "standby",
+	[PM_SUSPEND_MEM]	= "mem",
+	[PM_SUSPEND_DISK]	= "disk",
+	NULL,
+};
+
+
+/**
+ *	enter_state - Do common work of entering low-power state.
+ *	@state:		pm_state structure for state we're entering.
+ *
+ *	Make sure we're the only ones trying to enter a sleep state. Fail
+ *	if someone has beat us to it, since we don't want anything weird to
+ *	happen when we wake up.
+ *	Then, do the setup for suspend, enter the state, and cleaup (after
+ *	we've woken up).
+ */
+
+static int enter_state(suspend_state_t state)
+{
+	int error;
+
+	if (down_trylock(&pm_sem))
+		return -EBUSY;
+
+	if (state == PM_SUSPEND_DISK) {
+		error = pm_suspend_disk();
+		goto Unlock;
+	}
+
+	/* Suspend is hard to get right on SMP. */
+	if (num_online_cpus() != 1) {
+		error = -EPERM;
+		goto Unlock;
+	}
+
+	pr_debug("PM: Preparing system for suspend\n");
+	if ((error = suspend_prepare(state)))
+		goto Unlock;
+
+	pr_debug("PM: Entering state.\n");
+	error = suspend_enter(state);
+
+	pr_debug("PM: Finishing up.\n");
+	suspend_finish(state);
+ Unlock:
+	up(&pm_sem);
+	return error;
+}
+
+/*
+ * This is main interface to the outside world. It needs to be
+ * called from process context.
+ */
+int software_suspend(void)
+{
+	return enter_state(PM_SUSPEND_DISK);
+}
+
+
+/**
+ *	pm_suspend - Externally visible function for suspending system.
+ *	@state:		Enumarted value of state to enter.
+ *
+ *	Determine whether or not value is within range, get state 
+ *	structure, and enter (above).
+ */
+
+int pm_suspend(suspend_state_t state)
+{
+	if (state > PM_SUSPEND_ON && state < PM_SUSPEND_MAX)
+		return enter_state(state);
+	return -EINVAL;
+}
+
+
+
+decl_subsys(power,NULL,NULL);
+
+
+/**
+ *	state - control system power state.
+ *
+ *	show() returns what states are supported, which is hard-coded to
+ *	'standby' (Power-On Suspend), 'mem' (Suspend-to-RAM), and
+ *	'disk' (Suspend-to-Disk).
+ *
+ *	store() accepts one of those strings, translates it into the 
+ *	proper enumerated value, and initiates a suspend transition.
+ */
+
+static ssize_t state_show(struct subsystem * subsys, char * buf)
+{
+	int i;
+	char * s = buf;
+
+	for (i = 0; i < PM_SUSPEND_MAX; i++) {
+		if (pm_states[i])
+			s += sprintf(s,"%s ",pm_states[i]);
+	}
+	s += sprintf(s,"\n");
+	return (s - buf);
+}
+
+static ssize_t state_store(struct subsystem * subsys, const char * buf, size_t n)
+{
+	suspend_state_t state = PM_SUSPEND_STANDBY;
+	char ** s;
+	char *p;
+	int error;
+	int len;
+
+	p = memchr(buf, '\n', n);
+	len = p ? p - buf : n;
+
+	for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) {
+		if (*s && !strncmp(buf, *s, len))
+			break;
+	}
+	if (*s)
+		error = enter_state(state);
+	else
+		error = -EINVAL;
+	return error ? error : n;
+}
+
+power_attr(state);
+
+static struct attribute * g[] = {
+	&state_attr.attr,
+	NULL,
+};
+
+static struct attribute_group attr_group = {
+	.attrs = g,
+};
+
+
+static int __init pm_init(void)
+{
+	int error = subsystem_register(&power_subsys);
+	if (!error)
+		error = sysfs_create_group(&power_subsys.kset.kobj,&attr_group);
+	return error;
+}
+
+core_initcall(pm_init);
diff --git a/kernel/power/pm.c b/kernel/power/pm.c
new file mode 100644
index 000000000000..61deda04e39e
--- /dev/null
+++ b/kernel/power/pm.c
@@ -0,0 +1,265 @@
+/*
+ *  pm.c - Power management interface
+ *
+ *  Copyright (C) 2000 Andrew Henroid
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/pm.h>
+#include <linux/interrupt.h>
+
+int pm_active;
+
+/*
+ *	Locking notes:
+ *		pm_devs_lock can be a semaphore providing pm ops are not called
+ *	from an interrupt handler (already a bad idea so no change here). Each
+ *	change must be protected so that an unlink of an entry doesn't clash
+ *	with a pm send - which is permitted to sleep in the current architecture
+ *
+ *	Module unloads clashing with pm events now work out safely, the module 
+ *	unload path will block until the event has been sent. It may well block
+ *	until a resume but that will be fine.
+ */
+ 
+static DECLARE_MUTEX(pm_devs_lock);
+static LIST_HEAD(pm_devs);
+
+/**
+ *	pm_register - register a device with power management
+ *	@type: device type 
+ *	@id: device ID
+ *	@callback: callback function
+ *
+ *	Add a device to the list of devices that wish to be notified about
+ *	power management events. A &pm_dev structure is returned on success,
+ *	on failure the return is %NULL.
+ *
+ *      The callback function will be called in process context and
+ *      it may sleep.
+ */
+ 
+struct pm_dev *pm_register(pm_dev_t type,
+			   unsigned long id,
+			   pm_callback callback)
+{
+	struct pm_dev *dev = kmalloc(sizeof(struct pm_dev), GFP_KERNEL);
+	if (dev) {
+		memset(dev, 0, sizeof(*dev));
+		dev->type = type;
+		dev->id = id;
+		dev->callback = callback;
+
+		down(&pm_devs_lock);
+		list_add(&dev->entry, &pm_devs);
+		up(&pm_devs_lock);
+	}
+	return dev;
+}
+
+/**
+ *	pm_unregister -  unregister a device with power management
+ *	@dev: device to unregister
+ *
+ *	Remove a device from the power management notification lists. The
+ *	dev passed must be a handle previously returned by pm_register.
+ */
+ 
+void pm_unregister(struct pm_dev *dev)
+{
+	if (dev) {
+		down(&pm_devs_lock);
+		list_del(&dev->entry);
+		up(&pm_devs_lock);
+
+		kfree(dev);
+	}
+}
+
+static void __pm_unregister(struct pm_dev *dev)
+{
+	if (dev) {
+		list_del(&dev->entry);
+		kfree(dev);
+	}
+}
+
+/**
+ *	pm_unregister_all - unregister all devices with matching callback
+ *	@callback: callback function pointer
+ *
+ *	Unregister every device that would call the callback passed. This
+ *	is primarily meant as a helper function for loadable modules. It
+ *	enables a module to give up all its managed devices without keeping
+ *	its own private list.
+ */
+ 
+void pm_unregister_all(pm_callback callback)
+{
+	struct list_head *entry;
+
+	if (!callback)
+		return;
+
+	down(&pm_devs_lock);
+	entry = pm_devs.next;
+	while (entry != &pm_devs) {
+		struct pm_dev *dev = list_entry(entry, struct pm_dev, entry);
+		entry = entry->next;
+		if (dev->callback == callback)
+			__pm_unregister(dev);
+	}
+	up(&pm_devs_lock);
+}
+
+/**
+ *	pm_send - send request to a single device
+ *	@dev: device to send to
+ *	@rqst: power management request
+ *	@data: data for the callback
+ *
+ *	Issue a power management request to a given device. The 
+ *	%PM_SUSPEND and %PM_RESUME events are handled specially. The
+ *	data field must hold the intended next state. No call is made
+ *	if the state matches.
+ *
+ *	BUGS: what stops two power management requests occurring in parallel
+ *	and conflicting.
+ *
+ *	WARNING: Calling pm_send directly is not generally recommended, in
+ *	particular there is no locking against the pm_dev going away. The
+ *	caller must maintain all needed locking or have 'inside knowledge'
+ *	on the safety. Also remember that this function is not locked against
+ *	pm_unregister. This means that you must handle SMP races on callback
+ *	execution and unload yourself.
+ */
+ 
+static int pm_send(struct pm_dev *dev, pm_request_t rqst, void *data)
+{
+	int status = 0;
+	unsigned long prev_state, next_state;
+
+	if (in_interrupt())
+		BUG();
+
+	switch (rqst) {
+	case PM_SUSPEND:
+	case PM_RESUME:
+		prev_state = dev->state;
+		next_state = (unsigned long) data;
+		if (prev_state != next_state) {
+			if (dev->callback)
+				status = (*dev->callback)(dev, rqst, data);
+			if (!status) {
+				dev->state = next_state;
+				dev->prev_state = prev_state;
+			}
+		}
+		else {
+			dev->prev_state = prev_state;
+		}
+		break;
+	default:
+		if (dev->callback)
+			status = (*dev->callback)(dev, rqst, data);
+		break;
+	}
+	return status;
+}
+
+/*
+ * Undo incomplete request
+ */
+static void pm_undo_all(struct pm_dev *last)
+{
+	struct list_head *entry = last->entry.prev;
+	while (entry != &pm_devs) {
+		struct pm_dev *dev = list_entry(entry, struct pm_dev, entry);
+		if (dev->state != dev->prev_state) {
+			/* previous state was zero (running) resume or
+			 * previous state was non-zero (suspended) suspend
+			 */
+			pm_request_t undo = (dev->prev_state
+					     ? PM_SUSPEND:PM_RESUME);
+			pm_send(dev, undo, (void*) dev->prev_state);
+		}
+		entry = entry->prev;
+	}
+}
+
+/**
+ *	pm_send_all - send request to all managed devices
+ *	@rqst: power management request
+ *	@data: data for the callback
+ *
+ *	Issue a power management request to a all devices. The 
+ *	%PM_SUSPEND events are handled specially. Any device is 
+ *	permitted to fail a suspend by returning a non zero (error)
+ *	value from its callback function. If any device vetoes a 
+ *	suspend request then all other devices that have suspended 
+ *	during the processing of this request are restored to their
+ *	previous state.
+ *
+ *	WARNING:  This function takes the pm_devs_lock. The lock is not dropped until
+ *	the callbacks have completed. This prevents races against pm locking
+ *	functions, races against module unload pm_unregister code. It does
+ *	mean however that you must not issue pm_ functions within the callback
+ *	or you will deadlock and users will hate you.
+ *
+ *	Zero is returned on success. If a suspend fails then the status
+ *	from the device that vetoes the suspend is returned.
+ *
+ *	BUGS: what stops two power management requests occurring in parallel
+ *	and conflicting.
+ */
+ 
+int pm_send_all(pm_request_t rqst, void *data)
+{
+	struct list_head *entry;
+	
+	down(&pm_devs_lock);
+	entry = pm_devs.next;
+	while (entry != &pm_devs) {
+		struct pm_dev *dev = list_entry(entry, struct pm_dev, entry);
+		if (dev->callback) {
+			int status = pm_send(dev, rqst, data);
+			if (status) {
+				/* return devices to previous state on
+				 * failed suspend request
+				 */
+				if (rqst == PM_SUSPEND)
+					pm_undo_all(dev);
+				up(&pm_devs_lock);
+				return status;
+			}
+		}
+		entry = entry->next;
+	}
+	up(&pm_devs_lock);
+	return 0;
+}
+
+EXPORT_SYMBOL(pm_register);
+EXPORT_SYMBOL(pm_unregister);
+EXPORT_SYMBOL(pm_unregister_all);
+EXPORT_SYMBOL(pm_send_all);
+EXPORT_SYMBOL(pm_active);
+
+
diff --git a/kernel/power/power.h b/kernel/power/power.h
new file mode 100644
index 000000000000..cd6a3493cc0d
--- /dev/null
+++ b/kernel/power/power.h
@@ -0,0 +1,52 @@
+#include <linux/suspend.h>
+#include <linux/utsname.h>
+
+/* With SUSPEND_CONSOLE defined, it suspend looks *really* cool, but
+   we probably do not take enough locks for switching consoles, etc,
+   so bad things might happen.
+*/
+#if defined(CONFIG_VT) && defined(CONFIG_VT_CONSOLE)
+#define SUSPEND_CONSOLE	(MAX_NR_CONSOLES-1)
+#endif
+
+
+struct swsusp_info {
+	struct new_utsname	uts;
+	u32			version_code;
+	unsigned long		num_physpages;
+	int			cpus;
+	unsigned long		image_pages;
+	unsigned long		pagedir_pages;
+	suspend_pagedir_t	* suspend_pagedir;
+	swp_entry_t		pagedir[768];
+} __attribute__((aligned(PAGE_SIZE)));
+
+
+
+#ifdef CONFIG_SOFTWARE_SUSPEND
+extern int pm_suspend_disk(void);
+
+#else
+static inline int pm_suspend_disk(void)
+{
+	return -EPERM;
+}
+#endif
+extern struct semaphore pm_sem;
+#define power_attr(_name) \
+static struct subsys_attribute _name##_attr = {	\
+	.attr	= {				\
+		.name = __stringify(_name),	\
+		.mode = 0644,			\
+	},					\
+	.show	= _name##_show,			\
+	.store	= _name##_store,		\
+}
+
+extern struct subsystem power_subsys;
+
+extern int freeze_processes(void);
+extern void thaw_processes(void);
+
+extern int pm_prepare_console(void);
+extern void pm_restore_console(void);
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c
new file mode 100644
index 000000000000..715081b2d829
--- /dev/null
+++ b/kernel/power/poweroff.c
@@ -0,0 +1,45 @@
+/*
+ * poweroff.c - sysrq handler to gracefully power down machine.
+ *
+ * This file is released under the GPL v2
+ */
+
+#include <linux/kernel.h>
+#include <linux/sysrq.h>
+#include <linux/init.h>
+#include <linux/pm.h>
+#include <linux/workqueue.h>
+
+/*
+ * When the user hits Sys-Rq o to power down the machine this is the
+ * callback we use.
+ */
+
+static void do_poweroff(void *dummy)
+{
+	if (pm_power_off)
+		pm_power_off();
+}
+
+static DECLARE_WORK(poweroff_work, do_poweroff, NULL);
+
+static void handle_poweroff(int key, struct pt_regs *pt_regs,
+				struct tty_struct *tty)
+{
+	schedule_work(&poweroff_work);
+}
+
+static struct sysrq_key_op	sysrq_poweroff_op = {
+	.handler        = handle_poweroff,
+	.help_msg       = "powerOff",
+	.action_msg     = "Power Off",
+ 	.enable_mask	= SYSRQ_ENABLE_BOOT,
+};
+
+static int pm_sysrq_init(void)
+{
+	register_sysrq_key('o', &sysrq_poweroff_op);
+	return 0;
+}
+
+subsys_initcall(pm_sysrq_init);
diff --git a/kernel/power/process.c b/kernel/power/process.c
new file mode 100644
index 000000000000..78d92dc6a1ed
--- /dev/null
+++ b/kernel/power/process.c
@@ -0,0 +1,121 @@
+/*
+ * drivers/power/process.c - Functions for starting/stopping processes on 
+ *                           suspend transitions.
+ *
+ * Originally from swsusp.
+ */
+
+
+#undef DEBUG
+
+#include <linux/smp_lock.h>
+#include <linux/interrupt.h>
+#include <linux/suspend.h>
+#include <linux/module.h>
+
+/* 
+ * Timeout for stopping processes
+ */
+#define TIMEOUT	(6 * HZ)
+
+
+static inline int freezeable(struct task_struct * p)
+{
+	if ((p == current) || 
+	    (p->flags & PF_NOFREEZE) ||
+	    (p->exit_state == EXIT_ZOMBIE) ||
+	    (p->exit_state == EXIT_DEAD) ||
+	    (p->state == TASK_STOPPED) ||
+	    (p->state == TASK_TRACED))
+		return 0;
+	return 1;
+}
+
+/* Refrigerator is place where frozen processes are stored :-). */
+void refrigerator(unsigned long flag)
+{
+	/* Hmm, should we be allowed to suspend when there are realtime
+	   processes around? */
+	long save;
+	save = current->state;
+	current->state = TASK_UNINTERRUPTIBLE;
+	pr_debug("%s entered refrigerator\n", current->comm);
+	printk("=");
+	current->flags &= ~PF_FREEZE;
+
+	spin_lock_irq(&current->sighand->siglock);
+	recalc_sigpending(); /* We sent fake signal, clean it up */
+	spin_unlock_irq(&current->sighand->siglock);
+
+	current->flags |= PF_FROZEN;
+	while (current->flags & PF_FROZEN)
+		schedule();
+	pr_debug("%s left refrigerator\n", current->comm);
+	current->state = save;
+}
+
+/* 0 = success, else # of processes that we failed to stop */
+int freeze_processes(void)
+{
+       int todo;
+       unsigned long start_time;
+	struct task_struct *g, *p;
+	
+	printk( "Stopping tasks: " );
+	start_time = jiffies;
+	do {
+		todo = 0;
+		read_lock(&tasklist_lock);
+		do_each_thread(g, p) {
+			unsigned long flags;
+			if (!freezeable(p))
+				continue;
+			if ((p->flags & PF_FROZEN) ||
+			    (p->state == TASK_TRACED) ||
+			    (p->state == TASK_STOPPED))
+				continue;
+
+			/* FIXME: smp problem here: we may not access other process' flags
+			   without locking */
+			p->flags |= PF_FREEZE;
+			spin_lock_irqsave(&p->sighand->siglock, flags);
+			signal_wake_up(p, 0);
+			spin_unlock_irqrestore(&p->sighand->siglock, flags);
+			todo++;
+		} while_each_thread(g, p);
+		read_unlock(&tasklist_lock);
+		yield();			/* Yield is okay here */
+		if (time_after(jiffies, start_time + TIMEOUT)) {
+			printk( "\n" );
+			printk(KERN_ERR " stopping tasks failed (%d tasks remaining)\n", todo );
+			return todo;
+		}
+	} while(todo);
+	
+	printk( "|\n" );
+	BUG_ON(in_atomic());
+	return 0;
+}
+
+void thaw_processes(void)
+{
+	struct task_struct *g, *p;
+
+	printk( "Restarting tasks..." );
+	read_lock(&tasklist_lock);
+	do_each_thread(g, p) {
+		if (!freezeable(p))
+			continue;
+		if (p->flags & PF_FROZEN) {
+			p->flags &= ~PF_FROZEN;
+			wake_up_process(p);
+		} else
+			printk(KERN_INFO " Strange, %s not stopped\n", p->comm );
+	} while_each_thread(g, p);
+
+	read_unlock(&tasklist_lock);
+	schedule();
+	printk( " done\n" );
+}
+
+EXPORT_SYMBOL(refrigerator);
diff --git a/kernel/power/smp.c b/kernel/power/smp.c
new file mode 100644
index 000000000000..7fa7f6e2b7fb
--- /dev/null
+++ b/kernel/power/smp.c
@@ -0,0 +1,85 @@
+/*
+ * drivers/power/smp.c - Functions for stopping other CPUs.
+ *
+ * Copyright 2004 Pavel Machek <pavel@suse.cz>
+ * Copyright (C) 2002-2003 Nigel Cunningham <ncunningham@clear.net.nz>
+ *
+ * This file is released under the GPLv2.
+ */
+
+#undef DEBUG
+
+#include <linux/smp_lock.h>
+#include <linux/interrupt.h>
+#include <linux/suspend.h>
+#include <linux/module.h>
+#include <asm/atomic.h>
+#include <asm/tlbflush.h>
+
+static atomic_t cpu_counter, freeze;
+
+
+static void smp_pause(void * data)
+{
+	struct saved_context ctxt;
+	__save_processor_state(&ctxt);
+	printk("Sleeping in:\n");
+	dump_stack();
+	atomic_inc(&cpu_counter);
+	while (atomic_read(&freeze)) {
+		/* FIXME: restore takes place at random piece inside this.
+		   This should probably be written in assembly, and
+		   preserve general-purpose registers, too
+
+		   What about stack? We may need to move to new stack here.
+
+		   This should better be ran with interrupts disabled.
+		 */
+		cpu_relax();
+		barrier();
+	}
+	atomic_dec(&cpu_counter);
+	__restore_processor_state(&ctxt);
+}
+
+static cpumask_t oldmask;
+
+void disable_nonboot_cpus(void)
+{
+	printk("Freezing CPUs (at %d)", smp_processor_id());
+	oldmask = current->cpus_allowed;
+	set_cpus_allowed(current, cpumask_of_cpu(0));
+	current->state = TASK_INTERRUPTIBLE;
+	schedule_timeout(HZ);
+	printk("...");
+	BUG_ON(smp_processor_id() != 0);
+
+	/* FIXME: for this to work, all the CPUs must be running
+	 * "idle" thread (or we deadlock). Is that guaranteed? */
+
+	atomic_set(&cpu_counter, 0);
+	atomic_set(&freeze, 1);
+	smp_call_function(smp_pause, NULL, 0, 0);
+	while (atomic_read(&cpu_counter) < (num_online_cpus() - 1)) {
+		cpu_relax();
+		barrier();
+	}
+	printk("ok\n");
+}
+
+void enable_nonboot_cpus(void)
+{
+	printk("Restarting CPUs");
+	atomic_set(&freeze, 0);
+	while (atomic_read(&cpu_counter)) {
+		cpu_relax();
+		barrier();
+	}
+	printk("...");
+	set_cpus_allowed(current, oldmask);
+	schedule();
+	printk("ok\n");
+
+}
+
+
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
new file mode 100644
index 000000000000..ae5bebc3b18f
--- /dev/null
+++ b/kernel/power/swsusp.c
@@ -0,0 +1,1433 @@
+/*
+ * linux/kernel/power/swsusp.c
+ *
+ * This file is to realize architecture-independent
+ * machine suspend feature using pretty near only high-level routines
+ *
+ * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu>
+ * Copyright (C) 1998,2001-2004 Pavel Machek <pavel@suse.cz>
+ *
+ * This file is released under the GPLv2.
+ *
+ * I'd like to thank the following people for their work:
+ * 
+ * Pavel Machek <pavel@ucw.cz>:
+ * Modifications, defectiveness pointing, being with me at the very beginning,
+ * suspend to swap space, stop all tasks. Port to 2.4.18-ac and 2.5.17.
+ *
+ * Steve Doddi <dirk@loth.demon.co.uk>: 
+ * Support the possibility of hardware state restoring.
+ *
+ * Raph <grey.havens@earthling.net>:
+ * Support for preserving states of network devices and virtual console
+ * (including X and svgatextmode)
+ *
+ * Kurt Garloff <garloff@suse.de>:
+ * Straightened the critical function in order to prevent compilers from
+ * playing tricks with local variables.
+ *
+ * Andreas Mohr <a.mohr@mailto.de>
+ *
+ * Alex Badea <vampire@go.ro>:
+ * Fixed runaway init
+ *
+ * More state savers are welcome. Especially for the scsi layer...
+ *
+ * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt
+ */
+
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/suspend.h>
+#include <linux/smp_lock.h>
+#include <linux/file.h>
+#include <linux/utsname.h>
+#include <linux/version.h>
+#include <linux/delay.h>
+#include <linux/reboot.h>
+#include <linux/bitops.h>
+#include <linux/vt_kern.h>
+#include <linux/kbd_kern.h>
+#include <linux/keyboard.h>
+#include <linux/spinlock.h>
+#include <linux/genhd.h>
+#include <linux/kernel.h>
+#include <linux/major.h>
+#include <linux/swap.h>
+#include <linux/pm.h>
+#include <linux/device.h>
+#include <linux/buffer_head.h>
+#include <linux/swapops.h>
+#include <linux/bootmem.h>
+#include <linux/syscalls.h>
+#include <linux/console.h>
+#include <linux/highmem.h>
+#include <linux/bio.h>
+
+#include <asm/uaccess.h>
+#include <asm/mmu_context.h>
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+#include <asm/io.h>
+
+#include "power.h"
+
+/* References to section boundaries */
+extern const void __nosave_begin, __nosave_end;
+
+/* Variables to be preserved over suspend */
+static int nr_copy_pages_check;
+
+extern char resume_file[];
+
+/* Local variables that should not be affected by save */
+unsigned int nr_copy_pages __nosavedata = 0;
+
+/* Suspend pagedir is allocated before final copy, therefore it
+   must be freed after resume 
+
+   Warning: this is evil. There are actually two pagedirs at time of
+   resume. One is "pagedir_save", which is empty frame allocated at
+   time of suspend, that must be freed. Second is "pagedir_nosave", 
+   allocated at time of resume, that travels through memory not to
+   collide with anything.
+
+   Warning: this is even more evil than it seems. Pagedirs this file
+   talks about are completely different from page directories used by
+   MMU hardware.
+ */
+suspend_pagedir_t *pagedir_nosave __nosavedata = NULL;
+static suspend_pagedir_t *pagedir_save;
+
+#define SWSUSP_SIG	"S1SUSPEND"
+
+static struct swsusp_header {
+	char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)];
+	swp_entry_t swsusp_info;
+	char	orig_sig[10];
+	char	sig[10];
+} __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header;
+
+static struct swsusp_info swsusp_info;
+
+/*
+ * XXX: We try to keep some more pages free so that I/O operations succeed
+ * without paging. Might this be more?
+ */
+#define PAGES_FOR_IO	512
+
+/*
+ * Saving part...
+ */
+
+/* We memorize in swapfile_used what swap devices are used for suspension */
+#define SWAPFILE_UNUSED    0
+#define SWAPFILE_SUSPEND   1	/* This is the suspending device */
+#define SWAPFILE_IGNORED   2	/* Those are other swap devices ignored for suspension */
+
+static unsigned short swapfile_used[MAX_SWAPFILES];
+static unsigned short root_swap;
+
+static int mark_swapfiles(swp_entry_t prev)
+{
+	int error;
+
+	rw_swap_page_sync(READ, 
+			  swp_entry(root_swap, 0),
+			  virt_to_page((unsigned long)&swsusp_header));
+	if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) ||
+	    !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) {
+		memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10);
+		memcpy(swsusp_header.sig,SWSUSP_SIG, 10);
+		swsusp_header.swsusp_info = prev;
+		error = rw_swap_page_sync(WRITE, 
+					  swp_entry(root_swap, 0),
+					  virt_to_page((unsigned long)
+						       &swsusp_header));
+	} else {
+		pr_debug("swsusp: Partition is not swap space.\n");
+		error = -ENODEV;
+	}
+	return error;
+}
+
+/*
+ * Check whether the swap device is the specified resume
+ * device, irrespective of whether they are specified by
+ * identical names.
+ *
+ * (Thus, device inode aliasing is allowed.  You can say /dev/hda4
+ * instead of /dev/ide/host0/bus0/target0/lun0/part4 [if using devfs]
+ * and they'll be considered the same device.  This is *necessary* for
+ * devfs, since the resume code can only recognize the form /dev/hda4,
+ * but the suspend code would see the long name.)
+ */
+static int is_resume_device(const struct swap_info_struct *swap_info)
+{
+	struct file *file = swap_info->swap_file;
+	struct inode *inode = file->f_dentry->d_inode;
+
+	return S_ISBLK(inode->i_mode) &&
+		swsusp_resume_device == MKDEV(imajor(inode), iminor(inode));
+}
+
+static int swsusp_swap_check(void) /* This is called before saving image */
+{
+	int i, len;
+	
+	len=strlen(resume_file);
+	root_swap = 0xFFFF;
+	
+	swap_list_lock();
+	for(i=0; i<MAX_SWAPFILES; i++) {
+		if (swap_info[i].flags == 0) {
+			swapfile_used[i]=SWAPFILE_UNUSED;
+		} else {
+			if(!len) {
+	    			printk(KERN_WARNING "resume= option should be used to set suspend device" );
+				if(root_swap == 0xFFFF) {
+					swapfile_used[i] = SWAPFILE_SUSPEND;
+					root_swap = i;
+				} else
+					swapfile_used[i] = SWAPFILE_IGNORED;				  
+			} else {
+	  			/* we ignore all swap devices that are not the resume_file */
+				if (is_resume_device(&swap_info[i])) {
+					swapfile_used[i] = SWAPFILE_SUSPEND;
+					root_swap = i;
+				} else {
+				  	swapfile_used[i] = SWAPFILE_IGNORED;
+				}
+			}
+		}
+	}
+	swap_list_unlock();
+	return (root_swap != 0xffff) ? 0 : -ENODEV;
+}
+
+/**
+ * This is called after saving image so modification
+ * will be lost after resume... and that's what we want.
+ * we make the device unusable. A new call to
+ * lock_swapdevices can unlock the devices. 
+ */
+static void lock_swapdevices(void)
+{
+	int i;
+
+	swap_list_lock();
+	for(i = 0; i< MAX_SWAPFILES; i++)
+		if(swapfile_used[i] == SWAPFILE_IGNORED) {
+			swap_info[i].flags ^= 0xFF;
+		}
+	swap_list_unlock();
+}
+
+/**
+ *	write_swap_page - Write one page to a fresh swap location.
+ *	@addr:	Address we're writing.
+ *	@loc:	Place to store the entry we used.
+ *
+ *	Allocate a new swap entry and 'sync' it. Note we discard -EIO
+ *	errors. That is an artifact left over from swsusp. It did not 
+ *	check the return of rw_swap_page_sync() at all, since most pages
+ *	written back to swap would return -EIO.
+ *	This is a partial improvement, since we will at least return other
+ *	errors, though we need to eventually fix the damn code.
+ */
+static int write_page(unsigned long addr, swp_entry_t * loc)
+{
+	swp_entry_t entry;
+	int error = 0;
+
+	entry = get_swap_page();
+	if (swp_offset(entry) && 
+	    swapfile_used[swp_type(entry)] == SWAPFILE_SUSPEND) {
+		error = rw_swap_page_sync(WRITE, entry,
+					  virt_to_page(addr));
+		if (error == -EIO)
+			error = 0;
+		if (!error)
+			*loc = entry;
+	} else
+		error = -ENOSPC;
+	return error;
+}
+
+/**
+ *	data_free - Free the swap entries used by the saved image.
+ *
+ *	Walk the list of used swap entries and free each one. 
+ *	This is only used for cleanup when suspend fails.
+ */
+static void data_free(void)
+{
+	swp_entry_t entry;
+	int i;
+
+	for (i = 0; i < nr_copy_pages; i++) {
+		entry = (pagedir_nosave + i)->swap_address;
+		if (entry.val)
+			swap_free(entry);
+		else
+			break;
+		(pagedir_nosave + i)->swap_address = (swp_entry_t){0};
+	}
+}
+
+/**
+ *	data_write - Write saved image to swap.
+ *
+ *	Walk the list of pages in the image and sync each one to swap.
+ */
+static int data_write(void)
+{
+	int error = 0, i = 0;
+	unsigned int mod = nr_copy_pages / 100;
+	struct pbe *p;
+
+	if (!mod)
+		mod = 1;
+
+	printk( "Writing data to swap (%d pages)...     ", nr_copy_pages );
+	for_each_pbe(p, pagedir_nosave) {
+		if (!(i%mod))
+			printk( "\b\b\b\b%3d%%", i / mod );
+		if ((error = write_page(p->address, &(p->swap_address))))
+			return error;
+		i++;
+	}
+	printk("\b\b\b\bdone\n");
+	return error;
+}
+
+static void dump_info(void)
+{
+	pr_debug(" swsusp: Version: %u\n",swsusp_info.version_code);
+	pr_debug(" swsusp: Num Pages: %ld\n",swsusp_info.num_physpages);
+	pr_debug(" swsusp: UTS Sys: %s\n",swsusp_info.uts.sysname);
+	pr_debug(" swsusp: UTS Node: %s\n",swsusp_info.uts.nodename);
+	pr_debug(" swsusp: UTS Release: %s\n",swsusp_info.uts.release);
+	pr_debug(" swsusp: UTS Version: %s\n",swsusp_info.uts.version);
+	pr_debug(" swsusp: UTS Machine: %s\n",swsusp_info.uts.machine);
+	pr_debug(" swsusp: UTS Domain: %s\n",swsusp_info.uts.domainname);
+	pr_debug(" swsusp: CPUs: %d\n",swsusp_info.cpus);
+	pr_debug(" swsusp: Image: %ld Pages\n",swsusp_info.image_pages);
+	pr_debug(" swsusp: Pagedir: %ld Pages\n",swsusp_info.pagedir_pages);
+}
+
+static void init_header(void)
+{
+	memset(&swsusp_info, 0, sizeof(swsusp_info));
+	swsusp_info.version_code = LINUX_VERSION_CODE;
+	swsusp_info.num_physpages = num_physpages;
+	memcpy(&swsusp_info.uts, &system_utsname, sizeof(system_utsname));
+
+	swsusp_info.suspend_pagedir = pagedir_nosave;
+	swsusp_info.cpus = num_online_cpus();
+	swsusp_info.image_pages = nr_copy_pages;
+}
+
+static int close_swap(void)
+{
+	swp_entry_t entry;
+	int error;
+
+	dump_info();
+	error = write_page((unsigned long)&swsusp_info, &entry);
+	if (!error) { 
+		printk( "S" );
+		error = mark_swapfiles(entry);
+		printk( "|\n" );
+	}
+	return error;
+}
+
+/**
+ *	free_pagedir_entries - Free pages used by the page directory.
+ *
+ *	This is used during suspend for error recovery.
+ */
+
+static void free_pagedir_entries(void)
+{
+	int i;
+
+	for (i = 0; i < swsusp_info.pagedir_pages; i++)
+		swap_free(swsusp_info.pagedir[i]);
+}
+
+
+/**
+ *	write_pagedir - Write the array of pages holding the page directory.
+ *	@last:	Last swap entry we write (needed for header).
+ */
+
+static int write_pagedir(void)
+{
+	int error = 0;
+	unsigned n = 0;
+	struct pbe * pbe;
+
+	printk( "Writing pagedir...");
+	for_each_pb_page(pbe, pagedir_nosave) {
+		if ((error = write_page((unsigned long)pbe, &swsusp_info.pagedir[n++])))
+			return error;
+	}
+
+	swsusp_info.pagedir_pages = n;
+	printk("done (%u pages)\n", n);
+	return error;
+}
+
+/**
+ *	write_suspend_image - Write entire image and metadata.
+ *
+ */
+
+static int write_suspend_image(void)
+{
+	int error;
+
+	init_header();
+	if ((error = data_write()))
+		goto FreeData;
+
+	if ((error = write_pagedir()))
+		goto FreePagedir;
+
+	if ((error = close_swap()))
+		goto FreePagedir;
+ Done:
+	return error;
+ FreePagedir:
+	free_pagedir_entries();
+ FreeData:
+	data_free();
+	goto Done;
+}
+
+
+#ifdef CONFIG_HIGHMEM
+struct highmem_page {
+	char *data;
+	struct page *page;
+	struct highmem_page *next;
+};
+
+static struct highmem_page *highmem_copy;
+
+static int save_highmem_zone(struct zone *zone)
+{
+	unsigned long zone_pfn;
+	mark_free_pages(zone);
+	for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
+		struct page *page;
+		struct highmem_page *save;
+		void *kaddr;
+		unsigned long pfn = zone_pfn + zone->zone_start_pfn;
+
+		if (!(pfn%1000))
+			printk(".");
+		if (!pfn_valid(pfn))
+			continue;
+		page = pfn_to_page(pfn);
+		/*
+		 * This condition results from rvmalloc() sans vmalloc_32()
+		 * and architectural memory reservations. This should be
+		 * corrected eventually when the cases giving rise to this
+		 * are better understood.
+		 */
+		if (PageReserved(page)) {
+			printk("highmem reserved page?!\n");
+			continue;
+		}
+		BUG_ON(PageNosave(page));
+		if (PageNosaveFree(page))
+			continue;
+		save = kmalloc(sizeof(struct highmem_page), GFP_ATOMIC);
+		if (!save)
+			return -ENOMEM;
+		save->next = highmem_copy;
+		save->page = page;
+		save->data = (void *) get_zeroed_page(GFP_ATOMIC);
+		if (!save->data) {
+			kfree(save);
+			return -ENOMEM;
+		}
+		kaddr = kmap_atomic(page, KM_USER0);
+		memcpy(save->data, kaddr, PAGE_SIZE);
+		kunmap_atomic(kaddr, KM_USER0);
+		highmem_copy = save;
+	}
+	return 0;
+}
+#endif /* CONFIG_HIGHMEM */
+
+
+static int save_highmem(void)
+{
+#ifdef CONFIG_HIGHMEM
+	struct zone *zone;
+	int res = 0;
+
+	pr_debug("swsusp: Saving Highmem\n");
+	for_each_zone(zone) {
+		if (is_highmem(zone))
+			res = save_highmem_zone(zone);
+		if (res)
+			return res;
+	}
+#endif
+	return 0;
+}
+
+static int restore_highmem(void)
+{
+#ifdef CONFIG_HIGHMEM
+	printk("swsusp: Restoring Highmem\n");
+	while (highmem_copy) {
+		struct highmem_page *save = highmem_copy;
+		void *kaddr;
+		highmem_copy = save->next;
+
+		kaddr = kmap_atomic(save->page, KM_USER0);
+		memcpy(kaddr, save->data, PAGE_SIZE);
+		kunmap_atomic(kaddr, KM_USER0);
+		free_page((long) save->data);
+		kfree(save);
+	}
+#endif
+	return 0;
+}
+
+
+static int pfn_is_nosave(unsigned long pfn)
+{
+	unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT;
+	unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT;
+	return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn);
+}
+
+/**
+ *	saveable - Determine whether a page should be cloned or not.
+ *	@pfn:	The page
+ *
+ *	We save a page if it's Reserved, and not in the range of pages
+ *	statically defined as 'unsaveable', or if it isn't reserved, and
+ *	isn't part of a free chunk of pages.
+ */
+
+static int saveable(struct zone * zone, unsigned long * zone_pfn)
+{
+	unsigned long pfn = *zone_pfn + zone->zone_start_pfn;
+	struct page * page;
+
+	if (!pfn_valid(pfn))
+		return 0;
+
+	page = pfn_to_page(pfn);
+	BUG_ON(PageReserved(page) && PageNosave(page));
+	if (PageNosave(page))
+		return 0;
+	if (PageReserved(page) && pfn_is_nosave(pfn)) {
+		pr_debug("[nosave pfn 0x%lx]", pfn);
+		return 0;
+	}
+	if (PageNosaveFree(page))
+		return 0;
+
+	return 1;
+}
+
+static void count_data_pages(void)
+{
+	struct zone *zone;
+	unsigned long zone_pfn;
+
+	nr_copy_pages = 0;
+
+	for_each_zone(zone) {
+		if (is_highmem(zone))
+			continue;
+		mark_free_pages(zone);
+		for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
+			nr_copy_pages += saveable(zone, &zone_pfn);
+	}
+}
+
+
+static void copy_data_pages(void)
+{
+	struct zone *zone;
+	unsigned long zone_pfn;
+	struct pbe * pbe = pagedir_nosave;
+	
+	pr_debug("copy_data_pages(): pages to copy: %d\n", nr_copy_pages);
+	for_each_zone(zone) {
+		if (is_highmem(zone))
+			continue;
+		mark_free_pages(zone);
+		for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
+			if (saveable(zone, &zone_pfn)) {
+				struct page * page;
+				page = pfn_to_page(zone_pfn + zone->zone_start_pfn);
+				BUG_ON(!pbe);
+				pbe->orig_address = (long) page_address(page);
+				/* copy_page is not usable for copying task structs. */
+				memcpy((void *)pbe->address, (void *)pbe->orig_address, PAGE_SIZE);
+				pbe = pbe->next;
+			}
+		}
+	}
+	BUG_ON(pbe);
+}
+
+
+/**
+ *	calc_nr - Determine the number of pages needed for a pbe list.
+ */
+
+static int calc_nr(int nr_copy)
+{
+	int extra = 0;
+	int mod = !!(nr_copy % PBES_PER_PAGE);
+	int diff = (nr_copy / PBES_PER_PAGE) + mod;
+
+	do {
+		extra += diff;
+		nr_copy += diff;
+		mod = !!(nr_copy % PBES_PER_PAGE);
+		diff = (nr_copy / PBES_PER_PAGE) + mod - extra;
+	} while (diff > 0);
+
+	return nr_copy;
+}
+
+/**
+ *	free_pagedir - free pages allocated with alloc_pagedir()
+ */
+
+static inline void free_pagedir(struct pbe *pblist)
+{
+	struct pbe *pbe;
+
+	while (pblist) {
+		pbe = (pblist + PB_PAGE_SKIP)->next;
+		free_page((unsigned long)pblist);
+		pblist = pbe;
+	}
+}
+
+/**
+ *	fill_pb_page - Create a list of PBEs on a given memory page
+ */
+
+static inline void fill_pb_page(struct pbe *pbpage)
+{
+	struct pbe *p;
+
+	p = pbpage;
+	pbpage += PB_PAGE_SKIP;
+	do
+		p->next = p + 1;
+	while (++p < pbpage);
+}
+
+/**
+ *	create_pbe_list - Create a list of PBEs on top of a given chain
+ *	of memory pages allocated with alloc_pagedir()
+ */
+
+static void create_pbe_list(struct pbe *pblist, unsigned nr_pages)
+{
+	struct pbe *pbpage, *p;
+	unsigned num = PBES_PER_PAGE;
+
+	for_each_pb_page (pbpage, pblist) {
+		if (num >= nr_pages)
+			break;
+
+		fill_pb_page(pbpage);
+		num += PBES_PER_PAGE;
+	}
+	if (pbpage) {
+		for (num -= PBES_PER_PAGE - 1, p = pbpage; num < nr_pages; p++, num++)
+			p->next = p + 1;
+		p->next = NULL;
+	}
+	pr_debug("create_pbe_list(): initialized %d PBEs\n", num);
+}
+
+/**
+ *	alloc_pagedir - Allocate the page directory.
+ *
+ *	First, determine exactly how many pages we need and
+ *	allocate them.
+ *
+ *	We arrange the pages in a chain: each page is an array of PBES_PER_PAGE
+ *	struct pbe elements (pbes) and the last element in the page points
+ *	to the next page.
+ *
+ *	On each page we set up a list of struct_pbe elements.
+ */
+
+static struct pbe * alloc_pagedir(unsigned nr_pages)
+{
+	unsigned num;
+	struct pbe *pblist, *pbe;
+
+	if (!nr_pages)
+		return NULL;
+
+	pr_debug("alloc_pagedir(): nr_pages = %d\n", nr_pages);
+	pblist = (struct pbe *)get_zeroed_page(GFP_ATOMIC | __GFP_COLD);
+	for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages;
+        		pbe = pbe->next, num += PBES_PER_PAGE) {
+		pbe += PB_PAGE_SKIP;
+		pbe->next = (struct pbe *)get_zeroed_page(GFP_ATOMIC | __GFP_COLD);
+	}
+	if (!pbe) { /* get_zeroed_page() failed */
+		free_pagedir(pblist);
+		pblist = NULL;
+        }
+	return pblist;
+}
+
+/**
+ *	free_image_pages - Free pages allocated for snapshot
+ */
+
+static void free_image_pages(void)
+{
+	struct pbe * p;
+
+	for_each_pbe(p, pagedir_save) {
+		if (p->address) {
+			ClearPageNosave(virt_to_page(p->address));
+			free_page(p->address);
+			p->address = 0;
+		}
+	}
+}
+
+/**
+ *	alloc_image_pages - Allocate pages for the snapshot.
+ */
+
+static int alloc_image_pages(void)
+{
+	struct pbe * p;
+
+	for_each_pbe(p, pagedir_save) {
+		p->address = get_zeroed_page(GFP_ATOMIC | __GFP_COLD);
+		if (!p->address)
+			return -ENOMEM;
+		SetPageNosave(virt_to_page(p->address));
+	}
+	return 0;
+}
+
+void swsusp_free(void)
+{
+	BUG_ON(PageNosave(virt_to_page(pagedir_save)));
+	BUG_ON(PageNosaveFree(virt_to_page(pagedir_save)));
+	free_image_pages();
+	free_pagedir(pagedir_save);
+}
+
+
+/**
+ *	enough_free_mem - Make sure we enough free memory to snapshot.
+ *
+ *	Returns TRUE or FALSE after checking the number of available 
+ *	free pages.
+ */
+
+static int enough_free_mem(void)
+{
+	if (nr_free_pages() < (nr_copy_pages + PAGES_FOR_IO)) {
+		pr_debug("swsusp: Not enough free pages: Have %d\n",
+			 nr_free_pages());
+		return 0;
+	}
+	return 1;
+}
+
+
+/**
+ *	enough_swap - Make sure we have enough swap to save the image.
+ *
+ *	Returns TRUE or FALSE after checking the total amount of swap 
+ *	space avaiable.
+ *
+ *	FIXME: si_swapinfo(&i) returns all swap devices information.
+ *	We should only consider resume_device. 
+ */
+
+static int enough_swap(void)
+{
+	struct sysinfo i;
+
+	si_swapinfo(&i);
+	if (i.freeswap < (nr_copy_pages + PAGES_FOR_IO))  {
+		pr_debug("swsusp: Not enough swap. Need %ld\n",i.freeswap);
+		return 0;
+	}
+	return 1;
+}
+
+static int swsusp_alloc(void)
+{
+	int error;
+
+	pr_debug("suspend: (pages needed: %d + %d free: %d)\n",
+		 nr_copy_pages, PAGES_FOR_IO, nr_free_pages());
+
+	pagedir_nosave = NULL;
+	if (!enough_free_mem())
+		return -ENOMEM;
+
+	if (!enough_swap())
+		return -ENOSPC;
+
+	nr_copy_pages = calc_nr(nr_copy_pages);
+
+	if (!(pagedir_save = alloc_pagedir(nr_copy_pages))) {
+		printk(KERN_ERR "suspend: Allocating pagedir failed.\n");
+		return -ENOMEM;
+	}
+	create_pbe_list(pagedir_save, nr_copy_pages);
+	pagedir_nosave = pagedir_save;
+	if ((error = alloc_image_pages())) {
+		printk(KERN_ERR "suspend: Allocating image pages failed.\n");
+		swsusp_free();
+		return error;
+	}
+
+	nr_copy_pages_check = nr_copy_pages;
+	return 0;
+}
+
+static int suspend_prepare_image(void)
+{
+	int error;
+
+	pr_debug("swsusp: critical section: \n");
+	if (save_highmem()) {
+		printk(KERN_CRIT "Suspend machine: Not enough free pages for highmem\n");
+		restore_highmem();
+		return -ENOMEM;
+	}
+
+	drain_local_pages();
+	count_data_pages();
+	printk("swsusp: Need to copy %u pages\n", nr_copy_pages);
+
+	error = swsusp_alloc();
+	if (error)
+		return error;
+	
+	/* During allocating of suspend pagedir, new cold pages may appear. 
+	 * Kill them.
+	 */
+	drain_local_pages();
+	copy_data_pages();
+
+	/*
+	 * End of critical section. From now on, we can write to memory,
+	 * but we should not touch disk. This specially means we must _not_
+	 * touch swap space! Except we must write out our image of course.
+	 */
+
+	printk("swsusp: critical section/: done (%d pages copied)\n", nr_copy_pages );
+	return 0;
+}
+
+
+/* It is important _NOT_ to umount filesystems at this point. We want
+ * them synced (in case something goes wrong) but we DO not want to mark
+ * filesystem clean: it is not. (And it does not matter, if we resume
+ * correctly, we'll mark system clean, anyway.)
+ */
+int swsusp_write(void)
+{
+	int error;
+	device_resume();
+	lock_swapdevices();
+	error = write_suspend_image();
+	/* This will unlock ignored swap devices since writing is finished */
+	lock_swapdevices();
+	return error;
+
+}
+
+
+extern asmlinkage int swsusp_arch_suspend(void);
+extern asmlinkage int swsusp_arch_resume(void);
+
+
+asmlinkage int swsusp_save(void)
+{
+	int error = 0;
+
+	if ((error = swsusp_swap_check())) {
+		printk(KERN_ERR "swsusp: FATAL: cannot find swap device, try "
+				"swapon -a!\n");
+		return error;
+	}
+	return suspend_prepare_image();
+}
+
+int swsusp_suspend(void)
+{
+	int error;
+	if ((error = arch_prepare_suspend()))
+		return error;
+	local_irq_disable();
+	/* At this point, device_suspend() has been called, but *not*
+	 * device_power_down(). We *must* device_power_down() now.
+	 * Otherwise, drivers for some devices (e.g. interrupt controllers)
+	 * become desynchronized with the actual state of the hardware
+	 * at resume time, and evil weirdness ensues.
+	 */
+	if ((error = device_power_down(PMSG_FREEZE))) {
+		printk(KERN_ERR "Some devices failed to power down, aborting suspend\n");
+		local_irq_enable();
+		swsusp_free();
+		return error;
+	}
+	save_processor_state();
+	if ((error = swsusp_arch_suspend()))
+		swsusp_free();
+	/* Restore control flow magically appears here */
+	restore_processor_state();
+	BUG_ON (nr_copy_pages_check != nr_copy_pages);
+	restore_highmem();
+	device_power_up();
+	local_irq_enable();
+	return error;
+}
+
+int swsusp_resume(void)
+{
+	int error;
+	local_irq_disable();
+	if (device_power_down(PMSG_FREEZE))
+		printk(KERN_ERR "Some devices failed to power down, very bad\n");
+	/* We'll ignore saved state, but this gets preempt count (etc) right */
+	save_processor_state();
+	error = swsusp_arch_resume();
+	/* Code below is only ever reached in case of failure. Otherwise
+	 * execution continues at place where swsusp_arch_suspend was called
+         */
+	BUG_ON(!error);
+	restore_processor_state();
+	restore_highmem();
+	device_power_up();
+	local_irq_enable();
+	return error;
+}
+
+/* More restore stuff */
+
+/*
+ * Returns true if given address/order collides with any orig_address 
+ */
+static int does_collide_order(unsigned long addr, int order)
+{
+	int i;
+	
+	for (i=0; i < (1<<order); i++)
+		if (!PageNosaveFree(virt_to_page(addr + i * PAGE_SIZE)))
+			return 1;
+	return 0;
+}
+
+/**
+ *	On resume, for storing the PBE list and the image,
+ *	we can only use memory pages that do not conflict with the pages
+ *	which had been used before suspend.
+ *
+ *	We don't know which pages are usable until we allocate them.
+ *
+ *	Allocated but unusable (ie eaten) memory pages are linked together
+ *	to create a list, so that we can free them easily
+ *
+ *	We could have used a type other than (void *)
+ *	for this purpose, but ...
+ */
+static void **eaten_memory = NULL;
+
+static inline void eat_page(void *page)
+{
+	void **c;
+
+	c = eaten_memory;
+	eaten_memory = page;
+	*eaten_memory = c;
+}
+
+static unsigned long get_usable_page(unsigned gfp_mask)
+{
+	unsigned long m;
+
+	m = get_zeroed_page(gfp_mask);
+	while (does_collide_order(m, 0)) {
+		eat_page((void *)m);
+		m = get_zeroed_page(gfp_mask);
+		if (!m)
+			break;
+	}
+	return m;
+}
+
+static void free_eaten_memory(void)
+{
+	unsigned long m;
+	void **c;
+	int i = 0;
+
+	c = eaten_memory;
+	while (c) {
+		m = (unsigned long)c;
+		c = *c;
+		free_page(m);
+		i++;
+	}
+	eaten_memory = NULL;
+	pr_debug("swsusp: %d unused pages freed\n", i);
+}
+
+/**
+ *	check_pagedir - We ensure here that pages that the PBEs point to
+ *	won't collide with pages where we're going to restore from the loaded
+ *	pages later
+ */
+
+static int check_pagedir(struct pbe *pblist)
+{
+	struct pbe *p;
+
+	/* This is necessary, so that we can free allocated pages
+	 * in case of failure
+	 */
+	for_each_pbe (p, pblist)
+		p->address = 0UL;
+
+	for_each_pbe (p, pblist) {
+		p->address = get_usable_page(GFP_ATOMIC);
+		if (!p->address)
+			return -ENOMEM;
+	}
+	return 0;
+}
+
+/**
+ *	swsusp_pagedir_relocate - It is possible, that some memory pages
+ *	occupied by the list of PBEs collide with pages where we're going to
+ *	restore from the loaded pages later.  We relocate them here.
+ */
+
+static struct pbe * swsusp_pagedir_relocate(struct pbe *pblist)
+{
+	struct zone *zone;
+	unsigned long zone_pfn;
+	struct pbe *pbpage, *tail, *p;
+	void *m;
+	int rel = 0, error = 0;
+
+	if (!pblist) /* a sanity check */
+		return NULL;
+
+	pr_debug("swsusp: Relocating pagedir (%lu pages to check)\n",
+			swsusp_info.pagedir_pages);
+
+	/* Set page flags */
+
+	for_each_zone(zone) {
+        	for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
+                	SetPageNosaveFree(pfn_to_page(zone_pfn +
+					zone->zone_start_pfn));
+	}
+
+	/* Clear orig addresses */
+
+	for_each_pbe (p, pblist)
+		ClearPageNosaveFree(virt_to_page(p->orig_address));
+
+	tail = pblist + PB_PAGE_SKIP;
+
+	/* Relocate colliding pages */
+
+	for_each_pb_page (pbpage, pblist) {
+		if (does_collide_order((unsigned long)pbpage, 0)) {
+			m = (void *)get_usable_page(GFP_ATOMIC | __GFP_COLD);
+			if (!m) {
+				error = -ENOMEM;
+				break;
+			}
+			memcpy(m, (void *)pbpage, PAGE_SIZE);
+			if (pbpage == pblist)
+				pblist = (struct pbe *)m;
+			else
+				tail->next = (struct pbe *)m;
+
+			eat_page((void *)pbpage);
+			pbpage = (struct pbe *)m;
+
+			/* We have to link the PBEs again */
+
+			for (p = pbpage; p < pbpage + PB_PAGE_SKIP; p++)
+				if (p->next) /* needed to save the end */
+					p->next = p + 1;
+
+			rel++;
+		}
+		tail = pbpage + PB_PAGE_SKIP;
+	}
+
+	if (error) {
+		printk("\nswsusp: Out of memory\n\n");
+		free_pagedir(pblist);
+		free_eaten_memory();
+		pblist = NULL;
+	}
+	else
+		printk("swsusp: Relocated %d pages\n", rel);
+
+	return pblist;
+}
+
+/**
+ *	Using bio to read from swap.
+ *	This code requires a bit more work than just using buffer heads
+ *	but, it is the recommended way for 2.5/2.6.
+ *	The following are to signal the beginning and end of I/O. Bios
+ *	finish asynchronously, while we want them to happen synchronously.
+ *	A simple atomic_t, and a wait loop take care of this problem.
+ */
+
+static atomic_t io_done = ATOMIC_INIT(0);
+
+static int end_io(struct bio * bio, unsigned int num, int err)
+{
+	if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
+		panic("I/O error reading memory image");
+	atomic_set(&io_done, 0);
+	return 0;
+}
+
+static struct block_device * resume_bdev;
+
+/**
+ *	submit - submit BIO request.
+ *	@rw:	READ or WRITE.
+ *	@off	physical offset of page.
+ *	@page:	page we're reading or writing.
+ *
+ *	Straight from the textbook - allocate and initialize the bio.
+ *	If we're writing, make sure the page is marked as dirty.
+ *	Then submit it and wait.
+ */
+
+static int submit(int rw, pgoff_t page_off, void * page)
+{
+	int error = 0;
+	struct bio * bio;
+
+	bio = bio_alloc(GFP_ATOMIC, 1);
+	if (!bio)
+		return -ENOMEM;
+	bio->bi_sector = page_off * (PAGE_SIZE >> 9);
+	bio_get(bio);
+	bio->bi_bdev = resume_bdev;
+	bio->bi_end_io = end_io;
+
+	if (bio_add_page(bio, virt_to_page(page), PAGE_SIZE, 0) < PAGE_SIZE) {
+		printk("swsusp: ERROR: adding page to bio at %ld\n",page_off);
+		error = -EFAULT;
+		goto Done;
+	}
+
+	if (rw == WRITE)
+		bio_set_pages_dirty(bio);
+
+	atomic_set(&io_done, 1);
+	submit_bio(rw | (1 << BIO_RW_SYNC), bio);
+	while (atomic_read(&io_done))
+		yield();
+
+ Done:
+	bio_put(bio);
+	return error;
+}
+
+static int bio_read_page(pgoff_t page_off, void * page)
+{
+	return submit(READ, page_off, page);
+}
+
+static int bio_write_page(pgoff_t page_off, void * page)
+{
+	return submit(WRITE, page_off, page);
+}
+
+/*
+ * Sanity check if this image makes sense with this kernel/swap context
+ * I really don't think that it's foolproof but more than nothing..
+ */
+
+static const char * sanity_check(void)
+{
+	dump_info();
+	if(swsusp_info.version_code != LINUX_VERSION_CODE)
+		return "kernel version";
+	if(swsusp_info.num_physpages != num_physpages)
+		return "memory size";
+	if (strcmp(swsusp_info.uts.sysname,system_utsname.sysname))
+		return "system type";
+	if (strcmp(swsusp_info.uts.release,system_utsname.release))
+		return "kernel release";
+	if (strcmp(swsusp_info.uts.version,system_utsname.version))
+		return "version";
+	if (strcmp(swsusp_info.uts.machine,system_utsname.machine))
+		return "machine";
+	if(swsusp_info.cpus != num_online_cpus())
+		return "number of cpus";
+	return NULL;
+}
+
+
+static int check_header(void)
+{
+	const char * reason = NULL;
+	int error;
+
+	if ((error = bio_read_page(swp_offset(swsusp_header.swsusp_info), &swsusp_info)))
+		return error;
+
+ 	/* Is this same machine? */
+	if ((reason = sanity_check())) {
+		printk(KERN_ERR "swsusp: Resume mismatch: %s\n",reason);
+		return -EPERM;
+	}
+	nr_copy_pages = swsusp_info.image_pages;
+	return error;
+}
+
+static int check_sig(void)
+{
+	int error;
+
+	memset(&swsusp_header, 0, sizeof(swsusp_header));
+	if ((error = bio_read_page(0, &swsusp_header)))
+		return error;
+	if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) {
+		memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10);
+
+		/*
+		 * Reset swap signature now.
+		 */
+		error = bio_write_page(0, &swsusp_header);
+	} else { 
+		printk(KERN_ERR "swsusp: Suspend partition has wrong signature?\n");
+		return -EINVAL;
+	}
+	if (!error)
+		pr_debug("swsusp: Signature found, resuming\n");
+	return error;
+}
+
+/**
+ *	data_read - Read image pages from swap.
+ *
+ *	You do not need to check for overlaps, check_pagedir()
+ *	already did that.
+ */
+
+static int data_read(struct pbe *pblist)
+{
+	struct pbe * p;
+	int error = 0;
+	int i = 0;
+	int mod = swsusp_info.image_pages / 100;
+
+	if (!mod)
+		mod = 1;
+
+	printk("swsusp: Reading image data (%lu pages):     ",
+			swsusp_info.image_pages);
+
+	for_each_pbe (p, pblist) {
+		if (!(i % mod))
+			printk("\b\b\b\b%3d%%", i / mod);
+
+		error = bio_read_page(swp_offset(p->swap_address),
+				  (void *)p->address);
+		if (error)
+			return error;
+
+		i++;
+	}
+	printk("\b\b\b\bdone\n");
+	return error;
+}
+
+extern dev_t name_to_dev_t(const char *line);
+
+/**
+ *	read_pagedir - Read page backup list pages from swap
+ */
+
+static int read_pagedir(struct pbe *pblist)
+{
+	struct pbe *pbpage, *p;
+	unsigned i = 0;
+	int error;
+
+	if (!pblist)
+		return -EFAULT;
+
+	printk("swsusp: Reading pagedir (%lu pages)\n",
+			swsusp_info.pagedir_pages);
+
+	for_each_pb_page (pbpage, pblist) {
+		unsigned long offset = swp_offset(swsusp_info.pagedir[i++]);
+
+		error = -EFAULT;
+		if (offset) {
+			p = (pbpage + PB_PAGE_SKIP)->next;
+			error = bio_read_page(offset, (void *)pbpage);
+			(pbpage + PB_PAGE_SKIP)->next = p;
+		}
+		if (error)
+			break;
+	}
+
+	if (error)
+		free_page((unsigned long)pblist);
+
+	BUG_ON(i != swsusp_info.pagedir_pages);
+
+	return error;
+}
+
+
+static int check_suspend_image(void)
+{
+	int error = 0;
+
+	if ((error = check_sig()))
+		return error;
+
+	if ((error = check_header()))
+		return error;
+
+	return 0;
+}
+
+static int read_suspend_image(void)
+{
+	int error = 0;
+	struct pbe *p;
+
+	if (!(p = alloc_pagedir(nr_copy_pages)))
+		return -ENOMEM;
+
+	if ((error = read_pagedir(p)))
+		return error;
+
+	create_pbe_list(p, nr_copy_pages);
+
+	if (!(pagedir_nosave = swsusp_pagedir_relocate(p)))
+		return -ENOMEM;
+
+	/* Allocate memory for the image and read the data from swap */
+
+	error = check_pagedir(pagedir_nosave);
+	free_eaten_memory();
+	if (!error)
+		error = data_read(pagedir_nosave);
+
+	if (error) { /* We fail cleanly */
+		for_each_pbe (p, pagedir_nosave)
+			if (p->address) {
+				free_page(p->address);
+				p->address = 0UL;
+			}
+		free_pagedir(pagedir_nosave);
+	}
+	return error;
+}
+
+/**
+ *      swsusp_check - Check for saved image in swap
+ */
+
+int swsusp_check(void)
+{
+	int error;
+
+	if (!swsusp_resume_device) {
+		if (!strlen(resume_file))
+			return -ENOENT;
+		swsusp_resume_device = name_to_dev_t(resume_file);
+		pr_debug("swsusp: Resume From Partition %s\n", resume_file);
+	} else {
+		pr_debug("swsusp: Resume From Partition %d:%d\n",
+			 MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device));
+	}
+
+	resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ);
+	if (!IS_ERR(resume_bdev)) {
+		set_blocksize(resume_bdev, PAGE_SIZE);
+		error = check_suspend_image();
+		if (error)
+		    blkdev_put(resume_bdev);
+	} else
+		error = PTR_ERR(resume_bdev);
+
+	if (!error)
+		pr_debug("swsusp: resume file found\n");
+	else
+		pr_debug("swsusp: Error %d check for resume file\n", error);
+	return error;
+}
+
+/**
+ *	swsusp_read - Read saved image from swap.
+ */
+
+int swsusp_read(void)
+{
+	int error;
+
+	if (IS_ERR(resume_bdev)) {
+		pr_debug("swsusp: block device not initialised\n");
+		return PTR_ERR(resume_bdev);
+	}
+
+	error = read_suspend_image();
+	blkdev_put(resume_bdev);
+
+	if (!error)
+		pr_debug("swsusp: Reading resume file was successful\n");
+	else
+		pr_debug("swsusp: Error %d resuming\n", error);
+	return error;
+}
+
+/**
+ *	swsusp_close - close swap device.
+ */
+
+void swsusp_close(void)
+{
+	if (IS_ERR(resume_bdev)) {
+		pr_debug("swsusp: block device not initialised\n");
+		return;
+	}
+
+	blkdev_put(resume_bdev);
+}
author	Linus Torvalds <torvalds@ppc970.osdl.org>	2005-04-17 02:20:36 +0400
committer	Linus Torvalds <torvalds@ppc970.osdl.org>	2005-04-17 02:20:36 +0400
commit	1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree	0bba044c4ce775e45a88a51686b5d9f90697ea9d /kernel/power
download	linux-1da177e4c3f41524e886b7f1b8a0c1fc7321cac2.tar.xz