From 1e66df3ee301209f4a38df097d7cc5cb9b367a3f Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Jul 2007 18:37:02 -0700 Subject: add kstrndup Add a kstrndup function, modelled on strndup. Like strndup this returns a string copied into its own allocated memory, but it copies no more than the specified number of bytes from the source. Remove private strndup() from irda code. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Chris Wright Cc: Andrew Morton Cc: Randy Dunlap Cc: YOSHIFUJI Hideaki Cc: Akinobu Mita Cc: Arnaldo Carvalho de Melo Cc: Al Viro Cc: Panagiotis Issaris Cc: Rene Scharfe --- include/linux/string.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/string.h b/include/linux/string.h index 7f2eb6a477f9..ee5e9ccc4aae 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -105,6 +105,7 @@ extern void * memchr(const void *,int,__kernel_size_t); #endif extern char *kstrdup(const char *s, gfp_t gfp); +extern char *kstrndup(const char *s, size_t len, gfp_t gfp); extern void *kmemdup(const void *src, size_t len, gfp_t gfp); #ifdef __cplusplus -- cgit v1.2.3 From d84d1cc7647c7e4f77d517e2d87b4a106a0420d9 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Jul 2007 18:37:02 -0700 Subject: add argv_split() argv_split() is a helper function which takes a string, splits it at whitespace, and returns a NULL-terminated argv vector. This is deliberately simple - it does no quote processing of any kind. [ Seems to me that this is something which is already being done in the kernel, but I couldn't find any other implementations, either to steal or replace. Keep an eye out. ] Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Chris Wright Cc: Andrew Morton Cc: Randy Dunlap --- include/linux/string.h | 3 ++ lib/Makefile | 2 +- lib/argv_split.c | 105 +++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 109 insertions(+), 1 deletion(-) create mode 100644 lib/argv_split.c (limited to 'include/linux') diff --git a/include/linux/string.h b/include/linux/string.h index ee5e9ccc4aae..836062b7582a 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -108,6 +108,9 @@ extern char *kstrdup(const char *s, gfp_t gfp); extern char *kstrndup(const char *s, size_t len, gfp_t gfp); extern void *kmemdup(const void *src, size_t len, gfp_t gfp); +extern char **argv_split(gfp_t gfp, const char *str, int *argcp); +extern void argv_free(char **argv); + #ifdef __cplusplus } #endif diff --git a/lib/Makefile b/lib/Makefile index da68b2ca0606..614966387402 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -5,7 +5,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \ rbtree.o radix-tree.o dump_stack.o \ idr.o int_sqrt.o bitmap.o extable.o prio_tree.o \ - sha1.o irq_regs.o reciprocal_div.o + sha1.o irq_regs.o reciprocal_div.o argv_split.o lib-$(CONFIG_MMU) += ioremap.o lib-$(CONFIG_SMP) += cpumask.o diff --git a/lib/argv_split.c b/lib/argv_split.c new file mode 100644 index 000000000000..4096ed42f490 --- /dev/null +++ b/lib/argv_split.c @@ -0,0 +1,105 @@ +/* + * Helper function for splitting a string into an argv-like array. 
+ */ + +#include +#include +#include + +static const char *skip_sep(const char *cp) +{ + while (*cp && isspace(*cp)) + cp++; + + return cp; +} + +static const char *skip_arg(const char *cp) +{ + while (*cp && !isspace(*cp)) + cp++; + + return cp; +} + +static int count_argc(const char *str) +{ + int count = 0; + + while (*str) { + str = skip_sep(str); + if (*str) { + count++; + str = skip_arg(str); + } + } + + return count; +} + +/** + * argv_free - free an argv + * @argv - the argument vector to be freed + * + * Frees an argv and the strings it points to. + */ +void argv_free(char **argv) +{ + char **p; + for (p = argv; *p; p++) + kfree(*p); + + kfree(argv); +} +EXPORT_SYMBOL(argv_free); + +/** + * argv_split - split a string at whitespace, returning an argv + * @gfp: the GFP mask used to allocate memory + * @str: the string to be split + * @argcp: returned argument count + * + * Returns an array of pointers to strings which are split out from + * @str. This is performed by strictly splitting on white-space; no + * quote processing is performed. Multiple whitespace characters are + * considered to be a single argument separator. The returned array + * is always NULL-terminated. Returns NULL on memory allocation + * failure. + */ +char **argv_split(gfp_t gfp, const char *str, int *argcp) +{ + int argc = count_argc(str); + char **argv = kzalloc(sizeof(*argv) * (argc+1), gfp); + char **argvp; + + if (argv == NULL) + goto out; + + *argcp = argc; + argvp = argv; + + while (*str) { + str = skip_sep(str); + + if (*str) { + const char *p = str; + char *t; + + str = skip_arg(str); + + t = kstrndup(p, str-p, gfp); + if (t == NULL) + goto fail; + *argvp++ = t; + } + } + *argvp = NULL; + + out: + return argv; + + fail: + argv_free(argv); + return NULL; +} +EXPORT_SYMBOL(argv_split); -- cgit v1.2.3 From 0ab4dc92278a0f3816e486d6350c6652a72e06c8 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Jul 2007 18:37:02 -0700 Subject: usermodehelper: split setup from execution Rather than having hundreds of variations of call_usermodehelper for various pieces of usermode state which could be set up, split the info allocation and initialization from the actual process execution. This means the general pattern becomes: info = call_usermodehelper_setup(path, argv, envp); /* basic state */ call_usermodehelper_(info, stuff...); /* extra state */ call_usermodehelper_exec(info, wait); /* run process and free info */ This patch introduces wrappers for all the existing calling styles for call_usermodehelper_*, but folds their implementations into one. Signed-off-by: Jeremy Fitzhardinge Cc: Andi Kleen Cc: Rusty Russell Cc: David Howells Cc: Bj?rn Steinbrink Cc: Randy Dunlap --- include/linux/kmod.h | 44 +++++++++++- kernel/kmod.c | 191 ++++++++++++++++++++++++++++++++++++--------------- 2 files changed, 176 insertions(+), 59 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kmod.h b/include/linux/kmod.h index 10f505c8431d..c4cbe59d9c67 100644 --- a/include/linux/kmod.h +++ b/include/linux/kmod.h @@ -36,13 +36,51 @@ static inline int request_module(const char * name, ...) { return -ENOSYS; } #define try_then_request_module(x, mod...) 
((x) ?: (request_module(mod), (x))) struct key; -extern int call_usermodehelper_keys(char *path, char *argv[], char *envp[], - struct key *session_keyring, int wait); +struct file; +struct subprocess_info; + +/* Allocate a subprocess_info structure */ +struct subprocess_info *call_usermodehelper_setup(char *path, + char **argv, char **envp); + +/* Set various pieces of state into the subprocess_info structure */ +void call_usermodehelper_setkeys(struct subprocess_info *info, + struct key *session_keyring); +int call_usermodehelper_stdinpipe(struct subprocess_info *sub_info, + struct file **filp); +void call_usermodehelper_setcleanup(struct subprocess_info *info, + void (*cleanup)(char **argv, char **envp)); + +/* Actually execute the sub-process */ +int call_usermodehelper_exec(struct subprocess_info *info, int wait); + +/* Free the subprocess_info. This is only needed if you're not going + to call call_usermodehelper_exec */ +void call_usermodehelper_freeinfo(struct subprocess_info *info); static inline int call_usermodehelper(char *path, char **argv, char **envp, int wait) { - return call_usermodehelper_keys(path, argv, envp, NULL, wait); + struct subprocess_info *info; + + info = call_usermodehelper_setup(path, argv, envp); + if (info == NULL) + return -ENOMEM; + return call_usermodehelper_exec(info, wait); +} + +static inline int +call_usermodehelper_keys(char *path, char **argv, char **envp, + struct key *session_keyring, int wait) +{ + struct subprocess_info *info; + + info = call_usermodehelper_setup(path, argv, envp); + if (info == NULL) + return -ENOMEM; + + call_usermodehelper_setkeys(info, session_keyring); + return call_usermodehelper_exec(info, wait); } extern void usermodehelper_init(void); diff --git a/kernel/kmod.c b/kernel/kmod.c index 4d32eb077179..d2dce71115d8 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -122,6 +122,7 @@ struct subprocess_info { int wait; int retval; struct file *stdin; + void (*cleanup)(char **argv, char **envp); }; /* @@ -180,6 +181,14 @@ static int ____call_usermodehelper(void *data) do_exit(0); } +void call_usermodehelper_freeinfo(struct subprocess_info *info) +{ + if (info->cleanup) + (*info->cleanup)(info->argv, info->envp); + kfree(info); +} +EXPORT_SYMBOL(call_usermodehelper_freeinfo); + /* Keventd can't block, but this (a child) can. */ static int wait_for_helper(void *data) { @@ -217,7 +226,7 @@ static int wait_for_helper(void *data) } if (sub_info->wait < 0) - kfree(sub_info); + call_usermodehelper_freeinfo(sub_info); else complete(sub_info->complete); return 0; @@ -252,11 +261,94 @@ static void __call_usermodehelper(struct work_struct *work) } /** - * call_usermodehelper_keys - start a usermode application - * @path: pathname for the application - * @argv: null-terminated argument list - * @envp: null-terminated environment list - * @session_keyring: session keyring for process (NULL for an empty keyring) + * call_usermodehelper_setup - prepare to call a usermode helper + * @path - path to usermode executable + * @argv - arg vector for process + * @envp - environment for process + * + * Returns either NULL on allocation failure, or a subprocess_info + * structure. This should be passed to call_usermodehelper_exec to + * exec the process and free the structure. 
+ */
+struct subprocess_info *call_usermodehelper_setup(char *path,
+ char **argv, char **envp)
+{
+ struct subprocess_info *sub_info;
+ sub_info = kzalloc(sizeof(struct subprocess_info), GFP_ATOMIC);
+ if (!sub_info)
+ goto out;
+
+ INIT_WORK(&sub_info->work, __call_usermodehelper);
+ sub_info->path = path;
+ sub_info->argv = argv;
+ sub_info->envp = envp;
+
+ out:
+ return sub_info;
+}
+EXPORT_SYMBOL(call_usermodehelper_setup);
+
+/**
+ * call_usermodehelper_setkeys - set the session keys for usermode helper
+ * @info: a subprocess_info returned by call_usermodehelper_setup
+ * @session_keyring: the session keyring for the process
+ */
+void call_usermodehelper_setkeys(struct subprocess_info *info,
+ struct key *session_keyring)
+{
+ info->ring = session_keyring;
+}
+EXPORT_SYMBOL(call_usermodehelper_setkeys);
+
+/**
+ * call_usermodehelper_setcleanup - set a cleanup function
+ * @info: a subprocess_info returned by call_usermodehelper_setup
+ * @cleanup: a cleanup function
+ *
+ * The cleanup function is called just before the subprocess_info is
+ * about to be freed. This can be used for freeing the argv and envp.
+ * The function must be runnable in either a process context or the
+ * context in which call_usermodehelper_exec is called.
+ */
+void call_usermodehelper_setcleanup(struct subprocess_info *info,
+ void (*cleanup)(char **argv, char **envp))
+{
+ info->cleanup = cleanup;
+}
+EXPORT_SYMBOL(call_usermodehelper_setcleanup);
+
+/**
+ * call_usermodehelper_stdinpipe - set up a pipe to be used for stdin
+ * @sub_info: a subprocess_info returned by call_usermodehelper_setup
+ * @filp: set to the write-end of a pipe
+ *
+ * This constructs a pipe, and sets the read end to be the stdin of the
+ * subprocess, and returns the write-end in *@filp.
+ */
+int call_usermodehelper_stdinpipe(struct subprocess_info *sub_info,
+ struct file **filp)
+{
+ struct file *f;
+
+ f = create_write_pipe();
+ if (IS_ERR(f))
+ return PTR_ERR(f);
+ *filp = f;
+
+ f = create_read_pipe(f);
+ if (IS_ERR(f)) {
+ free_write_pipe(*filp);
+ return PTR_ERR(f);
+ }
+ sub_info->stdin = f;
+
+ return 0;
+}
+EXPORT_SYMBOL(call_usermodehelper_stdinpipe);
+
+/**
+ * call_usermodehelper_exec - start a usermode application
+ * @sub_info: information about the subprocess
 * @wait: wait for the application to finish and return status.
 * when -1 don't wait at all, but you get no useful error back when
 * the program couldn't be exec'ed. This makes it safe to call
@@ -265,33 +357,24 @@ static void __call_usermodehelper(struct work_struct *work)
 * Runs a user-space application. The application is started
 * asynchronously if wait is not set, and runs as a child of keventd.
 * (ie. it runs with full root capabilities).
- *
- * Must be called from process context. Returns a negative error code
- * if program was not execed successfully, or 0.
*/ -int call_usermodehelper_keys(char *path, char **argv, char **envp, - struct key *session_keyring, int wait) +int call_usermodehelper_exec(struct subprocess_info *sub_info, + int wait) { DECLARE_COMPLETION_ONSTACK(done); - struct subprocess_info *sub_info; int retval; - if (!khelper_wq) - return -EBUSY; - - if (path[0] == '\0') - return 0; + if (sub_info->path[0] == '\0') { + retval = 0; + goto out; + } - sub_info = kzalloc(sizeof(struct subprocess_info), GFP_ATOMIC); - if (!sub_info) - return -ENOMEM; + if (!khelper_wq) { + retval = -EBUSY; + goto out; + } - INIT_WORK(&sub_info->work, __call_usermodehelper); sub_info->complete = &done; - sub_info->path = path; - sub_info->argv = argv; - sub_info->envp = envp; - sub_info->ring = session_keyring; sub_info->wait = wait; queue_work(khelper_wq, &sub_info->work); @@ -299,47 +382,43 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp, return 0; wait_for_completion(&done); retval = sub_info->retval; - kfree(sub_info); + + out: + call_usermodehelper_freeinfo(sub_info); return retval; } -EXPORT_SYMBOL(call_usermodehelper_keys); +EXPORT_SYMBOL(call_usermodehelper_exec); +/** + * call_usermodehelper_pipe - call a usermode helper process with a pipe stdin + * @path: path to usermode executable + * @argv: arg vector for process + * @envp: environment for process + * @filp: set to the write-end of a pipe + * + * This is a simple wrapper which executes a usermode-helper function + * with a pipe as stdin. It is implemented entirely in terms of + * lower-level call_usermodehelper_* functions. + */ int call_usermodehelper_pipe(char *path, char **argv, char **envp, struct file **filp) { - DECLARE_COMPLETION(done); - struct subprocess_info sub_info = { - .work = __WORK_INITIALIZER(sub_info.work, - __call_usermodehelper), - .complete = &done, - .path = path, - .argv = argv, - .envp = envp, - .retval = 0, - }; - struct file *f; - - if (!khelper_wq) - return -EBUSY; + struct subprocess_info *sub_info; + int ret; - if (path[0] == '\0') - return 0; + sub_info = call_usermodehelper_setup(path, argv, envp); + if (sub_info == NULL) + return -ENOMEM; - f = create_write_pipe(); - if (IS_ERR(f)) - return PTR_ERR(f); - *filp = f; + ret = call_usermodehelper_stdinpipe(sub_info, filp); + if (ret < 0) + goto out; - f = create_read_pipe(f); - if (IS_ERR(f)) { - free_write_pipe(*filp); - return PTR_ERR(f); - } - sub_info.stdin = f; + return call_usermodehelper_exec(sub_info, 1); - queue_work(khelper_wq, &sub_info.work); - wait_for_completion(&done); - return sub_info.retval; + out: + call_usermodehelper_freeinfo(sub_info); + return ret; } EXPORT_SYMBOL(call_usermodehelper_pipe); -- cgit v1.2.3 From 10a0a8d4e3f6bf2d077f94344441909abe670f5a Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Jul 2007 18:37:02 -0700 Subject: Add common orderly_poweroff() Various pieces of code around the kernel want to be able to trigger an orderly poweroff. This pulls them together into a single implementation. By default the poweroff command is /sbin/poweroff, but it can be set via sysctl: kernel/poweroff_cmd. This is split at whitespace, so it can include command-line arguments. This patch replaces four other instances of invoking either "poweroff" or "shutdown -h now": two sbus drivers, and acpi thermal management. sparc64 has its own "powerd"; still need to determine whether it should be replaced by orderly_poweroff(). 
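[ For illustration only: a minimal sketch of how a driver would use the
  new helper. The driver and function names below are hypothetical; only
  orderly_poweroff() itself comes from this patch.

	#include <linux/kernel.h>
	#include <linux/reboot.h>

	/* On a fatal condition, ask userspace to shut down cleanly;
	 * force an immediate poweroff if the helper cannot be run. */
	static void mydrv_critical_shutdown(void)
	{
		printk(KERN_CRIT "mydrv: critical condition, shutting down\n");
		orderly_poweroff(true);
	}

  Since the command string is split at whitespace before being run, an
  administrator can point kernel/poweroff_cmd (/proc/sys/kernel/poweroff_cmd)
  at something like "/sbin/shutdown -h now" and the arguments are passed
  through. ]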
Signed-off-by: Jeremy Fitzhardinge Acked-by: Len Brown Signed-off-by: Chris Wright Cc: Andrew Morton Cc: Randy Dunlap Cc: Andi Kleen Cc: Al Viro Cc: Arnd Bergmann Cc: David S. Miller --- drivers/acpi/thermal.c | 24 ++--------------- drivers/sbus/char/bbc_envctrl.c | 5 ++-- drivers/sbus/char/envctrl.c | 7 ++--- include/linux/reboot.h | 5 ++++ kernel/sys.c | 58 +++++++++++++++++++++++++++++++++++++++++ kernel/sysctl.c | 10 +++++++ 6 files changed, 79 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/thermal.c b/drivers/acpi/thermal.c index 88a6fc7fd271..58f1338981bc 100644 --- a/drivers/acpi/thermal.c +++ b/drivers/acpi/thermal.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include @@ -59,7 +60,6 @@ #define ACPI_THERMAL_NOTIFY_CRITICAL 0xF0 #define ACPI_THERMAL_NOTIFY_HOT 0xF1 #define ACPI_THERMAL_MODE_ACTIVE 0x00 -#define ACPI_THERMAL_PATH_POWEROFF "/sbin/poweroff" #define ACPI_THERMAL_MAX_ACTIVE 10 #define ACPI_THERMAL_MAX_LIMIT_STR_LEN 65 @@ -419,26 +419,6 @@ static int acpi_thermal_get_devices(struct acpi_thermal *tz) return 0; } -static int acpi_thermal_call_usermode(char *path) -{ - char *argv[2] = { NULL, NULL }; - char *envp[3] = { NULL, NULL, NULL }; - - - if (!path) - return -EINVAL; - - argv[0] = path; - - /* minimal command environment */ - envp[0] = "HOME=/"; - envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; - - call_usermodehelper(argv[0], argv, envp, 0); - - return 0; -} - static int acpi_thermal_critical(struct acpi_thermal *tz) { if (!tz || !tz->trips.critical.flags.valid) @@ -456,7 +436,7 @@ static int acpi_thermal_critical(struct acpi_thermal *tz) acpi_bus_generate_event(tz->device, ACPI_THERMAL_NOTIFY_CRITICAL, tz->trips.critical.flags.enabled); - acpi_thermal_call_usermode(ACPI_THERMAL_PATH_POWEROFF); + orderly_poweroff(true); return 0; } diff --git a/drivers/sbus/char/bbc_envctrl.c b/drivers/sbus/char/bbc_envctrl.c index a54e4140683a..e821a155b658 100644 --- a/drivers/sbus/char/bbc_envctrl.c +++ b/drivers/sbus/char/bbc_envctrl.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -170,8 +171,6 @@ static void get_current_temps(struct bbc_cpu_temperature *tp) static void do_envctrl_shutdown(struct bbc_cpu_temperature *tp) { static int shutting_down = 0; - static char *envp[] = { "HOME=/", "TERM=linux", "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL }; - char *argv[] = { "/sbin/shutdown", "-h", "now", NULL }; char *type = "???"; s8 val = -1; @@ -195,7 +194,7 @@ static void do_envctrl_shutdown(struct bbc_cpu_temperature *tp) printk(KERN_CRIT "kenvctrld: Shutting down the system now.\n"); shutting_down = 1; - if (call_usermodehelper("/sbin/shutdown", argv, envp, 0) < 0) + if (orderly_poweroff(true) < 0) printk(KERN_CRIT "envctrl: shutdown execution failed\n"); } diff --git a/drivers/sbus/char/envctrl.c b/drivers/sbus/char/envctrl.c index 8328acab47fd..dadabef116b6 100644 --- a/drivers/sbus/char/envctrl.c +++ b/drivers/sbus/char/envctrl.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -966,10 +967,6 @@ static struct i2c_child_t *envctrl_get_i2c_child(unsigned char mon_type) static void envctrl_do_shutdown(void) { static int inprog = 0; - static char *envp[] = { - "HOME=/", "TERM=linux", "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL }; - char *argv[] = { - "/sbin/shutdown", "-h", "now", NULL }; int ret; if (inprog != 0) @@ -977,7 +974,7 @@ static void envctrl_do_shutdown(void) inprog = 1; printk(KERN_CRIT "kenvctrld: WARNING: Shutting down the system now.\n"); - ret = 
call_usermodehelper("/sbin/shutdown", argv, envp, 0); + ret = orderly_poweroff(true); if (ret < 0) { printk(KERN_CRIT "kenvctrld: WARNING: system shutdown failed!\n"); inprog = 0; /* unlikely to succeed, but we could try again */ diff --git a/include/linux/reboot.h b/include/linux/reboot.h index 1dd1c707311f..85ea63f462af 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -67,6 +67,11 @@ extern void kernel_power_off(void); void ctrl_alt_del(void); +#define POWEROFF_CMD_PATH_LEN 256 +extern char poweroff_cmd[POWEROFF_CMD_PATH_LEN]; + +extern int orderly_poweroff(bool force); + /* * Emergency restart, callable from an interrupt handler. */ diff --git a/kernel/sys.c b/kernel/sys.c index 4d141ae3e802..aeded9ad66ce 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2286,3 +2286,61 @@ asmlinkage long sys_getcpu(unsigned __user *cpup, unsigned __user *nodep, } return err ? -EFAULT : 0; } + +char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; + +static void argv_cleanup(char **argv, char **envp) +{ + argv_free(argv); +} + +/** + * orderly_poweroff - Trigger an orderly system poweroff + * @force: force poweroff if command execution fails + * + * This may be called from any context to trigger a system shutdown. + * If the orderly shutdown fails, it will force an immediate shutdown. + */ +int orderly_poweroff(bool force) +{ + int argc; + char **argv = argv_split(GFP_ATOMIC, poweroff_cmd, &argc); + static char *envp[] = { + "HOME=/", + "PATH=/sbin:/bin:/usr/sbin:/usr/bin", + NULL + }; + int ret = -ENOMEM; + struct subprocess_info *info; + + if (argv == NULL) { + printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n", + __func__, poweroff_cmd); + goto out; + } + + info = call_usermodehelper_setup(argv[0], argv, envp); + if (info == NULL) { + argv_free(argv); + goto out; + } + + call_usermodehelper_setcleanup(info, argv_cleanup); + + ret = call_usermodehelper_exec(info, -1); + + out: + if (ret && force) { + printk(KERN_WARNING "Failed to start orderly shutdown: " + "forcing the issue\n"); + + /* I guess this should try to kick off some daemon to + sync and poweroff asap. Or not even bother syncing + if we're doing an emergency shutdown? */ + emergency_sync(); + kernel_power_off(); + } + + return ret; +} +EXPORT_SYMBOL_GPL(orderly_poweroff); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 7063ebc6db05..44a1d699aad7 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include @@ -705,6 +706,15 @@ static ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif + { + .ctl_name = CTL_UNNUMBERED, + .procname = "poweroff_cmd", + .data = &poweroff_cmd, + .maxlen = POWEROFF_CMD_PATH_LEN, + .mode = 0644, + .proc_handler = &proc_dostring, + .strategy = &sysctl_string, + }, { .ctl_name = 0 } }; -- cgit v1.2.3 From 86313c488a6848b7ec2ba04e74f25f79dd32a0b7 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Jul 2007 18:37:03 -0700 Subject: usermodehelper: Tidy up waiting Rather than using a tri-state integer for the wait flag in call_usermodehelper_exec, define a proper enum, and use that. I've preserved the integer values so that any callers I've missed should still work OK. 
Signed-off-by: Jeremy Fitzhardinge Cc: James Bottomley Cc: Randy Dunlap Cc: Christoph Hellwig Cc: Andi Kleen Cc: Paul Mackerras Cc: Johannes Berg Cc: Ralf Baechle Cc: Bjorn Helgaas Cc: Joel Becker Cc: Tony Luck Cc: Kay Sievers Cc: Srivatsa Vaddagiri Cc: Oleg Nesterov Cc: David Howells --- arch/i386/mach-voyager/voyager_thread.c | 2 +- arch/x86_64/kernel/mce.c | 2 +- drivers/macintosh/therm_pm72.c | 3 ++- drivers/macintosh/windfarm_core.c | 3 ++- drivers/net/hamradio/baycom_epp.c | 2 +- drivers/pnp/pnpbios/core.c | 2 +- fs/ocfs2/heartbeat.c | 2 +- include/linux/kmod.h | 12 +++++++++--- kernel/cpuset.c | 2 +- kernel/kmod.c | 27 ++++++++++++++++----------- kernel/sys.c | 2 +- lib/kobject_uevent.c | 2 +- net/bridge/br_stp_if.c | 2 +- security/keys/request_key.c | 3 ++- 14 files changed, 40 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/arch/i386/mach-voyager/voyager_thread.c b/arch/i386/mach-voyager/voyager_thread.c index b4b24e0e45e1..f9d595338159 100644 --- a/arch/i386/mach-voyager/voyager_thread.c +++ b/arch/i386/mach-voyager/voyager_thread.c @@ -52,7 +52,7 @@ execute(const char *string) NULL, }; - if ((ret = call_usermodehelper(argv[0], argv, envp, 1)) != 0) { + if ((ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC)) != 0) { printk(KERN_ERR "Voyager failed to run \"%s\": %i\n", string, ret); } diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c index aa1d15991794..f3fb8174559e 100644 --- a/arch/x86_64/kernel/mce.c +++ b/arch/x86_64/kernel/mce.c @@ -174,7 +174,7 @@ static void do_mce_trigger(void) if (events != atomic_read(&mce_logged) && trigger[0]) { /* Small race window, but should be harmless. */ atomic_set(&mce_logged, events); - call_usermodehelper(trigger, trigger_argv, NULL, -1); + call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT); } } diff --git a/drivers/macintosh/therm_pm72.c b/drivers/macintosh/therm_pm72.c index dbb22403979f..3d90fc002097 100644 --- a/drivers/macintosh/therm_pm72.c +++ b/drivers/macintosh/therm_pm72.c @@ -1770,7 +1770,8 @@ static int call_critical_overtemp(void) "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL }; - return call_usermodehelper(critical_overtemp_path, argv, envp, 0); + return call_usermodehelper(critical_overtemp_path, + argv, envp, UMH_WAIT_EXEC); } diff --git a/drivers/macintosh/windfarm_core.c b/drivers/macintosh/windfarm_core.c index e18d265d5d33..516d943227e2 100644 --- a/drivers/macintosh/windfarm_core.c +++ b/drivers/macintosh/windfarm_core.c @@ -80,7 +80,8 @@ int wf_critical_overtemp(void) "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL }; - return call_usermodehelper(critical_overtemp_path, argv, envp, 0); + return call_usermodehelper(critical_overtemp_path, + argv, envp, UMH_WAIT_EXEC); } EXPORT_SYMBOL_GPL(wf_critical_overtemp); diff --git a/drivers/net/hamradio/baycom_epp.c b/drivers/net/hamradio/baycom_epp.c index 84aa2117c0ee..355c6cf3d112 100644 --- a/drivers/net/hamradio/baycom_epp.c +++ b/drivers/net/hamradio/baycom_epp.c @@ -320,7 +320,7 @@ static int eppconfig(struct baycom_state *bc) sprintf(portarg, "%ld", bc->pdev->port->base); printk(KERN_DEBUG "%s: %s -s -p %s -m %s\n", bc_drvname, eppconfig_path, portarg, modearg); - return call_usermodehelper(eppconfig_path, argv, envp, 1); + return call_usermodehelper(eppconfig_path, argv, envp, UMH_WAIT_PROC); } /* ---------------------------------------------------------------------- */ diff --git a/drivers/pnp/pnpbios/core.c b/drivers/pnp/pnpbios/core.c index 03baf1c64a2e..ed112ee16012 100644 --- a/drivers/pnp/pnpbios/core.c +++ 
b/drivers/pnp/pnpbios/core.c @@ -147,7 +147,7 @@ static int pnp_dock_event(int dock, struct pnp_docking_station_info *info) info->location_id, info->serial, info->capabilities); envp[i] = NULL; - value = call_usermodehelper (argv [0], argv, envp, 0); + value = call_usermodehelper (argv [0], argv, envp, UMH_WAIT_EXEC); kfree (buf); kfree (envp); return 0; diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c index 352eb4a13f98..c4c36171240d 100644 --- a/fs/ocfs2/heartbeat.c +++ b/fs/ocfs2/heartbeat.c @@ -209,7 +209,7 @@ void ocfs2_stop_heartbeat(struct ocfs2_super *osb) envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; envp[2] = NULL; - ret = call_usermodehelper(argv[0], argv, envp, 1); + ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); if (ret < 0) mlog_errno(ret); } diff --git a/include/linux/kmod.h b/include/linux/kmod.h index c4cbe59d9c67..5dc13848891b 100644 --- a/include/linux/kmod.h +++ b/include/linux/kmod.h @@ -51,15 +51,21 @@ int call_usermodehelper_stdinpipe(struct subprocess_info *sub_info, void call_usermodehelper_setcleanup(struct subprocess_info *info, void (*cleanup)(char **argv, char **envp)); +enum umh_wait { + UMH_NO_WAIT = -1, /* don't wait at all */ + UMH_WAIT_EXEC = 0, /* wait for the exec, but not the process */ + UMH_WAIT_PROC = 1, /* wait for the process to complete */ +}; + /* Actually execute the sub-process */ -int call_usermodehelper_exec(struct subprocess_info *info, int wait); +int call_usermodehelper_exec(struct subprocess_info *info, enum umh_wait wait); /* Free the subprocess_info. This is only needed if you're not going to call call_usermodehelper_exec */ void call_usermodehelper_freeinfo(struct subprocess_info *info); static inline int -call_usermodehelper(char *path, char **argv, char **envp, int wait) +call_usermodehelper(char *path, char **argv, char **envp, enum umh_wait wait) { struct subprocess_info *info; @@ -71,7 +77,7 @@ call_usermodehelper(char *path, char **argv, char **envp, int wait) static inline int call_usermodehelper_keys(char *path, char **argv, char **envp, - struct key *session_keyring, int wait) + struct key *session_keyring, enum umh_wait wait) { struct subprocess_info *info; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index b4796d850140..57e6448b171e 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -516,7 +516,7 @@ static void cpuset_release_agent(const char *pathbuf) envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; envp[i] = NULL; - call_usermodehelper(argv[0], argv, envp, 0); + call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); kfree(pathbuf); } diff --git a/kernel/kmod.c b/kernel/kmod.c index d2dce71115d8..78d365c524ed 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -119,7 +119,7 @@ struct subprocess_info { char **argv; char **envp; struct key *ring; - int wait; + enum umh_wait wait; int retval; struct file *stdin; void (*cleanup)(char **argv, char **envp); @@ -225,7 +225,7 @@ static int wait_for_helper(void *data) sub_info->retval = ret; } - if (sub_info->wait < 0) + if (sub_info->wait == UMH_NO_WAIT) call_usermodehelper_freeinfo(sub_info); else complete(sub_info->complete); @@ -238,26 +238,31 @@ static void __call_usermodehelper(struct work_struct *work) struct subprocess_info *sub_info = container_of(work, struct subprocess_info, work); pid_t pid; - int wait = sub_info->wait; + enum umh_wait wait = sub_info->wait; /* CLONE_VFORK: wait until the usermode helper has execve'd * successfully We need the data structures to stay around * until that is done. 
*/ - if (wait) + if (wait == UMH_WAIT_PROC || wait == UMH_NO_WAIT) pid = kernel_thread(wait_for_helper, sub_info, CLONE_FS | CLONE_FILES | SIGCHLD); else pid = kernel_thread(____call_usermodehelper, sub_info, CLONE_VFORK | SIGCHLD); - if (wait < 0) - return; + switch (wait) { + case UMH_NO_WAIT: + break; - if (pid < 0) { + case UMH_WAIT_PROC: + if (pid > 0) + break; sub_info->retval = pid; + /* FALLTHROUGH */ + + case UMH_WAIT_EXEC: complete(sub_info->complete); - } else if (!wait) - complete(sub_info->complete); + } } /** @@ -359,7 +364,7 @@ EXPORT_SYMBOL(call_usermodehelper_stdinpipe); * (ie. it runs with full root capabilities). */ int call_usermodehelper_exec(struct subprocess_info *sub_info, - int wait) + enum umh_wait wait) { DECLARE_COMPLETION_ONSTACK(done); int retval; @@ -378,7 +383,7 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, sub_info->wait = wait; queue_work(khelper_wq, &sub_info->work); - if (wait < 0) /* task has freed sub_info */ + if (wait == UMH_NO_WAIT) /* task has freed sub_info */ return 0; wait_for_completion(&done); retval = sub_info->retval; diff --git a/kernel/sys.c b/kernel/sys.c index aeded9ad66ce..18987c7f6add 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2327,7 +2327,7 @@ int orderly_poweroff(bool force) call_usermodehelper_setcleanup(info, argv_cleanup); - ret = call_usermodehelper_exec(info, -1); + ret = call_usermodehelper_exec(info, UMH_NO_WAIT); out: if (ret && force) { diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c index 12e311dc664c..bd5ecbbafab1 100644 --- a/lib/kobject_uevent.c +++ b/lib/kobject_uevent.c @@ -208,7 +208,7 @@ int kobject_uevent_env(struct kobject *kobj, enum kobject_action action, argv [0] = uevent_helper; argv [1] = (char *)subsystem; argv [2] = NULL; - call_usermodehelper (argv[0], argv, envp, 0); + call_usermodehelper (argv[0], argv, envp, UMH_WAIT_EXEC); } exit: diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c index a786e7863200..1ea2f86f7683 100644 --- a/net/bridge/br_stp_if.c +++ b/net/bridge/br_stp_if.c @@ -125,7 +125,7 @@ static void br_stp_start(struct net_bridge *br) char *argv[] = { BR_STP_PROG, br->dev->name, "start", NULL }; char *envp[] = { NULL }; - r = call_usermodehelper(BR_STP_PROG, argv, envp, 1); + r = call_usermodehelper(BR_STP_PROG, argv, envp, UMH_WAIT_PROC); if (r == 0) { br->stp_enabled = BR_USER_STP; printk(KERN_INFO "%s: userspace STP started\n", br->dev->name); diff --git a/security/keys/request_key.c b/security/keys/request_key.c index f573ac189a0a..557500110a13 100644 --- a/security/keys/request_key.c +++ b/security/keys/request_key.c @@ -108,7 +108,8 @@ static int call_sbin_request_key(struct key *key, argv[i] = NULL; /* do it */ - ret = call_usermodehelper_keys(argv[0], argv, envp, keyring, 1); + ret = call_usermodehelper_keys(argv[0], argv, envp, keyring, + UMH_WAIT_PROC); error_link: key_put(keyring); -- cgit v1.2.3 From 810bab448e563ffd1718d78e9a3756806b626acc Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Jul 2007 18:37:03 -0700 Subject: use elfnote.h to generate vsyscall notes. Use existing elfnote.h to generate vsyscall notes, rather than doing it locally. Changes elfnote.h a bit to suit, since this is the first asm user, and it wasn't quite right. Signed-off-by: Jeremy Fitzhardinge Cc: "Eric W. 
Biederman" Cc: Roland McGrath Cc: Andrew Morton --- arch/i386/kernel/vsyscall-note.S | 23 ++++++----------------- include/linux/elfnote.h | 22 +++++++++++++++------- 2 files changed, 21 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/arch/i386/kernel/vsyscall-note.S b/arch/i386/kernel/vsyscall-note.S index d4b5be4f3d5f..52e0cbbac70c 100644 --- a/arch/i386/kernel/vsyscall-note.S +++ b/arch/i386/kernel/vsyscall-note.S @@ -3,23 +3,12 @@ * Here we can supply some information useful to userland. */ -#include #include +#include -#define ASM_ELF_NOTE_BEGIN(name, flags, vendor, type) \ - .section name, flags; \ - .balign 4; \ - .long 1f - 0f; /* name length */ \ - .long 3f - 2f; /* data length */ \ - .long type; /* note type */ \ -0: .asciz vendor; /* vendor name */ \ -1: .balign 4; \ -2: - -#define ASM_ELF_NOTE_END \ -3: .balign 4; /* pad out section */ \ - .previous - - ASM_ELF_NOTE_BEGIN(".note.kernel-version", "a", UTS_SYSNAME, 0) +/* Ideally this would use UTS_NAME, but using a quoted string here + doesn't work. Remember to change this when changing the + kernel's name. */ +ELFNOTE_START(Linux, 0, "a") .long LINUX_VERSION_CODE - ASM_ELF_NOTE_END +ELFNOTE_END diff --git a/include/linux/elfnote.h b/include/linux/elfnote.h index 9a1e0674e56c..e831759b2fb5 100644 --- a/include/linux/elfnote.h +++ b/include/linux/elfnote.h @@ -38,17 +38,25 @@ * e.g. ELFNOTE(XYZCo, 42, .asciz, "forty-two") * ELFNOTE(XYZCo, 12, .long, 0xdeadbeef) */ -#define ELFNOTE(name, type, desctype, descdata) \ -.pushsection .note.name, "",@note ; \ - .align 4 ; \ +#define ELFNOTE_START(name, type, flags) \ +.pushsection .note.name, flags,@note ; \ + .balign 4 ; \ .long 2f - 1f /* namesz */ ; \ - .long 4f - 3f /* descsz */ ; \ + .long 4484f - 3f /* descsz */ ; \ .long type ; \ 1:.asciz #name ; \ -2:.align 4 ; \ -3:desctype descdata ; \ -4:.align 4 ; \ +2:.balign 4 ; \ +3: + +#define ELFNOTE_END \ +4484:.balign 4 ; \ .popsection ; + +#define ELFNOTE(name, type, desc) \ + ELFNOTE_START(name, type, "") \ + desc ; \ + ELFNOTE_END + #else /* !__ASSEMBLER__ */ #include /* -- cgit v1.2.3 From 5f4352fbffd6c45123dbce9e195efd54df4e177e Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Jul 2007 18:37:04 -0700 Subject: Allocate and free vmalloc areas Allocate/release a chunk of vmalloc address space: alloc_vm_area reserves a chunk of address space, and makes sure all the pagetables are constructed for that address range - but no pages. free_vm_area releases the address space range. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ian Pratt Signed-off-by: Christian Limpach Signed-off-by: Chris Wright Cc: "Jan Beulich" Cc: "Andi Kleen" --- include/linux/vmalloc.h | 4 ++++ mm/vmalloc.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+) (limited to 'include/linux') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 132b260aef1e..c2b10cae5da5 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -70,6 +70,10 @@ extern int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages); extern void unmap_kernel_range(unsigned long addr, unsigned long size); +/* Allocate/destroy a 'vmalloc' VM area. */ +extern struct vm_struct *alloc_vm_area(size_t size); +extern void free_vm_area(struct vm_struct *area); + /* * Internals. Dont't use.. 
*/
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 8e05a11155c9..3130c343088f 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -767,3 +767,56 @@ EXPORT_SYMBOL(remap_vmalloc_range);
 void __attribute__((weak)) vmalloc_sync_all(void)
 {
 }
+
+
+static int f(pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
+{
+ /* apply_to_page_range() does all the hard work. */
+ return 0;
+}
+
+/**
+ * alloc_vm_area - allocate a range of kernel address space
+ * @size: size of the area
+ * @returns: NULL on failure, vm_struct on success
+ *
+ * This function reserves a range of kernel address space, and
+ * allocates pagetables to map that range. No actual mappings
+ * are created. If the kernel address space is not shared
+ * between processes, it syncs the pagetable across all
+ * processes.
+ */
+struct vm_struct *alloc_vm_area(size_t size)
+{
+ struct vm_struct *area;
+
+ area = get_vm_area(size, VM_IOREMAP);
+ if (area == NULL)
+ return NULL;
+
+ /*
+ * This ensures that page tables are constructed for this region
+ * of kernel virtual address space and mapped into init_mm.
+ */
+ if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
+ area->size, f, NULL)) {
+ free_vm_area(area);
+ return NULL;
+ }
+
+ /* Make sure the pagetables are constructed in process kernel
+ mappings */
+ vmalloc_sync_all();
+
+ return area;
+}
+EXPORT_SYMBOL_GPL(alloc_vm_area);
+
+void free_vm_area(struct vm_struct *area)
+{
+ struct vm_struct *ret;
+ ret = remove_vm_area(area->addr);
+ BUG_ON(ret != area);
+ kfree(area);
+}
+EXPORT_SYMBOL_GPL(free_vm_area);
-- cgit v1.2.3

From c85b04c3749507546f6d5868976e4793e35c2ec0 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge
Date: Tue, 17 Jul 2007 18:37:05 -0700
Subject: xen: add pinned page flag

Add a new definition for PG_owner_priv_1 to define PG_pinned on Xen pagetable pages.

Signed-off-by: Jeremy Fitzhardinge
Signed-off-by: Chris Wright
---
 include/linux/page-flags.h | 5 +++++
 1 file changed, 5 insertions(+)
(limited to 'include/linux')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index ae2d79f2107e..731cd2ac3227 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -92,6 +92,7 @@
 /* PG_owner_priv_1 users should have descriptive aliases */
 #define PG_checked PG_owner_priv_1 /* Used by some filesystems */
+#define PG_pinned PG_owner_priv_1 /* Xen pinned pagetable */
 #if (BITS_PER_LONG > 32)
 /*
@@ -170,6 +171,10 @@ static inline void SetPageUptodate(struct page *page)
 #define SetPageChecked(page) set_bit(PG_checked, &(page)->flags)
 #define ClearPageChecked(page) clear_bit(PG_checked, &(page)->flags)
+#define PagePinned(page) test_bit(PG_pinned, &(page)->flags)
+#define SetPagePinned(page) set_bit(PG_pinned, &(page)->flags)
+#define ClearPagePinned(page) clear_bit(PG_pinned, &(page)->flags)
+
 #define PageReserved(page) test_bit(PG_reserved, &(page)->flags)
 #define SetPageReserved(page) set_bit(PG_reserved, &(page)->flags)
 #define ClearPageReserved(page) clear_bit(PG_reserved, &(page)->flags)
-- cgit v1.2.3

From 9f27ee595038653ddf8bca871200d39247d6f4fc Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge
Date: Tue, 17 Jul 2007 18:37:06 -0700
Subject: xen: add virtual block device driver.

The block device frontend driver allows the kernel to access block
devices exported by a virtual machine containing a physical block
device driver.
Signed-off-by: Ian Pratt Signed-off-by: Christian Limpach Signed-off-by: Chris Wright Cc: Arjan van de Ven Cc: Greg KH Cc: Jens Axboe --- drivers/block/Kconfig | 9 + drivers/block/Makefile | 1 + drivers/block/xen-blkfront.c | 988 +++++++++++++++++++++++++++++++++++++++++++ include/linux/major.h | 2 + 4 files changed, 1000 insertions(+) create mode 100644 drivers/block/xen-blkfront.c (limited to 'include/linux') diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index 8f65b88cf711..a4a311992408 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -427,4 +427,13 @@ config XILINX_SYSACE help Include support for the Xilinx SystemACE CompactFlash interface +config XEN_BLKDEV_FRONTEND + tristate "Xen virtual block device support" + depends on XEN + default y + help + This driver implements the front-end of the Xen virtual + block device driver. It communicates with a back-end driver + in another domain which drives the actual block device. + endif # BLK_DEV diff --git a/drivers/block/Makefile b/drivers/block/Makefile index 9ee08ab4ffa8..3e31532df0ed 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -29,3 +29,4 @@ obj-$(CONFIG_VIODASD) += viodasd.o obj-$(CONFIG_BLK_DEV_SX8) += sx8.o obj-$(CONFIG_BLK_DEV_UB) += ub.o +obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c new file mode 100644 index 000000000000..6746c29181f8 --- /dev/null +++ b/drivers/block/xen-blkfront.c @@ -0,0 +1,988 @@ +/* + * blkfront.c + * + * XenLinux virtual block device driver. + * + * Copyright (c) 2003-2004, Keir Fraser & Steve Hand + * Modifications by Mark A. Williamson are (c) Intel Research Cambridge + * Copyright (c) 2004, Christian Limpach + * Copyright (c) 2004, Andrew Warfield + * Copyright (c) 2005, Christopher Clark + * Copyright (c) 2005, XenSource Ltd + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +#include + +enum blkif_state { + BLKIF_STATE_DISCONNECTED, + BLKIF_STATE_CONNECTED, + BLKIF_STATE_SUSPENDED, +}; + +struct blk_shadow { + struct blkif_request req; + unsigned long request; + unsigned long frame[BLKIF_MAX_SEGMENTS_PER_REQUEST]; +}; + +static struct block_device_operations xlvbd_block_fops; + +#define BLK_RING_SIZE __RING_SIZE((struct blkif_sring *)0, PAGE_SIZE) + +/* + * We have one of these per vbd, whether ide, scsi or 'other'. They + * hang in private_data off the gendisk structure. We may end up + * putting all kinds of interesting stuff here :-) + */ +struct blkfront_info +{ + struct xenbus_device *xbdev; + dev_t dev; + struct gendisk *gd; + int vdevice; + blkif_vdev_t handle; + enum blkif_state connected; + int ring_ref; + struct blkif_front_ring ring; + unsigned int evtchn, irq; + struct request_queue *rq; + struct work_struct work; + struct gnttab_free_callback callback; + struct blk_shadow shadow[BLK_RING_SIZE]; + unsigned long shadow_free; + int feature_barrier; + + /** + * The number of people holding this device open. We won't allow a + * hot-unplug unless this is 0. + */ + int users; +}; + +static DEFINE_SPINLOCK(blkif_io_lock); + +#define MAXIMUM_OUTSTANDING_BLOCK_REQS \ + (BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE) +#define GRANT_INVALID_REF 0 + +#define PARTS_PER_DISK 16 + +#define BLKIF_MAJOR(dev) ((dev)>>8) +#define BLKIF_MINOR(dev) ((dev) & 0xff) + +#define DEV_NAME "xvd" /* name in /dev */ + +/* Information about our VBDs. */ +#define MAX_VBDS 64 +static LIST_HEAD(vbds_list); + +static int get_id_from_freelist(struct blkfront_info *info) +{ + unsigned long free = info->shadow_free; + BUG_ON(free > BLK_RING_SIZE); + info->shadow_free = info->shadow[free].req.id; + info->shadow[free].req.id = 0x0fffffee; /* debug */ + return free; +} + +static void add_id_to_freelist(struct blkfront_info *info, + unsigned long id) +{ + info->shadow[id].req.id = info->shadow_free; + info->shadow[id].request = 0; + info->shadow_free = id; +} + +static void blkif_restart_queue_callback(void *arg) +{ + struct blkfront_info *info = (struct blkfront_info *)arg; + schedule_work(&info->work); +} + +/* + * blkif_queue_request + * + * request block io + * + * id: for guest use only. + * operation: BLKIF_OP_{READ,WRITE,PROBE} + * buffer: buffer to read/write into. this should be a + * virtual address in the guest os. + */ +static int blkif_queue_request(struct request *req) +{ + struct blkfront_info *info = req->rq_disk->private_data; + unsigned long buffer_mfn; + struct blkif_request *ring_req; + struct bio *bio; + struct bio_vec *bvec; + int idx; + unsigned long id; + unsigned int fsect, lsect; + int ref; + grant_ref_t gref_head; + + if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) + return 1; + + if (gnttab_alloc_grant_references( + BLKIF_MAX_SEGMENTS_PER_REQUEST, &gref_head) < 0) { + gnttab_request_free_callback( + &info->callback, + blkif_restart_queue_callback, + info, + BLKIF_MAX_SEGMENTS_PER_REQUEST); + return 1; + } + + /* Fill out a communications ring structure. */ + ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt); + id = get_id_from_freelist(info); + info->shadow[id].request = (unsigned long)req; + + ring_req->id = id; + ring_req->sector_number = (blkif_sector_t)req->sector; + ring_req->handle = info->handle; + + ring_req->operation = rq_data_dir(req) ? 
+ BLKIF_OP_WRITE : BLKIF_OP_READ; + if (blk_barrier_rq(req)) + ring_req->operation = BLKIF_OP_WRITE_BARRIER; + + ring_req->nr_segments = 0; + rq_for_each_bio (bio, req) { + bio_for_each_segment (bvec, bio, idx) { + BUG_ON(ring_req->nr_segments + == BLKIF_MAX_SEGMENTS_PER_REQUEST); + buffer_mfn = pfn_to_mfn(page_to_pfn(bvec->bv_page)); + fsect = bvec->bv_offset >> 9; + lsect = fsect + (bvec->bv_len >> 9) - 1; + /* install a grant reference. */ + ref = gnttab_claim_grant_reference(&gref_head); + BUG_ON(ref == -ENOSPC); + + gnttab_grant_foreign_access_ref( + ref, + info->xbdev->otherend_id, + buffer_mfn, + rq_data_dir(req) ); + + info->shadow[id].frame[ring_req->nr_segments] = + mfn_to_pfn(buffer_mfn); + + ring_req->seg[ring_req->nr_segments] = + (struct blkif_request_segment) { + .gref = ref, + .first_sect = fsect, + .last_sect = lsect }; + + ring_req->nr_segments++; + } + } + + info->ring.req_prod_pvt++; + + /* Keep a private copy so we can reissue requests when recovering. */ + info->shadow[id].req = *ring_req; + + gnttab_free_grant_references(gref_head); + + return 0; +} + + +static inline void flush_requests(struct blkfront_info *info) +{ + int notify; + + RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&info->ring, notify); + + if (notify) + notify_remote_via_irq(info->irq); +} + +/* + * do_blkif_request + * read a block; request is in a request queue + */ +static void do_blkif_request(request_queue_t *rq) +{ + struct blkfront_info *info = NULL; + struct request *req; + int queued; + + pr_debug("Entered do_blkif_request\n"); + + queued = 0; + + while ((req = elv_next_request(rq)) != NULL) { + info = req->rq_disk->private_data; + if (!blk_fs_request(req)) { + end_request(req, 0); + continue; + } + + if (RING_FULL(&info->ring)) + goto wait; + + pr_debug("do_blk_req %p: cmd %p, sec %lx, " + "(%u/%li) buffer:%p [%s]\n", + req, req->cmd, (unsigned long)req->sector, + req->current_nr_sectors, + req->nr_sectors, req->buffer, + rq_data_dir(req) ? "write" : "read"); + + + blkdev_dequeue_request(req); + if (blkif_queue_request(req)) { + blk_requeue_request(rq, req); +wait: + /* Avoid pointless unplugs. */ + blk_stop_queue(rq); + break; + } + + queued++; + } + + if (queued != 0) + flush_requests(info); +} + +static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size) +{ + request_queue_t *rq; + + rq = blk_init_queue(do_blkif_request, &blkif_io_lock); + if (rq == NULL) + return -1; + + elevator_init(rq, "noop"); + + /* Hard sector size and max sectors impersonate the equiv. hardware. */ + blk_queue_hardsect_size(rq, sector_size); + blk_queue_max_sectors(rq, 512); + + /* Each segment in a request is up to an aligned page in size. */ + blk_queue_segment_boundary(rq, PAGE_SIZE - 1); + blk_queue_max_segment_size(rq, PAGE_SIZE); + + /* Ensure a merged request will fit in a single I/O ring slot. */ + blk_queue_max_phys_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST); + blk_queue_max_hw_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST); + + /* Make sure buffer addresses are sector-aligned. */ + blk_queue_dma_alignment(rq, 511); + + gd->queue = rq; + + return 0; +} + + +static int xlvbd_barrier(struct blkfront_info *info) +{ + int err; + + err = blk_queue_ordered(info->rq, + info->feature_barrier ? QUEUE_ORDERED_DRAIN : QUEUE_ORDERED_NONE, + NULL); + + if (err) + return err; + + printk(KERN_INFO "blkfront: %s: barriers %s\n", + info->gd->disk_name, + info->feature_barrier ? 
"enabled" : "disabled"); + return 0; +} + + +static int xlvbd_alloc_gendisk(int minor, blkif_sector_t capacity, + int vdevice, u16 vdisk_info, u16 sector_size, + struct blkfront_info *info) +{ + struct gendisk *gd; + int nr_minors = 1; + int err = -ENODEV; + + BUG_ON(info->gd != NULL); + BUG_ON(info->rq != NULL); + + if ((minor % PARTS_PER_DISK) == 0) + nr_minors = PARTS_PER_DISK; + + gd = alloc_disk(nr_minors); + if (gd == NULL) + goto out; + + if (nr_minors > 1) + sprintf(gd->disk_name, "%s%c", DEV_NAME, + 'a' + minor / PARTS_PER_DISK); + else + sprintf(gd->disk_name, "%s%c%d", DEV_NAME, + 'a' + minor / PARTS_PER_DISK, + minor % PARTS_PER_DISK); + + gd->major = XENVBD_MAJOR; + gd->first_minor = minor; + gd->fops = &xlvbd_block_fops; + gd->private_data = info; + gd->driverfs_dev = &(info->xbdev->dev); + set_capacity(gd, capacity); + + if (xlvbd_init_blk_queue(gd, sector_size)) { + del_gendisk(gd); + goto out; + } + + info->rq = gd->queue; + info->gd = gd; + + if (info->feature_barrier) + xlvbd_barrier(info); + + if (vdisk_info & VDISK_READONLY) + set_disk_ro(gd, 1); + + if (vdisk_info & VDISK_REMOVABLE) + gd->flags |= GENHD_FL_REMOVABLE; + + if (vdisk_info & VDISK_CDROM) + gd->flags |= GENHD_FL_CD; + + return 0; + + out: + return err; +} + +static void kick_pending_request_queues(struct blkfront_info *info) +{ + if (!RING_FULL(&info->ring)) { + /* Re-enable calldowns. */ + blk_start_queue(info->rq); + /* Kick things off immediately. */ + do_blkif_request(info->rq); + } +} + +static void blkif_restart_queue(struct work_struct *work) +{ + struct blkfront_info *info = container_of(work, struct blkfront_info, work); + + spin_lock_irq(&blkif_io_lock); + if (info->connected == BLKIF_STATE_CONNECTED) + kick_pending_request_queues(info); + spin_unlock_irq(&blkif_io_lock); +} + +static void blkif_free(struct blkfront_info *info, int suspend) +{ + /* Prevent new requests being issued until we fix things up. */ + spin_lock_irq(&blkif_io_lock); + info->connected = suspend ? + BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED; + /* No more blkif_request(). */ + if (info->rq) + blk_stop_queue(info->rq); + /* No more gnttab callback work. */ + gnttab_cancel_free_callback(&info->callback); + spin_unlock_irq(&blkif_io_lock); + + /* Flush gnttab callback work. Must be done with no locks held. */ + flush_scheduled_work(); + + /* Free resources associated with old device channel. */ + if (info->ring_ref != GRANT_INVALID_REF) { + gnttab_end_foreign_access(info->ring_ref, 0, + (unsigned long)info->ring.sring); + info->ring_ref = GRANT_INVALID_REF; + info->ring.sring = NULL; + } + if (info->irq) + unbind_from_irqhandler(info->irq, info); + info->evtchn = info->irq = 0; + +} + +static void blkif_completion(struct blk_shadow *s) +{ + int i; + for (i = 0; i < s->req.nr_segments; i++) + gnttab_end_foreign_access(s->req.seg[i].gref, 0, 0UL); +} + +static irqreturn_t blkif_interrupt(int irq, void *dev_id) +{ + struct request *req; + struct blkif_response *bret; + RING_IDX i, rp; + unsigned long flags; + struct blkfront_info *info = (struct blkfront_info *)dev_id; + int uptodate; + + spin_lock_irqsave(&blkif_io_lock, flags); + + if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) { + spin_unlock_irqrestore(&blkif_io_lock, flags); + return IRQ_HANDLED; + } + + again: + rp = info->ring.sring->rsp_prod; + rmb(); /* Ensure we see queued responses up to 'rp'. 
*/ + + for (i = info->ring.rsp_cons; i != rp; i++) { + unsigned long id; + int ret; + + bret = RING_GET_RESPONSE(&info->ring, i); + id = bret->id; + req = (struct request *)info->shadow[id].request; + + blkif_completion(&info->shadow[id]); + + add_id_to_freelist(info, id); + + uptodate = (bret->status == BLKIF_RSP_OKAY); + switch (bret->operation) { + case BLKIF_OP_WRITE_BARRIER: + if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) { + printk(KERN_WARNING "blkfront: %s: write barrier op failed\n", + info->gd->disk_name); + uptodate = -EOPNOTSUPP; + info->feature_barrier = 0; + xlvbd_barrier(info); + } + /* fall through */ + case BLKIF_OP_READ: + case BLKIF_OP_WRITE: + if (unlikely(bret->status != BLKIF_RSP_OKAY)) + dev_dbg(&info->xbdev->dev, "Bad return from blkdev data " + "request: %x\n", bret->status); + + ret = end_that_request_first(req, uptodate, + req->hard_nr_sectors); + BUG_ON(ret); + end_that_request_last(req, uptodate); + break; + default: + BUG(); + } + } + + info->ring.rsp_cons = i; + + if (i != info->ring.req_prod_pvt) { + int more_to_do; + RING_FINAL_CHECK_FOR_RESPONSES(&info->ring, more_to_do); + if (more_to_do) + goto again; + } else + info->ring.sring->rsp_event = i + 1; + + kick_pending_request_queues(info); + + spin_unlock_irqrestore(&blkif_io_lock, flags); + + return IRQ_HANDLED; +} + + +static int setup_blkring(struct xenbus_device *dev, + struct blkfront_info *info) +{ + struct blkif_sring *sring; + int err; + + info->ring_ref = GRANT_INVALID_REF; + + sring = (struct blkif_sring *)__get_free_page(GFP_KERNEL); + if (!sring) { + xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring"); + return -ENOMEM; + } + SHARED_RING_INIT(sring); + FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE); + + err = xenbus_grant_ring(dev, virt_to_mfn(info->ring.sring)); + if (err < 0) { + free_page((unsigned long)sring); + info->ring.sring = NULL; + goto fail; + } + info->ring_ref = err; + + err = xenbus_alloc_evtchn(dev, &info->evtchn); + if (err) + goto fail; + + err = bind_evtchn_to_irqhandler(info->evtchn, + blkif_interrupt, + IRQF_SAMPLE_RANDOM, "blkif", info); + if (err <= 0) { + xenbus_dev_fatal(dev, err, + "bind_evtchn_to_irqhandler failed"); + goto fail; + } + info->irq = err; + + return 0; +fail: + blkif_free(info, 0); + return err; +} + + +/* Common code used when first setting up, and when resuming. */ +static int talk_to_backend(struct xenbus_device *dev, + struct blkfront_info *info) +{ + const char *message = NULL; + struct xenbus_transaction xbt; + int err; + + /* Create shared ring, alloc event channel. 
*/
+ err = setup_blkring(dev, info);
+ if (err)
+ goto out;
+
+again:
+ err = xenbus_transaction_start(&xbt);
+ if (err) {
+ xenbus_dev_fatal(dev, err, "starting transaction");
+ goto destroy_blkring;
+ }
+
+ err = xenbus_printf(xbt, dev->nodename,
+ "ring-ref", "%u", info->ring_ref);
+ if (err) {
+ message = "writing ring-ref";
+ goto abort_transaction;
+ }
+ err = xenbus_printf(xbt, dev->nodename,
+ "event-channel", "%u", info->evtchn);
+ if (err) {
+ message = "writing event-channel";
+ goto abort_transaction;
+ }
+
+ err = xenbus_transaction_end(xbt, 0);
+ if (err) {
+ if (err == -EAGAIN)
+ goto again;
+ xenbus_dev_fatal(dev, err, "completing transaction");
+ goto destroy_blkring;
+ }
+
+ xenbus_switch_state(dev, XenbusStateInitialised);
+
+ return 0;
+
+ abort_transaction:
+ xenbus_transaction_end(xbt, 1);
+ if (message)
+ xenbus_dev_fatal(dev, err, "%s", message);
+ destroy_blkring:
+ blkif_free(info, 0);
+ out:
+ return err;
+}
+
+
+/**
+ * Entry point to this code when a new device is created. Allocate the basic
+ * structures and the ring buffer for communication with the backend, and
+ * inform the backend of the appropriate details for those. Switch to
+ * Initialised state.
+ */
+static int blkfront_probe(struct xenbus_device *dev,
+ const struct xenbus_device_id *id)
+{
+ int err, vdevice, i;
+ struct blkfront_info *info;
+
+ /* FIXME: Use dynamic device id if this is not set. */
+ err = xenbus_scanf(XBT_NIL, dev->nodename,
+ "virtual-device", "%i", &vdevice);
+ if (err != 1) {
+ xenbus_dev_fatal(dev, err, "reading virtual-device");
+ return err;
+ }
+
+ info = kzalloc(sizeof(*info), GFP_KERNEL);
+ if (!info) {
+ xenbus_dev_fatal(dev, -ENOMEM, "allocating info structure");
+ return -ENOMEM;
+ }
+
+ info->xbdev = dev;
+ info->vdevice = vdevice;
+ info->connected = BLKIF_STATE_DISCONNECTED;
+ INIT_WORK(&info->work, blkif_restart_queue);
+
+ for (i = 0; i < BLK_RING_SIZE; i++)
+ info->shadow[i].req.id = i+1;
+ info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;
+
+ /* Front end dir is a number, which is used as the id. */
+ info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0);
+ dev->dev.driver_data = info;
+
+ err = talk_to_backend(dev, info);
+ if (err) {
+ kfree(info);
+ dev->dev.driver_data = NULL;
+ return err;
+ }
+
+ return 0;
+}
+
+
+static int blkif_recover(struct blkfront_info *info)
+{
+ int i;
+ struct blkif_request *req;
+ struct blk_shadow *copy;
+ int j;
+
+ /* Stage 1: Make a safe copy of the shadow state. */
+ copy = kmalloc(sizeof(info->shadow), GFP_KERNEL);
+ if (!copy)
+ return -ENOMEM;
+ memcpy(copy, info->shadow, sizeof(info->shadow));
+
+ /* Stage 2: Set up free list. */
+ memset(&info->shadow, 0, sizeof(info->shadow));
+ for (i = 0; i < BLK_RING_SIZE; i++)
+ info->shadow[i].req.id = i+1;
+ info->shadow_free = info->ring.req_prod_pvt;
+ info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;
+
+ /* Stage 3: Find pending requests and requeue them. */
+ for (i = 0; i < BLK_RING_SIZE; i++) {
+ /* Not in use? */
+ if (copy[i].request == 0)
+ continue;
+
+ /* Grab a request slot and copy shadow state into it. */
+ req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
+ *req = copy[i].req;
+
+ /* We get a new request id, and must reset the shadow state. */
+ req->id = get_id_from_freelist(info);
+ memcpy(&info->shadow[req->id], &copy[i], sizeof(copy[i]));
+
+ /* Rewrite any grant references invalidated by susp/resume. */
*/ + for (j = 0; j < req->nr_segments; j++) + gnttab_grant_foreign_access_ref( + req->seg[j].gref, + info->xbdev->otherend_id, + pfn_to_mfn(info->shadow[req->id].frame[j]), + rq_data_dir( + (struct request *) + info->shadow[req->id].request)); + info->shadow[req->id].req = *req; + + info->ring.req_prod_pvt++; + } + + kfree(copy); + + xenbus_switch_state(info->xbdev, XenbusStateConnected); + + spin_lock_irq(&blkif_io_lock); + + /* Now safe for us to use the shared ring */ + info->connected = BLKIF_STATE_CONNECTED; + + /* Send off requeued requests */ + flush_requests(info); + + /* Kick any other new requests queued since we resumed */ + kick_pending_request_queues(info); + + spin_unlock_irq(&blkif_io_lock); + + return 0; +} + +/** + * We are reconnecting to the backend, due to a suspend/resume, or a backend + * driver restart. We tear down our blkif structure and recreate it, but + * leave the device-layer structures intact so that this is transparent to the + * rest of the kernel. + */ +static int blkfront_resume(struct xenbus_device *dev) +{ + struct blkfront_info *info = dev->dev.driver_data; + int err; + + dev_dbg(&dev->dev, "blkfront_resume: %s\n", dev->nodename); + + blkif_free(info, info->connected == BLKIF_STATE_CONNECTED); + + err = talk_to_backend(dev, info); + if (info->connected == BLKIF_STATE_SUSPENDED && !err) + err = blkif_recover(info); + + return err; +} + + +/* + * Invoked when the backend is finally 'ready' (and has told produced + * the details about the physical device - #sectors, size, etc). + */ +static void blkfront_connect(struct blkfront_info *info) +{ + unsigned long long sectors; + unsigned long sector_size; + unsigned int binfo; + int err; + + if ((info->connected == BLKIF_STATE_CONNECTED) || + (info->connected == BLKIF_STATE_SUSPENDED) ) + return; + + dev_dbg(&info->xbdev->dev, "%s:%s.\n", + __func__, info->xbdev->otherend); + + err = xenbus_gather(XBT_NIL, info->xbdev->otherend, + "sectors", "%llu", §ors, + "info", "%u", &binfo, + "sector-size", "%lu", §or_size, + NULL); + if (err) { + xenbus_dev_fatal(info->xbdev, err, + "reading backend fields at %s", + info->xbdev->otherend); + return; + } + + err = xenbus_gather(XBT_NIL, info->xbdev->otherend, + "feature-barrier", "%lu", &info->feature_barrier, + NULL); + if (err) + info->feature_barrier = 0; + + err = xlvbd_alloc_gendisk(BLKIF_MINOR(info->vdevice), + sectors, info->vdevice, + binfo, sector_size, info); + if (err) { + xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s", + info->xbdev->otherend); + return; + } + + xenbus_switch_state(info->xbdev, XenbusStateConnected); + + /* Kick pending requests. */ + spin_lock_irq(&blkif_io_lock); + info->connected = BLKIF_STATE_CONNECTED; + kick_pending_request_queues(info); + spin_unlock_irq(&blkif_io_lock); + + add_disk(info->gd); +} + +/** + * Handle the change of state of the backend to Closing. We must delete our + * device-layer structures now, to ensure that writes are flushed through to + * the backend. Once is this done, we can switch to Closed in + * acknowledgement. + */ +static void blkfront_closing(struct xenbus_device *dev) +{ + struct blkfront_info *info = dev->dev.driver_data; + unsigned long flags; + + dev_dbg(&dev->dev, "blkfront_closing: %s removed\n", dev->nodename); + + if (info->rq == NULL) + goto out; + + spin_lock_irqsave(&blkif_io_lock, flags); + + del_gendisk(info->gd); + + /* No more blkif_request(). */ + blk_stop_queue(info->rq); + + /* No more gnttab callback work. 
*/ + gnttab_cancel_free_callback(&info->callback); + spin_unlock_irqrestore(&blkif_io_lock, flags); + + /* Flush gnttab callback work. Must be done with no locks held. */ + flush_scheduled_work(); + + blk_cleanup_queue(info->rq); + info->rq = NULL; + + out: + xenbus_frontend_closed(dev); +} + +/** + * Callback received when the backend's state changes. + */ +static void backend_changed(struct xenbus_device *dev, + enum xenbus_state backend_state) +{ + struct blkfront_info *info = dev->dev.driver_data; + struct block_device *bd; + + dev_dbg(&dev->dev, "blkfront:backend_changed.\n"); + + switch (backend_state) { + case XenbusStateInitialising: + case XenbusStateInitWait: + case XenbusStateInitialised: + case XenbusStateUnknown: + case XenbusStateClosed: + break; + + case XenbusStateConnected: + blkfront_connect(info); + break; + + case XenbusStateClosing: + bd = bdget(info->dev); + if (bd == NULL) + xenbus_dev_fatal(dev, -ENODEV, "bdget failed"); + + mutex_lock(&bd->bd_mutex); + if (info->users > 0) + xenbus_dev_error(dev, -EBUSY, + "Device in use; refusing to close"); + else + blkfront_closing(dev); + mutex_unlock(&bd->bd_mutex); + bdput(bd); + break; + } +} + +static int blkfront_remove(struct xenbus_device *dev) +{ + struct blkfront_info *info = dev->dev.driver_data; + + dev_dbg(&dev->dev, "blkfront_remove: %s removed\n", dev->nodename); + + blkif_free(info, 0); + + kfree(info); + + return 0; +} + +static int blkif_open(struct inode *inode, struct file *filep) +{ + struct blkfront_info *info = inode->i_bdev->bd_disk->private_data; + info->users++; + return 0; +} + +static int blkif_release(struct inode *inode, struct file *filep) +{ + struct blkfront_info *info = inode->i_bdev->bd_disk->private_data; + info->users--; + if (info->users == 0) { + /* Check whether we have been instructed to close. We will + have ignored this request initially, as the device was + still mounted. */ + struct xenbus_device *dev = info->xbdev; + enum xenbus_state state = xenbus_read_driver_state(dev->otherend); + + if (state == XenbusStateClosing) + blkfront_closing(dev); + } + return 0; +} + +static struct block_device_operations xlvbd_block_fops = +{ + .owner = THIS_MODULE, + .open = blkif_open, + .release = blkif_release, +}; + + +static struct xenbus_device_id blkfront_ids[] = { + { "vbd" }, + { "" } +}; + +static struct xenbus_driver blkfront = { + .name = "vbd", + .owner = THIS_MODULE, + .ids = blkfront_ids, + .probe = blkfront_probe, + .remove = blkfront_remove, + .resume = blkfront_resume, + .otherend_changed = backend_changed, +}; + +static int __init xlblk_init(void) +{ + if (!is_running_on_xen()) + return -ENODEV; + + if (register_blkdev(XENVBD_MAJOR, DEV_NAME)) { + printk(KERN_WARNING "xen_blk: can't get major %d with name %s\n", + XENVBD_MAJOR, DEV_NAME); + return -ENODEV; + } + + return xenbus_register_frontend(&blkfront); +} +module_init(xlblk_init); + + +static void xlblk_exit(void) +{ + return xenbus_unregister_driver(&blkfront); +} +module_exit(xlblk_exit); + +MODULE_DESCRIPTION("Xen virtual block device frontend"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_BLOCKDEV_MAJOR(XENVBD_MAJOR); diff --git a/include/linux/major.h b/include/linux/major.h index 7e7c9093919a..0cb98053537a 100644 --- a/include/linux/major.h +++ b/include/linux/major.h @@ -158,6 +158,8 @@ #define VXSPEC_MAJOR 200 /* VERITAS volume config driver */ #define VXDMP_MAJOR 201 /* VERITAS volume multipath driver */ +#define XENVBD_MAJOR 202 /* Xen virtual block device */ + #define MSR_MAJOR 202 #define CPUID_MAJOR 203 -- cgit v1.2.3
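
A detail of the Xen block frontend patch above that is easy to miss is how completed requests are matched back to their bookkeeping: blkfront_probe() and blkif_recover() chain the unused shadow entries together through their req.id fields, with 0x0fffffff marking the end of the chain, and the interrupt handler hands a finished slot back with add_id_to_freelist(). Because the backend echoes the request id in each response (bret->id), the handler can find the shadow entry for a completed request by direct index, without scanning the ring. The standalone C program below is a minimal sketch of that free-list bookkeeping, not the driver's code: the names freelist_init, get_id, add_id and FREELIST_END, and the fixed BLK_RING_SIZE of 32, are illustrative choices made here, and the real get_id_from_freelist()/add_id_to_freelist() helpers are defined earlier in the new frontend source than the excerpt shown.

#include <stdio.h>

#define BLK_RING_SIZE	32		/* illustrative; the driver derives it from the ring macros */
#define FREELIST_END	0x0fffffff	/* end-of-chain marker, as set up in blkfront_probe() */

struct shadow_slot {
	unsigned long id;		/* stands in for shadow[i].req.id */
	void *request;			/* the in-flight request, if any */
};

static struct shadow_slot shadow[BLK_RING_SIZE];
static unsigned long shadow_free;

static void freelist_init(void)
{
	int i;

	for (i = 0; i < BLK_RING_SIZE; i++)
		shadow[i].id = i + 1;	/* each free slot points at the next */
	shadow[BLK_RING_SIZE - 1].id = FREELIST_END;
	shadow_free = 0;
}

/* Pop the head of the free list; a real caller must guard against exhaustion. */
static unsigned long get_id(void)
{
	unsigned long id = shadow_free;

	shadow_free = shadow[id].id;	/* new head is whatever this slot pointed at */
	return id;
}

/* Push a completed slot back onto the head of the free list. */
static void add_id(unsigned long id)
{
	shadow[id].id = shadow_free;
	shadow[id].request = NULL;
	shadow_free = id;
}

int main(void)
{
	unsigned long a, b;

	freelist_init();
	a = get_id();
	b = get_id();
	printf("allocated slots %lu and %lu\n", a, b);

	add_id(a);
	printf("freed slot %lu; next allocation returns %lu\n", a, get_id());
	return 0;
}

Built with a plain cc, the sketch allocates slots 0 and 1, frees slot 0, and shows that the next allocation reuses it: freed ids go back onto the head of the list, so both allocation and completion stay O(1), which mirrors the way the interrupt handler above looks a finished request up directly via bret->id rather than searching the shadow array.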